4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
49 #include <uapi/linux/lustre/lustre_ioctl.h>
50 #include <lustre_swab.h>
52 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
57 struct inode *sp_inode;
62 __u64 pa_data_version;
68 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
70 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
73 static struct ll_file_data *ll_file_data_get(void)
75 struct ll_file_data *fd;
77 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
81 fd->fd_write_failed = false;
82 pcc_file_init(&fd->fd_pcc_file);
87 static void ll_file_data_put(struct ll_file_data *fd)
90 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
94 /* Packs all the attributes into @op_data for the CLOSE RPC. */
96 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
97 struct obd_client_handle *och)
101 ll_prep_md_op_data(op_data, inode, NULL, NULL,
102 0, 0, LUSTRE_OPC_ANY, NULL);
104 op_data->op_attr.ia_mode = inode->i_mode;
105 op_data->op_attr.ia_atime = inode->i_atime;
106 op_data->op_attr.ia_mtime = inode->i_mtime;
107 op_data->op_attr.ia_ctime = inode->i_ctime;
108 op_data->op_attr.ia_size = i_size_read(inode);
109 op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
110 ATTR_MTIME | ATTR_MTIME_SET |
112 op_data->op_xvalid |= OP_XVALID_CTIME_SET;
113 op_data->op_attr_blocks = inode->i_blocks;
114 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
115 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
116 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
117 op_data->op_open_handle = och->och_open_handle;
119 if (och->och_flags & FMODE_WRITE &&
120 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
121 /* For HSM: if inode data has been modified, pack it so that
122 * MDT can set data dirty flag in the archive. */
123 op_data->op_bias |= MDS_DATA_MODIFIED;
129 /* Perform a close, possibly with a bias.
130 * The meaning of "data" depends on the value of "bias".
132 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
133 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to swap layouts with. */
136 static int ll_close_inode_openhandle(struct inode *inode,
137 struct obd_client_handle *och,
138 enum mds_op_bias bias, void *data)
140 struct obd_export *md_exp = ll_i2mdexp(inode);
141 const struct ll_inode_info *lli = ll_i2info(inode);
142 struct md_op_data *op_data;
143 struct ptlrpc_request *req = NULL;
147 if (class_exp2obd(md_exp) == NULL) {
148 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
149 ll_i2sbi(inode)->ll_fsname, PFID(&lli->lli_fid));
153 OBD_ALLOC_PTR(op_data);
154 /* We leak the openhandle and request here on error, but there is not much to
155 * be done in the OOM case since the app won't retry the close on error either. */
157 GOTO(out, rc = -ENOMEM);
159 ll_prepare_close(inode, op_data, och);
161 case MDS_CLOSE_LAYOUT_MERGE:
162 /* merge blocks from the victim inode */
163 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
164 op_data->op_attr.ia_valid |= ATTR_SIZE;
165 op_data->op_xvalid |= OP_XVALID_BLOCKS;
166 case MDS_CLOSE_LAYOUT_SPLIT:
167 case MDS_CLOSE_LAYOUT_SWAP: {
168 struct split_param *sp = data;
170 LASSERT(data != NULL);
171 op_data->op_bias |= bias;
172 op_data->op_data_version = 0;
173 op_data->op_lease_handle = och->och_lease_handle;
174 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
175 op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
176 op_data->op_mirror_id = sp->sp_mirror_id;
178 op_data->op_fid2 = *ll_inode2fid(data);
183 case MDS_CLOSE_RESYNC_DONE: {
184 struct ll_ioc_lease *ioc = data;
186 LASSERT(data != NULL);
187 op_data->op_attr_blocks +=
188 ioc->lil_count * op_data->op_attr_blocks;
189 op_data->op_attr.ia_valid |= ATTR_SIZE;
190 op_data->op_xvalid |= OP_XVALID_BLOCKS;
191 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
193 op_data->op_lease_handle = och->och_lease_handle;
194 op_data->op_data = &ioc->lil_ids[0];
195 op_data->op_data_size =
196 ioc->lil_count * sizeof(ioc->lil_ids[0]);
200 case MDS_PCC_ATTACH: {
201 struct pcc_param *param = data;
203 LASSERT(data != NULL);
204 op_data->op_bias |= MDS_HSM_RELEASE | MDS_PCC_ATTACH;
205 op_data->op_archive_id = param->pa_archive_id;
206 op_data->op_data_version = param->pa_data_version;
207 op_data->op_lease_handle = och->och_lease_handle;
211 case MDS_HSM_RELEASE:
212 LASSERT(data != NULL);
213 op_data->op_bias |= MDS_HSM_RELEASE;
214 op_data->op_data_version = *(__u64 *)data;
215 op_data->op_lease_handle = och->och_lease_handle;
216 op_data->op_attr.ia_valid |= ATTR_SIZE;
217 op_data->op_xvalid |= OP_XVALID_BLOCKS;
221 LASSERT(data == NULL);
225 if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
226 op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
227 if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
228 op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;
230 rc = md_close(md_exp, op_data, och->och_mod, &req);
231 if (rc != 0 && rc != -EINTR)
232 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
233 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
235 if (rc == 0 && op_data->op_bias & bias) {
236 struct mdt_body *body;
238 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
239 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
242 if (bias & MDS_PCC_ATTACH) {
243 struct pcc_param *param = data;
245 param->pa_layout_gen = body->mbo_layout_gen;
249 ll_finish_md_op_data(op_data);
253 md_clear_open_replay_data(md_exp, och);
254 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
257 ptlrpc_req_finished(req); /* This is close request */
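/*
 * Close the MDS open handle matching @fmode (read/write/exec) when it is no
 * longer referenced by any file descriptor; otherwise just drop the refcount.
 */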
261 int ll_md_real_close(struct inode *inode, fmode_t fmode)
263 struct ll_inode_info *lli = ll_i2info(inode);
264 struct obd_client_handle **och_p;
265 struct obd_client_handle *och;
270 if (fmode & FMODE_WRITE) {
271 och_p = &lli->lli_mds_write_och;
272 och_usecount = &lli->lli_open_fd_write_count;
273 } else if (fmode & FMODE_EXEC) {
274 och_p = &lli->lli_mds_exec_och;
275 och_usecount = &lli->lli_open_fd_exec_count;
277 LASSERT(fmode & FMODE_READ);
278 och_p = &lli->lli_mds_read_och;
279 och_usecount = &lli->lli_open_fd_read_count;
282 mutex_lock(&lli->lli_och_mutex);
283 if (*och_usecount > 0) {
284 /* There are still users of this handle, so skip freeing it. */
286 mutex_unlock(&lli->lli_och_mutex);
292 mutex_unlock(&lli->lli_och_mutex);
295 /* There might be a race and this handle may already have been closed. */
297 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
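/*
 * Per-descriptor close: drop any group lock or lease still attached to the
 * file, decrement the open-mode use count and, unless a cached OPEN DLM lock
 * lets us skip the RPC, close the MDS open handle via ll_md_real_close().
 */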
303 static int ll_md_close(struct inode *inode, struct file *file)
305 union ldlm_policy_data policy = {
306 .l_inodebits = { MDS_INODELOCK_OPEN },
308 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
309 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
310 struct ll_inode_info *lli = ll_i2info(inode);
311 struct lustre_handle lockh;
312 enum ldlm_mode lockmode;
316 /* clear group lock, if present */
317 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
318 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
320 if (fd->fd_lease_och != NULL) {
323 /* Usually the lease is not released when the
324 * application crashes, so we need to release it here. */
325 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
326 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
327 PFID(&lli->lli_fid), rc, lease_broken);
329 fd->fd_lease_och = NULL;
332 if (fd->fd_och != NULL) {
333 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
338 /* Let's see if we have a good enough OPEN lock on the file and if
339 we can skip talking to the MDS */
340 mutex_lock(&lli->lli_och_mutex);
341 if (fd->fd_omode & FMODE_WRITE) {
343 LASSERT(lli->lli_open_fd_write_count);
344 lli->lli_open_fd_write_count--;
345 } else if (fd->fd_omode & FMODE_EXEC) {
347 LASSERT(lli->lli_open_fd_exec_count);
348 lli->lli_open_fd_exec_count--;
351 LASSERT(lli->lli_open_fd_read_count);
352 lli->lli_open_fd_read_count--;
354 mutex_unlock(&lli->lli_och_mutex);
356 /* LU-4398: do not cache write open lock if the file has exec bit */
357 if ((lockmode == LCK_CW && inode->i_mode & S_IXUGO) ||
358 !md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
359 LDLM_IBITS, &policy, lockmode, &lockh))
360 rc = ll_md_real_close(inode, fd->fd_omode);
363 LUSTRE_FPRIVATE(file) = NULL;
364 ll_file_data_put(fd);
369 /* While this returns an error code, the caller (fput()) ignores it, so we need
370 * to make every effort to clean up all of our state here. Also, applications
371 * rarely check close errors, and even if an error is returned they will not
372 * retry the close call. */
374 int ll_file_release(struct inode *inode, struct file *file)
376 struct ll_file_data *fd;
377 struct ll_sb_info *sbi = ll_i2sbi(inode);
378 struct ll_inode_info *lli = ll_i2info(inode);
382 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
383 PFID(ll_inode2fid(inode)), inode);
385 if (inode->i_sb->s_root != file_dentry(file))
386 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
387 fd = LUSTRE_FPRIVATE(file);
390 /* The last ref on @file, but it may not be the owner pid of statahead,
391 * because parent and child processes can share the same file handle. */
392 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
393 ll_deauthorize_statahead(inode, fd);
395 if (inode->i_sb->s_root == file_dentry(file)) {
396 LUSTRE_FPRIVATE(file) = NULL;
397 ll_file_data_put(fd);
401 pcc_file_release(inode, file);
403 if (!S_ISDIR(inode->i_mode)) {
404 if (lli->lli_clob != NULL)
405 lov_read_and_clear_async_rc(lli->lli_clob);
406 lli->lli_async_rc = 0;
409 rc = ll_md_close(inode, file);
411 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
412 libcfs_debug_dumplog();
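/*
 * Filler for read_cache_page(): copy the inline Data-on-MDT buffer described
 * by the niobuf_local into the page and zero-fill the remainder.
 */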
417 static inline int ll_dom_readpage(void *data, struct page *page)
419 struct niobuf_local *lnb = data;
422 kaddr = ll_kmap_atomic(page, KM_USER0);
423 memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
424 if (lnb->lnb_len < PAGE_SIZE)
425 memset(kaddr + lnb->lnb_len, 0,
426 PAGE_SIZE - lnb->lnb_len);
427 flush_dcache_page(page);
428 SetPageUptodate(page);
429 ll_kunmap_atomic(kaddr, KM_USER0);
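/*
 * Populate the page cache with Data-on-MDT file data returned inline in the
 * open reply (RMF_NIOBUF_INLINE), so subsequent reads can be served from
 * cache.
 */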
435 void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
436 struct lookup_intent *it)
438 struct ll_inode_info *lli = ll_i2info(inode);
439 struct cl_object *obj = lli->lli_clob;
440 struct address_space *mapping = inode->i_mapping;
442 struct niobuf_remote *rnb;
443 struct mdt_body *body;
445 unsigned long index, start;
446 struct niobuf_local lnb;
453 if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
457 rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
458 if (rnb == NULL || rnb->rnb_len == 0)
461 /* LU-11595: the server may return the whole file (which is always OK) or
462 * just the file tail, whose offset must be aligned with the client PAGE_SIZE
463 * to be usable on this client; if the server's PAGE_SIZE is smaller, the
464 * offset may be unaligned and that data is simply ignored. */
466 if (rnb->rnb_offset % PAGE_SIZE)
469 /* The server returns the whole file or just the file tail if it fits in the
470 * reply buffer; in both cases the total size should equal the file size. */
472 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
473 if (rnb->rnb_offset + rnb->rnb_len != body->mbo_dom_size) {
474 CERROR("%s: server returns off/len %llu/%u but size %llu\n",
475 ll_i2sbi(inode)->ll_fsname, rnb->rnb_offset,
476 rnb->rnb_len, body->mbo_dom_size);
480 CDEBUG(D_INFO, "Get data along with open at %llu len %i, size %llu\n",
481 rnb->rnb_offset, rnb->rnb_len, body->mbo_dom_size);
483 data = (char *)rnb + sizeof(*rnb);
485 lnb.lnb_file_offset = rnb->rnb_offset;
486 start = lnb.lnb_file_offset / PAGE_SIZE;
488 LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
489 lnb.lnb_page_offset = 0;
491 lnb.lnb_data = data + (index << PAGE_SHIFT);
492 lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
493 if (lnb.lnb_len > PAGE_SIZE)
494 lnb.lnb_len = PAGE_SIZE;
496 vmpage = read_cache_page(mapping, index + start,
497 ll_dom_readpage, &lnb);
498 if (IS_ERR(vmpage)) {
499 CWARN("%s: cannot fill page %lu for "DFID
500 " with data: rc = %li\n",
501 ll_i2sbi(inode)->ll_fsname, index + start,
502 PFID(lu_object_fid(&obj->co_lu)),
508 } while (rnb->rnb_len > (index << PAGE_SHIFT));
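/*
 * Send an IT_OPEN intent to the MDS for this dentry. The name is packed only
 * when open-by-FID cannot be used; on success the reply is used to set up the
 * inode, the lock data and any inline DoM data.
 */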
512 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
513 struct lookup_intent *itp)
515 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
516 struct dentry *parent = de->d_parent;
519 struct md_op_data *op_data;
520 struct ptlrpc_request *req = NULL;
524 LASSERT(parent != NULL);
525 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
527 /* if server supports open-by-fid, or file name is invalid, don't pack
528 * name in open request */
529 if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_OPEN_BY_NAME) ||
530 !(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID)) {
532 len = de->d_name.len;
533 name = kmalloc(len + 1, GFP_NOFS);
538 spin_lock(&de->d_lock);
539 if (len != de->d_name.len) {
540 spin_unlock(&de->d_lock);
544 memcpy(name, de->d_name.name, len);
546 spin_unlock(&de->d_lock);
548 if (!lu_name_is_valid_2(name, len)) {
554 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
555 name, len, 0, LUSTRE_OPC_ANY, NULL);
556 if (IS_ERR(op_data)) {
558 RETURN(PTR_ERR(op_data));
560 op_data->op_data = lmm;
561 op_data->op_data_size = lmmsize;
563 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
564 &ll_md_blocking_ast, 0);
566 ll_finish_md_op_data(op_data);
568 /* Reason for keeping our own exit path: don't flood the log
569 * with -ESTALE error messages. */
571 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
572 it_open_error(DISP_OPEN_OPEN, itp))
574 ll_release_openhandle(de, itp);
578 if (it_disposition(itp, DISP_LOOKUP_NEG))
579 GOTO(out, rc = -ENOENT);
581 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
582 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
583 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
587 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
589 if (!rc && itp->it_lock_mode) {
590 struct lustre_handle handle = {.cookie = itp->it_lock_handle};
591 struct ldlm_lock *lock;
592 bool has_dom_bit = false;
594 /* If we got a lock back and it has a LOOKUP bit set,
595 * make sure the dentry is marked as valid so we can find it.
596 * We don't need to care about actual hashing since other bits
597 * of kernel will deal with that later.
599 lock = ldlm_handle2lock(&handle);
601 has_dom_bit = ldlm_has_dom(lock);
602 if (lock->l_policy_data.l_inodebits.bits &
603 MDS_INODELOCK_LOOKUP)
604 d_lustre_revalidate(de);
608 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
610 ll_dom_finish_open(de->d_inode, req, itp);
614 ptlrpc_req_finished(req);
615 ll_intent_drop_lock(itp);
617 /* We did open by fid, but by the time we got to the server,
618 * the object disappeared. If this is a create, we cannot really
619 * tell the userspace that the file it was trying to create
620 * does not exist. Instead let's return -ESTALE, and the VFS will
621 * retry the create with LOOKUP_REVAL, which we are going to catch
622 * in ll_revalidate_dentry() and fall back to lookup there. */
624 if (rc == -ENOENT && itp->it_op & IT_CREAT)
630 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
631 struct obd_client_handle *och)
633 struct mdt_body *body;
635 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
636 och->och_open_handle = body->mbo_open_handle;
637 och->och_fid = body->mbo_fid1;
638 och->och_lease_handle.cookie = it->it_lock_handle;
639 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
640 och->och_flags = it->it_flags;
642 return md_set_open_replay_data(md_exp, och, it);
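/*
 * Finish the client-side part of an open: attach @fd to the file, record the
 * open mode and, when an MDS open handle is supplied, fill it from the intent
 * reply and register it for open replay.
 */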
645 static int ll_local_open(struct file *file, struct lookup_intent *it,
646 struct ll_file_data *fd, struct obd_client_handle *och)
648 struct inode *inode = file_inode(file);
651 LASSERT(!LUSTRE_FPRIVATE(file));
658 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
663 LUSTRE_FPRIVATE(file) = fd;
664 ll_readahead_init(inode, &fd->fd_ras);
665 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
667 /* ll_cl_context initialize */
668 rwlock_init(&fd->fd_lock);
669 INIT_LIST_HEAD(&fd->fd_lccs);
674 /* Open a file, and (for the very first open) create objects on the OSTs at
675 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
676 * creation or open until ll_lov_setstripe() ioctl is called.
678 * If we already have the stripe MD locally then we don't request it in
679 * md_open(), by passing a lmm_size = 0.
681 * It is up to the application to ensure no other processes open this file
682 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
683 * used. We might be able to avoid races of that sort by getting lli_open_sem
684 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
685 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
687 int ll_file_open(struct inode *inode, struct file *file)
689 struct ll_inode_info *lli = ll_i2info(inode);
690 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
691 .it_flags = file->f_flags };
692 struct obd_client_handle **och_p = NULL;
693 __u64 *och_usecount = NULL;
694 struct ll_file_data *fd;
698 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
699 PFID(ll_inode2fid(inode)), inode, file->f_flags);
701 it = file->private_data; /* XXX: compat macro */
702 file->private_data = NULL; /* prevent ll_local_open assertion */
704 fd = ll_file_data_get();
706 GOTO(out_nofiledata, rc = -ENOMEM);
709 if (S_ISDIR(inode->i_mode))
710 ll_authorize_statahead(inode, fd);
712 if (inode->i_sb->s_root == file_dentry(file)) {
713 LUSTRE_FPRIVATE(file) = fd;
717 if (!it || !it->it_disposition) {
718 /* Convert f_flags into access mode. We cannot use file->f_mode,
719 * because everything but the O_ACCMODE mask was stripped from it. */
721 if ((oit.it_flags + 1) & O_ACCMODE)
723 if (file->f_flags & O_TRUNC)
724 oit.it_flags |= FMODE_WRITE;
726 /* The kernel only calls f_op->open in dentry_open(). filp_open() calls
727 * dentry_open() after open_namei(), which checks permissions.
728 * Only nfsd_open() calls dentry_open() directly without checking
729 * permissions, and because of that the code below is safe. */
731 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
732 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
734 /* We do not want O_EXCL here, presumably we opened the file
735 * already? XXX - NFS implications? */
736 oit.it_flags &= ~O_EXCL;
738 /* bug20584: if "it_flags" contains O_CREAT, the file will be
739 * created if necessary, so "IT_CREAT" should be set to stay
740 * consistent with it */
741 if (oit.it_flags & O_CREAT)
742 oit.it_op |= IT_CREAT;
748 /* Let's see if we have file open on MDS already. */
749 if (it->it_flags & FMODE_WRITE) {
750 och_p = &lli->lli_mds_write_och;
751 och_usecount = &lli->lli_open_fd_write_count;
752 } else if (it->it_flags & FMODE_EXEC) {
753 och_p = &lli->lli_mds_exec_och;
754 och_usecount = &lli->lli_open_fd_exec_count;
756 och_p = &lli->lli_mds_read_och;
757 och_usecount = &lli->lli_open_fd_read_count;
760 mutex_lock(&lli->lli_och_mutex);
761 if (*och_p) { /* Open handle is present */
762 if (it_disposition(it, DISP_OPEN_OPEN)) {
763 /* Well, there's an extra open request that we do not need;
764 let's close it somehow. This will decref the request. */
765 rc = it_open_error(DISP_OPEN_OPEN, it);
767 mutex_unlock(&lli->lli_och_mutex);
768 GOTO(out_openerr, rc);
771 ll_release_openhandle(file_dentry(file), it);
775 rc = ll_local_open(file, it, fd, NULL);
778 mutex_unlock(&lli->lli_och_mutex);
779 GOTO(out_openerr, rc);
782 LASSERT(*och_usecount == 0);
783 if (!it->it_disposition) {
784 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
785 /* We cannot just request a lock handle now; the new ELC code
786 means that one of the other OPEN locks for this file
787 could be cancelled, and since the blocking AST handler
788 would attempt to grab och_mutex as well, that would
789 result in a deadlock */
790 mutex_unlock(&lli->lli_och_mutex);
792 * Normally called under two situations:
794 * 2. A race/condition on MDS resulting in no open
795 * handle to be returned from LOOKUP|OPEN request,
796 * for example if the target entry was a symlink.
798 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
799 * marked by a bit set in ll_iget_for_nfs. Clear the
800 * bit so that it does not confuse later callers.
802 * NB: when ldd is NULL, it must have come via the normal
803 * lookup path only, since ll_iget_for_nfs always calls
806 if (ldd && ldd->lld_nfs_dentry) {
807 ldd->lld_nfs_dentry = 0;
808 it->it_flags |= MDS_OPEN_LOCK;
812 /* Always specify MDS_OPEN_BY_FID because we don't want
813 * to get a file with a different fid. */
815 it->it_flags |= MDS_OPEN_BY_FID;
816 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
819 GOTO(out_openerr, rc);
823 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
825 GOTO(out_och_free, rc = -ENOMEM);
829 /* md_intent_lock() didn't get a request ref if there was an
830 * open error, so don't do cleanup on the request here
832 /* XXX (green): Shouldn't we bail out on any error here, not
833 * just an open error? */
834 rc = it_open_error(DISP_OPEN_OPEN, it);
836 GOTO(out_och_free, rc);
838 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
839 "inode %p: disposition %x, status %d\n", inode,
840 it_disposition(it, ~0), it->it_status);
842 rc = ll_local_open(file, it, fd, *och_p);
844 GOTO(out_och_free, rc);
847 rc = pcc_file_open(inode, file);
849 GOTO(out_och_free, rc);
851 mutex_unlock(&lli->lli_och_mutex);
854 /* Must do this outside the lli_och_mutex lock to prevent a deadlock where a
855 different kind of OPEN lock for this same inode gets cancelled
856 by ldlm_cancel_lru */
857 if (!S_ISREG(inode->i_mode))
858 GOTO(out_och_free, rc);
860 cl_lov_delay_create_clear(&file->f_flags);
861 GOTO(out_och_free, rc);
865 if (och_p && *och_p) {
866 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
867 *och_p = NULL; /* OBD_FREE writes some magic there */
870 mutex_unlock(&lli->lli_och_mutex);
873 if (lli->lli_opendir_key == fd)
874 ll_deauthorize_statahead(inode, fd);
877 ll_file_data_put(fd);
879 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
883 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
884 ptlrpc_req_finished(it->it_request);
885 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
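/*
 * Blocking AST for lease DLM locks: when the lock is blocked by a conflicting
 * request, cancel it asynchronously, thereby breaking the lease.
 */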
891 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
892 struct ldlm_lock_desc *desc, void *data, int flag)
895 struct lustre_handle lockh;
899 case LDLM_CB_BLOCKING:
900 ldlm_lock2handle(lock, &lockh);
901 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
903 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
907 case LDLM_CB_CANCELING:
915 * When setting a lease on a file, we take ownership of the lli_mds_*_och
916 * and save it as fd->fd_och so as to force the client to reopen the file even
917 * if it already has an open lock in cache. */
919 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
920 struct lustre_handle *old_open_handle)
922 struct ll_inode_info *lli = ll_i2info(inode);
923 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
924 struct obd_client_handle **och_p;
929 /* Get the openhandle of the file */
930 mutex_lock(&lli->lli_och_mutex);
931 if (fd->fd_lease_och != NULL)
932 GOTO(out_unlock, rc = -EBUSY);
934 if (fd->fd_och == NULL) {
935 if (file->f_mode & FMODE_WRITE) {
936 LASSERT(lli->lli_mds_write_och != NULL);
937 och_p = &lli->lli_mds_write_och;
938 och_usecount = &lli->lli_open_fd_write_count;
940 LASSERT(lli->lli_mds_read_och != NULL);
941 och_p = &lli->lli_mds_read_och;
942 och_usecount = &lli->lli_open_fd_read_count;
945 if (*och_usecount > 1)
946 GOTO(out_unlock, rc = -EBUSY);
953 *old_open_handle = fd->fd_och->och_open_handle;
957 mutex_unlock(&lli->lli_och_mutex);
962 * Release ownership on lli_mds_*_och when putting back a file lease.
964 static int ll_lease_och_release(struct inode *inode, struct file *file)
966 struct ll_inode_info *lli = ll_i2info(inode);
967 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
968 struct obd_client_handle **och_p;
969 struct obd_client_handle *old_och = NULL;
974 mutex_lock(&lli->lli_och_mutex);
975 if (file->f_mode & FMODE_WRITE) {
976 och_p = &lli->lli_mds_write_och;
977 och_usecount = &lli->lli_open_fd_write_count;
979 och_p = &lli->lli_mds_read_och;
980 och_usecount = &lli->lli_open_fd_read_count;
983 /* The file may have been opened by another process (broken lease), so
984 * *och_p is not NULL. In this case we should simply increase the usecount
 * and close fd_och. */
987 if (*och_p != NULL) {
988 old_och = fd->fd_och;
995 mutex_unlock(&lli->lli_och_mutex);
998 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
1004 * Acquire a lease and open the file.
1006 static struct obd_client_handle *
1007 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
1010 struct lookup_intent it = { .it_op = IT_OPEN };
1011 struct ll_sb_info *sbi = ll_i2sbi(inode);
1012 struct md_op_data *op_data;
1013 struct ptlrpc_request *req = NULL;
1014 struct lustre_handle old_open_handle = { 0 };
1015 struct obd_client_handle *och = NULL;
1020 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
1021 RETURN(ERR_PTR(-EINVAL));
1024 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
1025 RETURN(ERR_PTR(-EPERM));
1027 rc = ll_lease_och_acquire(inode, file, &old_open_handle);
1029 RETURN(ERR_PTR(rc));
1034 RETURN(ERR_PTR(-ENOMEM));
1036 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
1037 LUSTRE_OPC_ANY, NULL);
1038 if (IS_ERR(op_data))
1039 GOTO(out, rc = PTR_ERR(op_data));
1041 /* To tell the MDT this openhandle is from the same owner */
1042 op_data->op_open_handle = old_open_handle;
1044 it.it_flags = fmode | open_flags;
1045 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
1046 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
1047 &ll_md_blocking_lease_ast,
1048 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
1049 * it can be cancelled, which may mislead applications into thinking the lease is still valid.
1051 * LDLM_FL_EXCL: Set this flag so that it won't be matched by a normal
1052 * open in ll_md_blocking_ast(). Otherwise, since ll_md_blocking_lease_ast()
1053 * doesn't deal with the openhandle, a normal openhandle would be leaked. */
1054 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
1055 ll_finish_md_op_data(op_data);
1056 ptlrpc_req_finished(req);
1058 GOTO(out_release_it, rc);
1060 if (it_disposition(&it, DISP_LOOKUP_NEG))
1061 GOTO(out_release_it, rc = -ENOENT);
1063 rc = it_open_error(DISP_OPEN_OPEN, &it);
1065 GOTO(out_release_it, rc);
1067 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
1068 ll_och_fill(sbi->ll_md_exp, &it, och);
1070 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
1071 GOTO(out_close, rc = -EOPNOTSUPP);
1073 /* lease already acquired, handle the lease lock */
1074 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
1075 if (it.it_lock_mode == 0 ||
1076 it.it_lock_bits != MDS_INODELOCK_OPEN) {
1077 /* an open lock must be returned for a lease */
1078 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
1079 PFID(ll_inode2fid(inode)), it.it_lock_mode,
1081 GOTO(out_close, rc = -EPROTO);
1084 ll_intent_release(&it);
1088 /* Cancel open lock */
1089 if (it.it_lock_mode != 0) {
1090 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
1092 it.it_lock_mode = 0;
1093 och->och_lease_handle.cookie = 0ULL;
1095 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
1097 CERROR("%s: error closing file "DFID": %d\n",
1098 sbi->ll_fsname, PFID(&ll_i2info(inode)->lli_fid), rc2);
1099 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
1101 ll_intent_release(&it);
1105 RETURN(ERR_PTR(rc));
1109 * Check whether a layout swap can be done between two inodes.
1111 * \param[in] inode1 First inode to check
1112 * \param[in] inode2 Second inode to check
1114 * \retval 0 on success, layout swap can be performed between both inodes
1115 * \retval negative error code if requirements are not met
1117 static int ll_check_swap_layouts_validity(struct inode *inode1,
1118 struct inode *inode2)
1120 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
1123 if (inode_permission(inode1, MAY_WRITE) ||
1124 inode_permission(inode2, MAY_WRITE))
1127 if (inode1->i_sb != inode2->i_sb)
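/*
 * Close the lease open handle with an MDS_CLOSE_LAYOUT_SWAP bias so that the
 * layouts of @inode and @inode2 are swapped atomically on the MDT.
 */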
1133 static int ll_swap_layouts_close(struct obd_client_handle *och,
1134 struct inode *inode, struct inode *inode2)
1136 const struct lu_fid *fid1 = ll_inode2fid(inode);
1137 const struct lu_fid *fid2;
1141 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
1142 ll_i2sbi(inode)->ll_fsname, PFID(fid1));
1144 rc = ll_check_swap_layouts_validity(inode, inode2);
1146 GOTO(out_free_och, rc);
1148 /* We now know that inode2 is a lustre inode */
1149 fid2 = ll_inode2fid(inode2);
1151 rc = lu_fid_cmp(fid1, fid2);
1153 GOTO(out_free_och, rc = -EINVAL);
1155 /* Close the file and {swap,merge} layouts between inode & inode2.
1156 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
1157 * because we still need it to pack l_remote_handle to MDT. */
1158 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
1161 och = NULL; /* freed in ll_close_inode_openhandle() */
1171 * Release lease and close the file.
1172 * It will check whether the lease was ever broken. */
1174 static int ll_lease_close_intent(struct obd_client_handle *och,
1175 struct inode *inode,
1176 bool *lease_broken, enum mds_op_bias bias,
1179 struct ldlm_lock *lock;
1180 bool cancelled = true;
1184 lock = ldlm_handle2lock(&och->och_lease_handle);
1186 lock_res_and_lock(lock);
1187 cancelled = ldlm_is_cancel(lock);
1188 unlock_res_and_lock(lock);
1189 LDLM_LOCK_PUT(lock);
1192 CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1193 PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1195 if (lease_broken != NULL)
1196 *lease_broken = cancelled;
1198 if (!cancelled && !bias)
1199 ldlm_cli_cancel(&och->och_lease_handle, 0);
1201 if (cancelled) { /* no need to execute intent */
1206 rc = ll_close_inode_openhandle(inode, och, bias, data);
1210 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1213 return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1217 * After the lease is taken, send the MDS_REINT_RESYNC RPC to the MDT. */
1219 static int ll_lease_file_resync(struct obd_client_handle *och,
1220 struct inode *inode, unsigned long arg)
1222 struct ll_sb_info *sbi = ll_i2sbi(inode);
1223 struct md_op_data *op_data;
1224 struct ll_ioc_lease_id ioc;
1225 __u64 data_version_unused;
1229 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1230 LUSTRE_OPC_ANY, NULL);
1231 if (IS_ERR(op_data))
1232 RETURN(PTR_ERR(op_data));
1234 if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg,
1238 /* Before starting file resync, it's necessary to clean up the page cache
1239 * in client memory, otherwise once the layout version is increased,
1240 * writing back cached data will be denied by the OSTs. */
1241 rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1245 op_data->op_lease_handle = och->och_lease_handle;
1246 op_data->op_mirror_id = ioc.lil_mirror_id;
1247 rc = md_file_resync(sbi->ll_md_exp, op_data);
1253 ll_finish_md_op_data(op_data);
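/*
 * Merge the OST-side attributes (size, blocks, [acm]time from the cl_object)
 * with the MDS-provided inode attributes, under the inode size lock.
 */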
1257 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1259 struct ll_inode_info *lli = ll_i2info(inode);
1260 struct cl_object *obj = lli->lli_clob;
1261 struct cl_attr *attr = vvp_env_thread_attr(env);
1269 ll_inode_size_lock(inode);
1271 /* Merge the timestamps most recently obtained from the MDS with the
1272 * timestamps obtained from the OSTs.
1274 * Do not overwrite the inode's atime because it may be refreshed
1275 * by the file_accessed() function. If the read was served from cached
1276 * data, there is no RPC to be sent, so the atime may not be
1277 * transferred to the OSTs at all. The MDT only updates atime at close time
1278 * if it's at least 'mdd.*.atime_diff' older.
1279 * All in all, the atime in Lustre does not strictly comply with
1280 * POSIX. Solving this would require sending an RPC to the MDT for each
1281 * read, which would hurt performance. */
1283 if (inode->i_atime.tv_sec < lli->lli_atime ||
1284 lli->lli_update_atime) {
1285 inode->i_atime.tv_sec = lli->lli_atime;
1286 lli->lli_update_atime = 0;
1288 inode->i_mtime.tv_sec = lli->lli_mtime;
1289 inode->i_ctime.tv_sec = lli->lli_ctime;
1291 mtime = inode->i_mtime.tv_sec;
1292 atime = inode->i_atime.tv_sec;
1293 ctime = inode->i_ctime.tv_sec;
1295 cl_object_attr_lock(obj);
1296 if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1299 rc = cl_object_attr_get(env, obj, attr);
1300 cl_object_attr_unlock(obj);
1303 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1305 if (atime < attr->cat_atime)
1306 atime = attr->cat_atime;
1308 if (ctime < attr->cat_ctime)
1309 ctime = attr->cat_ctime;
1311 if (mtime < attr->cat_mtime)
1312 mtime = attr->cat_mtime;
1314 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1315 PFID(&lli->lli_fid), attr->cat_size);
1317 i_size_write(inode, attr->cat_size);
1318 inode->i_blocks = attr->cat_blocks;
1320 inode->i_mtime.tv_sec = mtime;
1321 inode->i_atime.tv_sec = atime;
1322 inode->i_ctime.tv_sec = ctime;
1325 ll_inode_size_unlock(inode);
1331 * Set designated mirror for I/O.
1333 * So far only read, write, and truncate can issue I/O to a
1334 * designated mirror. */
1336 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1338 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1340 /* clear the layout version for generic (non-resync) I/O in case it carries
1341 * a stale layout version due to an I/O restart */
1342 io->ci_layout_version = 0;
1344 /* FLR: disable non-delay for designated mirror I/O because obviously
1345 * only one mirror is available */
1346 if (fd->fd_designated_mirror > 0) {
1348 io->ci_designated_mirror = fd->fd_designated_mirror;
1349 io->ci_layout_version = fd->fd_layout_version;
1352 CDEBUG(D_VFSTRACE, "%s: designated mirror: %d\n",
1353 file->f_path.dentry->d_name.name, io->ci_designated_mirror);
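/*
 * Mirror the file_accessed()/touch_atime() logic: report whether atime
 * updates should be skipped for this file based on file, inode, mount and
 * superblock flags.
 */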
1356 static bool file_is_noatime(const struct file *file)
1358 const struct vfsmount *mnt = file->f_path.mnt;
1359 const struct inode *inode = file_inode((struct file *)file);
1361 /* Adapted from file_accessed() and touch_atime().*/
1362 if (file->f_flags & O_NOATIME)
1365 if (inode->i_flags & S_NOATIME)
1368 if (IS_NOATIME(inode))
1371 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1374 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1377 if ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode))
1383 void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot,
1384 struct vvp_io_args *args)
1386 struct inode *inode = file_inode(file);
1387 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1389 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1390 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1392 if (iot == CIT_WRITE) {
1393 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1394 io->u.ci_wr.wr_sync = !!(file->f_flags & O_SYNC ||
1395 file->f_flags & O_DIRECT ||
1397 #ifdef HAVE_GENERIC_WRITE_SYNC_2ARGS
1398 io->u.ci_wr.wr_sync |= !!(args &&
1399 args->via_io_subtype == IO_NORMAL &&
1400 args->u.normal.via_iocb->ki_flags & IOCB_DSYNC);
1404 io->ci_obj = ll_i2info(inode)->lli_clob;
1405 io->ci_lockreq = CILR_MAYBE;
1406 if (ll_file_nolock(file)) {
1407 io->ci_lockreq = CILR_NEVER;
1408 io->ci_no_srvlock = 1;
1409 } else if (file->f_flags & O_APPEND) {
1410 io->ci_lockreq = CILR_MANDATORY;
1412 io->ci_noatime = file_is_noatime(file);
1413 io->ci_async_readahead = false;
1415 /* FLR: only use non-delay I/O for read as there is only one
1416 * available mirror for write. */
1417 io->ci_ndelay = !(iot == CIT_WRITE);
1419 ll_io_set_mirror(io, file);
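/*
 * Account a read or write of @count bytes in the per-inode file heat
 * instances (sample count and byte count), if file heat is enabled.
 */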
1422 static void ll_heat_add(struct inode *inode, enum cl_io_type iot,
1425 struct ll_inode_info *lli = ll_i2info(inode);
1426 struct ll_sb_info *sbi = ll_i2sbi(inode);
1427 enum obd_heat_type sample_type;
1428 enum obd_heat_type iobyte_type;
1429 __u64 now = ktime_get_real_seconds();
1431 if (!ll_sbi_has_file_heat(sbi) ||
1432 lli->lli_heat_flags & LU_HEAT_FLAG_OFF)
1435 if (iot == CIT_READ) {
1436 sample_type = OBD_HEAT_READSAMPLE;
1437 iobyte_type = OBD_HEAT_READBYTE;
1438 } else if (iot == CIT_WRITE) {
1439 sample_type = OBD_HEAT_WRITESAMPLE;
1440 iobyte_type = OBD_HEAT_WRITEBYTE;
1445 spin_lock(&lli->lli_heat_lock);
1446 obd_heat_add(&lli->lli_heat_instances[sample_type], now, 1,
1447 sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1448 obd_heat_add(&lli->lli_heat_instances[iobyte_type], now, count,
1449 sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1450 spin_unlock(&lli->lli_heat_lock);
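/*
 * Common back end for read and write: set up a cl_io for the range, take the
 * range lock for buffered and direct I/O, run cl_io_loop(), and restart the
 * I/O (e.g. on layout change or FLR mirror retry) when requested.
 */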
1454 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1455 struct file *file, enum cl_io_type iot,
1456 loff_t *ppos, size_t count)
1458 struct vvp_io *vio = vvp_env_io(env);
1459 struct inode *inode = file_inode(file);
1460 struct ll_inode_info *lli = ll_i2info(inode);
1461 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1462 struct range_lock range;
1466 unsigned retried = 0;
1467 bool restarted = false;
1471 CDEBUG(D_VFSTRACE, "%s: %s ppos: %llu, count: %zu\n",
1472 file_dentry(file)->d_name.name,
1473 iot == CIT_READ ? "read" : "write", *ppos, count);
1476 io = vvp_env_thread_io(env);
1477 ll_io_init(io, file, iot, args);
1478 io->ci_ndelay_tried = retried;
1480 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1481 bool range_locked = false;
1483 if (file->f_flags & O_APPEND)
1484 range_lock_init(&range, 0, LUSTRE_EOF);
1486 range_lock_init(&range, *ppos, *ppos + count - 1);
1488 vio->vui_fd = LUSTRE_FPRIVATE(file);
1489 vio->vui_io_subtype = args->via_io_subtype;
1491 switch (vio->vui_io_subtype) {
1493 vio->vui_iter = args->u.normal.via_iter;
1494 vio->vui_iocb = args->u.normal.via_iocb;
1495 /* Direct IO reads must also take range lock,
1496 * or multiple reads will try to work on the same pages;
1497 * see LU-6227 for details. */
1498 if (((iot == CIT_WRITE) ||
1499 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1500 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1501 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1503 rc = range_lock(&lli->lli_write_tree, &range);
1507 range_locked = true;
1511 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1512 vio->u.splice.vui_flags = args->u.splice.via_flags;
1515 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1519 ll_cl_add(file, env, io, LCC_RW);
1520 rc = cl_io_loop(env, io);
1521 ll_cl_remove(file, env);
1524 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1526 range_unlock(&lli->lli_write_tree, &range);
1529 /* cl_io_rw_init() handled IO */
1533 if (io->ci_nob > 0) {
1534 result += io->ci_nob;
1535 count -= io->ci_nob;
1536 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1538 /* prepare IO restart */
1539 if (count > 0 && args->via_io_subtype == IO_NORMAL)
1540 args->u.normal.via_iter = vio->vui_iter;
1543 cl_io_fini(env, io);
1546 "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1547 file->f_path.dentry->d_name.name,
1548 iot, rc, result, io->ci_need_restart);
1550 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1552 "%s: restart %s from %lld, count: %zu, ret: %zd, rc: %d\n",
1553 file_dentry(file)->d_name.name,
1554 iot == CIT_READ ? "read" : "write",
1555 *ppos, count, result, rc);
1556 /* preserve the tried count for FLR */
1557 retried = io->ci_ndelay_tried;
1562 if (iot == CIT_READ) {
1564 ll_stats_ops_tally(ll_i2sbi(inode),
1565 LPROC_LL_READ_BYTES, result);
1566 } else if (iot == CIT_WRITE) {
1568 ll_stats_ops_tally(ll_i2sbi(inode),
1569 LPROC_LL_WRITE_BYTES, result);
1570 fd->fd_write_failed = false;
1571 } else if (result == 0 && rc == 0) {
1574 fd->fd_write_failed = true;
1576 fd->fd_write_failed = false;
1577 } else if (rc != -ERESTARTSYS) {
1578 fd->fd_write_failed = true;
1582 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1584 ll_heat_add(inode, iot, result);
1586 RETURN(result > 0 ? result : rc);
1590 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1591 * especially for small I/O.
1593 * To serve a read request, CLIO has to create and initialize a cl_io and
1594 * then request a DLM lock. This has turned out to have significant overhead
1595 * and affects the performance of small I/O dramatically.
1597 * It's not necessary to create a cl_io for each I/O. With the help of read
1598 * ahead, most of the pages being read are already in the memory cache and we can
1599 * read those pages directly because if the pages exist, the corresponding DLM
1600 * lock must also exist, so the page content must be valid.
1602 * In the fast read implementation, llite speculatively finds and reads pages
1603 * in memory cache. There are three scenarios for fast read:
1604 * - If the page exists and is uptodate, kernel VM will provide the data and
1605 * CLIO won't be intervened;
1606 * - If the page was brought into memory by read ahead, it will be exported
1607 * and read ahead parameters will be updated;
1608 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1609 * it will go back and invoke normal read, i.e., a cl_io will be created
1610 * and DLM lock will be requested.
1612 * POSIX compliance: posix standard states that read is intended to be atomic.
1613 * Lustre read implementation is in line with Linux kernel read implementation
1614 * and neither of them complies with POSIX standard in this matter. Fast read
1615 * doesn't make the situation worse on single node but it may interleave write
1616 * results from multiple nodes due to short read handling in ll_file_aio_read().
1618 * \param env - lu_env
1619 * \param iocb - kiocb from kernel
1620 * \param iter - user space buffers where the data will be copied
1622 * \retval - number of bytes read, or an error code if an error occurred. */
1625 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1629 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1632 /* NB: we can't do direct IO for fast read because it will need a lock
1633 * to make IO engine happy. */
1634 if (iocb->ki_filp->f_flags & O_DIRECT)
1637 result = generic_file_read_iter(iocb, iter);
1639 /* If the first page is not in cache, generic_file_aio_read() will
1640 * return -ENODATA.
1641 * See corresponding code in ll_readpage(). */
1642 if (result == -ENODATA)
1646 ll_heat_add(file_inode(iocb->ki_filp), CIT_READ, result);
1647 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1648 LPROC_LL_READ_BYTES, result);
1655 * Read from a file (through the page cache).
1657 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1660 struct vvp_io_args *args;
1666 if (!iov_iter_count(to))
1670 * Currently when a PCC read fails, we do not fall back to the
1671 * normal read path; we just return the error.
1672 * The reason is that for RW-PCC, the file data may be modified
1673 * in the PCC and inconsistent with the data on the OSTs (or the file
1674 * data may have been removed from the Lustre file system); at this
1675 * point, falling back to the normal read path may read the wrong data.
1677 * TODO: for RO-PCC (readonly PCC), fall back to the normal read
1678 * path: read data from the data copy on the OSTs. */
1680 result = pcc_file_read_iter(iocb, to, &cached);
1684 ll_ras_enter(iocb->ki_filp);
1686 result = ll_do_fast_read(iocb, to);
1687 if (result < 0 || iov_iter_count(to) == 0)
1690 env = cl_env_get(&refcheck);
1692 return PTR_ERR(env);
1694 args = ll_env_args(env, IO_NORMAL);
1695 args->u.normal.via_iter = to;
1696 args->u.normal.via_iocb = iocb;
1698 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1699 &iocb->ki_pos, iov_iter_count(to));
1702 else if (result == 0)
1705 cl_env_put(env, &refcheck);
1711 * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1712 * If a page is already in the page cache and dirty (and some other things -
1713 * See ll_tiny_write_begin for the instantiation of these rules), then we can
1714 * write to it without doing a full I/O, because Lustre already knows about it
1715 * and will write it out. This saves a lot of processing time.
1717 * All writes here are within one page, so exclusion is handled by the page
1718 * lock on the vm page. We do not do tiny writes for writes which touch
1719 * multiple pages because it's very unlikely multiple sequential pages are
1720 * already dirty.
1722 * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1723 * and are unlikely to hit already-dirty pages.
1725 * Attribute updates are important here, we do them in ll_tiny_write_end.
1727 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1729 ssize_t count = iov_iter_count(iter);
1730 struct file *file = iocb->ki_filp;
1731 struct inode *inode = file_inode(file);
1732 bool lock_inode = !IS_NOSEC(inode);
1737 /* Restrict writes to single page and < PAGE_SIZE. See comment at top
1738 * of function for why.
1740 if (count >= PAGE_SIZE ||
1741 (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
1744 if (unlikely(lock_inode))
1746 result = __generic_file_write_iter(iocb, iter);
1748 if (unlikely(lock_inode))
1749 inode_unlock(inode);
1751 /* If the page is not already dirty, ll_tiny_write_begin returns
1752 * -ENODATA. We continue on to normal write.
1754 if (result == -ENODATA)
1758 ll_heat_add(inode, CIT_WRITE, result);
1759 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1761 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1764 CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1770 * Write to a file (through the page cache).
1772 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1774 struct vvp_io_args *args;
1776 ssize_t rc_tiny = 0, rc_normal;
1783 if (!iov_iter_count(from))
1784 GOTO(out, rc_normal = 0);
1787 * When a PCC write fails, we usually do not fall back to the normal
1788 * write path; we just return the error. But there is a special case when
1789 * the returned error code is -ENOSPC due to running out of space on the PCC HSM
1790 * backend. In that case, we fall back to the normal I/O path and
1791 * retry the I/O. As the file is in the HSM released state, the write will restore
1792 * the file data to the OSTs first and then redo the write. The
1793 * restore process will revoke the layout lock and detach the file
1794 * from the PCC cache automatically. */
1796 result = pcc_file_write_iter(iocb, from, &cached);
1797 if (cached && result != -ENOSPC && result != -EDQUOT)
1800 /* NB: we can't do direct IO for tiny writes because they use the page
1801 * cache, we can't do sync writes because tiny writes can't flush
1802 * pages, and we can't do append writes because we can't guarantee the
1803 * required DLM locks are held to protect file size.
1805 if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(iocb->ki_filp))) &&
1806 !(iocb->ki_filp->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1807 rc_tiny = ll_do_tiny_write(iocb, from);
1809 /* In case of error, go on and try the normal write; only stop if the tiny
1810 * write completed the I/O. */
1812 if (iov_iter_count(from) == 0)
1813 GOTO(out, rc_normal = rc_tiny);
1815 env = cl_env_get(&refcheck);
1817 return PTR_ERR(env);
1819 args = ll_env_args(env, IO_NORMAL);
1820 args->u.normal.via_iter = from;
1821 args->u.normal.via_iocb = iocb;
1823 rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1824 &iocb->ki_pos, iov_iter_count(from));
1826 /* On success, combine bytes written. */
1827 if (rc_tiny >= 0 && rc_normal > 0)
1828 rc_normal += rc_tiny;
1829 /* On error, only return error from normal write if tiny write did not
1830 * write any bytes. Otherwise return bytes written by tiny write.
1832 else if (rc_tiny > 0)
1833 rc_normal = rc_tiny;
1835 cl_env_put(env, &refcheck);
1840 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1842 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1844 static int ll_file_get_iov_count(const struct iovec *iov,
1845 unsigned long *nr_segs, size_t *count)
1850 for (seg = 0; seg < *nr_segs; seg++) {
1851 const struct iovec *iv = &iov[seg];
1854 * If any segment has a negative length, or the cumulative
1855 * length ever wraps negative then return -EINVAL.
1858 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1860 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1865 cnt -= iv->iov_len; /* This segment is no good */
1872 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1873 unsigned long nr_segs, loff_t pos)
1880 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1887 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1888 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1889 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1890 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1891 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1893 result = ll_file_read_iter(iocb, &to);
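/*
 * read(2) entry point for kernels without ->read_iter(): wrap the user buffer
 * in an iovec and a synchronous kiocb and call ll_file_aio_read().
 */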
1898 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1901 struct iovec iov = { .iov_base = buf, .iov_len = count };
1910 init_sync_kiocb(&kiocb, file);
1911 kiocb.ki_pos = *ppos;
1912 #ifdef HAVE_KIOCB_KI_LEFT
1913 kiocb.ki_left = count;
1914 #elif defined(HAVE_KI_NBYTES)
1915 kiocb.ki_nbytes = count;
1918 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1919 *ppos = kiocb.ki_pos;
1925 * Write to a file (through the page cache).
1928 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1929 unsigned long nr_segs, loff_t pos)
1931 struct iov_iter from;
1936 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1943 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1944 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1945 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1946 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1947 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1949 result = ll_file_write_iter(iocb, &from);
1954 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1955 size_t count, loff_t *ppos)
1957 struct iovec iov = { .iov_base = (void __user *)buf,
1967 init_sync_kiocb(&kiocb, file);
1968 kiocb.ki_pos = *ppos;
1969 #ifdef HAVE_KIOCB_KI_LEFT
1970 kiocb.ki_left = count;
1971 #elif defined(HAVE_KI_NBYTES)
1972 kiocb.ki_nbytes = count;
1975 result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1976 *ppos = kiocb.ki_pos;
1980 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1983 * Send file content (through pagecache) somewhere with helper
1985 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1986 struct pipe_inode_info *pipe, size_t count,
1990 struct vvp_io_args *args;
1997 result = pcc_file_splice_read(in_file, ppos, pipe,
1998 count, flags, &cached);
2002 ll_ras_enter(in_file);
2004 env = cl_env_get(&refcheck);
2006 RETURN(PTR_ERR(env));
2008 args = ll_env_args(env, IO_SPLICE);
2009 args->u.splice.via_pipe = pipe;
2010 args->u.splice.via_flags = flags;
2012 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
2013 cl_env_put(env, &refcheck);
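/*
 * Apply the user-supplied layout @lum to the file by re-opening it by FID with
 * the layout attached to the intent, then release the resulting open handle.
 */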
2017 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
2018 __u64 flags, struct lov_user_md *lum, int lum_size)
2020 struct lookup_intent oit = {
2022 .it_flags = flags | MDS_OPEN_BY_FID,
2027 if ((__swab32(lum->lmm_magic) & le32_to_cpu(LOV_MAGIC_MASK)) ==
2028 le32_to_cpu(LOV_MAGIC_MAGIC)) {
2029 /* this code will only exist for big-endian systems */
2030 lustre_swab_lov_user_md(lum);
2033 ll_inode_size_lock(inode);
2034 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
2036 GOTO(out_unlock, rc);
2038 ll_release_openhandle(dentry, &oit);
2041 ll_inode_size_unlock(inode);
2042 ll_intent_release(&oit);
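/*
 * Fetch the LOV EA of @filename from the MDS via md_getattr_name() and, if
 * needed, swab it to host byte order before returning it to the caller.
 */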
2047 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
2048 struct lov_mds_md **lmmp, int *lmm_size,
2049 struct ptlrpc_request **request)
2051 struct ll_sb_info *sbi = ll_i2sbi(inode);
2052 struct mdt_body *body;
2053 struct lov_mds_md *lmm = NULL;
2054 struct ptlrpc_request *req = NULL;
2055 struct md_op_data *op_data;
2058 rc = ll_get_default_mdsize(sbi, &lmmsize);
2062 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
2063 strlen(filename), lmmsize,
2064 LUSTRE_OPC_ANY, NULL);
2065 if (IS_ERR(op_data))
2066 RETURN(PTR_ERR(op_data));
2068 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
2069 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
2070 ll_finish_md_op_data(op_data);
2072 CDEBUG(D_INFO, "md_getattr_name failed "
2073 "on %s: rc %d\n", filename, rc);
2077 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2078 LASSERT(body != NULL); /* checked by mdc_getattr_name */
2080 lmmsize = body->mbo_eadatasize;
2082 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
2084 GOTO(out, rc = -ENODATA);
2087 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
2088 LASSERT(lmm != NULL);
2090 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
2091 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
2092 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1) &&
2093 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_FOREIGN))
2094 GOTO(out, rc = -EPROTO);
2097 * This is coming from the MDS, so is probably in
2098 * little endian. We convert it to host endian before
2099 * passing it to userspace.
2101 if ((lmm->lmm_magic & __swab32(LOV_MAGIC_MAGIC)) ==
2102 __swab32(LOV_MAGIC_MAGIC)) {
2103 int stripe_count = 0;
2105 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
2106 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2107 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
2108 if (le32_to_cpu(lmm->lmm_pattern) &
2109 LOV_PATTERN_F_RELEASED)
2113 lustre_swab_lov_user_md((struct lov_user_md *)lmm);
2115 /* if the function is called for a directory, we should
2116 * avoid swabbing non-existent lsm objects */
2117 if (lmm->lmm_magic == LOV_MAGIC_V1 && S_ISREG(body->mbo_mode))
2118 lustre_swab_lov_user_md_objects(
2119 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
2121 else if (lmm->lmm_magic == LOV_MAGIC_V3 &&
2122 S_ISREG(body->mbo_mode))
2123 lustre_swab_lov_user_md_objects(
2124 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
2130 *lmm_size = lmmsize;
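/*
 * Set a layout that already names its OST objects (MDS_OPEN_HAS_OBJS); admin
 * only. Typically reached via the LL_IOC_LOV_SETEA ioctl.
 */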
2135 static int ll_lov_setea(struct inode *inode, struct file *file,
2138 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2139 struct lov_user_md *lump;
2140 int lum_size = sizeof(struct lov_user_md) +
2141 sizeof(struct lov_user_ost_data);
2145 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2148 OBD_ALLOC_LARGE(lump, lum_size);
2152 if (copy_from_user(lump, arg, lum_size))
2153 GOTO(out_lump, rc = -EFAULT);
2155 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
2157 cl_lov_delay_create_clear(&file->f_flags);
2160 OBD_FREE_LARGE(lump, lum_size);
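/*
 * Copy the file layout into the user-supplied buffer via cl_object_getstripe().
 */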
2164 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2171 env = cl_env_get(&refcheck);
2173 RETURN(PTR_ERR(env));
2175 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2176 cl_env_put(env, &refcheck);
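/*
 * Set the file layout from a user lov_user_md: apply it, refresh the layout
 * generation, and copy the instantiated layout back to the user buffer.
 */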
2180 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2183 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2184 struct lov_user_md *klum;
2186 __u64 flags = FMODE_WRITE;
2189 rc = ll_copy_user_md(lum, &klum);
2194 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
2199 rc = put_user(0, &lum->lmm_stripe_count);
2203 rc = ll_layout_refresh(inode, &gen);
2207 rc = ll_file_getstripe(inode, arg, lum_size);
2209 cl_lov_delay_create_clear(&file->f_flags);
2212 OBD_FREE_LARGE(klum, lum_size);
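/*
 * Take a group lock with group id @arg on the file. For composite (PFL)
 * layouts the whole layout is instantiated first so that every OST object is
 * covered. Typically reached via the LL_IOC_GROUP_LOCK ioctl.
 */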
2217 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2219 struct ll_inode_info *lli = ll_i2info(inode);
2220 struct cl_object *obj = lli->lli_clob;
2221 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2222 struct ll_grouplock grouplock;
2227 CWARN("group id for group lock must not be 0\n");
2231 if (ll_file_nolock(file))
2232 RETURN(-EOPNOTSUPP);
2234 spin_lock(&lli->lli_lock);
2235 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2236 CWARN("group lock already existed with gid %lu\n",
2237 fd->fd_grouplock.lg_gid);
2238 spin_unlock(&lli->lli_lock);
2241 LASSERT(fd->fd_grouplock.lg_lock == NULL);
2242 spin_unlock(&lli->lli_lock);
2245 * XXX: the group lock needs to protect all OST objects while PFL
2246 * can add new OST objects during the IO, so we instantiate
2247 * all OST objects before taking the group lock.
2252 struct cl_layout cl = {
2253 .cl_is_composite = false,
2255 struct lu_extent ext = {
2257 .e_end = OBD_OBJECT_EOF,
2260 env = cl_env_get(&refcheck);
2262 RETURN(PTR_ERR(env));
2264 rc = cl_object_layout_get(env, obj, &cl);
2265 if (!rc && cl.cl_is_composite)
2266 rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2269 cl_env_put(env, &refcheck);
2274 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2275 arg, (file->f_flags & O_NONBLOCK), &grouplock);
2279 spin_lock(&lli->lli_lock);
2280 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2281 spin_unlock(&lli->lli_lock);
2282 CERROR("another thread just won the race\n");
2283 cl_put_grouplock(&grouplock);
2287 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2288 fd->fd_grouplock = grouplock;
2289 spin_unlock(&lli->lli_lock);
2291 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
2295 static int ll_put_grouplock(struct inode *inode, struct file *file,
2298 struct ll_inode_info *lli = ll_i2info(inode);
2299 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2300 struct ll_grouplock grouplock;
2303 spin_lock(&lli->lli_lock);
2304 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2305 spin_unlock(&lli->lli_lock);
2306 CWARN("no group lock held\n");
2310 LASSERT(fd->fd_grouplock.lg_lock != NULL);
2312 if (fd->fd_grouplock.lg_gid != arg) {
2313 CWARN("group lock %lu doesn't match current id %lu\n",
2314 arg, fd->fd_grouplock.lg_gid);
2315 spin_unlock(&lli->lli_lock);
2319 grouplock = fd->fd_grouplock;
2320 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2321 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2322 spin_unlock(&lli->lli_lock);
2324 cl_put_grouplock(&grouplock);
2325 CDEBUG(D_INFO, "group lock %lu released\n", arg);
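/*
 * Context note (illustrative sketch, not part of the original source): from
 * user space a group lock is taken and dropped with the LL_IOC_GROUP_LOCK
 * and LL_IOC_GROUP_UNLOCK ioctls, which end up in ll_get_grouplock() and
 * ll_put_grouplock() above; the group id is passed as the ioctl argument:
 *
 *	unsigned long gid = 1234;	// any non-zero, application-chosen id
 *	ioctl(fd, LL_IOC_GROUP_LOCK, gid);
 *	... shared I/O among all holders of gid 1234 ...
 *	ioctl(fd, LL_IOC_GROUP_UNLOCK, gid);
 */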
2330 * Close inode open handle
2332 * \param dentry [in] dentry which contains the inode
2333 * \param it [in,out] intent which contains open info and result
2336 * \retval <0 failure
2338 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2340 struct inode *inode = dentry->d_inode;
2341 struct obd_client_handle *och;
2347 /* Root ? Do nothing. */
2348 if (dentry->d_inode->i_sb->s_root == dentry)
2351 /* No open handle to close? Move away */
2352 if (!it_disposition(it, DISP_OPEN_OPEN))
2355 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2357 OBD_ALLOC(och, sizeof(*och));
2359 GOTO(out, rc = -ENOMEM);
2361 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2363 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2365 /* this one is in place of ll_file_open */
2366 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2367 ptlrpc_req_finished(it->it_request);
2368 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2374 * Get the size of the inode for which the FIEMAP mapping is requested.
2375 * Make the FIEMAP get_info call and return the result.
2376 * \param fiemap kernel buffer to hold the extents
2377 * \param num_bytes kernel buffer size
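/*
 * A minimal sizing sketch (not part of the original source): the caller
 * allocates one buffer that holds both the fiemap header and the extent
 * array, and passes its total size as num_bytes; see ll_fiemap() below for
 * the real in-tree usage.
 *
 *	num_bytes = sizeof(*fiemap) +
 *		    extent_count * sizeof(struct fiemap_extent);
 *	OBD_ALLOC_LARGE(fiemap, num_bytes);
 *	fiemap->fm_start = start;
 *	fiemap->fm_length = len;
 *	fiemap->fm_extent_count = extent_count;
 *	rc = ll_do_fiemap(inode, fiemap, num_bytes);
 */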
2379 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2385 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2388 /* Checks for fiemap flags */
2389 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2390 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2394 /* Check for FIEMAP_FLAG_SYNC */
2395 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2396 rc = filemap_fdatawrite(inode->i_mapping);
2401 env = cl_env_get(&refcheck);
2403 RETURN(PTR_ERR(env));
2405 if (i_size_read(inode) == 0) {
2406 rc = ll_glimpse_size(inode);
2411 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2412 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2413 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2415 /* If the file size is 0, then there are no objects to map */
2416 if (fmkey.lfik_oa.o_size == 0) {
2417 fiemap->fm_mapped_extents = 0;
2421 fmkey.lfik_fiemap = *fiemap;
2423 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2424 &fmkey, fiemap, &num_bytes);
2426 cl_env_put(env, &refcheck);
2430 int ll_fid2path(struct inode *inode, void __user *arg)
2432 struct obd_export *exp = ll_i2mdexp(inode);
2433 const struct getinfo_fid2path __user *gfin = arg;
2435 struct getinfo_fid2path *gfout;
2441 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2442 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2445 /* Only need to get the buflen */
2446 if (get_user(pathlen, &gfin->gf_pathlen))
2449 if (pathlen > PATH_MAX)
2452 outsize = sizeof(*gfout) + pathlen;
2453 OBD_ALLOC(gfout, outsize);
2457 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2458 GOTO(gf_free, rc = -EFAULT);
2459 /* append the root FID after gfout to let the MDT know the root FID so
2460 * that it can look up the correct path; this is mainly for filesets.
2461 * Old servers without fileset mount support will ignore this. */
2462 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2464 /* Call mdc_iocontrol */
2465 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2469 if (copy_to_user(arg, gfout, outsize))
2473 OBD_FREE(gfout, outsize);
2478 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2480 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2488 ioc->idv_version = 0;
2489 ioc->idv_layout_version = UINT_MAX;
2491 /* If no file object is initialized, we consider its version to be 0. */
2495 env = cl_env_get(&refcheck);
2497 RETURN(PTR_ERR(env));
2499 io = vvp_env_thread_io(env);
2501 io->u.ci_data_version.dv_data_version = 0;
2502 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2503 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2506 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2507 result = cl_io_loop(env, io);
2509 result = io->ci_result;
2511 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2512 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2514 cl_io_fini(env, io);
2516 if (unlikely(io->ci_need_restart))
2519 cl_env_put(env, &refcheck);
2525 * Read the data_version for the inode.
2527 * This value is computed using the stripe object versions on the OSTs.
2528 * The version is computed using server-side locking.
2530 * @param flags whether to sync on the OST side:
2532 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2533 * LL_DV_WR_FLUSH: drop all cached pages, LCK_PW on OSTs
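/*
 * A minimal usage sketch (not part of the original source):
 *
 *	__u64 dv = 0;
 *	int rc;
 *
 *	// flush dirty pages first so the version reflects data on the OSTs
 *	rc = ll_data_version(inode, &dv, LL_DV_RD_FLUSH);
 *	if (rc == 0)
 *		CDEBUG(D_INODE, "data version %llu\n", dv);
 */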
2535 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2537 struct ioc_data_version ioc = { .idv_flags = flags };
2540 rc = ll_ioc_data_version(inode, &ioc);
2542 *data_version = ioc.idv_version;
2548 * Trigger a HSM release request for the provided inode.
2550 int ll_hsm_release(struct inode *inode)
2553 struct obd_client_handle *och = NULL;
2554 __u64 data_version = 0;
2559 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2560 ll_i2sbi(inode)->ll_fsname,
2561 PFID(&ll_i2info(inode)->lli_fid));
2563 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2565 GOTO(out, rc = PTR_ERR(och));
2567 /* Grab latest data_version and [am]time values */
2568 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2572 env = cl_env_get(&refcheck);
2574 GOTO(out, rc = PTR_ERR(env));
2576 rc = ll_merge_attr(env, inode);
2577 cl_env_put(env, &refcheck);
2579 /* If an error happens, we have the wrong size for the file.
2585 /* Release the file.
2586 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2587 * we still need it to pack l_remote_handle to MDT. */
2588 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2594 if (och != NULL && !IS_ERR(och)) /* close the file */
2595 ll_lease_close(och, inode, NULL);
2600 struct ll_swap_stack {
2603 struct inode *inode1;
2604 struct inode *inode2;
2609 static int ll_swap_layouts(struct file *file1, struct file *file2,
2610 struct lustre_swap_layouts *lsl)
2612 struct mdc_swap_layouts msl;
2613 struct md_op_data *op_data;
2616 struct ll_swap_stack *llss = NULL;
2619 OBD_ALLOC_PTR(llss);
2623 llss->inode1 = file_inode(file1);
2624 llss->inode2 = file_inode(file2);
2626 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2630 /* we use 2 bools because they are easier to swap than 2 bits */
2631 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2632 llss->check_dv1 = true;
2634 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2635 llss->check_dv2 = true;
2637 /* we cannot use lsl->sl_dvX directly because we may swap them */
2638 llss->dv1 = lsl->sl_dv1;
2639 llss->dv2 = lsl->sl_dv2;
2641 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2642 if (rc == 0) /* same file, done! */
2645 if (rc < 0) { /* sequentialize it */
2646 swap(llss->inode1, llss->inode2);
2648 swap(llss->dv1, llss->dv2);
2649 swap(llss->check_dv1, llss->check_dv2);
2653 if (gid != 0) { /* application asks to flush dirty cache */
2654 rc = ll_get_grouplock(llss->inode1, file1, gid);
2658 rc = ll_get_grouplock(llss->inode2, file2, gid);
2660 ll_put_grouplock(llss->inode1, file1, gid);
2665 /* ultimate check, before swapping the layouts we check if the
2666 * data version has changed (if requested) */
2667 if (llss->check_dv1) {
2668 rc = ll_data_version(llss->inode1, &dv, 0);
2671 if (dv != llss->dv1)
2672 GOTO(putgl, rc = -EAGAIN);
2675 if (llss->check_dv2) {
2676 rc = ll_data_version(llss->inode2, &dv, 0);
2679 if (dv != llss->dv2)
2680 GOTO(putgl, rc = -EAGAIN);
2683 /* struct md_op_data is used to send the swap args to the mdt;
2684 * only the flags are missing, so we pass struct mdc_swap_layouts
2685 * through md_op_data->op_data */
2686 /* flags from user space have to be converted before they are sent to
2687 * the server; no flags are sent today, they are only used on the client */
2690 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2691 0, LUSTRE_OPC_ANY, &msl);
2692 if (IS_ERR(op_data))
2693 GOTO(free, rc = PTR_ERR(op_data));
2695 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2696 sizeof(*op_data), op_data, NULL);
2697 ll_finish_md_op_data(op_data);
2704 ll_put_grouplock(llss->inode2, file2, gid);
2705 ll_put_grouplock(llss->inode1, file1, gid);
2715 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2717 struct obd_export *exp = ll_i2mdexp(inode);
2718 struct md_op_data *op_data;
2722 /* Detect out-of-range masks */
2723 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2726 /* Non-root users are forbidden to set or clear flags which are
2727 * NOT defined in HSM_USER_MASK. */
2728 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2729 !cfs_capable(CFS_CAP_SYS_ADMIN))
2732 if (!exp_connect_archive_id_array(exp)) {
2733 /* Detect out-of-range archive id */
2734 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2735 (hss->hss_archive_id > LL_HSM_ORIGIN_MAX_ARCHIVE))
2739 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2740 LUSTRE_OPC_ANY, hss);
2741 if (IS_ERR(op_data))
2742 RETURN(PTR_ERR(op_data));
2744 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, exp, sizeof(*op_data),
2747 ll_finish_md_op_data(op_data);
2752 static int ll_hsm_import(struct inode *inode, struct file *file,
2753 struct hsm_user_import *hui)
2755 struct hsm_state_set *hss = NULL;
2756 struct iattr *attr = NULL;
2760 if (!S_ISREG(inode->i_mode))
2766 GOTO(out, rc = -ENOMEM);
2768 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2769 hss->hss_archive_id = hui->hui_archive_id;
2770 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2771 rc = ll_hsm_state_set(inode, hss);
2775 OBD_ALLOC_PTR(attr);
2777 GOTO(out, rc = -ENOMEM);
2779 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2780 attr->ia_mode |= S_IFREG;
2781 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2782 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2783 attr->ia_size = hui->hui_size;
2784 attr->ia_mtime.tv_sec = hui->hui_mtime;
2785 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2786 attr->ia_atime.tv_sec = hui->hui_atime;
2787 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2789 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2790 ATTR_UID | ATTR_GID |
2791 ATTR_MTIME | ATTR_MTIME_SET |
2792 ATTR_ATIME | ATTR_ATIME_SET;
2796 rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2800 inode_unlock(inode);
2812 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2814 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2815 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2818 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2820 struct inode *inode = file_inode(file);
2822 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2823 ATTR_MTIME | ATTR_MTIME_SET |
2826 .tv_sec = lfu->lfu_atime_sec,
2827 .tv_nsec = lfu->lfu_atime_nsec,
2830 .tv_sec = lfu->lfu_mtime_sec,
2831 .tv_nsec = lfu->lfu_mtime_nsec,
2834 .tv_sec = lfu->lfu_ctime_sec,
2835 .tv_nsec = lfu->lfu_ctime_nsec,
2841 if (!capable(CAP_SYS_ADMIN))
2844 if (!S_ISREG(inode->i_mode))
2848 rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2850 inode_unlock(inode);
2855 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2858 case MODE_READ_USER:
2860 case MODE_WRITE_USER:
2867 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2869 /* Used to allow the upper layers of the client to request an LDLM lock
2870 * without doing an actual read or write.
2872 * Used for ladvise lockahead to manually request specific locks.
2874 * \param[in] file file this ladvise lock request is on
2875 * \param[in] ladvise ladvise struct describing this lock request
2877 * \retval 0 success, no detailed result available (sync requests
2878 * and requests sent to the server [not handled locally]
2879 * cannot return detailed results)
2880 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2881 * see definitions for details.
2882 * \retval negative negative errno on error
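/*
 * A minimal request sketch (not part of the original source); the mode,
 * flags and byte range below are purely illustrative:
 *
 *	struct llapi_lu_ladvise ladvise = {
 *		.lla_advice          = LU_LADVISE_LOCKAHEAD,
 *		.lla_lockahead_mode  = MODE_WRITE_USER,
 *		.lla_peradvice_flags = LF_ASYNC,
 *		.lla_start           = 0,
 *		.lla_end             = (1 << 20) - 1,
 *	};
 *
 *	result = ll_file_lock_ahead(file, &ladvise);
 */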
2884 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2886 struct lu_env *env = NULL;
2887 struct cl_io *io = NULL;
2888 struct cl_lock *lock = NULL;
2889 struct cl_lock_descr *descr = NULL;
2890 struct dentry *dentry = file->f_path.dentry;
2891 struct inode *inode = dentry->d_inode;
2892 enum cl_lock_mode cl_mode;
2893 off_t start = ladvise->lla_start;
2894 off_t end = ladvise->lla_end;
2900 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2901 "start=%llu, end=%llu\n", dentry->d_name.len,
2902 dentry->d_name.name, dentry->d_inode,
2903 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2906 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2908 GOTO(out, result = cl_mode);
2910 /* Get IO environment */
2911 result = cl_io_get(inode, &env, &io, &refcheck);
2915 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2918 * nothing to do for this io. This currently happens when
2919 * stripe sub-objects are not yet created.
2921 result = io->ci_result;
2922 } else if (result == 0) {
2923 lock = vvp_env_lock(env);
2924 descr = &lock->cll_descr;
2926 descr->cld_obj = io->ci_obj;
2927 /* Convert byte offsets to pages */
2928 descr->cld_start = cl_index(io->ci_obj, start);
2929 descr->cld_end = cl_index(io->ci_obj, end);
2930 descr->cld_mode = cl_mode;
2931 /* CEF_MUST is used because we do not want to convert a
2932 * lockahead request to a lockless lock */
2933 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2936 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2937 descr->cld_enq_flags |= CEF_SPECULATIVE;
2939 result = cl_lock_request(env, io, lock);
2941 /* On success, we need to release the lock */
2943 cl_lock_release(env, lock);
2945 cl_io_fini(env, io);
2946 cl_env_put(env, &refcheck);
2948 /* -ECANCELED indicates a matching lock with a different extent
2949 * was already present, and -EEXIST indicates a matching lock
2950 * on exactly the same extent was already present.
2951 * We convert them to positive values for userspace to make
2952 * recognizing true errors easier.
2953 * Note we can only return these detailed results on async requests,
2954 * as sync requests look the same as i/o requests for locking. */
2955 if (result == -ECANCELED)
2956 result = LLA_RESULT_DIFFERENT;
2957 else if (result == -EEXIST)
2958 result = LLA_RESULT_SAME;
2963 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
2965 static int ll_ladvise_sanity(struct inode *inode,
2966 struct llapi_lu_ladvise *ladvise)
2968 struct ll_sb_info *sbi = ll_i2sbi(inode);
2969 enum lu_ladvise_type advice = ladvise->lla_advice;
2970 /* Note the per-advice flags field is 32 bits wide, so per-advice flags
2971 * must be in the first 32 bits of enum ladvise_flags */
2972 __u32 flags = ladvise->lla_peradvice_flags;
2973 /* 3 lines at 80 characters per line, should be plenty */
2976 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2978 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized, "
2979 "last supported advice is %s (value '%d'): rc = %d\n",
2980 sbi->ll_fsname, advice,
2981 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2985 /* Per-advice checks */
2987 case LU_LADVISE_LOCKNOEXPAND:
2988 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2990 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2991 "rc = %d\n", sbi->ll_fsname, flags,
2992 ladvise_names[advice], rc);
2996 case LU_LADVISE_LOCKAHEAD:
2997 /* Currently only READ and WRITE modes can be requested */
2998 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2999 ladvise->lla_lockahead_mode == 0) {
3001 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
3002 "rc = %d\n", sbi->ll_fsname,
3003 ladvise->lla_lockahead_mode,
3004 ladvise_names[advice], rc);
3007 case LU_LADVISE_WILLREAD:
3008 case LU_LADVISE_DONTNEED:
3010 /* Note fall through above - These checks apply to all advices
3011 * except LOCKNOEXPAND */
3012 if (flags & ~LF_DEFAULT_MASK) {
3014 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
3015 "rc = %d\n", sbi->ll_fsname, flags,
3016 ladvise_names[advice], rc);
3019 if (ladvise->lla_start >= ladvise->lla_end) {
3021 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
3022 "for %s: rc = %d\n", sbi->ll_fsname,
3023 ladvise->lla_start, ladvise->lla_end,
3024 ladvise_names[advice], rc);
3036 * Give file access advice
3038 * The ladvise interface is similar to the Linux fadvise() system call, except
3039 * that it forwards the advice directly from the Lustre client to the server.
3040 * The server-side code will apply appropriate read-ahead and caching
3041 * techniques to the corresponding files.
3043 * A typical workload for ladvise is, e.g., a bunch of different clients doing
3044 * small random reads of a file, so prefetching pages into the OSS cache
3045 * with big linear reads before the random IO is a net benefit. Fetching
3046 * all that data into each client's cache with fadvise() may not be, because
3047 * much more data would be sent to the client.
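/*
 * A minimal advice sketch (not part of the original source); the WILLREAD
 * range is illustrative, and "lah_flags" stands for the header flags passed
 * down from the LL_IOC_LADVISE handler:
 *
 *	struct llapi_lu_ladvise ladvise = {
 *		.lla_advice = LU_LADVISE_WILLREAD,
 *		.lla_start  = 0,
 *		.lla_end    = 16 << 20,		// prefetch the first 16MiB
 *	};
 *
 *	rc = ll_ladvise(inode, file, lah_flags, &ladvise);
 */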
3049 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
3050 struct llapi_lu_ladvise *ladvise)
3054 struct cl_ladvise_io *lio;
3059 env = cl_env_get(&refcheck);
3061 RETURN(PTR_ERR(env));
3063 io = vvp_env_thread_io(env);
3064 io->ci_obj = ll_i2info(inode)->lli_clob;
3066 /* initialize parameters for ladvise */
3067 lio = &io->u.ci_ladvise;
3068 lio->li_start = ladvise->lla_start;
3069 lio->li_end = ladvise->lla_end;
3070 lio->li_fid = ll_inode2fid(inode);
3071 lio->li_advice = ladvise->lla_advice;
3072 lio->li_flags = flags;
3074 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
3075 rc = cl_io_loop(env, io);
3079 cl_io_fini(env, io);
3080 cl_env_put(env, &refcheck);
3084 static int ll_lock_noexpand(struct file *file, int flags)
3086 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3088 fd->ll_lock_no_expand = !(flags & LF_UNSET);
3093 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
3096 struct fsxattr fsxattr;
3098 if (copy_from_user(&fsxattr,
3099 (const struct fsxattr __user *)arg,
3103 fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
3104 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
3105 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
3106 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
3107 if (copy_to_user((struct fsxattr __user *)arg,
3108 &fsxattr, sizeof(fsxattr)))
3114 int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
3117 * Project Quota ID state is only allowed to change from within the init
3118 * namespace. Enforce that restriction only if we are trying to change
3119 * the quota ID state. Everything else is allowed in user namespaces.
3121 if (current_user_ns() == &init_user_ns)
3124 if (ll_i2info(inode)->lli_projid != fa->fsx_projid)
3127 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) {
3128 if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
3131 if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
3138 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3142 struct md_op_data *op_data;
3143 struct ptlrpc_request *req = NULL;
3145 struct fsxattr fsxattr;
3146 struct cl_object *obj;
3150 if (copy_from_user(&fsxattr,
3151 (const struct fsxattr __user *)arg,
3155 rc = ll_ioctl_check_project(inode, &fsxattr);
3159 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3160 LUSTRE_OPC_ANY, NULL);
3161 if (IS_ERR(op_data))
3162 RETURN(PTR_ERR(op_data));
3164 flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3165 op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3166 if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3167 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3168 op_data->op_projid = fsxattr.fsx_projid;
3169 op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
3170 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3172 ptlrpc_req_finished(req);
3174 GOTO(out_fsxattr, rc);
3175 ll_update_inode_flags(inode, op_data->op_attr_flags);
3176 obj = ll_i2info(inode)->lli_clob;
3178 GOTO(out_fsxattr, rc);
3180 OBD_ALLOC_PTR(attr);
3182 GOTO(out_fsxattr, rc = -ENOMEM);
3184 rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3185 fsxattr.fsx_xflags);
3188 ll_finish_md_op_data(op_data);
3192 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3195 struct inode *inode = file_inode(file);
3196 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3197 struct ll_inode_info *lli = ll_i2info(inode);
3198 struct obd_client_handle *och = NULL;
3199 struct split_param sp;
3200 struct pcc_param param;
3201 bool lease_broken = false;
3203 enum mds_op_bias bias = 0;
3204 struct file *layout_file = NULL;
3206 size_t data_size = 0;
3207 bool attached = false;
3212 mutex_lock(&lli->lli_och_mutex);
3213 if (fd->fd_lease_och != NULL) {
3214 och = fd->fd_lease_och;
3215 fd->fd_lease_och = NULL;
3217 mutex_unlock(&lli->lli_och_mutex);
3222 fmode = och->och_flags;
3224 switch (ioc->lil_flags) {
3225 case LL_LEASE_RESYNC_DONE:
3226 if (ioc->lil_count > IOC_IDS_MAX)
3227 GOTO(out_lease_close, rc = -EINVAL);
3229 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3230 OBD_ALLOC(data, data_size);
3232 GOTO(out_lease_close, rc = -ENOMEM);
3234 if (copy_from_user(data, (void __user *)arg, data_size))
3235 GOTO(out_lease_close, rc = -EFAULT);
3237 bias = MDS_CLOSE_RESYNC_DONE;
3239 case LL_LEASE_LAYOUT_MERGE: {
3242 if (ioc->lil_count != 1)
3243 GOTO(out_lease_close, rc = -EINVAL);
3245 arg += sizeof(*ioc);
3246 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3247 GOTO(out_lease_close, rc = -EFAULT);
3249 layout_file = fget(fd);
3251 GOTO(out_lease_close, rc = -EBADF);
3253 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3254 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3255 GOTO(out_lease_close, rc = -EPERM);
3257 data = file_inode(layout_file);
3258 bias = MDS_CLOSE_LAYOUT_MERGE;
3261 case LL_LEASE_LAYOUT_SPLIT: {
3265 if (ioc->lil_count != 2)
3266 GOTO(out_lease_close, rc = -EINVAL);
3268 arg += sizeof(*ioc);
3269 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3270 GOTO(out_lease_close, rc = -EFAULT);
3272 arg += sizeof(__u32);
3273 if (copy_from_user(&mirror_id, (void __user *)arg,
3275 GOTO(out_lease_close, rc = -EFAULT);
3277 layout_file = fget(fdv);
3279 GOTO(out_lease_close, rc = -EBADF);
3281 sp.sp_inode = file_inode(layout_file);
3282 sp.sp_mirror_id = (__u16)mirror_id;
3284 bias = MDS_CLOSE_LAYOUT_SPLIT;
3287 case LL_LEASE_PCC_ATTACH:
3288 if (ioc->lil_count != 1)
3291 arg += sizeof(*ioc);
3292 if (copy_from_user(¶m.pa_archive_id, (void __user *)arg,
3294 GOTO(out_lease_close, rc2 = -EFAULT);
3296 rc2 = pcc_readwrite_attach(file, inode, param.pa_archive_id);
3298 GOTO(out_lease_close, rc2);
3301 /* Grab latest data version */
3302 rc2 = ll_data_version(inode, ¶m.pa_data_version,
3305 GOTO(out_lease_close, rc2);
3308 bias = MDS_PCC_ATTACH;
3311 /* without close intent */
3316 rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3320 rc = ll_lease_och_release(inode, file);
3329 switch (ioc->lil_flags) {
3330 case LL_LEASE_RESYNC_DONE:
3332 OBD_FREE(data, data_size);
3334 case LL_LEASE_LAYOUT_MERGE:
3335 case LL_LEASE_LAYOUT_SPLIT:
3339 case LL_LEASE_PCC_ATTACH:
3342 rc = pcc_readwrite_attach_fini(file, inode,
3343 param.pa_layout_gen,
3350 rc = ll_lease_type_from_fmode(fmode);
3354 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3357 struct inode *inode = file_inode(file);
3358 struct ll_inode_info *lli = ll_i2info(inode);
3359 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3360 struct obd_client_handle *och = NULL;
3361 __u64 open_flags = 0;
3367 switch (ioc->lil_mode) {
3368 case LL_LEASE_WRLCK:
3369 if (!(file->f_mode & FMODE_WRITE))
3371 fmode = FMODE_WRITE;
3373 case LL_LEASE_RDLCK:
3374 if (!(file->f_mode & FMODE_READ))
3378 case LL_LEASE_UNLCK:
3379 RETURN(ll_file_unlock_lease(file, ioc, arg));
3384 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3386 /* apply for lease */
3387 if (ioc->lil_flags & LL_LEASE_RESYNC)
3388 open_flags = MDS_OPEN_RESYNC;
3389 och = ll_lease_open(inode, file, fmode, open_flags);
3391 RETURN(PTR_ERR(och));
3393 if (ioc->lil_flags & LL_LEASE_RESYNC) {
3394 rc = ll_lease_file_resync(och, inode, arg);
3396 ll_lease_close(och, inode, NULL);
3399 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3401 ll_lease_close(och, inode, NULL);
3407 mutex_lock(&lli->lli_och_mutex);
3408 if (fd->fd_lease_och == NULL) {
3409 fd->fd_lease_och = och;
3412 mutex_unlock(&lli->lli_och_mutex);
3414 /* impossible for now because only exclusive leases are supported */
3415 ll_lease_close(och, inode, &lease_broken);
3421 static void ll_heat_get(struct inode *inode, struct lu_heat *heat)
3423 struct ll_inode_info *lli = ll_i2info(inode);
3424 struct ll_sb_info *sbi = ll_i2sbi(inode);
3425 __u64 now = ktime_get_real_seconds();
3428 spin_lock(&lli->lli_heat_lock);
3429 heat->lh_flags = lli->lli_heat_flags;
3430 for (i = 0; i < heat->lh_count; i++)
3431 heat->lh_heat[i] = obd_heat_get(&lli->lli_heat_instances[i],
3432 now, sbi->ll_heat_decay_weight,
3433 sbi->ll_heat_period_second);
3434 spin_unlock(&lli->lli_heat_lock);
3437 static int ll_heat_set(struct inode *inode, enum lu_heat_flag flags)
3439 struct ll_inode_info *lli = ll_i2info(inode);
3442 spin_lock(&lli->lli_heat_lock);
3443 if (flags & LU_HEAT_FLAG_CLEAR)
3444 obd_heat_clear(lli->lli_heat_instances, OBD_HEAT_COUNT);
3446 if (flags & LU_HEAT_FLAG_OFF)
3447 lli->lli_heat_flags |= LU_HEAT_FLAG_OFF;
3449 lli->lli_heat_flags &= ~LU_HEAT_FLAG_OFF;
3451 spin_unlock(&lli->lli_heat_lock);
3457 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3459 struct inode *inode = file_inode(file);
3460 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3464 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3465 PFID(ll_inode2fid(inode)), inode, cmd);
3466 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3468 /* asm-ppc{,64} declares TCGETS, et al. as type 't' not 'T' */
3469 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3473 case LL_IOC_GETFLAGS:
3474 /* Get the current value of the file flags */
3475 return put_user(fd->fd_flags, (int __user *)arg);
3476 case LL_IOC_SETFLAGS:
3477 case LL_IOC_CLRFLAGS:
3478 /* Set or clear specific file flags */
3479 /* XXX This probably needs checks to ensure the flags are
3480 * not abused, and to handle any flag side effects.
3482 if (get_user(flags, (int __user *) arg))
3485 if (cmd == LL_IOC_SETFLAGS) {
3486 if ((flags & LL_FILE_IGNORE_LOCK) &&
3487 !(file->f_flags & O_DIRECT)) {
3488 CERROR("%s: unable to disable locking on "
3489 "non-O_DIRECT file\n", current->comm);
3493 fd->fd_flags |= flags;
3495 fd->fd_flags &= ~flags;
3498 case LL_IOC_LOV_SETSTRIPE:
3499 case LL_IOC_LOV_SETSTRIPE_NEW:
3500 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3501 case LL_IOC_LOV_SETEA:
3502 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3503 case LL_IOC_LOV_SWAP_LAYOUTS: {
3505 struct lustre_swap_layouts lsl;
3507 if (copy_from_user(&lsl, (char __user *)arg,
3508 sizeof(struct lustre_swap_layouts)))
3511 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3514 file2 = fget(lsl.sl_fd);
3518 /* O_WRONLY or O_RDWR */
3519 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3520 GOTO(out, rc = -EPERM);
3522 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3523 struct inode *inode2;
3524 struct ll_inode_info *lli;
3525 struct obd_client_handle *och = NULL;
3527 lli = ll_i2info(inode);
3528 mutex_lock(&lli->lli_och_mutex);
3529 if (fd->fd_lease_och != NULL) {
3530 och = fd->fd_lease_och;
3531 fd->fd_lease_och = NULL;
3533 mutex_unlock(&lli->lli_och_mutex);
3535 GOTO(out, rc = -ENOLCK);
3536 inode2 = file_inode(file2);
3537 rc = ll_swap_layouts_close(och, inode, inode2);
3539 rc = ll_swap_layouts(file, file2, &lsl);
3545 case LL_IOC_LOV_GETSTRIPE:
3546 case LL_IOC_LOV_GETSTRIPE_NEW:
3547 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3548 case FS_IOC_GETFLAGS:
3549 case FS_IOC_SETFLAGS:
3550 RETURN(ll_iocontrol(inode, file, cmd, arg));
3551 case FSFILT_IOC_GETVERSION:
3552 case FS_IOC_GETVERSION:
3553 RETURN(put_user(inode->i_generation, (int __user *)arg));
3554 /* We need to special case any other ioctls we want to handle,
3555 * to send them to the MDS/OST as appropriate and to properly
3556 * network encode the arg field. */
3557 case FS_IOC_SETVERSION:
3560 case LL_IOC_GROUP_LOCK:
3561 RETURN(ll_get_grouplock(inode, file, arg));
3562 case LL_IOC_GROUP_UNLOCK:
3563 RETURN(ll_put_grouplock(inode, file, arg));
3564 case IOC_OBD_STATFS:
3565 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3567 case LL_IOC_FLUSHCTX:
3568 RETURN(ll_flush_ctx(inode));
3569 case LL_IOC_PATH2FID: {
3570 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3571 sizeof(struct lu_fid)))
3576 case LL_IOC_GETPARENT:
3577 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3579 case OBD_IOC_FID2PATH:
3580 RETURN(ll_fid2path(inode, (void __user *)arg));
3581 case LL_IOC_DATA_VERSION: {
3582 struct ioc_data_version idv;
3585 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3588 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3589 rc = ll_ioc_data_version(inode, &idv);
3592 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3598 case LL_IOC_GET_MDTIDX: {
3601 mdtidx = ll_get_mdt_idx(inode);
3605 if (put_user((int)mdtidx, (int __user *)arg))
3610 case OBD_IOC_GETDTNAME:
3611 case OBD_IOC_GETMDNAME:
3612 RETURN(ll_get_obd_name(inode, cmd, arg));
3613 case LL_IOC_HSM_STATE_GET: {
3614 struct md_op_data *op_data;
3615 struct hsm_user_state *hus;
3622 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3623 LUSTRE_OPC_ANY, hus);
3624 if (IS_ERR(op_data)) {
3626 RETURN(PTR_ERR(op_data));
3629 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3632 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3635 ll_finish_md_op_data(op_data);
3639 case LL_IOC_HSM_STATE_SET: {
3640 struct hsm_state_set *hss;
3647 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3652 rc = ll_hsm_state_set(inode, hss);
3657 case LL_IOC_HSM_ACTION: {
3658 struct md_op_data *op_data;
3659 struct hsm_current_action *hca;
3666 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3667 LUSTRE_OPC_ANY, hca);
3668 if (IS_ERR(op_data)) {
3670 RETURN(PTR_ERR(op_data));
3673 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3676 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3679 ll_finish_md_op_data(op_data);
3683 case LL_IOC_SET_LEASE_OLD: {
3684 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3686 RETURN(ll_file_set_lease(file, &ioc, 0));
3688 case LL_IOC_SET_LEASE: {
3689 struct ll_ioc_lease ioc;
3691 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3694 RETURN(ll_file_set_lease(file, &ioc, arg));
3696 case LL_IOC_GET_LEASE: {
3697 struct ll_inode_info *lli = ll_i2info(inode);
3698 struct ldlm_lock *lock = NULL;
3701 mutex_lock(&lli->lli_och_mutex);
3702 if (fd->fd_lease_och != NULL) {
3703 struct obd_client_handle *och = fd->fd_lease_och;
3705 lock = ldlm_handle2lock(&och->och_lease_handle);
3707 lock_res_and_lock(lock);
3708 if (!ldlm_is_cancel(lock))
3709 fmode = och->och_flags;
3711 unlock_res_and_lock(lock);
3712 LDLM_LOCK_PUT(lock);
3715 mutex_unlock(&lli->lli_och_mutex);
3717 RETURN(ll_lease_type_from_fmode(fmode));
3719 case LL_IOC_HSM_IMPORT: {
3720 struct hsm_user_import *hui;
3726 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3731 rc = ll_hsm_import(inode, file, hui);
3736 case LL_IOC_FUTIMES_3: {
3737 struct ll_futimes_3 lfu;
3739 if (copy_from_user(&lfu,
3740 (const struct ll_futimes_3 __user *)arg,
3744 RETURN(ll_file_futimes_3(file, &lfu));
3746 case LL_IOC_LADVISE: {
3747 struct llapi_ladvise_hdr *k_ladvise_hdr;
3748 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3751 int alloc_size = sizeof(*k_ladvise_hdr);
3754 u_ladvise_hdr = (void __user *)arg;
3755 OBD_ALLOC_PTR(k_ladvise_hdr);
3756 if (k_ladvise_hdr == NULL)
3759 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3760 GOTO(out_ladvise, rc = -EFAULT);
3762 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3763 k_ladvise_hdr->lah_count < 1)
3764 GOTO(out_ladvise, rc = -EINVAL);
3766 num_advise = k_ladvise_hdr->lah_count;
3767 if (num_advise >= LAH_COUNT_MAX)
3768 GOTO(out_ladvise, rc = -EFBIG);
3770 OBD_FREE_PTR(k_ladvise_hdr);
3771 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3772 lah_advise[num_advise]);
3773 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3774 if (k_ladvise_hdr == NULL)
3778 * TODO: submit multiple advices to one server in a single RPC
3780 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3781 GOTO(out_ladvise, rc = -EFAULT);
3783 for (i = 0; i < num_advise; i++) {
3784 struct llapi_lu_ladvise *k_ladvise =
3785 &k_ladvise_hdr->lah_advise[i];
3786 struct llapi_lu_ladvise __user *u_ladvise =
3787 &u_ladvise_hdr->lah_advise[i];
3789 rc = ll_ladvise_sanity(inode, k_ladvise);
3791 GOTO(out_ladvise, rc);
3793 switch (k_ladvise->lla_advice) {
3794 case LU_LADVISE_LOCKNOEXPAND:
3795 rc = ll_lock_noexpand(file,
3796 k_ladvise->lla_peradvice_flags);
3797 GOTO(out_ladvise, rc);
3798 case LU_LADVISE_LOCKAHEAD:
3800 rc = ll_file_lock_ahead(file, k_ladvise);
3803 GOTO(out_ladvise, rc);
3806 &u_ladvise->lla_lockahead_result))
3807 GOTO(out_ladvise, rc = -EFAULT);
3810 rc = ll_ladvise(inode, file,
3811 k_ladvise_hdr->lah_flags,
3814 GOTO(out_ladvise, rc);
3821 OBD_FREE(k_ladvise_hdr, alloc_size);
3824 case LL_IOC_FLR_SET_MIRROR: {
3825 /* mirror I/O must be direct to avoid polluting page cache
3827 if (!(file->f_flags & O_DIRECT))
3830 fd->fd_designated_mirror = (__u32)arg;
3833 case LL_IOC_FSGETXATTR:
3834 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3835 case LL_IOC_FSSETXATTR:
3836 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3838 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3839 case LL_IOC_HEAT_GET: {
3840 struct lu_heat uheat;
3841 struct lu_heat *heat;
3844 if (copy_from_user(&uheat, (void __user *)arg, sizeof(uheat)))
3847 if (uheat.lh_count > OBD_HEAT_COUNT)
3848 uheat.lh_count = OBD_HEAT_COUNT;
3850 size = offsetof(typeof(uheat), lh_heat[uheat.lh_count]);
3851 OBD_ALLOC(heat, size);
3855 heat->lh_count = uheat.lh_count;
3856 ll_heat_get(inode, heat);
3857 rc = copy_to_user((char __user *)arg, heat, size);
3858 OBD_FREE(heat, size);
3859 RETURN(rc ? -EFAULT : 0);
3861 case LL_IOC_HEAT_SET: {
3864 if (copy_from_user(&flags, (void __user *)arg, sizeof(flags)))
3867 rc = ll_heat_set(inode, flags);
3870 case LL_IOC_PCC_DETACH: {
3871 struct lu_pcc_detach *detach;
3873 OBD_ALLOC_PTR(detach);
3877 if (copy_from_user(detach,
3878 (const struct lu_pcc_detach __user *)arg,
3880 GOTO(out_detach_free, rc = -EFAULT);
3882 if (!S_ISREG(inode->i_mode))
3883 GOTO(out_detach_free, rc = -EINVAL);
3885 if (!inode_owner_or_capable(inode))
3886 GOTO(out_detach_free, rc = -EPERM);
3888 rc = pcc_ioctl_detach(inode, detach->pccd_opt);
3890 OBD_FREE_PTR(detach);
3893 case LL_IOC_PCC_STATE: {
3894 struct lu_pcc_state __user *ustate =
3895 (struct lu_pcc_state __user *)arg;
3896 struct lu_pcc_state *state;
3898 OBD_ALLOC_PTR(state);
3902 if (copy_from_user(state, ustate, sizeof(*state)))
3903 GOTO(out_state, rc = -EFAULT);
3905 rc = pcc_ioctl_state(file, inode, state);
3907 GOTO(out_state, rc);
3909 if (copy_to_user(ustate, state, sizeof(*state)))
3910 GOTO(out_state, rc = -EFAULT);
3913 OBD_FREE_PTR(state);
3917 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3918 (void __user *)arg));
3922 #ifndef HAVE_FILE_LLSEEK_SIZE
3923 static inline loff_t
3924 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3926 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3928 if (offset > maxsize)
3931 if (offset != file->f_pos) {
3932 file->f_pos = offset;
3933 file->f_version = 0;
3939 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3940 loff_t maxsize, loff_t eof)
3942 struct inode *inode = file_inode(file);
3950 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3951 * position-querying operation. Avoid rewriting the "same"
3952 * f_pos value back to the file because a concurrent read(),
3953 * write() or lseek() might have altered it
3958 * f_lock protects against read/modify/write race with other
3959 * SEEK_CURs. Note that parallel writes and reads behave
3963 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3964 inode_unlock(inode);
3968 * In the generic case the entire file is data, so as long as
3969 * offset isn't at the end of the file then the offset is data.
3976 * There is a virtual hole at the end of the file, so as long as
3977 * offset isn't i_size or larger, return i_size.
3985 return llseek_execute(file, offset, maxsize);
3989 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3991 struct inode *inode = file_inode(file);
3992 loff_t retval, eof = 0;
3995 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3996 (origin == SEEK_CUR) ? file->f_pos : 0);
3997 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3998 PFID(ll_inode2fid(inode)), inode, retval, retval,
4000 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
4002 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
4003 retval = ll_glimpse_size(inode);
4006 eof = i_size_read(inode);
4009 retval = ll_generic_file_llseek_size(file, offset, origin,
4010 ll_file_maxbytes(inode), eof);
4014 static int ll_flush(struct file *file, fl_owner_t id)
4016 struct inode *inode = file_inode(file);
4017 struct ll_inode_info *lli = ll_i2info(inode);
4018 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
4021 LASSERT(!S_ISDIR(inode->i_mode));
4023 /* catch async errors that were recorded back when async writeback
4024 * failed for pages in this mapping. */
4025 rc = lli->lli_async_rc;
4026 lli->lli_async_rc = 0;
4027 if (lli->lli_clob != NULL) {
4028 err = lov_read_and_clear_async_rc(lli->lli_clob);
4033 /* The application has already been told about the write failure.
4034 * Do not report the failure again. */
4035 if (fd->fd_write_failed)
4037 return rc ? -EIO : 0;
4041 * Called to make sure a portion of the file has been written out.
4042 * If @mode is not CL_FSYNC_LOCAL, OST_SYNC RPCs will be sent to the OSTs.
4044 * Return how many pages have been written.
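/*
 * A minimal usage sketch (not part of the original source): flush and sync
 * the whole file to the OSTs, ignoring layout changes:
 *
 *	int nr_written;
 *
 *	nr_written = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
 *					CL_FSYNC_ALL, 1);
 *	if (nr_written < 0)
 *		rc = nr_written;	// negative values are errors
 */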
4046 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
4047 enum cl_fsync_mode mode, int ignore_layout)
4051 struct cl_fsync_io *fio;
4056 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
4057 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
4060 env = cl_env_get(&refcheck);
4062 RETURN(PTR_ERR(env));
4064 io = vvp_env_thread_io(env);
4065 io->ci_obj = ll_i2info(inode)->lli_clob;
4066 io->ci_ignore_layout = ignore_layout;
4068 /* initialize parameters for sync */
4069 fio = &io->u.ci_fsync;
4070 fio->fi_start = start;
4072 fio->fi_fid = ll_inode2fid(inode);
4073 fio->fi_mode = mode;
4074 fio->fi_nr_written = 0;
4076 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
4077 result = cl_io_loop(env, io);
4079 result = io->ci_result;
4081 result = fio->fi_nr_written;
4082 cl_io_fini(env, io);
4083 cl_env_put(env, &refcheck);
4089 * When dentry is provided (the 'else' case), file_dentry() may be
4090 * null and dentry must be used directly rather than pulled from
4091 * file_dentry() as is done otherwise.
4094 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
4096 struct dentry *dentry = file_dentry(file);
4097 struct inode *inode = dentry->d_inode;
4098 struct ll_inode_info *lli = ll_i2info(inode);
4099 struct ptlrpc_request *req;
4104 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), start %lld, end %lld,"
4106 PFID(ll_inode2fid(inode)), inode, start, end, datasync);
4108 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
4110 /* fsync's caller has already called _fdata{sync,write}, we want
4111 * that IO to finish before calling the osc and mdc sync methods */
4112 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
4115 /* catch async errors that were recorded back when async writeback
4116 * failed for pages in this mapping. */
4117 if (!S_ISDIR(inode->i_mode)) {
4118 err = lli->lli_async_rc;
4119 lli->lli_async_rc = 0;
4122 if (lli->lli_clob != NULL) {
4123 err = lov_read_and_clear_async_rc(lli->lli_clob);
4129 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
4133 ptlrpc_req_finished(req);
4135 if (S_ISREG(inode->i_mode)) {
4136 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
4139 /* Sync metadata on MDT first, and then sync the cached data
4142 err = pcc_fsync(file, start, end, datasync, &cached);
4144 err = cl_sync_file_range(inode, start, end,
4146 if (rc == 0 && err < 0)
4149 fd->fd_write_failed = true;
4151 fd->fd_write_failed = false;
4154 inode_unlock(inode);
4159 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
4161 struct inode *inode = file_inode(file);
4162 struct ll_sb_info *sbi = ll_i2sbi(inode);
4163 struct ldlm_enqueue_info einfo = {
4164 .ei_type = LDLM_FLOCK,
4165 .ei_cb_cp = ldlm_flock_completion_ast,
4166 .ei_cbdata = file_lock,
4168 struct md_op_data *op_data;
4169 struct lustre_handle lockh = { 0 };
4170 union ldlm_policy_data flock = { { 0 } };
4171 int fl_type = file_lock->fl_type;
4177 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
4178 PFID(ll_inode2fid(inode)), file_lock);
4180 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
4182 if (file_lock->fl_flags & FL_FLOCK) {
4183 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
4184 /* flocks are whole-file locks */
4185 flock.l_flock.end = OFFSET_MAX;
4186 /* For flocks the owner is determined by the local file descriptor */
4187 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
4188 } else if (file_lock->fl_flags & FL_POSIX) {
4189 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
4190 flock.l_flock.start = file_lock->fl_start;
4191 flock.l_flock.end = file_lock->fl_end;
4195 flock.l_flock.pid = file_lock->fl_pid;
4197 /* Somewhat ugly workaround for svc lockd.
4198 * lockd installs a custom fl_lmops->lm_compare_owner that checks
4199 * that the fl_owner is the same (which it always is on the local node,
4200 * I guess, between lockd processes) and then compares the pid.
4201 * As such we assign the pid to the owner field to make it all work;
4202 * a conflict with normal locks is unlikely since the pid space and the
4203 * pointer space for current->files do not intersect */
4204 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
4205 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
4209 einfo.ei_mode = LCK_PR;
4212 /* An unlock request may or may not have any relation to
4213 * existing locks so we may not be able to pass a lock handle
4214 * via a normal ldlm_lock_cancel() request. The request may even
4215 * unlock a byte range in the middle of an existing lock. In
4216 * order to process an unlock request we need all of the same
4217 * information that is given with a normal read or write record
4218 * lock request. To avoid creating another ldlm unlock (cancel)
4219 * message we'll treat a LCK_NL flock request as an unlock. */
4220 einfo.ei_mode = LCK_NL;
4223 einfo.ei_mode = LCK_PW;
4226 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
4241 flags = LDLM_FL_BLOCK_NOWAIT;
4247 flags = LDLM_FL_TEST_LOCK;
4250 CERROR("unknown fcntl lock command: %d\n", cmd);
4254 /* Save the old mode so that if the mode in the lock changes we
4255 * can decrement the appropriate reader or writer refcount. */
4256 file_lock->fl_type = einfo.ei_mode;
4258 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4259 LUSTRE_OPC_ANY, NULL);
4260 if (IS_ERR(op_data))
4261 RETURN(PTR_ERR(op_data));
4263 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4264 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4265 flock.l_flock.pid, flags, einfo.ei_mode,
4266 flock.l_flock.start, flock.l_flock.end);
4268 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4271 /* Restore the file lock type if not TEST lock. */
4272 if (!(flags & LDLM_FL_TEST_LOCK))
4273 file_lock->fl_type = fl_type;
4275 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4276 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4277 !(flags & LDLM_FL_TEST_LOCK))
4278 rc2 = locks_lock_file_wait(file, file_lock);
4280 if ((file_lock->fl_flags & FL_FLOCK) &&
4281 (rc == 0 || file_lock->fl_type == F_UNLCK))
4282 rc2 = flock_lock_file_wait(file, file_lock);
4283 if ((file_lock->fl_flags & FL_POSIX) &&
4284 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4285 !(flags & LDLM_FL_TEST_LOCK))
4286 rc2 = posix_lock_file_wait(file, file_lock);
4287 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
4289 if (rc2 && file_lock->fl_type != F_UNLCK) {
4290 einfo.ei_mode = LCK_NL;
4291 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4296 ll_finish_md_op_data(op_data);
4301 int ll_get_fid_by_name(struct inode *parent, const char *name,
4302 int namelen, struct lu_fid *fid,
4303 struct inode **inode)
4305 struct md_op_data *op_data = NULL;
4306 struct mdt_body *body;
4307 struct ptlrpc_request *req;
4311 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4312 LUSTRE_OPC_ANY, NULL);
4313 if (IS_ERR(op_data))
4314 RETURN(PTR_ERR(op_data));
4316 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4317 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4318 ll_finish_md_op_data(op_data);
4322 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4324 GOTO(out_req, rc = -EFAULT);
4326 *fid = body->mbo_fid1;
4329 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4331 ptlrpc_req_finished(req);
4335 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4338 struct dentry *dchild = NULL;
4339 struct inode *child_inode = NULL;
4340 struct md_op_data *op_data;
4341 struct ptlrpc_request *request = NULL;
4342 struct obd_client_handle *och = NULL;
4344 struct mdt_body *body;
4345 __u64 data_version = 0;
4346 size_t namelen = strlen(name);
4347 int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4351 CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4352 PFID(ll_inode2fid(parent)), name,
4353 lum->lum_stripe_offset, lum->lum_stripe_count);
4355 if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4356 lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4357 lustre_swab_lmv_user_md(lum);
4359 /* Get child FID first */
4360 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4363 dchild = d_lookup(file_dentry(file), &qstr);
4365 if (dchild->d_inode)
4366 child_inode = igrab(dchild->d_inode);
4371 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
4380 if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
4381 OBD_CONNECT2_DIR_MIGRATE)) {
4382 if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
4383 ll_dir_striped(child_inode)) {
4384 CERROR("%s: MDT doesn't support striped directory "
4385 "migration!\n", ll_i2sbi(parent)->ll_fsname);
4386 GOTO(out_iput, rc = -EOPNOTSUPP);
4391 * lfs migrate command needs to be blocked on the client
4392 * by checking the migrate FID against the FID of the
4395 if (child_inode == parent->i_sb->s_root->d_inode)
4396 GOTO(out_iput, rc = -EINVAL);
4398 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4399 child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4400 if (IS_ERR(op_data))
4401 GOTO(out_iput, rc = PTR_ERR(op_data));
4403 inode_lock(child_inode);
4404 op_data->op_fid3 = *ll_inode2fid(child_inode);
4405 if (!fid_is_sane(&op_data->op_fid3)) {
4406 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4407 ll_i2sbi(parent)->ll_fsname, name,
4408 PFID(&op_data->op_fid3));
4409 GOTO(out_unlock, rc = -EINVAL);
4412 op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4413 op_data->op_data = lum;
4414 op_data->op_data_size = lumlen;
4417 if (S_ISREG(child_inode->i_mode)) {
4418 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4422 GOTO(out_unlock, rc);
4425 rc = ll_data_version(child_inode, &data_version,
4428 GOTO(out_close, rc);
4430 op_data->op_open_handle = och->och_open_handle;
4431 op_data->op_data_version = data_version;
4432 op_data->op_lease_handle = och->och_lease_handle;
4433 op_data->op_bias |= MDS_CLOSE_MIGRATE;
4435 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4436 och->och_mod->mod_open_req->rq_replay = 0;
4437 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
4440 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4441 name, namelen, &request);
4443 LASSERT(request != NULL);
4444 ll_update_times(request, parent);
4447 if (rc == 0 || rc == -EAGAIN) {
4448 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4449 LASSERT(body != NULL);
4451 /* If the server does release the layout lock, then we clean up
4452 * the client och here; otherwise release it in out_close: */
4453 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4454 obd_mod_put(och->och_mod);
4455 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4457 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
4463 if (request != NULL) {
4464 ptlrpc_req_finished(request);
4468 /* Try again if the lease has been cancelled. */
4469 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4474 ll_lease_close(och, child_inode, NULL);
4476 clear_nlink(child_inode);
4478 inode_unlock(child_inode);
4479 ll_finish_md_op_data(op_data);
4486 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4488 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
4492 * In order to avoid a flood of warning messages, only print one message
4493 * per file. The overall message rate on the client is limited
4494 * by CDEBUG_LIMIT too.
4496 if (!(fd->fd_flags & LL_FILE_FLOCK_WARNING)) {
4497 fd->fd_flags |= LL_FILE_FLOCK_WARNING;
4498 CDEBUG_LIMIT(D_TTY | D_CONSOLE,
4499 "flock disabled, mount with '-o [local]flock' to enable\r\n");
4505 * Test if locks matching \a bits and \a l_req_mode are acquired:
4506 * - the bits can be spread over different locks
4507 * - if found, clear the matched lock bits in *bits
4508 * - the bits that are not found are kept in *bits
4510 * \param bits [IN] searched lock bits
4511 * \param l_req_mode [IN] searched lock mode
4512 * \retval boolean, true iff all bits are found
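/*
 * A minimal usage sketch (not part of the original source): check whether
 * the LOOKUP and UPDATE ibits are already covered by granted MD locks; any
 * bits that are not found remain set in 'bits' on return:
 *
 *	__u64 bits = MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE;
 *
 *	if (ll_have_md_lock(inode, &bits, LCK_MINMODE))
 *		// all requested bits are covered by existing locks
 */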
4514 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4516 struct lustre_handle lockh;
4517 union ldlm_policy_data policy;
4518 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4519 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4528 fid = &ll_i2info(inode)->lli_fid;
4529 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4530 ldlm_lockname[mode]);
4532 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4533 for (i = 0; i < MDS_INODELOCK_NUMBITS && *bits != 0; i++) {
4534 policy.l_inodebits.bits = *bits & (1 << i);
4535 if (policy.l_inodebits.bits == 0)
4538 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4539 &policy, mode, &lockh)) {
4540 struct ldlm_lock *lock;
4542 lock = ldlm_handle2lock(&lockh);
4545 ~(lock->l_policy_data.l_inodebits.bits);
4546 LDLM_LOCK_PUT(lock);
4548 *bits &= ~policy.l_inodebits.bits;
4555 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4556 struct lustre_handle *lockh, __u64 flags,
4557 enum ldlm_mode mode)
4559 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4564 fid = &ll_i2info(inode)->lli_fid;
4565 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4567 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4568 fid, LDLM_IBITS, &policy, mode, lockh);
4573 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4575 /* Already unlinked. Just update nlink and return success */
4576 if (rc == -ENOENT) {
4578 /* If it is a striped directory and there is a bad stripe,
4579 * let's revalidate the dentry again instead of returning
4581 if (ll_dir_striped(inode))
4584 /* This path cannot be hit for regular files unless in
4585 * case of obscure races, so no need to validate
4587 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4589 } else if (rc != 0) {
4590 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4591 "%s: revalidate FID "DFID" error: rc = %d\n",
4592 ll_i2sbi(inode)->ll_fsname,
4593 PFID(ll_inode2fid(inode)), rc);
4599 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4601 struct inode *inode = dentry->d_inode;
4602 struct obd_export *exp = ll_i2mdexp(inode);
4603 struct lookup_intent oit = {
4606 struct ptlrpc_request *req = NULL;
4607 struct md_op_data *op_data;
4611 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4612 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4614 /* Call getattr by fid, so do not provide name at all. */
4615 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4616 LUSTRE_OPC_ANY, NULL);
4617 if (IS_ERR(op_data))
4618 RETURN(PTR_ERR(op_data));
4620 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4621 ll_finish_md_op_data(op_data);
4623 rc = ll_inode_revalidate_fini(inode, rc);
4627 rc = ll_revalidate_it_finish(req, &oit, dentry);
4629 ll_intent_release(&oit);
4633 /* Unlinked? Unhash dentry, so it is not picked up later by
4634 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4635 * here to preserve get_cwd functionality on 2.6.
4637 if (!dentry->d_inode->i_nlink) {
4638 spin_lock(&inode->i_lock);
4639 d_lustre_invalidate(dentry, 0);
4640 spin_unlock(&inode->i_lock);
4643 ll_lookup_finish_locks(&oit, dentry);
4645 ptlrpc_req_finished(req);
4650 static int ll_merge_md_attr(struct inode *inode)
4652 struct ll_inode_info *lli = ll_i2info(inode);
4653 struct cl_attr attr = { 0 };
4656 LASSERT(lli->lli_lsm_md != NULL);
4658 if (!lmv_dir_striped(lli->lli_lsm_md))
4661 down_read(&lli->lli_lsm_sem);
4662 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4663 &attr, ll_md_blocking_ast);
4664 up_read(&lli->lli_lsm_sem);
4668 set_nlink(inode, attr.cat_nlink);
4669 inode->i_blocks = attr.cat_blocks;
4670 i_size_write(inode, attr.cat_size);
4672 ll_i2info(inode)->lli_atime = attr.cat_atime;
4673 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4674 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
4679 int ll_getattr_dentry(struct dentry *de, struct kstat *stat)
4681 struct inode *inode = de->d_inode;
4682 struct ll_sb_info *sbi = ll_i2sbi(inode);
4683 struct ll_inode_info *lli = ll_i2info(inode);
4686 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4688 rc = ll_inode_revalidate(de, IT_GETATTR);
4692 if (S_ISREG(inode->i_mode)) {
4695 rc = pcc_inode_getattr(inode, &cached);
4696 if (cached && rc < 0)
4699 /* In case of restore, the MDT has the right size and has
4700 * already sent it back without granting the layout lock;
4701 * the inode is up-to-date so a glimpse is useless.
4702 * Also, to glimpse we need the layout; in case of a running
4703 * restore the MDT holds the layout lock, so the glimpse would
4704 * block until the end of the restore (getattr will block)
4706 if (!cached && !ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4707 rc = ll_glimpse_size(inode);
4712 /* If the object isn't a regular file then don't validate its size. */
if (ll_dir_striped(inode)) {
rc = ll_merge_md_attr(inode);
inode->i_atime.tv_sec = lli->lli_atime;
inode->i_mtime.tv_sec = lli->lli_mtime;
inode->i_ctime.tv_sec = lli->lli_ctime;
OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
if (ll_need_32bit_api(sbi)) {
stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
stat->rdev = ll_compat_encode_dev(inode->i_rdev);
stat->ino = inode->i_ino;
stat->dev = inode->i_sb->s_dev;
stat->rdev = inode->i_rdev;
stat->mode = inode->i_mode;
stat->uid = inode->i_uid;
stat->gid = inode->i_gid;
stat->atime = inode->i_atime;
stat->mtime = inode->i_mtime;
stat->ctime = inode->i_ctime;
stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
stat->nlink = inode->i_nlink;
stat->size = i_size_read(inode);
stat->blocks = inode->i_blocks;
#ifdef HAVE_INODEOPS_ENHANCED_GETATTR
int ll_getattr(const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int flags)
struct dentry *de = path->dentry;
#else
int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
#endif
return ll_getattr_dentry(de, stat);
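/*
 * Illustrative sketch, not part of this file: a plain stat(2) call from
 * userspace reaches ll_getattr() above through the ->getattr inode
 * operation registered in ll_file_inode_operations below, which then
 * fills the kstat from the (possibly just revalidated) inode.  The mount
 * point below is only an example:
 *
 *	struct stat st;
 *
 *	if (stat("/mnt/lustre/somefile", &st) == 0)
 *		printf("size=%lld blocks=%lld\n",
 *		       (long long)st.st_size, (long long)st.st_blocks);
 */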
static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len)
struct fiemap *fiemap;
unsigned int extent_count = fieinfo->fi_extents_max;
num_bytes = sizeof(*fiemap) + (extent_count *
sizeof(struct fiemap_extent));
OBD_ALLOC_LARGE(fiemap, num_bytes);
fiemap->fm_flags = fieinfo->fi_flags;
fiemap->fm_extent_count = fieinfo->fi_extents_max;
fiemap->fm_start = start;
fiemap->fm_length = len;
if (extent_count > 0 &&
copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
sizeof(struct fiemap_extent)) != 0)
GOTO(out, rc = -EFAULT);
rc = ll_do_fiemap(inode, fiemap, num_bytes);
fieinfo->fi_flags = fiemap->fm_flags;
fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
if (extent_count > 0 &&
copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
fiemap->fm_mapped_extents *
sizeof(struct fiemap_extent)) != 0)
GOTO(out, rc = -EFAULT);
OBD_FREE_LARGE(fiemap, num_bytes);
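/*
 * Illustrative sketch, not part of this file: ll_fiemap() above is the
 * ->fiemap handler that a userspace FS_IOC_FIEMAP ioctl ultimately
 * reaches; fieinfo->fi_extents_max corresponds to fm_extent_count below.
 * Assuming <linux/fs.h> and <linux/fiemap.h> for the definitions:
 *
 *	unsigned int count = 32;
 *	struct fiemap *fm;
 *
 *	fm = calloc(1, sizeof(*fm) + count * sizeof(struct fiemap_extent));
 *	fm->fm_start = 0;
 *	fm->fm_length = FIEMAP_MAX_OFFSET;
 *	fm->fm_extent_count = count;
 *	if (ioctl(fd, FS_IOC_FIEMAP, fm) == 0)
 *		printf("%u extents mapped\n", fm->fm_mapped_extents);
 */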
struct posix_acl *ll_get_acl(struct inode *inode, int type)
struct ll_inode_info *lli = ll_i2info(inode);
struct posix_acl *acl = NULL;
spin_lock(&lli->lli_lock);
/* VFS' acl_permission_check->check_acl will release the refcount */
acl = posix_acl_dup(lli->lli_posix_acl);
spin_unlock(&lli->lli_lock);
#ifdef HAVE_IOP_SET_ACL
#ifdef CONFIG_FS_POSIX_ACL
int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
struct ll_sb_info *sbi = ll_i2sbi(inode);
struct ptlrpc_request *req = NULL;
const char *name = NULL;
size_t value_size = 0;
case ACL_TYPE_ACCESS:
name = XATTR_NAME_POSIX_ACL_ACCESS;
rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
case ACL_TYPE_DEFAULT:
name = XATTR_NAME_POSIX_ACL_DEFAULT;
if (!S_ISDIR(inode->i_mode))
rc = acl ? -EACCES : 0;
value_size = posix_acl_xattr_size(acl->a_count);
value = kmalloc(value_size, GFP_NOFS);
GOTO(out, rc = -ENOMEM);
rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
GOTO(out_value, rc);
rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
name, value, value_size, 0, 0, &req);
ptlrpc_req_finished(req);
forget_cached_acl(inode, type);
set_cached_acl(inode, type, acl);
#endif /* CONFIG_FS_POSIX_ACL */
#endif /* HAVE_IOP_SET_ACL */
int ll_inode_permission(struct inode *inode, int mask)
struct ll_sb_info *sbi;
struct root_squash_info *squash;
struct cred *cred = NULL;
const struct cred *old_cred = NULL;
bool squash_id = false;
if (mask & MAY_NOT_BLOCK)
/* as the root inode is NOT validated during the lookup operation,
 * we need to do it here before the permission check. */
if (inode == inode->i_sb->s_root->d_inode) {
rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
/* squash fsuid/fsgid if needed */
sbi = ll_i2sbi(inode);
squash = &sbi->ll_squash;
if (unlikely(squash->rsi_uid != 0 &&
uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
!(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
__kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
squash->rsi_uid, squash->rsi_gid);
/* update current process's credentials
 * and FS capability */
cred = prepare_creds();
cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
if ((1 << cap) & CFS_CAP_FS_MASK)
cap_lower(cred->cap_effective, cap);
old_cred = override_creds(cred);
ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
rc = generic_permission(inode, mask);
/* restore current process's credentials and FS capability */
revert_creds(old_cred);
/* -o localflock - only provides locally consistent flock locks */
struct file_operations ll_file_operations = {
#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
# ifdef HAVE_SYNC_READ_WRITE
.read = new_sync_read,
.write = new_sync_write,
# endif /* HAVE_SYNC_READ_WRITE */
.read_iter = ll_file_read_iter,
.write_iter = ll_file_write_iter,
#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
.read = ll_file_read,
.aio_read = ll_file_aio_read,
.write = ll_file_write,
.aio_write = ll_file_aio_write,
#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
.unlocked_ioctl = ll_file_ioctl,
.open = ll_file_open,
.release = ll_file_release,
.mmap = ll_file_mmap,
.llseek = ll_file_seek,
.splice_read = ll_file_splice_read,
struct file_operations ll_file_operations_flock = {
#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
# ifdef HAVE_SYNC_READ_WRITE
.read = new_sync_read,
.write = new_sync_write,
# endif /* HAVE_SYNC_READ_WRITE */
.read_iter = ll_file_read_iter,
.write_iter = ll_file_write_iter,
#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
.read = ll_file_read,
.aio_read = ll_file_aio_read,
.write = ll_file_write,
.aio_write = ll_file_aio_write,
#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
.unlocked_ioctl = ll_file_ioctl,
.open = ll_file_open,
.release = ll_file_release,
.mmap = ll_file_mmap,
.llseek = ll_file_seek,
.splice_read = ll_file_splice_read,
.flock = ll_file_flock,
.lock = ll_file_flock
/* These are for -o noflock - to return ENOSYS on flock calls */
struct file_operations ll_file_operations_noflock = {
#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
# ifdef HAVE_SYNC_READ_WRITE
.read = new_sync_read,
.write = new_sync_write,
# endif /* HAVE_SYNC_READ_WRITE */
.read_iter = ll_file_read_iter,
.write_iter = ll_file_write_iter,
#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
.read = ll_file_read,
.aio_read = ll_file_aio_read,
.write = ll_file_write,
.aio_write = ll_file_aio_write,
#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
.unlocked_ioctl = ll_file_ioctl,
.open = ll_file_open,
.release = ll_file_release,
.mmap = ll_file_mmap,
.llseek = ll_file_seek,
.splice_read = ll_file_splice_read,
.flock = ll_file_noflock,
.lock = ll_file_noflock
struct inode_operations ll_file_inode_operations = {
.setattr = ll_setattr,
.getattr = ll_getattr,
.permission = ll_inode_permission,
#ifdef HAVE_IOP_XATTR
.setxattr = ll_setxattr,
.getxattr = ll_getxattr,
.removexattr = ll_removexattr,
.listxattr = ll_listxattr,
.fiemap = ll_fiemap,
#ifdef HAVE_IOP_GET_ACL
.get_acl = ll_get_acl,
#ifdef HAVE_IOP_SET_ACL
.set_acl = ll_set_acl,
int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
struct ll_inode_info *lli = ll_i2info(inode);
struct cl_object *obj = lli->lli_clob;
env = cl_env_get(&refcheck);
RETURN(PTR_ERR(env));
rc = cl_conf_set(env, lli->lli_clob, conf);
if (conf->coc_opc == OBJECT_CONF_SET) {
struct ldlm_lock *lock = conf->coc_lock;
struct cl_layout cl = {
LASSERT(lock != NULL);
LASSERT(ldlm_has_layout(lock));
/* The lock can only be allowed to match after the layout has
 * been applied to the inode; otherwise a false layout would be
 * seen. Applying the layout should happen before dropping
 * the intent lock. */
ldlm_lock_allow_match(lock);
rc = cl_object_layout_get(env, obj, &cl);
DFID": layout version change: %u -> %u\n",
PFID(&lli->lli_fid), ll_layout_version_get(lli),
ll_layout_version_set(lli, cl.cl_layout_gen);
cl_env_put(env, &refcheck);
/* Fetch layout from MDT with getxattr request, if it's not ready yet */
static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
struct ll_sb_info *sbi = ll_i2sbi(inode);
struct ptlrpc_request *req;
CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
lock->l_lvb_data, lock->l_lvb_len);
if (lock->l_lvb_data != NULL)
/* if layout lock was granted right away, the layout is returned
 * within DLM_LVB of dlm reply; otherwise if the lock was ever
 * blocked and then granted via completion ast, we have to fetch
 * layout here. Please note that we can't use the LVB buffer in
 * completion AST because it doesn't have a large enough buffer */
rc = ll_get_default_mdsize(sbi, &lmmsize);
rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), OBD_MD_FLXATTR,
XATTR_NAME_LOV, lmmsize, &req);
GOTO(out, rc = 0); /* empty layout */
if (lmmsize == 0) /* empty layout */
lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
GOTO(out, rc = -EFAULT);
OBD_ALLOC_LARGE(lvbdata, lmmsize);
if (lvbdata == NULL)
GOTO(out, rc = -ENOMEM);
memcpy(lvbdata, lmm, lmmsize);
lock_res_and_lock(lock);
if (unlikely(lock->l_lvb_data == NULL)) {
lock->l_lvb_type = LVB_T_LAYOUT;
lock->l_lvb_data = lvbdata;
lock->l_lvb_len = lmmsize;
unlock_res_and_lock(lock);
OBD_FREE_LARGE(lvbdata, lmmsize);
ptlrpc_req_finished(req);
* Apply the layout to the inode. Layout lock is held and will be released
static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
struct inode *inode)
struct ll_inode_info *lli = ll_i2info(inode);
struct ll_sb_info *sbi = ll_i2sbi(inode);
struct ldlm_lock *lock;
struct cl_object_conf conf;
bool wait_layout = false;
LASSERT(lustre_handle_is_used(lockh));
lock = ldlm_handle2lock(lockh);
LASSERT(lock != NULL);
LASSERT(ldlm_has_layout(lock));
LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
PFID(&lli->lli_fid), inode);
/* in case this is a caching lock, reinstate it with the new inode */
md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
lock_res_and_lock(lock);
lvb_ready = ldlm_is_lvb_ready(lock);
unlock_res_and_lock(lock);
/* checking lvb_ready is racy but this is okay. The worst case is
 * that multiple processes may configure the file at the same time. */
rc = ll_layout_fetch(inode, lock);
/* for the layout lock, the lmm is stored in the lock's LVB.
 * lvb_data is immutable while the lock is held, so it's safe to access it
 * set the layout on the file. It is unlikely this will fail, as the old
 * layout was surely eliminated */
memset(&conf, 0, sizeof conf);
conf.coc_opc = OBJECT_CONF_SET;
conf.coc_inode = inode;
conf.coc_lock = lock;
conf.u.coc_layout.lb_buf = lock->l_lvb_data;
conf.u.coc_layout.lb_len = lock->l_lvb_len;
rc = ll_layout_conf(inode, &conf);
/* refresh layout failed, need to wait */
wait_layout = rc == -EBUSY;
LDLM_LOCK_PUT(lock);
ldlm_lock_decref(lockh, mode);
/* wait for IO to complete if it's still being used. */
CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
sbi->ll_fsname, PFID(&lli->lli_fid), inode);
memset(&conf, 0, sizeof conf);
conf.coc_opc = OBJECT_CONF_WAIT;
conf.coc_inode = inode;
rc = ll_layout_conf(inode, &conf);
CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
sbi->ll_fsname, PFID(&lli->lli_fid), rc);
* Issue layout intent RPC to MDS.
* \param inode [in] file inode
* \param intent [in] layout intent
* \retval 0 on success
* \retval < 0 error code
static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
struct ll_inode_info *lli = ll_i2info(inode);
struct ll_sb_info *sbi = ll_i2sbi(inode);
struct md_op_data *op_data;
struct lookup_intent it;
struct ptlrpc_request *req;
op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
0, 0, LUSTRE_OPC_ANY, NULL);
if (IS_ERR(op_data))
RETURN(PTR_ERR(op_data));
op_data->op_data = intent;
op_data->op_data_size = sizeof(*intent);
memset(&it, 0, sizeof(it));
it.it_op = IT_LAYOUT;
if (intent->li_opc == LAYOUT_INTENT_WRITE ||
intent->li_opc == LAYOUT_INTENT_TRUNC)
it.it_flags = FMODE_WRITE;
LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
sbi->ll_fsname, PFID(&lli->lli_fid), inode);
rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
&ll_md_blocking_ast, 0);
if (it.it_request != NULL)
ptlrpc_req_finished(it.it_request);
it.it_request = NULL;
ll_finish_md_op_data(op_data);
/* set lock data in case this is a new lock */
ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
ll_intent_drop_lock(&it);
* This function checks whether a LAYOUT lock exists on the client side,
* and enqueues one if none is cached.
* This function does not hold the layout lock, so it may be revoked any time
* after this function returns. Any operation that depends on the layout
* should be redone in that case.
* This function should be called before lov_io_init() to get an up-to-date
* layout version; the caller should save the version number, and after the IO
* is finished, call this function again to verify that the layout
* was not changed during the IO.
int ll_layout_refresh(struct inode *inode, __u32 *gen)
struct ll_inode_info *lli = ll_i2info(inode);
struct ll_sb_info *sbi = ll_i2sbi(inode);
struct lustre_handle lockh;
struct layout_intent intent = {
.li_opc = LAYOUT_INTENT_ACCESS,
enum ldlm_mode mode;
*gen = ll_layout_version_get(lli);
if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
LASSERT(fid_is_sane(ll_inode2fid(inode)));
LASSERT(S_ISREG(inode->i_mode));
/* take layout lock mutex to enqueue layout lock exclusively. */
mutex_lock(&lli->lli_layout_mutex);
/* the layout lock is usually cached on the local side, so try to
 * match it before grabbing the layout lock mutex. */
mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
LCK_CR | LCK_CW | LCK_PR |
if (mode != 0) { /* hit cached lock */
rc = ll_layout_lock_set(&lockh, mode, inode);
rc = ll_layout_intent(inode, &intent);
*gen = ll_layout_version_get(lli);
mutex_unlock(&lli->lli_layout_mutex);
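/*
 * Illustrative sketch, not part of this file, of the calling convention
 * described above ll_layout_refresh(): save the layout generation before
 * starting IO and check it again afterwards to detect a concurrent layout
 * change.  do_io() is a hypothetical placeholder for the actual IO path.
 *
 *	__u32 gen_before, gen_after;
 *	int rc;
 *
 *	rc = ll_layout_refresh(inode, &gen_before);
 *	if (rc == 0)
 *		rc = do_io(inode);
 *	if (rc == 0) {
 *		rc = ll_layout_refresh(inode, &gen_after);
 *		if (rc == 0 && gen_after != gen_before)
 *			rc = -EAGAIN;	(layout changed, redo the IO)
 *	}
 */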
* Issue a layout intent RPC indicating where in a file an IO is about to write.
* \param[in] inode file inode.
* \param[in] ext write range with the start offset of the file in bytes where
* an IO is about to write, and the exclusive end offset in
* bytes.
* \retval 0 on success
* \retval < 0 error code
int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
struct lu_extent *ext)
struct layout_intent intent = {
.li_extent.e_start = ext->e_start,
.li_extent.e_end = ext->e_end,
rc = ll_layout_intent(inode, &intent);
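/*
 * Illustrative sketch, not part of this file: a write path that is about
 * to dirty a not-yet-instantiated part of the file could announce the
 * range with ll_layout_write_intent() before doing the IO.  The offsets
 * pos and count below are hypothetical example values.
 *
 *	struct lu_extent ext = {
 *		.e_start = pos,
 *		.e_end   = pos + count,
 *	};
 *
 *	rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE, &ext);
 */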
* This function sends a restore request to the MDT
int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
struct hsm_user_request *hur;
len = sizeof(struct hsm_user_request) +
sizeof(struct hsm_user_item);
OBD_ALLOC(hur, len);
hur->hur_request.hr_action = HUA_RESTORE;
hur->hur_request.hr_archive_id = 0;
hur->hur_request.hr_flags = 0;
memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
sizeof(hur->hur_user_item[0].hui_fid));
hur->hur_user_item[0].hui_extent.offset = offset;
hur->hur_user_item[0].hui_extent.length = length;
hur->hur_request.hr_itemcount = 1;
rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,