4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
49 #include <uapi/linux/lustre/lustre_ioctl.h>
50 #include <lustre_swab.h>
52 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
/* Fields of the parameter structs used by biased close (struct headers are
 * in lines not visible in this chunk):
 *   sp_inode        - victim/target inode for a layout split
 *   pa_data_version - data version passed with a PCC attach close
 * NOTE(review): confirm enclosing struct definitions (split_param,
 * pcc_param) against the full file. */
57 struct inode *sp_inode;
62 __u64 pa_data_version;
/* Forward declarations for helpers defined later in this file. */
68 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
70 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
/*
 * Allocate a per-open ll_file_data from its dedicated slab cache
 * (GFP_NOFS to avoid filesystem recursion) and initialize the fields
 * visible here: the write-failure flag and the PCC (persistent client
 * cache) per-file state.
 * NOTE(review): the NULL check after allocation and the return statement
 * fall in lines elided from this chunk -- confirm fd is checked before use.
 */
73 static struct ll_file_data *ll_file_data_get(void)
75 struct ll_file_data *fd;
77 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
81 fd->fd_write_failed = false;
82 pcc_file_init(&fd->fd_pcc_file);
/* Release a ll_file_data back to its slab cache (counterpart of
 * ll_file_data_get()). */
87 static void ll_file_data_put(struct ll_file_data *fd)
90 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
94  * Packs all the attributes into @op_data for the CLOSE rpc.
/*
 * Snapshot the client-side inode attributes (mode, times, size, blocks,
 * flags) and the open handle into @op_data so the MDT sees the final
 * state at close time.
 */
96 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
97 struct obd_client_handle *och)
101 ll_prep_md_op_data(op_data, inode, NULL, NULL,
102 0, 0, LUSTRE_OPC_ANY, NULL);
104 op_data->op_attr.ia_mode = inode->i_mode;
105 op_data->op_attr.ia_atime = inode->i_atime;
106 op_data->op_attr.ia_mtime = inode->i_mtime;
107 op_data->op_attr.ia_ctime = inode->i_ctime;
108 op_data->op_attr.ia_size = i_size_read(inode);
109 op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
110 ATTR_MTIME | ATTR_MTIME_SET |
112 op_data->op_xvalid |= OP_XVALID_CTIME_SET;
113 op_data->op_attr_blocks = inode->i_blocks;
/* Translate VFS inode flags to the on-wire Lustre flag encoding. */
114 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
115 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
116 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
117 op_data->op_open_handle = och->och_open_handle;
/* Test-and-clear so the dirty indication is reported to the MDT once. */
119 if (och->och_flags & FMODE_WRITE &&
120 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
121 /* For HSM: if inode data has been modified, pack it so that
122 * MDT can set data dirty flag in the archive. */
123 op_data->op_bias |= MDS_DATA_MODIFIED;
129  * Perform a close, possibly with a bias.
130  * The meaning of "data" depends on the value of "bias".
132  * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
133  * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
/*
 * Send the MDS_CLOSE RPC for an open handle, packing bias-specific
 * payload (HSM release, layout merge/split/swap, resync-done, PCC attach)
 * into @op_data first. Always clears the open replay data and poisons the
 * open handle cookie on the way out, even on RPC failure.
 */
136 static int ll_close_inode_openhandle(struct inode *inode,
137 struct obd_client_handle *och,
138 enum mds_op_bias bias, void *data)
140 struct obd_export *md_exp = ll_i2mdexp(inode);
141 const struct ll_inode_info *lli = ll_i2info(inode);
142 struct md_op_data *op_data;
143 struct ptlrpc_request *req = NULL;
/* Export may have been cleaned up already (e.g. umount race). */
147 if (class_exp2obd(md_exp) == NULL) {
148 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
149 ll_i2sbi(inode)->ll_fsname, PFID(&lli->lli_fid));
153 OBD_ALLOC_PTR(op_data);
154 /* We leak openhandle and request here on error, but not much to be
155 * done in OOM case since app won't retry close on error either. */
157 GOTO(out, rc = -ENOMEM);
159 ll_prepare_close(inode, op_data, och);
/* NOTE(review): the switch-on-bias header is in lines elided from this
 * chunk; each case below packs the payload for one close intent. */
161 case MDS_CLOSE_LAYOUT_MERGE:
162 /* merge blocks from the victim inode */
163 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
164 op_data->op_attr.ia_valid |= ATTR_SIZE;
165 op_data->op_xvalid |= OP_XVALID_BLOCKS;
/* fallthrough into the SPLIT/SWAP packing below */
167 case MDS_CLOSE_LAYOUT_SPLIT:
168 case MDS_CLOSE_LAYOUT_SWAP: {
169 struct split_param *sp = data;
171 LASSERT(data != NULL);
172 op_data->op_bias |= bias;
173 op_data->op_data_version = 0;
174 op_data->op_lease_handle = och->och_lease_handle;
175 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
/* SPLIT: data is a struct split_param; SWAP/MERGE: data is an inode. */
176 op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
177 op_data->op_mirror_id = sp->sp_mirror_id;
179 op_data->op_fid2 = *ll_inode2fid(data);
184 case MDS_CLOSE_RESYNC_DONE: {
185 struct ll_ioc_lease *ioc = data;
187 LASSERT(data != NULL);
/* Scale blocks by mirror count so MDT sees total space used. */
188 op_data->op_attr_blocks +=
189 ioc->lil_count * op_data->op_attr_blocks;
190 op_data->op_attr.ia_valid |= ATTR_SIZE;
191 op_data->op_xvalid |= OP_XVALID_BLOCKS;
192 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
194 op_data->op_lease_handle = och->och_lease_handle;
195 op_data->op_data = &ioc->lil_ids[0];
196 op_data->op_data_size =
197 ioc->lil_count * sizeof(ioc->lil_ids[0]);
201 case MDS_PCC_ATTACH: {
202 struct pcc_param *param = data;
204 LASSERT(data != NULL);
/* PCC attach is an HSM release into the client cache archive. */
205 op_data->op_bias |= MDS_HSM_RELEASE | MDS_PCC_ATTACH;
206 op_data->op_archive_id = param->pa_archive_id;
207 op_data->op_data_version = param->pa_data_version;
208 op_data->op_lease_handle = och->och_lease_handle;
212 case MDS_HSM_RELEASE:
213 LASSERT(data != NULL);
214 op_data->op_bias |= MDS_HSM_RELEASE;
215 op_data->op_data_version = *(__u64 *)data;
216 op_data->op_lease_handle = och->och_lease_handle;
217 op_data->op_attr.ia_valid |= ATTR_SIZE;
218 op_data->op_xvalid |= OP_XVALID_BLOCKS;
/* default: a plain close carries no payload */
222 LASSERT(data == NULL);
/* If size/blocks were not explicitly packed, let MDT fetch them lazily. */
226 if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
227 op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
228 if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
229 op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;
231 rc = md_close(md_exp, op_data, och->och_mod, &req);
232 if (rc != 0 && rc != -EINTR)
233 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
234 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* Check whether the server actually executed the close intent. */
236 if (rc == 0 && op_data->op_bias & bias) {
237 struct mdt_body *body;
239 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
240 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
243 if (bias & MDS_PCC_ATTACH) {
244 struct pcc_param *param = data;
/* Report the new layout generation back to the PCC caller. */
246 param->pa_layout_gen = body->mbo_layout_gen;
250 ll_finish_md_op_data(op_data);
254 md_clear_open_replay_data(md_exp, och);
/* Poison the handle so reuse after close is detectable. */
255 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
258 ptlrpc_req_finished(req); /* This is close request */
/*
 * Close the per-mode cached MDS open handle (write/exec/read) for @inode
 * if this was the last user of that mode, sending the real close RPC via
 * ll_close_inode_openhandle(). Protected by lli_och_mutex.
 */
262 int ll_md_real_close(struct inode *inode, fmode_t fmode)
264 struct ll_inode_info *lli = ll_i2info(inode);
265 struct obd_client_handle **och_p;
266 struct obd_client_handle *och;
/* Pick the handle slot and use count matching the open mode. */
271 if (fmode & FMODE_WRITE) {
272 och_p = &lli->lli_mds_write_och;
273 och_usecount = &lli->lli_open_fd_write_count;
274 } else if (fmode & FMODE_EXEC) {
275 och_p = &lli->lli_mds_exec_och;
276 och_usecount = &lli->lli_open_fd_exec_count;
278 LASSERT(fmode & FMODE_READ);
279 och_p = &lli->lli_mds_read_och;
280 och_usecount = &lli->lli_open_fd_read_count;
283 mutex_lock(&lli->lli_och_mutex);
284 if (*och_usecount > 0) {
285 /* There are still users of this handle, so skip
287 mutex_unlock(&lli->lli_och_mutex);
293 mutex_unlock(&lli->lli_och_mutex);
296 /* There might be a race and this handle may already
298 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/*
 * Per-fd close path: drop group lock and any leftover lease, close a
 * private open handle (fd_och), decrement the per-mode open counts, and
 * decide whether a cached OPEN DLM lock lets us skip the close RPC to
 * the MDS. Finally frees the ll_file_data.
 */
304 static int ll_md_close(struct inode *inode, struct file *file)
306 union ldlm_policy_data policy = {
307 .l_inodebits = { MDS_INODELOCK_OPEN },
/* TEST_LOCK: only probe for a matching granted lock, don't take a ref. */
309 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
310 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
311 struct ll_inode_info *lli = ll_i2info(inode);
312 struct lustre_handle lockh;
313 enum ldlm_mode lockmode;
317 /* clear group lock, if present */
318 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
319 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
321 if (fd->fd_lease_och != NULL) {
324 /* Usually the lease is not released when the
325 * application crashed, we need to release here. */
326 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
327 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
328 PFID(&lli->lli_fid), rc, lease_broken);
330 fd->fd_lease_och = NULL;
/* Private (lease-acquired) open handle gets a direct close. */
333 if (fd->fd_och != NULL) {
334 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
339 /* Let's see if we have good enough OPEN lock on the file and if
340 we can skip talking to MDS */
341 mutex_lock(&lli->lli_och_mutex);
342 if (fd->fd_omode & FMODE_WRITE) {
344 LASSERT(lli->lli_open_fd_write_count);
345 lli->lli_open_fd_write_count--;
346 } else if (fd->fd_omode & FMODE_EXEC) {
348 LASSERT(lli->lli_open_fd_exec_count);
349 lli->lli_open_fd_exec_count--;
352 LASSERT(lli->lli_open_fd_read_count);
353 lli->lli_open_fd_read_count--;
355 mutex_unlock(&lli->lli_och_mutex);
357 /* LU-4398: do not cache write open lock if the file has exec bit */
358 if ((lockmode == LCK_CW && inode->i_mode & S_IXUGO) ||
359 !md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
360 LDLM_IBITS, &policy, lockmode, &lockh))
361 rc = ll_md_real_close(inode, fd->fd_omode);
364 LUSTRE_FPRIVATE(file) = NULL;
365 ll_file_data_put(fd);
370 /* While this returns an error code, fput() the caller does not, so we need
371  * to make every effort to clean up all of our state here. Also, applications
372  * rarely check close errors and even if an error is returned they will not
373  * re-try the close call.
/*
 * VFS ->release() entry: tally stats, tear down statahead authorization,
 * short-circuit for the root dentry, release PCC state, fold async I/O
 * errors into the return code, then do the metadata close.
 */
375 int ll_file_release(struct inode *inode, struct file *file)
377 struct ll_file_data *fd;
378 struct ll_sb_info *sbi = ll_i2sbi(inode);
379 struct ll_inode_info *lli = ll_i2info(inode);
383 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
384 PFID(ll_inode2fid(inode)), inode);
/* Root dentry releases are not counted in the RELEASE stat. */
386 if (inode->i_sb->s_root != file_dentry(file))
387 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
388 fd = LUSTRE_FPRIVATE(file);
391 /* The last ref on @file, maybe not the the owner pid of statahead,
392 * because parent and child process can share the same file handle. */
393 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
394 ll_deauthorize_statahead(inode, fd);
/* Root of the mount: nothing was opened on the MDS, just free fd. */
396 if (inode->i_sb->s_root == file_dentry(file)) {
397 LUSTRE_FPRIVATE(file) = NULL;
398 ll_file_data_put(fd);
402 pcc_file_release(inode, file);
/* Collect any deferred async write errors so close can report them. */
404 if (!S_ISDIR(inode->i_mode)) {
405 if (lli->lli_clob != NULL)
406 lov_read_and_clear_async_rc(lli->lli_clob);
407 lli->lli_async_rc = 0;
410 rc = ll_md_close(inode, file);
/* Fault-injection hook: optionally dump the debug log on release. */
412 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
413 libcfs_debug_dumplog();
/*
 * read_cache_page() filler for Data-on-MDT: copy the inline reply buffer
 * described by @data (a struct niobuf_local) into @page, zero-fill the
 * tail if the fragment is shorter than a page, and mark the page
 * up-to-date.
 */
418 static inline int ll_dom_readpage(void *data, struct page *page)
420 struct niobuf_local *lnb = data;
423 kaddr = ll_kmap_atomic(page, KM_USER0);
424 memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
/* Partial tail page: zero the remainder so no stale data leaks. */
425 if (lnb->lnb_len < PAGE_SIZE)
426 memset(kaddr + lnb->lnb_len, 0,
427 PAGE_SIZE - lnb->lnb_len);
428 flush_dcache_page(page);
429 SetPageUptodate(page);
430 ll_kunmap_atomic(kaddr, KM_USER0);
/*
 * After an open that returned Data-on-MDT file content inline in the
 * reply (RMF_NIOBUF_INLINE), validate the returned offset/length against
 * the file size and populate the page cache page by page via
 * ll_dom_readpage(), so a subsequent read needs no OST RPC.
 */
436 void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
437 struct lookup_intent *it)
439 struct ll_inode_info *lli = ll_i2info(inode);
440 struct cl_object *obj = lli->lli_clob;
441 struct address_space *mapping = inode->i_mapping;
443 struct niobuf_remote *rnb;
444 struct mdt_body *body;
446 unsigned long index, start;
447 struct niobuf_local lnb;
/* Nothing to do if the reply carries no inline data buffer. */
454 if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
458 rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
459 if (rnb == NULL || rnb->rnb_len == 0)
462 /* LU-11595: Server may return whole file and that is OK always or
463 * it may return just file tail and its offset must be aligned with
464 * client PAGE_SIZE to be used on that client, if server's PAGE_SIZE is
465 * smaller then offset may be not aligned and that data is just ignored.
467 if (rnb->rnb_offset % PAGE_SIZE)
470 /* Server returns whole file or just file tail if it fills in reply
471 * buffer, in both cases total size should be equal to the file size.
473 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
474 if (rnb->rnb_offset + rnb->rnb_len != body->mbo_dom_size) {
475 CERROR("%s: server returns off/len %llu/%u but size %llu\n",
476 ll_i2sbi(inode)->ll_fsname, rnb->rnb_offset,
477 rnb->rnb_len, body->mbo_dom_size);
481 CDEBUG(D_INFO, "Get data along with open at %llu len %i, size %llu\n",
482 rnb->rnb_offset, rnb->rnb_len, body->mbo_dom_size);
/* Inline payload immediately follows the niobuf_remote descriptor. */
484 data = (char *)rnb + sizeof(*rnb);
486 lnb.lnb_file_offset = rnb->rnb_offset;
487 start = lnb.lnb_file_offset / PAGE_SIZE;
489 LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
490 lnb.lnb_page_offset = 0;
/* Walk the payload one PAGE_SIZE chunk at a time (loop header elided). */
492 lnb.lnb_data = data + (index << PAGE_SHIFT);
493 lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
494 if (lnb.lnb_len > PAGE_SIZE)
495 lnb.lnb_len = PAGE_SIZE;
/* read_cache_page() either finds the page or calls ll_dom_readpage(). */
497 vmpage = read_cache_page(mapping, index + start,
498 ll_dom_readpage, &lnb);
499 if (IS_ERR(vmpage)) {
500 CWARN("%s: cannot fill page %lu for "DFID
501 " with data: rc = %li\n",
502 ll_i2sbi(inode)->ll_fsname, index + start,
503 PFID(lu_object_fid(&obj->co_lu)),
509 } while (rnb->rnb_len > (index << PAGE_SHIFT));
/*
 * Enqueue an intent-open to the MDS for @de. Packs the dentry name only
 * when the server lacks open-by-fid support (or fault injection forces
 * it), performs the md_intent_lock() RPC, then finishes the open:
 * revalidates the dentry from the returned LOOKUP lock, sets lock data,
 * and consumes any inline Data-on-MDT payload.
 */
513 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
514 struct lookup_intent *itp)
516 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
517 struct dentry *parent = de->d_parent;
520 struct md_op_data *op_data;
521 struct ptlrpc_request *req = NULL;
525 LASSERT(parent != NULL);
526 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
528 /* if server supports open-by-fid, or file name is invalid, don't pack
529 * name in open request */
530 if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_OPEN_BY_NAME) ||
531 !(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID)) {
533 len = de->d_name.len;
534 name = kmalloc(len + 1, GFP_NOFS);
/* Re-check the name length under d_lock: a rename may have raced. */
539 spin_lock(&de->d_lock);
540 if (len != de->d_name.len) {
541 spin_unlock(&de->d_lock);
545 memcpy(name, de->d_name.name, len);
547 spin_unlock(&de->d_lock);
549 if (!lu_name_is_valid_2(name, len)) {
555 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
556 name, len, 0, LUSTRE_OPC_ANY, NULL);
557 if (IS_ERR(op_data)) {
559 RETURN(PTR_ERR(op_data));
/* Optional striping metadata travels along with the open request. */
561 op_data->op_data = lmm;
562 op_data->op_data_size = lmmsize;
564 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
565 &ll_md_blocking_ast, 0);
567 ll_finish_md_op_data(op_data);
569 /* reason for keep own exit path - don`t flood log
570 * with messages with -ESTALE errors.
572 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
573 it_open_error(DISP_OPEN_OPEN, itp))
575 ll_release_openhandle(de, itp);
579 if (it_disposition(itp, DISP_LOOKUP_NEG))
580 GOTO(out, rc = -ENOENT);
582 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
583 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
584 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
588 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
590 if (!rc && itp->it_lock_mode) {
591 struct lustre_handle handle = {.cookie = itp->it_lock_handle};
592 struct ldlm_lock *lock;
593 bool has_dom_bit = false;
595 /* If we got a lock back and it has a LOOKUP bit set,
596 * make sure the dentry is marked as valid so we can find it.
597 * We don't need to care about actual hashing since other bits
598 * of kernel will deal with that later.
600 lock = ldlm_handle2lock(&handle);
602 has_dom_bit = ldlm_has_dom(lock);
603 if (lock->l_policy_data.l_inodebits.bits &
604 MDS_INODELOCK_LOOKUP)
605 d_lustre_revalidate(de);
609 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
/* Pull inline DoM file data from the open reply into page cache. */
611 ll_dom_finish_open(de->d_inode, req, itp);
615 ptlrpc_req_finished(req);
616 ll_intent_drop_lock(itp);
618 /* We did open by fid, but by the time we got to the server,
619 * the object disappeared. If this is a create, we cannot really
620 * tell the userspace that the file it was trying to create
621 * does not exist. Instead let's return -ESTALE, and the VFS will
622 * retry the create with LOOKUP_REVAL that we are going to catch
623 * in ll_revalidate_dentry() and use lookup then.
625 if (rc == -ENOENT && itp->it_op & IT_CREAT)
/*
 * Populate an obd_client_handle from the mdt_body of a successful
 * intent-open reply (open handle, FID, lease lock handle, flags) and
 * register it for open-replay on MDS recovery.
 */
631 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
632 struct obd_client_handle *och)
634 struct mdt_body *body;
636 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
637 och->och_open_handle = body->mbo_open_handle;
638 och->och_fid = body->mbo_fid1;
639 och->och_lease_handle.cookie = it->it_lock_handle;
640 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
641 och->och_flags = it->it_flags;
643 return md_set_open_replay_data(md_exp, och, it);
/*
 * Complete the client-local part of an open: optionally fill @och from
 * the intent reply, then attach @fd to the file (private data, readahead
 * state, open mode, cl-context lock/list).
 */
646 static int ll_local_open(struct file *file, struct lookup_intent *it,
647 struct ll_file_data *fd, struct obd_client_handle *och)
649 struct inode *inode = file_inode(file);
652 LASSERT(!LUSTRE_FPRIVATE(file));
/* NOTE(review): elided lines likely guard this on och != NULL -- confirm. */
659 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
664 LUSTRE_FPRIVATE(file) = fd;
665 ll_readahead_init(inode, &fd->fd_ras);
666 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
668 /* ll_cl_context initialize */
669 rwlock_init(&fd->fd_lock);
670 INIT_LIST_HEAD(&fd->fd_lccs);
675 /* Open a file, and (for the very first open) create objects on the OSTs at
676  * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
677  * creation or open until ll_lov_setstripe() ioctl is called.
679  * If we already have the stripe MD locally then we don't request it in
680  * md_open(), by passing a lmm_size = 0.
682  * It is up to the application to ensure no other processes open this file
683  * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
684  * used. We might be able to avoid races of that sort by getting lli_open_sem
685  * before returning in the O_LOV_DELAY_CREATE case and dropping it here
686  * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/*
 * VFS ->open() entry. Either consumes an intent prepared by lookup
 * (it->it_disposition set) or builds a fresh IT_OPEN intent from f_flags
 * and sends it via ll_intent_file_open(). Shares per-mode MDS open
 * handles (read/write/exec) across fds through lli_mds_*_och, guarded by
 * lli_och_mutex.
 */
688 int ll_file_open(struct inode *inode, struct file *file)
690 struct ll_inode_info *lli = ll_i2info(inode);
691 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
692 .it_flags = file->f_flags };
693 struct obd_client_handle **och_p = NULL;
694 __u64 *och_usecount = NULL;
695 struct ll_file_data *fd;
699 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
700 PFID(ll_inode2fid(inode)), inode, file->f_flags);
702 it = file->private_data; /* XXX: compat macro */
703 file->private_data = NULL; /* prevent ll_local_open assertion */
705 fd = ll_file_data_get();
707 GOTO(out_nofiledata, rc = -ENOMEM);
710 if (S_ISDIR(inode->i_mode))
711 ll_authorize_statahead(inode, fd);
/* Root dentry: no MDS open needed, just stash fd and return. */
713 if (inode->i_sb->s_root == file_dentry(file)) {
714 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup: build our own open intent. */
718 if (!it || !it->it_disposition) {
719 /* Convert f_flags into access mode. We cannot use file->f_mode,
720 * because everything but O_ACCMODE mask was stripped from
/* (O_ACCMODE trick: +1 maps O_RDONLY/O_WRONLY/O_RDWR to FMODE bits.) */
722 if ((oit.it_flags + 1) & O_ACCMODE)
724 if (file->f_flags & O_TRUNC)
725 oit.it_flags |= FMODE_WRITE;
727 /* kernel only call f_op->open in dentry_open. filp_open calls
728 * dentry_open after call to open_namei that checks permissions.
729 * Only nfsd_open call dentry_open directly without checking
730 * permissions and because of that this code below is safe.
732 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
733 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
735 /* We do not want O_EXCL here, presumably we opened the file
736 * already? XXX - NFS implications? */
737 oit.it_flags &= ~O_EXCL;
739 /* bug20584, if "it_flags" contains O_CREAT, the file will be
740 * created if necessary, then "IT_CREAT" should be set to keep
741 * consistent with it */
742 if (oit.it_flags & O_CREAT)
743 oit.it_op |= IT_CREAT;
749 /* Let's see if we have file open on MDS already. */
750 if (it->it_flags & FMODE_WRITE) {
751 och_p = &lli->lli_mds_write_och;
752 och_usecount = &lli->lli_open_fd_write_count;
753 } else if (it->it_flags & FMODE_EXEC) {
754 och_p = &lli->lli_mds_exec_och;
755 och_usecount = &lli->lli_open_fd_exec_count;
757 och_p = &lli->lli_mds_read_och;
758 och_usecount = &lli->lli_open_fd_read_count;
761 mutex_lock(&lli->lli_och_mutex);
762 if (*och_p) { /* Open handle is present */
763 if (it_disposition(it, DISP_OPEN_OPEN)) {
764 /* Well, there's extra open request that we do not need,
765 let's close it somehow. This will decref request. */
766 rc = it_open_error(DISP_OPEN_OPEN, it);
768 mutex_unlock(&lli->lli_och_mutex);
769 GOTO(out_openerr, rc);
772 ll_release_openhandle(file_dentry(file), it);
/* Reuse the cached handle; och == NULL means "shared handle" here. */
776 rc = ll_local_open(file, it, fd, NULL);
779 mutex_unlock(&lli->lli_och_mutex);
780 GOTO(out_openerr, rc);
783 LASSERT(*och_usecount == 0);
784 if (!it->it_disposition) {
785 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
786 /* We cannot just request lock handle now, new ELC code
787 means that one of other OPEN locks for this file
788 could be cancelled, and since blocking ast handler
789 would attempt to grab och_mutex as well, that would
790 result in a deadlock */
791 mutex_unlock(&lli->lli_och_mutex);
793 * Normally called under two situations:
795 * 2. A race/condition on MDS resulting in no open
796 * handle to be returned from LOOKUP|OPEN request,
797 * for example if the target entry was a symlink.
799 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
800 * marked by a bit set in ll_iget_for_nfs. Clear the
801 * bit so that it's not confusing later callers.
803 * NB; when ldd is NULL, it must have come via normal
804 * lookup path only, since ll_iget_for_nfs always calls
807 if (ldd && ldd->lld_nfs_dentry) {
808 ldd->lld_nfs_dentry = 0;
809 it->it_flags |= MDS_OPEN_LOCK;
813 * Always specify MDS_OPEN_BY_FID because we don't want
814 * to get file with different fid.
816 it->it_flags |= MDS_OPEN_BY_FID;
817 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
820 GOTO(out_openerr, rc);
/* First opener for this mode: allocate the shared handle slot. */
824 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
826 GOTO(out_och_free, rc = -ENOMEM);
830 /* md_intent_lock() didn't get a request ref if there was an
831 * open error, so don't do cleanup on the request here
833 /* XXX (green): Should not we bail out on any error here, not
834 * just open error? */
835 rc = it_open_error(DISP_OPEN_OPEN, it);
837 GOTO(out_och_free, rc);
839 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
840 "inode %p: disposition %x, status %d\n", inode,
841 it_disposition(it, ~0), it->it_status);
843 rc = ll_local_open(file, it, fd, *och_p);
845 GOTO(out_och_free, rc);
848 rc = pcc_file_open(inode, file);
850 GOTO(out_och_free, rc);
852 mutex_unlock(&lli->lli_och_mutex);
855 /* Must do this outside lli_och_mutex lock to prevent deadlock where
856 different kind of OPEN lock for this same inode gets cancelled
857 by ldlm_cancel_lru */
858 if (!S_ISREG(inode->i_mode))
859 GOTO(out_och_free, rc);
861 cl_lov_delay_create_clear(&file->f_flags);
862 GOTO(out_och_free, rc);
/* Error path: free a half-initialized shared handle slot. */
866 if (och_p && *och_p) {
867 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
868 *och_p = NULL; /* OBD_FREE writes some magic there */
871 mutex_unlock(&lli->lli_och_mutex);
874 if (lli->lli_opendir_key == fd)
875 ll_deauthorize_statahead(inode, fd);
878 ll_file_data_put(fd);
880 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* Drop the extra request ref taken by the intent enqueue. */
884 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
885 ptlrpc_req_finished(it->it_request);
886 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for lease locks: on a blocking callback, cancel the lease
 * lock asynchronously (the lease is considered broken). The CANCELING
 * case performs no openhandle cleanup here -- see the comment at the
 * ll_lease_open() call site about LDLM_FL_EXCL.
 */
892 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
893 struct ldlm_lock_desc *desc, void *data, int flag)
896 struct lustre_handle lockh;
900 case LDLM_CB_BLOCKING:
901 ldlm_lock2handle(lock, &lockh);
902 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
904 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
908 case LDLM_CB_CANCELING:
916  * When setting a lease on a file, we take ownership of the lli_mds_*_och
917  * and save it as fd->fd_och so as to force client to reopen the file even
918  * if it has an open lock in cache already.
/*
 * Under lli_och_mutex, transfer the shared per-mode open handle into
 * fd->fd_och (if not already private) and return its open handle cookie
 * in @old_open_handle. Fails with -EBUSY if a lease is already held or
 * the handle is shared by more than one opener.
 */
920 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
921 struct lustre_handle *old_open_handle)
923 struct ll_inode_info *lli = ll_i2info(inode);
924 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
925 struct obd_client_handle **och_p;
930 /* Get the openhandle of the file */
931 mutex_lock(&lli->lli_och_mutex);
932 if (fd->fd_lease_och != NULL)
933 GOTO(out_unlock, rc = -EBUSY);
935 if (fd->fd_och == NULL) {
936 if (file->f_mode & FMODE_WRITE) {
937 LASSERT(lli->lli_mds_write_och != NULL);
938 och_p = &lli->lli_mds_write_och;
939 och_usecount = &lli->lli_open_fd_write_count;
941 LASSERT(lli->lli_mds_read_och != NULL);
942 och_p = &lli->lli_mds_read_och;
943 och_usecount = &lli->lli_open_fd_read_count;
/* Cannot take exclusive ownership while others share the handle. */
946 if (*och_usecount > 1)
947 GOTO(out_unlock, rc = -EBUSY);
954 *old_open_handle = fd->fd_och->och_open_handle;
958 mutex_unlock(&lli->lli_och_mutex);
963  * Release ownership on lli_mds_*_och when putting back a file lease.
/*
 * Inverse of ll_lease_och_acquire(): give fd->fd_och back to the shared
 * per-mode slot, or -- if another opener repopulated the slot meanwhile
 * (broken lease) -- close our now-redundant private handle instead.
 */
965 static int ll_lease_och_release(struct inode *inode, struct file *file)
967 struct ll_inode_info *lli = ll_i2info(inode);
968 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
969 struct obd_client_handle **och_p;
970 struct obd_client_handle *old_och = NULL;
975 mutex_lock(&lli->lli_och_mutex);
976 if (file->f_mode & FMODE_WRITE) {
977 och_p = &lli->lli_mds_write_och;
978 och_usecount = &lli->lli_open_fd_write_count;
980 och_p = &lli->lli_mds_read_och;
981 och_usecount = &lli->lli_open_fd_read_count;
984 /* The file may have been open by another process (broken lease) so
985 * *och_p is not NULL. In this case we should simply increase usecount
988 if (*och_p != NULL) {
989 old_och = fd->fd_och;
996 mutex_unlock(&lli->lli_och_mutex);
/* Close outside the mutex; old_och != NULL only on the race above. */
999 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
1005  * Acquire a lease and open the file.
/*
 * Take exclusive ownership of the open handle, then enqueue a fresh
 * intent-open with MDS_OPEN_LEASE. On success returns a new
 * obd_client_handle whose och_lease_handle holds the lease lock; on any
 * failure after the open succeeded, the open handle is closed again.
 * @fmode must be exactly FMODE_READ or FMODE_WRITE and be a subset of
 * the file's open mode.
 */
1007 static struct obd_client_handle *
1008 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
1011 struct lookup_intent it = { .it_op = IT_OPEN };
1012 struct ll_sb_info *sbi = ll_i2sbi(inode);
1013 struct md_op_data *op_data;
1014 struct ptlrpc_request *req = NULL;
1015 struct lustre_handle old_open_handle = { 0 };
1016 struct obd_client_handle *och = NULL;
1021 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
1022 RETURN(ERR_PTR(-EINVAL));
/* Lease mode must match how the file was opened; exec is excluded. */
1025 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
1026 RETURN(ERR_PTR(-EPERM));
1028 rc = ll_lease_och_acquire(inode, file, &old_open_handle);
1030 RETURN(ERR_PTR(rc));
1035 RETURN(ERR_PTR(-ENOMEM));
1037 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
1038 LUSTRE_OPC_ANY, NULL);
1039 if (IS_ERR(op_data))
1040 GOTO(out, rc = PTR_ERR(op_data));
1042 /* To tell the MDT this openhandle is from the same owner */
1043 op_data->op_open_handle = old_open_handle;
1045 it.it_flags = fmode | open_flags;
1046 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
1047 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
1048 &ll_md_blocking_lease_ast,
1049 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
1050 * it can be cancelled which may mislead applications that the lease is
1052 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
1053 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
1054 * doesn't deal with openhandle, so normal openhandle will be leaked. */
1055 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
1056 ll_finish_md_op_data(op_data);
1057 ptlrpc_req_finished(req);
1059 GOTO(out_release_it, rc);
1061 if (it_disposition(&it, DISP_LOOKUP_NEG))
1062 GOTO(out_release_it, rc = -ENOENT);
1064 rc = it_open_error(DISP_OPEN_OPEN, &it);
1066 GOTO(out_release_it, rc);
1068 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
1069 ll_och_fill(sbi->ll_md_exp, &it, och);
/* Server predates lease support if it didn't grant the lease intent. */
1071 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
1072 GOTO(out_close, rc = -EOPNOTSUPP);
1074 /* already get lease, handle lease lock */
1075 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
1076 if (it.it_lock_mode == 0 ||
1077 it.it_lock_bits != MDS_INODELOCK_OPEN) {
1078 /* open lock must return for lease */
1079 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
1080 PFID(ll_inode2fid(inode)), it.it_lock_mode,
1082 GOTO(out_close, rc = -EPROTO);
1085 ll_intent_release(&it);
1089 /* Cancel open lock */
1090 if (it.it_lock_mode != 0) {
1091 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
1093 it.it_lock_mode = 0;
1094 och->och_lease_handle.cookie = 0ULL;
1096 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
1098 CERROR("%s: error closing file "DFID": %d\n",
1099 sbi->ll_fsname, PFID(&ll_i2info(inode)->lli_fid), rc2);
1100 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
1102 ll_intent_release(&it);
1106 RETURN(ERR_PTR(rc));
1110  * Check whether a layout swap can be done between two inodes.
1112  * \param[in] inode1	First inode to check
1113  * \param[in] inode2	Second inode to check
1115  * \retval 0 on success, layout swap can be performed between both inodes
1116  * \retval negative error code if requirements are not met
/*
 * Both inodes must be regular files, writable by the caller, and live on
 * the same superblock (same Lustre filesystem).
 */
1118 static int ll_check_swap_layouts_validity(struct inode *inode1,
1119 struct inode *inode2)
1121 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
1124 if (inode_permission(inode1, MAY_WRITE) ||
1125 inode_permission(inode2, MAY_WRITE))
1128 if (inode1->i_sb != inode2->i_sb)
/*
 * Biased close that swaps (or merges) layouts between @inode and @inode2.
 * Validates the pair, rejects identical FIDs, then issues the
 * MDS_CLOSE_LAYOUT_SWAP close; @och is consumed by
 * ll_close_inode_openhandle() on that path.
 */
1134 static int ll_swap_layouts_close(struct obd_client_handle *och,
1135 struct inode *inode, struct inode *inode2)
1137 const struct lu_fid *fid1 = ll_inode2fid(inode);
1138 const struct lu_fid *fid2;
1142 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
1143 ll_i2sbi(inode)->ll_fsname, PFID(fid1));
1145 rc = ll_check_swap_layouts_validity(inode, inode2);
1147 GOTO(out_free_och, rc);
1149 /* We now know that inode2 is a lustre inode */
1150 fid2 = ll_inode2fid(inode2);
/* Swapping a file's layout with itself makes no sense. */
1152 rc = lu_fid_cmp(fid1, fid2);
1154 GOTO(out_free_och, rc = -EINVAL);
1156 /* Close the file and {swap,merge} layouts between inode & inode2.
1157 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
1158 * because we still need it to pack l_remote_handle to MDT. */
1159 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
1162 och = NULL; /* freed in ll_close_inode_openhandle() */
1172  * Release lease and close the file.
1173  * It will check if the lease has ever broken.
/*
 * Determine from the lease lock's cancel flag whether the lease was
 * broken; report it via @lease_broken, cancel the lease lock if still
 * intact and no intent is requested, and perform the (possibly biased)
 * close of the open handle.
 */
1175 static int ll_lease_close_intent(struct obd_client_handle *och,
1176 struct inode *inode,
1177 bool *lease_broken, enum mds_op_bias bias,
1180 struct ldlm_lock *lock;
1181 bool cancelled = true;
1185 lock = ldlm_handle2lock(&och->och_lease_handle);
1187 lock_res_and_lock(lock);
1188 cancelled = ldlm_is_cancel(lock);
1189 unlock_res_and_lock(lock);
1190 LDLM_LOCK_PUT(lock);
1193 CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1194 PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1196 if (lease_broken != NULL)
1197 *lease_broken = cancelled;
/* Lease still valid and no intent requested: cancel it explicitly. */
1199 if (!cancelled && !bias)
1200 ldlm_cli_cancel(&och->och_lease_handle, 0);
1202 if (cancelled) { /* no need to excute intent */
1207 rc = ll_close_inode_openhandle(inode, och, bias, data);
/* Convenience wrapper: release a lease with no close intent (bias 0). */
1211 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1214 return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1218  * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
/*
 * Start a mirror resync: copy the lease id/mirror id from userspace
 * (@arg is a struct ll_ioc_lease_id __user *), flush dirty pages so the
 * coming layout-version bump cannot strand cached writes, then issue
 * md_file_resync() carrying the lease handle.
 */
1220 static int ll_lease_file_resync(struct obd_client_handle *och,
1221 struct inode *inode, unsigned long arg)
1223 struct ll_sb_info *sbi = ll_i2sbi(inode);
1224 struct md_op_data *op_data;
1225 struct ll_ioc_lease_id ioc;
1226 __u64 data_version_unused;
1230 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1231 LUSTRE_OPC_ANY, NULL);
1232 if (IS_ERR(op_data))
1233 RETURN(PTR_ERR(op_data));
1235 if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg,
1239 /* before starting file resync, it's necessary to clean up page cache
1240 * in client memory, otherwise once the layout version is increased,
1241 * writing back cached data will be denied the OSTs. */
1242 rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1246 op_data->op_lease_handle = och->och_lease_handle;
1247 op_data->op_mirror_id = ioc.lil_mirror_id;
1248 rc = md_file_resync(sbi->ll_md_exp, op_data);
1254 ll_finish_md_op_data(op_data);
/* Merge MDS-provided inode attributes with OST-provided object attributes
 * (size, blocks, timestamps) under the inode size lock. The newest of each
 * timestamp wins; size/blocks always come from the cl_object attributes. */
1258 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1260 struct ll_inode_info *lli = ll_i2info(inode);
1261 struct cl_object *obj = lli->lli_clob;
1262 struct cl_attr *attr = vvp_env_thread_attr(env);
1270 ll_inode_size_lock(inode);
1272 /* Merge timestamps the most recently obtained from MDS with
1273 * timestamps obtained from OSTs.
1275 * Do not overwrite atime of inode because it may be refreshed
1276 * by file_accessed() function. If the read was served by cache
1277 * data, there is no RPC to be sent so that atime may not be
1278 * transferred to OSTs at all. MDT only updates atime at close time
1279 * if it's at least 'mdd.*.atime_diff' older.
1280 * All in all, the atime in Lustre does not strictly comply with
1281 * POSIX. Solving this problem needs to send an RPC to MDT for each
1282 * read, this will hurt performance.
1284 if (inode->i_atime.tv_sec < lli->lli_atime ||
1285 lli->lli_update_atime) {
1286 inode->i_atime.tv_sec = lli->lli_atime;
1287 lli->lli_update_atime = 0;
1289 inode->i_mtime.tv_sec = lli->lli_mtime;
1290 inode->i_ctime.tv_sec = lli->lli_ctime;
1292 mtime = inode->i_mtime.tv_sec;
1293 atime = inode->i_atime.tv_sec;
1294 ctime = inode->i_ctime.tv_sec;
1296 cl_object_attr_lock(obj);
1297 if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1300 rc = cl_object_attr_get(env, obj, attr);
1301 cl_object_attr_unlock(obj);
/* -ENODATA means no OST objects (e.g. released file): not an error here */
1304 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1306 if (atime < attr->cat_atime)
1307 atime = attr->cat_atime;
1309 if (ctime < attr->cat_ctime)
1310 ctime = attr->cat_ctime;
1312 if (mtime < attr->cat_mtime)
1313 mtime = attr->cat_mtime;
1315 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1316 PFID(&lli->lli_fid), attr->cat_size);
1318 i_size_write(inode, attr->cat_size);
1319 inode->i_blocks = attr->cat_blocks;
1321 inode->i_mtime.tv_sec = mtime;
1322 inode->i_atime.tv_sec = atime;
1323 inode->i_ctime.tv_sec = ctime;
1326 ll_inode_size_unlock(inode);
1332 * Set designated mirror for I/O.
1334 * So far only read, write, and truncate can support to issue I/O to
1335 * designated mirror.
/* Clears any stale layout version carried over from an I/O restart, then,
 * if the file descriptor pins a specific FLR mirror, propagates that mirror
 * id and its layout version into the cl_io. */
1337 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1339 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1341 /* clear layout version for generic(non-resync) I/O in case it carries
1342 * stale layout version due to I/O restart */
1343 io->ci_layout_version = 0;
1345 /* FLR: disable non-delay for designated mirror I/O because obviously
1346 * only one mirror is available */
1347 if (fd->fd_designated_mirror > 0) {
1349 io->ci_designated_mirror = fd->fd_designated_mirror;
1350 io->ci_layout_version = fd->fd_layout_version;
/* fix: debug message previously read "desiginated" */
1353 CDEBUG(D_VFSTRACE, "%s: designated mirror: %d\n",
1354 file->f_path.dentry->d_name.name, io->ci_designated_mirror);
/* Return true if atime updates are suppressed for this open file, mirroring
 * the checks in the kernel's file_accessed()/touch_atime() paths: O_NOATIME,
 * inode/superblock noatime flags, and mount-level noatime/nodiratime/ro. */
1357 static bool file_is_noatime(const struct file *file)
1359 const struct vfsmount *mnt = file->f_path.mnt;
1360 const struct inode *inode = file_inode((struct file *)file);
1362 /* Adapted from file_accessed() and touch_atime().*/
1363 if (file->f_flags & O_NOATIME)
1366 if (inode->i_flags & S_NOATIME)
1369 if (IS_NOATIME(inode))
1372 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1375 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1378 if ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode))
/* Initialize a cl_io for a read/write/splice request from the open file's
 * flags: nonblock/append/sync behaviour, lock policy (never for nolock
 * files, mandatory for O_APPEND), noatime, and FLR mirror selection. */
1384 void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot,
1385 struct vvp_io_args *args)
1387 struct inode *inode = file_inode(file);
1388 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1390 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1391 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1393 if (iot == CIT_WRITE) {
1394 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1395 io->u.ci_wr.wr_sync = !!(file->f_flags & O_SYNC ||
1396 file->f_flags & O_DIRECT ||
1398 #ifdef HAVE_GENERIC_WRITE_SYNC_2ARGS
/* also honour per-iocb IOCB_DSYNC when the kernel supports it */
1399 io->u.ci_wr.wr_sync |= !!(args &&
1400 args->via_io_subtype == IO_NORMAL &&
1401 args->u.normal.via_iocb->ki_flags & IOCB_DSYNC);
1405 io->ci_obj = ll_i2info(inode)->lli_clob;
1406 io->ci_lockreq = CILR_MAYBE;
1407 if (ll_file_nolock(file)) {
1408 io->ci_lockreq = CILR_NEVER;
1409 io->ci_no_srvlock = 1;
1410 } else if (file->f_flags & O_APPEND) {
1411 io->ci_lockreq = CILR_MANDATORY;
1413 io->ci_noatime = file_is_noatime(file);
1414 io->ci_async_readahead = false;
1416 /* FLR: only use non-delay I/O for read as there is only one
1417 * available mirror for write. */
1418 io->ci_ndelay = !(iot == CIT_WRITE);
1420 ll_io_set_mirror(io, file);
/* Account one read/write of 'count' bytes into the inode's file-heat
 * instances (sample count and byte count), unless file heat is disabled
 * globally or for this inode. Other iot types are ignored. */
1423 static void ll_heat_add(struct inode *inode, enum cl_io_type iot,
1426 struct ll_inode_info *lli = ll_i2info(inode);
1427 struct ll_sb_info *sbi = ll_i2sbi(inode);
1428 enum obd_heat_type sample_type;
1429 enum obd_heat_type iobyte_type;
1430 __u64 now = ktime_get_real_seconds();
1432 if (!ll_sbi_has_file_heat(sbi) ||
1433 lli->lli_heat_flags & LU_HEAT_FLAG_OFF)
1436 if (iot == CIT_READ) {
1437 sample_type = OBD_HEAT_READSAMPLE;
1438 iobyte_type = OBD_HEAT_READBYTE;
1439 } else if (iot == CIT_WRITE) {
1440 sample_type = OBD_HEAT_WRITESAMPLE;
1441 iobyte_type = OBD_HEAT_WRITEBYTE;
1446 spin_lock(&lli->lli_heat_lock);
1447 obd_heat_add(&lli->lli_heat_instances[sample_type], now, 1,
1448 sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1449 obd_heat_add(&lli->lli_heat_instances[iobyte_type], now, count,
1450 sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1451 spin_unlock(&lli->lli_heat_lock);
/* Core read/write engine shared by the iter, splice and legacy entry points.
 * Builds a cl_io, optionally takes a range lock (writes, and direct-IO reads,
 * see LU-6227), runs the cl_io loop, and restarts the I/O when the layout
 * changed underneath (ci_need_restart) while preserving the FLR retry count.
 * Returns bytes transferred if any, else the error code. */
1455 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1456 struct file *file, enum cl_io_type iot,
1457 loff_t *ppos, size_t count)
1459 struct vvp_io *vio = vvp_env_io(env);
1460 struct inode *inode = file_inode(file);
1461 struct ll_inode_info *lli = ll_i2info(inode);
1462 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1463 struct range_lock range;
1467 unsigned retried = 0;
1468 bool restarted = false;
1472 CDEBUG(D_VFSTRACE, "%s: %s ppos: %llu, count: %zu\n",
1473 file_dentry(file)->d_name.name,
1474 iot == CIT_READ ? "read" : "write", *ppos, count);
1477 io = vvp_env_thread_io(env);
1478 ll_io_init(io, file, iot, args);
1479 io->ci_ndelay_tried = retried;
1481 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1482 bool range_locked = false;
/* O_APPEND writes may extend the file: lock to EOF */
1484 if (file->f_flags & O_APPEND)
1485 range_lock_init(&range, 0, LUSTRE_EOF);
1487 range_lock_init(&range, *ppos, *ppos + count - 1);
1489 vio->vui_fd = LUSTRE_FPRIVATE(file);
1490 vio->vui_io_subtype = args->via_io_subtype;
1492 switch (vio->vui_io_subtype) {
1494 vio->vui_iter = args->u.normal.via_iter;
1495 vio->vui_iocb = args->u.normal.via_iocb;
1496 /* Direct IO reads must also take range lock,
1497 * or multiple reads will try to work on the same pages
1498 * See LU-6227 for details. */
1499 if (((iot == CIT_WRITE) ||
1500 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1501 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1502 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1504 rc = range_lock(&lli->lli_write_tree, &range);
1508 range_locked = true;
1512 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1513 vio->u.splice.vui_flags = args->u.splice.via_flags;
1516 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1520 ll_cl_add(file, env, io, LCC_RW);
1521 rc = cl_io_loop(env, io);
1522 ll_cl_remove(file, env);
1525 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1527 range_unlock(&lli->lli_write_tree, &range);
1530 /* cl_io_rw_init() handled IO */
/* accumulate partial progress across restarts */
1534 if (io->ci_nob > 0) {
1535 result += io->ci_nob;
1536 count -= io->ci_nob;
1537 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1539 /* prepare IO restart */
1540 if (count > 0 && args->via_io_subtype == IO_NORMAL)
1541 args->u.normal.via_iter = vio->vui_iter;
1544 cl_io_fini(env, io);
1547 "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1548 file->f_path.dentry->d_name.name,
1549 iot, rc, result, io->ci_need_restart);
1551 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1553 "%s: restart %s from %lld, count: %zu, ret: %zd, rc: %d\n",
1554 file_dentry(file)->d_name.name,
1555 iot == CIT_READ ? "read" : "write",
1556 *ppos, count, result, rc);
1557 /* preserve the tried count for FLR */
1558 retried = io->ci_ndelay_tried;
1563 if (iot == CIT_READ) {
1565 ll_stats_ops_tally(ll_i2sbi(inode),
1566 LPROC_LL_READ_BYTES, result);
1567 } else if (iot == CIT_WRITE) {
1569 ll_stats_ops_tally(ll_i2sbi(inode),
1570 LPROC_LL_WRITE_BYTES, result);
/* fd_write_failed feeds fsync()'s error reporting */
1571 fd->fd_write_failed = false;
1572 } else if (result == 0 && rc == 0) {
1575 fd->fd_write_failed = true;
1577 fd->fd_write_failed = false;
1578 } else if (rc != -ERESTARTSYS) {
1579 fd->fd_write_failed = true;
1583 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1585 ll_heat_add(inode, iot, result);
1587 RETURN(result > 0 ? result : rc);
1591 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1592 * especially for small I/O.
1594 * To serve a read request, CLIO has to create and initialize a cl_io and
1595 * then request DLM lock. This has turned out to have significant overhead
1596 * and affects the performance of small I/O dramatically.
1598 * It's not necessary to create a cl_io for each I/O. Under the help of read
1599 * ahead, most of the pages being read are already in memory cache and we can
1600 * read those pages directly because if the pages exist, the corresponding DLM
1601 * lock must exist so that page content must be valid.
1603 * In fast read implementation, the llite speculatively finds and reads pages
1604 * in memory cache. There are three scenarios for fast read:
1605 *   - If the page exists and is uptodate, kernel VM will provide the data and
1606 *     CLIO won't be intervened;
1607 *   - If the page was brought into memory by read ahead, it will be exported
1608 *     and read ahead parameters will be updated;
1609 *   - Otherwise the page is not in memory, we can't do fast read. Therefore,
1610 *     it will go back and invoke normal read, i.e., a cl_io will be created
1611 *     and DLM lock will be requested.
1613 * POSIX compliance: posix standard states that read is intended to be atomic.
1614 * Lustre read implementation is in line with Linux kernel read implementation
1615 * and neither of them complies with POSIX standard in this matter. Fast read
1616 * doesn't make the situation worse on single node but it may interleave write
1617 * results from multiple nodes due to short read handling in ll_file_aio_read().
1619 * \param env - lu_env
1620 * \param iocb - kiocb from kernel
1621 * \param iter - user space buffers where the data will be copied
1623 * \retval - number of bytes have been read, or error code if error occurred.
1626 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1630 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1633 /* NB: we can't do direct IO for fast read because it will need a lock
1634 * to make IO engine happy. */
1635 if (iocb->ki_filp->f_flags & O_DIRECT)
1638 result = generic_file_read_iter(iocb, iter);
1640 /* If the first page is not in cache, generic_file_aio_read() will be
1641 * returned with -ENODATA.
1642 * See corresponding code in ll_readpage(). */
1643 if (result == -ENODATA)
1647 ll_heat_add(file_inode(iocb->ki_filp), CIT_READ, result);
1648 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1649 LPROC_LL_READ_BYTES, result);
1656 * Read from a file (through the page cache).
/* VFS read_iter entry point: PCC first, then fast read from the page cache,
 * then the full cl_io path for whatever remains. Stats are tallied once at
 * the end with the combined result. */
1658 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1661 struct vvp_io_args *args;
1662 struct file *file = iocb->ki_filp;
1668 if (!iov_iter_count(to))
1672 * Currently when PCC read failed, we do not fall back to the
1673 * normal read path, just return the error.
1674 * The reason is that: for RW-PCC, the file data may be modified
1675 * in the PCC and inconsistent with the data on OSTs (or file
1676 * data has been removed from the Lustre file system), at this
1677 * time, fallback to the normal read path may read the wrong
1679 * TODO: for RO-PCC (readonly PCC), fall back to normal read
1680 * path: read data from data copy on OSTs.
1682 result = pcc_file_read_iter(iocb, to, &cached);
1688 result = ll_do_fast_read(iocb, to);
1689 if (result < 0 || iov_iter_count(to) == 0)
1692 env = cl_env_get(&refcheck);
1694 return PTR_ERR(env);
1696 args = ll_env_args(env, IO_NORMAL);
1697 args->u.normal.via_iter = to;
1698 args->u.normal.via_iocb = iocb;
/* rc2 covers the remainder after a partial fast read */
1700 rc2 = ll_file_io_generic(env, args, file, CIT_READ,
1701 &iocb->ki_pos, iov_iter_count(to));
1704 else if (result == 0)
1707 cl_env_put(env, &refcheck);
1710 ll_rw_stats_tally(ll_i2sbi(file_inode(file)), current->pid,
1711 LUSTRE_FPRIVATE(file), iocb->ki_pos, result,
1718 * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1719 * If a page is already in the page cache and dirty (and some other things -
1720 * See ll_tiny_write_begin for the instantiation of these rules), then we can
1721 * write to it without doing a full I/O, because Lustre already knows about it
1722 * and will write it out. This saves a lot of processing time.
1724 * All writes here are within one page, so exclusion is handled by the page
1725 * lock on the vm page. We do not do tiny writes for writes which touch
1726 * multiple pages because it's very unlikely multiple sequential pages are
1727 * already dirty.
1729 * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1730 * and are unlikely to be to already dirty pages.
1732 * Attribute updates are important here, we do them in ll_tiny_write_end.
1734 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1736 ssize_t count = iov_iter_count(iter);
1737 struct file *file = iocb->ki_filp;
1738 struct inode *inode = file_inode(file);
1739 bool lock_inode = !IS_NOSEC(inode);
1744 /* Restrict writes to single page and < PAGE_SIZE. See comment at top
1745 * of function for why.
1747 if (count >= PAGE_SIZE ||
1748 (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
/* non-NOSEC inodes need the inode lock so setuid/setgid can be cleared */
1751 if (unlikely(lock_inode))
1753 result = __generic_file_write_iter(iocb, iter);
1755 if (unlikely(lock_inode))
1756 inode_unlock(inode);
1758 /* If the page is not already dirty, ll_tiny_write_begin returns
1759 * -ENODATA. We continue on to normal write.
1761 if (result == -ENODATA)
1765 ll_heat_add(inode, CIT_WRITE, result);
1766 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1768 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1771 CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1777 * Write to a file (through the page cache).
/* VFS write_iter entry point: PCC first (with -ENOSPC/-EDQUOT fallback),
 * then the tiny-write fast path, then the full cl_io path for the rest;
 * byte counts from tiny and normal writes are combined for the return. */
1779 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1781 struct vvp_io_args *args;
1783 ssize_t rc_tiny = 0, rc_normal;
1784 struct file *file = iocb->ki_filp;
1791 if (!iov_iter_count(from))
1792 GOTO(out, rc_normal = 0);
1795 * When PCC write failed, we usually do not fall back to the normal
1796 * write path, just return the error. But there is a special case when
1797 * returned error code is -ENOSPC due to running out of space on PCC HSM
1798 * backend. At this time, it will fall back to normal I/O path and
1799 * retry the I/O. As the file is in HSM released state, it will restore
1800 * the file data to OSTs first and redo the write again. And the
1801 * restore process will revoke the layout lock and detach the file
1802 * from PCC cache automatically.
1804 result = pcc_file_write_iter(iocb, from, &cached);
1805 if (cached && result != -ENOSPC && result != -EDQUOT)
1808 /* NB: we can't do direct IO for tiny writes because they use the page
1809 * cache, we can't do sync writes because tiny writes can't flush
1810 * pages, and we can't do append writes because we can't guarantee the
1811 * required DLM locks are held to protect file size.
1813 if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(file))) &&
1814 !(file->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1815 rc_tiny = ll_do_tiny_write(iocb, from);
1817 /* In case of error, go on and try normal write - Only stop if tiny
1818 * write completed I/O.
1820 if (iov_iter_count(from) == 0)
1821 GOTO(out, rc_normal = rc_tiny);
1823 env = cl_env_get(&refcheck);
1825 return PTR_ERR(env);
1827 args = ll_env_args(env, IO_NORMAL);
1828 args->u.normal.via_iter = from;
1829 args->u.normal.via_iocb = iocb;
1831 rc_normal = ll_file_io_generic(env, args, file, CIT_WRITE,
1832 &iocb->ki_pos, iov_iter_count(from));
1834 /* On success, combine bytes written. */
1835 if (rc_tiny >= 0 && rc_normal > 0)
1836 rc_normal += rc_tiny;
1837 /* On error, only return error from normal write if tiny write did not
1838 * write any bytes. Otherwise return bytes written by tiny write.
1840 else if (rc_tiny > 0)
1841 rc_normal = rc_tiny;
1843 cl_env_put(env, &refcheck);
1846 ll_rw_stats_tally(ll_i2sbi(file_inode(file)), current->pid,
1847 LUSTRE_FPRIVATE(file), iocb->ki_pos,
1852 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1854 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/* Validates an iovec array: rejects negative/overflowing cumulative lengths
 * and trims *nr_segs at the first segment that fails access_ok(), returning
 * the total verified byte count in *count. */
1856 static int ll_file_get_iov_count(const struct iovec *iov,
1857 unsigned long *nr_segs, size_t *count)
1862 for (seg = 0; seg < *nr_segs; seg++) {
1863 const struct iovec *iv = &iov[seg];
1866 * If any segment has a negative length, or the cumulative
1867 * length ever wraps negative then return -EINVAL.
1870 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1872 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1877 cnt -= iv->iov_len;	/* This segment is no good */
/* Legacy aio_read entry (pre read_iter kernels): validates the iovec,
 * wraps it in an iov_iter and delegates to ll_file_read_iter(). */
1884 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1885 unsigned long nr_segs, loff_t pos)
1892 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1899 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1900 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1901 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1902 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1903 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1905 result = ll_file_read_iter(iocb, &to);
/* Legacy synchronous read entry (pre read_iter kernels): builds a one-entry
 * iovec and a sync kiocb at *ppos, delegates to ll_file_aio_read(), and
 * writes the advanced position back through ppos. */
1910 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1913 struct iovec iov = { .iov_base = buf, .iov_len = count };
1922 init_sync_kiocb(&kiocb, file);
1923 kiocb.ki_pos = *ppos;
1924 #ifdef HAVE_KIOCB_KI_LEFT
1925 kiocb.ki_left = count;
1926 #elif defined(HAVE_KI_NBYTES)
/* fix: field is ki_nbytes (was misspelled i_nbytes; compare ll_file_write) */
1927 kiocb.ki_nbytes = count;
1930 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1931 *ppos = kiocb.ki_pos;
1937 * Write to a file (through the page cache).
/* Legacy aio_write entry (pre write_iter kernels): validates the iovec,
 * wraps it in an iov_iter and delegates to ll_file_write_iter(). */
1940 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1941 unsigned long nr_segs, loff_t pos)
1943 struct iov_iter from;
1948 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1955 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1956 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1957 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1958 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1959 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1961 result = ll_file_write_iter(iocb, &from);
/* Legacy synchronous write entry (pre write_iter kernels): builds a
 * one-entry iovec and a sync kiocb at *ppos, delegates to
 * ll_file_aio_write(), and writes the advanced position back via ppos. */
1966 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1967 size_t count, loff_t *ppos)
1969 struct iovec iov = { .iov_base = (void __user *)buf,
1979 init_sync_kiocb(&kiocb, file);
1980 kiocb.ki_pos = *ppos;
1981 #ifdef HAVE_KIOCB_KI_LEFT
1982 kiocb.ki_left = count;
1983 #elif defined(HAVE_KI_NBYTES)
1984 kiocb.ki_nbytes = count;
1987 result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1988 *ppos = kiocb.ki_pos;
1992 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1995 * Send file content (through pagecache) somewhere with helper
/* splice_read entry point: PCC first, then the generic cl_io read path with
 * an IO_SPLICE subtype so pages go straight to the pipe. */
1997 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1998 struct pipe_inode_info *pipe, size_t count,
2002 struct vvp_io_args *args;
2009 result = pcc_file_splice_read(in_file, ppos, pipe,
2010 count, flags, &cached);
2014 ll_ras_enter(in_file);
2016 env = cl_env_get(&refcheck);
2018 RETURN(PTR_ERR(env));
2020 args = ll_env_args(env, IO_SPLICE);
2021 args->u.splice.via_pipe = pipe;
2022 args->u.splice.via_flags = flags;
2024 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
2025 cl_env_put(env, &refcheck);
2028 ll_rw_stats_tally(ll_i2sbi(file_inode(in_file)), current->pid,
2029 LUSTRE_FPRIVATE(in_file), *ppos, result,
/* Set striping (LOV EA) on a file by opening it with the given lum attached
 * to the open intent; the MDT creates the layout at open time. Swabs the
 * user md first on big-endian hosts. Runs under the inode size lock. */
2034 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
2035 __u64 flags, struct lov_user_md *lum, int lum_size)
2037 struct lookup_intent oit = {
2039 .it_flags = flags | MDS_OPEN_BY_FID,
2044 if ((__swab32(lum->lmm_magic) & le32_to_cpu(LOV_MAGIC_MASK)) ==
2045 le32_to_cpu(LOV_MAGIC_MAGIC)) {
2046 /* this code will only exist for big-endian systems */
2047 lustre_swab_lov_user_md(lum, 0);
2050 ll_inode_size_lock(inode);
2051 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
2053 GOTO(out_unlock, rc);
2055 ll_release_openhandle(dentry, &oit);
2058 ll_inode_size_unlock(inode);
2059 ll_intent_release(&oit);
/* Fetch the LOV EA for 'filename' (child of 'inode') from the MDT via
 * md_getattr_name. Validates the magic, swabs little-endian wire data to
 * host order (including per-object entries for regular files), and returns
 * the lmm buffer (owned by *request) plus its size to the caller. */
2064 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
2065 struct lov_mds_md **lmmp, int *lmm_size,
2066 struct ptlrpc_request **request)
2068 struct ll_sb_info *sbi = ll_i2sbi(inode);
2069 struct mdt_body *body;
2070 struct lov_mds_md *lmm = NULL;
2071 struct ptlrpc_request *req = NULL;
2072 struct md_op_data *op_data;
2075 rc = ll_get_default_mdsize(sbi, &lmmsize);
2079 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
2080 strlen(filename), lmmsize,
2081 LUSTRE_OPC_ANY, NULL);
2082 if (IS_ERR(op_data))
2083 RETURN(PTR_ERR(op_data));
2085 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
2086 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
2087 ll_finish_md_op_data(op_data);
2089 CDEBUG(D_INFO, "md_getattr_name failed "
2090 "on %s: rc %d\n", filename, rc);
2094 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2095 LASSERT(body != NULL); /* checked by mdc_getattr_name */
2097 lmmsize = body->mbo_eadatasize;
2099 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
2101 GOTO(out, rc = -ENODATA);
2104 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
2105 LASSERT(lmm != NULL);
2107 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
2108 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
2109 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1) &&
2110 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_FOREIGN))
2111 GOTO(out, rc = -EPROTO);
2114 * This is coming from the MDS, so is probably in
2115 * little endian. We convert it to host endian before
2116 * passing it to userspace.
2118 if ((lmm->lmm_magic & __swab32(LOV_MAGIC_MAGIC)) ==
2119 __swab32(LOV_MAGIC_MAGIC)) {
2120 int stripe_count = 0;
2122 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
2123 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2124 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
2125 if (le32_to_cpu(lmm->lmm_pattern) &
2126 LOV_PATTERN_F_RELEASED)
2130 lustre_swab_lov_user_md((struct lov_user_md *)lmm, 0);
2132 /* if function called for directory - we should
2133 * avoid swab of non-existent lsm objects */
2134 if (lmm->lmm_magic == LOV_MAGIC_V1 && S_ISREG(body->mbo_mode))
2135 lustre_swab_lov_user_md_objects(
2136 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
2138 else if (lmm->lmm_magic == LOV_MAGIC_V3 &&
2139 S_ISREG(body->mbo_mode))
2140 lustre_swab_lov_user_md_objects(
2141 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
2147 *lmm_size = lmmsize;
/* LL_IOC_LOV_SETEA: admin-only path that copies a raw lov_user_md (with one
 * preallocated object entry) from userspace and applies it via
 * ll_lov_setstripe_ea_info with MDS_OPEN_HAS_OBJS. */
2152 static int ll_lov_setea(struct inode *inode, struct file *file,
2155 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2156 struct lov_user_md *lump;
2157 int lum_size = sizeof(struct lov_user_md) +
2158 sizeof(struct lov_user_ost_data);
2162 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2165 OBD_ALLOC_LARGE(lump, lum_size);
2169 if (copy_from_user(lump, arg, lum_size))
2170 GOTO(out_lump, rc = -EFAULT);
2172 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
/* delayed-create flag is consumed by this call either way */
2174 cl_lov_delay_create_clear(&file->f_flags);
2177 OBD_FREE_LARGE(lump, lum_size);
/* Copy the file's layout to the userspace buffer 'lum' (at most 'size'
 * bytes) via cl_object_getstripe under a cl env. */
2181 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2188 env = cl_env_get(&refcheck);
2190 RETURN(PTR_ERR(env));
2192 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2193 cl_env_put(env, &refcheck);
/* LL_IOC_LOV_SETSTRIPE: copy the user md in, create the layout through the
 * open intent, then refresh the layout generation and echo the resulting
 * striping back to the user buffer. */
2197 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2200 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2201 struct lov_user_md *klum;
2203 __u64 flags = FMODE_WRITE;
2206 rc = ll_copy_user_md(lum, &klum);
2211 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
/* zero stripe_count first so old tools see a sane value on error paths */
2216 rc = put_user(0, &lum->lmm_stripe_count);
2220 rc = ll_layout_refresh(inode, &gen);
2224 rc = ll_file_getstripe(inode, arg, lum_size);
2226 cl_lov_delay_create_clear(&file->f_flags);
2229 OBD_FREE_LARGE(klum, lum_size);
/* LL_IOC_GROUP_LOCK: take a group lock with group id 'arg' on the file.
 * Serialized by lli_group_mutex; a different gid must wait (or fail with
 * -EAGAIN for O_NONBLOCK) until current group users drain. For PFL files
 * all OST objects are instantiated first so the single group lock covers
 * components created later during the I/O. */
2235 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2237 struct ll_inode_info *lli = ll_i2info(inode);
2238 struct cl_object *obj = lli->lli_clob;
2239 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2240 struct ll_grouplock grouplock;
2245 CWARN("group id for group lock must not be 0\n");
2249 if (ll_file_nolock(file))
2250 RETURN(-EOPNOTSUPP);
2252 if (file->f_flags & O_NONBLOCK) {
2253 if (!mutex_trylock(&lli->lli_group_mutex))
2256 mutex_lock(&lli->lli_group_mutex);
2258 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2259 CWARN("group lock already existed with gid %lu\n",
2260 fd->fd_grouplock.lg_gid);
2261 GOTO(out, rc = -EINVAL);
2263 if (arg != lli->lli_group_gid && lli->lli_group_users != 0) {
2264 if (file->f_flags & O_NONBLOCK)
2265 GOTO(out, rc = -EAGAIN);
2266 mutex_unlock(&lli->lli_group_mutex);
/* drop the mutex while waiting, then retry from the top */
2267 wait_var_event(&lli->lli_group_users, !lli->lli_group_users);
2268 GOTO(retry, rc = 0);
2270 LASSERT(fd->fd_grouplock.lg_lock == NULL);
2273 * XXX: group lock needs to protect all OST objects while PFL
2274 * can add new OST objects during the IO, so we'd instantiate
2275 * all OST objects before getting its group lock.
2280 struct cl_layout cl = {
2281 .cl_is_composite = false,
2283 struct lu_extent ext = {
2285 .e_end = OBD_OBJECT_EOF,
2288 env = cl_env_get(&refcheck);
2290 GOTO(out, rc = PTR_ERR(env));
2292 rc = cl_object_layout_get(env, obj, &cl);
2293 if (!rc && cl.cl_is_composite)
2294 rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2297 cl_env_put(env, &refcheck);
2302 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2303 arg, (file->f_flags & O_NONBLOCK), &grouplock);
2308 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2309 fd->fd_grouplock = grouplock;
2310 if (lli->lli_group_users == 0)
2311 lli->lli_group_gid = grouplock.lg_gid;
2312 lli->lli_group_users++;
2314 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
2316 mutex_unlock(&lli->lli_group_mutex);
/* LL_IOC_GROUP_UNLOCK: release the group lock previously taken on this file
 * descriptor; the gid must match. The last user resets lli_group_gid and
 * wakes waiters blocked in ll_get_grouplock. */
2321 static int ll_put_grouplock(struct inode *inode, struct file *file,
2324 struct ll_inode_info *lli = ll_i2info(inode);
2325 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2326 struct ll_grouplock grouplock;
2330 mutex_lock(&lli->lli_group_mutex);
2331 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2332 CWARN("no group lock held\n");
2333 GOTO(out, rc = -EINVAL);
2336 LASSERT(fd->fd_grouplock.lg_lock != NULL);
2338 if (fd->fd_grouplock.lg_gid != arg) {
2339 CWARN("group lock %lu doesn't match current id %lu\n",
2340 arg, fd->fd_grouplock.lg_gid);
2341 GOTO(out, rc = -EINVAL);
/* detach from fd before dropping the DLM lock */
2344 grouplock = fd->fd_grouplock;
2345 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2346 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2348 cl_put_grouplock(&grouplock);
2350 lli->lli_group_users--;
2351 if (lli->lli_group_users == 0) {
2352 lli->lli_group_gid = 0;
2353 wake_up_var(&lli->lli_group_users);
2355 CDEBUG(D_INFO, "group lock %lu released\n", arg);
2358 mutex_unlock(&lli->lli_group_mutex);
2364 * Close inode open handle
2366 * \param dentry [in] dentry which contains the inode
2367 * \param it [in,out] intent which contains open info and result
2370 * \retval <0 failure
/* Used after an intent open whose handle won't be kept: fills a temporary
 * och from the intent and closes it on the MDT, then drops the intent's
 * open-request reference. No-op for root or when no open was performed. */
2372 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2374 struct inode *inode = dentry->d_inode;
2375 struct obd_client_handle *och;
2381 /* Root ? Do nothing. */
2382 if (dentry->d_inode->i_sb->s_root == dentry)
2385 /* No open handle to close? Move away */
2386 if (!it_disposition(it, DISP_OPEN_OPEN))
2389 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2391 OBD_ALLOC(och, sizeof(*och));
2393 GOTO(out, rc = -ENOMEM);
2395 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2397 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2399 /* this one is in place of ll_file_open */
2400 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2401 ptlrpc_req_finished(it->it_request);
2402 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2408 * Get size for inode for which FIEMAP mapping is requested.
2409 * Make the FIEMAP get_info call and returns the result.
2410 * \param fiemap	kernel buffer to hold extents
2411 * \param num_bytes	kernel buffer size
/* Rejects unsupported fiemap flags (reporting the supported set back),
 * honours FIEMAP_FLAG_SYNC by flushing dirty pages, glimpses the size when
 * unknown, and short-circuits zero-length files before querying the OSTs. */
2413 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2419 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2422 /* Checks for fiemap flags */
2423 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2424 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2428 /* Check for FIEMAP_FLAG_SYNC */
2429 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2430 rc = filemap_fdatawrite(inode->i_mapping);
2435 env = cl_env_get(&refcheck);
2437 RETURN(PTR_ERR(env));
2439 if (i_size_read(inode) == 0) {
2440 rc = ll_glimpse_size(inode);
2445 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2446 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2447 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2449 /* If filesize is 0, then there would be no objects for mapping */
2450 if (fmkey.lfik_oa.o_size == 0) {
2451 fiemap->fm_mapped_extents = 0;
2455 fmkey.lfik_fiemap = *fiemap;
2457 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2458 &fmkey, fiemap, &num_bytes);
2460 cl_env_put(env, &refcheck);
/* OBD_IOC_FID2PATH: resolve a FID to a path via the MDC. Reads gf_pathlen
 * first to size the output buffer (capped at PATH_MAX), appends the root
 * FID so fileset mounts resolve correctly, and copies the result back. */
2464 int ll_fid2path(struct inode *inode, void __user *arg)
2466 struct obd_export *exp = ll_i2mdexp(inode);
2467 const struct getinfo_fid2path __user *gfin = arg;
2469 struct getinfo_fid2path *gfout;
2475 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2476 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2479 /* Only need to get the buflen */
2480 if (get_user(pathlen, &gfin->gf_pathlen))
2483 if (pathlen > PATH_MAX)
2486 outsize = sizeof(*gfout) + pathlen;
2487 OBD_ALLOC(gfout, outsize);
2491 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2492 GOTO(gf_free, rc = -EFAULT);
2493 /* append root FID after gfout to let MDT know the root FID so that it
2494 * can lookup the correct path, this is mainly for fileset.
2495 * old server without fileset mount support will ignore this. */
2496 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2498 /* Call mdc_iocontrol */
2499 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2503 if (copy_to_user(arg, gfout, outsize))
2507 OBD_FREE(gfout, outsize);
/* Run a CIT_DATA_VERSION cl_io to fetch the file's data version and layout
 * version into 'ioc'. A file with no cl_object reports version 0. Retries
 * when the io layer asks for a restart (layout change). */
2512 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2514 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2522 ioc->idv_version = 0;
2523 ioc->idv_layout_version = UINT_MAX;
2525 /* If no file object initialized, we consider its version is 0. */
2529 env = cl_env_get(&refcheck);
2531 RETURN(PTR_ERR(env));
2533 io = vvp_env_thread_io(env);
2535 io->u.ci_data_version.dv_data_version = 0;
2536 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2537 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2540 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2541 result = cl_io_loop(env, io);
2543 result = io->ci_result;
2545 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2546 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2548 cl_io_fini(env, io);
2550 if (unlikely(io->ci_need_restart))
2553 cl_env_put(env, &refcheck);
2559 * Read the data_version for inode.
2561 * This value is computed using stripe object version on OST.
2562 * Version is computed using server side locking.
2564 * @param flags if do sync on the OST side;
2566 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2567 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/*
 * Thin wrapper around ll_ioc_data_version(): builds an ioc_data_version
 * carrying @flags, runs the query and copies the resulting version out
 * through @data_version.  (Success check between the call and the copy
 * is elided from this listing.)
 */
2569 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2571 struct ioc_data_version ioc = { .idv_flags = flags };
2574 rc = ll_ioc_data_version(inode, &ioc);
2576 *data_version = ioc.idv_version;
2582 * Trigger a HSM release request for the provided inode.
/*
 * ll_hsm_release(): trigger an HSM release of @inode's data.
 *
 * Takes a FMODE_WRITE lease with MDS_OPEN_RELEASE intent, flushes and
 * records the latest data_version, merges [am]time attributes, then
 * closes the open handle with the MDS_HSM_RELEASE bias so the MDT can
 * punch the objects.  On any failure the lease is closed in the "out"
 * path.  (Several error-check lines are elided from this listing.)
 */
2584 int ll_hsm_release(struct inode *inode)
2587 struct obd_client_handle *och = NULL;
2588 __u64 data_version = 0;
2593 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2594 ll_i2sbi(inode)->ll_fsname,
2595 PFID(&ll_i2info(inode)->lli_fid));
/* Exclusive write lease guarantees no concurrent modification. */
2597 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2599 GOTO(out, rc = PTR_ERR(och));
2601 /* Grab latest data_version and [am]time values */
2602 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2606 env = cl_env_get(&refcheck);
2608 GOTO(out, rc = PTR_ERR(env));
2610 rc = ll_merge_attr(env, inode);
2611 cl_env_put(env, &refcheck);
2613 /* If error happen, we have the wrong size for a file.
2619 /* Release the file.
2620 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2621 * we still need it to pack l_remote_handle to MDT. */
2622 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2628 if (och != NULL && !IS_ERR(och)) /* close the file */
2629 ll_lease_close(och, inode, NULL);
/* Scratch state for ll_swap_layouts(): the two inodes whose layouts are
 * being swapped (dv1/dv2 and check_dv1/check_dv2 members are on lines
 * elided from this listing — see their uses below). */
2634 struct ll_swap_stack {
2637 struct inode *inode1;
2638 struct inode *inode2;
/*
 * ll_swap_layouts(): swap the layouts of the files behind @file1/@file2.
 *
 * Validates the pair, orders the two inodes by FID to serialize against
 * a concurrent swap of the same pair, optionally flushes dirty cache via
 * a group lock (lsl->sl_gid != 0) and verifies the data versions have
 * not changed if requested, then issues LL_IOC_LOV_SWAP_LAYOUTS to the
 * MDT through obd_iocontrol().  (Error-check/GOTO lines are elided from
 * this listing; comments describe only the visible statements.)
 */
2643 static int ll_swap_layouts(struct file *file1, struct file *file2,
2644 struct lustre_swap_layouts *lsl)
2646 struct mdc_swap_layouts msl;
2647 struct md_op_data *op_data;
2650 struct ll_swap_stack *llss = NULL;
2653 OBD_ALLOC_PTR(llss);
2657 llss->inode1 = file_inode(file1);
2658 llss->inode2 = file_inode(file2);
2660 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2664 /* we use 2 bool because it is easier to swap than 2 bits */
2665 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2666 llss->check_dv1 = true;
2668 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2669 llss->check_dv2 = true;
2671 /* we cannot use lsl->sl_dvX directly because we may swap them */
2672 llss->dv1 = lsl->sl_dv1;
2673 llss->dv2 = lsl->sl_dv2;
/* Order by FID so two racing swaps of the same pair lock in the same
 * order; equal FIDs mean the same file and nothing to do. */
2675 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2676 if (rc == 0) /* same file, done! */
2679 if (rc < 0) { /* sequentialize it */
2680 swap(llss->inode1, llss->inode2);
2682 swap(llss->dv1, llss->dv2);
2683 swap(llss->check_dv1, llss->check_dv2);
/* "gid" presumably comes from lsl->sl_gid (declaration elided). */
2687 if (gid != 0) { /* application asks to flush dirty cache */
2688 rc = ll_get_grouplock(llss->inode1, file1, gid);
2692 rc = ll_get_grouplock(llss->inode2, file2, gid);
/* Second grouplock failed: drop the first before bailing out. */
2694 ll_put_grouplock(llss->inode1, file1, gid);
2699 /* ultimate check, before swaping the layouts we check if
2700 * dataversion has changed (if requested) */
2701 if (llss->check_dv1) {
2702 rc = ll_data_version(llss->inode1, &dv, 0);
2705 if (dv != llss->dv1)
2706 GOTO(putgl, rc = -EAGAIN);
2709 if (llss->check_dv2) {
2710 rc = ll_data_version(llss->inode2, &dv, 0);
2713 if (dv != llss->dv2)
2714 GOTO(putgl, rc = -EAGAIN);
2717 /* struct md_op_data is used to send the swap args to the mdt
2718 * only flags is missing, so we use struct mdc_swap_layouts
2719 * through the md_op_data->op_data */
2720 /* flags from user space have to be converted before they are send to
2721 * server, no flag is sent today, they are only used on the client */
2724 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2725 0, LUSTRE_OPC_ANY, &msl);
2726 if (IS_ERR(op_data))
2727 GOTO(free, rc = PTR_ERR(op_data));
2729 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2730 sizeof(*op_data), op_data, NULL);
2731 ll_finish_md_op_data(op_data);
/* putgl: drop group locks in reverse acquisition order. */
2738 ll_put_grouplock(llss->inode2, file2, gid);
2739 ll_put_grouplock(llss->inode1, file1, gid);
/*
 * ll_hsm_state_set(): set/clear HSM state flags on @inode.
 *
 * Rejects masks outside HSM_FLAGS_MASK, restricts non-HSM_USER_MASK
 * bits to CAP_SYS_ADMIN, range-checks the archive id for servers
 * without archive-id-array support, then forwards the request to the
 * MDT via LL_IOC_HSM_STATE_SET.  (RETURN lines for the permission and
 * range checks are elided from this listing.)
 */
2749 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2751 struct obd_export *exp = ll_i2mdexp(inode);
2752 struct md_op_data *op_data;
2756 /* Detect out-of range masks */
2757 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2760 /* Non-root users are forbidden to set or clear flags which are
2761 * NOT defined in HSM_USER_MASK. */
2762 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2763 !cfs_capable(CFS_CAP_SYS_ADMIN))
2766 if (!exp_connect_archive_id_array(exp)) {
2767 /* Detect out-of range archive id */
2768 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2769 (hss->hss_archive_id > LL_HSM_ORIGIN_MAX_ARCHIVE))
2773 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2774 LUSTRE_OPC_ANY, hss);
2775 if (IS_ERR(op_data))
2776 RETURN(PTR_ERR(op_data));
2778 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, exp, sizeof(*op_data),
2781 ll_finish_md_op_data(op_data);
/*
 * ll_hsm_import(): register an existing archived file as "released" in
 * Lustre (HSM import).
 *
 * Only valid for regular files.  Marks the file HS_ARCHIVED | HS_EXISTS
 * | HS_RELEASED with the given archive id, then forces the mode, owner,
 * size and timestamps supplied by userspace in @hui onto the inode via
 * ll_setattr_raw().  (Allocation checks, inode_lock and the out: label
 * are elided from this listing.)
 */
2786 static int ll_hsm_import(struct inode *inode, struct file *file,
2787 struct hsm_user_import *hui)
2789 struct hsm_state_set *hss = NULL;
2790 struct iattr *attr = NULL;
2794 if (!S_ISREG(inode->i_mode))
2800 GOTO(out, rc = -ENOMEM);
/* Stamp the HSM state first so the MDT knows the file is released. */
2802 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2803 hss->hss_archive_id = hui->hui_archive_id;
2804 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2805 rc = ll_hsm_state_set(inode, hss);
2809 OBD_ALLOC_PTR(attr);
2811 GOTO(out, rc = -ENOMEM);
/* Force S_IFREG: the import target must be a regular file. */
2813 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2814 attr->ia_mode |= S_IFREG;
2815 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2816 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2817 attr->ia_size = hui->hui_size;
2818 attr->ia_mtime.tv_sec = hui->hui_mtime;
2819 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2820 attr->ia_atime.tv_sec = hui->hui_atime;
2821 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
/* ATTR_FORCE bypasses permission checks; *_SET keeps exact times. */
2823 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2824 ATTR_UID | ATTR_GID |
2825 ATTR_MTIME | ATTR_MTIME_SET |
2826 ATTR_ATIME | ATTR_ATIME_SET;
2830 rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2834 inode_unlock(inode);
/* Map a kernel fmode_t to the LL_LEASE_{RD,WR}LCK bits reported to
 * userspace; readable+writable yields both bits ORed together. */
2846 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2848 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2849 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * ll_file_futimes_3(): set atime/mtime/ctime of a regular file from the
 * LL_IOC_FUTIMES_3 payload (ctime setting requires CAP_SYS_ADMIN).
 * The three timestamps are applied through ll_setattr_raw() with
 * OP_XVALID_CTIME_SET so the ctime is taken verbatim.  (The iattr
 * declaration line and inode_lock are elided from this listing.)
 */
2852 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2854 struct inode *inode = file_inode(file);
2856 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2857 ATTR_MTIME | ATTR_MTIME_SET |
2860 .tv_sec = lfu->lfu_atime_sec,
2861 .tv_nsec = lfu->lfu_atime_nsec,
2864 .tv_sec = lfu->lfu_mtime_sec,
2865 .tv_nsec = lfu->lfu_mtime_nsec,
2868 .tv_sec = lfu->lfu_ctime_sec,
2869 .tv_nsec = lfu->lfu_ctime_nsec,
/* Arbitrary ctime manipulation is privileged. */
2875 if (!capable(CAP_SYS_ADMIN))
2878 if (!S_ISREG(inode->i_mode))
2882 rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2884 inode_unlock(inode);
/* Translate a userspace lock mode (MODE_*_USER) into the kernel
 * cl_lock_mode.  Return values and the default (invalid mode) case are
 * on lines elided from this listing. */
2889 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2892 case MODE_READ_USER:
2894 case MODE_WRITE_USER:
2901 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2903 /* Used to allow the upper layers of the client to request an LDLM lock
2904 * without doing an actual read or write.
2906 * Used for ladvise lockahead to manually request specific locks.
2908 * \param[in] file file this ladvise lock request is on
2909 * \param[in] ladvise ladvise struct describing this lock request
2911 * \retval 0 success, no detailed result available (sync requests
2912 * and requests sent to the server [not handled locally]
2913 * cannot return detailed results)
2914 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2915 * see definitions for details.
2916 * \retval negative negative errno on error
/*
 * ll_file_lock_ahead(): manually request an LDLM extent lock for
 * ladvise lockahead, without performing any read or write.
 *
 * Builds a CIT_MISC io, fills a cl_lock_descr covering the byte range
 * [lla_start, lla_end] converted to page indices, enqueues the lock
 * (CEF_MUST forbids a lockless downgrade; LF_ASYNC adds
 * CEF_SPECULATIVE), then releases it — the point is to cache the lock,
 * not to hold it.  -ECANCELED/-EEXIST are mapped to the positive
 * LLA_RESULT_* codes for userspace.  (Some declarations and error
 * branches are elided from this listing.)
 */
2918 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2920 struct lu_env *env = NULL;
2921 struct cl_io *io = NULL;
2922 struct cl_lock *lock = NULL;
2923 struct cl_lock_descr *descr = NULL;
2924 struct dentry *dentry = file->f_path.dentry;
2925 struct inode *inode = dentry->d_inode;
2926 enum cl_lock_mode cl_mode;
2927 off_t start = ladvise->lla_start;
2928 off_t end = ladvise->lla_end;
2934 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2935 "start=%llu, end=%llu\n", dentry->d_name.len,
2936 dentry->d_name.name, dentry->d_inode,
2937 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2940 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2942 GOTO(out, result = cl_mode);
2944 /* Get IO environment */
2945 result = cl_io_get(inode, &env, &io, &refcheck);
2949 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2952 * nothing to do for this io. This currently happens when
2953 * stripe sub-object's are not yet created.
2955 result = io->ci_result;
2956 } else if (result == 0) {
2957 lock = vvp_env_lock(env);
2958 descr = &lock->cll_descr;
2960 descr->cld_obj = io->ci_obj;
2961 /* Convert byte offsets to pages */
2962 descr->cld_start = cl_index(io->ci_obj, start);
2963 descr->cld_end = cl_index(io->ci_obj, end);
2964 descr->cld_mode = cl_mode;
2965 /* CEF_MUST is used because we do not want to convert a
2966 * lockahead request to a lockless lock */
2967 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2970 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2971 descr->cld_enq_flags |= CEF_SPECULATIVE;
2973 result = cl_lock_request(env, io, lock);
2975 /* On success, we need to release the lock */
2977 cl_lock_release(env, lock);
2979 cl_io_fini(env, io);
2980 cl_env_put(env, &refcheck);
2982 /* -ECANCELED indicates a matching lock with a different extent
2983 * was already present, and -EEXIST indicates a matching lock
2984 * on exactly the same extent was already present.
2985 * We convert them to positive values for userspace to make
2986 * recognizing true errors easier.
2987 * Note we can only return these detailed results on async requests,
2988 * as sync requests look the same as i/o requests for locking. */
2989 if (result == -ECANCELED)
2990 result = LLA_RESULT_DIFFERENT;
2991 else if (result == -EEXIST)
2992 result = LLA_RESULT_SAME;
2997 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
/*
 * ll_ladvise_sanity(): validate one llapi_lu_ladvise entry before it is
 * acted on: advice in range, per-advice flags within the allowed mask,
 * lockahead mode valid, and start < end for range-based advices.
 * Each failure logs a D_VFSTRACE message; the rc assignments and switch
 * statement line are elided from this listing.
 */
2999 static int ll_ladvise_sanity(struct inode *inode,
3000 struct llapi_lu_ladvise *ladvise)
3002 struct ll_sb_info *sbi = ll_i2sbi(inode);
3003 enum lu_ladvise_type advice = ladvise->lla_advice;
3004 /* Note the peradvice flags is a 32 bit field, so per advice flags must
3005 * be in the first 32 bits of enum ladvise_flags */
3006 __u32 flags = ladvise->lla_peradvice_flags;
3007 /* 3 lines at 80 characters per line, should be plenty */
3010 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
3012 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
3013 "last supported advice is %s (value '%d'): rc = %d\n",
3014 sbi->ll_fsname, advice,
3015 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
3019 /* Per-advice checks */
3021 case LU_LADVISE_LOCKNOEXPAND:
3022 if (flags & ~LF_LOCKNOEXPAND_MASK) {
3024 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
3025 "rc = %d\n", sbi->ll_fsname, flags,
3026 ladvise_names[advice], rc);
3030 case LU_LADVISE_LOCKAHEAD:
3031 /* Currently only READ and WRITE modes can be requested */
3032 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
3033 ladvise->lla_lockahead_mode == 0) {
3035 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
3036 "rc = %d\n", sbi->ll_fsname,
3037 ladvise->lla_lockahead_mode,
3038 ladvise_names[advice], rc);
3042 case LU_LADVISE_WILLREAD:
3043 case LU_LADVISE_DONTNEED:
3045 /* Note fall through above - These checks apply to all advices
3046 * except LOCKNOEXPAND */
3047 if (flags & ~LF_DEFAULT_MASK) {
3049 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
3050 "rc = %d\n", sbi->ll_fsname, flags,
3051 ladvise_names[advice], rc);
3054 if (ladvise->lla_start >= ladvise->lla_end) {
3056 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
3057 "for %s: rc = %d\n", sbi->ll_fsname,
3058 ladvise->lla_start, ladvise->lla_end,
3059 ladvise_names[advice], rc);
3071 * Give file access advices
3073 * The ladvise interface is similar to Linux fadvise() system call, except it
3074 * forwards the advices directly from Lustre client to server. The server side
3075 * codes will apply appropriate read-ahead and caching techniques for the
3076 * corresponding files.
3078 * A typical workload for ladvise is e.g. a bunch of different clients are
3079 * doing small random reads of a file, so prefetching pages into OSS cache
3080 * with big linear reads before the random IO is a net benefit. Fetching
3081 * all that data into each client cache with fadvise() may not be, due to
3082 * much more data being sent to the client.
/*
 * ll_ladvise(): forward one access advice to the server via a
 * CIT_LADVISE cl_io — analogous to fadvise(2) but applied on the OSS so
 * the server can prefetch/drop cache for the given byte range.
 * (env/io declarations and the rc return path are elided from this
 * listing.)
 */
3084 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
3085 struct llapi_lu_ladvise *ladvise)
3089 struct cl_ladvise_io *lio;
3094 env = cl_env_get(&refcheck);
3096 RETURN(PTR_ERR(env));
3098 io = vvp_env_thread_io(env);
3099 io->ci_obj = ll_i2info(inode)->lli_clob;
3101 /* initialize parameters for ladvise */
3102 lio = &io->u.ci_ladvise;
3103 lio->li_start = ladvise->lla_start;
3104 lio->li_end = ladvise->lla_end;
3105 lio->li_fid = ll_inode2fid(inode);
3106 lio->li_advice = ladvise->lla_advice;
3107 lio->li_flags = flags;
/* Run the io loop only when init succeeds (rc handling elided). */
3109 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
3110 rc = cl_io_loop(env, io);
3114 cl_io_fini(env, io);
3115 cl_env_put(env, &refcheck);
/* Record the per-fd "no lock expansion" preference: set unless the
 * LF_UNSET flag requests clearing it. */
3119 static int ll_lock_noexpand(struct file *file, int flags)
3121 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3123 fd->ll_lock_no_expand = !(flags & LF_UNSET);
/*
 * ll_ioctl_fsgetxattr(): FS_IOC_FSGETXATTR handler — report the inode's
 * xflags (including PROJINHERIT from the llite LLIF flag) and project
 * id back to userspace in a struct fsxattr.  (RETURN(-EFAULT) lines
 * after the copy calls are elided from this listing.)
 */
3128 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
3131 struct fsxattr fsxattr;
/* Copy in first so unknown/reserved fields round-trip unchanged. */
3133 if (copy_from_user(&fsxattr,
3134 (const struct fsxattr __user *)arg,
3138 fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
3139 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
3140 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
3141 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
3142 if (copy_to_user((struct fsxattr __user *)arg,
3143 &fsxattr, sizeof(fsxattr)))
/*
 * ll_ioctl_check_project(): permission check for FS_IOC_FSSETXATTR.
 * Changing the project quota id or the PROJINHERIT flag is only allowed
 * from the init user namespace; everything else is permitted.  (The
 * RETURN(0)/RETURN(-EPERM) lines are elided from this listing.)
 */
3149 int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
3152 * Project Quota ID state is only allowed to change from within the init
3153 * namespace. Enforce that restriction only if we are trying to change
3154 * the quota ID state. Everything else is allowed in user namespaces.
3156 if (current_user_ns() == &init_user_ns)
3159 if (ll_i2info(inode)->lli_projid != fa->fsx_projid)
3162 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) {
3163 if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
3166 if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
/*
 * ll_ioctl_fssetxattr(): FS_IOC_FSSETXATTR handler — apply xflags and
 * project id from userspace to the inode.
 *
 * After the namespace/permission check, sends an md_setattr to the MDT
 * with OP_XVALID_PROJID | OP_XVALID_FLAGS, updates the local inode
 * flags, and propagates the flags to the OST objects via
 * cl_setattr_ost().  (RETURN(-EFAULT), rc checks and the out_fsxattr
 * label line are elided from this listing.)
 */
3173 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3177 struct md_op_data *op_data;
3178 struct ptlrpc_request *req = NULL;
3180 struct fsxattr fsxattr;
3181 struct cl_object *obj;
3185 if (copy_from_user(&fsxattr,
3186 (const struct fsxattr __user *)arg,
3190 rc = ll_ioctl_check_project(inode, &fsxattr);
3194 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3195 LUSTRE_OPC_ANY, NULL);
3196 if (IS_ERR(op_data))
3197 RETURN(PTR_ERR(op_data));
/* Convert userspace xflags to on-disk (ext) flag representation. */
3199 flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3200 op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3201 if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3202 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3203 op_data->op_projid = fsxattr.fsx_projid;
3204 op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
3205 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3207 ptlrpc_req_finished(req);
3209 GOTO(out_fsxattr, rc);
3210 ll_update_inode_flags(inode, op_data->op_attr_flags);
/* No cl_object means no OST objects to update — done. */
3211 obj = ll_i2info(inode)->lli_clob;
3213 GOTO(out_fsxattr, rc);
3215 OBD_ALLOC_PTR(attr);
3217 GOTO(out_fsxattr, rc = -ENOMEM);
3219 rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3220 fsxattr.fsx_xflags);
3223 ll_finish_md_op_data(op_data);
/*
 * ll_file_unlock_lease(): release the fd's lease, optionally carrying a
 * close intent selected by ioc->lil_flags:
 *   LL_LEASE_RESYNC_DONE  - send resync-done ids with the close
 *   LL_LEASE_LAYOUT_MERGE - merge layout with another fd's file
 *   LL_LEASE_LAYOUT_SPLIT - split a mirror out to another fd's file
 *   LL_LEASE_PCC_ATTACH   - finish a PCC readwrite attach
 * Returns the lease type held (via ll_lease_type_from_fmode) on the
 * success path.  (Error branches, fput of layout_file, and the
 * out_lease_close/out labels are elided from this listing; comments
 * describe only visible statements.)
 */
3227 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3230 struct inode *inode = file_inode(file);
3231 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3232 struct ll_inode_info *lli = ll_i2info(inode);
3233 struct obd_client_handle *och = NULL;
3234 struct split_param sp;
3235 struct pcc_param param;
3236 bool lease_broken = false;
3238 enum mds_op_bias bias = 0;
3239 struct file *layout_file = NULL;
3241 size_t data_size = 0;
3242 bool attached = false;
/* Steal the lease handle from the fd under lli_och_mutex. */
3247 mutex_lock(&lli->lli_och_mutex);
3248 if (fd->fd_lease_och != NULL) {
3249 och = fd->fd_lease_och;
3250 fd->fd_lease_och = NULL;
3252 mutex_unlock(&lli->lli_och_mutex);
3257 fmode = och->och_flags;
3259 switch (ioc->lil_flags) {
3260 case LL_LEASE_RESYNC_DONE:
3261 if (ioc->lil_count > IOC_IDS_MAX)
3262 GOTO(out_lease_close, rc = -EINVAL);
3264 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3265 OBD_ALLOC(data, data_size);
3267 GOTO(out_lease_close, rc = -ENOMEM);
3269 if (copy_from_user(data, (void __user *)arg, data_size))
3270 GOTO(out_lease_close, rc = -EFAULT);
3272 bias = MDS_CLOSE_RESYNC_DONE;
3274 case LL_LEASE_LAYOUT_MERGE: {
3277 if (ioc->lil_count != 1)
3278 GOTO(out_lease_close, rc = -EINVAL);
/* The victim fd number follows the ioc header in userspace.  Note:
 * "fd" here is presumably a local __u32 shadowing the outer ll_file_data
 * pointer (its declaration is elided) — TODO confirm. */
3280 arg += sizeof(*ioc);
3281 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3282 GOTO(out_lease_close, rc = -EFAULT);
3284 layout_file = fget(fd);
3286 GOTO(out_lease_close, rc = -EBADF);
/* Both files must be writable for a merge. */
3288 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3289 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3290 GOTO(out_lease_close, rc = -EPERM);
3292 data = file_inode(layout_file);
3293 bias = MDS_CLOSE_LAYOUT_MERGE;
3296 case LL_LEASE_LAYOUT_SPLIT: {
3300 if (ioc->lil_count != 2)
3301 GOTO(out_lease_close, rc = -EINVAL);
/* Payload: victim fd followed by the mirror id to split out. */
3303 arg += sizeof(*ioc);
3304 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3305 GOTO(out_lease_close, rc = -EFAULT);
3307 arg += sizeof(__u32);
3308 if (copy_from_user(&mirror_id, (void __user *)arg,
3310 GOTO(out_lease_close, rc = -EFAULT);
3312 layout_file = fget(fdv);
3314 GOTO(out_lease_close, rc = -EBADF);
3316 sp.sp_inode = file_inode(layout_file);
3317 sp.sp_mirror_id = (__u16)mirror_id;
3319 bias = MDS_CLOSE_LAYOUT_SPLIT;
3322 case LL_LEASE_PCC_ATTACH:
3323 if (ioc->lil_count != 1)
3326 arg += sizeof(*ioc);
3327 if (copy_from_user(&param.pa_archive_id, (void __user *)arg,
3329 GOTO(out_lease_close, rc2 = -EFAULT);
3331 rc2 = pcc_readwrite_attach(file, inode, param.pa_archive_id);
3333 GOTO(out_lease_close, rc2);
3336 /* Grab latest data version */
3337 rc2 = ll_data_version(inode, &param.pa_data_version,
3340 GOTO(out_lease_close, rc2);
3343 bias = MDS_PCC_ATTACH;
3346 /* without close intent */
/* out_lease_close path: close the lease with the chosen intent. */
3351 rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3355 rc = ll_lease_och_release(inode, file);
/* Post-close cleanup per intent. */
3364 switch (ioc->lil_flags) {
3365 case LL_LEASE_RESYNC_DONE:
3367 OBD_FREE(data, data_size);
3369 case LL_LEASE_LAYOUT_MERGE:
3370 case LL_LEASE_LAYOUT_SPLIT:
3374 case LL_LEASE_PCC_ATTACH:
3377 rc = pcc_readwrite_attach_fini(file, inode,
3378 param.pa_layout_gen,
3385 rc = ll_lease_type_from_fmode(fmode);
/*
 * ll_file_set_lease(): LL_IOC_SET_LEASE handler — acquire (or, for
 * LL_LEASE_UNLCK, release via ll_file_unlock_lease) a lease on @file.
 *
 * The requested mode must be compatible with how the file was opened.
 * With LL_LEASE_RESYNC the open carries MDS_OPEN_RESYNC and the layout
 * is refreshed after the resync request.  Only one lease per fd: if
 * fd_lease_och is already set the fresh lease is closed again.
 * (RETURN(-EPERM/-EINVAL) lines and the final RETURN are elided from
 * this listing.)
 */
3389 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3392 struct inode *inode = file_inode(file);
3393 struct ll_inode_info *lli = ll_i2info(inode);
3394 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3395 struct obd_client_handle *och = NULL;
3396 __u64 open_flags = 0;
3402 switch (ioc->lil_mode) {
3403 case LL_LEASE_WRLCK:
3404 if (!(file->f_mode & FMODE_WRITE))
3406 fmode = FMODE_WRITE;
3408 case LL_LEASE_RDLCK:
3409 if (!(file->f_mode & FMODE_READ))
3413 case LL_LEASE_UNLCK:
3414 RETURN(ll_file_unlock_lease(file, ioc, arg));
3419 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3421 /* apply for lease */
3422 if (ioc->lil_flags & LL_LEASE_RESYNC)
3423 open_flags = MDS_OPEN_RESYNC;
3424 och = ll_lease_open(inode, file, fmode, open_flags);
3426 RETURN(PTR_ERR(och));
3428 if (ioc->lil_flags & LL_LEASE_RESYNC) {
3429 rc = ll_lease_file_resync(och, inode, arg);
/* Resync request failed: give the lease back before returning. */
3431 ll_lease_close(och, inode, NULL);
3434 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3436 ll_lease_close(och, inode, NULL);
/* Install the lease on the fd unless one is already present. */
3442 mutex_lock(&lli->lli_och_mutex);
3443 if (fd->fd_lease_och == NULL) {
3444 fd->fd_lease_och = och;
3447 mutex_unlock(&lli->lli_och_mutex);
3449 /* impossible now that only excl is supported for now */
3450 ll_lease_close(och, inode, &lease_broken);
/*
 * ll_heat_get(): snapshot the inode's file-heat values into @heat.
 * Reads lh_count decayed heat instances under lli_heat_lock, using the
 * superblock's decay weight and period.
 */
3456 static void ll_heat_get(struct inode *inode, struct lu_heat *heat)
3458 struct ll_inode_info *lli = ll_i2info(inode);
3459 struct ll_sb_info *sbi = ll_i2sbi(inode);
3460 __u64 now = ktime_get_real_seconds();
3463 spin_lock(&lli->lli_heat_lock);
3464 heat->lh_flags = lli->lli_heat_flags;
3465 for (i = 0; i < heat->lh_count; i++)
3466 heat->lh_heat[i] = obd_heat_get(&lli->lli_heat_instances[i],
3467 now, sbi->ll_heat_decay_weight,
3468 sbi->ll_heat_period_second);
3469 spin_unlock(&lli->lli_heat_lock);
/*
 * ll_heat_set(): update per-inode heat flags under lli_heat_lock.
 * LU_HEAT_FLAG_CLEAR zeroes all heat instances; LU_HEAT_FLAG_OFF
 * toggles heat accounting off (its absence turns it back on).
 */
3472 static int ll_heat_set(struct inode *inode, enum lu_heat_flag flags)
3474 struct ll_inode_info *lli = ll_i2info(inode);
3477 spin_lock(&lli->lli_heat_lock);
3478 if (flags & LU_HEAT_FLAG_CLEAR)
3479 obd_heat_clear(lli->lli_heat_instances, OBD_HEAT_COUNT);
3481 if (flags & LU_HEAT_FLAG_OFF)
3482 lli->lli_heat_flags |= LU_HEAT_FLAG_OFF;
3484 lli->lli_heat_flags &= ~LU_HEAT_FLAG_OFF;
3486 spin_unlock(&lli->lli_heat_lock);
/*
 * ll_file_ioctl(): the llite unlocked_ioctl entry point for regular
 * files.  Dispatches every LL_IOC_* / FS_IOC_* / OBD_IOC_* command to
 * its handler; unknown commands fall through to obd_iocontrol() on the
 * data export at the bottom.
 *
 * NOTE(review): this listing has many elided lines (RETURNs, braces,
 * declarations); comments below mark only the major command groups and
 * non-obvious visible statements.
 */
3492 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3494 struct inode *inode = file_inode(file);
3495 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3499 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3500 PFID(ll_inode2fid(inode)), inode, cmd);
3501 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3503 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
3504 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
/* --- per-fd flag manipulation --- */
3508 case LL_IOC_GETFLAGS:
3509 /* Get the current value of the file flags */
3510 return put_user(fd->fd_flags, (int __user *)arg);
3511 case LL_IOC_SETFLAGS:
3512 case LL_IOC_CLRFLAGS:
3513 /* Set or clear specific file flags */
3514 /* XXX This probably needs checks to ensure the flags are
3515 * not abused, and to handle any flag side effects.
3517 if (get_user(flags, (int __user *) arg))
3520 if (cmd == LL_IOC_SETFLAGS) {
/* Disabling locking only makes sense with O_DIRECT i/o. */
3521 if ((flags & LL_FILE_IGNORE_LOCK) &&
3522 !(file->f_flags & O_DIRECT)) {
3523 CERROR("%s: unable to disable locking on "
3524 "non-O_DIRECT file\n", current->comm);
3528 fd->fd_flags |= flags;
3530 fd->fd_flags &= ~flags;
/* --- striping / layout commands --- */
3533 case LL_IOC_LOV_SETSTRIPE:
3534 case LL_IOC_LOV_SETSTRIPE_NEW:
3535 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3536 case LL_IOC_LOV_SETEA:
3537 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3538 case LL_IOC_LOV_SWAP_LAYOUTS: {
3540 struct lustre_swap_layouts lsl;
3542 if (copy_from_user(&lsl, (char __user *)arg,
3543 sizeof(struct lustre_swap_layouts)))
3546 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3549 file2 = fget(lsl.sl_fd);
3553 /* O_WRONLY or O_RDWR */
3554 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3555 GOTO(out, rc = -EPERM);
/* SWAP_LAYOUTS_CLOSE: swap via the lease handle and close it. */
3557 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3558 struct inode *inode2;
3559 struct ll_inode_info *lli;
3560 struct obd_client_handle *och = NULL;
3562 lli = ll_i2info(inode);
3563 mutex_lock(&lli->lli_och_mutex);
3564 if (fd->fd_lease_och != NULL) {
3565 och = fd->fd_lease_och;
3566 fd->fd_lease_och = NULL;
3568 mutex_unlock(&lli->lli_och_mutex);
3570 GOTO(out, rc = -ENOLCK);
3571 inode2 = file_inode(file2);
3572 rc = ll_swap_layouts_close(och, inode, inode2);
3574 rc = ll_swap_layouts(file, file2, &lsl);
3580 case LL_IOC_LOV_GETSTRIPE:
3581 case LL_IOC_LOV_GETSTRIPE_NEW:
3582 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3583 case FS_IOC_GETFLAGS:
3584 case FS_IOC_SETFLAGS:
3585 RETURN(ll_iocontrol(inode, file, cmd, arg));
3586 case FSFILT_IOC_GETVERSION:
3587 case FS_IOC_GETVERSION:
3588 RETURN(put_user(inode->i_generation, (int __user *)arg));
3589 /* We need to special case any other ioctls we want to handle,
3590 * to send them to the MDS/OST as appropriate and to properly
3591 * network encode the arg field. */
3592 case FS_IOC_SETVERSION:
3595 case LL_IOC_GROUP_LOCK:
3596 RETURN(ll_get_grouplock(inode, file, arg));
3597 case LL_IOC_GROUP_UNLOCK:
3598 RETURN(ll_put_grouplock(inode, file, arg));
3599 case IOC_OBD_STATFS:
3600 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3602 case LL_IOC_FLUSHCTX:
3603 RETURN(ll_flush_ctx(inode));
/* --- FID / path queries --- */
3604 case LL_IOC_PATH2FID: {
3605 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3606 sizeof(struct lu_fid)))
3611 case LL_IOC_GETPARENT:
3612 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3614 case OBD_IOC_FID2PATH:
3615 RETURN(ll_fid2path(inode, (void __user *)arg));
3616 case LL_IOC_DATA_VERSION: {
3617 struct ioc_data_version idv;
3620 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
/* Mask to the only flags the client honours. */
3623 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3624 rc = ll_ioc_data_version(inode, &idv);
3627 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3633 case LL_IOC_GET_MDTIDX: {
3636 mdtidx = ll_get_mdt_idx(inode);
3640 if (put_user((int)mdtidx, (int __user *)arg))
3645 case OBD_IOC_GETDTNAME:
3646 case OBD_IOC_GETMDNAME:
3647 RETURN(ll_get_obd_name(inode, cmd, arg));
/* --- HSM commands --- */
3648 case LL_IOC_HSM_STATE_GET: {
3649 struct md_op_data *op_data;
3650 struct hsm_user_state *hus;
3657 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3658 LUSTRE_OPC_ANY, hus);
3659 if (IS_ERR(op_data)) {
3661 RETURN(PTR_ERR(op_data));
3664 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3667 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3670 ll_finish_md_op_data(op_data);
3674 case LL_IOC_HSM_STATE_SET: {
3675 struct hsm_state_set *hss;
3682 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3687 rc = ll_hsm_state_set(inode, hss);
3692 case LL_IOC_HSM_ACTION: {
3693 struct md_op_data *op_data;
3694 struct hsm_current_action *hca;
3701 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3702 LUSTRE_OPC_ANY, hca);
3703 if (IS_ERR(op_data)) {
3705 RETURN(PTR_ERR(op_data));
3708 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3711 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3714 ll_finish_md_op_data(op_data);
/* --- lease commands --- */
3718 case LL_IOC_SET_LEASE_OLD: {
3719 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3721 RETURN(ll_file_set_lease(file, &ioc, 0));
3723 case LL_IOC_SET_LEASE: {
3724 struct ll_ioc_lease ioc;
3726 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3729 RETURN(ll_file_set_lease(file, &ioc, arg));
3731 case LL_IOC_GET_LEASE: {
3732 struct ll_inode_info *lli = ll_i2info(inode);
3733 struct ldlm_lock *lock = NULL;
3736 mutex_lock(&lli->lli_och_mutex);
3737 if (fd->fd_lease_och != NULL) {
3738 struct obd_client_handle *och = fd->fd_lease_och;
3740 lock = ldlm_handle2lock(&och->och_lease_handle);
3742 lock_res_and_lock(lock);
/* A cancelled lock means the lease is gone; report no lease. */
3743 if (!ldlm_is_cancel(lock))
3744 fmode = och->och_flags;
3746 unlock_res_and_lock(lock);
3747 LDLM_LOCK_PUT(lock);
3750 mutex_unlock(&lli->lli_och_mutex);
3752 RETURN(ll_lease_type_from_fmode(fmode));
3754 case LL_IOC_HSM_IMPORT: {
3755 struct hsm_user_import *hui;
3761 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3766 rc = ll_hsm_import(inode, file, hui);
3771 case LL_IOC_FUTIMES_3: {
3772 struct ll_futimes_3 lfu;
3774 if (copy_from_user(&lfu,
3775 (const struct ll_futimes_3 __user *)arg,
3779 RETURN(ll_file_futimes_3(file, &lfu));
/* --- ladvise: variable-length advice array --- */
3781 case LL_IOC_LADVISE: {
3782 struct llapi_ladvise_hdr *k_ladvise_hdr;
3783 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3786 int alloc_size = sizeof(*k_ladvise_hdr);
3789 u_ladvise_hdr = (void __user *)arg;
3790 OBD_ALLOC_PTR(k_ladvise_hdr);
3791 if (k_ladvise_hdr == NULL)
/* First copy: header only, to learn lah_count. */
3794 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3795 GOTO(out_ladvise, rc = -EFAULT);
3797 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3798 k_ladvise_hdr->lah_count < 1)
3799 GOTO(out_ladvise, rc = -EINVAL);
3801 num_advise = k_ladvise_hdr->lah_count;
3802 if (num_advise >= LAH_COUNT_MAX)
3803 GOTO(out_ladvise, rc = -EFBIG);
/* Reallocate at full size and copy header + advice array. */
3805 OBD_FREE_PTR(k_ladvise_hdr);
3806 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3807 lah_advise[num_advise]);
3808 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3809 if (k_ladvise_hdr == NULL)
3813 * TODO: submit multiple advices to one server in a single RPC
3815 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3816 GOTO(out_ladvise, rc = -EFAULT);
3818 for (i = 0; i < num_advise; i++) {
3819 struct llapi_lu_ladvise *k_ladvise =
3820 &k_ladvise_hdr->lah_advise[i];
3821 struct llapi_lu_ladvise __user *u_ladvise =
3822 &u_ladvise_hdr->lah_advise[i];
3824 rc = ll_ladvise_sanity(inode, k_ladvise);
3826 GOTO(out_ladvise, rc);
3828 switch (k_ladvise->lla_advice) {
3829 case LU_LADVISE_LOCKNOEXPAND:
3830 rc = ll_lock_noexpand(file,
3831 k_ladvise->lla_peradvice_flags);
3832 GOTO(out_ladvise, rc);
3833 case LU_LADVISE_LOCKAHEAD:
3835 rc = ll_file_lock_ahead(file, k_ladvise);
3838 GOTO(out_ladvise, rc);
/* Write the per-advice lockahead result back to userspace. */
3841 &u_ladvise->lla_lockahead_result))
3842 GOTO(out_ladvise, rc = -EFAULT);
3845 rc = ll_ladvise(inode, file,
3846 k_ladvise_hdr->lah_flags,
3849 GOTO(out_ladvise, rc);
3856 OBD_FREE(k_ladvise_hdr, alloc_size);
3859 case LL_IOC_FLR_SET_MIRROR: {
3860 /* mirror I/O must be direct to avoid polluting page cache
3862 if (!(file->f_flags & O_DIRECT))
3865 fd->fd_designated_mirror = (__u32)arg;
3868 case LL_IOC_FSGETXATTR:
3869 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3870 case LL_IOC_FSSETXATTR:
3871 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3873 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
/* --- file heat --- */
3874 case LL_IOC_HEAT_GET: {
3875 struct lu_heat uheat;
3876 struct lu_heat *heat;
3879 if (copy_from_user(&uheat, (void __user *)arg, sizeof(uheat)))
3882 if (uheat.lh_count > OBD_HEAT_COUNT)
3883 uheat.lh_count = OBD_HEAT_COUNT;
3885 size = offsetof(typeof(uheat), lh_heat[uheat.lh_count]);
3886 OBD_ALLOC(heat, size);
3890 heat->lh_count = uheat.lh_count;
3891 ll_heat_get(inode, heat);
3892 rc = copy_to_user((char __user *)arg, heat, size);
3893 OBD_FREE(heat, size);
3894 RETURN(rc ? -EFAULT : 0);
3896 case LL_IOC_HEAT_SET: {
3899 if (copy_from_user(&flags, (void __user *)arg, sizeof(flags)))
3902 rc = ll_heat_set(inode, flags);
/* --- persistent client cache (PCC) --- */
3905 case LL_IOC_PCC_DETACH: {
3906 struct lu_pcc_detach *detach;
3908 OBD_ALLOC_PTR(detach);
3912 if (copy_from_user(detach,
3913 (const struct lu_pcc_detach __user *)arg,
3915 GOTO(out_detach_free, rc = -EFAULT);
3917 if (!S_ISREG(inode->i_mode))
3918 GOTO(out_detach_free, rc = -EINVAL);
3920 if (!inode_owner_or_capable(inode))
3921 GOTO(out_detach_free, rc = -EPERM);
3923 rc = pcc_ioctl_detach(inode, detach->pccd_opt);
3925 OBD_FREE_PTR(detach);
3928 case LL_IOC_PCC_STATE: {
3929 struct lu_pcc_state __user *ustate =
3930 (struct lu_pcc_state __user *)arg;
3931 struct lu_pcc_state *state;
3933 OBD_ALLOC_PTR(state);
3937 if (copy_from_user(state, ustate, sizeof(*state)))
3938 GOTO(out_state, rc = -EFAULT);
3940 rc = pcc_ioctl_state(file, inode, state);
3942 GOTO(out_state, rc);
3944 if (copy_to_user(ustate, state, sizeof(*state)))
3945 GOTO(out_state, rc = -EFAULT);
3948 OBD_FREE_PTR(state);
/* default: pass unknown commands to the data (OST) export. */
3952 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3953 (void __user *)arg));
3957 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * llseek_execute(): fallback helper (when the kernel lacks
 * generic_file_llseek_size) — validate @offset against sign/maxsize
 * rules and commit it to file->f_pos, resetting f_version.  (The
 * -EINVAL and offset return lines are elided from this listing.)
 */
3958 static inline loff_t
3959 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3961 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3963 if (offset > maxsize)
3966 if (offset != file->f_pos) {
3967 file->f_pos = offset;
3968 file->f_version = 0;
/*
 * generic_file_llseek_size(): local copy of the kernel helper for
 * pre-HAVE_FILE_LLSEEK_SIZE kernels — handle SEEK_CUR relative moves
 * plus SEEK_DATA/SEEK_HOLE against @eof, bounded by @maxsize.
 * (The switch statement, SEEK_END handling and several case bodies are
 * elided from this listing.)
 */
3974 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3975 loff_t maxsize, loff_t eof)
3977 struct inode *inode = file_inode(file);
3985 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3986 * position-querying operation. Avoid rewriting the "same"
3987 * f_pos value back to the file because a concurrent read(),
3988 * write() or lseek() might have altered it
3993 * f_lock protects against read/modify/write race with other
3994 * SEEK_CURs. Note that parallel writes and reads behave
3998 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3999 inode_unlock(inode);
4003 * In the generic case the entire file is data, so as long as
4004 * offset isn't at the end of the file then the offset is data.
4011 * There is a virtual hole at the end of the file, so as long as
4012 * offset isn't i_size or larger, return i_size.
4020 return llseek_execute(file, offset, maxsize);
/*
 * llseek() handler for Lustre files. For SEEK_END/SEEK_HOLE/SEEK_DATA
 * the cluster-wide size must be fetched first via ll_glimpse_size(),
 * then the generic llseek-size helper is applied with Lustre's
 * per-file maximum byte limit.
 */
4024 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
4026 struct inode *inode = file_inode(file);
4027 loff_t retval, eof = 0;
4030 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
4031 (origin == SEEK_CUR) ? file->f_pos : 0);
4032 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
4033 PFID(ll_inode2fid(inode)), inode, retval, retval,
4035 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
/* size-dependent origins need an up-to-date size from the OSTs */
4037 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
4038 retval = ll_glimpse_size(inode);
4041 eof = i_size_read(inode);
4044 retval = ll_generic_file_llseek_size(file, offset, origin,
4045 ll_file_maxbytes(inode), eof);
/*
 * flush() handler (called on close(2) of every file descriptor).
 * Collects previously recorded async writeback errors from the inode
 * and the LOV object; reports -EIO once unless the failure was already
 * reported to this descriptor (fd_write_failed).
 */
4049 static int ll_flush(struct file *file, fl_owner_t id)
4051 struct inode *inode = file_inode(file);
4052 struct ll_inode_info *lli = ll_i2info(inode);
4053 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
4056 LASSERT(!S_ISDIR(inode->i_mode));
4058 /* catch async errors that were recorded back when async writeback
4059 * failed for pages in this mapping. */
/* read-and-clear: the pending async error is consumed here */
4060 rc = lli->lli_async_rc;
4061 lli->lli_async_rc = 0;
4062 if (lli->lli_clob != NULL) {
4063 err = lov_read_and_clear_async_rc(lli->lli_clob);
4068 /* The application has been told write failure already.
4069 * Do not report failure again. */
4070 if (fd->fd_write_failed)
4072 return rc ? -EIO : 0;
/*
 * Called to make sure a portion of file has been written out.
 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
 *
 * Return how many pages have been written (fio->fi_nr_written) on
 * success, otherwise a negative errno (invalid @mode, env failure,
 * or the CIT_FSYNC io result).
 */
4076 * Called to make sure a portion of file has been written out.
4077 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
4079 * Return how many pages have been written.
4081 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
4082 enum cl_fsync_mode mode, int ignore_layout)
4086 struct cl_fsync_io *fio;
/* reject unknown fsync modes up front */
4091 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
4092 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
4095 env = cl_env_get(&refcheck);
4097 RETURN(PTR_ERR(env));
4099 io = vvp_env_thread_io(env);
4100 io->ci_obj = ll_i2info(inode)->lli_clob;
4101 io->ci_ignore_layout = ignore_layout;
4103 /* initialize parameters for sync */
4104 fio = &io->u.ci_fsync;
4105 fio->fi_start = start;
4107 fio->fi_fid = ll_inode2fid(inode);
4108 fio->fi_mode = mode;
4109 fio->fi_nr_written = 0;
4111 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
4112 result = cl_io_loop(env, io);
4114 result = io->ci_result;
4116 result = fio->fi_nr_written;
4117 cl_io_fini(env, io);
4118 cl_env_put(env, &refcheck);
/*
 * fsync()/fdatasync() handler: flush local dirty pages, collect pending
 * async writeback errors, sync metadata on the MDT (md_fsync), then for
 * regular files sync cached data — via PCC if the file is cached there,
 * otherwise via cl_sync_file_range(CL_FSYNC_ALL, implied by elided arg).
 * fd_write_failed tracks whether the failure was reported to the caller.
 */
4124 * When dentry is provided (the 'else' case), file_dentry() may be
4125 * null and dentry must be used directly rather than pulled from
4126 * file_dentry() as is done otherwise.
4129 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
4131 struct dentry *dentry = file_dentry(file);
4132 struct inode *inode = dentry->d_inode;
4133 struct ll_inode_info *lli = ll_i2info(inode);
4134 struct ptlrpc_request *req;
4139 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), start %lld, end %lld,"
4141 PFID(ll_inode2fid(inode)), inode, start, end, datasync);
4143 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
4145 /* fsync's caller has already called _fdata{sync,write}, we want
4146 * that IO to finish before calling the osc and mdc sync methods */
4147 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
4150 /* catch async errors that were recorded back when async writeback
4151 * failed for pages in this mapping. */
4152 if (!S_ISDIR(inode->i_mode)) {
4153 err = lli->lli_async_rc;
4154 lli->lli_async_rc = 0;
4157 if (lli->lli_clob != NULL) {
4158 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* metadata sync goes to the MDT regardless of file type */
4164 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
4168 ptlrpc_req_finished(req);
4170 if (S_ISREG(inode->i_mode)) {
4171 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
4174 /* Sync metadata on MDT first, and then sync the cached data
4177 err = pcc_fsync(file, start, end, datasync, &cached);
4179 err = cl_sync_file_range(inode, start, end,
4181 if (rc == 0 && err < 0)
4184 fd->fd_write_failed = true;
4186 fd->fd_write_failed = false;
4189 inode_unlock(inode);
/*
 * flock()/fcntl() byte-range lock handler. Translates a kernel
 * struct file_lock into an LDLM_FLOCK enqueue against the MDT
 * (F_RDLCK->LCK_PR, F_UNLCK->LCK_NL, F_WRLCK->LCK_PW), then mirrors
 * the server's result into the local VFS lock tables. On local-apply
 * failure (rc2) for a non-unlock request, the remote lock is rolled
 * back with an LCK_NL enqueue.
 * NOTE(review): switch labels and several returns are elided in this
 * excerpt — the mode assignments below belong to the elided cases.
 */
4194 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
4196 struct inode *inode = file_inode(file);
4197 struct ll_sb_info *sbi = ll_i2sbi(inode);
4198 struct ldlm_enqueue_info einfo = {
4199 .ei_type = LDLM_FLOCK,
4200 .ei_cb_cp = ldlm_flock_completion_ast,
4201 .ei_cbdata = file_lock,
4203 struct md_op_data *op_data;
4204 struct lustre_handle lockh = { 0 };
4205 union ldlm_policy_data flock = { { 0 } };
4206 int fl_type = file_lock->fl_type;
4212 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
4213 PFID(ll_inode2fid(inode)), file_lock);
4215 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
4217 if (file_lock->fl_flags & FL_FLOCK) {
4218 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
4219 /* flocks are whole-file locks */
4220 flock.l_flock.end = OFFSET_MAX;
4221 /* For flocks owner is determined by the local file desctiptor*/
4222 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
4223 } else if (file_lock->fl_flags & FL_POSIX) {
4224 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
4225 flock.l_flock.start = file_lock->fl_start;
4226 flock.l_flock.end = file_lock->fl_end;
4230 flock.l_flock.pid = file_lock->fl_pid;
4232 /* Somewhat ugly workaround for svc lockd.
4233 * lockd installs custom fl_lmops->lm_compare_owner that checks
4234 * for the fl_owner to be the same (which it always is on local node
4235 * I guess between lockd processes) and then compares pid.
4236 * As such we assign pid to the owner field to make it all work,
4237 * conflict with normal locks is unlikely since pid space and
4238 * pointer space for current->files are not intersecting */
4239 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
4240 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
4244 einfo.ei_mode = LCK_PR;
4247 /* An unlock request may or may not have any relation to
4248 * existing locks so we may not be able to pass a lock handle
4249 * via a normal ldlm_lock_cancel() request. The request may even
4250 * unlock a byte range in the middle of an existing lock. In
4251 * order to process an unlock request we need all of the same
4252 * information that is given with a normal read or write record
4253 * lock request. To avoid creating another ldlm unlock (cancel)
4254 * message we'll treat a LCK_NL flock request as an unlock. */
4255 einfo.ei_mode = LCK_NL;
4258 einfo.ei_mode = LCK_PW;
4261 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
4276 flags = LDLM_FL_BLOCK_NOWAIT;
4282 flags = LDLM_FL_TEST_LOCK;
4285 CERROR("unknown fcntl lock command: %d\n", cmd);
4289 /* Save the old mode so that if the mode in the lock changes we
4290 * can decrement the appropriate reader or writer refcount. */
4291 file_lock->fl_type = einfo.ei_mode;
4293 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4294 LUSTRE_OPC_ANY, NULL);
4295 if (IS_ERR(op_data))
4296 RETURN(PTR_ERR(op_data));
4298 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4299 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4300 flock.l_flock.pid, flags, einfo.ei_mode,
4301 flock.l_flock.start, flock.l_flock.end);
4303 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4306 /* Restore the file lock type if not TEST lock. */
4307 if (!(flags & LDLM_FL_TEST_LOCK))
4308 file_lock->fl_type = fl_type;
4310 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4311 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4312 !(flags & LDLM_FL_TEST_LOCK))
4313 rc2 = locks_lock_file_wait(file, file_lock);
4315 if ((file_lock->fl_flags & FL_FLOCK) &&
4316 (rc == 0 || file_lock->fl_type == F_UNLCK))
4317 rc2 = flock_lock_file_wait(file, file_lock);
4318 if ((file_lock->fl_flags & FL_POSIX) &&
4319 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4320 !(flags & LDLM_FL_TEST_LOCK))
4321 rc2 = posix_lock_file_wait(file, file_lock);
4322 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
/* local apply failed: undo the server-side lock with an unlock */
4324 if (rc2 && file_lock->fl_type != F_UNLCK) {
4325 einfo.ei_mode = LCK_NL;
4326 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4331 ll_finish_md_op_data(op_data);
/*
 * Look up @name (length @namelen) under @parent on the MDT and return
 * its FID in *@fid. If @inode is non-NULL, also instantiate the inode
 * from the getattr reply via ll_prep_inode().
 */
4336 int ll_get_fid_by_name(struct inode *parent, const char *name,
4337 int namelen, struct lu_fid *fid,
4338 struct inode **inode)
4340 struct md_op_data *op_data = NULL;
4341 struct mdt_body *body;
4342 struct ptlrpc_request *req;
4346 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4347 LUSTRE_OPC_ANY, NULL);
4348 if (IS_ERR(op_data))
4349 RETURN(PTR_ERR(op_data));
/* only FID and type are needed from the server */
4351 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4352 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4353 ll_finish_md_op_data(op_data);
4357 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4359 GOTO(out_req, rc = -EFAULT);
4361 *fid = body->mbo_fid1;
4364 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4366 ptlrpc_req_finished(req);
/*
 * Migrate the child @name of @parent to another MDT, as described by
 * @lum. Resolves the child inode (dcache first, then by-name getattr),
 * refuses striped-dir migration against servers without
 * OBD_CONNECT2_DIR_MIGRATE, and forbids migrating the fs root. For
 * regular files a write lease is taken and the data version recorded
 * so the MDT can detect concurrent modification (MDS_CLOSE_MIGRATE).
 * The migration itself is sent as a rename-to-self via md_rename().
 * Retries on -EAGAIN (lease cancelled).
 * NOTE(review): several error-path lines are elided in this excerpt.
 */
4370 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4373 struct dentry *dchild = NULL;
4374 struct inode *child_inode = NULL;
4375 struct md_op_data *op_data;
4376 struct ptlrpc_request *request = NULL;
4377 struct obd_client_handle *och = NULL;
4379 struct mdt_body *body;
4380 __u64 data_version = 0;
4381 size_t namelen = strlen(name);
4382 int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4386 CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4387 PFID(ll_inode2fid(parent)), name,
4388 lum->lum_stripe_offset, lum->lum_stripe_count);
/* normalize byte order before handing the md to the server */
4390 if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4391 lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4392 lustre_swab_lmv_user_md(lum);
4394 /* Get child FID first */
4395 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4398 dchild = d_lookup(file_dentry(file), &qstr);
4400 if (dchild->d_inode)
4401 child_inode = igrab(dchild->d_inode);
4406 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
4415 if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
4416 OBD_CONNECT2_DIR_MIGRATE)) {
4417 if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
4418 ll_dir_striped(child_inode)) {
4419 CERROR("%s: MDT doesn't support stripe directory "
4420 "migration!\n", ll_i2sbi(parent)->ll_fsname);
4421 GOTO(out_iput, rc = -EOPNOTSUPP);
4426 * lfs migrate command needs to be blocked on the client
4427 * by checking the migrate FID against the FID of the
4430 if (child_inode == parent->i_sb->s_root->d_inode)
4431 GOTO(out_iput, rc = -EINVAL);
4433 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4434 child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4435 if (IS_ERR(op_data))
4436 GOTO(out_iput, rc = PTR_ERR(op_data));
4438 inode_lock(child_inode);
4439 op_data->op_fid3 = *ll_inode2fid(child_inode);
4440 if (!fid_is_sane(&op_data->op_fid3)) {
4441 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4442 ll_i2sbi(parent)->ll_fsname, name,
4443 PFID(&op_data->op_fid3));
4444 GOTO(out_unlock, rc = -EINVAL);
4447 op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4448 op_data->op_data = lum;
4449 op_data->op_data_size = lumlen;
4452 if (S_ISREG(child_inode->i_mode)) {
/* write lease + data version let the MDT detect concurrent IO */
4453 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4457 GOTO(out_unlock, rc);
4460 rc = ll_data_version(child_inode, &data_version,
4463 GOTO(out_close, rc);
4465 op_data->op_open_handle = och->och_open_handle;
4466 op_data->op_data_version = data_version;
4467 op_data->op_lease_handle = och->och_lease_handle;
4468 op_data->op_bias |= MDS_CLOSE_MIGRATE;
/* disable replay: migration must not be replayed after recovery */
4470 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4471 och->och_mod->mod_open_req->rq_replay = 0;
4472 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
4475 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4476 name, namelen, &request);
4478 LASSERT(request != NULL);
4479 ll_update_times(request, parent);
4482 if (rc == 0 || rc == -EAGAIN) {
4483 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4484 LASSERT(body != NULL);
4486 /* If the server does release layout lock, then we cleanup
4487 * the client och here, otherwise release it in out_close: */
4488 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4489 obd_mod_put(och->och_mod);
4490 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4492 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
4498 if (request != NULL) {
4499 ptlrpc_req_finished(request);
4503 /* Try again if the lease has cancelled. */
4504 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4509 ll_lease_close(och, child_inode, NULL);
4511 clear_nlink(child_inode);
4513 inode_unlock(child_inode);
4514 ll_finish_md_op_data(op_data);
/*
 * .flock/.lock handler installed for '-o noflock' mounts: rejects all
 * lock requests, warning once per open file (LL_FILE_FLOCK_WARNING)
 * with console output additionally rate-limited by CDEBUG_LIMIT.
 */
4521 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4523 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
4527 * In order to avoid flood of warning messages, only print one message
4528 * for one file. And the entire message rate on the client is limited
4529 * by CDEBUG_LIMIT too.
4531 if (!(fd->fd_flags & LL_FILE_FLOCK_WARNING)) {
4532 fd->fd_flags |= LL_FILE_FLOCK_WARNING;
4533 CDEBUG_LIMIT(D_TTY | D_CONSOLE,
4534 "flock disabled, mount with '-o [local]flock' to enable\r\n");
/*
 * test if some locks matching bits and l_req_mode are acquired
 * - bits can be in different locks
 * - if found clear the common lock bits in *bits
 * - the bits not found, are kept in *bits
 *
 * \param bits [IN/OUT] searched lock bits; matched bits cleared
 * \param l_req_mode [IN] searched lock mode (LCK_MINMODE = any of
 *                        CR/CW/PR/PW)
 * \retval boolean, true iff all bits are found
 */
4540 * test if some locks matching bits and l_req_mode are acquired
4541 * - bits can be in different locks
4542 * - if found clear the common lock bits in *bits
4543 * - the bits not found, are kept in *bits
4545 * \param bits [IN] searched lock bits [IN]
4546 * \param l_req_mode [IN] searched lock mode
4547 * \retval boolean, true iff all bits are found
4549 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4551 struct lustre_handle lockh;
4552 union ldlm_policy_data policy;
4553 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4554 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4563 fid = &ll_i2info(inode)->lli_fid;
4564 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4565 ldlm_lockname[mode]);
/* TEST_LOCK: match without taking references on found locks */
4567 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4568 for (i = 0; i < MDS_INODELOCK_NUMBITS && *bits != 0; i++) {
4569 policy.l_inodebits.bits = *bits & (1 << i);
4570 if (policy.l_inodebits.bits == 0)
4573 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4574 &policy, mode, &lockh)) {
4575 struct ldlm_lock *lock;
4577 lock = ldlm_handle2lock(&lockh);
4580 ~(lock->l_policy_data.l_inodebits.bits);
4581 LDLM_LOCK_PUT(lock);
4583 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match (and take a reference on) a granted MD lock on @inode
 * covering inodebits @bits with mode @mode; on success the handle is
 * returned in *@lockh. Returns the matched ldlm mode (0 if none).
 */
4590 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4591 struct lustre_handle *lockh, __u64 flags,
4592 enum ldlm_mode mode)
4594 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4599 fid = &ll_i2info(inode)->lli_fid;
4600 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4602 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4603 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process a revalidation result. -ENOENT on a plain file/dir is
 * treated as "already unlinked" (handled in the elided branch);
 * striped directories are re-validated instead since a bad stripe can
 * produce a spurious -ENOENT. Other errors are logged (quietly for
 * -EACCES/-EIDRM) and propagated.
 */
4608 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4610 /* Already unlinked. Just update nlink and return success */
4611 if (rc == -ENOENT) {
4613 /* If it is striped directory, and there is bad stripe
4614 * Let's revalidate the dentry again, instead of returning
4616 if (ll_dir_striped(inode))
4619 /* This path cannot be hit for regular files unless in
4620 * case of obscure races, so no need to to validate
4622 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4624 } else if (rc != 0) {
4625 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4626 "%s: revalidate FID "DFID" error: rc = %d\n",
4627 ll_i2sbi(inode)->ll_fsname,
4628 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate @dentry's inode against the MDT using an intent lock
 * (@op is e.g. IT_GETATTR or IT_LOOKUP). Getattr is done by FID, so
 * no name is passed. If the file turns out to be unlinked, the dentry
 * is invalidated so later lookups do not pick it up.
 */
4634 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4636 struct inode *inode = dentry->d_inode;
4637 struct obd_export *exp = ll_i2mdexp(inode);
4638 struct lookup_intent oit = {
4641 struct ptlrpc_request *req = NULL;
4642 struct md_op_data *op_data;
4646 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4647 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4649 /* Call getattr by fid, so do not provide name at all. */
4650 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4651 LUSTRE_OPC_ANY, NULL);
4652 if (IS_ERR(op_data))
4653 RETURN(PTR_ERR(op_data));
4655 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4656 ll_finish_md_op_data(op_data);
4658 rc = ll_inode_revalidate_fini(inode, rc);
4662 rc = ll_revalidate_it_finish(req, &oit, dentry);
4664 ll_intent_release(&oit);
4668 /* Unlinked? Unhash dentry, so it is not picked up later by
4669 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4670 * here to preserve get_cwd functionality on 2.6.
4672 if (!dentry->d_inode->i_nlink) {
4673 spin_lock(&inode->i_lock);
4674 d_lustre_invalidate(dentry, 0);
4675 spin_unlock(&inode->i_lock);
4678 ll_lookup_finish_locks(&oit, dentry);
4680 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge the attributes of all stripes (via
 * md_merge_attr under lli_lsm_sem) into the master inode: nlink,
 * blocks, size, and the cached a/m/ctime. No-op for non-striped dirs.
 */
4685 static int ll_merge_md_attr(struct inode *inode)
4687 struct ll_inode_info *lli = ll_i2info(inode);
4688 struct cl_attr attr = { 0 };
4691 LASSERT(lli->lli_lsm_md != NULL);
4693 if (!lmv_dir_striped(lli->lli_lsm_md))
4696 down_read(&lli->lli_lsm_sem);
4697 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4698 &attr, ll_md_blocking_ast);
4699 up_read(&lli->lli_lsm_sem);
4703 set_nlink(inode, attr.cat_nlink);
4704 inode->i_blocks = attr.cat_blocks;
4705 i_size_write(inode, attr.cat_size);
4707 ll_i2info(inode)->lli_atime = attr.cat_atime;
4708 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4709 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/*
 * Core getattr: revalidate the inode via IT_GETATTR, refresh size —
 * from PCC if cached there, otherwise by glimpsing the OSTs (skipped
 * while a HSM restore is running, since the MDT then holds the
 * authoritative size) — merge stripe attrs for striped dirs, then
 * fill *stat, using 32-bit-safe ino/dev encodings when the caller
 * needs the 32-bit API.
 */
4714 int ll_getattr_dentry(struct dentry *de, struct kstat *stat)
4716 struct inode *inode = de->d_inode;
4717 struct ll_sb_info *sbi = ll_i2sbi(inode);
4718 struct ll_inode_info *lli = ll_i2info(inode);
4721 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4723 rc = ll_inode_revalidate(de, IT_GETATTR);
4727 if (S_ISREG(inode->i_mode)) {
4730 rc = pcc_inode_getattr(inode, &cached);
4731 if (cached && rc < 0)
4734 /* In case of restore, the MDT has the right size and has
4735 * already send it back without granting the layout lock,
4736 * inode is up-to-date so glimpse is useless.
4737 * Also to glimpse we need the layout, in case of a running
4738 * restore the MDT holds the layout lock so the glimpse will
4739 * block up to the end of restore (getattr will block)
4741 if (!cached && !ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4742 rc = ll_glimpse_size(inode);
4747 /* If object isn't regular a file then don't validate size. */
4748 if (ll_dir_striped(inode)) {
4749 rc = ll_merge_md_attr(inode);
4754 inode->i_atime.tv_sec = lli->lli_atime;
4755 inode->i_mtime.tv_sec = lli->lli_mtime;
4756 inode->i_ctime.tv_sec = lli->lli_ctime;
/* fault-injection hook for getattr delay testing */
4759 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4761 if (ll_need_32bit_api(sbi)) {
4762 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4763 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4764 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4766 stat->ino = inode->i_ino;
4767 stat->dev = inode->i_sb->s_dev;
4768 stat->rdev = inode->i_rdev;
4771 stat->mode = inode->i_mode;
4772 stat->uid = inode->i_uid;
4773 stat->gid = inode->i_gid;
4774 stat->atime = inode->i_atime;
4775 stat->mtime = inode->i_mtime;
4776 stat->ctime = inode->i_ctime;
4777 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4779 stat->nlink = inode->i_nlink;
4780 stat->size = i_size_read(inode);
4781 stat->blocks = inode->i_blocks;
/*
 * .getattr entry point: two signatures are supported depending on the
 * kernel (path-based enhanced getattr vs. vfsmount/dentry); both
 * delegate to ll_getattr_dentry().
 */
4786 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4787 int ll_getattr(const struct path *path, struct kstat *stat,
4788 u32 request_mask, unsigned int flags)
4790 struct dentry *de = path->dentry;
4792 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4795 return ll_getattr_dentry(de, stat);
/*
 * .fiemap handler: marshal the kernel's fiemap_extent_info into a
 * Lustre struct fiemap (header + extent array in one large alloc),
 * run ll_do_fiemap(), and copy the mapped extents back to userspace.
 * NOTE(review): extent_count overflow checking is not visible in this
 * excerpt — confirm the elided lines validate fi_extents_max.
 */
4798 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4799 __u64 start, __u64 len)
4803 struct fiemap *fiemap;
4804 unsigned int extent_count = fieinfo->fi_extents_max;
4806 num_bytes = sizeof(*fiemap) + (extent_count *
4807 sizeof(struct fiemap_extent));
4808 OBD_ALLOC_LARGE(fiemap, num_bytes);
4813 fiemap->fm_flags = fieinfo->fi_flags;
4814 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4815 fiemap->fm_start = start;
4816 fiemap->fm_length = len;
/* only the first user extent is copied in: it may carry input state */
4817 if (extent_count > 0 &&
4818 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4819 sizeof(struct fiemap_extent)) != 0)
4820 GOTO(out, rc = -EFAULT);
4822 rc = ll_do_fiemap(inode, fiemap, num_bytes);
4824 fieinfo->fi_flags = fiemap->fm_flags;
4825 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4826 if (extent_count > 0 &&
4827 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4828 fiemap->fm_mapped_extents *
4829 sizeof(struct fiemap_extent)) != 0)
4830 GOTO(out, rc = -EFAULT);
4832 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * .get_acl handler: return a referenced copy of the cached POSIX ACL
 * stored on the inode (lli_posix_acl), taken under lli_lock. The VFS
 * releases the reference after its permission check.
 */
4836 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4838 struct ll_inode_info *lli = ll_i2info(inode);
4839 struct posix_acl *acl = NULL;
4842 spin_lock(&lli->lli_lock);
4843 /* VFS' acl_permission_check->check_acl will release the refcount */
4844 acl = posix_acl_dup(lli->lli_posix_acl);
4845 spin_unlock(&lli->lli_lock);
/*
 * .set_acl handler (kernels with iop->set_acl and CONFIG_FS_POSIX_ACL).
 * Serializes @acl to its xattr representation and stores it on the MDT
 * via md_setxattr (OBD_MD_FLXATTR to set, OBD_MD_FLXATTRRM to remove
 * when @acl is NULL); ACL_TYPE_ACCESS also updates the file mode via
 * posix_acl_update_mode(), and the local ACL cache is refreshed on
 * success.
 */
4850 #ifdef HAVE_IOP_SET_ACL
4851 #ifdef CONFIG_FS_POSIX_ACL
4852 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4854 struct ll_sb_info *sbi = ll_i2sbi(inode);
4855 struct ptlrpc_request *req = NULL;
4856 const char *name = NULL;
4858 size_t value_size = 0;
4863 case ACL_TYPE_ACCESS:
4864 name = XATTR_NAME_POSIX_ACL_ACCESS;
4866 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4869 case ACL_TYPE_DEFAULT:
4870 name = XATTR_NAME_POSIX_ACL_DEFAULT;
/* default ACLs only make sense on directories */
4871 if (!S_ISDIR(inode->i_mode))
4872 rc = acl ? -EACCES : 0;
4883 value_size = posix_acl_xattr_size(acl->a_count);
4884 value = kmalloc(value_size, GFP_NOFS);
4886 GOTO(out, rc = -ENOMEM);
4888 rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
4890 GOTO(out_value, rc);
4893 rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4894 value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
4895 name, value, value_size, 0, 0, &req);
4897 ptlrpc_req_finished(req);
4902 forget_cached_acl(inode, type);
4904 set_cached_acl(inode, type, acl);
4907 #endif /* CONFIG_FS_POSIX_ACL */
4908 #endif /* HAVE_IOP_SET_ACL */
/*
 * .permission handler. Revalidates the root inode before checking (it
 * is not validated during lookup), applies root squashing — when the
 * caller is root and squashing is enabled, fsuid/fsgid are overridden
 * to rsi_uid/rsi_gid and FS capabilities dropped via override_creds()
 * — and then defers to generic_permission(). Credentials are restored
 * afterwards.
 */
4910 int ll_inode_permission(struct inode *inode, int mask)
4913 struct ll_sb_info *sbi;
4914 struct root_squash_info *squash;
4915 struct cred *cred = NULL;
4916 const struct cred *old_cred = NULL;
4918 bool squash_id = false;
/* RCU-walk mode: cannot block here, let VFS retry in ref-walk */
4921 if (mask & MAY_NOT_BLOCK)
4924 /* as root inode are NOT getting validated in lookup operation,
4925 * need to do it before permission check. */
4927 if (inode == inode->i_sb->s_root->d_inode) {
4928 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4933 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4934 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4936 /* squash fsuid/fsgid if needed */
4937 sbi = ll_i2sbi(inode);
4938 squash = &sbi->ll_squash;
4939 if (unlikely(squash->rsi_uid != 0 &&
4940 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4941 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4945 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4946 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4947 squash->rsi_uid, squash->rsi_gid);
4949 /* update current process's credentials
4950 * and FS capability */
4951 cred = prepare_creds();
4955 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4956 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
4957 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4958 if ((1 << cap) & CFS_CAP_FS_MASK)
4959 cap_lower(cred->cap_effective, cap);
4961 old_cred = override_creds(cred);
4964 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4965 rc = generic_permission(inode, mask);
4966 /* restore current process's credentials and FS capability */
4968 revert_creds(old_cred);
/* -o localflock - only provides locally consistent flock locks */
/*
 * Default file_operations (no cluster-wide flock): read/write entry
 * points depend on kernel support for read_iter/write_iter; no .flock
 * or .lock members, so locking falls back to VFS-local semantics.
 */
4976 struct file_operations ll_file_operations = {
4977 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4978 # ifdef HAVE_SYNC_READ_WRITE
4979 .read = new_sync_read,
4980 .write = new_sync_write,
4982 .read_iter = ll_file_read_iter,
4983 .write_iter = ll_file_write_iter,
4984 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4985 .read = ll_file_read,
4986 .aio_read = ll_file_aio_read,
4987 .write = ll_file_write,
4988 .aio_write = ll_file_aio_write,
4989 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4990 .unlocked_ioctl = ll_file_ioctl,
4991 .open = ll_file_open,
4992 .release = ll_file_release,
4993 .mmap = ll_file_mmap,
4994 .llseek = ll_file_seek,
4995 .splice_read = ll_file_splice_read,
/*
 * file_operations for '-o flock' mounts: identical to the default set
 * but routes both .flock and .lock through ll_file_flock() for
 * cluster-coherent locking via the MDT.
 */
5000 struct file_operations ll_file_operations_flock = {
5001 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
5002 # ifdef HAVE_SYNC_READ_WRITE
5003 .read = new_sync_read,
5004 .write = new_sync_write,
5005 # endif /* HAVE_SYNC_READ_WRITE */
5006 .read_iter = ll_file_read_iter,
5007 .write_iter = ll_file_write_iter,
5008 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5009 .read = ll_file_read,
5010 .aio_read = ll_file_aio_read,
5011 .write = ll_file_write,
5012 .aio_write = ll_file_aio_write,
5013 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5014 .unlocked_ioctl = ll_file_ioctl,
5015 .open = ll_file_open,
5016 .release = ll_file_release,
5017 .mmap = ll_file_mmap,
5018 .llseek = ll_file_seek,
5019 .splice_read = ll_file_splice_read,
5022 .flock = ll_file_flock,
5023 .lock = ll_file_flock
/* These are for -o noflock - to return ENOSYS on flock calls */
/*
 * file_operations for '-o noflock' mounts: same as the default set but
 * .flock/.lock point at ll_file_noflock(), which rejects every lock
 * request with a one-time warning.
 */
5027 struct file_operations ll_file_operations_noflock = {
5028 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
5029 # ifdef HAVE_SYNC_READ_WRITE
5030 .read = new_sync_read,
5031 .write = new_sync_write,
5032 # endif /* HAVE_SYNC_READ_WRITE */
5033 .read_iter = ll_file_read_iter,
5034 .write_iter = ll_file_write_iter,
5035 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5036 .read = ll_file_read,
5037 .aio_read = ll_file_aio_read,
5038 .write = ll_file_write,
5039 .aio_write = ll_file_aio_write,
5040 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5041 .unlocked_ioctl = ll_file_ioctl,
5042 .open = ll_file_open,
5043 .release = ll_file_release,
5044 .mmap = ll_file_mmap,
5045 .llseek = ll_file_seek,
5046 .splice_read = ll_file_splice_read,
5049 .flock = ll_file_noflock,
5050 .lock = ll_file_noflock
/*
 * inode_operations for regular Lustre files; xattr and ACL members are
 * conditional on kernel support (HAVE_IOP_XATTR / HAVE_IOP_GET_ACL /
 * HAVE_IOP_SET_ACL).
 */
5053 struct inode_operations ll_file_inode_operations = {
5054 .setattr = ll_setattr,
5055 .getattr = ll_getattr,
5056 .permission = ll_inode_permission,
5057 #ifdef HAVE_IOP_XATTR
5058 .setxattr = ll_setxattr,
5059 .getxattr = ll_getxattr,
5060 .removexattr = ll_removexattr,
5062 .listxattr = ll_listxattr,
5063 .fiemap = ll_fiemap,
5064 #ifdef HAVE_IOP_GET_ACL
5065 .get_acl = ll_get_acl,
5067 #ifdef HAVE_IOP_SET_ACL
5068 .set_acl = ll_set_acl,
/*
 * Push a layout configuration into the cl_object stack (cl_conf_set).
 * For OBJECT_CONF_SET the layout comes from a held layout DLM lock:
 * the lock is allowed to match only *after* the layout is applied, and
 * the cached layout generation is updated from the new layout.
 */
5072 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
5074 struct ll_inode_info *lli = ll_i2info(inode);
5075 struct cl_object *obj = lli->lli_clob;
5084 env = cl_env_get(&refcheck);
5086 RETURN(PTR_ERR(env));
5088 rc = cl_conf_set(env, lli->lli_clob, conf);
5092 if (conf->coc_opc == OBJECT_CONF_SET) {
5093 struct ldlm_lock *lock = conf->coc_lock;
5094 struct cl_layout cl = {
5098 LASSERT(lock != NULL);
5099 LASSERT(ldlm_has_layout(lock));
5101 /* it can only be allowed to match after layout is
5102 * applied to inode otherwise false layout would be
5103 * seen. Applying layout shoud happen before dropping
5104 * the intent lock. */
5105 ldlm_lock_allow_match(lock);
5107 rc = cl_object_layout_get(env, obj, &cl);
5112 DFID": layout version change: %u -> %u\n",
5113 PFID(&lli->lli_fid), ll_layout_version_get(lli),
5115 ll_layout_version_set(lli, cl.cl_layout_gen);
5119 cl_env_put(env, &refcheck);
/* Fetch layout from MDT with getxattr request, if it's not ready yet */
/*
 * If @lock was granted via completion AST its LVB does not hold the
 * layout, so fetch the LOV EA with md_getxattr() and install it as the
 * lock's LVB (under the resource lock, only if still unset). An empty
 * layout is a success with no LVB. The lvbdata copy is freed if another
 * thread installed the LVB first (elided 'else' path).
 */
5125 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
5128 struct ll_sb_info *sbi = ll_i2sbi(inode);
5129 struct ptlrpc_request *req;
5136 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
5137 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
5138 lock->l_lvb_data, lock->l_lvb_len);
/* layout already attached to the lock: nothing to fetch */
5140 if (lock->l_lvb_data != NULL)
5143 /* if layout lock was granted right away, the layout is returned
5144 * within DLM_LVB of dlm reply; otherwise if the lock was ever
5145 * blocked and then granted via completion ast, we have to fetch
5146 * layout here. Please note that we can't use the LVB buffer in
5147 * completion AST because it doesn't have a large enough buffer */
5148 rc = ll_get_default_mdsize(sbi, &lmmsize);
5152 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), OBD_MD_FLXATTR,
5153 XATTR_NAME_LOV, lmmsize, &req);
5156 GOTO(out, rc = 0); /* empty layout */
5163 if (lmmsize == 0) /* empty layout */
5166 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
5168 GOTO(out, rc = -EFAULT);
5170 OBD_ALLOC_LARGE(lvbdata, lmmsize);
5171 if (lvbdata == NULL)
5172 GOTO(out, rc = -ENOMEM);
5174 memcpy(lvbdata, lmm, lmmsize);
5175 lock_res_and_lock(lock);
5176 if (unlikely(lock->l_lvb_data == NULL)) {
5177 lock->l_lvb_type = LVB_T_LAYOUT;
5178 lock->l_lvb_data = lvbdata;
5179 lock->l_lvb_len = lmmsize;
5182 unlock_res_and_lock(lock);
5185 OBD_FREE_LARGE(lvbdata, lmmsize);
5190 ptlrpc_req_finished(req);
/*
 * Apply the layout to the inode. Layout lock is held and will be
 * released on return. If the lock's LVB is not ready, fetch the layout
 * first (ll_layout_fetch), then configure the cl_object with
 * OBJECT_CONF_SET. If configuration returns -EBUSY the layout is still
 * in use; after dropping the lock, wait for outstanding IO with
 * OBJECT_CONF_WAIT. The lvb_ready check is racy by design — worst case
 * is redundant configuration by concurrent threads.
 */
5195 * Apply the layout to the inode. Layout lock is held and will be released
5198 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
5199 struct inode *inode)
5201 struct ll_inode_info *lli = ll_i2info(inode);
5202 struct ll_sb_info *sbi = ll_i2sbi(inode);
5203 struct ldlm_lock *lock;
5204 struct cl_object_conf conf;
5207 bool wait_layout = false;
5210 LASSERT(lustre_handle_is_used(lockh));
5212 lock = ldlm_handle2lock(lockh);
5213 LASSERT(lock != NULL);
5214 LASSERT(ldlm_has_layout(lock));
5216 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
5217 PFID(&lli->lli_fid), inode);
5219 /* in case this is a caching lock and reinstate with new inode */
5220 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
5222 lock_res_and_lock(lock);
5223 lvb_ready = ldlm_is_lvb_ready(lock);
5224 unlock_res_and_lock(lock);
5226 /* checking lvb_ready is racy but this is okay. The worst case is
5227 * that multi processes may configure the file on the same time. */
5231 rc = ll_layout_fetch(inode, lock);
5235 /* for layout lock, lmm is stored in lock's lvb.
5236 * lvb_data is immutable if the lock is held so it's safe to access it
5239 * set layout to file. Unlikely this will fail as old layout was
5240 * surely eliminated */
5241 memset(&conf, 0, sizeof conf);
5242 conf.coc_opc = OBJECT_CONF_SET;
5243 conf.coc_inode = inode;
5244 conf.coc_lock = lock;
5245 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
5246 conf.u.coc_layout.lb_len = lock->l_lvb_len;
5247 rc = ll_layout_conf(inode, &conf);
5249 /* refresh layout failed, need to wait */
5250 wait_layout = rc == -EBUSY;
5253 LDLM_LOCK_PUT(lock);
5254 ldlm_lock_decref(lockh, mode);
5256 /* wait for IO to complete if it's still being used. */
5258 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
5259 sbi->ll_fsname, PFID(&lli->lli_fid), inode);
5261 memset(&conf, 0, sizeof conf);
5262 conf.coc_opc = OBJECT_CONF_WAIT;
5263 conf.coc_inode = inode;
5264 rc = ll_layout_conf(inode, &conf);
5268 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
5269 sbi->ll_fsname, PFID(&lli->lli_fid), rc);
/*
 * Issue layout intent RPC to MDS.
 * \param inode  [in] file inode
 * \param intent [in] layout intent (access/write/trunc — write-class
 *                    intents set FMODE_WRITE on the lookup intent)
 *
 * \retval 0 on success
 * \retval < 0 error code
 */
5275 * Issue layout intent RPC to MDS.
5276 * \param inode [in] file inode
5277 * \param intent [in] layout intent
5279 * \retval 0 on success
5280 * \retval < 0 error code
5282 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
5284 struct ll_inode_info *lli = ll_i2info(inode);
5285 struct ll_sb_info *sbi = ll_i2sbi(inode);
5286 struct md_op_data *op_data;
5287 struct lookup_intent it;
5288 struct ptlrpc_request *req;
5292 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
5293 0, 0, LUSTRE_OPC_ANY, NULL);
5294 if (IS_ERR(op_data))
5295 RETURN(PTR_ERR(op_data));
/* the intent itself travels as opaque op_data payload */
5297 op_data->op_data = intent;
5298 op_data->op_data_size = sizeof(*intent);
5300 memset(&it, 0, sizeof(it));
5301 it.it_op = IT_LAYOUT;
5302 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
5303 intent->li_opc == LAYOUT_INTENT_TRUNC)
5304 it.it_flags = FMODE_WRITE;
5306 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
5307 sbi->ll_fsname, PFID(&lli->lli_fid), inode);
5309 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
5310 &ll_md_blocking_ast, 0);
5311 if (it.it_request != NULL)
5312 ptlrpc_req_finished(it.it_request);
5313 it.it_request = NULL;
5315 ll_finish_md_op_data(op_data);
5317 /* set lock data in case this is a new lock */
5319 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
5321 ll_intent_drop_lock(&it);
5327 * This function checks if there exists a LAYOUT lock on the client side,
5328 * or enqueues it if it doesn't have one in cache.
5330 * This function will not hold layout lock so it may be revoked any time after
5331 * this function returns. Any operations that depend on the layout should be redone
5334 * This function should be called before lov_io_init() to get an uptodate
5335 * layout version, the caller should save the version number and after IO
5336 * is finished, this function should be called again to verify that layout
5337 * is not changed during IO time.
5339 int ll_layout_refresh(struct inode *inode, __u32 *gen)
	/* Ensure the client holds an up-to-date layout for @inode and return
	 * the current layout generation in *gen.  A cached layout lock is
	 * reused when available; otherwise a layout intent RPC is issued via
	 * ll_layout_intent().  Returns 0 on success, negative errno on error.
	 * NOTE(review): some intermediate lines (error paths, retry label)
	 * are not visible in this hunk; comments cover only the code shown. */
5341 	struct ll_inode_info *lli = ll_i2info(inode);
5342 	struct ll_sb_info *sbi = ll_i2sbi(inode);
5343 	struct lustre_handle lockh;
5344 	struct layout_intent intent = {
		/* plain access intent is sufficient for a refresh */
5345 		.li_opc = LAYOUT_INTENT_ACCESS,
5347 	enum ldlm_mode mode;
5351 	*gen = ll_layout_version_get(lli);
	/* Nothing to do if layout locks are disabled for this mount, or a
	 * valid layout generation is already cached. */
5352 	if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
	/* Layout locks apply to regular files with sane FIDs only. */
5356 	LASSERT(fid_is_sane(ll_inode2fid(inode)));
5357 	LASSERT(S_ISREG(inode->i_mode));
5359 	/* take layout lock mutex to enqueue layout lock exclusively. */
5360 	mutex_lock(&lli->lli_layout_mutex);
5363 	/* mostly layout lock is caching on the local side, so try to
5364 	 * match it before grabbing layout lock mutex. */
	/* NOTE(review): the mutex is already held here; the comment above
	 * appears stale relative to the visible ordering — confirm upstream. */
5365 	mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5366 			       LCK_CR | LCK_CW | LCK_PR |
5368 	if (mode != 0) { /* hit cached lock */
5369 		rc = ll_layout_lock_set(&lockh, mode, inode);
	/* No usable cached lock: ask the MDS through a layout intent RPC. */
5375 	rc = ll_layout_intent(inode, &intent);
	/* Re-read the generation under the mutex so the caller observes the
	 * value matching the layout just (re)configured. */
5381 	*gen = ll_layout_version_get(lli);
5382 	mutex_unlock(&lli->lli_layout_mutex);
5388 * Issue layout intent RPC indicating where in a file an IO is about to write.
5390 * \param[in] inode file inode.
5391 * \param[in] ext	write range with start offset of file in bytes where
5392 * an IO is about to write, and exclusive end offset in
5395 * \retval 0 on success
5396 * \retval < 0 error code
5398 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5399 			   struct lu_extent *ext)
	/* Build a layout intent for the extent [ext->e_start, ext->e_end)
	 * (end is exclusive, per the function's doc comment) with opcode
	 * @opc, and forward it to the MDS via ll_layout_intent().
	 * NOTE(review): the initializer lines for .li_opc and the local rc
	 * declaration are not visible in this hunk. */
5401 	struct layout_intent intent = {
5403 	.li_extent.e_start = ext->e_start,
5404 	.li_extent.e_end = ext->e_end,
5409 	rc = ll_layout_intent(inode, &intent);
5415 * This function send a restore request to the MDT
5417 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5419 struct hsm_user_request *hur;
5423 len = sizeof(struct hsm_user_request) +
5424 sizeof(struct hsm_user_item);
5425 OBD_ALLOC(hur, len);
5429 hur->hur_request.hr_action = HUA_RESTORE;
5430 hur->hur_request.hr_archive_id = 0;
5431 hur->hur_request.hr_flags = 0;
5432 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5433 sizeof(hur->hur_user_item[0].hui_fid));
5434 hur->hur_user_item[0].hui_extent.offset = offset;
5435 hur->hur_user_item[0].hui_extent.length = length;
5436 hur->hur_request.hr_itemcount = 1;
5437 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,