4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
49 #include <uapi/linux/lustre/lustre_ioctl.h>
50 #include <lustre_swab.h>
52 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
57 struct inode *sp_inode;
62 __u64 pa_data_version;
68 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
70 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
/*
 * Allocate per-file-descriptor client state (struct ll_file_data) from the
 * dedicated slab and perform initial setup.
 * NOTE(review): this extract elides lines (numbering gaps) — the
 * allocation-failure check and the return of @fd are presumably among the
 * missing lines; confirm against the full file.
 */
73 static struct ll_file_data *ll_file_data_get(void)
75 struct ll_file_data *fd;
/* GFP_NOFS: avoid re-entering the filesystem during reclaim */
77 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
81 fd->fd_write_failed = false;
/* initialize the PCC (persistent client cache) per-file state */
82 pcc_file_init(&fd->fd_pcc_file);
/* Release a ll_file_data previously allocated by ll_file_data_get(). */
87 static void ll_file_data_put(struct ll_file_data *fd)
90 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
94 * Packs all the attributes into @op_data for the CLOSE rpc.
/*
 * Snapshots the inode's current mode, timestamps, size, blocks and flags
 * into @op_data, and records the open handle from @och so the MDT can
 * identify which open is being closed.
 */
96 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
97 struct obd_client_handle *och)
101 ll_prep_md_op_data(op_data, inode, NULL, NULL,
102 0, 0, LUSTRE_OPC_ANY, NULL);
104 op_data->op_attr.ia_mode = inode->i_mode;
105 op_data->op_attr.ia_atime = inode->i_atime;
106 op_data->op_attr.ia_mtime = inode->i_mtime;
107 op_data->op_attr.ia_ctime = inode->i_ctime;
/* i_size_read() gives a torn-free size on 32-bit SMP */
108 op_data->op_attr.ia_size = i_size_read(inode);
109 op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
110 ATTR_MTIME | ATTR_MTIME_SET |
112 op_data->op_xvalid |= OP_XVALID_CTIME_SET;
113 op_data->op_attr_blocks = inode->i_blocks;
/* translate VFS inode flags to the on-wire (ext-style) flag format */
114 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
115 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
116 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
117 op_data->op_open_handle = och->och_open_handle;
119 if (och->och_flags & FMODE_WRITE &&
120 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
121 /* For HSM: if inode data has been modified, pack it so that
122 * MDT can set data dirty flag in the archive. */
123 op_data->op_bias |= MDS_DATA_MODIFIED;
129 * Perform a close, possibly with a bias.
130 * The meaning of "data" depends on the value of "bias".
132 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
133 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
/*
 * Sends the MDS_CLOSE RPC for open handle @och on @inode, packing extra
 * intent data according to @bias (layout merge/split/swap, resync-done,
 * PCC attach, HSM release, or plain close).
 * NOTE(review): the extract elides lines (switch statement head, braces,
 * RETURN/out labels); comments below describe only the visible logic.
 */
136 static int ll_close_inode_openhandle(struct inode *inode,
137 struct obd_client_handle *och,
138 enum mds_op_bias bias, void *data)
140 struct obd_export *md_exp = ll_i2mdexp(inode);
141 const struct ll_inode_info *lli = ll_i2info(inode);
142 struct md_op_data *op_data;
143 struct ptlrpc_request *req = NULL;
/* sanity: the MDC export must still be connected to an OBD device */
147 if (class_exp2obd(md_exp) == NULL) {
148 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
149 ll_i2sbi(inode)->ll_fsname, PFID(&lli->lli_fid));
153 OBD_ALLOC_PTR(op_data);
154 /* We leak openhandle and request here on error, but not much to be
155 * done in OOM case since app won't retry close on error either. */
157 GOTO(out, rc = -ENOMEM);
159 ll_prepare_close(inode, op_data, och);
161 case MDS_CLOSE_LAYOUT_MERGE:
162 /* merge blocks from the victim inode */
163 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
164 op_data->op_attr.ia_valid |= ATTR_SIZE;
165 op_data->op_xvalid |= OP_XVALID_BLOCKS;
/* fallthrough into SWAP handling is implied by the case layout;
 * the elided lines should be checked to confirm */
167 case MDS_CLOSE_LAYOUT_SPLIT:
168 case MDS_CLOSE_LAYOUT_SWAP: {
169 struct split_param *sp = data;
171 LASSERT(data != NULL);
172 op_data->op_bias |= bias;
173 op_data->op_data_version = 0;
174 op_data->op_lease_handle = och->och_lease_handle;
175 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
/* for SPLIT, @data is a struct split_param */
176 op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
177 op_data->op_mirror_id = sp->sp_mirror_id;
/* otherwise @data is the peer inode itself */
179 op_data->op_fid2 = *ll_inode2fid(data);
184 case MDS_CLOSE_RESYNC_DONE: {
185 struct ll_ioc_lease *ioc = data;
187 LASSERT(data != NULL);
/* scale the block count by the mirror count — presumably an
 * estimate of post-resync usage; confirm against full source */
188 op_data->op_attr_blocks +=
189 ioc->lil_count * op_data->op_attr_blocks;
190 op_data->op_attr.ia_valid |= ATTR_SIZE;
191 op_data->op_xvalid |= OP_XVALID_BLOCKS;
192 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
194 op_data->op_lease_handle = och->och_lease_handle;
195 op_data->op_data = &ioc->lil_ids[0];
196 op_data->op_data_size =
197 ioc->lil_count * sizeof(ioc->lil_ids[0]);
201 case MDS_PCC_ATTACH: {
202 struct pcc_param *param = data;
204 LASSERT(data != NULL);
/* PCC attach rides on an HSM release of the Lustre copy */
205 op_data->op_bias |= MDS_HSM_RELEASE | MDS_PCC_ATTACH;
206 op_data->op_archive_id = param->pa_archive_id;
207 op_data->op_data_version = param->pa_data_version;
208 op_data->op_lease_handle = och->och_lease_handle;
212 case MDS_HSM_RELEASE:
213 LASSERT(data != NULL);
214 op_data->op_bias |= MDS_HSM_RELEASE;
/* @data is the __u64 data version captured before release */
215 op_data->op_data_version = *(__u64 *)data;
216 op_data->op_lease_handle = och->och_lease_handle;
217 op_data->op_attr.ia_valid |= ATTR_SIZE;
218 op_data->op_xvalid |= OP_XVALID_BLOCKS;
/* default case: plain close carries no intent payload */
222 LASSERT(data == NULL);
/* unless an intent above pinned size/blocks, let the MDT treat
 * them lazily (OSTs own the authoritative values) */
226 if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
227 op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
228 if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
229 op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;
231 rc = md_close(md_exp, op_data, och->och_mod, &req);
232 if (rc != 0 && rc != -EINTR)
233 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
234 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* on success, check whether the server actually executed the intent */
236 if (rc == 0 && op_data->op_bias & bias) {
237 struct mdt_body *body;
239 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
240 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
243 if (bias & MDS_PCC_ATTACH) {
244 struct pcc_param *param = data;
/* report the new layout generation back to the caller */
246 param->pa_layout_gen = body->mbo_layout_gen;
250 ll_finish_md_op_data(op_data);
254 md_clear_open_replay_data(md_exp, och);
/* poison the handle so reuse after close is detectable */
255 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
258 ptlrpc_req_finished(req); /* This is close request */
/*
 * Drop one reference on the MDS open handle matching @fmode
 * (write/exec/read) and, when the last user is gone, send the close RPC
 * via ll_close_inode_openhandle().
 * NOTE(review): lines are elided here (e.g. the handle swap under the
 * mutex before the close call); see the full file for the exact sequence.
 */
262 int ll_md_real_close(struct inode *inode, fmode_t fmode)
264 struct ll_inode_info *lli = ll_i2info(inode);
265 struct obd_client_handle **och_p;
266 struct obd_client_handle *och;
/* pick the per-mode handle slot and its use count */
271 if (fmode & FMODE_WRITE) {
272 och_p = &lli->lli_mds_write_och;
273 och_usecount = &lli->lli_open_fd_write_count;
274 } else if (fmode & FMODE_EXEC) {
275 och_p = &lli->lli_mds_exec_och;
276 och_usecount = &lli->lli_open_fd_exec_count;
278 LASSERT(fmode & FMODE_READ);
279 och_p = &lli->lli_mds_read_och;
280 och_usecount = &lli->lli_open_fd_read_count;
283 mutex_lock(&lli->lli_och_mutex);
284 if (*och_usecount > 0) {
285 /* There are still users of this handle, so skip
287 mutex_unlock(&lli->lli_och_mutex);
293 mutex_unlock(&lli->lli_och_mutex);
296 /* There might be a race and this handle may already
298 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/*
 * Per-file-descriptor close: release group lock and lease if held, close
 * any private open handle, drop the per-mode open count, and close the
 * MDS open handle unless a cached OPEN DLM lock lets us skip the RPC.
 * Frees the ll_file_data on the way out.
 */
304 static int ll_md_close(struct inode *inode, struct file *file)
306 union ldlm_policy_data policy = {
307 .l_inodebits = { MDS_INODELOCK_OPEN },
/* TEST_LOCK: only probe for a matching granted lock, don't take a ref */
309 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
310 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
311 struct ll_inode_info *lli = ll_i2info(inode);
312 struct lustre_handle lockh;
313 enum ldlm_mode lockmode;
317 /* clear group lock, if present */
318 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
319 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
321 if (fd->fd_lease_och != NULL) {
324 /* Usually the lease is not released when the
325 * application crashed, we need to release here. */
326 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
327 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
328 PFID(&lli->lli_fid), rc, lease_broken);
330 fd->fd_lease_och = NULL;
/* an fd-private open handle (e.g. taken for a lease) is closed directly */
333 if (fd->fd_och != NULL) {
334 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
339 /* Let's see if we have good enough OPEN lock on the file and if
340 we can skip talking to MDS */
341 mutex_lock(&lli->lli_och_mutex);
/* NOTE(review): the assignments of @lockmode per branch appear to be
 * among the elided lines; only the counters are visible here */
342 if (fd->fd_omode & FMODE_WRITE) {
344 LASSERT(lli->lli_open_fd_write_count);
345 lli->lli_open_fd_write_count--;
346 } else if (fd->fd_omode & FMODE_EXEC) {
348 LASSERT(lli->lli_open_fd_exec_count);
349 lli->lli_open_fd_exec_count--;
352 LASSERT(lli->lli_open_fd_read_count);
353 lli->lli_open_fd_read_count--;
355 mutex_unlock(&lli->lli_och_mutex);
357 /* LU-4398: do not cache write open lock if the file has exec bit */
358 if ((lockmode == LCK_CW && inode->i_mode & S_IXUGO) ||
359 !md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
360 LDLM_IBITS, &policy, lockmode, &lockh))
361 rc = ll_md_real_close(inode, fd->fd_omode);
364 LUSTRE_FPRIVATE(file) = NULL;
365 ll_file_data_put(fd);
370 /* While this returns an error code, fput() the caller does not, so we need
371 * to make every effort to clean up all of our state here. Also, applications
372 * rarely check close errors and even if an error is returned they will not
373 * re-try the close call.
 *
 * VFS ->release() entry point: tally stats, drop statahead authorization,
 * release PCC state, clear async I/O error state, and close the MDS
 * open handle via ll_md_close().  The root directory is special-cased:
 * it only frees the ll_file_data.
 */
375 int ll_file_release(struct inode *inode, struct file *file)
377 struct ll_file_data *fd;
378 struct ll_sb_info *sbi = ll_i2sbi(inode);
379 struct ll_inode_info *lli = ll_i2info(inode);
383 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
384 PFID(ll_inode2fid(inode)), inode);
/* don't count releases of the root dentry in the stats */
386 if (inode->i_sb->s_root != file_dentry(file))
387 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
388 fd = LUSTRE_FPRIVATE(file);
391 /* The last ref on @file, maybe not the the owner pid of statahead,
392 * because parent and child process can share the same file handle. */
393 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
394 ll_deauthorize_statahead(inode, fd);
/* root has no MDS open handle to close — just free fd state */
396 if (inode->i_sb->s_root == file_dentry(file)) {
397 LUSTRE_FPRIVATE(file) = NULL;
398 ll_file_data_put(fd);
402 pcc_file_release(inode, file);
/* propagate any deferred async write errors into lli_async_rc */
404 if (!S_ISDIR(inode->i_mode)) {
405 if (lli->lli_clob != NULL)
406 lov_read_and_clear_async_rc(lli->lli_clob);
407 lli->lli_async_rc = 0;
410 rc = ll_md_close(inode, file);
/* fault-injection hook: optionally dump the debug log on release */
412 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
413 libcfs_debug_dumplog();
/*
 * read_cache_page() filler for Data-on-MDT: copy the inline reply buffer
 * described by @data (a struct niobuf_local) into @page, zero-fill the
 * tail if the data is shorter than a page, and mark the page up to date.
 */
418 static inline int ll_dom_readpage(void *data, struct page *page)
420 struct niobuf_local *lnb = data;
423 kaddr = ll_kmap_atomic(page, KM_USER0);
424 memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
/* partial page: clear the remainder so stale data never leaks */
425 if (lnb->lnb_len < PAGE_SIZE)
426 memset(kaddr + lnb->lnb_len, 0,
427 PAGE_SIZE - lnb->lnb_len);
428 flush_dcache_page(page);
429 SetPageUptodate(page);
430 ll_kunmap_atomic(kaddr, KM_USER0);
/*
 * Data-on-MDT open optimization: if the open reply @req carries inline
 * file data (RMF_NIOBUF_INLINE), validate it against the reported DoM
 * size and pre-populate the page cache so subsequent reads need no RPC.
 * NOTE(review): several lines are elided (e.g. the do-loop head and the
 * early RETURNs after the validation checks); confirm in the full file.
 */
436 void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
437 struct lookup_intent *it)
439 struct ll_inode_info *lli = ll_i2info(inode);
440 struct cl_object *obj = lli->lli_clob;
441 struct address_space *mapping = inode->i_mapping;
443 struct niobuf_remote *rnb;
444 struct mdt_body *body;
446 unsigned long index, start;
447 struct niobuf_local lnb;
454 if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
458 rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
459 if (rnb == NULL || rnb->rnb_len == 0)
462 /* LU-11595: Server may return whole file and that is OK always or
463 * it may return just file tail and its offset must be aligned with
464 * client PAGE_SIZE to be used on that client, if server's PAGE_SIZE is
465 * smaller then offset may be not aligned and that data is just ignored.
467 if (rnb->rnb_offset % PAGE_SIZE)
470 /* Server returns whole file or just file tail if it fills in reply
471 * buffer, in both cases total size should be equal to the file size.
473 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
474 if (rnb->rnb_offset + rnb->rnb_len != body->mbo_dom_size) {
475 CERROR("%s: server returns off/len %llu/%u but size %llu\n",
476 ll_i2sbi(inode)->ll_fsname, rnb->rnb_offset,
477 rnb->rnb_len, body->mbo_dom_size);
481 CDEBUG(D_INFO, "Get data along with open at %llu len %i, size %llu\n",
482 rnb->rnb_offset, rnb->rnb_len, body->mbo_dom_size);
/* inline payload follows the niobuf_remote descriptor in the reply */
484 data = (char *)rnb + sizeof(*rnb);
486 lnb.lnb_file_offset = rnb->rnb_offset;
487 start = lnb.lnb_file_offset / PAGE_SIZE;
489 LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
490 lnb.lnb_page_offset = 0;
/* fill the cache one page at a time from the inline buffer */
492 lnb.lnb_data = data + (index << PAGE_SHIFT);
493 lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
494 if (lnb.lnb_len > PAGE_SIZE)
495 lnb.lnb_len = PAGE_SIZE;
497 vmpage = read_cache_page(mapping, index + start,
498 ll_dom_readpage, &lnb);
/* failure to fill one page is non-fatal — just warn and stop */
499 if (IS_ERR(vmpage)) {
500 CWARN("%s: cannot fill page %lu for "DFID
501 " with data: rc = %li\n",
502 ll_i2sbi(inode)->ll_fsname, index + start,
503 PFID(lu_object_fid(&obj->co_lu)),
509 } while (rnb->rnb_len > (index << PAGE_SHIFT));
/*
 * Send the OPEN intent to the MDS for @de.  The name is packed only when
 * open-by-FID is unavailable (or fault-injected off); otherwise the FID
 * alone identifies the target.  On success the new inode state is set up
 * from the reply and the returned lock (LOOKUP bit revalidates the
 * dentry; a DOM bit triggers inline-data handling).
 * NOTE(review): elided lines include error paths after kmalloc and the
 * rename-race retry; see the full file before relying on exact flow.
 */
513 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
514 struct lookup_intent *itp)
516 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
517 struct dentry *parent = de->d_parent;
520 struct md_op_data *op_data;
521 struct ptlrpc_request *req = NULL;
525 LASSERT(parent != NULL);
526 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
528 /* if server supports open-by-fid, or file name is invalid, don't pack
529 * name in open request */
530 if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_OPEN_BY_NAME) ||
531 !(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID)) {
533 len = de->d_name.len;
534 name = kmalloc(len + 1, GFP_NOFS);
/* copy the name under d_lock; bail if a rename changed its length */
539 spin_lock(&de->d_lock);
540 if (len != de->d_name.len) {
541 spin_unlock(&de->d_lock);
545 memcpy(name, de->d_name.name, len);
547 spin_unlock(&de->d_lock);
549 if (!lu_name_is_valid_2(name, len)) {
555 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
556 name, len, 0, LUSTRE_OPC_ANY, NULL);
557 if (IS_ERR(op_data)) {
559 RETURN(PTR_ERR(op_data));
/* pass caller-provided striping (may be NULL/0) with the open */
561 op_data->op_data = lmm;
562 op_data->op_data_size = lmmsize;
564 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
565 &ll_md_blocking_ast, 0);
567 ll_finish_md_op_data(op_data);
569 /* reason for keep own exit path - don`t flood log
570 * with messages with -ESTALE errors.
572 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
573 it_open_error(DISP_OPEN_OPEN, itp))
575 ll_release_openhandle(de, itp);
579 if (it_disposition(itp, DISP_LOOKUP_NEG))
580 GOTO(out, rc = -ENOENT);
582 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
583 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
584 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
588 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
590 if (!rc && itp->it_lock_mode) {
591 struct lustre_handle handle = {.cookie = itp->it_lock_handle};
592 struct ldlm_lock *lock;
593 bool has_dom_bit = false;
595 /* If we got a lock back and it has a LOOKUP bit set,
596 * make sure the dentry is marked as valid so we can find it.
597 * We don't need to care about actual hashing since other bits
598 * of kernel will deal with that later.
600 lock = ldlm_handle2lock(&handle);
602 has_dom_bit = ldlm_has_dom(lock);
603 if (lock->l_policy_data.l_inodebits.bits &
604 MDS_INODELOCK_LOOKUP)
605 d_lustre_revalidate(de);
609 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
/* Data-on-MDT: consume any inline file data in the open reply */
611 ll_dom_finish_open(de->d_inode, req, itp);
615 ptlrpc_req_finished(req);
616 ll_intent_drop_lock(itp);
618 /* We did open by fid, but by the time we got to the server,
619 * the object disappeared. If this is a create, we cannot really
620 * tell the userspace that the file it was trying to create
621 * does not exist. Instead let's return -ESTALE, and the VFS will
622 * retry the create with LOOKUP_REVAL that we are going to catch
623 * in ll_revalidate_dentry() and use lookup then.
625 if (rc == -ENOENT && itp->it_op & IT_CREAT)
/*
 * Populate an obd_client_handle from the MDT open reply carried by @it
 * (open handle, FID, lease lock handle, flags) and register it for
 * open replay on recovery.
 */
631 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
632 struct obd_client_handle *och)
634 struct mdt_body *body;
636 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
637 och->och_open_handle = body->mbo_open_handle;
638 och->och_fid = body->mbo_fid1;
/* the enqueue lock handle doubles as the lease handle for lease opens */
639 och->och_lease_handle.cookie = it->it_lock_handle;
640 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
641 och->och_flags = it->it_flags;
/* register with MDC so the open can be replayed after MDS restart */
643 return md_set_open_replay_data(md_exp, och, it);
/*
 * Finish the client-local part of an open: optionally fill @och from the
 * intent reply, attach @fd as the file's private data, and initialize
 * readahead and cl_io context state.
 * NOTE(review): @och may be NULL (caller reuses an existing MDS handle);
 * the conditional around ll_och_fill() appears to be elided here.
 */
646 static int ll_local_open(struct file *file, struct lookup_intent *it,
647 struct ll_file_data *fd, struct obd_client_handle *och)
649 struct inode *inode = file_inode(file);
652 LASSERT(!LUSTRE_FPRIVATE(file));
659 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
664 LUSTRE_FPRIVATE(file) = fd;
665 ll_readahead_init(inode, &fd->fd_ras);
/* remember the effective open mode for the matching close */
666 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
668 /* ll_cl_context initialize */
669 rwlock_init(&fd->fd_lock);
670 INIT_LIST_HEAD(&fd->fd_lccs);
675 /* Open a file, and (for the very first open) create objects on the OSTs at
676 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
677 * creation or open until ll_lov_setstripe() ioctl is called.
679 * If we already have the stripe MD locally then we don't request it in
680 * md_open(), by passing a lmm_size = 0.
682 * It is up to the application to ensure no other processes open this file
683 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
684 * used. We might be able to avoid races of that sort by getting lli_open_sem
685 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
686 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 *
 * VFS ->open() entry point.  Either reuses an intent prepared by lookup
 * (file->private_data) or builds a fresh IT_OPEN intent, reuses a cached
 * per-mode MDS open handle when possible, and completes with
 * ll_local_open()/pcc_file_open().
 * NOTE(review): this extract elides many lines (retry loop, several
 * error-path statements); comments cover only the visible logic.
 */
688 int ll_file_open(struct inode *inode, struct file *file)
690 struct ll_inode_info *lli = ll_i2info(inode);
691 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
692 .it_flags = file->f_flags };
693 struct obd_client_handle **och_p = NULL;
694 __u64 *och_usecount = NULL;
695 struct ll_file_data *fd;
699 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
700 PFID(ll_inode2fid(inode)), inode, file->f_flags);
702 it = file->private_data; /* XXX: compat macro */
703 file->private_data = NULL; /* prevent ll_local_open assertion */
705 fd = ll_file_data_get();
707 GOTO(out_nofiledata, rc = -ENOMEM);
710 if (S_ISDIR(inode->i_mode))
711 ll_authorize_statahead(inode, fd);
/* root directory: no MDS open needed, just attach fd state */
713 if (inode->i_sb->s_root == file_dentry(file)) {
714 LUSTRE_FPRIVATE(file) = fd;
/* no usable intent from lookup — build our own open intent */
718 if (!it || !it->it_disposition) {
719 /* Convert f_flags into access mode. We cannot use file->f_mode,
720 * because everything but O_ACCMODE mask was stripped from
/* O_ACCMODE trick: 0/1/2 (RD/WR/RDWR) + 1 gives FMODE_READ|FMODE_WRITE bits */
722 if ((oit.it_flags + 1) & O_ACCMODE)
724 if (file->f_flags & O_TRUNC)
725 oit.it_flags |= FMODE_WRITE;
727 /* kernel only call f_op->open in dentry_open. filp_open calls
728 * dentry_open after call to open_namei that checks permissions.
729 * Only nfsd_open call dentry_open directly without checking
730 * permissions and because of that this code below is safe.
732 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
733 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
735 /* We do not want O_EXCL here, presumably we opened the file
736 * already? XXX - NFS implications? */
737 oit.it_flags &= ~O_EXCL;
739 /* bug20584, if "it_flags" contains O_CREAT, the file will be
740 * created if necessary, then "IT_CREAT" should be set to keep
741 * consistent with it */
742 if (oit.it_flags & O_CREAT)
743 oit.it_op |= IT_CREAT;
749 /* Let's see if we have file open on MDS already. */
750 if (it->it_flags & FMODE_WRITE) {
751 och_p = &lli->lli_mds_write_och;
752 och_usecount = &lli->lli_open_fd_write_count;
753 } else if (it->it_flags & FMODE_EXEC) {
754 och_p = &lli->lli_mds_exec_och;
755 och_usecount = &lli->lli_open_fd_exec_count;
757 och_p = &lli->lli_mds_read_och;
758 och_usecount = &lli->lli_open_fd_read_count;
761 mutex_lock(&lli->lli_och_mutex);
762 if (*och_p) { /* Open handle is present */
763 if (it_disposition(it, DISP_OPEN_OPEN)) {
764 /* Well, there's extra open request that we do not need,
765 let's close it somehow. This will decref request. */
766 rc = it_open_error(DISP_OPEN_OPEN, it);
768 mutex_unlock(&lli->lli_och_mutex);
769 GOTO(out_openerr, rc);
/* the server-side open is surplus — release it */
772 ll_release_openhandle(file_dentry(file), it);
/* reuse the cached handle: local bookkeeping only, no och to fill */
776 rc = ll_local_open(file, it, fd, NULL);
779 mutex_unlock(&lli->lli_och_mutex);
780 GOTO(out_openerr, rc);
783 LASSERT(*och_usecount == 0);
784 if (!it->it_disposition) {
785 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
786 /* We cannot just request lock handle now, new ELC code
787 means that one of other OPEN locks for this file
788 could be cancelled, and since blocking ast handler
789 would attempt to grab och_mutex as well, that would
790 result in a deadlock */
791 mutex_unlock(&lli->lli_och_mutex);
793 * Normally called under two situations:
795 * 2. A race/condition on MDS resulting in no open
796 * handle to be returned from LOOKUP|OPEN request,
797 * for example if the target entry was a symlink.
799 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
800 * marked by a bit set in ll_iget_for_nfs. Clear the
801 * bit so that it's not confusing later callers.
803 * NB; when ldd is NULL, it must have come via normal
804 * lookup path only, since ll_iget_for_nfs always calls
807 if (ldd && ldd->lld_nfs_dentry) {
808 ldd->lld_nfs_dentry = 0;
809 it->it_flags |= MDS_OPEN_LOCK;
813 * Always specify MDS_OPEN_BY_FID because we don't want
814 * to get file with different fid.
816 it->it_flags |= MDS_OPEN_BY_FID;
817 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
820 GOTO(out_openerr, rc);
824 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
826 GOTO(out_och_free, rc = -ENOMEM);
830 /* md_intent_lock() didn't get a request ref if there was an
831 * open error, so don't do cleanup on the request here
833 /* XXX (green): Should not we bail out on any error here, not
834 * just open error? */
835 rc = it_open_error(DISP_OPEN_OPEN, it);
837 GOTO(out_och_free, rc);
839 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
840 "inode %p: disposition %x, status %d\n", inode,
841 it_disposition(it, ~0), it->it_status);
843 rc = ll_local_open(file, it, fd, *och_p);
845 GOTO(out_och_free, rc);
848 rc = pcc_file_open(inode, file);
850 GOTO(out_och_free, rc);
852 mutex_unlock(&lli->lli_och_mutex);
855 /* Must do this outside lli_och_mutex lock to prevent deadlock where
856 different kind of OPEN lock for this same inode gets cancelled
857 by ldlm_cancel_lru */
858 if (!S_ISREG(inode->i_mode))
859 GOTO(out_och_free, rc);
861 cl_lov_delay_create_clear(&file->f_flags);
862 GOTO(out_och_free, rc);
/* error/exit path: free an unused handle slot and undo fd setup */
866 if (och_p && *och_p) {
867 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
868 *och_p = NULL; /* OBD_FREE writes some magic there */
871 mutex_unlock(&lli->lli_och_mutex);
874 if (lli->lli_opendir_key == fd)
875 ll_deauthorize_statahead(inode, fd);
878 ll_file_data_put(fd);
880 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* drop the open-reply reference held by the intent, if any */
884 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
885 ptlrpc_req_finished(it->it_request);
886 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for lease locks: on conflict, asynchronously cancel the
 * lease lock (which effectively breaks the lease).  The CANCELING branch
 * is visible but its body is elided in this extract.
 */
892 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
893 struct ldlm_lock_desc *desc, void *data, int flag)
896 struct lustre_handle lockh;
900 case LDLM_CB_BLOCKING:
901 ldlm_lock2handle(lock, &lockh);
/* LCF_ASYNC: don't block the AST thread waiting for the cancel */
902 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
904 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
908 case LDLM_CB_CANCELING:
916 * When setting a lease on a file, we take ownership of the lli_mds_*_och
917 * and save it as fd->fd_och so as to force client to reopen the file even
918 * if it has an open lock in cache already.
/*
 * Returns the existing open handle's cookie in @old_open_handle so the
 * lease open can prove same-owner to the MDT.  Fails with -EBUSY if a
 * lease is already held on this fd or the handle has other users.
 * NOTE(review): the lines transferring *och_p into fd->fd_och appear to
 * be elided; confirm in the full file.
 */
920 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
921 struct lustre_handle *old_open_handle)
923 struct ll_inode_info *lli = ll_i2info(inode);
924 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
925 struct obd_client_handle **och_p;
930 /* Get the openhandle of the file */
931 mutex_lock(&lli->lli_och_mutex);
932 if (fd->fd_lease_och != NULL)
933 GOTO(out_unlock, rc = -EBUSY);
935 if (fd->fd_och == NULL) {
936 if (file->f_mode & FMODE_WRITE) {
937 LASSERT(lli->lli_mds_write_och != NULL);
938 och_p = &lli->lli_mds_write_och;
939 och_usecount = &lli->lli_open_fd_write_count;
941 LASSERT(lli->lli_mds_read_och != NULL);
942 och_p = &lli->lli_mds_read_och;
943 och_usecount = &lli->lli_open_fd_read_count;
/* cannot take exclusive ownership while others share the handle */
946 if (*och_usecount > 1)
947 GOTO(out_unlock, rc = -EBUSY);
954 *old_open_handle = fd->fd_och->och_open_handle;
958 mutex_unlock(&lli->lli_och_mutex);
963 * Release ownership on lli_mds_*_och when putting back a file lease.
/*
 * Moves fd->fd_och back into the shared per-mode slot, or — if another
 * open re-populated the slot meanwhile — closes the now-redundant
 * handle via ll_close_inode_openhandle().
 */
965 static int ll_lease_och_release(struct inode *inode, struct file *file)
967 struct ll_inode_info *lli = ll_i2info(inode);
968 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
969 struct obd_client_handle **och_p;
970 struct obd_client_handle *old_och = NULL;
975 mutex_lock(&lli->lli_och_mutex);
976 if (file->f_mode & FMODE_WRITE) {
977 och_p = &lli->lli_mds_write_och;
978 och_usecount = &lli->lli_open_fd_write_count;
980 och_p = &lli->lli_mds_read_och;
981 och_usecount = &lli->lli_open_fd_read_count;
984 /* The file may have been open by another process (broken lease) so
985 * *och_p is not NULL. In this case we should simply increase usecount
988 if (*och_p != NULL) {
989 old_och = fd->fd_och;
996 mutex_unlock(&lli->lli_och_mutex);
/* close the displaced handle outside the mutex */
999 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
1005 * Acquire a lease and open the file.
/*
 * Opens @file's inode with MDS_OPEN_LEASE for mode @fmode (exactly
 * FMODE_READ or FMODE_WRITE) and returns a new obd_client_handle whose
 * och_lease_handle carries the lease lock.  Returns ERR_PTR on failure;
 * on intermediate failures the open is closed and/or the intent released.
 * NOTE(review): och allocation and the function's opening brace are
 * among the elided lines.
 */
1007 static struct obd_client_handle *
1008 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
1011 struct lookup_intent it = { .it_op = IT_OPEN };
1012 struct ll_sb_info *sbi = ll_i2sbi(inode);
1013 struct md_op_data *op_data;
1014 struct ptlrpc_request *req = NULL;
1015 struct lustre_handle old_open_handle = { 0 };
1016 struct obd_client_handle *och = NULL;
/* a lease is either read or write, never both, never exec */
1021 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
1022 RETURN(ERR_PTR(-EINVAL));
1025 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
1026 RETURN(ERR_PTR(-EPERM));
1028 rc = ll_lease_och_acquire(inode, file, &old_open_handle);
1030 RETURN(ERR_PTR(rc));
1035 RETURN(ERR_PTR(-ENOMEM));
1037 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
1038 LUSTRE_OPC_ANY, NULL);
1039 if (IS_ERR(op_data))
1040 GOTO(out, rc = PTR_ERR(op_data));
1042 /* To tell the MDT this openhandle is from the same owner */
1043 op_data->op_open_handle = old_open_handle;
1045 it.it_flags = fmode | open_flags;
1046 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
1047 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
1048 &ll_md_blocking_lease_ast,
1049 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
1050 * it can be cancelled which may mislead applications that the lease is
1052 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
1053 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
1054 * doesn't deal with openhandle, so normal openhandle will be leaked. */
1055 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
1056 ll_finish_md_op_data(op_data);
1057 ptlrpc_req_finished(req);
1059 GOTO(out_release_it, rc);
1061 if (it_disposition(&it, DISP_LOOKUP_NEG))
1062 GOTO(out_release_it, rc = -ENOENT);
1064 rc = it_open_error(DISP_OPEN_OPEN, &it);
1066 GOTO(out_release_it, rc);
1068 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
1069 rc = ll_och_fill(sbi->ll_md_exp, &it, och);
1071 GOTO(out_release_it, rc);
/* server too old to grant leases — undo the open */
1073 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
1074 GOTO(out_close, rc = -EOPNOTSUPP);
1076 /* already get lease, handle lease lock */
1077 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
1078 if (it.it_lock_mode == 0 ||
1079 it.it_lock_bits != MDS_INODELOCK_OPEN) {
1080 /* open lock must return for lease */
1081 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
1082 PFID(ll_inode2fid(inode)), it.it_lock_mode,
1084 GOTO(out_close, rc = -EPROTO);
1087 ll_intent_release(&it);
1091 /* Cancel open lock */
1092 if (it.it_lock_mode != 0) {
1093 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
1095 it.it_lock_mode = 0;
1096 och->och_lease_handle.cookie = 0ULL;
1098 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
1100 CERROR("%s: error closing file "DFID": %d\n",
1101 sbi->ll_fsname, PFID(&ll_i2info(inode)->lli_fid), rc2);
1102 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
1104 ll_intent_release(&it);
1108 RETURN(ERR_PTR(rc));
1112 * Check whether a layout swap can be done between two inodes.
1114 * \param[in] inode1 First inode to check
1115 * \param[in] inode2 Second inode to check
1117 * \retval 0 on success, layout swap can be performed between both inodes
1118 * \retval negative error code if requirements are not met
/* Requirements visible here: both regular files, caller has write
 * permission on both, and both live on the same superblock. */
1120 static int ll_check_swap_layouts_validity(struct inode *inode1,
1121 struct inode *inode2)
1123 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
1126 if (inode_permission(inode1, MAY_WRITE) ||
1127 inode_permission(inode2, MAY_WRITE))
1130 if (inode1->i_sb != inode2->i_sb)
/*
 * Close @och on @inode with MDS_CLOSE_LAYOUT_SWAP bias so the MDT swaps
 * layouts between @inode and @inode2 atomically with the close.
 * Rejects the swap if validity checks fail or both FIDs are identical.
 */
1136 static int ll_swap_layouts_close(struct obd_client_handle *och,
1137 struct inode *inode, struct inode *inode2)
1139 const struct lu_fid *fid1 = ll_inode2fid(inode);
1140 const struct lu_fid *fid2;
1144 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
1145 ll_i2sbi(inode)->ll_fsname, PFID(fid1));
1147 rc = ll_check_swap_layouts_validity(inode, inode2);
1149 GOTO(out_free_och, rc);
1151 /* We now know that inode2 is a lustre inode */
1152 fid2 = ll_inode2fid(inode2);
/* swapping a file's layout with itself is meaningless */
1154 rc = lu_fid_cmp(fid1, fid2);
1156 GOTO(out_free_och, rc = -EINVAL);
1158 /* Close the file and {swap,merge} layouts between inode & inode2.
1159 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
1160 * because we still need it to pack l_remote_handle to MDT. */
1161 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
1164 och = NULL; /* freed in ll_close_inode_openhandle() */
1174 * Release lease and close the file.
1175 * It will check if the lease has ever broken.
/*
 * Reports broken-ness through @lease_broken.  If the lease lock was
 * already cancelled (broken), any close intent in @bias is skipped;
 * otherwise the lease lock is cancelled (for a plain close) and the
 * open handle is closed with @bias/@data.
 */
1177 static int ll_lease_close_intent(struct obd_client_handle *och,
1178 struct inode *inode,
1179 bool *lease_broken, enum mds_op_bias bias,
1182 struct ldlm_lock *lock;
/* default to "broken" when the lock handle no longer resolves */
1183 bool cancelled = true;
1187 lock = ldlm_handle2lock(&och->och_lease_handle);
1189 lock_res_and_lock(lock);
1190 cancelled = ldlm_is_cancel(lock);
1191 unlock_res_and_lock(lock);
1192 LDLM_LOCK_PUT(lock);
1195 CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1196 PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1198 if (lease_broken != NULL)
1199 *lease_broken = cancelled;
/* intact lease + plain close: cancel the lease lock ourselves */
1201 if (!cancelled && !bias)
1202 ldlm_cli_cancel(&och->och_lease_handle, 0);
1204 if (cancelled) { /* no need to excute intent */
1209 rc = ll_close_inode_openhandle(inode, och, bias, data);
/* Plain lease release: ll_lease_close_intent() with no bias/intent data. */
1213 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1216 return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1220 * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
/*
 * Starts mirror resync for @inode: copies the ll_ioc_lease_id ioctl
 * argument from userspace (@arg), flushes dirty pages so a layout
 * version bump cannot strand cached writes, then issues the resync
 * request tagged with the lease handle and requested mirror id.
 */
1222 static int ll_lease_file_resync(struct obd_client_handle *och,
1223 struct inode *inode, unsigned long arg)
1225 struct ll_sb_info *sbi = ll_i2sbi(inode);
1226 struct md_op_data *op_data;
1227 struct ll_ioc_lease_id ioc;
1228 __u64 data_version_unused;
1232 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1233 LUSTRE_OPC_ANY, NULL);
1234 if (IS_ERR(op_data))
1235 RETURN(PTR_ERR(op_data));
1237 if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg,
1241 /* before starting file resync, it's necessary to clean up page cache
1242 * in client memory, otherwise once the layout version is increased,
1243 * writing back cached data will be denied the OSTs. */
1244 rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1248 op_data->op_lease_handle = och->och_lease_handle;
1249 op_data->op_mirror_id = ioc.lil_mirror_id;
1250 rc = md_file_resync(sbi->ll_md_exp, op_data);
1256 ll_finish_md_op_data(op_data);
/* Merge size/blocks and timestamps obtained from the OSTs (via the cl_object
 * attributes) into the VFS inode, under the inode size lock. */
1260 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1262 struct ll_inode_info *lli = ll_i2info(inode);
1263 struct cl_object *obj = lli->lli_clob;
1264 struct cl_attr *attr = vvp_env_thread_attr(env);
1272 ll_inode_size_lock(inode);
1274 /* Merge timestamps the most recently obtained from MDS with
1275 * timestamps obtained from OSTs.
1277 * Do not overwrite atime of inode because it may be refreshed
1278 * by file_accessed() function. If the read was served by cache
1279 * data, there is no RPC to be sent so that atime may not be
1280 * transferred to OSTs at all. MDT only updates atime at close time
1281 * if it's at least 'mdd.*.atime_diff' older.
1282 * All in all, the atime in Lustre does not strictly comply with
1283 * POSIX. Solving this problem needs to send an RPC to MDT for each
1284 * read, this will hurt performance.
1286 if (inode->i_atime.tv_sec < lli->lli_atime ||
1287 lli->lli_update_atime) {
1288 inode->i_atime.tv_sec = lli->lli_atime;
1289 lli->lli_update_atime = 0;
1291 inode->i_mtime.tv_sec = lli->lli_mtime;
1292 inode->i_ctime.tv_sec = lli->lli_ctime;
/* snapshot the MDS-derived times so they can be compared against OST times */
1294 mtime = inode->i_mtime.tv_sec;
1295 atime = inode->i_atime.tv_sec;
1296 ctime = inode->i_ctime.tv_sec;
1298 cl_object_attr_lock(obj);
1299 if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1302 rc = cl_object_attr_get(env, obj, attr);
1303 cl_object_attr_unlock(obj);
/* -ENODATA is mapped to success here; other errors abort the merge */
1306 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
/* keep the most recent of the MDS and OST timestamps */
1308 if (atime < attr->cat_atime)
1309 atime = attr->cat_atime;
1311 if (ctime < attr->cat_ctime)
1312 ctime = attr->cat_ctime;
1314 if (mtime < attr->cat_mtime)
1315 mtime = attr->cat_mtime;
1317 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1318 PFID(&lli->lli_fid), attr->cat_size);
1320 i_size_write(inode, attr->cat_size);
1321 inode->i_blocks = attr->cat_blocks;
1323 inode->i_mtime.tv_sec = mtime;
1324 inode->i_atime.tv_sec = atime;
1325 inode->i_ctime.tv_sec = ctime;
1328 ll_inode_size_unlock(inode);
1334 * Set designated mirror for I/O.
1336 * So far only read, write, and truncate can issue I/O to a
1337 * designated mirror.
1339 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1341 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1343 /* clear layout version for generic(non-resync) I/O in case it carries
1344 * stale layout version due to I/O restart */
1345 io->ci_layout_version = 0;
1347 /* FLR: disable non-delay for designated mirror I/O because obviously
1348 * only one mirror is available */
1349 if (fd->fd_designated_mirror > 0) {
/* resync I/O: pin both the mirror id and layout version carried by the fd */
1351 io->ci_designated_mirror = fd->fd_designated_mirror;
1352 io->ci_layout_version = fd->fd_layout_version;
1355 CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n",
1356 file->f_path.dentry->d_name.name, io->ci_designated_mirror);
/* Return true when atime updates should be skipped for this file; mirrors
 * the checks the kernel performs in file_accessed()/touch_atime(). */
1359 static bool file_is_noatime(const struct file *file)
1361 const struct vfsmount *mnt = file->f_path.mnt;
1362 const struct inode *inode = file_inode((struct file *)file);
1364 /* Adapted from file_accessed() and touch_atime().*/
1365 if (file->f_flags & O_NOATIME)
1368 if (inode->i_flags & S_NOATIME)
1371 if (IS_NOATIME(inode))
1374 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1377 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1380 if ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode))
/* Initialize a cl_io from the open flags and per-fd state of 'file' before
 * the generic I/O path runs it. */
1386 void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot,
1387 struct vvp_io_args *args)
1389 struct inode *inode = file_inode(file);
1390 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1392 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1393 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1395 if (iot == CIT_WRITE) {
1396 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1397 io->u.ci_wr.wr_sync = !!(file->f_flags & O_SYNC ||
1398 file->f_flags & O_DIRECT ||
1400 #ifdef HAVE_GENERIC_WRITE_SYNC_2ARGS
1401 io->u.ci_wr.wr_sync |= !!(args &&
1402 args->via_io_subtype == IO_NORMAL &&
1403 args->u.normal.via_iocb->ki_flags & IOCB_DSYNC);
1407 io->ci_obj = ll_i2info(inode)->lli_clob;
1408 io->ci_lockreq = CILR_MAYBE;
1409 if (ll_file_nolock(file)) {
1410 io->ci_lockreq = CILR_NEVER;
1411 io->ci_no_srvlock = 1;
1412 } else if (file->f_flags & O_APPEND) {
1413 io->ci_lockreq = CILR_MANDATORY;
1415 io->ci_noatime = file_is_noatime(file);
1416 io->ci_async_readahead = false;
1418 /* FLR: only use non-delay I/O for read as there is only one
1419 * available mirror for write. */
1420 io->ci_ndelay = !(iot == CIT_WRITE);
1422 ll_io_set_mirror(io, file);
/* Record one I/O sample plus its byte count into the inode's file-heat
 * instances, unless file heat is disabled for this fs or this inode. */
1425 static void ll_heat_add(struct inode *inode, enum cl_io_type iot,
1428 struct ll_inode_info *lli = ll_i2info(inode);
1429 struct ll_sb_info *sbi = ll_i2sbi(inode);
1430 enum obd_heat_type sample_type;
1431 enum obd_heat_type iobyte_type;
1432 __u64 now = ktime_get_real_seconds();
1434 if (!ll_sbi_has_file_heat(sbi) ||
1435 lli->lli_heat_flags & LU_HEAT_FLAG_OFF)
1438 if (iot == CIT_READ) {
1439 sample_type = OBD_HEAT_READSAMPLE;
1440 iobyte_type = OBD_HEAT_READBYTE;
1441 } else if (iot == CIT_WRITE) {
1442 sample_type = OBD_HEAT_WRITESAMPLE;
1443 iobyte_type = OBD_HEAT_WRITEBYTE;
/* both heat instances decay with the same fs-wide weight and period */
1448 spin_lock(&lli->lli_heat_lock);
1449 obd_heat_add(&lli->lli_heat_instances[sample_type], now, 1,
1450 sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1451 obd_heat_add(&lli->lli_heat_instances[iobyte_type], now, count,
1452 sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1453 spin_unlock(&lli->lli_heat_lock);
/* Common read/write engine: build a cl_io for [*ppos, *ppos + count), take
 * the per-inode range lock where required, run the CLIO loop, and handle
 * FLR-driven restarts.  Returns bytes transferred or a negative errno. */
1457 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1458 struct file *file, enum cl_io_type iot,
1459 loff_t *ppos, size_t count)
1461 struct vvp_io *vio = vvp_env_io(env);
1462 struct inode *inode = file_inode(file);
1463 struct ll_inode_info *lli = ll_i2info(inode);
1464 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1465 struct range_lock range;
1469 unsigned retried = 0;
1470 bool restarted = false;
1474 CDEBUG(D_VFSTRACE, "%s: %s ppos: %llu, count: %zu\n",
1475 file_dentry(file)->d_name.name,
1476 iot == CIT_READ ? "read" : "write", *ppos, count);
1479 io = vvp_env_thread_io(env);
1480 ll_io_init(io, file, iot, args);
1481 io->ci_ndelay_tried = retried;
1483 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1484 bool range_locked = false;
/* O_APPEND may write anywhere, so its range lock covers the whole file */
1486 if (file->f_flags & O_APPEND)
1487 range_lock_init(&range, 0, LUSTRE_EOF);
1489 range_lock_init(&range, *ppos, *ppos + count - 1);
1491 vio->vui_fd = LUSTRE_FPRIVATE(file);
1492 vio->vui_io_subtype = args->via_io_subtype;
1494 switch (vio->vui_io_subtype) {
1496 vio->vui_iter = args->u.normal.via_iter;
1497 vio->vui_iocb = args->u.normal.via_iocb;
1498 /* Direct IO reads must also take range lock,
1499 * or multiple reads will try to work on the same pages
1500 * See LU-6227 for details. */
1501 if (((iot == CIT_WRITE) ||
1502 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1503 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1504 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1506 rc = range_lock(&lli->lli_write_tree, &range);
1510 range_locked = true;
1514 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1515 vio->u.splice.vui_flags = args->u.splice.via_flags;
1518 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1522 ll_cl_add(file, env, io, LCC_RW);
1523 rc = cl_io_loop(env, io);
1524 ll_cl_remove(file, env);
1527 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1529 range_unlock(&lli->lli_write_tree, &range);
1532 /* cl_io_rw_init() handled IO */
/* accumulate whatever this pass completed; a restart resumes from here */
1536 if (io->ci_nob > 0) {
1537 result += io->ci_nob;
1538 count -= io->ci_nob;
1539 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1541 /* prepare IO restart */
1542 if (count > 0 && args->via_io_subtype == IO_NORMAL)
1543 args->u.normal.via_iter = vio->vui_iter;
1546 cl_io_fini(env, io);
1549 "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1550 file->f_path.dentry->d_name.name,
1551 iot, rc, result, io->ci_need_restart);
/* restart only when the IO asked for it and there is still work left */
1553 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1555 "%s: restart %s from %lld, count: %zu, ret: %zd, rc: %d\n",
1556 file_dentry(file)->d_name.name,
1557 iot == CIT_READ ? "read" : "write",
1558 *ppos, count, result, rc);
1559 /* preserve the tried count for FLR */
1560 retried = io->ci_ndelay_tried;
1565 if (iot == CIT_READ) {
1567 ll_stats_ops_tally(ll_i2sbi(inode),
1568 LPROC_LL_READ_BYTES, result);
1569 } else if (iot == CIT_WRITE) {
1571 ll_stats_ops_tally(ll_i2sbi(inode),
1572 LPROC_LL_WRITE_BYTES, result);
1573 fd->fd_write_failed = false;
1574 } else if (result == 0 && rc == 0) {
1577 fd->fd_write_failed = true;
1579 fd->fd_write_failed = false;
1580 } else if (rc != -ERESTARTSYS) {
1581 fd->fd_write_failed = true;
1585 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1587 ll_heat_add(inode, iot, result);
1589 RETURN(result > 0 ? result : rc);
1593 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1594 * especially for small I/O.
1596 * To serve a read request, CLIO has to create and initialize a cl_io and
1597 * then request DLM lock. This has turned out to have significant overhead
1598 * and affects the performance of small I/O dramatically.
1600 * It's not necessary to create a cl_io for each I/O. Under the help of read
1601 * ahead, most of the pages being read are already in memory cache and we can
1602 * read those pages directly because if the pages exist, the corresponding DLM
1603 * lock must exist so that page content must be valid.
1605 * In fast read implementation, the llite speculatively finds and reads pages
1606 * in memory cache. There are three scenarios for fast read:
1607 * - If the page exists and is uptodate, kernel VM will provide the data and
1608 * CLIO won't be intervened;
1609 * - If the page was brought into memory by read ahead, it will be exported
1610 * and read ahead parameters will be updated;
1611 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1612 * it will go back and invoke normal read, i.e., a cl_io will be created
1613 * and DLM lock will be requested.
1615 * POSIX compliance: posix standard states that read is intended to be atomic.
1616 * Lustre read implementation is in line with Linux kernel read implementation
1617 * and neither of them complies with POSIX standard in this matter. Fast read
1618 * doesn't make the situation worse on single node but it may interleave write
1619 * results from multiple nodes due to short read handling in ll_file_aio_read().
1621 * \param env - lu_env
1622 * \param iocb - kiocb from kernel
1623 * \param iter - user space buffers where the data will be copied
1625 * \retval - number of bytes that have been read, or error code if error occurred.
1628 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1632 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1635 /* NB: we can't do direct IO for fast read because it will need a lock
1636 * to make IO engine happy. */
1637 if (iocb->ki_filp->f_flags & O_DIRECT)
1640 result = generic_file_read_iter(iocb, iter);
1642 /* If the first page is not in cache, generic_file_aio_read() will be
1643 * returned with -ENODATA.
1644 * See corresponding code in ll_readpage(). */
1645 if (result == -ENODATA)
/* account the bytes served straight from page cache */
1649 ll_heat_add(file_inode(iocb->ki_filp), CIT_READ, result);
1650 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1651 LPROC_LL_READ_BYTES, result);
1658 * Read from a file (through the page cache).
1660 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1663 struct vvp_io_args *args;
1664 struct file *file = iocb->ki_filp;
1670 if (!iov_iter_count(to))
1674 * Currently when PCC read failed, we do not fall back to the
1675 * normal read path, just return the error.
1676 * The reason is that: for RW-PCC, the file data may be modified
1677 * in the PCC and inconsistent with the data on OSTs (or file
1678 * data has been removed from the Lustre file system), at this
1679 * time, fallback to the normal read path may read the wrong
1681 * TODO: for RO-PCC (readonly PCC), fall back to normal read
1682 * path: read data from data copy on OSTs.
1684 result = pcc_file_read_iter(iocb, to, &cached);
/* fast read: try to serve from page cache without building a cl_io */
1690 result = ll_do_fast_read(iocb, to);
1691 if (result < 0 || iov_iter_count(to) == 0)
1694 env = cl_env_get(&refcheck);
1696 return PTR_ERR(env);
/* slow path: full CLIO read for whatever fast read did not cover */
1698 args = ll_env_args(env, IO_NORMAL);
1699 args->u.normal.via_iter = to;
1700 args->u.normal.via_iocb = iocb;
1702 rc2 = ll_file_io_generic(env, args, file, CIT_READ,
1703 &iocb->ki_pos, iov_iter_count(to));
1706 else if (result == 0)
1709 cl_env_put(env, &refcheck);
1712 ll_rw_stats_tally(ll_i2sbi(file_inode(file)), current->pid,
1713 LUSTRE_FPRIVATE(file), iocb->ki_pos, result,
1720 * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1721 * If a page is already in the page cache and dirty (and some other things -
1722 * See ll_tiny_write_begin for the instantiation of these rules), then we can
1723 * write to it without doing a full I/O, because Lustre already knows about it
1724 * and will write it out. This saves a lot of processing time.
1726 * All writes here are within one page, so exclusion is handled by the page
1727 * lock on the vm page. We do not do tiny writes for writes which touch
1728 * multiple pages because it's very unlikely multiple sequential pages are
1729 * already dirty.
1731 * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1732 * and are unlikely to be to already dirty pages.
1734 * Attribute updates are important here, we do them in ll_tiny_write_end.
1736 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1738 ssize_t count = iov_iter_count(iter);
1739 struct file *file = iocb->ki_filp;
1740 struct inode *inode = file_inode(file);
1741 bool lock_inode = !IS_NOSEC(inode);
1746 /* Restrict writes to single page and < PAGE_SIZE. See comment at top
1747 * of function for why.
1749 if (count >= PAGE_SIZE ||
1750 (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
/* hold the inode lock around the generic write unless IS_NOSEC (above) */
1753 if (unlikely(lock_inode))
1755 result = __generic_file_write_iter(iocb, iter);
1757 if (unlikely(lock_inode))
1758 inode_unlock(inode);
1760 /* If the page is not already dirty, ll_tiny_write_begin returns
1761 * -ENODATA. We continue on to normal write.
1763 if (result == -ENODATA)
1767 ll_heat_add(inode, CIT_WRITE, result);
1768 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1770 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1773 CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1779 * Write to a file (through the page cache).
1781 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1783 struct vvp_io_args *args;
1785 ssize_t rc_tiny = 0, rc_normal;
1786 struct file *file = iocb->ki_filp;
1793 if (!iov_iter_count(from))
1794 GOTO(out, rc_normal = 0);
1797 * When PCC write failed, we usually do not fall back to the normal
1798 * write path, just return the error. But there is a special case when
1799 * returned error code is -ENOSPC due to running out of space on PCC HSM
1800 * backend. At this time, it will fall back to normal I/O path and
1801 * retry the I/O. As the file is in HSM released state, it will restore
1802 * the file data to OSTs first and redo the write again. And the
1803 * restore process will revoke the layout lock and detach the file
1804 * from PCC cache automatically.
1806 result = pcc_file_write_iter(iocb, from, &cached);
1807 if (cached && result != -ENOSPC && result != -EDQUOT)
1810 /* NB: we can't do direct IO for tiny writes because they use the page
1811 * cache, we can't do sync writes because tiny writes can't flush
1812 * pages, and we can't do append writes because we can't guarantee the
1813 * required DLM locks are held to protect file size.
1815 if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(file))) &&
1816 !(file->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1817 rc_tiny = ll_do_tiny_write(iocb, from);
1819 /* In case of error, go on and try normal write - Only stop if tiny
1820 * write completed I/O.
1822 if (iov_iter_count(from) == 0)
1823 GOTO(out, rc_normal = rc_tiny);
1825 env = cl_env_get(&refcheck);
1827 return PTR_ERR(env);
1829 args = ll_env_args(env, IO_NORMAL);
1830 args->u.normal.via_iter = from;
1831 args->u.normal.via_iocb = iocb;
1833 rc_normal = ll_file_io_generic(env, args, file, CIT_WRITE,
1834 &iocb->ki_pos, iov_iter_count(from));
1836 /* On success, combine bytes written. */
1837 if (rc_tiny >= 0 && rc_normal > 0)
1838 rc_normal += rc_tiny;
1839 /* On error, only return error from normal write if tiny write did not
1840 * write any bytes. Otherwise return bytes written by tiny write.
1842 else if (rc_tiny > 0)
1843 rc_normal = rc_tiny;
1845 cl_env_put(env, &refcheck);
1848 ll_rw_stats_tally(ll_i2sbi(file_inode(file)), current->pid,
1849 LUSTRE_FPRIVATE(file), iocb->ki_pos,
/* Pre-iov_iter kernels: emulate read_iter/write_iter with aio wrappers. */
1854 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1856 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1858 static int ll_file_get_iov_count(const struct iovec *iov,
1859 unsigned long *nr_segs, size_t *count)
1864 for (seg = 0; seg < *nr_segs; seg++) {
1865 const struct iovec *iv = &iov[seg];
1868 * If any segment has a negative length, or the cumulative
1869 * length ever wraps negative then return -EINVAL.
1872 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1874 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1879 cnt -= iv->iov_len; /* This segment is no good */
/* aio read entry for pre-read_iter kernels: validate the iovec, wrap it in
 * an iov_iter and hand off to ll_file_read_iter(). */
1886 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1887 unsigned long nr_segs, loff_t pos)
1894 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1901 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1902 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1903 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1904 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1905 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1907 result = ll_file_read_iter(iocb, &to);
/* Synchronous read entry for pre-read_iter kernels: build a one-segment
 * iovec and a sync kiocb, then delegate to ll_file_aio_read().
 * Mirrors ll_file_write() below. */
1912 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1915 struct iovec iov = { .iov_base = buf, .iov_len = count };
1924 init_sync_kiocb(&kiocb, file);
1925 kiocb.ki_pos = *ppos;
1926 #ifdef HAVE_KIOCB_KI_LEFT
1927 kiocb.ki_left = count;
1928 #elif defined(HAVE_KI_NBYTES)
/* fix: the struct kiocb field is ki_nbytes (cf. ll_file_write()); the
 * previous "i_nbytes" does not exist and cannot compile under
 * HAVE_KI_NBYTES */
1929 kiocb.ki_nbytes = count;
1932 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1933 *ppos = kiocb.ki_pos;
1939 * Write to a file (through the page cache).
1942 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1943 unsigned long nr_segs, loff_t pos)
1945 struct iov_iter from;
/* validate segments and compute the total byte count before building the
 * iov_iter */
1950 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1957 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1958 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1959 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1960 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1961 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1963 result = ll_file_write_iter(iocb, &from);
/* Synchronous write entry for pre-write_iter kernels: build a one-segment
 * iovec and a sync kiocb, then delegate to ll_file_aio_write(). */
1968 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1969 size_t count, loff_t *ppos)
1971 struct iovec iov = { .iov_base = (void __user *)buf,
1981 init_sync_kiocb(&kiocb, file);
1982 kiocb.ki_pos = *ppos;
1983 #ifdef HAVE_KIOCB_KI_LEFT
1984 kiocb.ki_left = count;
1985 #elif defined(HAVE_KI_NBYTES)
1986 kiocb.ki_nbytes = count;
1989 result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1990 *ppos = kiocb.ki_pos;
1997 * Send file content (through pagecache) somewhere with helper
1999 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
2000 struct pipe_inode_info *pipe, size_t count,
2004 struct vvp_io_args *args;
/* PCC may serve the splice read directly from its local cache copy */
2011 result = pcc_file_splice_read(in_file, ppos, pipe,
2012 count, flags, &cached);
2016 ll_ras_enter(in_file);
2018 env = cl_env_get(&refcheck);
2020 RETURN(PTR_ERR(env));
2022 args = ll_env_args(env, IO_SPLICE);
2023 args->u.splice.via_pipe = pipe;
2024 args->u.splice.via_flags = flags;
2026 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
2027 cl_env_put(env, &refcheck);
2030 ll_rw_stats_tally(ll_i2sbi(file_inode(in_file)), current->pid,
2031 LUSTRE_FPRIVATE(in_file), *ppos, result,
/* Apply a new striping EA to 'inode' by opening it with the given lum via an
 * MDS open intent, under the inode size lock. */
2036 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
2037 __u64 flags, struct lov_user_md *lum, int lum_size)
2039 struct lookup_intent oit = {
2041 .it_flags = flags | MDS_OPEN_BY_FID,
2046 if ((__swab32(lum->lmm_magic) & le32_to_cpu(LOV_MAGIC_MASK)) ==
2047 le32_to_cpu(LOV_MAGIC_MAGIC)) {
2048 /* this code will only exist for big-endian systems */
2049 lustre_swab_lov_user_md(lum, 0);
2052 ll_inode_size_lock(inode);
2053 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
2055 GOTO(out_unlock, rc);
/* the open handle is only needed to carry the setstripe; close it now */
2057 ll_release_openhandle(dentry, &oit);
2060 ll_inode_size_unlock(inode);
2061 ll_intent_release(&oit);
/* Fetch the LOV EA for 'filename' (child of 'inode') from the MDS and return
 * it host-endian in *lmmp; caller releases *request. */
2066 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
2067 struct lov_mds_md **lmmp, int *lmm_size,
2068 struct ptlrpc_request **request)
2070 struct ll_sb_info *sbi = ll_i2sbi(inode);
2071 struct mdt_body *body;
2072 struct lov_mds_md *lmm = NULL;
2073 struct ptlrpc_request *req = NULL;
2074 struct md_op_data *op_data;
2077 rc = ll_get_default_mdsize(sbi, &lmmsize);
2081 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
2082 strlen(filename), lmmsize,
2083 LUSTRE_OPC_ANY, NULL);
2084 if (IS_ERR(op_data))
2085 RETURN(PTR_ERR(op_data));
2087 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
2088 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
2089 ll_finish_md_op_data(op_data);
2091 CDEBUG(D_INFO, "md_getattr_name failed "
2092 "on %s: rc %d\n", filename, rc);
2096 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2097 LASSERT(body != NULL); /* checked by mdc_getattr_name */
2099 lmmsize = body->mbo_eadatasize;
2101 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
2103 GOTO(out, rc = -ENODATA);
2106 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
2107 LASSERT(lmm != NULL);
/* only plain v1/v3, composite and foreign layouts are understood here */
2109 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
2110 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
2111 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1) &&
2112 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_FOREIGN))
2113 GOTO(out, rc = -EPROTO);
2116 * This is coming from the MDS, so is probably in
2117 * little endian. We convert it to host endian before
2118 * passing it to userspace.
2120 if ((lmm->lmm_magic & __swab32(LOV_MAGIC_MAGIC)) ==
2121 __swab32(LOV_MAGIC_MAGIC)) {
2122 int stripe_count = 0;
2124 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
2125 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2126 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
2127 if (le32_to_cpu(lmm->lmm_pattern) &
2128 LOV_PATTERN_F_RELEASED)
2132 lustre_swab_lov_user_md((struct lov_user_md *)lmm, 0);
2134 /* if function called for directory - we should
2135 * avoid swabbing non-existent lsm objects */
2136 if (lmm->lmm_magic == LOV_MAGIC_V1 && S_ISREG(body->mbo_mode))
2137 lustre_swab_lov_user_md_objects(
2138 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
2140 else if (lmm->lmm_magic == LOV_MAGIC_V3 &&
2141 S_ISREG(body->mbo_mode))
2142 lustre_swab_lov_user_md_objects(
2143 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
2149 *lmm_size = lmmsize;
/* LL_IOC_LOV_SETEA: copy a lov_user_md (with one ost_data entry) from
 * userspace and apply it; requires CAP_SYS_ADMIN. */
2154 static int ll_lov_setea(struct inode *inode, struct file *file,
2157 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2158 struct lov_user_md *lump;
2159 int lum_size = sizeof(struct lov_user_md) +
2160 sizeof(struct lov_user_ost_data);
2164 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2167 OBD_ALLOC_LARGE(lump, lum_size);
2171 if (copy_from_user(lump, arg, lum_size))
2172 GOTO(out_lump, rc = -EFAULT);
2174 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
2176 cl_lov_delay_create_clear(&file->f_flags);
2179 OBD_FREE_LARGE(lump, lum_size);
/* Copy this file's striping information to the userspace buffer 'lum' of
 * 'size' bytes via the cl_object layer. */
2183 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2190 env = cl_env_get(&refcheck);
2192 RETURN(PTR_ERR(env));
2194 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2195 cl_env_put(env, &refcheck);
/* LL_IOC_LOV_SETSTRIPE: copy the user lum, apply it, refresh the layout and
 * echo the resulting striping back to userspace. */
2199 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2202 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2203 struct lov_user_md *klum;
2205 __u64 flags = FMODE_WRITE;
2208 rc = ll_copy_user_md(lum, &klum);
2213 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
/* zero the user's stripe count first so a failed getstripe is detectable */
2218 rc = put_user(0, &lum->lmm_stripe_count);
2222 rc = ll_layout_refresh(inode, &gen);
2226 rc = ll_file_getstripe(inode, arg, lum_size);
2228 cl_lov_delay_create_clear(&file->f_flags);
2231 OBD_FREE_LARGE(klum, lum_size);
/* Take a group lock with id 'arg' on the file, serialized by
 * lli_group_mutex; only one gid may hold group locks on an inode at a time,
 * so non-matching requests wait (or fail with -EAGAIN for O_NONBLOCK). */
2237 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2239 struct ll_inode_info *lli = ll_i2info(inode);
2240 struct cl_object *obj = lli->lli_clob;
2241 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2242 struct ll_grouplock grouplock;
2247 CWARN("group id for group lock must not be 0\n");
2251 if (ll_file_nolock(file))
2252 RETURN(-EOPNOTSUPP);
2254 if (file->f_flags & O_NONBLOCK) {
2255 if (!mutex_trylock(&lli->lli_group_mutex))
2258 mutex_lock(&lli->lli_group_mutex);
2260 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2261 CWARN("group lock already existed with gid %lu\n",
2262 fd->fd_grouplock.lg_gid);
2263 GOTO(out, rc = -EINVAL);
/* another gid holds the lock: wait for its users to drain, then retry */
2265 if (arg != lli->lli_group_gid && lli->lli_group_users != 0) {
2266 if (file->f_flags & O_NONBLOCK)
2267 GOTO(out, rc = -EAGAIN);
2268 mutex_unlock(&lli->lli_group_mutex);
2269 wait_var_event(&lli->lli_group_users, !lli->lli_group_users);
2270 GOTO(retry, rc = 0);
2272 LASSERT(fd->fd_grouplock.lg_lock == NULL);
2275 * XXX: group lock needs to protect all OST objects while PFL
2276 * can add new OST objects during the IO, so we'd instantiate
2277 * all OST objects before getting its group lock.
2282 struct cl_layout cl = {
2283 .cl_is_composite = false,
2285 struct lu_extent ext = {
2287 .e_end = OBD_OBJECT_EOF,
2290 env = cl_env_get(&refcheck);
2292 GOTO(out, rc = PTR_ERR(env));
2294 rc = cl_object_layout_get(env, obj, &cl);
2295 if (!rc && cl.cl_is_composite)
2296 rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2299 cl_env_put(env, &refcheck);
2304 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2305 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* record the lock on the fd and bump the per-inode user count */
2310 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2311 fd->fd_grouplock = grouplock;
2312 if (lli->lli_group_users == 0)
2313 lli->lli_group_gid = grouplock.lg_gid;
2314 lli->lli_group_users++;
2316 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
2318 mutex_unlock(&lli->lli_group_mutex);
/* Drop the group lock with id 'arg' held by this fd; wake waiters once the
 * last user of the current gid is gone. */
2323 static int ll_put_grouplock(struct inode *inode, struct file *file,
2326 struct ll_inode_info *lli = ll_i2info(inode);
2327 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2328 struct ll_grouplock grouplock;
2332 mutex_lock(&lli->lli_group_mutex);
2333 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2334 CWARN("no group lock held\n");
2335 GOTO(out, rc = -EINVAL);
2338 LASSERT(fd->fd_grouplock.lg_lock != NULL);
2340 if (fd->fd_grouplock.lg_gid != arg) {
2341 CWARN("group lock %lu doesn't match current id %lu\n",
2342 arg, fd->fd_grouplock.lg_gid);
2343 GOTO(out, rc = -EINVAL);
/* detach the lock from the fd before releasing it */
2346 grouplock = fd->fd_grouplock;
2347 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2348 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2350 cl_put_grouplock(&grouplock);
2352 lli->lli_group_users--;
2353 if (lli->lli_group_users == 0) {
2354 lli->lli_group_gid = 0;
2355 wake_up_var(&lli->lli_group_users);
2357 CDEBUG(D_INFO, "group lock %lu released\n", arg);
2360 mutex_unlock(&lli->lli_group_mutex);
2366 * Close inode open handle
2368 * \param dentry [in] dentry which contains the inode
2369 * \param it [in,out] intent which contains open info and result
2372 * \retval <0 failure
2374 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2376 struct inode *inode = dentry->d_inode;
2377 struct obd_client_handle *och;
2383 /* Root ? Do nothing. */
2384 if (dentry->d_inode->i_sb->s_root == dentry)
2387 /* No open handle to close? Move away */
2388 if (!it_disposition(it, DISP_OPEN_OPEN))
2391 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2393 OBD_ALLOC(och, sizeof(*och));
2395 GOTO(out, rc = -ENOMEM);
2397 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
/* och carries the MDS open handle; ll_close_inode_openhandle() frees it */
2401 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2403 /* this one is in place of ll_file_open */
2404 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2405 ptlrpc_req_finished(it->it_request);
2406 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2412 * Get size for inode for which FIEMAP mapping is requested.
2413 * Make the FIEMAP get_info call and return the result.
2414 * \param fiemap kernel buffer to hold extents
2415 * \param num_bytes kernel buffer size
2417 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2423 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2426 /* Checks for fiemap flags */
2427 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2428 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2432 /* Check for FIEMAP_FLAG_SYNC */
2433 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2434 rc = filemap_fdatawrite(inode->i_mapping);
2439 env = cl_env_get(&refcheck);
2441 RETURN(PTR_ERR(env));
/* a zero cached size may just be stale; glimpse the OSTs to refresh it */
2443 if (i_size_read(inode) == 0) {
2444 rc = ll_glimpse_size(inode);
2449 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2450 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2451 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2453 /* If filesize is 0, then there would be no objects for mapping */
2454 if (fmkey.lfik_oa.o_size == 0) {
2455 fiemap->fm_mapped_extents = 0;
2459 fmkey.lfik_fiemap = *fiemap;
2461 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2462 &fmkey, fiemap, &num_bytes);
2464 cl_env_put(env, &refcheck);
/* OBD_IOC_FID2PATH ioctl: resolve a FID to a path via the MDC, copying the
 * getinfo_fid2path request/reply through a kernel bounce buffer. */
2468 int ll_fid2path(struct inode *inode, void __user *arg)
2470 struct obd_export *exp = ll_i2mdexp(inode);
2471 const struct getinfo_fid2path __user *gfin = arg;
2473 struct getinfo_fid2path *gfout;
/* allowed for privileged users, or when the fs enables user fid2path */
2479 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2480 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2483 /* Only need to get the buflen */
2484 if (get_user(pathlen, &gfin->gf_pathlen))
2487 if (pathlen > PATH_MAX)
2490 outsize = sizeof(*gfout) + pathlen;
2491 OBD_ALLOC(gfout, outsize);
2495 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2496 GOTO(gf_free, rc = -EFAULT);
2497 /* append root FID after gfout to let MDT know the root FID so that it
2498 * can lookup the correct path, this is mainly for fileset.
2499 * old server without fileset mount support will ignore this. */
2500 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2502 /* Call mdc_iocontrol */
2503 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2507 if (copy_to_user(arg, gfout, outsize))
2511 OBD_FREE(gfout, outsize);
/* Run a CIT_DATA_VERSION cl_io to fill ioc->idv_version and
 * idv_layout_version for the file; a file with no object reports version 0. */
2516 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2518 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2526 ioc->idv_version = 0;
2527 ioc->idv_layout_version = UINT_MAX;
2529 /* If no file object initialized, we consider its version is 0. */
2533 env = cl_env_get(&refcheck);
2535 RETURN(PTR_ERR(env));
2537 io = vvp_env_thread_io(env);
2539 io->u.ci_data_version.dv_data_version = 0;
2540 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2541 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2544 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2545 result = cl_io_loop(env, io);
2547 result = io->ci_result;
2549 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2550 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2552 cl_io_fini(env, io);
/* the IO may ask for a restart (e.g. after a layout change) */
2554 if (unlikely(io->ci_need_restart))
2557 cl_env_put(env, &refcheck);
2563 * Read the data_version for inode.
2565 * This value is computed using stripe object version on OST.
2566 * Version is computed using server side locking.
2568 * @param flags whether to sync on the OST side;
2570 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2571 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
2573 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2575 struct ioc_data_version ioc = { .idv_flags = flags };
2578 rc = ll_ioc_data_version(inode, &ioc);
2580 *data_version = ioc.idv_version;
2586 * Trigger a HSM release request for the provided inode.
 *
 * Sequence (per the visible lines): take a write lease with
 * MDS_OPEN_RELEASE intent, grab the latest data version with a full
 * write flush, merge attributes, then close the open handle with the
 * MDS_HSM_RELEASE bias.  On any failure the lease is closed explicitly.
2588 int ll_hsm_release(struct inode *inode)
2591 struct obd_client_handle *och = NULL;
2592 __u64 data_version = 0;
2597 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2598 ll_i2sbi(inode)->ll_fsname,
2599 PFID(&ll_i2info(inode)->lli_fid));
2601 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2603 GOTO(out, rc = PTR_ERR(och));
2605 /* Grab latest data_version and [am]time values */
2606 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2610 env = cl_env_get(&refcheck);
2612 GOTO(out, rc = PTR_ERR(env));
2614 rc = ll_merge_attr(env, inode);
2615 cl_env_put(env, &refcheck);
2617 /* If error happen, we have the wrong size for a file.
2623 /* Release the file.
2624 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2625 * we still need it to pack l_remote_handle to MDT. */
2626 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
/* Error path: och may be valid, an ERR_PTR, or NULL — only close a
 * real handle. */
2632 if (och != NULL && !IS_ERR(och)) /* close the file */
2633 ll_lease_close(och, inode, NULL);
/* Bookkeeping for ll_swap_layouts(): the two inodes being swapped plus
 * per-file data-version snapshots/checks (fields elided in this view). */
2638 struct ll_swap_stack {
2641 struct inode *inode1;
2642 struct inode *inode2;
/*
 * Swap the layouts of two open files.  Validates the pair, optionally
 * takes group locks to flush dirty cache, optionally verifies that
 * the data versions have not changed, then sends the swap request to
 * the MDT via LL_IOC_LOV_SWAP_LAYOUTS.
 */
2647 static int ll_swap_layouts(struct file *file1, struct file *file2,
2648 struct lustre_swap_layouts *lsl)
2650 struct mdc_swap_layouts msl;
2651 struct md_op_data *op_data;
2654 struct ll_swap_stack *llss = NULL;
2657 OBD_ALLOC_PTR(llss);
2661 llss->inode1 = file_inode(file1);
2662 llss->inode2 = file_inode(file2);
2664 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2668 /* we use 2 bool because it is easier to swap than 2 bits */
2669 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2670 llss->check_dv1 = true;
2672 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2673 llss->check_dv2 = true;
2675 /* we cannot use lsl->sl_dvX directly because we may swap them */
2676 llss->dv1 = lsl->sl_dv1;
2677 llss->dv2 = lsl->sl_dv2;
/* Order the pair by FID so two concurrent swaps of the same files
 * always lock in the same order. */
2679 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2680 if (rc == 0) /* same file, done! */
2683 if (rc < 0) { /* sequentialize it */
2684 swap(llss->inode1, llss->inode2);
2686 swap(llss->dv1, llss->dv2);
2687 swap(llss->check_dv1, llss->check_dv2);
2691 if (gid != 0) { /* application asks to flush dirty cache */
2692 rc = ll_get_grouplock(llss->inode1, file1, gid);
2696 rc = ll_get_grouplock(llss->inode2, file2, gid);
/* second grouplock failed: undo the first before bailing out */
2698 ll_put_grouplock(llss->inode1, file1, gid);
2703 /* ultimate check, before swaping the layouts we check if
2704 * dataversion has changed (if requested) */
2705 if (llss->check_dv1) {
2706 rc = ll_data_version(llss->inode1, &dv, 0);
2709 if (dv != llss->dv1)
2710 GOTO(putgl, rc = -EAGAIN);
2713 if (llss->check_dv2) {
2714 rc = ll_data_version(llss->inode2, &dv, 0);
2717 if (dv != llss->dv2)
2718 GOTO(putgl, rc = -EAGAIN);
2721 /* struct md_op_data is used to send the swap args to the mdt
2722 * only flags is missing, so we use struct mdc_swap_layouts
2723 * through the md_op_data->op_data */
2724 /* flags from user space have to be converted before they are send to
2725 * server, no flag is sent today, they are only used on the client */
2728 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2729 0, LUSTRE_OPC_ANY, &msl)
2730 if (IS_ERR(op_data))
2731 GOTO(free, rc = PTR_ERR(op_data));
2733 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2734 sizeof(*op_data), op_data, NULL);
2735 ll_finish_md_op_data(op_data);
/* putgl: drop group locks in reverse acquisition order */
2742 ll_put_grouplock(llss->inode2, file2, gid);
2743 ll_put_grouplock(llss->inode1, file1, gid);
/*
 * Set/clear HSM state flags on a file via the MDT.
 * Validates the requested masks and archive id before sending the
 * LL_IOC_HSM_STATE_SET request through obd_iocontrol().
 */
2753 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2755 struct obd_export *exp = ll_i2mdexp(inode);
2756 struct md_op_data *op_data;
2760 /* Detect out-of range masks */
2761 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2764 /* Non-root users are forbidden to set or clear flags which are
2765 * NOT defined in HSM_USER_MASK. */
2766 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2767 !cfs_capable(CFS_CAP_SYS_ADMIN))
/* Old servers without archive-id arrays only accept a bounded id. */
2770 if (!exp_connect_archive_id_array(exp)) {
2771 /* Detect out-of range archive id */
2772 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2773 (hss->hss_archive_id > LL_HSM_ORIGIN_MAX_ARCHIVE))
2777 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2778 LUSTRE_OPC_ANY, hss);
2779 if (IS_ERR(op_data))
2780 RETURN(PTR_ERR(op_data));
2782 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, exp, sizeof(*op_data),
2785 ll_finish_md_op_data(op_data);
/*
 * Import a file into HSM: mark it ARCHIVED|EXISTS|RELEASED with the
 * requested archive id, then restore the user-supplied mode, owner,
 * size and timestamps via ll_setattr_raw().  Regular files only.
 */
2790 static int ll_hsm_import(struct inode *inode, struct file *file,
2791 struct hsm_user_import *hui)
2793 struct hsm_state_set *hss = NULL;
2794 struct iattr *attr = NULL;
2798 if (!S_ISREG(inode->i_mode))
2804 GOTO(out, rc = -ENOMEM);
/* Step 1: set the HSM state so the file is seen as released copy. */
2806 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2807 hss->hss_archive_id = hui->hui_archive_id;
2808 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2809 rc = ll_hsm_state_set(inode, hss);
2813 OBD_ALLOC_PTR(attr);
2815 GOTO(out, rc = -ENOMEM);
/* Step 2: rebuild the attributes from the import request; force
 * S_IFREG since only regular files reach this point. */
2817 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2818 attr->ia_mode |= S_IFREG;
2819 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2820 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2821 attr->ia_size = hui->hui_size;
2822 attr->ia_mtime.tv_sec = hui->hui_mtime;
2823 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2824 attr->ia_atime.tv_sec = hui->hui_atime;
2825 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2827 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2828 ATTR_UID | ATTR_GID |
2829 ATTR_MTIME | ATTR_MTIME_SET |
2830 ATTR_ATIME | ATTR_ATIME_SET;
2834 rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2838 inode_unlock(inode);
/* Translate a kernel fmode_t into the LL_LEASE_{RD,WR}LCK bitmask
 * reported to userspace. */
2850 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2852 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2853 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * LL_IOC_FUTIMES_3 helper: set atime/mtime/ctime (including ctime,
 * which plain utimes(2) cannot) from an ll_futimes_3 request.
 * Requires CAP_SYS_ADMIN and a regular file.
 */
2856 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2858 struct inode *inode = file_inode(file);
2860 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2861 ATTR_MTIME | ATTR_MTIME_SET |
2864 .tv_sec = lfu->lfu_atime_sec,
2865 .tv_nsec = lfu->lfu_atime_nsec,
2868 .tv_sec = lfu->lfu_mtime_sec,
2869 .tv_nsec = lfu->lfu_mtime_nsec,
2872 .tv_sec = lfu->lfu_ctime_sec,
2873 .tv_nsec = lfu->lfu_ctime_nsec,
2879 if (!capable(CAP_SYS_ADMIN))
2882 if (!S_ISREG(inode->i_mode))
/* OP_XVALID_CTIME_SET lets the explicit ctime through setattr. */
2886 rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2888 inode_unlock(inode);
/* Map a userspace lock_mode_user value to the kernel cl_lock_mode
 * used by the cl_io layer (READ/WRITE; other values elided here). */
2893 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2896 case MODE_READ_USER:
2898 case MODE_WRITE_USER:
/* Printable names for userspace lock modes, used in debug output. */
2905 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2907 /* Used to allow the upper layers of the client to request an LDLM lock
2908 * without doing an actual read or write.
2910 * Used for ladvise lockahead to manually request specific locks.
2912 * \param[in] file file this ladvise lock request is on
2913 * \param[in] ladvise ladvise struct describing this lock request
2915 * \retval 0 success, no detailed result available (sync requests
2916 * and requests sent to the server [not handled locally]
2917 * cannot return detailed results)
2918 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2919 * see definitions for details.
2920 * \retval negative negative errno on error
2922 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2924 struct lu_env *env = NULL;
2925 struct cl_io *io = NULL;
2926 struct cl_lock *lock = NULL;
2927 struct cl_lock_descr *descr = NULL;
2928 struct dentry *dentry = file->f_path.dentry;
2929 struct inode *inode = dentry->d_inode;
2930 enum cl_lock_mode cl_mode;
2931 off_t start = ladvise->lla_start;
2932 off_t end = ladvise->lla_end;
2938 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2939 "start=%llu, end=%llu\n", dentry->d_name.len,
2940 dentry->d_name.name, dentry->d_inode,
2941 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2944 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2946 GOTO(out, result = cl_mode);
2948 /* Get IO environment */
2949 result = cl_io_get(inode, &env, &io, &refcheck);
2953 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2956 * nothing to do for this io. This currently happens when
2957 * stripe sub-object's are not yet created.
2959 result = io->ci_result;
2960 } else if (result == 0) {
2961 lock = vvp_env_lock(env);
2962 descr = &lock->cll_descr;
/* Build the lock descriptor from the advice parameters. */
2964 descr->cld_obj = io->ci_obj;
2965 /* Convert byte offsets to pages */
2966 descr->cld_start = cl_index(io->ci_obj, start);
2967 descr->cld_end = cl_index(io->ci_obj, end);
2968 descr->cld_mode = cl_mode;
2969 /* CEF_MUST is used because we do not want to convert a
2970 * lockahead request to a lockless lock */
2971 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2974 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2975 descr->cld_enq_flags |= CEF_SPECULATIVE;
2977 result = cl_lock_request(env, io, lock);
2979 /* On success, we need to release the lock */
2981 cl_lock_release(env, lock);
2983 cl_io_fini(env, io);
2984 cl_env_put(env, &refcheck);
2986 /* -ECANCELED indicates a matching lock with a different extent
2987 * was already present, and -EEXIST indicates a matching lock
2988 * on exactly the same extent was already present.
2989 * We convert them to positive values for userspace to make
2990 * recognizing true errors easier.
2991 * Note we can only return these detailed results on async requests,
2992 * as sync requests look the same as i/o requests for locking. */
2993 if (result == -ECANCELED)
2994 result = LLA_RESULT_DIFFERENT;
2995 else if (result == -EEXIST)
2996 result = LLA_RESULT_SAME;
/* Printable names for ladvise advice types, used in debug output. */
3001 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
/*
 * Validate a single ladvise entry before it is acted upon:
 * advice number in range, per-advice flags within the allowed mask,
 * lockahead mode valid, and start < end for range-based advices.
 * Returns 0 on success, negative errno (logged via CDEBUG) otherwise.
 */
3003 static int ll_ladvise_sanity(struct inode *inode,
3004 struct llapi_lu_ladvise *ladvise)
3006 struct ll_sb_info *sbi = ll_i2sbi(inode);
3007 enum lu_ladvise_type advice = ladvise->lla_advice;
3008 /* Note the peradvice flags is a 32 bit field, so per advice flags must
3009 * be in the first 32 bits of enum ladvise_flags */
3010 __u32 flags = ladvise->lla_peradvice_flags;
3011 /* 3 lines at 80 characters per line, should be plenty */
3014 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
3016 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
3017 "last supported advice is %s (value '%d'): rc = %d\n",
3018 sbi->ll_fsname, advice,
3019 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
3023 /* Per-advice checks */
3025 case LU_LADVISE_LOCKNOEXPAND:
3026 if (flags & ~LF_LOCKNOEXPAND_MASK) {
3028 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
3029 "rc = %d\n", sbi->ll_fsname, flags,
3030 ladvise_names[advice], rc);
3034 case LU_LADVISE_LOCKAHEAD:
3035 /* Currently only READ and WRITE modes can be requested */
3036 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
3037 ladvise->lla_lockahead_mode == 0) {
3039 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
3040 "rc = %d\n", sbi->ll_fsname,
3041 ladvise->lla_lockahead_mode,
3042 ladvise_names[advice], rc);
3046 case LU_LADVISE_WILLREAD:
3047 case LU_LADVISE_DONTNEED:
3049 /* Note fall through above - These checks apply to all advices
3050 * except LOCKNOEXPAND */
3051 if (flags & ~LF_DEFAULT_MASK) {
3053 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
3054 "rc = %d\n", sbi->ll_fsname, flags,
3055 ladvise_names[advice], rc);
3058 if (ladvise->lla_start >= ladvise->lla_end) {
3060 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
3061 "for %s: rc = %d\n", sbi->ll_fsname,
3062 ladvise->lla_start, ladvise->lla_end,
3063 ladvise_names[advice], rc);
3075 * Give file access advices
3077 * The ladvise interface is similar to Linux fadvise() system call, except it
3078 * forwards the advices directly from Lustre client to server. The server side
3079 * codes will apply appropriate read-ahead and caching techniques for the
3080 * corresponding files.
3082 * A typical workload for ladvise is e.g. a bunch of different clients are
3083 * doing small random reads of a file, so prefetching pages into OSS cache
3084 * with big linear reads before the random IO is a net benefit. Fetching
3085 * all that data into each client cache with fadvise() may not be, due to
3086 * much more data being sent to the client.
3088 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
3089 struct llapi_lu_ladvise *ladvise)
3093 struct cl_ladvise_io *lio;
3098 env = cl_env_get(&refcheck);
3100 RETURN(PTR_ERR(env));
3102 io = vvp_env_thread_io(env);
3103 io->ci_obj = ll_i2info(inode)->lli_clob;
3105 /* initialize parameters for ladvise */
3106 lio = &io->u.ci_ladvise;
3107 lio->li_start = ladvise->lla_start;
3108 lio->li_end = ladvise->lla_end;
3109 lio->li_fid = ll_inode2fid(inode);
3110 lio->li_advice = ladvise->lla_advice;
3111 lio->li_flags = flags;
/* Run the advice through the cl_io state machine (CIT_LADVISE). */
3113 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
3114 rc = cl_io_loop(env, io);
3118 cl_io_fini(env, io);
3119 cl_env_put(env, &refcheck);
/* Toggle per-file-descriptor "no lock expansion" behavior:
 * LF_UNSET in flags clears it, otherwise it is enabled. */
3123 static int ll_lock_noexpand(struct file *file, int flags)
3125 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3127 fd->ll_lock_no_expand = !(flags & LF_UNSET);
/*
 * FS_IOC_FSGETXATTR-style handler: report the inode's xflags and
 * project id to userspace in a struct fsxattr.
 */
3132 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
3135 struct fsxattr fsxattr;
3137 if (copy_from_user(&fsxattr,
3138 (const struct fsxattr __user *)arg,
3142 fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
/* PROJINHERIT is tracked in lli flags, not in i_flags. */
3143 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
3144 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
3145 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
3146 if (copy_to_user((struct fsxattr __user *)arg,
3147 &fsxattr, sizeof(fsxattr)))
/*
 * Permission check for project quota changes from FSSETXATTR:
 * inside the init user namespace everything is allowed; otherwise
 * the project id and the PROJINHERIT flag must remain unchanged.
 */
3153 int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
3156 * Project Quota ID state is only allowed to change from within the init
3157 * namespace. Enforce that restriction only if we are trying to change
3158 * the quota ID state. Everything else is allowed in user namespaces.
3160 if (current_user_ns() == &init_user_ns)
3163 if (ll_i2info(inode)->lli_projid != fa->fsx_projid)
3166 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) {
3167 if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
3170 if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
/*
 * FS_IOC_FSSETXATTR-style handler: update xflags and project id.
 * The change is first committed on the MDT via md_setattr(), then
 * mirrored locally and pushed to the OSTs through cl_setattr_ost().
 */
3177 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3181 struct md_op_data *op_data;
3182 struct ptlrpc_request *req = NULL;
3184 struct fsxattr fsxattr;
3185 struct cl_object *obj;
3189 if (copy_from_user(&fsxattr,
3190 (const struct fsxattr __user *)arg,
3194 rc = ll_ioctl_check_project(inode, &fsxattr);
3198 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3199 LUSTRE_OPC_ANY, NULL);
3200 if (IS_ERR(op_data))
3201 RETURN(PTR_ERR(op_data));
/* Translate userspace xflags to on-disk inode flags for the MDT. */
3203 flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3204 op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3205 if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3206 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3207 op_data->op_projid = fsxattr.fsx_projid;
3208 op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
3209 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3211 ptlrpc_req_finished(req);
3213 GOTO(out_fsxattr, rc);
3214 ll_update_inode_flags(inode, op_data->op_attr_flags);
/* No data objects yet — nothing to propagate to the OSTs. */
3215 obj = ll_i2info(inode)->lli_clob;
3217 GOTO(out_fsxattr, rc);
3219 OBD_ALLOC_PTR(attr);
3221 GOTO(out_fsxattr, rc = -ENOMEM);
3223 rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3224 fsxattr.fsx_xflags);
3227 ll_finish_md_op_data(op_data);
/*
 * Release the lease held on this file descriptor, optionally carrying
 * a close intent: RESYNC_DONE, LAYOUT_MERGE, LAYOUT_SPLIT or
 * PCC_ATTACH.  The intent data is gathered from userspace (arg points
 * just past the ll_ioc_lease header for the per-intent payload) and
 * passed to ll_lease_close_intent() as the close bias.
 * Returns the lease type that was held (via ll_lease_type_from_fmode)
 * or a negative errno.
 */
3231 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3234 struct inode *inode = file_inode(file);
3235 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3236 struct ll_inode_info *lli = ll_i2info(inode);
3237 struct obd_client_handle *och = NULL;
3238 struct split_param sp;
3239 struct pcc_param param;
3240 bool lease_broken = false;
3242 enum mds_op_bias bias = 0;
3243 struct file *layout_file = NULL;
3245 size_t data_size = 0;
3246 bool attached = false;
/* Detach the lease handle from the fd under lli_och_mutex so no
 * concurrent user can release it twice. */
3251 mutex_lock(&lli->lli_och_mutex);
3252 if (fd->fd_lease_och != NULL) {
3253 och = fd->fd_lease_och;
3254 fd->fd_lease_och = NULL;
3256 mutex_unlock(&lli->lli_och_mutex);
3261 fmode = och->och_flags;
3263 switch (ioc->lil_flags) {
3264 case LL_LEASE_RESYNC_DONE:
3265 if (ioc->lil_count > IOC_IDS_MAX)
3266 GOTO(out_lease_close, rc = -EINVAL);
/* Variable-length id array follows the ioctl header. */
3268 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3269 OBD_ALLOC(data, data_size);
3271 GOTO(out_lease_close, rc = -ENOMEM);
3273 if (copy_from_user(data, (void __user *)arg, data_size))
3274 GOTO(out_lease_close, rc = -EFAULT);
3276 bias = MDS_CLOSE_RESYNC_DONE;
3278 case LL_LEASE_LAYOUT_MERGE: {
3281 if (ioc->lil_count != 1)
3282 GOTO(out_lease_close, rc = -EINVAL);
/* Payload: one __u32 fd of the victim file to merge from. */
3284 arg += sizeof(*ioc);
3285 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3286 GOTO(out_lease_close, rc = -EFAULT);
3288 layout_file = fget(fd);
3290 GOTO(out_lease_close, rc = -EBADF);
3292 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3293 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3294 GOTO(out_lease_close, rc = -EPERM);
3296 data = file_inode(layout_file);
3297 bias = MDS_CLOSE_LAYOUT_MERGE;
3300 case LL_LEASE_LAYOUT_SPLIT: {
3304 if (ioc->lil_count != 2)
3305 GOTO(out_lease_close, rc = -EINVAL);
/* Payload: __u32 victim fd followed by __u32 mirror id. */
3307 arg += sizeof(*ioc);
3308 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3309 GOTO(out_lease_close, rc = -EFAULT);
3311 arg += sizeof(__u32);
3312 if (copy_from_user(&mirror_id, (void __user *)arg,
3314 GOTO(out_lease_close, rc = -EFAULT);
3316 layout_file = fget(fdv);
3318 GOTO(out_lease_close, rc = -EBADF);
3320 sp.sp_inode = file_inode(layout_file);
3321 sp.sp_mirror_id = (__u16)mirror_id;
3323 bias = MDS_CLOSE_LAYOUT_SPLIT;
3326 case LL_LEASE_PCC_ATTACH:
3327 if (ioc->lil_count != 1)
3330 arg += sizeof(*ioc);
3331 if (copy_from_user(&param.pa_archive_id, (void __user *)arg,
3333 GOTO(out_lease_close, rc2 = -EFAULT);
3335 rc2 = pcc_readwrite_attach(file, inode, param.pa_archive_id);
3337 GOTO(out_lease_close, rc2);
3340 /* Grab latest data version */
3341 rc2 = ll_data_version(inode, &param.pa_data_version,
3344 GOTO(out_lease_close, rc2);
3347 bias = MDS_PCC_ATTACH;
3350 /* without close intent */
3355 rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3359 rc = ll_lease_och_release(inode, file);
/* Per-intent cleanup after the close. */
3368 switch (ioc->lil_flags) {
3369 case LL_LEASE_RESYNC_DONE:
3371 OBD_FREE(data, data_size);
3373 case LL_LEASE_LAYOUT_MERGE:
3374 case LL_LEASE_LAYOUT_SPLIT:
3378 case LL_LEASE_PCC_ATTACH:
3381 rc = pcc_readwrite_attach_fini(file, inode,
3382 param.pa_layout_gen,
3389 rc = ll_lease_type_from_fmode(fmode);
/*
 * Acquire (or, for LL_LEASE_UNLCK, release via ll_file_unlock_lease)
 * a lease on this file descriptor.  The requested mode must be
 * compatible with how the file was opened.  On success the open
 * handle is stored in fd->fd_lease_och.
 */
3393 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3396 struct inode *inode = file_inode(file);
3397 struct ll_inode_info *lli = ll_i2info(inode);
3398 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3399 struct obd_client_handle *och = NULL;
3400 __u64 open_flags = 0;
3406 switch (ioc->lil_mode) {
3407 case LL_LEASE_WRLCK:
3408 if (!(file->f_mode & FMODE_WRITE))
3410 fmode = FMODE_WRITE;
3412 case LL_LEASE_RDLCK:
3413 if (!(file->f_mode & FMODE_READ))
3417 case LL_LEASE_UNLCK:
3418 RETURN(ll_file_unlock_lease(file, ioc, arg));
3423 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3425 /* apply for lease */
3426 if (ioc->lil_flags & LL_LEASE_RESYNC)
3427 open_flags = MDS_OPEN_RESYNC;
3428 och = ll_lease_open(inode, file, fmode, open_flags);
3430 RETURN(PTR_ERR(och));
3432 if (ioc->lil_flags & LL_LEASE_RESYNC) {
3433 rc = ll_lease_file_resync(och, inode, arg);
3435 ll_lease_close(och, inode, NULL);
3438 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3440 ll_lease_close(och, inode, NULL);
/* Install the handle unless another thread raced us to it. */
3446 mutex_lock(&lli->lli_och_mutex);
3447 if (fd->fd_lease_och == NULL) {
3448 fd->fd_lease_och = och;
3451 mutex_unlock(&lli->lli_och_mutex);
3453 /* impossible now that only excl is supported for now */
3454 ll_lease_close(och, inode, &lease_broken);
/*
 * Snapshot the per-inode access-heat counters into *heat, decayed to
 * "now" with the superblock's decay weight and period.  Serialized by
 * lli_heat_lock.
 */
3460 static void ll_heat_get(struct inode *inode, struct lu_heat *heat)
3462 struct ll_inode_info *lli = ll_i2info(inode);
3463 struct ll_sb_info *sbi = ll_i2sbi(inode);
3464 __u64 now = ktime_get_real_seconds();
3467 spin_lock(&lli->lli_heat_lock);
3468 heat->lh_flags = lli->lli_heat_flags;
3469 for (i = 0; i < heat->lh_count; i++)
3470 heat->lh_heat[i] = obd_heat_get(&lli->lli_heat_instances[i],
3471 now, sbi->ll_heat_decay_weight,
3472 sbi->ll_heat_period_second);
3473 spin_unlock(&lli->lli_heat_lock);
/*
 * Adjust per-inode heat tracking: LU_HEAT_FLAG_CLEAR resets all
 * counters; LU_HEAT_FLAG_OFF disables tracking, otherwise it is
 * (re-)enabled.  Serialized by lli_heat_lock.
 */
3476 static int ll_heat_set(struct inode *inode, enum lu_heat_flag flags)
3478 struct ll_inode_info *lli = ll_i2info(inode);
3481 spin_lock(&lli->lli_heat_lock);
3482 if (flags & LU_HEAT_FLAG_CLEAR)
3483 obd_heat_clear(lli->lli_heat_instances, OBD_HEAT_COUNT);
3485 if (flags & LU_HEAT_FLAG_OFF)
3486 lli->lli_heat_flags |= LU_HEAT_FLAG_OFF;
3488 lli->lli_heat_flags &= ~LU_HEAT_FLAG_OFF;
3490 spin_unlock(&lli->lli_heat_lock);
/*
 * Main ioctl dispatcher for regular Lustre files.  Copies arguments
 * from userspace, delegates to the per-command helpers above, and
 * falls through to obd_iocontrol() on the data export for commands
 * not handled here.  Tty ioctls are explicitly rejected.
 */
3496 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3498 struct inode *inode = file_inode(file);
3499 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3503 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3504 PFID(ll_inode2fid(inode)), inode, cmd);
3505 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3507 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
3508 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3512 case LL_IOC_GETFLAGS:
3513 /* Get the current value of the file flags */
3514 return put_user(fd->fd_flags, (int __user *)arg);
3515 case LL_IOC_SETFLAGS:
3516 case LL_IOC_CLRFLAGS:
3517 /* Set or clear specific file flags */
3518 /* XXX This probably needs checks to ensure the flags are
3519 * not abused, and to handle any flag side effects.
3521 if (get_user(flags, (int __user *) arg))
3524 if (cmd == LL_IOC_SETFLAGS) {
3525 if ((flags & LL_FILE_IGNORE_LOCK) &&
3526 !(file->f_flags & O_DIRECT)) {
3527 CERROR("%s: unable to disable locking on "
3528 "non-O_DIRECT file\n", current->comm);
3532 fd->fd_flags |= flags;
3534 fd->fd_flags &= ~flags;
3537 case LL_IOC_LOV_SETSTRIPE:
3538 case LL_IOC_LOV_SETSTRIPE_NEW:
3539 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3540 case LL_IOC_LOV_SETEA:
3541 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3542 case LL_IOC_LOV_SWAP_LAYOUTS: {
3544 struct lustre_swap_layouts lsl;
3546 if (copy_from_user(&lsl, (char __user *)arg,
3547 sizeof(struct lustre_swap_layouts)))
/* Both files must be writable for a layout swap. */
3550 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3553 file2 = fget(lsl.sl_fd);
3557 /* O_WRONLY or O_RDWR */
3558 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3559 GOTO(out, rc = -EPERM);
3561 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3562 struct inode *inode2;
3563 struct ll_inode_info *lli;
3564 struct obd_client_handle *och = NULL;
/* SWAP_LAYOUTS_CLOSE requires a lease on this fd; take it. */
3566 lli = ll_i2info(inode);
3567 mutex_lock(&lli->lli_och_mutex);
3568 if (fd->fd_lease_och != NULL) {
3569 och = fd->fd_lease_och;
3570 fd->fd_lease_och = NULL;
3572 mutex_unlock(&lli->lli_och_mutex);
3574 GOTO(out, rc = -ENOLCK);
3575 inode2 = file_inode(file2);
3576 rc = ll_swap_layouts_close(och, inode, inode2);
3578 rc = ll_swap_layouts(file, file2, &lsl);
3584 case LL_IOC_LOV_GETSTRIPE:
3585 case LL_IOC_LOV_GETSTRIPE_NEW:
3586 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3587 case FS_IOC_GETFLAGS:
3588 case FS_IOC_SETFLAGS:
3589 RETURN(ll_iocontrol(inode, file, cmd, arg));
3590 case FSFILT_IOC_GETVERSION:
3591 case FS_IOC_GETVERSION:
3592 RETURN(put_user(inode->i_generation, (int __user *)arg));
3593 /* We need to special case any other ioctls we want to handle,
3594 * to send them to the MDS/OST as appropriate and to properly
3595 * network encode the arg field. */
3596 case FS_IOC_SETVERSION:
3599 case LL_IOC_GROUP_LOCK:
3600 RETURN(ll_get_grouplock(inode, file, arg));
3601 case LL_IOC_GROUP_UNLOCK:
3602 RETURN(ll_put_grouplock(inode, file, arg));
3603 case IOC_OBD_STATFS:
3604 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3606 case LL_IOC_FLUSHCTX:
3607 RETURN(ll_flush_ctx(inode));
3608 case LL_IOC_PATH2FID: {
3609 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3610 sizeof(struct lu_fid)))
3615 case LL_IOC_GETPARENT:
3616 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3618 case OBD_IOC_FID2PATH:
3619 RETURN(ll_fid2path(inode, (void __user *)arg));
3620 case LL_IOC_DATA_VERSION: {
3621 struct ioc_data_version idv;
3624 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
/* Only the two flush flags are meaningful from userspace. */
3627 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3628 rc = ll_ioc_data_version(inode, &idv);
3631 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3637 case LL_IOC_GET_MDTIDX: {
3640 mdtidx = ll_get_mdt_idx(inode);
3644 if (put_user((int)mdtidx, (int __user *)arg))
3649 case OBD_IOC_GETDTNAME:
3650 case OBD_IOC_GETMDNAME:
3651 RETURN(ll_get_obd_name(inode, cmd, arg));
3652 case LL_IOC_HSM_STATE_GET: {
3653 struct md_op_data *op_data;
3654 struct hsm_user_state *hus;
3661 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3662 LUSTRE_OPC_ANY, hus);
3663 if (IS_ERR(op_data)) {
3665 RETURN(PTR_ERR(op_data));
3668 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3671 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3674 ll_finish_md_op_data(op_data);
3678 case LL_IOC_HSM_STATE_SET: {
3679 struct hsm_state_set *hss;
3686 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3691 rc = ll_hsm_state_set(inode, hss);
3696 case LL_IOC_HSM_ACTION: {
3697 struct md_op_data *op_data;
3698 struct hsm_current_action *hca;
3705 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3706 LUSTRE_OPC_ANY, hca);
3707 if (IS_ERR(op_data)) {
3709 RETURN(PTR_ERR(op_data));
3712 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3715 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3718 ll_finish_md_op_data(op_data);
3722 case LL_IOC_SET_LEASE_OLD: {
/* legacy interface: arg is the lease mode itself, no payload */
3723 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3725 RETURN(ll_file_set_lease(file, &ioc, 0));
3727 case LL_IOC_SET_LEASE: {
3728 struct ll_ioc_lease ioc;
3730 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3733 RETURN(ll_file_set_lease(file, &ioc, arg));
3735 case LL_IOC_GET_LEASE: {
3736 struct ll_inode_info *lli = ll_i2info(inode);
3737 struct ldlm_lock *lock = NULL;
3740 mutex_lock(&lli->lli_och_mutex);
3741 if (fd->fd_lease_och != NULL) {
3742 struct obd_client_handle *och = fd->fd_lease_och;
/* Report the mode only if the lease lock is still valid. */
3744 lock = ldlm_handle2lock(&och->och_lease_handle);
3746 lock_res_and_lock(lock);
3747 if (!ldlm_is_cancel(lock))
3748 fmode = och->och_flags;
3750 unlock_res_and_lock(lock);
3751 LDLM_LOCK_PUT(lock);
3754 mutex_unlock(&lli->lli_och_mutex);
3756 RETURN(ll_lease_type_from_fmode(fmode));
3758 case LL_IOC_HSM_IMPORT: {
3759 struct hsm_user_import *hui;
3765 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3770 rc = ll_hsm_import(inode, file, hui);
3775 case LL_IOC_FUTIMES_3: {
3776 struct ll_futimes_3 lfu;
3778 if (copy_from_user(&lfu,
3779 (const struct ll_futimes_3 __user *)arg,
3783 RETURN(ll_file_futimes_3(file, &lfu));
3785 case LL_IOC_LADVISE: {
3786 struct llapi_ladvise_hdr *k_ladvise_hdr;
3787 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3790 int alloc_size = sizeof(*k_ladvise_hdr);
/* Two-pass copy: read the fixed header first to learn lah_count,
 * then re-allocate and copy header + advices in one go. */
3793 u_ladvise_hdr = (void __user *)arg;
3794 OBD_ALLOC_PTR(k_ladvise_hdr);
3795 if (k_ladvise_hdr == NULL)
3798 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3799 GOTO(out_ladvise, rc = -EFAULT);
3801 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3802 k_ladvise_hdr->lah_count < 1)
3803 GOTO(out_ladvise, rc = -EINVAL);
3805 num_advise = k_ladvise_hdr->lah_count;
3806 if (num_advise >= LAH_COUNT_MAX)
3807 GOTO(out_ladvise, rc = -EFBIG);
3809 OBD_FREE_PTR(k_ladvise_hdr);
3810 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3811 lah_advise[num_advise]);
3812 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3813 if (k_ladvise_hdr == NULL)
3817 * TODO: submit multiple advices to one server in a single RPC
3819 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3820 GOTO(out_ladvise, rc = -EFAULT);
3822 for (i = 0; i < num_advise; i++) {
3823 struct llapi_lu_ladvise *k_ladvise =
3824 &k_ladvise_hdr->lah_advise[i];
3825 struct llapi_lu_ladvise __user *u_ladvise =
3826 &u_ladvise_hdr->lah_advise[i];
3828 rc = ll_ladvise_sanity(inode, k_ladvise);
3830 GOTO(out_ladvise, rc);
3832 switch (k_ladvise->lla_advice) {
3833 case LU_LADVISE_LOCKNOEXPAND:
3834 rc = ll_lock_noexpand(file,
3835 k_ladvise->lla_peradvice_flags);
3836 GOTO(out_ladvise, rc);
3837 case LU_LADVISE_LOCKAHEAD:
3839 rc = ll_file_lock_ahead(file, k_ladvise);
3842 GOTO(out_ladvise, rc);
3845 &u_ladvise->lla_lockahead_result))
3846 GOTO(out_ladvise, rc = -EFAULT);
3849 rc = ll_ladvise(inode, file,
3850 k_ladvise_hdr->lah_flags,
3853 GOTO(out_ladvise, rc);
3860 OBD_FREE(k_ladvise_hdr, alloc_size);
3863 case LL_IOC_FLR_SET_MIRROR: {
3864 /* mirror I/O must be direct to avoid polluting page cache
3866 if (!(file->f_flags & O_DIRECT))
3869 fd->fd_designated_mirror = (__u32)arg;
3872 case LL_IOC_FSGETXATTR:
3873 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3874 case LL_IOC_FSSETXATTR:
3875 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3877 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3878 case LL_IOC_HEAT_GET: {
3879 struct lu_heat uheat;
3880 struct lu_heat *heat;
3883 if (copy_from_user(&uheat, (void __user *)arg, sizeof(uheat)))
/* Clamp the requested count to what the client tracks. */
3886 if (uheat.lh_count > OBD_HEAT_COUNT)
3887 uheat.lh_count = OBD_HEAT_COUNT;
3889 size = offsetof(typeof(uheat), lh_heat[uheat.lh_count]);
3890 OBD_ALLOC(heat, size);
3894 heat->lh_count = uheat.lh_count;
3895 ll_heat_get(inode, heat);
3896 rc = copy_to_user((char __user *)arg, heat, size);
3897 OBD_FREE(heat, size);
3898 RETURN(rc ? -EFAULT : 0);
3900 case LL_IOC_HEAT_SET: {
3903 if (copy_from_user(&flags, (void __user *)arg, sizeof(flags)))
3906 rc = ll_heat_set(inode, flags);
3909 case LL_IOC_PCC_DETACH: {
3910 struct lu_pcc_detach *detach;
3912 OBD_ALLOC_PTR(detach);
3916 if (copy_from_user(detach,
3917 (const struct lu_pcc_detach __user *)arg,
3919 GOTO(out_detach_free, rc = -EFAULT);
3921 if (!S_ISREG(inode->i_mode))
3922 GOTO(out_detach_free, rc = -EINVAL);
3924 if (!inode_owner_or_capable(inode))
3925 GOTO(out_detach_free, rc = -EPERM);
3927 rc = pcc_ioctl_detach(inode, detach->pccd_opt);
3929 OBD_FREE_PTR(detach);
3932 case LL_IOC_PCC_STATE: {
3933 struct lu_pcc_state __user *ustate =
3934 (struct lu_pcc_state __user *)arg;
3935 struct lu_pcc_state *state;
3937 OBD_ALLOC_PTR(state);
3941 if (copy_from_user(state, ustate, sizeof(*state)))
3942 GOTO(out_state, rc = -EFAULT);
3944 rc = pcc_ioctl_state(file, inode, state);
3946 GOTO(out_state, rc);
3948 if (copy_to_user(ustate, state, sizeof(*state)))
3949 GOTO(out_state, rc = -EFAULT);
3952 OBD_FREE_PTR(state);
/* default: forward unrecognized commands to the data export */
3956 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3957 (void __user *)arg));
3961 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Compat copy of the kernel's llseek helper (only built when the
 * kernel lacks generic_file_llseek_size): validate the new offset
 * and commit it to f_pos, resetting f_version on a real move.
 */
3962 static inline loff_t
3963 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3965 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3967 if (offset > maxsize)
3970 if (offset != file->f_pos) {
3971 file->f_pos = offset;
3972 file->f_version = 0;
/*
 * Compat copy of the kernel's generic_file_llseek_size(): seek within
 * [0, maxsize] honoring SEEK_SET/CUR/END plus SEEK_DATA/SEEK_HOLE
 * relative to the supplied eof.
 */
3978 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3979 loff_t maxsize, loff_t eof)
3981 struct inode *inode = file_inode(file);
3989 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3990 * position-querying operation. Avoid rewriting the "same"
3991 * f_pos value back to the file because a concurrent read(),
3992 * write() or lseek() might have altered it
3997 * f_lock protects against read/modify/write race with other
3998 * SEEK_CURs. Note that parallel writes and reads behave
4002 offset = llseek_execute(file, file->f_pos + offset, maxsize);
4003 inode_unlock(inode);
4007 * In the generic case the entire file is data, so as long as
4008 * offset isn't at the end of the file then the offset is data.
4015 * There is a virtual hole at the end of the file, so as long as
4016 * offset isn't i_size or larger, return i_size.
4024 return llseek_execute(file, offset, maxsize);
/*
 * llseek() hook. For SEEK_END/SEEK_HOLE/SEEK_DATA a glimpse RPC is
 * issued first so i_size is current before the seek is resolved.
 */
4028 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
4030 struct inode *inode = file_inode(file);
4031 loff_t retval, eof = 0;
/* target position for tracing only; recomputed below by the helper */
4034 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
4035 (origin == SEEK_CUR) ? file->f_pos : 0);
4036 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
4037 PFID(ll_inode2fid(inode)), inode, retval, retval,
4039 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
/* these origins need an up-to-date size from the OSTs */
4041 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
4042 retval = ll_glimpse_size(inode);
4045 eof = i_size_read(inode);
4048 retval = ll_generic_file_llseek_size(file, offset, origin,
4049 ll_file_maxbytes(inode), eof);
/*
 * flush() hook, called on every close(2) of the file descriptor.
 * Reports (and clears) any async writeback error recorded on the
 * inode, unless the application was already told about the failure.
 */
4053 static int ll_flush(struct file *file, fl_owner_t id)
4055 struct inode *inode = file_inode(file)
4056 struct ll_inode_info *lli = ll_i2info(inode);
4057 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
4060 LASSERT(!S_ISDIR(inode->i_mode));
4062 /* catch async errors that were recorded back when async writeback
4063 * failed for pages in this mapping. */
4064 rc = lli->lli_async_rc;
4065 lli->lli_async_rc = 0;
4066 if (lli->lli_clob != NULL) {
4067 err = lov_read_and_clear_async_rc(lli->lli_clob);
4072 /* The application has been told write failure already.
4073 * Do not report failure again. */
4074 if (fd->fd_write_failed)
/* any recorded async error is reported to userspace as EIO */
4076 return rc ? -EIO : 0;
/**
 * Called to make sure a portion of file has been written out.
 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
 *
 * \param inode         file to sync
 * \param start, end    byte range to sync (inclusive)
 * \param mode          one of the CL_FSYNC_* modes validated below
 * \param ignore_layout set to bypass the layout check in the cl_io
 *
 * Return how many pages have been written (fi_nr_written) on success,
 * negative errno on failure.
 */
4080 * Called to make sure a portion of file has been written out.
4081 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
4083 * Return how many pages have been written.
4085 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
4086 enum cl_fsync_mode mode, int ignore_layout)
4090 struct cl_fsync_io *fio;
/* reject any mode outside the supported set */
4095 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
4096 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
4099 env = cl_env_get(&refcheck);
4101 RETURN(PTR_ERR(env));
4103 io = vvp_env_thread_io(env);
4104 io->ci_obj = ll_i2info(inode)->lli_clob;
4105 io->ci_ignore_layout = ignore_layout;
4107 /* initialize parameters for sync */
4108 fio = &io->u.ci_fsync;
4109 fio->fi_start = start;
4111 fio->fi_fid = ll_inode2fid(inode);
4112 fio->fi_mode = mode;
4113 fio->fi_nr_written = 0;
4115 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
4116 result = cl_io_loop(env, io);
4118 result = io->ci_result;
/* on success report the number of pages actually written */
4120 result = fio->fi_nr_written;
4121 cl_io_fini(env, io);
4122 cl_env_put(env, &refcheck);
/*
 * fsync() hook: flush dirty pages locally, sync metadata on the MDT,
 * then sync cached data (via PCC if attached, else through the cl_io
 * stack). fd_write_failed is updated so ll_flush() does not report the
 * same failure twice.
 */
4128 * When dentry is provided (the 'else' case), file_dentry() may be
4129 * null and dentry must be used directly rather than pulled from
4130 * file_dentry() as is done otherwise.
4133 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
4135 struct dentry *dentry = file_dentry(file);
4136 struct inode *inode = dentry->d_inode;
4137 struct ll_inode_info *lli = ll_i2info(inode);
4138 struct ptlrpc_request *req;
4143 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), start %lld, end %lld,"
4145 PFID(ll_inode2fid(inode)), inode, start, end, datasync);
4147 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
4149 /* fsync's caller has already called _fdata{sync,write}, we want
4150 * that IO to finish before calling the osc and mdc sync methods */
4151 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
4154 /* catch async errors that were recorded back when async writeback
4155 * failed for pages in this mapping. */
4156 if (!S_ISDIR(inode->i_mode)) {
4157 err = lli->lli_async_rc;
4158 lli->lli_async_rc = 0;
4161 if (lli->lli_clob != NULL) {
4162 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* sync metadata on the MDT */
4168 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
4172 ptlrpc_req_finished(req);
4174 if (S_ISREG(inode->i_mode)) {
4175 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
4178 /* Sync metadata on MDT first, and then sync the cached data
4181 err = pcc_fsync(file, start, end, datasync, &cached);
/* not PCC-cached: sync the range through the cl_io stack */
4183 err = cl_sync_file_range(inode, start, end,
4185 if (rc == 0 && err < 0)
4188 fd->fd_write_failed = true;
4190 fd->fd_write_failed = false;
4193 inode_unlock(inode);
/*
 * flock()/fcntl() lock hook. Translates a kernel file_lock into an
 * LDLM_FLOCK enqueue to the MDS, then mirrors the granted/released
 * state into the local VFS lock tables. If the local bookkeeping
 * fails after a successful server enqueue, the server-side lock is
 * rolled back by re-enqueueing with LCK_NL (unlock).
 */
4198 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
4200 struct inode *inode = file_inode(file);
4201 struct ll_sb_info *sbi = ll_i2sbi(inode);
4202 struct ldlm_enqueue_info einfo = {
4203 .ei_type = LDLM_FLOCK,
4204 .ei_cb_cp = ldlm_flock_completion_ast,
4205 .ei_cbdata = file_lock,
4207 struct md_op_data *op_data;
4208 struct lustre_handle lockh = { 0 };
4209 union ldlm_policy_data flock = { { 0 } };
4210 int fl_type = file_lock->fl_type;
4216 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
4217 PFID(ll_inode2fid(inode)), file_lock);
4219 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
4221 if (file_lock->fl_flags & FL_FLOCK) {
4222 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
4223 /* flocks are whole-file locks */
4224 flock.l_flock.end = OFFSET_MAX;
4225 /* For flocks owner is determined by the local file desctiptor*/
4226 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
4227 } else if (file_lock->fl_flags & FL_POSIX) {
4228 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
4229 flock.l_flock.start = file_lock->fl_start;
4230 flock.l_flock.end = file_lock->fl_end;
4234 flock.l_flock.pid = file_lock->fl_pid;
4236 #if defined(HAVE_LM_COMPARE_OWNER) || defined(lm_compare_owner)
4237 /* Somewhat ugly workaround for svc lockd.
4238 * lockd installs custom fl_lmops->lm_compare_owner that checks
4239 * for the fl_owner to be the same (which it always is on local node
4240 * I guess between lockd processes) and then compares pid.
4241 * As such we assign pid to the owner field to make it all work,
4242 * conflict with normal locks is unlikely since pid space and
4243 * pointer space for current->files are not intersecting */
4244 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
4245 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* map fcntl lock types to LDLM modes (read -> PR, write -> PW) */
4250 einfo.ei_mode = LCK_PR;
4253 /* An unlock request may or may not have any relation to
4254 * existing locks so we may not be able to pass a lock handle
4255 * via a normal ldlm_lock_cancel() request. The request may even
4256 * unlock a byte range in the middle of an existing lock. In
4257 * order to process an unlock request we need all of the same
4258 * information that is given with a normal read or write record
4259 * lock request. To avoid creating another ldlm unlock (cancel)
4260 * message we'll treat a LCK_NL flock request as an unlock. */
4261 einfo.ei_mode = LCK_NL;
4264 einfo.ei_mode = LCK_PW;
4267 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
4282 flags = LDLM_FL_BLOCK_NOWAIT;
4288 flags = LDLM_FL_TEST_LOCK;
4291 CERROR("unknown fcntl lock command: %d\n", cmd);
4295 /* Save the old mode so that if the mode in the lock changes we
4296 * can decrement the appropriate reader or writer refcount. */
4297 file_lock->fl_type = einfo.ei_mode;
4299 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4300 LUSTRE_OPC_ANY, NULL);
4301 if (IS_ERR(op_data))
4302 RETURN(PTR_ERR(op_data));
4304 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4305 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4306 flock.l_flock.pid, flags, einfo.ei_mode,
4307 flock.l_flock.start, flock.l_flock.end);
/* enqueue the flock lock on the MDS */
4309 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4312 /* Restore the file lock type if not TEST lock. */
4313 if (!(flags & LDLM_FL_TEST_LOCK))
4314 file_lock->fl_type = fl_type;
4316 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4317 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4318 !(flags & LDLM_FL_TEST_LOCK))
4319 rc2 = locks_lock_file_wait(file, file_lock);
4321 if ((file_lock->fl_flags & FL_FLOCK) &&
4322 (rc == 0 || file_lock->fl_type == F_UNLCK))
4323 rc2 = flock_lock_file_wait(file, file_lock);
4324 if ((file_lock->fl_flags & FL_POSIX) &&
4325 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4326 !(flags & LDLM_FL_TEST_LOCK))
4327 rc2 = posix_lock_file_wait(file, file_lock);
4328 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
/* local bookkeeping failed: undo the server lock with an NL enqueue */
4330 if (rc2 && file_lock->fl_type != F_UNLCK) {
4331 einfo.ei_mode = LCK_NL;
4332 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4337 ll_finish_md_op_data(op_data);
/*
 * Look up the FID of @name under @parent via an MDS getattr-by-name.
 * On success *fid is filled; if @inode is non-NULL the child inode is
 * also instantiated from the reply.
 */
4342 int ll_get_fid_by_name(struct inode *parent, const char *name,
4343 int namelen, struct lu_fid *fid,
4344 struct inode **inode)
4346 struct md_op_data *op_data = NULL;
4347 struct mdt_body *body;
4348 struct ptlrpc_request *req;
4352 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4353 LUSTRE_OPC_ANY, NULL);
4354 if (IS_ERR(op_data))
4355 RETURN(PTR_ERR(op_data));
/* only FID and type are needed from the reply */
4357 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4358 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4359 ll_finish_md_op_data(op_data);
4363 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4365 GOTO(out_req, rc = -EFAULT);
4367 *fid = body->mbo_fid1;
4370 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4372 ptlrpc_req_finished(req);
/*
 * Migrate @name under @parent to another MDT, per the layout in @lum.
 * For regular files a write lease is taken first and the data version
 * recorded so the server can detect concurrent modification; the
 * migration itself is issued as a rename-to-self with CLI_MIGRATE.
 * On -EAGAIN (lease cancelled) the whole operation is retried.
 */
4376 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4379 struct dentry *dchild = NULL;
4380 struct inode *child_inode = NULL;
4381 struct md_op_data *op_data;
4382 struct ptlrpc_request *request = NULL;
4383 struct obd_client_handle *och = NULL;
4385 struct mdt_body *body;
4386 __u64 data_version = 0;
4387 size_t namelen = strlen(name);
4388 int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4392 CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4393 PFID(ll_inode2fid(parent)), name,
4394 lum->lum_stripe_offset, lum->lum_stripe_count);
/* normalize byte order of the user-supplied lmv_user_md */
4396 if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4397 lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4398 lustre_swab_lmv_user_md(lum);
4400 /* Get child FID first */
4401 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4404 dchild = d_lookup(file_dentry(file), &qstr);
4406 if (dchild->d_inode)
4407 child_inode = igrab(dchild->d_inode);
/* dcache miss: resolve the child via an MDS lookup instead */
4412 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
4421 if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
4422 OBD_CONNECT2_DIR_MIGRATE)) {
4423 if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
4424 ll_dir_striped(child_inode)) {
4425 CERROR("%s: MDT doesn't support stripe directory "
4426 "migration!\n", ll_i2sbi(parent)->ll_fsname);
4427 GOTO(out_iput, rc = -EOPNOTSUPP);
4432 * lfs migrate command needs to be blocked on the client
4433 * by checking the migrate FID against the FID of the
4436 if (child_inode == parent->i_sb->s_root->d_inode)
4437 GOTO(out_iput, rc = -EINVAL);
4439 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4440 child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4441 if (IS_ERR(op_data))
4442 GOTO(out_iput, rc = PTR_ERR(op_data));
4444 inode_lock(child_inode);
4445 op_data->op_fid3 = *ll_inode2fid(child_inode);
4446 if (!fid_is_sane(&op_data->op_fid3)) {
4447 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4448 ll_i2sbi(parent)->ll_fsname, name,
4449 PFID(&op_data->op_fid3));
4450 GOTO(out_unlock, rc = -EINVAL);
4453 op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4454 op_data->op_data = lum;
4455 op_data->op_data_size = lumlen;
/* regular file: hold a write lease for the duration of the migrate */
4458 if (S_ISREG(child_inode->i_mode)) {
4459 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4463 GOTO(out_unlock, rc);
4466 rc = ll_data_version(child_inode, &data_version,
4469 GOTO(out_close, rc);
4471 op_data->op_open_handle = och->och_open_handle;
4472 op_data->op_data_version = data_version;
4473 op_data->op_lease_handle = och->och_lease_handle;
4474 op_data->op_bias |= MDS_CLOSE_MIGRATE;
/* the open request must not be replayed during migration */
4476 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4477 och->och_mod->mod_open_req->rq_replay = 0;
4478 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
/* migration is issued as a rename of the entry onto itself */
4481 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4482 name, namelen, &request);
4484 LASSERT(request != NULL);
4485 ll_update_times(request, parent);
4488 if (rc == 0 || rc == -EAGAIN) {
4489 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4490 LASSERT(body != NULL);
4492 /* If the server does release layout lock, then we cleanup
4493 * the client och here, otherwise release it in out_close: */
4494 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4495 obd_mod_put(och->och_mod);
4496 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4498 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
4504 if (request != NULL) {
4505 ptlrpc_req_finished(request);
4509 /* Try again if the lease has cancelled. */
4510 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4515 ll_lease_close(och, child_inode, NULL);
4517 clear_nlink(child_inode);
4519 inode_unlock(child_inode);
4520 ll_finish_md_op_data(op_data);
/*
 * .flock/.lock hook for -o noflock mounts: always refuses locking,
 * emitting a rate-limited hint once per file descriptor.
 */
4527 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4529 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
4533 * In order to avoid flood of warning messages, only print one message
4534 * for one file. And the entire message rate on the client is limited
4535 * by CDEBUG_LIMIT too.
4537 if (!(fd->fd_flags & LL_FILE_FLOCK_WARNING)) {
4538 fd->fd_flags |= LL_FILE_FLOCK_WARNING;
4539 CDEBUG_LIMIT(D_TTY | D_CONSOLE,
4540 "flock disabled, mount with '-o [local]flock' to enable\r\n");
/*
 * test if some locks matching bits and l_req_mode are acquired
 * - bits can be in different locks
 * - if found clear the common lock bits in *bits
 * - the bits not found, are kept in *bits
 *
 * \param bits [IN/OUT] searched lock bits; matched bits are cleared
 * \param l_req_mode [IN] searched lock mode (LCK_MINMODE = any of
 *        CR/CW/PR/PW)
 * \retval boolean, true iff all bits are found
 */
4546 * test if some locks matching bits and l_req_mode are acquired
4547 * - bits can be in different locks
4548 * - if found clear the common lock bits in *bits
4549 * - the bits not found, are kept in *bits
4551 * \param bits [IN] searched lock bits [IN]
4552 * \param l_req_mode [IN] searched lock mode
4553 * \retval boolean, true iff all bits are found
4555 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4557 struct lustre_handle lockh;
4558 union ldlm_policy_data policy;
4559 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4560 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4569 fid = &ll_i2info(inode)->lli_fid;
4570 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4571 ldlm_lockname[mode]);
/* TEST_LOCK: only check for a match, do not take a reference */
4573 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4574 for (i = 0; i < MDS_INODELOCK_NUMBITS && *bits != 0; i++) {
/* probe one inodebit at a time */
4575 policy.l_inodebits.bits = *bits & (1 << i);
4576 if (policy.l_inodebits.bits == 0)
4579 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4580 &policy, mode, &lockh)) {
4581 struct ldlm_lock *lock;
4583 lock = ldlm_handle2lock(&lockh);
4586 ~(lock->l_policy_data.l_inodebits.bits);
4587 LDLM_LOCK_PUT(lock);
4589 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match (and take a reference on) an MD lock on @inode covering
 * @bits with at least mode @mode; the matched handle is returned in
 * @lockh. Returns the matched mode, or 0 if no lock matched.
 */
4596 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4597 struct lustre_handle *lockh, __u64 flags,
4598 enum ldlm_mode mode)
4600 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4605 fid = &ll_i2info(inode)->lli_fid;
4606 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4608 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4609 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the result of an inode revalidation: -ENOENT on a
 * plain file/dir means the object was unlinked and is handled
 * specially; other errors are logged (quietly for EACCES/EIDRM).
 */
4614 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4616 /* Already unlinked. Just update nlink and return success */
4617 if (rc == -ENOENT) {
4619 /* If it is striped directory, and there is bad stripe
4620 * Let's revalidate the dentry again, instead of returning
4622 if (ll_dir_striped(inode))
4625 /* This path cannot be hit for regular files unless in
4626 * case of obscure races, so no need to to validate
4628 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4630 } else if (rc != 0) {
4631 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4632 "%s: revalidate FID "DFID" error: rc = %d\n",
4633 ll_i2sbi(inode)->ll_fsname,
4634 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate @dentry's inode against the MDS using an intent lock
 * (getattr-by-FID, no name). On success, finishes the intent and
 * invalidates the dentry if the object was unlinked meanwhile.
 */
4640 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4642 struct inode *inode = dentry->d_inode;
4643 struct obd_export *exp = ll_i2mdexp(inode);
4644 struct lookup_intent oit = {
4647 struct ptlrpc_request *req = NULL;
4648 struct md_op_data *op_data;
4652 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4653 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4655 /* Call getattr by fid, so do not provide name at all. */
4656 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4657 LUSTRE_OPC_ANY, NULL);
4658 if (IS_ERR(op_data))
4659 RETURN(PTR_ERR(op_data));
4661 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4662 ll_finish_md_op_data(op_data);
4664 rc = ll_inode_revalidate_fini(inode, rc);
4668 rc = ll_revalidate_it_finish(req, &oit, dentry);
4670 ll_intent_release(&oit);
4674 /* Unlinked? Unhash dentry, so it is not picked up later by
4675 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4676 * here to preserve get_cwd functionality on 2.6.
4678 if (!dentry->d_inode->i_nlink) {
4679 spin_lock(&inode->i_lock);
4680 d_lustre_invalidate(dentry, 0);
4681 spin_unlock(&inode->i_lock);
4684 ll_lookup_finish_locks(&oit, dentry);
4686 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge attributes (nlink, blocks, size,
 * a/m/ctime) from all stripes via md_merge_attr() and apply them to
 * the inode. No-op for non-striped directories.
 */
4691 static int ll_merge_md_attr(struct inode *inode)
4693 struct ll_inode_info *lli = ll_i2info(inode);
4694 struct cl_attr attr = { 0 };
4697 LASSERT(lli->lli_lsm_md != NULL);
4699 if (!lmv_dir_striped(lli->lli_lsm_md))
/* lli_lsm_sem guards lli_lsm_md while the merge runs */
4702 down_read(&lli->lli_lsm_sem);
4703 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4704 &attr, ll_md_blocking_ast);
4705 up_read(&lli->lli_lsm_sem);
4709 set_nlink(inode, attr.cat_nlink);
4710 inode->i_blocks = attr.cat_blocks;
4711 i_size_write(inode, attr.cat_size);
4713 ll_i2info(inode)->lli_atime = attr.cat_atime;
4714 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4715 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/*
 * Core of the getattr() hook: revalidate the inode with the MDS,
 * refresh size (PCC attributes or a glimpse for regular files, merged
 * stripe attributes for striped dirs), then fill *stat from the inode.
 */
4720 int ll_getattr_dentry(struct dentry *de, struct kstat *stat)
4722 struct inode *inode = de->d_inode;
4723 struct ll_sb_info *sbi = ll_i2sbi(inode);
4724 struct ll_inode_info *lli = ll_i2info(inode);
4727 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4729 rc = ll_inode_revalidate(de, IT_GETATTR);
4733 if (S_ISREG(inode->i_mode)) {
/* PCC-cached files get attributes from the PCC copy */
4736 rc = pcc_inode_getattr(inode, &cached);
4737 if (cached && rc < 0)
4740 /* In case of restore, the MDT has the right size and has
4741 * already send it back without granting the layout lock,
4742 * inode is up-to-date so glimpse is useless.
4743 * Also to glimpse we need the layout, in case of a running
4744 * restore the MDT holds the layout lock so the glimpse will
4745 * block up to the end of restore (getattr will block)
4747 if (!cached && !ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4748 rc = ll_glimpse_size(inode);
4753 /* If object isn't regular a file then don't validate size. */
4754 if (ll_dir_striped(inode)) {
4755 rc = ll_merge_md_attr(inode);
4760 inode->i_atime.tv_sec = lli->lli_atime;
4761 inode->i_mtime.tv_sec = lli->lli_mtime;
4762 inode->i_ctime.tv_sec = lli->lli_ctime;
/* fault-injection point for testing delayed getattr */
4765 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4767 if (ll_need_32bit_api(sbi)) {
4768 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4769 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4770 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4772 stat->ino = inode->i_ino;
4773 stat->dev = inode->i_sb->s_dev;
4774 stat->rdev = inode->i_rdev;
4777 stat->mode = inode->i_mode;
4778 stat->uid = inode->i_uid;
4779 stat->gid = inode->i_gid;
4780 stat->atime = inode->i_atime;
4781 stat->mtime = inode->i_mtime;
4782 stat->ctime = inode->i_ctime;
4783 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4785 stat->nlink = inode->i_nlink;
4786 stat->size = i_size_read(inode);
4787 stat->blocks = inode->i_blocks;
/*
 * getattr() wrapper adapting to the kernel API variant: enhanced
 * (path/request_mask/flags) or legacy (vfsmount/dentry). Both forward
 * to ll_getattr_dentry().
 */
4792 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4793 int ll_getattr(const struct path *path, struct kstat *stat,
4794 u32 request_mask, unsigned int flags)
4796 struct dentry *de = path->dentry;
4798 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4801 return ll_getattr_dentry(de, stat);
/*
 * fiemap() hook: marshal fieinfo into a struct fiemap sized for
 * fi_extents_max extents, run ll_do_fiemap(), and copy the mapped
 * extents back to the user buffer.
 */
4804 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4805 __u64 start, __u64 len)
4809 struct fiemap *fiemap;
4810 unsigned int extent_count = fieinfo->fi_extents_max;
4812 num_bytes = sizeof(*fiemap) + (extent_count *
4813 sizeof(struct fiemap_extent));
4814 OBD_ALLOC_LARGE(fiemap, num_bytes);
4819 fiemap->fm_flags = fieinfo->fi_flags;
4820 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4821 fiemap->fm_start = start;
4822 fiemap->fm_length = len;
/* seed with the first user extent (continuation support) */
4823 if (extent_count > 0 &&
4824 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4825 sizeof(struct fiemap_extent)) != 0)
4826 GOTO(out, rc = -EFAULT);
4828 rc = ll_do_fiemap(inode, fiemap, num_bytes);
4830 fieinfo->fi_flags = fiemap->fm_flags;
4831 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4832 if (extent_count > 0 &&
4833 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4834 fiemap->fm_mapped_extents *
4835 sizeof(struct fiemap_extent)) != 0)
4836 GOTO(out, rc = -EFAULT);
4838 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * get_acl() hook: return a referenced copy of the cached POSIX ACL
 * under lli_lock. The VFS releases the reference after the check.
 */
4842 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4844 struct ll_inode_info *lli = ll_i2info(inode);
4845 struct posix_acl *acl = NULL;
4848 spin_lock(&lli->lli_lock);
4849 /* VFS' acl_permission_check->check_acl will release the refcount */
4850 acl = posix_acl_dup(lli->lli_posix_acl);
4851 spin_unlock(&lli->lli_lock);
/*
 * set_acl() hook: serialize the ACL to its xattr representation and
 * store it on the MDS via md_setxattr(); a NULL/empty ACL removes the
 * xattr. Local caches are updated afterwards.
 */
4856 #ifdef HAVE_IOP_SET_ACL
4857 #ifdef CONFIG_LUSTRE_FS_POSIX_ACL
4858 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4860 struct ll_sb_info *sbi = ll_i2sbi(inode);
4861 struct ptlrpc_request *req = NULL;
4862 const char *name = NULL;
4864 size_t value_size = 0;
4869 case ACL_TYPE_ACCESS:
4870 name = XATTR_NAME_POSIX_ACL_ACCESS;
/* ACCESS ACLs may also change the file mode bits */
4872 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4875 case ACL_TYPE_DEFAULT:
4876 name = XATTR_NAME_POSIX_ACL_DEFAULT;
/* default ACLs only make sense on directories */
4877 if (!S_ISDIR(inode->i_mode))
4878 rc = acl ? -EACCES : 0;
4889 value_size = posix_acl_xattr_size(acl->a_count);
4890 value = kmalloc(value_size, GFP_NOFS);
4892 GOTO(out, rc = -ENOMEM);
4894 rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
4896 GOTO(out_value, rc);
4899 rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4900 value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
4901 name, value, value_size, 0, 0, &req);
4903 ptlrpc_req_finished(req);
4908 forget_cached_acl(inode, type);
4910 set_cached_acl(inode, type, acl);
4913 #endif /* CONFIG_LUSTRE_FS_POSIX_ACL */
4914 #endif /* HAVE_IOP_SET_ACL */
/*
 * permission() hook: revalidates the root inode when needed, applies
 * root-squash (temporarily overriding fsuid/fsgid and dropping FS
 * capabilities for squashed root), then defers to generic_permission().
 */
4916 int ll_inode_permission(struct inode *inode, int mask)
4919 struct ll_sb_info *sbi;
4920 struct root_squash_info *squash;
4921 struct cred *cred = NULL;
4922 const struct cred *old_cred = NULL;
4924 bool squash_id = false;
/* RCU-walk mode: cannot block here, ask VFS to retry in ref-walk */
4927 if (mask & MAY_NOT_BLOCK)
4930 /* as root inode are NOT getting validated in lookup operation,
4931 * need to do it before permission check. */
4933 if (inode == inode->i_sb->s_root->d_inode) {
4934 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4939 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4940 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4942 /* squash fsuid/fsgid if needed */
4943 sbi = ll_i2sbi(inode);
4944 squash = &sbi->ll_squash;
4945 if (unlikely(squash->rsi_uid != 0 &&
4946 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4947 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4951 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4952 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4953 squash->rsi_uid, squash->rsi_gid);
4955 /* update current process's credentials
4956 * and FS capability */
4957 cred = prepare_creds();
4961 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4962 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* squashed root loses all filesystem-related capabilities */
4963 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4964 if ((1 << cap) & CFS_CAP_FS_MASK)
4965 cap_lower(cred->cap_effective, cap);
4967 old_cred = override_creds(cred);
4970 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4971 rc = generic_permission(inode, mask);
4972 /* restore current process's credentials and FS capability */
4974 revert_creds(old_cred);
/* Default file operations (no .flock/.lock entries): used when flock
 * support was not requested at mount time.
 * -o localflock - only provides locally consistent flock locks */
4981 /* -o localflock - only provides locally consistent flock locks */
4982 struct file_operations ll_file_operations = {
4983 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4984 # ifdef HAVE_SYNC_READ_WRITE
4985 .read = new_sync_read,
4986 .write = new_sync_write,
4988 .read_iter = ll_file_read_iter,
4989 .write_iter = ll_file_write_iter,
4990 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4991 .read = ll_file_read,
4992 .aio_read = ll_file_aio_read,
4993 .write = ll_file_write,
4994 .aio_write = ll_file_aio_write,
4995 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4996 .unlocked_ioctl = ll_file_ioctl,
4997 .open = ll_file_open,
4998 .release = ll_file_release,
4999 .mmap = ll_file_mmap,
5000 .llseek = ll_file_seek,
5001 .splice_read = ll_file_splice_read,
/* File operations for -o flock mounts: same as the default table but
 * with .flock/.lock wired to ll_file_flock for cluster-wide locking. */
5006 struct file_operations ll_file_operations_flock = {
5007 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
5008 # ifdef HAVE_SYNC_READ_WRITE
5009 .read = new_sync_read,
5010 .write = new_sync_write,
5011 # endif /* HAVE_SYNC_READ_WRITE */
5012 .read_iter = ll_file_read_iter,
5013 .write_iter = ll_file_write_iter,
5014 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5015 .read = ll_file_read,
5016 .aio_read = ll_file_aio_read,
5017 .write = ll_file_write,
5018 .aio_write = ll_file_aio_write,
5019 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5020 .unlocked_ioctl = ll_file_ioctl,
5021 .open = ll_file_open,
5022 .release = ll_file_release,
5023 .mmap = ll_file_mmap,
5024 .llseek = ll_file_seek,
5025 .splice_read = ll_file_splice_read,
5028 .flock = ll_file_flock,
5029 .lock = ll_file_flock
/* File operations for -o noflock mounts: .flock/.lock wired to
 * ll_file_noflock, which refuses all lock requests. */
5032 /* These are for -o noflock - to return ENOSYS on flock calls */
5033 struct file_operations ll_file_operations_noflock = {
5034 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
5035 # ifdef HAVE_SYNC_READ_WRITE
5036 .read = new_sync_read,
5037 .write = new_sync_write,
5038 # endif /* HAVE_SYNC_READ_WRITE */
5039 .read_iter = ll_file_read_iter,
5040 .write_iter = ll_file_write_iter,
5041 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5042 .read = ll_file_read,
5043 .aio_read = ll_file_aio_read,
5044 .write = ll_file_write,
5045 .aio_write = ll_file_aio_write,
5046 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5047 .unlocked_ioctl = ll_file_ioctl,
5048 .open = ll_file_open,
5049 .release = ll_file_release,
5050 .mmap = ll_file_mmap,
5051 .llseek = ll_file_seek,
5052 .splice_read = ll_file_splice_read,
5055 .flock = ll_file_noflock,
5056 .lock = ll_file_noflock
/* Inode operations for regular files; xattr and ACL entries depend on
 * kernel API availability. */
5059 struct inode_operations ll_file_inode_operations = {
5060 .setattr = ll_setattr,
5061 .getattr = ll_getattr,
5062 .permission = ll_inode_permission,
5063 #ifdef HAVE_IOP_XATTR
5064 .setxattr = ll_setxattr,
5065 .getxattr = ll_getxattr,
5066 .removexattr = ll_removexattr,
5068 .listxattr = ll_listxattr,
5069 .fiemap = ll_fiemap,
5070 #ifdef HAVE_IOP_GET_ACL
5071 .get_acl = ll_get_acl,
5073 #ifdef HAVE_IOP_SET_ACL
5074 .set_acl = ll_set_acl,
/*
 * Push a layout configuration into the cl_object stack via
 * cl_conf_set(). For OBJECT_CONF_SET, the layout lock is allowed to
 * match only after the layout is applied, and the cached layout
 * generation is updated.
 */
5078 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
5080 struct ll_inode_info *lli = ll_i2info(inode);
5081 struct cl_object *obj = lli->lli_clob;
5090 env = cl_env_get(&refcheck);
5092 RETURN(PTR_ERR(env));
5094 rc = cl_conf_set(env, lli->lli_clob, conf);
5098 if (conf->coc_opc == OBJECT_CONF_SET) {
5099 struct ldlm_lock *lock = conf->coc_lock;
5100 struct cl_layout cl = {
5104 LASSERT(lock != NULL);
5105 LASSERT(ldlm_has_layout(lock));
5107 /* it can only be allowed to match after layout is
5108 * applied to inode otherwise false layout would be
5109 * seen. Applying layout shoud happen before dropping
5110 * the intent lock. */
5111 ldlm_lock_allow_match(lock);
5113 rc = cl_object_layout_get(env, obj, &cl);
5118 DFID": layout version change: %u -> %u\n",
5119 PFID(&lli->lli_fid), ll_layout_version_get(lli),
5121 ll_layout_version_set(lli, cl.cl_layout_gen);
5125 cl_env_put(env, &refcheck);
/* Fetch layout from MDT with getxattr request, if it's not ready yet.
 * The fetched LOV EA is attached to the lock's LVB (l_lvb_data) under
 * the resource lock; a concurrent winner's buffer is kept instead. */
5130 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
5131 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
5134 struct ll_sb_info *sbi = ll_i2sbi(inode);
5135 struct ptlrpc_request *req;
5142 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
5143 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
5144 lock->l_lvb_data, lock->l_lvb_len);
/* layout already attached to the lock: nothing to fetch */
5146 if (lock->l_lvb_data != NULL)
5149 /* if layout lock was granted right away, the layout is returned
5150 * within DLM_LVB of dlm reply; otherwise if the lock was ever
5151 * blocked and then granted via completion ast, we have to fetch
5152 * layout here. Please note that we can't use the LVB buffer in
5153 * completion AST because it doesn't have a large enough buffer */
5154 rc = ll_get_default_mdsize(sbi, &lmmsize);
5158 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), OBD_MD_FLXATTR,
5159 XATTR_NAME_LOV, lmmsize, &req);
5162 GOTO(out, rc = 0); /* empty layout */
5169 if (lmmsize == 0) /* empty layout */
5172 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
5174 GOTO(out, rc = -EFAULT);
5176 OBD_ALLOC_LARGE(lvbdata, lmmsize);
5177 if (lvbdata == NULL)
5178 GOTO(out, rc = -ENOMEM);
5180 memcpy(lvbdata, lmm, lmmsize);
5181 lock_res_and_lock(lock);
/* only install our buffer if no one beat us to it */
5182 if (unlikely(lock->l_lvb_data == NULL)) {
5183 lock->l_lvb_type = LVB_T_LAYOUT;
5184 lock->l_lvb_data = lvbdata;
5185 lock->l_lvb_len = lmmsize;
5188 unlock_res_and_lock(lock);
5191 OBD_FREE_LARGE(lvbdata, lmmsize);
5196 ptlrpc_req_finished(req);
/*
 * Apply the layout carried by the layout lock @lockh to @inode, then
 * release the lock reference. If applying fails with -EBUSY (layout
 * in use), wait for in-flight IO via OBJECT_CONF_WAIT before
 * returning.
 */
5201 * Apply the layout to the inode. Layout lock is held and will be released
5204 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
5205 struct inode *inode)
5207 struct ll_inode_info *lli = ll_i2info(inode);
5208 struct ll_sb_info *sbi = ll_i2sbi(inode);
5209 struct ldlm_lock *lock;
5210 struct cl_object_conf conf;
5213 bool wait_layout = false;
5216 LASSERT(lustre_handle_is_used(lockh));
5218 lock = ldlm_handle2lock(lockh);
5219 LASSERT(lock != NULL);
5220 LASSERT(ldlm_has_layout(lock));
5222 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
5223 PFID(&lli->lli_fid), inode);
5225 /* in case this is a caching lock and reinstate with new inode */
5226 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
5228 lock_res_and_lock(lock);
5229 lvb_ready = ldlm_is_lvb_ready(lock);
5230 unlock_res_and_lock(lock);
5232 /* checking lvb_ready is racy but this is okay. The worst case is
5233 * that multi processes may configure the file on the same time. */
/* ensure the lock carries the layout before configuring */
5237 rc = ll_layout_fetch(inode, lock);
5241 /* for layout lock, lmm is stored in lock's lvb.
5242 * lvb_data is immutable if the lock is held so it's safe to access it
5245 * set layout to file. Unlikely this will fail as old layout was
5246 * surely eliminated */
5247 memset(&conf, 0, sizeof conf);
5248 conf.coc_opc = OBJECT_CONF_SET;
5249 conf.coc_inode = inode;
5250 conf.coc_lock = lock;
5251 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
5252 conf.u.coc_layout.lb_len = lock->l_lvb_len;
5253 rc = ll_layout_conf(inode, &conf);
5255 /* refresh layout failed, need to wait */
5256 wait_layout = rc == -EBUSY;
5259 LDLM_LOCK_PUT(lock);
5260 ldlm_lock_decref(lockh, mode);
5262 /* wait for IO to complete if it's still being used. */
5264 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
5265 sbi->ll_fsname, PFID(&lli->lli_fid), inode);
5267 memset(&conf, 0, sizeof conf);
5268 conf.coc_opc = OBJECT_CONF_WAIT;
5269 conf.coc_inode = inode;
5270 rc = ll_layout_conf(inode, &conf);
5274 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
5275 sbi->ll_fsname, PFID(&lli->lli_fid), rc);
/**
 * Issue layout intent RPC to MDS.
 * \param inode  [in] file inode
 * \param intent [in] layout intent (WRITE/TRUNC intents request
 *                    FMODE_WRITE on the intent lock)
 *
 * \retval 0 on success
 * \retval < 0 error code
 */
5281 * Issue layout intent RPC to MDS.
5282 * \param inode [in] file inode
5283 * \param intent [in] layout intent
5285 * \retval 0 on success
5286 * \retval < 0 error code
5288 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
5290 struct ll_inode_info *lli = ll_i2info(inode);
5291 struct ll_sb_info *sbi = ll_i2sbi(inode);
5292 struct md_op_data *op_data;
5293 struct lookup_intent it;
5294 struct ptlrpc_request *req;
5298 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
5299 0, 0, LUSTRE_OPC_ANY, NULL);
5300 if (IS_ERR(op_data))
5301 RETURN(PTR_ERR(op_data));
5303 op_data->op_data = intent;
5304 op_data->op_data_size = sizeof(*intent);
5306 memset(&it, 0, sizeof(it));
5307 it.it_op = IT_LAYOUT;
5308 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
5309 intent->li_opc == LAYOUT_INTENT_TRUNC)
5310 it.it_flags = FMODE_WRITE;
5312 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
5313 sbi->ll_fsname, PFID(&lli->lli_fid), inode);
5315 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
5316 &ll_md_blocking_ast, 0);
5317 if (it.it_request != NULL)
5318 ptlrpc_req_finished(it.it_request);
5319 it.it_request = NULL;
5321 ll_finish_md_op_data(op_data);
5323 /* set lock data in case this is a new lock */
5325 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
5327 ll_intent_drop_lock(&it);
5333 * This function checks if there exists a LAYOUT lock on the client side,
5334 * or enqueues it if it doesn't have one in cache.
5336 * This function will not hold layout lock so it may be revoked any time after
5337 * this function returns. Any operations that depend on the layout should be redone
5340 * This function should be called before lov_io_init() to get an uptodate
5341 * layout version, the caller should save the version number and after IO
5342 * is finished, this function should be called again to verify that layout
5343 * is not changed during IO time.
5345 int ll_layout_refresh(struct inode *inode, __u32 *gen)
5347 struct ll_inode_info *lli = ll_i2info(inode);
5348 struct ll_sb_info *sbi = ll_i2sbi(inode);
5349 struct lustre_handle lockh;
5350 struct layout_intent intent = {
5351 .li_opc = LAYOUT_INTENT_ACCESS,
5353 enum ldlm_mode mode;
5357 *gen = ll_layout_version_get(lli);
5358 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
5362 LASSERT(fid_is_sane(ll_inode2fid(inode)));
5363 LASSERT(S_ISREG(inode->i_mode));
5365 /* take layout lock mutex to enqueue layout lock exclusively. */
5366 mutex_lock(&lli->lli_layout_mutex);
5369 /* mostly layout lock is caching on the local side, so try to
5370 * match it before grabbing layout lock mutex. */
5371 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5372 LCK_CR | LCK_CW | LCK_PR |
5374 if (mode != 0) { /* hit cached lock */
5375 rc = ll_layout_lock_set(&lockh, mode, inode);
5381 rc = ll_layout_intent(inode, &intent);
5387 *gen = ll_layout_version_get(lli);
5388 mutex_unlock(&lli->lli_layout_mutex);
5394 * Issue layout intent RPC indicating where in a file an IO is about to write.
5396 * \param[in] inode file inode.
5397 * \param[in] ext write range with start offset of fille in bytes where
5398 * an IO is about to write, and exclusive end offset in
5401 * \retval 0 on success
5402 * \retval < 0 error code
5404 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5405 struct lu_extent *ext)
5407 struct layout_intent intent = {
5409 .li_extent.e_start = ext->e_start,
5410 .li_extent.e_end = ext->e_end,
5415 rc = ll_layout_intent(inode, &intent);
5421 * This function sends a restore request to the MDT
5423 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5425 struct hsm_user_request *hur;
5429 len = sizeof(struct hsm_user_request) +
5430 sizeof(struct hsm_user_item);
5431 OBD_ALLOC(hur, len);
5435 hur->hur_request.hr_action = HUA_RESTORE;
5436 hur->hur_request.hr_archive_id = 0;
5437 hur->hur_request.hr_flags = 0;
5438 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5439 sizeof(hur->hur_user_item[0].hui_fid));
5440 hur->hur_user_item[0].hui_extent.offset = offset;
5441 hur->hur_user_item[0].hui_extent.length = length;
5442 hur->hur_request.hr_itemcount = 1;
5443 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,