4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #include <linux/uidgid.h>
47 #include <uapi/linux/lustre/lustre_ioctl.h>
48 #include <lustre_swab.h>
50 #include "cl_object.h"
51 #include "llite_internal.h"
52 #include "vvp_internal.h"
55 struct inode *sp_inode;
60 __u64 pa_data_version;
66 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
68 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
/* Allocate and initialize per-open file private data from the
 * ll_file_data_slab cache (GFP_NOFS: called on open paths where FS
 * reclaim recursion must be avoided).
 * NOTE(review): chunk is truncated — the allocation-failure check and
 * the RETURN of @fd are not visible here; confirm against full source. */
71 static struct ll_file_data *ll_file_data_get(void)
73 struct ll_file_data *fd;
75 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
/* Fresh descriptor: no write failure recorded yet. */
79 fd->fd_write_failed = false;
/* Initialize the Persistent Client Cache (PCC) sub-state. */
80 pcc_file_init(&fd->fd_pcc_file);
/* Release per-open file private data back to the slab cache.
 * Counterpart of ll_file_data_get(). */
85 static void ll_file_data_put(struct ll_file_data *fd)
88 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/**
 * Packs all the attributes into @op_data for the CLOSE rpc.
 *
 * Snapshots the inode's mode/times/size/blocks/flags and the open
 * handle from @och so the MDT sees the client's final view of the
 * file at close time.
 */
94 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
95 struct obd_client_handle *och)
99 ll_prep_md_op_data(op_data, inode, NULL, NULL,
100 0, 0, LUSTRE_OPC_ANY, NULL);
/* Copy current inode attributes into the close request. */
102 op_data->op_attr.ia_mode = inode->i_mode;
103 op_data->op_attr.ia_atime = inode->i_atime;
104 op_data->op_attr.ia_mtime = inode->i_mtime;
105 op_data->op_attr.ia_ctime = inode->i_ctime;
106 op_data->op_attr.ia_size = i_size_read(inode);
/* Mark which attributes are valid; ctime is carried via op_xvalid. */
107 op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
108 ATTR_MTIME | ATTR_MTIME_SET |
110 op_data->op_xvalid |= OP_XVALID_CTIME_SET;
111 op_data->op_attr_blocks = inode->i_blocks;
/* Translate Linux inode flags to the Lustre on-wire flag format. */
112 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
113 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
114 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
115 op_data->op_open_handle = och->och_open_handle;
117 if (och->och_flags & FMODE_WRITE &&
118 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
119 /* For HSM: if inode data has been modified, pack it so that
120 * MDT can set data dirty flag in the archive. */
121 op_data->op_bias |= MDS_DATA_MODIFIED;
/**
 * Perform a close, possibly with a bias.
 * The meaning of "data" depends on the value of "bias".
 *
 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
 * swap layouts with (per the visible MDS_CLOSE_LAYOUT_SWAP case below).
 *
 * NOTE(review): several lines (switch statement header, break statements,
 * error paths) are missing from this view of the chunk — verify control
 * flow against the full source before changing it.
 */
134 static int ll_close_inode_openhandle(struct inode *inode,
135 struct obd_client_handle *och,
136 enum mds_op_bias bias, void *data)
138 struct obd_export *md_exp = ll_i2mdexp(inode);
139 const struct ll_inode_info *lli = ll_i2info(inode);
140 struct md_op_data *op_data;
141 struct ptlrpc_request *req = NULL;
/* Bail out if the MDC export is no longer connected. */
145 if (class_exp2obd(md_exp) == NULL) {
146 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
147 ll_i2sbi(inode)->ll_fsname, PFID(&lli->lli_fid));
151 OBD_ALLOC_PTR(op_data);
152 /* We leak openhandle and request here on error, but not much to be
153 * done in OOM case since app won't retry close on error either. */
155 GOTO(out, rc = -ENOMEM);
157 ll_prepare_close(inode, op_data, och);
/* Bias-specific packing of the close intent follows. */
159 case MDS_CLOSE_LAYOUT_MERGE:
160 /* merge blocks from the victim inode */
161 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
162 op_data->op_attr.ia_valid |= ATTR_SIZE;
163 op_data->op_xvalid |= OP_XVALID_BLOCKS;
165 case MDS_CLOSE_LAYOUT_SPLIT:
166 case MDS_CLOSE_LAYOUT_SWAP: {
167 struct split_param *sp = data;
169 LASSERT(data != NULL);
170 op_data->op_bias |= bias;
171 op_data->op_data_version = 0;
172 op_data->op_lease_handle = och->och_lease_handle;
/* SPLIT carries a split_param; SWAP's data is the victim inode. */
173 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
174 op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
175 op_data->op_mirror_id = sp->sp_mirror_id;
177 op_data->op_fid2 = *ll_inode2fid(data);
182 case MDS_CLOSE_RESYNC_DONE: {
183 struct ll_ioc_lease *ioc = data;
185 LASSERT(data != NULL);
/* Scale block count by the number of resynced mirrors. */
186 op_data->op_attr_blocks +=
187 ioc->lil_count * op_data->op_attr_blocks;
188 op_data->op_attr.ia_valid |= ATTR_SIZE;
189 op_data->op_xvalid |= OP_XVALID_BLOCKS;
190 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
192 op_data->op_lease_handle = och->och_lease_handle;
193 op_data->op_data = &ioc->lil_ids[0];
194 op_data->op_data_size =
195 ioc->lil_count * sizeof(ioc->lil_ids[0]);
199 case MDS_PCC_ATTACH: {
200 struct pcc_param *param = data;
202 LASSERT(data != NULL);
/* PCC attach piggybacks on the HSM release protocol. */
203 op_data->op_bias |= MDS_HSM_RELEASE | MDS_PCC_ATTACH;
204 op_data->op_archive_id = param->pa_archive_id;
205 op_data->op_data_version = param->pa_data_version;
206 op_data->op_lease_handle = och->och_lease_handle;
210 case MDS_HSM_RELEASE:
211 LASSERT(data != NULL);
212 op_data->op_bias |= MDS_HSM_RELEASE;
213 op_data->op_data_version = *(__u64 *)data;
214 op_data->op_lease_handle = och->och_lease_handle;
215 op_data->op_attr.ia_valid |= ATTR_SIZE;
216 op_data->op_xvalid |= OP_XVALID_BLOCKS;
/* Default/no-bias close: no auxiliary data expected. */
220 LASSERT(data == NULL);
/* If exact size/blocks were not packed, let the MDT treat them lazily. */
224 if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
225 op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
226 if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
227 op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;
229 rc = md_close(md_exp, op_data, och->och_mod, &req);
230 if (rc != 0 && rc != -EINTR)
231 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
232 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* On success, check whether the server actually executed the intent. */
234 if (rc == 0 && op_data->op_bias & bias) {
235 struct mdt_body *body;
237 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
238 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
241 if (bias & MDS_PCC_ATTACH) {
242 struct pcc_param *param = data;
/* Return the new layout generation to the PCC caller. */
244 param->pa_layout_gen = body->mbo_layout_gen;
248 ll_finish_md_op_data(op_data);
/* Handle is dead after close; poison it against reuse. */
252 md_clear_open_replay_data(md_exp, och);
253 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
256 ptlrpc_req_finished(req); /* This is close request */
/**
 * Close the MDS open handle for @inode matching open mode @fmode,
 * unless other users of the same handle remain.
 *
 * NOTE(review): the lines that swap out *och_p under the mutex are not
 * visible in this chunk — confirm the handle-detach logic in full source.
 */
260 int ll_md_real_close(struct inode *inode, fmode_t fmode)
262 struct ll_inode_info *lli = ll_i2info(inode);
263 struct obd_client_handle **och_p;
264 struct obd_client_handle *och;
/* Select the per-mode handle slot and its use count. */
269 if (fmode & FMODE_WRITE) {
270 och_p = &lli->lli_mds_write_och;
271 och_usecount = &lli->lli_open_fd_write_count;
272 } else if (fmode & FMODE_EXEC) {
273 och_p = &lli->lli_mds_exec_och;
274 och_usecount = &lli->lli_open_fd_exec_count;
276 LASSERT(fmode & FMODE_READ);
277 och_p = &lli->lli_mds_read_och;
278 och_usecount = &lli->lli_open_fd_read_count;
281 mutex_lock(&lli->lli_och_mutex);
282 if (*och_usecount > 0) {
283 /* There are still users of this handle, so skip
* freeing it and sending the close RPC. */
285 mutex_unlock(&lli->lli_och_mutex);
291 mutex_unlock(&lli->lli_och_mutex);
294 /* There might be a race and this handle may already
* be closed by another thread. */
296 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/**
 * Per-file-descriptor close: drop group lock and lease state, decrement
 * the per-mode open counters, and close the MDS handle unless a cached
 * OPEN lock makes the RPC unnecessary.
 */
302 static int ll_md_close(struct inode *inode, struct file *file)
304 union ldlm_policy_data policy = {
305 .l_inodebits = { MDS_INODELOCK_OPEN },
/* TEST_LOCK: only probe for a matching cached lock, don't take a ref. */
307 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
308 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
309 struct ll_inode_info *lli = ll_i2info(inode);
310 struct lustre_handle lockh;
311 enum ldlm_mode lockmode;
315 /* clear group lock, if present */
316 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
317 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
319 if (fd->fd_lease_och != NULL) {
322 /* Usually the lease is not released when the
323 * application crashed, we need to release here. */
324 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
325 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
326 PFID(&lli->lli_fid), rc, lease_broken);
328 fd->fd_lease_och = NULL;
/* Close the handle the lease machinery parked on this fd, if any. */
331 if (fd->fd_och != NULL) {
332 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
337 /* Let's see if we have good enough OPEN lock on the file and if
338 we can skip talking to MDS */
339 mutex_lock(&lli->lli_och_mutex);
/* Drop the counter for the mode this descriptor was opened with.
* NOTE(review): the lines assigning lockmode per branch are missing
* from this view — confirm against full source. */
340 if (fd->fd_omode & FMODE_WRITE) {
342 LASSERT(lli->lli_open_fd_write_count);
343 lli->lli_open_fd_write_count--;
344 } else if (fd->fd_omode & FMODE_EXEC) {
346 LASSERT(lli->lli_open_fd_exec_count);
347 lli->lli_open_fd_exec_count--;
350 LASSERT(lli->lli_open_fd_read_count);
351 lli->lli_open_fd_read_count--;
353 mutex_unlock(&lli->lli_och_mutex);
355 /* LU-4398: do not cache write open lock if the file has exec bit */
356 if ((lockmode == LCK_CW && inode->i_mode & S_IXUGO) ||
357 !md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
358 LDLM_IBITS, &policy, lockmode, &lockh))
359 rc = ll_md_real_close(inode, fd->fd_omode);
/* Detach and free the per-open private data. */
362 LUSTRE_FPRIVATE(file) = NULL;
363 ll_file_data_put(fd);
/* While this returns an error code, fput() the caller does not, so we need
 * to make every effort to clean up all of our state here. Also, applications
 * rarely check close errors and even if an error is returned they will not
 * re-try the close call.
 */
373 int ll_file_release(struct inode *inode, struct file *file)
375 struct ll_file_data *fd;
376 struct ll_sb_info *sbi = ll_i2sbi(inode);
377 struct ll_inode_info *lli = ll_i2info(inode);
378 ktime_t kstart = ktime_get();
383 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
384 PFID(ll_inode2fid(inode)), inode);
386 fd = LUSTRE_FPRIVATE(file);
389 /* The last ref on @file, maybe not the the owner pid of statahead,
390 * because parent and child process can share the same file handle. */
391 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
392 ll_deauthorize_statahead(inode, fd);
/* Root of the filesystem: no MDS close RPC needed, just free fd. */
394 if (inode->i_sb->s_root == file_dentry(file)) {
395 LUSTRE_FPRIVATE(file) = NULL;
396 ll_file_data_put(fd);
400 pcc_file_release(inode, file);
/* Fold any async write errors accumulated on the clio object into
* this close's return code path (per lov_read_and_clear_async_rc). */
402 if (!S_ISDIR(inode->i_mode)) {
403 if (lli->lli_clob != NULL)
404 lov_read_and_clear_async_rc(lli->lli_clob);
405 lli->lli_async_rc = 0;
408 rc = ll_md_close(inode, file);
410 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
411 libcfs_debug_dumplog();
/* Account release latency, except for the root dentry. */
414 if (!rc && inode->i_sb->s_root != file_dentry(file))
415 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE,
416 ktime_us_delta(ktime_get(), kstart));
/**
 * read_cache_page() filler: copy inline Data-on-MDT bytes from the
 * niobuf_local in @data into @page, zero-filling the tail when the
 * buffer is shorter than a page, then mark the page up to date.
 */
420 static inline int ll_dom_readpage(void *data, struct page *page)
422 struct niobuf_local *lnb = data;
425 kaddr = ll_kmap_atomic(page, KM_USER0);
426 memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
/* Partial page: clear the remainder so no stale data leaks. */
427 if (lnb->lnb_len < PAGE_SIZE)
428 memset(kaddr + lnb->lnb_len, 0,
429 PAGE_SIZE - lnb->lnb_len);
430 flush_dcache_page(page);
431 SetPageUptodate(page);
432 ll_kunmap_atomic(kaddr, KM_USER0);
/**
 * After an open that returned inline Data-on-MDT file data in @req,
 * populate the inode's page cache with that data so subsequent reads
 * are served locally without extra RPCs.
 */
438 void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
439 struct lookup_intent *it)
441 struct ll_inode_info *lli = ll_i2info(inode);
442 struct cl_object *obj = lli->lli_clob;
443 struct address_space *mapping = inode->i_mapping;
445 struct niobuf_remote *rnb;
446 struct mdt_body *body;
448 unsigned long index, start;
449 struct niobuf_local lnb;
/* Nothing to do if the reply carries no inline buffer. */
456 if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
460 rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
461 if (rnb == NULL || rnb->rnb_len == 0)
464 /* LU-11595: Server may return whole file and that is OK always or
465 * it may return just file tail and its offset must be aligned with
466 * client PAGE_SIZE to be used on that client, if server's PAGE_SIZE is
467 * smaller then offset may be not aligned and that data is just ignored.
*/
469 if (rnb->rnb_offset % PAGE_SIZE)
472 /* Server returns whole file or just file tail if it fills in reply
473 * buffer, in both cases total size should be equal to the file size.
*/
475 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
476 if (rnb->rnb_offset + rnb->rnb_len != body->mbo_dom_size) {
477 CERROR("%s: server returns off/len %llu/%u but size %llu\n",
478 ll_i2sbi(inode)->ll_fsname, rnb->rnb_offset,
479 rnb->rnb_len, body->mbo_dom_size);
483 CDEBUG(D_INFO, "Get data along with open at %llu len %i, size %llu\n",
484 rnb->rnb_offset, rnb->rnb_len, body->mbo_dom_size);
/* Inline payload sits directly after the remote niobuf descriptor. */
486 data = (char *)rnb + sizeof(*rnb);
488 lnb.lnb_file_offset = rnb->rnb_offset;
489 start = lnb.lnb_file_offset / PAGE_SIZE;
491 LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
492 lnb.lnb_page_offset = 0;
/* Walk the buffer page by page; last page may be short.
* NOTE(review): the do/while opener and index init are missing from
* this view — loop shape per the terminating while at the bottom. */
494 lnb.lnb_data = data + (index << PAGE_SHIFT);
495 lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
496 if (lnb.lnb_len > PAGE_SIZE)
497 lnb.lnb_len = PAGE_SIZE;
499 vmpage = read_cache_page(mapping, index + start,
500 ll_dom_readpage, &lnb);
501 if (IS_ERR(vmpage)) {
502 CWARN("%s: cannot fill page %lu for "DFID
503 " with data: rc = %li\n",
504 ll_i2sbi(inode)->ll_fsname, index + start,
505 PFID(lu_object_fid(&obj->co_lu)),
511 } while (rnb->rnb_len > (index << PAGE_SHIFT));
/**
 * Send the intent-open RPC to the MDS for dentry @de (normally by FID;
 * by name when the server lacks OBD_CONNECT_OPEN_BY_FID), then finish
 * local inode/lock setup from the reply.
 *
 * @lmm/@lmmsize optionally carry striping metadata to pack in the request.
 */
515 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
516 struct lookup_intent *itp)
518 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
519 struct dentry *parent = de->d_parent;
522 struct md_op_data *op_data;
523 struct ptlrpc_request *req = NULL;
527 LASSERT(parent != NULL);
528 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
530 /* if server supports open-by-fid, or file name is invalid, don't pack
531 * name in open request */
532 if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_OPEN_BY_NAME) ||
533 !(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID)) {
/* Copy the dentry name under d_lock; retry path for a concurrent
* rename is implied by the length re-check below. */
535 len = de->d_name.len;
536 name = kmalloc(len + 1, GFP_NOFS);
541 spin_lock(&de->d_lock);
542 if (len != de->d_name.len) {
543 spin_unlock(&de->d_lock);
547 memcpy(name, de->d_name.name, len);
549 spin_unlock(&de->d_lock);
551 if (!lu_name_is_valid_2(name, len)) {
557 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
558 name, len, 0, LUSTRE_OPC_ANY, NULL);
559 if (IS_ERR(op_data)) {
561 RETURN(PTR_ERR(op_data));
/* Pack optional striping metadata for the open. */
563 op_data->op_data = lmm;
564 op_data->op_data_size = lmmsize;
566 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
567 &ll_md_blocking_ast, 0);
569 ll_finish_md_op_data(op_data);
571 /* reason for keep own exit path - don`t flood log
572 * with messages with -ESTALE errors.
*/
574 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
575 it_open_error(DISP_OPEN_OPEN, itp))
577 ll_release_openhandle(de, itp);
581 if (it_disposition(itp, DISP_LOOKUP_NEG))
582 GOTO(out, rc = -ENOENT);
584 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
585 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
586 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
590 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
592 if (!rc && itp->it_lock_mode) {
593 struct lustre_handle handle = {.cookie = itp->it_lock_handle};
594 struct ldlm_lock *lock;
595 bool has_dom_bit = false;
597 /* If we got a lock back and it has a LOOKUP bit set,
598 * make sure the dentry is marked as valid so we can find it.
599 * We don't need to care about actual hashing since other bits
600 * of kernel will deal with that later.
*/
602 lock = ldlm_handle2lock(&handle);
604 has_dom_bit = ldlm_has_dom(lock);
605 if (lock->l_policy_data.l_inodebits.bits &
606 MDS_INODELOCK_LOOKUP)
607 d_lustre_revalidate(de);
611 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
/* Cache any inline DoM data the server returned with the open. */
613 ll_dom_finish_open(de->d_inode, req, itp);
617 ptlrpc_req_finished(req);
618 ll_intent_drop_lock(itp);
620 /* We did open by fid, but by the time we got to the server,
621 * the object disappeared. If this is a create, we cannot really
622 * tell the userspace that the file it was trying to create
623 * does not exist. Instead let's return -ESTALE, and the VFS will
624 * retry the create with LOOKUP_REVAL that we are going to catch
625 * in ll_revalidate_dentry() and use lookup then.
*/
627 if (rc == -ENOENT && itp->it_op & IT_CREAT)
/**
 * Fill an obd_client_handle from the mdt_body in the intent's reply
 * and register it for open replay on MDS recovery.
 */
633 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
634 struct obd_client_handle *och)
636 struct mdt_body *body;
638 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
639 och->och_open_handle = body->mbo_open_handle;
640 och->och_fid = body->mbo_fid1;
/* Lease handle cookie comes from the intent lock, not the body. */
641 och->och_lease_handle.cookie = it->it_lock_handle;
642 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
643 och->och_flags = it->it_flags;
645 return md_set_open_replay_data(md_exp, och, it);
/**
 * Finish client-side open bookkeeping: optionally fill @och from the
 * intent reply, then attach @fd to the file with readahead and
 * cl-context state initialized.
 */
648 static int ll_local_open(struct file *file, struct lookup_intent *it,
649 struct ll_file_data *fd, struct obd_client_handle *och)
651 struct inode *inode = file_inode(file);
654 LASSERT(!LUSTRE_FPRIVATE(file));
661 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
666 LUSTRE_FPRIVATE(file) = fd;
667 ll_readahead_init(inode, &fd->fd_ras);
/* Remember the effective open mode for the matching close. */
668 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
670 /* ll_cl_context initialize */
671 rwlock_init(&fd->fd_lock);
672 INIT_LIST_HEAD(&fd->fd_lccs);
/* Open a file, and (for the very first open) create objects on the OSTs at
 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
 * creation or open until ll_lov_setstripe() ioctl is called.
 *
 * If we already have the stripe MD locally then we don't request it in
 * md_open(), by passing a lmm_size = 0.
 *
 * It is up to the application to ensure no other processes open this file
 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 * used. We might be able to avoid races of that sort by getting lli_open_sem
 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 */
690 int ll_file_open(struct inode *inode, struct file *file)
692 struct ll_inode_info *lli = ll_i2info(inode);
693 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
694 .it_flags = file->f_flags };
695 struct obd_client_handle **och_p = NULL;
696 __u64 *och_usecount = NULL;
697 struct ll_file_data *fd;
698 ktime_t kstart = ktime_get();
702 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
703 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* An intent may have been stashed on private_data by the lookup path. */
705 it = file->private_data; /* XXX: compat macro */
706 file->private_data = NULL; /* prevent ll_local_open assertion */
708 fd = ll_file_data_get();
710 GOTO(out_nofiledata, rc = -ENOMEM);
713 if (S_ISDIR(inode->i_mode))
714 ll_authorize_statahead(inode, fd);
/* Root dentry: attach fd directly, no MDS open needed. */
716 if (inode->i_sb->s_root == file_dentry(file)) {
717 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup: build our own open intent in oit. */
721 if (!it || !it->it_disposition) {
722 /* Convert f_flags into access mode. We cannot use file->f_mode,
723 * because everything but O_ACCMODE mask was stripped from
* it (per the O_ACCMODE arithmetic trick below). */
725 if ((oit.it_flags + 1) & O_ACCMODE)
727 if (file->f_flags & O_TRUNC)
728 oit.it_flags |= FMODE_WRITE;
730 /* kernel only call f_op->open in dentry_open. filp_open calls
731 * dentry_open after call to open_namei that checks permissions.
732 * Only nfsd_open call dentry_open directly without checking
733 * permissions and because of that this code below is safe.
*/
735 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
736 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
738 /* We do not want O_EXCL here, presumably we opened the file
739 * already? XXX - NFS implications? */
740 oit.it_flags &= ~O_EXCL;
742 /* bug20584, if "it_flags" contains O_CREAT, the file will be
743 * created if necessary, then "IT_CREAT" should be set to keep
744 * consistent with it */
745 if (oit.it_flags & O_CREAT)
746 oit.it_op |= IT_CREAT;
752 /* Let's see if we have file open on MDS already. */
753 if (it->it_flags & FMODE_WRITE) {
754 och_p = &lli->lli_mds_write_och;
755 och_usecount = &lli->lli_open_fd_write_count;
756 } else if (it->it_flags & FMODE_EXEC) {
757 och_p = &lli->lli_mds_exec_och;
758 och_usecount = &lli->lli_open_fd_exec_count;
760 och_p = &lli->lli_mds_read_och;
761 och_usecount = &lli->lli_open_fd_read_count;
764 mutex_lock(&lli->lli_och_mutex);
765 if (*och_p) { /* Open handle is present */
766 if (it_disposition(it, DISP_OPEN_OPEN)) {
767 /* Well, there's extra open request that we do not need,
768 let's close it somehow. This will decref request. */
769 rc = it_open_error(DISP_OPEN_OPEN, it);
771 mutex_unlock(&lli->lli_och_mutex);
772 GOTO(out_openerr, rc);
775 ll_release_openhandle(file_dentry(file), it);
/* Reuse the cached handle; no new och needed for this open. */
779 rc = ll_local_open(file, it, fd, NULL);
782 mutex_unlock(&lli->lli_och_mutex);
783 GOTO(out_openerr, rc);
/* No handle cached: we must do the MDS open ourselves. */
786 LASSERT(*och_usecount == 0);
787 if (!it->it_disposition) {
788 struct dentry *dentry = file_dentry(file);
789 struct ll_dentry_data *ldd;
791 /* We cannot just request lock handle now, new ELC code
792 means that one of other OPEN locks for this file
793 could be cancelled, and since blocking ast handler
794 would attempt to grab och_mutex as well, that would
795 result in a deadlock */
796 mutex_unlock(&lli->lli_och_mutex);
/*
798 * Normally called under two situations:
* 1. NFS export.
800 * 2. A race/condition on MDS resulting in no open
801 * handle to be returned from LOOKUP|OPEN request,
802 * for example if the target entry was a symlink.
*
804 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
805 * marked by a bit set in ll_iget_for_nfs. Clear the
806 * bit so that it's not confusing later callers.
*
808 * NB; when ldd is NULL, it must have come via normal
809 * lookup path only, since ll_iget_for_nfs always calls
* ll_d2d to init ldd.
*/
812 ldd = ll_d2d(dentry);
813 if (ldd && ldd->lld_nfs_dentry) {
814 ldd->lld_nfs_dentry = 0;
815 if (!filename_is_volatile(dentry->d_name.name,
818 it->it_flags |= MDS_OPEN_LOCK;
/*
822 * Always specify MDS_OPEN_BY_FID because we don't want
823 * to get file with different fid.
*/
825 it->it_flags |= MDS_OPEN_BY_FID;
826 rc = ll_intent_file_open(dentry, NULL, 0, it);
828 GOTO(out_openerr, rc);
832 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
834 GOTO(out_och_free, rc = -ENOMEM);
838 /* md_intent_lock() didn't get a request ref if there was an
839 * open error, so don't do cleanup on the request here
*/
841 /* XXX (green): Should not we bail out on any error here, not
842 * just open error? */
843 rc = it_open_error(DISP_OPEN_OPEN, it);
845 GOTO(out_och_free, rc);
847 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
848 "inode %p: disposition %x, status %d\n", inode,
849 it_disposition(it, ~0), it->it_status);
851 rc = ll_local_open(file, it, fd, *och_p);
853 GOTO(out_och_free, rc);
856 rc = pcc_file_open(inode, file);
858 GOTO(out_och_free, rc);
860 mutex_unlock(&lli->lli_och_mutex);
863 /* Must do this outside lli_och_mutex lock to prevent deadlock where
864 different kind of OPEN lock for this same inode gets cancelled
865 by ldlm_cancel_lru */
866 if (!S_ISREG(inode->i_mode))
867 GOTO(out_och_free, rc);
869 cl_lov_delay_create_clear(&file->f_flags);
870 GOTO(out_och_free, rc);
/* Error unwind: free the handle slot and statahead/fd state. */
874 if (och_p && *och_p) {
875 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
876 *och_p = NULL; /* OBD_FREE writes some magic there */
879 mutex_unlock(&lli->lli_och_mutex);
882 if (lli->lli_opendir_key == fd)
883 ll_deauthorize_statahead(inode, fd);
886 ll_file_data_put(fd);
888 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN,
889 ktime_us_delta(ktime_get(), kstart));
/* Drop the open-reply reference held via the intent, if any. */
893 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
894 ptlrpc_req_finished(it->it_request);
895 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/**
 * Blocking AST for lease locks: on conflict, cancel the lease lock
 * asynchronously (the lease is thereby "broken"); nothing to do on
 * the cancel callback itself.
 */
901 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
902 struct ldlm_lock_desc *desc, void *data, int flag)
905 struct lustre_handle lockh;
909 case LDLM_CB_BLOCKING:
910 ldlm_lock2handle(lock, &lockh);
911 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
913 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
917 case LDLM_CB_CANCELING:
/*
 * When setting a lease on a file, we take ownership of the lli_mds_*_och
 * and save it as fd->fd_och so as to force client to reopen the file even
 * if it has an open lock in cache already.
 */
929 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
930 struct lustre_handle *old_open_handle)
932 struct ll_inode_info *lli = ll_i2info(inode);
933 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
934 struct obd_client_handle **och_p;
939 /* Get the openhandle of the file */
940 mutex_lock(&lli->lli_och_mutex);
/* Only one lease per file descriptor at a time. */
941 if (fd->fd_lease_och != NULL)
942 GOTO(out_unlock, rc = -EBUSY);
944 if (fd->fd_och == NULL) {
945 if (file->f_mode & FMODE_WRITE) {
946 LASSERT(lli->lli_mds_write_och != NULL);
947 och_p = &lli->lli_mds_write_och;
948 och_usecount = &lli->lli_open_fd_write_count;
950 LASSERT(lli->lli_mds_read_och != NULL);
951 och_p = &lli->lli_mds_read_och;
952 och_usecount = &lli->lli_open_fd_read_count;
/* Can't take ownership while other opens share the handle. */
955 if (*och_usecount > 1)
956 GOTO(out_unlock, rc = -EBUSY);
/* Hand the old open handle back so the MDT can match the owner.
* NOTE(review): the lines moving *och_p into fd->fd_och are not
* visible in this chunk — confirm against full source. */
963 *old_open_handle = fd->fd_och->och_open_handle;
967 mutex_unlock(&lli->lli_och_mutex);
/*
 * Release ownership on lli_mds_*_och when putting back a file lease.
 */
974 static int ll_lease_och_release(struct inode *inode, struct file *file)
976 struct ll_inode_info *lli = ll_i2info(inode);
977 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
978 struct obd_client_handle **och_p;
979 struct obd_client_handle *old_och = NULL;
984 mutex_lock(&lli->lli_och_mutex);
/* Pick the slot matching this descriptor's open mode. */
985 if (file->f_mode & FMODE_WRITE) {
986 och_p = &lli->lli_mds_write_och;
987 och_usecount = &lli->lli_open_fd_write_count;
989 och_p = &lli->lli_mds_read_och;
990 och_usecount = &lli->lli_open_fd_read_count;
993 /* The file may have been open by another process (broken lease) so
994 * *och_p is not NULL. In this case we should simply increase usecount
* and close fd->fd_och afterwards. */
997 if (*och_p != NULL) {
998 old_och = fd->fd_och;
1001 *och_p = fd->fd_och;
1005 mutex_unlock(&lli->lli_och_mutex);
/* Close the duplicate handle outside the mutex. */
1007 if (old_och != NULL)
1008 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
/*
 * Acquire a lease and open the file.
 *
 * Returns the new obd_client_handle on success, ERR_PTR on failure.
 */
1016 static struct obd_client_handle *
1017 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
/* open_flags parameter per it.it_flags usage below. */
1020 struct lookup_intent it = { .it_op = IT_OPEN };
1021 struct ll_sb_info *sbi = ll_i2sbi(inode);
1022 struct md_op_data *op_data;
1023 struct ptlrpc_request *req = NULL;
1024 struct lustre_handle old_open_handle = { 0 };
1025 struct obd_client_handle *och = NULL;
/* Lease must be exactly read or exactly write. */
1030 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
1031 RETURN(ERR_PTR(-EINVAL));
/* The descriptor must already grant the requested mode; exec is excluded. */
1034 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
1035 RETURN(ERR_PTR(-EPERM));
1037 rc = ll_lease_och_acquire(inode, file, &old_open_handle);
1039 RETURN(ERR_PTR(rc));
/* NOTE(review): och allocation lines are missing from this view. */
1044 RETURN(ERR_PTR(-ENOMEM));
1046 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
1047 LUSTRE_OPC_ANY, NULL);
1048 if (IS_ERR(op_data))
1049 GOTO(out, rc = PTR_ERR(op_data));
1051 /* To tell the MDT this openhandle is from the same owner */
1052 op_data->op_open_handle = old_open_handle;
1054 it.it_flags = fmode | open_flags;
1055 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
1056 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
1057 &ll_md_blocking_lease_ast,
1058 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
1059 * it can be cancelled which may mislead applications that the lease is
* still in use.
1061 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
1062 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
1063 * doesn't deal with openhandle, so normal openhandle will be leaked. */
1064 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
1065 ll_finish_md_op_data(op_data);
1066 ptlrpc_req_finished(req);
1068 GOTO(out_release_it, rc);
1070 if (it_disposition(&it, DISP_LOOKUP_NEG))
1071 GOTO(out_release_it, rc = -ENOENT);
1073 rc = it_open_error(DISP_OPEN_OPEN, &it);
1075 GOTO(out_release_it, rc);
1077 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
1078 rc = ll_och_fill(sbi->ll_md_exp, &it, och);
1080 GOTO(out_release_it, rc);
/* Server must acknowledge the lease; old servers don't support it. */
1082 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
1083 GOTO(out_close, rc = -EOPNOTSUPP);
1085 /* already get lease, handle lease lock */
1086 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
1087 if (it.it_lock_mode == 0 ||
1088 it.it_lock_bits != MDS_INODELOCK_OPEN) {
1089 /* open lock must return for lease */
1090 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
1091 PFID(ll_inode2fid(inode)), it.it_lock_mode,
1093 GOTO(out_close, rc = -EPROTO);
1096 ll_intent_release(&it);
/* Error unwind: cancel the open lock and close the half-built handle. */
1100 /* Cancel open lock */
1101 if (it.it_lock_mode != 0) {
1102 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
1104 it.it_lock_mode = 0;
1105 och->och_lease_handle.cookie = 0ULL;
1107 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
1109 CERROR("%s: error closing file "DFID": %d\n",
1110 sbi->ll_fsname, PFID(&ll_i2info(inode)->lli_fid), rc2);
1111 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
1113 ll_intent_release(&it);
1117 RETURN(ERR_PTR(rc));
/**
 * Check whether a layout swap can be done between two inodes.
 *
 * \param[in] inode1	First inode to check
 * \param[in] inode2	Second inode to check
 *
 * \retval 0 on success, layout swap can be performed between both inodes
 * \retval negative error code if requirements are not met
 */
1129 static int ll_check_swap_layouts_validity(struct inode *inode1,
1130 struct inode *inode2)
/* Both must be regular files. */
1132 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
/* Caller needs write permission on both. */
1135 if (inode_permission(inode1, MAY_WRITE) ||
1136 inode_permission(inode2, MAY_WRITE))
/* Both must live on the same filesystem. */
1139 if (inode1->i_sb != inode2->i_sb)
/**
 * Close @och on @inode with a MDS_CLOSE_LAYOUT_SWAP bias so the MDT
 * swaps layouts between @inode and @inode2 atomically with the close.
 */
1145 static int ll_swap_layouts_close(struct obd_client_handle *och,
1146 struct inode *inode, struct inode *inode2)
1148 const struct lu_fid *fid1 = ll_inode2fid(inode);
1149 const struct lu_fid *fid2;
1153 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
1154 ll_i2sbi(inode)->ll_fsname, PFID(fid1));
1156 rc = ll_check_swap_layouts_validity(inode, inode2);
1158 GOTO(out_free_och, rc);
1160 /* We now know that inode2 is a lustre inode */
1161 fid2 = ll_inode2fid(inode2);
/* Swapping a file's layout with itself is meaningless. */
1163 rc = lu_fid_cmp(fid1, fid2);
1165 GOTO(out_free_och, rc = -EINVAL);
1167 /* Close the file and {swap,merge} layouts between inode & inode2.
1168 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
1169 * because we still need it to pack l_remote_handle to MDT. */
1170 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
1173 och = NULL; /* freed in ll_close_inode_openhandle() */
/*
 * Release lease and close the file.
 * It will check if the lease has ever broken.
 */
1186 static int ll_lease_close_intent(struct obd_client_handle *och,
1187 struct inode *inode,
1188 bool *lease_broken, enum mds_op_bias bias,
/* (data parameter forwarded to ll_close_inode_openhandle below) */
1191 struct ldlm_lock *lock;
1192 bool cancelled = true;
/* A still-findable lease lock means the lease was not broken;
* its cancel flag tells us whether a blocking AST fired. */
1196 lock = ldlm_handle2lock(&och->och_lease_handle);
1198 lock_res_and_lock(lock);
1199 cancelled = ldlm_is_cancel(lock);
1200 unlock_res_and_lock(lock);
1201 LDLM_LOCK_PUT(lock);
1204 CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1205 PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1207 if (lease_broken != NULL)
1208 *lease_broken = cancelled;
/* Intact lease with no intent: cancel it ourselves. */
1210 if (!cancelled && !bias)
1211 ldlm_cli_cancel(&och->och_lease_handle, 0);
1213 if (cancelled) { /* no need to excute intent */
/* NOTE(review): plain-close path lines missing from this view. */
1218 rc = ll_close_inode_openhandle(inode, och, bias, data);
/* Convenience wrapper: lease close with no intent/bias. */
1222 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1225 return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
/*
 * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
 * to start mirror resynchronization; @arg is a userspace pointer to
 * a struct ll_ioc_lease_id selecting the mirror.
 */
1231 static int ll_lease_file_resync(struct obd_client_handle *och,
1232 struct inode *inode, unsigned long arg)
1234 struct ll_sb_info *sbi = ll_i2sbi(inode);
1235 struct md_op_data *op_data;
1236 struct ll_ioc_lease_id ioc;
1237 __u64 data_version_unused;
1241 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1242 LUSTRE_OPC_ANY, NULL);
1243 if (IS_ERR(op_data))
1244 RETURN(PTR_ERR(op_data));
1246 if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg,
/* NOTE(review): -EFAULT return line missing from this view. */
1250 /* before starting file resync, it's necessary to clean up page cache
1251 * in client memory, otherwise once the layout version is increased,
1252 * writing back cached data will be denied the OSTs. */
1253 rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1257 op_data->op_lease_handle = och->och_lease_handle;
1258 op_data->op_mirror_id = ioc.lil_mirror_id;
1259 rc = md_file_resync(sbi->ll_md_exp, op_data);
1265 ll_finish_md_op_data(op_data);
/*
 * Merge MDS-provided inode attributes with OST-side attributes
 * (size, blocks, timestamps) under the inode size lock.  Each
 * timestamp keeps the newer of the MDS and OST values; atime is only
 * raised, never lowered (see comment below).
 */
1269 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1271 struct ll_inode_info *lli = ll_i2info(inode);
1272 struct cl_object *obj = lli->lli_clob;
1273 struct cl_attr *attr = vvp_env_thread_attr(env);
1281 ll_inode_size_lock(inode);
1283 /* Merge the timestamps most recently obtained from MDS with
1284 * timestamps obtained from OSTs.
1286 * Do not overwrite atime of inode because it may be refreshed
1287 * by file_accessed() function. If the read was served by cache
1288 * data, there is no RPC to be sent so that atime may not be
1289 * transferred to OSTs at all. MDT only updates atime at close time
1290 * if it's at least 'mdd.*.atime_diff' older.
1291 * All in all, the atime in Lustre does not strictly comply with
1292 * POSIX. Solving this problem needs to send an RPC to MDT for each
1293 * read, this will hurt performance.
1295 if (ll_file_test_and_clear_flag(lli, LLIF_UPDATE_ATIME) ||
1296 inode->i_atime.tv_sec < lli->lli_atime)
1297 inode->i_atime.tv_sec = lli->lli_atime;
1299 inode->i_mtime.tv_sec = lli->lli_mtime;
1300 inode->i_ctime.tv_sec = lli->lli_ctime;
/* Snapshot current values before merging in OST attributes. */
1302 mtime = inode->i_mtime.tv_sec;
1303 atime = inode->i_atime.tv_sec;
1304 ctime = inode->i_ctime.tv_sec;
1306 cl_object_attr_lock(obj);
1307 if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1310 rc = cl_object_attr_get(env, obj, attr);
1311 cl_object_attr_unlock(obj);
/* -ENODATA means no OST objects (e.g. released file): not an error. */
1314 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1316 if (atime < attr->cat_atime)
1317 atime = attr->cat_atime;
1319 if (ctime < attr->cat_ctime)
1320 ctime = attr->cat_ctime;
1322 if (mtime < attr->cat_mtime)
1323 mtime = attr->cat_mtime;
1325 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1326 PFID(&lli->lli_fid), attr->cat_size);
1328 i_size_write(inode, attr->cat_size);
1329 inode->i_blocks = attr->cat_blocks;
1331 inode->i_mtime.tv_sec = mtime;
1332 inode->i_atime.tv_sec = atime;
1333 inode->i_ctime.tv_sec = ctime;
1336 ll_inode_size_unlock(inode);
1342 * Set designated mirror for I/O.
1344 * So far only read, write, and truncate support issuing I/O to a
1345 * designated mirror.
1347 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1349 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1351 /* clear layout version for generic(non-resync) I/O in case it carries
1352 * stale layout version due to I/O restart */
1353 io->ci_layout_version = 0;
1355 /* FLR: disable non-delay for designated mirror I/O because obviously
1356 * only one mirror is available */
1357 if (fd->fd_designated_mirror > 0) {
/* Propagate the fd's designated mirror and its layout version into the io. */
1359 io->ci_designated_mirror = fd->fd_designated_mirror;
1360 io->ci_layout_version = fd->fd_layout_version;
1363 CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n",
1364 file->f_path.dentry->d_name.name, io->ci_designated_mirror);
/*
 * Return true if atime updates are disabled for this open file, its
 * inode, its mount, or (for directories) via the nodiratime flags.
 */
1367 static bool file_is_noatime(const struct file *file)
1369 const struct vfsmount *mnt = file->f_path.mnt;
1370 const struct inode *inode = file_inode((struct file *)file);
1372 /* Adapted from file_accessed() and touch_atime().*/
1373 if (file->f_flags & O_NOATIME)
1376 if (inode->i_flags & S_NOATIME)
1379 if (IS_NOATIME(inode))
1382 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1385 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1388 if ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode))
/*
 * Initialize a cl_io for a read or write on @file: propagate open
 * flags (O_NONBLOCK/O_APPEND/O_SYNC/O_DIRECT) into the io, choose the
 * DLM locking mode, and set FLR mirror/non-delay policy.
 */
1394 void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot,
1395 struct vvp_io_args *args)
1397 struct inode *inode = file_inode(file);
1398 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1400 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1401 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1403 if (iot == CIT_WRITE) {
1404 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1405 io->u.ci_wr.wr_sync = !!(file->f_flags & O_SYNC ||
1406 file->f_flags & O_DIRECT ||
1408 #ifdef HAVE_GENERIC_WRITE_SYNC_2ARGS
/* Newer kernels signal per-iocb sync via IOCB_DSYNC. */
1409 io->u.ci_wr.wr_sync |= !!(args &&
1410 args->via_io_subtype == IO_NORMAL &&
1411 args->u.normal.via_iocb->ki_flags & IOCB_DSYNC);
1415 io->ci_obj = ll_i2info(inode)->lli_clob;
1416 io->ci_lockreq = CILR_MAYBE;
1417 if (ll_file_nolock(file)) {
1418 io->ci_lockreq = CILR_NEVER;
1419 io->ci_no_srvlock = 1;
1420 } else if (file->f_flags & O_APPEND) {
1421 io->ci_lockreq = CILR_MANDATORY;
1423 io->ci_noatime = file_is_noatime(file);
1424 io->ci_async_readahead = false;
1426 /* FLR: only use non-delay I/O for read as there is only one
1427 * available mirror for write. */
1428 io->ci_ndelay = !(iot == CIT_WRITE);
1430 ll_io_set_mirror(io, file);
/*
 * Account one read or write of @count bytes into the inode's file-heat
 * instances (sample count + byte count), unless file heat is disabled
 * globally on the mount or per-inode via LU_HEAT_FLAG_OFF.
 */
1433 static void ll_heat_add(struct inode *inode, enum cl_io_type iot,
1436 struct ll_inode_info *lli = ll_i2info(inode);
1437 struct ll_sb_info *sbi = ll_i2sbi(inode);
1438 enum obd_heat_type sample_type;
1439 enum obd_heat_type iobyte_type;
1440 __u64 now = ktime_get_real_seconds();
1442 if (!ll_sbi_has_file_heat(sbi) ||
1443 lli->lli_heat_flags & LU_HEAT_FLAG_OFF)
1446 if (iot == CIT_READ) {
1447 sample_type = OBD_HEAT_READSAMPLE;
1448 iobyte_type = OBD_HEAT_READBYTE;
1449 } else if (iot == CIT_WRITE) {
1450 sample_type = OBD_HEAT_WRITESAMPLE;
1451 iobyte_type = OBD_HEAT_WRITEBYTE;
/* Both heat instances are updated atomically w.r.t. lli_heat_lock. */
1456 spin_lock(&lli->lli_heat_lock)
1457 obd_heat_add(&lli->lli_heat_instances[sample_type], now, 1,
1458 sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1459 obd_heat_add(&lli->lli_heat_instances[iobyte_type], now, count,
1460 sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1461 spin_unlock(&lli->lli_heat_lock);
/*
 * Common engine behind read/write/splice: set up a cl_io, take the
 * range lock for writes (and O_DIRECT reads), run the client I/O
 * loop, and restart the whole I/O when the layout changed underneath
 * (io->ci_need_restart).  Tracks FLR non-delay retries across
 * restarts and updates per-mount stats, fd_write_failed and file heat.
 */
1465 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1466 struct file *file, enum cl_io_type iot,
1467 loff_t *ppos, size_t count)
1469 struct vvp_io *vio = vvp_env_io(env);
1470 struct inode *inode = file_inode(file);
1471 struct ll_inode_info *lli = ll_i2info(inode);
1472 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1473 struct range_lock range;
1477 unsigned retried = 0;
1478 bool restarted = false;
1482 CDEBUG(D_VFSTRACE, "%s: %s ppos: %llu, count: %zu\n",
1483 file_dentry(file)->d_name.name,
1484 iot == CIT_READ ? "read" : "write", *ppos, count);
1487 io = vvp_env_thread_io(env);
1488 ll_io_init(io, file, iot, args);
/* carry the FLR retry count into the (possibly restarted) io */
1489 io->ci_ndelay_tried = retried;
1491 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1492 bool range_locked = false;
/* O_APPEND writes the EOF, so lock the whole file range. */
1494 if (file->f_flags & O_APPEND)
1495 range_lock_init(&range, 0, LUSTRE_EOF);
1497 range_lock_init(&range, *ppos, *ppos + count - 1);
1499 vio->vui_fd = LUSTRE_FPRIVATE(file);
1500 vio->vui_io_subtype = args->via_io_subtype;
1502 switch (vio->vui_io_subtype) {
1504 vio->vui_iter = args->u.normal.via_iter;
1505 vio->vui_iocb = args->u.normal.via_iocb;
1506 /* Direct IO reads must also take range lock,
1507 * or multiple reads will try to work on the same pages
1508 * See LU-6227 for details. */
1509 if (((iot == CIT_WRITE) ||
1510 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1511 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1512 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1514 rc = range_lock(&lli->lli_write_tree, &range);
1518 range_locked = true;
1522 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1523 vio->u.splice.vui_flags = args->u.splice.via_flags;
1526 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1530 ll_cl_add(file, env, io, LCC_RW);
1531 rc = cl_io_loop(env, io);
1532 ll_cl_remove(file, env);
1535 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1537 range_unlock(&lli->lli_write_tree, &range);
1540 /* cl_io_rw_init() handled IO */
/* accumulate partial progress and advance position for a restart */
1544 if (io->ci_nob > 0) {
1545 result += io->ci_nob;
1546 count -= io->ci_nob;
1547 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1549 /* prepare IO restart */
1550 if (count > 0 && args->via_io_subtype == IO_NORMAL)
1551 args->u.normal.via_iter = vio->vui_iter;
1554 cl_io_fini(env, io);
1557 "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1558 file->f_path.dentry->d_name.name,
1559 iot, rc, result, io->ci_need_restart);
1561 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1563 "%s: restart %s from %lld, count: %zu, ret: %zd, rc: %d\n",
1564 file_dentry(file)->d_name.name,
1565 iot == CIT_READ ? "read" : "write",
1566 *ppos, count, result, rc);
1567 /* preserve the tried count for FLR */
1568 retried = io->ci_ndelay_tried;
1570 if (iot == CIT_READ) {
1575 ll_stats_ops_tally(ll_i2sbi(inode),
1576 LPROC_LL_READ_BYTES, result);
1577 } else if (iot == CIT_WRITE) {
1579 ll_stats_ops_tally(ll_i2sbi(inode),
1580 LPROC_LL_WRITE_BYTES, result);
1581 fd->fd_write_failed = false;
1582 } else if (result == 0 && rc == 0) {
1585 fd->fd_write_failed = true;
1587 fd->fd_write_failed = false;
1588 } else if (rc != -ERESTARTSYS) {
1589 fd->fd_write_failed = true;
1593 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1595 ll_heat_add(inode, iot, result);
1597 RETURN(result > 0 ? result : rc);
1601 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1602 * especially for small I/O.
1604 * To serve a read request, CLIO has to create and initialize a cl_io and
1605 * then request DLM lock. This has turned out to have significant overhead
1606 * and affects the performance of small I/O dramatically.
1608 * It's not necessary to create a cl_io for each I/O. Under the help of read
1609 * ahead, most of the pages being read are already in memory cache and we can
1610 * read those pages directly because if the pages exist, the corresponding DLM
1611 * lock must exist so that page content must be valid.
1613 * In fast read implementation, the llite speculatively finds and reads pages
1614 * in memory cache. There are three scenarios for fast read:
1615 *   - If the page exists and is uptodate, kernel VM will provide the data and
1616 *     CLIO won't be intervened;
1617 *   - If the page was brought into memory by read ahead, it will be exported
1618 *     and read ahead parameters will be updated;
1619 *   - Otherwise the page is not in memory, we can't do fast read. Therefore,
1620 *     it will go back and invoke normal read, i.e., a cl_io will be created
1621 *     and DLM lock will be requested.
1623 * POSIX compliance: posix standard states that read is intended to be atomic.
1624 * Lustre read implementation is in line with Linux kernel read implementation
1625 * and neither of them complies with POSIX standard in this matter. Fast read
1626 * doesn't make the situation worse on single node but it may interleave write
1627 * results from multiple nodes due to short read handling in ll_file_aio_read().
1629 * \param env - lu_env
1630 * \param iocb - kiocb from kernel
1631 * \param iter - user space buffers where the data will be copied
1633 * \retval - number of bytes have been read, or error code if error occurred.
1636 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1640 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1643 /* NB: we can't do direct IO for fast read because it will need a lock
1644 * to make IO engine happy. */
1645 if (iocb->ki_filp->f_flags & O_DIRECT)
/* Let the kernel serve the read straight from the page cache. */
1648 result = generic_file_read_iter(iocb, iter);
1650 /* If the first page is not in cache, generic_file_aio_read() will be
1651 * returned with -ENODATA.
1652 * See corresponding code in ll_readpage(). */
1653 if (result == -ENODATA)
1657 ll_heat_add(file_inode(iocb->ki_filp), CIT_READ, result);
1658 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1659 LPROC_LL_READ_BYTES, result);
1666 * Read from a file (through the page cache).
/*
 * ->read_iter handler: try PCC first, then the page-cache fast path,
 * and finally fall back to the full CLIO path via ll_file_io_generic().
 */
1668 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1671 struct vvp_io_args *args;
1672 struct file *file = iocb->ki_filp;
1676 ktime_t kstart = ktime_get();
1679 if (!iov_iter_count(to))
1683 * Currently when PCC read failed, we do not fall back to the
1684 * normal read path, just return the error.
1685 * The reason is that: for RW-PCC, the file data may be modified
1686 * in the PCC and inconsistent with the data on OSTs (or file
1687 * data has been removed from the Lustre file system), at this
1688 * time, fallback to the normal read path may read the wrong
1690 * TODO: for RO-PCC (readonly PCC), fall back to normal read
1691 * path: read data from data copy on OSTs.
1693 result = pcc_file_read_iter(iocb, to, &cached);
1697 ll_ras_enter(file, iocb->ki_pos, iov_iter_count(to));
/* Fast path may satisfy part or all of the read; stop on error or done. */
1699 result = ll_do_fast_read(iocb, to);
1700 if (result < 0 || iov_iter_count(to) == 0)
1703 env = cl_env_get(&refcheck);
1705 return PTR_ERR(env);
1707 args = ll_env_args(env, IO_NORMAL);
1708 args->u.normal.via_iter = to;
1709 args->u.normal.via_iocb = iocb;
1711 rc2 = ll_file_io_generic(env, args, file, CIT_READ,
1712 &iocb->ki_pos, iov_iter_count(to));
1715 else if (result == 0)
1718 cl_env_put(env, &refcheck);
1721 ll_rw_stats_tally(ll_i2sbi(file_inode(file)), current->pid,
1722 LUSTRE_FPRIVATE(file), iocb->ki_pos, result,
1724 ll_stats_ops_tally(ll_i2sbi(file_inode(file)), LPROC_LL_READ,
1725 ktime_us_delta(ktime_get(), kstart));
1732 * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1733 * If a page is already in the page cache and dirty (and some other things -
1734 * See ll_tiny_write_begin for the instantiation of these rules), then we can
1735 * write to it without doing a full I/O, because Lustre already knows about it
1736 * and will write it out. This saves a lot of processing time.
1738 * All writes here are within one page, so exclusion is handled by the page
1739 * lock on the vm page. We do not do tiny writes for writes which touch
1740 * multiple pages because it's very unlikely multiple sequential pages
1741 * are already dirty.
1743 * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1744 * and are unlikely to be to already dirty pages.
1746 * Attribute updates are important here, we do them in ll_tiny_write_end.
1748 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1750 ssize_t count = iov_iter_count(iter);
1751 struct file *file = iocb->ki_filp;
1752 struct inode *inode = file_inode(file);
1753 bool lock_inode = !IS_NOSEC(inode);
1758 /* Restrict writes to single page and < PAGE_SIZE. See comment at top
1759 * of function for why.
1761 if (count >= PAGE_SIZE ||
1762 (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
/* inode lock needed so __generic_file_write_iter can strip SUID/SGID */
1765 if (unlikely(lock_inode))
1767 result = __generic_file_write_iter(iocb, iter);
1769 if (unlikely(lock_inode))
1770 inode_unlock(inode);
1772 /* If the page is not already dirty, ll_tiny_write_begin returns
1773 * -ENODATA. We continue on to normal write.
1775 if (result == -ENODATA)
1779 ll_heat_add(inode, CIT_WRITE, result);
1780 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1782 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1785 CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1791 * Write to a file (through the page cache).
/*
 * ->write_iter handler: try PCC, then the tiny-write fast path, then
 * the full CLIO path.  Bytes written by a partially-successful tiny
 * write are combined with the normal-path result.
 */
1793 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1795 struct vvp_io_args *args;
1797 ssize_t rc_tiny = 0, rc_normal;
1798 struct file *file = iocb->ki_filp;
1801 ktime_t kstart = ktime_get();
1806 if (!iov_iter_count(from))
1807 GOTO(out, rc_normal = 0);
1810 * When PCC write failed, we usually do not fall back to the normal
1811 * write path, just return the error. But there is a special case when
1812 * returned error code is -ENOSPC due to running out of space on PCC HSM
1813 * backend. At this time, it will fall back to normal I/O path and
1814 * retry the I/O. As the file is in HSM released state, it will restore
1815 * the file data to OSTs first and redo the write again. And the
1816 * restore process will revoke the layout lock and detach the file
1817 * from PCC cache automatically.
1819 result = pcc_file_write_iter(iocb, from, &cached);
1820 if (cached && result != -ENOSPC && result != -EDQUOT)
1821 GOTO(out, rc_normal = result);
1823 /* NB: we can't do direct IO for tiny writes because they use the page
1824 * cache, we can't do sync writes because tiny writes can't flush
1825 * pages, and we can't do append writes because we can't guarantee the
1826 * required DLM locks are held to protect file size.
1828 if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(file))) &&
1829 !(file->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1830 rc_tiny = ll_do_tiny_write(iocb, from);
1832 /* In case of error, go on and try normal write - Only stop if tiny
1833 * write completed I/O.
1835 if (iov_iter_count(from) == 0)
1836 GOTO(out, rc_normal = rc_tiny);
1838 env = cl_env_get(&refcheck);
1840 return PTR_ERR(env);
1842 args = ll_env_args(env, IO_NORMAL);
1843 args->u.normal.via_iter = from;
1844 args->u.normal.via_iocb = iocb;
1846 rc_normal = ll_file_io_generic(env, args, file, CIT_WRITE,
1847 &iocb->ki_pos, iov_iter_count(from));
1849 /* On success, combine bytes written. */
1850 if (rc_tiny >= 0 && rc_normal > 0)
1851 rc_normal += rc_tiny;
1852 /* On error, only return error from normal write if tiny write did not
1853 * write any bytes. Otherwise return bytes written by tiny write.
1855 else if (rc_tiny > 0)
1856 rc_normal = rc_tiny;
1858 cl_env_put(env, &refcheck);
1860 if (rc_normal > 0) {
1861 ll_rw_stats_tally(ll_i2sbi(file_inode(file)), current->pid,
1862 LUSTRE_FPRIVATE(file), iocb->ki_pos,
1864 ll_stats_ops_tally(ll_i2sbi(file_inode(file)), LPROC_LL_WRITE,
1865 ktime_us_delta(ktime_get(), kstart));
1871 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1873 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array: returns 0 and the total byte count in
 * *count, truncating *nr_segs at the first inaccessible segment.
 */
1875 static int ll_file_get_iov_count(const struct iovec *iov,
1876 unsigned long *nr_segs, size_t *count)
1881 for (seg = 0; seg < *nr_segs; seg++) {
1882 const struct iovec *iv = &iov[seg];
1885 * If any segment has a negative length, or the cumulative
1886 * length ever wraps negative then return -EINVAL.
1889 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1891 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1896 cnt -= iv->iov_len; /* This segment is no good */
/*
 * Legacy aio_read entry (pre-read_write_iter kernels): build an
 * iov_iter over the user iovecs and delegate to ll_file_read_iter().
 */
1903 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1904 unsigned long nr_segs, loff_t pos)
1911 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1918 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1919 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1920 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1921 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1922 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1924 result = ll_file_read_iter(iocb, &to);
/*
 * Legacy ->read entry (pre-read_write_iter kernels): wrap the user
 * buffer in a single iovec and a synchronous kiocb, then delegate to
 * ll_file_aio_read().  Updates *ppos from the kiocb on return.
 */
1929 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1932 struct iovec iov = { .iov_base = buf, .iov_len = count };
1941 init_sync_kiocb(&kiocb, file);
1942 kiocb.ki_pos = *ppos;
1943 #ifdef HAVE_KIOCB_KI_LEFT
1944 kiocb.ki_left = count;
1945 #elif defined(HAVE_KI_NBYTES)
/* Fixed: was "kiocb.i_nbytes"; the kiocb member is ki_nbytes
 * (matching the identical branch in ll_file_write() below). */
1946 kiocb.ki_nbytes = count;
1949 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1950 *ppos = kiocb.ki_pos;
1956 * Write to a file (through the page cache).
/*
 * Legacy aio_write entry (pre-read_write_iter kernels): build an
 * iov_iter over the user iovecs and delegate to ll_file_write_iter().
 */
1959 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1960 unsigned long nr_segs, loff_t pos)
1962 struct iov_iter from;
1967 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1974 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1975 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1976 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1977 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1978 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1980 result = ll_file_write_iter(iocb, &from);
/*
 * Legacy ->write entry (pre-read_write_iter kernels): wrap the user
 * buffer in a single iovec and a synchronous kiocb, then delegate to
 * ll_file_aio_write().  Updates *ppos from the kiocb on return.
 */
1985 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1986 size_t count, loff_t *ppos)
1988 struct iovec iov = { .iov_base = (void __user *)buf,
1998 init_sync_kiocb(&kiocb, file);
1999 kiocb.ki_pos = *ppos;
2000 #ifdef HAVE_KIOCB_KI_LEFT
2001 kiocb.ki_left = count;
2002 #elif defined(HAVE_KI_NBYTES)
2003 kiocb.ki_nbytes = count;
2006 result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
2007 *ppos = kiocb.ki_pos;
2011 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
2014 * Send file content (through pagecache) somewhere with helper
/*
 * ->splice_read handler: try PCC first, then run a CIT_READ through
 * ll_file_io_generic() using the IO_SPLICE argument variant.
 */
2016 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
2017 struct pipe_inode_info *pipe, size_t count,
2021 struct vvp_io_args *args;
2028 result = pcc_file_splice_read(in_file, ppos, pipe,
2029 count, flags, &cached);
2033 ll_ras_enter(in_file, *ppos, count);
2035 env = cl_env_get(&refcheck);
2037 RETURN(PTR_ERR(env));
2039 args = ll_env_args(env, IO_SPLICE);
2040 args->u.splice.via_pipe = pipe;
2041 args->u.splice.via_flags = flags;
2043 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
2044 cl_env_put(env, &refcheck);
2047 ll_rw_stats_tally(ll_i2sbi(file_inode(in_file)), current->pid,
2048 LUSTRE_FPRIVATE(in_file), *ppos, result,
/*
 * Set a striping layout (lov EA) on @inode by opening it with the lum
 * attached to the open intent, then immediately releasing the handle.
 * The lum is byte-swapped to little-endian first if needed (the wire
 * and on-disk format is LE).
 */
2053 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
2054 __u64 flags, struct lov_user_md *lum, int lum_size)
2056 struct lookup_intent oit = {
2058 .it_flags = flags | MDS_OPEN_BY_FID,
2063 if ((__swab32(lum->lmm_magic) & le32_to_cpu(LOV_MAGIC_MASK)) ==
2064 le32_to_cpu(LOV_MAGIC_MAGIC)) {
2065 /* this code will only exist for big-endian systems */
2066 lustre_swab_lov_user_md(lum, 0);
2069 ll_inode_size_lock(inode);
2070 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
2072 GOTO(out_unlock, rc);
2074 ll_release_openhandle(dentry, &oit);
2077 ll_inode_size_unlock(inode);
2078 ll_intent_release(&oit);
/*
 * Fetch the lov EA for @filename (relative to @inode) from the MDS.
 * On success *lmmp points into the still-referenced ptlrpc request
 * (caller must keep/finish *request), *lmm_size is the EA size.  The
 * EA is converted from little-endian wire format to host endianness,
 * including per-object entries for plain V1/V3 layouts of regular
 * files.
 */
2083 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
2084 struct lov_mds_md **lmmp, int *lmm_size,
2085 struct ptlrpc_request **request)
2087 struct ll_sb_info *sbi = ll_i2sbi(inode);
2088 struct mdt_body *body;
2089 struct lov_mds_md *lmm = NULL;
2090 struct ptlrpc_request *req = NULL;
2091 struct md_op_data *op_data;
2094 rc = ll_get_default_mdsize(sbi, &lmmsize);
2098 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
2099 strlen(filename), lmmsize,
2100 LUSTRE_OPC_ANY, NULL);
2101 if (IS_ERR(op_data))
2102 RETURN(PTR_ERR(op_data));
2104 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
2105 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
2106 ll_finish_md_op_data(op_data);
2108 CDEBUG(D_INFO, "md_getattr_name failed "
2109 "on %s: rc %d\n", filename, rc);
2113 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2114 LASSERT(body != NULL); /* checked by mdc_getattr_name */
2116 lmmsize = body->mbo_eadatasize;
2118 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
2120 GOTO(out, rc = -ENODATA);
2123 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
2124 LASSERT(lmm != NULL);
/* Only plain, composite and foreign layouts are expected here. */
2126 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
2127 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
2128 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1) &&
2129 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_FOREIGN))
2130 GOTO(out, rc = -EPROTO);
2133 * This is coming from the MDS, so is probably in
2134 * little endian. We convert it to host endian before
2135 * passing it to userspace.
2137 if ((lmm->lmm_magic & __swab32(LOV_MAGIC_MAGIC)) ==
2138 __swab32(LOV_MAGIC_MAGIC)) {
2139 int stripe_count = 0;
2141 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
2142 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2143 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
2144 if (le32_to_cpu(lmm->lmm_pattern) &
2145 LOV_PATTERN_F_RELEASED)
2149 lustre_swab_lov_user_md((struct lov_user_md *)lmm, 0);
2151 /* if function called for directory - we should
2152 * avoid swabbing non-existent lsm objects */
2153 if (lmm->lmm_magic == LOV_MAGIC_V1 && S_ISREG(body->mbo_mode))
2154 lustre_swab_lov_user_md_objects(
2155 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
2157 else if (lmm->lmm_magic == LOV_MAGIC_V3 &&
2158 S_ISREG(body->mbo_mode))
2159 lustre_swab_lov_user_md_objects(
2160 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
2166 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: copy a raw lov EA from userspace and
 * apply it via ll_lov_setstripe_ea_info().  Requires CAP_SYS_ADMIN
 * since the EA carries explicit object ids.
 */
2171 static int ll_lov_setea(struct inode *inode, struct file *file,
2174 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2175 struct lov_user_md *lump;
2176 int lum_size = sizeof(struct lov_user_md) +
2177 sizeof(struct lov_user_ost_data);
2181 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2184 OBD_ALLOC_LARGE(lump, lum_size);
2188 if (copy_from_user(lump, arg, lum_size))
2189 GOTO(out_lump, rc = -EFAULT);
2191 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
2193 cl_lov_delay_create_clear(&file->f_flags);
2196 OBD_FREE_LARGE(lump, lum_size);
/*
 * Copy the inode's striping information to the userspace buffer @lum
 * (at most @size bytes) via the cl_object layer.
 */
2200 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2207 env = cl_env_get(&refcheck);
2209 RETURN(PTR_ERR(env));
2211 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2212 cl_env_put(env, &refcheck);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy the user lum, set the layout,
 * refresh the layout generation, and echo the resulting striping
 * back to userspace.
 */
2216 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2219 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2220 struct lov_user_md *klum;
2222 __u64 flags = FMODE_WRITE;
2225 rc = ll_copy_user_md(lum, &klum);
2230 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
/* zero stripe count back to userspace before re-reading the layout */
2235 rc = put_user(0, &lum->lmm_stripe_count);
2239 rc = ll_layout_refresh(inode, &gen);
2243 rc = ll_file_getstripe(inode, arg, lum_size);
2245 cl_lov_delay_create_clear(&file->f_flags);
2248 OBD_FREE_LARGE(klum, lum_size);
/*
 * LL_IOC_GROUP_LOCK handler: take a group (GROUP mode DLM) lock with
 * gid @arg on @inode for this open file.  Serialized by
 * lli_group_mutex; O_NONBLOCK opens fail with -EAGAIN instead of
 * waiting for a conflicting gid's users to drain.
 */
2254 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2256 struct ll_inode_info *lli = ll_i2info(inode);
2257 struct cl_object *obj = lli->lli_clob;
2258 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2259 struct ll_grouplock grouplock;
2264 CWARN("group id for group lock must not be 0\n");
2268 if (ll_file_nolock(file))
2269 RETURN(-EOPNOTSUPP);
2271 if (file->f_flags & O_NONBLOCK) {
2272 if (!mutex_trylock(&lli->lli_group_mutex))
2275 mutex_lock(&lli->lli_group_mutex);
2277 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2278 CWARN("group lock already existed with gid %lu\n",
2279 fd->fd_grouplock.lg_gid);
2280 GOTO(out, rc = -EINVAL);
/* Different gid already active: wait for its users to go away, retry. */
2282 if (arg != lli->lli_group_gid && lli->lli_group_users != 0) {
2283 if (file->f_flags & O_NONBLOCK)
2284 GOTO(out, rc = -EAGAIN);
2285 mutex_unlock(&lli->lli_group_mutex);
2286 wait_var_event(&lli->lli_group_users, !lli->lli_group_users);
2287 GOTO(retry, rc = 0);
2289 LASSERT(fd->fd_grouplock.lg_lock == NULL);
2292 * XXX: group lock needs to protect all OST objects while PFL
2293 * can add new OST objects during the IO, so we'd instantiate
2294 * all OST objects before getting its group lock.
2299 struct cl_layout cl = {
2300 .cl_is_composite = false,
2302 struct lu_extent ext = {
2304 .e_end = OBD_OBJECT_EOF,
2307 env = cl_env_get(&refcheck);
2309 GOTO(out, rc = PTR_ERR(env));
2311 rc = cl_object_layout_get(env, obj, &cl);
2312 if (!rc && cl.cl_is_composite)
2313 rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2316 cl_env_put(env, &refcheck);
2321 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2322 arg, (file->f_flags & O_NONBLOCK), &grouplock);
2327 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2328 fd->fd_grouplock = grouplock;
2329 if (lli->lli_group_users == 0)
2330 lli->lli_group_gid = grouplock.lg_gid;
2331 lli->lli_group_users++;
2333 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
2335 mutex_unlock(&lli->lli_group_mutex);
/*
 * LL_IOC_GROUP_UNLOCK handler: drop the group lock with gid @arg
 * previously taken on this open file; wakes waiters in
 * ll_get_grouplock() when the last user of the gid goes away.
 */
2340 static int ll_put_grouplock(struct inode *inode, struct file *file,
2343 struct ll_inode_info *lli = ll_i2info(inode);
2344 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2345 struct ll_grouplock grouplock;
2349 mutex_lock(&lli->lli_group_mutex);
2350 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2351 CWARN("no group lock held\n");
2352 GOTO(out, rc = -EINVAL);
2355 LASSERT(fd->fd_grouplock.lg_lock != NULL);
2357 if (fd->fd_grouplock.lg_gid != arg) {
2358 CWARN("group lock %lu doesn't match current id %lu\n",
2359 arg, fd->fd_grouplock.lg_gid);
2360 GOTO(out, rc = -EINVAL);
/* Clear the fd's record before releasing the DLM lock. */
2363 grouplock = fd->fd_grouplock;
2364 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2365 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2367 cl_put_grouplock(&grouplock);
2369 lli->lli_group_users--;
2370 if (lli->lli_group_users == 0) {
2371 lli->lli_group_gid = 0;
2372 wake_up_var(&lli->lli_group_users);
2374 CDEBUG(D_INFO, "group lock %lu released\n", arg);
2377 mutex_unlock(&lli->lli_group_mutex);
2383 * Close inode open handle
2385 * \param dentry [in] dentry which contains the inode
2386 * \param it [in,out] intent which contains open info and result
2389 * \retval <0 failure
2391 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2393 struct inode *inode = dentry->d_inode;
2394 struct obd_client_handle *och;
2400 /* Root ? Do nothing. */
2401 if (dentry->d_inode->i_sb->s_root == dentry)
2404 /* No open handle to close? Move away */
2405 if (!it_disposition(it, DISP_OPEN_OPEN))
2408 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2410 OBD_ALLOC(och, sizeof(*och));
2412 GOTO(out, rc = -ENOMEM);
/* Fill the client handle from the intent, then close it on the MDS. */
2414 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2418 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2420 /* this one is in place of ll_file_open */
2421 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2422 ptlrpc_req_finished(it->it_request);
2423 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2429 * Get size for inode for which FIEMAP mapping is requested.
2430 * Make the FIEMAP get_info call and returns the result.
2431 * \param fiemap	kernel buffer to hold extents
2432 * \param num_bytes	kernel buffer size
2434 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2440 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2443 /* Checks for fiemap flags */
2444 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* Report the unsupported flags back to the caller per FIEMAP ABI. */
2445 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2449 /* Check for FIEMAP_FLAG_SYNC */
2450 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2451 rc = filemap_fdatawrite(inode->i_mapping);
2456 env = cl_env_get(&refcheck);
2458 RETURN(PTR_ERR(env));
/* A zero i_size may just mean the size was never fetched: glimpse it. */
2460 if (i_size_read(inode) == 0) {
2461 rc = ll_glimpse_size(inode);
2466 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2467 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2468 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2470 /* If filesize is 0, then there would be no objects for mapping */
2471 if (fmkey.lfik_oa.o_size == 0) {
2472 fiemap->fm_mapped_extents = 0;
2476 fmkey.lfik_fiemap = *fiemap;
2478 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2479 &fmkey, fiemap, &num_bytes);
2481 cl_env_put(env, &refcheck);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a pathname via the MDC.
 * @arg points to a struct getinfo_fid2path in userspace; the result
 * (including the path) is copied back into the same buffer.
 */
2485 int ll_fid2path(struct inode *inode, void __user *arg)
2487 struct obd_export *exp = ll_i2mdexp(inode);
2488 const struct getinfo_fid2path __user *gfin = arg;
2490 struct getinfo_fid2path *gfout;
2496 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2497 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2500 /* Only need to get the buflen */
2501 if (get_user(pathlen, &gfin->gf_pathlen))
2504 if (pathlen > PATH_MAX)
2507 outsize = sizeof(*gfout) + pathlen;
2508 OBD_ALLOC(gfout, outsize);
2512 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2513 GOTO(gf_free, rc = -EFAULT);
2514 /* append root FID after gfout to let MDT know the root FID so that it
2515 * can lookup the correct path, this is mainly for fileset.
2516 * old server without fileset mount support will ignore this. */
2517 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2519 /* Call mdc_iocontrol */
2520 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2524 if (copy_to_user(arg, gfout, outsize))
2528 OBD_FREE(gfout, outsize);
/*
 * Run a CIT_DATA_VERSION cl_io to fetch the inode's data version and
 * layout version into @ioc.  A file with no cl_object is reported as
 * version 0.  Restarts the io if the layout changed mid-flight.
 */
2533 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2535 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2543 ioc->idv_version = 0;
2544 ioc->idv_layout_version = UINT_MAX;
2546 /* If no file object initialized, we consider its version is 0. */
2550 env = cl_env_get(&refcheck);
2552 RETURN(PTR_ERR(env));
2554 io = vvp_env_thread_io(env);
2556 io->u.ci_data_version.dv_data_version = 0;
2557 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2558 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2561 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2562 result = cl_io_loop(env, io);
2564 result = io->ci_result;
2566 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2567 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2569 cl_io_fini(env, io);
/* layout changed during the io: redo it against the new layout */
2571 if (unlikely(io->ci_need_restart))
2574 cl_env_put(env, &refcheck);
2580 * Read the data_version for inode.
2582 * This value is computed using stripe object version on OST.
2583 * Version is computed using server side locking.
2585 * @param flags if do sync on the OST side;
2587 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2588 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/* Thin wrapper: packs @flags into an ioc_data_version, delegates to
 * ll_ioc_data_version(), and returns only the data version (layout
 * version is ignored here). */
2590 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2592 struct ioc_data_version ioc = { .idv_flags = flags };
2595 rc = ll_ioc_data_version(inode, &ioc);
/* only store the result on success — elided rc check presumed; TODO confirm */
2597 *data_version = ioc.idv_version;
2603 * Trigger a HSM release request for the provided inode.
/*
 * Takes an exclusive write lease (MDS_OPEN_RELEASE), flushes and samples the
 * latest data version (LL_DV_WR_FLUSH drops cached pages under LCK_PW),
 * merges attributes so size/times are current, then closes the open handle
 * with the MDS_HSM_RELEASE intent.  The lease is the guarantee that nobody
 * modified the file between the data-version sample and the release.
 *
 * NOTE(review): error-path lines are elided in this listing.
 */
2605 int ll_hsm_release(struct inode *inode)
2608 struct obd_client_handle *och = NULL;
2609 __u64 data_version = 0;
2614 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2615 ll_i2sbi(inode)->ll_fsname,
2616 PFID(&ll_i2info(inode)->lli_fid));
2618 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2620 GOTO(out, rc = PTR_ERR(och));
2622 /* Grab latest data_version and [am]time values */
2623 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2627 env = cl_env_get(&refcheck);
2629 GOTO(out, rc = PTR_ERR(env));
2631 rc = ll_merge_attr(env, inode);
2632 cl_env_put(env, &refcheck);
2634 /* If error happen, we have the wrong size for a file.
2640 /* Release the file.
2641 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2642 * we still need it to pack l_remote_handle to MDT. */
2643 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
/* on any failure before the close consumed och, drop the lease here */
2649 if (och != NULL && !IS_ERR(och)) /* close the file */
2650 ll_lease_close(och, inode, NULL);
/* Working state for ll_swap_layouts(): the two inodes being swapped plus
 * (in elided fields) their expected data versions, the check_dv flags and
 * the group-lock id. */
2655 struct ll_swap_stack {
2658 struct inode *inode1;
2659 struct inode *inode2;
/*
 * ll_swap_layouts() - atomically exchange the file layouts of two open files
 * (LL_IOC_LOV_SWAP_LAYOUTS).
 *
 * Orders the pair by FID to get a stable locking order, optionally takes a
 * group lock on both files to flush dirty cache, optionally verifies the
 * caller-supplied data versions are still current, then asks the MDT to do
 * the swap via obd_iocontrol with a struct mdc_swap_layouts piggy-backed on
 * md_op_data.
 *
 * NOTE(review): listing elides braces, early RETURNs and some error checks;
 * comments below annotate visible statements only.
 */
2664 static int ll_swap_layouts(struct file *file1, struct file *file2,
2665 struct lustre_swap_layouts *lsl)
2667 struct mdc_swap_layouts msl;
2668 struct md_op_data *op_data;
2671 struct ll_swap_stack *llss = NULL;
2674 OBD_ALLOC_PTR(llss);
2678 llss->inode1 = file_inode(file1);
2679 llss->inode2 = file_inode(file2);
2681 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2685 /* we use 2 bool because it is easier to swap than 2 bits */
2686 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2687 llss->check_dv1 = true;
2689 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2690 llss->check_dv2 = true;
2692 /* we cannot use lsl->sl_dvX directly because we may swap them */
2693 llss->dv1 = lsl->sl_dv1;
2694 llss->dv2 = lsl->sl_dv2;
/* impose a canonical order (by FID) so concurrent swaps cannot deadlock */
2696 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2697 if (rc == 0) /* same file, done! */
2700 if (rc < 0) { /* sequentialize it */
2701 swap(llss->inode1, llss->inode2);
2703 swap(llss->dv1, llss->dv2);
2704 swap(llss->check_dv1, llss->check_dv2);
2708 if (gid != 0) { /* application asks to flush dirty cache */
2709 rc = ll_get_grouplock(llss->inode1, file1, gid);
2713 rc = ll_get_grouplock(llss->inode2, file2, gid);
/* second grouplock failed: release the first before bailing out */
2715 ll_put_grouplock(llss->inode1, file1, gid);
2720 /* ultimate check, before swaping the layouts we check if
2721 * dataversion has changed (if requested) */
2722 if (llss->check_dv1) {
2723 rc = ll_data_version(llss->inode1, &dv, 0);
2726 if (dv != llss->dv1)
2727 GOTO(putgl, rc = -EAGAIN);
2730 if (llss->check_dv2) {
2731 rc = ll_data_version(llss->inode2, &dv, 0);
2734 if (dv != llss->dv2)
2735 GOTO(putgl, rc = -EAGAIN);
2738 /* struct md_op_data is used to send the swap args to the mdt
2739 * only flags is missing, so we use struct mdc_swap_layouts
2740 * through the md_op_data->op_data */
2741 /* flags from user space have to be converted before they are send to
2742 * server, no flag is sent today, they are only used on the client */
2745 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2746 0, LUSTRE_OPC_ANY, &msl);
2747 if (IS_ERR(op_data))
2748 GOTO(free, rc = PTR_ERR(op_data));
2750 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2751 sizeof(*op_data), op_data, NULL);
2752 ll_finish_md_op_data(op_data);
/* putgl: drop grouplocks in reverse acquisition order */
2759 ll_put_grouplock(llss->inode2, file2, gid);
2760 ll_put_grouplock(llss->inode1, file1, gid);
/*
 * ll_hsm_state_set() - apply an HSM set/clear flag mask to a file on the MDT
 * (LL_IOC_HSM_STATE_SET).
 *
 * Validates the requested masks (range, privilege) and, for servers without
 * the archive-id-array feature, the archive id range; then forwards the
 * request via obd_iocontrol with @hss attached to the md_op_data.
 */
2770 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2772 struct obd_export *exp = ll_i2mdexp(inode);
2773 struct md_op_data *op_data;
2777 /* Detect out-of range masks */
2778 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2781 /* Non-root users are forbidden to set or clear flags which are
2782 * NOT defined in HSM_USER_MASK. */
2783 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2784 !cfs_capable(CFS_CAP_SYS_ADMIN))
2787 if (!exp_connect_archive_id_array(exp)) {
2788 /* Detect out-of range archive id */
2789 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2790 (hss->hss_archive_id > LL_HSM_ORIGIN_MAX_ARCHIVE))
2794 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2795 LUSTRE_OPC_ANY, hss);
2796 if (IS_ERR(op_data))
2797 RETURN(PTR_ERR(op_data));
2799 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, exp, sizeof(*op_data),
2802 ll_finish_md_op_data(op_data);
/*
 * ll_hsm_import() - register an already-archived file with HSM
 * (LL_IOC_HSM_IMPORT).
 *
 * Marks the file HS_ARCHIVED|HS_EXISTS|HS_RELEASED in the given archive,
 * then forces the inode's mode/uid/gid/size/times to the values supplied in
 * @hui via ll_setattr_raw().  Regular files only.
 *
 * NOTE(review): allocation checks and cleanup lines are elided here.
 */
2807 static int ll_hsm_import(struct inode *inode, struct file *file,
2808 struct hsm_user_import *hui)
2810 struct hsm_state_set *hss = NULL;
2811 struct iattr *attr = NULL;
2815 if (!S_ISREG(inode->i_mode))
2821 GOTO(out, rc = -ENOMEM);
2823 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2824 hss->hss_archive_id = hui->hui_archive_id;
2825 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2826 rc = ll_hsm_state_set(inode, hss);
2830 OBD_ALLOC_PTR(attr);
2832 GOTO(out, rc = -ENOMEM);
/* only permission bits from userspace; force the regular-file type bit */
2834 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2835 attr->ia_mode |= S_IFREG;
2836 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2837 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2838 attr->ia_size = hui->hui_size;
2839 attr->ia_mtime.tv_sec = hui->hui_mtime;
2840 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2841 attr->ia_atime.tv_sec = hui->hui_atime;
2842 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
/* ATTR_FORCE + *_SET: apply exactly these values, bypassing normal checks */
2844 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2845 ATTR_UID | ATTR_GID |
2846 ATTR_MTIME | ATTR_MTIME_SET |
2847 ATTR_ATIME | ATTR_ATIME_SET;
2851 rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2855 inode_unlock(inode);
2867 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2869 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2870 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * ll_file_futimes_3() - set atime/mtime/ctime with nanosecond precision
 * (LL_IOC_FUTIMES_3), including ctime which plain utimes() cannot set.
 * CAP_SYS_ADMIN only; regular files only.
 *
 * NOTE(review): the struct iattr initializer's opening lines are elided in
 * this listing; visible fields below.
 */
2873 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2875 struct inode *inode = file_inode(file);
2877 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2878 ATTR_MTIME | ATTR_MTIME_SET |
2881 .tv_sec = lfu->lfu_atime_sec,
2882 .tv_nsec = lfu->lfu_atime_nsec,
2885 .tv_sec = lfu->lfu_mtime_sec,
2886 .tv_nsec = lfu->lfu_mtime_nsec,
2889 .tv_sec = lfu->lfu_ctime_sec,
2890 .tv_nsec = lfu->lfu_ctime_nsec,
2896 if (!capable(CAP_SYS_ADMIN))
2899 if (!S_ISREG(inode->i_mode))
/* OP_XVALID_CTIME_SET lets ll_setattr_raw apply the explicit ctime */
2903 rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2905 inode_unlock(inode);
/* Map the userspace lockahead mode enum to the internal cl_lock mode.
 * NOTE(review): the return statements and default case are elided in this
 * listing; presumably MODE_READ_USER -> CLM_READ, MODE_WRITE_USER ->
 * CLM_WRITE, others -> error — TODO confirm against full source. */
2910 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2913 case MODE_READ_USER:
2915 case MODE_WRITE_USER:
/* printable names for the userspace lock modes, used in debug output */
2922 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2924 /* Used to allow the upper layers of the client to request an LDLM lock
2925 * without doing an actual read or write.
2927 * Used for ladvise lockahead to manually request specific locks.
2929 * \param[in] file file this ladvise lock request is on
2930 * \param[in] ladvise ladvise struct describing this lock request
2932 * \retval 0 success, no detailed result available (sync requests
2933 * and requests sent to the server [not handled locally]
2934 * cannot return detailed results)
2935 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2936 * see definitions for details.
2937 * \retval negative negative errno on error
/* NOTE(review): braces and some error-path lines are elided below. */
2939 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2941 struct lu_env *env = NULL;
2942 struct cl_io *io = NULL;
2943 struct cl_lock *lock = NULL;
2944 struct cl_lock_descr *descr = NULL;
2945 struct dentry *dentry = file->f_path.dentry;
2946 struct inode *inode = dentry->d_inode;
2947 enum cl_lock_mode cl_mode;
2948 off_t start = ladvise->lla_start;
2949 off_t end = ladvise->lla_end;
2955 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2956 "start=%llu, end=%llu\n", dentry->d_name.len,
2957 dentry->d_name.name, dentry->d_inode,
2958 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
/* a negative cl_mode here is an errno from cl_mode_user_to_kernel() */
2961 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2963 GOTO(out, result = cl_mode);
2965 /* Get IO environment */
2966 result = cl_io_get(inode, &env, &io, &refcheck);
2970 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2973 * nothing to do for this io. This currently happens when
2974 * stripe sub-object's are not yet created.
2976 result = io->ci_result;
2977 } else if (result == 0) {
2978 lock = vvp_env_lock(env);
2979 descr = &lock->cll_descr;
2981 descr->cld_obj = io->ci_obj;
2982 /* Convert byte offsets to pages */
2983 descr->cld_start = cl_index(io->ci_obj, start);
2984 descr->cld_end = cl_index(io->ci_obj, end);
2985 descr->cld_mode = cl_mode;
2986 /* CEF_MUST is used because we do not want to convert a
2987 * lockahead request to a lockless lock */
2988 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2991 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2992 descr->cld_enq_flags |= CEF_SPECULATIVE;
2994 result = cl_lock_request(env, io, lock);
2996 /* On success, we need to release the lock */
2998 cl_lock_release(env, lock);
3000 cl_io_fini(env, io);
3001 cl_env_put(env, &refcheck);
3003 /* -ECANCELED indicates a matching lock with a different extent
3004 * was already present, and -EEXIST indicates a matching lock
3005 * on exactly the same extent was already present.
3006 * We convert them to positive values for userspace to make
3007 * recognizing true errors easier.
3008 * Note we can only return these detailed results on async requests,
3009 * as sync requests look the same as i/o requests for locking. */
3010 if (result == -ECANCELED)
3011 result = LLA_RESULT_DIFFERENT;
3012 else if (result == -EEXIST)
3013 result = LLA_RESULT_SAME;
/* printable names for the ladvise advice types, used in debug messages */
3018 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
/*
 * ll_ladvise_sanity() - validate one llapi_lu_ladvise entry before it is
 * acted on: known advice value, per-advice flag mask, lock mode (for
 * LOCKAHEAD) and a non-empty byte range (for range-based advices).
 * Returns 0 if valid; logs and returns a negative errno otherwise.
 *
 * NOTE(review): rc assignments, RETURNs and switch braces are elided in
 * this listing.
 */
3020 static int ll_ladvise_sanity(struct inode *inode,
3021 struct llapi_lu_ladvise *ladvise)
3023 struct ll_sb_info *sbi = ll_i2sbi(inode);
3024 enum lu_ladvise_type advice = ladvise->lla_advice;
3025 /* Note the peradvice flags is a 32 bit field, so per advice flags must
3026 * be in the first 32 bits of enum ladvise_flags */
3027 __u32 flags = ladvise->lla_peradvice_flags;
3028 /* 3 lines at 80 characters per line, should be plenty */
3031 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
3033 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
3034 "last supported advice is %s (value '%d'): rc = %d\n",
3035 sbi->ll_fsname, advice,
3036 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
3040 /* Per-advice checks */
3042 case LU_LADVISE_LOCKNOEXPAND:
3043 if (flags & ~LF_LOCKNOEXPAND_MASK) {
3045 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
3046 "rc = %d\n", sbi->ll_fsname, flags,
3047 ladvise_names[advice], rc);
3051 case LU_LADVISE_LOCKAHEAD:
3052 /* Currently only READ and WRITE modes can be requested */
3053 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
3054 ladvise->lla_lockahead_mode == 0) {
3056 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
3057 "rc = %d\n", sbi->ll_fsname,
3058 ladvise->lla_lockahead_mode,
3059 ladvise_names[advice], rc);
3063 case LU_LADVISE_WILLREAD:
3064 case LU_LADVISE_DONTNEED:
3066 /* Note fall through above - These checks apply to all advices
3067 * except LOCKNOEXPAND */
3068 if (flags & ~LF_DEFAULT_MASK) {
3070 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
3071 "rc = %d\n", sbi->ll_fsname, flags,
3072 ladvise_names[advice], rc);
3075 if (ladvise->lla_start >= ladvise->lla_end) {
3077 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
3078 "for %s: rc = %d\n", sbi->ll_fsname,
3079 ladvise->lla_start, ladvise->lla_end,
3080 ladvise_names[advice], rc);
3092 * Give file access advices
3094 * The ladvise interface is similar to Linux fadvise() system call, except it
3095 * forwards the advices directly from Lustre client to server. The server side
3096 * codes will apply appropriate read-ahead and caching techniques for the
3097 * corresponding files.
3099 * A typical workload for ladvise is e.g. a bunch of different clients are
3100 * doing small random reads of a file, so prefetching pages into OSS cache
3101 * with big linear reads before the random IO is a net benefit. Fetching
3102 * all that data into each client cache with fadvise() may not be, due to
3103 * much more data being sent to the client.
/* Runs a CIT_LADVISE cl_io carrying one advice entry down the cl stack. */
3105 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
3106 struct llapi_lu_ladvise *ladvise)
3110 struct cl_ladvise_io *lio;
3115 env = cl_env_get(&refcheck);
3117 RETURN(PTR_ERR(env));
3119 io = vvp_env_thread_io(env);
3120 io->ci_obj = ll_i2info(inode)->lli_clob;
3122 /* initialize parameters for ladvise */
3123 lio = &io->u.ci_ladvise;
3124 lio->li_start = ladvise->lla_start;
3125 lio->li_end = ladvise->lla_end;
3126 lio->li_fid = ll_inode2fid(inode);
3127 lio->li_advice = ladvise->lla_advice;
3128 lio->li_flags = flags;
3130 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
3131 rc = cl_io_loop(env, io);
3135 cl_io_fini(env, io);
3136 cl_env_put(env, &refcheck);
/* Toggle per-file-descriptor "no lock expansion": set unless LF_UNSET is
 * given, in which case the flag is cleared. */
3140 static int ll_lock_noexpand(struct file *file, int flags)
3142 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3144 fd->ll_lock_no_expand = !(flags & LF_UNSET);
/*
 * ll_ioctl_fsgetxattr() - FS_IOC_FSGETXATTR-style handler: report the
 * inode's xflags (derived from i_flags plus the Lustre PROJINHERIT bit)
 * and project quota id to userspace in a struct fsxattr.
 */
3149 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
3152 struct fsxattr fsxattr;
3154 if (copy_from_user(&fsxattr,
3155 (const struct fsxattr __user *)arg,
3159 fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
3160 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
3161 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
3162 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
3163 if (copy_to_user((struct fsxattr __user *)arg,
3164 &fsxattr, sizeof(fsxattr)))
/*
 * ll_ioctl_check_project() - reject project-quota state changes made from
 * inside a user namespace: changing the project id, or toggling the
 * PROJINHERIT flag in either direction, is only permitted from the init
 * namespace.  Everything else is allowed.
 */
3170 int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
3173 * Project Quota ID state is only allowed to change from within the init
3174 * namespace. Enforce that restriction only if we are trying to change
3175 * the quota ID state. Everything else is allowed in user namespaces.
/* init namespace: no restriction — elided RETURN(0) presumed here */
3177 if (current_user_ns() == &init_user_ns)
3180 if (ll_i2info(inode)->lli_projid != fa->fsx_projid)
3183 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) {
3184 if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
3187 if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
/*
 * ll_ioctl_fssetxattr() - FS_IOC_FSSETXATTR-style handler: set the inode's
 * flags and project quota id.  Sends the change to the MDT via md_setattr,
 * updates the local inode flags, then (if the file has a data object)
 * propagates the flags to the OSTs via cl_setattr_ost.
 *
 * NOTE(review): some declarations and error checks are elided below.
 */
3194 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3198 struct md_op_data *op_data;
3199 struct ptlrpc_request *req = NULL;
3201 struct fsxattr fsxattr;
3202 struct cl_object *obj;
3206 if (copy_from_user(&fsxattr,
3207 (const struct fsxattr __user *)arg,
/* user-namespace policy check before any state is touched */
3211 rc = ll_ioctl_check_project(inode, &fsxattr);
3215 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3216 LUSTRE_OPC_ANY, NULL);
3217 if (IS_ERR(op_data))
3218 RETURN(PTR_ERR(op_data));
3220 flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3221 op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3222 if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3223 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3224 op_data->op_projid = fsxattr.fsx_projid;
3225 op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
3226 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3228 ptlrpc_req_finished(req);
3230 GOTO(out_fsxattr, rc);
3231 ll_update_inode_flags(inode, op_data->op_attr_flags);
/* no data object (e.g. released file): nothing to push to the OSTs */
3232 obj = ll_i2info(inode)->lli_clob;
3234 GOTO(out_fsxattr, rc);
3236 OBD_ALLOC_PTR(attr);
3238 GOTO(out_fsxattr, rc = -ENOMEM);
3240 rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3241 fsxattr.fsx_xflags);
3244 ll_finish_md_op_data(op_data);
/*
 * ll_file_unlock_lease() - release the file's lease (LL_LEASE_UNLCK path of
 * LL_IOC_SET_LEASE), optionally with a close intent carried on the lease
 * close: resync-done id list, layout merge, layout split, or PCC attach.
 *
 * Detaches the lease handle from the fd under lli_och_mutex, gathers any
 * intent payload from userspace, closes the lease with that bias, then runs
 * the per-intent cleanup and returns the lease type that was held.
 *
 * NOTE(review): this listing is missing interleaved lines (braces, local
 * declarations such as the __u32 fd/fdv temporaries, and some error
 * checks); comments annotate visible statements only.
 *
 * Fix in this revision: two statements contained the mis-encoded byte
 * sequence "¶m" (HTML-entity residue of "&param"); restored to "&param"
 * so the address of the pcc_param is passed, as sizeof(param.*) and the
 * later param.pa_layout_gen use require.
 */
3248 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3251 struct inode *inode = file_inode(file);
3252 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3253 struct ll_inode_info *lli = ll_i2info(inode);
3254 struct obd_client_handle *och = NULL;
3255 struct split_param sp;
3256 struct pcc_param param;
3257 bool lease_broken = false;
3259 enum mds_op_bias bias = 0;
3260 struct file *layout_file = NULL;
3262 size_t data_size = 0;
3263 bool attached = false;
/* atomically take ownership of the lease handle away from the fd */
3268 mutex_lock(&lli->lli_och_mutex);
3269 if (fd->fd_lease_och != NULL) {
3270 och = fd->fd_lease_och;
3271 fd->fd_lease_och = NULL;
3273 mutex_unlock(&lli->lli_och_mutex);
3278 fmode = och->och_flags;
3280 switch (ioc->lil_flags) {
3281 case LL_LEASE_RESYNC_DONE:
3282 if (ioc->lil_count > IOC_IDS_MAX)
3283 GOTO(out_lease_close, rc = -EINVAL);
3285 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3286 OBD_ALLOC(data, data_size);
3288 GOTO(out_lease_close, rc = -ENOMEM);
3290 if (copy_from_user(data, (void __user *)arg, data_size))
3291 GOTO(out_lease_close, rc = -EFAULT);
3293 bias = MDS_CLOSE_RESYNC_DONE;
3295 case LL_LEASE_LAYOUT_MERGE: {
3298 if (ioc->lil_count != 1)
3299 GOTO(out_lease_close, rc = -EINVAL);
3301 arg += sizeof(*ioc);
/* NOTE(review): "fd" here is an elided local __u32 file descriptor that
 * shadows the outer ll_file_data *fd — confirm against full source */
3302 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3303 GOTO(out_lease_close, rc = -EFAULT);
3305 layout_file = fget(fd);
3307 GOTO(out_lease_close, rc = -EBADF);
3309 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3310 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3311 GOTO(out_lease_close, rc = -EPERM);
3313 data = file_inode(layout_file);
3314 bias = MDS_CLOSE_LAYOUT_MERGE;
3317 case LL_LEASE_LAYOUT_SPLIT: {
3321 if (ioc->lil_count != 2)
3322 GOTO(out_lease_close, rc = -EINVAL);
3324 arg += sizeof(*ioc);
3325 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3326 GOTO(out_lease_close, rc = -EFAULT);
3328 arg += sizeof(__u32);
3329 if (copy_from_user(&mirror_id, (void __user *)arg,
3331 GOTO(out_lease_close, rc = -EFAULT);
3333 layout_file = fget(fdv);
3335 GOTO(out_lease_close, rc = -EBADF);
3337 sp.sp_inode = file_inode(layout_file);
3338 sp.sp_mirror_id = (__u16)mirror_id;
3340 bias = MDS_CLOSE_LAYOUT_SPLIT;
3343 case LL_LEASE_PCC_ATTACH:
3344 if (ioc->lil_count != 1)
3347 arg += sizeof(*ioc);
3348 if (copy_from_user(&param.pa_archive_id, (void __user *)arg,
3350 GOTO(out_lease_close, rc2 = -EFAULT);
3352 rc2 = pcc_readwrite_attach(file, inode, param.pa_archive_id);
3354 GOTO(out_lease_close, rc2);
3357 /* Grab latest data version */
3358 rc2 = ll_data_version(inode, &param.pa_data_version,
3361 GOTO(out_lease_close, rc2);
3364 bias = MDS_PCC_ATTACH;
3367 /* without close intent */
3372 rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3376 rc = ll_lease_och_release(inode, file);
/* per-intent cleanup after the close has been attempted */
3385 switch (ioc->lil_flags) {
3386 case LL_LEASE_RESYNC_DONE:
3388 OBD_FREE(data, data_size);
3390 case LL_LEASE_LAYOUT_MERGE:
3391 case LL_LEASE_LAYOUT_SPLIT:
3395 case LL_LEASE_PCC_ATTACH:
3398 rc = pcc_readwrite_attach_fini(file, inode,
3399 param.pa_layout_gen,
/* report the lease type that was held back to the ioctl caller */
3406 rc = ll_lease_type_from_fmode(fmode);
/*
 * ll_file_set_lease() - LL_IOC_SET_LEASE handler: acquire (or, for
 * LL_LEASE_UNLCK, release) a lease on the file.
 *
 * Validates the requested lease mode against the open mode, optionally
 * starts a layout resync (LL_LEASE_RESYNC), opens the lease on the MDT and
 * stores the handle in fd->fd_lease_och under lli_och_mutex.
 *
 * NOTE(review): braces, RETURNs and some error checks are elided below.
 */
3410 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3413 struct inode *inode = file_inode(file);
3414 struct ll_inode_info *lli = ll_i2info(inode);
3415 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3416 struct obd_client_handle *och = NULL;
3417 __u64 open_flags = 0;
/* a write lease needs FMODE_WRITE, a read lease needs FMODE_READ */
3423 switch (ioc->lil_mode) {
3424 case LL_LEASE_WRLCK:
3425 if (!(file->f_mode & FMODE_WRITE))
3427 fmode = FMODE_WRITE;
3429 case LL_LEASE_RDLCK:
3430 if (!(file->f_mode & FMODE_READ))
3434 case LL_LEASE_UNLCK:
3435 RETURN(ll_file_unlock_lease(file, ioc, arg));
3440 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3442 /* apply for lease */
3443 if (ioc->lil_flags & LL_LEASE_RESYNC)
3444 open_flags = MDS_OPEN_RESYNC;
3445 och = ll_lease_open(inode, file, fmode, open_flags);
3447 RETURN(PTR_ERR(och));
3449 if (ioc->lil_flags & LL_LEASE_RESYNC) {
3450 rc = ll_lease_file_resync(och, inode, arg);
/* resync setup failed: drop the just-acquired lease before returning */
3452 ll_lease_close(och, inode, NULL);
3455 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3457 ll_lease_close(och, inode, NULL);
3463 mutex_lock(&lli->lli_och_mutex);
3464 if (fd->fd_lease_och == NULL) {
3465 fd->fd_lease_och = och;
3468 mutex_unlock(&lli->lli_och_mutex);
3470 /* impossible now that only excl is supported for now */
3471 ll_lease_close(och, inode, &lease_broken);
/*
 * ll_heat_get() - snapshot the inode's access-heat instances into @heat,
 * decayed to "now" using the superblock's decay weight and period.
 * heat->lh_count must already be set by the caller (it bounds the copy).
 */
3477 static void ll_heat_get(struct inode *inode, struct lu_heat *heat)
3479 struct ll_inode_info *lli = ll_i2info(inode);
3480 struct ll_sb_info *sbi = ll_i2sbi(inode);
3481 __u64 now = ktime_get_real_seconds();
3484 spin_lock(&lli->lli_heat_lock);
3485 heat->lh_flags = lli->lli_heat_flags;
3486 for (i = 0; i < heat->lh_count; i++)
3487 heat->lh_heat[i] = obd_heat_get(&lli->lli_heat_instances[i],
3488 now, sbi->ll_heat_decay_weight,
3489 sbi->ll_heat_period_second);
3490 spin_unlock(&lli->lli_heat_lock);
/*
 * ll_heat_set() - adjust per-inode heat tracking under lli_heat_lock:
 * LU_HEAT_FLAG_CLEAR resets all heat instances; LU_HEAT_FLAG_OFF disables
 * tracking (its absence re-enables it).  The two flags are independent.
 */
3493 static int ll_heat_set(struct inode *inode, enum lu_heat_flag flags)
3495 struct ll_inode_info *lli = ll_i2info(inode);
3498 spin_lock(&lli->lli_heat_lock);
3499 if (flags & LU_HEAT_FLAG_CLEAR)
3500 obd_heat_clear(lli->lli_heat_instances, OBD_HEAT_COUNT);
3502 if (flags & LU_HEAT_FLAG_OFF)
3503 lli->lli_heat_flags |= LU_HEAT_FLAG_OFF;
3505 lli->lli_heat_flags &= ~LU_HEAT_FLAG_OFF;
3507 spin_unlock(&lli->lli_heat_lock);
3513 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3515 struct inode *inode = file_inode(file);
3516 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3520 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3521 PFID(ll_inode2fid(inode)), inode, cmd);
3522 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3524 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
3525 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3529 case LL_IOC_GETFLAGS:
3530 /* Get the current value of the file flags */
3531 return put_user(fd->fd_flags, (int __user *)arg);
3532 case LL_IOC_SETFLAGS:
3533 case LL_IOC_CLRFLAGS:
3534 /* Set or clear specific file flags */
3535 /* XXX This probably needs checks to ensure the flags are
3536 * not abused, and to handle any flag side effects.
3538 if (get_user(flags, (int __user *) arg))
3541 if (cmd == LL_IOC_SETFLAGS) {
3542 if ((flags & LL_FILE_IGNORE_LOCK) &&
3543 !(file->f_flags & O_DIRECT)) {
3544 CERROR("%s: unable to disable locking on "
3545 "non-O_DIRECT file\n", current->comm);
3549 fd->fd_flags |= flags;
3551 fd->fd_flags &= ~flags;
3554 case LL_IOC_LOV_SETSTRIPE:
3555 case LL_IOC_LOV_SETSTRIPE_NEW:
3556 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3557 case LL_IOC_LOV_SETEA:
3558 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3559 case LL_IOC_LOV_SWAP_LAYOUTS: {
3561 struct lustre_swap_layouts lsl;
3563 if (copy_from_user(&lsl, (char __user *)arg,
3564 sizeof(struct lustre_swap_layouts)))
3567 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3570 file2 = fget(lsl.sl_fd);
3574 /* O_WRONLY or O_RDWR */
3575 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3576 GOTO(out, rc = -EPERM);
3578 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3579 struct inode *inode2;
3580 struct ll_inode_info *lli;
3581 struct obd_client_handle *och = NULL;
3583 lli = ll_i2info(inode);
3584 mutex_lock(&lli->lli_och_mutex);
3585 if (fd->fd_lease_och != NULL) {
3586 och = fd->fd_lease_och;
3587 fd->fd_lease_och = NULL;
3589 mutex_unlock(&lli->lli_och_mutex);
3591 GOTO(out, rc = -ENOLCK);
3592 inode2 = file_inode(file2);
3593 rc = ll_swap_layouts_close(och, inode, inode2);
3595 rc = ll_swap_layouts(file, file2, &lsl);
3601 case LL_IOC_LOV_GETSTRIPE:
3602 case LL_IOC_LOV_GETSTRIPE_NEW:
3603 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3604 case FS_IOC_GETFLAGS:
3605 case FS_IOC_SETFLAGS:
3606 RETURN(ll_iocontrol(inode, file, cmd, arg));
3607 case FSFILT_IOC_GETVERSION:
3608 case FS_IOC_GETVERSION:
3609 RETURN(put_user(inode->i_generation, (int __user *)arg));
3610 /* We need to special case any other ioctls we want to handle,
3611 * to send them to the MDS/OST as appropriate and to properly
3612 * network encode the arg field. */
3613 case FS_IOC_SETVERSION:
3616 case LL_IOC_GROUP_LOCK:
3617 RETURN(ll_get_grouplock(inode, file, arg));
3618 case LL_IOC_GROUP_UNLOCK:
3619 RETURN(ll_put_grouplock(inode, file, arg));
3620 case IOC_OBD_STATFS:
3621 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3623 case LL_IOC_FLUSHCTX:
3624 RETURN(ll_flush_ctx(inode));
3625 case LL_IOC_PATH2FID: {
3626 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3627 sizeof(struct lu_fid)))
3632 case LL_IOC_GETPARENT:
3633 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3635 case OBD_IOC_FID2PATH:
3636 RETURN(ll_fid2path(inode, (void __user *)arg));
3637 case LL_IOC_DATA_VERSION: {
3638 struct ioc_data_version idv;
3641 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3644 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3645 rc = ll_ioc_data_version(inode, &idv);
3648 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3654 case LL_IOC_GET_MDTIDX: {
3657 mdtidx = ll_get_mdt_idx(inode);
3661 if (put_user((int)mdtidx, (int __user *)arg))
3666 case OBD_IOC_GETDTNAME:
3667 case OBD_IOC_GETMDNAME:
3668 RETURN(ll_get_obd_name(inode, cmd, arg));
3669 case LL_IOC_HSM_STATE_GET: {
3670 struct md_op_data *op_data;
3671 struct hsm_user_state *hus;
3678 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3679 LUSTRE_OPC_ANY, hus);
3680 if (IS_ERR(op_data)) {
3682 RETURN(PTR_ERR(op_data));
3685 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3688 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3691 ll_finish_md_op_data(op_data);
3695 case LL_IOC_HSM_STATE_SET: {
3696 struct hsm_state_set *hss;
3703 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3708 rc = ll_hsm_state_set(inode, hss);
3713 case LL_IOC_HSM_ACTION: {
3714 struct md_op_data *op_data;
3715 struct hsm_current_action *hca;
3722 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3723 LUSTRE_OPC_ANY, hca);
3724 if (IS_ERR(op_data)) {
3726 RETURN(PTR_ERR(op_data));
3729 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3732 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3735 ll_finish_md_op_data(op_data);
3739 case LL_IOC_SET_LEASE_OLD: {
3740 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3742 RETURN(ll_file_set_lease(file, &ioc, 0));
3744 case LL_IOC_SET_LEASE: {
3745 struct ll_ioc_lease ioc;
3747 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3750 RETURN(ll_file_set_lease(file, &ioc, arg));
3752 case LL_IOC_GET_LEASE: {
3753 struct ll_inode_info *lli = ll_i2info(inode);
3754 struct ldlm_lock *lock = NULL;
3757 mutex_lock(&lli->lli_och_mutex);
3758 if (fd->fd_lease_och != NULL) {
3759 struct obd_client_handle *och = fd->fd_lease_och;
3761 lock = ldlm_handle2lock(&och->och_lease_handle);
3763 lock_res_and_lock(lock);
3764 if (!ldlm_is_cancel(lock))
3765 fmode = och->och_flags;
3767 unlock_res_and_lock(lock);
3768 LDLM_LOCK_PUT(lock);
3771 mutex_unlock(&lli->lli_och_mutex);
3773 RETURN(ll_lease_type_from_fmode(fmode));
3775 case LL_IOC_HSM_IMPORT: {
3776 struct hsm_user_import *hui;
3782 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3787 rc = ll_hsm_import(inode, file, hui);
3792 case LL_IOC_FUTIMES_3: {
3793 struct ll_futimes_3 lfu;
3795 if (copy_from_user(&lfu,
3796 (const struct ll_futimes_3 __user *)arg,
3800 RETURN(ll_file_futimes_3(file, &lfu));
3802 case LL_IOC_LADVISE: {
3803 struct llapi_ladvise_hdr *k_ladvise_hdr;
3804 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3807 int alloc_size = sizeof(*k_ladvise_hdr);
3810 u_ladvise_hdr = (void __user *)arg;
3811 OBD_ALLOC_PTR(k_ladvise_hdr);
3812 if (k_ladvise_hdr == NULL)
3815 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3816 GOTO(out_ladvise, rc = -EFAULT);
3818 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3819 k_ladvise_hdr->lah_count < 1)
3820 GOTO(out_ladvise, rc = -EINVAL);
3822 num_advise = k_ladvise_hdr->lah_count;
3823 if (num_advise >= LAH_COUNT_MAX)
3824 GOTO(out_ladvise, rc = -EFBIG);
3826 OBD_FREE_PTR(k_ladvise_hdr);
3827 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3828 lah_advise[num_advise]);
3829 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3830 if (k_ladvise_hdr == NULL)
3834 * TODO: submit multiple advices to one server in a single RPC
3836 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3837 GOTO(out_ladvise, rc = -EFAULT);
3839 for (i = 0; i < num_advise; i++) {
3840 struct llapi_lu_ladvise *k_ladvise =
3841 &k_ladvise_hdr->lah_advise[i];
3842 struct llapi_lu_ladvise __user *u_ladvise =
3843 &u_ladvise_hdr->lah_advise[i];
3845 rc = ll_ladvise_sanity(inode, k_ladvise);
3847 GOTO(out_ladvise, rc);
3849 switch (k_ladvise->lla_advice) {
3850 case LU_LADVISE_LOCKNOEXPAND:
3851 rc = ll_lock_noexpand(file,
3852 k_ladvise->lla_peradvice_flags);
3853 GOTO(out_ladvise, rc);
3854 case LU_LADVISE_LOCKAHEAD:
3856 rc = ll_file_lock_ahead(file, k_ladvise);
3859 GOTO(out_ladvise, rc);
3862 &u_ladvise->lla_lockahead_result))
3863 GOTO(out_ladvise, rc = -EFAULT);
3866 rc = ll_ladvise(inode, file,
3867 k_ladvise_hdr->lah_flags,
3870 GOTO(out_ladvise, rc);
3877 OBD_FREE(k_ladvise_hdr, alloc_size);
3880 case LL_IOC_FLR_SET_MIRROR: {
3881 /* mirror I/O must be direct to avoid polluting page cache
3883 if (!(file->f_flags & O_DIRECT))
3886 fd->fd_designated_mirror = (__u32)arg;
3889 case LL_IOC_FSGETXATTR:
3890 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3891 case LL_IOC_FSSETXATTR:
3892 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3894 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3895 case LL_IOC_HEAT_GET: {
3896 struct lu_heat uheat;
3897 struct lu_heat *heat;
3900 if (copy_from_user(&uheat, (void __user *)arg, sizeof(uheat)))
3903 if (uheat.lh_count > OBD_HEAT_COUNT)
3904 uheat.lh_count = OBD_HEAT_COUNT;
3906 size = offsetof(typeof(uheat), lh_heat[uheat.lh_count]);
3907 OBD_ALLOC(heat, size);
3911 heat->lh_count = uheat.lh_count;
3912 ll_heat_get(inode, heat);
3913 rc = copy_to_user((char __user *)arg, heat, size);
3914 OBD_FREE(heat, size);
3915 RETURN(rc ? -EFAULT : 0);
3917 case LL_IOC_HEAT_SET: {
3920 if (copy_from_user(&flags, (void __user *)arg, sizeof(flags)))
3923 rc = ll_heat_set(inode, flags);
3926 case LL_IOC_PCC_DETACH: {
3927 struct lu_pcc_detach *detach;
3929 OBD_ALLOC_PTR(detach);
3933 if (copy_from_user(detach,
3934 (const struct lu_pcc_detach __user *)arg,
3936 GOTO(out_detach_free, rc = -EFAULT);
3938 if (!S_ISREG(inode->i_mode))
3939 GOTO(out_detach_free, rc = -EINVAL);
3941 if (!inode_owner_or_capable(inode))
3942 GOTO(out_detach_free, rc = -EPERM);
3944 rc = pcc_ioctl_detach(inode, detach->pccd_opt);
3946 OBD_FREE_PTR(detach);
3949 case LL_IOC_PCC_STATE: {
3950 struct lu_pcc_state __user *ustate =
3951 (struct lu_pcc_state __user *)arg;
3952 struct lu_pcc_state *state;
3954 OBD_ALLOC_PTR(state);
3958 if (copy_from_user(state, ustate, sizeof(*state)))
3959 GOTO(out_state, rc = -EFAULT);
3961 rc = pcc_ioctl_state(file, inode, state);
3963 GOTO(out_state, rc);
3965 if (copy_to_user(ustate, state, sizeof(*state)))
3966 GOTO(out_state, rc = -EFAULT);
3969 OBD_FREE_PTR(state);
3973 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3974 (void __user *)arg));
3978 #ifndef HAVE_FILE_LLSEEK_SIZE
/* Local fallback (kernels without generic_file_llseek_size): validate
 * @offset against sign rules and @maxsize, then commit it to f_pos,
 * resetting f_version as the VFS does.  Elided lines presumably return
 * -EINVAL on the failed checks — TODO confirm. */
3979 static inline loff_t
3980 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3982 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3984 if (offset > maxsize)
3987 if (offset != file->f_pos) {
3988 file->f_pos = offset;
3989 file->f_version = 0;
/*
 * Compat copy of the kernel's generic_file_llseek_size(): seek bounded by
 * @maxsize, treating @eof as the file end for SEEK_END/SEEK_DATA/SEEK_HOLE.
 */
3995 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3996 loff_t maxsize, loff_t eof)
3998 struct inode *inode = file_inode(file);
4006 * Here we special-case the lseek(fd, 0, SEEK_CUR)
4007 * position-querying operation. Avoid rewriting the "same"
4008 * f_pos value back to the file because a concurrent read(),
4009 * write() or lseek() might have altered it
4014 * f_lock protects against read/modify/write race with other
4015 * SEEK_CURs. Note that parallel writes and reads behave
/* SEEK_CUR: offset is relative to the current position */
4019 offset = llseek_execute(file, file->f_pos + offset, maxsize);
4020 inode_unlock(inode);
4024 * In the generic case the entire file is data, so as long as
4025 * offset isn't at the end of the file then the offset is data.
4032 * There is a virtual hole at the end of the file, so as long as
4033 * offset isn't i_size or larger, return i_size.
/* default (SEEK_SET after the translations above): commit the offset */
4041 return llseek_execute(file, offset, maxsize);
/*
 * llseek() handler.  For SEEK_END/SEEK_HOLE/SEEK_DATA the file size must be
 * accurate, so glimpse it from the OSTs first, then defer to the generic
 * seek bounded by ll_file_maxbytes().  Elapsed time is tallied in stats.
 */
4045 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
4047 struct inode *inode = file_inode(file);
4048 loff_t retval, eof = 0;
4049 ktime_t kstart = ktime_get();
/* compute the absolute target only for the debug message below */
4052 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
4053 (origin == SEEK_CUR) ? file->f_pos : 0);
4054 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
4055 PFID(ll_inode2fid(inode)), inode, retval, retval,
/* these origins depend on an up-to-date i_size: glimpse from OSTs */
4058 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
4059 retval = ll_glimpse_size(inode);
4062 eof = i_size_read(inode);
4065 retval = ll_generic_file_llseek_size(file, offset, origin,
4066 ll_file_maxbytes(inode), eof);
4068 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK,
4069 ktime_us_delta(ktime_get(), kstart));
/*
 * flush() handler, called on close(2) of each file descriptor.  Reports (once)
 * any asynchronous writeback error recorded against this inode; returns -EIO
 * if such an error is pending and was not already reported to the writer.
 */
4073 static int ll_flush(struct file *file, fl_owner_t id)
4075 struct inode *inode = file_inode(file);
4076 struct ll_inode_info *lli = ll_i2info(inode);
4077 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
/* directories have no cached data pages to flush */
4080 LASSERT(!S_ISDIR(inode->i_mode));
4082 /* catch async errors that were recorded back when async writeback
4083 * failed for pages in this mapping. */
/* consume-and-clear: the error is reported at most once per recording */
4084 rc = lli->lli_async_rc;
4085 lli->lli_async_rc = 0;
4086 if (lli->lli_clob != NULL) {
4087 err = lov_read_and_clear_async_rc(lli->lli_clob);
4092 /* The application has been told write failure already.
4093 * Do not report failure again. */
4094 if (fd->fd_write_failed)
/* any recorded async error is collapsed to -EIO for the caller */
4096 return rc ? -EIO : 0;
4100 * Called to make sure a portion of file has been written out.
4101 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
4103 * Return how many pages have been written.
4105 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
4106 enum cl_fsync_mode mode, int ignore_layout)
4110 struct cl_fsync_io *fio;
/* reject any mode outside the known cl_fsync_mode set */
4115 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
4116 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
4119 env = cl_env_get(&refcheck);
4121 RETURN(PTR_ERR(env));
/* build a CIT_FSYNC io against this inode's cl_object */
4123 io = vvp_env_thread_io(env);
4124 io->ci_obj = ll_i2info(inode)->lli_clob;
4125 io->ci_ignore_layout = ignore_layout;
4127 /* initialize parameters for sync */
4128 fio = &io->u.ci_fsync;
4129 fio->fi_start = start;
4131 fio->fi_fid = ll_inode2fid(inode);
4132 fio->fi_mode = mode;
4133 fio->fi_nr_written = 0;
4135 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
4136 result = cl_io_loop(env, io);
4138 result = io->ci_result;
/* on success report the page count accumulated by the fsync io */
4140 result = fio->fi_nr_written;
4141 cl_io_fini(env, io);
4142 cl_env_put(env, &refcheck);
4148 * When dentry is provided (the 'else' case), file_dentry() may be
4149 * null and dentry must be used directly rather than pulled from
4150 * file_dentry() as is done otherwise.
/*
 * fsync()/fdatasync() handler: wait for local dirty pages, propagate any
 * recorded async writeback error, sync metadata on the MDT, then (for
 * regular files) sync data via PCC or cl_sync_file_range(CL_FSYNC_ALL).
 */
4153 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
4155 struct dentry *dentry = file_dentry(file);
4156 struct inode *inode = dentry->d_inode;
4157 struct ll_inode_info *lli = ll_i2info(inode);
4158 struct ptlrpc_request *req;
4159 ktime_t kstart = ktime_get();
4164 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), start %lld, end %lld,"
4166 PFID(ll_inode2fid(inode)), inode, start, end, datasync);
4168 /* fsync's caller has already called _fdata{sync,write}, we want
4169 * that IO to finish before calling the osc and mdc sync methods */
4170 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
4173 /* catch async errors that were recorded back when async writeback
4174 * failed for pages in this mapping. */
4175 if (!S_ISDIR(inode->i_mode)) {
/* consume-and-clear the per-inode async error */
4176 err = lli->lli_async_rc;
4177 lli->lli_async_rc = 0;
4180 if (lli->lli_clob != NULL) {
4181 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* sync metadata with the MDT before touching data */
4187 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
4191 ptlrpc_req_finished(req);
4193 if (S_ISREG(inode->i_mode)) {
4194 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
4197 /* Sync metadata on MDT first, and then sync the cached data
/* PCC-cached files are synced via the local cache file */
4200 err = pcc_fsync(file, start, end, datasync, &cached);
4202 err = cl_sync_file_range(inode, start, end,
/* first error wins; remember write failure state for ll_flush() */
4204 if (rc == 0 && err < 0)
4207 fd->fd_write_failed = true;
4209 fd->fd_write_failed = false;
4212 inode_unlock(inode);
4215 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC,
4216 ktime_us_delta(ktime_get(), kstart));
/*
 * flock()/fcntl() lock handler: translate the VFS file_lock into an LDLM
 * LDLM_FLOCK enqueue against the MDT, then mirror the result into the
 * local lock tables so the VFS sees a consistent state.
 */
4221 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
4223 struct inode *inode = file_inode(file);
4224 struct ll_sb_info *sbi = ll_i2sbi(inode);
4225 struct ldlm_enqueue_info einfo = {
4226 .ei_type = LDLM_FLOCK,
4227 .ei_cb_cp = ldlm_flock_completion_ast,
4228 .ei_cbdata = file_lock,
4230 struct md_op_data *op_data;
4231 struct lustre_handle lockh = { 0 };
4232 union ldlm_policy_data flock = { { 0 } };
/* remember the requested type: einfo.ei_mode is written into fl_type below */
4233 int fl_type = file_lock->fl_type;
4234 ktime_t kstart = ktime_get();
4240 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
4241 PFID(ll_inode2fid(inode)), file_lock);
4243 if (file_lock->fl_flags & FL_FLOCK) {
4244 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
4245 /* flocks are whole-file locks */
4246 flock.l_flock.end = OFFSET_MAX;
4247 /* For flocks owner is determined by the local file desctiptor*/
4248 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
4249 } else if (file_lock->fl_flags & FL_POSIX) {
/* POSIX locks are byte-range and owned per-process */
4250 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
4251 flock.l_flock.start = file_lock->fl_start;
4252 flock.l_flock.end = file_lock->fl_end;
4256 flock.l_flock.pid = file_lock->fl_pid;
4258 #if defined(HAVE_LM_COMPARE_OWNER) || defined(lm_compare_owner)
4259 /* Somewhat ugly workaround for svc lockd.
4260 * lockd installs custom fl_lmops->lm_compare_owner that checks
4261 * for the fl_owner to be the same (which it always is on local node
4262 * I guess between lockd processes) and then compares pid.
4263 * As such we assign pid to the owner field to make it all work,
4264 * conflict with normal locks is unlikely since pid space and
4265 * pointer space for current->files are not intersecting */
4266 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
4267 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* map fcntl lock type -> LDLM mode: read=PR, write=PW, unlock=NL */
4272 einfo.ei_mode = LCK_PR;
4275 /* An unlock request may or may not have any relation to
4276 * existing locks so we may not be able to pass a lock handle
4277 * via a normal ldlm_lock_cancel() request. The request may even
4278 * unlock a byte range in the middle of an existing lock. In
4279 * order to process an unlock request we need all of the same
4280 * information that is given with a normal read or write record
4281 * lock request. To avoid creating another ldlm unlock (cancel)
4282 * message we'll treat a LCK_NL flock request as an unlock. */
4283 einfo.ei_mode = LCK_NL;
4286 einfo.ei_mode = LCK_PW;
4289 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* non-blocking set requests must not wait on the server either */
4304 flags = LDLM_FL_BLOCK_NOWAIT;
/* F_GETLK-style queries only test for conflicts */
4310 flags = LDLM_FL_TEST_LOCK;
4313 CERROR("unknown fcntl lock command: %d\n", cmd);
4317 /* Save the old mode so that if the mode in the lock changes we
4318 * can decrement the appropriate reader or writer refcount. */
4319 file_lock->fl_type = einfo.ei_mode;
4321 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4322 LUSTRE_OPC_ANY, NULL);
4323 if (IS_ERR(op_data))
4324 RETURN(PTR_ERR(op_data));
4326 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4327 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4328 flock.l_flock.pid, flags, einfo.ei_mode,
4329 flock.l_flock.start, flock.l_flock.end);
/* enqueue the flock lock on the MDT */
4331 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4334 /* Restore the file lock type if not TEST lock. */
4335 if (!(flags & LDLM_FL_TEST_LOCK))
4336 file_lock->fl_type = fl_type;
/* mirror the server's decision into the kernel's local lock tables */
4338 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4339 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4340 !(flags & LDLM_FL_TEST_LOCK))
4341 rc2 = locks_lock_file_wait(file, file_lock);
4343 if ((file_lock->fl_flags & FL_FLOCK) &&
4344 (rc == 0 || file_lock->fl_type == F_UNLCK))
4345 rc2 = flock_lock_file_wait(file, file_lock);
4346 if ((file_lock->fl_flags & FL_POSIX) &&
4347 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4348 !(flags & LDLM_FL_TEST_LOCK))
4349 rc2 = posix_lock_file_wait(file, file_lock);
4350 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
/* local bookkeeping failed: undo the server-side lock with an unlock */
4352 if (rc2 && file_lock->fl_type != F_UNLCK) {
4353 einfo.ei_mode = LCK_NL;
4354 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4359 ll_finish_md_op_data(op_data);
4362 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK,
4363 ktime_us_delta(ktime_get(), kstart));
/*
 * Look up the FID of @name under @parent via a getattr-by-name RPC to the
 * MDT.  On success *@fid is filled in and, when @inode is non-NULL, a new
 * inode is instantiated from the reply.
 */
4367 int ll_get_fid_by_name(struct inode *parent, const char *name,
4368 int namelen, struct lu_fid *fid,
4369 struct inode **inode)
4371 struct md_op_data *op_data = NULL;
4372 struct mdt_body *body;
4373 struct ptlrpc_request *req;
4377 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4378 LUSTRE_OPC_ANY, NULL);
4379 if (IS_ERR(op_data))
4380 RETURN(PTR_ERR(op_data));
/* only FID and type are needed from the reply */
4382 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4383 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4384 ll_finish_md_op_data(op_data);
4388 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4390 GOTO(out_req, rc = -EFAULT);
4392 *fid = body->mbo_fid1;
/* optionally instantiate an inode from the same reply */
4395 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4397 ptlrpc_req_finished(req);
/*
 * Migrate @name under @parent to another MDT (lfs migrate).  Implemented as
 * a rename-to-self with CLI_MIGRATE; regular files additionally take a
 * write lease so the data version can be verified across the move.
 * Retries once per -EAGAIN when the lease was cancelled under us.
 */
4401 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4404 struct dentry *dchild = NULL;
4405 struct inode *child_inode = NULL;
4406 struct md_op_data *op_data;
4407 struct ptlrpc_request *request = NULL;
4408 struct obd_client_handle *och = NULL;
4410 struct mdt_body *body;
4411 __u64 data_version = 0;
4412 size_t namelen = strlen(name);
4413 int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4417 CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4418 PFID(ll_inode2fid(parent)), name,
4419 lum->lum_stripe_offset, lum->lum_stripe_count);
/* the MDT expects the lmv_user_md in little-endian; swab if needed */
4421 if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4422 lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4423 lustre_swab_lmv_user_md(lum);
4425 /* Get child FID first */
4426 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
/* prefer the dcache; fall back to an MDT lookup by name */
4429 dchild = d_lookup(file_dentry(file), &qstr);
4431 if (dchild->d_inode)
4432 child_inode = igrab(dchild->d_inode);
4437 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
/* old MDTs cannot migrate striped directories */
4446 if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
4447 OBD_CONNECT2_DIR_MIGRATE)) {
4448 if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
4449 ll_dir_striped(child_inode)) {
4450 CERROR("%s: MDT doesn't support stripe directory "
4451 "migration!\n", ll_i2sbi(parent)->ll_fsname);
4452 GOTO(out_iput, rc = -EOPNOTSUPP);
4457 * lfs migrate command needs to be blocked on the client
4458 * by checking the migrate FID against the FID of the
/* never migrate the filesystem root */
4461 if (child_inode == parent->i_sb->s_root->d_inode)
4462 GOTO(out_iput, rc = -EINVAL);
4464 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4465 child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4466 if (IS_ERR(op_data))
4467 GOTO(out_iput, rc = PTR_ERR(op_data));
4469 inode_lock(child_inode);
4470 op_data->op_fid3 = *ll_inode2fid(child_inode);
4471 if (!fid_is_sane(&op_data->op_fid3)) {
4472 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4473 ll_i2sbi(parent)->ll_fsname, name,
4474 PFID(&op_data->op_fid3));
4475 GOTO(out_unlock, rc = -EINVAL);
4478 op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4479 op_data->op_data = lum;
4480 op_data->op_data_size = lumlen;
/* regular files: take a write lease and capture the data version so the
 * MDT can detect concurrent modification during migration */
4483 if (S_ISREG(child_inode->i_mode)) {
4484 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4488 GOTO(out_unlock, rc);
4491 rc = ll_data_version(child_inode, &data_version,
4494 GOTO(out_close, rc);
4496 op_data->op_open_handle = och->och_open_handle;
4497 op_data->op_data_version = data_version;
4498 op_data->op_lease_handle = och->och_lease_handle;
4499 op_data->op_bias |= MDS_CLOSE_MIGRATE;
/* the open request must not be replayed across the migration */
4501 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4502 och->och_mod->mod_open_req->rq_replay = 0;
4503 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
/* migration itself is a rename with identical source and target names */
4506 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4507 name, namelen, &request);
4509 LASSERT(request != NULL);
4510 ll_update_times(request, parent);
4513 if (rc == 0 || rc == -EAGAIN) {
4514 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4515 LASSERT(body != NULL);
4517 /* If the server does release layout lock, then we cleanup
4518 * the client och here, otherwise release it in out_close: */
4519 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4520 obd_mod_put(och->och_mod);
4521 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4523 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
4529 if (request != NULL) {
4530 ptlrpc_req_finished(request);
4534 /* Try again if the lease has cancelled. */
4535 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
/* cleanup paths: release lease, drop local nlink, unlock, free op_data */
4540 ll_lease_close(och, child_inode, NULL);
4542 clear_nlink(child_inode);
4544 inode_unlock(child_inode);
4545 ll_finish_md_op_data(op_data);
/*
 * Stub lock handler for '-o noflock' mounts: warn once per file descriptor
 * and refuse the operation (the -ENOSYS return line is elided here).
 */
4552 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4554 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
4558 * In order to avoid flood of warning messages, only print one message
4559 * for one file. And the entire message rate on the client is limited
4560 * by CDEBUG_LIMIT too.
4562 if (!(fd->fd_flags & LL_FILE_FLOCK_WARNING)) {
4563 fd->fd_flags |= LL_FILE_FLOCK_WARNING;
4564 CDEBUG_LIMIT(D_TTY | D_CONSOLE,
4565 "flock disabled, mount with '-o [local]flock' to enable\r\n");
4571 * test if some locks matching bits and l_req_mode are acquired
4572 * - bits can be in different locks
4573 * - if found clear the common lock bits in *bits
4574 * - the bits not found, are kept in *bits
4576 * \param bits [IN] searched lock bits [IN]
4577 * \param l_req_mode [IN] searched lock mode
4578 * \retval boolean, true iff all bits are found
4580 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4582 struct lustre_handle lockh;
4583 union ldlm_policy_data policy;
/* LCK_MINMODE means "any mode": match any of CR/CW/PR/PW */
4584 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4585 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4594 fid = &ll_i2info(inode)->lli_fid;
4595 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4596 ldlm_lockname[mode]);
/* TEST_LOCK: match without taking a reference on the found lock */
4598 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* probe each requested inodebit individually; stop once all are found */
4599 for (i = 0; i < MDS_INODELOCK_NUMBITS && *bits != 0; i++) {
4600 policy.l_inodebits.bits = *bits & (1 << i);
4601 if (policy.l_inodebits.bits == 0)
4604 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4605 &policy, mode, &lockh)) {
4606 struct ldlm_lock *lock;
4608 lock = ldlm_handle2lock(&lockh);
/* clear every bit covered by the matched lock, not just bit i */
4611 ~(lock->l_policy_data.l_inodebits.bits);
4612 LDLM_LOCK_PUT(lock);
4614 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match an existing granted MDS inodebits lock covering @bits with
 * mode @mode; on success *@lockh references the matched lock (caller must
 * release it).  Returns the matched mode, or 0 when nothing matched.
 */
4621 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4622 struct lustre_handle *lockh, __u64 flags,
4623 enum ldlm_mode mode)
4625 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4630 fid = &ll_i2info(inode)->lli_fid;
4631 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4633 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4634 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the result of an inode revalidation RPC: translate -ENOENT
 * on an already-unlinked inode into success (after dropping nlink), and log
 * any other failure.
 */
4639 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4641 /* Already unlinked. Just update nlink and return success */
4642 if (rc == -ENOENT) {
4644 /* If it is striped directory, and there is bad stripe
4645 * Let's revalidate the dentry again, instead of returning
4647 if (ll_dir_striped(inode))
4650 /* This path cannot be hit for regular files unless in
4651 * case of obscure races, so no need to to validate
4653 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4655 } else if (rc != 0) {
/* -EACCES/-EIDRM are expected (permission/identity churn): log quietly */
4656 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4657 "%s: revalidate FID "DFID" error: rc = %d\n",
4658 ll_i2sbi(inode)->ll_fsname,
4659 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate @dentry's inode against the MDT with an intent lock RPC
 * (getattr by FID).  Refreshes cached attributes via the reply and
 * invalidates the dentry if the file was unlinked server-side.
 */
4665 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4667 struct inode *inode = dentry->d_inode;
4668 struct obd_export *exp = ll_i2mdexp(inode);
4669 struct lookup_intent oit = {
4672 struct ptlrpc_request *req = NULL;
4673 struct md_op_data *op_data;
4677 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4678 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4680 /* Call getattr by fid, so do not provide name at all. */
4681 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4682 LUSTRE_OPC_ANY, NULL);
4683 if (IS_ERR(op_data))
4684 RETURN(PTR_ERR(op_data));
4686 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4687 ll_finish_md_op_data(op_data);
/* translate -ENOENT etc. into the caller-visible result */
4689 rc = ll_inode_revalidate_fini(inode, rc);
4693 rc = ll_revalidate_it_finish(req, &oit, dentry);
4695 ll_intent_release(&oit);
4699 /* Unlinked? Unhash dentry, so it is not picked up later by
4700 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4701 * here to preserve get_cwd functionality on 2.6.
4703 if (!dentry->d_inode->i_nlink) {
4704 spin_lock(&inode->i_lock);
4705 d_lustre_invalidate(dentry, 0);
4706 spin_unlock(&inode->i_lock);
4709 ll_lookup_finish_locks(&oit, dentry);
4711 ptlrpc_req_finished(req);
/*
 * For striped directories, merge the per-stripe metadata attributes
 * (nlink, blocks, size, a/m/ctime) from all stripes into the master inode.
 * No-op for non-striped layouts.
 */
4716 static int ll_merge_md_attr(struct inode *inode)
4718 struct ll_inode_info *lli = ll_i2info(inode);
4719 struct cl_attr attr = { 0 };
4722 LASSERT(lli->lli_lsm_md != NULL);
4724 if (!lmv_dir_striped(lli->lli_lsm_md))
/* lli_lsm_sem guards the stripe md against concurrent layout change */
4727 down_read(&lli->lli_lsm_sem);
4728 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4729 &attr, ll_md_blocking_ast);
4730 up_read(&lli->lli_lsm_sem);
4734 set_nlink(inode, attr.cat_nlink);
4735 inode->i_blocks = attr.cat_blocks;
4736 i_size_write(inode, attr.cat_size);
/* cache the merged times in lli; copied into the inode by getattr */
4738 ll_i2info(inode)->lli_atime = attr.cat_atime;
4739 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4740 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/*
 * Core getattr: revalidate the inode with the MDT, refresh size (glimpse
 * from OSTs or PCC for regular files, merged stripe attrs for striped
 * directories), then fill *@stat from the inode.
 */
4745 int ll_getattr_dentry(struct dentry *de, struct kstat *stat)
4747 struct inode *inode = de->d_inode;
4748 struct ll_sb_info *sbi = ll_i2sbi(inode);
4749 struct ll_inode_info *lli = ll_i2info(inode);
4750 ktime_t kstart = ktime_get();
4753 rc = ll_inode_revalidate(de, IT_GETATTR);
4757 if (S_ISREG(inode->i_mode)) {
/* PCC-cached files get their attributes from the local cache copy */
4760 rc = pcc_inode_getattr(inode, &cached);
4761 if (cached && rc < 0)
4764 /* In case of restore, the MDT has the right size and has
4765 * already send it back without granting the layout lock,
4766 * inode is up-to-date so glimpse is useless.
4767 * Also to glimpse we need the layout, in case of a running
4768 * restore the MDT holds the layout lock so the glimpse will
4769 * block up to the end of restore (getattr will block)
4771 if (!cached && !ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4772 rc = ll_glimpse_size(inode);
4777 /* If object isn't regular a file then don't validate size. */
4778 if (ll_dir_striped(inode)) {
4779 rc = ll_merge_md_attr(inode);
/* copy cached timestamps into the inode before reporting them */
4784 inode->i_atime.tv_sec = lli->lli_atime;
4785 inode->i_mtime.tv_sec = lli->lli_mtime;
4786 inode->i_ctime.tv_sec = lli->lli_ctime;
/* fault-injection point for testing delayed getattr */
4789 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
/* 32-bit userland needs a squashed ino and encoded dev numbers */
4791 if (ll_need_32bit_api(sbi)) {
4792 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4793 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4794 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4796 stat->ino = inode->i_ino;
4797 stat->dev = inode->i_sb->s_dev;
4798 stat->rdev = inode->i_rdev;
4801 stat->mode = inode->i_mode;
4802 stat->uid = inode->i_uid;
4803 stat->gid = inode->i_gid;
4804 stat->atime = inode->i_atime;
4805 stat->mtime = inode->i_mtime;
4806 stat->ctime = inode->i_ctime;
/* prefer the tunable stat blocksize when the admin has set one */
4807 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4809 stat->nlink = inode->i_nlink;
4810 stat->size = i_size_read(inode);
4811 stat->blocks = inode->i_blocks;
4813 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR,
4814 ktime_us_delta(ktime_get(), kstart));
/* VFS ->getattr entry point; the prototype differs across kernel versions,
 * both variants just forward to ll_getattr_dentry(). */
4819 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4820 int ll_getattr(const struct path *path, struct kstat *stat,
4821 u32 request_mask, unsigned int flags)
4823 struct dentry *de = path->dentry;
4825 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4828 return ll_getattr_dentry(de, stat);
/*
 * FIEMAP ioctl handler: marshal the kernel's fiemap_extent_info into a
 * struct fiemap sized for fi_extents_max extents, run ll_do_fiemap(), and
 * copy the mapped extents back to the user buffer.
 */
4831 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4832 __u64 start, __u64 len)
4836 struct fiemap *fiemap;
4837 unsigned int extent_count = fieinfo->fi_extents_max;
4839 num_bytes = sizeof(*fiemap) + (extent_count *
4840 sizeof(struct fiemap_extent));
4841 OBD_ALLOC_LARGE(fiemap, num_bytes);
4846 fiemap->fm_flags = fieinfo->fi_flags;
4847 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4848 fiemap->fm_start = start;
4849 fiemap->fm_length = len;
/* seed with the first user extent (may carry FIEMAP_EXTENT_LAST state) */
4850 if (extent_count > 0 &&
4851 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4852 sizeof(struct fiemap_extent)) != 0)
4853 GOTO(out, rc = -EFAULT);
4855 rc = ll_do_fiemap(inode, fiemap, num_bytes);
/* propagate results: flags, mapped count, and the extent array itself */
4857 fieinfo->fi_flags = fiemap->fm_flags;
4858 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4859 if (extent_count > 0 &&
4860 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4861 fiemap->fm_mapped_extents *
4862 sizeof(struct fiemap_extent)) != 0)
4863 GOTO(out, rc = -EFAULT);
4865 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * VFS ->get_acl: return a referenced copy of the cached POSIX ACL.
 * May return NULL when no ACL is cached.
 */
4869 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4871 struct ll_inode_info *lli = ll_i2info(inode);
4872 struct posix_acl *acl = NULL;
/* lli_lock serialises against ACL replacement on attribute refresh */
4875 spin_lock(&lli->lli_lock);
4876 /* VFS' acl_permission_check->check_acl will release the refcount */
4877 acl = posix_acl_dup(lli->lli_posix_acl);
4878 spin_unlock(&lli->lli_lock);
4883 #ifdef HAVE_IOP_SET_ACL
4884 #ifdef CONFIG_LUSTRE_FS_POSIX_ACL
/*
 * VFS ->set_acl: serialise @acl to its xattr form and push it to the MDT
 * via md_setxattr() (NULL @acl removes the xattr).  Updates the local ACL
 * cache on success.
 */
4885 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4887 struct ll_sb_info *sbi = ll_i2sbi(inode);
4888 struct ptlrpc_request *req = NULL;
4889 const char *name = NULL;
4891 size_t value_size = 0;
4896 case ACL_TYPE_ACCESS:
4897 name = XATTR_NAME_POSIX_ACL_ACCESS;
/* an access ACL may imply an i_mode change */
4899 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4902 case ACL_TYPE_DEFAULT:
4903 name = XATTR_NAME_POSIX_ACL_DEFAULT;
/* default ACLs are only meaningful on directories */
4904 if (!S_ISDIR(inode->i_mode))
4905 rc = acl ? -EACCES : 0;
4916 value_size = posix_acl_xattr_size(acl->a_count);
4917 value = kmalloc(value_size, GFP_NOFS);
4919 GOTO(out, rc = -ENOMEM);
4921 rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
4923 GOTO(out_value, rc);
/* NULL value -> OBD_MD_FLXATTRRM removes the ACL xattr on the MDT */
4926 rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4927 value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
4928 name, value, value_size, 0, 0, &req);
4930 ptlrpc_req_finished(req);
/* keep the VFS ACL cache coherent with what the server now holds */
4935 forget_cached_acl(inode, type);
4937 set_cached_acl(inode, type, acl);
4940 #endif /* CONFIG_LUSTRE_FS_POSIX_ACL */
4941 #endif /* HAVE_IOP_SET_ACL */
/*
 * VFS ->permission: revalidate the root inode if needed, apply root-squash
 * by temporarily overriding the task credentials, then defer to
 * generic_permission().
 */
4943 int ll_inode_permission(struct inode *inode, int mask)
4946 struct ll_sb_info *sbi;
4947 struct root_squash_info *squash;
4948 struct cred *cred = NULL;
4949 const struct cred *old_cred = NULL;
4951 bool squash_id = false;
4952 ktime_t kstart = ktime_get();
/* RCU-walk mode cannot block; the RPCs below can */
4955 if (mask & MAY_NOT_BLOCK)
4958 /* as root inode are NOT getting validated in lookup operation,
4959 * need to do it before permission check. */
4961 if (inode == inode->i_sb->s_root->d_inode) {
4962 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4967 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4968 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4970 /* squash fsuid/fsgid if needed */
4971 sbi = ll_i2sbi(inode);
4972 squash = &sbi->ll_squash;
/* squash only applies to root (fsuid 0) and when not disabled per-mount */
4973 if (unlikely(squash->rsi_uid != 0 &&
4974 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4975 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4979 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4980 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4981 squash->rsi_uid, squash->rsi_gid);
4983 /* update current process's credentials
4984 * and FS capability */
4985 cred = prepare_creds();
4989 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4990 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* drop filesystem-related capabilities so squashed root has none */
4991 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4992 if ((1 << cap) & CFS_CAP_FS_MASK)
4993 cap_lower(cred->cap_effective, cap);
4995 old_cred = override_creds(cred);
4998 rc = generic_permission(inode, mask);
4999 /* restore current process's credentials and FS capability */
5001 revert_creds(old_cred);
5006 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM,
5007 ktime_us_delta(ktime_get(), kstart));
5012 /* -o localflock - only provides locally consistent flock locks */
/* Default file_operations: no .flock/.lock, so the kernel falls back to
 * purely local (single-node) lock semantics. */
5013 struct file_operations ll_file_operations = {
5014 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
5015 # ifdef HAVE_SYNC_READ_WRITE
5016 .read = new_sync_read,
5017 .write = new_sync_write,
5019 .read_iter = ll_file_read_iter,
5020 .write_iter = ll_file_write_iter,
5021 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5022 .read = ll_file_read,
5023 .aio_read = ll_file_aio_read,
5024 .write = ll_file_write,
5025 .aio_write = ll_file_aio_write,
5026 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5027 .unlocked_ioctl = ll_file_ioctl,
5028 .open = ll_file_open,
5029 .release = ll_file_release,
5030 .mmap = ll_file_mmap,
5031 .llseek = ll_file_seek,
5032 .splice_read = ll_file_splice_read,
/* file_operations for '-o flock' mounts: cluster-wide coherent flock()
 * and fcntl() locks via ll_file_flock(). */
5037 struct file_operations ll_file_operations_flock = {
5038 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
5039 # ifdef HAVE_SYNC_READ_WRITE
5040 .read = new_sync_read,
5041 .write = new_sync_write,
5042 # endif /* HAVE_SYNC_READ_WRITE */
5043 .read_iter = ll_file_read_iter,
5044 .write_iter = ll_file_write_iter,
5045 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5046 .read = ll_file_read,
5047 .aio_read = ll_file_aio_read,
5048 .write = ll_file_write,
5049 .aio_write = ll_file_aio_write,
5050 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5051 .unlocked_ioctl = ll_file_ioctl,
5052 .open = ll_file_open,
5053 .release = ll_file_release,
5054 .mmap = ll_file_mmap,
5055 .llseek = ll_file_seek,
5056 .splice_read = ll_file_splice_read,
5059 .flock = ll_file_flock,
5060 .lock = ll_file_flock
5063 /* These are for -o noflock - to return ENOSYS on flock calls */
/* Same table as above but with locking routed to the ll_file_noflock stub. */
5064 struct file_operations ll_file_operations_noflock = {
5065 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
5066 # ifdef HAVE_SYNC_READ_WRITE
5067 .read = new_sync_read,
5068 .write = new_sync_write,
5069 # endif /* HAVE_SYNC_READ_WRITE */
5070 .read_iter = ll_file_read_iter,
5071 .write_iter = ll_file_write_iter,
5072 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5073 .read = ll_file_read,
5074 .aio_read = ll_file_aio_read,
5075 .write = ll_file_write,
5076 .aio_write = ll_file_aio_write,
5077 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5078 .unlocked_ioctl = ll_file_ioctl,
5079 .open = ll_file_open,
5080 .release = ll_file_release,
5081 .mmap = ll_file_mmap,
5082 .llseek = ll_file_seek,
5083 .splice_read = ll_file_splice_read,
5086 .flock = ll_file_noflock,
5087 .lock = ll_file_noflock
/* inode_operations for regular files; xattr/ACL entries are conditional on
 * kernel API availability. */
5090 struct inode_operations ll_file_inode_operations = {
5091 .setattr = ll_setattr,
5092 .getattr = ll_getattr,
5093 .permission = ll_inode_permission,
5094 #ifdef HAVE_IOP_XATTR
5095 .setxattr = ll_setxattr,
5096 .getxattr = ll_getxattr,
5097 .removexattr = ll_removexattr,
5099 .listxattr = ll_listxattr,
5100 .fiemap = ll_fiemap,
5101 #ifdef HAVE_IOP_GET_ACL
5102 .get_acl = ll_get_acl,
5104 #ifdef HAVE_IOP_SET_ACL
5105 .set_acl = ll_set_acl,
/*
 * Apply a layout configuration to the inode's cl_object.  For OBJECT_CONF_SET
 * the new layout lock is made matchable only after the layout is applied,
 * and the cached layout generation is refreshed.
 */
5109 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
5111 struct ll_inode_info *lli = ll_i2info(inode);
5112 struct cl_object *obj = lli->lli_clob;
5121 env = cl_env_get(&refcheck);
5123 RETURN(PTR_ERR(env));
5125 rc = cl_conf_set(env, lli->lli_clob, conf);
5129 if (conf->coc_opc == OBJECT_CONF_SET) {
5130 struct ldlm_lock *lock = conf->coc_lock;
5131 struct cl_layout cl = {
5135 LASSERT(lock != NULL);
5136 LASSERT(ldlm_has_layout(lock));
5138 /* it can only be allowed to match after layout is
5139 * applied to inode otherwise false layout would be
5140 * seen. Applying layout shoud happen before dropping
5141 * the intent lock. */
5142 ldlm_lock_allow_match(lock);
/* record the new layout generation for version tracking */
5144 rc = cl_object_layout_get(env, obj, &cl);
5149 DFID": layout version change: %u -> %u\n",
5150 PFID(&lli->lli_fid), ll_layout_version_get(lli),
5152 ll_layout_version_set(lli, cl.cl_layout_gen);
5156 cl_env_put(env, &refcheck);
5161 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/*
 * Populate @lock->l_lvb_data with the file's LOV layout.  If the layout did
 * not arrive in the enqueue reply LVB, fetch it with a trusted.lov getxattr
 * RPC and attach a copy to the lock (racing fetchers may lose and free).
 */
5162 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
5165 struct ll_sb_info *sbi = ll_i2sbi(inode);
5166 struct ptlrpc_request *req;
5173 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
5174 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
5175 lock->l_lvb_data, lock->l_lvb_len);
/* layout already attached to the lock: nothing to fetch */
5177 if (lock->l_lvb_data != NULL)
5180 /* if layout lock was granted right away, the layout is returned
5181 * within DLM_LVB of dlm reply; otherwise if the lock was ever
5182 * blocked and then granted via completion ast, we have to fetch
5183 * layout here. Please note that we can't use the LVB buffer in
5184 * completion AST because it doesn't have a large enough buffer */
5185 rc = ll_get_default_mdsize(sbi, &lmmsize);
5189 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), OBD_MD_FLXATTR,
5190 XATTR_NAME_LOV, lmmsize, &req);
5193 GOTO(out, rc = 0); /* empty layout */
5200 if (lmmsize == 0) /* empty layout */
5203 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
5205 GOTO(out, rc = -EFAULT);
5207 OBD_ALLOC_LARGE(lvbdata, lmmsize);
5208 if (lvbdata == NULL)
5209 GOTO(out, rc = -ENOMEM);
5211 memcpy(lvbdata, lmm, lmmsize);
/* attach under the resource lock; another thread may have won the race */
5212 lock_res_and_lock(lock);
5213 if (unlikely(lock->l_lvb_data == NULL)) {
5214 lock->l_lvb_type = LVB_T_LAYOUT;
5215 lock->l_lvb_data = lvbdata;
5216 lock->l_lvb_len = lmmsize;
5219 unlock_res_and_lock(lock);
/* we lost the race: free our copy */
5222 OBD_FREE_LARGE(lvbdata, lmmsize);
5227 ptlrpc_req_finished(req);
5232 * Apply the layout to the inode. Layout lock is held and will be released
/*
 * Given a granted layout lock referenced by @lockh, fetch the layout (if
 * needed) and configure it into the inode's cl_object.  If the object is
 * busy (-EBUSY) wait for outstanding IO via OBJECT_CONF_WAIT after
 * releasing the lock.
 */
5235 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
5236 struct inode *inode)
5238 struct ll_inode_info *lli = ll_i2info(inode);
5239 struct ll_sb_info *sbi = ll_i2sbi(inode);
5240 struct ldlm_lock *lock;
5241 struct cl_object_conf conf;
5244 bool wait_layout = false;
5247 LASSERT(lustre_handle_is_used(lockh));
5249 lock = ldlm_handle2lock(lockh);
5250 LASSERT(lock != NULL);
5251 LASSERT(ldlm_has_layout(lock));
5253 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
5254 PFID(&lli->lli_fid), inode);
5256 /* in case this is a caching lock and reinstate with new inode */
5257 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
5259 lock_res_and_lock(lock);
5260 lvb_ready = ldlm_is_lvb_ready(lock);
5261 unlock_res_and_lock(lock);
5263 /* checking lvb_ready is racy but this is okay. The worst case is
5264 * that multi processes may configure the file on the same time. */
5268 rc = ll_layout_fetch(inode, lock);
5272 /* for layout lock, lmm is stored in lock's lvb.
5273 * lvb_data is immutable if the lock is held so it's safe to access it
5276 * set layout to file. Unlikely this will fail as old layout was
5277 * surely eliminated */
5278 memset(&conf, 0, sizeof conf);
5279 conf.coc_opc = OBJECT_CONF_SET;
5280 conf.coc_inode = inode;
5281 conf.coc_lock = lock;
5282 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
5283 conf.u.coc_layout.lb_len = lock->l_lvb_len;
5284 rc = ll_layout_conf(inode, &conf);
5286 /* refresh layout failed, need to wait */
5287 wait_layout = rc == -EBUSY;
/* release our lock reference before any potentially long wait */
5290 LDLM_LOCK_PUT(lock);
5291 ldlm_lock_decref(lockh, mode);
5293 /* wait for IO to complete if it's still being used. */
5295 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
5296 sbi->ll_fsname, PFID(&lli->lli_fid), inode);
5298 memset(&conf, 0, sizeof conf);
5299 conf.coc_opc = OBJECT_CONF_WAIT;
5300 conf.coc_inode = inode;
5301 rc = ll_layout_conf(inode, &conf);
5305 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
5306 sbi->ll_fsname, PFID(&lli->lli_fid), rc);
5312 * Issue layout intent RPC to MDS.
5313 * \param inode [in] file inode
5314 * \param intent [in] layout intent
5316 * \retval 0 on success
5317 * \retval < 0 error code
/*
 * ll_layout_intent(): issue a layout intent RPC to the MDS for @inode,
 * piggybacking @intent as the op_data payload of an IT_LAYOUT intent lock
 * request.
 *
 * \param inode  [in] file inode
 * \param intent [in] layout intent describing the operation (access/
 *                    write/truncate — opc checked below)
 *
 * \retval 0 on success, negative errno otherwise.
 *
 * NOTE(review): numbering gaps (5320, 5326-5328, 5333, 5336, 5342, 5345,
 * 5351, 5353, 5355, 5357, 5359-5361) show missing lines here (opening
 * brace, `int rc;`, an `if (rc == 0)` guard around ll_set_lock_data,
 * RETURN) — confirm against the upstream file.
 */
5319 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
5321 struct ll_inode_info *lli = ll_i2info(inode);
5322 struct ll_sb_info *sbi = ll_i2sbi(inode);
5323 struct md_op_data *op_data;
5324 struct lookup_intent it;
5325 struct ptlrpc_request *req;
5329 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
5330 0, 0, LUSTRE_OPC_ANY, NULL);
5331 if (IS_ERR(op_data))
5332 RETURN(PTR_ERR(op_data));
/* ship the layout intent to the MDS inside the intent-lock op_data */
5334 op_data->op_data = intent;
5335 op_data->op_data_size = sizeof(*intent);
5337 memset(&it, 0, sizeof(it));
5338 it.it_op = IT_LAYOUT;
/* write/truncate intents need FMODE_WRITE so the MDS instantiates
 * writable layout components */
5339 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
5340 intent->li_opc == LAYOUT_INTENT_TRUNC)
5341 it.it_flags = FMODE_WRITE;
5343 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
5344 sbi->ll_fsname, PFID(&lli->lli_fid), inode);
5346 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
5347 &ll_md_blocking_ast, 0);
/* the request embedded in the intent is no longer needed */
5348 if (it.it_request != NULL)
5349 ptlrpc_req_finished(it.it_request);
5350 it.it_request = NULL;
5352 ll_finish_md_op_data(op_data);
5354 /* set lock data in case this is a new lock */
5356 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
5358 ll_intent_drop_lock(&it);
5364 * This function checks if there exists a LAYOUT lock on the client side,
5365 * or enqueues it if it doesn't have one in cache.
5367 * This function will not hold layout lock so it may be revoked any time after
5368 * this function returns. Any operations that depend on the layout should be redone
5371 * This function should be called before lov_io_init() to get an uptodate
5372 * layout version, the caller should save the version number and after IO
5373 * is finished, this function should be called again to verify that layout
5374 * is not changed during IO time.
/*
 * ll_layout_refresh(): make sure the client holds an up-to-date layout
 * for @inode, enqueuing a LAYOUT lock via ll_layout_intent() if no valid
 * generation is cached; returns the current layout generation in *gen.
 *
 * \param inode [in]  regular file inode (asserted below)
 * \param gen   [out] current layout generation number
 *
 * \retval 0 on success, negative errno otherwise.
 *
 * NOTE(review): numbering gaps (5377, 5383, 5385-5387, 5390-5392, 5395,
 * 5398-5399, 5404, 5407-5411, 5413-5417, 5420-5422) show missing lines
 * (e.g. the LCK_PW/retry-loop portion of the lock-mode mask at 5404,
 * the `if (rc == -EAGAIN)` retry and `out:` label) — confirm against
 * the upstream file.
 */
5376 int ll_layout_refresh(struct inode *inode, __u32 *gen)
5378 struct ll_inode_info *lli = ll_i2info(inode);
5379 struct ll_sb_info *sbi = ll_i2sbi(inode);
5380 struct lustre_handle lockh;
5381 struct layout_intent intent = {
5382 .li_opc = LAYOUT_INTENT_ACCESS,
5384 enum ldlm_mode mode;
/* fast path: a valid cached generation means no lock traffic needed */
5388 *gen = ll_layout_version_get(lli);
5389 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
5393 LASSERT(fid_is_sane(ll_inode2fid(inode)));
5394 LASSERT(S_ISREG(inode->i_mode));
5396 /* take layout lock mutex to enqueue layout lock exclusively. */
5397 mutex_lock(&lli->lli_layout_mutex);
5400 /* mostly layout lock is caching on the local side, so try to
5401 * match it before grabbing layout lock mutex. */
5402 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5403 LCK_CR | LCK_CW | LCK_PR |
5405 if (mode != 0) { /* hit cached lock */
5406 rc = ll_layout_lock_set(&lockh, mode, inode);
/* no cached lock: go to the MDS with an ACCESS layout intent */
5412 rc = ll_layout_intent(inode, &intent);
/* re-read the generation after the layout was (possibly) refreshed */
5418 *gen = ll_layout_version_get(lli);
5419 mutex_unlock(&lli->lli_layout_mutex);
5425 * Issue layout intent RPC indicating where in a file an IO is about to write.
5427 * \param[in] inode file inode.
5428 * \param[in] ext write range with start offset of file in bytes where
5429 * an IO is about to write, and exclusive end offset in
5432 * \retval 0 on success
5433 * \retval < 0 error code
/*
 * ll_layout_write_intent(): send a layout intent RPC telling the MDS
 * which byte range [ext->e_start, ext->e_end) an IO is about to modify,
 * so the matching layout components can be instantiated.
 *
 * \param inode [in] file inode
 * \param opc   [in] layout intent opcode (write/truncate)
 * \param ext   [in] extent the IO will cover
 *
 * \retval 0 on success, negative errno otherwise.
 *
 * NOTE(review): the numbering gap at 5439 strongly suggests the
 * `.li_opc = opc,` initializer is missing from this extraction (without
 * it @opc would be unused) — confirm against the upstream file.
 */
5435 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5436 struct lu_extent *ext)
5438 struct layout_intent intent = {
5440 .li_extent.e_start = ext->e_start,
5441 .li_extent.e_end = ext->e_end,
5446 rc = ll_layout_intent(inode, &intent);
5452 * This function send a restore request to the MDT
5454 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5456 struct hsm_user_request *hur;
5460 len = sizeof(struct hsm_user_request) +
5461 sizeof(struct hsm_user_item);
5462 OBD_ALLOC(hur, len);
5466 hur->hur_request.hr_action = HUA_RESTORE;
5467 hur->hur_request.hr_archive_id = 0;
5468 hur->hur_request.hr_flags = 0;
5469 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5470 sizeof(hur->hur_user_item[0].hui_fid));
5471 hur->hur_user_item[0].hui_extent.offset = offset;
5472 hur->hur_user_item[0].hui_extent.length = length;
5473 hur->hur_request.hr_itemcount = 1;
5474 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,