4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
49 #include <uapi/linux/lustre/lustre_ioctl.h>
50 #include <lustre_swab.h>
52 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
/* NOTE(review): the lines below are fragments of larger declarations whose
 * surrounding lines are not visible in this extract — sp_inode presumably
 * belongs to struct split_param, pa_data_version to struct pcc_param, and
 * the two prototypes are forward declarations continued elsewhere. Confirm
 * against the full file. */
57 struct inode *sp_inode;
62 __u64 pa_data_version;
68 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
70 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
/* Allocate a per-open ll_file_data from ll_file_data_slab (GFP_NOFS to
 * avoid filesystem recursion) and initialize its write-failure flag and
 * PCC sub-structure.
 * NOTE(review): the allocation-failure check and return statement are not
 * visible in this extract. */
73 static struct ll_file_data *ll_file_data_get(void)
75 struct ll_file_data *fd;
77 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
81 fd->fd_write_failed = false;
82 pcc_file_init(&fd->fd_pcc_file);
/* Return a ll_file_data obtained from ll_file_data_get() to its slab. */
87 static void ll_file_data_put(struct ll_file_data *fd)
90 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
94 * Packs all the attributes into @op_data for the CLOSE rpc.
96 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
97 struct obd_client_handle *och)
101 ll_prep_md_op_data(op_data, inode, NULL, NULL,
102 0, 0, LUSTRE_OPC_ANY, NULL);
/* Snapshot the inode's current attributes into the close request. */
104 op_data->op_attr.ia_mode = inode->i_mode;
105 op_data->op_attr.ia_atime = inode->i_atime;
106 op_data->op_attr.ia_mtime = inode->i_mtime;
107 op_data->op_attr.ia_ctime = inode->i_ctime;
108 op_data->op_attr.ia_size = i_size_read(inode);
109 op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
110 ATTR_MTIME | ATTR_MTIME_SET |
112 op_data->op_xvalid |= OP_XVALID_CTIME_SET;
113 op_data->op_attr_blocks = inode->i_blocks;
114 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
/* Propagate the project-inherit flag so the MDT keeps it on close. */
115 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
116 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
117 op_data->op_open_handle = och->och_open_handle;
119 if (och->och_flags & FMODE_WRITE &&
120 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
121 /* For HSM: if inode data has been modified, pack it so that
122 * MDT can set data dirty flag in the archive. */
123 op_data->op_bias |= MDS_DATA_MODIFIED;
129 * Perform a close, possibly with a bias.
130 * The meaning of "data" depends on the value of "bias".
132 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
133 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
136 static int ll_close_inode_openhandle(struct inode *inode,
137 struct obd_client_handle *och,
138 enum mds_op_bias bias, void *data)
140 struct obd_export *md_exp = ll_i2mdexp(inode);
141 const struct ll_inode_info *lli = ll_i2info(inode);
142 struct md_op_data *op_data;
143 struct ptlrpc_request *req = NULL;
/* Bail out early if the MDC export is no longer connected. */
147 if (class_exp2obd(md_exp) == NULL) {
148 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
149 ll_i2sbi(inode)->ll_fsname, PFID(&lli->lli_fid));
153 OBD_ALLOC_PTR(op_data);
154 /* We leak openhandle and request here on error, but not much to be
155 * done in OOM case since app won't retry close on error either. */
157 GOTO(out, rc = -ENOMEM);
159 ll_prepare_close(inode, op_data, och);
/* NOTE(review): the switch(bias) header and the per-case break statements
 * are not visible in this extract; the cases below pack bias-specific
 * payload into op_data before the close RPC. */
161 case MDS_CLOSE_LAYOUT_MERGE:
162 /* merge blocks from the victim inode */
163 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
164 op_data->op_attr.ia_valid |= ATTR_SIZE;
165 op_data->op_xvalid |= OP_XVALID_BLOCKS;
167 case MDS_CLOSE_LAYOUT_SPLIT:
168 case MDS_CLOSE_LAYOUT_SWAP: {
169 struct split_param *sp = data;
171 LASSERT(data != NULL);
172 op_data->op_bias |= bias;
173 op_data->op_data_version = 0;
174 op_data->op_lease_handle = och->och_lease_handle;
/* For SPLIT, data is a split_param carrying the victim inode + mirror id;
 * for SWAP, data is the victim inode itself. */
175 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
176 op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
177 op_data->op_mirror_id = sp->sp_mirror_id;
179 op_data->op_fid2 = *ll_inode2fid(data);
184 case MDS_CLOSE_RESYNC_DONE: {
185 struct ll_ioc_lease *ioc = data;
187 LASSERT(data != NULL);
188 op_data->op_attr_blocks +=
189 ioc->lil_count * op_data->op_attr_blocks;
190 op_data->op_attr.ia_valid |= ATTR_SIZE;
191 op_data->op_xvalid |= OP_XVALID_BLOCKS;
192 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
194 op_data->op_lease_handle = och->och_lease_handle;
195 op_data->op_data = &ioc->lil_ids[0];
196 op_data->op_data_size =
197 ioc->lil_count * sizeof(ioc->lil_ids[0]);
201 case MDS_PCC_ATTACH: {
202 struct pcc_param *param = data;
204 LASSERT(data != NULL);
205 op_data->op_bias |= MDS_HSM_RELEASE | MDS_PCC_ATTACH;
206 op_data->op_archive_id = param->pa_archive_id;
207 op_data->op_data_version = param->pa_data_version;
208 op_data->op_lease_handle = och->och_lease_handle;
212 case MDS_HSM_RELEASE:
213 LASSERT(data != NULL);
214 op_data->op_bias |= MDS_HSM_RELEASE;
215 op_data->op_data_version = *(__u64 *)data;
216 op_data->op_lease_handle = och->och_lease_handle;
217 op_data->op_attr.ia_valid |= ATTR_SIZE;
218 op_data->op_xvalid |= OP_XVALID_BLOCKS;
222 LASSERT(data == NULL);
/* Sizes/blocks not explicitly packed are fetched lazily by the server. */
226 if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
227 op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
228 if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
229 op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;
231 rc = md_close(md_exp, op_data, och->och_mod, &req);
232 if (rc != 0 && rc != -EINTR)
233 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
234 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* On success, verify the server actually executed the close intent. */
236 if (rc == 0 && op_data->op_bias & bias) {
237 struct mdt_body *body;
239 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
240 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
243 if (bias & MDS_PCC_ATTACH) {
244 struct pcc_param *param = data;
246 param->pa_layout_gen = body->mbo_layout_gen;
250 ll_finish_md_op_data(op_data);
254 md_clear_open_replay_data(md_exp, och);
/* Poison the handle so a stale reuse is detectable. */
255 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
258 ptlrpc_req_finished(req); /* This is close request */
/* Close the per-mode (read/write/exec) MDS open handle cached on the inode,
 * but only when no other local opens of that mode remain.
 * NOTE(review): several lines (och_usecount declaration, handle swap-out,
 * early-return paths) are not visible in this extract. */
262 int ll_md_real_close(struct inode *inode, fmode_t fmode)
264 struct ll_inode_info *lli = ll_i2info(inode);
265 struct obd_client_handle **och_p;
266 struct obd_client_handle *och;
/* Select the cached handle and its use counter by open mode. */
271 if (fmode & FMODE_WRITE) {
272 och_p = &lli->lli_mds_write_och;
273 och_usecount = &lli->lli_open_fd_write_count;
274 } else if (fmode & FMODE_EXEC) {
275 och_p = &lli->lli_mds_exec_och;
276 och_usecount = &lli->lli_open_fd_exec_count;
278 LASSERT(fmode & FMODE_READ);
279 och_p = &lli->lli_mds_read_och;
280 och_usecount = &lli->lli_open_fd_read_count;
283 mutex_lock(&lli->lli_och_mutex);
284 if (*och_usecount > 0) {
285 /* There are still users of this handle, so skip
287 mutex_unlock(&lli->lli_och_mutex);
293 mutex_unlock(&lli->lli_och_mutex);
296 /* There might be a race and this handle may already
298 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Per-file-descriptor close: drop group lock and lease if held, decrement
 * the per-mode open counter, and talk to the MDS only when no cached OPEN
 * DLM lock lets us skip the RPC. Frees the ll_file_data at the end.
 * NOTE(review): lockmode assignment and some braces/returns are not
 * visible in this extract. */
304 static int ll_md_close(struct inode *inode, struct file *file)
306 union ldlm_policy_data policy = {
307 .l_inodebits = { MDS_INODELOCK_OPEN },
309 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
310 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
311 struct ll_inode_info *lli = ll_i2info(inode);
312 struct lustre_handle lockh;
313 enum ldlm_mode lockmode;
317 /* clear group lock, if present */
318 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
319 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
321 if (fd->fd_lease_och != NULL) {
324 /* Usually the lease is not released when the
325 * application crashed, we need to release here. */
326 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
327 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
328 PFID(&lli->lli_fid), rc, lease_broken);
330 fd->fd_lease_och = NULL;
/* fd_och holds an open handle taken over for a lease; close it now. */
333 if (fd->fd_och != NULL) {
334 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
339 /* Let's see if we have good enough OPEN lock on the file and if
340 we can skip talking to MDS */
341 mutex_lock(&lli->lli_och_mutex);
342 if (fd->fd_omode & FMODE_WRITE) {
344 LASSERT(lli->lli_open_fd_write_count);
345 lli->lli_open_fd_write_count--;
346 } else if (fd->fd_omode & FMODE_EXEC) {
348 LASSERT(lli->lli_open_fd_exec_count);
349 lli->lli_open_fd_exec_count--;
352 LASSERT(lli->lli_open_fd_read_count);
353 lli->lli_open_fd_read_count--;
355 mutex_unlock(&lli->lli_och_mutex);
357 /* LU-4398: do not cache write open lock if the file has exec bit */
358 if ((lockmode == LCK_CW && inode->i_mode & S_IXUGO) ||
359 !md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
360 LDLM_IBITS, &policy, lockmode, &lockh))
361 rc = ll_md_real_close(inode, fd->fd_omode);
364 LUSTRE_FPRIVATE(file) = NULL;
365 ll_file_data_put(fd);
370 /* While this returns an error code, fput() the caller does not, so we need
371 * to make every effort to clean up all of our state here. Also, applications
372 * rarely check close errors and even if an error is returned they will not
373 * re-try the close call.
375 int ll_file_release(struct inode *inode, struct file *file)
377 struct ll_file_data *fd;
378 struct ll_sb_info *sbi = ll_i2sbi(inode);
379 struct ll_inode_info *lli = ll_i2info(inode);
380 ktime_t kstart = ktime_get();
385 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
386 PFID(ll_inode2fid(inode)), inode);
388 fd = LUSTRE_FPRIVATE(file);
391 /* The last ref on @file, maybe not the owner pid of statahead,
392 * because parent and child process can share the same file handle. */
393 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
394 ll_deauthorize_statahead(inode, fd);
/* The root dentry gets a short-circuit release: no MDS close needed. */
396 if (inode->i_sb->s_root == file_dentry(file)) {
397 LUSTRE_FPRIVATE(file) = NULL;
398 ll_file_data_put(fd);
402 pcc_file_release(inode, file);
/* Surface any async write errors accumulated on the cl_object. */
404 if (!S_ISDIR(inode->i_mode)) {
405 if (lli->lli_clob != NULL)
406 lov_read_and_clear_async_rc(lli->lli_clob);
407 lli->lli_async_rc = 0;
410 rc = ll_md_close(inode, file);
412 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
413 libcfs_debug_dumplog();
416 if (!rc && inode->i_sb->s_root != file_dentry(file))
417 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE,
418 ktime_us_delta(ktime_get(), kstart));
/* read_cache_page() filler: copy inline Data-on-MDT bytes from the
 * niobuf_local passed as @data into @page, zero-filling the tail when the
 * buffer is shorter than a page, then mark the page up to date. */
422 static inline int ll_dom_readpage(void *data, struct page *page)
424 struct niobuf_local *lnb = data;
427 kaddr = ll_kmap_atomic(page, KM_USER0);
428 memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
429 if (lnb->lnb_len < PAGE_SIZE)
430 memset(kaddr + lnb->lnb_len, 0,
431 PAGE_SIZE - lnb->lnb_len);
432 flush_dcache_page(page);
433 SetPageUptodate(page);
434 ll_kunmap_atomic(kaddr, KM_USER0);
/* Populate the page cache with inline file data (Data-on-MDT) that the
 * server piggy-backed on the open reply, so the first read needs no RPC.
 * NOTE(review): the do-loop opener, index initialization, and several
 * early-return lines are not visible in this extract. */
440 void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
441 struct lookup_intent *it)
443 struct ll_inode_info *lli = ll_i2info(inode);
444 struct cl_object *obj = lli->lli_clob;
445 struct address_space *mapping = inode->i_mapping;
447 struct niobuf_remote *rnb;
448 struct mdt_body *body;
450 unsigned long index, start;
451 struct niobuf_local lnb;
458 if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
462 rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
463 if (rnb == NULL || rnb->rnb_len == 0)
466 /* LU-11595: Server may return whole file and that is OK always or
467 * it may return just file tail and its offset must be aligned with
468 * client PAGE_SIZE to be used on that client, if server's PAGE_SIZE is
469 * smaller than ours the offset may be unaligned and that data is just ignored.
471 if (rnb->rnb_offset % PAGE_SIZE)
474 /* Server returns whole file or just file tail if it fills in reply
475 * buffer, in both cases total size should be equal to the file size.
477 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
478 if (rnb->rnb_offset + rnb->rnb_len != body->mbo_dom_size) {
479 CERROR("%s: server returns off/len %llu/%u but size %llu\n",
480 ll_i2sbi(inode)->ll_fsname, rnb->rnb_offset,
481 rnb->rnb_len, body->mbo_dom_size);
485 CDEBUG(D_INFO, "Get data along with open at %llu len %i, size %llu\n",
486 rnb->rnb_offset, rnb->rnb_len, body->mbo_dom_size);
/* Inline payload immediately follows the niobuf_remote descriptor. */
488 data = (char *)rnb + sizeof(*rnb);
490 lnb.lnb_file_offset = rnb->rnb_offset;
491 start = lnb.lnb_file_offset / PAGE_SIZE;
493 LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
494 lnb.lnb_page_offset = 0;
/* Fill one page per iteration; lnb points at this page's slice. */
496 lnb.lnb_data = data + (index << PAGE_SHIFT);
497 lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
498 if (lnb.lnb_len > PAGE_SIZE)
499 lnb.lnb_len = PAGE_SIZE;
501 vmpage = read_cache_page(mapping, index + start,
502 ll_dom_readpage, &lnb);
503 if (IS_ERR(vmpage)) {
504 CWARN("%s: cannot fill page %lu for "DFID
505 " with data: rc = %li\n",
506 ll_i2sbi(inode)->ll_fsname, index + start,
507 PFID(lu_object_fid(&obj->co_lu)),
513 } while (rnb->rnb_len > (index << PAGE_SHIFT));
/* Send an intent-OPEN enqueue to the MDS for @de, preferring open-by-FID;
 * only packs the file name when the server lacks OBD_CONNECT_OPEN_BY_FID
 * (or under fault injection). On success, updates the inode from the reply,
 * revalidates the dentry if a LOOKUP lock bit came back, and pulls in any
 * inline DOM data.
 * NOTE(review): error-path labels, kfree(name), and several closing braces
 * are not visible in this extract. */
517 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
518 struct lookup_intent *itp)
520 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
521 struct dentry *parent = de->d_parent;
524 struct md_op_data *op_data;
525 struct ptlrpc_request *req = NULL;
529 LASSERT(parent != NULL);
530 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
532 /* if server supports open-by-fid, or file name is invalid, don't pack
533 * name in open request */
534 if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_OPEN_BY_NAME) ||
535 !(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID)) {
/* Snapshot the dentry name under d_lock; retry-sensitive because the
 * name may change (rename) between the length read and the copy. */
537 len = de->d_name.len;
538 name = kmalloc(len + 1, GFP_NOFS);
543 spin_lock(&de->d_lock);
544 if (len != de->d_name.len) {
545 spin_unlock(&de->d_lock);
549 memcpy(name, de->d_name.name, len);
551 spin_unlock(&de->d_lock);
553 if (!lu_name_is_valid_2(name, len)) {
559 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
560 name, len, 0, LUSTRE_OPC_ANY, NULL);
561 if (IS_ERR(op_data)) {
563 RETURN(PTR_ERR(op_data));
565 op_data->op_data = lmm;
566 op_data->op_data_size = lmmsize;
568 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
569 &ll_md_blocking_ast, 0);
571 ll_finish_md_op_data(op_data);
573 /* reason for keeping own exit path - don't flood log
574 * with messages with -ESTALE errors.
576 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
577 it_open_error(DISP_OPEN_OPEN, itp))
579 ll_release_openhandle(de, itp);
583 if (it_disposition(itp, DISP_LOOKUP_NEG))
584 GOTO(out, rc = -ENOENT);
586 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
587 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
588 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
592 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
594 if (!rc && itp->it_lock_mode) {
595 struct lustre_handle handle = {.cookie = itp->it_lock_handle};
596 struct ldlm_lock *lock;
597 bool has_dom_bit = false;
599 /* If we got a lock back and it has a LOOKUP bit set,
600 * make sure the dentry is marked as valid so we can find it.
601 * We don't need to care about actual hashing since other bits
602 * of kernel will deal with that later.
604 lock = ldlm_handle2lock(&handle);
606 has_dom_bit = ldlm_has_dom(lock);
607 if (lock->l_policy_data.l_inodebits.bits &
608 MDS_INODELOCK_LOOKUP)
609 d_lustre_revalidate(de);
613 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
615 ll_dom_finish_open(de->d_inode, req, itp);
619 ptlrpc_req_finished(req);
620 ll_intent_drop_lock(itp);
622 /* We did open by fid, but by the time we got to the server,
623 * the object disappeared. If this is a create, we cannot really
624 * tell the userspace that the file it was trying to create
625 * does not exist. Instead let's return -ESTALE, and the VFS will
626 * retry the create with LOOKUP_REVAL that we are going to catch
627 * in ll_revalidate_dentry() and use lookup then.
629 if (rc == -ENOENT && itp->it_op & IT_CREAT)
/* Fill an obd_client_handle from the mdt_body in the intent's reply
 * (open handle, FID, lease cookie, flags) and register it for open
 * replay so the handle survives MDS recovery. */
635 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
636 struct obd_client_handle *och)
638 struct mdt_body *body;
640 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
641 och->och_open_handle = body->mbo_open_handle;
642 och->och_fid = body->mbo_fid1;
643 och->och_lease_handle.cookie = it->it_lock_handle;
644 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
645 och->och_flags = it->it_flags;
647 return md_set_open_replay_data(md_exp, och, it);
/* Attach client-side open state to @file: optionally fill @och from the
 * intent reply, then install the ll_file_data as file private data and
 * initialize readahead and cl-context bookkeeping.
 * NOTE(review): the och NULL-check branch and return are not visible in
 * this extract. */
650 static int ll_local_open(struct file *file, struct lookup_intent *it,
651 struct ll_file_data *fd, struct obd_client_handle *och)
653 struct inode *inode = file_inode(file);
656 LASSERT(!LUSTRE_FPRIVATE(file));
663 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
668 LUSTRE_FPRIVATE(file) = fd;
669 ll_readahead_init(inode, &fd->fd_ras);
670 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
672 /* ll_cl_context initialize */
673 rwlock_init(&fd->fd_lock);
674 INIT_LIST_HEAD(&fd->fd_lccs);
679 /* Open a file, and (for the very first open) create objects on the OSTs at
680 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
681 * creation or open until ll_lov_setstripe() ioctl is called.
683 * If we already have the stripe MD locally then we don't request it in
684 * md_open(), by passing a lmm_size = 0.
686 * It is up to the application to ensure no other processes open this file
687 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
688 * used. We might be able to avoid races of that sort by getting lli_open_sem
689 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
690 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
692 int ll_file_open(struct inode *inode, struct file *file)
694 struct ll_inode_info *lli = ll_i2info(inode);
695 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
696 .it_flags = file->f_flags };
697 struct obd_client_handle **och_p = NULL;
698 __u64 *och_usecount = NULL;
699 struct ll_file_data *fd;
700 ktime_t kstart = ktime_get();
704 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
705 PFID(ll_inode2fid(inode)), inode, file->f_flags);
707 it = file->private_data; /* XXX: compat macro */
708 file->private_data = NULL; /* prevent ll_local_open assertion */
710 fd = ll_file_data_get();
712 GOTO(out_nofiledata, rc = -ENOMEM);
715 if (S_ISDIR(inode->i_mode))
716 ll_authorize_statahead(inode, fd);
/* Root dentry: no MDS open needed, just attach private data. */
718 if (inode->i_sb->s_root == file_dentry(file)) {
719 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup: build our own IT_OPEN intent. */
723 if (!it || !it->it_disposition) {
724 /* Convert f_flags into access mode. We cannot use file->f_mode,
725 * because everything but O_ACCMODE mask was stripped from
727 if ((oit.it_flags + 1) & O_ACCMODE)
729 if (file->f_flags & O_TRUNC)
730 oit.it_flags |= FMODE_WRITE;
732 /* kernel only call f_op->open in dentry_open. filp_open calls
733 * dentry_open after call to open_namei that checks permissions.
734 * Only nfsd_open call dentry_open directly without checking
735 * permissions and because of that this code below is safe.
737 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
738 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
740 /* We do not want O_EXCL here, presumably we opened the file
741 * already? XXX - NFS implications? */
742 oit.it_flags &= ~O_EXCL;
744 /* bug20584, if "it_flags" contains O_CREAT, the file will be
745 * created if necessary, then "IT_CREAT" should be set to keep
746 * consistent with it */
747 if (oit.it_flags & O_CREAT)
748 oit.it_op |= IT_CREAT;
754 /* Let's see if we have file open on MDS already. */
755 if (it->it_flags & FMODE_WRITE) {
756 och_p = &lli->lli_mds_write_och;
757 och_usecount = &lli->lli_open_fd_write_count;
758 } else if (it->it_flags & FMODE_EXEC) {
759 och_p = &lli->lli_mds_exec_och;
760 och_usecount = &lli->lli_open_fd_exec_count;
762 och_p = &lli->lli_mds_read_och;
763 och_usecount = &lli->lli_open_fd_read_count;
766 mutex_lock(&lli->lli_och_mutex);
767 if (*och_p) { /* Open handle is present */
768 if (it_disposition(it, DISP_OPEN_OPEN)) {
769 /* Well, there's extra open request that we do not need,
770 let's close it somehow. This will decref request. */
771 rc = it_open_error(DISP_OPEN_OPEN, it);
773 mutex_unlock(&lli->lli_och_mutex);
774 GOTO(out_openerr, rc);
777 ll_release_openhandle(file_dentry(file), it);
/* Reuse the cached handle for this local open. */
781 rc = ll_local_open(file, it, fd, NULL);
784 mutex_unlock(&lli->lli_och_mutex);
785 GOTO(out_openerr, rc);
788 LASSERT(*och_usecount == 0);
789 if (!it->it_disposition) {
790 struct dentry *dentry = file_dentry(file);
791 struct ll_dentry_data *ldd;
793 /* We cannot just request lock handle now, new ELC code
794 means that one of other OPEN locks for this file
795 could be cancelled, and since blocking ast handler
796 would attempt to grab och_mutex as well, that would
797 result in a deadlock */
798 mutex_unlock(&lli->lli_och_mutex);
800 * Normally called under two situations:
802 * 2. A race/condition on MDS resulting in no open
803 * handle to be returned from LOOKUP|OPEN request,
804 * for example if the target entry was a symlink.
806 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
807 * marked by a bit set in ll_iget_for_nfs. Clear the
808 * bit so that it's not confusing later callers.
810 * NB; when ldd is NULL, it must have come via normal
811 * lookup path only, since ll_iget_for_nfs always calls
814 ldd = ll_d2d(dentry);
815 if (ldd && ldd->lld_nfs_dentry) {
816 ldd->lld_nfs_dentry = 0;
817 if (!filename_is_volatile(dentry->d_name.name,
820 it->it_flags |= MDS_OPEN_LOCK;
824 * Always specify MDS_OPEN_BY_FID because we don't want
825 * to get file with different fid.
827 it->it_flags |= MDS_OPEN_BY_FID;
828 rc = ll_intent_file_open(dentry, NULL, 0, it);
830 GOTO(out_openerr, rc);
834 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
836 GOTO(out_och_free, rc = -ENOMEM);
840 /* md_intent_lock() didn't get a request ref if there was an
841 * open error, so don't do cleanup on the request here
843 /* XXX (green): Should not we bail out on any error here, not
844 * just open error? */
845 rc = it_open_error(DISP_OPEN_OPEN, it);
847 GOTO(out_och_free, rc);
849 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
850 "inode %p: disposition %x, status %d\n", inode,
851 it_disposition(it, ~0), it->it_status);
853 rc = ll_local_open(file, it, fd, *och_p);
855 GOTO(out_och_free, rc);
858 rc = pcc_file_open(inode, file);
860 GOTO(out_och_free, rc);
862 mutex_unlock(&lli->lli_och_mutex);
865 /* Must do this outside lli_och_mutex lock to prevent deadlock where
866 different kind of OPEN lock for this same inode gets cancelled
867 by ldlm_cancel_lru */
868 if (!S_ISREG(inode->i_mode))
869 GOTO(out_och_free, rc);
871 cl_lov_delay_create_clear(&file->f_flags);
872 GOTO(out_och_free, rc);
/* Error cleanup: free the cached handle slot allocated above. */
876 if (och_p && *och_p) {
877 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
878 *och_p = NULL; /* OBD_FREE writes some magic there */
881 mutex_unlock(&lli->lli_och_mutex);
884 if (lli->lli_opendir_key == fd)
885 ll_deauthorize_statahead(inode, fd);
888 ll_file_data_put(fd);
890 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN,
891 ktime_us_delta(ktime_get(), kstart));
/* Drop the intent's request reference if we still hold it. */
895 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
896 ptlrpc_req_finished(it->it_request);
897 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* Blocking AST for lease locks: on a blocking callback, asynchronously
 * cancel the lease lock; the CANCELING case intentionally does nothing
 * with the open handle (see ll_lease_open()).
 * NOTE(review): the switch(flag) header and returns are not visible in
 * this extract. */
903 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
904 struct ldlm_lock_desc *desc, void *data, int flag)
907 struct lustre_handle lockh;
911 case LDLM_CB_BLOCKING:
912 ldlm_lock2handle(lock, &lockh);
913 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
915 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
919 case LDLM_CB_CANCELING:
927 * When setting a lease on a file, we take ownership of the lli_mds_*_och
928 * and save it as fd->fd_och so as to force client to reopen the file even
929 * if it has an open lock in cache already.
931 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
932 struct lustre_handle *old_open_handle)
934 struct ll_inode_info *lli = ll_i2info(inode);
935 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
936 struct obd_client_handle **och_p;
941 /* Get the openhandle of the file */
942 mutex_lock(&lli->lli_och_mutex);
/* Only one lease per fd. */
943 if (fd->fd_lease_och != NULL)
944 GOTO(out_unlock, rc = -EBUSY);
946 if (fd->fd_och == NULL) {
947 if (file->f_mode & FMODE_WRITE) {
948 LASSERT(lli->lli_mds_write_och != NULL);
949 och_p = &lli->lli_mds_write_och;
950 och_usecount = &lli->lli_open_fd_write_count;
952 LASSERT(lli->lli_mds_read_och != NULL);
953 och_p = &lli->lli_mds_read_och;
954 och_usecount = &lli->lli_open_fd_read_count;
/* Cannot steal a handle shared by other local opens. */
957 if (*och_usecount > 1)
958 GOTO(out_unlock, rc = -EBUSY);
965 *old_open_handle = fd->fd_och->och_open_handle;
969 mutex_unlock(&lli->lli_och_mutex);
974 * Release ownership on lli_mds_*_och when putting back a file lease.
976 static int ll_lease_och_release(struct inode *inode, struct file *file)
978 struct ll_inode_info *lli = ll_i2info(inode);
979 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
980 struct obd_client_handle **och_p;
981 struct obd_client_handle *old_och = NULL;
986 mutex_lock(&lli->lli_och_mutex);
987 if (file->f_mode & FMODE_WRITE) {
988 och_p = &lli->lli_mds_write_och;
989 och_usecount = &lli->lli_open_fd_write_count;
991 och_p = &lli->lli_mds_read_och;
992 och_usecount = &lli->lli_open_fd_read_count;
995 /* The file may have been open by another process (broken lease) so
996 * *och_p is not NULL. In this case we should simply increase usecount
999 if (*och_p != NULL) {
1000 old_och = fd->fd_och;
/* Slot was empty: hand our fd_och back to the inode-level cache. */
1003 *och_p = fd->fd_och;
1007 mutex_unlock(&lli->lli_och_mutex);
/* Close the redundant handle outside the mutex. */
1009 if (old_och != NULL)
1010 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
1016 * Acquire a lease and open the file.
1018 static struct obd_client_handle *
1019 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
1022 struct lookup_intent it = { .it_op = IT_OPEN };
1023 struct ll_sb_info *sbi = ll_i2sbi(inode);
1024 struct md_op_data *op_data;
1025 struct ptlrpc_request *req = NULL;
1026 struct lustre_handle old_open_handle = { 0 };
1027 struct obd_client_handle *och = NULL;
/* Leases are only defined for pure read or pure write opens. */
1032 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
1033 RETURN(ERR_PTR(-EINVAL));
1036 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
1037 RETURN(ERR_PTR(-EPERM));
1039 rc = ll_lease_och_acquire(inode, file, &old_open_handle);
1041 RETURN(ERR_PTR(rc));
1046 RETURN(ERR_PTR(-ENOMEM));
1048 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
1049 LUSTRE_OPC_ANY, NULL);
1050 if (IS_ERR(op_data))
1051 GOTO(out, rc = PTR_ERR(op_data));
1053 /* To tell the MDT this openhandle is from the same owner */
1054 op_data->op_open_handle = old_open_handle;
1056 it.it_flags = fmode | open_flags;
1057 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
1058 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
1059 &ll_md_blocking_lease_ast,
1060 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
1061 * it can be cancelled which may mislead applications that the lease is
1063 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
1064 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
1065 * doesn't deal with openhandle, so normal openhandle will be leaked. */
1066 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
1067 ll_finish_md_op_data(op_data);
1068 ptlrpc_req_finished(req);
1070 GOTO(out_release_it, rc);
1072 if (it_disposition(&it, DISP_LOOKUP_NEG))
1073 GOTO(out_release_it, rc = -ENOENT);
1075 rc = it_open_error(DISP_OPEN_OPEN, &it);
1077 GOTO(out_release_it, rc);
1079 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
1080 rc = ll_och_fill(sbi->ll_md_exp, &it, och);
1082 GOTO(out_release_it, rc);
1084 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
1085 GOTO(out_close, rc = -EOPNOTSUPP);
1087 /* already get lease, handle lease lock */
1088 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
1089 if (it.it_lock_mode == 0 ||
1090 it.it_lock_bits != MDS_INODELOCK_OPEN) {
1091 /* open lock must return for lease */
1092 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
1093 PFID(ll_inode2fid(inode)), it.it_lock_mode,
1095 GOTO(out_close, rc = -EPROTO);
1098 ll_intent_release(&it);
/* Error path: undo the partially-established lease state. */
1102 /* Cancel open lock */
1103 if (it.it_lock_mode != 0) {
1104 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
1106 it.it_lock_mode = 0;
1107 och->och_lease_handle.cookie = 0ULL;
1109 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
1111 CERROR("%s: error closing file "DFID": %d\n",
1112 sbi->ll_fsname, PFID(&ll_i2info(inode)->lli_fid), rc2);
1113 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
1115 ll_intent_release(&it);
1119 RETURN(ERR_PTR(rc));
1123 * Check whether a layout swap can be done between two inodes.
1125 * \param[in] inode1 First inode to check
1126 * \param[in] inode2 Second inode to check
1128 * \retval 0 on success, layout swap can be performed between both inodes
1129 * \retval negative error code if requirements are not met
1131 static int ll_check_swap_layouts_validity(struct inode *inode1,
1132 struct inode *inode2)
/* Both must be regular files, writable by the caller, and on the
 * same filesystem. NOTE(review): the returned error codes on each
 * failing check are not visible in this extract. */
1134 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
1137 if (inode_permission(inode1, MAY_WRITE) ||
1138 inode_permission(inode2, MAY_WRITE))
1141 if (inode1->i_sb != inode2->i_sb)
/* Close @inode's lease open handle with a MDS_CLOSE_LAYOUT_SWAP bias so
 * the MDT atomically swaps layouts between @inode and @inode2. Ordering
 * of the two FIDs is validated (they must differ) before the close.
 * NOTE(review): the out_free_och cleanup label is not visible in this
 * extract. */
1147 static int ll_swap_layouts_close(struct obd_client_handle *och,
1148 struct inode *inode, struct inode *inode2)
1150 const struct lu_fid *fid1 = ll_inode2fid(inode);
1151 const struct lu_fid *fid2;
1155 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
1156 ll_i2sbi(inode)->ll_fsname, PFID(fid1));
1158 rc = ll_check_swap_layouts_validity(inode, inode2);
1160 GOTO(out_free_och, rc);
1162 /* We now know that inode2 is a lustre inode */
1163 fid2 = ll_inode2fid(inode2);
/* Swapping a file with itself is rejected. */
1165 rc = lu_fid_cmp(fid1, fid2);
1167 GOTO(out_free_och, rc = -EINVAL);
1169 /* Close the file and {swap,merge} layouts between inode & inode2.
1170 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
1171 * because we still need it to pack l_remote_handle to MDT. */
1172 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
1175 och = NULL; /* freed in ll_close_inode_openhandle() */
1185 * Release lease and close the file.
1186 * It will check if the lease has ever broken.
1188 static int ll_lease_close_intent(struct obd_client_handle *och,
1189 struct inode *inode,
1190 bool *lease_broken, enum mds_op_bias bias,
1193 struct ldlm_lock *lock;
1194 bool cancelled = true;
/* Inspect the lease lock to learn whether it was already cancelled
 * (i.e. the lease was broken by another client). */
1198 lock = ldlm_handle2lock(&och->och_lease_handle);
1200 lock_res_and_lock(lock);
1201 cancelled = ldlm_is_cancel(lock);
1202 unlock_res_and_lock(lock);
1203 LDLM_LOCK_PUT(lock);
1206 CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1207 PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1209 if (lease_broken != NULL)
1210 *lease_broken = cancelled;
/* Still-valid lease with no intent: just cancel the lock ourselves. */
1212 if (!cancelled && !bias)
1213 ldlm_cli_cancel(&och->och_lease_handle, 0);
1215 if (cancelled) { /* no need to execute intent */
1220 rc = ll_close_inode_openhandle(inode, och, bias, data);
/* Plain lease release: close with no bias and no intent data. */
1224 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1227 return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1231 * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
1233 static int ll_lease_file_resync(struct obd_client_handle *och,
1234 struct inode *inode, unsigned long arg)
1236 struct ll_sb_info *sbi = ll_i2sbi(inode);
1237 struct md_op_data *op_data;
1238 struct ll_ioc_lease_id ioc;
1239 __u64 data_version_unused;
1243 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1244 LUSTRE_OPC_ANY, NULL);
1245 if (IS_ERR(op_data))
1246 RETURN(PTR_ERR(op_data));
/* @arg is a userspace pointer to a ll_ioc_lease_id from the ioctl. */
1248 if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg,
1252 /* before starting file resync, it's necessary to clean up page cache
1253 * in client memory, otherwise once the layout version is increased,
1254 * writing back cached data will be denied by the OSTs. */
1255 rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1259 op_data->op_lease_handle = och->och_lease_handle;
1260 op_data->op_mirror_id = ioc.lil_mirror_id;
1261 rc = md_file_resync(sbi->ll_md_exp, op_data);
1267 ll_finish_md_op_data(op_data);
/*
 * Merge MDS-provided inode attributes with attributes obtained from the
 * OSTs (size, blocks, timestamps), under the inode size lock.  The most
 * recent timestamp from either source wins.
 */
int ll_merge_attr(const struct lu_env *env, struct inode *inode)
	struct ll_inode_info *lli = ll_i2info(inode);
	struct cl_object *obj = lli->lli_clob;
	struct cl_attr *attr = vvp_env_thread_attr(env);

	ll_inode_size_lock(inode);

	/* Merge timestamps the most recently obtained from MDS with
	 * timestamps obtained from OSTs.
	 *
	 * Do not overwrite atime of inode because it may be refreshed
	 * by file_accessed() function. If the read was served by cache
	 * data, there is no RPC to be sent so that atime may not be
	 * transferred to OSTs at all. MDT only updates atime at close time
	 * if it's at least 'mdd.*.atime_diff' older.
	 * All in all, the atime in Lustre does not strictly comply with
	 * POSIX. Solving this problem needs to send an RPC to MDT for each
	 * read, this will hurt performance.
	 */
	if (ll_file_test_and_clear_flag(lli, LLIF_UPDATE_ATIME) ||
	    inode->i_atime.tv_sec < lli->lli_atime)
		inode->i_atime.tv_sec = lli->lli_atime;

	inode->i_mtime.tv_sec = lli->lli_mtime;
	inode->i_ctime.tv_sec = lli->lli_ctime;

	/* Snapshot current VFS timestamps before merging in OST values. */
	mtime = inode->i_mtime.tv_sec;
	atime = inode->i_atime.tv_sec;
	ctime = inode->i_ctime.tv_sec;

	cl_object_attr_lock(obj);
	if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
	rc = cl_object_attr_get(env, obj, attr);
	cl_object_attr_unlock(obj);

		/* -ENODATA means no OST objects; not an error here. */
		GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));

	/* Keep the newer of each timestamp (VFS vs. OST-reported). */
	if (atime < attr->cat_atime)
		atime = attr->cat_atime;

	if (ctime < attr->cat_ctime)
		ctime = attr->cat_ctime;

	if (mtime < attr->cat_mtime)
		mtime = attr->cat_mtime;

	CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
	       PFID(&lli->lli_fid), attr->cat_size);

	i_size_write(inode, attr->cat_size);
	inode->i_blocks = attr->cat_blocks;

	inode->i_mtime.tv_sec = mtime;
	inode->i_atime.tv_sec = atime;
	inode->i_ctime.tv_sec = ctime;

	ll_inode_size_unlock(inode);
1344 * Set designated mirror for I/O.
1346 * So far only read, write, and truncated can support to issue I/O to
1347 * designated mirror.
/*
 * Set the designated FLR mirror for this I/O, copying the mirror id and
 * layout version from the file descriptor private data into the cl_io.
 * NOTE(review): debug string retains the original "desiginated" spelling —
 * it is a runtime log message, left byte-identical here.
 */
void ll_io_set_mirror(struct cl_io *io, const struct file *file)
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);

	/* clear layout version for generic(non-resync) I/O in case it carries
	 * stale layout version due to I/O restart */
	io->ci_layout_version = 0;

	/* FLR: disable non-delay for designated mirror I/O because obviously
	 * only one mirror is available */
	if (fd->fd_designated_mirror > 0) {
		io->ci_designated_mirror = fd->fd_designated_mirror;
		io->ci_layout_version = fd->fd_layout_version;

	CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n",
	       file->f_path.dentry->d_name.name, io->ci_designated_mirror);
/*
 * Decide whether access-time updates should be suppressed for this file,
 * checking the open flags, inode flags, mount options, and superblock
 * flags in turn.  (Return statements are elided in this view.)
 */
static bool file_is_noatime(const struct file *file)
	const struct vfsmount *mnt = file->f_path.mnt;
	const struct inode *inode = file_inode((struct file *)file);

	/* Adapted from file_accessed() and touch_atime().*/
	if (file->f_flags & O_NOATIME)
	if (inode->i_flags & S_NOATIME)
	if (IS_NOATIME(inode))
	if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
	if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
	if ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode))
/*
 * Initialize a cl_io for a read or write request from the open file's
 * flags: non-blocking/append/sync modes, lock request policy, atime
 * suppression, and FLR non-delay/mirror settings.
 */
void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot,
		struct vvp_io_args *args)
	struct inode *inode = file_inode(file);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);

	io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
	io->ci_lock_no_expand = fd->ll_lock_no_expand;

	if (iot == CIT_WRITE) {
		io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
		io->u.ci_wr.wr_sync = !!(file->f_flags & O_SYNC ||
					 file->f_flags & O_DIRECT ||
#ifdef HAVE_GENERIC_WRITE_SYNC_2ARGS
		/* also honor per-iocb IOCB_DSYNC on kernels with the
		 * two-argument generic_write_sync() */
		io->u.ci_wr.wr_sync |= !!(args &&
					  args->via_io_subtype == IO_NORMAL &&
					  args->u.normal.via_iocb->ki_flags & IOCB_DSYNC);

	io->ci_obj = ll_i2info(inode)->lli_clob;
	io->ci_lockreq = CILR_MAYBE;
	if (ll_file_nolock(file)) {
		io->ci_lockreq = CILR_NEVER;
		io->ci_no_srvlock = 1;
	} else if (file->f_flags & O_APPEND) {
		io->ci_lockreq = CILR_MANDATORY;
	io->ci_noatime = file_is_noatime(file);
	io->ci_async_readahead = false;

	/* FLR: only use non-delay I/O for read as there is only one
	 * available mirror for write. */
	io->ci_ndelay = !(iot == CIT_WRITE);

	ll_io_set_mirror(io, file);
/*
 * Record file-heat statistics for a completed read or write: one sample
 * event plus the byte count, decayed per the superblock's heat settings.
 * No-op if file heat is disabled globally or for this inode.
 */
static void ll_heat_add(struct inode *inode, enum cl_io_type iot,
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	enum obd_heat_type sample_type;
	enum obd_heat_type iobyte_type;
	__u64 now = ktime_get_real_seconds();

	if (!ll_sbi_has_file_heat(sbi) ||
	    lli->lli_heat_flags & LU_HEAT_FLAG_OFF)

	/* Map the I/O type to the corresponding heat counters. */
	if (iot == CIT_READ) {
		sample_type = OBD_HEAT_READSAMPLE;
		iobyte_type = OBD_HEAT_READBYTE;
	} else if (iot == CIT_WRITE) {
		sample_type = OBD_HEAT_WRITESAMPLE;
		iobyte_type = OBD_HEAT_WRITEBYTE;

	spin_lock(&lli->lli_heat_lock);
	obd_heat_add(&lli->lli_heat_instances[sample_type], now, 1,
		     sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
	obd_heat_add(&lli->lli_heat_instances[iobyte_type], now, count,
		     sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
	spin_unlock(&lli->lli_heat_lock);
/*
 * Common engine for read/write/splice I/O: initializes a cl_io, takes the
 * file range lock when needed (appends, writes, direct-IO reads), runs the
 * CLIO loop, and restarts the I/O when the layout changes or an FLR
 * non-delay attempt must be retried on another mirror.
 *
 * \param env	lu_env for this thread
 * \param args	per-subtype I/O arguments (normal iter/iocb or splice pipe)
 * \param file	open file the I/O applies to
 * \param iot	CIT_READ or CIT_WRITE
 * \param ppos	file position, updated on return
 * \param count	number of bytes requested
 *
 * \retval bytes transferred on success, negative errno on failure.
 */
ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
		   struct file *file, enum cl_io_type iot,
		   loff_t *ppos, size_t count)
	struct vvp_io *vio = vvp_env_io(env);
	struct inode *inode = file_inode(file);
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct range_lock range;
	unsigned retried = 0;
	bool restarted = false;

	CDEBUG(D_VFSTRACE, "%s: %s ppos: %llu, count: %zu\n",
	       file_dentry(file)->d_name.name,
	       iot == CIT_READ ? "read" : "write", *ppos, count);

	io = vvp_env_thread_io(env);
	ll_io_init(io, file, iot, args);
	io->ci_ndelay_tried = retried;

	if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
		bool range_locked = false;

		/* O_APPEND writes may land anywhere, so lock to EOF;
		 * otherwise lock exactly the requested byte range. */
		if (file->f_flags & O_APPEND)
			range_lock_init(&range, 0, LUSTRE_EOF);
			range_lock_init(&range, *ppos, *ppos + count - 1);

		vio->vui_fd = LUSTRE_FPRIVATE(file);
		vio->vui_io_subtype = args->via_io_subtype;

		switch (vio->vui_io_subtype) {
			vio->vui_iter = args->u.normal.via_iter;
			vio->vui_iocb = args->u.normal.via_iocb;
			/* Direct IO reads must also take range lock,
			 * or multiple reads will try to work on the same pages
			 * See LU-6227 for details. */
			if (((iot == CIT_WRITE) ||
			     (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
			    !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
				CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
				rc = range_lock(&lli->lli_write_tree, &range);
				range_locked = true;
			vio->u.splice.vui_pipe = args->u.splice.via_pipe;
			vio->u.splice.vui_flags = args->u.splice.via_flags;
			CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);

		ll_cl_add(file, env, io, LCC_RW);
		rc = cl_io_loop(env, io);
		ll_cl_remove(file, env);

			CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
			range_unlock(&lli->lli_write_tree, &range);
		/* cl_io_rw_init() handled IO */

	if (io->ci_nob > 0) {
		/* Account partial progress and advance the position so a
		 * restart continues from where this attempt stopped. */
		result += io->ci_nob;
		count -= io->ci_nob;
		*ppos = io->u.ci_wr.wr.crw_pos; /* for splice */

		/* prepare IO restart */
		if (count > 0 && args->via_io_subtype == IO_NORMAL)
			args->u.normal.via_iter = vio->vui_iter;

	cl_io_fini(env, io);

	       "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
	       file->f_path.dentry->d_name.name,
	       iot, rc, result, io->ci_need_restart);

	if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
		       "%s: restart %s from %lld, count: %zu, ret: %zd, rc: %d\n",
		       file_dentry(file)->d_name.name,
		       iot == CIT_READ ? "read" : "write",
		       *ppos, count, result, rc);
		/* preserve the tried count for FLR */
		retried = io->ci_ndelay_tried;

	/* Tally per-mount read/write byte statistics and track write
	 * failure state used by fsync/close error reporting. */
	if (iot == CIT_READ) {
			ll_stats_ops_tally(ll_i2sbi(inode),
					   LPROC_LL_READ_BYTES, result);
	} else if (iot == CIT_WRITE) {
			ll_stats_ops_tally(ll_i2sbi(inode),
					   LPROC_LL_WRITE_BYTES, result);
			fd->fd_write_failed = false;
		} else if (result == 0 && rc == 0) {
				fd->fd_write_failed = true;
				fd->fd_write_failed = false;
		} else if (rc != -ERESTARTSYS) {
			fd->fd_write_failed = true;

	CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);

	ll_heat_add(inode, iot, result);

	RETURN(result > 0 ? result : rc);
1603 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1604 * especially for small I/O.
1606 * To serve a read request, CLIO has to create and initialize a cl_io and
1607 * then request DLM lock. This has turned out to have significant overhead
1608 * and affects the performance of small I/O dramatically.
1610 * It's not necessary to create a cl_io for each I/O. Under the help of read
1611 * ahead, most of the pages being read are already in memory cache and we can
1612 * read those pages directly because if the pages exist, the corresponding DLM
1613 * lock must exist so that page content must be valid.
1615 * In fast read implementation, the llite speculatively finds and reads pages
1616 * in memory cache. There are three scenarios for fast read:
1617 * - If the page exists and is uptodate, kernel VM will provide the data and
1618 * CLIO won't be intervened;
1619 * - If the page was brought into memory by read ahead, it will be exported
1620 * and read ahead parameters will be updated;
1621 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1622 * it will go back and invoke normal read, i.e., a cl_io will be created
1623 * and DLM lock will be requested.
1625 * POSIX compliance: posix standard states that read is intended to be atomic.
1626 * Lustre read implementation is in line with Linux kernel read implementation
1627 * and neither of them complies with POSIX standard in this matter. Fast read
1628 * doesn't make the situation worse on single node but it may interleave write
1629 * results from multiple nodes due to short read handling in ll_file_aio_read().
1631 * \param env - lu_env
1632 * \param iocb - kiocb from kernel
1633 * \param iter - user space buffers where the data will be copied
1635 * \retval - number of bytes have been read, or error code if error occurred.
/*
 * Speculative page-cache read that bypasses CLIO setup entirely; see the
 * block comment above for the full rationale.  Returns -ENODATA when the
 * first page is not cached so the caller falls back to the normal path.
 */
ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
	if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))

	/* NB: we can't do direct IO for fast read because it will need a lock
	 * to make IO engine happy. */
	if (iocb->ki_filp->f_flags & O_DIRECT)

	result = generic_file_read_iter(iocb, iter);

	/* If the first page is not in cache, generic_file_aio_read() will be
	 * returned with -ENODATA.
	 * See corresponding code in ll_readpage(). */
	if (result == -ENODATA)

	/* Account heat and read-bytes statistics for the fast-read path. */
	ll_heat_add(file_inode(iocb->ki_filp), CIT_READ, result);
	ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
			   LPROC_LL_READ_BYTES, result);
1668 * Read from a file (through the page cache).
/*
 * read_iter entry point: try PCC first, then fast read, and finally the
 * generic CLIO read path for whatever remains of the iterator.
 */
static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
	struct vvp_io_args *args;
	struct file *file = iocb->ki_filp;
	ktime_t kstart = ktime_get();

	/* Nothing requested: done. */
	if (!iov_iter_count(to))

	/*
	 * Currently when PCC read failed, we do not fall back to the
	 * normal read path, just return the error.
	 * The reason is that: for RW-PCC, the file data may be modified
	 * in the PCC and inconsistent with the data on OSTs (or file
	 * data has been removed from the Lustre file system), at this
	 * time, fallback to the normal read path may read the wrong
	 * TODO: for RO-PCC (readonly PCC), fall back to normal read
	 * path: read data from data copy on OSTs.
	 */
	result = pcc_file_read_iter(iocb, to, &cached);

	ll_ras_enter(file, iocb->ki_pos, iov_iter_count(to));

	result = ll_do_fast_read(iocb, to);
	if (result < 0 || iov_iter_count(to) == 0)

	env = cl_env_get(&refcheck);
		return PTR_ERR(env);

	args = ll_env_args(env, IO_NORMAL);
	args->u.normal.via_iter = to;
	args->u.normal.via_iocb = iocb;

	/* Issue the remaining bytes through the generic CLIO path. */
	rc2 = ll_file_io_generic(env, args, file, CIT_READ,
				 &iocb->ki_pos, iov_iter_count(to));
	else if (result == 0)

	cl_env_put(env, &refcheck);

	ll_rw_stats_tally(ll_i2sbi(file_inode(file)), current->pid,
			  LUSTRE_FPRIVATE(file), iocb->ki_pos, result,
	ll_stats_ops_tally(ll_i2sbi(file_inode(file)), LPROC_LL_READ,
			   ktime_us_delta(ktime_get(), kstart));
1734 * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1735 * If a page is already in the page cache and dirty (and some other things -
1736 * See ll_tiny_write_begin for the instantiation of these rules), then we can
1737 * write to it without doing a full I/O, because Lustre already knows about it
1738 * and will write it out. This saves a lot of processing time.
1740 * All writes here are within one page, so exclusion is handled by the page
1741 * lock on the vm page. We do not do tiny writes for writes which touch
1742 * multiple pages because it's very unlikely multiple sequential pages are
1743 * are already dirty.
1745 * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1746 * and are unlikely to be to already dirty pages.
1748 * Attribute updates are important here, we do them in ll_tiny_write_end.
/*
 * Attempt a sub-page write directly through the page cache (see block
 * comment above).  Returns -ENODATA when the target page is not already
 * dirty, signalling the caller to take the normal write path.
 */
static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
	ssize_t count = iov_iter_count(iter);
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	bool lock_inode = !IS_NOSEC(inode);

	/* Restrict writes to single page and < PAGE_SIZE. See comment at top
	 * of function for why.
	 */
	if (count >= PAGE_SIZE ||
	    (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)

	/* __generic_file_write_iter() needs i_mutex when suid/security
	 * attributes may need stripping. */
	if (unlikely(lock_inode))
	result = __generic_file_write_iter(iocb, iter);

	if (unlikely(lock_inode))
		inode_unlock(inode);

	/* If the page is not already dirty, ll_tiny_write_begin returns
	 * -ENODATA. We continue on to normal write.
	 */
	if (result == -ENODATA)

	ll_heat_add(inode, CIT_WRITE, result);
	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
	ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);

	CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1793 * Write to a file (through the page cache).
/*
 * write_iter entry point: try PCC first, then the tiny-write fast path,
 * then the generic CLIO write path; combines byte counts from the tiny
 * and normal attempts.
 */
static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
	struct vvp_io_args *args;
	ssize_t rc_tiny = 0, rc_normal;
	struct file *file = iocb->ki_filp;
	ktime_t kstart = ktime_get();

	if (!iov_iter_count(from))
		GOTO(out, rc_normal = 0);

	/*
	 * When PCC write failed, we usually do not fall back to the normal
	 * write path, just return the error. But there is a special case when
	 * returned error code is -ENOSPC due to running out of space on PCC HSM
	 * backend. At this time, it will fall back to normal I/O path and
	 * retry the I/O. As the file is in HSM released state, it will restore
	 * the file data to OSTs first and redo the write again. And the
	 * restore process will revoke the layout lock and detach the file
	 * from PCC cache automatically.
	 */
	result = pcc_file_write_iter(iocb, from, &cached);
	if (cached && result != -ENOSPC && result != -EDQUOT)
		GOTO(out, rc_normal = result);

	/* NB: we can't do direct IO for tiny writes because they use the page
	 * cache, we can't do sync writes because tiny writes can't flush
	 * pages, and we can't do append writes because we can't guarantee the
	 * required DLM locks are held to protect file size.
	 */
	if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(file))) &&
	    !(file->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
		rc_tiny = ll_do_tiny_write(iocb, from);

	/* In case of error, go on and try normal write - Only stop if tiny
	 * write completed I/O.
	 */
	if (iov_iter_count(from) == 0)
		GOTO(out, rc_normal = rc_tiny);

	env = cl_env_get(&refcheck);
		return PTR_ERR(env);

	args = ll_env_args(env, IO_NORMAL);
	args->u.normal.via_iter = from;
	args->u.normal.via_iocb = iocb;

	rc_normal = ll_file_io_generic(env, args, file, CIT_WRITE,
				       &iocb->ki_pos, iov_iter_count(from));

	/* On success, combine bytes written. */
	if (rc_tiny >= 0 && rc_normal > 0)
		rc_normal += rc_tiny;
	/* On error, only return error from normal write if tiny write did not
	 * write any bytes. Otherwise return bytes written by tiny write.
	 */
	else if (rc_tiny > 0)
		rc_normal = rc_tiny;

	cl_env_put(env, &refcheck);

	if (rc_normal > 0) {
		ll_rw_stats_tally(ll_i2sbi(file_inode(file)), current->pid,
				  LUSTRE_FPRIVATE(file), iocb->ki_pos,
		ll_stats_ops_tally(ll_i2sbi(file_inode(file)), LPROC_LL_WRITE,
				   ktime_us_delta(ktime_get(), kstart));
1873 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1875 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array and compute the total byte count, trimming the
 * segment list at the first inaccessible segment (kernel-copy semantics).
 */
static int ll_file_get_iov_count(const struct iovec *iov,
				 unsigned long *nr_segs, size_t *count)
	for (seg = 0; seg < *nr_segs; seg++) {
		const struct iovec *iv = &iov[seg];

		/*
		 * If any segment has a negative length, or the cumulative
		 * length ever wraps negative then return -EINVAL.
		 */
		if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
		if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
		/* Inaccessible segment: stop here and shorten the request. */
		cnt -= iv->iov_len;	/* This segment is no good */
/*
 * Legacy aio_read entry point for kernels without read_iter: wrap the
 * iovec array in an iov_iter and forward to ll_file_read_iter().
 */
static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
				unsigned long nr_segs, loff_t pos)
	result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);

# ifdef HAVE_IOV_ITER_INIT_DIRECTION
	iov_iter_init(&to, READ, iov, nr_segs, iov_count);
# else /* !HAVE_IOV_ITER_INIT_DIRECTION */
	iov_iter_init(&to, iov, nr_segs, iov_count, 0);
# endif /* HAVE_IOV_ITER_INIT_DIRECTION */

	result = ll_file_read_iter(iocb, &to);
1931 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1934 struct iovec iov = { .iov_base = buf, .iov_len = count };
1943 init_sync_kiocb(&kiocb, file);
1944 kiocb.ki_pos = *ppos;
1945 #ifdef HAVE_KIOCB_KI_LEFT
1946 kiocb.ki_left = count;
1947 #elif defined(HAVE_KI_NBYTES)
1948 kiocb.i_nbytes = count;
1951 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1952 *ppos = kiocb.ki_pos;
1958 * Write to a file (through the page cache).
/*
 * Legacy aio_write entry point for kernels without write_iter: wrap the
 * iovec array in an iov_iter and forward to ll_file_write_iter().
 */
static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
				 unsigned long nr_segs, loff_t pos)
	struct iov_iter from;

	result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);

# ifdef HAVE_IOV_ITER_INIT_DIRECTION
	iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
# else /* !HAVE_IOV_ITER_INIT_DIRECTION */
	iov_iter_init(&from, iov, nr_segs, iov_count, 0);
# endif /* HAVE_IOV_ITER_INIT_DIRECTION */

	result = ll_file_write_iter(iocb, &from);
/*
 * Legacy write(2) entry point for kernels without write_iter: build a
 * synchronous kiocb/iovec pair and forward to ll_file_aio_write().
 */
static ssize_t ll_file_write(struct file *file, const char __user *buf,
			     size_t count, loff_t *ppos)
	struct iovec iov = { .iov_base = (void __user *)buf,

	init_sync_kiocb(&kiocb, file);
	kiocb.ki_pos = *ppos;
#ifdef HAVE_KIOCB_KI_LEFT
	kiocb.ki_left = count;
#elif defined(HAVE_KI_NBYTES)
	kiocb.ki_nbytes = count;

	result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
	*ppos = kiocb.ki_pos;
2013 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
2016 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read entry point: try PCC first, then run a CLIO read with the
 * IO_SPLICE subtype targeting the destination pipe.
 */
static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
				   struct pipe_inode_info *pipe, size_t count,
	struct vvp_io_args *args;

	result = pcc_file_splice_read(in_file, ppos, pipe,
				      count, flags, &cached);

	ll_ras_enter(in_file, *ppos, count);

	env = cl_env_get(&refcheck);
		RETURN(PTR_ERR(env));

	args = ll_env_args(env, IO_SPLICE);
	args->u.splice.via_pipe = pipe;
	args->u.splice.via_flags = flags;

	result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
	cl_env_put(env, &refcheck);

	ll_rw_stats_tally(ll_i2sbi(file_inode(in_file)), current->pid,
			  LUSTRE_FPRIVATE(in_file), *ppos, result,
/*
 * Set the striping EA on a file by opening it with the given lov_user_md
 * via an open intent, then releasing the open handle.  Byte-swaps the
 * user layout on big-endian systems before sending.
 */
int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
			     __u64 flags, struct lov_user_md *lum, int lum_size)
	struct lookup_intent oit = {
		.it_flags = flags | MDS_OPEN_BY_FID,

	/* Detect a byte-swapped magic: only happens on big-endian hosts. */
	if ((__swab32(lum->lmm_magic) & le32_to_cpu(LOV_MAGIC_MASK)) ==
	    le32_to_cpu(LOV_MAGIC_MAGIC)) {
		/* this code will only exist for big-endian systems */
		lustre_swab_lov_user_md(lum, 0);

	ll_inode_size_lock(inode);
	rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
		GOTO(out_unlock, rc);

	ll_release_openhandle(dentry, &oit);

	ll_inode_size_unlock(inode);
	ll_intent_release(&oit);
/*
 * Fetch the LOV EA (striping information) for a named entry from the MDS.
 * On success *lmmp points into the request buffer (caller keeps *request
 * alive while using it), converted to host endianness for userspace.
 *
 * \param inode		parent directory inode
 * \param filename	name of the entry whose layout is requested
 * \param lmmp		out: pointer to the layout inside the reply
 * \param lmm_size	out: size of the layout blob
 * \param request	out: the ptlrpc request holding the reply buffer
 */
int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
			     struct lov_mds_md **lmmp, int *lmm_size,
			     struct ptlrpc_request **request)
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct mdt_body *body;
	struct lov_mds_md *lmm = NULL;
	struct ptlrpc_request *req = NULL;
	struct md_op_data *op_data;

	rc = ll_get_default_mdsize(sbi, &lmmsize);

	op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
				     strlen(filename), lmmsize,
				     LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		RETURN(PTR_ERR(op_data));

	op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
	rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
	ll_finish_md_op_data(op_data);
		CDEBUG(D_INFO, "md_getattr_name failed "
		       "on %s: rc %d\n", filename, rc);

	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
	LASSERT(body != NULL); /* checked by mdc_getattr_name */

	lmmsize = body->mbo_eadatasize;

	/* No EA present (or not a file/dir layout): report -ENODATA. */
	if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
		GOTO(out, rc = -ENODATA);

	lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
	LASSERT(lmm != NULL);

	if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
	    lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
	    lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1) &&
	    lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_FOREIGN))
		GOTO(out, rc = -EPROTO);

	/*
	 * This is coming from the MDS, so is probably in
	 * little endian. We convert it to host endian before
	 * passing it to userspace.
	 */
	if ((lmm->lmm_magic & __swab32(LOV_MAGIC_MAGIC)) ==
	    __swab32(LOV_MAGIC_MAGIC)) {
		int stripe_count = 0;

		if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
		    lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
			stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
			if (le32_to_cpu(lmm->lmm_pattern) &
			    LOV_PATTERN_F_RELEASED)

		lustre_swab_lov_user_md((struct lov_user_md *)lmm, 0);

		/* if function called for directory - we should
		 * avoid swab not existent lsm objects */
		if (lmm->lmm_magic == LOV_MAGIC_V1 && S_ISREG(body->mbo_mode))
			lustre_swab_lov_user_md_objects(
			((struct lov_user_md_v1 *)lmm)->lmm_objects,
		else if (lmm->lmm_magic == LOV_MAGIC_V3 &&
			 S_ISREG(body->mbo_mode))
			lustre_swab_lov_user_md_objects(
			((struct lov_user_md_v3 *)lmm)->lmm_objects,

	*lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: copy a lov_user_md (with one OST object entry)
 * from userspace and apply it as the file's striping EA.  Requires
 * CAP_SYS_ADMIN since raw object data is supplied directly.
 */
static int ll_lov_setea(struct inode *inode, struct file *file,
	__u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
	struct lov_user_md *lump;
	int lum_size = sizeof(struct lov_user_md) +
		       sizeof(struct lov_user_ost_data);

	if (!cfs_capable(CFS_CAP_SYS_ADMIN))

	OBD_ALLOC_LARGE(lump, lum_size);

	if (copy_from_user(lump, arg, lum_size))
		GOTO(out_lump, rc = -EFAULT);

	rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
	cl_lov_delay_create_clear(&file->f_flags);

	OBD_FREE_LARGE(lump, lum_size);
/*
 * Copy the file's striping information to the userspace buffer via the
 * cl_object layer.
 */
static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
	env = cl_env_get(&refcheck);
		RETURN(PTR_ERR(env));

	rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
	cl_env_put(env, &refcheck);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy the user layout, apply it via an
 * open intent, refresh the layout generation, and echo the instantiated
 * layout back to userspace.
 */
static int ll_lov_setstripe(struct inode *inode, struct file *file,
	struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
	struct lov_user_md *klum;
	__u64 flags = FMODE_WRITE;

	rc = ll_copy_user_md(lum, &klum);

	rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,

	/* Zero the user's stripe count so getstripe below fills it in. */
	rc = put_user(0, &lum->lmm_stripe_count);

	rc = ll_layout_refresh(inode, &gen);

	rc = ll_file_getstripe(inode, arg, lum_size);

	cl_lov_delay_create_clear(&file->f_flags);

	OBD_FREE_LARGE(klum, lum_size);
/*
 * LL_IOC_GROUP_LOCK handler: take a group lock (gid = arg) for this open
 * file.  Only one group gid may be active per inode; later requests for a
 * different gid block (or fail with -EAGAIN for O_NONBLOCK opens) until
 * the current group's users drain.
 */
ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
	struct ll_inode_info *lli = ll_i2info(inode);
	struct cl_object *obj = lli->lli_clob;
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct ll_grouplock grouplock;

	CWARN("group id for group lock must not be 0\n");

	if (ll_file_nolock(file))
		RETURN(-EOPNOTSUPP);

	/* Respect O_NONBLOCK when contending for the per-inode mutex. */
	if (file->f_flags & O_NONBLOCK) {
		if (!mutex_trylock(&lli->lli_group_mutex))
	mutex_lock(&lli->lli_group_mutex);

	if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
		CWARN("group lock already existed with gid %lu\n",
		      fd->fd_grouplock.lg_gid);
		GOTO(out, rc = -EINVAL);

	/* A different group is active: wait for its users to drop, then
	 * retry from the top. */
	if (arg != lli->lli_group_gid && lli->lli_group_users != 0) {
		if (file->f_flags & O_NONBLOCK)
			GOTO(out, rc = -EAGAIN);
		mutex_unlock(&lli->lli_group_mutex);
		wait_var_event(&lli->lli_group_users, !lli->lli_group_users);
		GOTO(retry, rc = 0);

	LASSERT(fd->fd_grouplock.lg_lock == NULL);

	/*
	 * XXX: group lock needs to protect all OST objects while PFL
	 * can add new OST objects during the IO, so we'd instantiate
	 * all OST objects before getting its group lock.
	 */
		struct cl_layout cl = {
			.cl_is_composite = false,
		struct lu_extent ext = {
			.e_end = OBD_OBJECT_EOF,

		env = cl_env_get(&refcheck);
			GOTO(out, rc = PTR_ERR(env));

		rc = cl_object_layout_get(env, obj, &cl);
		if (!rc && cl.cl_is_composite)
			rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,

		cl_env_put(env, &refcheck);

	rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
			      arg, (file->f_flags & O_NONBLOCK), &grouplock);

	/* Record ownership on this fd and on the inode-wide group state. */
	fd->fd_flags |= LL_FILE_GROUP_LOCKED;
	fd->fd_grouplock = grouplock;
	if (lli->lli_group_users == 0)
		lli->lli_group_gid = grouplock.lg_gid;
	lli->lli_group_users++;

	CDEBUG(D_INFO, "group lock %lu obtained\n", arg);

	mutex_unlock(&lli->lli_group_mutex);
/*
 * LL_IOC_GROUP_UNLOCK handler: drop the group lock (gid = arg) held by
 * this open file and wake any waiters once the last user of the gid is
 * gone.
 */
static int ll_put_grouplock(struct inode *inode, struct file *file,
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct ll_grouplock grouplock;

	mutex_lock(&lli->lli_group_mutex);
	if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
		CWARN("no group lock held\n");
		GOTO(out, rc = -EINVAL);

	LASSERT(fd->fd_grouplock.lg_lock != NULL);

	if (fd->fd_grouplock.lg_gid != arg) {
		CWARN("group lock %lu doesn't match current id %lu\n",
		      arg, fd->fd_grouplock.lg_gid);
		GOTO(out, rc = -EINVAL);

	/* Detach the grouplock from the fd before releasing it. */
	grouplock = fd->fd_grouplock;
	memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
	fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;

	cl_put_grouplock(&grouplock);

	lli->lli_group_users--;
	if (lli->lli_group_users == 0) {
		lli->lli_group_gid = 0;
		wake_up_var(&lli->lli_group_users);
	CDEBUG(D_INFO, "group lock %lu released\n", arg);

	mutex_unlock(&lli->lli_group_mutex);
2385 * Close inode open handle
2387 * \param dentry [in] dentry which contains the inode
2388 * \param it [in,out] intent which contains open info and result
2391 * \retval <0 failure
/*
 * Close the MDS open handle carried in an intent (e.g. after an intent
 * open used only to set/get layout).  No-op for the filesystem root or
 * when the intent holds no open disposition.
 */
int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
	struct inode *inode = dentry->d_inode;
	struct obd_client_handle *och;

	/* Root ? Do nothing. */
	if (dentry->d_inode->i_sb->s_root == dentry)

	/* No open handle to close? Move away */
	if (!it_disposition(it, DISP_OPEN_OPEN))

	LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);

	OBD_ALLOC(och, sizeof(*och));
		GOTO(out, rc = -ENOMEM);

	rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);

	rc = ll_close_inode_openhandle(inode, och, 0, NULL);

	/* this one is in place of ll_file_open */
	if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
		ptlrpc_req_finished(it->it_request);
		it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2431 * Get size for inode for which FIEMAP mapping is requested.
2432 * Make the FIEMAP get_info call and returns the result.
2433 * \param fiemap kernel buffer to hold extens
2434 * \param num_bytes kernel buffer size
/*
 * Execute a FIEMAP request: validate flags, optionally sync dirty pages,
 * glimpse the file size when unknown, and forward the mapping request to
 * the cl_object layer.
 */
static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
	struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };

	/* Checks for fiemap flags */
	if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
		fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;

	/* Check for FIEMAP_FLAG_SYNC */
	if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
		rc = filemap_fdatawrite(inode->i_mapping);

	env = cl_env_get(&refcheck);
		RETURN(PTR_ERR(env));

	/* Unknown size: glimpse from the OSTs so o_size below is valid. */
	if (i_size_read(inode) == 0) {
		rc = ll_glimpse_size(inode);

	fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
	obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
	obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);

	/* If filesize is 0, then there would be no objects for mapping */
	if (fmkey.lfik_oa.o_size == 0) {
		fiemap->fm_mapped_extents = 0;

	fmkey.lfik_fiemap = *fiemap;

	rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
			      &fmkey, fiemap, &num_bytes);

	cl_env_put(env, &refcheck);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a path via the MDC.
 * Requires CAP_DAC_READ_SEARCH unless the mount allows user fid2path.
 */
int ll_fid2path(struct inode *inode, void __user *arg)
	struct obd_export *exp = ll_i2mdexp(inode);
	const struct getinfo_fid2path __user *gfin = arg;
	struct getinfo_fid2path *gfout;

	if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
	    !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))

	/* Only need to get the buflen */
	if (get_user(pathlen, &gfin->gf_pathlen))

	/* Bound the allocation by the kernel path limit. */
	if (pathlen > PATH_MAX)

	outsize = sizeof(*gfout) + pathlen;
	OBD_ALLOC(gfout, outsize);

	if (copy_from_user(gfout, arg, sizeof(*gfout)))
		GOTO(gf_free, rc = -EFAULT);

	/* append root FID after gfout to let MDT know the root FID so that it
	 * can lookup the correct path, this is mainly for fileset.
	 * old server without fileset mount support will ignore this. */
	*gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);

	/* Call mdc_iocontrol */
	rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);

	if (copy_to_user(arg, gfout, outsize))

	OBD_FREE(gfout, outsize);
/*
 * Run a CIT_DATA_VERSION cl_io to obtain the file's data version and
 * layout version; retried when the layout changes mid-operation
 * (ci_need_restart).  A file without objects reports version 0.
 */
ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
	struct cl_object *obj = ll_i2info(inode)->lli_clob;

	ioc->idv_version = 0;
	ioc->idv_layout_version = UINT_MAX;

	/* If no file object initialized, we consider its version is 0. */

	env = cl_env_get(&refcheck);
		RETURN(PTR_ERR(env));

	io = vvp_env_thread_io(env);

	io->u.ci_data_version.dv_data_version = 0;
	io->u.ci_data_version.dv_layout_version = UINT_MAX;
	io->u.ci_data_version.dv_flags = ioc->idv_flags;

	if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
		result = cl_io_loop(env, io);
		result = io->ci_result;

	ioc->idv_version = io->u.ci_data_version.dv_data_version;
	ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;

	cl_io_fini(env, io);

	/* Layout changed under us: redo the whole data-version IO. */
	if (unlikely(io->ci_need_restart))

	cl_env_put(env, &refcheck);
2582 * Read the data_version for inode.
2584 * This value is computed using stripe object version on OST.
2585 * Version is computed using server side locking.
2587 * @param flags if do sync on the OST side;
2589 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2590 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/* Thin wrapper over ll_ioc_data_version(): extracts only idv_version. */
2592 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2594 struct ioc_data_version ioc = { .idv_flags = flags };
2597 rc = ll_ioc_data_version(inode, &ioc);
/* On success, hand back just the data version requested by the caller. */
2599 *data_version = ioc.idv_version;
2605 * Trigger a HSM release request for the provided inode.
2607 int ll_hsm_release(struct inode *inode)
2610 struct obd_client_handle *och = NULL;
2611 __u64 data_version = 0;
2616 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2617 ll_i2sbi(inode)->ll_fsname,
2618 PFID(&ll_i2info(inode)->lli_fid));
/* Take a write lease so no other client can modify the file while the
 * MDT swaps its layout to "released". */
2620 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2622 GOTO(out, rc = PTR_ERR(och));
2624 /* Grab latest data_version and [am]time values */
2625 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2629 env = cl_env_get(&refcheck);
2631 GOTO(out, rc = PTR_ERR(env));
/* Merge OST attributes into the inode before the close-with-release. */
2633 rc = ll_merge_attr(env, inode);
2634 cl_env_put(env, &refcheck);
2636 /* If error happen, we have the wrong size for a file.
2642 /* Release the file.
2643 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2644 * we still need it to pack l_remote_handle to MDT. */
2645 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
/* Cleanup: if the lease is still ours (not consumed by the close
 * above), close it explicitly. */
2651 if (och != NULL && !IS_ERR(och)) /* close the file */
2652 ll_lease_close(och, inode, NULL);
/* Scratch state for ll_swap_layouts(): the two inodes being swapped
 * (kept in FID order) plus data-version bookkeeping fields declared on
 * lines elided from this view. */
2657 struct ll_swap_stack {
2660 struct inode *inode1;
2661 struct inode *inode2;
/*
 * Swap the layouts of two open files (LL_IOC_LOV_SWAP_LAYOUTS).
 *
 * Optionally verifies that each file's data version still matches the
 * value supplied by the caller (SWAP_LAYOUTS_CHECK_DV*) and optionally
 * takes group locks to flush dirty cache first.  The actual swap is
 * performed by the MDT via obd_iocontrol().
 */
2666 static int ll_swap_layouts(struct file *file1, struct file *file2,
2667 struct lustre_swap_layouts *lsl)
2669 struct mdc_swap_layouts msl;
2670 struct md_op_data *op_data;
2673 struct ll_swap_stack *llss = NULL;
2676 OBD_ALLOC_PTR(llss);
2680 llss->inode1 = file_inode(file1);
2681 llss->inode2 = file_inode(file2);
2683 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2687 /* we use 2 bool because it is easier to swap than 2 bits */
2688 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2689 llss->check_dv1 = true;
2691 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2692 llss->check_dv2 = true;
2694 /* we cannot use lsl->sl_dvX directly because we may swap them */
2695 llss->dv1 = lsl->sl_dv1;
2696 llss->dv2 = lsl->sl_dv2;
/* Order the pair by FID so two concurrent swaps of the same files
 * always lock in the same order (deadlock avoidance). */
2698 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2699 if (rc == 0) /* same file, done! */
2702 if (rc < 0) { /* sequentialize it */
2703 swap(llss->inode1, llss->inode2);
2705 swap(llss->dv1, llss->dv2);
2706 swap(llss->check_dv1, llss->check_dv2);
2710 if (gid != 0) { /* application asks to flush dirty cache */
2711 rc = ll_get_grouplock(llss->inode1, file1, gid);
2715 rc = ll_get_grouplock(llss->inode2, file2, gid);
/* Second grouplock failed: drop the first one before bailing out. */
2717 ll_put_grouplock(llss->inode1, file1, gid);
2722 /* ultimate check, before swaping the layouts we check if
2723 * dataversion has changed (if requested) */
2724 if (llss->check_dv1) {
2725 rc = ll_data_version(llss->inode1, &dv, 0);
2728 if (dv != llss->dv1)
2729 GOTO(putgl, rc = -EAGAIN);
2732 if (llss->check_dv2) {
2733 rc = ll_data_version(llss->inode2, &dv, 0);
2736 if (dv != llss->dv2)
2737 GOTO(putgl, rc = -EAGAIN);
2740 /* struct md_op_data is used to send the swap args to the mdt
2741 * only flags is missing, so we use struct mdc_swap_layouts
2742 * through the md_op_data->op_data */
2743 /* flags from user space have to be converted before they are send to
2744 * server, no flag is sent today, they are only used on the client */
2747 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2748 0, LUSTRE_OPC_ANY, &msl);
2749 if (IS_ERR(op_data))
2750 GOTO(free, rc = PTR_ERR(op_data));
2752 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2753 sizeof(*op_data), op_data, NULL);
2754 ll_finish_md_op_data(op_data);
/* putgl: release group locks in reverse acquisition order. */
2761 ll_put_grouplock(llss->inode2, file2, gid);
2762 ll_put_grouplock(llss->inode1, file1, gid);
/*
 * Set/clear HSM state flags on @inode via the MDT.
 *
 * Validates the requested masks (and archive id on older servers)
 * before forwarding the request through obd_iocontrol().
 */
2772 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2774 struct obd_export *exp = ll_i2mdexp(inode);
2775 struct md_op_data *op_data;
2779 /* Detect out-of range masks */
2780 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2783 /* Non-root users are forbidden to set or clear flags which are
2784 * NOT defined in HSM_USER_MASK. */
2785 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2786 !cfs_capable(CFS_CAP_SYS_ADMIN))
/* Servers without archive-id arrays only support a limited id range. */
2789 if (!exp_connect_archive_id_array(exp)) {
2790 /* Detect out-of range archive id */
2791 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2792 (hss->hss_archive_id > LL_HSM_ORIGIN_MAX_ARCHIVE))
2796 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2797 LUSTRE_OPC_ANY, hss);
2798 if (IS_ERR(op_data))
2799 RETURN(PTR_ERR(op_data));
2801 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, exp, sizeof(*op_data),
2804 ll_finish_md_op_data(op_data);
/*
 * Import a file from an HSM archive: mark it ARCHIVED|EXISTS|RELEASED
 * and then restore its saved attributes (mode, uid/gid, size, times)
 * from the hsm_user_import descriptor supplied by userspace.
 */
2809 static int ll_hsm_import(struct inode *inode, struct file *file,
2810 struct hsm_user_import *hui)
2812 struct hsm_state_set *hss = NULL;
2813 struct iattr *attr = NULL;
/* HSM import only makes sense for regular files. */
2817 if (!S_ISREG(inode->i_mode))
2823 GOTO(out, rc = -ENOMEM);
2825 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2826 hss->hss_archive_id = hui->hui_archive_id;
2827 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2828 rc = ll_hsm_state_set(inode, hss);
2832 OBD_ALLOC_PTR(attr);
2834 GOTO(out, rc = -ENOMEM);
/* Rebuild the attributes recorded at archive time; force S_IFREG. */
2836 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2837 attr->ia_mode |= S_IFREG;
2838 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2839 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2840 attr->ia_size = hui->hui_size;
2841 attr->ia_mtime.tv_sec = hui->hui_mtime;
2842 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2843 attr->ia_atime.tv_sec = hui->hui_atime;
2844 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2846 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2847 ATTR_UID | ATTR_GID |
2848 ATTR_MTIME | ATTR_MTIME_SET |
2849 ATTR_ATIME | ATTR_ATIME_SET;
/* Apply the attributes; inode lock is taken on an elided line above. */
2853 rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2857 inode_unlock(inode);
2869 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2871 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2872 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * LL_IOC_FUTIMES_3: set atime, mtime AND ctime of a regular file from
 * the supplied ll_futimes_3 values.  Unlike utimes(2) this can also
 * set ctime, hence the CAP_SYS_ADMIN requirement.
 */
2875 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2877 struct inode *inode = file_inode(file);
/* Build the iattr from the user-supplied second/nanosecond pairs. */
2879 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2880 ATTR_MTIME | ATTR_MTIME_SET |
2883 .tv_sec = lfu->lfu_atime_sec,
2884 .tv_nsec = lfu->lfu_atime_nsec,
2887 .tv_sec = lfu->lfu_mtime_sec,
2888 .tv_nsec = lfu->lfu_mtime_nsec,
2891 .tv_sec = lfu->lfu_ctime_sec,
2892 .tv_nsec = lfu->lfu_ctime_nsec,
/* Setting ctime is privileged. */
2898 if (!capable(CAP_SYS_ADMIN))
2901 if (!S_ISREG(inode->i_mode))
/* OP_XVALID_CTIME_SET tells setattr to honor the explicit ctime. */
2905 rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2907 inode_unlock(inode);
/* Map a userspace lockahead mode (MODE_READ_USER / MODE_WRITE_USER)
 * to the kernel cl_lock_mode; the return statements for each case are
 * on lines elided from this view. */
2912 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2915 case MODE_READ_USER:
2917 case MODE_WRITE_USER:
/* Printable names for lock_mode_user values, used in debug output. */
2924 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2926 /* Used to allow the upper layers of the client to request an LDLM lock
2927 * without doing an actual read or write.
2929 * Used for ladvise lockahead to manually request specific locks.
2931 * \param[in] file file this ladvise lock request is on
2932 * \param[in] ladvise ladvise struct describing this lock request
2934 * \retval 0 success, no detailed result available (sync requests
2935 * and requests sent to the server [not handled locally]
2936 * cannot return detailed results)
2937 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2938 * see definitions for details.
2939 * \retval negative negative errno on error
2941 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2943 struct lu_env *env = NULL;
2944 struct cl_io *io = NULL;
2945 struct cl_lock *lock = NULL;
2946 struct cl_lock_descr *descr = NULL;
2947 struct dentry *dentry = file->f_path.dentry;
2948 struct inode *inode = dentry->d_inode;
2949 enum cl_lock_mode cl_mode;
2950 off_t start = ladvise->lla_start;
2951 off_t end = ladvise->lla_end;
2957 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2958 "start=%llu, end=%llu\n", dentry->d_name.len,
2959 dentry->d_name.name, dentry->d_inode,
2960 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
/* Reject invalid user-supplied lock modes up front. */
2963 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2965 GOTO(out, result = cl_mode);
2967 /* Get IO environment */
2968 result = cl_io_get(inode, &env, &io, &refcheck);
2972 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2975 * nothing to do for this io. This currently happens when
2976 * stripe sub-object's are not yet created.
2978 result = io->ci_result;
2979 } else if (result == 0) {
2980 lock = vvp_env_lock(env);
2981 descr = &lock->cll_descr;
2983 descr->cld_obj = io->ci_obj;
2984 /* Convert byte offsets to pages */
2985 descr->cld_start = cl_index(io->ci_obj, start);
2986 descr->cld_end = cl_index(io->ci_obj, end);
2987 descr->cld_mode = cl_mode;
2988 /* CEF_MUST is used because we do not want to convert a
2989 * lockahead request to a lockless lock */
2990 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
/* LF_ASYNC requests become speculative (glimpse-style) enqueues. */
2993 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2994 descr->cld_enq_flags |= CEF_SPECULATIVE;
2996 result = cl_lock_request(env, io, lock);
2998 /* On success, we need to release the lock */
3000 cl_lock_release(env, lock);
3002 cl_io_fini(env, io);
3003 cl_env_put(env, &refcheck);
3005 /* -ECANCELED indicates a matching lock with a different extent
3006 * was already present, and -EEXIST indicates a matching lock
3007 * on exactly the same extent was already present.
3008 * We convert them to positive values for userspace to make
3009 * recognizing true errors easier.
3010 * Note we can only return these detailed results on async requests,
3011 * as sync requests look the same as i/o requests for locking. */
3012 if (result == -ECANCELED)
3013 result = LLA_RESULT_DIFFERENT;
3014 else if (result == -EEXIST)
3015 result = LLA_RESULT_SAME;
/* Printable names for lu_ladvise_type values, used in debug output. */
3020 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
/*
 * Validate one llapi_lu_ladvise entry before it is acted on: checks
 * the advice value, its per-advice flags, and (for range-based
 * advices) that lla_start < lla_end.  Returns 0 or negative errno.
 */
3022 static int ll_ladvise_sanity(struct inode *inode,
3023 struct llapi_lu_ladvise *ladvise)
3025 struct ll_sb_info *sbi = ll_i2sbi(inode);
3026 enum lu_ladvise_type advice = ladvise->lla_advice;
3027 /* Note the peradvice flags is a 32 bit field, so per advice flags must
3028 * be in the first 32 bits of enum ladvise_flags */
3029 __u32 flags = ladvise->lla_peradvice_flags;
3030 /* 3 lines at 80 characters per line, should be plenty */
/* Unknown advice value: report the newest advice we do support. */
3033 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
3035 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
3036 "last supported advice is %s (value '%d'): rc = %d\n",
3037 sbi->ll_fsname, advice,
3038 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
3042 /* Per-advice checks */
3044 case LU_LADVISE_LOCKNOEXPAND:
3045 if (flags & ~LF_LOCKNOEXPAND_MASK) {
3047 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
3048 "rc = %d\n", sbi->ll_fsname, flags,
3049 ladvise_names[advice], rc);
3053 case LU_LADVISE_LOCKAHEAD:
3054 /* Currently only READ and WRITE modes can be requested */
3055 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
3056 ladvise->lla_lockahead_mode == 0) {
3058 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
3059 "rc = %d\n", sbi->ll_fsname,
3060 ladvise->lla_lockahead_mode,
3061 ladvise_names[advice], rc);
3065 case LU_LADVISE_WILLREAD:
3066 case LU_LADVISE_DONTNEED:
3068 /* Note fall through above - These checks apply to all advices
3069 * except LOCKNOEXPAND */
3070 if (flags & ~LF_DEFAULT_MASK) {
3072 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
3073 "rc = %d\n", sbi->ll_fsname, flags,
3074 ladvise_names[advice], rc);
/* Range advices need a non-empty [start, end) extent. */
3077 if (ladvise->lla_start >= ladvise->lla_end) {
3079 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
3080 "for %s: rc = %d\n", sbi->ll_fsname,
3081 ladvise->lla_start, ladvise->lla_end,
3082 ladvise_names[advice], rc);
3094 * Give file access advices
3096 * The ladvise interface is similar to Linux fadvise() system call, except it
3097 * forwards the advices directly from Lustre client to server. The server side
3098 * codes will apply appropriate read-ahead and caching techniques for the
3099 * corresponding files.
3101 * A typical workload for ladvise is e.g. a bunch of different clients are
3102 * doing small random reads of a file, so prefetching pages into OSS cache
3103 * with big linear reads before the random IO is a net benefit. Fetching
3104 * all that data into each client cache with fadvise() may not be, due to
3105 * much more data being sent to the client.
3107 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
3108 struct llapi_lu_ladvise *ladvise)
3112 struct cl_ladvise_io *lio;
3117 env = cl_env_get(&refcheck);
3119 RETURN(PTR_ERR(env));
3121 io = vvp_env_thread_io(env);
3122 io->ci_obj = ll_i2info(inode)->lli_clob;
3124 /* initialize parameters for ladvise */
3125 lio = &io->u.ci_ladvise;
3126 lio->li_start = ladvise->lla_start;
3127 lio->li_end = ladvise->lla_end;
3128 lio->li_fid = ll_inode2fid(inode);
3129 lio->li_advice = ladvise->lla_advice;
3130 lio->li_flags = flags;
/* Run the CIT_LADVISE io; the advice travels to the server via cl_io. */
3132 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
3133 rc = cl_io_loop(env, io);
3137 cl_io_fini(env, io);
3138 cl_env_put(env, &refcheck);
3142 static int ll_lock_noexpand(struct file *file, int flags)
3144 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3146 fd->ll_lock_no_expand = !(flags & LF_UNSET);
/*
 * FS_IOC_FSGETXATTR: report the inode's extended flags (as FS_XFLAG_*)
 * and project quota id to userspace in a struct fsxattr.
 */
3151 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
3154 struct fsxattr fsxattr;
3156 if (copy_from_user(&fsxattr,
3157 (const struct fsxattr __user *)arg,
/* Translate kernel inode flags to the fsxattr xflag encoding. */
3161 fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
3162 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
3163 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
3164 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
3165 if (copy_to_user((struct fsxattr __user *)arg,
3166 &fsxattr, sizeof(fsxattr)))
/*
 * Reject project-quota state changes attempted from outside the init
 * user namespace.  Callers inside init_user_ns pass unconditionally;
 * others may proceed only if they change neither the project id nor
 * the PROJINHERIT flag.
 */
3172 int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
3175 * Project Quota ID state is only allowed to change from within the init
3176 * namespace. Enforce that restriction only if we are trying to change
3177 * the quota ID state. Everything else is allowed in user namespaces.
3179 if (current_user_ns() == &init_user_ns)
/* Outside init_user_ns: the project id must stay unchanged... */
3182 if (ll_i2info(inode)->lli_projid != fa->fsx_projid)
/* ...and the PROJINHERIT flag must keep its current value. */
3185 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) {
3186 if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
3189 if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
/*
 * FS_IOC_FSSETXATTR: apply xflags and project id from userspace.
 * The flags/projid are first set on the MDT via md_setattr(), then the
 * flags are propagated to the OST objects via cl_setattr_ost().
 */
3196 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3200 struct md_op_data *op_data;
3201 struct ptlrpc_request *req = NULL;
3203 struct fsxattr fsxattr;
3204 struct cl_object *obj;
3208 if (copy_from_user(&fsxattr,
3209 (const struct fsxattr __user *)arg,
/* Enforce user-namespace restrictions on project id changes. */
3213 rc = ll_ioctl_check_project(inode, &fsxattr);
3217 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3218 LUSTRE_OPC_ANY, NULL);
3219 if (IS_ERR(op_data))
3220 RETURN(PTR_ERR(op_data));
3222 flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3223 op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3224 if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3225 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3226 op_data->op_projid = fsxattr.fsx_projid;
3227 op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
3228 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3230 ptlrpc_req_finished(req);
3232 GOTO(out_fsxattr, rc);
3233 ll_update_inode_flags(inode, op_data->op_attr_flags);
/* No data objects yet (e.g. released file): nothing to do on OSTs. */
3234 obj = ll_i2info(inode)->lli_clob;
3236 GOTO(out_fsxattr, rc);
3238 OBD_ALLOC_PTR(attr);
3240 GOTO(out_fsxattr, rc = -ENOMEM);
3242 rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3243 fsxattr.fsx_xflags);
3246 ll_finish_md_op_data(op_data);
/*
 * Release the lease held on @file, optionally performing a close
 * intent first: resync-done, layout merge, layout split or PCC attach,
 * selected by ioc->lil_flags.  Returns the lease type that was held
 * (LL_LEASE_RDLCK/WRLCK bits) or negative errno.
 */
3250 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3253 struct inode *inode = file_inode(file);
3254 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3255 struct ll_inode_info *lli = ll_i2info(inode);
3256 struct obd_client_handle *och = NULL;
3257 struct split_param sp;
3258 struct pcc_param param;
3259 bool lease_broken = false;
3261 enum mds_op_bias bias = 0;
3262 struct file *layout_file = NULL;
3264 size_t data_size = 0;
3265 bool attached = false;
/* Detach the lease handle from the file data under lli_och_mutex so
 * only one thread can close it. */
3270 mutex_lock(&lli->lli_och_mutex);
3271 if (fd->fd_lease_och != NULL) {
3272 och = fd->fd_lease_och;
3273 fd->fd_lease_och = NULL;
3275 mutex_unlock(&lli->lli_och_mutex);
3280 fmode = och->och_flags;
3282 switch (ioc->lil_flags) {
3283 case LL_LEASE_RESYNC_DONE:
3284 if (ioc->lil_count > IOC_IDS_MAX)
3285 GOTO(out_lease_close, rc = -EINVAL);
/* Variable-length id array travels to the MDT as close data. */
3287 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3288 OBD_ALLOC(data, data_size);
3290 GOTO(out_lease_close, rc = -ENOMEM);
3292 if (copy_from_user(data, (void __user *)arg, data_size))
3293 GOTO(out_lease_close, rc = -EFAULT);
3295 bias = MDS_CLOSE_RESYNC_DONE;
3297 case LL_LEASE_LAYOUT_MERGE: {
3300 if (ioc->lil_count != 1)
3301 GOTO(out_lease_close, rc = -EINVAL);
/* Read the victim file descriptor that follows the ioc header;
 * 'fd' here is a local descriptor number declared on an elided line,
 * shadowing the outer ll_file_data pointer. */
3303 arg += sizeof(*ioc);
3304 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3305 GOTO(out_lease_close, rc = -EFAULT);
3307 layout_file = fget(fd);
3309 GOTO(out_lease_close, rc = -EBADF);
3311 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3312 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3313 GOTO(out_lease_close, rc = -EPERM);
3315 data = file_inode(layout_file);
3316 bias = MDS_CLOSE_LAYOUT_MERGE;
3319 case LL_LEASE_LAYOUT_SPLIT: {
3323 if (ioc->lil_count != 2)
3324 GOTO(out_lease_close, rc = -EINVAL);
/* Split takes two __u32 args after the header: fd of victim file
 * and the mirror id to split out. */
3326 arg += sizeof(*ioc);
3327 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3328 GOTO(out_lease_close, rc = -EFAULT);
3330 arg += sizeof(__u32);
3331 if (copy_from_user(&mirror_id, (void __user *)arg,
3333 GOTO(out_lease_close, rc = -EFAULT);
3335 layout_file = fget(fdv);
3337 GOTO(out_lease_close, rc = -EBADF);
3339 sp.sp_inode = file_inode(layout_file);
3340 sp.sp_mirror_id = (__u16)mirror_id;
3342 bias = MDS_CLOSE_LAYOUT_SPLIT;
3345 case LL_LEASE_PCC_ATTACH:
3346 if (ioc->lil_count != 1)
/* NOTE(review): the '¶m.' sequences on the next lines look like
 * mojibake for '&param.' ('&para' -> '¶') — confirm against the
 * upstream lustre/llite/file.c and restore '&param'. */
3349 arg += sizeof(*ioc);
3350 if (copy_from_user(&param.pa_archive_id, (void __user *)arg,
3352 GOTO(out_lease_close, rc2 = -EFAULT);
3354 rc2 = pcc_readwrite_attach(file, inode, param.pa_archive_id);
3356 GOTO(out_lease_close, rc2);
3359 /* Grab latest data version */
3360 rc2 = ll_data_version(inode, &param.pa_data_version,
3363 GOTO(out_lease_close, rc2);
3366 bias = MDS_PCC_ATTACH;
3369 /* without close intent */
/* out_lease_close: perform the close-with-intent, then drop the open
 * handle reference. */
3374 rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3378 rc = ll_lease_och_release(inode, file);
/* Per-intent cleanup: free the resync buffer, drop the extra file
 * reference, or finish/abort the PCC attach. */
3387 switch (ioc->lil_flags) {
3388 case LL_LEASE_RESYNC_DONE:
3390 OBD_FREE(data, data_size);
3392 case LL_LEASE_LAYOUT_MERGE:
3393 case LL_LEASE_LAYOUT_SPLIT:
3397 case LL_LEASE_PCC_ATTACH:
3400 rc = pcc_readwrite_attach_fini(file, inode,
3401 param.pa_layout_gen,
/* Report the lease type that was held to the caller. */
3408 rc = ll_lease_type_from_fmode(fmode);
/*
 * LL_IOC_SET_LEASE handler: acquire (RDLCK/WRLCK) or release (UNLCK)
 * a lease on @file.  The requested mode must be compatible with the
 * file's open mode.  LL_LEASE_RESYNC additionally starts a mirror
 * resync under the new lease.
 */
3412 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3415 struct inode *inode = file_inode(file);
3416 struct ll_inode_info *lli = ll_i2info(inode);
3417 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3418 struct obd_client_handle *och = NULL;
3419 __u64 open_flags = 0;
/* A write lease needs FMODE_WRITE, a read lease needs FMODE_READ. */
3425 switch (ioc->lil_mode) {
3426 case LL_LEASE_WRLCK:
3427 if (!(file->f_mode & FMODE_WRITE))
3429 fmode = FMODE_WRITE;
3431 case LL_LEASE_RDLCK:
3432 if (!(file->f_mode & FMODE_READ))
3436 case LL_LEASE_UNLCK:
3437 RETURN(ll_file_unlock_lease(file, ioc, arg));
3442 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3444 /* apply for lease */
3445 if (ioc->lil_flags & LL_LEASE_RESYNC)
3446 open_flags = MDS_OPEN_RESYNC;
3447 och = ll_lease_open(inode, file, fmode, open_flags);
3449 RETURN(PTR_ERR(och));
/* RESYNC: tell the MDT resync starts, and refresh the layout so the
 * client sees the resync-pending state; drop the lease on failure. */
3451 if (ioc->lil_flags & LL_LEASE_RESYNC) {
3452 rc = ll_lease_file_resync(och, inode, arg);
3454 ll_lease_close(och, inode, NULL);
3457 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3459 ll_lease_close(och, inode, NULL);
/* Publish the lease handle unless one is already installed. */
3465 mutex_lock(&lli->lli_och_mutex);
3466 if (fd->fd_lease_och == NULL) {
3467 fd->fd_lease_och = och;
3470 mutex_unlock(&lli->lli_och_mutex);
3472 /* impossible now that only excl is supported for now */
3473 ll_lease_close(och, inode, &lease_broken);
3479 static void ll_heat_get(struct inode *inode, struct lu_heat *heat)
3481 struct ll_inode_info *lli = ll_i2info(inode);
3482 struct ll_sb_info *sbi = ll_i2sbi(inode);
3483 __u64 now = ktime_get_real_seconds();
3486 spin_lock(&lli->lli_heat_lock);
3487 heat->lh_flags = lli->lli_heat_flags;
3488 for (i = 0; i < heat->lh_count; i++)
3489 heat->lh_heat[i] = obd_heat_get(&lli->lli_heat_instances[i],
3490 now, sbi->ll_heat_decay_weight,
3491 sbi->ll_heat_period_second);
3492 spin_unlock(&lli->lli_heat_lock);
3495 static int ll_heat_set(struct inode *inode, enum lu_heat_flag flags)
3497 struct ll_inode_info *lli = ll_i2info(inode);
3500 spin_lock(&lli->lli_heat_lock);
3501 if (flags & LU_HEAT_FLAG_CLEAR)
3502 obd_heat_clear(lli->lli_heat_instances, OBD_HEAT_COUNT);
3504 if (flags & LU_HEAT_FLAG_OFF)
3505 lli->lli_heat_flags |= LU_HEAT_FLAG_OFF;
3507 lli->lli_heat_flags &= ~LU_HEAT_FLAG_OFF;
3509 spin_unlock(&lli->lli_heat_lock);
/*
 * Main ioctl dispatcher for regular Lustre files.  Each case either
 * handles the command locally, forwards it to the MDC/OSC via
 * obd_iocontrol(), or delegates to a dedicated helper above.
 */
3515 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3517 struct inode *inode = file_inode(file);
3518 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3522 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3523 PFID(ll_inode2fid(inode)), inode, cmd);
3524 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3526 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
3527 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
/* Per-file-descriptor flag manipulation. */
3531 case LL_IOC_GETFLAGS:
3532 /* Get the current value of the file flags */
3533 return put_user(fd->fd_flags, (int __user *)arg);
3534 case LL_IOC_SETFLAGS:
3535 case LL_IOC_CLRFLAGS:
3536 /* Set or clear specific file flags */
3537 /* XXX This probably needs checks to ensure the flags are
3538 * not abused, and to handle any flag side effects.
3540 if (get_user(flags, (int __user *) arg))
/* IGNORE_LOCK is only safe for O_DIRECT IO where the page cache is
 * bypassed. */
3543 if (cmd == LL_IOC_SETFLAGS) {
3544 if ((flags & LL_FILE_IGNORE_LOCK) &&
3545 !(file->f_flags & O_DIRECT)) {
3546 CERROR("%s: unable to disable locking on "
3547 "non-O_DIRECT file\n", current->comm);
3551 fd->fd_flags |= flags;
3553 fd->fd_flags &= ~flags;
/* Striping / layout operations. */
3556 case LL_IOC_LOV_SETSTRIPE:
3557 case LL_IOC_LOV_SETSTRIPE_NEW:
3558 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3559 case LL_IOC_LOV_SETEA:
3560 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3561 case LL_IOC_LOV_SWAP_LAYOUTS: {
3563 struct lustre_swap_layouts lsl;
3565 if (copy_from_user(&lsl, (char __user *)arg,
3566 sizeof(struct lustre_swap_layouts)))
/* Both files must be open for write to swap layouts. */
3569 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3572 file2 = fget(lsl.sl_fd);
3576 /* O_WRONLY or O_RDWR */
3577 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3578 GOTO(out, rc = -EPERM);
/* CLOSE variant: swap via the lease handle and close it. */
3580 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3581 struct inode *inode2;
3582 struct ll_inode_info *lli;
3583 struct obd_client_handle *och = NULL;
3585 lli = ll_i2info(inode);
3586 mutex_lock(&lli->lli_och_mutex);
3587 if (fd->fd_lease_och != NULL) {
3588 och = fd->fd_lease_och;
3589 fd->fd_lease_och = NULL;
3591 mutex_unlock(&lli->lli_och_mutex);
3593 GOTO(out, rc = -ENOLCK);
3594 inode2 = file_inode(file2);
3595 rc = ll_swap_layouts_close(och, inode, inode2);
3597 rc = ll_swap_layouts(file, file2, &lsl);
3603 case LL_IOC_LOV_GETSTRIPE:
3604 case LL_IOC_LOV_GETSTRIPE_NEW:
3605 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3606 case FS_IOC_GETFLAGS:
3607 case FS_IOC_SETFLAGS:
3608 RETURN(ll_iocontrol(inode, file, cmd, arg));
3609 case FSFILT_IOC_GETVERSION:
3610 case FS_IOC_GETVERSION:
3611 RETURN(put_user(inode->i_generation, (int __user *)arg));
3612 /* We need to special case any other ioctls we want to handle,
3613 * to send them to the MDS/OST as appropriate and to properly
3614 * network encode the arg field. */
3615 case FS_IOC_SETVERSION:
/* Group locks and statfs. */
3618 case LL_IOC_GROUP_LOCK:
3619 RETURN(ll_get_grouplock(inode, file, arg));
3620 case LL_IOC_GROUP_UNLOCK:
3621 RETURN(ll_put_grouplock(inode, file, arg));
3622 case IOC_OBD_STATFS:
3623 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3625 case LL_IOC_FLUSHCTX:
3626 RETURN(ll_flush_ctx(inode));
/* FID helpers. */
3627 case LL_IOC_PATH2FID: {
3628 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3629 sizeof(struct lu_fid)))
3634 case LL_IOC_GETPARENT:
3635 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3637 case OBD_IOC_FID2PATH:
3638 RETURN(ll_fid2path(inode, (void __user *)arg));
3639 case LL_IOC_DATA_VERSION: {
3640 struct ioc_data_version idv;
3643 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
/* Only the flush flags are honored from userspace. */
3646 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3647 rc = ll_ioc_data_version(inode, &idv);
3650 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3656 case LL_IOC_GET_MDTIDX: {
3659 mdtidx = ll_get_mdt_idx(inode);
3663 if (put_user((int)mdtidx, (int __user *)arg))
3668 case OBD_IOC_GETDTNAME:
3669 case OBD_IOC_GETMDNAME:
3670 RETURN(ll_get_obd_name(inode, cmd, arg));
/* HSM state queries and updates (buffers allocated then forwarded
 * to the MDT). */
3671 case LL_IOC_HSM_STATE_GET: {
3672 struct md_op_data *op_data;
3673 struct hsm_user_state *hus;
3680 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3681 LUSTRE_OPC_ANY, hus);
3682 if (IS_ERR(op_data)) {
3684 RETURN(PTR_ERR(op_data));
3687 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3690 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3693 ll_finish_md_op_data(op_data);
3697 case LL_IOC_HSM_STATE_SET: {
3698 struct hsm_state_set *hss;
3705 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3710 rc = ll_hsm_state_set(inode, hss);
3715 case LL_IOC_HSM_ACTION: {
3716 struct md_op_data *op_data;
3717 struct hsm_current_action *hca;
3724 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3725 LUSTRE_OPC_ANY, hca);
3726 if (IS_ERR(op_data)) {
3728 RETURN(PTR_ERR(op_data));
3731 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3734 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3737 ll_finish_md_op_data(op_data);
/* Lease management (old single-arg form and current struct form). */
3741 case LL_IOC_SET_LEASE_OLD: {
3742 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3744 RETURN(ll_file_set_lease(file, &ioc, 0));
3746 case LL_IOC_SET_LEASE: {
3747 struct ll_ioc_lease ioc;
3749 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3752 RETURN(ll_file_set_lease(file, &ioc, arg));
3754 case LL_IOC_GET_LEASE: {
3755 struct ll_inode_info *lli = ll_i2info(inode);
3756 struct ldlm_lock *lock = NULL;
/* Report the lease mode only if the lease lock is still valid
 * (not cancelled). */
3759 mutex_lock(&lli->lli_och_mutex);
3760 if (fd->fd_lease_och != NULL) {
3761 struct obd_client_handle *och = fd->fd_lease_och;
3763 lock = ldlm_handle2lock(&och->och_lease_handle);
3765 lock_res_and_lock(lock);
3766 if (!ldlm_is_cancel(lock))
3767 fmode = och->och_flags;
3769 unlock_res_and_lock(lock);
3770 LDLM_LOCK_PUT(lock);
3773 mutex_unlock(&lli->lli_och_mutex);
3775 RETURN(ll_lease_type_from_fmode(fmode));
3777 case LL_IOC_HSM_IMPORT: {
3778 struct hsm_user_import *hui;
3784 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3789 rc = ll_hsm_import(inode, file, hui);
3794 case LL_IOC_FUTIMES_3: {
3795 struct ll_futimes_3 lfu;
3797 if (copy_from_user(&lfu,
3798 (const struct ll_futimes_3 __user *)arg,
3802 RETURN(ll_file_futimes_3(file, &lfu));
/* Ladvise: read the header, then the variable-length advice array,
 * and dispatch each advice individually. */
3804 case LL_IOC_LADVISE: {
3805 struct llapi_ladvise_hdr *k_ladvise_hdr;
3806 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3809 int alloc_size = sizeof(*k_ladvise_hdr);
3812 u_ladvise_hdr = (void __user *)arg;
3813 OBD_ALLOC_PTR(k_ladvise_hdr);
3814 if (k_ladvise_hdr == NULL)
3817 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3818 GOTO(out_ladvise, rc = -EFAULT);
3820 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3821 k_ladvise_hdr->lah_count < 1)
3822 GOTO(out_ladvise, rc = -EINVAL);
3824 num_advise = k_ladvise_hdr->lah_count;
3825 if (num_advise >= LAH_COUNT_MAX)
3826 GOTO(out_ladvise, rc = -EFBIG);
/* Re-allocate at the full size now that lah_count is known, then
 * re-copy header plus all advices. */
3828 OBD_FREE_PTR(k_ladvise_hdr);
3829 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3830 lah_advise[num_advise]);
3831 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3832 if (k_ladvise_hdr == NULL)
3836 * TODO: submit multiple advices to one server in a single RPC
3838 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3839 GOTO(out_ladvise, rc = -EFAULT);
3841 for (i = 0; i < num_advise; i++) {
3842 struct llapi_lu_ladvise *k_ladvise =
3843 &k_ladvise_hdr->lah_advise[i];
3844 struct llapi_lu_ladvise __user *u_ladvise =
3845 &u_ladvise_hdr->lah_advise[i];
3847 rc = ll_ladvise_sanity(inode, k_ladvise);
3849 GOTO(out_ladvise, rc);
/* LOCKNOEXPAND and LOCKAHEAD are handled on the client; all other
 * advices go to the server via ll_ladvise(). */
3851 switch (k_ladvise->lla_advice) {
3852 case LU_LADVISE_LOCKNOEXPAND:
3853 rc = ll_lock_noexpand(file,
3854 k_ladvise->lla_peradvice_flags);
3855 GOTO(out_ladvise, rc);
3856 case LU_LADVISE_LOCKAHEAD:
3858 rc = ll_file_lock_ahead(file, k_ladvise);
3861 GOTO(out_ladvise, rc);
3864 &u_ladvise->lla_lockahead_result))
3865 GOTO(out_ladvise, rc = -EFAULT);
3868 rc = ll_ladvise(inode, file,
3869 k_ladvise_hdr->lah_flags,
3872 GOTO(out_ladvise, rc);
3879 OBD_FREE(k_ladvise_hdr, alloc_size);
3882 case LL_IOC_FLR_SET_MIRROR: {
3883 /* mirror I/O must be direct to avoid polluting page cache
3885 if (!(file->f_flags & O_DIRECT))
3888 fd->fd_designated_mirror = (__u32)arg;
/* Project-quota xattrs. */
3891 case LL_IOC_FSGETXATTR:
3892 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3893 case LL_IOC_FSSETXATTR:
3894 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3896 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
/* File-heat query/update. */
3897 case LL_IOC_HEAT_GET: {
3898 struct lu_heat uheat;
3899 struct lu_heat *heat;
3902 if (copy_from_user(&uheat, (void __user *)arg, sizeof(uheat)))
3905 if (uheat.lh_count > OBD_HEAT_COUNT)
3906 uheat.lh_count = OBD_HEAT_COUNT;
3908 size = offsetof(typeof(uheat), lh_heat[uheat.lh_count]);
3909 OBD_ALLOC(heat, size);
3913 heat->lh_count = uheat.lh_count;
3914 ll_heat_get(inode, heat);
3915 rc = copy_to_user((char __user *)arg, heat, size);
3916 OBD_FREE(heat, size);
3917 RETURN(rc ? -EFAULT : 0);
3919 case LL_IOC_HEAT_SET: {
3922 if (copy_from_user(&flags, (void __user *)arg, sizeof(flags)))
3925 rc = ll_heat_set(inode, flags);
/* Persistent Client Cache detach/state. */
3928 case LL_IOC_PCC_DETACH: {
3929 struct lu_pcc_detach *detach;
3931 OBD_ALLOC_PTR(detach);
3935 if (copy_from_user(detach,
3936 (const struct lu_pcc_detach __user *)arg,
3938 GOTO(out_detach_free, rc = -EFAULT);
3940 if (!S_ISREG(inode->i_mode))
3941 GOTO(out_detach_free, rc = -EINVAL);
3943 if (!inode_owner_or_capable(inode))
3944 GOTO(out_detach_free, rc = -EPERM);
3946 rc = pcc_ioctl_detach(inode, detach->pccd_opt);
3948 OBD_FREE_PTR(detach);
3951 case LL_IOC_PCC_STATE: {
3952 struct lu_pcc_state __user *ustate =
3953 (struct lu_pcc_state __user *)arg;
3954 struct lu_pcc_state *state;
3956 OBD_ALLOC_PTR(state);
3960 if (copy_from_user(state, ustate, sizeof(*state)))
3961 GOTO(out_state, rc = -EFAULT);
3963 rc = pcc_ioctl_state(file, inode, state);
3965 GOTO(out_state, rc);
3967 if (copy_to_user(ustate, state, sizeof(*state)))
3968 GOTO(out_state, rc = -EFAULT);
3971 OBD_FREE_PTR(state);
/* Default: forward unrecognized commands to the data export. */
3975 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3976 (void __user *)arg));
3980 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Validate @offset against the file's limits and commit it to f_pos.
 * Mirrors the kernel's lseek helper: negative offsets are rejected
 * unless FMODE_UNSIGNED_OFFSET is set, offsets beyond @maxsize are
 * rejected (the rejection returns are on elided lines), and a changed
 * position resets f_version.
 */
3981 static inline loff_t
3982 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3984 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3986 if (offset > maxsize)
3989 if (offset != file->f_pos) {
3990 file->f_pos = offset;
3991 file->f_version = 0;
/*
 * Local copy of generic_file_llseek_size(): compute the new file offset
 * for SEEK_SET/CUR/END/DATA/HOLE against @maxsize with end-of-file @eof.
 * NOTE(review): the SEEK_* switch structure is partially elided in this
 * excerpt; only representative lines are visible.
 */
3997 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3998 loff_t maxsize, loff_t eof)
4000 struct inode *inode = file_inode(file);
4008 * Here we special-case the lseek(fd, 0, SEEK_CUR)
4009 * position-querying operation. Avoid rewriting the "same"
4010 * f_pos value back to the file because a concurrent read(),
4011 * write() or lseek() might have altered it
4016 * f_lock protects against read/modify/write race with other
4017 * SEEK_CURs. Note that parallel writes and reads behave
4021 offset = llseek_execute(file, file->f_pos + offset, maxsize);
4022 inode_unlock(inode);
4026 * In the generic case the entire file is data, so as long as
4027 * offset isn't at the end of the file then the offset is data.
4034 * There is a virtual hole at the end of the file, so as long as
4035 * offset isn't i_size or larger, return i_size.
/* Commit the computed offset through the common validation helper. */
4043 return llseek_execute(file, offset, maxsize);
/*
 * llseek entry point for Lustre files. For SEEK_END/HOLE/DATA the
 * cached size may be stale, so glimpse the size from the OSTs first,
 * then delegate to ll_generic_file_llseek_size(). Elapsed time is
 * tallied under LPROC_LL_LLSEEK.
 */
4047 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
4049 struct inode *inode = file_inode(file);
4050 loff_t retval, eof = 0;
4051 ktime_t kstart = ktime_get();
/* Absolute target for the debug trace only; real work happens below. */
4054 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
4055 (origin == SEEK_CUR) ? file->f_pos : 0);
4056 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
4057 PFID(ll_inode2fid(inode)), inode, retval, retval,
/* These origins depend on an up-to-date i_size: refresh via glimpse. */
4060 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
4061 retval = ll_glimpse_size(inode);
4064 eof = i_size_read(inode);
4067 retval = ll_generic_file_llseek_size(file, offset, origin,
4068 ll_file_maxbytes(inode), eof);
4070 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK,
4071 ktime_us_delta(ktime_get(), kstart));
/*
 * flush() file operation: report (once) any asynchronous writeback error
 * recorded for this inode. Regular files only (asserted). Returns -EIO
 * if an unreported async error is pending, else 0.
 */
4075 static int ll_flush(struct file *file, fl_owner_t id)
4077 struct inode *inode = file_inode(file);
4078 struct ll_inode_info *lli = ll_i2info(inode);
4079 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
4082 LASSERT(!S_ISDIR(inode->i_mode));
4084 /* catch async errors that were recorded back when async writeback
4085 * failed for pages in this mapping. */
/* Read-and-clear so the error is reported to the caller only once. */
4086 rc = lli->lli_async_rc;
4087 lli->lli_async_rc = 0;
4088 if (lli->lli_clob != NULL) {
4089 err = lov_read_and_clear_async_rc(lli->lli_clob);
4094 /* The application has been told write failure already.
4095 * Do not report failure again. */
4096 if (fd->fd_write_failed)
4098 return rc ? -EIO : 0;
4102 * Called to make sure a portion of file has been written out.
4103 * If @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
4105 * Return how many pages have been written.
4107 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
4108 enum cl_fsync_mode mode, int ignore_layout)
4112 struct cl_fsync_io *fio;
/* Reject any mode outside the four supported fsync modes. */
4117 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
4118 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
4121 env = cl_env_get(&refcheck);
4123 RETURN(PTR_ERR(env));
4125 io = vvp_env_thread_io(env);
4126 io->ci_obj = ll_i2info(inode)->lli_clob;
4127 io->ci_ignore_layout = ignore_layout;
4129 /* initialize parameters for sync */
4130 fio = &io->u.ci_fsync;
4131 fio->fi_start = start;
4133 fio->fi_fid = ll_inode2fid(inode);
4134 fio->fi_mode = mode;
4135 fio->fi_nr_written = 0;
/* Run the CIT_FSYNC io; on init failure fall back to ci_result. */
4137 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
4138 result = cl_io_loop(env, io);
4140 result = io->ci_result;
/* On success the result is the number of pages written. */
4142 result = fio->fi_nr_written;
4143 cl_io_fini(env, io);
4144 cl_env_put(env, &refcheck);
4150 * When dentry is provided (the 'else' case), file_dentry() may be
4151 * null and dentry must be used directly rather than pulled from
4152 * file_dentry() as is done otherwise.
/*
 * fsync()/fdatasync() entry point: wait for dirty pages, collect any
 * recorded async writeback errors, sync metadata on the MDT, and for
 * regular files sync data (via PCC if attached, else OST sync).
 * fd_write_failed tracks whether a failure was already reported.
 */
4155 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
4157 struct dentry *dentry = file_dentry(file);
4158 struct inode *inode = dentry->d_inode;
4159 struct ll_inode_info *lli = ll_i2info(inode);
4160 struct ptlrpc_request *req;
4161 ktime_t kstart = ktime_get();
4166 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), start %lld, end %lld,"
4168 PFID(ll_inode2fid(inode)), inode, start, end, datasync);
4170 /* fsync's caller has already called _fdata{sync,write}, we want
4171 * that IO to finish before calling the osc and mdc sync methods */
4172 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
4175 /* catch async errors that were recorded back when async writeback
4176 * failed for pages in this mapping. */
4177 if (!S_ISDIR(inode->i_mode)) {
4178 err = lli->lli_async_rc;
4179 lli->lli_async_rc = 0;
4182 if (lli->lli_clob != NULL) {
4183 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* Sync the MDT-side metadata for this fid. */
4189 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
4193 ptlrpc_req_finished(req);
4195 if (S_ISREG(inode->i_mode)) {
4196 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
4199 /* Sync metadata on MDT first, and then sync the cached data
/* PCC-attached files fsync through the cache; 'cached' reports that. */
4202 err = pcc_fsync(file, start, end, datasync, &cached);
4204 err = cl_sync_file_range(inode, start, end,
4206 if (rc == 0 && err < 0)
/* Remember the outcome so ll_flush() does not re-report it. */
4209 fd->fd_write_failed = true;
4211 fd->fd_write_failed = false;
4214 inode_unlock(inode);
4217 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC,
4218 ktime_us_delta(ktime_get(), kstart));
/*
 * flock()/fcntl() lock handler: translate a kernel file_lock into an
 * LDLM_FLOCK enqueue on the MDT, then mirror the result into the local
 * VFS lock state. Handles both BSD flock (whole-file) and POSIX
 * byte-range locks; F_UNLCK is modeled as an LCK_NL enqueue.
 */
4223 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
4225 struct inode *inode = file_inode(file);
4226 struct ll_sb_info *sbi = ll_i2sbi(inode);
4227 struct ldlm_enqueue_info einfo = {
4228 .ei_type = LDLM_FLOCK,
4229 .ei_cb_cp = ldlm_flock_completion_ast,
4230 .ei_cbdata = file_lock,
4232 struct md_op_data *op_data;
4233 struct lustre_handle lockh = { 0 };
4234 union ldlm_policy_data flock = { { 0 } };
4235 int fl_type = file_lock->fl_type;
4236 ktime_t kstart = ktime_get();
4242 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
4243 PFID(ll_inode2fid(inode)), file_lock);
4245 if (file_lock->fl_flags & FL_FLOCK) {
4246 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
4247 /* flocks are whole-file locks */
4248 flock.l_flock.end = OFFSET_MAX;
4249 /* For flocks owner is determined by the local file descriptor*/
4250 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
4251 } else if (file_lock->fl_flags & FL_POSIX) {
4252 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
4253 flock.l_flock.start = file_lock->fl_start;
4254 flock.l_flock.end = file_lock->fl_end;
4258 flock.l_flock.pid = file_lock->fl_pid;
4260 #if defined(HAVE_LM_COMPARE_OWNER) || defined(lm_compare_owner)
4261 /* Somewhat ugly workaround for svc lockd.
4262 * lockd installs custom fl_lmops->lm_compare_owner that checks
4263 * for the fl_owner to be the same (which it always is on local node
4264 * I guess between lockd processes) and then compares pid.
4265 * As such we assign pid to the owner field to make it all work,
4266 * conflict with normal locks is unlikely since pid space and
4267 * pointer space for current->files are not intersecting */
4268 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
4269 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* Map the requested fl_type onto an LDLM mode (read=PR, write=PW). */
4274 einfo.ei_mode = LCK_PR;
4277 /* An unlock request may or may not have any relation to
4278 * existing locks so we may not be able to pass a lock handle
4279 * via a normal ldlm_lock_cancel() request. The request may even
4280 * unlock a byte range in the middle of an existing lock. In
4281 * order to process an unlock request we need all of the same
4282 * information that is given with a normal read or write record
4283 * lock request. To avoid creating another ldlm unlock (cancel)
4284 * message we'll treat a LCK_NL flock request as an unlock. */
4285 einfo.ei_mode = LCK_NL;
4288 einfo.ei_mode = LCK_PW;
4291 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* Map the fcntl command onto enqueue flags (nonblock/test/etc). */
4306 flags = LDLM_FL_BLOCK_NOWAIT;
4312 flags = LDLM_FL_TEST_LOCK;
4315 CERROR("unknown fcntl lock command: %d\n", cmd);
4319 /* Save the old mode so that if the mode in the lock changes we
4320 * can decrement the appropriate reader or writer refcount. */
4321 file_lock->fl_type = einfo.ei_mode;
4323 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4324 LUSTRE_OPC_ANY, NULL);
4325 if (IS_ERR(op_data))
4326 RETURN(PTR_ERR(op_data));
4328 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4329 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4330 flock.l_flock.pid, flags, einfo.ei_mode,
4331 flock.l_flock.start, flock.l_flock.end);
4333 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4336 /* Restore the file lock type if not TEST lock. */
4337 if (!(flags & LDLM_FL_TEST_LOCK))
4338 file_lock->fl_type = fl_type;
/* Mirror the server-granted lock into the local VFS lock table. */
4340 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4341 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4342 !(flags & LDLM_FL_TEST_LOCK))
4343 rc2 = locks_lock_file_wait(file, file_lock);
4345 if ((file_lock->fl_flags & FL_FLOCK) &&
4346 (rc == 0 || file_lock->fl_type == F_UNLCK))
4347 rc2 = flock_lock_file_wait(file, file_lock);
4348 if ((file_lock->fl_flags & FL_POSIX) &&
4349 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4350 !(flags & LDLM_FL_TEST_LOCK))
4351 rc2 = posix_lock_file_wait(file, file_lock);
4352 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
/* Local bookkeeping failed: undo the server lock with an NL enqueue. */
4354 if (rc2 && file_lock->fl_type != F_UNLCK) {
4355 einfo.ei_mode = LCK_NL;
4356 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4361 ll_finish_md_op_data(op_data);
4364 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK,
4365 ktime_us_delta(ktime_get(), kstart));
/*
 * Look up the FID of @name under @parent via an MDT getattr-by-name.
 * On success *fid is filled; when @inode is non-NULL the inode is also
 * instantiated from the reply. Caller owns the returned inode reference.
 */
4369 int ll_get_fid_by_name(struct inode *parent, const char *name,
4370 int namelen, struct lu_fid *fid,
4371 struct inode **inode)
4373 struct md_op_data *op_data = NULL;
4374 struct mdt_body *body;
4375 struct ptlrpc_request *req;
4379 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4380 LUSTRE_OPC_ANY, NULL);
4381 if (IS_ERR(op_data))
4382 RETURN(PTR_ERR(op_data));
/* Only the FID and file type are needed from the server. */
4384 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4385 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4386 ll_finish_md_op_data(op_data);
4390 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4392 GOTO(out_req, rc = -EFAULT);
4394 *fid = body->mbo_fid1;
4397 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4399 ptlrpc_req_finished(req);
/*
 * Migrate child @name of directory @parent to another MDT, per the
 * striping request in @lum. For regular files a write lease is taken
 * and the data version recorded so the migration closes atomically;
 * the actual migration is carried by an md_rename with CLI_MIGRATE.
 * Retries on -EAGAIN when the lease was cancelled mid-operation.
 */
4403 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4406 struct dentry *dchild = NULL;
4407 struct inode *child_inode = NULL;
4408 struct md_op_data *op_data;
4409 struct ptlrpc_request *request = NULL;
4410 struct obd_client_handle *och = NULL;
4412 struct mdt_body *body;
4413 __u64 data_version = 0;
4414 size_t namelen = strlen(name);
4415 int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4419 CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4420 PFID(ll_inode2fid(parent)), name,
4421 lum->lum_stripe_offset, lum->lum_stripe_count);
/* Normalize the user-supplied LMV to little-endian wire order. */
4423 if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4424 lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4425 lustre_swab_lmv_user_md(lum);
4427 /* Get child FID first */
4428 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4431 dchild = d_lookup(file_dentry(file), &qstr);
4433 if (dchild->d_inode)
4434 child_inode = igrab(dchild->d_inode);
/* Not in dcache: resolve the child inode via MDT lookup. */
4439 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
4448 if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
4449 OBD_CONNECT2_DIR_MIGRATE)) {
4450 if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
4451 ll_dir_striped(child_inode)) {
4452 CERROR("%s: MDT doesn't support stripe directory "
4453 "migration!\n", ll_i2sbi(parent)->ll_fsname);
4454 GOTO(out_iput, rc = -EOPNOTSUPP);
4459 * lfs migrate command needs to be blocked on the client
4460 * by checking the migrate FID against the FID of the
4463 if (child_inode == parent->i_sb->s_root->d_inode)
4464 GOTO(out_iput, rc = -EINVAL);
4466 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4467 child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4468 if (IS_ERR(op_data))
4469 GOTO(out_iput, rc = PTR_ERR(op_data));
4471 inode_lock(child_inode);
4472 op_data->op_fid3 = *ll_inode2fid(child_inode);
4473 if (!fid_is_sane(&op_data->op_fid3)) {
4474 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4475 ll_i2sbi(parent)->ll_fsname, name,
4476 PFID(&op_data->op_fid3));
4477 GOTO(out_unlock, rc = -EINVAL);
4480 op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4481 op_data->op_data = lum;
4482 op_data->op_data_size = lumlen;
/* Regular file: take a write lease and pin the data version so the
 * server can detect concurrent modification during migration. */
4485 if (S_ISREG(child_inode->i_mode)) {
4486 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4490 GOTO(out_unlock, rc);
4493 rc = ll_data_version(child_inode, &data_version,
4496 GOTO(out_close, rc);
4498 op_data->op_open_handle = och->och_open_handle;
4499 op_data->op_data_version = data_version;
4500 op_data->op_lease_handle = och->och_lease_handle;
4501 op_data->op_bias |= MDS_CLOSE_MIGRATE;
/* The open request must not be replayed once migration starts. */
4503 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4504 och->och_mod->mod_open_req->rq_replay = 0;
4505 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
/* Migration is implemented as a same-name rename on the MDT. */
4508 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4509 name, namelen, &request);
4511 LASSERT(request != NULL);
4512 ll_update_times(request, parent);
4515 if (rc == 0 || rc == -EAGAIN) {
4516 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4517 LASSERT(body != NULL);
4519 /* If the server does release layout lock, then we cleanup
4520 * the client och here, otherwise release it in out_close: */
4521 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4522 obd_mod_put(och->och_mod);
4523 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4525 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
4531 if (request != NULL) {
4532 ptlrpc_req_finished(request);
4536 /* Try again if the lease has been cancelled. */
4537 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4542 ll_lease_close(och, child_inode, NULL);
4544 clear_nlink(child_inode);
4546 inode_unlock(child_inode);
4547 ll_finish_md_op_data(op_data);
/*
 * Lock handler installed for -o noflock mounts: refuses all lock
 * requests, warning once per file (LL_FILE_FLOCK_WARNING flag) with a
 * rate-limited console message.
 */
4554 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4556 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
4560 * In order to avoid flood of warning messages, only print one message
4561 * for one file. And the entire message rate on the client is limited
4562 * by CDEBUG_LIMIT too.
4564 if (!(fd->fd_flags & LL_FILE_FLOCK_WARNING)) {
4565 fd->fd_flags |= LL_FILE_FLOCK_WARNING;
4566 CDEBUG_LIMIT(D_TTY | D_CONSOLE,
4567 "flock disabled, mount with '-o [local]flock' to enable\r\n");
4573 * test if some locks matching bits and l_req_mode are acquired
4574 * - bits can be in different locks
4575 * - if found clear the common lock bits in *bits
4576 * - the bits not found, are kept in *bits
4578 * \param bits [IN] searched lock bits [IN]
4579 * \param l_req_mode [IN] searched lock mode
4580 * \retval boolean, true iff all bits are found
4582 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4584 struct lustre_handle lockh;
4585 union ldlm_policy_data policy;
/* LCK_MINMODE means "any mode": match against all four ibits modes. */
4586 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4587 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4596 fid = &ll_i2info(inode)->lli_fid;
4597 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4598 ldlm_lockname[mode]);
/* TEST_LOCK: matching must not take a reference on the found lock. */
4600 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4601 for (i = 0; i < MDS_INODELOCK_NUMBITS && *bits != 0; i++) {
4602 policy.l_inodebits.bits = *bits & (1 << i);
4603 if (policy.l_inodebits.bits == 0)
4606 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4607 &policy, mode, &lockh)) {
4608 struct ldlm_lock *lock;
4610 lock = ldlm_handle2lock(&lockh);
4613 ~(lock->l_policy_data.l_inodebits.bits);
4614 LDLM_LOCK_PUT(lock);
4616 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match (and reference) an existing MD inodebits lock covering
 * @bits on @inode. Returns the matched mode (0 if none); on success
 * *lockh holds the reference, which the caller must release.
 */
4623 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4624 struct lustre_handle *lockh, __u64 flags,
4625 enum ldlm_mode mode)
4627 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4632 fid = &ll_i2info(inode)->lli_fid;
4633 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4635 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4636 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the result of an inode revalidation RPC: -ENOENT on a
 * known-unlinked inode is tolerated (nlink updated, success returned),
 * other errors are logged. Striped directories with a bad stripe are
 * left for a later revalidation pass.
 */
4641 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4643 /* Already unlinked. Just update nlink and return success */
4644 if (rc == -ENOENT) {
4646 /* If it is striped directory, and there is bad stripe
4647 * Let's revalidate the dentry again, instead of returning
4649 if (ll_dir_striped(inode))
4652 /* This path cannot be hit for regular files unless in
4653 * case of obscure races, so no need to validate
4655 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4657 } else if (rc != 0) {
/* EACCES/EIDRM are expected under permission/identity churn: log
 * quietly; anything else is a real error. */
4658 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4659 "%s: revalidate FID "DFID" error: rc = %d\n",
4660 ll_i2sbi(inode)->ll_fsname,
4661 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate @dentry's inode against the MDT via an intent lock RPC
 * (getattr-by-fid, no name). Unlinked inodes are invalidated in the
 * dcache so later lookups miss.
 */
4667 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4669 struct inode *inode = dentry->d_inode;
4670 struct obd_export *exp = ll_i2mdexp(inode);
4671 struct lookup_intent oit = {
4674 struct ptlrpc_request *req = NULL;
4675 struct md_op_data *op_data;
4679 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4680 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4682 /* Call getattr by fid, so do not provide name at all. */
4683 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4684 LUSTRE_OPC_ANY, NULL);
4685 if (IS_ERR(op_data))
4686 RETURN(PTR_ERR(op_data));
4688 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4689 ll_finish_md_op_data(op_data);
4691 rc = ll_inode_revalidate_fini(inode, rc);
4695 rc = ll_revalidate_it_finish(req, &oit, dentry);
4697 ll_intent_release(&oit);
4701 /* Unlinked? Unhash dentry, so it is not picked up later by
4702 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4703 * here to preserve get_cwd functionality on 2.6.
4705 if (!dentry->d_inode->i_nlink) {
4706 spin_lock(&inode->i_lock);
4707 d_lustre_invalidate(dentry, 0);
4708 spin_unlock(&inode->i_lock);
4711 ll_lookup_finish_locks(&oit, dentry);
4713 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge per-stripe attributes (nlink, blocks,
 * size, *times) from all MDTs into the master inode. No-op for
 * non-striped LMV layouts.
 */
4718 static int ll_merge_md_attr(struct inode *inode)
4720 struct ll_inode_info *lli = ll_i2info(inode);
4721 struct cl_attr attr = { 0 };
4724 LASSERT(lli->lli_lsm_md != NULL);
4726 if (!lmv_dir_striped(lli->lli_lsm_md))
/* lli_lsm_sem guards the striping md against concurrent layout change. */
4729 down_read(&lli->lli_lsm_sem);
4730 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4731 &attr, ll_md_blocking_ast);
4732 up_read(&lli->lli_lsm_sem);
4736 set_nlink(inode, attr.cat_nlink);
4737 inode->i_blocks = attr.cat_blocks;
4738 i_size_write(inode, attr.cat_size);
4740 ll_i2info(inode)->lli_atime = attr.cat_atime;
4741 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4742 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/*
 * Core getattr: revalidate the inode from the MDT, refresh size (PCC
 * getattr, glimpse, or striped-dir merge as appropriate), then fill
 * *stat from the inode, honoring 32-bit-API inode/dev encoding.
 */
4747 int ll_getattr_dentry(struct dentry *de, struct kstat *stat)
4749 struct inode *inode = de->d_inode;
4750 struct ll_sb_info *sbi = ll_i2sbi(inode);
4751 struct ll_inode_info *lli = ll_i2info(inode);
4752 ktime_t kstart = ktime_get();
4755 rc = ll_inode_revalidate(de, IT_GETATTR);
4759 if (S_ISREG(inode->i_mode)) {
/* PCC-cached files answer getattr from the cache backend. */
4762 rc = pcc_inode_getattr(inode, &cached);
4763 if (cached && rc < 0)
4766 /* In case of restore, the MDT has the right size and has
4767 * already sent it back without granting the layout lock,
4768 * inode is up-to-date so glimpse is useless.
4769 * Also to glimpse we need the layout, in case of a running
4770 * restore the MDT holds the layout lock so the glimpse will
4771 * block up to the end of restore (getattr will block)
4773 if (!cached && !ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4774 rc = ll_glimpse_size(inode);
4779 /* If object isn't regular a file then don't validate size. */
4780 if (ll_dir_striped(inode)) {
4781 rc = ll_merge_md_attr(inode);
4786 inode->i_atime.tv_sec = lli->lli_atime;
4787 inode->i_mtime.tv_sec = lli->lli_mtime;
4788 inode->i_ctime.tv_sec = lli->lli_ctime;
4791 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
/* 32-bit userspace needs squashed inode numbers and device encoding. */
4793 if (ll_need_32bit_api(sbi)) {
4794 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4795 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4796 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4798 stat->ino = inode->i_ino;
4799 stat->dev = inode->i_sb->s_dev;
4800 stat->rdev = inode->i_rdev;
4803 stat->mode = inode->i_mode;
4804 stat->uid = inode->i_uid;
4805 stat->gid = inode->i_gid;
4806 stat->atime = inode->i_atime;
4807 stat->mtime = inode->i_mtime;
4808 stat->ctime = inode->i_ctime;
/* Prefer the tunable stat blocksize when one is configured. */
4809 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4811 stat->nlink = inode->i_nlink;
4812 stat->size = i_size_read(inode);
4813 stat->blocks = inode->i_blocks;
4815 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR,
4816 ktime_us_delta(ktime_get(), kstart));
/*
 * VFS getattr shim: both the old (vfsmount+dentry) and the enhanced
 * (path+request_mask) kernel signatures delegate to ll_getattr_dentry().
 */
4821 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4822 int ll_getattr(const struct path *path, struct kstat *stat,
4823 u32 request_mask, unsigned int flags)
4825 struct dentry *de = path->dentry;
4827 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4830 return ll_getattr_dentry(de, stat);
/*
 * FIEMAP handler: marshal the kernel fiemap_extent_info into a struct
 * fiemap buffer (copying any user-seeded first extent in), run the
 * Lustre fiemap, and copy the mapped extents back to userspace.
 */
4833 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4834 __u64 start, __u64 len)
4838 struct fiemap *fiemap;
4839 unsigned int extent_count = fieinfo->fi_extents_max;
4841 num_bytes = sizeof(*fiemap) + (extent_count *
4842 sizeof(struct fiemap_extent));
4843 OBD_ALLOC_LARGE(fiemap, num_bytes);
4848 fiemap->fm_flags = fieinfo->fi_flags;
4849 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4850 fiemap->fm_start = start;
4851 fiemap->fm_length = len;
/* The first extent may carry continuation state from userspace. */
4852 if (extent_count > 0 &&
4853 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4854 sizeof(struct fiemap_extent)) != 0)
4855 GOTO(out, rc = -EFAULT);
4857 rc = ll_do_fiemap(inode, fiemap, num_bytes);
4859 fieinfo->fi_flags = fiemap->fm_flags;
4860 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4861 if (extent_count > 0 &&
4862 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4863 fiemap->fm_mapped_extents *
4864 sizeof(struct fiemap_extent)) != 0)
4865 GOTO(out, rc = -EFAULT);
4867 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * Return a referenced copy of the cached POSIX ACL for @inode.
 * The VFS permission-check path releases the reference.
 */
4871 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4873 struct ll_inode_info *lli = ll_i2info(inode);
4874 struct posix_acl *acl = NULL;
/* lli_lock protects lli_posix_acl against concurrent replacement. */
4877 spin_lock(&lli->lli_lock);
4878 /* VFS' acl_permission_check->check_acl will release the refcount */
4879 acl = posix_acl_dup(lli->lli_posix_acl);
4880 spin_unlock(&lli->lli_lock);
4885 #ifdef HAVE_IOP_SET_ACL
4886 #ifdef CONFIG_LUSTRE_FS_POSIX_ACL
/*
 * Set (or remove, when @acl is NULL) a POSIX ACL: serialize it to
 * xattr form, push it to the MDT via md_setxattr, and update/forget
 * the local ACL cache. ACL_TYPE_ACCESS may also adjust i_mode.
 */
4887 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4889 struct ll_sb_info *sbi = ll_i2sbi(inode);
4890 struct ptlrpc_request *req = NULL;
4891 const char *name = NULL;
4893 size_t value_size = 0;
4898 case ACL_TYPE_ACCESS:
4899 name = XATTR_NAME_POSIX_ACL_ACCESS;
/* May rewrite i_mode from the ACL and drop the now-redundant entries. */
4901 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4904 case ACL_TYPE_DEFAULT:
4905 name = XATTR_NAME_POSIX_ACL_DEFAULT;
/* Default ACLs only make sense on directories. */
4906 if (!S_ISDIR(inode->i_mode))
4907 rc = acl ? -EACCES : 0;
4918 value_size = posix_acl_xattr_size(acl->a_count);
4919 value = kmalloc(value_size, GFP_NOFS);
4921 GOTO(out, rc = -ENOMEM);
4923 rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
4925 GOTO(out_value, rc);
/* NULL value means remove the xattr (OBD_MD_FLXATTRRM). */
4928 rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4929 value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
4930 name, value, value_size, 0, 0, &req);
4932 ptlrpc_req_finished(req);
4937 forget_cached_acl(inode, type);
4939 set_cached_acl(inode, type, acl);
4942 #endif /* CONFIG_LUSTRE_FS_POSIX_ACL */
4943 #endif /* HAVE_IOP_SET_ACL */
/*
 * permission() hook: revalidate the root inode if needed, apply
 * root-squash by overriding the task credentials (fsuid/fsgid and FS
 * capabilities) when configured, then defer to generic_permission().
 */
4945 int ll_inode_permission(struct inode *inode, int mask)
4948 struct ll_sb_info *sbi;
4949 struct root_squash_info *squash;
4950 struct cred *cred = NULL;
4951 const struct cred *old_cred = NULL;
4953 bool squash_id = false;
4954 ktime_t kstart = ktime_get();
/* RCU-walk mode cannot block on RPCs; bail out to ref-walk. */
4957 if (mask & MAY_NOT_BLOCK)
4960 /* as root inode are NOT getting validated in lookup operation,
4961 * need to do it before permission check. */
4963 if (inode == inode->i_sb->s_root->d_inode) {
4964 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4969 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4970 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4972 /* squash fsuid/fsgid if needed */
4973 sbi = ll_i2sbi(inode);
4974 squash = &sbi->ll_squash;
4975 if (unlikely(squash->rsi_uid != 0 &&
4976 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4977 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4981 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4982 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4983 squash->rsi_uid, squash->rsi_gid);
4985 /* update current process's credentials
4986 * and FS capability */
4987 cred = prepare_creds();
4991 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4992 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* Drop every filesystem-related capability from the squashed creds. */
4993 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4994 if ((1 << cap) & CFS_CAP_FS_MASK)
4995 cap_lower(cred->cap_effective, cap);
4997 old_cred = override_creds(cred);
5000 rc = generic_permission(inode, mask);
5001 /* restore current process's credentials and FS capability */
5003 revert_creds(old_cred);
5008 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM,
5009 ktime_us_delta(ktime_get(), kstart));
5014 /* -o localflock - only provides locally consistent flock locks */
/* Default file_operations (no cluster-wide flock support). */
5015 struct file_operations ll_file_operations = {
5016 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
5017 # ifdef HAVE_SYNC_READ_WRITE
5018 .read = new_sync_read,
5019 .write = new_sync_write,
5021 .read_iter = ll_file_read_iter,
5022 .write_iter = ll_file_write_iter,
5023 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5024 .read = ll_file_read,
5025 .aio_read = ll_file_aio_read,
5026 .write = ll_file_write,
5027 .aio_write = ll_file_aio_write,
5028 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5029 .unlocked_ioctl = ll_file_ioctl,
5030 .open = ll_file_open,
5031 .release = ll_file_release,
5032 .mmap = ll_file_mmap,
5033 .llseek = ll_file_seek,
5034 .splice_read = ll_file_splice_read,
/* file_operations for -o flock mounts: cluster-coherent flock/POSIX
 * locks routed through ll_file_flock(). */
5039 struct file_operations ll_file_operations_flock = {
5040 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
5041 # ifdef HAVE_SYNC_READ_WRITE
5042 .read = new_sync_read,
5043 .write = new_sync_write,
5044 # endif /* HAVE_SYNC_READ_WRITE */
5045 .read_iter = ll_file_read_iter,
5046 .write_iter = ll_file_write_iter,
5047 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5048 .read = ll_file_read,
5049 .aio_read = ll_file_aio_read,
5050 .write = ll_file_write,
5051 .aio_write = ll_file_aio_write,
5052 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5053 .unlocked_ioctl = ll_file_ioctl,
5054 .open = ll_file_open,
5055 .release = ll_file_release,
5056 .mmap = ll_file_mmap,
5057 .llseek = ll_file_seek,
5058 .splice_read = ll_file_splice_read,
5061 .flock = ll_file_flock,
5062 .lock = ll_file_flock
5065 /* These are for -o noflock - to return ENOSYS on flock calls */
5066 struct file_operations ll_file_operations_noflock = {
5067 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
5068 # ifdef HAVE_SYNC_READ_WRITE
5069 .read = new_sync_read,
5070 .write = new_sync_write,
5071 # endif /* HAVE_SYNC_READ_WRITE */
5072 .read_iter = ll_file_read_iter,
5073 .write_iter = ll_file_write_iter,
5074 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5075 .read = ll_file_read,
5076 .aio_read = ll_file_aio_read,
5077 .write = ll_file_write,
5078 .aio_write = ll_file_aio_write,
5079 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5080 .unlocked_ioctl = ll_file_ioctl,
5081 .open = ll_file_open,
5082 .release = ll_file_release,
5083 .mmap = ll_file_mmap,
5084 .llseek = ll_file_seek,
5085 .splice_read = ll_file_splice_read,
/* Lock hooks refuse and warn once per file; see ll_file_noflock(). */
5088 .flock = ll_file_noflock,
5089 .lock = ll_file_noflock
/* inode_operations for regular Lustre files. */
5092 struct inode_operations ll_file_inode_operations = {
5093 .setattr = ll_setattr,
5094 .getattr = ll_getattr,
5095 .permission = ll_inode_permission,
5096 #ifdef HAVE_IOP_XATTR
5097 .setxattr = ll_setxattr,
5098 .getxattr = ll_getxattr,
5099 .removexattr = ll_removexattr,
5101 .listxattr = ll_listxattr,
5102 .fiemap = ll_fiemap,
5103 #ifdef HAVE_IOP_GET_ACL
5104 .get_acl = ll_get_acl,
5106 #ifdef HAVE_IOP_SET_ACL
5107 .set_acl = ll_set_acl,
/*
 * Apply a layout configuration @conf to the cl_object behind @inode.
 * For OBJECT_CONF_SET the DLM layout lock is allowed to match only
 * after the layout is applied, and the cached layout generation is
 * refreshed from the object.
 */
5111 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
5113 struct ll_inode_info *lli = ll_i2info(inode);
5114 struct cl_object *obj = lli->lli_clob;
5123 env = cl_env_get(&refcheck);
5125 RETURN(PTR_ERR(env));
5127 rc = cl_conf_set(env, lli->lli_clob, conf);
5131 if (conf->coc_opc == OBJECT_CONF_SET) {
5132 struct ldlm_lock *lock = conf->coc_lock;
5133 struct cl_layout cl = {
5137 LASSERT(lock != NULL);
5138 LASSERT(ldlm_has_layout(lock));
5140 /* it can only be allowed to match after layout is
5141 * applied to inode otherwise false layout would be
5142 * seen. Applying layout should happen before dropping
5143 * the intent lock. */
5144 ldlm_lock_allow_match(lock);
5146 rc = cl_object_layout_get(env, obj, &cl);
5151 DFID": layout version change: %u -> %u\n",
5152 PFID(&lli->lli_fid), ll_layout_version_get(lli),
5154 ll_layout_version_set(lli, cl.cl_layout_gen);
5158 cl_env_put(env, &refcheck);
5163 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
5164 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
5167 struct ll_sb_info *sbi = ll_i2sbi(inode);
5168 struct ptlrpc_request *req;
5175 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
5176 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
5177 lock->l_lvb_data, lock->l_lvb_len);
/* Layout already attached to the lock: nothing to fetch. */
5179 if (lock->l_lvb_data != NULL)
5182 /* if layout lock was granted right away, the layout is returned
5183 * within DLM_LVB of dlm reply; otherwise if the lock was ever
5184 * blocked and then granted via completion ast, we have to fetch
5185 * layout here. Please note that we can't use the LVB buffer in
5186 * completion AST because it doesn't have a large enough buffer */
5187 rc = ll_get_default_mdsize(sbi, &lmmsize);
5191 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), OBD_MD_FLXATTR,
5192 XATTR_NAME_LOV, lmmsize, &req);
5195 GOTO(out, rc = 0); /* empty layout */
5202 if (lmmsize == 0) /* empty layout */
5205 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
5207 GOTO(out, rc = -EFAULT);
/* Copy the layout into a lock-owned buffer; the reply buffer is
 * freed with the request. */
5209 OBD_ALLOC_LARGE(lvbdata, lmmsize);
5210 if (lvbdata == NULL)
5211 GOTO(out, rc = -ENOMEM);
5213 memcpy(lvbdata, lmm, lmmsize);
5214 lock_res_and_lock(lock);
/* Only install if no other thread raced us to attach an LVB. */
5215 if (unlikely(lock->l_lvb_data == NULL)) {
5216 lock->l_lvb_type = LVB_T_LAYOUT;
5217 lock->l_lvb_data = lvbdata;
5218 lock->l_lvb_len = lmmsize;
5221 unlock_res_and_lock(lock);
5224 OBD_FREE_LARGE(lvbdata, lmmsize);
5229 ptlrpc_req_finished(req);
5234 * Apply the layout to the inode. Layout lock is held and will be released
5237 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
5238 struct inode *inode)
5240 struct ll_inode_info *lli = ll_i2info(inode);
5241 struct ll_sb_info *sbi = ll_i2sbi(inode);
5242 struct ldlm_lock *lock;
5243 struct cl_object_conf conf;
5246 bool wait_layout = false;
5249 LASSERT(lustre_handle_is_used(lockh));
5251 lock = ldlm_handle2lock(lockh);
5252 LASSERT(lock != NULL);
5253 LASSERT(ldlm_has_layout(lock));
5255 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
5256 PFID(&lli->lli_fid), inode);
5258 /* in case this is a caching lock and reinstate with new inode */
5259 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
5261 lock_res_and_lock(lock);
5262 lvb_ready = ldlm_is_lvb_ready(lock);
5263 unlock_res_and_lock(lock);
5265 /* checking lvb_ready is racy but this is okay. The worst case is
5266 * that multi processes may configure the file on the same time. */
/* Make sure the layout blob is attached to the lock's LVB. */
5270 rc = ll_layout_fetch(inode, lock);
5274 /* for layout lock, lmm is stored in lock's lvb.
5275 * lvb_data is immutable if the lock is held so it's safe to access it
5278 * set layout to file. Unlikely this will fail as old layout was
5279 * surely eliminated */
5280 memset(&conf, 0, sizeof conf);
5281 conf.coc_opc = OBJECT_CONF_SET;
5282 conf.coc_inode = inode;
5283 conf.coc_lock = lock;
5284 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
5285 conf.u.coc_layout.lb_len = lock->l_lvb_len;
5286 rc = ll_layout_conf(inode, &conf);
5288 /* refresh layout failed, need to wait */
5289 wait_layout = rc == -EBUSY;
5292 LDLM_LOCK_PUT(lock);
5293 ldlm_lock_decref(lockh, mode);
5295 /* wait for IO to complete if it's still being used. */
5297 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
5298 sbi->ll_fsname, PFID(&lli->lli_fid), inode);
5300 memset(&conf, 0, sizeof conf);
5301 conf.coc_opc = OBJECT_CONF_WAIT;
5302 conf.coc_inode = inode;
5303 rc = ll_layout_conf(inode, &conf);
5307 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
5308 sbi->ll_fsname, PFID(&lli->lli_fid), rc);
5314 * Issue layout intent RPC to MDS.
5315 * \param inode [in] file inode
5316 * \param intent [in] layout intent
5318 * \retval 0 on success
5319 * \retval < 0 error code
5321 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
5323 struct ll_inode_info *lli = ll_i2info(inode);
5324 struct ll_sb_info *sbi = ll_i2sbi(inode);
5325 struct md_op_data *op_data;
5326 struct lookup_intent it;
5327 struct ptlrpc_request *req;
5331 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
5332 0, 0, LUSTRE_OPC_ANY, NULL);
5333 if (IS_ERR(op_data))
5334 RETURN(PTR_ERR(op_data));
5336 op_data->op_data = intent;
5337 op_data->op_data_size = sizeof(*intent);
5339 memset(&it, 0, sizeof(it));
5340 it.it_op = IT_LAYOUT;
5341 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
5342 intent->li_opc == LAYOUT_INTENT_TRUNC)
5343 it.it_flags = FMODE_WRITE;
5345 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
5346 sbi->ll_fsname, PFID(&lli->lli_fid), inode);
5348 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
5349 &ll_md_blocking_ast, 0);
5350 if (it.it_request != NULL)
5351 ptlrpc_req_finished(it.it_request);
5352 it.it_request = NULL;
5354 ll_finish_md_op_data(op_data);
5356 /* set lock data in case this is a new lock */
5358 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
5360 ll_intent_drop_lock(&it);
5366 * This function checks if there exists a LAYOUT lock on the client side,
5367 * or enqueues it if it doesn't have one in cache.
5369 * This function will not hold layout lock so it may be revoked any time after
5370 * this function returns. Any operations depend on layout should be redone
5373 * This function should be called before lov_io_init() to get an uptodate
5374 * layout version, the caller should save the version number and after IO
5375 * is finished, this function should be called again to verify that layout
5376 * is not changed during IO time.
5378 int ll_layout_refresh(struct inode *inode, __u32 *gen)
5380 struct ll_inode_info *lli = ll_i2info(inode);
5381 struct ll_sb_info *sbi = ll_i2sbi(inode);
5382 struct lustre_handle lockh;
5383 struct layout_intent intent = {
5384 .li_opc = LAYOUT_INTENT_ACCESS,
5386 enum ldlm_mode mode;
5390 *gen = ll_layout_version_get(lli);
5391 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
5395 LASSERT(fid_is_sane(ll_inode2fid(inode)));
5396 LASSERT(S_ISREG(inode->i_mode));
5398 /* take layout lock mutex to enqueue layout lock exclusively. */
5399 mutex_lock(&lli->lli_layout_mutex);
5402 /* mostly layout lock is caching on the local side, so try to
5403 * match it before grabbing layout lock mutex. */
5404 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5405 LCK_CR | LCK_CW | LCK_PR |
5407 if (mode != 0) { /* hit cached lock */
5408 rc = ll_layout_lock_set(&lockh, mode, inode);
5414 rc = ll_layout_intent(inode, &intent);
5420 *gen = ll_layout_version_get(lli);
5421 mutex_unlock(&lli->lli_layout_mutex);
5427 * Issue layout intent RPC indicating where in a file an IO is about to write.
5429 * \param[in] inode file inode.
5430 * \param[in] ext write range with start offset of fille in bytes where
5431 * an IO is about to write, and exclusive end offset in
5434 * \retval 0 on success
5435 * \retval < 0 error code
5437 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5438 struct lu_extent *ext)
5440 struct layout_intent intent = {
5442 .li_extent.e_start = ext->e_start,
5443 .li_extent.e_end = ext->e_end,
5448 rc = ll_layout_intent(inode, &intent);
5454 * This function send a restore request to the MDT
5456 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5458 struct hsm_user_request *hur;
5462 len = sizeof(struct hsm_user_request) +
5463 sizeof(struct hsm_user_item);
5464 OBD_ALLOC(hur, len);
5468 hur->hur_request.hr_action = HUA_RESTORE;
5469 hur->hur_request.hr_archive_id = 0;
5470 hur->hur_request.hr_flags = 0;
5471 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5472 sizeof(hur->hur_user_item[0].hui_fid));
5473 hur->hur_user_item[0].hui_extent.offset = offset;
5474 hur->hur_user_item[0].hui_extent.length = length;
5475 hur->hur_request.hr_itemcount = 1;
5476 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,