4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
49 #include <uapi/linux/lustre/lustre_ioctl.h>
50 #include <lustre_swab.h>
52 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
57 struct inode *sp_inode;
62 __u64 pa_data_version;
68 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
70 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
/* Allocate a per-open ll_file_data from its slab cache and initialize the
 * fields visible here (write-failure flag, PCC sub-state).
 * NOTE(review): extraction dropped lines from this function (braces, the
 * NULL check after allocation, and the return) — confirm against the full
 * file before relying on this excerpt. */
73 static struct ll_file_data *ll_file_data_get(void)
75 struct ll_file_data *fd;
/* GFP_NOFS: allocation must not recurse back into the filesystem */
77 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
81 fd->fd_write_failed = false;
82 pcc_file_init(&fd->fd_pcc_file);
/* Return a ll_file_data to the slab cache; counterpart of
 * ll_file_data_get(). NOTE(review): a NULL guard, if any, is not visible
 * in this excerpt. */
87 static void ll_file_data_put(struct ll_file_data *fd)
90 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
94 * Packs all the attributes into @op_data for the CLOSE rpc.
 *
 * Copies mode/times/size/blocks/flags from the VFS inode into @op_data,
 * records the open handle from @och, and sets MDS_DATA_MODIFIED when a
 * write-mode handle dirtied the data (HSM archive dirty tracking).
 * NOTE(review): lines are missing from this excerpt (function braces,
 * part of the ia_valid mask) — verify against the full source.
96 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
97 struct obd_client_handle *och)
101 ll_prep_md_op_data(op_data, inode, NULL, NULL,
102 0, 0, LUSTRE_OPC_ANY, NULL);
104 op_data->op_attr.ia_mode = inode->i_mode;
105 op_data->op_attr.ia_atime = inode->i_atime;
106 op_data->op_attr.ia_mtime = inode->i_mtime;
107 op_data->op_attr.ia_ctime = inode->i_ctime;
108 op_data->op_attr.ia_size = i_size_read(inode);
109 op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
110 ATTR_MTIME | ATTR_MTIME_SET |
112 op_data->op_xvalid |= OP_XVALID_CTIME_SET;
113 op_data->op_attr_blocks = inode->i_blocks;
114 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
/* propagate project-inherit flag so MDT keeps it on close */
115 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
116 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
117 op_data->op_open_handle = och->och_open_handle;
119 if (och->och_flags & FMODE_WRITE &&
120 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
121 /* For HSM: if inode data has been modified, pack it so that
122 * MDT can set data dirty flag in the archive. */
123 op_data->op_bias |= MDS_DATA_MODIFIED;
129 * Perform a close, possibly with a bias.
130 * The meaning of "data" depends on the value of "bias".
132 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
133 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
 *
 * NOTE(review): this excerpt is missing the switch statement header, the
 * break statements, the out label, and several declarations (rc, the
 * op_data NULL check) — do not treat the visible control flow as complete.
136 static int ll_close_inode_openhandle(struct inode *inode,
137 struct obd_client_handle *och,
138 enum mds_op_bias bias, void *data)
140 struct obd_export *md_exp = ll_i2mdexp(inode);
141 const struct ll_inode_info *lli = ll_i2info(inode);
142 struct md_op_data *op_data;
143 struct ptlrpc_request *req = NULL;
/* sanity: the MDC export must still be connected */
147 if (class_exp2obd(md_exp) == NULL) {
148 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
149 ll_i2sbi(inode)->ll_fsname, PFID(&lli->lli_fid));
153 OBD_ALLOC_PTR(op_data);
154 /* We leak openhandle and request here on error, but not much to be
155 * done in OOM case since app won't retry close on error either. */
157 GOTO(out, rc = -ENOMEM);
159 ll_prepare_close(inode, op_data, och);
/* per-bias packing of the close intent follows (switch on bias) */
161 case MDS_CLOSE_LAYOUT_MERGE:
162 /* merge blocks from the victim inode */
163 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
164 op_data->op_attr.ia_valid |= ATTR_SIZE;
165 op_data->op_xvalid |= OP_XVALID_BLOCKS;
167 case MDS_CLOSE_LAYOUT_SPLIT:
168 case MDS_CLOSE_LAYOUT_SWAP: {
169 struct split_param *sp = data;
171 LASSERT(data != NULL);
172 op_data->op_bias |= bias;
173 op_data->op_data_version = 0;
174 op_data->op_lease_handle = och->och_lease_handle;
/* SPLIT carries a split_param; SWAP's data is the victim inode */
175 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
176 op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
177 op_data->op_mirror_id = sp->sp_mirror_id;
179 op_data->op_fid2 = *ll_inode2fid(data);
184 case MDS_CLOSE_RESYNC_DONE: {
185 struct ll_ioc_lease *ioc = data;
187 LASSERT(data != NULL);
188 op_data->op_attr_blocks +=
189 ioc->lil_count * op_data->op_attr_blocks;
190 op_data->op_attr.ia_valid |= ATTR_SIZE;
191 op_data->op_xvalid |= OP_XVALID_BLOCKS;
192 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
194 op_data->op_lease_handle = och->och_lease_handle;
/* ship the resynced mirror-id array to the MDT */
195 op_data->op_data = &ioc->lil_ids[0];
196 op_data->op_data_size =
197 ioc->lil_count * sizeof(ioc->lil_ids[0]);
201 case MDS_PCC_ATTACH: {
202 struct pcc_param *param = data;
204 LASSERT(data != NULL);
205 op_data->op_bias |= MDS_HSM_RELEASE | MDS_PCC_ATTACH;
206 op_data->op_archive_id = param->pa_archive_id;
207 op_data->op_data_version = param->pa_data_version;
208 op_data->op_lease_handle = och->och_lease_handle;
212 case MDS_HSM_RELEASE:
213 LASSERT(data != NULL);
214 op_data->op_bias |= MDS_HSM_RELEASE;
215 op_data->op_data_version = *(__u64 *)data;
216 op_data->op_lease_handle = och->och_lease_handle;
217 op_data->op_attr.ia_valid |= ATTR_SIZE;
218 op_data->op_xvalid |= OP_XVALID_BLOCKS;
/* default: plain close, no intent payload expected */
222 LASSERT(data == NULL);
/* sizes/blocks not packed above are fetched lazily by the MDT */
226 if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
227 op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
228 if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
229 op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;
231 rc = md_close(md_exp, op_data, och->och_mod, &req);
/* -EINTR is expected when the close is interrupted; don't log it */
232 if (rc != 0 && rc != -EINTR)
233 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
234 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* on success, check whether the server actually executed the intent */
236 if (rc == 0 && op_data->op_bias & bias) {
237 struct mdt_body *body;
239 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
240 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
243 if (bias & MDS_PCC_ATTACH) {
244 struct pcc_param *param = data;
/* return the new layout generation to the PCC caller */
246 param->pa_layout_gen = body->mbo_layout_gen;
250 ll_finish_md_op_data(op_data);
254 md_clear_open_replay_data(md_exp, och);
/* poison the handle so reuse after close is detectable */
255 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
258 ptlrpc_req_finished(req); /* This is close request */
/* Drop one mode-specific MDS open handle (read/write/exec) for @inode and,
 * when no users remain, close it on the MDT via
 * ll_close_inode_openhandle().
 * NOTE(review): lines missing from this excerpt include the och_usecount
 * declaration, the handle-detach under the mutex, and the return — the
 * visible locking sequence is incomplete. */
262 int ll_md_real_close(struct inode *inode, fmode_t fmode)
264 struct ll_inode_info *lli = ll_i2info(inode);
265 struct obd_client_handle **och_p;
266 struct obd_client_handle *och;
/* select the handle slot and use-count matching the open mode */
271 if (fmode & FMODE_WRITE) {
272 och_p = &lli->lli_mds_write_och;
273 och_usecount = &lli->lli_open_fd_write_count;
274 } else if (fmode & FMODE_EXEC) {
275 och_p = &lli->lli_mds_exec_och;
276 och_usecount = &lli->lli_open_fd_exec_count;
278 LASSERT(fmode & FMODE_READ);
279 och_p = &lli->lli_mds_read_och;
280 och_usecount = &lli->lli_open_fd_read_count;
283 mutex_lock(&lli->lli_och_mutex);
284 if (*och_usecount > 0) {
285 /* There are still users of this handle, so skip
287 mutex_unlock(&lli->lli_och_mutex);
293 mutex_unlock(&lli->lli_och_mutex);
296 /* There might be a race and this handle may already
298 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Per-struct-file close path: release group lock, lease and fd-level open
 * handle, drop the fd's use-count on the mode-specific MDS handle, and
 * close on the MDT unless a cached OPEN lock lets us skip the RPC.
 * NOTE(review): excerpt is missing declarations (rc, lease_broken), brace
 * lines, and the lockmode assignments between L199 and L202 — confirm
 * against the full file. */
304 static int ll_md_close(struct inode *inode, struct file *file)
306 union ldlm_policy_data policy = {
307 .l_inodebits = { MDS_INODELOCK_OPEN },
309 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
310 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
311 struct ll_inode_info *lli = ll_i2info(inode);
312 struct lustre_handle lockh;
313 enum ldlm_mode lockmode;
317 /* clear group lock, if present */
318 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
319 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
321 if (fd->fd_lease_och != NULL) {
324 /* Usually the lease is not released when the
325 * application crashed, we need to release here. */
326 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
327 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
328 PFID(&lli->lli_fid), rc, lease_broken);
330 fd->fd_lease_och = NULL;
/* fd_och holds ownership taken for a lease; close it directly */
333 if (fd->fd_och != NULL) {
334 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
339 /* Let's see if we have good enough OPEN lock on the file and if
340 we can skip talking to MDS */
341 mutex_lock(&lli->lli_och_mutex);
342 if (fd->fd_omode & FMODE_WRITE) {
344 LASSERT(lli->lli_open_fd_write_count);
345 lli->lli_open_fd_write_count--;
346 } else if (fd->fd_omode & FMODE_EXEC) {
348 LASSERT(lli->lli_open_fd_exec_count);
349 lli->lli_open_fd_exec_count--;
352 LASSERT(lli->lli_open_fd_read_count);
353 lli->lli_open_fd_read_count--;
355 mutex_unlock(&lli->lli_och_mutex);
357 /* LU-4398: do not cache write open lock if the file has exec bit */
358 if ((lockmode == LCK_CW && inode->i_mode & S_IXUGO) ||
359 !md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
360 LDLM_IBITS, &policy, lockmode, &lockh))
361 rc = ll_md_real_close(inode, fd->fd_omode);
/* fd is done; detach from the struct file and free it */
364 LUSTRE_FPRIVATE(file) = NULL;
365 ll_file_data_put(fd);
370 /* While this returns an error code, fput() the caller does not, so we need
371 * to make every effort to clean up all of our state here. Also, applications
372 * rarely check close errors and even if an error is returned they will not
373 * re-try the close call.
 *
 * VFS ->release() handler: tears down statahead authorization, PCC state,
 * async I/O error state, then performs the MD close and tallies stats.
 * NOTE(review): excerpt is missing the rc declaration, early-return lines
 * after the root-dentry fast path, and the final RETURN.
375 int ll_file_release(struct inode *inode, struct file *file)
377 struct ll_file_data *fd;
378 struct ll_sb_info *sbi = ll_i2sbi(inode);
379 struct ll_inode_info *lli = ll_i2info(inode);
380 ktime_t kstart = ktime_get();
385 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
386 PFID(ll_inode2fid(inode)), inode);
388 fd = LUSTRE_FPRIVATE(file);
391 /* The last ref on @file, maybe not the the owner pid of statahead,
392 * because parent and child process can share the same file handle. */
393 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
394 ll_deauthorize_statahead(inode, fd);
/* root dentry: no MDS open handle to close, just drop fd */
396 if (inode->i_sb->s_root == file_dentry(file)) {
397 LUSTRE_FPRIVATE(file) = NULL;
398 ll_file_data_put(fd);
402 pcc_file_release(inode, file);
/* surface any async write errors recorded against the object */
404 if (!S_ISDIR(inode->i_mode)) {
405 if (lli->lli_clob != NULL)
406 lov_read_and_clear_async_rc(lli->lli_clob);
407 lli->lli_async_rc = 0;
410 rc = ll_md_close(inode, file);
412 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
413 libcfs_debug_dumplog();
416 if (!rc && inode->i_sb->s_root != file_dentry(file))
417 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE,
418 ktime_us_delta(ktime_get(), kstart));
/* read_cache_page() filler: copy inline DoM (Data-on-MDT) reply data from
 * the niobuf into @page, zero-fill the tail, and mark the page uptodate.
 * NOTE(review): the kaddr declaration, page unlock and return are not
 * visible in this excerpt. */
422 static inline int ll_dom_readpage(void *data, struct page *page)
424 struct niobuf_local *lnb = data;
427 kaddr = ll_kmap_atomic(page, KM_USER0);
428 memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
/* partial page: zero the remainder so no stale data is exposed */
429 if (lnb->lnb_len < PAGE_SIZE)
430 memset(kaddr + lnb->lnb_len, 0,
431 PAGE_SIZE - lnb->lnb_len);
432 flush_dcache_page(page);
433 SetPageUptodate(page);
434 ll_kunmap_atomic(kaddr, KM_USER0);
/* Populate the page cache with file data returned inline in an open reply
 * (Data-on-MDT optimization, LU-10181/LU-11595): validate the inline
 * niobuf, then feed each page to ll_dom_readpage() via read_cache_page().
 * NOTE(review): excerpt is missing several declarations (vmpage, data,
 * index init), the do-loop header, early RETURNs, and page put/cleanup —
 * the loop shown at L279-L290 is only partially visible. */
440 void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
441 struct lookup_intent *it)
443 struct ll_inode_info *lli = ll_i2info(inode);
444 struct cl_object *obj = lli->lli_clob;
445 struct address_space *mapping = inode->i_mapping;
447 struct niobuf_remote *rnb;
448 struct mdt_body *body;
450 unsigned long index, start;
451 struct niobuf_local lnb;
458 if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
462 rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
463 if (rnb == NULL || rnb->rnb_len == 0)
466 /* LU-11595: Server may return whole file and that is OK always or
467 * it may return just file tail and its offset must be aligned with
468 * client PAGE_SIZE to be used on that client, if server's PAGE_SIZE is
469 * smaller then offset may be not aligned and that data is just ignored.
471 if (rnb->rnb_offset % PAGE_SIZE)
474 /* Server returns whole file or just file tail if it fills in reply
475 * buffer, in both cases total size should be equal to the file size.
477 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
478 if (rnb->rnb_offset + rnb->rnb_len != body->mbo_dom_size) {
479 CERROR("%s: server returns off/len %llu/%u but size %llu\n",
480 ll_i2sbi(inode)->ll_fsname, rnb->rnb_offset,
481 rnb->rnb_len, body->mbo_dom_size);
485 CDEBUG(D_INFO, "Get data along with open at %llu len %i, size %llu\n",
486 rnb->rnb_offset, rnb->rnb_len, body->mbo_dom_size);
/* inline payload follows the niobuf_remote descriptor in the reply */
488 data = (char *)rnb + sizeof(*rnb);
490 lnb.lnb_file_offset = rnb->rnb_offset;
491 start = lnb.lnb_file_offset / PAGE_SIZE;
493 LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
494 lnb.lnb_page_offset = 0;
/* per-page copy loop: clamp the last chunk to the payload length */
496 lnb.lnb_data = data + (index << PAGE_SHIFT);
497 lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
498 if (lnb.lnb_len > PAGE_SIZE)
499 lnb.lnb_len = PAGE_SIZE;
501 vmpage = read_cache_page(mapping, index + start,
502 ll_dom_readpage, &lnb);
503 if (IS_ERR(vmpage)) {
504 CWARN("%s: cannot fill page %lu for "DFID
505 " with data: rc = %li\n",
506 ll_i2sbi(inode)->ll_fsname, index + start,
507 PFID(lu_object_fid(&obj->co_lu)),
513 } while (rnb->rnb_len > (index << PAGE_SHIFT));
/* Send an intent-OPEN (by FID, or by name on old servers / fault
 * injection) to the MDT, then fixup the local inode, dentry validity and
 * any inline DoM data from the reply.
 * NOTE(review): this excerpt is missing many lines — name/len/rc
 * declarations, kmalloc failure handling, the retry path when the dentry
 * name changed under d_lock, several GOTO targets and labels, and the
 * final RETURN — treat the visible flow as a sketch only. */
517 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
518 struct lookup_intent *itp)
520 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
521 struct dentry *parent = de->d_parent;
524 struct md_op_data *op_data;
525 struct ptlrpc_request *req = NULL;
529 LASSERT(parent != NULL);
530 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
532 /* if server supports open-by-fid, or file name is invalid, don't pack
533 * name in open request */
534 if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_OPEN_BY_NAME) ||
535 !(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID)) {
537 len = de->d_name.len;
538 name = kmalloc(len + 1, GFP_NOFS);
/* snapshot the name under d_lock; retry if it raced with rename */
543 spin_lock(&de->d_lock);
544 if (len != de->d_name.len) {
545 spin_unlock(&de->d_lock);
549 memcpy(name, de->d_name.name, len);
551 spin_unlock(&de->d_lock);
553 if (!lu_name_is_valid_2(name, len)) {
559 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
560 name, len, 0, LUSTRE_OPC_ANY, NULL);
561 if (IS_ERR(op_data)) {
563 RETURN(PTR_ERR(op_data));
565 op_data->op_data = lmm;
566 op_data->op_data_size = lmmsize;
568 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
569 &ll_md_blocking_ast, 0);
571 ll_finish_md_op_data(op_data);
573 /* reason for keep own exit path - don`t flood log
574 * with messages with -ESTALE errors.
576 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
577 it_open_error(DISP_OPEN_OPEN, itp))
579 ll_release_openhandle(de, itp);
583 if (it_disposition(itp, DISP_LOOKUP_NEG))
584 GOTO(out, rc = -ENOENT);
586 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
587 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
588 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
592 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
594 if (!rc && itp->it_lock_mode) {
595 struct lustre_handle handle = {.cookie = itp->it_lock_handle};
596 struct ldlm_lock *lock;
597 bool has_dom_bit = false;
599 /* If we got a lock back and it has a LOOKUP bit set,
600 * make sure the dentry is marked as valid so we can find it.
601 * We don't need to care about actual hashing since other bits
602 * of kernel will deal with that later.
604 lock = ldlm_handle2lock(&handle);
606 has_dom_bit = ldlm_has_dom(lock);
607 if (lock->l_policy_data.l_inodebits.bits &
608 MDS_INODELOCK_LOOKUP)
609 d_lustre_revalidate(de);
613 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
/* copy any inline DoM data from the open reply into page cache */
615 ll_dom_finish_open(de->d_inode, req, itp);
619 ptlrpc_req_finished(req);
620 ll_intent_drop_lock(itp);
622 /* We did open by fid, but by the time we got to the server,
623 * the object disappeared. If this is a create, we cannot really
624 * tell the userspace that the file it was trying to create
625 * does not exist. Instead let's return -ESTALE, and the VFS will
626 * retry the create with LOOKUP_REVAL that we are going to catch
627 * in ll_revalidate_dentry() and use lookup then.
629 if (rc == -ENOENT && itp->it_op & IT_CREAT)
/* Fill a client open handle (@och) from the MDT_BODY in the intent reply
 * and register it for open replay on recovery.
 * NOTE(review): braces and any intermediate lines are not visible in this
 * excerpt. */
635 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
636 struct obd_client_handle *och)
638 struct mdt_body *body;
640 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
641 och->och_open_handle = body->mbo_open_handle;
642 och->och_fid = body->mbo_fid1;
/* lease lock handle, if the open was a lease open */
643 och->och_lease_handle.cookie = it->it_lock_handle;
644 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
645 och->och_flags = it->it_flags;
647 return md_set_open_replay_data(md_exp, och, it);
/* Complete the client-side open: optionally fill @och from the intent
 * reply, then attach @fd to the struct file and initialize its readahead,
 * open mode and cl_context state.
 * NOTE(review): the NULL-och branch, error handling after ll_och_fill()
 * and the return are not visible in this excerpt. */
650 static int ll_local_open(struct file *file, struct lookup_intent *it,
651 struct ll_file_data *fd, struct obd_client_handle *och)
653 struct inode *inode = file_inode(file);
656 LASSERT(!LUSTRE_FPRIVATE(file));
663 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
668 LUSTRE_FPRIVATE(file) = fd;
669 ll_readahead_init(inode, &fd->fd_ras);
670 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
672 /* ll_cl_context initialize */
673 rwlock_init(&fd->fd_lock);
674 INIT_LIST_HEAD(&fd->fd_lccs);
679 /* Open a file, and (for the very first open) create objects on the OSTs at
680 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
681 * creation or open until ll_lov_setstripe() ioctl is called.
683 * If we already have the stripe MD locally then we don't request it in
684 * md_open(), by passing a lmm_size = 0.
686 * It is up to the application to ensure no other processes open this file
687 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
688 * used. We might be able to avoid races of that sort by getting lli_open_sem
689 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
690 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 *
 * NOTE(review): this excerpt omits a large number of lines — the
 * restart label, rc declaration, several GOTO targets/labels, the
 * *och_usecount increments, och assignment from *och_p, and most error
 * paths. Do not infer complete control flow from what is visible.
692 int ll_file_open(struct inode *inode, struct file *file)
694 struct ll_inode_info *lli = ll_i2info(inode);
695 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
696 .it_flags = file->f_flags };
697 struct obd_client_handle **och_p = NULL;
698 __u64 *och_usecount = NULL;
699 struct ll_file_data *fd;
700 ktime_t kstart = ktime_get();
704 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
705 PFID(ll_inode2fid(inode)), inode, file->f_flags);
707 it = file->private_data; /* XXX: compat macro */
708 file->private_data = NULL; /* prevent ll_local_open assertion */
710 fd = ll_file_data_get();
712 GOTO(out_nofiledata, rc = -ENOMEM);
715 if (S_ISDIR(inode->i_mode))
716 ll_authorize_statahead(inode, fd);
/* root of the mount: nothing to open on the MDT */
718 if (inode->i_sb->s_root == file_dentry(file)) {
719 LUSTRE_FPRIVATE(file) = fd;
/* no cached intent from lookup: build our own IT_OPEN */
723 if (!it || !it->it_disposition) {
724 /* Convert f_flags into access mode. We cannot use file->f_mode,
725 * because everything but O_ACCMODE mask was stripped from
727 if ((oit.it_flags + 1) & O_ACCMODE)
729 if (file->f_flags & O_TRUNC)
730 oit.it_flags |= FMODE_WRITE;
732 /* kernel only call f_op->open in dentry_open. filp_open calls
733 * dentry_open after call to open_namei that checks permissions.
734 * Only nfsd_open call dentry_open directly without checking
735 * permissions and because of that this code below is safe.
737 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
738 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
740 /* We do not want O_EXCL here, presumably we opened the file
741 * already? XXX - NFS implications? */
742 oit.it_flags &= ~O_EXCL;
744 /* bug20584, if "it_flags" contains O_CREAT, the file will be
745 * created if necessary, then "IT_CREAT" should be set to keep
746 * consistent with it */
747 if (oit.it_flags & O_CREAT)
748 oit.it_op |= IT_CREAT;
754 /* Let's see if we have file open on MDS already. */
755 if (it->it_flags & FMODE_WRITE) {
756 och_p = &lli->lli_mds_write_och;
757 och_usecount = &lli->lli_open_fd_write_count;
758 } else if (it->it_flags & FMODE_EXEC) {
759 och_p = &lli->lli_mds_exec_och;
760 och_usecount = &lli->lli_open_fd_exec_count;
762 och_p = &lli->lli_mds_read_och;
763 och_usecount = &lli->lli_open_fd_read_count;
766 mutex_lock(&lli->lli_och_mutex);
767 if (*och_p) { /* Open handle is present */
768 if (it_disposition(it, DISP_OPEN_OPEN)) {
769 /* Well, there's extra open request that we do not need,
770 let's close it somehow. This will decref request. */
771 rc = it_open_error(DISP_OPEN_OPEN, it);
773 mutex_unlock(&lli->lli_och_mutex);
774 GOTO(out_openerr, rc);
777 ll_release_openhandle(file_dentry(file), it);
/* reuse the cached MDS handle; NULL och -> no new fill */
781 rc = ll_local_open(file, it, fd, NULL);
784 mutex_unlock(&lli->lli_och_mutex);
785 GOTO(out_openerr, rc);
788 LASSERT(*och_usecount == 0);
789 if (!it->it_disposition) {
790 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
791 /* We cannot just request lock handle now, new ELC code
792 means that one of other OPEN locks for this file
793 could be cancelled, and since blocking ast handler
794 would attempt to grab och_mutex as well, that would
795 result in a deadlock */
796 mutex_unlock(&lli->lli_och_mutex);
798 * Normally called under two situations:
800 * 2. A race/condition on MDS resulting in no open
801 * handle to be returned from LOOKUP|OPEN request,
802 * for example if the target entry was a symlink.
804 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
805 * marked by a bit set in ll_iget_for_nfs. Clear the
806 * bit so that it's not confusing later callers.
808 * NB; when ldd is NULL, it must have come via normal
809 * lookup path only, since ll_iget_for_nfs always calls
812 if (ldd && ldd->lld_nfs_dentry) {
813 ldd->lld_nfs_dentry = 0;
814 it->it_flags |= MDS_OPEN_LOCK;
818 * Always specify MDS_OPEN_BY_FID because we don't want
819 * to get file with different fid.
821 it->it_flags |= MDS_OPEN_BY_FID;
822 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
825 GOTO(out_openerr, rc);
829 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
831 GOTO(out_och_free, rc = -ENOMEM);
835 /* md_intent_lock() didn't get a request ref if there was an
836 * open error, so don't do cleanup on the request here
838 /* XXX (green): Should not we bail out on any error here, not
839 * just open error? */
840 rc = it_open_error(DISP_OPEN_OPEN, it);
842 GOTO(out_och_free, rc);
844 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
845 "inode %p: disposition %x, status %d\n", inode,
846 it_disposition(it, ~0), it->it_status);
848 rc = ll_local_open(file, it, fd, *och_p);
850 GOTO(out_och_free, rc);
853 rc = pcc_file_open(inode, file);
855 GOTO(out_och_free, rc);
857 mutex_unlock(&lli->lli_och_mutex);
860 /* Must do this outside lli_och_mutex lock to prevent deadlock where
861 different kind of OPEN lock for this same inode gets cancelled
862 by ldlm_cancel_lru */
863 if (!S_ISREG(inode->i_mode))
864 GOTO(out_och_free, rc);
866 cl_lov_delay_create_clear(&file->f_flags);
867 GOTO(out_och_free, rc);
/* error/cleanup path: free an unused handle slot */
871 if (och_p && *och_p) {
872 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
873 *och_p = NULL; /* OBD_FREE writes some magic there */
876 mutex_unlock(&lli->lli_och_mutex);
879 if (lli->lli_opendir_key == fd)
880 ll_deauthorize_statahead(inode, fd);
883 ll_file_data_put(fd);
885 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN,
886 ktime_us_delta(ktime_get(), kstart));
/* drop the extra request reference taken by the open intent */
890 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
891 ptlrpc_req_finished(it->it_request);
892 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* Blocking AST for lease locks: on conflict, cancel the lease lock
 * asynchronously; the CANCELING phase body is not visible here.
 * NOTE(review): the switch header, rc declaration, default case and
 * return are missing from this excerpt. */
898 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
899 struct ldlm_lock_desc *desc, void *data, int flag)
902 struct lustre_handle lockh;
906 case LDLM_CB_BLOCKING:
907 ldlm_lock2handle(lock, &lockh);
908 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
910 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
914 case LDLM_CB_CANCELING:
922 * When setting a lease on a file, we take ownership of the lli_mds_*_och
923 * and save it as fd->fd_och so as to force client to reopen the file even
924 * if it has an open lock in cache already.
 *
 * On success *old_open_handle is the existing open handle to present to
 * the MDT as proof of same-owner reopen.
 * NOTE(review): excerpt is missing the rc declaration, the transfer of
 * *och_p into fd->fd_och, usecount decrement, and the return.
926 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
927 struct lustre_handle *old_open_handle)
929 struct ll_inode_info *lli = ll_i2info(inode);
930 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
931 struct obd_client_handle **och_p;
936 /* Get the openhandle of the file */
937 mutex_lock(&lli->lli_och_mutex);
/* one lease per fd at a time */
938 if (fd->fd_lease_och != NULL)
939 GOTO(out_unlock, rc = -EBUSY);
941 if (fd->fd_och == NULL) {
942 if (file->f_mode & FMODE_WRITE) {
943 LASSERT(lli->lli_mds_write_och != NULL);
944 och_p = &lli->lli_mds_write_och;
945 och_usecount = &lli->lli_open_fd_write_count;
947 LASSERT(lli->lli_mds_read_och != NULL);
948 och_p = &lli->lli_mds_read_och;
949 och_usecount = &lli->lli_open_fd_read_count;
/* can't take exclusive ownership while other fds share the handle */
952 if (*och_usecount > 1)
953 GOTO(out_unlock, rc = -EBUSY);
960 *old_open_handle = fd->fd_och->och_open_handle;
964 mutex_unlock(&lli->lli_och_mutex);
969 * Release ownership on lli_mds_*_och when putting back a file lease.
 *
 * Either re-installs fd->fd_och as the shared mode handle, or (if another
 * process re-opened meanwhile) closes the now-redundant handle on the MDT.
 * NOTE(review): excerpt is missing the rc declaration, the else branch
 * that installs fd->fd_och into *och_p, fd->fd_och reset, and the return.
971 static int ll_lease_och_release(struct inode *inode, struct file *file)
973 struct ll_inode_info *lli = ll_i2info(inode);
974 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
975 struct obd_client_handle **och_p;
976 struct obd_client_handle *old_och = NULL;
981 mutex_lock(&lli->lli_och_mutex);
982 if (file->f_mode & FMODE_WRITE) {
983 och_p = &lli->lli_mds_write_och;
984 och_usecount = &lli->lli_open_fd_write_count;
986 och_p = &lli->lli_mds_read_och;
987 och_usecount = &lli->lli_open_fd_read_count;
990 /* The file may have been open by another process (broken lease) so
991 * *och_p is not NULL. In this case we should simply increase usecount
994 if (*och_p != NULL) {
995 old_och = fd->fd_och;
1002 mutex_unlock(&lli->lli_och_mutex);
/* close the superseded handle outside the mutex */
1004 if (old_och != NULL)
1005 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
1011 * Acquire a lease and open the file.
 *
 * Sends an IT_OPEN intent with MDS_OPEN_LEASE; on success returns an
 * obd_client_handle whose och_lease_handle holds the lease lock.
 * Returns ERR_PTR(rc) on failure.
 * NOTE(review): excerpt is missing the open_flags parameter line, och
 * allocation, rc/rc2 declarations, several labels (out, out_close,
 * out_release_it) and the success RETURN — flow below is partial.
1013 static struct obd_client_handle *
1014 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
1017 struct lookup_intent it = { .it_op = IT_OPEN };
1018 struct ll_sb_info *sbi = ll_i2sbi(inode);
1019 struct md_op_data *op_data;
1020 struct ptlrpc_request *req = NULL;
1021 struct lustre_handle old_open_handle = { 0 };
1022 struct obd_client_handle *och = NULL;
/* a lease is exactly read or exactly write, never both/exec */
1027 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
1028 RETURN(ERR_PTR(-EINVAL));
1031 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
1032 RETURN(ERR_PTR(-EPERM));
1034 rc = ll_lease_och_acquire(inode, file, &old_open_handle);
1036 RETURN(ERR_PTR(rc));
1041 RETURN(ERR_PTR(-ENOMEM));
1043 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
1044 LUSTRE_OPC_ANY, NULL);
1045 if (IS_ERR(op_data))
1046 GOTO(out, rc = PTR_ERR(op_data));
1048 /* To tell the MDT this openhandle is from the same owner */
1049 op_data->op_open_handle = old_open_handle;
1051 it.it_flags = fmode | open_flags;
1052 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
1053 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
1054 &ll_md_blocking_lease_ast,
1055 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
1056 * it can be cancelled which may mislead applications that the lease is
1058 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
1059 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
1060 * doesn't deal with openhandle, so normal openhandle will be leaked. */
1061 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
1062 ll_finish_md_op_data(op_data);
1063 ptlrpc_req_finished(req);
1065 GOTO(out_release_it, rc);
1067 if (it_disposition(&it, DISP_LOOKUP_NEG))
1068 GOTO(out_release_it, rc = -ENOENT);
1070 rc = it_open_error(DISP_OPEN_OPEN, &it);
1072 GOTO(out_release_it, rc);
1074 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
1075 rc = ll_och_fill(sbi->ll_md_exp, &it, och);
1077 GOTO(out_release_it, rc);
/* server that predates lease support won't set this disposition */
1079 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
1080 GOTO(out_close, rc = -EOPNOTSUPP);
1082 /* already get lease, handle lease lock */
1083 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
1084 if (it.it_lock_mode == 0 ||
1085 it.it_lock_bits != MDS_INODELOCK_OPEN) {
1086 /* open lock must return for lease */
1087 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
1088 PFID(ll_inode2fid(inode)), it.it_lock_mode,
1090 GOTO(out_close, rc = -EPROTO);
1093 ll_intent_release(&it);
1097 /* Cancel open lock */
1098 if (it.it_lock_mode != 0) {
1099 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
1101 it.it_lock_mode = 0;
1102 och->och_lease_handle.cookie = 0ULL;
1104 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
1106 CERROR("%s: error closing file "DFID": %d\n",
1107 sbi->ll_fsname, PFID(&ll_i2info(inode)->lli_fid), rc2);
1108 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
1110 ll_intent_release(&it);
1114 RETURN(ERR_PTR(rc));
1118 * Check whether a layout swap can be done between two inodes.
1120 * \param[in] inode1 First inode to check
1121 * \param[in] inode2 Second inode to check
1123 * \retval 0 on success, layout swap can be performed between both inodes
1124 * \retval negative error code if requirements are not met
 *
 * NOTE(review): the error return values for each failed check and the
 * final RETURN(0) are not visible in this excerpt.
1126 static int ll_check_swap_layouts_validity(struct inode *inode1,
1127 struct inode *inode2)
/* layouts only exist on regular files */
1129 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
/* caller must be allowed to write both files */
1132 if (inode_permission(inode1, MAY_WRITE) ||
1133 inode_permission(inode2, MAY_WRITE))
/* both inodes must live on the same Lustre mount */
1136 if (inode1->i_sb != inode2->i_sb)
/* Close @inode with a MDS_CLOSE_LAYOUT_SWAP bias, swapping its layout
 * with @inode2 atomically on the MDT.
 * NOTE(review): rc declaration, the out_free_och label/cleanup and the
 * return are not visible in this excerpt. */
1142 static int ll_swap_layouts_close(struct obd_client_handle *och,
1143 struct inode *inode, struct inode *inode2)
1145 const struct lu_fid *fid1 = ll_inode2fid(inode);
1146 const struct lu_fid *fid2;
1150 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
1151 ll_i2sbi(inode)->ll_fsname, PFID(fid1));
1153 rc = ll_check_swap_layouts_validity(inode, inode2);
1155 GOTO(out_free_och, rc);
1157 /* We now know that inode2 is a lustre inode */
1158 fid2 = ll_inode2fid(inode2);
/* swapping a layout with itself is meaningless */
1160 rc = lu_fid_cmp(fid1, fid2);
1162 GOTO(out_free_och, rc = -EINVAL);
1164 /* Close the file and {swap,merge} layouts between inode & inode2.
1165 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
1166 * because we still need it to pack l_remote_handle to MDT. */
1167 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
1170 och = NULL; /* freed in ll_close_inode_openhandle() */
1180 * Release lease and close the file.
1181 * It will check if the lease has ever broken.
 *
 * Sets *lease_broken (if non-NULL) to whether the lease lock was already
 * cancelled; a broken lease means any close intent (@bias/@data) is
 * skipped and the handle is simply closed.
 * NOTE(review): the data parameter line, rc declaration, and parts of the
 * broken-lease early path are not visible in this excerpt.
1183 static int ll_lease_close_intent(struct obd_client_handle *och,
1184 struct inode *inode,
1185 bool *lease_broken, enum mds_op_bias bias,
1188 struct ldlm_lock *lock;
1189 bool cancelled = true;
1193 lock = ldlm_handle2lock(&och->och_lease_handle);
1195 lock_res_and_lock(lock);
1196 cancelled = ldlm_is_cancel(lock);
1197 unlock_res_and_lock(lock);
1198 LDLM_LOCK_PUT(lock);
1201 CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1202 PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1204 if (lease_broken != NULL)
1205 *lease_broken = cancelled;
/* still-held lease with no intent: cancel it ourselves */
1207 if (!cancelled && !bias)
1208 ldlm_cli_cancel(&och->och_lease_handle, 0);
1210 if (cancelled) { /* no need to excute intent */
1215 rc = ll_close_inode_openhandle(inode, och, bias, data);
/* Plain lease release: close with no intent bias or payload.
 * NOTE(review): the lease_broken parameter line and function braces are
 * not visible in this excerpt. */
1219 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1222 return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1226 * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
 *
 * Copies the ll_ioc_lease_id argument from userspace, flushes dirty pages
 * (so writes aren't denied after the layout version bumps), then issues
 * md_file_resync() under the lease handle.
 * NOTE(review): rc declaration, the -EFAULT GOTO after copy_from_user,
 * intermediate GOTOs and the final RETURN are not visible in this excerpt.
1228 static int ll_lease_file_resync(struct obd_client_handle *och,
1229 struct inode *inode, unsigned long arg)
1231 struct ll_sb_info *sbi = ll_i2sbi(inode);
1232 struct md_op_data *op_data;
1233 struct ll_ioc_lease_id ioc;
1234 __u64 data_version_unused;
1238 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1239 LUSTRE_OPC_ANY, NULL);
1240 if (IS_ERR(op_data))
1241 RETURN(PTR_ERR(op_data));
1243 if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg,
1247 /* before starting file resync, it's necessary to clean up page cache
1248 * in client memory, otherwise once the layout version is increased,
1249 * writing back cached data will be denied the OSTs. */
1250 rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1254 op_data->op_lease_handle = och->och_lease_handle;
1255 op_data->op_mirror_id = ioc.lil_mirror_id;
1256 rc = md_file_resync(sbi->ll_md_exp, op_data);
1262 ll_finish_md_op_data(op_data);
/*
 * Merge MDS-provided inode attributes with OST-side attributes
 * (size, blocks, timestamps) under the inode size lock.  Timestamps
 * only move forward: the larger of the cached and OST values wins.
 */
1266 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1268 struct ll_inode_info *lli = ll_i2info(inode);
1269 struct cl_object *obj = lli->lli_clob;
1270 struct cl_attr *attr = vvp_env_thread_attr(env);
1278 ll_inode_size_lock(inode);
1280 /* Merge timestamps the most recently obtained from MDS with
1281 * timestamps obtained from OSTs.
1283 * Do not overwrite atime of inode because it may be refreshed
1284 * by file_accessed() function. If the read was served by cache
1285 * data, there is no RPC to be sent so that atime may not be
1286 * transferred to OSTs at all. MDT only updates atime at close time
1287 * if it's at least 'mdd.*.atime_diff' older.
1288 * All in all, the atime in Lustre does not strictly comply with
1289 * POSIX. Solving this problem needs to send an RPC to MDT for each
1290 * read, this will hurt performance.
1292 if (inode->i_atime.tv_sec < lli->lli_atime ||
1293 lli->lli_update_atime) {
1294 inode->i_atime.tv_sec = lli->lli_atime;
1295 lli->lli_update_atime = 0;
1297 inode->i_mtime.tv_sec = lli->lli_mtime;
1298 inode->i_ctime.tv_sec = lli->lli_ctime;
/* Snapshot current times, then raise them to the OST-reported values. */
1300 mtime = inode->i_mtime.tv_sec;
1301 atime = inode->i_atime.tv_sec;
1302 ctime = inode->i_ctime.tv_sec;
1304 cl_object_attr_lock(obj);
1305 if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1308 rc = cl_object_attr_get(env, obj, attr);
1309 cl_object_attr_unlock(obj);
/* -ENODATA (no stripe objects yet) is not an error for the caller. */
1312 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1314 if (atime < attr->cat_atime)
1315 atime = attr->cat_atime;
1317 if (ctime < attr->cat_ctime)
1318 ctime = attr->cat_ctime;
1320 if (mtime < attr->cat_mtime)
1321 mtime = attr->cat_mtime;
1323 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1324 PFID(&lli->lli_fid), attr->cat_size);
1326 i_size_write(inode, attr->cat_size);
1327 inode->i_blocks = attr->cat_blocks;
1329 inode->i_mtime.tv_sec = mtime;
1330 inode->i_atime.tv_sec = atime;
1331 inode->i_ctime.tv_sec = ctime;
1334 ll_inode_size_unlock(inode);
1340 * Set designated mirror for I/O.
1342 * So far only read, write, and truncated can support to issue I/O to
1343 * designated mirror.
1345 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1347 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1349 /* clear layout version for generic(non-resync) I/O in case it carries
1350 * stale layout version due to I/O restart */
1351 io->ci_layout_version = 0;
1353 /* FLR: disable non-delay for designated mirror I/O because obviously
1354 * only one mirror is available */
1355 if (fd->fd_designated_mirror > 0) {
1357 io->ci_designated_mirror = fd->fd_designated_mirror;
1358 io->ci_layout_version = fd->fd_layout_version;
1361 CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n",
1362 file->f_path.dentry->d_name.name, io->ci_designated_mirror);
/*
 * Decide whether this open file should skip atime updates.
 * Adapted from the kernel's file_accessed()/touch_atime() checks:
 * any of O_NOATIME, S_NOATIME, IS_NOATIME(), mount noatime/read-only,
 * or nodiratime on a directory suppresses atime.
 * (NOTE(review): the `return true/false` lines are not visible in this
 * extract; each condition is expected to return true, falling through
 * to a final `return false`.)
 */
1365 static bool file_is_noatime(const struct file *file)
1367 const struct vfsmount *mnt = file->f_path.mnt;
1368 const struct inode *inode = file_inode((struct file *)file);
1370 /* Adapted from file_accessed() and touch_atime().*/
1371 if (file->f_flags & O_NOATIME)
1374 if (inode->i_flags & S_NOATIME)
1377 if (IS_NOATIME(inode))
1380 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1383 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1386 if ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode))
/*
 * Initialize a cl_io for a read or write on @file: propagate open
 * flags (O_NONBLOCK/O_APPEND/O_SYNC/O_DIRECT) into the io, choose the
 * DLM locking mode, and set FLR mirror/non-delay policy.
 */
1392 void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot,
1393 struct vvp_io_args *args)
1395 struct inode *inode = file_inode(file);
1396 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1398 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1399 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1401 if (iot == CIT_WRITE) {
1402 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1403 io->u.ci_wr.wr_sync = !!(file->f_flags & O_SYNC ||
1404 file->f_flags & O_DIRECT ||
/* Newer kernels signal per-iocb sync via IOCB_DSYNC as well. */
1406 #ifdef HAVE_GENERIC_WRITE_SYNC_2ARGS
1407 io->u.ci_wr.wr_sync |= !!(args &&
1408 args->via_io_subtype == IO_NORMAL &&
1409 args->u.normal.via_iocb->ki_flags & IOCB_DSYNC);
1413 io->ci_obj = ll_i2info(inode)->lli_clob;
1414 io->ci_lockreq = CILR_MAYBE;
/* localflock mounts / nolock files never take server DLM locks;
 * O_APPEND needs a mandatory lock to protect the file size. */
1415 if (ll_file_nolock(file)) {
1416 io->ci_lockreq = CILR_NEVER;
1417 io->ci_no_srvlock = 1;
1418 } else if (file->f_flags & O_APPEND) {
1419 io->ci_lockreq = CILR_MANDATORY;
1421 io->ci_noatime = file_is_noatime(file);
1422 io->ci_async_readahead = false;
1424 /* FLR: only use non-delay I/O for read as there is only one
1425 * available mirror for write. */
1426 io->ci_ndelay = !(iot == CIT_WRITE);
1428 ll_io_set_mirror(io, file);
/*
 * Account one I/O into the per-inode file-heat statistics: one sample
 * event plus @count bytes, decayed per the superblock's heat weight and
 * period.  No-op when file heat is disabled globally or per-inode.
 */
1431 static void ll_heat_add(struct inode *inode, enum cl_io_type iot,
1434 struct ll_inode_info *lli = ll_i2info(inode);
1435 struct ll_sb_info *sbi = ll_i2sbi(inode);
1436 enum obd_heat_type sample_type;
1437 enum obd_heat_type iobyte_type;
1438 __u64 now = ktime_get_real_seconds();
1440 if (!ll_sbi_has_file_heat(sbi) ||
1441 lli->lli_heat_flags & LU_HEAT_FLAG_OFF)
/* Only CIT_READ/CIT_WRITE are tracked; other io types fall through. */
1444 if (iot == CIT_READ) {
1445 sample_type = OBD_HEAT_READSAMPLE;
1446 iobyte_type = OBD_HEAT_READBYTE;
1447 } else if (iot == CIT_WRITE) {
1448 sample_type = OBD_HEAT_WRITESAMPLE;
1449 iobyte_type = OBD_HEAT_WRITEBYTE;
1454 spin_lock(&lli->lli_heat_lock);
1455 obd_heat_add(&lli->lli_heat_instances[sample_type], now, 1,
1456 sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1457 obd_heat_add(&lli->lli_heat_instances[iobyte_type], now, count,
1458 sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1459 spin_unlock(&lli->lli_heat_lock);
/*
 * Core read/write engine for normal and splice I/O.  Sets up a cl_io,
 * takes the per-inode range lock when required (writes, and O_DIRECT
 * reads — see LU-6227), runs the cl_io loop, and transparently restarts
 * the I/O on layout change or FLR mirror retry until the request is
 * fully consumed or fails.
 *
 * \param args  per-subtype arguments (iter/iocb for normal, pipe for
 *              splice)
 * \param iot   CIT_READ or CIT_WRITE
 * \param ppos  in/out file position
 * \param count bytes requested
 * \retval      bytes transferred (>0), or negative errno
 */
1463 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1464 struct file *file, enum cl_io_type iot,
1465 loff_t *ppos, size_t count)
1467 struct vvp_io *vio = vvp_env_io(env);
1468 struct inode *inode = file_inode(file);
1469 struct ll_inode_info *lli = ll_i2info(inode);
1470 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1471 struct range_lock range;
1475 unsigned retried = 0;
1476 bool restarted = false;
1480 CDEBUG(D_VFSTRACE, "%s: %s ppos: %llu, count: %zu\n",
1481 file_dentry(file)->d_name.name,
1482 iot == CIT_READ ? "read" : "write", *ppos, count);
/* Restart point: a fresh cl_io is built for each (re)try. */
1485 io = vvp_env_thread_io(env);
1486 ll_io_init(io, file, iot, args);
1487 io->ci_ndelay_tried = retried;
1489 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1490 bool range_locked = false;
/* O_APPEND writes can land anywhere, so lock the whole file. */
1492 if (file->f_flags & O_APPEND)
1493 range_lock_init(&range, 0, LUSTRE_EOF);
1495 range_lock_init(&range, *ppos, *ppos + count - 1);
1497 vio->vui_fd = LUSTRE_FPRIVATE(file);
1498 vio->vui_io_subtype = args->via_io_subtype;
1500 switch (vio->vui_io_subtype) {
1502 vio->vui_iter = args->u.normal.via_iter;
1503 vio->vui_iocb = args->u.normal.via_iocb;
1504 /* Direct IO reads must also take range lock,
1505 * or multiple reads will try to work on the same pages
1506 * See LU-6227 for details. */
1507 if (((iot == CIT_WRITE) ||
1508 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1509 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1510 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1512 rc = range_lock(&lli->lli_write_tree, &range);
1516 range_locked = true;
1520 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1521 vio->u.splice.vui_flags = args->u.splice.via_flags;
1524 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1528 ll_cl_add(file, env, io, LCC_RW);
1529 rc = cl_io_loop(env, io);
1530 ll_cl_remove(file, env);
1533 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1535 range_unlock(&lli->lli_write_tree, &range);
1538 /* cl_io_rw_init() handled IO */
/* Accumulate partial progress before a possible restart. */
1542 if (io->ci_nob > 0) {
1543 result += io->ci_nob;
1544 count -= io->ci_nob;
1545 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1547 /* prepare IO restart */
1548 if (count > 0 && args->via_io_subtype == IO_NORMAL)
1549 args->u.normal.via_iter = vio->vui_iter;
1552 cl_io_fini(env, io);
1555 "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1556 file->f_path.dentry->d_name.name,
1557 iot, rc, result, io->ci_need_restart);
1559 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1561 "%s: restart %s from %lld, count: %zu, ret: %zd, rc: %d\n",
1562 file_dentry(file)->d_name.name,
1563 iot == CIT_READ ? "read" : "write",
1564 *ppos, count, result, rc);
1565 /* preserve the tried count for FLR */
1566 retried = io->ci_ndelay_tried;
/* Stats + fd_write_failed bookkeeping after the loop finishes. */
1571 if (iot == CIT_READ) {
1573 ll_stats_ops_tally(ll_i2sbi(inode),
1574 LPROC_LL_READ_BYTES, result);
1575 } else if (iot == CIT_WRITE) {
1577 ll_stats_ops_tally(ll_i2sbi(inode),
1578 LPROC_LL_WRITE_BYTES, result);
1579 fd->fd_write_failed = false;
1580 } else if (result == 0 && rc == 0) {
1583 fd->fd_write_failed = true;
1585 fd->fd_write_failed = false;
1586 } else if (rc != -ERESTARTSYS) {
1587 fd->fd_write_failed = true;
1591 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1593 ll_heat_add(inode, iot, result);
1595 RETURN(result > 0 ? result : rc);
1599 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1600 * especially for small I/O.
1602 * To serve a read request, CLIO has to create and initialize a cl_io and
1603 * then request DLM lock. This has turned out to have significant overhead
1604 * and affects the performance of small I/O dramatically.
1606 * It's not necessary to create a cl_io for each I/O. Under the help of read
1607 * ahead, most of the pages being read are already in memory cache and we can
1608 * read those pages directly because if the pages exist, the corresponding DLM
1609 * lock must exist so that page content must be valid.
1611 * In fast read implementation, the llite speculatively finds and reads pages
1612 * in memory cache. There are three scenarios for fast read:
1613 * - If the page exists and is uptodate, kernel VM will provide the data and
1614 * CLIO won't be intervened;
1615 * - If the page was brought into memory by read ahead, it will be exported
1616 * and read ahead parameters will be updated;
1617 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1618 * it will go back and invoke normal read, i.e., a cl_io will be created
1619 * and DLM lock will be requested.
1621 * POSIX compliance: posix standard states that read is intended to be atomic.
1622 * Lustre read implementation is in line with Linux kernel read implementation
1623 * and neither of them complies with POSIX standard in this matter. Fast read
1624 * doesn't make the situation worse on single node but it may interleave write
1625 * results from multiple nodes due to short read handling in ll_file_aio_read().
1627 * \param env - lu_env
1628 * \param iocb - kiocb from kernel
1629 * \param iter - user space buffers where the data will be copied
1631 * \retval - number of bytes have been read, or error code if error occurred.
/*
 * Fast-read path: serve cached reads straight through
 * generic_file_read_iter() without building a cl_io (see the design
 * notes above).  Returns -ENODATA when a page is not cached so the
 * caller falls back to the normal read path.
 */
1634 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1638 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1641 /* NB: we can't do direct IO for fast read because it will need a lock
1642 * to make IO engine happy. */
1643 if (iocb->ki_filp->f_flags & O_DIRECT)
1646 result = generic_file_read_iter(iocb, iter);
1648 /* If the first page is not in cache, generic_file_aio_read() will be
1649 * returned with -ENODATA.
1650 * See corresponding code in ll_readpage(). */
1651 if (result == -ENODATA)
/* Positive result: account heat and read-bytes stats. */
1655 ll_heat_add(file_inode(iocb->ki_filp), CIT_READ, result);
1656 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1657 LPROC_LL_READ_BYTES, result);
1664 * Read from a file (through the page cache).
/*
 * VFS ->read_iter entry point.  Order of attempts: PCC (persistent
 * client cache), then fast read, then the generic cl_io path for
 * whatever remains.
 */
1666 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1669 struct vvp_io_args *args;
1670 struct file *file = iocb->ki_filp;
1674 ktime_t kstart = ktime_get();
1677 if (!iov_iter_count(to))
1681 * Currently when PCC read failed, we do not fall back to the
1682 * normal read path, just return the error.
1683 * The reason is that: for RW-PCC, the file data may be modified
1684 * in the PCC and inconsistent with the data on OSTs (or file
1685 * data has been removed from the Lustre file system), at this
1686 * time, fallback to the normal read path may read the wrong
1688 * TODO: for RO-PCC (readonly PCC), fall back to normal read
1689 * path: read data from data copy on OSTs.
1691 result = pcc_file_read_iter(iocb, to, &cached);
1695 ll_ras_enter(file, iocb->ki_pos, iov_iter_count(to));
/* Fast read may satisfy part of the request; fall through to the
 * generic path only for the remainder. */
1697 result = ll_do_fast_read(iocb, to);
1698 if (result < 0 || iov_iter_count(to) == 0)
1701 env = cl_env_get(&refcheck);
1703 return PTR_ERR(env);
1705 args = ll_env_args(env, IO_NORMAL);
1706 args->u.normal.via_iter = to;
1707 args->u.normal.via_iocb = iocb;
1709 rc2 = ll_file_io_generic(env, args, file, CIT_READ,
1710 &iocb->ki_pos, iov_iter_count(to));
1713 else if (result == 0)
1716 cl_env_put(env, &refcheck);
1719 ll_rw_stats_tally(ll_i2sbi(file_inode(file)), current->pid,
1720 LUSTRE_FPRIVATE(file), iocb->ki_pos, result,
1722 ll_stats_ops_tally(ll_i2sbi(file_inode(file)), LPROC_LL_READ,
1723 ktime_us_delta(ktime_get(), kstart));
1730 * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1731 * If a page is already in the page cache and dirty (and some other things -
1732 * See ll_tiny_write_begin for the instantiation of these rules), then we can
1733 * write to it without doing a full I/O, because Lustre already knows about it
1734 * and will write it out. This saves a lot of processing time.
1736 * All writes here are within one page, so exclusion is handled by the page
1737 * lock on the vm page. We do not do tiny writes for writes which touch
1738 * multiple pages because it's very unlikely multiple sequential pages are
1739 * already dirty.
1741 * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1742 * and are unlikely to be to already dirty pages.
1744 * Attribute updates are important here, we do them in ll_tiny_write_end.
/*
 * Tiny-write fast path: sub-page writes landing on an already-dirty
 * cached page go through __generic_file_write_iter() without a full
 * cl_io (see the design notes above).  Returns -ENODATA (mapped to 0
 * here) when the page is not eligible so the caller does a normal
 * write.
 */
1746 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1748 ssize_t count = iov_iter_count(iter);
1749 struct file *file = iocb->ki_filp;
1750 struct inode *inode = file_inode(file);
1751 bool lock_inode = !IS_NOSEC(inode);
1756 /* Restrict writes to single page and < PAGE_SIZE. See comment at top
1757 * of function for why.
1759 if (count >= PAGE_SIZE ||
1760 (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
/* Non-NOSEC inodes need i_rwsem held for suid/security stripping. */
1763 if (unlikely(lock_inode))
1765 result = __generic_file_write_iter(iocb, iter);
1767 if (unlikely(lock_inode))
1768 inode_unlock(inode);
1770 /* If the page is not already dirty, ll_tiny_write_begin returns
1771 * -ENODATA. We continue on to normal write.
1773 if (result == -ENODATA)
1777 ll_heat_add(inode, CIT_WRITE, result);
1778 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1780 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1783 CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1789 * Write to a file (through the page cache).
/*
 * VFS ->write_iter entry point.  Order of attempts: PCC (with
 * ENOSPC/EDQUOT fallback), then the tiny-write fast path, then the
 * generic cl_io write for whatever remains.  Bytes from the tiny write
 * and normal write are combined in the return value.
 */
1791 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1793 struct vvp_io_args *args;
1795 ssize_t rc_tiny = 0, rc_normal;
1796 struct file *file = iocb->ki_filp;
1799 ktime_t kstart = ktime_get();
1804 if (!iov_iter_count(from))
1805 GOTO(out, rc_normal = 0);
1808 * When PCC write failed, we usually do not fall back to the normal
1809 * write path, just return the error. But there is a special case when
1810 * returned error code is -ENOSPC due to running out of space on PCC HSM
1811 * backend. At this time, it will fall back to normal I/O path and
1812 * retry the I/O. As the file is in HSM released state, it will restore
1813 * the file data to OSTs first and redo the write again. And the
1814 * restore process will revoke the layout lock and detach the file
1815 * from PCC cache automatically.
1817 result = pcc_file_write_iter(iocb, from, &cached);
1818 if (cached && result != -ENOSPC && result != -EDQUOT)
1819 GOTO(out, rc_normal = result);
1821 /* NB: we can't do direct IO for tiny writes because they use the page
1822 * cache, we can't do sync writes because tiny writes can't flush
1823 * pages, and we can't do append writes because we can't guarantee the
1824 * required DLM locks are held to protect file size.
1826 if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(file))) &&
1827 !(file->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1828 rc_tiny = ll_do_tiny_write(iocb, from);
1830 /* In case of error, go on and try normal write - Only stop if tiny
1831 * write completed I/O.
1833 if (iov_iter_count(from) == 0)
1834 GOTO(out, rc_normal = rc_tiny);
1836 env = cl_env_get(&refcheck);
1838 return PTR_ERR(env);
1840 args = ll_env_args(env, IO_NORMAL);
1841 args->u.normal.via_iter = from;
1842 args->u.normal.via_iocb = iocb;
1844 rc_normal = ll_file_io_generic(env, args, file, CIT_WRITE,
1845 &iocb->ki_pos, iov_iter_count(from));
1847 /* On success, combine bytes written. */
1848 if (rc_tiny >= 0 && rc_normal > 0)
1849 rc_normal += rc_tiny;
1850 /* On error, only return error from normal write if tiny write did not
1851 * write any bytes. Otherwise return bytes written by tiny write.
1853 else if (rc_tiny > 0)
1854 rc_normal = rc_tiny;
1856 cl_env_put(env, &refcheck);
1858 if (rc_normal > 0) {
1859 ll_rw_stats_tally(ll_i2sbi(file_inode(file)), current->pid,
1860 LUSTRE_FPRIVATE(file), iocb->ki_pos,
1862 ll_stats_ops_tally(ll_i2sbi(file_inode(file)), LPROC_LL_WRITE,
1863 ktime_us_delta(ktime_get(), kstart));
1869 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1871 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array and compute its total byte count, mirroring
 * the kernel's __generic_file_aio_write_nolock checks: reject negative
 * or wrapping lengths and inaccessible user buffers, truncating
 * *nr_segs/*count at the first bad segment.
 */
1873 static int ll_file_get_iov_count(const struct iovec *iov,
1874 unsigned long *nr_segs, size_t *count)
1879 for (seg = 0; seg < *nr_segs; seg++) {
1880 const struct iovec *iv = &iov[seg];
1883 * If any segment has a negative length, or the cumulative
1884 * length ever wraps negative then return -EINVAL.
1887 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1889 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1894 cnt -= iv->iov_len; /* This segment is no good */
/*
 * Legacy aio_read entry (kernels without ->read_iter): validate the
 * iovec, wrap it in an iov_iter and forward to ll_file_read_iter().
 */
1901 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1902 unsigned long nr_segs, loff_t pos)
1909 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
/* iov_iter_init() signature differs across kernel versions. */
1916 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1917 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1918 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1919 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1920 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1922 result = ll_file_read_iter(iocb, &to);
1927 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1930 struct iovec iov = { .iov_base = buf, .iov_len = count };
1939 init_sync_kiocb(&kiocb, file);
1940 kiocb.ki_pos = *ppos;
1941 #ifdef HAVE_KIOCB_KI_LEFT
1942 kiocb.ki_left = count;
1943 #elif defined(HAVE_KI_NBYTES)
1944 kiocb.i_nbytes = count;
1947 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1948 *ppos = kiocb.ki_pos;
1954 * Write to a file (through the page cache).
/*
 * Legacy aio_write entry (kernels without ->write_iter): validate the
 * iovec, wrap it in an iov_iter and forward to ll_file_write_iter().
 */
1957 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1958 unsigned long nr_segs, loff_t pos)
1960 struct iov_iter from;
1965 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
/* iov_iter_init() signature differs across kernel versions. */
1972 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1973 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1974 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1975 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1976 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1978 result = ll_file_write_iter(iocb, &from);
/*
 * Legacy synchronous write entry (kernels without ->write_iter): build
 * a sync kiocb + single iovec and forward to ll_file_aio_write().
 */
1983 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1984 size_t count, loff_t *ppos)
1986 struct iovec iov = { .iov_base = (void __user *)buf,
1996 init_sync_kiocb(&kiocb, file);
1997 kiocb.ki_pos = *ppos;
/* kiocb length field name differs across kernel versions. */
1998 #ifdef HAVE_KIOCB_KI_LEFT
1999 kiocb.ki_left = count;
2000 #elif defined(HAVE_KI_NBYTES)
2001 kiocb.ki_nbytes = count;
2004 result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
2005 *ppos = kiocb.ki_pos;
2009 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
2012 * Send file content (through pagecache) somewhere with helper
/*
 * ->splice_read entry: try PCC first, then run a splice-subtype cl_io
 * through ll_file_io_generic() to feed the pipe from the page cache.
 */
2014 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
2015 struct pipe_inode_info *pipe, size_t count,
2019 struct vvp_io_args *args;
2026 result = pcc_file_splice_read(in_file, ppos, pipe,
2027 count, flags, &cached);
2031 ll_ras_enter(in_file, *ppos, count);
2033 env = cl_env_get(&refcheck);
2035 RETURN(PTR_ERR(env));
2037 args = ll_env_args(env, IO_SPLICE);
2038 args->u.splice.via_pipe = pipe;
2039 args->u.splice.via_flags = flags;
2041 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
2042 cl_env_put(env, &refcheck);
2045 ll_rw_stats_tally(ll_i2sbi(file_inode(in_file)), current->pid,
2046 LUSTRE_FPRIVATE(in_file), *ppos, result,
/*
 * Apply a striping EA (lov_user_md) to a file by opening it with an
 * open-by-FID intent carrying the layout, then releasing the open
 * handle.  Swabs the lum first on big-endian hosts.  Runs under the
 * inode size lock.
 */
2051 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
2052 __u64 flags, struct lov_user_md *lum, int lum_size)
2054 struct lookup_intent oit = {
2056 .it_flags = flags | MDS_OPEN_BY_FID,
2061 if ((__swab32(lum->lmm_magic) & le32_to_cpu(LOV_MAGIC_MASK)) ==
2062 le32_to_cpu(LOV_MAGIC_MAGIC)) {
2063 /* this code will only exist for big-endian systems */
2064 lustre_swab_lov_user_md(lum, 0);
2067 ll_inode_size_lock(inode);
2068 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
2070 GOTO(out_unlock, rc);
/* Layout applied via open intent; close the handle right away. */
2072 ll_release_openhandle(dentry, &oit);
2075 ll_inode_size_unlock(inode);
2076 ll_intent_release(&oit);
/*
 * Fetch the LOV EA (striping info) for @filename under @inode from the
 * MDS.  Validates the magic, converts from little-endian wire format to
 * host endianness where needed, and returns pointers into the reply
 * buffer.
 *
 * \param lmmp     out: LOV metadata (points into *request; caller must
 *                 keep the request until done — TODO confirm caller
 *                 frees it via ptlrpc_req_finished)
 * \param lmm_size out: size of the EA
 * \param request  out: the getattr reply holding the EA buffer
 */
2081 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
2082 struct lov_mds_md **lmmp, int *lmm_size,
2083 struct ptlrpc_request **request)
2085 struct ll_sb_info *sbi = ll_i2sbi(inode);
2086 struct mdt_body *body;
2087 struct lov_mds_md *lmm = NULL;
2088 struct ptlrpc_request *req = NULL;
2089 struct md_op_data *op_data;
2092 rc = ll_get_default_mdsize(sbi, &lmmsize);
2096 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
2097 strlen(filename), lmmsize,
2098 LUSTRE_OPC_ANY, NULL);
2099 if (IS_ERR(op_data))
2100 RETURN(PTR_ERR(op_data));
2102 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
2103 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
2104 ll_finish_md_op_data(op_data);
2106 CDEBUG(D_INFO, "md_getattr_name failed "
2107 "on %s: rc %d\n", filename, rc);
2111 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2112 LASSERT(body != NULL); /* checked by mdc_getattr_name */
2114 lmmsize = body->mbo_eadatasize;
2116 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
2118 GOTO(out, rc = -ENODATA);
2121 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
2122 LASSERT(lmm != NULL);
/* Only plain v1/v3, composite and foreign layouts are recognized. */
2124 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
2125 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
2126 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1) &&
2127 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_FOREIGN))
2128 GOTO(out, rc = -EPROTO);
2131 * This is coming from the MDS, so is probably in
2132 * little endian. We convert it to host endian before
2133 * passing it to userspace.
2135 if ((lmm->lmm_magic & __swab32(LOV_MAGIC_MAGIC)) ==
2136 __swab32(LOV_MAGIC_MAGIC)) {
2137 int stripe_count = 0;
2139 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
2140 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2141 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
2142 if (le32_to_cpu(lmm->lmm_pattern) &
2143 LOV_PATTERN_F_RELEASED)
2147 lustre_swab_lov_user_md((struct lov_user_md *)lmm, 0);
2149 /* if function called for directory - we should
2150 * avoid swabbing non-existent lsm objects */
2151 if (lmm->lmm_magic == LOV_MAGIC_V1 && S_ISREG(body->mbo_mode))
2152 lustre_swab_lov_user_md_objects(
2153 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
2155 else if (lmm->lmm_magic == LOV_MAGIC_V3 &&
2156 S_ISREG(body->mbo_mode))
2157 lustre_swab_lov_user_md_objects(
2158 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
2164 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: copy a v1 lov_user_md (with one OST object
 * entry, MDS_OPEN_HAS_OBJS) from userspace and apply it via
 * ll_lov_setstripe_ea_info().  Requires CAP_SYS_ADMIN.
 */
2169 static int ll_lov_setea(struct inode *inode, struct file *file,
2172 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2173 struct lov_user_md *lump;
2174 int lum_size = sizeof(struct lov_user_md) +
2175 sizeof(struct lov_user_ost_data);
2179 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2182 OBD_ALLOC_LARGE(lump, lum_size);
2186 if (copy_from_user(lump, arg, lum_size))
2187 GOTO(out_lump, rc = -EFAULT);
2189 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
/* O_LOV_DELAY_CREATE is one-shot: clear it regardless of outcome. */
2191 cl_lov_delay_create_clear(&file->f_flags);
2194 OBD_FREE_LARGE(lump, lum_size);
/*
 * Copy the file's striping info to the userspace buffer @lum (at most
 * @size bytes) via cl_object_getstripe().
 */
2198 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2205 env = cl_env_get(&refcheck);
2207 RETURN(PTR_ERR(env));
2209 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2210 cl_env_put(env, &refcheck);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy the layout from userspace, apply
 * it, refresh the layout generation, and write the resulting stripe
 * info back to the user buffer.
 */
2214 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2217 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2218 struct lov_user_md *klum;
2220 __u64 flags = FMODE_WRITE;
2223 rc = ll_copy_user_md(lum, &klum);
2228 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
/* Zero the user's stripe_count first so a failed getstripe below
 * doesn't leave stale data in the user buffer. */
2233 rc = put_user(0, &lum->lmm_stripe_count);
2237 rc = ll_layout_refresh(inode, &gen);
2241 rc = ll_file_getstripe(inode, arg, lum_size);
2243 cl_lov_delay_create_clear(&file->f_flags);
2246 OBD_FREE_LARGE(klum, lum_size);
/*
 * LL_IOC_GROUP_LOCK handler: take a group (GID) lock on the file.
 * Only one group id may be active per inode at a time; requests for a
 * different gid wait (or fail with -EAGAIN under O_NONBLOCK) until all
 * users of the current gid are gone.  For PFL layouts, all components
 * are instantiated first so the group lock covers every OST object.
 *
 * \param arg group id (must be non-zero)
 */
2252 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2254 struct ll_inode_info *lli = ll_i2info(inode);
2255 struct cl_object *obj = lli->lli_clob;
2256 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2257 struct ll_grouplock grouplock;
2262 CWARN("group id for group lock must not be 0\n");
2266 if (ll_file_nolock(file))
2267 RETURN(-EOPNOTSUPP);
2269 if (file->f_flags & O_NONBLOCK) {
2270 if (!mutex_trylock(&lli->lli_group_mutex))
2273 mutex_lock(&lli->lli_group_mutex);
2275 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2276 CWARN("group lock already existed with gid %lu\n",
2277 fd->fd_grouplock.lg_gid);
2278 GOTO(out, rc = -EINVAL);
/* Different gid in use: wait for all current users to drop it,
 * then retry from the top (mutex is re-acquired at "retry"). */
2280 if (arg != lli->lli_group_gid && lli->lli_group_users != 0) {
2281 if (file->f_flags & O_NONBLOCK)
2282 GOTO(out, rc = -EAGAIN);
2283 mutex_unlock(&lli->lli_group_mutex);
2284 wait_var_event(&lli->lli_group_users, !lli->lli_group_users);
2285 GOTO(retry, rc = 0);
2287 LASSERT(fd->fd_grouplock.lg_lock == NULL);
2290 * XXX: group lock needs to protect all OST objects while PFL
2291 * can add new OST objects during the IO, so we'd instantiate
2292 * all OST objects before getting its group lock.
2297 struct cl_layout cl = {
2298 .cl_is_composite = false,
2300 struct lu_extent ext = {
2302 .e_end = OBD_OBJECT_EOF,
2305 env = cl_env_get(&refcheck);
2307 GOTO(out, rc = PTR_ERR(env));
2309 rc = cl_object_layout_get(env, obj, &cl);
2310 if (!rc && cl.cl_is_composite)
2311 rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2314 cl_env_put(env, &refcheck);
2319 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2320 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* Success: record the grouplock in the fd and bump the per-inode
 * user count / current gid. */
2325 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2326 fd->fd_grouplock = grouplock;
2327 if (lli->lli_group_users == 0)
2328 lli->lli_group_gid = grouplock.lg_gid;
2329 lli->lli_group_users++;
2331 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
2333 mutex_unlock(&lli->lli_group_mutex);
/*
 * LL_IOC_GROUP_UNLOCK handler: drop this fd's group lock (gid must
 * match @arg), decrement the per-inode group-user count and wake any
 * waiters in ll_get_grouplock() when it reaches zero.
 */
2338 static int ll_put_grouplock(struct inode *inode, struct file *file,
2341 struct ll_inode_info *lli = ll_i2info(inode);
2342 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2343 struct ll_grouplock grouplock;
2347 mutex_lock(&lli->lli_group_mutex);
2348 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2349 CWARN("no group lock held\n");
2350 GOTO(out, rc = -EINVAL);
2353 LASSERT(fd->fd_grouplock.lg_lock != NULL);
2355 if (fd->fd_grouplock.lg_gid != arg) {
2356 CWARN("group lock %lu doesn't match current id %lu\n",
2357 arg, fd->fd_grouplock.lg_gid);
2358 GOTO(out, rc = -EINVAL);
/* Detach the grouplock from the fd before releasing it. */
2361 grouplock = fd->fd_grouplock;
2362 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2363 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2365 cl_put_grouplock(&grouplock);
2367 lli->lli_group_users--;
2368 if (lli->lli_group_users == 0) {
2369 lli->lli_group_gid = 0;
2370 wake_up_var(&lli->lli_group_users);
2372 CDEBUG(D_INFO, "group lock %lu released\n", arg);
2375 mutex_unlock(&lli->lli_group_mutex);
2381 * Close inode open handle
2383 * \param dentry [in] dentry which contains the inode
2384 * \param it [in,out] intent which contains open info and result
2387 * \retval <0 failure
/*
 * Close the MDS open handle left over from an open intent (used when
 * the open was done only to carry an intent, e.g. setstripe).
 * No-op for the root dentry or when the intent carried no open.
 */
2389 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2391 struct inode *inode = dentry->d_inode;
2392 struct obd_client_handle *och;
2398 /* Root ? Do nothing. */
2399 if (dentry->d_inode->i_sb->s_root == dentry)
2402 /* No open handle to close? Move away */
2403 if (!it_disposition(it, DISP_OPEN_OPEN))
2406 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2408 OBD_ALLOC(och, sizeof(*och));
2410 GOTO(out, rc = -ENOMEM);
2412 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2416 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2418 /* this one is in place of ll_file_open */
2419 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2420 ptlrpc_req_finished(it->it_request);
2421 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2427 * Get size for inode for which FIEMAP mapping is requested.
2428 * Make the FIEMAP get_info call and returns the result.
2429 * \param fiemap kernel buffer to hold extens
2430 * \param num_bytes kernel buffer size
/*
 * Execute a FIEMAP request for @inode: validate flags, flush dirty
 * pages when FIEMAP_FLAG_SYNC is set, glimpse the file size if needed,
 * and pass the request down through cl_object_fiemap().
 *
 * \param fiemap    kernel buffer holding request + extent array
 * \param num_bytes total size of the kernel buffer
 */
2432 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2438 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2441 /* Checks for fiemap flags */
2442 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2443 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2447 /* Check for FIEMAP_FLAG_SYNC */
2448 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2449 rc = filemap_fdatawrite(inode->i_mapping);
2454 env = cl_env_get(&refcheck);
2456 RETURN(PTR_ERR(env));
/* A zero cached size may just mean we never glimpsed; refresh it. */
2458 if (i_size_read(inode) == 0) {
2459 rc = ll_glimpse_size(inode);
2464 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2465 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2466 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2468 /* If filesize is 0, then there would be no objects for mapping */
2469 if (fmkey.lfik_oa.o_size == 0) {
2470 fiemap->fm_mapped_extents = 0;
2474 fmkey.lfik_fiemap = *fiemap;
2476 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2477 &fmkey, fiemap, &num_bytes);
2479 cl_env_put(env, &refcheck);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a path via the MDC.
 * Requires CAP_DAC_READ_SEARCH unless the user_fid2path mount flag is
 * set.  The user-supplied getinfo_fid2path (with gf_pathlen-sized path
 * buffer) is copied in, sent to the MDT, and copied back.
 */
2483 int ll_fid2path(struct inode *inode, void __user *arg)
2485 struct obd_export *exp = ll_i2mdexp(inode);
2486 const struct getinfo_fid2path __user *gfin = arg;
2488 struct getinfo_fid2path *gfout;
2494 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2495 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2498 /* Only need to get the buflen */
2499 if (get_user(pathlen, &gfin->gf_pathlen))
/* Bound the kernel allocation by the user-claimed path length. */
2502 if (pathlen > PATH_MAX)
2505 outsize = sizeof(*gfout) + pathlen;
2506 OBD_ALLOC(gfout, outsize);
2510 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2511 GOTO(gf_free, rc = -EFAULT);
2512 /* append root FID after gfout to let MDT know the root FID so that it
2513 * can lookup the correct path, this is mainly for fileset.
2514 * old server without fileset mount support will ignore this. */
2515 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2517 /* Call mdc_iocontrol */
2518 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2522 if (copy_to_user(arg, gfout, outsize))
2526 OBD_FREE(gfout, outsize);
/*
 * Run a CIT_DATA_VERSION cl_io to obtain the file's data version (and
 * layout version) from the OSTs, honoring the flush flags in
 * ioc->idv_flags.  A file with no cl_object yet reports version 0.
 */
2531 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2533 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2541 ioc->idv_version = 0;
2542 ioc->idv_layout_version = UINT_MAX;
2544 /* If no file object initialized, we consider its version is 0. */
2548 env = cl_env_get(&refcheck);
2550 RETURN(PTR_ERR(env));
2552 io = vvp_env_thread_io(env);
2554 io->u.ci_data_version.dv_data_version = 0;
2555 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2556 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2559 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2560 result = cl_io_loop(env, io);
2562 result = io->ci_result;
2564 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2565 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2567 cl_io_fini(env, io);
/* Layout changed mid-io: retry with a fresh cl_io. */
2569 if (unlikely(io->ci_need_restart))
2572 cl_env_put(env, &refcheck);
2578 * Read the data_version for inode.
2580 * This value is computed using stripe object version on OST.
2581 * Version is computed using server side locking.
2583 * @param flags if do sync on the OST side;
2585 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2586 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/* Thin wrapper around ll_ioc_data_version() that returns only the
 * data version through *data_version. */
2588 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2590 struct ioc_data_version ioc = { .idv_flags = flags };
2593 rc = ll_ioc_data_version(inode, &ioc);
/* only store the result on success (rc check elided in this listing) */
2595 *data_version = ioc.idv_version;
2601 * Trigger a HSM release request for the provided inode.
/* Takes a write lease, flushes and records the current data version,
 * merges attributes, then closes the open handle with MDS_HSM_RELEASE
 * so the MDT can release the file's data to the HSM archive. */
2603 int ll_hsm_release(struct inode *inode)
2606 struct obd_client_handle *och = NULL;
2607 __u64 data_version = 0;
2612 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2613 ll_i2sbi(inode)->ll_fsname,
2614 PFID(&ll_i2info(inode)->lli_fid));
/* exclusive write lease guarantees no concurrent modification */
2616 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2618 GOTO(out, rc = PTR_ERR(och));
2620 /* Grab latest data_version and [am]time values */
2621 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2625 env = cl_env_get(&refcheck);
2627 GOTO(out, rc = PTR_ERR(env));
2629 rc = ll_merge_attr(env, inode);
2630 cl_env_put(env, &refcheck);
2632 /* If error happen, we have the wrong size for a file.
2638 /* Release the file.
2639 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2640 * we still need it to pack l_remote_handle to MDT. */
2641 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
/* on any failure path, close the lease we opened above */
2647 if (och != NULL && !IS_ERR(och)) /* close the file */
2648 ll_lease_close(och, inode, NULL);
/* Working state for ll_swap_layouts(): the two inodes being swapped
 * plus (elided in this listing) their data versions and check flags. */
2653 struct ll_swap_stack {
2656 struct inode *inode1;
2657 struct inode *inode2;
/*
 * Swap the layouts of two open files (LL_IOC_LOV_SWAP_LAYOUTS).
 * Orders the pair by FID to serialize concurrent swaps, optionally takes
 * group locks to flush dirty cache, optionally verifies data versions
 * have not changed, then sends the swap to the MDT.
 * NOTE(review): grouplock checks, labels (putgl/free) and the final
 * RETURN are elided in this listing.
 */
2662 static int ll_swap_layouts(struct file *file1, struct file *file2,
2663 struct lustre_swap_layouts *lsl)
2665 struct mdc_swap_layouts msl;
2666 struct md_op_data *op_data;
2669 struct ll_swap_stack *llss = NULL;
2672 OBD_ALLOC_PTR(llss);
2676 llss->inode1 = file_inode(file1);
2677 llss->inode2 = file_inode(file2);
2679 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2683 /* we use 2 bool because it is easier to swap than 2 bits */
2684 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2685 llss->check_dv1 = true;
2687 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2688 llss->check_dv2 = true;
2690 /* we cannot use lsl->sl_dvX directly because we may swap them */
2691 llss->dv1 = lsl->sl_dv1;
2692 llss->dv2 = lsl->sl_dv2;
2694 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2695 if (rc == 0) /* same file, done! */
/* always lock/process the lower FID first to avoid ABBA deadlocks */
2698 if (rc < 0) { /* sequentialize it */
2699 swap(llss->inode1, llss->inode2);
2701 swap(llss->dv1, llss->dv2);
2702 swap(llss->check_dv1, llss->check_dv2);
2706 if (gid != 0) { /* application asks to flush dirty cache */
2707 rc = ll_get_grouplock(llss->inode1, file1, gid);
2711 rc = ll_get_grouplock(llss->inode2, file2, gid);
2713 ll_put_grouplock(llss->inode1, file1, gid);
2718 /* ultimate check, before swaping the layouts we check if
2719 * dataversion has changed (if requested) */
2720 if (llss->check_dv1) {
2721 rc = ll_data_version(llss->inode1, &dv, 0);
2724 if (dv != llss->dv1)
2725 GOTO(putgl, rc = -EAGAIN);
2728 if (llss->check_dv2) {
2729 rc = ll_data_version(llss->inode2, &dv, 0);
2732 if (dv != llss->dv2)
2733 GOTO(putgl, rc = -EAGAIN);
2736 /* struct md_op_data is used to send the swap args to the mdt
2737 * only flags is missing, so we use struct mdc_swap_layouts
2738 * through the md_op_data->op_data */
2739 /* flags from user space have to be converted before they are send to
2740 * server, no flag is sent today, they are only used on the client */
2743 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2744 0, LUSTRE_OPC_ANY, &msl);
2745 if (IS_ERR(op_data))
2746 GOTO(free, rc = PTR_ERR(op_data));
2748 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2749 sizeof(*op_data), op_data, NULL);
2750 ll_finish_md_op_data(op_data);
/* release group locks in reverse acquisition order */
2757 ll_put_grouplock(llss->inode2, file2, gid);
2758 ll_put_grouplock(llss->inode1, file1, gid);
/*
 * Set/clear HSM state flags on an inode (LL_IOC_HSM_STATE_SET).
 * Validates the requested masks and archive id, then forwards the
 * request to the MDT via obd_iocontrol().
 */
2768 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2770 struct obd_export *exp = ll_i2mdexp(inode);
2771 struct md_op_data *op_data;
2775 /* Detect out-of range masks */
2776 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2779 /* Non-root users are forbidden to set or clear flags which are
2780 * NOT defined in HSM_USER_MASK. */
2781 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2782 !cfs_capable(CFS_CAP_SYS_ADMIN))
/* old servers without archive-id arrays only support a limited range */
2785 if (!exp_connect_archive_id_array(exp)) {
2786 /* Detect out-of range archive id */
2787 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2788 (hss->hss_archive_id > LL_HSM_ORIGIN_MAX_ARCHIVE))
2792 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2793 LUSTRE_OPC_ANY, hss);
2794 if (IS_ERR(op_data))
2795 RETURN(PTR_ERR(op_data));
2797 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, exp, sizeof(*op_data),
2800 ll_finish_md_op_data(op_data);
/*
 * Import a file that already exists in the HSM archive
 * (LL_IOC_HSM_IMPORT): mark it ARCHIVED|EXISTS|RELEASED, then force its
 * attributes (mode/uid/gid/size/times) to the values recorded in the
 * hsm_user_import request.
 * NOTE(review): inode_lock(), the out label and frees are elided here.
 */
2805 static int ll_hsm_import(struct inode *inode, struct file *file,
2806 struct hsm_user_import *hui)
2808 struct hsm_state_set *hss = NULL;
2809 struct iattr *attr = NULL;
/* HSM import only makes sense for regular files */
2813 if (!S_ISREG(inode->i_mode))
2819 GOTO(out, rc = -ENOMEM);
2821 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2822 hss->hss_archive_id = hui->hui_archive_id;
2823 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2824 rc = ll_hsm_state_set(inode, hss);
2828 OBD_ALLOC_PTR(attr);
2830 GOTO(out, rc = -ENOMEM);
/* force S_IFREG and strip any non-permission mode bits from userspace */
2832 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2833 attr->ia_mode |= S_IFREG;
2834 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2835 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2836 attr->ia_size = hui->hui_size;
2837 attr->ia_mtime.tv_sec = hui->hui_mtime;
2838 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2839 attr->ia_atime.tv_sec = hui->hui_atime;
2840 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2842 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2843 ATTR_UID | ATTR_GID |
2844 ATTR_MTIME | ATTR_MTIME_SET |
2845 ATTR_ATIME | ATTR_ATIME_SET;
2849 rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2853 inode_unlock(inode);
/* Translate kernel fmode_t read/write bits into the LL_LEASE_{RD,WR}LCK
 * lease-type bits reported back to userspace. */
2865 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2867 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2868 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * LL_IOC_FUTIMES_3: set atime, mtime AND ctime of a regular file from a
 * struct ll_futimes_3.  Root-only, because setting ctime is normally
 * impossible through the VFS.
 * NOTE(review): iattr field names, permission RETURNs, inode_lock() and
 * the final RETURN are elided in this listing.
 */
2871 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2873 struct inode *inode = file_inode(file);
2875 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2876 ATTR_MTIME | ATTR_MTIME_SET |
2879 .tv_sec = lfu->lfu_atime_sec,
2880 .tv_nsec = lfu->lfu_atime_nsec,
2883 .tv_sec = lfu->lfu_mtime_sec,
2884 .tv_nsec = lfu->lfu_mtime_nsec,
2887 .tv_sec = lfu->lfu_ctime_sec,
2888 .tv_nsec = lfu->lfu_ctime_nsec,
/* only root may rewrite ctime */
2894 if (!capable(CAP_SYS_ADMIN))
2897 if (!S_ISREG(inode->i_mode))
2901 rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2903 inode_unlock(inode);
/* Map the userspace lockahead mode (MODE_READ_USER/MODE_WRITE_USER) to
 * the internal cl_lock_mode; return values are elided in this listing. */
2908 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2911 case MODE_READ_USER:
2913 case MODE_WRITE_USER:
/* printable names for the userspace lock modes, used in debug output */
2920 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2922 /* Used to allow the upper layers of the client to request an LDLM lock
2923 * without doing an actual read or write.
2925 * Used for ladvise lockahead to manually request specific locks.
2927 * \param[in] file file this ladvise lock request is on
2928 * \param[in] ladvise ladvise struct describing this lock request
2930 * \retval 0 success, no detailed result available (sync requests
2931 * and requests sent to the server [not handled locally]
2932 * cannot return detailed results)
2933 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2934 * see definitions for details.
2935 * \retval negative negative errno on error
2937 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2939 struct lu_env *env = NULL;
2940 struct cl_io *io = NULL;
2941 struct cl_lock *lock = NULL;
2942 struct cl_lock_descr *descr = NULL;
2943 struct dentry *dentry = file->f_path.dentry;
2944 struct inode *inode = dentry->d_inode;
2945 enum cl_lock_mode cl_mode;
2946 off_t start = ladvise->lla_start;
2947 off_t end = ladvise->lla_end;
2953 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2954 "start=%llu, end=%llu\n", dentry->d_name.len,
2955 dentry->d_name.name, dentry->d_inode,
2956 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2959 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2961 GOTO(out, result = cl_mode);
2963 /* Get IO environment */
2964 result = cl_io_get(inode, &env, &io, &refcheck);
2968 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2971 * nothing to do for this io. This currently happens when
2972 * stripe sub-object's are not yet created.
2974 result = io->ci_result;
2975 } else if (result == 0) {
2976 lock = vvp_env_lock(env);
2977 descr = &lock->cll_descr;
2979 descr->cld_obj = io->ci_obj;
2980 /* Convert byte offsets to pages */
2981 descr->cld_start = cl_index(io->ci_obj, start);
2982 descr->cld_end = cl_index(io->ci_obj, end);
2983 descr->cld_mode = cl_mode;
2984 /* CEF_MUST is used because we do not want to convert a
2985 * lockahead request to a lockless lock */
2986 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2989 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2990 descr->cld_enq_flags |= CEF_SPECULATIVE;
2992 result = cl_lock_request(env, io, lock);
2994 /* On success, we need to release the lock */
2996 cl_lock_release(env, lock);
2998 cl_io_fini(env, io);
2999 cl_env_put(env, &refcheck);
3001 /* -ECANCELED indicates a matching lock with a different extent
3002 * was already present, and -EEXIST indicates a matching lock
3003 * on exactly the same extent was already present.
3004 * We convert them to positive values for userspace to make
3005 * recognizing true errors easier.
3006 * Note we can only return these detailed results on async requests,
3007 * as sync requests look the same as i/o requests for locking. */
3008 if (result == -ECANCELED)
3009 result = LLA_RESULT_DIFFERENT;
3010 else if (result == -EEXIST)
3011 result = LLA_RESULT_SAME;
/* printable advice names used in the CDEBUG diagnostics below */
3016 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
/*
 * Validate one llapi_lu_ladvise entry: known advice value, per-advice
 * flags within the allowed mask, valid lockahead mode, and a non-empty
 * byte range.  Returns 0 or a negative errno (rc assignments and the
 * final RETURN are elided in this listing).
 */
3018 static int ll_ladvise_sanity(struct inode *inode,
3019 struct llapi_lu_ladvise *ladvise)
3021 struct ll_sb_info *sbi = ll_i2sbi(inode);
3022 enum lu_ladvise_type advice = ladvise->lla_advice;
3023 /* Note the peradvice flags is a 32 bit field, so per advice flags must
3024 * be in the first 32 bits of enum ladvise_flags */
3025 __u32 flags = ladvise->lla_peradvice_flags;
3026 /* 3 lines at 80 characters per line, should be plenty */
3029 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
3031 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
3032 "last supported advice is %s (value '%d'): rc = %d\n",
3033 sbi->ll_fsname, advice,
3034 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
3038 /* Per-advice checks */
3040 case LU_LADVISE_LOCKNOEXPAND:
3041 if (flags & ~LF_LOCKNOEXPAND_MASK) {
3043 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
3044 "rc = %d\n", sbi->ll_fsname, flags,
3045 ladvise_names[advice], rc);
3049 case LU_LADVISE_LOCKAHEAD:
3050 /* Currently only READ and WRITE modes can be requested */
3051 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
3052 ladvise->lla_lockahead_mode == 0) {
3054 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
3055 "rc = %d\n", sbi->ll_fsname,
3056 ladvise->lla_lockahead_mode,
3057 ladvise_names[advice], rc);
3061 case LU_LADVISE_WILLREAD:
3062 case LU_LADVISE_DONTNEED:
3064 /* Note fall through above - These checks apply to all advices
3065 * except LOCKNOEXPAND */
3066 if (flags & ~LF_DEFAULT_MASK) {
3068 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
3069 "rc = %d\n", sbi->ll_fsname, flags,
3070 ladvise_names[advice], rc);
3073 if (ladvise->lla_start >= ladvise->lla_end) {
3075 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
3076 "for %s: rc = %d\n", sbi->ll_fsname,
3077 ladvise->lla_start, ladvise->lla_end,
3078 ladvise_names[advice], rc);
3090 * Give file access advices
3092 * The ladvise interface is similar to Linux fadvise() system call, except it
3093 * forwards the advices directly from Lustre client to server. The server side
3094 * codes will apply appropriate read-ahead and caching techniques for the
3095 * corresponding files.
3097 * A typical workload for ladvise is e.g. a bunch of different clients are
3098 * doing small random reads of a file, so prefetching pages into OSS cache
3099 * with big linear reads before the random IO is a net benefit. Fetching
3100 * all that data into each client cache with fadvise() may not be, due to
3101 * much more data being sent to the client.
/* Run a CIT_LADVISE cl_io carrying one advice over the file's object. */
3103 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
3104 struct llapi_lu_ladvise *ladvise)
3108 struct cl_ladvise_io *lio;
3113 env = cl_env_get(&refcheck);
3115 RETURN(PTR_ERR(env));
3117 io = vvp_env_thread_io(env);
3118 io->ci_obj = ll_i2info(inode)->lli_clob;
3120 /* initialize parameters for ladvise */
3121 lio = &io->u.ci_ladvise;
3122 lio->li_start = ladvise->lla_start;
3123 lio->li_end = ladvise->lla_end;
3124 lio->li_fid = ll_inode2fid(inode);
3125 lio->li_advice = ladvise->lla_advice;
3126 lio->li_flags = flags;
3128 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
3129 rc = cl_io_loop(env, io);
3133 cl_io_fini(env, io);
3134 cl_env_put(env, &refcheck);
/* LU_LADVISE_LOCKNOEXPAND handler: record per-fd whether DLM lock
 * expansion should be suppressed (LF_UNSET clears the hint). */
3138 static int ll_lock_noexpand(struct file *file, int flags)
3140 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3142 fd->ll_lock_no_expand = !(flags & LF_UNSET);
/*
 * FS_IOC_FSGETXATTR-style handler: fill a struct fsxattr with the
 * inode's extended flags and project id and copy it to userspace.
 * NOTE(review): -EFAULT RETURNs and the success RETURN are elided.
 */
3147 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
3150 struct fsxattr fsxattr;
3152 if (copy_from_user(&fsxattr,
3153 (const struct fsxattr __user *)arg,
3157 fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
3158 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
3159 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
3160 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
3161 if (copy_to_user((struct fsxattr __user *)arg,
3162 &fsxattr, sizeof(fsxattr)))
/*
 * Reject project-quota changes requested from inside a user namespace:
 * changing the project id or the PROJINHERIT flag is only allowed from
 * the init namespace (error RETURNs elided in this listing).
 */
3168 int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
3171 * Project Quota ID state is only allowed to change from within the init
3172 * namespace. Enforce that restriction only if we are trying to change
3173 * the quota ID state. Everything else is allowed in user namespaces.
3175 if (current_user_ns() == &init_user_ns)
3178 if (ll_i2info(inode)->lli_projid != fa->fsx_projid)
3181 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) {
3182 if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
3185 if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
/*
 * FS_IOC_FSSETXATTR-style handler: set extended flags and project id.
 * The change is sent to the MDT via md_setattr(), mirrored into the
 * local inode flags, and then pushed to the OSTs via cl_setattr_ost()
 * when the file has a data object.
 * NOTE(review): the out_fsxattr label, attr free and final RETURN are
 * elided in this listing.
 */
3192 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3196 struct md_op_data *op_data;
3197 struct ptlrpc_request *req = NULL;
3199 struct fsxattr fsxattr;
3200 struct cl_object *obj;
3204 if (copy_from_user(&fsxattr,
3205 (const struct fsxattr __user *)arg,
3209 rc = ll_ioctl_check_project(inode, &fsxattr);
3213 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3214 LUSTRE_OPC_ANY, NULL);
3215 if (IS_ERR(op_data))
3216 RETURN(PTR_ERR(op_data));
3218 flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3219 op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3220 if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3221 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3222 op_data->op_projid = fsxattr.fsx_projid;
3223 op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
3224 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3226 ptlrpc_req_finished(req);
3228 GOTO(out_fsxattr, rc);
3229 ll_update_inode_flags(inode, op_data->op_attr_flags);
/* no data object (e.g. released file): MDT update is sufficient */
3230 obj = ll_i2info(inode)->lli_clob;
3232 GOTO(out_fsxattr, rc);
3234 OBD_ALLOC_PTR(attr);
3236 GOTO(out_fsxattr, rc = -ENOMEM);
3238 rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3239 fsxattr.fsx_xflags);
3242 ll_finish_md_op_data(op_data);
/*
 * LL_LEASE_UNLCK handler: release this fd's lease, optionally carrying a
 * close intent (resync-done, layout merge/split, PCC attach) encoded in
 * ioc->lil_flags.  Returns the lease type that was held, via
 * ll_lease_type_from_fmode().
 *
 * FIX(review): the expressions "¶m.pa_archive_id" and
 * "¶m.pa_data_version" were encoding-corrupted forms of
 * "&param.pa_archive_id" / "&param.pa_data_version" ('&para' was turned
 * into the '¶' glyph); '¶' is not a valid C token so the corrupted text
 * could not compile.  Restored the address-of expressions on the local
 * 'struct pcc_param param' declared above.
 *
 * NOTE(review): several original lines are elided in this listing
 * (local declarations such as the per-case __u32 fd/fdv/mirror_id,
 * unlock/error paths, fput() of layout_file, and the final RETURN).
 */
3246 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3249 struct inode *inode = file_inode(file);
3250 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3251 struct ll_inode_info *lli = ll_i2info(inode);
3252 struct obd_client_handle *och = NULL;
3253 struct split_param sp;
3254 struct pcc_param param;
3255 bool lease_broken = false;
3257 enum mds_op_bias bias = 0;
3258 struct file *layout_file = NULL;
3260 size_t data_size = 0;
3261 bool attached = false;
/* detach the lease handle from the fd under the och mutex */
3266 mutex_lock(&lli->lli_och_mutex);
3267 if (fd->fd_lease_och != NULL) {
3268 och = fd->fd_lease_och;
3269 fd->fd_lease_och = NULL;
3271 mutex_unlock(&lli->lli_och_mutex);
3276 fmode = och->och_flags;
3278 switch (ioc->lil_flags) {
3279 case LL_LEASE_RESYNC_DONE:
3280 if (ioc->lil_count > IOC_IDS_MAX)
3281 GOTO(out_lease_close, rc = -EINVAL);
3283 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3284 OBD_ALLOC(data, data_size);
3286 GOTO(out_lease_close, rc = -ENOMEM);
3288 if (copy_from_user(data, (void __user *)arg, data_size))
3289 GOTO(out_lease_close, rc = -EFAULT);
3291 bias = MDS_CLOSE_RESYNC_DONE;
3293 case LL_LEASE_LAYOUT_MERGE: {
3296 if (ioc->lil_count != 1)
3297 GOTO(out_lease_close, rc = -EINVAL);
/* the victim fd follows the ioc header in the user buffer;
 * NOTE(review): 'fd' here is presumably a local __u32 declared on an
 * elided line, shadowing the ll_file_data pointer above — confirm */
3299 arg += sizeof(*ioc);
3300 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3301 GOTO(out_lease_close, rc = -EFAULT);
3303 layout_file = fget(fd);
3305 GOTO(out_lease_close, rc = -EBADF);
/* both files must be writable to merge layouts */
3307 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3308 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3309 GOTO(out_lease_close, rc = -EPERM);
3311 data = file_inode(layout_file);
3312 bias = MDS_CLOSE_LAYOUT_MERGE;
3315 case LL_LEASE_LAYOUT_SPLIT: {
3319 if (ioc->lil_count != 2)
3320 GOTO(out_lease_close, rc = -EINVAL);
3322 arg += sizeof(*ioc);
3323 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3324 GOTO(out_lease_close, rc = -EFAULT);
3326 arg += sizeof(__u32);
3327 if (copy_from_user(&mirror_id, (void __user *)arg,
3329 GOTO(out_lease_close, rc = -EFAULT);
3331 layout_file = fget(fdv);
3333 GOTO(out_lease_close, rc = -EBADF);
3335 sp.sp_inode = file_inode(layout_file);
3336 sp.sp_mirror_id = (__u16)mirror_id;
3338 bias = MDS_CLOSE_LAYOUT_SPLIT;
3341 case LL_LEASE_PCC_ATTACH:
3342 if (ioc->lil_count != 1)
3345 arg += sizeof(*ioc);
3346 if (copy_from_user(&param.pa_archive_id, (void __user *)arg,
3348 GOTO(out_lease_close, rc2 = -EFAULT);
3350 rc2 = pcc_readwrite_attach(file, inode, param.pa_archive_id);
3352 GOTO(out_lease_close, rc2);
3355 /* Grab latest data version */
3356 rc2 = ll_data_version(inode, &param.pa_data_version,
3359 GOTO(out_lease_close, rc2);
3362 bias = MDS_PCC_ATTACH;
3365 /* without close intent */
3370 rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3374 rc = ll_lease_och_release(inode, file);
/* post-close cleanup specific to each intent */
3383 switch (ioc->lil_flags) {
3384 case LL_LEASE_RESYNC_DONE:
3386 OBD_FREE(data, data_size);
3388 case LL_LEASE_LAYOUT_MERGE:
3389 case LL_LEASE_LAYOUT_SPLIT:
3393 case LL_LEASE_PCC_ATTACH:
3396 rc = pcc_readwrite_attach_fini(file, inode,
3397 param.pa_layout_gen,
/* report the lease type that was held back to the caller */
3404 rc = ll_lease_type_from_fmode(fmode);
/*
 * LL_IOC_SET_LEASE handler: acquire (or via LL_LEASE_UNLCK release) a
 * file lease.  Validates that the requested lease mode matches how the
 * file was opened, optionally starts a mirror resync, and stores the new
 * lease handle in the per-fd state if no lease is already held.
 * NOTE(review): RETURNs for invalid modes, the out label and the final
 * RETURN are elided in this listing.
 */
3408 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3411 struct inode *inode = file_inode(file);
3412 struct ll_inode_info *lli = ll_i2info(inode);
3413 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3414 struct obd_client_handle *och = NULL;
3415 __u64 open_flags = 0;
3421 switch (ioc->lil_mode) {
3422 case LL_LEASE_WRLCK:
/* a write lease requires the file to be open for write */
3423 if (!(file->f_mode & FMODE_WRITE))
3425 fmode = FMODE_WRITE;
3427 case LL_LEASE_RDLCK:
3428 if (!(file->f_mode & FMODE_READ))
3432 case LL_LEASE_UNLCK:
3433 RETURN(ll_file_unlock_lease(file, ioc, arg));
3438 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3440 /* apply for lease */
3441 if (ioc->lil_flags & LL_LEASE_RESYNC)
3442 open_flags = MDS_OPEN_RESYNC;
3443 och = ll_lease_open(inode, file, fmode, open_flags);
3445 RETURN(PTR_ERR(och));
3447 if (ioc->lil_flags & LL_LEASE_RESYNC) {
3448 rc = ll_lease_file_resync(och, inode, arg);
3450 ll_lease_close(och, inode, NULL);
3453 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3455 ll_lease_close(och, inode, NULL);
/* install the lease handle unless one is already present */
3461 mutex_lock(&lli->lli_och_mutex);
3462 if (fd->fd_lease_och == NULL) {
3463 fd->fd_lease_och = och;
3466 mutex_unlock(&lli->lli_och_mutex);
3468 /* impossible now that only excl is supported for now */
3469 ll_lease_close(och, inode, &lease_broken);
/* Snapshot the inode's file-heat counters (decayed to 'now') and flags
 * into the caller-provided lu_heat, under lli_heat_lock. */
3475 static void ll_heat_get(struct inode *inode, struct lu_heat *heat)
3477 struct ll_inode_info *lli = ll_i2info(inode);
3478 struct ll_sb_info *sbi = ll_i2sbi(inode);
3479 __u64 now = ktime_get_real_seconds();
3482 spin_lock(&lli->lli_heat_lock);
3483 heat->lh_flags = lli->lli_heat_flags;
/* caller caps lh_count at OBD_HEAT_COUNT before calling (see ioctl) */
3484 for (i = 0; i < heat->lh_count; i++)
3485 heat->lh_heat[i] = obd_heat_get(&lli->lli_heat_instances[i],
3486 now, sbi->ll_heat_decay_weight,
3487 sbi->ll_heat_period_second);
3488 spin_unlock(&lli->lli_heat_lock);
/* LL_IOC_HEAT_SET handler: clear the heat counters and/or toggle heat
 * accounting for this inode, under lli_heat_lock. */
3491 static int ll_heat_set(struct inode *inode, enum lu_heat_flag flags)
3493 struct ll_inode_info *lli = ll_i2info(inode);
3496 spin_lock(&lli->lli_heat_lock);
3497 if (flags & LU_HEAT_FLAG_CLEAR)
3498 obd_heat_clear(lli->lli_heat_instances, OBD_HEAT_COUNT);
3500 if (flags & LU_HEAT_FLAG_OFF)
3501 lli->lli_heat_flags |= LU_HEAT_FLAG_OFF;
3503 lli->lli_heat_flags &= ~LU_HEAT_FLAG_OFF;
3505 spin_unlock(&lli->lli_heat_lock);
/*
 * Main ioctl dispatcher for regular Lustre files (file_operations
 * .unlocked_ioctl).  Decodes cmd, copies argument structures across the
 * user boundary, and delegates to per-feature helpers (striping, HSM,
 * leases, ladvise, project xattrs, heat, PCC, ...).  Unknown commands
 * fall through to obd_iocontrol() on the data export.
 * NOTE(review): many RETURN/alloc/free/error lines and some case labels
 * are elided in this listing.
 */
3511 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3513 struct inode *inode = file_inode(file);
3514 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3518 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3519 PFID(ll_inode2fid(inode)), inode, cmd);
3520 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3522 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
3523 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3527 case LL_IOC_GETFLAGS:
3528 /* Get the current value of the file flags */
3529 return put_user(fd->fd_flags, (int __user *)arg);
3530 case LL_IOC_SETFLAGS:
3531 case LL_IOC_CLRFLAGS:
3532 /* Set or clear specific file flags */
3533 /* XXX This probably needs checks to ensure the flags are
3534 * not abused, and to handle any flag side effects.
3536 if (get_user(flags, (int __user *) arg))
3539 if (cmd == LL_IOC_SETFLAGS) {
3540 if ((flags & LL_FILE_IGNORE_LOCK) &&
3541 !(file->f_flags & O_DIRECT)) {
3542 CERROR("%s: unable to disable locking on "
3543 "non-O_DIRECT file\n", current->comm);
3547 fd->fd_flags |= flags;
3549 fd->fd_flags &= ~flags;
3552 case LL_IOC_LOV_SETSTRIPE:
3553 case LL_IOC_LOV_SETSTRIPE_NEW:
3554 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3555 case LL_IOC_LOV_SETEA:
3556 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3557 case LL_IOC_LOV_SWAP_LAYOUTS: {
3559 struct lustre_swap_layouts lsl;
3561 if (copy_from_user(&lsl, (char __user *)arg,
3562 sizeof(struct lustre_swap_layouts)))
/* swapping layouts requires write access on both files */
3565 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3568 file2 = fget(lsl.sl_fd);
3572 /* O_WRONLY or O_RDWR */
3573 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3574 GOTO(out, rc = -EPERM);
3576 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3577 struct inode *inode2;
3578 struct ll_inode_info *lli;
3579 struct obd_client_handle *och = NULL;
3581 lli = ll_i2info(inode);
3582 mutex_lock(&lli->lli_och_mutex);
3583 if (fd->fd_lease_och != NULL) {
3584 och = fd->fd_lease_och;
3585 fd->fd_lease_och = NULL;
3587 mutex_unlock(&lli->lli_och_mutex);
/* SWAP_LAYOUTS_CLOSE requires a lease to be held on this fd */
3589 GOTO(out, rc = -ENOLCK);
3590 inode2 = file_inode(file2);
3591 rc = ll_swap_layouts_close(och, inode, inode2);
3593 rc = ll_swap_layouts(file, file2, &lsl);
3599 case LL_IOC_LOV_GETSTRIPE:
3600 case LL_IOC_LOV_GETSTRIPE_NEW:
3601 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3602 case FS_IOC_GETFLAGS:
3603 case FS_IOC_SETFLAGS:
3604 RETURN(ll_iocontrol(inode, file, cmd, arg));
3605 case FSFILT_IOC_GETVERSION:
3606 case FS_IOC_GETVERSION:
3607 RETURN(put_user(inode->i_generation, (int __user *)arg));
3608 /* We need to special case any other ioctls we want to handle,
3609 * to send them to the MDS/OST as appropriate and to properly
3610 * network encode the arg field. */
3611 case FS_IOC_SETVERSION:
3614 case LL_IOC_GROUP_LOCK:
3615 RETURN(ll_get_grouplock(inode, file, arg));
3616 case LL_IOC_GROUP_UNLOCK:
3617 RETURN(ll_put_grouplock(inode, file, arg));
3618 case IOC_OBD_STATFS:
3619 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3621 case LL_IOC_FLUSHCTX:
3622 RETURN(ll_flush_ctx(inode));
3623 case LL_IOC_PATH2FID: {
3624 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3625 sizeof(struct lu_fid)))
3630 case LL_IOC_GETPARENT:
3631 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3633 case OBD_IOC_FID2PATH:
3634 RETURN(ll_fid2path(inode, (void __user *)arg));
3635 case LL_IOC_DATA_VERSION: {
3636 struct ioc_data_version idv;
3639 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
/* only the documented flush flags are honoured from userspace */
3642 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3643 rc = ll_ioc_data_version(inode, &idv);
3646 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3652 case LL_IOC_GET_MDTIDX: {
3655 mdtidx = ll_get_mdt_idx(inode);
3659 if (put_user((int)mdtidx, (int __user *)arg))
3664 case OBD_IOC_GETDTNAME:
3665 case OBD_IOC_GETMDNAME:
3666 RETURN(ll_get_obd_name(inode, cmd, arg));
3667 case LL_IOC_HSM_STATE_GET: {
3668 struct md_op_data *op_data;
3669 struct hsm_user_state *hus;
3676 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3677 LUSTRE_OPC_ANY, hus);
3678 if (IS_ERR(op_data)) {
3680 RETURN(PTR_ERR(op_data));
3683 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3686 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3689 ll_finish_md_op_data(op_data);
3693 case LL_IOC_HSM_STATE_SET: {
3694 struct hsm_state_set *hss;
3701 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3706 rc = ll_hsm_state_set(inode, hss);
3711 case LL_IOC_HSM_ACTION: {
3712 struct md_op_data *op_data;
3713 struct hsm_current_action *hca;
3720 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3721 LUSTRE_OPC_ANY, hca);
3722 if (IS_ERR(op_data)) {
3724 RETURN(PTR_ERR(op_data));
3727 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3730 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3733 ll_finish_md_op_data(op_data);
3737 case LL_IOC_SET_LEASE_OLD: {
/* legacy variant: mode only, no flags/ids payload */
3738 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3740 RETURN(ll_file_set_lease(file, &ioc, 0));
3742 case LL_IOC_SET_LEASE: {
3743 struct ll_ioc_lease ioc;
3745 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3748 RETURN(ll_file_set_lease(file, &ioc, arg));
3750 case LL_IOC_GET_LEASE: {
3751 struct ll_inode_info *lli = ll_i2info(inode);
3752 struct ldlm_lock *lock = NULL;
3755 mutex_lock(&lli->lli_och_mutex);
3756 if (fd->fd_lease_och != NULL) {
3757 struct obd_client_handle *och = fd->fd_lease_och;
3759 lock = ldlm_handle2lock(&och->och_lease_handle);
3761 lock_res_and_lock(lock);
/* a cancelled lock means the lease is already gone */
3762 if (!ldlm_is_cancel(lock))
3763 fmode = och->och_flags;
3765 unlock_res_and_lock(lock);
3766 LDLM_LOCK_PUT(lock);
3769 mutex_unlock(&lli->lli_och_mutex);
3771 RETURN(ll_lease_type_from_fmode(fmode));
3773 case LL_IOC_HSM_IMPORT: {
3774 struct hsm_user_import *hui;
3780 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3785 rc = ll_hsm_import(inode, file, hui);
3790 case LL_IOC_FUTIMES_3: {
3791 struct ll_futimes_3 lfu;
3793 if (copy_from_user(&lfu,
3794 (const struct ll_futimes_3 __user *)arg,
3798 RETURN(ll_file_futimes_3(file, &lfu));
3800 case LL_IOC_LADVISE: {
3801 struct llapi_ladvise_hdr *k_ladvise_hdr;
3802 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3805 int alloc_size = sizeof(*k_ladvise_hdr);
3808 u_ladvise_hdr = (void __user *)arg;
/* first pass: read the fixed header to learn the advice count */
3809 OBD_ALLOC_PTR(k_ladvise_hdr);
3810 if (k_ladvise_hdr == NULL)
3813 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3814 GOTO(out_ladvise, rc = -EFAULT);
3816 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3817 k_ladvise_hdr->lah_count < 1)
3818 GOTO(out_ladvise, rc = -EINVAL);
3820 num_advise = k_ladvise_hdr->lah_count;
3821 if (num_advise >= LAH_COUNT_MAX)
3822 GOTO(out_ladvise, rc = -EFBIG);
/* second pass: reallocate sized for all advices and re-copy */
3824 OBD_FREE_PTR(k_ladvise_hdr);
3825 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3826 lah_advise[num_advise]);
3827 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3828 if (k_ladvise_hdr == NULL)
3832 * TODO: submit multiple advices to one server in a single RPC
3834 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3835 GOTO(out_ladvise, rc = -EFAULT);
3837 for (i = 0; i < num_advise; i++) {
3838 struct llapi_lu_ladvise *k_ladvise =
3839 &k_ladvise_hdr->lah_advise[i];
3840 struct llapi_lu_ladvise __user *u_ladvise =
3841 &u_ladvise_hdr->lah_advise[i];
3843 rc = ll_ladvise_sanity(inode, k_ladvise);
3845 GOTO(out_ladvise, rc);
3847 switch (k_ladvise->lla_advice) {
3848 case LU_LADVISE_LOCKNOEXPAND:
3849 rc = ll_lock_noexpand(file,
3850 k_ladvise->lla_peradvice_flags);
3851 GOTO(out_ladvise, rc);
3852 case LU_LADVISE_LOCKAHEAD:
3854 rc = ll_file_lock_ahead(file, k_ladvise);
3857 GOTO(out_ladvise, rc);
/* per-advice detailed result is written back to userspace */
3860 &u_ladvise->lla_lockahead_result))
3861 GOTO(out_ladvise, rc = -EFAULT);
3864 rc = ll_ladvise(inode, file,
3865 k_ladvise_hdr->lah_flags,
3868 GOTO(out_ladvise, rc);
3875 OBD_FREE(k_ladvise_hdr, alloc_size);
3878 case LL_IOC_FLR_SET_MIRROR: {
3879 /* mirror I/O must be direct to avoid polluting page cache
3881 if (!(file->f_flags & O_DIRECT))
3884 fd->fd_designated_mirror = (__u32)arg;
3887 case LL_IOC_FSGETXATTR:
3888 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3889 case LL_IOC_FSSETXATTR:
3890 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3892 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3893 case LL_IOC_HEAT_GET: {
3894 struct lu_heat uheat;
3895 struct lu_heat *heat;
3898 if (copy_from_user(&uheat, (void __user *)arg, sizeof(uheat)))
/* clamp so the copy below never exceeds the kernel-side counters */
3901 if (uheat.lh_count > OBD_HEAT_COUNT)
3902 uheat.lh_count = OBD_HEAT_COUNT;
3904 size = offsetof(typeof(uheat), lh_heat[uheat.lh_count]);
3905 OBD_ALLOC(heat, size);
3909 heat->lh_count = uheat.lh_count;
3910 ll_heat_get(inode, heat);
3911 rc = copy_to_user((char __user *)arg, heat, size);
3912 OBD_FREE(heat, size);
3913 RETURN(rc ? -EFAULT : 0);
3915 case LL_IOC_HEAT_SET: {
3918 if (copy_from_user(&flags, (void __user *)arg, sizeof(flags)))
3921 rc = ll_heat_set(inode, flags);
3924 case LL_IOC_PCC_DETACH: {
3925 struct lu_pcc_detach *detach;
3927 OBD_ALLOC_PTR(detach);
3931 if (copy_from_user(detach,
3932 (const struct lu_pcc_detach __user *)arg,
3934 GOTO(out_detach_free, rc = -EFAULT);
3936 if (!S_ISREG(inode->i_mode))
3937 GOTO(out_detach_free, rc = -EINVAL);
3939 if (!inode_owner_or_capable(inode))
3940 GOTO(out_detach_free, rc = -EPERM);
3942 rc = pcc_ioctl_detach(inode, detach->pccd_opt);
3944 OBD_FREE(detach);
3947 case LL_IOC_PCC_STATE: {
3948 struct lu_pcc_state __user *ustate =
3949 (struct lu_pcc_state __user *)arg;
3950 struct lu_pcc_state *state;
3952 OBD_ALLOC_PTR(state);
3956 if (copy_from_user(state, ustate, sizeof(*state)))
3957 GOTO(out_state, rc = -EFAULT);
3959 rc = pcc_ioctl_state(file, inode, state);
3961 GOTO(out_state, rc);
3963 if (copy_to_user(ustate, state, sizeof(*state)))
3964 GOTO(out_state, rc = -EFAULT);
3967 OBD_FREE_PTR(state);
/* default: pass unrecognized commands through to the data export */
3971 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3972 (void __user *)arg));
3976 #ifndef HAVE_FILE_LLSEEK_SIZE
/* Compat fallback for kernels without generic_file_llseek_size():
 * validate the target offset against maxsize and FMODE_UNSIGNED_OFFSET,
 * then commit it to f_pos, resetting f_version on change.
 * NOTE(review): the error RETURNs and final return are elided here. */
3977 static inline loff_t
3978 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3980 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3982 if (offset > maxsize)
3985 if (offset != file->f_pos) {
3986 file->f_pos = offset;
3987 file->f_version = 0;
/*
 * Compat implementation of generic_file_llseek_size() (still inside the
 * #ifndef HAVE_FILE_LLSEEK_SIZE block above): handles SEEK_SET/CUR/END
 * and SEEK_DATA/SEEK_HOLE against the supplied maxsize and eof.
 * NOTE(review): return type, switch labels and several branches are
 * elided in this listing.
 */
3993 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3994 loff_t maxsize, loff_t eof)
3996 struct inode *inode = file_inode(file);
4004 * Here we special-case the lseek(fd, 0, SEEK_CUR)
4005 * position-querying operation. Avoid rewriting the "same"
4006 * f_pos value back to the file because a concurrent read(),
4007 * write() or lseek() might have altered it
4012 * f_lock protects against read/modify/write race with other
4013 * SEEK_CURs. Note that parallel writes and reads behave
4017 offset = llseek_execute(file, file->f_pos + offset, maxsize);
4018 inode_unlock(inode);
4022 * In the generic case the entire file is data, so as long as
4023 * offset isn't at the end of the file then the offset is data.
4030 * There is a virtual hole at the end of the file, so as long as
4031 * offset isn't i_size or larger, return i_size.
4039 return llseek_execute(file, offset, maxsize);
/*
 * llseek entry point for Lustre files.  For SEEK_END/SEEK_HOLE/SEEK_DATA a
 * glimpse RPC refreshes the cluster-wide file size before the generic seek
 * logic runs; the elapsed time is tallied into the LLSEEK stats counter.
 */
4043 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
4045 struct inode *inode = file_inode(file);
4046 loff_t retval, eof = 0;
4047 ktime_t kstart = ktime_get();
/* provisional absolute target, used only for the trace message below */
4050 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
4051 (origin == SEEK_CUR) ? file->f_pos : 0);
4052 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
4053 PFID(ll_inode2fid(inode)), inode, retval, retval,
/* these origins need an up-to-date i_size: glimpse the OSTs first */
4056 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
4057 retval = ll_glimpse_size(inode);
4060 eof = i_size_read(inode);
4063 retval = ll_generic_file_llseek_size(file, offset, origin,
4064 ll_file_maxbytes(inode), eof);
4066 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK,
4067 ktime_us_delta(ktime_get(), kstart));
/*
 * flush() file operation: report (and clear) any async writeback error
 * recorded for this inode.  Returns -EIO on a pending error unless the
 * application was already told about the write failure via fd_write_failed.
 */
4071 static int ll_flush(struct file *file, fl_owner_t id)
4073 struct inode *inode = file_inode(file);
4074 struct ll_inode_info *lli = ll_i2info(inode);
4075 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
/* flush is only wired up for regular files, never directories */
4078 LASSERT(!S_ISDIR(inode->i_mode));
4080 /* catch async errors that were recorded back when async writeback
4081 * failed for pages in this mapping. */
4082 rc = lli->lli_async_rc;
/* consume the sticky error so it is reported exactly once */
4083 lli->lli_async_rc = 0;
4084 if (lli->lli_clob != NULL) {
4085 err = lov_read_and_clear_async_rc(lli->lli_clob);
4090 /* The application has been told write failure already.
4091 * Do not report failure again. */
4092 if (fd->fd_write_failed)
4094 return rc ? -EIO : 0;
4098 * Called to make sure a portion of file has been written out.
4099 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
4101 * Return how many pages have been written.
4103 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
4104 enum cl_fsync_mode mode, int ignore_layout)
4108 struct cl_fsync_io *fio;
/* reject any mode outside the known cl_fsync_mode set */
4113 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
4114 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
4117 env = cl_env_get(&refcheck);
4119 RETURN(PTR_ERR(env));
/* build a CIT_FSYNC cl_io describing the [start, end] range */
4121 io = vvp_env_thread_io(env);
4122 io->ci_obj = ll_i2info(inode)->lli_clob;
4123 io->ci_ignore_layout = ignore_layout;
4125 /* initialize parameters for sync */
4126 fio = &io->u.ci_fsync;
4127 fio->fi_start = start;
4129 fio->fi_fid = ll_inode2fid(inode);
4130 fio->fi_mode = mode;
4131 fio->fi_nr_written = 0;
4133 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
4134 result = cl_io_loop(env, io);
4136 result = io->ci_result;
/* on success the page count written by the fsync io is the result */
4138 result = fio->fi_nr_written;
4139 cl_io_fini(env, io);
4140 cl_env_put(env, &refcheck);
4146 * When dentry is provided (the 'else' case), file_dentry() may be
4147 * null and dentry must be used directly rather than pulled from
4148 * file_dentry() as is done otherwise.
/*
 * fsync()/fdatasync() entry point: wait for local dirty pages, surface any
 * recorded async writeback error, fsync metadata on the MDT, then sync the
 * cached data (via PCC if attached, else cl_sync_file_range).  Tracks
 * success/failure in fd_write_failed and tallies FSYNC stats.
 */
4151 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
4153 struct dentry *dentry = file_dentry(file);
4154 struct inode *inode = dentry->d_inode;
4155 struct ll_inode_info *lli = ll_i2info(inode);
4156 struct ptlrpc_request *req;
4157 ktime_t kstart = ktime_get();
4162 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), start %lld, end %lld,"
4164 PFID(ll_inode2fid(inode)), inode, start, end, datasync);
4166 /* fsync's caller has already called _fdata{sync,write}, we want
4167 * that IO to finish before calling the osc and mdc sync methods */
4168 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
4171 /* catch async errors that were recorded back when async writeback
4172 * failed for pages in this mapping. */
4173 if (!S_ISDIR(inode->i_mode)) {
4174 err = lli->lli_async_rc;
4175 lli->lli_async_rc = 0;
4178 if (lli->lli_clob != NULL) {
4179 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* sync metadata on the MDT before syncing file data */
4185 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
4189 ptlrpc_req_finished(req);
4191 if (S_ISREG(inode->i_mode)) {
4192 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
4195 /* Sync metadata on MDT first, and then sync the cached data
/* try PCC first; "cached" says whether PCC handled this file */
4198 err = pcc_fsync(file, start, end, datasync, &cached);
4200 err = cl_sync_file_range(inode, start, end,
4202 if (rc == 0 && err < 0)
4205 fd->fd_write_failed = true;
4207 fd->fd_write_failed = false;
4210 inode_unlock(inode);
4213 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC,
4214 ktime_us_delta(ktime_get(), kstart));
/*
 * flock()/fcntl() lock handler: translate the kernel file_lock into an
 * LDLM_FLOCK enqueue against the MDT, then mirror the server's decision in
 * the local VFS lock tables (locks_lock_file_wait and friends).  If the
 * local step fails the server lock is dropped again with an LCK_NL enqueue.
 */
4219 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
4221 struct inode *inode = file_inode(file);
4222 struct ll_sb_info *sbi = ll_i2sbi(inode);
4223 struct ldlm_enqueue_info einfo = {
4224 .ei_type = LDLM_FLOCK,
4225 .ei_cb_cp = ldlm_flock_completion_ast,
4226 .ei_cbdata = file_lock,
4228 struct md_op_data *op_data;
4229 struct lustre_handle lockh = { 0 };
4230 union ldlm_policy_data flock = { { 0 } };
/* remember the requested type; einfo.ei_mode overwrites fl_type below */
4231 int fl_type = file_lock->fl_type;
4232 ktime_t kstart = ktime_get();
4238 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
4239 PFID(ll_inode2fid(inode)), file_lock);
4241 if (file_lock->fl_flags & FL_FLOCK) {
4242 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
4243 /* flocks are whole-file locks */
4244 flock.l_flock.end = OFFSET_MAX;
4245 /* For flocks owner is determined by the local file desctiptor*/
4246 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
4247 } else if (file_lock->fl_flags & FL_POSIX) {
4248 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
4249 flock.l_flock.start = file_lock->fl_start;
4250 flock.l_flock.end = file_lock->fl_end;
4254 flock.l_flock.pid = file_lock->fl_pid;
4256 #if defined(HAVE_LM_COMPARE_OWNER) || defined(lm_compare_owner)
4257 /* Somewhat ugly workaround for svc lockd.
4258 * lockd installs custom fl_lmops->lm_compare_owner that checks
4259 * for the fl_owner to be the same (which it always is on local node
4260 * I guess between lockd processes) and then compares pid.
4261 * As such we assign pid to the owner field to make it all work,
4262 * conflict with normal locks is unlikely since pid space and
4263 * pointer space for current->files are not intersecting */
4264 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
4265 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* map fcntl lock types to DLM modes: read -> PR */
4270 einfo.ei_mode = LCK_PR;
4273 /* An unlock request may or may not have any relation to
4274 * existing locks so we may not be able to pass a lock handle
4275 * via a normal ldlm_lock_cancel() request. The request may even
4276 * unlock a byte range in the middle of an existing lock. In
4277 * order to process an unlock request we need all of the same
4278 * information that is given with a normal read or write record
4279 * lock request. To avoid creating another ldlm unlock (cancel)
4280 * message we'll treat a LCK_NL flock request as an unlock. */
4281 einfo.ei_mode = LCK_NL;
/* write lock -> PW */
4284 einfo.ei_mode = LCK_PW;
4287 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* non-blocking set request */
4302 flags = LDLM_FL_BLOCK_NOWAIT;
/* F_GETLK-style probe: test only, take no lock */
4308 flags = LDLM_FL_TEST_LOCK;
4311 CERROR("unknown fcntl lock command: %d\n", cmd);
4315 /* Save the old mode so that if the mode in the lock changes we
4316 * can decrement the appropriate reader or writer refcount. */
4317 file_lock->fl_type = einfo.ei_mode;
4319 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4320 LUSTRE_OPC_ANY, NULL);
4321 if (IS_ERR(op_data))
4322 RETURN(PTR_ERR(op_data));
4324 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4325 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4326 flock.l_flock.pid, flags, einfo.ei_mode,
4327 flock.l_flock.start, flock.l_flock.end);
/* ask the MDT for the flock lock */
4329 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4332 /* Restore the file lock type if not TEST lock. */
4333 if (!(flags & LDLM_FL_TEST_LOCK))
4334 file_lock->fl_type = fl_type;
4336 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4337 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4338 !(flags & LDLM_FL_TEST_LOCK))
4339 rc2 = locks_lock_file_wait(file, file_lock);
4341 if ((file_lock->fl_flags & FL_FLOCK) &&
4342 (rc == 0 || file_lock->fl_type == F_UNLCK))
4343 rc2 = flock_lock_file_wait(file, file_lock);
4344 if ((file_lock->fl_flags & FL_POSIX) &&
4345 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4346 !(flags & LDLM_FL_TEST_LOCK))
4347 rc2 = posix_lock_file_wait(file, file_lock);
4348 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
/* local bookkeeping failed: undo the server-side lock with LCK_NL */
4350 if (rc2 && file_lock->fl_type != F_UNLCK) {
4351 einfo.ei_mode = LCK_NL;
4352 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4357 ll_finish_md_op_data(op_data);
4360 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK,
4361 ktime_us_delta(ktime_get(), kstart));
/*
 * Look up the FID of @name under @parent via an MDS getattr-by-name RPC.
 * On success *fid is filled in; if @inode is non-NULL the inode is also
 * instantiated from the reply (ll_prep_inode).
 */
4365 int ll_get_fid_by_name(struct inode *parent, const char *name,
4366 int namelen, struct lu_fid *fid,
4367 struct inode **inode)
4369 struct md_op_data *op_data = NULL;
4370 struct mdt_body *body;
4371 struct ptlrpc_request *req;
4375 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4376 LUSTRE_OPC_ANY, NULL);
4377 if (IS_ERR(op_data))
4378 RETURN(PTR_ERR(op_data));
/* we only need the FID and the file type back from the MDS */
4380 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4381 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4382 ll_finish_md_op_data(op_data);
4386 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4388 GOTO(out_req, rc = -EFAULT);
4390 *fid = body->mbo_fid1;
4393 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4395 ptlrpc_req_finished(req);
/*
 * Migrate the child @name of @parent to another MDT as described by @lum
 * (used by "lfs migrate").  Resolves the child inode, takes a write lease
 * for regular files so data version can be pinned, and issues the migration
 * as an md_rename with CLI_MIGRATE set.  Retries on -EAGAIN (lease lost).
 */
4399 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4402 struct dentry *dchild = NULL;
4403 struct inode *child_inode = NULL;
4404 struct md_op_data *op_data;
4405 struct ptlrpc_request *request = NULL;
4406 struct obd_client_handle *och = NULL;
4408 struct mdt_body *body;
4409 __u64 data_version = 0;
4410 size_t namelen = strlen(name);
4411 int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4415 CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4416 PFID(ll_inode2fid(parent)), name,
4417 lum->lum_stripe_offset, lum->lum_stripe_count);
/* lum arrives in little-endian; swab unless the magic already matches */
4419 if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4420 lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4421 lustre_swab_lmv_user_md(lum);
4423 /* Get child FID first */
4424 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4427 dchild = d_lookup(file_dentry(file), &qstr);
4429 if (dchild->d_inode)
4430 child_inode = igrab(dchild->d_inode);
/* dcache miss: resolve the child through the MDS instead */
4435 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
4444 if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
4445 OBD_CONNECT2_DIR_MIGRATE)) {
4446 if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
4447 ll_dir_striped(child_inode)) {
4448 CERROR("%s: MDT doesn't support stripe directory "
4449 "migration!\n", ll_i2sbi(parent)->ll_fsname);
4450 GOTO(out_iput, rc = -EOPNOTSUPP);
4455 * lfs migrate command needs to be blocked on the client
4456 * by checking the migrate FID against the FID of the
/* never migrate the filesystem root */
4459 if (child_inode == parent->i_sb->s_root->d_inode)
4460 GOTO(out_iput, rc = -EINVAL);
4462 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4463 child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4464 if (IS_ERR(op_data))
4465 GOTO(out_iput, rc = PTR_ERR(op_data));
4467 inode_lock(child_inode);
4468 op_data->op_fid3 = *ll_inode2fid(child_inode);
4469 if (!fid_is_sane(&op_data->op_fid3)) {
4470 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4471 ll_i2sbi(parent)->ll_fsname, name,
4472 PFID(&op_data->op_fid3));
4473 GOTO(out_unlock, rc = -EINVAL);
4476 op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4477 op_data->op_data = lum;
4478 op_data->op_data_size = lumlen;
/* regular files: take a write lease and pin the data version so the
 * MDT can detect concurrent modification during migration */
4481 if (S_ISREG(child_inode->i_mode)) {
4482 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4486 GOTO(out_unlock, rc);
4489 rc = ll_data_version(child_inode, &data_version,
4492 GOTO(out_close, rc);
4494 op_data->op_open_handle = och->och_open_handle;
4495 op_data->op_data_version = data_version;
4496 op_data->op_lease_handle = och->och_lease_handle;
4497 op_data->op_bias |= MDS_CLOSE_MIGRATE;
/* stop replaying the open request; migration will close it */
4499 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4500 och->och_mod->mod_open_req->rq_replay = 0;
4501 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
/* migration is implemented server-side as a same-name rename */
4504 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4505 name, namelen, &request);
4507 LASSERT(request != NULL);
4508 ll_update_times(request, parent);
4511 if (rc == 0 || rc == -EAGAIN) {
4512 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4513 LASSERT(body != NULL);
4515 /* If the server does release layout lock, then we cleanup
4516 * the client och here, otherwise release it in out_close: */
4517 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4518 obd_mod_put(och->och_mod);
4519 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4521 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
4527 if (request != NULL) {
4528 ptlrpc_req_finished(request);
4532 /* Try again if the lease has cancelled. */
4533 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4538 ll_lease_close(och, child_inode, NULL);
4540 clear_nlink(child_inode);
4542 inode_unlock(child_inode);
4543 ll_finish_md_op_data(op_data);
/*
 * Stub lock handler installed when the client is mounted without flock
 * support: warn once per file (rate-limited) and reject the request.
 */
4550 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4552 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
4556 * In order to avoid flood of warning messages, only print one message
4557 * for one file. And the entire message rate on the client is limited
4558 * by CDEBUG_LIMIT too.
4560 if (!(fd->fd_flags & LL_FILE_FLOCK_WARNING)) {
4561 fd->fd_flags |= LL_FILE_FLOCK_WARNING;
4562 CDEBUG_LIMIT(D_TTY | D_CONSOLE,
4563 "flock disabled, mount with '-o [local]flock' to enable\r\n");
4569 * test if some locks matching bits and l_req_mode are acquired
4570 * - bits can be in different locks
4571 * - if found clear the common lock bits in *bits
4572 * - the bits not found, are kept in *bits
4574 * \param bits [IN] searched lock bits [IN]
4575 * \param l_req_mode [IN] searched lock mode
4576 * \retval boolean, true iff all bits are found
4578 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4580 struct lustre_handle lockh;
4581 union ldlm_policy_data policy;
/* LCK_MINMODE means "any mode": match CR|CW|PR|PW */
4582 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4583 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4592 fid = &ll_i2info(inode)->lli_fid;
4593 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4594 ldlm_lockname[mode]);
/* TEST_LOCK: probe only, take no new references on matched locks */
4596 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4597 for (i = 0; i < MDS_INODELOCK_NUMBITS && *bits != 0; i++) {
4598 policy.l_inodebits.bits = *bits & (1 << i);
4599 if (policy.l_inodebits.bits == 0)
4602 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4603 &policy, mode, &lockh)) {
4604 struct ldlm_lock *lock;
4606 lock = ldlm_handle2lock(&lockh);
4609 ~(lock->l_policy_data.l_inodebits.bits);
4610 LDLM_LOCK_PUT(lock);
4612 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match (and take a reference on) an existing MDS inodebits lock
 * covering @bits in @mode; returns the matched mode, with the handle in
 * @lockh on success.
 */
4619 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4620 struct lustre_handle *lockh, __u64 flags,
4621 enum ldlm_mode mode)
4623 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4628 fid = &ll_i2info(inode)->lli_fid;
4629 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4631 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4632 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the result of an inode revalidation RPC: translate -ENOENT
 * for unlinked-but-cached inodes, and log other failures (rate-limited,
 * with -EACCES/-EIDRM demoted to D_INFO).
 */
4637 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4639 /* Already unlinked. Just update nlink and return success */
4640 if (rc == -ENOENT) {
4642 /* If it is striped directory, and there is bad stripe
4643 * Let's revalidate the dentry again, instead of returning
4645 if (ll_dir_striped(inode))
4648 /* This path cannot be hit for regular files unless in
4649 * case of obscure races, so no need to to validate
4651 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4653 } else if (rc != 0) {
4654 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4655 "%s: revalidate FID "DFID" error: rc = %d\n",
4656 ll_i2sbi(inode)->ll_fsname,
4657 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate a dentry's inode against the MDS via an intent lock
 * (getattr-by-FID).  On success the reply refreshes the dentry; an inode
 * found to be unlinked is invalidated in the dcache.
 */
4663 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4665 struct inode *inode = dentry->d_inode;
4666 struct obd_export *exp = ll_i2mdexp(inode);
4667 struct lookup_intent oit = {
4670 struct ptlrpc_request *req = NULL;
4671 struct md_op_data *op_data;
4675 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4676 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4678 /* Call getattr by fid, so do not provide name at all. */
4679 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4680 LUSTRE_OPC_ANY, NULL);
4681 if (IS_ERR(op_data))
4682 RETURN(PTR_ERR(op_data));
4684 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4685 ll_finish_md_op_data(op_data);
4687 rc = ll_inode_revalidate_fini(inode, rc);
4691 rc = ll_revalidate_it_finish(req, &oit, dentry);
4693 ll_intent_release(&oit);
4697 /* Unlinked? Unhash dentry, so it is not picked up later by
4698 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4699 * here to preserve get_cwd functionality on 2.6.
4701 if (!dentry->d_inode->i_nlink) {
4702 spin_lock(&inode->i_lock);
4703 d_lustre_invalidate(dentry, 0);
4704 spin_unlock(&inode->i_lock);
4707 ll_lookup_finish_locks(&oit, dentry);
4709 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge per-stripe attributes (nlink, blocks,
 * size, a/m/ctime) from all MDT stripes into the master inode.
 */
4714 static int ll_merge_md_attr(struct inode *inode)
4716 struct ll_inode_info *lli = ll_i2info(inode);
4717 struct cl_attr attr = { 0 };
4720 LASSERT(lli->lli_lsm_md != NULL);
/* plain (non-striped) directories need no merging */
4722 if (!lmv_dir_striped(lli->lli_lsm_md))
/* lli_lsm_sem guards the stripe layout while attributes are gathered */
4725 down_read(&lli->lli_lsm_sem);
4726 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4727 &attr, ll_md_blocking_ast);
4728 up_read(&lli->lli_lsm_sem);
4732 set_nlink(inode, attr.cat_nlink);
4733 inode->i_blocks = attr.cat_blocks;
4734 i_size_write(inode, attr.cat_size);
4736 ll_i2info(inode)->lli_atime = attr.cat_atime;
4737 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4738 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/*
 * Core getattr implementation: revalidate the inode with the MDS, refresh
 * the file size (PCC getattr or glimpse for regular files, attribute merge
 * for striped directories), then fill *stat from the inode.  Inode numbers
 * and device ids are compressed for 32-bit API callers.
 */
4743 int ll_getattr_dentry(struct dentry *de, struct kstat *stat)
4745 struct inode *inode = de->d_inode;
4746 struct ll_sb_info *sbi = ll_i2sbi(inode);
4747 struct ll_inode_info *lli = ll_i2info(inode);
4748 ktime_t kstart = ktime_get();
4751 rc = ll_inode_revalidate(de, IT_GETATTR);
4755 if (S_ISREG(inode->i_mode)) {
/* PCC-cached files get their attributes from the PCC copy */
4758 rc = pcc_inode_getattr(inode, &cached);
4759 if (cached && rc < 0)
4762 /* In case of restore, the MDT has the right size and has
4763 * already send it back without granting the layout lock,
4764 * inode is up-to-date so glimpse is useless.
4765 * Also to glimpse we need the layout, in case of a running
4766 * restore the MDT holds the layout lock so the glimpse will
4767 * block up to the end of restore (getattr will block)
4769 if (!cached && !ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4770 rc = ll_glimpse_size(inode);
4775 /* If object isn't regular a file then don't validate size. */
4776 if (ll_dir_striped(inode)) {
4777 rc = ll_merge_md_attr(inode);
4782 inode->i_atime.tv_sec = lli->lli_atime;
4783 inode->i_mtime.tv_sec = lli->lli_mtime;
4784 inode->i_ctime.tv_sec = lli->lli_ctime;
/* fault-injection hook used by sanity tests */
4787 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4789 if (ll_need_32bit_api(sbi)) {
4790 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4791 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4792 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4794 stat->ino = inode->i_ino;
4795 stat->dev = inode->i_sb->s_dev;
4796 stat->rdev = inode->i_rdev;
4799 stat->mode = inode->i_mode;
4800 stat->uid = inode->i_uid;
4801 stat->gid = inode->i_gid;
4802 stat->atime = inode->i_atime;
4803 stat->mtime = inode->i_mtime;
4804 stat->ctime = inode->i_ctime;
/* prefer the tunable stat blocksize when the admin has set one */
4805 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4807 stat->nlink = inode->i_nlink;
4808 stat->size = i_size_read(inode);
4809 stat->blocks = inode->i_blocks;
4811 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR,
4812 ktime_us_delta(ktime_get(), kstart));
4817 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
/* VFS getattr shims for both kernel API variants; both delegate to
 * ll_getattr_dentry(). */
4818 int ll_getattr(const struct path *path, struct kstat *stat,
4819 u32 request_mask, unsigned int flags)
4821 struct dentry *de = path->dentry;
4823 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4826 return ll_getattr_dentry(de, stat);
/*
 * fiemap inode operation: marshal the kernel's fiemap_extent_info into a
 * struct fiemap buffer (including any user-supplied extent array), run the
 * mapping via ll_do_fiemap(), and copy the resulting extents back out.
 */
4829 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4830 __u64 start, __u64 len)
4834 struct fiemap *fiemap;
4835 unsigned int extent_count = fieinfo->fi_extents_max;
4837 num_bytes = sizeof(*fiemap) + (extent_count *
4838 sizeof(struct fiemap_extent));
4839 OBD_ALLOC_LARGE(fiemap, num_bytes);
4844 fiemap->fm_flags = fieinfo->fi_flags;
4845 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4846 fiemap->fm_start = start;
4847 fiemap->fm_length = len;
/* only the first user extent is read in; it may carry input hints */
4848 if (extent_count > 0 &&
4849 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4850 sizeof(struct fiemap_extent)) != 0)
4851 GOTO(out, rc = -EFAULT);
4853 rc = ll_do_fiemap(inode, fiemap, num_bytes);
4855 fieinfo->fi_flags = fiemap->fm_flags;
4856 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4857 if (extent_count > 0 &&
4858 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4859 fiemap->fm_mapped_extents *
4860 sizeof(struct fiemap_extent)) != 0)
4861 GOTO(out, rc = -EFAULT);
4863 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * get_acl inode operation: return a referenced copy of the cached POSIX
 * ACL under lli_lock.  May return NULL when no ACL is cached.
 */
4867 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4869 struct ll_inode_info *lli = ll_i2info(inode);
4870 struct posix_acl *acl = NULL;
4873 spin_lock(&lli->lli_lock);
4874 /* VFS' acl_permission_check->check_acl will release the refcount */
4875 acl = posix_acl_dup(lli->lli_posix_acl);
4876 spin_unlock(&lli->lli_lock);
4881 #ifdef HAVE_IOP_SET_ACL
4882 #ifdef CONFIG_LUSTRE_FS_POSIX_ACL
/*
 * set_acl inode operation: serialize the ACL to its xattr form and store
 * (or remove, when @acl is NULL) it on the MDS via md_setxattr(), then
 * update the local ACL cache.  ACCESS ACLs may also rewrite i_mode.
 */
4883 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4885 struct ll_sb_info *sbi = ll_i2sbi(inode);
4886 struct ptlrpc_request *req = NULL;
4887 const char *name = NULL;
4889 size_t value_size = 0;
4894 case ACL_TYPE_ACCESS:
4895 name = XATTR_NAME_POSIX_ACL_ACCESS;
/* an access ACL can fold its mask back into the file mode */
4897 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4900 case ACL_TYPE_DEFAULT:
4901 name = XATTR_NAME_POSIX_ACL_DEFAULT;
/* default ACLs are only meaningful on directories */
4902 if (!S_ISDIR(inode->i_mode))
4903 rc = acl ? -EACCES : 0;
4914 value_size = posix_acl_xattr_size(acl->a_count);
4915 value = kmalloc(value_size, GFP_NOFS);
4917 GOTO(out, rc = -ENOMEM);
4919 rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
4921 GOTO(out_value, rc);
/* NULL value means remove the xattr (OBD_MD_FLXATTRRM) */
4924 rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4925 value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
4926 name, value, value_size, 0, 0, &req);
4928 ptlrpc_req_finished(req);
4933 forget_cached_acl(inode, type);
4935 set_cached_acl(inode, type, acl);
4938 #endif /* CONFIG_LUSTRE_FS_POSIX_ACL */
4939 #endif /* HAVE_IOP_SET_ACL */
/*
 * permission inode operation: revalidate the root inode when needed, apply
 * root-squash by temporarily overriding the task credentials (fsuid/fsgid
 * and FS capabilities), then defer to generic_permission().
 */
4941 int ll_inode_permission(struct inode *inode, int mask)
4944 struct ll_sb_info *sbi;
4945 struct root_squash_info *squash;
4946 struct cred *cred = NULL;
4947 const struct cred *old_cred = NULL;
4949 bool squash_id = false;
4950 ktime_t kstart = ktime_get();
/* RCU-walk mode: cannot block on RPCs here */
4953 if (mask & MAY_NOT_BLOCK)
4956 /* as root inode are NOT getting validated in lookup operation,
4957 * need to do it before permission check. */
4959 if (inode == inode->i_sb->s_root->d_inode) {
4960 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4965 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4966 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4968 /* squash fsuid/fsgid if needed */
4969 sbi = ll_i2sbi(inode);
4970 squash = &sbi->ll_squash;
4971 if (unlikely(squash->rsi_uid != 0 &&
4972 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4973 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4977 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4978 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4979 squash->rsi_uid, squash->rsi_gid);
4981 /* update current process's credentials
4982 * and FS capability */
4983 cred = prepare_creds();
4987 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4988 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* drop every filesystem-related capability from the squashed creds */
4989 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4990 if ((1 << cap) & CFS_CAP_FS_MASK)
4991 cap_lower(cred->cap_effective, cap);
4993 old_cred = override_creds(cred);
4996 rc = generic_permission(inode, mask);
4997 /* restore current process's credentials and FS capability */
4999 revert_creds(old_cred);
5004 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM,
5005 ktime_us_delta(ktime_get(), kstart));
5010 /* -o localflock - only provides locally consistent flock locks */
/* Default file_operations: no .flock/.lock hooks, so the kernel falls back
 * to purely local lock handling. */
5011 struct file_operations ll_file_operations = {
5012 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
5013 # ifdef HAVE_SYNC_READ_WRITE
5014 .read = new_sync_read,
5015 .write = new_sync_write,
5017 .read_iter = ll_file_read_iter,
5018 .write_iter = ll_file_write_iter,
5019 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5020 .read = ll_file_read,
5021 .aio_read = ll_file_aio_read,
5022 .write = ll_file_write,
5023 .aio_write = ll_file_aio_write,
5024 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5025 .unlocked_ioctl = ll_file_ioctl,
5026 .open = ll_file_open,
5027 .release = ll_file_release,
5028 .mmap = ll_file_mmap,
5029 .llseek = ll_file_seek,
5030 .splice_read = ll_file_splice_read,
/* file_operations used with -o flock: cluster-coherent locking via
 * ll_file_flock for both flock() and fcntl locks. */
5035 struct file_operations ll_file_operations_flock = {
5036 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
5037 # ifdef HAVE_SYNC_READ_WRITE
5038 .read = new_sync_read,
5039 .write = new_sync_write,
5040 # endif /* HAVE_SYNC_READ_WRITE */
5041 .read_iter = ll_file_read_iter,
5042 .write_iter = ll_file_write_iter,
5043 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5044 .read = ll_file_read,
5045 .aio_read = ll_file_aio_read,
5046 .write = ll_file_write,
5047 .aio_write = ll_file_aio_write,
5048 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5049 .unlocked_ioctl = ll_file_ioctl,
5050 .open = ll_file_open,
5051 .release = ll_file_release,
5052 .mmap = ll_file_mmap,
5053 .llseek = ll_file_seek,
5054 .splice_read = ll_file_splice_read,
5057 .flock = ll_file_flock,
5058 .lock = ll_file_flock
5061 /* These are for -o noflock - to return ENOSYS on flock calls */
5062 struct file_operations ll_file_operations_noflock = {
5063 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
5064 # ifdef HAVE_SYNC_READ_WRITE
5065 .read = new_sync_read,
5066 .write = new_sync_write,
5067 # endif /* HAVE_SYNC_READ_WRITE */
5068 .read_iter = ll_file_read_iter,
5069 .write_iter = ll_file_write_iter,
5070 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5071 .read = ll_file_read,
5072 .aio_read = ll_file_aio_read,
5073 .write = ll_file_write,
5074 .aio_write = ll_file_aio_write,
5075 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5076 .unlocked_ioctl = ll_file_ioctl,
5077 .open = ll_file_open,
5078 .release = ll_file_release,
5079 .mmap = ll_file_mmap,
5080 .llseek = ll_file_seek,
5081 .splice_read = ll_file_splice_read,
/* reject lock requests with a once-per-file warning */
5084 .flock = ll_file_noflock,
5085 .lock = ll_file_noflock
/* inode_operations for regular Lustre files; optional entries depend on
 * the running kernel's API (xattr ops, get_acl/set_acl). */
5088 struct inode_operations ll_file_inode_operations = {
5089 .setattr = ll_setattr,
5090 .getattr = ll_getattr,
5091 .permission = ll_inode_permission,
5092 #ifdef HAVE_IOP_XATTR
5093 .setxattr = ll_setxattr,
5094 .getxattr = ll_getxattr,
5095 .removexattr = ll_removexattr,
5097 .listxattr = ll_listxattr,
5098 .fiemap = ll_fiemap,
5099 #ifdef HAVE_IOP_GET_ACL
5100 .get_acl = ll_get_acl,
5102 #ifdef HAVE_IOP_SET_ACL
5103 .set_acl = ll_set_acl,
/*
 * Push a layout configuration into the cl_object stack for @inode.  For
 * OBJECT_CONF_SET the associated layout DLM lock is only allowed to match
 * after the layout has been applied, and the cached layout generation in
 * ll_inode_info is refreshed.
 */
5107 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
5109 struct ll_inode_info *lli = ll_i2info(inode);
5110 struct cl_object *obj = lli->lli_clob;
5119 env = cl_env_get(&refcheck);
5121 RETURN(PTR_ERR(env));
5123 rc = cl_conf_set(env, lli->lli_clob, conf);
5127 if (conf->coc_opc == OBJECT_CONF_SET) {
5128 struct ldlm_lock *lock = conf->coc_lock;
5129 struct cl_layout cl = {
5133 LASSERT(lock != NULL);
5134 LASSERT(ldlm_has_layout(lock));
5136 /* it can only be allowed to match after layout is
5137 * applied to inode otherwise false layout would be
5138 * seen. Applying layout shoud happen before dropping
5139 * the intent lock. */
5140 ldlm_lock_allow_match(lock);
5142 rc = cl_object_layout_get(env, obj, &cl);
5147 DFID": layout version change: %u -> %u\n",
5148 PFID(&lli->lli_fid), ll_layout_version_get(lli),
5150 ll_layout_version_set(lli, cl.cl_layout_gen);
5154 cl_env_put(env, &refcheck);
5159 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
5160 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
5163 struct ll_sb_info *sbi = ll_i2sbi(inode);
5164 struct ptlrpc_request *req;
5171 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
5172 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
5173 lock->l_lvb_data, lock->l_lvb_len);
/* layout already attached to the lock's LVB: nothing to fetch */
5175 if (lock->l_lvb_data != NULL)
5178 /* if layout lock was granted right away, the layout is returned
5179 * within DLM_LVB of dlm reply; otherwise if the lock was ever
5180 * blocked and then granted via completion ast, we have to fetch
5181 * layout here. Please note that we can't use the LVB buffer in
5182 * completion AST because it doesn't have a large enough buffer */
5183 rc = ll_get_default_mdsize(sbi, &lmmsize);
5187 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), OBD_MD_FLXATTR,
5188 XATTR_NAME_LOV, lmmsize, &req);
5191 GOTO(out, rc = 0); /* empty layout */
5198 if (lmmsize == 0) /* empty layout */
5201 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
5203 GOTO(out, rc = -EFAULT);
5205 OBD_ALLOC_LARGE(lvbdata, lmmsize);
5206 if (lvbdata == NULL)
5207 GOTO(out, rc = -ENOMEM);
5209 memcpy(lvbdata, lmm, lmmsize);
/* install the fetched layout as the lock's LVB, unless another thread
 * raced and attached one first */
5210 lock_res_and_lock(lock);
5211 if (unlikely(lock->l_lvb_data == NULL)) {
5212 lock->l_lvb_type = LVB_T_LAYOUT;
5213 lock->l_lvb_data = lvbdata;
5214 lock->l_lvb_len = lmmsize;
5217 unlock_res_and_lock(lock);
5220 OBD_FREE_LARGE(lvbdata, lmmsize);
5225 ptlrpc_req_finished(req);
5230 * Apply the layout to the inode. Layout lock is held and will be released
5233 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
5234 struct inode *inode)
5236 struct ll_inode_info *lli = ll_i2info(inode);
5237 struct ll_sb_info *sbi = ll_i2sbi(inode);
5238 struct ldlm_lock *lock;
5239 struct cl_object_conf conf;
5242 bool wait_layout = false;
5245 LASSERT(lustre_handle_is_used(lockh));
5247 lock = ldlm_handle2lock(lockh);
5248 LASSERT(lock != NULL);
5249 LASSERT(ldlm_has_layout(lock));
5251 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
5252 PFID(&lli->lli_fid), inode);
5254 /* in case this is a caching lock and reinstate with new inode */
5255 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
5257 lock_res_and_lock(lock);
5258 lvb_ready = ldlm_is_lvb_ready(lock);
5259 unlock_res_and_lock(lock);
5261 /* checking lvb_ready is racy but this is okay. The worst case is
5262 * that multi processes may configure the file on the same time. */
/* layout not yet in the LVB: pull it from the MDT */
5266 rc = ll_layout_fetch(inode, lock);
5270 /* for layout lock, lmm is stored in lock's lvb.
5271 * lvb_data is immutable if the lock is held so it's safe to access it
5274 * set layout to file. Unlikely this will fail as old layout was
5275 * surely eliminated */
5276 memset(&conf, 0, sizeof conf);
5277 conf.coc_opc = OBJECT_CONF_SET;
5278 conf.coc_inode = inode;
5279 conf.coc_lock = lock;
5280 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
5281 conf.u.coc_layout.lb_len = lock->l_lvb_len;
5282 rc = ll_layout_conf(inode, &conf);
5284 /* refresh layout failed, need to wait */
5285 wait_layout = rc == -EBUSY;
5288 LDLM_LOCK_PUT(lock);
5289 ldlm_lock_decref(lockh, mode);
5291 /* wait for IO to complete if it's still being used. */
5293 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
5294 sbi->ll_fsname, PFID(&lli->lli_fid), inode);
/* OBJECT_CONF_WAIT blocks until in-flight IO under the old layout
 * drains, then the caller can retry the layout refresh */
5296 memset(&conf, 0, sizeof conf);
5297 conf.coc_opc = OBJECT_CONF_WAIT;
5298 conf.coc_inode = inode;
5299 rc = ll_layout_conf(inode, &conf);
5303 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
5304 sbi->ll_fsname, PFID(&lli->lli_fid), rc);
5310 * Issue layout intent RPC to MDS.
5311 * \param inode [in] file inode
5312 * \param intent [in] layout intent
5314 * \retval 0 on success
5315 * \retval < 0 error code
5317 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
5319 struct ll_inode_info *lli = ll_i2info(inode);
5320 struct ll_sb_info *sbi = ll_i2sbi(inode);
5321 struct md_op_data *op_data;
5322 struct lookup_intent it;
5323 struct ptlrpc_request *req;
/* pack an op_data carrying the layout intent as the opaque payload */
5327 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
5328 0, 0, LUSTRE_OPC_ANY, NULL);
5329 if (IS_ERR(op_data))
5330 RETURN(PTR_ERR(op_data));
5332 op_data->op_data = intent;
5333 op_data->op_data_size = sizeof(*intent);
5335 memset(&it, 0, sizeof(it));
5336 it.it_op = IT_LAYOUT;
/* write/truncate intents are flagged FMODE_WRITE — presumably so the
 * MDT knows the layout is about to be modified; confirm server side */
5337 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
5338 intent->li_opc == LAYOUT_INTENT_TRUNC)
5339 it.it_flags = FMODE_WRITE;
5341 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
5342 sbi->ll_fsname, PFID(&lli->lli_fid), inode);
/* enqueue the layout lock via an intent RPC to the MDS */
5344 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
5345 &ll_md_blocking_ast, 0);
/* the intent result lives in 'it'; the raw request is not needed */
5346 if (it.it_request != NULL)
5347 ptlrpc_req_finished(it.it_request);
5348 it.it_request = NULL;
5350 ll_finish_md_op_data(op_data);
5352 /* set lock data in case this is a new lock */
5354 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
/* release the intent's lock reference before returning */
5356 ll_intent_drop_lock(&it);
5362 * This function checks if there exists a LAYOUT lock on the client side,
5363 * or enqueues it if it doesn't have one in cache.
5365 * This function will not hold layout lock so it may be revoked any time after
5366 * this function returns. Any operations that depend on the layout should be redone
5369 * This function should be called before lov_io_init() to get an uptodate
5370 * layout version, the caller should save the version number and after IO
5371 * is finished, this function should be called again to verify that layout
5372 * is not changed during IO time.
5374 int ll_layout_refresh(struct inode *inode, __u32 *gen)
5376 struct ll_inode_info *lli = ll_i2info(inode);
5377 struct ll_sb_info *sbi = ll_i2sbi(inode);
5378 struct lustre_handle lockh;
/* default intent: read-only layout access */
5379 struct layout_intent intent = {
5380 .li_opc = LAYOUT_INTENT_ACCESS,
5382 enum ldlm_mode mode;
/* fast path: layout locking disabled on this mount, or a layout
 * generation is already cached on the inode */
5386 *gen = ll_layout_version_get(lli);
5387 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
5391 LASSERT(fid_is_sane(ll_inode2fid(inode)));
5392 LASSERT(S_ISREG(inode->i_mode));
5394 /* take layout lock mutex to enqueue layout lock exclusively. */
5395 mutex_lock(&lli->lli_layout_mutex);
5398 /* mostly layout lock is caching on the local side, so try to
5399 * match it before grabbing layout lock mutex. */
5400 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5401 LCK_CR | LCK_CW | LCK_PR |
5403 if (mode != 0) { /* hit cached lock */
5404 rc = ll_layout_lock_set(&lockh, mode, inode);
/* no cached lock: issue a layout intent RPC to enqueue one */
5410 rc = ll_layout_intent(inode, &intent);
/* publish the (possibly refreshed) layout generation to the caller */
5416 *gen = ll_layout_version_get(lli);
5417 mutex_unlock(&lli->lli_layout_mutex);
5423 * Issue layout intent RPC indicating where in a file an IO is about to write.
5425 * \param[in] inode file inode.
5426 * \param[in] ext write range with start offset of file in bytes where
5427 * an IO is about to write, and exclusive end offset in
5430 * \retval 0 on success
5431 * \retval < 0 error code
5433 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5434 struct lu_extent *ext)
/* build an intent covering the byte range the caller is about to write */
5436 struct layout_intent intent = {
5438 .li_extent.e_start = ext->e_start,
5439 .li_extent.e_end = ext->e_end,
/* delegate the RPC to the common layout-intent helper */
5444 rc = ll_layout_intent(inode, &intent);
5450 * This function sends a restore request to the MDT
5452 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5454 struct hsm_user_request *hur;
5458 len = sizeof(struct hsm_user_request) +
5459 sizeof(struct hsm_user_item);
5460 OBD_ALLOC(hur, len);
5464 hur->hur_request.hr_action = HUA_RESTORE;
5465 hur->hur_request.hr_archive_id = 0;
5466 hur->hur_request.hr_flags = 0;
5467 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5468 sizeof(hur->hur_user_item[0].hui_fid));
5469 hur->hur_user_item[0].hui_extent.offset = offset;
5470 hur->hur_user_item[0].hui_extent.length = length;
5471 hur->hur_request.hr_itemcount = 1;
5472 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,