4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
49 #include <uapi/linux/lustre/lustre_ioctl.h>
50 #include <lustre_swab.h>
52 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
57 struct inode *sp_inode;
62 __u64 pa_data_version;
68 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
70 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
/*
 * Allocate per-open-file private data from the ll_file_data_slab cache.
 * GFP_NOFS avoids recursing into the filesystem during memory reclaim.
 */
73 static struct ll_file_data *ll_file_data_get(void)
75 struct ll_file_data *fd;
77 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
81 fd->fd_write_failed = false;
/* initialize Persistent Client Cache (PCC) state for this open */
82 pcc_file_init(&fd->fd_pcc_file);
/* Return per-open-file private data to its slab cache (counterpart of
 * ll_file_data_get()). */
87 static void ll_file_data_put(struct ll_file_data *fd)
90 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
94 * Packs all the attributes into @op_data for the CLOSE rpc.
96 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
97 struct obd_client_handle *och)
101 ll_prep_md_op_data(op_data, inode, NULL, NULL,
102 0, 0, LUSTRE_OPC_ANY, NULL);
/* snapshot current VFS inode attributes so MDT can apply them at close */
104 op_data->op_attr.ia_mode = inode->i_mode;
105 op_data->op_attr.ia_atime = inode->i_atime;
106 op_data->op_attr.ia_mtime = inode->i_mtime;
107 op_data->op_attr.ia_ctime = inode->i_ctime;
108 op_data->op_attr.ia_size = i_size_read(inode);
/* mark which fields of op_attr are valid for the server */
109 op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
110 ATTR_MTIME | ATTR_MTIME_SET |
112 op_data->op_xvalid |= OP_XVALID_CTIME_SET;
113 op_data->op_attr_blocks = inode->i_blocks;
/* translate kernel inode flags to the on-wire (ext-style) representation */
114 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
115 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
116 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
117 op_data->op_open_handle = och->och_open_handle;
119 if (och->och_flags & FMODE_WRITE &&
120 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
121 /* For HSM: if inode data has been modified, pack it so that
122 * MDT can set data dirty flag in the archive. */
123 op_data->op_bias |= MDS_DATA_MODIFIED;
129 * Perform a close, possibly with a bias.
130 * The meaning of "data" depends on the value of "bias".
132 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
133 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
136 static int ll_close_inode_openhandle(struct inode *inode,
137 struct obd_client_handle *och,
138 enum mds_op_bias bias, void *data)
140 struct obd_export *md_exp = ll_i2mdexp(inode);
141 const struct ll_inode_info *lli = ll_i2info(inode);
142 struct md_op_data *op_data;
143 struct ptlrpc_request *req = NULL;
/* sanity: we need a live MDC export to send the close RPC */
147 if (class_exp2obd(md_exp) == NULL) {
148 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
149 ll_i2sbi(inode)->ll_fsname, PFID(&lli->lli_fid));
153 OBD_ALLOC_PTR(op_data);
154 /* We leak openhandle and request here on error, but not much to be
155 * done in OOM case since app won't retry close on error either. */
157 GOTO(out, rc = -ENOMEM);
159 ll_prepare_close(inode, op_data, och);
/* bias-specific packing of close intent data follows */
161 case MDS_CLOSE_LAYOUT_MERGE:
162 /* merge blocks from the victim inode */
163 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
164 op_data->op_attr.ia_valid |= ATTR_SIZE;
165 op_data->op_xvalid |= OP_XVALID_BLOCKS;
167 case MDS_CLOSE_LAYOUT_SPLIT:
168 case MDS_CLOSE_LAYOUT_SWAP: {
/* for SPLIT, data is a struct split_param; for SWAP it is an inode */
169 struct split_param *sp = data;
171 LASSERT(data != NULL);
172 op_data->op_bias |= bias;
173 op_data->op_data_version = 0;
174 op_data->op_lease_handle = och->och_lease_handle;
175 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
176 op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
177 op_data->op_mirror_id = sp->sp_mirror_id;
179 op_data->op_fid2 = *ll_inode2fid(data);
184 case MDS_CLOSE_RESYNC_DONE: {
185 struct ll_ioc_lease *ioc = data;
187 LASSERT(data != NULL);
/* scale block estimate by mirror count — presumably to account for
 * each resynced replica; TODO(review) confirm intent */
188 op_data->op_attr_blocks +=
189 ioc->lil_count * op_data->op_attr_blocks;
190 op_data->op_attr.ia_valid |= ATTR_SIZE;
191 op_data->op_xvalid |= OP_XVALID_BLOCKS;
192 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
194 op_data->op_lease_handle = och->och_lease_handle;
195 op_data->op_data = &ioc->lil_ids[0];
196 op_data->op_data_size =
197 ioc->lil_count * sizeof(ioc->lil_ids[0]);
201 case MDS_PCC_ATTACH: {
202 struct pcc_param *param = data;
204 LASSERT(data != NULL);
/* PCC attach piggybacks on the HSM release close path */
205 op_data->op_bias |= MDS_HSM_RELEASE | MDS_PCC_ATTACH;
206 op_data->op_archive_id = param->pa_archive_id;
207 op_data->op_data_version = param->pa_data_version;
208 op_data->op_lease_handle = och->och_lease_handle;
212 case MDS_HSM_RELEASE:
213 LASSERT(data != NULL);
214 op_data->op_bias |= MDS_HSM_RELEASE;
215 op_data->op_data_version = *(__u64 *)data;
216 op_data->op_lease_handle = och->och_lease_handle;
217 op_data->op_attr.ia_valid |= ATTR_SIZE;
218 op_data->op_xvalid |= OP_XVALID_BLOCKS;
/* plain close: no intent payload expected */
222 LASSERT(data == NULL);
/* if size/blocks were not explicitly packed, let server fetch lazily */
226 if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
227 op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
228 if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
229 op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;
231 rc = md_close(md_exp, op_data, och->och_mod, &req);
/* -EINTR close is expected during shutdown; don't log it as an error */
232 if (rc != 0 && rc != -EINTR)
233 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
234 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* on success, check whether the server actually executed the intent */
236 if (rc == 0 && op_data->op_bias & bias) {
237 struct mdt_body *body;
239 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
240 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
243 if (bias & MDS_PCC_ATTACH) {
244 struct pcc_param *param = data;
/* return the new layout generation to the PCC caller */
246 param->pa_layout_gen = body->mbo_layout_gen;
250 ll_finish_md_op_data(op_data);
254 md_clear_open_replay_data(md_exp, och);
/* poison the handle so stale use is detectable */
255 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
258 ptlrpc_req_finished(req); /* This is close request */
/*
 * Drop one reference on the per-mode MDS open handle of @inode and, when
 * the last user is gone, send the close RPC for it.
 */
262 int ll_md_real_close(struct inode *inode, fmode_t fmode)
264 struct ll_inode_info *lli = ll_i2info(inode);
265 struct obd_client_handle **och_p;
266 struct obd_client_handle *och;
/* pick the open handle slot matching the open mode */
271 if (fmode & FMODE_WRITE) {
272 och_p = &lli->lli_mds_write_och;
273 och_usecount = &lli->lli_open_fd_write_count;
274 } else if (fmode & FMODE_EXEC) {
275 och_p = &lli->lli_mds_exec_och;
276 och_usecount = &lli->lli_open_fd_exec_count;
278 LASSERT(fmode & FMODE_READ);
279 och_p = &lli->lli_mds_read_och;
280 och_usecount = &lli->lli_open_fd_read_count;
283 mutex_lock(&lli->lli_och_mutex);
284 if (*och_usecount > 0) {
285 /* There are still users of this handle, so skip
287 mutex_unlock(&lli->lli_och_mutex);
293 mutex_unlock(&lli->lli_och_mutex);
296 /* There might be a race and this handle may already
298 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/*
 * Per-file-descriptor close: release group lock, lease and extra open
 * handle if present, drop the fd's mode refcount, and talk to the MDS
 * only when no cached OPEN lock lets us skip the RPC.
 */
304 static int ll_md_close(struct inode *inode, struct file *file)
306 union ldlm_policy_data policy = {
307 .l_inodebits = { MDS_INODELOCK_OPEN },
/* TEST_LOCK: only probe for a matching lock, don't take a reference */
309 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
310 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
311 struct ll_inode_info *lli = ll_i2info(inode);
312 struct lustre_handle lockh;
313 enum ldlm_mode lockmode;
317 /* clear group lock, if present */
318 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
319 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
321 if (fd->fd_lease_och != NULL) {
324 /* Usually the lease is not released when the
325 * application crashed, we need to release here. */
326 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
327 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
328 PFID(&lli->lli_fid), rc, lease_broken);
330 fd->fd_lease_och = NULL;
/* fd_och holds an open handle taken over for lease purposes */
333 if (fd->fd_och != NULL) {
334 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
339 /* Let's see if we have good enough OPEN lock on the file and if
340 we can skip talking to MDS */
341 mutex_lock(&lli->lli_och_mutex);
342 if (fd->fd_omode & FMODE_WRITE) {
344 LASSERT(lli->lli_open_fd_write_count);
345 lli->lli_open_fd_write_count--;
346 } else if (fd->fd_omode & FMODE_EXEC) {
348 LASSERT(lli->lli_open_fd_exec_count);
349 lli->lli_open_fd_exec_count--;
352 LASSERT(lli->lli_open_fd_read_count);
353 lli->lli_open_fd_read_count--;
355 mutex_unlock(&lli->lli_och_mutex);
357 /* LU-4398: do not cache write open lock if the file has exec bit */
358 if ((lockmode == LCK_CW && inode->i_mode & S_IXUGO) ||
359 !md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
360 LDLM_IBITS, &policy, lockmode, &lockh))
361 rc = ll_md_real_close(inode, fd->fd_omode);
364 LUSTRE_FPRIVATE(file) = NULL;
365 ll_file_data_put(fd);
370 /* While this returns an error code, fput() the caller does not, so we need
371 * to make every effort to clean up all of our state here. Also, applications
372 * rarely check close errors and even if an error is returned they will not
373 * re-try the close call.
375 int ll_file_release(struct inode *inode, struct file *file)
377 struct ll_file_data *fd;
378 struct ll_sb_info *sbi = ll_i2sbi(inode);
379 struct ll_inode_info *lli = ll_i2info(inode);
/* start timestamp for the LPROC_LL_RELEASE latency statistic */
380 ktime_t kstart = ktime_get();
385 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
386 PFID(ll_inode2fid(inode)), inode);
388 fd = LUSTRE_FPRIVATE(file);
391 /* The last ref on @file, maybe not the owner pid of statahead,
392 * because parent and child process can share the same file handle. */
393 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
394 ll_deauthorize_statahead(inode, fd);
/* the root dentry gets only local file data, no MDS open handle */
396 if (inode->i_sb->s_root == file_dentry(file)) {
397 LUSTRE_FPRIVATE(file) = NULL;
398 ll_file_data_put(fd);
402 pcc_file_release(inode, file);
/* propagate async write errors recorded on the cl_object into rc */
404 if (!S_ISDIR(inode->i_mode)) {
405 if (lli->lli_clob != NULL)
406 lov_read_and_clear_async_rc(lli->lli_clob);
407 lli->lli_async_rc = 0;
410 rc = ll_md_close(inode, file);
412 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
413 libcfs_debug_dumplog();
416 if (!rc && inode->i_sb->s_root != file_dentry(file))
417 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE,
418 ktime_us_delta(ktime_get(), kstart));
/*
 * read_cache_page() filler: copy inline DoM (Data-on-MDT) data from the
 * niobuf_local in @data into @page, zero-filling the tail of a partial
 * page, and mark the page up to date.
 */
422 static inline int ll_dom_readpage(void *data, struct page *page)
424 struct niobuf_local *lnb = data;
427 kaddr = ll_kmap_atomic(page, KM_USER0);
428 memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
/* zero the remainder so no stale kernel memory is exposed to readers */
429 if (lnb->lnb_len < PAGE_SIZE)
430 memset(kaddr + lnb->lnb_len, 0,
431 PAGE_SIZE - lnb->lnb_len);
432 flush_dcache_page(page);
433 SetPageUptodate(page);
434 ll_kunmap_atomic(kaddr, KM_USER0);
/*
 * If the MDS returned file data inline with the open reply (Data-on-MDT),
 * populate the page cache with it so the first reads need no extra RPC.
 */
440 void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
441 struct lookup_intent *it)
443 struct ll_inode_info *lli = ll_i2info(inode);
444 struct cl_object *obj = lli->lli_clob;
445 struct address_space *mapping = inode->i_mapping;
447 struct niobuf_remote *rnb;
448 struct mdt_body *body;
450 unsigned long index, start;
451 struct niobuf_local lnb;
/* nothing to do when the reply carries no inline buffer */
458 if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
462 rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
463 if (rnb == NULL || rnb->rnb_len == 0)
466 /* LU-11595: Server may return whole file and that is OK always or
467 * it may return just file tail and its offset must be aligned with
468 * client PAGE_SIZE to be used on that client, if server's PAGE_SIZE is
469 * smaller then offset may be not aligned and that data is just ignored.
471 if (rnb->rnb_offset % PAGE_SIZE)
474 /* Server returns whole file or just file tail if it fills in reply
475 * buffer, in both cases total size should be equal to the file size.
477 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
478 if (rnb->rnb_offset + rnb->rnb_len != body->mbo_dom_size) {
479 CERROR("%s: server returns off/len %llu/%u but size %llu\n",
480 ll_i2sbi(inode)->ll_fsname, rnb->rnb_offset,
481 rnb->rnb_len, body->mbo_dom_size);
485 CDEBUG(D_INFO, "Get data along with open at %llu len %i, size %llu\n",
486 rnb->rnb_offset, rnb->rnb_len, body->mbo_dom_size);
/* payload immediately follows the niobuf_remote descriptor in the reply */
488 data = (char *)rnb + sizeof(*rnb);
490 lnb.lnb_file_offset = rnb->rnb_offset;
491 start = lnb.lnb_file_offset / PAGE_SIZE;
493 LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
494 lnb.lnb_page_offset = 0;
/* walk the inline buffer one PAGE_SIZE slice at a time */
496 lnb.lnb_data = data + (index << PAGE_SHIFT);
497 lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
498 if (lnb.lnb_len > PAGE_SIZE)
499 lnb.lnb_len = PAGE_SIZE;
/* ll_dom_readpage fills the page from lnb if it is not cached yet */
501 vmpage = read_cache_page(mapping, index + start,
502 ll_dom_readpage, &lnb);
503 if (IS_ERR(vmpage)) {
504 CWARN("%s: cannot fill page %lu for "DFID
505 " with data: rc = %li\n",
506 ll_i2sbi(inode)->ll_fsname, index + start,
507 PFID(lu_object_fid(&obj->co_lu)),
513 } while (rnb->rnb_len > (index << PAGE_SHIFT));
/*
 * Enqueue an intent OPEN with the MDS for @de, then finish the open
 * locally: instantiate the inode, mark the dentry valid if a LOOKUP lock
 * came back, and populate the page cache from any inline DoM data.
 */
517 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
518 struct lookup_intent *itp)
520 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
521 struct dentry *parent = de->d_parent;
524 struct md_op_data *op_data;
525 struct ptlrpc_request *req = NULL;
529 LASSERT(parent != NULL);
530 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
532 /* if server supports open-by-fid, or file name is invalid, don't pack
533 * name in open request */
534 if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_OPEN_BY_NAME) ||
535 !(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID)) {
537 len = de->d_name.len;
538 name = kmalloc(len + 1, GFP_NOFS);
/* copy the name under d_lock; bail if a rename changed it meanwhile */
543 spin_lock(&de->d_lock);
544 if (len != de->d_name.len) {
545 spin_unlock(&de->d_lock);
549 memcpy(name, de->d_name.name, len);
551 spin_unlock(&de->d_lock);
553 if (!lu_name_is_valid_2(name, len)) {
559 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
560 name, len, 0, LUSTRE_OPC_ANY, NULL);
561 if (IS_ERR(op_data)) {
563 RETURN(PTR_ERR(op_data));
565 op_data->op_data = lmm;
566 op_data->op_data_size = lmmsize;
568 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
569 &ll_md_blocking_ast, 0);
571 ll_finish_md_op_data(op_data);
573 /* reason for keep own exit path - don't flood log
574 * with messages with -ESTALE errors.
576 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
577 it_open_error(DISP_OPEN_OPEN, itp))
579 ll_release_openhandle(de, itp);
583 if (it_disposition(itp, DISP_LOOKUP_NEG))
584 GOTO(out, rc = -ENOENT);
586 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
587 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
588 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
592 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
594 if (!rc && itp->it_lock_mode) {
595 struct lustre_handle handle = {.cookie = itp->it_lock_handle};
596 struct ldlm_lock *lock;
597 bool has_dom_bit = false;
599 /* If we got a lock back and it has a LOOKUP bit set,
600 * make sure the dentry is marked as valid so we can find it.
601 * We don't need to care about actual hashing since other bits
602 * of kernel will deal with that later.
604 lock = ldlm_handle2lock(&handle);
606 has_dom_bit = ldlm_has_dom(lock);
607 if (lock->l_policy_data.l_inodebits.bits &
608 MDS_INODELOCK_LOOKUP)
609 d_lustre_revalidate(de);
613 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
615 ll_dom_finish_open(de->d_inode, req, itp);
619 ptlrpc_req_finished(req);
620 ll_intent_drop_lock(itp);
622 /* We did open by fid, but by the time we got to the server,
623 * the object disappeared. If this is a create, we cannot really
624 * tell the userspace that the file it was trying to create
625 * does not exist. Instead let's return -ESTALE, and the VFS will
626 * retry the create with LOOKUP_REVAL that we are going to catch
627 * in ll_revalidate_dentry() and use lookup then.
629 if (rc == -ENOENT && itp->it_op & IT_CREAT)
/*
 * Fill an obd_client_handle from the open reply carried by @it and
 * register the handle for open replay after MDS recovery.
 */
635 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
636 struct obd_client_handle *och)
638 struct mdt_body *body;
640 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
641 och->och_open_handle = body->mbo_open_handle;
642 och->och_fid = body->mbo_fid1;
643 och->och_lease_handle.cookie = it->it_lock_handle;
644 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
645 och->och_flags = it->it_flags;
647 return md_set_open_replay_data(md_exp, och, it);
/*
 * Finish the client-side part of an open: optionally fill @och from the
 * intent reply, then attach @fd (readahead state, open mode, cl context)
 * to the struct file.
 */
650 static int ll_local_open(struct file *file, struct lookup_intent *it,
651 struct ll_file_data *fd, struct obd_client_handle *och)
653 struct inode *inode = file_inode(file);
656 LASSERT(!LUSTRE_FPRIVATE(file));
663 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
668 LUSTRE_FPRIVATE(file) = fd;
669 ll_readahead_init(inode, &fd->fd_ras);
670 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
672 /* ll_cl_context initialize */
673 rwlock_init(&fd->fd_lock);
674 INIT_LIST_HEAD(&fd->fd_lccs);
679 /* Open a file, and (for the very first open) create objects on the OSTs at
680 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
681 * creation or open until ll_lov_setstripe() ioctl is called.
683 * If we already have the stripe MD locally then we don't request it in
684 * md_open(), by passing a lmm_size = 0.
686 * It is up to the application to ensure no other processes open this file
687 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
688 * used. We might be able to avoid races of that sort by getting lli_open_sem
689 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
690 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
692 int ll_file_open(struct inode *inode, struct file *file)
694 struct ll_inode_info *lli = ll_i2info(inode);
695 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
696 .it_flags = file->f_flags };
697 struct obd_client_handle **och_p = NULL;
698 __u64 *och_usecount = NULL;
699 struct ll_file_data *fd;
/* start timestamp for the LPROC_LL_OPEN latency statistic */
700 ktime_t kstart = ktime_get();
704 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
705 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* an intent stashed by lookup means the MDS open already happened */
707 it = file->private_data; /* XXX: compat macro */
708 file->private_data = NULL; /* prevent ll_local_open assertion */
710 fd = ll_file_data_get();
712 GOTO(out_nofiledata, rc = -ENOMEM);
715 if (S_ISDIR(inode->i_mode))
716 ll_authorize_statahead(inode, fd);
/* root dentry: attach fd only, no MDS open handle needed */
718 if (inode->i_sb->s_root == file_dentry(file)) {
719 LUSTRE_FPRIVATE(file) = fd;
/* no usable intent: build our own IT_OPEN intent from f_flags */
723 if (!it || !it->it_disposition) {
724 /* Convert f_flags into access mode. We cannot use file->f_mode,
725 * because everything but O_ACCMODE mask was stripped from
727 if ((oit.it_flags + 1) & O_ACCMODE)
729 if (file->f_flags & O_TRUNC)
730 oit.it_flags |= FMODE_WRITE;
732 /* kernel only call f_op->open in dentry_open. filp_open calls
733 * dentry_open after call to open_namei that checks permissions.
734 * Only nfsd_open call dentry_open directly without checking
735 * permissions and because of that this code below is safe.
737 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
738 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
740 /* We do not want O_EXCL here, presumably we opened the file
741 * already? XXX - NFS implications? */
742 oit.it_flags &= ~O_EXCL;
744 /* bug20584, if "it_flags" contains O_CREAT, the file will be
745 * created if necessary, then "IT_CREAT" should be set to keep
746 * consistent with it */
747 if (oit.it_flags & O_CREAT)
748 oit.it_op |= IT_CREAT;
754 /* Let's see if we have file open on MDS already. */
755 if (it->it_flags & FMODE_WRITE) {
756 och_p = &lli->lli_mds_write_och;
757 och_usecount = &lli->lli_open_fd_write_count;
758 } else if (it->it_flags & FMODE_EXEC) {
759 och_p = &lli->lli_mds_exec_och;
760 och_usecount = &lli->lli_open_fd_exec_count;
762 och_p = &lli->lli_mds_read_och;
763 och_usecount = &lli->lli_open_fd_read_count;
766 mutex_lock(&lli->lli_och_mutex);
767 if (*och_p) { /* Open handle is present */
768 if (it_disposition(it, DISP_OPEN_OPEN)) {
769 /* Well, there's extra open request that we do not need,
770 let's close it somehow. This will decref request. */
771 rc = it_open_error(DISP_OPEN_OPEN, it);
773 mutex_unlock(&lli->lli_och_mutex);
774 GOTO(out_openerr, rc);
777 ll_release_openhandle(file_dentry(file), it);
/* reuse the cached open handle; NULL och means no fill needed */
781 rc = ll_local_open(file, it, fd, NULL);
784 mutex_unlock(&lli->lli_och_mutex);
785 GOTO(out_openerr, rc);
788 LASSERT(*och_usecount == 0);
789 if (!it->it_disposition) {
790 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
791 /* We cannot just request lock handle now, new ELC code
792 means that one of other OPEN locks for this file
793 could be cancelled, and since blocking ast handler
794 would attempt to grab och_mutex as well, that would
795 result in a deadlock */
796 mutex_unlock(&lli->lli_och_mutex);
798 * Normally called under two situations:
800 * 2. A race/condition on MDS resulting in no open
801 * handle to be returned from LOOKUP|OPEN request,
802 * for example if the target entry was a symlink.
804 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
805 * marked by a bit set in ll_iget_for_nfs. Clear the
806 * bit so that it's not confusing later callers.
808 * NB; when ldd is NULL, it must have come via normal
809 * lookup path only, since ll_iget_for_nfs always calls
812 if (ldd && ldd->lld_nfs_dentry) {
813 ldd->lld_nfs_dentry = 0;
814 it->it_flags |= MDS_OPEN_LOCK;
818 * Always specify MDS_OPEN_BY_FID because we don't want
819 * to get file with different fid.
821 it->it_flags |= MDS_OPEN_BY_FID;
822 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
825 GOTO(out_openerr, rc);
829 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
831 GOTO(out_och_free, rc = -ENOMEM);
835 /* md_intent_lock() didn't get a request ref if there was an
836 * open error, so don't do cleanup on the request here
838 /* XXX (green): Should not we bail out on any error here, not
839 * just open error? */
840 rc = it_open_error(DISP_OPEN_OPEN, it);
842 GOTO(out_och_free, rc);
844 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
845 "inode %p: disposition %x, status %d\n", inode,
846 it_disposition(it, ~0), it->it_status);
848 rc = ll_local_open(file, it, fd, *och_p);
850 GOTO(out_och_free, rc);
853 rc = pcc_file_open(inode, file);
855 GOTO(out_och_free, rc);
857 mutex_unlock(&lli->lli_och_mutex);
860 /* Must do this outside lli_och_mutex lock to prevent deadlock where
861 different kind of OPEN lock for this same inode gets cancelled
862 by ldlm_cancel_lru */
863 if (!S_ISREG(inode->i_mode))
864 GOTO(out_och_free, rc);
866 cl_lov_delay_create_clear(&file->f_flags);
867 GOTO(out_och_free, rc);
/* error unwinding: free the open handle slot, drop refcounts, free fd */
871 if (och_p && *och_p) {
872 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
873 *och_p = NULL; /* OBD_FREE writes some magic there */
876 mutex_unlock(&lli->lli_och_mutex);
879 if (lli->lli_opendir_key == fd)
880 ll_deauthorize_statahead(inode, fd);
883 ll_file_data_put(fd);
885 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN,
886 ktime_us_delta(ktime_get(), kstart));
/* drop the extra request reference taken for DISP_ENQ_OPEN_REF */
890 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
891 ptlrpc_req_finished(it->it_request);
892 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for lease locks: when another client conflicts, cancel the
 * lease lock asynchronously. The CANCELING branch is a no-op here — the
 * lease holder discovers the break later via ldlm_is_cancel().
 */
898 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
899 struct ldlm_lock_desc *desc, void *data, int flag)
902 struct lustre_handle lockh;
906 case LDLM_CB_BLOCKING:
907 ldlm_lock2handle(lock, &lockh);
908 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
910 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
914 case LDLM_CB_CANCELING:
922 * When setting a lease on a file, we take ownership of the lli_mds_*_och
923 * and save it as fd->fd_och so as to force client to reopen the file even
924 * if it has an open lock in cache already.
926 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
927 struct lustre_handle *old_open_handle)
929 struct ll_inode_info *lli = ll_i2info(inode);
930 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
931 struct obd_client_handle **och_p;
936 /* Get the openhandle of the file */
937 mutex_lock(&lli->lli_och_mutex);
/* only one lease per file descriptor at a time */
938 if (fd->fd_lease_och != NULL)
939 GOTO(out_unlock, rc = -EBUSY);
941 if (fd->fd_och == NULL) {
942 if (file->f_mode & FMODE_WRITE) {
943 LASSERT(lli->lli_mds_write_och != NULL);
944 och_p = &lli->lli_mds_write_och;
945 och_usecount = &lli->lli_open_fd_write_count;
947 LASSERT(lli->lli_mds_read_och != NULL);
948 och_p = &lli->lli_mds_read_och;
949 och_usecount = &lli->lli_open_fd_read_count;
/* cannot take sole ownership while other fds share this handle */
952 if (*och_usecount > 1)
953 GOTO(out_unlock, rc = -EBUSY);
/* hand the old open handle back so MDT can match the lease owner */
960 *old_open_handle = fd->fd_och->och_open_handle;
964 mutex_unlock(&lli->lli_och_mutex);
969 * Release ownership on lli_mds_*_och when putting back a file lease.
971 static int ll_lease_och_release(struct inode *inode, struct file *file)
973 struct ll_inode_info *lli = ll_i2info(inode);
974 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
975 struct obd_client_handle **och_p;
976 struct obd_client_handle *old_och = NULL;
981 mutex_lock(&lli->lli_och_mutex);
/* pick the per-mode handle slot matching this file's open mode */
982 if (file->f_mode & FMODE_WRITE) {
983 och_p = &lli->lli_mds_write_och;
984 och_usecount = &lli->lli_open_fd_write_count;
986 och_p = &lli->lli_mds_read_och;
987 och_usecount = &lli->lli_open_fd_read_count;
990 /* The file may have been open by another process (broken lease) so
991 * *och_p is not NULL. In this case we should simply increase usecount
994 if (*och_p != NULL) {
995 old_och = fd->fd_och;
1002 mutex_unlock(&lli->lli_och_mutex);
/* close the now-redundant handle outside the mutex */
1004 if (old_och != NULL)
1005 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
1011 * Acquire a lease and open the file.
1013 static struct obd_client_handle *
1014 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
1017 struct lookup_intent it = { .it_op = IT_OPEN };
1018 struct ll_sb_info *sbi = ll_i2sbi(inode);
1019 struct md_op_data *op_data;
1020 struct ptlrpc_request *req = NULL;
1021 struct lustre_handle old_open_handle = { 0 };
1022 struct obd_client_handle *och = NULL;
/* a lease is exclusively read or write, never both */
1027 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
1028 RETURN(ERR_PTR(-EINVAL));
/* requested lease mode must be covered by the file's open mode */
1031 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
1032 RETURN(ERR_PTR(-EPERM));
1034 rc = ll_lease_och_acquire(inode, file, &old_open_handle);
1036 RETURN(ERR_PTR(rc));
1041 RETURN(ERR_PTR(-ENOMEM));
1043 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
1044 LUSTRE_OPC_ANY, NULL);
1045 if (IS_ERR(op_data))
1046 GOTO(out, rc = PTR_ERR(op_data));
1048 /* To tell the MDT this openhandle is from the same owner */
1049 op_data->op_open_handle = old_open_handle;
1051 it.it_flags = fmode | open_flags;
1052 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
1053 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
1054 &ll_md_blocking_lease_ast,
1055 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
1056 * it can be cancelled which may mislead applications that the lease is
1058 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
1059 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
1060 * doesn't deal with openhandle, so normal openhandle will be leaked. */
1061 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
1062 ll_finish_md_op_data(op_data);
1063 ptlrpc_req_finished(req);
1065 GOTO(out_release_it, rc);
1067 if (it_disposition(&it, DISP_LOOKUP_NEG))
1068 GOTO(out_release_it, rc = -ENOENT);
1070 rc = it_open_error(DISP_OPEN_OPEN, &it);
1072 GOTO(out_release_it, rc);
1074 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
1075 rc = ll_och_fill(sbi->ll_md_exp, &it, och);
1077 GOTO(out_release_it, rc);
1079 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
1080 GOTO(out_close, rc = -EOPNOTSUPP);
1082 /* already get lease, handle lease lock */
1083 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
1084 if (it.it_lock_mode == 0 ||
1085 it.it_lock_bits != MDS_INODELOCK_OPEN) {
1086 /* open lock must return for lease */
1087 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
1088 PFID(ll_inode2fid(inode)), it.it_lock_mode,
1090 GOTO(out_close, rc = -EPROTO);
1093 ll_intent_release(&it);
1097 /* Cancel open lock */
1098 if (it.it_lock_mode != 0) {
1099 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
1101 it.it_lock_mode = 0;
1102 och->och_lease_handle.cookie = 0ULL;
1104 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
1106 CERROR("%s: error closing file "DFID": %d\n",
1107 sbi->ll_fsname, PFID(&ll_i2info(inode)->lli_fid), rc2);
1108 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
1110 ll_intent_release(&it);
1114 RETURN(ERR_PTR(rc));
1118 * Check whether a layout swap can be done between two inodes.
1120 * \param[in] inode1 First inode to check
1121 * \param[in] inode2 Second inode to check
1123 * \retval 0 on success, layout swap can be performed between both inodes
1124 * \retval negative error code if requirements are not met
1126 static int ll_check_swap_layouts_validity(struct inode *inode1,
1127 struct inode *inode2)
/* both must be regular files */
1129 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
/* caller needs write permission on both inodes */
1132 if (inode_permission(inode1, MAY_WRITE) ||
1133 inode_permission(inode2, MAY_WRITE))
/* layouts can only be swapped within the same filesystem */
1136 if (inode1->i_sb != inode2->i_sb)
/*
 * Close @och with a MDS_CLOSE_LAYOUT_SWAP bias, swapping layouts between
 * @inode and @inode2 atomically on the MDT as part of the close.
 */
1142 static int ll_swap_layouts_close(struct obd_client_handle *och,
1143 struct inode *inode, struct inode *inode2)
1145 const struct lu_fid *fid1 = ll_inode2fid(inode);
1146 const struct lu_fid *fid2;
1150 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
1151 ll_i2sbi(inode)->ll_fsname, PFID(fid1));
1153 rc = ll_check_swap_layouts_validity(inode, inode2);
1155 GOTO(out_free_och, rc);
1157 /* We now know that inode2 is a lustre inode */
1158 fid2 = ll_inode2fid(inode2);
/* swapping a file's layout with itself makes no sense */
1160 rc = lu_fid_cmp(fid1, fid2);
1162 GOTO(out_free_och, rc = -EINVAL);
1164 /* Close the file and {swap,merge} layouts between inode & inode2.
1165 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
1166 * because we still need it to pack l_remote_handle to MDT. */
1167 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
1170 och = NULL; /* freed in ll_close_inode_openhandle() */
1180 * Release lease and close the file.
1181 * It will check if the lease has ever broken.
1183 static int ll_lease_close_intent(struct obd_client_handle *och,
1184 struct inode *inode,
1185 bool *lease_broken, enum mds_op_bias bias,
1188 struct ldlm_lock *lock;
1189 bool cancelled = true;
/* inspect the lease lock to learn whether the lease was broken */
1193 lock = ldlm_handle2lock(&och->och_lease_handle);
1195 lock_res_and_lock(lock);
1196 cancelled = ldlm_is_cancel(lock);
1197 unlock_res_and_lock(lock);
1198 LDLM_LOCK_PUT(lock);
1201 CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1202 PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1204 if (lease_broken != NULL)
1205 *lease_broken = cancelled;
/* lease still intact and no intent requested: just cancel the lock */
1207 if (!cancelled && !bias)
1208 ldlm_cli_cancel(&och->och_lease_handle, 0);
1210 if (cancelled) { /* no need to execute intent */
1215 rc = ll_close_inode_openhandle(inode, och, bias, data);
/* Release a lease and close the file with no extra close intent. */
1219 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1222 return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1226 * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
1228 static int ll_lease_file_resync(struct obd_client_handle *och,
1229 struct inode *inode, unsigned long arg)
1231 struct ll_sb_info *sbi = ll_i2sbi(inode);
1232 struct md_op_data *op_data;
1233 struct ll_ioc_lease_id ioc;
/* only the flush side effect of ll_data_version() is wanted */
1234 __u64 data_version_unused;
1238 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1239 LUSTRE_OPC_ANY, NULL);
1240 if (IS_ERR(op_data))
1241 RETURN(PTR_ERR(op_data));
/* @arg is a user pointer to a struct ll_ioc_lease_id from the ioctl */
1243 if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg,
1247 /* before starting file resync, it's necessary to clean up page cache
1248 * in client memory, otherwise once the layout version is increased,
1249 * writing back cached data will be denied the OSTs. */
1250 rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1254 op_data->op_lease_handle = och->och_lease_handle;
1255 op_data->op_mirror_id = ioc.lil_mirror_id;
1256 rc = md_file_resync(sbi->ll_md_exp, op_data);
1262 ll_finish_md_op_data(op_data);
/* Merge inode attributes cached from the MDS with the current attributes
 * (size, blocks, timestamps) obtained from the OST objects, under the
 * inode size lock.  NOTE(review): some lines are elided in this listing. */
1266 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1268 struct ll_inode_info *lli = ll_i2info(inode);
1269 struct cl_object *obj = lli->lli_clob;
1270 struct cl_attr *attr = vvp_env_thread_attr(env);
1278 ll_inode_size_lock(inode);
1280 /* Merge timestamps the most recently obtained from MDS with
1281 * timestamps obtained from OSTs.
1283 * Do not overwrite atime of inode because it may be refreshed
1284 * by file_accessed() function. If the read was served by cache
1285 * data, there is no RPC to be sent so that atime may not be
1286 * transferred to OSTs at all. MDT only updates atime at close time
1287 * if it's at least 'mdd.*.atime_diff' older.
1288 * All in all, the atime in Lustre does not strictly comply with
1289 * POSIX. Solving this problem needs to send an RPC to MDT for each
1290 * read, this will hurt performance.
1292 if (ll_file_test_and_clear_flag(lli, LLIF_UPDATE_ATIME) ||
1293 inode->i_atime.tv_sec < lli->lli_atime)
1294 inode->i_atime.tv_sec = lli->lli_atime;
1296 inode->i_mtime.tv_sec = lli->lli_mtime;
1297 inode->i_ctime.tv_sec = lli->lli_ctime;
1299 mtime = inode->i_mtime.tv_sec;
1300 atime = inode->i_atime.tv_sec;
1301 ctime = inode->i_ctime.tv_sec;
1303 cl_object_attr_lock(obj);
1304 if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1307 rc = cl_object_attr_get(env, obj, attr);
1308 cl_object_attr_unlock(obj);
/* -ENODATA means no OST objects (e.g. released file): MDS attributes
 * alone are authoritative, so treat it as success. */
1311 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
/* Keep the newest of the MDS- and OST-side timestamps. */
1313 if (atime < attr->cat_atime)
1314 atime = attr->cat_atime;
1316 if (ctime < attr->cat_ctime)
1317 ctime = attr->cat_ctime;
1319 if (mtime < attr->cat_mtime)
1320 mtime = attr->cat_mtime;
1322 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1323 PFID(&lli->lli_fid), attr->cat_size);
1325 i_size_write(inode, attr->cat_size);
1326 inode->i_blocks = attr->cat_blocks;
1328 inode->i_mtime.tv_sec = mtime;
1329 inode->i_atime.tv_sec = atime;
1330 inode->i_ctime.tv_sec = ctime;
1333 ll_inode_size_unlock(inode);
1339 * Set designated mirror for I/O.
1341 * So far only read, write, and truncate can issue I/O to a
1342 * designated mirror.
1344 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1346 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1348 /* clear layout version for generic(non-resync) I/O in case it carries
1349 * stale layout version due to I/O restart */
1350 io->ci_layout_version = 0;
1352 /* FLR: disable non-delay for designated mirror I/O because obviously
1353 * only one mirror is available */
1354 if (fd->fd_designated_mirror > 0) {
1356 io->ci_designated_mirror = fd->fd_designated_mirror;
1357 io->ci_layout_version = fd->fd_layout_version;
1360 CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n",
1361 file->f_path.dentry->d_name.name, io->ci_designated_mirror);
/* Decide whether atime updates should be skipped for this open file,
 * mirroring the kernel's file_accessed()/touch_atime() checks: O_NOATIME,
 * inode/superblock noatime flags, and mount flags (including read-only
 * and nodiratime-on-directories).  NOTE(review): the return statements
 * between the checks are elided in this listing. */
1364 static bool file_is_noatime(const struct file *file)
1366 const struct vfsmount *mnt = file->f_path.mnt;
1367 const struct inode *inode = file_inode((struct file *)file);
1369 /* Adapted from file_accessed() and touch_atime().*/
1370 if (file->f_flags & O_NOATIME)
1373 if (inode->i_flags & S_NOATIME)
1376 if (IS_NOATIME(inode))
1379 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1382 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1385 if ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode))
/* Initialize a cl_io for a read/write/splice request on @file: propagate
 * open flags (O_NONBLOCK/O_APPEND/O_SYNC/O_DIRECT) into the io
 * descriptor, choose the DLM locking mode, and set mirror/atime policy. */
1391 void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot,
1392 struct vvp_io_args *args)
1394 struct inode *inode = file_inode(file);
1395 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1397 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1398 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1400 if (iot == CIT_WRITE) {
1401 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1402 io->u.ci_wr.wr_sync = !!(file->f_flags & O_SYNC ||
1403 file->f_flags & O_DIRECT ||
1405 #ifdef HAVE_GENERIC_WRITE_SYNC_2ARGS
/* Newer kernels carry the per-iocb IOCB_DSYNC flag instead of (or in
 * addition to) f_flags; honor it for normal (non-splice) I/O. */
1406 io->u.ci_wr.wr_sync |= !!(args &&
1407 args->via_io_subtype == IO_NORMAL &&
1408 args->u.normal.via_iocb->ki_flags & IOCB_DSYNC);
1412 io->ci_obj = ll_i2info(inode)->lli_clob;
1413 io->ci_lockreq = CILR_MAYBE;
1414 if (ll_file_nolock(file)) {
1415 io->ci_lockreq = CILR_NEVER;
1416 io->ci_no_srvlock = 1;
1417 } else if (file->f_flags & O_APPEND) {
/* Append must lock to EOF, so a server-side lock is mandatory. */
1418 io->ci_lockreq = CILR_MANDATORY;
1420 io->ci_noatime = file_is_noatime(file);
1421 io->ci_async_readahead = false;
1423 /* FLR: only use non-delay I/O for read as there is only one
1424 * available mirror for write. */
1425 io->ci_ndelay = !(iot == CIT_WRITE);
1427 ll_io_set_mirror(io, file);
/* Account one I/O sample and @count bytes into the per-inode file-heat
 * statistics (used for tiering policies), unless file heat is disabled
 * for the filesystem or this inode. */
1430 static void ll_heat_add(struct inode *inode, enum cl_io_type iot,
1433 struct ll_inode_info *lli = ll_i2info(inode);
1434 struct ll_sb_info *sbi = ll_i2sbi(inode);
1435 enum obd_heat_type sample_type;
1436 enum obd_heat_type iobyte_type;
1437 __u64 now = ktime_get_real_seconds();
1439 if (!ll_sbi_has_file_heat(sbi) ||
1440 lli->lli_heat_flags & LU_HEAT_FLAG_OFF)
1443 if (iot == CIT_READ) {
1444 sample_type = OBD_HEAT_READSAMPLE;
1445 iobyte_type = OBD_HEAT_READBYTE;
1446 } else if (iot == CIT_WRITE) {
1447 sample_type = OBD_HEAT_WRITESAMPLE;
1448 iobyte_type = OBD_HEAT_WRITEBYTE;
/* One sample per call, @count bytes; both decay with the configured
 * weight over ll_heat_period_second. */
1453 spin_lock(&lli->lli_heat_lock);
1454 obd_heat_add(&lli->lli_heat_instances[sample_type], now, 1,
1455 sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1456 obd_heat_add(&lli->lli_heat_instances[iobyte_type], now, count,
1457 sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1458 spin_unlock(&lli->lli_heat_lock);
/* Common engine for buffered/direct read, write, and splice: builds a
 * cl_io, takes the per-file range lock where needed to serialize
 * conflicting I/O (LU-6227), runs the cl_io loop, and transparently
 * restarts the I/O on layout change or FLR mirror retry.  Returns bytes
 * transferred, or a negative errno.  NOTE(review): the restart loop
 * head and several lines are elided in this listing. */
1462 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1463 struct file *file, enum cl_io_type iot,
1464 loff_t *ppos, size_t count)
1466 struct vvp_io *vio = vvp_env_io(env);
1467 struct inode *inode = file_inode(file);
1468 struct ll_inode_info *lli = ll_i2info(inode);
1469 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1470 struct range_lock range;
1474 unsigned retried = 0;
1475 bool restarted = false;
1479 CDEBUG(D_VFSTRACE, "%s: %s ppos: %llu, count: %zu\n",
1480 file_dentry(file)->d_name.name,
1481 iot == CIT_READ ? "read" : "write", *ppos, count);
1484 io = vvp_env_thread_io(env);
1485 ll_io_init(io, file, iot, args);
/* Preserve the FLR mirror-retry count across restarts. */
1486 io->ci_ndelay_tried = retried;
1488 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1489 bool range_locked = false;
/* O_APPEND writes may land anywhere up to EOF, so lock the whole
 * range; otherwise lock just [*ppos, *ppos + count). */
1491 if (file->f_flags & O_APPEND)
1492 range_lock_init(&range, 0, LUSTRE_EOF);
1494 range_lock_init(&range, *ppos, *ppos + count - 1);
1496 vio->vui_fd = LUSTRE_FPRIVATE(file);
1497 vio->vui_io_subtype = args->via_io_subtype;
1499 switch (vio->vui_io_subtype) {
1501 vio->vui_iter = args->u.normal.via_iter;
1502 vio->vui_iocb = args->u.normal.via_iocb;
1503 /* Direct IO reads must also take range lock,
1504 * or multiple reads will try to work on the same pages
1505 * See LU-6227 for details. */
1506 if (((iot == CIT_WRITE) ||
1507 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1508 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1509 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1511 rc = range_lock(&lli->lli_write_tree, &range);
1515 range_locked = true;
1519 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1520 vio->u.splice.vui_flags = args->u.splice.via_flags;
1523 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1527 ll_cl_add(file, env, io, LCC_RW);
1528 rc = cl_io_loop(env, io);
1529 ll_cl_remove(file, env);
1532 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1534 range_unlock(&lli->lli_write_tree, &range);
1537 /* cl_io_rw_init() handled IO */
/* Partial progress is accumulated; the remainder may be retried by
 * the restart path below. */
1541 if (io->ci_nob > 0) {
1542 result += io->ci_nob;
1543 count -= io->ci_nob;
1544 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1546 /* prepare IO restart */
1547 if (count > 0 && args->via_io_subtype == IO_NORMAL)
1548 args->u.normal.via_iter = vio->vui_iter;
1551 cl_io_fini(env, io);
1554 "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1555 file->f_path.dentry->d_name.name,
1556 iot, rc, result, io->ci_need_restart);
1558 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1560 "%s: restart %s from %lld, count: %zu, ret: %zd, rc: %d\n",
1561 file_dentry(file)->d_name.name,
1562 iot == CIT_READ ? "read" : "write",
1563 *ppos, count, result, rc);
1564 /* preserve the tried count for FLR */
1565 retried = io->ci_ndelay_tried;
1570 if (iot == CIT_READ) {
1572 ll_stats_ops_tally(ll_i2sbi(inode),
1573 LPROC_LL_READ_BYTES, result);
1574 } else if (iot == CIT_WRITE) {
1576 ll_stats_ops_tally(ll_i2sbi(inode),
1577 LPROC_LL_WRITE_BYTES, result);
1578 fd->fd_write_failed = false;
1579 } else if (result == 0 && rc == 0) {
1582 fd->fd_write_failed = true;
1584 fd->fd_write_failed = false;
/* -ERESTARTSYS just means a signal interrupted us; do not mark the
 * fd as having a failed write in that case. */
1585 } else if (rc != -ERESTARTSYS) {
1586 fd->fd_write_failed = true;
1590 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1592 ll_heat_add(inode, iot, result);
1594 RETURN(result > 0 ? result : rc);
1598 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1599 * especially for small I/O.
1601 * To serve a read request, CLIO has to create and initialize a cl_io and
1602 * then request DLM lock. This has turned out to have significant overhead
1603 * and affects the performance of small I/O dramatically.
1605 * It's not necessary to create a cl_io for each I/O. Under the help of read
1606 * ahead, most of the pages being read are already in memory cache and we can
1607 * read those pages directly because if the pages exist, the corresponding DLM
1608 * lock must exist so that page content must be valid.
1610 * In fast read implementation, the llite speculatively finds and reads pages
1611 * in memory cache. There are three scenarios for fast read:
1612 * - If the page exists and is uptodate, kernel VM will provide the data and
1613 * CLIO won't be intervened;
1614 * - If the page was brought into memory by read ahead, it will be exported
1615 * and read ahead parameters will be updated;
1616 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1617 * it will go back and invoke normal read, i.e., a cl_io will be created
1618 * and DLM lock will be requested.
1620 * POSIX compliance: posix standard states that read is intended to be atomic.
1621 * Lustre read implementation is in line with Linux kernel read implementation
1622 * and neither of them complies with POSIX standard in this matter. Fast read
1623 * doesn't make the situation worse on single node but it may interleave write
1624 * results from multiple nodes due to short read handling in ll_file_aio_read().
1626 * \param env - lu_env
1627 * \param iocb - kiocb from kernel
1628 * \param iter - user space buffers where the data will be copied
1630 * \retval - number of bytes have been read, or error code if error occurred.
1633 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1637 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1640 /* NB: we can't do direct IO for fast read because it will need a lock
1641 * to make IO engine happy. */
1642 if (iocb->ki_filp->f_flags & O_DIRECT)
/* Serve the read straight from the page cache via the generic VFS
 * path; ll_readpage() signals a cache miss with -ENODATA. */
1645 result = generic_file_read_iter(iocb, iter);
1647 /* If the first page is not in cache, generic_file_aio_read() will be
1648 * returned with -ENODATA.
1649 * See corresponding code in ll_readpage(). */
1650 if (result == -ENODATA)
1654 ll_heat_add(file_inode(iocb->ki_filp), CIT_READ, result);
1655 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1656 LPROC_LL_READ_BYTES, result);
1663 * Read from a file (through the page cache).
/* ->read_iter entry point: try PCC first, then fast read, and fall back
 * to the full cl_io path via ll_file_io_generic() for any remainder. */
1665 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1668 struct vvp_io_args *args;
1669 struct file *file = iocb->ki_filp;
1673 ktime_t kstart = ktime_get();
1676 if (!iov_iter_count(to))
1680 * Currently when PCC read failed, we do not fall back to the
1681 * normal read path, just return the error.
1682 * The reason is that: for RW-PCC, the file data may be modified
1683 * in the PCC and inconsistent with the data on OSTs (or file
1684 * data has been removed from the Lustre file system), at this
1685 * time, fallback to the normal read path may read the wrong
1687 * TODO: for RO-PCC (readonly PCC), fall back to normal read
1688 * path: read data from data copy on OSTs.
1690 result = pcc_file_read_iter(iocb, to, &cached);
1694 ll_ras_enter(file, iocb->ki_pos, iov_iter_count(to));
/* Fast read may satisfy part (or all) of the request; only fall
 * through to the cl_io path if bytes remain and no error occurred. */
1696 result = ll_do_fast_read(iocb, to);
1697 if (result < 0 || iov_iter_count(to) == 0)
1700 env = cl_env_get(&refcheck);
1702 return PTR_ERR(env);
1704 args = ll_env_args(env, IO_NORMAL);
1705 args->u.normal.via_iter = to;
1706 args->u.normal.via_iocb = iocb;
1708 rc2 = ll_file_io_generic(env, args, file, CIT_READ,
1709 &iocb->ki_pos, iov_iter_count(to));
1712 else if (result == 0)
1715 cl_env_put(env, &refcheck);
1718 ll_rw_stats_tally(ll_i2sbi(file_inode(file)), current->pid,
1719 LUSTRE_FPRIVATE(file), iocb->ki_pos, result,
1721 ll_stats_ops_tally(ll_i2sbi(file_inode(file)), LPROC_LL_READ,
1722 ktime_us_delta(ktime_get(), kstart));
1729 * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1730 * If a page is already in the page cache and dirty (and some other things -
1731 * See ll_tiny_write_begin for the instantiation of these rules), then we can
1732 * write to it without doing a full I/O, because Lustre already knows about it
1733 * and will write it out. This saves a lot of processing time.
1735 * All writes here are within one page, so exclusion is handled by the page
1736 * lock on the vm page. We do not do tiny writes for writes which touch
1737 * multiple pages because it's very unlikely multiple sequential pages
1738 * are already dirty.
1740 * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1741 * and are unlikely to be to already dirty pages.
1743 * Attribute updates are important here, we do them in ll_tiny_write_end.
1745 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1747 ssize_t count = iov_iter_count(iter);
1748 struct file *file = iocb->ki_filp;
1749 struct inode *inode = file_inode(file);
1750 bool lock_inode = !IS_NOSEC(inode);
1755 /* Restrict writes to single page and < PAGE_SIZE. See comment at top
1756 * of function for why.
1758 if (count >= PAGE_SIZE ||
1759 (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
/* Non-NOSEC inodes need the inode lock so __generic_file_write_iter()
 * can strip setuid/setgid bits safely. */
1762 if (unlikely(lock_inode))
1764 result = __generic_file_write_iter(iocb, iter);
1766 if (unlikely(lock_inode))
1767 inode_unlock(inode);
1769 /* If the page is not already dirty, ll_tiny_write_begin returns
1770 * -ENODATA. We continue on to normal write.
1772 if (result == -ENODATA)
1776 ll_heat_add(inode, CIT_WRITE, result);
1777 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1779 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1782 CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1788 * Write to a file (through the page cache).
/* ->write_iter entry point: try PCC, then the tiny-write fast path, and
 * fall back to ll_file_io_generic() for any remaining bytes, combining
 * the byte counts from the two paths on success. */
1790 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1792 struct vvp_io_args *args;
1794 ssize_t rc_tiny = 0, rc_normal;
1795 struct file *file = iocb->ki_filp;
1798 ktime_t kstart = ktime_get();
1803 if (!iov_iter_count(from))
1804 GOTO(out, rc_normal = 0);
1807 * When PCC write failed, we usually do not fall back to the normal
1808 * write path, just return the error. But there is a special case when
1809 * returned error code is -ENOSPC due to running out of space on PCC HSM
1810 * backend. At this time, it will fall back to normal I/O path and
1811 * retry the I/O. As the file is in HSM released state, it will restore
1812 * the file data to OSTs first and redo the write again. And the
1813 * restore process will revoke the layout lock and detach the file
1814 * from PCC cache automatically.
1816 result = pcc_file_write_iter(iocb, from, &cached);
1817 if (cached && result != -ENOSPC && result != -EDQUOT)
1818 GOTO(out, rc_normal = result);
1820 /* NB: we can't do direct IO for tiny writes because they use the page
1821 * cache, we can't do sync writes because tiny writes can't flush
1822 * pages, and we can't do append writes because we can't guarantee the
1823 * required DLM locks are held to protect file size.
1825 if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(file))) &&
1826 !(file->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1827 rc_tiny = ll_do_tiny_write(iocb, from);
1829 /* In case of error, go on and try normal write - Only stop if tiny
1830 * write completed I/O.
1832 if (iov_iter_count(from) == 0)
1833 GOTO(out, rc_normal = rc_tiny);
1835 env = cl_env_get(&refcheck);
1837 return PTR_ERR(env);
1839 args = ll_env_args(env, IO_NORMAL);
1840 args->u.normal.via_iter = from;
1841 args->u.normal.via_iocb = iocb;
1843 rc_normal = ll_file_io_generic(env, args, file, CIT_WRITE,
1844 &iocb->ki_pos, iov_iter_count(from));
1846 /* On success, combine bytes written. */
1847 if (rc_tiny >= 0 && rc_normal > 0)
1848 rc_normal += rc_tiny;
1849 /* On error, only return error from normal write if tiny write did not
1850 * write any bytes. Otherwise return bytes written by tiny write.
1852 else if (rc_tiny > 0)
1853 rc_normal = rc_tiny;
1855 cl_env_put(env, &refcheck);
1857 if (rc_normal > 0) {
1858 ll_rw_stats_tally(ll_i2sbi(file_inode(file)), current->pid,
1859 LUSTRE_FPRIVATE(file), iocb->ki_pos,
1861 ll_stats_ops_tally(ll_i2sbi(file_inode(file)), LPROC_LL_WRITE,
1862 ktime_us_delta(ktime_get(), kstart));
1868 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1870 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/* Validate an iovec array for the legacy aio read/write paths: sum the
 * segment lengths into *count, rejecting negative lengths, overflow and
 * inaccessible user buffers, and truncating *nr_segs at the first bad
 * segment (kernel semantics).  NOTE(review): some lines are elided in
 * this listing. */
1872 static int ll_file_get_iov_count(const struct iovec *iov,
1873 unsigned long *nr_segs, size_t *count)
1878 for (seg = 0; seg < *nr_segs; seg++) {
1879 const struct iovec *iv = &iov[seg];
1882 * If any segment has a negative length, or the cumulative
1883 * length ever wraps negative then return -EINVAL.
1886 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1888 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1893 cnt -= iv->iov_len; /* This segment is no good */
/* Legacy aio read entry (kernels without ->read_iter): validate the
 * iovec array, wrap it in an iov_iter, and delegate to
 * ll_file_read_iter(). */
1900 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1901 unsigned long nr_segs, loff_t pos)
1908 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1915 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1916 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1917 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1918 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1919 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1921 result = ll_file_read_iter(iocb, &to);
/* Synchronous read entry point for kernels without ->read_iter: wrap the
 * user buffer in a single iovec plus a sync kiocb and delegate to
 * ll_file_aio_read(), then propagate the updated file position.
 * Bug fix: the HAVE_KI_NBYTES branch assigned to a non-existent field
 * "i_nbytes"; the struct kiocb member is "ki_nbytes" (see the matching
 * code in ll_file_write()). */
1926 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1929 struct iovec iov = { .iov_base = buf, .iov_len = count };
1938 init_sync_kiocb(&kiocb, file);
1939 kiocb.ki_pos = *ppos;
1940 #ifdef HAVE_KIOCB_KI_LEFT
1941 kiocb.ki_left = count;
1942 #elif defined(HAVE_KI_NBYTES)
1943 kiocb.ki_nbytes = count;
1946 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1947 *ppos = kiocb.ki_pos;
1953 * Write to a file (through the page cache).
/* Legacy aio write entry (kernels without ->write_iter): validate the
 * iovec array, wrap it in an iov_iter, and delegate to
 * ll_file_write_iter(). */
1956 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1957 unsigned long nr_segs, loff_t pos)
1959 struct iov_iter from;
1964 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1971 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1972 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1973 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1974 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1975 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1977 result = ll_file_write_iter(iocb, &from);
/* Synchronous write entry point for kernels without ->write_iter: wrap
 * the user buffer in a single iovec plus a sync kiocb, delegate to
 * ll_file_aio_write(), and propagate the updated file position. */
1982 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1983 size_t count, loff_t *ppos)
1985 struct iovec iov = { .iov_base = (void __user *)buf,
1995 init_sync_kiocb(&kiocb, file);
1996 kiocb.ki_pos = *ppos;
1997 #ifdef HAVE_KIOCB_KI_LEFT
1998 kiocb.ki_left = count;
1999 #elif defined(HAVE_KI_NBYTES)
2000 kiocb.ki_nbytes = count;
2003 result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
2004 *ppos = kiocb.ki_pos;
2008 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
2011 * Send file content (through pagecache) somewhere with helper
/* ->splice_read entry: try PCC first, then run a CIT_READ cl_io with the
 * IO_SPLICE subtype so the data is fed into @pipe. */
2013 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
2014 struct pipe_inode_info *pipe, size_t count,
2018 struct vvp_io_args *args;
2025 result = pcc_file_splice_read(in_file, ppos, pipe,
2026 count, flags, &cached);
2030 ll_ras_enter(in_file, *ppos, count);
2032 env = cl_env_get(&refcheck);
2034 RETURN(PTR_ERR(env));
2036 args = ll_env_args(env, IO_SPLICE);
2037 args->u.splice.via_pipe = pipe;
2038 args->u.splice.via_flags = flags;
2040 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
2041 cl_env_put(env, &refcheck);
2044 ll_rw_stats_tally(ll_i2sbi(file_inode(in_file)), current->pid,
2045 LUSTRE_FPRIVATE(in_file), *ppos, result,
/* Apply a striping layout (@lum) to @inode by opening the file by FID
 * with the layout attached to the open intent, under the inode size
 * lock.  NOTE(review): error-path lines are elided in this listing. */
2050 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
2051 __u64 flags, struct lov_user_md *lum, int lum_size)
2053 struct lookup_intent oit = {
2055 .it_flags = flags | MDS_OPEN_BY_FID,
/* User-space lov_user_md is little-endian on the wire; byte-swap it
 * here on big-endian hosts before sending. */
2060 if ((__swab32(lum->lmm_magic) & le32_to_cpu(LOV_MAGIC_MASK)) ==
2061 le32_to_cpu(LOV_MAGIC_MAGIC)) {
2062 /* this code will only exist for big-endian systems */
2063 lustre_swab_lov_user_md(lum, 0);
2066 ll_inode_size_lock(inode);
2067 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
2069 GOTO(out_unlock, rc);
2071 ll_release_openhandle(dentry, &oit);
2074 ll_inode_size_unlock(inode);
2075 ll_intent_release(&oit);
/* Fetch the LOV EA (striping metadata) of @filename under @inode from
 * the MDS.  On success *lmmp points into *request's reply buffer (caller
 * must keep the request until done with the EA) and *lmm_size is its
 * length.  NOTE(review): some lines are elided in this listing. */
2080 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
2081 struct lov_mds_md **lmmp, int *lmm_size,
2082 struct ptlrpc_request **request)
2084 struct ll_sb_info *sbi = ll_i2sbi(inode);
2085 struct mdt_body *body;
2086 struct lov_mds_md *lmm = NULL;
2087 struct ptlrpc_request *req = NULL;
2088 struct md_op_data *op_data;
2091 rc = ll_get_default_mdsize(sbi, &lmmsize);
2095 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
2096 strlen(filename), lmmsize,
2097 LUSTRE_OPC_ANY, NULL);
2098 if (IS_ERR(op_data))
2099 RETURN(PTR_ERR(op_data));
2101 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
2102 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
2103 ll_finish_md_op_data(op_data);
2105 CDEBUG(D_INFO, "md_getattr_name failed "
2106 "on %s: rc %d\n", filename, rc);
2110 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2111 LASSERT(body != NULL); /* checked by mdc_getattr_name */
2113 lmmsize = body->mbo_eadatasize;
2115 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
2117 GOTO(out, rc = -ENODATA);
2120 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
2121 LASSERT(lmm != NULL);
/* Only plain V1/V3, composite (PFL) and foreign layouts are handed
 * back to the caller; anything else is a protocol error. */
2123 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
2124 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
2125 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1) &&
2126 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_FOREIGN))
2127 GOTO(out, rc = -EPROTO);
2130 * This is coming from the MDS, so is probably in
2131 * little endian. We convert it to host endian before
2132 * passing it to userspace.
2134 if ((lmm->lmm_magic & __swab32(LOV_MAGIC_MAGIC)) ==
2135 __swab32(LOV_MAGIC_MAGIC)) {
2136 int stripe_count = 0;
2138 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
2139 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2140 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
2141 if (le32_to_cpu(lmm->lmm_pattern) &
2142 LOV_PATTERN_F_RELEASED)
2146 lustre_swab_lov_user_md((struct lov_user_md *)lmm, 0);
2148 /* if called for a directory - avoid swabbing
2149 * non-existent lsm objects */
2150 if (lmm->lmm_magic == LOV_MAGIC_V1 && S_ISREG(body->mbo_mode))
2151 lustre_swab_lov_user_md_objects(
2152 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
2154 else if (lmm->lmm_magic == LOV_MAGIC_V3 &&
2155 S_ISREG(body->mbo_mode))
2156 lustre_swab_lov_user_md_objects(
2157 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
2163 *lmm_size = lmmsize;
/* LL_IOC_LOV_SETEA handler: copy a lov_user_md (with one OST object
 * entry) from user space and apply it via ll_lov_setstripe_ea_info().
 * Requires CAP_SYS_ADMIN since it sets explicit object references. */
2168 static int ll_lov_setea(struct inode *inode, struct file *file,
2171 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2172 struct lov_user_md *lump;
2173 int lum_size = sizeof(struct lov_user_md) +
2174 sizeof(struct lov_user_ost_data);
2178 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2181 OBD_ALLOC_LARGE(lump, lum_size);
2185 if (copy_from_user(lump, arg, lum_size))
2186 GOTO(out_lump, rc = -EFAULT);
2188 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
/* Clear O_LOV_DELAY_CREATE regardless of outcome. */
2190 cl_lov_delay_create_clear(&file->f_flags);
2193 OBD_FREE_LARGE(lump, lum_size);
/* Copy the file's striping information into the user buffer @lum of
 * @size bytes via the cl_object layer. */
2197 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2204 env = cl_env_get(&refcheck);
2206 RETURN(PTR_ERR(env));
2208 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2209 cl_env_put(env, &refcheck);
/* LL_IOC_LOV_SETSTRIPE handler: copy the user's lov_user_md, apply it,
 * refresh the layout generation, and echo the resulting layout back to
 * user space.  NOTE(review): some lines are elided in this listing. */
2213 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2216 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2217 struct lov_user_md *klum;
2219 __u64 flags = FMODE_WRITE;
2222 rc = ll_copy_user_md(lum, &klum);
2227 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
/* Zero the user's stripe_count first so a failed getstripe below
 * cannot leave stale data in the user buffer. */
2232 rc = put_user(0, &lum->lmm_stripe_count);
2236 rc = ll_layout_refresh(inode, &gen);
2240 rc = ll_file_getstripe(inode, arg, lum_size);
2242 cl_lov_delay_create_clear(&file->f_flags);
2245 OBD_FREE_LARGE(klum, lum_size);
/* LL_IOC_GROUP_LOCK handler: acquire a group (GID-keyed) lock on the
 * file.  Serializes with other group users via lli_group_mutex; with
 * O_NONBLOCK returns -EAGAIN instead of waiting for a conflicting GID.
 * NOTE(review): some lines are elided in this listing. */
2251 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2253 struct ll_inode_info *lli = ll_i2info(inode);
2254 struct cl_object *obj = lli->lli_clob;
2255 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2256 struct ll_grouplock grouplock;
2261 CWARN("group id for group lock must not be 0\n");
2265 if (ll_file_nolock(file))
2266 RETURN(-EOPNOTSUPP);
2268 if (file->f_flags & O_NONBLOCK) {
2269 if (!mutex_trylock(&lli->lli_group_mutex))
2272 mutex_lock(&lli->lli_group_mutex);
2274 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2275 CWARN("group lock already existed with gid %lu\n",
2276 fd->fd_grouplock.lg_gid);
2277 GOTO(out, rc = -EINVAL);
/* A different GID holds the group lock: either fail (O_NONBLOCK) or
 * drop the mutex, wait for all users to leave, and retry. */
2279 if (arg != lli->lli_group_gid && lli->lli_group_users != 0) {
2280 if (file->f_flags & O_NONBLOCK)
2281 GOTO(out, rc = -EAGAIN);
2282 mutex_unlock(&lli->lli_group_mutex);
2283 wait_var_event(&lli->lli_group_users, !lli->lli_group_users);
2284 GOTO(retry, rc = 0);
2286 LASSERT(fd->fd_grouplock.lg_lock == NULL);
2289 * XXX: group lock needs to protect all OST objects while PFL
2290 * can add new OST objects during the IO, so we'd instantiate
2291 * all OST objects before getting its group lock.
2296 struct cl_layout cl = {
2297 .cl_is_composite = false,
2299 struct lu_extent ext = {
2301 .e_end = OBD_OBJECT_EOF,
2304 env = cl_env_get(&refcheck);
2306 GOTO(out, rc = PTR_ERR(env));
2308 rc = cl_object_layout_get(env, obj, &cl);
2309 if (!rc && cl.cl_is_composite)
2310 rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2313 cl_env_put(env, &refcheck);
2318 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2319 arg, (file->f_flags & O_NONBLOCK), &grouplock);
2324 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2325 fd->fd_grouplock = grouplock;
/* First user records the GID; later users with the same GID share it. */
2326 if (lli->lli_group_users == 0)
2327 lli->lli_group_gid = grouplock.lg_gid;
2328 lli->lli_group_users++;
2330 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
2332 mutex_unlock(&lli->lli_group_mutex);
/* LL_IOC_GROUP_UNLOCK handler: release the group lock held on this file
 * descriptor, verifying the caller's GID matches, and wake waiters when
 * the last user leaves. */
2337 static int ll_put_grouplock(struct inode *inode, struct file *file,
2340 struct ll_inode_info *lli = ll_i2info(inode);
2341 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2342 struct ll_grouplock grouplock;
2346 mutex_lock(&lli->lli_group_mutex);
2347 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2348 CWARN("no group lock held\n");
2349 GOTO(out, rc = -EINVAL);
2352 LASSERT(fd->fd_grouplock.lg_lock != NULL);
2354 if (fd->fd_grouplock.lg_gid != arg) {
2355 CWARN("group lock %lu doesn't match current id %lu\n",
2356 arg, fd->fd_grouplock.lg_gid);
2357 GOTO(out, rc = -EINVAL);
/* Detach the grouplock from the fd before dropping it so the fd never
 * points at a released lock. */
2360 grouplock = fd->fd_grouplock;
2361 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2362 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2364 cl_put_grouplock(&grouplock);
2366 lli->lli_group_users--;
2367 if (lli->lli_group_users == 0) {
2368 lli->lli_group_gid = 0;
2369 wake_up_var(&lli->lli_group_users);
2371 CDEBUG(D_INFO, "group lock %lu released\n", arg);
2374 mutex_unlock(&lli->lli_group_mutex);
2380 * Close inode open handle
2382 * \param dentry [in] dentry which contains the inode
2383 * \param it [in,out] intent which contains open info and result
2386 * \retval <0 failure
2388 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2390 struct inode *inode = dentry->d_inode;
2391 struct obd_client_handle *och;
2397 /* Root ? Do nothing. */
2398 if (dentry->d_inode->i_sb->s_root == dentry)
2401 /* No open handle to close? Move away */
2402 if (!it_disposition(it, DISP_OPEN_OPEN))
2405 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2407 OBD_ALLOC(och, sizeof(*och));
2409 GOTO(out, rc = -ENOMEM);
/* Fill the client handle from the intent's open reply, then send the
 * close to the MDS. */
2411 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2415 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2417 /* this one is in place of ll_file_open */
2418 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2419 ptlrpc_req_finished(it->it_request);
2420 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2426 * Get size for inode for which FIEMAP mapping is requested.
2427 * Make the FIEMAP get_info call and returns the result.
2428 * \param fiemap kernel buffer to hold extents
2429 * \param num_bytes kernel buffer size
2431 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2437 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2440 /* Checks for fiemap flags */
2441 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* Report the unsupported flags back to the caller, per FIEMAP
 * convention. */
2442 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2446 /* Check for FIEMAP_FLAG_SYNC */
2447 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2448 rc = filemap_fdatawrite(inode->i_mapping);
2453 env = cl_env_get(&refcheck);
2455 RETURN(PTR_ERR(env));
/* A zero cached size may just mean we never glimpsed; fetch the real
 * size from the OSTs first. */
2457 if (i_size_read(inode) == 0) {
2458 rc = ll_glimpse_size(inode);
2463 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2464 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2465 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2467 /* If filesize is 0, then there would be no objects for mapping */
2468 if (fmkey.lfik_oa.o_size == 0) {
2469 fiemap->fm_mapped_extents = 0;
2473 fmkey.lfik_fiemap = *fiemap;
2475 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2476 &fmkey, fiemap, &num_bytes);
2478 cl_env_put(env, &refcheck);
/* OBD_IOC_FID2PATH handler: resolve a FID to a path via the MDC,
 * bounded by the user-supplied path buffer length.  NOTE(review): some
 * lines are elided in this listing. */
2482 int ll_fid2path(struct inode *inode, void __user *arg)
2484 struct obd_export *exp = ll_i2mdexp(inode);
2485 const struct getinfo_fid2path __user *gfin = arg;
2487 struct getinfo_fid2path *gfout;
/* fid2path can reveal names outside the caller's view; restrict to
 * CAP_DAC_READ_SEARCH unless the mount explicitly allows users. */
2493 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2494 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2497 /* Only need to get the buflen */
2498 if (get_user(pathlen, &gfin->gf_pathlen))
2501 if (pathlen > PATH_MAX)
2504 outsize = sizeof(*gfout) + pathlen;
2505 OBD_ALLOC(gfout, outsize);
2509 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2510 GOTO(gf_free, rc = -EFAULT);
2511 /* append root FID after gfout to let MDT know the root FID so that it
2512 * can lookup the correct path, this is mainly for fileset.
2513 * old server without fileset mount support will ignore this. */
2514 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2516 /* Call mdc_iocontrol */
2517 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2521 if (copy_to_user(arg, gfout, outsize))
2525 OBD_FREE(gfout, outsize);
/* Run a CIT_DATA_VERSION cl_io to fetch the file's data version and
 * layout version, honoring ioc->idv_flags (flush semantics); restarts
 * if a layout change races with the request.  NOTE(review): some lines
 * are elided in this listing. */
2530 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2532 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2540 ioc->idv_version = 0;
2541 ioc->idv_layout_version = UINT_MAX;
2543 /* If no file object initialized, we consider its version is 0. */
2547 env = cl_env_get(&refcheck);
2549 RETURN(PTR_ERR(env));
2551 io = vvp_env_thread_io(env);
2553 io->u.ci_data_version.dv_data_version = 0;
2554 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2555 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2558 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2559 result = cl_io_loop(env, io);
2561 result = io->ci_result;
2563 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2564 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2566 cl_io_fini(env, io);
/* Layout changed underneath us: retry the whole io. */
2568 if (unlikely(io->ci_need_restart))
2571 cl_env_put(env, &refcheck);
2577 * Read the data_version for inode.
2579 * This value is computed using stripe object version on OST.
2580 * Version is computed using server side locking.
2582 * @param flags if do sync on the OST side;
2584 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2585 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/* Thin wrapper: delegates to ll_ioc_data_version() and copies out only
 * the data version (layout version is ignored by this caller). */
2587 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2589 struct ioc_data_version ioc = { .idv_flags = flags };
2592 rc = ll_ioc_data_version(inode, &ioc);
/* presumably only reached on rc == 0 — the guard line is elided here */
2594 *data_version = ioc.idv_version;
2600 * Trigger a HSM release request for the provided inode.
/*
 * Sequence: take a write lease with MDS_OPEN_RELEASE intent, flush and
 * sample the data version, merge attributes, then close the open handle
 * with the MDS_HSM_RELEASE bias so the MDT can punch the OST objects.
 * NOTE(review): several lines elided in this extract (ENTRY/RETURN,
 * error checks); comments describe only what is visible.
 */
2602 int ll_hsm_release(struct inode *inode)
2605 struct obd_client_handle *och = NULL;
2606 __u64 data_version = 0;
2611 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2612 ll_i2sbi(inode)->ll_fsname,
2613 PFID(&ll_i2info(inode)->lli_fid))<
2615 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2617 GOTO(out, rc = PTR_ERR(och));
2619 /* Grab latest data_version and [am]time values */
2620 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2624 env = cl_env_get(&refcheck);
2626 GOTO(out, rc = PTR_ERR(env));
2628 rc = ll_merge_attr(env, inode);
2629 cl_env_put(env, &refcheck);
2631 /* If error happen, we have the wrong size for a file.
2637 /* Release the file.
2638 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2639 * we still need it to pack l_remote_handle to MDT. */
2640 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
/* error path: if the lease was opened but not consumed, close it here */
2646 if (och != NULL && !IS_ERR(och)) /* close the file */
2647 ll_lease_close(och, inode, NULL);
/* Scratch state for ll_swap_layouts(); holds the (possibly reordered)
 * inode pair plus data-version bookkeeping. Some members (dv1/dv2,
 * check_dv1/check_dv2) are elided from this extract. */
2652 struct ll_swap_stack {
2655 struct inode *inode1;
2656 struct inode *inode2;
/*
 * Swap the layouts of the two files via the MDT, optionally verifying
 * that each file's data version did not change (SWAP_LAYOUTS_CHECK_DVx)
 * and optionally flushing under a group lock (lsl->sl_gid).
 * NOTE(review): labels (putgl/free) and RETURN lines are elided in this
 * extract; comments describe only what is visible.
 */
2661 static int ll_swap_layouts(struct file *file1, struct file *file2,
2662 struct lustre_swap_layouts *lsl)
2664 struct mdc_swap_layouts msl;
2665 struct md_op_data *op_data;
2668 struct ll_swap_stack *llss = NULL;
2671 OBD_ALLOC_PTR(llss);
2675 llss->inode1 = file_inode(file1);
2676 llss->inode2 = file_inode(file2);
2678 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2682 /* we use 2 bool because it is easier to swap than 2 bits */
2683 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2684 llss->check_dv1 = true;
2686 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2687 llss->check_dv2 = true;
2689 /* we cannot use lsl->sl_dvX directly because we may swap them */
2690 llss->dv1 = lsl->sl_dv1;
2691 llss->dv2 = lsl->sl_dv2;
/* order the pair by FID so concurrent swaps always lock in one order */
2693 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2694 if (rc == 0) /* same file, done! */
2697 if (rc < 0) { /* sequentialize it */
2698 swap(llss->inode1, llss->inode2);
2700 swap(llss->dv1, llss->dv2);
2701 swap(llss->check_dv1, llss->check_dv2);
2705 if (gid != 0) { /* application asks to flush dirty cache */
2706 rc = ll_get_grouplock(llss->inode1, file1, gid);
2710 rc = ll_get_grouplock(llss->inode2, file2, gid);
/* second grouplock failed: undo the first before bailing out */
2712 ll_put_grouplock(llss->inode1, file1, gid);
2717 /* ultimate check, before swaping the layouts we check if
2718 * dataversion has changed (if requested) */
2719 if (llss->check_dv1) {
2720 rc = ll_data_version(llss->inode1, &dv, 0);
2723 if (dv != llss->dv1)
2724 GOTO(putgl, rc = -EAGAIN);
2727 if (llss->check_dv2) {
2728 rc = ll_data_version(llss->inode2, &dv, 0);
2731 if (dv != llss->dv2)
2732 GOTO(putgl, rc = -EAGAIN);
2735 /* struct md_op_data is used to send the swap args to the mdt
2736 * only flags is missing, so we use struct mdc_swap_layouts
2737 * through the md_op_data->op_data */
2738 /* flags from user space have to be converted before they are send to
2739 * server, no flag is sent today, they are only used on the client */
2742 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2743 0, LUSTRE_OPC_ANY, &msl);
2744 if (IS_ERR(op_data))
2745 GOTO(free, rc = PTR_ERR(op_data));
2747 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2748 sizeof(*op_data), op_data, NULL);
2749 ll_finish_md_op_data(op_data);
/* putgl: drop grouplocks in reverse acquisition order */
2756 ll_put_grouplock(llss->inode2, file2, gid);
2757 ll_put_grouplock(llss->inode1, file1, gid);
/*
 * Validate and forward an HSM state-set request (@hss) for @inode to
 * the MDT via obd_iocontrol(LL_IOC_HSM_STATE_SET).
 * Returns 0 on success or a negative errno (RETURN lines for the
 * -EINVAL/-EPERM paths are elided from this extract).
 */
2767 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2769 struct obd_export *exp = ll_i2mdexp(inode);
2770 struct md_op_data *op_data;
2774 /* Detect out-of range masks */
2775 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2778 /* Non-root users are forbidden to set or clear flags which are
2779 * NOT defined in HSM_USER_MASK. */
2780 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2781 !cfs_capable(CFS_CAP_SYS_ADMIN))
/* old servers without archive-id arrays only accept a bounded id */
2784 if (!exp_connect_archive_id_array(exp)) {
2785 /* Detect out-of range archive id */
2786 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2787 (hss->hss_archive_id > LL_HSM_ORIGIN_MAX_ARCHIVE))
2791 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2792 LUSTRE_OPC_ANY, hss);
2793 if (IS_ERR(op_data))
2794 RETURN(PTR_ERR(op_data));
2796 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, exp, sizeof(*op_data),
2799 ll_finish_md_op_data(op_data);
/*
 * Import a file into HSM: mark it ARCHIVED|EXISTS|RELEASED on the MDT,
 * then restore the userspace-supplied attributes (mode/uid/gid/size/
 * times from @hui) with ll_setattr_raw().
 * Regular files only. NOTE(review): allocation checks, inode_lock() and
 * the out label are elided from this extract.
 */
2804 static int ll_hsm_import(struct inode *inode, struct file *file,
2805 struct hsm_user_import *hui)
2807 struct hsm_state_set *hss = NULL;
2808 struct iattr *attr = NULL;
2812 if (!S_ISREG(inode->i_mode))
2818 GOTO(out, rc = -ENOMEM);
/* Step 1: set the HSM state so the MDT treats the file as released */
2820 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2821 hss->hss_archive_id = hui->hui_archive_id;
2822 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2823 rc = ll_hsm_state_set(inode, hss);
2827 OBD_ALLOC_PTR(attr);
2829 GOTO(out, rc = -ENOMEM);
/* Step 2: rebuild the inode attributes from the import request;
 * force S_IFREG since only regular files can be imported */
2831 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2832 attr->ia_mode |= S_IFREG;
2833 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2834 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2835 attr->ia_size = hui->hui_size;
2836 attr->ia_mtime.tv_sec = hui->hui_mtime;
2837 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2838 attr->ia_atime.tv_sec = hui->hui_atime;
2839 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2841 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2842 ATTR_UID | ATTR_GID |
2843 ATTR_MTIME | ATTR_MTIME_SET |
2844 ATTR_ATIME | ATTR_ATIME_SET;
2848 rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2852 inode_unlock(inode);
2864 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2866 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2867 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * Backend for LL_IOC_FUTIMES_3: set atime/mtime/ctime of a regular file
 * from @lfu. Requires CAP_SYS_ADMIN since ctime is normally not
 * settable. Uses OP_XVALID_CTIME_SET so the given ctime is honoured.
 * NOTE(review): the iattr initializer and inode_lock() line are
 * partially elided from this extract.
 */
2870 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2872 struct inode *inode = file_inode(file);
2874 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2875 ATTR_MTIME | ATTR_MTIME_SET |
2878 .tv_sec = lfu->lfu_atime_sec,
2879 .tv_nsec = lfu->lfu_atime_nsec,
2882 .tv_sec = lfu->lfu_mtime_sec,
2883 .tv_nsec = lfu->lfu_mtime_nsec,
2886 .tv_sec = lfu->lfu_ctime_sec,
2887 .tv_nsec = lfu->lfu_ctime_nsec,
/* ctime manipulation is privileged */
2893 if (!capable(CAP_SYS_ADMIN))
2896 if (!S_ISREG(inode->i_mode))
2900 rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2902 inode_unlock(inode);
/* Map a userspace lockahead mode to the internal cl_lock_mode.
 * (Return statements and the default case are elided from this
 * extract; presumably MODE_READ_USER -> CLM_READ, MODE_WRITE_USER ->
 * CLM_WRITE, unknown -> error — TODO confirm against full source.) */
2907 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2910 case MODE_READ_USER:
2912 case MODE_WRITE_USER:
/* Printable names for userspace lock modes, used in debug traces. */
2919 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2921 /* Used to allow the upper layers of the client to request an LDLM lock
2922 * without doing an actual read or write.
2924 * Used for ladvise lockahead to manually request specific locks.
2926 * \param[in] file file this ladvise lock request is on
2927 * \param[in] ladvise ladvise struct describing this lock request
2929 * \retval 0 success, no detailed result available (sync requests
2930 * and requests sent to the server [not handled locally]
2931 * cannot return detailed results)
2932 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2933 * see definitions for details.
2934 * \retval negative negative errno on error
2936 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2938 struct lu_env *env = NULL;
2939 struct cl_io *io = NULL;
2940 struct cl_lock *lock = NULL;
2941 struct cl_lock_descr *descr = NULL;
2942 struct dentry *dentry = file->f_path.dentry;
2943 struct inode *inode = dentry->d_inode;
2944 enum cl_lock_mode cl_mode;
2945 off_t start = ladvise->lla_start;
2946 off_t end = ladvise->lla_end;
2952 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2953 "start=%llu, end=%llu\n", dentry->d_name.len,
2954 dentry->d_name.name, dentry->d_inode,
2955 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2958 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
/* negative cl_mode means the user mode did not translate */
2960 GOTO(out, result = cl_mode);
2962 /* Get IO environment */
2963 result = cl_io_get(inode, &env, &io, &refcheck);
2967 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2970 * nothing to do for this io. This currently happens when
2971 * stripe sub-object's are not yet created.
2973 result = io->ci_result;
2974 } else if (result == 0) {
2975 lock = vvp_env_lock(env);
2976 descr = &lock->cll_descr;
2978 descr->cld_obj = io->ci_obj;
2979 /* Convert byte offsets to pages */
2980 descr->cld_start = cl_index(io->ci_obj, start);
2981 descr->cld_end = cl_index(io->ci_obj, end);
2982 descr->cld_mode = cl_mode;
2983 /* CEF_MUST is used because we do not want to convert a
2984 * lockahead request to a lockless lock */
2985 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
/* LF_ASYNC requests become speculative (glimpse-style) enqueues */
2988 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2989 descr->cld_enq_flags |= CEF_SPECULATIVE;
2991 result = cl_lock_request(env, io, lock);
2993 /* On success, we need to release the lock */
2995 cl_lock_release(env, lock);
2997 cl_io_fini(env, io);
2998 cl_env_put(env, &refcheck);
3000 /* -ECANCELED indicates a matching lock with a different extent
3001 * was already present, and -EEXIST indicates a matching lock
3002 * on exactly the same extent was already present.
3003 * We convert them to positive values for userspace to make
3004 * recognizing true errors easier.
3005 * Note we can only return these detailed results on async requests,
3006 * as sync requests look the same as i/o requests for locking. */
3007 if (result == -ECANCELED)
3008 result = LLA_RESULT_DIFFERENT;
3009 else if (result == -EEXIST)
3010 result = LLA_RESULT_SAME;
/* Printable names for ladvise advice types, used in debug messages. */
3015 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
/*
 * Validate one ladvise entry before it is acted on: known advice type,
 * per-advice flags within the allowed mask, and (for range-based
 * advices) a sane start/end range. Returns 0 or negative errno.
 * NOTE(review): the rc assignments preceding each CDEBUG are elided
 * from this extract.
 */
3017 static int ll_ladvise_sanity(struct inode *inode,
3018 struct llapi_lu_ladvise *ladvise)
3020 struct ll_sb_info *sbi = ll_i2sbi(inode);
3021 enum lu_ladvise_type advice = ladvise->lla_advice;
3022 /* Note the peradvice flags is a 32 bit field, so per advice flags must
3023 * be in the first 32 bits of enum ladvise_flags */
3024 __u32 flags = ladvise->lla_peradvice_flags;
3025 /* 3 lines at 80 characters per line, should be plenty */
3028 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
3030 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
3031 "last supported advice is %s (value '%d'): rc = %d\n",
3032 sbi->ll_fsname, advice,
3033 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
3037 /* Per-advice checks */
3039 case LU_LADVISE_LOCKNOEXPAND:
3040 if (flags & ~LF_LOCKNOEXPAND_MASK) {
3042 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
3043 "rc = %d\n", sbi->ll_fsname, flags,
3044 ladvise_names[advice], rc);
3048 case LU_LADVISE_LOCKAHEAD:
3049 /* Currently only READ and WRITE modes can be requested */
3050 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
3051 ladvise->lla_lockahead_mode == 0) {
3053 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
3054 "rc = %d\n", sbi->ll_fsname,
3055 ladvise->lla_lockahead_mode,
3056 ladvise_names[advice], rc);
3060 case LU_LADVISE_WILLREAD:
3061 case LU_LADVISE_DONTNEED:
3063 /* Note fall through above - These checks apply to all advices
3064 * except LOCKNOEXPAND */
3065 if (flags & ~LF_DEFAULT_MASK) {
3067 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
3068 "rc = %d\n", sbi->ll_fsname, flags,
3069 ladvise_names[advice], rc);
3072 if (ladvise->lla_start >= ladvise->lla_end) {
3074 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
3075 "for %s: rc = %d\n", sbi->ll_fsname,
3076 ladvise->lla_start, ladvise->lla_end,
3077 ladvise_names[advice], rc);
3089 * Give file access advices
3091 * The ladvise interface is similar to Linux fadvise() system call, except it
3092 * forwards the advices directly from Lustre client to server. The server side
3093 * codes will apply appropriate read-ahead and caching techniques for the
3094 * corresponding files.
3096 * A typical workload for ladvise is e.g. a bunch of different clients are
3097 * doing small random reads of a file, so prefetching pages into OSS cache
3098 * with big linear reads before the random IO is a net benefit. Fetching
3099 * all that data into each client cache with fadvise() may not be, due to
3100 * much more data being sent to the client.
/* Runs a CIT_LADVISE cl_io carrying one advice entry to the server(s). */
3102 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
3103 struct llapi_lu_ladvise *ladvise)
3107 struct cl_ladvise_io *lio;
3112 env = cl_env_get(&refcheck);
3114 RETURN(PTR_ERR(env));
3116 io = vvp_env_thread_io(env);
3117 io->ci_obj = ll_i2info(inode)->lli_clob;
3119 /* initialize parameters for ladvise */
3120 lio = &io->u.ci_ladvise;
3121 lio->li_start = ladvise->lla_start;
3122 lio->li_end = ladvise->lla_end;
3123 lio->li_fid = ll_inode2fid(inode);
3124 lio->li_advice = ladvise->lla_advice;
3125 lio->li_flags = flags;
3127 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
3128 rc = cl_io_loop(env, io);
3132 cl_io_fini(env, io);
3133 cl_env_put(env, &refcheck);
/* Enable/disable DLM lock expansion for this open file descriptor:
 * LF_UNSET in @flags clears the no-expand bit, otherwise it is set.
 * (The return 0 and braces are elided from this extract.) */
3137 static int ll_lock_noexpand(struct file *file, int flags)
3139 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3141 fd->ll_lock_no_expand = !(flags & LF_UNSET);
/*
 * FS_IOC_FSGETXATTR handler: report the inode's xflags and project id
 * to userspace in a struct fsxattr.
 * NOTE(review): the -EFAULT returns after the copy calls are elided
 * from this extract.
 */
3146 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
3149 struct fsxattr fsxattr;
/* read the caller's struct first (matches the fssetxattr convention) */
3151 if (copy_from_user(&fsxattr,
3152 (const struct fsxattr __user *)arg,
3156 fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
3157 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
3158 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
3159 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
3160 if (copy_to_user((struct fsxattr __user *)arg,
3161 &fsxattr, sizeof(fsxattr)))
/*
 * Reject project-quota state changes attempted from a non-init user
 * namespace; callers in the init namespace pass unconditionally.
 * (The return statements between the checks are elided from this
 * extract.)
 */
3167 int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
3170 * Project Quota ID state is only allowed to change from within the init
3171 * namespace. Enforce that restriction only if we are trying to change
3172 * the quota ID state. Everything else is allowed in user namespaces.
3174 if (current_user_ns() == &init_user_ns)
/* changing the project id from a user namespace is forbidden */
3177 if (ll_i2info(inode)->lli_projid != fa->fsx_projid)
/* likewise toggling PROJINHERIT in either direction */
3180 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) {
3181 if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
3184 if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
/*
 * FS_IOC_FSSETXATTR handler: update inode flags and project id on the
 * MDT via md_setattr(), then propagate the flag change to the OSTs
 * through cl_setattr_ost() when a data object exists.
 * NOTE(review): -EFAULT return, the obj == NULL check and out_fsxattr
 * label are elided from this extract.
 */
3191 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3195 struct md_op_data *op_data;
3196 struct ptlrpc_request *req = NULL;
3198 struct fsxattr fsxattr;
3199 struct cl_object *obj;
3203 if (copy_from_user(&fsxattr,
3204 (const struct fsxattr __user *)arg,
/* enforce user-namespace restrictions before touching anything */
3208 rc = ll_ioctl_check_project(inode, &fsxattr);
3212 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3213 LUSTRE_OPC_ANY, NULL);
3214 if (IS_ERR(op_data))
3215 RETURN(PTR_ERR(op_data));
3217 flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3218 op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3219 if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3220 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3221 op_data->op_projid = fsxattr.fsx_projid;
3222 op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
3223 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3225 ptlrpc_req_finished(req);
3227 GOTO(out_fsxattr, rc);
/* MDT accepted the change: mirror the flags into the local inode */
3228 ll_update_inode_flags(inode, op_data->op_attr_flags);
3229 obj = ll_i2info(inode)->lli_clob;
3231 GOTO(out_fsxattr, rc);
3233 OBD_ALLOC_PTR(attr);
3235 GOTO(out_fsxattr, rc = -ENOMEM);
/* push the flag update to the OST objects as well */
3237 rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3238 fsxattr.fsx_xflags);
3241 ll_finish_md_op_data(op_data);
/*
 * LL_LEASE_UNLCK path: give back the lease held on @file, optionally
 * carrying a close intent (resync-done, layout merge/split, PCC
 * attach) chosen by ioc->lil_flags. Extra per-intent arguments are
 * read from userspace at @arg past the ll_ioc_lease header.
 * Returns the lease type that was held, or a negative errno.
 * NOTE(review): many lines (fget NULL checks, break statements,
 * out_lease_close label, final RETURN) are elided from this extract;
 * comments describe only what is visible.
 */
3245 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3248 struct inode *inode = file_inode(file);
3249 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3250 struct ll_inode_info *lli = ll_i2info(inode);
3251 struct obd_client_handle *och = NULL;
3252 struct split_param sp;
3253 struct pcc_param param;
3254 bool lease_broken = false;
3256 enum mds_op_bias bias = 0;
3257 struct file *layout_file = NULL;
3259 size_t data_size = 0;
3260 bool attached = false;
/* detach the lease handle from the fd under the och mutex */
3265 mutex_lock(&lli->lli_och_mutex);
3266 if (fd->fd_lease_och != NULL) {
3267 och = fd->fd_lease_och;
3268 fd->fd_lease_och = NULL;
3270 mutex_unlock(&lli->lli_och_mutex);
3275 fmode = och->och_flags;
3277 switch (ioc->lil_flags) {
3278 case LL_LEASE_RESYNC_DONE:
3279 if (ioc->lil_count > IOC_IDS_MAX)
3280 GOTO(out_lease_close, rc = -EINVAL);
3282 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3283 OBD_ALLOC(data, data_size);
3285 GOTO(out_lease_close, rc = -ENOMEM);
3287 if (copy_from_user(data, (void __user *)arg, data_size))
3288 GOTO(out_lease_close, rc = -EFAULT);
3290 bias = MDS_CLOSE_RESYNC_DONE;
3292 case LL_LEASE_LAYOUT_MERGE: {
3295 if (ioc->lil_count != 1)
3296 GOTO(out_lease_close, rc = -EINVAL);
/* the victim fd follows the ll_ioc_lease header in userspace */
3298 arg += sizeof(*ioc);
3299 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3300 GOTO(out_lease_close, rc = -EFAULT);
3302 layout_file = fget(fd);
3304 GOTO(out_lease_close, rc = -EBADF);
/* merging requires write access on both files */
3306 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3307 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3308 GOTO(out_lease_close, rc = -EPERM);
3310 data = file_inode(layout_file);
3311 bias = MDS_CLOSE_LAYOUT_MERGE;
3314 case LL_LEASE_LAYOUT_SPLIT: {
3318 if (ioc->lil_count != 2)
3319 GOTO(out_lease_close, rc = -EINVAL);
/* split takes two __u32 args: victim fd then mirror id */
3321 arg += sizeof(*ioc);
3322 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3323 GOTO(out_lease_close, rc = -EFAULT);
3325 arg += sizeof(__u32);
3326 if (copy_from_user(&mirror_id, (void __user *)arg,
3328 GOTO(out_lease_close, rc = -EFAULT);
3330 layout_file = fget(fdv);
3332 GOTO(out_lease_close, rc = -EBADF);
3334 sp.sp_inode = file_inode(layout_file);
3335 sp.sp_mirror_id = (__u16)mirror_id;
3337 bias = MDS_CLOSE_LAYOUT_SPLIT;
3340 case LL_LEASE_PCC_ATTACH:
3341 if (ioc->lil_count != 1)
3344 arg += sizeof(*ioc);
/* NOTE(review): "¶m" below is mojibake for "&param" (HTML-entity
 * mangling in this extract); bytes preserved as found. */
3345 if (copy_from_user(¶m.pa_archive_id, (void __user *)arg,
3347 GOTO(out_lease_close, rc2 = -EFAULT);
3349 rc2 = pcc_readwrite_attach(file, inode, param.pa_archive_id);
3351 GOTO(out_lease_close, rc2);
3354 /* Grab latest data version */
3355 rc2 = ll_data_version(inode, ¶m.pa_data_version,
3358 GOTO(out_lease_close, rc2);
3361 bias = MDS_PCC_ATTACH;
3364 /* without close intent */
3369 rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3373 rc = ll_lease_och_release(inode, file);
/* per-intent cleanup after the close */
3382 switch (ioc->lil_flags) {
3383 case LL_LEASE_RESYNC_DONE:
3385 OBD_FREE(data, data_size);
3387 case LL_LEASE_LAYOUT_MERGE:
3388 case LL_LEASE_LAYOUT_SPLIT:
3392 case LL_LEASE_PCC_ATTACH:
3395 rc = pcc_readwrite_attach_fini(file, inode,
3396 param.pa_layout_gen,
/* report the lease type that was held back to the caller */
3403 rc = ll_lease_type_from_fmode(fmode);
/*
 * LL_IOC_SET_LEASE handler: acquire (or, for LL_LEASE_UNLCK, release)
 * a lease on @file. The requested mode must be compatible with the
 * file's open mode. With LL_LEASE_RESYNC, also kicks off a layout
 * resync and refreshes the layout version.
 * NOTE(review): RETURN lines for the -EPERM/-EINVAL paths and the
 * final RETURN are elided from this extract.
 */
3407 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3410 struct inode *inode = file_inode(file);
3411 struct ll_inode_info *lli = ll_i2info(inode);
3412 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3413 struct obd_client_handle *och = NULL;
3414 __u64 open_flags = 0;
3420 switch (ioc->lil_mode) {
3421 case LL_LEASE_WRLCK:
3422 if (!(file->f_mode & FMODE_WRITE))
3424 fmode = FMODE_WRITE;
3426 case LL_LEASE_RDLCK:
3427 if (!(file->f_mode & FMODE_READ))
3431 case LL_LEASE_UNLCK:
3432 RETURN(ll_file_unlock_lease(file, ioc, arg));
3437 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3439 /* apply for lease */
3440 if (ioc->lil_flags & LL_LEASE_RESYNC)
3441 open_flags = MDS_OPEN_RESYNC;
3442 och = ll_lease_open(inode, file, fmode, open_flags);
3444 RETURN(PTR_ERR(och));
3446 if (ioc->lil_flags & LL_LEASE_RESYNC) {
3447 rc = ll_lease_file_resync(och, inode, arg);
/* resync setup failed: give the lease back before erroring out */
3449 ll_lease_close(och, inode, NULL);
3452 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3454 ll_lease_close(och, inode, NULL);
/* publish the new lease handle on the fd if none is installed yet */
3460 mutex_lock(&lli->lli_och_mutex);
3461 if (fd->fd_lease_och == NULL) {
3462 fd->fd_lease_och = och;
3465 mutex_unlock(&lli->lli_och_mutex);
3467 /* impossible now that only excl is supported for now */
3468 ll_lease_close(och, inode, &lease_broken);
/*
 * Fill @heat with the current file-heat values of @inode: one decayed
 * counter per heat instance, up to heat->lh_count entries, all sampled
 * under lli_heat_lock for a consistent snapshot.
 */
3474 static void ll_heat_get(struct inode *inode, struct lu_heat *heat)
3476 struct ll_inode_info *lli = ll_i2info(inode);
3477 struct ll_sb_info *sbi = ll_i2sbi(inode);
3478 __u64 now = ktime_get_real_seconds();
3481 spin_lock(&lli->lli_heat_lock);
3482 heat->lh_flags = lli->lli_heat_flags;
3483 for (i = 0; i < heat->lh_count; i++)
3484 heat->lh_heat[i] = obd_heat_get(&lli->lli_heat_instances[i],
3485 now, sbi->ll_heat_decay_weight,
3486 sbi->ll_heat_period_second);
3487 spin_unlock(&lli->lli_heat_lock);
/*
 * Apply heat control @flags to @inode: LU_HEAT_FLAG_CLEAR resets all
 * heat counters; LU_HEAT_FLAG_OFF disables (else re-enables) heat
 * accounting. Performed under lli_heat_lock.
 * (The return statement is elided from this extract.)
 */
3490 static int ll_heat_set(struct inode *inode, enum lu_heat_flag flags)
3492 struct ll_inode_info *lli = ll_i2info(inode);
3495 spin_lock(&lli->lli_heat_lock);
3496 if (flags & LU_HEAT_FLAG_CLEAR)
3497 obd_heat_clear(lli->lli_heat_instances, OBD_HEAT_COUNT);
3499 if (flags & LU_HEAT_FLAG_OFF)
3500 lli->lli_heat_flags |= LU_HEAT_FLAG_OFF;
3502 lli->lli_heat_flags &= ~LU_HEAT_FLAG_OFF;
3504 spin_unlock(&lli->lli_heat_lock);
/*
 * unlocked_ioctl entry point for regular Lustre files: dispatches the
 * full set of LL_IOC_* / OBD_IOC_* / FS_IOC_* commands, copying
 * argument structures from/to userspace as required and delegating to
 * the per-feature helpers defined above.
 * NOTE(review): this extract elides many lines (ENTRY/RETURN, break
 * statements, several error checks and labels); comments below cover
 * only what is visible.
 */
3510 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3512 struct inode *inode = file_inode(file);
3513 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3517 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3518 PFID(ll_inode2fid(inode)), inode, cmd);
3519 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3521 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
3522 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3526 case LL_IOC_GETFLAGS:
3527 /* Get the current value of the file flags */
3528 return put_user(fd->fd_flags, (int __user *)arg);
3529 case LL_IOC_SETFLAGS:
3530 case LL_IOC_CLRFLAGS:
3531 /* Set or clear specific file flags */
3532 /* XXX This probably needs checks to ensure the flags are
3533 * not abused, and to handle any flag side effects.
3535 if (get_user(flags, (int __user *) arg))
3538 if (cmd == LL_IOC_SETFLAGS) {
3539 if ((flags & LL_FILE_IGNORE_LOCK) &&
3540 !(file->f_flags & O_DIRECT)) {
3541 CERROR("%s: unable to disable locking on "
3542 "non-O_DIRECT file\n", current->comm);
3546 fd->fd_flags |= flags;
3548 fd->fd_flags &= ~flags;
3551 case LL_IOC_LOV_SETSTRIPE:
3552 case LL_IOC_LOV_SETSTRIPE_NEW:
3553 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3554 case LL_IOC_LOV_SETEA:
3555 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3556 case LL_IOC_LOV_SWAP_LAYOUTS: {
3558 struct lustre_swap_layouts lsl;
3560 if (copy_from_user(&lsl, (char __user *)arg,
3561 sizeof(struct lustre_swap_layouts)))
/* both fds must be writable to swap layouts */
3564 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3567 file2 = fget(lsl.sl_fd);
3571 /* O_WRONLY or O_RDWR */
3572 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3573 GOTO(out, rc = -EPERM);
3575 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3576 struct inode *inode2;
3577 struct ll_inode_info *lli;
3578 struct obd_client_handle *och = NULL;
/* close-on-swap needs the lease handle held on this fd */
3580 lli = ll_i2info(inode);
3581 mutex_lock(&lli->lli_och_mutex);
3582 if (fd->fd_lease_och != NULL) {
3583 och = fd->fd_lease_och;
3584 fd->fd_lease_och = NULL;
3586 mutex_unlock(&lli->lli_och_mutex);
3588 GOTO(out, rc = -ENOLCK);
3589 inode2 = file_inode(file2);
3590 rc = ll_swap_layouts_close(och, inode, inode2);
3592 rc = ll_swap_layouts(file, file2, &lsl);
3598 case LL_IOC_LOV_GETSTRIPE:
3599 case LL_IOC_LOV_GETSTRIPE_NEW:
3600 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3601 case FS_IOC_GETFLAGS:
3602 case FS_IOC_SETFLAGS:
3603 RETURN(ll_iocontrol(inode, file, cmd, arg));
3604 case FSFILT_IOC_GETVERSION:
3605 case FS_IOC_GETVERSION:
3606 RETURN(put_user(inode->i_generation, (int __user *)arg));
3607 /* We need to special case any other ioctls we want to handle,
3608 * to send them to the MDS/OST as appropriate and to properly
3609 * network encode the arg field. */
3610 case FS_IOC_SETVERSION:
3613 case LL_IOC_GROUP_LOCK:
3614 RETURN(ll_get_grouplock(inode, file, arg));
3615 case LL_IOC_GROUP_UNLOCK:
3616 RETURN(ll_put_grouplock(inode, file, arg));
3617 case IOC_OBD_STATFS:
3618 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3620 case LL_IOC_FLUSHCTX:
3621 RETURN(ll_flush_ctx(inode));
3622 case LL_IOC_PATH2FID: {
3623 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3624 sizeof(struct lu_fid)))
3629 case LL_IOC_GETPARENT:
3630 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3632 case OBD_IOC_FID2PATH:
3633 RETURN(ll_fid2path(inode, (void __user *)arg));
3634 case LL_IOC_DATA_VERSION: {
3635 struct ioc_data_version idv;
3638 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
/* only the flush flags are meaningful from userspace */
3641 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3642 rc = ll_ioc_data_version(inode, &idv);
3645 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3651 case LL_IOC_GET_MDTIDX: {
3654 mdtidx = ll_get_mdt_idx(inode);
3658 if (put_user((int)mdtidx, (int __user *)arg))
3663 case OBD_IOC_GETDTNAME:
3664 case OBD_IOC_GETMDNAME:
3665 RETURN(ll_get_obd_name(inode, cmd, arg));
3666 case LL_IOC_HSM_STATE_GET: {
3667 struct md_op_data *op_data;
3668 struct hsm_user_state *hus;
3675 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3676 LUSTRE_OPC_ANY, hus);
3677 if (IS_ERR(op_data)) {
3679 RETURN(PTR_ERR(op_data));
3682 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3685 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3688 ll_finish_md_op_data(op_data);
3692 case LL_IOC_HSM_STATE_SET: {
3693 struct hsm_state_set *hss;
3700 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3705 rc = ll_hsm_state_set(inode, hss);
3710 case LL_IOC_HSM_ACTION: {
3711 struct md_op_data *op_data;
3712 struct hsm_current_action *hca;
3719 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3720 LUSTRE_OPC_ANY, hca);
3721 if (IS_ERR(op_data)) {
3723 RETURN(PTR_ERR(op_data));
3726 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3729 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3732 ll_finish_md_op_data(op_data);
3736 case LL_IOC_SET_LEASE_OLD: {
/* legacy interface: arg is the lease mode itself, no struct */
3737 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3739 RETURN(ll_file_set_lease(file, &ioc, 0));
3741 case LL_IOC_SET_LEASE: {
3742 struct ll_ioc_lease ioc;
3744 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3747 RETURN(ll_file_set_lease(file, &ioc, arg));
3749 case LL_IOC_GET_LEASE: {
3750 struct ll_inode_info *lli = ll_i2info(inode);
3751 struct ldlm_lock *lock = NULL;
3754 mutex_lock(&lli->lli_och_mutex);
3755 if (fd->fd_lease_och != NULL) {
3756 struct obd_client_handle *och = fd->fd_lease_och;
/* only report the lease if its DLM lock is still valid */
3758 lock = ldlm_handle2lock(&och->och_lease_handle);
3760 lock_res_and_lock(lock);
3761 if (!ldlm_is_cancel(lock))
3762 fmode = och->och_flags;
3764 unlock_res_and_lock(lock);
3765 LDLM_LOCK_PUT(lock);
3768 mutex_unlock(&lli->lli_och_mutex);
3770 RETURN(ll_lease_type_from_fmode(fmode));
3772 case LL_IOC_HSM_IMPORT: {
3773 struct hsm_user_import *hui;
3779 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3784 rc = ll_hsm_import(inode, file, hui);
3789 case LL_IOC_FUTIMES_3: {
3790 struct ll_futimes_3 lfu;
3792 if (copy_from_user(&lfu,
3793 (const struct ll_futimes_3 __user *)arg,
3797 RETURN(ll_file_futimes_3(file, &lfu));
3799 case LL_IOC_LADVISE: {
3800 struct llapi_ladvise_hdr *k_ladvise_hdr;
3801 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3804 int alloc_size = sizeof(*k_ladvise_hdr);
/* two-pass copy: read the fixed header first to learn lah_count,
 * then reallocate and copy the full variable-length request */
3807 u_ladvise_hdr = (void __user *)arg;
3808 OBD_ALLOC_PTR(k_ladvise_hdr);
3809 if (k_ladvise_hdr == NULL)
3812 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3813 GOTO(out_ladvise, rc = -EFAULT);
3815 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3816 k_ladvise_hdr->lah_count < 1)
3817 GOTO(out_ladvise, rc = -EINVAL);
3819 num_advise = k_ladvise_hdr->lah_count;
3820 if (num_advise >= LAH_COUNT_MAX)
3821 GOTO(out_ladvise, rc = -EFBIG);
3823 OBD_FREE_PTR(k_ladvise_hdr);
3824 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3825 lah_advise[num_advise]);
3826 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3827 if (k_ladvise_hdr == NULL)
3831 * TODO: submit multiple advices to one server in a single RPC
3833 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3834 GOTO(out_ladvise, rc = -EFAULT);
3836 for (i = 0; i < num_advise; i++) {
3837 struct llapi_lu_ladvise *k_ladvise =
3838 &k_ladvise_hdr->lah_advise[i];
3839 struct llapi_lu_ladvise __user *u_ladvise =
3840 &u_ladvise_hdr->lah_advise[i];
3842 rc = ll_ladvise_sanity(inode, k_ladvise);
3844 GOTO(out_ladvise, rc);
3846 switch (k_ladvise->lla_advice) {
3847 case LU_LADVISE_LOCKNOEXPAND:
3848 rc = ll_lock_noexpand(file,
3849 k_ladvise->lla_peradvice_flags);
3850 GOTO(out_ladvise, rc);
3851 case LU_LADVISE_LOCKAHEAD:
3853 rc = ll_file_lock_ahead(file, k_ladvise);
3856 GOTO(out_ladvise, rc);
/* store the per-advice lockahead result back to userspace */
3859 &u_ladvise->lla_lockahead_result))
3860 GOTO(out_ladvise, rc = -EFAULT);
3863 rc = ll_ladvise(inode, file,
3864 k_ladvise_hdr->lah_flags,
3867 GOTO(out_ladvise, rc);
3874 OBD_FREE(k_ladvise_hdr, alloc_size);
3877 case LL_IOC_FLR_SET_MIRROR: {
3878 /* mirror I/O must be direct to avoid polluting page cache
3880 if (!(file->f_flags & O_DIRECT))
3883 fd->fd_designated_mirror = (__u32)arg;
3886 case LL_IOC_FSGETXATTR:
3887 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3888 case LL_IOC_FSSETXATTR:
3889 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3891 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3892 case LL_IOC_HEAT_GET: {
3893 struct lu_heat uheat;
3894 struct lu_heat *heat;
3897 if (copy_from_user(&uheat, (void __user *)arg, sizeof(uheat)))
/* never report more heat slots than the client maintains */
3900 if (uheat.lh_count > OBD_HEAT_COUNT)
3901 uheat.lh_count = OBD_HEAT_COUNT;
3903 size = offsetof(typeof(uheat), lh_heat[uheat.lh_count]);
3904 OBD_ALLOC(heat, size);
3908 heat->lh_count = uheat.lh_count;
3909 ll_heat_get(inode, heat);
3910 rc = copy_to_user((char __user *)arg, heat, size);
3911 OBD_FREE(heat, size);
3912 RETURN(rc ? -EFAULT : 0);
3914 case LL_IOC_HEAT_SET: {
3917 if (copy_from_user(&flags, (void __user *)arg, sizeof(flags)))
3920 rc = ll_heat_set(inode, flags);
3923 case LL_IOC_PCC_DETACH: {
3924 struct lu_pcc_detach *detach;
3926 OBD_ALLOC_PTR(detach);
3930 if (copy_from_user(detach,
3931 (const struct lu_pcc_detach __user *)arg,
3933 GOTO(out_detach_free, rc = -EFAULT);
3935 if (!S_ISREG(inode->i_mode))
3936 GOTO(out_detach_free, rc = -EINVAL);
/* only the file owner or a privileged caller may detach from PCC */
3938 if (!inode_owner_or_capable(inode))
3939 GOTO(out_detach_free, rc = -EPERM);
3941 rc = pcc_ioctl_detach(inode, detach->pccd_opt);
3943 OBD_FREE_PTR(detach);
3946 case LL_IOC_PCC_STATE: {
3947 struct lu_pcc_state __user *ustate =
3948 (struct lu_pcc_state __user *)arg;
3949 struct lu_pcc_state *state;
3951 OBD_ALLOC_PTR(state);
3955 if (copy_from_user(state, ustate, sizeof(*state)))
3956 GOTO(out_state, rc = -EFAULT);
3958 rc = pcc_ioctl_state(file, inode, state);
3960 GOTO(out_state, rc);
3962 if (copy_to_user(ustate, state, sizeof(*state)))
3963 GOTO(out_state, rc = -EFAULT);
3966 OBD_FREE_PTR(state);
/* default: unrecognized commands are punted to the data export */
3970 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3971 (void __user *)arg));
3975 #ifndef HAVE_FILE_LLSEEK_SIZE
/* Compat copy of the kernel's llseek_execute() for kernels lacking
 * generic_file_llseek_size(): validate @offset against sign rules and
 * @maxsize, then commit it to f_pos (resetting f_version).
 * (The -EINVAL returns and final return are elided in this extract.) */
3976 static inline loff_t
3977 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3979 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3981 if (offset > maxsize)
3984 if (offset != file->f_pos) {
3985 file->f_pos = offset;
3986 file->f_version = 0;
/*
 * Compat implementation of generic_file_llseek_size() (only built when
 * the kernel does not provide it): seek within @file honouring
 * @maxsize as the limit and @eof for SEEK_END/SEEK_HOLE/SEEK_DATA.
 * NOTE(review): the switch statement and most case labels are elided
 * from this extract; remaining comments are the originals.
 */
3992 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3993 loff_t maxsize, loff_t eof)
3995 struct inode *inode = file_inode(file);
4003 * Here we special-case the lseek(fd, 0, SEEK_CUR)
4004 * position-querying operation. Avoid rewriting the "same"
4005 * f_pos value back to the file because a concurrent read(),
4006 * write() or lseek() might have altered it
4011 * f_lock protects against read/modify/write race with other
4012 * SEEK_CURs. Note that parallel writes and reads behave
4016 offset = llseek_execute(file, file->f_pos + offset, maxsize);
4017 inode_unlock(inode);
4021 * In the generic case the entire file is data, so as long as
4022 * offset isn't at the end of the file then the offset is data.
4029 * There is a virtual hole at the end of the file, so as long as
4030 * offset isn't i_size or larger, return i_size.
4038 return llseek_execute(file, offset, maxsize);
/*
 * llseek() handler for Lustre regular files.
 * For SEEK_END/SEEK_HOLE/SEEK_DATA the true size must be known, so the
 * file size is glimpsed from the OSTs first; the actual seek is then
 * delegated to ll_generic_file_llseek_size() bounded by the filesystem
 * maximum byte offset.  Elapsed time is tallied into LPROC_LL_LLSEEK.
 */
4042 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
4044 struct inode *inode = file_inode(file);
4045 loff_t retval, eof = 0;
4046 ktime_t kstart = ktime_get();
/* Compute the absolute target only for the debug message below. */
4049 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
4050 (origin == SEEK_CUR) ? file->f_pos : 0);
4051 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
4052 PFID(ll_inode2fid(inode)), inode, retval, retval,
/* These origins depend on an up-to-date i_size: glimpse it first. */
4055 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
4056 retval = ll_glimpse_size(inode);
4059 eof = i_size_read(inode);
4062 retval = ll_generic_file_llseek_size(file, offset, origin,
4063 ll_file_maxbytes(inode), eof);
4065 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK,
4066 ktime_us_delta(ktime_get(), kstart));
/*
 * flush() handler, called on every close() of a file descriptor.
 * Does not write data itself; it only reports asynchronous writeback
 * errors recorded earlier for this inode, collapsing any of them to -EIO.
 * Errors already delivered to the application via fd_write_failed are not
 * reported a second time.
 */
4070 static int ll_flush(struct file *file, fl_owner_t id)
4072 struct inode *inode = file_inode(file);
4073 struct ll_inode_info *lli = ll_i2info(inode);
4074 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
/* Directories are never flushed through this path. */
4077 LASSERT(!S_ISDIR(inode->i_mode));
4079 /* catch async errors that were recorded back when async writeback
4080 * failed for pages in this mapping. */
/* Consume-and-clear: the recorded async rc is reported exactly once. */
4081 rc = lli->lli_async_rc;
4082 lli->lli_async_rc = 0;
4083 if (lli->lli_clob != NULL) {
/* Also harvest async errors recorded at the stripe (LOV) level. */
4084 err = lov_read_and_clear_async_rc(lli->lli_clob);
4089 /* The application has been told write failure already.
4090 * Do not report failure again. */
4091 if (fd->fd_write_failed)
/* Any remaining recorded error is reported as -EIO. */
4093 return rc ? -EIO : 0;
4097 * Called to make sure a portion of file has been written out.
4098 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
4100 * Return how many pages have been written.
/*
 * Builds and runs a CIT_FSYNC cl_io over [start, end] of @inode.
 * @mode must be one of CL_FSYNC_{NONE,LOCAL,DISCARD,ALL}; @ignore_layout
 * lets the sync proceed regardless of layout changes.
 * Returns fio->fi_nr_written (pages written) on success, negative errno
 * on failure.  NOTE(review): some lines (-EINVAL return, result checks)
 * are elided in this view.
 */
4102 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
4103 enum cl_fsync_mode mode, int ignore_layout)
4107 struct cl_fsync_io *fio;
/* Reject any mode outside the supported set. */
4112 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
4113 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
4116 env = cl_env_get(&refcheck);
4118 RETURN(PTR_ERR(env));
4120 io = vvp_env_thread_io(env);
4121 io->ci_obj = ll_i2info(inode)->lli_clob;
4122 io->ci_ignore_layout = ignore_layout;
4124 /* initialize parameters for sync */
4125 fio = &io->u.ci_fsync;
4126 fio->fi_start = start;
4128 fio->fi_fid = ll_inode2fid(inode);
4129 fio->fi_mode = mode;
4130 fio->fi_nr_written = 0;
4132 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
4133 result = cl_io_loop(env, io);
4135 result = io->ci_result;
/* On success the return value is the count of pages written. */
4137 result = fio->fi_nr_written;
4138 cl_io_fini(env, io);
4139 cl_env_put(env, &refcheck);
4145 * When dentry is provided (the 'else' case), file_dentry() may be
4146 * null and dentry must be used directly rather than pulled from
4147 * file_dentry() as is done otherwise.
/*
 * fsync()/fdatasync() handler.  Order of operations:
 *   1. wait for local dirty pages in [start, end] (filemap write/wait);
 *   2. harvest recorded async writeback errors for regular files;
 *   3. sync metadata on the MDT (md_fsync);
 *   4. for regular files, sync cached data — via PCC if the file is
 *      PCC-cached, otherwise via cl_sync_file_range(CL_FSYNC_ALL) —
 *      and update fd_write_failed accordingly.
 * Elapsed time is tallied into LPROC_LL_FSYNC.
 */
4150 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
4152 struct dentry *dentry = file_dentry(file);
4153 struct inode *inode = dentry->d_inode;
4154 struct ll_inode_info *lli = ll_i2info(inode);
4155 struct ptlrpc_request *req;
4156 ktime_t kstart = ktime_get();
4161 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), start %lld, end %lld,"
4163 PFID(ll_inode2fid(inode)), inode, start, end, datasync);
4165 /* fsync's caller has already called _fdata{sync,write}, we want
4166 * that IO to finish before calling the osc and mdc sync methods */
4167 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
4170 /* catch async errors that were recorded back when async writeback
4171 * failed for pages in this mapping. */
4172 if (!S_ISDIR(inode->i_mode)) {
/* Consume-and-clear the per-inode async error, as in ll_flush(). */
4173 err = lli->lli_async_rc;
4174 lli->lli_async_rc = 0;
4177 if (lli->lli_clob != NULL) {
4178 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* Sync metadata on the MDT before syncing data. */
4184 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
4188 ptlrpc_req_finished(req);
4190 if (S_ISREG(inode->i_mode)) {
4191 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
4194 /* Sync metadata on MDT first, and then sync the cached data
/* Try PCC first; 'cached' tells whether PCC handled the range. */
4197 err = pcc_fsync(file, start, end, datasync, &cached);
4199 err = cl_sync_file_range(inode, start, end,
4201 if (rc == 0 && err < 0)
/* Record whether writes failed, so ll_flush() won't re-report. */
4204 fd->fd_write_failed = true;
4206 fd->fd_write_failed = false;
4209 inode_unlock(inode);
4212 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC,
4213 ktime_us_delta(ktime_get(), kstart));
/*
 * flock()/fcntl() byte-range lock handler.  Translates a VFS file_lock
 * into an LDLM_FLOCK enqueue to the MDT, then mirrors the result into
 * the local lock tables (locks_lock_file_wait() or the older
 * flock/posix_lock_file_wait() pair).  If the local step fails after a
 * successful server enqueue, the server lock is rolled back by
 * re-enqueueing it with LCK_NL (unlock).  Elapsed time is tallied into
 * LPROC_LL_FLOCK.  NOTE(review): the SEEK of several case labels and
 * RETURN statements are elided in this listing view.
 */
4218 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
4220 struct inode *inode = file_inode(file);
4221 struct ll_sb_info *sbi = ll_i2sbi(inode);
4222 struct ldlm_enqueue_info einfo = {
4223 .ei_type = LDLM_FLOCK,
4224 .ei_cb_cp = ldlm_flock_completion_ast,
4225 .ei_cbdata = file_lock,
4227 struct md_op_data *op_data;
4228 struct lustre_handle lockh = { 0 };
4229 union ldlm_policy_data flock = { { 0 } };
4230 int fl_type = file_lock->fl_type;
4231 ktime_t kstart = ktime_get();
4237 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
4238 PFID(ll_inode2fid(inode)), file_lock);
/* Fill the LDLM policy from the VFS lock: whole file for BSD flock,
 * explicit [start, end] for POSIX fcntl locks. */
4240 if (file_lock->fl_flags & FL_FLOCK) {
4241 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
4242 /* flocks are whole-file locks */
4243 flock.l_flock.end = OFFSET_MAX;
4244 /* For flocks owner is determined by the local file desctiptor*/
4245 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
4246 } else if (file_lock->fl_flags & FL_POSIX) {
4247 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
4248 flock.l_flock.start = file_lock->fl_start;
4249 flock.l_flock.end = file_lock->fl_end;
4253 flock.l_flock.pid = file_lock->fl_pid;
4255 #if defined(HAVE_LM_COMPARE_OWNER) || defined(lm_compare_owner)
4256 /* Somewhat ugly workaround for svc lockd.
4257 * lockd installs custom fl_lmops->lm_compare_owner that checks
4258 * for the fl_owner to be the same (which it always is on local node
4259 * I guess between lockd processes) and then compares pid.
4260 * As such we assign pid to the owner field to make it all work,
4261 * conflict with normal locks is unlikely since pid space and
4262 * pointer space for current->files are not intersecting */
4263 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
4264 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* Map the fcntl lock type to an LDLM mode: read -> PR, write -> PW,
 * unlock -> NL (see comment below). */
4269 einfo.ei_mode = LCK_PR;
4272 /* An unlock request may or may not have any relation to
4273 * existing locks so we may not be able to pass a lock handle
4274 * via a normal ldlm_lock_cancel() request. The request may even
4275 * unlock a byte range in the middle of an existing lock. In
4276 * order to process an unlock request we need all of the same
4277 * information that is given with a normal read or write record
4278 * lock request. To avoid creating another ldlm unlock (cancel)
4279 * message we'll treat a LCK_NL flock request as an unlock. */
4280 einfo.ei_mode = LCK_NL;
4283 einfo.ei_mode = LCK_PW;
4286 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* Non-blocking variants set NOWAIT; F_GETLK-style queries TEST_LOCK. */
4301 flags = LDLM_FL_BLOCK_NOWAIT;
4307 flags = LDLM_FL_TEST_LOCK;
4310 CERROR("unknown fcntl lock command: %d\n", cmd);
4314 /* Save the old mode so that if the mode in the lock changes we
4315 * can decrement the appropriate reader or writer refcount. */
4316 file_lock->fl_type = einfo.ei_mode;
4318 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4319 LUSTRE_OPC_ANY, NULL);
4320 if (IS_ERR(op_data))
4321 RETURN(PTR_ERR(op_data));
4323 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4324 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4325 flock.l_flock.pid, flags, einfo.ei_mode,
4326 flock.l_flock.start, flock.l_flock.end);
/* Enqueue the flock with the MDT. */
4328 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4331 /* Restore the file lock type if not TEST lock. */
4332 if (!(flags & LDLM_FL_TEST_LOCK))
4333 file_lock->fl_type = fl_type;
/* Mirror a successful (or unlock) result into the kernel's own lock
 * bookkeeping so local lock state stays consistent. */
4335 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4336 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4337 !(flags & LDLM_FL_TEST_LOCK))
4338 rc2 = locks_lock_file_wait(file, file_lock);
4340 if ((file_lock->fl_flags & FL_FLOCK) &&
4341 (rc == 0 || file_lock->fl_type == F_UNLCK))
4342 rc2 = flock_lock_file_wait(file, file_lock);
4343 if ((file_lock->fl_flags & FL_POSIX) &&
4344 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4345 !(flags & LDLM_FL_TEST_LOCK))
4346 rc2 = posix_lock_file_wait(file, file_lock);
4347 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
/* Local bookkeeping failed: undo the server-side lock via LCK_NL. */
4349 if (rc2 && file_lock->fl_type != F_UNLCK) {
4350 einfo.ei_mode = LCK_NL;
4351 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4356 ll_finish_md_op_data(op_data);
4359 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK,
4360 ktime_us_delta(ktime_get(), kstart));
/*
 * Look up the FID of @name (length @namelen) under directory @parent via
 * an MDT getattr-by-name RPC.  On success *fid is filled from the reply
 * body; if @inode is non-NULL the reply is additionally instantiated
 * into an inode via ll_prep_inode().
 * Returns 0 on success, negative errno on failure (-EFAULT if the reply
 * body is missing).
 */
4364 int ll_get_fid_by_name(struct inode *parent, const char *name,
4365 int namelen, struct lu_fid *fid,
4366 struct inode **inode)
4368 struct md_op_data *op_data = NULL;
4369 struct mdt_body *body;
4370 struct ptlrpc_request *req;
4374 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4375 LUSTRE_OPC_ANY, NULL);
4376 if (IS_ERR(op_data))
4377 RETURN(PTR_ERR(op_data));
/* Only the FID and file type are needed from the MDT. */
4379 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4380 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4381 ll_finish_md_op_data(op_data);
4385 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4387 GOTO(out_req, rc = -EFAULT);
4389 *fid = body->mbo_fid1;
/* Optionally instantiate an inode from the same reply. */
4392 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4394 ptlrpc_req_finished(req);
/*
 * Migrate the entry @name under @parent to another MDT, as described by
 * the lmv_user_md @lum (target MDT index / stripe count).  Implemented
 * as an MDS rename of the entry onto itself with CLI_MIGRATE set.
 * For regular files a write lease is taken first and the file's data
 * version is sent along so the MDT can detect concurrent modification;
 * -EAGAIN from the server (lease cancelled) triggers a retry.
 * NOTE(review): numerous lines (retry loop head, several GOTO targets,
 * out_close/out_unlock/out_iput labels) are elided in this view.
 */
4398 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4401 struct dentry *dchild = NULL;
4402 struct inode *child_inode = NULL;
4403 struct md_op_data *op_data;
4404 struct ptlrpc_request *request = NULL;
4405 struct obd_client_handle *och = NULL;
4407 struct mdt_body *body;
4408 __u64 data_version = 0;
4409 size_t namelen = strlen(name);
4410 int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4414 CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4415 PFID(ll_inode2fid(parent)), name,
4416 lum->lum_stripe_offset, lum->lum_stripe_count);
/* The wire format is little-endian; swab if not already LE. */
4418 if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4419 lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4420 lustre_swab_lmv_user_md(lum);
4422 /* Get child FID first */
4423 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
/* Prefer the dcache; fall back to an MDT lookup by name. */
4426 dchild = d_lookup(file_dentry(file), &qstr);
4428 if (dchild->d_inode)
4429 child_inode = igrab(dchild->d_inode);
4434 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
/* Striped-directory migration needs OBD_CONNECT2_DIR_MIGRATE support
 * on the MDT; refuse otherwise. */
4443 if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
4444 OBD_CONNECT2_DIR_MIGRATE)) {
4445 if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
4446 ll_dir_striped(child_inode)) {
4447 CERROR("%s: MDT doesn't support stripe directory "
4448 "migration!\n", ll_i2sbi(parent)->ll_fsname);
4449 GOTO(out_iput, rc = -EOPNOTSUPP);
4454 * lfs migrate command needs to be blocked on the client
4455 * by checking the migrate FID against the FID of the
/* Never migrate the filesystem root. */
4458 if (child_inode == parent->i_sb->s_root->d_inode)
4459 GOTO(out_iput, rc = -EINVAL);
4461 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4462 child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4463 if (IS_ERR(op_data))
4464 GOTO(out_iput, rc = PTR_ERR(op_data));
4466 inode_lock(child_inode);
4467 op_data->op_fid3 = *ll_inode2fid(child_inode);
4468 if (!fid_is_sane(&op_data->op_fid3)) {
4469 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4470 ll_i2sbi(parent)->ll_fsname, name,
4471 PFID(&op_data->op_fid3));
4472 GOTO(out_unlock, rc = -EINVAL);
4475 op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4476 op_data->op_data = lum;
4477 op_data->op_data_size = lumlen;
/* Regular file: take a write lease and capture the data version so
 * the server can detect (and reject) concurrent writes. */
4480 if (S_ISREG(child_inode->i_mode)) {
4481 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4485 GOTO(out_unlock, rc);
4488 rc = ll_data_version(child_inode, &data_version,
4491 GOTO(out_close, rc);
4493 op_data->op_open_handle = och->och_open_handle;
4494 op_data->op_data_version = data_version;
4495 op_data->op_lease_handle = och->och_lease_handle;
4496 op_data->op_bias |= MDS_CLOSE_MIGRATE;
/* The open request must not be replayed once migration starts. */
4498 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4499 och->och_mod->mod_open_req->rq_replay = 0;
4500 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
/* Migration is carried by a same-name rename with CLI_MIGRATE set. */
4503 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4504 name, namelen, &request);
4506 LASSERT(request != NULL);
4507 ll_update_times(request, parent);
4510 if (rc == 0 || rc == -EAGAIN) {
4511 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4512 LASSERT(body != NULL);
4514 /* If the server does release layout lock, then we cleanup
4515 * the client och here, otherwise release it in out_close: */
4516 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4517 obd_mod_put(och->och_mod);
4518 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4520 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
4526 if (request != NULL) {
4527 ptlrpc_req_finished(request);
4531 /* Try again if the lease has cancelled. */
4532 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4537 ll_lease_close(och, child_inode, NULL);
4539 clear_nlink(child_inode);
4541 inode_unlock(child_inode);
4542 ll_finish_md_op_data(op_data);
/*
 * Lock handler installed for "-o noflock" mounts: file locking is
 * disabled, so every flock()/fcntl() lock request fails.  A one-shot
 * per-file warning (gated by LL_FILE_FLOCK_WARNING in fd_flags) tells
 * the user how to enable locking.
 */
4549 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4551 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
4555 * In order to avoid flood of warning messages, only print one message
4556 * for one file. And the entire message rate on the client is limited
4557 * by CDEBUG_LIMIT too.
4559 if (!(fd->fd_flags & LL_FILE_FLOCK_WARNING)) {
4560 fd->fd_flags |= LL_FILE_FLOCK_WARNING;
4561 CDEBUG_LIMIT(D_TTY | D_CONSOLE,
4562 "flock disabled, mount with '-o [local]flock' to enable\r\n");
4568 * test if some locks matching bits and l_req_mode are acquired
4569 * - bits can be in different locks
4570 * - if found clear the common lock bits in *bits
4571 * - the bits not found, are kept in *bits
4573 * \param bits [IN] searched lock bits [IN]
4574 * \param l_req_mode [IN] searched lock mode
4575 * \retval boolean, true iff all bits are found
/*
 * Iterates over the individual inodebits and, for each bit still set in
 * *bits, attempts a non-blocking LDLM_FL_TEST_LOCK match on the MDT
 * export.  Matched bits are cleared from *bits; unmatched bits remain.
 * LCK_MINMODE means "any of CR|CW|PR|PW".
 */
4577 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4579 struct lustre_handle lockh;
4580 union ldlm_policy_data policy;
4581 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4582 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4591 fid = &ll_i2info(inode)->lli_fid;
4592 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4593 ldlm_lockname[mode]);
/* TEST_LOCK: match without taking a reference on the matched lock. */
4595 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4596 for (i = 0; i < MDS_INODELOCK_NUMBITS && *bits != 0; i++) {
/* Probe exactly one bit per iteration. */
4597 policy.l_inodebits.bits = *bits & (1 << i);
4598 if (policy.l_inodebits.bits == 0)
4601 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4602 &policy, mode, &lockh)) {
4603 struct ldlm_lock *lock;
4605 lock = ldlm_handle2lock(&lockh);
4608 ~(lock->l_policy_data.l_inodebits.bits);
4609 LDLM_LOCK_PUT(lock);
4611 *bits &= ~policy.l_inodebits.bits;
/*
 * Match (and take a reference on) an MDT inodebits lock covering @bits
 * in one of the modes in @mode; extra match flags can be passed via
 * @flags.  Returns the matched mode (0 if none); the matched lock
 * handle is stored in *lockh.
 */
4618 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4619 struct lustre_handle *lockh, __u64 flags,
4620 enum ldlm_mode mode)
4622 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4627 fid = &ll_i2info(inode)->lli_fid;
4628 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4630 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4631 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the rc of an inode revalidation RPC.
 * -ENOENT on a striped directory forces another revalidation attempt
 * (a stripe may simply be bad); -ENOENT on other non-regular,
 * non-directory inodes is treated as "already unlinked" and converted
 * to success.  Other errors are logged (quietly for -EACCES/-EIDRM).
 * NOTE(review): several lines (clear_nlink / returns) are elided in
 * this view.
 */
4636 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4638 /* Already unlinked. Just update nlink and return success */
4639 if (rc == -ENOENT) {
4641 /* If it is striped directory, and there is bad stripe
4642 * Let's revalidate the dentry again, instead of returning
4644 if (ll_dir_striped(inode))
4647 /* This path cannot be hit for regular files unless in
4648 * case of obscure races, so no need to to validate
4650 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4652 } else if (rc != 0) {
4653 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4654 "%s: revalidate FID "DFID" error: rc = %d\n",
4655 ll_i2sbi(inode)->ll_fsname,
4656 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate @dentry's inode against the MDT by taking an intent lock
 * (@op is the intent, e.g. IT_GETATTR or IT_LOOKUP).  The getattr is
 * done by FID, so no name is passed.  A zero-nlink result invalidates
 * the dentry so later lookups go back to the server.
 */
4662 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4664 struct inode *inode = dentry->d_inode;
4665 struct obd_export *exp = ll_i2mdexp(inode);
4666 struct lookup_intent oit = {
4669 struct ptlrpc_request *req = NULL;
4670 struct md_op_data *op_data;
4674 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4675 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4677 /* Call getattr by fid, so do not provide name at all. */
4678 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4679 LUSTRE_OPC_ANY, NULL);
4680 if (IS_ERR(op_data))
4681 RETURN(PTR_ERR(op_data));
4683 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4684 ll_finish_md_op_data(op_data);
/* Normalize RPC errors (e.g. -ENOENT handling for striped dirs). */
4686 rc = ll_inode_revalidate_fini(inode, rc);
4690 rc = ll_revalidate_it_finish(req, &oit, dentry);
4692 ll_intent_release(&oit);
4696 /* Unlinked? Unhash dentry, so it is not picked up later by
4697 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4698 * here to preserve get_cwd functionality on 2.6.
4700 if (!dentry->d_inode->i_nlink) {
4701 spin_lock(&inode->i_lock);
4702 d_lustre_invalidate(dentry, 0);
4703 spin_unlock(&inode->i_lock);
4706 ll_lookup_finish_locks(&oit, dentry);
4708 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge the attributes of all stripes (via
 * md_merge_attr() under lli_lsm_sem) and apply the combined nlink,
 * blocks and size to the VFS inode; a/m/ctime are cached in the
 * ll_inode_info.  Non-striped directories are a no-op.
 */
4713 static int ll_merge_md_attr(struct inode *inode)
4715 struct ll_inode_info *lli = ll_i2info(inode);
4716 struct cl_attr attr = { 0 };
4719 LASSERT(lli->lli_lsm_md != NULL);
/* Only striped directories need stripe-attribute merging. */
4721 if (!lmv_dir_striped(lli->lli_lsm_md))
/* lli_lsm_sem protects the stripe metadata while merging. */
4724 down_read(&lli->lli_lsm_sem);
4725 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4726 &attr, ll_md_blocking_ast);
4727 up_read(&lli->lli_lsm_sem);
4731 set_nlink(inode, attr.cat_nlink);
4732 inode->i_blocks = attr.cat_blocks;
4733 i_size_write(inode, attr.cat_size);
4735 ll_i2info(inode)->lli_atime = attr.cat_atime;
4736 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4737 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/*
 * Core getattr implementation shared by both ll_getattr() signatures.
 * Revalidates the inode with an IT_GETATTR intent, glimpses the size of
 * regular files from the OSTs (unless PCC-cached or mid-HSM-restore,
 * when the MDT-provided size is already authoritative), merges striped-
 * directory attributes, then fills *stat from the inode.  Elapsed time
 * is tallied into LPROC_LL_GETATTR.
 */
4742 int ll_getattr_dentry(struct dentry *de, struct kstat *stat)
4744 struct inode *inode = de->d_inode;
4745 struct ll_sb_info *sbi = ll_i2sbi(inode);
4746 struct ll_inode_info *lli = ll_i2info(inode);
4747 ktime_t kstart = ktime_get();
4750 rc = ll_inode_revalidate(de, IT_GETATTR);
4754 if (S_ISREG(inode->i_mode)) {
/* Ask PCC first; 'cached' says whether PCC supplied the attrs. */
4757 rc = pcc_inode_getattr(inode, &cached);
4758 if (cached && rc < 0)
4761 /* In case of restore, the MDT has the right size and has
4762 * already send it back without granting the layout lock,
4763 * inode is up-to-date so glimpse is useless.
4764 * Also to glimpse we need the layout, in case of a running
4765 * restore the MDT holds the layout lock so the glimpse will
4766 * block up to the end of restore (getattr will block)
4768 if (!cached && !ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4769 rc = ll_glimpse_size(inode);
4774 /* If object isn't regular a file then don't validate size. */
4775 if (ll_dir_striped(inode)) {
4776 rc = ll_merge_md_attr(inode);
/* Directory times come from the cached ll_inode_info values. */
4781 inode->i_atime.tv_sec = lli->lli_atime;
4782 inode->i_mtime.tv_sec = lli->lli_mtime;
4783 inode->i_ctime.tv_sec = lli->lli_ctime;
4786 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
/* 32-bit userspace needs squashed ino/dev encodings. */
4788 if (ll_need_32bit_api(sbi)) {
4789 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4790 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4791 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4793 stat->ino = inode->i_ino;
4794 stat->dev = inode->i_sb->s_dev;
4795 stat->rdev = inode->i_rdev;
4798 stat->mode = inode->i_mode;
4799 stat->uid = inode->i_uid;
4800 stat->gid = inode->i_gid;
4801 stat->atime = inode->i_atime;
4802 stat->mtime = inode->i_mtime;
4803 stat->ctime = inode->i_ctime;
/* Prefer the tuned per-sb stat blocksize when configured. */
4804 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4806 stat->nlink = inode->i_nlink;
4807 stat->size = i_size_read(inode);
4808 stat->blocks = inode->i_blocks;
4810 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR,
4811 ktime_us_delta(ktime_get(), kstart));
/*
 * VFS ->getattr entry point.  Two signatures exist depending on kernel
 * version (path-based with request_mask/flags vs. vfsmount+dentry);
 * both simply extract the dentry and delegate to ll_getattr_dentry().
 */
4816 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4817 int ll_getattr(const struct path *path, struct kstat *stat,
4818 u32 request_mask, unsigned int flags)
4820 struct dentry *de = path->dentry;
4822 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4825 return ll_getattr_dentry(de, stat);
/*
 * FIEMAP ioctl handler: marshal the kernel fiemap_extent_info into a
 * Lustre struct fiemap (header + extent array), run ll_do_fiemap(),
 * and copy the mapped extents back to the user buffer.
 * NOTE(review): fi_extents_start is copied with copy_from/to_user, i.e.
 * it is treated as a userspace pointer here.
 */
4828 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4829 __u64 start, __u64 len)
4833 struct fiemap *fiemap;
4834 unsigned int extent_count = fieinfo->fi_extents_max;
/* Allocation covers the header plus the caller's extent array. */
4836 num_bytes = sizeof(*fiemap) + (extent_count *
4837 sizeof(struct fiemap_extent));
4838 OBD_ALLOC_LARGE(fiemap, num_bytes);
4843 fiemap->fm_flags = fieinfo->fi_flags;
4844 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4845 fiemap->fm_start = start;
4846 fiemap->fm_length = len;
/* Seed only the first extent from userspace (continuation support). */
4847 if (extent_count > 0 &&
4848 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4849 sizeof(struct fiemap_extent)) != 0)
4850 GOTO(out, rc = -EFAULT);
4852 rc = ll_do_fiemap(inode, fiemap, num_bytes);
4854 fieinfo->fi_flags = fiemap->fm_flags;
4855 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
/* Copy back only the extents actually mapped. */
4856 if (extent_count > 0 &&
4857 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4858 fiemap->fm_mapped_extents *
4859 sizeof(struct fiemap_extent)) != 0)
4860 GOTO(out, rc = -EFAULT);
4862 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * ->get_acl: return a referenced copy of the POSIX ACL cached in the
 * ll_inode_info.  The duplicate is taken under lli_lock; the VFS
 * releases the reference after the permission check.
 */
4866 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4868 struct ll_inode_info *lli = ll_i2info(inode);
4869 struct posix_acl *acl = NULL;
4872 spin_lock(&lli->lli_lock);
4873 /* VFS' acl_permission_check->check_acl will release the refcount */
4874 acl = posix_acl_dup(lli->lli_posix_acl);
4875 spin_unlock(&lli->lli_lock);
4880 #ifdef HAVE_IOP_SET_ACL
4881 #ifdef CONFIG_LUSTRE_FS_POSIX_ACL
/*
 * ->set_acl: serialize the POSIX ACL to xattr form and push it to the
 * MDT via md_setxattr() (NULL acl removes the xattr with OBD_MD_FLXATTRRM).
 * For ACL_TYPE_ACCESS the inode mode is updated via
 * posix_acl_update_mode(); default ACLs are only valid on directories.
 * On success the local ACL cache is updated/forgotten to match.
 */
4882 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4884 struct ll_sb_info *sbi = ll_i2sbi(inode);
4885 struct ptlrpc_request *req = NULL;
4886 const char *name = NULL;
4888 size_t value_size = 0;
4893 case ACL_TYPE_ACCESS:
4894 name = XATTR_NAME_POSIX_ACL_ACCESS;
/* May also rewrite inode->i_mode and drop the acl (mode-only change). */
4896 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4899 case ACL_TYPE_DEFAULT:
4900 name = XATTR_NAME_POSIX_ACL_DEFAULT;
4901 if (!S_ISDIR(inode->i_mode))
4902 rc = acl ? -EACCES : 0;
/* Serialize the ACL into xattr wire format. */
4913 value_size = posix_acl_xattr_size(acl->a_count);
4914 value = kmalloc(value_size, GFP_NOFS);
4916 GOTO(out, rc = -ENOMEM);
4918 rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
4920 GOTO(out_value, rc);
/* NULL value means "remove the xattr" (OBD_MD_FLXATTRRM). */
4923 rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4924 value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
4925 name, value, value_size, 0, 0, &req);
4927 ptlrpc_req_finished(req);
/* Keep the VFS ACL cache coherent with what the server now holds. */
4932 forget_cached_acl(inode, type);
4934 set_cached_acl(inode, type, acl);
4937 #endif /* CONFIG_LUSTRE_FS_POSIX_ACL */
4938 #endif /* HAVE_IOP_SET_ACL */
/*
 * ->permission handler.  Revalidates the root inode on first access,
 * applies root-squash (temporarily overriding fsuid/fsgid and dropping
 * filesystem capabilities when root is squashed and LL_SBI_NOROOTSQUASH
 * is not set), then defers to generic_permission().  Elapsed time is
 * tallied into LPROC_LL_INODE_PERM.
 */
4940 int ll_inode_permission(struct inode *inode, int mask)
4943 struct ll_sb_info *sbi;
4944 struct root_squash_info *squash;
4945 struct cred *cred = NULL;
4946 const struct cred *old_cred = NULL;
4948 bool squash_id = false;
4949 ktime_t kstart = ktime_get();
/* RCU-walk (MAY_NOT_BLOCK) cannot be served here; caller retries. */
4952 if (mask & MAY_NOT_BLOCK)
4955 /* as root inode are NOT getting validated in lookup operation,
4956 * need to do it before permission check. */
4958 if (inode == inode->i_sb->s_root->d_inode) {
4959 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4964 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4965 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4967 /* squash fsuid/fsgid if needed */
4968 sbi = ll_i2sbi(inode);
4969 squash = &sbi->ll_squash;
4970 if (unlikely(squash->rsi_uid != 0 &&
4971 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4972 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4976 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4977 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4978 squash->rsi_uid, squash->rsi_gid);
4980 /* update current process's credentials
4981 * and FS capability */
4982 cred = prepare_creds();
4986 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4987 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* Drop all filesystem-related capabilities while squashed. */
4988 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4989 if ((1 << cap) & CFS_CAP_FS_MASK)
4990 cap_lower(cred->cap_effective, cap);
4992 old_cred = override_creds(cred);
4995 rc = generic_permission(inode, mask);
4996 /* restore current process's credentials and FS capability */
4998 revert_creds(old_cred);
5003 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM,
5004 ktime_us_delta(ktime_get(), kstart));
5009 /* -o localflock - only provides locally consistent flock locks */
/*
 * Default file_operations table (no .flock/.lock entries): flock calls
 * fall back to the kernel's local-only locking, hence "localflock".
 * The read/write entry points vary with kernel API generation:
 * read_iter/write_iter on modern kernels, read/aio_read etc. on older.
 */
5010 struct file_operations ll_file_operations = {
5011 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
5012 # ifdef HAVE_SYNC_READ_WRITE
5013 .read = new_sync_read,
5014 .write = new_sync_write,
5016 .read_iter = ll_file_read_iter,
5017 .write_iter = ll_file_write_iter,
5018 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5019 .read = ll_file_read,
5020 .aio_read = ll_file_aio_read,
5021 .write = ll_file_write,
5022 .aio_write = ll_file_aio_write,
5023 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5024 .unlocked_ioctl = ll_file_ioctl,
5025 .open = ll_file_open,
5026 .release = ll_file_release,
5027 .mmap = ll_file_mmap,
5028 .llseek = ll_file_seek,
5029 .splice_read = ll_file_splice_read,
/*
 * file_operations for "-o flock" mounts: identical to the default table
 * but routes both .flock and .lock through ll_file_flock(), giving
 * cluster-coherent locking via the MDT.
 */
5034 struct file_operations ll_file_operations_flock = {
5035 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
5036 # ifdef HAVE_SYNC_READ_WRITE
5037 .read = new_sync_read,
5038 .write = new_sync_write,
5039 # endif /* HAVE_SYNC_READ_WRITE */
5040 .read_iter = ll_file_read_iter,
5041 .write_iter = ll_file_write_iter,
5042 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5043 .read = ll_file_read,
5044 .aio_read = ll_file_aio_read,
5045 .write = ll_file_write,
5046 .aio_write = ll_file_aio_write,
5047 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5048 .unlocked_ioctl = ll_file_ioctl,
5049 .open = ll_file_open,
5050 .release = ll_file_release,
5051 .mmap = ll_file_mmap,
5052 .llseek = ll_file_seek,
5053 .splice_read = ll_file_splice_read,
5056 .flock = ll_file_flock,
5057 .lock = ll_file_flock
5060 /* These are for -o noflock - to return ENOSYS on flock calls */
/*
 * file_operations for "-o noflock" mounts: .flock/.lock point at
 * ll_file_noflock(), which refuses all lock requests (warning once
 * per file); everything else matches the default table.
 */
5061 struct file_operations ll_file_operations_noflock = {
5062 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
5063 # ifdef HAVE_SYNC_READ_WRITE
5064 .read = new_sync_read,
5065 .write = new_sync_write,
5066 # endif /* HAVE_SYNC_READ_WRITE */
5067 .read_iter = ll_file_read_iter,
5068 .write_iter = ll_file_write_iter,
5069 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5070 .read = ll_file_read,
5071 .aio_read = ll_file_aio_read,
5072 .write = ll_file_write,
5073 .aio_write = ll_file_aio_write,
5074 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5075 .unlocked_ioctl = ll_file_ioctl,
5076 .open = ll_file_open,
5077 .release = ll_file_release,
5078 .mmap = ll_file_mmap,
5079 .llseek = ll_file_seek,
5080 .splice_read = ll_file_splice_read,
5083 .flock = ll_file_noflock,
5084 .lock = ll_file_noflock
/*
 * inode_operations for regular Lustre files.  xattr and ACL entries are
 * conditional on the kernel providing the corresponding inode_operations
 * hooks (HAVE_IOP_XATTR / HAVE_IOP_GET_ACL / HAVE_IOP_SET_ACL).
 */
5087 struct inode_operations ll_file_inode_operations = {
5088 .setattr = ll_setattr,
5089 .getattr = ll_getattr,
5090 .permission = ll_inode_permission,
5091 #ifdef HAVE_IOP_XATTR
5092 .setxattr = ll_setxattr,
5093 .getxattr = ll_getxattr,
5094 .removexattr = ll_removexattr,
5096 .listxattr = ll_listxattr,
5097 .fiemap = ll_fiemap,
5098 #ifdef HAVE_IOP_GET_ACL
5099 .get_acl = ll_get_acl,
5101 #ifdef HAVE_IOP_SET_ACL
5102 .set_acl = ll_set_acl,
/*
 * Apply a layout configuration to the inode's cl_object via
 * cl_conf_set().  For OBJECT_CONF_SET the DLM layout lock is allowed to
 * match only AFTER the layout has been applied (otherwise a stale
 * layout could be observed), and the cached layout generation in the
 * ll_inode_info is refreshed from the object.
 */
5106 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
5108 struct ll_inode_info *lli = ll_i2info(inode);
5109 struct cl_object *obj = lli->lli_clob;
5118 env = cl_env_get(&refcheck);
5120 RETURN(PTR_ERR(env));
5122 rc = cl_conf_set(env, lli->lli_clob, conf);
5126 if (conf->coc_opc == OBJECT_CONF_SET) {
5127 struct ldlm_lock *lock = conf->coc_lock;
5128 struct cl_layout cl = {
5132 LASSERT(lock != NULL);
5133 LASSERT(ldlm_has_layout(lock));
5135 /* it can only be allowed to match after layout is
5136 * applied to inode otherwise false layout would be
5137 * seen. Applying layout shoud happen before dropping
5138 * the intent lock. */
5139 ldlm_lock_allow_match(lock);
/* Refresh the cached layout generation from the object. */
5141 rc = cl_object_layout_get(env, obj, &cl);
5146 DFID": layout version change: %u -> %u\n",
5147 PFID(&lli->lli_fid), ll_layout_version_get(lli),
5149 ll_layout_version_set(lli, cl.cl_layout_gen);
5153 cl_env_put(env, &refcheck);
5158 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/*
 * If @lock carries no LVB layout data (it was granted via completion
 * AST instead of immediately), fetch the LOV layout explicitly with a
 * trusted.lov getxattr RPC and install it as the lock's LVB under the
 * resource lock.  A racing installer wins: our buffer is freed if
 * l_lvb_data became non-NULL meanwhile.  An empty layout is success.
 */
5159 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
5162 struct ll_sb_info *sbi = ll_i2sbi(inode);
5163 struct ptlrpc_request *req;
5170 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
5171 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
5172 lock->l_lvb_data, lock->l_lvb_len);
/* Layout already present on the lock: nothing to fetch. */
5174 if (lock->l_lvb_data != NULL)
5177 /* if layout lock was granted right away, the layout is returned
5178 * within DLM_LVB of dlm reply; otherwise if the lock was ever
5179 * blocked and then granted via completion ast, we have to fetch
5180 * layout here. Please note that we can't use the LVB buffer in
5181 * completion AST because it doesn't have a large enough buffer */
5182 rc = ll_get_default_mdsize(sbi, &lmmsize);
5186 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), OBD_MD_FLXATTR,
5187 XATTR_NAME_LOV, lmmsize, &req);
5190 GOTO(out, rc = 0); /* empty layout */
5197 if (lmmsize == 0) /* empty layout */
5200 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
5202 GOTO(out, rc = -EFAULT);
5204 OBD_ALLOC_LARGE(lvbdata, lmmsize);
5205 if (lvbdata == NULL)
5206 GOTO(out, rc = -ENOMEM);
5208 memcpy(lvbdata, lmm, lmmsize);
/* Install under the resource lock; lose the race gracefully. */
5209 lock_res_and_lock(lock);
5210 if (unlikely(lock->l_lvb_data == NULL)) {
5211 lock->l_lvb_type = LVB_T_LAYOUT;
5212 lock->l_lvb_data = lvbdata;
5213 lock->l_lvb_len = lmmsize;
5216 unlock_res_and_lock(lock);
5219 OBD_FREE_LARGE(lvbdata, lmmsize);
5224 ptlrpc_req_finished(req);
5229 * Apply the layout to the inode. Layout lock is held and will be released
/*
 * Given a held layout lock (@lockh, @mode), fetch the layout if
 * necessary (ll_layout_fetch), then push it into the inode's cl_object
 * through OBJECT_CONF_SET.  If the layout cannot be applied because IO
 * is still using the old one (-EBUSY), wait for that IO with
 * OBJECT_CONF_WAIT after dropping the lock.  The lock reference taken
 * here and the caller's lockh ref are both released before returning.
 */
5232 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
5233 struct inode *inode)
5235 struct ll_inode_info *lli = ll_i2info(inode);
5236 struct ll_sb_info *sbi = ll_i2sbi(inode);
5237 struct ldlm_lock *lock;
5238 struct cl_object_conf conf;
5241 bool wait_layout = false;
5244 LASSERT(lustre_handle_is_used(lockh));
5246 lock = ldlm_handle2lock(lockh);
5247 LASSERT(lock != NULL);
5248 LASSERT(ldlm_has_layout(lock));
5250 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
5251 PFID(&lli->lli_fid), inode);
5253 /* in case this is a caching lock and reinstate with new inode */
5254 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
5256 lock_res_and_lock(lock);
5257 lvb_ready = ldlm_is_lvb_ready(lock);
5258 unlock_res_and_lock(lock);
5260 /* checking lvb_ready is racy but this is okay. The worst case is
5261 * that multi processes may configure the file on the same time. */
5265 rc = ll_layout_fetch(inode, lock);
5269 /* for layout lock, lmm is stored in lock's lvb.
5270 * lvb_data is immutable if the lock is held so it's safe to access it
5273 * set layout to file. Unlikely this will fail as old layout was
5274 * surely eliminated */
5275 memset(&conf, 0, sizeof conf);
5276 conf.coc_opc = OBJECT_CONF_SET;
5277 conf.coc_inode = inode;
5278 conf.coc_lock = lock;
5279 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
5280 conf.u.coc_layout.lb_len = lock->l_lvb_len;
5281 rc = ll_layout_conf(inode, &conf);
5283 /* refresh layout failed, need to wait */
5284 wait_layout = rc == -EBUSY;
/* Drop both the handle2lock ref and the caller's enqueue ref. */
5287 LDLM_LOCK_PUT(lock);
5288 ldlm_lock_decref(lockh, mode);
5290 /* wait for IO to complete if it's still being used. */
5292 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
5293 sbi->ll_fsname, PFID(&lli->lli_fid), inode);
5295 memset(&conf, 0, sizeof conf);
5296 conf.coc_opc = OBJECT_CONF_WAIT;
5297 conf.coc_inode = inode;
5298 rc = ll_layout_conf(inode, &conf);
5302 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
5303 sbi->ll_fsname, PFID(&lli->lli_fid), rc);
5309 * Issue layout intent RPC to MDS.
5310 * \param inode [in] file inode
5311 * \param intent [in] layout intent
5313 * \retval 0 on success
5314 * \retval < 0 error code
5316 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
5318 struct ll_inode_info *lli = ll_i2info(inode);
5319 struct ll_sb_info *sbi = ll_i2sbi(inode);
5320 struct md_op_data *op_data;
5321 struct lookup_intent it;
5322 struct ptlrpc_request *req;
5326 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
5327 0, 0, LUSTRE_OPC_ANY, NULL);
5328 if (IS_ERR(op_data))
5329 RETURN(PTR_ERR(op_data));
5331 op_data->op_data = intent;
5332 op_data->op_data_size = sizeof(*intent);
5334 memset(&it, 0, sizeof(it));
5335 it.it_op = IT_LAYOUT;
5336 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
5337 intent->li_opc == LAYOUT_INTENT_TRUNC)
5338 it.it_flags = FMODE_WRITE;
5340 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
5341 sbi->ll_fsname, PFID(&lli->lli_fid), inode);
5343 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
5344 &ll_md_blocking_ast, 0);
5345 if (it.it_request != NULL)
5346 ptlrpc_req_finished(it.it_request);
5347 it.it_request = NULL;
5349 ll_finish_md_op_data(op_data);
5351 /* set lock data in case this is a new lock */
5353 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
5355 ll_intent_drop_lock(&it);
5361 * This function checks if there exists a LAYOUT lock on the client side,
5362 * or enqueues it if it doesn't have one in cache.
5364 * This function will not hold layout lock so it may be revoked any time after
5365 * this function returns. Any operations depend on layout should be redone
5368 * This function should be called before lov_io_init() to get an uptodate
5369 * layout version, the caller should save the version number and after IO
5370 * is finished, this function should be called again to verify that layout
5371 * is not changed during IO time.
5373 int ll_layout_refresh(struct inode *inode, __u32 *gen)
5375 struct ll_inode_info *lli = ll_i2info(inode);
5376 struct ll_sb_info *sbi = ll_i2sbi(inode);
5377 struct lustre_handle lockh;
5378 struct layout_intent intent = {
5379 .li_opc = LAYOUT_INTENT_ACCESS,
5381 enum ldlm_mode mode;
5385 *gen = ll_layout_version_get(lli);
5386 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
5390 LASSERT(fid_is_sane(ll_inode2fid(inode)));
5391 LASSERT(S_ISREG(inode->i_mode));
5393 /* take layout lock mutex to enqueue layout lock exclusively. */
5394 mutex_lock(&lli->lli_layout_mutex);
5397 /* mostly layout lock is caching on the local side, so try to
5398 * match it before grabbing layout lock mutex. */
5399 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5400 LCK_CR | LCK_CW | LCK_PR |
5402 if (mode != 0) { /* hit cached lock */
5403 rc = ll_layout_lock_set(&lockh, mode, inode);
5409 rc = ll_layout_intent(inode, &intent);
5415 *gen = ll_layout_version_get(lli);
5416 mutex_unlock(&lli->lli_layout_mutex);
5422 * Issue layout intent RPC indicating where in a file an IO is about to write.
5424 * \param[in] inode file inode.
5425 * \param[in] ext write range with start offset of fille in bytes where
5426 * an IO is about to write, and exclusive end offset in
5429 * \retval 0 on success
5430 * \retval < 0 error code
5432 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5433 struct lu_extent *ext)
5435 struct layout_intent intent = {
5437 .li_extent.e_start = ext->e_start,
5438 .li_extent.e_end = ext->e_end,
5443 rc = ll_layout_intent(inode, &intent);
5449 * This function send a restore request to the MDT
5451 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5453 struct hsm_user_request *hur;
5457 len = sizeof(struct hsm_user_request) +
5458 sizeof(struct hsm_user_item);
5459 OBD_ALLOC(hur, len);
5463 hur->hur_request.hr_action = HUA_RESTORE;
5464 hur->hur_request.hr_archive_id = 0;
5465 hur->hur_request.hr_flags = 0;
5466 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5467 sizeof(hur->hur_user_item[0].hui_fid));
5468 hur->hur_user_item[0].hui_extent.offset = offset;
5469 hur->hur_user_item[0].hui_extent.length = length;
5470 hur->hur_request.hr_itemcount = 1;
5471 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,