4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2016, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
49 #include <uapi/linux/lustre/lustre_ioctl.h>
50 #include <lustre_swab.h>
52 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
57 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
59 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
/* Allocate a per-open ll_file_data from the slab cache.  GFP_NOFS is used
 * so allocation cannot recurse back into the filesystem during reclaim.
 * NOTE(review): the NULL check and return statement fall outside this
 * excerpt — confirm against the full file. */
62 static struct ll_file_data *ll_file_data_get(void)
64 struct ll_file_data *fd;
66 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
/* start each open with a clean write-failure flag */
70 fd->fd_write_failed = false;
/* Return a ll_file_data to the slab cache; counterpart of
 * ll_file_data_get(). */
75 static void ll_file_data_put(struct ll_file_data *fd)
78 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
82 * Packs all the attributes into @op_data for the CLOSE rpc.
/* Fill @op_data with the inode's current attributes, flags and the open
 * handle so the CLOSE RPC carries the client's latest metadata state to
 * the MDT. */
84 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
85 struct obd_client_handle *och)
89 ll_prep_md_op_data(op_data, inode, NULL, NULL,
90 0, 0, LUSTRE_OPC_ANY, NULL);
/* snapshot mode, timestamps and size from the VFS inode */
92 op_data->op_attr.ia_mode = inode->i_mode;
93 op_data->op_attr.ia_atime = inode->i_atime;
94 op_data->op_attr.ia_mtime = inode->i_mtime;
95 op_data->op_attr.ia_ctime = inode->i_ctime;
96 op_data->op_attr.ia_size = i_size_read(inode);
/* mark every copied attribute valid so the MDT applies them */
97 op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
98 ATTR_MTIME | ATTR_MTIME_SET |
99 ATTR_CTIME | ATTR_CTIME_SET;
100 op_data->op_attr_blocks = inode->i_blocks;
101 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
102 op_data->op_handle = och->och_fh;
104 if (och->och_flags & FMODE_WRITE &&
105 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
106 /* For HSM: if inode data has been modified, pack it so that
107 * MDT can set data dirty flag in the archive. */
108 op_data->op_bias |= MDS_DATA_MODIFIED;
114 * Perform a close, possibly with a bias.
115 * The meaning of "data" depends on the value of "bias".
117 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
118 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
/* Send the MDS close RPC for @och, optionally biased (HSM release or
 * layout swap).  The meaning of @data depends on @bias — see the header
 * comment above.  Frees @op_data and drops the open-replay data on the
 * way out. */
121 static int ll_close_inode_openhandle(struct inode *inode,
122 struct obd_client_handle *och,
123 enum mds_op_bias bias, void *data)
125 struct obd_export *md_exp = ll_i2mdexp(inode);
126 const struct ll_inode_info *lli = ll_i2info(inode);
127 struct md_op_data *op_data;
128 struct ptlrpc_request *req = NULL;
/* sanity: without an MDC connection there is nobody to close against */
132 if (class_exp2obd(md_exp) == NULL) {
133 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
134 ll_get_fsname(inode->i_sb, NULL, 0),
135 PFID(&lli->lli_fid));
139 OBD_ALLOC_PTR(op_data);
140 /* We leak openhandle and request here on error, but not much to be
141 * done in OOM case since app won't retry close on error either. */
143 GOTO(out, rc = -ENOMEM);
145 ll_prepare_close(inode, op_data, och);
/* NOTE(review): the enclosing switch (bias) statement is outside this
 * excerpt. */
147 case MDS_CLOSE_LAYOUT_SWAP:
148 LASSERT(data != NULL);
149 op_data->op_bias |= MDS_CLOSE_LAYOUT_SWAP;
150 op_data->op_data_version = 0;
151 op_data->op_lease_handle = och->och_lease_handle;
/* @data is the inode to swap layouts with */
152 op_data->op_fid2 = *ll_inode2fid(data);
155 case MDS_HSM_RELEASE:
156 LASSERT(data != NULL);
157 op_data->op_bias |= MDS_HSM_RELEASE;
/* @data is a pointer to the data version to release */
158 op_data->op_data_version = *(__u64 *)data;
159 op_data->op_lease_handle = och->och_lease_handle;
160 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
164 LASSERT(data == NULL);
168 rc = md_close(md_exp, op_data, och->och_mod, &req);
/* -EINTR is expected when the close is interrupted; don't spam logs */
169 if (rc != 0 && rc != -EINTR)
170 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
171 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* for a biased close, check whether the server actually executed the
 * close intent (reply body carries OBD_MD_CLOSE_INTENT_EXECED) */
174 op_data->op_bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)) {
175 struct mdt_body *body;
177 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
178 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
182 ll_finish_md_op_data(op_data);
186 md_clear_open_replay_data(md_exp, och);
/* poison the handle so a stale reuse is detectable */
187 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
190 ptlrpc_req_finished(req); /* This is close request */
/* Drop the last reference on the cached MDS open handle for the given
 * open mode and, if no users remain, send the actual close RPC. */
194 int ll_md_real_close(struct inode *inode, fmode_t fmode)
196 struct ll_inode_info *lli = ll_i2info(inode);
197 struct obd_client_handle **och_p;
198 struct obd_client_handle *och;
/* pick the per-mode open handle and its use counter */
203 if (fmode & FMODE_WRITE) {
204 och_p = &lli->lli_mds_write_och;
205 och_usecount = &lli->lli_open_fd_write_count;
206 } else if (fmode & FMODE_EXEC) {
207 och_p = &lli->lli_mds_exec_och;
208 och_usecount = &lli->lli_open_fd_exec_count;
210 LASSERT(fmode & FMODE_READ);
211 och_p = &lli->lli_mds_read_och;
212 och_usecount = &lli->lli_open_fd_read_count;
215 mutex_lock(&lli->lli_och_mutex);
216 if (*och_usecount > 0) {
217 /* There are still users of this handle, so skip
219 mutex_unlock(&lli->lli_och_mutex);
225 mutex_unlock(&lli->lli_och_mutex);
228 /* There might be a race and this handle may already
230 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Per-file close: release group lock / lease / extra open handle held in
 * the file's private data, drop the per-mode open-fd count, and close the
 * MDS open handle unless a matching OPEN lock lets us skip the RPC.
 * Frees the ll_file_data at the end. */
236 static int ll_md_close(struct inode *inode, struct file *file)
238 union ldlm_policy_data policy = {
239 .l_inodebits = { MDS_INODELOCK_OPEN },
/* TEST_LOCK: only probe for a granted OPEN lock, don't take a ref */
241 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
242 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
243 struct ll_inode_info *lli = ll_i2info(inode);
244 struct lustre_handle lockh;
245 enum ldlm_mode lockmode;
249 /* clear group lock, if present */
250 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
251 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
253 if (fd->fd_lease_och != NULL) {
256 /* Usually the lease is not released when the
257 * application crashed, we need to release here. */
258 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
259 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
260 PFID(&lli->lli_fid), rc, lease_broken);
262 fd->fd_lease_och = NULL;
/* fd_och is the openhandle taken over from lli when a lease was set */
265 if (fd->fd_och != NULL) {
266 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
271 /* Let's see if we have good enough OPEN lock on the file and if
272 we can skip talking to MDS */
273 mutex_lock(&lli->lli_och_mutex);
274 if (fd->fd_omode & FMODE_WRITE) {
276 LASSERT(lli->lli_open_fd_write_count);
277 lli->lli_open_fd_write_count--;
278 } else if (fd->fd_omode & FMODE_EXEC) {
280 LASSERT(lli->lli_open_fd_exec_count);
281 lli->lli_open_fd_exec_count--;
284 LASSERT(lli->lli_open_fd_read_count);
285 lli->lli_open_fd_read_count--;
287 mutex_unlock(&lli->lli_och_mutex);
/* no cached OPEN lock -> must do the real close to the MDS */
289 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
290 LDLM_IBITS, &policy, lockmode, &lockh))
291 rc = ll_md_real_close(inode, fd->fd_omode);
294 LUSTRE_FPRIVATE(file) = NULL;
295 ll_file_data_put(fd);
300 /* While this returns an error code, fput() the caller does not, so we need
301 * to make every effort to clean up all of our state here. Also, applications
302 * rarely check close errors and even if an error is returned they will not
303 * re-try the close call.
/* VFS ->release() hook: tear down statahead authorization, harvest async
 * I/O errors from the cl_object, and close the file via ll_md_close().
 * The root dentry is special-cased (no MDS close needed). */
305 int ll_file_release(struct inode *inode, struct file *file)
307 struct ll_file_data *fd;
308 struct ll_sb_info *sbi = ll_i2sbi(inode);
309 struct ll_inode_info *lli = ll_i2info(inode);
313 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
314 PFID(ll_inode2fid(inode)), inode);
/* don't count releases of the root dentry in the stats */
316 if (inode->i_sb->s_root != file_dentry(file))
317 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
318 fd = LUSTRE_FPRIVATE(file);
321 /* The last ref on @file, maybe not the owner pid of statahead,
322 * because parent and child process can share the same file handle. */
323 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
324 ll_deauthorize_statahead(inode, fd);
/* root: just free the private data, no openhandle to close */
326 if (inode->i_sb->s_root == file_dentry(file)) {
327 LUSTRE_FPRIVATE(file) = NULL;
328 ll_file_data_put(fd);
/* pick up any async write error recorded by the cl layer so it can be
 * reported from this close */
332 if (!S_ISDIR(inode->i_mode)) {
333 if (lli->lli_clob != NULL)
334 lov_read_and_clear_async_rc(lli->lli_clob);
335 lli->lli_async_rc = 0;
338 rc = ll_md_close(inode, file);
340 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
341 libcfs_debug_dumplog();
/* Enqueue an OPEN intent lock to the MDS for @de (always by FID — see
 * the MDS_OPEN_BY_FID assertion), optionally packing the name and layout
 * (@lmm) when the server lacks open-by-fid support.  On success the
 * reply is used to refresh the inode and set lock data. */
346 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
347 struct lookup_intent *itp)
349 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
350 struct dentry *parent = de->d_parent;
351 const char *name = NULL;
353 struct md_op_data *op_data;
354 struct ptlrpc_request *req = NULL;
358 LASSERT(parent != NULL);
359 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
361 /* if server supports open-by-fid, or file name is invalid, don't pack
362 * name in open request */
363 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
364 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
365 name = de->d_name.name;
366 len = de->d_name.len;
369 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
370 name, len, 0, LUSTRE_OPC_ANY, NULL);
372 RETURN(PTR_ERR(op_data));
373 op_data->op_data = lmm;
374 op_data->op_data_size = lmmsize;
376 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
377 &ll_md_blocking_ast, 0);
378 ll_finish_md_op_data(op_data);
380 /* reason for keep own exit path - don't flood log
381 * with messages with -ESTALE errors.
/* an open we can't use must be released so the MDS handle isn't leaked */
383 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
384 it_open_error(DISP_OPEN_OPEN, itp))
386 ll_release_openhandle(de, itp);
390 if (it_disposition(itp, DISP_LOOKUP_NEG))
391 GOTO(out, rc = -ENOENT);
393 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
394 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
395 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* refresh the inode from the intent reply, then attach lock data */
399 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
400 if (!rc && itp->it_lock_mode)
401 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
404 ptlrpc_req_finished(req);
405 ll_intent_drop_lock(itp);
407 /* We did open by fid, but by the time we got to the server,
408 * the object disappeared. If this is a create, we cannot really
409 * tell the userspace that the file it was trying to create
410 * does not exist. Instead let's return -ESTALE, and the VFS will
411 * retry the create with LOOKUP_REVAL that we are going to catch
412 * in ll_revalidate_dentry() and use lookup then.
414 if (rc == -ENOENT && itp->it_op & IT_CREAT)
/* Populate an obd_client_handle from the MDT reply body of a completed
 * open intent, then register it for open replay on recovery. */
420 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
421 struct obd_client_handle *och)
423 struct mdt_body *body;
425 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
426 och->och_fh = body->mbo_handle;
427 och->och_fid = body->mbo_fid1;
/* the lease handle is the lock handle of the intent's LDLM lock */
428 och->och_lease_handle.cookie = it->it_lock_handle;
429 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
430 och->och_flags = it->it_flags;
432 return md_set_open_replay_data(md_exp, och, it);
/* Finish an open locally: optionally fill @och from the intent, then
 * attach @fd to the file, initialising readahead, open mode and the
 * ll_cl_context bookkeeping. */
435 static int ll_local_open(struct file *file, struct lookup_intent *it,
436 struct ll_file_data *fd, struct obd_client_handle *och)
438 struct inode *inode = file_inode(file);
441 LASSERT(!LUSTRE_FPRIVATE(file));
448 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
453 LUSTRE_FPRIVATE(file) = fd;
454 ll_readahead_init(inode, &fd->fd_ras);
/* remember which access modes this fd was opened with */
455 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
457 /* ll_cl_context initialize */
458 rwlock_init(&fd->fd_lock);
459 INIT_LIST_HEAD(&fd->fd_lccs);
464 /* Open a file, and (for the very first open) create objects on the OSTs at
465 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
466 * creation or open until ll_lov_setstripe() ioctl is called.
468 * If we already have the stripe MD locally then we don't request it in
469 * md_open(), by passing a lmm_size = 0.
471 * It is up to the application to ensure no other processes open this file
472 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
473 * used. We might be able to avoid races of that sort by getting lli_open_sem
474 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
475 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/* VFS ->open() hook.  Either reuse the intent prepared during lookup or
 * build a fresh IT_OPEN intent from f_flags, reuse a cached per-mode MDS
 * open handle when one exists, and otherwise enqueue an open to the MDS
 * before finishing the open locally. */
477 int ll_file_open(struct inode *inode, struct file *file)
479 struct ll_inode_info *lli = ll_i2info(inode);
480 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
481 .it_flags = file->f_flags };
482 struct obd_client_handle **och_p = NULL;
483 __u64 *och_usecount = NULL;
484 struct ll_file_data *fd;
488 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
489 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* the lookup path stashes its intent in private_data; take it over */
491 it = file->private_data; /* XXX: compat macro */
492 file->private_data = NULL; /* prevent ll_local_open assertion */
494 fd = ll_file_data_get();
496 GOTO(out_openerr, rc = -ENOMEM);
499 if (S_ISDIR(inode->i_mode))
500 ll_authorize_statahead(inode, fd);
/* root dentry: no MDS open handle needed, just attach the fd */
502 if (inode->i_sb->s_root == file_dentry(file)) {
503 LUSTRE_FPRIVATE(file) = fd;
/* no usable intent from lookup -> build one from the open flags */
507 if (!it || !it->it_disposition) {
508 /* Convert f_flags into access mode. We cannot use file->f_mode,
509 * because everything but O_ACCMODE mask was stripped from
511 if ((oit.it_flags + 1) & O_ACCMODE)
513 if (file->f_flags & O_TRUNC)
514 oit.it_flags |= FMODE_WRITE;
516 /* kernel only call f_op->open in dentry_open. filp_open calls
517 * dentry_open after call to open_namei that checks permissions.
518 * Only nfsd_open call dentry_open directly without checking
519 * permissions and because of that this code below is safe. */
520 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
521 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
523 /* We do not want O_EXCL here, presumably we opened the file
524 * already? XXX - NFS implications? */
525 oit.it_flags &= ~O_EXCL;
527 /* bug20584, if "it_flags" contains O_CREAT, the file will be
528 * created if necessary, then "IT_CREAT" should be set to keep
529 * consistent with it */
530 if (oit.it_flags & O_CREAT)
531 oit.it_op |= IT_CREAT;
537 /* Let's see if we have file open on MDS already. */
538 if (it->it_flags & FMODE_WRITE) {
539 och_p = &lli->lli_mds_write_och;
540 och_usecount = &lli->lli_open_fd_write_count;
541 } else if (it->it_flags & FMODE_EXEC) {
542 och_p = &lli->lli_mds_exec_och;
543 och_usecount = &lli->lli_open_fd_exec_count;
545 och_p = &lli->lli_mds_read_och;
546 och_usecount = &lli->lli_open_fd_read_count;
549 mutex_lock(&lli->lli_och_mutex);
550 if (*och_p) { /* Open handle is present */
551 if (it_disposition(it, DISP_OPEN_OPEN)) {
552 /* Well, there's extra open request that we do not need,
553 let's close it somehow. This will decref request. */
554 rc = it_open_error(DISP_OPEN_OPEN, it);
556 mutex_unlock(&lli->lli_och_mutex);
557 GOTO(out_openerr, rc);
560 ll_release_openhandle(file_dentry(file), it);
/* reuse the cached handle: local open only, no MDS round trip */
564 rc = ll_local_open(file, it, fd, NULL);
567 mutex_unlock(&lli->lli_och_mutex);
568 GOTO(out_openerr, rc);
571 LASSERT(*och_usecount == 0);
572 if (!it->it_disposition) {
573 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
574 /* We cannot just request lock handle now, new ELC code
575 means that one of other OPEN locks for this file
576 could be cancelled, and since blocking ast handler
577 would attempt to grab och_mutex as well, that would
578 result in a deadlock */
579 mutex_unlock(&lli->lli_och_mutex);
581 * Normally called under two situations:
583 * 2. A race/condition on MDS resulting in no open
584 * handle to be returned from LOOKUP|OPEN request,
585 * for example if the target entry was a symlink.
587 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
588 * marked by a bit set in ll_iget_for_nfs. Clear the
589 * bit so that it's not confusing later callers.
591 * NB; when ldd is NULL, it must have come via normal
592 * lookup path only, since ll_iget_for_nfs always calls
595 if (ldd && ldd->lld_nfs_dentry) {
596 ldd->lld_nfs_dentry = 0;
597 it->it_flags |= MDS_OPEN_LOCK;
601 * Always specify MDS_OPEN_BY_FID because we don't want
602 * to get file with different fid.
604 it->it_flags |= MDS_OPEN_BY_FID;
605 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
608 GOTO(out_openerr, rc);
612 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
614 GOTO(out_och_free, rc = -ENOMEM);
618 /* md_intent_lock() didn't get a request ref if there was an
619 * open error, so don't do cleanup on the request here
621 /* XXX (green): Should not we bail out on any error here, not
622 * just open error? */
623 rc = it_open_error(DISP_OPEN_OPEN, it);
625 GOTO(out_och_free, rc);
627 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
628 "inode %p: disposition %x, status %d\n", inode,
629 it_disposition(it, ~0), it->it_status);
631 rc = ll_local_open(file, it, fd, *och_p);
633 GOTO(out_och_free, rc);
635 mutex_unlock(&lli->lli_och_mutex);
638 /* Must do this outside lli_och_mutex lock to prevent deadlock where
639 different kind of OPEN lock for this same inode gets cancelled
640 by ldlm_cancel_lru */
641 if (!S_ISREG(inode->i_mode))
642 GOTO(out_och_free, rc);
644 cl_lov_delay_create_clear(&file->f_flags);
645 GOTO(out_och_free, rc);
/* error unwind: free a half-initialised open handle */
649 if (och_p && *och_p) {
650 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
651 *och_p = NULL; /* OBD_FREE writes some magic there */
654 mutex_unlock(&lli->lli_och_mutex);
657 if (lli->lli_opendir_key == fd)
658 ll_deauthorize_statahead(inode, fd);
660 ll_file_data_put(fd);
662 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* drop the open-reply reference taken by the intent machinery */
665 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
666 ptlrpc_req_finished(it->it_request);
667 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* Blocking AST for a lease lock: on a blocking callback just cancel the
 * lock asynchronously (the lease is considered broken); the CANCELING
 * branch continues past this excerpt. */
673 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
674 struct ldlm_lock_desc *desc, void *data, int flag)
677 struct lustre_handle lockh;
681 case LDLM_CB_BLOCKING:
682 ldlm_lock2handle(lock, &lockh);
683 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
685 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
689 case LDLM_CB_CANCELING:
697 * When setting a lease on a file, we take ownership of the lli_mds_*_och
698 * and save it as fd->fd_och so as to force client to reopen the file even
699 * if it has an open lock in cache already.
/* Take ownership of the cached lli_mds_*_och as fd->fd_och so the client
 * is forced to reopen the file even with an OPEN lock cached; returns the
 * old open handle in @old_handle.  Fails with -EBUSY if a lease already
 * exists or other users share the handle. */
701 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
702 struct lustre_handle *old_handle)
704 struct ll_inode_info *lli = ll_i2info(inode);
705 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
706 struct obd_client_handle **och_p;
711 /* Get the openhandle of the file */
712 mutex_lock(&lli->lli_och_mutex);
713 if (fd->fd_lease_och != NULL)
714 GOTO(out_unlock, rc = -EBUSY);
716 if (fd->fd_och == NULL) {
717 if (file->f_mode & FMODE_WRITE) {
718 LASSERT(lli->lli_mds_write_och != NULL);
719 och_p = &lli->lli_mds_write_och;
720 och_usecount = &lli->lli_open_fd_write_count;
722 LASSERT(lli->lli_mds_read_och != NULL);
723 och_p = &lli->lli_mds_read_och;
724 och_usecount = &lli->lli_open_fd_read_count;
/* can't take sole ownership while other opens share the handle */
727 if (*och_usecount > 1)
728 GOTO(out_unlock, rc = -EBUSY);
735 *old_handle = fd->fd_och->och_fh;
739 mutex_unlock(&lli->lli_och_mutex);
744 * Release ownership on lli_mds_*_och when putting back a file lease.
/* Give the fd-owned open handle back to lli_mds_*_och when a lease is
 * put back.  If another process re-opened meanwhile (broken lease) the
 * slot is taken, so the fd's handle is closed instead. */
746 static int ll_lease_och_release(struct inode *inode, struct file *file)
748 struct ll_inode_info *lli = ll_i2info(inode);
749 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
750 struct obd_client_handle **och_p;
751 struct obd_client_handle *old_och = NULL;
756 mutex_lock(&lli->lli_och_mutex);
757 if (file->f_mode & FMODE_WRITE) {
758 och_p = &lli->lli_mds_write_och;
759 och_usecount = &lli->lli_open_fd_write_count;
761 och_p = &lli->lli_mds_read_och;
762 och_usecount = &lli->lli_open_fd_read_count;
765 /* The file may have been open by another process (broken lease) so
766 * *och_p is not NULL. In this case we should simply increase usecount
769 if (*och_p != NULL) {
770 old_och = fd->fd_och;
777 mutex_unlock(&lli->lli_och_mutex);
/* close the superseded handle outside the mutex */
780 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
786 * Acquire a lease and open the file.
/* Acquire a lease (FMODE_READ or FMODE_WRITE) on @file and open it via a
 * MDS_OPEN_LEASE intent.  Returns the new obd_client_handle, or an
 * ERR_PTR on failure.  On protocol problems the openhandle is closed and
 * the lease lock cancelled before returning. */
788 static struct obd_client_handle *
789 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
792 struct lookup_intent it = { .it_op = IT_OPEN };
793 struct ll_sb_info *sbi = ll_i2sbi(inode);
794 struct md_op_data *op_data;
795 struct ptlrpc_request *req = NULL;
796 struct lustre_handle old_handle = { 0 };
797 struct obd_client_handle *och = NULL;
/* exactly one of read/write lease modes must be requested */
802 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
803 RETURN(ERR_PTR(-EINVAL));
/* the file must already be open in a compatible, non-exec mode */
806 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
807 RETURN(ERR_PTR(-EPERM));
809 rc = ll_lease_och_acquire(inode, file, &old_handle);
816 RETURN(ERR_PTR(-ENOMEM));
818 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
819 LUSTRE_OPC_ANY, NULL);
821 GOTO(out, rc = PTR_ERR(op_data));
823 /* To tell the MDT this openhandle is from the same owner */
824 op_data->op_handle = old_handle;
826 it.it_flags = fmode | open_flags;
827 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
828 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
829 &ll_md_blocking_lease_ast,
830 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
831 * it can be cancelled which may mislead applications that the lease is
833 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
834 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
835 * doesn't deal with openhandle, so normal openhandle will be leaked. */
836 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
837 ll_finish_md_op_data(op_data);
838 ptlrpc_req_finished(req);
840 GOTO(out_release_it, rc);
842 if (it_disposition(&it, DISP_LOOKUP_NEG))
843 GOTO(out_release_it, rc = -ENOENT);
845 rc = it_open_error(DISP_OPEN_OPEN, &it);
847 GOTO(out_release_it, rc);
849 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
850 ll_och_fill(sbi->ll_md_exp, &it, och);
/* server did not grant a lease -> not supported (old server) */
852 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
853 GOTO(out_close, rc = -EOPNOTSUPP);
855 /* already get lease, handle lease lock */
856 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
857 if (it.it_lock_mode == 0 ||
858 it.it_lock_bits != MDS_INODELOCK_OPEN) {
859 /* open lock must return for lease */
860 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
861 PFID(ll_inode2fid(inode)), it.it_lock_mode,
863 GOTO(out_close, rc = -EPROTO);
866 ll_intent_release(&it);
870 /* Cancel open lock */
871 if (it.it_lock_mode != 0) {
872 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
875 och->och_lease_handle.cookie = 0ULL;
877 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
879 CERROR("%s: error closing file "DFID": %d\n",
880 ll_get_fsname(inode->i_sb, NULL, 0),
881 PFID(&ll_i2info(inode)->lli_fid), rc2);
882 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
884 ll_intent_release(&it);
892 * Check whether a layout swap can be done between two inodes.
894 * \param[in] inode1 First inode to check
895 * \param[in] inode2 Second inode to check
897 * \retval 0 on success, layout swap can be performed between both inodes
898 * \retval negative error code if requirements are not met
/* Validate that a layout swap is permitted between @inode1 and @inode2:
 * both regular files, both writable by the caller, same superblock.
 * NOTE(review): the error-return statements are outside this excerpt. */
900 static int ll_check_swap_layouts_validity(struct inode *inode1,
901 struct inode *inode2)
903 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
906 if (inode_permission(inode1, MAY_WRITE) ||
907 inode_permission(inode2, MAY_WRITE))
910 if (inode1->i_sb != inode2->i_sb)
/* Close @och with MDS_CLOSE_LAYOUT_SWAP bias so the MDT atomically swaps
 * layouts between @inode and @inode2 at close time.  Swapping an inode
 * with itself is rejected (-EINVAL via lu_fid_cmp). */
916 static int ll_swap_layouts_close(struct obd_client_handle *och,
917 struct inode *inode, struct inode *inode2)
919 const struct lu_fid *fid1 = ll_inode2fid(inode);
920 const struct lu_fid *fid2;
924 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
925 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
927 rc = ll_check_swap_layouts_validity(inode, inode2);
929 GOTO(out_free_och, rc);
931 /* We now know that inode2 is a lustre inode */
932 fid2 = ll_inode2fid(inode2);
/* identical FIDs means swapping with self — disallowed */
934 rc = lu_fid_cmp(fid1, fid2);
936 GOTO(out_free_och, rc = -EINVAL);
938 /* Close the file and swap layouts between inode & inode2.
939 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
940 * because we still need it to pack l_remote_handle to MDT. */
941 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
944 och = NULL; /* freed in ll_close_inode_openhandle() */
954 * Release lease and close the file.
955 * It will check if the lease has ever broken.
/* Release a lease: detect whether the lease lock was already cancelled
 * (i.e. the lease was broken), cancel it otherwise, report the broken
 * state through @lease_broken, then close the openhandle. */
957 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
960 struct ldlm_lock *lock;
961 bool cancelled = true;
965 lock = ldlm_handle2lock(&och->och_lease_handle);
967 lock_res_and_lock(lock);
968 cancelled = ldlm_is_cancel(lock);
969 unlock_res_and_lock(lock);
973 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
974 PFID(&ll_i2info(inode)->lli_fid), cancelled);
/* lease still intact: cancel it ourselves before closing */
977 ldlm_cli_cancel(&och->och_lease_handle, 0);
979 if (lease_broken != NULL)
980 *lease_broken = cancelled;
982 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Merge MDS-provided timestamps cached in lli with OST attributes from
 * the cl_object (taking the newest of each), and update the inode's
 * size/blocks under the inode size lock. */
986 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
988 struct ll_inode_info *lli = ll_i2info(inode);
989 struct cl_object *obj = lli->lli_clob;
990 struct cl_attr *attr = vvp_env_thread_attr(env);
998 ll_inode_size_lock(inode);
1000 /* Merge timestamps the most recently obtained from MDS with
1001 * timestamps obtained from OSTs.
1003 * Do not overwrite atime of inode because it may be refreshed
1004 * by file_accessed() function. If the read was served by cache
1005 * data, there is no RPC to be sent so that atime may not be
1006 * transferred to OSTs at all. MDT only updates atime at close time
1007 * if it's at least 'mdd.*.atime_diff' older.
1008 * All in all, the atime in Lustre does not strictly comply with
1009 * POSIX. Solving this problem needs to send an RPC to MDT for each
1010 * read, this will hurt performance. */
1011 if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1012 LTIME_S(inode->i_atime) = lli->lli_atime;
1013 lli->lli_update_atime = 0;
1015 LTIME_S(inode->i_mtime) = lli->lli_mtime;
1016 LTIME_S(inode->i_ctime) = lli->lli_ctime;
1018 atime = LTIME_S(inode->i_atime);
1019 mtime = LTIME_S(inode->i_mtime);
1020 ctime = LTIME_S(inode->i_ctime);
/* fetch merged OST attributes (size, blocks, timestamps) */
1022 cl_object_attr_lock(obj);
1023 rc = cl_object_attr_get(env, obj, attr);
1024 cl_object_attr_unlock(obj);
/* -ENODATA (no layout/objects) is not an error for attr merging */
1027 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1029 if (atime < attr->cat_atime)
1030 atime = attr->cat_atime;
1032 if (ctime < attr->cat_ctime)
1033 ctime = attr->cat_ctime;
1035 if (mtime < attr->cat_mtime)
1036 mtime = attr->cat_mtime;
1038 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1039 PFID(&lli->lli_fid), attr->cat_size);
1041 i_size_write(inode, attr->cat_size);
1042 inode->i_blocks = attr->cat_blocks;
1044 LTIME_S(inode->i_atime) = atime;
1045 LTIME_S(inode->i_mtime) = mtime;
1046 LTIME_S(inode->i_ctime) = ctime;
1049 ll_inode_size_unlock(inode);
/* Decide whether atime updates should be suppressed for @file, mirroring
 * the checks in the kernel's file_accessed()/touch_atime().
 * NOTE(review): the return statements between checks are outside this
 * excerpt. */
1054 static bool file_is_noatime(const struct file *file)
1056 const struct vfsmount *mnt = file->f_path.mnt;
1057 const struct inode *inode = file_inode((struct file *)file);
1059 /* Adapted from file_accessed() and touch_atime().*/
1060 if (file->f_flags & O_NOATIME)
1063 if (inode->i_flags & S_NOATIME)
1066 if (IS_NOATIME(inode))
1069 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1072 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1075 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1081 static int ll_file_io_ptask(struct cfs_ptask *ptask);
/* Initialise a cl_io for a read or write on @file: set up the kiocb,
 * lock policy (never/mandatory/maybe), noatime handling and, when the
 * superblock enables it, parallel IO (PIO). */
1083 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1085 struct inode *inode = file_inode(file);
1086 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1088 memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
1089 init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
1090 io->u.ci_rw.rw_file = file;
1091 io->u.ci_rw.rw_ptask = ll_file_io_ptask;
1092 io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
1093 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1095 if (iot == CIT_WRITE) {
1096 io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
1097 io->u.ci_rw.rw_sync = !!(file->f_flags & O_SYNC ||
1098 file->f_flags & O_DIRECT ||
1101 io->ci_obj = ll_i2info(inode)->lli_clob;
1102 io->ci_lockreq = CILR_MAYBE;
1103 if (ll_file_nolock(file)) {
/* lockless mount option: never take DLM locks, no server lock either */
1104 io->ci_lockreq = CILR_NEVER;
1105 io->ci_no_srvlock = 1;
1106 } else if (file->f_flags & O_APPEND) {
1107 io->ci_lockreq = CILR_MANDATORY;
1109 io->ci_noatime = file_is_noatime(file);
/* PIO cannot be combined with append writes */
1110 if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
1111 io->ci_pio = !io->u.ci_rw.rw_append;
/* Worker body for one parallel-IO task: run a sub-range of a larger
 * read/write through its own cl_io (ci_pio cleared, since we are already
 * inside a ptask), advance the iterator/iocb by bytes done, and restart
 * on short IO when ci_need_restart is set. */
1116 static int ll_file_io_ptask(struct cfs_ptask *ptask)
1118 struct cl_io_pt *pt = ptask->pt_cbdata;
1119 struct file *file = pt->cip_file;
1122 loff_t pos = pt->cip_pos;
1127 env = cl_env_get(&refcheck);
1129 RETURN(PTR_ERR(env));
1131 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1132 file_dentry(file)->d_name.name,
1133 pt->cip_iot == CIT_READ ? "read" : "write",
1134 pos, pos + pt->cip_count);
1137 io = vvp_env_thread_io(env);
1138 ll_io_init(io, file, pt->cip_iot);
1139 io->u.ci_rw.rw_iter = pt->cip_iter;
1140 io->u.ci_rw.rw_iocb = pt->cip_iocb;
1141 io->ci_pio = 0; /* It's already in parallel task */
1143 rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
1144 pt->cip_count - pt->cip_result);
1146 struct vvp_io *vio = vvp_env_io(env);
1148 vio->vui_io_subtype = IO_NORMAL;
1149 vio->vui_fd = LUSTRE_FPRIVATE(file);
1151 ll_cl_add(file, env, io, LCC_RW);
1152 rc = cl_io_loop(env, io);
1153 ll_cl_remove(file, env);
1155 /* cl_io_rw_init() handled IO */
/* fault-injection point for testing ptask error paths */
1159 if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {
/* account bytes transferred and move the iterator forward */
1165 if (io->ci_nob > 0) {
1166 pt->cip_result += io->ci_nob;
1167 iov_iter_advance(&pt->cip_iter, io->ci_nob);
1169 pt->cip_iocb.ki_pos = pos;
1170 #ifdef HAVE_KIOCB_KI_LEFT
1171 pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
1172 #elif defined(HAVE_KI_NBYTES)
1173 pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;
1177 cl_io_fini(env, io);
/* restart short, restartable IO (e.g. layout change) from new pos */
1179 if ((rc == 0 || rc == -ENODATA) &&
1180 pt->cip_result < pt->cip_count &&
1181 io->ci_need_restart) {
1183 "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1184 file_dentry(file)->d_name.name,
1185 pt->cip_iot == CIT_READ ? "read" : "write",
1186 pos, pos + pt->cip_count - pt->cip_result,
1187 pt->cip_result, rc);
1191 CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
1192 file_dentry(file)->d_name.name,
1193 pt->cip_iot == CIT_READ ? "read" : "write",
1194 pt->cip_result, rc);
1196 cl_env_put(env, &refcheck);
/* partial success wins over the error code */
1197 RETURN(pt->cip_result > 0 ? 0 : rc);
/*
 * Common read/write engine for all llite I/O entry points (read_iter,
 * write_iter, splice, aio): builds a cl_io, takes the per-inode range lock
 * where required, runs cl_io_loop(), restarts on layout change, and tallies
 * read/write byte statistics.
 * NOTE(review): embedded line numbers jump — this listing is elided; only
 * the visible lines are documented.
 */
1201 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1202 struct file *file, enum cl_io_type iot,
1203 loff_t *ppos, size_t count)
1205 struct range_lock range;
1206 struct vvp_io *vio = vvp_env_io(env);
1207 struct inode *inode = file_inode(file);
1208 struct ll_inode_info *lli = ll_i2info(inode);
1209 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1217 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1218 file_dentry(file)->d_name.name,
1219 iot == CIT_READ ? "read" : "write", pos, pos + count);
1222 io = vvp_env_thread_io(env);
1223 ll_io_init(io, file, iot);
/* IO_NORMAL carries the caller's iov_iter/kiocb into the cl_io. */
1224 if (args->via_io_subtype == IO_NORMAL) {
1225 io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
1226 io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
1231 if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
1232 bool range_locked = false;
/* O_APPEND writes lock to EOF since the final position is unknown. */
1234 if (file->f_flags & O_APPEND)
1235 range_lock_init(&range, 0, LUSTRE_EOF);
1237 range_lock_init(&range, pos, pos + count - 1);
1239 vio->vui_fd = LUSTRE_FPRIVATE(file);
1240 vio->vui_io_subtype = args->via_io_subtype;
1242 switch (vio->vui_io_subtype) {
1244 /* Direct IO reads must also take range lock,
1245 * or multiple reads will try to work on the same pages
1246 * See LU-6227 for details. */
1247 if (((iot == CIT_WRITE) ||
1248 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1249 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1250 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1252 rc = range_lock(&lli->lli_write_tree, &range);
1256 range_locked = true;
1260 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1261 vio->u.splice.vui_flags = args->u.splice.via_flags;
1264 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1268 ll_cl_add(file, env, io, LCC_RW);
/* Take i_mutex for suid/sgid-stripping writes; flag prevents recursion. */
1269 if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
1270 !lli->lli_inode_locked) {
1272 lli->lli_inode_locked = 1;
1274 rc = cl_io_loop(env, io);
1275 if (lli->lli_inode_locked) {
1276 lli->lli_inode_locked = 0;
1277 inode_unlock(inode);
1279 ll_cl_remove(file, env);
1282 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1284 range_unlock(&lli->lli_write_tree, &range);
1287 /* cl_io_rw_init() handled IO */
/* Account partial progress and advance the caller's iterator/iocb so a
 * restarted pass continues where this one stopped. */
1291 if (io->ci_nob > 0) {
1292 result += io->ci_nob;
1293 count -= io->ci_nob;
1295 if (args->via_io_subtype == IO_NORMAL) {
1296 iov_iter_advance(args->u.normal.via_iter, io->ci_nob);
1298 args->u.normal.via_iocb->ki_pos = pos;
1299 #ifdef HAVE_KIOCB_KI_LEFT
1300 args->u.normal.via_iocb->ki_left = count;
1301 #elif defined(HAVE_KI_NBYTES)
1302 args->u.normal.via_iocb->ki_nbytes = count;
1306 pos = io->u.ci_rw.rw_range.cir_pos;
1310 cl_io_fini(env, io);
/* Restart the whole IO when the layout changed mid-flight
 * (ci_need_restart) and there is still data left to transfer. */
1312 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1314 "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1315 file_dentry(file)->d_name.name,
1316 iot == CIT_READ ? "read" : "write",
1317 pos, pos + count, result, rc);
1321 if (iot == CIT_READ) {
1323 ll_stats_ops_tally(ll_i2sbi(inode),
1324 LPROC_LL_READ_BYTES, result);
1325 } else if (iot == CIT_WRITE) {
1327 ll_stats_ops_tally(ll_i2sbi(inode),
1328 LPROC_LL_WRITE_BYTES, result);
1329 fd->fd_write_failed = false;
1330 } else if (result == 0 && rc == 0) {
1333 fd->fd_write_failed = true;
1335 fd->fd_write_failed = false;
1336 } else if (rc != -ERESTARTSYS) {
1337 fd->fd_write_failed = true;
1341 CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
1342 file_dentry(file)->d_name.name,
1343 iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);
1347 RETURN(result > 0 ? result : rc);
1351 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1352 * especially for small I/O.
1354 * To serve a read request, CLIO has to create and initialize a cl_io and
1356 * then request DLM lock. This has turned out to have significant overhead
1356 * and affects the performance of small I/O dramatically.
1358 * It's not necessary to create a cl_io for each I/O. Under the help of read
1359 * ahead, most of the pages being read are already in memory cache and we can
1360 * read those pages directly because if the pages exist, the corresponding DLM
1361 * lock must exist so that page content must be valid.
1363 * In fast read implementation, the llite speculatively finds and reads pages
1364 * in memory cache. There are three scenarios for fast read:
1365 * - If the page exists and is uptodate, kernel VM will provide the data and
1366 * CLIO won't be intervened;
1367 * - If the page was brought into memory by read ahead, it will be exported
1368 * and read ahead parameters will be updated;
1369 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1370 * it will go back and invoke normal read, i.e., a cl_io will be created
1371 * and DLM lock will be requested.
1373 * POSIX compliance: posix standard states that read is intended to be atomic.
1374 * Lustre read implementation is in line with Linux kernel read implementation
1375 * and neither of them complies with POSIX standard in this matter. Fast read
1376 * doesn't make the situation worse on single node but it may interleave write
1377 * results from multiple nodes due to short read handling in ll_file_aio_read().
1379 * \param env - lu_env
1380 * \param iocb - kiocb from kernel
1381 * \param iter - user space buffers where the data will be copied
1383 * \retval - number of bytes have been read, or error code if error occurred.
/*
 * Fast-read path: serve a buffered read straight from the page cache via
 * generic_file_read_iter() without building a cl_io (see the block comment
 * above). Bails out when fast read is disabled or for O_DIRECT.
 * NOTE(review): elided listing — declarations/returns between the visible
 * lines are not shown here.
 */
1386 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1390 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1393 /* NB: we can't do direct IO for fast read because it will need a lock
1394 * to make IO engine happy. */
1395 if (iocb->ki_filp->f_flags & O_DIRECT)
1398 result = generic_file_read_iter(iocb, iter);
1400 /* If the first page is not in cache, generic_file_aio_read() will be
1401 * returned with -ENODATA.
1402 * See corresponding code in ll_readpage(). */
1403 if (result == -ENODATA)
/* Only successful fast reads are counted in the read-bytes stats. */
1407 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1408 LPROC_LL_READ_BYTES, result);
1414 * Read from a file (through the page cache).
/*
 * ->read_iter entry point: try the lockless fast-read path first; if data
 * remains (or fast read was not applicable), fall back to the full
 * ll_file_io_generic() CIT_READ path.
 * NOTE(review): elided listing — error/return plumbing between the visible
 * lines is not shown.
 */
1416 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1419 struct vvp_io_args *args;
1424 result = ll_do_fast_read(iocb, to);
/* Fast read consumed everything, or failed hard: done. */
1425 if (result < 0 || iov_iter_count(to) == 0)
1428 env = cl_env_get(&refcheck);
1430 return PTR_ERR(env);
1432 args = ll_env_args(env, IO_NORMAL);
1433 args->u.normal.via_iter = to;
1434 args->u.normal.via_iocb = iocb;
1436 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1437 &iocb->ki_pos, iov_iter_count(to));
1440 else if (result == 0)
1443 cl_env_put(env, &refcheck);
1449 * Write to a file (through the page cache).
/*
 * ->write_iter entry point: package the caller's iov_iter/kiocb into
 * vvp_io_args and delegate to ll_file_io_generic() for CIT_WRITE.
 * NOTE(review): elided listing.
 */
1451 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1453 struct vvp_io_args *args;
1458 env = cl_env_get(&refcheck);
1460 return PTR_ERR(env);
1462 args = ll_env_args(env, IO_NORMAL);
1463 args->u.normal.via_iter = from;
1464 args->u.normal.via_iocb = iocb;
1466 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1467 &iocb->ki_pos, iov_iter_count(from));
1468 cl_env_put(env, &refcheck);
1472 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1474 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array and compute the total byte count
 * (copy of the kernel's __generic_file_aio_write_nolock helper, per the
 * comment above). Truncates *nr_segs at the first inaccessible segment.
 * NOTE(review): elided listing — the continue/early-return lines between
 * the visible ones are not shown.
 */
1476 static int ll_file_get_iov_count(const struct iovec *iov,
1477 unsigned long *nr_segs, size_t *count)
1482 for (seg = 0; seg < *nr_segs; seg++) {
1483 const struct iovec *iv = &iov[seg];
1486 * If any segment has a negative length, or the cumulative
1487 * length ever wraps negative then return -EINVAL.
1490 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1492 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1497 cnt -= iv->iov_len; /* This segment is no good */
/*
 * aio_read compatibility wrapper (pre-read_iter kernels): validate the
 * iovec array, build an iov_iter, and call ll_file_read_iter().
 * NOTE(review): elided listing.
 */
1504 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1505 unsigned long nr_segs, loff_t pos)
1512 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
/* iov_iter_init() gained a direction argument in newer kernels. */
1516 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1517 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1518 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1519 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1520 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1522 result = ll_file_read_iter(iocb, &to);
1527 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1530 struct iovec iov = { .iov_base = buf, .iov_len = count };
1535 init_sync_kiocb(&kiocb, file);
1536 kiocb.ki_pos = *ppos;
1537 #ifdef HAVE_KIOCB_KI_LEFT
1538 kiocb.ki_left = count;
1539 #elif defined(HAVE_KI_NBYTES)
1540 kiocb.i_nbytes = count;
1543 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1544 *ppos = kiocb.ki_pos;
1550 * Write to a file (through the page cache).
/*
 * aio_write compatibility wrapper (pre-write_iter kernels): validate the
 * iovec array, build an iov_iter, and call ll_file_write_iter().
 * NOTE(review): elided listing.
 */
1553 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1554 unsigned long nr_segs, loff_t pos)
1556 struct iov_iter from;
1561 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1565 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1566 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1567 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1568 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1569 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1571 result = ll_file_write_iter(iocb, &from);
/*
 * Synchronous write(2) entry point for kernels without ->write_iter: set up
 * a sync kiocb (borrowed from the env's thread info), delegate to
 * ll_file_aio_write(), and propagate the advanced position to *ppos.
 * NOTE(review): elided listing.
 */
1576 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1577 size_t count, loff_t *ppos)
1580 struct iovec iov = { .iov_base = (void __user *)buf,
1582 struct kiocb *kiocb;
1587 env = cl_env_get(&refcheck);
1589 RETURN(PTR_ERR(env));
1591 kiocb = &ll_env_info(env)->lti_kiocb;
1592 init_sync_kiocb(kiocb, file);
1593 kiocb->ki_pos = *ppos;
1594 #ifdef HAVE_KIOCB_KI_LEFT
1595 kiocb->ki_left = count;
1596 #elif defined(HAVE_KI_NBYTES)
1597 kiocb->ki_nbytes = count;
1600 result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos);
1601 *ppos = kiocb->ki_pos;
1603 cl_env_put(env, &refcheck);
1606 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1609 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read entry point: package pipe/flags as IO_SPLICE args and run a
 * CIT_READ through ll_file_io_generic().
 * NOTE(review): elided listing.
 */
1611 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1612 struct pipe_inode_info *pipe, size_t count,
1616 struct vvp_io_args *args;
1621 env = cl_env_get(&refcheck);
1623 RETURN(PTR_ERR(env));
1625 args = ll_env_args(env, IO_SPLICE);
1626 args->u.splice.via_pipe = pipe;
1627 args->u.splice.via_flags = flags;
1629 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1630 cl_env_put(env, &refcheck);
/*
 * Apply a striping EA to an inode by re-opening it by FID with the lum
 * attached to the open intent, under the inode size lock; the open handle
 * is released immediately afterwards.
 * NOTE(review): elided listing.
 */
1634 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1635 __u64 flags, struct lov_user_md *lum, int lum_size)
1637 struct lookup_intent oit = {
1639 .it_flags = flags | MDS_OPEN_BY_FID,
1644 ll_inode_size_lock(inode);
1645 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1647 GOTO(out_unlock, rc);
/* Open was only needed to carry the EA; close the handle right away. */
1649 ll_release_openhandle(dentry, &oit);
1652 ll_inode_size_unlock(inode);
1653 ll_intent_release(&oit);
/*
 * Fetch the LOV EA (striping metadata) for @filename under @inode via
 * md_getattr_name(), validate its magic, and byte-swap it to host endian
 * on big-endian clients before handing it back through *lmmp/*lmm_size.
 * The reply request is returned via *request for the caller to release.
 * NOTE(review): elided listing — error-path lines between the visible ones
 * are not shown.
 */
1658 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1659 struct lov_mds_md **lmmp, int *lmm_size,
1660 struct ptlrpc_request **request)
1662 struct ll_sb_info *sbi = ll_i2sbi(inode);
1663 struct mdt_body *body;
1664 struct lov_mds_md *lmm = NULL;
1665 struct ptlrpc_request *req = NULL;
1666 struct md_op_data *op_data;
1669 rc = ll_get_default_mdsize(sbi, &lmmsize);
1673 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1674 strlen(filename), lmmsize,
1675 LUSTRE_OPC_ANY, NULL);
1676 if (IS_ERR(op_data))
1677 RETURN(PTR_ERR(op_data));
1679 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1680 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1681 ll_finish_md_op_data(op_data);
1683 CDEBUG(D_INFO, "md_getattr_name failed "
1684 "on %s: rc %d\n", filename, rc);
1688 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1689 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1691 lmmsize = body->mbo_eadatasize;
/* No striping EA present on this object. */
1693 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1695 GOTO(out, rc = -ENODATA);
1698 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1699 LASSERT(lmm != NULL);
/* Only plain V1/V3 and composite (PFL) layouts are accepted. */
1701 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
1702 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
1703 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
1704 GOTO(out, rc = -EPROTO);
1707 * This is coming from the MDS, so is probably in
1708 * little endian. We convert it to host endian before
1709 * passing it to userspace.
/* On little-endian hosts this branch is compiled out: the on-wire and
 * host representations already match. */
1711 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1714 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
1715 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1716 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1717 if (le32_to_cpu(lmm->lmm_pattern) &
1718 LOV_PATTERN_F_RELEASED)
1722 /* if function is called for a directory - we should
1723 * avoid swabbing non-existent lsm objects */
1724 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1725 lustre_swab_lov_user_md_v1(
1726 (struct lov_user_md_v1 *)lmm);
1727 if (S_ISREG(body->mbo_mode))
1728 lustre_swab_lov_user_md_objects(
1729 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1731 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1732 lustre_swab_lov_user_md_v3(
1733 (struct lov_user_md_v3 *)lmm);
1734 if (S_ISREG(body->mbo_mode))
1735 lustre_swab_lov_user_md_objects(
1736 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1738 } else if (lmm->lmm_magic ==
1739 cpu_to_le32(LOV_MAGIC_COMP_V1)) {
1740 lustre_swab_lov_comp_md_v1(
1741 (struct lov_comp_md_v1 *)lmm);
1747 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: root-only path that copies a lov_user_md (with
 * one ost_data slot) from userspace and applies it via
 * ll_lov_setstripe_ea_info().
 * NOTE(review): elided listing.
 */
1752 static int ll_lov_setea(struct inode *inode, struct file *file,
1755 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1756 struct lov_user_md *lump;
1757 int lum_size = sizeof(struct lov_user_md) +
1758 sizeof(struct lov_user_ost_data);
/* Setting an EA with explicit objects is an administrative operation. */
1762 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1765 OBD_ALLOC_LARGE(lump, lum_size);
1769 if (copy_from_user(lump, arg, lum_size))
1770 GOTO(out_lump, rc = -EFAULT);
1772 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
1774 cl_lov_delay_create_clear(&file->f_flags);
1777 OBD_FREE_LARGE(lump, lum_size);
/*
 * Copy the inode's striping information to the userspace buffer @lum via
 * cl_object_getstripe().
 * NOTE(review): elided listing.
 */
1781 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
1788 env = cl_env_get(&refcheck);
1790 RETURN(PTR_ERR(env));
1792 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
1793 cl_env_put(env, &refcheck);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy the user's lov_user_md, apply it,
 * refresh the layout generation, and (on the visible path) read the
 * resulting stripe info back to userspace.
 * NOTE(review): elided listing.
 */
1797 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1800 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1801 struct lov_user_md *klum;
1803 __u64 flags = FMODE_WRITE;
1806 rc = ll_copy_user_md(lum, &klum);
1811 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
/* Zero the user's stripe_count first so a failed getstripe is visible. */
1816 rc = put_user(0, &lum->lmm_stripe_count);
1820 rc = ll_layout_refresh(inode, &gen);
1824 rc = ll_file_getstripe(inode, arg, lum_size);
1826 cl_lov_delay_create_clear(&file->f_flags);
1829 OBD_FREE(klum, lum_size);
/*
 * Take a Lustre group lock with group id @arg on behalf of this open file:
 * reject gid 0 and nolock mounts, refuse a second group lock on the same
 * fd, pre-instantiate composite (PFL) layouts so the lock covers all OST
 * objects, then record the acquired lock in fd under lli_lock.
 * NOTE(review): elided listing.
 */
1834 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1836 struct ll_inode_info *lli = ll_i2info(inode);
1837 struct cl_object *obj = lli->lli_clob;
1838 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1839 struct ll_grouplock grouplock;
1844 CWARN("group id for group lock must not be 0\n");
1848 if (ll_file_nolock(file))
1849 RETURN(-EOPNOTSUPP);
1851 spin_lock(&lli->lli_lock);
1852 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1853 CWARN("group lock already existed with gid %lu\n",
1854 fd->fd_grouplock.lg_gid);
1855 spin_unlock(&lli->lli_lock);
1858 LASSERT(fd->fd_grouplock.lg_lock == NULL);
1859 spin_unlock(&lli->lli_lock);
1862 * XXX: group lock needs to protect all OST objects while PFL
1863 * can add new OST objects during the IO, so we'd instantiate
1864 * all OST objects before getting its group lock.
1869 struct cl_layout cl = {
1870 .cl_is_composite = false,
1873 env = cl_env_get(&refcheck);
1875 RETURN(PTR_ERR(env));
1877 rc = cl_object_layout_get(env, obj, &cl);
/* Composite layout: force instantiation over the whole file first. */
1878 if (!rc && cl.cl_is_composite)
1879 rc = ll_layout_write_intent(inode, 0, OBD_OBJECT_EOF);
1881 cl_env_put(env, &refcheck);
1886 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
1887 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* Re-check under the spinlock: another thread may have raced us while the
 * grouplock was being acquired without lli_lock held. */
1891 spin_lock(&lli->lli_lock);
1892 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1893 spin_unlock(&lli->lli_lock);
1894 CERROR("another thread just won the race\n");
1895 cl_put_grouplock(&grouplock);
1899 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1900 fd->fd_grouplock = grouplock;
1901 spin_unlock(&lli->lli_lock);
1903 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * Release the group lock with id @arg held by this open file: verify a
 * matching lock is held, detach it from fd under lli_lock, then drop it
 * outside the spinlock via cl_put_grouplock().
 * NOTE(review): elided listing.
 */
1907 static int ll_put_grouplock(struct inode *inode, struct file *file,
1910 struct ll_inode_info *lli = ll_i2info(inode);
1911 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1912 struct ll_grouplock grouplock;
1915 spin_lock(&lli->lli_lock);
1916 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1917 spin_unlock(&lli->lli_lock);
1918 CWARN("no group lock held\n");
1922 LASSERT(fd->fd_grouplock.lg_lock != NULL);
1924 if (fd->fd_grouplock.lg_gid != arg) {
1925 CWARN("group lock %lu doesn't match current id %lu\n",
1926 arg, fd->fd_grouplock.lg_gid);
1927 spin_unlock(&lli->lli_lock);
/* Copy the lock aside so it can be released after dropping lli_lock. */
1931 grouplock = fd->fd_grouplock;
1932 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1933 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1934 spin_unlock(&lli->lli_lock);
1936 cl_put_grouplock(&grouplock);
1937 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1942 * Close inode open handle
1944 * \param dentry [in] dentry which contains the inode
1945 * \param it [in,out] intent which contains open info and result
1948 * \retval <0 failure
/*
 * Close the MDS open handle carried by an open intent (see the doc comment
 * above): fill an obd_client_handle from the intent and close it, then
 * drop the extra open-reference on the intent's request if present.
 * NOTE(review): elided listing.
 */
1950 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1952 struct inode *inode = dentry->d_inode;
1953 struct obd_client_handle *och;
1959 /* Root ? Do nothing. */
1960 if (dentry->d_inode->i_sb->s_root == dentry)
1963 /* No open handle to close? Move away */
1964 if (!it_disposition(it, DISP_OPEN_OPEN))
1967 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1969 OBD_ALLOC(och, sizeof(*och));
1971 GOTO(out, rc = -ENOMEM);
1973 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1975 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
1977 /* this one is in place of ll_file_open */
1978 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1979 ptlrpc_req_finished(it->it_request);
1980 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1986 * Get size for inode for which FIEMAP mapping is requested.
1987 * Make the FIEMAP get_info call and returns the result.
1988 * \param fiemap kernel buffer to hold extents
1989 * \param num_bytes kernel buffer size
/*
 * Perform a FIEMAP request (see doc comment above): sanitize/handle the
 * fiemap flags, optionally flush dirty data for FIEMAP_FLAG_SYNC, glimpse
 * the size if unknown, and forward the query to the OSTs through
 * cl_object_fiemap().
 * NOTE(review): elided listing.
 */
1991 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
1997 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2000 /* Checks for fiemap flags */
2001 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* Report back which flags are unsupported by masking them out. */
2002 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2006 /* Check for FIEMAP_FLAG_SYNC */
2007 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2008 rc = filemap_fdatawrite(inode->i_mapping);
2013 env = cl_env_get(&refcheck);
2015 RETURN(PTR_ERR(env));
2017 if (i_size_read(inode) == 0) {
2018 rc = ll_glimpse_size(inode);
2023 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2024 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2025 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2027 /* If filesize is 0, then there would be no objects for mapping */
2028 if (fmkey.lfik_oa.o_size == 0) {
2029 fiemap->fm_mapped_extents = 0;
2033 fmkey.lfik_fiemap = *fiemap;
2035 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2036 &fmkey, fiemap, &num_bytes);
2038 cl_env_put(env, &refcheck);
/*
 * LL_IOC_FID2PATH handler: resolve a FID to a path on the MDT. Reads the
 * user's requested path buffer length, allocates a getinfo_fid2path of
 * that size, forwards it via obd_iocontrol(OBD_IOC_FID2PATH), and copies
 * the result back.
 * NOTE(review): elided listing.
 */
2042 int ll_fid2path(struct inode *inode, void __user *arg)
2044 struct obd_export *exp = ll_i2mdexp(inode);
2045 const struct getinfo_fid2path __user *gfin = arg;
2047 struct getinfo_fid2path *gfout;
/* fid2path leaks namespace information; restrict to privileged users
 * unless the mount explicitly allows it. */
2053 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2054 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2057 /* Only need to get the buflen */
2058 if (get_user(pathlen, &gfin->gf_pathlen))
2061 if (pathlen > PATH_MAX)
2064 outsize = sizeof(*gfout) + pathlen;
2065 OBD_ALLOC(gfout, outsize);
2069 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2070 GOTO(gf_free, rc = -EFAULT);
2071 /* append root FID after gfout to let MDT know the root FID so that it
2072 * can lookup the correct path, this is mainly for fileset.
2073 * old server without fileset mount support will ignore this. */
2074 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2076 /* Call mdc_iocontrol */
2077 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2081 if (copy_to_user(arg, gfout, outsize))
2085 OBD_FREE(gfout, outsize);
2090 * Read the data_version for inode.
2092 * This value is computed using stripe object version on OST.
2093 * Version is computed using server side locking.
2095 * @param flags if do sync on the OST side;
2097 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2098 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/*
 * Fetch the file's data version (see doc comment above) by running a
 * CIT_DATA_VERSION cl_io; @flags selects the OST-side flush/lock behavior.
 * NOTE(review): elided listing.
 */
2100 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2102 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2110 /* If no file object initialized, we consider its version is 0. */
2116 env = cl_env_get(&refcheck);
2118 RETURN(PTR_ERR(env));
2120 io = vvp_env_thread_io(env);
2122 io->u.ci_data_version.dv_data_version = 0;
2123 io->u.ci_data_version.dv_flags = flags;
2126 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2127 result = cl_io_loop(env, io);
2129 result = io->ci_result;
2131 *data_version = io->u.ci_data_version.dv_data_version;
2133 cl_io_fini(env, io);
/* Layout changed mid-IO: loop back and redo the request. */
2135 if (unlikely(io->ci_need_restart))
2138 cl_env_put(env, &refcheck);
2144 * Trigger a HSM release request for the provided inode.
/*
 * HSM release: take a write lease with MDS_OPEN_RELEASE, flush and record
 * the latest data version and merged attributes, then close the handle
 * with MDS_HSM_RELEASE so the MDT can drop the file's OST objects.
 * NOTE(review): elided listing.
 */
2146 int ll_hsm_release(struct inode *inode)
2149 struct obd_client_handle *och = NULL;
2150 __u64 data_version = 0;
2155 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2156 ll_get_fsname(inode->i_sb, NULL, 0),
2157 PFID(&ll_i2info(inode)->lli_fid));
2159 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2161 GOTO(out, rc = PTR_ERR(och));
2163 /* Grab latest data_version and [am]time values */
2164 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2168 env = cl_env_get(&refcheck);
2170 GOTO(out, rc = PTR_ERR(env));
2172 ll_merge_attr(env, inode);
2173 cl_env_put(env, &refcheck);
2175 /* Release the file.
2176 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2177 * we still need it to pack l_remote_handle to MDT. */
2178 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2184 if (och != NULL && !IS_ERR(och)) /* close the file */
2185 ll_lease_close(och, inode, NULL);
/*
 * Working state for ll_swap_layouts(); holds the (possibly reordered)
 * pair of inodes being swapped.
 * NOTE(review): elided listing — additional members (data versions/check
 * flags, per their use below) are not visible here.
 */
2190 struct ll_swap_stack {
2193 struct inode *inode1;
2194 struct inode *inode2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS handler: swap the layouts of two files. Orders
 * the pair by FID to avoid lock inversion, optionally takes group locks on
 * both (gid != 0) to flush dirty cache, verifies requested data versions
 * have not changed, and sends the swap to the MDT via obd_iocontrol().
 * NOTE(review): elided listing.
 */
2199 static int ll_swap_layouts(struct file *file1, struct file *file2,
2200 struct lustre_swap_layouts *lsl)
2202 struct mdc_swap_layouts msl;
2203 struct md_op_data *op_data;
2206 struct ll_swap_stack *llss = NULL;
2209 OBD_ALLOC_PTR(llss);
2213 llss->inode1 = file_inode(file1);
2214 llss->inode2 = file_inode(file2);
2216 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2220 /* we use 2 bool because it is easier to swap than 2 bits */
2221 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2222 llss->check_dv1 = true;
2224 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2225 llss->check_dv2 = true;
2227 /* we cannot use lsl->sl_dvX directly because we may swap them */
2228 llss->dv1 = lsl->sl_dv1;
2229 llss->dv2 = lsl->sl_dv2;
2231 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2232 if (rc == 0) /* same file, done! */
/* Canonical FID ordering prevents AB/BA deadlock with a concurrent swap
 * of the same pair; swap the per-file state to stay consistent. */
2235 if (rc < 0) { /* sequentialize it */
2236 swap(llss->inode1, llss->inode2);
2238 swap(llss->dv1, llss->dv2);
2239 swap(llss->check_dv1, llss->check_dv2);
2243 if (gid != 0) { /* application asks to flush dirty cache */
2244 rc = ll_get_grouplock(llss->inode1, file1, gid);
2248 rc = ll_get_grouplock(llss->inode2, file2, gid);
2250 ll_put_grouplock(llss->inode1, file1, gid);
2255 /* ultimate check, before swapping the layouts we check if
2256 * dataversion has changed (if requested) */
2257 if (llss->check_dv1) {
2258 rc = ll_data_version(llss->inode1, &dv, 0);
2261 if (dv != llss->dv1)
2262 GOTO(putgl, rc = -EAGAIN);
2265 if (llss->check_dv2) {
2266 rc = ll_data_version(llss->inode2, &dv, 0);
2269 if (dv != llss->dv2)
2270 GOTO(putgl, rc = -EAGAIN);
2273 /* struct md_op_data is used to send the swap args to the mdt
2274 * only flags is missing, so we use struct mdc_swap_layouts
2275 * through the md_op_data->op_data */
2276 /* flags from user space have to be converted before they are sent to
2277 * the server; no flag is sent today, they are only used on the client */
2280 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2281 0, LUSTRE_OPC_ANY, &msl);
2282 if (IS_ERR(op_data))
2283 GOTO(free, rc = PTR_ERR(op_data));
2285 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2286 sizeof(*op_data), op_data, NULL);
2287 ll_finish_md_op_data(op_data);
2294 ll_put_grouplock(llss->inode2, file2, gid);
2295 ll_put_grouplock(llss->inode1, file1, gid);
/*
 * Set/clear HSM state flags on an inode after validating the masks
 * (range, privilege for non-HSM_USER_MASK bits, archive id bound), then
 * forward the request to the MDT via obd_iocontrol().
 * NOTE(review): elided listing.
 */
2305 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2307 struct md_op_data *op_data;
2311 /* Detect out-of range masks */
2312 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2315 /* Non-root users are forbidden to set or clear flags which are
2316 * NOT defined in HSM_USER_MASK. */
2317 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2318 !cfs_capable(CFS_CAP_SYS_ADMIN))
2321 /* Detect out-of range archive id */
2322 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2323 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2326 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2327 LUSTRE_OPC_ANY, hss);
2328 if (IS_ERR(op_data))
2329 RETURN(PTR_ERR(op_data));
2331 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2332 sizeof(*op_data), op_data, NULL);
2334 ll_finish_md_op_data(op_data);
/*
 * HSM import: mark a regular file as archived+exists+released, then force
 * its attributes (mode/uid/gid/size/times) to the archived copy's values
 * with a setattr performed under the inode lock.
 * NOTE(review): elided listing.
 */
2339 static int ll_hsm_import(struct inode *inode, struct file *file,
2340 struct hsm_user_import *hui)
2342 struct hsm_state_set *hss = NULL;
2343 struct iattr *attr = NULL;
2347 if (!S_ISREG(inode->i_mode))
2353 GOTO(out, rc = -ENOMEM);
2355 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2356 hss->hss_archive_id = hui->hui_archive_id;
2357 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2358 rc = ll_hsm_state_set(inode, hss);
2362 OBD_ALLOC_PTR(attr);
2364 GOTO(out, rc = -ENOMEM);
/* S_IFREG is forced: only the permission bits come from userspace. */
2366 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2367 attr->ia_mode |= S_IFREG;
2368 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2369 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2370 attr->ia_size = hui->hui_size;
2371 attr->ia_mtime.tv_sec = hui->hui_mtime;
2372 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2373 attr->ia_atime.tv_sec = hui->hui_atime;
2374 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2376 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2377 ATTR_UID | ATTR_GID |
2378 ATTR_MTIME | ATTR_MTIME_SET |
2379 ATTR_ATIME | ATTR_ATIME_SET;
2383 rc = ll_setattr_raw(file_dentry(file), attr, true);
2387 inode_unlock(inode);
/* Map an fmode_t's FMODE_READ/FMODE_WRITE bits to the corresponding
 * LL_LEASE_RDLCK/LL_LEASE_WRLCK lease-type bits. */
2399 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2401 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2402 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * LL_IOC_FUTIMES_3 handler: privileged setattr of atime/mtime/ctime
 * (including ctime, which utimes(2) cannot set) on a regular file, done
 * under the inode lock.
 * NOTE(review): elided listing.
 */
2405 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2407 struct inode *inode = file_inode(file);
2409 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2410 ATTR_MTIME | ATTR_MTIME_SET |
2411 ATTR_CTIME | ATTR_CTIME_SET,
2413 .tv_sec = lfu->lfu_atime_sec,
2414 .tv_nsec = lfu->lfu_atime_nsec,
2417 .tv_sec = lfu->lfu_mtime_sec,
2418 .tv_nsec = lfu->lfu_mtime_nsec,
2421 .tv_sec = lfu->lfu_ctime_sec,
2422 .tv_nsec = lfu->lfu_ctime_nsec,
2428 if (!capable(CAP_SYS_ADMIN))
2431 if (!S_ISREG(inode->i_mode))
2435 rc = ll_setattr_raw(file_dentry(file), &ia, false);
2436 inode_unlock(inode);
/*
 * Translate a userspace lock mode (MODE_READ_USER/MODE_WRITE_USER) to the
 * kernel cl_lock_mode.
 * NOTE(review): elided listing — the per-case return values and default
 * branch are not visible here.
 */
2441 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2444 case MODE_READ_USER:
2446 case MODE_WRITE_USER:
/* Printable names for the userspace lock modes, used in debug output. */
2453 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2455 /* Used to allow the upper layers of the client to request an LDLM lock
2456 * without doing an actual read or write.
2458 * Used for ladvise lockahead to manually request specific locks.
2460 * \param[in] file file this ladvise lock request is on
2461 * \param[in] ladvise ladvise struct describing this lock request
2463 * \retval 0 success, no detailed result available (sync requests
2464 * and requests sent to the server [not handled locally]
2465 * cannot return detailed results)
2466 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2467 * see definitions for details.
2468 * \retval negative negative errno on error
/*
 * Lockahead (see doc comment above): request a DLM extent lock of the
 * requested mode over [lla_start, lla_end] via a CIT_MISC cl_io, without
 * performing any I/O. -ECANCELED/-EEXIST are remapped to the positive
 * LLA_RESULT_* codes for userspace.
 * NOTE(review): elided listing.
 */
2470 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2472 struct lu_env *env = NULL;
2473 struct cl_io *io = NULL;
2474 struct cl_lock *lock = NULL;
2475 struct cl_lock_descr *descr = NULL;
2476 struct dentry *dentry = file->f_path.dentry;
2477 struct inode *inode = dentry->d_inode;
2478 enum cl_lock_mode cl_mode;
2479 off_t start = ladvise->lla_start;
2480 off_t end = ladvise->lla_end;
2486 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2487 "start=%llu, end=%llu\n", dentry->d_name.len,
2488 dentry->d_name.name, dentry->d_inode,
2489 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2492 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2494 GOTO(out, result = cl_mode);
2496 /* Get IO environment */
2497 result = cl_io_get(inode, &env, &io, &refcheck);
2501 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2504 * nothing to do for this io. This currently happens when
2505 * stripe sub-object's are not yet created.
2507 result = io->ci_result;
2508 } else if (result == 0) {
2509 lock = vvp_env_lock(env);
2510 descr = &lock->cll_descr;
2512 descr->cld_obj = io->ci_obj;
2513 /* Convert byte offsets to pages */
2514 descr->cld_start = cl_index(io->ci_obj, start);
2515 descr->cld_end = cl_index(io->ci_obj, end);
2516 descr->cld_mode = cl_mode;
2517 /* CEF_MUST is used because we do not want to convert a
2518 * lockahead request to a lockless lock */
2519 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2522 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2523 descr->cld_enq_flags |= CEF_SPECULATIVE;
2525 result = cl_lock_request(env, io, lock);
2527 /* On success, we need to release the lock */
2529 cl_lock_release(env, lock);
2531 cl_io_fini(env, io);
2532 cl_env_put(env, &refcheck);
2534 /* -ECANCELED indicates a matching lock with a different extent
2535 * was already present, and -EEXIST indicates a matching lock
2536 * on exactly the same extent was already present.
2537 * We convert them to positive values for userspace to make
2538 * recognizing true errors easier.
2539 * Note we can only return these detailed results on async requests,
2540 * as sync requests look the same as i/o requests for locking. */
2541 if (result == -ECANCELED)
2542 result = LLA_RESULT_DIFFERENT;
2543 else if (result == -EEXIST)
2544 result = LLA_RESULT_SAME;
2549 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
/*
 * Validate a single llapi_lu_ladvise request: advice in range, per-advice
 * flags allowed, lockahead mode valid, and (for non-LOCKNOEXPAND advices)
 * start < end. Each rejection logs a CDEBUG explaining the failure.
 * NOTE(review): elided listing — rc assignments and returns between the
 * visible lines are not shown.
 */
2551 static int ll_ladvise_sanity(struct inode *inode,
2552 struct llapi_lu_ladvise *ladvise)
2554 enum lu_ladvise_type advice = ladvise->lla_advice;
2555 /* Note the peradvice flags is a 32 bit field, so per advice flags must
2556 * be in the first 32 bits of enum ladvise_flags */
2557 __u32 flags = ladvise->lla_peradvice_flags;
2558 /* 3 lines at 80 characters per line, should be plenty */
2561 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2563 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2564 "last supported advice is %s (value '%d'): rc = %d\n",
2565 ll_get_fsname(inode->i_sb, NULL, 0), advice,
2566 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2570 /* Per-advice checks */
2572 case LU_LADVISE_LOCKNOEXPAND:
2573 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2575 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2577 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2578 ladvise_names[advice], rc);
2582 case LU_LADVISE_LOCKAHEAD:
2583 /* Currently only READ and WRITE modes can be requested */
2584 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2585 ladvise->lla_lockahead_mode == 0) {
2587 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2589 ll_get_fsname(inode->i_sb, NULL, 0),
2590 ladvise->lla_lockahead_mode,
2591 ladvise_names[advice], rc);
2594 case LU_LADVISE_WILLREAD:
2595 case LU_LADVISE_DONTNEED:
2597 /* Note fall through above - These checks apply to all advices
2598 * except LOCKNOEXPAND */
2599 if (flags & ~LF_DEFAULT_MASK) {
2601 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2603 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2604 ladvise_names[advice], rc);
2607 if (ladvise->lla_start >= ladvise->lla_end) {
2609 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2610 "for %s: rc = %d\n",
2611 ll_get_fsname(inode->i_sb, NULL, 0),
2612 ladvise->lla_start, ladvise->lla_end,
2613 ladvise_names[advice], rc);
/* NOTE(review): this chunk was extracted with the original file's line
 * numbers fused into the text and with lines dropped wherever the embedded
 * numbering jumps (e.g. 2639 -> 2643).  Code is preserved verbatim; it is
 * not compilable as-is until the extraction is repaired. */
2625 * Give file access advices
2627 * The ladvise interface is similar to Linux fadvise() system call, except it
2628 * forwards the advices directly from Lustre client to server. The server side
2629 * codes will apply appropriate read-ahead and caching techniques for the
2630 * corresponding files.
2632 * A typical workload for ladvise is e.g. a bunch of different clients are
2633 * doing small random reads of a file, so prefetching pages into OSS cache
2634 * with big linear reads before the random IO is a net benefit. Fetching
2635 * all that data into each client cache with fadvise() may not be, due to
2636 * much more data being sent to the client.
2638 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2639 struct llapi_lu_ladvise *ladvise)
/* Submit one advice as a CIT_LADVISE cl_io: fill io->u.ci_ladvise from the
 * caller-supplied llapi_lu_ladvise, then run the generic cl_io loop. */
2643 struct cl_ladvise_io *lio;
2648 env = cl_env_get(&refcheck);
2650 RETURN(PTR_ERR(env));
2652 io = vvp_env_thread_io(env);
2653 io->ci_obj = ll_i2info(inode)->lli_clob;
2655 /* initialize parameters for ladvise */
2656 lio = &io->u.ci_ladvise;
2657 lio->li_start = ladvise->lla_start;
2658 lio->li_end = ladvise->lla_end;
2659 lio->li_fid = ll_inode2fid(inode);
2660 lio->li_advice = ladvise->lla_advice;
2661 lio->li_flags = flags;
2663 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2664 rc = cl_io_loop(env, io);
/* cl_io_fini/cl_env_put always run; the rc-selection lines between 2664 and
 * 2668 were lost in extraction. */
2668 cl_io_fini(env, io);
2669 cl_env_put(env, &refcheck);
/* Enable/disable DLM lock expansion for this open file descriptor.
 * LF_UNSET in @flags clears the no-expand bit; otherwise it is set.
 * (Extraction artifact: trailing RETURN/brace lines are missing.) */
2673 static int ll_lock_noexpand(struct file *file, int flags)
2675 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2677 fd->ll_lock_no_expand = !(flags & LF_UNSET);
/* FS_IOC_FSGETXATTR-style handler: report the inode's extended flags and
 * project ID back to user space through a struct fsxattr.
 * NOTE(review): the initial copy_from_user() result check and the error
 * returns were dropped by the extraction (numbering jumps 2688 -> 2692). */
2682 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
2685 struct fsxattr fsxattr;
2687 if (copy_from_user(&fsxattr,
2688 (const struct fsxattr __user *)arg,
2692 fsxattr.fsx_xflags = ll_inode_to_ext_flags(inode->i_flags);
2693 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
2694 if (copy_to_user((struct fsxattr __user *)arg,
2695 &fsxattr, sizeof(fsxattr)))
/* FS_IOC_FSSETXATTR-style handler: set extended flags and project ID.
 * Root-only (CFS_CAP_SYS_ADMIN check below).  The new flags/projid are sent
 * to the MDT via md_setattr(); if the file has a data object, the flag
 * change is also pushed to the OSTs via cl_setattr_ost(). */
2701 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
2705 struct md_op_data *op_data;
2706 struct ptlrpc_request *req = NULL;
2708 struct fsxattr fsxattr;
2709 struct cl_object *obj;
2711 /* only root could change project ID */
2712 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2715 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2716 LUSTRE_OPC_ANY, NULL);
2717 if (IS_ERR(op_data))
2718 RETURN(PTR_ERR(op_data));
2720 if (copy_from_user(&fsxattr,
2721 (const struct fsxattr __user *)arg,
2723 GOTO(out_fsxattr1, rc = -EFAULT);
2725 op_data->op_attr_flags = fsxattr.fsx_xflags;
2726 op_data->op_projid = fsxattr.fsx_projid;
2727 op_data->op_attr.ia_valid |= (MDS_ATTR_PROJID | ATTR_ATTR_FLAG);
2728 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
2730 ptlrpc_req_finished(req);
2732 obj = ll_i2info(inode)->lli_clob;
/* Mirror the new flags into the VFS inode and into OST objects; the
 * rc/obj guard lines between 2730 and 2736 were lost in extraction. */
2736 inode->i_flags = ll_ext_to_inode_flags(fsxattr.fsx_xflags);
2737 OBD_ALLOC_PTR(attr);
2739 GOTO(out_fsxattr1, rc = -ENOMEM);
2740 attr->ia_valid = ATTR_ATTR_FLAG;
2741 rc = cl_setattr_ost(obj, attr, fsxattr.fsx_xflags);
2746 ll_finish_md_op_data(op_data);
/* Main ioctl dispatcher for regular Lustre files (file_operations->
 * unlocked_ioctl).  Most cases delegate to a dedicated helper and return
 * via the RETURN() macro; a few (GETFLAGS/SETFLAGS) are handled inline.
 * NOTE(review): this is the most fragmentary block of the extraction -
 * the switch(cmd) statement itself, many break/RETURN lines, closing
 * braces and local declarations (rc, flags, fmode, mdtidx, ...) were
 * dropped wherever the embedded numbering jumps. */
2753 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2755 struct inode *inode = file_inode(file);
2756 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2760 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2761 PFID(ll_inode2fid(inode)), inode, cmd);
2762 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2764 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2765 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2769 case LL_IOC_GETFLAGS:
2770 /* Get the current value of the file flags */
2771 return put_user(fd->fd_flags, (int __user *)arg);
2772 case LL_IOC_SETFLAGS:
2773 case LL_IOC_CLRFLAGS:
2774 /* Set or clear specific file flags */
2775 /* XXX This probably needs checks to ensure the flags are
2776 * not abused, and to handle any flag side effects.
2778 if (get_user(flags, (int __user *) arg))
2781 if (cmd == LL_IOC_SETFLAGS) {
2782 if ((flags & LL_FILE_IGNORE_LOCK) &&
2783 !(file->f_flags & O_DIRECT)) {
2784 CERROR("%s: unable to disable locking on "
2785 "non-O_DIRECT file\n", current->comm);
2789 fd->fd_flags |= flags;
2791 fd->fd_flags &= ~flags;
2794 case LL_IOC_LOV_SETSTRIPE:
2795 case LL_IOC_LOV_SETSTRIPE_NEW:
2796 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
2797 case LL_IOC_LOV_SETEA:
2798 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
2799 case LL_IOC_LOV_SWAP_LAYOUTS: {
2801 struct lustre_swap_layouts lsl;
2803 if (copy_from_user(&lsl, (char __user *)arg,
2804 sizeof(struct lustre_swap_layouts)))
/* Layout swap requires write access on both files involved. */
2807 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
2810 file2 = fget(lsl.sl_fd);
2814 /* O_WRONLY or O_RDWR */
2815 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
2816 GOTO(out, rc = -EPERM);
2818 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
2819 struct inode *inode2;
2820 struct ll_inode_info *lli;
2821 struct obd_client_handle *och = NULL;
2823 if (lsl.sl_flags != SWAP_LAYOUTS_CLOSE)
2824 GOTO(out, rc = -EINVAL);
2826 lli = ll_i2info(inode);
2827 mutex_lock(&lli->lli_och_mutex);
2828 if (fd->fd_lease_och != NULL) {
2829 och = fd->fd_lease_och;
2830 fd->fd_lease_och = NULL;
2832 mutex_unlock(&lli->lli_och_mutex);
2834 GOTO(out, rc = -ENOLCK);
2835 inode2 = file_inode(file2);
2836 rc = ll_swap_layouts_close(och, inode, inode2);
2838 rc = ll_swap_layouts(file, file2, &lsl);
2844 case LL_IOC_LOV_GETSTRIPE:
2845 case LL_IOC_LOV_GETSTRIPE_NEW:
2846 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
2847 case FSFILT_IOC_GETFLAGS:
2848 case FSFILT_IOC_SETFLAGS:
2849 RETURN(ll_iocontrol(inode, file, cmd, arg));
2850 case FSFILT_IOC_GETVERSION_OLD:
2851 case FSFILT_IOC_GETVERSION:
2852 RETURN(put_user(inode->i_generation, (int __user *)arg));
2853 case LL_IOC_GROUP_LOCK:
2854 RETURN(ll_get_grouplock(inode, file, arg));
2855 case LL_IOC_GROUP_UNLOCK:
2856 RETURN(ll_put_grouplock(inode, file, arg));
2857 case IOC_OBD_STATFS:
2858 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2860 /* We need to special case any other ioctls we want to handle,
2861 * to send them to the MDS/OST as appropriate and to properly
2862 * network encode the arg field.
2863 case FSFILT_IOC_SETVERSION_OLD:
2864 case FSFILT_IOC_SETVERSION:
2866 case LL_IOC_FLUSHCTX:
2867 RETURN(ll_flush_ctx(inode));
2868 case LL_IOC_PATH2FID: {
2869 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2870 sizeof(struct lu_fid)))
2875 case LL_IOC_GETPARENT:
2876 RETURN(ll_getparent(file, (struct getparent __user *)arg));
2878 case OBD_IOC_FID2PATH:
2879 RETURN(ll_fid2path(inode, (void __user *)arg));
2880 case LL_IOC_DATA_VERSION: {
2881 struct ioc_data_version idv;
2884 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
/* Only the flush-control bits are honoured from user space. */
2887 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2888 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2891 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2897 case LL_IOC_GET_MDTIDX: {
2900 mdtidx = ll_get_mdt_idx(inode);
2904 if (put_user((int)mdtidx, (int __user *)arg))
2909 case OBD_IOC_GETDTNAME:
2910 case OBD_IOC_GETMDNAME:
2911 RETURN(ll_get_obd_name(inode, cmd, arg));
2912 case LL_IOC_HSM_STATE_GET: {
2913 struct md_op_data *op_data;
2914 struct hsm_user_state *hus;
2921 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2922 LUSTRE_OPC_ANY, hus);
2923 if (IS_ERR(op_data)) {
2925 RETURN(PTR_ERR(op_data));
2928 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2931 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2934 ll_finish_md_op_data(op_data);
2938 case LL_IOC_HSM_STATE_SET: {
2939 struct hsm_state_set *hss;
2946 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2951 rc = ll_hsm_state_set(inode, hss);
2956 case LL_IOC_HSM_ACTION: {
2957 struct md_op_data *op_data;
2958 struct hsm_current_action *hca;
2965 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2966 LUSTRE_OPC_ANY, hca);
2967 if (IS_ERR(op_data)) {
2969 RETURN(PTR_ERR(op_data));
2972 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2975 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2978 ll_finish_md_op_data(op_data);
2982 case LL_IOC_SET_LEASE: {
2983 struct ll_inode_info *lli = ll_i2info(inode);
2984 struct obd_client_handle *och = NULL;
/* Lease mode must be compatible with the file's own open mode. */
2989 case LL_LEASE_WRLCK:
2990 if (!(file->f_mode & FMODE_WRITE))
2992 fmode = FMODE_WRITE;
2994 case LL_LEASE_RDLCK:
2995 if (!(file->f_mode & FMODE_READ))
2999 case LL_LEASE_UNLCK:
3000 mutex_lock(&lli->lli_och_mutex);
3001 if (fd->fd_lease_och != NULL) {
3002 och = fd->fd_lease_och;
3003 fd->fd_lease_och = NULL;
3005 mutex_unlock(&lli->lli_och_mutex);
3010 fmode = och->och_flags;
3011 rc = ll_lease_close(och, inode, &lease_broken);
3015 rc = ll_lease_och_release(inode, file);
3022 RETURN(ll_lease_type_from_fmode(fmode));
3027 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3029 /* apply for lease */
3030 och = ll_lease_open(inode, file, fmode, 0);
3032 RETURN(PTR_ERR(och));
3035 mutex_lock(&lli->lli_och_mutex);
3036 if (fd->fd_lease_och == NULL) {
3037 fd->fd_lease_och = och;
3040 mutex_unlock(&lli->lli_och_mutex);
3042 /* impossible now that only excl is supported for now */
3043 ll_lease_close(och, inode, &lease_broken);
3048 case LL_IOC_GET_LEASE: {
3049 struct ll_inode_info *lli = ll_i2info(inode);
3050 struct ldlm_lock *lock = NULL;
3053 mutex_lock(&lli->lli_och_mutex);
3054 if (fd->fd_lease_och != NULL) {
3055 struct obd_client_handle *och = fd->fd_lease_och;
3057 lock = ldlm_handle2lock(&och->och_lease_handle);
/* Only report the lease mode if the underlying DLM lock has not
 * already been cancelled (lease broken). */
3059 lock_res_and_lock(lock);
3060 if (!ldlm_is_cancel(lock))
3061 fmode = och->och_flags;
3063 unlock_res_and_lock(lock);
3064 LDLM_LOCK_PUT(lock);
3067 mutex_unlock(&lli->lli_och_mutex);
3069 RETURN(ll_lease_type_from_fmode(fmode));
3071 case LL_IOC_HSM_IMPORT: {
3072 struct hsm_user_import *hui;
3078 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3083 rc = ll_hsm_import(inode, file, hui);
3088 case LL_IOC_FUTIMES_3: {
3089 struct ll_futimes_3 lfu;
3091 if (copy_from_user(&lfu,
3092 (const struct ll_futimes_3 __user *)arg,
3096 RETURN(ll_file_futimes_3(file, &lfu));
3098 case LL_IOC_LADVISE: {
3099 struct llapi_ladvise_hdr *k_ladvise_hdr;
3100 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3103 int alloc_size = sizeof(*k_ladvise_hdr);
/* Two-pass copy: first read just the header to learn lah_count,
 * then reallocate with the flexible lah_advise[] array included. */
3106 u_ladvise_hdr = (void __user *)arg;
3107 OBD_ALLOC_PTR(k_ladvise_hdr);
3108 if (k_ladvise_hdr == NULL)
3111 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3112 GOTO(out_ladvise, rc = -EFAULT);
3114 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3115 k_ladvise_hdr->lah_count < 1)
3116 GOTO(out_ladvise, rc = -EINVAL);
3118 num_advise = k_ladvise_hdr->lah_count;
3119 if (num_advise >= LAH_COUNT_MAX)
3120 GOTO(out_ladvise, rc = -EFBIG);
3122 OBD_FREE_PTR(k_ladvise_hdr);
3123 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3124 lah_advise[num_advise]);
3125 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3126 if (k_ladvise_hdr == NULL)
3130 * TODO: submit multiple advices to one server in a single RPC
3132 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3133 GOTO(out_ladvise, rc = -EFAULT);
3135 for (i = 0; i < num_advise; i++) {
3136 struct llapi_lu_ladvise *k_ladvise =
3137 &k_ladvise_hdr->lah_advise[i];
3138 struct llapi_lu_ladvise __user *u_ladvise =
3139 &u_ladvise_hdr->lah_advise[i];
3141 rc = ll_ladvise_sanity(inode, k_ladvise);
3143 GOTO(out_ladvise, rc);
3145 switch (k_ladvise->lla_advice) {
3146 case LU_LADVISE_LOCKNOEXPAND:
3147 rc = ll_lock_noexpand(file,
3148 k_ladvise->lla_peradvice_flags);
3149 GOTO(out_ladvise, rc);
3150 case LU_LADVISE_LOCKAHEAD:
3152 rc = ll_file_lock_ahead(file, k_ladvise);
3155 GOTO(out_ladvise, rc);
3158 &u_ladvise->lla_lockahead_result))
3159 GOTO(out_ladvise, rc = -EFAULT);
3162 rc = ll_ladvise(inode, file,
3163 k_ladvise_hdr->lah_flags,
3166 GOTO(out_ladvise, rc);
3173 OBD_FREE(k_ladvise_hdr, alloc_size);
3176 case LL_IOC_FSGETXATTR:
3177 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3178 case LL_IOC_FSSETXATTR:
3179 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3181 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
/* default: forward unknown commands to the data (OST) export. */
3183 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3184 (void __user *)arg));
3188 #ifndef HAVE_FILE_LLSEEK_SIZE
/* Compat helper (only built when the kernel lacks generic_file_llseek_size):
 * validate @offset against sign/maxsize limits and commit it to f_pos,
 * resetting f_version on change.  Error-return lines (presumably -EINVAL/
 * -EOVERFLOW) were dropped by the extraction. */
3189 static inline loff_t
3190 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3192 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3194 if (offset > maxsize)
3197 if (offset != file->f_pos) {
3198 file->f_pos = offset;
3199 file->f_version = 0;
/* Compat copy of the kernel's generic_file_llseek_size(): seek within
 * [0, maxsize] honouring SEEK_SET/CUR/END/DATA/HOLE semantics, with the
 * lockless SEEK_CUR(0) fast path.  NOTE(review): the switch(origin)
 * skeleton and several case labels were lost in extraction - only the
 * bodies survive below. */
3205 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3206 loff_t maxsize, loff_t eof)
3208 struct inode *inode = file_inode(file);
3216 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3217 * position-querying operation. Avoid rewriting the "same"
3218 * f_pos value back to the file because a concurrent read(),
3219 * write() or lseek() might have altered it
3224 * f_lock protects against read/modify/write race with other
3225 * SEEK_CURs. Note that parallel writes and reads behave
3229 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3230 inode_unlock(inode);
3234 * In the generic case the entire file is data, so as long as
3235 * offset isn't at the end of the file then the offset is data.
3242 * There is a virtual hole at the end of the file, so as long as
3243 * offset isn't i_size or larger, return i_size.
3251 return llseek_execute(file, offset, maxsize);
/* llseek handler: for SEEK_END/SEEK_HOLE/SEEK_DATA we must first glimpse
 * the file size from the OSTs (client may have a stale i_size), then
 * delegate to the generic llseek-with-size helper bounded by the Lustre
 * per-file maximum. */
3255 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3257 struct inode *inode = file_inode(file);
3258 loff_t retval, eof = 0;
3261 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3262 (origin == SEEK_CUR) ? file->f_pos : 0);
3263 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3264 PFID(ll_inode2fid(inode)), inode, retval, retval,
3266 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3268 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3269 retval = ll_glimpse_size(inode);
3272 eof = i_size_read(inode);
3275 retval = ll_generic_file_llseek_size(file, offset, origin,
3276 ll_file_maxbytes(inode), eof);
/* file_operations->flush: report (and clear) async writeback errors that
 * were recorded against this inode, collapsing them to -EIO.  Errors are
 * suppressed if this fd already saw a write failure (user was told). */
3280 static int ll_flush(struct file *file, fl_owner_t id)
3282 struct inode *inode = file_inode(file);
3283 struct ll_inode_info *lli = ll_i2info(inode);
3284 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3287 LASSERT(!S_ISDIR(inode->i_mode));
3289 /* catch async errors that were recorded back when async writeback
3290 * failed for pages in this mapping. */
3291 rc = lli->lli_async_rc;
3292 lli->lli_async_rc = 0;
3293 if (lli->lli_clob != NULL) {
3294 err = lov_read_and_clear_async_rc(lli->lli_clob);
3299 /* The application has been told write failure already.
3300 * Do not report failure again. */
3301 if (fd->fd_write_failed)
3303 return rc ? -EIO : 0;
3307 * Called to make sure a portion of file has been written out.
3308 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
3310 * Return how many pages have been written.
/* Build and run a CIT_FSYNC cl_io over [start, end]; on success the
 * return value is fio->fi_nr_written (pages flushed), otherwise the
 * io result / init error. */
3312 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3313 enum cl_fsync_mode mode, int ignore_layout)
3317 struct cl_fsync_io *fio;
3322 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3323 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3326 env = cl_env_get(&refcheck);
3328 RETURN(PTR_ERR(env));
3330 io = vvp_env_thread_io(env);
3331 io->ci_obj = ll_i2info(inode)->lli_clob;
3332 io->ci_ignore_layout = ignore_layout;
3334 /* initialize parameters for sync */
3335 fio = &io->u.ci_fsync;
3336 fio->fi_start = start;
3338 fio->fi_fid = ll_inode2fid(inode);
3339 fio->fi_mode = mode;
3340 fio->fi_nr_written = 0;
3342 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3343 result = cl_io_loop(env, io);
3345 result = io->ci_result;
3347 result = fio->fi_nr_written;
3348 cl_io_fini(env, io);
3349 cl_env_put(env, &refcheck);
3355 * When dentry is provided (the 'else' case), file_dentry() may be
3356 * null and dentry must be used directly rather than pulled from
3357 * file_dentry() as is done otherwise.
/* fsync handler with three kernel-API variants selected by configure
 * checks: 4-arg (range), 2-arg, and old 3-arg (dentry) prototypes.
 * Flow: flush dirty pages, fold in recorded async errors, fsync the MD
 * inode on the MDT, then OST_SYNC the data range for regular files and
 * update fd_write_failed accordingly. */
3360 #ifdef HAVE_FILE_FSYNC_4ARGS
3361 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3363 struct dentry *dentry = file_dentry(file);
3365 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3366 int ll_fsync(struct file *file, int datasync)
3368 struct dentry *dentry = file_dentry(file);
3370 loff_t end = LLONG_MAX;
3372 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3375 loff_t end = LLONG_MAX;
3377 struct inode *inode = dentry->d_inode;
3378 struct ll_inode_info *lli = ll_i2info(inode);
3379 struct ptlrpc_request *req;
3383 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3384 PFID(ll_inode2fid(inode)), inode);
3385 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3387 #ifdef HAVE_FILE_FSYNC_4ARGS
3388 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3389 lock_inode = !lli->lli_inode_locked;
3393 /* fsync's caller has already called _fdata{sync,write}, we want
3394 * that IO to finish before calling the osc and mdc sync methods */
3395 rc = filemap_fdatawait(inode->i_mapping);
3398 /* catch async errors that were recorded back when async writeback
3399 * failed for pages in this mapping. */
3400 if (!S_ISDIR(inode->i_mode)) {
3401 err = lli->lli_async_rc;
3402 lli->lli_async_rc = 0;
3405 if (lli->lli_clob != NULL) {
3406 err = lov_read_and_clear_async_rc(lli->lli_clob);
3412 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3416 ptlrpc_req_finished(req);
3418 if (S_ISREG(inode->i_mode)) {
3419 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3421 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3422 if (rc == 0 && err < 0)
3425 fd->fd_write_failed = true;
3427 fd->fd_write_failed = false;
3430 #ifdef HAVE_FILE_FSYNC_4ARGS
3432 inode_unlock(inode);
/* flock/fcntl lock handler: translate a VFS file_lock into an LDLM_FLOCK
 * enqueue against the MDS, then mirror the result into the local VFS lock
 * tables (locks_lock_file_wait or the older flock/posix variants).
 * NOTE(review): the switch statements over fl_type and cmd lost their
 * `switch(...)` lines and several case labels in extraction. */
3438 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3440 struct inode *inode = file_inode(file);
3441 struct ll_sb_info *sbi = ll_i2sbi(inode);
3442 struct ldlm_enqueue_info einfo = {
3443 .ei_type = LDLM_FLOCK,
3444 .ei_cb_cp = ldlm_flock_completion_ast,
3445 .ei_cbdata = file_lock,
3447 struct md_op_data *op_data;
3448 struct lustre_handle lockh = { 0 };
3449 union ldlm_policy_data flock = { { 0 } };
3450 int fl_type = file_lock->fl_type;
3456 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3457 PFID(ll_inode2fid(inode)), file_lock);
3459 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3461 if (file_lock->fl_flags & FL_FLOCK) {
3462 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3463 /* flocks are whole-file locks */
3464 flock.l_flock.end = OFFSET_MAX;
3465 /* For flocks owner is determined by the local file desctiptor*/
3466 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3467 } else if (file_lock->fl_flags & FL_POSIX) {
3468 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3469 flock.l_flock.start = file_lock->fl_start;
3470 flock.l_flock.end = file_lock->fl_end;
3474 flock.l_flock.pid = file_lock->fl_pid;
3476 /* Somewhat ugly workaround for svc lockd.
3477 * lockd installs custom fl_lmops->lm_compare_owner that checks
3478 * for the fl_owner to be the same (which it always is on local node
3479 * I guess between lockd processes) and then compares pid.
3480 * As such we assign pid to the owner field to make it all work,
3481 * conflict with normal locks is unlikely since pid space and
3482 * pointer space for current->files are not intersecting */
3483 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3484 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
3488 einfo.ei_mode = LCK_PR;
3491 /* An unlock request may or may not have any relation to
3492 * existing locks so we may not be able to pass a lock handle
3493 * via a normal ldlm_lock_cancel() request. The request may even
3494 * unlock a byte range in the middle of an existing lock. In
3495 * order to process an unlock request we need all of the same
3496 * information that is given with a normal read or write record
3497 * lock request. To avoid creating another ldlm unlock (cancel)
3498 * message we'll treat a LCK_NL flock request as an unlock. */
3499 einfo.ei_mode = LCK_NL;
3502 einfo.ei_mode = LCK_PW;
3505 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
3520 flags = LDLM_FL_BLOCK_NOWAIT;
3526 flags = LDLM_FL_TEST_LOCK;
3529 CERROR("unknown fcntl lock command: %d\n", cmd);
3533 /* Save the old mode so that if the mode in the lock changes we
3534 * can decrement the appropriate reader or writer refcount. */
3535 file_lock->fl_type = einfo.ei_mode;
3537 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3538 LUSTRE_OPC_ANY, NULL);
3539 if (IS_ERR(op_data))
3540 RETURN(PTR_ERR(op_data));
3542 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
3543 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
3544 flock.l_flock.pid, flags, einfo.ei_mode,
3545 flock.l_flock.start, flock.l_flock.end);
3547 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
3550 /* Restore the file lock type if not TEST lock. */
3551 if (!(flags & LDLM_FL_TEST_LOCK))
3552 file_lock->fl_type = fl_type;
3554 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
3555 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
3556 !(flags & LDLM_FL_TEST_LOCK))
3557 rc2 = locks_lock_file_wait(file, file_lock);
3559 if ((file_lock->fl_flags & FL_FLOCK) &&
3560 (rc == 0 || file_lock->fl_type == F_UNLCK))
3561 rc2 = flock_lock_file_wait(file, file_lock);
3562 if ((file_lock->fl_flags & FL_POSIX) &&
3563 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3564 !(flags & LDLM_FL_TEST_LOCK))
3565 rc2 = posix_lock_file_wait(file, file_lock);
3566 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
/* If the local VFS bookkeeping failed after a successful server
 * enqueue, cancel the server-side lock (LCK_NL re-enqueue). */
3568 if (rc2 && file_lock->fl_type != F_UNLCK) {
3569 einfo.ei_mode = LCK_NL;
3570 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
3575 ll_finish_md_op_data(op_data);
/* Look up @name under @parent on the MDS (md_getattr_name) and return its
 * FID; when @inode is non-NULL, also instantiate the child inode from the
 * reply via ll_prep_inode(). */
3580 int ll_get_fid_by_name(struct inode *parent, const char *name,
3581 int namelen, struct lu_fid *fid,
3582 struct inode **inode)
3584 struct md_op_data *op_data = NULL;
3585 struct mdt_body *body;
3586 struct ptlrpc_request *req;
3590 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3591 LUSTRE_OPC_ANY, NULL);
3592 if (IS_ERR(op_data))
3593 RETURN(PTR_ERR(op_data));
3595 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
3596 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3597 ll_finish_md_op_data(op_data);
3601 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3603 GOTO(out_req, rc = -EFAULT);
3605 *fid = body->mbo_fid1;
3608 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
3610 ptlrpc_req_finished(req);
/* Migrate directory entry @name (under @parent) to MDT @mdtidx using a
 * rename-with-CLI_MIGRATE md_rename() RPC.  For regular files a write
 * lease is taken first and the data version recorded, so the server can
 * detect concurrent modification; on -EAGAIN (layout changed) the
 * operation is retried.  NOTE(review): several guard/retry lines were
 * dropped by the extraction (embedded numbering gaps). */
3614 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3615 const char *name, int namelen)
3617 struct dentry *dchild = NULL;
3618 struct inode *child_inode = NULL;
3619 struct md_op_data *op_data;
3620 struct ptlrpc_request *request = NULL;
3621 struct obd_client_handle *och = NULL;
3623 struct mdt_body *body;
3625 __u64 data_version = 0;
3628 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3629 name, PFID(ll_inode2fid(parent)), mdtidx);
3631 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3632 0, LUSTRE_OPC_ANY, NULL);
3633 if (IS_ERR(op_data))
3634 RETURN(PTR_ERR(op_data));
3636 /* Get child FID first */
3637 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
3640 dchild = d_lookup(file_dentry(file), &qstr);
3641 if (dchild != NULL) {
3642 if (dchild->d_inode != NULL)
3643 child_inode = igrab(dchild->d_inode);
/* Not in dcache: resolve the FID (and inode) through the MDS. */
3647 if (child_inode == NULL) {
3648 rc = ll_get_fid_by_name(parent, name, namelen,
3649 &op_data->op_fid3, &child_inode);
3654 if (child_inode == NULL)
3655 GOTO(out_free, rc = -EINVAL);
3658 * lfs migrate command needs to be blocked on the client
3659 * by checking the migrate FID against the FID of the
3662 if (child_inode == parent->i_sb->s_root->d_inode)
3663 GOTO(out_iput, rc = -EINVAL);
3665 inode_lock(child_inode);
3666 op_data->op_fid3 = *ll_inode2fid(child_inode);
3667 if (!fid_is_sane(&op_data->op_fid3)) {
3668 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
3669 ll_get_fsname(parent->i_sb, NULL, 0), name,
3670 PFID(&op_data->op_fid3));
3671 GOTO(out_unlock, rc = -EINVAL);
3674 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3676 GOTO(out_unlock, rc);
/* Already on the target MDT: nothing to do. */
3679 CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name,
3680 PFID(&op_data->op_fid3), mdtidx);
3681 GOTO(out_unlock, rc = 0);
3684 if (S_ISREG(child_inode->i_mode)) {
3685 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
3689 GOTO(out_unlock, rc);
3692 rc = ll_data_version(child_inode, &data_version,
3695 GOTO(out_close, rc);
3697 op_data->op_handle = och->och_fh;
3698 op_data->op_data = och->och_mod;
3699 op_data->op_data_version = data_version;
3700 op_data->op_lease_handle = och->och_lease_handle;
3701 op_data->op_bias |= MDS_RENAME_MIGRATE;
3704 op_data->op_mds = mdtidx;
3705 op_data->op_cli_flags = CLI_MIGRATE;
3706 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3707 namelen, name, namelen, &request);
3709 LASSERT(request != NULL);
3710 ll_update_times(request, parent);
3712 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
3713 LASSERT(body != NULL);
3715 /* If the server does release layout lock, then we cleanup
3716 * the client och here, otherwise release it in out_close: */
3718 body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
3719 obd_mod_put(och->och_mod);
3720 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
3722 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
3728 if (request != NULL) {
3729 ptlrpc_req_finished(request);
3733 /* Try again if the file layout has changed. */
3734 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
3738 if (och != NULL) /* close the file */
3739 ll_lease_close(och, child_inode, NULL);
3741 clear_nlink(child_inode);
3743 inode_unlock(child_inode);
3747 ll_finish_md_op_data(op_data);
/* ll_file_noflock(): lock handler for mounts without flock support; only
 * the signature line survived the extraction - body is missing. */
3752 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3760 * test if some locks matching bits and l_req_mode are acquired
3761 * - bits can be in different locks
3762 * - if found clear the common lock bits in *bits
3763 * - the bits not found, are kept in *bits
3765 * \param bits [IN] searched lock bits [IN]
3766 * \param l_req_mode [IN] searched lock mode
3767 * \retval boolean, true iff all bits are found
/* Probe each inodebit separately with a TEST_LOCK md_lock_match() and
 * strip matched bits from *bits; LCK_MINMODE means "any of CR|CW|PR|PW". */
3769 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
3771 struct lustre_handle lockh;
3772 union ldlm_policy_data policy;
3773 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
3774 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
3783 fid = &ll_i2info(inode)->lli_fid;
3784 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3785 ldlm_lockname[mode]);
3787 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
3788 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3789 policy.l_inodebits.bits = *bits & (1 << i);
3790 if (policy.l_inodebits.bits == 0)
3793 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3794 &policy, mode, &lockh)) {
3795 struct ldlm_lock *lock;
3797 lock = ldlm_handle2lock(&lockh);
3800 ~(lock->l_policy_data.l_inodebits.bits);
3801 LDLM_LOCK_PUT(lock);
3803 *bits &= ~policy.l_inodebits.bits;
/* Try to match (and take a reference on) a granted MD lock covering @bits
 * on @inode's resource; returns the matched mode (0 if none), with the
 * lock handle in @lockh. */
3810 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
3811 struct lustre_handle *lockh, __u64 flags,
3812 enum ldlm_mode mode)
3814 union ldlm_policy_data policy = { .l_inodebits = { bits } };
3819 fid = &ll_i2info(inode)->lli_fid;
3820 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3822 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3823 fid, LDLM_IBITS, &policy, mode, lockh);
/* Post-process a revalidation RPC result: -ENOENT on an already-unlinked
 * inode is normal (clear nlink, return success) except for striped dirs
 * with bad stripes, which must be revalidated again; other errors are
 * logged (rate-limited for EACCES/EIDRM). */
3828 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3830 /* Already unlinked. Just update nlink and return success */
3831 if (rc == -ENOENT) {
3833 /* If it is striped directory, and there is bad stripe
3834 * Let's revalidate the dentry again, instead of returning
3836 if (S_ISDIR(inode->i_mode) &&
3837 ll_i2info(inode)->lli_lsm_md != NULL)
3840 /* This path cannot be hit for regular files unless in
3841 * case of obscure races, so no need to to validate
3843 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3845 } else if (rc != 0) {
3846 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3847 "%s: revalidate FID "DFID" error: rc = %d\n",
3848 ll_get_fsname(inode->i_sb, NULL, 0),
3849 PFID(ll_inode2fid(inode)), rc);
/* Revalidate an inode's MD attributes against the MDS.  Two paths:
 * (1) ATTRFID-capable servers: intent getattr/lookup by FID
 *     (md_intent_lock), which also refreshes dentry state and drops the
 *     dentry if the file was unlinked;
 * (2) otherwise: plain md_getattr(), but only when no matching MD ibits
 *     lock is already held locally (ll_have_md_lock). */
3855 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3857 struct inode *inode = dentry->d_inode;
3858 struct ptlrpc_request *req = NULL;
3859 struct obd_export *exp;
3863 LASSERT(inode != NULL);
3865 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3866 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3868 exp = ll_i2mdexp(inode);
3870 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3871 * But under CMD case, it caused some lock issues, should be fixed
3872 * with new CMD ibits lock. See bug 12718 */
3873 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3874 struct lookup_intent oit = { .it_op = IT_GETATTR };
3875 struct md_op_data *op_data;
3877 if (ibits == MDS_INODELOCK_LOOKUP)
3878 oit.it_op = IT_LOOKUP;
3880 /* Call getattr by fid, so do not provide name at all. */
3881 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3882 dentry->d_inode, NULL, 0, 0,
3883 LUSTRE_OPC_ANY, NULL);
3884 if (IS_ERR(op_data))
3885 RETURN(PTR_ERR(op_data));
3887 rc = md_intent_lock(exp, op_data, &oit, &req,
3888 &ll_md_blocking_ast, 0);
3889 ll_finish_md_op_data(op_data);
3891 rc = ll_inode_revalidate_fini(inode, rc);
3895 rc = ll_revalidate_it_finish(req, &oit, dentry);
3897 ll_intent_release(&oit);
3901 /* Unlinked? Unhash dentry, so it is not picked up later by
3902 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3903 here to preserve get_cwd functionality on 2.6.
3905 if (!dentry->d_inode->i_nlink) {
3906 ll_lock_dcache(inode);
3907 d_lustre_invalidate(dentry, 0);
3908 ll_unlock_dcache(inode);
3911 ll_lookup_finish_locks(&oit, dentry);
3912 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3913 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3914 u64 valid = OBD_MD_FLGETATTR;
3915 struct md_op_data *op_data;
/* For regular files also fetch the layout EA, sized to the
 * server's default MD size. */
3918 if (S_ISREG(inode->i_mode)) {
3919 rc = ll_get_default_mdsize(sbi, &ealen);
3922 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3925 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3926 0, ealen, LUSTRE_OPC_ANY,
3928 if (IS_ERR(op_data))
3929 RETURN(PTR_ERR(op_data));
3931 op_data->op_valid = valid;
3932 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3933 ll_finish_md_op_data(op_data);
3935 rc = ll_inode_revalidate_fini(inode, rc);
3939 rc = ll_prep_inode(&inode, req, NULL, NULL);
3942 ptlrpc_req_finished(req);
/* For a striped directory, merge the attributes of all stripes
 * (md_merge_attr) into the master inode: nlink, blocks, size and the
 * cached a/m/ctime in ll_inode_info. */
3946 static int ll_merge_md_attr(struct inode *inode)
3948 struct cl_attr attr = { 0 };
3951 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3952 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3953 &attr, ll_md_blocking_ast);
3957 set_nlink(inode, attr.cat_nlink);
3958 inode->i_blocks = attr.cat_blocks;
3959 i_size_write(inode, attr.cat_size);
3961 ll_i2info(inode)->lli_atime = attr.cat_atime;
3962 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
3963 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/* Revalidate MD attributes, then refresh size/time state: striped dirs
 * get merged stripe attrs, non-regular inodes copy cached a/m/ctime into
 * the VFS inode, and regular files glimpse their size from the OSTs
 * unless an HSM restore is in progress (glimpse would block on the
 * layout lock held by the restore). */
3969 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3971 struct inode *inode = dentry->d_inode;
3975 rc = __ll_inode_revalidate(dentry, ibits);
3979 /* if object isn't regular file, don't validate size */
3980 if (!S_ISREG(inode->i_mode)) {
3981 if (S_ISDIR(inode->i_mode) &&
3982 ll_i2info(inode)->lli_lsm_md != NULL) {
3983 rc = ll_merge_md_attr(inode);
3988 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
3989 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
3990 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
3992 /* In case of restore, the MDT has the right size and has
3993 * already send it back without granting the layout lock,
3994 * inode is up-to-date so glimpse is useless.
3995 * Also to glimpse we need the layout, in case of a running
3996 * restore the MDT holds the layout lock so the glimpse will
3997 * block up to the end of restore (getattr will block)
3999 if (!ll_file_test_flag(ll_i2info(inode), LLIF_FILE_RESTORING))
4000 rc = ll_glimpse_size(inode);
/* Squash a dev_t so its major/minor each fit in 8 bits, for 32-bit compat
 * stat syscalls (see comment below). */
4005 static inline dev_t ll_compat_encode_dev(dev_t dev)
4007 /* The compat_sys_*stat*() syscalls will fail unless the
4008 * device majors and minors are both less than 256. Note that
4009 * the value returned here will be passed through
4010 * old_encode_dev() in cp_compat_stat(). And so we are not
4011 * trying to return a valid compat (u16) device number, just
4012 * one that will pass the old_valid_dev() check. */
4014 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
/*
 * ll_getattr() - VFS ->getattr for Lustre files. Revalidates the inode
 * (UPDATE|LOOKUP ibits) and fills *stat from the inode; ino/dev/rdev are
 * compat-encoded when the mount needs the 32-bit API.
 *
 * Two signatures are compiled depending on the kernel: the enhanced
 * (path/request_mask/flags) form vs. the legacy (mnt/dentry) form.
 * NOTE(review): the "#else" separating the two prototypes and several
 * declaration/return lines are missing from this extract.
 */
4017 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4018 int ll_getattr(const struct path *path, struct kstat *stat,
4019 u32 request_mask, unsigned int flags)
4022 struct dentry *de = path->dentry;
4024 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4027 struct inode *inode = de->d_inode;
4028 struct ll_sb_info *sbi = ll_i2sbi(inode);
4029 struct ll_inode_info *lli = ll_i2info(inode);
/* refresh metadata from the MDT before reporting it */
4032 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
4033 MDS_INODELOCK_LOOKUP);
4034 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
/* fault-injection hook for testing delayed getattr */
4039 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4041 if (ll_need_32bit_api(sbi)) {
/* 32-bit userspace: hash the FID into a 32-bit ino and shrink dev */
4042 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4043 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4044 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4046 stat->ino = inode->i_ino;
4047 stat->dev = inode->i_sb->s_dev;
4048 stat->rdev = inode->i_rdev;
4051 stat->mode = inode->i_mode;
4052 stat->uid = inode->i_uid;
4053 stat->gid = inode->i_gid;
4054 stat->atime = inode->i_atime;
4055 stat->mtime = inode->i_mtime;
4056 stat->ctime = inode->i_ctime;
/* admin-tunable stat blocksize overrides the inode's blkbits */
4057 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4059 stat->nlink = inode->i_nlink;
4060 stat->size = i_size_read(inode);
4061 stat->blocks = inode->i_blocks;
/*
 * ll_fiemap() - VFS ->fiemap handler. Marshals the kernel's
 * fiemap_extent_info into a contiguous struct fiemap buffer, asks
 * ll_do_fiemap() to fill it, and copies the mapped extents back to
 * the user-supplied extent array.
 *
 * NOTE(review): declarations (rc, num_bytes), the allocation-failure
 * check and the final return are missing from this extract.
 */
4066 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4067 __u64 start, __u64 len)
4071 struct fiemap *fiemap;
4072 unsigned int extent_count = fieinfo->fi_extents_max;
4074 num_bytes = sizeof(*fiemap) + (extent_count *
4075 sizeof(struct fiemap_extent));
4076 OBD_ALLOC_LARGE(fiemap, num_bytes);
4081 fiemap->fm_flags = fieinfo->fi_flags;
4082 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4083 fiemap->fm_start = start;
4084 fiemap->fm_length = len;
/* only the first user extent is copied in — presumably ll_do_fiemap()
 * consumes just fm_extents[0] as a continuation cookie; TODO confirm */
4085 if (extent_count > 0 &&
4086 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4087 sizeof(struct fiemap_extent)) != 0)
4088 GOTO(out, rc = -EFAULT);
4090 rc = ll_do_fiemap(inode, fiemap, num_bytes);
4092 fieinfo->fi_flags = fiemap->fm_flags;
4093 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
/* copy all mapped extents back to userspace */
4094 if (extent_count > 0 &&
4095 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4096 fiemap->fm_mapped_extents *
4097 sizeof(struct fiemap_extent)) != 0)
4098 GOTO(out, rc = -EFAULT);
4100 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * ll_get_acl() - VFS ->get_acl: return a referenced copy of the cached
 * POSIX ACL under lli_lock. The caller (VFS permission code) drops the
 * reference.
 */
4104 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4106 struct ll_inode_info *lli = ll_i2info(inode);
4107 struct posix_acl *acl = NULL;
4110 spin_lock(&lli->lli_lock);
4111 /* VFS' acl_permission_check->check_acl will release the refcount */
4112 acl = posix_acl_dup(lli->lli_posix_acl);
4113 spin_unlock(&lli->lli_lock);
4118 #ifdef HAVE_IOP_SET_ACL
4119 #ifdef CONFIG_FS_POSIX_ACL
/*
 * ll_set_acl() - VFS ->set_acl: serialize @acl to its xattr representation
 * and store it via __vfs_setxattr() under the matching system.* name,
 * updating the mode for ACL_TYPE_ACCESS and refreshing the ACL cache.
 *
 * NOTE(review): switch header, break statements, kfree of the xattr value
 * and the final return are missing from this extract.
 */
4120 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4122 const char *name = NULL;
4129 case ACL_TYPE_ACCESS:
/* an access ACL may imply a mode change; may also drop the ACL */
4131 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4135 name = XATTR_NAME_POSIX_ACL_ACCESS;
4137 case ACL_TYPE_DEFAULT:
/* default ACLs only make sense on directories */
4138 if (!S_ISDIR(inode->i_mode))
4139 GOTO(out, rc = acl ? -EACCES : 0);
4140 name = XATTR_NAME_POSIX_ACL_DEFAULT;
4143 GOTO(out, rc = -EINVAL);
4147 size = posix_acl_xattr_size(acl->a_count);
4148 value = kmalloc(size, GFP_NOFS);
4150 GOTO(out, rc = -ENOMEM);
4152 rc = posix_acl_to_xattr(&init_user_ns, acl, value, size);
4157 /* dentry is only used for *.lov attributes so it's safe to be NULL */
4158 rc = __vfs_setxattr(NULL, inode, name, value, size, XATTR_CREATE);
/* on success cache the new ACL, otherwise invalidate the cached one */
4163 set_cached_acl(inode, type, acl);
4165 forget_cached_acl(inode, type);
4168 #endif /* CONFIG_FS_POSIX_ACL */
4169 #endif /* HAVE_IOP_SET_ACL */
4171 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
/*
 * ll_check_acl() - ACL callback for generic_permission() on kernels whose
 * generic_permission() does not look up ACLs itself. Fetches the cached
 * access ACL and checks @mask against it; without CONFIG_FS_POSIX_ACL it
 * presumably degenerates to a fixed return (line not visible here).
 *
 * NOTE(review): return type, NULL-acl handling and the returns are missing
 * from this extract; the two prototypes differ by the RCU flags argument.
 */
4173 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4174 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4176 ll_check_acl(struct inode *inode, int mask)
4179 # ifdef CONFIG_FS_POSIX_ACL
4180 struct posix_acl *acl;
4184 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
/* cannot block in RCU walk — bail out (return not visible) */
4185 if (flags & IPERM_FLAG_RCU)
4188 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4193 rc = posix_acl_permission(inode, acl, mask);
4194 posix_acl_release(acl);
4197 # else /* !CONFIG_FS_POSIX_ACL */
4199 # endif /* CONFIG_FS_POSIX_ACL */
4201 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * ll_inode_permission() - VFS ->permission for Lustre. Revalidates the
 * root inode on first access, applies root squash (replacing root's
 * fsuid/fsgid and dropping FS capabilities via temporary credentials),
 * then defers to generic permission checking with ll_check_acl.
 *
 * Three prototypes are compiled depending on kernel API
 * (flags / bare mask / nameidata variants).
 * NOTE(review): several lines (returns, brace closures, the cred NULL
 * check after prepare_creds(), put_cred and RETURN) are missing from
 * this extract.
 */
4203 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4204 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4206 # ifdef HAVE_INODE_PERMISION_2ARGS
4207 int ll_inode_permission(struct inode *inode, int mask)
4209 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4214 struct ll_sb_info *sbi;
4215 struct root_squash_info *squash;
4216 struct cred *cred = NULL;
4217 const struct cred *old_cred = NULL;
4219 bool squash_id = false;
/* RCU-walk lookups must not block; bail out (return line not visible) */
4222 #ifdef MAY_NOT_BLOCK
4223 if (mask & MAY_NOT_BLOCK)
4225 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4226 if (flags & IPERM_FLAG_RCU)
4230 /* as root inode are NOT getting validated in lookup operation,
4231 * need to do it before permission check. */
4233 if (inode == inode->i_sb->s_root->d_inode) {
4234 rc = __ll_inode_revalidate(inode->i_sb->s_root,
4235 MDS_INODELOCK_LOOKUP);
4240 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4241 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4243 /* squash fsuid/fsgid if needed */
4244 sbi = ll_i2sbi(inode);
4245 squash = &sbi->ll_squash;
4246 if (unlikely(squash->rsi_uid != 0 &&
4247 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4248 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4252 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4253 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4254 squash->rsi_uid, squash->rsi_gid);
4256 /* update current process's credentials
4257 * and FS capability */
4258 cred = prepare_creds();
4262 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4263 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* drop every filesystem-related capability from the squashed creds */
4264 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4265 if ((1 << cap) & CFS_CAP_FS_MASK)
4266 cap_lower(cred->cap_effective, cap);
4268 old_cred = override_creds(cred);
4271 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4272 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4273 /* restore current process's credentials and FS capability */
4275 revert_creds(old_cred);
4282 /* -o localflock - only provides locally consistent flock locks */
/* default file_operations: no .flock/.lock, so the kernel falls back to
 * local (per-node) flock semantics */
4283 struct file_operations ll_file_operations = {
4284 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
/* kernels with read_iter/write_iter; sync wrappers where still needed */
4285 # ifdef HAVE_SYNC_READ_WRITE
4286 .read = new_sync_read,
4287 .write = new_sync_write,
4289 .read_iter = ll_file_read_iter,
4290 .write_iter = ll_file_write_iter,
4291 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
/* older kernels: classic read/write plus aio entry points */
4292 .read = ll_file_read,
4293 .aio_read = ll_file_aio_read,
4294 .write = ll_file_write,
4295 .aio_write = ll_file_aio_write,
4296 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4297 .unlocked_ioctl = ll_file_ioctl,
4298 .open = ll_file_open,
4299 .release = ll_file_release,
4300 .mmap = ll_file_mmap,
4301 .llseek = ll_file_seek,
4302 .splice_read = ll_file_splice_read,
/* file_operations for -o flock mounts: identical to ll_file_operations
 * except .flock/.lock route through ll_file_flock for cluster-coherent
 * advisory locking */
4307 struct file_operations ll_file_operations_flock = {
4308 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4309 # ifdef HAVE_SYNC_READ_WRITE
4310 .read = new_sync_read,
4311 .write = new_sync_write,
4312 # endif /* HAVE_SYNC_READ_WRITE */
4313 .read_iter = ll_file_read_iter,
4314 .write_iter = ll_file_write_iter,
4315 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4316 .read = ll_file_read,
4317 .aio_read = ll_file_aio_read,
4318 .write = ll_file_write,
4319 .aio_write = ll_file_aio_write,
4320 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4321 .unlocked_ioctl = ll_file_ioctl,
4322 .open = ll_file_open,
4323 .release = ll_file_release,
4324 .mmap = ll_file_mmap,
4325 .llseek = ll_file_seek,
4326 .splice_read = ll_file_splice_read,
/* distributed flock/posix locks via the DLM */
4329 .flock = ll_file_flock,
4330 .lock = ll_file_flock
4333 /* These are for -o noflock - to return ENOSYS on flock calls */
/* file_operations for -o noflock mounts: same I/O paths, but
 * .flock/.lock point at ll_file_noflock so lock requests fail */
4334 struct file_operations ll_file_operations_noflock = {
4335 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4336 # ifdef HAVE_SYNC_READ_WRITE
4337 .read = new_sync_read,
4338 .write = new_sync_write,
4339 # endif /* HAVE_SYNC_READ_WRITE */
4340 .read_iter = ll_file_read_iter,
4341 .write_iter = ll_file_write_iter,
4342 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4343 .read = ll_file_read,
4344 .aio_read = ll_file_aio_read,
4345 .write = ll_file_write,
4346 .aio_write = ll_file_aio_write,
4347 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4348 .unlocked_ioctl = ll_file_ioctl,
4349 .open = ll_file_open,
4350 .release = ll_file_release,
4351 .mmap = ll_file_mmap,
4352 .llseek = ll_file_seek,
4353 .splice_read = ll_file_splice_read,
/* explicit rejection of file locking on this mount */
4356 .flock = ll_file_noflock,
4357 .lock = ll_file_noflock
/* inode_operations for regular Lustre files; xattr and ACL entries are
 * compiled in only when the running kernel exposes those hooks */
4360 struct inode_operations ll_file_inode_operations = {
4361 .setattr = ll_setattr,
4362 .getattr = ll_getattr,
4363 .permission = ll_inode_permission,
4364 #ifdef HAVE_IOP_XATTR
4365 .setxattr = ll_setxattr,
4366 .getxattr = ll_getxattr,
4367 .removexattr = ll_removexattr,
4369 .listxattr = ll_listxattr,
4370 .fiemap = ll_fiemap,
4371 #ifdef HAVE_IOP_GET_ACL
4372 .get_acl = ll_get_acl,
4374 #ifdef HAVE_IOP_SET_ACL
4375 .set_acl = ll_set_acl,
/*
 * ll_layout_conf() - push a layout configuration into the cl_object stack
 * via cl_conf_set(). For OBJECT_CONF_SET the layout DLM lock is then
 * allowed to match and the inode's cached layout generation is updated.
 *
 * NOTE(review): env/refcheck declarations, early-exit checks, the cl_layout
 * initializer fields and the final return are missing from this extract.
 */
4379 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4381 struct ll_inode_info *lli = ll_i2info(inode);
4382 struct cl_object *obj = lli->lli_clob;
4391 env = cl_env_get(&refcheck);
4393 RETURN(PTR_ERR(env));
4395 rc = cl_conf_set(env, lli->lli_clob, conf);
4399 if (conf->coc_opc == OBJECT_CONF_SET) {
4400 struct ldlm_lock *lock = conf->coc_lock;
4401 struct cl_layout cl = {
4405 LASSERT(lock != NULL);
4406 LASSERT(ldlm_has_layout(lock));
4408 /* it can only be allowed to match after layout is
4409 * applied to inode otherwise false layout would be
4410 * seen. Applying layout shoud happen before dropping
4411 * the intent lock. */
4412 ldlm_lock_allow_match(lock);
/* read back the generation the cl stack now reports */
4414 rc = cl_object_layout_get(env, obj, &cl);
4419 DFID": layout version change: %u -> %u\n",
4420 PFID(&lli->lli_fid), ll_layout_version_get(lli),
4422 ll_layout_version_set(lli, cl.cl_layout_gen);
4426 cl_env_put(env, &refcheck);
4431 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/*
 * ll_layout_fetch() - ensure @lock carries the file layout in its LVB.
 * If l_lvb_data is absent (lock granted via completion AST), fetch the
 * LOV EA with an MDS getxattr and install a copy as the lock's LVB.
 *
 * NOTE(review): declarations (rc, lmmsize, lmm, lvbdata), several GOTO
 * targets and the final return are missing from this extract.
 */
4432 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4435 struct ll_sb_info *sbi = ll_i2sbi(inode);
4436 struct ptlrpc_request *req;
4437 struct mdt_body *body;
4444 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4445 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4446 lock->l_lvb_data, lock->l_lvb_len);
/* LVB already populated — nothing to fetch */
4448 if (lock->l_lvb_data != NULL)
4451 /* if layout lock was granted right away, the layout is returned
4452 * within DLM_LVB of dlm reply; otherwise if the lock was ever
4453 * blocked and then granted via completion ast, we have to fetch
4454 * layout here. Please note that we can't use the LVB buffer in
4455 * completion AST because it doesn't have a large enough buffer */
4456 rc = ll_get_default_mdsize(sbi, &lmmsize);
4458 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4459 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
4464 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4466 GOTO(out, rc = -EPROTO);
4468 lmmsize = body->mbo_eadatasize;
4469 if (lmmsize == 0) /* empty layout */
4472 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4474 GOTO(out, rc = -EFAULT);
/* copy the EA out of the RPC buffer; the lock owns the copy */
4476 OBD_ALLOC_LARGE(lvbdata, lmmsize);
4477 if (lvbdata == NULL)
4478 GOTO(out, rc = -ENOMEM);
4480 memcpy(lvbdata, lmm, lmmsize);
4481 lock_res_and_lock(lock);
/* install only if no one raced us in; otherwise free our copy below */
4482 if (unlikely(lock->l_lvb_data == NULL)) {
4483 lock->l_lvb_type = LVB_T_LAYOUT;
4484 lock->l_lvb_data = lvbdata;
4485 lock->l_lvb_len = lmmsize;
4488 unlock_res_and_lock(lock);
4491 OBD_FREE_LARGE(lvbdata, lmmsize);
4496 ptlrpc_req_finished(req);
4501 * Apply the layout to the inode. Layout lock is held and will be released
/*
 * ll_layout_lock_set() - apply the layout carried by the granted layout
 * lock (handle @lockh, mode @mode) to @inode: fetch the LVB if needed,
 * configure the cl_object with OBJECT_CONF_SET, release the lock, and if
 * the old layout is still busy wait for in-flight IO via OBJECT_CONF_WAIT.
 *
 * NOTE(review): declarations (rc, lvb_ready), the lvb_ready early path,
 * the wait_layout branch condition and the final return are missing from
 * this extract.
 */
4504 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4505 struct inode *inode)
4507 struct ll_inode_info *lli = ll_i2info(inode);
4508 struct ll_sb_info *sbi = ll_i2sbi(inode);
4509 struct ldlm_lock *lock;
4510 struct cl_object_conf conf;
4513 bool wait_layout = false;
4516 LASSERT(lustre_handle_is_used(lockh));
4518 lock = ldlm_handle2lock(lockh);
4519 LASSERT(lock != NULL);
4520 LASSERT(ldlm_has_layout(lock));
4522 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4523 PFID(&lli->lli_fid), inode);
4525 /* in case this is a caching lock and reinstate with new inode */
4526 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
4528 lock_res_and_lock(lock);
4529 lvb_ready = ldlm_is_lvb_ready(lock);
4530 unlock_res_and_lock(lock);
4532 /* checking lvb_ready is racy but this is okay. The worst case is
4533 * that multi processes may configure the file on the same time. */
4537 rc = ll_layout_fetch(inode, lock);
4541 /* for layout lock, lmm is stored in lock's lvb.
4542 * lvb_data is immutable if the lock is held so it's safe to access it
4545 * set layout to file. Unlikely this will fail as old layout was
4546 * surely eliminated */
4547 memset(&conf, 0, sizeof conf);
4548 conf.coc_opc = OBJECT_CONF_SET;
4549 conf.coc_inode = inode;
4550 conf.coc_lock = lock;
4551 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
4552 conf.u.coc_layout.lb_len = lock->l_lvb_len;
4553 rc = ll_layout_conf(inode, &conf);
4555 /* refresh layout failed, need to wait */
4556 wait_layout = rc == -EBUSY;
/* drop our reference and the DLM lock before possibly waiting */
4559 LDLM_LOCK_PUT(lock);
4560 ldlm_lock_decref(lockh, mode);
4562 /* wait for IO to complete if it's still being used. */
4564 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
4565 ll_get_fsname(inode->i_sb, NULL, 0),
4566 PFID(&lli->lli_fid), inode);
/* OBJECT_CONF_WAIT blocks until users of the stale layout finish */
4568 memset(&conf, 0, sizeof conf);
4569 conf.coc_opc = OBJECT_CONF_WAIT;
4570 conf.coc_inode = inode;
4571 rc = ll_layout_conf(inode, &conf);
4575 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4576 ll_get_fsname(inode->i_sb, NULL, 0),
4577 PFID(&lli->lli_fid), rc);
4583 * Issue layout intent RPC to MDS.
4584 * \param inode [in] file inode
4585 * \param intent [in] layout intent
4587 * \retval 0 on success
4588 * \retval < 0 error code
/*
 * Sends an IT_LAYOUT intent lock request to the MDS carrying @intent as
 * opaque op_data; write/truncate intents request FMODE_WRITE. On success
 * the returned lock data is attached to the inode.
 *
 * NOTE(review): declarations (rc) and the rc==0 guard before
 * ll_set_lock_data(), plus the final return, are missing from this extract.
 */
4590 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
4592 struct ll_inode_info *lli = ll_i2info(inode);
4593 struct ll_sb_info *sbi = ll_i2sbi(inode);
4594 struct md_op_data *op_data;
4595 struct lookup_intent it;
4596 struct ptlrpc_request *req;
4600 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4601 0, 0, LUSTRE_OPC_ANY, NULL);
4602 if (IS_ERR(op_data))
4603 RETURN(PTR_ERR(op_data));
/* the layout intent travels as opaque payload in op_data */
4605 op_data->op_data = intent;
4606 op_data->op_data_size = sizeof(*intent);
4608 memset(&it, 0, sizeof(it));
4609 it.it_op = IT_LAYOUT;
4610 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
4611 intent->li_opc == LAYOUT_INTENT_TRUNC)
4612 it.it_flags = FMODE_WRITE;
4614 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4615 ll_get_fsname(inode->i_sb, NULL, 0),
4616 PFID(&lli->lli_fid), inode);
4618 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
4619 &ll_md_blocking_ast, 0);
/* the intent reply request is no longer needed once processed */
4620 if (it.it_request != NULL)
4621 ptlrpc_req_finished(it.it_request);
4622 it.it_request = NULL;
4624 ll_finish_md_op_data(op_data);
4626 /* set lock data in case this is a new lock */
4628 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4630 ll_intent_drop_lock(&it);
4636 * This function checks if there exists a LAYOUT lock on the client side,
4637 * or enqueues it if it doesn't have one in cache.
4639 * This function will not hold layout lock so it may be revoked any time after
4640 * this function returns. Any operations depend on layout should be redone
4643 * This function should be called before lov_io_init() to get an uptodate
4644 * layout version, the caller should save the version number and after IO
4645 * is finished, this function should be called again to verify that layout
4646 * is not changed during IO time.
/*
 * NOTE(review): the retry loop structure around ll_take_md_lock()/
 * ll_layout_intent(), several returns/brace closures and rc declaration
 * are missing from this extract — the visible statements presumably sit
 * inside a loop that re-checks the cached lock after each intent RPC.
 */
4648 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4650 struct ll_inode_info *lli = ll_i2info(inode);
4651 struct ll_sb_info *sbi = ll_i2sbi(inode);
4652 struct lustre_handle lockh;
4653 struct layout_intent intent = {
4654 .li_opc = LAYOUT_INTENT_ACCESS,
4656 enum ldlm_mode mode;
/* fast path: a valid generation means the cached layout is current */
4660 *gen = ll_layout_version_get(lli);
4661 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
4665 LASSERT(fid_is_sane(ll_inode2fid(inode)));
4666 LASSERT(S_ISREG(inode->i_mode));
4668 /* take layout lock mutex to enqueue layout lock exclusively. */
4669 mutex_lock(&lli->lli_layout_mutex);
4672 /* mostly layout lock is caching on the local side, so try to
4673 * match it before grabbing layout lock mutex. */
4674 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4675 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4676 if (mode != 0) { /* hit cached lock */
4677 rc = ll_layout_lock_set(&lockh, mode, inode);
/* no cached lock: enqueue a fresh layout intent at the MDS */
4683 rc = ll_layout_intent(inode, &intent);
4689 *gen = ll_layout_version_get(lli);
4690 mutex_unlock(&lli->lli_layout_mutex);
4696 * Issue layout intent RPC indicating where in a file an IO is about to write.
4698 * \param[in] inode file inode.
4699 * \param[in] start start offset of fille in bytes where an IO is about to
4701 * \param[in] end exclusive end offset in bytes of the write range.
4703 * \retval 0 on success
4704 * \retval < 0 error code
/*
 * NOTE(review): the extent-field initializers of @intent (presumably
 * lu_extent start/end from @start/@end) and the final return are missing
 * from this extract.
 */
4706 int ll_layout_write_intent(struct inode *inode, __u64 start, __u64 end)
4708 struct layout_intent intent = {
4709 .li_opc = LAYOUT_INTENT_WRITE,
4716 rc = ll_layout_intent(inode, &intent);
4722 * This function send a restore request to the MDT
4724 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4726 struct hsm_user_request *hur;
4730 len = sizeof(struct hsm_user_request) +
4731 sizeof(struct hsm_user_item);
4732 OBD_ALLOC(hur, len);
4736 hur->hur_request.hr_action = HUA_RESTORE;
4737 hur->hur_request.hr_archive_id = 0;
4738 hur->hur_request.hr_flags = 0;
4739 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4740 sizeof(hur->hur_user_item[0].hui_fid));
4741 hur->hur_user_item[0].hui_extent.offset = offset;
4742 hur->hur_user_item[0].hui_extent.length = length;
4743 hur->hur_request.hr_itemcount = 1;
4744 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,