4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2016, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
49 #include <uapi/linux/lustre/lustre_ioctl.h>
50 #include <lustre_swab.h>
52 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
57 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
59 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
/* Allocate a per-open ll_file_data from its slab cache (GFP_NOFS to avoid
 * filesystem recursion during reclaim).
 * NOTE(review): excerpt is truncated — the allocation-failure check, the
 * remaining field initialization and the return statement are not visible
 * here; confirm against the full source. */
62 static struct ll_file_data *ll_file_data_get(void)
64 struct ll_file_data *fd;
66 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
70 fd->fd_write_failed = false;
/* Release a ll_file_data back to its slab cache.
 * NOTE(review): truncated — any NULL guard between these lines is not
 * visible in this excerpt. */
75 static void ll_file_data_put(struct ll_file_data *fd)
78 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
82 * Packs all the attributes into @op_data for the CLOSE rpc.
84 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
85 struct obd_client_handle *och)
/* Seed op_data from the inode, then copy the current in-core attributes
 * (mode, times, size, blocks, flags) so the MDT sees the client's final
 * view of the file at close time. */
89 ll_prep_md_op_data(op_data, inode, NULL, NULL,
90 0, 0, LUSTRE_OPC_ANY, NULL);
92 op_data->op_attr.ia_mode = inode->i_mode;
93 op_data->op_attr.ia_atime = inode->i_atime;
94 op_data->op_attr.ia_mtime = inode->i_mtime;
95 op_data->op_attr.ia_ctime = inode->i_ctime;
96 op_data->op_attr.ia_size = i_size_read(inode);
/* Mark every copied attribute valid so the server applies them all. */
97 op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
98 ATTR_MTIME | ATTR_MTIME_SET |
99 ATTR_CTIME | ATTR_CTIME_SET;
100 op_data->op_attr_blocks = inode->i_blocks;
101 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
/* The open handle being closed. */
102 op_data->op_handle = och->och_fh;
104 if (och->och_flags & FMODE_WRITE &&
105 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
106 /* For HSM: if inode data has been modified, pack it so that
107 * MDT can set data dirty flag in the archive. */
108 op_data->op_bias |= MDS_DATA_MODIFIED;
114 * Perform a close, possibly with a bias.
115 * The meaning of "data" depends on the value of "bias".
117 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
118 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
121 static int ll_close_inode_openhandle(struct inode *inode,
122 struct obd_client_handle *och,
123 enum mds_op_bias bias, void *data)
125 struct obd_export *md_exp = ll_i2mdexp(inode);
126 const struct ll_inode_info *lli = ll_i2info(inode);
127 struct md_op_data *op_data;
128 struct ptlrpc_request *req = NULL;
/* Sanity: the MDC export must still be connected; otherwise the close
 * RPC cannot be sent at all. */
132 if (class_exp2obd(md_exp) == NULL) {
133 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
134 ll_get_fsname(inode->i_sb, NULL, 0),
135 PFID(&lli->lli_fid));
139 OBD_ALLOC_PTR(op_data);
140 /* We leak openhandle and request here on error, but not much to be
141 * done in OOM case since app won't retry close on error either. */
143 GOTO(out, rc = -ENOMEM);
145 ll_prepare_close(inode, op_data, och);
/* Bias-specific packing: each bias interprets @data differently
 * (see function header). NOTE(review): the switch statement line itself
 * is missing from this excerpt. */
147 case MDS_CLOSE_LAYOUT_SWAP:
148 LASSERT(data != NULL);
149 op_data->op_bias |= MDS_CLOSE_LAYOUT_SWAP;
150 op_data->op_data_version = 0;
151 op_data->op_lease_handle = och->och_lease_handle;
/* @data is the second inode whose layout is swapped with ours. */
152 op_data->op_fid2 = *ll_inode2fid(data);
155 case MDS_HSM_RELEASE:
156 LASSERT(data != NULL);
157 op_data->op_bias |= MDS_HSM_RELEASE;
/* @data is a pointer to the data version captured before release. */
158 op_data->op_data_version = *(__u64 *)data;
159 op_data->op_lease_handle = och->och_lease_handle;
160 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
164 LASSERT(data == NULL);
168 rc = md_close(md_exp, op_data, och->och_mod, &req);
/* -EINTR is an expected interruption, not worth a console error. */
169 if (rc != 0 && rc != -EINTR)
170 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
171 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* For biased closes, check whether the server actually executed the
 * close intent (release/swap); if not, the reply body says so. */
174 op_data->op_bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)) {
175 struct mdt_body *body;
177 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
178 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
182 ll_finish_md_op_data(op_data);
186 md_clear_open_replay_data(md_exp, och);
/* Poison the handle cookie so any later use is caught. */
187 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
190 ptlrpc_req_finished(req); /* This is close request */
/* Drop one reference on the per-mode (read/write/exec) MDS open handle
 * cached in the inode; when the last user goes away, send the real close
 * RPC to the MDT. */
194 int ll_md_real_close(struct inode *inode, fmode_t fmode)
196 struct ll_inode_info *lli = ll_i2info(inode);
197 struct obd_client_handle **och_p;
198 struct obd_client_handle *och;
/* Select the handle/usecount pair matching the open mode. */
203 if (fmode & FMODE_WRITE) {
204 och_p = &lli->lli_mds_write_och;
205 och_usecount = &lli->lli_open_fd_write_count;
206 } else if (fmode & FMODE_EXEC) {
207 och_p = &lli->lli_mds_exec_och;
208 och_usecount = &lli->lli_open_fd_exec_count;
210 LASSERT(fmode & FMODE_READ);
211 och_p = &lli->lli_mds_read_och;
212 och_usecount = &lli->lli_open_fd_read_count;
215 mutex_lock(&lli->lli_och_mutex);
216 if (*och_usecount > 0) {
217 /* There are still users of this handle, so skip
219 mutex_unlock(&lli->lli_och_mutex);
225 mutex_unlock(&lli->lli_och_mutex);
228 /* There might be a race and this handle may already
/* Closing with bias 0: a plain close, no HSM/layout intent. */
230 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Per-file-descriptor close: release group lock and lease if held, drop
 * this fd's usecount on the cached MDS open handle, and only talk to the
 * MDT if no matching OPEN lock lets us skip the RPC. Always frees @fd. */
236 static int ll_md_close(struct inode *inode, struct file *file)
238 union ldlm_policy_data policy = {
239 .l_inodebits = { MDS_INODELOCK_OPEN },
/* TEST_LOCK: only probe for a granted OPEN lock, don't take a ref. */
241 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
242 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
243 struct ll_inode_info *lli = ll_i2info(inode);
244 struct lustre_handle lockh;
245 enum ldlm_mode lockmode;
249 /* clear group lock, if present */
250 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
251 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
253 if (fd->fd_lease_och != NULL) {
256 /* Usually the lease is not released when the
257 * application crashed, we need to release here. */
258 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
259 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
260 PFID(&lli->lli_fid), rc, lease_broken);
262 fd->fd_lease_och = NULL;
/* fd_och is set when a lease took ownership of the open handle; close
 * it directly here. */
265 if (fd->fd_och != NULL) {
266 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
271 /* Let's see if we have good enough OPEN lock on the file and if
272 we can skip talking to MDS */
273 mutex_lock(&lli->lli_och_mutex);
274 if (fd->fd_omode & FMODE_WRITE) {
276 LASSERT(lli->lli_open_fd_write_count);
277 lli->lli_open_fd_write_count--;
278 } else if (fd->fd_omode & FMODE_EXEC) {
280 LASSERT(lli->lli_open_fd_exec_count);
281 lli->lli_open_fd_exec_count--;
284 LASSERT(lli->lli_open_fd_read_count);
285 lli->lli_open_fd_read_count--;
287 mutex_unlock(&lli->lli_och_mutex);
/* No cached OPEN ibits lock of the right mode -> must do real close.
 * NOTE(review): lockmode assignment lines are missing from this excerpt. */
289 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
290 LDLM_IBITS, &policy, lockmode, &lockh))
291 rc = ll_md_real_close(inode, fd->fd_omode);
294 LUSTRE_FPRIVATE(file) = NULL;
295 ll_file_data_put(fd);
300 /* While this returns an error code, fput() the caller does not, so we need
301 * to make every effort to clean up all of our state here. Also, applications
302 * rarely check close errors and even if an error is returned they will not
303 * re-try the close call.
305 int ll_file_release(struct inode *inode, struct file *file)
307 struct ll_file_data *fd;
308 struct ll_sb_info *sbi = ll_i2sbi(inode);
309 struct ll_inode_info *lli = ll_i2info(inode);
313 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
314 PFID(ll_inode2fid(inode)), inode);
/* Don't account a RELEASE stat for the filesystem root itself. */
316 if (inode->i_sb->s_root != file_dentry(file))
317 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
318 fd = LUSTRE_FPRIVATE(file);
321 /* The last ref on @file, maybe not the the owner pid of statahead,
322 * because parent and child process can share the same file handle. */
323 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
324 ll_deauthorize_statahead(inode, fd);
/* Root dentry: no MDS close needed, just free the fd and return. */
326 if (inode->i_sb->s_root == file_dentry(file)) {
327 LUSTRE_FPRIVATE(file) = NULL;
328 ll_file_data_put(fd);
/* For regular files, fold any deferred async write errors into the
 * return code of this close. */
332 if (!S_ISDIR(inode->i_mode)) {
333 if (lli->lli_clob != NULL)
334 lov_read_and_clear_async_rc(lli->lli_clob);
335 lli->lli_async_rc = 0;
338 rc = ll_md_close(inode, file);
/* Fault-injection hook: dump debug log on close when requested. */
340 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
341 libcfs_debug_dumplog();
/* Send an IT_OPEN intent to the MDT for @de. The name is packed only when
 * the server lacks open-by-fid support; on success the reply is used to
 * update the inode and set lock data. */
346 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
347 struct lookup_intent *itp)
349 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
350 struct dentry *parent = de->d_parent;
351 const char *name = NULL;
353 struct md_op_data *op_data;
354 struct ptlrpc_request *req = NULL;
358 LASSERT(parent != NULL);
359 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
361 /* if server supports open-by-fid, or file name is invalid, don't pack
362 * name in open request */
363 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
364 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
365 name = de->d_name.name;
366 len = de->d_name.len;
369 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
370 name, len, 0, LUSTRE_OPC_ANY, NULL);
372 RETURN(PTR_ERR(op_data));
/* Optional striping metadata passed through to the MDT. */
373 op_data->op_data = lmm;
374 op_data->op_data_size = lmmsize;
376 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
377 &ll_md_blocking_ast, 0);
378 ll_finish_md_op_data(op_data);
380 /* reason for keep own exit path - don`t flood log
381 * with messages with -ESTALE errors.
/* If the server actually opened the file, release the open handle we
 * don't need on this error path. */
383 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
384 it_open_error(DISP_OPEN_OPEN, itp))
386 ll_release_openhandle(de, itp);
390 if (it_disposition(itp, DISP_LOOKUP_NEG))
391 GOTO(out, rc = -ENOENT);
393 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
394 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
395 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Refresh the inode from the open reply, then attach lock data. */
399 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
400 if (!rc && itp->it_lock_mode)
401 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
404 ptlrpc_req_finished(req);
405 ll_intent_drop_lock(itp);
407 /* We did open by fid, but by the time we got to the server,
408 * the object disappeared. If this is a create, we cannot really
409 * tell the userspace that the file it was trying to create
410 * does not exist. Instead let's return -ESTALE, and the VFS will
411 * retry the create with LOOKUP_REVAL that we are going to catch
412 * in ll_revalidate_dentry() and use lookup then.
414 if (rc == -ENOENT && itp->it_op & IT_CREAT)
/* Populate an obd_client_handle from the MDT open reply carried in the
 * intent, then register it for open replay on recovery. */
420 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
421 struct obd_client_handle *och)
423 struct mdt_body *body;
425 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
426 och->och_fh = body->mbo_handle;
427 och->och_fid = body->mbo_fid1;
428 och->och_lease_handle.cookie = it->it_lock_handle;
429 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
430 och->och_flags = it->it_flags;
432 return md_set_open_replay_data(md_exp, och, it);
/* Finish the client-side part of an open: optionally fill @och from the
 * intent reply, attach @fd to the struct file, and initialize the fd's
 * readahead state, open mode, and cl_io context bookkeeping. */
435 static int ll_local_open(struct file *file, struct lookup_intent *it,
436 struct ll_file_data *fd, struct obd_client_handle *och)
438 struct inode *inode = file_inode(file);
441 LASSERT(!LUSTRE_FPRIVATE(file));
/* NOTE(review): the "if (och)" guard around this fill is not visible in
 * this excerpt; callers do pass och == NULL (see ll_file_open). */
448 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
453 LUSTRE_FPRIVATE(file) = fd;
454 ll_readahead_init(inode, &fd->fd_ras);
455 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
457 /* ll_cl_context initialize */
458 rwlock_init(&fd->fd_lock);
459 INIT_LIST_HEAD(&fd->fd_lccs);
464 /* Open a file, and (for the very first open) create objects on the OSTs at
465 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
466 * creation or open until ll_lov_setstripe() ioctl is called.
468 * If we already have the stripe MD locally then we don't request it in
469 * md_open(), by passing a lmm_size = 0.
471 * It is up to the application to ensure no other processes open this file
472 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
473 * used. We might be able to avoid races of that sort by getting lli_open_sem
474 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
475 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
477 int ll_file_open(struct inode *inode, struct file *file)
479 struct ll_inode_info *lli = ll_i2info(inode);
480 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
481 .it_flags = file->f_flags };
482 struct obd_client_handle **och_p = NULL;
483 __u64 *och_usecount = NULL;
484 struct ll_file_data *fd;
488 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
489 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* An intent may have been stashed in private_data by the lookup path. */
491 it = file->private_data; /* XXX: compat macro */
492 file->private_data = NULL; /* prevent ll_local_open assertion */
494 fd = ll_file_data_get();
496 GOTO(out_openerr, rc = -ENOMEM);
499 if (S_ISDIR(inode->i_mode))
500 ll_authorize_statahead(inode, fd);
/* Root dentry: no MDS open needed, just attach the fd. */
502 if (inode->i_sb->s_root == file_dentry(file)) {
503 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup: build our own IT_OPEN (oit). */
507 if (!it || !it->it_disposition) {
508 /* Convert f_flags into access mode. We cannot use file->f_mode,
509 * because everything but O_ACCMODE mask was stripped from
/* O_ACCMODE trick: O_RDONLY/O_WRONLY/O_RDWR + 1 gives FMODE bits. */
511 if ((oit.it_flags + 1) & O_ACCMODE)
513 if (file->f_flags & O_TRUNC)
514 oit.it_flags |= FMODE_WRITE;
516 /* kernel only call f_op->open in dentry_open. filp_open calls
517 * dentry_open after call to open_namei that checks permissions.
518 * Only nfsd_open call dentry_open directly without checking
519 * permissions and because of that this code below is safe. */
520 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
521 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
523 /* We do not want O_EXCL here, presumably we opened the file
524 * already? XXX - NFS implications? */
525 oit.it_flags &= ~O_EXCL;
527 /* bug20584, if "it_flags" contains O_CREAT, the file will be
528 * created if necessary, then "IT_CREAT" should be set to keep
529 * consistent with it */
530 if (oit.it_flags & O_CREAT)
531 oit.it_op |= IT_CREAT;
537 /* Let's see if we have file open on MDS already. */
538 if (it->it_flags & FMODE_WRITE) {
539 och_p = &lli->lli_mds_write_och;
540 och_usecount = &lli->lli_open_fd_write_count;
541 } else if (it->it_flags & FMODE_EXEC) {
542 och_p = &lli->lli_mds_exec_och;
543 och_usecount = &lli->lli_open_fd_exec_count;
545 och_p = &lli->lli_mds_read_och;
546 och_usecount = &lli->lli_open_fd_read_count;
549 mutex_lock(&lli->lli_och_mutex);
550 if (*och_p) { /* Open handle is present */
551 if (it_disposition(it, DISP_OPEN_OPEN)) {
552 /* Well, there's extra open request that we do not need,
553 let's close it somehow. This will decref request. */
554 rc = it_open_error(DISP_OPEN_OPEN, it);
556 mutex_unlock(&lli->lli_och_mutex);
557 GOTO(out_openerr, rc);
560 ll_release_openhandle(file_dentry(file), it);
/* Reuse the cached handle: local open only, no new och needed. */
564 rc = ll_local_open(file, it, fd, NULL);
567 mutex_unlock(&lli->lli_och_mutex);
568 GOTO(out_openerr, rc);
571 LASSERT(*och_usecount == 0);
572 if (!it->it_disposition) {
573 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
574 /* We cannot just request lock handle now, new ELC code
575 means that one of other OPEN locks for this file
576 could be cancelled, and since blocking ast handler
577 would attempt to grab och_mutex as well, that would
578 result in a deadlock */
579 mutex_unlock(&lli->lli_och_mutex);
581 * Normally called under two situations:
583 * 2. A race/condition on MDS resulting in no open
584 * handle to be returned from LOOKUP|OPEN request,
585 * for example if the target entry was a symlink.
587 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
588 * marked by a bit set in ll_iget_for_nfs. Clear the
589 * bit so that it's not confusing later callers.
591 * NB; when ldd is NULL, it must have come via normal
592 * lookup path only, since ll_iget_for_nfs always calls
595 if (ldd && ldd->lld_nfs_dentry) {
596 ldd->lld_nfs_dentry = 0;
597 it->it_flags |= MDS_OPEN_LOCK;
601 * Always specify MDS_OPEN_BY_FID because we don't want
602 * to get file with different fid.
604 it->it_flags |= MDS_OPEN_BY_FID;
605 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
608 GOTO(out_openerr, rc);
/* First opener of this mode: allocate the cached MDS open handle. */
612 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
614 GOTO(out_och_free, rc = -ENOMEM);
618 /* md_intent_lock() didn't get a request ref if there was an
619 * open error, so don't do cleanup on the request here
621 /* XXX (green): Should not we bail out on any error here, not
622 * just open error? */
623 rc = it_open_error(DISP_OPEN_OPEN, it);
625 GOTO(out_och_free, rc);
627 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
628 "inode %p: disposition %x, status %d\n", inode,
629 it_disposition(it, ~0), it->it_status);
631 rc = ll_local_open(file, it, fd, *och_p);
633 GOTO(out_och_free, rc);
635 mutex_unlock(&lli->lli_och_mutex);
638 /* Must do this outside lli_och_mutex lock to prevent deadlock where
639 different kind of OPEN lock for this same inode gets cancelled
640 by ldlm_cancel_lru */
641 if (!S_ISREG(inode->i_mode))
642 GOTO(out_och_free, rc);
644 cl_lov_delay_create_clear(&file->f_flags);
645 GOTO(out_och_free, rc);
/* Error/exit path: free a half-initialized och, drop statahead
 * authorization and the fd, and release the intent's open reference. */
649 if (och_p && *och_p) {
650 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
651 *och_p = NULL; /* OBD_FREE writes some magic there */
654 mutex_unlock(&lli->lli_och_mutex);
657 if (lli->lli_opendir_key == fd)
658 ll_deauthorize_statahead(inode, fd);
660 ll_file_data_put(fd);
662 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
665 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
666 ptlrpc_req_finished(it->it_request);
667 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* Blocking AST for lease locks: on a blocking callback, asynchronously
 * cancel the lease lock (this is how a lease gets "broken"); nothing to
 * do on the cancel callback itself. */
673 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
674 struct ldlm_lock_desc *desc, void *data, int flag)
677 struct lustre_handle lockh;
681 case LDLM_CB_BLOCKING:
682 ldlm_lock2handle(lock, &lockh);
683 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
685 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
689 case LDLM_CB_CANCELING:
697 * When setting a lease on a file, we take ownership of the lli_mds_*_och
698 * and save it as fd->fd_och so as to force client to reopen the file even
699 * if it has an open lock in cache already.
701 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
702 struct lustre_handle *old_handle)
704 struct ll_inode_info *lli = ll_i2info(inode);
705 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
706 struct obd_client_handle **och_p;
711 /* Get the openhandle of the file */
712 mutex_lock(&lli->lli_och_mutex);
/* Only one lease per fd at a time. */
713 if (fd->fd_lease_och != NULL)
714 GOTO(out_unlock, rc = -EBUSY);
716 if (fd->fd_och == NULL) {
717 if (file->f_mode & FMODE_WRITE) {
718 LASSERT(lli->lli_mds_write_och != NULL);
719 och_p = &lli->lli_mds_write_och;
720 och_usecount = &lli->lli_open_fd_write_count;
722 LASSERT(lli->lli_mds_read_och != NULL);
723 och_p = &lli->lli_mds_read_och;
724 och_usecount = &lli->lli_open_fd_read_count;
/* Handle shared with other openers: cannot take exclusive ownership. */
727 if (*och_usecount > 1)
728 GOTO(out_unlock, rc = -EBUSY);
/* Report the old open handle so the MDT can match the owner.
 * NOTE(review): the transfer of *och_p into fd->fd_och is in lines
 * missing from this excerpt. */
735 *old_handle = fd->fd_och->och_fh;
739 mutex_unlock(&lli->lli_och_mutex);
744 * Release ownership on lli_mds_*_och when putting back a file lease.
746 static int ll_lease_och_release(struct inode *inode, struct file *file)
748 struct ll_inode_info *lli = ll_i2info(inode);
749 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
750 struct obd_client_handle **och_p;
751 struct obd_client_handle *old_och = NULL;
756 mutex_lock(&lli->lli_och_mutex);
757 if (file->f_mode & FMODE_WRITE) {
758 och_p = &lli->lli_mds_write_och;
759 och_usecount = &lli->lli_open_fd_write_count;
761 och_p = &lli->lli_mds_read_och;
762 och_usecount = &lli->lli_open_fd_read_count;
765 /* The file may have been open by another process (broken lease) so
766 * *och_p is not NULL. In this case we should simply increase usecount
769 if (*och_p != NULL) {
770 old_och = fd->fd_och;
777 mutex_unlock(&lli->lli_och_mutex);
/* A displaced handle (lease was broken) must be closed on the MDT. */
780 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
786 * Acquire a lease and open the file.
788 static struct obd_client_handle *
789 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
792 struct lookup_intent it = { .it_op = IT_OPEN };
793 struct ll_sb_info *sbi = ll_i2sbi(inode);
794 struct md_op_data *op_data;
795 struct ptlrpc_request *req = NULL;
796 struct lustre_handle old_handle = { 0 };
797 struct obd_client_handle *och = NULL;
/* Leases are exclusively read or exclusively write. */
802 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
803 RETURN(ERR_PTR(-EINVAL));
/* The requested lease mode must be covered by the file's open mode,
 * and exec opens cannot take a lease. */
806 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
807 RETURN(ERR_PTR(-EPERM));
809 rc = ll_lease_och_acquire(inode, file, &old_handle);
816 RETURN(ERR_PTR(-ENOMEM));
818 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
819 LUSTRE_OPC_ANY, NULL);
821 GOTO(out, rc = PTR_ERR(op_data));
823 /* To tell the MDT this openhandle is from the same owner */
824 op_data->op_handle = old_handle;
826 it.it_flags = fmode | open_flags;
827 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
828 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
829 &ll_md_blocking_lease_ast,
830 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
831 * it can be cancelled which may mislead applications that the lease is
833 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
834 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
835 * doesn't deal with openhandle, so normal openhandle will be leaked. */
836 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
837 ll_finish_md_op_data(op_data);
838 ptlrpc_req_finished(req);
840 GOTO(out_release_it, rc);
842 if (it_disposition(&it, DISP_LOOKUP_NEG))
843 GOTO(out_release_it, rc = -ENOENT);
845 rc = it_open_error(DISP_OPEN_OPEN, &it);
847 GOTO(out_release_it, rc);
849 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
850 ll_och_fill(sbi->ll_md_exp, &it, och);
/* Server that ignored MDS_OPEN_LEASE: fail with EOPNOTSUPP. */
852 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
853 GOTO(out_close, rc = -EOPNOTSUPP);
855 /* already get lease, handle lease lock */
856 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
857 if (it.it_lock_mode == 0 ||
858 it.it_lock_bits != MDS_INODELOCK_OPEN) {
859 /* open lock must return for lease */
860 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
861 PFID(ll_inode2fid(inode)), it.it_lock_mode,
863 GOTO(out_close, rc = -EPROTO);
866 ll_intent_release(&it);
870 /* Cancel open lock */
/* Error unwind: drop the lease lock, close the open handle on the MDT
 * (which frees och), then release the intent. */
871 if (it.it_lock_mode != 0) {
872 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
875 och->och_lease_handle.cookie = 0ULL;
877 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
879 CERROR("%s: error closing file "DFID": %d\n",
880 ll_get_fsname(inode->i_sb, NULL, 0),
881 PFID(&ll_i2info(inode)->lli_fid), rc2);
882 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
884 ll_intent_release(&it);
892 * Check whether a layout swap can be done between two inodes.
894 * \param[in] inode1 First inode to check
895 * \param[in] inode2 Second inode to check
897 * \retval 0 on success, layout swap can be performed between both inodes
898 * \retval negative error code if requirements are not met
900 static int ll_check_swap_layouts_validity(struct inode *inode1,
901 struct inode *inode2)
/* Both must be regular files, writable by the caller, and live on the
 * same superblock (same Lustre filesystem). */
903 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
906 if (inode_permission(inode1, MAY_WRITE) ||
907 inode_permission(inode2, MAY_WRITE))
910 if (inode1->i_sb != inode2->i_sb)
/* Close @och with a MDS_CLOSE_LAYOUT_SWAP bias so the MDT atomically
 * swaps the layouts of @inode and @inode2 as part of the close. */
916 static int ll_swap_layouts_close(struct obd_client_handle *och,
917 struct inode *inode, struct inode *inode2)
919 const struct lu_fid *fid1 = ll_inode2fid(inode);
920 const struct lu_fid *fid2;
924 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
925 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
927 rc = ll_check_swap_layouts_validity(inode, inode2);
929 GOTO(out_free_och, rc);
931 /* We now know that inode2 is a lustre inode */
932 fid2 = ll_inode2fid(inode2);
/* Swapping a file with itself is rejected. */
934 rc = lu_fid_cmp(fid1, fid2);
936 GOTO(out_free_och, rc = -EINVAL);
938 /* Close the file and swap layouts between inode & inode2.
939 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
940 * because we still need it to pack l_remote_handle to MDT. */
941 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
944 och = NULL; /* freed in ll_close_inode_openhandle() */
954 * Release lease and close the file.
955 * It will check if the lease has ever broken.
957 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
960 struct ldlm_lock *lock;
/* Default to "broken" when the lock can no longer be resolved. */
961 bool cancelled = true;
965 lock = ldlm_handle2lock(&och->och_lease_handle);
967 lock_res_and_lock(lock);
968 cancelled = ldlm_is_cancel(lock);
969 unlock_res_and_lock(lock);
973 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
974 PFID(&ll_i2info(inode)->lli_fid), cancelled);
/* If not already cancelled, cancel the lease lock ourselves. */
977 ldlm_cli_cancel(&och->och_lease_handle, 0);
979 if (lease_broken != NULL)
980 *lease_broken = cancelled;
982 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Merge MDS-supplied attributes cached in lli with the size/blocks/times
 * obtained from the OSTs via cl_object_attr_get(), taking the newest
 * timestamp of each kind, under the inode size lock. */
986 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
988 struct ll_inode_info *lli = ll_i2info(inode);
989 struct cl_object *obj = lli->lli_clob;
990 struct cl_attr *attr = vvp_env_thread_attr(env);
998 ll_inode_size_lock(inode);
1000 /* Merge timestamps the most recently obtained from MDS with
1001 * timestamps obtained from OSTs.
1003 * Do not overwrite atime of inode because it may be refreshed
1004 * by file_accessed() function. If the read was served by cache
1005 * data, there is no RPC to be sent so that atime may not be
1006 * transferred to OSTs at all. MDT only updates atime at close time
1007 * if it's at least 'mdd.*.atime_diff' older.
1008 * All in all, the atime in Lustre does not strictly comply with
1009 * POSIX. Solving this problem needs to send an RPC to MDT for each
1010 * read, this will hurt performance. */
1011 if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1012 LTIME_S(inode->i_atime) = lli->lli_atime;
1013 lli->lli_update_atime = 0;
1015 LTIME_S(inode->i_mtime) = lli->lli_mtime;
1016 LTIME_S(inode->i_ctime) = lli->lli_ctime;
/* Snapshot the MDS view before folding in OST attributes. */
1018 atime = LTIME_S(inode->i_atime);
1019 mtime = LTIME_S(inode->i_mtime);
1020 ctime = LTIME_S(inode->i_ctime);
1022 cl_object_attr_lock(obj);
1023 if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1026 rc = cl_object_attr_get(env, obj, attr);
1027 cl_object_attr_unlock(obj);
/* -ENODATA (no layout/objects yet) is not an error for the caller. */
1030 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
/* Keep whichever timestamp is newer, client or OST. */
1032 if (atime < attr->cat_atime)
1033 atime = attr->cat_atime;
1035 if (ctime < attr->cat_ctime)
1036 ctime = attr->cat_ctime;
1038 if (mtime < attr->cat_mtime)
1039 mtime = attr->cat_mtime;
1041 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1042 PFID(&lli->lli_fid), attr->cat_size);
1044 i_size_write(inode, attr->cat_size);
1045 inode->i_blocks = attr->cat_blocks;
1047 LTIME_S(inode->i_atime) = atime;
1048 LTIME_S(inode->i_mtime) = mtime;
1049 LTIME_S(inode->i_ctime) = ctime;
1052 ll_inode_size_unlock(inode);
/* Mirror of the kernel's file_accessed()/touch_atime() checks: returns
 * whether atime updates should be suppressed for this open file, based
 * on O_NOATIME, inode flags, and mount/superblock noatime options. */
1057 static bool file_is_noatime(const struct file *file)
1059 const struct vfsmount *mnt = file->f_path.mnt;
1060 const struct inode *inode = file_inode((struct file *)file);
1062 /* Adapted from file_accessed() and touch_atime().*/
1063 if (file->f_flags & O_NOATIME)
1066 if (inode->i_flags & S_NOATIME)
1069 if (IS_NOATIME(inode))
1072 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1075 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1078 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1084 static int ll_file_io_ptask(struct cfs_ptask *ptask);
/* Initialize a cl_io for a read or write on @file: set up the kiocb and
 * rw parameters from f_flags, choose the lock policy, and enable
 * parallel IO (ptask) when the superblock allows it. */
1086 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1088 struct inode *inode = file_inode(file);
1089 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1091 memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
1092 init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
1093 io->u.ci_rw.rw_file = file;
1094 io->u.ci_rw.rw_ptask = ll_file_io_ptask;
1095 io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
1096 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1098 if (iot == CIT_WRITE) {
1099 io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
/* Synchronous semantics for O_SYNC/O_DIRECT writes (third condition
 * is in a line missing from this excerpt). */
1100 io->u.ci_rw.rw_sync = !!(file->f_flags & O_SYNC ||
1101 file->f_flags & O_DIRECT ||
1104 io->ci_obj = ll_i2info(inode)->lli_clob;
1105 io->ci_lockreq = CILR_MAYBE;
1106 if (ll_file_nolock(file)) {
1107 io->ci_lockreq = CILR_NEVER;
1108 io->ci_no_srvlock = 1;
1109 } else if (file->f_flags & O_APPEND) {
1110 io->ci_lockreq = CILR_MANDATORY;
1112 io->ci_noatime = file_is_noatime(file);
/* Parallel IO is incompatible with append (position dependence). */
1113 if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
1114 io->ci_pio = !io->u.ci_rw.rw_append;
/* Body of one parallel-IO task: build a sub cl_io for the task's range
 * (stored in the cl_io_pt descriptor), run the IO loop, accumulate the
 * byte count into cip_result, and restart on ci_need_restart. */
1119 static int ll_file_io_ptask(struct cfs_ptask *ptask)
1121 struct cl_io_pt *pt = ptask->pt_cbdata;
1122 struct file *file = pt->cip_file;
1125 loff_t pos = pt->cip_pos;
1130 env = cl_env_get(&refcheck);
1132 RETURN(PTR_ERR(env));
1134 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1135 file_dentry(file)->d_name.name,
1136 pt->cip_iot == CIT_READ ? "read" : "write",
1137 pos, pos + pt->cip_count);
1140 io = vvp_env_thread_io(env);
1141 ll_io_init(io, file, pt->cip_iot);
1142 io->u.ci_rw.rw_iter = pt->cip_iter;
1143 io->u.ci_rw.rw_iocb = pt->cip_iocb;
1144 io->ci_pio = 0; /* It's already in parallel task */
/* Range shrinks by what previous attempts already transferred. */
1146 rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
1147 pt->cip_count - pt->cip_result);
1149 struct vvp_io *vio = vvp_env_io(env);
1151 vio->vui_io_subtype = IO_NORMAL;
1152 vio->vui_fd = LUSTRE_FPRIVATE(file);
1154 ll_cl_add(file, env, io, LCC_RW);
1155 rc = cl_io_loop(env, io);
1156 ll_cl_remove(file, env);
1158 /* cl_io_rw_init() handled IO */
/* Fault injection: force a failure for PIO testing. */
1162 if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {
/* Account progress and advance the iterator/iocb accordingly. */
1168 if (io->ci_nob > 0) {
1169 pt->cip_result += io->ci_nob;
1170 iov_iter_advance(&pt->cip_iter, io->ci_nob);
1172 pt->cip_iocb.ki_pos = pos;
1173 #ifdef HAVE_KIOCB_KI_LEFT
1174 pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
1175 #elif defined(HAVE_KI_NBYTES)
1176 pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;
1180 cl_io_fini(env, io);
/* Restart the remaining range if the layout changed mid-IO. */
1182 if ((rc == 0 || rc == -ENODATA) &&
1183 pt->cip_result < pt->cip_count &&
1184 io->ci_need_restart) {
1186 "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1187 file_dentry(file)->d_name.name,
1188 pt->cip_iot == CIT_READ ? "read" : "write",
1189 pos, pos + pt->cip_count - pt->cip_result,
1190 pt->cip_result, rc);
1194 CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
1195 file_dentry(file)->d_name.name,
1196 pt->cip_iot == CIT_READ ? "read" : "write",
1197 pt->cip_result, rc);
1199 cl_env_put(env, &refcheck);
/* Partial success wins over the error code. */
1200 RETURN(pt->cip_result > 0 ? 0 : rc);
/*
 * Common engine for the llite read/write paths (read_iter/write_iter and
 * splice): build a cl_io from @args, take the per-inode range lock where
 * required, run the cl_io loop, and account the transferred bytes.
 *
 * NOTE(review): this extracted chunk has elided lines; comments below
 * describe only the code that is visible here.
 */
ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
		   struct file *file, enum cl_io_type iot,
		   loff_t *ppos, size_t count)
	struct range_lock	 range;
	struct vvp_io		*vio = vvp_env_io(env);
	struct inode		*inode = file_inode(file);
	struct ll_inode_info	*lli = ll_i2info(inode);
	struct ll_file_data	*fd = LUSTRE_FPRIVATE(file);

	CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
	       file_dentry(file)->d_name.name,
	       iot == CIT_READ ? "read" : "write", pos, pos + count);

	io = vvp_env_thread_io(env);
	ll_io_init(io, file, iot);
	if (args->via_io_subtype == IO_NORMAL) {
		/* snapshot the caller's iterator and iocb into the cl_io */
		io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
		io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;

	if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
		bool range_locked = false;

		/* O_APPEND writes can extend the file, so the lock must
		 * cover out to EOF, not just [pos, pos + count) */
		if (file->f_flags & O_APPEND)
			range_lock_init(&range, 0, LUSTRE_EOF);
			range_lock_init(&range, pos, pos + count - 1);

		vio->vui_fd = LUSTRE_FPRIVATE(file);
		vio->vui_io_subtype = args->via_io_subtype;

		switch (vio->vui_io_subtype) {
			/* Direct IO reads must also take range lock,
			 * or multiple reads will try to work on the same pages
			 * See LU-6227 for details. */
			if (((iot == CIT_WRITE) ||
			     (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
			    !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
				CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
				rc = range_lock(&lli->lli_write_tree, &range);
				range_locked = true;
			vio->u.splice.vui_pipe = args->u.splice.via_pipe;
			vio->u.splice.vui_flags = args->u.splice.via_flags;
			CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);

		ll_cl_add(file, env, io, LCC_RW);
		/* take i_mutex for writes that may need SUID/SGID handling */
		if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
		    !lli->lli_inode_locked) {
			lli->lli_inode_locked = 1;
		rc = cl_io_loop(env, io);
		if (lli->lli_inode_locked) {
			lli->lli_inode_locked = 0;
			inode_unlock(inode);
		ll_cl_remove(file, env);
			CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
			range_unlock(&lli->lli_write_tree, &range);
		/* cl_io_rw_init() handled IO */

	if (io->ci_nob > 0) {
		/* bytes were transferred: advance result/position/iterator */
		result += io->ci_nob;
		count -= io->ci_nob;
		if (args->via_io_subtype == IO_NORMAL) {
			iov_iter_advance(args->u.normal.via_iter, io->ci_nob);
			args->u.normal.via_iocb->ki_pos = pos;
#ifdef HAVE_KIOCB_KI_LEFT
			args->u.normal.via_iocb->ki_left = count;
#elif defined(HAVE_KI_NBYTES)
			args->u.normal.via_iocb->ki_nbytes = count;
		pos = io->u.ci_rw.rw_range.cir_pos;

	cl_io_fini(env, io);

	/* short transfer with a restart requested (e.g. layout change) */
	if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
		"%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
		file_dentry(file)->d_name.name,
		iot == CIT_READ ? "read" : "write",
		pos, pos + count, result, rc);

	if (iot == CIT_READ) {
		ll_stats_ops_tally(ll_i2sbi(inode),
				   LPROC_LL_READ_BYTES, result);
	} else if (iot == CIT_WRITE) {
		ll_stats_ops_tally(ll_i2sbi(inode),
				   LPROC_LL_WRITE_BYTES, result);
		fd->fd_write_failed = false;
	} else if (result == 0 && rc == 0) {
		fd->fd_write_failed = true;
		fd->fd_write_failed = false;
	} else if (rc != -ERESTARTSYS) {
		/* a real write failure (not just an interrupted syscall) */
		fd->fd_write_failed = true;

	CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
	       file_dentry(file)->d_name.name,
	       iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);

	RETURN(result > 0 ? result : rc);
1354 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1355 * especially for small I/O.
1357 * To serve a read request, CLIO has to create and initialize a cl_io and
 * then request DLM lock. This has turned out to have significant overhead
1359 * and affects the performance of small I/O dramatically.
1361 * It's not necessary to create a cl_io for each I/O. Under the help of read
1362 * ahead, most of the pages being read are already in memory cache and we can
1363 * read those pages directly because if the pages exist, the corresponding DLM
1364 * lock must exist so that page content must be valid.
1366 * In fast read implementation, the llite speculatively finds and reads pages
1367 * in memory cache. There are three scenarios for fast read:
1368 * - If the page exists and is uptodate, kernel VM will provide the data and
1369 * CLIO won't be intervened;
1370 * - If the page was brought into memory by read ahead, it will be exported
1371 * and read ahead parameters will be updated;
1372 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1373 * it will go back and invoke normal read, i.e., a cl_io will be created
1374 * and DLM lock will be requested.
1376 * POSIX compliance: posix standard states that read is intended to be atomic.
1377 * Lustre read implementation is in line with Linux kernel read implementation
1378 * and neither of them complies with POSIX standard in this matter. Fast read
1379 * doesn't make the situation worse on single node but it may interleave write
1380 * results from multiple nodes due to short read handling in ll_file_aio_read().
1382 * \param env - lu_env
1383 * \param iocb - kiocb from kernel
1384 * \param iter - user space buffers where the data will be copied
1386 * \retval - number of bytes have been read, or error code if error occurred.
/*
 * Speculative "fast read" straight from the page cache, bypassing cl_io
 * setup (see the block comment above).  Returns -ENODATA when a needed
 * page is not cached so the caller can fall back to the normal path.
 */
ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
	/* fast read may be disabled per-superblock via tunables */
	if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))

	/* NB: we can't do direct IO for fast read because it will need a lock
	 * to make IO engine happy. */
	if (iocb->ki_filp->f_flags & O_DIRECT)

	result = generic_file_read_iter(iocb, iter);

	/* If the first page is not in cache, generic_file_aio_read() will be
	 * returned with -ENODATA.
	 * See corresponding code in ll_readpage(). */
	if (result == -ENODATA)

	ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
			   LPROC_LL_READ_BYTES, result);
/*
 * Read from a file (through the page cache).
 * ->read_iter entry point: try the lockless fast read first, then fall
 * back to the full cl_io read path for whatever remains.
 */
static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
	struct vvp_io_args *args;

	result = ll_do_fast_read(iocb, to);
	/* done if fast read failed outright or consumed the whole iter */
	if (result < 0 || iov_iter_count(to) == 0)

	env = cl_env_get(&refcheck);
		return PTR_ERR(env);

	args = ll_env_args(env, IO_NORMAL);
	args->u.normal.via_iter = to;
	args->u.normal.via_iocb = iocb;

	rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
				 &iocb->ki_pos, iov_iter_count(to));
	else if (result == 0)

	cl_env_put(env, &refcheck);
/*
 * Write to a file (through the page cache).
 * ->write_iter entry point: route the iterator through the common
 * ll_file_io_generic() engine as a CIT_WRITE.
 */
static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
	struct vvp_io_args *args;

	env = cl_env_get(&refcheck);
		return PTR_ERR(env);

	args = ll_env_args(env, IO_NORMAL);
	args->u.normal.via_iter = from;
	args->u.normal.via_iocb = iocb;

	result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
				    &iocb->ki_pos, iov_iter_count(from));
	cl_env_put(env, &refcheck);
1475 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
/*
 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
 * Validate an iovec array for the legacy aio read/write paths and
 * compute the total byte count across all segments.
 */
static int ll_file_get_iov_count(const struct iovec *iov,
				 unsigned long *nr_segs, size_t *count)
	for (seg = 0; seg < *nr_segs; seg++) {
		const struct iovec *iv = &iov[seg];

		/*
		 * If any segment has a negative length, or the cumulative
		 * length ever wraps negative then return -EINVAL.
		 */
		if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
		/* segment is a readable user address: accept and continue */
		if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
		cnt -= iv->iov_len;	/* This segment is no good */
/*
 * Legacy aio_read entry point (pre read_iter kernels): validate the
 * iovec, wrap it in an iov_iter, and hand off to ll_file_read_iter().
 */
static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
				unsigned long nr_segs, loff_t pos)
	result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);

# ifdef HAVE_IOV_ITER_INIT_DIRECTION
	iov_iter_init(&to, READ, iov, nr_segs, iov_count);
# else /* !HAVE_IOV_ITER_INIT_DIRECTION */
	iov_iter_init(&to, iov, nr_segs, iov_count, 0);
# endif /* HAVE_IOV_ITER_INIT_DIRECTION */

	result = ll_file_read_iter(iocb, &to);
1530 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1533 struct iovec iov = { .iov_base = buf, .iov_len = count };
1538 init_sync_kiocb(&kiocb, file);
1539 kiocb.ki_pos = *ppos;
1540 #ifdef HAVE_KIOCB_KI_LEFT
1541 kiocb.ki_left = count;
1542 #elif defined(HAVE_KI_NBYTES)
1543 kiocb.i_nbytes = count;
1546 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1547 *ppos = kiocb.ki_pos;
/*
 * Write to a file (through the page cache).
 * Legacy aio_write entry point: validate the iovec, build an iov_iter,
 * and hand off to ll_file_write_iter().
 */
static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
				 unsigned long nr_segs, loff_t pos)
	struct iov_iter from;

	result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);

# ifdef HAVE_IOV_ITER_INIT_DIRECTION
	iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
# else /* !HAVE_IOV_ITER_INIT_DIRECTION */
	iov_iter_init(&from, iov, nr_segs, iov_count, 0);
# endif /* HAVE_IOV_ITER_INIT_DIRECTION */

	result = ll_file_write_iter(iocb, &from);
/*
 * Legacy write(2) entry point for kernels without ->write_iter: wrap the
 * user buffer in a single iovec and a synchronous kiocb (taken from the
 * lu_env scratch area) and drive it through ll_file_aio_write().
 */
static ssize_t ll_file_write(struct file *file, const char __user *buf,
			     size_t count, loff_t *ppos)
	struct iovec iov = { .iov_base = (void __user *)buf,
	struct kiocb *kiocb;

	env = cl_env_get(&refcheck);
		RETURN(PTR_ERR(env));

	/* kiocb lives in the per-thread env to keep it off the stack */
	kiocb = &ll_env_info(env)->lti_kiocb;
	init_sync_kiocb(kiocb, file);
	kiocb->ki_pos = *ppos;
#ifdef HAVE_KIOCB_KI_LEFT
	kiocb->ki_left = count;
#elif defined(HAVE_KI_NBYTES)
	kiocb->ki_nbytes = count;

	result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos);
	*ppos = kiocb->ki_pos;

	cl_env_put(env, &refcheck);
1609 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
/*
 * Send file content (through pagecache) somewhere with helper
 * ->splice_read entry point: run a CIT_READ through the common engine
 * with the IO_SPLICE subtype so pages go to @pipe instead of a buffer.
 */
static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
				   struct pipe_inode_info *pipe, size_t count,
	struct vvp_io_args *args;

	env = cl_env_get(&refcheck);
		RETURN(PTR_ERR(env));

	args = ll_env_args(env, IO_SPLICE);
	args->u.splice.via_pipe = pipe;
	args->u.splice.via_flags = flags;

	result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
	cl_env_put(env, &refcheck);
/*
 * Set the striping/layout EA on @inode by re-opening it by FID with the
 * supplied lov_user_md, under the inode size lock.  The transient open
 * handle is released before returning.
 */
int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
			     __u64 flags, struct lov_user_md *lum, int lum_size)
	struct lookup_intent oit = {
		.it_flags = flags | MDS_OPEN_BY_FID,

	ll_inode_size_lock(inode);
	rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
		GOTO(out_unlock, rc);

	/* drop the open handle created purely to carry the setstripe */
	ll_release_openhandle(dentry, &oit);

	ll_inode_size_unlock(inode);
	ll_intent_release(&oit);
/*
 * Fetch the LOV EA (layout) for @filename from the MDS.  On success
 * *lmmp/*lmm_size point into the reply buffer held by *request, which
 * the caller must eventually release.
 */
int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
			     struct lov_mds_md **lmmp, int *lmm_size,
			     struct ptlrpc_request **request)
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct mdt_body  *body;
	struct lov_mds_md *lmm = NULL;
	struct ptlrpc_request *req = NULL;
	struct md_op_data *op_data;

	rc = ll_get_default_mdsize(sbi, &lmmsize);

	op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
				     strlen(filename), lmmsize,
				     LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		RETURN(PTR_ERR(op_data));

	op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
	rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
	ll_finish_md_op_data(op_data);
		CDEBUG(D_INFO, "md_getattr_name failed "
		       "on %s: rc %d\n", filename, rc);

	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
	LASSERT(body != NULL); /* checked by mdc_getattr_name */

	lmmsize = body->mbo_eadatasize;

	/* no EA data present: nothing to return */
	if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
		GOTO(out, rc = -ENODATA);

	lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
	LASSERT(lmm != NULL);

	/* only plain v1/v3 and composite layouts are understood here */
	if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
	    lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
	    lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
		GOTO(out, rc = -EPROTO);

	/*
	 * This is coming from the MDS, so is probably in
	 * little endian. We convert it to host endian before
	 * passing it to userspace.
	 */
	if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
		if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
		    lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
			stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
			if (le32_to_cpu(lmm->lmm_pattern) &
			    LOV_PATTERN_F_RELEASED)

		/* if function called for directory - we should
		 * avoid swab non-existent lsm objects */
		if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
			lustre_swab_lov_user_md_v1(
				(struct lov_user_md_v1 *)lmm);
			if (S_ISREG(body->mbo_mode))
				lustre_swab_lov_user_md_objects(
				 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
		} else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
			lustre_swab_lov_user_md_v3(
				(struct lov_user_md_v3 *)lmm);
			if (S_ISREG(body->mbo_mode))
				lustre_swab_lov_user_md_objects(
				 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
		} else if (lmm->lmm_magic ==
			   cpu_to_le32(LOV_MAGIC_COMP_V1)) {
			lustre_swab_lov_comp_md_v1(
				(struct lov_comp_md_v1 *)lmm);

	*lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA: privileged (CAP_SYS_ADMIN) variant of setstripe that
 * accepts pre-existing objects (MDS_OPEN_HAS_OBJS) from userspace.
 */
static int ll_lov_setea(struct inode *inode, struct file *file,
	__u64			 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
	struct lov_user_md	*lump;
	int			 lum_size = sizeof(struct lov_user_md) +
					    sizeof(struct lov_user_ost_data);

	if (!cfs_capable(CFS_CAP_SYS_ADMIN))

	OBD_ALLOC_LARGE(lump, lum_size);

	if (copy_from_user(lump, arg, lum_size))
		GOTO(out_lump, rc = -EFAULT);

	rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
	/* layout is set (or failed): clear the delay-create flag */
	cl_lov_delay_create_clear(&file->f_flags);

	OBD_FREE_LARGE(lump, lum_size);
/*
 * Copy this file's layout (LOV EA) into the user buffer @lum of @size
 * bytes via the cl_object layer.
 */
static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
	env = cl_env_get(&refcheck);
		RETURN(PTR_ERR(env));

	rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
	cl_env_put(env, &refcheck);
/*
 * LL_IOC_LOV_SETSTRIPE: set the file layout from a user-supplied
 * lov_user_md, refresh the layout generation, and echo the resulting
 * stripe information back to the user buffer.
 */
static int ll_lov_setstripe(struct inode *inode, struct file *file,
	struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
	struct lov_user_md *klum;
	__u64 flags = FMODE_WRITE;

	/* copy (and validate/size) the user md into a kernel buffer */
	rc = ll_copy_user_md(lum, &klum);

	rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,

		/* NOTE(review): zeroes the user-visible stripe_count before
		 * re-reading the layout below — confirm against full source */
		rc = put_user(0, &lum->lmm_stripe_count);

		rc = ll_layout_refresh(inode, &gen);

		rc = ll_file_getstripe(inode, arg, lum_size);

	cl_lov_delay_create_clear(&file->f_flags);

	OBD_FREE(klum, lum_size);
/*
 * LL_IOC_GROUP_LOCK: take a group (gid-keyed) lock covering the file on
 * behalf of this fd.  Fails if gid is 0, the file is open with no-lock
 * semantics, or this fd already holds a group lock.
 */
ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
	struct ll_inode_info	*lli = ll_i2info(inode);
	struct cl_object	*obj = lli->lli_clob;
	struct ll_file_data	*fd = LUSTRE_FPRIVATE(file);
	struct ll_grouplock	 grouplock;

		CWARN("group id for group lock must not be 0\n");

	if (ll_file_nolock(file))
		RETURN(-EOPNOTSUPP);

	spin_lock(&lli->lli_lock);
	if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
		CWARN("group lock already existed with gid %lu\n",
		      fd->fd_grouplock.lg_gid);
		spin_unlock(&lli->lli_lock);
	LASSERT(fd->fd_grouplock.lg_lock == NULL);
	spin_unlock(&lli->lli_lock);

	/*
	 * XXX: group lock needs to protect all OST objects while PFL
	 * can add new OST objects during the IO, so we'd instantiate
	 * all OST objects before getting its group lock.
	 */
		struct cl_layout cl = {
			.cl_is_composite = false,

		env = cl_env_get(&refcheck);
			RETURN(PTR_ERR(env));

		rc = cl_object_layout_get(env, obj, &cl);
		/* composite (PFL) layout: instantiate all components first */
		if (!rc && cl.cl_is_composite)
			rc = ll_layout_write_intent(inode, 0, OBD_OBJECT_EOF);

		cl_env_put(env, &refcheck);

	rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
			      arg, (file->f_flags & O_NONBLOCK), &grouplock);

	spin_lock(&lli->lli_lock);
	if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
		/* lost a race against another thread locking this fd */
		spin_unlock(&lli->lli_lock);
		CERROR("another thread just won the race\n");
		cl_put_grouplock(&grouplock);

	fd->fd_flags |= LL_FILE_GROUP_LOCKED;
	fd->fd_grouplock = grouplock;
	spin_unlock(&lli->lli_lock);

	CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK: release the group lock this fd holds for gid
 * @arg; warns (without dropping anything) on mismatched or absent gid.
 */
static int ll_put_grouplock(struct inode *inode, struct file *file,
	struct ll_inode_info	*lli = ll_i2info(inode);
	struct ll_file_data	*fd = LUSTRE_FPRIVATE(file);
	struct ll_grouplock	 grouplock;

	spin_lock(&lli->lli_lock);
	if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
		spin_unlock(&lli->lli_lock);
		CWARN("no group lock held\n");
	LASSERT(fd->fd_grouplock.lg_lock != NULL);

	if (fd->fd_grouplock.lg_gid != arg) {
		CWARN("group lock %lu doesn't match current id %lu\n",
		      arg, fd->fd_grouplock.lg_gid);
		spin_unlock(&lli->lli_lock);

	/* detach the lock from the fd under the spinlock; drop it outside */
	grouplock = fd->fd_grouplock;
	memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
	fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
	spin_unlock(&lli->lli_lock);

	cl_put_grouplock(&grouplock);
	CDEBUG(D_INFO, "group lock %lu released\n", arg);
/**
 * Close inode open handle
 *
 * \param dentry [in]     dentry which contains the inode
 * \param it     [in,out] intent which contains open info and result
 *
 * \retval <0 failure
 */
int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
	struct inode *inode = dentry->d_inode;
	struct obd_client_handle *och;

	/* Root ? Do nothing. */
	if (dentry->d_inode->i_sb->s_root == dentry)

	/* No open handle to close? Move away */
	if (!it_disposition(it, DISP_OPEN_OPEN))

	LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);

	OBD_ALLOC(och, sizeof(*och));
		GOTO(out, rc = -ENOMEM);

	/* fill the handle from the intent, then close it on the MDT */
	ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);

	rc = ll_close_inode_openhandle(inode, och, 0, NULL);

	/* this one is in place of ll_file_open */
	if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
		ptlrpc_req_finished(it->it_request);
		it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1989 * Get size for inode for which FIEMAP mapping is requested.
1990 * Make the FIEMAP get_info call and returns the result.
 * \param fiemap	kernel buffer to hold extents
1992 * \param num_bytes kernel buffer size
/*
 * Resolve a FIEMAP request for @inode: validate the flags, optionally
 * sync dirty pages (FIEMAP_FLAG_SYNC), glimpse size if unknown, then
 * ask the cl_object layer to fill in the extent mapping.
 */
static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
	struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };

	/* Checks for fiemap flags */
	if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
		/* report back which flags we do NOT support */
		fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;

	/* Check for FIEMAP_FLAG_SYNC */
	if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
		rc = filemap_fdatawrite(inode->i_mapping);

	env = cl_env_get(&refcheck);
		RETURN(PTR_ERR(env));

	/* local size may be stale; glimpse from the OSTs if it reads 0 */
	if (i_size_read(inode) == 0) {
		rc = ll_glimpse_size(inode);

	fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
	obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
	obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);

	/* If filesize is 0, then there would be no objects for mapping */
	if (fmkey.lfik_oa.o_size == 0) {
		fiemap->fm_mapped_extents = 0;

	fmkey.lfik_fiemap = *fiemap;

	rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
			      &fmkey, fiemap, &num_bytes);

	cl_env_put(env, &refcheck);
/*
 * LL_IOC_FID2PATH: resolve a FID to a pathname on the MDT for userspace.
 * The caller-supplied gf_pathlen bounds the reply buffer; access is
 * limited to CAP_DAC_READ_SEARCH unless the mount allows user fid2path.
 */
int ll_fid2path(struct inode *inode, void __user *arg)
	struct obd_export *exp = ll_i2mdexp(inode);
	const struct getinfo_fid2path __user *gfin = arg;
	struct getinfo_fid2path *gfout;

	if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
	    !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))

	/* Only need to get the buflen */
	if (get_user(pathlen, &gfin->gf_pathlen))

	if (pathlen > PATH_MAX)

	outsize = sizeof(*gfout) + pathlen;
	OBD_ALLOC(gfout, outsize);

	if (copy_from_user(gfout, arg, sizeof(*gfout)))
		GOTO(gf_free, rc = -EFAULT);
	/* append root FID after gfout to let MDT know the root FID so that it
	 * can lookup the correct path, this is mainly for fileset.
	 * old server without fileset mount support will ignore this. */
	*gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);

	/* Call mdc_iocontrol */
	rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);

	if (copy_to_user(arg, gfout, outsize))

	OBD_FREE(gfout, outsize);
/**
 * Read the data_version for inode.
 *
 * This value is computed using stripe object version on OST.
 * Version is computed using server side locking.
 *
 * @param flags if do sync on the OST side;
 *		LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
 *		LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
 */
int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
	struct cl_object *obj = ll_i2info(inode)->lli_clob;

	/* If no file object initialized, we consider its version is 0. */

	env = cl_env_get(&refcheck);
		RETURN(PTR_ERR(env));

	io = vvp_env_thread_io(env);
	io->u.ci_data_version.dv_data_version = 0;
	io->u.ci_data_version.dv_flags = flags;

	if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
		result = cl_io_loop(env, io);
		result = io->ci_result;

	*data_version = io->u.ci_data_version.dv_data_version;

	cl_io_fini(env, io);

	/* NOTE(review): restart handling elided in this extract — a layout
	 * change during the IO presumably re-runs the request; confirm */
	if (unlikely(io->ci_need_restart))

	cl_env_put(env, &refcheck);
/**
 * Trigger a HSM release request for the provided inode.
 * Takes a write lease, flushes and snapshots the data version and
 * [am]time, then closes the handle with MDS_HSM_RELEASE so the MDT can
 * punch the objects if the version still matches.
 */
int ll_hsm_release(struct inode *inode)
	struct obd_client_handle *och = NULL;
	__u64 data_version = 0;

	CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
	       ll_get_fsname(inode->i_sb, NULL, 0),
	       PFID(&ll_i2info(inode)->lli_fid));

	och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
		GOTO(out, rc = PTR_ERR(och));

	/* Grab latest data_version and [am]time values */
	rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);

	env = cl_env_get(&refcheck);
		GOTO(out, rc = PTR_ERR(env));

	rc = ll_merge_attr(env, inode);
	cl_env_put(env, &refcheck);

	/* If error happen, we have the wrong size for a file.

	/* Release the file.
	 * NB: lease lock handle is released in mdc_hsm_release_pack() because
	 * we still need it to pack l_remote_handle to MDT. */
	rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,

	if (och != NULL && !IS_ERR(och)) /* close the file */
		ll_lease_close(och, inode, NULL);
/* Scratch state for ll_swap_layouts(); inode1/inode2 are kept in FID
 * order so the two-file operation always locks in a stable sequence. */
struct ll_swap_stack {
	struct inode	*inode1;
	struct inode	*inode2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS: atomically exchange the layouts of two files
 * on the MDT, optionally verifying data versions and flushing caches
 * under a temporary group lock.
 */
static int ll_swap_layouts(struct file *file1, struct file *file2,
			   struct lustre_swap_layouts *lsl)
	struct mdc_swap_layouts	 msl;
	struct md_op_data	*op_data;
	struct ll_swap_stack	*llss = NULL;

	OBD_ALLOC_PTR(llss);

	llss->inode1 = file_inode(file1);
	llss->inode2 = file_inode(file2);

	rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);

	/* we use 2 bool because it is easier to swap than 2 bits */
	if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
		llss->check_dv1 = true;

	if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
		llss->check_dv2 = true;

	/* we cannot use lsl->sl_dvX directly because we may swap them */
	llss->dv1 = lsl->sl_dv1;
	llss->dv2 = lsl->sl_dv2;

	/* order the pair by FID so locking order is deterministic */
	rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
	if (rc == 0) /* same file, done! */

	if (rc < 0) { /* sequentialize it */
		swap(llss->inode1, llss->inode2);
		swap(llss->dv1, llss->dv2);
		swap(llss->check_dv1, llss->check_dv2);

	if (gid != 0) { /* application asks to flush dirty cache */
		rc = ll_get_grouplock(llss->inode1, file1, gid);

		rc = ll_get_grouplock(llss->inode2, file2, gid);
			/* second lock failed: undo the first */
			ll_put_grouplock(llss->inode1, file1, gid);

	/* ultimate check, before swapping the layouts we check if
	 * dataversion has changed (if requested) */
	if (llss->check_dv1) {
		rc = ll_data_version(llss->inode1, &dv, 0);
		if (dv != llss->dv1)
			GOTO(putgl, rc = -EAGAIN);

	if (llss->check_dv2) {
		rc = ll_data_version(llss->inode2, &dv, 0);
		if (dv != llss->dv2)
			GOTO(putgl, rc = -EAGAIN);

	/* struct md_op_data is used to send the swap args to the mdt
	 * only flags is missing, so we use struct mdc_swap_layouts
	 * through the md_op_data->op_data */
	/* flags from user space have to be converted before they are send to
	 * server, no flag is sent today, they are only used on the client */

	op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
				     0, LUSTRE_OPC_ANY, &msl);
	if (IS_ERR(op_data))
		GOTO(free, rc = PTR_ERR(op_data));

	rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
			   sizeof(*op_data), op_data, NULL);
	ll_finish_md_op_data(op_data);

	ll_put_grouplock(llss->inode2, file2, gid);
	ll_put_grouplock(llss->inode1, file1, gid);
/*
 * Validate a hsm_state_set request (mask range, user capability,
 * archive id bounds) and forward it to the MDT.
 */
int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
	struct md_op_data *op_data;

	/* Detect out-of range masks */
	if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)

	/* Non-root users are forbidden to set or clear flags which are
	 * NOT defined in HSM_USER_MASK. */
	if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
	    !cfs_capable(CFS_CAP_SYS_ADMIN))

	/* Detect out-of range archive id */
	if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
	    (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))

	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
				     LUSTRE_OPC_ANY, hss);
	if (IS_ERR(op_data))
		RETURN(PTR_ERR(op_data));

	rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
			   sizeof(*op_data), op_data, NULL);

	ll_finish_md_op_data(op_data);
/*
 * HSM import: mark a regular file archived+exists+released on the MDT,
 * then restore the user-visible attributes (mode, owner, size, times)
 * from the hsm_user_import descriptor.
 */
static int ll_hsm_import(struct inode *inode, struct file *file,
			 struct hsm_user_import *hui)
	struct hsm_state_set	*hss = NULL;
	struct iattr		*attr = NULL;

	if (!S_ISREG(inode->i_mode))

		GOTO(out, rc = -ENOMEM);

	/* set HSM flags before touching attributes */
	hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
	hss->hss_archive_id = hui->hui_archive_id;
	hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
	rc = ll_hsm_state_set(inode, hss);

	OBD_ALLOC_PTR(attr);
		GOTO(out, rc = -ENOMEM);

	/* only permission bits from userspace; force a regular file */
	attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
	attr->ia_mode |= S_IFREG;
	attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
	attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
	attr->ia_size = hui->hui_size;
	attr->ia_mtime.tv_sec = hui->hui_mtime;
	attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
	attr->ia_atime.tv_sec = hui->hui_atime;
	attr->ia_atime.tv_nsec = hui->hui_atime_ns;

	attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
			 ATTR_UID | ATTR_GID |
			 ATTR_MTIME | ATTR_MTIME_SET |
			 ATTR_ATIME | ATTR_ATIME_SET;

	rc = ll_setattr_raw(file_dentry(file), attr, true);

	inode_unlock(inode);
2408 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2410 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2411 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * LL_IOC_FUTIMES_3: set atime/mtime/ctime on a regular file — including
 * ctime, which ordinary utimes(2) cannot set.  Requires CAP_SYS_ADMIN.
 */
static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
	struct inode *inode = file_inode(file);
		.ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
			    ATTR_MTIME | ATTR_MTIME_SET |
			    ATTR_CTIME | ATTR_CTIME_SET,
			.tv_sec = lfu->lfu_atime_sec,
			.tv_nsec = lfu->lfu_atime_nsec,
			.tv_sec = lfu->lfu_mtime_sec,
			.tv_nsec = lfu->lfu_mtime_nsec,
			.tv_sec = lfu->lfu_ctime_sec,
			.tv_nsec = lfu->lfu_ctime_nsec,

	if (!capable(CAP_SYS_ADMIN))

	if (!S_ISREG(inode->i_mode))

	rc = ll_setattr_raw(file_dentry(file), &ia, false);
	inode_unlock(inode);
/* Translate a userspace lock mode (lock_mode_user) into the
 * client-internal cl_lock_mode used by the CLIO layer. */
static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
	case MODE_READ_USER:
	case MODE_WRITE_USER:

/* Printable names for lock_mode_user values, used in debug messages. */
static const char *const user_lockname[] = LOCK_MODE_NAMES;
2464 /* Used to allow the upper layers of the client to request an LDLM lock
2465 * without doing an actual read or write.
2467 * Used for ladvise lockahead to manually request specific locks.
2469 * \param[in] file file this ladvise lock request is on
2470 * \param[in] ladvise ladvise struct describing this lock request
2472 * \retval 0 success, no detailed result available (sync requests
2473 * and requests sent to the server [not handled locally]
2474 * cannot return detailed results)
2475 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2476 * see definitions for details.
2477 * \retval negative negative errno on error
int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
	struct lu_env *env = NULL;
	struct cl_io *io = NULL;
	struct cl_lock *lock = NULL;
	struct cl_lock_descr *descr = NULL;
	struct dentry *dentry = file->f_path.dentry;
	struct inode *inode = dentry->d_inode;
	enum cl_lock_mode cl_mode;
	off_t start = ladvise->lla_start;
	off_t end = ladvise->lla_end;

	CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
	       "start=%llu, end=%llu\n", dentry->d_name.len,
	       dentry->d_name.name, dentry->d_inode,
	       user_lockname[ladvise->lla_lockahead_mode], (__u64) start,

	/* a negative cl_mode here is the translation error code */
	cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
		GOTO(out, result = cl_mode);

	/* Get IO environment */
	result = cl_io_get(inode, &env, &io, &refcheck);

	result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
		/*
		 * nothing to do for this io. This currently happens when
		 * stripe sub-object's are not yet created.
		 */
		result = io->ci_result;
	} else if (result == 0) {
		lock = vvp_env_lock(env);
		descr = &lock->cll_descr;
		descr->cld_obj = io->ci_obj;
		/* Convert byte offsets to pages */
		descr->cld_start = cl_index(io->ci_obj, start);
		descr->cld_end = cl_index(io->ci_obj, end);
		descr->cld_mode = cl_mode;
		/* CEF_MUST is used because we do not want to convert a
		 * lockahead request to a lockless lock */
		descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |

		if (ladvise->lla_peradvice_flags & LF_ASYNC)
			descr->cld_enq_flags |= CEF_SPECULATIVE;

		result = cl_lock_request(env, io, lock);

		/* On success, we need to release the lock */
			cl_lock_release(env, lock);

	cl_io_fini(env, io);
	cl_env_put(env, &refcheck);

	/* -ECANCELED indicates a matching lock with a different extent
	 * was already present, and -EEXIST indicates a matching lock
	 * on exactly the same extent was already present.
	 * We convert them to positive values for userspace to make
	 * recognizing true errors easier.
	 * Note we can only return these detailed results on async requests,
	 * as sync requests look the same as i/o requests for locking. */
	if (result == -ECANCELED)
		result = LLA_RESULT_DIFFERENT;
	else if (result == -EEXIST)
		result = LLA_RESULT_SAME;
2558 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
/*
 * Validate one llapi_lu_ladvise entry: advice in range, per-advice
 * flags legal, lock-ahead mode sane, and [start, end) a proper interval.
 */
static int ll_ladvise_sanity(struct inode *inode,
			     struct llapi_lu_ladvise *ladvise)
	enum lu_ladvise_type advice = ladvise->lla_advice;
	/* Note the peradvice flags is a 32 bit field, so per advice flags must
	 * be in the first 32 bits of enum ladvise_flags */
	__u32 flags = ladvise->lla_peradvice_flags;
	/* 3 lines at 80 characters per line, should be plenty */

	if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
		CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
		       "last supported advice is %s (value '%d'): rc = %d\n",
		       ll_get_fsname(inode->i_sb, NULL, 0), advice,
		       ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);

	/* Per-advice checks */
	case LU_LADVISE_LOCKNOEXPAND:
		if (flags & ~LF_LOCKNOEXPAND_MASK) {
			CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
			       ll_get_fsname(inode->i_sb, NULL, 0), flags,
			       ladvise_names[advice], rc);
	case LU_LADVISE_LOCKAHEAD:
		/* Currently only READ and WRITE modes can be requested */
		if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
		    ladvise->lla_lockahead_mode == 0) {
			CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
			       ll_get_fsname(inode->i_sb, NULL, 0),
			       ladvise->lla_lockahead_mode,
			       ladvise_names[advice], rc);
	case LU_LADVISE_WILLREAD:
	case LU_LADVISE_DONTNEED:
		/* Note fall through above - These checks apply to all advices
		 * except LOCKNOEXPAND */
		if (flags & ~LF_DEFAULT_MASK) {
			CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
			       ll_get_fsname(inode->i_sb, NULL, 0), flags,
			       ladvise_names[advice], rc);
		if (ladvise->lla_start >= ladvise->lla_end) {
			CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
			       "for %s: rc = %d\n",
			       ll_get_fsname(inode->i_sb, NULL, 0),
			       ladvise->lla_start, ladvise->lla_end,
			       ladvise_names[advice], rc);
2634 * Give file access advices
2636 * The ladvise interface is similar to Linux fadvise() system call, except it
2637 * forwards the advices directly from Lustre client to server. The server side
2638 * codes will apply appropriate read-ahead and caching techniques for the
2639 * corresponding files.
2641 * A typical workload for ladvise is e.g. a bunch of different clients are
2642 * doing small random reads of a file, so prefetching pages into OSS cache
2643 * with big linear reads before the random IO is a net benefit. Fetching
2644 * all that data into each client cache with fadvise() may not be, due to
2645 * much more data being sent to the client.
2647 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2648 struct llapi_lu_ladvise *ladvise)
2652 struct cl_ladvise_io *lio;
2657 env = cl_env_get(&refcheck);
2659 RETURN(PTR_ERR(env));
2661 io = vvp_env_thread_io(env);
2662 io->ci_obj = ll_i2info(inode)->lli_clob;
2664 /* initialize parameters for ladvise */
2665 lio = &io->u.ci_ladvise;
2666 lio->li_start = ladvise->lla_start;
2667 lio->li_end = ladvise->lla_end;
2668 lio->li_fid = ll_inode2fid(inode);
2669 lio->li_advice = ladvise->lla_advice;
2670 lio->li_flags = flags;
2672 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2673 rc = cl_io_loop(env, io);
2677 cl_io_fini(env, io);
2678 cl_env_put(env, &refcheck);
2682 static int ll_lock_noexpand(struct file *file, int flags)
2684 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2686 fd->ll_lock_no_expand = !(flags & LF_UNSET);
2691 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
2694 struct fsxattr fsxattr;
2696 if (copy_from_user(&fsxattr,
2697 (const struct fsxattr __user *)arg,
2701 fsxattr.fsx_xflags = ll_inode_to_ext_flags(inode->i_flags);
2702 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
2703 if (copy_to_user((struct fsxattr __user *)arg,
2704 &fsxattr, sizeof(fsxattr)))
2710 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
2714 struct md_op_data *op_data;
2715 struct ptlrpc_request *req = NULL;
2717 struct fsxattr fsxattr;
2718 struct cl_object *obj;
2720 /* only root could change project ID */
2721 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2724 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2725 LUSTRE_OPC_ANY, NULL);
2726 if (IS_ERR(op_data))
2727 RETURN(PTR_ERR(op_data));
2729 if (copy_from_user(&fsxattr,
2730 (const struct fsxattr __user *)arg,
2732 GOTO(out_fsxattr1, rc = -EFAULT);
2734 op_data->op_attr_flags = fsxattr.fsx_xflags;
2735 op_data->op_projid = fsxattr.fsx_projid;
2736 op_data->op_attr.ia_valid |= (MDS_ATTR_PROJID | ATTR_ATTR_FLAG);
2737 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
2739 ptlrpc_req_finished(req);
2741 obj = ll_i2info(inode)->lli_clob;
2745 inode->i_flags = ll_ext_to_inode_flags(fsxattr.fsx_xflags);
2746 OBD_ALLOC_PTR(attr);
2748 GOTO(out_fsxattr1, rc = -ENOMEM);
2749 attr->ia_valid = ATTR_ATTR_FLAG;
2750 rc = cl_setattr_ost(obj, attr, fsxattr.fsx_xflags);
2755 ll_finish_md_op_data(op_data);
2762 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2764 struct inode *inode = file_inode(file);
2765 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2769 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2770 PFID(ll_inode2fid(inode)), inode, cmd);
2771 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2773 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2774 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2778 case LL_IOC_GETFLAGS:
2779 /* Get the current value of the file flags */
2780 return put_user(fd->fd_flags, (int __user *)arg);
2781 case LL_IOC_SETFLAGS:
2782 case LL_IOC_CLRFLAGS:
2783 /* Set or clear specific file flags */
2784 /* XXX This probably needs checks to ensure the flags are
2785 * not abused, and to handle any flag side effects.
2787 if (get_user(flags, (int __user *) arg))
2790 if (cmd == LL_IOC_SETFLAGS) {
2791 if ((flags & LL_FILE_IGNORE_LOCK) &&
2792 !(file->f_flags & O_DIRECT)) {
2793 CERROR("%s: unable to disable locking on "
2794 "non-O_DIRECT file\n", current->comm);
2798 fd->fd_flags |= flags;
2800 fd->fd_flags &= ~flags;
2803 case LL_IOC_LOV_SETSTRIPE:
2804 case LL_IOC_LOV_SETSTRIPE_NEW:
2805 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
2806 case LL_IOC_LOV_SETEA:
2807 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
2808 case LL_IOC_LOV_SWAP_LAYOUTS: {
2810 struct lustre_swap_layouts lsl;
2812 if (copy_from_user(&lsl, (char __user *)arg,
2813 sizeof(struct lustre_swap_layouts)))
2816 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
2819 file2 = fget(lsl.sl_fd);
2823 /* O_WRONLY or O_RDWR */
2824 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
2825 GOTO(out, rc = -EPERM);
2827 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
2828 struct inode *inode2;
2829 struct ll_inode_info *lli;
2830 struct obd_client_handle *och = NULL;
2832 if (lsl.sl_flags != SWAP_LAYOUTS_CLOSE)
2833 GOTO(out, rc = -EINVAL);
2835 lli = ll_i2info(inode);
2836 mutex_lock(&lli->lli_och_mutex);
2837 if (fd->fd_lease_och != NULL) {
2838 och = fd->fd_lease_och;
2839 fd->fd_lease_och = NULL;
2841 mutex_unlock(&lli->lli_och_mutex);
2843 GOTO(out, rc = -ENOLCK);
2844 inode2 = file_inode(file2);
2845 rc = ll_swap_layouts_close(och, inode, inode2);
2847 rc = ll_swap_layouts(file, file2, &lsl);
2853 case LL_IOC_LOV_GETSTRIPE:
2854 case LL_IOC_LOV_GETSTRIPE_NEW:
2855 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
2856 case FSFILT_IOC_GETFLAGS:
2857 case FSFILT_IOC_SETFLAGS:
2858 RETURN(ll_iocontrol(inode, file, cmd, arg));
2859 case FSFILT_IOC_GETVERSION_OLD:
2860 case FSFILT_IOC_GETVERSION:
2861 RETURN(put_user(inode->i_generation, (int __user *)arg));
2862 case LL_IOC_GROUP_LOCK:
2863 RETURN(ll_get_grouplock(inode, file, arg));
2864 case LL_IOC_GROUP_UNLOCK:
2865 RETURN(ll_put_grouplock(inode, file, arg));
2866 case IOC_OBD_STATFS:
2867 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2869 /* We need to special case any other ioctls we want to handle,
2870 * to send them to the MDS/OST as appropriate and to properly
2871 * network encode the arg field.
2872 case FSFILT_IOC_SETVERSION_OLD:
2873 case FSFILT_IOC_SETVERSION:
2875 case LL_IOC_FLUSHCTX:
2876 RETURN(ll_flush_ctx(inode));
2877 case LL_IOC_PATH2FID: {
2878 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2879 sizeof(struct lu_fid)))
2884 case LL_IOC_GETPARENT:
2885 RETURN(ll_getparent(file, (struct getparent __user *)arg));
2887 case OBD_IOC_FID2PATH:
2888 RETURN(ll_fid2path(inode, (void __user *)arg));
2889 case LL_IOC_DATA_VERSION: {
2890 struct ioc_data_version idv;
2893 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
2896 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2897 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2900 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2906 case LL_IOC_GET_MDTIDX: {
2909 mdtidx = ll_get_mdt_idx(inode);
2913 if (put_user((int)mdtidx, (int __user *)arg))
2918 case OBD_IOC_GETDTNAME:
2919 case OBD_IOC_GETMDNAME:
2920 RETURN(ll_get_obd_name(inode, cmd, arg));
2921 case LL_IOC_HSM_STATE_GET: {
2922 struct md_op_data *op_data;
2923 struct hsm_user_state *hus;
2930 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2931 LUSTRE_OPC_ANY, hus);
2932 if (IS_ERR(op_data)) {
2934 RETURN(PTR_ERR(op_data));
2937 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2940 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2943 ll_finish_md_op_data(op_data);
2947 case LL_IOC_HSM_STATE_SET: {
2948 struct hsm_state_set *hss;
2955 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2960 rc = ll_hsm_state_set(inode, hss);
2965 case LL_IOC_HSM_ACTION: {
2966 struct md_op_data *op_data;
2967 struct hsm_current_action *hca;
2974 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2975 LUSTRE_OPC_ANY, hca);
2976 if (IS_ERR(op_data)) {
2978 RETURN(PTR_ERR(op_data));
2981 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2984 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2987 ll_finish_md_op_data(op_data);
2991 case LL_IOC_SET_LEASE: {
2992 struct ll_inode_info *lli = ll_i2info(inode);
2993 struct obd_client_handle *och = NULL;
2998 case LL_LEASE_WRLCK:
2999 if (!(file->f_mode & FMODE_WRITE))
3001 fmode = FMODE_WRITE;
3003 case LL_LEASE_RDLCK:
3004 if (!(file->f_mode & FMODE_READ))
3008 case LL_LEASE_UNLCK:
3009 mutex_lock(&lli->lli_och_mutex);
3010 if (fd->fd_lease_och != NULL) {
3011 och = fd->fd_lease_och;
3012 fd->fd_lease_och = NULL;
3014 mutex_unlock(&lli->lli_och_mutex);
3019 fmode = och->och_flags;
3020 rc = ll_lease_close(och, inode, &lease_broken);
3024 rc = ll_lease_och_release(inode, file);
3031 RETURN(ll_lease_type_from_fmode(fmode));
3036 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3038 /* apply for lease */
3039 och = ll_lease_open(inode, file, fmode, 0);
3041 RETURN(PTR_ERR(och));
3044 mutex_lock(&lli->lli_och_mutex);
3045 if (fd->fd_lease_och == NULL) {
3046 fd->fd_lease_och = och;
3049 mutex_unlock(&lli->lli_och_mutex);
3051 /* impossible now that only excl is supported for now */
3052 ll_lease_close(och, inode, &lease_broken);
3057 case LL_IOC_GET_LEASE: {
3058 struct ll_inode_info *lli = ll_i2info(inode);
3059 struct ldlm_lock *lock = NULL;
3062 mutex_lock(&lli->lli_och_mutex);
3063 if (fd->fd_lease_och != NULL) {
3064 struct obd_client_handle *och = fd->fd_lease_och;
3066 lock = ldlm_handle2lock(&och->och_lease_handle);
3068 lock_res_and_lock(lock);
3069 if (!ldlm_is_cancel(lock))
3070 fmode = och->och_flags;
3072 unlock_res_and_lock(lock);
3073 LDLM_LOCK_PUT(lock);
3076 mutex_unlock(&lli->lli_och_mutex);
3078 RETURN(ll_lease_type_from_fmode(fmode));
3080 case LL_IOC_HSM_IMPORT: {
3081 struct hsm_user_import *hui;
3087 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3092 rc = ll_hsm_import(inode, file, hui);
3097 case LL_IOC_FUTIMES_3: {
3098 struct ll_futimes_3 lfu;
3100 if (copy_from_user(&lfu,
3101 (const struct ll_futimes_3 __user *)arg,
3105 RETURN(ll_file_futimes_3(file, &lfu));
3107 case LL_IOC_LADVISE: {
3108 struct llapi_ladvise_hdr *k_ladvise_hdr;
3109 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3112 int alloc_size = sizeof(*k_ladvise_hdr);
3115 u_ladvise_hdr = (void __user *)arg;
3116 OBD_ALLOC_PTR(k_ladvise_hdr);
3117 if (k_ladvise_hdr == NULL)
3120 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3121 GOTO(out_ladvise, rc = -EFAULT);
3123 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3124 k_ladvise_hdr->lah_count < 1)
3125 GOTO(out_ladvise, rc = -EINVAL);
3127 num_advise = k_ladvise_hdr->lah_count;
3128 if (num_advise >= LAH_COUNT_MAX)
3129 GOTO(out_ladvise, rc = -EFBIG);
3131 OBD_FREE_PTR(k_ladvise_hdr);
3132 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3133 lah_advise[num_advise]);
3134 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3135 if (k_ladvise_hdr == NULL)
3139 * TODO: submit multiple advices to one server in a single RPC
3141 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3142 GOTO(out_ladvise, rc = -EFAULT);
3144 for (i = 0; i < num_advise; i++) {
3145 struct llapi_lu_ladvise *k_ladvise =
3146 &k_ladvise_hdr->lah_advise[i];
3147 struct llapi_lu_ladvise __user *u_ladvise =
3148 &u_ladvise_hdr->lah_advise[i];
3150 rc = ll_ladvise_sanity(inode, k_ladvise);
3152 GOTO(out_ladvise, rc);
3154 switch (k_ladvise->lla_advice) {
3155 case LU_LADVISE_LOCKNOEXPAND:
3156 rc = ll_lock_noexpand(file,
3157 k_ladvise->lla_peradvice_flags);
3158 GOTO(out_ladvise, rc);
3159 case LU_LADVISE_LOCKAHEAD:
3161 rc = ll_file_lock_ahead(file, k_ladvise);
3164 GOTO(out_ladvise, rc);
3167 &u_ladvise->lla_lockahead_result))
3168 GOTO(out_ladvise, rc = -EFAULT);
3171 rc = ll_ladvise(inode, file,
3172 k_ladvise_hdr->lah_flags,
3175 GOTO(out_ladvise, rc);
3182 OBD_FREE(k_ladvise_hdr, alloc_size);
3185 case LL_IOC_FSGETXATTR:
3186 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3187 case LL_IOC_FSSETXATTR:
3188 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3190 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3192 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3193 (void __user *)arg));
3197 #ifndef HAVE_FILE_LLSEEK_SIZE
3198 static inline loff_t
3199 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3201 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3203 if (offset > maxsize)
3206 if (offset != file->f_pos) {
3207 file->f_pos = offset;
3208 file->f_version = 0;
3214 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3215 loff_t maxsize, loff_t eof)
3217 struct inode *inode = file_inode(file);
3225 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3226 * position-querying operation. Avoid rewriting the "same"
3227 * f_pos value back to the file because a concurrent read(),
3228 * write() or lseek() might have altered it
3233 * f_lock protects against read/modify/write race with other
3234 * SEEK_CURs. Note that parallel writes and reads behave
3238 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3239 inode_unlock(inode);
3243 * In the generic case the entire file is data, so as long as
3244 * offset isn't at the end of the file then the offset is data.
3251 * There is a virtual hole at the end of the file, so as long as
3252 * offset isn't i_size or larger, return i_size.
3260 return llseek_execute(file, offset, maxsize);
3264 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3266 struct inode *inode = file_inode(file);
3267 loff_t retval, eof = 0;
3270 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3271 (origin == SEEK_CUR) ? file->f_pos : 0);
3272 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3273 PFID(ll_inode2fid(inode)), inode, retval, retval,
3275 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3277 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3278 retval = ll_glimpse_size(inode);
3281 eof = i_size_read(inode);
3284 retval = ll_generic_file_llseek_size(file, offset, origin,
3285 ll_file_maxbytes(inode), eof);
3289 static int ll_flush(struct file *file, fl_owner_t id)
3291 struct inode *inode = file_inode(file);
3292 struct ll_inode_info *lli = ll_i2info(inode);
3293 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3296 LASSERT(!S_ISDIR(inode->i_mode));
3298 /* catch async errors that were recorded back when async writeback
3299 * failed for pages in this mapping. */
3300 rc = lli->lli_async_rc;
3301 lli->lli_async_rc = 0;
3302 if (lli->lli_clob != NULL) {
3303 err = lov_read_and_clear_async_rc(lli->lli_clob);
3308 /* The application has been told write failure already.
3309 * Do not report failure again. */
3310 if (fd->fd_write_failed)
3312 return rc ? -EIO : 0;
3316 * Called to make sure a portion of file has been written out.
3317 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
3319 * Return how many pages have been written.
3321 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3322 enum cl_fsync_mode mode, int ignore_layout)
3326 struct cl_fsync_io *fio;
3331 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3332 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3335 env = cl_env_get(&refcheck);
3337 RETURN(PTR_ERR(env));
3339 io = vvp_env_thread_io(env);
3340 io->ci_obj = ll_i2info(inode)->lli_clob;
3341 io->ci_ignore_layout = ignore_layout;
3343 /* initialize parameters for sync */
3344 fio = &io->u.ci_fsync;
3345 fio->fi_start = start;
3347 fio->fi_fid = ll_inode2fid(inode);
3348 fio->fi_mode = mode;
3349 fio->fi_nr_written = 0;
3351 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3352 result = cl_io_loop(env, io);
3354 result = io->ci_result;
3356 result = fio->fi_nr_written;
3357 cl_io_fini(env, io);
3358 cl_env_put(env, &refcheck);
3364 * When dentry is provided (the 'else' case), file_dentry() may be
3365 * null and dentry must be used directly rather than pulled from
3366 * file_dentry() as is done otherwise.
3369 #ifdef HAVE_FILE_FSYNC_4ARGS
3370 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3372 struct dentry *dentry = file_dentry(file);
3374 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3375 int ll_fsync(struct file *file, int datasync)
3377 struct dentry *dentry = file_dentry(file);
3379 loff_t end = LLONG_MAX;
3381 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3384 loff_t end = LLONG_MAX;
3386 struct inode *inode = dentry->d_inode;
3387 struct ll_inode_info *lli = ll_i2info(inode);
3388 struct ptlrpc_request *req;
3392 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3393 PFID(ll_inode2fid(inode)), inode);
3394 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3396 #ifdef HAVE_FILE_FSYNC_4ARGS
3397 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3398 lock_inode = !lli->lli_inode_locked;
3402 /* fsync's caller has already called _fdata{sync,write}, we want
3403 * that IO to finish before calling the osc and mdc sync methods */
3404 rc = filemap_fdatawait(inode->i_mapping);
3407 /* catch async errors that were recorded back when async writeback
3408 * failed for pages in this mapping. */
3409 if (!S_ISDIR(inode->i_mode)) {
3410 err = lli->lli_async_rc;
3411 lli->lli_async_rc = 0;
3414 if (lli->lli_clob != NULL) {
3415 err = lov_read_and_clear_async_rc(lli->lli_clob);
3421 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3425 ptlrpc_req_finished(req);
3427 if (S_ISREG(inode->i_mode)) {
3428 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3430 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3431 if (rc == 0 && err < 0)
3434 fd->fd_write_failed = true;
3436 fd->fd_write_failed = false;
3439 #ifdef HAVE_FILE_FSYNC_4ARGS
3441 inode_unlock(inode);
3447 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3449 struct inode *inode = file_inode(file);
3450 struct ll_sb_info *sbi = ll_i2sbi(inode);
3451 struct ldlm_enqueue_info einfo = {
3452 .ei_type = LDLM_FLOCK,
3453 .ei_cb_cp = ldlm_flock_completion_ast,
3454 .ei_cbdata = file_lock,
3456 struct md_op_data *op_data;
3457 struct lustre_handle lockh = { 0 };
3458 union ldlm_policy_data flock = { { 0 } };
3459 int fl_type = file_lock->fl_type;
3465 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3466 PFID(ll_inode2fid(inode)), file_lock);
3468 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3470 if (file_lock->fl_flags & FL_FLOCK) {
3471 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3472 /* flocks are whole-file locks */
3473 flock.l_flock.end = OFFSET_MAX;
3474 /* For flocks owner is determined by the local file desctiptor*/
3475 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3476 } else if (file_lock->fl_flags & FL_POSIX) {
3477 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3478 flock.l_flock.start = file_lock->fl_start;
3479 flock.l_flock.end = file_lock->fl_end;
3483 flock.l_flock.pid = file_lock->fl_pid;
3485 /* Somewhat ugly workaround for svc lockd.
3486 * lockd installs custom fl_lmops->lm_compare_owner that checks
3487 * for the fl_owner to be the same (which it always is on local node
3488 * I guess between lockd processes) and then compares pid.
3489 * As such we assign pid to the owner field to make it all work,
3490 * conflict with normal locks is unlikely since pid space and
3491 * pointer space for current->files are not intersecting */
3492 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3493 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
3497 einfo.ei_mode = LCK_PR;
3500 /* An unlock request may or may not have any relation to
3501 * existing locks so we may not be able to pass a lock handle
3502 * via a normal ldlm_lock_cancel() request. The request may even
3503 * unlock a byte range in the middle of an existing lock. In
3504 * order to process an unlock request we need all of the same
3505 * information that is given with a normal read or write record
3506 * lock request. To avoid creating another ldlm unlock (cancel)
3507 * message we'll treat a LCK_NL flock request as an unlock. */
3508 einfo.ei_mode = LCK_NL;
3511 einfo.ei_mode = LCK_PW;
3514 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
3529 flags = LDLM_FL_BLOCK_NOWAIT;
3535 flags = LDLM_FL_TEST_LOCK;
3538 CERROR("unknown fcntl lock command: %d\n", cmd);
3542 /* Save the old mode so that if the mode in the lock changes we
3543 * can decrement the appropriate reader or writer refcount. */
3544 file_lock->fl_type = einfo.ei_mode;
3546 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3547 LUSTRE_OPC_ANY, NULL);
3548 if (IS_ERR(op_data))
3549 RETURN(PTR_ERR(op_data));
3551 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
3552 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
3553 flock.l_flock.pid, flags, einfo.ei_mode,
3554 flock.l_flock.start, flock.l_flock.end);
3556 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
3559 /* Restore the file lock type if not TEST lock. */
3560 if (!(flags & LDLM_FL_TEST_LOCK))
3561 file_lock->fl_type = fl_type;
3563 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
3564 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
3565 !(flags & LDLM_FL_TEST_LOCK))
3566 rc2 = locks_lock_file_wait(file, file_lock);
3568 if ((file_lock->fl_flags & FL_FLOCK) &&
3569 (rc == 0 || file_lock->fl_type == F_UNLCK))
3570 rc2 = flock_lock_file_wait(file, file_lock);
3571 if ((file_lock->fl_flags & FL_POSIX) &&
3572 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3573 !(flags & LDLM_FL_TEST_LOCK))
3574 rc2 = posix_lock_file_wait(file, file_lock);
3575 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
3577 if (rc2 && file_lock->fl_type != F_UNLCK) {
3578 einfo.ei_mode = LCK_NL;
3579 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
3584 ll_finish_md_op_data(op_data);
3589 int ll_get_fid_by_name(struct inode *parent, const char *name,
3590 int namelen, struct lu_fid *fid,
3591 struct inode **inode)
3593 struct md_op_data *op_data = NULL;
3594 struct mdt_body *body;
3595 struct ptlrpc_request *req;
3599 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3600 LUSTRE_OPC_ANY, NULL);
3601 if (IS_ERR(op_data))
3602 RETURN(PTR_ERR(op_data));
3604 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
3605 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3606 ll_finish_md_op_data(op_data);
3610 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3612 GOTO(out_req, rc = -EFAULT);
3614 *fid = body->mbo_fid1;
3617 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
3619 ptlrpc_req_finished(req);
3623 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3624 const char *name, int namelen)
3626 struct dentry *dchild = NULL;
3627 struct inode *child_inode = NULL;
3628 struct md_op_data *op_data;
3629 struct ptlrpc_request *request = NULL;
3630 struct obd_client_handle *och = NULL;
3632 struct mdt_body *body;
3634 __u64 data_version = 0;
3637 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3638 name, PFID(ll_inode2fid(parent)), mdtidx);
3640 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3641 0, LUSTRE_OPC_ANY, NULL);
3642 if (IS_ERR(op_data))
3643 RETURN(PTR_ERR(op_data));
3645 /* Get child FID first */
3646 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
3649 dchild = d_lookup(file_dentry(file), &qstr);
3650 if (dchild != NULL) {
3651 if (dchild->d_inode != NULL)
3652 child_inode = igrab(dchild->d_inode);
3656 if (child_inode == NULL) {
3657 rc = ll_get_fid_by_name(parent, name, namelen,
3658 &op_data->op_fid3, &child_inode);
3663 if (child_inode == NULL)
3664 GOTO(out_free, rc = -EINVAL);
3667 * lfs migrate command needs to be blocked on the client
3668 * by checking the migrate FID against the FID of the
3671 if (child_inode == parent->i_sb->s_root->d_inode)
3672 GOTO(out_iput, rc = -EINVAL);
3674 inode_lock(child_inode);
3675 op_data->op_fid3 = *ll_inode2fid(child_inode);
3676 if (!fid_is_sane(&op_data->op_fid3)) {
3677 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
3678 ll_get_fsname(parent->i_sb, NULL, 0), name,
3679 PFID(&op_data->op_fid3));
3680 GOTO(out_unlock, rc = -EINVAL);
3683 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3685 GOTO(out_unlock, rc);
3688 CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name,
3689 PFID(&op_data->op_fid3), mdtidx);
3690 GOTO(out_unlock, rc = 0);
3693 if (S_ISREG(child_inode->i_mode)) {
3694 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
3698 GOTO(out_unlock, rc);
3701 rc = ll_data_version(child_inode, &data_version,
3704 GOTO(out_close, rc);
3706 op_data->op_handle = och->och_fh;
3707 op_data->op_data = och->och_mod;
3708 op_data->op_data_version = data_version;
3709 op_data->op_lease_handle = och->och_lease_handle;
3710 op_data->op_bias |= MDS_RENAME_MIGRATE;
3713 op_data->op_mds = mdtidx;
3714 op_data->op_cli_flags = CLI_MIGRATE;
3715 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3716 namelen, name, namelen, &request);
3718 LASSERT(request != NULL);
3719 ll_update_times(request, parent);
3721 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
3722 LASSERT(body != NULL);
3724 /* If the server does release layout lock, then we cleanup
3725 * the client och here, otherwise release it in out_close: */
3727 body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
3728 obd_mod_put(och->och_mod);
3729 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
3731 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
3737 if (request != NULL) {
3738 ptlrpc_req_finished(request);
3742 /* Try again if the file layout has changed. */
3743 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
3747 if (och != NULL) /* close the file */
3748 ll_lease_close(och, child_inode, NULL);
3750 clear_nlink(child_inode);
3752 inode_unlock(child_inode);
3756 ll_finish_md_op_data(op_data);
3761 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3769 * test if some locks matching bits and l_req_mode are acquired
3770 * - bits can be in different locks
3771 * - if found clear the common lock bits in *bits
3772 * - the bits not found, are kept in *bits
3774 * \param bits [IN] searched lock bits [IN]
3775 * \param l_req_mode [IN] searched lock mode
3776 * \retval boolean, true iff all bits are found
3778 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
3780 struct lustre_handle lockh;
3781 union ldlm_policy_data policy;
3782 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
3783 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
3792 fid = &ll_i2info(inode)->lli_fid;
3793 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3794 ldlm_lockname[mode]);
3796 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
3797 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3798 policy.l_inodebits.bits = *bits & (1 << i);
3799 if (policy.l_inodebits.bits == 0)
3802 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3803 &policy, mode, &lockh)) {
3804 struct ldlm_lock *lock;
3806 lock = ldlm_handle2lock(&lockh);
3809 ~(lock->l_policy_data.l_inodebits.bits);
3810 LDLM_LOCK_PUT(lock);
3812 *bits &= ~policy.l_inodebits.bits;
3819 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
3820 struct lustre_handle *lockh, __u64 flags,
3821 enum ldlm_mode mode)
3823 union ldlm_policy_data policy = { .l_inodebits = { bits } };
3828 fid = &ll_i2info(inode)->lli_fid;
3829 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3831 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3832 fid, LDLM_IBITS, &policy, mode, lockh);
3837 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3839 /* Already unlinked. Just update nlink and return success */
3840 if (rc == -ENOENT) {
3842 /* If it is striped directory, and there is bad stripe
3843 * Let's revalidate the dentry again, instead of returning
3845 if (S_ISDIR(inode->i_mode) &&
3846 ll_i2info(inode)->lli_lsm_md != NULL)
3849 /* This path cannot be hit for regular files unless in
3850 * case of obscure races, so no need to to validate
3852 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3854 } else if (rc != 0) {
3855 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3856 "%s: revalidate FID "DFID" error: rc = %d\n",
3857 ll_get_fsname(inode->i_sb, NULL, 0),
3858 PFID(ll_inode2fid(inode)), rc);
3864 static int __ll_inode_revalidate(struct dentry *dentry,
3865 enum ldlm_intent_flags op)
3867 struct inode *inode = dentry->d_inode;
3868 struct lookup_intent oit = {
3871 struct ptlrpc_request *req = NULL;
3872 struct md_op_data *op_data;
3873 struct obd_export *exp;
3877 LASSERT(inode != NULL);
3879 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3880 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3882 exp = ll_i2mdexp(inode);
3884 /* Call getattr by fid, so do not provide name at all. */
3885 op_data = ll_prep_md_op_data(NULL, dentry->d_inode, dentry->d_inode,
3886 NULL, 0, 0, LUSTRE_OPC_ANY, NULL);
3887 if (IS_ERR(op_data))
3888 RETURN(PTR_ERR(op_data));
3890 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
3891 ll_finish_md_op_data(op_data);
3893 rc = ll_inode_revalidate_fini(inode, rc);
3897 rc = ll_revalidate_it_finish(req, &oit, dentry);
3899 ll_intent_release(&oit);
3903 /* Unlinked? Unhash dentry, so it is not picked up later by
3904 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3905 * here to preserve get_cwd functionality on 2.6.
3907 if (!dentry->d_inode->i_nlink) {
3908 ll_lock_dcache(inode);
3909 d_lustre_invalidate(dentry, 0);
3910 ll_unlock_dcache(inode);
3913 ll_lookup_finish_locks(&oit, dentry);
3915 ptlrpc_req_finished(req);
3920 static int ll_merge_md_attr(struct inode *inode)
3922 struct cl_attr attr = { 0 };
3925 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3926 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3927 &attr, ll_md_blocking_ast);
3931 set_nlink(inode, attr.cat_nlink);
3932 inode->i_blocks = attr.cat_blocks;
3933 i_size_write(inode, attr.cat_size);
3935 ll_i2info(inode)->lli_atime = attr.cat_atime;
3936 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
3937 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
3943 ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
3945 struct inode *inode = dentry->d_inode;
3949 rc = __ll_inode_revalidate(dentry, op);
3953 /* if object isn't regular file, don't validate size */
3954 if (!S_ISREG(inode->i_mode)) {
3955 if (S_ISDIR(inode->i_mode) &&
3956 ll_i2info(inode)->lli_lsm_md != NULL) {
3957 rc = ll_merge_md_attr(inode);
3962 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
3963 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
3964 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
3966 /* In case of restore, the MDT has the right size and has
3967 * already send it back without granting the layout lock,
3968 * inode is up-to-date so glimpse is useless.
3969 * Also to glimpse we need the layout, in case of a running
3970 * restore the MDT holds the layout lock so the glimpse will
3971 * block up to the end of restore (getattr will block)
3973 if (!ll_file_test_flag(ll_i2info(inode), LLIF_FILE_RESTORING))
3974 rc = ll_glimpse_size(inode);
3979 static inline dev_t ll_compat_encode_dev(dev_t dev)
3981 /* The compat_sys_*stat*() syscalls will fail unless the
3982 * device majors and minors are both less than 256. Note that
3983 * the value returned here will be passed through
3984 * old_encode_dev() in cp_compat_stat(). And so we are not
3985 * trying to return a valid compat (u16) device number, just
3986 * one that will pass the old_valid_dev() check. */
3988 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
/*
 * ->getattr() for Lustre regular files.
 *
 * Two prototypes are compiled depending on the kernel:
 * HAVE_INODEOPS_ENHANCED_GETATTR selects the (path, stat, request_mask,
 * flags) form; otherwise the older (mnt, dentry, stat) form is used.
 * Either way the dentry is revalidated against the MDS (IT_GETATTR)
 * before the kstat is filled from the now up-to-date inode.
 */
3991 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
3992 int ll_getattr(const struct path *path, struct kstat *stat,
3993 u32 request_mask, unsigned int flags)
3996 struct dentry *de = path->dentry;
3998 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4001 struct inode *inode = de->d_inode;
4002 struct ll_sb_info *sbi = ll_i2sbi(inode);
4003 struct ll_inode_info *lli = ll_i2info(inode);
/* refresh attributes (and size for regular files) from the MDS */
4006 res = ll_inode_revalidate(de, IT_GETATTR);
4007 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
/* fault-injection hook used by sanity tests to delay getattr */
4012 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
/* 32-bit API callers need an ino/dev that fit legacy stat fields */
4014 if (ll_need_32bit_api(sbi)) {
4015 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4016 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4017 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4019 stat->ino = inode->i_ino;
4020 stat->dev = inode->i_sb->s_dev;
4021 stat->rdev = inode->i_rdev;
4024 stat->mode = inode->i_mode;
4025 stat->uid = inode->i_uid;
4026 stat->gid = inode->i_gid;
4027 stat->atime = inode->i_atime;
4028 stat->mtime = inode->i_mtime;
4029 stat->ctime = inode->i_ctime;
/* admin may pin a fixed blksize via ll_stat_blksize; else derive from inode */
4030 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4032 stat->nlink = inode->i_nlink;
4033 stat->size = i_size_read(inode);
4034 stat->blocks = inode->i_blocks;
/*
 * ->fiemap() handler: translate the kernel's fiemap_extent_info into the
 * Lustre on-the-wire struct fiemap, run ll_do_fiemap(), and copy the
 * mapped extents back to the user buffer.
 *
 * NOTE(review): num_bytes = sizeof(*fiemap) + extent_count *
 * sizeof(struct fiemap_extent) uses a user-influenced extent_count
 * (fi_extents_max); verify the ioctl layer bounds it so the
 * multiplication cannot overflow on 32-bit.
 */
4039 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4040 __u64 start, __u64 len)
4044 struct fiemap *fiemap;
4045 unsigned int extent_count = fieinfo->fi_extents_max;
4047 num_bytes = sizeof(*fiemap) + (extent_count *
4048 sizeof(struct fiemap_extent));
4049 OBD_ALLOC_LARGE(fiemap, num_bytes);
4054 fiemap->fm_flags = fieinfo->fi_flags;
4055 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4056 fiemap->fm_start = start;
4057 fiemap->fm_length = len;
/* only the first extent is copied in — presumably it carries the
 * continuation hint for a restarted mapping; confirm in ll_do_fiemap() */
4058 if (extent_count > 0 &&
4059 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4060 sizeof(struct fiemap_extent)) != 0)
4061 GOTO(out, rc = -EFAULT);
4063 rc = ll_do_fiemap(inode, fiemap, num_bytes);
/* propagate result flags/count, then copy all mapped extents out */
4065 fieinfo->fi_flags = fiemap->fm_flags;
4066 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4067 if (extent_count > 0 &&
4068 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4069 fiemap->fm_mapped_extents *
4070 sizeof(struct fiemap_extent)) != 0)
4071 GOTO(out, rc = -EFAULT);
4073 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * ->get_acl(): hand back a referenced copy of the ACL cached on the
 * ll_inode_info, taken under lli_lock.  May return NULL if no ACL is
 * cached.  The caller (VFS permission path) drops the reference.
 */
4077 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4079 struct ll_inode_info *lli = ll_i2info(inode);
4080 struct posix_acl *acl = NULL;
4083 spin_lock(&lli->lli_lock);
4084 /* VFS' acl_permission_check->check_acl will release the refcount */
4085 acl = posix_acl_dup(lli->lli_posix_acl);
4086 spin_unlock(&lli->lli_lock);
/*
 * ->set_acl(): store a POSIX ACL as the corresponding system xattr on
 * the MDS and refresh the VFS ACL cache.
 *
 * ACL_TYPE_ACCESS additionally folds the ACL into i_mode via
 * posix_acl_update_mode(); ACL_TYPE_DEFAULT is only legal on
 * directories.  On success the ACL is (re)cached, on failure any stale
 * cached copy is forgotten.
 */
4091 #ifdef HAVE_IOP_SET_ACL
4092 #ifdef CONFIG_FS_POSIX_ACL
4093 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4095 const char *name = NULL;
4102 case ACL_TYPE_ACCESS:
/* may rewrite i_mode and possibly drop the ACL (acl set to NULL) */
4104 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4108 name = XATTR_NAME_POSIX_ACL_ACCESS;
4110 case ACL_TYPE_DEFAULT:
4111 if (!S_ISDIR(inode->i_mode))
4112 GOTO(out, rc = acl ? -EACCES : 0);
4113 name = XATTR_NAME_POSIX_ACL_DEFAULT;
4116 GOTO(out, rc = -EINVAL);
/* serialize the ACL into xattr form before shipping it to the MDT */
4120 size = posix_acl_xattr_size(acl->a_count);
4121 value = kmalloc(size, GFP_NOFS);
4123 GOTO(out, rc = -ENOMEM);
4125 rc = posix_acl_to_xattr(&init_user_ns, acl, value, size);
4130 /* dentry is only used for *.lov attributes so it's safe to be NULL */
4131 rc = __vfs_setxattr(NULL, inode, name, value, size, XATTR_CREATE);
4136 set_cached_acl(inode, type, acl);
4138 forget_cached_acl(inode, type);
4141 #endif /* CONFIG_FS_POSIX_ACL */
4142 #endif /* HAVE_IOP_SET_ACL */
/*
 * ACL check callback for kernels whose generic_permission() takes a
 * check_acl function (i.e. !HAVE_GENERIC_PERMISSION_2ARGS).  Fetches
 * the cached ACL and evaluates it against 'mask'; on RCU-walk
 * (IPERM_FLAG_RCU) it bails out early since ll_get_acl() may block.
 */
4144 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4146 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4147 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4149 ll_check_acl(struct inode *inode, int mask)
4152 # ifdef CONFIG_FS_POSIX_ACL
4153 struct posix_acl *acl;
4157 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4158 if (flags & IPERM_FLAG_RCU)
4161 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4166 rc = posix_acl_permission(inode, acl, mask);
4167 posix_acl_release(acl);
4170 # else /* !CONFIG_FS_POSIX_ACL */
4172 # endif /* CONFIG_FS_POSIX_ACL */
4174 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * ->permission(): Lustre permission check with root-squash support.
 *
 * Three prototypes are compiled depending on kernel version.  The root
 * inode is revalidated first (lookup skips it), then, if root squash is
 * configured and the caller is root, fsuid/fsgid are temporarily
 * overridden (and FS capabilities lowered) via override_creds() before
 * delegating to ll_generic_permission().  Credentials are restored with
 * revert_creds() afterwards.
 */
4176 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4177 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4179 # ifdef HAVE_INODE_PERMISION_2ARGS
4180 int ll_inode_permission(struct inode *inode, int mask)
4182 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4187 struct ll_sb_info *sbi;
4188 struct root_squash_info *squash;
4189 struct cred *cred = NULL;
4190 const struct cred *old_cred = NULL;
4192 bool squash_id = false;
/* revalidation and cred switching can block: refuse RCU-walk here */
4195 #ifdef MAY_NOT_BLOCK
4196 if (mask & MAY_NOT_BLOCK)
4198 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4199 if (flags & IPERM_FLAG_RCU)
4203 /* as root inode are NOT getting validated in lookup operation,
4204 * need to do it before permission check. */
4206 if (inode == inode->i_sb->s_root->d_inode) {
4207 rc = __ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4212 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4213 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4215 /* squash fsuid/fsgid if needed */
4216 sbi = ll_i2sbi(inode);
4217 squash = &sbi->ll_squash;
4218 if (unlikely(squash->rsi_uid != 0 &&
4219 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4220 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4224 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4225 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4226 squash->rsi_uid, squash->rsi_gid);
4228 /* update current process's credentials
4229 * and FS capability */
4230 cred = prepare_creds();
4234 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4235 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* drop every filesystem-related capability from the squashed creds */
4236 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4237 if ((1 << cap) & CFS_CAP_FS_MASK)
4238 cap_lower(cred->cap_effective, cap);
4240 old_cred = override_creds(cred);
4243 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4244 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4245 /* restore current process's credentials and FS capability */
4247 revert_creds(old_cred);
4254 /* -o localflock - only provides locally consistent flock locks */
/* default file_operations: no .flock/.lock, so the kernel falls back to
 * local-only POSIX/BSD locking (the localflock mount mode) */
4255 struct file_operations ll_file_operations = {
4256 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4257 # ifdef HAVE_SYNC_READ_WRITE
4258 .read = new_sync_read,
4259 .write = new_sync_write,
4261 .read_iter = ll_file_read_iter,
4262 .write_iter = ll_file_write_iter,
4263 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4264 .read = ll_file_read,
4265 .aio_read = ll_file_aio_read,
4266 .write = ll_file_write,
4267 .aio_write = ll_file_aio_write,
4268 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4269 .unlocked_ioctl = ll_file_ioctl,
4270 .open = ll_file_open,
4271 .release = ll_file_release,
4272 .mmap = ll_file_mmap,
4273 .llseek = ll_file_seek,
4274 .splice_read = ll_file_splice_read,
/* file_operations used with -o flock: identical to ll_file_operations
 * except .flock/.lock route through ll_file_flock for cluster-wide
 * coherent locking */
4279 struct file_operations ll_file_operations_flock = {
4280 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4281 # ifdef HAVE_SYNC_READ_WRITE
4282 .read = new_sync_read,
4283 .write = new_sync_write,
4284 # endif /* HAVE_SYNC_READ_WRITE */
4285 .read_iter = ll_file_read_iter,
4286 .write_iter = ll_file_write_iter,
4287 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4288 .read = ll_file_read,
4289 .aio_read = ll_file_aio_read,
4290 .write = ll_file_write,
4291 .aio_write = ll_file_aio_write,
4292 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4293 .unlocked_ioctl = ll_file_ioctl,
4294 .open = ll_file_open,
4295 .release = ll_file_release,
4296 .mmap = ll_file_mmap,
4297 .llseek = ll_file_seek,
4298 .splice_read = ll_file_splice_read,
4301 .flock = ll_file_flock,
4302 .lock = ll_file_flock
4305 /* These are for -o noflock - to return ENOSYS on flock calls */
/* same table again but .flock/.lock point at ll_file_noflock, which
 * rejects all locking requests */
4306 struct file_operations ll_file_operations_noflock = {
4307 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4308 # ifdef HAVE_SYNC_READ_WRITE
4309 .read = new_sync_read,
4310 .write = new_sync_write,
4311 # endif /* HAVE_SYNC_READ_WRITE */
4312 .read_iter = ll_file_read_iter,
4313 .write_iter = ll_file_write_iter,
4314 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4315 .read = ll_file_read,
4316 .aio_read = ll_file_aio_read,
4317 .write = ll_file_write,
4318 .aio_write = ll_file_aio_write,
4319 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4320 .unlocked_ioctl = ll_file_ioctl,
4321 .open = ll_file_open,
4322 .release = ll_file_release,
4323 .mmap = ll_file_mmap,
4324 .llseek = ll_file_seek,
4325 .splice_read = ll_file_splice_read,
4328 .flock = ll_file_noflock,
4329 .lock = ll_file_noflock
/* inode_operations for regular Lustre files; xattr and ACL entry points
 * are compiled in only on kernels that still route them through
 * inode_operations (HAVE_IOP_* feature tests) */
4332 struct inode_operations ll_file_inode_operations = {
4333 .setattr = ll_setattr,
4334 .getattr = ll_getattr,
4335 .permission = ll_inode_permission,
4336 #ifdef HAVE_IOP_XATTR
4337 .setxattr = ll_setxattr,
4338 .getxattr = ll_getxattr,
4339 .removexattr = ll_removexattr,
4341 .listxattr = ll_listxattr,
4342 .fiemap = ll_fiemap,
4343 #ifdef HAVE_IOP_GET_ACL
4344 .get_acl = ll_get_acl,
4346 #ifdef HAVE_IOP_SET_ACL
4347 .set_acl = ll_set_acl,
/*
 * Push a layout configuration into the cl_object stack for 'inode'.
 *
 * Runs cl_conf_set() under a cl_env; for OBJECT_CONF_SET the supplied
 * layout lock is made matchable only after the layout has been applied
 * (so no one can match a lock whose layout is not yet visible), and the
 * inode's cached layout generation is refreshed from the object.
 */
4351 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4353 struct ll_inode_info *lli = ll_i2info(inode);
4354 struct cl_object *obj = lli->lli_clob;
4363 env = cl_env_get(&refcheck);
4365 RETURN(PTR_ERR(env));
4367 rc = cl_conf_set(env, lli->lli_clob, conf);
4371 if (conf->coc_opc == OBJECT_CONF_SET) {
4372 struct ldlm_lock *lock = conf->coc_lock;
4373 struct cl_layout cl = {
4377 LASSERT(lock != NULL);
4378 LASSERT(ldlm_has_layout(lock));
4380 /* it can only be allowed to match after layout is
4381 * applied to inode otherwise false layout would be
4382 * seen. Applying layout shoud happen before dropping
4383 * the intent lock. */
4384 ldlm_lock_allow_match(lock);
/* record the new layout generation on the inode */
4386 rc = cl_object_layout_get(env, obj, &cl);
4391 DFID": layout version change: %u -> %u\n",
4392 PFID(&lli->lli_fid), ll_layout_version_get(lli),
4394 ll_layout_version_set(lli, cl.cl_layout_gen);
4398 cl_env_put(env, &refcheck);
4403 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/*
 * If the layout lock has no LVB data attached (it was granted via a
 * blocked completion AST rather than directly), fetch the LOV EA from
 * the MDT with md_getxattr() and attach a copy to the lock as its LVB.
 * The race on attaching is resolved under lock_res_and_lock(): the loser
 * frees its copy.
 */
4404 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4407 struct ll_sb_info *sbi = ll_i2sbi(inode);
4408 struct ptlrpc_request *req;
4409 struct mdt_body *body;
4416 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4417 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4418 lock->l_lvb_data, lock->l_lvb_len);
/* fast path: LVB already attached, nothing to fetch */
4420 if (lock->l_lvb_data != NULL)
4423 /* if layout lock was granted right away, the layout is returned
4424 * within DLM_LVB of dlm reply; otherwise if the lock was ever
4425 * blocked and then granted via completion ast, we have to fetch
4426 * layout here. Please note that we can't use the LVB buffer in
4427 * completion AST because it doesn't have a large enough buffer */
4428 rc = ll_get_default_mdsize(sbi, &lmmsize);
4430 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4431 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
4436 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4438 GOTO(out, rc = -EPROTO);
4440 lmmsize = body->mbo_eadatasize;
4441 if (lmmsize == 0) /* empty layout */
4444 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4446 GOTO(out, rc = -EFAULT);
/* copy the EA out of the RPC reply; the copy becomes the lock's LVB */
4448 OBD_ALLOC_LARGE(lvbdata, lmmsize);
4449 if (lvbdata == NULL)
4450 GOTO(out, rc = -ENOMEM);
4452 memcpy(lvbdata, lmm, lmmsize);
4453 lock_res_and_lock(lock);
4454 if (unlikely(lock->l_lvb_data == NULL)) {
4455 lock->l_lvb_type = LVB_T_LAYOUT;
4456 lock->l_lvb_data = lvbdata;
4457 lock->l_lvb_len = lmmsize;
4460 unlock_res_and_lock(lock);
/* lost the race: someone else attached an LVB first, drop our copy */
4463 OBD_FREE_LARGE(lvbdata, lmmsize);
4468 ptlrpc_req_finished(req);
4473 * Apply the layout to the inode. Layout lock is held and will be released
/*
 * Given a held layout lock (handle 'lockh', mode 'mode'), fetch the
 * layout if needed, apply it to the inode's cl_object via
 * ll_layout_conf(OBJECT_CONF_SET), then drop the lock reference.  If
 * applying fails with -EBUSY (layout still in use by IO), issue an
 * OBJECT_CONF_WAIT to block until in-flight IO against the old layout
 * drains.
 */
4476 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4477 struct inode *inode)
4479 struct ll_inode_info *lli = ll_i2info(inode);
4480 struct ll_sb_info *sbi = ll_i2sbi(inode);
4481 struct ldlm_lock *lock;
4482 struct cl_object_conf conf;
4485 bool wait_layout = false;
4488 LASSERT(lustre_handle_is_used(lockh));
4490 lock = ldlm_handle2lock(lockh);
4491 LASSERT(lock != NULL);
4492 LASSERT(ldlm_has_layout(lock));
4494 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4495 PFID(&lli->lli_fid), inode);
4497 /* in case this is a caching lock and reinstate with new inode */
4498 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
4500 lock_res_and_lock(lock);
4501 lvb_ready = ldlm_is_lvb_ready(lock);
4502 unlock_res_and_lock(lock);
4504 /* checking lvb_ready is racy but this is okay. The worst case is
4505 * that multi processes may configure the file on the same time. */
4509 rc = ll_layout_fetch(inode, lock);
4513 /* for layout lock, lmm is stored in lock's lvb.
4514 * lvb_data is immutable if the lock is held so it's safe to access it
4517 * set layout to file. Unlikely this will fail as old layout was
4518 * surely eliminated */
4519 memset(&conf, 0, sizeof conf);
4520 conf.coc_opc = OBJECT_CONF_SET;
4521 conf.coc_inode = inode;
4522 conf.coc_lock = lock;
4523 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
4524 conf.u.coc_layout.lb_len = lock->l_lvb_len;
4525 rc = ll_layout_conf(inode, &conf);
4527 /* refresh layout failed, need to wait */
4528 wait_layout = rc == -EBUSY;
/* done with the lock regardless of outcome */
4531 LDLM_LOCK_PUT(lock);
4532 ldlm_lock_decref(lockh, mode);
4534 /* wait for IO to complete if it's still being used. */
4536 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
4537 ll_get_fsname(inode->i_sb, NULL, 0),
4538 PFID(&lli->lli_fid), inode);
4540 memset(&conf, 0, sizeof conf);
4541 conf.coc_opc = OBJECT_CONF_WAIT;
4542 conf.coc_inode = inode;
4543 rc = ll_layout_conf(inode, &conf);
4547 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4548 ll_get_fsname(inode->i_sb, NULL, 0),
4549 PFID(&lli->lli_fid), rc);
4555 * Issue layout intent RPC to MDS.
4556 * \param inode [in] file inode
4557 * \param intent [in] layout intent
4559 * \retval 0 on success
4560 * \retval < 0 error code
/* Sends an IT_LAYOUT intent lock request (md_intent_lock) carrying the
 * given layout_intent as opaque op_data; write/truncate intents request
 * FMODE_WRITE.  Any lock granted is stashed on the inode via
 * ll_set_lock_data() before the intent is dropped. */
4562 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
4564 struct ll_inode_info *lli = ll_i2info(inode);
4565 struct ll_sb_info *sbi = ll_i2sbi(inode);
4566 struct md_op_data *op_data;
4567 struct lookup_intent it;
4568 struct ptlrpc_request *req;
4572 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4573 0, 0, LUSTRE_OPC_ANY, NULL);
4574 if (IS_ERR(op_data))
4575 RETURN(PTR_ERR(op_data));
4577 op_data->op_data = intent;
4578 op_data->op_data_size = sizeof(*intent);
4580 memset(&it, 0, sizeof(it));
4581 it.it_op = IT_LAYOUT;
4582 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
4583 intent->li_opc == LAYOUT_INTENT_TRUNC)
4584 it.it_flags = FMODE_WRITE;
4586 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4587 ll_get_fsname(inode->i_sb, NULL, 0),
4588 PFID(&lli->lli_fid), inode);
4590 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
4591 &ll_md_blocking_ast, 0);
/* the reply request is not needed once the intent result is consumed */
4592 if (it.it_request != NULL)
4593 ptlrpc_req_finished(it.it_request);
4594 it.it_request = NULL;
4596 ll_finish_md_op_data(op_data);
4598 /* set lock data in case this is a new lock */
4600 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4602 ll_intent_drop_lock(&it);
4608 * This function checks if there exists a LAYOUT lock on the client side,
4609 * or enqueues it if it doesn't have one in cache.
4611 * This function will not hold layout lock so it may be revoked any time after
4612 * this function returns. Any operations depend on layout should be redone
4615 * This function should be called before lov_io_init() to get an uptodate
4616 * layout version, the caller should save the version number and after IO
4617 * is finished, this function should be called again to verify that layout
4618 * is not changed during IO time.
4620 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4622 struct ll_inode_info *lli = ll_i2info(inode);
4623 struct ll_sb_info *sbi = ll_i2sbi(inode);
4624 struct lustre_handle lockh;
4625 struct layout_intent intent = {
4626 .li_opc = LAYOUT_INTENT_ACCESS,
4628 enum ldlm_mode mode;
/* fast path: layout generation already valid, or server lacks
 * layout-lock support entirely */
4632 *gen = ll_layout_version_get(lli);
4633 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
4637 LASSERT(fid_is_sane(ll_inode2fid(inode)));
4638 LASSERT(S_ISREG(inode->i_mode));
4640 /* take layout lock mutex to enqueue layout lock exclusively. */
4641 mutex_lock(&lli->lli_layout_mutex);
4644 /* mostly layout lock is caching on the local side, so try to
4645 * match it before grabbing layout lock mutex. */
4646 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4647 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4648 if (mode != 0) { /* hit cached lock */
4649 rc = ll_layout_lock_set(&lockh, mode, inode);
/* no cached lock: enqueue a fresh LAYOUT_INTENT_ACCESS intent */
4655 rc = ll_layout_intent(inode, &intent);
4661 *gen = ll_layout_version_get(lli);
4662 mutex_unlock(&lli->lli_layout_mutex);
4668 * Issue layout intent RPC indicating where in a file an IO is about to write.
4670 * \param[in] inode file inode.
4671 * \param[in] start start offset of fille in bytes where an IO is about to
4673 * \param[in] end exclusive end offset in bytes of the write range.
4675 * \retval 0 on success
4676 * \retval < 0 error code
/* Thin wrapper around ll_layout_intent() with LAYOUT_INTENT_WRITE; used
 * to instantiate the layout component covering [start, end) before a
 * write lands there. */
4678 int ll_layout_write_intent(struct inode *inode, __u64 start, __u64 end)
4680 struct layout_intent intent = {
4681 .li_opc = LAYOUT_INTENT_WRITE,
4688 rc = ll_layout_intent(inode, &intent);
4694 * This function send a restore request to the MDT
4696 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4698 struct hsm_user_request *hur;
4702 len = sizeof(struct hsm_user_request) +
4703 sizeof(struct hsm_user_item);
4704 OBD_ALLOC(hur, len);
4708 hur->hur_request.hr_action = HUA_RESTORE;
4709 hur->hur_request.hr_archive_id = 0;
4710 hur->hur_request.hr_flags = 0;
4711 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4712 sizeof(hur->hur_user_item[0].hui_fid));
4713 hur->hur_user_item[0].hui_extent.offset = offset;
4714 hur->hur_user_item[0].hui_extent.length = length;
4715 hur->hur_request.hr_itemcount = 1;
4716 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,