4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2016, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
49 #include <uapi/linux/lustre/lustre_ioctl.h>
50 #include <lustre_swab.h>
52 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
57 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
59 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
/* Allocate a per-open ll_file_data from its dedicated slab.  GFP_NOFS
 * prevents re-entering the filesystem during reclaim while an open is in
 * flight.  NOTE(review): the NULL check, remaining field initialization and
 * the return are on lines elided from this view. */
62 static struct ll_file_data *ll_file_data_get(void)
64 struct ll_file_data *fd;
66 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
/* Start with a clean write-error state for this open. */
70 fd->fd_write_failed = false;
/* Release a per-open ll_file_data back to the slab cache.
 * Counterpart of ll_file_data_get(). */
75 static void ll_file_data_put(struct ll_file_data *fd)
78 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
82  * Packs all the attributes into @op_data for the CLOSE rpc.
84 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
85 struct obd_client_handle *och)
/* Base initialization of op_data against this inode; no name, no extra
 * flags are needed for a close. */
89 ll_prep_md_op_data(op_data, inode, NULL, NULL,
90 0, 0, LUSTRE_OPC_ANY, NULL);
/* Snapshot the client-side attributes so the MDT sees the final state
 * of the file at close time. */
92 op_data->op_attr.ia_mode = inode->i_mode;
93 op_data->op_attr.ia_atime = inode->i_atime;
94 op_data->op_attr.ia_mtime = inode->i_mtime;
95 op_data->op_attr.ia_ctime = inode->i_ctime;
96 op_data->op_attr.ia_size = i_size_read(inode);
/* The *_SET variants tell the server to take our timestamps verbatim
 * rather than stamping its own current time. */
97 op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
98 ATTR_MTIME | ATTR_MTIME_SET |
99 ATTR_CTIME | ATTR_CTIME_SET;
100 op_data->op_attr_blocks = inode->i_blocks;
101 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
/* Identify which open handle this close applies to. */
102 op_data->op_handle = och->och_fh;
104 if (och->och_flags & FMODE_WRITE &&
105 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
106 /* For HSM: if inode data has been modified, pack it so that
107 * MDT can set data dirty flag in the archive. */
108 op_data->op_bias |= MDS_DATA_MODIFIED;
114  * Perform a close, possibly with a bias.
115  * The meaning of "data" depends on the value of "bias".
117  * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
118  * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
121 static int ll_close_inode_openhandle(struct inode *inode,
122 struct obd_client_handle *och,
123 enum mds_op_bias bias, void *data)
125 struct obd_export *md_exp = ll_i2mdexp(inode);
126 const struct ll_inode_info *lli = ll_i2info(inode);
127 struct md_op_data *op_data;
128 struct ptlrpc_request *req = NULL;
/* The MDC export may already be gone (e.g. during forced umount);
 * there is nothing sane to send the close to in that case. */
132 if (class_exp2obd(md_exp) == NULL) {
133 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
134 ll_get_fsname(inode->i_sb, NULL, 0),
135 PFID(&lli->lli_fid));
139 OBD_ALLOC_PTR(op_data);
140 /* We leak openhandle and request here on error, but not much to be
141 * done in OOM case since app won't retry close on error either. */
143 GOTO(out, rc = -ENOMEM);
145 ll_prepare_close(inode, op_data, och);
/* Bias-specific packing.  NOTE(review): MERGE intentionally falls
 * through to SWAP below — both share the lease/fid2 packing. */
147 case MDS_CLOSE_LAYOUT_MERGE:
148 /* merge blocks from the victim inode */
149 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
150 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* fallthrough */
151 case MDS_CLOSE_LAYOUT_SWAP:
152 LASSERT(data != NULL);
153 op_data->op_bias |= bias;
154 op_data->op_data_version = 0;
155 op_data->op_lease_handle = och->och_lease_handle;
/* For swap/merge, data is the second inode involved. */
156 op_data->op_fid2 = *ll_inode2fid(data);
159 case MDS_CLOSE_RESYNC_DONE: {
160 struct ll_ioc_lease *ioc = data;
162 LASSERT(data != NULL);
/* Reserve space for the resync'ed mirrors' blocks.
 * NOTE(review): this multiplies the current block count by
 * lil_count — presumably an upper-bound estimate; confirm
 * against mdt resync handling. */
163 op_data->op_attr_blocks +=
164 ioc->lil_count * op_data->op_attr_blocks;
165 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
166 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
168 op_data->op_lease_handle = och->och_lease_handle;
/* Pass the array of mirror ids that finished resync. */
169 op_data->op_data = &ioc->lil_ids[0];
170 op_data->op_data_size =
171 ioc->lil_count * sizeof(ioc->lil_ids[0]);
175 case MDS_HSM_RELEASE:
176 LASSERT(data != NULL);
177 op_data->op_bias |= MDS_HSM_RELEASE;
/* data is the expected data version; MDT refuses the release if
 * the file changed since it was sampled. */
178 op_data->op_data_version = *(__u64 *)data;
179 op_data->op_lease_handle = och->och_lease_handle;
180 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* Plain close: no auxiliary payload expected. */
184 LASSERT(data == NULL);
188 rc = md_close(md_exp, op_data, och->och_mod, &req);
/* -EINTR is a normal application-visible outcome, don't log it. */
189 if (rc != 0 && rc != -EINTR)
190 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
191 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* If a biased close was requested, check whether the server actually
 * executed the intent. */
193 if (rc == 0 && op_data->op_bias & bias) {
194 struct mdt_body *body;
196 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
197 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
201 ll_finish_md_op_data(op_data);
/* The open handle is dead regardless of the close outcome. */
205 md_clear_open_replay_data(md_exp, och);
206 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
209 ptlrpc_req_finished(req); /* This is close request */
/* Close the MDS open handle of the given mode (read/write/exec) for this
 * inode, if no other opens of that mode still reference it. */
213 int ll_md_real_close(struct inode *inode, fmode_t fmode)
215 struct ll_inode_info *lli = ll_i2info(inode);
216 struct obd_client_handle **och_p;
217 struct obd_client_handle *och;
/* Pick the per-mode handle and its use count.  The three modes are
 * tracked independently on the inode. */
222 if (fmode & FMODE_WRITE) {
223 och_p = &lli->lli_mds_write_och;
224 och_usecount = &lli->lli_open_fd_write_count;
225 } else if (fmode & FMODE_EXEC) {
226 och_p = &lli->lli_mds_exec_och;
227 och_usecount = &lli->lli_open_fd_exec_count;
229 LASSERT(fmode & FMODE_READ);
230 och_p = &lli->lli_mds_read_och;
231 och_usecount = &lli->lli_open_fd_read_count;
234 mutex_lock(&lli->lli_och_mutex);
235 if (*och_usecount > 0) {
236 /* There are still users of this handle, so skip
238 mutex_unlock(&lli->lli_och_mutex);
244 mutex_unlock(&lli->lli_och_mutex);
247 /* There might be a race and this handle may already
/* Plain close — no bias, no auxiliary data. */
249 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Per-struct-file close: drop group lock and any leftover lease, release
 * the per-open handle, decrement the per-mode open count and, unless a
 * cached OPEN lock lets us skip it, perform the real MDS close. */
255 static int ll_md_close(struct inode *inode, struct file *file)
257 union ldlm_policy_data policy = {
258 .l_inodebits = { MDS_INODELOCK_OPEN },
/* TEST_LOCK: only probe for a matching granted lock, don't take a
 * reference on it. */
260 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
261 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
262 struct ll_inode_info *lli = ll_i2info(inode);
263 struct lustre_handle lockh;
264 enum ldlm_mode lockmode;
268 /* clear group lock, if present */
269 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
270 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
272 if (fd->fd_lease_och != NULL) {
275 /* Usually the lease is not released when the
276 * application crashed, we need to release here. */
277 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
278 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
279 PFID(&lli->lli_fid), rc, lease_broken);
281 fd->fd_lease_och = NULL;
/* fd_och holds an open handle owned by this fd (taken over when a
 * lease was set up); it must be closed with the file. */
284 if (fd->fd_och != NULL) {
285 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
290 /* Let's see if we have good enough OPEN lock on the file and if
291 we can skip talking to MDS */
292 mutex_lock(&lli->lli_och_mutex);
293 if (fd->fd_omode & FMODE_WRITE) {
295 LASSERT(lli->lli_open_fd_write_count);
296 lli->lli_open_fd_write_count--;
297 } else if (fd->fd_omode & FMODE_EXEC) {
299 LASSERT(lli->lli_open_fd_exec_count);
300 lli->lli_open_fd_exec_count--;
303 LASSERT(lli->lli_open_fd_read_count);
304 lli->lli_open_fd_read_count--;
306 mutex_unlock(&lli->lli_och_mutex);
/* No matching cached OPEN lock: the MDS must be told explicitly. */
308 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
309 LDLM_IBITS, &policy, lockmode, &lockh))
310 rc = ll_md_real_close(inode, fd->fd_omode);
/* The private data is gone with this struct file. */
313 LUSTRE_FPRIVATE(file) = NULL;
314 ll_file_data_put(fd);
319 /* While this returns an error code, fput() the caller does not, so we need
320  * to make every effort to clean up all of our state here. Also, applications
321  * rarely check close errors and even if an error is returned they will not
322  * re-try the close call.
324 int ll_file_release(struct inode *inode, struct file *file)
326 struct ll_file_data *fd;
327 struct ll_sb_info *sbi = ll_i2sbi(inode);
328 struct ll_inode_info *lli = ll_i2info(inode);
332 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
333 PFID(ll_inode2fid(inode)), inode);
/* Don't count releases of the filesystem root in the stats. */
335 if (inode->i_sb->s_root != file_dentry(file))
336 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
337 fd = LUSTRE_FPRIVATE(file);
340 /* The last ref on @file, maybe not the owner pid of statahead,
341 * because parent and child process can share the same file handle. */
342 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
343 ll_deauthorize_statahead(inode, fd);
/* The root dentry never had a real MDS open handle; just free the
 * private data and skip ll_md_close(). */
345 if (inode->i_sb->s_root == file_dentry(file)) {
346 LUSTRE_FPRIVATE(file) = NULL;
347 ll_file_data_put(fd);
/* Surface any async write errors accumulated on the object so the
 * application sees them from close(). */
351 if (!S_ISDIR(inode->i_mode)) {
352 if (lli->lli_clob != NULL)
353 lov_read_and_clear_async_rc(lli->lli_clob);
354 lli->lli_async_rc = 0;
357 rc = ll_md_close(inode, file);
359 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
360 libcfs_debug_dumplog();
/* Send an IT_OPEN intent to the MDS for dentry @de (open-by-fid path) and,
 * on success, update the inode from the reply and take the returned lock. */
365 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
366 struct lookup_intent *itp)
368 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
369 struct dentry *parent = de->d_parent;
370 const char *name = NULL;
372 struct md_op_data *op_data;
373 struct ptlrpc_request *req = NULL;
377 LASSERT(parent != NULL);
378 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
380 /* if server supports open-by-fid, or file name is invalid, don't pack
381 * name in open request */
382 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
383 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
384 name = de->d_name.name;
385 len = de->d_name.len;
388 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
389 name, len, 0, LUSTRE_OPC_ANY, NULL);
391 RETURN(PTR_ERR(op_data));
/* Optional striping data to apply on create. */
392 op_data->op_data = lmm;
393 op_data->op_data_size = lmmsize;
395 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
396 &ll_md_blocking_ast, 0);
397 ll_finish_md_op_data(op_data);
399 /* reason for keep own exit path - don't flood log
400 * with messages with -ESTALE errors.
/* The server granted an open we will not use — close that handle. */
402 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
403 it_open_error(DISP_OPEN_OPEN, itp))
405 ll_release_openhandle(de, itp);
409 if (it_disposition(itp, DISP_LOOKUP_NEG))
410 GOTO(out, rc = -ENOENT);
412 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
413 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
414 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Refresh the inode attributes/layout from the intent reply. */
418 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
419 if (!rc && itp->it_lock_mode)
420 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
423 ptlrpc_req_finished(req);
424 ll_intent_drop_lock(itp);
426 /* We did open by fid, but by the time we got to the server,
427 * the object disappeared. If this is a create, we cannot really
428 * tell the userspace that the file it was trying to create
429 * does not exist. Instead let's return -ESTALE, and the VFS will
430 * retry the create with LOOKUP_REVAL that we are going to catch
431 * in ll_revalidate_dentry() and use lookup then.
433 if (rc == -ENOENT && itp->it_op & IT_CREAT)
/* Populate an obd_client_handle from the MDT reply carried by the intent
 * and register it for open replay after recovery. */
439 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
440 struct obd_client_handle *och)
442 struct mdt_body *body;
444 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
445 och->och_fh = body->mbo_handle;
446 och->och_fid = body->mbo_fid1;
/* The lock from the intent doubles as the lease handle for this open. */
447 och->och_lease_handle.cookie = it->it_lock_handle;
448 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
449 och->och_flags = it->it_flags;
451 return md_set_open_replay_data(md_exp, och, it);
/* Finish the client-local part of an open: optionally fill @och from the
 * intent, then attach @fd to the struct file and initialize its readahead
 * and cl_io context state. */
454 static int ll_local_open(struct file *file, struct lookup_intent *it,
455 struct ll_file_data *fd, struct obd_client_handle *och)
457 struct inode *inode = file_inode(file);
460 LASSERT(!LUSTRE_FPRIVATE(file));
467 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
472 LUSTRE_FPRIVATE(file) = fd;
473 ll_readahead_init(inode, &fd->fd_ras);
/* Remember only the access-mode bits the close path cares about. */
474 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
476 /* ll_cl_context initialize */
477 rwlock_init(&fd->fd_lock);
478 INIT_LIST_HEAD(&fd->fd_lccs);
483 /* Open a file, and (for the very first open) create objects on the OSTs at
484  * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
485  * creation or open until ll_lov_setstripe() ioctl is called.
487  * If we already have the stripe MD locally then we don't request it in
488  * md_open(), by passing a lmm_size = 0.
490  * It is up to the application to ensure no other processes open this file
491  * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
492  * used. We might be able to avoid races of that sort by getting lli_open_sem
493  * before returning in the O_LOV_DELAY_CREATE case and dropping it here
494  * or in ll_file_release(), but I'm not sure that is desirable/necessary.
496 int ll_file_open(struct inode *inode, struct file *file)
498 struct ll_inode_info *lli = ll_i2info(inode);
499 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
500 .it_flags = file->f_flags };
501 struct obd_client_handle **och_p = NULL;
502 __u64 *och_usecount = NULL;
503 struct ll_file_data *fd;
507 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
508 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* An intent stashed here means the atomic_open/lookup path already
 * talked to the MDS for us. */
510 it = file->private_data; /* XXX: compat macro */
511 file->private_data = NULL; /* prevent ll_local_open assertion */
513 fd = ll_file_data_get();
515 GOTO(out_openerr, rc = -ENOMEM);
518 if (S_ISDIR(inode->i_mode))
519 ll_authorize_statahead(inode, fd);
/* Root is special: no MDS open handle is kept for it. */
521 if (inode->i_sb->s_root == file_dentry(file)) {
522 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent: build our own IT_OPEN from f_flags. */
526 if (!it || !it->it_disposition) {
527 /* Convert f_flags into access mode. We cannot use file->f_mode,
528 * because everything but O_ACCMODE mask was stripped from
/* O_ACCMODE trick: O_RDONLY/O_WRONLY/O_RDWR + 1 maps onto
 * FMODE_READ/FMODE_WRITE bits. */
530 if ((oit.it_flags + 1) & O_ACCMODE)
532 if (file->f_flags & O_TRUNC)
533 oit.it_flags |= FMODE_WRITE;
535 /* kernel only call f_op->open in dentry_open. filp_open calls
536 * dentry_open after call to open_namei that checks permissions.
537 * Only nfsd_open call dentry_open directly without checking
538 * permissions and because of that this code below is safe. */
539 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
540 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
542 /* We do not want O_EXCL here, presumably we opened the file
543 * already? XXX - NFS implications? */
544 oit.it_flags &= ~O_EXCL;
546 /* bug20584, if "it_flags" contains O_CREAT, the file will be
547 * created if necessary, then "IT_CREAT" should be set to keep
548 * consistent with it */
549 if (oit.it_flags & O_CREAT)
550 oit.it_op |= IT_CREAT;
556 /* Let's see if we have file open on MDS already. */
557 if (it->it_flags & FMODE_WRITE) {
558 och_p = &lli->lli_mds_write_och;
559 och_usecount = &lli->lli_open_fd_write_count;
560 } else if (it->it_flags & FMODE_EXEC) {
561 och_p = &lli->lli_mds_exec_och;
562 och_usecount = &lli->lli_open_fd_exec_count;
564 och_p = &lli->lli_mds_read_och;
565 och_usecount = &lli->lli_open_fd_read_count;
568 mutex_lock(&lli->lli_och_mutex);
569 if (*och_p) { /* Open handle is present */
570 if (it_disposition(it, DISP_OPEN_OPEN)) {
571 /* Well, there's extra open request that we do not need,
572 let's close it somehow. This will decref request. */
573 rc = it_open_error(DISP_OPEN_OPEN, it);
575 mutex_unlock(&lli->lli_och_mutex);
576 GOTO(out_openerr, rc);
579 ll_release_openhandle(file_dentry(file), it);
/* Reuse the cached per-mode handle: no och needed for this fd. */
583 rc = ll_local_open(file, it, fd, NULL);
586 mutex_unlock(&lli->lli_och_mutex);
587 GOTO(out_openerr, rc);
590 LASSERT(*och_usecount == 0);
591 if (!it->it_disposition) {
592 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
593 /* We cannot just request lock handle now, new ELC code
594 means that one of other OPEN locks for this file
595 could be cancelled, and since blocking ast handler
596 would attempt to grab och_mutex as well, that would
597 result in a deadlock */
598 mutex_unlock(&lli->lli_och_mutex);
600 * Normally called under two situations:
602 * 2. A race/condition on MDS resulting in no open
603 * handle to be returned from LOOKUP|OPEN request,
604 * for example if the target entry was a symlink.
606 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
607 * marked by a bit set in ll_iget_for_nfs. Clear the
608 * bit so that it's not confusing later callers.
610 * NB; when ldd is NULL, it must have come via normal
611 * lookup path only, since ll_iget_for_nfs always calls
614 if (ldd && ldd->lld_nfs_dentry) {
615 ldd->lld_nfs_dentry = 0;
616 it->it_flags |= MDS_OPEN_LOCK;
620 * Always specify MDS_OPEN_BY_FID because we don't want
621 * to get file with different fid.
623 it->it_flags |= MDS_OPEN_BY_FID;
624 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
627 GOTO(out_openerr, rc);
631 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
633 GOTO(out_och_free, rc = -ENOMEM);
637 /* md_intent_lock() didn't get a request ref if there was an
638 * open error, so don't do cleanup on the request here
640 /* XXX (green): Should not we bail out on any error here, not
641 * just open error? */
642 rc = it_open_error(DISP_OPEN_OPEN, it);
644 GOTO(out_och_free, rc);
646 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
647 "inode %p: disposition %x, status %d\n", inode,
648 it_disposition(it, ~0), it->it_status);
650 rc = ll_local_open(file, it, fd, *och_p);
652 GOTO(out_och_free, rc);
654 mutex_unlock(&lli->lli_och_mutex);
657 /* Must do this outside lli_och_mutex lock to prevent deadlock where
658 different kind of OPEN lock for this same inode gets cancelled
659 by ldlm_cancel_lru */
660 if (!S_ISREG(inode->i_mode))
661 GOTO(out_och_free, rc);
663 cl_lov_delay_create_clear(&file->f_flags);
664 GOTO(out_och_free, rc);
/* Error path: free a half-initialized handle, if any. */
668 if (och_p && *och_p) {
669 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
670 *och_p = NULL; /* OBD_FREE writes some magic there */
673 mutex_unlock(&lli->lli_och_mutex);
676 if (lli->lli_opendir_key == fd)
677 ll_deauthorize_statahead(inode, fd);
679 ll_file_data_put(fd);
681 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* Drop the extra request reference kept by the intent machinery. */
684 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
685 ptlrpc_req_finished(it->it_request);
686 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* Blocking AST for lease locks: on conflict just cancel the lease lock
 * asynchronously; unlike ll_md_blocking_ast() it does not touch any open
 * handle (the lease owner notices the break via the lock state). */
692 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
693 struct ldlm_lock_desc *desc, void *data, int flag)
696 struct lustre_handle lockh;
700 case LDLM_CB_BLOCKING:
701 ldlm_lock2handle(lock, &lockh);
702 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
704 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
708 case LDLM_CB_CANCELING:
716  * When setting a lease on a file, we take ownership of the lli_mds_*_och
717  * and save it as fd->fd_och so as to force client to reopen the file even
718  * if it has an open lock in cache already.
720 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
721 struct lustre_handle *old_handle)
723 struct ll_inode_info *lli = ll_i2info(inode);
724 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
725 struct obd_client_handle **och_p;
730 /* Get the openhandle of the file */
731 mutex_lock(&lli->lli_och_mutex);
/* Only one lease per file descriptor. */
732 if (fd->fd_lease_och != NULL)
733 GOTO(out_unlock, rc = -EBUSY);
735 if (fd->fd_och == NULL) {
736 if (file->f_mode & FMODE_WRITE) {
737 LASSERT(lli->lli_mds_write_och != NULL);
738 och_p = &lli->lli_mds_write_och;
739 och_usecount = &lli->lli_open_fd_write_count;
741 LASSERT(lli->lli_mds_read_och != NULL);
742 och_p = &lli->lli_mds_read_och;
743 och_usecount = &lli->lli_open_fd_read_count;
/* Another fd shares this handle; we cannot take it over. */
746 if (*och_usecount > 1)
747 GOTO(out_unlock, rc = -EBUSY);
/* Tell the MDS which open handle the lease request refers to. */
754 *old_handle = fd->fd_och->och_fh;
758 mutex_unlock(&lli->lli_och_mutex);
763  * Release ownership on lli_mds_*_och when putting back a file lease.
765 static int ll_lease_och_release(struct inode *inode, struct file *file)
767 struct ll_inode_info *lli = ll_i2info(inode);
768 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
769 struct obd_client_handle **och_p;
770 struct obd_client_handle *old_och = NULL;
775 mutex_lock(&lli->lli_och_mutex);
776 if (file->f_mode & FMODE_WRITE) {
777 och_p = &lli->lli_mds_write_och;
778 och_usecount = &lli->lli_open_fd_write_count;
780 och_p = &lli->lli_mds_read_och;
781 och_usecount = &lli->lli_open_fd_read_count;
784 /* The file may have been open by another process (broken lease) so
785 * *och_p is not NULL. In this case we should simply increase usecount
788 if (*och_p != NULL) {
789 old_och = fd->fd_och;
796 mutex_unlock(&lli->lli_och_mutex);
/* Close the now-redundant handle outside the mutex. */
799 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
805  * Acquire a lease and open the file.
807 static struct obd_client_handle *
808 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
811 struct lookup_intent it = { .it_op = IT_OPEN };
812 struct ll_sb_info *sbi = ll_i2sbi(inode);
813 struct md_op_data *op_data;
814 struct ptlrpc_request *req = NULL;
815 struct lustre_handle old_handle = { 0 };
816 struct obd_client_handle *och = NULL;
/* Leases are exclusively read or write; combinations are rejected. */
821 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
822 RETURN(ERR_PTR(-EINVAL));
/* The lease mode must be covered by how the file is open, and exec
 * opens never carry leases. */
825 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
826 RETURN(ERR_PTR(-EPERM));
828 rc = ll_lease_och_acquire(inode, file, &old_handle);
835 RETURN(ERR_PTR(-ENOMEM));
837 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
838 LUSTRE_OPC_ANY, NULL);
840 GOTO(out, rc = PTR_ERR(op_data));
842 /* To tell the MDT this openhandle is from the same owner */
843 op_data->op_handle = old_handle;
845 it.it_flags = fmode | open_flags;
846 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
847 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
848 &ll_md_blocking_lease_ast,
849 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
850 * it can be cancelled which may mislead applications that the lease is
852 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
853 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
854 * doesn't deal with openhandle, so normal openhandle will be leaked. */
855 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
856 ll_finish_md_op_data(op_data);
857 ptlrpc_req_finished(req);
859 GOTO(out_release_it, rc);
861 if (it_disposition(&it, DISP_LOOKUP_NEG))
862 GOTO(out_release_it, rc = -ENOENT);
864 rc = it_open_error(DISP_OPEN_OPEN, &it);
866 GOTO(out_release_it, rc);
868 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
869 ll_och_fill(sbi->ll_md_exp, &it, och);
/* A server predating lease support opens the file but grants no lease. */
871 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
872 GOTO(out_close, rc = -EOPNOTSUPP);
874 /* already get lease, handle lease lock */
875 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
876 if (it.it_lock_mode == 0 ||
877 it.it_lock_bits != MDS_INODELOCK_OPEN) {
878 /* open lock must return for lease */
879 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
880 PFID(ll_inode2fid(inode)), it.it_lock_mode,
882 GOTO(out_close, rc = -EPROTO);
885 ll_intent_release(&it);
889 /* Cancel open lock */
/* Error cleanup: drop the lease lock, close the open handle and release
 * the intent before returning ERR_PTR(rc). */
890 if (it.it_lock_mode != 0) {
891 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
894 och->och_lease_handle.cookie = 0ULL;
896 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
898 CERROR("%s: error closing file "DFID": %d\n",
899 ll_get_fsname(inode->i_sb, NULL, 0),
900 PFID(&ll_i2info(inode)->lli_fid), rc2);
901 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
903 ll_intent_release(&it);
911  * Check whether a layout swap can be done between two inodes.
913  * \param[in] inode1	First inode to check
914  * \param[in] inode2	Second inode to check
916  * \retval 0 on success, layout swap can be performed between both inodes
917  * \retval negative error code if requirements are not met
919 static int ll_check_swap_layouts_validity(struct inode *inode1,
920 struct inode *inode2)
/* Layout swap only makes sense between two regular files. */
922 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
/* Caller must be allowed to write both files. */
925 if (inode_permission(inode1, MAY_WRITE) ||
926 inode_permission(inode2, MAY_WRITE))
/* Both files must live in the same Lustre filesystem. */
929 if (inode1->i_sb != inode2->i_sb)
/* Close @och with a layout-swap or layout-merge bias so the MDT atomically
 * exchanges/merges the layouts of @inode and @inode2 as part of the close. */
935 static int ll_swap_layouts_close(struct obd_client_handle *och,
936 struct inode *inode, struct inode *inode2,
939 const struct lu_fid *fid1 = ll_inode2fid(inode);
940 const struct lu_fid *fid2;
941 enum mds_op_bias bias;
945 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
946 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
948 rc = ll_check_swap_layouts_validity(inode, inode2);
950 GOTO(out_free_och, rc);
952 /* We now know that inode2 is a lustre inode */
953 fid2 = ll_inode2fid(inode2);
/* Swapping a file with itself is meaningless. */
955 rc = lu_fid_cmp(fid1, fid2);
957 GOTO(out_free_och, rc = -EINVAL);
/* Map the ioctl opcode onto the corresponding close bias. */
960 case SWAP_LAYOUTS_CLOSE:
961 bias = MDS_CLOSE_LAYOUT_SWAP;
963 case MERGE_LAYOUTS_CLOSE:
964 bias = MDS_CLOSE_LAYOUT_MERGE;
967 GOTO(out_free_och, rc = -EOPNOTSUPP);
970 /* Close the file and {swap,merge} layouts between inode & inode2.
971 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
972 * because we still need it to pack l_remote_handle to MDT. */
973 rc = ll_close_inode_openhandle(inode, och, bias, inode2);
975 och = NULL; /* freed in ll_close_inode_openhandle() */
985  * Release lease and close the file.
986  * It will check if the lease has ever broken.
988 static int ll_lease_close_intent(struct obd_client_handle *och,
990 bool *lease_broken, enum mds_op_bias bias,
993 struct ldlm_lock *lock;
994 bool cancelled = true;
/* Look at the lease lock to learn whether the lease was broken
 * (cancelled) while we held it. */
998 lock = ldlm_handle2lock(&och->och_lease_handle);
1000 lock_res_and_lock(lock);
1001 cancelled = ldlm_is_cancel(lock);
1002 unlock_res_and_lock(lock);
1003 LDLM_LOCK_PUT(lock);
1006 CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1007 PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1009 if (lease_broken != NULL)
1010 *lease_broken = cancelled;
/* Intact lease with no close intent: just drop the lease lock. */
1012 if (!cancelled && !bias)
1013 ldlm_cli_cancel(&och->och_lease_handle, 0);
1015 if (cancelled) { /* no need to execute intent */
1020 rc = ll_close_inode_openhandle(inode, och, bias, data);
/* Plain lease release (no close intent/payload); reports via @lease_broken
 * whether the lease had been broken. */
1024 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1027 return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1031  * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
1033 static int ll_lease_file_resync(struct obd_client_handle *och,
1034 struct inode *inode)
1036 struct ll_sb_info *sbi = ll_i2sbi(inode);
1037 struct md_op_data *op_data;
1038 __u64 data_version_unused;
1042 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1043 LUSTRE_OPC_ANY, NULL);
1044 if (IS_ERR(op_data))
1045 RETURN(PTR_ERR(op_data));
1047 /* before starting file resync, it's necessary to clean up page cache
1048 * in client memory, otherwise once the layout version is increased,
1049 * writing back cached data will be denied the OSTs. */
1050 rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
/* The lease handle proves to the MDT that we own the resync lease. */
1054 op_data->op_handle = och->och_lease_handle;
1055 rc = md_file_resync(sbi->ll_md_exp, op_data);
1061 ll_finish_md_op_data(op_data);
/* Merge MDS-provided inode attributes with the size/blocks/timestamps
 * obtained from the OSTs via the cl_object layer, under the inode size
 * lock so readers see a consistent view. */
1065 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1067 struct ll_inode_info *lli = ll_i2info(inode);
1068 struct cl_object *obj = lli->lli_clob;
1069 struct cl_attr *attr = vvp_env_thread_attr(env);
1077 ll_inode_size_lock(inode);
1079 /* Merge timestamps the most recently obtained from MDS with
1080 * timestamps obtained from OSTs.
1082 * Do not overwrite atime of inode because it may be refreshed
1083 * by file_accessed() function. If the read was served by cache
1084 * data, there is no RPC to be sent so that atime may not be
1085 * transferred to OSTs at all. MDT only updates atime at close time
1086 * if it's at least 'mdd.*.atime_diff' older.
1087 * All in all, the atime in Lustre does not strictly comply with
1088 * POSIX. Solving this problem needs to send an RPC to MDT for each
1089 * read, this will hurt performance. */
1090 if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1091 LTIME_S(inode->i_atime) = lli->lli_atime;
1092 lli->lli_update_atime = 0;
1094 LTIME_S(inode->i_mtime) = lli->lli_mtime;
1095 LTIME_S(inode->i_ctime) = lli->lli_ctime;
/* Work on local copies; the OST values may push them forward. */
1097 atime = LTIME_S(inode->i_atime);
1098 mtime = LTIME_S(inode->i_mtime);
1099 ctime = LTIME_S(inode->i_ctime);
1101 cl_object_attr_lock(obj);
1102 rc = cl_object_attr_get(env, obj, attr);
1103 cl_object_attr_unlock(obj);
/* -ENODATA (no objects yet, e.g. delayed create) is not an error. */
1106 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
/* Keep whichever timestamp is most recent, client or OST. */
1108 if (atime < attr->cat_atime)
1109 atime = attr->cat_atime;
1111 if (ctime < attr->cat_ctime)
1112 ctime = attr->cat_ctime;
1114 if (mtime < attr->cat_mtime)
1115 mtime = attr->cat_mtime;
1117 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1118 PFID(&lli->lli_fid), attr->cat_size);
1120 i_size_write(inode, attr->cat_size);
1121 inode->i_blocks = attr->cat_blocks;
1123 LTIME_S(inode->i_atime) = atime;
1124 LTIME_S(inode->i_mtime) = mtime;
1125 LTIME_S(inode->i_ctime) = ctime;
1128 ll_inode_size_unlock(inode);
1134  * Set designated mirror for I/O.
1136  * So far only read, write, and truncated can support to issue I/O to
1137  * designated mirror.
1139 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1141 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1143 /* clear layout version for generic(non-resync) I/O in case it carries
1144 * stale layout version due to I/O restart */
1145 io->ci_layout_version = 0;
1147 /* FLR: disable non-delay for designated mirror I/O because obviously
1148 * only one mirror is available */
1149 if (fd->fd_designated_mirror > 0) {
1151 io->ci_designated_mirror = fd->fd_designated_mirror;
/* Resync I/O must carry the layout version sampled at lease time. */
1152 io->ci_layout_version = fd->fd_layout_version;
1153 io->ci_pio = 0; /* doesn't have a mechanism to pass mirror
1157 CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n",
1158 file->f_path.dentry->d_name.name, io->ci_designated_mirror);
/* Decide whether atime updates are suppressed for this open, mirroring the
 * checks done by the VFS in file_accessed()/touch_atime(). */
1161 static bool file_is_noatime(const struct file *file)
1163 const struct vfsmount *mnt = file->f_path.mnt;
1164 const struct inode *inode = file_inode((struct file *)file);
1166 /* Adapted from file_accessed() and touch_atime().*/
1167 if (file->f_flags & O_NOATIME)
1170 if (inode->i_flags & S_NOATIME)
1173 if (IS_NOATIME(inode))
1176 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1179 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1182 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1188 static int ll_file_io_ptask(struct cfs_ptask *ptask);
/* Initialize a cl_io for a read or write (@iot) on @file: set up the rw
 * iterator/iocb state, locking policy, atime suppression, parallel-I/O and
 * FLR mirror selection. */
1190 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1192 struct inode *inode = file_inode(file);
1193 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1195 memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
1196 init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
1197 io->u.ci_rw.rw_file = file;
1198 io->u.ci_rw.rw_ptask = ll_file_io_ptask;
1199 io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
1200 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1202 if (iot == CIT_WRITE) {
1203 io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
1204 io->u.ci_rw.rw_sync = !!(file->f_flags & O_SYNC ||
1205 file->f_flags & O_DIRECT ||
1208 io->ci_obj = ll_i2info(inode)->lli_clob;
/* Locking policy: no lock for nolock mounts, mandatory for append
 * (size must be stable), otherwise take a lock only if needed. */
1209 io->ci_lockreq = CILR_MAYBE;
1210 if (ll_file_nolock(file)) {
1211 io->ci_lockreq = CILR_NEVER;
1212 io->ci_no_srvlock = 1;
1213 } else if (file->f_flags & O_APPEND) {
1214 io->ci_lockreq = CILR_MANDATORY;
1216 io->ci_noatime = file_is_noatime(file);
/* Parallel I/O only for non-append writes when the mount enables it. */
1217 if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
1218 io->ci_pio = !io->u.ci_rw.rw_append;
1222 /* FLR: only use non-delay I/O for read as there is only one
1223 * available mirror for write. */
1224 io->ci_ndelay = !(iot == CIT_WRITE);
1226 ll_io_set_mirror(io, file);
/* Parallel-I/O worker: run one chunk of a split read/write.
 * Invoked as a cfs_ptask callback with the per-chunk state in
 * ptask->pt_cbdata (struct cl_io_pt). Re-initializes a cl_io for the
 * remaining range, runs the CLIO loop, then folds the transferred byte
 * count back into pt->cip_result and advances the saved iov_iter/kiocb.
 * Returns 0 when any bytes were transferred, otherwise the error. */
1229 static int ll_file_io_ptask(struct cfs_ptask *ptask)
1231 struct cl_io_pt *pt = ptask->pt_cbdata;
1232 struct file *file = pt->cip_file;
1235 loff_t pos = pt->cip_pos;
1240 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1241 file_dentry(file)->d_name.name,
1242 pt->cip_iot == CIT_READ ? "read" : "write",
1243 pos, pos + pt->cip_count);
1245 env = cl_env_get(&refcheck);
1247 RETURN(PTR_ERR(env));
1249 io = vvp_env_thread_io(env);
1250 ll_io_init(io, file, pt->cip_iot);
1251 io->u.ci_rw.rw_iter = pt->cip_iter;
1252 io->u.ci_rw.rw_iocb = pt->cip_iocb;
1253 io->ci_pio = 0; /* It's already in parallel task */
/* only the not-yet-transferred remainder of the chunk is issued */
1255 rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
1256 pt->cip_count - pt->cip_result);
1258 struct vvp_io *vio = vvp_env_io(env);
1260 vio->vui_io_subtype = IO_NORMAL;
1261 vio->vui_fd = LUSTRE_FPRIVATE(file);
1263 ll_cl_add(file, env, io, LCC_RW);
1264 rc = cl_io_loop(env, io);
1265 ll_cl_remove(file, env);
1267 /* cl_io_rw_init() handled IO */
1271 if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {
/* accumulate progress and advance the saved iterator/position so a
 * restart continues where this pass stopped */
1277 if (io->ci_nob > 0) {
1278 pt->cip_result += io->ci_nob;
1279 iov_iter_advance(&pt->cip_iter, io->ci_nob);
1281 pt->cip_iocb.ki_pos = pos;
1282 #ifdef HAVE_KIOCB_KI_LEFT
1283 pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
1284 #elif defined(HAVE_KI_NBYTES)
1285 pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;
1289 cl_io_fini(env, io);
1290 cl_env_put(env, &refcheck);
/* propagate the restart request to the parent I/O */
1292 pt->cip_need_restart = io->ci_need_restart;
1294 CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
1295 file_dentry(file)->d_name.name,
1296 pt->cip_iot == CIT_READ ? "read" : "write",
1297 pt->cip_result, rc);
1299 RETURN(pt->cip_result > 0 ? 0 : rc);
/* Common back end for all llite read/write variants (read_iter,
 * write_iter, splice).  Builds a cl_io from @args, takes the per-inode
 * range lock where needed, runs the CLIO loop (restarting for layout
 * changes / FLR mirror retries), and maintains per-fd write-failure
 * state plus read/write byte statistics.
 *
 * \param env    thread CLIO environment
 * \param args   I/O subtype plus iter/iocb (IO_NORMAL) or pipe (splice)
 * \param file   file being read or written
 * \param iot    CIT_READ or CIT_WRITE
 * \param ppos   in/out file position
 * \param count  number of bytes requested
 *
 * \retval bytes transferred if > 0, otherwise the CLIO return code. */
1303 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1304 struct file *file, enum cl_io_type iot,
1305 loff_t *ppos, size_t count)
1307 struct range_lock range;
1308 struct vvp_io *vio = vvp_env_io(env);
1309 struct inode *inode = file_inode(file);
1310 struct ll_inode_info *lli = ll_i2info(inode);
1311 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1316 unsigned retried = 0;
1317 bool restarted = false;
1321 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1322 file_dentry(file)->d_name.name,
1323 iot == CIT_READ ? "read" : "write", pos, pos + count);
1326 io = vvp_env_thread_io(env);
1327 ll_io_init(io, file, iot);
1328 if (args->via_io_subtype == IO_NORMAL) {
1329 io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
1330 io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
1332 if (args->via_io_subtype != IO_NORMAL || restarted)
/* carry the FLR mirror-retry count across restarts */
1334 io->ci_ndelay_tried = retried;
1336 if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
1337 bool range_locked = false;
/* O_APPEND writes lock to EOF since the final extent is unknown */
1339 if (file->f_flags & O_APPEND)
1340 range_lock_init(&range, 0, LUSTRE_EOF);
1342 range_lock_init(&range, pos, pos + count - 1);
1344 vio->vui_fd = LUSTRE_FPRIVATE(file);
1345 vio->vui_io_subtype = args->via_io_subtype;
1347 switch (vio->vui_io_subtype) {
1349 /* Direct IO reads must also take range lock,
1350 * or multiple reads will try to work on the same pages
1351 * See LU-6227 for details. */
1352 if (((iot == CIT_WRITE) ||
1353 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1354 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1355 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1357 rc = range_lock(&lli->lli_write_tree, &range);
1361 range_locked = true;
1365 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1366 vio->u.splice.vui_flags = args->u.splice.via_flags;
1369 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1373 ll_cl_add(file, env, io, LCC_RW);
/* parallel writes serialize on the inode lock themselves, so take it
 * here only when needed for security (suid/sgid stripping) */
1374 if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
1375 !lli->lli_inode_locked) {
1377 lli->lli_inode_locked = 1;
1379 rc = cl_io_loop(env, io);
1380 if (lli->lli_inode_locked) {
1381 lli->lli_inode_locked = 0;
1382 inode_unlock(inode);
1384 ll_cl_remove(file, env);
1387 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1389 range_unlock(&lli->lli_write_tree, &range);
1392 /* cl_io_rw_init() handled IO */
/* fold partial progress into the caller-visible result and advance the
 * source iterator/iocb so a restarted pass resumes correctly */
1396 if (io->ci_nob > 0) {
1397 result += io->ci_nob;
1398 count -= io->ci_nob;
1400 if (args->via_io_subtype == IO_NORMAL) {
1401 iov_iter_advance(args->u.normal.via_iter, io->ci_nob);
1403 args->u.normal.via_iocb->ki_pos = pos;
1404 #ifdef HAVE_KIOCB_KI_LEFT
1405 args->u.normal.via_iocb->ki_left = count;
1406 #elif defined(HAVE_KI_NBYTES)
1407 args->u.normal.via_iocb->ki_nbytes = count;
1411 pos = io->u.ci_rw.rw_range.cir_pos;
1415 cl_io_fini(env, io);
1418 "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1419 file->f_path.dentry->d_name.name,
1420 iot, rc, result, io->ci_need_restart);
/* restart the remaining range when CLIO asks for it, e.g. after a
 * layout change or to try another FLR mirror */
1422 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1424 "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1425 file_dentry(file)->d_name.name,
1426 iot == CIT_READ ? "read" : "write",
1427 pos, pos + count, result, rc);
1428 /* preserve the tried count for FLR */
1429 retried = io->ci_ndelay_tried;
1434 if (iot == CIT_READ) {
1436 ll_stats_ops_tally(ll_i2sbi(inode),
1437 LPROC_LL_READ_BYTES, result);
1438 } else if (iot == CIT_WRITE) {
1440 ll_stats_ops_tally(ll_i2sbi(inode),
1441 LPROC_LL_WRITE_BYTES, result);
1442 fd->fd_write_failed = false;
1443 } else if (result == 0 && rc == 0) {
/* zero-byte "success" still counts as a failed write for fsync
 * error reporting purposes; -ERESTARTSYS is not a real failure */
1446 fd->fd_write_failed = true;
1448 fd->fd_write_failed = false;
1449 } else if (rc != -ERESTARTSYS) {
1450 fd->fd_write_failed = true;
1454 CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
1455 file_dentry(file)->d_name.name,
1456 iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);
1460 RETURN(result > 0 ? result : rc);
1464 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1465 * especially for small I/O.
1467 * To serve a read request, CLIO has to create and initialize a cl_io and
1468 * then request DLM lock. This has turned out to have siginificant overhead
1469 * and affects the performance of small I/O dramatically.
1471 * It's not necessary to create a cl_io for each I/O. Under the help of read
1472 * ahead, most of the pages being read are already in memory cache and we can
1473 * read those pages directly because if the pages exist, the corresponding DLM
1474 * lock must exist so that page content must be valid.
1476 * In fast read implementation, the llite speculatively finds and reads pages
1477 * in memory cache. There are three scenarios for fast read:
1478 * - If the page exists and is uptodate, kernel VM will provide the data and
1479 * CLIO won't be intervened;
1480 * - If the page was brought into memory by read ahead, it will be exported
1481 * and read ahead parameters will be updated;
1482 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1483 * it will go back and invoke normal read, i.e., a cl_io will be created
1484 * and DLM lock will be requested.
1486 * POSIX compliance: posix standard states that read is intended to be atomic.
1487 * Lustre read implementation is in line with Linux kernel read implementation
1488 * and neither of them complies with POSIX standard in this matter. Fast read
1489 * doesn't make the situation worse on single node but it may interleave write
1490 * results from multiple nodes due to short read handling in ll_file_aio_read().
1492 * \param env - lu_env
1493 * \param iocb - kiocb from kernel
1494 * \param iter - user space buffers where the data will be copied
1496 * \retval - number of bytes have been read, or error code if error occurred.
/* Attempt a "fast read": serve @iter straight from the page cache via
 * generic_file_read_iter() without constructing a cl_io.  Only valid
 * when the fast_read mount feature is enabled and the file is not open
 * O_DIRECT.  A return of -ENODATA (planted by ll_readpage() when the
 * first page is not cached) tells the caller to fall back to the normal
 * CLIO read path. */
1499 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1503 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1506 /* NB: we can't do direct IO for fast read because it will need a lock
1507 * to make IO engine happy. */
1508 if (iocb->ki_filp->f_flags & O_DIRECT)
1511 result = generic_file_read_iter(iocb, iter);
1513 /* If the first page is not in cache, generic_file_aio_read() will be
1514 * returned with -ENODATA.
1515 * See corresponding code in ll_readpage(). */
1516 if (result == -ENODATA)
1520 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1521 LPROC_LL_READ_BYTES, result);
1527 * Read from a file (through the page cache).
/* read_iter file operation: try the lockless fast-read path first and
 * fall back to the generic CLIO read for whatever remains of @to. */
1529 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1532 struct vvp_io_args *args;
1537 result = ll_do_fast_read(iocb, to);
/* done if fast read errored or consumed the whole request */
1538 if (result < 0 || iov_iter_count(to) == 0)
1541 env = cl_env_get(&refcheck);
1543 return PTR_ERR(env);
1545 args = ll_env_args(env, IO_NORMAL);
1546 args->u.normal.via_iter = to;
1547 args->u.normal.via_iocb = iocb;
1549 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1550 &iocb->ki_pos, iov_iter_count(to));
1553 else if (result == 0)
1556 cl_env_put(env, &refcheck);
1562 * Write to a file (through the page cache).
/* write_iter file operation: dispatch the whole request through the
 * generic CLIO write path. */
1564 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1566 struct vvp_io_args *args;
1571 env = cl_env_get(&refcheck);
1573 return PTR_ERR(env);
1575 args = ll_env_args(env, IO_NORMAL);
1576 args->u.normal.via_iter = from;
1577 args->u.normal.via_iocb = iocb;
1579 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1580 &iocb->ki_pos, iov_iter_count(from));
1581 cl_env_put(env, &refcheck);
1585 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1587 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/* Validate an iovec array and compute its total byte count into *count.
 * Copied from the kernel's __generic_file_aio_write_nolock(): rejects
 * negative/overflowing lengths and truncates at the first segment that
 * fails access_ok(). */
1589 static int ll_file_get_iov_count(const struct iovec *iov,
1590 unsigned long *nr_segs, size_t *count)
1595 for (seg = 0; seg < *nr_segs; seg++) {
1596 const struct iovec *iv = &iov[seg];
1599 * If any segment has a negative length, or the cumulative
1600 * length ever wraps negative then return -EINVAL.
1603 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1605 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
/* inaccessible segment: drop it (and, per the original kernel code,
 * everything after it) from the byte count */
1610 cnt -= iv->iov_len; /* This segment is no good */
/* Legacy aio_read entry (kernels without read_iter): validate the
 * iovec array, wrap it in an iov_iter and forward to
 * ll_file_read_iter(). */
1617 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1618 unsigned long nr_segs, loff_t pos)
1625 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1629 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1630 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1631 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1632 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1633 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1635 result = ll_file_read_iter(iocb, &to);
/* Legacy synchronous read entry (kernels without read_iter): wrap the
 * user buffer in a single iovec plus a sync kiocb, dispatch through
 * ll_file_aio_read() and copy the updated position back to *ppos.
 * Fix: "kiocb.i_nbytes" was a typo for "kiocb.ki_nbytes" (compare the
 * HAVE_KI_NBYTES branch in ll_file_write()); with HAVE_KI_NBYTES
 * defined the old line could not compile. */
1640 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1643 struct iovec iov = { .iov_base = buf, .iov_len = count };
1648 init_sync_kiocb(&kiocb, file);
1649 kiocb.ki_pos = *ppos;
1650 #ifdef HAVE_KIOCB_KI_LEFT
1651 kiocb.ki_left = count;
1652 #elif defined(HAVE_KI_NBYTES)
1653 kiocb.ki_nbytes = count;
1656 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1657 *ppos = kiocb.ki_pos;
1663 * Write to a file (through the page cache).
/* Legacy aio_write entry (kernels without write_iter): validate the
 * iovec array, wrap it in an iov_iter and forward to
 * ll_file_write_iter(). */
1666 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1667 unsigned long nr_segs, loff_t pos)
1669 struct iov_iter from;
1674 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1678 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1679 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1680 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1681 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1682 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1684 result = ll_file_write_iter(iocb, &from);
/* Legacy synchronous write entry (kernels without write_iter): wrap the
 * user buffer in a single iovec and a per-thread sync kiocb (borrowed
 * from the lu_env thread info), dispatch through ll_file_aio_write()
 * and copy the updated position back to *ppos. */
1689 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1690 size_t count, loff_t *ppos)
1693 struct iovec iov = { .iov_base = (void __user *)buf,
1695 struct kiocb *kiocb;
1700 env = cl_env_get(&refcheck);
1702 RETURN(PTR_ERR(env));
/* reuse the env-embedded kiocb rather than a stack one */
1704 kiocb = &ll_env_info(env)->lti_kiocb;
1705 init_sync_kiocb(kiocb, file);
1706 kiocb->ki_pos = *ppos;
1707 #ifdef HAVE_KIOCB_KI_LEFT
1708 kiocb->ki_left = count;
1709 #elif defined(HAVE_KI_NBYTES)
1710 kiocb->ki_nbytes = count;
1713 result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos);
1714 *ppos = kiocb->ki_pos;
1716 cl_env_put(env, &refcheck);
1719 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1722 * Send file content (through pagecache) somewhere with helper
/* splice_read file operation: send up to @count bytes of @in_file
 * (through the page cache) into @pipe via the generic CLIO read path
 * with the IO_SPLICE subtype. */
1724 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1725 struct pipe_inode_info *pipe, size_t count,
1729 struct vvp_io_args *args;
1734 env = cl_env_get(&refcheck);
1736 RETURN(PTR_ERR(env));
1738 args = ll_env_args(env, IO_SPLICE);
1739 args->u.splice.via_pipe = pipe;
1740 args->u.splice.via_flags = flags;
1742 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1743 cl_env_put(env, &refcheck);
/* Set the striping EA for @inode by re-opening it by FID with the
 * supplied lov_user_md (@lum), then immediately releasing the open
 * handle.  The inode size lock serializes against concurrent size
 * changes while the layout is being set. */
1747 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1748 __u64 flags, struct lov_user_md *lum, int lum_size)
1750 struct lookup_intent oit = {
1752 .it_flags = flags | MDS_OPEN_BY_FID,
1757 ll_inode_size_lock(inode);
1758 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1760 GOTO(out_unlock, rc);
/* the open was only a vehicle for the setstripe; close it again */
1762 ll_release_openhandle(dentry, &oit);
1765 ll_inode_size_unlock(inode);
1766 ll_intent_release(&oit);
/* Fetch the LOV EA (striping information) for @filename relative to
 * directory @inode via an MDS getattr-by-name, validate its magic, and
 * byte-swap it to host endianness for userspace when needed.
 *
 * \param[out] lmmp      points into the reply buffer (owned by *request)
 * \param[out] lmm_size  size of the returned EA
 * \param[out] request   caller must release with ptlrpc_req_finished()
 *
 * \retval 0 on success, -ENODATA if no striping EA, -EPROTO on an
 *         unrecognized LOV magic, or other negative errno. */
1771 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1772 struct lov_mds_md **lmmp, int *lmm_size,
1773 struct ptlrpc_request **request)
1775 struct ll_sb_info *sbi = ll_i2sbi(inode);
1776 struct mdt_body *body;
1777 struct lov_mds_md *lmm = NULL;
1778 struct ptlrpc_request *req = NULL;
1779 struct md_op_data *op_data;
1782 rc = ll_get_default_mdsize(sbi, &lmmsize);
1786 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1787 strlen(filename), lmmsize,
1788 LUSTRE_OPC_ANY, NULL);
1789 if (IS_ERR(op_data))
1790 RETURN(PTR_ERR(op_data));
1792 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1793 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1794 ll_finish_md_op_data(op_data);
1796 CDEBUG(D_INFO, "md_getattr_name failed "
1797 "on %s: rc %d\n", filename, rc);
1801 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1802 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1804 lmmsize = body->mbo_eadatasize;
1806 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1808 GOTO(out, rc = -ENODATA);
1811 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1812 LASSERT(lmm != NULL);
1814 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
1815 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
1816 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
1817 GOTO(out, rc = -EPROTO);
1820 * This is coming from the MDS, so is probably in
1821 * little endian. We convert it to host endian before
1822 * passing it to userspace.
1824 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1827 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
1828 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1829 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1830 if (le32_to_cpu(lmm->lmm_pattern) &
1831 LOV_PATTERN_F_RELEASED)
1835 /* if called on a directory - avoid swabbing the per-object
1836 * array, since directory default EAs carry no lsm objects */
1837 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1838 lustre_swab_lov_user_md_v1(
1839 (struct lov_user_md_v1 *)lmm);
1840 if (S_ISREG(body->mbo_mode))
1841 lustre_swab_lov_user_md_objects(
1842 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1844 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1845 lustre_swab_lov_user_md_v3(
1846 (struct lov_user_md_v3 *)lmm);
1847 if (S_ISREG(body->mbo_mode))
1848 lustre_swab_lov_user_md_objects(
1849 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1851 } else if (lmm->lmm_magic ==
1852 cpu_to_le32(LOV_MAGIC_COMP_V1)) {
1853 lustre_swab_lov_comp_md_v1(
1854 (struct lov_comp_md_v1 *)lmm);
1860 *lmm_size = lmmsize;
/* LL_IOC_LOV_SETEA handler: copy a lov_user_md (with one OST object
 * entry) from userspace and apply it via ll_lov_setstripe_ea_info().
 * Requires CAP_SYS_ADMIN since it sets pre-existing objects. */
1865 static int ll_lov_setea(struct inode *inode, struct file *file,
1868 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1869 struct lov_user_md *lump;
1870 int lum_size = sizeof(struct lov_user_md) +
1871 sizeof(struct lov_user_ost_data);
1875 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1878 OBD_ALLOC_LARGE(lump, lum_size);
1882 if (copy_from_user(lump, arg, lum_size))
1883 GOTO(out_lump, rc = -EFAULT);
1885 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
/* clear O_LOV_DELAY_CREATE now that the layout has been set */
1887 cl_lov_delay_create_clear(&file->f_flags);
1890 OBD_FREE_LARGE(lump, lum_size);
/* Copy the layout (striping) of @inode into the userspace buffer @lum
 * of @size bytes, via cl_object_getstripe(). */
1894 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
1901 env = cl_env_get(&refcheck);
1903 RETURN(PTR_ERR(env));
1905 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
1906 cl_env_put(env, &refcheck);
/* LL_IOC_LOV_SETSTRIPE handler: copy the user layout, apply it via
 * ll_lov_setstripe_ea_info(), refresh the layout generation, and echo
 * the resulting stripe information back to userspace. */
1910 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1913 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1914 struct lov_user_md *klum;
1916 __u64 flags = FMODE_WRITE;
1919 rc = ll_copy_user_md(lum, &klum);
1924 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
/* zero the user's stripe_count first so a failed getstripe below still
 * leaves a sane value */
1929 rc = put_user(0, &lum->lmm_stripe_count);
1933 rc = ll_layout_refresh(inode, &gen);
1937 rc = ll_file_getstripe(inode, arg, lum_size);
1939 cl_lov_delay_create_clear(&file->f_flags);
1942 OBD_FREE(klum, lum_size);
/* LL_IOC_GROUP_LOCK handler: take a cluster-wide group lock with group
 * id @arg on @inode for this open file.  Only one group lock may be
 * held per fd (tracked under lli_lock); gid 0 is reserved.  For PFL
 * files all OST objects are instantiated first so the group lock can
 * cover them. */
1947 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1949 struct ll_inode_info *lli = ll_i2info(inode);
1950 struct cl_object *obj = lli->lli_clob;
1951 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1952 struct ll_grouplock grouplock;
1957 CWARN("group id for group lock must not be 0\n");
1961 if (ll_file_nolock(file))
1962 RETURN(-EOPNOTSUPP);
1964 spin_lock(&lli->lli_lock);
1965 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1966 CWARN("group lock already existed with gid %lu\n",
1967 fd->fd_grouplock.lg_gid);
1968 spin_unlock(&lli->lli_lock);
1971 LASSERT(fd->fd_grouplock.lg_lock == NULL);
1972 spin_unlock(&lli->lli_lock);
1975 * XXX: group lock needs to protect all OST objects while PFL
1976 * can add new OST objects during the IO, so we'd instantiate
1977 * all OST objects before getting its group lock.
1982 struct cl_layout cl = {
1983 .cl_is_composite = false,
1986 env = cl_env_get(&refcheck);
1988 RETURN(PTR_ERR(env));
1990 rc = cl_object_layout_get(env, obj, &cl);
/* composite (PFL) layout: force instantiation over the whole file */
1991 if (!rc && cl.cl_is_composite)
1992 rc = ll_layout_write_intent(inode, 0, OBD_OBJECT_EOF);
1994 cl_env_put(env, &refcheck);
1999 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2000 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* re-check under the spinlock: another thread may have raced us and
 * installed its own group lock while ours was being enqueued */
2004 spin_lock(&lli->lli_lock);
2005 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2006 spin_unlock(&lli->lli_lock);
2007 CERROR("another thread just won the race\n");
2008 cl_put_grouplock(&grouplock);
2012 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2013 fd->fd_grouplock = grouplock;
2014 spin_unlock(&lli->lli_lock);
2016 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/* LL_IOC_GROUP_UNLOCK handler: release the group lock with id @arg
 * held on this open file.  Fails if no group lock is held or the id
 * does not match; the fd state is cleared under lli_lock before the
 * DLM lock itself is dropped. */
2020 static int ll_put_grouplock(struct inode *inode, struct file *file,
2023 struct ll_inode_info *lli = ll_i2info(inode);
2024 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2025 struct ll_grouplock grouplock;
2028 spin_lock(&lli->lli_lock);
2029 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2030 spin_unlock(&lli->lli_lock);
2031 CWARN("no group lock held\n");
2035 LASSERT(fd->fd_grouplock.lg_lock != NULL);
2037 if (fd->fd_grouplock.lg_gid != arg) {
2038 CWARN("group lock %lu doesn't match current id %lu\n",
2039 arg, fd->fd_grouplock.lg_gid);
2040 spin_unlock(&lli->lli_lock);
/* detach from the fd first, then release outside the spinlock */
2044 grouplock = fd->fd_grouplock;
2045 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2046 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2047 spin_unlock(&lli->lli_lock);
2049 cl_put_grouplock(&grouplock);
2050 CDEBUG(D_INFO, "group lock %lu released\n", arg);
2055 * Close inode open handle
2057 * \param dentry [in] dentry which contains the inode
2058 * \param it [in,out] intent which contains open info and result
2061 * \retval <0 failure
/* Close the MDS open handle carried in intent @it for @dentry's inode.
 * No-op for the filesystem root or when the intent holds no open
 * disposition.  Also drops the extra open request reference taken by
 * the intent (DISP_ENQ_OPEN_REF). */
2063 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2065 struct inode *inode = dentry->d_inode;
2066 struct obd_client_handle *och;
2072 /* Root ? Do nothing. */
2073 if (dentry->d_inode->i_sb->s_root == dentry)
2076 /* No open handle to close? Move away */
2077 if (!it_disposition(it, DISP_OPEN_OPEN))
2080 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2082 OBD_ALLOC(och, sizeof(*och));
2084 GOTO(out, rc = -ENOMEM);
2086 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2088 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2090 /* this one is in place of ll_file_open */
2091 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2092 ptlrpc_req_finished(it->it_request);
2093 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2099 * Get size for inode for which FIEMAP mapping is requested.
2100 * Make the FIEMAP get_info call and returns the result.
2101 * \param fiemap kernel buffer to hold extens
2102 * \param num_bytes kernel buffer size
/* Execute a FIEMAP request for @inode: validate/handle the flags
 * (writing back dirty pages for FIEMAP_FLAG_SYNC), glimpse the size if
 * unknown, then ask the CLIO layer to fill @fiemap (of @num_bytes
 * total buffer space) with the extent mapping. */
2104 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2110 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2113 /* Checks for fiemap flags */
2114 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* report back which flags we do not support */
2115 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2119 /* Check for FIEMAP_FLAG_SYNC */
2120 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2121 rc = filemap_fdatawrite(inode->i_mapping);
2126 env = cl_env_get(&refcheck);
2128 RETURN(PTR_ERR(env));
/* size may be stale/unknown on the client; glimpse it from the OSTs */
2130 if (i_size_read(inode) == 0) {
2131 rc = ll_glimpse_size(inode);
2136 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2137 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2138 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2140 /* If filesize is 0, then there would be no objects for mapping */
2141 if (fmkey.lfik_oa.o_size == 0) {
2142 fiemap->fm_mapped_extents = 0;
2146 fmkey.lfik_fiemap = *fiemap;
2148 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2149 &fmkey, fiemap, &num_bytes);
2151 cl_env_put(env, &refcheck);
/* OBD_IOC_FID2PATH handler: resolve a FID to a pathname via the MDC.
 * Requires CAP_DAC_READ_SEARCH unless the mount allows user fid2path.
 * The user buffer (getinfo_fid2path + gf_pathlen bytes of path) is
 * copied in, augmented with the client root FID (for fileset mounts),
 * sent through obd_iocontrol(), and copied back. */
2155 int ll_fid2path(struct inode *inode, void __user *arg)
2157 struct obd_export *exp = ll_i2mdexp(inode);
2158 const struct getinfo_fid2path __user *gfin = arg;
2160 struct getinfo_fid2path *gfout;
2166 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2167 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2170 /* Only need to get the buflen */
2171 if (get_user(pathlen, &gfin->gf_pathlen))
2174 if (pathlen > PATH_MAX)
2177 outsize = sizeof(*gfout) + pathlen;
2178 OBD_ALLOC(gfout, outsize);
2182 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2183 GOTO(gf_free, rc = -EFAULT);
2184 /* append root FID after gfout to let MDT know the root FID so that it
2185 * can lookup the correct path, this is mainly for fileset.
2186 * old server without fileset mount support will ignore this. */
2187 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2189 /* Call mdc_iocontrol */
2190 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2194 if (copy_to_user(arg, gfout, outsize))
2198 OBD_FREE(gfout, outsize);
/* Fetch the data version (and layout version) of @inode into @ioc by
 * running a CIT_DATA_VERSION cl_io.  An object-less file is reported
 * as version 0.  The io may request a restart (layout change), which
 * this view handles above via unlikely(io->ci_need_restart). */
2203 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2205 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2213 ioc->idv_version = 0;
2214 ioc->idv_layout_version = UINT_MAX;
2216 /* If no file object initialized, we consider its version is 0. */
2220 env = cl_env_get(&refcheck);
2222 RETURN(PTR_ERR(env));
2224 io = vvp_env_thread_io(env);
2226 io->u.ci_data_version.dv_data_version = 0;
2227 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2228 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2231 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2232 result = cl_io_loop(env, io);
2234 result = io->ci_result;
2236 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2237 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2239 cl_io_fini(env, io);
/* layout changed while sampling; retry from the top */
2241 if (unlikely(io->ci_need_restart))
2244 cl_env_put(env, &refcheck);
2250 * Read the data_version for inode.
2252 * This value is computed using stripe object version on OST.
2253 * Version is computed using server side locking.
2255 * @param flags if do sync on the OST side;
2257 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2258 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/* Convenience wrapper around ll_ioc_data_version(): return only the
 * data version of @inode in *data_version.  @flags selects OST-side
 * flushing (LL_DV_RD_FLUSH / LL_DV_WR_FLUSH). */
2260 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2262 struct ioc_data_version ioc = { .idv_flags = flags };
2265 rc = ll_ioc_data_version(inode, &ioc);
2267 *data_version = ioc.idv_version;
2273 * Trigger a HSM release request for the provided inode.
/* Trigger an HSM release for @inode: take a write lease opened with
 * MDS_OPEN_RELEASE, flush and sample the latest data version and
 * [am]time, then close the handle with MDS_HSM_RELEASE so the MDT can
 * punch the file data that has been archived. */
2275 int ll_hsm_release(struct inode *inode)
2278 struct obd_client_handle *och = NULL;
2279 __u64 data_version = 0;
2284 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2285 ll_get_fsname(inode->i_sb, NULL, 0),
2286 PFID(&ll_i2info(inode)->lli_fid));
2288 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2290 GOTO(out, rc = PTR_ERR(och));
2292 /* Grab latest data_version and [am]time values */
2293 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2297 env = cl_env_get(&refcheck);
2299 GOTO(out, rc = PTR_ERR(env));
2301 ll_merge_attr(env, inode);
2302 cl_env_put(env, &refcheck);
2304 /* Release the file.
2305 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2306 * we still need it to pack l_remote_handle to MDT. */
2307 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2313 if (och != NULL && !IS_ERR(och)) /* close the file */
2314 ll_lease_close(och, inode, NULL);
/* Scratch state for ll_swap_layouts(): the two inodes involved plus
 * (in fields elided from this view) the per-file data versions and
 * check flags, kept together so they can be swapped as a unit when the
 * inode order is canonicalized. */
2319 struct ll_swap_stack {
2322 struct inode *inode1;
2323 struct inode *inode2;
/* LL_IOC_LOV_SWAP_LAYOUTS handler: atomically exchange the layouts of
 * two files.  Inodes are ordered by FID to avoid lock inversion; an
 * optional group lock (gid != 0) flushes dirty cache on both files,
 * and optional data-version checks abort with -EAGAIN if either file
 * changed since the caller sampled its version.  The actual swap is an
 * MDC iocontrol carrying mdc_swap_layouts in op_data. */
2328 static int ll_swap_layouts(struct file *file1, struct file *file2,
2329 struct lustre_swap_layouts *lsl)
2331 struct mdc_swap_layouts msl;
2332 struct md_op_data *op_data;
2335 struct ll_swap_stack *llss = NULL;
2338 OBD_ALLOC_PTR(llss);
2342 llss->inode1 = file_inode(file1);
2343 llss->inode2 = file_inode(file2);
2345 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2349 /* we use 2 bool because it is easier to swap than 2 bits */
2350 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2351 llss->check_dv1 = true;
2353 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2354 llss->check_dv2 = true;
2356 /* we cannot use lsl->sl_dvX directly because we may swap them */
2357 llss->dv1 = lsl->sl_dv1;
2358 llss->dv2 = lsl->sl_dv2;
2360 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2361 if (rc == 0) /* same file, done! */
2364 if (rc < 0) { /* sequentialize it */
2365 swap(llss->inode1, llss->inode2);
2367 swap(llss->dv1, llss->dv2);
2368 swap(llss->check_dv1, llss->check_dv2);
2372 if (gid != 0) { /* application asks to flush dirty cache */
2373 rc = ll_get_grouplock(llss->inode1, file1, gid);
2377 rc = ll_get_grouplock(llss->inode2, file2, gid);
2379 ll_put_grouplock(llss->inode1, file1, gid);
2384 /* ultimate check, before swaping the layouts we check if
2385 * dataversion has changed (if requested) */
2386 if (llss->check_dv1) {
2387 rc = ll_data_version(llss->inode1, &dv, 0);
2390 if (dv != llss->dv1)
2391 GOTO(putgl, rc = -EAGAIN);
2394 if (llss->check_dv2) {
2395 rc = ll_data_version(llss->inode2, &dv, 0);
2398 if (dv != llss->dv2)
2399 GOTO(putgl, rc = -EAGAIN);
2402 /* struct md_op_data is used to send the swap args to the mdt
2403 * only flags is missing, so we use struct mdc_swap_layouts
2404 * through the md_op_data->op_data */
2405 /* flags from user space have to be converted before they are send to
2406 * server, no flag is sent today, they are only used on the client */
2409 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2410 0, LUSTRE_OPC_ANY, &msl);
2411 if (IS_ERR(op_data))
2412 GOTO(free, rc = PTR_ERR(op_data));
2414 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2415 sizeof(*op_data), op_data, NULL);
2416 ll_finish_md_op_data(op_data);
/* release in reverse acquisition order */
2423 ll_put_grouplock(llss->inode2, file2, gid);
2424 ll_put_grouplock(llss->inode1, file1, gid);
/* Set/clear HSM state flags on @inode via LL_IOC_HSM_STATE_SET.
 * Rejects masks outside HSM_FLAGS_MASK, restricts non-HSM_USER_MASK
 * bits to CAP_SYS_ADMIN, and bounds the archive id. */
2434 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2436 struct md_op_data *op_data;
2440 /* Detect out-of range masks */
2441 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2444 /* Non-root users are forbidden to set or clear flags which are
2445 * NOT defined in HSM_USER_MASK. */
2446 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2447 !cfs_capable(CFS_CAP_SYS_ADMIN))
2450 /* Detect out-of range archive id */
2451 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2452 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2455 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2456 LUSTRE_OPC_ANY, hss);
2457 if (IS_ERR(op_data))
2458 RETURN(PTR_ERR(op_data));
2460 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2461 sizeof(*op_data), op_data, NULL);
2463 ll_finish_md_op_data(op_data);
/* HSM import: mark a regular file as archived+exists+released, then
 * restore its user-visible attributes (mode, owner, size, [am]times)
 * from @hui via ll_setattr_raw() so the file appears as it did in the
 * archive without copying any data back. */
2468 static int ll_hsm_import(struct inode *inode, struct file *file,
2469 struct hsm_user_import *hui)
2471 struct hsm_state_set *hss = NULL;
2472 struct iattr *attr = NULL;
2476 if (!S_ISREG(inode->i_mode))
2482 GOTO(out, rc = -ENOMEM);
2484 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2485 hss->hss_archive_id = hui->hui_archive_id;
2486 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2487 rc = ll_hsm_state_set(inode, hss);
2491 OBD_ALLOC_PTR(attr);
2493 GOTO(out, rc = -ENOMEM);
/* only permission bits from the archive; force the regular-file type */
2495 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2496 attr->ia_mode |= S_IFREG;
2497 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2498 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2499 attr->ia_size = hui->hui_size;
2500 attr->ia_mtime.tv_sec = hui->hui_mtime;
2501 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2502 attr->ia_atime.tv_sec = hui->hui_atime;
2503 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2505 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2506 ATTR_UID | ATTR_GID |
2507 ATTR_MTIME | ATTR_MTIME_SET |
2508 ATTR_ATIME | ATTR_ATIME_SET;
2512 rc = ll_setattr_raw(file_dentry(file), attr, true);
2516 inode_unlock(inode);
/* Map an open fmode to the LL_LEASE_{RD,WR}LCK lease-type bitmask
 * reported to userspace. */
2528 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2530 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2531 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/* LL_IOC_FUTIMES_3 handler: set atime, mtime AND ctime of a regular
 * file from @lfu in one ll_setattr_raw() call.  Setting ctime is a
 * privileged operation, hence the CAP_SYS_ADMIN check. */
2534 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2536 struct inode *inode = file_inode(file);
2538 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2539 ATTR_MTIME | ATTR_MTIME_SET |
2540 ATTR_CTIME | ATTR_CTIME_SET,
2542 .tv_sec = lfu->lfu_atime_sec,
2543 .tv_nsec = lfu->lfu_atime_nsec,
2546 .tv_sec = lfu->lfu_mtime_sec,
2547 .tv_nsec = lfu->lfu_mtime_nsec,
2550 .tv_sec = lfu->lfu_ctime_sec,
2551 .tv_nsec = lfu->lfu_ctime_nsec,
2557 if (!capable(CAP_SYS_ADMIN))
2560 if (!S_ISREG(inode->i_mode))
2564 rc = ll_setattr_raw(file_dentry(file), &ia, false);
2565 inode_unlock(inode);
/* Translate a userspace lockahead mode (MODE_{READ,WRITE}_USER) into
 * the corresponding cl_lock_mode; unrecognized values are handled in
 * the (elided) default branch. */
2570 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2573 case MODE_READ_USER:
2575 case MODE_WRITE_USER:
2582 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2584 /* Used to allow the upper layers of the client to request an LDLM lock
2585 * without doing an actual read or write.
2587 * Used for ladvise lockahead to manually request specific locks.
2589 * \param[in] file file this ladvise lock request is on
2590 * \param[in] ladvise ladvise struct describing this lock request
2592 * \retval 0 success, no detailed result available (sync requests
2593 * and requests sent to the server [not handled locally]
2594 * cannot return detailed results)
2595 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2596 * see definitions for details.
2597 * \retval negative negative errno on error
/* Ladvise lockahead: request an LDLM extent lock on @file covering
 * [lla_start, lla_end] in the requested mode without performing any
 * actual I/O.  A CIT_MISC cl_io provides the locking context; CEF_MUST
 * prevents conversion to a lockless lock, and LF_ASYNC requests mark
 * the enqueue speculative.  -ECANCELED/-EEXIST ("a matching lock was
 * already there") are translated to the positive LLA_RESULT_* codes. */
2599 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2601 struct lu_env *env = NULL;
2602 struct cl_io *io = NULL;
2603 struct cl_lock *lock = NULL;
2604 struct cl_lock_descr *descr = NULL;
2605 struct dentry *dentry = file->f_path.dentry;
2606 struct inode *inode = dentry->d_inode;
2607 enum cl_lock_mode cl_mode;
2608 off_t start = ladvise->lla_start;
2609 off_t end = ladvise->lla_end;
2615 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2616 "start=%llu, end=%llu\n", dentry->d_name.len,
2617 dentry->d_name.name, dentry->d_inode,
2618 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2621 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2623 GOTO(out, result = cl_mode);
2625 /* Get IO environment */
2626 result = cl_io_get(inode, &env, &io, &refcheck);
2630 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2633 * nothing to do for this io. This currently happens when
2634 * stripe sub-object's are not yet created.
2636 result = io->ci_result;
2637 } else if (result == 0) {
2638 lock = vvp_env_lock(env);
2639 descr = &lock->cll_descr;
2641 descr->cld_obj = io->ci_obj;
2642 /* Convert byte offsets to pages */
2643 descr->cld_start = cl_index(io->ci_obj, start);
2644 descr->cld_end = cl_index(io->ci_obj, end);
2645 descr->cld_mode = cl_mode;
2646 /* CEF_MUST is used because we do not want to convert a
2647 * lockahead request to a lockless lock */
2648 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2651 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2652 descr->cld_enq_flags |= CEF_SPECULATIVE;
2654 result = cl_lock_request(env, io, lock);
2656 /* On success, we need to release the lock */
2658 cl_lock_release(env, lock);
2660 cl_io_fini(env, io);
2661 cl_env_put(env, &refcheck);
2663 /* -ECANCELED indicates a matching lock with a different extent
2664 * was already present, and -EEXIST indicates a matching lock
2665 * on exactly the same extent was already present.
2666 * We convert them to positive values for userspace to make
2667 * recognizing true errors easier.
2668 * Note we can only return these detailed results on async requests,
2669 * as sync requests look the same as i/o requests for locking. */
2670 if (result == -ECANCELED)
2671 result = LLA_RESULT_DIFFERENT;
2672 else if (result == -EEXIST)
2673 result = LLA_RESULT_SAME;
2678 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
/* Validate a single ladvise entry: known advice value, per-advice flag mask,
 * mode (for lockahead), and a sane byte range.  Returns 0 on success or a
 * negative errno (assignments to rc are elided in this view).
 * NOTE(review): switch statement braces and some rc assignments elided. */
2680 static int ll_ladvise_sanity(struct inode *inode,
2681 struct llapi_lu_ladvise *ladvise)
2683 enum lu_ladvise_type advice = ladvise->lla_advice;
2684 /* Note the peradvice flags is a 32 bit field, so per advice flags must
2685 * be in the first 32 bits of enum ladvise_flags */
2686 __u32 flags = ladvise->lla_peradvice_flags;
2687 /* 3 lines at 80 characters per line, should be plenty */
2690 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
/* NOTE(review): adjacent string literals "recognized," "last supported"
 * concatenate without a space, producing "recognized,last supported" in
 * the log — probably a missing space; fix would change a runtime string. */
2692 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2693 "last supported advice is %s (value '%d'): rc = %d\n",
2694 ll_get_fsname(inode->i_sb, NULL, 0), advice,
2695 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2699 /* Per-advice checks */
2701 case LU_LADVISE_LOCKNOEXPAND:
2702 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2704 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2706 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2707 ladvise_names[advice], rc);
2711 case LU_LADVISE_LOCKAHEAD:
2712 /* Currently only READ and WRITE modes can be requested */
2713 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2714 ladvise->lla_lockahead_mode == 0) {
2716 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2718 ll_get_fsname(inode->i_sb, NULL, 0),
2719 ladvise->lla_lockahead_mode,
2720 ladvise_names[advice], rc);
2723 case LU_LADVISE_WILLREAD:
2724 case LU_LADVISE_DONTNEED:
2726 /* Note fall through above - These checks apply to all advices
2727 * except LOCKNOEXPAND */
2728 if (flags & ~LF_DEFAULT_MASK) {
2730 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2732 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2733 ladvise_names[advice], rc);
/* Range check: start must be strictly below end. */
2736 if (ladvise->lla_start >= ladvise->lla_end) {
2738 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2739 "for %s: rc = %d\n",
2740 ll_get_fsname(inode->i_sb, NULL, 0),
2741 ladvise->lla_start, ladvise->lla_end,
2742 ladvise_names[advice], rc);
2754 * Give file access advices
2756 * The ladvise interface is similar to Linux fadvise() system call, except it
2757 * forwards the advices directly from Lustre client to server. The server side
2758 * codes will apply appropriate read-ahead and caching techniques for the
2759 * corresponding files.
2761 * A typical workload for ladvise is e.g. a bunch of different clients are
2762 * doing small random reads of a file, so prefetching pages into OSS cache
2763 * with big linear reads before the random IO is a net benefit. Fetching
2764 * all that data into each client cache with fadvise() may not be, due to
2765 * much more data being sent to the client.
/* Forward one ladvise advice to the server via a CIT_LADVISE cl_io:
 * fill the ci_ladvise parameters from the user request and run the io loop.
 * NOTE(review): rc declaration/initialization and final RETURN are elided. */
2767 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2768 struct llapi_lu_ladvise *ladvise)
2772 struct cl_ladvise_io *lio;
2777 env = cl_env_get(&refcheck);
2779 RETURN(PTR_ERR(env));
2781 io = vvp_env_thread_io(env);
2782 io->ci_obj = ll_i2info(inode)->lli_clob;
2784 /* initialize parameters for ladvise */
2785 lio = &io->u.ci_ladvise;
2786 lio->li_start = ladvise->lla_start;
2787 lio->li_end = ladvise->lla_end;
2788 lio->li_fid = ll_inode2fid(inode);
2789 lio->li_advice = ladvise->lla_advice;
2790 lio->li_flags = flags;
/* Only run the io loop if init succeeded; fini is done unconditionally. */
2792 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2793 rc = cl_io_loop(env, io);
2797 cl_io_fini(env, io);
2798 cl_env_put(env, &refcheck);
/* Set or clear the per-file-descriptor "no lock expansion" flag for the
 * LU_LADVISE_LOCKNOEXPAND advice: LF_UNSET in flags clears it, otherwise
 * it is set. */
2802 static int ll_lock_noexpand(struct file *file, int flags)
2804 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2806 fd->ll_lock_no_expand = !(flags & LF_UNSET);
/* FS_IOC_FSGETXATTR-style handler: copy the user's fsxattr in, fill in the
 * project ID from the inode, and copy it back out.
 * NOTE(review): the arg parameter line, -EFAULT returns, and final RETURN
 * are elided in this view. */
2811 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
2814 struct fsxattr fsxattr;
2816 if (copy_from_user(&fsxattr,
2817 (const struct fsxattr __user *)arg,
2821 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
2822 if (copy_to_user((struct fsxattr __user *)arg,
2823 &fsxattr, sizeof(fsxattr)))
/* FS_IOC_FSSETXATTR-style handler: change the file's project ID via an MDS
 * setattr.  Requires CAP_SYS_ADMIN.  On any exit path the md_op_data is
 * released via ll_finish_md_op_data (out_fsxattr1 label elided). */
2829 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
2833 struct md_op_data *op_data;
2834 struct ptlrpc_request *req = NULL;
2836 struct fsxattr fsxattr;
2838 /* only root could change project ID */
2839 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2842 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2843 LUSTRE_OPC_ANY, NULL);
2844 if (IS_ERR(op_data))
2845 RETURN(PTR_ERR(op_data));
2847 if (copy_from_user(&fsxattr,
2848 (const struct fsxattr __user *)arg,
2850 GOTO(out_fsxattr1, rc = -EFAULT);
2852 op_data->op_projid = fsxattr.fsx_projid;
2853 op_data->op_attr.ia_valid |= MDS_ATTR_PROJID;
2854 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
2856 ptlrpc_req_finished(req);
2859 ll_finish_md_op_data(op_data);
/* Release the lease held on this file descriptor (LL_LEASE_UNLCK path).
 * Takes ownership of fd->fd_lease_och under lli_och_mutex; optionally
 * carries RESYNC_DONE ids to the close RPC.  Returns the lease type that
 * was held (via ll_lease_type_from_fmode) or a negative errno.
 * NOTE(review): several lines (declarations of fmode/lease_broken/data,
 * out: label, RETURN) are elided in this view. */
2863 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
2866 struct inode *inode = file_inode(file);
2867 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2868 struct ll_inode_info *lli = ll_i2info(inode);
2869 struct obd_client_handle *och = NULL;
2872 enum mds_op_bias bias = 0;
2874 size_t data_size = 0;
/* Detach the lease handle from the fd atomically w.r.t. other lease ops. */
2878 mutex_lock(&lli->lli_och_mutex);
2879 if (fd->fd_lease_och != NULL) {
2880 och = fd->fd_lease_och;
2881 fd->fd_lease_och = NULL;
2883 mutex_unlock(&lli->lli_och_mutex);
2886 GOTO(out, rc = -ENOLCK);
2888 fmode = och->och_flags;
2890 if (ioc->lil_flags & LL_LEASE_RESYNC_DONE) {
2891 if (ioc->lil_count > IOC_IDS_MAX)
2892 GOTO(out, rc = -EINVAL);
/* Variable-length copy: header plus lil_count ids from userspace. */
2894 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
2895 OBD_ALLOC(data, data_size);
2897 GOTO(out, rc = -ENOMEM);
2899 if (copy_from_user(data, (void __user *)arg, data_size))
2900 GOTO(out, rc = -EFAULT);
2902 bias = MDS_CLOSE_RESYNC_DONE;
2905 rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
2909 rc = ll_lease_och_release(inode, file);
2919 OBD_FREE(data, data_size);
2921 rc = ll_lease_type_from_fmode(fmode);
/* Acquire (or, for LL_LEASE_UNLCK, release) a file lease.  The requested
 * lease mode must be compatible with the fd's open mode; RESYNC requests
 * additionally trigger a file resync and layout refresh before the lease
 * is installed on the fd.
 * NOTE(review): fmode/rc declarations, some RETURNs and the out/error
 * paths are elided in this view. */
2925 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
2928 struct inode *inode = file_inode(file);
2929 struct ll_inode_info *lli = ll_i2info(inode);
2930 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2931 struct obd_client_handle *och = NULL;
2932 __u64 open_flags = 0;
/* Map the requested lease mode to an fmode, rejecting modes the fd's
 * open flags do not permit. */
2938 switch (ioc->lil_mode) {
2939 case LL_LEASE_WRLCK:
2940 if (!(file->f_mode & FMODE_WRITE))
2942 fmode = FMODE_WRITE;
2944 case LL_LEASE_RDLCK:
2945 if (!(file->f_mode & FMODE_READ))
2949 case LL_LEASE_UNLCK:
2950 RETURN(ll_file_unlock_lease(file, ioc, arg));
2955 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2957 /* apply for lease */
2958 if (ioc->lil_flags & LL_LEASE_RESYNC)
2959 open_flags = MDS_OPEN_RESYNC;
2960 och = ll_lease_open(inode, file, fmode, open_flags);
2962 RETURN(PTR_ERR(och));
2964 if (ioc->lil_flags & LL_LEASE_RESYNC) {
2965 rc = ll_lease_file_resync(och, inode);
/* On resync/refresh failure the freshly-opened lease is closed again. */
2967 ll_lease_close(och, inode, NULL);
2970 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
2972 ll_lease_close(och, inode, NULL);
/* Install the lease on the fd only if no lease is already present. */
2978 mutex_lock(&lli->lli_och_mutex);
2979 if (fd->fd_lease_och == NULL) {
2980 fd->fd_lease_och = och;
2983 mutex_unlock(&lli->lli_och_mutex);
2985 /* impossible now that only excl is supported for now */
2986 ll_lease_close(och, inode, &lease_broken);
/* Main ioctl dispatcher for regular files on the Lustre client.  Decodes
 * cmd and either handles it locally, forwards it to the MDS/OST via
 * obd_iocontrol, or delegates to a dedicated helper (setstripe, leases,
 * HSM, ladvise, project xattrs, ...).
 * NOTE(review): return type, switch(cmd) line, several case labels
 * (default, LL_IOC_GET_PAGE_SIZE-like cases at 3376/3378) and many braces
 * and RETURNs are elided in this view; code below is kept byte-identical. */
2993 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2995 struct inode *inode = file_inode(file);
2996 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3000 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3001 PFID(ll_inode2fid(inode)), inode, cmd);
3002 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3004 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
3005 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3009 case LL_IOC_GETFLAGS:
3010 /* Get the current value of the file flags */
3011 return put_user(fd->fd_flags, (int __user *)arg);
3012 case LL_IOC_SETFLAGS:
3013 case LL_IOC_CLRFLAGS:
3014 /* Set or clear specific file flags */
3015 /* XXX This probably needs checks to ensure the flags are
3016 * not abused, and to handle any flag side effects.
3018 if (get_user(flags, (int __user *) arg))
3021 if (cmd == LL_IOC_SETFLAGS) {
3022 if ((flags & LL_FILE_IGNORE_LOCK) &&
3023 !(file->f_flags & O_DIRECT)) {
3024 CERROR("%s: unable to disable locking on "
3025 "non-O_DIRECT file\n", current->comm);
3029 fd->fd_flags |= flags;
3031 fd->fd_flags &= ~flags;
3034 case LL_IOC_LOV_SETSTRIPE:
3035 case LL_IOC_LOV_SETSTRIPE_NEW:
3036 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3037 case LL_IOC_LOV_SETEA:
3038 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3039 case LL_IOC_LOV_SWAP_LAYOUTS: {
3041 struct lustre_swap_layouts lsl;
3044 if (copy_from_user(&lsl, (char __user *)arg,
3045 sizeof(struct lustre_swap_layouts)))
/* Both files must be writable to swap their layouts. */
3048 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3051 file2 = fget(lsl.sl_fd);
3055 /* O_WRONLY or O_RDWR */
3056 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3057 GOTO(out, rc = -EPERM);
3059 intent = lsl.sl_flags & INTENT_LAYOUTS_CLOSE;
3061 struct inode *inode2;
3062 struct ll_inode_info *lli;
3063 struct obd_client_handle *och = NULL;
/* Close-intent swap consumes the fd's lease handle. */
3065 lli = ll_i2info(inode);
3066 mutex_lock(&lli->lli_och_mutex);
3067 if (fd->fd_lease_och != NULL) {
3068 och = fd->fd_lease_och;
3069 fd->fd_lease_och = NULL;
3071 mutex_unlock(&lli->lli_och_mutex);
3073 GOTO(out, rc = -ENOLCK);
3074 inode2 = file_inode(file2);
3075 rc = ll_swap_layouts_close(och, inode, inode2, intent);
3077 rc = ll_swap_layouts(file, file2, &lsl);
3083 case LL_IOC_LOV_GETSTRIPE:
3084 case LL_IOC_LOV_GETSTRIPE_NEW:
3085 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3086 case FSFILT_IOC_GETFLAGS:
3087 case FSFILT_IOC_SETFLAGS:
3088 RETURN(ll_iocontrol(inode, file, cmd, arg));
3089 case FSFILT_IOC_GETVERSION_OLD:
3090 case FSFILT_IOC_GETVERSION:
3091 RETURN(put_user(inode->i_generation, (int __user *)arg));
3092 case LL_IOC_GROUP_LOCK:
3093 RETURN(ll_get_grouplock(inode, file, arg));
3094 case LL_IOC_GROUP_UNLOCK:
3095 RETURN(ll_put_grouplock(inode, file, arg));
3096 case IOC_OBD_STATFS:
3097 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3099 /* We need to special case any other ioctls we want to handle,
3100 * to send them to the MDS/OST as appropriate and to properly
3101 * network encode the arg field.
3102 case FSFILT_IOC_SETVERSION_OLD:
3103 case FSFILT_IOC_SETVERSION:
3105 case LL_IOC_FLUSHCTX:
3106 RETURN(ll_flush_ctx(inode));
3107 case LL_IOC_PATH2FID: {
3108 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3109 sizeof(struct lu_fid)))
3114 case LL_IOC_GETPARENT:
3115 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3117 case OBD_IOC_FID2PATH:
3118 RETURN(ll_fid2path(inode, (void __user *)arg));
3119 case LL_IOC_DATA_VERSION: {
3120 struct ioc_data_version idv;
3123 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
/* Mask user flags down to the two supported flush flags. */
3126 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3127 rc = ll_ioc_data_version(inode, &idv);
3130 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3136 case LL_IOC_GET_MDTIDX: {
3139 mdtidx = ll_get_mdt_idx(inode);
3143 if (put_user((int)mdtidx, (int __user *)arg))
3148 case OBD_IOC_GETDTNAME:
3149 case OBD_IOC_GETMDNAME:
3150 RETURN(ll_get_obd_name(inode, cmd, arg));
3151 case LL_IOC_HSM_STATE_GET: {
3152 struct md_op_data *op_data;
3153 struct hsm_user_state *hus;
3160 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3161 LUSTRE_OPC_ANY, hus);
3162 if (IS_ERR(op_data)) {
3164 RETURN(PTR_ERR(op_data));
3167 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3170 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3173 ll_finish_md_op_data(op_data);
3177 case LL_IOC_HSM_STATE_SET: {
3178 struct hsm_state_set *hss;
3185 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3190 rc = ll_hsm_state_set(inode, hss);
3195 case LL_IOC_HSM_ACTION: {
3196 struct md_op_data *op_data;
3197 struct hsm_current_action *hca;
3204 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3205 LUSTRE_OPC_ANY, hca);
3206 if (IS_ERR(op_data)) {
3208 RETURN(PTR_ERR(op_data));
3211 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3214 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3217 ll_finish_md_op_data(op_data);
3221 case LL_IOC_SET_LEASE_OLD: {
/* Legacy interface: arg carries the mode directly, no flags/ids. */
3222 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3224 RETURN(ll_file_set_lease(file, &ioc, 0));
3226 case LL_IOC_SET_LEASE: {
3227 struct ll_ioc_lease ioc;
3229 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3232 RETURN(ll_file_set_lease(file, &ioc, arg));
3234 case LL_IOC_GET_LEASE: {
3235 struct ll_inode_info *lli = ll_i2info(inode);
3236 struct ldlm_lock *lock = NULL;
/* Report the lease mode only if the underlying DLM lock is still
 * granted (not being cancelled). */
3239 mutex_lock(&lli->lli_och_mutex);
3240 if (fd->fd_lease_och != NULL) {
3241 struct obd_client_handle *och = fd->fd_lease_och;
3243 lock = ldlm_handle2lock(&och->och_lease_handle);
3245 lock_res_and_lock(lock);
3246 if (!ldlm_is_cancel(lock))
3247 fmode = och->och_flags;
3249 unlock_res_and_lock(lock);
3250 LDLM_LOCK_PUT(lock);
3253 mutex_unlock(&lli->lli_och_mutex);
3255 RETURN(ll_lease_type_from_fmode(fmode));
3257 case LL_IOC_HSM_IMPORT: {
3258 struct hsm_user_import *hui;
3264 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3269 rc = ll_hsm_import(inode, file, hui);
3274 case LL_IOC_FUTIMES_3: {
3275 struct ll_futimes_3 lfu;
3277 if (copy_from_user(&lfu,
3278 (const struct ll_futimes_3 __user *)arg,
3282 RETURN(ll_file_futimes_3(file, &lfu));
3284 case LL_IOC_LADVISE: {
3285 struct llapi_ladvise_hdr *k_ladvise_hdr;
3286 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3289 int alloc_size = sizeof(*k_ladvise_hdr);
/* Two-pass copy: first the fixed header to learn lah_count, then a
 * re-allocation sized for the full advice array. */
3292 u_ladvise_hdr = (void __user *)arg;
3293 OBD_ALLOC_PTR(k_ladvise_hdr);
3294 if (k_ladvise_hdr == NULL)
3297 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3298 GOTO(out_ladvise, rc = -EFAULT);
3300 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3301 k_ladvise_hdr->lah_count < 1)
3302 GOTO(out_ladvise, rc = -EINVAL);
3304 num_advise = k_ladvise_hdr->lah_count;
3305 if (num_advise >= LAH_COUNT_MAX)
3306 GOTO(out_ladvise, rc = -EFBIG);
3308 OBD_FREE_PTR(k_ladvise_hdr);
3309 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3310 lah_advise[num_advise]);
3311 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3312 if (k_ladvise_hdr == NULL)
3316 * TODO: submit multiple advices to one server in a single RPC
3318 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3319 GOTO(out_ladvise, rc = -EFAULT);
3321 for (i = 0; i < num_advise; i++) {
3322 struct llapi_lu_ladvise *k_ladvise =
3323 &k_ladvise_hdr->lah_advise[i];
3324 struct llapi_lu_ladvise __user *u_ladvise =
3325 &u_ladvise_hdr->lah_advise[i];
3327 rc = ll_ladvise_sanity(inode, k_ladvise);
3329 GOTO(out_ladvise, rc);
3331 switch (k_ladvise->lla_advice) {
3332 case LU_LADVISE_LOCKNOEXPAND:
3333 rc = ll_lock_noexpand(file,
3334 k_ladvise->lla_peradvice_flags);
3335 GOTO(out_ladvise, rc);
3336 case LU_LADVISE_LOCKAHEAD:
3338 rc = ll_file_lock_ahead(file, k_ladvise);
3341 GOTO(out_ladvise, rc);
/* Lockahead writes its per-advice result back to userspace. */
3344 &u_ladvise->lla_lockahead_result))
3345 GOTO(out_ladvise, rc = -EFAULT);
3348 rc = ll_ladvise(inode, file,
3349 k_ladvise_hdr->lah_flags,
3352 GOTO(out_ladvise, rc);
3359 OBD_FREE(k_ladvise_hdr, alloc_size);
3362 case LL_IOC_FLR_SET_MIRROR: {
3363 /* mirror I/O must be direct to avoid polluting page cache
3365 if (!(file->f_flags & O_DIRECT))
3368 fd->fd_designated_mirror = (__u32)arg;
3371 case LL_IOC_FSGETXATTR:
3372 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3373 case LL_IOC_FSSETXATTR:
3374 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3376 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3378 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3379 (void __user *)arg));
/* Fallback helper (only when the kernel lacks generic_file_llseek_size):
 * validate the target offset against sign/maxsize limits and commit it to
 * file->f_pos, resetting f_version on change.
 * NOTE(review): the -EINVAL returns and final return are elided. */
3383 #ifndef HAVE_FILE_LLSEEK_SIZE
3384 static inline loff_t
3385 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3387 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3389 if (offset > maxsize)
3392 if (offset != file->f_pos) {
3393 file->f_pos = offset;
3394 file->f_version = 0;
/* Local copy of generic_file_llseek_size for older kernels: handle
 * SEEK_SET/CUR/END plus SEEK_DATA/SEEK_HOLE against a caller-supplied
 * maxsize and eof.
 * NOTE(review): the return type line, switch(origin) and several case
 * labels/bodies are elided in this view. */
3400 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3401 loff_t maxsize, loff_t eof)
3403 struct inode *inode = file_inode(file);
3411 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3412 * position-querying operation. Avoid rewriting the "same"
3413 * f_pos value back to the file because a concurrent read(),
3414 * write() or lseek() might have altered it
3419 * f_lock protects against read/modify/write race with other
3420 * SEEK_CURs. Note that parallel writes and reads behave
3424 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3425 inode_unlock(inode);
3429 * In the generic case the entire file is data, so as long as
3430 * offset isn't at the end of the file then the offset is data.
3437 * There is a virtual hole at the end of the file, so as long as
3438 * offset isn't i_size or larger, return i_size.
3446 return llseek_execute(file, offset, maxsize);
/* llseek entry point: glimpse the file size from the OSTs when the origin
 * depends on it (SEEK_END/HOLE/DATA), then delegate to the generic
 * llseek-size helper with Lustre's max byte limit.
 * NOTE(review): error handling after ll_glimpse_size and the final RETURN
 * are elided. */
3450 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3452 struct inode *inode = file_inode(file);
3453 loff_t retval, eof = 0;
/* retval here is only the provisional target used for the trace below. */
3456 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3457 (origin == SEEK_CUR) ? file->f_pos : 0);
3458 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3459 PFID(ll_inode2fid(inode)), inode, retval, retval,
3461 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3463 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3464 retval = ll_glimpse_size(inode);
3467 eof = i_size_read(inode);
3470 retval = ll_generic_file_llseek_size(file, offset, origin,
3471 ll_file_maxbytes(inode), eof);
/* flush() file operation: report (and clear) any asynchronous writeback
 * error recorded against this inode, but only once per descriptor — a
 * failure already reported via fd_write_failed is not reported again. */
3475 static int ll_flush(struct file *file, fl_owner_t id)
3477 struct inode *inode = file_inode(file);
3478 struct ll_inode_info *lli = ll_i2info(inode);
3479 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3482 LASSERT(!S_ISDIR(inode->i_mode));
3484 /* catch async errors that were recorded back when async writeback
3485 * failed for pages in this mapping. */
3486 rc = lli->lli_async_rc;
3487 lli->lli_async_rc = 0;
3488 if (lli->lli_clob != NULL) {
3489 err = lov_read_and_clear_async_rc(lli->lli_clob);
3494 /* The application has been told write failure already.
3495 * Do not report failure again. */
3496 if (fd->fd_write_failed)
3498 return rc ? -EIO : 0;
3502 * Called to make sure a portion of file has been written out.
3503 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
3505 * Return how many pages have been written.
/* Flush a byte range of the file through the cl_io CIT_FSYNC path (see
 * header comment above).  On success returns fio->fi_nr_written (pages
 * written); on failure the negative io result. */
3507 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3508 enum cl_fsync_mode mode, int ignore_layout)
3512 struct cl_fsync_io *fio;
/* Reject unknown fsync modes up front. */
3517 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3518 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3521 env = cl_env_get(&refcheck);
3523 RETURN(PTR_ERR(env));
3525 io = vvp_env_thread_io(env);
3526 io->ci_obj = ll_i2info(inode)->lli_clob;
3527 io->ci_ignore_layout = ignore_layout;
3529 /* initialize parameters for sync */
3530 fio = &io->u.ci_fsync;
3531 fio->fi_start = start;
3533 fio->fi_fid = ll_inode2fid(inode);
3534 fio->fi_mode = mode;
3535 fio->fi_nr_written = 0;
3537 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3538 result = cl_io_loop(env, io);
3540 result = io->ci_result;
3542 result = fio->fi_nr_written;
3543 cl_io_fini(env, io);
3544 cl_env_put(env, &refcheck);
3550 * When dentry is provided (the 'else' case), file_dentry() may be
3551 * null and dentry must be used directly rather than pulled from
3552 * file_dentry() as is done otherwise.
/* fsync() file operation, with three kernel-ABI variants selected by
 * configure checks.  Sequence: wait for in-flight page writeback, harvest
 * recorded async errors, fsync metadata on the MDS, then sync file data on
 * the OSTs via cl_sync_file_range, updating fd_write_failed accordingly.
 * NOTE(review): rc/err declarations, inode_lock and several braces are
 * elided in this view. */
3555 #ifdef HAVE_FILE_FSYNC_4ARGS
3556 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3558 struct dentry *dentry = file_dentry(file);
3560 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3561 int ll_fsync(struct file *file, int datasync)
3563 struct dentry *dentry = file_dentry(file);
3565 loff_t end = LLONG_MAX;
3567 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3570 loff_t end = LLONG_MAX;
3572 struct inode *inode = dentry->d_inode;
3573 struct ll_inode_info *lli = ll_i2info(inode);
3574 struct ptlrpc_request *req;
3578 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3579 PFID(ll_inode2fid(inode)), inode);
3580 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3582 #ifdef HAVE_FILE_FSYNC_4ARGS
3583 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
/* Avoid double-locking when the caller already holds the inode lock. */
3584 lock_inode = !lli->lli_inode_locked;
3588 /* fsync's caller has already called _fdata{sync,write}, we want
3589 * that IO to finish before calling the osc and mdc sync methods */
3590 rc = filemap_fdatawait(inode->i_mapping);
3593 /* catch async errors that were recorded back when async writeback
3594 * failed for pages in this mapping. */
3595 if (!S_ISDIR(inode->i_mode)) {
3596 err = lli->lli_async_rc;
3597 lli->lli_async_rc = 0;
3600 if (lli->lli_clob != NULL) {
3601 err = lov_read_and_clear_async_rc(lli->lli_clob);
3607 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3611 ptlrpc_req_finished(req);
3613 if (S_ISREG(inode->i_mode)) {
3614 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3616 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3617 if (rc == 0 && err < 0)
3620 fd->fd_write_failed = true;
3622 fd->fd_write_failed = false;
3625 #ifdef HAVE_FILE_FSYNC_4ARGS
3627 inode_unlock(inode);
/* flock()/fcntl() lock handler: translate the VFS file_lock into an LDLM
 * flock enqueue on the MDS, then mirror the result into the local VFS lock
 * state; on local failure the remote lock is rolled back with an LCK_NL
 * enqueue.
 * NOTE(review): return type, several switch labels (F_RDLCK/F_WRLCK/
 * F_UNLCK, cmd cases), RETURNs and braces are elided in this view. */
3633 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3635 struct inode *inode = file_inode(file);
3636 struct ll_sb_info *sbi = ll_i2sbi(inode);
3637 struct ldlm_enqueue_info einfo = {
3638 .ei_type = LDLM_FLOCK,
3639 .ei_cb_cp = ldlm_flock_completion_ast,
3640 .ei_cbdata = file_lock,
3642 struct md_op_data *op_data;
3643 struct lustre_handle lockh = { 0 };
3644 union ldlm_policy_data flock = { { 0 } };
3645 int fl_type = file_lock->fl_type;
3651 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3652 PFID(ll_inode2fid(inode)), file_lock);
3654 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3656 if (file_lock->fl_flags & FL_FLOCK) {
3657 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3658 /* flocks are whole-file locks */
3659 flock.l_flock.end = OFFSET_MAX;
3660 /* For flocks owner is determined by the local file desctiptor*/
3661 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3662 } else if (file_lock->fl_flags & FL_POSIX) {
3663 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3664 flock.l_flock.start = file_lock->fl_start;
3665 flock.l_flock.end = file_lock->fl_end;
3669 flock.l_flock.pid = file_lock->fl_pid;
3671 /* Somewhat ugly workaround for svc lockd.
3672 * lockd installs custom fl_lmops->lm_compare_owner that checks
3673 * for the fl_owner to be the same (which it always is on local node
3674 * I guess between lockd processes) and then compares pid.
3675 * As such we assign pid to the owner field to make it all work,
3676 * conflict with normal locks is unlikely since pid space and
3677 * pointer space for current->files are not intersecting */
3678 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3679 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* Map the fcntl lock type to an LDLM mode (read -> PR, write -> PW,
 * unlock -> NL; see comment below). */
3683 einfo.ei_mode = LCK_PR;
3686 /* An unlock request may or may not have any relation to
3687 * existing locks so we may not be able to pass a lock handle
3688 * via a normal ldlm_lock_cancel() request. The request may even
3689 * unlock a byte range in the middle of an existing lock. In
3690 * order to process an unlock request we need all of the same
3691 * information that is given with a normal read or write record
3692 * lock request. To avoid creating another ldlm unlock (cancel)
3693 * message we'll treat a LCK_NL flock request as an unlock. */
3694 einfo.ei_mode = LCK_NL;
3697 einfo.ei_mode = LCK_PW;
3700 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* Non-blocking set requests and TEST requests map to dedicated
 * LDLM enqueue flags. */
3715 flags = LDLM_FL_BLOCK_NOWAIT;
3721 flags = LDLM_FL_TEST_LOCK;
3724 CERROR("unknown fcntl lock command: %d\n", cmd);
3728 /* Save the old mode so that if the mode in the lock changes we
3729 * can decrement the appropriate reader or writer refcount. */
3730 file_lock->fl_type = einfo.ei_mode;
3732 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3733 LUSTRE_OPC_ANY, NULL);
3734 if (IS_ERR(op_data))
3735 RETURN(PTR_ERR(op_data));
3737 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
3738 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
3739 flock.l_flock.pid, flags, einfo.ei_mode,
3740 flock.l_flock.start, flock.l_flock.end);
3742 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
3745 /* Restore the file lock type if not TEST lock. */
3746 if (!(flags & LDLM_FL_TEST_LOCK))
3747 file_lock->fl_type = fl_type;
3749 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
3750 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
3751 !(flags & LDLM_FL_TEST_LOCK))
3752 rc2 = locks_lock_file_wait(file, file_lock);
3754 if ((file_lock->fl_flags & FL_FLOCK) &&
3755 (rc == 0 || file_lock->fl_type == F_UNLCK))
3756 rc2 = flock_lock_file_wait(file, file_lock);
3757 if ((file_lock->fl_flags & FL_POSIX) &&
3758 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3759 !(flags & LDLM_FL_TEST_LOCK))
3760 rc2 = posix_lock_file_wait(file, file_lock);
3761 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
/* Local bookkeeping failed: undo the server-side lock with an NL
 * (unlock) enqueue so client and server state stay consistent. */
3763 if (rc2 && file_lock->fl_type != F_UNLCK) {
3764 einfo.ei_mode = LCK_NL;
3765 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
3770 ll_finish_md_op_data(op_data);
/* Look up the FID of a child entry by name via an MDS getattr-by-name;
 * optionally instantiates *inode from the reply when the caller passes a
 * non-NULL inode pointer.
 * NOTE(review): rc declaration, out_req label and final RETURN elided. */
3775 int ll_get_fid_by_name(struct inode *parent, const char *name,
3776 int namelen, struct lu_fid *fid,
3777 struct inode **inode)
3779 struct md_op_data *op_data = NULL;
3780 struct mdt_body *body;
3781 struct ptlrpc_request *req;
3785 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3786 LUSTRE_OPC_ANY, NULL);
3787 if (IS_ERR(op_data))
3788 RETURN(PTR_ERR(op_data));
3790 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
3791 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3792 ll_finish_md_op_data(op_data);
3796 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3798 GOTO(out_req, rc = -EFAULT);
3800 *fid = body->mbo_fid1;
3803 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
3805 ptlrpc_req_finished(req);
/* Migrate a directory entry (and its object) to another MDT: resolve the
 * child inode (dcache first, then by-name MDS lookup), take a write lease
 * on regular files to freeze the data version, then issue a rename RPC
 * with CLI_MIGRATE/MDS_RENAME_MIGRATE.  Retries on -EAGAIN when the layout
 * changed under us.
 * NOTE(review): rc declaration, the again: label, qstr setup lines and
 * various braces/labels are elided in this view. */
3809 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3810 const char *name, int namelen)
3812 struct dentry *dchild = NULL;
3813 struct inode *child_inode = NULL;
3814 struct md_op_data *op_data;
3815 struct ptlrpc_request *request = NULL;
3816 struct obd_client_handle *och = NULL;
3818 struct mdt_body *body;
3820 __u64 data_version = 0;
3823 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3824 name, PFID(ll_inode2fid(parent)), mdtidx);
3826 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3827 0, LUSTRE_OPC_ANY, NULL);
3828 if (IS_ERR(op_data))
3829 RETURN(PTR_ERR(op_data));
3831 /* Get child FID first */
3832 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
/* Prefer the dcache; fall back to an MDS lookup by name below. */
3835 dchild = d_lookup(file_dentry(file), &qstr);
3836 if (dchild != NULL) {
3837 if (dchild->d_inode != NULL)
3838 child_inode = igrab(dchild->d_inode);
3842 if (child_inode == NULL) {
3843 rc = ll_get_fid_by_name(parent, name, namelen,
3844 &op_data->op_fid3, &child_inode);
3849 if (child_inode == NULL)
3850 GOTO(out_free, rc = -EINVAL);
3853 * lfs migrate command needs to be blocked on the client
3854 * by checking the migrate FID against the FID of the
3857 if (child_inode == parent->i_sb->s_root->d_inode)
3858 GOTO(out_iput, rc = -EINVAL);
3860 inode_lock(child_inode);
3861 op_data->op_fid3 = *ll_inode2fid(child_inode);
3862 if (!fid_is_sane(&op_data->op_fid3)) {
3863 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
3864 ll_get_fsname(parent->i_sb, NULL, 0), name,
3865 PFID(&op_data->op_fid3));
3866 GOTO(out_unlock, rc = -EINVAL);
/* Nothing to do if the object already lives on the target MDT. */
3869 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3871 GOTO(out_unlock, rc);
3874 CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name,
3875 PFID(&op_data->op_fid3), mdtidx);
3876 GOTO(out_unlock, rc = 0);
3879 if (S_ISREG(child_inode->i_mode)) {
/* Take a write lease so no one changes the data while we record
 * its version for the migration rename. */
3880 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
3884 GOTO(out_unlock, rc);
3887 rc = ll_data_version(child_inode, &data_version,
3890 GOTO(out_close, rc);
3892 op_data->op_handle = och->och_fh;
3893 op_data->op_data = och->och_mod;
3894 op_data->op_data_version = data_version;
3895 op_data->op_lease_handle = och->och_lease_handle;
3896 op_data->op_bias |= MDS_RENAME_MIGRATE;
3899 op_data->op_mds = mdtidx;
3900 op_data->op_cli_flags = CLI_MIGRATE;
3901 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3902 namelen, name, namelen, &request);
3904 LASSERT(request != NULL);
3905 ll_update_times(request, parent);
3907 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
3908 LASSERT(body != NULL);
3910 /* If the server does release layout lock, then we cleanup
3911 * the client och here, otherwise release it in out_close: */
3913 body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
3914 obd_mod_put(och->och_mod);
3915 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
3917 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
3923 if (request != NULL) {
3924 ptlrpc_req_finished(request);
3928 /* Try again if the file layout has changed. */
3929 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
3933 if (och != NULL) /* close the file */
3934 ll_lease_close(och, child_inode, NULL);
3936 clear_nlink(child_inode);
3938 inode_unlock(child_inode);
3942 ll_finish_md_op_data(op_data);
3947 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3955 * test if some locks matching bits and l_req_mode are acquired
3956 * - bits can be in different locks
3957 * - if found clear the common lock bits in *bits
3958 * - the bits not found, are kept in *bits
3960 * \param bits [IN] searched lock bits [IN]
3961 * \param l_req_mode [IN] searched lock mode
3962 * \retval boolean, true iff all bits are found
/* Test (without taking references that would block cancellation — note
 * LDLM_FL_TEST_LOCK) whether MD inodebit locks covering *bits are already
 * held, clearing each found bit from *bits; see header comment above.
 * NOTE(review): rc/flags/i/fid declarations, inner braces and final RETURN
 * are elided in this view. */
3964 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
3966 struct lustre_handle lockh;
3967 union ldlm_policy_data policy;
3968 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
3969 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
3978 fid = &ll_i2info(inode)->lli_fid;
3979 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3980 ldlm_lockname[mode]);
3982 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* Probe one inodebit at a time; stop early once all bits are found. */
3983 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3984 policy.l_inodebits.bits = *bits & (1 << i);
3985 if (policy.l_inodebits.bits == 0)
3988 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3989 &policy, mode, &lockh)) {
3990 struct ldlm_lock *lock;
3992 lock = ldlm_handle2lock(&lockh);
3995 ~(lock->l_policy_data.l_inodebits.bits);
3996 LDLM_LOCK_PUT(lock);
3998 *bits &= ~policy.l_inodebits.bits;
/* Match (and, unlike ll_have_md_lock, take a reference on) an MD inodebit
 * lock with the given bits/mode on this inode, returning the matched mode
 * via md_lock_match and the handle via *lockh.
 * NOTE(review): rc/fid declarations and final RETURN are elided. */
4005 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4006 struct lustre_handle *lockh, __u64 flags,
4007 enum ldlm_mode mode)
4009 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4014 fid = &ll_i2info(inode)->lli_fid;
4015 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4017 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4018 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the status of a revalidate RPC: translate expected error
 * codes (-ENOENT for an already-unlinked object) and log unexpected ones.
 */
4023 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4025 /* Already unlinked. Just update nlink and return success */
4026 if (rc == -ENOENT) {
4028 /* If it is striped directory, and there is bad stripe
4029 * Let's revalidate the dentry again, instead of returning
4031 if (S_ISDIR(inode->i_mode) &&
4032 ll_i2info(inode)->lli_lsm_md != NULL)
4035 /* This path cannot be hit for regular files unless in
4036 * case of obscure races, so no need to validate
4038 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4040 } else if (rc != 0) {
/* -EACCES/-EIDRM are expected under permission/identity churn, so they
 * are logged at D_INFO; anything else is a real error */
4041 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4042 "%s: revalidate FID "DFID" error: rc = %d\n",
4043 ll_get_fsname(inode->i_sb, NULL, 0),
4044 PFID(ll_inode2fid(inode)), rc);
/*
 * Refresh inode metadata from the MDS unless a matching ibits lock is
 * already cached. Two strategies, chosen by the export's connect flags:
 * ATTRFID-capable servers get an intent getattr-by-FID; otherwise a plain
 * md_getattr() is issued when no local MD lock covers \a ibits.
 */
4050 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
4052 struct inode *inode = dentry->d_inode;
4053 struct ptlrpc_request *req = NULL;
4054 struct obd_export *exp;
4058 LASSERT(inode != NULL);
4060 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4061 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4063 exp = ll_i2mdexp(inode);
4065 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
4066 * But under CMD case, it caused some lock issues, should be fixed
4067 * with new CMD ibits lock. See bug 12718 */
4068 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
4069 struct lookup_intent oit = { .it_op = IT_GETATTR };
4070 struct md_op_data *op_data;
/* lookup-only revalidation needs a weaker intent than full getattr */
4072 if (ibits == MDS_INODELOCK_LOOKUP)
4073 oit.it_op = IT_LOOKUP;
4075 /* Call getattr by fid, so do not provide name at all. */
4076 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
4077 dentry->d_inode, NULL, 0, 0,
4078 LUSTRE_OPC_ANY, NULL);
4079 if (IS_ERR(op_data))
4080 RETURN(PTR_ERR(op_data));
4082 rc = md_intent_lock(exp, op_data, &oit, &req,
4083 &ll_md_blocking_ast, 0);
4084 ll_finish_md_op_data(op_data);
4086 rc = ll_inode_revalidate_fini(inode, rc);
4090 rc = ll_revalidate_it_finish(req, &oit, dentry);
4092 ll_intent_release(&oit);
4096 /* Unlinked? Unhash dentry, so it is not picked up later by
4097 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4098 here to preserve get_cwd functionality on 2.6.
4100 if (!dentry->d_inode->i_nlink) {
4101 ll_lock_dcache(inode);
4102 d_lustre_invalidate(dentry, 0);
4103 ll_unlock_dcache(inode);
4106 ll_lookup_finish_locks(&oit, dentry);
/* non-ATTRFID path: only go to the wire when no cached lock covers ibits */
4107 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
4108 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
4109 u64 valid = OBD_MD_FLGETATTR;
4110 struct md_op_data *op_data;
/* regular files also need striping EA, so size the reply buffer for it */
4113 if (S_ISREG(inode->i_mode)) {
4114 rc = ll_get_default_mdsize(sbi, &ealen);
4117 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
4120 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
4121 0, ealen, LUSTRE_OPC_ANY,
4123 if (IS_ERR(op_data))
4124 RETURN(PTR_ERR(op_data));
4126 op_data->op_valid = valid;
4127 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
4128 ll_finish_md_op_data(op_data);
4130 rc = ll_inode_revalidate_fini(inode, rc);
/* apply the attributes from the getattr reply to the inode */
4134 rc = ll_prep_inode(&inode, req, NULL, NULL);
4137 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge the per-stripe attributes from all MDTs
 * into a single view and apply the result (nlink/blocks/size/times) to the
 * inode. Requires lli_lsm_md to be set.
 */
4141 static int ll_merge_md_attr(struct inode *inode)
4143 struct cl_attr attr = { 0 };
4146 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
4147 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4148 &attr, ll_md_blocking_ast);
4152 set_nlink(inode, attr.cat_nlink);
4153 inode->i_blocks = attr.cat_blocks;
4154 i_size_write(inode, attr.cat_size);
/* keep the llite-side time cache in sync with the merged attributes */
4156 ll_i2info(inode)->lli_atime = attr.cat_atime;
4157 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4158 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/*
 * Full revalidation: refresh MD attributes via __ll_inode_revalidate(),
 * then bring size/blocks up to date -- merged MDT attrs for striped
 * directories, cached lli times for other non-regular objects, and an
 * OST glimpse for regular files (skipped during HSM restore).
 */
4164 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
4166 struct inode *inode = dentry->d_inode;
4170 rc = __ll_inode_revalidate(dentry, ibits);
4174 /* if object isn't regular file, don't validate size */
4175 if (!S_ISREG(inode->i_mode)) {
4176 if (S_ISDIR(inode->i_mode) &&
4177 ll_i2info(inode)->lli_lsm_md != NULL) {
4178 rc = ll_merge_md_attr(inode);
4183 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
4184 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
4185 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
4187 /* In case of restore, the MDT has the right size and has
4188 * already send it back without granting the layout lock,
4189 * inode is up-to-date so glimpse is useless.
4190 * Also to glimpse we need the layout, in case of a running
4191 * restore the MDT holds the layout lock so the glimpse will
4192 * block up to the end of restore (getattr will block)
4194 if (!ll_file_test_flag(ll_i2info(inode), LLIF_FILE_RESTORING))
4195 rc = ll_glimpse_size(inode);
/* Squash a device number so 32-bit compat stat syscalls accept it. */
4200 static inline dev_t ll_compat_encode_dev(dev_t dev)
4202 /* The compat_sys_*stat*() syscalls will fail unless the
4203 * device majors and minors are both less than 256. Note that
4204 * the value returned here will be passed through
4205 * old_encode_dev() in cp_compat_stat(). And so we are not
4206 * trying to return a valid compat (u16) device number, just
4207 * one that will pass the old_valid_dev() check. */
4209 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
/*
 * ->getattr handler: revalidate UPDATE|LOOKUP metadata against the MDS,
 * then fill *stat from the (now fresh) inode. Two signatures are kept for
 * kernels before/after the statx-era getattr API change.
 */
4212 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4213 int ll_getattr(const struct path *path, struct kstat *stat,
4214 u32 request_mask, unsigned int flags)
4217 struct dentry *de = path->dentry;
4219 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4222 struct inode *inode = de->d_inode;
4223 struct ll_sb_info *sbi = ll_i2sbi(inode);
4224 struct ll_inode_info *lli = ll_i2info(inode);
4227 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
4228 MDS_INODELOCK_LOOKUP);
4229 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
/* fault-injection hook for testing slow getattr paths */
4234 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
/* 32-bit clients need an ino/dev that fits old compat stat structs */
4236 if (ll_need_32bit_api(sbi)) {
4237 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4238 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4239 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4241 stat->ino = inode->i_ino;
4242 stat->dev = inode->i_sb->s_dev;
4243 stat->rdev = inode->i_rdev;
4246 stat->mode = inode->i_mode;
4247 stat->uid = inode->i_uid;
4248 stat->gid = inode->i_gid;
4249 stat->atime = inode->i_atime;
4250 stat->mtime = inode->i_mtime;
4251 stat->ctime = inode->i_ctime;
/* prefer the tunable stat blocksize when the admin has set one */
4252 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4254 stat->nlink = inode->i_nlink;
4255 stat->size = i_size_read(inode);
4256 stat->blocks = inode->i_blocks;
/*
 * FIEMAP ioctl backend: marshal the VFS fiemap_extent_info into a
 * struct fiemap, run ll_do_fiemap(), and copy the mapped extents back
 * to the user buffer.
 */
4261 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4262 __u64 start, __u64 len)
4266 struct fiemap *fiemap;
4267 unsigned int extent_count = fieinfo->fi_extents_max;
4269 num_bytes = sizeof(*fiemap) + (extent_count *
4270 sizeof(struct fiemap_extent));
4271 OBD_ALLOC_LARGE(fiemap, num_bytes);
4276 fiemap->fm_flags = fieinfo->fi_flags;
4277 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4278 fiemap->fm_start = start;
4279 fiemap->fm_length = len;
/* NOTE(review): only the first fiemap_extent is copied in -- it carries
 * the continuation cookie for restarted FIEMAP calls; confirm that no
 * caller expects more input extents to be honoured */
4280 if (extent_count > 0 &&
4281 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4282 sizeof(struct fiemap_extent)) != 0)
4283 GOTO(out, rc = -EFAULT);
4285 rc = ll_do_fiemap(inode, fiemap, num_bytes);
4287 fieinfo->fi_flags = fiemap->fm_flags;
4288 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4289 if (extent_count > 0 &&
4290 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4291 fiemap->fm_mapped_extents *
4292 sizeof(struct fiemap_extent)) != 0)
4293 GOTO(out, rc = -EFAULT);
4295 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * ->get_acl handler: hand out a referenced copy of the cached POSIX ACL.
 * lli_lock guards lli_posix_acl against concurrent update.
 */
4299 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4301 struct ll_inode_info *lli = ll_i2info(inode);
4302 struct posix_acl *acl = NULL;
4305 spin_lock(&lli->lli_lock);
4306 /* VFS' acl_permission_check->check_acl will release the refcount */
4307 acl = posix_acl_dup(lli->lli_posix_acl);
4308 spin_unlock(&lli->lli_lock);
/*
 * ->set_acl handler: serialize the ACL to xattr form and store it on the
 * MDS via the generic setxattr path, then refresh the local ACL cache.
 */
4313 #ifdef HAVE_IOP_SET_ACL
4314 #ifdef CONFIG_FS_POSIX_ACL
4315 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4317 const char *name = NULL;
4324 case ACL_TYPE_ACCESS:
/* an access ACL may imply a mode change; update i_mode first and let
 * posix_acl_update_mode() drop the ACL if it becomes redundant */
4326 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4330 name = XATTR_NAME_POSIX_ACL_ACCESS;
4332 case ACL_TYPE_DEFAULT:
/* default ACLs are only meaningful on directories */
4333 if (!S_ISDIR(inode->i_mode))
4334 GOTO(out, rc = acl ? -EACCES : 0);
4335 name = XATTR_NAME_POSIX_ACL_DEFAULT;
4338 GOTO(out, rc = -EINVAL);
4342 size = posix_acl_xattr_size(acl->a_count);
4343 value = kmalloc(size, GFP_NOFS);
4345 GOTO(out, rc = -ENOMEM);
4347 rc = posix_acl_to_xattr(&init_user_ns, acl, value, size);
4352 /* dentry is only used for *.lov attributes so it's safe to be NULL */
4353 rc = __vfs_setxattr(NULL, inode, name, value, size, XATTR_CREATE);
4358 set_cached_acl(inode, type, acl);
4360 forget_cached_acl(inode, type);
4363 #endif /* CONFIG_FS_POSIX_ACL */
4364 #endif /* HAVE_IOP_SET_ACL */
/*
 * ACL callback for generic_permission() on kernels that still take one:
 * fetch the cached access ACL and evaluate it against \a mask.
 * Signature varies with the kernel's generic_permission() arity.
 */
4366 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4368 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4369 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4371 ll_check_acl(struct inode *inode, int mask)
4374 # ifdef CONFIG_FS_POSIX_ACL
4375 struct posix_acl *acl;
/* cannot touch the ACL under RCU-walk; force ref-walk retry */
4379 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4380 if (flags & IPERM_FLAG_RCU)
4383 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4388 rc = posix_acl_permission(inode, acl, mask);
4389 posix_acl_release(acl);
4392 # else /* !CONFIG_FS_POSIX_ACL */
4394 # endif /* CONFIG_FS_POSIX_ACL */
4396 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * ->permission handler: revalidate the root inode on first touch, apply
 * root squashing (temporarily overriding creds and dropping FS
 * capabilities) when configured, then defer to generic permission logic.
 * Signature varies with the kernel's inode_permission API generation.
 */
4398 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4399 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4401 # ifdef HAVE_INODE_PERMISION_2ARGS
4402 int ll_inode_permission(struct inode *inode, int mask)
4404 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4409 struct ll_sb_info *sbi;
4410 struct root_squash_info *squash;
4411 struct cred *cred = NULL;
4412 const struct cred *old_cred = NULL;
4414 bool squash_id = false;
/* revalidation may block; bail out of RCU-walk and let VFS retry */
4417 #ifdef MAY_NOT_BLOCK
4418 if (mask & MAY_NOT_BLOCK)
4420 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4421 if (flags & IPERM_FLAG_RCU)
4425 /* as root inode are NOT getting validated in lookup operation,
4426 * need to do it before permission check. */
4428 if (inode == inode->i_sb->s_root->d_inode) {
4429 rc = __ll_inode_revalidate(inode->i_sb->s_root,
4430 MDS_INODELOCK_LOOKUP);
4435 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4436 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4438 /* squash fsuid/fsgid if needed */
4439 sbi = ll_i2sbi(inode);
4440 squash = &sbi->ll_squash;
4441 if (unlikely(squash->rsi_uid != 0 &&
4442 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4443 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4447 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4448 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4449 squash->rsi_uid, squash->rsi_gid);
4451 /* update current process's credentials
4452 * and FS capability */
4453 cred = prepare_creds();
4457 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4458 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* strip every filesystem-related capability from the squashed creds */
4459 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4460 if ((1 << cap) & CFS_CAP_FS_MASK)
4461 cap_lower(cred->cap_effective, cap);
4463 old_cred = override_creds(cred);
4466 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4467 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4468 /* restore current process's credentials and FS capability */
4470 revert_creds(old_cred);
/* -o localflock - only provides locally consistent flock locks */
/* Default file_operations: no .flock/.lock methods, so flock() falls back
 * to the kernel's local implementation (no cluster-wide consistency). */
4478 struct file_operations ll_file_operations = {
4479 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4480 # ifdef HAVE_SYNC_READ_WRITE
4481 .read = new_sync_read,
4482 .write = new_sync_write,
4484 .read_iter = ll_file_read_iter,
4485 .write_iter = ll_file_write_iter,
4486 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4487 .read = ll_file_read,
4488 .aio_read = ll_file_aio_read,
4489 .write = ll_file_write,
4490 .aio_write = ll_file_aio_write,
4491 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4492 .unlocked_ioctl = ll_file_ioctl,
4493 .open = ll_file_open,
4494 .release = ll_file_release,
4495 .mmap = ll_file_mmap,
4496 .llseek = ll_file_seek,
4497 .splice_read = ll_file_splice_read,
/* -o flock: same as the default table but routes flock()/fcntl() locks
 * through ll_file_flock for cluster-coherent locking. */
4502 struct file_operations ll_file_operations_flock = {
4503 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4504 # ifdef HAVE_SYNC_READ_WRITE
4505 .read = new_sync_read,
4506 .write = new_sync_write,
4507 # endif /* HAVE_SYNC_READ_WRITE */
4508 .read_iter = ll_file_read_iter,
4509 .write_iter = ll_file_write_iter,
4510 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4511 .read = ll_file_read,
4512 .aio_read = ll_file_aio_read,
4513 .write = ll_file_write,
4514 .aio_write = ll_file_aio_write,
4515 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4516 .unlocked_ioctl = ll_file_ioctl,
4517 .open = ll_file_open,
4518 .release = ll_file_release,
4519 .mmap = ll_file_mmap,
4520 .llseek = ll_file_seek,
4521 .splice_read = ll_file_splice_read,
4524 .flock = ll_file_flock,
4525 .lock = ll_file_flock
4528 /* These are for -o noflock - to return ENOSYS on flock calls */
4529 struct file_operations ll_file_operations_noflock = {
4530 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4531 # ifdef HAVE_SYNC_READ_WRITE
4532 .read = new_sync_read,
4533 .write = new_sync_write,
4534 # endif /* HAVE_SYNC_READ_WRITE */
4535 .read_iter = ll_file_read_iter,
4536 .write_iter = ll_file_write_iter,
4537 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4538 .read = ll_file_read,
4539 .aio_read = ll_file_aio_read,
4540 .write = ll_file_write,
4541 .aio_write = ll_file_aio_write,
4542 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4543 .unlocked_ioctl = ll_file_ioctl,
4544 .open = ll_file_open,
4545 .release = ll_file_release,
4546 .mmap = ll_file_mmap,
4547 .llseek = ll_file_seek,
4548 .splice_read = ll_file_splice_read,
/* ll_file_noflock rejects all lock requests for -o noflock mounts */
4551 .flock = ll_file_noflock,
4552 .lock = ll_file_noflock
/* inode_operations for regular files; xattr/ACL entries are compiled in
 * only on kernels whose inode_operations still carry those hooks. */
4555 struct inode_operations ll_file_inode_operations = {
4556 .setattr = ll_setattr,
4557 .getattr = ll_getattr,
4558 .permission = ll_inode_permission,
4559 #ifdef HAVE_IOP_XATTR
4560 .setxattr = ll_setxattr,
4561 .getxattr = ll_getxattr,
4562 .removexattr = ll_removexattr,
4564 .listxattr = ll_listxattr,
4565 .fiemap = ll_fiemap,
4566 #ifdef HAVE_IOP_GET_ACL
4567 .get_acl = ll_get_acl,
4569 #ifdef HAVE_IOP_SET_ACL
4570 .set_acl = ll_set_acl,
/*
 * Push a layout configuration into the cl_object stack. For OBJECT_CONF_SET
 * the layout lock is made matchable only after the layout is applied, and
 * the cached layout generation is refreshed from the new layout.
 */
4574 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4576 struct ll_inode_info *lli = ll_i2info(inode);
4577 struct cl_object *obj = lli->lli_clob;
4586 env = cl_env_get(&refcheck);
4588 RETURN(PTR_ERR(env));
4590 rc = cl_conf_set(env, lli->lli_clob, conf);
4594 if (conf->coc_opc == OBJECT_CONF_SET) {
4595 struct ldlm_lock *lock = conf->coc_lock;
4596 struct cl_layout cl = {
4600 LASSERT(lock != NULL);
4601 LASSERT(ldlm_has_layout(lock));
4603 /* it can only be allowed to match after layout is
4604 * applied to inode otherwise false layout would be
4605 * seen. Applying layout should happen before dropping
4606 * the intent lock. */
4607 ldlm_lock_allow_match(lock);
4609 rc = cl_object_layout_get(env, obj, &cl);
4614 DFID": layout version change: %u -> %u\n",
4615 PFID(&lli->lli_fid), ll_layout_version_get(lli),
4617 ll_layout_version_set(lli, cl.cl_layout_gen);
4621 cl_env_put(env, &refcheck);
4626 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
4627 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4630 struct ll_sb_info *sbi = ll_i2sbi(inode);
4631 struct ptlrpc_request *req;
4632 struct mdt_body *body;
4639 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4640 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4641 lock->l_lvb_data, lock->l_lvb_len);
/* layout already attached to the lock, nothing to fetch */
4643 if (lock->l_lvb_data != NULL)
4646 /* if layout lock was granted right away, the layout is returned
4647 * within DLM_LVB of dlm reply; otherwise if the lock was ever
4648 * blocked and then granted via completion ast, we have to fetch
4649 * layout here. Please note that we can't use the LVB buffer in
4650 * completion AST because it doesn't have a large enough buffer */
4651 rc = ll_get_default_mdsize(sbi, &lmmsize);
/* pull the trusted.lov EA that encodes the layout */
4653 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4654 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
4659 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4661 GOTO(out, rc = -EPROTO);
4663 lmmsize = body->mbo_eadatasize;
4664 if (lmmsize == 0) /* empty layout */
4667 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4669 GOTO(out, rc = -EFAULT);
/* copy out of the RPC reply: the LVB must outlive the request */
4671 OBD_ALLOC_LARGE(lvbdata, lmmsize);
4672 if (lvbdata == NULL)
4673 GOTO(out, rc = -ENOMEM);
4675 memcpy(lvbdata, lmm, lmmsize);
4676 lock_res_and_lock(lock);
/* install the buffer only if we did not race with another filler;
 * the loser's buffer is freed below */
4677 if (unlikely(lock->l_lvb_data == NULL)) {
4678 lock->l_lvb_type = LVB_T_LAYOUT;
4679 lock->l_lvb_data = lvbdata;
4680 lock->l_lvb_len = lmmsize;
4683 unlock_res_and_lock(lock);
4686 OBD_FREE_LARGE(lvbdata, lmmsize);
4691 ptlrpc_req_finished(req);
4696 * Apply the layout to the inode. Layout lock is held and will be released
4699 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4700 struct inode *inode)
4702 struct ll_inode_info *lli = ll_i2info(inode);
4703 struct ll_sb_info *sbi = ll_i2sbi(inode);
4704 struct ldlm_lock *lock;
4705 struct cl_object_conf conf;
4708 bool wait_layout = false;
4711 LASSERT(lustre_handle_is_used(lockh));
4713 lock = ldlm_handle2lock(lockh);
4714 LASSERT(lock != NULL);
4715 LASSERT(ldlm_has_layout(lock));
4717 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4718 PFID(&lli->lli_fid), inode);
4720 /* in case this is a caching lock and reinstate with new inode */
4721 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
4723 lock_res_and_lock(lock);
4724 lvb_ready = ldlm_is_lvb_ready(lock);
4725 unlock_res_and_lock(lock);
4727 /* checking lvb_ready is racy but this is okay. The worst case is
4728 * that multi processes may configure the file on the same time. */
/* layout not cached in the lock yet: fetch it from the MDT first */
4732 rc = ll_layout_fetch(inode, lock);
4736 /* for layout lock, lmm is stored in lock's lvb.
4737 * lvb_data is immutable if the lock is held so it's safe to access it
4740 * set layout to file. Unlikely this will fail as old layout was
4741 * surely eliminated */
4742 memset(&conf, 0, sizeof conf);
4743 conf.coc_opc = OBJECT_CONF_SET;
4744 conf.coc_inode = inode;
4745 conf.coc_lock = lock;
4746 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
4747 conf.u.coc_layout.lb_len = lock->l_lvb_len;
4748 rc = ll_layout_conf(inode, &conf);
4750 /* refresh layout failed, need to wait */
4751 wait_layout = rc == -EBUSY;
/* drop our lock reference before possibly blocking on in-flight IO */
4754 LDLM_LOCK_PUT(lock);
4755 ldlm_lock_decref(lockh, mode);
4757 /* wait for IO to complete if it's still being used. */
4759 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
4760 ll_get_fsname(inode->i_sb, NULL, 0),
4761 PFID(&lli->lli_fid), inode);
/* OBJECT_CONF_WAIT blocks until outstanding IO under the old layout
 * drains, after which the caller will retry the layout enqueue */
4763 memset(&conf, 0, sizeof conf);
4764 conf.coc_opc = OBJECT_CONF_WAIT;
4765 conf.coc_inode = inode;
4766 rc = ll_layout_conf(inode, &conf);
4770 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4771 ll_get_fsname(inode->i_sb, NULL, 0),
4772 PFID(&lli->lli_fid), rc);
4778 * Issue layout intent RPC to MDS.
4779 * \param inode [in] file inode
4780 * \param intent [in] layout intent
4782 * \retval 0 on success
4783 * \retval < 0 error code
4785 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
4787 struct ll_inode_info *lli = ll_i2info(inode);
4788 struct ll_sb_info *sbi = ll_i2sbi(inode);
4789 struct md_op_data *op_data;
4790 struct lookup_intent it;
4791 struct ptlrpc_request *req;
4795 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4796 0, 0, LUSTRE_OPC_ANY, NULL);
4797 if (IS_ERR(op_data))
4798 RETURN(PTR_ERR(op_data));
/* the layout_intent payload rides inside the op_data blob */
4800 op_data->op_data = intent;
4801 op_data->op_data_size = sizeof(*intent);
4803 memset(&it, 0, sizeof(it));
4804 it.it_op = IT_LAYOUT;
/* write/truncate intents need the lock in write mode on the MDT side */
4805 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
4806 intent->li_opc == LAYOUT_INTENT_TRUNC)
4807 it.it_flags = FMODE_WRITE;
4809 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4810 ll_get_fsname(inode->i_sb, NULL, 0),
4811 PFID(&lli->lli_fid), inode);
4813 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
4814 &ll_md_blocking_ast, 0);
4815 if (it.it_request != NULL)
4816 ptlrpc_req_finished(it.it_request);
4817 it.it_request = NULL;
4819 ll_finish_md_op_data(op_data);
4821 /* set lock data in case this is a new lock */
4823 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4825 ll_intent_drop_lock(&it);
4831 * This function checks if there exists a LAYOUT lock on the client side,
4832 * or enqueues it if it doesn't have one in cache.
4834 * This function will not hold layout lock so it may be revoked any time after
4835 * this function returns. Any operations depend on layout should be redone
4838 * This function should be called before lov_io_init() to get an uptodate
4839 * layout version, the caller should save the version number and after IO
4840 * is finished, this function should be called again to verify that layout
4841 * is not changed during IO time.
4843 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4845 struct ll_inode_info *lli = ll_i2info(inode);
4846 struct ll_sb_info *sbi = ll_i2sbi(inode);
4847 struct lustre_handle lockh;
4848 struct layout_intent intent = {
4849 .li_opc = LAYOUT_INTENT_ACCESS,
4851 enum ldlm_mode mode;
/* fast path: a known layout generation means the cached layout is valid */
4855 *gen = ll_layout_version_get(lli);
4856 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
4860 LASSERT(fid_is_sane(ll_inode2fid(inode)));
4861 LASSERT(S_ISREG(inode->i_mode));
4863 /* take layout lock mutex to enqueue layout lock exclusively. */
4864 mutex_lock(&lli->lli_layout_mutex);
4867 /* mostly layout lock is caching on the local side, so try to
4868 * match it before grabbing layout lock mutex. */
4869 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4870 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4871 if (mode != 0) { /* hit cached lock */
4872 rc = ll_layout_lock_set(&lockh, mode, inode);
/* no cached lock: enqueue a fresh LAYOUT lock via intent RPC */
4878 rc = ll_layout_intent(inode, &intent);
4884 *gen = ll_layout_version_get(lli);
4885 mutex_unlock(&lli->lli_layout_mutex);
4891 * Issue layout intent RPC indicating where in a file an IO is about to write.
4893 * \param[in] inode file inode.
4894 * \param[in] start start offset of file in bytes where an IO is about to
4896 * \param[in] end exclusive end offset in bytes of the write range.
4898 * \retval 0 on success
4899 * \retval < 0 error code
4901 int ll_layout_write_intent(struct inode *inode, __u64 start, __u64 end)
4903 struct layout_intent intent = {
4904 .li_opc = LAYOUT_INTENT_WRITE,
4905 .li_extent.e_start = start,
4906 .li_extent.e_end = end,
4911 rc = ll_layout_intent(inode, &intent);
4917 * This function send a restore request to the MDT
4919 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4921 struct hsm_user_request *hur;
4925 len = sizeof(struct hsm_user_request) +
4926 sizeof(struct hsm_user_item);
4927 OBD_ALLOC(hur, len);
4931 hur->hur_request.hr_action = HUA_RESTORE;
4932 hur->hur_request.hr_archive_id = 0;
4933 hur->hur_request.hr_flags = 0;
4934 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4935 sizeof(hur->hur_user_item[0].hui_fid));
4936 hur->hur_user_item[0].hui_extent.offset = offset;
4937 hur->hur_user_item[0].hui_extent.length = length;
4938 hur->hur_request.hr_itemcount = 1;
4939 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,