4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
49 #include <uapi/linux/lustre/lustre_ioctl.h>
50 #include <lustre_swab.h>
52 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
/* NOTE(review): this excerpt is missing lines; the fragments below appear to
 * be the tail of a struct definition (sp_inode) and forward declarations of
 * ll_put_grouplock() / ll_lease_close() — confirm against the full source. */
57 struct inode *sp_inode;
62 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
64 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
/* Allocate per-open file state from the dedicated slab cache.
 * GFP_NOFS avoids recursing back into the filesystem under memory pressure.
 * (NULL check / return statement elided from this excerpt.) */
67 static struct ll_file_data *ll_file_data_get(void)
69 struct ll_file_data *fd;
71 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
75 fd->fd_write_failed = false;
/* Return per-open file state to its slab cache. */
80 static void ll_file_data_put(struct ll_file_data *fd)
83 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
87 * Packs all the attributes into @op_data for the CLOSE rpc.
/* Snapshot the inode's current attributes (mode, times, size, blocks, flags)
 * into @op_data and record the open handle being closed, so the MDT can
 * update the file's metadata at close time. */
89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
90 struct obd_client_handle *och)
94 ll_prep_md_op_data(op_data, inode, NULL, NULL,
95 0, 0, LUSTRE_OPC_ANY, NULL);
97 op_data->op_attr.ia_mode = inode->i_mode;
98 op_data->op_attr.ia_atime = inode->i_atime;
99 op_data->op_attr.ia_mtime = inode->i_mtime;
100 op_data->op_attr.ia_ctime = inode->i_ctime;
101 op_data->op_attr.ia_size = i_size_read(inode);
/* Mark every copied attribute valid so the server applies all of them. */
102 op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
103 ATTR_MTIME | ATTR_MTIME_SET |
104 ATTR_CTIME | ATTR_CTIME_SET;
105 op_data->op_attr_blocks = inode->i_blocks;
106 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
107 op_data->op_handle = och->och_fh;
/* Only write opens can have dirtied data; the flag is test-and-clear so the
 * HSM dirty notification is sent at most once per modification. */
109 if (och->och_flags & FMODE_WRITE &&
110 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
111 /* For HSM: if inode data has been modified, pack it so that
112 * MDT can set data dirty flag in the archive. */
113 op_data->op_bias |= MDS_DATA_MODIFIED;
119 * Perform a close, possibly with a bias.
120 * The meaning of "data" depends on the value of "bias".
122 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
123 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
/* Send the MDS_CLOSE RPC for @och on @inode, optionally carrying a
 * close "intent" (@bias): HSM release, layout merge/split/swap, or resync
 * done.  @data is interpreted per-bias as documented above.
 * NOTE(review): several lines (switch head, GOTO targets, return) are
 * elided from this excerpt — confirm control flow against full source. */
126 static int ll_close_inode_openhandle(struct inode *inode,
127 struct obd_client_handle *och,
128 enum mds_op_bias bias, void *data)
130 struct obd_export *md_exp = ll_i2mdexp(inode);
131 const struct ll_inode_info *lli = ll_i2info(inode);
132 struct md_op_data *op_data;
133 struct ptlrpc_request *req = NULL;
/* Bail out early if the MDC export is already disconnected. */
137 if (class_exp2obd(md_exp) == NULL) {
138 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
139 ll_get_fsname(inode->i_sb, NULL, 0),
140 PFID(&lli->lli_fid));
144 OBD_ALLOC_PTR(op_data);
145 /* We leak openhandle and request here on error, but not much to be
146 * done in OOM case since app won't retry close on error either. */
148 GOTO(out, rc = -ENOMEM);
150 ll_prepare_close(inode, op_data, och);
/* Per-bias packing of the close intent (apparent switch on @bias). */
152 case MDS_CLOSE_LAYOUT_MERGE:
153 /* merge blocks from the victim inode */
154 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
155 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* NOTE(review): MERGE appears to fall through into SPLIT/SWAP packing. */
156 case MDS_CLOSE_LAYOUT_SPLIT:
157 case MDS_CLOSE_LAYOUT_SWAP: {
158 struct split_param *sp = data;
160 LASSERT(data != NULL);
161 op_data->op_bias |= bias;
162 op_data->op_data_version = 0;
163 op_data->op_lease_handle = och->och_lease_handle;
164 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
165 op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
166 op_data->op_mirror_id = sp->sp_mirror_id;
/* SWAP/MERGE: @data is the victim inode itself (else branch elided). */
168 op_data->op_fid2 = *ll_inode2fid(data);
173 case MDS_CLOSE_RESYNC_DONE: {
174 struct ll_ioc_lease *ioc = data;
176 LASSERT(data != NULL);
177 op_data->op_attr_blocks +=
178 ioc->lil_count * op_data->op_attr_blocks;
179 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
180 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
182 op_data->op_lease_handle = och->och_lease_handle;
183 op_data->op_data = &ioc->lil_ids[0];
184 op_data->op_data_size =
185 ioc->lil_count * sizeof(ioc->lil_ids[0]);
189 case MDS_HSM_RELEASE:
190 LASSERT(data != NULL);
191 op_data->op_bias |= MDS_HSM_RELEASE;
/* @data is the expected data version; MDT refuses release on mismatch. */
192 op_data->op_data_version = *(__u64 *)data;
193 op_data->op_lease_handle = och->och_lease_handle;
194 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* default case: a plain close must not carry intent data. */
198 LASSERT(data == NULL);
202 rc = md_close(md_exp, op_data, och->och_mod, &req);
203 if (rc != 0 && rc != -EINTR)
204 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
205 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* If an intent was requested, verify the server actually executed it. */
207 if (rc == 0 && op_data->op_bias & bias) {
208 struct mdt_body *body;
210 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
211 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
215 ll_finish_md_op_data(op_data);
/* Drop replay data and poison the handle so reuse is detectable. */
219 md_clear_open_replay_data(md_exp, och);
220 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
223 ptlrpc_req_finished(req); /* This is close request */
/* Really close the MDS open handle for the given open mode (read, write,
 * or exec), but only when no other local users still hold it.  Selects the
 * per-mode handle pointer and use count under lli_och_mutex. */
227 int ll_md_real_close(struct inode *inode, fmode_t fmode)
229 struct ll_inode_info *lli = ll_i2info(inode);
230 struct obd_client_handle **och_p;
231 struct obd_client_handle *och;
236 if (fmode & FMODE_WRITE) {
237 och_p = &lli->lli_mds_write_och;
238 och_usecount = &lli->lli_open_fd_write_count;
239 } else if (fmode & FMODE_EXEC) {
240 och_p = &lli->lli_mds_exec_och;
241 och_usecount = &lli->lli_open_fd_exec_count;
/* else branch (read mode) — brace elided in this excerpt. */
243 LASSERT(fmode & FMODE_READ);
244 och_p = &lli->lli_mds_read_och;
245 och_usecount = &lli->lli_open_fd_read_count;
248 mutex_lock(&lli->lli_och_mutex);
249 if (*och_usecount > 0) {
250 /* There are still users of this handle, so skip
252 mutex_unlock(&lli->lli_och_mutex);
258 mutex_unlock(&lli->lli_och_mutex);
261 /* There might be a race and this handle may already
/* Plain close (bias 0, no intent data). */
263 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Per-file-descriptor close: release group lock and any lease held on this
 * fd, drop the per-mode open count, and only talk to the MDS (via
 * ll_md_real_close()) when we do not hold a matching OPEN DLM lock that
 * lets us skip the RPC.  Finally frees the ll_file_data. */
269 static int ll_md_close(struct inode *inode, struct file *file)
271 union ldlm_policy_data policy = {
272 .l_inodebits = { MDS_INODELOCK_OPEN },
/* TEST_LOCK: match only, do not take a new reference on the lock. */
274 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
275 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
276 struct ll_inode_info *lli = ll_i2info(inode);
277 struct lustre_handle lockh;
278 enum ldlm_mode lockmode;
282 /* clear group lock, if present */
283 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
284 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
286 if (fd->fd_lease_och != NULL) {
289 /* Usually the lease is not released when the
290 * application crashed, we need to release here. */
291 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
292 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
293 PFID(&lli->lli_fid), rc, lease_broken);
295 fd->fd_lease_och = NULL;
/* fd_och is set when a lease transferred ownership of the MDS handle. */
298 if (fd->fd_och != NULL) {
299 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
304 /* Let's see if we have good enough OPEN lock on the file and if
305 we can skip talking to MDS */
306 mutex_lock(&lli->lli_och_mutex);
307 if (fd->fd_omode & FMODE_WRITE) {
309 LASSERT(lli->lli_open_fd_write_count);
310 lli->lli_open_fd_write_count--;
311 } else if (fd->fd_omode & FMODE_EXEC) {
313 LASSERT(lli->lli_open_fd_exec_count);
314 lli->lli_open_fd_exec_count--;
/* else branch (read mode) — brace elided in this excerpt. */
317 LASSERT(lli->lli_open_fd_read_count);
318 lli->lli_open_fd_read_count--;
320 mutex_unlock(&lli->lli_och_mutex);
/* No matching OPEN lock cached -> must do the real MDS close now. */
322 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
323 LDLM_IBITS, &policy, lockmode, &lockh))
324 rc = ll_md_real_close(inode, fd->fd_omode);
327 LUSTRE_FPRIVATE(file) = NULL;
328 ll_file_data_put(fd);
333 /* While this returns an error code, fput() the caller does not, so we need
334 * to make every effort to clean up all of our state here. Also, applications
335 * rarely check close errors and even if an error is returned they will not
336 * re-try the close call.
/* VFS ->release() entry point: deauthorize statahead if this fd owned it,
 * short-circuit for the root dentry (no MDS close needed), fold async I/O
 * errors into the inode, then perform the MD close. */
338 int ll_file_release(struct inode *inode, struct file *file)
340 struct ll_file_data *fd;
341 struct ll_sb_info *sbi = ll_i2sbi(inode);
342 struct ll_inode_info *lli = ll_i2info(inode);
346 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
347 PFID(ll_inode2fid(inode)), inode);
/* Do not count releases of the root dentry in the stats. */
349 if (inode->i_sb->s_root != file_dentry(file))
350 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
351 fd = LUSTRE_FPRIVATE(file);
354 /* The last ref on @file, maybe not the the owner pid of statahead,
355 * because parent and child process can share the same file handle. */
356 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
357 ll_deauthorize_statahead(inode, fd);
/* Root dentry: just drop the private data; no open handle to close. */
359 if (inode->i_sb->s_root == file_dentry(file)) {
360 LUSTRE_FPRIVATE(file) = NULL;
361 ll_file_data_put(fd);
/* Regular files: surface any asynchronous write error at close time. */
365 if (!S_ISDIR(inode->i_mode)) {
366 if (lli->lli_clob != NULL)
367 lov_read_and_clear_async_rc(lli->lli_clob);
368 lli->lli_async_rc = 0;
371 rc = ll_md_close(inode, file);
/* Fault-injection hook to dump debug logs during close testing. */
373 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
374 libcfs_debug_dumplog();
/* Issue an intent-based open (IT_OPEN) RPC to the MDT for @de, packing the
 * name only when the server lacks open-by-fid support.  On success, fills
 * the inode from the reply and installs the returned lock data.  ENOENT on
 * a create is translated to -ESTALE so the VFS retries via lookup. */
379 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
380 struct lookup_intent *itp)
382 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
383 struct dentry *parent = de->d_parent;
384 const char *name = NULL;
386 struct md_op_data *op_data;
387 struct ptlrpc_request *req = NULL;
391 LASSERT(parent != NULL);
392 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
394 /* if server supports open-by-fid, or file name is invalid, don't pack
395 * name in open request */
396 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
397 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
398 name = de->d_name.name;
399 len = de->d_name.len;
402 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
403 name, len, 0, LUSTRE_OPC_ANY, NULL);
405 RETURN(PTR_ERR(op_data));
/* Hand any locally-known striping to the server with the open. */
406 op_data->op_data = lmm;
407 op_data->op_data_size = lmmsize;
409 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
410 &ll_md_blocking_ast, 0);
411 ll_finish_md_op_data(op_data);
413 /* reason for keep own exit path - don`t flood log
414 * with messages with -ESTALE errors.
/* On -ESTALE-ish failure, drop any openhandle the server did grant. */
416 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
417 it_open_error(DISP_OPEN_OPEN, itp))
419 ll_release_openhandle(de, itp);
423 if (it_disposition(itp, DISP_LOOKUP_NEG))
424 GOTO(out, rc = -ENOENT);
426 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
427 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
428 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
432 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
433 if (!rc && itp->it_lock_mode)
434 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
/* out: label — cleanup path (label itself elided in this excerpt). */
437 ptlrpc_req_finished(req);
438 ll_intent_drop_lock(itp);
440 /* We did open by fid, but by the time we got to the server,
441 * the object disappeared. If this is a create, we cannot really
442 * tell the userspace that the file it was trying to create
443 * does not exist. Instead let's return -ESTALE, and the VFS will
444 * retry the create with LOOKUP_REVAL that we are going to catch
445 * in ll_revalidate_dentry() and use lookup then.
447 if (rc == -ENOENT && itp->it_op & IT_CREAT)
/* Populate an obd_client_handle from the MDT reply carried in @it (open
 * handle, fid, lease lock cookie, flags) and register it for replay. */
453 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
454 struct obd_client_handle *och)
456 struct mdt_body *body;
458 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
459 och->och_fh = body->mbo_handle;
460 och->och_fid = body->mbo_fid1;
461 och->och_lease_handle.cookie = it->it_lock_handle;
462 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
463 och->och_flags = it->it_flags;
465 return md_set_open_replay_data(md_exp, och, it);
/* Finish the local (client-side) part of an open: attach @fd as the file's
 * private data, init readahead state and the cl_context bookkeeping.
 * @och may be NULL when an existing MDS handle is being reused. */
468 static int ll_local_open(struct file *file, struct lookup_intent *it,
469 struct ll_file_data *fd, struct obd_client_handle *och)
471 struct inode *inode = file_inode(file);
474 LASSERT(!LUSTRE_FPRIVATE(file));
481 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
486 LUSTRE_FPRIVATE(file) = fd;
487 ll_readahead_init(inode, &fd->fd_ras);
488 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
490 /* ll_cl_context initialize */
491 rwlock_init(&fd->fd_lock);
492 INIT_LIST_HEAD(&fd->fd_lccs);
497 /* Open a file, and (for the very first open) create objects on the OSTs at
498 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
499 * creation or open until ll_lov_setstripe() ioctl is called.
501 * If we already have the stripe MD locally then we don't request it in
502 * md_open(), by passing a lmm_size = 0.
504 * It is up to the application to ensure no other processes open this file
505 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
506 * used. We might be able to avoid races of that sort by getting lli_open_sem
507 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
508 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/* VFS ->open() entry point.  Reuses an existing per-mode MDS open handle
 * when one is cached on the inode; otherwise performs an intent open RPC.
 * NOTE(review): many lines (braces, GOTO targets, success path) are elided
 * from this excerpt — confirm control flow against full source. */
510 int ll_file_open(struct inode *inode, struct file *file)
512 struct ll_inode_info *lli = ll_i2info(inode);
513 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
514 .it_flags = file->f_flags };
515 struct obd_client_handle **och_p = NULL;
516 __u64 *och_usecount = NULL;
517 struct ll_file_data *fd;
521 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
522 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* An intent may have been stashed by the lookup path (atomic_open). */
524 it = file->private_data; /* XXX: compat macro */
525 file->private_data = NULL; /* prevent ll_local_open assertion */
527 fd = ll_file_data_get();
529 GOTO(out_openerr, rc = -ENOMEM);
532 if (S_ISDIR(inode->i_mode))
533 ll_authorize_statahead(inode, fd);
/* Root dentry: no MDS open handle needed, just install private data. */
535 if (inode->i_sb->s_root == file_dentry(file)) {
536 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup -> build our own open intent (oit). */
540 if (!it || !it->it_disposition) {
541 /* Convert f_flags into access mode. We cannot use file->f_mode,
542 * because everything but O_ACCMODE mask was stripped from
/* O_ACCMODE trick: (flags + 1) maps O_RDONLY/O_WRONLY/O_RDWR onto
 * FMODE_READ/FMODE_WRITE bits. */
544 if ((oit.it_flags + 1) & O_ACCMODE)
546 if (file->f_flags & O_TRUNC)
547 oit.it_flags |= FMODE_WRITE;
549 /* kernel only call f_op->open in dentry_open. filp_open calls
550 * dentry_open after call to open_namei that checks permissions.
551 * Only nfsd_open call dentry_open directly without checking
552 * permissions and because of that this code below is safe. */
553 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
554 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
556 /* We do not want O_EXCL here, presumably we opened the file
557 * already? XXX - NFS implications? */
558 oit.it_flags &= ~O_EXCL;
560 /* bug20584, if "it_flags" contains O_CREAT, the file will be
561 * created if necessary, then "IT_CREAT" should be set to keep
562 * consistent with it */
563 if (oit.it_flags & O_CREAT)
564 oit.it_op |= IT_CREAT;
570 /* Let's see if we have file open on MDS already. */
571 if (it->it_flags & FMODE_WRITE) {
572 och_p = &lli->lli_mds_write_och;
573 och_usecount = &lli->lli_open_fd_write_count;
574 } else if (it->it_flags & FMODE_EXEC) {
575 och_p = &lli->lli_mds_exec_och;
576 och_usecount = &lli->lli_open_fd_exec_count;
/* else branch (read mode) — brace elided in this excerpt. */
578 och_p = &lli->lli_mds_read_och;
579 och_usecount = &lli->lli_open_fd_read_count;
582 mutex_lock(&lli->lli_och_mutex);
583 if (*och_p) { /* Open handle is present */
584 if (it_disposition(it, DISP_OPEN_OPEN)) {
585 /* Well, there's extra open request that we do not need,
586 let's close it somehow. This will decref request. */
587 rc = it_open_error(DISP_OPEN_OPEN, it);
589 mutex_unlock(&lli->lli_och_mutex);
590 GOTO(out_openerr, rc);
593 ll_release_openhandle(file_dentry(file), it);
/* Reuse cached handle: NULL och means ll_local_open skips ll_och_fill. */
597 rc = ll_local_open(file, it, fd, NULL);
600 mutex_unlock(&lli->lli_och_mutex);
601 GOTO(out_openerr, rc);
/* No cached handle: must obtain one from the MDS. */
604 LASSERT(*och_usecount == 0);
605 if (!it->it_disposition) {
606 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
607 /* We cannot just request lock handle now, new ELC code
608 means that one of other OPEN locks for this file
609 could be cancelled, and since blocking ast handler
610 would attempt to grab och_mutex as well, that would
611 result in a deadlock */
612 mutex_unlock(&lli->lli_och_mutex);
614 * Normally called under two situations:
616 * 2. A race/condition on MDS resulting in no open
617 * handle to be returned from LOOKUP|OPEN request,
618 * for example if the target entry was a symlink.
620 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
621 * marked by a bit set in ll_iget_for_nfs. Clear the
622 * bit so that it's not confusing later callers.
624 * NB; when ldd is NULL, it must have come via normal
625 * lookup path only, since ll_iget_for_nfs always calls
628 if (ldd && ldd->lld_nfs_dentry) {
629 ldd->lld_nfs_dentry = 0;
630 it->it_flags |= MDS_OPEN_LOCK;
634 * Always specify MDS_OPEN_BY_FID because we don't want
635 * to get file with different fid.
637 it->it_flags |= MDS_OPEN_BY_FID;
638 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
641 GOTO(out_openerr, rc);
645 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
647 GOTO(out_och_free, rc = -ENOMEM);
651 /* md_intent_lock() didn't get a request ref if there was an
652 * open error, so don't do cleanup on the request here
654 /* XXX (green): Should not we bail out on any error here, not
655 * just open error? */
656 rc = it_open_error(DISP_OPEN_OPEN, it);
658 GOTO(out_och_free, rc);
660 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
661 "inode %p: disposition %x, status %d\n", inode,
662 it_disposition(it, ~0), it->it_status);
664 rc = ll_local_open(file, it, fd, *och_p);
666 GOTO(out_och_free, rc);
668 mutex_unlock(&lli->lli_och_mutex);
671 /* Must do this outside lli_och_mutex lock to prevent deadlock where
672 different kind of OPEN lock for this same inode gets cancelled
673 by ldlm_cancel_lru */
674 if (!S_ISREG(inode->i_mode))
675 GOTO(out_och_free, rc);
677 cl_lov_delay_create_clear(&file->f_flags);
678 GOTO(out_och_free, rc);
/* out_och_free: error/exit path — frees the unused handle. */
682 if (och_p && *och_p) {
683 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
684 *och_p = NULL; /* OBD_FREE writes some magic there */
687 mutex_unlock(&lli->lli_och_mutex);
/* out_openerr: undo statahead authorization and drop file data. */
690 if (lli->lli_opendir_key == fd)
691 ll_deauthorize_statahead(inode, fd);
693 ll_file_data_put(fd);
695 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* Drop the extra request reference taken by the open enqueue. */
698 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
699 ptlrpc_req_finished(it->it_request);
700 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* Blocking AST for lease locks: on a conflicting request, cancel the lease
 * lock asynchronously (which is what "breaks" the lease); the CANCELING
 * case body is elided from this excerpt. */
706 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
707 struct ldlm_lock_desc *desc, void *data, int flag)
710 struct lustre_handle lockh;
714 case LDLM_CB_BLOCKING:
715 ldlm_lock2handle(lock, &lockh);
716 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
718 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
722 case LDLM_CB_CANCELING:
730 * When setting a lease on a file, we take ownership of the lli_mds_*_och
731 * and save it as fd->fd_och so as to force client to reopen the file even
732 * if it has an open lock in cache already.
/* Transfer the inode's per-mode MDS open handle to this fd (fd_och) so a
 * lease can be granted, returning the old open handle cookie in
 * @old_handle.  Fails with -EBUSY if a lease already exists on this fd or
 * other local opens still share the handle. */
734 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
735 struct lustre_handle *old_handle)
737 struct ll_inode_info *lli = ll_i2info(inode);
738 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
739 struct obd_client_handle **och_p;
744 /* Get the openhandle of the file */
745 mutex_lock(&lli->lli_och_mutex);
746 if (fd->fd_lease_och != NULL)
747 GOTO(out_unlock, rc = -EBUSY);
749 if (fd->fd_och == NULL) {
750 if (file->f_mode & FMODE_WRITE) {
751 LASSERT(lli->lli_mds_write_och != NULL);
752 och_p = &lli->lli_mds_write_och;
753 och_usecount = &lli->lli_open_fd_write_count;
/* else branch (read mode) — brace elided in this excerpt. */
755 LASSERT(lli->lli_mds_read_och != NULL);
756 och_p = &lli->lli_mds_read_och;
757 och_usecount = &lli->lli_open_fd_read_count;
/* More than one local user -> cannot take exclusive ownership. */
760 if (*och_usecount > 1)
761 GOTO(out_unlock, rc = -EBUSY);
768 *old_handle = fd->fd_och->och_fh;
772 mutex_unlock(&lli->lli_och_mutex);
777 * Release ownership on lli_mds_*_och when putting back a file lease.
/* Inverse of ll_lease_och_acquire(): give fd_och back to the inode, or —
 * if another open re-created the per-mode handle meanwhile (broken lease
 * race) — close our now-redundant handle instead. */
779 static int ll_lease_och_release(struct inode *inode, struct file *file)
781 struct ll_inode_info *lli = ll_i2info(inode);
782 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
783 struct obd_client_handle **och_p;
784 struct obd_client_handle *old_och = NULL;
789 mutex_lock(&lli->lli_och_mutex);
790 if (file->f_mode & FMODE_WRITE) {
791 och_p = &lli->lli_mds_write_och;
792 och_usecount = &lli->lli_open_fd_write_count;
/* else branch (read mode) — brace elided in this excerpt. */
794 och_p = &lli->lli_mds_read_och;
795 och_usecount = &lli->lli_open_fd_read_count;
798 /* The file may have been open by another process (broken lease) so
799 * *och_p is not NULL. In this case we should simply increase usecount
802 if (*och_p != NULL) {
803 old_och = fd->fd_och;
810 mutex_unlock(&lli->lli_och_mutex);
/* Close the redundant handle outside the mutex. */
813 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
819 * Acquire a lease and open the file.
/* Take a read or write lease on @file by issuing an IT_OPEN intent with
 * MDS_OPEN_LEASE.  On success returns an obd_client_handle that owns the
 * lease lock; on failure the openhandle/lock are cleaned up (out_close /
 * out_release_it paths).  fmode must be exactly FMODE_READ or FMODE_WRITE. */
821 static struct obd_client_handle *
822 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
825 struct lookup_intent it = { .it_op = IT_OPEN };
826 struct ll_sb_info *sbi = ll_i2sbi(inode);
827 struct md_op_data *op_data;
828 struct ptlrpc_request *req = NULL;
829 struct lustre_handle old_handle = { 0 };
830 struct obd_client_handle *och = NULL;
835 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
836 RETURN(ERR_PTR(-EINVAL));
/* Lease mode must be covered by the file's open mode; exec opens refuse. */
839 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
840 RETURN(ERR_PTR(-EPERM));
842 rc = ll_lease_och_acquire(inode, file, &old_handle);
849 RETURN(ERR_PTR(-ENOMEM));
851 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
852 LUSTRE_OPC_ANY, NULL);
854 GOTO(out, rc = PTR_ERR(op_data));
856 /* To tell the MDT this openhandle is from the same owner */
857 op_data->op_handle = old_handle;
859 it.it_flags = fmode | open_flags;
860 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
861 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
862 &ll_md_blocking_lease_ast,
863 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
864 * it can be cancelled which may mislead applications that the lease is
866 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
867 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
868 * doesn't deal with openhandle, so normal openhandle will be leaked. */
869 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
870 ll_finish_md_op_data(op_data);
871 ptlrpc_req_finished(req);
873 GOTO(out_release_it, rc);
875 if (it_disposition(&it, DISP_LOOKUP_NEG))
876 GOTO(out_release_it, rc = -ENOENT);
878 rc = it_open_error(DISP_OPEN_OPEN, &it);
880 GOTO(out_release_it, rc);
882 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
883 ll_och_fill(sbi->ll_md_exp, &it, och);
885 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
886 GOTO(out_close, rc = -EOPNOTSUPP);
888 /* already get lease, handle lease lock */
889 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
/* A lease must come with an OPEN ibits lock; anything else is a
 * protocol violation from the server. */
890 if (it.it_lock_mode == 0 ||
891 it.it_lock_bits != MDS_INODELOCK_OPEN) {
892 /* open lock must return for lease */
893 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
894 PFID(ll_inode2fid(inode)), it.it_lock_mode,
896 GOTO(out_close, rc = -EPROTO);
899 ll_intent_release(&it);
/* out_close: error path — cancel the lease lock and close the handle. */
903 /* Cancel open lock */
904 if (it.it_lock_mode != 0) {
905 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
908 och->och_lease_handle.cookie = 0ULL;
910 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
912 CERROR("%s: error closing file "DFID": %d\n",
913 ll_get_fsname(inode->i_sb, NULL, 0),
914 PFID(&ll_i2info(inode)->lli_fid), rc2);
915 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
917 ll_intent_release(&it);
925 * Check whether a layout swap can be done between two inodes.
927 * \param[in] inode1 First inode to check
928 * \param[in] inode2 Second inode to check
930 * \retval 0 on success, layout swap can be performed between both inodes
931 * \retval negative error code if requirements are not met
/* Both inodes must be regular files, writable by the caller, and on the
 * same filesystem. */
933 static int ll_check_swap_layouts_validity(struct inode *inode1,
934 struct inode *inode2)
936 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
939 if (inode_permission(inode1, MAY_WRITE) ||
940 inode_permission(inode2, MAY_WRITE))
943 if (inode1->i_sb != inode2->i_sb)
/* Close @och with a MDS_CLOSE_LAYOUT_SWAP bias so the MDT atomically swaps
 * the layouts of @inode and @inode2 as part of the close. */
949 static int ll_swap_layouts_close(struct obd_client_handle *och,
950 struct inode *inode, struct inode *inode2)
952 const struct lu_fid *fid1 = ll_inode2fid(inode);
953 const struct lu_fid *fid2;
957 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
958 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
960 rc = ll_check_swap_layouts_validity(inode, inode2);
962 GOTO(out_free_och, rc);
964 /* We now know that inode2 is a lustre inode */
965 fid2 = ll_inode2fid(inode2);
/* Swapping a layout with itself is meaningless — reject equal fids. */
967 rc = lu_fid_cmp(fid1, fid2);
969 GOTO(out_free_och, rc = -EINVAL);
971 /* Close the file and {swap,merge} layouts between inode & inode2.
972 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
973 * because we still need it to pack l_remote_handle to MDT. */
974 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
977 och = NULL; /* freed in ll_close_inode_openhandle() */
987 * Release lease and close the file.
988 * It will check if the lease has ever broken.
/* Determine whether the lease lock was already cancelled (lease broken);
 * if not, cancel it ourselves (unless a close intent @bias will consume
 * it), then close the openhandle with the requested bias/data. */
990 static int ll_lease_close_intent(struct obd_client_handle *och,
992 bool *lease_broken, enum mds_op_bias bias,
995 struct ldlm_lock *lock;
996 bool cancelled = true;
1000 lock = ldlm_handle2lock(&och->och_lease_handle);
1002 lock_res_and_lock(lock);
1003 cancelled = ldlm_is_cancel(lock);
1004 unlock_res_and_lock(lock);
1005 LDLM_LOCK_PUT(lock);
1008 CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1009 PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1011 if (lease_broken != NULL)
1012 *lease_broken = cancelled;
1014 if (!cancelled && !bias)
1015 ldlm_cli_cancel(&och->och_lease_handle, 0);
1017 if (cancelled) { /* no need to excute intent */
1022 rc = ll_close_inode_openhandle(inode, och, bias, data);
/* Convenience wrapper: plain lease close with no intent. */
1026 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1029 return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1033 * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
/* Flush and invalidate cached pages first (layout version will change),
 * then ask the MDT to start mirror resync under the held lease. */
1035 static int ll_lease_file_resync(struct obd_client_handle *och,
1036 struct inode *inode)
1038 struct ll_sb_info *sbi = ll_i2sbi(inode);
1039 struct md_op_data *op_data;
1040 __u64 data_version_unused;
1044 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1045 LUSTRE_OPC_ANY, NULL);
1046 if (IS_ERR(op_data))
1047 RETURN(PTR_ERR(op_data));
1049 /* before starting file resync, it's necessary to clean up page cache
1050 * in client memory, otherwise once the layout version is increased,
1051 * writing back cached data will be denied the OSTs. */
1052 rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1056 op_data->op_handle = och->och_lease_handle;
1057 rc = md_file_resync(sbi->ll_md_exp, op_data);
1063 ll_finish_md_op_data(op_data);
/* Merge MDS-provided timestamps cached in lli with OST-side attributes
 * (size, blocks, times) obtained via cl_object_attr_get(), keeping the
 * newest of each timestamp, under the inode size lock. */
1067 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1069 struct ll_inode_info *lli = ll_i2info(inode);
1070 struct cl_object *obj = lli->lli_clob;
1071 struct cl_attr *attr = vvp_env_thread_attr(env);
1079 ll_inode_size_lock(inode);
1081 /* Merge timestamps the most recently obtained from MDS with
1082 * timestamps obtained from OSTs.
1084 * Do not overwrite atime of inode because it may be refreshed
1085 * by file_accessed() function. If the read was served by cache
1086 * data, there is no RPC to be sent so that atime may not be
1087 * transferred to OSTs at all. MDT only updates atime at close time
1088 * if it's at least 'mdd.*.atime_diff' older.
1089 * All in all, the atime in Lustre does not strictly comply with
1090 * POSIX. Solving this problem needs to send an RPC to MDT for each
1091 * read, this will hurt performance. */
1092 if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1093 LTIME_S(inode->i_atime) = lli->lli_atime;
1094 lli->lli_update_atime = 0;
1096 LTIME_S(inode->i_mtime) = lli->lli_mtime;
1097 LTIME_S(inode->i_ctime) = lli->lli_ctime;
/* Working copies; compared below against the OST-side attributes. */
1099 atime = LTIME_S(inode->i_atime);
1100 mtime = LTIME_S(inode->i_mtime);
1101 ctime = LTIME_S(inode->i_ctime);
1103 cl_object_attr_lock(obj);
1104 if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1107 rc = cl_object_attr_get(env, obj, attr);
1108 cl_object_attr_unlock(obj);
/* -ENODATA (no layout/objects yet) is not an error for this merge. */
1111 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1113 if (atime < attr->cat_atime)
1114 atime = attr->cat_atime;
1116 if (ctime < attr->cat_ctime)
1117 ctime = attr->cat_ctime;
1119 if (mtime < attr->cat_mtime)
1120 mtime = attr->cat_mtime;
1122 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1123 PFID(&lli->lli_fid), attr->cat_size);
1125 i_size_write(inode, attr->cat_size);
1126 inode->i_blocks = attr->cat_blocks;
1128 LTIME_S(inode->i_atime) = atime;
1129 LTIME_S(inode->i_mtime) = mtime;
1130 LTIME_S(inode->i_ctime) = ctime;
1133 ll_inode_size_unlock(inode);
1139 * Set designated mirror for I/O.
1141 * So far only read, write, and truncated can support to issue I/O to
1142 * designated mirror.
/* FLR: copy the fd's designated-mirror selection and layout version into
 * the cl_io; designated-mirror I/O disables parallel I/O (ci_pio). */
1144 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1146 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1148 /* clear layout version for generic(non-resync) I/O in case it carries
1149 * stale layout version due to I/O restart */
1150 io->ci_layout_version = 0;
1152 /* FLR: disable non-delay for designated mirror I/O because obviously
1153 * only one mirror is available */
1154 if (fd->fd_designated_mirror > 0) {
1156 io->ci_designated_mirror = fd->fd_designated_mirror;
1157 io->ci_layout_version = fd->fd_layout_version;
1158 io->ci_pio = 0; /* doesn't have a mechanism to pass mirror
1162 CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n",
1163 file->f_path.dentry->d_name.name, io->ci_designated_mirror);
/* Decide whether atime updates are suppressed for this file, checking
 * O_NOATIME, inode flags, and mount/superblock noatime options in turn. */
1166 static bool file_is_noatime(const struct file *file)
1168 const struct vfsmount *mnt = file->f_path.mnt;
1169 const struct inode *inode = file_inode((struct file *)file);
1171 /* Adapted from file_accessed() and touch_atime().*/
1172 if (file->f_flags & O_NOATIME)
1175 if (inode->i_flags & S_NOATIME)
1178 if (IS_NOATIME(inode))
1181 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1184 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1187 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1193 static int ll_file_io_ptask(struct cfs_ptask *ptask);
/* Initialize a cl_io for a read or write on @file: kiocb, nonblock/append/
 * sync flags, lock policy (never for nolock files, mandatory for O_APPEND),
 * noatime, parallel-I/O eligibility, and FLR mirror selection. */
1195 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1197 struct inode *inode = file_inode(file);
1198 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1200 memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
1201 init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
1202 io->u.ci_rw.rw_file = file;
1203 io->u.ci_rw.rw_ptask = ll_file_io_ptask;
1204 io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
1205 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1207 if (iot == CIT_WRITE) {
1208 io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
1209 io->u.ci_rw.rw_sync = !!(file->f_flags & O_SYNC ||
1210 file->f_flags & O_DIRECT ||
1213 io->ci_obj = ll_i2info(inode)->lli_clob;
1214 io->ci_lockreq = CILR_MAYBE;
1215 if (ll_file_nolock(file)) {
1216 io->ci_lockreq = CILR_NEVER;
1217 io->ci_no_srvlock = 1;
1218 } else if (file->f_flags & O_APPEND) {
1219 io->ci_lockreq = CILR_MANDATORY;
1221 io->ci_noatime = file_is_noatime(file);
/* Parallel I/O only when enabled on the SB and not an append write. */
1222 if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
1223 io->ci_pio = !io->u.ci_rw.rw_append;
1227 /* FLR: only use non-delay I/O for read as there is only one
1228 * avaliable mirror for write. */
1229 io->ci_ndelay = !(iot == CIT_WRITE);
1231 ll_io_set_mirror(io, file);
/* Parallel-task worker for a split read/write: runs one chunk of the
 * parent IO described by @ptask->pt_cbdata (struct cl_io_pt) inside its
 * own cl_io, then folds progress back into the cl_io_pt so a restart can
 * resume where this chunk stopped.
 * Returns 0 if any bytes were transferred, otherwise the cl_io_loop() rc.
 * NOTE(review): source is elided in this view; some declarations, braces
 * and error paths are not shown. */
1234 static int ll_file_io_ptask(struct cfs_ptask *ptask)
1236 struct cl_io_pt *pt = ptask->pt_cbdata;
1237 struct file *file = pt->cip_file;
1240 loff_t pos = pt->cip_pos;
1245 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1246 file_dentry(file)->d_name.name,
1247 pt->cip_iot == CIT_READ ? "read" : "write",
1248 pos, pos + pt->cip_count);
1250 env = cl_env_get(&refcheck);
1252 RETURN(PTR_ERR(env));
1254 io = vvp_env_thread_io(env);
1255 ll_io_init(io, file, pt->cip_iot);
/* inherit the parent IO's iterator and iocb state */
1256 io->u.ci_rw.rw_iter = pt->cip_iter;
1257 io->u.ci_rw.rw_iocb = pt->cip_iocb;
1258 io->ci_pio = 0; /* It's already in parallel task */
1260 rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
1261 pt->cip_count - pt->cip_result);
1263 struct vvp_io *vio = vvp_env_io(env);
1265 vio->vui_io_subtype = IO_NORMAL;
1266 vio->vui_fd = LUSTRE_FPRIVATE(file);
1268 ll_cl_add(file, env, io, LCC_RW);
1269 rc = cl_io_loop(env, io);
1270 ll_cl_remove(file, env);
1272 /* cl_io_rw_init() handled IO */
1276 if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {
/* account transferred bytes and advance the saved iterator/iocb so a
 * restarted task resumes after the data already moved */
1282 if (io->ci_nob > 0) {
1283 pt->cip_result += io->ci_nob;
1284 iov_iter_advance(&pt->cip_iter, io->ci_nob);
1286 pt->cip_iocb.ki_pos = pos;
1287 #ifdef HAVE_KIOCB_KI_LEFT
1288 pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
1289 #elif defined(HAVE_KI_NBYTES)
1290 pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;
1294 cl_io_fini(env, io);
1295 cl_env_put(env, &refcheck);
1297 pt->cip_need_restart = io->ci_need_restart;
1299 CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
1300 file_dentry(file)->d_name.name,
1301 pt->cip_iot == CIT_READ ? "read" : "write",
1302 pt->cip_result, rc);
1304 RETURN(pt->cip_result > 0 ? 0 : rc);
/* Common driver for all llite read/write variants (iter, splice): builds a
 * cl_io from @args, takes the per-inode range lock where required, runs
 * cl_io_loop() and handles IO restart (e.g. layout change / FLR retry) by
 * looping with the advanced position and remaining count.
 * Returns bytes transferred if > 0, otherwise the cl_io rc.
 * NOTE(review): source is heavily elided in this view; the restart loop
 * head, GOTO labels and several braces are not shown. */
1308 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1309 struct file *file, enum cl_io_type iot,
1310 loff_t *ppos, size_t count)
1312 struct range_lock range;
1313 struct vvp_io *vio = vvp_env_io(env);
1314 struct inode *inode = file_inode(file);
1315 struct ll_inode_info *lli = ll_i2info(inode);
1316 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1321 unsigned retried = 0;
1322 bool restarted = false;
1326 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1327 file_dentry(file)->d_name.name,
1328 iot == CIT_READ ? "read" : "write", pos, pos + count);
1331 io = vvp_env_thread_io(env);
1332 ll_io_init(io, file, iot);
1333 if (args->via_io_subtype == IO_NORMAL) {
1334 io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
1335 io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
1336 if (args->via_io_subtype != IO_NORMAL || restarted)
1339 io->ci_ndelay_tried = retried;
1341 if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
1342 bool range_locked = false;
/* O_APPEND writes must lock to EOF; otherwise lock just this range */
1344 if (file->f_flags & O_APPEND)
1345 range_lock_init(&range, 0, LUSTRE_EOF);
1347 range_lock_init(&range, pos, pos + count - 1);
1349 vio->vui_fd = LUSTRE_FPRIVATE(file);
1350 vio->vui_io_subtype = args->via_io_subtype;
1352 switch (vio->vui_io_subtype) {
1354 /* Direct IO reads must also take range lock,
1355 * or multiple reads will try to work on the same pages
1356 * See LU-6227 for details. */
1357 if (((iot == CIT_WRITE) ||
1358 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1359 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1360 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1362 rc = range_lock(&lli->lli_write_tree, &range);
1366 range_locked = true;
1370 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1371 vio->u.splice.vui_flags = args->u.splice.via_flags;
1374 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1378 ll_cl_add(file, env, io, LCC_RW);
/* parallel writes take i_mutex here so sub-tasks need not */
1379 if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
1380 !lli->lli_inode_locked) {
1382 lli->lli_inode_locked = 1;
1384 rc = cl_io_loop(env, io);
1385 if (lli->lli_inode_locked) {
1386 lli->lli_inode_locked = 0;
1387 inode_unlock(inode);
1389 ll_cl_remove(file, env);
1392 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1394 range_unlock(&lli->lli_write_tree, &range);
1397 /* cl_io_rw_init() handled IO */
/* fold this pass's progress into result/count and sync the caller's
 * iterator and iocb so a restart continues from the right place */
1401 if (io->ci_nob > 0) {
1402 result += io->ci_nob;
1403 count -= io->ci_nob;
1405 if (args->via_io_subtype == IO_NORMAL) {
1406 iov_iter_advance(args->u.normal.via_iter, io->ci_nob);
1408 args->u.normal.via_iocb->ki_pos = pos;
1409 #ifdef HAVE_KIOCB_KI_LEFT
1410 args->u.normal.via_iocb->ki_left = count;
1411 #elif defined(HAVE_KI_NBYTES)
1412 args->u.normal.via_iocb->ki_nbytes = count;
1416 pos = io->u.ci_rw.rw_range.cir_pos;
1420 cl_io_fini(env, io);
1423 "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1424 file->f_path.dentry->d_name.name,
1425 iot, rc, result, io->ci_need_restart);
/* retry when the IO asked for a restart and there is work left */
1427 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1429 "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1430 file_dentry(file)->d_name.name,
1431 iot == CIT_READ ? "read" : "write",
1432 pos, pos + count, result, rc);
1433 /* preserve the tried count for FLR */
1434 retried = io->ci_ndelay_tried;
/* per-mount stats and sticky write-failure flag for fsync semantics */
1439 if (iot == CIT_READ) {
1441 ll_stats_ops_tally(ll_i2sbi(inode),
1442 LPROC_LL_READ_BYTES, result);
1443 } else if (iot == CIT_WRITE) {
1445 ll_stats_ops_tally(ll_i2sbi(inode),
1446 LPROC_LL_WRITE_BYTES, result);
1447 fd->fd_write_failed = false;
1448 } else if (result == 0 && rc == 0) {
1451 fd->fd_write_failed = true;
1453 fd->fd_write_failed = false;
1454 } else if (rc != -ERESTARTSYS) {
1455 fd->fd_write_failed = true;
1459 CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
1460 file_dentry(file)->d_name.name,
1461 iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);
1465 RETURN(result > 0 ? result : rc);
1469 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1470 * especially for small I/O.
1472 * To serve a read request, CLIO has to create and initialize a cl_io and
1473 * then request DLM lock. This has turned out to have significant overhead
1474 * and affects the performance of small I/O dramatically.
1476 * It's not necessary to create a cl_io for each I/O. Under the help of read
1477 * ahead, most of the pages being read are already in memory cache and we can
1478 * read those pages directly because if the pages exist, the corresponding DLM
1479 * lock must exist so that page content must be valid.
1481 * In fast read implementation, the llite speculatively finds and reads pages
1482 * in memory cache. There are three scenarios for fast read:
1483 * - If the page exists and is uptodate, kernel VM will provide the data and
1484 * CLIO won't be intervened;
1485 * - If the page was brought into memory by read ahead, it will be exported
1486 * and read ahead parameters will be updated;
1487 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1488 * it will go back and invoke normal read, i.e., a cl_io will be created
1489 * and DLM lock will be requested.
1491 * POSIX compliance: posix standard states that read is intended to be atomic.
1492 * Lustre read implementation is in line with Linux kernel read implementation
1493 * and neither of them complies with POSIX standard in this matter. Fast read
1494 * doesn't make the situation worse on single node but it may interleave write
1495 * results from multiple nodes due to short read handling in ll_file_aio_read().
1497 * \param env - lu_env
1498 * \param iocb - kiocb from kernel
1499 * \param iter - user space buffers where the data will be copied
1501 * \retval - number of bytes have been read, or error code if error occurred.
/* Speculative lockless read: serve the request straight from the page
 * cache via generic_file_read_iter(), skipping cl_io/DLM setup entirely.
 * Skipped when the fast_read mount option is off or for O_DIRECT; a
 * -ENODATA result (set by ll_readpage()) means "page not cached — caller
 * must fall back to the normal read path".
 * NOTE(review): source is elided in this view; the return-type line and
 * early returns are not shown. */
1504 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1508 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1511 /* NB: we can't do direct IO for fast read because it will need a lock
1512 * to make IO engine happy. */
1513 if (iocb->ki_filp->f_flags & O_DIRECT)
1516 result = generic_file_read_iter(iocb, iter);
1518 /* If the first page is not in cache, generic_file_aio_read() will be
1519 * returned with -ENODATA.
1520 * See corresponding code in ll_readpage(). */
1521 if (result == -ENODATA)
1525 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1526 LPROC_LL_READ_BYTES, result);
1532 * Read from a file (through the page cache).
/* read_iter entry point: try the lockless fast-read path first, then run
 * a full cl_io read (ll_file_io_generic) for whatever bytes remain in @to.
 * NOTE(review): source is elided in this view; some declarations, the
 * result-combining logic and the final return are not shown. */
1534 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1537 struct vvp_io_args *args;
1542 result = ll_do_fast_read(iocb, to);
1543 if (result < 0 || iov_iter_count(to) == 0)
1546 env = cl_env_get(&refcheck);
1548 return PTR_ERR(env);
1550 args = ll_env_args(env, IO_NORMAL);
1551 args->u.normal.via_iter = to;
1552 args->u.normal.via_iocb = iocb;
1554 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1555 &iocb->ki_pos, iov_iter_count(to));
1558 else if (result == 0)
1561 cl_env_put(env, &refcheck);
1567 * Write to a file (through the page cache).
/* write_iter entry point: set up IO_NORMAL vvp args and run the generic
 * cl_io write loop.
 * NOTE(review): source is elided in this view. */
1569 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1571 struct vvp_io_args *args;
1576 env = cl_env_get(&refcheck);
1578 return PTR_ERR(env);
1580 args = ll_env_args(env, IO_NORMAL);
1581 args->u.normal.via_iter = from;
1582 args->u.normal.via_iocb = iocb;
1584 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1585 &iocb->ki_pos, iov_iter_count(from));
1586 cl_env_put(env, &refcheck);
1590 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1592 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/* Validate an iovec array for the legacy aio paths: reject negative or
 * cumulatively-overflowing lengths, and truncate *nr_segs/*count at the
 * first segment that fails access_ok().
 * NOTE(review): source is elided in this view; the continue/break and
 * return statements between these checks are not shown. */
1594 static int ll_file_get_iov_count(const struct iovec *iov,
1595 unsigned long *nr_segs, size_t *count)
1600 for (seg = 0; seg < *nr_segs; seg++) {
1601 const struct iovec *iv = &iov[seg];
1604 * If any segment has a negative length, or the cumulative
1605 * length ever wraps negative then return -EINVAL.
1608 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1610 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1615 cnt -= iv->iov_len; /* This segment is no good */
/* Legacy aio_read entry for kernels without ->read_iter: validate the
 * iovec array, build an iov_iter over it (API differs per kernel version)
 * and forward to ll_file_read_iter().
 * NOTE(review): source is elided in this view. */
1622 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1623 unsigned long nr_segs, loff_t pos)
1630 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1634 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1635 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1636 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1637 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1638 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1640 result = ll_file_read_iter(iocb, &to);
/* Legacy read(2) entry for kernels without ->read_iter: wrap the user
 * buffer in a single iovec, build a synchronous kiocb at *ppos, forward
 * to ll_file_aio_read() and copy the final position back to *ppos. */
1645 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1648 struct iovec iov = { .iov_base = buf, .iov_len = count };
1653 init_sync_kiocb(&kiocb, file);
1654 kiocb.ki_pos = *ppos;
1655 #ifdef HAVE_KIOCB_KI_LEFT
1656 kiocb.ki_left = count;
1657 #elif defined(HAVE_KI_NBYTES)
/* fix: the field is ki_nbytes — "i_nbytes" does not exist in struct kiocb
 * and broke the build whenever HAVE_KI_NBYTES was defined (compare the
 * identical branch in ll_file_write, which uses kiocb->ki_nbytes) */
1658 kiocb.ki_nbytes = count;
1661 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1662 *ppos = kiocb.ki_pos;
1668 * Write to a file (through the page cache).
/* Legacy aio_write entry: validate the iovec array, build an iov_iter
 * (API differs per kernel version) and forward to ll_file_write_iter().
 * NOTE(review): source is elided in this view. */
1671 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1672 unsigned long nr_segs, loff_t pos)
1674 struct iov_iter from;
1679 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1683 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1684 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1685 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1686 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1687 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1689 result = ll_file_write_iter(iocb, &from);
/* Legacy write(2) entry for kernels without ->write_iter: unlike
 * ll_file_read this uses the per-env kiocb (lti_kiocb) rather than a
 * stack kiocb, then forwards to ll_file_aio_write() and propagates the
 * updated file position back to *ppos.
 * NOTE(review): source is elided in this view; some declarations and the
 * final return are not shown. */
1694 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1695 size_t count, loff_t *ppos)
1698 struct iovec iov = { .iov_base = (void __user *)buf,
1700 struct kiocb *kiocb;
1705 env = cl_env_get(&refcheck);
1707 RETURN(PTR_ERR(env));
1709 kiocb = &ll_env_info(env)->lti_kiocb;
1710 init_sync_kiocb(kiocb, file);
1711 kiocb->ki_pos = *ppos;
1712 #ifdef HAVE_KIOCB_KI_LEFT
1713 kiocb->ki_left = count;
1714 #elif defined(HAVE_KI_NBYTES)
1715 kiocb->ki_nbytes = count;
1718 result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos);
1719 *ppos = kiocb->ki_pos;
1721 cl_env_put(env, &refcheck);
1724 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1727 * Send file content (through pagecache) somewhere with helper
/* splice_read entry: run a CIT_READ cl_io with the IO_SPLICE subtype so
 * pages are handed straight to @pipe instead of a user iterator.
 * NOTE(review): source is elided in this view. */
1729 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1730 struct pipe_inode_info *pipe, size_t count,
1734 struct vvp_io_args *args;
1739 env = cl_env_get(&refcheck);
1741 RETURN(PTR_ERR(env));
1743 args = ll_env_args(env, IO_SPLICE);
1744 args->u.splice.via_pipe = pipe;
1745 args->u.splice.via_flags = flags;
1747 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1748 cl_env_put(env, &refcheck);
1752 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1753 __u64 flags, struct lov_user_md *lum, int lum_size)
1755 struct lookup_intent oit = {
1757 .it_flags = flags | MDS_OPEN_BY_FID,
1762 ll_inode_size_lock(inode);
1763 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1765 GOTO(out_unlock, rc);
1767 ll_release_openhandle(dentry, &oit);
1770 ll_inode_size_unlock(inode);
1771 ll_intent_release(&oit);
1776 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1777 struct lov_mds_md **lmmp, int *lmm_size,
1778 struct ptlrpc_request **request)
1780 struct ll_sb_info *sbi = ll_i2sbi(inode);
1781 struct mdt_body *body;
1782 struct lov_mds_md *lmm = NULL;
1783 struct ptlrpc_request *req = NULL;
1784 struct md_op_data *op_data;
1787 rc = ll_get_default_mdsize(sbi, &lmmsize);
1791 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1792 strlen(filename), lmmsize,
1793 LUSTRE_OPC_ANY, NULL);
1794 if (IS_ERR(op_data))
1795 RETURN(PTR_ERR(op_data));
1797 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1798 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1799 ll_finish_md_op_data(op_data);
1801 CDEBUG(D_INFO, "md_getattr_name failed "
1802 "on %s: rc %d\n", filename, rc);
1806 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1807 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1809 lmmsize = body->mbo_eadatasize;
1811 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1813 GOTO(out, rc = -ENODATA);
1816 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1817 LASSERT(lmm != NULL);
1819 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
1820 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
1821 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
1822 GOTO(out, rc = -EPROTO);
1825 * This is coming from the MDS, so is probably in
1826 * little endian. We convert it to host endian before
1827 * passing it to userspace.
1829 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1832 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
1833 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1834 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1835 if (le32_to_cpu(lmm->lmm_pattern) &
1836 LOV_PATTERN_F_RELEASED)
1840 /* if function called for directory - we should
1841 * avoid swab not existent lsm objects */
1842 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1843 lustre_swab_lov_user_md_v1(
1844 (struct lov_user_md_v1 *)lmm);
1845 if (S_ISREG(body->mbo_mode))
1846 lustre_swab_lov_user_md_objects(
1847 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1849 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1850 lustre_swab_lov_user_md_v3(
1851 (struct lov_user_md_v3 *)lmm);
1852 if (S_ISREG(body->mbo_mode))
1853 lustre_swab_lov_user_md_objects(
1854 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1856 } else if (lmm->lmm_magic ==
1857 cpu_to_le32(LOV_MAGIC_COMP_V1)) {
1858 lustre_swab_lov_comp_md_v1(
1859 (struct lov_comp_md_v1 *)lmm);
1865 *lmm_size = lmmsize;
1870 static int ll_lov_setea(struct inode *inode, struct file *file,
1873 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1874 struct lov_user_md *lump;
1875 int lum_size = sizeof(struct lov_user_md) +
1876 sizeof(struct lov_user_ost_data);
1880 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1883 OBD_ALLOC_LARGE(lump, lum_size);
1887 if (copy_from_user(lump, arg, lum_size))
1888 GOTO(out_lump, rc = -EFAULT);
1890 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
1892 cl_lov_delay_create_clear(&file->f_flags);
1895 OBD_FREE_LARGE(lump, lum_size);
1899 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
1906 env = cl_env_get(&refcheck);
1908 RETURN(PTR_ERR(env));
1910 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
1911 cl_env_put(env, &refcheck);
1915 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1918 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1919 struct lov_user_md *klum;
1921 __u64 flags = FMODE_WRITE;
1924 rc = ll_copy_user_md(lum, &klum);
1929 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
1934 rc = put_user(0, &lum->lmm_stripe_count);
1938 rc = ll_layout_refresh(inode, &gen);
1942 rc = ll_file_getstripe(inode, arg, lum_size);
1944 cl_lov_delay_create_clear(&file->f_flags);
1947 OBD_FREE(klum, lum_size);
1952 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1954 struct ll_inode_info *lli = ll_i2info(inode);
1955 struct cl_object *obj = lli->lli_clob;
1956 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1957 struct ll_grouplock grouplock;
1962 CWARN("group id for group lock must not be 0\n");
1966 if (ll_file_nolock(file))
1967 RETURN(-EOPNOTSUPP);
1969 spin_lock(&lli->lli_lock);
1970 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1971 CWARN("group lock already existed with gid %lu\n",
1972 fd->fd_grouplock.lg_gid);
1973 spin_unlock(&lli->lli_lock);
1976 LASSERT(fd->fd_grouplock.lg_lock == NULL);
1977 spin_unlock(&lli->lli_lock);
1980 * XXX: group lock needs to protect all OST objects while PFL
1981 * can add new OST objects during the IO, so we'd instantiate
1982 * all OST objects before getting its group lock.
1987 struct cl_layout cl = {
1988 .cl_is_composite = false,
1990 struct lu_extent ext = {
1992 .e_end = OBD_OBJECT_EOF,
1995 env = cl_env_get(&refcheck);
1997 RETURN(PTR_ERR(env));
1999 rc = cl_object_layout_get(env, obj, &cl);
2000 if (!rc && cl.cl_is_composite)
2001 rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2004 cl_env_put(env, &refcheck);
2009 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2010 arg, (file->f_flags & O_NONBLOCK), &grouplock);
2014 spin_lock(&lli->lli_lock);
2015 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2016 spin_unlock(&lli->lli_lock);
2017 CERROR("another thread just won the race\n");
2018 cl_put_grouplock(&grouplock);
2022 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2023 fd->fd_grouplock = grouplock;
2024 spin_unlock(&lli->lli_lock);
2026 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/* Release the group lock identified by @arg on @file, taken earlier by
 * ll_get_grouplock(). Warns and fails if no group lock is held or the gid
 * does not match. Per-fd state is detached under lli_lock; the cl-layer
 * lock itself is dropped only after the spinlock is released.
 * NOTE(review): source is elided in this view; the error returns after
 * the CWARNs are not shown. */
2030 static int ll_put_grouplock(struct inode *inode, struct file *file,
2033 struct ll_inode_info *lli = ll_i2info(inode);
2034 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2035 struct ll_grouplock grouplock;
2038 spin_lock(&lli->lli_lock);
2039 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2040 spin_unlock(&lli->lli_lock);
2041 CWARN("no group lock held\n");
2045 LASSERT(fd->fd_grouplock.lg_lock != NULL);
2047 if (fd->fd_grouplock.lg_gid != arg) {
2048 CWARN("group lock %lu doesn't match current id %lu\n",
2049 arg, fd->fd_grouplock.lg_gid);
2050 spin_unlock(&lli->lli_lock);
/* detach under the spinlock; drop the cl lock outside it */
2054 grouplock = fd->fd_grouplock;
2055 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2056 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2057 spin_unlock(&lli->lli_lock);
2059 cl_put_grouplock(&grouplock);
2060 CDEBUG(D_INFO, "group lock %lu released\n", arg);
2065 * Close inode open handle
2067 * \param dentry [in] dentry which contains the inode
2068 * \param it [in,out] intent which contains open info and result
2071 * \retval <0 failure
2073 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2075 struct inode *inode = dentry->d_inode;
2076 struct obd_client_handle *och;
2082 /* Root ? Do nothing. */
2083 if (dentry->d_inode->i_sb->s_root == dentry)
2086 /* No open handle to close? Move away */
2087 if (!it_disposition(it, DISP_OPEN_OPEN))
2090 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2092 OBD_ALLOC(och, sizeof(*och));
2094 GOTO(out, rc = -ENOMEM);
2096 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2098 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2100 /* this one is in place of ll_file_open */
2101 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2102 ptlrpc_req_finished(it->it_request);
2103 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2109 * Get size for inode for which FIEMAP mapping is requested.
2110 * Make the FIEMAP get_info call and returns the result.
2111 * \param fiemap kernel buffer to hold extents
2112 * \param num_bytes kernel buffer size
2114 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2120 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2123 /* Checks for fiemap flags */
2124 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2125 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2129 /* Check for FIEMAP_FLAG_SYNC */
2130 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2131 rc = filemap_fdatawrite(inode->i_mapping);
2136 env = cl_env_get(&refcheck);
2138 RETURN(PTR_ERR(env));
2140 if (i_size_read(inode) == 0) {
2141 rc = ll_glimpse_size(inode);
2146 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2147 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2148 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2150 /* If filesize is 0, then there would be no objects for mapping */
2151 if (fmkey.lfik_oa.o_size == 0) {
2152 fiemap->fm_mapped_extents = 0;
2156 fmkey.lfik_fiemap = *fiemap;
2158 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2159 &fmkey, fiemap, &num_bytes);
2161 cl_env_put(env, &refcheck);
2165 int ll_fid2path(struct inode *inode, void __user *arg)
2167 struct obd_export *exp = ll_i2mdexp(inode);
2168 const struct getinfo_fid2path __user *gfin = arg;
2170 struct getinfo_fid2path *gfout;
2176 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2177 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2180 /* Only need to get the buflen */
2181 if (get_user(pathlen, &gfin->gf_pathlen))
2184 if (pathlen > PATH_MAX)
2187 outsize = sizeof(*gfout) + pathlen;
2188 OBD_ALLOC(gfout, outsize);
2192 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2193 GOTO(gf_free, rc = -EFAULT);
2194 /* append root FID after gfout to let MDT know the root FID so that it
2195 * can lookup the correct path, this is mainly for fileset.
2196 * old server without fileset mount support will ignore this. */
2197 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2199 /* Call mdc_iocontrol */
2200 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2204 if (copy_to_user(arg, gfout, outsize))
2208 OBD_FREE(gfout, outsize);
2213 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2215 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2223 ioc->idv_version = 0;
2224 ioc->idv_layout_version = UINT_MAX;
2226 /* If no file object initialized, we consider its version is 0. */
2230 env = cl_env_get(&refcheck);
2232 RETURN(PTR_ERR(env));
2234 io = vvp_env_thread_io(env);
2236 io->u.ci_data_version.dv_data_version = 0;
2237 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2238 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2241 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2242 result = cl_io_loop(env, io);
2244 result = io->ci_result;
2246 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2247 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2249 cl_io_fini(env, io);
2251 if (unlikely(io->ci_need_restart))
2254 cl_env_put(env, &refcheck);
2260 * Read the data_version for inode.
2262 * This value is computed using stripe object version on OST.
2263 * Version is computed using server side locking.
2265 * @param flags if do sync on the OST side;
2267 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2268 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
2270 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2272 struct ioc_data_version ioc = { .idv_flags = flags };
2275 rc = ll_ioc_data_version(inode, &ioc);
2277 *data_version = ioc.idv_version;
2283 * Trigger a HSM release request for the provided inode.
2285 int ll_hsm_release(struct inode *inode)
2288 struct obd_client_handle *och = NULL;
2289 __u64 data_version = 0;
2294 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2295 ll_get_fsname(inode->i_sb, NULL, 0),
2296 PFID(&ll_i2info(inode)->lli_fid));
2298 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2300 GOTO(out, rc = PTR_ERR(och));
2302 /* Grab latest data_version and [am]time values */
2303 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2307 env = cl_env_get(&refcheck);
2309 GOTO(out, rc = PTR_ERR(env));
2311 rc = ll_merge_attr(env, inode);
2312 cl_env_put(env, &refcheck);
2314 /* If error happen, we have the wrong size for a file.
2320 /* Release the file.
2321 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2322 * we still need it to pack l_remote_handle to MDT. */
2323 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2329 if (och != NULL && !IS_ERR(och)) /* close the file */
2330 ll_lease_close(och, inode, NULL);
2335 struct ll_swap_stack {
2338 struct inode *inode1;
2339 struct inode *inode2;
2344 static int ll_swap_layouts(struct file *file1, struct file *file2,
2345 struct lustre_swap_layouts *lsl)
2347 struct mdc_swap_layouts msl;
2348 struct md_op_data *op_data;
2351 struct ll_swap_stack *llss = NULL;
2354 OBD_ALLOC_PTR(llss);
2358 llss->inode1 = file_inode(file1);
2359 llss->inode2 = file_inode(file2);
2361 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2365 /* we use 2 bool because it is easier to swap than 2 bits */
2366 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2367 llss->check_dv1 = true;
2369 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2370 llss->check_dv2 = true;
2372 /* we cannot use lsl->sl_dvX directly because we may swap them */
2373 llss->dv1 = lsl->sl_dv1;
2374 llss->dv2 = lsl->sl_dv2;
2376 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2377 if (rc == 0) /* same file, done! */
2380 if (rc < 0) { /* sequentialize it */
2381 swap(llss->inode1, llss->inode2);
2383 swap(llss->dv1, llss->dv2);
2384 swap(llss->check_dv1, llss->check_dv2);
2388 if (gid != 0) { /* application asks to flush dirty cache */
2389 rc = ll_get_grouplock(llss->inode1, file1, gid);
2393 rc = ll_get_grouplock(llss->inode2, file2, gid);
2395 ll_put_grouplock(llss->inode1, file1, gid);
2400 /* ultimate check, before swapping the layouts we check if
2401 * dataversion has changed (if requested) */
2402 if (llss->check_dv1) {
2403 rc = ll_data_version(llss->inode1, &dv, 0);
2406 if (dv != llss->dv1)
2407 GOTO(putgl, rc = -EAGAIN);
2410 if (llss->check_dv2) {
2411 rc = ll_data_version(llss->inode2, &dv, 0);
2414 if (dv != llss->dv2)
2415 GOTO(putgl, rc = -EAGAIN);
2418 /* struct md_op_data is used to send the swap args to the mdt
2419 * only flags is missing, so we use struct mdc_swap_layouts
2420 * through the md_op_data->op_data */
2421 /* flags from user space have to be converted before they are send to
2422 * server, no flag is sent today, they are only used on the client */
2425 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2426 0, LUSTRE_OPC_ANY, &msl);
2427 if (IS_ERR(op_data))
2428 GOTO(free, rc = PTR_ERR(op_data));
2430 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2431 sizeof(*op_data), op_data, NULL);
2432 ll_finish_md_op_data(op_data);
2439 ll_put_grouplock(llss->inode2, file2, gid);
2440 ll_put_grouplock(llss->inode1, file1, gid);
2450 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2452 struct md_op_data *op_data;
2456 /* Detect out-of range masks */
2457 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2460 /* Non-root users are forbidden to set or clear flags which are
2461 * NOT defined in HSM_USER_MASK. */
2462 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2463 !cfs_capable(CFS_CAP_SYS_ADMIN))
2466 /* Detect out-of range archive id */
2467 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2468 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2471 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2472 LUSTRE_OPC_ANY, hss);
2473 if (IS_ERR(op_data))
2474 RETURN(PTR_ERR(op_data));
2476 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2477 sizeof(*op_data), op_data, NULL);
2479 ll_finish_md_op_data(op_data);
2484 static int ll_hsm_import(struct inode *inode, struct file *file,
2485 struct hsm_user_import *hui)
2487 struct hsm_state_set *hss = NULL;
2488 struct iattr *attr = NULL;
2492 if (!S_ISREG(inode->i_mode))
2498 GOTO(out, rc = -ENOMEM);
2500 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2501 hss->hss_archive_id = hui->hui_archive_id;
2502 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2503 rc = ll_hsm_state_set(inode, hss);
2507 OBD_ALLOC_PTR(attr);
2509 GOTO(out, rc = -ENOMEM);
2511 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2512 attr->ia_mode |= S_IFREG;
2513 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2514 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2515 attr->ia_size = hui->hui_size;
2516 attr->ia_mtime.tv_sec = hui->hui_mtime;
2517 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2518 attr->ia_atime.tv_sec = hui->hui_atime;
2519 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2521 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2522 ATTR_UID | ATTR_GID |
2523 ATTR_MTIME | ATTR_MTIME_SET |
2524 ATTR_ATIME | ATTR_ATIME_SET;
2528 rc = ll_setattr_raw(file_dentry(file), attr, true);
2532 inode_unlock(inode);
/* Translate a kernel fmode_t into the Lustre lease lock-type bitmask:
 * FMODE_READ -> LL_LEASE_RDLCK, FMODE_WRITE -> LL_LEASE_WRLCK. */
2544 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2546 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2547 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2550 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2552 struct inode *inode = file_inode(file);
2554 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2555 ATTR_MTIME | ATTR_MTIME_SET |
2556 ATTR_CTIME | ATTR_CTIME_SET,
2558 .tv_sec = lfu->lfu_atime_sec,
2559 .tv_nsec = lfu->lfu_atime_nsec,
2562 .tv_sec = lfu->lfu_mtime_sec,
2563 .tv_nsec = lfu->lfu_mtime_nsec,
2566 .tv_sec = lfu->lfu_ctime_sec,
2567 .tv_nsec = lfu->lfu_ctime_nsec,
2573 if (!capable(CAP_SYS_ADMIN))
2576 if (!S_ISREG(inode->i_mode))
2580 rc = ll_setattr_raw(file_dentry(file), &ia, false);
2581 inode_unlock(inode);
2586 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2589 case MODE_READ_USER:
2591 case MODE_WRITE_USER:
2598 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2600 /* Used to allow the upper layers of the client to request an LDLM lock
2601 * without doing an actual read or write.
2603 * Used for ladvise lockahead to manually request specific locks.
2605 * \param[in] file file this ladvise lock request is on
2606 * \param[in] ladvise ladvise struct describing this lock request
2608 * \retval 0 success, no detailed result available (sync requests
2609 * and requests sent to the server [not handled locally]
2610 * cannot return detailed results)
2611 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2612 * see definitions for details.
2613 * \retval negative negative errno on error
2615 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2617 struct lu_env *env = NULL;
2618 struct cl_io *io = NULL;
2619 struct cl_lock *lock = NULL;
2620 struct cl_lock_descr *descr = NULL;
2621 struct dentry *dentry = file->f_path.dentry;
2622 struct inode *inode = dentry->d_inode;
2623 enum cl_lock_mode cl_mode;
2624 off_t start = ladvise->lla_start;
2625 off_t end = ladvise->lla_end;
2631 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2632 "start=%llu, end=%llu\n", dentry->d_name.len,
2633 dentry->d_name.name, dentry->d_inode,
2634 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
/* Convert the user-supplied mode; on failure the (negative) value is
 * propagated out through 'result'. */
2637 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2639 GOTO(out, result = cl_mode);
2641 /* Get IO environment */
2642 result = cl_io_get(inode, &env, &io, &refcheck);
2646 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2649 * nothing to do for this io. This currently happens when
2650 * stripe sub-object's are not yet created.
2652 result = io->ci_result;
2653 } else if (result == 0) {
/* Build the lock descriptor for a manual (no actual I/O) lock request. */
2654 lock = vvp_env_lock(env);
2655 descr = &lock->cll_descr;
2657 descr->cld_obj = io->ci_obj;
2658 /* Convert byte offsets to pages */
2659 descr->cld_start = cl_index(io->ci_obj, start);
2660 descr->cld_end = cl_index(io->ci_obj, end);
2661 descr->cld_mode = cl_mode;
2662 /* CEF_MUST is used because we do not want to convert a
2663 * lockahead request to a lockless lock */
2664 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
/* LF_ASYNC requests a speculative (glimpse-less, async) enqueue. */
2667 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2668 descr->cld_enq_flags |= CEF_SPECULATIVE;
2670 result = cl_lock_request(env, io, lock);
2672 /* On success, we need to release the lock */
2674 cl_lock_release(env, lock);
2676 cl_io_fini(env, io);
2677 cl_env_put(env, &refcheck);
2679 /* -ECANCELED indicates a matching lock with a different extent
2680 * was already present, and -EEXIST indicates a matching lock
2681 * on exactly the same extent was already present.
2682 * We convert them to positive values for userspace to make
2683 * recognizing true errors easier.
2684 * Note we can only return these detailed results on async requests,
2685 * as sync requests look the same as i/o requests for locking. */
2686 if (result == -ECANCELED)
2687 result = LLA_RESULT_DIFFERENT;
2688 else if (result == -EEXIST)
2689 result = LLA_RESULT_SAME;
/* Printable names for enum lu_ladvise_type; used in sanity-check debug
 * messages in ll_ladvise_sanity() below. */
2694 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
/*
 * Validate a single ladvise entry before it is acted on:
 *  - the advice value must be a known lu_ladvise_type;
 *  - per-advice flags must be within the mask allowed for that advice;
 *  - LOCKAHEAD additionally requires a valid lock mode;
 *  - the byte range must satisfy start < end.
 * On any violation an rc is set and a D_VFSTRACE message is logged.
 */
2696 static int ll_ladvise_sanity(struct inode *inode,
2697 struct llapi_lu_ladvise *ladvise)
2699 enum lu_ladvise_type advice = ladvise->lla_advice;
2700 /* Note the peradvice flags is a 32 bit field, so per advice flags must
2701 * be in the first 32 bits of enum ladvise_flags */
2702 __u32 flags = ladvise->lla_peradvice_flags;
2703 /* 3 lines at 80 characters per line, should be plenty */
2706 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2708 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2709 "last supported advice is %s (value '%d'): rc = %d\n",
2710 ll_get_fsname(inode->i_sb, NULL, 0), advice,
2711 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2715 /* Per-advice checks */
2717 case LU_LADVISE_LOCKNOEXPAND:
2718 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2720 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2722 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2723 ladvise_names[advice], rc);
2727 case LU_LADVISE_LOCKAHEAD:
2728 /* Currently only READ and WRITE modes can be requested */
2729 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2730 ladvise->lla_lockahead_mode == 0) {
2732 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2734 ll_get_fsname(inode->i_sb, NULL, 0),
2735 ladvise->lla_lockahead_mode,
2736 ladvise_names[advice], rc);
2739 case LU_LADVISE_WILLREAD:
2740 case LU_LADVISE_DONTNEED:
2742 /* Note fall through above - These checks apply to all advices
2743 * except LOCKNOEXPAND */
2744 if (flags & ~LF_DEFAULT_MASK) {
2746 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2748 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2749 ladvise_names[advice], rc);
2752 if (ladvise->lla_start >= ladvise->lla_end) {
2754 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2755 "for %s: rc = %d\n",
2756 ll_get_fsname(inode->i_sb, NULL, 0),
2757 ladvise->lla_start, ladvise->lla_end,
2758 ladvise_names[advice], rc);
2770 * Give file access advices
2772 * The ladvise interface is similar to Linux fadvise() system call, except it
2773 * forwards the advices directly from Lustre client to server. The server side
2774 * codes will apply appropriate read-ahead and caching techniques for the
2775 * corresponding files.
2777 * A typical workload for ladvise is e.g. a bunch of different clients are
2778 * doing small random reads of a file, so prefetching pages into OSS cache
2779 * with big linear reads before the random IO is a net benefit. Fetching
2780 * all that data into each client cache with fadvise() may not be, due to
2781 * much more data being sent to the client.
2783 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2784 struct llapi_lu_ladvise *ladvise)
2788 struct cl_ladvise_io *lio;
/* Acquire a cl environment; released via cl_env_put() below. */
2793 env = cl_env_get(&refcheck);
2795 RETURN(PTR_ERR(env));
2797 io = vvp_env_thread_io(env);
2798 io->ci_obj = ll_i2info(inode)->lli_clob;
2800 /* initialize parameters for ladvise */
2801 lio = &io->u.ci_ladvise;
2802 lio->li_start = ladvise->lla_start;
2803 lio->li_end = ladvise->lla_end;
2804 lio->li_fid = ll_inode2fid(inode);
2805 lio->li_advice = ladvise->lla_advice;
2806 lio->li_flags = flags;
/* Run the CIT_LADVISE io; the advice is forwarded to the servers by the
 * lower layers rather than applied locally. */
2808 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2809 rc = cl_io_loop(env, io);
2813 cl_io_fini(env, io);
2814 cl_env_put(env, &refcheck);
/*
 * Record the per-open-file "no lock expansion" setting: LF_UNSET in @flags
 * clears ll_lock_no_expand, any other value sets it.
 */
2818 static int ll_lock_noexpand(struct file *file, int flags)
2820 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2822 fd->ll_lock_no_expand = !(flags & LF_UNSET);
/*
 * FS_IOC_FSGETXATTR-style handler: fill a struct fsxattr with the inode's
 * extended flags and project ID and copy it back to userspace at @arg.
 * Returns -EFAULT on a failed user copy (error value lines not visible in
 * this excerpt).
 */
2827 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
2830 struct fsxattr fsxattr;
2832 if (copy_from_user(&fsxattr,
2833 (const struct fsxattr __user *)arg,
2837 fsxattr.fsx_xflags = ll_inode_to_ext_flags(inode->i_flags);
2838 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
2839 if (copy_to_user((struct fsxattr __user *)arg,
2840 &fsxattr, sizeof(fsxattr)))
/*
 * FS_IOC_FSSETXATTR-style handler: set the file's extended flags and
 * project ID.  The change is pushed to the MDS via md_setattr(), then the
 * flags are propagated to the OST objects via cl_setattr_ost() when the
 * file has a cl_object.  Root (CFS_CAP_SYS_ADMIN) only.
 */
2846 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
2850 struct md_op_data *op_data;
2851 struct ptlrpc_request *req = NULL;
2853 struct fsxattr fsxattr;
2854 struct cl_object *obj;
2856 /* only root could change project ID */
2857 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2860 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2861 LUSTRE_OPC_ANY, NULL);
2862 if (IS_ERR(op_data))
2863 RETURN(PTR_ERR(op_data));
2865 if (copy_from_user(&fsxattr,
2866 (const struct fsxattr __user *)arg,
2868 GOTO(out_fsxattr1, rc = -EFAULT);
2870 op_data->op_attr_flags = fsxattr.fsx_xflags;
2871 op_data->op_projid = fsxattr.fsx_projid;
2872 op_data->op_attr.ia_valid |= (MDS_ATTR_PROJID | ATTR_ATTR_FLAG);
2873 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
2875 ptlrpc_req_finished(req);
/* Mirror the new flags on the OST objects (data side), if any. */
2877 obj = ll_i2info(inode)->lli_clob;
2881 inode->i_flags = ll_ext_to_inode_flags(fsxattr.fsx_xflags);
2882 OBD_ALLOC_PTR(attr);
2884 GOTO(out_fsxattr1, rc = -ENOMEM);
2885 attr->ia_valid = ATTR_ATTR_FLAG;
2886 rc = cl_setattr_ost(obj, attr, fsxattr.fsx_xflags);
2891 ll_finish_md_op_data(op_data);
/*
 * Release a file lease (LL_LEASE_UNLCK path of the lease ioctl), optionally
 * carrying a close intent selected by ioc->lil_flags:
 *  - LL_LEASE_RESYNC_DONE: pass the user-supplied resync id array to the
 *    close (MDS_CLOSE_RESYNC_DONE);
 *  - LL_LEASE_LAYOUT_MERGE: merge layout with the victim file whose fd
 *    follows the ioc struct in userspace (MDS_CLOSE_LAYOUT_MERGE);
 *  - LL_LEASE_LAYOUT_SPLIT: split out a mirror, given victim fd and mirror
 *    id (MDS_CLOSE_LAYOUT_SPLIT);
 *  - otherwise a plain lease close with no intent.
 * Returns the lease type held, or a negative errno (-ENOLCK when this fd
 * holds no lease).
 */
2895 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
2898 struct inode *inode = file_inode(file);
2899 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2900 struct ll_inode_info *lli = ll_i2info(inode);
2901 struct obd_client_handle *och = NULL;
2902 struct split_param sp;
2905 enum mds_op_bias bias = 0;
2906 struct file *layout_file = NULL;
2908 size_t data_size = 0;
/* Detach the lease handle from the fd under lli_och_mutex. */
2912 mutex_lock(&lli->lli_och_mutex);
2913 if (fd->fd_lease_och != NULL) {
2914 och = fd->fd_lease_och;
2915 fd->fd_lease_och = NULL;
2917 mutex_unlock(&lli->lli_och_mutex);
2920 GOTO(out, rc = -ENOLCK);
2922 fmode = och->och_flags;
2924 switch (ioc->lil_flags) {
2925 case LL_LEASE_RESYNC_DONE:
2926 if (ioc->lil_count > IOC_IDS_MAX)
2927 GOTO(out, rc = -EINVAL);
2929 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
2930 OBD_ALLOC(data, data_size);
2932 GOTO(out, rc = -ENOMEM);
2934 if (copy_from_user(data, (void __user *)arg, data_size))
2935 GOTO(out, rc = -EFAULT);
2937 bias = MDS_CLOSE_RESYNC_DONE;
2939 case LL_LEASE_LAYOUT_MERGE: {
2942 if (ioc->lil_count != 1)
2943 GOTO(out, rc = -EINVAL);
/* The victim fd (a __u32) immediately follows the ioc struct. */
2945 arg += sizeof(*ioc);
2946 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
2947 GOTO(out, rc = -EFAULT);
2949 layout_file = fget(fd);
2951 GOTO(out, rc = -EBADF);
/* Both files must be open for write to allow a layout merge. */
2953 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
2954 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
2955 GOTO(out, rc = -EPERM);
2957 data = file_inode(layout_file);
2958 bias = MDS_CLOSE_LAYOUT_MERGE;
2961 case LL_LEASE_LAYOUT_SPLIT: {
2965 if (ioc->lil_count != 2)
2966 GOTO(out, rc = -EINVAL);
/* Userspace layout: ioc struct, then victim fd, then mirror id. */
2968 arg += sizeof(*ioc);
2969 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
2970 GOTO(out, rc = -EFAULT);
2972 arg += sizeof(__u32);
2973 if (copy_from_user(&mirror_id, (void __user *)arg,
2975 GOTO(out, rc = -EFAULT);
2977 layout_file = fget(fdv);
2979 GOTO(out, rc = -EBADF);
2981 sp.sp_inode = file_inode(layout_file);
2982 sp.sp_mirror_id = (__u16)mirror_id;
2984 bias = MDS_CLOSE_LAYOUT_SPLIT;
2988 /* without close intent */
2992 rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
2996 rc = ll_lease_och_release(inode, file);
/* Per-intent cleanup of resources acquired above. */
3005 switch (ioc->lil_flags) {
3006 case LL_LEASE_RESYNC_DONE:
3008 OBD_FREE(data, data_size);
3010 case LL_LEASE_LAYOUT_MERGE:
3011 case LL_LEASE_LAYOUT_SPLIT:
3018 rc = ll_lease_type_from_fmode(fmode);
/*
 * Acquire (or, for LL_LEASE_UNLCK, release) a file lease on behalf of the
 * LL_IOC_SET_LEASE* ioctls.  The requested mode must be compatible with the
 * file's open mode.  With LL_LEASE_RESYNC the open carries MDS_OPEN_RESYNC
 * and the mirror resync state is initialized before the lease handle is
 * stored in fd->fd_lease_och.
 */
3022 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3025 struct inode *inode = file_inode(file);
3026 struct ll_inode_info *lli = ll_i2info(inode);
3027 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3028 struct obd_client_handle *och = NULL;
3029 __u64 open_flags = 0;
3035 switch (ioc->lil_mode) {
3036 case LL_LEASE_WRLCK:
3037 if (!(file->f_mode & FMODE_WRITE))
3039 fmode = FMODE_WRITE;
3041 case LL_LEASE_RDLCK:
3042 if (!(file->f_mode & FMODE_READ))
3046 case LL_LEASE_UNLCK:
3047 RETURN(ll_file_unlock_lease(file, ioc, arg));
3052 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3054 /* apply for lease */
3055 if (ioc->lil_flags & LL_LEASE_RESYNC)
3056 open_flags = MDS_OPEN_RESYNC;
3057 och = ll_lease_open(inode, file, fmode, open_flags);
3059 RETURN(PTR_ERR(och));
3061 if (ioc->lil_flags & LL_LEASE_RESYNC) {
3062 rc = ll_lease_file_resync(och, inode);
/* On resync setup failure the just-opened lease is closed again. */
3064 ll_lease_close(och, inode, NULL);
3067 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3069 ll_lease_close(och, inode, NULL);
/* Publish the lease handle on this fd; only one lease per fd. */
3075 mutex_lock(&lli->lli_och_mutex);
3076 if (fd->fd_lease_och == NULL) {
3077 fd->fd_lease_och = och;
3080 mutex_unlock(&lli->lli_och_mutex);
3082 /* impossible now that only excl is supported for now */
3083 ll_lease_close(och, inode, &lease_broken);
/*
 * Main ioctl dispatcher for regular files on the llite client.  Handles the
 * LL_IOC_* / OBD_IOC_* / FSFILT_IOC_* command families (stripe get/set,
 * layout swap, group locks, FID/path translation, data version, HSM,
 * leases, ladvise, project xattrs, FLR mirror selection) and falls through
 * to obd_iocontrol() on the data export for anything unrecognized.
 */
3090 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3092 struct inode *inode = file_inode(file);
3093 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3097 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3098 PFID(ll_inode2fid(inode)), inode, cmd);
3099 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3101 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
3102 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3106 case LL_IOC_GETFLAGS:
3107 /* Get the current value of the file flags */
3108 return put_user(fd->fd_flags, (int __user *)arg);
3109 case LL_IOC_SETFLAGS:
3110 case LL_IOC_CLRFLAGS:
3111 /* Set or clear specific file flags */
3112 /* XXX This probably needs checks to ensure the flags are
3113 * not abused, and to handle any flag side effects.
3115 if (get_user(flags, (int __user *) arg))
3118 if (cmd == LL_IOC_SETFLAGS) {
3119 if ((flags & LL_FILE_IGNORE_LOCK) &&
3120 !(file->f_flags & O_DIRECT)) {
3121 CERROR("%s: unable to disable locking on "
3122 "non-O_DIRECT file\n", current->comm);
3126 fd->fd_flags |= flags;
3128 fd->fd_flags &= ~flags;
3131 case LL_IOC_LOV_SETSTRIPE:
3132 case LL_IOC_LOV_SETSTRIPE_NEW:
3133 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3134 case LL_IOC_LOV_SETEA:
3135 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3136 case LL_IOC_LOV_SWAP_LAYOUTS: {
3138 struct lustre_swap_layouts lsl;
3140 if (copy_from_user(&lsl, (char __user *)arg,
3141 sizeof(struct lustre_swap_layouts)))
/* Both files involved in the swap must be writable. */
3144 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3147 file2 = fget(lsl.sl_fd);
3151 /* O_WRONLY or O_RDWR */
3152 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3153 GOTO(out, rc = -EPERM);
3155 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3156 struct inode *inode2;
3157 struct ll_inode_info *lli;
3158 struct obd_client_handle *och = NULL;
/* SWAP_LAYOUTS_CLOSE requires holding a lease on this fd;
 * the swap is then performed as part of the lease close. */
3160 lli = ll_i2info(inode);
3161 mutex_lock(&lli->lli_och_mutex);
3162 if (fd->fd_lease_och != NULL) {
3163 och = fd->fd_lease_och;
3164 fd->fd_lease_och = NULL;
3166 mutex_unlock(&lli->lli_och_mutex);
3168 GOTO(out, rc = -ENOLCK);
3169 inode2 = file_inode(file2);
3170 rc = ll_swap_layouts_close(och, inode, inode2);
3172 rc = ll_swap_layouts(file, file2, &lsl);
3178 case LL_IOC_LOV_GETSTRIPE:
3179 case LL_IOC_LOV_GETSTRIPE_NEW:
3180 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3181 case FSFILT_IOC_GETFLAGS:
3182 case FSFILT_IOC_SETFLAGS:
3183 RETURN(ll_iocontrol(inode, file, cmd, arg));
3184 case FSFILT_IOC_GETVERSION_OLD:
3185 case FSFILT_IOC_GETVERSION:
3186 RETURN(put_user(inode->i_generation, (int __user *)arg));
3187 case LL_IOC_GROUP_LOCK:
3188 RETURN(ll_get_grouplock(inode, file, arg));
3189 case LL_IOC_GROUP_UNLOCK:
3190 RETURN(ll_put_grouplock(inode, file, arg));
3191 case IOC_OBD_STATFS:
3192 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3194 /* We need to special case any other ioctls we want to handle,
3195 * to send them to the MDS/OST as appropriate and to properly
3196 * network encode the arg field.
3197 case FSFILT_IOC_SETVERSION_OLD:
3198 case FSFILT_IOC_SETVERSION:
3200 case LL_IOC_FLUSHCTX:
3201 RETURN(ll_flush_ctx(inode));
3202 case LL_IOC_PATH2FID: {
3203 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3204 sizeof(struct lu_fid)))
3209 case LL_IOC_GETPARENT:
3210 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3212 case OBD_IOC_FID2PATH:
3213 RETURN(ll_fid2path(inode, (void __user *)arg));
3214 case LL_IOC_DATA_VERSION: {
3215 struct ioc_data_version idv;
3218 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
/* Mask out anything but the supported flush flags. */
3221 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3222 rc = ll_ioc_data_version(inode, &idv);
3225 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3231 case LL_IOC_GET_MDTIDX: {
3234 mdtidx = ll_get_mdt_idx(inode);
3238 if (put_user((int)mdtidx, (int __user *)arg))
3243 case OBD_IOC_GETDTNAME:
3244 case OBD_IOC_GETMDNAME:
3245 RETURN(ll_get_obd_name(inode, cmd, arg));
3246 case LL_IOC_HSM_STATE_GET: {
3247 struct md_op_data *op_data;
3248 struct hsm_user_state *hus;
3255 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3256 LUSTRE_OPC_ANY, hus);
3257 if (IS_ERR(op_data)) {
3259 RETURN(PTR_ERR(op_data));
3262 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3265 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3268 ll_finish_md_op_data(op_data);
3272 case LL_IOC_HSM_STATE_SET: {
3273 struct hsm_state_set *hss;
3280 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3285 rc = ll_hsm_state_set(inode, hss);
3290 case LL_IOC_HSM_ACTION: {
3291 struct md_op_data *op_data;
3292 struct hsm_current_action *hca;
3299 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3300 LUSTRE_OPC_ANY, hca);
3301 if (IS_ERR(op_data)) {
3303 RETURN(PTR_ERR(op_data));
3306 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3309 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3312 ll_finish_md_op_data(op_data);
3316 case LL_IOC_SET_LEASE_OLD: {
/* Legacy variant: the mode is passed directly in 'arg'. */
3317 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3319 RETURN(ll_file_set_lease(file, &ioc, 0));
3321 case LL_IOC_SET_LEASE: {
3322 struct ll_ioc_lease ioc;
3324 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3327 RETURN(ll_file_set_lease(file, &ioc, arg));
3329 case LL_IOC_GET_LEASE: {
3330 struct ll_inode_info *lli = ll_i2info(inode);
3331 struct ldlm_lock *lock = NULL;
3334 mutex_lock(&lli->lli_och_mutex);
3335 if (fd->fd_lease_och != NULL) {
3336 struct obd_client_handle *och = fd->fd_lease_och;
/* Report the lease only if its DLM lock is still valid. */
3338 lock = ldlm_handle2lock(&och->och_lease_handle);
3340 lock_res_and_lock(lock);
3341 if (!ldlm_is_cancel(lock))
3342 fmode = och->och_flags;
3344 unlock_res_and_lock(lock);
3345 LDLM_LOCK_PUT(lock);
3348 mutex_unlock(&lli->lli_och_mutex);
3350 RETURN(ll_lease_type_from_fmode(fmode));
3352 case LL_IOC_HSM_IMPORT: {
3353 struct hsm_user_import *hui;
3359 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3364 rc = ll_hsm_import(inode, file, hui);
3369 case LL_IOC_FUTIMES_3: {
3370 struct ll_futimes_3 lfu;
3372 if (copy_from_user(&lfu,
3373 (const struct ll_futimes_3 __user *)arg,
3377 RETURN(ll_file_futimes_3(file, &lfu));
3379 case LL_IOC_LADVISE: {
3380 struct llapi_ladvise_hdr *k_ladvise_hdr;
3381 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3384 int alloc_size = sizeof(*k_ladvise_hdr);
/* First read just the fixed header to learn lah_count, then
 * re-allocate for the full header + advice array and re-copy. */
3387 u_ladvise_hdr = (void __user *)arg;
3388 OBD_ALLOC_PTR(k_ladvise_hdr);
3389 if (k_ladvise_hdr == NULL)
3392 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3393 GOTO(out_ladvise, rc = -EFAULT);
3395 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3396 k_ladvise_hdr->lah_count < 1)
3397 GOTO(out_ladvise, rc = -EINVAL);
3399 num_advise = k_ladvise_hdr->lah_count;
3400 if (num_advise >= LAH_COUNT_MAX)
3401 GOTO(out_ladvise, rc = -EFBIG);
3403 OBD_FREE_PTR(k_ladvise_hdr);
3404 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3405 lah_advise[num_advise]);
3406 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3407 if (k_ladvise_hdr == NULL)
3411 * TODO: submit multiple advices to one server in a single RPC
3413 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3414 GOTO(out_ladvise, rc = -EFAULT);
3416 for (i = 0; i < num_advise; i++) {
3417 struct llapi_lu_ladvise *k_ladvise =
3418 &k_ladvise_hdr->lah_advise[i];
3419 struct llapi_lu_ladvise __user *u_ladvise =
3420 &u_ladvise_hdr->lah_advise[i];
3422 rc = ll_ladvise_sanity(inode, k_ladvise);
3424 GOTO(out_ladvise, rc);
3426 switch (k_ladvise->lla_advice) {
3427 case LU_LADVISE_LOCKNOEXPAND:
3428 rc = ll_lock_noexpand(file,
3429 k_ladvise->lla_peradvice_flags);
3430 GOTO(out_ladvise, rc);
3431 case LU_LADVISE_LOCKAHEAD:
3433 rc = ll_file_lock_ahead(file, k_ladvise);
3436 GOTO(out_ladvise, rc);
/* Write the per-advice result back into the user's array. */
3439 &u_ladvise->lla_lockahead_result))
3440 GOTO(out_ladvise, rc = -EFAULT);
3443 rc = ll_ladvise(inode, file,
3444 k_ladvise_hdr->lah_flags,
3447 GOTO(out_ladvise, rc);
3454 OBD_FREE(k_ladvise_hdr, alloc_size);
3457 case LL_IOC_FLR_SET_MIRROR: {
3458 /* mirror I/O must be direct to avoid polluting page cache
3460 if (!(file->f_flags & O_DIRECT))
3463 fd->fd_designated_mirror = (__u32)arg;
3466 case LL_IOC_FSGETXATTR:
3467 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3468 case LL_IOC_FSSETXATTR:
3469 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3471 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
/* Default: forward unknown commands to the data (OST) export. */
3473 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3474 (void __user *)arg));
3478 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Fallback helper (pre generic_file_llseek_size kernels): validate @offset
 * against sign and @maxsize, then commit it to file->f_pos, resetting
 * f_version so readdir-style users notice the reposition.
 */
3479 static inline loff_t
3480 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3482 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3484 if (offset > maxsize)
3487 if (offset != file->f_pos) {
3488 file->f_pos = offset;
3489 file->f_version = 0;
/*
 * Local copy of generic_file_llseek_size() for kernels lacking it:
 * implements SEEK_SET/CUR/END plus SEEK_DATA/SEEK_HOLE against the caller-
 * supplied @maxsize and @eof, funneling the final update through
 * llseek_execute().
 */
3495 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3496 loff_t maxsize, loff_t eof)
3498 struct inode *inode = file_inode(file);
3506 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3507 * position-querying operation. Avoid rewriting the "same"
3508 * f_pos value back to the file because a concurrent read(),
3509 * write() or lseek() might have altered it
3514 * f_lock protects against read/modify/write race with other
3515 * SEEK_CURs. Note that parallel writes and reads behave
3519 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3520 inode_unlock(inode);
3524 * In the generic case the entire file is data, so as long as
3525 * offset isn't at the end of the file then the offset is data.
3532 * There is a virtual hole at the end of the file, so as long as
3533 * offset isn't i_size or larger, return i_size.
3541 return llseek_execute(file, offset, maxsize);
/*
 * llseek entry point: for SEEK_END/SEEK_HOLE/SEEK_DATA the file size must
 * be current, so glimpse it from the OSTs first, then delegate to
 * ll_generic_file_llseek_size() bounded by ll_file_maxbytes().
 */
3545 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3547 struct inode *inode = file_inode(file);
3548 loff_t retval, eof = 0;
3551 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3552 (origin == SEEK_CUR) ? file->f_pos : 0);
3553 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3554 PFID(ll_inode2fid(inode)), inode, retval, retval,
3556 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3558 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3559 retval = ll_glimpse_size(inode);
3562 eof = i_size_read(inode);
3565 retval = ll_generic_file_llseek_size(file, offset, origin,
3566 ll_file_maxbytes(inode), eof);
/*
 * flush (close(2)-time) handler: surface any async writeback error that was
 * recorded for this inode (lli_async_rc and the per-OSC async rc's), unless
 * the failure was already reported to this fd (fd_write_failed).
 * Returns -EIO if an unreported error is pending, 0 otherwise.
 */
3570 static int ll_flush(struct file *file, fl_owner_t id)
3572 struct inode *inode = file_inode(file);
3573 struct ll_inode_info *lli = ll_i2info(inode);
3574 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3577 LASSERT(!S_ISDIR(inode->i_mode));
3579 /* catch async errors that were recorded back when async writeback
3580 * failed for pages in this mapping. */
3581 rc = lli->lli_async_rc;
3582 lli->lli_async_rc = 0;
3583 if (lli->lli_clob != NULL) {
3584 err = lov_read_and_clear_async_rc(lli->lli_clob);
3589 /* The application has been told write failure already.
3590 * Do not report failure again. */
3591 if (fd->fd_write_failed)
3593 return rc ? -EIO : 0;
3597 * Called to make sure a portion of file has been written out.
3598 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
3600 * Return how many pages have been written.
3602 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3603 enum cl_fsync_mode mode, int ignore_layout)
3607 struct cl_fsync_io *fio;
/* Only the four defined fsync modes are accepted. */
3612 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3613 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3616 env = cl_env_get(&refcheck);
3618 RETURN(PTR_ERR(env));
3620 io = vvp_env_thread_io(env);
3621 io->ci_obj = ll_i2info(inode)->lli_clob;
3622 io->ci_ignore_layout = ignore_layout;
3624 /* initialize parameters for sync */
3625 fio = &io->u.ci_fsync;
3626 fio->fi_start = start;
3628 fio->fi_fid = ll_inode2fid(inode);
3629 fio->fi_mode = mode;
3630 fio->fi_nr_written = 0;
3632 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3633 result = cl_io_loop(env, io);
3635 result = io->ci_result;
/* On success, report the number of pages written (see fi_nr_written). */
3637 result = fio->fi_nr_written;
3638 cl_io_fini(env, io);
3639 cl_env_put(env, &refcheck);
3645 * When dentry is provided (the 'else' case), file_dentry() may be
3646 * null and dentry must be used directly rather than pulled from
3647 * file_dentry() as is done otherwise.
/*
 * fsync(2)/fdatasync(2) entry point; the three signature variants below
 * match the kernel API generations (4-arg range fsync, 2-arg, and the old
 * dentry-based form).  Flow: wait for dirty pages, collect recorded async
 * writeback errors, fsync the MDS state via md_fsync(), then force an
 * OST-side CL_FSYNC_ALL for regular files, updating fd_write_failed.
 */
3650 #ifdef HAVE_FILE_FSYNC_4ARGS
3651 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3653 struct dentry *dentry = file_dentry(file);
3655 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3656 int ll_fsync(struct file *file, int datasync)
3658 struct dentry *dentry = file_dentry(file);
3660 loff_t end = LLONG_MAX;
3662 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3665 loff_t end = LLONG_MAX;
3667 struct inode *inode = dentry->d_inode;
3668 struct ll_inode_info *lli = ll_i2info(inode);
3669 struct ptlrpc_request *req;
3673 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3674 PFID(ll_inode2fid(inode)), inode);
3675 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3677 #ifdef HAVE_FILE_FSYNC_4ARGS
3678 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
/* Avoid re-taking the inode lock if the caller already holds it. */
3679 lock_inode = !lli->lli_inode_locked;
3683 /* fsync's caller has already called _fdata{sync,write}, we want
3684 * that IO to finish before calling the osc and mdc sync methods */
3685 rc = filemap_fdatawait(inode->i_mapping);
3688 /* catch async errors that were recorded back when async writeback
3689 * failed for pages in this mapping. */
3690 if (!S_ISDIR(inode->i_mode)) {
3691 err = lli->lli_async_rc;
3692 lli->lli_async_rc = 0;
3695 if (lli->lli_clob != NULL) {
3696 err = lov_read_and_clear_async_rc(lli->lli_clob);
3702 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3706 ptlrpc_req_finished(req);
3708 if (S_ISREG(inode->i_mode)) {
3709 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3711 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3712 if (rc == 0 && err < 0)
3715 fd->fd_write_failed = true;
3717 fd->fd_write_failed = false;
3720 #ifdef HAVE_FILE_FSYNC_4ARGS
3722 inode_unlock(inode);
/*
 * flock(2)/fcntl(2) byte-range and whole-file locking: translate the VFS
 * file_lock into an LDLM_FLOCK enqueue against the MDS, then mirror the
 * result into the local VFS lock tables (locks_lock_file_wait() or its
 * older split variants).  If the local step fails after a successful
 * enqueue, the server lock is dropped again with an LCK_NL enqueue.
 */
3728 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3730 struct inode *inode = file_inode(file);
3731 struct ll_sb_info *sbi = ll_i2sbi(inode);
3732 struct ldlm_enqueue_info einfo = {
3733 .ei_type = LDLM_FLOCK,
3734 .ei_cb_cp = ldlm_flock_completion_ast,
3735 .ei_cbdata = file_lock,
3737 struct md_op_data *op_data;
3738 struct lustre_handle lockh = { 0 };
3739 union ldlm_policy_data flock = { { 0 } };
3740 int fl_type = file_lock->fl_type;
3746 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3747 PFID(ll_inode2fid(inode)), file_lock);
3749 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3751 if (file_lock->fl_flags & FL_FLOCK) {
3752 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3753 /* flocks are whole-file locks */
3754 flock.l_flock.end = OFFSET_MAX;
3755 /* For flocks owner is determined by the local file desctiptor*/
3756 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3757 } else if (file_lock->fl_flags & FL_POSIX) {
3758 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3759 flock.l_flock.start = file_lock->fl_start;
3760 flock.l_flock.end = file_lock->fl_end;
3764 flock.l_flock.pid = file_lock->fl_pid;
3766 /* Somewhat ugly workaround for svc lockd.
3767 * lockd installs custom fl_lmops->lm_compare_owner that checks
3768 * for the fl_owner to be the same (which it always is on local node
3769 * I guess between lockd processes) and then compares pid.
3770 * As such we assign pid to the owner field to make it all work,
3771 * conflict with normal locks is unlikely since pid space and
3772 * pointer space for current->files are not intersecting */
3773 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3774 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* Map the fcntl lock type to an LDLM mode (read -> PR, write -> PW). */
3778 einfo.ei_mode = LCK_PR;
3781 /* An unlock request may or may not have any relation to
3782 * existing locks so we may not be able to pass a lock handle
3783 * via a normal ldlm_lock_cancel() request. The request may even
3784 * unlock a byte range in the middle of an existing lock. In
3785 * order to process an unlock request we need all of the same
3786 * information that is given with a normal read or write record
3787 * lock request. To avoid creating another ldlm unlock (cancel)
3788 * message we'll treat a LCK_NL flock request as an unlock. */
3789 einfo.ei_mode = LCK_NL;
3792 einfo.ei_mode = LCK_PW;
3795 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* Map the fcntl command to enqueue flags: non-blocking set requests
 * use BLOCK_NOWAIT, GETLK-style queries use TEST_LOCK. */
3810 flags = LDLM_FL_BLOCK_NOWAIT;
3816 flags = LDLM_FL_TEST_LOCK;
3819 CERROR("unknown fcntl lock command: %d\n", cmd);
3823 /* Save the old mode so that if the mode in the lock changes we
3824 * can decrement the appropriate reader or writer refcount. */
3825 file_lock->fl_type = einfo.ei_mode;
3827 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3828 LUSTRE_OPC_ANY, NULL);
3829 if (IS_ERR(op_data))
3830 RETURN(PTR_ERR(op_data));
3832 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
3833 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
3834 flock.l_flock.pid, flags, einfo.ei_mode,
3835 flock.l_flock.start, flock.l_flock.end);
3837 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
3840 /* Restore the file lock type if not TEST lock. */
3841 if (!(flags & LDLM_FL_TEST_LOCK))
3842 file_lock->fl_type = fl_type;
3844 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
3845 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
3846 !(flags & LDLM_FL_TEST_LOCK))
3847 rc2 = locks_lock_file_wait(file, file_lock);
3849 if ((file_lock->fl_flags & FL_FLOCK) &&
3850 (rc == 0 || file_lock->fl_type == F_UNLCK))
3851 rc2 = flock_lock_file_wait(file, file_lock);
3852 if ((file_lock->fl_flags & FL_POSIX) &&
3853 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3854 !(flags & LDLM_FL_TEST_LOCK))
3855 rc2 = posix_lock_file_wait(file, file_lock);
3856 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
/* Local bookkeeping failed: undo the server-side lock with LCK_NL. */
3858 if (rc2 && file_lock->fl_type != F_UNLCK) {
3859 einfo.ei_mode = LCK_NL;
3860 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
3865 ll_finish_md_op_data(op_data);
/*
 * Resolve @name (length @namelen) under @parent to its FID by asking the
 * MDS (md_getattr_name with OBD_MD_FLID|OBD_MD_FLTYPE).  On success *fid is
 * filled in and, when @inode is non-NULL, the inode is instantiated from
 * the reply via ll_prep_inode().
 */
3870 int ll_get_fid_by_name(struct inode *parent, const char *name,
3871 int namelen, struct lu_fid *fid,
3872 struct inode **inode)
3874 struct md_op_data *op_data = NULL;
3875 struct mdt_body *body;
3876 struct ptlrpc_request *req;
3880 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3881 LUSTRE_OPC_ANY, NULL);
3882 if (IS_ERR(op_data))
3883 RETURN(PTR_ERR(op_data));
3885 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
3886 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3887 ll_finish_md_op_data(op_data);
3891 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3893 GOTO(out_req, rc = -EFAULT);
3895 *fid = body->mbo_fid1;
3898 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
3900 ptlrpc_req_finished(req);
3904 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3905 const char *name, int namelen)
3907 struct dentry *dchild = NULL;
3908 struct inode *child_inode = NULL;
3909 struct md_op_data *op_data;
3910 struct ptlrpc_request *request = NULL;
3911 struct obd_client_handle *och = NULL;
3913 struct mdt_body *body;
3915 __u64 data_version = 0;
3918 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3919 name, PFID(ll_inode2fid(parent)), mdtidx);
3921 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3922 0, LUSTRE_OPC_ANY, NULL);
3923 if (IS_ERR(op_data))
3924 RETURN(PTR_ERR(op_data));
3926 /* Get child FID first */
3927 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
3930 dchild = d_lookup(file_dentry(file), &qstr);
3931 if (dchild != NULL) {
3932 if (dchild->d_inode != NULL)
3933 child_inode = igrab(dchild->d_inode);
3937 if (child_inode == NULL) {
3938 rc = ll_get_fid_by_name(parent, name, namelen,
3939 &op_data->op_fid3, &child_inode);
3944 if (child_inode == NULL)
3945 GOTO(out_free, rc = -EINVAL);
3948 * lfs migrate command needs to be blocked on the client
3949 * by checking the migrate FID against the FID of the
3952 if (child_inode == parent->i_sb->s_root->d_inode)
3953 GOTO(out_iput, rc = -EINVAL);
3955 inode_lock(child_inode);
3956 op_data->op_fid3 = *ll_inode2fid(child_inode);
3957 if (!fid_is_sane(&op_data->op_fid3)) {
3958 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
3959 ll_get_fsname(parent->i_sb, NULL, 0), name,
3960 PFID(&op_data->op_fid3));
3961 GOTO(out_unlock, rc = -EINVAL);
3964 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3966 GOTO(out_unlock, rc);
3969 CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name,
3970 PFID(&op_data->op_fid3), mdtidx);
3971 GOTO(out_unlock, rc = 0);
3974 if (S_ISREG(child_inode->i_mode)) {
3975 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
3979 GOTO(out_unlock, rc);
3982 rc = ll_data_version(child_inode, &data_version,
3985 GOTO(out_close, rc);
3987 op_data->op_handle = och->och_fh;
3988 op_data->op_data = och->och_mod;
3989 op_data->op_data_version = data_version;
3990 op_data->op_lease_handle = och->och_lease_handle;
3991 op_data->op_bias |= MDS_RENAME_MIGRATE;
3994 op_data->op_mds = mdtidx;
3995 op_data->op_cli_flags = CLI_MIGRATE;
3996 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3997 namelen, name, namelen, &request);
3999 LASSERT(request != NULL);
4000 ll_update_times(request, parent);
4002 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4003 LASSERT(body != NULL);
4005 /* If the server does release layout lock, then we cleanup
4006 * the client och here, otherwise release it in out_close: */
4008 body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4009 obd_mod_put(och->och_mod);
4010 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4012 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
4018 if (request != NULL) {
4019 ptlrpc_req_finished(request);
4023 /* Try again if the file layout has changed. */
4024 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4028 if (och != NULL) /* close the file */
4029 ll_lease_close(och, child_inode, NULL);
4031 clear_nlink(child_inode);
4033 inode_unlock(child_inode);
4037 ll_finish_md_op_data(op_data);
/* flock/lock handler for "-o noflock" mounts; only the signature is
 * visible in this extraction -- presumably returns -ENOSYS (it is wired
 * into ll_file_operations_noflock below). TODO(review): confirm body. */
4042 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4050 * test if some locks matching bits and l_req_mode are acquired
4051 * - bits can be in different locks
4052 * - if found clear the common lock bits in *bits
4053 * - the bits not found, are kept in *bits
4055 * \param bits [IN] searched lock bits
4056 * \param l_req_mode [IN] searched lock mode
4057 * \retval boolean, true iff all bits are found
/* Test whether MD inode-bits DLM locks covering *bits are already cached.
 * Bits that are found are cleared from *bits; the bits left set on return
 * were NOT matched. LCK_MINMODE widens the search to CR|CW|PR|PW.
 * NOTE(review): some interior lines are missing from this extraction. */
4059 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4061 struct lustre_handle lockh;
4062 union ldlm_policy_data policy;
4063 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4064 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4073 fid = &ll_i2info(inode)->lli_fid;
4074 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4075 ldlm_lockname[mode]);
/* LDLM_FL_TEST_LOCK: match only tests for a lock, no reference is taken */
4077 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* probe each inodebit separately; stop early once all bits are matched */
4078 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4079 policy.l_inodebits.bits = *bits & (1 << i);
4080 if (policy.l_inodebits.bits == 0)
4083 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4084 &policy, mode, &lockh)) {
4085 struct ldlm_lock *lock;
4087 lock = ldlm_handle2lock(&lockh);
4090 ~(lock->l_policy_data.l_inodebits.bits);
4091 LDLM_LOCK_PUT(lock);
4093 *bits &= ~policy.l_inodebits.bits;
/* Match (and, unlike ll_have_md_lock, keep a reference on) a cached MD
 * inode-bits lock covering @bits. Returns the matched mode, or 0 when no
 * suitable lock is cached; on success the handle is returned in @lockh. */
4100 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4101 struct lustre_handle *lockh, __u64 flags,
4102 enum ldlm_mode mode)
4104 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4109 fid = &ll_i2info(inode)->lli_fid;
4110 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4112 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4113 fid, LDLM_IBITS, &policy, mode, lockh);
/* Post-process the result of an MD revalidation RPC: tolerate -ENOENT for
 * already-unlinked objects (and for striped dirs with a bad stripe), log
 * other failures. Returns the possibly-remapped rc. */
4118 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4120 /* Already unlinked. Just update nlink and return success */
4121 if (rc == -ENOENT) {
4123 /* If it is striped directory, and there is bad stripe
4124 * Let's revalidate the dentry again, instead of returning
4126 if (S_ISDIR(inode->i_mode) &&
4127 ll_i2info(inode)->lli_lsm_md != NULL)
4130 /* This path cannot be hit for regular files unless in
4131 * case of obscure races, so no need to validate
4133 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4135 } else if (rc != 0) {
/* -EACCES/-EIDRM are expected (permission / identity revoked) -> D_INFO */
4136 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4137 "%s: revalidate FID "DFID" error: rc = %d\n",
4138 ll_get_fsname(inode->i_sb, NULL, 0),
4139 PFID(ll_inode2fid(inode)), rc);
/* Revalidate inode attributes against the MDT with an intent lock RPC
 * (getattr by FID, no name). Unhashes the dentry if the file turned out
 * to be unlinked (i_nlink == 0). Returns 0 or negative errno. */
4145 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4147 struct inode *inode = dentry->d_inode;
4148 struct obd_export *exp = ll_i2mdexp(inode);
4149 struct lookup_intent oit = {
4152 struct ptlrpc_request *req = NULL;
4153 struct md_op_data *op_data;
4157 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4158 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4160 /* Call getattr by fid, so do not provide name at all. */
4161 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4162 LUSTRE_OPC_ANY, NULL);
4163 if (IS_ERR(op_data))
4164 RETURN(PTR_ERR(op_data));
4166 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4167 ll_finish_md_op_data(op_data);
4169 rc = ll_inode_revalidate_fini(inode, rc);
4173 rc = ll_revalidate_it_finish(req, &oit, dentry);
4175 ll_intent_release(&oit);
4179 /* Unlinked? Unhash dentry, so it is not picked up later by
4180 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4181 * here to preserve get_cwd functionality on 2.6.
4183 if (!dentry->d_inode->i_nlink) {
4184 ll_lock_dcache(inode);
4185 d_lustre_invalidate(dentry, 0);
4186 ll_unlock_dcache(inode);
4189 ll_lookup_finish_locks(&oit, dentry);
4191 ptlrpc_req_finished(req);
/* Merge per-stripe MDT attributes of a striped directory (nlink, blocks,
 * size, times) into the master inode. Caller must ensure lli_lsm_md is
 * set (asserted below). */
4196 static int ll_merge_md_attr(struct inode *inode)
4198 struct cl_attr attr = { 0 };
4201 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
4202 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4203 &attr, ll_md_blocking_ast);
4207 set_nlink(inode, attr.cat_nlink);
4208 inode->i_blocks = attr.cat_blocks;
4209 i_size_write(inode, attr.cat_size);
/* cache merged times in lli; ll_getattr copies them into the inode */
4211 ll_i2info(inode)->lli_atime = attr.cat_atime;
4212 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4213 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/* Squash a dev_t so 32-bit compat stat syscalls accept it (major/minor
 * each truncated to 8 bits -- not a faithful device number, just one that
 * passes old_valid_dev()). */
4218 static inline dev_t ll_compat_encode_dev(dev_t dev)
4220 /* The compat_sys_*stat*() syscalls will fail unless the
4221 * device majors and minors are both less than 256. Note that
4222 * the value returned here will be passed through
4223 * old_encode_dev() in cp_compat_stat(). And so we are not
4224 * trying to return a valid compat (u16) device number, just
4225 * one that will pass the old_valid_dev() check. */
4227 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
/* .getattr: revalidate with the MDT, glimpse file size from OSTs for
 * regular files (skipped during HSM restore), merge striped-dir attrs,
 * then fill *stat. Two signatures depending on kernel API. */
4230 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4231 int ll_getattr(const struct path *path, struct kstat *stat,
4232 u32 request_mask, unsigned int flags)
4234 struct dentry *de = path->dentry;
4236 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4239 struct inode *inode = de->d_inode;
4240 struct ll_sb_info *sbi = ll_i2sbi(inode);
4241 struct ll_inode_info *lli = ll_i2info(inode);
4244 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4246 rc = ll_inode_revalidate(de, IT_GETATTR);
4250 if (S_ISREG(inode->i_mode)) {
4251 /* In case of restore, the MDT has the right size and has
4252 * already sent it back without granting the layout lock,
4253 * inode is up-to-date so glimpse is useless.
4254 * Also to glimpse we need the layout, in case of a running
4255 * restore the MDT holds the layout lock so the glimpse will
4256 * block up to the end of restore (getattr will block)
4258 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4259 rc = ll_glimpse_size(inode);
4264 /* If object isn't a regular file then don't validate size. */
4265 if (S_ISDIR(inode->i_mode) &&
4266 lli->lli_lsm_md != NULL) {
/* striped directory: fold per-stripe attributes into the inode */
4267 rc = ll_merge_md_attr(inode);
4272 LTIME_S(inode->i_atime) = lli->lli_atime;
4273 LTIME_S(inode->i_mtime) = lli->lli_mtime;
4274 LTIME_S(inode->i_ctime) = lli->lli_ctime;
/* fault-injection hook for testing delayed getattr */
4277 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4279 if (ll_need_32bit_api(sbi)) {
/* 32-bit clients: build a 32-bit-safe inode number and squashed dev */
4280 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4281 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4282 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4284 stat->ino = inode->i_ino;
4285 stat->dev = inode->i_sb->s_dev;
4286 stat->rdev = inode->i_rdev;
4289 stat->mode = inode->i_mode;
4290 stat->uid = inode->i_uid;
4291 stat->gid = inode->i_gid;
4292 stat->atime = inode->i_atime;
4293 stat->mtime = inode->i_mtime;
4294 stat->ctime = inode->i_ctime;
/* prefer the tunable stat blocksize when the admin configured one */
4295 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4297 stat->nlink = inode->i_nlink;
4298 stat->size = i_size_read(inode);
4299 stat->blocks = inode->i_blocks;
/* .fiemap: marshal the kernel fiemap_extent_info into a struct fiemap,
 * run ll_do_fiemap(), and copy mapped extents back to userspace.
 * NOTE(review): only the first user extent is copied in -- this appears
 * intentional (the FIEMAP ABI passes continuation state in extent[0]),
 * but confirm against the full source. */
4304 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4305 __u64 start, __u64 len)
4309 struct fiemap *fiemap;
4310 unsigned int extent_count = fieinfo->fi_extents_max;
4312 num_bytes = sizeof(*fiemap) + (extent_count *
4313 sizeof(struct fiemap_extent));
4314 OBD_ALLOC_LARGE(fiemap, num_bytes);
4319 fiemap->fm_flags = fieinfo->fi_flags;
4320 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4321 fiemap->fm_start = start;
4322 fiemap->fm_length = len;
4323 if (extent_count > 0 &&
4324 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4325 sizeof(struct fiemap_extent)) != 0)
4326 GOTO(out, rc = -EFAULT);
4328 rc = ll_do_fiemap(inode, fiemap, num_bytes);
4330 fieinfo->fi_flags = fiemap->fm_flags;
4331 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4332 if (extent_count > 0 &&
4333 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4334 fiemap->fm_mapped_extents *
4335 sizeof(struct fiemap_extent)) != 0)
4336 GOTO(out, rc = -EFAULT);
4338 OBD_FREE_LARGE(fiemap, num_bytes);
/* .get_acl: return a referenced copy of the cached POSIX ACL under
 * lli_lock. The VFS caller drops the reference (see comment below). */
4342 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4344 struct ll_inode_info *lli = ll_i2info(inode);
4345 struct posix_acl *acl = NULL;
4348 spin_lock(&lli->lli_lock);
4349 /* VFS' acl_permission_check->check_acl will release the refcount */
4350 acl = posix_acl_dup(lli->lli_posix_acl);
4351 spin_unlock(&lli->lli_lock);
/* .set_acl: serialize @acl to xattr form and store it via __vfs_setxattr
 * (access ACLs also update i_mode through posix_acl_update_mode); update
 * or drop the local ACL cache accordingly. Default ACLs only apply to
 * directories. Compiled only when the kernel has inode_operations.set_acl. */
4356 #ifdef HAVE_IOP_SET_ACL
4357 #ifdef CONFIG_FS_POSIX_ACL
4358 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4360 const char *name = NULL;
4367 case ACL_TYPE_ACCESS:
/* may clear @acl if the ACL collapses into plain mode bits */
4369 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4373 name = XATTR_NAME_POSIX_ACL_ACCESS;
4375 case ACL_TYPE_DEFAULT:
4376 if (!S_ISDIR(inode->i_mode))
4377 GOTO(out, rc = acl ? -EACCES : 0);
4378 name = XATTR_NAME_POSIX_ACL_DEFAULT;
4381 GOTO(out, rc = -EINVAL);
4385 size = posix_acl_xattr_size(acl->a_count);
4386 value = kmalloc(size, GFP_NOFS);
4388 GOTO(out, rc = -ENOMEM);
4390 rc = posix_acl_to_xattr(&init_user_ns, acl, value, size);
4395 /* dentry is only used for *.lov attributes so it's safe to be NULL */
4396 rc = __vfs_setxattr(NULL, inode, name, value, size, XATTR_CREATE);
4401 set_cached_acl(inode, type, acl);
4403 forget_cached_acl(inode, type);
4406 #endif /* CONFIG_FS_POSIX_ACL */
4407 #endif /* HAVE_IOP_SET_ACL */
/* ACL hook for generic_permission() on older kernels (pre 2-arg API):
 * fetch the access ACL and evaluate it with posix_acl_permission().
 * Under RCU walk (IPERM_FLAG_RCU) it presumably bails out early --
 * the return on that path is not visible in this extraction. */
4409 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4411 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4412 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4414 ll_check_acl(struct inode *inode, int mask)
4417 # ifdef CONFIG_FS_POSIX_ACL
4418 struct posix_acl *acl;
4422 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4423 if (flags & IPERM_FLAG_RCU)
4426 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4431 rc = posix_acl_permission(inode, acl, mask);
4432 posix_acl_release(acl);
4435 # else /* !CONFIG_FS_POSIX_ACL */
4437 # endif /* CONFIG_FS_POSIX_ACL */
4439 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/* .permission: revalidate the root inode on first touch, apply root
 * squash (temporarily overriding fsuid/fsgid and dropping FS
 * capabilities), then defer to generic permission checking. Multiple
 * signatures depending on kernel API generation. */
4441 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4442 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4444 # ifdef HAVE_INODE_PERMISION_2ARGS
4445 int ll_inode_permission(struct inode *inode, int mask)
4447 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4452 struct ll_sb_info *sbi;
4453 struct root_squash_info *squash;
4454 struct cred *cred = NULL;
4455 const struct cred *old_cred = NULL;
4457 bool squash_id = false;
/* cannot block (revalidate RPC) during RCU path walk */
4460 #ifdef MAY_NOT_BLOCK
4461 if (mask & MAY_NOT_BLOCK)
4463 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4464 if (flags & IPERM_FLAG_RCU)
4468 /* as the root inode is NOT validated by lookup, do it
4469 * here before the permission check. */
4471 if (inode == inode->i_sb->s_root->d_inode) {
4472 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4477 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4478 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4480 /* squash fsuid/fsgid if needed */
4481 sbi = ll_i2sbi(inode);
4482 squash = &sbi->ll_squash;
4483 if (unlikely(squash->rsi_uid != 0 &&
4484 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4485 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4489 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4490 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4491 squash->rsi_uid, squash->rsi_gid);
4493 /* update current process's credentials
4494 * and FS capability */
4495 cred = prepare_creds();
4499 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4500 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* drop every filesystem-related capability from the squashed creds */
4501 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4502 if ((1 << cap) & CFS_CAP_FS_MASK)
4503 cap_lower(cred->cap_effective, cap);
4505 old_cred = override_creds(cred);
4508 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4509 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4510 /* restore current process's credentials and FS capability */
4512 revert_creds(old_cred);
/* Default file_operations ("-o localflock"): no .flock/.lock entries, so
 * flock falls back to the VFS's locally-consistent implementation. */
4519 /* -o localflock - only provides locally consistent flock locks */
4520 struct file_operations ll_file_operations = {
4521 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4522 # ifdef HAVE_SYNC_READ_WRITE
4523 .read = new_sync_read,
4524 .write = new_sync_write,
4526 .read_iter = ll_file_read_iter,
4527 .write_iter = ll_file_write_iter,
4528 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4529 .read = ll_file_read,
4530 .aio_read = ll_file_aio_read,
4531 .write = ll_file_write,
4532 .aio_write = ll_file_aio_write,
4533 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4534 .unlocked_ioctl = ll_file_ioctl,
4535 .open = ll_file_open,
4536 .release = ll_file_release,
4537 .mmap = ll_file_mmap,
4538 .llseek = ll_file_seek,
4539 .splice_read = ll_file_splice_read,
/* file_operations for "-o flock": identical to the default table but
 * routes .flock and .lock through ll_file_flock for cluster-wide
 * consistent advisory locking. */
4544 struct file_operations ll_file_operations_flock = {
4545 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4546 # ifdef HAVE_SYNC_READ_WRITE
4547 .read = new_sync_read,
4548 .write = new_sync_write,
4549 # endif /* HAVE_SYNC_READ_WRITE */
4550 .read_iter = ll_file_read_iter,
4551 .write_iter = ll_file_write_iter,
4552 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4553 .read = ll_file_read,
4554 .aio_read = ll_file_aio_read,
4555 .write = ll_file_write,
4556 .aio_write = ll_file_aio_write,
4557 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4558 .unlocked_ioctl = ll_file_ioctl,
4559 .open = ll_file_open,
4560 .release = ll_file_release,
4561 .mmap = ll_file_mmap,
4562 .llseek = ll_file_seek,
4563 .splice_read = ll_file_splice_read,
4566 .flock = ll_file_flock,
4567 .lock = ll_file_flock
/* file_operations for "-o noflock": .flock/.lock wired to
 * ll_file_noflock so lock requests fail rather than silently being
 * locally scoped. */
4570 /* These are for -o noflock - to return ENOSYS on flock calls */
4571 struct file_operations ll_file_operations_noflock = {
4572 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4573 # ifdef HAVE_SYNC_READ_WRITE
4574 .read = new_sync_read,
4575 .write = new_sync_write,
4576 # endif /* HAVE_SYNC_READ_WRITE */
4577 .read_iter = ll_file_read_iter,
4578 .write_iter = ll_file_write_iter,
4579 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4580 .read = ll_file_read,
4581 .aio_read = ll_file_aio_read,
4582 .write = ll_file_write,
4583 .aio_write = ll_file_aio_write,
4584 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4585 .unlocked_ioctl = ll_file_ioctl,
4586 .open = ll_file_open,
4587 .release = ll_file_release,
4588 .mmap = ll_file_mmap,
4589 .llseek = ll_file_seek,
4590 .splice_read = ll_file_splice_read,
4593 .flock = ll_file_noflock,
4594 .lock = ll_file_noflock
/* inode_operations for regular files; xattr/get_acl/set_acl entries are
 * compiled in only when the running kernel exposes those hooks. */
4597 struct inode_operations ll_file_inode_operations = {
4598 .setattr = ll_setattr,
4599 .getattr = ll_getattr,
4600 .permission = ll_inode_permission,
4601 #ifdef HAVE_IOP_XATTR
4602 .setxattr = ll_setxattr,
4603 .getxattr = ll_getxattr,
4604 .removexattr = ll_removexattr,
4606 .listxattr = ll_listxattr,
4607 .fiemap = ll_fiemap,
4608 #ifdef HAVE_IOP_GET_ACL
4609 .get_acl = ll_get_acl,
4611 #ifdef HAVE_IOP_SET_ACL
4612 .set_acl = ll_set_acl,
/* Push a layout configuration into the cl_object stack. For
 * OBJECT_CONF_SET, also allow DLM lock matching (only safe once the
 * layout is applied) and record the new layout generation. */
4616 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4618 struct ll_inode_info *lli = ll_i2info(inode);
4619 struct cl_object *obj = lli->lli_clob;
4628 env = cl_env_get(&refcheck);
4630 RETURN(PTR_ERR(env));
4632 rc = cl_conf_set(env, lli->lli_clob, conf);
4636 if (conf->coc_opc == OBJECT_CONF_SET) {
4637 struct ldlm_lock *lock = conf->coc_lock;
4638 struct cl_layout cl = {
4642 LASSERT(lock != NULL);
4643 LASSERT(ldlm_has_layout(lock));
4645 /* it can only be allowed to match after layout is
4646 * applied to inode otherwise false layout would be
4647 * seen. Applying layout should happen before dropping
4648 * the intent lock. */
4649 ldlm_lock_allow_match(lock);
4651 rc = cl_object_layout_get(env, obj, &cl);
4656 DFID": layout version change: %u -> %u\n",
4657 PFID(&lli->lli_fid), ll_layout_version_get(lli),
4659 ll_layout_version_set(lli, cl.cl_layout_gen);
4663 cl_env_put(env, &refcheck);
/* Fetch the file layout via a getxattr(XATTR_NAME_LOV) RPC and attach it
 * to the DLM lock's LVB, for locks granted via completion AST whose LVB
 * buffer was too small to carry the layout. No-op if l_lvb_data is set. */
4668 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
4669 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4672 struct ll_sb_info *sbi = ll_i2sbi(inode);
4673 struct ptlrpc_request *req;
4674 struct mdt_body *body;
4681 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4682 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4683 lock->l_lvb_data, lock->l_lvb_len);
4685 if (lock->l_lvb_data != NULL)
4688 /* if layout lock was granted right away, the layout is returned
4689 * within DLM_LVB of dlm reply; otherwise if the lock was ever
4690 * blocked and then granted via completion ast, we have to fetch
4691 * layout here. Please note that we can't use the LVB buffer in
4692 * completion AST because it doesn't have a large enough buffer */
4693 rc = ll_get_default_mdsize(sbi, &lmmsize);
4695 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4696 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
4701 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4703 GOTO(out, rc = -EPROTO);
4705 lmmsize = body->mbo_eadatasize;
4706 if (lmmsize == 0) /* empty layout */
4709 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4711 GOTO(out, rc = -EFAULT);
/* copy out of the RPC buffer; the lock may outlive the request */
4713 OBD_ALLOC_LARGE(lvbdata, lmmsize);
4714 if (lvbdata == NULL)
4715 GOTO(out, rc = -ENOMEM);
4717 memcpy(lvbdata, lmm, lmmsize);
4718 lock_res_and_lock(lock);
/* re-check under the lock: another thread may have attached an LVB */
4719 if (unlikely(lock->l_lvb_data == NULL)) {
4720 lock->l_lvb_type = LVB_T_LAYOUT;
4721 lock->l_lvb_data = lvbdata;
4722 lock->l_lvb_len = lmmsize;
4725 unlock_res_and_lock(lock);
4728 OBD_FREE_LARGE(lvbdata, lmmsize);
4733 ptlrpc_req_finished(req);
/*
 * Apply the layout to the inode. Layout lock is held and will be released
 * before this function returns (decref'ed below after ll_layout_conf).
 * If the new layout cannot be applied because the object is busy (-EBUSY),
 * issue an OBJECT_CONF_WAIT to block until outstanding IO drains.
 */
4738 * Apply the layout to the inode. Layout lock is held and will be released
4741 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4742 struct inode *inode)
4744 struct ll_inode_info *lli = ll_i2info(inode);
4745 struct ll_sb_info *sbi = ll_i2sbi(inode);
4746 struct ldlm_lock *lock;
4747 struct cl_object_conf conf;
4750 bool wait_layout = false;
4753 LASSERT(lustre_handle_is_used(lockh));
4755 lock = ldlm_handle2lock(lockh);
4756 LASSERT(lock != NULL);
4757 LASSERT(ldlm_has_layout(lock));
4759 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4760 PFID(&lli->lli_fid), inode);
4762 /* in case this is a caching lock and reinstate with new inode */
4763 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
4765 lock_res_and_lock(lock);
4766 lvb_ready = ldlm_is_lvb_ready(lock);
4767 unlock_res_and_lock(lock);
4769 /* checking lvb_ready is racy but this is okay. The worst case is
4770 * that multi processes may configure the file on the same time. */
4774 rc = ll_layout_fetch(inode, lock);
4778 /* for layout lock, lmm is stored in lock's lvb.
4779 * lvb_data is immutable if the lock is held so it's safe to access it
4782 * set layout to file. Unlikely this will fail as old layout was
4783 * surely eliminated */
4784 memset(&conf, 0, sizeof conf);
4785 conf.coc_opc = OBJECT_CONF_SET;
4786 conf.coc_inode = inode;
4787 conf.coc_lock = lock;
4788 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
4789 conf.u.coc_layout.lb_len = lock->l_lvb_len;
4790 rc = ll_layout_conf(inode, &conf);
4792 /* refresh layout failed, need to wait */
4793 wait_layout = rc == -EBUSY;
4796 LDLM_LOCK_PUT(lock);
4797 ldlm_lock_decref(lockh, mode);
4799 /* wait for IO to complete if it's still being used. */
4801 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
4802 ll_get_fsname(inode->i_sb, NULL, 0),
4803 PFID(&lli->lli_fid), inode);
4805 memset(&conf, 0, sizeof conf);
4806 conf.coc_opc = OBJECT_CONF_WAIT;
4807 conf.coc_inode = inode;
4808 rc = ll_layout_conf(inode, &conf);
4812 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4813 ll_get_fsname(inode->i_sb, NULL, 0),
4814 PFID(&lli->lli_fid), rc);
4820 * Issue layout intent RPC to MDS.
4821 * \param inode [in] file inode
4822 * \param intent [in] layout intent
4824 * \retval 0 on success
4825 * \retval < 0 error code
/* Send an IT_LAYOUT intent RPC to the MDS describing the intended layout
 * operation; write/truncate intents request FMODE_WRITE. The resulting
 * lock (if any) has its data set on the inode and is then dropped. */
4827 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
4829 struct ll_inode_info *lli = ll_i2info(inode);
4830 struct ll_sb_info *sbi = ll_i2sbi(inode);
4831 struct md_op_data *op_data;
4832 struct lookup_intent it;
4833 struct ptlrpc_request *req;
4837 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4838 0, 0, LUSTRE_OPC_ANY, NULL);
4839 if (IS_ERR(op_data))
4840 RETURN(PTR_ERR(op_data));
/* the intent itself rides along as op_data payload */
4842 op_data->op_data = intent;
4843 op_data->op_data_size = sizeof(*intent);
4845 memset(&it, 0, sizeof(it));
4846 it.it_op = IT_LAYOUT;
4847 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
4848 intent->li_opc == LAYOUT_INTENT_TRUNC)
4849 it.it_flags = FMODE_WRITE;
4851 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4852 ll_get_fsname(inode->i_sb, NULL, 0),
4853 PFID(&lli->lli_fid), inode);
4855 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
4856 &ll_md_blocking_ast, 0);
4857 if (it.it_request != NULL)
4858 ptlrpc_req_finished(it.it_request);
4859 it.it_request = NULL;
4861 ll_finish_md_op_data(op_data);
4863 /* set lock data in case this is a new lock */
4865 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4867 ll_intent_drop_lock(&it);
4873 * This function checks if there exists a LAYOUT lock on the client side,
4874 * or enqueues it if it doesn't have one in cache.
4876 * This function will not hold layout lock so it may be revoked any time after
4877 * this function returns. Any operations that depend on the layout should be redone
4880 * This function should be called before lov_io_init() to get an uptodate
4881 * layout version, the caller should save the version number and after IO
4882 * is finished, this function should be called again to verify that layout
4883 * is not changed during IO time.
/* Ensure an up-to-date layout is applied to @inode and return its
 * generation in *gen. Fast path: generation already valid (or layout
 * locks disabled). Otherwise, under lli_layout_mutex, try to reuse a
 * cached LAYOUT lock, falling back to a LAYOUT_INTENT_ACCESS RPC. */
4885 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4887 struct ll_inode_info *lli = ll_i2info(inode);
4888 struct ll_sb_info *sbi = ll_i2sbi(inode);
4889 struct lustre_handle lockh;
4890 struct layout_intent intent = {
4891 .li_opc = LAYOUT_INTENT_ACCESS,
4893 enum ldlm_mode mode;
4897 *gen = ll_layout_version_get(lli);
4898 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
4902 LASSERT(fid_is_sane(ll_inode2fid(inode)));
4903 LASSERT(S_ISREG(inode->i_mode));
4905 /* take layout lock mutex to enqueue layout lock exclusively. */
4906 mutex_lock(&lli->lli_layout_mutex);
4909 /* mostly layout lock is caching on the local side, so try to
4910 * match it before grabbing layout lock mutex. */
4911 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4912 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4913 if (mode != 0) { /* hit cached lock */
/* ll_layout_lock_set applies the layout and releases the reference */
4914 rc = ll_layout_lock_set(&lockh, mode, inode);
4920 rc = ll_layout_intent(inode, &intent);
4926 *gen = ll_layout_version_get(lli);
4927 mutex_unlock(&lli->lli_layout_mutex);
4933 * Issue layout intent RPC indicating where in a file an IO is about to write.
4935 * \param[in] inode file inode.
4936 * \param[in] ext write range with start offset of file in bytes where
4937 * an IO is about to write, and exclusive end offset in
4940 * \retval 0 on success
4941 * \retval < 0 error code
/* Notify the MDS, via a layout intent, of the byte range [e_start,
 * e_end) an IO is about to write (e.g. to instantiate PFL components).
 * Thin wrapper around ll_layout_intent(). */
4943 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
4944 struct lu_extent *ext)
4946 struct layout_intent intent = {
4948 .li_extent.e_start = ext->e_start,
4949 .li_extent.e_end = ext->e_end,
4954 rc = ll_layout_intent(inode, &intent);
4960 * This function send a restore request to the MDT
4962 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4964 struct hsm_user_request *hur;
4968 len = sizeof(struct hsm_user_request) +
4969 sizeof(struct hsm_user_item);
4970 OBD_ALLOC(hur, len);
4974 hur->hur_request.hr_action = HUA_RESTORE;
4975 hur->hur_request.hr_archive_id = 0;
4976 hur->hur_request.hr_flags = 0;
4977 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4978 sizeof(hur->hur_user_item[0].hui_fid));
4979 hur->hur_user_item[0].hui_extent.offset = offset;
4980 hur->hur_user_item[0].hui_extent.length = length;
4981 hur->hur_request.hr_itemcount = 1;
4982 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,