4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2015, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <linux/pagemap.h>
46 #include <linux/file.h>
47 #include <linux/sched.h>
48 #include <linux/user_namespace.h>
49 #ifdef HAVE_UIDGID_HEADER
50 # include <linux/uidgid.h>
52 #include <lustre/ll_fiemap.h>
54 #include <lustre_ioctl.h>
55 #include <lustre_swab.h>
57 #include "cl_object.h"
58 #include "llite_internal.h"
59 #include "vvp_internal.h"
62 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
64 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
67 static enum llioc_iter
68 ll_iocontrol_call(struct inode *inode, struct file *file,
69 unsigned int cmd, unsigned long arg, int *rcp);
/*
 * Allocate a per-open-file ll_file_data from ll_file_data_slab.
 * GFP_NOFS avoids recursing back into the filesystem during reclaim.
 * NOTE(review): this extract is missing lines from the original
 * (e.g. the allocation-failure check and the return statement).
 */
71 static struct ll_file_data *ll_file_data_get(void)
73 struct ll_file_data *fd;
75 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
/* start with a clean write-error state; consulted at close time */
79 fd->fd_write_failed = false;
/* Release an ll_file_data previously obtained from ll_file_data_get(). */
84 static void ll_file_data_put(struct ll_file_data *fd)
87 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
91 * Packs all the attributes into @op_data for the CLOSE rpc.
/*
 * ll_prepare_close(): copy the client's cached inode attributes
 * (mode, times, size, blocks, flags) plus the open handle into
 * @op_data so the MDT can apply them when the file is closed.
 */
93 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
94 struct obd_client_handle *och)
98 ll_prep_md_op_data(op_data, inode, NULL, NULL,
99 0, 0, LUSTRE_OPC_ANY, NULL);
/* snapshot cached attributes into the close request */
101 op_data->op_attr.ia_mode = inode->i_mode;
102 op_data->op_attr.ia_atime = inode->i_atime;
103 op_data->op_attr.ia_mtime = inode->i_mtime;
104 op_data->op_attr.ia_ctime = inode->i_ctime;
105 op_data->op_attr.ia_size = i_size_read(inode);
/* flag which of the packed attributes the MDT should honour */
106 op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
107 ATTR_MTIME | ATTR_MTIME_SET |
108 ATTR_CTIME | ATTR_CTIME_SET;
109 op_data->op_attr_blocks = inode->i_blocks;
110 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
111 op_data->op_handle = och->och_fh;
113 if (och->och_flags & FMODE_WRITE &&
114 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
115 /* For HSM: if inode data has been modified, pack it so that
116 * MDT can set data dirty flag in the archive. */
117 op_data->op_bias |= MDS_DATA_MODIFIED;
123 * Perform a close, possibly with a bias.
124 * The meaning of "data" depends on the value of "bias".
126 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
127 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
/*
 * Send the MDC close RPC for the given open handle, packing extra
 * payload according to @bias, then tear down replay data and poison
 * the handle cookie so stale uses are detectable.
 */
130 static int ll_close_inode_openhandle(struct inode *inode,
131 struct obd_client_handle *och,
132 enum mds_op_bias bias, void *data)
134 struct obd_export *md_exp = ll_i2mdexp(inode);
135 const struct ll_inode_info *lli = ll_i2info(inode);
136 struct md_op_data *op_data;
137 struct ptlrpc_request *req = NULL;
/* no MDC export: cannot talk to the MDT, report and bail out */
141 if (class_exp2obd(md_exp) == NULL) {
142 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
143 ll_get_fsname(inode->i_sb, NULL, 0),
144 PFID(&lli->lli_fid));
148 OBD_ALLOC_PTR(op_data);
149 /* We leak openhandle and request here on error, but not much to be
150 * done in OOM case since app won't retry close on error either. */
152 GOTO(out, rc = -ENOMEM);
154 ll_prepare_close(inode, op_data, och);
/* bias-specific payload; see function comment for @data semantics */
156 case MDS_CLOSE_LAYOUT_SWAP:
157 LASSERT(data != NULL);
158 op_data->op_bias |= MDS_CLOSE_LAYOUT_SWAP;
159 op_data->op_data_version = 0;
160 op_data->op_lease_handle = och->och_lease_handle;
161 op_data->op_fid2 = *ll_inode2fid(data);
164 case MDS_HSM_RELEASE:
165 LASSERT(data != NULL);
166 op_data->op_bias |= MDS_HSM_RELEASE;
167 op_data->op_data_version = *(__u64 *)data;
168 op_data->op_lease_handle = och->och_lease_handle;
169 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
173 LASSERT(data == NULL);
177 rc = md_close(md_exp, op_data, och->och_mod, &req);
/* -EINTR is an expected interruption, don't spam the console for it */
178 if (rc != 0 && rc != -EINTR)
179 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
180 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* for biased closes, check whether the server executed the intent */
183 op_data->op_bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)) {
184 struct mdt_body *body;
186 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
187 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
191 ll_finish_md_op_data(op_data);
195 md_clear_open_replay_data(md_exp, och);
/* poison the file handle so any later use is obviously stale */
196 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
199 ptlrpc_req_finished(req); /* This is close request */
/*
 * Close the MDS open handle cached on the inode for the given open
 * mode (write/exec/read), but only when no other opener still uses
 * it; refcounting is protected by lli_och_mutex.
 */
203 int ll_md_real_close(struct inode *inode, fmode_t fmode)
205 struct ll_inode_info *lli = ll_i2info(inode);
206 struct obd_client_handle **och_p;
207 struct obd_client_handle *och;
/* pick the per-mode handle slot and its use count */
212 if (fmode & FMODE_WRITE) {
213 och_p = &lli->lli_mds_write_och;
214 och_usecount = &lli->lli_open_fd_write_count;
215 } else if (fmode & FMODE_EXEC) {
216 och_p = &lli->lli_mds_exec_och;
217 och_usecount = &lli->lli_open_fd_exec_count;
219 LASSERT(fmode & FMODE_READ);
220 och_p = &lli->lli_mds_read_och;
221 och_usecount = &lli->lli_open_fd_read_count;
224 mutex_lock(&lli->lli_och_mutex);
225 if (*och_usecount > 0) {
226 /* There are still users of this handle, so skip
228 mutex_unlock(&lli->lli_och_mutex);
234 mutex_unlock(&lli->lli_och_mutex);
237 /* There might be a race and this handle may already
239 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/*
 * Per-file-descriptor close: drop group lock and lease if held,
 * decrement the per-mode open count, and only talk to the MDS
 * (ll_md_real_close) when no cached OPEN DLM lock lets us skip it.
 * Finally detach and free the ll_file_data.
 */
245 static int ll_md_close(struct inode *inode, struct file *file)
247 union ldlm_policy_data policy = {
248 .l_inodebits = { MDS_INODELOCK_OPEN },
/* only test for an existing granted lock, don't enqueue a new one */
250 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
251 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
252 struct ll_inode_info *lli = ll_i2info(inode);
253 struct lustre_handle lockh;
254 enum ldlm_mode lockmode;
258 /* clear group lock, if present */
259 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
260 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
262 if (fd->fd_lease_och != NULL) {
265 /* Usually the lease is not released when the
266 * application crashed, we need to release here. */
267 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
268 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
269 PFID(&lli->lli_fid), rc, lease_broken);
271 fd->fd_lease_och = NULL;
/* close the openhandle that was pinned for a lease, if any */
274 if (fd->fd_och != NULL) {
275 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
280 /* Let's see if we have good enough OPEN lock on the file and if
281 we can skip talking to MDS */
282 mutex_lock(&lli->lli_och_mutex);
283 if (fd->fd_omode & FMODE_WRITE) {
285 LASSERT(lli->lli_open_fd_write_count);
286 lli->lli_open_fd_write_count--;
287 } else if (fd->fd_omode & FMODE_EXEC) {
289 LASSERT(lli->lli_open_fd_exec_count);
290 lli->lli_open_fd_exec_count--;
293 LASSERT(lli->lli_open_fd_read_count);
294 lli->lli_open_fd_read_count--;
296 mutex_unlock(&lli->lli_och_mutex);
/* no matching cached OPEN lock: must do the real MDS close */
298 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
299 LDLM_IBITS, &policy, lockmode, &lockh))
300 rc = ll_md_real_close(inode, fd->fd_omode);
303 LUSTRE_FPRIVATE(file) = NULL;
304 ll_file_data_put(fd);
309 /* While this returns an error code, fput() the caller does not, so we need
310 * to make every effort to clean up all of our state here. Also, applications
311 * rarely check close errors and even if an error is returned they will not
312 * re-try the close call.
/*
 * VFS ->release() handler: tear down statahead authorization, flush
 * pending async write errors into the return code for regular files,
 * and perform the metadata close. The root dentry is special-cased:
 * it only frees the ll_file_data.
 */
314 int ll_file_release(struct inode *inode, struct file *file)
316 struct ll_file_data *fd;
317 struct ll_sb_info *sbi = ll_i2sbi(inode);
318 struct ll_inode_info *lli = ll_i2info(inode);
322 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
323 PFID(ll_inode2fid(inode)), inode);
/* don't count releases of the fs root in the stats */
325 if (inode->i_sb->s_root != file->f_path.dentry)
326 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
327 fd = LUSTRE_FPRIVATE(file);
330 /* The last ref on @file, maybe not the owner pid of statahead,
331 * because parent and child process can share the same file handle. */
332 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
333 ll_deauthorize_statahead(inode, fd);
335 if (inode->i_sb->s_root == file->f_path.dentry) {
336 LUSTRE_FPRIVATE(file) = NULL;
337 ll_file_data_put(fd);
/* pick up any async write error recorded against the object */
341 if (!S_ISDIR(inode->i_mode)) {
342 if (lli->lli_clob != NULL)
343 lov_read_and_clear_async_rc(lli->lli_clob);
344 lli->lli_async_rc = 0;
347 rc = ll_md_close(inode, file);
349 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
350 libcfs_debug_dumplog();
/*
 * Issue an IT_OPEN intent lock request to the MDS for @file.
 * The name is only packed when the server lacks open-by-fid support
 * and the dentry name is valid; @lmm/@lmmsize carry an optional
 * striping layout for the request.
 */
355 static int ll_intent_file_open(struct file *file, void *lmm, int lmmsize,
356 struct lookup_intent *itp)
358 struct dentry *de = file->f_path.dentry;
359 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
360 struct dentry *parent = de->d_parent;
361 const char *name = NULL;
363 struct md_op_data *op_data;
364 struct ptlrpc_request *req = NULL;
368 LASSERT(parent != NULL);
369 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
371 /* if server supports open-by-fid, or file name is invalid, don't pack
372 * name in open request */
373 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
374 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
375 name = de->d_name.name;
376 len = de->d_name.len;
379 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
380 name, len, 0, LUSTRE_OPC_ANY, NULL);
382 RETURN(PTR_ERR(op_data));
383 op_data->op_data = lmm;
384 op_data->op_data_size = lmmsize;
386 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
387 &ll_md_blocking_ast, 0);
388 ll_finish_md_op_data(op_data);
390 /* reason for keep own exit path - don`t flood log
391 * with messages with -ESTALE errors.
/* open succeeded at the server but is unwanted here: close it */
393 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
394 it_open_error(DISP_OPEN_OPEN, itp))
396 ll_release_openhandle(de, itp);
400 if (it_disposition(itp, DISP_LOOKUP_NEG))
401 GOTO(out, rc = -ENOENT);
403 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
404 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
405 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* update the inode from the reply and attach the granted lock */
409 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
410 if (!rc && itp->it_lock_mode)
411 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
414 ptlrpc_req_finished(req);
415 ll_intent_drop_lock(itp);
/*
 * Populate an obd_client_handle from the MDT reply carried by the
 * completed intent @it, then register it for open replay so the
 * open can be replayed after MDS recovery.
 */
420 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
421 struct obd_client_handle *och)
423 struct mdt_body *body;
425 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
426 och->och_fh = body->mbo_handle;
427 och->och_fid = body->mbo_fid1;
428 och->och_lease_handle.cookie = it->it_lock_handle;
429 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
430 och->och_flags = it->it_flags;
432 return md_set_open_replay_data(md_exp, och, it);
/*
 * Finish the client-side part of an open: optionally fill @och from
 * the intent, then attach @fd to the file and initialise its
 * readahead state, open mode and cl_io context bookkeeping.
 */
435 static int ll_local_open(struct file *file, struct lookup_intent *it,
436 struct ll_file_data *fd, struct obd_client_handle *och)
438 struct inode *inode = file->f_path.dentry->d_inode;
441 LASSERT(!LUSTRE_FPRIVATE(file));
448 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
453 LUSTRE_FPRIVATE(file) = fd;
454 ll_readahead_init(inode, &fd->fd_ras);
455 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
457 /* ll_cl_context initialize */
458 rwlock_init(&fd->fd_lock);
459 INIT_LIST_HEAD(&fd->fd_lccs);
464 /* Open a file, and (for the very first open) create objects on the OSTs at
465 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
466 * creation or open until ll_lov_setstripe() ioctl is called.
468 * If we already have the stripe MD locally then we don't request it in
469 * md_open(), by passing a lmm_size = 0.
471 * It is up to the application to ensure no other processes open this file
472 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
473 * used. We might be able to avoid races of that sort by getting lli_open_sem
474 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
475 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/*
 * VFS ->open() handler. Either reuses an intent prepared by the
 * lookup path (it->it_disposition set) or builds a fresh IT_OPEN
 * intent from f_flags, then shares or creates the per-mode MDS open
 * handle under lli_och_mutex and completes with ll_local_open().
 */
477 int ll_file_open(struct inode *inode, struct file *file)
479 struct ll_inode_info *lli = ll_i2info(inode);
480 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
481 .it_flags = file->f_flags };
482 struct obd_client_handle **och_p = NULL;
483 __u64 *och_usecount = NULL;
484 struct ll_file_data *fd;
488 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
489 PFID(ll_inode2fid(inode)), inode, file->f_flags);
491 it = file->private_data; /* XXX: compat macro */
492 file->private_data = NULL; /* prevent ll_local_open assertion */
494 fd = ll_file_data_get();
496 GOTO(out_openerr, rc = -ENOMEM);
499 if (S_ISDIR(inode->i_mode))
500 ll_authorize_statahead(inode, fd);
/* root of the mount: no MDS open needed, just attach fd */
502 if (inode->i_sb->s_root == file->f_path.dentry) {
503 LUSTRE_FPRIVATE(file) = fd;
507 if (!it || !it->it_disposition) {
508 /* Convert f_flags into access mode. We cannot use file->f_mode,
509 * because everything but O_ACCMODE mask was stripped from
/* O_ACCMODE trick: flags+1 turns 0/1/2 into FMODE_READ/WRITE bits */
511 if ((oit.it_flags + 1) & O_ACCMODE)
513 if (file->f_flags & O_TRUNC)
514 oit.it_flags |= FMODE_WRITE;
516 /* kernel only call f_op->open in dentry_open. filp_open calls
517 * dentry_open after call to open_namei that checks permissions.
518 * Only nfsd_open call dentry_open directly without checking
519 * permissions and because of that this code below is safe. */
520 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
521 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
523 /* We do not want O_EXCL here, presumably we opened the file
524 * already? XXX - NFS implications? */
525 oit.it_flags &= ~O_EXCL;
527 /* bug20584, if "it_flags" contains O_CREAT, the file will be
528 * created if necessary, then "IT_CREAT" should be set to keep
529 * consistent with it */
530 if (oit.it_flags & O_CREAT)
531 oit.it_op |= IT_CREAT;
537 /* Let's see if we have file open on MDS already. */
538 if (it->it_flags & FMODE_WRITE) {
539 och_p = &lli->lli_mds_write_och;
540 och_usecount = &lli->lli_open_fd_write_count;
541 } else if (it->it_flags & FMODE_EXEC) {
542 och_p = &lli->lli_mds_exec_och;
543 och_usecount = &lli->lli_open_fd_exec_count;
545 och_p = &lli->lli_mds_read_och;
546 och_usecount = &lli->lli_open_fd_read_count;
549 mutex_lock(&lli->lli_och_mutex);
550 if (*och_p) { /* Open handle is present */
551 if (it_disposition(it, DISP_OPEN_OPEN)) {
552 /* Well, there's extra open request that we do not need,
553 let's close it somehow. This will decref request. */
554 rc = it_open_error(DISP_OPEN_OPEN, it);
556 mutex_unlock(&lli->lli_och_mutex);
557 GOTO(out_openerr, rc);
560 ll_release_openhandle(file->f_path.dentry, it);
/* reuse the existing handle: no och needed for this opener */
564 rc = ll_local_open(file, it, fd, NULL);
567 mutex_unlock(&lli->lli_och_mutex);
568 GOTO(out_openerr, rc);
571 LASSERT(*och_usecount == 0);
572 if (!it->it_disposition) {
573 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
574 /* We cannot just request lock handle now, new ELC code
575 means that one of other OPEN locks for this file
576 could be cancelled, and since blocking ast handler
577 would attempt to grab och_mutex as well, that would
578 result in a deadlock */
579 mutex_unlock(&lli->lli_och_mutex);
581 * Normally called under two situations:
583 * 2. A race/condition on MDS resulting in no open
584 * handle to be returned from LOOKUP|OPEN request,
585 * for example if the target entry was a symlink.
587 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
588 * marked by a bit set in ll_iget_for_nfs. Clear the
589 * bit so that it's not confusing later callers.
591 * NB; when ldd is NULL, it must have come via normal
592 * lookup path only, since ll_iget_for_nfs always calls
595 if (ldd && ldd->lld_nfs_dentry) {
596 ldd->lld_nfs_dentry = 0;
597 it->it_flags |= MDS_OPEN_LOCK;
601 * Always specify MDS_OPEN_BY_FID because we don't want
602 * to get file with different fid.
604 it->it_flags |= MDS_OPEN_BY_FID;
605 rc = ll_intent_file_open(file, NULL, 0, it);
607 GOTO(out_openerr, rc);
611 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
613 GOTO(out_och_free, rc = -ENOMEM);
617 /* md_intent_lock() didn't get a request ref if there was an
618 * open error, so don't do cleanup on the request here
620 /* XXX (green): Should not we bail out on any error here, not
621 * just open error? */
622 rc = it_open_error(DISP_OPEN_OPEN, it);
624 GOTO(out_och_free, rc);
626 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
627 "inode %p: disposition %x, status %d\n", inode,
628 it_disposition(it, ~0), it->it_status);
630 rc = ll_local_open(file, it, fd, *och_p);
632 GOTO(out_och_free, rc);
634 mutex_unlock(&lli->lli_och_mutex);
637 /* Must do this outside lli_och_mutex lock to prevent deadlock where
638 different kind of OPEN lock for this same inode gets cancelled
639 by ldlm_cancel_lru */
640 if (!S_ISREG(inode->i_mode))
641 GOTO(out_och_free, rc);
643 cl_lov_delay_create_clear(&file->f_flags);
644 GOTO(out_och_free, rc);
/* error/cleanup paths: free och, drop statahead, release fd */
648 if (och_p && *och_p) {
649 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
650 *och_p = NULL; /* OBD_FREE writes some magic there */
653 mutex_unlock(&lli->lli_och_mutex);
656 if (lli->lli_opendir_key == fd)
657 ll_deauthorize_statahead(inode, fd);
659 ll_file_data_put(fd);
661 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* drop the extra request reference taken for DISP_ENQ_OPEN_REF */
664 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
665 ptlrpc_req_finished(it->it_request);
666 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for a lease lock: on a blocking callback simply
 * cancel the lock asynchronously (lease break); the CANCELING case
 * needs no openhandle handling here (see ll_lease_open()).
 */
672 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
673 struct ldlm_lock_desc *desc, void *data, int flag)
676 struct lustre_handle lockh;
680 case LDLM_CB_BLOCKING:
681 ldlm_lock2handle(lock, &lockh);
682 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
684 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
688 case LDLM_CB_CANCELING:
696 * Acquire a lease and open the file.
/*
 * Open @inode with an MDS lease in mode @fmode (FMODE_READ or
 * FMODE_WRITE only). When @file is given, the existing openhandle
 * is packed (op_handle) so the MDT can verify same-owner semantics;
 * a lease is refused if this fd already holds one or if other
 * openers share the handle. On failure, any partially-acquired
 * state (open lock, openhandle, intent) is unwound.
 */
698 static struct obd_client_handle *
699 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
702 struct lookup_intent it = { .it_op = IT_OPEN };
703 struct ll_sb_info *sbi = ll_i2sbi(inode);
704 struct md_op_data *op_data;
705 struct ptlrpc_request *req = NULL;
706 struct lustre_handle old_handle = { 0 };
707 struct obd_client_handle *och = NULL;
712 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
713 RETURN(ERR_PTR(-EINVAL));
716 struct ll_inode_info *lli = ll_i2info(inode);
717 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
718 struct obd_client_handle **och_p;
/* lease mode must be covered by the fd's open mode; no exec fds */
721 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
722 RETURN(ERR_PTR(-EPERM));
724 /* Get the openhandle of the file */
726 mutex_lock(&lli->lli_och_mutex);
/* one lease per file descriptor */
727 if (fd->fd_lease_och != NULL) {
728 mutex_unlock(&lli->lli_och_mutex);
732 if (fd->fd_och == NULL) {
733 if (file->f_mode & FMODE_WRITE) {
734 LASSERT(lli->lli_mds_write_och != NULL);
735 och_p = &lli->lli_mds_write_och;
736 och_usecount = &lli->lli_open_fd_write_count;
738 LASSERT(lli->lli_mds_read_och != NULL);
739 och_p = &lli->lli_mds_read_och;
740 och_usecount = &lli->lli_open_fd_read_count;
742 if (*och_usecount == 1) {
749 mutex_unlock(&lli->lli_och_mutex);
750 if (rc < 0) /* more than 1 opener */
753 LASSERT(fd->fd_och != NULL);
754 old_handle = fd->fd_och->och_fh;
759 RETURN(ERR_PTR(-ENOMEM));
761 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
762 LUSTRE_OPC_ANY, NULL);
764 GOTO(out, rc = PTR_ERR(op_data));
766 /* To tell the MDT this openhandle is from the same owner */
767 op_data->op_handle = old_handle;
769 it.it_flags = fmode | open_flags;
770 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
771 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
772 &ll_md_blocking_lease_ast,
773 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
774 * it can be cancelled which may mislead applications that the lease is
776 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
777 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
778 * doesn't deal with openhandle, so normal openhandle will be leaked. */
779 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
780 ll_finish_md_op_data(op_data);
781 ptlrpc_req_finished(req);
783 GOTO(out_release_it, rc);
785 if (it_disposition(&it, DISP_LOOKUP_NEG))
786 GOTO(out_release_it, rc = -ENOENT);
788 rc = it_open_error(DISP_OPEN_OPEN, &it);
790 GOTO(out_release_it, rc);
792 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
793 ll_och_fill(sbi->ll_md_exp, &it, och);
795 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
796 GOTO(out_close, rc = -EOPNOTSUPP);
798 /* already get lease, handle lease lock */
799 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
/* a lease must come with an MDS_INODELOCK_OPEN lock, else protocol error */
800 if (it.it_lock_mode == 0 ||
801 it.it_lock_bits != MDS_INODELOCK_OPEN) {
802 /* open lock must return for lease */
803 CERROR(DFID "lease granted but no open lock, %d/"LPU64".\n",
804 PFID(ll_inode2fid(inode)), it.it_lock_mode,
806 GOTO(out_close, rc = -EPROTO);
809 ll_intent_release(&it);
/* error unwind: cancel the open lock, close the handle, drop intent */
813 /* Cancel open lock */
814 if (it.it_lock_mode != 0) {
815 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
818 och->och_lease_handle.cookie = 0ULL;
820 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
822 CERROR("%s: error closing file "DFID": %d\n",
823 ll_get_fsname(inode->i_sb, NULL, 0),
824 PFID(&ll_i2info(inode)->lli_fid), rc2);
825 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
827 ll_intent_release(&it);
835 * Check whether a layout swap can be done between two inodes.
837 * \param[in] inode1 First inode to check
838 * \param[in] inode2 Second inode to check
840 * \retval 0 on success, layout swap can be performed between both inodes
841 * \retval negative error code if requirements are not met
843 static int ll_check_swap_layouts_validity(struct inode *inode1,
844 struct inode *inode2)
/* both must be regular files */
846 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
/* caller must be allowed to write both files */
849 if (inode_permission(inode1, MAY_WRITE) ||
850 inode_permission(inode2, MAY_WRITE))
/* both must live on the same filesystem */
853 if (inode1->i_sb != inode2->i_sb)
/*
 * Close @och on @inode with MDS_CLOSE_LAYOUT_SWAP bias, atomically
 * swapping the layouts of @inode and @inode2 at the MDT. The two
 * fids must differ; validity of the pair is checked first.
 */
859 static int ll_swap_layouts_close(struct obd_client_handle *och,
860 struct inode *inode, struct inode *inode2)
862 const struct lu_fid *fid1 = ll_inode2fid(inode);
863 const struct lu_fid *fid2;
867 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
868 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
870 rc = ll_check_swap_layouts_validity(inode, inode2);
872 GOTO(out_free_och, rc);
874 /* We now know that inode2 is a lustre inode */
875 fid2 = ll_inode2fid(inode2);
/* swapping a file's layout with itself makes no sense */
877 rc = lu_fid_cmp(fid1, fid2);
879 GOTO(out_free_och, rc = -EINVAL);
881 /* Close the file and swap layouts between inode & inode2.
882 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
883 * because we still need it to pack l_remote_handle to MDT. */
884 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
887 och = NULL; /* freed in ll_close_inode_openhandle() */
897 * Release lease and close the file.
898 * It will check if the lease has ever broken.
/*
 * Determine whether the lease lock was already cancelled (broken),
 * report it via @lease_broken, cancel the lock if still held, and
 * close the associated openhandle.
 */
900 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
903 struct ldlm_lock *lock;
904 bool cancelled = true;
908 lock = ldlm_handle2lock(&och->och_lease_handle);
/* inspect the cancel flag under the resource lock */
910 lock_res_and_lock(lock);
911 cancelled = ldlm_is_cancel(lock);
912 unlock_res_and_lock(lock);
916 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
917 PFID(&ll_i2info(inode)->lli_fid), cancelled);
920 ldlm_cli_cancel(&och->och_lease_handle, 0);
921 if (lease_broken != NULL)
922 *lease_broken = cancelled;
924 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/*
 * Merge MDS-provided timestamps with OST-side attributes (size,
 * blocks, times) into the VFS inode, under the inode size lock.
 * The newest of each timestamp wins; atime from the cache is never
 * reduced (see the long comment below on POSIX atime semantics).
 */
928 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
930 struct ll_inode_info *lli = ll_i2info(inode);
931 struct cl_object *obj = lli->lli_clob;
932 struct cl_attr *attr = vvp_env_thread_attr(env);
940 ll_inode_size_lock(inode);
942 /* Merge timestamps the most recently obtained from MDS with
943 * timestamps obtained from OSTs.
945 * Do not overwrite atime of inode because it may be refreshed
946 * by file_accessed() function. If the read was served by cache
947 * data, there is no RPC to be sent so that atime may not be
948 * transferred to OSTs at all. MDT only updates atime at close time
949 * if it's at least 'mdd.*.atime_diff' older.
950 * All in all, the atime in Lustre does not strictly comply with
951 * POSIX. Solving this problem needs to send an RPC to MDT for each
952 * read, this will hurt performance. */
953 if (LTIME_S(inode->i_atime) < lli->lli_atime)
954 LTIME_S(inode->i_atime) = lli->lli_atime;
955 LTIME_S(inode->i_mtime) = lli->lli_mtime;
956 LTIME_S(inode->i_ctime) = lli->lli_ctime;
958 atime = LTIME_S(inode->i_atime);
959 mtime = LTIME_S(inode->i_mtime);
960 ctime = LTIME_S(inode->i_ctime);
/* fetch the OST-side view of the object's attributes */
962 cl_object_attr_lock(obj);
963 rc = cl_object_attr_get(env, obj, attr);
964 cl_object_attr_unlock(obj);
967 GOTO(out_size_unlock, rc);
/* take the most recent of MDS vs OST timestamps */
969 if (atime < attr->cat_atime)
970 atime = attr->cat_atime;
972 if (ctime < attr->cat_ctime)
973 ctime = attr->cat_ctime;
975 if (mtime < attr->cat_mtime)
976 mtime = attr->cat_mtime;
978 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
979 PFID(&lli->lli_fid), attr->cat_size);
981 i_size_write(inode, attr->cat_size);
982 inode->i_blocks = attr->cat_blocks;
984 LTIME_S(inode->i_atime) = atime;
985 LTIME_S(inode->i_mtime) = mtime;
986 LTIME_S(inode->i_ctime) = ctime;
989 ll_inode_size_unlock(inode);
/*
 * Decide whether atime updates should be suppressed for this file,
 * mirroring the kernel's file_accessed()/touch_atime() checks
 * (O_NOATIME, S_NOATIME, mount flags, nodiratime for directories).
 */
994 static bool file_is_noatime(const struct file *file)
996 const struct vfsmount *mnt = file->f_path.mnt;
997 const struct inode *inode = file->f_path.dentry->d_inode;
999 /* Adapted from file_accessed() and touch_atime().*/
1000 if (file->f_flags & O_NOATIME)
1003 if (inode->i_flags & S_NOATIME)
1006 if (IS_NOATIME(inode))
1009 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1012 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1015 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/*
 * Initialise a cl_io from the file's open flags: non-blocking,
 * append/sync for writes, lock policy (never for nolock files,
 * mandatory for O_APPEND, otherwise "maybe"), and noatime.
 */
1021 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
1023 struct inode *inode = file->f_path.dentry->d_inode;
1025 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1027 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1028 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1029 file->f_flags & O_DIRECT ||
1032 io->ci_obj = ll_i2info(inode)->lli_clob;
1033 io->ci_lockreq = CILR_MAYBE;
1034 if (ll_file_nolock(file)) {
1035 io->ci_lockreq = CILR_NEVER;
1036 io->ci_no_srvlock = 1;
1037 } else if (file->f_flags & O_APPEND) {
/* append must serialize against concurrent writers */
1038 io->ci_lockreq = CILR_MANDATORY;
1041 io->ci_noatime = file_is_noatime(file);
/*
 * Common read/write engine: set up a cl_io for @iot at *ppos/@count,
 * take the per-inode range lock where required (writes, and direct
 * IO reads, unless group-locked), run the IO loop, and restart the
 * IO when the layout changed mid-flight (ci_need_restart). Updates
 * *ppos, per-sb read/write statistics and fd->fd_write_failed.
 */
1045 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1046 struct file *file, enum cl_io_type iot,
1047 loff_t *ppos, size_t count)
1049 struct vvp_io *vio = vvp_env_io(env);
1050 struct inode *inode = file->f_path.dentry->d_inode;
1051 struct ll_inode_info *lli = ll_i2info(inode);
1052 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1056 struct range_lock range;
1060 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zu\n",
1061 file->f_path.dentry->d_name.name, iot, *ppos, count);
1064 io = vvp_env_thread_io(env);
1065 ll_io_init(io, file, iot == CIT_WRITE);
1067 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1068 bool range_locked = false;
/* O_APPEND writes may land anywhere, so lock to EOF */
1070 if (file->f_flags & O_APPEND)
1071 range_lock_init(&range, 0, LUSTRE_EOF);
1073 range_lock_init(&range, *ppos, *ppos + count - 1);
1075 vio->vui_fd = LUSTRE_FPRIVATE(file);
1076 vio->vui_io_subtype = args->via_io_subtype;
1078 switch (vio->vui_io_subtype) {
1080 vio->vui_iter = args->u.normal.via_iter;
1081 vio->vui_iocb = args->u.normal.via_iocb;
1082 /* Direct IO reads must also take range lock,
1083 * or multiple reads will try to work on the same pages
1084 * See LU-6227 for details. */
1085 if (((iot == CIT_WRITE) ||
1086 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1087 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1088 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1090 rc = range_lock(&lli->lli_write_tree, &range);
1094 range_locked = true;
/* splice sub-type carries pipe and flags instead of an iter */
1098 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1099 vio->u.splice.vui_flags = args->u.splice.via_flags;
1102 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1106 ll_cl_add(file, env, io, LCC_RW);
1107 rc = cl_io_loop(env, io);
1108 ll_cl_remove(file, env);
1111 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1113 range_unlock(&lli->lli_write_tree, &range);
1116 /* cl_io_rw_init() handled IO */
/* accumulate progress and advance position for a possible restart */
1120 if (io->ci_nob > 0) {
1121 result += io->ci_nob;
1122 count -= io->ci_nob;
1123 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1125 /* prepare IO restart */
1126 if (count > 0 && args->via_io_subtype == IO_NORMAL)
1127 args->u.normal.via_iter = vio->vui_iter;
1131 cl_io_fini(env, io);
/* layout change detected mid-IO: restart with remaining count */
1133 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1135 "%s: restart %s from %lld, count:%zu, result: %zd\n",
1136 file->f_path.dentry->d_name.name,
1137 iot == CIT_READ ? "read" : "write",
1138 *ppos, count, result);
1142 if (iot == CIT_READ) {
1144 ll_stats_ops_tally(ll_i2sbi(inode),
1145 LPROC_LL_READ_BYTES, result);
1146 } else if (iot == CIT_WRITE) {
1148 ll_stats_ops_tally(ll_i2sbi(inode),
1149 LPROC_LL_WRITE_BYTES, result);
1150 fd->fd_write_failed = false;
1151 } else if (result == 0 && rc == 0) {
1154 fd->fd_write_failed = true;
1156 fd->fd_write_failed = false;
/* -ERESTARTSYS is a restartable interruption, not a write failure */
1157 } else if (rc != -ERESTARTSYS) {
1158 fd->fd_write_failed = true;
1162 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1164 return result > 0 ? result : rc;
1168 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1169 * especially for small I/O.
1171 * To serve a read request, CLIO has to create and initialize a cl_io and
1172 * then request DLM lock. This has turned out to have siginificant overhead
1173 * and affects the performance of small I/O dramatically.
1175 * It's not necessary to create a cl_io for each I/O. Under the help of read
1176 * ahead, most of the pages being read are already in memory cache and we can
1177 * read those pages directly because if the pages exist, the corresponding DLM
1178 * lock must exist so that page content must be valid.
1180 * In fast read implementation, the llite speculatively finds and reads pages
1181 * in memory cache. There are three scenarios for fast read:
1182 * - If the page exists and is uptodate, kernel VM will provide the data and
1183 * CLIO won't be intervened;
1184 * - If the page was brought into memory by read ahead, it will be exported
1185 * and read ahead parameters will be updated;
1186 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1187 * it will go back and invoke normal read, i.e., a cl_io will be created
1188 * and DLM lock will be requested.
1190 * POSIX compliance: posix standard states that read is intended to be atomic.
1191 * Lustre read implementation is in line with Linux kernel read implementation
1192 * and neither of them complies with POSIX standard in this matter. Fast read
1193 * doesn't make the situation worse on single node but it may interleave write
1194 * results from multiple nodes due to short read handling in ll_file_aio_read().
1196 * \param env - lu_env
1197 * \param iocb - kiocb from kernel
1198 * \param iter - user space buffers where the data will be copied
1200 * \retval - number of bytes have been read, or error code if error occurred.
1203 ll_do_fast_read(const struct lu_env *env, struct kiocb *iocb,
1204 struct iov_iter *iter)
/* fast read must be enabled for this superblock */
1208 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1211 /* NB: we can't do direct IO for fast read because it will need a lock
1212 * to make IO engine happy. */
1213 if (iocb->ki_filp->f_flags & O_DIRECT)
/* register a NULL cl_io context so ll_readpage() can detect fast read */
1216 ll_cl_add(iocb->ki_filp, env, NULL, LCC_RW);
1217 result = generic_file_read_iter(iocb, iter);
1218 ll_cl_remove(iocb->ki_filp, env);
1220 /* If the first page is not in cache, generic_file_aio_read() will be
1221 * returned with -ENODATA.
1222 * See corresponding code in ll_readpage(). */
1223 if (result == -ENODATA)
1227 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1228 LPROC_LL_READ_BYTES, result);
1234 * Read from a file (through the page cache).
/*
 * iov_iter-based read entry point.  Tries the fast-read path first; if
 * data remains (or fast read was not applicable) it falls back to the
 * full CLIO path through ll_file_io_generic().
 */
1236 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1239 struct vvp_io_args *args;
1244 env = cl_env_get(&refcheck);
1246 return PTR_ERR(env);
/* Fast path: read directly from the page cache when possible. */
1248 result = ll_do_fast_read(env, iocb, to);
1249 if (result < 0 || iov_iter_count(to) == 0)
/* Slow path: build a normal cl_io for the remaining bytes. */
1252 args = ll_env_args(env, IO_NORMAL);
1253 args->u.normal.via_iter = to;
1254 args->u.normal.via_iocb = iocb;
1256 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1257 &iocb->ki_pos, iov_iter_count(to));
1260 else if (result == 0)
1264 cl_env_put(env, &refcheck);
1269 * Write to a file (through the page cache).
/*
 * iov_iter-based write entry point: package the iterator and kiocb into
 * vvp_io_args and run a CIT_WRITE cl_io through ll_file_io_generic().
 */
1271 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1273 struct vvp_io_args *args;
1278 env = cl_env_get(&refcheck);
1280 return PTR_ERR(env);
1282 args = ll_env_args(env, IO_NORMAL);
1283 args->u.normal.via_iter = from;
1284 args->u.normal.via_iocb = iocb;
1286 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1287 &iocb->ki_pos, iov_iter_count(from));
1288 cl_env_put(env, &refcheck);
1292 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1294 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array and compute the total byte count, trimming
 * *nr_segs at the first segment that fails access_ok().  Copied from the
 * kernel's __generic_file_aio_write_nolock() for pre-iov_iter kernels.
 */
1296 static int ll_file_get_iov_count(const struct iovec *iov,
1297 unsigned long *nr_segs, size_t *count)
1302 for (seg = 0; seg < *nr_segs; seg++) {
1303 const struct iovec *iv = &iov[seg];
1306 * If any segment has a negative length, or the cumulative
1307 * length ever wraps negative then return -EINVAL.
1310 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1312 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1317 cnt -= iv->iov_len; /* This segment is no good */
/*
 * Legacy aio_read entry point for kernels without read_iter: validate the
 * iovec, wrap it in an iov_iter, and delegate to ll_file_read_iter().
 */
1324 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1325 unsigned long nr_segs, loff_t pos)
1332 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
/* iov_iter_init() signature differs across kernel versions. */
1336 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1337 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1338 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1339 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1340 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1342 result = ll_file_read_iter(iocb, &to);
/*
 * Legacy synchronous read(): build a one-segment iovec and a sync kiocb,
 * then delegate to ll_file_aio_read(), propagating the updated offset.
 */
1347 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1350 struct iovec iov = { .iov_base = buf, .iov_len = count };
1351 struct kiocb *kiocb;
1355 OBD_ALLOC_PTR(kiocb);
1359 init_sync_kiocb(kiocb, file);
1360 kiocb->ki_pos = *ppos;
/* Field holding the request length was renamed across kernel versions. */
1361 #ifdef HAVE_KIOCB_KI_LEFT
1362 kiocb->ki_left = count;
1363 #elif defined(HAVE_KI_NBYTES)
1364 kiocb->ki_nbytes = count;
1367 result = ll_file_aio_read(kiocb, &iov, 1, kiocb->ki_pos);
1368 *ppos = kiocb->ki_pos;
1370 OBD_FREE_PTR(kiocb);
1375 * Write to a file (through the page cache).
/*
 * Legacy aio_write entry point for kernels without write_iter: validate
 * the iovec, wrap it in an iov_iter, and delegate to ll_file_write_iter().
 */
1378 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1379 unsigned long nr_segs, loff_t pos)
1381 struct iov_iter from;
1386 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
/* iov_iter_init() signature differs across kernel versions. */
1390 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1391 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1392 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1393 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1394 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1396 result = ll_file_write_iter(iocb, &from);
/*
 * Legacy synchronous write(): build a one-segment iovec and a sync kiocb
 * (taken from the per-thread lu_env info rather than allocated), then
 * delegate to ll_file_aio_write(), propagating the updated offset.
 */
1402 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1402 size_t count, loff_t *ppos)
1405 struct iovec iov = { .iov_base = (void __user *)buf,
1407 struct kiocb *kiocb;
1412 env = cl_env_get(&refcheck);
1414 RETURN(PTR_ERR(env));
1416 kiocb = &ll_env_info(env)->lti_kiocb;
1417 init_sync_kiocb(kiocb, file);
1418 kiocb->ki_pos = *ppos;
/* Field holding the request length was renamed across kernel versions. */
1419 #ifdef HAVE_KIOCB_KI_LEFT
1420 kiocb->ki_left = count;
1421 #elif defined(HAVE_KI_NBYTES)
1422 kiocb->ki_nbytes = count;
1425 result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos);
1426 *ppos = kiocb->ki_pos;
1428 cl_env_put(env, &refcheck);
1431 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1434 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read entry point: run a CIT_READ cl_io with IO_SPLICE args so
 * the data goes into the pipe rather than a user buffer.
 */
1436 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1437 struct pipe_inode_info *pipe, size_t count,
1441 struct vvp_io_args *args;
1446 env = cl_env_get(&refcheck);
1448 RETURN(PTR_ERR(env));
1450 args = ll_env_args(env, IO_SPLICE);
1451 args->u.splice.via_pipe = pipe;
1452 args->u.splice.via_flags = flags;
1454 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1455 cl_env_put(env, &refcheck);
/*
 * Set the striping EA on a file by re-opening it by FID with the supplied
 * lov_user_md, under the inode size lock.  The open handle obtained for
 * the operation is released immediately afterwards.
 */
1459 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1460 __u64 flags, struct lov_user_md *lum,
1463 struct lookup_intent oit = {
1465 .it_flags = flags | MDS_OPEN_BY_FID,
1470 ll_inode_size_lock(inode);
1471 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1473 GOTO(out_unlock, rc);
/* Close the handle created solely for setting the stripe EA. */
1475 ll_release_openhandle(file->f_path.dentry, &oit);
1478 ll_inode_size_unlock(inode);
1479 ll_intent_release(&oit);
/* Clear the delayed-create flag now that the layout is instantiated. */
1480 cl_lov_delay_create_clear(&file->f_flags);
/*
 * Fetch the LOV EA (striping information) for @filename from the MDS via
 * md_getattr_name().  On success *lmmp points into the reply buffer of
 * *request (caller must keep the request until done with the data) and
 * *lmm_size is its length.  The EA is byte-swapped to host endianness on
 * big-endian machines.
 */
1485 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1486 struct lov_mds_md **lmmp, int *lmm_size,
1487 struct ptlrpc_request **request)
1489 struct ll_sb_info *sbi = ll_i2sbi(inode);
1490 struct mdt_body *body;
1491 struct lov_mds_md *lmm = NULL;
1492 struct ptlrpc_request *req = NULL;
1493 struct md_op_data *op_data;
/* Ask for the default MD buffer size so the EA fits in one reply. */
1496 rc = ll_get_default_mdsize(sbi, &lmmsize);
1500 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1501 strlen(filename), lmmsize,
1502 LUSTRE_OPC_ANY, NULL);
1503 if (IS_ERR(op_data))
1504 RETURN(PTR_ERR(op_data));
1506 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1507 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1508 ll_finish_md_op_data(op_data);
1510 CDEBUG(D_INFO, "md_getattr_name failed "
1511 "on %s: rc %d\n", filename, rc);
1515 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1516 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1518 lmmsize = body->mbo_eadatasize;
/* No striping EA present => nothing to return. */
1520 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1522 GOTO(out, rc = -ENODATA);
1525 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1526 LASSERT(lmm != NULL);
/* Only plain V1/V3 layouts are understood here. */
1528 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1529 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1530 GOTO(out, rc = -EPROTO);
1534 * This is coming from the MDS, so is probably in
1535 * little endian. We convert it to host endian before
1536 * passing it to userspace.
/* Swab only needed on big-endian hosts (LOV_MAGIC differs from its LE form). */
1538 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1541 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1542 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1545 /* if function called for directory - we should
1546 * avoid swabbing non-existent lsm objects */
1547 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1548 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1549 if (S_ISREG(body->mbo_mode))
1550 lustre_swab_lov_user_md_objects(
1551 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1553 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1554 lustre_swab_lov_user_md_v3(
1555 (struct lov_user_md_v3 *)lmm);
1556 if (S_ISREG(body->mbo_mode))
1557 lustre_swab_lov_user_md_objects(
1558 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1565 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: copy a lov_user_md (with one OST object
 * descriptor) from userspace and apply it via ll_lov_setstripe_ea_info().
 * Restricted to CAP_SYS_ADMIN since it specifies pre-existing objects.
 */
1570 static int ll_lov_setea(struct inode *inode, struct file *file,
1573 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1574 struct lov_user_md *lump;
1575 int lum_size = sizeof(struct lov_user_md) +
1576 sizeof(struct lov_user_ost_data);
1580 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1583 OBD_ALLOC_LARGE(lump, lum_size);
1587 if (copy_from_user(lump, (struct lov_user_md __user *)arg, lum_size))
1588 GOTO(out_lump, rc = -EFAULT);
1590 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1593 OBD_FREE_LARGE(lump, lum_size);
/*
 * Copy the file's striping information to the userspace buffer @lum by
 * querying the cl_object layer.
 */
1597 static int ll_file_getstripe(struct inode *inode,
1598 struct lov_user_md __user *lum)
1605 env = cl_env_get(&refcheck);
1607 RETURN(PTR_ERR(env));
1609 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum);
1610 cl_env_put(env, &refcheck);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy the user's lov_user_md, apply the
 * layout, then refresh the layout generation and echo the instantiated
 * striping back to userspace.
 */
1614 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1617 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1618 struct lov_user_md *klum;
1620 __u64 flags = FMODE_WRITE;
1623 rc = ll_copy_user_md(lum, &klum);
1628 rc = ll_lov_setstripe_ea_info(inode, file, flags, klum, lum_size);
/* NOTE(review): put_user() result appears unchecked here — presumably
 * intentional best-effort; verify against upstream before changing. */
1632 put_user(0, &lum->lmm_stripe_count);
1634 ll_layout_refresh(inode, &gen);
1635 rc = ll_file_getstripe(inode, (struct lov_user_md __user *)arg);
1638 OBD_FREE(klum, lum_size);
/*
 * LL_IOC_GROUP_LOCK handler: take a group lock with group id @arg on the
 * file.  Only one group lock per file descriptor is allowed; the flag and
 * handle are stored in the ll_file_data under lli_lock.
 */
1643 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1645 struct ll_inode_info *lli = ll_i2info(inode);
1646 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1647 struct ll_grouplock grouplock;
1652 CWARN("group id for group lock must not be 0\n");
1656 if (ll_file_nolock(file))
1657 RETURN(-EOPNOTSUPP);
/* Reject a second group lock on the same file descriptor. */
1659 spin_lock(&lli->lli_lock);
1660 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1661 CWARN("group lock already existed with gid %lu\n",
1662 fd->fd_grouplock.lg_gid);
1663 spin_unlock(&lli->lli_lock);
1666 LASSERT(fd->fd_grouplock.lg_lock == NULL);
1667 spin_unlock(&lli->lli_lock);
/* Acquire the group lock outside lli_lock (may block/enqueue). */
1669 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
1670 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* Re-check under the spinlock: another thread may have raced us. */
1674 spin_lock(&lli->lli_lock);
1675 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1676 spin_unlock(&lli->lli_lock);
1677 CERROR("another thread just won the race\n");
1678 cl_put_grouplock(&grouplock);
1682 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1683 fd->fd_grouplock = grouplock;
1684 spin_unlock(&lli->lli_lock);
1686 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK handler: release the group lock with id @arg held
 * by this file descriptor, validating that one is held and the id matches.
 */
1690 static int ll_put_grouplock(struct inode *inode, struct file *file,
1693 struct ll_inode_info *lli = ll_i2info(inode);
1694 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1695 struct ll_grouplock grouplock;
1698 spin_lock(&lli->lli_lock);
1699 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1700 spin_unlock(&lli->lli_lock);
1701 CWARN("no group lock held\n");
1705 LASSERT(fd->fd_grouplock.lg_lock != NULL);
/* The id passed to unlock must match the one the lock was taken with. */
1707 if (fd->fd_grouplock.lg_gid != arg) {
1708 CWARN("group lock %lu doesn't match current id %lu\n",
1709 arg, fd->fd_grouplock.lg_gid);
1710 spin_unlock(&lli->lli_lock);
/* Detach state under the spinlock, drop the lock itself outside it. */
1714 grouplock = fd->fd_grouplock;
1715 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1716 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1717 spin_unlock(&lli->lli_lock);
1719 cl_put_grouplock(&grouplock);
1720 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1725 * Close inode open handle
1727 * \param dentry [in] dentry which contains the inode
1728 * \param it [in,out] intent which contains open info and result
1731 * \retval <0 failure
/*
 * Close the MDS open handle carried in intent @it (if any) for the inode
 * behind @dentry, and drop the intent's open-reference request.
 */
1733 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1735 struct inode *inode = dentry->d_inode;
1736 struct obd_client_handle *och;
1742 /* Root ? Do nothing. */
1743 if (dentry->d_inode->i_sb->s_root == dentry)
1746 /* No open handle to close? Move away */
1747 if (!it_disposition(it, DISP_OPEN_OPEN))
1750 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1752 OBD_ALLOC(och, sizeof(*och));
1754 GOTO(out, rc = -ENOMEM);
/* Fill the client handle from the intent, then send the close RPC. */
1756 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1758 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
1760 /* this one is in place of ll_file_open */
1761 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1762 ptlrpc_req_finished(it->it_request);
1763 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1769 * Get size for inode for which FIEMAP mapping is requested.
1770 * Make the FIEMAP get_info call and returns the result.
1771 * \param fiemap kernel buffer to hold extents
1772 * \param num_bytes kernel buffer size
/*
 * Service a FIEMAP request: validate flags, optionally flush dirty pages
 * (FIEMAP_FLAG_SYNC), glimpse the size if unknown, then delegate the
 * extent mapping to cl_object_fiemap().
 */
1774 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
1780 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
1783 /* Checks for fiemap flags */
1784 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* Report back which flags we do NOT support. */
1785 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1789 /* Check for FIEMAP_FLAG_SYNC */
1790 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1791 rc = filemap_fdatawrite(inode->i_mapping);
1796 env = cl_env_get(&refcheck);
1798 RETURN(PTR_ERR(env));
/* Size may be stale/unknown; glimpse it from the OSTs first. */
1800 if (i_size_read(inode) == 0) {
1801 rc = ll_glimpse_size(inode);
1806 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1807 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
1808 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
1810 /* If filesize is 0, then there would be no objects for mapping */
1811 if (fmkey.lfik_oa.o_size == 0) {
1812 fiemap->fm_mapped_extents = 0;
1816 fmkey.lfik_fiemap = *fiemap;
1818 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
1819 &fmkey, fiemap, &num_bytes);
1821 cl_env_put(env, &refcheck);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a path via the MDC.  The
 * user-supplied getinfo_fid2path is copied in, augmented with the root
 * FID (for fileset-aware servers), sent to the MDT, and copied back.
 */
1825 int ll_fid2path(struct inode *inode, void __user *arg)
1827 struct obd_export *exp = ll_i2mdexp(inode);
1828 const struct getinfo_fid2path __user *gfin = arg;
1830 struct getinfo_fid2path *gfout;
/* FID->path reveals namespace structure; restrict unless mount allows it. */
1836 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1837 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1840 /* Only need to get the buflen */
1841 if (get_user(pathlen, &gfin->gf_pathlen))
1844 if (pathlen > PATH_MAX)
1847 outsize = sizeof(*gfout) + pathlen;
1848 OBD_ALLOC(gfout, outsize);
1852 if (copy_from_user(gfout, arg, sizeof(*gfout)))
1853 GOTO(gf_free, rc = -EFAULT);
1854 /* append root FID after gfout to let MDT know the root FID so that it
1855 * can lookup the correct path, this is mainly for fileset.
1856 * old server without fileset mount support will ignore this. */
1857 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
1859 /* Call mdc_iocontrol */
1860 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1864 if (copy_to_user(arg, gfout, outsize))
1868 OBD_FREE(gfout, outsize);
1873 * Read the data_version for inode.
1875 * This value is computed using stripe object version on OST.
1876 * Version is computed using server side locking.
1878 * @param flags if do sync on the OST side;
1880 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1881 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/*
 * Obtain the file's data version via a CIT_DATA_VERSION cl_io.  @flags
 * selects optional OST-side flushing (LL_DV_RD_FLUSH / LL_DV_WR_FLUSH).
 * A file with no cl_object is reported as version 0.
 */
1883 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
1885 struct cl_object *obj = ll_i2info(inode)->lli_clob;
1893 /* If no file object initialized, we consider its version is 0. */
1899 env = cl_env_get(&refcheck);
1901 RETURN(PTR_ERR(env));
1903 io = vvp_env_thread_io(env);
1905 io->u.ci_data_version.dv_data_version = 0;
1906 io->u.ci_data_version.dv_flags = flags;
1909 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
1910 result = cl_io_loop(env, io);
1912 result = io->ci_result;
1914 *data_version = io->u.ci_data_version.dv_data_version;
1916 cl_io_fini(env, io);
/* Layout may have changed mid-IO; retry the whole io in that case. */
1918 if (unlikely(io->ci_need_restart))
1921 cl_env_put(env, &refcheck);
1927 * Trigger a HSM release request for the provided inode.
/*
 * Trigger an HSM release for @inode: take a write lease, flush and grab
 * the latest data version and [am]time, then close the handle with
 * MDS_HSM_RELEASE so the MDT can punch the file's data objects.
 */
1929 int ll_hsm_release(struct inode *inode)
1932 struct obd_client_handle *och = NULL;
1933 __u64 data_version = 0;
1938 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1939 ll_get_fsname(inode->i_sb, NULL, 0),
1940 PFID(&ll_i2info(inode)->lli_fid));
/* An exclusive write lease guards against concurrent modification. */
1942 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
1944 GOTO(out, rc = PTR_ERR(och));
1946 /* Grab latest data_version and [am]time values */
1947 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
1951 env = cl_env_get(&refcheck);
1953 GOTO(out, rc = PTR_ERR(env));
1955 ll_merge_attr(env, inode);
1956 cl_env_put(env, &refcheck);
1958 /* Release the file.
1959 * NB: lease lock handle is released in mdc_hsm_release_pack() because
1960 * we still need it to pack l_remote_handle to MDT. */
1961 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
1967 if (och != NULL && !IS_ERR(och)) /* close the file */
1968 ll_lease_close(och, inode, NULL);
/* Working state for ll_swap_layouts(): holds both inodes (and, per the
 * sampled view, presumably the data versions/check flags) so the pair can
 * be reordered by FID before locking. */
1973 struct ll_swap_stack {
1976 struct inode *inode1;
1977 struct inode *inode2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS implementation: atomically swap the layouts of
 * two files on the MDT.  Files are ordered by FID to avoid lock inversion,
 * optional group locks flush dirty caches, and optional data-version
 * checks abort with -EAGAIN if either file changed since the caller
 * sampled its version.
 */
1982 static int ll_swap_layouts(struct file *file1, struct file *file2,
1983 struct lustre_swap_layouts *lsl)
1985 struct mdc_swap_layouts msl;
1986 struct md_op_data *op_data;
1989 struct ll_swap_stack *llss = NULL;
1992 OBD_ALLOC_PTR(llss);
1996 llss->inode1 = file1->f_path.dentry->d_inode;
1997 llss->inode2 = file2->f_path.dentry->d_inode;
1999 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2003 /* we use 2 bool because it is easier to swap than 2 bits */
2004 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2005 llss->check_dv1 = true;
2007 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2008 llss->check_dv2 = true;
2010 /* we cannot use lsl->sl_dvX directly because we may swap them */
2011 llss->dv1 = lsl->sl_dv1;
2012 llss->dv2 = lsl->sl_dv2;
2014 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2015 if (rc == 0) /* same file, done! */
/* Order the pair by FID so concurrent swaps lock in the same order. */
2018 if (rc < 0) { /* sequentialize it */
2019 swap(llss->inode1, llss->inode2);
2021 swap(llss->dv1, llss->dv2);
2022 swap(llss->check_dv1, llss->check_dv2);
2026 if (gid != 0) { /* application asks to flush dirty cache */
2027 rc = ll_get_grouplock(llss->inode1, file1, gid);
2031 rc = ll_get_grouplock(llss->inode2, file2, gid);
2033 ll_put_grouplock(llss->inode1, file1, gid);
2038 /* ultimate check, before swapping the layouts we check if
2039 * dataversion has changed (if requested) */
2040 if (llss->check_dv1) {
2041 rc = ll_data_version(llss->inode1, &dv, 0);
2044 if (dv != llss->dv1)
2045 GOTO(putgl, rc = -EAGAIN);
2048 if (llss->check_dv2) {
2049 rc = ll_data_version(llss->inode2, &dv, 0);
2052 if (dv != llss->dv2)
2053 GOTO(putgl, rc = -EAGAIN);
2056 /* struct md_op_data is used to send the swap args to the mdt
2057 * only flags is missing, so we use struct mdc_swap_layouts
2058 * through the md_op_data->op_data */
2059 /* flags from user space have to be converted before they are sent to
2060 * server, no flag is sent today, they are only used on the client */
2063 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2064 0, LUSTRE_OPC_ANY, &msl);
2065 if (IS_ERR(op_data))
2066 GOTO(free, rc = PTR_ERR(op_data));
2068 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2069 sizeof(*op_data), op_data, NULL);
2070 ll_finish_md_op_data(op_data);
/* Drop group locks in reverse acquisition order. */
2077 ll_put_grouplock(llss->inode2, file2, gid);
2078 ll_put_grouplock(llss->inode1, file1, gid);
/*
 * Set/clear HSM state flags on @inode via the MDT.  Validates the masks
 * (range, user-settable bits, archive id) before sending the request.
 */
2088 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2090 struct md_op_data *op_data;
2094 /* Detect out-of range masks */
2095 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2098 /* Non-root users are forbidden to set or clear flags which are
2099 * NOT defined in HSM_USER_MASK. */
2100 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2101 !cfs_capable(CFS_CAP_SYS_ADMIN))
2104 /* Detect out-of range archive id */
2105 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2106 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2109 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2110 LUSTRE_OPC_ANY, hss);
2111 if (IS_ERR(op_data))
2112 RETURN(PTR_ERR(op_data));
2114 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2115 sizeof(*op_data), op_data, NULL);
2117 ll_finish_md_op_data(op_data);
/*
 * LL_IOC_HSM_IMPORT handler: register a pre-existing archived file.  Marks
 * the file ARCHIVED|EXISTS|RELEASED in the given archive, then restores
 * the user-supplied mode/ownership/size/timestamps via ll_setattr_raw().
 * Regular files only.
 */
2122 static int ll_hsm_import(struct inode *inode, struct file *file,
2123 struct hsm_user_import *hui)
2125 struct hsm_state_set *hss = NULL;
2126 struct iattr *attr = NULL;
2130 if (!S_ISREG(inode->i_mode))
2136 GOTO(out, rc = -ENOMEM);
2138 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2139 hss->hss_archive_id = hui->hui_archive_id;
2140 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2141 rc = ll_hsm_state_set(inode, hss);
2145 OBD_ALLOC_PTR(attr);
2147 GOTO(out, rc = -ENOMEM);
/* Rebuild the inode attributes from the import descriptor. */
2149 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2150 attr->ia_mode |= S_IFREG;
2151 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2152 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2153 attr->ia_size = hui->hui_size;
2154 attr->ia_mtime.tv_sec = hui->hui_mtime;
2155 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2156 attr->ia_atime.tv_sec = hui->hui_atime;
2157 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2159 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2160 ATTR_UID | ATTR_GID |
2161 ATTR_MTIME | ATTR_MTIME_SET |
2162 ATTR_ATIME | ATTR_ATIME_SET;
2164 mutex_lock(&inode->i_mutex);
2166 rc = ll_setattr_raw(file->f_path.dentry, attr, true);
2170 mutex_unlock(&inode->i_mutex);
/* Translate an open fmode into the LL_LEASE_{RD,WR}LCK bit mask reported
 * to userspace by the lease ioctls. */
2182 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2184 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2185 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * LL_IOC_FUTIMES_3 handler: set atime, mtime AND ctime explicitly on a
 * regular file (used e.g. by HSM copytools to restore all three times).
 * Requires CAP_SYS_ADMIN since ctime is normally kernel-managed.
 */
2188 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2190 struct inode *inode = file->f_path.dentry->d_inode;
2192 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2193 ATTR_MTIME | ATTR_MTIME_SET |
2194 ATTR_CTIME | ATTR_CTIME_SET,
2196 .tv_sec = lfu->lfu_atime_sec,
2197 .tv_nsec = lfu->lfu_atime_nsec,
2200 .tv_sec = lfu->lfu_mtime_sec,
2201 .tv_nsec = lfu->lfu_mtime_nsec,
2204 .tv_sec = lfu->lfu_ctime_sec,
2205 .tv_nsec = lfu->lfu_ctime_nsec,
2211 if (!capable(CAP_SYS_ADMIN))
2214 if (!S_ISREG(inode->i_mode))
2217 mutex_lock(&inode->i_mutex);
2218 rc = ll_setattr_raw(file->f_path.dentry, &ia, false);
2219 mutex_unlock(&inode->i_mutex);
2225 * Give file access advices
2227 * The ladvise interface is similar to Linux fadvise() system call, except it
2228 * forwards the advices directly from Lustre client to server. The server side
2229 * codes will apply appropriate read-ahead and caching techniques for the
2230 * corresponding files.
2232 * A typical workload for ladvise is e.g. a bunch of different clients are
2233 * doing small random reads of a file, so prefetching pages into OSS cache
2234 * with big linear reads before the random IO is a net benefit. Fetching
2235 * all that data into each client cache with fadvise() may not be, due to
2236 * much more data being sent to the client.
/*
 * Send one lu_ladvise hint to the server by running a CIT_LADVISE cl_io
 * over the advised [lla_start, lla_end] range of the file.
 */
2238 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2239 struct lu_ladvise *ladvise)
2243 struct cl_ladvise_io *lio;
2248 env = cl_env_get(&refcheck);
2250 RETURN(PTR_ERR(env));
2252 io = vvp_env_thread_io(env);
2253 io->ci_obj = ll_i2info(inode)->lli_clob;
2255 /* initialize parameters for ladvise */
2256 lio = &io->u.ci_ladvise;
2257 lio->li_start = ladvise->lla_start;
2258 lio->li_end = ladvise->lla_end;
2259 lio->li_fid = ll_inode2fid(inode);
2260 lio->li_advice = ladvise->lla_advice;
2261 lio->li_flags = flags;
2263 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2264 rc = cl_io_loop(env, io);
2268 cl_io_fini(env, io);
2269 cl_env_put(env, &refcheck);
/*
 * Main ioctl dispatcher for regular files: handles striping (SETSTRIPE/
 * GETSTRIPE/SETEA/SWAP_LAYOUTS), group locks, FID/path translation, data
 * version, HSM (state get/set, action, import, release via lease), leases,
 * futimes, ladvise, and finally falls through to registered ioctl handlers
 * and the generic obd_iocontrol() path.
 */
2274 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2276 struct inode *inode = file->f_path.dentry->d_inode;
2277 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2281 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2282 PFID(ll_inode2fid(inode)), inode, cmd);
2283 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2285 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2286 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2290 case LL_IOC_GETFLAGS:
2291 /* Get the current value of the file flags */
2292 return put_user(fd->fd_flags, (int __user *)arg);
2293 case LL_IOC_SETFLAGS:
2294 case LL_IOC_CLRFLAGS:
2295 /* Set or clear specific file flags */
2296 /* XXX This probably needs checks to ensure the flags are
2297 * not abused, and to handle any flag side effects.
2299 if (get_user(flags, (int __user *) arg))
2302 if (cmd == LL_IOC_SETFLAGS) {
2303 if ((flags & LL_FILE_IGNORE_LOCK) &&
2304 !(file->f_flags & O_DIRECT)) {
2305 CERROR("%s: unable to disable locking on "
2306 "non-O_DIRECT file\n", current->comm);
2310 fd->fd_flags |= flags;
2312 fd->fd_flags &= ~flags;
2315 case LL_IOC_LOV_SETSTRIPE:
2316 RETURN(ll_lov_setstripe(inode, file, arg));
2317 case LL_IOC_LOV_SETEA:
2318 RETURN(ll_lov_setea(inode, file, arg));
2319 case LL_IOC_LOV_SWAP_LAYOUTS: {
2321 struct lustre_swap_layouts lsl;
2323 if (copy_from_user(&lsl, (char __user *)arg,
2324 sizeof(struct lustre_swap_layouts)))
/* Both files must be writable for a layout swap. */
2327 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
2330 file2 = fget(lsl.sl_fd);
2334 /* O_WRONLY or O_RDWR */
2335 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
2336 GOTO(out, rc = -EPERM);
2338 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
2339 struct inode *inode2;
2340 struct ll_inode_info *lli;
2341 struct obd_client_handle *och = NULL;
2343 if (lsl.sl_flags != SWAP_LAYOUTS_CLOSE)
2344 GOTO(out, rc = -EINVAL);
/* SWAP_LAYOUTS_CLOSE consumes this fd's lease handle. */
2346 lli = ll_i2info(inode);
2347 mutex_lock(&lli->lli_och_mutex);
2348 if (fd->fd_lease_och != NULL) {
2349 och = fd->fd_lease_och;
2350 fd->fd_lease_och = NULL;
2352 mutex_unlock(&lli->lli_och_mutex);
2354 GOTO(out, rc = -ENOLCK);
2355 inode2 = file2->f_path.dentry->d_inode;
2356 rc = ll_swap_layouts_close(och, inode, inode2);
2358 rc = ll_swap_layouts(file, file2, &lsl);
2364 case LL_IOC_LOV_GETSTRIPE:
2365 RETURN(ll_file_getstripe(inode,
2366 (struct lov_user_md __user *)arg));
2367 case FSFILT_IOC_GETFLAGS:
2368 case FSFILT_IOC_SETFLAGS:
2369 RETURN(ll_iocontrol(inode, file, cmd, arg));
2370 case FSFILT_IOC_GETVERSION_OLD:
2371 case FSFILT_IOC_GETVERSION:
2372 RETURN(put_user(inode->i_generation, (int __user *)arg));
2373 case LL_IOC_GROUP_LOCK:
2374 RETURN(ll_get_grouplock(inode, file, arg));
2375 case LL_IOC_GROUP_UNLOCK:
2376 RETURN(ll_put_grouplock(inode, file, arg));
2377 case IOC_OBD_STATFS:
2378 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2380 /* We need to special case any other ioctls we want to handle,
2381 * to send them to the MDS/OST as appropriate and to properly
2382 * network encode the arg field.
2383 case FSFILT_IOC_SETVERSION_OLD:
2384 case FSFILT_IOC_SETVERSION:
2386 case LL_IOC_FLUSHCTX:
2387 RETURN(ll_flush_ctx(inode));
2388 case LL_IOC_PATH2FID: {
2389 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2390 sizeof(struct lu_fid)))
2395 case LL_IOC_GETPARENT:
2396 RETURN(ll_getparent(file, (struct getparent __user *)arg));
2398 case OBD_IOC_FID2PATH:
2399 RETURN(ll_fid2path(inode, (void __user *)arg));
2400 case LL_IOC_DATA_VERSION: {
2401 struct ioc_data_version idv;
2404 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
/* Mask user flags down to the supported flush modes. */
2407 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2408 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2411 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2417 case LL_IOC_GET_MDTIDX: {
2420 mdtidx = ll_get_mdt_idx(inode);
2424 if (put_user((int)mdtidx, (int __user *)arg))
2429 case OBD_IOC_GETDTNAME:
2430 case OBD_IOC_GETMDNAME:
2431 RETURN(ll_get_obd_name(inode, cmd, arg));
2432 case LL_IOC_HSM_STATE_GET: {
2433 struct md_op_data *op_data;
2434 struct hsm_user_state *hus;
2441 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2442 LUSTRE_OPC_ANY, hus);
2443 if (IS_ERR(op_data)) {
2445 RETURN(PTR_ERR(op_data));
2448 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2451 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2454 ll_finish_md_op_data(op_data);
2458 case LL_IOC_HSM_STATE_SET: {
2459 struct hsm_state_set *hss;
2466 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2471 rc = ll_hsm_state_set(inode, hss);
2476 case LL_IOC_HSM_ACTION: {
2477 struct md_op_data *op_data;
2478 struct hsm_current_action *hca;
2485 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2486 LUSTRE_OPC_ANY, hca);
2487 if (IS_ERR(op_data)) {
2489 RETURN(PTR_ERR(op_data));
2492 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2495 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2498 ll_finish_md_op_data(op_data);
2502 case LL_IOC_SET_LEASE: {
2503 struct ll_inode_info *lli = ll_i2info(inode);
2504 struct obd_client_handle *och = NULL;
/* Lease mode requested must be compatible with the open mode. */
2509 case LL_LEASE_WRLCK:
2510 if (!(file->f_mode & FMODE_WRITE))
2512 fmode = FMODE_WRITE;
2514 case LL_LEASE_RDLCK:
2515 if (!(file->f_mode & FMODE_READ))
2519 case LL_LEASE_UNLCK:
2520 mutex_lock(&lli->lli_och_mutex);
2521 if (fd->fd_lease_och != NULL) {
2522 och = fd->fd_lease_och;
2523 fd->fd_lease_och = NULL;
2525 mutex_unlock(&lli->lli_och_mutex);
2530 fmode = och->och_flags;
2531 rc = ll_lease_close(och, inode, &lease_broken);
/* Report the lease type that was just released. */
2538 RETURN(ll_lease_type_from_fmode(fmode));
2543 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2545 /* apply for lease */
2546 och = ll_lease_open(inode, file, fmode, 0);
2548 RETURN(PTR_ERR(och));
2551 mutex_lock(&lli->lli_och_mutex);
2552 if (fd->fd_lease_och == NULL) {
2553 fd->fd_lease_och = och;
2556 mutex_unlock(&lli->lli_och_mutex);
2558 /* impossible now that only excl is supported for now */
2559 ll_lease_close(och, inode, &lease_broken);
2564 case LL_IOC_GET_LEASE: {
2565 struct ll_inode_info *lli = ll_i2info(inode);
2566 struct ldlm_lock *lock = NULL;
2569 mutex_lock(&lli->lli_och_mutex);
2570 if (fd->fd_lease_och != NULL) {
2571 struct obd_client_handle *och = fd->fd_lease_och;
/* Lease is only still valid if the DLM lock wasn't cancelled. */
2573 lock = ldlm_handle2lock(&och->och_lease_handle);
2575 lock_res_and_lock(lock);
2576 if (!ldlm_is_cancel(lock))
2577 fmode = och->och_flags;
2579 unlock_res_and_lock(lock);
2580 LDLM_LOCK_PUT(lock);
2583 mutex_unlock(&lli->lli_och_mutex);
2585 RETURN(ll_lease_type_from_fmode(fmode));
2587 case LL_IOC_HSM_IMPORT: {
2588 struct hsm_user_import *hui;
2594 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
2599 rc = ll_hsm_import(inode, file, hui);
2604 case LL_IOC_FUTIMES_3: {
2605 struct ll_futimes_3 lfu;
2607 if (copy_from_user(&lfu,
2608 (const struct ll_futimes_3 __user *)arg,
2612 RETURN(ll_file_futimes_3(file, &lfu));
2614 case LL_IOC_LADVISE: {
2615 struct ladvise_hdr *ladvise_hdr;
2618 int alloc_size = sizeof(*ladvise_hdr);
/* First read just the header to learn how many advices follow. */
2621 OBD_ALLOC_PTR(ladvise_hdr);
2622 if (ladvise_hdr == NULL)
2625 if (copy_from_user(ladvise_hdr,
2626 (const struct ladvise_hdr __user *)arg,
2628 GOTO(out_ladvise, rc = -EFAULT);
2630 if (ladvise_hdr->lah_magic != LADVISE_MAGIC ||
2631 ladvise_hdr->lah_count < 1)
2632 GOTO(out_ladvise, rc = -EINVAL);
2634 num_advise = ladvise_hdr->lah_count;
2635 if (num_advise >= LAH_COUNT_MAX)
2636 GOTO(out_ladvise, rc = -EFBIG);
/* Reallocate with room for the full advice array and re-copy. */
2638 OBD_FREE_PTR(ladvise_hdr);
2639 alloc_size = offsetof(typeof(*ladvise_hdr),
2640 lah_advise[num_advise]);
2641 OBD_ALLOC(ladvise_hdr, alloc_size);
2642 if (ladvise_hdr == NULL)
2646 * TODO: submit multiple advices to one server in a single RPC
2648 if (copy_from_user(ladvise_hdr,
2649 (const struct ladvise_hdr __user *)arg,
2651 GOTO(out_ladvise, rc = -EFAULT);
2653 for (i = 0; i < num_advise; i++) {
2654 rc = ll_ladvise(inode, file, ladvise_hdr->lah_flags,
2655 &ladvise_hdr->lah_advise[i]);
2661 OBD_FREE(ladvise_hdr, alloc_size);
/* Unknown cmd: try dynamically registered handlers, then pass
 * through to the data export. */
2668 ll_iocontrol_call(inode, file, cmd, arg, &err))
2671 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2672 (void __user *)arg));
2677 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Validate @offset against [0 (unless FMODE_UNSIGNED_OFFSET), maxsize]
 * and commit it to file->f_pos, resetting f_version on change.  Local
 * copy for kernels lacking generic_file_llseek_size().
 */
2678 static inline loff_t
2679 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2681 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2683 if (offset > maxsize)
2686 if (offset != file->f_pos) {
2687 file->f_pos = offset;
2688 file->f_version = 0;
/*
 * Backport of the kernel's generic_file_llseek_size(): handle SEEK_SET/
 * CUR/END plus SEEK_DATA/SEEK_HOLE against a caller-supplied @maxsize and
 * @eof, serializing SEEK_CUR updates against concurrent seeks.
 */
2694 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2695 loff_t maxsize, loff_t eof)
2697 struct inode *inode = file->f_path.dentry->d_inode;
2705 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2706 * position-querying operation. Avoid rewriting the "same"
2707 * f_pos value back to the file because a concurrent read(),
2708 * write() or lseek() might have altered it
2713 * f_lock protects against read/modify/write race with other
2714 * SEEK_CURs. Note that parallel writes and reads behave
2717 mutex_lock(&inode->i_mutex);
2718 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2719 mutex_unlock(&inode->i_mutex);
2723 * In the generic case the entire file is data, so as long as
2724 * offset isn't at the end of the file then the offset is data.
2731 * There is a virtual hole at the end of the file, so as long as
2732 * offset isn't i_size or larger, return i_size.
2740 return llseek_execute(file, offset, maxsize);
/* llseek handler for Lustre files: glimpse the size from the OSTs when
 * the target depends on EOF (SEEK_END/HOLE/DATA), then delegate to the
 * generic llseek-with-size helper. */
2744 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2746 struct inode *inode = file->f_path.dentry->d_inode;
2747 loff_t retval, eof = 0;
/* retval here is only the debug-printed target offset estimate. */
2750 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2751 (origin == SEEK_CUR) ? file->f_pos : 0);
2752 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2753 PFID(ll_inode2fid(inode)), inode, retval, retval,
2755 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
/* EOF-relative seeks need an up-to-date size from the servers. */
2757 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2758 retval = ll_glimpse_size(inode);
2761 eof = i_size_read(inode);
2764 retval = ll_generic_file_llseek_size(file, offset, origin,
2765 ll_file_maxbytes(inode), eof);
/* flush() handler: report (and clear) any async writeback error that was
 * recorded for this inode, without issuing new I/O.  Returns -EIO if an
 * unreported async error is pending, 0 otherwise. */
2769 static int ll_flush(struct file *file, fl_owner_t id)
2771 struct inode *inode = file->f_path.dentry->d_inode;
2772 struct ll_inode_info *lli = ll_i2info(inode);
2773 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2776 LASSERT(!S_ISDIR(inode->i_mode));
2778 /* catch async errors that were recorded back when async writeback
2779 * failed for pages in this mapping. */
2780 rc = lli->lli_async_rc;
2781 lli->lli_async_rc = 0;
2782 if (lli->lli_clob != NULL) {
2783 err = lov_read_and_clear_async_rc(lli->lli_clob);
2788 /* The application has been told write failure already.
2789 * Do not report failure again. */
2790 if (fd->fd_write_failed)
/* Collapse any recorded error into -EIO for the VFS. */
2792 return rc ? -EIO : 0;
2796 * Called to make sure a portion of file has been written out.
2797 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2799 * Return how many pages have been written.
2801 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2802 enum cl_fsync_mode mode, int ignore_layout)
2806 struct cl_fsync_io *fio;
/* Reject any mode outside the known cl_fsync_mode set. */
2811 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2812 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2815 env = cl_env_get(&refcheck);
2817 RETURN(PTR_ERR(env));
/* Build a CIT_FSYNC cl_io against the file's cl_object. */
2819 io = vvp_env_thread_io(env);
2820 io->ci_obj = ll_i2info(inode)->lli_clob;
2821 io->ci_ignore_layout = ignore_layout;
2823 /* initialize parameters for sync */
2824 fio = &io->u.ci_fsync;
2825 fio->fi_start = start;
2827 fio->fi_fid = ll_inode2fid(inode);
2828 fio->fi_mode = mode;
2829 fio->fi_nr_written = 0;
2831 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2832 result = cl_io_loop(env, io);
2834 result = io->ci_result;
/* On success the result is the number of pages written out. */
2836 result = fio->fi_nr_written;
2837 cl_io_fini(env, io);
2838 cl_env_put(env, &refcheck);
2844 * When dentry is provided (the 'else' case), *file->f_path.dentry may be
2845 * null and dentry must be used directly rather than pulled from
2846 * *file->f_path.dentry as is done otherwise.
/* fsync() handler; signature varies with kernel version, selected by
 * the HAVE_FILE_FSYNC_* autoconf checks. */
2849 #ifdef HAVE_FILE_FSYNC_4ARGS
2850 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2852 struct dentry *dentry = file->f_path.dentry;
2853 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2854 int ll_fsync(struct file *file, int datasync)
2856 struct dentry *dentry = file->f_path.dentry;
2858 loff_t end = LLONG_MAX;
2860 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2863 loff_t end = LLONG_MAX;
2865 struct inode *inode = dentry->d_inode;
2866 struct ll_inode_info *lli = ll_i2info(inode);
2867 struct ptlrpc_request *req;
2871 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2872 PFID(ll_inode2fid(inode)), inode);
2873 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2875 #ifdef HAVE_FILE_FSYNC_4ARGS
/* New kernels: flush and wait the range ourselves, under i_mutex. */
2876 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2877 mutex_lock(&inode->i_mutex);
2879 /* fsync's caller has already called _fdata{sync,write}, we want
2880 * that IO to finish before calling the osc and mdc sync methods */
2881 rc = filemap_fdatawait(inode->i_mapping);
2884 /* catch async errors that were recorded back when async writeback
2885 * failed for pages in this mapping. */
2886 if (!S_ISDIR(inode->i_mode)) {
2887 err = lli->lli_async_rc;
2888 lli->lli_async_rc = 0;
2891 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* Sync metadata on the MDT... */
2896 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
2900 ptlrpc_req_finished(req);
/* ...then sync data on the OSTs for regular files, tracking the
 * per-fd write-failure flag used by ll_flush(). */
2902 if (S_ISREG(inode->i_mode)) {
2903 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2905 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2906 if (rc == 0 && err < 0)
2909 fd->fd_write_failed = true;
2911 fd->fd_write_failed = false;
2914 #ifdef HAVE_FILE_FSYNC_4ARGS
2915 mutex_unlock(&inode->i_mutex);
/* flock()/fcntl() lock handler: translate the VFS file_lock into an
 * LDLM_FLOCK enqueue on the MDT, then mirror the result into the local
 * lock tables so the kernel's deadlock/posix bookkeeping stays correct. */
2921 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2923 struct inode *inode = file->f_path.dentry->d_inode;
2924 struct ll_sb_info *sbi = ll_i2sbi(inode);
2925 struct ldlm_enqueue_info einfo = {
2926 .ei_type = LDLM_FLOCK,
2927 .ei_cb_cp = ldlm_flock_completion_ast,
2928 .ei_cbdata = file_lock,
2930 struct md_op_data *op_data;
2931 struct lustre_handle lockh = { 0 };
2932 union ldlm_policy_data flock = { { 0 } };
2933 int fl_type = file_lock->fl_type;
2939 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
2940 PFID(ll_inode2fid(inode)), file_lock);
2942 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
/* Map BSD flock vs POSIX lock semantics onto the ldlm policy. */
2944 if (file_lock->fl_flags & FL_FLOCK) {
2945 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2946 /* flocks are whole-file locks */
2947 flock.l_flock.end = OFFSET_MAX;
2948 /* For flocks owner is determined by the local file descriptor*/
2949 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2950 } else if (file_lock->fl_flags & FL_POSIX) {
2951 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2952 flock.l_flock.start = file_lock->fl_start;
2953 flock.l_flock.end = file_lock->fl_end;
2957 flock.l_flock.pid = file_lock->fl_pid;
2959 /* Somewhat ugly workaround for svc lockd.
2960 * lockd installs custom fl_lmops->lm_compare_owner that checks
2961 * for the fl_owner to be the same (which it always is on local node
2962 * I guess between lockd processes) and then compares pid.
2963 * As such we assign pid to the owner field to make it all work,
2964 * conflict with normal locks is unlikely since pid space and
2965 * pointer space for current->files are not intersecting */
2966 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2967 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* Translate the fcntl lock type to an LDLM mode:
 * read -> PR, write -> PW, unlock -> NL (see comment below). */
2971 einfo.ei_mode = LCK_PR;
2974 /* An unlock request may or may not have any relation to
2975 * existing locks so we may not be able to pass a lock handle
2976 * via a normal ldlm_lock_cancel() request. The request may even
2977 * unlock a byte range in the middle of an existing lock. In
2978 * order to process an unlock request we need all of the same
2979 * information that is given with a normal read or write record
2980 * lock request. To avoid creating another ldlm unlock (cancel)
2981 * message we'll treat a LCK_NL flock request as an unlock. */
2982 einfo.ei_mode = LCK_NL;
2985 einfo.ei_mode = LCK_PW;
2988 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* Translate the fcntl command: non-blocking -> BLOCK_NOWAIT,
 * F_GETLK-style queries -> TEST_LOCK. */
3003 flags = LDLM_FL_BLOCK_NOWAIT;
3009 flags = LDLM_FL_TEST_LOCK;
3012 CERROR("unknown fcntl lock command: %d\n", cmd);
3016 /* Save the old mode so that if the mode in the lock changes we
3017 * can decrement the appropriate reader or writer refcount. */
3018 file_lock->fl_type = einfo.ei_mode;
3020 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3021 LUSTRE_OPC_ANY, NULL);
3022 if (IS_ERR(op_data))
3023 RETURN(PTR_ERR(op_data));
3025 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
3026 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
3027 flock.l_flock.pid, flags, einfo.ei_mode,
3028 flock.l_flock.start, flock.l_flock.end);
3030 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data, &lockh,
3033 /* Restore the file lock type if not TEST lock. */
3034 if (!(flags & LDLM_FL_TEST_LOCK))
3035 file_lock->fl_type = fl_type;
/* Mirror the server's decision into the kernel's local lock lists
 * (API differs across kernel versions). */
3037 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
3038 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
3039 !(flags & LDLM_FL_TEST_LOCK))
3040 rc2 = locks_lock_file_wait(file, file_lock);
3042 if ((file_lock->fl_flags & FL_FLOCK) &&
3043 (rc == 0 || file_lock->fl_type == F_UNLCK))
3044 rc2 = flock_lock_file_wait(file, file_lock);
3045 if ((file_lock->fl_flags & FL_POSIX) &&
3046 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3047 !(flags & LDLM_FL_TEST_LOCK))
3048 rc2 = posix_lock_file_wait(file, file_lock);
3049 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
/* Local bookkeeping failed after a successful server lock: drop the
 * server lock again (LCK_NL == unlock) so both sides stay in sync. */
3051 if (rc2 && file_lock->fl_type != F_UNLCK) {
3052 einfo.ei_mode = LCK_NL;
3053 md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data,
3058 ll_finish_md_op_data(op_data);
/* Look up the FID of @name under @parent via an MDS getattr-by-name RPC.
 * On success *fid is filled in; if @inode is non-NULL the inode is also
 * instantiated from the reply. */
3063 int ll_get_fid_by_name(struct inode *parent, const char *name,
3064 int namelen, struct lu_fid *fid,
3065 struct inode **inode)
3067 struct md_op_data *op_data = NULL;
3068 struct mdt_body *body;
3069 struct ptlrpc_request *req;
3073 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3074 LUSTRE_OPC_ANY, NULL);
3075 if (IS_ERR(op_data))
3076 RETURN(PTR_ERR(op_data));
/* Only FID and type are needed from the MDS. */
3078 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
3079 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3080 ll_finish_md_op_data(op_data);
3084 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3086 GOTO(out_req, rc = -EFAULT);
3088 *fid = body->mbo_fid1;
3091 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
3093 ptlrpc_req_finished(req);
/* Migrate the child @name of @parent to MDT @mdtidx using a
 * MDS_RENAME_MIGRATE rename onto itself.  For regular files a write
 * lease is taken first so the data version can be pinned across the
 * migration. */
3097 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3098 const char *name, int namelen)
3100 struct dentry *dchild = NULL;
3101 struct inode *child_inode = NULL;
3102 struct md_op_data *op_data;
3103 struct ptlrpc_request *request = NULL;
3104 struct obd_client_handle *och = NULL;
3106 struct mdt_body *body;
3108 __u64 data_version = 0;
3111 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3112 name, PFID(ll_inode2fid(parent)), mdtidx);
3114 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3115 0, LUSTRE_OPC_ANY, NULL);
3116 if (IS_ERR(op_data))
3117 RETURN(PTR_ERR(op_data));
3119 /* Get child FID first */
/* Prefer the dcache; fall back to an MDS lookup by name. */
3120 qstr.hash = full_name_hash(name, namelen);
3123 dchild = d_lookup(file->f_path.dentry, &qstr);
3124 if (dchild != NULL) {
3125 if (dchild->d_inode != NULL)
3126 child_inode = igrab(dchild->d_inode);
3130 if (child_inode == NULL) {
3131 rc = ll_get_fid_by_name(parent, name, namelen,
3132 &op_data->op_fid3, &child_inode);
3137 if (child_inode == NULL)
3138 GOTO(out_free, rc = -EINVAL);
3141 * lfs migrate command needs to be blocked on the client
3142 * by checking the migrate FID against the FID of the
3145 if (child_inode == parent->i_sb->s_root->d_inode)
3146 GOTO(out_iput, rc = -EINVAL);
3148 mutex_lock(&child_inode->i_mutex);
3149 op_data->op_fid3 = *ll_inode2fid(child_inode);
3150 if (!fid_is_sane(&op_data->op_fid3)) {
3151 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
3152 ll_get_fsname(parent->i_sb, NULL, 0), name,
3153 PFID(&op_data->op_fid3));
3154 GOTO(out_unlock, rc = -EINVAL);
/* Nothing to do if the child already lives on the target MDT. */
3157 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3159 GOTO(out_unlock, rc);
3162 CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name,
3163 PFID(&op_data->op_fid3), mdtidx);
3164 GOTO(out_unlock, rc = 0);
/* Regular file: take a write lease and record the data version so
 * the server can detect concurrent modification. */
3167 if (S_ISREG(child_inode->i_mode)) {
3168 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
3172 GOTO(out_unlock, rc);
3175 rc = ll_data_version(child_inode, &data_version,
3178 GOTO(out_close, rc);
3180 op_data->op_handle = och->och_fh;
3181 op_data->op_data = och->och_mod;
3182 op_data->op_data_version = data_version;
3183 op_data->op_lease_handle = och->och_lease_handle;
3184 op_data->op_bias |= MDS_RENAME_MIGRATE;
/* The migration itself is a rename onto the same name with
 * CLI_MIGRATE set and the target MDT index supplied. */
3187 op_data->op_mds = mdtidx;
3188 op_data->op_cli_flags = CLI_MIGRATE;
3189 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3190 namelen, name, namelen, &request);
3192 ll_update_times(request, parent);
3194 if (request != NULL) {
3195 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
3197 ptlrpc_req_finished(request);
3198 GOTO(out_close, rc = -EPROTO);
3201 /* If the server does release layout lock, then we cleanup
3202 * the client och here, otherwise release it in out_close: */
3204 body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
3205 obd_mod_put(och->och_mod);
3206 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
3208 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
3212 ptlrpc_req_finished(request);
3215 /* Try again if the file layout has changed. */
3216 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode)) {
3221 if (och != NULL) /* close the file */
3222 ll_lease_close(och, child_inode, NULL);
3224 clear_nlink(child_inode);
3226 mutex_unlock(&child_inode->i_mutex);
3230 ll_finish_md_op_data(op_data);
3235 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3243 * test if some locks matching bits and l_req_mode are acquired
3244 * - bits can be in different locks
3245 * - if found clear the common lock bits in *bits
3246 * - the bits not found, are kept in *bits
3248 * \param bits [IN] searched lock bits [IN]
3249 * \param l_req_mode [IN] searched lock mode
3250 * \retval boolean, true iff all bits are found
3252 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
3254 struct lustre_handle lockh;
3255 union ldlm_policy_data policy;
/* LCK_MINMODE means "any mode": match any of CR/CW/PR/PW. */
3256 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
3257 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
3266 fid = &ll_i2info(inode)->lli_fid;
3267 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3268 ldlm_lockname[mode]);
3270 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* Probe each requested inodebit individually; a matched lock may
 * cover several bits, so clear everything it grants from *bits. */
3271 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3272 policy.l_inodebits.bits = *bits & (1 << i);
3273 if (policy.l_inodebits.bits == 0)
3276 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3277 &policy, mode, &lockh)) {
3278 struct ldlm_lock *lock;
3280 lock = ldlm_handle2lock(&lockh);
3283 ~(lock->l_policy_data.l_inodebits.bits);
3284 LDLM_LOCK_PUT(lock);
3286 *bits &= ~policy.l_inodebits.bits;
/* Try to match (and take a reference on) an existing MD inodebits lock
 * covering @bits on @inode; the matched handle is stored in @lockh.
 * Returns the matched mode, or 0 if no lock matched. */
3293 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
3294 struct lustre_handle *lockh, __u64 flags,
3295 enum ldlm_mode mode)
3297 union ldlm_policy_data policy = { .l_inodebits = { bits } };
3302 fid = &ll_i2info(inode)->lli_fid;
3303 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3305 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3306 fid, LDLM_IBITS, &policy, mode, lockh);
/* Post-process the result of a revalidation RPC: translate -ENOENT on
 * an already-unlinked inode into success (with cleanup), log other
 * failures. */
3311 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3313 /* Already unlinked. Just update nlink and return success */
3314 if (rc == -ENOENT) {
3316 /* If it is striped directory, and there is bad stripe
3317 * Let's revalidate the dentry again, instead of returning
3319 if (S_ISDIR(inode->i_mode) &&
3320 ll_i2info(inode)->lli_lsm_md != NULL)
3323 /* This path cannot be hit for regular files unless in
3324 * case of obscure races, so no need to validate
3326 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3328 } else if (rc != 0) {
/* EACCES/EIDRM are expected (permission/identity revocation),
 * so log them quietly; everything else is an error. */
3329 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3330 "%s: revalidate FID "DFID" error: rc = %d\n",
3331 ll_get_fsname(inode->i_sb, NULL, 0),
3332 PFID(ll_inode2fid(inode)), rc);
/* Revalidate @dentry's inode attributes covered by @ibits, either via an
 * intent lock (when the server supports getattr-by-FID) or via a plain
 * md_getattr when no matching MD lock is already held. */
3338 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3340 struct inode *inode = dentry->d_inode;
3341 struct ptlrpc_request *req = NULL;
3342 struct obd_export *exp;
3346 LASSERT(inode != NULL);
3348 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3349 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3351 exp = ll_i2mdexp(inode);
3353 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3354 * But under CMD case, it caused some lock issues, should be fixed
3355 * with new CMD ibits lock. See bug 12718 */
3356 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3357 struct lookup_intent oit = { .it_op = IT_GETATTR };
3358 struct md_op_data *op_data;
/* A pure LOOKUP-bit revalidation maps to IT_LOOKUP intent. */
3360 if (ibits == MDS_INODELOCK_LOOKUP)
3361 oit.it_op = IT_LOOKUP;
3363 /* Call getattr by fid, so do not provide name at all. */
3364 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3365 dentry->d_inode, NULL, 0, 0,
3366 LUSTRE_OPC_ANY, NULL);
3367 if (IS_ERR(op_data))
3368 RETURN(PTR_ERR(op_data));
3370 rc = md_intent_lock(exp, op_data, &oit, &req,
3371 &ll_md_blocking_ast, 0);
3372 ll_finish_md_op_data(op_data);
3374 rc = ll_inode_revalidate_fini(inode, rc);
3378 rc = ll_revalidate_it_finish(req, &oit, dentry);
3380 ll_intent_release(&oit);
3384 /* Unlinked? Unhash dentry, so it is not picked up later by
3385 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3386 here to preserve get_cwd functionality on 2.6.
3388 if (!dentry->d_inode->i_nlink) {
3389 ll_lock_dcache(inode);
3390 d_lustre_invalidate(dentry, 0);
3391 ll_unlock_dcache(inode);
3394 ll_lookup_finish_locks(&oit, dentry);
/* No ATTRFID support: fall back to a getattr RPC, but only when we
 * don't already hold MD locks covering the requested bits. */
3395 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3396 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3397 u64 valid = OBD_MD_FLGETATTR;
3398 struct md_op_data *op_data;
/* Regular files also need striping (EA) information. */
3401 if (S_ISREG(inode->i_mode)) {
3402 rc = ll_get_default_mdsize(sbi, &ealen);
3405 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3408 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3409 0, ealen, LUSTRE_OPC_ANY,
3411 if (IS_ERR(op_data))
3412 RETURN(PTR_ERR(op_data));
3414 op_data->op_valid = valid;
3415 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3416 ll_finish_md_op_data(op_data);
3418 rc = ll_inode_revalidate_fini(inode, rc);
3422 rc = ll_prep_inode(&inode, req, NULL, NULL);
3425 ptlrpc_req_finished(req);
/* For a striped directory, merge the attributes of all stripes (nlink,
 * blocks, size, times) into the master inode / ll_inode_info. */
3429 static int ll_merge_md_attr(struct inode *inode)
3431 struct cl_attr attr = { 0 };
3434 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3435 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3436 &attr, ll_md_blocking_ast);
3440 set_nlink(inode, attr.cat_nlink);
3441 inode->i_blocks = attr.cat_blocks;
3442 i_size_write(inode, attr.cat_size);
/* Timestamps are cached in lli_* and copied to the inode by
 * ll_inode_revalidate(). */
3444 ll_i2info(inode)->lli_atime = attr.cat_atime;
3445 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
3446 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/* Full revalidation: refresh MD attributes via __ll_inode_revalidate(),
 * then refresh the size — merged stripe attrs for striped directories,
 * glimpse from the OSTs for regular files. */
3452 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3454 struct inode *inode = dentry->d_inode;
3458 rc = __ll_inode_revalidate(dentry, ibits);
3462 /* if object isn't regular file, don't validate size */
3463 if (!S_ISREG(inode->i_mode)) {
3464 if (S_ISDIR(inode->i_mode) &&
3465 ll_i2info(inode)->lli_lsm_md != NULL) {
3466 rc = ll_merge_md_attr(inode);
/* Propagate the cached MD timestamps into the inode. */
3471 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
3472 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
3473 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
3475 /* In case of restore, the MDT has the right size and has
3476 * already send it back without granting the layout lock,
3477 * inode is up-to-date so glimpse is useless.
3478 * Also to glimpse we need the layout, in case of a running
3479 * restore the MDT holds the layout lock so the glimpse will
3480 * block up to the end of restore (getattr will block)
3482 if (!ll_file_test_flag(ll_i2info(inode), LLIF_FILE_RESTORING))
3483 rc = ll_glimpse_size(inode);
/* getattr() handler: revalidate UPDATE|LOOKUP bits, then fill *stat from
 * the (now fresh) inode fields. */
3488 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3490 struct inode *inode = de->d_inode;
3491 struct ll_sb_info *sbi = ll_i2sbi(inode);
3492 struct ll_inode_info *lli = ll_i2info(inode);
3495 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3496 MDS_INODELOCK_LOOKUP);
3497 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3502 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
3504 stat->dev = inode->i_sb->s_dev;
/* 32-bit clients get a squashed inode number derived from the FID. */
3505 if (ll_need_32bit_api(sbi))
3506 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3508 stat->ino = inode->i_ino;
3509 stat->mode = inode->i_mode;
3510 stat->uid = inode->i_uid;
3511 stat->gid = inode->i_gid;
3512 stat->rdev = inode->i_rdev;
3513 stat->atime = inode->i_atime;
3514 stat->mtime = inode->i_mtime;
3515 stat->ctime = inode->i_ctime;
3516 stat->blksize = 1 << inode->i_blkbits;
3518 stat->nlink = inode->i_nlink;
3519 stat->size = i_size_read(inode);
3520 stat->blocks = inode->i_blocks;
/* fiemap() handler: marshal the kernel's fiemap_extent_info into a
 * struct fiemap buffer, run ll_do_fiemap(), and copy the mapped extents
 * back to the user-supplied extent array. */
3525 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3526 __u64 start, __u64 len)
3530 struct fiemap *fiemap;
3531 unsigned int extent_count = fieinfo->fi_extents_max;
3533 num_bytes = sizeof(*fiemap) + (extent_count *
3534 sizeof(struct fiemap_extent));
3535 OBD_ALLOC_LARGE(fiemap, num_bytes);
3540 fiemap->fm_flags = fieinfo->fi_flags;
3541 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3542 fiemap->fm_start = start;
3543 fiemap->fm_length = len;
/* Copy in only the first user extent (it can carry input hints). */
3544 if (extent_count > 0 &&
3545 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3546 sizeof(struct fiemap_extent)) != 0)
3547 GOTO(out, rc = -EFAULT);
3549 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3551 fieinfo->fi_flags = fiemap->fm_flags;
3552 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3553 if (extent_count > 0 &&
3554 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3555 fiemap->fm_mapped_extents *
3556 sizeof(struct fiemap_extent)) != 0)
3557 GOTO(out, rc = -EFAULT);
3559 OBD_FREE_LARGE(fiemap, num_bytes);
/* Return a referenced copy of the cached POSIX ACL of the given @type;
 * the cache lives in ll_inode_info and is protected by lli_lock. */
3563 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3565 struct ll_inode_info *lli = ll_i2info(inode);
3566 struct posix_acl *acl = NULL;
3569 spin_lock(&lli->lli_lock);
3570 /* VFS' acl_permission_check->check_acl will release the refcount */
3571 acl = posix_acl_dup(lli->lli_posix_acl);
3572 spin_unlock(&lli->lli_lock);
/* ACL callback passed to generic_permission() on kernels where the
 * generic helper takes a check_acl function pointer.  Signature varies
 * with kernel version. */
3577 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3579 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3580 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3582 ll_check_acl(struct inode *inode, int mask)
3585 # ifdef CONFIG_FS_POSIX_ACL
3586 struct posix_acl *acl;
3590 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
/* Cannot block in RCU-walk mode (elided branch presumably bails). */
3591 if (flags & IPERM_FLAG_RCU)
3594 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3599 rc = posix_acl_permission(inode, acl, mask);
3600 posix_acl_release(acl);
3603 # else /* !CONFIG_FS_POSIX_ACL */
3605 # endif /* CONFIG_FS_POSIX_ACL */
3607 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/* permission() handler: revalidate the root inode if needed, apply
 * root-squash by temporarily overriding the task credentials, then run
 * the generic permission check.  Signature varies with kernel version. */
3609 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3610 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3612 # ifdef HAVE_INODE_PERMISION_2ARGS
3613 int ll_inode_permission(struct inode *inode, int mask)
3615 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3620 struct ll_sb_info *sbi;
3621 struct root_squash_info *squash;
3622 struct cred *cred = NULL;
3623 const struct cred *old_cred = NULL;
3625 bool squash_id = false;
/* RCU-walk cannot block; bail out (elided) so the VFS retries in
 * ref-walk mode. */
3628 #ifdef MAY_NOT_BLOCK
3629 if (mask & MAY_NOT_BLOCK)
3631 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3632 if (flags & IPERM_FLAG_RCU)
3636 /* as root inode are NOT getting validated in lookup operation,
3637 * need to do it before permission check. */
3639 if (inode == inode->i_sb->s_root->d_inode) {
3640 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3641 MDS_INODELOCK_LOOKUP);
3646 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3647 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3649 /* squash fsuid/fsgid if needed */
3650 sbi = ll_i2sbi(inode);
3651 squash = &sbi->ll_squash;
3652 if (unlikely(squash->rsi_uid != 0 &&
3653 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
3654 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3658 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3659 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3660 squash->rsi_uid, squash->rsi_gid);
3662 /* update current process's credentials
3663 * and FS capability */
3664 cred = prepare_creds();
3668 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
3669 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* Drop all filesystem-related capabilities for the squashed task. */
3670 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3671 if ((1 << cap) & CFS_CAP_FS_MASK)
3672 cap_lower(cred->cap_effective, cap);
3674 old_cred = override_creds(cred);
3677 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3678 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3679 /* restore current process's credentials and FS capability */
3681 revert_creds(old_cred);
3688 /* -o localflock - only provides locally consistent flock locks */
/* Default file_operations (no .flock/.lock entries): flock requests are
 * handled by the kernel's local tables only. */
3689 struct file_operations ll_file_operations = {
3690 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3691 # ifdef HAVE_SYNC_READ_WRITE
3692 .read = new_sync_read,
3693 .write = new_sync_write,
3695 .read_iter = ll_file_read_iter,
3696 .write_iter = ll_file_write_iter,
3697 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3698 .read = ll_file_read,
3699 .aio_read = ll_file_aio_read,
3700 .write = ll_file_write,
3701 .aio_write = ll_file_aio_write,
3702 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3703 .unlocked_ioctl = ll_file_ioctl,
3704 .open = ll_file_open,
3705 .release = ll_file_release,
3706 .mmap = ll_file_mmap,
3707 .llseek = ll_file_seek,
3708 .splice_read = ll_file_splice_read,
/* file_operations used with "-o flock": cluster-coherent locking via
 * ll_file_flock for both flock() and POSIX locks. */
3713 struct file_operations ll_file_operations_flock = {
3714 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3715 # ifdef HAVE_SYNC_READ_WRITE
3716 .read = new_sync_read,
3717 .write = new_sync_write,
3718 # endif /* HAVE_SYNC_READ_WRITE */
3719 .read_iter = ll_file_read_iter,
3720 .write_iter = ll_file_write_iter,
3721 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3722 .read = ll_file_read,
3723 .aio_read = ll_file_aio_read,
3724 .write = ll_file_write,
3725 .aio_write = ll_file_aio_write,
3726 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3727 .unlocked_ioctl = ll_file_ioctl,
3728 .open = ll_file_open,
3729 .release = ll_file_release,
3730 .mmap = ll_file_mmap,
3731 .llseek = ll_file_seek,
3732 .splice_read = ll_file_splice_read,
3735 .flock = ll_file_flock,
3736 .lock = ll_file_flock
3739 /* These are for -o noflock - to return ENOSYS on flock calls */
3740 struct file_operations ll_file_operations_noflock = {
3741 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3742 # ifdef HAVE_SYNC_READ_WRITE
3743 .read = new_sync_read,
3744 .write = new_sync_write,
3745 # endif /* HAVE_SYNC_READ_WRITE */
3746 .read_iter = ll_file_read_iter,
3747 .write_iter = ll_file_write_iter,
3748 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3749 .read = ll_file_read,
3750 .aio_read = ll_file_aio_read,
3751 .write = ll_file_write,
3752 .aio_write = ll_file_aio_write,
3753 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3754 .unlocked_ioctl = ll_file_ioctl,
3755 .open = ll_file_open,
3756 .release = ll_file_release,
3757 .mmap = ll_file_mmap,
3758 .llseek = ll_file_seek,
3759 .splice_read = ll_file_splice_read,
3762 .flock = ll_file_noflock,
3763 .lock = ll_file_noflock
/* inode_operations for regular files. */
3766 struct inode_operations ll_file_inode_operations = {
3767 .setattr = ll_setattr,
3768 .getattr = ll_getattr,
3769 .permission = ll_inode_permission,
3770 .setxattr = ll_setxattr,
3771 .getxattr = ll_getxattr,
3772 .listxattr = ll_listxattr,
3773 .removexattr = ll_removexattr,
3774 .fiemap = ll_fiemap,
3775 #ifdef HAVE_IOP_GET_ACL
3776 .get_acl = ll_get_acl,
3780 /* dynamic ioctl number support routines */
/* Registry of dynamically-registered ioctl handlers: an rwsem-protected
 * list of llioc_data entries, each carrying a callback and the ioctl
 * command numbers it handles. */
3781 static struct llioc_ctl_data {
3782 struct rw_semaphore ioc_sem;
3783 struct list_head ioc_head;
3785 __RWSEM_INITIALIZER(llioc.ioc_sem),
3786 LIST_HEAD_INIT(llioc.ioc_head)
3791 struct list_head iocd_list;
3792 unsigned int iocd_size;
3793 llioc_callback_t iocd_cb;
3794 unsigned int iocd_count;
/* Flexible-array-style tail of iocd_count command numbers. */
3795 unsigned int iocd_cmd[0];
/* Register a dynamic ioctl handler @cb for @count command numbers in
 * @cmd.  Returns an opaque cookie (the allocation itself) for later
 * ll_iocontrol_unregister(). */
3798 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3801 struct llioc_data *in_data = NULL;
3804 if (cb == NULL || cmd == NULL ||
3805 count > LLIOC_MAX_CMD || count < 0)
3808 size = sizeof(*in_data) + count * sizeof(unsigned int);
3809 OBD_ALLOC(in_data, size);
3810 if (in_data == NULL)
3813 memset(in_data, 0, sizeof(*in_data));
3814 in_data->iocd_size = size;
3815 in_data->iocd_cb = cb;
3816 in_data->iocd_count = count;
3817 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3819 down_write(&llioc.ioc_sem);
3820 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3821 up_write(&llioc.ioc_sem);
/* Remove and free the handler identified by @magic (the cookie returned
 * by ll_iocontrol_register); warns if no matching entry is found. */
3826 void ll_iocontrol_unregister(void *magic)
3828 struct llioc_data *tmp;
3833 down_write(&llioc.ioc_sem);
3834 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3836 unsigned int size = tmp->iocd_size;
3838 list_del(&tmp->iocd_list);
/* Drop the lock before freeing; the entry is already unlinked. */
3839 up_write(&llioc.ioc_sem);
3841 OBD_FREE(tmp, size);
3845 up_write(&llioc.ioc_sem);
3847 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3850 EXPORT_SYMBOL(ll_iocontrol_register);
3851 EXPORT_SYMBOL(ll_iocontrol_unregister);
/* Dispatch an unrecognized ioctl to the dynamically registered handlers;
 * stops at the first handler that returns LLIOC_STOP and passes its
 * result code back through *rcp. */
3853 static enum llioc_iter
3854 ll_iocontrol_call(struct inode *inode, struct file *file,
3855 unsigned int cmd, unsigned long arg, int *rcp)
3857 enum llioc_iter ret = LLIOC_CONT;
3858 struct llioc_data *data;
3859 int rc = -EINVAL, i;
3861 down_read(&llioc.ioc_sem);
3862 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3863 for (i = 0; i < data->iocd_count; i++) {
3864 if (cmd != data->iocd_cmd[i])
3867 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3871 if (ret == LLIOC_STOP)
3874 up_read(&llioc.ioc_sem);
/* Push a layout configuration to the cl_object stack.  For
 * OBJECT_CONF_SET, also allow the DLM layout lock to be matched and
 * record the new layout generation in the inode. */
3881 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3883 struct ll_inode_info *lli = ll_i2info(inode);
3884 struct cl_object *obj = lli->lli_clob;
3893 env = cl_env_get(&refcheck);
3895 RETURN(PTR_ERR(env));
3897 rc = cl_conf_set(env, lli->lli_clob, conf);
3901 if (conf->coc_opc == OBJECT_CONF_SET) {
3902 struct ldlm_lock *lock = conf->coc_lock;
3903 struct cl_layout cl = {
3907 LASSERT(lock != NULL);
3908 LASSERT(ldlm_has_layout(lock));
3910 /* it can only be allowed to match after layout is
3911 * applied to inode otherwise false layout would be
3912 * seen. Applying layout should happen before dropping
3913 * the intent lock. */
3914 ldlm_lock_allow_match(lock);
3916 rc = cl_object_layout_get(env, obj, &cl);
3921 DFID": layout version change: %u -> %u\n",
3922 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3924 ll_layout_version_set(lli, cl.cl_layout_gen);
3928 cl_env_put(env, &refcheck);
3933 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
3934 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3937 struct ll_sb_info *sbi = ll_i2sbi(inode);
3938 struct ptlrpc_request *req;
3939 struct mdt_body *body;
3946 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3947 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
3948 lock->l_lvb_data, lock->l_lvb_len);
/* Layout is already attached to the lock; nothing to fetch. */
3950 if (lock->l_lvb_data != NULL)
3953 /* if layout lock was granted right away, the layout is returned
3954 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3955 * blocked and then granted via completion ast, we have to fetch
3956 * layout here. Please note that we can't use the LVB buffer in
3957 * completion AST because it doesn't have a large enough buffer */
3958 rc = ll_get_default_mdsize(sbi, &lmmsize);
3960 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
3961 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3966 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3968 GOTO(out, rc = -EPROTO);
3970 lmmsize = body->mbo_eadatasize;
3971 if (lmmsize == 0) /* empty layout */
3974 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3976 GOTO(out, rc = -EFAULT);
/* Attach a private copy of the layout to the lock's LVB; if another
 * thread raced us and attached one first, free our copy. */
3978 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3979 if (lvbdata == NULL)
3980 GOTO(out, rc = -ENOMEM);
3982 memcpy(lvbdata, lmm, lmmsize);
3983 lock_res_and_lock(lock);
3984 if (unlikely(lock->l_lvb_data == NULL)) {
3985 lock->l_lvb_type = LVB_T_LAYOUT;
3986 lock->l_lvb_data = lvbdata;
3987 lock->l_lvb_len = lmmsize;
3990 unlock_res_and_lock(lock);
3993 OBD_FREE_LARGE(lvbdata, lmmsize);
3998 ptlrpc_req_finished(req);
4003 * Apply the layout to the inode. Layout lock is held and will be released
4006 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4007 struct inode *inode)
4009 struct ll_inode_info *lli = ll_i2info(inode);
4010 struct ll_sb_info *sbi = ll_i2sbi(inode);
4011 struct ldlm_lock *lock;
4012 struct cl_object_conf conf;
4015 bool wait_layout = false;
4018 LASSERT(lustre_handle_is_used(lockh));
4020 lock = ldlm_handle2lock(lockh);
4021 LASSERT(lock != NULL);
4022 LASSERT(ldlm_has_layout(lock));
4024 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4025 PFID(&lli->lli_fid), inode);
4027 /* in case this is a caching lock and reinstate with new inode */
4028 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
4030 lock_res_and_lock(lock);
4031 lvb_ready = ldlm_is_lvb_ready(lock);
4032 unlock_res_and_lock(lock);
4033 /* checking lvb_ready is racy but this is okay. The worst case is
4034 * that multi processes may configure the file on the same time. */
/* Make sure the layout is in the lock's LVB before applying it. */
4039 rc = ll_layout_fetch(inode, lock);
4043 /* for layout lock, lmm is stored in lock's lvb.
4044 * lvb_data is immutable if the lock is held so it's safe to access it
4047 * set layout to file. Unlikely this will fail as old layout was
4048 * surely eliminated */
4049 memset(&conf, 0, sizeof conf);
4050 conf.coc_opc = OBJECT_CONF_SET;
4051 conf.coc_inode = inode;
4052 conf.coc_lock = lock;
4053 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
4054 conf.u.coc_layout.lb_len = lock->l_lvb_len;
4055 rc = ll_layout_conf(inode, &conf);
4057 /* refresh layout failed, need to wait */
4058 wait_layout = rc == -EBUSY;
4062 LDLM_LOCK_PUT(lock);
4063 ldlm_lock_decref(lockh, mode);
4065 /* wait for IO to complete if it's still being used. */
4067 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
4068 ll_get_fsname(inode->i_sb, NULL, 0),
4069 PFID(&lli->lli_fid), inode);
4071 memset(&conf, 0, sizeof conf);
4072 conf.coc_opc = OBJECT_CONF_WAIT;
4073 conf.coc_inode = inode;
4074 rc = ll_layout_conf(inode, &conf);
4078 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4079 ll_get_fsname(inode->i_sb, NULL, 0),
4080 PFID(&lli->lli_fid), rc);
/*
 * Refresh the layout under lli_layout_mutex (held by the caller, see
 * ll_layout_refresh): first try to match an already-cached LAYOUT lock,
 * otherwise enqueue a new IT_LAYOUT intent lock on the MDT and apply it.
 * NOTE(review): some lines are elided in this view (error paths, the retry
 * logic implied by "requeue" in the debug message) -- confirm in full source.
 */
4085 static int ll_layout_refresh_locked(struct inode *inode)
4087 struct ll_inode_info *lli = ll_i2info(inode);
4088 struct ll_sb_info *sbi = ll_i2sbi(inode);
4089 struct md_op_data *op_data;
4090 struct lookup_intent it;
4091 struct lustre_handle lockh;
4092 enum ldlm_mode mode;
4093 struct ldlm_enqueue_info einfo = {
4094 .ei_type = LDLM_IBITS,
4096 .ei_cb_bl = &ll_md_blocking_ast,
4097 .ei_cb_cp = &ldlm_completion_ast,
4103 /* mostly layout lock is caching on the local side, so try to match
4104 * it before grabbing layout lock mutex. */
/* Accept any mode: a layout read only needs the lock to exist locally. */
4105 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4106 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4107 if (mode != 0) { /* hit cached lock */
4108 rc = ll_layout_lock_set(&lockh, mode, inode);
/* No cached lock: build op_data for an MDT enqueue. */
4115 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4116 0, 0, LUSTRE_OPC_ANY, NULL);
4117 if (IS_ERR(op_data))
4118 RETURN(PTR_ERR(op_data));
4120 /* have to enqueue one */
4121 memset(&it, 0, sizeof(it));
4122 it.it_op = IT_LAYOUT;
4123 lockh.cookie = 0ULL;
4125 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4126 ll_get_fsname(inode->i_sb, NULL, 0),
4127 PFID(&lli->lli_fid), inode);
4129 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, &it, op_data, &lockh, 0);
/* The layout lives in the lock's LVB, not the intent reply: release it. */
4130 if (it.it_request != NULL)
4131 ptlrpc_req_finished(it.it_request);
4132 it.it_request = NULL;
4134 ll_finish_md_op_data(op_data);
/* Steal the granted mode from the intent so ll_intent_drop_lock() below
 * does not release the lock we are about to use. */
4136 mode = it.it_lock_mode;
4137 it.it_lock_mode = 0;
4138 ll_intent_drop_lock(&it);
4141 /* set lock data in case this is a new lock */
4142 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
/* Apply the freshly granted layout; the lock is released inside. */
4143 rc = ll_layout_lock_set(&lockh, mode, inode);
4152 * This function checks if there exists a LAYOUT lock on the client side,
4153 * or enqueues it if it doesn't have one in cache.
4155 * This function will not hold layout lock so it may be revoked any time after
4156 * this function returns. Any operations depend on layout should be redone
4159 * This function should be called before lov_io_init() to get an uptodate
4160 * layout version, the caller should save the version number and after IO
4161 * is finished, this function should be called again to verify that layout
4162 * is not changed during IO time.
/*
 * NOTE(review): some lines are elided in this view; \a gen is written both
 * before and after the refresh -- on the early-return path it carries the
 * currently cached generation.
 */
4164 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4166 struct ll_inode_info *lli = ll_i2info(inode);
4167 struct ll_sb_info *sbi = ll_i2sbi(inode);
/* Fast path: layout lock feature disabled, or a generation is already set. */
4171 *gen = ll_layout_version_get(lli);
4172 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
/* Layout locks only make sense for regular files with a sane FID. */
4176 LASSERT(fid_is_sane(ll_inode2fid(inode)));
4177 LASSERT(S_ISREG(inode->i_mode));
4179 /* take layout lock mutex to enqueue layout lock exclusively. */
4180 mutex_lock(&lli->lli_layout_mutex);
4182 rc = ll_layout_refresh_locked(inode);
/* Re-read the generation after the refresh so the caller sees the result. */
4186 *gen = ll_layout_version_get(lli);
4188 mutex_unlock(&lli->lli_layout_mutex);
4194 * This function send a restore request to the MDT
4196 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4198 struct hsm_user_request *hur;
4202 len = sizeof(struct hsm_user_request) +
4203 sizeof(struct hsm_user_item);
4204 OBD_ALLOC(hur, len);
4208 hur->hur_request.hr_action = HUA_RESTORE;
4209 hur->hur_request.hr_archive_id = 0;
4210 hur->hur_request.hr_flags = 0;
4211 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4212 sizeof(hur->hur_user_item[0].hui_fid));
4213 hur->hur_user_item[0].hui_extent.offset = offset;
4214 hur->hur_user_item[0].hui_extent.length = length;
4215 hur->hur_request.hr_itemcount = 1;
4216 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,