4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2015, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <linux/pagemap.h>
46 #include <linux/file.h>
47 #include <linux/sched.h>
48 #include <linux/user_namespace.h>
49 #ifdef HAVE_UIDGID_HEADER
50 # include <linux/uidgid.h>
52 #include <lustre/ll_fiemap.h>
54 #include <lustre_ioctl.h>
55 #include <lustre_swab.h>
57 #include "cl_object.h"
58 #include "llite_internal.h"
59 #include "vvp_internal.h"
62 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
64 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
67 static enum llioc_iter
68 ll_iocontrol_call(struct inode *inode, struct file *file,
69 unsigned int cmd, unsigned long arg, int *rcp);
/*
 * Allocate a per-open ll_file_data from its slab cache and clear the
 * write-failure latch.  GFP_NOFS: allocation may run in fs context, so
 * reclaim must not recurse back into the filesystem.
 *
 * NOTE(review): this extract elides lines between the visible ones
 * (e.g. the allocation-failure check and the return statement are not
 * shown here) — confirm against the full source before editing.
 */
71 static struct ll_file_data *ll_file_data_get(void)
73 struct ll_file_data *fd;
75 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
79 fd->fd_write_failed = false;
/* Release an ll_file_data back to the slab it was allocated from. */
84 static void ll_file_data_put(struct ll_file_data *fd)
87 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Snapshot the inode's current attributes (mode, a/m/ctime, size,
 * blocks, flags) into @op_data so the CLOSE RPC carries them to the
 * MDT, and record which open handle (@och) is being closed.
 * If this was a write open and LLIF_DATA_MODIFIED was set, also flag
 * MDS_DATA_MODIFIED so HSM can mark the archive copy dirty.
 * NOTE(review): lines are elided in this extract (function braces etc.).
 */
91 * Packs all the attributes into @op_data for the CLOSE rpc.
93 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
94 struct obd_client_handle *och)
98 ll_prep_md_op_data(op_data, inode, NULL, NULL,
99 0, 0, LUSTRE_OPC_ANY, NULL);
101 op_data->op_attr.ia_mode = inode->i_mode;
102 op_data->op_attr.ia_atime = inode->i_atime;
103 op_data->op_attr.ia_mtime = inode->i_mtime;
104 op_data->op_attr.ia_ctime = inode->i_ctime;
105 op_data->op_attr.ia_size = i_size_read(inode);
/* Mark every copied attribute valid so the MDT applies them all. */
106 op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
107 ATTR_MTIME | ATTR_MTIME_SET |
108 ATTR_CTIME | ATTR_CTIME_SET;
109 op_data->op_attr_blocks = inode->i_blocks;
110 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
111 op_data->op_handle = och->och_fh;
/* Test-and-clear: only the first close after a modification packs the
 * dirty hint; later closes see the flag already cleared. */
113 if (och->och_flags & FMODE_WRITE &&
114 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
115 /* For HSM: if inode data has been modified, pack it so that
116 * MDT can set data dirty flag in the archive. */
117 op_data->op_bias |= MDS_DATA_MODIFIED;
/*
 * Send the CLOSE RPC for one MDS open handle, optionally biased:
 *  - MDS_HSM_RELEASE: @data is a __u64 data-version to release against;
 *  - MDS_CLOSE_LAYOUT_SWAP: @data is the second inode to swap layouts
 *    with (its FID is packed into op_fid2);
 *  - otherwise @data must be NULL.
 * On any outcome the open-replay data is cleared and och_fh is poisoned
 * with DEAD_HANDLE_MAGIC so a stale handle is detectable.
 * NOTE(review): this extract elides lines (variable 'rc', the switch
 * header, GOTO labels, braces) — the visible text is not the whole body.
 */
123 * Perform a close, possibly with a bias.
124 * The meaning of "data" depends on the value of "bias".
126 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
127 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
130 static int ll_close_inode_openhandle(struct inode *inode,
131 struct obd_client_handle *och,
132 enum mds_op_bias bias, void *data)
134 struct obd_export *md_exp = ll_i2mdexp(inode);
135 const struct ll_inode_info *lli = ll_i2info(inode);
136 struct md_op_data *op_data;
137 struct ptlrpc_request *req = NULL;
/* Export sanity check: without a live MDC connection there is no one
 * to send the close to. */
141 if (class_exp2obd(md_exp) == NULL) {
142 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
143 ll_get_fsname(inode->i_sb, NULL, 0),
144 PFID(&lli->lli_fid));
148 OBD_ALLOC_PTR(op_data);
149 /* We leak openhandle and request here on error, but not much to be
150 * done in OOM case since app won't retry close on error either. */
152 GOTO(out, rc = -ENOMEM);
154 ll_prepare_close(inode, op_data, och);
156 case MDS_CLOSE_LAYOUT_SWAP:
157 LASSERT(data != NULL);
158 op_data->op_bias |= MDS_CLOSE_LAYOUT_SWAP;
159 op_data->op_data_version = 0;
160 op_data->op_lease_handle = och->och_lease_handle;
/* @data is the peer inode for the swap; pack its FID as fid2. */
161 op_data->op_fid2 = *ll_inode2fid(data);
164 case MDS_HSM_RELEASE:
165 LASSERT(data != NULL);
166 op_data->op_bias |= MDS_HSM_RELEASE;
167 op_data->op_data_version = *(__u64 *)data;
168 op_data->op_lease_handle = och->och_lease_handle;
169 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
173 LASSERT(data == NULL);
177 rc = md_close(md_exp, op_data, och->och_mod, &req);
/* -EINTR is an expected interruption, not worth a console error. */
178 if (rc != 0 && rc != -EINTR)
179 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
180 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* For biased closes, check the reply body to learn whether the MDT
 * actually executed the close intent (release/swap). */
183 op_data->op_bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)) {
184 struct mdt_body *body;
186 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
187 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
191 ll_finish_md_op_data(op_data);
195 md_clear_open_replay_data(md_exp, och);
/* Poison the file handle so accidental reuse is detectable. */
196 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
199 ptlrpc_req_finished(req); /* This is close request */
/*
 * Drop one reference on the cached MDS open handle matching @fmode
 * (write / exec / read) and, when the last user is gone, actually close
 * the handle on the MDT via ll_close_inode_openhandle().
 * Serialized by lli_och_mutex against concurrent open/close.
 * NOTE(review): lines elided in this extract (och_usecount declaration,
 * the decrement/detach of *och_p, RETURN paths).
 */
203 int ll_md_real_close(struct inode *inode, fmode_t fmode)
205 struct ll_inode_info *lli = ll_i2info(inode);
206 struct obd_client_handle **och_p;
207 struct obd_client_handle *och;
/* Select the handle slot + refcount for this access mode. */
212 if (fmode & FMODE_WRITE) {
213 och_p = &lli->lli_mds_write_och;
214 och_usecount = &lli->lli_open_fd_write_count;
215 } else if (fmode & FMODE_EXEC) {
216 och_p = &lli->lli_mds_exec_och;
217 och_usecount = &lli->lli_open_fd_exec_count;
219 LASSERT(fmode & FMODE_READ);
220 och_p = &lli->lli_mds_read_och;
221 och_usecount = &lli->lli_open_fd_read_count;
224 mutex_lock(&lli->lli_och_mutex);
225 if (*och_usecount > 0) {
226 /* There are still users of this handle, so skip
228 mutex_unlock(&lli->lli_och_mutex);
234 mutex_unlock(&lli->lli_och_mutex);
237 /* There might be a race and this handle may already
239 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/*
 * Per-struct-file close: tear down everything attached to this file's
 * ll_file_data — group lock, leftover lease handle, private open handle
 * — then drop this fd's reference on the shared MDS open handle.  If no
 * cached OPEN DLM lock still covers the file (md_lock_match with
 * LDLM_FL_TEST_LOCK), the real MDS close is issued immediately;
 * otherwise it is deferred until lock cancellation.  Finally the
 * ll_file_data itself is detached and freed.
 * NOTE(review): lines elided in this extract (lockmode assignments,
 * 'rc'/'lease_broken' declarations, braces, RETURN).
 */
245 static int ll_md_close(struct inode *inode, struct file *file)
247 union ldlm_policy_data policy = {
248 .l_inodebits = { MDS_INODELOCK_OPEN },
250 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
251 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
252 struct ll_inode_info *lli = ll_i2info(inode);
253 struct lustre_handle lockh;
254 enum ldlm_mode lockmode;
258 /* clear group lock, if present */
259 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
260 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
262 if (fd->fd_lease_och != NULL) {
265 /* Usually the lease is not released when the
266 * application crashed, we need to release here. */
267 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
268 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
269 PFID(&lli->lli_fid), rc, lease_broken);
271 fd->fd_lease_och = NULL;
/* An fd-private open handle (from lease open) is closed directly. */
274 if (fd->fd_och != NULL) {
275 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
280 /* Let's see if we have good enough OPEN lock on the file and if
281 we can skip talking to MDS */
282 mutex_lock(&lli->lli_och_mutex);
283 if (fd->fd_omode & FMODE_WRITE) {
285 LASSERT(lli->lli_open_fd_write_count);
286 lli->lli_open_fd_write_count--;
287 } else if (fd->fd_omode & FMODE_EXEC) {
289 LASSERT(lli->lli_open_fd_exec_count);
290 lli->lli_open_fd_exec_count--;
293 LASSERT(lli->lli_open_fd_read_count);
294 lli->lli_open_fd_read_count--;
296 mutex_unlock(&lli->lli_och_mutex);
/* No matching cached OPEN ibits lock => close on the MDS now. */
298 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
299 LDLM_IBITS, &policy, lockmode, &lockh))
300 rc = ll_md_real_close(inode, fd->fd_omode);
303 LUSTRE_FPRIVATE(file) = NULL;
304 ll_file_data_put(fd);
/*
 * VFS ->release() for Lustre files.  Best-effort cleanup: remote-ACL
 * session teardown on the root inode, statahead deauthorization for
 * directories, async write error harvesting for regular files, then
 * ll_md_close() to drop the MDS open handle.  The root inode is
 * special-cased (its fd is freed without an MDS close).
 * NOTE(review): this extract elides lines ('rc' declaration, NULL
 * checks on fd, braces, RETURN) — not the complete function body.
 */
309 /* While this returns an error code, fput() the caller does not, so we need
310 * to make every effort to clean up all of our state here. Also, applications
311 * rarely check close errors and even if an error is returned they will not
312 * re-try the close call.
314 int ll_file_release(struct inode *inode, struct file *file)
316 struct ll_file_data *fd;
317 struct ll_sb_info *sbi = ll_i2sbi(inode);
318 struct ll_inode_info *lli = ll_i2info(inode);
322 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
323 PFID(ll_inode2fid(inode)), inode);
/* Remote-client ACL bookkeeping is torn down when the root fd that
 * carried the LL_FILE_RMTACL session is released. */
325 #ifdef CONFIG_FS_POSIX_ACL
326 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
327 inode == inode->i_sb->s_root->d_inode) {
328 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
331 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
332 fd->fd_flags &= ~LL_FILE_RMTACL;
333 rct_del(&sbi->ll_rct, current_pid());
334 et_search_free(&sbi->ll_et, current_pid());
/* Don't count releases of the root inode in the stats. */
339 if (inode->i_sb->s_root != file->f_path.dentry)
340 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
341 fd = LUSTRE_FPRIVATE(file);
344 /* The last ref on @file, maybe not the the owner pid of statahead,
345 * because parent and child process can share the same file handle. */
346 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
347 ll_deauthorize_statahead(inode, fd);
349 if (inode->i_sb->s_root == file->f_path.dentry) {
350 LUSTRE_FPRIVATE(file) = NULL;
351 ll_file_data_put(fd);
/* Propagate any async write error recorded by the cl_object so this
 * close (or a later fsync) can report it. */
355 if (!S_ISDIR(inode->i_mode)) {
356 if (lli->lli_clob != NULL)
357 lov_read_and_clear_async_rc(lli->lli_clob);
358 lli->lli_async_rc = 0;
361 rc = ll_md_close(inode, file);
363 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
364 libcfs_debug_dumplog();
/*
 * Issue an intent-OPEN enqueue to the MDS for an already-looked-up
 * dentry.  The name is packed only when the server lacks
 * OBD_CONNECT_OPEN_BY_FID support (and the name is valid); @lmm /
 * @lmmsize optionally carry a striping request.  On success the reply
 * is used to (re)initialize the inode and attach the returned lock.
 * NOTE(review): lines elided in this extract ('len'/'rc' declarations,
 * IS_ERR check on op_data, braces, out: label, RETURN).
 */
369 static int ll_intent_file_open(struct file *file, void *lmm, int lmmsize,
370 struct lookup_intent *itp)
372 struct dentry *de = file->f_path.dentry;
373 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
374 struct dentry *parent = de->d_parent;
375 const char *name = NULL;
377 struct md_op_data *op_data;
378 struct ptlrpc_request *req = NULL;
382 LASSERT(parent != NULL);
383 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
385 /* if server supports open-by-fid, or file name is invalid, don't pack
386 * name in open request */
387 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
388 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
389 name = de->d_name.name;
390 len = de->d_name.len;
393 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
394 name, len, 0, LUSTRE_OPC_ANY, NULL);
396 RETURN(PTR_ERR(op_data));
397 op_data->op_data = lmm;
398 op_data->op_data_size = lmmsize;
400 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
401 &ll_md_blocking_ast, 0);
402 ll_finish_md_op_data(op_data);
404 /* reason for keep own exit path - don`t flood log
405 * with messages with -ESTALE errors.
/* If the open was granted but already failed, give the handle back. */
407 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
408 it_open_error(DISP_OPEN_OPEN, itp))
410 ll_release_openhandle(de, itp);
414 if (it_disposition(itp, DISP_LOOKUP_NEG))
415 GOTO(out, rc = -ENOENT);
417 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
418 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
419 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Refresh the inode from the reply and remember the granted lock. */
423 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
424 if (!rc && itp->it_lock_mode)
425 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
428 ptlrpc_req_finished(req);
429 ll_intent_drop_lock(itp);
/*
 * Populate an obd_client_handle from the MDT reply carried by @it
 * (file handle, FID, lease lock cookie, open flags) and register it
 * for open replay so the handle survives MDS recovery.
 * NOTE(review): braces/blank lines are elided in this extract.
 */
434 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
435 struct obd_client_handle *och)
437 struct mdt_body *body;
439 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
440 och->och_fh = body->mbo_handle;
441 och->och_fid = body->mbo_fid1;
442 och->och_lease_handle.cookie = it->it_lock_handle;
443 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
444 och->och_flags = it->it_flags;
446 return md_set_open_replay_data(md_exp, och, it);
/*
 * Finish the client-local part of an open: optionally fill @och from
 * the intent reply, attach @fd as the file's private data, initialize
 * read-ahead state and the ll_cl_context bookkeeping, and record the
 * effective open mode.
 * NOTE(review): lines elided in this extract ('rc' declaration, the
 * och != NULL guard around ll_och_fill, error returns, braces).
 */
449 static int ll_local_open(struct file *file, struct lookup_intent *it,
450 struct ll_file_data *fd, struct obd_client_handle *och)
452 struct inode *inode = file->f_path.dentry->d_inode;
455 LASSERT(!LUSTRE_FPRIVATE(file));
462 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
467 LUSTRE_FPRIVATE(file) = fd;
468 ll_readahead_init(inode, &fd->fd_ras);
/* Remember only the access-mode bits; other it_flags are open flags. */
469 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
471 /* ll_cl_context initialize */
472 rwlock_init(&fd->fd_lock);
473 INIT_LIST_HEAD(&fd->fd_lccs);
/*
 * VFS ->open() for Lustre.  Two paths:
 *  - the intent (IT_OPEN) was already executed during lookup and its
 *    result rides in file->private_data; or
 *  - no usable intent exists (e.g. NFS export, raced open), so a fresh
 *    intent is built from f_flags and ll_intent_file_open() is called.
 * Open handles are shared per inode and access mode (read/write/exec)
 * under lli_och_mutex; an extra MDS open that raced with an existing
 * handle is released again.  Cleanup is via out_och_free/out_openerr.
 * NOTE(review): many lines are elided in this extract (the 'restart'
 * label, usecount increments, rc declaration, braces, GOTO targets);
 * treat the visible text as a partial view only.
 */
478 /* Open a file, and (for the very first open) create objects on the OSTs at
479 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
480 * creation or open until ll_lov_setstripe() ioctl is called.
482 * If we already have the stripe MD locally then we don't request it in
483 * md_open(), by passing a lmm_size = 0.
485 * It is up to the application to ensure no other processes open this file
486 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
487 * used. We might be able to avoid races of that sort by getting lli_open_sem
488 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
489 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
491 int ll_file_open(struct inode *inode, struct file *file)
493 struct ll_inode_info *lli = ll_i2info(inode);
494 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
495 .it_flags = file->f_flags };
496 struct obd_client_handle **och_p = NULL;
497 __u64 *och_usecount = NULL;
498 struct ll_file_data *fd;
502 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
503 PFID(ll_inode2fid(inode)), inode, file->f_flags);
505 it = file->private_data; /* XXX: compat macro */
506 file->private_data = NULL; /* prevent ll_local_open assertion */
508 fd = ll_file_data_get();
510 GOTO(out_openerr, rc = -ENOMEM);
513 if (S_ISDIR(inode->i_mode))
514 ll_authorize_statahead(inode, fd);
/* Root inode: attach fd and skip the MDS open machinery entirely. */
516 if (inode->i_sb->s_root == file->f_path.dentry) {
517 LUSTRE_FPRIVATE(file) = fd;
521 if (!it || !it->it_disposition) {
522 /* Convert f_flags into access mode. We cannot use file->f_mode,
523 * because everything but O_ACCMODE mask was stripped from
/* O_ACCMODE trick: (flags + 1) maps O_RDONLY/O_WRONLY/O_RDWR onto
 * FMODE_READ/FMODE_WRITE bits. */
525 if ((oit.it_flags + 1) & O_ACCMODE)
527 if (file->f_flags & O_TRUNC)
528 oit.it_flags |= FMODE_WRITE;
530 /* kernel only call f_op->open in dentry_open. filp_open calls
531 * dentry_open after call to open_namei that checks permissions.
532 * Only nfsd_open call dentry_open directly without checking
533 * permissions and because of that this code below is safe. */
534 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
535 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
537 /* We do not want O_EXCL here, presumably we opened the file
538 * already? XXX - NFS implications? */
539 oit.it_flags &= ~O_EXCL;
541 /* bug20584, if "it_flags" contains O_CREAT, the file will be
542 * created if necessary, then "IT_CREAT" should be set to keep
543 * consistent with it */
544 if (oit.it_flags & O_CREAT)
545 oit.it_op |= IT_CREAT;
551 /* Let's see if we have file open on MDS already. */
552 if (it->it_flags & FMODE_WRITE) {
553 och_p = &lli->lli_mds_write_och;
554 och_usecount = &lli->lli_open_fd_write_count;
555 } else if (it->it_flags & FMODE_EXEC) {
556 och_p = &lli->lli_mds_exec_och;
557 och_usecount = &lli->lli_open_fd_exec_count;
559 och_p = &lli->lli_mds_read_och;
560 och_usecount = &lli->lli_open_fd_read_count;
563 mutex_lock(&lli->lli_och_mutex);
564 if (*och_p) { /* Open handle is present */
565 if (it_disposition(it, DISP_OPEN_OPEN)) {
566 /* Well, there's extra open request that we do not need,
567 let's close it somehow. This will decref request. */
568 rc = it_open_error(DISP_OPEN_OPEN, it);
570 mutex_unlock(&lli->lli_och_mutex);
571 GOTO(out_openerr, rc);
574 ll_release_openhandle(file->f_path.dentry, it);
/* Reuse the existing shared handle: local open only (och == NULL). */
578 rc = ll_local_open(file, it, fd, NULL);
581 mutex_unlock(&lli->lli_och_mutex);
582 GOTO(out_openerr, rc);
585 LASSERT(*och_usecount == 0);
586 if (!it->it_disposition) {
587 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
588 /* We cannot just request lock handle now, new ELC code
589 means that one of other OPEN locks for this file
590 could be cancelled, and since blocking ast handler
591 would attempt to grab och_mutex as well, that would
592 result in a deadlock */
593 mutex_unlock(&lli->lli_och_mutex);
595 * Normally called under two situations:
597 * 2. A race/condition on MDS resulting in no open
598 * handle to be returned from LOOKUP|OPEN request,
599 * for example if the target entry was a symlink.
601 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
602 * marked by a bit set in ll_iget_for_nfs. Clear the
603 * bit so that it's not confusing later callers.
605 * NB; when ldd is NULL, it must have come via normal
606 * lookup path only, since ll_iget_for_nfs always calls
609 if (ldd && ldd->lld_nfs_dentry) {
610 ldd->lld_nfs_dentry = 0;
611 it->it_flags |= MDS_OPEN_LOCK;
615 * Always specify MDS_OPEN_BY_FID because we don't want
616 * to get file with different fid.
618 it->it_flags |= MDS_OPEN_BY_FID;
619 rc = ll_intent_file_open(file, NULL, 0, it);
621 GOTO(out_openerr, rc);
625 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
627 GOTO(out_och_free, rc = -ENOMEM);
631 /* md_intent_lock() didn't get a request ref if there was an
632 * open error, so don't do cleanup on the request here
634 /* XXX (green): Should not we bail out on any error here, not
635 * just open error? */
636 rc = it_open_error(DISP_OPEN_OPEN, it);
638 GOTO(out_och_free, rc);
640 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
641 "inode %p: disposition %x, status %d\n", inode,
642 it_disposition(it, ~0), it->it_status);
644 rc = ll_local_open(file, it, fd, *och_p);
646 GOTO(out_och_free, rc);
648 mutex_unlock(&lli->lli_och_mutex);
651 /* Must do this outside lli_och_mutex lock to prevent deadlock where
652 different kind of OPEN lock for this same inode gets cancelled
653 by ldlm_cancel_lru */
654 if (!S_ISREG(inode->i_mode))
655 GOTO(out_och_free, rc);
657 cl_lov_delay_create_clear(&file->f_flags);
658 GOTO(out_och_free, rc);
/* Error path: free an allocated-but-unused handle; OBD_FREE poisons
 * the memory, so also clear the slot pointer. */
662 if (och_p && *och_p) {
663 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
664 *och_p = NULL; /* OBD_FREE writes some magic there */
667 mutex_unlock(&lli->lli_och_mutex);
670 if (lli->lli_opendir_key == fd)
671 ll_deauthorize_statahead(inode, fd);
673 ll_file_data_put(fd);
675 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* Drop the extra intent request reference taken during enqueue. */
678 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
679 ptlrpc_req_finished(it->it_request);
680 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for a lease lock: on conflict (LDLM_CB_BLOCKING) simply
 * cancel the lease asynchronously — unlike the normal open-lock AST,
 * no open handle cleanup is done here (the holder detects the broken
 * lease later).  LDLM_CB_CANCELING is the cancel-side branch.
 * NOTE(review): the switch header, braces and RETURN are elided in
 * this extract.
 */
686 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
687 struct ldlm_lock_desc *desc, void *data, int flag)
690 struct lustre_handle lockh;
694 case LDLM_CB_BLOCKING:
695 ldlm_lock2handle(lock, &lockh);
696 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
698 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
702 case LDLM_CB_CANCELING:
/*
 * Acquire an MDS lease (FMODE_READ or FMODE_WRITE only) on @inode and
 * return the resulting open handle.  When @file is given, the caller
 * must be the sole opener in a compatible mode; the existing open
 * handle's cookie is passed as op_handle so the MDT can verify the
 * same owner.  The enqueue uses LDLM_FL_NO_LRU | LDLM_FL_EXCL so the
 * lease lock is neither LRU-cancelled nor matched by normal opens.
 * Errors unwind through out_close (cancel lock + close handle) and
 * out_release_it (drop the intent).
 * NOTE(review): several lines are elided in this extract (och
 * allocation, 'rc'/'rc2'/'och_usecount' declarations, the fd->fd_och
 * handover when usecount == 1, braces, RETURN values).
 */
710 * Acquire a lease and open the file.
712 static struct obd_client_handle *
713 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
716 struct lookup_intent it = { .it_op = IT_OPEN };
717 struct ll_sb_info *sbi = ll_i2sbi(inode);
718 struct md_op_data *op_data;
719 struct ptlrpc_request *req = NULL;
720 struct lustre_handle old_handle = { 0 };
721 struct obd_client_handle *och = NULL;
726 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
727 RETURN(ERR_PTR(-EINVAL));
730 struct ll_inode_info *lli = ll_i2info(inode);
731 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
732 struct obd_client_handle **och_p;
/* The lease mode must be covered by the file's open mode; exec
 * opens cannot take leases. */
735 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
736 RETURN(ERR_PTR(-EPERM));
738 /* Get the openhandle of the file */
740 mutex_lock(&lli->lli_och_mutex);
/* Only one lease per file descriptor. */
741 if (fd->fd_lease_och != NULL) {
742 mutex_unlock(&lli->lli_och_mutex);
746 if (fd->fd_och == NULL) {
747 if (file->f_mode & FMODE_WRITE) {
748 LASSERT(lli->lli_mds_write_och != NULL);
749 och_p = &lli->lli_mds_write_och;
750 och_usecount = &lli->lli_open_fd_write_count;
752 LASSERT(lli->lli_mds_read_och != NULL);
753 och_p = &lli->lli_mds_read_och;
754 och_usecount = &lli->lli_open_fd_read_count;
756 if (*och_usecount == 1) {
763 mutex_unlock(&lli->lli_och_mutex);
764 if (rc < 0) /* more than 1 opener */
767 LASSERT(fd->fd_och != NULL);
768 old_handle = fd->fd_och->och_fh;
773 RETURN(ERR_PTR(-ENOMEM));
775 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
776 LUSTRE_OPC_ANY, NULL);
778 GOTO(out, rc = PTR_ERR(op_data));
780 /* To tell the MDT this openhandle is from the same owner */
781 op_data->op_handle = old_handle;
783 it.it_flags = fmode | open_flags;
784 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
785 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
786 &ll_md_blocking_lease_ast,
787 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
788 * it can be cancelled which may mislead applications that the lease is
790 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
791 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
792 * doesn't deal with openhandle, so normal openhandle will be leaked. */
793 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
794 ll_finish_md_op_data(op_data);
795 ptlrpc_req_finished(req);
797 GOTO(out_release_it, rc);
799 if (it_disposition(&it, DISP_LOOKUP_NEG))
800 GOTO(out_release_it, rc = -ENOENT);
802 rc = it_open_error(DISP_OPEN_OPEN, &it);
804 GOTO(out_release_it, rc);
806 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
807 ll_och_fill(sbi->ll_md_exp, &it, och);
/* A server without lease support answers without DISP_OPEN_LEASE. */
809 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
810 GOTO(out_close, rc = -EOPNOTSUPP);
812 /* already get lease, handle lease lock */
813 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
814 if (it.it_lock_mode == 0 ||
815 it.it_lock_bits != MDS_INODELOCK_OPEN) {
816 /* open lock must return for lease */
817 CERROR(DFID "lease granted but no open lock, %d/"LPU64".\n",
818 PFID(ll_inode2fid(inode)), it.it_lock_mode,
820 GOTO(out_close, rc = -EPROTO);
823 ll_intent_release(&it);
827 /* Cancel open lock */
828 if (it.it_lock_mode != 0) {
829 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
832 och->och_lease_handle.cookie = 0ULL;
834 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
836 CERROR("%s: error closing file "DFID": %d\n",
837 ll_get_fsname(inode->i_sb, NULL, 0),
838 PFID(&ll_i2info(inode)->lli_fid), rc2);
839 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
841 ll_intent_release(&it);
/*
 * Both inodes must be regular files, writable by the caller, and live
 * on the same superblock for a layout swap to be permitted.
 * NOTE(review): the specific error codes returned on each failing
 * check are on lines elided from this extract.
 */
849 * Check whether a layout swap can be done between two inodes.
851 * \param[in] inode1 First inode to check
852 * \param[in] inode2 Second inode to check
854 * \retval 0 on success, layout swap can be performed between both inodes
855 * \retval negative error code if requirements are not met
857 static int ll_check_swap_layouts_validity(struct inode *inode1,
858 struct inode *inode2)
860 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
863 if (inode_permission(inode1, MAY_WRITE) ||
864 inode_permission(inode2, MAY_WRITE))
867 if (inode1->i_sb != inode2->i_sb)
/*
 * Close @och on @inode with the MDS_CLOSE_LAYOUT_SWAP bias so the MDT
 * atomically swaps the layouts of @inode and @inode2 as part of the
 * close.  Validates the pair first and rejects a swap of an inode with
 * itself (equal FIDs).  @och is consumed (freed) on the close path.
 * NOTE(review): 'rc' declaration, out_free_och label and RETURN are
 * elided in this extract.
 */
873 static int ll_swap_layouts_close(struct obd_client_handle *och,
874 struct inode *inode, struct inode *inode2)
876 const struct lu_fid *fid1 = ll_inode2fid(inode);
877 const struct lu_fid *fid2;
881 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
882 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
884 rc = ll_check_swap_layouts_validity(inode, inode2);
886 GOTO(out_free_och, rc);
888 /* We now know that inode2 is a lustre inode */
889 fid2 = ll_inode2fid(inode2);
/* Same FID => same file: swapping with itself makes no sense. */
891 rc = lu_fid_cmp(fid1, fid2);
893 GOTO(out_free_och, rc = -EINVAL);
895 /* Close the file and swap layouts between inode & inode2.
896 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
897 * because we still need it to pack l_remote_handle to MDT. */
898 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
901 och = NULL; /* freed in ll_close_inode_openhandle() */
/*
 * Release a lease: detect (under the lock's resource lock) whether the
 * lease lock was already cancelled — i.e. the lease was broken — then
 * cancel it if still intact, report brokenness via *@lease_broken, and
 * close the underlying open handle.
 * NOTE(review): the LDLM_LOCK_PUT / "if (lock != NULL)" framing around
 * the handle2lock result is on lines elided from this extract.
 */
911 * Release lease and close the file.
912 * It will check if the lease has ever broken.
914 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
917 struct ldlm_lock *lock;
918 bool cancelled = true;
922 lock = ldlm_handle2lock(&och->och_lease_handle);
924 lock_res_and_lock(lock);
925 cancelled = ldlm_is_cancel(lock);
926 unlock_res_and_lock(lock);
930 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
931 PFID(&ll_i2info(inode)->lli_fid), cancelled);
934 ldlm_cli_cancel(&och->och_lease_handle, 0);
935 if (lease_broken != NULL)
936 *lease_broken = cancelled;
938 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/*
 * Merge MDS-sourced timestamps (lli_{a,m,c}time) with the OST-sourced
 * attributes from the cl_object (size, blocks, timestamps), taking the
 * newer of each timestamp, all under the inode size lock.  atime is
 * only ever moved forward — see the comment below on Lustre's relaxed
 * atime semantics.
 * NOTE(review): declarations of 'rc' and the atime/mtime/ctime locals,
 * plus braces/RETURN, are on lines elided from this extract.
 */
942 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
944 struct ll_inode_info *lli = ll_i2info(inode);
945 struct cl_object *obj = lli->lli_clob;
946 struct cl_attr *attr = vvp_env_thread_attr(env);
954 ll_inode_size_lock(inode);
956 /* Merge timestamps the most recently obtained from MDS with
957 * timestamps obtained from OSTs.
959 * Do not overwrite atime of inode because it may be refreshed
960 * by file_accessed() function. If the read was served by cache
961 * data, there is no RPC to be sent so that atime may not be
962 * transferred to OSTs at all. MDT only updates atime at close time
963 * if it's at least 'mdd.*.atime_diff' older.
964 * All in all, the atime in Lustre does not strictly comply with
965 * POSIX. Solving this problem needs to send an RPC to MDT for each
966 * read, this will hurt performance. */
967 if (LTIME_S(inode->i_atime) < lli->lli_atime)
968 LTIME_S(inode->i_atime) = lli->lli_atime;
969 LTIME_S(inode->i_mtime) = lli->lli_mtime;
970 LTIME_S(inode->i_ctime) = lli->lli_ctime;
972 atime = LTIME_S(inode->i_atime);
973 mtime = LTIME_S(inode->i_mtime);
974 ctime = LTIME_S(inode->i_ctime);
/* Pull size/blocks/timestamps aggregated across the OST objects. */
976 cl_object_attr_lock(obj);
977 rc = cl_object_attr_get(env, obj, attr);
978 cl_object_attr_unlock(obj);
981 GOTO(out_size_unlock, rc);
/* Keep the newest of MDS vs OST for each timestamp. */
983 if (atime < attr->cat_atime)
984 atime = attr->cat_atime;
986 if (ctime < attr->cat_ctime)
987 ctime = attr->cat_ctime;
989 if (mtime < attr->cat_mtime)
990 mtime = attr->cat_mtime;
992 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
993 PFID(&lli->lli_fid), attr->cat_size);
995 i_size_write(inode, attr->cat_size);
996 inode->i_blocks = attr->cat_blocks;
998 LTIME_S(inode->i_atime) = atime;
999 LTIME_S(inode->i_mtime) = mtime;
1000 LTIME_S(inode->i_ctime) = ctime;
1003 ll_inode_size_unlock(inode);
/*
 * Mirror of the kernel's touch_atime()/file_accessed() checks: true if
 * any of the per-file (O_NOATIME), per-inode (S_NOATIME/IS_NOATIME),
 * per-mount (MNT_NOATIME/MNT_READONLY/MNT_NODIRATIME) or per-sb
 * (MS_NODIRATIME) conditions suppress atime updates for this file.
 * NOTE(review): the 'return true/false' lines between the checks are
 * elided in this extract.
 */
1008 static bool file_is_noatime(const struct file *file)
1010 const struct vfsmount *mnt = file->f_path.mnt;
1011 const struct inode *inode = file->f_path.dentry->d_inode;
1013 /* Adapted from file_accessed() and touch_atime().*/
1014 if (file->f_flags & O_NOATIME)
1017 if (inode->i_flags & S_NOATIME)
1020 if (IS_NOATIME(inode))
1023 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1026 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1029 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/*
 * Initialize a cl_io for this file from the open flags: nonblocking,
 * append, sync-write behaviour, the backing cl_object, lock policy
 * (never for nolock files, mandatory for append, maybe otherwise) and
 * the noatime decision.
 * NOTE(review): part of the wr_sync condition (the continuation after
 * O_DIRECT ||) and the @write-specific guard lines are elided in this
 * extract.
 */
1035 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
1037 struct inode *inode = file->f_path.dentry->d_inode;
1039 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1041 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1042 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1043 file->f_flags & O_DIRECT ||
1046 io->ci_obj = ll_i2info(inode)->lli_clob;
1047 io->ci_lockreq = CILR_MAYBE;
1048 if (ll_file_nolock(file)) {
1049 io->ci_lockreq = CILR_NEVER;
1050 io->ci_no_srvlock = 1;
1051 } else if (file->f_flags & O_APPEND) {
1052 io->ci_lockreq = CILR_MANDATORY;
1055 io->ci_noatime = file_is_noatime(file);
/*
 * Generic read/write engine shared by the read_iter/write_iter/splice
 * paths.  Builds a cl_io, takes the per-inode range lock for writes
 * (and O_DIRECT reads — see LU-6227) unless a group lock is held,
 * runs cl_io_loop(), accumulates partial progress, and restarts the IO
 * when the cl layer asks for it (ci_need_restart).  Also updates the
 * per-sb read/write byte stats and the fd's write-failed latch.
 * NOTE(review): this extract elides lines (the 'restart:' label,
 * 'io'/'rc'/'result' declarations, IO_SPLICE case label, braces,
 * 'out:' label) — treat it as a partial view.
 */
1059 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1060 struct file *file, enum cl_io_type iot,
1061 loff_t *ppos, size_t count)
1063 struct vvp_io *vio = vvp_env_io(env);
1064 struct inode *inode = file->f_path.dentry->d_inode;
1065 struct ll_inode_info *lli = ll_i2info(inode);
1066 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1070 struct range_lock range;
1074 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zu\n",
1075 file->f_path.dentry->d_name.name, iot, *ppos, count);
1078 io = vvp_env_thread_io(env);
1079 ll_io_init(io, file, iot == CIT_WRITE);
1081 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1082 bool range_locked = false;
/* Append writes can land anywhere past EOF, so lock to infinity. */
1084 if (file->f_flags & O_APPEND)
1085 range_lock_init(&range, 0, LUSTRE_EOF);
1087 range_lock_init(&range, *ppos, *ppos + count - 1);
1089 vio->vui_fd = LUSTRE_FPRIVATE(file);
1090 vio->vui_io_subtype = args->via_io_subtype;
1092 switch (vio->vui_io_subtype) {
1094 vio->vui_iter = args->u.normal.via_iter;
1095 vio->vui_iocb = args->u.normal.via_iocb;
1096 /* Direct IO reads must also take range lock,
1097 * or multiple reads will try to work on the same pages
1098 * See LU-6227 for details. */
1099 if (((iot == CIT_WRITE) ||
1100 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1101 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1102 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1104 rc = range_lock(&lli->lli_write_tree, &range);
1108 range_locked = true;
1112 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1113 vio->u.splice.vui_flags = args->u.splice.via_flags;
1116 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
/* Publish the io in the env so lower layers can find it. */
1120 ll_cl_add(file, env, io, LCC_RW);
1121 rc = cl_io_loop(env, io);
1122 ll_cl_remove(file, env);
1125 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1127 range_unlock(&lli->lli_write_tree, &range);
1130 /* cl_io_rw_init() handled IO */
/* Accumulate partial progress before a possible restart. */
1134 if (io->ci_nob > 0) {
1135 result += io->ci_nob;
1136 count -= io->ci_nob;
1137 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1139 /* prepare IO restart */
1140 if (count > 0 && args->via_io_subtype == IO_NORMAL)
1141 args->u.normal.via_iter = vio->vui_iter;
1145 cl_io_fini(env, io);
1147 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1149 "%s: restart %s from %lld, count:%zu, result: %zd\n",
1150 file->f_path.dentry->d_name.name,
1151 iot == CIT_READ ? "read" : "write",
1152 *ppos, count, result);
1156 if (iot == CIT_READ) {
1158 ll_stats_ops_tally(ll_i2sbi(inode),
1159 LPROC_LL_READ_BYTES, result);
1160 } else if (iot == CIT_WRITE) {
1162 ll_stats_ops_tally(ll_i2sbi(inode),
1163 LPROC_LL_WRITE_BYTES, result);
1164 fd->fd_write_failed = false;
1165 } else if (result == 0 && rc == 0) {
1168 fd->fd_write_failed = true;
1170 fd->fd_write_failed = false;
/* -ERESTARTSYS means "retry the syscall", not a real write failure. */
1171 } else if (rc != -ERESTARTSYS) {
1172 fd->fd_write_failed = true;
1176 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1178 return result > 0 ? result : rc;
1182 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1183 * especially for small I/O.
1185 * To serve a read request, CLIO has to create and initialize a cl_io and
1186 * then request DLM lock. This has turned out to have siginificant overhead
1187 * and affects the performance of small I/O dramatically.
1189 * It's not necessary to create a cl_io for each I/O. Under the help of read
1190 * ahead, most of the pages being read are already in memory cache and we can
1191 * read those pages directly because if the pages exist, the corresponding DLM
1192 * lock must exist so that page content must be valid.
1194 * In fast read implementation, the llite speculatively finds and reads pages
1195 * in memory cache. There are three scenarios for fast read:
1196 * - If the page exists and is uptodate, kernel VM will provide the data and
1197 * CLIO won't be intervened;
1198 * - If the page was brought into memory by read ahead, it will be exported
1199 * and read ahead parameters will be updated;
1200 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1201 * it will go back and invoke normal read, i.e., a cl_io will be created
1202 * and DLM lock will be requested.
1204 * POSIX compliance: posix standard states that read is intended to be atomic.
1205 * Lustre read implementation is in line with Linux kernel read implementation
1206 * and neither of them complies with POSIX standard in this matter. Fast read
1207 * doesn't make the situation worse on single node but it may interleave write
1208 * results from multiple nodes due to short read handling in ll_file_aio_read().
1210 * \param env - lu_env
1211 * \param iocb - kiocb from kernel
1212 * \param iter - user space buffers where the data will be copied
1214 * \retval - number of bytes have been read, or error code if error occurred.
/*
 * Fast read: serve the read straight from the page cache via
 * generic_file_read_iter(), skipping cl_io/DLM setup (see the design
 * comment above).  Only taken when the superblock has fast-read enabled
 * and the file is not opened O_DIRECT.
 */
1217 ll_do_fast_read(const struct lu_env *env, struct kiocb *iocb,
1218 struct iov_iter *iter)
1222 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1225 /* NB: we can't do direct IO for fast read because it will need a lock
1226 * to make the IO engine happy. */
1227 if (iocb->ki_filp->f_flags & O_DIRECT)
/* Bracket the generic read with ll_cl_add/ll_cl_remove so ll_readpage()
 * can find the environment for this thread. */
1230 ll_cl_add(iocb->ki_filp, env, NULL, LCC_RW);
1231 result = generic_file_read_iter(iocb, iter);
1232 ll_cl_remove(iocb->ki_filp, env);
1234 /* If the first page is not in cache, generic_file_aio_read() will
1235 * return -ENODATA.
1236 * See corresponding code in ll_readpage(). */
1237 if (result == -ENODATA)
1241 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1242 LPROC_LL_READ_BYTES, result);
1248 * Read from a file (through the page cache).
/*
 * iter-based read entry point: try the lockless fast-read path first;
 * if bytes remain (or fast read was not applicable) fall back to a full
 * CIT_READ cl_io through ll_file_io_generic().
 */
1250 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1253 struct vvp_io_args *args;
1258 env = cl_env_get(&refcheck);
1260 return PTR_ERR(env);
1262 result = ll_do_fast_read(env, iocb, to);
/* Done if fast read failed outright or consumed the whole iterator. */
1263 if (result < 0 || iov_iter_count(to) == 0)
1266 args = ll_env_args(env, IO_NORMAL);
1267 args->u.normal.via_iter = to;
1268 args->u.normal.via_iocb = iocb;
1270 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1271 &iocb->ki_pos, iov_iter_count(to));
1274 else if (result == 0)
1278 cl_env_put(env, &refcheck);
1283 * Write to a file (through the page cache).
/*
 * iter-based write entry point: package the iterator/iocb into normal-IO
 * args and run a CIT_WRITE cl_io at iocb->ki_pos.
 */
1285 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1287 struct vvp_io_args *args;
1292 env = cl_env_get(&refcheck);
1294 return PTR_ERR(env);
1296 args = ll_env_args(env, IO_NORMAL);
1297 args->u.normal.via_iter = from;
1298 args->u.normal.via_iocb = iocb;
1300 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1301 &iocb->ki_pos, iov_iter_count(from));
1302 cl_env_put(env, &refcheck);
1306 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1308 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array and compute the total byte count, mirroring the
 * kernel's __generic_file_aio_write_nolock(): reject negative/overflowing
 * lengths with -EINVAL, and truncate the segment list at the first
 * inaccessible segment.
 */
1310 static int ll_file_get_iov_count(const struct iovec *iov,
1311 unsigned long *nr_segs, size_t *count)
1316 for (seg = 0; seg < *nr_segs; seg++) {
1317 const struct iovec *iv = &iov[seg];
1320 * If any segment has a negative length, or the cumulative
1321 * length ever wraps negative then return -EINVAL.
1324 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1326 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1331 cnt -= iv->iov_len; /* This segment is no good */
/*
 * Legacy aio_read shim (pre read_iter kernels): validate the iovec,
 * wrap it in an iov_iter and delegate to ll_file_read_iter().
 */
1338 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1339 unsigned long nr_segs, loff_t pos)
1346 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1350 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1351 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1352 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1353 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1354 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1356 result = ll_file_read_iter(iocb, &to);
/*
 * Legacy synchronous read: build a sync kiocb at *ppos, delegate to
 * ll_file_aio_read() with a single-segment iovec, then publish the
 * updated position back through *ppos.
 */
1361 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1364 struct iovec iov = { .iov_base = buf, .iov_len = count };
1365 struct kiocb *kiocb;
1369 OBD_ALLOC_PTR(kiocb);
1373 init_sync_kiocb(kiocb, file);
1374 kiocb->ki_pos = *ppos;
/* Field name for the remaining-byte count differs across kernels. */
1375 #ifdef HAVE_KIOCB_KI_LEFT
1376 kiocb->ki_left = count;
1377 #elif defined(HAVE_KI_NBYTES)
1378 kiocb->ki_nbytes = count;
1381 result = ll_file_aio_read(kiocb, &iov, 1, kiocb->ki_pos);
1382 *ppos = kiocb->ki_pos;
1384 OBD_FREE_PTR(kiocb);
1389 * Write to a file (through the page cache).
/*
 * Legacy aio_write shim (pre write_iter kernels): validate the iovec,
 * wrap it in an iov_iter and delegate to ll_file_write_iter().
 */
1392 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1393 unsigned long nr_segs, loff_t pos)
1395 struct iov_iter from;
1400 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1404 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1405 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1406 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1407 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1408 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1410 result = ll_file_write_iter(iocb, &from);
/*
 * Legacy synchronous write: use the per-thread kiocb from the lu_env info
 * (note: unlike ll_file_read() this does not allocate one), delegate to
 * ll_file_aio_write() and publish the new position through *ppos.
 */
1415 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1416 size_t count, loff_t *ppos)
1419 struct iovec iov = { .iov_base = (void __user *)buf,
1421 struct kiocb *kiocb;
1426 env = cl_env_get(&refcheck);
1428 RETURN(PTR_ERR(env));
1430 kiocb = &ll_env_info(env)->lti_kiocb;
1431 init_sync_kiocb(kiocb, file);
1432 kiocb->ki_pos = *ppos;
1433 #ifdef HAVE_KIOCB_KI_LEFT
1434 kiocb->ki_left = count;
1435 #elif defined(HAVE_KI_NBYTES)
1436 kiocb->ki_nbytes = count;
1439 result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos);
1440 *ppos = kiocb->ki_pos;
1442 cl_env_put(env, &refcheck);
1445 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1448 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read entry point: run a CIT_READ cl_io with splice-specific args
 * (pipe + flags) instead of an iov_iter.
 */
1450 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1451 struct pipe_inode_info *pipe, size_t count,
1455 struct vvp_io_args *args;
1460 env = cl_env_get(&refcheck);
1462 RETURN(PTR_ERR(env));
1464 args = ll_env_args(env, IO_SPLICE);
1465 args->u.splice.via_pipe = pipe;
1466 args->u.splice.via_flags = flags;
1468 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1469 cl_env_put(env, &refcheck);
/*
 * Apply a striping (layout) EA to @inode by re-opening it by FID with the
 * user-supplied lov_user_md attached to the open intent, under the inode
 * size lock; the transient open handle is released immediately after.
 */
1473 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1474 __u64 flags, struct lov_user_md *lum,
1477 struct lookup_intent oit = {
1479 .it_flags = flags | MDS_OPEN_BY_FID,
1484 ll_inode_size_lock(inode);
1485 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1487 GOTO(out_unlock, rc);
1489 ll_release_openhandle(file->f_path.dentry, &oit);
1492 ll_inode_size_unlock(inode);
1493 ll_intent_release(&oit);
/* Clear the delayed-create flag so later opens behave normally. */
1494 cl_lov_delay_create_clear(&file->f_flags);
/*
 * Fetch the LOV EA for @filename from the MDS via md_getattr_name().
 * Validates the LOV magic (V1/V3 only) and, on big-endian hosts, swabs
 * the MDS's little-endian reply to host order before returning it.
 * On success *lmmp points into the reply buffer, which stays pinned by
 * *request — the caller must release the request when done.
 */
1499 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1500 struct lov_mds_md **lmmp, int *lmm_size,
1501 struct ptlrpc_request **request)
1503 struct ll_sb_info *sbi = ll_i2sbi(inode);
1504 struct mdt_body *body;
1505 struct lov_mds_md *lmm = NULL;
1506 struct ptlrpc_request *req = NULL;
1507 struct md_op_data *op_data;
1510 rc = ll_get_default_mdsize(sbi, &lmmsize);
1514 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1515 strlen(filename), lmmsize,
1516 LUSTRE_OPC_ANY, NULL);
1517 if (IS_ERR(op_data))
1518 RETURN(PTR_ERR(op_data));
1520 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1521 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1522 ll_finish_md_op_data(op_data);
1524 CDEBUG(D_INFO, "md_getattr_name failed "
1525 "on %s: rc %d\n", filename, rc);
1529 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1530 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1532 lmmsize = body->mbo_eadatasize;
/* No striping EA present on this object. */
1534 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1536 GOTO(out, rc = -ENODATA);
1539 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1540 LASSERT(lmm != NULL);
1542 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1543 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1544 GOTO(out, rc = -EPROTO);
1548 * This is coming from the MDS, so is probably in
1549 * little endian. We convert it to host endian before
1550 * passing it to userspace.
/* cpu_to_le32() is a no-op on little-endian hosts, so this branch
 * only runs (and swabs) on big-endian machines. */
1552 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1555 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1556 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1559 /* if function called for directory - we should
1560 * avoid swabbing non-existent lsm objects */
1561 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1562 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1563 if (S_ISREG(body->mbo_mode))
1564 lustre_swab_lov_user_md_objects(
1565 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1567 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1568 lustre_swab_lov_user_md_v3(
1569 (struct lov_user_md_v3 *)lmm);
1570 if (S_ISREG(body->mbo_mode))
1571 lustre_swab_lov_user_md_objects(
1572 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1579 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: copy a lov_user_md (with one OST object entry)
 * from userspace and apply it via ll_lov_setstripe_ea_info().
 * Privileged: requires CFS_CAP_SYS_ADMIN since MDS_OPEN_HAS_OBJS trusts
 * caller-supplied object IDs.
 */
1584 static int ll_lov_setea(struct inode *inode, struct file *file,
1587 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1588 struct lov_user_md *lump;
1589 int lum_size = sizeof(struct lov_user_md) +
1590 sizeof(struct lov_user_ost_data);
1594 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1597 OBD_ALLOC_LARGE(lump, lum_size);
1601 if (copy_from_user(lump, (struct lov_user_md __user *)arg, lum_size))
1602 GOTO(out_lump, rc = -EFAULT);
1604 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1607 OBD_FREE_LARGE(lump, lum_size);
/*
 * Copy the file's striping information to the userspace buffer @lum via
 * the cl_object layer.
 */
1611 static int ll_file_getstripe(struct inode *inode,
1612 struct lov_user_md __user *lum)
1619 env = cl_env_get(&refcheck);
1621 RETURN(PTR_ERR(env));
1623 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum);
1624 cl_env_put(env, &refcheck);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy the user's lov_user_md, apply it,
 * then refresh the layout and copy the effective striping back so the
 * caller sees what was actually set.
 */
1628 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1631 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1632 struct lov_user_md *klum;
1634 __u64 flags = FMODE_WRITE;
1637 rc = ll_copy_user_md(lum, &klum);
1642 rc = ll_lov_setstripe_ea_info(inode, file, flags, klum, lum_size);
/* Zero the user's stripe_count first so a partial getstripe copy-back
 * cannot leave a stale count behind. */
1646 put_user(0, &lum->lmm_stripe_count);
1648 ll_layout_refresh(inode, &gen);
1649 rc = ll_file_getstripe(inode, (struct lov_user_md __user *)arg);
1652 OBD_FREE(klum, lum_size);
/*
 * LL_IOC_GROUP_LOCK handler: take a cluster-wide group lock with gid @arg
 * on behalf of this file descriptor.  Only one group lock per fd; the
 * flag/handle pair is published under lli_lock, and a lost race after the
 * (potentially blocking) cl_get_grouplock() is detected and unwound.
 */
1657 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1659 struct ll_inode_info *lli = ll_i2info(inode);
1660 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1661 struct ll_grouplock grouplock;
1666 CWARN("group id for group lock must not be 0\n");
1670 if (ll_file_nolock(file))
1671 RETURN(-EOPNOTSUPP);
1673 spin_lock(&lli->lli_lock);
1674 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1675 CWARN("group lock already existed with gid %lu\n",
1676 fd->fd_grouplock.lg_gid);
1677 spin_unlock(&lli->lli_lock);
1680 LASSERT(fd->fd_grouplock.lg_lock == NULL);
1681 spin_unlock(&lli->lli_lock);
/* Drop lli_lock across the enqueue: it may block (honours O_NONBLOCK). */
1683 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
1684 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1688 spin_lock(&lli->lli_lock);
1689 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1690 spin_unlock(&lli->lli_lock);
1691 CERROR("another thread just won the race\n");
1692 cl_put_grouplock(&grouplock);
1696 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1697 fd->fd_grouplock = grouplock;
1698 spin_unlock(&lli->lli_lock);
1700 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK handler: release the group lock held by this fd,
 * verifying under lli_lock that one is held and that its gid matches @arg
 * before clearing the per-fd state and dropping the cl-layer lock.
 */
1704 static int ll_put_grouplock(struct inode *inode, struct file *file,
1707 struct ll_inode_info *lli = ll_i2info(inode);
1708 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1709 struct ll_grouplock grouplock;
1712 spin_lock(&lli->lli_lock);
1713 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1714 spin_unlock(&lli->lli_lock);
1715 CWARN("no group lock held\n");
1719 LASSERT(fd->fd_grouplock.lg_lock != NULL);
1721 if (fd->fd_grouplock.lg_gid != arg) {
1722 CWARN("group lock %lu doesn't match current id %lu\n",
1723 arg, fd->fd_grouplock.lg_gid);
1724 spin_unlock(&lli->lli_lock);
/* Snapshot the handle and clear fd state before unlocking, then release
 * the lock itself outside the spinlock. */
1728 grouplock = fd->fd_grouplock;
1729 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1730 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1731 spin_unlock(&lli->lli_lock);
1733 cl_put_grouplock(&grouplock);
1734 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1739 * Close inode open handle
1741 * \param dentry [in] dentry which contains the inode
1742 * \param it [in,out] intent which contains open info and result
1745 * \retval <0 failure
/*
 * Close the MDS open handle carried by @it for @dentry's inode.
 * No-op for the root dentry or when the intent holds no open disposition.
 * Also drops the extra open-request reference (DISP_ENQ_OPEN_REF) that
 * would otherwise have been consumed by ll_file_open().
 */
1747 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1749 struct inode *inode = dentry->d_inode;
1750 struct obd_client_handle *och;
1756 /* Root ? Do nothing. */
1757 if (dentry->d_inode->i_sb->s_root == dentry)
1760 /* No open handle to close? Move away */
1761 if (!it_disposition(it, DISP_OPEN_OPEN))
1764 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1766 OBD_ALLOC(och, sizeof(*och));
1768 GOTO(out, rc = -ENOMEM);
1770 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1772 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
1774 /* this one is in place of ll_file_open */
1775 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1776 ptlrpc_req_finished(it->it_request);
1777 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1783 * Get size for inode for which FIEMAP mapping is requested.
1784 * Make the FIEMAP get_info call and returns the result.
1785 * \param fiemap kernel buffer to hold extens
1786 * \param num_bytes kernel buffer size
/*
 * Core FIEMAP implementation: validate flags, honour FIEMAP_FLAG_SYNC by
 * flushing dirty pages, glimpse the size if needed, then forward the
 * request to the object layer via cl_object_fiemap().
 */
1788 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
1794 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
/* Reject unsupported flags; the rejected set is reported back to the
 * caller in fm_flags, per the fiemap ioctl convention. */
1797 /* Checks for fiemap flags */
1798 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1799 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1803 /* Check for FIEMAP_FLAG_SYNC */
1804 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1805 rc = filemap_fdatawrite(inode->i_mapping);
1810 env = cl_env_get(&refcheck);
1812 RETURN(PTR_ERR(env));
1814 if (i_size_read(inode) == 0) {
1815 rc = ll_glimpse_size(inode);
1820 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1821 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
1822 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
1824 /* If filesize is 0, then there would be no objects for mapping */
1825 if (fmkey.lfik_oa.o_size == 0) {
1826 fiemap->fm_mapped_extents = 0;
1830 fmkey.lfik_fiemap = *fiemap;
1832 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
1833 &fmkey, fiemap, &num_bytes);
1835 cl_env_put(env, &refcheck);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a pathname by forwarding the
 * getinfo_fid2path request to the MDC export.  The caller-supplied path
 * buffer length is bounded by PATH_MAX; access is restricted to
 * CAP_DAC_READ_SEARCH unless the mount allows user fid2path.
 */
1839 int ll_fid2path(struct inode *inode, void __user *arg)
1841 struct obd_export *exp = ll_i2mdexp(inode);
1842 const struct getinfo_fid2path __user *gfin = arg;
1844 struct getinfo_fid2path *gfout;
1850 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1851 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1854 /* Only need to get the buflen */
1855 if (get_user(pathlen, &gfin->gf_pathlen))
1858 if (pathlen > PATH_MAX)
1861 outsize = sizeof(*gfout) + pathlen;
1862 OBD_ALLOC(gfout, outsize);
1866 if (copy_from_user(gfout, arg, sizeof(*gfout)))
1867 GOTO(gf_free, rc = -EFAULT);
1868 /* append root FID after gfout to let MDT know the root FID so that it
1869 * can look up the correct path; this is mainly for fileset.
1870 * old servers without fileset mount support will ignore this. */
1871 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
1873 /* Call mdc_iocontrol */
1874 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1878 if (copy_to_user(arg, gfout, outsize))
1882 OBD_FREE(gfout, outsize);
1887 * Read the data_version for inode.
1889 * This value is computed using stripe object version on OST.
1890 * Version is computed using server side locking.
1892 * @param flags if do sync on the OST side;
1894 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1895 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/*
 * Compute the inode's data version through a CIT_DATA_VERSION cl_io.
 * @flags selects OST-side flushing (LL_DV_RD_FLUSH / LL_DV_WR_FLUSH); the
 * io is retried from scratch when the layer stack asks for a restart.
 */
1897 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
1899 struct cl_object *obj = ll_i2info(inode)->lli_clob;
1907 /* If no file object initialized, we consider its version is 0. */
1913 env = cl_env_get(&refcheck);
1915 RETURN(PTR_ERR(env));
1917 io = vvp_env_thread_io(env);
1919 io->u.ci_data_version.dv_data_version = 0;
1920 io->u.ci_data_version.dv_flags = flags;
1923 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
1924 result = cl_io_loop(env, io);
1926 result = io->ci_result;
1928 *data_version = io->u.ci_data_version.dv_data_version;
1930 cl_io_fini(env, io);
/* Layout may have changed mid-io; redo the whole request. */
1932 if (unlikely(io->ci_need_restart))
1935 cl_env_put(env, &refcheck);
1941 * Trigger a HSM release request for the provided inode.
/*
 * HSM release: take a write lease on the file, capture the latest data
 * version (flushing all cached pages), merge attributes, then close the
 * handle with MDS_HSM_RELEASE so the MDT can free the OST objects.
 */
1943 int ll_hsm_release(struct inode *inode)
1946 struct obd_client_handle *och = NULL;
1947 __u64 data_version = 0;
1952 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1953 ll_get_fsname(inode->i_sb, NULL, 0),
1954 PFID(&ll_i2info(inode)->lli_fid));
1956 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
1958 GOTO(out, rc = PTR_ERR(och));
1960 /* Grab latest data_version and [am]time values */
1961 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
1965 env = cl_env_get(&refcheck);
1967 GOTO(out, rc = PTR_ERR(env));
1969 ll_merge_attr(env, inode);
1970 cl_env_put(env, &refcheck);
1972 /* Release the file.
1973 * NB: lease lock handle is released in mdc_hsm_release_pack() because
1974 * we still need it to pack l_remote_handle to MDT. */
1975 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
1981 if (och != NULL && !IS_ERR(och)) /* close the file */
1982 ll_lease_close(och, inode, NULL);
/* Working state for ll_swap_layouts(): the two inodes plus the (possibly
 * swapped) data versions and per-side check flags. */
1987 struct ll_swap_stack {
1990 struct inode *inode1;
1991 struct inode *inode2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS worker: atomically swap the layouts of two
 * files on the MDT.  The pair is ordered by FID to avoid lock inversion,
 * optionally protected by a group lock (gid != 0 flushes dirty cache),
 * and each side's data version can be re-checked just before the swap
 * (-EAGAIN if it changed).
 */
1996 static int ll_swap_layouts(struct file *file1, struct file *file2,
1997 struct lustre_swap_layouts *lsl)
1999 struct mdc_swap_layouts msl;
2000 struct md_op_data *op_data;
2003 struct ll_swap_stack *llss = NULL;
2006 OBD_ALLOC_PTR(llss);
2010 llss->inode1 = file1->f_path.dentry->d_inode;
2011 llss->inode2 = file2->f_path.dentry->d_inode;
2013 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2017 /* we use 2 bool because it is easier to swap than 2 bits */
2018 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2019 llss->check_dv1 = true;
2021 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2022 llss->check_dv2 = true;
2024 /* we cannot use lsl->sl_dvX directly because we may swap them */
2025 llss->dv1 = lsl->sl_dv1;
2026 llss->dv2 = lsl->sl_dv2;
2028 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2029 if (rc == 0) /* same file, done! */
/* Order by FID so concurrent swaps always lock in the same order. */
2032 if (rc < 0) { /* sequentialize it */
2033 swap(llss->inode1, llss->inode2);
2035 swap(llss->dv1, llss->dv2);
2036 swap(llss->check_dv1, llss->check_dv2);
2040 if (gid != 0) { /* application asks to flush dirty cache */
2041 rc = ll_get_grouplock(llss->inode1, file1, gid);
2045 rc = ll_get_grouplock(llss->inode2, file2, gid);
2047 ll_put_grouplock(llss->inode1, file1, gid);
2052 /* ultimate check, before swapping the layouts we check if
2053 * dataversion has changed (if requested) */
2054 if (llss->check_dv1) {
2055 rc = ll_data_version(llss->inode1, &dv, 0);
2058 if (dv != llss->dv1)
2059 GOTO(putgl, rc = -EAGAIN);
2062 if (llss->check_dv2) {
2063 rc = ll_data_version(llss->inode2, &dv, 0);
2066 if (dv != llss->dv2)
2067 GOTO(putgl, rc = -EAGAIN);
2070 /* struct md_op_data is used to send the swap args to the mdt
2071 * only flags is missing, so we use struct mdc_swap_layouts
2072 * through the md_op_data->op_data */
2073 /* flags from user space have to be converted before they are sent to
2074 * the server; no flag is sent today, they are only used on the client */
2077 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2078 0, LUSTRE_OPC_ANY, &msl);
2079 if (IS_ERR(op_data))
2080 GOTO(free, rc = PTR_ERR(op_data));
2082 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2083 sizeof(*op_data), op_data, NULL);
2084 ll_finish_md_op_data(op_data);
2091 ll_put_grouplock(llss->inode2, file2, gid);
2092 ll_put_grouplock(llss->inode1, file1, gid);
/*
 * Set/clear HSM state flags on the MDT.  Validates the masks against
 * HSM_FLAGS_MASK, restricts non-HSM_USER_MASK bits to CAP_SYS_ADMIN, and
 * bounds the archive id before issuing LL_IOC_HSM_STATE_SET to the MDC.
 */
2102 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2104 struct md_op_data *op_data;
2108 /* Detect out-of range masks */
2109 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2112 /* Non-root users are forbidden to set or clear flags which are
2113 * NOT defined in HSM_USER_MASK. */
2114 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2115 !cfs_capable(CFS_CAP_SYS_ADMIN))
2118 /* Detect out-of range archive id */
2119 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2120 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2123 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2124 LUSTRE_OPC_ANY, hss);
2125 if (IS_ERR(op_data))
2126 RETURN(PTR_ERR(op_data));
2128 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2129 sizeof(*op_data), op_data, NULL);
2131 ll_finish_md_op_data(op_data);
/*
 * HSM import: mark a regular file as archived+exists+released on the MDT,
 * then restore the user-supplied attributes (mode/uid/gid/size/times)
 * with ll_setattr_raw() under i_mutex so the client inode matches the
 * imported copy.
 */
2136 static int ll_hsm_import(struct inode *inode, struct file *file,
2137 struct hsm_user_import *hui)
2139 struct hsm_state_set *hss = NULL;
2140 struct iattr *attr = NULL;
2144 if (!S_ISREG(inode->i_mode))
2150 GOTO(out, rc = -ENOMEM);
2152 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2153 hss->hss_archive_id = hui->hui_archive_id;
2154 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2155 rc = ll_hsm_state_set(inode, hss);
2159 OBD_ALLOC_PTR(attr);
2161 GOTO(out, rc = -ENOMEM);
/* Force a regular-file mode; only permission bits come from userspace. */
2163 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2164 attr->ia_mode |= S_IFREG;
2165 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2166 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2167 attr->ia_size = hui->hui_size;
2168 attr->ia_mtime.tv_sec = hui->hui_mtime;
2169 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2170 attr->ia_atime.tv_sec = hui->hui_atime;
2171 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2173 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2174 ATTR_UID | ATTR_GID |
2175 ATTR_MTIME | ATTR_MTIME_SET |
2176 ATTR_ATIME | ATTR_ATIME_SET;
2178 mutex_lock(&inode->i_mutex);
2180 rc = ll_setattr_raw(file->f_path.dentry, attr, true);
2184 mutex_unlock(&inode->i_mutex);
2196 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2198 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2199 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * LL_IOC_FUTIMES_3 worker: set atime/mtime/ctime on a regular file from a
 * ll_futimes_3 payload.  Requires CAP_SYS_ADMIN because, unlike utimes(2),
 * it also sets ctime explicitly.
 */
2202 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2204 struct inode *inode = file->f_path.dentry->d_inode;
2206 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2207 ATTR_MTIME | ATTR_MTIME_SET |
2208 ATTR_CTIME | ATTR_CTIME_SET,
2210 .tv_sec = lfu->lfu_atime_sec,
2211 .tv_nsec = lfu->lfu_atime_nsec,
2214 .tv_sec = lfu->lfu_mtime_sec,
2215 .tv_nsec = lfu->lfu_mtime_nsec,
2218 .tv_sec = lfu->lfu_ctime_sec,
2219 .tv_nsec = lfu->lfu_ctime_nsec,
2225 if (!capable(CAP_SYS_ADMIN))
2228 if (!S_ISREG(inode->i_mode))
2231 mutex_lock(&inode->i_mutex);
2232 rc = ll_setattr_raw(file->f_path.dentry, &ia, false);
2233 mutex_unlock(&inode->i_mutex);
2239 * Give file access advices
2241 * The ladvise interface is similar to Linux fadvise() system call, except it
2242 * forwards the advices directly from Lustre client to server. The server side
2243 * codes will apply appropriate read-ahead and caching techniques for the
2244 * corresponding files.
2246 * A typical workload for ladvise is e.g. a bunch of different clients are
2247 * doing small random reads of a file, so prefetching pages into OSS cache
2248 * with big linear reads before the random IO is a net benefit. Fetching
2249 * all that data into each client cache with fadvise() may not be, due to
2250 * much more data being sent to the client.
/*
 * Forward a single lu_ladvise hint to the server through a CIT_LADVISE
 * cl_io (see the fadvise-style description above).
 */
2252 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2253 struct lu_ladvise *ladvise)
2257 struct cl_ladvise_io *lio;
2262 env = cl_env_get(&refcheck);
2264 RETURN(PTR_ERR(env));
2266 io = vvp_env_thread_io(env);
2267 io->ci_obj = ll_i2info(inode)->lli_clob;
2269 /* initialize parameters for ladvise */
2270 lio = &io->u.ci_ladvise;
2271 lio->li_start = ladvise->lla_start;
2272 lio->li_end = ladvise->lla_end;
2273 lio->li_fid = ll_inode2fid(inode);
2274 lio->li_advice = ladvise->lla_advice;
2275 lio->li_flags = flags;
2277 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2278 rc = cl_io_loop(env, io);
2282 cl_io_fini(env, io);
2283 cl_env_put(env, &refcheck);
/*
 * Main file ioctl dispatcher for llite.  Handles llite-private commands
 * (flags, striping, group locks, HSM, leases, ladvise, fid2path, ...)
 * directly; tty ioctls are rejected up front, and anything unrecognized
 * falls through to the dynamic ioctl handlers and finally to the data
 * export via obd_iocontrol().
 */
2288 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2290 struct inode *inode = file->f_path.dentry->d_inode;
2291 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2295 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2296 PFID(ll_inode2fid(inode)), inode, cmd);
2297 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2299 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2300 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2304 case LL_IOC_GETFLAGS:
2305 /* Get the current value of the file flags */
2306 return put_user(fd->fd_flags, (int __user *)arg);
2307 case LL_IOC_SETFLAGS:
2308 case LL_IOC_CLRFLAGS:
2309 /* Set or clear specific file flags */
2310 /* XXX This probably needs checks to ensure the flags are
2311 * not abused, and to handle any flag side effects.
2313 if (get_user(flags, (int __user *) arg))
2316 if (cmd == LL_IOC_SETFLAGS) {
/* Disabling locking is only allowed for O_DIRECT files. */
2317 if ((flags & LL_FILE_IGNORE_LOCK) &&
2318 !(file->f_flags & O_DIRECT)) {
2319 CERROR("%s: unable to disable locking on "
2320 "non-O_DIRECT file\n", current->comm);
2324 fd->fd_flags |= flags;
2326 fd->fd_flags &= ~flags;
2329 case LL_IOC_LOV_SETSTRIPE:
2330 RETURN(ll_lov_setstripe(inode, file, arg));
2331 case LL_IOC_LOV_SETEA:
2332 RETURN(ll_lov_setea(inode, file, arg));
2333 case LL_IOC_LOV_SWAP_LAYOUTS: {
2335 struct lustre_swap_layouts lsl;
2337 if (copy_from_user(&lsl, (char __user *)arg,
2338 sizeof(struct lustre_swap_layouts)))
/* Both fds must be writable to swap layouts. */
2341 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
2344 file2 = fget(lsl.sl_fd);
2348 /* O_WRONLY or O_RDWR */
2349 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
2350 GOTO(out, rc = -EPERM);
2352 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
2353 struct inode *inode2;
2354 struct ll_inode_info *lli;
2355 struct obd_client_handle *och = NULL;
2357 if (lsl.sl_flags != SWAP_LAYOUTS_CLOSE)
2358 GOTO(out, rc = -EINVAL);
/* Swap-and-close requires a lease held on this fd. */
2360 lli = ll_i2info(inode);
2361 mutex_lock(&lli->lli_och_mutex);
2362 if (fd->fd_lease_och != NULL) {
2363 och = fd->fd_lease_och;
2364 fd->fd_lease_och = NULL;
2366 mutex_unlock(&lli->lli_och_mutex);
2368 GOTO(out, rc = -ENOLCK);
2369 inode2 = file2->f_path.dentry->d_inode;
2370 rc = ll_swap_layouts_close(och, inode, inode2);
2372 rc = ll_swap_layouts(file, file2, &lsl);
2378 case LL_IOC_LOV_GETSTRIPE:
2379 RETURN(ll_file_getstripe(inode,
2380 (struct lov_user_md __user *)arg));
2381 case FSFILT_IOC_GETFLAGS:
2382 case FSFILT_IOC_SETFLAGS:
2383 RETURN(ll_iocontrol(inode, file, cmd, arg));
2384 case FSFILT_IOC_GETVERSION_OLD:
2385 case FSFILT_IOC_GETVERSION:
2386 RETURN(put_user(inode->i_generation, (int __user *)arg));
2387 case LL_IOC_GROUP_LOCK:
2388 RETURN(ll_get_grouplock(inode, file, arg));
2389 case LL_IOC_GROUP_UNLOCK:
2390 RETURN(ll_put_grouplock(inode, file, arg));
2391 case IOC_OBD_STATFS:
2392 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2394 /* We need to special case any other ioctls we want to handle,
2395 * to send them to the MDS/OST as appropriate and to properly
2396 * network encode the arg field.
2397 case FSFILT_IOC_SETVERSION_OLD:
2398 case FSFILT_IOC_SETVERSION:
2400 case LL_IOC_FLUSHCTX:
2401 RETURN(ll_flush_ctx(inode));
2402 case LL_IOC_PATH2FID: {
2403 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2404 sizeof(struct lu_fid)))
2409 case LL_IOC_GETPARENT:
2410 RETURN(ll_getparent(file, (struct getparent __user *)arg));
2412 case OBD_IOC_FID2PATH:
2413 RETURN(ll_fid2path(inode, (void __user *)arg));
2414 case LL_IOC_DATA_VERSION: {
2415 struct ioc_data_version idv;
2418 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
/* Mask out any flag bits other than the two flush modes. */
2421 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2422 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2425 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2431 case LL_IOC_GET_MDTIDX: {
2434 mdtidx = ll_get_mdt_idx(inode);
2438 if (put_user((int)mdtidx, (int __user *)arg))
2443 case OBD_IOC_GETDTNAME:
2444 case OBD_IOC_GETMDNAME:
2445 RETURN(ll_get_obd_name(inode, cmd, arg));
2446 case LL_IOC_HSM_STATE_GET: {
2447 struct md_op_data *op_data;
2448 struct hsm_user_state *hus;
2455 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2456 LUSTRE_OPC_ANY, hus);
2457 if (IS_ERR(op_data)) {
2459 RETURN(PTR_ERR(op_data));
2462 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2465 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2468 ll_finish_md_op_data(op_data);
2472 case LL_IOC_HSM_STATE_SET: {
2473 struct hsm_state_set *hss;
2480 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2485 rc = ll_hsm_state_set(inode, hss);
2490 case LL_IOC_HSM_ACTION: {
2491 struct md_op_data *op_data;
2492 struct hsm_current_action *hca;
2499 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2500 LUSTRE_OPC_ANY, hca);
2501 if (IS_ERR(op_data)) {
2503 RETURN(PTR_ERR(op_data));
2506 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2509 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2512 ll_finish_md_op_data(op_data);
2516 case LL_IOC_SET_LEASE: {
2517 struct ll_inode_info *lli = ll_i2info(inode);
2518 struct obd_client_handle *och = NULL;
/* Requested lease mode must be compatible with the fd's open mode. */
2523 case LL_LEASE_WRLCK:
2524 if (!(file->f_mode & FMODE_WRITE))
2526 fmode = FMODE_WRITE;
2528 case LL_LEASE_RDLCK:
2529 if (!(file->f_mode & FMODE_READ))
2533 case LL_LEASE_UNLCK:
2534 mutex_lock(&lli->lli_och_mutex);
2535 if (fd->fd_lease_och != NULL) {
2536 och = fd->fd_lease_och;
2537 fd->fd_lease_och = NULL;
2539 mutex_unlock(&lli->lli_och_mutex);
2544 fmode = och->och_flags;
2545 rc = ll_lease_close(och, inode, &lease_broken);
2552 RETURN(ll_lease_type_from_fmode(fmode));
2557 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2559 /* apply for lease */
2560 och = ll_lease_open(inode, file, fmode, 0);
2562 RETURN(PTR_ERR(och));
/* Publish the new lease handle unless another thread beat us to it. */
2565 mutex_lock(&lli->lli_och_mutex);
2566 if (fd->fd_lease_och == NULL) {
2567 fd->fd_lease_och = och;
2570 mutex_unlock(&lli->lli_och_mutex);
2572 /* impossible now that only excl is supported for now */
2573 ll_lease_close(och, inode, &lease_broken);
2578 case LL_IOC_GET_LEASE: {
2579 struct ll_inode_info *lli = ll_i2info(inode);
2580 struct ldlm_lock *lock = NULL;
2583 mutex_lock(&lli->lli_och_mutex);
2584 if (fd->fd_lease_och != NULL) {
2585 struct obd_client_handle *och = fd->fd_lease_och;
/* Only report the lease if its DLM lock is still uncancelled. */
2587 lock = ldlm_handle2lock(&och->och_lease_handle);
2589 lock_res_and_lock(lock);
2590 if (!ldlm_is_cancel(lock))
2591 fmode = och->och_flags;
2593 unlock_res_and_lock(lock);
2594 LDLM_LOCK_PUT(lock);
2597 mutex_unlock(&lli->lli_och_mutex);
2599 RETURN(ll_lease_type_from_fmode(fmode));
2601 case LL_IOC_HSM_IMPORT: {
2602 struct hsm_user_import *hui;
2608 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
2613 rc = ll_hsm_import(inode, file, hui);
2618 case LL_IOC_FUTIMES_3: {
2619 struct ll_futimes_3 lfu;
2621 if (copy_from_user(&lfu,
2622 (const struct ll_futimes_3 __user *)arg,
2626 RETURN(ll_file_futimes_3(file, &lfu));
2628 case LL_IOC_LADVISE: {
2629 struct ladvise_hdr *ladvise_hdr;
2632 int alloc_size = sizeof(*ladvise_hdr);
/* Read the header first to learn how many advices follow, then
 * re-read the whole variable-sized payload with the right size. */
2635 OBD_ALLOC_PTR(ladvise_hdr);
2636 if (ladvise_hdr == NULL)
2639 if (copy_from_user(ladvise_hdr,
2640 (const struct ladvise_hdr __user *)arg,
2642 GOTO(out_ladvise, rc = -EFAULT);
2644 if (ladvise_hdr->lah_magic != LADVISE_MAGIC ||
2645 ladvise_hdr->lah_count < 1)
2646 GOTO(out_ladvise, rc = -EINVAL);
2648 num_advise = ladvise_hdr->lah_count;
2649 if (num_advise >= LAH_COUNT_MAX)
2650 GOTO(out_ladvise, rc = -EFBIG);
2652 OBD_FREE_PTR(ladvise_hdr);
2653 alloc_size = offsetof(typeof(*ladvise_hdr),
2654 lah_advise[num_advise]);
2655 OBD_ALLOC(ladvise_hdr, alloc_size);
2656 if (ladvise_hdr == NULL)
2660 * TODO: submit multiple advices to one server in a single RPC
2662 if (copy_from_user(ladvise_hdr,
2663 (const struct ladvise_hdr __user *)arg,
2665 GOTO(out_ladvise, rc = -EFAULT);
2667 for (i = 0; i < num_advise; i++) {
2668 rc = ll_ladvise(inode, file, ladvise_hdr->lah_flags,
2669 &ladvise_hdr->lah_advise[i]);
2675 OBD_FREE(ladvise_hdr, alloc_size);
/* Unknown command: try the registered dynamic ioctl handlers, then
 * pass through to the data export. */
2682 ll_iocontrol_call(inode, file, cmd, arg, &err))
2685 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2686 (void __user *)arg));
2691 #ifndef HAVE_FILE_LLSEEK_SIZE
2692 static inline loff_t
2693 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2695 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2697 if (offset > maxsize)
2700 if (offset != file->f_pos) {
2701 file->f_pos = offset;
2702 file->f_version = 0;
/*
 * Local copy of the kernel's generic_file_llseek_size() for kernels that
 * lack it: llseek supporting SEEK_SET/CUR/END plus SEEK_DATA/SEEK_HOLE
 * against a caller-supplied @maxsize and @eof.
 */
2708 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2709 loff_t maxsize, loff_t eof)
2711 struct inode *inode = file->f_path.dentry->d_inode;
2719 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2720 * position-querying operation. Avoid rewriting the "same"
2721 * f_pos value back to the file because a concurrent read(),
2722 * write() or lseek() might have altered it
2727 * f_lock protects against read/modify/write races with other
2728 * SEEK_CURs. Note that parallel writes and reads behave
2731 mutex_lock(&inode->i_mutex);
2732 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2733 mutex_unlock(&inode->i_mutex);
2737 * In the generic case the entire file is data, so as long as
2738 * offset isn't at the end of the file then the offset is data.
2745 * There is a virtual hole at the end of the file, so as long as
2746 * offset isn't i_size or larger, return i_size.
2754 return llseek_execute(file, offset, maxsize);
/*
 * VFS ->llseek for Lustre files.  For SEEK_END/SEEK_HOLE/SEEK_DATA the
 * authoritative size must first be fetched from the OSTs via
 * ll_glimpse_size() before delegating to ll_generic_file_llseek_size().
 */
2758 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2760 struct inode *inode = file->f_path.dentry->d_inode;
2761 loff_t retval, eof = 0;
/* retval computed here is only for the debug trace below; the real
 * result comes from ll_generic_file_llseek_size(). */
2764 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2765 (origin == SEEK_CUR) ? file->f_pos : 0);
2766 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2767 PFID(ll_inode2fid(inode)), inode, retval, retval,
2769 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2771 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
/* glimpse: refresh i_size from the cluster before using it as eof */
2772 retval = ll_glimpse_size(inode);
2775 eof = i_size_read(inode);
2778 retval = ll_generic_file_llseek_size(file, offset, origin,
2779 ll_file_maxbytes(inode), eof);
/*
 * VFS ->flush, called on close(): surface asynchronous writeback errors
 * recorded against this inode back to the application as -EIO, unless
 * the failure was already reported on a write (fd->fd_write_failed).
 * Does not itself push out dirty pages.
 */
2783 static int ll_flush(struct file *file, fl_owner_t id)
2785 struct inode *inode = file->f_path.dentry->d_inode;
2786 struct ll_inode_info *lli = ll_i2info(inode);
2787 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2790 LASSERT(!S_ISDIR(inode->i_mode));
2792 /* catch async errors that were recorded back when async writeback
2793 * failed for pages in this mapping. */
2794 rc = lli->lli_async_rc;
2795 lli->lli_async_rc = 0;
2796 if (lli->lli_clob != NULL) {
2797 err = lov_read_and_clear_async_rc(lli->lli_clob);
2802 /* The application has been told write failure already.
2803 * Do not report failure again. */
2804 if (fd->fd_write_failed)
2806 return rc ? -EIO : 0;
2810 * Called to make sure a portion of file has been written out.
2811 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2813 * Return how many pages have been written.
/*
 * Implementation: build a CIT_FSYNC cl_io over [start, end] and run the
 * cl_io loop; on success the result is fio->fi_nr_written.
 */
2815 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2816 enum cl_fsync_mode mode, int ignore_layout)
2820 struct cl_fsync_io *fio;
/* reject unknown sync modes up front */
2825 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2826 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2829 env = cl_env_get(&refcheck);
2831 RETURN(PTR_ERR(env));
2833 io = vvp_env_thread_io(env);
2834 io->ci_obj = ll_i2info(inode)->lli_clob;
2835 io->ci_ignore_layout = ignore_layout;
2837 /* initialize parameters for sync */
2838 fio = &io->u.ci_fsync;
2839 fio->fi_start = start;
2841 fio->fi_fid = ll_inode2fid(inode);
2842 fio->fi_mode = mode;
2843 fio->fi_nr_written = 0;
2845 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2846 result = cl_io_loop(env, io);
2848 result = io->ci_result;
2850 result = fio->fi_nr_written;
2851 cl_io_fini(env, io);
2852 cl_env_put(env, &refcheck);
/*
 * VFS ->fsync: flush dirty pages, propagate recorded async writeback
 * errors, sync metadata via md_fsync() and data via cl_sync_file_range()
 * with CL_FSYNC_ALL.  Three kernel-API variants are selected by the
 * HAVE_FILE_FSYNC_* configure checks.
 */
2858 * When dentry is provided (the 'else' case), *file->f_path.dentry may be
2859 * null and dentry must be used directly rather than pulled from
2860 * *file->f_path.dentry as is done otherwise.
2863 #ifdef HAVE_FILE_FSYNC_4ARGS
2864 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2866 struct dentry *dentry = file->f_path.dentry;
2867 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2868 int ll_fsync(struct file *file, int datasync)
2870 struct dentry *dentry = file->f_path.dentry;
2872 loff_t end = LLONG_MAX;
2874 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2877 loff_t end = LLONG_MAX;
2879 struct inode *inode = dentry->d_inode;
2880 struct ll_inode_info *lli = ll_i2info(inode);
2881 struct ptlrpc_request *req;
2885 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2886 PFID(ll_inode2fid(inode)), inode);
2887 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2889 #ifdef HAVE_FILE_FSYNC_4ARGS
2890 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2891 mutex_lock(&inode->i_mutex);
2893 /* fsync's caller has already called _fdata{sync,write}, we want
2894 * that IO to finish before calling the osc and mdc sync methods */
2895 rc = filemap_fdatawait(inode->i_mapping);
2898 /* catch async errors that were recorded back when async writeback
2899 * failed for pages in this mapping. */
2900 if (!S_ISDIR(inode->i_mode)) {
2901 err = lli->lli_async_rc;
2902 lli->lli_async_rc = 0;
2905 err = lov_read_and_clear_async_rc(lli->lli_clob);
2910 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
2914 ptlrpc_req_finished(req);
2916 if (S_ISREG(inode->i_mode)) {
2917 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
/* data sync over the whole requested range; remember failure so
 * ll_flush()/close() does not report it twice */
2919 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2920 if (rc == 0 && err < 0)
2923 fd->fd_write_failed = true;
2925 fd->fd_write_failed = false;
2928 #ifdef HAVE_FILE_FSYNC_4ARGS
2929 mutex_unlock(&inode->i_mutex);
/*
 * VFS ->lock/->flock handler: translate a kernel file_lock (POSIX fcntl
 * record lock or BSD flock) into an LDLM_FLOCK enqueue on the MDS, then
 * mirror the result into the local kernel lock tables so the VFS stays
 * consistent.  On local-table failure the cluster lock is backed out by
 * re-enqueueing with LCK_NL (unlock).
 */
2935 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2937 struct inode *inode = file->f_path.dentry->d_inode;
2938 struct ll_sb_info *sbi = ll_i2sbi(inode);
2939 struct ldlm_enqueue_info einfo = {
2940 .ei_type = LDLM_FLOCK,
2941 .ei_cb_cp = ldlm_flock_completion_ast,
2942 .ei_cbdata = file_lock,
2944 struct md_op_data *op_data;
2945 struct lustre_handle lockh = { 0 };
2946 union ldlm_policy_data flock = { { 0 } };
2947 int fl_type = file_lock->fl_type;
2953 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
2954 PFID(ll_inode2fid(inode)), file_lock);
2956 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2958 if (file_lock->fl_flags & FL_FLOCK) {
2959 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2960 /* flocks are whole-file locks */
2961 flock.l_flock.end = OFFSET_MAX;
2962 /* For flocks owner is determined by the local file descriptor */
2963 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2964 } else if (file_lock->fl_flags & FL_POSIX) {
2965 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2966 flock.l_flock.start = file_lock->fl_start;
2967 flock.l_flock.end = file_lock->fl_end;
2971 flock.l_flock.pid = file_lock->fl_pid;
2973 /* Somewhat ugly workaround for svc lockd.
2974 * lockd installs custom fl_lmops->lm_compare_owner that checks
2975 * for the fl_owner to be the same (which it always is on local node
2976 * I guess between lockd processes) and then compares pid.
2977 * As such we assign pid to the owner field to make it all work,
2978 * conflict with normal locks is unlikely since pid space and
2979 * pointer space for current->files are not intersecting */
2980 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2981 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* map the fcntl lock type to an LDLM mode: RDLCK->PR, WRLCK->PW,
 * UNLCK->NL (see comment below). Switch labels are elided in this
 * excerpt. */
2985 einfo.ei_mode = LCK_PR;
2988 /* An unlock request may or may not have any relation to
2989 * existing locks so we may not be able to pass a lock handle
2990 * via a normal ldlm_lock_cancel() request. The request may even
2991 * unlock a byte range in the middle of an existing lock. In
2992 * order to process an unlock request we need all of the same
2993 * information that is given with a normal read or write record
2994 * lock request. To avoid creating another ldlm unlock (cancel)
2995 * message we'll treat a LCK_NL flock request as an unlock. */
2996 einfo.ei_mode = LCK_NL;
2999 einfo.ei_mode = LCK_PW;
3002 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* map the fcntl command to enqueue flags: non-blocking set requests
 * use LDLM_FL_BLOCK_NOWAIT, GETLK uses LDLM_FL_TEST_LOCK */
3017 flags = LDLM_FL_BLOCK_NOWAIT;
3023 flags = LDLM_FL_TEST_LOCK;
3026 CERROR("unknown fcntl lock command: %d\n", cmd);
3030 /* Save the old mode so that if the mode in the lock changes we
3031 * can decrement the appropriate reader or writer refcount. */
3032 file_lock->fl_type = einfo.ei_mode;
3034 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3035 LUSTRE_OPC_ANY, NULL);
3036 if (IS_ERR(op_data))
3037 RETURN(PTR_ERR(op_data));
3039 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
3040 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
3041 flock.l_flock.pid, flags, einfo.ei_mode,
3042 flock.l_flock.start, flock.l_flock.end);
3044 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data, &lockh,
3047 /* Restore the file lock type if not TEST lock. */
3048 if (!(flags & LDLM_FL_TEST_LOCK))
3049 file_lock->fl_type = fl_type;
3051 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
3052 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
3053 !(flags & LDLM_FL_TEST_LOCK))
3054 rc2 = locks_lock_file_wait(file, file_lock);
3056 if ((file_lock->fl_flags & FL_FLOCK) &&
3057 (rc == 0 || file_lock->fl_type == F_UNLCK))
3058 rc2 = flock_lock_file_wait(file, file_lock);
3059 if ((file_lock->fl_flags & FL_POSIX) &&
3060 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3061 !(flags & LDLM_FL_TEST_LOCK))
3062 rc2 = posix_lock_file_wait(file, file_lock);
3063 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
/* kernel-side bookkeeping failed: undo the cluster lock by enqueueing
 * an LCK_NL (unlock) request */
3065 if (rc2 && file_lock->fl_type != F_UNLCK) {
3066 einfo.ei_mode = LCK_NL;
3067 md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data,
3072 ll_finish_md_op_data(op_data);
/*
 * Look up the FID of @name under @parent with a getattr-by-name RPC.
 * On success *fid is filled from the reply body; if @inode is non-NULL
 * the inode is also instantiated via ll_prep_inode().  Caller owns the
 * returned inode reference.
 */
3077 int ll_get_fid_by_name(struct inode *parent, const char *name,
3078 int namelen, struct lu_fid *fid,
3079 struct inode **inode)
3081 struct md_op_data *op_data = NULL;
3082 struct mdt_body *body;
3083 struct ptlrpc_request *req;
3087 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3088 LUSTRE_OPC_ANY, NULL);
3089 if (IS_ERR(op_data))
3090 RETURN(PTR_ERR(op_data));
/* only FID and type are needed from the MDS */
3092 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
3093 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3094 ll_finish_md_op_data(op_data);
3098 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3100 GOTO(out_req, rc = -EFAULT);
3102 *fid = body->mbo_fid1;
3105 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
3107 ptlrpc_req_finished(req);
/*
 * Migrate directory entry @name under @parent to MDT @mdtidx (the
 * "lfs migrate -m" path).  Resolves the child inode (dcache first, then
 * by-name RPC), takes a write lease on regular files to get a stable
 * data version, and issues an MDS rename with CLI_MIGRATE/
 * MDS_RENAME_MIGRATE set.  The child's i_mutex is held across the RPC.
 * NOTE(review): several error/cleanup lines are elided in this excerpt.
 */
3111 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3112 const char *name, int namelen)
3114 struct dentry *dchild = NULL;
3115 struct inode *child_inode = NULL;
3116 struct md_op_data *op_data;
3117 struct ptlrpc_request *request = NULL;
3118 struct obd_client_handle *och = NULL;
3120 struct mdt_body *body;
3122 __u64 data_version = 0;
3125 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3126 name, PFID(ll_inode2fid(parent)), mdtidx);
3128 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3129 0, LUSTRE_OPC_ANY, NULL);
3130 if (IS_ERR(op_data))
3131 RETURN(PTR_ERR(op_data));
3133 /* Get child FID first */
3134 qstr.hash = full_name_hash(name, namelen);
/* try the dcache before falling back to a by-name getattr RPC */
3137 dchild = d_lookup(file->f_path.dentry, &qstr);
3138 if (dchild != NULL) {
3139 if (dchild->d_inode != NULL)
3140 child_inode = igrab(dchild->d_inode);
3144 if (child_inode == NULL) {
3145 rc = ll_get_fid_by_name(parent, name, namelen,
3146 &op_data->op_fid3, &child_inode);
3151 if (child_inode == NULL)
3152 GOTO(out_free, rc = -EINVAL);
3155 * lfs migrate command needs to be blocked on the client
3156 * by checking the migrate FID against the FID of the
3159 if (child_inode == parent->i_sb->s_root->d_inode)
3160 GOTO(out_iput, rc = -EINVAL);
3162 mutex_lock(&child_inode->i_mutex);
3163 op_data->op_fid3 = *ll_inode2fid(child_inode);
3164 if (!fid_is_sane(&op_data->op_fid3)) {
3165 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
3166 ll_get_fsname(parent->i_sb, NULL, 0), name,
3167 PFID(&op_data->op_fid3));
3168 GOTO(out_unlock, rc = -EINVAL);
/* no-op if the file already lives on the target MDT */
3171 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3173 GOTO(out_unlock, rc);
3176 CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name,
3177 PFID(&op_data->op_fid3), mdtidx);
3178 GOTO(out_unlock, rc = 0);
/* regular files: take a write lease and record the data version so the
 * server can detect concurrent modification during migration */
3181 if (S_ISREG(child_inode->i_mode)) {
3182 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
3186 GOTO(out_unlock, rc);
3189 rc = ll_data_version(child_inode, &data_version,
3192 GOTO(out_close, rc);
3194 op_data->op_handle = och->och_fh;
3195 op_data->op_data = och->och_mod;
3196 op_data->op_data_version = data_version;
3197 op_data->op_lease_handle = och->och_lease_handle;
3198 op_data->op_bias |= MDS_RENAME_MIGRATE;
3201 op_data->op_mds = mdtidx;
3202 op_data->op_cli_flags = CLI_MIGRATE;
3203 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3204 namelen, name, namelen, &request);
3206 ll_update_times(request, parent);
3208 if (request != NULL) {
3209 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
3211 ptlrpc_req_finished(request);
3212 GOTO(out_close, rc = -EPROTO);
3215 /* If the server does release layout lock, then we cleanup
3216 * the client och here, otherwise release it in out_close: */
3218 body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
3219 obd_mod_put(och->och_mod);
3220 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
3222 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
3226 ptlrpc_req_finished(request);
3229 /* Try again if the file layout has changed. */
3230 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode)) {
3235 if (och != NULL) /* close the file */
3236 ll_lease_close(och, child_inode, NULL);
3238 clear_nlink(child_inode);
3240 mutex_unlock(&child_inode->i_mutex);
3244 ll_finish_md_op_data(op_data);
/*
 * Stub lock handler installed for "-o noflock" mounts (see the
 * ll_file_operations_noflock table in this file, whose comment says it
 * returns ENOSYS on flock calls).  NOTE(review): the function body is
 * elided in this excerpt — confirm the exact return value there.
 */
3249 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3257 * test if some locks matching bits and l_req_mode are acquired
3258 * - bits can be in different locks
3259 * - if found clear the common lock bits in *bits
3260 * - the bits not found, are kept in *bits
3262 * \param bits [IN] searched lock bits [IN]
3263 * \param l_req_mode [IN] searched lock mode
3264 * \retval boolean, true iff all bits are found
3266 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
3268 struct lustre_handle lockh;
3269 union ldlm_policy_data policy;
/* LCK_MINMODE means "any mode": match all of CR/CW/PR/PW */
3270 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
3271 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
3280 fid = &ll_i2info(inode)->lli_fid;
3281 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3282 ldlm_lockname[mode]);
/* TEST_LOCK: matching must not take a new reference on the lock */
3284 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
3285 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3286 policy.l_inodebits.bits = *bits & (1 << i);
3287 if (policy.l_inodebits.bits == 0)
3290 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3291 &policy, mode, &lockh)) {
3292 struct ldlm_lock *lock;
3294 lock = ldlm_handle2lock(&lockh);
3297 ~(lock->l_policy_data.l_inodebits.bits);
3298 LDLM_LOCK_PUT(lock);
3300 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match (and, unlike ll_have_md_lock, reference via @lockh) an
 * MDS inodebits lock covering @bits in mode @mode.  Returns the matched
 * mode from md_lock_match(), 0 if none.
 */
3307 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
3308 struct lustre_handle *lockh, __u64 flags,
3309 enum ldlm_mode mode)
3311 union ldlm_policy_data policy = { .l_inodebits = { bits } };
3316 fid = &ll_i2info(inode)->lli_fid;
3317 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3319 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3320 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the rc of a revalidate RPC: -ENOENT on an already-
 * unlinked inode is tolerated (with special-casing for striped
 * directories and non-regular files); other errors are logged.
 * NOTE(review): the success-path return statements are elided in this
 * excerpt.
 */
3325 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3327 /* Already unlinked. Just update nlink and return success */
3328 if (rc == -ENOENT) {
3330 /* If it is striped directory, and there is bad stripe
3331 * Let's revalidate the dentry again, instead of returning
3333 if (S_ISDIR(inode->i_mode) &&
3334 ll_i2info(inode)->lli_lsm_md != NULL)
3337 /* This path cannot be hit for regular files unless in
3338 * case of obscure races, so no need to to validate
3340 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3342 } else if (rc != 0) {
/* EACCES/EIDRM are expected under permission/identity churn, so log
 * them at D_INFO instead of D_ERROR */
3343 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3344 "%s: revalidate FID "DFID" error: rc = %d\n",
3345 ll_get_fsname(inode->i_sb, NULL, 0),
3346 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate @dentry's inode attributes against the MDS for the lock
 * bits in @ibits.  Two strategies: an intent getattr/lookup when the
 * server supports OBD_CONNECT_ATTRFID, otherwise a plain md_getattr()
 * issued only if no matching local MD lock already covers @ibits.
 */
3352 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3354 struct inode *inode = dentry->d_inode;
3355 struct ptlrpc_request *req = NULL;
3356 struct obd_export *exp;
3360 LASSERT(inode != NULL);
3362 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3363 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3365 exp = ll_i2mdexp(inode);
3367 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3368 * But under CMD case, it caused some lock issues, should be fixed
3369 * with new CMD ibits lock. See bug 12718 */
3370 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3371 struct lookup_intent oit = { .it_op = IT_GETATTR };
3372 struct md_op_data *op_data;
3374 if (ibits == MDS_INODELOCK_LOOKUP)
3375 oit.it_op = IT_LOOKUP;
3377 /* Call getattr by fid, so do not provide name at all. */
3378 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3379 dentry->d_inode, NULL, 0, 0,
3380 LUSTRE_OPC_ANY, NULL);
3381 if (IS_ERR(op_data))
3382 RETURN(PTR_ERR(op_data));
3384 rc = md_intent_lock(exp, op_data, &oit, &req,
3385 &ll_md_blocking_ast, 0);
3386 ll_finish_md_op_data(op_data);
3388 rc = ll_inode_revalidate_fini(inode, rc);
3392 rc = ll_revalidate_it_finish(req, &oit, dentry);
3394 ll_intent_release(&oit);
3398 /* Unlinked? Unhash dentry, so it is not picked up later by
3399 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3400 here to preserve get_cwd functionality on 2.6.
3402 if (!dentry->d_inode->i_nlink) {
3403 ll_lock_dcache(inode);
3404 d_lustre_invalidate(dentry, 0);
3405 ll_unlock_dcache(inode);
3408 ll_lookup_finish_locks(&oit, dentry);
3409 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3410 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3411 u64 valid = OBD_MD_FLGETATTR;
3412 struct md_op_data *op_data;
/* regular files also need striping EA, sized by the default mdsize */
3415 if (S_ISREG(inode->i_mode)) {
3416 rc = ll_get_default_mdsize(sbi, &ealen);
3419 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3422 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3423 0, ealen, LUSTRE_OPC_ANY,
3425 if (IS_ERR(op_data))
3426 RETURN(PTR_ERR(op_data));
3428 op_data->op_valid = valid;
3429 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3430 ll_finish_md_op_data(op_data);
3432 rc = ll_inode_revalidate_fini(inode, rc);
3436 rc = ll_prep_inode(&inode, req, NULL, NULL);
3439 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge the per-stripe attributes (nlink,
 * blocks, size, a/m/ctime) fetched via md_merge_attr() into the master
 * inode and its ll_inode_info time fields.
 */
3443 static int ll_merge_md_attr(struct inode *inode)
3445 struct cl_attr attr = { 0 };
3448 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3449 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3450 &attr, ll_md_blocking_ast);
3454 set_nlink(inode, attr.cat_nlink);
3455 inode->i_blocks = attr.cat_blocks;
3456 i_size_write(inode, attr.cat_size);
3458 ll_i2info(inode)->lli_atime = attr.cat_atime;
3459 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
3460 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/*
 * Revalidate metadata and, for regular files, size: after the MD
 * revalidate, copy cached times into the inode and glimpse the file
 * size from the OSTs unless a restore is in progress (in which case the
 * MDT already supplied the size and a glimpse would block).
 */
3466 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3468 struct inode *inode = dentry->d_inode;
3472 rc = __ll_inode_revalidate(dentry, ibits);
3476 /* if object isn't regular file, don't validate size */
3477 if (!S_ISREG(inode->i_mode)) {
3478 if (S_ISDIR(inode->i_mode) &&
3479 ll_i2info(inode)->lli_lsm_md != NULL) {
3480 rc = ll_merge_md_attr(inode);
3485 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
3486 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
3487 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
3489 /* In case of restore, the MDT has the right size and has
3490 * already send it back without granting the layout lock,
3491 * inode is up-to-date so glimpse is useless.
3492 * Also to glimpse we need the layout, in case of a running
3493 * restore the MDT holds the layout lock so the glimpse will
3494 * block up to the end of restore (getattr will block)
3496 if (!ll_file_test_flag(ll_i2info(inode), LLIF_FILE_RESTORING))
3497 rc = ll_glimpse_size(inode);
/*
 * VFS ->getattr: revalidate UPDATE|LOOKUP bits, then fill *stat from
 * the (now current) inode.  The inode number is remapped through
 * cl_fid_build_ino() for 32-bit-API clients.
 */
3502 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3504 struct inode *inode = de->d_inode;
3505 struct ll_sb_info *sbi = ll_i2sbi(inode);
3506 struct ll_inode_info *lli = ll_i2info(inode);
3509 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3510 MDS_INODELOCK_LOOKUP);
3511 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3516 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
3518 stat->dev = inode->i_sb->s_dev;
3519 if (ll_need_32bit_api(sbi))
3520 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3522 stat->ino = inode->i_ino;
3523 stat->mode = inode->i_mode;
3524 stat->uid = inode->i_uid;
3525 stat->gid = inode->i_gid;
3526 stat->rdev = inode->i_rdev;
3527 stat->atime = inode->i_atime;
3528 stat->mtime = inode->i_mtime;
3529 stat->ctime = inode->i_ctime;
3530 stat->blksize = 1 << inode->i_blkbits;
3532 stat->nlink = inode->i_nlink;
3533 stat->size = i_size_read(inode);
3534 stat->blocks = inode->i_blocks;
/*
 * VFS ->fiemap: marshal fieinfo into a struct fiemap sized for
 * fi_extents_max extents, run ll_do_fiemap(), and copy the mapped
 * extents back to the user buffer at fieinfo->fi_extents_start.
 */
3539 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3540 __u64 start, __u64 len)
3544 struct fiemap *fiemap;
3545 unsigned int extent_count = fieinfo->fi_extents_max;
3547 num_bytes = sizeof(*fiemap) + (extent_count *
3548 sizeof(struct fiemap_extent));
3549 OBD_ALLOC_LARGE(fiemap, num_bytes);
3554 fiemap->fm_flags = fieinfo->fi_flags;
3555 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3556 fiemap->fm_start = start;
3557 fiemap->fm_length = len;
/* only the first extent is copied in: it may carry a continuation
 * cookie from a previous FIEMAP call */
3558 if (extent_count > 0 &&
3559 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3560 sizeof(struct fiemap_extent)) != 0)
3561 GOTO(out, rc = -EFAULT);
3563 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3565 fieinfo->fi_flags = fiemap->fm_flags;
3566 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3567 if (extent_count > 0 &&
3568 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3569 fiemap->fm_mapped_extents *
3570 sizeof(struct fiemap_extent)) != 0)
3571 GOTO(out, rc = -EFAULT);
3573 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * Return a referenced copy of the cached POSIX ACL of @type for @inode.
 * lli_lock guards the cached lli_posix_acl pointer.
 */
3577 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3579 struct ll_inode_info *lli = ll_i2info(inode);
3580 struct posix_acl *acl = NULL;
3583 spin_lock(&lli->lli_lock);
3584 /* VFS' acl_permission_check->check_acl will release the refcount */
3585 acl = posix_acl_dup(lli->lli_posix_acl);
3586 spin_unlock(&lli->lli_lock);
/*
 * ACL callback for ll_generic_permission() on kernels without the
 * 2-argument generic_permission(): check @mask against the cached ACL
 * via posix_acl_permission().  Compiled out (or a stub — see the
 * !CONFIG_FS_POSIX_ACL branch) when POSIX ACL support is disabled.
 */
3591 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3593 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3594 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3596 ll_check_acl(struct inode *inode, int mask)
3599 # ifdef CONFIG_FS_POSIX_ACL
3600 struct posix_acl *acl;
3604 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
/* cannot take the spinlock in ll_get_acl() under RCU walk */
3605 if (flags & IPERM_FLAG_RCU)
3608 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3613 rc = posix_acl_permission(inode, acl, mask);
3614 posix_acl_release(acl);
3617 # else /* !CONFIG_FS_POSIX_ACL */
3619 # endif /* CONFIG_FS_POSIX_ACL */
3621 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * VFS ->permission: revalidate the root inode when needed, apply root
 * squash by temporarily overriding credentials (fsuid/fsgid plus
 * dropping CFS_CAP_FS_MASK capabilities), then delegate to either the
 * remote-permission RPC path or ll_generic_permission().
 */
3623 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3624 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3626 # ifdef HAVE_INODE_PERMISION_2ARGS
3627 int ll_inode_permission(struct inode *inode, int mask)
3629 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3634 struct ll_sb_info *sbi;
3635 struct root_squash_info *squash;
3636 struct cred *cred = NULL;
3637 const struct cred *old_cred = NULL;
3639 bool squash_id = false;
/* RCU-walk lookups cannot block; bail out and let the VFS retry in
 * ref-walk mode */
3642 #ifdef MAY_NOT_BLOCK
3643 if (mask & MAY_NOT_BLOCK)
3645 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3646 if (flags & IPERM_FLAG_RCU)
3650 /* as root inode are NOT getting validated in lookup operation,
3651 * need to do it before permission check. */
3653 if (inode == inode->i_sb->s_root->d_inode) {
3654 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3655 MDS_INODELOCK_LOOKUP);
3660 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3661 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3663 /* squash fsuid/fsgid if needed */
3664 sbi = ll_i2sbi(inode);
3665 squash = &sbi->ll_squash;
3666 if (unlikely(squash->rsi_uid != 0 &&
3667 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
3668 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3672 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3673 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3674 squash->rsi_uid, squash->rsi_gid);
3676 /* update current process's credentials
3677 * and FS capability */
3678 cred = prepare_creds();
3682 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
3683 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
3684 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3685 if ((1 << cap) & CFS_CAP_FS_MASK)
3686 cap_lower(cred->cap_effective, cap);
3688 old_cred = override_creds(cred);
3691 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3693 if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
3694 rc = lustre_check_remote_perm(inode, mask);
3696 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3698 /* restore current process's credentials and FS capability */
3700 revert_creds(old_cred);
3707 /* -o localflock - only provides locally consistent flock locks */
/* Default file_operations table: no .flock/.lock entries, so the kernel
 * falls back to local-only advisory locking. */
3708 struct file_operations ll_file_operations = {
3709 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3710 # ifdef HAVE_SYNC_READ_WRITE
3711 .read = new_sync_read,
3712 .write = new_sync_write,
3714 .read_iter = ll_file_read_iter,
3715 .write_iter = ll_file_write_iter,
3716 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3717 .read = ll_file_read,
3718 .aio_read = ll_file_aio_read,
3719 .write = ll_file_write,
3720 .aio_write = ll_file_aio_write,
3721 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3722 .unlocked_ioctl = ll_file_ioctl,
3723 .open = ll_file_open,
3724 .release = ll_file_release,
3725 .mmap = ll_file_mmap,
3726 .llseek = ll_file_seek,
3727 .splice_read = ll_file_splice_read,
/* file_operations used with "-o flock": cluster-coherent locking via
 * ll_file_flock for both BSD flock and POSIX fcntl locks. */
3732 struct file_operations ll_file_operations_flock = {
3733 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3734 # ifdef HAVE_SYNC_READ_WRITE
3735 .read = new_sync_read,
3736 .write = new_sync_write,
3737 # endif /* HAVE_SYNC_READ_WRITE */
3738 .read_iter = ll_file_read_iter,
3739 .write_iter = ll_file_write_iter,
3740 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3741 .read = ll_file_read,
3742 .aio_read = ll_file_aio_read,
3743 .write = ll_file_write,
3744 .aio_write = ll_file_aio_write,
3745 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3746 .unlocked_ioctl = ll_file_ioctl,
3747 .open = ll_file_open,
3748 .release = ll_file_release,
3749 .mmap = ll_file_mmap,
3750 .llseek = ll_file_seek,
3751 .splice_read = ll_file_splice_read,
3754 .flock = ll_file_flock,
3755 .lock = ll_file_flock
3758 /* These are for -o noflock - to return ENOSYS on flock calls */
3759 struct file_operations ll_file_operations_noflock = {
3760 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3761 # ifdef HAVE_SYNC_READ_WRITE
3762 .read = new_sync_read,
3763 .write = new_sync_write,
3764 # endif /* HAVE_SYNC_READ_WRITE */
3765 .read_iter = ll_file_read_iter,
3766 .write_iter = ll_file_write_iter,
3767 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3768 .read = ll_file_read,
3769 .aio_read = ll_file_aio_read,
3770 .write = ll_file_write,
3771 .aio_write = ll_file_aio_write,
3772 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3773 .unlocked_ioctl = ll_file_ioctl,
3774 .open = ll_file_open,
3775 .release = ll_file_release,
3776 .mmap = ll_file_mmap,
3777 .llseek = ll_file_seek,
3778 .splice_read = ll_file_splice_read,
/* both lock entry points route to the rejecting stub */
3781 .flock = ll_file_noflock,
3782 .lock = ll_file_noflock
/* inode_operations shared by all three file_operations variants above. */
3785 struct inode_operations ll_file_inode_operations = {
3786 .setattr = ll_setattr,
3787 .getattr = ll_getattr,
3788 .permission = ll_inode_permission,
3789 .setxattr = ll_setxattr,
3790 .getxattr = ll_getxattr,
3791 .listxattr = ll_listxattr,
3792 .removexattr = ll_removexattr,
3793 .fiemap = ll_fiemap,
3794 #ifdef HAVE_IOP_GET_ACL
3795 .get_acl = ll_get_acl,
3799 /* dynamic ioctl number support routins */
/* Registry of dynamically registered ioctl handlers: a list of
 * llioc_data entries protected by an rwsem. */
3800 static struct llioc_ctl_data {
3801 struct rw_semaphore ioc_sem;
3802 struct list_head ioc_head;
3804 __RWSEM_INITIALIZER(llioc.ioc_sem),
3805 LIST_HEAD_INIT(llioc.ioc_head)
/* One registration record: callback plus the cmd numbers it serves
 * (iocd_cmd is a flexible trailing array of iocd_count entries). */
3810 struct list_head iocd_list;
3811 unsigned int iocd_size;
3812 llioc_callback_t iocd_cb;
3813 unsigned int iocd_count;
3814 unsigned int iocd_cmd[0];
/*
 * Register callback @cb for the @count ioctl numbers in @cmd.  Returns
 * an opaque handle (the allocation itself) for later unregistration,
 * or NULL on bad arguments / allocation failure (error returns elided
 * in this excerpt).
 */
3817 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3820 struct llioc_data *in_data = NULL;
3823 if (cb == NULL || cmd == NULL ||
3824 count > LLIOC_MAX_CMD || count < 0)
3827 size = sizeof(*in_data) + count * sizeof(unsigned int);
3828 OBD_ALLOC(in_data, size);
3829 if (in_data == NULL)
3832 memset(in_data, 0, sizeof(*in_data));
3833 in_data->iocd_size = size;
3834 in_data->iocd_cb = cb;
3835 in_data->iocd_count = count;
3836 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3838 down_write(&llioc.ioc_sem);
3839 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3840 up_write(&llioc.ioc_sem);
/*
 * Remove and free the registration identified by @magic (the pointer
 * returned by ll_iocontrol_register).  Warns if no matching entry is
 * found.
 */
3845 void ll_iocontrol_unregister(void *magic)
3847 struct llioc_data *tmp;
3852 down_write(&llioc.ioc_sem);
3853 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3855 unsigned int size = tmp->iocd_size;
3857 list_del(&tmp->iocd_list);
3858 up_write(&llioc.ioc_sem);
3860 OBD_FREE(tmp, size);
3864 up_write(&llioc.ioc_sem);
3866 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3869 EXPORT_SYMBOL(ll_iocontrol_register);
3870 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch @cmd to every registered handler that claims it, stopping at
 * the first callback returning LLIOC_STOP.  The last callback's rc is
 * stored through *rcp; default is -EINVAL when no handler matched.
 */
3872 static enum llioc_iter
3873 ll_iocontrol_call(struct inode *inode, struct file *file,
3874 unsigned int cmd, unsigned long arg, int *rcp)
3876 enum llioc_iter ret = LLIOC_CONT;
3877 struct llioc_data *data;
3878 int rc = -EINVAL, i;
3880 down_read(&llioc.ioc_sem);
3881 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3882 for (i = 0; i < data->iocd_count; i++) {
3883 if (cmd != data->iocd_cmd[i])
3886 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3890 if (ret == LLIOC_STOP)
3893 up_read(&llioc.ioc_sem);
/*
 * Push a layout configuration @conf into the cl_object stack for
 * @inode.  For OBJECT_CONF_SET, the layout lock is allowed to match
 * only after the layout has been applied, and the cached layout
 * generation in ll_inode_info is refreshed.
 */
3900 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3902 struct ll_inode_info *lli = ll_i2info(inode);
3903 struct cl_object *obj = lli->lli_clob;
3912 env = cl_env_get(&refcheck);
3914 RETURN(PTR_ERR(env));
3916 rc = cl_conf_set(env, lli->lli_clob, conf);
3920 if (conf->coc_opc == OBJECT_CONF_SET) {
3921 struct ldlm_lock *lock = conf->coc_lock;
3922 struct cl_layout cl = {
3926 LASSERT(lock != NULL);
3927 LASSERT(ldlm_has_layout(lock));
3929 /* it can only be allowed to match after layout is
3930 * applied to inode otherwise false layout would be
3931 * seen. Applying layout shoud happen before dropping
3932 * the intent lock. */
3933 ldlm_lock_allow_match(lock);
3935 rc = cl_object_layout_get(env, obj, &cl);
3940 DFID": layout version change: %u -> %u\n",
3941 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3943 ll_layout_version_set(lli, cl.cl_layout_gen);
3947 cl_env_put(env, &refcheck);
3952 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/*
 * Populate @lock's LVB with the LOV layout: if l_lvb_data is already
 * set this is a no-op; otherwise fetch trusted.lov via md_getxattr()
 * and attach a private copy to the lock under the resource lock,
 * freeing our copy if another thread raced us in.
 */
3953 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3956 struct ll_sb_info *sbi = ll_i2sbi(inode);
3957 struct ptlrpc_request *req;
3958 struct mdt_body *body;
3965 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3966 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
3967 lock->l_lvb_data, lock->l_lvb_len);
3969 if (lock->l_lvb_data != NULL)
3972 /* if layout lock was granted right away, the layout is returned
3973 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3974 * blocked and then granted via completion ast, we have to fetch
3975 * layout here. Please note that we can't use the LVB buffer in
3976 * completion AST because it doesn't have a large enough buffer */
3977 rc = ll_get_default_mdsize(sbi, &lmmsize);
3979 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
3980 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3985 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3987 GOTO(out, rc = -EPROTO);
3989 lmmsize = body->mbo_eadatasize;
3990 if (lmmsize == 0) /* empty layout */
3993 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3995 GOTO(out, rc = -EFAULT);
3997 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3998 if (lvbdata == NULL)
3999 GOTO(out, rc = -ENOMEM);
4001 memcpy(lvbdata, lmm, lmmsize);
4002 lock_res_and_lock(lock);
/* attach only if no other thread installed an LVB meanwhile */
4003 if (unlikely(lock->l_lvb_data == NULL)) {
4004 lock->l_lvb_type = LVB_T_LAYOUT;
4005 lock->l_lvb_data = lvbdata;
4006 lock->l_lvb_len = lmmsize;
4009 unlock_res_and_lock(lock);
4012 OBD_FREE_LARGE(lvbdata, lmmsize);
4017 ptlrpc_req_finished(req);
4022 * Apply the layout to the inode. Layout lock is held and will be released
 * before this function returns.
 *
 * \param lockh  handle of the granted layout lock (must be in use)
 * \param mode   LDLM mode the lock was granted in; decref'd on exit
 * \param inode  inode whose clio object receives the layout
 *
 * Fetches the layout into the lock's LVB if needed, then pushes it into
 * the cl_object via OBJECT_CONF_SET; if the object is busy (-EBUSY) it
 * waits with OBJECT_CONF_WAIT for outstanding IO to drain.
4025 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4026 struct inode *inode)
4028 struct ll_inode_info *lli = ll_i2info(inode);
4029 struct ll_sb_info *sbi = ll_i2sbi(inode);
4030 struct ldlm_lock *lock;
4031 struct cl_object_conf conf;
4034 bool wait_layout = false;
4037 LASSERT(lustre_handle_is_used(lockh));
/* Takes a reference on the lock; dropped via LDLM_LOCK_PUT below. */
4039 lock = ldlm_handle2lock(lockh);
4040 LASSERT(lock != NULL);
4041 LASSERT(ldlm_has_layout(lock));
4043 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4044 PFID(&lli->lli_fid), inode);
4046 /* in case this is a caching lock and reinstate with new inode */
4047 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
4049 lock_res_and_lock(lock);
4050 lvb_ready = ldlm_is_lvb_ready(lock);
4051 unlock_res_and_lock(lock);
4052 /* checking lvb_ready is racy but this is okay. The worst case is
4053 * that multi processes may configure the file on the same time. */
/* LVB not ready: pull the layout from the MDT into the lock's LVB. */
4058 rc = ll_layout_fetch(inode, lock);
4062 /* for layout lock, lmm is stored in lock's lvb.
4063 * lvb_data is immutable if the lock is held so it's safe to access it
4066 * set layout to file. Unlikely this will fail as old layout was
4067 * surely eliminated */
4068 memset(&conf, 0, sizeof conf);
4069 conf.coc_opc = OBJECT_CONF_SET;
4070 conf.coc_inode = inode;
4071 conf.coc_lock = lock;
4072 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
4073 conf.u.coc_layout.lb_len = lock->l_lvb_len;
4074 rc = ll_layout_conf(inode, &conf);
4076 /* refresh layout failed, need to wait */
4077 wait_layout = rc == -EBUSY;
/* Drop the reference from ldlm_handle2lock() and the caller's
 * mode reference: the layout lock is released past this point. */
4081 LDLM_LOCK_PUT(lock);
4082 ldlm_lock_decref(lockh, mode);
4084 /* wait for IO to complete if it's still being used. */
4086 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
4087 ll_get_fsname(inode->i_sb, NULL, 0),
4088 PFID(&lli->lli_fid), inode);
/* OBJECT_CONF_WAIT blocks until the busy object quiesces so the
 * new layout can be applied on a later retry. */
4090 memset(&conf, 0, sizeof conf);
4091 conf.coc_opc = OBJECT_CONF_WAIT;
4092 conf.coc_inode = inode;
4093 rc = ll_layout_conf(inode, &conf);
4097 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4098 ll_get_fsname(inode->i_sb, NULL, 0),
4099 PFID(&lli->lli_fid), rc);
/* Refresh the inode's layout while holding lli_layout_mutex (caller takes
 * the mutex — see ll_layout_refresh()). First tries to match an already
 * cached layout lock; on a miss, enqueues an IT_LAYOUT intent to the MDT
 * and applies the resulting layout via ll_layout_lock_set(). */
4104 static int ll_layout_refresh_locked(struct inode *inode)
4106 struct ll_inode_info *lli = ll_i2info(inode);
4107 struct ll_sb_info *sbi = ll_i2sbi(inode);
4108 struct md_op_data *op_data;
4109 struct lookup_intent it;
4110 struct lustre_handle lockh;
4111 enum ldlm_mode mode;
/* IBITS enqueue using the standard llite blocking/completion ASTs. */
4112 struct ldlm_enqueue_info einfo = {
4113 .ei_type = LDLM_IBITS,
4115 .ei_cb_bl = &ll_md_blocking_ast,
4116 .ei_cb_cp = &ldlm_completion_ast,
4122 /* mostly layout lock is caching on the local side, so try to match
4123 * it before grabbing layout lock mutex. */
4124 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4125 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4126 if (mode != 0) { /* hit cached lock */
4127 rc = ll_layout_lock_set(&lockh, mode, inode);
4134 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4135 0, 0, LUSTRE_OPC_ANY, NULL);
4136 if (IS_ERR(op_data))
4137 RETURN(PTR_ERR(op_data));
4139 /* have to enqueue one */
4140 memset(&it, 0, sizeof(it));
4141 it.it_op = IT_LAYOUT;
4142 lockh.cookie = 0ULL;
4144 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4145 ll_get_fsname(inode->i_sb, NULL, 0),
4146 PFID(&lli->lli_fid), inode);
4148 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, &it, op_data, &lockh, 0);
/* The intent's request is no longer needed once the lock handle is
 * filled in; release it before using the lock. */
4149 if (it.it_request != NULL)
4150 ptlrpc_req_finished(it.it_request);
4151 it.it_request = NULL;
4153 ll_finish_md_op_data(op_data);
/* Take ownership of the granted lock: clearing it_lock_mode keeps
 * ll_intent_drop_lock() from releasing the reference we now hold. */
4155 mode = it.it_lock_mode;
4156 it.it_lock_mode = 0;
4157 ll_intent_drop_lock(&it);
4160 /* set lock data in case this is a new lock */
4161 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4162 rc = ll_layout_lock_set(&lockh, mode, inode);
4171 * This function checks if there exists a LAYOUT lock on the client side,
4172 * or enqueues it if it doesn't have one in cache.
4174 * This function will not hold layout lock so it may be revoked any time after
4175 * this function returns. Any operations depend on layout should be redone
 * in that case.
4178 * This function should be called before lov_io_init() to get an uptodate
4179 * layout version, the caller should save the version number and after IO
4180 * is finished, this function should be called again to verify that layout
4181 * is not changed during IO time.
 *
 * \param inode  regular file whose layout is refreshed
 * \param gen    out: current layout generation on return
 * \retval 0 on success, negative errno on failure
4183 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4185 struct ll_inode_info *lli = ll_i2info(inode);
4186 struct ll_sb_info *sbi = ll_i2sbi(inode);
4190 *gen = ll_layout_version_get(lli);
/* Fast path: layout lock disabled on this mount, or a valid layout
 * generation is already cached — no enqueue needed. */
4191 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
4195 LASSERT(fid_is_sane(ll_inode2fid(inode)));
4196 LASSERT(S_ISREG(inode->i_mode));
4198 /* take layout lock mutex to enqueue layout lock exclusively. */
4199 mutex_lock(&lli->lli_layout_mutex);
4201 rc = ll_layout_refresh_locked(inode);
/* Re-read the generation: refresh may have installed a new layout. */
4205 *gen = ll_layout_version_get(lli);
4207 mutex_unlock(&lli->lli_layout_mutex);
4213 * This function send a restore request to the MDT
4215 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4217 struct hsm_user_request *hur;
4221 len = sizeof(struct hsm_user_request) +
4222 sizeof(struct hsm_user_item);
4223 OBD_ALLOC(hur, len);
4227 hur->hur_request.hr_action = HUA_RESTORE;
4228 hur->hur_request.hr_archive_id = 0;
4229 hur->hur_request.hr_flags = 0;
4230 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4231 sizeof(hur->hur_user_item[0].hui_fid));
4232 hur->hur_user_item[0].hui_extent.offset = offset;
4233 hur->hur_user_item[0].hui_extent.length = length;
4234 hur->hur_request.hr_itemcount = 1;
4235 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,