4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2015, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <linux/pagemap.h>
46 #include <linux/file.h>
47 #include <linux/sched.h>
48 #include <linux/user_namespace.h>
49 #ifdef HAVE_UIDGID_HEADER
50 # include <linux/uidgid.h>
52 #include <lustre/ll_fiemap.h>
54 #include <lustre_ioctl.h>
55 #include <lustre_swab.h>
57 #include "cl_object.h"
58 #include "llite_internal.h"
59 #include "vvp_internal.h"
/* Forward declarations for static helpers defined later in this file. */
62 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
64 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
67 static enum llioc_iter
68 ll_iocontrol_call(struct inode *inode, struct file *file,
69 unsigned int cmd, unsigned long arg, int *rcp);
/* Allocate a per-open ll_file_data from its dedicated slab cache.
 * GFP_NOFS avoids re-entering the filesystem during memory reclaim. */
71 static struct ll_file_data *ll_file_data_get(void)
73 struct ll_file_data *fd;
75 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
/* Fresh open: no write failure recorded yet. */
79 fd->fd_write_failed = false;
/* Return a ll_file_data to the slab cache; caller must hold the only ref. */
84 static void ll_file_data_put(struct ll_file_data *fd)
87 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
91 * Packs all the attributes into @op_data for the CLOSE rpc.
93 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
94 struct obd_client_handle *och)
98 ll_prep_md_op_data(op_data, inode, NULL, NULL,
99 0, 0, LUSTRE_OPC_ANY, NULL);
/* Snapshot the current VFS inode attributes so the MDT sees the
 * client's final view of the file at close time. */
101 op_data->op_attr.ia_mode = inode->i_mode;
102 op_data->op_attr.ia_atime = inode->i_atime;
103 op_data->op_attr.ia_mtime = inode->i_mtime;
104 op_data->op_attr.ia_ctime = inode->i_ctime;
105 op_data->op_attr.ia_size = i_size_read(inode);
/* Mark which of the packed attributes the server should trust. */
106 op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
107 ATTR_MTIME | ATTR_MTIME_SET |
108 ATTR_CTIME | ATTR_CTIME_SET;
109 op_data->op_attr_blocks = inode->i_blocks;
110 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
/* Identify which open handle on the MDT is being closed. */
111 op_data->op_handle = och->och_fh;
113 if (och->och_flags & FMODE_WRITE &&
114 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
115 /* For HSM: if inode data has been modified, pack it so that
116 * MDT can set data dirty flag in the archive. */
117 op_data->op_bias |= MDS_DATA_MODIFIED;
123 * Perform a close, possibly with a bias.
124 * The meaning of "data" depends on the value of "bias".
126 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
127 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
130 static int ll_close_inode_openhandle(struct inode *inode,
131 struct obd_client_handle *och,
132 enum mds_op_bias bias, void *data)
134 struct obd_export *md_exp = ll_i2mdexp(inode);
135 const struct ll_inode_info *lli = ll_i2info(inode);
136 struct md_op_data *op_data;
137 struct ptlrpc_request *req = NULL;
/* Sanity: cannot close against a torn-down MDC connection. */
141 if (class_exp2obd(md_exp) == NULL) {
142 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
143 ll_get_fsname(inode->i_sb, NULL, 0),
144 PFID(&lli->lli_fid));
148 OBD_ALLOC_PTR(op_data);
149 /* We leak openhandle and request here on error, but not much to be
150 * done in OOM case since app won't retry close on error either. */
152 GOTO(out, rc = -ENOMEM);
154 ll_prepare_close(inode, op_data, och);
/* Add bias-specific payload to the close request. */
156 case MDS_CLOSE_LAYOUT_SWAP:
157 LASSERT(data != NULL);
158 op_data->op_bias |= MDS_CLOSE_LAYOUT_SWAP;
159 op_data->op_data_version = 0;
160 op_data->op_lease_handle = och->och_lease_handle;
/* @data is the second inode whose layout is swapped with ours. */
161 op_data->op_fid2 = *ll_inode2fid(data);
164 case MDS_HSM_RELEASE:
165 LASSERT(data != NULL);
166 op_data->op_bias |= MDS_HSM_RELEASE;
/* @data is the data version the release was computed against. */
167 op_data->op_data_version = *(__u64 *)data;
168 op_data->op_lease_handle = och->och_lease_handle;
169 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
173 LASSERT(data == NULL);
177 rc = md_close(md_exp, op_data, och->och_mod, &req);
/* -EINTR is an expected interruption, don't log it as an error. */
178 if (rc != 0 && rc != -EINTR)
179 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
180 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* For biased closes, check whether the server actually executed the
 * close intent (reply body carries OBD_MD_CLOSE_INTENT_EXECED). */
183 op_data->op_bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)) {
184 struct mdt_body *body;
186 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
187 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
191 ll_finish_md_op_data(op_data);
/* Drop replay data and poison the handle so reuse is detectable. */
195 md_clear_open_replay_data(md_exp, och);
196 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
199 ptlrpc_req_finished(req); /* This is close request */
/* Actually close the MDS open handle that matches @fmode, unless other
 * local users of the same handle remain. */
203 int ll_md_real_close(struct inode *inode, fmode_t fmode)
205 struct ll_inode_info *lli = ll_i2info(inode);
206 struct obd_client_handle **och_p;
207 struct obd_client_handle *och;
/* Select the per-mode handle slot and its use counter. */
212 if (fmode & FMODE_WRITE) {
213 och_p = &lli->lli_mds_write_och;
214 och_usecount = &lli->lli_open_fd_write_count;
215 } else if (fmode & FMODE_EXEC) {
216 och_p = &lli->lli_mds_exec_och;
217 och_usecount = &lli->lli_open_fd_exec_count;
219 LASSERT(fmode & FMODE_READ);
220 och_p = &lli->lli_mds_read_och;
221 och_usecount = &lli->lli_open_fd_read_count;
224 mutex_lock(&lli->lli_och_mutex);
225 if (*och_usecount > 0) {
226 /* There are still users of this handle, so skip
228 mutex_unlock(&lli->lli_och_mutex);
234 mutex_unlock(&lli->lli_och_mutex);
237 /* There might be a race and this handle may already
239 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Per-file-descriptor close: release group lock / lease / extra open
 * handle held by this fd, drop the per-mode open counters, and talk to
 * the MDS only if we do not hold a matching OPEN lock locally. */
245 static int ll_md_close(struct inode *inode, struct file *file)
247 union ldlm_policy_data policy = {
248 .l_inodebits = { MDS_INODELOCK_OPEN },
/* TEST_LOCK: only probe for an existing granted lock, don't enqueue. */
250 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
251 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
252 struct ll_inode_info *lli = ll_i2info(inode);
253 struct lustre_handle lockh;
254 enum ldlm_mode lockmode;
258 /* clear group lock, if present */
259 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
260 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
262 if (fd->fd_lease_och != NULL) {
265 /* Usually the lease is not released when the
266 * application crashed, we need to release here. */
267 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
268 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
269 PFID(&lli->lli_fid), rc, lease_broken);
271 fd->fd_lease_och = NULL;
/* This fd owned a private open handle (e.g. from a lease); close it. */
274 if (fd->fd_och != NULL) {
275 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
280 /* Let's see if we have good enough OPEN lock on the file and if
281 we can skip talking to MDS */
282 mutex_lock(&lli->lli_och_mutex);
283 if (fd->fd_omode & FMODE_WRITE) {
285 LASSERT(lli->lli_open_fd_write_count);
286 lli->lli_open_fd_write_count--;
287 } else if (fd->fd_omode & FMODE_EXEC) {
289 LASSERT(lli->lli_open_fd_exec_count);
290 lli->lli_open_fd_exec_count--;
293 LASSERT(lli->lli_open_fd_read_count);
294 lli->lli_open_fd_read_count--;
296 mutex_unlock(&lli->lli_och_mutex);
/* No cached OPEN ibits lock -> must really close on the MDS. */
298 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
299 LDLM_IBITS, &policy, lockmode, &lockh))
300 rc = ll_md_real_close(inode, fd->fd_omode);
303 LUSTRE_FPRIVATE(file) = NULL;
304 ll_file_data_put(fd);
309 /* While this returns an error code, fput() the caller does not, so we need
310 * to make every effort to clean up all of our state here. Also, applications
311 * rarely check close errors and even if an error is returned they will not
312 * re-try the close call.
314 int ll_file_release(struct inode *inode, struct file *file)
316 struct ll_file_data *fd;
317 struct ll_sb_info *sbi = ll_i2sbi(inode);
318 struct ll_inode_info *lli = ll_i2info(inode);
322 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
323 PFID(ll_inode2fid(inode)), inode);
325 #ifdef CONFIG_FS_POSIX_ACL
/* Remote-client ACL bookkeeping is only done for the fs root inode. */
326 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
327 inode == inode->i_sb->s_root->d_inode) {
328 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
331 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
332 fd->fd_flags &= ~LL_FILE_RMTACL;
333 rct_del(&sbi->ll_rct, current_pid());
334 et_search_free(&sbi->ll_et, current_pid());
/* Don't count releases of the root dentry in the stats. */
339 if (inode->i_sb->s_root != file->f_path.dentry)
340 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
341 fd = LUSTRE_FPRIVATE(file);
344 /* The last ref on @file, maybe not the the owner pid of statahead,
345 * because parent and child process can share the same file handle. */
346 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
347 ll_deauthorize_statahead(inode, fd);
/* Root dentry: nothing was opened on the MDS, just drop fd state. */
349 if (inode->i_sb->s_root == file->f_path.dentry) {
350 LUSTRE_FPRIVATE(file) = NULL;
351 ll_file_data_put(fd);
/* Regular files: surface any async write errors recorded by the
 * cl_object so close() can report them. */
355 if (!S_ISDIR(inode->i_mode)) {
356 if (lli->lli_clob != NULL)
357 lov_read_and_clear_async_rc(lli->lli_clob);
358 lli->lli_async_rc = 0;
361 rc = ll_md_close(inode, file);
363 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
364 libcfs_debug_dumplog();
/* Send an IT_OPEN intent to the MDS for @file, optionally packing layout
 * data (@lmm/@lmmsize), and wire the resulting lock/inode state up. */
369 static int ll_intent_file_open(struct file *file, void *lmm, int lmmsize,
370 struct lookup_intent *itp)
372 struct dentry *de = file->f_path.dentry;
373 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
374 struct dentry *parent = de->d_parent;
375 const char *name = NULL;
377 struct md_op_data *op_data;
378 struct ptlrpc_request *req = NULL;
382 LASSERT(parent != NULL);
383 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
385 /* if server supports open-by-fid, or file name is invalid, don't pack
386 * name in open request */
387 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
388 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
389 name = de->d_name.name;
390 len = de->d_name.len;
393 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
394 name, len, 0, LUSTRE_OPC_ANY, NULL);
396 RETURN(PTR_ERR(op_data));
397 op_data->op_data = lmm;
398 op_data->op_data_size = lmmsize;
400 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
401 &ll_md_blocking_ast, 0);
402 ll_finish_md_op_data(op_data);
404 /* reason for keep own exit path - don`t flood log
405 * with messages with -ESTALE errors.
/* If the open was not granted (or errored) we must drop the server-side
 * open handle to avoid leaking it. */
407 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
408 it_open_error(DISP_OPEN_OPEN, itp))
410 ll_release_openhandle(de, itp);
414 if (it_disposition(itp, DISP_LOOKUP_NEG))
415 GOTO(out, rc = -ENOENT);
417 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
418 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
419 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Update/instantiate the inode from the reply, then attach lock data. */
423 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
424 if (!rc && itp->d.lustre.it_lock_mode)
425 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
428 ptlrpc_req_finished(req);
429 ll_intent_drop_lock(itp);
/* Populate an obd_client_handle from the MDT reply carried in @it and
 * register it for open replay. */
434 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
435 struct obd_client_handle *och)
437 struct ptlrpc_request *req = it->d.lustre.it_data;
438 struct mdt_body *body;
440 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
441 och->och_fh = body->mbo_handle;
442 och->och_fid = body->mbo_fid1;
/* The intent's lock handle doubles as the lease handle for lease opens. */
443 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
444 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
445 och->och_flags = it->it_flags;
447 return md_set_open_replay_data(md_exp, och, it);
/* Finish a local open: optionally fill @och from the intent reply, then
 * attach @fd to the struct file and initialize its readahead/cl state. */
450 static int ll_local_open(struct file *file, struct lookup_intent *it,
451 struct ll_file_data *fd, struct obd_client_handle *och)
453 struct inode *inode = file->f_path.dentry->d_inode;
456 LASSERT(!LUSTRE_FPRIVATE(file));
463 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
468 LUSTRE_FPRIVATE(file) = fd;
469 ll_readahead_init(inode, &fd->fd_ras);
/* Remember the effective open mode for later close bookkeeping. */
470 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
472 /* ll_cl_context initialize */
473 rwlock_init(&fd->fd_lock);
474 INIT_LIST_HEAD(&fd->fd_lccs);
479 /* Open a file, and (for the very first open) create objects on the OSTs at
480 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
481 * creation or open until ll_lov_setstripe() ioctl is called.
483 * If we already have the stripe MD locally then we don't request it in
484 * md_open(), by passing a lmm_size = 0.
486 * It is up to the application to ensure no other processes open this file
487 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
488 * used. We might be able to avoid races of that sort by getting lli_open_sem
489 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
490 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
492 int ll_file_open(struct inode *inode, struct file *file)
494 struct ll_inode_info *lli = ll_i2info(inode);
495 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
496 .it_flags = file->f_flags };
497 struct obd_client_handle **och_p = NULL;
498 __u64 *och_usecount = NULL;
499 struct ll_file_data *fd;
503 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
504 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* An intent may have been stashed on the file by the lookup path. */
506 it = file->private_data; /* XXX: compat macro */
507 file->private_data = NULL; /* prevent ll_local_open assertion */
509 fd = ll_file_data_get();
511 GOTO(out_openerr, rc = -ENOMEM);
514 if (S_ISDIR(inode->i_mode))
515 ll_authorize_statahead(inode, fd);
/* Root dentry opens skip the MDS round-trip entirely. */
517 if (inode->i_sb->s_root == file->f_path.dentry) {
518 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent result: build our own IT_OPEN intent (oit). */
522 if (!it || !it->d.lustre.it_disposition) {
523 /* Convert f_flags into access mode. We cannot use file->f_mode,
524 * because everything but O_ACCMODE mask was stripped from
526 if ((oit.it_flags + 1) & O_ACCMODE)
528 if (file->f_flags & O_TRUNC)
529 oit.it_flags |= FMODE_WRITE;
531 /* kernel only call f_op->open in dentry_open. filp_open calls
532 * dentry_open after call to open_namei that checks permissions.
533 * Only nfsd_open call dentry_open directly without checking
534 * permissions and because of that this code below is safe. */
535 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
536 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
538 /* We do not want O_EXCL here, presumably we opened the file
539 * already? XXX - NFS implications? */
540 oit.it_flags &= ~O_EXCL;
542 /* bug20584, if "it_flags" contains O_CREAT, the file will be
543 * created if necessary, then "IT_CREAT" should be set to keep
544 * consistent with it */
545 if (oit.it_flags & O_CREAT)
546 oit.it_op |= IT_CREAT;
552 /* Let's see if we have file open on MDS already. */
553 if (it->it_flags & FMODE_WRITE) {
554 och_p = &lli->lli_mds_write_och;
555 och_usecount = &lli->lli_open_fd_write_count;
556 } else if (it->it_flags & FMODE_EXEC) {
557 och_p = &lli->lli_mds_exec_och;
558 och_usecount = &lli->lli_open_fd_exec_count;
560 och_p = &lli->lli_mds_read_och;
561 och_usecount = &lli->lli_open_fd_read_count;
564 mutex_lock(&lli->lli_och_mutex);
565 if (*och_p) { /* Open handle is present */
566 if (it_disposition(it, DISP_OPEN_OPEN)) {
567 /* Well, there's extra open request that we do not need,
568 let's close it somehow. This will decref request. */
569 rc = it_open_error(DISP_OPEN_OPEN, it);
571 mutex_unlock(&lli->lli_och_mutex);
572 GOTO(out_openerr, rc);
575 ll_release_openhandle(file->f_path.dentry, it);
/* Reuse the existing MDS open handle for this local open. */
579 rc = ll_local_open(file, it, fd, NULL);
582 mutex_unlock(&lli->lli_och_mutex);
583 GOTO(out_openerr, rc);
586 LASSERT(*och_usecount == 0);
587 if (!it->d.lustre.it_disposition) {
588 /* We cannot just request lock handle now, new ELC code
589 means that one of other OPEN locks for this file
590 could be cancelled, and since blocking ast handler
591 would attempt to grab och_mutex as well, that would
592 result in a deadlock */
593 mutex_unlock(&lli->lli_och_mutex);
595 * Normally called under two situations:
597 * 2. A race/condition on MDS resulting in no open
598 * handle to be returned from LOOKUP|OPEN request,
599 * for example if the target entry was a symlink.
601 * Always fetch MDS_OPEN_LOCK if this is not setstripe.
603 * Always specify MDS_OPEN_BY_FID because we don't want
604 * to get file with different fid.
606 it->it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID;
607 rc = ll_intent_file_open(file, NULL, 0, it);
609 GOTO(out_openerr, rc);
/* First opener for this mode: allocate the shared open handle. */
613 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
615 GOTO(out_och_free, rc = -ENOMEM);
619 /* md_intent_lock() didn't get a request ref if there was an
620 * open error, so don't do cleanup on the request here
622 /* XXX (green): Should not we bail out on any error here, not
623 * just open error? */
624 rc = it_open_error(DISP_OPEN_OPEN, it);
626 GOTO(out_och_free, rc);
628 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
629 "inode %p: disposition %x, status %d\n", inode,
630 it_disposition(it, ~0), it->d.lustre.it_status);
632 rc = ll_local_open(file, it, fd, *och_p);
634 GOTO(out_och_free, rc);
636 mutex_unlock(&lli->lli_och_mutex);
639 /* Must do this outside lli_och_mutex lock to prevent deadlock where
640 different kind of OPEN lock for this same inode gets cancelled
641 by ldlm_cancel_lru */
642 if (!S_ISREG(inode->i_mode))
643 GOTO(out_och_free, rc);
645 cl_lov_delay_create_clear(&file->f_flags);
646 GOTO(out_och_free, rc);
/* Error/cleanup path: free the per-mode handle if we allocated it. */
650 if (och_p && *och_p) {
651 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
652 *och_p = NULL; /* OBD_FREE writes some magic there */
655 mutex_unlock(&lli->lli_och_mutex);
658 if (lli->lli_opendir_key == fd)
659 ll_deauthorize_statahead(inode, fd);
661 ll_file_data_put(fd);
663 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* Drop the intent's open-request reference if we still hold it. */
666 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
667 ptlrpc_req_finished(it->d.lustre.it_data);
668 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* Blocking AST for lease locks: on conflict, cancel the lease lock
 * asynchronously (the lease is thereby "broken"; the openhandle itself
 * is dealt with elsewhere — see ll_lease_open()/ll_lease_close()). */
674 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
675 struct ldlm_lock_desc *desc, void *data, int flag)
678 struct lustre_handle lockh;
682 case LDLM_CB_BLOCKING:
683 ldlm_lock2handle(lock, &lockh);
684 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
686 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
690 case LDLM_CB_CANCELING:
698 * Acquire a lease and open the file.
700 static struct obd_client_handle *
701 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
704 struct lookup_intent it = { .it_op = IT_OPEN };
705 struct ll_sb_info *sbi = ll_i2sbi(inode);
706 struct md_op_data *op_data;
707 struct ptlrpc_request *req = NULL;
708 struct lustre_handle old_handle = { 0 };
709 struct obd_client_handle *och = NULL;
/* A lease is either read or write, never exec or a combination. */
714 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
715 RETURN(ERR_PTR(-EINVAL));
718 struct ll_inode_info *lli = ll_i2info(inode);
719 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
720 struct obd_client_handle **och_p;
/* The lease mode must be covered by the file's open mode. */
723 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
724 RETURN(ERR_PTR(-EPERM));
726 /* Get the openhandle of the file */
728 mutex_lock(&lli->lli_och_mutex);
/* Only one lease per file descriptor. */
729 if (fd->fd_lease_och != NULL) {
730 mutex_unlock(&lli->lli_och_mutex);
734 if (fd->fd_och == NULL) {
735 if (file->f_mode & FMODE_WRITE) {
736 LASSERT(lli->lli_mds_write_och != NULL);
737 och_p = &lli->lli_mds_write_och;
738 och_usecount = &lli->lli_open_fd_write_count;
740 LASSERT(lli->lli_mds_read_och != NULL);
741 och_p = &lli->lli_mds_read_och;
742 och_usecount = &lli->lli_open_fd_read_count;
/* Can only take over the shared handle if we are its sole user. */
744 if (*och_usecount == 1) {
751 mutex_unlock(&lli->lli_och_mutex);
752 if (rc < 0) /* more than 1 opener */
755 LASSERT(fd->fd_och != NULL);
756 old_handle = fd->fd_och->och_fh;
761 RETURN(ERR_PTR(-ENOMEM));
763 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
764 LUSTRE_OPC_ANY, NULL);
766 GOTO(out, rc = PTR_ERR(op_data));
768 /* To tell the MDT this openhandle is from the same owner */
769 op_data->op_handle = old_handle;
771 it.it_flags = fmode | open_flags;
772 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
773 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
774 &ll_md_blocking_lease_ast,
775 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
776 * it can be cancelled which may mislead applications that the lease is
778 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
779 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
780 * doesn't deal with openhandle, so normal openhandle will be leaked. */
781 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
782 ll_finish_md_op_data(op_data);
783 ptlrpc_req_finished(req);
785 GOTO(out_release_it, rc);
787 if (it_disposition(&it, DISP_LOOKUP_NEG))
788 GOTO(out_release_it, rc = -ENOENT);
790 rc = it_open_error(DISP_OPEN_OPEN, &it);
792 GOTO(out_release_it, rc);
794 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
795 ll_och_fill(sbi->ll_md_exp, &it, och);
797 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
798 GOTO(out_close, rc = -EOPNOTSUPP);
800 /* already get lease, handle lease lock */
801 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
/* A lease must come with an OPEN ibits lock; anything else is a
 * protocol violation from the server. */
802 if (it.d.lustre.it_lock_mode == 0 ||
803 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
804 /* open lock must return for lease */
805 CERROR(DFID "lease granted but no open lock, %d/"LPU64".\n",
806 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
807 it.d.lustre.it_lock_bits);
808 GOTO(out_close, rc = -EPROTO);
811 ll_intent_release(&it);
/* Error path: undo the partially-established lease. */
815 /* Cancel open lock */
816 if (it.d.lustre.it_lock_mode != 0) {
817 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
818 it.d.lustre.it_lock_mode);
819 it.d.lustre.it_lock_mode = 0;
820 och->och_lease_handle.cookie = 0ULL;
822 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
824 CERROR("%s: error closing file "DFID": %d\n",
825 ll_get_fsname(inode->i_sb, NULL, 0),
826 PFID(&ll_i2info(inode)->lli_fid), rc2);
827 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
829 ll_intent_release(&it);
837 * Check whether a layout swap can be done between two inodes.
839 * \param[in] inode1 First inode to check
840 * \param[in] inode2 Second inode to check
842 * \retval 0 on success, layout swap can be performed between both inodes
843 * \retval negative error code if requirements are not met
845 static int ll_check_swap_layouts_validity(struct inode *inode1,
846 struct inode *inode2)
/* Only regular files have layouts to swap. */
848 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
/* Caller must have write permission on both files. */
851 if (inode_permission(inode1, MAY_WRITE) ||
852 inode_permission(inode2, MAY_WRITE))
/* Both inodes must live on the same filesystem. */
855 if (inode1->i_sb != inode2->i_sb)
/* Close @inode's open handle @och while atomically swapping its layout
 * with @inode2 on the MDT (MDS_CLOSE_LAYOUT_SWAP biased close). */
861 static int ll_swap_layouts_close(struct obd_client_handle *och,
862 struct inode *inode, struct inode *inode2)
864 const struct lu_fid *fid1 = ll_inode2fid(inode);
865 const struct lu_fid *fid2;
869 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
870 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
872 rc = ll_check_swap_layouts_validity(inode, inode2);
874 GOTO(out_free_och, rc);
876 /* We now know that inode2 is a lustre inode */
877 fid2 = ll_inode2fid(inode2);
/* Swapping a file's layout with itself makes no sense. */
879 rc = lu_fid_cmp(fid1, fid2);
881 GOTO(out_free_och, rc = -EINVAL);
883 /* Close the file and swap layouts between inode & inode2.
884 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
885 * because we still need it to pack l_remote_handle to MDT. */
886 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
889 och = NULL; /* freed in ll_close_inode_openhandle() */
899 * Release lease and close the file.
900 * It will check if the lease has ever broken.
902 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
905 struct ldlm_lock *lock;
906 bool cancelled = true;
/* Look up the lease lock to learn whether it was already cancelled
 * (i.e. the lease was broken by a conflicting access). */
910 lock = ldlm_handle2lock(&och->och_lease_handle);
912 lock_res_and_lock(lock);
913 cancelled = ldlm_is_cancel(lock);
914 unlock_res_and_lock(lock);
918 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
919 PFID(&ll_i2info(inode)->lli_fid), cancelled);
/* Not yet broken: cancel the lease lock ourselves before closing. */
922 ldlm_cli_cancel(&och->och_lease_handle, 0);
923 if (lease_broken != NULL)
924 *lease_broken = cancelled;
926 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Merge the MDS-provided attributes cached in ll_inode_info with the
 * OST-side attributes (size, blocks, timestamps) from the cl_object,
 * taking the newest timestamp of each kind, under the inode size lock. */
930 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
932 struct ll_inode_info *lli = ll_i2info(inode);
933 struct cl_object *obj = lli->lli_clob;
934 struct cl_attr *attr = vvp_env_thread_attr(env);
942 ll_inode_size_lock(inode);
944 /* merge timestamps the most recently obtained from mds with
945 timestamps obtained from osts */
946 LTIME_S(inode->i_atime) = lli->lli_atime;
947 LTIME_S(inode->i_mtime) = lli->lli_mtime;
948 LTIME_S(inode->i_ctime) = lli->lli_ctime;
950 atime = LTIME_S(inode->i_atime);
951 mtime = LTIME_S(inode->i_mtime);
952 ctime = LTIME_S(inode->i_ctime);
954 cl_object_attr_lock(obj);
955 rc = cl_object_attr_get(env, obj, attr);
956 cl_object_attr_unlock(obj);
959 GOTO(out_size_unlock, rc);
/* Keep whichever timestamp is more recent, MDS or OST. */
961 if (atime < attr->cat_atime)
962 atime = attr->cat_atime;
964 if (ctime < attr->cat_ctime)
965 ctime = attr->cat_ctime;
967 if (mtime < attr->cat_mtime)
968 mtime = attr->cat_mtime;
970 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
971 PFID(&lli->lli_fid), attr->cat_size);
/* Size and block count are authoritative from the OSTs. */
973 i_size_write(inode, attr->cat_size);
974 inode->i_blocks = attr->cat_blocks;
976 LTIME_S(inode->i_atime) = atime;
977 LTIME_S(inode->i_mtime) = mtime;
978 LTIME_S(inode->i_ctime) = ctime;
981 ll_inode_size_unlock(inode);
/* Decide whether atime updates should be suppressed for this file,
 * mirroring the checks in the kernel's file_accessed()/touch_atime(). */
986 static bool file_is_noatime(const struct file *file)
988 const struct vfsmount *mnt = file->f_path.mnt;
989 const struct inode *inode = file->f_path.dentry->d_inode;
991 /* Adapted from file_accessed() and touch_atime().*/
992 if (file->f_flags & O_NOATIME)
995 if (inode->i_flags & S_NOATIME)
998 if (IS_NOATIME(inode))
1001 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1004 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1007 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/* Initialize a cl_io for a read (@write == 0) or write (@write != 0) on
 * @file: propagate O_NONBLOCK/O_APPEND/O_SYNC/O_DIRECT semantics and pick
 * the lock requirement mode. */
1013 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
1015 struct inode *inode = file->f_path.dentry->d_inode;
1017 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1019 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1020 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1021 file->f_flags & O_DIRECT ||
1024 io->ci_obj = ll_i2info(inode)->lli_clob;
1025 io->ci_lockreq = CILR_MAYBE;
/* nolock files do all locking server-side; O_APPEND must lock to find EOF. */
1026 if (ll_file_nolock(file)) {
1027 io->ci_lockreq = CILR_NEVER;
1028 io->ci_no_srvlock = 1;
1029 } else if (file->f_flags & O_APPEND) {
1030 io->ci_lockreq = CILR_MANDATORY;
1033 io->ci_noatime = file_is_noatime(file);
/* Common driver for all read/write paths (normal and splice): set up the
 * cl_io, take the range lock where required, run the cl_io loop (restarting
 * on layout change), and account the result in stats. */
1037 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1038 struct file *file, enum cl_io_type iot,
1039 loff_t *ppos, size_t count)
1041 struct vvp_io *vio = vvp_env_io(env);
1042 struct inode *inode = file->f_path.dentry->d_inode;
1043 struct ll_inode_info *lli = ll_i2info(inode);
1044 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1048 struct range_lock range;
1052 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zu\n",
1053 file->f_path.dentry->d_name.name, iot, *ppos, count);
1056 io = vvp_env_thread_io(env);
1057 ll_io_init(io, file, iot == CIT_WRITE);
1059 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1060 bool range_locked = false;
/* O_APPEND writes land at EOF, so lock the whole file range. */
1062 if (file->f_flags & O_APPEND)
1063 range_lock_init(&range, 0, LUSTRE_EOF);
1065 range_lock_init(&range, *ppos, *ppos + count - 1);
1067 vio->vui_fd = LUSTRE_FPRIVATE(file);
1068 vio->vui_io_subtype = args->via_io_subtype;
1070 switch (vio->vui_io_subtype) {
1072 vio->vui_iter = args->u.normal.via_iter;
1073 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1074 vio->vui_tot_nrsegs = vio->vui_iter->nr_segs;
1075 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1076 vio->vui_iocb = args->u.normal.via_iocb;
1077 /* Direct IO reads must also take range lock,
1078 * or multiple reads will try to work on the same pages
1079 * See LU-6227 for details. */
1080 if (((iot == CIT_WRITE) ||
1081 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1082 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1083 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1085 rc = range_lock(&lli->lli_write_tree, &range);
1089 range_locked = true;
/* Splice subtype: carry the pipe and splice flags into the vvp_io. */
1093 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1094 vio->u.splice.vui_flags = args->u.splice.via_flags;
1097 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1101 ll_cl_add(file, env, io);
1102 rc = cl_io_loop(env, io);
1103 ll_cl_remove(file, env);
1106 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1108 range_unlock(&lli->lli_write_tree, &range);
1111 /* cl_io_rw_init() handled IO */
/* Account partial progress and advance the file position. */
1115 if (io->ci_nob > 0) {
1116 result += io->ci_nob;
1117 count -= io->ci_nob;
1118 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1120 /* prepare IO restart */
1121 if (count > 0 && args->via_io_subtype == IO_NORMAL) {
1122 args->u.normal.via_iter = vio->vui_iter;
1123 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1124 args->u.normal.via_iter->nr_segs = vio->vui_tot_nrsegs;
1125 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1130 cl_io_fini(env, io);
/* Layout changed mid-IO: restart the remaining portion. */
1132 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1134 "%s: restart %s from %lld, count:%zu, result: %zd\n",
1135 file->f_path.dentry->d_name.name,
1136 iot == CIT_READ ? "read" : "write",
1137 *ppos, count, result);
/* Stats and fd_write_failed bookkeeping. */
1141 if (iot == CIT_READ) {
1143 ll_stats_ops_tally(ll_i2sbi(inode),
1144 LPROC_LL_READ_BYTES, result);
1145 } else if (iot == CIT_WRITE) {
1147 ll_stats_ops_tally(ll_i2sbi(inode),
1148 LPROC_LL_WRITE_BYTES, result);
1149 fd->fd_write_failed = false;
1150 } else if (result == 0 && rc == 0) {
1153 fd->fd_write_failed = true;
1155 fd->fd_write_failed = false;
/* -ERESTARTSYS means interrupted, not a real write failure. */
1156 } else if (rc != -ERESTARTSYS) {
1157 fd->fd_write_failed = true;
1161 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1163 return result > 0 ? result : rc;
1167 * Read from a file (through the page cache).
1169 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1171 struct vvp_io_args *args;
1176 env = cl_env_get(&refcheck);
1178 return PTR_ERR(env);
/* Package the iterator/iocb and delegate to the common IO driver. */
1180 args = ll_env_args(env, IO_NORMAL);
1181 args->u.normal.via_iter = to;
1182 args->u.normal.via_iocb = iocb;
1184 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1185 &iocb->ki_pos, iov_iter_count(to));
1186 cl_env_put(env, &refcheck);
1191 * Write to a file (through the page cache).
1193 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1195 struct vvp_io_args *args;
1200 env = cl_env_get(&refcheck);
1202 return PTR_ERR(env);
/* Package the iterator/iocb and delegate to the common IO driver. */
1204 args = ll_env_args(env, IO_NORMAL);
1205 args->u.normal.via_iter = from;
1206 args->u.normal.via_iocb = iocb;
1208 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1209 &iocb->ki_pos, iov_iter_count(from));
1210 cl_env_put(env, &refcheck);
1214 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1216 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1218 static int ll_file_get_iov_count(const struct iovec *iov,
1219 unsigned long *nr_segs, size_t *count)
/* Validate the user iovec: compute the total byte count and truncate
 * the segment list at the first inaccessible segment. */
1224 for (seg = 0; seg < *nr_segs; seg++) {
1225 const struct iovec *iv = &iov[seg];
1228 * If any segment has a negative length, or the cumulative
1229 * length ever wraps negative then return -EINVAL.
1232 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1234 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1239 cnt -= iv->iov_len; /* This segment is no good */
/* Legacy aio_read entry (pre read_iter kernels): copy the caller's iovec,
 * build an iov_iter around it, and forward to ll_file_read_iter().  The
 * iovec is copied because the iterator may be modified during the IO. */
1246 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1247 unsigned long nr_segs, loff_t pos)
1249 struct iovec *local_iov;
1250 struct iov_iter *to;
1253 struct lu_env *env = NULL;
1257 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
/* Small vectors reuse the per-env scratch iovec; larger ones allocate. */
1263 env = cl_env_get(&refcheck);
1265 RETURN(PTR_ERR(env));
1267 local_iov = &ll_env_info(env)->lti_local_iov;
1271 OBD_ALLOC(local_iov, sizeof(*iov) * nr_segs);
1272 if (local_iov == NULL)
1275 memcpy(local_iov, iov, sizeof(*iov) * nr_segs);
1283 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1284 iov_iter_init(to, READ, local_iov, nr_segs, iov_count);
1285 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1286 iov_iter_init(to, local_iov, nr_segs, iov_count, 0);
1287 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1289 result = ll_file_read_iter(iocb, to);
1294 cl_env_put(env, &refcheck);
1296 OBD_FREE(local_iov, sizeof(*iov) * nr_segs);
/* Legacy synchronous ->read(): wraps (buf, count) in a one-segment
 * iovec plus a per-env sync kiocb and delegates to ll_file_aio_read,
 * then propagates the updated file position back to *ppos.  The
 * ki_left/ki_nbytes #ifdefs track kernel kiocb layout changes. */
1301 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1305 struct iovec iov = { .iov_base = buf, .iov_len = count };
1306 struct kiocb *kiocb;
1311 env = cl_env_get(&refcheck);
1313 RETURN(PTR_ERR(env));
1315 kiocb = &ll_env_info(env)->lti_kiocb;
1316 init_sync_kiocb(kiocb, file);
1317 kiocb->ki_pos = *ppos;
1318 #ifdef HAVE_KIOCB_KI_LEFT
1319 kiocb->ki_left = count;
1320 #elif defined(HAVE_KI_NBYTES)
1321 kiocb->ki_nbytes = count;
1324 result = ll_file_aio_read(kiocb, &iov, 1, kiocb->ki_pos);
1325 *ppos = kiocb->ki_pos;
1327 cl_env_put(env, &refcheck);
/* Legacy ->aio_write(): write-side twin of ll_file_aio_read — validate
 * the iovecs, copy them locally, build a WRITE iov_iter and forward to
 * ll_file_write_iter.  NOTE(review): excerpt — error paths and some
 * declarations are not visible here. */
1332 * Write to a file (through the page cache).
1335 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1336 unsigned long nr_segs, loff_t pos)
1338 struct iovec *local_iov;
1339 struct iov_iter *from;
1342 struct lu_env *env = NULL;
1346 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1351 env = cl_env_get(&refcheck);
1353 RETURN(PTR_ERR(env));
1355 local_iov = &ll_env_info(env)->lti_local_iov;
1358 OBD_ALLOC(local_iov, sizeof(*iov) * nr_segs);
1359 if (local_iov == NULL)
1362 memcpy(local_iov, iov, sizeof(*iov) * nr_segs);
1365 OBD_ALLOC_PTR(from);
1370 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1371 iov_iter_init(from, WRITE, local_iov, nr_segs, iov_count);
1372 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1373 iov_iter_init(from, local_iov, nr_segs, iov_count, 0);
1374 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1376 result = ll_file_write_iter(iocb, from);
1381 cl_env_put(env, &refcheck);
1383 OBD_FREE(local_iov, sizeof(*iov) * nr_segs);
/* Legacy synchronous ->write(): builds a one-segment iovec and a sync
 * kiocb, delegates to ll_file_aio_write, and writes the final position
 * back to *ppos.  Closes the !HAVE_FILE_OPERATIONS_READ_WRITE_ITER
 * compatibility section. */
1388 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1389 size_t count, loff_t *ppos)
1392 struct iovec iov = { .iov_base = (void __user *)buf,
1394 struct kiocb *kiocb;
1399 env = cl_env_get(&refcheck);
1401 RETURN(PTR_ERR(env));
1403 kiocb = &ll_env_info(env)->lti_kiocb;
1404 init_sync_kiocb(kiocb, file);
1405 kiocb->ki_pos = *ppos;
1406 #ifdef HAVE_KIOCB_KI_LEFT
1407 kiocb->ki_left = count;
1408 #elif defined(HAVE_KI_NBYTES)
1409 kiocb->ki_nbytes = count;
1412 result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos);
1413 *ppos = kiocb->ki_pos;
1415 cl_env_put(env, &refcheck);
1418 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
/* ->splice_read(): same generic-IO path as read, but with IO_SPLICE
 * args (pipe + flags) so pages are fed into the given pipe. */
1421 * Send file content (through pagecache) somewhere with helper
1423 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1424 struct pipe_inode_info *pipe, size_t count,
1428 struct vvp_io_args *args;
1433 env = cl_env_get(&refcheck);
1435 RETURN(PTR_ERR(env));
1437 args = ll_env_args(env, IO_SPLICE);
1438 args->u.splice.via_pipe = pipe;
1439 args->u.splice.via_flags = flags;
1441 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1442 cl_env_put(env, &refcheck);
/* Set striping on a file by (re)opening it by FID with the given LOV
 * user metadata under the inode size lock; the transient open handle is
 * released again via ll_release_openhandle.  Always clears the
 * delay-create flag on the file before returning. */
1446 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1447 __u64 flags, struct lov_user_md *lum,
1450 struct lookup_intent oit = {
1452 .it_flags = flags | MDS_OPEN_BY_FID,
1457 ll_inode_size_lock(inode);
1458 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1460 GOTO(out_unlock, rc);
1462 ll_release_openhandle(file->f_path.dentry, &oit);
1465 ll_inode_size_unlock(inode);
1466 ll_intent_release(&oit);
1467 cl_lov_delay_create_clear(&file->f_flags);
/* Fetch the LOV EA for 'filename' via md_getattr_name, validate the
 * magic (V1/V3 only), and — on little-endian-from-MDS data — swab it to
 * host order for userspace, including per-object entries for regular
 * files.  On success *lmmp/*lmm_size/*request are owned by the caller
 * (request must be finished by the caller).
 * NOTE(review): excerpt — several lines (declarations, some GOTOs,
 * the released-pattern branch body) are not visible here. */
1472 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1473 struct lov_mds_md **lmmp, int *lmm_size,
1474 struct ptlrpc_request **request)
1476 struct ll_sb_info *sbi = ll_i2sbi(inode);
1477 struct mdt_body *body;
1478 struct lov_mds_md *lmm = NULL;
1479 struct ptlrpc_request *req = NULL;
1480 struct md_op_data *op_data;
1483 rc = ll_get_default_mdsize(sbi, &lmmsize);
1487 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1488 strlen(filename), lmmsize,
1489 LUSTRE_OPC_ANY, NULL);
1490 if (IS_ERR(op_data))
1491 RETURN(PTR_ERR(op_data));
1493 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1494 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1495 ll_finish_md_op_data(op_data);
1497 CDEBUG(D_INFO, "md_getattr_name failed "
1498 "on %s: rc %d\n", filename, rc);
1502 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1503 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1505 lmmsize = body->mbo_eadatasize;
1507 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1509 GOTO(out, rc = -ENODATA);
1512 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1513 LASSERT(lmm != NULL);
1515 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1516 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1517 GOTO(out, rc = -EPROTO);
1521 * This is coming from the MDS, so is probably in
1522 * little endian. We convert it to host endian before
1523 * passing it to userspace.
1525 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1528 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1529 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1532 /* if function called for directory - we should
1533 * avoid swab not existent lsm objects */
1534 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1535 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1536 if (S_ISREG(body->mbo_mode))
1537 lustre_swab_lov_user_md_objects(
1538 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1540 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1541 lustre_swab_lov_user_md_v3(
1542 (struct lov_user_md_v3 *)lmm);
1543 if (S_ISREG(body->mbo_mode))
1544 lustre_swab_lov_user_md_objects(
1545 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1552 *lmm_size = lmmsize;
/* LL_IOC_LOV_SETEA handler: admin-only (CFS_CAP_SYS_ADMIN) path that
 * copies a lov_user_md (+ one ost_data entry) from userspace and applies
 * it via ll_lov_setstripe_ea_info with MDS_OPEN_HAS_OBJS. */
1557 static int ll_lov_setea(struct inode *inode, struct file *file,
1560 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1561 struct lov_user_md *lump;
1562 int lum_size = sizeof(struct lov_user_md) +
1563 sizeof(struct lov_user_ost_data);
1567 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1570 OBD_ALLOC_LARGE(lump, lum_size);
1574 if (copy_from_user(lump, (struct lov_user_md __user *)arg, lum_size))
1575 GOTO(out_lump, rc = -EFAULT);
1577 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1580 OBD_FREE_LARGE(lump, lum_size);
/* Copy the file's striping info to the userspace lov_user_md buffer by
 * delegating to cl_object_getstripe on the inode's cl_object. */
1584 static int ll_file_getstripe(struct inode *inode,
1585 struct lov_user_md __user *lum)
1592 env = cl_env_get(&refcheck);
1594 RETURN(PTR_ERR(env));
1596 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum);
1597 cl_env_put(env, &refcheck);
/* LL_IOC_LOV_SETSTRIPE handler: copy the user's lov_user_md into a
 * kernel buffer, apply it, zero the user's stripe_count on success,
 * refresh the layout generation and echo the resulting stripe info
 * back to userspace. */
1601 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1604 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1605 struct lov_user_md *klum;
1607 __u64 flags = FMODE_WRITE;
1610 rc = ll_copy_user_md(lum, &klum);
1615 rc = ll_lov_setstripe_ea_info(inode, file, flags, klum, lum_size);
1619 put_user(0, &lum->lmm_stripe_count);
1621 ll_layout_refresh(inode, &gen);
1622 rc = ll_file_getstripe(inode, (struct lov_user_md __user *)arg);
1625 OBD_FREE(klum, lum_size);
/* Take a group lock (gid = arg) on the file.  fd_flags/fd_grouplock are
 * protected by lli_lock; the lock is dropped around the (blocking)
 * cl_get_grouplock call and the LL_FILE_GROUP_LOCKED flag is re-checked
 * afterwards to handle a racing thread that won in the meantime. */
1630 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1632 struct ll_inode_info *lli = ll_i2info(inode);
1633 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1634 struct ll_grouplock grouplock;
1639 CWARN("group id for group lock must not be 0\n");
1643 if (ll_file_nolock(file))
1644 RETURN(-EOPNOTSUPP);
1646 spin_lock(&lli->lli_lock);
1647 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1648 CWARN("group lock already existed with gid %lu\n",
1649 fd->fd_grouplock.lg_gid);
1650 spin_unlock(&lli->lli_lock);
1653 LASSERT(fd->fd_grouplock.lg_lock == NULL);
1654 spin_unlock(&lli->lli_lock);
1656 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
1657 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1661 spin_lock(&lli->lli_lock);
1662 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1663 spin_unlock(&lli->lli_lock);
1664 CERROR("another thread just won the race\n");
1665 cl_put_grouplock(&grouplock);
1669 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1670 fd->fd_grouplock = grouplock;
1671 spin_unlock(&lli->lli_lock);
1673 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/* Release the group lock held on this file descriptor.  Fails if no
 * group lock is held or if the caller's gid (arg) does not match the
 * one recorded in fd_grouplock; the actual release happens outside
 * lli_lock via cl_put_grouplock on a local copy. */
1677 static int ll_put_grouplock(struct inode *inode, struct file *file,
1680 struct ll_inode_info *lli = ll_i2info(inode);
1681 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1682 struct ll_grouplock grouplock;
1685 spin_lock(&lli->lli_lock);
1686 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1687 spin_unlock(&lli->lli_lock);
1688 CWARN("no group lock held\n");
1692 LASSERT(fd->fd_grouplock.lg_lock != NULL);
1694 if (fd->fd_grouplock.lg_gid != arg) {
1695 CWARN("group lock %lu doesn't match current id %lu\n",
1696 arg, fd->fd_grouplock.lg_gid);
1697 spin_unlock(&lli->lli_lock);
1701 grouplock = fd->fd_grouplock;
1702 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1703 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1704 spin_unlock(&lli->lli_lock);
1706 cl_put_grouplock(&grouplock);
1707 CDEBUG(D_INFO, "group lock %lu released\n", arg);
/* Close the MDS open handle carried in the lookup intent (no-op for the
 * root dentry or when the intent holds no DISP_OPEN_OPEN disposition).
 * Also drops the enqueue open reference on the intent's request. */
1712 * Close inode open handle
1714 * \param dentry [in] dentry which contains the inode
1715 * \param it [in,out] intent which contains open info and result
1718 * \retval <0 failure
1720 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1722 struct inode *inode = dentry->d_inode;
1723 struct obd_client_handle *och;
1729 /* Root ? Do nothing. */
1730 if (dentry->d_inode->i_sb->s_root == dentry)
1733 /* No open handle to close? Move away */
1734 if (!it_disposition(it, DISP_OPEN_OPEN))
1737 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1739 OBD_ALLOC(och, sizeof(*och));
1741 GOTO(out, rc = -ENOMEM);
1743 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1745 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
1747 /* this one is in place of ll_file_open */
1748 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1749 ptlrpc_req_finished(it->d.lustre.it_data);
1750 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* Core FIEMAP implementation: reject unsupported flags (returning the
 * compat mask to the caller via fm_flags), honour FIEMAP_FLAG_SYNC by
 * flushing dirty pages, glimpse the size if unknown, and hand the
 * request to cl_object_fiemap keyed by the inode's obdo/parent fid.
 * A zero-size file short-circuits with no mapped extents. */
1756 * Get size for inode for which FIEMAP mapping is requested.
1757 * Make the FIEMAP get_info call and returns the result.
1758 * \param fiemap kernel buffer to hold extens
1759 * \param num_bytes kernel buffer size
1761 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
1767 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
1770 /* Checks for fiemap flags */
1771 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1772 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1776 /* Check for FIEMAP_FLAG_SYNC */
1777 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1778 rc = filemap_fdatawrite(inode->i_mapping);
1783 env = cl_env_get(&refcheck);
1785 RETURN(PTR_ERR(env));
1787 if (i_size_read(inode) == 0) {
1788 rc = ll_glimpse_size(inode);
1793 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1794 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
1795 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
1797 /* If filesize is 0, then there would be no objects for mapping */
1798 if (fmkey.lfik_oa.o_size == 0) {
1799 fiemap->fm_mapped_extents = 0;
1803 fmkey.lfik_fiemap = *fiemap;
1805 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
1806 &fmkey, fiemap, &num_bytes);
1808 cl_env_put(env, &refcheck);
/* OBD_IOC_FID2PATH: resolve a FID to a path via the MDC.  Permission is
 * CFS_CAP_DAC_READ_SEARCH or the LL_SBI_USER_FID2PATH mount flag; the
 * user-supplied gf_pathlen (capped at PATH_MAX) sizes the output buffer
 * which is copied back wholesale after obd_iocontrol. */
1812 int ll_fid2path(struct inode *inode, void __user *arg)
1814 struct obd_export *exp = ll_i2mdexp(inode);
1815 const struct getinfo_fid2path __user *gfin = arg;
1817 struct getinfo_fid2path *gfout;
1823 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1824 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1827 /* Only need to get the buflen */
1828 if (get_user(pathlen, &gfin->gf_pathlen))
1831 if (pathlen > PATH_MAX)
1834 outsize = sizeof(*gfout) + pathlen;
1835 OBD_ALLOC(gfout, outsize);
1839 if (copy_from_user(gfout, arg, sizeof(*gfout)))
1840 GOTO(gf_free, rc = -EFAULT);
1842 /* Call mdc_iocontrol */
1843 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1847 if (copy_to_user(arg, gfout, outsize))
1851 OBD_FREE(gfout, outsize);
/* Compute the file's data_version by running a CIT_DATA_VERSION cl_io
 * loop; 'flags' selects the OST-side flush behaviour (RD/WR).  A file
 * with no cl_object is treated as version 0.  The io is retried while
 * ci_need_restart is set. */
1856 * Read the data_version for inode.
1858 * This value is computed using stripe object version on OST.
1859 * Version is computed using server side locking.
1861 * @param flags if do sync on the OST side;
1863 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1864 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
1866 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
1868 struct cl_object *obj = ll_i2info(inode)->lli_clob;
1876 /* If no file object initialized, we consider its version is 0. */
1882 env = cl_env_get(&refcheck);
1884 RETURN(PTR_ERR(env));
1886 io = vvp_env_thread_io(env);
1888 io->u.ci_data_version.dv_data_version = 0;
1889 io->u.ci_data_version.dv_flags = flags;
1892 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
1893 result = cl_io_loop(env, io);
1895 result = io->ci_result;
1897 *data_version = io->u.ci_data_version.dv_data_version;
1899 cl_io_fini(env, io);
1901 if (unlikely(io->ci_need_restart))
1904 cl_env_put(env, &refcheck);
/* HSM release: open a write lease with MDS_OPEN_RELEASE, flush and grab
 * the latest data_version, merge attributes, then close the handle with
 * MDS_HSM_RELEASE so the MDT can free the OST objects.  The lease lock
 * handle is released later in mdc_hsm_release_pack (see comment). */
1910 * Trigger a HSM release request for the provided inode.
1912 int ll_hsm_release(struct inode *inode)
1914 struct cl_env_nest nest;
1916 struct obd_client_handle *och = NULL;
1917 __u64 data_version = 0;
1921 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1922 ll_get_fsname(inode->i_sb, NULL, 0),
1923 PFID(&ll_i2info(inode)->lli_fid));
1925 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
1927 GOTO(out, rc = PTR_ERR(och));
1929 /* Grab latest data_version and [am]time values */
1930 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
1934 env = cl_env_nested_get(&nest);
1936 GOTO(out, rc = PTR_ERR(env));
1938 ll_merge_attr(env, inode);
1939 cl_env_nested_put(&nest, env);
1941 /* Release the file.
1942 * NB: lease lock handle is released in mdc_hsm_release_pack() because
1943 * we still need it to pack l_remote_handle to MDT. */
1944 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
1950 if (och != NULL && !IS_ERR(och)) /* close the file */
1951 ll_lease_close(och, inode, NULL);
/* Scratch state for ll_swap_layouts(): the two inodes (ordered by FID)
 * plus the data-version values/check flags that may be swapped with
 * them.  NOTE(review): excerpt — the dv/check_dv members and closing
 * brace are not visible here. */
1956 struct ll_swap_stack {
1959 struct inode *inode1;
1960 struct inode *inode2;
/* Swap the layouts of two open files.  Steps: validate the pair, order
 * the inodes by FID (swapping the per-file dv/check flags alongside),
 * optionally take group locks on both files to flush dirty cache,
 * re-check the requested data_versions, then issue
 * LL_IOC_LOV_SWAP_LAYOUTS to the MDT with mdc_swap_layouts as op_data
 * payload.  Group locks are dropped in reverse order on the way out. */
1965 static int ll_swap_layouts(struct file *file1, struct file *file2,
1966 struct lustre_swap_layouts *lsl)
1968 struct mdc_swap_layouts msl;
1969 struct md_op_data *op_data;
1972 struct ll_swap_stack *llss = NULL;
1975 OBD_ALLOC_PTR(llss);
1979 llss->inode1 = file1->f_path.dentry->d_inode;
1980 llss->inode2 = file2->f_path.dentry->d_inode;
1982 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
1986 /* we use 2 bool because it is easier to swap than 2 bits */
1987 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1988 llss->check_dv1 = true;
1990 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1991 llss->check_dv2 = true;
1993 /* we cannot use lsl->sl_dvX directly because we may swap them */
1994 llss->dv1 = lsl->sl_dv1;
1995 llss->dv2 = lsl->sl_dv2;
1997 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1998 if (rc == 0) /* same file, done! */
2001 if (rc < 0) { /* sequentialize it */
2002 swap(llss->inode1, llss->inode2);
2004 swap(llss->dv1, llss->dv2);
2005 swap(llss->check_dv1, llss->check_dv2);
2009 if (gid != 0) { /* application asks to flush dirty cache */
2010 rc = ll_get_grouplock(llss->inode1, file1, gid);
2014 rc = ll_get_grouplock(llss->inode2, file2, gid);
2016 ll_put_grouplock(llss->inode1, file1, gid);
2021 /* ultimate check, before swaping the layouts we check if
2022 * dataversion has changed (if requested) */
2023 if (llss->check_dv1) {
2024 rc = ll_data_version(llss->inode1, &dv, 0);
2027 if (dv != llss->dv1)
2028 GOTO(putgl, rc = -EAGAIN);
2031 if (llss->check_dv2) {
2032 rc = ll_data_version(llss->inode2, &dv, 0);
2035 if (dv != llss->dv2)
2036 GOTO(putgl, rc = -EAGAIN);
2039 /* struct md_op_data is used to send the swap args to the mdt
2040 * only flags is missing, so we use struct mdc_swap_layouts
2041 * through the md_op_data->op_data */
2042 /* flags from user space have to be converted before they are send to
2043 * server, no flag is sent today, they are only used on the client */
2046 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2047 0, LUSTRE_OPC_ANY, &msl);
2048 if (IS_ERR(op_data))
2049 GOTO(free, rc = PTR_ERR(op_data));
2051 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2052 sizeof(*op_data), op_data, NULL);
2053 ll_finish_md_op_data(op_data);
2060 ll_put_grouplock(llss->inode2, file2, gid);
2061 ll_put_grouplock(llss->inode1, file1, gid);
/* Set/clear HSM state flags on a file.  Rejects masks outside
 * HSM_FLAGS_MASK, restricts non-HSM_USER_MASK bits to admins, and
 * bounds the archive id, then forwards LL_IOC_HSM_STATE_SET to the MDT
 * with the hsm_state_set structure attached to the op_data. */
2071 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2073 struct md_op_data *op_data;
2077 /* Detect out-of range masks */
2078 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2081 /* Non-root users are forbidden to set or clear flags which are
2082 * NOT defined in HSM_USER_MASK. */
2083 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2084 !cfs_capable(CFS_CAP_SYS_ADMIN))
2087 /* Detect out-of range archive id */
2088 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2089 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2092 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2093 LUSTRE_OPC_ANY, hss);
2094 if (IS_ERR(op_data))
2095 RETURN(PTR_ERR(op_data));
2097 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2098 sizeof(*op_data), op_data, NULL);
2100 ll_finish_md_op_data(op_data);
/* HSM import: mark a (regular) file ARCHIVED|EXISTS|RELEASED with the
 * given archive id, then force the mode/uid/gid/size/times supplied in
 * hsm_user_import onto the inode via ll_setattr_raw under i_mutex, so
 * the client-side stub matches the archived copy. */
2105 static int ll_hsm_import(struct inode *inode, struct file *file,
2106 struct hsm_user_import *hui)
2108 struct hsm_state_set *hss = NULL;
2109 struct iattr *attr = NULL;
2113 if (!S_ISREG(inode->i_mode))
2119 GOTO(out, rc = -ENOMEM);
2121 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2122 hss->hss_archive_id = hui->hui_archive_id;
2123 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2124 rc = ll_hsm_state_set(inode, hss);
2128 OBD_ALLOC_PTR(attr);
2130 GOTO(out, rc = -ENOMEM);
2132 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2133 attr->ia_mode |= S_IFREG;
2134 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2135 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2136 attr->ia_size = hui->hui_size;
2137 attr->ia_mtime.tv_sec = hui->hui_mtime;
2138 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2139 attr->ia_atime.tv_sec = hui->hui_atime;
2140 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2142 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2143 ATTR_UID | ATTR_GID |
2144 ATTR_MTIME | ATTR_MTIME_SET |
2145 ATTR_ATIME | ATTR_ATIME_SET;
2147 mutex_lock(&inode->i_mutex);
2149 rc = ll_setattr_raw(file->f_path.dentry, attr, true);
2153 mutex_unlock(&inode->i_mutex);
/* Translate an fmode_t into the LL_LEASE_{RD,WR}LCK bitmask returned by
 * the lease ioctls. */
2165 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2167 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2168 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/* LL_IOC_FUTIMES_3 backend: admin-only setattr of atime/mtime/ctime on
 * a regular file from the ll_futimes_3 payload, applied through
 * ll_setattr_raw under i_mutex (ctime is settable here, unlike plain
 * utimes). */
2171 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2173 struct inode *inode = file->f_path.dentry->d_inode;
2175 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2176 ATTR_MTIME | ATTR_MTIME_SET |
2177 ATTR_CTIME | ATTR_CTIME_SET,
2179 .tv_sec = lfu->lfu_atime_sec,
2180 .tv_nsec = lfu->lfu_atime_nsec,
2183 .tv_sec = lfu->lfu_mtime_sec,
2184 .tv_nsec = lfu->lfu_mtime_nsec,
2187 .tv_sec = lfu->lfu_ctime_sec,
2188 .tv_nsec = lfu->lfu_ctime_nsec,
2194 if (!capable(CAP_SYS_ADMIN))
2197 if (!S_ISREG(inode->i_mode))
2200 mutex_lock(&inode->i_mutex);
2201 rc = ll_setattr_raw(file->f_path.dentry, &ia, false);
2202 mutex_unlock(&inode->i_mutex);
/* Forward one lu_ladvise advice to the server by running a CIT_LADVISE
 * cl_io: the advice's range/advice/flags are copied into
 * io->u.ci_ladvise and dispatched via cl_io_loop. */
2208 * Give file access advices
2210 * The ladvise interface is similar to Linux fadvise() system call, except it
2211 * forwards the advices directly from Lustre client to server. The server side
2212 * codes will apply appropriate read-ahead and caching techniques for the
2213 * corresponding files.
2215 * A typical workload for ladvise is e.g. a bunch of different clients are
2216 * doing small random reads of a file, so prefetching pages into OSS cache
2217 * with big linear reads before the random IO is a net benefit. Fetching
2218 * all that data into each client cache with fadvise() may not be, due to
2219 * much more data being sent to the client.
2221 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2222 struct lu_ladvise *ladvise)
2224 struct cl_env_nest nest;
2227 struct cl_ladvise_io *lio;
2231 env = cl_env_nested_get(&nest);
2233 RETURN(PTR_ERR(env));
2235 io = vvp_env_thread_io(env);
2236 io->ci_obj = ll_i2info(inode)->lli_clob;
2238 /* initialize parameters for ladvise */
2239 lio = &io->u.ci_ladvise;
2240 lio->li_start = ladvise->lla_start;
2241 lio->li_end = ladvise->lla_end;
2242 lio->li_fid = ll_inode2fid(inode);
2243 lio->li_advice = ladvise->lla_advice;
2244 lio->li_flags = flags;
2246 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2247 rc = cl_io_loop(env, io);
2251 cl_io_fini(env, io);
2252 cl_env_nested_put(&nest, env);
/* Main ioctl dispatcher for regular files.  Tty ioctls are bounced
 * early; each case either handles the command inline or delegates to a
 * helper (striping, group locks, HSM, leases, data-version, ladvise,
 * fid2path, ...).  Unrecognized commands fall through to the dynamic
 * ll_iocontrol_call registry and finally to obd_iocontrol on the data
 * export.  NOTE(review): excerpt — a number of lines (RETURNs, GOTO
 * labels, some declarations) are not visible in this view. */
2257 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2259 struct inode *inode = file->f_path.dentry->d_inode;
2260 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2264 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2265 PFID(ll_inode2fid(inode)), inode, cmd);
2266 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2268 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2269 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2273 case LL_IOC_GETFLAGS:
2274 /* Get the current value of the file flags */
2275 return put_user(fd->fd_flags, (int __user *)arg);
2276 case LL_IOC_SETFLAGS:
2277 case LL_IOC_CLRFLAGS:
2278 /* Set or clear specific file flags */
2279 /* XXX This probably needs checks to ensure the flags are
2280 * not abused, and to handle any flag side effects.
2282 if (get_user(flags, (int __user *) arg))
2285 if (cmd == LL_IOC_SETFLAGS) {
2286 if ((flags & LL_FILE_IGNORE_LOCK) &&
2287 !(file->f_flags & O_DIRECT)) {
2288 CERROR("%s: unable to disable locking on "
2289 "non-O_DIRECT file\n", current->comm);
2293 fd->fd_flags |= flags;
2295 fd->fd_flags &= ~flags;
2298 case LL_IOC_LOV_SETSTRIPE:
2299 RETURN(ll_lov_setstripe(inode, file, arg));
2300 case LL_IOC_LOV_SETEA:
2301 RETURN(ll_lov_setea(inode, file, arg));
2302 case LL_IOC_LOV_SWAP_LAYOUTS: {
2304 struct lustre_swap_layouts lsl;
2306 if (copy_from_user(&lsl, (char __user *)arg,
2307 sizeof(struct lustre_swap_layouts)))
2310 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
2313 file2 = fget(lsl.sl_fd);
2317 /* O_WRONLY or O_RDWR */
2318 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
2319 GOTO(out, rc = -EPERM);
2321 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
2322 struct inode *inode2;
2323 struct ll_inode_info *lli;
2324 struct obd_client_handle *och = NULL;
2326 if (lsl.sl_flags != SWAP_LAYOUTS_CLOSE)
2327 GOTO(out, rc = -EINVAL);
2329 lli = ll_i2info(inode);
2330 mutex_lock(&lli->lli_och_mutex);
2331 if (fd->fd_lease_och != NULL) {
2332 och = fd->fd_lease_och;
2333 fd->fd_lease_och = NULL;
2335 mutex_unlock(&lli->lli_och_mutex);
2337 GOTO(out, rc = -ENOLCK);
2338 inode2 = file2->f_path.dentry->d_inode;
2339 rc = ll_swap_layouts_close(och, inode, inode2);
2341 rc = ll_swap_layouts(file, file2, &lsl);
2347 case LL_IOC_LOV_GETSTRIPE:
2348 RETURN(ll_file_getstripe(inode,
2349 (struct lov_user_md __user *)arg));
2350 case FSFILT_IOC_GETFLAGS:
2351 case FSFILT_IOC_SETFLAGS:
2352 RETURN(ll_iocontrol(inode, file, cmd, arg));
2353 case FSFILT_IOC_GETVERSION_OLD:
2354 case FSFILT_IOC_GETVERSION:
2355 RETURN(put_user(inode->i_generation, (int __user *)arg));
2356 case LL_IOC_GROUP_LOCK:
2357 RETURN(ll_get_grouplock(inode, file, arg));
2358 case LL_IOC_GROUP_UNLOCK:
2359 RETURN(ll_put_grouplock(inode, file, arg));
2360 case IOC_OBD_STATFS:
2361 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2363 /* We need to special case any other ioctls we want to handle,
2364 * to send them to the MDS/OST as appropriate and to properly
2365 * network encode the arg field.
2366 case FSFILT_IOC_SETVERSION_OLD:
2367 case FSFILT_IOC_SETVERSION:
2369 case LL_IOC_FLUSHCTX:
2370 RETURN(ll_flush_ctx(inode));
2371 case LL_IOC_PATH2FID: {
2372 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2373 sizeof(struct lu_fid)))
2378 case LL_IOC_GETPARENT:
2379 RETURN(ll_getparent(file, (struct getparent __user *)arg));
2381 case OBD_IOC_FID2PATH:
2382 RETURN(ll_fid2path(inode, (void __user *)arg));
2383 case LL_IOC_DATA_VERSION: {
2384 struct ioc_data_version idv;
2387 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
2390 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2391 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2394 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2400 case LL_IOC_GET_MDTIDX: {
2403 mdtidx = ll_get_mdt_idx(inode);
2407 if (put_user((int)mdtidx, (int __user *)arg))
2412 case OBD_IOC_GETDTNAME:
2413 case OBD_IOC_GETMDNAME:
2414 RETURN(ll_get_obd_name(inode, cmd, arg));
2415 case LL_IOC_HSM_STATE_GET: {
2416 struct md_op_data *op_data;
2417 struct hsm_user_state *hus;
2424 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2425 LUSTRE_OPC_ANY, hus);
2426 if (IS_ERR(op_data)) {
2428 RETURN(PTR_ERR(op_data));
2431 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2434 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2437 ll_finish_md_op_data(op_data);
2441 case LL_IOC_HSM_STATE_SET: {
2442 struct hsm_state_set *hss;
2449 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2454 rc = ll_hsm_state_set(inode, hss);
2459 case LL_IOC_HSM_ACTION: {
2460 struct md_op_data *op_data;
2461 struct hsm_current_action *hca;
2468 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2469 LUSTRE_OPC_ANY, hca);
2470 if (IS_ERR(op_data)) {
2472 RETURN(PTR_ERR(op_data));
2475 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2478 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2481 ll_finish_md_op_data(op_data);
2485 case LL_IOC_SET_LEASE: {
2486 struct ll_inode_info *lli = ll_i2info(inode);
2487 struct obd_client_handle *och = NULL;
2492 case LL_LEASE_WRLCK:
2493 if (!(file->f_mode & FMODE_WRITE))
2495 fmode = FMODE_WRITE;
2497 case LL_LEASE_RDLCK:
2498 if (!(file->f_mode & FMODE_READ))
2502 case LL_LEASE_UNLCK:
2503 mutex_lock(&lli->lli_och_mutex);
2504 if (fd->fd_lease_och != NULL) {
2505 och = fd->fd_lease_och;
2506 fd->fd_lease_och = NULL;
2508 mutex_unlock(&lli->lli_och_mutex);
2513 fmode = och->och_flags;
2514 rc = ll_lease_close(och, inode, &lease_broken);
2521 RETURN(ll_lease_type_from_fmode(fmode));
2526 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2528 /* apply for lease */
2529 och = ll_lease_open(inode, file, fmode, 0);
2531 RETURN(PTR_ERR(och));
2534 mutex_lock(&lli->lli_och_mutex);
2535 if (fd->fd_lease_och == NULL) {
2536 fd->fd_lease_och = och;
2539 mutex_unlock(&lli->lli_och_mutex);
2541 /* impossible now that only excl is supported for now */
2542 ll_lease_close(och, inode, &lease_broken);
2547 case LL_IOC_GET_LEASE: {
2548 struct ll_inode_info *lli = ll_i2info(inode);
2549 struct ldlm_lock *lock = NULL;
2552 mutex_lock(&lli->lli_och_mutex);
2553 if (fd->fd_lease_och != NULL) {
2554 struct obd_client_handle *och = fd->fd_lease_och;
2556 lock = ldlm_handle2lock(&och->och_lease_handle);
2558 lock_res_and_lock(lock);
2559 if (!ldlm_is_cancel(lock))
2560 fmode = och->och_flags;
2562 unlock_res_and_lock(lock);
2563 LDLM_LOCK_PUT(lock);
2566 mutex_unlock(&lli->lli_och_mutex);
2568 RETURN(ll_lease_type_from_fmode(fmode));
2570 case LL_IOC_HSM_IMPORT: {
2571 struct hsm_user_import *hui;
2577 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
2582 rc = ll_hsm_import(inode, file, hui);
2587 case LL_IOC_FUTIMES_3: {
2588 struct ll_futimes_3 lfu;
2590 if (copy_from_user(&lfu,
2591 (const struct ll_futimes_3 __user *)arg,
2595 RETURN(ll_file_futimes_3(file, &lfu));
2597 case LL_IOC_LADVISE: {
2598 struct ladvise_hdr *ladvise_hdr;
2601 int alloc_size = sizeof(*ladvise_hdr);
2604 OBD_ALLOC_PTR(ladvise_hdr);
2605 if (ladvise_hdr == NULL)
2608 if (copy_from_user(ladvise_hdr,
2609 (const struct ladvise_hdr __user *)arg,
2611 GOTO(out_ladvise, rc = -EFAULT);
2613 if (ladvise_hdr->lah_magic != LADVISE_MAGIC ||
2614 ladvise_hdr->lah_count < 1)
2615 GOTO(out_ladvise, rc = -EINVAL);
2617 num_advise = ladvise_hdr->lah_count;
2618 if (num_advise >= LAH_COUNT_MAX)
2619 GOTO(out_ladvise, rc = -EFBIG);
2621 OBD_FREE_PTR(ladvise_hdr);
2622 alloc_size = offsetof(typeof(*ladvise_hdr),
2623 lah_advise[num_advise]);
2624 OBD_ALLOC(ladvise_hdr, alloc_size);
2625 if (ladvise_hdr == NULL)
2629 * TODO: submit multiple advices to one server in a single RPC
2631 if (copy_from_user(ladvise_hdr,
2632 (const struct ladvise_hdr __user *)arg,
2634 GOTO(out_ladvise, rc = -EFAULT);
2636 for (i = 0; i < num_advise; i++) {
2637 rc = ll_ladvise(inode, file, ladvise_hdr->lah_flags,
2638 &ladvise_hdr->lah_advise[i]);
2644 OBD_FREE(ladvise_hdr, alloc_size);
2651 ll_iocontrol_call(inode, file, cmd, arg, &err))
2654 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2655 (void __user *)arg));
/* Compat helper (kernels without generic_file_llseek_size): validate
 * the offset against sign/maxsize rules and commit it to f_pos,
 * resetting f_version on movement. */
2660 #ifndef HAVE_FILE_LLSEEK_SIZE
2661 static inline loff_t
2662 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2664 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2666 if (offset > maxsize)
2669 if (offset != file->f_pos) {
2670 file->f_pos = offset;
2671 file->f_version = 0;
/* Backported generic_file_llseek_size: handles SEEK_CUR specially
 * (avoids rewriting an unchanged f_pos; serializes via i_mutex here)
 * and documents the SEEK_DATA/SEEK_HOLE convention that the whole file
 * is data with a virtual hole at EOF.  NOTE(review): excerpt — the
 * switch statement and several case bodies are not visible here. */
2677 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2678 loff_t maxsize, loff_t eof)
2680 struct inode *inode = file->f_path.dentry->d_inode;
2688 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2689 * position-querying operation. Avoid rewriting the "same"
2690 * f_pos value back to the file because a concurrent read(),
2691 * write() or lseek() might have altered it
2696 * f_lock protects against read/modify/write race with other
2697 * SEEK_CURs. Note that parallel writes and reads behave
2700 mutex_lock(&inode->i_mutex);
2701 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2702 mutex_unlock(&inode->i_mutex);
2706 * In the generic case the entire file is data, so as long as
2707 * offset isn't at the end of the file then the offset is data.
2714 * There is a virtual hole at the end of the file, so as long as
2715 * offset isn't i_size or larger, return i_size.
2723 return llseek_execute(file, offset, maxsize);
/* VFS ->llseek(): for SEEK_END/SEEK_HOLE/SEEK_DATA a glimpse is issued
 * first so i_size is current, then the work is delegated to
 * ll_generic_file_llseek_size bounded by the filesystem's maxbytes. */
2727 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2729 struct inode *inode = file->f_path.dentry->d_inode;
2730 loff_t retval, eof = 0;
2733 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2734 (origin == SEEK_CUR) ? file->f_pos : 0);
2735 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2736 PFID(ll_inode2fid(inode)), inode, retval, retval,
2738 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2740 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2741 retval = ll_glimpse_size(inode);
2744 eof = i_size_read(inode);
2747 retval = ll_generic_file_llseek_size(file, offset, origin,
2748 ll_file_maxbytes(inode), eof);
/* VFS ->flush(): surface async writeback errors recorded on the inode
 * (and its cl_object) as -EIO on close, but only once — if the write
 * failure was already reported to this fd, it is not repeated. */
2752 static int ll_flush(struct file *file, fl_owner_t id)
2754 struct inode *inode = file->f_path.dentry->d_inode;
2755 struct ll_inode_info *lli = ll_i2info(inode);
2756 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2759 LASSERT(!S_ISDIR(inode->i_mode));
2761 /* catch async errors that were recorded back when async writeback
2762 * failed for pages in this mapping. */
2763 rc = lli->lli_async_rc;
2764 lli->lli_async_rc = 0;
2765 if (lli->lli_clob != NULL) {
2766 err = lov_read_and_clear_async_rc(lli->lli_clob);
2771 /* The application has been told write failure already.
2772 * Do not report failure again. */
2773 if (fd->fd_write_failed)
2775 return rc ? -EIO : 0;
/* Run a CIT_FSYNC cl_io over [start, end] with the given fsync mode
 * (NONE/LOCAL/DISCARD/ALL are the only accepted modes); on success the
 * return value is the number of pages written (fi_nr_written). */
2779 * Called to make sure a portion of file has been written out.
2780 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2782 * Return how many pages have been written.
2784 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2785 enum cl_fsync_mode mode, int ignore_layout)
2787 struct cl_env_nest nest;
2790 struct cl_fsync_io *fio;
2794 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2795 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2798 env = cl_env_nested_get(&nest);
2800 RETURN(PTR_ERR(env));
2802 io = vvp_env_thread_io(env);
2803 io->ci_obj = ll_i2info(inode)->lli_clob;
2804 io->ci_ignore_layout = ignore_layout;
2806 /* initialize parameters for sync */
2807 fio = &io->u.ci_fsync;
2808 fio->fi_start = start;
2810 fio->fi_fid = ll_inode2fid(inode);
2811 fio->fi_mode = mode;
2812 fio->fi_nr_written = 0;
2814 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2815 result = cl_io_loop(env, io);
2817 result = io->ci_result;
2819 result = fio->fi_nr_written;
2820 cl_io_fini(env, io);
2821 cl_env_nested_put(&nest, env);
2827 * When dentry is provided (the 'else' case), *file->f_path.dentry may be
2828 * null and dentry must be used directly rather than pulled from
2829 * *file->f_path.dentry as is done otherwise.
/*
 * fsync() handler, compiled in one of three kernel-ABI variants:
 * 4-arg (start/end range), 2-arg, or the old 3-arg form taking a dentry.
 * Flushes dirty pages, forwards recorded async writeback errors, syncs MDT
 * metadata via md_fsync(), and for regular files syncs OST data with
 * cl_sync_file_range(CL_FSYNC_ALL); fd_write_failed tracks the outcome.
 * NOTE(review): interior lines are elided from this view.
 */
2832 #ifdef HAVE_FILE_FSYNC_4ARGS
2833 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2835 struct dentry *dentry = file->f_path.dentry;
2836 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2837 int ll_fsync(struct file *file, int datasync)
2839 struct dentry *dentry = file->f_path.dentry;
2841 loff_t end = LLONG_MAX;
2843 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2846 loff_t end = LLONG_MAX;
2848 struct inode *inode = dentry->d_inode;
2849 struct ll_inode_info *lli = ll_i2info(inode);
2850 struct ptlrpc_request *req;
2854 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2855 PFID(ll_inode2fid(inode)), inode);
2856 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2858 #ifdef HAVE_FILE_FSYNC_4ARGS
/* newer kernels no longer take i_mutex in the VFS, do it ourselves */
2859 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2860 mutex_lock(&inode->i_mutex);
2862 /* fsync's caller has already called _fdata{sync,write}, we want
2863 * that IO to finish before calling the osc and mdc sync methods */
2864 rc = filemap_fdatawait(inode->i_mapping);
2867 /* catch async errors that were recorded back when async writeback
2868 * failed for pages in this mapping. */
2869 if (!S_ISDIR(inode->i_mode)) {
2870 err = lli->lli_async_rc;
2871 lli->lli_async_rc = 0;
2874 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* sync metadata on the MDT */
2879 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
2883 ptlrpc_req_finished(req);
2885 if (S_ISREG(inode->i_mode)) {
2886 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
/* sync file data on the OSTs; remember failure for ll_flush() */
2888 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2889 if (rc == 0 && err < 0)
2892 fd->fd_write_failed = true;
2894 fd->fd_write_failed = false;
2897 #ifdef HAVE_FILE_FSYNC_4ARGS
2898 mutex_unlock(&inode->i_mutex);
/*
 * flock()/fcntl() lock handler (.flock and .lock in the fops tables).
 * Translates a kernel struct file_lock into an LDLM_FLOCK enqueue against
 * the MDT, then mirrors the result into the local kernel lock state via
 * locks_lock_file_wait()/flock_lock_file_wait()/posix_lock_file_wait()
 * depending on kernel ABI.  If the local step fails after a successful
 * server enqueue, the server lock is rolled back with an LCK_NL enqueue.
 * NOTE(review): return type line and several interior lines are elided.
 */
2904 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2906 struct inode *inode = file->f_path.dentry->d_inode;
2907 struct ll_sb_info *sbi = ll_i2sbi(inode);
2908 struct ldlm_enqueue_info einfo = {
2909 .ei_type = LDLM_FLOCK,
2910 .ei_cb_cp = ldlm_flock_completion_ast,
2911 .ei_cbdata = file_lock,
2913 struct md_op_data *op_data;
2914 struct lustre_handle lockh = { 0 };
2915 union ldlm_policy_data flock = { { 0 } };
2916 int fl_type = file_lock->fl_type;
2922 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
2923 PFID(ll_inode2fid(inode)), file_lock);
2925 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2927 if (file_lock->fl_flags & FL_FLOCK) {
2928 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2929 /* flocks are whole-file locks */
2930 flock.l_flock.end = OFFSET_MAX;
2931 /* For flocks owner is determined by the local file desctiptor*/
2932 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2933 } else if (file_lock->fl_flags & FL_POSIX) {
2934 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2935 flock.l_flock.start = file_lock->fl_start;
2936 flock.l_flock.end = file_lock->fl_end;
2940 flock.l_flock.pid = file_lock->fl_pid;
2942 /* Somewhat ugly workaround for svc lockd.
2943 * lockd installs custom fl_lmops->lm_compare_owner that checks
2944 * for the fl_owner to be the same (which it always is on local node
2945 * I guess between lockd processes) and then compares pid.
2946 * As such we assign pid to the owner field to make it all work,
2947 * conflict with normal locks is unlikely since pid space and
2948 * pointer space for current->files are not intersecting */
2949 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2950 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* map fl_type to an LDLM mode: F_RDLCK->PR, F_UNLCK->NL, F_WRLCK->PW */
2954 einfo.ei_mode = LCK_PR;
2957 /* An unlock request may or may not have any relation to
2958 * existing locks so we may not be able to pass a lock handle
2959 * via a normal ldlm_lock_cancel() request. The request may even
2960 * unlock a byte range in the middle of an existing lock. In
2961 * order to process an unlock request we need all of the same
2962 * information that is given with a normal read or write record
2963 * lock request. To avoid creating another ldlm unlock (cancel)
2964 * message we'll treat a LCK_NL flock request as an unlock. */
2965 einfo.ei_mode = LCK_NL;
2968 einfo.ei_mode = LCK_PW;
2971 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* map cmd: non-blocking setlk -> BLOCK_NOWAIT, getlk -> TEST_LOCK */
2986 flags = LDLM_FL_BLOCK_NOWAIT;
2992 flags = LDLM_FL_TEST_LOCK;
2995 CERROR("unknown fcntl lock command: %d\n", cmd);
2999 /* Save the old mode so that if the mode in the lock changes we
3000 * can decrement the appropriate reader or writer refcount. */
3001 file_lock->fl_type = einfo.ei_mode;
3003 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3004 LUSTRE_OPC_ANY, NULL);
3005 if (IS_ERR(op_data))
3006 RETURN(PTR_ERR(op_data));
3008 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
3009 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
3010 flock.l_flock.pid, flags, einfo.ei_mode,
3011 flock.l_flock.start, flock.l_flock.end);
3013 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data, &lockh,
3016 /* Restore the file lock type if not TEST lock. */
3017 if (!(flags & LDLM_FL_TEST_LOCK))
3018 file_lock->fl_type = fl_type;
3020 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
3021 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
3022 !(flags & LDLM_FL_TEST_LOCK))
3023 rc2 = locks_lock_file_wait(file, file_lock);
3025 if ((file_lock->fl_flags & FL_FLOCK) &&
3026 (rc == 0 || file_lock->fl_type == F_UNLCK))
3027 rc2 = flock_lock_file_wait(file, file_lock);
3028 if ((file_lock->fl_flags & FL_POSIX) &&
3029 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3030 !(flags & LDLM_FL_TEST_LOCK))
3031 rc2 = posix_lock_file_wait(file, file_lock);
3032 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
/* local kernel state failed: undo the server-side lock with LCK_NL */
3034 if (rc2 && file_lock->fl_type != F_UNLCK) {
3035 einfo.ei_mode = LCK_NL;
3036 md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data,
3041 ll_finish_md_op_data(op_data);
/*
 * Look up the FID of @name under @parent via an MDT getattr-by-name RPC.
 * On success *fid is filled from the reply body; if @inode is non-NULL the
 * corresponding inode is also instantiated with ll_prep_inode().
 * NOTE(review): interior lines (ENTRY/RETURN, some branches) are elided.
 */
3046 int ll_get_fid_by_name(struct inode *parent, const char *name,
3047 int namelen, struct lu_fid *fid,
3048 struct inode **inode)
3050 struct md_op_data *op_data = NULL;
3051 struct mdt_body *body;
3052 struct ptlrpc_request *req;
3056 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3057 LUSTRE_OPC_ANY, NULL);
3058 if (IS_ERR(op_data))
3059 RETURN(PTR_ERR(op_data));
/* only need FID and file type back from the MDT */
3061 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
3062 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3063 ll_finish_md_op_data(op_data);
3067 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3069 GOTO(out_req, rc = -EFAULT);
3071 *fid = body->mbo_fid1;
3074 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
3076 ptlrpc_req_finished(req);
/*
 * Migrate directory entry @name under @parent to MDT @mdtidx.
 * Steps visible here: resolve the child inode (dcache first, then
 * ll_get_fid_by_name()), sanity-check its FID, skip if already on the
 * target MDT, take a write lease + data version for regular files, then
 * issue md_rename() with CLI_MIGRATE/MDS_RENAME_MIGRATE.  The lease och
 * is cleaned up here if the server executed the close intent, otherwise
 * in the out_close path.
 * NOTE(review): interior lines (labels, some error branches) are elided.
 */
3080 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3081 const char *name, int namelen)
3083 struct dentry *dchild = NULL;
3084 struct inode *child_inode = NULL;
3085 struct md_op_data *op_data;
3086 struct ptlrpc_request *request = NULL;
3087 struct obd_client_handle *och = NULL;
3089 struct mdt_body *body;
3091 __u64 data_version = 0;
3094 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3095 name, PFID(ll_inode2fid(parent)), mdtidx);
3097 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3098 0, LUSTRE_OPC_ANY, NULL);
3099 if (IS_ERR(op_data))
3100 RETURN(PTR_ERR(op_data));
3102 /* Get child FID first */
3103 qstr.hash = full_name_hash(name, namelen);
/* try the dcache before falling back to an MDT lookup RPC */
3106 dchild = d_lookup(file->f_path.dentry, &qstr);
3107 if (dchild != NULL) {
3108 if (dchild->d_inode != NULL)
3109 child_inode = igrab(dchild->d_inode);
3113 if (child_inode == NULL) {
3114 rc = ll_get_fid_by_name(parent, name, namelen,
3115 &op_data->op_fid3, &child_inode);
3120 if (child_inode == NULL)
3121 GOTO(out_free, rc = -EINVAL);
3123 mutex_lock(&child_inode->i_mutex);
3124 op_data->op_fid3 = *ll_inode2fid(child_inode);
3125 if (!fid_is_sane(&op_data->op_fid3)) {
3126 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
3127 ll_get_fsname(parent->i_sb, NULL, 0), name,
3128 PFID(&op_data->op_fid3));
3129 GOTO(out_unlock, rc = -EINVAL);
3132 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3134 GOTO(out_unlock, rc);
/* nothing to do if the child already lives on the target MDT */
3137 CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name,
3138 PFID(&op_data->op_fid3), mdtidx);
3139 GOTO(out_unlock, rc = 0);
3142 if (S_ISREG(child_inode->i_mode)) {
/* take a write lease so concurrent writers invalidate the migrate */
3143 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
3147 GOTO(out_unlock, rc);
3150 rc = ll_data_version(child_inode, &data_version,
3153 GOTO(out_close, rc);
3155 op_data->op_handle = och->och_fh;
3156 op_data->op_data = och->och_mod;
3157 op_data->op_data_version = data_version;
3158 op_data->op_lease_handle = och->och_lease_handle;
3159 op_data->op_bias |= MDS_RENAME_MIGRATE;
3162 op_data->op_mds = mdtidx;
3163 op_data->op_cli_flags = CLI_MIGRATE;
/* migration is implemented as a same-name rename with CLI_MIGRATE */
3164 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3165 namelen, name, namelen, &request);
3167 ll_update_times(request, parent);
3169 if (request != NULL) {
3170 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
3172 ptlrpc_req_finished(request);
3173 GOTO(out_close, rc = -EPROTO);
3176 /* If the server does release layout lock, then we cleanup
3177 * the client och here, otherwise release it in out_close: */
3179 body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
3180 obd_mod_put(och->och_mod);
3181 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
3183 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
3187 ptlrpc_req_finished(request);
3190 /* Try again if the file layout has changed. */
3191 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode)) {
3196 if (och != NULL) /* close the file */
3197 ll_lease_close(och, child_inode, NULL);
3199 clear_nlink(child_inode);
3201 mutex_unlock(&child_inode->i_mutex);
3204 ll_finish_md_op_data(op_data);
/* -o noflock handler used by ll_file_operations_noflock below; presumably
 * returns -ENOSYS for all lock requests (return type and body are on lines
 * elided from this view — TODO confirm against full source). */
3209 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3217 * test if some locks matching bits and l_req_mode are acquired
3218 * - bits can be in different locks
3219 * - if found clear the common lock bits in *bits
3220 * - the bits not found, are kept in *bits
3222 * \param bits [IN] searched lock bits [IN]
3223 * \param l_req_mode [IN] searched lock mode
3224 * \retval boolean, true iff all bits are found
/* Uses LDLM_FL_TEST_LOCK matching, so no lock references are held on exit;
 * each inodebit is matched individually (they may live in separate locks). */
3226 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
3228 struct lustre_handle lockh;
3229 union ldlm_policy_data policy;
/* LCK_MINMODE means "any mode": match all of CR|CW|PR|PW */
3230 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
3231 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
3240 fid = &ll_i2info(inode)->lli_fid;
3241 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3242 ldlm_lockname[mode]);
3244 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
3245 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3246 policy.l_inodebits.bits = *bits & (1 << i);
3247 if (policy.l_inodebits.bits == 0)
3250 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3251 &policy, mode, &lockh)) {
3252 struct ldlm_lock *lock;
3254 lock = ldlm_handle2lock(&lockh);
/* clear every bit the matched lock covers, not just the probed one */
3257 ~(lock->l_policy_data.l_inodebits.bits);
3258 LDLM_LOCK_PUT(lock);
3260 *bits &= ~policy.l_inodebits.bits;
/*
 * Match (and take a reference on) a granted MD lock covering @bits on
 * @inode's resource.  Unlike ll_have_md_lock() this does NOT pass
 * LDLM_FL_TEST_LOCK, so on success the caller owns a lock reference in
 * @lockh and must drop it with ldlm_lock_decref().
 * Returns the matched mode, or 0 if no lock matched.
 */
3267 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
3268 struct lustre_handle *lockh, __u64 flags,
3269 enum ldlm_mode mode)
3271 union ldlm_policy_data policy = { .l_inodebits = { bits } };
3276 fid = &ll_i2info(inode)->lli_fid;
3277 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3279 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3280 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the rc of a revalidate RPC.
 * -ENOENT on a regular file/dir is normalized (object unlinked while we
 * held a reference); striped directories with a bad stripe keep -ENOENT so
 * the dentry gets revalidated again.  Other errors are logged (EACCES and
 * EIDRM only at D_INFO since they are expected under permission changes).
 * NOTE(review): interior lines are elided from this view.
 */
3285 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3287 /* Already unlinked. Just update nlink and return success */
3288 if (rc == -ENOENT) {
3290 /* If it is striped directory, and there is bad stripe
3291 * Let's revalidate the dentry again, instead of returning
3293 if (S_ISDIR(inode->i_mode) &&
3294 ll_i2info(inode)->lli_lsm_md != NULL)
3297 /* This path cannot be hit for regular files unless in
3298 * case of obscure races, so no need to to validate
3300 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3302 } else if (rc != 0) {
3303 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3304 "%s: revalidate FID "DFID" error: rc = %d\n",
3305 ll_get_fsname(inode->i_sb, NULL, 0),
3306 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate @dentry's inode attributes from the MDT for lock bits @ibits.
 * Two strategies:
 *  - servers with OBD_CONNECT_ATTRFID: intent getattr/lookup by FID
 *    (md_intent_lock), which also refreshes the dentry and may invalidate
 *    it if the file was unlinked;
 *  - otherwise: plain md_getattr(), skipped entirely if a matching MD lock
 *    is already cached locally (ll_have_md_lock).
 * NOTE(review): interior lines are elided from this view.
 */
3312 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3314 struct inode *inode = dentry->d_inode;
3315 struct ptlrpc_request *req = NULL;
3316 struct obd_export *exp;
3320 LASSERT(inode != NULL);
3322 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3323 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3325 exp = ll_i2mdexp(inode);
3327 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3328 * But under CMD case, it caused some lock issues, should be fixed
3329 * with new CMD ibits lock. See bug 12718 */
3330 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3331 struct lookup_intent oit = { .it_op = IT_GETATTR };
3332 struct md_op_data *op_data;
3334 if (ibits == MDS_INODELOCK_LOOKUP)
3335 oit.it_op = IT_LOOKUP;
3337 /* Call getattr by fid, so do not provide name at all. */
3338 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3339 dentry->d_inode, NULL, 0, 0,
3340 LUSTRE_OPC_ANY, NULL);
3341 if (IS_ERR(op_data))
3342 RETURN(PTR_ERR(op_data));
3344 rc = md_intent_lock(exp, op_data, &oit, &req,
3345 &ll_md_blocking_ast, 0);
3346 ll_finish_md_op_data(op_data);
3348 rc = ll_inode_revalidate_fini(inode, rc);
3352 rc = ll_revalidate_it_finish(req, &oit, dentry);
3354 ll_intent_release(&oit);
3358 /* Unlinked? Unhash dentry, so it is not picked up later by
3359 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3360 here to preserve get_cwd functionality on 2.6.
3362 if (!dentry->d_inode->i_nlink)
3363 d_lustre_invalidate(dentry, 0);
3365 ll_lookup_finish_locks(&oit, dentry);
3366 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3367 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3368 u64 valid = OBD_MD_FLGETATTR;
3369 struct md_op_data *op_data;
/* regular files: also request striping EA, sized by default mdsize */
3372 if (S_ISREG(inode->i_mode)) {
3373 rc = ll_get_default_mdsize(sbi, &ealen);
3376 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3379 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3380 0, ealen, LUSTRE_OPC_ANY,
3382 if (IS_ERR(op_data))
3383 RETURN(PTR_ERR(op_data));
3385 op_data->op_valid = valid;
3386 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3387 ll_finish_md_op_data(op_data);
3389 rc = ll_inode_revalidate_fini(inode, rc);
3393 rc = ll_prep_inode(&inode, req, NULL, NULL);
3396 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge the attributes of all stripes (fetched
 * via md_merge_attr) into the master inode: nlink, blocks, size, and the
 * cached a/m/ctime in ll_inode_info.
 */
3400 static int ll_merge_md_attr(struct inode *inode)
3402 struct cl_attr attr = { 0 };
3405 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3406 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3407 &attr, ll_md_blocking_ast);
3411 set_nlink(inode, attr.cat_nlink);
3412 inode->i_blocks = attr.cat_blocks;
3413 i_size_write(inode, attr.cat_size);
/* timestamps are kept in lli_* and copied to the inode by the caller */
3415 ll_i2info(inode)->lli_atime = attr.cat_atime;
3416 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
3417 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/*
 * Full revalidate: refresh MD attributes via __ll_inode_revalidate(), then
 * bring the inode's size/times up to date — merged stripe attrs for striped
 * directories, cached lli_* times otherwise, and an OST glimpse for regular
 * file size unless an HSM restore is in progress (the MDT already returned
 * the right size and holds the layout lock, so a glimpse would block).
 * NOTE(review): return type line is elided from this view.
 */
3423 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3425 struct inode *inode = dentry->d_inode;
3429 rc = __ll_inode_revalidate(dentry, ibits);
3433 /* if object isn't regular file, don't validate size */
3434 if (!S_ISREG(inode->i_mode)) {
3435 if (S_ISDIR(inode->i_mode) &&
3436 ll_i2info(inode)->lli_lsm_md != NULL) {
3437 rc = ll_merge_md_attr(inode);
3442 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
3443 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
3444 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
3446 /* In case of restore, the MDT has the right size and has
3447 * already send it back without granting the layout lock,
3448 * inode is up-to-date so glimpse is useless.
3449 * Also to glimpse we need the layout, in case of a running
3450 * restore the MDT holds the layout lock so the glimpse will
3451 * block up to the end of restore (getattr will block)
3453 if (!ll_file_test_flag(ll_i2info(inode), LLIF_FILE_RESTORING))
3454 rc = ll_glimpse_size(inode);
/*
 * getattr() handler: revalidate UPDATE|LOOKUP bits, then copy the (now
 * current) inode fields into @stat.  With a 32-bit API client the ino is
 * rebuilt from the FID via cl_fid_build_ino() instead of i_ino.
 * NOTE(review): interior lines are elided from this view.
 */
3459 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3461 struct inode *inode = de->d_inode;
3462 struct ll_sb_info *sbi = ll_i2sbi(inode);
3463 struct ll_inode_info *lli = ll_i2info(inode);
3466 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3467 MDS_INODELOCK_LOOKUP);
3468 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3473 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
3475 stat->dev = inode->i_sb->s_dev;
3476 if (ll_need_32bit_api(sbi))
3477 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3479 stat->ino = inode->i_ino;
3480 stat->mode = inode->i_mode;
3481 stat->uid = inode->i_uid;
3482 stat->gid = inode->i_gid;
3483 stat->rdev = inode->i_rdev;
3484 stat->atime = inode->i_atime;
3485 stat->mtime = inode->i_mtime;
3486 stat->ctime = inode->i_ctime;
3487 stat->blksize = 1 << inode->i_blkbits;
3489 stat->nlink = inode->i_nlink;
3490 stat->size = i_size_read(inode);
3491 stat->blocks = inode->i_blocks;
/*
 * FIEMAP ioctl backend (.fiemap inode operation).
 * Allocates a struct fiemap sized for fi_extents_max extents, copies in the
 * first user extent (FIEMAP_EXTENT_LAST continuation convention), runs
 * ll_do_fiemap(), and copies the mapped extents back to user space.
 * NOTE(review): interior lines (allocation-failure branch, out label) are
 * elided from this view.
 */
3496 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3497 __u64 start, __u64 len)
3501 struct fiemap *fiemap;
3502 unsigned int extent_count = fieinfo->fi_extents_max;
3504 num_bytes = sizeof(*fiemap) + (extent_count *
3505 sizeof(struct fiemap_extent));
3506 OBD_ALLOC_LARGE(fiemap, num_bytes);
3511 fiemap->fm_flags = fieinfo->fi_flags;
3512 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3513 fiemap->fm_start = start;
3514 fiemap->fm_length = len;
/* seed with the first user-supplied extent (continuation support) */
3515 if (extent_count > 0 &&
3516 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3517 sizeof(struct fiemap_extent)) != 0)
3518 GOTO(out, rc = -EFAULT);
3520 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3522 fieinfo->fi_flags = fiemap->fm_flags;
3523 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3524 if (extent_count > 0 &&
3525 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3526 fiemap->fm_mapped_extents *
3527 sizeof(struct fiemap_extent)) != 0)
3528 GOTO(out, rc = -EFAULT);
3530 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * Return a duplicated reference to the cached POSIX ACL of @inode.
 * The dup is taken under lli_lock; the VFS releases the reference in
 * acl_permission_check()->check_acl().
 */
3534 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3536 struct ll_inode_info *lli = ll_i2info(inode);
3537 struct posix_acl *acl = NULL;
3540 spin_lock(&lli->lli_lock);
3541 /* VFS' acl_permission_check->check_acl will release the refcount */
3542 acl = posix_acl_dup(lli->lli_posix_acl);
3543 spin_unlock(&lli->lli_lock);
/*
 * ACL permission callback for older kernels lacking the 2-arg
 * generic_permission().  With CONFIG_FS_POSIX_ACL it checks the cached ACL
 * via posix_acl_permission(); RCU-walk mode (IPERM_FLAG_RCU) is punted
 * (branch body elided from this view).  Without ACL support, presumably a
 * fallback rc on the elided line 3575 — TODO confirm against full source.
 */
3548 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3550 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3551 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3553 ll_check_acl(struct inode *inode, int mask)
3556 # ifdef CONFIG_FS_POSIX_ACL
3557 struct posix_acl *acl;
3561 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3562 if (flags & IPERM_FLAG_RCU)
3565 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3570 rc = posix_acl_permission(inode, acl, mask);
3571 posix_acl_release(acl);
3574 # else /* !CONFIG_FS_POSIX_ACL */
3576 # endif /* CONFIG_FS_POSIX_ACL */
3578 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * permission() handler (three kernel-ABI variants).
 * Revalidates the root inode on first access, applies root-squash by
 * overriding credentials (fsuid/fsgid and FS capabilities dropped) when the
 * caller is root and squashing is configured, then delegates to either
 * lustre_check_remote_perm() (remote client) or generic permission checking
 * with ll_check_acl.  Overridden creds are reverted afterwards.
 * NOTE(review): interior lines (RCU early-return bodies, cred put) are
 * elided from this view.
 */
3580 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3581 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3583 # ifdef HAVE_INODE_PERMISION_2ARGS
3584 int ll_inode_permission(struct inode *inode, int mask)
3586 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3591 struct ll_sb_info *sbi;
3592 struct root_squash_info *squash;
3593 struct cred *cred = NULL;
3594 const struct cred *old_cred = NULL;
3596 bool squash_id = false;
/* cannot block in RCU-walk mode; tell VFS to retry in ref-walk */
3599 #ifdef MAY_NOT_BLOCK
3600 if (mask & MAY_NOT_BLOCK)
3602 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3603 if (flags & IPERM_FLAG_RCU)
3607 /* as root inode are NOT getting validated in lookup operation,
3608 * need to do it before permission check. */
3610 if (inode == inode->i_sb->s_root->d_inode) {
3611 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3612 MDS_INODELOCK_LOOKUP);
3617 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3618 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3620 /* squash fsuid/fsgid if needed */
3621 sbi = ll_i2sbi(inode);
3622 squash = &sbi->ll_squash;
3623 if (unlikely(squash->rsi_uid != 0 &&
3624 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
3625 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3629 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3630 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3631 squash->rsi_uid, squash->rsi_gid);
3633 /* update current process's credentials
3634 * and FS capability */
3635 cred = prepare_creds();
3639 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
3640 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* drop every filesystem-related capability from the squashed creds */
3641 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3642 if ((1 << cap) & CFS_CAP_FS_MASK)
3643 cap_lower(cred->cap_effective, cap);
3645 old_cred = override_creds(cred);
3648 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3650 if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
3651 rc = lustre_check_remote_perm(inode, mask);
3653 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3655 /* restore current process's credentials and FS capability */
3657 revert_creds(old_cred);
3664 /* -o localflock - only provides locally consistent flock locks */
/* Default fops table: no .flock/.lock entries, so the kernel falls back to
 * purely local (single-node) flock/POSIX lock semantics. */
3665 struct file_operations ll_file_operations = {
3666 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3667 # ifdef HAVE_SYNC_READ_WRITE
3668 .read = new_sync_read,
3669 .write = new_sync_write,
3671 .read_iter = ll_file_read_iter,
3672 .write_iter = ll_file_write_iter,
3673 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3674 .read = ll_file_read,
3675 .aio_read = ll_file_aio_read,
3676 .write = ll_file_write,
3677 .aio_write = ll_file_aio_write,
3678 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3679 .unlocked_ioctl = ll_file_ioctl,
3680 .open = ll_file_open,
3681 .release = ll_file_release,
3682 .mmap = ll_file_mmap,
3683 .llseek = ll_file_seek,
3684 .splice_read = ll_file_splice_read,
/* fops table used with -o flock: cluster-coherent locking via
 * ll_file_flock for both flock() (.flock) and fcntl() (.lock). */
3689 struct file_operations ll_file_operations_flock = {
3690 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3691 # ifdef HAVE_SYNC_READ_WRITE
3692 .read = new_sync_read,
3693 .write = new_sync_write,
3694 # endif /* HAVE_SYNC_READ_WRITE */
3695 .read_iter = ll_file_read_iter,
3696 .write_iter = ll_file_write_iter,
3697 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3698 .read = ll_file_read,
3699 .aio_read = ll_file_aio_read,
3700 .write = ll_file_write,
3701 .aio_write = ll_file_aio_write,
3702 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3703 .unlocked_ioctl = ll_file_ioctl,
3704 .open = ll_file_open,
3705 .release = ll_file_release,
3706 .mmap = ll_file_mmap,
3707 .llseek = ll_file_seek,
3708 .splice_read = ll_file_splice_read,
3711 .flock = ll_file_flock,
3712 .lock = ll_file_flock
3715 /* These are for -o noflock - to return ENOSYS on flock calls */
3716 struct file_operations ll_file_operations_noflock = {
3717 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3718 # ifdef HAVE_SYNC_READ_WRITE
3719 .read = new_sync_read,
3720 .write = new_sync_write,
3721 # endif /* HAVE_SYNC_READ_WRITE */
3722 .read_iter = ll_file_read_iter,
3723 .write_iter = ll_file_write_iter,
3724 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3725 .read = ll_file_read,
3726 .aio_read = ll_file_aio_read,
3727 .write = ll_file_write,
3728 .aio_write = ll_file_aio_write,
3729 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3730 .unlocked_ioctl = ll_file_ioctl,
3731 .open = ll_file_open,
3732 .release = ll_file_release,
3733 .mmap = ll_file_mmap,
3734 .llseek = ll_file_seek,
3735 .splice_read = ll_file_splice_read,
/* ll_file_noflock rejects all lock requests (see its definition above) */
3738 .flock = ll_file_noflock,
3739 .lock = ll_file_noflock
/* Inode operations for regular Lustre files; .get_acl entry only on
 * kernels providing the inode_operations hook (HAVE_IOP_GET_ACL). */
3742 struct inode_operations ll_file_inode_operations = {
3743 .setattr = ll_setattr,
3744 .getattr = ll_getattr,
3745 .permission = ll_inode_permission,
3746 .setxattr = ll_setxattr,
3747 .getxattr = ll_getxattr,
3748 .listxattr = ll_listxattr,
3749 .removexattr = ll_removexattr,
3750 .fiemap = ll_fiemap,
3751 #ifdef HAVE_IOP_GET_ACL
3752 .get_acl = ll_get_acl,
3756 /* dynamic ioctl number support routins */
/* Global registry of dynamically registered ioctl handlers: a list of
 * llioc_data entries protected by an rwsem. */
3757 static struct llioc_ctl_data {
3758 struct rw_semaphore ioc_sem;
3759 struct list_head ioc_head;
3761 __RWSEM_INITIALIZER(llioc.ioc_sem),
3762 LIST_HEAD_INIT(llioc.ioc_head)
/* One registration: callback plus a flexible array of handled cmd codes;
 * iocd_size records the full allocation size for freeing. */
3767 struct list_head iocd_list;
3768 unsigned int iocd_size;
3769 llioc_callback_t iocd_cb;
3770 unsigned int iocd_count;
3771 unsigned int iocd_cmd[0];
/*
 * Register callback @cb for @count dynamic ioctl command numbers in @cmd.
 * Returns an opaque cookie (the allocated llioc_data) to pass to
 * ll_iocontrol_unregister(), or NULL on bad args / allocation failure
 * (failure returns on elided lines).
 */
3774 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3777 struct llioc_data *in_data = NULL;
3780 if (cb == NULL || cmd == NULL ||
3781 count > LLIOC_MAX_CMD || count < 0)
3784 size = sizeof(*in_data) + count * sizeof(unsigned int);
3785 OBD_ALLOC(in_data, size);
3786 if (in_data == NULL)
3789 memset(in_data, 0, sizeof(*in_data));
3790 in_data->iocd_size = size;
3791 in_data->iocd_cb = cb;
3792 in_data->iocd_count = count;
3793 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3795 down_write(&llioc.ioc_sem);
3796 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3797 up_write(&llioc.ioc_sem);
/*
 * Remove and free the registration identified by @magic (the cookie
 * returned by ll_iocontrol_register).  Warns if the cookie is unknown.
 * NOTE(review): the magic-comparison line inside the loop is elided.
 */
3802 void ll_iocontrol_unregister(void *magic)
3804 struct llioc_data *tmp;
3809 down_write(&llioc.ioc_sem);
3810 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3812 unsigned int size = tmp->iocd_size;
3814 list_del(&tmp->iocd_list);
3815 up_write(&llioc.ioc_sem);
3817 OBD_FREE(tmp, size);
3821 up_write(&llioc.ioc_sem);
3823 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3826 EXPORT_SYMBOL(ll_iocontrol_register);
3827 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch ioctl @cmd to the registered dynamic handlers under a read lock.
 * The first callback returning LLIOC_STOP terminates iteration; its rc is
 * reported through *rcp (rc defaults to -EINVAL when no handler claims
 * the command).
 */
3829 static enum llioc_iter
3830 ll_iocontrol_call(struct inode *inode, struct file *file,
3831 unsigned int cmd, unsigned long arg, int *rcp)
3833 enum llioc_iter ret = LLIOC_CONT;
3834 struct llioc_data *data;
3835 int rc = -EINVAL, i;
3837 down_read(&llioc.ioc_sem);
3838 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3839 for (i = 0; i < data->iocd_count; i++) {
3840 if (cmd != data->iocd_cmd[i])
3843 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3847 if (ret == LLIOC_STOP)
3850 up_read(&llioc.ioc_sem);
/*
 * Push a layout configuration @conf down to the cl_object stack via
 * cl_conf_set().  For OBJECT_CONF_SET the layout lock is allowed to match
 * only after the layout is applied (ldlm_lock_allow_match), and the cached
 * layout generation in ll_inode_info is refreshed from the object.
 * NOTE(review): interior lines are elided from this view.
 */
3857 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3859 struct ll_inode_info *lli = ll_i2info(inode);
3860 struct cl_object *obj = lli->lli_clob;
3861 struct cl_env_nest nest;
3869 env = cl_env_nested_get(&nest);
3871 RETURN(PTR_ERR(env));
3873 rc = cl_conf_set(env, lli->lli_clob, conf);
3877 if (conf->coc_opc == OBJECT_CONF_SET) {
3878 struct ldlm_lock *lock = conf->coc_lock;
3879 struct cl_layout cl = {
3883 LASSERT(lock != NULL);
3884 LASSERT(ldlm_has_layout(lock));
3886 /* it can only be allowed to match after layout is
3887 * applied to inode otherwise false layout would be
3888 * seen. Applying layout shoud happen before dropping
3889 * the intent lock. */
3890 ldlm_lock_allow_match(lock);
3892 rc = cl_object_layout_get(env, obj, &cl);
3897 DFID": layout version change: %u -> %u\n",
3898 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3900 ll_layout_version_set(lli, cl.cl_layout_gen);
3904 cl_env_nested_put(&nest, env);
3909 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/*
 * If @lock has no LVB data yet (lock was granted via completion AST rather
 * than in the enqueue reply), fetch the LOV EA with a getxattr RPC sized by
 * the default mdsize and attach it as the lock's LVB under the resource
 * lock.  A concurrent fetcher may win the race, in which case our copy is
 * freed.  NOTE(review): interior lines are elided from this view.
 */
3910 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3913 struct ll_sb_info *sbi = ll_i2sbi(inode);
3914 struct ptlrpc_request *req;
3915 struct mdt_body *body;
3922 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3923 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
3924 lock->l_lvb_data, lock->l_lvb_len);
3926 if (lock->l_lvb_data != NULL)
3929 /* if layout lock was granted right away, the layout is returned
3930 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3931 * blocked and then granted via completion ast, we have to fetch
3932 * layout here. Please note that we can't use the LVB buffer in
3933 * completion AST because it doesn't have a large enough buffer */
3934 rc = ll_get_default_mdsize(sbi, &lmmsize);
3936 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
3937 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3942 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3944 GOTO(out, rc = -EPROTO);
3946 lmmsize = body->mbo_eadatasize;
3947 if (lmmsize == 0) /* empty layout */
3950 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3952 GOTO(out, rc = -EFAULT);
3954 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3955 if (lvbdata == NULL)
3956 GOTO(out, rc = -ENOMEM);
3958 memcpy(lvbdata, lmm, lmmsize);
3959 lock_res_and_lock(lock);
/* install our copy only if nobody raced us; loser frees below */
3960 if (unlikely(lock->l_lvb_data == NULL)) {
3961 lock->l_lvb_type = LVB_T_LAYOUT;
3962 lock->l_lvb_data = lvbdata;
3963 lock->l_lvb_len = lmmsize;
3966 unlock_res_and_lock(lock);
3969 OBD_FREE_LARGE(lvbdata, lmmsize);
3974 ptlrpc_req_finished(req);
3979 * Apply the layout to the inode. Layout lock is held and will be released
/*
 * Given a held layout lock (@lockh, @mode), fetch the layout LVB if
 * necessary (ll_layout_fetch), apply it to the cl_object via an
 * OBJECT_CONF_SET, and drop the lock reference.  If the configuration
 * returns -EBUSY (layout still in use by IO), issue OBJECT_CONF_WAIT to
 * block until in-flight IO against the old layout drains.
 * NOTE(review): interior lines (retry/out paths) are elided from this view.
 */
3982 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
3983 struct inode *inode)
3985 struct ll_inode_info *lli = ll_i2info(inode);
3986 struct ll_sb_info *sbi = ll_i2sbi(inode);
3987 struct ldlm_lock *lock;
3988 struct cl_object_conf conf;
3991 bool wait_layout = false;
3994 LASSERT(lustre_handle_is_used(lockh));
3996 lock = ldlm_handle2lock(lockh);
3997 LASSERT(lock != NULL);
3998 LASSERT(ldlm_has_layout(lock));
4000 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4001 PFID(&lli->lli_fid), inode);
4003 /* in case this is a caching lock and reinstate with new inode */
4004 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
4006 lock_res_and_lock(lock);
4007 lvb_ready = ldlm_is_lvb_ready(lock);
4008 unlock_res_and_lock(lock);
4009 /* checking lvb_ready is racy but this is okay. The worst case is
4010 * that multi processes may configure the file on the same time. */
4015 rc = ll_layout_fetch(inode, lock);
4019 /* for layout lock, lmm is stored in lock's lvb.
4020 * lvb_data is immutable if the lock is held so it's safe to access it
4023 * set layout to file. Unlikely this will fail as old layout was
4024 * surely eliminated */
4025 memset(&conf, 0, sizeof conf);
4026 conf.coc_opc = OBJECT_CONF_SET;
4027 conf.coc_inode = inode;
4028 conf.coc_lock = lock;
4029 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
4030 conf.u.coc_layout.lb_len = lock->l_lvb_len;
4031 rc = ll_layout_conf(inode, &conf);
4033 /* refresh layout failed, need to wait */
4034 wait_layout = rc == -EBUSY;
4038 LDLM_LOCK_PUT(lock);
4039 ldlm_lock_decref(lockh, mode);
4041 /* wait for IO to complete if it's still being used. */
4043 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
4044 ll_get_fsname(inode->i_sb, NULL, 0),
4045 PFID(&lli->lli_fid), inode);
4047 memset(&conf, 0, sizeof conf);
4048 conf.coc_opc = OBJECT_CONF_WAIT;
4049 conf.coc_inode = inode;
4050 rc = ll_layout_conf(inode, &conf);
4054 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4055 ll_get_fsname(inode->i_sb, NULL, 0),
4056 PFID(&lli->lli_fid), rc);
/*
 * Ensure a layout lock is held and the layout applied, with the layout
 * mutex already held by the caller (per the _locked suffix).
 * Fast path: match a cached MD layout lock with ll_take_md_lock().
 * Slow path: enqueue an IT_LAYOUT intent via md_enqueue(), then apply the
 * resulting lock with ll_layout_lock_set().  The retry-on--EAGAIN path is
 * on lines elided from this view.
 */
4061 static int ll_layout_refresh_locked(struct inode *inode)
4063 struct ll_inode_info *lli = ll_i2info(inode);
4064 struct ll_sb_info *sbi = ll_i2sbi(inode);
4065 struct md_op_data *op_data;
4066 struct lookup_intent it;
4067 struct lustre_handle lockh;
4068 enum ldlm_mode mode;
4069 struct ldlm_enqueue_info einfo = {
4070 .ei_type = LDLM_IBITS,
4072 .ei_cb_bl = &ll_md_blocking_ast,
4073 .ei_cb_cp = &ldlm_completion_ast,
4079 /* mostly layout lock is caching on the local side, so try to match
4080 * it before grabbing layout lock mutex. */
4081 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4082 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4083 if (mode != 0) { /* hit cached lock */
4084 rc = ll_layout_lock_set(&lockh, mode, inode);
4091 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4092 0, 0, LUSTRE_OPC_ANY, NULL);
4093 if (IS_ERR(op_data))
4094 RETURN(PTR_ERR(op_data));
4096 /* have to enqueue one */
4097 memset(&it, 0, sizeof(it));
4098 it.it_op = IT_LAYOUT;
4099 lockh.cookie = 0ULL;
4101 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4102 ll_get_fsname(inode->i_sb, NULL, 0),
4103 PFID(&lli->lli_fid), inode);
4105 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, &it, op_data, &lockh, 0);
/* drop the intent's request reference; the lock handle is kept */
4106 if (it.d.lustre.it_data != NULL)
4107 ptlrpc_req_finished(it.d.lustre.it_data);
4108 it.d.lustre.it_data = NULL;
4110 ll_finish_md_op_data(op_data);
4112 mode = it.d.lustre.it_lock_mode;
4113 it.d.lustre.it_lock_mode = 0;
4114 ll_intent_drop_lock(&it);
4117 /* set lock data in case this is a new lock */
4118 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4119 rc = ll_layout_lock_set(&lockh, mode, inode);
4128 * This function checks if there exists a LAYOUT lock on the client side,
4129 * or enqueues it if it doesn't have one in cache.
4131 This function will not hold the layout lock, so it may be revoked any time after
4132 this function returns. Any operations that depend on the layout should be redone
4135 * This function should be called before lov_io_init() to get an uptodate
4136 * layout version, the caller should save the version number and after IO
4137 * is finished, this function should be called again to verify that layout
4138 * is not changed during IO time.
/*
 * Public entry point: ensure the client holds an up-to-date layout for
 * @inode and report its generation number in *gen.
 *
 * If layout locking is disabled on this superblock, or a layout generation
 * is already instantiated (*gen != CL_LAYOUT_GEN_NONE), this returns with
 * the current generation and does not enqueue anything.  Otherwise it
 * serializes on lli_layout_mutex and refreshes via
 * ll_layout_refresh_locked().
 *
 * \param[in]  inode	regular file whose layout is needed
 * \param[out] gen	layout generation after the (possible) refresh
 *
 * \retval 0 on success, negative errno on failure.
 *
 * NOTE(review): the original tail of this function (RETURN and closing
 * brace, lines 4165-4167) is elided from this extracted chunk.
 */
4140 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4142 struct ll_inode_info *lli = ll_i2info(inode);
4143 struct ll_sb_info *sbi = ll_i2sbi(inode);
	/* report the currently instantiated generation first; callers
	 * compare it before and after their IO */
4147 *gen = ll_layout_version_get(lli);
	/* nothing to do when layout locks are disabled or a layout is
	 * already set for this inode */
4148 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
	/* sanity: only regular files with valid FIDs carry layouts */
4152 LASSERT(fid_is_sane(ll_inode2fid(inode)));
4153 LASSERT(S_ISREG(inode->i_mode));
4155 /* take layout lock mutex to enqueue layout lock exclusively. */
4156 mutex_lock(&lli->lli_layout_mutex);
4158 rc = ll_layout_refresh_locked(inode);
	/* re-read the generation installed by the refresh */
4162 *gen = ll_layout_version_get(lli);
4164 mutex_unlock(&lli->lli_layout_mutex);
4170 * This function sends a restore request to the MDT
4172 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4174 struct hsm_user_request *hur;
4178 len = sizeof(struct hsm_user_request) +
4179 sizeof(struct hsm_user_item);
4180 OBD_ALLOC(hur, len);
4184 hur->hur_request.hr_action = HUA_RESTORE;
4185 hur->hur_request.hr_archive_id = 0;
4186 hur->hur_request.hr_flags = 0;
4187 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4188 sizeof(hur->hur_user_item[0].hui_fid));
4189 hur->hur_user_item[0].hui_extent.offset = offset;
4190 hur->hur_user_item[0].hui_extent.length = length;
4191 hur->hur_request.hr_itemcount = 1;
4192 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,