4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2015, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <linux/pagemap.h>
46 #include <linux/file.h>
47 #include <linux/sched.h>
48 #include <linux/user_namespace.h>
49 #ifdef HAVE_UIDGID_HEADER
50 # include <linux/uidgid.h>
52 #include <lustre/ll_fiemap.h>
54 #include <lustre_ioctl.h>
55 #include <lustre_swab.h>
57 #include "cl_object.h"
58 #include "llite_internal.h"
59 #include "vvp_internal.h"
/* Forward declarations for helpers defined later in this file:
 * group-lock release, lease close, and the dynamic-ioctl dispatcher. */
62 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
64 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
67 static enum llioc_iter
68 ll_iocontrol_call(struct inode *inode, struct file *file,
69 unsigned int cmd, unsigned long arg, int *rcp);
/* Allocate a per-open ll_file_data from the dedicated slab cache
 * (GFP_NOFS to avoid recursing into the filesystem under memory
 * pressure) and initialize its write-failure flag. */
71 static struct ll_file_data *ll_file_data_get(void)
73 struct ll_file_data *fd;
75 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
79 fd->fd_write_failed = false;
/* Release a per-open ll_file_data back to its slab cache. */
84 static void ll_file_data_put(struct ll_file_data *fd)
87 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
91  * Packs all the attributes into @op_data for the CLOSE rpc.
/* Snapshot the inode's current mode/times/size/blocks/flags into
 * @op_data and record the open handle, so the MDT sees the client's
 * latest view of the file at close time. */
93 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
94 struct obd_client_handle *och)
98 ll_prep_md_op_data(op_data, inode, NULL, NULL,
99 0, 0, LUSTRE_OPC_ANY, NULL);
101 op_data->op_attr.ia_mode = inode->i_mode;
102 op_data->op_attr.ia_atime = inode->i_atime;
103 op_data->op_attr.ia_mtime = inode->i_mtime;
104 op_data->op_attr.ia_ctime = inode->i_ctime;
105 op_data->op_attr.ia_size = i_size_read(inode);
/* The *_SET variants tell the server to use the packed timestamps
 * verbatim rather than its own clock. */
106 op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
107 ATTR_MTIME | ATTR_MTIME_SET |
108 ATTR_CTIME | ATTR_CTIME_SET;
109 op_data->op_attr_blocks = inode->i_blocks;
110 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
111 op_data->op_handle = och->och_fh;
113 if (och->och_flags & FMODE_WRITE &&
114 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
115 /* For HSM: if inode data has been modified, pack it so that
116 * MDT can set data dirty flag in the archive. */
117 op_data->op_bias |= MDS_DATA_MODIFIED;
123  * Perform a close, possibly with a bias.
124  * The meaning of "data" depends on the value of "bias".
126  * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
127  * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
/* Send the MDS_CLOSE RPC for @och, optionally biased for HSM release
 * or layout swap.  Consumes @och (open-replay data is cleared and the
 * handle cookie poisoned) regardless of RPC outcome. */
130 static int ll_close_inode_openhandle(struct inode *inode,
131 struct obd_client_handle *och,
132 enum mds_op_bias bias, void *data)
134 struct obd_export *md_exp = ll_i2mdexp(inode);
135 const struct ll_inode_info *lli = ll_i2info(inode);
136 struct md_op_data *op_data;
137 struct ptlrpc_request *req = NULL;
141 if (class_exp2obd(md_exp) == NULL) {
142 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
143 ll_get_fsname(inode->i_sb, NULL, 0),
144 PFID(&lli->lli_fid));
148 OBD_ALLOC_PTR(op_data);
149 /* We leak openhandle and request here on error, but not much to be
150 * done in OOM case since app won't retry close on error either. */
152 GOTO(out, rc = -ENOMEM);
154 ll_prepare_close(inode, op_data, och);
/* Layout swap: @data is the second inode; pack its FID as fid2. */
156 case MDS_CLOSE_LAYOUT_SWAP:
157 LASSERT(data != NULL);
158 op_data->op_bias |= MDS_CLOSE_LAYOUT_SWAP;
159 op_data->op_data_version = 0;
160 op_data->op_lease_handle = och->och_lease_handle;
161 op_data->op_fid2 = *ll_inode2fid(data);
/* HSM release: @data is a __u64 data version the MDT must match. */
164 case MDS_HSM_RELEASE:
165 LASSERT(data != NULL);
166 op_data->op_bias |= MDS_HSM_RELEASE;
167 op_data->op_data_version = *(__u64 *)data;
168 op_data->op_lease_handle = och->och_lease_handle;
169 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
173 LASSERT(data == NULL);
177 rc = md_close(md_exp, op_data, och->och_mod, &req);
/* -EINTR is an expected interruption, not worth a console error. */
178 if (rc != 0 && rc != -EINTR)
179 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
180 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* For biased closes, check the reply body to learn whether the MDT
 * actually executed the close intent (release/swap). */
183 op_data->op_bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)) {
184 struct mdt_body *body;
186 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
187 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
191 ll_finish_md_op_data(op_data);
195 md_clear_open_replay_data(md_exp, och);
196 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
199 ptlrpc_req_finished(req); /* This is close request */
/* Drop the last reference on the MDS open handle matching @fmode
 * (write/exec/read) and, if no other opener remains, actually close
 * the handle on the MDT. */
203 int ll_md_real_close(struct inode *inode, fmode_t fmode)
205 struct ll_inode_info *lli = ll_i2info(inode);
206 struct obd_client_handle **och_p;
207 struct obd_client_handle *och;
/* Pick the per-mode handle slot and its use count. */
212 if (fmode & FMODE_WRITE) {
213 och_p = &lli->lli_mds_write_och;
214 och_usecount = &lli->lli_open_fd_write_count;
215 } else if (fmode & FMODE_EXEC) {
216 och_p = &lli->lli_mds_exec_och;
217 och_usecount = &lli->lli_open_fd_exec_count;
219 LASSERT(fmode & FMODE_READ);
220 och_p = &lli->lli_mds_read_och;
221 och_usecount = &lli->lli_open_fd_read_count;
224 mutex_lock(&lli->lli_och_mutex);
225 if (*och_usecount > 0) {
226 /* There are still users of this handle, so skip
228 mutex_unlock(&lli->lli_och_mutex);
234 mutex_unlock(&lli->lli_och_mutex);
237 /* There might be a race and this handle may already
239 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Per-file-descriptor close: release group locks and any lease/och
 * held by this fd, decrement the per-mode open count, and only talk
 * to the MDS if no matching OPEN DLM lock lets us skip the RPC.
 * Frees the ll_file_data on the way out. */
245 static int ll_md_close(struct inode *inode, struct file *file)
247 union ldlm_policy_data policy = {
248 .l_inodebits = { MDS_INODELOCK_OPEN },
/* TEST_LOCK: only probe for a granted OPEN lock, don't take a ref. */
250 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
251 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
252 struct ll_inode_info *lli = ll_i2info(inode);
253 struct lustre_handle lockh;
254 enum ldlm_mode lockmode;
258 /* clear group lock, if present */
259 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
260 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
262 if (fd->fd_lease_och != NULL) {
265 /* Usually the lease is not released when the
266 * application crashed, we need to release here. */
267 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
268 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
269 PFID(&lli->lli_fid), rc, lease_broken);
271 fd->fd_lease_och = NULL;
274 if (fd->fd_och != NULL) {
275 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
280 /* Let's see if we have good enough OPEN lock on the file and if
281 we can skip talking to MDS */
282 mutex_lock(&lli->lli_och_mutex);
283 if (fd->fd_omode & FMODE_WRITE) {
285 LASSERT(lli->lli_open_fd_write_count);
286 lli->lli_open_fd_write_count--;
287 } else if (fd->fd_omode & FMODE_EXEC) {
289 LASSERT(lli->lli_open_fd_exec_count);
290 lli->lli_open_fd_exec_count--;
293 LASSERT(lli->lli_open_fd_read_count);
294 lli->lli_open_fd_read_count--;
296 mutex_unlock(&lli->lli_och_mutex);
/* No cached OPEN ibits lock => must do the real close on the MDS. */
298 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
299 LDLM_IBITS, &policy, lockmode, &lockh))
300 rc = ll_md_real_close(inode, fd->fd_omode);
303 LUSTRE_FPRIVATE(file) = NULL;
304 ll_file_data_put(fd);
309 /* While this returns an error code, fput() the caller does not, so we need
310 * to make every effort to clean up all of our state here. Also, applications
311 * rarely check close errors and even if an error is returned they will not
312 * re-try the close call.
/* VFS ->release() entry point: tears down remote-ACL state for the
 * root inode, statahead authorization, async write errors cached on
 * the cl_object, then performs the MD close. */
314 int ll_file_release(struct inode *inode, struct file *file)
316 struct ll_file_data *fd;
317 struct ll_sb_info *sbi = ll_i2sbi(inode);
318 struct ll_inode_info *lli = ll_i2info(inode);
322 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
323 PFID(ll_inode2fid(inode)), inode);
/* Remote-client ACL bookkeeping applies only to the fs root inode. */
325 #ifdef CONFIG_FS_POSIX_ACL
326 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
327 inode == inode->i_sb->s_root->d_inode) {
328 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
331 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
332 fd->fd_flags &= ~LL_FILE_RMTACL;
333 rct_del(&sbi->ll_rct, current_pid());
334 et_search_free(&sbi->ll_et, current_pid());
/* Don't count releases of the root dentry in the stats. */
339 if (inode->i_sb->s_root != file->f_path.dentry)
340 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
341 fd = LUSTRE_FPRIVATE(file);
344 /* The last ref on @file, maybe not the the owner pid of statahead,
345 * because parent and child process can share the same file handle. */
346 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
347 ll_deauthorize_statahead(inode, fd);
/* Root inode never had an MDS open handle: just free the fd. */
349 if (inode->i_sb->s_root == file->f_path.dentry) {
350 LUSTRE_FPRIVATE(file) = NULL;
351 ll_file_data_put(fd);
/* Fold any deferred async write errors into this close's rc. */
355 if (!S_ISDIR(inode->i_mode)) {
356 if (lli->lli_clob != NULL)
357 lov_read_and_clear_async_rc(lli->lli_clob);
358 lli->lli_async_rc = 0;
361 rc = ll_md_close(inode, file);
363 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
364 libcfs_debug_dumplog();
/* Send an IT_OPEN intent lock request to the MDS for @file, packing
 * the striping template @lmm/@lmmsize if supplied.  On success the
 * intent carries the open disposition and (optionally) a lock; the
 * inode is refreshed from the reply. */
369 static int ll_intent_file_open(struct file *file, void *lmm, int lmmsize,
370 struct lookup_intent *itp)
372 struct dentry *de = file->f_path.dentry;
373 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
374 struct dentry *parent = de->d_parent;
375 const char *name = NULL;
377 struct md_op_data *op_data;
378 struct ptlrpc_request *req = NULL;
382 LASSERT(parent != NULL);
383 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
385 /* if server supports open-by-fid, or file name is invalid, don't pack
386 * name in open request */
387 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
388 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
389 name = de->d_name.name;
390 len = de->d_name.len;
393 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
394 name, len, 0, LUSTRE_OPC_ANY, NULL);
396 RETURN(PTR_ERR(op_data));
397 op_data->op_data = lmm;
398 op_data->op_data_size = lmmsize;
400 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
401 &ll_md_blocking_ast, 0);
402 ll_finish_md_op_data(op_data);
404 /* reason for keep own exit path - don`t flood log
405 * with messages with -ESTALE errors.
/* If the open succeeded but the overall call failed, release the
 * now-orphaned open handle before bailing out. */
407 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
408 it_open_error(DISP_OPEN_OPEN, itp))
410 ll_release_openhandle(de, itp);
414 if (it_disposition(itp, DISP_LOOKUP_NEG))
415 GOTO(out, rc = -ENOENT);
417 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
418 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
419 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
423 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
424 if (!rc && itp->it_lock_mode)
425 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
428 ptlrpc_req_finished(req);
429 ll_intent_drop_lock(itp);
/* Populate @och from the MDT reply body carried by the intent
 * (open handle, FID, lease lock handle, flags) and register it for
 * open replay on recovery. */
434 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
435 struct obd_client_handle *och)
437 struct ptlrpc_request *req = it->it_data;
438 struct mdt_body *body;
440 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
441 och->och_fh = body->mbo_handle;
442 och->och_fid = body->mbo_fid1;
443 och->och_lease_handle.cookie = it->it_lock_handle;
444 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
445 och->och_flags = it->it_flags;
447 return md_set_open_replay_data(md_exp, och, it);
/* Finish the client-side part of an open: fill @och from the intent
 * (if a handle slot was provided), attach @fd to the file as its
 * private data, and initialize readahead and cl-context state. */
450 static int ll_local_open(struct file *file, struct lookup_intent *it,
451 struct ll_file_data *fd, struct obd_client_handle *och)
453 struct inode *inode = file->f_path.dentry->d_inode;
456 LASSERT(!LUSTRE_FPRIVATE(file));
463 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
468 LUSTRE_FPRIVATE(file) = fd;
469 ll_readahead_init(inode, &fd->fd_ras);
/* Remember only the access-mode bits the fd was opened with. */
470 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
472 /* ll_cl_context initialize */
473 rwlock_init(&fd->fd_lock);
474 INIT_LIST_HEAD(&fd->fd_lccs);
479 /* Open a file, and (for the very first open) create objects on the OSTs at
480 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
481 * creation or open until ll_lov_setstripe() ioctl is called.
483 * If we already have the stripe MD locally then we don't request it in
484 * md_open(), by passing a lmm_size = 0.
486 * It is up to the application to ensure no other processes open this file
487 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
488 * used. We might be able to avoid races of that sort by getting lli_open_sem
489 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
490 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/* VFS ->open() entry point.  Either reuses an intent prepared by
 * lookup (file->private_data) or builds a fresh IT_OPEN intent,
 * shares/creates the per-mode MDS open handle under lli_och_mutex,
 * and finishes with ll_local_open(). */
492 int ll_file_open(struct inode *inode, struct file *file)
494 struct ll_inode_info *lli = ll_i2info(inode);
495 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
496 .it_flags = file->f_flags };
497 struct obd_client_handle **och_p = NULL;
498 __u64 *och_usecount = NULL;
499 struct ll_file_data *fd;
503 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
504 PFID(ll_inode2fid(inode)), inode, file->f_flags);
506 it = file->private_data; /* XXX: compat macro */
507 file->private_data = NULL; /* prevent ll_local_open assertion */
509 fd = ll_file_data_get();
511 GOTO(out_openerr, rc = -ENOMEM);
514 if (S_ISDIR(inode->i_mode))
515 ll_authorize_statahead(inode, fd);
/* Opening the fs root needs no MDS handle; attach fd and be done. */
517 if (inode->i_sb->s_root == file->f_path.dentry) {
518 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup: build our own open intent. */
522 if (!it || !it->it_disposition) {
523 /* Convert f_flags into access mode. We cannot use file->f_mode,
524 * because everything but O_ACCMODE mask was stripped from
/* O_ACCMODE trick: O_RDONLY/O_WRONLY/O_RDWR + 1 maps onto
 * FMODE_READ/FMODE_WRITE bits. */
526 if ((oit.it_flags + 1) & O_ACCMODE)
528 if (file->f_flags & O_TRUNC)
529 oit.it_flags |= FMODE_WRITE;
531 /* kernel only call f_op->open in dentry_open. filp_open calls
532 * dentry_open after call to open_namei that checks permissions.
533 * Only nfsd_open call dentry_open directly without checking
534 * permissions and because of that this code below is safe. */
535 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
536 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
538 /* We do not want O_EXCL here, presumably we opened the file
539 * already? XXX - NFS implications? */
540 oit.it_flags &= ~O_EXCL;
542 /* bug20584, if "it_flags" contains O_CREAT, the file will be
543 * created if necessary, then "IT_CREAT" should be set to keep
544 * consistent with it */
545 if (oit.it_flags & O_CREAT)
546 oit.it_op |= IT_CREAT;
552 /* Let's see if we have file open on MDS already. */
553 if (it->it_flags & FMODE_WRITE) {
554 och_p = &lli->lli_mds_write_och;
555 och_usecount = &lli->lli_open_fd_write_count;
556 } else if (it->it_flags & FMODE_EXEC) {
557 och_p = &lli->lli_mds_exec_och;
558 och_usecount = &lli->lli_open_fd_exec_count;
560 och_p = &lli->lli_mds_read_och;
561 och_usecount = &lli->lli_open_fd_read_count;
564 mutex_lock(&lli->lli_och_mutex);
565 if (*och_p) { /* Open handle is present */
566 if (it_disposition(it, DISP_OPEN_OPEN)) {
567 /* Well, there's extra open request that we do not need,
568 let's close it somehow. This will decref request. */
569 rc = it_open_error(DISP_OPEN_OPEN, it);
571 mutex_unlock(&lli->lli_och_mutex);
572 GOTO(out_openerr, rc);
575 ll_release_openhandle(file->f_path.dentry, it);
/* Reuse the existing handle: och == NULL means "share". */
579 rc = ll_local_open(file, it, fd, NULL);
582 mutex_unlock(&lli->lli_och_mutex);
583 GOTO(out_openerr, rc);
586 LASSERT(*och_usecount == 0);
587 if (!it->it_disposition) {
588 /* We cannot just request lock handle now, new ELC code
589 means that one of other OPEN locks for this file
590 could be cancelled, and since blocking ast handler
591 would attempt to grab och_mutex as well, that would
592 result in a deadlock */
593 mutex_unlock(&lli->lli_och_mutex);
595 * Normally called under two situations:
597 * 2. A race/condition on MDS resulting in no open
598 * handle to be returned from LOOKUP|OPEN request,
599 * for example if the target entry was a symlink.
601 * Always fetch MDS_OPEN_LOCK if this is not setstripe.
603 * Always specify MDS_OPEN_BY_FID because we don't want
604 * to get file with different fid.
606 it->it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID;
607 rc = ll_intent_file_open(file, NULL, 0, it);
609 GOTO(out_openerr, rc);
613 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
615 GOTO(out_och_free, rc = -ENOMEM);
619 /* md_intent_lock() didn't get a request ref if there was an
620 * open error, so don't do cleanup on the request here
622 /* XXX (green): Should not we bail out on any error here, not
623 * just open error? */
624 rc = it_open_error(DISP_OPEN_OPEN, it);
626 GOTO(out_och_free, rc);
628 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
629 "inode %p: disposition %x, status %d\n", inode,
630 it_disposition(it, ~0), it->it_status);
632 rc = ll_local_open(file, it, fd, *och_p);
634 GOTO(out_och_free, rc);
636 mutex_unlock(&lli->lli_och_mutex);
639 /* Must do this outside lli_och_mutex lock to prevent deadlock where
640 different kind of OPEN lock for this same inode gets cancelled
641 by ldlm_cancel_lru */
642 if (!S_ISREG(inode->i_mode))
643 GOTO(out_och_free, rc);
645 cl_lov_delay_create_clear(&file->f_flags);
646 GOTO(out_och_free, rc);
/* Error/cleanup paths: free an allocated-but-unused handle slot,
 * undo statahead authorization, drop fd, release the intent ref. */
650 if (och_p && *och_p) {
651 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
652 *och_p = NULL; /* OBD_FREE writes some magic there */
655 mutex_unlock(&lli->lli_och_mutex);
658 if (lli->lli_opendir_key == fd)
659 ll_deauthorize_statahead(inode, fd);
661 ll_file_data_put(fd);
663 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
666 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
667 ptlrpc_req_finished(it->it_data);
668 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* Blocking AST for lease locks: on a blocking callback, cancel the
 * lease lock asynchronously (the application detects the broken lease
 * later); the CANCELING branch is a no-op for the openhandle here. */
674 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
675 struct ldlm_lock_desc *desc, void *data, int flag)
678 struct lustre_handle lockh;
682 case LDLM_CB_BLOCKING:
683 ldlm_lock2handle(lock, &lockh);
684 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
686 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
690 case LDLM_CB_CANCELING:
698  * Acquire a lease and open the file.
/* Request an MDS_OPEN_LEASE open for @inode with mode @fmode
 * (FMODE_READ or FMODE_WRITE only).  If @file is given, the existing
 * openhandle is passed along so the MDT can verify same-owner reuse.
 * Returns the new obd_client_handle or an ERR_PTR. */
700 static struct obd_client_handle *
701 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
704 struct lookup_intent it = { .it_op = IT_OPEN };
705 struct ll_sb_info *sbi = ll_i2sbi(inode);
706 struct md_op_data *op_data;
707 struct ptlrpc_request *req = NULL;
708 struct lustre_handle old_handle = { 0 };
709 struct obd_client_handle *och = NULL;
714 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
715 RETURN(ERR_PTR(-EINVAL));
718 struct ll_inode_info *lli = ll_i2info(inode);
719 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
720 struct obd_client_handle **och_p;
/* The fd's mode must cover the requested lease mode; exec opens
 * can't take leases. */
723 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
724 RETURN(ERR_PTR(-EPERM));
726 /* Get the openhandle of the file */
728 mutex_lock(&lli->lli_och_mutex);
/* Only one lease per fd. */
729 if (fd->fd_lease_och != NULL) {
730 mutex_unlock(&lli->lli_och_mutex);
734 if (fd->fd_och == NULL) {
735 if (file->f_mode & FMODE_WRITE) {
736 LASSERT(lli->lli_mds_write_och != NULL);
737 och_p = &lli->lli_mds_write_och;
738 och_usecount = &lli->lli_open_fd_write_count;
740 LASSERT(lli->lli_mds_read_och != NULL);
741 och_p = &lli->lli_mds_read_och;
742 och_usecount = &lli->lli_open_fd_read_count;
/* NOTE(review): appears to require this fd to be the sole opener
 * of the handle before taking it over — confirm against the elided
 * branch that sets rc for the >1-opener case. */
744 if (*och_usecount == 1) {
751 mutex_unlock(&lli->lli_och_mutex);
752 if (rc < 0) /* more than 1 opener */
755 LASSERT(fd->fd_och != NULL);
756 old_handle = fd->fd_och->och_fh;
761 RETURN(ERR_PTR(-ENOMEM));
763 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
764 LUSTRE_OPC_ANY, NULL);
766 GOTO(out, rc = PTR_ERR(op_data));
768 /* To tell the MDT this openhandle is from the same owner */
769 op_data->op_handle = old_handle;
771 it.it_flags = fmode | open_flags;
772 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
773 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
774 &ll_md_blocking_lease_ast,
775 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
776 * it can be cancelled which may mislead applications that the lease is
778 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
779 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
780 * doesn't deal with openhandle, so normal openhandle will be leaked. */
781 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
782 ll_finish_md_op_data(op_data);
783 ptlrpc_req_finished(req);
785 GOTO(out_release_it, rc);
787 if (it_disposition(&it, DISP_LOOKUP_NEG))
788 GOTO(out_release_it, rc = -ENOENT);
790 rc = it_open_error(DISP_OPEN_OPEN, &it);
792 GOTO(out_release_it, rc);
794 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
795 ll_och_fill(sbi->ll_md_exp, &it, och);
/* Server that predates lease support won't set DISP_OPEN_LEASE. */
797 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
798 GOTO(out_close, rc = -EOPNOTSUPP);
800 /* already get lease, handle lease lock */
801 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
802 if (it.it_lock_mode == 0 ||
803 it.it_lock_bits != MDS_INODELOCK_OPEN) {
804 /* open lock must return for lease */
805 CERROR(DFID "lease granted but no open lock, %d/"LPU64".\n",
806 PFID(ll_inode2fid(inode)), it.it_lock_mode,
808 GOTO(out_close, rc = -EPROTO);
811 ll_intent_release(&it);
815 /* Cancel open lock */
816 if (it.it_lock_mode != 0) {
817 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
820 och->och_lease_handle.cookie = 0ULL;
822 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
824 CERROR("%s: error closing file "DFID": %d\n",
825 ll_get_fsname(inode->i_sb, NULL, 0),
826 PFID(&ll_i2info(inode)->lli_fid), rc2);
827 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
829 ll_intent_release(&it);
837  * Check whether a layout swap can be done between two inodes.
839  * \param[in] inode1	First inode to check
840  * \param[in] inode2	Second inode to check
842  * \retval 0 on success, layout swap can be performed between both inodes
843  * \retval negative error code if requirements are not met
/* Both must be regular files, writable by the caller, and on the
 * same superblock (same Lustre filesystem). */
845 static int ll_check_swap_layouts_validity(struct inode *inode1,
846 struct inode *inode2)
848 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
851 if (inode_permission(inode1, MAY_WRITE) ||
852 inode_permission(inode2, MAY_WRITE))
855 if (inode1->i_sb != inode2->i_sb)
/* Close @och with the MDS_CLOSE_LAYOUT_SWAP bias so the MDT swaps
 * layouts between @inode and @inode2 atomically with the close.
 * Consumes @och. */
861 static int ll_swap_layouts_close(struct obd_client_handle *och,
862 struct inode *inode, struct inode *inode2)
864 const struct lu_fid *fid1 = ll_inode2fid(inode);
865 const struct lu_fid *fid2;
869 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
870 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
872 rc = ll_check_swap_layouts_validity(inode, inode2);
874 GOTO(out_free_och, rc);
876 /* We now know that inode2 is a lustre inode */
877 fid2 = ll_inode2fid(inode2);
/* Swapping a file's layout with itself is rejected. */
879 rc = lu_fid_cmp(fid1, fid2);
881 GOTO(out_free_och, rc = -EINVAL);
883 /* Close the file and swap layouts between inode & inode2.
884 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
885 * because we still need it to pack l_remote_handle to MDT. */
886 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
889 och = NULL; /* freed in ll_close_inode_openhandle() */
899  * Release lease and close the file.
900  * It will check if the lease has ever broken.
/* Report via @lease_broken whether the lease lock had already been
 * cancelled (lease broken), cancel it if still held, then close the
 * openhandle on the MDS. */
902 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
905 struct ldlm_lock *lock;
906 bool cancelled = true;
910 lock = ldlm_handle2lock(&och->och_lease_handle);
912 lock_res_and_lock(lock);
913 cancelled = ldlm_is_cancel(lock);
914 unlock_res_and_lock(lock);
918 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
919 PFID(&ll_i2info(inode)->lli_fid), cancelled);
/* Lease still intact: cancel it ourselves before closing. */
922 ldlm_cli_cancel(&och->och_lease_handle, 0);
923 if (lease_broken != NULL)
924 *lease_broken = cancelled;
926 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Merge MDS-provided timestamps cached in lli with OST-side
 * attributes (size/blocks/times) from the cl_object, keeping the
 * newest of each timestamp, all under the inode size lock. */
930 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
932 struct ll_inode_info *lli = ll_i2info(inode);
933 struct cl_object *obj = lli->lli_clob;
934 struct cl_attr *attr = vvp_env_thread_attr(env);
942 ll_inode_size_lock(inode);
944 /* merge timestamps the most recently obtained from mds with
945 timestamps obtained from osts */
946 LTIME_S(inode->i_atime) = lli->lli_atime;
947 LTIME_S(inode->i_mtime) = lli->lli_mtime;
948 LTIME_S(inode->i_ctime) = lli->lli_ctime;
950 atime = LTIME_S(inode->i_atime);
951 mtime = LTIME_S(inode->i_mtime);
952 ctime = LTIME_S(inode->i_ctime);
954 cl_object_attr_lock(obj);
955 rc = cl_object_attr_get(env, obj, attr);
956 cl_object_attr_unlock(obj);
959 GOTO(out_size_unlock, rc);
/* Newest timestamp wins between MDS and OST views. */
961 if (atime < attr->cat_atime)
962 atime = attr->cat_atime;
964 if (ctime < attr->cat_ctime)
965 ctime = attr->cat_ctime;
967 if (mtime < attr->cat_mtime)
968 mtime = attr->cat_mtime;
970 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
971 PFID(&lli->lli_fid), attr->cat_size);
973 i_size_write(inode, attr->cat_size);
974 inode->i_blocks = attr->cat_blocks;
976 LTIME_S(inode->i_atime) = atime;
977 LTIME_S(inode->i_mtime) = mtime;
978 LTIME_S(inode->i_ctime) = ctime;
981 ll_inode_size_unlock(inode);
/* Return true if atime updates are suppressed for this open, per the
 * open flags, inode flags, and mount options (mirrors the checks in
 * the kernel's file_accessed()/touch_atime()). */
986 static bool file_is_noatime(const struct file *file)
988 const struct vfsmount *mnt = file->f_path.mnt;
989 const struct inode *inode = file->f_path.dentry->d_inode;
991 /* Adapted from file_accessed() and touch_atime().*/
992 if (file->f_flags & O_NOATIME)
995 if (inode->i_flags & S_NOATIME)
998 if (IS_NOATIME(inode))
1001 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1004 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1007 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/* Initialize a cl_io for a read or write on @file: nonblocking /
 * append / sync flags from f_flags, lock requirements (never for
 * nolock files, mandatory for O_APPEND), and the noatime hint. */
1013 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
1015 struct inode *inode = file->f_path.dentry->d_inode;
1017 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1019 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1020 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1021 file->f_flags & O_DIRECT ||
1024 io->ci_obj = ll_i2info(inode)->lli_clob;
1025 io->ci_lockreq = CILR_MAYBE;
1026 if (ll_file_nolock(file)) {
1027 io->ci_lockreq = CILR_NEVER;
1028 io->ci_no_srvlock = 1;
1029 } else if (file->f_flags & O_APPEND) {
1030 io->ci_lockreq = CILR_MANDATORY;
1033 io->ci_noatime = file_is_noatime(file);
/* Common engine for all read/write paths: sets up the cl_io, takes
 * the per-file range lock when required (writes, and O_DIRECT reads —
 * see LU-6227), runs cl_io_loop(), accumulates partial progress,
 * restarts on layout change, and tallies stats.  Returns bytes
 * transferred, or the error when nothing was transferred. */
1037 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1038 struct file *file, enum cl_io_type iot,
1039 loff_t *ppos, size_t count)
1041 struct vvp_io *vio = vvp_env_io(env);
1042 struct inode *inode = file->f_path.dentry->d_inode;
1043 struct ll_inode_info *lli = ll_i2info(inode);
1044 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1048 struct range_lock range;
1052 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zu\n",
1053 file->f_path.dentry->d_name.name, iot, *ppos, count);
1056 io = vvp_env_thread_io(env);
1057 ll_io_init(io, file, iot == CIT_WRITE);
1059 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1060 bool range_locked = false;
/* O_APPEND can't know the final offset yet, so lock to EOF. */
1062 if (file->f_flags & O_APPEND)
1063 range_lock_init(&range, 0, LUSTRE_EOF);
1065 range_lock_init(&range, *ppos, *ppos + count - 1);
1067 vio->vui_fd = LUSTRE_FPRIVATE(file);
1068 vio->vui_io_subtype = args->via_io_subtype;
1070 switch (vio->vui_io_subtype) {
1072 vio->vui_iter = args->u.normal.via_iter;
1073 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1074 vio->vui_tot_nrsegs = vio->vui_iter->nr_segs;
1075 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1076 vio->vui_iocb = args->u.normal.via_iocb;
1077 /* Direct IO reads must also take range lock,
1078 * or multiple reads will try to work on the same pages
1079 * See LU-6227 for details. */
1080 if (((iot == CIT_WRITE) ||
1081 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1082 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1083 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1085 rc = range_lock(&lli->lli_write_tree, &range);
1089 range_locked = true;
/* Splice subtype: hand the pipe through to the vvp layer. */
1093 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1094 vio->u.splice.vui_flags = args->u.splice.via_flags;
1097 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1101 ll_cl_add(file, env, io);
1102 rc = cl_io_loop(env, io);
1103 ll_cl_remove(file, env);
1106 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1108 range_unlock(&lli->lli_write_tree, &range);
1111 /* cl_io_rw_init() handled IO */
/* Account partial progress so a restart resumes where we left off. */
1115 if (io->ci_nob > 0) {
1116 result += io->ci_nob;
1117 count -= io->ci_nob;
1118 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1120 /* prepare IO restart */
1121 if (count > 0 && args->via_io_subtype == IO_NORMAL) {
1122 args->u.normal.via_iter = vio->vui_iter;
1123 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1124 args->u.normal.via_iter->nr_segs = vio->vui_tot_nrsegs;
1125 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1130 cl_io_fini(env, io);
/* ci_need_restart is set e.g. on layout change; loop again. */
1132 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1134 "%s: restart %s from %lld, count:%zu, result: %zd\n",
1135 file->f_path.dentry->d_name.name,
1136 iot == CIT_READ ? "read" : "write",
1137 *ppos, count, result);
1141 if (iot == CIT_READ) {
1143 ll_stats_ops_tally(ll_i2sbi(inode),
1144 LPROC_LL_READ_BYTES, result);
1145 } else if (iot == CIT_WRITE) {
1147 ll_stats_ops_tally(ll_i2sbi(inode),
1148 LPROC_LL_WRITE_BYTES, result);
1149 fd->fd_write_failed = false;
1150 } else if (result == 0 && rc == 0) {
1153 fd->fd_write_failed = true;
1155 fd->fd_write_failed = false;
1156 } else if (rc != -ERESTARTSYS) {
1157 fd->fd_write_failed = true;
1161 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1163 return result > 0 ? result : rc;
1167  * Read from a file (through the page cache).
/* ->read_iter() entry point: package the iov_iter/iocb into
 * vvp_io_args and delegate to ll_file_io_generic(CIT_READ). */
1169 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1171 struct vvp_io_args *args;
1176 env = cl_env_get(&refcheck);
1178 return PTR_ERR(env);
1180 args = ll_env_args(env, IO_NORMAL);
1181 args->u.normal.via_iter = to;
1182 args->u.normal.via_iocb = iocb;
1184 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1185 &iocb->ki_pos, iov_iter_count(to));
1186 cl_env_put(env, &refcheck);
1191  * Write to a file (through the page cache).
/* ->write_iter() entry point: mirror of ll_file_read_iter() for
 * CIT_WRITE. */
1193 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1195 struct vvp_io_args *args;
1200 env = cl_env_get(&refcheck);
1202 return PTR_ERR(env);
1204 args = ll_env_args(env, IO_NORMAL);
1205 args->u.normal.via_iter = from;
1206 args->u.normal.via_iocb = iocb;
1208 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1209 &iocb->ki_pos, iov_iter_count(from));
1210 cl_env_put(env, &refcheck);
1214 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1216  * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/* Validate an iovec array and compute the total byte count, stopping
 * at the first inaccessible segment (compat path for kernels without
 * read_iter/write_iter). */
1218 static int ll_file_get_iov_count(const struct iovec *iov,
1219 unsigned long *nr_segs, size_t *count)
1224 for (seg = 0; seg < *nr_segs; seg++) {
1225 const struct iovec *iv = &iov[seg];
1228 * If any segment has a negative length, or the cumulative
1229 * length ever wraps negative then return -EINVAL.
1232 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1234 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1239 cnt -= iv->iov_len; /* This segment is no good */
/* Compat aio_read: validate the iovec, copy it (the env-embedded
 * single-iov fast path vs. an allocated copy for multi-segment),
 * build an iov_iter, and call ll_file_read_iter(). */
1246 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1247 unsigned long nr_segs, loff_t pos)
1249 struct iovec *local_iov;
1250 struct iov_iter *to;
1253 struct lu_env *env = NULL;
1257 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1263 env = cl_env_get(&refcheck);
1265 RETURN(PTR_ERR(env));
1267 local_iov = &ll_env_info(env)->lti_local_iov;
/* Multi-segment: need a heap copy of the caller's iovec. */
1271 OBD_ALLOC(local_iov, sizeof(*iov) * nr_segs);
1272 if (local_iov == NULL)
1275 memcpy(local_iov, iov, sizeof(*iov) * nr_segs);
1283 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1284 iov_iter_init(to, READ, local_iov, nr_segs, iov_count);
1285 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1286 iov_iter_init(to, local_iov, nr_segs, iov_count, 0);
1287 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1289 result = ll_file_read_iter(iocb, to);
1294 cl_env_put(env, &refcheck);
1296 OBD_FREE(local_iov, sizeof(*iov) * nr_segs);
1301 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1305 struct iovec iov = { .iov_base = buf, .iov_len = count };
1306 struct kiocb *kiocb;
1311 env = cl_env_get(&refcheck);
1313 RETURN(PTR_ERR(env));
1315 kiocb = &ll_env_info(env)->lti_kiocb;
1316 init_sync_kiocb(kiocb, file);
1317 kiocb->ki_pos = *ppos;
1318 #ifdef HAVE_KIOCB_KI_LEFT
1319 kiocb->ki_left = count;
1320 #elif defined(HAVE_KI_NBYTES)
1321 kiocb->ki_nbytes = count;
1324 result = ll_file_aio_read(kiocb, &iov, 1, kiocb->ki_pos);
1325 *ppos = kiocb->ki_pos;
1327 cl_env_put(env, &refcheck);
1332 * Write to a file (through the page cache).
1335 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1336 unsigned long nr_segs, loff_t pos)
1338 struct iovec *local_iov;
1339 struct iov_iter *from;
1342 struct lu_env *env = NULL;
1346 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1351 env = cl_env_get(&refcheck);
1353 RETURN(PTR_ERR(env));
1355 local_iov = &ll_env_info(env)->lti_local_iov;
1358 OBD_ALLOC(local_iov, sizeof(*iov) * nr_segs);
1359 if (local_iov == NULL)
1362 memcpy(local_iov, iov, sizeof(*iov) * nr_segs);
1365 OBD_ALLOC_PTR(from);
1370 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1371 iov_iter_init(from, WRITE, local_iov, nr_segs, iov_count);
1372 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1373 iov_iter_init(from, local_iov, nr_segs, iov_count, 0);
1374 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1376 result = ll_file_write_iter(iocb, from);
1381 cl_env_put(env, &refcheck);
1383 OBD_FREE(local_iov, sizeof(*iov) * nr_segs);
1388 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1389 size_t count, loff_t *ppos)
1392 struct iovec iov = { .iov_base = (void __user *)buf,
1394 struct kiocb *kiocb;
1399 env = cl_env_get(&refcheck);
1401 RETURN(PTR_ERR(env));
1403 kiocb = &ll_env_info(env)->lti_kiocb;
1404 init_sync_kiocb(kiocb, file);
1405 kiocb->ki_pos = *ppos;
1406 #ifdef HAVE_KIOCB_KI_LEFT
1407 kiocb->ki_left = count;
1408 #elif defined(HAVE_KI_NBYTES)
1409 kiocb->ki_nbytes = count;
1412 result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos);
1413 *ppos = kiocb->ki_pos;
1415 cl_env_put(env, &refcheck);
1418 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1421 * Send file content (through pagecache) somewhere with helper
1423 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1424 struct pipe_inode_info *pipe, size_t count,
1428 struct vvp_io_args *args;
1433 env = cl_env_get(&refcheck);
1435 RETURN(PTR_ERR(env));
1437 args = ll_env_args(env, IO_SPLICE);
1438 args->u.splice.via_pipe = pipe;
1439 args->u.splice.via_flags = flags;
1441 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1442 cl_env_put(env, &refcheck);
1446 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1447 __u64 flags, struct lov_user_md *lum,
1450 struct lookup_intent oit = {
1452 .it_flags = flags | MDS_OPEN_BY_FID,
1457 ll_inode_size_lock(inode);
1458 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1460 GOTO(out_unlock, rc);
1462 ll_release_openhandle(file->f_path.dentry, &oit);
1465 ll_inode_size_unlock(inode);
1466 ll_intent_release(&oit);
1467 cl_lov_delay_create_clear(&file->f_flags);
1472 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1473 struct lov_mds_md **lmmp, int *lmm_size,
1474 struct ptlrpc_request **request)
1476 struct ll_sb_info *sbi = ll_i2sbi(inode);
1477 struct mdt_body *body;
1478 struct lov_mds_md *lmm = NULL;
1479 struct ptlrpc_request *req = NULL;
1480 struct md_op_data *op_data;
1483 rc = ll_get_default_mdsize(sbi, &lmmsize);
1487 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1488 strlen(filename), lmmsize,
1489 LUSTRE_OPC_ANY, NULL);
1490 if (IS_ERR(op_data))
1491 RETURN(PTR_ERR(op_data));
1493 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1494 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1495 ll_finish_md_op_data(op_data);
1497 CDEBUG(D_INFO, "md_getattr_name failed "
1498 "on %s: rc %d\n", filename, rc);
1502 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1503 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1505 lmmsize = body->mbo_eadatasize;
1507 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1509 GOTO(out, rc = -ENODATA);
1512 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1513 LASSERT(lmm != NULL);
1515 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1516 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1517 GOTO(out, rc = -EPROTO);
1521 * This is coming from the MDS, so is probably in
1522 * little endian. We convert it to host endian before
1523 * passing it to userspace.
1525 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1528 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1529 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1532 /* if function called for directory - we should
1533 * avoid swab not existent lsm objects */
1534 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1535 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1536 if (S_ISREG(body->mbo_mode))
1537 lustre_swab_lov_user_md_objects(
1538 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1540 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1541 lustre_swab_lov_user_md_v3(
1542 (struct lov_user_md_v3 *)lmm);
1543 if (S_ISREG(body->mbo_mode))
1544 lustre_swab_lov_user_md_objects(
1545 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1552 *lmm_size = lmmsize;
1557 static int ll_lov_setea(struct inode *inode, struct file *file,
1560 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1561 struct lov_user_md *lump;
1562 int lum_size = sizeof(struct lov_user_md) +
1563 sizeof(struct lov_user_ost_data);
1567 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1570 OBD_ALLOC_LARGE(lump, lum_size);
1574 if (copy_from_user(lump, (struct lov_user_md __user *)arg, lum_size))
1575 GOTO(out_lump, rc = -EFAULT);
1577 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1580 OBD_FREE_LARGE(lump, lum_size);
1584 static int ll_file_getstripe(struct inode *inode,
1585 struct lov_user_md __user *lum)
1592 env = cl_env_get(&refcheck);
1594 RETURN(PTR_ERR(env));
1596 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum);
1597 cl_env_put(env, &refcheck);
1601 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1604 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1605 struct lov_user_md *klum;
1607 __u64 flags = FMODE_WRITE;
1610 rc = ll_copy_user_md(lum, &klum);
1615 rc = ll_lov_setstripe_ea_info(inode, file, flags, klum, lum_size);
1619 put_user(0, &lum->lmm_stripe_count);
1621 ll_layout_refresh(inode, &gen);
1622 rc = ll_file_getstripe(inode, (struct lov_user_md __user *)arg);
1625 OBD_FREE(klum, lum_size);
1630 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1632 struct ll_inode_info *lli = ll_i2info(inode);
1633 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1634 struct ll_grouplock grouplock;
1639 CWARN("group id for group lock must not be 0\n");
1643 if (ll_file_nolock(file))
1644 RETURN(-EOPNOTSUPP);
1646 spin_lock(&lli->lli_lock);
1647 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1648 CWARN("group lock already existed with gid %lu\n",
1649 fd->fd_grouplock.lg_gid);
1650 spin_unlock(&lli->lli_lock);
1653 LASSERT(fd->fd_grouplock.lg_lock == NULL);
1654 spin_unlock(&lli->lli_lock);
1656 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
1657 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1661 spin_lock(&lli->lli_lock);
1662 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1663 spin_unlock(&lli->lli_lock);
1664 CERROR("another thread just won the race\n");
1665 cl_put_grouplock(&grouplock);
1669 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1670 fd->fd_grouplock = grouplock;
1671 spin_unlock(&lli->lli_lock);
1673 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
1677 static int ll_put_grouplock(struct inode *inode, struct file *file,
1680 struct ll_inode_info *lli = ll_i2info(inode);
1681 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1682 struct ll_grouplock grouplock;
1685 spin_lock(&lli->lli_lock);
1686 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1687 spin_unlock(&lli->lli_lock);
1688 CWARN("no group lock held\n");
1692 LASSERT(fd->fd_grouplock.lg_lock != NULL);
1694 if (fd->fd_grouplock.lg_gid != arg) {
1695 CWARN("group lock %lu doesn't match current id %lu\n",
1696 arg, fd->fd_grouplock.lg_gid);
1697 spin_unlock(&lli->lli_lock);
1701 grouplock = fd->fd_grouplock;
1702 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1703 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1704 spin_unlock(&lli->lli_lock);
1706 cl_put_grouplock(&grouplock);
1707 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1712 * Close inode open handle
1714 * \param dentry [in] dentry which contains the inode
1715 * \param it [in,out] intent which contains open info and result
1718 * \retval <0 failure
1720 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1722 struct inode *inode = dentry->d_inode;
1723 struct obd_client_handle *och;
1729 /* Root ? Do nothing. */
1730 if (dentry->d_inode->i_sb->s_root == dentry)
1733 /* No open handle to close? Move away */
1734 if (!it_disposition(it, DISP_OPEN_OPEN))
1737 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1739 OBD_ALLOC(och, sizeof(*och));
1741 GOTO(out, rc = -ENOMEM);
1743 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1745 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
1747 /* this one is in place of ll_file_open */
1748 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1749 ptlrpc_req_finished(it->it_data);
1750 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1756 * Get size for inode for which FIEMAP mapping is requested.
1757 * Make the FIEMAP get_info call and returns the result.
1758 * \param fiemap kernel buffer to hold extens
1759 * \param num_bytes kernel buffer size
1761 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
1767 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
1770 /* Checks for fiemap flags */
1771 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1772 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1776 /* Check for FIEMAP_FLAG_SYNC */
1777 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1778 rc = filemap_fdatawrite(inode->i_mapping);
1783 env = cl_env_get(&refcheck);
1785 RETURN(PTR_ERR(env));
1787 if (i_size_read(inode) == 0) {
1788 rc = ll_glimpse_size(inode);
1793 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1794 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
1795 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
1797 /* If filesize is 0, then there would be no objects for mapping */
1798 if (fmkey.lfik_oa.o_size == 0) {
1799 fiemap->fm_mapped_extents = 0;
1803 fmkey.lfik_fiemap = *fiemap;
1805 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
1806 &fmkey, fiemap, &num_bytes);
1808 cl_env_put(env, &refcheck);
1812 int ll_fid2path(struct inode *inode, void __user *arg)
1814 struct obd_export *exp = ll_i2mdexp(inode);
1815 const struct getinfo_fid2path __user *gfin = arg;
1817 struct getinfo_fid2path *gfout;
1823 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1824 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1827 /* Only need to get the buflen */
1828 if (get_user(pathlen, &gfin->gf_pathlen))
1831 if (pathlen > PATH_MAX)
1834 outsize = sizeof(*gfout) + pathlen;
1835 OBD_ALLOC(gfout, outsize);
1839 if (copy_from_user(gfout, arg, sizeof(*gfout)))
1840 GOTO(gf_free, rc = -EFAULT);
1841 /* append root FID after gfout to let MDT know the root FID so that it
1842 * can lookup the correct path, this is mainly for fileset.
1843 * old server without fileset mount support will ignore this. */
1844 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
1846 /* Call mdc_iocontrol */
1847 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1851 if (copy_to_user(arg, gfout, outsize))
1855 OBD_FREE(gfout, outsize);
1860 * Read the data_version for inode.
1862 * This value is computed using stripe object version on OST.
1863 * Version is computed using server side locking.
1865 * @param flags if do sync on the OST side;
1867 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1868 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
1870 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
1872 struct cl_object *obj = ll_i2info(inode)->lli_clob;
1880 /* If no file object initialized, we consider its version is 0. */
1886 env = cl_env_get(&refcheck);
1888 RETURN(PTR_ERR(env));
1890 io = vvp_env_thread_io(env);
1892 io->u.ci_data_version.dv_data_version = 0;
1893 io->u.ci_data_version.dv_flags = flags;
1896 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
1897 result = cl_io_loop(env, io);
1899 result = io->ci_result;
1901 *data_version = io->u.ci_data_version.dv_data_version;
1903 cl_io_fini(env, io);
1905 if (unlikely(io->ci_need_restart))
1908 cl_env_put(env, &refcheck);
1914 * Trigger a HSM release request for the provided inode.
1916 int ll_hsm_release(struct inode *inode)
1918 struct cl_env_nest nest;
1920 struct obd_client_handle *och = NULL;
1921 __u64 data_version = 0;
1925 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1926 ll_get_fsname(inode->i_sb, NULL, 0),
1927 PFID(&ll_i2info(inode)->lli_fid));
1929 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
1931 GOTO(out, rc = PTR_ERR(och));
1933 /* Grab latest data_version and [am]time values */
1934 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
1938 env = cl_env_nested_get(&nest);
1940 GOTO(out, rc = PTR_ERR(env));
1942 ll_merge_attr(env, inode);
1943 cl_env_nested_put(&nest, env);
1945 /* Release the file.
1946 * NB: lease lock handle is released in mdc_hsm_release_pack() because
1947 * we still need it to pack l_remote_handle to MDT. */
1948 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
1954 if (och != NULL && !IS_ERR(och)) /* close the file */
1955 ll_lease_close(och, inode, NULL);
1960 struct ll_swap_stack {
1963 struct inode *inode1;
1964 struct inode *inode2;
1969 static int ll_swap_layouts(struct file *file1, struct file *file2,
1970 struct lustre_swap_layouts *lsl)
1972 struct mdc_swap_layouts msl;
1973 struct md_op_data *op_data;
1976 struct ll_swap_stack *llss = NULL;
1979 OBD_ALLOC_PTR(llss);
1983 llss->inode1 = file1->f_path.dentry->d_inode;
1984 llss->inode2 = file2->f_path.dentry->d_inode;
1986 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
1990 /* we use 2 bool because it is easier to swap than 2 bits */
1991 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1992 llss->check_dv1 = true;
1994 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1995 llss->check_dv2 = true;
1997 /* we cannot use lsl->sl_dvX directly because we may swap them */
1998 llss->dv1 = lsl->sl_dv1;
1999 llss->dv2 = lsl->sl_dv2;
2001 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2002 if (rc == 0) /* same file, done! */
2005 if (rc < 0) { /* sequentialize it */
2006 swap(llss->inode1, llss->inode2);
2008 swap(llss->dv1, llss->dv2);
2009 swap(llss->check_dv1, llss->check_dv2);
2013 if (gid != 0) { /* application asks to flush dirty cache */
2014 rc = ll_get_grouplock(llss->inode1, file1, gid);
2018 rc = ll_get_grouplock(llss->inode2, file2, gid);
2020 ll_put_grouplock(llss->inode1, file1, gid);
2025 /* ultimate check, before swaping the layouts we check if
2026 * dataversion has changed (if requested) */
2027 if (llss->check_dv1) {
2028 rc = ll_data_version(llss->inode1, &dv, 0);
2031 if (dv != llss->dv1)
2032 GOTO(putgl, rc = -EAGAIN);
2035 if (llss->check_dv2) {
2036 rc = ll_data_version(llss->inode2, &dv, 0);
2039 if (dv != llss->dv2)
2040 GOTO(putgl, rc = -EAGAIN);
2043 /* struct md_op_data is used to send the swap args to the mdt
2044 * only flags is missing, so we use struct mdc_swap_layouts
2045 * through the md_op_data->op_data */
2046 /* flags from user space have to be converted before they are send to
2047 * server, no flag is sent today, they are only used on the client */
2050 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2051 0, LUSTRE_OPC_ANY, &msl);
2052 if (IS_ERR(op_data))
2053 GOTO(free, rc = PTR_ERR(op_data));
2055 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2056 sizeof(*op_data), op_data, NULL);
2057 ll_finish_md_op_data(op_data);
2064 ll_put_grouplock(llss->inode2, file2, gid);
2065 ll_put_grouplock(llss->inode1, file1, gid);
2075 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2077 struct md_op_data *op_data;
2081 /* Detect out-of range masks */
2082 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2085 /* Non-root users are forbidden to set or clear flags which are
2086 * NOT defined in HSM_USER_MASK. */
2087 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2088 !cfs_capable(CFS_CAP_SYS_ADMIN))
2091 /* Detect out-of range archive id */
2092 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2093 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2096 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2097 LUSTRE_OPC_ANY, hss);
2098 if (IS_ERR(op_data))
2099 RETURN(PTR_ERR(op_data));
2101 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2102 sizeof(*op_data), op_data, NULL);
2104 ll_finish_md_op_data(op_data);
2109 static int ll_hsm_import(struct inode *inode, struct file *file,
2110 struct hsm_user_import *hui)
2112 struct hsm_state_set *hss = NULL;
2113 struct iattr *attr = NULL;
2117 if (!S_ISREG(inode->i_mode))
2123 GOTO(out, rc = -ENOMEM);
2125 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2126 hss->hss_archive_id = hui->hui_archive_id;
2127 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2128 rc = ll_hsm_state_set(inode, hss);
2132 OBD_ALLOC_PTR(attr);
2134 GOTO(out, rc = -ENOMEM);
2136 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2137 attr->ia_mode |= S_IFREG;
2138 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2139 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2140 attr->ia_size = hui->hui_size;
2141 attr->ia_mtime.tv_sec = hui->hui_mtime;
2142 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2143 attr->ia_atime.tv_sec = hui->hui_atime;
2144 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2146 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2147 ATTR_UID | ATTR_GID |
2148 ATTR_MTIME | ATTR_MTIME_SET |
2149 ATTR_ATIME | ATTR_ATIME_SET;
2151 mutex_lock(&inode->i_mutex);
2153 rc = ll_setattr_raw(file->f_path.dentry, attr, true);
2157 mutex_unlock(&inode->i_mutex);
2169 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2171 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2172 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2175 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2177 struct inode *inode = file->f_path.dentry->d_inode;
2179 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2180 ATTR_MTIME | ATTR_MTIME_SET |
2181 ATTR_CTIME | ATTR_CTIME_SET,
2183 .tv_sec = lfu->lfu_atime_sec,
2184 .tv_nsec = lfu->lfu_atime_nsec,
2187 .tv_sec = lfu->lfu_mtime_sec,
2188 .tv_nsec = lfu->lfu_mtime_nsec,
2191 .tv_sec = lfu->lfu_ctime_sec,
2192 .tv_nsec = lfu->lfu_ctime_nsec,
2198 if (!capable(CAP_SYS_ADMIN))
2201 if (!S_ISREG(inode->i_mode))
2204 mutex_lock(&inode->i_mutex);
2205 rc = ll_setattr_raw(file->f_path.dentry, &ia, false);
2206 mutex_unlock(&inode->i_mutex);
2212 * Give file access advices
2214 * The ladvise interface is similar to Linux fadvise() system call, except it
2215 * forwards the advices directly from Lustre client to server. The server side
2216 * codes will apply appropriate read-ahead and caching techniques for the
2217 * corresponding files.
2219 * A typical workload for ladvise is e.g. a bunch of different clients are
2220 * doing small random reads of a file, so prefetching pages into OSS cache
2221 * with big linear reads before the random IO is a net benefit. Fetching
2222 * all that data into each client cache with fadvise() may not be, due to
2223 * much more data being sent to the client.
2225 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2226 struct lu_ladvise *ladvise)
2228 struct cl_env_nest nest;
2231 struct cl_ladvise_io *lio;
2235 env = cl_env_nested_get(&nest);
2237 RETURN(PTR_ERR(env));
2239 io = vvp_env_thread_io(env);
2240 io->ci_obj = ll_i2info(inode)->lli_clob;
2242 /* initialize parameters for ladvise */
2243 lio = &io->u.ci_ladvise;
2244 lio->li_start = ladvise->lla_start;
2245 lio->li_end = ladvise->lla_end;
2246 lio->li_fid = ll_inode2fid(inode);
2247 lio->li_advice = ladvise->lla_advice;
2248 lio->li_flags = flags;
2250 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2251 rc = cl_io_loop(env, io);
2255 cl_io_fini(env, io);
2256 cl_env_nested_put(&nest, env);
2261 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2263 struct inode *inode = file->f_path.dentry->d_inode;
2264 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2268 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2269 PFID(ll_inode2fid(inode)), inode, cmd);
2270 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2272 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2273 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2277 case LL_IOC_GETFLAGS:
2278 /* Get the current value of the file flags */
2279 return put_user(fd->fd_flags, (int __user *)arg);
2280 case LL_IOC_SETFLAGS:
2281 case LL_IOC_CLRFLAGS:
2282 /* Set or clear specific file flags */
2283 /* XXX This probably needs checks to ensure the flags are
2284 * not abused, and to handle any flag side effects.
2286 if (get_user(flags, (int __user *) arg))
2289 if (cmd == LL_IOC_SETFLAGS) {
2290 if ((flags & LL_FILE_IGNORE_LOCK) &&
2291 !(file->f_flags & O_DIRECT)) {
2292 CERROR("%s: unable to disable locking on "
2293 "non-O_DIRECT file\n", current->comm);
2297 fd->fd_flags |= flags;
2299 fd->fd_flags &= ~flags;
2302 case LL_IOC_LOV_SETSTRIPE:
2303 RETURN(ll_lov_setstripe(inode, file, arg));
2304 case LL_IOC_LOV_SETEA:
2305 RETURN(ll_lov_setea(inode, file, arg));
2306 case LL_IOC_LOV_SWAP_LAYOUTS: {
2308 struct lustre_swap_layouts lsl;
2310 if (copy_from_user(&lsl, (char __user *)arg,
2311 sizeof(struct lustre_swap_layouts)))
2314 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
2317 file2 = fget(lsl.sl_fd);
2321 /* O_WRONLY or O_RDWR */
2322 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
2323 GOTO(out, rc = -EPERM);
2325 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
2326 struct inode *inode2;
2327 struct ll_inode_info *lli;
2328 struct obd_client_handle *och = NULL;
2330 if (lsl.sl_flags != SWAP_LAYOUTS_CLOSE)
2331 GOTO(out, rc = -EINVAL);
2333 lli = ll_i2info(inode);
2334 mutex_lock(&lli->lli_och_mutex);
2335 if (fd->fd_lease_och != NULL) {
2336 och = fd->fd_lease_och;
2337 fd->fd_lease_och = NULL;
2339 mutex_unlock(&lli->lli_och_mutex);
2341 GOTO(out, rc = -ENOLCK);
2342 inode2 = file2->f_path.dentry->d_inode;
2343 rc = ll_swap_layouts_close(och, inode, inode2);
2345 rc = ll_swap_layouts(file, file2, &lsl);
2351 case LL_IOC_LOV_GETSTRIPE:
2352 RETURN(ll_file_getstripe(inode,
2353 (struct lov_user_md __user *)arg));
2354 case FSFILT_IOC_GETFLAGS:
2355 case FSFILT_IOC_SETFLAGS:
2356 RETURN(ll_iocontrol(inode, file, cmd, arg));
2357 case FSFILT_IOC_GETVERSION_OLD:
2358 case FSFILT_IOC_GETVERSION:
2359 RETURN(put_user(inode->i_generation, (int __user *)arg));
2360 case LL_IOC_GROUP_LOCK:
2361 RETURN(ll_get_grouplock(inode, file, arg));
2362 case LL_IOC_GROUP_UNLOCK:
2363 RETURN(ll_put_grouplock(inode, file, arg));
2364 case IOC_OBD_STATFS:
2365 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2367 /* We need to special case any other ioctls we want to handle,
2368 * to send them to the MDS/OST as appropriate and to properly
2369 * network encode the arg field.
2370 case FSFILT_IOC_SETVERSION_OLD:
2371 case FSFILT_IOC_SETVERSION:
2373 case LL_IOC_FLUSHCTX:
2374 RETURN(ll_flush_ctx(inode));
2375 case LL_IOC_PATH2FID: {
2376 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2377 sizeof(struct lu_fid)))
2382 case LL_IOC_GETPARENT:
2383 RETURN(ll_getparent(file, (struct getparent __user *)arg));
2385 case OBD_IOC_FID2PATH:
2386 RETURN(ll_fid2path(inode, (void __user *)arg));
2387 case LL_IOC_DATA_VERSION: {
2388 struct ioc_data_version idv;
2391 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
2394 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2395 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2398 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2404 case LL_IOC_GET_MDTIDX: {
2407 mdtidx = ll_get_mdt_idx(inode);
2411 if (put_user((int)mdtidx, (int __user *)arg))
2416 case OBD_IOC_GETDTNAME:
2417 case OBD_IOC_GETMDNAME:
2418 RETURN(ll_get_obd_name(inode, cmd, arg));
2419 case LL_IOC_HSM_STATE_GET: {
2420 struct md_op_data *op_data;
2421 struct hsm_user_state *hus;
2428 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2429 LUSTRE_OPC_ANY, hus);
2430 if (IS_ERR(op_data)) {
2432 RETURN(PTR_ERR(op_data));
2435 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2438 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2441 ll_finish_md_op_data(op_data);
2445 case LL_IOC_HSM_STATE_SET: {
2446 struct hsm_state_set *hss;
2453 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2458 rc = ll_hsm_state_set(inode, hss);
2463 case LL_IOC_HSM_ACTION: {
2464 struct md_op_data *op_data;
2465 struct hsm_current_action *hca;
2472 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2473 LUSTRE_OPC_ANY, hca);
2474 if (IS_ERR(op_data)) {
2476 RETURN(PTR_ERR(op_data));
2479 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2482 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2485 ll_finish_md_op_data(op_data);
2489 case LL_IOC_SET_LEASE: {
2490 struct ll_inode_info *lli = ll_i2info(inode);
2491 struct obd_client_handle *och = NULL;
2496 case LL_LEASE_WRLCK:
2497 if (!(file->f_mode & FMODE_WRITE))
2499 fmode = FMODE_WRITE;
2501 case LL_LEASE_RDLCK:
2502 if (!(file->f_mode & FMODE_READ))
2506 case LL_LEASE_UNLCK:
2507 mutex_lock(&lli->lli_och_mutex);
2508 if (fd->fd_lease_och != NULL) {
2509 och = fd->fd_lease_och;
2510 fd->fd_lease_och = NULL;
2512 mutex_unlock(&lli->lli_och_mutex);
2517 fmode = och->och_flags;
2518 rc = ll_lease_close(och, inode, &lease_broken);
2525 RETURN(ll_lease_type_from_fmode(fmode));
2530 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2532 /* apply for lease */
2533 och = ll_lease_open(inode, file, fmode, 0);
2535 RETURN(PTR_ERR(och));
2538 mutex_lock(&lli->lli_och_mutex);
2539 if (fd->fd_lease_och == NULL) {
2540 fd->fd_lease_och = och;
2543 mutex_unlock(&lli->lli_och_mutex);
2545 /* impossible now that only excl is supported for now */
2546 ll_lease_close(och, inode, &lease_broken);
2551 case LL_IOC_GET_LEASE: {
2552 struct ll_inode_info *lli = ll_i2info(inode);
2553 struct ldlm_lock *lock = NULL;
2556 mutex_lock(&lli->lli_och_mutex);
2557 if (fd->fd_lease_och != NULL) {
2558 struct obd_client_handle *och = fd->fd_lease_och;
2560 lock = ldlm_handle2lock(&och->och_lease_handle);
2562 lock_res_and_lock(lock);
2563 if (!ldlm_is_cancel(lock))
2564 fmode = och->och_flags;
2566 unlock_res_and_lock(lock);
2567 LDLM_LOCK_PUT(lock);
2570 mutex_unlock(&lli->lli_och_mutex);
2572 RETURN(ll_lease_type_from_fmode(fmode));
2574 case LL_IOC_HSM_IMPORT: {
2575 struct hsm_user_import *hui;
2581 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
2586 rc = ll_hsm_import(inode, file, hui);
2591 case LL_IOC_FUTIMES_3: {
2592 struct ll_futimes_3 lfu;
2594 if (copy_from_user(&lfu,
2595 (const struct ll_futimes_3 __user *)arg,
2599 RETURN(ll_file_futimes_3(file, &lfu));
2601 case LL_IOC_LADVISE: {
2602 struct ladvise_hdr *ladvise_hdr;
2605 int alloc_size = sizeof(*ladvise_hdr);
2608 OBD_ALLOC_PTR(ladvise_hdr);
2609 if (ladvise_hdr == NULL)
2612 if (copy_from_user(ladvise_hdr,
2613 (const struct ladvise_hdr __user *)arg,
2615 GOTO(out_ladvise, rc = -EFAULT);
2617 if (ladvise_hdr->lah_magic != LADVISE_MAGIC ||
2618 ladvise_hdr->lah_count < 1)
2619 GOTO(out_ladvise, rc = -EINVAL);
2621 num_advise = ladvise_hdr->lah_count;
2622 if (num_advise >= LAH_COUNT_MAX)
2623 GOTO(out_ladvise, rc = -EFBIG);
2625 OBD_FREE_PTR(ladvise_hdr);
2626 alloc_size = offsetof(typeof(*ladvise_hdr),
2627 lah_advise[num_advise]);
2628 OBD_ALLOC(ladvise_hdr, alloc_size);
2629 if (ladvise_hdr == NULL)
2633 * TODO: submit multiple advices to one server in a single RPC
2635 if (copy_from_user(ladvise_hdr,
2636 (const struct ladvise_hdr __user *)arg,
2638 GOTO(out_ladvise, rc = -EFAULT);
2640 for (i = 0; i < num_advise; i++) {
2641 rc = ll_ladvise(inode, file, ladvise_hdr->lah_flags,
2642 &ladvise_hdr->lah_advise[i]);
2648 OBD_FREE(ladvise_hdr, alloc_size);
2655 ll_iocontrol_call(inode, file, cmd, arg, &err))
2658 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2659 (void __user *)arg));
2664 #ifndef HAVE_FILE_LLSEEK_SIZE
2665 static inline loff_t
2666 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2668 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2670 if (offset > maxsize)
2673 if (offset != file->f_pos) {
2674 file->f_pos = offset;
2675 file->f_version = 0;
2681 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2682 loff_t maxsize, loff_t eof)
2684 struct inode *inode = file->f_path.dentry->d_inode;
2692 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2693 * position-querying operation. Avoid rewriting the "same"
2694 * f_pos value back to the file because a concurrent read(),
2695 * write() or lseek() might have altered it
2700 * f_lock protects against read/modify/write race with other
2701 * SEEK_CURs. Note that parallel writes and reads behave
2704 mutex_lock(&inode->i_mutex);
2705 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2706 mutex_unlock(&inode->i_mutex);
2710 * In the generic case the entire file is data, so as long as
2711 * offset isn't at the end of the file then the offset is data.
2718 * There is a virtual hole at the end of the file, so as long as
2719 * offset isn't i_size or larger, return i_size.
2727 return llseek_execute(file, offset, maxsize);
2731 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2733 struct inode *inode = file->f_path.dentry->d_inode;
2734 loff_t retval, eof = 0;
2737 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2738 (origin == SEEK_CUR) ? file->f_pos : 0);
2739 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2740 PFID(ll_inode2fid(inode)), inode, retval, retval,
2742 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2744 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2745 retval = ll_glimpse_size(inode);
2748 eof = i_size_read(inode);
2751 retval = ll_generic_file_llseek_size(file, offset, origin,
2752 ll_file_maxbytes(inode), eof);
2756 static int ll_flush(struct file *file, fl_owner_t id)
2758 struct inode *inode = file->f_path.dentry->d_inode;
2759 struct ll_inode_info *lli = ll_i2info(inode);
2760 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2763 LASSERT(!S_ISDIR(inode->i_mode));
2765 /* catch async errors that were recorded back when async writeback
2766 * failed for pages in this mapping. */
2767 rc = lli->lli_async_rc;
2768 lli->lli_async_rc = 0;
2769 if (lli->lli_clob != NULL) {
2770 err = lov_read_and_clear_async_rc(lli->lli_clob);
2775 /* The application has been told write failure already.
2776 * Do not report failure again. */
2777 if (fd->fd_write_failed)
2779 return rc ? -EIO : 0;
2783 * Called to make sure a portion of file has been written out.
2784 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2786 * Return how many pages have been written.
/* Builds and runs a CIT_FSYNC cl_io over [start, end]; on success the
 * result is the number of pages written (fi_nr_written). Rejects any
 * mode outside the four defined cl_fsync_mode values. */
2788 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2789 enum cl_fsync_mode mode, int ignore_layout)
2791 struct cl_env_nest nest;
2794 struct cl_fsync_io *fio;
2798 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2799 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2802 env = cl_env_nested_get(&nest);
2804 RETURN(PTR_ERR(env));
2806 io = vvp_env_thread_io(env);
2807 io->ci_obj = ll_i2info(inode)->lli_clob;
2808 io->ci_ignore_layout = ignore_layout;
2810 /* initialize parameters for sync */
2811 fio = &io->u.ci_fsync;
2812 fio->fi_start = start;
2814 fio->fi_fid = ll_inode2fid(inode);
2815 fio->fi_mode = mode;
2816 fio->fi_nr_written = 0;
2818 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2819 result = cl_io_loop(env, io);
2821 result = io->ci_result;
/* On success, report the page count accumulated by the fsync io. */
2823 result = fio->fi_nr_written;
2824 cl_io_fini(env, io);
2825 cl_env_nested_put(&nest, env);
2831 * When dentry is provided (the 'else' case), *file->f_path.dentry may be
2832 * null and dentry must be used directly rather than pulled from
2833 * *file->f_path.dentry as is done otherwise.
/* fsync(2)/fdatasync(2) handler. Three kernel-version signatures are
 * supported via configure-detected HAVE_FILE_FSYNC_* macros; the older
 * variants sync the whole range (end = LLONG_MAX). Sequence: flush dirty
 * pages, collect prior async errors, MDC fsync RPC, then (for regular
 * files) an OSC CL_FSYNC_ALL over the range; fd_write_failed tracks
 * whether the data sync succeeded. */
2836 #ifdef HAVE_FILE_FSYNC_4ARGS
2837 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2839 struct dentry *dentry = file->f_path.dentry;
2840 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2841 int ll_fsync(struct file *file, int datasync)
2843 struct dentry *dentry = file->f_path.dentry;
2845 loff_t end = LLONG_MAX;
2847 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2850 loff_t end = LLONG_MAX;
2852 struct inode *inode = dentry->d_inode;
2853 struct ll_inode_info *lli = ll_i2info(inode);
2854 struct ptlrpc_request *req;
2858 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2859 PFID(ll_inode2fid(inode)), inode);
2860 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2862 #ifdef HAVE_FILE_FSYNC_4ARGS
/* 4-arg kernels do not write out the range for us; do it here and take
 * i_mutex as generic_file_fsync-era callers expect. */
2863 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2864 mutex_lock(&inode->i_mutex);
2866 /* fsync's caller has already called _fdata{sync,write}, we want
2867 * that IO to finish before calling the osc and mdc sync methods */
2868 rc = filemap_fdatawait(inode->i_mapping);
2871 /* catch async errors that were recorded back when async writeback
2872 * failed for pages in this mapping. */
2873 if (!S_ISDIR(inode->i_mode)) {
2874 err = lli->lli_async_rc;
2875 lli->lli_async_rc = 0;
2878 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* Sync metadata on the MDT first, then file data on the OSTs. */
2883 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
2887 ptlrpc_req_finished(req);
2889 if (S_ISREG(inode->i_mode)) {
2890 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2892 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2893 if (rc == 0 && err < 0)
2896 fd->fd_write_failed = true;
2898 fd->fd_write_failed = false;
2901 #ifdef HAVE_FILE_FSYNC_4ARGS
2902 mutex_unlock(&inode->i_mutex);
/* flock(2)/fcntl(2) byte-range lock handler. Translates a kernel
 * struct file_lock into an LDLM_FLOCK enqueue against the MDS, then
 * mirrors the server's decision into the local VFS lock lists
 * (locks_lock_file_wait or the older flock/posix split, per configure
 * check). F_UNLCK is modeled as an LCK_NL enqueue rather than a cancel. */
2908 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2910 struct inode *inode = file->f_path.dentry->d_inode;
2911 struct ll_sb_info *sbi = ll_i2sbi(inode);
2912 struct ldlm_enqueue_info einfo = {
2913 .ei_type = LDLM_FLOCK,
2914 .ei_cb_cp = ldlm_flock_completion_ast,
2915 .ei_cbdata = file_lock,
2917 struct md_op_data *op_data;
2918 struct lustre_handle lockh = { 0 };
2919 union ldlm_policy_data flock = { { 0 } };
/* Preserve the caller's lock type; einfo.ei_mode is written into
 * file_lock->fl_type below and must be restored for non-TEST requests. */
2920 int fl_type = file_lock->fl_type;
2926 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
2927 PFID(ll_inode2fid(inode)), file_lock);
2929 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2931 if (file_lock->fl_flags & FL_FLOCK) {
2932 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2933 /* flocks are whole-file locks */
2934 flock.l_flock.end = OFFSET_MAX;
2935 /* For flocks owner is determined by the local file desctiptor*/
2936 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2937 } else if (file_lock->fl_flags & FL_POSIX) {
2938 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2939 flock.l_flock.start = file_lock->fl_start;
2940 flock.l_flock.end = file_lock->fl_end;
2944 flock.l_flock.pid = file_lock->fl_pid;
2946 /* Somewhat ugly workaround for svc lockd.
2947 * lockd installs custom fl_lmops->lm_compare_owner that checks
2948 * for the fl_owner to be the same (which it always is on local node
2949 * I guess between lockd processes) and then compares pid.
2950 * As such we assign pid to the owner field to make it all work,
2951 * conflict with normal locks is unlikely since pid space and
2952 * pointer space for current->files are not intersecting */
2953 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2954 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* Map fcntl lock types to LDLM modes: RDLCK->PR, WRLCK->PW, UNLCK->NL.
 * (switch scaffolding lines are missing from this extraction) */
2958 einfo.ei_mode = LCK_PR;
2961 /* An unlock request may or may not have any relation to
2962 * existing locks so we may not be able to pass a lock handle
2963 * via a normal ldlm_lock_cancel() request. The request may even
2964 * unlock a byte range in the middle of an existing lock. In
2965 * order to process an unlock request we need all of the same
2966 * information that is given with a normal read or write record
2967 * lock request. To avoid creating another ldlm unlock (cancel)
2968 * message we'll treat a LCK_NL flock request as an unlock. */
2969 einfo.ei_mode = LCK_NL;
2972 einfo.ei_mode = LCK_PW;
2975 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* Map the fcntl command to enqueue flags: non-blocking set requests use
 * LDLM_FL_BLOCK_NOWAIT, F_GETLK-style queries use LDLM_FL_TEST_LOCK. */
2990 flags = LDLM_FL_BLOCK_NOWAIT;
2996 flags = LDLM_FL_TEST_LOCK;
2999 CERROR("unknown fcntl lock command: %d\n", cmd);
3003 /* Save the old mode so that if the mode in the lock changes we
3004 * can decrement the appropriate reader or writer refcount. */
3005 file_lock->fl_type = einfo.ei_mode;
3007 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3008 LUSTRE_OPC_ANY, NULL);
3009 if (IS_ERR(op_data))
3010 RETURN(PTR_ERR(op_data));
3012 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
3013 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
3014 flock.l_flock.pid, flags, einfo.ei_mode,
3015 flock.l_flock.start, flock.l_flock.end);
3017 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data, &lockh,
3020 /* Restore the file lock type if not TEST lock. */
3021 if (!(flags & LDLM_FL_TEST_LOCK))
3022 file_lock->fl_type = fl_type;
3024 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
3025 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
3026 !(flags & LDLM_FL_TEST_LOCK))
3027 rc2 = locks_lock_file_wait(file, file_lock);
3029 if ((file_lock->fl_flags & FL_FLOCK) &&
3030 (rc == 0 || file_lock->fl_type == F_UNLCK))
3031 rc2 = flock_lock_file_wait(file, file_lock);
3032 if ((file_lock->fl_flags & FL_POSIX) &&
3033 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3034 !(flags & LDLM_FL_TEST_LOCK))
3035 rc2 = posix_lock_file_wait(file, file_lock);
3036 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
/* Local bookkeeping failed after the server granted the lock: release
 * the server-side lock again via an LCK_NL (unlock) enqueue. */
3038 if (rc2 && file_lock->fl_type != F_UNLCK) {
3039 einfo.ei_mode = LCK_NL;
3040 md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data,
3045 ll_finish_md_op_data(op_data);
/* Look up the FID of @name under @parent via an MDS getattr-by-name RPC.
 * On success *fid is filled from the reply body; when @inode is non-NULL
 * the reply is additionally instantiated into an inode via ll_prep_inode.
 * Caller owns the returned inode reference (iput). */
3050 int ll_get_fid_by_name(struct inode *parent, const char *name,
3051 int namelen, struct lu_fid *fid,
3052 struct inode **inode)
3054 struct md_op_data *op_data = NULL;
3055 struct mdt_body *body;
3056 struct ptlrpc_request *req;
3060 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3061 LUSTRE_OPC_ANY, NULL);
3062 if (IS_ERR(op_data))
3063 RETURN(PTR_ERR(op_data));
/* Only FID and type are needed from the MDS for this lookup. */
3065 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
3066 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3067 ll_finish_md_op_data(op_data);
3071 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3073 GOTO(out_req, rc = -EFAULT);
3075 *fid = body->mbo_fid1;
3078 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
3080 ptlrpc_req_finished(req);
/* Migrate directory entry @name under @parent to MDT @mdtidx
 * ("lfs migrate -m"). Resolves the child inode (dcache first, then by
 * name from the MDS), refuses to migrate the filesystem root, takes a
 * write lease on regular files so the data version can be pinned, and
 * performs the move as an MDS rename with CLI_MIGRATE/MDS_RENAME_MIGRATE.
 * Cleanup is via the usual goto-label chain (out_close/out_unlock/
 * out_iput/out_free), partially elided in this extraction. */
3084 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3085 const char *name, int namelen)
3087 struct dentry *dchild = NULL;
3088 struct inode *child_inode = NULL;
3089 struct md_op_data *op_data;
3090 struct ptlrpc_request *request = NULL;
3091 struct obd_client_handle *och = NULL;
3093 struct mdt_body *body;
3095 __u64 data_version = 0;
3098 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3099 name, PFID(ll_inode2fid(parent)), mdtidx);
3101 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3102 0, LUSTRE_OPC_ANY, NULL);
3103 if (IS_ERR(op_data))
3104 RETURN(PTR_ERR(op_data));
3106 /* Get child FID first */
3107 qstr.hash = full_name_hash(name, namelen);
/* Try the local dcache before asking the MDS for the FID. */
3110 dchild = d_lookup(file->f_path.dentry, &qstr);
3111 if (dchild != NULL) {
3112 if (dchild->d_inode != NULL)
3113 child_inode = igrab(dchild->d_inode);
3117 if (child_inode == NULL) {
3118 rc = ll_get_fid_by_name(parent, name, namelen,
3119 &op_data->op_fid3, &child_inode);
3124 if (child_inode == NULL)
3125 GOTO(out_free, rc = -EINVAL);
3128 * lfs migrate command needs to be blocked on the client
3129 * by checking the migrate FID against the FID of the
3132 if (child_inode == parent->i_sb->s_root->d_inode)
3133 GOTO(out_iput, rc = -EINVAL);
3135 mutex_lock(&child_inode->i_mutex);
3136 op_data->op_fid3 = *ll_inode2fid(child_inode);
3137 if (!fid_is_sane(&op_data->op_fid3)) {
3138 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
3139 ll_get_fsname(parent->i_sb, NULL, 0), name,
3140 PFID(&op_data->op_fid3));
3141 GOTO(out_unlock, rc = -EINVAL);
/* Short-circuit when the child already lives on the target MDT. */
3144 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3146 GOTO(out_unlock, rc);
3149 CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name,
3150 PFID(&op_data->op_fid3), mdtidx);
3151 GOTO(out_unlock, rc = 0);
3154 if (S_ISREG(child_inode->i_mode)) {
/* Take a write lease so concurrent writers are fenced while the
 * file's data version is captured and the rename executes. */
3155 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
3159 GOTO(out_unlock, rc);
3162 rc = ll_data_version(child_inode, &data_version,
3165 GOTO(out_close, rc);
3167 op_data->op_handle = och->och_fh;
3168 op_data->op_data = och->och_mod;
3169 op_data->op_data_version = data_version;
3170 op_data->op_lease_handle = och->och_lease_handle;
3171 op_data->op_bias |= MDS_RENAME_MIGRATE;
3174 op_data->op_mds = mdtidx;
3175 op_data->op_cli_flags = CLI_MIGRATE;
/* Migration is implemented as a same-name rename with CLI_MIGRATE. */
3176 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3177 namelen, name, namelen, &request);
3179 ll_update_times(request, parent);
3181 if (request != NULL) {
3182 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
3184 ptlrpc_req_finished(request);
3185 GOTO(out_close, rc = -EPROTO);
3188 /* If the server does release layout lock, then we cleanup
3189 * the client och here, otherwise release it in out_close: */
3191 body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
3192 obd_mod_put(och->och_mod);
3193 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
3195 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
3199 ptlrpc_req_finished(request);
3202 /* Try again if the file layout has changed. */
3203 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode)) {
3208 if (och != NULL) /* close the file */
3209 ll_lease_close(och, child_inode, NULL);
3211 clear_nlink(child_inode);
3213 mutex_unlock(&child_inode->i_mutex);
3217 ll_finish_md_op_data(op_data);
/* Lock handler for "-o noflock" mounts; body not visible in this
 * extraction. Per the comment at the noflock file_operations table below,
 * it is expected to return ENOSYS for flock/lock calls — confirm against
 * the full source. */
3222 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3230 * test if some locks matching bits and l_req_mode are acquired
3231 * - bits can be in different locks
3232 * - if found clear the common lock bits in *bits
3233 * - the bits not found, are kept in *bits
3235 * \param bits [IN] searched lock bits [IN]
3236 * \param l_req_mode [IN] searched lock mode
3237 * \retval boolean, true iff all bits are found
/* Probes cached MDC ibits locks bit-by-bit with LDLM_FL_TEST_LOCK (no
 * references are taken); LCK_MINMODE means "any of CR|CW|PR|PW". */
3239 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
3241 struct lustre_handle lockh;
3242 union ldlm_policy_data policy;
3243 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
3244 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
3253 fid = &ll_i2info(inode)->lli_fid;
3254 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3255 ldlm_lockname[mode]);
3257 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* Test each requested inodebit individually; a matched lock may cover
 * several bits at once, all of which are cleared from *bits. */
3258 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3259 policy.l_inodebits.bits = *bits & (1 << i);
3260 if (policy.l_inodebits.bits == 0)
3263 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3264 &policy, mode, &lockh)) {
3265 struct ldlm_lock *lock;
3267 lock = ldlm_handle2lock(&lockh);
3270 ~(lock->l_policy_data.l_inodebits.bits);
3271 LDLM_LOCK_PUT(lock);
3273 *bits &= ~policy.l_inodebits.bits;
/* Match (and reference) a cached MDC ibits lock covering @bits in one of
 * the modes in @mode. Unlike ll_have_md_lock this is not a TEST match:
 * on success the lock handle is returned in @lockh with a reference the
 * caller must drop (ldlm_lock_decref). Returns the matched mode or 0. */
3280 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
3281 struct lustre_handle *lockh, __u64 flags,
3282 enum ldlm_mode mode)
3284 union ldlm_policy_data policy = { .l_inodebits = { bits } };
3289 fid = &ll_i2info(inode)->lli_fid;
3290 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3292 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3293 fid, LDLM_IBITS, &policy, mode, lockh);
/* Post-process the rc of a revalidation RPC. -ENOENT on a plain file or
 * directory is translated to success after updating nlink (the object
 * was unlinked); striped directories with a bad stripe keep the error so
 * the dentry is revalidated again. Other errors are logged (EACCES/EIDRM
 * at D_INFO only, since they are expected under permission changes). */
3298 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3300 /* Already unlinked. Just update nlink and return success */
3301 if (rc == -ENOENT) {
3303 /* If it is striped directory, and there is bad stripe
3304 * Let's revalidate the dentry again, instead of returning
3306 if (S_ISDIR(inode->i_mode) &&
3307 ll_i2info(inode)->lli_lsm_md != NULL)
3310 /* This path cannot be hit for regular files unless in
3311 * case of obscure races, so no need to to validate
3313 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3315 } else if (rc != 0) {
3316 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3317 "%s: revalidate FID "DFID" error: rc = %d\n",
3318 ll_get_fsname(inode->i_sb, NULL, 0),
3319 PFID(ll_inode2fid(inode)), rc);
/* Revalidate @dentry's inode attributes for the lock bits in @ibits.
 * Two strategies: with OBD_CONNECT_ATTRFID, an intent getattr/lookup by
 * FID (which also refreshes dcache state and may invalidate an unlinked
 * dentry); otherwise, a plain md_getattr — but only when no matching
 * ibits lock is already cached locally (ll_have_md_lock). */
3325 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3327 struct inode *inode = dentry->d_inode;
3328 struct ptlrpc_request *req = NULL;
3329 struct obd_export *exp;
3333 LASSERT(inode != NULL);
3335 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3336 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3338 exp = ll_i2mdexp(inode);
3340 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3341 * But under CMD case, it caused some lock issues, should be fixed
3342 * with new CMD ibits lock. See bug 12718 */
3343 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3344 struct lookup_intent oit = { .it_op = IT_GETATTR };
3345 struct md_op_data *op_data;
3347 if (ibits == MDS_INODELOCK_LOOKUP)
3348 oit.it_op = IT_LOOKUP;
3350 /* Call getattr by fid, so do not provide name at all. */
3351 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3352 dentry->d_inode, NULL, 0, 0,
3353 LUSTRE_OPC_ANY, NULL);
3354 if (IS_ERR(op_data))
3355 RETURN(PTR_ERR(op_data));
3357 rc = md_intent_lock(exp, op_data, &oit, &req,
3358 &ll_md_blocking_ast, 0);
3359 ll_finish_md_op_data(op_data);
3361 rc = ll_inode_revalidate_fini(inode, rc);
3365 rc = ll_revalidate_it_finish(req, &oit, dentry);
3367 ll_intent_release(&oit);
3371 /* Unlinked? Unhash dentry, so it is not picked up later by
3372 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3373 here to preserve get_cwd functionality on 2.6.
3375 if (!dentry->d_inode->i_nlink) {
3376 ll_lock_dcache(inode);
3377 d_lustre_invalidate(dentry, 0);
3378 ll_unlock_dcache(inode);
3381 ll_lookup_finish_locks(&oit, dentry);
3382 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3383 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3384 u64 valid = OBD_MD_FLGETATTR;
3385 struct md_op_data *op_data;
/* Regular files also need striping (EA) data sized to the default
 * MD buffer so layout changes are picked up. */
3388 if (S_ISREG(inode->i_mode)) {
3389 rc = ll_get_default_mdsize(sbi, &ealen);
3392 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3395 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3396 0, ealen, LUSTRE_OPC_ANY,
3398 if (IS_ERR(op_data))
3399 RETURN(PTR_ERR(op_data));
3401 op_data->op_valid = valid;
3402 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3403 ll_finish_md_op_data(op_data);
3405 rc = ll_inode_revalidate_fini(inode, rc);
3409 rc = ll_prep_inode(&inode, req, NULL, NULL);
3412 ptlrpc_req_finished(req);
/* For a striped directory, merge per-stripe attributes from all MDTs
 * (via md_merge_attr) into the master inode: nlink, blocks, size, and
 * the cached a/m/ctime in ll_inode_info. Requires lli_lsm_md be set. */
3416 static int ll_merge_md_attr(struct inode *inode)
3418 struct cl_attr attr = { 0 };
3421 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3422 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3423 &attr, ll_md_blocking_ast);
3427 set_nlink(inode, attr.cat_nlink);
3428 inode->i_blocks = attr.cat_blocks;
3429 i_size_write(inode, attr.cat_size);
3431 ll_i2info(inode)->lli_atime = attr.cat_atime;
3432 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
3433 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/* Full revalidation: refresh MD attributes via __ll_inode_revalidate,
 * then bring size/times up to date. Striped directories merge attributes
 * from all stripes; non-regular files copy the cached times into the
 * inode; regular files glimpse the size from the OSTs — except while an
 * HSM restore is running, when the MDT-provided size is authoritative. */
3439 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3441 struct inode *inode = dentry->d_inode;
3445 rc = __ll_inode_revalidate(dentry, ibits);
3449 /* if object isn't regular file, don't validate size */
3450 if (!S_ISREG(inode->i_mode)) {
3451 if (S_ISDIR(inode->i_mode) &&
3452 ll_i2info(inode)->lli_lsm_md != NULL) {
3453 rc = ll_merge_md_attr(inode);
3458 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
3459 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
3460 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
3462 /* In case of restore, the MDT has the right size and has
3463 * already send it back without granting the layout lock,
3464 * inode is up-to-date so glimpse is useless.
3465 * Also to glimpse we need the layout, in case of a running
3466 * restore the MDT holds the layout lock so the glimpse will
3467 * block up to the end of restore (getattr will block)
3469 if (!ll_file_test_flag(ll_i2info(inode), LLIF_FILE_RESTORING))
3470 rc = ll_glimpse_size(inode);
/* getattr(2) handler: revalidate UPDATE|LOOKUP ibits, then fill *stat
 * from the (now fresh) inode. With a 32-bit API client the inode number
 * is derived from the FID (cl_fid_build_ino) instead of i_ino. */
3475 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3477 struct inode *inode = de->d_inode;
3478 struct ll_sb_info *sbi = ll_i2sbi(inode);
3479 struct ll_inode_info *lli = ll_i2info(inode);
3482 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3483 MDS_INODELOCK_LOOKUP);
3484 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
/* Fault-injection point used by sanity tests to delay getattr. */
3489 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
3491 stat->dev = inode->i_sb->s_dev;
3492 if (ll_need_32bit_api(sbi))
3493 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3495 stat->ino = inode->i_ino;
3496 stat->mode = inode->i_mode;
3497 stat->uid = inode->i_uid;
3498 stat->gid = inode->i_gid;
3499 stat->rdev = inode->i_rdev;
3500 stat->atime = inode->i_atime;
3501 stat->mtime = inode->i_mtime;
3502 stat->ctime = inode->i_ctime;
3503 stat->blksize = 1 << inode->i_blkbits;
3505 stat->nlink = inode->i_nlink;
3506 stat->size = i_size_read(inode);
3507 stat->blocks = inode->i_blocks;
/* FIEMAP inode operation. Marshals the kernel's fiemap_extent_info into
 * a struct fiemap buffer sized for fi_extents_max extents, copies the
 * first user extent in (some callers seed fm_extents[0]), runs
 * ll_do_fiemap against the OSTs, and copies the mapped extents back to
 * the user buffer. Buffer is OBD_ALLOC_LARGE/OBD_FREE_LARGE managed. */
3512 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3513 __u64 start, __u64 len)
3517 struct fiemap *fiemap;
3518 unsigned int extent_count = fieinfo->fi_extents_max;
/* NOTE(review): no visible overflow check on extent_count *
 * sizeof(struct fiemap_extent); presumably bounded by the VFS caller —
 * confirm against the full source. */
3520 num_bytes = sizeof(*fiemap) + (extent_count *
3521 sizeof(struct fiemap_extent));
3522 OBD_ALLOC_LARGE(fiemap, num_bytes);
3527 fiemap->fm_flags = fieinfo->fi_flags;
3528 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3529 fiemap->fm_start = start;
3530 fiemap->fm_length = len;
3531 if (extent_count > 0 &&
3532 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3533 sizeof(struct fiemap_extent)) != 0)
3534 GOTO(out, rc = -EFAULT);
3536 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3538 fieinfo->fi_flags = fiemap->fm_flags;
3539 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3540 if (extent_count > 0 &&
3541 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3542 fiemap->fm_mapped_extents *
3543 sizeof(struct fiemap_extent)) != 0)
3544 GOTO(out, rc = -EFAULT);
3546 OBD_FREE_LARGE(fiemap, num_bytes);
/* get_acl inode operation: return a referenced copy of the cached POSIX
 * ACL (lli_posix_acl) under lli_lock. The VFS releases the reference. */
3550 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3552 struct ll_inode_info *lli = ll_i2info(inode);
3553 struct posix_acl *acl = NULL;
3556 spin_lock(&lli->lli_lock);
3557 /* VFS' acl_permission_check->check_acl will release the refcount */
3558 acl = posix_acl_dup(lli->lli_posix_acl);
3559 spin_unlock(&lli->lli_lock);
/* check_acl callback for older kernels without the 2-arg
 * generic_permission. Evaluates the cached ACL with
 * posix_acl_permission; under RCU walk (IPERM_FLAG_RCU) it must not
 * block, so it bails out early. Compiled out when CONFIG_FS_POSIX_ACL
 * is unset (returns a constant on the visible code path's elided line). */
3564 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3566 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3567 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3569 ll_check_acl(struct inode *inode, int mask)
3572 # ifdef CONFIG_FS_POSIX_ACL
3573 struct posix_acl *acl;
3577 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3578 if (flags & IPERM_FLAG_RCU)
3581 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3586 rc = posix_acl_permission(inode, acl, mask);
3587 posix_acl_release(acl);
3590 # else /* !CONFIG_FS_POSIX_ACL */
3592 # endif /* CONFIG_FS_POSIX_ACL */
3594 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/* permission inode operation (three kernel-version signatures). Steps:
 * bail out of RCU-walk (may block), revalidate the root inode on first
 * access, apply root squash by overriding credentials (fsuid/fsgid and
 * dropped FS capabilities) when the caller is root and squashing is
 * configured, then delegate to remote-perm check (RMT_CLIENT mounts) or
 * generic permission with the ACL callback. Credentials are restored
 * afterwards. */
3596 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3597 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3599 # ifdef HAVE_INODE_PERMISION_2ARGS
3600 int ll_inode_permission(struct inode *inode, int mask)
3602 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3607 struct ll_sb_info *sbi;
3608 struct root_squash_info *squash;
3609 struct cred *cred = NULL;
3610 const struct cred *old_cred = NULL;
3612 bool squash_id = false;
/* RCU-walk cannot block; the elided branch body presumably returns
 * -ECHILD so the VFS retries in ref-walk mode — confirm in full source. */
3615 #ifdef MAY_NOT_BLOCK
3616 if (mask & MAY_NOT_BLOCK)
3618 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3619 if (flags & IPERM_FLAG_RCU)
3623 /* as root inode are NOT getting validated in lookup operation,
3624 * need to do it before permission check. */
3626 if (inode == inode->i_sb->s_root->d_inode) {
3627 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3628 MDS_INODELOCK_LOOKUP);
3633 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3634 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3636 /* squash fsuid/fsgid if needed */
3637 sbi = ll_i2sbi(inode);
3638 squash = &sbi->ll_squash;
3639 if (unlikely(squash->rsi_uid != 0 &&
3640 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
3641 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3645 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3646 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3647 squash->rsi_uid, squash->rsi_gid);
3649 /* update current process's credentials
3650 * and FS capability */
3651 cred = prepare_creds();
3655 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
3656 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* Drop every filesystem-related capability from the squashed creds. */
3657 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3658 if ((1 << cap) & CFS_CAP_FS_MASK)
3659 cap_lower(cred->cap_effective, cap);
3661 old_cred = override_creds(cred);
3664 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3666 if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
3667 rc = lustre_check_remote_perm(inode, mask);
3669 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3671 /* restore current process's credentials and FS capability */
3673 revert_creds(old_cred);
3680 /* -o localflock - only provides locally consistent flock locks */
/* Default file_operations table (no .flock/.lock entries, so the kernel
 * falls back to local-only flock). read/write entries depend on whether
 * the kernel has the read_iter/write_iter interface. */
3681 struct file_operations ll_file_operations = {
3682 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3683 # ifdef HAVE_SYNC_READ_WRITE
3684 .read = new_sync_read,
3685 .write = new_sync_write,
3687 .read_iter = ll_file_read_iter,
3688 .write_iter = ll_file_write_iter,
3689 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3690 .read = ll_file_read,
3691 .aio_read = ll_file_aio_read,
3692 .write = ll_file_write,
3693 .aio_write = ll_file_aio_write,
3694 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3695 .unlocked_ioctl = ll_file_ioctl,
3696 .open = ll_file_open,
3697 .release = ll_file_release,
3698 .mmap = ll_file_mmap,
3699 .llseek = ll_file_seek,
3700 .splice_read = ll_file_splice_read,
/* file_operations for "-o flock" mounts: same as the default table plus
 * cluster-coherent .flock and .lock handlers (ll_file_flock). */
3705 struct file_operations ll_file_operations_flock = {
3706 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3707 # ifdef HAVE_SYNC_READ_WRITE
3708 .read = new_sync_read,
3709 .write = new_sync_write,
3710 # endif /* HAVE_SYNC_READ_WRITE */
3711 .read_iter = ll_file_read_iter,
3712 .write_iter = ll_file_write_iter,
3713 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3714 .read = ll_file_read,
3715 .aio_read = ll_file_aio_read,
3716 .write = ll_file_write,
3717 .aio_write = ll_file_aio_write,
3718 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3719 .unlocked_ioctl = ll_file_ioctl,
3720 .open = ll_file_open,
3721 .release = ll_file_release,
3722 .mmap = ll_file_mmap,
3723 .llseek = ll_file_seek,
3724 .splice_read = ll_file_splice_read,
3727 .flock = ll_file_flock,
3728 .lock = ll_file_flock
3731 /* These are for -o noflock - to return ENOSYS on flock calls */
/* file_operations for "-o noflock": same table but .flock/.lock point at
 * ll_file_noflock so locking syscalls fail explicitly. */
3732 struct file_operations ll_file_operations_noflock = {
3733 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3734 # ifdef HAVE_SYNC_READ_WRITE
3735 .read = new_sync_read,
3736 .write = new_sync_write,
3737 # endif /* HAVE_SYNC_READ_WRITE */
3738 .read_iter = ll_file_read_iter,
3739 .write_iter = ll_file_write_iter,
3740 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3741 .read = ll_file_read,
3742 .aio_read = ll_file_aio_read,
3743 .write = ll_file_write,
3744 .aio_write = ll_file_aio_write,
3745 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3746 .unlocked_ioctl = ll_file_ioctl,
3747 .open = ll_file_open,
3748 .release = ll_file_release,
3749 .mmap = ll_file_mmap,
3750 .llseek = ll_file_seek,
3751 .splice_read = ll_file_splice_read,
3754 .flock = ll_file_noflock,
3755 .lock = ll_file_noflock
/* inode_operations for regular Lustre files; .get_acl only exists on
 * kernels with the get_acl inode operation (HAVE_IOP_GET_ACL). */
3758 struct inode_operations ll_file_inode_operations = {
3759 .setattr = ll_setattr,
3760 .getattr = ll_getattr,
3761 .permission = ll_inode_permission,
3762 .setxattr = ll_setxattr,
3763 .getxattr = ll_getxattr,
3764 .listxattr = ll_listxattr,
3765 .removexattr = ll_removexattr,
3766 .fiemap = ll_fiemap,
3767 #ifdef HAVE_IOP_GET_ACL
3768 .get_acl = ll_get_acl,
3772 /* dynamic ioctl number support routins */
/* Registry of dynamically registered ioctl handlers: an rwsem-protected
 * list of llioc_data entries, each holding a callback plus the ioctl
 * command numbers it services (flexible trailing array iocd_cmd). */
3773 static struct llioc_ctl_data {
3774 struct rw_semaphore ioc_sem;
3775 struct list_head ioc_head;
3777 __RWSEM_INITIALIZER(llioc.ioc_sem),
3778 LIST_HEAD_INIT(llioc.ioc_head)
3783 struct list_head iocd_list;
3784 unsigned int iocd_size;
3785 llioc_callback_t iocd_cb;
3786 unsigned int iocd_count;
3787 unsigned int iocd_cmd[0];
/* Register a dynamic ioctl handler for @count command numbers.
 * Returns an opaque cookie (the allocation itself) used later by
 * ll_iocontrol_unregister, or NULL on bad arguments / allocation
 * failure (the NULL-return lines are elided in this extraction). */
3790 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3793 struct llioc_data *in_data = NULL;
3796 if (cb == NULL || cmd == NULL ||
3797 count > LLIOC_MAX_CMD || count < 0)
3800 size = sizeof(*in_data) + count * sizeof(unsigned int);
3801 OBD_ALLOC(in_data, size);
3802 if (in_data == NULL)
3805 memset(in_data, 0, sizeof(*in_data));
3806 in_data->iocd_size = size;
3807 in_data->iocd_cb = cb;
3808 in_data->iocd_count = count;
3809 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3811 down_write(&llioc.ioc_sem);
3812 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3813 up_write(&llioc.ioc_sem);
/* Unregister a handler previously returned by ll_iocontrol_register.
 * Finds the entry matching @magic under the write lock, unlinks and
 * frees it; warns if the cookie is unknown. */
3818 void ll_iocontrol_unregister(void *magic)
3820 struct llioc_data *tmp;
3825 down_write(&llioc.ioc_sem);
3826 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* Size is cached before the entry is freed. */
3828 unsigned int size = tmp->iocd_size;
3830 list_del(&tmp->iocd_list);
3831 up_write(&llioc.ioc_sem);
3833 OBD_FREE(tmp, size);
3837 up_write(&llioc.ioc_sem);
3839 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3842 EXPORT_SYMBOL(ll_iocontrol_register);
3843 EXPORT_SYMBOL(ll_iocontrol_unregister);
/* Dispatch an unhandled ioctl to the registered dynamic handlers.
 * Walks the registry under the read lock; the first callback returning
 * LLIOC_STOP terminates iteration and its rc is stored via *rcp. */
3845 static enum llioc_iter
3846 ll_iocontrol_call(struct inode *inode, struct file *file,
3847 unsigned int cmd, unsigned long arg, int *rcp)
3849 enum llioc_iter ret = LLIOC_CONT;
3850 struct llioc_data *data;
3851 int rc = -EINVAL, i;
3853 down_read(&llioc.ioc_sem);
3854 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3855 for (i = 0; i < data->iocd_count; i++) {
3856 if (cmd != data->iocd_cmd[i])
3859 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3863 if (ret == LLIOC_STOP)
3866 up_read(&llioc.ioc_sem);
/* Push a layout configuration into the cl_object stack (cl_conf_set).
 * For OBJECT_CONF_SET the carried ldlm layout lock is made matchable
 * only after the layout has been applied, and the cached layout
 * generation in ll_inode_info is refreshed from the object. */
3873 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3875 struct ll_inode_info *lli = ll_i2info(inode);
3876 struct cl_object *obj = lli->lli_clob;
3877 struct cl_env_nest nest;
3885 env = cl_env_nested_get(&nest);
3887 RETURN(PTR_ERR(env));
3889 rc = cl_conf_set(env, lli->lli_clob, conf);
3893 if (conf->coc_opc == OBJECT_CONF_SET) {
3894 struct ldlm_lock *lock = conf->coc_lock;
3895 struct cl_layout cl = {
3899 LASSERT(lock != NULL);
3900 LASSERT(ldlm_has_layout(lock));
3902 /* it can only be allowed to match after layout is
3903 * applied to inode otherwise false layout would be
3904 * seen. Applying layout shoud happen before dropping
3905 * the intent lock. */
3906 ldlm_lock_allow_match(lock);
3908 rc = cl_object_layout_get(env, obj, &cl);
3913 DFID": layout version change: %u -> %u\n",
3914 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3916 ll_layout_version_set(lli, cl.cl_layout_gen);
3920 cl_env_nested_put(&nest, env);
3925 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/* When the layout lock was granted via completion AST its LVB buffer may
 * be absent/too small; fetch the LOV EA explicitly via md_getxattr and
 * install it as the lock's LVB under the resource lock. If another
 * thread installed an LVB concurrently (l_lvb_data no longer NULL) the
 * freshly allocated copy is freed instead. */
3926 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3929 struct ll_sb_info *sbi = ll_i2sbi(inode);
3930 struct ptlrpc_request *req;
3931 struct mdt_body *body;
3938 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3939 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
3940 lock->l_lvb_data, lock->l_lvb_len);
3942 if (lock->l_lvb_data != NULL)
3945 /* if layout lock was granted right away, the layout is returned
3946 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3947 * blocked and then granted via completion ast, we have to fetch
3948 * layout here. Please note that we can't use the LVB buffer in
3949 * completion AST because it doesn't have a large enough buffer */
3950 rc = ll_get_default_mdsize(sbi, &lmmsize);
3952 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
3953 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3958 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3960 GOTO(out, rc = -EPROTO);
3962 lmmsize = body->mbo_eadatasize;
3963 if (lmmsize == 0) /* empty layout */
3966 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3968 GOTO(out, rc = -EFAULT);
3970 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3971 if (lvbdata == NULL)
3972 GOTO(out, rc = -ENOMEM);
3974 memcpy(lvbdata, lmm, lmmsize);
3975 lock_res_and_lock(lock);
3976 if (unlikely(lock->l_lvb_data == NULL)) {
3977 lock->l_lvb_type = LVB_T_LAYOUT;
3978 lock->l_lvb_data = lvbdata;
3979 lock->l_lvb_len = lmmsize;
3982 unlock_res_and_lock(lock);
/* Lost the race: another thread installed the LVB; drop our copy. */
3985 OBD_FREE_LARGE(lvbdata, lmmsize);
3990 ptlrpc_req_finished(req);
3995 * Apply the layout to the inode. Layout lock is held and will be released
/* Given a granted layout lock handle, fetch the layout if needed
 * (ll_layout_fetch) and apply it to the cl_object via OBJECT_CONF_SET.
 * If the apply fails with -EBUSY (IO still using the old layout), wait
 * for it with OBJECT_CONF_WAIT after the lock is released. The lock
 * reference from @lockh is always dropped before returning. */
3998 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
3999 struct inode *inode)
4001 struct ll_inode_info *lli = ll_i2info(inode);
4002 struct ll_sb_info *sbi = ll_i2sbi(inode);
4003 struct ldlm_lock *lock;
4004 struct cl_object_conf conf;
4007 bool wait_layout = false;
4010 LASSERT(lustre_handle_is_used(lockh));
4012 lock = ldlm_handle2lock(lockh);
4013 LASSERT(lock != NULL);
4014 LASSERT(ldlm_has_layout(lock));
4016 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4017 PFID(&lli->lli_fid), inode);
4019 /* in case this is a caching lock and reinstate with new inode */
4020 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
4022 lock_res_and_lock(lock);
4023 lvb_ready = ldlm_is_lvb_ready(lock);
4024 unlock_res_and_lock(lock);
4025 /* checking lvb_ready is racy but this is okay. The worst case is
4026 * that multi processes may configure the file on the same time. */
4031 rc = ll_layout_fetch(inode, lock);
4035 /* for layout lock, lmm is stored in lock's lvb.
4036 * lvb_data is immutable if the lock is held so it's safe to access it
4039 * set layout to file. Unlikely this will fail as old layout was
4040 * surely eliminated */
4041 memset(&conf, 0, sizeof conf);
4042 conf.coc_opc = OBJECT_CONF_SET;
4043 conf.coc_inode = inode;
4044 conf.coc_lock = lock;
4045 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
4046 conf.u.coc_layout.lb_len = lock->l_lvb_len;
4047 rc = ll_layout_conf(inode, &conf);
4049 /* refresh layout failed, need to wait */
4050 wait_layout = rc == -EBUSY;
4054 LDLM_LOCK_PUT(lock);
4055 ldlm_lock_decref(lockh, mode);
4057 /* wait for IO to complete if it's still being used. */
4059 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
4060 ll_get_fsname(inode->i_sb, NULL, 0),
4061 PFID(&lli->lli_fid), inode);
4063 memset(&conf, 0, sizeof conf);
4064 conf.coc_opc = OBJECT_CONF_WAIT;
4065 conf.coc_inode = inode;
4066 rc = ll_layout_conf(inode, &conf);
4070 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4071 ll_get_fsname(inode->i_sb, NULL, 0),
4072 PFID(&lli->lli_fid), rc);
/*
 * ll_layout_refresh_locked(): obtain a LAYOUT lock for @inode and apply the
 * layout it carries to the inode's cl_object.  First try to match an
 * already-cached lock; on a miss, enqueue a new IT_LAYOUT intent lock at
 * the MDT.  The caller must hold lli->lli_layout_mutex so only one thread
 * enqueues the layout lock at a time (see ll_layout_refresh()).
 *
 * \param[in] inode  file whose layout should be refreshed
 * \retval 0 on success, negative errno otherwise
 */
4077 static int ll_layout_refresh_locked(struct inode *inode)
4079 struct ll_inode_info *lli = ll_i2info(inode);
4080 struct ll_sb_info *sbi = ll_i2sbi(inode);
4081 struct md_op_data *op_data;
4082 struct lookup_intent it;
4083 struct lustre_handle lockh;
4084 enum ldlm_mode mode;
/* enqueue parameters: inode-bits lock with the standard llite
 * blocking and completion callbacks */
4085 struct ldlm_enqueue_info einfo = {
4086 .ei_type = LDLM_IBITS,
4088 .ei_cb_bl = &ll_md_blocking_ast,
4089 .ei_cb_cp = &ldlm_completion_ast,
4095 /* mostly layout lock is caching on the local side, so try to match
4096 * it before grabbing layout lock mutex. */
4097 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4098 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4099 if (mode != 0) { /* hit cached lock */
4100 rc = ll_layout_lock_set(&lockh, mode, inode);
/* cache miss: build op_data for a fresh enqueue against the MDT */
4107 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4108 0, 0, LUSTRE_OPC_ANY, NULL);
4109 if (IS_ERR(op_data))
4110 RETURN(PTR_ERR(op_data));
4112 /* have to enqueue one */
4113 memset(&it, 0, sizeof(it));
4114 it.it_op = IT_LAYOUT;
4115 lockh.cookie = 0ULL;
4117 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4118 ll_get_fsname(inode->i_sb, NULL, 0),
4119 PFID(&lli->lli_fid), inode);
4121 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, &it, op_data, &lockh, 0);
/* release the enqueue RPC reply held by the intent, if any */
4122 if (it.it_data != NULL)
4123 ptlrpc_req_finished(it.it_data);
4126 ll_finish_md_op_data(op_data);
/* take over the lock reference from the intent: clearing
 * it_lock_mode keeps ll_intent_drop_lock() from releasing it;
 * ll_layout_lock_set() will decref with @mode instead */
4128 mode = it.it_lock_mode;
4129 it.it_lock_mode = 0;
4130 ll_intent_drop_lock(&it);
4133 /* set lock data in case this is a new lock */
4134 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4135 rc = ll_layout_lock_set(&lockh, mode, inode);
4144 * This function checks if there exists a LAYOUT lock on the client side,
4145 * or enqueues one if the client doesn't have it cached.
4147 * This function will not hold the layout lock, so it may be revoked any time
4148 * after this function returns. Any operation that depends on the layout should be redone
4151 * This function should be called before lov_io_init() to get an up-to-date
4152 * layout version; the caller should save the version number, and after IO
4153 * is finished, this function should be called again to verify that the layout
4154 * was not changed during the IO.
4156 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4158 struct ll_inode_info *lli = ll_i2info(inode);
4159 struct ll_sb_info *sbi = ll_i2sbi(inode);
/* report the currently cached layout generation; a refresh is only
 * needed when layout lock is enabled and no valid generation exists */
4163 *gen = ll_layout_version_get(lli);
4164 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
/* layout locks only apply to regular files with sane FIDs */
4168 LASSERT(fid_is_sane(ll_inode2fid(inode)));
4169 LASSERT(S_ISREG(inode->i_mode));
4171 /* take layout lock mutex to enqueue layout lock exclusively. */
4172 mutex_lock(&lli->lli_layout_mutex);
4174 rc = ll_layout_refresh_locked(inode);
/* hand the caller the generation installed by the refresh */
4178 *gen = ll_layout_version_get(lli);
4180 mutex_unlock(&lli->lli_layout_mutex);
4186 * This function sends an HSM restore request to the MDT
4188 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4190 struct hsm_user_request *hur;
4194 len = sizeof(struct hsm_user_request) +
4195 sizeof(struct hsm_user_item);
4196 OBD_ALLOC(hur, len);
4200 hur->hur_request.hr_action = HUA_RESTORE;
4201 hur->hur_request.hr_archive_id = 0;
4202 hur->hur_request.hr_flags = 0;
4203 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4204 sizeof(hur->hur_user_item[0].hui_fid));
4205 hur->hur_user_item[0].hui_extent.offset = offset;
4206 hur->hur_user_item[0].hui_extent.length = length;
4207 hur->hur_request.hr_itemcount = 1;
4208 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,