4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2016, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
48 #include <lustre/ll_fiemap.h>
50 #include <uapi/linux/lustre_ioctl.h>
51 #include <lustre_swab.h>
53 #include "cl_object.h"
54 #include "llite_internal.h"
55 #include "vvp_internal.h"
58 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
60 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
/* Allocate per-open-file private data (fd) from its slab cache.
 * GFP_NOFS: may be called under filesystem locks, so the allocator
 * must not recurse into the FS.
 * NOTE(review): NULL-check/return lines are not visible in this excerpt --
 * confirm against the full source. */
static struct ll_file_data *ll_file_data_get(void)
	struct ll_file_data *fd;

	OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);

	fd->fd_write_failed = false;
/* Return per-open-file private data to its slab cache. */
static void ll_file_data_put(struct ll_file_data *fd)
	OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/**
 * Packs all the attributes into @op_data for the CLOSE rpc.
 *
 * Copies mode, times, size, blocks and flags from the local inode so the
 * MDT sees the client's latest view of the file at close time, and records
 * the open handle being closed in \a op_data->op_handle.
 */
static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
			     struct obd_client_handle *och)
	ll_prep_md_op_data(op_data, inode, NULL, NULL,
			   0, 0, LUSTRE_OPC_ANY, NULL);

	op_data->op_attr.ia_mode = inode->i_mode;
	op_data->op_attr.ia_atime = inode->i_atime;
	op_data->op_attr.ia_mtime = inode->i_mtime;
	op_data->op_attr.ia_ctime = inode->i_ctime;
	op_data->op_attr.ia_size = i_size_read(inode);
	/* The *_SET bits tell the MDT to take the client-supplied timestamps
	 * verbatim rather than using the server clock. */
	op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
				     ATTR_MTIME | ATTR_MTIME_SET |
				     ATTR_CTIME | ATTR_CTIME_SET;
	op_data->op_attr_blocks = inode->i_blocks;
	op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
	op_data->op_handle = och->och_fh;

	if (och->och_flags & FMODE_WRITE &&
	    ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
		/* For HSM: if inode data has been modified, pack it so that
		 * MDT can set data dirty flag in the archive. */
		op_data->op_bias |= MDS_DATA_MODIFIED;
/**
 * Perform a close, possibly with a bias.
 * The meaning of "data" depends on the value of "bias".
 *
 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
 * swap layouts with (NOTE(review): tail of this comment is not visible in
 * this excerpt -- confirm against the full source).
 */
static int ll_close_inode_openhandle(struct inode *inode,
				     struct obd_client_handle *och,
				     enum mds_op_bias bias, void *data)
	struct obd_export *md_exp = ll_i2mdexp(inode);
	const struct ll_inode_info *lli = ll_i2info(inode);
	struct md_op_data *op_data;
	struct ptlrpc_request *req = NULL;

	/* Sanity: the MDC export must still be connected to an OBD device. */
	if (class_exp2obd(md_exp) == NULL) {
		CERROR("%s: invalid MDC connection handle closing "DFID"\n",
		       ll_get_fsname(inode->i_sb, NULL, 0),
		       PFID(&lli->lli_fid));

	OBD_ALLOC_PTR(op_data);
	/* We leak openhandle and request here on error, but not much to be
	 * done in OOM case since app won't retry close on error either. */
		GOTO(out, rc = -ENOMEM);

	ll_prepare_close(inode, op_data, och);
	case MDS_CLOSE_LAYOUT_SWAP:
		LASSERT(data != NULL);
		op_data->op_bias |= MDS_CLOSE_LAYOUT_SWAP;
		op_data->op_data_version = 0;
		op_data->op_lease_handle = och->och_lease_handle;
		/* fid2 identifies the peer inode we swap layouts with */
		op_data->op_fid2 = *ll_inode2fid(data);

	case MDS_HSM_RELEASE:
		LASSERT(data != NULL);
		op_data->op_bias |= MDS_HSM_RELEASE;
		/* data version the copytool archived; MDT verifies it */
		op_data->op_data_version = *(__u64 *)data;
		op_data->op_lease_handle = och->och_lease_handle;
		op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;

		LASSERT(data == NULL);

	rc = md_close(md_exp, op_data, och->och_mod, &req);
	if (rc != 0 && rc != -EINTR)
		CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
		       md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);

	/* For biased closes, check whether the server actually executed
	 * the intent (release/swap) or just did a plain close. */
	    op_data->op_bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)) {
		struct mdt_body *body;

		body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
		if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))

	ll_finish_md_op_data(op_data);

	md_clear_open_replay_data(md_exp, och);
	/* Mark the handle dead so stale users are detectable. */
	och->och_fh.cookie = DEAD_HANDLE_MAGIC;

	ptlrpc_req_finished(req); /* This is close request */
/* Close the MDS open handle for the given open mode (read/write/exec),
 * but only when no other local opens still reference it. */
int ll_md_real_close(struct inode *inode, fmode_t fmode)
	struct ll_inode_info *lli = ll_i2info(inode);
	struct obd_client_handle **och_p;
	struct obd_client_handle *och;

	/* Pick the per-mode handle/refcount pair. */
	if (fmode & FMODE_WRITE) {
		och_p = &lli->lli_mds_write_och;
		och_usecount = &lli->lli_open_fd_write_count;
	} else if (fmode & FMODE_EXEC) {
		och_p = &lli->lli_mds_exec_och;
		och_usecount = &lli->lli_open_fd_exec_count;
		LASSERT(fmode & FMODE_READ);
		och_p = &lli->lli_mds_read_och;
		och_usecount = &lli->lli_open_fd_read_count;

	mutex_lock(&lli->lli_och_mutex);
	if (*och_usecount > 0) {
		/* There are still users of this handle, so skip
		 * freeing it. */
		mutex_unlock(&lli->lli_och_mutex);

	mutex_unlock(&lli->lli_och_mutex);

	/* There might be a race and this handle may already
	 * have been closed. */
	rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Per-file-descriptor close: drop group lock and lease if held, close any
 * private open handle, decrement the per-mode open count, and talk to the
 * MDS only if we no longer hold a matching OPEN lock locally. */
static int ll_md_close(struct inode *inode, struct file *file)
	union ldlm_policy_data policy = {
		.l_inodebits	= { MDS_INODELOCK_OPEN },
	__u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct ll_inode_info *lli = ll_i2info(inode);
	struct lustre_handle lockh;
	enum ldlm_mode lockmode;

	/* clear group lock, if present */
	if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
		ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);

	if (fd->fd_lease_och != NULL) {

		/* Usually the lease is not released when the
		 * application crashed, we need to release here. */
		rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
		CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
		       PFID(&lli->lli_fid), rc, lease_broken);

		fd->fd_lease_och = NULL;

	/* A private open handle taken over for a lease (see
	 * ll_lease_och_acquire()) must be closed explicitly. */
	if (fd->fd_och != NULL) {
		rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);

	/* Let's see if we have good enough OPEN lock on the file and if
	   we can skip talking to MDS */
	mutex_lock(&lli->lli_och_mutex);
	if (fd->fd_omode & FMODE_WRITE) {
		LASSERT(lli->lli_open_fd_write_count);
		lli->lli_open_fd_write_count--;
	} else if (fd->fd_omode & FMODE_EXEC) {
		LASSERT(lli->lli_open_fd_exec_count);
		lli->lli_open_fd_exec_count--;
		LASSERT(lli->lli_open_fd_read_count);
		lli->lli_open_fd_read_count--;
	mutex_unlock(&lli->lli_och_mutex);

	/* LDLM_FL_TEST_LOCK: only probe for a matching granted lock,
	 * do not take a new reference on it. */
	if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
			   LDLM_IBITS, &policy, lockmode, &lockh))
		rc = ll_md_real_close(inode, fd->fd_omode);

	LUSTRE_FPRIVATE(file) = NULL;
	ll_file_data_put(fd);
/* While this returns an error code, fput() the caller does not, so we need
 * to make every effort to clean up all of our state here. Also, applications
 * rarely check close errors and even if an error is returned they will not
 * re-try the close call.
 */
int ll_file_release(struct inode *inode, struct file *file)
	struct ll_file_data *fd;
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ll_inode_info *lli = ll_i2info(inode);

	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
	       PFID(ll_inode2fid(inode)), inode);

	/* Do not account releases of the root dentry in per-op stats. */
	if (inode->i_sb->s_root != file_dentry(file))
		ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
	fd = LUSTRE_FPRIVATE(file);

	/* The last ref on @file, maybe not the owner pid of statahead,
	 * because parent and child process can share the same file handle. */
	if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
		ll_deauthorize_statahead(inode, fd);

	/* Root dentry: nothing more to do than free the private data. */
	if (inode->i_sb->s_root == file_dentry(file)) {
		LUSTRE_FPRIVATE(file) = NULL;
		ll_file_data_put(fd);

	if (!S_ISDIR(inode->i_mode)) {
		/* Fold any deferred async write errors into the return code. */
		if (lli->lli_clob != NULL)
			lov_read_and_clear_async_rc(lli->lli_clob);
		lli->lli_async_rc = 0;

	rc = ll_md_close(inode, file);

	if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
		libcfs_debug_dumplog();
/* Send an intent-open (LOOKUP|OPEN) request to the MDS for \a de, optionally
 * packing a striping layout (\a lmm/\a lmmsize), and install the resulting
 * inode/lock state on success. Caller must have set MDS_OPEN_BY_FID. */
static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
			       struct lookup_intent *itp)
	struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
	struct dentry *parent = de->d_parent;
	const char *name = NULL;
	struct md_op_data *op_data;
	struct ptlrpc_request *req = NULL;

	LASSERT(parent != NULL);
	LASSERT(itp->it_flags & MDS_OPEN_BY_FID);

	/* if server supports open-by-fid, or file name is invalid, don't pack
	 * name in open request */
	if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
	    lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
		name = de->d_name.name;
		len = de->d_name.len;

	op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
				     name, len, 0, LUSTRE_OPC_ANY, NULL);
		RETURN(PTR_ERR(op_data));
	op_data->op_data = lmm;
	op_data->op_data_size = lmmsize;

	rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
			    &ll_md_blocking_ast, 0);
	ll_finish_md_op_data(op_data);

	/* reason for keep own exit path - don't flood log
	 * with messages with -ESTALE errors.
	 */
	if (!it_disposition(itp, DISP_OPEN_OPEN) ||
	    it_open_error(DISP_OPEN_OPEN, itp))
		ll_release_openhandle(de, itp);

	if (it_disposition(itp, DISP_LOOKUP_NEG))
		GOTO(out, rc = -ENOENT);

	if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
		rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
		CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);

	rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
	if (!rc && itp->it_lock_mode)
		ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);

	ptlrpc_req_finished(req);
	ll_intent_drop_lock(itp);

	/* We did open by fid, but by the time we got to the server,
	 * the object disappeared. If this is a create, we cannot really
	 * tell the userspace that the file it was trying to create
	 * does not exist. Instead let's return -ESTALE, and the VFS will
	 * retry the create with LOOKUP_REVAL that we are going to catch
	 * in ll_revalidate_dentry() and use lookup then.
	 */
	if (rc == -ENOENT && itp->it_op & IT_CREAT)
/* Populate an obd_client_handle from the server reply carried in \a it and
 * register it for open replay in case of MDS recovery. */
static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
		       struct obd_client_handle *och)
	struct mdt_body *body;

	body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
	och->och_fh = body->mbo_handle;
	och->och_fid = body->mbo_fid1;
	och->och_lease_handle.cookie = it->it_lock_handle;
	och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
	och->och_flags = it->it_flags;

	return md_set_open_replay_data(md_exp, och, it);
/* Finish a local open: fill \a och (if given) from the intent reply and
 * attach the per-open private data \a fd to \a file. */
static int ll_local_open(struct file *file, struct lookup_intent *it,
			 struct ll_file_data *fd, struct obd_client_handle *och)
	struct inode *inode = file_inode(file);

	/* private data must not already be installed on this file */
	LASSERT(!LUSTRE_FPRIVATE(file));

		rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);

	LUSTRE_FPRIVATE(file) = fd;
	ll_readahead_init(inode, &fd->fd_ras);
	/* remember the open mode this descriptor was granted */
	fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);

	/* ll_cl_context initialize */
	rwlock_init(&fd->fd_lock);
	INIT_LIST_HEAD(&fd->fd_lccs);
/* Open a file, and (for the very first open) create objects on the OSTs at
 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
 * creation or open until ll_lov_setstripe() ioctl is called.
 *
 * If we already have the stripe MD locally then we don't request it in
 * md_open(), by passing a lmm_size = 0.
 *
 * It is up to the application to ensure no other processes open this file
 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 * used. We might be able to avoid races of that sort by getting lli_open_sem
 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 */
int ll_file_open(struct inode *inode, struct file *file)
	struct ll_inode_info *lli = ll_i2info(inode);
	struct lookup_intent *it, oit = { .it_op = IT_OPEN,
					  .it_flags = file->f_flags };
	struct obd_client_handle **och_p = NULL;
	__u64 *och_usecount = NULL;
	struct ll_file_data *fd;

	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
	       PFID(ll_inode2fid(inode)), inode, file->f_flags);

	it = file->private_data; /* XXX: compat macro */
	file->private_data = NULL; /* prevent ll_local_open assertion */

	fd = ll_file_data_get();
		GOTO(out_openerr, rc = -ENOMEM);

	if (S_ISDIR(inode->i_mode))
		ll_authorize_statahead(inode, fd);

	/* Root dentry open needs no MDS open handle. */
	if (inode->i_sb->s_root == file_dentry(file)) {
		LUSTRE_FPRIVATE(file) = fd;

	/* No intent from lookup: build our own IT_OPEN intent from f_flags. */
	if (!it || !it->it_disposition) {
		/* Convert f_flags into access mode. We cannot use file->f_mode,
		 * because everything but O_ACCMODE mask was stripped from
		 * it (NOTE(review): remainder of this comment is not visible
		 * in this excerpt). */
		if ((oit.it_flags + 1) & O_ACCMODE)
		if (file->f_flags & O_TRUNC)
			oit.it_flags |= FMODE_WRITE;

		/* kernel only call f_op->open in dentry_open. filp_open calls
		 * dentry_open after call to open_namei that checks permissions.
		 * Only nfsd_open call dentry_open directly without checking
		 * permissions and because of that this code below is safe. */
		if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
			oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;

		/* We do not want O_EXCL here, presumably we opened the file
		 * already? XXX - NFS implications? */
		oit.it_flags &= ~O_EXCL;

		/* bug20584, if "it_flags" contains O_CREAT, the file will be
		 * created if necessary, then "IT_CREAT" should be set to keep
		 * consistent with it */
		if (oit.it_flags & O_CREAT)
			oit.it_op |= IT_CREAT;

	/* Let's see if we have file open on MDS already. */
	if (it->it_flags & FMODE_WRITE) {
		och_p = &lli->lli_mds_write_och;
		och_usecount = &lli->lli_open_fd_write_count;
	} else if (it->it_flags & FMODE_EXEC) {
		och_p = &lli->lli_mds_exec_och;
		och_usecount = &lli->lli_open_fd_exec_count;
		och_p = &lli->lli_mds_read_och;
		och_usecount = &lli->lli_open_fd_read_count;

	mutex_lock(&lli->lli_och_mutex);
	if (*och_p) { /* Open handle is present */
		if (it_disposition(it, DISP_OPEN_OPEN)) {
			/* Well, there's extra open request that we do not need,
			   let's close it somehow. This will decref request. */
			rc = it_open_error(DISP_OPEN_OPEN, it);
				mutex_unlock(&lli->lli_och_mutex);
				GOTO(out_openerr, rc);

			ll_release_openhandle(file_dentry(file), it);

		rc = ll_local_open(file, it, fd, NULL);
			mutex_unlock(&lli->lli_och_mutex);
			GOTO(out_openerr, rc);

		LASSERT(*och_usecount == 0);
		if (!it->it_disposition) {
			struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
			/* We cannot just request lock handle now, new ELC code
			   means that one of other OPEN locks for this file
			   could be cancelled, and since blocking ast handler
			   would attempt to grab och_mutex as well, that would
			   result in a deadlock */
			mutex_unlock(&lli->lli_och_mutex);
			/*
			 * Normally called under two situations:
			 * 2. A race/condition on MDS resulting in no open
			 *    handle to be returned from LOOKUP|OPEN request,
			 *    for example if the target entry was a symlink.
			 *
			 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
			 * marked by a bit set in ll_iget_for_nfs. Clear the
			 * bit so that it's not confusing later callers.
			 *
			 * NB; when ldd is NULL, it must have come via normal
			 * lookup path only, since ll_iget_for_nfs always calls
			 * (NOTE(review): tail of this comment not visible here)
			 */
			if (ldd && ldd->lld_nfs_dentry) {
				ldd->lld_nfs_dentry = 0;
				it->it_flags |= MDS_OPEN_LOCK;

			/*
			 * Always specify MDS_OPEN_BY_FID because we don't want
			 * to get file with different fid.
			 */
			it->it_flags |= MDS_OPEN_BY_FID;
			rc = ll_intent_file_open(file_dentry(file), NULL, 0,
				GOTO(out_openerr, rc);

		OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
			GOTO(out_och_free, rc = -ENOMEM);

		/* md_intent_lock() didn't get a request ref if there was an
		 * open error, so don't do cleanup on the request here
		 */
		/* XXX (green): Should not we bail out on any error here, not
		 * just open error? */
		rc = it_open_error(DISP_OPEN_OPEN, it);
			GOTO(out_och_free, rc);

		LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
			 "inode %p: disposition %x, status %d\n", inode,
			 it_disposition(it, ~0), it->it_status);

		rc = ll_local_open(file, it, fd, *och_p);
			GOTO(out_och_free, rc);

	mutex_unlock(&lli->lli_och_mutex);

	/* Must do this outside lli_och_mutex lock to prevent deadlock where
	   different kind of OPEN lock for this same inode gets cancelled
	   by ldlm_cancel_lru */
	if (!S_ISREG(inode->i_mode))
		GOTO(out_och_free, rc);

	cl_lov_delay_create_clear(&file->f_flags);
	GOTO(out_och_free, rc);

	/* error path: free any open handle we allocated above */
	if (och_p && *och_p) {
		OBD_FREE(*och_p, sizeof (struct obd_client_handle));
		*och_p = NULL; /* OBD_FREE writes some magic there */

		mutex_unlock(&lli->lli_och_mutex);

	if (lli->lli_opendir_key == fd)
		ll_deauthorize_statahead(inode, fd);

		ll_file_data_put(fd);

	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);

	/* drop the request reference taken by the intent open */
	if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
		ptlrpc_req_finished(it->it_request);
		it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* LDLM blocking AST for lease locks: on conflict, cancel the lease lock
 * asynchronously (the lease is thereby "broken"); no openhandle handling
 * is done here -- see the comment in ll_lease_open(). */
static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
			struct ldlm_lock_desc *desc, void *data, int flag)
	struct lustre_handle lockh;

	case LDLM_CB_BLOCKING:
		ldlm_lock2handle(lock, &lockh);
		rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
			CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);

	case LDLM_CB_CANCELING:
/**
 * When setting a lease on a file, we take ownership of the lli_mds_*_och
 * and save it as fd->fd_och so as to force client to reopen the file even
 * if it has an open lock in cache already.
 */
static int ll_lease_och_acquire(struct inode *inode, struct file *file,
				struct lustre_handle *old_handle)
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct obd_client_handle **och_p;

	/* Get the openhandle of the file */
	mutex_lock(&lli->lli_och_mutex);
	if (fd->fd_lease_och != NULL)
		GOTO(out_unlock, rc = -EBUSY);

	if (fd->fd_och == NULL) {
		/* pick the shared per-mode handle to take over */
		if (file->f_mode & FMODE_WRITE) {
			LASSERT(lli->lli_mds_write_och != NULL);
			och_p = &lli->lli_mds_write_och;
			och_usecount = &lli->lli_open_fd_write_count;
			LASSERT(lli->lli_mds_read_och != NULL);
			och_p = &lli->lli_mds_read_och;
			och_usecount = &lli->lli_open_fd_read_count;

		/* cannot take ownership while other opens share the handle */
		if (*och_usecount > 1)
			GOTO(out_unlock, rc = -EBUSY);

	*old_handle = fd->fd_och->och_fh;

	mutex_unlock(&lli->lli_och_mutex);
/**
 * Release ownership on lli_mds_*_och when putting back a file lease.
 */
static int ll_lease_och_release(struct inode *inode, struct file *file)
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct obd_client_handle **och_p;
	struct obd_client_handle *old_och = NULL;

	mutex_lock(&lli->lli_och_mutex);
	if (file->f_mode & FMODE_WRITE) {
		och_p = &lli->lli_mds_write_och;
		och_usecount = &lli->lli_open_fd_write_count;
		och_p = &lli->lli_mds_read_och;
		och_usecount = &lli->lli_open_fd_read_count;

	/* The file may have been open by another process (broken lease) so
	 * *och_p is not NULL. In this case we should simply increase usecount
	 * (NOTE(review): tail of this comment is not visible in this excerpt).
	 */
	if (*och_p != NULL) {
		old_och = fd->fd_och;

	mutex_unlock(&lli->lli_och_mutex);

	/* close the superfluous handle outside the mutex */
		rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
/**
 * Acquire a lease and open the file.
 *
 * \a fmode must be exactly FMODE_READ or FMODE_WRITE and be a subset of the
 * mode the file was opened with. Returns the new open handle, or ERR_PTR().
 */
static struct obd_client_handle *
ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
	struct lookup_intent it = { .it_op = IT_OPEN };
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct md_op_data *op_data;
	struct ptlrpc_request *req = NULL;
	struct lustre_handle old_handle = { 0 };
	struct obd_client_handle *och = NULL;

	if (fmode != FMODE_WRITE && fmode != FMODE_READ)
		RETURN(ERR_PTR(-EINVAL));

	/* lease mode must be a subset of the open mode; no exec opens */
	if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
		RETURN(ERR_PTR(-EPERM));

	rc = ll_lease_och_acquire(inode, file, &old_handle);

		RETURN(ERR_PTR(-ENOMEM));

	op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
				     LUSTRE_OPC_ANY, NULL);
		GOTO(out, rc = PTR_ERR(op_data));

	/* To tell the MDT this openhandle is from the same owner */
	op_data->op_handle = old_handle;

	it.it_flags = fmode | open_flags;
	it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
	rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
			    &ll_md_blocking_lease_ast,
	/* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
	 * it can be cancelled which may mislead applications that the lease is
	 * still valid (NOTE(review): a line of this comment is missing here).
	 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
	 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
	 * doesn't deal with openhandle, so normal openhandle will be leaked. */
			    LDLM_FL_NO_LRU | LDLM_FL_EXCL);
	ll_finish_md_op_data(op_data);
	ptlrpc_req_finished(req);
		GOTO(out_release_it, rc);

	if (it_disposition(&it, DISP_LOOKUP_NEG))
		GOTO(out_release_it, rc = -ENOENT);

	rc = it_open_error(DISP_OPEN_OPEN, &it);
		GOTO(out_release_it, rc);

	LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
	ll_och_fill(sbi->ll_md_exp, &it, och);

	if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
		GOTO(out_close, rc = -EOPNOTSUPP);

	/* already get lease, handle lease lock */
	ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
	if (it.it_lock_mode == 0 ||
	    it.it_lock_bits != MDS_INODELOCK_OPEN) {
		/* open lock must return for lease */
		CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
		       PFID(ll_inode2fid(inode)), it.it_lock_mode,
		GOTO(out_close, rc = -EPROTO);

	ll_intent_release(&it);

	/* error path: Cancel open lock */
	if (it.it_lock_mode != 0) {
		ldlm_lock_decref_and_cancel(&och->och_lease_handle,
		och->och_lease_handle.cookie = 0ULL;

	rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
		CERROR("%s: error closing file "DFID": %d\n",
		       ll_get_fsname(inode->i_sb, NULL, 0),
		       PFID(&ll_i2info(inode)->lli_fid), rc2);
	och = NULL; /* och has been freed in ll_close_inode_openhandle() */

	ll_intent_release(&it);
/**
 * Check whether a layout swap can be done between two inodes.
 *
 * \param[in] inode1	First inode to check
 * \param[in] inode2	Second inode to check
 *
 * \retval 0 on success, layout swap can be performed between both inodes
 * \retval negative error code if requirements are not met
 */
static int ll_check_swap_layouts_validity(struct inode *inode1,
					  struct inode *inode2)
	/* both must be regular files */
	if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))

	/* caller must be allowed to write both files */
	if (inode_permission(inode1, MAY_WRITE) ||
	    inode_permission(inode2, MAY_WRITE))

	/* both must live on the same Lustre filesystem */
	if (inode1->i_sb != inode2->i_sb)
/* Close \a och with an MDS_CLOSE_LAYOUT_SWAP bias so the MDT atomically
 * swaps the layouts of \a inode and \a inode2 as part of the close. */
static int ll_swap_layouts_close(struct obd_client_handle *och,
				 struct inode *inode, struct inode *inode2)
	const struct lu_fid *fid1 = ll_inode2fid(inode);
	const struct lu_fid *fid2;

	CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
	       ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));

	rc = ll_check_swap_layouts_validity(inode, inode2);
		GOTO(out_free_och, rc);

	/* We now know that inode2 is a lustre inode */
	fid2 = ll_inode2fid(inode2);

	/* swapping an inode's layout with itself makes no sense */
	rc = lu_fid_cmp(fid1, fid2);
		GOTO(out_free_och, rc = -EINVAL);

	/* Close the file and swap layouts between inode & inode2.
	 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
	 * because we still need it to pack l_remote_handle to MDT. */
	rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
	och = NULL; /* freed in ll_close_inode_openhandle() */
/**
 * Release lease and close the file.
 * It will check if the lease has ever broken.
 */
static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
	struct ldlm_lock *lock;
	bool cancelled = true;

	/* if the lease lock still exists, check whether it was cancelled
	 * (i.e. the lease was broken by a conflicting access) */
	lock = ldlm_handle2lock(&och->och_lease_handle);
		lock_res_and_lock(lock);
		cancelled = ldlm_is_cancel(lock);
		unlock_res_and_lock(lock);

	CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
	       PFID(&ll_i2info(inode)->lli_fid), cancelled);

		ldlm_cli_cancel(&och->och_lease_handle, 0);

	if (lease_broken != NULL)
		*lease_broken = cancelled;

	rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Merge MDS-provided inode attributes with size/blocks/timestamps obtained
 * from the OSTs via the cl_object layer, under the inode size lock. */
int ll_merge_attr(const struct lu_env *env, struct inode *inode)
	struct ll_inode_info *lli = ll_i2info(inode);
	struct cl_object *obj = lli->lli_clob;
	struct cl_attr *attr = vvp_env_thread_attr(env);

	ll_inode_size_lock(inode);

	/* Merge timestamps the most recently obtained from MDS with
	 * timestamps obtained from OSTs.
	 *
	 * Do not overwrite atime of inode because it may be refreshed
	 * by file_accessed() function. If the read was served by cache
	 * data, there is no RPC to be sent so that atime may not be
	 * transferred to OSTs at all. MDT only updates atime at close time
	 * if it's at least 'mdd.*.atime_diff' older.
	 * All in all, the atime in Lustre does not strictly comply with
	 * POSIX. Solving this problem needs to send an RPC to MDT for each
	 * read, this will hurt performance. */
	if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
		LTIME_S(inode->i_atime) = lli->lli_atime;
		lli->lli_update_atime = 0;

	LTIME_S(inode->i_mtime) = lli->lli_mtime;
	LTIME_S(inode->i_ctime) = lli->lli_ctime;

	atime = LTIME_S(inode->i_atime);
	mtime = LTIME_S(inode->i_mtime);
	ctime = LTIME_S(inode->i_ctime);

	/* fetch aggregated OST attributes for this object */
	cl_object_attr_lock(obj);
	rc = cl_object_attr_get(env, obj, attr);
	cl_object_attr_unlock(obj);

		GOTO(out_size_unlock, rc);

	/* keep whichever timestamp is the most recent */
	if (atime < attr->cat_atime)
		atime = attr->cat_atime;

	if (ctime < attr->cat_ctime)
		ctime = attr->cat_ctime;

	if (mtime < attr->cat_mtime)
		mtime = attr->cat_mtime;

	CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
	       PFID(&lli->lli_fid), attr->cat_size);

	i_size_write(inode, attr->cat_size);
	inode->i_blocks = attr->cat_blocks;

	LTIME_S(inode->i_atime) = atime;
	LTIME_S(inode->i_mtime) = mtime;
	LTIME_S(inode->i_ctime) = ctime;

	ll_inode_size_unlock(inode);
/* Decide whether atime updates should be suppressed for this open file,
 * based on open flags, inode flags and mount options. */
static bool file_is_noatime(const struct file *file)
	const struct vfsmount *mnt = file->f_path.mnt;
	const struct inode *inode = file_inode((struct file *)file);

	/* Adapted from file_accessed() and touch_atime().*/
	if (file->f_flags & O_NOATIME)

	if (inode->i_flags & S_NOATIME)

	if (IS_NOATIME(inode))

	if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))

	if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))

	if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1082 static int ll_file_io_ptask(struct cfs_ptask *ptask);
/* Initialize a cl_io for a read or write on \a file: set up the embedded
 * kiocb/iov_iter, locking policy and parallel-IO (ptask) hooks derived from
 * the file's open flags. */
static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
	struct inode *inode = file_inode(file);

	memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
	init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
	io->u.ci_rw.rw_file = file;
	io->u.ci_rw.rw_ptask = ll_file_io_ptask;
	io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
	if (iot == CIT_WRITE) {
		io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
		io->u.ci_rw.rw_sync = !!(file->f_flags & O_SYNC ||
					 file->f_flags & O_DIRECT ||
	io->ci_obj = ll_i2info(inode)->lli_clob;
	io->ci_lockreq = CILR_MAYBE;
	if (ll_file_nolock(file)) {
		/* no-lock mode: skip both client and server side locking */
		io->ci_lockreq = CILR_NEVER;
		io->ci_no_srvlock = 1;
	} else if (file->f_flags & O_APPEND) {
		io->ci_lockreq = CILR_MANDATORY;
	io->ci_noatime = file_is_noatime(file);
	/* parallel IO is only attempted for non-append IO when enabled */
	if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
		io->ci_pio = !io->u.ci_rw.rw_append;
/* Parallel-IO task body: execute one slice of a split read/write described
 * by the cl_io_pt in \a ptask->pt_cbdata, accumulating the byte count into
 * pt->cip_result and keeping the embedded iter/iocb in sync. */
static int ll_file_io_ptask(struct cfs_ptask *ptask)
	struct cl_io_pt *pt = ptask->pt_cbdata;
	struct file *file = pt->cip_file;
	loff_t pos = pt->cip_pos;

	env = cl_env_get(&refcheck);
		RETURN(PTR_ERR(env));

	CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
		file_dentry(file)->d_name.name,
		pt->cip_iot == CIT_READ ? "read" : "write",
		pos, pos + pt->cip_count);

	io = vvp_env_thread_io(env);
	ll_io_init(io, file, pt->cip_iot);
	io->u.ci_rw.rw_iter = pt->cip_iter;
	io->u.ci_rw.rw_iocb = pt->cip_iocb;
	io->ci_pio = 0; /* It's already in parallel task */

	rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
			   pt->cip_count - pt->cip_result);
		struct vvp_io *vio = vvp_env_io(env);

		vio->vui_io_subtype = IO_NORMAL;
		vio->vui_fd = LUSTRE_FPRIVATE(file);

		ll_cl_add(file, env, io, LCC_RW);
		rc = cl_io_loop(env, io);
		ll_cl_remove(file, env);

		/* cl_io_rw_init() handled IO */

	/* fault-injection hook for testing ptask IO failures */
	if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {

	if (io->ci_nob > 0) {
		/* account progress and advance the iterator/iocb position */
		pt->cip_result += io->ci_nob;
		iov_iter_advance(&pt->cip_iter, io->ci_nob);
		pt->cip_iocb.ki_pos = pos;
#ifdef HAVE_KIOCB_KI_LEFT
		pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
#elif defined(HAVE_KI_NBYTES)
		pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;

	cl_io_fini(env, io);

	/* retry the remainder if the IO layer asked for a restart */
	if ((rc == 0 || rc == -ENODATA) &&
	    pt->cip_result < pt->cip_count &&
	    io->ci_need_restart) {
			"%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
			file_dentry(file)->d_name.name,
			pt->cip_iot == CIT_READ ? "read" : "write",
			pos, pos + pt->cip_count - pt->cip_result,
			pt->cip_result, rc);

	CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
		file_dentry(file)->d_name.name,
		pt->cip_iot == CIT_READ ? "read" : "write",
		pt->cip_result, rc);

	cl_env_put(env, &refcheck);
	/* partial success counts as success; caller checks cip_result */
	RETURN(pt->cip_result > 0 ? 0 : rc);
/*
 * Common read/write engine shared by the iter, splice (and, on older
 * kernels, aio) entry points.  Sets up a cl_io for the requested
 * direction (CIT_READ/CIT_WRITE), takes the per-inode range lock where
 * required, runs the cl_io loop, and restarts the whole IO when the
 * layout changed underneath (io->ci_need_restart).
 *
 * \param args  carries the IO subtype plus the iov_iter/kiocb (IO_NORMAL)
 *              or pipe/flags (splice) payload
 * \param iot   CIT_READ or CIT_WRITE
 * \param ppos  file position, updated on return
 * \param count number of bytes requested
 * \retval      bytes transferred if > 0, otherwise the cl_io error code
 */
1199 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1200 struct file *file, enum cl_io_type iot,
1201 loff_t *ppos, size_t count)
1203 struct range_lock range;
1204 struct vvp_io *vio = vvp_env_io(env);
1205 struct inode *inode = file_inode(file);
1206 struct ll_inode_info *lli = ll_i2info(inode);
1207 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1215 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1216 file_dentry(file)->d_name.name,
1217 iot == CIT_READ ? "read" : "write", pos, pos + count);
1220 io = vvp_env_thread_io(env);
1221 ll_io_init(io, file, iot);
1222 if (args->via_io_subtype == IO_NORMAL) {
/* snapshot the caller's iterator/iocb so the cl_io layer can consume
 * its own copy; the caller's iter is advanced explicitly below */
1223 io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
1224 io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
1229 if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
1230 bool range_locked = false;
/* O_APPEND writes do not know their final offset yet, so they must
 * lock to EOF; otherwise lock exactly the [pos, pos+count) range */
1232 if (file->f_flags & O_APPEND)
1233 range_lock_init(&range, 0, LUSTRE_EOF);
1235 range_lock_init(&range, pos, pos + count - 1);
1237 vio->vui_fd = LUSTRE_FPRIVATE(file);
1238 vio->vui_io_subtype = args->via_io_subtype;
1240 switch (vio->vui_io_subtype) {
1242 /* Direct IO reads must also take range lock,
1243 * or multiple reads will try to work on the same pages
1244 * See LU-6227 for details. */
1245 if (((iot == CIT_WRITE) ||
1246 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1247 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1248 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1250 rc = range_lock(&lli->lli_write_tree, &range);
1254 range_locked = true;
1258 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1259 vio->u.splice.vui_flags = args->u.splice.via_flags;
1262 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1266 ll_cl_add(file, env, io, LCC_RW);
/* parallel (pio) writes on a security-sensitive inode serialize on the
 * inode lock; lli_inode_locked remembers whether we took it */
1267 if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
1268 !lli->lli_inode_locked) {
1270 lli->lli_inode_locked = 1;
1272 rc = cl_io_loop(env, io);
1273 if (lli->lli_inode_locked) {
1274 lli->lli_inode_locked = 0;
1275 inode_unlock(inode);
1277 ll_cl_remove(file, env);
1280 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1282 range_unlock(&lli->lli_write_tree, &range);
1285 /* cl_io_rw_init() handled IO */
/* accumulate partial progress so a layout-change restart resumes
 * from where the previous pass stopped */
1289 if (io->ci_nob > 0) {
1290 result += io->ci_nob;
1291 count -= io->ci_nob;
1293 if (args->via_io_subtype == IO_NORMAL) {
1294 iov_iter_advance(args->u.normal.via_iter, io->ci_nob);
1296 /* CLIO is too complicated. See LU-11069. */
1297 if (cl_io_is_append(io))
1298 pos = io->u.ci_rw.rw_iocb.ki_pos;
1302 args->u.normal.via_iocb->ki_pos = pos;
1303 #ifdef HAVE_KIOCB_KI_LEFT
1304 args->u.normal.via_iocb->ki_left = count;
1305 #elif defined(HAVE_KI_NBYTES)
1306 args->u.normal.via_iocb->ki_nbytes = count;
1310 pos = io->u.ci_rw.rw_range.cir_pos;
1314 cl_io_fini(env, io);
/* restart the remaining bytes when the IO was interrupted by a
 * layout change (ci_need_restart) rather than a real error */
1316 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1318 "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1319 file_dentry(file)->d_name.name,
1320 iot == CIT_READ ? "read" : "write",
1321 pos, pos + count, result, rc);
1325 if (iot == CIT_READ) {
1327 ll_stats_ops_tally(ll_i2sbi(inode),
1328 LPROC_LL_READ_BYTES, result);
1330 } else if (iot == CIT_WRITE) {
1331 ll_stats_ops_tally(ll_i2sbi(inode),
1332 LPROC_LL_WRITE_BYTES, result);
1333 fd->fd_write_failed = false;
1334 } else if (result == 0 && rc == 0) {
/* short write bookkeeping: fd_write_failed feeds fsync/error
 * reporting; -ERESTARTSYS is a benign signal-driven restart */
1337 fd->fd_write_failed = true;
1339 fd->fd_write_failed = false;
1340 } else if (rc != -ERESTARTSYS) {
1341 fd->fd_write_failed = true;
1345 CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
1346 file_dentry(file)->d_name.name,
1347 iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);
1351 RETURN(result > 0 ? result : rc);
1355 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1356 * especially for small I/O.
1358 * To serve a read request, CLIO has to create and initialize a cl_io and
1359 * then request DLM lock. This has turned out to have significant overhead
1360 * and affects the performance of small I/O dramatically.
1362 * It's not necessary to create a cl_io for each I/O. Under the help of read
1363 * ahead, most of the pages being read are already in memory cache and we can
1364 * read those pages directly because if the pages exist, the corresponding DLM
1365 * lock must exist so that page content must be valid.
1367 * In fast read implementation, the llite speculatively finds and reads pages
1368 * in memory cache. There are three scenarios for fast read:
1369 * - If the page exists and is uptodate, kernel VM will provide the data and
1370 * CLIO won't be intervened;
1371 * - If the page was brought into memory by read ahead, it will be exported
1372 * and read ahead parameters will be updated;
1373 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1374 * it will go back and invoke normal read, i.e., a cl_io will be created
1375 * and DLM lock will be requested.
1377 * POSIX compliance: posix standard states that read is intended to be atomic.
1378 * Lustre read implementation is in line with Linux kernel read implementation
1379 * and neither of them complies with POSIX standard in this matter. Fast read
1380 * doesn't make the situation worse on single node but it may interleave write
1381 * results from multiple nodes due to short read handling in ll_file_aio_read().
1383 * \param env - lu_env
1384 * \param iocb - kiocb from kernel
1385 * \param iter - user space buffers where the data will be copied
1387 * \retval - number of bytes have been read, or error code if error occurred.
/*
 * Attempt a "fast read": serve the request straight from the page cache
 * via generic_file_read_iter() without building a cl_io, as described in
 * the block comment above.  Bails out (so the caller falls back to the
 * normal cl_io path) when fast read is disabled for the superblock, when
 * the file is opened O_DIRECT, or when the first page misses the cache
 * (signalled by -ENODATA from ll_readpage()).
 */
1390 ll_do_fast_read(const struct lu_env *env, struct kiocb *iocb,
1391 struct iov_iter *iter)
1395 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1398 /* NB: we can't do direct IO for fast read because it will need a lock
1399 * to make IO engine happy. */
1400 if (iocb->ki_filp->f_flags & O_DIRECT)
/* register this thread's IO context so ll_readpage() can find it */
1403 ll_cl_add(iocb->ki_filp, env, NULL, LCC_RW);
1404 result = generic_file_read_iter(iocb, iter);
1405 ll_cl_remove(iocb->ki_filp, env);
1407 /* If the first page is not in cache, generic_file_aio_read() will be
1408 * returned with -ENODATA.
1409 * See corresponding code in ll_readpage(). */
1410 if (result == -ENODATA)
1414 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1415 LPROC_LL_READ_BYTES, result);
1421 * Read from a file (through the page cache).
/*
 * read_iter file operation: try the fast (page-cache only) path first;
 * if bytes remain or fast read was not applicable, fall back to the
 * full cl_io path via ll_file_io_generic().  The two partial results
 * are combined before returning.
 */
1423 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1426 struct vvp_io_args *args;
1431 env = cl_env_get(&refcheck);
1433 return PTR_ERR(env);
1435 result = ll_do_fast_read(env, iocb, to);
/* fast read failed outright, or consumed the whole request */
1436 if (result < 0 || iov_iter_count(to) == 0)
1439 args = ll_env_args(env, IO_NORMAL);
1440 args->u.normal.via_iter = to;
1441 args->u.normal.via_iocb = iocb;
1443 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1444 &iocb->ki_pos, iov_iter_count(to));
1447 else if (result == 0)
1451 cl_env_put(env, &refcheck);
1456 * Write to a file (through the page cache).
/*
 * write_iter file operation: thin wrapper that packages the iov_iter and
 * kiocb into vvp_io_args and delegates to ll_file_io_generic(CIT_WRITE).
 */
1458 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1460 struct vvp_io_args *args;
1465 env = cl_env_get(&refcheck);
1467 return PTR_ERR(env);
1469 args = ll_env_args(env, IO_NORMAL);
1470 args->u.normal.via_iter = from;
1471 args->u.normal.via_iocb = iocb;
1473 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1474 &iocb->ki_pos, iov_iter_count(from));
1475 cl_env_put(env, &refcheck);
1479 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1481 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate a userspace iovec array and compute the total byte count
 * (copied from the kernel's __generic_file_aio_write_nolock, see the
 * comment above).  On an inaccessible segment, *nr_segs is truncated to
 * the usable prefix and the bad segment's length removed from *count.
 */
1483 static int ll_file_get_iov_count(const struct iovec *iov,
1484 unsigned long *nr_segs, size_t *count)
1489 for (seg = 0; seg < *nr_segs; seg++) {
1490 const struct iovec *iv = &iov[seg];
1493 * If any segment has a negative length, or the cumulative
1494 * length ever wraps negative then return -EINVAL.
/* (cnt|iov_len) < 0 catches both a negative segment and wraparound */
1497 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1499 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1504 cnt -= iv->iov_len; /* This segment is no good */
/*
 * Legacy aio_read entry (pre read_iter kernels): validate the iovec,
 * wrap it in an iov_iter (API shape depends on kernel version) and
 * forward to ll_file_read_iter().
 */
1511 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1512 unsigned long nr_segs, loff_t pos)
1519 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1523 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1524 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1525 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1526 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1527 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1529 result = ll_file_read_iter(iocb, &to);
/*
 * Legacy synchronous read(2) entry: build a single-segment iovec and a
 * sync kiocb (stored in the per-thread lu_env info to avoid a stack
 * allocation), run the aio path, then propagate the updated position
 * back to *ppos.
 */
1534 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1538 struct iovec iov = { .iov_base = buf, .iov_len = count };
1539 struct kiocb *kiocb;
1544 env = cl_env_get(&refcheck);
1546 RETURN(PTR_ERR(env));
1548 kiocb = &ll_env_info(env)->lti_kiocb;
1549 init_sync_kiocb(kiocb, file);
1550 kiocb->ki_pos = *ppos;
/* the remaining-bytes field moved/was renamed across kernel versions */
1551 #ifdef HAVE_KIOCB_KI_LEFT
1552 kiocb->ki_left = count;
1553 #elif defined(HAVE_KI_NBYTES)
1554 kiocb->ki_nbytes = count;
1557 result = ll_file_aio_read(kiocb, &iov, 1, kiocb->ki_pos);
1558 *ppos = kiocb->ki_pos;
1560 cl_env_put(env, &refcheck);
1565 * Write to a file (through the page cache).
/*
 * Legacy aio_write entry (pre write_iter kernels): mirror of
 * ll_file_aio_read() for the write direction — validate the iovec,
 * wrap it in an iov_iter and forward to ll_file_write_iter().
 */
1568 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1569 unsigned long nr_segs, loff_t pos)
1571 struct iov_iter from;
1576 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1580 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1581 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1582 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1583 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1584 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1586 result = ll_file_write_iter(iocb, &from);
/*
 * Legacy synchronous write(2) entry: mirror of ll_file_read() — build a
 * one-segment iovec and a sync kiocb from the env-cached storage, run
 * the aio write path, then copy the final position back to *ppos.
 */
1591 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1592 size_t count, loff_t *ppos)
1595 struct iovec iov = { .iov_base = (void __user *)buf,
1597 struct kiocb *kiocb;
1602 env = cl_env_get(&refcheck);
1604 RETURN(PTR_ERR(env));
1606 kiocb = &ll_env_info(env)->lti_kiocb;
1607 init_sync_kiocb(kiocb, file);
1608 kiocb->ki_pos = *ppos;
/* the remaining-bytes field moved/was renamed across kernel versions */
1609 #ifdef HAVE_KIOCB_KI_LEFT
1610 kiocb->ki_left = count;
1611 #elif defined(HAVE_KI_NBYTES)
1612 kiocb->ki_nbytes = count;
1615 result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos);
1616 *ppos = kiocb->ki_pos;
1618 cl_env_put(env, &refcheck);
1621 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1624 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read file operation: package the destination pipe and flags
 * into IO_SPLICE vvp_io_args and run the generic CIT_READ engine.
 */
1626 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1627 struct pipe_inode_info *pipe, size_t count,
1631 struct vvp_io_args *args;
1636 env = cl_env_get(&refcheck);
1638 RETURN(PTR_ERR(env));
1640 args = ll_env_args(env, IO_SPLICE);
1641 args->u.splice.via_pipe = pipe;
1642 args->u.splice.via_flags = flags;
1644 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1645 cl_env_put(env, &refcheck);
/*
 * Set striping (LOV EA) on a file by re-opening it by FID with the
 * layout attached to the open intent, under the inode size lock.  The
 * transient open handle obtained by the intent is released immediately.
 *
 * \param flags     open flags merged with MDS_OPEN_BY_FID
 * \param lum       user-supplied layout (already in kernel memory)
 * \param lum_size  byte size of \a lum
 */
1649 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1650 __u64 flags, struct lov_user_md *lum, int lum_size)
1652 struct lookup_intent oit = {
1654 .it_flags = flags | MDS_OPEN_BY_FID,
1659 ll_inode_size_lock(inode);
1660 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1662 GOTO(out_unlock, rc);
/* we only needed the open to carry the layout; drop the handle */
1664 ll_release_openhandle(dentry, &oit);
1667 ll_inode_size_unlock(inode);
1668 ll_intent_release(&oit);
/*
 * Fetch the LOV EA (striping layout) of \a filename from the MDS.
 *
 * On success *lmmp points into the reply buffer of *request (caller
 * must keep the request alive while using the layout and eventually
 * release it) and *lmm_size is the EA size.  On a big-endian host the
 * layout is byte-swapped in place to host order before returning, since
 * the MDS sends it little-endian.
 */
1673 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1674 struct lov_mds_md **lmmp, int *lmm_size,
1675 struct ptlrpc_request **request)
1677 struct ll_sb_info *sbi = ll_i2sbi(inode);
1678 struct mdt_body *body;
1679 struct lov_mds_md *lmm = NULL;
1680 struct ptlrpc_request *req = NULL;
1681 struct md_op_data *op_data;
1684 rc = ll_get_default_mdsize(sbi, &lmmsize);
1688 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1689 strlen(filename), lmmsize,
1690 LUSTRE_OPC_ANY, NULL);
1691 if (IS_ERR(op_data))
1692 RETURN(PTR_ERR(op_data));
1694 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1695 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1696 ll_finish_md_op_data(op_data);
1698 CDEBUG(D_INFO, "md_getattr_name failed "
1699 "on %s: rc %d\n", filename, rc);
1703 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1704 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1706 lmmsize = body->mbo_eadatasize;
/* no EA present (or zero-size reply) means the file has no layout */
1708 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1710 GOTO(out, rc = -ENODATA);
1713 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1714 LASSERT(lmm != NULL);
/* only plain v1/v3 and composite (PFL) layouts are understood here */
1716 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
1717 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
1718 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
1719 GOTO(out, rc = -EPROTO);
1722 * This is coming from the MDS, so is probably in
1723 * little endian. We convert it to host endian before
1724 * passing it to userspace.
/* swab needed only when host order differs from little endian */
1726 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1729 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
1730 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1731 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1732 if (le32_to_cpu(lmm->lmm_pattern) &
1733 LOV_PATTERN_F_RELEASED)
1737 /* if function called for directory - we should
1738 * avoid swabbing non-existent lsm objects */
1739 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1740 lustre_swab_lov_user_md_v1(
1741 (struct lov_user_md_v1 *)lmm);
1742 if (S_ISREG(body->mbo_mode))
1743 lustre_swab_lov_user_md_objects(
1744 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1746 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1747 lustre_swab_lov_user_md_v3(
1748 (struct lov_user_md_v3 *)lmm);
1749 if (S_ISREG(body->mbo_mode))
1750 lustre_swab_lov_user_md_objects(
1751 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1753 } else if (lmm->lmm_magic ==
1754 cpu_to_le32(LOV_MAGIC_COMP_V1)) {
1755 lustre_swab_lov_comp_md_v1(
1756 (struct lov_comp_md_v1 *)lmm);
1762 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: copy a lov_user_md (with one OST object
 * entry) from userspace and apply it via ll_lov_setstripe_ea_info().
 * Root-only (CFS_CAP_SYS_ADMIN) since MDS_OPEN_HAS_OBJS trusts the
 * caller-supplied object list.
 */
1770 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1771 struct lov_user_md *lump;
1772 int lum_size = sizeof(struct lov_user_md) +
1773 sizeof(struct lov_user_ost_data);
1777 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1780 OBD_ALLOC_LARGE(lump, lum_size);
1784 if (copy_from_user(lump, arg, lum_size))
1785 GOTO(out_lump, rc = -EFAULT);
1787 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
/* clear the O_LOV_DELAY_CREATE flag now that the layout is set */
1789 cl_lov_delay_create_clear(&file->f_flags);
1792 OBD_FREE_LARGE(lump, lum_size);
/*
 * Copy the file's striping layout to the userspace buffer \a lum of
 * \a size bytes; the heavy lifting is done by cl_object_getstripe().
 */
1796 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
1803 env = cl_env_get(&refcheck);
1805 RETURN(PTR_ERR(env));
1807 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
1808 cl_env_put(env, &refcheck);
1812 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1815 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1816 struct lov_user_md *klum;
1818 __u64 flags = FMODE_WRITE;
1821 rc = ll_copy_user_md(lum, &klum);
1826 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
1831 rc = put_user(0, &lum->lmm_stripe_count);
1835 rc = ll_layout_refresh(inode, &gen);
1839 rc = ll_file_getstripe(inode, arg, lum_size);
1841 cl_lov_delay_create_clear(&file->f_flags);
1844 OBD_FREE(klum, lum_size);
/*
 * LL_IOC_GROUP_LOCK handler: acquire a group lock with group id \a arg
 * on the whole file and record it in the per-open-file data (fd).
 * Fails with -EINVAL for gid 0, -EOPNOTSUPP for nolock mounts, and if a
 * group lock is already held on this fd.  For composite (PFL) files all
 * OST objects are instantiated first, because the group lock must cover
 * every object and PFL could otherwise add objects mid-IO.
 */
1849 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1851 struct ll_inode_info *lli = ll_i2info(inode);
1852 struct cl_object *obj = lli->lli_clob;
1853 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1854 struct ll_grouplock grouplock;
1859 CWARN("group id for group lock must not be 0\n");
1863 if (ll_file_nolock(file))
1864 RETURN(-EOPNOTSUPP);
1866 spin_lock(&lli->lli_lock);
1867 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1868 CWARN("group lock already existed with gid %lu\n",
1869 fd->fd_grouplock.lg_gid);
1870 spin_unlock(&lli->lli_lock);
1873 LASSERT(fd->fd_grouplock.lg_lock == NULL);
1874 spin_unlock(&lli->lli_lock);
1877 * XXX: group lock needs to protect all OST objects while PFL
1878 * can add new OST objects during the IO, so we'd instantiate
1879 * all OST objects before getting its group lock.
1884 struct cl_layout cl = {
1885 .cl_is_composite = false,
1888 env = cl_env_get(&refcheck);
1890 RETURN(PTR_ERR(env));
1892 rc = cl_object_layout_get(env, obj, &cl);
/* composite layout: force full instantiation via a write intent */
1893 if (!rc && cl.cl_is_composite)
1894 rc = ll_layout_write_intent(inode, 0, OBD_OBJECT_EOF);
1896 cl_env_put(env, &refcheck);
1901 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
1902 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* re-check under the lock: another thread may have raced us here */
1906 spin_lock(&lli->lli_lock);
1907 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1908 spin_unlock(&lli->lli_lock);
1909 CERROR("another thread just won the race\n");
1910 cl_put_grouplock(&grouplock);
1914 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1915 fd->fd_grouplock = grouplock;
1916 spin_unlock(&lli->lli_lock);
1918 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK handler: release the group lock with gid \a arg
 * held on this fd.  The lock is detached from the fd under lli_lock and
 * dropped outside the spinlock.  Fails if no group lock is held or the
 * gid does not match the one recorded at lock time.
 */
1922 static int ll_put_grouplock(struct inode *inode, struct file *file,
1925 struct ll_inode_info *lli = ll_i2info(inode);
1926 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1927 struct ll_grouplock grouplock;
1930 spin_lock(&lli->lli_lock);
1931 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1932 spin_unlock(&lli->lli_lock);
1933 CWARN("no group lock held\n");
1937 LASSERT(fd->fd_grouplock.lg_lock != NULL);
1939 if (fd->fd_grouplock.lg_gid != arg) {
1940 CWARN("group lock %lu doesn't match current id %lu\n",
1941 arg, fd->fd_grouplock.lg_gid);
1942 spin_unlock(&lli->lli_lock);
/* take a local copy so cl_put_grouplock() runs without the spinlock */
1946 grouplock = fd->fd_grouplock;
1947 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1948 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1949 spin_unlock(&lli->lli_lock);
1951 cl_put_grouplock(&grouplock);
1952 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1957 * Close inode open handle
1959 * \param dentry [in] dentry which contains the inode
1960 * \param it [in,out] intent which contains open info and result
1963 * \retval <0 failure
/*
 * Close the MDS open handle carried by an open intent (see the doc
 * comment above).  No-op for the fs root or when the intent performed
 * no open; otherwise an obd_client_handle is filled from the intent and
 * closed, and the intent's pinned open reply (DISP_ENQ_OPEN_REF) is
 * released since this close stands in for ll_file_open().
 */
1965 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1967 struct inode *inode = dentry->d_inode;
1968 struct obd_client_handle *och;
1974 /* Root ? Do nothing. */
1975 if (dentry->d_inode->i_sb->s_root == dentry)
1978 /* No open handle to close? Move away */
1979 if (!it_disposition(it, DISP_OPEN_OPEN))
1982 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1984 OBD_ALLOC(och, sizeof(*och));
1986 GOTO(out, rc = -ENOMEM);
1988 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1990 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
1992 /* this one is in place of ll_file_open */
1993 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1994 ptlrpc_req_finished(it->it_request);
1995 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2001 * Get size for inode for which FIEMAP mapping is requested.
2002 * Make the FIEMAP get_info call and returns the result.
2003 * \param fiemap kernel buffer to hold extents
2004 * \param num_bytes kernel buffer size
/*
 * Execute a FIEMAP request against the file's OST objects.  Rejects
 * unsupported flags (reporting the supported set back in fm_flags),
 * honours FIEMAP_FLAG_SYNC by flushing dirty pages first, glimpses the
 * size when unknown, and short-circuits with zero extents for an empty
 * file.  The actual mapping is done by cl_object_fiemap().
 */
2006 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2012 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2015 /* Checks for fiemap flags */
2016 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* tell the caller which flags we do support, per FIEMAP convention */
2017 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2021 /* Check for FIEMAP_FLAG_SYNC */
2022 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2023 rc = filemap_fdatawrite(inode->i_mapping);
2028 env = cl_env_get(&refcheck);
2030 RETURN(PTR_ERR(env));
/* size may be stale/unknown; glimpse from the OSTs before deciding */
2032 if (i_size_read(inode) == 0) {
2033 rc = ll_glimpse_size(inode);
2038 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2039 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2040 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2042 /* If filesize is 0, then there would be no objects for mapping */
2043 if (fmkey.lfik_oa.o_size == 0) {
2044 fiemap->fm_mapped_extents = 0;
2048 fmkey.lfik_fiemap = *fiemap;
2050 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2051 &fmkey, fiemap, &num_bytes);
2053 cl_env_put(env, &refcheck);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a pathname via the MDC.
 * Requires CAP_DAC_READ_SEARCH unless the mount allows user fid2path.
 * The user's getinfo_fid2path header is copied in, the filesystem root
 * FID is appended (for fileset-aware servers), the ioctl is forwarded
 * to the MDS, and the result (header + path) is copied back out.
 */
2057 int ll_fid2path(struct inode *inode, void __user *arg)
2059 struct obd_export *exp = ll_i2mdexp(inode);
2060 const struct getinfo_fid2path __user *gfin = arg;
2062 struct getinfo_fid2path *gfout;
2068 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2069 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2072 /* Only need to get the buflen */
2073 if (get_user(pathlen, &gfin->gf_pathlen))
/* bound the user-controlled allocation size */
2076 if (pathlen > PATH_MAX)
2079 outsize = sizeof(*gfout) + pathlen;
2080 OBD_ALLOC(gfout, outsize);
2084 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2085 GOTO(gf_free, rc = -EFAULT);
2086 /* append root FID after gfout to let MDT know the root FID so that it
2087 * can lookup the correct path, this is mainly for fileset.
2088 * old server without fileset mount support will ignore this. */
2089 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2091 /* Call mdc_iocontrol */
2092 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2096 if (copy_to_user(arg, gfout, outsize))
2100 OBD_FREE(gfout, outsize);
2105 * Read the data_version for inode.
2107 * This value is computed using stripe object version on OST.
2108 * Version is computed using server side locking.
2110 * @param flags if do sync on the OST side;
2112 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2113 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/*
 * Fetch the file's data version (see the doc comment above) by running
 * a CIT_DATA_VERSION cl_io.  \a flags selects the OST-side flush mode
 * (LL_DV_RD_FLUSH / LL_DV_WR_FLUSH).  Retries while the io reports
 * ci_need_restart (layout change during the operation).
 */
2115 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2117 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2125 /* If no file object initialized, we consider its version is 0. */
2131 env = cl_env_get(&refcheck);
2133 RETURN(PTR_ERR(env));
2135 io = vvp_env_thread_io(env);
2137 io->u.ci_data_version.dv_data_version = 0;
2138 io->u.ci_data_version.dv_flags = flags;
2141 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2142 result = cl_io_loop(env, io);
2144 result = io->ci_result;
2146 *data_version = io->u.ci_data_version.dv_data_version;
2148 cl_io_fini(env, io);
/* layout changed mid-operation: redo the whole data-version io */
2150 if (unlikely(io->ci_need_restart))
2153 cl_env_put(env, &refcheck);
2159 * Trigger a HSM release request for the provided inode.
/*
 * Trigger an HSM release for \a inode: take a write lease opened with
 * MDS_OPEN_RELEASE, flush and capture the latest data version and
 * [am]time, then close the handle with MDS_HSM_RELEASE so the MDT can
 * atomically verify the version and punch the OST objects.  The lease
 * lock handle itself is released inside mdc_hsm_release_pack().
 */
2161 int ll_hsm_release(struct inode *inode)
2164 struct obd_client_handle *och = NULL;
2165 __u64 data_version = 0;
2170 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2171 ll_get_fsname(inode->i_sb, NULL, 0),
2172 PFID(&ll_i2info(inode)->lli_fid));
2174 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2176 GOTO(out, rc = PTR_ERR(och));
2178 /* Grab latest data_version and [am]time values */
2179 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2183 env = cl_env_get(&refcheck);
2185 GOTO(out, rc = PTR_ERR(env));
2187 ll_merge_attr(env, inode);
2188 cl_env_put(env, &refcheck);
2190 /* Release the file.
2191 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2192 * we still need it to pack l_remote_handle to MDT. */
2193 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
/* on the error path, still close the lease if we obtained one */
2199 if (och != NULL && !IS_ERR(och)) /* close the file */
2200 ll_lease_close(och, inode, NULL);
/* Working state for ll_swap_layouts(): the two inodes whose layouts are
 * being exchanged (plus data-version bookkeeping fields not visible in
 * this extract). */
2205 struct ll_swap_stack {
2208 struct inode *inode1;
2209 struct inode *inode2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS handler: atomically exchange the layouts of
 * the two files via an MDT ioctl.  Inodes are ordered by FID to avoid
 * lock inversion; if the caller supplied a group id, group locks flush
 * dirty cache on both files for the duration; optional data-version
 * checks (SWAP_LAYOUTS_CHECK_DV1/2) abort with -EAGAIN if either file
 * changed since the caller sampled its version.
 */
2214 static int ll_swap_layouts(struct file *file1, struct file *file2,
2215 struct lustre_swap_layouts *lsl)
2217 struct mdc_swap_layouts msl;
2218 struct md_op_data *op_data;
2221 struct ll_swap_stack *llss = NULL;
2224 OBD_ALLOC_PTR(llss);
2228 llss->inode1 = file_inode(file1);
2229 llss->inode2 = file_inode(file2);
2231 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2235 /* we use 2 bool because it is easier to swap than 2 bits */
2236 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2237 llss->check_dv1 = true;
2239 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2240 llss->check_dv2 = true;
2242 /* we cannot use lsl->sl_dvX directly because we may swap them */
2243 llss->dv1 = lsl->sl_dv1;
2244 llss->dv2 = lsl->sl_dv2;
2246 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2247 if (rc == 0) /* same file, done! */
/* always operate in FID order so concurrent swaps can't deadlock */
2250 if (rc < 0) { /* sequentialize it */
2251 swap(llss->inode1, llss->inode2);
2253 swap(llss->dv1, llss->dv2);
2254 swap(llss->check_dv1, llss->check_dv2);
2258 if (gid != 0) { /* application asks to flush dirty cache */
2259 rc = ll_get_grouplock(llss->inode1, file1, gid);
2263 rc = ll_get_grouplock(llss->inode2, file2, gid);
2265 ll_put_grouplock(llss->inode1, file1, gid);
2270 /* ultimate check, before swapping the layouts we check if
2271 * dataversion has changed (if requested) */
2272 if (llss->check_dv1) {
2273 rc = ll_data_version(llss->inode1, &dv, 0);
2276 if (dv != llss->dv1)
2277 GOTO(putgl, rc = -EAGAIN);
2280 if (llss->check_dv2) {
2281 rc = ll_data_version(llss->inode2, &dv, 0);
2284 if (dv != llss->dv2)
2285 GOTO(putgl, rc = -EAGAIN);
2288 /* struct md_op_data is used to send the swap args to the mdt
2289 * only flags is missing, so we use struct mdc_swap_layouts
2290 * through the md_op_data->op_data */
2291 /* flags from user space have to be converted before they are send to
2292 * server, no flag is sent today, they are only used on the client */
2295 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2296 0, LUSTRE_OPC_ANY, &msl);
2297 if (IS_ERR(op_data))
2298 GOTO(free, rc = PTR_ERR(op_data));
2300 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2301 sizeof(*op_data), op_data, NULL);
2302 ll_finish_md_op_data(op_data);
2309 ll_put_grouplock(llss->inode2, file2, gid);
2310 ll_put_grouplock(llss->inode1, file1, gid);
/*
 * LL_IOC_HSM_STATE_SET handler: validate the requested HSM flag mask
 * and archive id (non-root callers may only touch HSM_USER_MASK bits),
 * then forward the state change to the MDT.
 */
2320 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2322 struct md_op_data *op_data;
2326 /* Detect out-of range masks */
2327 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2330 /* Non-root users are forbidden to set or clear flags which are
2331 * NOT defined in HSM_USER_MASK. */
2332 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2333 !cfs_capable(CFS_CAP_SYS_ADMIN))
2336 /* Detect out-of range archive id */
2337 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2338 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2341 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2342 LUSTRE_OPC_ANY, hss);
2343 if (IS_ERR(op_data))
2344 RETURN(PTR_ERR(op_data));
2346 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2347 sizeof(*op_data), op_data, NULL);
2349 ll_finish_md_op_data(op_data);
/*
 * LL_IOC_HSM_IMPORT handler: register a pre-existing archived copy as
 * the file's content.  Marks the file HS_ARCHIVED|HS_EXISTS|HS_RELEASED
 * in the given archive, then forces the user-supplied mode, ownership,
 * size and timestamps onto the (released) file via ll_setattr_raw().
 * Regular files only.
 */
2354 static int ll_hsm_import(struct inode *inode, struct file *file,
2355 struct hsm_user_import *hui)
2357 struct hsm_state_set *hss = NULL;
2358 struct iattr *attr = NULL;
2362 if (!S_ISREG(inode->i_mode))
2368 GOTO(out, rc = -ENOMEM);
2370 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2371 hss->hss_archive_id = hui->hui_archive_id;
2372 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2373 rc = ll_hsm_state_set(inode, hss);
2377 OBD_ALLOC_PTR(attr);
2379 GOTO(out, rc = -ENOMEM);
/* only permission bits from userspace; the file type is forced to
 * regular since import only supports regular files */
2381 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2382 attr->ia_mode |= S_IFREG;
2383 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2384 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2385 attr->ia_size = hui->hui_size;
2386 attr->ia_mtime.tv_sec = hui->hui_mtime;
2387 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2388 attr->ia_atime.tv_sec = hui->hui_atime;
2389 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2391 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2392 ATTR_UID | ATTR_GID |
2393 ATTR_MTIME | ATTR_MTIME_SET |
2394 ATTR_ATIME | ATTR_ATIME_SET;
2398 rc = ll_setattr_raw(file_dentry(file), attr, true);
2402 inode_unlock(inode);
/* Map kernel fmode_t read/write bits to the Lustre lease-type flags
 * (LL_LEASE_RDLCK / LL_LEASE_WRLCK). */
2414 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2416 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2417 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * LL_IOC_FUTIMES_3 handler: set atime, mtime AND ctime of a regular
 * file to caller-supplied values (unlike utimes(2), ctime is settable
 * too, hence the "_3").  Requires CAP_SYS_ADMIN; applied under the
 * inode lock via ll_setattr_raw().
 */
2420 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2422 struct inode *inode = file_inode(file);
2424 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2425 ATTR_MTIME | ATTR_MTIME_SET |
2426 ATTR_CTIME | ATTR_CTIME_SET,
2428 .tv_sec = lfu->lfu_atime_sec,
2429 .tv_nsec = lfu->lfu_atime_nsec,
2432 .tv_sec = lfu->lfu_mtime_sec,
2433 .tv_nsec = lfu->lfu_mtime_nsec,
2436 .tv_sec = lfu->lfu_ctime_sec,
2437 .tv_nsec = lfu->lfu_ctime_nsec,
2443 if (!capable(CAP_SYS_ADMIN))
2446 if (!S_ISREG(inode->i_mode))
2450 rc = ll_setattr_raw(file_dentry(file), &ia, false);
2451 inode_unlock(inode);
2457 * Give file access advice
2459 * The ladvise interface is similar to Linux fadvise() system call, except it
2460 * forwards the advices directly from Lustre client to server. The server side
2461 * codes will apply appropriate read-ahead and caching techniques for the
2462 * corresponding files.
2464 * A typical workload for ladvise is e.g. a bunch of different clients are
2465 * doing small random reads of a file, so prefetching pages into OSS cache
2466 * with big linear reads before the random IO is a net benefit. Fetching
2467 * all that data into each client cache with fadvise() may not be, due to
2468 * much more data being sent to the client.
/*
 * Forward one llapi_lu_ladvise advice (fadvise-like hint, see the block
 * comment above) to the servers by running a CIT_LADVISE cl_io over the
 * [lla_start, lla_end] range.
 */
2470 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2471 struct llapi_lu_ladvise *ladvise)
2475 struct cl_ladvise_io *lio;
2480 env = cl_env_get(&refcheck);
2482 RETURN(PTR_ERR(env));
2484 io = vvp_env_thread_io(env);
2485 io->ci_obj = ll_i2info(inode)->lli_clob;
2487 /* initialize parameters for ladvise */
2488 lio = &io->u.ci_ladvise;
2489 lio->li_start = ladvise->lla_start;
2490 lio->li_end = ladvise->lla_end;
2491 lio->li_fid = ll_inode2fid(inode);
2492 lio->li_advice = ladvise->lla_advice;
2493 lio->li_flags = flags;
2495 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2496 rc = cl_io_loop(env, io);
2500 cl_io_fini(env, io);
2501 cl_env_put(env, &refcheck);
/*
 * FS_IOC_FSGETXATTR handler: return the inode's extended flags and
 * project id to userspace in a struct fsxattr.
 */
2505 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
2508 struct fsxattr fsxattr;
2510 if (copy_from_user(&fsxattr,
2511 (const struct fsxattr __user *)arg,
2515 fsxattr.fsx_xflags = ll_inode_to_ext_flags(inode->i_flags);
2516 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
2517 if (copy_to_user((struct fsxattr __user *)arg,
2518 &fsxattr, sizeof(fsxattr)))
/*
 * FS_IOC_FSSETXATTR handler: set extended flags and project id.  Only
 * root (CFS_CAP_SYS_ADMIN) may change the project id.  The change is
 * first recorded on the MDT via md_setattr(); when the file has OST
 * objects (lli_clob set), the flags are also pushed to the OSTs through
 * cl_setattr_ost().
 */
2524 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
2528 struct md_op_data *op_data;
2529 struct ptlrpc_request *req = NULL;
2531 struct fsxattr fsxattr;
2532 struct cl_object *obj;
2534 /* only root could change project ID */
2535 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2538 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2539 LUSTRE_OPC_ANY, NULL);
2540 if (IS_ERR(op_data))
2541 RETURN(PTR_ERR(op_data));
2543 if (copy_from_user(&fsxattr,
2544 (const struct fsxattr __user *)arg,
2546 GOTO(out_fsxattr1, rc = -EFAULT);
2548 op_data->op_attr_flags = fsxattr.fsx_xflags;
2549 op_data->op_projid = fsxattr.fsx_projid;
2550 op_data->op_attr.ia_valid |= (MDS_ATTR_PROJID | ATTR_ATTR_FLAG);
2551 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
2553 ptlrpc_req_finished(req);
2555 obj = ll_i2info(inode)->lli_clob;
/* mirror the new flags into the VFS inode and onto the OST objects */
2559 inode->i_flags = ll_ext_to_inode_flags(fsxattr.fsx_xflags);
2560 OBD_ALLOC_PTR(attr);
2562 GOTO(out_fsxattr1, rc = -ENOMEM);
2563 attr->ia_valid = ATTR_ATTR_FLAG;
2564 rc = cl_setattr_ost(obj, attr, fsxattr.fsx_xflags);
2569 ll_finish_md_op_data(op_data);
2576 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2578 struct inode *inode = file_inode(file);
2579 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2583 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2584 PFID(ll_inode2fid(inode)), inode, cmd);
2585 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2587 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2588 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2592 case LL_IOC_GETFLAGS:
2593 /* Get the current value of the file flags */
2594 return put_user(fd->fd_flags, (int __user *)arg);
2595 case LL_IOC_SETFLAGS:
2596 case LL_IOC_CLRFLAGS:
2597 /* Set or clear specific file flags */
2598 /* XXX This probably needs checks to ensure the flags are
2599 * not abused, and to handle any flag side effects.
2601 if (get_user(flags, (int __user *) arg))
2604 if (cmd == LL_IOC_SETFLAGS) {
2605 if ((flags & LL_FILE_IGNORE_LOCK) &&
2606 !(file->f_flags & O_DIRECT)) {
2607 CERROR("%s: unable to disable locking on "
2608 "non-O_DIRECT file\n", current->comm);
2612 fd->fd_flags |= flags;
2614 fd->fd_flags &= ~flags;
2617 case LL_IOC_LOV_SETSTRIPE:
2618 case LL_IOC_LOV_SETSTRIPE_NEW:
2619 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
2620 case LL_IOC_LOV_SETEA:
2621 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
2622 case LL_IOC_LOV_SWAP_LAYOUTS: {
2624 struct lustre_swap_layouts lsl;
2626 if (copy_from_user(&lsl, (char __user *)arg,
2627 sizeof(struct lustre_swap_layouts)))
2630 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
2633 file2 = fget(lsl.sl_fd);
2637 /* O_WRONLY or O_RDWR */
2638 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
2639 GOTO(out, rc = -EPERM);
2641 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
2642 struct inode *inode2;
2643 struct ll_inode_info *lli;
2644 struct obd_client_handle *och = NULL;
2646 if (lsl.sl_flags != SWAP_LAYOUTS_CLOSE)
2647 GOTO(out, rc = -EINVAL);
2649 lli = ll_i2info(inode);
2650 mutex_lock(&lli->lli_och_mutex);
2651 if (fd->fd_lease_och != NULL) {
2652 och = fd->fd_lease_och;
2653 fd->fd_lease_och = NULL;
2655 mutex_unlock(&lli->lli_och_mutex);
2657 GOTO(out, rc = -ENOLCK);
2658 inode2 = file_inode(file2);
2659 rc = ll_swap_layouts_close(och, inode, inode2);
2661 rc = ll_swap_layouts(file, file2, &lsl);
2667 case LL_IOC_LOV_GETSTRIPE:
2668 case LL_IOC_LOV_GETSTRIPE_NEW:
2669 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
2670 case FSFILT_IOC_GETFLAGS:
2671 case FSFILT_IOC_SETFLAGS:
2672 RETURN(ll_iocontrol(inode, file, cmd, arg));
2673 case FSFILT_IOC_GETVERSION_OLD:
2674 case FSFILT_IOC_GETVERSION:
2675 RETURN(put_user(inode->i_generation, (int __user *)arg));
2676 case LL_IOC_GROUP_LOCK:
2677 RETURN(ll_get_grouplock(inode, file, arg));
2678 case LL_IOC_GROUP_UNLOCK:
2679 RETURN(ll_put_grouplock(inode, file, arg));
2680 case IOC_OBD_STATFS:
2681 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2683 /* We need to special case any other ioctls we want to handle,
2684 * to send them to the MDS/OST as appropriate and to properly
2685 * network encode the arg field.
2686 case FSFILT_IOC_SETVERSION_OLD:
2687 case FSFILT_IOC_SETVERSION:
2689 case LL_IOC_FLUSHCTX:
2690 RETURN(ll_flush_ctx(inode));
2691 case LL_IOC_PATH2FID: {
2692 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2693 sizeof(struct lu_fid)))
2698 case LL_IOC_GETPARENT:
2699 RETURN(ll_getparent(file, (struct getparent __user *)arg));
2701 case OBD_IOC_FID2PATH:
2702 RETURN(ll_fid2path(inode, (void __user *)arg));
2703 case LL_IOC_DATA_VERSION: {
2704 struct ioc_data_version idv;
2707 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
2710 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2711 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2714 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2720 case LL_IOC_GET_MDTIDX: {
2723 mdtidx = ll_get_mdt_idx(inode);
2727 if (put_user((int)mdtidx, (int __user *)arg))
2732 case OBD_IOC_GETDTNAME:
2733 case OBD_IOC_GETMDNAME:
2734 RETURN(ll_get_obd_name(inode, cmd, arg));
2735 case LL_IOC_HSM_STATE_GET: {
2736 struct md_op_data *op_data;
2737 struct hsm_user_state *hus;
2744 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2745 LUSTRE_OPC_ANY, hus);
2746 if (IS_ERR(op_data)) {
2748 RETURN(PTR_ERR(op_data));
2751 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2754 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2757 ll_finish_md_op_data(op_data);
2761 case LL_IOC_HSM_STATE_SET: {
2762 struct hsm_state_set *hss;
2769 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2774 rc = ll_hsm_state_set(inode, hss);
2779 case LL_IOC_HSM_ACTION: {
2780 struct md_op_data *op_data;
2781 struct hsm_current_action *hca;
2788 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2789 LUSTRE_OPC_ANY, hca);
2790 if (IS_ERR(op_data)) {
2792 RETURN(PTR_ERR(op_data));
2795 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2798 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2801 ll_finish_md_op_data(op_data);
2805 case LL_IOC_SET_LEASE: {
2806 struct ll_inode_info *lli = ll_i2info(inode);
2807 struct obd_client_handle *och = NULL;
2812 case LL_LEASE_WRLCK:
2813 if (!(file->f_mode & FMODE_WRITE))
2815 fmode = FMODE_WRITE;
2817 case LL_LEASE_RDLCK:
2818 if (!(file->f_mode & FMODE_READ))
2822 case LL_LEASE_UNLCK:
2823 mutex_lock(&lli->lli_och_mutex);
2824 if (fd->fd_lease_och != NULL) {
2825 och = fd->fd_lease_och;
2826 fd->fd_lease_och = NULL;
2828 mutex_unlock(&lli->lli_och_mutex);
2833 fmode = och->och_flags;
2834 rc = ll_lease_close(och, inode, &lease_broken);
2838 rc = ll_lease_och_release(inode, file);
2845 RETURN(ll_lease_type_from_fmode(fmode));
2850 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2852 /* apply for lease */
2853 och = ll_lease_open(inode, file, fmode, 0);
2855 RETURN(PTR_ERR(och));
2858 mutex_lock(&lli->lli_och_mutex);
2859 if (fd->fd_lease_och == NULL) {
2860 fd->fd_lease_och = och;
2863 mutex_unlock(&lli->lli_och_mutex);
2865 /* impossible now that only excl is supported for now */
2866 ll_lease_close(och, inode, &lease_broken);
2871 case LL_IOC_GET_LEASE: {
2872 struct ll_inode_info *lli = ll_i2info(inode);
2873 struct ldlm_lock *lock = NULL;
2876 mutex_lock(&lli->lli_och_mutex);
2877 if (fd->fd_lease_och != NULL) {
2878 struct obd_client_handle *och = fd->fd_lease_och;
2880 lock = ldlm_handle2lock(&och->och_lease_handle);
2882 lock_res_and_lock(lock);
2883 if (!ldlm_is_cancel(lock))
2884 fmode = och->och_flags;
2886 unlock_res_and_lock(lock);
2887 LDLM_LOCK_PUT(lock);
2890 mutex_unlock(&lli->lli_och_mutex);
2892 RETURN(ll_lease_type_from_fmode(fmode));
2894 case LL_IOC_HSM_IMPORT: {
2895 struct hsm_user_import *hui;
2901 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
2906 rc = ll_hsm_import(inode, file, hui);
2911 case LL_IOC_FUTIMES_3: {
2912 struct ll_futimes_3 lfu;
2914 if (copy_from_user(&lfu,
2915 (const struct ll_futimes_3 __user *)arg,
2919 RETURN(ll_file_futimes_3(file, &lfu));
2921 case LL_IOC_LADVISE: {
2922 struct llapi_ladvise_hdr *ladvise_hdr;
2925 int alloc_size = sizeof(*ladvise_hdr);
2928 OBD_ALLOC_PTR(ladvise_hdr);
2929 if (ladvise_hdr == NULL)
2932 if (copy_from_user(ladvise_hdr,
2933 (const struct llapi_ladvise_hdr __user *)arg,
2935 GOTO(out_ladvise, rc = -EFAULT);
2937 if (ladvise_hdr->lah_magic != LADVISE_MAGIC ||
2938 ladvise_hdr->lah_count < 1)
2939 GOTO(out_ladvise, rc = -EINVAL);
2941 num_advise = ladvise_hdr->lah_count;
2942 if (num_advise >= LAH_COUNT_MAX)
2943 GOTO(out_ladvise, rc = -EFBIG);
2945 OBD_FREE_PTR(ladvise_hdr);
2946 alloc_size = offsetof(typeof(*ladvise_hdr),
2947 lah_advise[num_advise]);
2948 OBD_ALLOC(ladvise_hdr, alloc_size);
2949 if (ladvise_hdr == NULL)
2953 * TODO: submit multiple advices to one server in a single RPC
2955 if (copy_from_user(ladvise_hdr,
2956 (const struct llapi_ladvise_hdr __user *)arg,
2958 GOTO(out_ladvise, rc = -EFAULT);
2960 for (i = 0; i < num_advise; i++) {
2961 rc = ll_ladvise(inode, file, ladvise_hdr->lah_flags,
2962 &ladvise_hdr->lah_advise[i]);
2968 OBD_FREE(ladvise_hdr, alloc_size);
2971 case LL_IOC_FSGETXATTR:
2972 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
2973 case LL_IOC_FSSETXATTR:
2974 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
2976 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
2978 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2979 (void __user *)arg));
2983 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * llseek_execute() - compat helper, built only when the kernel lacks
 * generic_file_llseek_size().  Visible logic: reject a negative @offset
 * unless FMODE_UNSIGNED_OFFSET is set, reject offsets beyond @maxsize,
 * and when the position actually changes store it in f_pos and reset
 * f_version.
 * NOTE(review): this excerpt has gaps in the original line numbering --
 * the return statements and closing brace are missing; verify against
 * the complete ll_file.c.
 */
2984 static inline loff_t
2985 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2987 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2989 if (offset > maxsize)
2992 if (offset != file->f_pos) {
2993 file->f_pos = offset;
2994 file->f_version = 0;
/*
 * generic_file_llseek_size() - compat reimplementation for kernels
 * without it.  Visible logic: SEEK_CUR relative moves are applied via
 * llseek_execute() while holding the inode lock; SEEK_DATA treats the
 * whole file as data and SEEK_HOLE reports a virtual hole at EOF,
 * both bounded by @maxsize/@eof.
 * NOTE(review): lines are missing from this excerpt (numbering gaps) --
 * the switch statement and several returns are not visible.
 */
3000 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3001 loff_t maxsize, loff_t eof)
3003 struct inode *inode = file_inode(file);
3011 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3012 * position-querying operation. Avoid rewriting the "same"
3013 * f_pos value back to the file because a concurrent read(),
3014 * write() or lseek() might have altered it
3019 * f_lock protects against read/modify/write race with other
3020 * SEEK_CURs. Note that parallel writes and reads behave
3024 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3025 inode_unlock(inode);
3029 * In the generic case the entire file is data, so as long as
3030 * offset isn't at the end of the file then the offset is data.
3037 * There is a virtual hole at the end of the file, so as long as
3038 * offset isn't i_size or larger, return i_size.
3046 return llseek_execute(file, offset, maxsize);
/*
 * ll_file_seek() - .llseek entry point.  For SEEK_END/SEEK_HOLE/
 * SEEK_DATA it first refreshes the cached size from the OSTs via
 * ll_glimpse_size(), then delegates to ll_generic_file_llseek_size()
 * bounded by ll_file_maxbytes().
 * NOTE(review): lines missing from this excerpt (numbering gaps).
 */
3050 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3052 struct inode *inode = file_inode(file);
3053 loff_t retval, eof = 0;
/* retval here is only a preview of the target used in the trace line */
3056 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3057 (origin == SEEK_CUR) ? file->f_pos : 0);
3058 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3059 PFID(ll_inode2fid(inode)), inode, retval, retval,
3061 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3063 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3064 retval = ll_glimpse_size(inode);
3067 eof = i_size_read(inode);
3070 retval = ll_generic_file_llseek_size(file, offset, origin,
3071 ll_file_maxbytes(inode), eof);
/*
 * ll_flush() - .flush handler (run on every close of an fd).  Harvests
 * async writeback errors recorded in lli_async_rc and in the cl object
 * and maps them to a single -EIO; if this fd already reported the write
 * failure (fd_write_failed) the error is not reported again.
 * NOTE(review): lines missing from this excerpt (numbering gaps).
 */
3075 static int ll_flush(struct file *file, fl_owner_t id)
3077 struct inode *inode = file_inode(file);
3078 struct ll_inode_info *lli = ll_i2info(inode);
3079 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3082 LASSERT(!S_ISDIR(inode->i_mode));
3084 /* catch async errors that were recorded back when async writeback
3085 * failed for pages in this mapping. */
3086 rc = lli->lli_async_rc;
3087 lli->lli_async_rc = 0;
3088 if (lli->lli_clob != NULL) {
3089 err = lov_read_and_clear_async_rc(lli->lli_clob);
3094 /* The application has been told write failure already.
3095 * Do not report failure again. */
3096 if (fd->fd_write_failed)
3098 return rc ? -EIO : 0;
3102 * Called to make sure a portion of file has been written out.
3103 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
3105 * Return how many pages have been written.
/*
 * NOTE(review): validates @mode, builds a CIT_FSYNC cl_io over
 * [start, end] and runs cl_io_loop(); on success the result is
 * fio->fi_nr_written.  Lines are missing from this excerpt (numbering
 * gaps) -- e.g. the fi_end assignment and the RETURN path.
 */
3107 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3108 enum cl_fsync_mode mode, int ignore_layout)
3112 struct cl_fsync_io *fio;
3117 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3118 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3121 env = cl_env_get(&refcheck);
3123 RETURN(PTR_ERR(env));
3125 io = vvp_env_thread_io(env);
3126 io->ci_obj = ll_i2info(inode)->lli_clob;
3127 io->ci_ignore_layout = ignore_layout;
3129 /* initialize parameters for sync */
3130 fio = &io->u.ci_fsync;
3131 fio->fi_start = start;
3133 fio->fi_fid = ll_inode2fid(inode);
3134 fio->fi_mode = mode;
3135 fio->fi_nr_written = 0;
3137 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3138 result = cl_io_loop(env, io);
3140 result = io->ci_result;
3142 result = fio->fi_nr_written;
3143 cl_io_fini(env, io);
3144 cl_env_put(env, &refcheck);
3150 * When dentry is provided (the 'else' case), file_dentry() may be
3151 * null and dentry must be used directly rather than pulled from
3152 * file_dentry() as is done otherwise.
/*
 * NOTE(review): ll_fsync() has three signature variants selected by
 * kernel-API config (4-arg range fsync, 2-arg, dentry-based).  Visible
 * logic: flush/wait on dirty pages, harvest async error codes, issue an
 * MDS-side md_fsync(), and for regular files an OST-side
 * cl_sync_file_range(CL_FSYNC_ALL); fd_write_failed tracks whether the
 * failure was reported to the application.  Lines are missing from this
 * excerpt (numbering gaps).
 */
3155 #ifdef HAVE_FILE_FSYNC_4ARGS
3156 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3158 struct dentry *dentry = file_dentry(file);
3160 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3161 int ll_fsync(struct file *file, int datasync)
3163 struct dentry *dentry = file_dentry(file);
3165 loff_t end = LLONG_MAX;
3167 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3170 loff_t end = LLONG_MAX;
3172 struct inode *inode = dentry->d_inode;
3173 struct ll_inode_info *lli = ll_i2info(inode);
3174 struct ptlrpc_request *req;
3178 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3179 PFID(ll_inode2fid(inode)), inode);
3180 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3182 #ifdef HAVE_FILE_FSYNC_4ARGS
3183 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3184 lock_inode = !lli->lli_inode_locked;
3188 /* fsync's caller has already called _fdata{sync,write}, we want
3189 * that IO to finish before calling the osc and mdc sync methods */
3190 rc = filemap_fdatawait(inode->i_mapping);
3193 /* catch async errors that were recorded back when async writeback
3194 * failed for pages in this mapping. */
3195 if (!S_ISDIR(inode->i_mode)) {
3196 err = lli->lli_async_rc;
3197 lli->lli_async_rc = 0;
3200 if (lli->lli_clob != NULL) {
3201 err = lov_read_and_clear_async_rc(lli->lli_clob);
3207 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3211 ptlrpc_req_finished(req);
3213 if (S_ISREG(inode->i_mode)) {
3214 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3216 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3217 if (rc == 0 && err < 0)
3220 fd->fd_write_failed = true;
3222 fd->fd_write_failed = false;
3225 #ifdef HAVE_FILE_FSYNC_4ARGS
3227 inode_unlock(inode);
/*
 * ll_file_flock() - flock/POSIX lock entry point.  Translates a kernel
 * struct file_lock into an LDLM_FLOCK enqueue: owner comes from fl_file
 * (BSD flock) or fl_owner (POSIX), lock mode maps to LCK_PR/LCK_PW, and
 * F_UNLCK is expressed as an LCK_NL enqueue.  After md_enqueue() the
 * result is mirrored into the local lock tables via
 * locks_lock_file_wait()/posix_lock_file_wait()/flock_lock_file_wait();
 * if that local step fails the DLM lock is rolled back with an LCK_NL
 * enqueue.
 * NOTE(review): lines missing from this excerpt (numbering gaps) --
 * e.g. the switch headers for fl_type/cmd and several RETURNs.
 */
3233 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3235 struct inode *inode = file_inode(file);
3236 struct ll_sb_info *sbi = ll_i2sbi(inode);
3237 struct ldlm_enqueue_info einfo = {
3238 .ei_type = LDLM_FLOCK,
3239 .ei_cb_cp = ldlm_flock_completion_ast,
3240 .ei_cbdata = file_lock,
3242 struct md_op_data *op_data;
3243 struct lustre_handle lockh = { 0 };
3244 union ldlm_policy_data flock = { { 0 } };
3245 int fl_type = file_lock->fl_type;
3251 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3252 PFID(ll_inode2fid(inode)), file_lock);
3254 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3256 if (file_lock->fl_flags & FL_FLOCK) {
3257 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3258 /* flocks are whole-file locks */
3259 flock.l_flock.end = OFFSET_MAX;
3260 /* For flocks owner is determined by the local file desctiptor*/
3261 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3262 } else if (file_lock->fl_flags & FL_POSIX) {
3263 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3264 flock.l_flock.start = file_lock->fl_start;
3265 flock.l_flock.end = file_lock->fl_end;
3269 flock.l_flock.pid = file_lock->fl_pid;
3271 /* Somewhat ugly workaround for svc lockd.
3272 * lockd installs custom fl_lmops->lm_compare_owner that checks
3273 * for the fl_owner to be the same (which it always is on local node
3274 * I guess between lockd processes) and then compares pid.
3275 * As such we assign pid to the owner field to make it all work,
3276 * conflict with normal locks is unlikely since pid space and
3277 * pointer space for current->files are not intersecting */
3278 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3279 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
3283 einfo.ei_mode = LCK_PR;
3286 /* An unlock request may or may not have any relation to
3287 * existing locks so we may not be able to pass a lock handle
3288 * via a normal ldlm_lock_cancel() request. The request may even
3289 * unlock a byte range in the middle of an existing lock. In
3290 * order to process an unlock request we need all of the same
3291 * information that is given with a normal read or write record
3292 * lock request. To avoid creating another ldlm unlock (cancel)
3293 * message we'll treat a LCK_NL flock request as an unlock. */
3294 einfo.ei_mode = LCK_NL;
3297 einfo.ei_mode = LCK_PW;
3300 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
3315 flags = LDLM_FL_BLOCK_NOWAIT;
3321 flags = LDLM_FL_TEST_LOCK;
3324 CERROR("unknown fcntl lock command: %d\n", cmd);
3328 /* Save the old mode so that if the mode in the lock changes we
3329 * can decrement the appropriate reader or writer refcount. */
3330 file_lock->fl_type = einfo.ei_mode;
3332 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3333 LUSTRE_OPC_ANY, NULL);
3334 if (IS_ERR(op_data))
3335 RETURN(PTR_ERR(op_data));
3337 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
3338 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
3339 flock.l_flock.pid, flags, einfo.ei_mode,
3340 flock.l_flock.start, flock.l_flock.end);
3342 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
3345 /* Restore the file lock type if not TEST lock. */
3346 if (!(flags & LDLM_FL_TEST_LOCK))
3347 file_lock->fl_type = fl_type;
3349 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
3350 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
3351 !(flags & LDLM_FL_TEST_LOCK))
3352 rc2 = locks_lock_file_wait(file, file_lock);
3354 if ((file_lock->fl_flags & FL_FLOCK) &&
3355 (rc == 0 || file_lock->fl_type == F_UNLCK))
3356 rc2 = flock_lock_file_wait(file, file_lock);
3357 if ((file_lock->fl_flags & FL_POSIX) &&
3358 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3359 !(flags & LDLM_FL_TEST_LOCK))
3360 rc2 = posix_lock_file_wait(file, file_lock);
3361 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
3363 if (rc2 && file_lock->fl_type != F_UNLCK) {
/* local bookkeeping failed: undo the server lock with an NL enqueue */
3364 einfo.ei_mode = LCK_NL;
3365 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
3370 ll_finish_md_op_data(op_data);
/*
 * ll_get_fid_by_name() - resolve @name under @parent to a FID via an
 * MDS getattr-by-name RPC; on success stores the FID in *@fid and, when
 * @inode is non-NULL, instantiates the inode via ll_prep_inode().
 * NOTE(review): lines missing from this excerpt (numbering gaps).
 */
3375 int ll_get_fid_by_name(struct inode *parent, const char *name,
3376 int namelen, struct lu_fid *fid,
3377 struct inode **inode)
3379 struct md_op_data *op_data = NULL;
3380 struct mdt_body *body;
3381 struct ptlrpc_request *req;
3385 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3386 LUSTRE_OPC_ANY, NULL);
3387 if (IS_ERR(op_data))
3388 RETURN(PTR_ERR(op_data));
3390 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
3391 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3392 ll_finish_md_op_data(op_data);
3396 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3398 GOTO(out_req, rc = -EFAULT);
3400 *fid = body->mbo_fid1;
3403 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
3405 ptlrpc_req_finished(req);
/*
 * ll_migrate() - move the entry @name under @parent to MDT @mdtidx.
 * Looks up the child (dcache first, then by-name RPC), refuses to
 * migrate the fs root, and for regular files takes a write lease plus
 * data version so the server can detect concurrent modification; the
 * actual move is an MDS rename with CLI_MIGRATE/MDS_RENAME_MIGRATE.
 * Retries on -EAGAIN for regular files (layout changed under us).
 * NOTE(review): lines missing from this excerpt (numbering gaps) --
 * e.g. the qstr name/len setup and several error-path labels.
 */
3409 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3410 const char *name, int namelen)
3412 struct dentry *dchild = NULL;
3413 struct inode *child_inode = NULL;
3414 struct md_op_data *op_data;
3415 struct ptlrpc_request *request = NULL;
3416 struct obd_client_handle *och = NULL;
3418 struct mdt_body *body;
3420 __u64 data_version = 0;
3423 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3424 name, PFID(ll_inode2fid(parent)), mdtidx);
3426 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3427 0, LUSTRE_OPC_ANY, NULL);
3428 if (IS_ERR(op_data))
3429 RETURN(PTR_ERR(op_data));
3431 /* Get child FID first */
3432 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
3435 dchild = d_lookup(file_dentry(file), &qstr);
3436 if (dchild != NULL) {
3437 if (dchild->d_inode != NULL)
3438 child_inode = igrab(dchild->d_inode);
3442 if (child_inode == NULL) {
3443 rc = ll_get_fid_by_name(parent, name, namelen,
3444 &op_data->op_fid3, &child_inode);
3449 if (child_inode == NULL)
3450 GOTO(out_free, rc = -EINVAL);
3453 * lfs migrate command needs to be blocked on the client
3454 * by checking the migrate FID against the FID of the
3457 if (child_inode == parent->i_sb->s_root->d_inode)
3458 GOTO(out_iput, rc = -EINVAL);
3460 inode_lock(child_inode);
3461 op_data->op_fid3 = *ll_inode2fid(child_inode);
3462 if (!fid_is_sane(&op_data->op_fid3)) {
3463 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
3464 ll_get_fsname(parent->i_sb, NULL, 0), name,
3465 PFID(&op_data->op_fid3));
3466 GOTO(out_unlock, rc = -EINVAL);
3469 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3471 GOTO(out_unlock, rc);
3474 CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name,
3475 PFID(&op_data->op_fid3), mdtidx);
3476 GOTO(out_unlock, rc = 0);
3479 if (S_ISREG(child_inode->i_mode)) {
/* write lease + data version lets the MDT reject racing writers */
3480 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
3484 GOTO(out_unlock, rc);
3487 rc = ll_data_version(child_inode, &data_version,
3490 GOTO(out_close, rc);
3492 op_data->op_handle = och->och_fh;
3493 op_data->op_data = och->och_mod;
3494 op_data->op_data_version = data_version;
3495 op_data->op_lease_handle = och->och_lease_handle;
3496 op_data->op_bias |= MDS_RENAME_MIGRATE;
3499 op_data->op_mds = mdtidx;
3500 op_data->op_cli_flags = CLI_MIGRATE;
3501 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3502 namelen, name, namelen, &request);
3504 LASSERT(request != NULL);
3505 ll_update_times(request, parent);
3507 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
3508 LASSERT(body != NULL);
3510 /* If the server does release layout lock, then we cleanup
3511 * the client och here, otherwise release it in out_close: */
3513 body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
3514 obd_mod_put(och->och_mod);
3515 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
3517 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
3523 if (request != NULL) {
3524 ptlrpc_req_finished(request);
3528 /* Try again if the file layout has changed. */
3529 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
3533 if (och != NULL) /* close the file */
3534 ll_lease_close(och, child_inode, NULL);
3536 clear_nlink(child_inode);
3538 inode_unlock(child_inode);
3542 ll_finish_md_op_data(op_data);
/*
 * ll_file_noflock() - lock entry point for "-o noflock" mounts.
 * NOTE(review): the body was dropped by this extraction; presumably it
 * rejects the request (e.g. -ENOSYS) -- confirm against the full file.
 */
3547 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3555 * test if some locks matching bits and l_req_mode are acquired
3556 * - bits can be in different locks
3557 * - if found clear the common lock bits in *bits
3558 * - the bits not found, are kept in *bits
3560 * \param bits [IN] searched lock bits [IN]
3561 * \param l_req_mode [IN] searched lock mode
3562 * \retval boolean, true iff all bits are found
/*
 * NOTE(review): iterates each inodebit up to MDS_INODELOCK_MAXSHIFT and
 * probes the local namespace with a non-intrusive md_lock_match()
 * (LDLM_FL_TEST_LOCK).  Lines missing from this excerpt (numbering
 * gaps) -- e.g. declarations and the final RETURN.
 */
3564 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
3566 struct lustre_handle lockh;
3567 union ldlm_policy_data policy;
3568 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
3569 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
3578 fid = &ll_i2info(inode)->lli_fid;
3579 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3580 ldlm_lockname[mode]);
3582 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
3583 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3584 policy.l_inodebits.bits = *bits & (1 << i);
3585 if (policy.l_inodebits.bits == 0)
3588 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3589 &policy, mode, &lockh)) {
3590 struct ldlm_lock *lock;
3592 lock = ldlm_handle2lock(&lockh);
3595 ~(lock->l_policy_data.l_inodebits.bits);
3596 LDLM_LOCK_PUT(lock);
3598 *bits &= ~policy.l_inodebits.bits;
/*
 * ll_take_md_lock() - match (and take a reference on) a granted MD lock
 * covering @bits on @inode; fills *@lockh on success.  Returns the
 * matched mode as reported by md_lock_match().
 * NOTE(review): lines missing from this excerpt (numbering gaps).
 */
3605 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
3606 struct lustre_handle *lockh, __u64 flags,
3607 enum ldlm_mode mode)
3609 union ldlm_policy_data policy = { .l_inodebits = { bits } };
3614 fid = &ll_i2info(inode)->lli_fid;
3615 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3617 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3618 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * ll_inode_revalidate_fini() - post-process a revalidate RPC result.
 * -ENOENT (already unlinked) is tolerated for non-regular/non-dir
 * inodes and for striped directories with a bad stripe; other errors
 * are logged (EACCES/EIDRM at D_INFO, the rest at D_ERROR).
 * NOTE(review): lines missing from this excerpt (numbering gaps).
 */
3623 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3625 /* Already unlinked. Just update nlink and return success */
3626 if (rc == -ENOENT) {
3628 /* If it is striped directory, and there is bad stripe
3629 * Let's revalidate the dentry again, instead of returning
3631 if (S_ISDIR(inode->i_mode) &&
3632 ll_i2info(inode)->lli_lsm_md != NULL)
3635 /* This path cannot be hit for regular files unless in
3636 * case of obscure races, so no need to to validate
3638 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3640 } else if (rc != 0) {
3641 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3642 "%s: revalidate FID "DFID" error: rc = %d\n",
3643 ll_get_fsname(inode->i_sb, NULL, 0),
3644 PFID(ll_inode2fid(inode)), rc);
/*
 * __ll_inode_revalidate() - refresh attributes/locks for @dentry.
 * With OBD_CONNECT_ATTRFID: issue an intent getattr/lookup by FID
 * (md_intent_lock), finish the revalidate and invalidate the dentry if
 * the inode was unlinked.  Without it: if no matching MD lock is cached
 * (ll_have_md_lock), do a plain md_getattr() and rebuild the inode via
 * ll_prep_inode().
 * NOTE(review): lines missing from this excerpt (numbering gaps).
 */
3650 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3652 struct inode *inode = dentry->d_inode;
3653 struct ptlrpc_request *req = NULL;
3654 struct obd_export *exp;
3658 LASSERT(inode != NULL);
3660 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3661 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3663 exp = ll_i2mdexp(inode);
3665 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3666 * But under CMD case, it caused some lock issues, should be fixed
3667 * with new CMD ibits lock. See bug 12718 */
3668 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3669 struct lookup_intent oit = { .it_op = IT_GETATTR };
3670 struct md_op_data *op_data;
3672 if (ibits == MDS_INODELOCK_LOOKUP)
3673 oit.it_op = IT_LOOKUP;
3675 /* Call getattr by fid, so do not provide name at all. */
3676 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3677 dentry->d_inode, NULL, 0, 0,
3678 LUSTRE_OPC_ANY, NULL);
3679 if (IS_ERR(op_data))
3680 RETURN(PTR_ERR(op_data));
3682 rc = md_intent_lock(exp, op_data, &oit, &req,
3683 &ll_md_blocking_ast, 0);
3684 ll_finish_md_op_data(op_data);
3686 rc = ll_inode_revalidate_fini(inode, rc);
3690 rc = ll_revalidate_it_finish(req, &oit, dentry);
3692 ll_intent_release(&oit);
3696 /* Unlinked? Unhash dentry, so it is not picked up later by
3697 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3698 here to preserve get_cwd functionality on 2.6.
3700 if (!dentry->d_inode->i_nlink) {
3701 ll_lock_dcache(inode);
3702 d_lustre_invalidate(dentry, 0);
3703 ll_unlock_dcache(inode);
3706 ll_lookup_finish_locks(&oit, dentry);
3707 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3708 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3709 u64 valid = OBD_MD_FLGETATTR;
3710 struct md_op_data *op_data;
3713 if (S_ISREG(inode->i_mode)) {
3714 rc = ll_get_default_mdsize(sbi, &ealen);
3717 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3720 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3721 0, ealen, LUSTRE_OPC_ANY,
3723 if (IS_ERR(op_data))
3724 RETURN(PTR_ERR(op_data));
3726 op_data->op_valid = valid;
3727 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3728 ll_finish_md_op_data(op_data);
3730 rc = ll_inode_revalidate_fini(inode, rc);
3734 rc = ll_prep_inode(&inode, req, NULL, NULL);
3737 ptlrpc_req_finished(req);
/*
 * ll_merge_md_attr() - for a striped directory, merge the per-stripe
 * attributes from all MDTs (md_merge_attr) into the master inode:
 * nlink, blocks, size and cached a/m/ctime in ll_inode_info.
 * NOTE(review): lines missing from this excerpt (numbering gaps).
 */
3741 static int ll_merge_md_attr(struct inode *inode)
3743 struct cl_attr attr = { 0 };
3746 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3747 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3748 &attr, ll_md_blocking_ast);
3752 set_nlink(inode, attr.cat_nlink);
3753 inode->i_blocks = attr.cat_blocks;
3754 i_size_write(inode, attr.cat_size);
3756 ll_i2info(inode)->lli_atime = attr.cat_atime;
3757 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
3758 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/*
 * ll_inode_revalidate() - revalidate metadata and, for regular files,
 * the size.  Runs __ll_inode_revalidate(), merges striped-dir
 * attributes when applicable, copies cached timestamps into the inode,
 * and glimpses the size unless a restore is in progress
 * (LLIF_FILE_RESTORING -- the MDT already supplied the right size and
 * holds the layout lock, so a glimpse would block).
 * NOTE(review): lines missing from this excerpt (numbering gaps).
 */
3764 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3766 struct inode *inode = dentry->d_inode;
3770 rc = __ll_inode_revalidate(dentry, ibits);
3774 /* if object isn't regular file, don't validate size */
3775 if (!S_ISREG(inode->i_mode)) {
3776 if (S_ISDIR(inode->i_mode) &&
3777 ll_i2info(inode)->lli_lsm_md != NULL) {
3778 rc = ll_merge_md_attr(inode);
3783 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
3784 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
3785 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
3787 /* In case of restore, the MDT has the right size and has
3788 * already send it back without granting the layout lock,
3789 * inode is up-to-date so glimpse is useless.
3790 * Also to glimpse we need the layout, in case of a running
3791 * restore the MDT holds the layout lock so the glimpse will
3792 * block up to the end of restore (getattr will block)
3794 if (!ll_file_test_flag(ll_i2info(inode), LLIF_FILE_RESTORING))
3795 rc = ll_glimpse_size(inode);
/*
 * ll_compat_encode_dev() - squeeze a dev_t so 32-bit compat stat
 * syscalls accept it: both major and minor are masked to 8 bits.
 */
3800 static inline dev_t ll_compat_encode_dev(dev_t dev)
3802 /* The compat_sys_*stat*() syscalls will fail unless the
3803 * device majors and minors are both less than 256. Note that
3804 * the value returned here will be passed through
3805 * old_encode_dev() in cp_compat_stat(). And so we are not
3806 * trying to return a valid compat (u16) device number, just
3807 * one that will pass the old_valid_dev() check. */
3809 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
/*
 * ll_getattr() - .getattr handler (two signature variants depending on
 * kernel API).  Revalidates UPDATE|LOOKUP metadata, then copies inode
 * fields into *stat; under a 32-bit API the ino/dev/rdev values are
 * compat-encoded.
 * NOTE(review): lines missing from this excerpt (numbering gaps).
 */
3812 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
3813 int ll_getattr(const struct path *path, struct kstat *stat,
3814 u32 request_mask, unsigned int flags)
3817 struct dentry *de = path->dentry;
3819 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3822 struct inode *inode = de->d_inode;
3823 struct ll_sb_info *sbi = ll_i2sbi(inode);
3824 struct ll_inode_info *lli = ll_i2info(inode);
3827 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3828 MDS_INODELOCK_LOOKUP);
3829 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3834 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
3836 if (ll_need_32bit_api(sbi)) {
3837 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3838 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
3839 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
3841 stat->ino = inode->i_ino;
3842 stat->dev = inode->i_sb->s_dev;
3843 stat->rdev = inode->i_rdev;
3846 stat->mode = inode->i_mode;
3847 stat->uid = inode->i_uid;
3848 stat->gid = inode->i_gid;
3849 stat->atime = inode->i_atime;
3850 stat->mtime = inode->i_mtime;
3851 stat->ctime = inode->i_ctime;
3852 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
3854 stat->nlink = inode->i_nlink;
3855 stat->size = i_size_read(inode);
3856 stat->blocks = inode->i_blocks;
/*
 * ll_fiemap() - FIEMAP handler.  Allocates a struct fiemap sized for
 * fieinfo->fi_extents_max extents, copies the first user extent in
 * (callers may seed fm_extents[0]), runs ll_do_fiemap(), then copies
 * the mapped extents back out to user space.
 * NOTE(review): lines missing from this excerpt (numbering gaps) --
 * e.g. the allocation-failure check and the final RETURN.
 */
3861 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3862 __u64 start, __u64 len)
3866 struct fiemap *fiemap;
3867 unsigned int extent_count = fieinfo->fi_extents_max;
3869 num_bytes = sizeof(*fiemap) + (extent_count *
3870 sizeof(struct fiemap_extent));
3871 OBD_ALLOC_LARGE(fiemap, num_bytes);
3876 fiemap->fm_flags = fieinfo->fi_flags;
3877 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3878 fiemap->fm_start = start;
3879 fiemap->fm_length = len;
3880 if (extent_count > 0 &&
3881 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3882 sizeof(struct fiemap_extent)) != 0)
3883 GOTO(out, rc = -EFAULT);
3885 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3887 fieinfo->fi_flags = fiemap->fm_flags;
3888 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3889 if (extent_count > 0 &&
3890 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3891 fiemap->fm_mapped_extents *
3892 sizeof(struct fiemap_extent)) != 0)
3893 GOTO(out, rc = -EFAULT);
3895 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * ll_get_acl() - return a referenced copy of the cached POSIX ACL
 * stored in lli_posix_acl, under lli_lock.  The VFS drops the
 * reference after the permission check.
 * NOTE(review): lines missing from this excerpt (numbering gaps).
 */
3899 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3901 struct ll_inode_info *lli = ll_i2info(inode);
3902 struct posix_acl *acl = NULL;
3905 spin_lock(&lli->lli_lock);
3906 /* VFS' acl_permission_check->check_acl will release the refcount */
3907 acl = posix_acl_dup(lli->lli_posix_acl);
3908 spin_unlock(&lli->lli_lock);
/*
 * ll_set_acl() - .set_acl handler (when the kernel has the iop and
 * CONFIG_FS_POSIX_ACL).  Serializes the ACL to an xattr value and
 * stores/removes it on the MDS via md_setxattr(); on success updates
 * the local ACL cache.  ACL_TYPE_DEFAULT is only valid on directories.
 * NOTE(review): lines missing from this excerpt (numbering gaps).
 */
3913 #ifdef HAVE_IOP_SET_ACL
3914 #ifdef CONFIG_FS_POSIX_ACL
3915 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
3917 struct ll_sb_info *sbi = ll_i2sbi(inode);
3918 struct ptlrpc_request *req = NULL;
3919 const char *name = NULL;
3921 size_t value_size = 0;
3926 case ACL_TYPE_ACCESS:
3927 name = XATTR_NAME_POSIX_ACL_ACCESS;
3929 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
3935 case ACL_TYPE_DEFAULT:
3936 name = XATTR_NAME_POSIX_ACL_DEFAULT;
3937 if (!S_ISDIR(inode->i_mode))
3938 GOTO(out, rc = acl ? -EACCES : 0);
3942 GOTO(out, rc = -EINVAL);
3946 value_size = posix_acl_xattr_size(acl->a_count);
3947 value = kmalloc(value_size, GFP_NOFS);
3949 GOTO(out, rc = -ENOMEM);
3951 rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
3953 GOTO(out_value, rc);
3956 rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
3957 value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
3958 name, value, value_size, 0, 0, 0, &req);
3960 ptlrpc_req_finished(req);
3965 set_cached_acl(inode, type, acl);
3967 forget_cached_acl(inode, type);
3970 #endif /* CONFIG_FS_POSIX_ACL */
3971 #endif /* HAVE_IOP_SET_ACL */
/*
 * ll_check_acl() - ACL callback handed to ll_generic_permission() on
 * kernels without generic_permission(2-args).  Fetches the cached ACL
 * and evaluates it via posix_acl_permission(); bails under RCU walk
 * (IPERM_FLAG_RCU) on the 4-arg variant.
 * NOTE(review): lines missing from this excerpt (numbering gaps) --
 * e.g. the return type line and several returns.
 */
3973 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3975 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3976 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3978 ll_check_acl(struct inode *inode, int mask)
3981 # ifdef CONFIG_FS_POSIX_ACL
3982 struct posix_acl *acl;
3986 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3987 if (flags & IPERM_FLAG_RCU)
3990 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3995 rc = posix_acl_permission(inode, acl, mask);
3996 posix_acl_release(acl);
3999 # else /* !CONFIG_FS_POSIX_ACL */
4001 # endif /* CONFIG_FS_POSIX_ACL */
4003 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * ll_inode_permission() - .permission handler (three signature variants
 * by kernel API).  Revalidates the root inode on first access, applies
 * root squash by overriding fsuid/fsgid and dropping FS capabilities
 * via override_creds(), then defers to ll_generic_permission() with
 * ll_check_acl; credentials are restored afterwards.
 * NOTE(review): lines missing from this excerpt (numbering gaps) --
 * e.g. the RCU-walk early returns and error paths.
 */
4005 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4006 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4008 # ifdef HAVE_INODE_PERMISION_2ARGS
4009 int ll_inode_permission(struct inode *inode, int mask)
4011 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4016 struct ll_sb_info *sbi;
4017 struct root_squash_info *squash;
4018 struct cred *cred = NULL;
4019 const struct cred *old_cred = NULL;
4021 bool squash_id = false;
4024 #ifdef MAY_NOT_BLOCK
4025 if (mask & MAY_NOT_BLOCK)
4027 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4028 if (flags & IPERM_FLAG_RCU)
4032 /* as root inode are NOT getting validated in lookup operation,
4033 * need to do it before permission check. */
4035 if (inode == inode->i_sb->s_root->d_inode) {
4036 rc = __ll_inode_revalidate(inode->i_sb->s_root,
4037 MDS_INODELOCK_LOOKUP);
4042 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4043 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4045 /* squash fsuid/fsgid if needed */
4046 sbi = ll_i2sbi(inode);
4047 squash = &sbi->ll_squash;
4048 if (unlikely(squash->rsi_uid != 0 &&
4049 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4050 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4054 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4055 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4056 squash->rsi_uid, squash->rsi_gid);
4058 /* update current process's credentials
4059 * and FS capability */
4060 cred = prepare_creds();
4064 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4065 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
4066 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4067 if ((1 << cap) & CFS_CAP_FS_MASK)
4068 cap_lower(cred->cap_effective, cap);
4070 old_cred = override_creds(cred);
4073 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4074 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4075 /* restore current process's credentials and FS capability */
4077 revert_creds(old_cred);
4084 /* -o localflock - only provides locally consistent flock locks */
/*
 * Default file method table for Lustre regular files.  Note it defines
 * no .flock/.lock methods, so with "-o localflock" the VFS falls back
 * to its local (client-only) flock implementation.
 * The read/write entries vary with the kernel's iter-based I/O API.
 * NOTE(review): elided listing -- the closing "};" and an "# endif"
 * are not shown here.
 */
4085 struct file_operations ll_file_operations = {
4086 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4087 # ifdef HAVE_SYNC_READ_WRITE
4088 .read = new_sync_read,
4089 .write = new_sync_write,
4091 .read_iter = ll_file_read_iter,
4092 .write_iter = ll_file_write_iter,
4093 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4094 .read = ll_file_read,
4095 .aio_read = ll_file_aio_read,
4096 .write = ll_file_write,
4097 .aio_write = ll_file_aio_write,
4098 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4099 .unlocked_ioctl = ll_file_ioctl,
4100 .open = ll_file_open,
4101 .release = ll_file_release,
4102 .mmap = ll_file_mmap,
4103 .llseek = ll_file_seek,
4104 .splice_read = ll_file_splice_read,
/*
 * File method table used when cluster-coherent flock is enabled
 * ("-o flock"): identical to ll_file_operations except that .flock and
 * .lock route through ll_file_flock for distributed lock handling.
 * NOTE(review): elided listing -- the closing "};" is not shown here.
 */
4109 struct file_operations ll_file_operations_flock = {
4110 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4111 # ifdef HAVE_SYNC_READ_WRITE
4112 .read = new_sync_read,
4113 .write = new_sync_write,
4114 # endif /* HAVE_SYNC_READ_WRITE */
4115 .read_iter = ll_file_read_iter,
4116 .write_iter = ll_file_write_iter,
4117 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4118 .read = ll_file_read,
4119 .aio_read = ll_file_aio_read,
4120 .write = ll_file_write,
4121 .aio_write = ll_file_aio_write,
4122 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4123 .unlocked_ioctl = ll_file_ioctl,
4124 .open = ll_file_open,
4125 .release = ll_file_release,
4126 .mmap = ll_file_mmap,
4127 .llseek = ll_file_seek,
4128 .splice_read = ll_file_splice_read,
4131 .flock = ll_file_flock,
4132 .lock = ll_file_flock
4135 /* These are for -o noflock - to return ENOSYS on flock calls */
/*
 * File method table used with "-o noflock": same as the default table
 * but .flock/.lock point at ll_file_noflock, which (per the comment
 * above) rejects locking calls outright.
 * NOTE(review): elided listing -- the closing "};" is not shown here.
 */
4136 struct file_operations ll_file_operations_noflock = {
4137 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4138 # ifdef HAVE_SYNC_READ_WRITE
4139 .read = new_sync_read,
4140 .write = new_sync_write,
4141 # endif /* HAVE_SYNC_READ_WRITE */
4142 .read_iter = ll_file_read_iter,
4143 .write_iter = ll_file_write_iter,
4144 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4145 .read = ll_file_read,
4146 .aio_read = ll_file_aio_read,
4147 .write = ll_file_write,
4148 .aio_write = ll_file_aio_write,
4149 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4150 .unlocked_ioctl = ll_file_ioctl,
4151 .open = ll_file_open,
4152 .release = ll_file_release,
4153 .mmap = ll_file_mmap,
4154 .llseek = ll_file_seek,
4155 .splice_read = ll_file_splice_read,
4158 .flock = ll_file_noflock,
4159 .lock = ll_file_noflock
/*
 * inode_operations for Lustre regular files.  Extended-attribute and
 * ACL entries are conditional on kernel API support (HAVE_IOP_XATTR,
 * HAVE_IOP_GET_ACL, HAVE_IOP_SET_ACL).
 * NOTE(review): elided listing -- the closing "};" and some "#endif"
 * lines are not shown here.
 */
4162 struct inode_operations ll_file_inode_operations = {
4163 .setattr = ll_setattr,
4164 .getattr = ll_getattr,
4165 .permission = ll_inode_permission,
4166 #ifdef HAVE_IOP_XATTR
4167 .setxattr = ll_setxattr,
4168 .getxattr = ll_getxattr,
4169 .removexattr = ll_removexattr,
4171 .listxattr = ll_listxattr,
4172 .fiemap = ll_fiemap,
4173 #ifdef HAVE_IOP_GET_ACL
4174 .get_acl = ll_get_acl,
4176 #ifdef HAVE_IOP_SET_ACL
4177 .set_acl = ll_set_acl,
/*
 * ll_layout_conf() - push a layout configuration into the cl_object
 * stack for @inode via cl_conf_set().  For an OBJECT_CONF_SET
 * operation it additionally allows the layout DLM lock to be matched
 * and records the new layout generation in the inode.
 *
 * NOTE(review): elided listing -- braces, the env/refcheck
 * declarations, error-path returns and the final RETURN are not shown
 * here.
 */
4181 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4183 struct ll_inode_info *lli = ll_i2info(inode);
4184 struct cl_object *obj = lli->lli_clob;
4193 env = cl_env_get(&refcheck);
4195 RETURN(PTR_ERR(env));
/* hand the new configuration down to the cl_object layer */
4197 rc = cl_conf_set(env, lli->lli_clob, conf);
4201 if (conf->coc_opc == OBJECT_CONF_SET) {
4202 struct ldlm_lock *lock = conf->coc_lock;
4203 struct cl_layout cl = {
4207 LASSERT(lock != NULL);
4208 LASSERT(ldlm_has_layout(lock));
4210 /* it can only be allowed to match after layout is
4211 * applied to inode otherwise false layout would be
4212 * seen. Applying layout shoud happen before dropping
4213 * the intent lock. */
4214 ldlm_lock_allow_match(lock);
/* read back the generation just applied and store it in lli */
4216 rc = cl_object_layout_get(env, obj, &cl);
4221 DFID": layout version change: %u -> %u\n",
4222 PFID(&lli->lli_fid), ll_layout_version_get(lli),
4224 ll_layout_version_set(lli, cl.cl_layout_gen);
4228 cl_env_put(env, &refcheck);
4233 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/*
 * Retrieves the LOV layout xattr from the MDT and stores a private
 * copy in the DLM lock's LVB (l_lvb_data/l_lvb_len) so later layout
 * application can read it without another RPC.  No-op when the lock
 * already carries LVB data.
 *
 * NOTE(review): elided listing -- braces, the lmm/lvbdata/lmmsize/rc
 * declarations, several GOTO targets and RETURNs are not shown here.
 */
4234 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4237 struct ll_sb_info *sbi = ll_i2sbi(inode);
4238 struct ptlrpc_request *req;
4239 struct mdt_body *body;
4246 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4247 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4248 lock->l_lvb_data, lock->l_lvb_len);
/* layout already cached in the lock's LVB -- nothing to fetch */
4250 if (lock->l_lvb_data != NULL)
4253 /* if layout lock was granted right away, the layout is returned
4254 * within DLM_LVB of dlm reply; otherwise if the lock was ever
4255 * blocked and then granted via completion ast, we have to fetch
4256 * layout here. Please note that we can't use the LVB buffer in
4257 * completion AST because it doesn't have a large enough buffer */
4258 rc = ll_get_default_mdsize(sbi, &lmmsize);
/* fetch the raw LOV EA for this file from the MDT */
4260 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4261 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
4266 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4268 GOTO(out, rc = -EPROTO);
4270 lmmsize = body->mbo_eadatasize;
4271 if (lmmsize == 0) /* empty layout */
4274 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4276 GOTO(out, rc = -EFAULT);
/* copy the EA out of the reply buffer; the lock outlives the RPC */
4278 OBD_ALLOC_LARGE(lvbdata, lmmsize);
4279 if (lvbdata == NULL)
4280 GOTO(out, rc = -ENOMEM);
4282 memcpy(lvbdata, lmm, lmmsize);
/* install the copy under the resource lock; if someone else raced us
 * and set l_lvb_data first, free our copy (the OBD_FREE_LARGE below) */
4283 lock_res_and_lock(lock);
4284 if (unlikely(lock->l_lvb_data == NULL)) {
4285 lock->l_lvb_type = LVB_T_LAYOUT;
4286 lock->l_lvb_data = lvbdata;
4287 lock->l_lvb_len = lmmsize;
4290 unlock_res_and_lock(lock);
4293 OBD_FREE_LARGE(lvbdata, lmmsize);
4298 ptlrpc_req_finished(req);
4303 * Apply the layout to the inode. Layout lock is held and will be released
/*
 * Takes the layout held in the lock's LVB (fetching it first via
 * ll_layout_fetch() when needed), configures it into the inode's
 * cl_object through ll_layout_conf(), then drops the lock reference.
 * If the configuration returns -EBUSY (layout still in use by IO),
 * issues an OBJECT_CONF_WAIT to wait for in-flight IO to drain.
 *
 * NOTE(review): elided listing -- braces, the lvb_ready/rc
 * declarations, the fast-path exit when lvb_ready, and the final
 * RETURN are not shown here.
 */
4306 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4307 struct inode *inode)
4309 struct ll_inode_info *lli = ll_i2info(inode);
4310 struct ll_sb_info *sbi = ll_i2sbi(inode);
4311 struct ldlm_lock *lock;
4312 struct cl_object_conf conf;
4315 bool wait_layout = false;
4318 LASSERT(lustre_handle_is_used(lockh));
4320 lock = ldlm_handle2lock(lockh);
4321 LASSERT(lock != NULL);
4322 LASSERT(ldlm_has_layout(lock));
4324 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4325 PFID(&lli->lli_fid), inode);
4327 /* in case this is a caching lock and reinstate with new inode */
4328 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
4330 lock_res_and_lock(lock);
4331 lvb_ready = ldlm_is_lvb_ready(lock);
4332 unlock_res_and_lock(lock);
4334 /* checking lvb_ready is racy but this is okay. The worst case is
4335 * that multi processes may configure the file on the same time. */
/* make sure the lock's LVB actually carries the layout blob */
4339 rc = ll_layout_fetch(inode, lock);
4343 /* for layout lock, lmm is stored in lock's lvb.
4344 * lvb_data is immutable if the lock is held so it's safe to access it
4347 * set layout to file. Unlikely this will fail as old layout was
4348 * surely eliminated */
4349 memset(&conf, 0, sizeof conf);
4350 conf.coc_opc = OBJECT_CONF_SET;
4351 conf.coc_inode = inode;
4352 conf.coc_lock = lock;
4353 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
4354 conf.u.coc_layout.lb_len = lock->l_lvb_len;
4355 rc = ll_layout_conf(inode, &conf);
4357 /* refresh layout failed, need to wait */
4358 wait_layout = rc == -EBUSY;
4361 LDLM_LOCK_PUT(lock);
4362 ldlm_lock_decref(lockh, mode);
4364 /* wait for IO to complete if it's still being used. */
4366 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
4367 ll_get_fsname(inode->i_sb, NULL, 0),
4368 PFID(&lli->lli_fid), inode);
/* OBJECT_CONF_WAIT blocks until outstanding IO on the old layout
 * finishes, so the new layout can be applied on a later retry */
4370 memset(&conf, 0, sizeof conf);
4371 conf.coc_opc = OBJECT_CONF_WAIT;
4372 conf.coc_inode = inode;
4373 rc = ll_layout_conf(inode, &conf);
4377 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4378 ll_get_fsname(inode->i_sb, NULL, 0),
4379 PFID(&lli->lli_fid), rc);
4385 * Issue layout intent RPC to MDS.
4386 * \param inode [in] file inode
4387 * \param intent [in] layout intent
4389 * \retval 0 on success
4390 * \retval < 0 error code
/*
 * Builds an IT_LAYOUT lookup intent carrying @intent as opaque op_data
 * and sends it to the MDS via md_intent_lock(); on success it attaches
 * the returned lock data to the inode.
 * NOTE(review): elided listing -- braces, the rc declaration and the
 * final RETURN are not shown here.
 */
4392 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
4394 struct ll_inode_info *lli = ll_i2info(inode);
4395 struct ll_sb_info *sbi = ll_i2sbi(inode);
4396 struct md_op_data *op_data;
4397 struct lookup_intent it;
4398 struct ptlrpc_request *req;
4402 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4403 0, 0, LUSTRE_OPC_ANY, NULL);
4404 if (IS_ERR(op_data))
4405 RETURN(PTR_ERR(op_data));
/* the layout_intent struct rides inside op_data to the MDS */
4407 op_data->op_data = intent;
4408 op_data->op_data_size = sizeof(*intent);
4410 memset(&it, 0, sizeof(it));
4411 it.it_op = IT_LAYOUT;
/* write/truncate intents need a write-mode layout lock */
4412 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
4413 intent->li_opc == LAYOUT_INTENT_TRUNC)
4414 it.it_flags = FMODE_WRITE;
4416 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4417 ll_get_fsname(inode->i_sb, NULL, 0),
4418 PFID(&lli->lli_fid), inode);
4420 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
4421 &ll_md_blocking_ast, 0);
4422 if (it.it_request != NULL)
4423 ptlrpc_req_finished(it.it_request);
4424 it.it_request = NULL;
4426 ll_finish_md_op_data(op_data);
4428 /* set lock data in case this is a new lock */
4430 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4432 ll_intent_drop_lock(&it);
4438 * This function checks if there exists a LAYOUT lock on the client side,
4439 * or enqueues it if it doesn't have one in cache.
4441 * This function will not hold layout lock so it may be revoked any time after
4442 * this function returns. Any operations depend on layout should be redone
4445 * This function should be called before lov_io_init() to get an uptodate
4446 * layout version, the caller should save the version number and after IO
4447 * is finished, this function should be called again to verify that layout
4448 * is not changed during IO time.
/*
 * NOTE(review): elided listing -- braces, the rc declaration, the
 * loop/retry structure around the lock-match and intent RPC, and the
 * final RETURN are not shown here.
 */
4450 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4452 struct ll_inode_info *lli = ll_i2info(inode);
4453 struct ll_sb_info *sbi = ll_i2sbi(inode);
4454 struct lustre_handle lockh;
4455 struct layout_intent intent = {
4456 .li_opc = LAYOUT_INTENT_ACCESS,
4458 enum ldlm_mode mode;
4462 *gen = ll_layout_version_get(lli);
/* fast path: layout locking disabled on this mount, or the inode
 * already carries a valid layout generation */
4463 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
4467 LASSERT(fid_is_sane(ll_inode2fid(inode)));
4468 LASSERT(S_ISREG(inode->i_mode));
4470 /* take layout lock mutex to enqueue layout lock exclusively. */
4471 mutex_lock(&lli->lli_layout_mutex);
4474 /* mostly layout lock is caching on the local side, so try to
4475 * match it before grabbing layout lock mutex. */
4476 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4477 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4478 if (mode != 0) { /* hit cached lock */
4479 rc = ll_layout_lock_set(&lockh, mode, inode);
/* no cached lock: ask the MDS for the layout with an ACCESS intent */
4485 rc = ll_layout_intent(inode, &intent);
4491 *gen = ll_layout_version_get(lli);
4492 mutex_unlock(&lli->lli_layout_mutex);
4498 * Issue layout intent RPC indicating where in a file an IO is about to write.
4500 * \param[in] inode file inode.
4501 * \param[in] start start offset of fille in bytes where an IO is about to
4503 * \param[in] end exclusive end offset in bytes of the write range.
4505 * \retval 0 on success
4506 * \retval < 0 error code
/*
 * NOTE(review): elided listing -- the initializers using start/end,
 * braces, the rc declaration and the final RETURN are not shown here.
 */
4508 int ll_layout_write_intent(struct inode *inode, __u64 start, __u64 end)
4510 struct layout_intent intent = {
4511 .li_opc = LAYOUT_INTENT_WRITE,
/* delegate to the common intent sender with a WRITE-opcode intent */
4518 rc = ll_layout_intent(inode, &intent);
4524 * This function send a restore request to the MDT
4526 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4528 struct hsm_user_request *hur;
4532 len = sizeof(struct hsm_user_request) +
4533 sizeof(struct hsm_user_item);
4534 OBD_ALLOC(hur, len);
4538 hur->hur_request.hr_action = HUA_RESTORE;
4539 hur->hur_request.hr_archive_id = 0;
4540 hur->hur_request.hr_flags = 0;
4541 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4542 sizeof(hur->hur_user_item[0].hui_fid));
4543 hur->hur_user_item[0].hui_extent.offset = offset;
4544 hur->hur_user_item[0].hui_extent.length = length;
4545 hur->hur_request.hr_itemcount = 1;
4546 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,