4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2016, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
48 #include <lustre/ll_fiemap.h>
50 #include <uapi/linux/lustre_ioctl.h>
51 #include <lustre_swab.h>
53 #include "cl_object.h"
54 #include "llite_internal.h"
55 #include "vvp_internal.h"
58 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
60 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
/*
 * Allocate a per-open-file ll_file_data from its dedicated slab cache.
 * GFP_NOFS prevents allocation from recursing back into the filesystem.
 * NOTE(review): the NULL-check / RETURN lines were dropped from this
 * extract (original lines 64, 66, 68-70, 72+) -- confirm in full source.
 */
63 static struct ll_file_data *ll_file_data_get(void)
65 struct ll_file_data *fd;
67 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
/* fd starts with no recorded write failure for this open */
71 fd->fd_write_failed = false;
/*
 * Release a ll_file_data back to the slab cache.
 * NOTE(review): intervening lines (77-78) are missing from this extract;
 * presumably a NULL guard -- verify against the full source.
 */
76 static void ll_file_data_put(struct ll_file_data *fd)
79 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
83 * Packs all the attributes into @op_data for the CLOSE rpc.
85 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
86 struct obd_client_handle *och)
90 ll_prep_md_op_data(op_data, inode, NULL, NULL,
91 0, 0, LUSTRE_OPC_ANY, NULL);
/* Snapshot current inode attributes so the MDT sees the final state */
93 op_data->op_attr.ia_mode = inode->i_mode;
94 op_data->op_attr.ia_atime = inode->i_atime;
95 op_data->op_attr.ia_mtime = inode->i_mtime;
96 op_data->op_attr.ia_ctime = inode->i_ctime;
97 op_data->op_attr.ia_size = i_size_read(inode);
/* Mark each packed timestamp/mode field valid for the server side */
98 op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
99 ATTR_MTIME | ATTR_MTIME_SET |
100 ATTR_CTIME | ATTR_CTIME_SET;
101 op_data->op_attr_blocks = inode->i_blocks;
102 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
/* Identify which open handle on the MDT this close applies to */
103 op_data->op_handle = och->och_fh;
105 if (och->och_flags & FMODE_WRITE &&
106 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
107 /* For HSM: if inode data has been modified, pack it so that
108 * MDT can set data dirty flag in the archive. */
109 op_data->op_bias |= MDS_DATA_MODIFIED;
115 * Perform a close, possibly with a bias.
116 * The meaning of "data" depends on the value of "bias".
118 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
119 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
122 static int ll_close_inode_openhandle(struct inode *inode,
123 struct obd_client_handle *och,
124 enum mds_op_bias bias, void *data)
126 struct obd_export *md_exp = ll_i2mdexp(inode);
127 const struct ll_inode_info *lli = ll_i2info(inode);
128 struct md_op_data *op_data;
129 struct ptlrpc_request *req = NULL;
/* Bail out if the MDC export has no backing OBD (connection torn down) */
133 if (class_exp2obd(md_exp) == NULL) {
134 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
135 ll_get_fsname(inode->i_sb, NULL, 0),
136 PFID(&lli->lli_fid));
140 OBD_ALLOC_PTR(op_data);
141 /* We leak openhandle and request here on error, but not much to be
142 * done in OOM case since app won't retry close on error either. */
144 GOTO(out, rc = -ENOMEM);
146 ll_prepare_close(inode, op_data, och);
/* NOTE(review): the switch (bias) header is missing from this extract */
148 case MDS_CLOSE_LAYOUT_SWAP:
149 LASSERT(data != NULL);
150 op_data->op_bias |= MDS_CLOSE_LAYOUT_SWAP;
151 op_data->op_data_version = 0;
152 op_data->op_lease_handle = och->och_lease_handle;
/* data is the second inode whose layout we swap with (see header) */
153 op_data->op_fid2 = *ll_inode2fid(data);
156 case MDS_HSM_RELEASE:
157 LASSERT(data != NULL);
158 op_data->op_bias |= MDS_HSM_RELEASE;
/* data is a pointer to the data version captured by the copytool */
159 op_data->op_data_version = *(__u64 *)data;
160 op_data->op_lease_handle = och->och_lease_handle;
161 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* default case: a plain close carries no auxiliary data */
165 LASSERT(data == NULL);
169 rc = md_close(md_exp, op_data, och->och_mod, &req);
170 if (rc != 0 && rc != -EINTR)
171 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
172 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* On a biased close, check whether the MDT actually executed the intent */
175 op_data->op_bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)) {
176 struct mdt_body *body;
178 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
179 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
183 ll_finish_md_op_data(op_data);
/* Drop replay data and poison the file handle so reuse is detectable */
187 md_clear_open_replay_data(md_exp, och);
188 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
191 ptlrpc_req_finished(req); /* This is close request */
/*
 * Close the MDS open handle matching \a fmode (write/exec/read) for this
 * inode, unless other local users still hold it.
 */
195 int ll_md_real_close(struct inode *inode, fmode_t fmode)
197 struct ll_inode_info *lli = ll_i2info(inode);
198 struct obd_client_handle **och_p;
199 struct obd_client_handle *och;
/* Select the och slot and use counter matching the open mode */
204 if (fmode & FMODE_WRITE) {
205 och_p = &lli->lli_mds_write_och;
206 och_usecount = &lli->lli_open_fd_write_count;
207 } else if (fmode & FMODE_EXEC) {
208 och_p = &lli->lli_mds_exec_och;
209 och_usecount = &lli->lli_open_fd_exec_count;
211 LASSERT(fmode & FMODE_READ);
212 och_p = &lli->lli_mds_read_och;
213 och_usecount = &lli->lli_open_fd_read_count;
216 mutex_lock(&lli->lli_och_mutex);
217 if (*och_usecount > 0) {
218 /* There are still users of this handle, so skip
220 mutex_unlock(&lli->lli_och_mutex);
/* NOTE(review): och detach from *och_p happens in elided lines 221-225 */
226 mutex_unlock(&lli->lli_och_mutex);
229 /* There might be a race and this handle may already
231 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/*
 * Per-file-descriptor close: drop group lock and lease if held, close the
 * fd-private open handle, decrement the per-mode open count, and only talk
 * to the MDS when no cached OPEN lock can substitute for the close RPC.
 */
237 static int ll_md_close(struct inode *inode, struct file *file)
239 union ldlm_policy_data policy = {
240 .l_inodebits = { MDS_INODELOCK_OPEN },
/* TEST_LOCK: match-only probe, does not take a reference on the lock */
242 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
243 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
244 struct ll_inode_info *lli = ll_i2info(inode);
245 struct lustre_handle lockh;
246 enum ldlm_mode lockmode;
250 /* clear group lock, if present */
251 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
252 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
254 if (fd->fd_lease_och != NULL) {
257 /* Usually the lease is not released when the
258 * application crashed, we need to release here. */
259 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
260 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
261 PFID(&lli->lli_fid), rc, lease_broken);
263 fd->fd_lease_och = NULL;
/* fd_och is set when a lease took ownership of the MDS open handle */
266 if (fd->fd_och != NULL) {
267 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
272 /* Let's see if we have good enough OPEN lock on the file and if
273 we can skip talking to MDS */
274 mutex_lock(&lli->lli_och_mutex);
275 if (fd->fd_omode & FMODE_WRITE) {
277 LASSERT(lli->lli_open_fd_write_count);
278 lli->lli_open_fd_write_count--;
279 } else if (fd->fd_omode & FMODE_EXEC) {
281 LASSERT(lli->lli_open_fd_exec_count);
282 lli->lli_open_fd_exec_count--;
285 LASSERT(lli->lli_open_fd_read_count);
286 lli->lli_open_fd_read_count--;
288 mutex_unlock(&lli->lli_och_mutex);
/* No cached OPEN ibits lock -> must send the real close to the MDS.
 * NOTE(review): lockmode assignment is in lines elided from this extract. */
290 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
291 LDLM_IBITS, &policy, lockmode, &lockh))
292 rc = ll_md_real_close(inode, fd->fd_omode);
295 LUSTRE_FPRIVATE(file) = NULL;
296 ll_file_data_put(fd);
301 /* While this returns an error code, fput() the caller does not, so we need
302 * to make every effort to clean up all of our state here. Also, applications
303 * rarely check close errors and even if an error is returned they will not
304 * re-try the close call.
306 int ll_file_release(struct inode *inode, struct file *file)
308 struct ll_file_data *fd;
309 struct ll_sb_info *sbi = ll_i2sbi(inode);
310 struct ll_inode_info *lli = ll_i2info(inode);
314 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
315 PFID(ll_inode2fid(inode)), inode);
/* Do not count releases of the root dentry in per-sb stats */
317 if (inode->i_sb->s_root != file_dentry(file))
318 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
319 fd = LUSTRE_FPRIVATE(file);
322 /* The last ref on @file, maybe not the the owner pid of statahead,
323 * because parent and child process can share the same file handle. */
324 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
325 ll_deauthorize_statahead(inode, fd);
/* Root of the mount: no MDS close needed, just free the fd data */
327 if (inode->i_sb->s_root == file_dentry(file)) {
328 LUSTRE_FPRIVATE(file) = NULL;
329 ll_file_data_put(fd);
/* For regular files, surface any deferred async write error at close */
333 if (!S_ISDIR(inode->i_mode)) {
334 if (lli->lli_clob != NULL)
335 lov_read_and_clear_async_rc(lli->lli_clob);
336 lli->lli_async_rc = 0;
339 rc = ll_md_close(inode, file);
341 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
342 libcfs_debug_dumplog();
/*
 * Send an IT_OPEN intent lock request to the MDS for \a de, packing the
 * name only when the server lacks open-by-fid support. On success the
 * inode is refreshed from the reply and lock data recorded.
 */
347 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
348 struct lookup_intent *itp)
350 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
351 struct dentry *parent = de->d_parent;
352 const char *name = NULL;
354 struct md_op_data *op_data;
355 struct ptlrpc_request *req = NULL;
359 LASSERT(parent != NULL);
360 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
362 /* if server supports open-by-fid, or file name is invalid, don't pack
363 * name in open request */
364 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
365 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
366 name = de->d_name.name;
367 len = de->d_name.len;
370 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
371 name, len, 0, LUSTRE_OPC_ANY, NULL);
373 RETURN(PTR_ERR(op_data));
/* Optional striping info (lmm) rides along with the open request */
374 op_data->op_data = lmm;
375 op_data->op_data_size = lmmsize;
377 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
378 &ll_md_blocking_ast, 0);
379 ll_finish_md_op_data(op_data);
381 /* reason for keep own exit path - don`t flood log
382 * with messages with -ESTALE errors.
/* -ESTALE path: release any openhandle the server may have created.
 * NOTE(review): the enclosing if (rc == -ESTALE) test is in elided lines. */
384 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
385 it_open_error(DISP_OPEN_OPEN, itp))
387 ll_release_openhandle(de, itp);
391 if (it_disposition(itp, DISP_LOOKUP_NEG))
392 GOTO(out, rc = -ENOENT);
394 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
395 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
396 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Refresh the client inode from the MDS reply, then record lock data */
400 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
401 if (!rc && itp->it_lock_mode)
402 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
405 ptlrpc_req_finished(req);
406 ll_intent_drop_lock(itp);
408 /* We did open by fid, but by the time we got to the server,
409 * the object disappeared. If this is a create, we cannot really
410 * tell the userspace that the file it was trying to create
411 * does not exist. Instead let's return -ESTALE, and the VFS will
412 * retry the create with LOOKUP_REVAL that we are going to catch
413 * in ll_revalidate_dentry() and use lookup then.
415 if (rc == -ENOENT && itp->it_op & IT_CREAT)
/*
 * Populate an obd_client_handle from the MDT reply body of a completed
 * open intent, then register it for open replay on MDS recovery.
 */
421 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
422 struct obd_client_handle *och)
424 struct mdt_body *body;
426 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
/* Server-assigned open handle and fid for this open */
427 och->och_fh = body->mbo_handle;
428 och->och_fid = body->mbo_fid1;
/* Lock handle doubles as the lease identifier for lease opens */
429 och->och_lease_handle.cookie = it->it_lock_handle;
430 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
431 och->och_flags = it->it_flags;
433 return md_set_open_replay_data(md_exp, och, it);
/*
 * Finish the client-side part of an open: optionally fill \a och from the
 * intent reply, then attach \a fd to the struct file and initialize its
 * readahead state, mode, and cl-context bookkeeping.
 */
436 static int ll_local_open(struct file *file, struct lookup_intent *it,
437 struct ll_file_data *fd, struct obd_client_handle *och)
439 struct inode *inode = file_inode(file);
442 LASSERT(!LUSTRE_FPRIVATE(file));
/* och may be NULL when an existing MDS open handle is being reused.
 * NOTE(review): the if (och) guard appears to be in elided lines. */
449 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
454 LUSTRE_FPRIVATE(file) = fd;
455 ll_readahead_init(inode, &fd->fd_ras);
456 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
458 /* ll_cl_context initialize */
459 rwlock_init(&fd->fd_lock);
460 INIT_LIST_HEAD(&fd->fd_lccs);
465 /* Open a file, and (for the very first open) create objects on the OSTs at
466 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
467 * creation or open until ll_lov_setstripe() ioctl is called.
469 * If we already have the stripe MD locally then we don't request it in
470 * md_open(), by passing a lmm_size = 0.
472 * It is up to the application to ensure no other processes open this file
473 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
474 * used. We might be able to avoid races of that sort by getting lli_open_sem
475 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
476 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
478 int ll_file_open(struct inode *inode, struct file *file)
480 struct ll_inode_info *lli = ll_i2info(inode);
481 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
482 .it_flags = file->f_flags };
483 struct obd_client_handle **och_p = NULL;
484 __u64 *och_usecount = NULL;
485 struct ll_file_data *fd;
489 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
490 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* An intent stashed by lookup (atomic_open path) arrives here */
492 it = file->private_data; /* XXX: compat macro */
493 file->private_data = NULL; /* prevent ll_local_open assertion */
495 fd = ll_file_data_get();
497 GOTO(out_openerr, rc = -ENOMEM);
500 if (S_ISDIR(inode->i_mode))
501 ll_authorize_statahead(inode, fd);
/* Opening the mount root needs no MDS open; attach fd and done */
503 if (inode->i_sb->s_root == file_dentry(file)) {
504 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent result: build our own IT_OPEN intent (oit) */
508 if (!it || !it->it_disposition) {
509 /* Convert f_flags into access mode. We cannot use file->f_mode,
510 * because everything but O_ACCMODE mask was stripped from
/* O_ACCMODE trick: flags+1 turns O_RDONLY/O_WRONLY/O_RDWR into
 * FMODE_READ/FMODE_WRITE bits.
 * NOTE(review): the oit.it_flags update for this branch is elided. */
512 if ((oit.it_flags + 1) & O_ACCMODE)
514 if (file->f_flags & O_TRUNC)
515 oit.it_flags |= FMODE_WRITE;
517 /* kernel only call f_op->open in dentry_open. filp_open calls
518 * dentry_open after call to open_namei that checks permissions.
519 * Only nfsd_open call dentry_open directly without checking
520 * permissions and because of that this code below is safe. */
521 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
522 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
524 /* We do not want O_EXCL here, presumably we opened the file
525 * already? XXX - NFS implications? */
526 oit.it_flags &= ~O_EXCL;
528 /* bug20584, if "it_flags" contains O_CREAT, the file will be
529 * created if necessary, then "IT_CREAT" should be set to keep
530 * consistent with it */
531 if (oit.it_flags & O_CREAT)
532 oit.it_op |= IT_CREAT;
538 /* Let's see if we have file open on MDS already. */
539 if (it->it_flags & FMODE_WRITE) {
540 och_p = &lli->lli_mds_write_och;
541 och_usecount = &lli->lli_open_fd_write_count;
542 } else if (it->it_flags & FMODE_EXEC) {
543 och_p = &lli->lli_mds_exec_och;
544 och_usecount = &lli->lli_open_fd_exec_count;
546 och_p = &lli->lli_mds_read_och;
547 och_usecount = &lli->lli_open_fd_read_count;
550 mutex_lock(&lli->lli_och_mutex);
551 if (*och_p) { /* Open handle is present */
552 if (it_disposition(it, DISP_OPEN_OPEN)) {
553 /* Well, there's extra open request that we do not need,
554 let's close it somehow. This will decref request. */
555 rc = it_open_error(DISP_OPEN_OPEN, it);
557 mutex_unlock(&lli->lli_och_mutex);
558 GOTO(out_openerr, rc);
/* Drop the redundant server-side open handle */
561 ll_release_openhandle(file_dentry(file), it);
/* Reuse existing MDS handle: local open only, no och fill needed */
565 rc = ll_local_open(file, it, fd, NULL);
568 mutex_unlock(&lli->lli_och_mutex);
569 GOTO(out_openerr, rc);
572 LASSERT(*och_usecount == 0);
573 if (!it->it_disposition) {
574 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
575 /* We cannot just request lock handle now, new ELC code
576 means that one of other OPEN locks for this file
577 could be cancelled, and since blocking ast handler
578 would attempt to grab och_mutex as well, that would
579 result in a deadlock */
580 mutex_unlock(&lli->lli_och_mutex);
582 * Normally called under two situations:
584 * 2. A race/condition on MDS resulting in no open
585 * handle to be returned from LOOKUP|OPEN request,
586 * for example if the target entry was a symlink.
588 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
589 * marked by a bit set in ll_iget_for_nfs. Clear the
590 * bit so that it's not confusing later callers.
592 * NB; when ldd is NULL, it must have come via normal
593 * lookup path only, since ll_iget_for_nfs always calls
596 if (ldd && ldd->lld_nfs_dentry) {
597 ldd->lld_nfs_dentry = 0;
598 it->it_flags |= MDS_OPEN_LOCK;
602 * Always specify MDS_OPEN_BY_FID because we don't want
603 * to get file with different fid.
605 it->it_flags |= MDS_OPEN_BY_FID;
606 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
609 GOTO(out_openerr, rc);
/* First opener for this mode: allocate the shared och slot */
613 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
615 GOTO(out_och_free, rc = -ENOMEM);
619 /* md_intent_lock() didn't get a request ref if there was an
620 * open error, so don't do cleanup on the request here
622 /* XXX (green): Should not we bail out on any error here, not
623 * just open error? */
624 rc = it_open_error(DISP_OPEN_OPEN, it);
626 GOTO(out_och_free, rc);
628 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
629 "inode %p: disposition %x, status %d\n", inode,
630 it_disposition(it, ~0), it->it_status);
632 rc = ll_local_open(file, it, fd, *och_p);
634 GOTO(out_och_free, rc);
636 mutex_unlock(&lli->lli_och_mutex);
639 /* Must do this outside lli_och_mutex lock to prevent deadlock where
640 different kind of OPEN lock for this same inode gets cancelled
641 by ldlm_cancel_lru */
642 if (!S_ISREG(inode->i_mode))
643 GOTO(out_och_free, rc);
645 cl_lov_delay_create_clear(&file->f_flags);
646 GOTO(out_och_free, rc);
/* Error/exit paths: free unused och slot, undo statahead, free fd */
650 if (och_p && *och_p) {
651 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
652 *och_p = NULL; /* OBD_FREE writes some magic there */
655 mutex_unlock(&lli->lli_och_mutex);
658 if (lli->lli_opendir_key == fd)
659 ll_deauthorize_statahead(inode, fd);
661 ll_file_data_put(fd);
663 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* Drop the extra request ref taken by the open intent, if any */
666 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
667 ptlrpc_req_finished(it->it_request);
668 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for lease locks: on a blocking callback, cancel the lease
 * lock asynchronously (which is what "breaks" the lease).
 * NOTE(review): the switch (flag) header and the CANCELING arm body are
 * elided from this extract.
 */
674 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
675 struct ldlm_lock_desc *desc, void *data, int flag)
678 struct lustre_handle lockh;
682 case LDLM_CB_BLOCKING:
683 ldlm_lock2handle(lock, &lockh);
684 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
686 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
690 case LDLM_CB_CANCELING:
698 * When setting a lease on a file, we take ownership of the lli_mds_*_och
699 * and save it as fd->fd_och so as to force client to reopen the file even
700 * if it has an open lock in cache already.
702 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
703 struct lustre_handle *old_handle)
705 struct ll_inode_info *lli = ll_i2info(inode);
706 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
707 struct obd_client_handle **och_p;
712 /* Get the openhandle of the file */
713 mutex_lock(&lli->lli_och_mutex);
/* A lease is already attached to this fd */
714 if (fd->fd_lease_och != NULL)
715 GOTO(out_unlock, rc = -EBUSY);
717 if (fd->fd_och == NULL) {
718 if (file->f_mode & FMODE_WRITE) {
719 LASSERT(lli->lli_mds_write_och != NULL);
720 och_p = &lli->lli_mds_write_och;
721 och_usecount = &lli->lli_open_fd_write_count;
723 LASSERT(lli->lli_mds_read_och != NULL);
724 och_p = &lli->lli_mds_read_och;
725 och_usecount = &lli->lli_open_fd_read_count;
/* Cannot steal a handle shared by other local opens */
728 if (*och_usecount > 1)
729 GOTO(out_unlock, rc = -EBUSY);
/* NOTE(review): transfer of *och_p into fd->fd_och happens in elided
 * lines 730-735 -- confirm in full source. */
736 *old_handle = fd->fd_och->och_fh;
740 mutex_unlock(&lli->lli_och_mutex);
745 * Release ownership on lli_mds_*_och when putting back a file lease.
747 static int ll_lease_och_release(struct inode *inode, struct file *file)
749 struct ll_inode_info *lli = ll_i2info(inode);
750 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
751 struct obd_client_handle **och_p;
752 struct obd_client_handle *old_och = NULL;
757 mutex_lock(&lli->lli_och_mutex);
/* Pick the slot matching this fd's access mode */
758 if (file->f_mode & FMODE_WRITE) {
759 och_p = &lli->lli_mds_write_och;
760 och_usecount = &lli->lli_open_fd_write_count;
762 och_p = &lli->lli_mds_read_och;
763 och_usecount = &lli->lli_open_fd_read_count;
766 /* The file may have been open by another process (broken lease) so
767 * *och_p is not NULL. In this case we should simply increase usecount
770 if (*och_p != NULL) {
/* Slot already repopulated: our handle is surplus, close it below */
771 old_och = fd->fd_och;
778 mutex_unlock(&lli->lli_och_mutex);
781 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
787 * Acquire a lease and open the file.
789 static struct obd_client_handle *
790 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
793 struct lookup_intent it = { .it_op = IT_OPEN };
794 struct ll_sb_info *sbi = ll_i2sbi(inode);
795 struct md_op_data *op_data;
796 struct ptlrpc_request *req = NULL;
797 struct lustre_handle old_handle = { 0 };
798 struct obd_client_handle *och = NULL;
/* A lease is requested for exactly one of read or write, never both */
803 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
804 RETURN(ERR_PTR(-EINVAL));
/* Requested mode must be a subset of the fd's mode; exec fds excluded */
807 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
808 RETURN(ERR_PTR(-EPERM));
810 rc = ll_lease_och_acquire(inode, file, &old_handle);
/* NOTE(review): och allocation (lines 811-816) elided from this extract */
817 RETURN(ERR_PTR(-ENOMEM));
819 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
820 LUSTRE_OPC_ANY, NULL);
822 GOTO(out, rc = PTR_ERR(op_data));
824 /* To tell the MDT this openhandle is from the same owner */
825 op_data->op_handle = old_handle;
827 it.it_flags = fmode | open_flags;
828 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
829 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
830 &ll_md_blocking_lease_ast,
831 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
832 * it can be cancelled which may mislead applications that the lease is
834 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
835 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
836 * doesn't deal with openhandle, so normal openhandle will be leaked. */
837 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
838 ll_finish_md_op_data(op_data);
839 ptlrpc_req_finished(req);
841 GOTO(out_release_it, rc);
843 if (it_disposition(&it, DISP_LOOKUP_NEG))
844 GOTO(out_release_it, rc = -ENOENT);
846 rc = it_open_error(DISP_OPEN_OPEN, &it);
848 GOTO(out_release_it, rc);
850 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
851 ll_och_fill(sbi->ll_md_exp, &it, och);
/* Server did not grant a lease: old server without lease support */
853 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
854 GOTO(out_close, rc = -EOPNOTSUPP);
856 /* already get lease, handle lease lock */
857 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
858 if (it.it_lock_mode == 0 ||
859 it.it_lock_bits != MDS_INODELOCK_OPEN) {
860 /* open lock must return for lease */
861 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
862 PFID(ll_inode2fid(inode)), it.it_lock_mode,
864 GOTO(out_close, rc = -EPROTO);
867 ll_intent_release(&it);
/* Error unwind: cancel the open lock, close the handle, drop intent */
871 /* Cancel open lock */
872 if (it.it_lock_mode != 0) {
873 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
876 och->och_lease_handle.cookie = 0ULL;
878 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
880 CERROR("%s: error closing file "DFID": %d\n",
881 ll_get_fsname(inode->i_sb, NULL, 0),
882 PFID(&ll_i2info(inode)->lli_fid), rc2);
883 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
885 ll_intent_release(&it);
893 * Check whether a layout swap can be done between two inodes.
895 * \param[in] inode1 First inode to check
896 * \param[in] inode2 Second inode to check
898 * \retval 0 on success, layout swap can be performed between both inodes
899 * \retval negative error code if requirements are not met
901 static int ll_check_swap_layouts_validity(struct inode *inode1,
902 struct inode *inode2)
/* Both must be regular files, writable by the caller, on the same sb.
 * NOTE(review): the specific error returns are in lines elided here. */
904 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
907 if (inode_permission(inode1, MAY_WRITE) ||
908 inode_permission(inode2, MAY_WRITE))
911 if (inode1->i_sb != inode2->i_sb)
/*
 * Biased close that atomically swaps the layouts of \a inode and
 * \a inode2 on the MDT as part of closing \a och.
 */
917 static int ll_swap_layouts_close(struct obd_client_handle *och,
918 struct inode *inode, struct inode *inode2)
920 const struct lu_fid *fid1 = ll_inode2fid(inode);
921 const struct lu_fid *fid2;
925 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
926 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
928 rc = ll_check_swap_layouts_validity(inode, inode2);
930 GOTO(out_free_och, rc);
932 /* We now know that inode2 is a lustre inode */
933 fid2 = ll_inode2fid(inode2);
/* Swapping a file's layout with itself is rejected */
935 rc = lu_fid_cmp(fid1, fid2);
937 GOTO(out_free_och, rc = -EINVAL);
939 /* Close the file and swap layouts between inode & inode2.
940 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
941 * because we still need it to pack l_remote_handle to MDT. */
942 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
945 och = NULL; /* freed in ll_close_inode_openhandle() */
955 * Release lease and close the file.
956 * It will check if the lease has ever broken.
958 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
961 struct ldlm_lock *lock;
/* Treat a missing lock as an already-broken (cancelled) lease */
962 bool cancelled = true;
966 lock = ldlm_handle2lock(&och->och_lease_handle);
968 lock_res_and_lock(lock);
969 cancelled = ldlm_is_cancel(lock);
970 unlock_res_and_lock(lock);
974 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
975 PFID(&ll_i2info(inode)->lli_fid), cancelled);
/* Lease still intact: cancel it ourselves before closing */
978 ldlm_cli_cancel(&och->och_lease_handle, 0);
980 if (lease_broken != NULL)
981 *lease_broken = cancelled;
983 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/*
 * Merge MDS-supplied timestamps cached in lli with size/blocks/times
 * obtained from the OSTs via the cl_object attr, taking the newest of
 * each timestamp. Runs under the inode size lock.
 */
987 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
989 struct ll_inode_info *lli = ll_i2info(inode);
990 struct cl_object *obj = lli->lli_clob;
991 struct cl_attr *attr = vvp_env_thread_attr(env);
999 ll_inode_size_lock(inode);
1001 /* Merge timestamps the most recently obtained from MDS with
1002 * timestamps obtained from OSTs.
1004 * Do not overwrite atime of inode because it may be refreshed
1005 * by file_accessed() function. If the read was served by cache
1006 * data, there is no RPC to be sent so that atime may not be
1007 * transferred to OSTs at all. MDT only updates atime at close time
1008 * if it's at least 'mdd.*.atime_diff' older.
1009 * All in all, the atime in Lustre does not strictly comply with
1010 * POSIX. Solving this problem needs to send an RPC to MDT for each
1011 * read, this will hurt performance. */
1012 if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1013 LTIME_S(inode->i_atime) = lli->lli_atime;
1014 lli->lli_update_atime = 0;
1016 LTIME_S(inode->i_mtime) = lli->lli_mtime;
1017 LTIME_S(inode->i_ctime) = lli->lli_ctime;
/* Baseline values before comparing against OST-side attributes */
1019 atime = LTIME_S(inode->i_atime);
1020 mtime = LTIME_S(inode->i_mtime);
1021 ctime = LTIME_S(inode->i_ctime);
1023 cl_object_attr_lock(obj);
1024 rc = cl_object_attr_get(env, obj, attr);
1025 cl_object_attr_unlock(obj);
1028 GOTO(out_size_unlock, rc);
/* Keep the newer of MDS- vs OST-reported time for each stamp */
1030 if (atime < attr->cat_atime)
1031 atime = attr->cat_atime;
1033 if (ctime < attr->cat_ctime)
1034 ctime = attr->cat_ctime;
1036 if (mtime < attr->cat_mtime)
1037 mtime = attr->cat_mtime;
1039 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1040 PFID(&lli->lli_fid), attr->cat_size);
/* Size and block count are authoritative from the OSTs */
1042 i_size_write(inode, attr->cat_size);
1043 inode->i_blocks = attr->cat_blocks;
1045 LTIME_S(inode->i_atime) = atime;
1046 LTIME_S(inode->i_mtime) = mtime;
1047 LTIME_S(inode->i_ctime) = ctime;
1050 ll_inode_size_unlock(inode);
/*
 * Decide whether atime updates should be suppressed for this open file,
 * mirroring the VFS checks in file_accessed()/touch_atime().
 * NOTE(review): the "return true/false" lines for each test are elided
 * from this extract; only the conditions are visible.
 */
1055 static bool file_is_noatime(const struct file *file)
1057 const struct vfsmount *mnt = file->f_path.mnt;
1058 const struct inode *inode = file_inode((struct file *)file);
1060 /* Adapted from file_accessed() and touch_atime().*/
1061 if (file->f_flags & O_NOATIME)
1064 if (inode->i_flags & S_NOATIME)
1067 if (IS_NOATIME(inode))
1070 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1073 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1076 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1082 static int ll_file_io_ptask(struct cfs_ptask *ptask);
/*
 * Initialize a cl_io for a read or write on \a file: set up the embedded
 * kiocb/iter, translate O_* flags into io behavior flags, choose the
 * locking policy, and enable parallel IO (ptask) when the sb allows it.
 */
1084 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1086 struct inode *inode = file_inode(file);
1088 memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
1089 init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
1090 io->u.ci_rw.rw_file = file;
1091 io->u.ci_rw.rw_ptask = ll_file_io_ptask;
1092 io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
1093 if (iot == CIT_WRITE) {
1094 io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
/* sync writes: O_SYNC, O_DIRECT, or (elided) IS_SYNC on the inode */
1095 io->u.ci_rw.rw_sync = !!(file->f_flags & O_SYNC ||
1096 file->f_flags & O_DIRECT ||
1099 io->ci_obj = ll_i2info(inode)->lli_clob;
1100 io->ci_lockreq = CILR_MAYBE;
/* nolock mounts never take DLM locks; O_APPEND must always lock */
1101 if (ll_file_nolock(file)) {
1102 io->ci_lockreq = CILR_NEVER;
1103 io->ci_no_srvlock = 1;
1104 } else if (file->f_flags & O_APPEND) {
1105 io->ci_lockreq = CILR_MANDATORY;
1107 io->ci_noatime = file_is_noatime(file);
/* Parallel IO is incompatible with append writes */
1108 if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
1109 io->ci_pio = !io->u.ci_rw.rw_append;
/*
 * Worker body for one parallel-IO task (cfs_ptask): runs a cl_io loop over
 * the sub-range described by the cl_io_pt, accumulates the byte count into
 * pt->cip_result, and restarts on layout change (ci_need_restart).
 */
1114 static int ll_file_io_ptask(struct cfs_ptask *ptask)
1116 struct cl_io_pt *pt = ptask->pt_cbdata;
1117 struct file *file = pt->cip_file;
1120 loff_t pos = pt->cip_pos;
1125 env = cl_env_get(&refcheck);
1127 RETURN(PTR_ERR(env));
1129 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1130 file_dentry(file)->d_name.name,
1131 pt->cip_iot == CIT_READ ? "read" : "write",
1132 pos, pos + pt->cip_count);
/* NOTE(review): a restart label presumably sits in elided lines 1133-1134 */
1135 io = vvp_env_thread_io(env);
1136 ll_io_init(io, file, pt->cip_iot);
1137 io->u.ci_rw.rw_iter = pt->cip_iter;
1138 io->u.ci_rw.rw_iocb = pt->cip_iocb;
1139 io->ci_pio = 0; /* It's already in parallel task */
/* On restart, resume from where the previous pass left off */
1141 rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
1142 pt->cip_count - pt->cip_result);
1144 struct vvp_io *vio = vvp_env_io(env);
1146 vio->vui_io_subtype = IO_NORMAL;
1147 vio->vui_fd = LUSTRE_FPRIVATE(file);
1149 ll_cl_add(file, env, io, LCC_RW);
1150 rc = cl_io_loop(env, io);
1151 ll_cl_remove(file, env);
1153 /* cl_io_rw_init() handled IO */
/* Fault-injection hook for testing ptask IO failure handling */
1157 if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {
1163 if (io->ci_nob > 0) {
/* Account transferred bytes and advance the iov iterator/kiocb */
1164 pt->cip_result += io->ci_nob;
1165 iov_iter_advance(&pt->cip_iter, io->ci_nob);
1167 pt->cip_iocb.ki_pos = pos;
1168 #ifdef HAVE_KIOCB_KI_LEFT
1169 pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
1170 #elif defined(HAVE_KI_NBYTES)
1171 pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;
1175 cl_io_fini(env, io);
/* Layout changed mid-IO and work remains: go around again */
1177 if ((rc == 0 || rc == -ENODATA) &&
1178 pt->cip_result < pt->cip_count &&
1179 io->ci_need_restart) {
1181 "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1182 file_dentry(file)->d_name.name,
1183 pt->cip_iot == CIT_READ ? "read" : "write",
1184 pos, pos + pt->cip_count - pt->cip_result,
1185 pt->cip_result, rc);
1189 CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
1190 file_dentry(file)->d_name.name,
1191 pt->cip_iot == CIT_READ ? "read" : "write",
1192 pt->cip_result, rc);
1194 cl_env_put(env, &refcheck);
/* Partial success wins over the error code */
1195 RETURN(pt->cip_result > 0 ? 0 : rc);
/*
 * Common back end for read, write and splice I/O.
 *
 * Sets up a cl_io for the requested operation, takes the per-inode range
 * lock for writes (and for O_DIRECT reads, see LU-6227 note below), runs
 * cl_io_loop() and accumulates the transferred byte count, restarting the
 * whole I/O when the layout changed underneath (io->ci_need_restart).
 *
 * \param env   lu_env for this thread
 * \param args  normal (iter/iocb) or splice arguments
 * \param file  file the I/O is against
 * \param iot   CIT_READ or CIT_WRITE
 * \param ppos  in/out file position
 * \param count number of bytes requested
 *
 * \retval bytes transferred if any, otherwise the error code.
 */
1199 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1200 struct file *file, enum cl_io_type iot,
1201 loff_t *ppos, size_t count)
1203 struct range_lock range;
1204 struct vvp_io *vio = vvp_env_io(env);
1205 struct inode *inode = file_inode(file);
1206 struct ll_inode_info *lli = ll_i2info(inode);
1207 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1215 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1216 file_dentry(file)->d_name.name,
1217 iot == CIT_READ ? "read" : "write", pos, pos + count);
1220 io = vvp_env_thread_io(env);
1221 ll_io_init(io, file, iot);
1222 if (args->via_io_subtype == IO_NORMAL) {
/* snapshot the caller's iterator/iocb so restarts can replay the I/O */
1223 io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
1224 io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
1229 if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
1230 bool range_locked = false;
/* O_APPEND writes may land anywhere, so lock the whole file range */
1232 if (file->f_flags & O_APPEND)
1233 range_lock_init(&range, 0, LUSTRE_EOF);
1235 range_lock_init(&range, pos, pos + count - 1);
1237 vio->vui_fd = LUSTRE_FPRIVATE(file);
1238 vio->vui_io_subtype = args->via_io_subtype;
1240 switch (vio->vui_io_subtype) {
1242 /* Direct IO reads must also take range lock,
1243 * or multiple reads will try to work on the same pages
1244 * See LU-6227 for details. */
1245 if (((iot == CIT_WRITE) ||
1246 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1247 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1248 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1250 rc = range_lock(&lli->lli_write_tree, &range);
1254 range_locked = true;
1258 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1259 vio->u.splice.vui_flags = args->u.splice.via_flags;
1262 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1266 ll_cl_add(file, env, io, LCC_RW);
/* NOTE(review): for parallel (pio) writes the inode lock is taken here
 * and flagged in lli so it is released exactly once below — confirm the
 * unseen lines between 1268 and 1272 take inode_lock(). */
1267 if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
1268 !lli->lli_inode_locked) {
1270 lli->lli_inode_locked = 1;
1272 rc = cl_io_loop(env, io);
1273 if (lli->lli_inode_locked) {
1274 lli->lli_inode_locked = 0;
1275 inode_unlock(inode);
1277 ll_cl_remove(file, env);
1280 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1282 range_unlock(&lli->lli_write_tree, &range);
1285 /* cl_io_rw_init() handled IO */
/* account progress: ci_nob is what this pass actually transferred */
1289 if (io->ci_nob > 0) {
1290 result += io->ci_nob;
1291 count -= io->ci_nob;
1293 if (args->via_io_subtype == IO_NORMAL) {
1294 iov_iter_advance(args->u.normal.via_iter, io->ci_nob);
1296 args->u.normal.via_iocb->ki_pos = pos;
1297 #ifdef HAVE_KIOCB_KI_LEFT
1298 args->u.normal.via_iocb->ki_left = count;
1299 #elif defined(HAVE_KI_NBYTES)
1300 args->u.normal.via_iocb->ki_nbytes = count;
1304 pos = io->u.ci_rw.rw_range.cir_pos;
1308 cl_io_fini(env, io);
/* layout change detected mid-I/O: loop again for the remaining bytes */
1310 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1312 "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1313 file_dentry(file)->d_name.name,
1314 iot == CIT_READ ? "read" : "write",
1315 pos, pos + count, result, rc);
1319 if (iot == CIT_READ) {
1321 ll_stats_ops_tally(ll_i2sbi(inode),
1322 LPROC_LL_READ_BYTES, result);
1323 } else if (iot == CIT_WRITE) {
1325 ll_stats_ops_tally(ll_i2sbi(inode),
1326 LPROC_LL_WRITE_BYTES, result);
/* track write failure state for fsync error reporting */
1327 fd->fd_write_failed = false;
1328 } else if (result == 0 && rc == 0) {
1331 fd->fd_write_failed = true;
1333 fd->fd_write_failed = false;
1334 } else if (rc != -ERESTARTSYS) {
1335 fd->fd_write_failed = true;
1339 CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
1340 file_dentry(file)->d_name.name,
1341 iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);
1345 RETURN(result > 0 ? result : rc);
1349 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1350 * especially for small I/O.
1352 * To serve a read request, CLIO has to create and initialize a cl_io and
1353 * then request DLM lock. This has turned out to have significant overhead
1354 * and affects the performance of small I/O dramatically.
1356 * It's not necessary to create a cl_io for each I/O. Under the help of read
1357 * ahead, most of the pages being read are already in memory cache and we can
1358 * read those pages directly because if the pages exist, the corresponding DLM
1359 * lock must exist so that page content must be valid.
1361 * In fast read implementation, the llite speculatively finds and reads pages
1362 * in memory cache. There are three scenarios for fast read:
1363 * - If the page exists and is uptodate, kernel VM will provide the data and
1364 * CLIO won't be intervened;
1365 * - If the page was brought into memory by read ahead, it will be exported
1366 * and read ahead parameters will be updated;
1367 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1368 * it will go back and invoke normal read, i.e., a cl_io will be created
1369 * and DLM lock will be requested.
1371 * POSIX compliance: posix standard states that read is intended to be atomic.
1372 * Lustre read implementation is in line with Linux kernel read implementation
1373 * and neither of them complies with POSIX standard in this matter. Fast read
1374 * doesn't make the situation worse on single node but it may interleave write
1375 * results from multiple nodes due to short read handling in ll_file_aio_read().
1377 * \param env - lu_env
1378 * \param iocb - kiocb from kernel
1379 * \param iter - user space buffers where the data will be copied
1381 * \retval - number of bytes have been read, or error code if error occurred.
1384 ll_do_fast_read(const struct lu_env *env, struct kiocb *iocb,
1385 struct iov_iter *iter)
/* fast read is an opt-in mount feature; bail out to the normal path if off */
1389 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1392 /* NB: we can't do direct IO for fast read because it will need a lock
1393 * to make IO engine happy. */
1394 if (iocb->ki_filp->f_flags & O_DIRECT)
1397 ll_cl_add(iocb->ki_filp, env, NULL, LCC_RW)\u003b
1398 result = generic_file_read_iter(iocb, iter);
1399 ll_cl_remove(iocb->ki_filp, env);
1401 /* If the first page is not in cache, generic_file_aio_read() will be
1402 * returned with -ENODATA.
1403 * See corresponding code in ll_readpage(). */
1404 if (result == -ENODATA)
/* NOTE(review): -ENODATA is presumably remapped to 0 here (unseen lines)
 * so the caller falls back to the normal read path — confirm. */
1408 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1409 LPROC_LL_READ_BYTES, result);
1415 * Read from a file (through the page cache).
 *
 * Tries the lockless fast-read path first; if that returns early (error or
 * everything consumed) the result is final, otherwise falls back to the
 * generic cl_io read for the remaining bytes.
1417 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1420 struct vvp_io_args *args;
1425 env = cl_env_get(&refcheck);
1427 return PTR_ERR(env);
1429 result = ll_do_fast_read(env, iocb, to);
/* fast read failed or fully satisfied the request: nothing left to do */
1430 if (result < 0 || iov_iter_count(to) == 0)
1433 args = ll_env_args(env, IO_NORMAL);
1434 args->u.normal.via_iter = to;
1435 args->u.normal.via_iocb = iocb;
1437 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1438 &iocb->ki_pos, iov_iter_count(to));
/* NOTE(review): unseen lines combine result with rc2 — fast-read bytes
 * plus generic-path bytes, or the error when nothing was read. Confirm. */
1441 else if (result == 0)
1445 cl_env_put(env, &refcheck);
1450 * Write to a file (through the page cache).
 *
 * Thin wrapper: packs the iterator/iocb into vvp_io_args and delegates the
 * whole write to ll_file_io_generic() with CIT_WRITE.
1452 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1454 struct vvp_io_args *args;
1459 env = cl_env_get(&refcheck);
1461 return PTR_ERR(env);
1463 args = ll_env_args(env, IO_NORMAL);
1464 args->u.normal.via_iter = from;
1465 args->u.normal.via_iocb = iocb;
1467 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1468 &iocb->ki_pos, iov_iter_count(from));
1469 cl_env_put(env, &refcheck);
/*
 * Compatibility section for kernels without read_iter/write_iter file
 * operations: emulate the old aio_read/aio_write and plain read/write entry
 * points on top of the iter-based implementations above.
 */
1473 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1475 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
 *
 * Validates an iovec array: rejects negative/overflowing lengths and trims
 * the count at the first inaccessible segment.
1477 static int ll_file_get_iov_count(const struct iovec *iov,
1478 unsigned long *nr_segs, size_t *count)
1483 for (seg = 0; seg < *nr_segs; seg++) {
1484 const struct iovec *iv = &iov[seg];
1487 * If any segment has a negative length, or the cumulative
1488 * length ever wraps negative then return -EINVAL.
1491 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1493 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1498 cnt -= iv->iov_len; /* This segment is no good */
/* aio_read shim: build an iov_iter from the iovec array, then reuse the
 * iter-based read path. */
1505 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1506 unsigned long nr_segs, loff_t pos)
1513 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1517 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1518 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1519 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1520 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1521 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1523 result = ll_file_read_iter(iocb, &to);
/* plain read(2) shim: synthesize a synchronous kiocb around the buffer */
1528 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1532 struct iovec iov = { .iov_base = buf, .iov_len = count };
1533 struct kiocb *kiocb;
1538 env = cl_env_get(&refcheck);
1540 RETURN(PTR_ERR(env));
1542 kiocb = &ll_env_info(env)->lti_kiocb;
1543 init_sync_kiocb(kiocb, file);
1544 kiocb->ki_pos = *ppos;
1545 #ifdef HAVE_KIOCB_KI_LEFT
1546 kiocb->ki_left = count;
1547 #elif defined(HAVE_KI_NBYTES)
1548 kiocb->ki_nbytes = count;
1551 result = ll_file_aio_read(kiocb, &iov, 1, kiocb->ki_pos);
1552 *ppos = kiocb->ki_pos;
1554 cl_env_put(env, &refcheck);
1559 * Write to a file (through the page cache).
 *
 * aio_write shim, mirror image of ll_file_aio_read() above.
1562 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1563 unsigned long nr_segs, loff_t pos)
1565 struct iov_iter from;
1570 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1574 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1575 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1576 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1577 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1578 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1580 result = ll_file_write_iter(iocb, &from);
/* plain write(2) shim, mirror image of ll_file_read() above */
1585 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1586 size_t count, loff_t *ppos)
1589 struct iovec iov = { .iov_base = (void __user *)buf,
1591 struct kiocb *kiocb;
1596 env = cl_env_get(&refcheck);
1598 RETURN(PTR_ERR(env));
1600 kiocb = &ll_env_info(env)->lti_kiocb;
1601 init_sync_kiocb(kiocb, file);
1602 kiocb->ki_pos = *ppos;
1603 #ifdef HAVE_KIOCB_KI_LEFT
1604 kiocb->ki_left = count;
1605 #elif defined(HAVE_KI_NBYTES)
1606 kiocb->ki_nbytes = count;
1609 result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos);
1610 *ppos = kiocb->ki_pos;
1612 cl_env_put(env, &refcheck);
1615 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1618 * Send file content (through pagecache) somewhere with helper
 *
 * splice_read entry point: packs the pipe and flags into IO_SPLICE args and
 * runs the common read path via ll_file_io_generic().
1620 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1621 struct pipe_inode_info *pipe, size_t count,
1625 struct vvp_io_args *args;
1630 env = cl_env_get(&refcheck);
1632 RETURN(PTR_ERR(env));
1634 args = ll_env_args(env, IO_SPLICE);
1635 args->u.splice.via_pipe = pipe;
1636 args->u.splice.via_flags = flags;
1638 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1639 cl_env_put(env, &refcheck);
/*
 * Apply a striping layout (lov EA) to a file by re-opening it by FID with
 * the layout attached to the open intent, then immediately closing the
 * handle. Serialized against size changes via the inode size lock.
 */
1643 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1644 __u64 flags, struct lov_user_md *lum, int lum_size)
1646 struct lookup_intent oit = {
1648 .it_flags = flags | MDS_OPEN_BY_FID,
1653 ll_inode_size_lock(inode);
1654 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1656 GOTO(out_unlock, rc);
/* the open was only a vehicle for the layout; drop the handle now */
1658 ll_release_openhandle(dentry, &oit);
1661 ll_inode_size_unlock(inode);
1662 ll_intent_release(&oit);
/*
 * Fetch the striping information (lov EA) of @filename from the MDS.
 *
 * On success *lmmp points into the reply buffer of *request (caller must
 * keep the request until done with the data and then release it) and
 * *lmm_size is the EA size. The EA is byte-swapped to host endianness on
 * big-endian machines before being handed back.
 *
 * \retval 0 on success, negative errno otherwise (-ENODATA when the file
 *         has no striping EA, -EPROTO on an unknown layout magic).
 */
1667 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1668 struct lov_mds_md **lmmp, int *lmm_size,
1669 struct ptlrpc_request **request)
1671 struct ll_sb_info *sbi = ll_i2sbi(inode);
1672 struct mdt_body *body;
1673 struct lov_mds_md *lmm = NULL;
1674 struct ptlrpc_request *req = NULL;
1675 struct md_op_data *op_data;
1678 rc = ll_get_default_mdsize(sbi, &lmmsize);
1682 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1683 strlen(filename), lmmsize,
1684 LUSTRE_OPC_ANY, NULL);
1685 if (IS_ERR(op_data))
1686 RETURN(PTR_ERR(op_data));
1688 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1689 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1690 ll_finish_md_op_data(op_data);
1692 CDEBUG(D_INFO, "md_getattr_name failed "
1693 "on %s: rc %d\n", filename, rc);
1697 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1698 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1700 lmmsize = body->mbo_eadatasize;
/* no EA present (or zero-sized): report -ENODATA to the caller */
1702 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1704 GOTO(out, rc = -ENODATA);
1707 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1708 LASSERT(lmm != NULL);
/* only plain v1/v3 and composite (PFL) layouts are understood here */
1710 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
1711 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
1712 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
1713 GOTO(out, rc = -EPROTO);
1716 * This is coming from the MDS, so is probably in
1717 * little endian. We convert it to host endian before
1718 * passing it to userspace.
1720 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1723 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
1724 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1725 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
/* NOTE(review): released files carry no real stripe objects, so the
 * unseen branch presumably forces stripe_count to 0 — confirm. */
1726 if (le32_to_cpu(lmm->lmm_pattern) &
1727 LOV_PATTERN_F_RELEASED)
1731 /* if function called for directory - we should
1732 * avoid swab not existent lsm objects */
1733 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1734 lustre_swab_lov_user_md_v1(
1735 (struct lov_user_md_v1 *)lmm);
1736 if (S_ISREG(body->mbo_mode))
1737 lustre_swab_lov_user_md_objects(
1738 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1740 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1741 lustre_swab_lov_user_md_v3(
1742 (struct lov_user_md_v3 *)lmm);
1743 if (S_ISREG(body->mbo_mode))
1744 lustre_swab_lov_user_md_objects(
1745 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1747 } else if (lmm->lmm_magic ==
1748 cpu_to_le32(LOV_MAGIC_COMP_V1)) {
1749 lustre_swab_lov_comp_md_v1(
1750 (struct lov_comp_md_v1 *)lmm);
1756 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: copy a raw lov EA (with object list) from
 * userspace and apply it via ll_lov_setstripe_ea_info(). Root only, since
 * MDS_OPEN_HAS_OBJS trusts the caller-supplied object list.
 */
1761 static int ll_lov_setea(struct inode *inode, struct file *file,
1764 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1765 struct lov_user_md *lump;
1766 int lum_size = sizeof(struct lov_user_md) +
1767 sizeof(struct lov_user_ost_data);
1771 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1774 OBD_ALLOC_LARGE(lump, lum_size);
1778 if (copy_from_user(lump, arg, lum_size))
1779 GOTO(out_lump, rc = -EFAULT);
1781 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
/* clear the delay-create flag regardless of the outcome */
1783 cl_lov_delay_create_clear(&file->f_flags);
1786 OBD_FREE_LARGE(lump, lum_size);
/*
 * Copy the file's striping information to the userspace buffer @lum of
 * @size bytes, delegating the work to cl_object_getstripe().
 */
1790 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
1797 env = cl_env_get(&refcheck);
1799 RETURN(PTR_ERR(env));
1801 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
1802 cl_env_put(env, &refcheck);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy the user layout, apply it, then
 * refresh the layout generation and write the resulting stripe info back
 * to the user buffer so the caller sees what was actually set.
 */
1806 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1809 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1810 struct lov_user_md *klum;
1812 __u64 flags = FMODE_WRITE;
1815 rc = ll_copy_user_md(lum, &klum);
1820 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
/* zero the user's stripe_count first so a failed getstripe below still
 * leaves a sane value in the caller's buffer */
1825 rc = put_user(0, &lum->lmm_stripe_count);
1829 rc = ll_layout_refresh(inode, &gen);
1833 rc = ll_file_getstripe(inode, arg, lum_size);
1835 cl_lov_delay_create_clear(&file->f_flags);
1838 OBD_FREE(klum, lum_size);
/*
 * LL_IOC_GROUP_LOCK handler: take a cluster-wide group lock with group id
 * @arg on the file. Only one group lock per file descriptor; the lock is
 * recorded in fd->fd_grouplock under lli->lli_lock.
 */
1843 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1845 struct ll_inode_info *lli = ll_i2info(inode);
1846 struct cl_object *obj = lli->lli_clob;
1847 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1848 struct ll_grouplock grouplock;
/* gid 0 is reserved / invalid for group locks */
1853 CWARN("group id for group lock must not be 0\n");
1857 if (ll_file_nolock(file))
1858 RETURN(-EOPNOTSUPP);
1860 spin_lock(&lli->lli_lock);
1861 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1862 CWARN("group lock already existed with gid %lu\n",
1863 fd->fd_grouplock.lg_gid);
1864 spin_unlock(&lli->lli_lock);
1867 LASSERT(fd->fd_grouplock.lg_lock == NULL);
1868 spin_unlock(&lli->lli_lock);
1871 * XXX: group lock needs to protect all OST objects while PFL
1872 * can add new OST objects during the IO, so we'd instantiate
1873 * all OST objects before getting its group lock.
1878 struct cl_layout cl = {
1879 .cl_is_composite = false,
1882 env = cl_env_get(&refcheck);
1884 RETURN(PTR_ERR(env));
1886 rc = cl_object_layout_get(env, obj, &cl);
/* composite (PFL) layout: instantiate every component up front */
1887 if (!rc && cl.cl_is_composite)
1888 rc = ll_layout_write_intent(inode, 0, OBD_OBJECT_EOF);
1890 cl_env_put(env, &refcheck);
1895 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
1896 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* re-check under the lock: another thread may have raced us here */
1900 spin_lock(&lli->lli_lock);
1901 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1902 spin_unlock(&lli->lli_lock);
1903 CERROR("another thread just won the race\n");
1904 cl_put_grouplock(&grouplock);
1908 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1909 fd->fd_grouplock = grouplock;
1910 spin_unlock(&lli->lli_lock);
1912 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK handler: release the group lock with id @arg held
 * on this file descriptor. Fails if none is held or the id doesn't match.
 */
1916 static int ll_put_grouplock(struct inode *inode, struct file *file,
1919 struct ll_inode_info *lli = ll_i2info(inode);
1920 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1921 struct ll_grouplock grouplock;
1924 spin_lock(&lli->lli_lock);
1925 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1926 spin_unlock(&lli->lli_lock);
1927 CWARN("no group lock held\n");
1931 LASSERT(fd->fd_grouplock.lg_lock != NULL);
1933 if (fd->fd_grouplock.lg_gid != arg) {
1934 CWARN("group lock %lu doesn't match current id %lu\n",
1935 arg, fd->fd_grouplock.lg_gid);
1936 spin_unlock(&lli->lli_lock);
/* detach the lock from the fd under lli_lock, release it outside */
1940 grouplock = fd->fd_grouplock;
1941 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1942 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1943 spin_unlock(&lli->lli_lock);
1945 cl_put_grouplock(&grouplock);
1946 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1951 * Close inode open handle
1953 * \param dentry [in] dentry which contains the inode
1954 * \param it [in,out] intent which contains open info and result
 *
 * \retval 0 success
1957 * \retval <0 failure
1959 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1961 struct inode *inode = dentry->d_inode;
1962 struct obd_client_handle *och;
1968 /* Root ? Do nothing. */
1969 if (dentry->d_inode->i_sb->s_root == dentry)
1972 /* No open handle to close? Move away */
1973 if (!it_disposition(it, DISP_OPEN_OPEN))
1976 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1978 OBD_ALLOC(och, sizeof(*och));
1980 GOTO(out, rc = -ENOMEM);
1982 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1984 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
1986 /* this one is in place of ll_file_open */
1987 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1988 ptlrpc_req_finished(it->it_request);
1989 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1995 * Get size for inode for which FIEMAP mapping is requested.
1996 * Make the FIEMAP get_info call and returns the result.
1997 * \param fiemap kernel buffer to hold extens
1998 * \param num_bytes kernel buffer size
 *
 * \retval 0 on success, negative errno otherwise.
2000 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2006 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2009 /* Checks for fiemap flags */
2010 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* report the unsupported flags back to the caller, per FIEMAP API */
2011 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2015 /* Check for FIEMAP_FLAG_SYNC */
2016 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2017 rc = filemap_fdatawrite(inode->i_mapping);
2022 env = cl_env_get(&refcheck);
2024 RETURN(PTR_ERR(env));
/* a zero cached size may just be stale — glimpse the OSTs to be sure */
2026 if (i_size_read(inode) == 0) {
2027 rc = ll_glimpse_size(inode);
2032 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2033 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2034 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2036 /* If filesize is 0, then there would be no objects for mapping */
2037 if (fmkey.lfik_oa.o_size == 0) {
2038 fiemap->fm_mapped_extents = 0;
2042 fmkey.lfik_fiemap = *fiemap;
2044 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2045 &fmkey, fiemap, &num_bytes);
2047 cl_env_put(env, &refcheck);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a path via the MDC.
 * Requires CAP_DAC_READ_SEARCH unless the user_fid2path mount flag is set.
 * The user buffer layout is struct getinfo_fid2path followed by gf_pathlen
 * bytes of path.
 */
2051 int ll_fid2path(struct inode *inode, void __user *arg)
2053 struct obd_export *exp = ll_i2mdexp(inode);
2054 const struct getinfo_fid2path __user *gfin = arg;
2056 struct getinfo_fid2path *gfout;
2062 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2063 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2066 /* Only need to get the buflen */
2067 if (get_user(pathlen, &gfin->gf_pathlen))
/* cap the user-controlled allocation size */
2070 if (pathlen > PATH_MAX)
2073 outsize = sizeof(*gfout) + pathlen;
2074 OBD_ALLOC(gfout, outsize);
2078 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2079 GOTO(gf_free, rc = -EFAULT);
2080 /* append root FID after gfout to let MDT know the root FID so that it
2081 * can lookup the correct path, this is mainly for fileset.
2082 * old server without fileset mount support will ignore this. */
2083 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2085 /* Call mdc_iocontrol */
2086 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2090 if (copy_to_user(arg, gfout, outsize))
2094 OBD_FREE(gfout, outsize);
2099 * Read the data_version for inode.
2101 * This value is computed using stripe object version on OST.
2102 * Version is computed using server side locking.
2104 * @param flags if do sync on the OST side;
 *              0: no sync;
2106 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2107 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
2109 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2111 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2119 /* If no file object initialized, we consider its version is 0. */
2125 env = cl_env_get(&refcheck);
2127 RETURN(PTR_ERR(env));
2129 io = vvp_env_thread_io(env);
2131 io->u.ci_data_version.dv_data_version = 0;
2132 io->u.ci_data_version.dv_flags = flags;
2135 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2136 result = cl_io_loop(env, io);
2138 result = io->ci_result;
2140 *data_version = io->u.ci_data_version.dv_data_version;
2142 cl_io_fini(env, io);
/* layout changed while reading the version: redo the whole IO */
2144 if (unlikely(io->ci_need_restart))
2147 cl_env_put(env, &refcheck);
2153 * Trigger a HSM release request for the provided inode.
 *
 * Takes a write lease to guarantee exclusive access, flushes dirty data and
 * snapshots the data_version, then closes the open handle with
 * MDS_HSM_RELEASE so the MDT can atomically release the file's data.
2155 int ll_hsm_release(struct inode *inode)
2158 struct obd_client_handle *och = NULL;
2159 __u64 data_version = 0;
2164 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2165 ll_get_fsname(inode->i_sb, NULL, 0),
2166 PFID(&ll_i2info(inode)->lli_fid));
2168 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2170 GOTO(out, rc = PTR_ERR(och));
2172 /* Grab latest data_version and [am]time values */
2173 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2177 env = cl_env_get(&refcheck);
2179 GOTO(out, rc = PTR_ERR(env));
2181 ll_merge_attr(env, inode);
2182 cl_env_put(env, &refcheck);
2184 /* Release the file.
2185 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2186 * we still need it to pack l_remote_handle to MDT. */
2187 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
/* on the error path the lease handle must still be closed explicitly */
2193 if (och != NULL && !IS_ERR(och)) /* close the file */
2194 ll_lease_close(och, inode, NULL);
/* Working state for ll_swap_layouts(): the two inodes plus their (possibly
 * swapped) data versions and check flags, kept together so the pair can be
 * reordered by FID to take locks in a canonical order. */
2199 struct ll_swap_stack {
2202 struct inode *inode1;
2203 struct inode *inode2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS handler: atomically exchange the layouts of the
 * two files on the MDT. Optionally flushes client caches via a temporary
 * group lock (gid != 0) and verifies data versions have not changed.
 */
2208 static int ll_swap_layouts(struct file *file1, struct file *file2,
2209 struct lustre_swap_layouts *lsl)
2211 struct mdc_swap_layouts msl;
2212 struct md_op_data *op_data;
2215 struct ll_swap_stack *llss = NULL;
2218 OBD_ALLOC_PTR(llss);
2222 llss->inode1 = file_inode(file1);
2223 llss->inode2 = file_inode(file2);
2225 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2229 /* we use 2 bool because it is easier to swap than 2 bits */
2230 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2231 llss->check_dv1 = true;
2233 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2234 llss->check_dv2 = true;
2236 /* we cannot use lsl->sl_dvX directly because we may swap them */
2237 llss->dv1 = lsl->sl_dv1;
2238 llss->dv2 = lsl->sl_dv2;
2240 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2241 if (rc == 0) /* same file, done! */
/* order the pair by FID so concurrent swaps always lock in the same order */
2244 if (rc < 0) { /* sequentialize it */
2245 swap(llss->inode1, llss->inode2);
2247 swap(llss->dv1, llss->dv2);
2248 swap(llss->check_dv1, llss->check_dv2);
2252 if (gid != 0) { /* application asks to flush dirty cache */
2253 rc = ll_get_grouplock(llss->inode1, file1, gid);
2257 rc = ll_get_grouplock(llss->inode2, file2, gid);
2259 ll_put_grouplock(llss->inode1, file1, gid);
2264 /* ultimate check, before swaping the layouts we check if
2265 * dataversion has changed (if requested) */
2266 if (llss->check_dv1) {
2267 rc = ll_data_version(llss->inode1, &dv, 0);
2270 if (dv != llss->dv1)
2271 GOTO(putgl, rc = -EAGAIN);
2274 if (llss->check_dv2) {
2275 rc = ll_data_version(llss->inode2, &dv, 0);
2278 if (dv != llss->dv2)
2279 GOTO(putgl, rc = -EAGAIN);
2282 /* struct md_op_data is used to send the swap args to the mdt
2283 * only flags is missing, so we use struct mdc_swap_layouts
2284 * through the md_op_data->op_data */
2285 /* flags from user space have to be converted before they are send to
2286 * server, no flag is sent today, they are only used on the client */
2289 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2290 0, LUSTRE_OPC_ANY, &msl);
2291 if (IS_ERR(op_data))
2292 GOTO(free, rc = PTR_ERR(op_data));
2294 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2295 sizeof(*op_data), op_data, NULL);
2296 ll_finish_md_op_data(op_data);
/* drop the temporary flush group locks in reverse acquisition order */
2303 ll_put_grouplock(llss->inode2, file2, gid);
2304 ll_put_grouplock(llss->inode1, file1, gid);
/*
 * Set/clear HSM state flags on @inode. Validates the masks and archive id,
 * then forwards the request to the MDT via LL_IOC_HSM_STATE_SET.
 */
2314 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2316 struct md_op_data *op_data;
2320 /* Detect out-of range masks */
2321 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2324 /* Non-root users are forbidden to set or clear flags which are
2325 * NOT defined in HSM_USER_MASK. */
2326 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2327 !cfs_capable(CFS_CAP_SYS_ADMIN))
2330 /* Detect out-of range archive id */
2331 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2332 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2335 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2336 LUSTRE_OPC_ANY, hss);
2337 if (IS_ERR(op_data))
2338 RETURN(PTR_ERR(op_data));
2340 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2341 sizeof(*op_data), op_data, NULL);
2343 ll_finish_md_op_data(op_data);
/*
 * HSM import: mark a regular file as archived+released and then restore the
 * attributes (mode, owner, size, times) recorded by the copytool, so the
 * file appears as it did in the archive without copying any data in.
 */
2348 static int ll_hsm_import(struct inode *inode, struct file *file,
2349 struct hsm_user_import *hui)
2351 struct hsm_state_set *hss = NULL;
2352 struct iattr *attr = NULL;
2356 if (!S_ISREG(inode->i_mode))
2362 GOTO(out, rc = -ENOMEM);
/* step 1: flag the file ARCHIVED|EXISTS|RELEASED on the MDT */
2364 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2365 hss->hss_archive_id = hui->hui_archive_id;
2366 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2367 rc = ll_hsm_state_set(inode, hss);
2371 OBD_ALLOC_PTR(attr);
2373 GOTO(out, rc = -ENOMEM);
/* step 2: restore the archived attributes; force S_IFREG in the mode */
2375 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2376 attr->ia_mode |= S_IFREG;
2377 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2378 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2379 attr->ia_size = hui->hui_size;
2380 attr->ia_mtime.tv_sec = hui->hui_mtime;
2381 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2382 attr->ia_atime.tv_sec = hui->hui_atime;
2383 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2385 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2386 ATTR_UID | ATTR_GID |
2387 ATTR_MTIME | ATTR_MTIME_SET |
2388 ATTR_ATIME | ATTR_ATIME_SET;
2392 rc = ll_setattr_raw(file_dentry(file), attr, true);
2396 inode_unlock(inode);
/* Map an fmode_t to the LL_LEASE_{RD,WR}LCK bit flags used by lease ioctls. */
2408 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2410 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2411 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * LL_IOC_FUTIMES_3 handler: set atime/mtime/ctime of a regular file from
 * the user-supplied ll_futimes_3 values. Root only (ctime setting is
 * normally impossible through the VFS).
 */
2414 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2416 struct inode *inode = file_inode(file);
2418 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2419 ATTR_MTIME | ATTR_MTIME_SET |
2420 ATTR_CTIME | ATTR_CTIME_SET,
2422 .tv_sec = lfu->lfu_atime_sec,
2423 .tv_nsec = lfu->lfu_atime_nsec,
2426 .tv_sec = lfu->lfu_mtime_sec,
2427 .tv_nsec = lfu->lfu_mtime_nsec,
2430 .tv_sec = lfu->lfu_ctime_sec,
2431 .tv_nsec = lfu->lfu_ctime_nsec,
2437 if (!capable(CAP_SYS_ADMIN))
2440 if (!S_ISREG(inode->i_mode))
2444 rc = ll_setattr_raw(file_dentry(file), &ia, false);
2445 inode_unlock(inode);
2451 * Give file access advices
2453 * The ladvise interface is similar to Linux fadvise() system call, except it
2454 * forwards the advices directly from Lustre client to server. The server side
2455 * codes will apply appropriate read-ahead and caching techniques for the
2456 * corresponding files.
2458 * A typical workload for ladvise is e.g. a bunch of different clients are
2459 * doing small random reads of a file, so prefetching pages into OSS cache
2460 * with big linear reads before the random IO is a net benefit. Fetching
2461 * all that data into each client cache with fadvise() may not be, due to
2462 * much more data being sent to the client.
2464 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2465 struct llapi_lu_ladvise *ladvise)
2469 struct cl_ladvise_io *lio;
2474 env = cl_env_get(&refcheck);
2476 RETURN(PTR_ERR(env));
2478 io = vvp_env_thread_io(env);
2479 io->ci_obj = ll_i2info(inode)->lli_clob;
2481 /* initialize parameters for ladvise */
2482 lio = &io->u.ci_ladvise;
2483 lio->li_start = ladvise->lla_start;
2484 lio->li_end = ladvise->lla_end;
2485 lio->li_fid = ll_inode2fid(inode);
2486 lio->li_advice = ladvise->lla_advice;
2487 lio->li_flags = flags;
2489 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2490 rc = cl_io_loop(env, io);
2494 cl_io_fini(env, io);
2495 cl_env_put(env, &refcheck);
/*
 * FS_IOC_FSGETXATTR handler: return the inode's extended flags and project
 * id to userspace in a struct fsxattr.
 */
2499 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
2502 struct fsxattr fsxattr;
2504 if (copy_from_user(&fsxattr,
2505 (const struct fsxattr __user *)arg,
2509 fsxattr.fsx_xflags = ll_inode_to_ext_flags(inode->i_flags);
2510 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
2511 if (copy_to_user((struct fsxattr __user *)arg,
2512 &fsxattr, sizeof(fsxattr)))
/*
 * FS_IOC_FSSETXATTR handler: set the inode's extended flags and project id.
 * Root only. The change is sent to the MDT via md_setattr(), then mirrored
 * to the local inode flags and pushed to the OSTs via cl_setattr_ost().
 */
2518 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
2522 struct md_op_data *op_data;
2523 struct ptlrpc_request *req = NULL;
2525 struct fsxattr fsxattr;
2526 struct cl_object *obj;
2528 /* only root could change project ID */
2529 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2532 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2533 LUSTRE_OPC_ANY, NULL);
2534 if (IS_ERR(op_data))
2535 RETURN(PTR_ERR(op_data));
2537 if (copy_from_user(&fsxattr,
2538 (const struct fsxattr __user *)arg,
2540 GOTO(out_fsxattr1, rc = -EFAULT);
2542 op_data->op_attr_flags = fsxattr.fsx_xflags;
2543 op_data->op_projid = fsxattr.fsx_projid;
2544 op_data->op_attr.ia_valid |= (MDS_ATTR_PROJID | ATTR_ATTR_FLAG);
2545 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
2547 ptlrpc_req_finished(req);
/* a file with no data objects has nothing to update on the OST side */
2549 obj = ll_i2info(inode)->lli_clob;
2553 inode->i_flags = ll_ext_to_inode_flags(fsxattr.fsx_xflags);
2554 OBD_ALLOC_PTR(attr);
2556 GOTO(out_fsxattr1, rc = -ENOMEM);
2557 attr->ia_valid = ATTR_ATTR_FLAG;
2558 rc = cl_setattr_ost(obj, attr, fsxattr.fsx_xflags);
2563 ll_finish_md_op_data(op_data);
2570 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2572 struct inode *inode = file_inode(file);
2573 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2577 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2578 PFID(ll_inode2fid(inode)), inode, cmd);
2579 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2581 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2582 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2586 case LL_IOC_GETFLAGS:
2587 /* Get the current value of the file flags */
2588 return put_user(fd->fd_flags, (int __user *)arg);
2589 case LL_IOC_SETFLAGS:
2590 case LL_IOC_CLRFLAGS:
2591 /* Set or clear specific file flags */
2592 /* XXX This probably needs checks to ensure the flags are
2593 * not abused, and to handle any flag side effects.
2595 if (get_user(flags, (int __user *) arg))
2598 if (cmd == LL_IOC_SETFLAGS) {
2599 if ((flags & LL_FILE_IGNORE_LOCK) &&
2600 !(file->f_flags & O_DIRECT)) {
2601 CERROR("%s: unable to disable locking on "
2602 "non-O_DIRECT file\n", current->comm);
2606 fd->fd_flags |= flags;
2608 fd->fd_flags &= ~flags;
2611 case LL_IOC_LOV_SETSTRIPE:
2612 case LL_IOC_LOV_SETSTRIPE_NEW:
2613 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
2614 case LL_IOC_LOV_SETEA:
2615 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
2616 case LL_IOC_LOV_SWAP_LAYOUTS: {
2618 struct lustre_swap_layouts lsl;
2620 if (copy_from_user(&lsl, (char __user *)arg,
2621 sizeof(struct lustre_swap_layouts)))
2624 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
2627 file2 = fget(lsl.sl_fd);
2631 /* O_WRONLY or O_RDWR */
2632 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
2633 GOTO(out, rc = -EPERM);
2635 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
2636 struct inode *inode2;
2637 struct ll_inode_info *lli;
2638 struct obd_client_handle *och = NULL;
2640 if (lsl.sl_flags != SWAP_LAYOUTS_CLOSE)
2641 GOTO(out, rc = -EINVAL);
2643 lli = ll_i2info(inode);
2644 mutex_lock(&lli->lli_och_mutex);
2645 if (fd->fd_lease_och != NULL) {
2646 och = fd->fd_lease_och;
2647 fd->fd_lease_och = NULL;
2649 mutex_unlock(&lli->lli_och_mutex);
2651 GOTO(out, rc = -ENOLCK);
2652 inode2 = file_inode(file2);
2653 rc = ll_swap_layouts_close(och, inode, inode2);
2655 rc = ll_swap_layouts(file, file2, &lsl);
2661 case LL_IOC_LOV_GETSTRIPE:
2662 case LL_IOC_LOV_GETSTRIPE_NEW:
2663 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
2664 case FSFILT_IOC_GETFLAGS:
2665 case FSFILT_IOC_SETFLAGS:
2666 RETURN(ll_iocontrol(inode, file, cmd, arg));
2667 case FSFILT_IOC_GETVERSION_OLD:
2668 case FSFILT_IOC_GETVERSION:
2669 RETURN(put_user(inode->i_generation, (int __user *)arg));
2670 case LL_IOC_GROUP_LOCK:
2671 RETURN(ll_get_grouplock(inode, file, arg));
2672 case LL_IOC_GROUP_UNLOCK:
2673 RETURN(ll_put_grouplock(inode, file, arg));
2674 case IOC_OBD_STATFS:
2675 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2677 /* We need to special case any other ioctls we want to handle,
2678 * to send them to the MDS/OST as appropriate and to properly
2679 * network encode the arg field.
2680 case FSFILT_IOC_SETVERSION_OLD:
2681 case FSFILT_IOC_SETVERSION:
2683 case LL_IOC_FLUSHCTX:
2684 RETURN(ll_flush_ctx(inode));
2685 case LL_IOC_PATH2FID: {
2686 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2687 sizeof(struct lu_fid)))
2692 case LL_IOC_GETPARENT:
2693 RETURN(ll_getparent(file, (struct getparent __user *)arg));
2695 case OBD_IOC_FID2PATH:
2696 RETURN(ll_fid2path(inode, (void __user *)arg));
2697 case LL_IOC_DATA_VERSION: {
2698 struct ioc_data_version idv;
2701 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
2704 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2705 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2708 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2714 case LL_IOC_GET_MDTIDX: {
2717 mdtidx = ll_get_mdt_idx(inode);
2721 if (put_user((int)mdtidx, (int __user *)arg))
2726 case OBD_IOC_GETDTNAME:
2727 case OBD_IOC_GETMDNAME:
2728 RETURN(ll_get_obd_name(inode, cmd, arg));
2729 case LL_IOC_HSM_STATE_GET: {
2730 struct md_op_data *op_data;
2731 struct hsm_user_state *hus;
2738 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2739 LUSTRE_OPC_ANY, hus);
2740 if (IS_ERR(op_data)) {
2742 RETURN(PTR_ERR(op_data));
2745 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2748 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2751 ll_finish_md_op_data(op_data);
2755 case LL_IOC_HSM_STATE_SET: {
2756 struct hsm_state_set *hss;
2763 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2768 rc = ll_hsm_state_set(inode, hss);
2773 case LL_IOC_HSM_ACTION: {
2774 struct md_op_data *op_data;
2775 struct hsm_current_action *hca;
2782 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2783 LUSTRE_OPC_ANY, hca);
2784 if (IS_ERR(op_data)) {
2786 RETURN(PTR_ERR(op_data));
2789 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2792 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2795 ll_finish_md_op_data(op_data);
2799 case LL_IOC_SET_LEASE: {
2800 struct ll_inode_info *lli = ll_i2info(inode);
2801 struct obd_client_handle *och = NULL;
2806 case LL_LEASE_WRLCK:
2807 if (!(file->f_mode & FMODE_WRITE))
2809 fmode = FMODE_WRITE;
2811 case LL_LEASE_RDLCK:
2812 if (!(file->f_mode & FMODE_READ))
2816 case LL_LEASE_UNLCK:
2817 mutex_lock(&lli->lli_och_mutex);
2818 if (fd->fd_lease_och != NULL) {
2819 och = fd->fd_lease_och;
2820 fd->fd_lease_och = NULL;
2822 mutex_unlock(&lli->lli_och_mutex);
2827 fmode = och->och_flags;
2828 rc = ll_lease_close(och, inode, &lease_broken);
2832 rc = ll_lease_och_release(inode, file);
2839 RETURN(ll_lease_type_from_fmode(fmode));
2844 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2846 /* apply for lease */
2847 och = ll_lease_open(inode, file, fmode, 0);
2849 RETURN(PTR_ERR(och));
2852 mutex_lock(&lli->lli_och_mutex);
2853 if (fd->fd_lease_och == NULL) {
2854 fd->fd_lease_och = och;
2857 mutex_unlock(&lli->lli_och_mutex);
2859 /* impossible now that only excl is supported for now */
2860 ll_lease_close(och, inode, &lease_broken);
2865 case LL_IOC_GET_LEASE: {
2866 struct ll_inode_info *lli = ll_i2info(inode);
2867 struct ldlm_lock *lock = NULL;
2870 mutex_lock(&lli->lli_och_mutex);
2871 if (fd->fd_lease_och != NULL) {
2872 struct obd_client_handle *och = fd->fd_lease_och;
2874 lock = ldlm_handle2lock(&och->och_lease_handle);
2876 lock_res_and_lock(lock);
2877 if (!ldlm_is_cancel(lock))
2878 fmode = och->och_flags;
2880 unlock_res_and_lock(lock);
2881 LDLM_LOCK_PUT(lock);
2884 mutex_unlock(&lli->lli_och_mutex);
2886 RETURN(ll_lease_type_from_fmode(fmode));
2888 case LL_IOC_HSM_IMPORT: {
2889 struct hsm_user_import *hui;
2895 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
2900 rc = ll_hsm_import(inode, file, hui);
2905 case LL_IOC_FUTIMES_3: {
2906 struct ll_futimes_3 lfu;
2908 if (copy_from_user(&lfu,
2909 (const struct ll_futimes_3 __user *)arg,
2913 RETURN(ll_file_futimes_3(file, &lfu));
2915 case LL_IOC_LADVISE: {
2916 struct llapi_ladvise_hdr *ladvise_hdr;
2919 int alloc_size = sizeof(*ladvise_hdr);
2922 OBD_ALLOC_PTR(ladvise_hdr);
2923 if (ladvise_hdr == NULL)
2926 if (copy_from_user(ladvise_hdr,
2927 (const struct llapi_ladvise_hdr __user *)arg,
2929 GOTO(out_ladvise, rc = -EFAULT);
2931 if (ladvise_hdr->lah_magic != LADVISE_MAGIC ||
2932 ladvise_hdr->lah_count < 1)
2933 GOTO(out_ladvise, rc = -EINVAL);
2935 num_advise = ladvise_hdr->lah_count;
2936 if (num_advise >= LAH_COUNT_MAX)
2937 GOTO(out_ladvise, rc = -EFBIG);
2939 OBD_FREE_PTR(ladvise_hdr);
2940 alloc_size = offsetof(typeof(*ladvise_hdr),
2941 lah_advise[num_advise]);
2942 OBD_ALLOC(ladvise_hdr, alloc_size);
2943 if (ladvise_hdr == NULL)
2947 * TODO: submit multiple advices to one server in a single RPC
2949 if (copy_from_user(ladvise_hdr,
2950 (const struct llapi_ladvise_hdr __user *)arg,
2952 GOTO(out_ladvise, rc = -EFAULT);
2954 for (i = 0; i < num_advise; i++) {
2955 rc = ll_ladvise(inode, file, ladvise_hdr->lah_flags,
2956 &ladvise_hdr->lah_advise[i]);
2962 OBD_FREE(ladvise_hdr, alloc_size);
2965 case LL_IOC_FSGETXATTR:
2966 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
2967 case LL_IOC_FSSETXATTR:
2968 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
2970 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
2972 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2973 (void __user *)arg));
2977 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Validate and install a new file position (compat fallback for kernels
 * without generic_file_llseek_size).  Negative offsets are rejected unless
 * the file was opened with FMODE_UNSIGNED_OFFSET, and offsets beyond
 * @maxsize are rejected.  f_pos is only written when the value actually
 * changes; f_version is zeroed alongside it (presumably to invalidate any
 * position-derived cached state -- confirm against upstream kernel source).
 */
2978 static inline loff_t
2979 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2981 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2983 if (offset > maxsize)
2986 if (offset != file->f_pos) {
2987 file->f_pos = offset;
2988 file->f_version = 0;
/*
 * Compat copy of the kernel's generic_file_llseek_size() used when the
 * running kernel does not export it.  Handles SEEK_SET/CUR/END plus
 * SEEK_DATA/SEEK_HOLE against the supplied @maxsize and @eof, deferring
 * the final bounds check and f_pos update to llseek_execute() above.
 */
2994 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2995 loff_t maxsize, loff_t eof)
2997 struct inode *inode = file_inode(file);
3005 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3006 * position-querying operation. Avoid rewriting the "same"
3007 * f_pos value back to the file because a concurrent read(),
3008 * write() or lseek() might have altered it
3013 * f_lock protects against read/modify/write race with other
3014 * SEEK_CURs. Note that parallel writes and reads behave
3018 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3019 inode_unlock(inode);
3023 * In the generic case the entire file is data, so as long as
3024 * offset isn't at the end of the file then the offset is data.
3031 * There is a virtual hole at the end of the file, so as long as
3032 * offset isn't i_size or larger, return i_size.
3040 return llseek_execute(file, offset, maxsize);
/*
 * llseek handler for Lustre files.  For SEEK_END/SEEK_HOLE/SEEK_DATA the
 * cluster-wide file size must be known, so ll_glimpse_size() is issued
 * first to refresh i_size from the OSTs before delegating to the generic
 * llseek-with-size helper, bounded by ll_file_maxbytes().
 */
3044 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3046 struct inode *inode = file_inode(file);
3047 loff_t retval, eof = 0;
/* retval here is only a provisional target offset used for tracing below */
3050 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3051 (origin == SEEK_CUR) ? file->f_pos : 0);
3052 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3053 PFID(ll_inode2fid(inode)), inode, retval, retval,
3055 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3057 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
/* glimpse refreshes i_size from the servers before it is read below */
3058 retval = ll_glimpse_size(inode);
3061 eof = i_size_read(inode);
3064 retval = ll_generic_file_llseek_size(file, offset, origin,
3065 ll_file_maxbytes(inode), eof);
/*
 * flush handler (called on close(2) of every file descriptor).  Collects
 * any asynchronous writeback error recorded in lli_async_rc and in the
 * cl_object layer, clearing them as they are read.  Returns -EIO if an
 * error was pending, unless the error was already reported to the
 * application via fd_write_failed, in which case 0 is returned so the
 * failure is not reported twice.
 */
3069 static int ll_flush(struct file *file, fl_owner_t id)
3071 struct inode *inode = file_inode(file);
3072 struct ll_inode_info *lli = ll_i2info(inode);
3073 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3076 LASSERT(!S_ISDIR(inode->i_mode));
3078 /* catch async errors that were recorded back when async writeback
3079 * failed for pages in this mapping. */
3080 rc = lli->lli_async_rc;
3081 lli->lli_async_rc = 0;
3082 if (lli->lli_clob != NULL) {
3083 err = lov_read_and_clear_async_rc(lli->lli_clob);
3088 /* The application has been told write failure already.
3089 * Do not report failure again. */
3090 if (fd->fd_write_failed)
3092 return rc ? -EIO : 0;
3096 * Called to make sure a portion of file has been written out.
3097 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
3099 * Return how many pages have been written.
/*
 * Drives a CIT_FSYNC cl_io over [@start, @end] of @inode.  @mode must be
 * one of CL_FSYNC_{NONE,LOCAL,DISCARD,ALL}; @ignore_layout lets the sync
 * proceed across a layout change.  On success the number of pages written
 * (fio->fi_nr_written) is returned.
 */
3101 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3102 enum cl_fsync_mode mode, int ignore_layout)
3106 struct cl_fsync_io *fio;
/* reject any mode value outside the four supported fsync modes */
3111 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3112 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3115 env = cl_env_get(&refcheck);
3117 RETURN(PTR_ERR(env));
3119 io = vvp_env_thread_io(env);
3120 io->ci_obj = ll_i2info(inode)->lli_clob;
3121 io->ci_ignore_layout = ignore_layout;
3123 /* initialize parameters for sync */
3124 fio = &io->u.ci_fsync;
3125 fio->fi_start = start;
3127 fio->fi_fid = ll_inode2fid(inode);
3128 fio->fi_mode = mode;
3129 fio->fi_nr_written = 0;
3131 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3132 result = cl_io_loop(env, io);
3134 result = io->ci_result;
3136 result = fio->fi_nr_written;
/* cl_io_fini/cl_env_put run on both success and failure paths */
3137 cl_io_fini(env, io);
3138 cl_env_put(env, &refcheck);
3144 * When dentry is provided (the 'else' case), file_dentry() may be
3145 * null and dentry must be used directly rather than pulled from
3146 * file_dentry() as is done otherwise.
/*
 * fsync handler.  Three prototypes are supported depending on kernel API:
 * 4-arg (start/end range), 2-arg, and the old 3-arg dentry form; the
 * older variants sync the whole file (end = LLONG_MAX).  Flushes dirty
 * pages, collects recorded async writeback errors, syncs metadata via
 * md_fsync(), and for regular files runs a CL_FSYNC_ALL range sync,
 * updating fd_write_failed so ll_flush() does not double-report errors.
 */
3149 #ifdef HAVE_FILE_FSYNC_4ARGS
3150 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3152 struct dentry *dentry = file_dentry(file);
3154 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3155 int ll_fsync(struct file *file, int datasync)
3157 struct dentry *dentry = file_dentry(file);
3159 loff_t end = LLONG_MAX;
3161 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3164 loff_t end = LLONG_MAX;
3166 struct inode *inode = dentry->d_inode;
3167 struct ll_inode_info *lli = ll_i2info(inode);
3168 struct ptlrpc_request *req;
3172 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3173 PFID(ll_inode2fid(inode)), inode);
3174 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3176 #ifdef HAVE_FILE_FSYNC_4ARGS
3177 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
/* avoid recursive locking if the caller already holds the inode lock */
3178 lock_inode = !lli->lli_inode_locked;
3182 /* fsync's caller has already called _fdata{sync,write}, we want
3183 * that IO to finish before calling the osc and mdc sync methods */
3184 rc = filemap_fdatawait(inode->i_mapping);
3187 /* catch async errors that were recorded back when async writeback
3188 * failed for pages in this mapping. */
3189 if (!S_ISDIR(inode->i_mode)) {
3190 err = lli->lli_async_rc;
3191 lli->lli_async_rc = 0;
3194 if (lli->lli_clob != NULL) {
3195 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* sync metadata on the MDS regardless of file type */
3201 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3205 ptlrpc_req_finished(req);
3207 if (S_ISREG(inode->i_mode)) {
3208 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3210 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3211 if (rc == 0 && err < 0)
3214 fd->fd_write_failed = true;
3216 fd->fd_write_failed = false;
3219 #ifdef HAVE_FILE_FSYNC_4ARGS
3221 inode_unlock(inode);
/*
 * flock/fcntl lock handler.  Translates a VFS struct file_lock (both
 * FL_FLOCK whole-file locks and FL_POSIX byte-range locks) into an LDLM
 * flock enqueue on the MDS, then mirrors the result into the local lock
 * tables via locks_lock_file_wait() (or the older flock_/posix_ variants).
 * If the local bookkeeping fails after a successful server enqueue, the
 * server lock is torn down again with an LCK_NL (unlock) enqueue.
 */
3227 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3229 struct inode *inode = file_inode(file);
3230 struct ll_sb_info *sbi = ll_i2sbi(inode);
3231 struct ldlm_enqueue_info einfo = {
3232 .ei_type = LDLM_FLOCK,
3233 .ei_cb_cp = ldlm_flock_completion_ast,
3234 .ei_cbdata = file_lock,
3236 struct md_op_data *op_data;
3237 struct lustre_handle lockh = { 0 };
3238 union ldlm_policy_data flock = { { 0 } };
3239 int fl_type = file_lock->fl_type;
3245 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3246 PFID(ll_inode2fid(inode)), file_lock);
3248 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3250 if (file_lock->fl_flags & FL_FLOCK) {
3251 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3252 /* flocks are whole-file locks */
3253 flock.l_flock.end = OFFSET_MAX;
3254 /* For flocks owner is determined by the local file desctiptor*/
3255 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3256 } else if (file_lock->fl_flags & FL_POSIX) {
3257 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3258 flock.l_flock.start = file_lock->fl_start;
3259 flock.l_flock.end = file_lock->fl_end;
3263 flock.l_flock.pid = file_lock->fl_pid;
3265 /* Somewhat ugly workaround for svc lockd.
3266 * lockd installs custom fl_lmops->lm_compare_owner that checks
3267 * for the fl_owner to be the same (which it always is on local node
3268 * I guess between lockd processes) and then compares pid.
3269 * As such we assign pid to the owner field to make it all work,
3270 * conflict with normal locks is unlikely since pid space and
3271 * pointer space for current->files are not intersecting */
3272 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3273 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* map fcntl lock types onto LDLM modes: read -> PR, write -> PW */
3277 einfo.ei_mode = LCK_PR;
3280 /* An unlock request may or may not have any relation to
3281 * existing locks so we may not be able to pass a lock handle
3282 * via a normal ldlm_lock_cancel() request. The request may even
3283 * unlock a byte range in the middle of an existing lock. In
3284 * order to process an unlock request we need all of the same
3285 * information that is given with a normal read or write record
3286 * lock request. To avoid creating another ldlm unlock (cancel)
3287 * message we'll treat a LCK_NL flock request as an unlock. */
3288 einfo.ei_mode = LCK_NL;
3291 einfo.ei_mode = LCK_PW;
3294 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* map the fcntl command onto LDLM enqueue flags */
3309 flags = LDLM_FL_BLOCK_NOWAIT;
3315 flags = LDLM_FL_TEST_LOCK;
3318 CERROR("unknown fcntl lock command: %d\n", cmd);
3322 /* Save the old mode so that if the mode in the lock changes we
3323 * can decrement the appropriate reader or writer refcount. */
3324 file_lock->fl_type = einfo.ei_mode;
3326 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3327 LUSTRE_OPC_ANY, NULL);
3328 if (IS_ERR(op_data))
3329 RETURN(PTR_ERR(op_data));
3331 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
3332 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
3333 flock.l_flock.pid, flags, einfo.ei_mode,
3334 flock.l_flock.start, flock.l_flock.end);
3336 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
3339 /* Restore the file lock type if not TEST lock. */
3340 if (!(flags & LDLM_FL_TEST_LOCK))
3341 file_lock->fl_type = fl_type;
3343 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
3344 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
3345 !(flags & LDLM_FL_TEST_LOCK))
3346 rc2 = locks_lock_file_wait(file, file_lock);
3348 if ((file_lock->fl_flags & FL_FLOCK) &&
3349 (rc == 0 || file_lock->fl_type == F_UNLCK))
3350 rc2 = flock_lock_file_wait(file, file_lock);
3351 if ((file_lock->fl_flags & FL_POSIX) &&
3352 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3353 !(flags & LDLM_FL_TEST_LOCK))
3354 rc2 = posix_lock_file_wait(file, file_lock);
3355 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
3357 if (rc2 && file_lock->fl_type != F_UNLCK) {
/* local bookkeeping failed: release the server-side lock again */
3358 einfo.ei_mode = LCK_NL;
3359 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
3364 ll_finish_md_op_data(op_data);
/*
 * Look up the FID of @name (length @namelen) under directory @parent via
 * an MDS getattr-by-name RPC.  On success *@fid is filled from the reply
 * body, and if @inode is non-NULL the corresponding inode is instantiated
 * through ll_prep_inode().  Caller owns any inode reference returned.
 */
3369 int ll_get_fid_by_name(struct inode *parent, const char *name,
3370 int namelen, struct lu_fid *fid,
3371 struct inode **inode)
3373 struct md_op_data *op_data = NULL;
3374 struct mdt_body *body;
3375 struct ptlrpc_request *req;
3379 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3380 LUSTRE_OPC_ANY, NULL);
3381 if (IS_ERR(op_data))
3382 RETURN(PTR_ERR(op_data));
3384 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
3385 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3386 ll_finish_md_op_data(op_data);
3390 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3392 GOTO(out_req, rc = -EFAULT);
3394 *fid = body->mbo_fid1;
3397 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
3399 ptlrpc_req_finished(req);
/*
 * Migrate directory entry @name under @parent to MDT @mdtidx (the
 * "lfs migrate -m" path).  Resolves the child inode (dcache first, then
 * by-name getattr), takes a write lease on regular files so the data
 * version can be pinned across the move, and issues an MDS rename with
 * CLI_MIGRATE/MDS_RENAME_MIGRATE.  Retries on -EAGAIN if the file layout
 * changed mid-migration.  Migrating the fileset root is refused.
 */
3403 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3404 const char *name, int namelen)
3406 struct dentry *dchild = NULL;
3407 struct inode *child_inode = NULL;
3408 struct md_op_data *op_data;
3409 struct ptlrpc_request *request = NULL;
3410 struct obd_client_handle *och = NULL;
3412 struct mdt_body *body;
3414 __u64 data_version = 0;
3417 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3418 name, PFID(ll_inode2fid(parent)), mdtidx);
3420 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3421 0, LUSTRE_OPC_ANY, NULL);
3422 if (IS_ERR(op_data))
3423 RETURN(PTR_ERR(op_data));
3425 /* Get child FID first */
3426 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
3429 dchild = d_lookup(file_dentry(file), &qstr);
3430 if (dchild != NULL) {
3431 if (dchild->d_inode != NULL)
3432 child_inode = igrab(dchild->d_inode);
/* dcache miss: fall back to resolving the FID with an MDS RPC */
3436 if (child_inode == NULL) {
3437 rc = ll_get_fid_by_name(parent, name, namelen,
3438 &op_data->op_fid3, &child_inode);
3443 if (child_inode == NULL)
3444 GOTO(out_free, rc = -EINVAL);
3447 * lfs migrate command needs to be blocked on the client
3448 * by checking the migrate FID against the FID of the
3451 if (child_inode == parent->i_sb->s_root->d_inode)
3452 GOTO(out_iput, rc = -EINVAL);
3454 inode_lock(child_inode);
3455 op_data->op_fid3 = *ll_inode2fid(child_inode);
3456 if (!fid_is_sane(&op_data->op_fid3)) {
3457 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
3458 ll_get_fsname(parent->i_sb, NULL, 0), name,
3459 PFID(&op_data->op_fid3));
3460 GOTO(out_unlock, rc = -EINVAL);
3463 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3465 GOTO(out_unlock, rc);
/* nothing to do if the object already lives on the target MDT */
3468 CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name,
3469 PFID(&op_data->op_fid3), mdtidx);
3470 GOTO(out_unlock, rc = 0);
3473 if (S_ISREG(child_inode->i_mode)) {
/* take a write lease so concurrent writers invalidate the migration */
3474 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
3478 GOTO(out_unlock, rc);
3481 rc = ll_data_version(child_inode, &data_version,
3484 GOTO(out_close, rc);
3486 op_data->op_handle = och->och_fh;
3487 op_data->op_data = och->och_mod;
3488 op_data->op_data_version = data_version;
3489 op_data->op_lease_handle = och->och_lease_handle;
3490 op_data->op_bias |= MDS_RENAME_MIGRATE;
3493 op_data->op_mds = mdtidx;
3494 op_data->op_cli_flags = CLI_MIGRATE;
/* migration is implemented as a same-name rename onto the target MDT */
3495 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3496 namelen, name, namelen, &request);
3498 LASSERT(request != NULL);
3499 ll_update_times(request, parent);
3501 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
3502 LASSERT(body != NULL);
3504 /* If the server does release layout lock, then we cleanup
3505 * the client och here, otherwise release it in out_close: */
3507 body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
3508 obd_mod_put(och->och_mod);
3509 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
3511 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
3517 if (request != NULL) {
3518 ptlrpc_req_finished(request);
3522 /* Try again if the file layout has changed. */
3523 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
3527 if (och != NULL) /* close the file */
3528 ll_lease_close(och, child_inode, NULL);
3530 clear_nlink(child_inode);
3532 inode_unlock(child_inode);
3536 ll_finish_md_op_data(op_data);
/*
 * flock handler used for "-o noflock"/"-o localflock" style mounts; body
 * not shown in this listing -- presumably rejects or locally satisfies
 * lock requests without contacting the MDS (confirm against full source).
 */
3541 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3549 * test if some locks matching bits and l_req_mode are acquired
3550 * - bits can be in different locks
3551 * - if found clear the common lock bits in *bits
3552 * - the bits not found, are kept in *bits
3554 * \param bits [IN] searched lock bits [IN]
3555 * \param l_req_mode [IN] searched lock mode
3556 * \retval boolean, true iff all bits are found
/*
 * Probes the local LDLM namespace (LDLM_FL_TEST_LOCK: no references are
 * taken) one inodebit at a time; LCK_MINMODE means "any of CR|CW|PR|PW".
 */
3558 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
3560 struct lustre_handle lockh;
3561 union ldlm_policy_data policy;
3562 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
3563 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
3572 fid = &ll_i2info(inode)->lli_fid;
3573 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3574 ldlm_lockname[mode]);
3576 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* walk each inodebit separately: matching bits may live in different locks */
3577 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3578 policy.l_inodebits.bits = *bits & (1 << i);
3579 if (policy.l_inodebits.bits == 0)
3582 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3583 &policy, mode, &lockh)) {
3584 struct ldlm_lock *lock;
3586 lock = ldlm_handle2lock(&lockh);
3589 ~(lock->l_policy_data.l_inodebits.bits);
3590 LDLM_LOCK_PUT(lock);
3592 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match (and, unlike ll_have_md_lock, actually take a reference on)
 * an MDC inodebits lock covering @bits on @inode.  @flags are extra LDLM
 * match flags OR'd with LDLM_FL_BLOCK_GRANTED; the matched handle is
 * stored in *@lockh.  Returns the matched mode (0 if none).
 */
3599 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
3600 struct lustre_handle *lockh, __u64 flags,
3601 enum ldlm_mode mode)
3603 union ldlm_policy_data policy = { .l_inodebits = { bits } };
3608 fid = &ll_i2info(inode)->lli_fid;
3609 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3611 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3612 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the result of a revalidate RPC.  -ENOENT (object already
 * unlinked) is generally absorbed as success, except for striped
 * directories with a bad stripe, which are revalidated again.  Other
 * errors are logged (at D_INFO for expected -EACCES/-EIDRM) and passed
 * through.
 */
3617 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3619 /* Already unlinked. Just update nlink and return success */
3620 if (rc == -ENOENT) {
3622 /* If it is striped directory, and there is bad stripe
3623 * Let's revalidate the dentry again, instead of returning
3625 if (S_ISDIR(inode->i_mode) &&
3626 ll_i2info(inode)->lli_lsm_md != NULL)
3629 /* This path cannot be hit for regular files unless in
3630 * case of obscure races, so no need to to validate
3632 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3634 } else if (rc != 0) {
3635 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3636 "%s: revalidate FID "DFID" error: rc = %d\n",
3637 ll_get_fsname(inode->i_sb, NULL, 0),
3638 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate @dentry's inode attributes covered by inodebits @ibits.
 * Two strategies: if the server supports OBD_CONNECT_ATTRFID, an intent
 * getattr/lookup by FID is used (which also refreshes dentry validity);
 * otherwise, only when no matching MD lock is already held locally, a
 * plain md_getattr is issued (requesting EA sizes for regular files).
 */
3644 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3646 struct inode *inode = dentry->d_inode;
3647 struct ptlrpc_request *req = NULL;
3648 struct obd_export *exp;
3652 LASSERT(inode != NULL);
3654 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3655 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3657 exp = ll_i2mdexp(inode);
3659 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3660 * But under CMD case, it caused some lock issues, should be fixed
3661 * with new CMD ibits lock. See bug 12718 */
3662 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3663 struct lookup_intent oit = { .it_op = IT_GETATTR };
3664 struct md_op_data *op_data;
3666 if (ibits == MDS_INODELOCK_LOOKUP)
3667 oit.it_op = IT_LOOKUP;
3669 /* Call getattr by fid, so do not provide name at all. */
3670 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3671 dentry->d_inode, NULL, 0, 0,
3672 LUSTRE_OPC_ANY, NULL);
3673 if (IS_ERR(op_data))
3674 RETURN(PTR_ERR(op_data));
3676 rc = md_intent_lock(exp, op_data, &oit, &req,
3677 &ll_md_blocking_ast, 0);
3678 ll_finish_md_op_data(op_data);
3680 rc = ll_inode_revalidate_fini(inode, rc);
3684 rc = ll_revalidate_it_finish(req, &oit, dentry);
3686 ll_intent_release(&oit);
3690 /* Unlinked? Unhash dentry, so it is not picked up later by
3691 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3692 here to preserve get_cwd functionality on 2.6.
3694 if (!dentry->d_inode->i_nlink) {
3695 ll_lock_dcache(inode);
3696 d_lustre_invalidate(dentry, 0);
3697 ll_unlock_dcache(inode);
3700 ll_lookup_finish_locks(&oit, dentry);
3701 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
/* no cached MD lock covers @ibits -> fetch fresh attributes */
3702 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3703 u64 valid = OBD_MD_FLGETATTR;
3704 struct md_op_data *op_data;
3707 if (S_ISREG(inode->i_mode)) {
3708 rc = ll_get_default_mdsize(sbi, &ealen);
3711 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3714 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3715 0, ealen, LUSTRE_OPC_ANY,
3717 if (IS_ERR(op_data))
3718 RETURN(PTR_ERR(op_data));
3720 op_data->op_valid = valid;
3721 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3722 ll_finish_md_op_data(op_data);
3724 rc = ll_inode_revalidate_fini(inode, rc);
3728 rc = ll_prep_inode(&inode, req, NULL, NULL);
3731 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge the per-stripe attributes (nlink,
 * blocks, size, a/m/ctime) from all MDT stripes into the master inode
 * via md_merge_attr(); the merged times are cached in ll_inode_info.
 */
3735 static int ll_merge_md_attr(struct inode *inode)
3737 struct cl_attr attr = { 0 };
3740 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3741 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3742 &attr, ll_md_blocking_ast);
3746 set_nlink(inode, attr.cat_nlink);
3747 inode->i_blocks = attr.cat_blocks;
3748 i_size_write(inode, attr.cat_size);
3750 ll_i2info(inode)->lli_atime = attr.cat_atime;
3751 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
3752 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/*
 * Full revalidate: refresh MD attributes via __ll_inode_revalidate(),
 * then refresh size/blocks.  Striped directories merge stripe attrs;
 * non-regular files just copy cached times into the inode; regular files
 * glimpse the size from the OSTs unless an HSM restore is in progress
 * (in which case the MDT-provided size is already authoritative).
 */
3758 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3760 struct inode *inode = dentry->d_inode;
3764 rc = __ll_inode_revalidate(dentry, ibits);
3768 /* if object isn't regular file, don't validate size */
3769 if (!S_ISREG(inode->i_mode)) {
3770 if (S_ISDIR(inode->i_mode) &&
3771 ll_i2info(inode)->lli_lsm_md != NULL) {
3772 rc = ll_merge_md_attr(inode);
3777 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
3778 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
3779 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
3781 /* In case of restore, the MDT has the right size and has
3782 * already send it back without granting the layout lock,
3783 * inode is up-to-date so glimpse is useless.
3784 * Also to glimpse we need the layout, in case of a running
3785 * restore the MDT holds the layout lock so the glimpse will
3786 * block up to the end of restore (getattr will block)
3788 if (!ll_file_test_flag(ll_i2info(inode), LLIF_FILE_RESTORING))
3789 rc = ll_glimpse_size(inode);
/*
 * Squash a dev_t so both major and minor fit in 8 bits each, for 32-bit
 * compat stat calls (see comment below).  Not a faithful encoding -- it
 * only needs to pass old_valid_dev().
 */
3794 static inline dev_t ll_compat_encode_dev(dev_t dev)
3796 /* The compat_sys_*stat*() syscalls will fail unless the
3797 * device majors and minors are both less than 256. Note that
3798 * the value returned here will be passed through
3799 * old_encode_dev() in cp_compat_stat(). And so we are not
3800 * trying to return a valid compat (u16) device number, just
3801 * one that will pass the old_valid_dev() check. */
3803 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
/*
 * getattr handler (both the modern path/request_mask prototype and the
 * legacy vfsmount/dentry one).  Revalidates UPDATE|LOOKUP inodebits and
 * then fills *stat from the inode.  Under a 32-bit API (ll_need_32bit_api)
 * the ino is rebuilt with cl_fid_build_ino() and device numbers are
 * squashed via ll_compat_encode_dev().
 */
3806 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
3807 int ll_getattr(const struct path *path, struct kstat *stat,
3808 u32 request_mask, unsigned int flags)
3811 struct dentry *de = path->dentry;
3813 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3816 struct inode *inode = de->d_inode;
3817 struct ll_sb_info *sbi = ll_i2sbi(inode);
3818 struct ll_inode_info *lli = ll_i2info(inode);
3821 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3822 MDS_INODELOCK_LOOKUP);
3823 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3828 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
3830 if (ll_need_32bit_api(sbi)) {
3831 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3832 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
3833 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
3835 stat->ino = inode->i_ino;
3836 stat->dev = inode->i_sb->s_dev;
3837 stat->rdev = inode->i_rdev;
3840 stat->mode = inode->i_mode;
3841 stat->uid = inode->i_uid;
3842 stat->gid = inode->i_gid;
3843 stat->atime = inode->i_atime;
3844 stat->mtime = inode->i_mtime;
3845 stat->ctime = inode->i_ctime;
3846 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
3848 stat->nlink = inode->i_nlink;
3849 stat->size = i_size_read(inode);
3850 stat->blocks = inode->i_blocks;
/*
 * fiemap handler: builds a struct fiemap sized for fi_extents_max
 * extents, copies any caller-supplied extent array in, runs the mapping
 * via ll_do_fiemap(), and copies flags/extent results back to the
 * fiemap_extent_info.  The temporary buffer is freed on all paths.
 */
3855 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3856 __u64 start, __u64 len)
3860 struct fiemap *fiemap;
3861 unsigned int extent_count = fieinfo->fi_extents_max;
3863 num_bytes = sizeof(*fiemap) + (extent_count *
3864 sizeof(struct fiemap_extent));
3865 OBD_ALLOC_LARGE(fiemap, num_bytes);
3870 fiemap->fm_flags = fieinfo->fi_flags;
3871 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3872 fiemap->fm_start = start;
3873 fiemap->fm_length = len;
/* only the first user extent is primed here -- note size copied is one
 * struct fiemap_extent; verify against full source if extending this */
3874 if (extent_count > 0 &&
3875 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3876 sizeof(struct fiemap_extent)) != 0)
3877 GOTO(out, rc = -EFAULT);
3879 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3881 fieinfo->fi_flags = fiemap->fm_flags;
3882 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3883 if (extent_count > 0 &&
3884 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3885 fiemap->fm_mapped_extents *
3886 sizeof(struct fiemap_extent)) != 0)
3887 GOTO(out, rc = -EFAULT);
3889 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * Return a referenced copy of the cached POSIX ACL for @inode.  The copy
 * is taken under lli_lock; the caller (the VFS permission machinery)
 * releases the reference.
 */
3893 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3895 struct ll_inode_info *lli = ll_i2info(inode);
3896 struct posix_acl *acl = NULL;
3899 spin_lock(&lli->lli_lock);
3900 /* VFS' acl_permission_check->check_acl will release the refcount */
3901 acl = posix_acl_dup(lli->lli_posix_acl);
3902 spin_unlock(&lli->lli_lock);
3907 #ifdef HAVE_IOP_SET_ACL
3908 #ifdef CONFIG_FS_POSIX_ACL
/*
 * set_acl inode operation.  Serializes @acl to the POSIX ACL xattr format
 * and stores it on the MDS via md_setxattr() (OBD_MD_FLXATTRRM removes
 * the xattr when @acl is NULL).  ACL_TYPE_ACCESS also folds the ACL into
 * i_mode via posix_acl_update_mode(); ACL_TYPE_DEFAULT is only valid on
 * directories.  On success the local ACL cache is updated/forgotten.
 */
3909 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
3911 struct ll_sb_info *sbi = ll_i2sbi(inode);
3912 struct ptlrpc_request *req = NULL;
3913 const char *name = NULL;
3915 size_t value_size = 0;
3920 case ACL_TYPE_ACCESS:
3921 name = XATTR_NAME_POSIX_ACL_ACCESS;
3923 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
3929 case ACL_TYPE_DEFAULT:
3930 name = XATTR_NAME_POSIX_ACL_DEFAULT;
3931 if (!S_ISDIR(inode->i_mode))
3932 GOTO(out, rc = acl ? -EACCES : 0);
3936 GOTO(out, rc = -EINVAL);
3940 value_size = posix_acl_xattr_size(acl->a_count);
3941 value = kmalloc(value_size, GFP_NOFS);
3943 GOTO(out, rc = -ENOMEM);
3945 rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
3947 GOTO(out_value, rc);
3950 rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
3951 value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
3952 name, value, value_size, 0, 0, 0, &req);
3954 ptlrpc_req_finished(req);
3959 set_cached_acl(inode, type, acl);
3961 forget_cached_acl(inode, type);
3964 #endif /* CONFIG_FS_POSIX_ACL */
3965 #endif /* HAVE_IOP_SET_ACL */
3967 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
/*
 * ACL check callback passed to generic_permission() on older kernels.
 * Fetches the cached access ACL and evaluates @mask against it; under
 * RCU-walk (IPERM_FLAG_RCU) it cannot block, so the elided branch
 * presumably bails out early -- confirm against full source.  Without
 * CONFIG_FS_POSIX_ACL this compiles to a stub.
 */
3969 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3970 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3972 ll_check_acl(struct inode *inode, int mask)
3975 # ifdef CONFIG_FS_POSIX_ACL
3976 struct posix_acl *acl;
3980 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3981 if (flags & IPERM_FLAG_RCU)
3984 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3989 rc = posix_acl_permission(inode, acl, mask);
3990 posix_acl_release(acl);
3993 # else /* !CONFIG_FS_POSIX_ACL */
3995 # endif /* CONFIG_FS_POSIX_ACL */
3997 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * permission inode operation (prototype varies by kernel API).  Extra
 * Lustre work on top of generic_permission(): the root inode is
 * revalidated first (it is not validated during lookup), and root-squash
 * is applied -- if configured and the caller is root, fsuid/fsgid are
 * overridden to the squash ids and filesystem capabilities are dropped
 * for the duration of the check, restored via revert_creds() afterward.
 */
3999 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4000 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4002 # ifdef HAVE_INODE_PERMISION_2ARGS
4003 int ll_inode_permission(struct inode *inode, int mask)
4005 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4010 struct ll_sb_info *sbi;
4011 struct root_squash_info *squash;
4012 struct cred *cred = NULL;
4013 const struct cred *old_cred = NULL;
4015 bool squash_id = false;
/* under RCU-walk we may not block; the elided branch presumably returns
 * -ECHILD to force ref-walk -- confirm against full source */
4018 #ifdef MAY_NOT_BLOCK
4019 if (mask & MAY_NOT_BLOCK)
4021 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4022 if (flags & IPERM_FLAG_RCU)
4026 /* as root inode are NOT getting validated in lookup operation,
4027 * need to do it before permission check. */
4029 if (inode == inode->i_sb->s_root->d_inode) {
4030 rc = __ll_inode_revalidate(inode->i_sb->s_root,
4031 MDS_INODELOCK_LOOKUP);
4036 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4037 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4039 /* squash fsuid/fsgid if needed */
4040 sbi = ll_i2sbi(inode);
4041 squash = &sbi->ll_squash;
4042 if (unlikely(squash->rsi_uid != 0 &&
4043 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4044 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4048 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4049 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4050 squash->rsi_uid, squash->rsi_gid);
4052 /* update current process's credentials
4053 * and FS capability */
4054 cred = prepare_creds();
4058 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4059 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* drop every filesystem-related capability bit for the squashed creds */
4060 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4061 if ((1 << cap) & CFS_CAP_FS_MASK)
4062 cap_lower(cred->cap_effective, cap);
4064 old_cred = override_creds(cred);
4067 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4068 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4069 /* restore current process's credentials and FS capability */
4071 revert_creds(old_cred);
4078 /* -o localflock - only provides locally consistent flock locks */
/*
 * Default file_operations table: no .flock/.lock entries, so flock locks
 * are handled locally by the VFS (mount option "-o localflock").
 * Read/write entries depend on whether the kernel uses the iter-based
 * read_iter/write_iter interface or the older aio_read/aio_write one.
 */
4079 struct file_operations ll_file_operations = {
4080 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
/* On iter-capable kernels without implicit sync wrappers, plain
 * read/write go through the kernel's new_sync_read/new_sync_write. */
4081 # ifdef HAVE_SYNC_READ_WRITE
4082 .read = new_sync_read,
4083 .write = new_sync_write,
4085 .read_iter = ll_file_read_iter,
4086 .write_iter = ll_file_write_iter,
4087 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4088 .read = ll_file_read,
4089 .aio_read = ll_file_aio_read,
4090 .write = ll_file_write,
4091 .aio_write = ll_file_aio_write,
4092 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4093 .unlocked_ioctl = ll_file_ioctl,
4094 .open = ll_file_open,
4095 .release = ll_file_release,
4096 .mmap = ll_file_mmap,
4097 .llseek = ll_file_seek,
4098 .splice_read = ll_file_splice_read,
/*
 * file_operations table used when cluster-wide flock is enabled
 * (mount option "-o flock"): identical to ll_file_operations except that
 * .flock and .lock route through ll_file_flock for distributed locking.
 */
4103 struct file_operations ll_file_operations_flock = {
4104 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4105 # ifdef HAVE_SYNC_READ_WRITE
4106 .read = new_sync_read,
4107 .write = new_sync_write,
4108 # endif /* HAVE_SYNC_READ_WRITE */
4109 .read_iter = ll_file_read_iter,
4110 .write_iter = ll_file_write_iter,
4111 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4112 .read = ll_file_read,
4113 .aio_read = ll_file_aio_read,
4114 .write = ll_file_write,
4115 .aio_write = ll_file_aio_write,
4116 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4117 .unlocked_ioctl = ll_file_ioctl,
4118 .open = ll_file_open,
4119 .release = ll_file_release,
4120 .mmap = ll_file_mmap,
4121 .llseek = ll_file_seek,
4122 .splice_read = ll_file_splice_read,
/* Cluster-coherent BSD flock and POSIX fcntl locks. */
4125 .flock = ll_file_flock,
4126 .lock = ll_file_flock
4129 /* These are for -o noflock - to return ENOSYS on flock calls */
/*
 * file_operations table for "-o noflock": same as the default table but
 * .flock/.lock point at ll_file_noflock, which rejects lock requests.
 */
4130 struct file_operations ll_file_operations_noflock = {
4131 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4132 # ifdef HAVE_SYNC_READ_WRITE
4133 .read = new_sync_read,
4134 .write = new_sync_write,
4135 # endif /* HAVE_SYNC_READ_WRITE */
4136 .read_iter = ll_file_read_iter,
4137 .write_iter = ll_file_write_iter,
4138 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4139 .read = ll_file_read,
4140 .aio_read = ll_file_aio_read,
4141 .write = ll_file_write,
4142 .aio_write = ll_file_aio_write,
4143 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4144 .unlocked_ioctl = ll_file_ioctl,
4145 .open = ll_file_open,
4146 .release = ll_file_release,
4147 .mmap = ll_file_mmap,
4148 .llseek = ll_file_seek,
4149 .splice_read = ll_file_splice_read,
/* Both lock entry points deliberately fail (see top-of-table comment). */
4152 .flock = ll_file_noflock,
4153 .lock = ll_file_noflock
/*
 * inode_operations for regular Lustre files. xattr and ACL entries are
 * compiled in only on kernels that still route them through
 * inode_operations (HAVE_IOP_XATTR / HAVE_IOP_GET_ACL / HAVE_IOP_SET_ACL).
 */
4156 struct inode_operations ll_file_inode_operations = {
4157 .setattr = ll_setattr,
4158 .getattr = ll_getattr,
4159 .permission = ll_inode_permission,
4160 #ifdef HAVE_IOP_XATTR
4161 .setxattr = ll_setxattr,
4162 .getxattr = ll_getxattr,
4163 .removexattr = ll_removexattr,
4165 .listxattr = ll_listxattr,
4166 .fiemap = ll_fiemap,
4167 #ifdef HAVE_IOP_GET_ACL
4168 .get_acl = ll_get_acl,
4170 #ifdef HAVE_IOP_SET_ACL
4171 .set_acl = ll_set_acl,
/*
 * ll_layout_conf(): push a layout configuration (conf) down to the cl_object
 * behind the inode via cl_conf_set().  For OBJECT_CONF_SET the layout comes
 * with a DLM layout lock: after the layout is applied the lock is allowed to
 * match, and the inode's cached layout generation is refreshed from the
 * object.  Returns 0 or a negative error code (elided RETURNs not visible).
 */
4175 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4177 struct ll_inode_info *lli = ll_i2info(inode);
4178 struct cl_object *obj = lli->lli_clob;
/* cl environment is required for all cl_object calls below. */
4187 env = cl_env_get(&refcheck);
4189 RETURN(PTR_ERR(env));
4191 rc = cl_conf_set(env, lli->lli_clob, conf);
4195 if (conf->coc_opc == OBJECT_CONF_SET) {
4196 struct ldlm_lock *lock = conf->coc_lock;
4197 struct cl_layout cl = {
4201 LASSERT(lock != NULL);
4202 LASSERT(ldlm_has_layout(lock));
4204 /* it can only be allowed to match after layout is
4205 * applied to inode otherwise false layout would be
4206 * seen. Applying layout should happen before dropping
4207 * the intent lock. */
4208 ldlm_lock_allow_match(lock);
/* Read back the generation the object now carries and cache it on
 * the inode so ll_layout_refresh() can report it to callers. */
4210 rc = cl_object_layout_get(env, obj, &cl);
4215 DFID": layout version change: %u -> %u\n",
4216 PFID(&lli->lli_fid), ll_layout_version_get(lli),
4218 ll_layout_version_set(lli, cl.cl_layout_gen);
4222 cl_env_put(env, &refcheck);
4227 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/*
 * ll_layout_fetch(): ensure the layout lock carries the file's LOV EA in
 * its LVB.  If l_lvb_data is already set, nothing to do; otherwise fetch
 * the "lov" xattr from the MDT, copy it into a freshly allocated buffer and
 * attach it to the lock (under the resource lock, so a racing thread that
 * attached an LVB first wins and our buffer is freed).
 */
4228 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4231 struct ll_sb_info *sbi = ll_i2sbi(inode);
4232 struct ptlrpc_request *req;
4233 struct mdt_body *body;
4240 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4241 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4242 lock->l_lvb_data, lock->l_lvb_len);
/* Layout already attached to the lock: nothing to fetch (early return
 * elided in this excerpt). */
4244 if (lock->l_lvb_data != NULL)
4247 /* if layout lock was granted right away, the layout is returned
4248 * within DLM_LVB of dlm reply; otherwise if the lock was ever
4249 * blocked and then granted via completion ast, we have to fetch
4250 * layout here. Please note that we can't use the LVB buffer in
4251 * completion AST because it doesn't have a large enough buffer */
4252 rc = ll_get_default_mdsize(sbi, &lmmsize);
4254 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4255 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
4260 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4262 GOTO(out, rc = -EPROTO);
4264 lmmsize = body->mbo_eadatasize;
4265 if (lmmsize == 0) /* empty layout */
4268 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4270 GOTO(out, rc = -EFAULT);
/* Copy the EA out of the RPC reply buffer; the lock keeps its own copy
 * because the reply is freed below. */
4272 OBD_ALLOC_LARGE(lvbdata, lmmsize);
4273 if (lvbdata == NULL)
4274 GOTO(out, rc = -ENOMEM);
4276 memcpy(lvbdata, lmm, lmmsize);
/* Attach the buffer under the resource lock; if another thread already
 * attached an LVB, keep theirs and free ours (OBD_FREE_LARGE below). */
4277 lock_res_and_lock(lock);
4278 if (unlikely(lock->l_lvb_data == NULL)) {
4279 lock->l_lvb_type = LVB_T_LAYOUT;
4280 lock->l_lvb_data = lvbdata;
4281 lock->l_lvb_len = lmmsize;
4284 unlock_res_and_lock(lock);
4287 OBD_FREE_LARGE(lvbdata, lmmsize);
/* Common exit: release the getxattr RPC reply. */
4292 ptlrpc_req_finished(req);
/*
 * ll_layout_lock_set(): given a granted layout lock (lockh/mode), fetch the
 * layout if needed (ll_layout_fetch), apply it to the inode's cl_object via
 * ll_layout_conf(OBJECT_CONF_SET), then release the lock reference.  If the
 * object is still busy with IO (-EBUSY), issue OBJECT_CONF_WAIT to block
 * until the old layout is drained.
 */
4297 * Apply the layout to the inode. Layout lock is held and will be released
4300 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4301 struct inode *inode)
4303 struct ll_inode_info *lli = ll_i2info(inode);
4304 struct ll_sb_info *sbi = ll_i2sbi(inode);
4305 struct ldlm_lock *lock;
4306 struct cl_object_conf conf;
4309 bool wait_layout = false;
4312 LASSERT(lustre_handle_is_used(lockh));
4314 lock = ldlm_handle2lock(lockh);
4315 LASSERT(lock != NULL);
4316 LASSERT(ldlm_has_layout(lock));
4318 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4319 PFID(&lli->lli_fid), inode);
4321 /* in case this is a caching lock and reinstate with new inode */
4322 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
/* Snapshot LVB readiness under the resource lock. */
4324 lock_res_and_lock(lock);
4325 lvb_ready = ldlm_is_lvb_ready(lock);
4326 unlock_res_and_lock(lock);
4328 /* checking lvb_ready is racy but this is okay. The worst case is
4329 * that multi processes may configure the file on the same time. */
4333 rc = ll_layout_fetch(inode, lock);
4337 /* for layout lock, lmm is stored in lock's lvb.
4338 * lvb_data is immutable if the lock is held so it's safe to access it
4341 * set layout to file. Unlikely this will fail as old layout was
4342 * surely eliminated */
/* Build an OBJECT_CONF_SET request pointing at the LVB-held layout. */
4343 memset(&conf, 0, sizeof conf);
4344 conf.coc_opc = OBJECT_CONF_SET;
4345 conf.coc_inode = inode;
4346 conf.coc_lock = lock;
4347 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
4348 conf.u.coc_layout.lb_len = lock->l_lvb_len;
4349 rc = ll_layout_conf(inode, &conf);
4351 /* refresh layout failed, need to wait */
4352 wait_layout = rc == -EBUSY;
/* Drop our lock reference and the DLM reference acquired by the caller. */
4355 LDLM_LOCK_PUT(lock);
4356 ldlm_lock_decref(lockh, mode);
4358 /* wait for IO to complete if it's still being used. */
4360 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
4361 ll_get_fsname(inode->i_sb, NULL, 0),
4362 PFID(&lli->lli_fid), inode);
/* OBJECT_CONF_WAIT blocks until outstanding IO under the old layout
 * finishes, then the caller can retry the layout change. */
4364 memset(&conf, 0, sizeof conf);
4365 conf.coc_opc = OBJECT_CONF_WAIT;
4366 conf.coc_inode = inode;
4367 rc = ll_layout_conf(inode, &conf);
4371 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4372 ll_get_fsname(inode->i_sb, NULL, 0),
4373 PFID(&lli->lli_fid), rc);
4379 * Issue layout intent RPC to MDS.
4380 * \param inode [in] file inode
4381 * \param intent [in] layout intent
4383 * \retval 0 on success
4384 * \retval < 0 error code
/*
 * ll_layout_intent(): send an IT_LAYOUT intent lock request to the MDT
 * carrying the given layout_intent as opaque op_data.  Write/truncate
 * intents request the lock with FMODE_WRITE.  On return the intent lock
 * data is attached to the inode and the intent lock reference dropped —
 * the caller re-matches the lock via ll_take_md_lock() afterwards.
 */
4386 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
4388 struct ll_inode_info *lli = ll_i2info(inode);
4389 struct ll_sb_info *sbi = ll_i2sbi(inode);
4390 struct md_op_data *op_data;
4391 struct lookup_intent it;
4392 struct ptlrpc_request *req;
4396 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4397 0, 0, LUSTRE_OPC_ANY, NULL);
4398 if (IS_ERR(op_data))
4399 RETURN(PTR_ERR(op_data));
/* The layout_intent rides in op_data and is packed into the RPC. */
4401 op_data->op_data = intent;
4402 op_data->op_data_size = sizeof(*intent);
4404 memset(&it, 0, sizeof(it));
4405 it.it_op = IT_LAYOUT;
4406 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
4407 intent->li_opc == LAYOUT_INTENT_TRUNC)
4408 it.it_flags = FMODE_WRITE;
4410 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4411 ll_get_fsname(inode->i_sb, NULL, 0),
4412 PFID(&lli->lli_fid), inode);
4414 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
4415 &ll_md_blocking_ast, 0);
/* The RPC reply is pinned by the intent; release it once processed. */
4416 if (it.it_request != NULL)
4417 ptlrpc_req_finished(it.it_request);
4418 it.it_request = NULL;
4420 ll_finish_md_op_data(op_data);
4422 /* set lock data in case this is a new lock */
4424 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4426 ll_intent_drop_lock(&it);
4432 * This function checks if there exists a LAYOUT lock on the client side,
4433 * or enqueues it if it doesn't have one in cache.
4435 * This function will not hold layout lock so it may be revoked any time after
4436 * this function returns. Any operations depend on layout should be redone
4439 * This function should be called before lov_io_init() to get an uptodate
4440 * layout version, the caller should save the version number and after IO
4441 * is finished, this function should be called again to verify that layout
4442 * is not changed during IO time.
/*
 * ll_layout_refresh(): return the current layout generation in *gen,
 * enqueueing an IT_LAYOUT intent (ll_layout_intent) if no layout lock is
 * cached.  Serialized per-inode by lli_layout_mutex.  Fast path: if layout
 * locks are disabled or a generation is already known, just report it.
 */
4444 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4446 struct ll_inode_info *lli = ll_i2info(inode);
4447 struct ll_sb_info *sbi = ll_i2sbi(inode);
4448 struct lustre_handle lockh;
4449 struct layout_intent intent = {
4450 .li_opc = LAYOUT_INTENT_ACCESS,
4452 enum ldlm_mode mode;
/* Fast path: no enqueue needed if the server doesn't support layout
 * locks or the inode already has a valid cached generation. */
4456 *gen = ll_layout_version_get(lli);
4457 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
/* Layout locks only make sense on sane-FID regular files. */
4461 LASSERT(fid_is_sane(ll_inode2fid(inode)));
4462 LASSERT(S_ISREG(inode->i_mode));
4464 /* take layout lock mutex to enqueue layout lock exclusively. */
4465 mutex_lock(&lli->lli_layout_mutex);
4468 /* mostly layout lock is caching on the local side, so try to
4469 * match it before grabbing layout lock mutex. */
4470 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4471 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4472 if (mode != 0) { /* hit cached lock */
4473 rc = ll_layout_lock_set(&lockh, mode, inode);
/* Cache miss: enqueue a layout intent, then (elided) retry the match. */
4479 rc = ll_layout_intent(inode, &intent);
/* Report whatever generation the (possibly re-)applied layout carries. */
4485 *gen = ll_layout_version_get(lli);
4486 mutex_unlock(&lli->lli_layout_mutex);
4492 * Issue layout intent RPC indicating where in a file an IO is about to write.
4494 * \param[in] inode file inode.
4495 * \param[in] start start offset of fille in bytes where an IO is about to
4497 * \param[in] end exclusive end offset in bytes of the write range.
4499 * \retval 0 on success
4500 * \retval < 0 error code
/*
 * Thin wrapper around ll_layout_intent() with LAYOUT_INTENT_WRITE, used to
 * instantiate layout components covering [start, end) before writing.
 * (Extent-field initializers for start/end are elided from this excerpt.)
 */
4502 int ll_layout_write_intent(struct inode *inode, __u64 start, __u64 end)
4504 struct layout_intent intent = {
4505 .li_opc = LAYOUT_INTENT_WRITE,
4512 rc = ll_layout_intent(inode, &intent);
4518 * This function send a restore request to the MDT
4520 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4522 struct hsm_user_request *hur;
4526 len = sizeof(struct hsm_user_request) +
4527 sizeof(struct hsm_user_item);
4528 OBD_ALLOC(hur, len);
4532 hur->hur_request.hr_action = HUA_RESTORE;
4533 hur->hur_request.hr_archive_id = 0;
4534 hur->hur_request.hr_flags = 0;
4535 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4536 sizeof(hur->hur_user_item[0].hui_fid));
4537 hur->hur_user_item[0].hui_extent.offset = offset;
4538 hur->hur_user_item[0].hui_extent.length = length;
4539 hur->hur_request.hr_itemcount = 1;
4540 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,