4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2016, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
48 #include <lustre/ll_fiemap.h>
50 #include <uapi/linux/lustre_ioctl.h>
51 #include <lustre_swab.h>
53 #include "cl_object.h"
54 #include "llite_internal.h"
55 #include "vvp_internal.h"
/* Forward declarations for helpers defined later in this file. */
58 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
60 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
63 static enum llioc_iter
64 ll_iocontrol_call(struct inode *inode, struct file *file,
65 unsigned int cmd, unsigned long arg, int *rcp);
/* Allocate a per-open ll_file_data from its dedicated slab.
 * GFP_NOFS is used so the allocation cannot recurse back into the
 * filesystem under memory pressure. The write-failure flag starts clear.
 * NOTE(review): the NULL-check/return path is not visible here — confirm
 * callers handle a NULL return. */
67 static struct ll_file_data *ll_file_data_get(void)
69 struct ll_file_data *fd;
71 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
75 fd->fd_write_failed = false;
/* Return a ll_file_data to its slab; counterpart of ll_file_data_get(). */
80 static void ll_file_data_put(struct ll_file_data *fd)
83 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
87 * Packs all the attributes into @op_data for the CLOSE rpc.
89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
90 struct obd_client_handle *och)
/* Initialize op_data against this inode with no name/parent (generic op). */
94 ll_prep_md_op_data(op_data, inode, NULL, NULL,
95 0, 0, LUSTRE_OPC_ANY, NULL);
/* Snapshot the client's current view of mode, timestamps and size so the
 * MDT can apply them at close time. */
97 op_data->op_attr.ia_mode = inode->i_mode;
98 op_data->op_attr.ia_atime = inode->i_atime;
99 op_data->op_attr.ia_mtime = inode->i_mtime;
100 op_data->op_attr.ia_ctime = inode->i_ctime;
101 op_data->op_attr.ia_size = i_size_read(inode);
/* Mark every packed attribute valid; *_SET forces the exact times sent. */
102 op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
103 ATTR_MTIME | ATTR_MTIME_SET |
104 ATTR_CTIME | ATTR_CTIME_SET;
105 op_data->op_attr_blocks = inode->i_blocks;
106 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
107 op_data->op_handle = och->och_fh;
/* Only a write-mode open can have dirtied data; test-and-clear so the
 * flag is reported exactly once. */
109 if (och->och_flags & FMODE_WRITE &&
110 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
111 /* For HSM: if inode data has been modified, pack it so that
112 * MDT can set data dirty flag in the archive. */
113 op_data->op_bias |= MDS_DATA_MODIFIED;
119 * Perform a close, possibly with a bias.
120 * The meaning of "data" depends on the value of "bias".
122 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
123 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
126 static int ll_close_inode_openhandle(struct inode *inode,
127 struct obd_client_handle *och,
128 enum mds_op_bias bias, void *data)
130 struct obd_export *md_exp = ll_i2mdexp(inode);
131 const struct ll_inode_info *lli = ll_i2info(inode);
132 struct md_op_data *op_data;
133 struct ptlrpc_request *req = NULL;
/* Bail out if the MDC export is no longer backed by an obd device. */
137 if (class_exp2obd(md_exp) == NULL) {
138 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
139 ll_get_fsname(inode->i_sb, NULL, 0),
140 PFID(&lli->lli_fid));
144 OBD_ALLOC_PTR(op_data);
145 /* We leak openhandle and request here on error, but not much to be
146 * done in OOM case since app won't retry close on error either. */
148 GOTO(out, rc = -ENOMEM);
150 ll_prepare_close(inode, op_data, och);
/* Bias-specific packing: each intent carries the lease handle plus the
 * extra datum described in the function comment above. */
152 case MDS_CLOSE_LAYOUT_SWAP:
153 LASSERT(data != NULL);
154 op_data->op_bias |= MDS_CLOSE_LAYOUT_SWAP;
155 op_data->op_data_version = 0;
156 op_data->op_lease_handle = och->och_lease_handle;
/* For a layout swap, data is the second inode involved. */
157 op_data->op_fid2 = *ll_inode2fid(data);
160 case MDS_HSM_RELEASE:
161 LASSERT(data != NULL);
162 op_data->op_bias |= MDS_HSM_RELEASE;
/* For HSM release, data is the expected data version. */
163 op_data->op_data_version = *(__u64 *)data;
164 op_data->op_lease_handle = och->och_lease_handle;
165 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
169 LASSERT(data == NULL);
173 rc = md_close(md_exp, op_data, och->och_mod, &req);
/* -EINTR is an expected interruption, not worth a console error. */
174 if (rc != 0 && rc != -EINTR)
175 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
176 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* For biased closes, check whether the MDT actually executed the
 * close intent (HSM release / layout swap). */
179 op_data->op_bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)) {
180 struct mdt_body *body;
182 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
183 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
187 ll_finish_md_op_data(op_data);
/* Drop replay data and poison the handle so reuse is detectable. */
191 md_clear_open_replay_data(md_exp, och);
192 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
195 ptlrpc_req_finished(req); /* This is close request */
/* Close the cached MDS open handle matching @fmode (write/exec/read)
 * unless other local users of the same handle remain. */
199 int ll_md_real_close(struct inode *inode, fmode_t fmode)
201 struct ll_inode_info *lli = ll_i2info(inode);
202 struct obd_client_handle **och_p;
203 struct obd_client_handle *och;
/* Select the per-mode handle slot and its use count; write takes
 * precedence over exec, then read is the fallback. */
208 if (fmode & FMODE_WRITE) {
209 och_p = &lli->lli_mds_write_och;
210 och_usecount = &lli->lli_open_fd_write_count;
211 } else if (fmode & FMODE_EXEC) {
212 och_p = &lli->lli_mds_exec_och;
213 och_usecount = &lli->lli_open_fd_exec_count;
215 LASSERT(fmode & FMODE_READ);
216 och_p = &lli->lli_mds_read_och;
217 och_usecount = &lli->lli_open_fd_read_count;
220 mutex_lock(&lli->lli_och_mutex);
221 if (*och_usecount > 0) {
222 /* There are still users of this handle, so skip
224 mutex_unlock(&lli->lli_och_mutex);
230 mutex_unlock(&lli->lli_och_mutex);
233 /* There might be a race and this handle may already
235 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Per-file-descriptor close: release group lock, lease and private open
 * handle, drop this fd's reference on the per-mode MDS open handle, and
 * talk to the MDS only when no cached OPEN lock covers the file. */
241 static int ll_md_close(struct inode *inode, struct file *file)
243 union ldlm_policy_data policy = {
244 .l_inodebits = { MDS_INODELOCK_OPEN },
/* TEST_LOCK: only probe for a granted OPEN lock, don't take a ref. */
246 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
247 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
248 struct ll_inode_info *lli = ll_i2info(inode);
249 struct lustre_handle lockh;
250 enum ldlm_mode lockmode;
254 /* clear group lock, if present */
255 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
256 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
258 if (fd->fd_lease_och != NULL) {
261 /* Usually the lease is not released when the
262 * application crashed, we need to release here. */
263 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
264 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
265 PFID(&lli->lli_fid), rc, lease_broken);
267 fd->fd_lease_och = NULL;
/* fd_och is a private open handle taken over for a lease; close it. */
270 if (fd->fd_och != NULL) {
271 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
276 /* Let's see if we have good enough OPEN lock on the file and if
277 we can skip talking to MDS */
278 mutex_lock(&lli->lli_och_mutex);
279 if (fd->fd_omode & FMODE_WRITE) {
281 LASSERT(lli->lli_open_fd_write_count);
282 lli->lli_open_fd_write_count--;
283 } else if (fd->fd_omode & FMODE_EXEC) {
285 LASSERT(lli->lli_open_fd_exec_count);
286 lli->lli_open_fd_exec_count--;
289 LASSERT(lli->lli_open_fd_read_count);
290 lli->lli_open_fd_read_count--;
292 mutex_unlock(&lli->lli_och_mutex);
/* No matching cached OPEN lock: do the real close on the MDS. */
294 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
295 LDLM_IBITS, &policy, lockmode, &lockh))
296 rc = ll_md_real_close(inode, fd->fd_omode);
299 LUSTRE_FPRIVATE(file) = NULL;
300 ll_file_data_put(fd);
305 /* While this returns an error code, fput() the caller does not, so we need
306 * to make every effort to clean up all of our state here. Also, applications
307 * rarely check close errors and even if an error is returned they will not
308 * re-try the close call.
310 int ll_file_release(struct inode *inode, struct file *file)
312 struct ll_file_data *fd;
313 struct ll_sb_info *sbi = ll_i2sbi(inode);
314 struct ll_inode_info *lli = ll_i2info(inode);
318 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
319 PFID(ll_inode2fid(inode)), inode);
/* Don't account releases of the filesystem root in the stats. */
321 if (inode->i_sb->s_root != file_dentry(file))
322 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
323 fd = LUSTRE_FPRIVATE(file);
326 /* The last ref on @file, maybe not the owner pid of statahead,
327 * because parent and child process can share the same file handle. */
328 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
329 ll_deauthorize_statahead(inode, fd);
/* Root dentry: no MDS close needed, just free the fd data. */
331 if (inode->i_sb->s_root == file_dentry(file)) {
332 LUSTRE_FPRIVATE(file) = NULL;
333 ll_file_data_put(fd);
/* For regular files, fold any async write errors into lli_async_rc
 * so they can be reported to the application. */
337 if (!S_ISDIR(inode->i_mode)) {
338 if (lli->lli_clob != NULL)
339 lov_read_and_clear_async_rc(lli->lli_clob);
340 lli->lli_async_rc = 0;
343 rc = ll_md_close(inode, file);
/* Fault-injection hook used by tests to dump the debug log. */
345 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
346 libcfs_debug_dumplog();
/* Issue an open-by-FID intent to the MDS for @de, optionally carrying
 * striping data (@lmm/@lmmsize), and attach the resulting lock/inode
 * state. Called from ll_file_open() when no open handle was cached. */
351 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
352 struct lookup_intent *itp)
354 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
355 struct dentry *parent = de->d_parent;
356 const char *name = NULL;
358 struct md_op_data *op_data;
359 struct ptlrpc_request *req = NULL;
363 LASSERT(parent != NULL);
364 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
366 /* if server supports open-by-fid, or file name is invalid, don't pack
367 * name in open request */
368 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
369 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
370 name = de->d_name.name;
371 len = de->d_name.len;
374 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
375 name, len, 0, LUSTRE_OPC_ANY, NULL);
377 RETURN(PTR_ERR(op_data));
378 op_data->op_data = lmm;
379 op_data->op_data_size = lmmsize;
381 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
382 &ll_md_blocking_ast, 0);
383 ll_finish_md_op_data(op_data);
385 /* reason for keep own exit path - don't flood log
386 * with messages with -ESTALE errors.
/* If the open wasn't granted, drop any handle the server returned. */
388 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
389 it_open_error(DISP_OPEN_OPEN, itp))
391 ll_release_openhandle(de, itp);
395 if (it_disposition(itp, DISP_LOOKUP_NEG))
396 GOTO(out, rc = -ENOENT);
398 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
399 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
400 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Update/attach the local inode from the reply, then record the
 * granted lock against it. */
404 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
405 if (!rc && itp->it_lock_mode)
406 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
409 ptlrpc_req_finished(req);
410 ll_intent_drop_lock(itp);
412 /* We did open by fid, but by the time we got to the server,
413 * the object disappeared. If this is a create, we cannot really
414 * tell the userspace that the file it was trying to create
415 * does not exist. Instead let's return -ESTALE, and the VFS will
416 * retry the create with LOOKUP_REVAL that we are going to catch
417 * in ll_revalidate_dentry() and use lookup then.
419 if (rc == -ENOENT && itp->it_op & IT_CREAT)
/* Populate an obd_client_handle from the MDT reply carried by @it and
 * register it for open replay. Returns md_set_open_replay_data()'s rc. */
425 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
426 struct obd_client_handle *och)
428 struct mdt_body *body;
430 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
431 och->och_fh = body->mbo_handle;
432 och->och_fid = body->mbo_fid1;
433 och->och_lease_handle.cookie = it->it_lock_handle;
434 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
435 och->och_flags = it->it_flags;
437 return md_set_open_replay_data(md_exp, och, it);
/* Finish the client-local part of an open: optionally fill @och from the
 * intent reply, then attach @fd to the struct file and initialize its
 * readahead state, open mode, and cl_io context bookkeeping. */
440 static int ll_local_open(struct file *file, struct lookup_intent *it,
441 struct ll_file_data *fd, struct obd_client_handle *och)
443 struct inode *inode = file_inode(file);
446 LASSERT(!LUSTRE_FPRIVATE(file));
453 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
458 LUSTRE_FPRIVATE(file) = fd;
459 ll_readahead_init(inode, &fd->fd_ras);
/* Record only the access-mode bits this fd was opened with. */
460 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
462 /* ll_cl_context initialize */
463 rwlock_init(&fd->fd_lock);
464 INIT_LIST_HEAD(&fd->fd_lccs);
469 /* Open a file, and (for the very first open) create objects on the OSTs at
470 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
471 * creation or open until ll_lov_setstripe() ioctl is called.
473 * If we already have the stripe MD locally then we don't request it in
474 * md_open(), by passing a lmm_size = 0.
476 * It is up to the application to ensure no other processes open this file
477 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
478 * used. We might be able to avoid races of that sort by getting lli_open_sem
479 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
480 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
482 int ll_file_open(struct inode *inode, struct file *file)
484 struct ll_inode_info *lli = ll_i2info(inode);
485 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
486 .it_flags = file->f_flags };
487 struct obd_client_handle **och_p = NULL;
488 __u64 *och_usecount = NULL;
489 struct ll_file_data *fd;
493 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
494 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* An intent may have been stashed in private_data by the lookup path. */
496 it = file->private_data; /* XXX: compat macro */
497 file->private_data = NULL; /* prevent ll_local_open assertion */
499 fd = ll_file_data_get();
501 GOTO(out_openerr, rc = -ENOMEM);
504 if (S_ISDIR(inode->i_mode))
505 ll_authorize_statahead(inode, fd);
/* Opening the fs root: nothing to negotiate with the MDS. */
507 if (inode->i_sb->s_root == file_dentry(file)) {
508 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent: build our own open intent (oit) from f_flags. */
512 if (!it || !it->it_disposition) {
513 /* Convert f_flags into access mode. We cannot use file->f_mode,
514 * because everything but O_ACCMODE mask was stripped from
/* O_ACCMODE trick: (flags + 1) maps O_RDONLY/O_WRONLY/O_RDWR onto
 * FMODE_READ/FMODE_WRITE bits. */
516 if ((oit.it_flags + 1) & O_ACCMODE)
518 if (file->f_flags & O_TRUNC)
519 oit.it_flags |= FMODE_WRITE;
521 /* kernel only call f_op->open in dentry_open. filp_open calls
522 * dentry_open after call to open_namei that checks permissions.
523 * Only nfsd_open call dentry_open directly without checking
524 * permissions and because of that this code below is safe. */
525 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
526 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
528 /* We do not want O_EXCL here, presumably we opened the file
529 * already? XXX - NFS implications? */
530 oit.it_flags &= ~O_EXCL;
532 /* bug20584, if "it_flags" contains O_CREAT, the file will be
533 * created if necessary, then "IT_CREAT" should be set to keep
534 * consistent with it */
535 if (oit.it_flags & O_CREAT)
536 oit.it_op |= IT_CREAT;
542 /* Let's see if we have file open on MDS already. */
543 if (it->it_flags & FMODE_WRITE) {
544 och_p = &lli->lli_mds_write_och;
545 och_usecount = &lli->lli_open_fd_write_count;
546 } else if (it->it_flags & FMODE_EXEC) {
547 och_p = &lli->lli_mds_exec_och;
548 och_usecount = &lli->lli_open_fd_exec_count;
550 och_p = &lli->lli_mds_read_och;
551 och_usecount = &lli->lli_open_fd_read_count;
554 mutex_lock(&lli->lli_och_mutex);
555 if (*och_p) { /* Open handle is present */
556 if (it_disposition(it, DISP_OPEN_OPEN)) {
557 /* Well, there's extra open request that we do not need,
558 let's close it somehow. This will decref request. */
559 rc = it_open_error(DISP_OPEN_OPEN, it);
561 mutex_unlock(&lli->lli_och_mutex);
562 GOTO(out_openerr, rc);
565 ll_release_openhandle(file_dentry(file), it);
/* Reuse the cached handle for this fd (och == NULL). */
569 rc = ll_local_open(file, it, fd, NULL);
572 mutex_unlock(&lli->lli_och_mutex);
573 GOTO(out_openerr, rc);
576 LASSERT(*och_usecount == 0);
577 if (!it->it_disposition) {
578 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
579 /* We cannot just request lock handle now, new ELC code
580 means that one of other OPEN locks for this file
581 could be cancelled, and since blocking ast handler
582 would attempt to grab och_mutex as well, that would
583 result in a deadlock */
584 mutex_unlock(&lli->lli_och_mutex);
586 * Normally called under two situations:
588 * 2. A race/condition on MDS resulting in no open
589 * handle to be returned from LOOKUP|OPEN request,
590 * for example if the target entry was a symlink.
592 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
593 * marked by a bit set in ll_iget_for_nfs. Clear the
594 * bit so that it's not confusing later callers.
596 * NB; when ldd is NULL, it must have come via normal
597 * lookup path only, since ll_iget_for_nfs always calls
600 if (ldd && ldd->lld_nfs_dentry) {
601 ldd->lld_nfs_dentry = 0;
602 it->it_flags |= MDS_OPEN_LOCK;
606 * Always specify MDS_OPEN_BY_FID because we don't want
607 * to get file with different fid.
609 it->it_flags |= MDS_OPEN_BY_FID;
610 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
613 GOTO(out_openerr, rc);
617 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
619 GOTO(out_och_free, rc = -ENOMEM);
623 /* md_intent_lock() didn't get a request ref if there was an
624 * open error, so don't do cleanup on the request here
626 /* XXX (green): Should not we bail out on any error here, not
627 * just open error? */
628 rc = it_open_error(DISP_OPEN_OPEN, it);
630 GOTO(out_och_free, rc);
632 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
633 "inode %p: disposition %x, status %d\n", inode,
634 it_disposition(it, ~0), it->it_status);
636 rc = ll_local_open(file, it, fd, *och_p);
638 GOTO(out_och_free, rc);
640 mutex_unlock(&lli->lli_och_mutex);
643 /* Must do this outside lli_och_mutex lock to prevent deadlock where
644 different kind of OPEN lock for this same inode gets cancelled
645 by ldlm_cancel_lru */
646 if (!S_ISREG(inode->i_mode))
647 GOTO(out_och_free, rc);
649 cl_lov_delay_create_clear(&file->f_flags);
650 GOTO(out_och_free, rc);
/* Error unwind: free a half-installed open handle, drop statahead
 * authorization, and release the fd data. */
654 if (och_p && *och_p) {
655 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
656 *och_p = NULL; /* OBD_FREE writes some magic there */
659 mutex_unlock(&lli->lli_och_mutex);
662 if (lli->lli_opendir_key == fd)
663 ll_deauthorize_statahead(inode, fd);
665 ll_file_data_put(fd);
667 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* Drop the request reference the intent may still be holding. */
670 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
671 ptlrpc_req_finished(it->it_request);
672 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* Blocking AST for lease locks: on conflict, asynchronously cancel the
 * lease lock; the CANCELING phase needs no extra work here. */
678 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
679 struct ldlm_lock_desc *desc, void *data, int flag)
682 struct lustre_handle lockh;
686 case LDLM_CB_BLOCKING:
687 ldlm_lock2handle(lock, &lockh);
688 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
690 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
694 case LDLM_CB_CANCELING:
702 * When setting a lease on a file, we take ownership of the lli_mds_*_och
703 * and save it as fd->fd_och so as to force client to reopen the file even
704 * if it has an open lock in cache already.
706 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
707 struct lustre_handle *old_handle)
709 struct ll_inode_info *lli = ll_i2info(inode);
710 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
711 struct obd_client_handle **och_p;
716 /* Get the openhandle of the file */
717 mutex_lock(&lli->lli_och_mutex);
/* A lease is already held on this fd. */
718 if (fd->fd_lease_och != NULL)
719 GOTO(out_unlock, rc = -EBUSY);
721 if (fd->fd_och == NULL) {
722 if (file->f_mode & FMODE_WRITE) {
723 LASSERT(lli->lli_mds_write_och != NULL);
724 och_p = &lli->lli_mds_write_och;
725 och_usecount = &lli->lli_open_fd_write_count;
727 LASSERT(lli->lli_mds_read_och != NULL);
728 och_p = &lli->lli_mds_read_och;
729 och_usecount = &lli->lli_open_fd_read_count;
/* Can only take over a handle with exactly one user. */
732 if (*och_usecount > 1)
733 GOTO(out_unlock, rc = -EBUSY);
/* Report the taken-over handle so the MDT can match owners. */
740 *old_handle = fd->fd_och->och_fh;
744 mutex_unlock(&lli->lli_och_mutex);
749 * Release ownership on lli_mds_*_och when putting back a file lease.
751 static int ll_lease_och_release(struct inode *inode, struct file *file)
753 struct ll_inode_info *lli = ll_i2info(inode);
754 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
755 struct obd_client_handle **och_p;
756 struct obd_client_handle *old_och = NULL;
761 mutex_lock(&lli->lli_och_mutex);
/* Select the per-mode slot this fd's handle belongs to. */
762 if (file->f_mode & FMODE_WRITE) {
763 och_p = &lli->lli_mds_write_och;
764 och_usecount = &lli->lli_open_fd_write_count;
766 och_p = &lli->lli_mds_read_och;
767 och_usecount = &lli->lli_open_fd_read_count;
770 /* The file may have been open by another process (broken lease) so
771 * *och_p is not NULL. In this case we should simply increase usecount
774 if (*och_p != NULL) {
775 old_och = fd->fd_och;
782 mutex_unlock(&lli->lli_och_mutex);
/* Close the superseded handle outside the mutex. */
785 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
791 * Acquire a lease and open the file.
793 static struct obd_client_handle *
794 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
797 struct lookup_intent it = { .it_op = IT_OPEN };
798 struct ll_sb_info *sbi = ll_i2sbi(inode);
799 struct md_op_data *op_data;
800 struct ptlrpc_request *req = NULL;
801 struct lustre_handle old_handle = { 0 };
802 struct obd_client_handle *och = NULL;
/* Only exactly-read or exactly-write leases are supported. */
807 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
808 RETURN(ERR_PTR(-EINVAL));
/* The fd must already be open in the requested mode, and exec
 * opens cannot carry a lease. */
811 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
812 RETURN(ERR_PTR(-EPERM));
814 rc = ll_lease_och_acquire(inode, file, &old_handle);
821 RETURN(ERR_PTR(-ENOMEM));
823 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
824 LUSTRE_OPC_ANY, NULL);
826 GOTO(out, rc = PTR_ERR(op_data));
828 /* To tell the MDT this openhandle is from the same owner */
829 op_data->op_handle = old_handle;
831 it.it_flags = fmode | open_flags;
832 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
833 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
834 &ll_md_blocking_lease_ast,
835 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
836 * it can be cancelled which may mislead applications that the lease is
838 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
839 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
840 * doesn't deal with openhandle, so normal openhandle will be leaked. */
841 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
842 ll_finish_md_op_data(op_data);
843 ptlrpc_req_finished(req);
845 GOTO(out_release_it, rc);
847 if (it_disposition(&it, DISP_LOOKUP_NEG))
848 GOTO(out_release_it, rc = -ENOENT);
850 rc = it_open_error(DISP_OPEN_OPEN, &it);
852 GOTO(out_release_it, rc);
854 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
855 ll_och_fill(sbi->ll_md_exp, &it, och);
857 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
858 GOTO(out_close, rc = -EOPNOTSUPP);
860 /* already get lease, handle lease lock */
861 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
862 if (it.it_lock_mode == 0 ||
863 it.it_lock_bits != MDS_INODELOCK_OPEN) {
864 /* open lock must return for lease */
865 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
866 PFID(ll_inode2fid(inode)), it.it_lock_mode,
868 GOTO(out_close, rc = -EPROTO);
871 ll_intent_release(&it);
875 /* Cancel open lock */
876 if (it.it_lock_mode != 0) {
877 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
880 och->och_lease_handle.cookie = 0ULL;
882 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
884 CERROR("%s: error closing file "DFID": %d\n",
885 ll_get_fsname(inode->i_sb, NULL, 0),
886 PFID(&ll_i2info(inode)->lli_fid), rc2);
887 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
889 ll_intent_release(&it);
897 * Check whether a layout swap can be done between two inodes.
899 * \param[in] inode1 First inode to check
900 * \param[in] inode2 Second inode to check
902 * \retval 0 on success, layout swap can be performed between both inodes
903 * \retval negative error code if requirements are not met
905 static int ll_check_swap_layouts_validity(struct inode *inode1,
906 struct inode *inode2)
/* Both must be regular files. */
908 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
/* Caller must be allowed to write both inodes. */
911 if (inode_permission(inode1, MAY_WRITE) ||
912 inode_permission(inode2, MAY_WRITE))
/* Both must live on the same filesystem. */
915 if (inode1->i_sb != inode2->i_sb)
/* Close @och with the MDS_CLOSE_LAYOUT_SWAP bias so the MDT atomically
 * swaps layouts between @inode and @inode2 as part of the close. */
921 static int ll_swap_layouts_close(struct obd_client_handle *och,
922 struct inode *inode, struct inode *inode2)
924 const struct lu_fid *fid1 = ll_inode2fid(inode);
925 const struct lu_fid *fid2;
929 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
930 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
932 rc = ll_check_swap_layouts_validity(inode, inode2);
934 GOTO(out_free_och, rc);
936 /* We now know that inode2 is a lustre inode */
937 fid2 = ll_inode2fid(inode2);
/* Refuse to swap an inode's layout with itself. */
939 rc = lu_fid_cmp(fid1, fid2);
941 GOTO(out_free_och, rc = -EINVAL);
943 /* Close the file and swap layouts between inode & inode2.
944 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
945 * because we still need it to pack l_remote_handle to MDT. */
946 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
949 och = NULL; /* freed in ll_close_inode_openhandle() */
959 * Release lease and close the file.
960 * It will check if the lease has ever broken.
962 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
965 struct ldlm_lock *lock;
966 bool cancelled = true;
/* Look up the lease lock to learn whether it was already cancelled
 * (i.e. the lease was broken). */
970 lock = ldlm_handle2lock(&och->och_lease_handle);
972 lock_res_and_lock(lock);
973 cancelled = ldlm_is_cancel(lock);
974 unlock_res_and_lock(lock);
978 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
979 PFID(&ll_i2info(inode)->lli_fid), cancelled);
/* Lease still intact: cancel it ourselves before closing. */
982 ldlm_cli_cancel(&och->och_lease_handle, 0);
984 if (lease_broken != NULL)
985 *lease_broken = cancelled;
987 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Merge MDS-provided inode attributes with size/blocks/timestamps
 * obtained from the OSTs via the cl_object layer, under the inode
 * size lock. Timestamps only ever move forward (max of both sources). */
991 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
993 struct ll_inode_info *lli = ll_i2info(inode);
994 struct cl_object *obj = lli->lli_clob;
995 struct cl_attr *attr = vvp_env_thread_attr(env);
1003 ll_inode_size_lock(inode);
1005 /* Merge timestamps the most recently obtained from MDS with
1006 * timestamps obtained from OSTs.
1008 * Do not overwrite atime of inode because it may be refreshed
1009 * by file_accessed() function. If the read was served by cache
1010 * data, there is no RPC to be sent so that atime may not be
1011 * transferred to OSTs at all. MDT only updates atime at close time
1012 * if it's at least 'mdd.*.atime_diff' older.
1013 * All in all, the atime in Lustre does not strictly comply with
1014 * POSIX. Solving this problem needs to send an RPC to MDT for each
1015 * read, this will hurt performance. */
1016 if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1017 LTIME_S(inode->i_atime) = lli->lli_atime;
1018 lli->lli_update_atime = 0;
1020 LTIME_S(inode->i_mtime) = lli->lli_mtime;
1021 LTIME_S(inode->i_ctime) = lli->lli_ctime;
/* Working copies of the MDS-side timestamps. */
1023 atime = LTIME_S(inode->i_atime);
1024 mtime = LTIME_S(inode->i_mtime);
1025 ctime = LTIME_S(inode->i_ctime);
/* Fetch the aggregated OST attributes for this object. */
1027 cl_object_attr_lock(obj);
1028 rc = cl_object_attr_get(env, obj, attr);
1029 cl_object_attr_unlock(obj);
1032 GOTO(out_size_unlock, rc);
/* Take the newer of MDS and OST timestamps. */
1034 if (atime < attr->cat_atime)
1035 atime = attr->cat_atime;
1037 if (ctime < attr->cat_ctime)
1038 ctime = attr->cat_ctime;
1040 if (mtime < attr->cat_mtime)
1041 mtime = attr->cat_mtime;
1043 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1044 PFID(&lli->lli_fid), attr->cat_size);
/* Size and block count always come from the OSTs. */
1046 i_size_write(inode, attr->cat_size);
1047 inode->i_blocks = attr->cat_blocks;
1049 LTIME_S(inode->i_atime) = atime;
1050 LTIME_S(inode->i_mtime) = mtime;
1051 LTIME_S(inode->i_ctime) = ctime;
1054 ll_inode_size_unlock(inode);
/* Decide whether atime updates should be suppressed for this file,
 * mirroring the checks done by the kernel's file_accessed()/touch_atime()
 * (O_NOATIME, S_NOATIME, mount flags, nodiratime on directories). */
1059 static bool file_is_noatime(const struct file *file)
1061 const struct vfsmount *mnt = file->f_path.mnt;
1062 const struct inode *inode = file_inode((struct file *)file);
1064 /* Adapted from file_accessed() and touch_atime().*/
1065 if (file->f_flags & O_NOATIME)
1068 if (inode->i_flags & S_NOATIME)
1071 if (IS_NOATIME(inode))
1074 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1077 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1080 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/* Initialize a cl_io from the file's open flags: non-blocking, append,
 * sync-write behavior, lock-request policy and noatime handling. */
1086 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
1088 struct inode *inode = file_inode((struct file *)file);
1090 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1092 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
/* Writes are synchronous for O_SYNC, O_DIRECT, or (per the truncated
 * condition here — confirm against full source) other sync cases. */
1093 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1094 file->f_flags & O_DIRECT ||
1097 io->ci_obj = ll_i2info(inode)->lli_clob;
1098 io->ci_lockreq = CILR_MAYBE;
/* nolock mount/open: never take DLM locks, and tell servers too. */
1099 if (ll_file_nolock(file)) {
1100 io->ci_lockreq = CILR_NEVER;
1101 io->ci_no_srvlock = 1;
1102 } else if (file->f_flags & O_APPEND) {
/* Appends must serialize against concurrent writers. */
1103 io->ci_lockreq = CILR_MANDATORY;
1106 io->ci_noatime = file_is_noatime(file);
/* Common read/write engine: set up a cl_io, take the range lock when
 * needed (writes and O_DIRECT reads), run the IO loop, restart on
 * layout change, and update per-mount stats and fd_write_failed. */
1110 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1111 struct file *file, enum cl_io_type iot,
1112 loff_t *ppos, size_t count)
1114 struct vvp_io *vio = vvp_env_io(env);
1115 struct inode *inode = file_inode(file);
1116 struct ll_inode_info *lli = ll_i2info(inode);
1117 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1121 struct range_lock range;
1125 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: %llu, count: %zu\n",
1126 file_dentry(file)->d_name.name, iot, *ppos, count);
1129 io = vvp_env_thread_io(env);
1130 ll_io_init(io, file, iot == CIT_WRITE);
1132 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1133 bool range_locked = false;
/* O_APPEND cannot know the final offset up front, so lock to EOF. */
1135 if (file->f_flags & O_APPEND)
1136 range_lock_init(&range, 0, LUSTRE_EOF);
1138 range_lock_init(&range, *ppos, *ppos + count - 1);
1140 vio->vui_fd = LUSTRE_FPRIVATE(file);
1141 vio->vui_io_subtype = args->via_io_subtype;
1143 switch (vio->vui_io_subtype) {
1145 vio->vui_iter = args->u.normal.via_iter;
1146 vio->vui_iocb = args->u.normal.via_iocb;
1147 /* Direct IO reads must also take range lock,
1148 * or multiple reads will try to work on the same pages
1149 * See LU-6227 for details. */
1150 if (((iot == CIT_WRITE) ||
1151 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1152 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1153 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1155 rc = range_lock(&lli->lli_write_tree, &range);
1159 range_locked = true;
/* Splice IO: hand the pipe and flags to the vvp layer. */
1163 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1164 vio->u.splice.vui_flags = args->u.splice.via_flags;
1167 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
/* Register the cl context for this thread, run the IO, unregister. */
1171 ll_cl_add(file, env, io, LCC_RW);
1172 rc = cl_io_loop(env, io);
1173 ll_cl_remove(file, env);
1176 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1178 range_unlock(&lli->lli_write_tree, &range);
1181 /* cl_io_rw_init() handled IO */
/* Accumulate progress; advance position and shrink remaining count. */
1185 if (io->ci_nob > 0) {
1186 result += io->ci_nob;
1187 count -= io->ci_nob;
1188 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1190 /* prepare IO restart */
1191 if (count > 0 && args->via_io_subtype == IO_NORMAL)
1192 args->u.normal.via_iter = vio->vui_iter;
1195 cl_io_fini(env, io);
/* Layout changed mid-IO: restart the remaining portion. */
1197 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1199 "%s: restart %s from %lld, count:%zu, result: %zd\n",
1200 file_dentry(file)->d_name.name,
1201 iot == CIT_READ ? "read" : "write",
1202 *ppos, count, result);
/* Stats and write-failure tracking for fsync error reporting. */
1206 if (iot == CIT_READ) {
1208 ll_stats_ops_tally(ll_i2sbi(inode),
1209 LPROC_LL_READ_BYTES, result);
1210 } else if (iot == CIT_WRITE) {
1212 ll_stats_ops_tally(ll_i2sbi(inode),
1213 LPROC_LL_WRITE_BYTES, result);
1214 fd->fd_write_failed = false;
1215 } else if (result == 0 && rc == 0) {
1218 fd->fd_write_failed = true;
1220 fd->fd_write_failed = false;
1221 } else if (rc != -ERESTARTSYS) {
1222 fd->fd_write_failed = true;
1226 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1228 RETURN(result > 0 ? result : rc);
1232 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1233 * especially for small I/O.
1235 * To serve a read request, CLIO has to create and initialize a cl_io and
1236 * then request DLM lock. This has turned out to have siginificant overhead
1237 * and affects the performance of small I/O dramatically.
1239 * It's not necessary to create a cl_io for each I/O. Under the help of read
1240 * ahead, most of the pages being read are already in memory cache and we can
1241 * read those pages directly because if the pages exist, the corresponding DLM
1242 * lock must exist so that page content must be valid.
1244 * In fast read implementation, the llite speculatively finds and reads pages
1245 * in memory cache. There are three scenarios for fast read:
1246 * - If the page exists and is uptodate, kernel VM will provide the data and
1247 * CLIO won't be intervened;
1248 * - If the page was brought into memory by read ahead, it will be exported
1249 * and read ahead parameters will be updated;
1250 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1251 * it will go back and invoke normal read, i.e., a cl_io will be created
1252 * and DLM lock will be requested.
1254 * POSIX compliance: posix standard states that read is intended to be atomic.
1255 * Lustre read implementation is in line with Linux kernel read implementation
1256 * and neither of them complies with POSIX standard in this matter. Fast read
1257 * doesn't make the situation worse on single node but it may interleave write
1258 * results from multiple nodes due to short read handling in ll_file_aio_read().
1260 * \param env - lu_env
1261 * \param iocb - kiocb from kernel
1262 * \param iter - user space buffers where the data will be copied
1264 * \retval - number of bytes have been read, or error code if error occurred.
/*
 * Fast-read path: serve the read directly from the page cache via
 * generic_file_read_iter(), skipping cl_io/DLM setup (see the design
 * comment above).  Returns bytes read or a negative error; the -ENODATA
 * case (first page not cached) is remapped so the caller falls back to
 * the normal read path.
 */
1267 ll_do_fast_read(const struct lu_env *env, struct kiocb *iocb,
1268 struct iov_iter *iter)
/* fast read disabled for this mount - caller does a normal read */
1272 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1275 /* NB: we can't do direct IO for fast read because it will need a lock
1276 * to make IO engine happy. */
1277 if (iocb->ki_filp->f_flags & O_DIRECT)
/* register a NULL cl_io so ll_readpage() can recognize the fast path */
1280 ll_cl_add(iocb->ki_filp, env, NULL, LCC_RW)
1281 result = generic_file_read_iter(iocb, iter);
1282 ll_cl_remove(iocb->ki_filp, env);
1284 /* If the first page is not in cache, generic_file_aio_read() will
1285 * return -ENODATA.
1286 * See corresponding code in ll_readpage(). */
1287 if (result == -ENODATA)
/* account bytes read through the fast path in the per-sb stats */
1291 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1292 LPROC_LL_READ_BYTES, result);
1298 * Read from a file (through the page cache).
/*
 * VFS ->read_iter handler: try the lockless fast-read path first and only
 * fall through to the regular cl_io read (ll_file_io_generic) when bytes
 * remain in the iterator.
 */
1300 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1303 struct vvp_io_args *args;
1308 env = cl_env_get(&refcheck);
1310 return PTR_ERR(env);
/* stop early on error or if fast read satisfied the whole request */
1312 result = ll_do_fast_read(env, iocb, to);
1313 if (result < 0 || iov_iter_count(to) == 0)
1316 args = ll_env_args(env, IO_NORMAL);
1317 args->u.normal.via_iter = to;
1318 args->u.normal.via_iocb = iocb;
/* read the remainder through the regular cl_io machinery */
1320 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1321 &iocb->ki_pos, iov_iter_count(to));
1324 else if (result == 0)
1328 cl_env_put(env, &refcheck);
1333 * Write to a file (through the page cache).
/*
 * VFS ->write_iter handler: no fast path for writes, always go through
 * the cl_io machinery via ll_file_io_generic().
 */
1335 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1337 struct vvp_io_args *args;
1342 env = cl_env_get(&refcheck);
1344 return PTR_ERR(env);
1346 args = ll_env_args(env, IO_NORMAL);
1347 args->u.normal.via_iter = from;
1348 args->u.normal.via_iocb = iocb;
1350 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1351 &iocb->ki_pos, iov_iter_count(from));
1352 cl_env_put(env, &refcheck);
1356 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1358 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array and compute the total byte count in *count.
 * Used only on kernels without file_operations read_iter/write_iter.
 * Rejects negative segment lengths and cumulative-length wraparound.
 */
1360 static int ll_file_get_iov_count(const struct iovec *iov,
1361 unsigned long *nr_segs, size_t *count)
1366 for (seg = 0; seg < *nr_segs; seg++) {
1367 const struct iovec *iv = &iov[seg];
1370 * If any segment has a negative length, or the cumulative
1371 * length ever wraps negative then return -EINVAL.
1374 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1376 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
/* inaccessible segment: drop it from the count (mirrors kernel code) */
1381 cnt -= iv->iov_len; /* This segment is no good */
/*
 * .aio_read emulation for old kernels: wrap the iovec array in an
 * iov_iter and delegate to ll_file_read_iter().
 */
1388 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1389 unsigned long nr_segs, loff_t pos)
1396 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
/* iov_iter_init() signature differs across kernel versions */
1400 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1401 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1402 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1403 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1404 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1406 result = ll_file_read_iter(iocb, &to);
/*
 * ->read handler for kernels without read_iter: wrap the user buffer in
 * a single-segment iovec plus a synchronous kiocb, then reuse the aio
 * read path.  Advances *ppos on return.
 */
1411 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1415 struct iovec iov = { .iov_base = buf, .iov_len = count };
1416 struct kiocb *kiocb;
1421 env = cl_env_get(&refcheck);
1423 RETURN(PTR_ERR(env));
/* use the per-env kiocb instead of a stack one */
1425 kiocb = &ll_env_info(env)->lti_kiocb;
1426 init_sync_kiocb(kiocb, file);
1427 kiocb->ki_pos = *ppos;
/* the "remaining bytes" field is named differently per kernel version */
1428 #ifdef HAVE_KIOCB_KI_LEFT
1429 kiocb->ki_left = count;
1430 #elif defined(HAVE_KI_NBYTES)
1431 kiocb->ki_nbytes = count;
1434 result = ll_file_aio_read(kiocb, &iov, 1, kiocb->ki_pos);
/* propagate the advanced file position back to the caller */
1435 *ppos = kiocb->ki_pos;
1437 cl_env_put(env, &refcheck);
1442 * Write to a file (through the page cache).
/*
 * .aio_write emulation for old kernels: wrap the iovec array in an
 * iov_iter and delegate to ll_file_write_iter().
 */
1445 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1446 unsigned long nr_segs, loff_t pos)
1448 struct iov_iter from;
1453 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
/* iov_iter_init() signature differs across kernel versions */
1457 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1458 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1459 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1460 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1461 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1463 result = ll_file_write_iter(iocb, &from);
/*
 * ->write handler for kernels without write_iter: mirror of ll_file_read()
 * for the write direction - single-segment iovec, synchronous kiocb,
 * delegate to the aio write path, then update *ppos.
 */
1468 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1469 size_t count, loff_t *ppos)
1472 struct iovec iov = { .iov_base = (void __user *)buf,
1474 struct kiocb *kiocb;
1479 env = cl_env_get(&refcheck);
1481 RETURN(PTR_ERR(env));
1483 kiocb = &ll_env_info(env)->lti_kiocb;
1484 init_sync_kiocb(kiocb, file);
1485 kiocb->ki_pos = *ppos;
/* the "remaining bytes" field is named differently per kernel version */
1486 #ifdef HAVE_KIOCB_KI_LEFT
1487 kiocb->ki_left = count;
1488 #elif defined(HAVE_KI_NBYTES)
1489 kiocb->ki_nbytes = count;
1492 result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos);
/* propagate the advanced file position back to the caller */
1493 *ppos = kiocb->ki_pos;
1495 cl_env_put(env, &refcheck);
1498 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1501 * Send file content (through pagecache) somewhere with helper
/*
 * ->splice_read handler: run a CIT_READ io with the IO_SPLICE subtype so
 * ll_file_io_generic() routes pages into the destination pipe.
 */
1503 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1504 struct pipe_inode_info *pipe, size_t count,
1508 struct vvp_io_args *args;
1513 env = cl_env_get(&refcheck);
1515 RETURN(PTR_ERR(env));
1517 args = ll_env_args(env, IO_SPLICE);
1518 args->u.splice.via_pipe = pipe;
1519 args->u.splice.via_flags = flags;
1521 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1522 cl_env_put(env, &refcheck);
/*
 * Apply a striping layout (LOV EA) to @inode: re-open it by FID with the
 * layout carried in the open intent, then immediately release the
 * transient open handle.
 * NOTE(review): the inode size lock is held across the open - presumably
 * to serialize the layout change with size updates; confirm against
 * ll_intent_file_open() locking rules.
 */
1526 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1527 __u64 flags, struct lov_user_md *lum, int lum_size)
1529 struct lookup_intent oit = {
1531 .it_flags = flags | MDS_OPEN_BY_FID,
1536 ll_inode_size_lock(inode);
1537 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1539 GOTO(out_unlock, rc);
/* the open existed only to carry the layout; close the handle now */
1541 ll_release_openhandle(dentry, &oit);
1544 ll_inode_size_unlock(inode);
1545 ll_intent_release(&oit);
/*
 * Fetch the striping layout (LOV EA) of @filename (child of @inode) from
 * the MDS via md_getattr_name().  On success *lmmp points into the reply
 * buffer of *request, so the caller owns and must eventually release
 * *request; *lmm_size is set to the EA size.  Validates the LOV magic
 * (V1/V3/COMP_V1) and byte-swaps the layout to host endianness on
 * big-endian clients.
 */
1550 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1551 struct lov_mds_md **lmmp, int *lmm_size,
1552 struct ptlrpc_request **request)
1554 struct ll_sb_info *sbi = ll_i2sbi(inode);
1555 struct mdt_body *body;
1556 struct lov_mds_md *lmm = NULL;
1557 struct ptlrpc_request *req = NULL;
1558 struct md_op_data *op_data;
1561 rc = ll_get_default_mdsize(sbi, &lmmsize);
1565 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1566 strlen(filename), lmmsize,
1567 LUSTRE_OPC_ANY, NULL);
1568 if (IS_ERR(op_data))
1569 RETURN(PTR_ERR(op_data));
/* only the EA is wanted from the getattr-by-name RPC */
1571 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1572 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1573 ll_finish_md_op_data(op_data);
1575 CDEBUG(D_INFO, "md_getattr_name failed "
1576 "on %s: rc %d\n", filename, rc);
1580 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1581 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1583 lmmsize = body->mbo_eadatasize;
/* no layout EA present - report -ENODATA rather than an empty buffer */
1585 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1587 GOTO(out, rc = -ENODATA);
1590 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1591 LASSERT(lmm != NULL);
/* only plain V1/V3 and composite layouts are understood here */
1593 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
1594 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
1595 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
1596 GOTO(out, rc = -EPROTO);
1599 * This is coming from the MDS, so is probably in
1600 * little endian. We convert it to host endian before
1601 * passing it to userspace.
/* no-op on little-endian hosts: swab only when host != wire order */
1603 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1606 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
1607 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1608 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1609 if (le32_to_cpu(lmm->lmm_pattern) &
1610 LOV_PATTERN_F_RELEASED)
1614 /* if this function is called for a directory, we should
1615 * avoid swabbing non-existent lsm objects */
1616 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1617 lustre_swab_lov_user_md_v1(
1618 (struct lov_user_md_v1 *)lmm);
1619 if (S_ISREG(body->mbo_mode))
1620 lustre_swab_lov_user_md_objects(
1621 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1623 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1624 lustre_swab_lov_user_md_v3(
1625 (struct lov_user_md_v3 *)lmm);
1626 if (S_ISREG(body->mbo_mode))
1627 lustre_swab_lov_user_md_objects(
1628 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1630 } else if (lmm->lmm_magic ==
1631 cpu_to_le32(LOV_MAGIC_COMP_V1)) {
1632 lustre_swab_lov_comp_md_v1(
1633 (struct lov_comp_md_v1 *)lmm);
1639 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: copy a lov_user_md (with one ost_data entry)
 * from user space and apply it via ll_lov_setstripe_ea_info().
 * Requires CFS_CAP_SYS_ADMIN.
 */
1644 static int ll_lov_setea(struct inode *inode, struct file *file,
1647 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1648 struct lov_user_md *lump;
1649 int lum_size = sizeof(struct lov_user_md) +
1650 sizeof(struct lov_user_ost_data);
1654 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1657 OBD_ALLOC_LARGE(lump, lum_size);
1661 if (copy_from_user(lump, (struct lov_user_md __user *)arg, lum_size))
1662 GOTO(out_lump, rc = -EFAULT);
1664 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
/* clear the delay-create flag regardless of the outcome */
1666 cl_lov_delay_create_clear(&file->f_flags);
1669 OBD_FREE_LARGE(lump, lum_size);
/*
 * Copy the inode's striping layout to the user buffer @lum via
 * cl_object_getstripe().
 */
1673 static int ll_file_getstripe(struct inode *inode,
1674 struct lov_user_md __user *lum)
1681 env = cl_env_get(&refcheck);
1683 RETURN(PTR_ERR(env));
1685 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum);
1686 cl_env_put(env, &refcheck);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy the user's lov_user_md into a kernel
 * buffer (ll_copy_user_md) and apply it via ll_lov_setstripe_ea_info().
 */
1690 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1693 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1694 struct lov_user_md *klum;
1696 __u64 flags = FMODE_WRITE;
1699 rc = ll_copy_user_md(lum, &klum);
1704 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
/* clear the delay-create flag regardless of the outcome */
1706 cl_lov_delay_create_clear(&file->f_flags);
1707 OBD_FREE(klum, lum_size);
/*
 * LL_IOC_GROUP_LOCK handler: take a group lock with gid @arg on the file
 * and record it in the per-open ll_file_data.  Fails if gid is 0, if the
 * file is open with locking disabled, or if this fd already holds a
 * group lock.  For composite (PFL) layouts all OST objects are
 * instantiated first, since new components must not appear while the
 * group lock is held (see comment below).
 */
1712 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1714 struct ll_inode_info *lli = ll_i2info(inode);
1715 struct cl_object *obj = lli->lli_clob;
1716 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1717 struct ll_grouplock grouplock;
1722 CWARN("group id for group lock must not be 0\n");
1726 if (ll_file_nolock(file))
1727 RETURN(-EOPNOTSUPP);
/* lli_lock guards the fd_flags/fd_grouplock pair */
1729 spin_lock(&lli->lli_lock);
1730 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1731 CWARN("group lock already existed with gid %lu\n",
1732 fd->fd_grouplock.lg_gid);
1733 spin_unlock(&lli->lli_lock);
1736 LASSERT(fd->fd_grouplock.lg_lock == NULL);
1737 spin_unlock(&lli->lli_lock);
1740 * XXX: group lock needs to protect all OST objects while PFL
1741 * can add new OST objects during the IO, so we'd instantiate
1742 * all OST objects before getting its group lock.
1747 struct cl_layout cl = {
1748 .cl_is_composite = false,
1751 env = cl_env_get(&refcheck);
1753 RETURN(PTR_ERR(env));
1755 rc = cl_object_layout_get(env, obj, &cl);
/* composite layout: force full instantiation up to EOF first */
1756 if (!rc && cl.cl_is_composite)
1757 rc = ll_layout_write_intent(inode, 0, OBD_OBJECT_EOF);
1759 cl_env_put(env, &refcheck);
1764 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
1765 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* re-check under the lock: another thread may have won the race
 * while we were enqueuing the DLM lock */
1769 spin_lock(&lli->lli_lock);
1770 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1771 spin_unlock(&lli->lli_lock);
1772 CERROR("another thread just won the race\n");
1773 cl_put_grouplock(&grouplock);
1777 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1778 fd->fd_grouplock = grouplock;
1779 spin_unlock(&lli->lli_lock);
1781 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK handler: release the group lock with gid @arg held
 * by this fd.  Fails if no group lock is held or the gid does not match.
 * The lock is dropped outside lli_lock (cl_put_grouplock may block).
 */
1785 static int ll_put_grouplock(struct inode *inode, struct file *file,
1788 struct ll_inode_info *lli = ll_i2info(inode);
1789 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1790 struct ll_grouplock grouplock;
1793 spin_lock(&lli->lli_lock);
1794 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1795 spin_unlock(&lli->lli_lock);
1796 CWARN("no group lock held\n");
1800 LASSERT(fd->fd_grouplock.lg_lock != NULL);
1802 if (fd->fd_grouplock.lg_gid != arg) {
1803 CWARN("group lock %lu doesn't match current id %lu\n",
1804 arg, fd->fd_grouplock.lg_gid);
1805 spin_unlock(&lli->lli_lock);
/* detach the grouplock from the fd under lli_lock, release after */
1809 grouplock = fd->fd_grouplock;
1810 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1811 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1812 spin_unlock(&lli->lli_lock);
1814 cl_put_grouplock(&grouplock);
1815 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1820 * Close inode open handle
1822 * \param dentry [in] dentry which contains the inode
1823 * \param it [in,out] intent which contains open info and result
1826 * \retval <0 failure
1828 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1830 struct inode *inode = dentry->d_inode;
1831 struct obd_client_handle *och;
1837 /* Root ? Do nothing. */
1838 if (dentry->d_inode->i_sb->s_root == dentry)
1841 /* No open handle to close? Move away */
1842 if (!it_disposition(it, DISP_OPEN_OPEN))
1845 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1847 OBD_ALLOC(och, sizeof(*och));
1849 GOTO(out, rc = -ENOMEM);
/* fill the handle from the intent's open reply, then close it on the MDT */
1851 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1853 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
1855 /* this one is in place of ll_file_open */
1856 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1857 ptlrpc_req_finished(it->it_request);
1858 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1864 * Get size for inode for which FIEMAP mapping is requested.
1865 * Make the FIEMAP get_info call and returns the result.
1866 * \param fiemap kernel buffer to hold extens
1867 * \param num_bytes kernel buffer size
1869 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
1875 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
1878 /* Checks for fiemap flags */
/* report which flags we don't support by writing them back, per the
 * FIEMAP ioctl convention */
1879 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1880 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1884 /* Check for FIEMAP_FLAG_SYNC */
1885 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1886 rc = filemap_fdatawrite(inode->i_mapping);
1891 env = cl_env_get(&refcheck);
1893 RETURN(PTR_ERR(env));
/* a zero cached size may just be stale; glimpse to get the real size */
1895 if (i_size_read(inode) == 0) {
1896 rc = ll_glimpse_size(inode);
1901 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1902 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
1903 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
1905 /* If filesize is 0, then there would be no objects for mapping */
1906 if (fmkey.lfik_oa.o_size == 0) {
1907 fiemap->fm_mapped_extents = 0;
1911 fmkey.lfik_fiemap = *fiemap;
1913 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
1914 &fmkey, fiemap, &num_bytes);
1916 cl_env_put(env, &refcheck);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a path via the MDC.
 * Copies a getinfo_fid2path request from user space, appends the root
 * FID (for fileset-aware servers), issues the iocontrol, and copies the
 * result back.  Requires CFS_CAP_DAC_READ_SEARCH unless the mount allows
 * user fid2path.
 */
1920 int ll_fid2path(struct inode *inode, void __user *arg)
1922 struct obd_export *exp = ll_i2mdexp(inode);
1923 const struct getinfo_fid2path __user *gfin = arg;
1925 struct getinfo_fid2path *gfout;
1931 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1932 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1935 /* Only need to get the buflen */
1936 if (get_user(pathlen, &gfin->gf_pathlen))
/* bound the allocation driven by the user-supplied length */
1939 if (pathlen > PATH_MAX)
1942 outsize = sizeof(*gfout) + pathlen;
1943 OBD_ALLOC(gfout, outsize);
1947 if (copy_from_user(gfout, arg, sizeof(*gfout)))
1948 GOTO(gf_free, rc = -EFAULT);
1949 /* append root FID after gfout to let MDT know the root FID so that it
1950 * can lookup the correct path, this is mainly for fileset.
1951 * old server without fileset mount support will ignore this. */
1952 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
1954 /* Call mdc_iocontrol */
1955 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1959 if (copy_to_user(arg, gfout, outsize))
1963 OBD_FREE(gfout, outsize);
1968 * Read the data_version for inode.
1970 * This value is computed using stripe object version on OST.
1971 * Version is computed using server side locking.
1973 * @param flags if do sync on the OST side;
1975 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1976 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
1978 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
1980 struct cl_object *obj = ll_i2info(inode)->lli_clob;
1988 /* If no file object initialized, we consider its version is 0. */
1994 env = cl_env_get(&refcheck);
1996 RETURN(PTR_ERR(env));
/* run a CIT_DATA_VERSION io to collect the version from the OSTs */
1998 io = vvp_env_thread_io(env);
2000 io->u.ci_data_version.dv_data_version = 0;
2001 io->u.ci_data_version.dv_flags = flags;
2004 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2005 result = cl_io_loop(env, io);
2007 result = io->ci_result;
2009 *data_version = io->u.ci_data_version.dv_data_version;
2011 cl_io_fini(env, io);
/* layout changed mid-io (e.g. restore/migrate): retry the whole io */
2013 if (unlikely(io->ci_need_restart))
2016 cl_env_put(env, &refcheck);
2022 * Trigger a HSM release request for the provided inode.
2024 int ll_hsm_release(struct inode *inode)
2027 struct obd_client_handle *och = NULL;
2028 __u64 data_version = 0;
2033 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2034 ll_get_fsname(inode->i_sb, NULL, 0),
2035 PFID(&ll_i2info(inode)->lli_fid));
/* take a write lease so no other client can modify the file while
 * it is being released */
2037 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2039 GOTO(out, rc = PTR_ERR(och));
2041 /* Grab latest data_version and [am]time values */
2042 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2046 env = cl_env_get(&refcheck);
2048 GOTO(out, rc = PTR_ERR(env));
/* fold OST attributes into the inode before sending the close */
2050 ll_merge_attr(env, inode);
2051 cl_env_put(env, &refcheck);
2053 /* Release the file.
2054 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2055 * we still need it to pack l_remote_handle to MDT. */
2056 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2062 if (och != NULL && !IS_ERR(och)) /* close the file */
2063 ll_lease_close(och, inode, NULL);
/* Working state for ll_swap_layouts(): the two inodes whose layouts are
 * exchanged (plus swap parameters; remaining fields not visible in this
 * view). */
2068 struct ll_swap_stack {
2071 struct inode *inode1;
2072 struct inode *inode2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS core: atomically exchange the layouts of the
 * two files.  The pair is ordered by FID to avoid lock-ordering issues,
 * optional group locks flush dirty cache, optional data-version checks
 * (-EAGAIN if versions moved), then the swap is issued to the MDT as a
 * single iocontrol carrying mdc_swap_layouts in op_data.
 */
2077 static int ll_swap_layouts(struct file *file1, struct file *file2,
2078 struct lustre_swap_layouts *lsl)
2080 struct mdc_swap_layouts msl;
2081 struct md_op_data *op_data;
2084 struct ll_swap_stack *llss = NULL;
2087 OBD_ALLOC_PTR(llss);
2091 llss->inode1 = file_inode(file1);
2092 llss->inode2 = file_inode(file2);
2094 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2098 /* we use 2 bool because it is easier to swap than 2 bits */
2099 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2100 llss->check_dv1 = true;
2102 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2103 llss->check_dv2 = true;
2105 /* we cannot use lsl->sl_dvX directly because we may swap them */
2106 llss->dv1 = lsl->sl_dv1;
2107 llss->dv2 = lsl->sl_dv2;
/* order the pair by FID so concurrent swaps take locks consistently */
2109 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2110 if (rc == 0) /* same file, done! */
2113 if (rc < 0) { /* sequentialize it */
2114 swap(llss->inode1, llss->inode2);
2116 swap(llss->dv1, llss->dv2);
2117 swap(llss->check_dv1, llss->check_dv2);
2121 if (gid != 0) { /* application asks to flush dirty cache */
2122 rc = ll_get_grouplock(llss->inode1, file1, gid);
2126 rc = ll_get_grouplock(llss->inode2, file2, gid);
2128 ll_put_grouplock(llss->inode1, file1, gid);
2133 /* ultimate check, before swaping the layouts we check if
2134 * dataversion has changed (if requested) */
2135 if (llss->check_dv1) {
2136 rc = ll_data_version(llss->inode1, &dv, 0);
2139 if (dv != llss->dv1)
2140 GOTO(putgl, rc = -EAGAIN);
2143 if (llss->check_dv2) {
2144 rc = ll_data_version(llss->inode2, &dv, 0);
2147 if (dv != llss->dv2)
2148 GOTO(putgl, rc = -EAGAIN);
2151 /* struct md_op_data is used to send the swap args to the mdt
2152 * only flags is missing, so we use struct mdc_swap_layouts
2153 * through the md_op_data->op_data */
2154 /* flags from user space have to be converted before they are send to
2155 * server, no flag is sent today, they are only used on the client */
2158 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2159 0, LUSTRE_OPC_ANY, &msl);
2160 if (IS_ERR(op_data))
2161 GOTO(free, rc = PTR_ERR(op_data));
2163 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2164 sizeof(*op_data), op_data, NULL);
2165 ll_finish_md_op_data(op_data);
/* drop group locks in reverse acquisition order */
2172 ll_put_grouplock(llss->inode2, file2, gid);
2173 ll_put_grouplock(llss->inode1, file1, gid);
/*
 * Set/clear HSM state flags on @inode via an MDC iocontrol.  Validates
 * the masks against HSM_FLAGS_MASK, restricts non-HSM_USER_MASK bits to
 * CFS_CAP_SYS_ADMIN, and range-checks the archive id.
 */
2183 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2185 struct md_op_data *op_data;
2189 /* Detect out-of range masks */
2190 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2193 /* Non-root users are forbidden to set or clear flags which are
2194 * NOT defined in HSM_USER_MASK. */
2195 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2196 !cfs_capable(CFS_CAP_SYS_ADMIN))
2199 /* Detect out-of range archive id */
2200 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2201 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2204 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2205 LUSTRE_OPC_ANY, hss);
2206 if (IS_ERR(op_data))
2207 RETURN(PTR_ERR(op_data));
2209 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2210 sizeof(*op_data), op_data, NULL);
2212 ll_finish_md_op_data(op_data);
/*
 * LL_IOC_HSM_IMPORT handler: register an already-archived file.  First
 * marks the file HS_ARCHIVED|HS_EXISTS|HS_RELEASED via ll_hsm_state_set(),
 * then restores the user-supplied attributes (mode, uid/gid, size,
 * [am]time) with ll_setattr_raw().  Regular files only.
 */
2217 static int ll_hsm_import(struct inode *inode, struct file *file,
2218 struct hsm_user_import *hui)
2220 struct hsm_state_set *hss = NULL;
2221 struct iattr *attr = NULL;
2225 if (!S_ISREG(inode->i_mode))
2231 GOTO(out, rc = -ENOMEM);
/* set HSM flags before restoring attributes so the file is seen as
 * released from the start */
2233 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2234 hss->hss_archive_id = hui->hui_archive_id;
2235 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2236 rc = ll_hsm_state_set(inode, hss);
2240 OBD_ALLOC_PTR(attr);
2242 GOTO(out, rc = -ENOMEM);
/* force regular-file mode bits from the user-provided mode */
2244 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2245 attr->ia_mode |= S_IFREG;
2246 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2247 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2248 attr->ia_size = hui->hui_size;
2249 attr->ia_mtime.tv_sec = hui->hui_mtime;
2250 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2251 attr->ia_atime.tv_sec = hui->hui_atime;
2252 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2254 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2255 ATTR_UID | ATTR_GID |
2256 ATTR_MTIME | ATTR_MTIME_SET |
2257 ATTR_ATIME | ATTR_ATIME_SET;
2261 rc = ll_setattr_raw(file_dentry(file), attr, true);
2265 inode_unlock(inode);
/* Translate an open fmode into the LL_LEASE_{RD,WR}LCK bitmask returned
 * to user space by the lease ioctls. */
2277 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2279 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2280 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * LL_IOC_FUTIMES_3 handler: set atime, mtime AND ctime of a regular file
 * from an ll_futimes_3 request (unlike utimes(2), ctime is settable too).
 * Requires CAP_SYS_ADMIN.
 */
2283 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2285 struct inode *inode = file_inode(file);
2287 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2288 ATTR_MTIME | ATTR_MTIME_SET |
2289 ATTR_CTIME | ATTR_CTIME_SET,
2291 .tv_sec = lfu->lfu_atime_sec,
2292 .tv_nsec = lfu->lfu_atime_nsec,
2295 .tv_sec = lfu->lfu_mtime_sec,
2296 .tv_nsec = lfu->lfu_mtime_nsec,
2299 .tv_sec = lfu->lfu_ctime_sec,
2300 .tv_nsec = lfu->lfu_ctime_nsec,
2306 if (!capable(CAP_SYS_ADMIN))
2309 if (!S_ISREG(inode->i_mode))
2313 rc = ll_setattr_raw(file_dentry(file), &ia, false);
2314 inode_unlock(inode);
2320 * Give file access advices
2322 * The ladvise interface is similar to Linux fadvise() system call, except it
2323 * forwards the advices directly from Lustre client to server. The server side
2324 * codes will apply appropriate read-ahead and caching techniques for the
2325 * corresponding files.
2327 * A typical workload for ladvise is e.g. a bunch of different clients are
2328 * doing small random reads of a file, so prefetching pages into OSS cache
2329 * with big linear reads before the random IO is a net benefit. Fetching
2330 * all that data into each client cache with fadvise() may not be, due to
2331 * much more data being sent to the client.
2333 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2334 struct llapi_lu_ladvise *ladvise)
2338 struct cl_ladvise_io *lio;
2343 env = cl_env_get(&refcheck);
2345 RETURN(PTR_ERR(env));
/* run a CIT_LADVISE io carrying the advice parameters to the server */
2347 io = vvp_env_thread_io(env);
2348 io->ci_obj = ll_i2info(inode)->lli_clob;
2350 /* initialize parameters for ladvise */
2351 lio = &io->u.ci_ladvise;
2352 lio->li_start = ladvise->lla_start;
2353 lio->li_end = ladvise->lla_end;
2354 lio->li_fid = ll_inode2fid(inode);
2355 lio->li_advice = ladvise->lla_advice;
2356 lio->li_flags = flags;
2358 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2359 rc = cl_io_loop(env, io);
2363 cl_io_fini(env, io);
2364 cl_env_put(env, &refcheck);
/*
 * FS_IOC_FSGETXATTR handler: round-trip the user's fsxattr with the
 * project id filled in from the inode.
 */
2368 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
2371 struct fsxattr fsxattr;
2373 if (copy_from_user(&fsxattr,
2374 (const struct fsxattr __user *)arg,
2378 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
2379 if (copy_to_user((struct fsxattr __user *)arg,
2380 &fsxattr, sizeof(fsxattr)))
/*
 * FS_IOC_FSSETXATTR handler: set the project id from the user's fsxattr
 * via an MDS setattr.  Root (CFS_CAP_SYS_ADMIN) only.
 */
2386 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
2390 struct md_op_data *op_data;
2391 struct ptlrpc_request *req = NULL;
2393 struct fsxattr fsxattr;
2395 /* only root could change project ID */
2396 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2399 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2400 LUSTRE_OPC_ANY, NULL);
2401 if (IS_ERR(op_data))
2402 RETURN(PTR_ERR(op_data));
2404 if (copy_from_user(&fsxattr,
2405 (const struct fsxattr __user *)arg,
2407 GOTO(out_fsxattr1, rc = -EFAULT);
2409 op_data->op_projid = fsxattr.fsx_projid;
2410 op_data->op_attr.ia_valid |= MDS_ATTR_PROJID;
2411 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
2413 ptlrpc_req_finished(req);
2416 ll_finish_md_op_data(op_data);
2423 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2425 struct inode *inode = file_inode(file);
2426 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2430 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2431 PFID(ll_inode2fid(inode)), inode, cmd);
2432 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2434 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2435 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2439 case LL_IOC_GETFLAGS:
2440 /* Get the current value of the file flags */
2441 return put_user(fd->fd_flags, (int __user *)arg);
2442 case LL_IOC_SETFLAGS:
2443 case LL_IOC_CLRFLAGS:
2444 /* Set or clear specific file flags */
2445 /* XXX This probably needs checks to ensure the flags are
2446 * not abused, and to handle any flag side effects.
2448 if (get_user(flags, (int __user *) arg))
2451 if (cmd == LL_IOC_SETFLAGS) {
2452 if ((flags & LL_FILE_IGNORE_LOCK) &&
2453 !(file->f_flags & O_DIRECT)) {
2454 CERROR("%s: unable to disable locking on "
2455 "non-O_DIRECT file\n", current->comm);
2459 fd->fd_flags |= flags;
2461 fd->fd_flags &= ~flags;
2464 case LL_IOC_LOV_SETSTRIPE:
2465 RETURN(ll_lov_setstripe(inode, file, arg));
2466 case LL_IOC_LOV_SETEA:
2467 RETURN(ll_lov_setea(inode, file, arg));
2468 case LL_IOC_LOV_SWAP_LAYOUTS: {
2470 struct lustre_swap_layouts lsl;
2472 if (copy_from_user(&lsl, (char __user *)arg,
2473 sizeof(struct lustre_swap_layouts)))
2476 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
2479 file2 = fget(lsl.sl_fd);
2483 /* O_WRONLY or O_RDWR */
2484 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
2485 GOTO(out, rc = -EPERM);
2487 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
2488 struct inode *inode2;
2489 struct ll_inode_info *lli;
2490 struct obd_client_handle *och = NULL;
2492 if (lsl.sl_flags != SWAP_LAYOUTS_CLOSE)
2493 GOTO(out, rc = -EINVAL);
2495 lli = ll_i2info(inode);
2496 mutex_lock(&lli->lli_och_mutex);
2497 if (fd->fd_lease_och != NULL) {
2498 och = fd->fd_lease_och;
2499 fd->fd_lease_och = NULL;
2501 mutex_unlock(&lli->lli_och_mutex);
2503 GOTO(out, rc = -ENOLCK);
2504 inode2 = file_inode(file2);
2505 rc = ll_swap_layouts_close(och, inode, inode2);
2507 rc = ll_swap_layouts(file, file2, &lsl);
2513 case LL_IOC_LOV_GETSTRIPE:
2514 RETURN(ll_file_getstripe(inode,
2515 (struct lov_user_md __user *)arg));
2516 case FSFILT_IOC_GETFLAGS:
2517 case FSFILT_IOC_SETFLAGS:
2518 RETURN(ll_iocontrol(inode, file, cmd, arg));
2519 case FSFILT_IOC_GETVERSION_OLD:
2520 case FSFILT_IOC_GETVERSION:
2521 RETURN(put_user(inode->i_generation, (int __user *)arg));
2522 case LL_IOC_GROUP_LOCK:
2523 RETURN(ll_get_grouplock(inode, file, arg));
2524 case LL_IOC_GROUP_UNLOCK:
2525 RETURN(ll_put_grouplock(inode, file, arg));
2526 case IOC_OBD_STATFS:
2527 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2529 /* We need to special case any other ioctls we want to handle,
2530 * to send them to the MDS/OST as appropriate and to properly
2531 * network encode the arg field.
2532 case FSFILT_IOC_SETVERSION_OLD:
2533 case FSFILT_IOC_SETVERSION:
2535 case LL_IOC_FLUSHCTX:
2536 RETURN(ll_flush_ctx(inode));
2537 case LL_IOC_PATH2FID: {
2538 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2539 sizeof(struct lu_fid)))
2544 case LL_IOC_GETPARENT:
2545 RETURN(ll_getparent(file, (struct getparent __user *)arg));
2547 case OBD_IOC_FID2PATH:
2548 RETURN(ll_fid2path(inode, (void __user *)arg));
2549 case LL_IOC_DATA_VERSION: {
2550 struct ioc_data_version idv;
2553 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
2556 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2557 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2560 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2566 case LL_IOC_GET_MDTIDX: {
2569 mdtidx = ll_get_mdt_idx(inode);
2573 if (put_user((int)mdtidx, (int __user *)arg))
2578 case OBD_IOC_GETDTNAME:
2579 case OBD_IOC_GETMDNAME:
2580 RETURN(ll_get_obd_name(inode, cmd, arg));
2581 case LL_IOC_HSM_STATE_GET: {
2582 struct md_op_data *op_data;
2583 struct hsm_user_state *hus;
2590 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2591 LUSTRE_OPC_ANY, hus);
2592 if (IS_ERR(op_data)) {
2594 RETURN(PTR_ERR(op_data));
2597 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2600 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2603 ll_finish_md_op_data(op_data);
2607 case LL_IOC_HSM_STATE_SET: {
2608 struct hsm_state_set *hss;
2615 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2620 rc = ll_hsm_state_set(inode, hss);
2625 case LL_IOC_HSM_ACTION: {
2626 struct md_op_data *op_data;
2627 struct hsm_current_action *hca;
2634 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2635 LUSTRE_OPC_ANY, hca);
2636 if (IS_ERR(op_data)) {
2638 RETURN(PTR_ERR(op_data));
2641 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2644 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2647 ll_finish_md_op_data(op_data);
2651 case LL_IOC_SET_LEASE: {
2652 struct ll_inode_info *lli = ll_i2info(inode);
2653 struct obd_client_handle *och = NULL;
2658 case LL_LEASE_WRLCK:
2659 if (!(file->f_mode & FMODE_WRITE))
2661 fmode = FMODE_WRITE;
2663 case LL_LEASE_RDLCK:
2664 if (!(file->f_mode & FMODE_READ))
2668 case LL_LEASE_UNLCK:
2669 mutex_lock(&lli->lli_och_mutex);
2670 if (fd->fd_lease_och != NULL) {
2671 och = fd->fd_lease_och;
2672 fd->fd_lease_och = NULL;
2674 mutex_unlock(&lli->lli_och_mutex);
2679 fmode = och->och_flags;
2680 rc = ll_lease_close(och, inode, &lease_broken);
2684 rc = ll_lease_och_release(inode, file);
2691 RETURN(ll_lease_type_from_fmode(fmode));
2696 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2698 /* apply for lease */
2699 och = ll_lease_open(inode, file, fmode, 0);
2701 RETURN(PTR_ERR(och));
2704 mutex_lock(&lli->lli_och_mutex);
2705 if (fd->fd_lease_och == NULL) {
2706 fd->fd_lease_och = och;
2709 mutex_unlock(&lli->lli_och_mutex);
2711 /* impossible now that only excl is supported for now */
2712 ll_lease_close(och, inode, &lease_broken);
2717 case LL_IOC_GET_LEASE: {
2718 struct ll_inode_info *lli = ll_i2info(inode);
2719 struct ldlm_lock *lock = NULL;
2722 mutex_lock(&lli->lli_och_mutex);
2723 if (fd->fd_lease_och != NULL) {
2724 struct obd_client_handle *och = fd->fd_lease_och;
2726 lock = ldlm_handle2lock(&och->och_lease_handle);
2728 lock_res_and_lock(lock);
2729 if (!ldlm_is_cancel(lock))
2730 fmode = och->och_flags;
2732 unlock_res_and_lock(lock);
2733 LDLM_LOCK_PUT(lock);
2736 mutex_unlock(&lli->lli_och_mutex);
2738 RETURN(ll_lease_type_from_fmode(fmode));
2740 case LL_IOC_HSM_IMPORT: {
2741 struct hsm_user_import *hui;
2747 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
2752 rc = ll_hsm_import(inode, file, hui);
2757 case LL_IOC_FUTIMES_3: {
2758 struct ll_futimes_3 lfu;
2760 if (copy_from_user(&lfu,
2761 (const struct ll_futimes_3 __user *)arg,
2765 RETURN(ll_file_futimes_3(file, &lfu));
2767 case LL_IOC_LADVISE: {
2768 struct llapi_ladvise_hdr *ladvise_hdr;
2771 int alloc_size = sizeof(*ladvise_hdr);
2774 OBD_ALLOC_PTR(ladvise_hdr);
2775 if (ladvise_hdr == NULL)
2778 if (copy_from_user(ladvise_hdr,
2779 (const struct llapi_ladvise_hdr __user *)arg,
2781 GOTO(out_ladvise, rc = -EFAULT);
2783 if (ladvise_hdr->lah_magic != LADVISE_MAGIC ||
2784 ladvise_hdr->lah_count < 1)
2785 GOTO(out_ladvise, rc = -EINVAL);
2787 num_advise = ladvise_hdr->lah_count;
2788 if (num_advise >= LAH_COUNT_MAX)
2789 GOTO(out_ladvise, rc = -EFBIG);
2791 OBD_FREE_PTR(ladvise_hdr);
2792 alloc_size = offsetof(typeof(*ladvise_hdr),
2793 lah_advise[num_advise]);
2794 OBD_ALLOC(ladvise_hdr, alloc_size);
2795 if (ladvise_hdr == NULL)
2799 * TODO: submit multiple advices to one server in a single RPC
2801 if (copy_from_user(ladvise_hdr,
2802 (const struct llapi_ladvise_hdr __user *)arg,
2804 GOTO(out_ladvise, rc = -EFAULT);
2806 for (i = 0; i < num_advise; i++) {
2807 rc = ll_ladvise(inode, file, ladvise_hdr->lah_flags,
2808 &ladvise_hdr->lah_advise[i]);
2814 OBD_FREE(ladvise_hdr, alloc_size);
2817 case LL_IOC_FSGETXATTR:
2818 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
2819 case LL_IOC_FSSETXATTR:
2820 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
2825 ll_iocontrol_call(inode, file, cmd, arg, &err))
2828 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2829 (void __user *)arg));
2834 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Compat fallback: the running kernel does not export
 * generic_file_llseek_size(), so provide local helpers.
 *
 * llseek_execute() - validate @offset and commit it to file->f_pos.
 * NOTE(review): the error-return statements for the two range checks
 * are not visible in this fragment; presumably -EINVAL style returns.
 */
2835 static inline loff_t
2836 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
/* Negative offsets are only legal for FMODE_UNSIGNED_OFFSET files. */
2838 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2840 if (offset > maxsize)
/* Only write f_pos when it changes; clear f_version so any cached
 * position-dependent state is invalidated. */
2843 if (offset != file->f_pos) {
2844 file->f_pos = offset;
2845 file->f_version = 0;
/*
 * Local implementation of generic_file_llseek_size() for older kernels
 * (compiled only under !HAVE_FILE_LLSEEK_SIZE, see above).  Handles the
 * SEEK_CUR fast path without rewriting f_pos, and treats SEEK_DATA /
 * SEEK_HOLE with @eof as the data/hole boundary, delegating the final
 * commit to llseek_execute().  Several case labels and the inode_lock()
 * pairing are elided in this fragment.
 */
2851 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2852 loff_t maxsize, loff_t eof)
2854 struct inode *inode = file_inode(file);
2862 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2863 * position-querying operation. Avoid rewriting the "same"
2864 * f_pos value back to the file because a concurrent read(),
2865 * write() or lseek() might have altered it
2870 * f_lock protects against read/modify/write race with other
2871 * SEEK_CURs. Note that parallel writes and reads behave
2875 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2876 inode_unlock(inode);
2880 * In the generic case the entire file is data, so as long as
2881 * offset isn't at the end of the file then the offset is data.
2888 * There is a virtual hole at the end of the file, so as long as
2889 * offset isn't i_size or larger, return i_size.
2897 return llseek_execute(file, offset, maxsize);
/*
 * ll_file_seek() - llseek handler for Lustre files.
 * For SEEK_END/SEEK_HOLE/SEEK_DATA the cluster-wide file size must be
 * known, so glimpse the size from the OSTs before delegating to the
 * generic llseek-with-size helper bounded by ll_file_maxbytes().
 */
2901 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2903 struct inode *inode = file_inode(file);
2904 loff_t retval, eof = 0;
/* retval here is only a provisional absolute target used for tracing. */
2907 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2908 (origin == SEEK_CUR) ? file->f_pos : 0);
2909 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2910 PFID(ll_inode2fid(inode)), inode, retval, retval,
2912 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
/* These origins depend on an up-to-date i_size: fetch it via glimpse. */
2914 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2915 retval = ll_glimpse_size(inode);
2918 eof = i_size_read(inode);
2921 retval = ll_generic_file_llseek_size(file, offset, origin,
2922 ll_file_maxbytes(inode), eof);
/*
 * ll_flush() - .flush handler, called on close(2) of every file
 * descriptor.  Reports (and clears) any async writeback error recorded
 * on the inode/object so the application sees it as -EIO, unless the
 * write failure was already reported through fd_write_failed.
 */
2926 static int ll_flush(struct file *file, fl_owner_t id)
2928 struct inode *inode = file_inode(file);
2929 struct ll_inode_info *lli = ll_i2info(inode);
2930 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
/* Directories are flushed elsewhere; .flush is only wired for files. */
2933 LASSERT(!S_ISDIR(inode->i_mode));
2935 /* catch async errors that were recorded back when async writeback
2936 * failed for pages in this mapping. */
2937 rc = lli->lli_async_rc;
/* read-and-clear: the error is reported at most once per recording */
2938 lli->lli_async_rc = 0;
2939 if (lli->lli_clob != NULL) {
2940 err = lov_read_and_clear_async_rc(lli->lli_clob);
2945 /* The application has been told write failure already.
2946 * Do not report failure again. */
2947 if (fd->fd_write_failed)
/* Collapse any recorded error to -EIO for the VFS caller. */
2949 return rc ? -EIO : 0;
2953 * Called to make sure a portion of file has been written out.
2954 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2956 * Return how many pages have been written.
2958 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2959 enum cl_fsync_mode mode, int ignore_layout)
2963 struct cl_fsync_io *fio;
/* Reject modes outside the known cl_fsync_mode set up front. */
2968 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2969 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2972 env = cl_env_get(&refcheck);
2974 RETURN(PTR_ERR(env));
/* Build a CIT_FSYNC cl_io against this inode's cl object. */
2976 io = vvp_env_thread_io(env);
2977 io->ci_obj = ll_i2info(inode)->lli_clob;
2978 io->ci_ignore_layout = ignore_layout;
2980 /* initialize parameters for sync */
2981 fio = &io->u.ci_fsync;
2982 fio->fi_start = start;
2984 fio->fi_fid = ll_inode2fid(inode);
2985 fio->fi_mode = mode;
2986 fio->fi_nr_written = 0;
2988 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2989 result = cl_io_loop(env, io);
2991 result = io->ci_result;
/* On success the page count accumulated by the fsync io is returned. */
2993 result = fio->fi_nr_written;
2994 cl_io_fini(env, io);
2995 cl_env_put(env, &refcheck);
3001 * When dentry is provided (the 'else' case), file_dentry() may be
3002 * null and dentry must be used directly rather than pulled from
3003 * file_dentry() as is done otherwise.
3006 #ifdef HAVE_FILE_FSYNC_4ARGS
3007 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3009 struct dentry *dentry = file_dentry(file);
3010 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3011 int ll_fsync(struct file *file, int datasync)
3013 struct dentry *dentry = file_dentry(file);
/* Older prototypes have no range; sync the entire file. */
3015 loff_t end = LLONG_MAX;
3017 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3020 loff_t end = LLONG_MAX;
3022 struct inode *inode = dentry->d_inode;
3023 struct ll_inode_info *lli = ll_i2info(inode);
3024 struct ptlrpc_request *req;
3028 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3029 PFID(ll_inode2fid(inode)), inode);
3030 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3032 #ifdef HAVE_FILE_FSYNC_4ARGS
/* 4-arg variant: we must write out and wait on the range ourselves. */
3033 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3036 /* fsync's caller has already called _fdata{sync,write}, we want
3037 * that IO to finish before calling the osc and mdc sync methods */
3038 rc = filemap_fdatawait(inode->i_mapping);
3041 /* catch async errors that were recorded back when async writeback
3042 * failed for pages in this mapping. */
3043 if (!S_ISDIR(inode->i_mode)) {
3044 err = lli->lli_async_rc;
3045 lli->lli_async_rc = 0;
3048 if (lli->lli_clob != NULL) {
3049 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* Sync metadata on the MDT, then (for regular files) data on OSTs. */
3055 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3059 ptlrpc_req_finished(req);
3061 if (S_ISREG(inode->i_mode)) {
3062 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3064 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3065 if (rc == 0 && err < 0)
/* Track per-fd write failure so ll_flush() does not double-report. */
3068 fd->fd_write_failed = true;
3070 fd->fd_write_failed = false;
3073 #ifdef HAVE_FILE_FSYNC_4ARGS
/* pairs with an inode_lock() taken earlier on the 4-arg path
 * (not visible in this fragment) */
3074 inode_unlock(inode);
/*
 * ll_file_flock() - .lock/.flock handler: translate a VFS file_lock
 * (POSIX fcntl record lock or BSD flock) into an LDLM_FLOCK enqueue on
 * the MDT, then mirror the granted/released state into the local VFS
 * lock bookkeeping.  The switch case labels for fl_type/cmd are elided
 * in this fragment.
 */
3080 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3082 struct inode *inode = file_inode(file);
3083 struct ll_sb_info *sbi = ll_i2sbi(inode);
3084 struct ldlm_enqueue_info einfo = {
3085 .ei_type = LDLM_FLOCK,
3086 .ei_cb_cp = ldlm_flock_completion_ast,
3087 .ei_cbdata = file_lock,
3089 struct md_op_data *op_data;
3090 struct lustre_handle lockh = { 0 };
3091 union ldlm_policy_data flock = { { 0 } };
/* saved so the original type can be restored after the enqueue */
3092 int fl_type = file_lock->fl_type;
3098 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3099 PFID(ll_inode2fid(inode)), file_lock);
3101 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3103 if (file_lock->fl_flags & FL_FLOCK) {
3104 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3105 /* flocks are whole-file locks */
3106 flock.l_flock.end = OFFSET_MAX;
3107 /* For flocks owner is determined by the local file desctiptor*/
3108 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3109 } else if (file_lock->fl_flags & FL_POSIX) {
3110 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3111 flock.l_flock.start = file_lock->fl_start;
3112 flock.l_flock.end = file_lock->fl_end;
3116 flock.l_flock.pid = file_lock->fl_pid;
3118 /* Somewhat ugly workaround for svc lockd.
3119 * lockd installs custom fl_lmops->lm_compare_owner that checks
3120 * for the fl_owner to be the same (which it always is on local node
3121 * I guess between lockd processes) and then compares pid.
3122 * As such we assign pid to the owner field to make it all work,
3123 * conflict with normal locks is unlikely since pid space and
3124 * pointer space for current->files are not intersecting */
3125 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3126 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* map fl_type to an LDLM mode (case labels elided in this fragment):
 * read lock -> PR */
3130 einfo.ei_mode = LCK_PR;
3133 /* An unlock request may or may not have any relation to
3134 * existing locks so we may not be able to pass a lock handle
3135 * via a normal ldlm_lock_cancel() request. The request may even
3136 * unlock a byte range in the middle of an existing lock. In
3137 * order to process an unlock request we need all of the same
3138 * information that is given with a normal read or write record
3139 * lock request. To avoid creating another ldlm unlock (cancel)
3140 * message we'll treat a LCK_NL flock request as an unlock. */
3141 einfo.ei_mode = LCK_NL;
/* write lock -> PW */
3144 einfo.ei_mode = LCK_PW;
3147 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* non-blocking set request */
3162 flags = LDLM_FL_BLOCK_NOWAIT;
/* F_GETLK-style query: test only, do not take the lock */
3168 flags = LDLM_FL_TEST_LOCK;
3171 CERROR("unknown fcntl lock command: %d\n", cmd);
3175 /* Save the old mode so that if the mode in the lock changes we
3176 * can decrement the appropriate reader or writer refcount. */
3177 file_lock->fl_type = einfo.ei_mode;
3179 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3180 LUSTRE_OPC_ANY, NULL);
3181 if (IS_ERR(op_data))
3182 RETURN(PTR_ERR(op_data));
3184 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
3185 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
3186 flock.l_flock.pid, flags, einfo.ei_mode,
3187 flock.l_flock.start, flock.l_flock.end);
3189 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
3192 /* Restore the file lock type if not TEST lock. */
3193 if (!(flags & LDLM_FL_TEST_LOCK))
3194 file_lock->fl_type = fl_type;
3196 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
3197 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
3198 !(flags & LDLM_FL_TEST_LOCK))
3199 rc2 = locks_lock_file_wait(file, file_lock);
3201 if ((file_lock->fl_flags & FL_FLOCK) &&
3202 (rc == 0 || file_lock->fl_type == F_UNLCK))
3203 rc2 = flock_lock_file_wait(file, file_lock);
3204 if ((file_lock->fl_flags & FL_POSIX) &&
3205 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3206 !(flags & LDLM_FL_TEST_LOCK))
3207 rc2 = posix_lock_file_wait(file, file_lock);
3208 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
/* Local VFS bookkeeping failed after the server granted the lock:
 * roll the server-side lock back by enqueueing it as LCK_NL (unlock). */
3210 if (rc2 && file_lock->fl_type != F_UNLCK) {
3211 einfo.ei_mode = LCK_NL;
3212 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
3217 ll_finish_md_op_data(op_data);
/*
 * ll_get_fid_by_name() - look up the FID of @name under @parent via a
 * getattr-by-name RPC to the MDT.  On success *@fid is filled from the
 * reply body; if @inode is non-NULL the inode is also instantiated from
 * the reply.  The caller owns the inode reference on success.
 */
3222 int ll_get_fid_by_name(struct inode *parent, const char *name,
3223 int namelen, struct lu_fid *fid,
3224 struct inode **inode)
3226 struct md_op_data *op_data = NULL;
3227 struct mdt_body *body;
3228 struct ptlrpc_request *req;
3232 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3233 LUSTRE_OPC_ANY, NULL);
3234 if (IS_ERR(op_data))
3235 RETURN(PTR_ERR(op_data));
/* Only the FID and file type are needed from the MDT. */
3237 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
3238 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3239 ll_finish_md_op_data(op_data);
3243 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3245 GOTO(out_req, rc = -EFAULT);
3247 *fid = body->mbo_fid1;
3250 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
3252 ptlrpc_req_finished(req);
/*
 * ll_migrate() - move @name (a child of @parent) to MDT @mdtidx via a
 * MDS_RENAME_MIGRATE rename RPC.  For regular files a write lease is
 * taken and the data version recorded so the server can detect
 * concurrent modification; on -EAGAIN (layout changed) the operation
 * is retried.  Several error-path lines are elided in this fragment.
 */
3256 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3257 const char *name, int namelen)
3259 struct dentry *dchild = NULL;
3260 struct inode *child_inode = NULL;
3261 struct md_op_data *op_data;
3262 struct ptlrpc_request *request = NULL;
3263 struct obd_client_handle *och = NULL;
3265 struct mdt_body *body;
3267 __u64 data_version = 0;
3270 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3271 name, PFID(ll_inode2fid(parent)), mdtidx);
3273 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3274 0, LUSTRE_OPC_ANY, NULL);
3275 if (IS_ERR(op_data))
3276 RETURN(PTR_ERR(op_data));
3278 /* Get child FID first */
3279 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
/* Prefer the dcache; fall back to an MDT lookup by name below. */
3282 dchild = d_lookup(file_dentry(file), &qstr);
3283 if (dchild != NULL) {
3284 if (dchild->d_inode != NULL)
3285 child_inode = igrab(dchild->d_inode);
3289 if (child_inode == NULL) {
3290 rc = ll_get_fid_by_name(parent, name, namelen,
3291 &op_data->op_fid3, &child_inode);
3296 if (child_inode == NULL)
3297 GOTO(out_free, rc = -EINVAL);
3300 * lfs migrate command needs to be blocked on the client
3301 * by checking the migrate FID against the FID of the
3304 if (child_inode == parent->i_sb->s_root->d_inode)
3305 GOTO(out_iput, rc = -EINVAL);
/* Hold the child's inode lock across the migration RPC. */
3307 inode_lock(child_inode);
3308 op_data->op_fid3 = *ll_inode2fid(child_inode);
3309 if (!fid_is_sane(&op_data->op_fid3)) {
3310 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
3311 ll_get_fsname(parent->i_sb, NULL, 0), name,
3312 PFID(&op_data->op_fid3));
3313 GOTO(out_unlock, rc = -EINVAL);
3316 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3318 GOTO(out_unlock, rc);
/* Nothing to do if the child already lives on the target MDT. */
3321 CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name,
3322 PFID(&op_data->op_fid3), mdtidx);
3323 GOTO(out_unlock, rc = 0);
/* Regular file: take a write lease and snapshot the data version so
 * the server can refuse migration if the data changes underneath. */
3326 if (S_ISREG(child_inode->i_mode)) {
3327 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
3331 GOTO(out_unlock, rc);
3334 rc = ll_data_version(child_inode, &data_version,
3337 GOTO(out_close, rc);
3339 op_data->op_handle = och->och_fh;
3340 op_data->op_data = och->och_mod;
3341 op_data->op_data_version = data_version;
3342 op_data->op_lease_handle = och->och_lease_handle;
3343 op_data->op_bias |= MDS_RENAME_MIGRATE;
3346 op_data->op_mds = mdtidx;
3347 op_data->op_cli_flags = CLI_MIGRATE;
/* Migration is implemented as a rename to the same name. */
3348 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3349 namelen, name, namelen, &request);
3351 LASSERT(request != NULL);
3352 ll_update_times(request, parent);
3354 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
3355 LASSERT(body != NULL);
3357 /* If the server does release layout lock, then we cleanup
3358 * the client och here, otherwise release it in out_close: */
3360 body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
3361 obd_mod_put(och->och_mod);
3362 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
/* poison the handle so a later close cannot reuse it */
3364 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
3370 if (request != NULL) {
3371 ptlrpc_req_finished(request);
3375 /* Try again if the file layout has changed. */
3376 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
3380 if (och != NULL) /* close the file */
3381 ll_lease_close(och, child_inode, NULL);
3383 clear_nlink(child_inode);
3385 inode_unlock(child_inode);
3389 ll_finish_md_op_data(op_data);
/*
 * ll_file_noflock() - .flock/.lock handler for "-o noflock" mounts.
 * Body elided in this fragment; per the ll_file_operations_noflock
 * table comment it refuses file locking (returns -ENOSYS).
 */
3394 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3402 * test if some locks matching bits and l_req_mode are acquired
3403 * - bits can be in different locks
3404 * - if found clear the common lock bits in *bits
3405 * - the bits not found, are kept in *bits
3407 * \param bits [IN] searched lock bits [IN]
3408 * \param l_req_mode [IN] searched lock mode
3409 * \retval boolean, true iff all bits are found
3411 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
3413 struct lustre_handle lockh;
3414 union ldlm_policy_data policy;
/* LCK_MINMODE means "any mode": search all read/write lock modes. */
3415 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
3416 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
3425 fid = &ll_i2info(inode)->lli_fid;
3426 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3427 ldlm_lockname[mode]);
/* TEST_LOCK: match without taking a reference on the found lock. */
3429 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* Probe each requested inodebit individually until all are found
 * or the shift range is exhausted. */
3430 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3431 policy.l_inodebits.bits = *bits & (1 << i);
3432 if (policy.l_inodebits.bits == 0)
3435 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3436 &policy, mode, &lockh)) {
3437 struct ldlm_lock *lock;
3439 lock = ldlm_handle2lock(&lockh);
/* clear every bit covered by the matched lock, not just the probe */
3442 ~(lock->l_policy_data.l_inodebits.bits);
3443 LDLM_LOCK_PUT(lock);
3445 *bits &= ~policy.l_inodebits.bits;
/*
 * ll_take_md_lock() - try to match an existing IBITS metadata lock on
 * @inode covering @bits in @mode.  Unlike ll_have_md_lock() this takes
 * a reference on the matched lock, returning its handle in @lockh.
 * Returns the matched mode (0 when nothing matched).
 */
3452 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
3453 struct lustre_handle *lockh, __u64 flags,
3454 enum ldlm_mode mode)
3456 union ldlm_policy_data policy = { .l_inodebits = { bits } };
3461 fid = &ll_i2info(inode)->lli_fid;
3462 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3464 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3465 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * ll_inode_revalidate_fini() - post-process the result of an inode
 * revalidation RPC: -ENOENT on an already-unlinked inode is mapped to
 * success (with special handling for striped directories), other
 * errors are logged.  Some return statements are elided in this view.
 */
3470 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3472 /* Already unlinked. Just update nlink and return success */
3473 if (rc == -ENOENT) {
3475 /* If it is striped directory, and there is bad stripe
3476 * Let's revalidate the dentry again, instead of returning
3478 if (S_ISDIR(inode->i_mode) &&
3479 ll_i2info(inode)->lli_lsm_md != NULL)
3482 /* This path cannot be hit for regular files unless in
3483 * case of obscure races, so no need to to validate
3485 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3487 } else if (rc != 0) {
/* expected races (-EACCES/-EIDRM) are logged quietly, others loudly */
3488 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3489 "%s: revalidate FID "DFID" error: rc = %d\n",
3490 ll_get_fsname(inode->i_sb, NULL, 0),
3491 PFID(ll_inode2fid(inode)), rc);
/*
 * __ll_inode_revalidate() - refresh metadata for @dentry's inode from
 * the MDT, covered by the inodebits in @ibits.  Two paths: an intent
 * lock (IT_GETATTR/IT_LOOKUP) when the server supports ATTRFID, or a
 * plain md_getattr() when no matching MD lock is already held.
 */
3497 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3499 struct inode *inode = dentry->d_inode;
3500 struct ptlrpc_request *req = NULL;
3501 struct obd_export *exp;
3505 LASSERT(inode != NULL);
3507 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3508 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3510 exp = ll_i2mdexp(inode);
3512 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3513 * But under CMD case, it caused some lock issues, should be fixed
3514 * with new CMD ibits lock. See bug 12718 */
3515 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3516 struct lookup_intent oit = { .it_op = IT_GETATTR };
3517 struct md_op_data *op_data;
/* a pure LOOKUP-bit revalidation needs only an IT_LOOKUP intent */
3519 if (ibits == MDS_INODELOCK_LOOKUP)
3520 oit.it_op = IT_LOOKUP;
3522 /* Call getattr by fid, so do not provide name at all. */
3523 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3524 dentry->d_inode, NULL, 0, 0,
3525 LUSTRE_OPC_ANY, NULL);
3526 if (IS_ERR(op_data))
3527 RETURN(PTR_ERR(op_data));
3529 rc = md_intent_lock(exp, op_data, &oit, &req,
3530 &ll_md_blocking_ast, 0);
3531 ll_finish_md_op_data(op_data);
3533 rc = ll_inode_revalidate_fini(inode, rc);
3537 rc = ll_revalidate_it_finish(req, &oit, dentry);
3539 ll_intent_release(&oit);
3543 /* Unlinked? Unhash dentry, so it is not picked up later by
3544 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3545 here to preserve get_cwd functionality on 2.6.
3547 if (!dentry->d_inode->i_nlink) {
3548 ll_lock_dcache(inode);
3549 d_lustre_invalidate(dentry, 0);
3550 ll_unlock_dcache(inode);
3553 ll_lookup_finish_locks(&oit, dentry);
/* No ATTRFID path: only issue a getattr RPC when no MD lock already
 * covers the requested bits locally. */
3554 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3555 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3556 u64 valid = OBD_MD_FLGETATTR;
3557 struct md_op_data *op_data;
/* regular files also need striping EA info in the reply */
3560 if (S_ISREG(inode->i_mode)) {
3561 rc = ll_get_default_mdsize(sbi, &ealen);
3564 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3567 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3568 0, ealen, LUSTRE_OPC_ANY,
3570 if (IS_ERR(op_data))
3571 RETURN(PTR_ERR(op_data));
3573 op_data->op_valid = valid;
3574 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3575 ll_finish_md_op_data(op_data);
3577 rc = ll_inode_revalidate_fini(inode, rc);
3581 rc = ll_prep_inode(&inode, req, NULL, NULL);
3584 ptlrpc_req_finished(req);
/*
 * ll_merge_md_attr() - for a striped directory, merge the attributes
 * of all stripes (via md_merge_attr) and apply the aggregate nlink,
 * blocks, size and times to the master inode / ll_inode_info.
 */
3588 static int ll_merge_md_attr(struct inode *inode)
3590 struct cl_attr attr = { 0 };
/* only valid for striped directories: lsm_md must be present */
3593 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3594 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3595 &attr, ll_md_blocking_ast);
3599 set_nlink(inode, attr.cat_nlink);
3600 inode->i_blocks = attr.cat_blocks;
3601 i_size_write(inode, attr.cat_size);
3603 ll_i2info(inode)->lli_atime = attr.cat_atime;
3604 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
3605 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/*
 * ll_inode_revalidate() - full revalidation: refresh MD attributes via
 * __ll_inode_revalidate(), then for non-regular files copy the cached
 * lli times into the inode (merging striped-dir attrs first), and for
 * regular files glimpse the size from the OSTs unless a restore is in
 * progress.
 */
3611 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3613 struct inode *inode = dentry->d_inode;
3617 rc = __ll_inode_revalidate(dentry, ibits);
3621 /* if object isn't regular file, don't validate size */
3622 if (!S_ISREG(inode->i_mode)) {
3623 if (S_ISDIR(inode->i_mode) &&
3624 ll_i2info(inode)->lli_lsm_md != NULL) {
/* striped directory: aggregate stripe attributes first */
3625 rc = ll_merge_md_attr(inode);
3630 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
3631 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
3632 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
3634 /* In case of restore, the MDT has the right size and has
3635 * already send it back without granting the layout lock,
3636 * inode is up-to-date so glimpse is useless.
3637 * Also to glimpse we need the layout, in case of a running
3638 * restore the MDT holds the layout lock so the glimpse will
3639 * block up to the end of restore (getattr will block)
3641 if (!ll_file_test_flag(ll_i2info(inode), LLIF_FILE_RESTORING))
3642 rc = ll_glimpse_size(inode);
/*
 * ll_compat_encode_dev() - squeeze a dev_t into the 8-bit major/minor
 * range that 32-bit compat stat syscalls require.
 */
3647 static inline dev_t ll_compat_encode_dev(dev_t dev)
3649 /* The compat_sys_*stat*() syscalls will fail unless the
3650 * device majors and minors are both less than 256. Note that
3651 * the value returned here will be passed through
3652 * old_encode_dev() in cp_compat_stat(). And so we are not
3653 * trying to return a valid compat (u16) device number, just
3654 * one that will pass the old_valid_dev() check. */
3656 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
/*
 * ll_getattr() - .getattr handler: revalidate UPDATE|LOOKUP metadata
 * from the MDT, then fill *@stat from the (now fresh) inode.  For
 * 32-bit API users the inode number and device numbers are squeezed
 * into compat-safe ranges.
 */
3659 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3661 struct inode *inode = de->d_inode;
3662 struct ll_sb_info *sbi = ll_i2sbi(inode);
3663 struct ll_inode_info *lli = ll_i2info(inode);
3666 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3667 MDS_INODELOCK_LOOKUP);
3668 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
/* fault-injection hook for delayed-getattr testing */
3673 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
3675 if (ll_need_32bit_api(sbi)) {
3676 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3677 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
3678 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
3680 stat->ino = inode->i_ino;
3681 stat->dev = inode->i_sb->s_dev;
3682 stat->rdev = inode->i_rdev;
3685 stat->mode = inode->i_mode;
3686 stat->uid = inode->i_uid;
3687 stat->gid = inode->i_gid;
3688 stat->atime = inode->i_atime;
3689 stat->mtime = inode->i_mtime;
3690 stat->ctime = inode->i_ctime;
/* prefer the tunable per-sb stat blocksize when configured */
3691 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
3693 stat->nlink = inode->i_nlink;
3694 stat->size = i_size_read(inode);
3695 stat->blocks = inode->i_blocks;
/*
 * ll_fiemap() - .fiemap handler: marshal the kernel fiemap_extent_info
 * into a struct fiemap buffer, run ll_do_fiemap(), and copy the mapped
 * extents back to the user buffer referenced by fi_extents_start.
 */
3700 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3701 __u64 start, __u64 len)
3705 struct fiemap *fiemap;
3706 unsigned int extent_count = fieinfo->fi_extents_max;
/* header plus room for the caller's maximum extent count */
3708 num_bytes = sizeof(*fiemap) + (extent_count *
3709 sizeof(struct fiemap_extent));
3710 OBD_ALLOC_LARGE(fiemap, num_bytes);
3715 fiemap->fm_flags = fieinfo->fi_flags;
3716 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3717 fiemap->fm_start = start;
3718 fiemap->fm_length = len;
/* seed only the first extent from userspace (used to continue a
 * previous fiemap call) */
3719 if (extent_count > 0 &&
3720 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3721 sizeof(struct fiemap_extent)) != 0)
3722 GOTO(out, rc = -EFAULT);
3724 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3726 fieinfo->fi_flags = fiemap->fm_flags;
3727 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3728 if (extent_count > 0 &&
3729 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3730 fiemap->fm_mapped_extents *
3731 sizeof(struct fiemap_extent)) != 0)
3732 GOTO(out, rc = -EFAULT);
3734 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * ll_get_acl() - return a referenced copy of the cached POSIX ACL for
 * @inode, taken under lli_lock.  The caller (VFS permission code)
 * releases the reference.
 */
3738 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3740 struct ll_inode_info *lli = ll_i2info(inode);
3741 struct posix_acl *acl = NULL;
3744 spin_lock(&lli->lli_lock);
3745 /* VFS' acl_permission_check->check_acl will release the refcount */
3746 acl = posix_acl_dup(lli->lli_posix_acl);
3747 spin_unlock(&lli->lli_lock);
/*
 * ll_check_acl() - ACL callback for kernels whose generic_permission()
 * takes a check_acl function pointer (!HAVE_GENERIC_PERMISSION_2ARGS).
 * Fetches the access ACL and evaluates it with posix_acl_permission();
 * under RCU walk (IPERM_FLAG_RCU) it bails out (return elided here).
 * Without CONFIG_FS_POSIX_ACL the stub presumably returns an error.
 */
3752 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3754 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3755 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3757 ll_check_acl(struct inode *inode, int mask)
3760 # ifdef CONFIG_FS_POSIX_ACL
3761 struct posix_acl *acl;
3765 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3766 if (flags & IPERM_FLAG_RCU)
3769 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3774 rc = posix_acl_permission(inode, acl, mask);
3775 posix_acl_release(acl);
3778 # else /* !CONFIG_FS_POSIX_ACL */
3780 # endif /* CONFIG_FS_POSIX_ACL */
3782 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * ll_inode_permission() - .permission handler.  Revalidates the root
 * inode when needed, applies root-squash by temporarily overriding the
 * process credentials, then delegates to generic permission checking
 * (with ll_check_acl on pre-2args kernels).
 */
3784 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3785 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3787 # ifdef HAVE_INODE_PERMISION_2ARGS
3788 int ll_inode_permission(struct inode *inode, int mask)
3790 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3795 struct ll_sb_info *sbi;
3796 struct root_squash_info *squash;
3797 struct cred *cred = NULL;
3798 const struct cred *old_cred = NULL;
3800 bool squash_id = false;
/* RCU-walk cannot block on RPCs; bail out (return elided here). */
3803 #ifdef MAY_NOT_BLOCK
3804 if (mask & MAY_NOT_BLOCK)
3806 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3807 if (flags & IPERM_FLAG_RCU)
3811 /* as root inode are NOT getting validated in lookup operation,
3812 * need to do it before permission check. */
3814 if (inode == inode->i_sb->s_root->d_inode) {
3815 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3816 MDS_INODELOCK_LOOKUP);
3821 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3822 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3824 /* squash fsuid/fsgid if needed */
3825 sbi = ll_i2sbi(inode);
3826 squash = &sbi->ll_squash;
/* squash applies only to root (fsuid 0) and only when the mount has
 * a squash uid configured and NOROOTSQUASH is not set */
3827 if (unlikely(squash->rsi_uid != 0 &&
3828 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
3829 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3833 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3834 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3835 squash->rsi_uid, squash->rsi_gid);
3837 /* update current process's credentials
3838 * and FS capability */
3839 cred = prepare_creds();
3843 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
3844 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* drop all filesystem-related capabilities while squashed */
3845 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3846 if ((1 << cap) & CFS_CAP_FS_MASK)
3847 cap_lower(cred->cap_effective, cap);
3849 old_cred = override_creds(cred);
3852 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3853 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3854 /* restore current process's credentials and FS capability */
3856 revert_creds(old_cred);
3863 /* -o localflock - only provides locally consistent flock locks */
/* Default file ops table: no .flock/.lock entries, so flock falls back
 * to the kernel's local (single-node) implementation. */
3864 struct file_operations ll_file_operations = {
3865 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3866 # ifdef HAVE_SYNC_READ_WRITE
3867 .read = new_sync_read,
3868 .write = new_sync_write,
3870 .read_iter = ll_file_read_iter,
3871 .write_iter = ll_file_write_iter,
3872 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3873 .read = ll_file_read,
3874 .aio_read = ll_file_aio_read,
3875 .write = ll_file_write,
3876 .aio_write = ll_file_aio_write,
3877 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3878 .unlocked_ioctl = ll_file_ioctl,
3879 .open = ll_file_open,
3880 .release = ll_file_release,
3881 .mmap = ll_file_mmap,
3882 .llseek = ll_file_seek,
3883 .splice_read = ll_file_splice_read,
/* "-o flock" file ops table: identical to the default one but routes
 * .flock and .lock through ll_file_flock for cluster-wide locking. */
3888 struct file_operations ll_file_operations_flock = {
3889 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3890 # ifdef HAVE_SYNC_READ_WRITE
3891 .read = new_sync_read,
3892 .write = new_sync_write,
3893 # endif /* HAVE_SYNC_READ_WRITE */
3894 .read_iter = ll_file_read_iter,
3895 .write_iter = ll_file_write_iter,
3896 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3897 .read = ll_file_read,
3898 .aio_read = ll_file_aio_read,
3899 .write = ll_file_write,
3900 .aio_write = ll_file_aio_write,
3901 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3902 .unlocked_ioctl = ll_file_ioctl,
3903 .open = ll_file_open,
3904 .release = ll_file_release,
3905 .mmap = ll_file_mmap,
3906 .llseek = ll_file_seek,
3907 .splice_read = ll_file_splice_read,
3910 .flock = ll_file_flock,
3911 .lock = ll_file_flock
3914 /* These are for -o noflock - to return ENOSYS on flock calls */
3915 struct file_operations ll_file_operations_noflock = {
3916 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3917 # ifdef HAVE_SYNC_READ_WRITE
3918 .read = new_sync_read,
3919 .write = new_sync_write,
3920 # endif /* HAVE_SYNC_READ_WRITE */
3921 .read_iter = ll_file_read_iter,
3922 .write_iter = ll_file_write_iter,
3923 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3924 .read = ll_file_read,
3925 .aio_read = ll_file_aio_read,
3926 .write = ll_file_write,
3927 .aio_write = ll_file_aio_write,
3928 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3929 .unlocked_ioctl = ll_file_ioctl,
3930 .open = ll_file_open,
3931 .release = ll_file_release,
3932 .mmap = ll_file_mmap,
3933 .llseek = ll_file_seek,
3934 .splice_read = ll_file_splice_read,
/* both lock entry points refuse locking on this mount */
3937 .flock = ll_file_noflock,
3938 .lock = ll_file_noflock
/* Inode operations shared by all regular Lustre files. */
3941 struct inode_operations ll_file_inode_operations = {
3942 .setattr = ll_setattr,
3943 .getattr = ll_getattr,
3944 .permission = ll_inode_permission,
3945 .setxattr = ll_setxattr,
3946 .getxattr = ll_getxattr,
3947 .listxattr = ll_listxattr,
3948 .removexattr = ll_removexattr,
3949 .fiemap = ll_fiemap,
3950 #ifdef HAVE_IOP_GET_ACL
3951 .get_acl = ll_get_acl,
3955 /* dynamic ioctl number support routins */
/* Global registry of dynamically registered ioctl handlers, protected
 * by ioc_sem; each entry is an llioc_data holding a callback and the
 * ioctl command numbers it services. */
3956 static struct llioc_ctl_data {
3957 struct rw_semaphore ioc_sem;
3958 struct list_head ioc_head;
3960 __RWSEM_INITIALIZER(llioc.ioc_sem),
3961 LIST_HEAD_INIT(llioc.ioc_head)
3966 struct list_head iocd_list;
/* total allocation size, kept so the entry can free itself */
3967 unsigned int iocd_size;
3968 llioc_callback_t iocd_cb;
3969 unsigned int iocd_count;
/* flexible array of iocd_count ioctl command numbers */
3970 unsigned int iocd_cmd[0];
/*
 * ll_iocontrol_register() - register a dynamic ioctl handler @cb for
 * the @count command numbers in @cmd.  Returns an opaque cookie (the
 * allocated llioc_data) for ll_iocontrol_unregister(), or NULL on bad
 * arguments / allocation failure (returns elided in this fragment).
 */
3973 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3976 struct llioc_data *in_data = NULL;
3979 if (cb == NULL || cmd == NULL ||
3980 count > LLIOC_MAX_CMD || count < 0)
3983 size = sizeof(*in_data) + count * sizeof(unsigned int);
3984 OBD_ALLOC(in_data, size);
3985 if (in_data == NULL)
3988 memset(in_data, 0, sizeof(*in_data));
3989 in_data->iocd_size = size;
3990 in_data->iocd_cb = cb;
3991 in_data->iocd_count = count;
3992 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
/* publish under the write lock */
3994 down_write(&llioc.ioc_sem);
3995 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3996 up_write(&llioc.ioc_sem);
/*
 * Remove and free the handler previously returned by
 * ll_iocontrol_register().  @magic is the registration cookie; if no
 * matching entry is found, a warning is logged.  NOTE(review): the
 * magic-match test inside the loop is elided in this excerpt.
 */
4001 void ll_iocontrol_unregister(void *magic)
4003 struct llioc_data *tmp;
4008 down_write(&llioc.ioc_sem);
4009 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* capture the allocation size before the entry is freed */
4011 unsigned int size = tmp->iocd_size;
4013 list_del(&tmp->iocd_list);
4014 up_write(&llioc.ioc_sem);
4016 OBD_FREE(tmp, size);
4020 up_write(&llioc.ioc_sem);
4022 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
4025 EXPORT_SYMBOL(ll_iocontrol_register);
4026 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch a dynamic ioctl: walk the registered handlers under the
 * shared semaphore and invoke the first callback whose command table
 * contains @cmd.  LLIOC_STOP from the callback means the ioctl was
 * consumed; the callback's status is presumably stored through *rcp —
 * that assignment is elided in this excerpt, confirm in the full source.
 */
4028 static enum llioc_iter
4029 ll_iocontrol_call(struct inode *inode, struct file *file,
4030 unsigned int cmd, unsigned long arg, int *rcp)
4032 enum llioc_iter ret = LLIOC_CONT;
4033 struct llioc_data *data;
4034 int rc = -EINVAL, i;
4036 down_read(&llioc.ioc_sem);
4037 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
4038 for (i = 0; i < data->iocd_count; i++) {
4039 if (cmd != data->iocd_cmd[i])
/* matching command number: hand off to the registered callback */
4042 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
4046 if (ret == LLIOC_STOP)
4049 up_read(&llioc.ioc_sem);
/*
 * Push a layout configuration into the cl_object stack for @inode via
 * cl_conf_set().  For OBJECT_CONF_SET the new layout comes with the
 * layout DLM lock in conf->coc_lock; on that path the cached layout
 * generation in ll_inode_info is refreshed from the object.
 */
4056 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4058 struct ll_inode_info *lli = ll_i2info(inode);
4059 struct cl_object *obj = lli->lli_clob;
4068 env = cl_env_get(&refcheck);
4070 RETURN(PTR_ERR(env));
4072 rc = cl_conf_set(env, lli->lli_clob, conf);
4076 if (conf->coc_opc == OBJECT_CONF_SET) {
4077 struct ldlm_lock *lock = conf->coc_lock;
4078 struct cl_layout cl = {
4082 LASSERT(lock != NULL);
4083 LASSERT(ldlm_has_layout(lock));
4085 /* it can only be allowed to match after layout is
4086 * applied to inode otherwise false layout would be
4087 * seen. Applying layout should happen before dropping
4088 * the intent lock. */
4089 ldlm_lock_allow_match(lock);
/* read back the generation the object now carries */
4091 rc = cl_object_layout_get(env, obj, &cl);
4096 DFID": layout version change: %u -> %u\n",
4097 PFID(&lli->lli_fid), ll_layout_version_get(lli),
4099 ll_layout_version_set(lli, cl.cl_layout_gen);
4103 cl_env_put(env, &refcheck);
4108 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/*
 * When a layout lock was granted via completion AST it carries no usable
 * LVB payload, so issue an MDS getxattr for XATTR_NAME_LOV and attach a
 * private copy of the layout to lock->l_lvb_data under the lock's
 * resource lock.  Returns 0 on success or a negative errno.
 */
4109 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4112 struct ll_sb_info *sbi = ll_i2sbi(inode);
4113 struct ptlrpc_request *req;
4114 struct mdt_body *body;
4121 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4122 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4123 lock->l_lvb_data, lock->l_lvb_len);
/* layout blob already attached to the lock: nothing to fetch */
4125 if (lock->l_lvb_data != NULL)
4128 /* if layout lock was granted right away, the layout is returned
4129 * within DLM_LVB of dlm reply; otherwise if the lock was ever
4130 * blocked and then granted via completion ast, we have to fetch
4131 * layout here. Please note that we can't use the LVB buffer in
4132 * completion AST because it doesn't have a large enough buffer */
4133 rc = ll_get_default_mdsize(sbi, &lmmsize);
4135 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4136 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
4141 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4143 GOTO(out, rc = -EPROTO);
4145 lmmsize = body->mbo_eadatasize;
4146 if (lmmsize == 0) /* empty layout */
4149 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4151 GOTO(out, rc = -EFAULT);
/* copy out of the RPC reply buffer so the data can outlive the request */
4153 OBD_ALLOC_LARGE(lvbdata, lmmsize);
4154 if (lvbdata == NULL)
4155 GOTO(out, rc = -ENOMEM);
4157 memcpy(lvbdata, lmm, lmmsize);
4158 lock_res_and_lock(lock);
/* install only if no other thread attached an LVB while we fetched */
4159 if (unlikely(lock->l_lvb_data == NULL)) {
4160 lock->l_lvb_type = LVB_T_LAYOUT;
4161 lock->l_lvb_data = lvbdata;
4162 lock->l_lvb_len = lmmsize;
4165 unlock_res_and_lock(lock);
/* NOTE(review): branch structure elided here — this free presumably
 * runs when we lost the install race; confirm in the full source */
4168 OBD_FREE_LARGE(lvbdata, lmmsize);
4173 ptlrpc_req_finished(req);
4178 * Apply the layout to the inode. Layout lock is held and will be released
/*
 * Apply the layout carried in the lock's LVB to the inode, then drop the
 * ldlm lock reference.  If the refresh returns -EBUSY (object still has
 * IO under the old layout), wait for it to drain via OBJECT_CONF_WAIT.
 */
4181 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4182 struct inode *inode)
4184 struct ll_inode_info *lli = ll_i2info(inode);
4185 struct ll_sb_info *sbi = ll_i2sbi(inode);
4186 struct ldlm_lock *lock;
4187 struct cl_object_conf conf;
4190 bool wait_layout = false;
4193 LASSERT(lustre_handle_is_used(lockh));
4195 lock = ldlm_handle2lock(lockh);
4196 LASSERT(lock != NULL);
4197 LASSERT(ldlm_has_layout(lock));
4199 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4200 PFID(&lli->lli_fid), inode);
4202 /* in case this is a caching lock and reinstate with new inode */
4203 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
4205 lock_res_and_lock(lock);
4206 lvb_ready = ldlm_is_lvb_ready(lock);
4207 unlock_res_and_lock(lock);
4209 /* checking lvb_ready is racy but this is okay. The worst case is
4210 * that multi processes may configure the file on the same time. */
/* make sure the layout blob is attached to the lock before we use it */
4214 rc = ll_layout_fetch(inode, lock);
4218 /* for layout lock, lmm is stored in lock's lvb.
4219 * lvb_data is immutable if the lock is held so it's safe to access it
4222 * set layout to file. Unlikely this will fail as old layout was
4223 * surely eliminated */
4224 memset(&conf, 0, sizeof conf);
4225 conf.coc_opc = OBJECT_CONF_SET;
4226 conf.coc_inode = inode;
4227 conf.coc_lock = lock;
4228 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
4229 conf.u.coc_layout.lb_len = lock->l_lvb_len;
4230 rc = ll_layout_conf(inode, &conf);
4232 /* refresh layout failed, need to wait */
4233 wait_layout = rc == -EBUSY;
4236 LDLM_LOCK_PUT(lock);
4237 ldlm_lock_decref(lockh, mode);
4239 /* wait for IO to complete if it's still being used. */
4241 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
4242 ll_get_fsname(inode->i_sb, NULL, 0),
4243 PFID(&lli->lli_fid), inode);
/* OBJECT_CONF_WAIT blocks until users of the old layout are gone */
4245 memset(&conf, 0, sizeof conf);
4246 conf.coc_opc = OBJECT_CONF_WAIT;
4247 conf.coc_inode = inode;
4248 rc = ll_layout_conf(inode, &conf);
4252 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4253 ll_get_fsname(inode->i_sb, NULL, 0),
4254 PFID(&lli->lli_fid), rc);
4260 * Issue layout intent RPC to MDS.
4261 * \param inode [in] file inode
4262 * \param intent [in] layout intent
4264 * \retval 0 on success
4265 * \retval < 0 error code
/*
 * Issue an IT_LAYOUT intent RPC to the MDS for @inode, shipping @intent
 * as the op_data payload.  Write and truncate intents request the lock
 * with FMODE_WRITE.  Afterwards the lock (if granted) is attached to the
 * inode via ll_set_lock_data() and the intent's lock ref is dropped.
 */
4267 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
4269 struct ll_inode_info *lli = ll_i2info(inode);
4270 struct ll_sb_info *sbi = ll_i2sbi(inode);
4271 struct md_op_data *op_data;
4272 struct lookup_intent it;
4273 struct ptlrpc_request *req;
4277 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4278 0, 0, LUSTRE_OPC_ANY, NULL);
4279 if (IS_ERR(op_data))
4280 RETURN(PTR_ERR(op_data));
/* the layout intent travels as opaque data in the RPC */
4282 op_data->op_data = intent;
4283 op_data->op_data_size = sizeof(*intent);
4285 memset(&it, 0, sizeof(it));
4286 it.it_op = IT_LAYOUT;
4287 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
4288 intent->li_opc == LAYOUT_INTENT_TRUNC)
4289 it.it_flags = FMODE_WRITE;
4291 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4292 ll_get_fsname(inode->i_sb, NULL, 0),
4293 PFID(&lli->lli_fid), inode);
4295 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
4296 &ll_md_blocking_ast, 0);
/* done with the reply request, if one was returned */
4297 if (it.it_request != NULL)
4298 ptlrpc_req_finished(it.it_request);
4299 it.it_request = NULL;
4301 ll_finish_md_op_data(op_data);
4303 /* set lock data in case this is a new lock */
4305 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4307 ll_intent_drop_lock(&it);
4313 * This function checks if there exists a LAYOUT lock on the client side,
4314 * or enqueues it if it doesn't have one in cache.
4316 * This function will not hold layout lock so it may be revoked any time after
4317 * this function returns. Any operations that depend on the layout should be redone
4320 * This function should be called before lov_io_init() to get an uptodate
4321 * layout version, the caller should save the version number and after IO
4322 * is finished, this function should be called again to verify that layout
4323 * is not changed during IO time.
/*
 * Refresh the cached layout generation for @inode into *gen.  Fast path
 * returns the cached generation; otherwise, under lli_layout_mutex, try
 * to match a cached MDS_INODELOCK_LAYOUT lock and apply its layout, or
 * enqueue a LAYOUT_INTENT_ACCESS intent to the MDS.  Returns 0 or a
 * negative errno.
 */
4325 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4327 struct ll_inode_info *lli = ll_i2info(inode);
4328 struct ll_sb_info *sbi = ll_i2sbi(inode);
4329 struct lustre_handle lockh;
4330 struct layout_intent intent = {
4331 .li_opc = LAYOUT_INTENT_ACCESS,
4333 enum ldlm_mode mode;
4337 *gen = ll_layout_version_get(lli);
/* fast path: layout lock disabled, or a valid generation already cached */
4338 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
4342 LASSERT(fid_is_sane(ll_inode2fid(inode)));
4343 LASSERT(S_ISREG(inode->i_mode));
4345 /* take layout lock mutex to enqueue layout lock exclusively. */
4346 mutex_lock(&lli->lli_layout_mutex);
4349 /* mostly layout lock is caching on the local side, so try to
4350 * match it before grabbing layout lock mutex.
 * NOTE(review): this comment looks stale — the mutex is taken above,
 * before the match; confirm against upstream history. */
4351 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4352 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4353 if (mode != 0) { /* hit cached lock */
4354 rc = ll_layout_lock_set(&lockh, mode, inode);
/* no cached lock: go to the MDS with an ACCESS intent */
4360 rc = ll_layout_intent(inode, &intent);
4366 *gen = ll_layout_version_get(lli);
4367 mutex_unlock(&lli->lli_layout_mutex);
4373 * Issue layout intent RPC indicating where in a file an IO is about to write.
4375 * \param[in] inode file inode.
4376 * \param[in] start start offset of file in bytes where an IO is about to
4378 * \param[in] end exclusive end offset in bytes of the write range.
4380 * \retval 0 on success
4381 * \retval < 0 error code
/*
 * Notify the MDS, via a LAYOUT_INTENT_WRITE intent, of the byte range of
 * @inode an IO is about to write.  NOTE(review): the extent-field
 * initializers using @start/@end are elided in this excerpt — confirm in
 * the full source.
 */
4383 int ll_layout_write_intent(struct inode *inode, __u64 start, __u64 end)
4385 struct layout_intent intent = {
4386 .li_opc = LAYOUT_INTENT_WRITE,
4393 rc = ll_layout_intent(inode, &intent);
4399 * This function send a restore request to the MDT
4401 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4403 struct hsm_user_request *hur;
4407 len = sizeof(struct hsm_user_request) +
4408 sizeof(struct hsm_user_item);
4409 OBD_ALLOC(hur, len);
4413 hur->hur_request.hr_action = HUA_RESTORE;
4414 hur->hur_request.hr_archive_id = 0;
4415 hur->hur_request.hr_flags = 0;
4416 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4417 sizeof(hur->hur_user_item[0].hui_fid));
4418 hur->hur_user_item[0].hui_extent.offset = offset;
4419 hur->hur_user_item[0].hui_extent.length = length;
4420 hur->hur_request.hr_itemcount = 1;
4421 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,