4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2015, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <linux/pagemap.h>
46 #include <linux/file.h>
47 #include <linux/sched.h>
48 #include <linux/user_namespace.h>
49 #ifdef HAVE_UIDGID_HEADER
50 # include <linux/uidgid.h>
52 #include <lustre/ll_fiemap.h>
54 #include <lustre_ioctl.h>
55 #include <lustre_swab.h>
57 #include "cl_object.h"
58 #include "llite_internal.h"
59 #include "vvp_internal.h"
62 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
64 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
67 static enum llioc_iter
68 ll_iocontrol_call(struct inode *inode, struct file *file,
69 unsigned int cmd, unsigned long arg, int *rcp);
/* Allocate and initialise a per-open-file ll_file_data from its slab cache.
 * GFP_NOFS avoids re-entering the filesystem under memory reclaim.
 * NOTE(review): the allocation-failure check and return path are not
 * visible in this chunk — confirm against the full file. */
71 static struct ll_file_data *ll_file_data_get(void)
73 	struct ll_file_data *fd;
75 	OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
79 	fd->fd_write_failed = false;
/* Return a ll_file_data to its slab cache; counterpart of ll_file_data_get(). */
84 static void ll_file_data_put(struct ll_file_data *fd)
87 	OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
91 * Packs all the attributes into @op_data for the CLOSE rpc.
/* Snapshot the inode's mode, timestamps, size, blocks and flags into
 * @op_data and record the open handle so the MDS can apply them when the
 * file is closed.  If this was a write handle and local data was modified,
 * MDS_DATA_MODIFIED is added so HSM can mark the archived copy dirty. */
93 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
94 struct obd_client_handle *och)
98 ll_prep_md_op_data(op_data, inode, NULL, NULL,
99 0, 0, LUSTRE_OPC_ANY, NULL);
101 op_data->op_attr.ia_mode = inode->i_mode;
102 op_data->op_attr.ia_atime = inode->i_atime;
103 op_data->op_attr.ia_mtime = inode->i_mtime;
104 op_data->op_attr.ia_ctime = inode->i_ctime;
105 op_data->op_attr.ia_size = i_size_read(inode);
/* Mark every copied attribute valid so the MDS applies all of them. */
106 op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
107 ATTR_MTIME | ATTR_MTIME_SET |
108 ATTR_CTIME | ATTR_CTIME_SET;
109 op_data->op_attr_blocks = inode->i_blocks;
110 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
111 op_data->op_handle = och->och_fh;
113 if (och->och_flags & FMODE_WRITE &&
114 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
115 /* For HSM: if inode data has been modified, pack it so that
116 * MDT can set data dirty flag in the archive. */
117 op_data->op_bias |= MDS_DATA_MODIFIED;
123 * Perform a close, possibly with a bias.
124 * The meaning of "data" depends on the value of "bias".
126 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
127 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
/* Send the CLOSE RPC for @och, optionally biased for HSM release or a
 * layout swap.  Frees the open handle state on completion; callers must
 * not reuse @och afterwards (the fh cookie is poisoned below). */
130 static int ll_close_inode_openhandle(struct inode *inode,
131 struct obd_client_handle *och,
132 enum mds_op_bias bias, void *data)
134 struct obd_export *md_exp = ll_i2mdexp(inode);
135 const struct ll_inode_info *lli = ll_i2info(inode);
136 struct md_op_data *op_data;
137 struct ptlrpc_request *req = NULL;
/* A NULL obd means the MDC connection is already gone (e.g. umount). */
141 if (class_exp2obd(md_exp) == NULL) {
142 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
143 ll_get_fsname(inode->i_sb, NULL, 0),
144 PFID(&lli->lli_fid));
148 OBD_ALLOC_PTR(op_data);
149 /* We leak openhandle and request here on error, but not much to be
150 * done in OOM case since app won't retry close on error either. */
152 GOTO(out, rc = -ENOMEM);
154 ll_prepare_close(inode, op_data, och);
/* Bias-specific packing; @data's meaning depends on @bias (see above). */
156 case MDS_CLOSE_LAYOUT_SWAP:
157 LASSERT(data != NULL);
158 op_data->op_bias |= MDS_CLOSE_LAYOUT_SWAP;
159 op_data->op_data_version = 0;
160 op_data->op_lease_handle = och->och_lease_handle;
161 op_data->op_fid2 = *ll_inode2fid(data);
164 case MDS_HSM_RELEASE:
165 LASSERT(data != NULL);
166 op_data->op_bias |= MDS_HSM_RELEASE;
167 op_data->op_data_version = *(__u64 *)data;
168 op_data->op_lease_handle = och->och_lease_handle;
169 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
173 LASSERT(data == NULL);
/* -EINTR is an expected interruption, so it is not logged as an error. */
177 rc = md_close(md_exp, op_data, och->och_mod, &req);
178 if (rc != 0 && rc != -EINTR)
179 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
180 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* For biased closes, check whether the MDS actually executed the
 * close intent (release / layout swap) via the reply body. */
183 op_data->op_bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)) {
184 struct mdt_body *body;
186 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
187 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
191 ll_finish_md_op_data(op_data);
195 md_clear_open_replay_data(md_exp, och);
/* Poison the handle so accidental reuse is detectable. */
196 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
199 ptlrpc_req_finished(req); /* This is close request */
/* Drop one reference on the MDS open handle matching @fmode (write, exec
 * or read) and, when this was the last user, send the real CLOSE RPC.
 * The och pointer and usecount are selected under lli_och_mutex. */
203 int ll_md_real_close(struct inode *inode, fmode_t fmode)
205 struct ll_inode_info *lli = ll_i2info(inode);
206 struct obd_client_handle **och_p;
207 struct obd_client_handle *och;
/* Pick the per-mode open handle slot and its use counter. */
212 if (fmode & FMODE_WRITE) {
213 och_p = &lli->lli_mds_write_och;
214 och_usecount = &lli->lli_open_fd_write_count;
215 } else if (fmode & FMODE_EXEC) {
216 och_p = &lli->lli_mds_exec_och;
217 och_usecount = &lli->lli_open_fd_exec_count;
219 LASSERT(fmode & FMODE_READ);
220 och_p = &lli->lli_mds_read_och;
221 och_usecount = &lli->lli_open_fd_read_count;
224 mutex_lock(&lli->lli_och_mutex);
225 if (*och_usecount > 0) {
226 /* There are still users of this handle, so skip
228 mutex_unlock(&lli->lli_och_mutex);
234 mutex_unlock(&lli->lli_och_mutex);
237 /* There might be a race and this handle may already
239 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Per-file-descriptor close: release group lock and lease if held, close
 * any per-fd open handle, decrement the per-mode open count, and only
 * talk to the MDS when no matching OPEN DLM lock remains cached.
 * Always frees the ll_file_data, even on error. */
245 static int ll_md_close(struct inode *inode, struct file *file)
247 union ldlm_policy_data policy = {
248 .l_inodebits = { MDS_INODELOCK_OPEN },
/* LDLM_FL_TEST_LOCK: only probe for a matching lock, don't take a ref. */
250 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
251 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
252 struct ll_inode_info *lli = ll_i2info(inode);
253 struct lustre_handle lockh;
254 enum ldlm_mode lockmode;
258 /* clear group lock, if present */
259 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
260 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
262 if (fd->fd_lease_och != NULL) {
265 /* Usually the lease is not released when the
266 * application crashed, we need to release here. */
267 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
268 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
269 PFID(&lli->lli_fid), rc, lease_broken);
271 fd->fd_lease_och = NULL;
274 if (fd->fd_och != NULL) {
275 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
280 /* Let's see if we have good enough OPEN lock on the file and if
281 we can skip talking to MDS */
282 mutex_lock(&lli->lli_och_mutex);
283 if (fd->fd_omode & FMODE_WRITE) {
285 LASSERT(lli->lli_open_fd_write_count);
286 lli->lli_open_fd_write_count--;
287 } else if (fd->fd_omode & FMODE_EXEC) {
289 LASSERT(lli->lli_open_fd_exec_count);
290 lli->lli_open_fd_exec_count--;
293 LASSERT(lli->lli_open_fd_read_count);
294 lli->lli_open_fd_read_count--;
296 mutex_unlock(&lli->lli_och_mutex);
/* No cached OPEN ibits lock -> must do the real close with the MDS. */
298 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
299 LDLM_IBITS, &policy, lockmode, &lockh))
300 rc = ll_md_real_close(inode, fd->fd_omode);
303 LUSTRE_FPRIVATE(file) = NULL;
304 ll_file_data_put(fd);
309 /* While this returns an error code, fput() the caller does not, so we need
310 * to make every effort to clean up all of our state here. Also, applications
311 * rarely check close errors and even if an error is returned they will not
312 * re-try the close call.
/* VFS ->release() hook: tear down per-fd state (remote-ACL bookkeeping,
 * statahead authorization, async write error propagation) and funnel the
 * real work through ll_md_close().  The filesystem root is special-cased
 * and skips the MDS close entirely. */
314 int ll_file_release(struct inode *inode, struct file *file)
316 struct ll_file_data *fd;
317 struct ll_sb_info *sbi = ll_i2sbi(inode);
318 struct ll_inode_info *lli = ll_i2info(inode);
322 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
323 PFID(ll_inode2fid(inode)), inode);
325 #ifdef CONFIG_FS_POSIX_ACL
/* Remote-client ACL cleanup applies only when closing the fs root. */
326 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
327 inode == inode->i_sb->s_root->d_inode) {
328 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
331 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
332 fd->fd_flags &= ~LL_FILE_RMTACL;
333 rct_del(&sbi->ll_rct, current_pid());
334 et_search_free(&sbi->ll_et, current_pid());
339 if (inode->i_sb->s_root != file->f_path.dentry)
340 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
341 fd = LUSTRE_FPRIVATE(file);
344 /* The last ref on @file, maybe not the the owner pid of statahead,
345 * because parent and child process can share the same file handle. */
346 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
347 ll_deauthorize_statahead(inode, fd);
/* Root dentry: just drop the fd, no MDS close needed. */
349 if (inode->i_sb->s_root == file->f_path.dentry) {
350 LUSTRE_FPRIVATE(file) = NULL;
351 ll_file_data_put(fd);
/* Surface any asynchronous write errors accumulated on the cl_object. */
355 if (!S_ISDIR(inode->i_mode)) {
356 if (lli->lli_clob != NULL)
357 lov_read_and_clear_async_rc(lli->lli_clob);
358 lli->lli_async_rc = 0;
361 rc = ll_md_close(inode, file);
363 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
364 libcfs_debug_dumplog();
/* Issue an intent-based OPEN to the MDS for @file.  The name is packed
 * only when the server lacks OBD_CONNECT_OPEN_BY_FID support; otherwise
 * the open is purely by FID.  On success the returned open handle/lock
 * state is installed on the inode via ll_prep_inode()/ll_set_lock_data().
 * @lmm/@lmmsize optionally carry striping data for the request. */
369 static int ll_intent_file_open(struct file *file, void *lmm, int lmmsize,
370 struct lookup_intent *itp)
372 struct dentry *de = file->f_path.dentry;
373 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
374 struct dentry *parent = de->d_parent;
375 const char *name = NULL;
377 struct md_op_data *op_data;
378 struct ptlrpc_request *req = NULL;
382 LASSERT(parent != NULL);
383 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
385 /* if server supports open-by-fid, or file name is invalid, don't pack
386 * name in open request */
387 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
388 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
389 name = de->d_name.name;
390 len = de->d_name.len;
393 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
394 name, len, 0, LUSTRE_OPC_ANY, NULL);
396 RETURN(PTR_ERR(op_data));
397 op_data->op_data = lmm;
398 op_data->op_data_size = lmmsize;
400 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
401 &ll_md_blocking_ast, 0);
402 ll_finish_md_op_data(op_data);
404 /* reason for keep own exit path - don`t flood log
405 * with messages with -ESTALE errors.
/* If the open intent was not executed (or failed), release any handle
 * the server may still hold before bailing out. */
407 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
408 it_open_error(DISP_OPEN_OPEN, itp))
410 ll_release_openhandle(de, itp);
414 if (it_disposition(itp, DISP_LOOKUP_NEG))
415 GOTO(out, rc = -ENOENT);
417 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
418 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
419 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
423 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
424 if (!rc && itp->it_lock_mode)
425 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
428 ptlrpc_req_finished(req);
429 ll_intent_drop_lock(itp);
/* Populate an obd_client_handle from the MDT reply carried by @it (file
 * handle, FID, lease-lock cookie, open flags) and register it for open
 * replay so the handle survives MDS recovery. */
434 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
435 struct obd_client_handle *och)
437 struct mdt_body *body;
439 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
440 och->och_fh = body->mbo_handle;
441 och->och_fid = body->mbo_fid1;
442 och->och_lease_handle.cookie = it->it_lock_handle;
443 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
444 och->och_flags = it->it_flags;
446 return md_set_open_replay_data(md_exp, och, it);
/* Finish an open locally: optionally fill @och from the intent reply,
 * then attach @fd as the file's private data and initialise its
 * readahead state, open mode, and cl_context bookkeeping. */
449 static int ll_local_open(struct file *file, struct lookup_intent *it,
450 struct ll_file_data *fd, struct obd_client_handle *och)
452 struct inode *inode = file->f_path.dentry->d_inode;
455 LASSERT(!LUSTRE_FPRIVATE(file));
462 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
467 LUSTRE_FPRIVATE(file) = fd;
468 ll_readahead_init(inode, &fd->fd_ras);
469 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
471 /* ll_cl_context initialize */
472 rwlock_init(&fd->fd_lock);
473 INIT_LIST_HEAD(&fd->fd_lccs);
478 /* Open a file, and (for the very first open) create objects on the OSTs at
479 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
480 * creation or open until ll_lov_setstripe() ioctl is called.
482 * If we already have the stripe MD locally then we don't request it in
483 * md_open(), by passing a lmm_size = 0.
485 * It is up to the application to ensure no other processes open this file
486 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
487 * used. We might be able to avoid races of that sort by getting lli_open_sem
488 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
489 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/* VFS ->open() hook.  Either reuses an intent prepared by lookup
 * (passed via file->private_data) or builds a fresh IT_OPEN intent from
 * f_flags, then shares a per-mode MDS open handle across local opens of
 * the same inode (guarded by lli_och_mutex and per-mode usecounts). */
491 int ll_file_open(struct inode *inode, struct file *file)
493 struct ll_inode_info *lli = ll_i2info(inode);
494 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
495 .it_flags = file->f_flags };
496 struct obd_client_handle **och_p = NULL;
497 __u64 *och_usecount = NULL;
498 struct ll_file_data *fd;
502 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
503 PFID(ll_inode2fid(inode)), inode, file->f_flags);
505 it = file->private_data; /* XXX: compat macro */
506 file->private_data = NULL; /* prevent ll_local_open assertion */
508 fd = ll_file_data_get();
510 GOTO(out_openerr, rc = -ENOMEM);
513 if (S_ISDIR(inode->i_mode))
514 ll_authorize_statahead(inode, fd);
/* Opening the fs root needs no MDS open handle. */
516 if (inode->i_sb->s_root == file->f_path.dentry) {
517 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup: synthesise one from f_flags. */
521 if (!it || !it->it_disposition) {
522 /* Convert f_flags into access mode. We cannot use file->f_mode,
523 * because everything but O_ACCMODE mask was stripped from
/* O_ACCMODE trick: O_RDONLY=0/O_WRONLY=1/O_RDWR=2, +1 maps these
 * onto FMODE_READ/FMODE_WRITE bits. */
525 if ((oit.it_flags + 1) & O_ACCMODE)
527 if (file->f_flags & O_TRUNC)
528 oit.it_flags |= FMODE_WRITE;
530 /* kernel only call f_op->open in dentry_open. filp_open calls
531 * dentry_open after call to open_namei that checks permissions.
532 * Only nfsd_open call dentry_open directly without checking
533 * permissions and because of that this code below is safe. */
534 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
535 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
537 /* We do not want O_EXCL here, presumably we opened the file
538 * already? XXX - NFS implications? */
539 oit.it_flags &= ~O_EXCL;
541 /* bug20584, if "it_flags" contains O_CREAT, the file will be
542 * created if necessary, then "IT_CREAT" should be set to keep
543 * consistent with it */
544 if (oit.it_flags & O_CREAT)
545 oit.it_op |= IT_CREAT;
551 /* Let's see if we have file open on MDS already. */
552 if (it->it_flags & FMODE_WRITE) {
553 och_p = &lli->lli_mds_write_och;
554 och_usecount = &lli->lli_open_fd_write_count;
555 } else if (it->it_flags & FMODE_EXEC) {
556 och_p = &lli->lli_mds_exec_och;
557 och_usecount = &lli->lli_open_fd_exec_count;
559 och_p = &lli->lli_mds_read_och;
560 och_usecount = &lli->lli_open_fd_read_count;
563 mutex_lock(&lli->lli_och_mutex);
564 if (*och_p) { /* Open handle is present */
565 if (it_disposition(it, DISP_OPEN_OPEN)) {
566 /* Well, there's extra open request that we do not need,
567 let's close it somehow. This will decref request. */
568 rc = it_open_error(DISP_OPEN_OPEN, it);
570 mutex_unlock(&lli->lli_och_mutex);
571 GOTO(out_openerr, rc);
574 ll_release_openhandle(file->f_path.dentry, it);
/* Reuse the existing handle; NULL och means "don't fill a new one". */
578 rc = ll_local_open(file, it, fd, NULL);
581 mutex_unlock(&lli->lli_och_mutex);
582 GOTO(out_openerr, rc);
585 LASSERT(*och_usecount == 0);
586 if (!it->it_disposition) {
587 /* We cannot just request lock handle now, new ELC code
588 means that one of other OPEN locks for this file
589 could be cancelled, and since blocking ast handler
590 would attempt to grab och_mutex as well, that would
591 result in a deadlock */
592 mutex_unlock(&lli->lli_och_mutex);
594 * Normally called under two situations:
596 * 2. A race/condition on MDS resulting in no open
597 * handle to be returned from LOOKUP|OPEN request,
598 * for example if the target entry was a symlink.
600 * Always fetch MDS_OPEN_LOCK if this is not setstripe.
602 * Always specify MDS_OPEN_BY_FID because we don't want
603 * to get file with different fid.
605 it->it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID;
606 rc = ll_intent_file_open(file, NULL, 0, it);
608 GOTO(out_openerr, rc);
612 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
614 GOTO(out_och_free, rc = -ENOMEM);
618 /* md_intent_lock() didn't get a request ref if there was an
619 * open error, so don't do cleanup on the request here
621 /* XXX (green): Should not we bail out on any error here, not
622 * just open error? */
623 rc = it_open_error(DISP_OPEN_OPEN, it);
625 GOTO(out_och_free, rc);
627 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
628 "inode %p: disposition %x, status %d\n", inode,
629 it_disposition(it, ~0), it->it_status);
631 rc = ll_local_open(file, it, fd, *och_p);
633 GOTO(out_och_free, rc);
635 mutex_unlock(&lli->lli_och_mutex);
638 /* Must do this outside lli_och_mutex lock to prevent deadlock where
639 different kind of OPEN lock for this same inode gets cancelled
640 by ldlm_cancel_lru */
641 if (!S_ISREG(inode->i_mode))
642 GOTO(out_och_free, rc);
644 cl_lov_delay_create_clear(&file->f_flags);
645 GOTO(out_och_free, rc);
/* Error unwinding: free the unused och slot, undo statahead grant,
 * drop the fd, and release the intent's extra request reference. */
649 if (och_p && *och_p) {
650 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
651 *och_p = NULL; /* OBD_FREE writes some magic there */
654 mutex_unlock(&lli->lli_och_mutex);
657 if (lli->lli_opendir_key == fd)
658 ll_deauthorize_statahead(inode, fd);
660 ll_file_data_put(fd);
662 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
665 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
666 ptlrpc_req_finished(it->it_request);
667 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* Blocking AST for lease locks: on a blocking callback, cancel the lease
 * lock asynchronously (the lease is thereby "broken"); the CANCELING
 * branch body is not visible in this chunk. */
673 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
674 struct ldlm_lock_desc *desc, void *data, int flag)
677 struct lustre_handle lockh;
681 case LDLM_CB_BLOCKING:
682 ldlm_lock2handle(lock, &lockh);
683 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
685 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
689 case LDLM_CB_CANCELING:
697 * Acquire a lease and open the file.
/* Open @inode with an MDS lease in @fmode (read xor write).  If @file is
 * given, the new open must be compatible with the existing open mode and
 * the caller must be the sole opener; the existing open handle is passed
 * as op_handle so the MDT can pair the lease with it.  Returns the new
 * obd_client_handle or an ERR_PTR; on any post-open failure the handle
 * is closed again before returning. */
699 static struct obd_client_handle *
700 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
703 struct lookup_intent it = { .it_op = IT_OPEN };
704 struct ll_sb_info *sbi = ll_i2sbi(inode);
705 struct md_op_data *op_data;
706 struct ptlrpc_request *req = NULL;
707 struct lustre_handle old_handle = { 0 };
708 struct obd_client_handle *och = NULL;
713 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
714 RETURN(ERR_PTR(-EINVAL));
717 struct ll_inode_info *lli = ll_i2info(inode);
718 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
719 struct obd_client_handle **och_p;
/* Requested mode must be covered by the file's open mode; exec opens
 * cannot take a lease. */
722 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
723 RETURN(ERR_PTR(-EPERM));
725 /* Get the openhandle of the file */
727 mutex_lock(&lli->lli_och_mutex);
/* Only one lease per file descriptor. */
728 if (fd->fd_lease_och != NULL) {
729 mutex_unlock(&lli->lli_och_mutex);
733 if (fd->fd_och == NULL) {
734 if (file->f_mode & FMODE_WRITE) {
735 LASSERT(lli->lli_mds_write_och != NULL);
736 och_p = &lli->lli_mds_write_och;
737 och_usecount = &lli->lli_open_fd_write_count;
739 LASSERT(lli->lli_mds_read_och != NULL);
740 och_p = &lli->lli_mds_read_och;
741 och_usecount = &lli->lli_open_fd_read_count;
743 if (*och_usecount == 1) {
750 mutex_unlock(&lli->lli_och_mutex);
751 if (rc < 0) /* more than 1 opener */
754 LASSERT(fd->fd_och != NULL);
755 old_handle = fd->fd_och->och_fh;
760 RETURN(ERR_PTR(-ENOMEM));
762 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
763 LUSTRE_OPC_ANY, NULL);
765 GOTO(out, rc = PTR_ERR(op_data));
767 /* To tell the MDT this openhandle is from the same owner */
768 op_data->op_handle = old_handle;
770 it.it_flags = fmode | open_flags;
771 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
772 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
773 &ll_md_blocking_lease_ast,
774 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
775 * it can be cancelled which may mislead applications that the lease is
777 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
778 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
779 * doesn't deal with openhandle, so normal openhandle will be leaked. */
780 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
781 ll_finish_md_op_data(op_data);
782 ptlrpc_req_finished(req);
784 GOTO(out_release_it, rc);
786 if (it_disposition(&it, DISP_LOOKUP_NEG))
787 GOTO(out_release_it, rc = -ENOENT);
789 rc = it_open_error(DISP_OPEN_OPEN, &it);
791 GOTO(out_release_it, rc);
793 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
794 ll_och_fill(sbi->ll_md_exp, &it, och);
/* A server without lease support opens normally; reject it. */
796 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
797 GOTO(out_close, rc = -EOPNOTSUPP);
799 /* already get lease, handle lease lock */
800 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
801 if (it.it_lock_mode == 0 ||
802 it.it_lock_bits != MDS_INODELOCK_OPEN) {
803 /* open lock must return for lease */
804 CERROR(DFID "lease granted but no open lock, %d/"LPU64".\n",
805 PFID(ll_inode2fid(inode)), it.it_lock_mode,
807 GOTO(out_close, rc = -EPROTO);
810 ll_intent_release(&it);
/* Failure paths: cancel the lease lock if granted, close the handle. */
814 /* Cancel open lock */
815 if (it.it_lock_mode != 0) {
816 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
819 och->och_lease_handle.cookie = 0ULL;
821 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
823 CERROR("%s: error closing file "DFID": %d\n",
824 ll_get_fsname(inode->i_sb, NULL, 0),
825 PFID(&ll_i2info(inode)->lli_fid), rc2);
826 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
828 ll_intent_release(&it);
836 * Check whether a layout swap can be done between two inodes.
838 * \param[in] inode1 First inode to check
839 * \param[in] inode2 Second inode to check
841 * \retval 0 on success, layout swap can be performed between both inodes
842 * \retval negative error code if requirements are not met
/* Both must be regular files, writable by the caller, and on the same
 * superblock (same Lustre filesystem). */
844 static int ll_check_swap_layouts_validity(struct inode *inode1,
845 struct inode *inode2)
847 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
850 if (inode_permission(inode1, MAY_WRITE) ||
851 inode_permission(inode2, MAY_WRITE))
854 if (inode1->i_sb != inode2->i_sb)
/* Close @och on @inode with the MDS_CLOSE_LAYOUT_SWAP bias so that the
 * MDT atomically swaps the layouts of @inode and @inode2 at close time.
 * Validates the pair and rejects a self-swap (identical FIDs).
 * Ownership of @och transfers to ll_close_inode_openhandle(). */
860 static int ll_swap_layouts_close(struct obd_client_handle *och,
861 struct inode *inode, struct inode *inode2)
863 const struct lu_fid *fid1 = ll_inode2fid(inode);
864 const struct lu_fid *fid2;
868 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
869 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
871 rc = ll_check_swap_layouts_validity(inode, inode2);
873 GOTO(out_free_och, rc);
875 /* We now know that inode2 is a lustre inode */
876 fid2 = ll_inode2fid(inode2);
/* Swapping a file's layout with itself makes no sense. */
878 rc = lu_fid_cmp(fid1, fid2);
880 GOTO(out_free_och, rc = -EINVAL);
882 /* Close the file and swap layouts between inode & inode2.
883 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
884 * because we still need it to pack l_remote_handle to MDT. */
885 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
888 och = NULL; /* freed in ll_close_inode_openhandle() */
898 * Release lease and close the file.
899 * It will check if the lease has ever broken.
/* Determine from the DLM lock whether the lease was already cancelled
 * (broken); report that via @lease_broken, cancel the lease lock, and
 * close the open handle. */
901 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
904 struct ldlm_lock *lock;
905 bool cancelled = true;
909 lock = ldlm_handle2lock(&och->och_lease_handle);
911 lock_res_and_lock(lock);
912 cancelled = ldlm_is_cancel(lock);
913 unlock_res_and_lock(lock);
917 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
918 PFID(&ll_i2info(inode)->lli_fid), cancelled);
921 ldlm_cli_cancel(&och->och_lease_handle, 0);
922 if (lease_broken != NULL)
923 *lease_broken = cancelled;
925 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Merge MDS-known attributes cached in ll_inode_info with the OST-side
 * attributes from the cl_object: each timestamp takes the newer value,
 * and size/blocks come from the OSTs.  Runs under the inode size lock. */
929 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
931 struct ll_inode_info *lli = ll_i2info(inode);
932 struct cl_object *obj = lli->lli_clob;
933 struct cl_attr *attr = vvp_env_thread_attr(env);
941 ll_inode_size_lock(inode);
943 /* Merge timestamps the most recently obtained from MDS with
944 * timestamps obtained from OSTs.
946 * Do not overwrite atime of inode because it may be refreshed
947 * by file_accessed() function. If the read was served by cache
948 * data, there is no RPC to be sent so that atime may not be
949 * transferred to OSTs at all. MDT only updates atime at close time
950 * if it's at least 'mdd.*.atime_diff' older.
951 * All in all, the atime in Lustre does not strictly comply with
952 * POSIX. Solving this problem needs to send an RPC to MDT for each
953 * read, this will hurt performance. */
954 if (LTIME_S(inode->i_atime) < lli->lli_atime)
955 LTIME_S(inode->i_atime) = lli->lli_atime;
956 LTIME_S(inode->i_mtime) = lli->lli_mtime;
957 LTIME_S(inode->i_ctime) = lli->lli_ctime;
959 atime = LTIME_S(inode->i_atime);
960 mtime = LTIME_S(inode->i_mtime);
961 ctime = LTIME_S(inode->i_ctime);
/* Fetch the aggregated OST attributes under the object attr lock. */
963 cl_object_attr_lock(obj);
964 rc = cl_object_attr_get(env, obj, attr);
965 cl_object_attr_unlock(obj);
968 GOTO(out_size_unlock, rc);
/* Keep whichever side has the newer timestamps. */
970 if (atime < attr->cat_atime)
971 atime = attr->cat_atime;
973 if (ctime < attr->cat_ctime)
974 ctime = attr->cat_ctime;
976 if (mtime < attr->cat_mtime)
977 mtime = attr->cat_mtime;
979 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
980 PFID(&lli->lli_fid), attr->cat_size);
982 i_size_write(inode, attr->cat_size);
983 inode->i_blocks = attr->cat_blocks;
985 LTIME_S(inode->i_atime) = atime;
986 LTIME_S(inode->i_mtime) = mtime;
987 LTIME_S(inode->i_ctime) = ctime;
990 ll_inode_size_unlock(inode);
/* Decide whether atime updates should be suppressed for @file, checking
 * the same flag sources as the kernel's file_accessed()/touch_atime():
 * O_NOATIME on the fd, S_NOATIME on the inode, and the mount / superblock
 * noatime and nodiratime options. */
995 static bool file_is_noatime(const struct file *file)
997 const struct vfsmount *mnt = file->f_path.mnt;
998 const struct inode *inode = file->f_path.dentry->d_inode;
1000 /* Adapted from file_accessed() and touch_atime().*/
1001 if (file->f_flags & O_NOATIME)
1004 if (inode->i_flags & S_NOATIME)
1007 if (IS_NOATIME(inode))
1010 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1013 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1016 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/* Initialise a cl_io for @file from its open flags: nonblocking/append/
 * sync behaviour, the target cl_object, the DLM locking policy (never
 * for nolock files, mandatory for O_APPEND, "maybe" otherwise), and
 * whether atime updates are suppressed. */
1022 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
1024 struct inode *inode = file->f_path.dentry->d_inode;
1026 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1028 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1029 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1030 file->f_flags & O_DIRECT ||
1033 io->ci_obj = ll_i2info(inode)->lli_clob;
1034 io->ci_lockreq = CILR_MAYBE;
1035 if (ll_file_nolock(file)) {
1036 io->ci_lockreq = CILR_NEVER;
1037 io->ci_no_srvlock = 1;
1038 } else if (file->f_flags & O_APPEND) {
1039 io->ci_lockreq = CILR_MANDATORY;
1042 io->ci_noatime = file_is_noatime(file);
/* Common read/write engine: set up a cl_io for @iot at @ppos/@count,
 * take the per-inode range lock when required (writes, and O_DIRECT
 * reads — see LU-6227), run the CLIO loop, and restart the IO when the
 * layout changed mid-flight (ci_need_restart).  Updates *ppos, tallies
 * read/write byte stats, and tracks fd_write_failed for fsync error
 * reporting.  Returns bytes transferred, or a negative errno. */
1046 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1047 struct file *file, enum cl_io_type iot,
1048 loff_t *ppos, size_t count)
1050 struct vvp_io *vio = vvp_env_io(env);
1051 struct inode *inode = file->f_path.dentry->d_inode;
1052 struct ll_inode_info *lli = ll_i2info(inode);
1053 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1057 struct range_lock range;
1061 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zu\n",
1062 file->f_path.dentry->d_name.name, iot, *ppos, count);
1065 io = vvp_env_thread_io(env);
1066 ll_io_init(io, file, iot == CIT_WRITE);
1068 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1069 bool range_locked = false;
/* O_APPEND writes may land anywhere, so lock the whole file. */
1071 if (file->f_flags & O_APPEND)
1072 range_lock_init(&range, 0, LUSTRE_EOF);
1074 range_lock_init(&range, *ppos, *ppos + count - 1);
1076 vio->vui_fd = LUSTRE_FPRIVATE(file);
1077 vio->vui_io_subtype = args->via_io_subtype;
1079 switch (vio->vui_io_subtype) {
1081 vio->vui_iter = args->u.normal.via_iter;
1082 vio->vui_iocb = args->u.normal.via_iocb;
1083 /* Direct IO reads must also take range lock,
1084 * or multiple reads will try to work on the same pages
1085 * See LU-6227 for details. */
1086 if (((iot == CIT_WRITE) ||
1087 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1088 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1089 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1091 rc = range_lock(&lli->lli_write_tree, &range);
1095 range_locked = true;
1099 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1100 vio->u.splice.vui_flags = args->u.splice.via_flags;
1103 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
/* Register the cl context so page operations can find this IO. */
1107 ll_cl_add(file, env, io, LCC_RW);
1108 rc = cl_io_loop(env, io);
1109 ll_cl_remove(file, env);
1112 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1114 range_unlock(&lli->lli_write_tree, &range);
1117 /* cl_io_rw_init() handled IO */
1121 if (io->ci_nob > 0) {
1122 result += io->ci_nob;
1123 count -= io->ci_nob;
1124 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1126 /* prepare IO restart */
1127 if (count > 0 && args->via_io_subtype == IO_NORMAL)
1128 args->u.normal.via_iter = vio->vui_iter;
1132 cl_io_fini(env, io);
/* Layout change (or short -ENODATA) with data remaining: retry the IO. */
1134 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1136 "%s: restart %s from %lld, count:%zu, result: %zd\n",
1137 file->f_path.dentry->d_name.name,
1138 iot == CIT_READ ? "read" : "write",
1139 *ppos, count, result);
1143 if (iot == CIT_READ) {
1145 ll_stats_ops_tally(ll_i2sbi(inode),
1146 LPROC_LL_READ_BYTES, result);
1147 } else if (iot == CIT_WRITE) {
1149 ll_stats_ops_tally(ll_i2sbi(inode),
1150 LPROC_LL_WRITE_BYTES, result);
1151 fd->fd_write_failed = false;
1152 } else if (result == 0 && rc == 0) {
1155 fd->fd_write_failed = true;
1157 fd->fd_write_failed = false;
1158 } else if (rc != -ERESTARTSYS) {
1159 fd->fd_write_failed = true;
1163 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1165 return result > 0 ? result : rc;
1169 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1170 * especially for small I/O.
1172 * To serve a read request, CLIO has to create and initialize a cl_io and
1173 * then request DLM lock. This has turned out to have siginificant overhead
1174 * and affects the performance of small I/O dramatically.
1176 * It's not necessary to create a cl_io for each I/O. Under the help of read
1177 * ahead, most of the pages being read are already in memory cache and we can
1178 * read those pages directly because if the pages exist, the corresponding DLM
1179 * lock must exist so that page content must be valid.
1181 * In fast read implementation, the llite speculatively finds and reads pages
1182 * in memory cache. There are three scenarios for fast read:
1183 * - If the page exists and is uptodate, kernel VM will provide the data and
1184 * CLIO won't be intervened;
1185 * - If the page was brought into memory by read ahead, it will be exported
1186 * and read ahead parameters will be updated;
1187 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1188 * it will go back and invoke normal read, i.e., a cl_io will be created
1189 * and DLM lock will be requested.
1191 * POSIX compliance: posix standard states that read is intended to be atomic.
1192 * Lustre read implementation is in line with Linux kernel read implementation
1193 * and neither of them complies with POSIX standard in this matter. Fast read
1194 * doesn't make the situation worse on single node but it may interleave write
1195 * results from multiple nodes due to short read handling in ll_file_aio_read().
1197 * \param env - lu_env
1198 * \param iocb - kiocb from kernel
1199 * \param iter - user space buffers where the data will be copied
1201 * \retval - number of bytes have been read, or error code if error occurred.
1204 ll_do_fast_read(const struct lu_env *env, struct kiocb *iocb,
1205 struct iov_iter *iter)
/* Fast read is skipped entirely when disabled on the superblock. */
1209 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1212 /* NB: we can't do direct IO for fast read because it will need a lock
1213 * to make IO engine happy. */
1214 if (iocb->ki_filp->f_flags & O_DIRECT)
/* Register a NULL cl_io context so ll_readpage() can detect fast read. */
1217 ll_cl_add(iocb->ki_filp, env, NULL, LCC_RW);
1218 result = generic_file_read_iter(iocb, iter);
1219 ll_cl_remove(iocb->ki_filp, env);
1221 /* If the first page is not in cache, generic_file_aio_read() will be
1222 * returned with -ENODATA.
1223 * See corresponding code in ll_readpage(). */
1224 if (result == -ENODATA)
1228 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1229 LPROC_LL_READ_BYTES, result);
1235 * Read from a file (through the page cache).
/*
 * Tries the lockless fast-read path first; if data remains (short fast read
 * or fast read unavailable) falls back to the normal cl_io read via
 * ll_file_io_generic().
 * NOTE(review): sampled extraction — intermediate lines are missing;
 * code kept verbatim, comments only added.
 */
1237 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1240 struct vvp_io_args *args;
1245 env = cl_env_get(&refcheck);
1247 return PTR_ERR(env);
/* Fast read first; a negative result or fully-drained iter ends here. */
1249 result = ll_do_fast_read(env, iocb, to);
1250 if (result < 0 || iov_iter_count(to) == 0)
/* Slow path: package iter/iocb into vvp args and run a CIT_READ io. */
1253 args = ll_env_args(env, IO_NORMAL);
1254 args->u.normal.via_iter = to;
1255 args->u.normal.via_iocb = iocb;
1257 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1258 &iocb->ki_pos, iov_iter_count(to));
1261 else if (result == 0)
1265 cl_env_put(env, &refcheck);
1270 * Write to a file (through the page cache).
/*
 * Packages the iov_iter/kiocb into vvp args and runs a CIT_WRITE cl_io.
 * NOTE(review): sampled extraction — intermediate lines are missing;
 * code kept verbatim, comments only added.
 */
1272 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1274 struct vvp_io_args *args;
1279 env = cl_env_get(&refcheck);
1281 return PTR_ERR(env);
1283 args = ll_env_args(env, IO_NORMAL);
1284 args->u.normal.via_iter = from;
1285 args->u.normal.via_iocb = iocb;
1287 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1288 &iocb->ki_pos, iov_iter_count(from));
1289 cl_env_put(env, &refcheck);
1293 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1295 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validates an iovec array and computes the accessible byte count,
 * shrinking *nr_segs to the last fully-accessible segment.
 * NOTE(review): sampled extraction — intermediate lines are missing;
 * code kept verbatim, comments only added.
 */
1297 static int ll_file_get_iov_count(const struct iovec *iov,
1298 unsigned long *nr_segs, size_t *count)
1303 for (seg = 0; seg < *nr_segs; seg++) {
1304 const struct iovec *iv = &iov[seg];
1307 * If any segment has a negative length, or the cumulative
1308 * length ever wraps negative then return -EINVAL.
1311 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1313 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1318 cnt -= iv->iov_len; /* This segment is no good */
/*
 * Legacy aio_read entry (kernels without read_iter): builds an iov_iter
 * from the iovec array and forwards to ll_file_read_iter().
 * NOTE(review): sampled extraction — intermediate lines are missing;
 * code kept verbatim, comments only added.
 */
1325 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1326 unsigned long nr_segs, loff_t pos)
1333 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
/* iov_iter_init() signature differs across kernel versions. */
1337 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1338 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1339 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1340 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1341 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1343 result = ll_file_read_iter(iocb, &to);
/*
 * Legacy synchronous read(): wraps the user buffer in a single iovec and
 * a heap-allocated sync kiocb, then calls ll_file_aio_read().
 * NOTE(review): sampled extraction — intermediate lines are missing;
 * code kept verbatim, comments only added.
 */
1348 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1351 struct iovec iov = { .iov_base = buf, .iov_len = count };
1352 struct kiocb *kiocb;
1356 OBD_ALLOC_PTR(kiocb);
1360 init_sync_kiocb(kiocb, file);
1361 kiocb->ki_pos = *ppos;
/* Field carrying the byte count is version-dependent. */
1362 #ifdef HAVE_KIOCB_KI_LEFT
1363 kiocb->ki_left = count;
1364 #elif defined(HAVE_KI_NBYTES)
1365 kiocb->ki_nbytes = count;
1368 result = ll_file_aio_read(kiocb, &iov, 1, kiocb->ki_pos);
/* Propagate the advanced position back to the caller's *ppos. */
1369 *ppos = kiocb->ki_pos;
1371 OBD_FREE_PTR(kiocb);
1376 * Write to a file (through the page cache).
/*
 * Legacy aio_write entry (kernels without write_iter): builds an iov_iter
 * and forwards to ll_file_write_iter().
 * NOTE(review): sampled extraction — intermediate lines are missing;
 * code kept verbatim, comments only added.
 */
1379 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1380 unsigned long nr_segs, loff_t pos)
1382 struct iov_iter from;
1387 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
/* iov_iter_init() signature differs across kernel versions. */
1391 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1392 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1393 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1394 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1395 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1397 result = ll_file_write_iter(iocb, &from);
/*
 * Legacy synchronous write(): wraps the user buffer in a single iovec and
 * reuses the per-env kiocb (lti_kiocb), then calls ll_file_aio_write().
 * NOTE(review): sampled extraction — intermediate lines are missing;
 * code kept verbatim, comments only added.
 */
1402 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1403 size_t count, loff_t *ppos)
1406 struct iovec iov = { .iov_base = (void __user *)buf,
1408 struct kiocb *kiocb;
1413 env = cl_env_get(&refcheck);
1415 RETURN(PTR_ERR(env));
/* Unlike ll_file_read(), the kiocb lives in the env, not on the heap. */
1417 kiocb = &ll_env_info(env)->lti_kiocb;
1418 init_sync_kiocb(kiocb, file);
1419 kiocb->ki_pos = *ppos;
1420 #ifdef HAVE_KIOCB_KI_LEFT
1421 kiocb->ki_left = count;
1422 #elif defined(HAVE_KI_NBYTES)
1423 kiocb->ki_nbytes = count;
1426 result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos);
1427 *ppos = kiocb->ki_pos;
1429 cl_env_put(env, &refcheck);
1432 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1435 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read entry: runs a CIT_READ cl_io with IO_SPLICE args so pages go
 * straight into the pipe.
 * NOTE(review): sampled extraction — intermediate lines are missing;
 * code kept verbatim, comments only added.
 */
1437 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1438 struct pipe_inode_info *pipe, size_t count,
1442 struct vvp_io_args *args;
1447 env = cl_env_get(&refcheck);
1449 RETURN(PTR_ERR(env));
1451 args = ll_env_args(env, IO_SPLICE);
1452 args->u.splice.via_pipe = pipe;
1453 args->u.splice.via_flags = flags;
1455 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1456 cl_env_put(env, &refcheck);
/*
 * Applies striping EA (lov_user_md) to a file by re-opening it with an
 * MDS_OPEN_BY_FID intent carrying the layout, under the inode size lock.
 * NOTE(review): sampled extraction — intermediate lines are missing;
 * code kept verbatim, comments only added.
 */
1460 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1461 __u64 flags, struct lov_user_md *lum,
1464 struct lookup_intent oit = {
1466 .it_flags = flags | MDS_OPEN_BY_FID,
1471 ll_inode_size_lock(inode);
1472 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1474 GOTO(out_unlock, rc);
/* Close the MDS open handle created by the intent; layout already set. */
1476 ll_release_openhandle(file->f_path.dentry, &oit);
1479 ll_inode_size_unlock(inode);
1480 ll_intent_release(&oit);
/* Clear the delay-create flag so later opens instantiate normally. */
1481 cl_lov_delay_create_clear(&file->f_flags);
/*
 * Fetches the LOV EA (striping info) for @filename from the MDS via
 * md_getattr_name(), converting it to host endianness for userspace.
 * On success *lmmp points into @request's reply buffer (caller must keep
 * the request until done with *lmmp).
 * NOTE(review): sampled extraction — intermediate lines are missing;
 * code kept verbatim, comments only added.
 */
1486 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1487 struct lov_mds_md **lmmp, int *lmm_size,
1488 struct ptlrpc_request **request)
1490 struct ll_sb_info *sbi = ll_i2sbi(inode);
1491 struct mdt_body *body;
1492 struct lov_mds_md *lmm = NULL;
1493 struct ptlrpc_request *req = NULL;
1494 struct md_op_data *op_data;
/* Size the getattr reply buffer for the largest possible EA. */
1497 rc = ll_get_default_mdsize(sbi, &lmmsize);
1501 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1502 strlen(filename), lmmsize,
1503 LUSTRE_OPC_ANY, NULL);
1504 if (IS_ERR(op_data))
1505 RETURN(PTR_ERR(op_data));
1507 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1508 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1509 ll_finish_md_op_data(op_data);
1511 CDEBUG(D_INFO, "md_getattr_name failed "
1512 "on %s: rc %d\n", filename, rc);
1516 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1517 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1519 lmmsize = body->mbo_eadatasize;
/* No EA present (or empty) means the file/dir has no striping info. */
1521 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1523 GOTO(out, rc = -ENODATA);
1526 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1527 LASSERT(lmm != NULL);
/* Only plain V1/V3 layouts are expected here. */
1529 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1530 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1531 GOTO(out, rc = -EPROTO);
1535 * This is coming from the MDS, so is probably in
1536 * little endian. We convert it to host endian before
1537 * passing it to userspace.
/* Swab only needed on big-endian hosts (LOV_MAGIC differs from LE form). */
1539 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1542 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1543 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1546 /* if function called for directory - we should
1547 * avoid swabbing non-existent lsm objects */
1548 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1549 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1550 if (S_ISREG(body->mbo_mode))
1551 lustre_swab_lov_user_md_objects(
1552 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1554 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1555 lustre_swab_lov_user_md_v3(
1556 (struct lov_user_md_v3 *)lmm);
1557 if (S_ISREG(body->mbo_mode))
1558 lustre_swab_lov_user_md_objects(
1559 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1566 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: copies a lov_user_md (with one OST object
 * entry) from userspace and applies it; requires CAP_SYS_ADMIN since it
 * sets explicit pre-created objects (MDS_OPEN_HAS_OBJS).
 * NOTE(review): sampled extraction — intermediate lines are missing;
 * code kept verbatim, comments only added.
 */
1571 static int ll_lov_setea(struct inode *inode, struct file *file,
1574 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1575 struct lov_user_md *lump;
1576 int lum_size = sizeof(struct lov_user_md) +
1577 sizeof(struct lov_user_ost_data);
1581 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1584 OBD_ALLOC_LARGE(lump, lum_size);
1588 if (copy_from_user(lump, (struct lov_user_md __user *)arg, lum_size))
1589 GOTO(out_lump, rc = -EFAULT);
1591 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1594 OBD_FREE_LARGE(lump, lum_size);
/*
 * Copies the file's striping information to the userspace @lum buffer via
 * cl_object_getstripe().
 * NOTE(review): sampled extraction — intermediate lines are missing;
 * code kept verbatim, comments only added.
 */
1598 static int ll_file_getstripe(struct inode *inode,
1599 struct lov_user_md __user *lum)
1606 env = cl_env_get(&refcheck);
1608 RETURN(PTR_ERR(env));
1610 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum);
1611 cl_env_put(env, &refcheck);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copies the user layout, applies it, then
 * refreshes the layout generation and echoes the resulting stripe info
 * back to userspace.
 * NOTE(review): sampled extraction — intermediate lines are missing;
 * code kept verbatim, comments only added.
 */
1615 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1618 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1619 struct lov_user_md *klum;
1621 __u64 flags = FMODE_WRITE;
1624 rc = ll_copy_user_md(lum, &klum);
1629 rc = ll_lov_setstripe_ea_info(inode, file, flags, klum, lum_size);
/* Zero the user's stripe_count first so a failed getstripe is visible. */
1633 put_user(0, &lum->lmm_stripe_count);
1635 ll_layout_refresh(inode, &gen);
1636 rc = ll_file_getstripe(inode, (struct lov_user_md __user *)arg);
1639 OBD_FREE(klum, lum_size);
/*
 * Takes a group lock (gid = @arg) on the file. Only one group lock per fd;
 * the lli_lock spinlock guards fd_flags/fd_grouplock against racing
 * threads, with a re-check after the (sleeping) cl_get_grouplock() call.
 * NOTE(review): sampled extraction — intermediate lines are missing;
 * code kept verbatim, comments only added.
 */
1644 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1646 struct ll_inode_info *lli = ll_i2info(inode);
1647 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1648 struct ll_grouplock grouplock;
1653 CWARN("group id for group lock must not be 0\n");
1657 if (ll_file_nolock(file))
1658 RETURN(-EOPNOTSUPP);
1660 spin_lock(&lli->lli_lock);
1661 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1662 CWARN("group lock already existed with gid %lu\n",
1663 fd->fd_grouplock.lg_gid);
1664 spin_unlock(&lli->lli_lock);
1667 LASSERT(fd->fd_grouplock.lg_lock == NULL);
1668 spin_unlock(&lli->lli_lock);
/* May block; must not hold lli_lock across it. */
1670 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
1671 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* Re-check under the lock: another thread may have won meanwhile. */
1675 spin_lock(&lli->lli_lock);
1676 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1677 spin_unlock(&lli->lli_lock);
1678 CERROR("another thread just won the race\n");
1679 cl_put_grouplock(&grouplock);
1683 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1684 fd->fd_grouplock = grouplock;
1685 spin_unlock(&lli->lli_lock);
1687 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * Releases the group lock with gid @arg held on this fd. The grouplock
 * struct is copied out and cleared under lli_lock, then released after
 * dropping the spinlock (cl_put_grouplock() may sleep).
 * NOTE(review): sampled extraction — intermediate lines are missing;
 * code kept verbatim, comments only added.
 */
1691 static int ll_put_grouplock(struct inode *inode, struct file *file,
1694 struct ll_inode_info *lli = ll_i2info(inode);
1695 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1696 struct ll_grouplock grouplock;
1699 spin_lock(&lli->lli_lock);
1700 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1701 spin_unlock(&lli->lli_lock);
1702 CWARN("no group lock held\n");
1706 LASSERT(fd->fd_grouplock.lg_lock != NULL);
/* Caller must release with the same gid it locked with. */
1708 if (fd->fd_grouplock.lg_gid != arg) {
1709 CWARN("group lock %lu doesn't match current id %lu\n",
1710 arg, fd->fd_grouplock.lg_gid);
1711 spin_unlock(&lli->lli_lock);
1715 grouplock = fd->fd_grouplock;
1716 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1717 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1718 spin_unlock(&lli->lli_lock);
1720 cl_put_grouplock(&grouplock);
1721 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1726 * Close inode open handle
1728 * \param dentry [in] dentry which contains the inode
1729 * \param it [in,out] intent which contains open info and result
1732 * \retval <0 failure
/*
 * NOTE(review): sampled extraction — intermediate lines are missing;
 * code kept verbatim, comments only added.
 */
1734 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1736 struct inode *inode = dentry->d_inode;
1737 struct obd_client_handle *och;
1743 /* Root ? Do nothing. */
1744 if (dentry->d_inode->i_sb->s_root == dentry)
1747 /* No open handle to close? Move away */
1748 if (!it_disposition(it, DISP_OPEN_OPEN))
1751 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1753 OBD_ALLOC(och, sizeof(*och));
1755 GOTO(out, rc = -ENOMEM);
/* Fill och from the intent's reply, then close the MDS handle. */
1757 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1759 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
1761 /* this one is in place of ll_file_open */
1762 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1763 ptlrpc_req_finished(it->it_request);
1764 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1770 * Get size for inode for which FIEMAP mapping is requested.
1771 * Make the FIEMAP get_info call and returns the result.
1772 * \param fiemap kernel buffer to hold extents
1773 * \param num_bytes kernel buffer size
/*
 * FIEMAP implementation: validates flags, optionally flushes dirty data
 * (FIEMAP_FLAG_SYNC), glimpses the size if needed and forwards the request
 * to the object layer via cl_object_fiemap().
 * NOTE(review): sampled extraction — intermediate lines are missing;
 * code kept verbatim, comments only added.
 */
1775 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
1781 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
1784 /* Checks for fiemap flags */
1785 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* Report back which flags we do not support, per FIEMAP convention. */
1786 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1790 /* Check for FIEMAP_FLAG_SYNC */
1791 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1792 rc = filemap_fdatawrite(inode->i_mapping);
1797 env = cl_env_get(&refcheck);
1799 RETURN(PTR_ERR(env));
/* A zero cached size may just be stale; glimpse from the OSTs. */
1801 if (i_size_read(inode) == 0) {
1802 rc = ll_glimpse_size(inode);
1807 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1808 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
1809 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
1811 /* If filesize is 0, then there would be no objects for mapping */
1812 if (fmkey.lfik_oa.o_size == 0) {
1813 fiemap->fm_mapped_extents = 0;
1817 fmkey.lfik_fiemap = *fiemap;
1819 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
1820 &fmkey, fiemap, &num_bytes);
1822 cl_env_put(env, &refcheck);
/*
 * OBD_IOC_FID2PATH handler: resolves a FID to a pathname via the MDC.
 * Reads gf_pathlen first to size the output buffer, appends the client's
 * root FID (for fileset-aware servers), and copies the result back.
 * NOTE(review): sampled extraction — intermediate lines are missing;
 * code kept verbatim, comments only added.
 */
1826 int ll_fid2path(struct inode *inode, void __user *arg)
1828 struct obd_export *exp = ll_i2mdexp(inode);
1829 const struct getinfo_fid2path __user *gfin = arg;
1831 struct getinfo_fid2path *gfout;
/* Restricted to privileged users unless the fs allows user fid2path. */
1837 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1838 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1841 /* Only need to get the buflen */
1842 if (get_user(pathlen, &gfin->gf_pathlen))
1845 if (pathlen > PATH_MAX)
1848 outsize = sizeof(*gfout) + pathlen;
1849 OBD_ALLOC(gfout, outsize);
1853 if (copy_from_user(gfout, arg, sizeof(*gfout)))
1854 GOTO(gf_free, rc = -EFAULT);
1855 /* append root FID after gfout to let MDT know the root FID so that it
1856 * can lookup the correct path, this is mainly for fileset.
1857 * old server without fileset mount support will ignore this. */
1858 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
1860 /* Call mdc_iocontrol */
1861 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1865 if (copy_to_user(arg, gfout, outsize))
1869 OBD_FREE(gfout, outsize);
1874 * Read the data_version for inode.
1876 * This value is computed using stripe object version on OST.
1877 * Version is computed using server side locking.
1879 * @param flags if do sync on the OST side;
1881 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1882 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/*
 * NOTE(review): sampled extraction — intermediate lines are missing;
 * code kept verbatim, comments only added.
 */
1884 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
1886 struct cl_object *obj = ll_i2info(inode)->lli_clob;
1894 /* If no file object initialized, we consider its version is 0. */
1900 env = cl_env_get(&refcheck);
1902 RETURN(PTR_ERR(env));
/* Run a CIT_DATA_VERSION io to collect the version from the OSTs. */
1904 io = vvp_env_thread_io(env);
1906 io->u.ci_data_version.dv_data_version = 0;
1907 io->u.ci_data_version.dv_flags = flags;
1910 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
1911 result = cl_io_loop(env, io);
1913 result = io->ci_result;
1915 *data_version = io->u.ci_data_version.dv_data_version;
1917 cl_io_fini(env, io);
/* Layout change during the io: restart the whole operation. */
1919 if (unlikely(io->ci_need_restart))
1922 cl_env_put(env, &refcheck);
1928 * Trigger a HSM release request for the provided inode.
/*
 * Takes a write lease, flushes and grabs the latest data_version and
 * attributes, then closes the lease handle with MDS_HSM_RELEASE so the
 * MDT can release the file's data.
 * NOTE(review): sampled extraction — intermediate lines are missing;
 * code kept verbatim, comments only added.
 */
1930 int ll_hsm_release(struct inode *inode)
1933 struct obd_client_handle *och = NULL;
1934 __u64 data_version = 0;
1939 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1940 ll_get_fsname(inode->i_sb, NULL, 0),
1941 PFID(&ll_i2info(inode)->lli_fid));
1943 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
1945 GOTO(out, rc = PTR_ERR(och));
1947 /* Grab latest data_version and [am]time values */
1948 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
1952 env = cl_env_get(&refcheck);
1954 GOTO(out, rc = PTR_ERR(env));
1956 ll_merge_attr(env, inode);
1957 cl_env_put(env, &refcheck);
1959 /* Release the file.
1960 * NB: lease lock handle is released in mdc_hsm_release_pack() because
1961 * we still need it to pack l_remote_handle to MDT. */
1962 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
1968 if (och != NULL && !IS_ERR(och)) /* close the file */
1969 ll_lease_close(och, inode, NULL);
/*
 * Working state for ll_swap_layouts(): the two inodes being swapped
 * (plus, per use below, data versions and check flags — members not all
 * visible in this extraction).
 */
1974 struct ll_swap_stack {
1977 struct inode *inode1;
1978 struct inode *inode2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS worker: atomically swaps the layouts of two
 * files on the MDT. Orders the two inodes by FID to avoid deadlock,
 * optionally takes group locks to flush dirty cache, and verifies the
 * caller-supplied data versions before committing.
 * NOTE(review): sampled extraction — intermediate lines are missing;
 * code kept verbatim, comments only added.
 */
1983 static int ll_swap_layouts(struct file *file1, struct file *file2,
1984 struct lustre_swap_layouts *lsl)
1986 struct mdc_swap_layouts msl;
1987 struct md_op_data *op_data;
1990 struct ll_swap_stack *llss = NULL;
1993 OBD_ALLOC_PTR(llss);
1997 llss->inode1 = file1->f_path.dentry->d_inode;
1998 llss->inode2 = file2->f_path.dentry->d_inode;
2000 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2004 /* we use 2 bool because it is easier to swap than 2 bits */
2005 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2006 llss->check_dv1 = true;
2008 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2009 llss->check_dv2 = true;
2011 /* we cannot use lsl->sl_dvX directly because we may swap them */
2012 llss->dv1 = lsl->sl_dv1;
2013 llss->dv2 = lsl->sl_dv2;
2015 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2016 if (rc == 0) /* same file, done! */
/* Canonical FID order prevents ABBA deadlock between two racing swaps. */
2019 if (rc < 0) { /* sequentialize it */
2020 swap(llss->inode1, llss->inode2);
2022 swap(llss->dv1, llss->dv2);
2023 swap(llss->check_dv1, llss->check_dv2);
2027 if (gid != 0) { /* application asks to flush dirty cache */
2028 rc = ll_get_grouplock(llss->inode1, file1, gid);
2032 rc = ll_get_grouplock(llss->inode2, file2, gid);
2034 ll_put_grouplock(llss->inode1, file1, gid);
2039 /* ultimate check, before swapping the layouts we check if
2040 * dataversion has changed (if requested) */
2041 if (llss->check_dv1) {
2042 rc = ll_data_version(llss->inode1, &dv, 0);
2045 if (dv != llss->dv1)
2046 GOTO(putgl, rc = -EAGAIN);
2049 if (llss->check_dv2) {
2050 rc = ll_data_version(llss->inode2, &dv, 0);
2053 if (dv != llss->dv2)
2054 GOTO(putgl, rc = -EAGAIN);
2057 /* struct md_op_data is used to send the swap args to the mdt
2058 * only flags is missing, so we use struct mdc_swap_layouts
2059 * through the md_op_data->op_data */
2060 /* flags from user space have to be converted before they are send to
2061 * server, no flag is sent today, they are only used on the client */
2064 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2065 0, LUSTRE_OPC_ANY, &msl);
2066 if (IS_ERR(op_data))
2067 GOTO(free, rc = PTR_ERR(op_data));
2069 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2070 sizeof(*op_data), op_data, NULL);
2071 ll_finish_md_op_data(op_data);
/* Release group locks in reverse acquisition order. */
2078 ll_put_grouplock(llss->inode2, file2, gid);
2079 ll_put_grouplock(llss->inode1, file1, gid);
/*
 * Sets/clears HSM state flags on the MDT after validating the masks
 * (range check, user-mask restriction for non-root, archive id bound).
 * NOTE(review): sampled extraction — intermediate lines are missing;
 * code kept verbatim, comments only added.
 */
2089 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2091 struct md_op_data *op_data;
2095 /* Detect out-of range masks */
2096 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2099 /* Non-root users are forbidden to set or clear flags which are
2100 * NOT defined in HSM_USER_MASK. */
2101 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2102 !cfs_capable(CFS_CAP_SYS_ADMIN))
2105 /* Detect out-of range archive id */
2106 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2107 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2110 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2111 LUSTRE_OPC_ANY, hss);
2112 if (IS_ERR(op_data))
2113 RETURN(PTR_ERR(op_data));
2115 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2116 sizeof(*op_data), op_data, NULL);
2118 ll_finish_md_op_data(op_data);
/*
 * HSM import: marks a regular file as archived+exists+released and then
 * restores its saved attributes (mode/uid/gid/size/times) so an archived
 * copy appears as a released Lustre file.
 * NOTE(review): sampled extraction — intermediate lines are missing;
 * code kept verbatim, comments only added.
 */
2123 static int ll_hsm_import(struct inode *inode, struct file *file,
2124 struct hsm_user_import *hui)
2126 struct hsm_state_set *hss = NULL;
2127 struct iattr *attr = NULL;
2131 if (!S_ISREG(inode->i_mode))
2137 GOTO(out, rc = -ENOMEM);
/* Step 1: set the HSM state flags on the MDT. */
2139 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2140 hss->hss_archive_id = hui->hui_archive_id;
2141 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2142 rc = ll_hsm_state_set(inode, hss);
2146 OBD_ALLOC_PTR(attr);
2148 GOTO(out, rc = -ENOMEM);
/* Step 2: restore the archived file's attributes. */
2150 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2151 attr->ia_mode |= S_IFREG;
2152 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2153 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2154 attr->ia_size = hui->hui_size;
2155 attr->ia_mtime.tv_sec = hui->hui_mtime;
2156 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2157 attr->ia_atime.tv_sec = hui->hui_atime;
2158 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2160 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2161 ATTR_UID | ATTR_GID |
2162 ATTR_MTIME | ATTR_MTIME_SET |
2163 ATTR_ATIME | ATTR_ATIME_SET;
2165 mutex_lock(&inode->i_mutex);
2167 rc = ll_setattr_raw(file->f_path.dentry, attr, true);
2171 mutex_unlock(&inode->i_mutex);
/* Maps an open-mode (FMODE_READ/FMODE_WRITE) to the LL_LEASE_* lock type
 * bitmask reported to userspace by the lease ioctls. */
2183 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2185 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2186 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * LL_IOC_FUTIMES_3 worker: sets atime/mtime *and ctime* (unlike utimes(2))
 * from the given ll_futimes_3; root-only and regular files only.
 * NOTE(review): sampled extraction — intermediate lines are missing;
 * code kept verbatim, comments only added.
 */
2189 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2191 struct inode *inode = file->f_path.dentry->d_inode;
2193 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2194 ATTR_MTIME | ATTR_MTIME_SET |
2195 ATTR_CTIME | ATTR_CTIME_SET,
2197 .tv_sec = lfu->lfu_atime_sec,
2198 .tv_nsec = lfu->lfu_atime_nsec,
2201 .tv_sec = lfu->lfu_mtime_sec,
2202 .tv_nsec = lfu->lfu_mtime_nsec,
2205 .tv_sec = lfu->lfu_ctime_sec,
2206 .tv_nsec = lfu->lfu_ctime_nsec,
/* Setting ctime explicitly is privileged. */
2212 if (!capable(CAP_SYS_ADMIN))
2215 if (!S_ISREG(inode->i_mode))
2218 mutex_lock(&inode->i_mutex);
2219 rc = ll_setattr_raw(file->f_path.dentry, &ia, false);
2220 mutex_unlock(&inode->i_mutex);
2226 * Give file access advices
2228 * The ladvise interface is similar to Linux fadvise() system call, except it
2229 * forwards the advices directly from Lustre client to server. The server side
2230 * codes will apply appropriate read-ahead and caching techniques for the
2231 * corresponding files.
2233 * A typical workload for ladvise is e.g. a bunch of different clients are
2234 * doing small random reads of a file, so prefetching pages into OSS cache
2235 * with big linear reads before the random IO is a net benefit. Fetching
2236 * all that data into each client cache with fadvise() may not be, due to
2237 * much more data being sent to the client.
/*
 * NOTE(review): sampled extraction — intermediate lines are missing;
 * code kept verbatim, comments only added.
 */
2239 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2240 struct lu_ladvise *ladvise)
2244 struct cl_ladvise_io *lio;
2249 env = cl_env_get(&refcheck);
2251 RETURN(PTR_ERR(env));
2253 io = vvp_env_thread_io(env);
2254 io->ci_obj = ll_i2info(inode)->lli_clob;
2256 /* initialize parameters for ladvise */
2257 lio = &io->u.ci_ladvise;
2258 lio->li_start = ladvise->lla_start;
2259 lio->li_end = ladvise->lla_end;
2260 lio->li_fid = ll_inode2fid(inode);
2261 lio->li_advice = ladvise->lla_advice;
2262 lio->li_flags = flags;
2264 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2265 rc = cl_io_loop(env, io);
2269 cl_io_fini(env, io);
2270 cl_env_put(env, &refcheck);
/*
 * Main unlocked_ioctl dispatcher for regular files: file flags, striping
 * (setstripe/setea/getstripe/swap), group locks, FID/path mapping,
 * data version, HSM (state/action/import), leases, futimes and ladvise.
 * Unknown commands fall through to the registered llioc handlers and then
 * to the data export via obd_iocontrol().
 * NOTE(review): sampled extraction — intermediate lines are missing
 * (braces, RETURN paths, some case bodies); code kept verbatim,
 * comments only added.
 */
2275 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2277 struct inode *inode = file->f_path.dentry->d_inode;
2278 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2282 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2283 PFID(ll_inode2fid(inode)), inode, cmd);
2284 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2286 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2287 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2291 case LL_IOC_GETFLAGS:
2292 /* Get the current value of the file flags */
2293 return put_user(fd->fd_flags, (int __user *)arg);
2294 case LL_IOC_SETFLAGS:
2295 case LL_IOC_CLRFLAGS:
2296 /* Set or clear specific file flags */
2297 /* XXX This probably needs checks to ensure the flags are
2298 * not abused, and to handle any flag side effects.
2300 if (get_user(flags, (int __user *) arg))
2303 if (cmd == LL_IOC_SETFLAGS) {
2304 if ((flags & LL_FILE_IGNORE_LOCK) &&
2305 !(file->f_flags & O_DIRECT)) {
2306 CERROR("%s: unable to disable locking on "
2307 "non-O_DIRECT file\n", current->comm);
2311 fd->fd_flags |= flags;
2313 fd->fd_flags &= ~flags;
2316 case LL_IOC_LOV_SETSTRIPE:
2317 RETURN(ll_lov_setstripe(inode, file, arg));
2318 case LL_IOC_LOV_SETEA:
2319 RETURN(ll_lov_setea(inode, file, arg));
2320 case LL_IOC_LOV_SWAP_LAYOUTS: {
2322 struct lustre_swap_layouts lsl;
2324 if (copy_from_user(&lsl, (char __user *)arg,
2325 sizeof(struct lustre_swap_layouts)))
/* Both fds must be writable to swap layouts. */
2328 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
2331 file2 = fget(lsl.sl_fd);
2335 /* O_WRONLY or O_RDWR */
2336 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
2337 GOTO(out, rc = -EPERM);
2339 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
2340 struct inode *inode2;
2341 struct ll_inode_info *lli;
2342 struct obd_client_handle *och = NULL;
/* CLOSE must be the only flag: swap happens at lease close. */
2344 if (lsl.sl_flags != SWAP_LAYOUTS_CLOSE)
2345 GOTO(out, rc = -EINVAL);
2347 lli = ll_i2info(inode);
2348 mutex_lock(&lli->lli_och_mutex);
2349 if (fd->fd_lease_och != NULL) {
2350 och = fd->fd_lease_och;
2351 fd->fd_lease_och = NULL;
2353 mutex_unlock(&lli->lli_och_mutex);
2355 GOTO(out, rc = -ENOLCK);
2356 inode2 = file2->f_path.dentry->d_inode;
2357 rc = ll_swap_layouts_close(och, inode, inode2);
2359 rc = ll_swap_layouts(file, file2, &lsl);
2365 case LL_IOC_LOV_GETSTRIPE:
2366 RETURN(ll_file_getstripe(inode,
2367 (struct lov_user_md __user *)arg));
2368 case FSFILT_IOC_GETFLAGS:
2369 case FSFILT_IOC_SETFLAGS:
2370 RETURN(ll_iocontrol(inode, file, cmd, arg));
2371 case FSFILT_IOC_GETVERSION_OLD:
2372 case FSFILT_IOC_GETVERSION:
2373 RETURN(put_user(inode->i_generation, (int __user *)arg));
2374 case LL_IOC_GROUP_LOCK:
2375 RETURN(ll_get_grouplock(inode, file, arg));
2376 case LL_IOC_GROUP_UNLOCK:
2377 RETURN(ll_put_grouplock(inode, file, arg));
2378 case IOC_OBD_STATFS:
2379 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2381 /* We need to special case any other ioctls we want to handle,
2382 * to send them to the MDS/OST as appropriate and to properly
2383 * network encode the arg field.
2384 case FSFILT_IOC_SETVERSION_OLD:
2385 case FSFILT_IOC_SETVERSION:
2387 case LL_IOC_FLUSHCTX:
2388 RETURN(ll_flush_ctx(inode));
2389 case LL_IOC_PATH2FID: {
2390 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2391 sizeof(struct lu_fid)))
2396 case LL_IOC_GETPARENT:
2397 RETURN(ll_getparent(file, (struct getparent __user *)arg));
2399 case OBD_IOC_FID2PATH:
2400 RETURN(ll_fid2path(inode, (void __user *)arg));
2401 case LL_IOC_DATA_VERSION: {
2402 struct ioc_data_version idv;
2405 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
/* Only the two documented flush flags are honored. */
2408 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2409 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2412 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2418 case LL_IOC_GET_MDTIDX: {
2421 mdtidx = ll_get_mdt_idx(inode);
2425 if (put_user((int)mdtidx, (int __user *)arg))
2430 case OBD_IOC_GETDTNAME:
2431 case OBD_IOC_GETMDNAME:
2432 RETURN(ll_get_obd_name(inode, cmd, arg));
2433 case LL_IOC_HSM_STATE_GET: {
2434 struct md_op_data *op_data;
2435 struct hsm_user_state *hus;
2442 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2443 LUSTRE_OPC_ANY, hus);
2444 if (IS_ERR(op_data)) {
2446 RETURN(PTR_ERR(op_data));
2449 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2452 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2455 ll_finish_md_op_data(op_data);
2459 case LL_IOC_HSM_STATE_SET: {
2460 struct hsm_state_set *hss;
2467 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2472 rc = ll_hsm_state_set(inode, hss);
2477 case LL_IOC_HSM_ACTION: {
2478 struct md_op_data *op_data;
2479 struct hsm_current_action *hca;
2486 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2487 LUSTRE_OPC_ANY, hca);
2488 if (IS_ERR(op_data)) {
2490 RETURN(PTR_ERR(op_data));
2493 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2496 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2499 ll_finish_md_op_data(op_data);
2503 case LL_IOC_SET_LEASE: {
2504 struct ll_inode_info *lli = ll_i2info(inode);
2505 struct obd_client_handle *och = NULL;
/* Requested lease mode must be compatible with the fd's open mode. */
2510 case LL_LEASE_WRLCK:
2511 if (!(file->f_mode & FMODE_WRITE))
2513 fmode = FMODE_WRITE;
2515 case LL_LEASE_RDLCK:
2516 if (!(file->f_mode & FMODE_READ))
2520 case LL_LEASE_UNLCK:
2521 mutex_lock(&lli->lli_och_mutex);
2522 if (fd->fd_lease_och != NULL) {
2523 och = fd->fd_lease_och;
2524 fd->fd_lease_och = NULL;
2526 mutex_unlock(&lli->lli_och_mutex);
2531 fmode = och->och_flags;
2532 rc = ll_lease_close(och, inode, &lease_broken);
/* Report the type of lease that was just released. */
2539 RETURN(ll_lease_type_from_fmode(fmode));
2544 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2546 /* apply for lease */
2547 och = ll_lease_open(inode, file, fmode, 0);
2549 RETURN(PTR_ERR(och));
2552 mutex_lock(&lli->lli_och_mutex);
2553 if (fd->fd_lease_och == NULL) {
2554 fd->fd_lease_och = och;
2557 mutex_unlock(&lli->lli_och_mutex);
2559 /* impossible now that only excl is supported for now */
2560 ll_lease_close(och, inode, &lease_broken);
2565 case LL_IOC_GET_LEASE: {
2566 struct ll_inode_info *lli = ll_i2info(inode);
2567 struct ldlm_lock *lock = NULL;
2570 mutex_lock(&lli->lli_och_mutex);
2571 if (fd->fd_lease_och != NULL) {
2572 struct obd_client_handle *och = fd->fd_lease_och;
/* A cancelled lease lock no longer counts as held. */
2574 lock = ldlm_handle2lock(&och->och_lease_handle);
2576 lock_res_and_lock(lock);
2577 if (!ldlm_is_cancel(lock))
2578 fmode = och->och_flags;
2580 unlock_res_and_lock(lock);
2581 LDLM_LOCK_PUT(lock);
2584 mutex_unlock(&lli->lli_och_mutex);
2586 RETURN(ll_lease_type_from_fmode(fmode));
2588 case LL_IOC_HSM_IMPORT: {
2589 struct hsm_user_import *hui;
2595 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
2600 rc = ll_hsm_import(inode, file, hui);
2605 case LL_IOC_FUTIMES_3: {
2606 struct ll_futimes_3 lfu;
2608 if (copy_from_user(&lfu,
2609 (const struct ll_futimes_3 __user *)arg,
2613 RETURN(ll_file_futimes_3(file, &lfu));
2615 case LL_IOC_LADVISE: {
2616 struct ladvise_hdr *ladvise_hdr;
2619 int alloc_size = sizeof(*ladvise_hdr);
/* First read just the header to learn how many advices follow. */
2622 OBD_ALLOC_PTR(ladvise_hdr);
2623 if (ladvise_hdr == NULL)
2626 if (copy_from_user(ladvise_hdr,
2627 (const struct ladvise_hdr __user *)arg,
2629 GOTO(out_ladvise, rc = -EFAULT);
2631 if (ladvise_hdr->lah_magic != LADVISE_MAGIC ||
2632 ladvise_hdr->lah_count < 1)
2633 GOTO(out_ladvise, rc = -EINVAL);
2635 num_advise = ladvise_hdr->lah_count;
2636 if (num_advise >= LAH_COUNT_MAX)
2637 GOTO(out_ladvise, rc = -EFBIG);
/* Reallocate sized for the full advice array and re-copy. */
2639 OBD_FREE_PTR(ladvise_hdr);
2640 alloc_size = offsetof(typeof(*ladvise_hdr),
2641 lah_advise[num_advise]);
2642 OBD_ALLOC(ladvise_hdr, alloc_size);
2643 if (ladvise_hdr == NULL)
2647 * TODO: submit multiple advices to one server in a single RPC
2649 if (copy_from_user(ladvise_hdr,
2650 (const struct ladvise_hdr __user *)arg,
2652 GOTO(out_ladvise, rc = -EFAULT);
2654 for (i = 0; i < num_advise; i++) {
2655 rc = ll_ladvise(inode, file, ladvise_hdr->lah_flags,
2656 &ladvise_hdr->lah_advise[i]);
2662 OBD_FREE(ladvise_hdr, alloc_size);
/* Unknown cmd: try registered llioc handlers, then the data export. */
2669 ll_iocontrol_call(inode, file, cmd, arg, &err))
2672 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2673 (void __user *)arg));
2678 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Validates and commits a seek target: rejects negatives (unless
 * FMODE_UNSIGNED_OFFSET) and offsets past maxsize, and only touches
 * f_pos/f_version when the position actually changes.
 * NOTE(review): sampled extraction — intermediate lines are missing;
 * code kept verbatim, comments only added.
 */
2679 static inline loff_t
2680 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2682 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2684 if (offset > maxsize)
2687 if (offset != file->f_pos) {
2688 file->f_pos = offset;
2689 file->f_version = 0;
/*
 * Local copy of generic_file_llseek_size() for kernels lacking it:
 * handles SEEK_SET/CUR/END plus SEEK_DATA/SEEK_HOLE against an explicit
 * @eof, clamped to @maxsize.
 * NOTE(review): sampled extraction — intermediate lines (SEEK_* cases)
 * are missing; code kept verbatim, comments only added.
 */
2695 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2696 loff_t maxsize, loff_t eof)
2698 struct inode *inode = file->f_path.dentry->d_inode;
2706 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2707 * position-querying operation. Avoid rewriting the "same"
2708 * f_pos value back to the file because a concurrent read(),
2709 * write() or lseek() might have altered it
2714 * f_lock protects against read/modify/write race with other
2715 * SEEK_CURs. Note that parallel writes and reads behave
2718 mutex_lock(&inode->i_mutex);
2719 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2720 mutex_unlock(&inode->i_mutex);
2724 * In the generic case the entire file is data, so as long as
2725 * offset isn't at the end of the file then the offset is data.
2732 * There is a virtual hole at the end of the file, so as long as
2733 * offset isn't i_size or larger, return i_size.
2741 return llseek_execute(file, offset, maxsize);
/* VFS ->llseek for Lustre files.  For size-relative seeks (SEEK_END,
 * SEEK_HOLE, SEEK_DATA) the cluster-wide file size is refreshed first via
 * ll_glimpse_size(), then the generic llseek-size logic is applied with
 * Lustre's maximum file size as the bound. */
2745 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2747 struct inode *inode = file->f_path.dentry->d_inode;
2748 loff_t retval, eof = 0;
/* retval is first used only as the debug-print target; it is recomputed
 * below by ll_generic_file_llseek_size(). */
2751 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2752 (origin == SEEK_CUR) ? file->f_pos : 0);
2753 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2754 PFID(ll_inode2fid(inode)), inode, retval, retval,
2756 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
/* Size-relative seeks need an up-to-date size from the OSTs. */
2758 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2759 retval = ll_glimpse_size(inode);
2762 eof = i_size_read(inode);
2765 retval = ll_generic_file_llseek_size(file, offset, origin,
2766 ll_file_maxbytes(inode), eof);
/* VFS ->flush: report (and clear) any asynchronous writeback error that was
 * recorded for this inode.  Returns -EIO at most once per failure; if this
 * file descriptor was already told about the write failure (fd_write_failed),
 * the error is not reported again. */
2770 static int ll_flush(struct file *file, fl_owner_t id)
2772 struct inode *inode = file->f_path.dentry->d_inode;
2773 struct ll_inode_info *lli = ll_i2info(inode);
2774 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2777 LASSERT(!S_ISDIR(inode->i_mode));
2779 /* catch async errors that were recorded back when async writeback
2780 * failed for pages in this mapping. */
2781 rc = lli->lli_async_rc;
2782 lli->lli_async_rc = 0;
2783 if (lli->lli_clob != NULL) {
/* also harvest per-stripe async errors from the LOV layer */
2784 err = lov_read_and_clear_async_rc(lli->lli_clob);
2789 /* The application has been told write failure already.
2790 * Do not report failure again. */
2791 if (fd->fd_write_failed)
2793 return rc ? -EIO : 0;
2797 * Called to make sure a portion of file has been written out.
2798 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2800 * Return how many pages have been written.
2802 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2803 enum cl_fsync_mode mode, int ignore_layout)
2807 struct cl_fsync_io *fio;
/* Reject any mode outside the known cl_fsync_mode values. */
2812 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2813 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2816 env = cl_env_get(&refcheck);
2818 RETURN(PTR_ERR(env));
/* Build a CIT_FSYNC cl_io against the file's cl_object. */
2820 io = vvp_env_thread_io(env);
2821 io->ci_obj = ll_i2info(inode)->lli_clob;
2822 io->ci_ignore_layout = ignore_layout;
2824 /* initialize parameters for sync */
2825 fio = &io->u.ci_fsync;
2826 fio->fi_start = start;
2828 fio->fi_fid = ll_inode2fid(inode);
2829 fio->fi_mode = mode;
2830 fio->fi_nr_written = 0;
2832 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2833 result = cl_io_loop(env, io);
2835 result = io->ci_result;
/* On success, report how many pages the fsync IO wrote out. */
2837 result = fio->fi_nr_written;
2838 cl_io_fini(env, io);
2839 cl_env_put(env, &refcheck);
2845 * When dentry is provided (the 'else' case), *file->f_path.dentry may be
2846 * null and dentry must be used directly rather than pulled from
2847 * *file->f_path.dentry as is done otherwise.
/* VFS ->fsync, with three prototype variants selected at configure time
 * (4-arg range fsync, 2-arg, or legacy 3-arg with explicit dentry).
 * Flushes dirty pages, syncs metadata on the MDT (md_fsync) and, for
 * regular files, data on the OSTs (cl_sync_file_range). */
2850 #ifdef HAVE_FILE_FSYNC_4ARGS
2851 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2853 struct dentry *dentry = file->f_path.dentry;
2854 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2855 int ll_fsync(struct file *file, int datasync)
2857 struct dentry *dentry = file->f_path.dentry;
2859 loff_t end = LLONG_MAX;
2861 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2864 loff_t end = LLONG_MAX;
2866 struct inode *inode = dentry->d_inode;
2867 struct ll_inode_info *lli = ll_i2info(inode);
2868 struct ptlrpc_request *req;
2872 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2873 PFID(ll_inode2fid(inode)), inode);
2874 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2876 #ifdef HAVE_FILE_FSYNC_4ARGS
2877 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2878 mutex_lock(&inode->i_mutex);
2880 /* fsync's caller has already called _fdata{sync,write}, we want
2881 * that IO to finish before calling the osc and mdc sync methods */
2882 rc = filemap_fdatawait(inode->i_mapping);
2885 /* catch async errors that were recorded back when async writeback
2886 * failed for pages in this mapping. */
2887 if (!S_ISDIR(inode->i_mode)) {
2888 err = lli->lli_async_rc;
2889 lli->lli_async_rc = 0;
2892 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* Sync this fid's metadata on the MDT. */
2897 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
2901 ptlrpc_req_finished(req);
2903 if (S_ISREG(inode->i_mode)) {
2904 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
/* Sync file data on the OSTs; record the outcome in fd_write_failed so
 * ll_flush() does not report the same failure twice. */
2906 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2907 if (rc == 0 && err < 0)
2910 fd->fd_write_failed = true;
2912 fd->fd_write_failed = false;
2915 #ifdef HAVE_FILE_FSYNC_4ARGS
2916 mutex_unlock(&inode->i_mutex);
/* flock/POSIX-lock handler for "-o flock" mounts: translates a kernel
 * struct file_lock into an LDLM_FLOCK enqueue on the MDT, then mirrors the
 * result into the local kernel lock tables so the VFS stays consistent.
 * NOTE(review): this extract is line-sampled — the switch statements over
 * fl_type and cmd are missing their 'switch'/'case' framing lines, and some
 * error branches are omitted. */
2922 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2924 struct inode *inode = file->f_path.dentry->d_inode;
2925 struct ll_sb_info *sbi = ll_i2sbi(inode);
2926 struct ldlm_enqueue_info einfo = {
2927 .ei_type = LDLM_FLOCK,
2928 .ei_cb_cp = ldlm_flock_completion_ast,
2929 .ei_cbdata = file_lock,
2931 struct md_op_data *op_data;
2932 struct lustre_handle lockh = { 0 };
2933 union ldlm_policy_data flock = { { 0 } };
2934 int fl_type = file_lock->fl_type;
2940 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
2941 PFID(ll_inode2fid(inode)), file_lock);
2943 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2945 if (file_lock->fl_flags & FL_FLOCK) {
2946 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2947 /* flocks are whole-file locks */
2948 flock.l_flock.end = OFFSET_MAX;
2949 /* For flocks owner is determined by the local file descriptor */
2950 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2951 } else if (file_lock->fl_flags & FL_POSIX) {
2952 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2953 flock.l_flock.start = file_lock->fl_start;
2954 flock.l_flock.end = file_lock->fl_end;
2958 flock.l_flock.pid = file_lock->fl_pid;
2960 /* Somewhat ugly workaround for svc lockd.
2961 * lockd installs custom fl_lmops->lm_compare_owner that checks
2962 * for the fl_owner to be the same (which it always is on local node
2963 * I guess between lockd processes) and then compares pid.
2964 * As such we assign pid to the owner field to make it all work,
2965 * conflict with normal locks is unlikely since pid space and
2966 * pointer space for current->files are not intersecting */
2967 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2968 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* Map the fcntl lock type to an LDLM mode (read/unlock/write). */
2972 einfo.ei_mode = LCK_PR;
2975 /* An unlock request may or may not have any relation to
2976 * existing locks so we may not be able to pass a lock handle
2977 * via a normal ldlm_lock_cancel() request. The request may even
2978 * unlock a byte range in the middle of an existing lock. In
2979 * order to process an unlock request we need all of the same
2980 * information that is given with a normal read or write record
2981 * lock request. To avoid creating another ldlm unlock (cancel)
2982 * message we'll treat a LCK_NL flock request as an unlock. */
2983 einfo.ei_mode = LCK_NL;
2986 einfo.ei_mode = LCK_PW;
2989 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* Map the fcntl command to LDLM enqueue flags: non-blocking set requests
 * use BLOCK_NOWAIT, F_GETLK-style queries use TEST_LOCK. */
3004 flags = LDLM_FL_BLOCK_NOWAIT;
3010 flags = LDLM_FL_TEST_LOCK;
3013 CERROR("unknown fcntl lock command: %d\n", cmd);
3017 /* Save the old mode so that if the mode in the lock changes we
3018 * can decrement the appropriate reader or writer refcount. */
3019 file_lock->fl_type = einfo.ei_mode;
3021 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3022 LUSTRE_OPC_ANY, NULL);
3023 if (IS_ERR(op_data))
3024 RETURN(PTR_ERR(op_data));
3026 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
3027 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
3028 flock.l_flock.pid, flags, einfo.ei_mode,
3029 flock.l_flock.start, flock.l_flock.end);
/* Enqueue the flock on the MDT. */
3031 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data, &lockh,
3034 /* Restore the file lock type if not TEST lock. */
3035 if (!(flags & LDLM_FL_TEST_LOCK))
3036 file_lock->fl_type = fl_type;
/* Mirror the server-side result into the kernel's local lock tables; the
 * API differs by kernel version (locks_lock_file_wait vs. the older
 * flock_lock_file_wait/posix_lock_file_wait pair). */
3038 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
3039 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
3040 !(flags & LDLM_FL_TEST_LOCK))
3041 rc2 = locks_lock_file_wait(file, file_lock);
3043 if ((file_lock->fl_flags & FL_FLOCK) &&
3044 (rc == 0 || file_lock->fl_type == F_UNLCK))
3045 rc2 = flock_lock_file_wait(file, file_lock);
3046 if ((file_lock->fl_flags & FL_POSIX) &&
3047 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3048 !(flags & LDLM_FL_TEST_LOCK))
3049 rc2 = posix_lock_file_wait(file, file_lock);
3050 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
/* Local bookkeeping failed: undo the server-side lock with an unlock
 * (LCK_NL) enqueue so client and server stay in sync. */
3052 if (rc2 && file_lock->fl_type != F_UNLCK) {
3053 einfo.ei_mode = LCK_NL;
3054 md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data,
3059 ll_finish_md_op_data(op_data);
/* Look up the FID of @name under @parent via an MDS getattr-by-name RPC.
 * On success *fid is filled in; if @inode is non-NULL the child inode is
 * also instantiated from the reply (ll_prep_inode). */
3064 int ll_get_fid_by_name(struct inode *parent, const char *name,
3065 int namelen, struct lu_fid *fid,
3066 struct inode **inode)
3068 struct md_op_data *op_data = NULL;
3069 struct mdt_body *body;
3070 struct ptlrpc_request *req;
3074 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3075 LUSTRE_OPC_ANY, NULL);
3076 if (IS_ERR(op_data))
3077 RETURN(PTR_ERR(op_data));
/* only the FID and type are needed from the reply */
3079 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
3080 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3081 ll_finish_md_op_data(op_data);
3085 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3087 GOTO(out_req, rc = -EFAULT);
3089 *fid = body->mbo_fid1;
3092 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
3094 ptlrpc_req_finished(req);
/* Migrate the entry @name under @parent to MDT @mdtidx via a special
 * MDS_RENAME with CLI_MIGRATE.  For regular files a write lease is taken
 * first and the data version recorded so the server can detect concurrent
 * modification; -EAGAIN from the server means the layout changed and the
 * caller should retry.
 * NOTE(review): the extract is line-sampled; some error branches, the retry
 * path and closing labels are omitted between the visible lines. */
3098 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3099 const char *name, int namelen)
3101 struct dentry *dchild = NULL;
3102 struct inode *child_inode = NULL;
3103 struct md_op_data *op_data;
3104 struct ptlrpc_request *request = NULL;
3105 struct obd_client_handle *och = NULL;
3107 struct mdt_body *body;
3109 __u64 data_version = 0;
3112 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3113 name, PFID(ll_inode2fid(parent)), mdtidx);
3115 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3116 0, LUSTRE_OPC_ANY, NULL);
3117 if (IS_ERR(op_data))
3118 RETURN(PTR_ERR(op_data));
3120 /* Get child FID first */
/* try the dcache first; fall back to an MDS lookup by name below */
3121 qstr.hash = full_name_hash(name, namelen);
3124 dchild = d_lookup(file->f_path.dentry, &qstr);
3125 if (dchild != NULL) {
3126 if (dchild->d_inode != NULL)
3127 child_inode = igrab(dchild->d_inode);
3131 if (child_inode == NULL) {
3132 rc = ll_get_fid_by_name(parent, name, namelen,
3133 &op_data->op_fid3, &child_inode);
3138 if (child_inode == NULL)
3139 GOTO(out_free, rc = -EINVAL);
3142 * lfs migrate command needs to be blocked on the client
3143 * by checking the migrate FID against the FID of the
3146 if (child_inode == parent->i_sb->s_root->d_inode)
3147 GOTO(out_iput, rc = -EINVAL);
/* hold i_mutex on the child for the duration of the migration */
3149 mutex_lock(&child_inode->i_mutex);
3150 op_data->op_fid3 = *ll_inode2fid(child_inode);
3151 if (!fid_is_sane(&op_data->op_fid3)) {
3152 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
3153 ll_get_fsname(parent->i_sb, NULL, 0), name,
3154 PFID(&op_data->op_fid3));
3155 GOTO(out_unlock, rc = -EINVAL);
3158 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3160 GOTO(out_unlock, rc);
/* already on the target MDT: nothing to do */
3163 CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name,
3164 PFID(&op_data->op_fid3), mdtidx);
3165 GOTO(out_unlock, rc = 0);
3168 if (S_ISREG(child_inode->i_mode)) {
/* take a write lease and record the data version so concurrent writes
 * during migration can be detected by the server */
3169 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
3173 GOTO(out_unlock, rc);
3176 rc = ll_data_version(child_inode, &data_version,
3179 GOTO(out_close, rc);
3181 op_data->op_handle = och->och_fh;
3182 op_data->op_data = och->och_mod;
3183 op_data->op_data_version = data_version;
3184 op_data->op_lease_handle = och->och_lease_handle;
3185 op_data->op_bias |= MDS_RENAME_MIGRATE;
3188 op_data->op_mds = mdtidx;
3189 op_data->op_cli_flags = CLI_MIGRATE;
/* migration is implemented as a same-name rename with CLI_MIGRATE */
3190 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3191 namelen, name, namelen, &request);
3193 ll_update_times(request, parent);
3195 if (request != NULL) {
3196 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
3198 ptlrpc_req_finished(request);
3199 GOTO(out_close, rc = -EPROTO);
3202 /* If the server does release layout lock, then we cleanup
3203 * the client och here, otherwise release it in out_close: */
3205 body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
3206 obd_mod_put(och->och_mod);
3207 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
3209 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
3213 ptlrpc_req_finished(request);
3216 /* Try again if the file layout has changed. */
3217 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode)) {
3222 if (och != NULL) /* close the file */
3223 ll_lease_close(och, child_inode, NULL);
3225 clear_nlink(child_inode);
3227 mutex_unlock(&child_inode->i_mutex);
3231 ll_finish_md_op_data(op_data);
/* ->flock/->lock handler installed for "-o noflock" mounts; per the comment
 * on ll_file_operations_noflock below, flock calls return ENOSYS.  Body not
 * visible in this sampled extract. */
3236 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3244 * test if some locks matching bits and l_req_mode are acquired
3245 * - bits can be in different locks
3246 * - if found clear the common lock bits in *bits
3247 * - the bits not found, are kept in *bits
3249 * \param bits [IN] searched lock bits [IN]
3250 * \param l_req_mode [IN] searched lock mode
3251 * \retval boolean, true iff all bits are found
3253 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
3255 struct lustre_handle lockh;
3256 union ldlm_policy_data policy;
/* LCK_MINMODE means "any mode": match against all four lock modes */
3257 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
3258 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
3267 fid = &ll_i2info(inode)->lli_fid;
3268 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3269 ldlm_lockname[mode]);
/* TEST_LOCK: probe only, never take a reference on a matched lock */
3271 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* probe one inodebit at a time; stop early once all bits are cleared */
3272 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3273 policy.l_inodebits.bits = *bits & (1 << i);
3274 if (policy.l_inodebits.bits == 0)
3277 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3278 &policy, mode, &lockh)) {
3279 struct ldlm_lock *lock;
3281 lock = ldlm_handle2lock(&lockh);
3284 ~(lock->l_policy_data.l_inodebits.bits);
3285 LDLM_LOCK_PUT(lock);
3287 *bits &= ~policy.l_inodebits.bits;
/* Match (and take a reference on) an existing granted MD lock covering
 * @bits on @inode.  Returns the matched mode, with the lock handle in
 * *lockh; extra match flags may be passed in @flags. */
3294 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
3295 struct lustre_handle *lockh, __u64 flags,
3296 enum ldlm_mode mode)
3298 union ldlm_policy_data policy = { .l_inodebits = { bits } };
3303 fid = &ll_i2info(inode)->lli_fid;
3304 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3306 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3307 fid, LDLM_IBITS, &policy, mode, lockh);
/* Post-process the result of an inode revalidation RPC: translate -ENOENT
 * for already-unlinked inodes, and log unexpected errors. */
3312 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3314 /* Already unlinked. Just update nlink and return success */
3315 if (rc == -ENOENT) {
3317 /* If it is striped directory, and there is bad stripe
3318 * Let's revalidate the dentry again, instead of returning
3320 if (S_ISDIR(inode->i_mode) &&
3321 ll_i2info(inode)->lli_lsm_md != NULL)
3324 /* This path cannot be hit for regular files unless in
3325 * case of obscure races, so no need to to validate
3327 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3329 } else if (rc != 0) {
/* -EACCES/-EIDRM are expected outcomes; log them quietly */
3330 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3331 "%s: revalidate FID "DFID" error: rc = %d\n",
3332 ll_get_fsname(inode->i_sb, NULL, 0),
3333 PFID(ll_inode2fid(inode)), rc);
/* Revalidate @dentry's inode attributes against the MDS for the lock bits
 * in @ibits.  Two strategies: an intent lock (IT_GETATTR/IT_LOOKUP) when the
 * server supports ATTRFID, otherwise a plain getattr RPC — but only if we do
 * not already hold a matching MD lock (ll_have_md_lock). */
3339 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3341 struct inode *inode = dentry->d_inode;
3342 struct ptlrpc_request *req = NULL;
3343 struct obd_export *exp;
3347 LASSERT(inode != NULL);
3349 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3350 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3352 exp = ll_i2mdexp(inode);
3354 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3355 * But under CMD case, it caused some lock issues, should be fixed
3356 * with new CMD ibits lock. See bug 12718 */
3357 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3358 struct lookup_intent oit = { .it_op = IT_GETATTR };
3359 struct md_op_data *op_data;
3361 if (ibits == MDS_INODELOCK_LOOKUP)
3362 oit.it_op = IT_LOOKUP;
3364 /* Call getattr by fid, so do not provide name at all. */
3365 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3366 dentry->d_inode, NULL, 0, 0,
3367 LUSTRE_OPC_ANY, NULL);
3368 if (IS_ERR(op_data))
3369 RETURN(PTR_ERR(op_data));
3371 rc = md_intent_lock(exp, op_data, &oit, &req,
3372 &ll_md_blocking_ast, 0);
3373 ll_finish_md_op_data(op_data);
3375 rc = ll_inode_revalidate_fini(inode, rc);
3379 rc = ll_revalidate_it_finish(req, &oit, dentry);
3381 ll_intent_release(&oit);
3385 /* Unlinked? Unhash dentry, so it is not picked up later by
3386 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3387 here to preserve get_cwd functionality on 2.6.
3389 if (!dentry->d_inode->i_nlink) {
3390 ll_lock_dcache(inode);
3391 d_lustre_invalidate(dentry, 0);
3392 ll_unlock_dcache(inode);
3395 ll_lookup_finish_locks(&oit, dentry);
3396 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3397 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3398 u64 valid = OBD_MD_FLGETATTR;
3399 struct md_op_data *op_data;
/* regular files also need striping info (EA) sized to the default */
3402 if (S_ISREG(inode->i_mode)) {
3403 rc = ll_get_default_mdsize(sbi, &ealen);
3406 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3409 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3410 0, ealen, LUSTRE_OPC_ANY,
3412 if (IS_ERR(op_data))
3413 RETURN(PTR_ERR(op_data));
3415 op_data->op_valid = valid;
3416 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3417 ll_finish_md_op_data(op_data);
3419 rc = ll_inode_revalidate_fini(inode, rc);
3423 rc = ll_prep_inode(&inode, req, NULL, NULL);
3426 ptlrpc_req_finished(req);
/* For a striped directory, merge per-stripe attributes (nlink, blocks,
 * size, times) from all MDTs into the master inode. */
3430 static int ll_merge_md_attr(struct inode *inode)
3432 struct cl_attr attr = { 0 };
3435 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3436 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3437 &attr, ll_md_blocking_ast);
3441 set_nlink(inode, attr.cat_nlink);
3442 inode->i_blocks = attr.cat_blocks;
3443 i_size_write(inode, attr.cat_size);
/* times are cached in lli_* and copied to the inode by the caller */
3445 ll_i2info(inode)->lli_atime = attr.cat_atime;
3446 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
3447 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/* Full revalidation: refresh MD attributes (__ll_inode_revalidate), then
 * refresh size — merged stripe attrs for striped dirs, a glimpse for
 * regular files (skipped while an HSM restore is running, since the MDT
 * already sent the right size and holds the layout lock). */
3453 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3455 struct inode *inode = dentry->d_inode;
3459 rc = __ll_inode_revalidate(dentry, ibits);
3463 /* if object isn't regular file, don't validate size */
3464 if (!S_ISREG(inode->i_mode)) {
3465 if (S_ISDIR(inode->i_mode) &&
3466 ll_i2info(inode)->lli_lsm_md != NULL) {
3467 rc = ll_merge_md_attr(inode);
/* propagate the cached MD times into the VFS inode */
3472 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
3473 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
3474 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
3476 /* In case of restore, the MDT has the right size and has
3477 * already send it back without granting the layout lock,
3478 * inode is up-to-date so glimpse is useless.
3479 * Also to glimpse we need the layout, in case of a running
3480 * restore the MDT holds the layout lock so the glimpse will
3481 * block up to the end of restore (getattr will block)
3483 if (!ll_file_test_flag(ll_i2info(inode), LLIF_FILE_RESTORING))
3484 rc = ll_glimpse_size(inode);
/* VFS ->getattr: revalidate UPDATE|LOOKUP bits against the MDS, then fill
 * *stat from the (now fresh) inode fields. */
3489 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3491 struct inode *inode = de->d_inode;
3492 struct ll_sb_info *sbi = ll_i2sbi(inode);
3493 struct ll_inode_info *lli = ll_i2info(inode);
3496 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3497 MDS_INODELOCK_LOOKUP);
3498 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3503 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
3505 stat->dev = inode->i_sb->s_dev;
/* 32-bit userspace needs a squashed inode number derived from the FID */
3506 if (ll_need_32bit_api(sbi))
3507 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3509 stat->ino = inode->i_ino;
3510 stat->mode = inode->i_mode;
3511 stat->uid = inode->i_uid;
3512 stat->gid = inode->i_gid;
3513 stat->rdev = inode->i_rdev;
3514 stat->atime = inode->i_atime;
3515 stat->mtime = inode->i_mtime;
3516 stat->ctime = inode->i_ctime;
3517 stat->blksize = 1 << inode->i_blkbits;
3519 stat->nlink = inode->i_nlink;
3520 stat->size = i_size_read(inode);
3521 stat->blocks = inode->i_blocks;
/* VFS ->fiemap: marshal the kernel's fiemap_extent_info into a contiguous
 * struct fiemap buffer, run the Lustre fiemap (ll_do_fiemap), and copy the
 * mapped extents back to userspace.
 * NOTE(review): only the first extent is copied in from userspace before the
 * call — presumably ll_do_fiemap only consumes that one as input; confirm
 * against ll_do_fiemap before changing. */
3526 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3527 __u64 start, __u64 len)
3531 struct fiemap *fiemap;
3532 unsigned int extent_count = fieinfo->fi_extents_max;
3534 num_bytes = sizeof(*fiemap) + (extent_count *
3535 sizeof(struct fiemap_extent));
3536 OBD_ALLOC_LARGE(fiemap, num_bytes);
3541 fiemap->fm_flags = fieinfo->fi_flags;
3542 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3543 fiemap->fm_start = start;
3544 fiemap->fm_length = len;
3545 if (extent_count > 0 &&
3546 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3547 sizeof(struct fiemap_extent)) != 0)
3548 GOTO(out, rc = -EFAULT);
3550 rc = ll_do_fiemap(inode, fiemap, num_bytes);
/* copy results (flags + mapped extents) back to the caller */
3552 fieinfo->fi_flags = fiemap->fm_flags;
3553 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3554 if (extent_count > 0 &&
3555 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3556 fiemap->fm_mapped_extents *
3557 sizeof(struct fiemap_extent)) != 0)
3558 GOTO(out, rc = -EFAULT);
3560 OBD_FREE_LARGE(fiemap, num_bytes);
/* ->get_acl: return a referenced copy of the cached POSIX ACL for @inode.
 * The reference taken by posix_acl_dup() is dropped by the VFS caller. */
3564 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3566 struct ll_inode_info *lli = ll_i2info(inode);
3567 struct posix_acl *acl = NULL;
/* lli_lock guards lli_posix_acl while we take our reference */
3570 spin_lock(&lli->lli_lock);
3571 /* VFS' acl_permission_check->check_acl will release the refcount */
3572 acl = posix_acl_dup(lli->lli_posix_acl);
3573 spin_unlock(&lli->lli_lock);
3578 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
/* ACL permission check callback for older generic_permission() APIs
 * (4-arg or 3-arg variants).  Without CONFIG_FS_POSIX_ACL this compiles
 * to a stub (error-return line not visible in this sampled extract). */
3580 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3581 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3583 ll_check_acl(struct inode *inode, int mask)
3586 # ifdef CONFIG_FS_POSIX_ACL
3587 struct posix_acl *acl;
3591 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
/* RCU walk cannot block to fetch the ACL; tell VFS to retry in ref-walk */
3592 if (flags & IPERM_FLAG_RCU)
3595 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3600 rc = posix_acl_permission(inode, acl, mask);
3601 posix_acl_release(acl);
3604 # else /* !CONFIG_FS_POSIX_ACL */
3606 # endif /* CONFIG_FS_POSIX_ACL */
3610 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3611 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3613 # ifdef HAVE_INODE_PERMISION_2ARGS
3614 int ll_inode_permission(struct inode *inode, int mask)
3616 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3621 struct ll_sb_info *sbi;
3622 struct root_squash_info *squash;
3623 struct cred *cred = NULL;
3624 const struct cred *old_cred = NULL;
3626 bool squash_id = false;
3629 #ifdef MAY_NOT_BLOCK
3630 if (mask & MAY_NOT_BLOCK)
3632 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3633 if (flags & IPERM_FLAG_RCU)
3637 /* as root inode are NOT getting validated in lookup operation,
3638 * need to do it before permission check. */
3640 if (inode == inode->i_sb->s_root->d_inode) {
3641 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3642 MDS_INODELOCK_LOOKUP);
3647 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3648 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3650 /* squash fsuid/fsgid if needed */
3651 sbi = ll_i2sbi(inode);
3652 squash = &sbi->ll_squash;
3653 if (unlikely(squash->rsi_uid != 0 &&
3654 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
3655 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3659 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3660 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3661 squash->rsi_uid, squash->rsi_gid);
3663 /* update current process's credentials
3664 * and FS capability */
3665 cred = prepare_creds();
3669 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
3670 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
3671 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3672 if ((1 << cap) & CFS_CAP_FS_MASK)
3673 cap_lower(cred->cap_effective, cap);
3675 old_cred = override_creds(cred);
3678 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3680 if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
3681 rc = lustre_check_remote_perm(inode, mask);
3683 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3685 /* restore current process's credentials and FS capability */
3687 revert_creds(old_cred);
3694 /* -o localflock - only provides locally consistent flock locks */
/* Default file_operations (no .flock/.lock => localflock semantics).
 * Read/write entry points vary with the kernel's iter-based I/O support. */
3695 struct file_operations ll_file_operations = {
3696 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3697 # ifdef HAVE_SYNC_READ_WRITE
3698 .read = new_sync_read,
3699 .write = new_sync_write,
3701 .read_iter = ll_file_read_iter,
3702 .write_iter = ll_file_write_iter,
3703 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3704 .read = ll_file_read,
3705 .aio_read = ll_file_aio_read,
3706 .write = ll_file_write,
3707 .aio_write = ll_file_aio_write,
3708 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3709 .unlocked_ioctl = ll_file_ioctl,
3710 .open = ll_file_open,
3711 .release = ll_file_release,
3712 .mmap = ll_file_mmap,
3713 .llseek = ll_file_seek,
3714 .splice_read = ll_file_splice_read,
/* file_operations for "-o flock" mounts: identical to ll_file_operations
 * but wires .flock/.lock to ll_file_flock for cluster-wide locking. */
3719 struct file_operations ll_file_operations_flock = {
3720 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3721 # ifdef HAVE_SYNC_READ_WRITE
3722 .read = new_sync_read,
3723 .write = new_sync_write,
3724 # endif /* HAVE_SYNC_READ_WRITE */
3725 .read_iter = ll_file_read_iter,
3726 .write_iter = ll_file_write_iter,
3727 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3728 .read = ll_file_read,
3729 .aio_read = ll_file_aio_read,
3730 .write = ll_file_write,
3731 .aio_write = ll_file_aio_write,
3732 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3733 .unlocked_ioctl = ll_file_ioctl,
3734 .open = ll_file_open,
3735 .release = ll_file_release,
3736 .mmap = ll_file_mmap,
3737 .llseek = ll_file_seek,
3738 .splice_read = ll_file_splice_read,
3741 .flock = ll_file_flock,
3742 .lock = ll_file_flock
3745 /* These are for -o noflock - to return ENOSYS on flock calls */
3746 struct file_operations ll_file_operations_noflock = {
3747 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3748 # ifdef HAVE_SYNC_READ_WRITE
3749 .read = new_sync_read,
3750 .write = new_sync_write,
3751 # endif /* HAVE_SYNC_READ_WRITE */
3752 .read_iter = ll_file_read_iter,
3753 .write_iter = ll_file_write_iter,
3754 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3755 .read = ll_file_read,
3756 .aio_read = ll_file_aio_read,
3757 .write = ll_file_write,
3758 .aio_write = ll_file_aio_write,
3759 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3760 .unlocked_ioctl = ll_file_ioctl,
3761 .open = ll_file_open,
3762 .release = ll_file_release,
3763 .mmap = ll_file_mmap,
3764 .llseek = ll_file_seek,
3765 .splice_read = ll_file_splice_read,
3768 .flock = ll_file_noflock,
3769 .lock = ll_file_noflock
/* inode_operations for regular Lustre files. */
3772 struct inode_operations ll_file_inode_operations = {
3773 .setattr = ll_setattr,
3774 .getattr = ll_getattr,
3775 .permission = ll_inode_permission,
3776 .setxattr = ll_setxattr,
3777 .getxattr = ll_getxattr,
3778 .listxattr = ll_listxattr,
3779 .removexattr = ll_removexattr,
3780 .fiemap = ll_fiemap,
3781 #ifdef HAVE_IOP_GET_ACL
3782 .get_acl = ll_get_acl,
3786 /* dynamic ioctl number support routines */
/* Registry of dynamically registered ioctl handlers: a list of llioc_data
 * entries protected by an rw_semaphore.  (The initializer line naming the
 * 'llioc' variable is omitted in this sampled extract.) */
3787 static struct llioc_ctl_data {
3788 struct rw_semaphore ioc_sem;
3789 struct list_head ioc_head;
3791 __RWSEM_INITIALIZER(llioc.ioc_sem),
3792 LIST_HEAD_INIT(llioc.ioc_head)
/* One registered handler: callback plus the ioctl cmd numbers it serves
 * (iocd_cmd is a trailing variable-length array). */
3797 struct list_head iocd_list;
3798 unsigned int iocd_size;
3799 llioc_callback_t iocd_cb;
3800 unsigned int iocd_count;
3801 unsigned int iocd_cmd[0];
/* Register a dynamic ioctl handler @cb for @count command numbers in @cmd.
 * Returns an opaque cookie (the allocated llioc_data) for later
 * ll_iocontrol_unregister(), or NULL on bad args / allocation failure. */
3804 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3807 struct llioc_data *in_data = NULL;
3810 if (cb == NULL || cmd == NULL ||
3811 count > LLIOC_MAX_CMD || count < 0)
3814 size = sizeof(*in_data) + count * sizeof(unsigned int);
3815 OBD_ALLOC(in_data, size);
3816 if (in_data == NULL)
3819 memset(in_data, 0, sizeof(*in_data));
3820 in_data->iocd_size = size;
3821 in_data->iocd_cb = cb;
3822 in_data->iocd_count = count;
3823 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
/* publish under the write lock */
3825 down_write(&llioc.ioc_sem);
3826 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3827 up_write(&llioc.ioc_sem);
/* Remove and free the handler previously returned by
 * ll_iocontrol_register(); warns if @magic is not found in the list. */
3832 void ll_iocontrol_unregister(void *magic)
3834 struct llioc_data *tmp;
3839 down_write(&llioc.ioc_sem);
3840 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3842 unsigned int size = tmp->iocd_size;
3844 list_del(&tmp->iocd_list);
/* drop the lock before freeing; the entry is already unlinked */
3845 up_write(&llioc.ioc_sem);
3847 OBD_FREE(tmp, size);
3851 up_write(&llioc.ioc_sem);
3853 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3856 EXPORT_SYMBOL(ll_iocontrol_register);
3857 EXPORT_SYMBOL(ll_iocontrol_unregister);
/* Dispatch ioctl @cmd to registered dynamic handlers in order.  Stops at the
 * first handler that returns LLIOC_STOP; its rc is stored in *rcp. */
3859 static enum llioc_iter
3860 ll_iocontrol_call(struct inode *inode, struct file *file,
3861 unsigned int cmd, unsigned long arg, int *rcp)
3863 enum llioc_iter ret = LLIOC_CONT;
3864 struct llioc_data *data;
3865 int rc = -EINVAL, i;
3867 down_read(&llioc.ioc_sem);
3868 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3869 for (i = 0; i < data->iocd_count; i++) {
3870 if (cmd != data->iocd_cmd[i])
3873 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3877 if (ret == LLIOC_STOP)
3880 up_read(&llioc.ioc_sem);
/* Push a layout configuration into the cl_object stack (cl_conf_set).
 * For OBJECT_CONF_SET, additionally allow the layout lock to be matched
 * and record the new layout generation in the inode. */
3887 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3889 struct ll_inode_info *lli = ll_i2info(inode);
3890 struct cl_object *obj = lli->lli_clob;
3899 env = cl_env_get(&refcheck);
3901 RETURN(PTR_ERR(env));
3903 rc = cl_conf_set(env, lli->lli_clob, conf);
3907 if (conf->coc_opc == OBJECT_CONF_SET) {
3908 struct ldlm_lock *lock = conf->coc_lock;
3909 struct cl_layout cl = {
3913 LASSERT(lock != NULL);
3914 LASSERT(ldlm_has_layout(lock));
3916 /* it can only be allowed to match after layout is
3917 * applied to inode otherwise false layout would be
3918 * seen. Applying layout shoud happen before dropping
3919 * the intent lock. */
3920 ldlm_lock_allow_match(lock);
3922 rc = cl_object_layout_get(env, obj, &cl);
3927 DFID": layout version change: %u -> %u\n",
3928 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3930 ll_layout_version_set(lli, cl.cl_layout_gen);
3934 cl_env_put(env, &refcheck);
3939 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
3940 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3943 struct ll_sb_info *sbi = ll_i2sbi(inode);
3944 struct ptlrpc_request *req;
3945 struct mdt_body *body;
3952 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3953 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
3954 lock->l_lvb_data, lock->l_lvb_len);
/* layout already attached to the lock: nothing to fetch */
3956 if (lock->l_lvb_data != NULL)
3959 /* if layout lock was granted right away, the layout is returned
3960 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3961 * blocked and then granted via completion ast, we have to fetch
3962 * layout here. Please note that we can't use the LVB buffer in
3963 * completion AST because it doesn't have a large enough buffer */
3964 rc = ll_get_default_mdsize(sbi, &lmmsize);
3966 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
3967 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3972 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3974 GOTO(out, rc = -EPROTO);
3976 lmmsize = body->mbo_eadatasize;
3977 if (lmmsize == 0) /* empty layout */
3980 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3982 GOTO(out, rc = -EFAULT);
/* copy the layout out of the RPC reply into a buffer owned by the lock */
3984 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3985 if (lvbdata == NULL)
3986 GOTO(out, rc = -ENOMEM);
3988 memcpy(lvbdata, lmm, lmmsize);
3989 lock_res_and_lock(lock);
/* attach only if nobody raced us; otherwise free our copy below */
3990 if (unlikely(lock->l_lvb_data == NULL)) {
3991 lock->l_lvb_type = LVB_T_LAYOUT;
3992 lock->l_lvb_data = lvbdata;
3993 lock->l_lvb_len = lmmsize;
3996 unlock_res_and_lock(lock);
3999 OBD_FREE_LARGE(lvbdata, lmmsize);
4004 ptlrpc_req_finished(req);
4009 * Apply the layout to the inode. Layout lock is held and will be released
/* Takes the granted layout lock in @lockh/@mode, fetches the layout into the
 * lock's LVB if needed (ll_layout_fetch), applies it to the cl_object stack
 * (ll_layout_conf with OBJECT_CONF_SET), then drops the lock reference.
 * If applying returned -EBUSY, waits for in-flight IO via OBJECT_CONF_WAIT.
 * NOTE(review): the extract is line-sampled; some RETURN/EXIT lines and
 * error branches between the visible statements are omitted. */
4012 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4013 struct inode *inode)
4015 struct ll_inode_info *lli = ll_i2info(inode);
4016 struct ll_sb_info *sbi = ll_i2sbi(inode);
4017 struct ldlm_lock *lock;
4018 struct cl_object_conf conf;
4021 bool wait_layout = false;
4024 LASSERT(lustre_handle_is_used(lockh));
4026 lock = ldlm_handle2lock(lockh);
4027 LASSERT(lock != NULL);
4028 LASSERT(ldlm_has_layout(lock));
4030 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4031 PFID(&lli->lli_fid), inode);
4033 /* in case this is a caching lock and reinstate with new inode */
4034 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
4036 lock_res_and_lock(lock);
4037 lvb_ready = ldlm_is_lvb_ready(lock);
4038 unlock_res_and_lock(lock);
4039 /* checking lvb_ready is racy but this is okay. The worst case is
4040 * that multi processes may configure the file on the same time. */
4045 rc = ll_layout_fetch(inode, lock);
4049 /* for layout lock, lmm is stored in lock's lvb.
4050 * lvb_data is immutable if the lock is held so it's safe to access it
4053 * set layout to file. Unlikely this will fail as old layout was
4054 * surely eliminated */
4055 memset(&conf, 0, sizeof conf);
4056 conf.coc_opc = OBJECT_CONF_SET;
4057 conf.coc_inode = inode;
4058 conf.coc_lock = lock;
4059 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
4060 conf.u.coc_layout.lb_len = lock->l_lvb_len;
4061 rc = ll_layout_conf(inode, &conf);
4063 /* refresh layout failed, need to wait */
4064 wait_layout = rc == -EBUSY;
4068 LDLM_LOCK_PUT(lock);
4069 ldlm_lock_decref(lockh, mode);
4071 /* wait for IO to complete if it's still being used. */
4073 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
4074 ll_get_fsname(inode->i_sb, NULL, 0),
4075 PFID(&lli->lli_fid), inode);
4077 memset(&conf, 0, sizeof conf);
4078 conf.coc_opc = OBJECT_CONF_WAIT;
4079 conf.coc_inode = inode;
4080 rc = ll_layout_conf(inode, &conf);
4084 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4085 ll_get_fsname(inode->i_sb, NULL, 0),
4086 PFID(&lli->lli_fid), rc);
/*
 * Obtain a LAYOUT ibits lock on @inode — matching a locally cached lock
 * first, otherwise enqueuing a new one on the MDT — and apply the layout
 * carried in its LVB via ll_layout_lock_set().
 *
 * Caller holds lli->lli_layout_mutex (taken in ll_layout_refresh()) so
 * only one thread enqueues the layout lock at a time.
 * Returns 0 on success, negative errno on failure.
 */
4091 static int ll_layout_refresh_locked(struct inode *inode)
4093 struct ll_inode_info *lli = ll_i2info(inode);
4094 struct ll_sb_info *sbi = ll_i2sbi(inode);
4095 struct md_op_data *op_data;
4096 struct lookup_intent it;
4097 struct lustre_handle lockh;
4098 enum ldlm_mode mode;
/* Inode-bits lock with the standard MD blocking/completion ASTs. */
4099 struct ldlm_enqueue_info einfo = {
4100 .ei_type = LDLM_IBITS,
4102 .ei_cb_bl = &ll_md_blocking_ast,
4103 .ei_cb_cp = &ldlm_completion_ast,
4109 /* mostly layout lock is caching on the local side, so try to match
4110 * it before grabbing layout lock mutex. */
4111 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4112 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4113 if (mode != 0) { /* hit cached lock */
4114 rc = ll_layout_lock_set(&lockh, mode, inode);
4121 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4122 0, 0, LUSTRE_OPC_ANY, NULL);
4123 if (IS_ERR(op_data))
4124 RETURN(PTR_ERR(op_data));
4126 /* have to enqueue one */
4127 memset(&it, 0, sizeof(it));
4128 it.it_op = IT_LAYOUT;
4129 lockh.cookie = 0ULL;
4131 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4132 ll_get_fsname(inode->i_sb, NULL, 0),
4133 PFID(&lli->lli_fid), inode);
4135 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, &it, op_data, &lockh, 0);
/* The enqueue reply is no longer needed: the lock state lives in 'it'
 * and 'lockh'. Drop the request reference. */
4136 if (it.it_request != NULL)
4137 ptlrpc_req_finished(it.it_request);
4138 it.it_request = NULL;
4140 ll_finish_md_op_data(op_data);
/* Take over the lock mode from the intent; clearing it_lock_mode first
 * presumably keeps ll_intent_drop_lock() from releasing the lock we are
 * about to use — TODO(review) confirm against ll_intent_drop_lock(). */
4142 mode = it.it_lock_mode;
4143 it.it_lock_mode = 0;
4144 ll_intent_drop_lock(&it);
4147 /* set lock data in case this is a new lock */
4148 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4149 rc = ll_layout_lock_set(&lockh, mode, inode);
4158 * This function checks if there exists a LAYOUT lock on the client side,
4159 * or enqueues it if it doesn't have one in cache.
4161 * This function will not hold layout lock so it may be revoked any time after
4162 * this function returns. Any operations depend on layout should be redone
4165 * This function should be called before lov_io_init() to get an uptodate
4166 * layout version, the caller should save the version number and after IO
4167 * is finished, this function should be called again to verify that layout
4168 * is not changed during IO time.
4170 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4172 struct ll_inode_info *lli = ll_i2info(inode);
4173 struct ll_sb_info *sbi = ll_i2sbi(inode);
/* Fast path: layout lock support disabled, or a valid layout generation
 * is already cached — no enqueue needed.
 * NOTE(review): early-return body elided in this excerpt. */
4177 *gen = ll_layout_version_get(lli);
4178 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
/* Layout locks apply only to regular files with sane FIDs. */
4182 LASSERT(fid_is_sane(ll_inode2fid(inode)));
4183 LASSERT(S_ISREG(inode->i_mode));
4185 /* take layout lock mutex to enqueue layout lock exclusively. */
4186 mutex_lock(&lli->lli_layout_mutex);
4188 rc = ll_layout_refresh_locked(inode);
/* Publish the (possibly updated) layout generation to the caller. */
4192 *gen = ll_layout_version_get(lli);
4194 mutex_unlock(&lli->lli_layout_mutex);
4200 * This function send a restore request to the MDT
4202 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4204 struct hsm_user_request *hur;
4208 len = sizeof(struct hsm_user_request) +
4209 sizeof(struct hsm_user_item);
4210 OBD_ALLOC(hur, len);
4214 hur->hur_request.hr_action = HUA_RESTORE;
4215 hur->hur_request.hr_archive_id = 0;
4216 hur->hur_request.hr_flags = 0;
4217 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4218 sizeof(hur->hur_user_item[0].hui_fid));
4219 hur->hur_user_item[0].hui_extent.offset = offset;
4220 hur->hur_user_item[0].hui_extent.length = length;
4221 hur->hur_request.hr_itemcount = 1;
4222 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,