4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2015, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <linux/pagemap.h>
46 #include <linux/file.h>
47 #include <linux/sched.h>
48 #include <linux/user_namespace.h>
49 #ifdef HAVE_UIDGID_HEADER
50 # include <linux/uidgid.h>
52 #include <lustre/ll_fiemap.h>
54 #include <lustre_ioctl.h>
55 #include <lustre_swab.h>
57 #include "cl_object.h"
58 #include "llite_internal.h"
59 #include "vvp_internal.h"
62 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
64 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
67 static enum llioc_iter
68 ll_iocontrol_call(struct inode *inode, struct file *file,
69 unsigned int cmd, unsigned long arg, int *rcp);
/* Allocate a per-open-file ll_file_data from its slab cache using GFP_NOFS
 * (safe under filesystem memory pressure).
 * NOTE(review): lines are elided in this view; a NULL check on the slab
 * allocation presumably sits between the alloc and the field init — confirm
 * against the full source. */
71 static struct ll_file_data *ll_file_data_get(void)
73 	struct ll_file_data *fd;
75 	OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
79 	fd->fd_write_failed = false;
/* Release a ll_file_data back to its slab cache; counterpart of
 * ll_file_data_get(). */
84 static void ll_file_data_put(struct ll_file_data *fd)
87 	OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
91 * Packs all the attributes into @op_data for the CLOSE rpc.
/* Pack the inode's current attributes (mode, times, size, blocks, flags) and
 * the open handle into @op_data for the CLOSE RPC to the MDT. */
93 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
94 			     struct obd_client_handle *och)
98 	ll_prep_md_op_data(op_data, inode, NULL, NULL,
99 			   0, 0, LUSTRE_OPC_ANY, NULL);
101 	op_data->op_attr.ia_mode = inode->i_mode;
102 	op_data->op_attr.ia_atime = inode->i_atime;
103 	op_data->op_attr.ia_mtime = inode->i_mtime;
104 	op_data->op_attr.ia_ctime = inode->i_ctime;
105 	op_data->op_attr.ia_size = i_size_read(inode);
/* Mark every packed attribute valid so the MDT applies all of them. */
106 	op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
107 				     ATTR_MTIME | ATTR_MTIME_SET |
108 				     ATTR_CTIME | ATTR_CTIME_SET;
109 	op_data->op_attr_blocks = inode->i_blocks;
110 	op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
111 	op_data->op_handle = och->och_fh;
113 	if (och->och_flags & FMODE_WRITE &&
114 	    ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
115 		/* For HSM: if inode data has been modified, pack it so that
116 		 * MDT can set data dirty flag in the archive. */
117 		op_data->op_bias |= MDS_DATA_MODIFIED;
123 * Perform a close, possibly with a bias.
124 * The meaning of "data" depends on the value of "bias".
126 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
127 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
/* Send the CLOSE RPC for @och, optionally with a bias (HSM release or layout
 * swap) whose @data interpretation depends on @bias — see the header comment
 * above. Frees @och's replay data and poisons its handle on the way out.
 * NOTE(review): this view elides several lines (the switch statement head,
 * GOTO/EXIT paths, rc declaration); comments below hedge accordingly. */
130 static int ll_close_inode_openhandle(struct inode *inode,
131 				     struct obd_client_handle *och,
132 				     enum mds_op_bias bias, void *data)
134 	struct obd_export *md_exp = ll_i2mdexp(inode);
135 	const struct ll_inode_info *lli = ll_i2info(inode);
136 	struct md_op_data *op_data;
137 	struct ptlrpc_request *req = NULL;
/* Sanity: bail out if the MDC export is no longer connected. */
141 	if (class_exp2obd(md_exp) == NULL) {
142 		CERROR("%s: invalid MDC connection handle closing "DFID"\n",
143 		       ll_get_fsname(inode->i_sb, NULL, 0),
144 		       PFID(&lli->lli_fid));
148 	OBD_ALLOC_PTR(op_data);
149 	/* We leak openhandle and request here on error, but not much to be
150 	 * done in OOM case since app won't retry close on error either. */
152 		GOTO(out, rc = -ENOMEM);
154 	ll_prepare_close(inode, op_data, och);
/* Presumably a switch (bias) — the switch head is elided in this view. */
156 	case MDS_CLOSE_LAYOUT_SWAP:
157 		LASSERT(data != NULL);
158 		op_data->op_bias |= MDS_CLOSE_LAYOUT_SWAP;
159 		op_data->op_data_version = 0;
160 		op_data->op_lease_handle = och->och_lease_handle;
/* @data is the second inode whose layout we swap with. */
161 		op_data->op_fid2 = *ll_inode2fid(data);
164 	case MDS_HSM_RELEASE:
165 		LASSERT(data != NULL);
166 		op_data->op_bias |= MDS_HSM_RELEASE;
/* @data is a pointer to the data version to be archived. */
167 		op_data->op_data_version = *(__u64 *)data;
168 		op_data->op_lease_handle = och->och_lease_handle;
169 		op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
173 		LASSERT(data == NULL);
177 	rc = md_close(md_exp, op_data, och->och_mod, &req);
178 	if (rc != 0 && rc != -EINTR)
179 		CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
180 		       md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* For biased closes, check whether the MDT actually executed the intent. */
183 	    op_data->op_bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)) {
184 		struct mdt_body *body;
186 		body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
187 		if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
191 	ll_finish_md_op_data(op_data);
195 	md_clear_open_replay_data(md_exp, och);
/* Poison the file handle so stale use is detectable. */
196 	och->och_fh.cookie = DEAD_HANDLE_MAGIC;
199 	ptlrpc_req_finished(req); /* This is close request */
/* Close the MDS open handle of the given mode (write/exec/read) if this was
 * the last user; otherwise just return with the handle kept open.
 * NOTE(review): och_usecount declaration and the handle-detach lines are
 * elided in this view. */
203 int ll_md_real_close(struct inode *inode, fmode_t fmode)
205 	struct ll_inode_info *lli = ll_i2info(inode);
206 	struct obd_client_handle **och_p;
207 	struct obd_client_handle *och;
/* Pick the per-mode handle slot and its use count. */
212 	if (fmode & FMODE_WRITE) {
213 		och_p = &lli->lli_mds_write_och;
214 		och_usecount = &lli->lli_open_fd_write_count;
215 	} else if (fmode & FMODE_EXEC) {
216 		och_p = &lli->lli_mds_exec_och;
217 		och_usecount = &lli->lli_open_fd_exec_count;
219 		LASSERT(fmode & FMODE_READ);
220 		och_p = &lli->lli_mds_read_och;
221 		och_usecount = &lli->lli_open_fd_read_count;
224 	mutex_lock(&lli->lli_och_mutex);
225 	if (*och_usecount > 0) {
226 		/* There are still users of this handle, so skip
228 		mutex_unlock(&lli->lli_och_mutex);
234 	mutex_unlock(&lli->lli_och_mutex);
237 	/* There might be a race and this handle may already
239 		rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Per-file-descriptor close: drop group lock and lease if held, decrement
 * the per-mode open count, and only talk to the MDS (ll_md_real_close) when
 * no matching OPEN lock is cached locally. Frees the ll_file_data last. */
245 static int ll_md_close(struct inode *inode, struct file *file)
247 	union ldlm_policy_data policy = {
248 		.l_inodebits	= { MDS_INODELOCK_OPEN },
/* TEST_LOCK: only probe the lock, do not take a reference. */
250 	__u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
251 	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
252 	struct ll_inode_info *lli = ll_i2info(inode);
253 	struct lustre_handle lockh;
254 	enum ldlm_mode lockmode;
258 	/* clear group lock, if present */
259 	if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
260 		ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
262 	if (fd->fd_lease_och != NULL) {
265 		/* Usually the lease is not released when the
266 		 * application crashed, we need to release here. */
267 		rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
268 		CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
269 		       PFID(&lli->lli_fid), rc, lease_broken);
271 		fd->fd_lease_och = NULL;
274 	if (fd->fd_och != NULL) {
275 		rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
280 	/* Let's see if we have good enough OPEN lock on the file and if
281 	   we can skip talking to MDS */
282 	mutex_lock(&lli->lli_och_mutex);
/* NOTE(review): lockmode assignments per branch are elided in this view. */
283 	if (fd->fd_omode & FMODE_WRITE) {
285 		LASSERT(lli->lli_open_fd_write_count);
286 		lli->lli_open_fd_write_count--;
287 	} else if (fd->fd_omode & FMODE_EXEC) {
289 		LASSERT(lli->lli_open_fd_exec_count);
290 		lli->lli_open_fd_exec_count--;
293 		LASSERT(lli->lli_open_fd_read_count);
294 		lli->lli_open_fd_read_count--;
296 	mutex_unlock(&lli->lli_och_mutex);
/* No cached OPEN ibits lock => must do the real close against the MDS. */
298 	if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
299 			   LDLM_IBITS, &policy, lockmode, &lockh))
300 		rc = ll_md_real_close(inode, fd->fd_omode);
303 	LUSTRE_FPRIVATE(file) = NULL;
304 	ll_file_data_put(fd);
309 /* While this returns an error code, fput() the caller does not, so we need
310 * to make every effort to clean up all of our state here. Also, applications
311 * rarely check close errors and even if an error is returned they will not
312 * re-try the close call.
/* VFS ->release() hook. Cleans up remote-ACL state, statahead authorization,
 * async write errors, and finally the MDS open handle via ll_md_close().
 * The root dentry is special-cased: it only frees the ll_file_data. */
314 int ll_file_release(struct inode *inode, struct file *file)
316 	struct ll_file_data *fd;
317 	struct ll_sb_info *sbi = ll_i2sbi(inode);
318 	struct ll_inode_info *lli = ll_i2info(inode);
322 	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
323 	       PFID(ll_inode2fid(inode)), inode);
325 #ifdef CONFIG_FS_POSIX_ACL
/* Remote-client ACL bookkeeping only applies to the filesystem root. */
326 	if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
327 	    inode == inode->i_sb->s_root->d_inode) {
328 		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
331 		if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
332 			fd->fd_flags &= ~LL_FILE_RMTACL;
333 			rct_del(&sbi->ll_rct, current_pid());
334 			et_search_free(&sbi->ll_et, current_pid());
/* Don't count releases of the root dentry in /proc stats. */
339 	if (inode->i_sb->s_root != file->f_path.dentry)
340 		ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
341 	fd = LUSTRE_FPRIVATE(file);
344 	/* The last ref on @file, maybe not the the owner pid of statahead,
345 	 * because parent and child process can share the same file handle. */
346 	if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
347 		ll_deauthorize_statahead(inode, fd);
349 	if (inode->i_sb->s_root == file->f_path.dentry) {
350 		LUSTRE_FPRIVATE(file) = NULL;
351 		ll_file_data_put(fd);
/* Surface any asynchronous write error recorded on the cl_object. */
355 	if (!S_ISDIR(inode->i_mode)) {
356 		if (lli->lli_clob != NULL)
357 			lov_read_and_clear_async_rc(lli->lli_clob);
358 		lli->lli_async_rc = 0;
361 	rc = ll_md_close(inode, file);
363 	if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
364 		libcfs_debug_dumplog();
/* Issue an intent-OPEN enqueue to the MDS for @file, packing the striping
 * @lmm (if any) and, for servers without open-by-fid, the dentry name.
 * On success fills the inode from the reply and attaches the lock data. */
369 static int ll_intent_file_open(struct file *file, void *lmm, int lmmsize,
370 			       struct lookup_intent *itp)
372 	struct dentry *de = file->f_path.dentry;
373 	struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
374 	struct dentry *parent = de->d_parent;
375 	const char *name = NULL;
377 	struct md_op_data *op_data;
378 	struct ptlrpc_request *req = NULL;
382 	LASSERT(parent != NULL);
383 	LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
385 	/* if server supports open-by-fid, or file name is invalid, don't pack
386 	 * name in open request */
387 	if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
388 	    lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
389 		name = de->d_name.name;
390 		len = de->d_name.len;
393 	op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
394 				     name, len, 0, LUSTRE_OPC_ANY, NULL);
396 		RETURN(PTR_ERR(op_data));
397 	op_data->op_data = lmm;
398 	op_data->op_data_size = lmmsize;
400 	rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
401 			    &ll_md_blocking_ast, 0);
402 	ll_finish_md_op_data(op_data);
404 		/* reason for keep own exit path - don`t flood log
405 		 * with messages with -ESTALE errors.
/* On -ESTALE (presumably — the if is elided here), release any openhandle
 * that was granted despite the error before bailing out. */
407 		if (!it_disposition(itp, DISP_OPEN_OPEN) ||
408 		    it_open_error(DISP_OPEN_OPEN, itp))
410 		ll_release_openhandle(de, itp);
414 	if (it_disposition(itp, DISP_LOOKUP_NEG))
415 		GOTO(out, rc = -ENOENT);
417 	if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
418 		rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
419 		CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
423 	rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
424 	if (!rc && itp->it_lock_mode)
425 		ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
428 	ptlrpc_req_finished(req);
429 	ll_intent_drop_lock(itp);
/* Fill an obd_client_handle from the MDT reply body of an intent open and
 * register it for open replay. Returns md_set_open_replay_data()'s rc. */
434 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
435 		       struct obd_client_handle *och)
437 	struct mdt_body *body;
439 	body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
440 	och->och_fh = body->mbo_handle;
441 	och->och_fid = body->mbo_fid1;
442 	och->och_lease_handle.cookie = it->it_lock_handle;
443 	och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
444 	och->och_flags = it->it_flags;
446 	return md_set_open_replay_data(md_exp, och, it);
/* Finish the client-side part of an open: fill @och (when given) from the
 * intent reply, then install @fd as the file's private data and initialize
 * its readahead state, open mode, and cl_context bookkeeping. */
449 static int ll_local_open(struct file *file, struct lookup_intent *it,
450 			 struct ll_file_data *fd, struct obd_client_handle *och)
452 	struct inode *inode = file->f_path.dentry->d_inode;
455 	LASSERT(!LUSTRE_FPRIVATE(file));
/* NOTE(review): the `if (och)` guard around this fill is elided here. */
462 		rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
467 	LUSTRE_FPRIVATE(file) = fd;
468 	ll_readahead_init(inode, &fd->fd_ras);
469 	fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
471 	/* ll_cl_context initialize */
472 	rwlock_init(&fd->fd_lock);
473 	INIT_LIST_HEAD(&fd->fd_lccs);
478 /* Open a file, and (for the very first open) create objects on the OSTs at
479 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
480 * creation or open until ll_lov_setstripe() ioctl is called.
482 * If we already have the stripe MD locally then we don't request it in
483 * md_open(), by passing a lmm_size = 0.
485 * It is up to the application to ensure no other processes open this file
486 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
487 * used. We might be able to avoid races of that sort by getting lli_open_sem
488 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
489 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/* VFS ->open() hook. Either reuses an intent prepared by lookup (atomic
 * open path) or builds one from f_flags and enqueues it itself; maintains
 * the per-mode MDS open-handle cache under lli_och_mutex.
 * NOTE(review): many branch/exit lines are elided in this view — comments
 * on control flow below are best-effort and should be confirmed. */
491 int ll_file_open(struct inode *inode, struct file *file)
493 	struct ll_inode_info *lli = ll_i2info(inode);
494 	struct lookup_intent *it, oit = { .it_op = IT_OPEN,
495 					  .it_flags = file->f_flags };
496 	struct obd_client_handle **och_p = NULL;
497 	__u64 *och_usecount = NULL;
498 	struct ll_file_data *fd;
502 	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
503 	       PFID(ll_inode2fid(inode)), inode, file->f_flags);
505 	it = file->private_data; /* XXX: compat macro */
506 	file->private_data = NULL; /* prevent ll_local_open assertion */
508 	fd = ll_file_data_get();
510 		GOTO(out_openerr, rc = -ENOMEM);
513 	if (S_ISDIR(inode->i_mode))
514 		ll_authorize_statahead(inode, fd);
/* Root dentry: nothing to open on the MDS, just stash the fd. */
516 	if (inode->i_sb->s_root == file->f_path.dentry) {
517 		LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup: build our own from f_flags. */
521 	if (!it || !it->it_disposition) {
522 		/* Convert f_flags into access mode. We cannot use file->f_mode,
523 		 * because everything but O_ACCMODE mask was stripped from
/* O_ACCMODE encoding trick: flags+1 turns O_RDONLY/O_WRONLY/O_RDWR
 * into testable FMODE bits. */
525 		if ((oit.it_flags + 1) & O_ACCMODE)
527 		if (file->f_flags & O_TRUNC)
528 			oit.it_flags |= FMODE_WRITE;
530 		/* kernel only call f_op->open in dentry_open.  filp_open calls
531 		 * dentry_open after call to open_namei that checks permissions.
532 		 * Only nfsd_open call dentry_open directly without checking
533 		 * permissions and because of that this code below is safe. */
534 		if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
535 			oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
537 		/* We do not want O_EXCL here, presumably we opened the file
538 		 * already? XXX - NFS implications? */
539 		oit.it_flags &= ~O_EXCL;
541 		/* bug20584, if "it_flags" contains O_CREAT, the file will be
542 		 * created if necessary, then "IT_CREAT" should be set to keep
543 		 * consistent with it */
544 		if (oit.it_flags & O_CREAT)
545 			oit.it_op |= IT_CREAT;
551 	/* Let's see if we have file open on MDS already. */
552 	if (it->it_flags & FMODE_WRITE) {
553 		och_p = &lli->lli_mds_write_och;
554 		och_usecount = &lli->lli_open_fd_write_count;
555 	} else if (it->it_flags & FMODE_EXEC) {
556 		och_p = &lli->lli_mds_exec_och;
557 		och_usecount = &lli->lli_open_fd_exec_count;
559 		och_p = &lli->lli_mds_read_och;
560 		och_usecount = &lli->lli_open_fd_read_count;
563 	mutex_lock(&lli->lli_och_mutex);
564 	if (*och_p) { /* Open handle is present */
565 		if (it_disposition(it, DISP_OPEN_OPEN)) {
566 			/* Well, there's extra open request that we do not need,
567 			   let's close it somehow. This will decref request. */
568 			rc = it_open_error(DISP_OPEN_OPEN, it);
570 				mutex_unlock(&lli->lli_och_mutex);
571 				GOTO(out_openerr, rc);
574 			ll_release_openhandle(file->f_path.dentry, it);
/* Reuse the cached handle: no och passed to ll_local_open. */
578 		rc = ll_local_open(file, it, fd, NULL);
581 			mutex_unlock(&lli->lli_och_mutex);
582 			GOTO(out_openerr, rc);
585 		LASSERT(*och_usecount == 0);
586 		if (!it->it_disposition) {
587 			/* We cannot just request lock handle now, new ELC code
588 			   means that one of other OPEN locks for this file
589 			   could be cancelled, and since blocking ast handler
590 			   would attempt to grab och_mutex as well, that would
591 			   result in a deadlock */
592 			mutex_unlock(&lli->lli_och_mutex);
594 			 * Normally called under two situations:
596 			 * 2. A race/condition on MDS resulting in no open
597 			 *    handle to be returned from LOOKUP|OPEN request,
598 			 *    for example if the target entry was a symlink.
600 			 * Always fetch MDS_OPEN_LOCK if this is not setstripe.
602 			 * Always specify MDS_OPEN_BY_FID because we don't want
603 			 * to get file with different fid.
605 			it->it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID;
606 			rc = ll_intent_file_open(file, NULL, 0, it);
608 				GOTO(out_openerr, rc);
612 		OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
614 			GOTO(out_och_free, rc = -ENOMEM);
618 		/* md_intent_lock() didn't get a request ref if there was an
619 		 * open error, so don't do cleanup on the request here
621 		/* XXX (green): Should not we bail out on any error here, not
622 		 * just open error? */
623 		rc = it_open_error(DISP_OPEN_OPEN, it);
625 			GOTO(out_och_free, rc);
627 		LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
628 			 "inode %p: disposition %x, status %d\n", inode,
629 			 it_disposition(it, ~0), it->it_status);
631 		rc = ll_local_open(file, it, fd, *och_p);
633 			GOTO(out_och_free, rc);
635 	mutex_unlock(&lli->lli_och_mutex);
638 	/* Must do this outside lli_och_mutex lock to prevent deadlock where
639 	   different kind of OPEN lock for this same inode gets cancelled
640 	   by ldlm_cancel_lru */
641 	if (!S_ISREG(inode->i_mode))
642 		GOTO(out_och_free, rc);
644 	cl_lov_delay_create_clear(&file->f_flags);
645 	GOTO(out_och_free, rc);
/* Error/cleanup labels below (out_och_free / out_openerr — labels elided). */
649 		if (och_p && *och_p) {
650 			OBD_FREE(*och_p, sizeof (struct obd_client_handle));
651 			*och_p = NULL; /* OBD_FREE writes some magic there */
654 		mutex_unlock(&lli->lli_och_mutex);
657 		if (lli->lli_opendir_key == fd)
658 			ll_deauthorize_statahead(inode, fd);
660 			ll_file_data_put(fd);
662 		ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* Drop the extra request reference taken by the intent enqueue. */
665 	if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
666 		ptlrpc_req_finished(it->it_request);
667 		it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* Blocking AST for lease locks: on conflict, asynchronously cancel the lease
 * lock (which is how a lease "breaks"); the CANCELING case does nothing
 * extra here — openhandle cleanup is done by the lease holder. */
673 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
674 			struct ldlm_lock_desc *desc, void *data, int flag)
677 	struct lustre_handle lockh;
681 	case LDLM_CB_BLOCKING:
682 		ldlm_lock2handle(lock, &lockh);
683 		rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
685 			CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
689 	case LDLM_CB_CANCELING:
697 * Acquire a lease and open the file.
/* Acquire a lease (read or write) on @inode and open it, returning the new
 * open handle. When @file is given, the existing openhandle is reused only
 * if this fd is its sole opener; its handle is passed to the MDT so the
 * server can verify same-owner reuse.
 * NOTE(review): several error-path lines are elided in this view. */
699 static struct obd_client_handle *
700 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
703 	struct lookup_intent it = { .it_op = IT_OPEN };
704 	struct ll_sb_info *sbi = ll_i2sbi(inode);
705 	struct md_op_data *op_data;
706 	struct ptlrpc_request *req = NULL;
707 	struct lustre_handle old_handle = { 0 };
708 	struct obd_client_handle *och = NULL;
/* Leases are exclusive to plain read or write mode. */
713 	if (fmode != FMODE_WRITE && fmode != FMODE_READ)
714 		RETURN(ERR_PTR(-EINVAL));
717 		struct ll_inode_info *lli = ll_i2info(inode);
718 		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
719 		struct obd_client_handle **och_p;
/* Requested lease mode must match how the file was opened; exec is out. */
722 		if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
723 			RETURN(ERR_PTR(-EPERM));
725 		/* Get the openhandle of the file */
727 		mutex_lock(&lli->lli_och_mutex);
/* Only one lease per file descriptor. */
728 		if (fd->fd_lease_och != NULL) {
729 			mutex_unlock(&lli->lli_och_mutex);
733 		if (fd->fd_och == NULL) {
734 			if (file->f_mode & FMODE_WRITE) {
735 				LASSERT(lli->lli_mds_write_och != NULL);
736 				och_p = &lli->lli_mds_write_och;
737 				och_usecount = &lli->lli_open_fd_write_count;
739 				LASSERT(lli->lli_mds_read_och != NULL);
740 				och_p = &lli->lli_mds_read_och;
741 				och_usecount = &lli->lli_open_fd_read_count;
/* Sole opener: steal the cached handle for this fd (lines elided). */
743 			if (*och_usecount == 1) {
750 		mutex_unlock(&lli->lli_och_mutex);
751 		if (rc < 0) /* more than 1 opener */
754 		LASSERT(fd->fd_och != NULL);
755 		old_handle = fd->fd_och->och_fh;
/* och allocation failed (OBD_ALLOC_PTR elided above). */
760 		RETURN(ERR_PTR(-ENOMEM));
762 	op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
763 				     LUSTRE_OPC_ANY, NULL);
765 		GOTO(out, rc = PTR_ERR(op_data));
767 	/* To tell the MDT this openhandle is from the same owner */
768 	op_data->op_handle = old_handle;
770 	it.it_flags = fmode | open_flags;
771 	it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
772 	rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
773 			    &ll_md_blocking_lease_ast,
774 	/* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
775 	 * it can be cancelled which may mislead applications that the lease is
777 	 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
778 	 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
779 	 * doesn't deal with openhandle, so normal openhandle will be leaked. */
780 			    LDLM_FL_NO_LRU | LDLM_FL_EXCL);
781 	ll_finish_md_op_data(op_data);
782 	ptlrpc_req_finished(req);
784 		GOTO(out_release_it, rc);
786 	if (it_disposition(&it, DISP_LOOKUP_NEG))
787 		GOTO(out_release_it, rc = -ENOENT);
789 	rc = it_open_error(DISP_OPEN_OPEN, &it);
791 		GOTO(out_release_it, rc);
793 	LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
794 	ll_och_fill(sbi->ll_md_exp, &it, och);
796 	if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
797 		GOTO(out_close, rc = -EOPNOTSUPP);
799 	/* already get lease, handle lease lock */
800 	ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
801 	if (it.it_lock_mode == 0 ||
802 	    it.it_lock_bits != MDS_INODELOCK_OPEN) {
803 		/* open lock must return for lease */
804 		CERROR(DFID "lease granted but no open lock, %d/"LPU64".\n",
805 		       PFID(ll_inode2fid(inode)), it.it_lock_mode,
807 		GOTO(out_close, rc = -EPROTO);
810 	ll_intent_release(&it);
/* out_close: undo the open when the lease could not be established. */
814 	/* Cancel open lock */
815 	if (it.it_lock_mode != 0) {
816 		ldlm_lock_decref_and_cancel(&och->och_lease_handle,
819 		och->och_lease_handle.cookie = 0ULL;
821 	rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
823 		CERROR("%s: error closing file "DFID": %d\n",
824 		       ll_get_fsname(inode->i_sb, NULL, 0),
825 		       PFID(&ll_i2info(inode)->lli_fid), rc2);
826 	och = NULL; /* och has been freed in ll_close_inode_openhandle() */
828 	ll_intent_release(&it);
836 * Check whether a layout swap can be done between two inodes.
838 * \param[in] inode1 First inode to check
839 * \param[in] inode2 Second inode to check
841 * \retval 0 on success, layout swap can be performed between both inodes
842 * \retval negative error code if requirements are not met
/* Validate that a layout swap is allowed: both inodes must be regular files
 * on the same filesystem and writable by the caller.
 * NOTE(review): the specific error codes returned per check are elided. */
844 static int ll_check_swap_layouts_validity(struct inode *inode1,
845 					  struct inode *inode2)
847 	if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
850 	if (inode_permission(inode1, MAY_WRITE) ||
851 	    inode_permission(inode2, MAY_WRITE))
854 	if (inode1->i_sb != inode2->i_sb)
/* Close @inode with the MDS_CLOSE_LAYOUT_SWAP bias so the MDT atomically
 * swaps layouts between @inode and @inode2. Identical FIDs are rejected. */
860 static int ll_swap_layouts_close(struct obd_client_handle *och,
861 				 struct inode *inode, struct inode *inode2)
863 	const struct lu_fid	*fid1 = ll_inode2fid(inode);
864 	const struct lu_fid	*fid2;
868 	CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
869 	       ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
871 	rc = ll_check_swap_layouts_validity(inode, inode2);
873 		GOTO(out_free_och, rc);
875 	/* We now know that inode2 is a lustre inode */
876 	fid2 = ll_inode2fid(inode2);
878 	rc = lu_fid_cmp(fid1, fid2);
/* Swapping a file with itself makes no sense. */
880 		GOTO(out_free_och, rc = -EINVAL);
882 	/* Close the file and swap layouts between inode & inode2.
883 	 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
884 	 * because we still need it to pack l_remote_handle to MDT. */
885 	rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
888 	och = NULL; /* freed in ll_close_inode_openhandle() */
898 * Release lease and close the file.
899 * It will check if the lease has ever broken.
/* Release a lease: report via @lease_broken whether the lease lock was
 * already cancelled (lease broken), cancel it if still held, and close the
 * underlying open handle. */
901 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
904 	struct ldlm_lock *lock;
905 	bool cancelled = true;
909 	lock = ldlm_handle2lock(&och->och_lease_handle);
/* If the handle still resolves to a lock, read its CANCEL flag under the
 * resource lock (NULL-check presumably elided in this view). */
911 		lock_res_and_lock(lock);
912 		cancelled = ldlm_is_cancel(lock);
913 		unlock_res_and_lock(lock);
917 	CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
918 	       PFID(&ll_i2info(inode)->lli_fid), cancelled);
921 		ldlm_cli_cancel(&och->och_lease_handle, 0);
922 	if (lease_broken != NULL)
923 		*lease_broken = cancelled;
925 	rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Merge MDS-provided timestamps with OST-side attributes (size, blocks,
 * and the newest of each timestamp) into the VFS inode, under the inode
 * size lock. */
929 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
931 	struct ll_inode_info *lli = ll_i2info(inode);
932 	struct cl_object *obj = lli->lli_clob;
933 	struct cl_attr *attr = vvp_env_thread_attr(env);
941 	ll_inode_size_lock(inode);
943 	/* merge timestamps the most recently obtained from mds with
944 	   timestamps obtained from osts */
945 	LTIME_S(inode->i_atime) = lli->lli_atime;
946 	LTIME_S(inode->i_mtime) = lli->lli_mtime;
947 	LTIME_S(inode->i_ctime) = lli->lli_ctime;
949 	atime = LTIME_S(inode->i_atime);
950 	mtime = LTIME_S(inode->i_mtime);
951 	ctime = LTIME_S(inode->i_ctime);
953 	cl_object_attr_lock(obj);
954 	rc = cl_object_attr_get(env, obj, attr);
955 	cl_object_attr_unlock(obj);
958 		GOTO(out_size_unlock, rc);
/* OST-side values win only when they are newer. */
960 	if (atime < attr->cat_atime)
961 		atime = attr->cat_atime;
963 	if (ctime < attr->cat_ctime)
964 		ctime = attr->cat_ctime;
966 	if (mtime < attr->cat_mtime)
967 		mtime = attr->cat_mtime;
969 	CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
970 	       PFID(&lli->lli_fid), attr->cat_size);
972 	i_size_write(inode, attr->cat_size);
973 	inode->i_blocks = attr->cat_blocks;
975 	LTIME_S(inode->i_atime) = atime;
976 	LTIME_S(inode->i_mtime) = mtime;
977 	LTIME_S(inode->i_ctime) = ctime;
980 	ll_inode_size_unlock(inode);
/* Return true when atime updates should be suppressed for this open file,
 * mirroring the kernel's file_accessed()/touch_atime() checks (O_NOATIME,
 * S_NOATIME, mount noatime/ro, and nodiratime for directories).
 * NOTE(review): the `return` statements per branch are elided in this view. */
985 static bool file_is_noatime(const struct file *file)
987 	const struct vfsmount *mnt = file->f_path.mnt;
988 	const struct inode *inode = file->f_path.dentry->d_inode;
990 	/* Adapted from file_accessed() and touch_atime().*/
991 	if (file->f_flags & O_NOATIME)
994 	if (inode->i_flags & S_NOATIME)
997 	if (IS_NOATIME(inode))
1000 	if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1003 	if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1006 	if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/* Initialize a cl_io from the file's open flags: nonblocking, append/sync
 * for writes, lock-request policy (never for nolock files, mandatory for
 * O_APPEND, maybe otherwise), and noatime handling. */
1012 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
1014 	struct inode *inode = file->f_path.dentry->d_inode;
1016 	io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
/* Write-only settings (the `if (write)` guard is elided in this view). */
1018 		io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1019 		io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1020 				      file->f_flags & O_DIRECT ||
1023 	io->ci_obj = ll_i2info(inode)->lli_clob;
1024 	io->ci_lockreq = CILR_MAYBE;
1025 	if (ll_file_nolock(file)) {
1026 		io->ci_lockreq = CILR_NEVER;
1027 		io->ci_no_srvlock = 1;
1028 	} else if (file->f_flags & O_APPEND) {
1029 		io->ci_lockreq = CILR_MANDATORY;
1032 	io->ci_noatime = file_is_noatime(file);
/* Common read/write engine: sets up the cl_io, takes the range lock when
 * needed (writes, and O_DIRECT reads — see LU-6227), runs cl_io_loop(),
 * restarts on short/layout-change IO, and tallies stats. Returns bytes
 * transferred or a negative errno.
 * NOTE(review): the return type, some declarations and the restart `goto`
 * are elided in this view. */
1036 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1037 		   struct file *file, enum cl_io_type iot,
1038 		   loff_t *ppos, size_t count)
1040 	struct vvp_io		*vio = vvp_env_io(env);
1041 	struct inode		*inode = file->f_path.dentry->d_inode;
1042 	struct ll_inode_info	*lli = ll_i2info(inode);
1043 	struct ll_file_data	*fd  = LUSTRE_FPRIVATE(file);
1047 	struct range_lock	range;
1051 	CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zu\n",
1052 	       file->f_path.dentry->d_name.name, iot, *ppos, count);
1055 	io = vvp_env_thread_io(env);
1056 	ll_io_init(io, file, iot == CIT_WRITE);
1058 	if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1059 		bool range_locked = false;
/* O_APPEND writes lock to EOF since the final position is unknown. */
1061 		if (file->f_flags & O_APPEND)
1062 			range_lock_init(&range, 0, LUSTRE_EOF);
1064 			range_lock_init(&range, *ppos, *ppos + count - 1);
1066 		vio->vui_fd  = LUSTRE_FPRIVATE(file);
1067 		vio->vui_io_subtype = args->via_io_subtype;
1069 		switch (vio->vui_io_subtype) {
/* IO_NORMAL case (label elided in this view). */
1071 			vio->vui_iter = args->u.normal.via_iter;
1072 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1073 			vio->vui_tot_nrsegs = vio->vui_iter->nr_segs;
1074 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1075 			vio->vui_iocb = args->u.normal.via_iocb;
1076 			/* Direct IO reads must also take range lock,
1077 			 * or multiple reads will try to work on the same pages
1078 			 * See LU-6227 for details. */
1079 			if (((iot == CIT_WRITE) ||
1080 			    (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1081 			    !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1082 				CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1084 				rc = range_lock(&lli->lli_write_tree, &range);
1088 				range_locked = true;
/* IO_SPLICE case (label elided in this view). */
1092 			vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1093 			vio->u.splice.vui_flags = args->u.splice.via_flags;
1096 			CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1100 		ll_cl_add(file, env, io);
1101 		rc = cl_io_loop(env, io);
1102 		ll_cl_remove(file, env);
1105 			CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1107 			range_unlock(&lli->lli_write_tree, &range);
1110 		/* cl_io_rw_init() handled IO */
1114 	if (io->ci_nob > 0) {
1115 		result += io->ci_nob;
1116 		count -= io->ci_nob;
1117 		*ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1119 		/* prepare IO restart */
1120 		if (count > 0 && args->via_io_subtype == IO_NORMAL) {
1121 			args->u.normal.via_iter = vio->vui_iter;
1122 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1123 			args->u.normal.via_iter->nr_segs = vio->vui_tot_nrsegs;
1124 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1129 	cl_io_fini(env, io);
/* Restart the whole IO when the layout changed mid-flight. */
1131 	if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1133 		       "%s: restart %s from %lld, count:%zu, result: %zd\n",
1134 		       file->f_path.dentry->d_name.name,
1135 		       iot == CIT_READ ? "read" : "write",
1136 		       *ppos, count, result);
1140 	if (iot == CIT_READ) {
1142 			ll_stats_ops_tally(ll_i2sbi(inode),
1143 					   LPROC_LL_READ_BYTES, result);
1144 	} else if (iot == CIT_WRITE) {
1146 			ll_stats_ops_tally(ll_i2sbi(inode),
1147 					   LPROC_LL_WRITE_BYTES, result);
/* Track write failure state for fsync()-style error reporting;
 * -ERESTARTSYS is not counted as a real failure. */
1148 			fd->fd_write_failed = false;
1149 		} else if (result == 0 && rc == 0) {
1152 				fd->fd_write_failed = true;
1154 				fd->fd_write_failed = false;
1155 		} else if (rc != -ERESTARTSYS) {
1156 			fd->fd_write_failed = true;
1160 	CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1162 	return result > 0 ? result : rc;
1166 * Read from a file (through the page cache).
/* ->read_iter() entry point: wrap the iov_iter/kiocb into vvp_io_args and
 * delegate to ll_file_io_generic() with CIT_READ. */
1168 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1170 	struct vvp_io_args *args;
1175 	env = cl_env_get(&refcheck);
1177 		return PTR_ERR(env);
1179 	args = ll_env_args(env, IO_NORMAL);
1180 	args->u.normal.via_iter = to;
1181 	args->u.normal.via_iocb = iocb;
1183 	result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1184 				    &iocb->ki_pos, iov_iter_count(to));
1185 	cl_env_put(env, &refcheck);
1190 * Write to a file (through the page cache).
/* ->write_iter() entry point: mirror of ll_file_read_iter() for CIT_WRITE. */
1192 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1194 	struct vvp_io_args *args;
1199 	env = cl_env_get(&refcheck);
1201 		return PTR_ERR(env);
1203 	args = ll_env_args(env, IO_NORMAL);
1204 	args->u.normal.via_iter = from;
1205 	args->u.normal.via_iocb = iocb;
1207 	result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1208 				    &iocb->ki_pos, iov_iter_count(from));
1209 	cl_env_put(env, &refcheck);
1213 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1215 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/* Validate an iovec array and compute the total byte count, truncating the
 * segment list at the first inaccessible segment (copied from the kernel's
 * __generic_file_aio_write_nolock — see comment above). */
1217 static int ll_file_get_iov_count(const struct iovec *iov,
1218 				 unsigned long *nr_segs, size_t *count)
1223 	for (seg = 0; seg < *nr_segs; seg++) {
1224 		const struct iovec *iv = &iov[seg];
1227 		 * If any segment has a negative length, or the cumulative
1228 		 * length ever wraps negative then return -EINVAL.
1231 		if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1233 		if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
/* Inaccessible segment: drop it and everything after it. */
1238 		cnt -= iv->iov_len;	/* This segment is no good */
/* Legacy aio_read compat path (pre read_iter kernels): copy the caller's
 * iovec (the iter may modify it), build an iov_iter and call
 * ll_file_read_iter(). */
1245 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1246 				unsigned long nr_segs, loff_t pos)
1248 	struct iovec *local_iov;
1254 	result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1258 	OBD_ALLOC(local_iov, sizeof(*iov) * nr_segs);
1259 	if (local_iov == NULL)
1262 	memcpy(local_iov, iov, sizeof(*iov) * nr_segs);
1264 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1265 	iov_iter_init(&to, READ, local_iov, nr_segs, iov_count);
1266 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1267 	iov_iter_init(&to, local_iov, nr_segs, iov_count, 0);
1268 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1270 	result = ll_file_read_iter(iocb, &to);
1272 	OBD_FREE(local_iov, sizeof(*iov) * nr_segs);
/*
 * Pre-iov_iter compat .read: builds a one-segment iovec plus a synchronous
 * kiocb and forwards to ll_file_aio_read(), propagating the file position.
 * (elided excerpt — code lines kept verbatim)
 */
1276 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1279 struct iovec iov = { .iov_base = buf, .iov_len = count };
1280 struct kiocb *kiocb;
1284 OBD_ALLOC_PTR(kiocb);
1288 init_sync_kiocb(kiocb, file);
1289 kiocb->ki_pos = *ppos;
/* Field name for the request length differs across kernel versions. */
1290 #ifdef HAVE_KIOCB_KI_LEFT
1291 kiocb->ki_left = count;
1292 #elif defined(HAVE_KI_NBYTES)
1293 kiocb->ki_nbytes = count;
1296 result = ll_file_aio_read(kiocb, &iov, 1, kiocb->ki_pos);
1297 *ppos = kiocb->ki_pos;
1299 OBD_FREE_PTR(kiocb);
1304 * Write to a file (through the page cache).
/*
 * Pre-iov_iter compat .aio_write: mirror of ll_file_aio_read() for the
 * write direction; forwards to ll_file_write_iter().
 * (elided excerpt — code lines kept verbatim)
 */
1307 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1308 unsigned long nr_segs, loff_t pos)
1310 struct iovec *local_iov;
1311 struct iov_iter from;
1316 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
/* Private copy of the caller's iovec array. */
1320 OBD_ALLOC(local_iov, sizeof(*iov) * nr_segs);
1321 if (local_iov == NULL)
1324 memcpy(local_iov, iov, sizeof(*iov) * nr_segs);
1326 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1327 iov_iter_init(&from, WRITE, local_iov, nr_segs, iov_count);
1328 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1329 iov_iter_init(&from, local_iov, nr_segs, iov_count, 0);
1330 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1332 result = ll_file_write_iter(iocb, &from);
1334 OBD_FREE(local_iov, sizeof(*iov) * nr_segs);
/*
 * Pre-iov_iter compat .write: one-segment iovec + sync kiocb taken from the
 * per-thread lu_env (lti_kiocb), forwarded to ll_file_aio_write().
 * (elided excerpt — code lines kept verbatim)
 */
1339 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1340 size_t count, loff_t *ppos)
1343 struct iovec iov = { .iov_base = (void __user *)buf,
1345 struct kiocb *kiocb;
1350 env = cl_env_get(&refcheck);
1352 RETURN(PTR_ERR(env));
/* Reuse the env-embedded kiocb instead of a heap allocation. */
1354 kiocb = &ll_env_info(env)->lti_kiocb;
1355 init_sync_kiocb(kiocb, file);
1356 kiocb->ki_pos = *ppos;
1357 #ifdef HAVE_KIOCB_KI_LEFT
1358 kiocb->ki_left = count;
1359 #elif defined(HAVE_KI_NBYTES)
1360 kiocb->ki_nbytes = count;
1363 result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos);
1364 *ppos = kiocb->ki_pos;
1366 cl_env_put(env, &refcheck);
1369 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1372 * Send file content (through pagecache) somewhere with helper
/*
 * .splice_read hook: IO_SPLICE variant of the generic read path; pipe and
 * flags are carried in vvp_io_args. (elided excerpt — code kept verbatim)
 */
1374 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1375 struct pipe_inode_info *pipe, size_t count,
1379 struct vvp_io_args *args;
1384 env = cl_env_get(&refcheck);
1386 RETURN(PTR_ERR(env));
1388 args = ll_env_args(env, IO_SPLICE);
1389 args->u.splice.via_pipe = pipe;
1390 args->u.splice.via_flags = flags;
1392 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1393 cl_env_put(env, &refcheck);
/*
 * Apply striping EA to a file by re-opening it by FID with the lov_user_md
 * attached to the open intent; the transient MDS open handle is released
 * immediately. Serialized under the inode size lock.
 * (elided excerpt — code lines kept verbatim)
 */
1397 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1398 __u64 flags, struct lov_user_md *lum,
1401 struct lookup_intent oit = {
1403 .it_flags = flags | MDS_OPEN_BY_FID,
1408 ll_inode_size_lock(inode);
1409 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1411 GOTO(out_unlock, rc);
/* Close the open handle created purely to carry the setstripe EA. */
1413 ll_release_openhandle(file->f_path.dentry, &oit);
1416 ll_inode_size_unlock(inode);
1417 ll_intent_release(&oit);
1418 cl_lov_delay_create_clear(&file->f_flags);
/*
 * Fetch the LOV EA (striping layout) of @filename from the MDT via
 * md_getattr_name(), validate the magic, and byte-swap it to host endian on
 * big-endian clients before returning it to the caller. On success *lmmp,
 * *lmm_size and *request are filled; the caller owns the request reference.
 * (elided excerpt — code lines kept verbatim)
 */
1423 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1424 struct lov_mds_md **lmmp, int *lmm_size,
1425 struct ptlrpc_request **request)
1427 struct ll_sb_info *sbi = ll_i2sbi(inode);
1428 struct mdt_body *body;
1429 struct lov_mds_md *lmm = NULL;
1430 struct ptlrpc_request *req = NULL;
1431 struct md_op_data *op_data;
1434 rc = ll_get_default_mdsize(sbi, &lmmsize);
1438 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1439 strlen(filename), lmmsize,
1440 LUSTRE_OPC_ANY, NULL);
1441 if (IS_ERR(op_data))
1442 RETURN(PTR_ERR(op_data));
1444 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1445 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1446 ll_finish_md_op_data(op_data);
1448 CDEBUG(D_INFO, "md_getattr_name failed "
1449 "on %s: rc %d\n", filename, rc);
1453 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1454 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1456 lmmsize = body->mbo_eadatasize;
/* No EA present (or empty) means no striping to report. */
1458 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1460 GOTO(out, rc = -ENODATA);
1463 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1464 LASSERT(lmm != NULL);
1466 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1467 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1468 GOTO(out, rc = -EPROTO);
1472 * This is coming from the MDS, so is probably in
1473 * little endian. We convert it to host endian before
1474 * passing it to userspace.
/* True only on big-endian hosts: LOV_MAGIC differs from its LE form. */
1476 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1479 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1480 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1483 /* if function called for directory - we should
1484 * avoid swabbing non-existent lsm objects */
1485 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1486 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1487 if (S_ISREG(body->mbo_mode))
1488 lustre_swab_lov_user_md_objects(
1489 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1491 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1492 lustre_swab_lov_user_md_v3(
1493 (struct lov_user_md_v3 *)lmm);
1494 if (S_ISREG(body->mbo_mode))
1495 lustre_swab_lov_user_md_objects(
1496 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1503 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA: admin-only (CFS_CAP_SYS_ADMIN) path that copies a
 * lov_user_md with one ost_data entry from user space and applies it via
 * ll_lov_setstripe_ea_info() with MDS_OPEN_HAS_OBJS.
 * (elided excerpt — code lines kept verbatim)
 */
1508 static int ll_lov_setea(struct inode *inode, struct file *file,
1511 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1512 struct lov_user_md *lump;
1513 int lum_size = sizeof(struct lov_user_md) +
1514 sizeof(struct lov_user_ost_data);
1518 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1521 OBD_ALLOC_LARGE(lump, lum_size);
1525 if (copy_from_user(lump, (struct lov_user_md __user *)arg, lum_size))
1526 GOTO(out_lump, rc = -EFAULT);
1528 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1531 OBD_FREE_LARGE(lump, lum_size);
/*
 * Copy the file's striping information to the user buffer @lum by querying
 * the cl_object layer. (elided excerpt — code lines kept verbatim)
 */
1535 static int ll_file_getstripe(struct inode *inode,
1536 struct lov_user_md __user *lum)
1543 env = cl_env_get(&refcheck);
1545 RETURN(PTR_ERR(env));
1547 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum);
1548 cl_env_put(env, &refcheck);
/*
 * LL_IOC_LOV_SETSTRIPE: copy the user's lov_user_md, apply it, then refresh
 * the layout and echo the effective striping back to user space.
 * (elided excerpt — code lines kept verbatim)
 */
1552 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1555 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1556 struct lov_user_md *klum;
1558 __u64 flags = FMODE_WRITE;
1561 rc = ll_copy_user_md(lum, &klum);
1566 rc = ll_lov_setstripe_ea_info(inode, file, flags, klum, lum_size);
/* NOTE(review): put_user() result appears unchecked here — confirm in full source. */
1570 put_user(0, &lum->lmm_stripe_count);
1572 ll_layout_refresh(inode, &gen);
1573 rc = ll_file_getstripe(inode, (struct lov_user_md __user *)arg);
1576 OBD_FREE(klum, lum_size);
/*
 * LL_IOC_GROUP_LOCK: take a group (GID-based) lock on the file. Rejects
 * gid 0 and nolock files; detects a racing second locker under lli_lock and
 * drops the freshly acquired lock if it lost the race.
 * (elided excerpt — code lines kept verbatim)
 */
1581 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1583 struct ll_inode_info *lli = ll_i2info(inode);
1584 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1585 struct ll_grouplock grouplock;
1590 CWARN("group id for group lock must not be 0\n");
1594 if (ll_file_nolock(file))
1595 RETURN(-EOPNOTSUPP);
1597 spin_lock(&lli->lli_lock);
1598 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1599 CWARN("group lock already existed with gid %lu\n",
1600 fd->fd_grouplock.lg_gid);
1601 spin_unlock(&lli->lli_lock);
1604 LASSERT(fd->fd_grouplock.lg_lock == NULL);
1605 spin_unlock(&lli->lli_lock);
/* Acquire outside the spinlock: cl_get_grouplock() may block. */
1607 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
1608 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1612 spin_lock(&lli->lli_lock);
1613 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1614 spin_unlock(&lli->lli_lock);
1615 CERROR("another thread just won the race\n");
1616 cl_put_grouplock(&grouplock);
1620 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1621 fd->fd_grouplock = grouplock;
1622 spin_unlock(&lli->lli_lock);
1624 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK: drop a group lock previously taken via
 * ll_get_grouplock(); the gid in @arg must match the held lock. State is
 * cleared under lli_lock, the lock released outside it.
 * (elided excerpt — code lines kept verbatim)
 */
1628 static int ll_put_grouplock(struct inode *inode, struct file *file,
1631 struct ll_inode_info *lli = ll_i2info(inode);
1632 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1633 struct ll_grouplock grouplock;
1636 spin_lock(&lli->lli_lock);
1637 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1638 spin_unlock(&lli->lli_lock);
1639 CWARN("no group lock held\n");
1643 LASSERT(fd->fd_grouplock.lg_lock != NULL);
1645 if (fd->fd_grouplock.lg_gid != arg) {
1646 CWARN("group lock %lu doesn't match current id %lu\n",
1647 arg, fd->fd_grouplock.lg_gid);
1648 spin_unlock(&lli->lli_lock);
/* Snapshot then clear fd state before dropping the spinlock. */
1652 grouplock = fd->fd_grouplock;
1653 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1654 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1655 spin_unlock(&lli->lli_lock);
1657 cl_put_grouplock(&grouplock);
1658 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1663 * Close inode open handle
1665 * \param dentry [in] dentry which contains the inode
1666 * \param it [in,out] intent which contains open info and result
1669 * \retval <0 failure
/* (elided excerpt — code lines kept verbatim; success retval line not visible) */
1671 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1673 struct inode *inode = dentry->d_inode;
1674 struct obd_client_handle *och;
1680 /* Root ? Do nothing. */
1681 if (dentry->d_inode->i_sb->s_root == dentry)
1684 /* No open handle to close? Move away */
1685 if (!it_disposition(it, DISP_OPEN_OPEN))
1688 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1690 OBD_ALLOC(och, sizeof(*och));
1692 GOTO(out, rc = -ENOMEM);
/* Populate the client handle from the intent, then close it on the MDT. */
1694 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1696 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
1698 /* this one is in place of ll_file_open */
1699 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1700 ptlrpc_req_finished(it->it_request);
1701 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1707 * Get size for inode for which FIEMAP mapping is requested.
1708 * Make the FIEMAP get_info call and returns the result.
1709 * \param fiemap kernel buffer to hold extents
1710 * \param num_bytes kernel buffer size
/* (elided excerpt — code lines kept verbatim) */
1712 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
1718 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
1721 /* Checks for fiemap flags */
1722 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* Report back which flags are unsupported by masking them in place. */
1723 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1727 /* Check for FIEMAP_FLAG_SYNC */
1728 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1729 rc = filemap_fdatawrite(inode->i_mapping);
1734 env = cl_env_get(&refcheck);
1736 RETURN(PTR_ERR(env));
/* A zero cached size may just be stale — glimpse the OSTs to refresh it. */
1738 if (i_size_read(inode) == 0) {
1739 rc = ll_glimpse_size(inode);
1744 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1745 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
1746 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
1748 /* If filesize is 0, then there would be no objects for mapping */
1749 if (fmkey.lfik_oa.o_size == 0) {
1750 fiemap->fm_mapped_extents = 0;
1754 fmkey.lfik_fiemap = *fiemap;
1756 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
1757 &fmkey, fiemap, &num_bytes);
1759 cl_env_put(env, &refcheck);
/*
 * OBD_IOC_FID2PATH: resolve a FID to a path via the MDC. Requires
 * CFS_CAP_DAC_READ_SEARCH unless the mount allows user fid2path. The output
 * buffer is sized from the user-supplied gf_pathlen (capped at PATH_MAX).
 * (elided excerpt — code lines kept verbatim)
 */
1763 int ll_fid2path(struct inode *inode, void __user *arg)
1765 struct obd_export *exp = ll_i2mdexp(inode);
1766 const struct getinfo_fid2path __user *gfin = arg;
1768 struct getinfo_fid2path *gfout;
1774 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1775 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1778 /* Only need to get the buflen */
1779 if (get_user(pathlen, &gfin->gf_pathlen))
1782 if (pathlen > PATH_MAX)
1785 outsize = sizeof(*gfout) + pathlen;
1786 OBD_ALLOC(gfout, outsize);
1790 if (copy_from_user(gfout, arg, sizeof(*gfout)))
1791 GOTO(gf_free, rc = -EFAULT);
1792 /* append root FID after gfout to let MDT know the root FID so that it
1793 * can lookup the correct path, this is mainly for fileset.
1794 * old server without fileset mount support will ignore this. */
1795 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
1797 /* Call mdc_iocontrol */
1798 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1802 if (copy_to_user(arg, gfout, outsize))
1806 OBD_FREE(gfout, outsize);
1811 * Read the data_version for inode.
1813 * This value is computed using stripe object version on OST.
1814 * Version is computed using server side locking.
1816 * @param flags if do sync on the OST side;
1818 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1819 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/* (elided excerpt — code lines kept verbatim) */
1821 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
1823 struct cl_object *obj = ll_i2info(inode)->lli_clob;
1831 /* If no file object initialized, we consider its version is 0. */
1837 env = cl_env_get(&refcheck);
1839 RETURN(PTR_ERR(env));
/* Drive a CIT_DATA_VERSION cl_io through the client I/O state machine. */
1841 io = vvp_env_thread_io(env);
1843 io->u.ci_data_version.dv_data_version = 0;
1844 io->u.ci_data_version.dv_flags = flags;
1847 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
1848 result = cl_io_loop(env, io);
1850 result = io->ci_result;
1852 *data_version = io->u.ci_data_version.dv_data_version;
1854 cl_io_fini(env, io);
/* Layout change during the io: restart (restart target line elided). */
1856 if (unlikely(io->ci_need_restart))
1859 cl_env_put(env, &refcheck);
1865 * Trigger a HSM release request for the provided inode.
/*
 * Takes a write lease (MDS_OPEN_RELEASE), flushes and records the latest
 * data_version, merges attributes, then closes the handle with
 * MDS_HSM_RELEASE so the MDT can punch the OST objects.
 * (elided excerpt — code lines kept verbatim)
 */
1867 int ll_hsm_release(struct inode *inode)
1870 struct obd_client_handle *och = NULL;
1871 __u64 data_version = 0;
1876 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1877 ll_get_fsname(inode->i_sb, NULL, 0),
1878 PFID(&ll_i2info(inode)->lli_fid));
1880 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
1882 GOTO(out, rc = PTR_ERR(och));
1884 /* Grab latest data_version and [am]time values */
1885 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
1889 env = cl_env_get(&refcheck);
1891 GOTO(out, rc = PTR_ERR(env));
1893 ll_merge_attr(env, inode);
1894 cl_env_put(env, &refcheck);
1896 /* Release the file.
1897 * NB: lease lock handle is released in mdc_hsm_release_pack() because
1898 * we still need it to pack l_remote_handle to MDT. */
1899 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
1905 if (och != NULL && !IS_ERR(och)) /* close the file */
1906 ll_lease_close(och, inode, NULL);
/*
 * Scratch state for ll_swap_layouts(): the two inodes being swapped plus
 * (elided here) their data versions and check flags.
 */
1911 struct ll_swap_stack {
1914 struct inode *inode1;
1915 struct inode *inode2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS helper: atomically swap the layouts of two files
 * on the MDT. Orders the inodes by FID to avoid deadlock, optionally takes
 * group locks to flush dirty cache, and optionally verifies that each file's
 * data_version is unchanged before the swap.
 * (elided excerpt — code lines kept verbatim)
 */
1920 static int ll_swap_layouts(struct file *file1, struct file *file2,
1921 struct lustre_swap_layouts *lsl)
1923 struct mdc_swap_layouts msl;
1924 struct md_op_data *op_data;
1927 struct ll_swap_stack *llss = NULL;
1930 OBD_ALLOC_PTR(llss);
1934 llss->inode1 = file1->f_path.dentry->d_inode;
1935 llss->inode2 = file2->f_path.dentry->d_inode;
1937 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
1941 /* we use 2 bool because it is easier to swap than 2 bits */
1942 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1943 llss->check_dv1 = true;
1945 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1946 llss->check_dv2 = true;
1948 /* we cannot use lsl->sl_dvX directly because we may swap them */
1949 llss->dv1 = lsl->sl_dv1;
1950 llss->dv2 = lsl->sl_dv2;
1952 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1953 if (rc == 0) /* same file, done! */
1956 if (rc < 0) { /* sequentialize it */
1957 swap(llss->inode1, llss->inode2);
1959 swap(llss->dv1, llss->dv2);
1960 swap(llss->check_dv1, llss->check_dv2);
1964 if (gid != 0) { /* application asks to flush dirty cache */
1965 rc = ll_get_grouplock(llss->inode1, file1, gid);
1969 rc = ll_get_grouplock(llss->inode2, file2, gid);
1971 ll_put_grouplock(llss->inode1, file1, gid);
1976 /* ultimate check, before swapping the layouts we check if
1977 * dataversion has changed (if requested) */
1978 if (llss->check_dv1) {
1979 rc = ll_data_version(llss->inode1, &dv, 0);
1982 if (dv != llss->dv1)
1983 GOTO(putgl, rc = -EAGAIN);
1986 if (llss->check_dv2) {
1987 rc = ll_data_version(llss->inode2, &dv, 0);
1990 if (dv != llss->dv2)
1991 GOTO(putgl, rc = -EAGAIN);
1994 /* struct md_op_data is used to send the swap args to the mdt
1995 * only flags is missing, so we use struct mdc_swap_layouts
1996 * through the md_op_data->op_data */
1997 /* flags from user space have to be converted before they are sent to
1998 * server, no flag is sent today, they are only used on the client */
2001 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2002 0, LUSTRE_OPC_ANY, &msl);
2003 if (IS_ERR(op_data))
2004 GOTO(free, rc = PTR_ERR(op_data));
2006 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2007 sizeof(*op_data), op_data, NULL);
2008 ll_finish_md_op_data(op_data);
/* Release group locks in reverse acquisition order. */
2015 ll_put_grouplock(llss->inode2, file2, gid);
2016 ll_put_grouplock(llss->inode1, file1, gid);
/*
 * Set/clear HSM state flags on @inode. Validates the masks (non-root may
 * only touch HSM_USER_MASK bits) and the archive id, then forwards the
 * request to the MDT via obd_iocontrol().
 * (elided excerpt — code lines kept verbatim)
 */
2026 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2028 struct md_op_data *op_data;
2032 /* Detect out-of range masks */
2033 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2036 /* Non-root users are forbidden to set or clear flags which are
2037 * NOT defined in HSM_USER_MASK. */
2038 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2039 !cfs_capable(CFS_CAP_SYS_ADMIN))
2042 /* Detect out-of range archive id */
2043 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2044 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2047 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2048 LUSTRE_OPC_ANY, hss);
2049 if (IS_ERR(op_data))
2050 RETURN(PTR_ERR(op_data));
2052 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2053 sizeof(*op_data), op_data, NULL);
2055 ll_finish_md_op_data(op_data);
/*
 * LL_IOC_HSM_IMPORT: mark a regular file as archived/exists/released in HSM
 * state, then force-restore its metadata (mode/uid/gid/size/times) from the
 * user-supplied hsm_user_import under i_mutex.
 * (elided excerpt — code lines kept verbatim)
 */
2060 static int ll_hsm_import(struct inode *inode, struct file *file,
2061 struct hsm_user_import *hui)
2063 struct hsm_state_set *hss = NULL;
2064 struct iattr *attr = NULL;
2068 if (!S_ISREG(inode->i_mode))
2074 GOTO(out, rc = -ENOMEM);
2076 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2077 hss->hss_archive_id = hui->hui_archive_id;
2078 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2079 rc = ll_hsm_state_set(inode, hss);
2083 OBD_ALLOC_PTR(attr);
2085 GOTO(out, rc = -ENOMEM);
/* Rebuild attributes from the import record; force S_IFREG. */
2087 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2088 attr->ia_mode |= S_IFREG;
2089 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2090 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2091 attr->ia_size = hui->hui_size;
2092 attr->ia_mtime.tv_sec = hui->hui_mtime;
2093 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2094 attr->ia_atime.tv_sec = hui->hui_atime;
2095 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2097 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2098 ATTR_UID | ATTR_GID |
2099 ATTR_MTIME | ATTR_MTIME_SET |
2100 ATTR_ATIME | ATTR_ATIME_SET;
2102 mutex_lock(&inode->i_mutex);
2104 rc = ll_setattr_raw(file->f_path.dentry, attr, true);
2108 mutex_unlock(&inode->i_mutex);
/* Map an fmode_t (FMODE_READ/FMODE_WRITE bits) to LL_LEASE_* lock flags. */
2120 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2122 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2123 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * LL_IOC_FUTIMES_3: privileged (CAP_SYS_ADMIN) utimes variant that also
 * sets ctime on a regular file, via ll_setattr_raw() under i_mutex.
 * (elided excerpt — code lines kept verbatim)
 */
2126 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2128 struct inode *inode = file->f_path.dentry->d_inode;
2130 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2131 ATTR_MTIME | ATTR_MTIME_SET |
2132 ATTR_CTIME | ATTR_CTIME_SET,
2134 .tv_sec = lfu->lfu_atime_sec,
2135 .tv_nsec = lfu->lfu_atime_nsec,
2138 .tv_sec = lfu->lfu_mtime_sec,
2139 .tv_nsec = lfu->lfu_mtime_nsec,
2142 .tv_sec = lfu->lfu_ctime_sec,
2143 .tv_nsec = lfu->lfu_ctime_nsec,
2149 if (!capable(CAP_SYS_ADMIN))
2152 if (!S_ISREG(inode->i_mode))
2155 mutex_lock(&inode->i_mutex);
2156 rc = ll_setattr_raw(file->f_path.dentry, &ia, false);
2157 mutex_unlock(&inode->i_mutex);
2163 * Give file access advices
2165 * The ladvise interface is similar to Linux fadvise() system call, except it
2166 * forwards the advices directly from Lustre client to server. The server side
2167 * codes will apply appropriate read-ahead and caching techniques for the
2168 * corresponding files.
2170 * A typical workload for ladvise is e.g. a bunch of different clients are
2171 * doing small random reads of a file, so prefetching pages into OSS cache
2172 * with big linear reads before the random IO is a net benefit. Fetching
2173 * all that data into each client cache with fadvise() may not be, due to
2174 * much more data being sent to the client.
/* (elided excerpt — code lines kept verbatim) */
2176 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2177 struct lu_ladvise *ladvise)
2181 struct cl_ladvise_io *lio;
2186 env = cl_env_get(&refcheck);
2188 RETURN(PTR_ERR(env));
/* Drive a CIT_LADVISE cl_io carrying the advice parameters. */
2190 io = vvp_env_thread_io(env);
2191 io->ci_obj = ll_i2info(inode)->lli_clob;
2193 /* initialize parameters for ladvise */
2194 lio = &io->u.ci_ladvise;
2195 lio->li_start = ladvise->lla_start;
2196 lio->li_end = ladvise->lla_end;
2197 lio->li_fid = ll_inode2fid(inode);
2198 lio->li_advice = ladvise->lla_advice;
2199 lio->li_flags = flags;
2201 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2202 rc = cl_io_loop(env, io);
2206 cl_io_fini(env, io);
2207 cl_env_put(env, &refcheck);
/*
 * Main per-file ioctl dispatcher for llite: striping get/set, layout swap,
 * group locks, FID/path translation, data version, HSM operations, leases,
 * utimes, ladvise; unknown commands fall through to the registered
 * ll_iocontrol_call() handlers and finally to the data export.
 * (elided excerpt — code lines kept verbatim; return type line not visible)
 */
2212 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2214 struct inode *inode = file->f_path.dentry->d_inode;
2215 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2219 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2220 PFID(ll_inode2fid(inode)), inode, cmd);
2221 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2223 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2224 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2228 case LL_IOC_GETFLAGS:
2229 /* Get the current value of the file flags */
2230 return put_user(fd->fd_flags, (int __user *)arg);
2231 case LL_IOC_SETFLAGS:
2232 case LL_IOC_CLRFLAGS:
2233 /* Set or clear specific file flags */
2234 /* XXX This probably needs checks to ensure the flags are
2235 * not abused, and to handle any flag side effects.
2237 if (get_user(flags, (int __user *) arg))
2240 if (cmd == LL_IOC_SETFLAGS) {
2241 if ((flags & LL_FILE_IGNORE_LOCK) &&
2242 !(file->f_flags & O_DIRECT)) {
2243 CERROR("%s: unable to disable locking on "
2244 "non-O_DIRECT file\n", current->comm);
2248 fd->fd_flags |= flags;
2250 fd->fd_flags &= ~flags;
2253 case LL_IOC_LOV_SETSTRIPE:
2254 RETURN(ll_lov_setstripe(inode, file, arg));
2255 case LL_IOC_LOV_SETEA:
2256 RETURN(ll_lov_setea(inode, file, arg));
/* Swap layouts between this file and lsl.sl_fd; both must be writable. */
2257 case LL_IOC_LOV_SWAP_LAYOUTS: {
2259 struct lustre_swap_layouts lsl;
2261 if (copy_from_user(&lsl, (char __user *)arg,
2262 sizeof(struct lustre_swap_layouts)))
2265 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
2268 file2 = fget(lsl.sl_fd);
2272 /* O_WRONLY or O_RDWR */
2273 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
2274 GOTO(out, rc = -EPERM);
2276 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
2277 struct inode *inode2;
2278 struct ll_inode_info *lli;
2279 struct obd_client_handle *och = NULL;
2281 if (lsl.sl_flags != SWAP_LAYOUTS_CLOSE)
2282 GOTO(out, rc = -EINVAL);
2284 lli = ll_i2info(inode);
2285 mutex_lock(&lli->lli_och_mutex);
2286 if (fd->fd_lease_och != NULL) {
2287 och = fd->fd_lease_och;
2288 fd->fd_lease_och = NULL;
2290 mutex_unlock(&lli->lli_och_mutex);
2292 GOTO(out, rc = -ENOLCK);
2293 inode2 = file2->f_path.dentry->d_inode;
2294 rc = ll_swap_layouts_close(och, inode, inode2);
2296 rc = ll_swap_layouts(file, file2, &lsl);
2302 case LL_IOC_LOV_GETSTRIPE:
2303 RETURN(ll_file_getstripe(inode,
2304 (struct lov_user_md __user *)arg));
2305 case FSFILT_IOC_GETFLAGS:
2306 case FSFILT_IOC_SETFLAGS:
2307 RETURN(ll_iocontrol(inode, file, cmd, arg));
2308 case FSFILT_IOC_GETVERSION_OLD:
2309 case FSFILT_IOC_GETVERSION:
2310 RETURN(put_user(inode->i_generation, (int __user *)arg));
2311 case LL_IOC_GROUP_LOCK:
2312 RETURN(ll_get_grouplock(inode, file, arg));
2313 case LL_IOC_GROUP_UNLOCK:
2314 RETURN(ll_put_grouplock(inode, file, arg));
2315 case IOC_OBD_STATFS:
2316 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2318 /* We need to special case any other ioctls we want to handle,
2319 * to send them to the MDS/OST as appropriate and to properly
2320 * network encode the arg field.
2321 case FSFILT_IOC_SETVERSION_OLD:
2322 case FSFILT_IOC_SETVERSION:
2324 case LL_IOC_FLUSHCTX:
2325 RETURN(ll_flush_ctx(inode));
2326 case LL_IOC_PATH2FID: {
2327 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2328 sizeof(struct lu_fid)))
2333 case LL_IOC_GETPARENT:
2334 RETURN(ll_getparent(file, (struct getparent __user *)arg));
2336 case OBD_IOC_FID2PATH:
2337 RETURN(ll_fid2path(inode, (void __user *)arg));
2338 case LL_IOC_DATA_VERSION: {
2339 struct ioc_data_version idv;
2342 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
/* Only the flush-mode bits are honoured from user space. */
2345 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2346 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2349 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2355 case LL_IOC_GET_MDTIDX: {
2358 mdtidx = ll_get_mdt_idx(inode);
2362 if (put_user((int)mdtidx, (int __user *)arg))
2367 case OBD_IOC_GETDTNAME:
2368 case OBD_IOC_GETMDNAME:
2369 RETURN(ll_get_obd_name(inode, cmd, arg));
2370 case LL_IOC_HSM_STATE_GET: {
2371 struct md_op_data *op_data;
2372 struct hsm_user_state *hus;
2379 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2380 LUSTRE_OPC_ANY, hus);
2381 if (IS_ERR(op_data)) {
2383 RETURN(PTR_ERR(op_data));
2386 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2389 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2392 ll_finish_md_op_data(op_data);
2396 case LL_IOC_HSM_STATE_SET: {
2397 struct hsm_state_set *hss;
2404 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2409 rc = ll_hsm_state_set(inode, hss);
2414 case LL_IOC_HSM_ACTION: {
2415 struct md_op_data *op_data;
2416 struct hsm_current_action *hca;
2423 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2424 LUSTRE_OPC_ANY, hca);
2425 if (IS_ERR(op_data)) {
2427 RETURN(PTR_ERR(op_data));
2430 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2433 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2436 ll_finish_md_op_data(op_data);
/* Acquire (LL_LEASE_RDLCK/WRLCK) or release (LL_LEASE_UNLCK) a file lease. */
2440 case LL_IOC_SET_LEASE: {
2441 struct ll_inode_info *lli = ll_i2info(inode);
2442 struct obd_client_handle *och = NULL;
2447 case LL_LEASE_WRLCK:
2448 if (!(file->f_mode & FMODE_WRITE))
2450 fmode = FMODE_WRITE;
2452 case LL_LEASE_RDLCK:
2453 if (!(file->f_mode & FMODE_READ))
2457 case LL_LEASE_UNLCK:
2458 mutex_lock(&lli->lli_och_mutex);
2459 if (fd->fd_lease_och != NULL) {
2460 och = fd->fd_lease_och;
2461 fd->fd_lease_och = NULL;
2463 mutex_unlock(&lli->lli_och_mutex);
2468 fmode = och->och_flags;
2469 rc = ll_lease_close(och, inode, &lease_broken);
2476 RETURN(ll_lease_type_from_fmode(fmode));
2481 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2483 /* apply for lease */
2484 och = ll_lease_open(inode, file, fmode, 0);
2486 RETURN(PTR_ERR(och));
2489 mutex_lock(&lli->lli_och_mutex);
2490 if (fd->fd_lease_och == NULL) {
2491 fd->fd_lease_och = och;
2494 mutex_unlock(&lli->lli_och_mutex);
2496 /* impossible now that only excl is supported for now */
2497 ll_lease_close(och, inode, &lease_broken);
2502 case LL_IOC_GET_LEASE: {
2503 struct ll_inode_info *lli = ll_i2info(inode);
2504 struct ldlm_lock *lock = NULL;
2507 mutex_lock(&lli->lli_och_mutex);
2508 if (fd->fd_lease_och != NULL) {
2509 struct obd_client_handle *och = fd->fd_lease_och;
2511 lock = ldlm_handle2lock(&och->och_lease_handle);
2513 lock_res_and_lock(lock);
/* Report the lease mode only while the DLM lock is not cancelled. */
2514 if (!ldlm_is_cancel(lock))
2515 fmode = och->och_flags;
2517 unlock_res_and_lock(lock);
2518 LDLM_LOCK_PUT(lock);
2521 mutex_unlock(&lli->lli_och_mutex);
2523 RETURN(ll_lease_type_from_fmode(fmode));
2525 case LL_IOC_HSM_IMPORT: {
2526 struct hsm_user_import *hui;
2532 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
2537 rc = ll_hsm_import(inode, file, hui);
2542 case LL_IOC_FUTIMES_3: {
2543 struct ll_futimes_3 lfu;
2545 if (copy_from_user(&lfu,
2546 (const struct ll_futimes_3 __user *)arg,
2550 RETURN(ll_file_futimes_3(file, &lfu));
/* Two-pass copy: read the header, then re-read header+lah_count advices. */
2552 case LL_IOC_LADVISE: {
2553 struct ladvise_hdr *ladvise_hdr;
2556 int alloc_size = sizeof(*ladvise_hdr);
2559 OBD_ALLOC_PTR(ladvise_hdr);
2560 if (ladvise_hdr == NULL)
2563 if (copy_from_user(ladvise_hdr,
2564 (const struct ladvise_hdr __user *)arg,
2566 GOTO(out_ladvise, rc = -EFAULT);
2568 if (ladvise_hdr->lah_magic != LADVISE_MAGIC ||
2569 ladvise_hdr->lah_count < 1)
2570 GOTO(out_ladvise, rc = -EINVAL);
2572 num_advise = ladvise_hdr->lah_count;
2573 if (num_advise >= LAH_COUNT_MAX)
2574 GOTO(out_ladvise, rc = -EFBIG);
2576 OBD_FREE_PTR(ladvise_hdr);
2577 alloc_size = offsetof(typeof(*ladvise_hdr),
2578 lah_advise[num_advise]);
2579 OBD_ALLOC(ladvise_hdr, alloc_size);
2580 if (ladvise_hdr == NULL)
2584 * TODO: submit multiple advices to one server in a single RPC
2586 if (copy_from_user(ladvise_hdr,
2587 (const struct ladvise_hdr __user *)arg,
2589 GOTO(out_ladvise, rc = -EFAULT);
2591 for (i = 0; i < num_advise; i++) {
2592 rc = ll_ladvise(inode, file, ladvise_hdr->lah_flags,
2593 &ladvise_hdr->lah_advise[i]);
2599 OBD_FREE(ladvise_hdr, alloc_size);
/* default: try registered handlers, then pass through to the data export. */
2606 ll_iocontrol_call(inode, file, cmd, arg, &err))
2609 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2610 (void __user *)arg));
2615 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Compat helper (pre generic_file_llseek_size kernels): validate @offset
 * against sign/maxsize rules and commit it to file->f_pos, resetting
 * f_version on change. (elided excerpt — code lines kept verbatim)
 */
2616 static inline loff_t
2617 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2619 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2621 if (offset > maxsize)
2624 if (offset != file->f_pos) {
2625 file->f_pos = offset;
2626 file->f_version = 0;
/*
 * Compat copy of the kernel's generic_file_llseek_size(): handles SEEK_CUR
 * (including the lockless offset==0 position query), SEEK_DATA and
 * SEEK_HOLE against @eof, bounded by @maxsize.
 * (elided excerpt — code lines kept verbatim; return type line not visible)
 */
2632 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2633 loff_t maxsize, loff_t eof)
2635 struct inode *inode = file->f_path.dentry->d_inode;
2643 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2644 * position-querying operation. Avoid rewriting the "same"
2645 * f_pos value back to the file because a concurrent read(),
2646 * write() or lseek() might have altered it
2651 * f_lock protects against read/modify/write race with other
2652 * SEEK_CURs. Note that parallel writes and reads behave
2655 mutex_lock(&inode->i_mutex);
2656 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2657 mutex_unlock(&inode->i_mutex);
2661 * In the generic case the entire file is data, so as long as
2662 * offset isn't at the end of the file then the offset is data.
2669 * There is a virtual hole at the end of the file, so as long as
2670 * offset isn't i_size or larger, return i_size.
2678 return llseek_execute(file, offset, maxsize);
/*
 * .llseek hook: glimpses the up-to-date size from the OSTs for SEEK_END /
 * SEEK_HOLE / SEEK_DATA, then defers to the (possibly compat) llseek_size
 * helper bounded by ll_file_maxbytes().
 * (elided excerpt — code lines kept verbatim)
 */
2682 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2684 struct inode *inode = file->f_path.dentry->d_inode;
2685 loff_t retval, eof = 0;
2688 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2689 (origin == SEEK_CUR) ? file->f_pos : 0);
2690 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2691 PFID(ll_inode2fid(inode)), inode, retval, retval,
2693 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2695 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2696 retval = ll_glimpse_size(inode);
2699 eof = i_size_read(inode);
2702 retval = ll_generic_file_llseek_size(file, offset, origin,
2703 ll_file_maxbytes(inode), eof);
/*
 * .flush hook (called on close(2)): surface previously recorded async
 * writeback errors for this inode/fd as -EIO; unless this fd was already
 * told about the write failure.
 * (elided excerpt — code lines kept verbatim)
 */
2707 static int ll_flush(struct file *file, fl_owner_t id)
2709 struct inode *inode = file->f_path.dentry->d_inode;
2710 struct ll_inode_info *lli = ll_i2info(inode);
2711 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2714 LASSERT(!S_ISDIR(inode->i_mode));
2716 /* catch async errors that were recorded back when async writeback
2717 * failed for pages in this mapping. */
2718 rc = lli->lli_async_rc;
2719 lli->lli_async_rc = 0;
2720 if (lli->lli_clob != NULL) {
2721 err = lov_read_and_clear_async_rc(lli->lli_clob);
2726 /* The application has been told write failure already.
2727 * Do not report failure again. */
2728 if (fd->fd_write_failed)
2730 return rc ? -EIO : 0;
2734 * Called to make sure a portion of file has been written out.
2735 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2737 * Return how many pages have been written.
/* (elided excerpt — code lines kept verbatim) */
2739 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2740 enum cl_fsync_mode mode, int ignore_layout)
2744 struct cl_fsync_io *fio;
2749 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2750 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2753 env = cl_env_get(&refcheck);
2755 RETURN(PTR_ERR(env));
/* Drive a CIT_FSYNC cl_io over [start, end]. */
2757 io = vvp_env_thread_io(env);
2758 io->ci_obj = ll_i2info(inode)->lli_clob;
2759 io->ci_ignore_layout = ignore_layout;
2761 /* initialize parameters for sync */
2762 fio = &io->u.ci_fsync;
2763 fio->fi_start = start;
2765 fio->fi_fid = ll_inode2fid(inode);
2766 fio->fi_mode = mode;
2767 fio->fi_nr_written = 0;
2769 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2770 result = cl_io_loop(env, io);
2772 result = io->ci_result;
2774 result = fio->fi_nr_written;
2775 cl_io_fini(env, io);
2776 cl_env_put(env, &refcheck);
2782 * When dentry is provided (the 'else' case), *file->f_path.dentry may be
2783 * null and dentry must be used directly rather than pulled from
2784 * *file->f_path.dentry as is done otherwise.
/*
 * fsync handler; three kernel-API variants selected by configure
 * (4-arg with [start,end] range, 2-arg, and the old 3-arg with dentry).
 * Flushes dirty pages, harvests async writeback errors, syncs metadata
 * via md_fsync(), and for regular files forces CL_FSYNC_ALL data sync,
 * updating fd_write_failed so ll_flush() does not double-report.
 */
2787 #ifdef HAVE_FILE_FSYNC_4ARGS
2788 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2790 struct dentry *dentry = file->f_path.dentry;
2791 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2792 int ll_fsync(struct file *file, int datasync)
2794 struct dentry *dentry = file->f_path.dentry;
2796 loff_t end = LLONG_MAX;
2798 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2801 loff_t end = LLONG_MAX;
2803 struct inode *inode = dentry->d_inode;
2804 struct ll_inode_info *lli = ll_i2info(inode);
2805 struct ptlrpc_request *req;
2809 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2810 PFID(ll_inode2fid(inode)), inode);
2811 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2813 #ifdef HAVE_FILE_FSYNC_4ARGS
/* 4-arg variant: kernel no longer takes i_mutex for us, do it here */
2814 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2815 mutex_lock(&inode->i_mutex);
2817 /* fsync's caller has already called _fdata{sync,write}, we want
2818 * that IO to finish before calling the osc and mdc sync methods */
2819 rc = filemap_fdatawait(inode->i_mapping);
2822 /* catch async errors that were recorded back when async writeback
2823 * failed for pages in this mapping. */
2824 if (!S_ISDIR(inode->i_mode)) {
2825 err = lli->lli_async_rc;
2826 lli->lli_async_rc = 0;
2829 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* sync metadata on the MDT for this FID */
2834 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
2838 ptlrpc_req_finished(req);
2840 if (S_ISREG(inode->i_mode)) {
2841 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2843 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2844 if (rc == 0 && err < 0)
/* remember (or clear) the failure for the later ll_flush() report */
2847 fd->fd_write_failed = true;
2849 fd->fd_write_failed = false;
2852 #ifdef HAVE_FILE_FSYNC_4ARGS
2853 mutex_unlock(&inode->i_mutex);
/*
 * flock/fcntl lock handler: translates a kernel struct file_lock into an
 * LDLM_FLOCK enqueue on the MDT, then mirrors the result into the local
 * lock lists (locks_lock_file_wait() or flock/posix variants).  If the
 * local bookkeeping fails after a successful server enqueue, the server
 * lock is rolled back with an LCK_NL (unlock) enqueue.
 *
 * NOTE(review): excerpt — the switch statements on fl_type/cmd are only
 * partially visible here.
 */
2859 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2861 struct inode *inode = file->f_path.dentry->d_inode;
2862 struct ll_sb_info *sbi = ll_i2sbi(inode);
2863 struct ldlm_enqueue_info einfo = {
2864 .ei_type = LDLM_FLOCK,
2865 .ei_cb_cp = ldlm_flock_completion_ast,
2866 .ei_cbdata = file_lock,
2868 struct md_op_data *op_data;
2869 struct lustre_handle lockh = { 0 };
2870 union ldlm_policy_data flock = { { 0 } };
/* keep the caller's type; einfo.ei_mode temporarily overwrites it below */
2871 int fl_type = file_lock->fl_type;
2877 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
2878 PFID(ll_inode2fid(inode)), file_lock);
2880 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2882 if (file_lock->fl_flags & FL_FLOCK) {
2883 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2884 /* flocks are whole-file locks */
2885 flock.l_flock.end = OFFSET_MAX;
2886 /* For flocks owner is determined by the local file desctiptor*/
2887 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2888 } else if (file_lock->fl_flags & FL_POSIX) {
2889 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2890 flock.l_flock.start = file_lock->fl_start;
2891 flock.l_flock.end = file_lock->fl_end;
2895 flock.l_flock.pid = file_lock->fl_pid;
2897 /* Somewhat ugly workaround for svc lockd.
2898 * lockd installs custom fl_lmops->lm_compare_owner that checks
2899 * for the fl_owner to be the same (which it always is on local node
2900 * I guess between lockd processes) and then compares pid.
2901 * As such we assign pid to the owner field to make it all work,
2902 * conflict with normal locks is unlikely since pid space and
2903 * pointer space for current->files are not intersecting */
2904 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2905 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* map fl_type to an LDLM mode: read -> PR, write -> PW, unlock -> NL */
2909 einfo.ei_mode = LCK_PR;
2912 /* An unlock request may or may not have any relation to
2913 * existing locks so we may not be able to pass a lock handle
2914 * via a normal ldlm_lock_cancel() request. The request may even
2915 * unlock a byte range in the middle of an existing lock. In
2916 * order to process an unlock request we need all of the same
2917 * information that is given with a normal read or write record
2918 * lock request. To avoid creating another ldlm unlock (cancel)
2919 * message we'll treat a LCK_NL flock request as an unlock. */
2920 einfo.ei_mode = LCK_NL;
2923 einfo.ei_mode = LCK_PW;
2926 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* map cmd: non-blocking set -> BLOCK_NOWAIT, F_GETLK -> TEST_LOCK */
2941 flags = LDLM_FL_BLOCK_NOWAIT;
2947 flags = LDLM_FL_TEST_LOCK;
2950 CERROR("unknown fcntl lock command: %d\n", cmd);
2954 /* Save the old mode so that if the mode in the lock changes we
2955 * can decrement the appropriate reader or writer refcount. */
2956 file_lock->fl_type = einfo.ei_mode;
2958 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2959 LUSTRE_OPC_ANY, NULL);
2960 if (IS_ERR(op_data))
2961 RETURN(PTR_ERR(op_data));
2963 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
2964 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
2965 flock.l_flock.pid, flags, einfo.ei_mode,
2966 flock.l_flock.start, flock.l_flock.end);
2968 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data, &lockh,
2971 /* Restore the file lock type if not TEST lock. */
2972 if (!(flags & LDLM_FL_TEST_LOCK))
2973 file_lock->fl_type = fl_type;
2975 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
2976 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
2977 !(flags & LDLM_FL_TEST_LOCK))
2978 rc2 = locks_lock_file_wait(file, file_lock);
2980 if ((file_lock->fl_flags & FL_FLOCK) &&
2981 (rc == 0 || file_lock->fl_type == F_UNLCK))
2982 rc2 = flock_lock_file_wait(file, file_lock);
2983 if ((file_lock->fl_flags & FL_POSIX) &&
2984 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2985 !(flags & LDLM_FL_TEST_LOCK))
2986 rc2 = posix_lock_file_wait(file, file_lock);
2987 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
/* local bookkeeping failed: undo the server-side lock with an unlock */
2989 if (rc2 && file_lock->fl_type != F_UNLCK) {
2990 einfo.ei_mode = LCK_NL;
2991 md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data,
2996 ll_finish_md_op_data(op_data);
/*
 * Look up the FID of @name under @parent via an MDS getattr-by-name RPC.
 * On success stores the FID in *fid and, if @inode is non-NULL,
 * instantiates the inode from the reply via ll_prep_inode().
 */
3001 int ll_get_fid_by_name(struct inode *parent, const char *name,
3002 int namelen, struct lu_fid *fid,
3003 struct inode **inode)
3005 struct md_op_data *op_data = NULL;
3006 struct mdt_body *body;
3007 struct ptlrpc_request *req;
3011 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3012 LUSTRE_OPC_ANY, NULL);
3013 if (IS_ERR(op_data))
3014 RETURN(PTR_ERR(op_data));
/* only need the FID and file type from the server */
3016 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
3017 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3018 ll_finish_md_op_data(op_data);
3022 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3024 GOTO(out_req, rc = -EFAULT);
3026 *fid = body->mbo_fid1;
3029 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
3031 ptlrpc_req_finished(req);
/*
 * Migrate directory entry @name under @parent to MDT @mdtidx
 * (implements "lfs migrate" for metadata).
 *
 * Steps: resolve the child inode (dcache first, then by-name RPC),
 * sanity-check the FID and refuse to migrate the filesystem root,
 * skip if already on the target MDT, take a write lease + data version
 * for regular files, then issue md_rename() with CLI_MIGRATE.  The
 * lease/och is cleaned up here if the server executed the close intent,
 * otherwise in the out_close path.
 *
 * NOTE(review): excerpt — several error-handling and retry lines
 * (including the -EAGAIN retry loop target) are not visible here.
 */
3035 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3036 const char *name, int namelen)
3038 struct dentry *dchild = NULL;
3039 struct inode *child_inode = NULL;
3040 struct md_op_data *op_data;
3041 struct ptlrpc_request *request = NULL;
3042 struct obd_client_handle *och = NULL;
3044 struct mdt_body *body;
3046 __u64 data_version = 0;
3049 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3050 name, PFID(ll_inode2fid(parent)), mdtidx);
3052 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3053 0, LUSTRE_OPC_ANY, NULL);
3054 if (IS_ERR(op_data))
3055 RETURN(PTR_ERR(op_data));
3057 /* Get child FID first */
3058 qstr.hash = full_name_hash(name, namelen);
/* try the dcache before falling back to a by-name RPC */
3061 dchild = d_lookup(file->f_path.dentry, &qstr);
3062 if (dchild != NULL) {
3063 if (dchild->d_inode != NULL)
3064 child_inode = igrab(dchild->d_inode);
3068 if (child_inode == NULL) {
3069 rc = ll_get_fid_by_name(parent, name, namelen,
3070 &op_data->op_fid3, &child_inode);
3075 if (child_inode == NULL)
3076 GOTO(out_free, rc = -EINVAL);
3079 * lfs migrate command needs to be blocked on the client
3080 * by checking the migrate FID against the FID of the
3083 if (child_inode == parent->i_sb->s_root->d_inode)
3084 GOTO(out_iput, rc = -EINVAL);
/* hold i_mutex across the whole migrate to serialize with other ops */
3086 mutex_lock(&child_inode->i_mutex);
3087 op_data->op_fid3 = *ll_inode2fid(child_inode);
3088 if (!fid_is_sane(&op_data->op_fid3)) {
3089 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
3090 ll_get_fsname(parent->i_sb, NULL, 0), name,
3091 PFID(&op_data->op_fid3));
3092 GOTO(out_unlock, rc = -EINVAL);
3095 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3097 GOTO(out_unlock, rc);
/* nothing to do if the child already lives on the target MDT */
3100 CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name,
3101 PFID(&op_data->op_fid3), mdtidx);
3102 GOTO(out_unlock, rc = 0);
3105 if (S_ISREG(child_inode->i_mode)) {
/* take a write lease so data stays stable during the migrate */
3106 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
3110 GOTO(out_unlock, rc);
3113 rc = ll_data_version(child_inode, &data_version,
3116 GOTO(out_close, rc);
3118 op_data->op_handle = och->och_fh;
3119 op_data->op_data = och->och_mod;
3120 op_data->op_data_version = data_version;
3121 op_data->op_lease_handle = och->och_lease_handle;
3122 op_data->op_bias |= MDS_RENAME_MIGRATE;
3125 op_data->op_mds = mdtidx;
3126 op_data->op_cli_flags = CLI_MIGRATE;
/* migrate is implemented as a rename onto the same name */
3127 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3128 namelen, name, namelen, &request);
3130 ll_update_times(request, parent);
3132 if (request != NULL) {
3133 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
3135 ptlrpc_req_finished(request);
3136 GOTO(out_close, rc = -EPROTO);
3139 /* If the server does release layout lock, then we cleanup
3140 * the client och here, otherwise release it in out_close: */
3142 body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
3143 obd_mod_put(och->och_mod);
3144 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
3146 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
3150 ptlrpc_req_finished(request);
3153 /* Try again if the file layout has changed. */
3154 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode)) {
3159 if (och != NULL) /* close the file */
3160 ll_lease_close(och, child_inode, NULL);
3162 clear_nlink(child_inode);
3164 mutex_unlock(&child_inode->i_mutex);
3168 ll_finish_md_op_data(op_data);
3173 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3181 * test if some locks matching bits and l_req_mode are acquired
3182 * - bits can be in different locks
3183 * - if found clear the common lock bits in *bits
3184 * - the bits not found, are kept in *bits
3186 * \param bits [IN] searched lock bits [IN]
3187 * \param l_req_mode [IN] searched lock mode
3188 * \retval boolean, true iff all bits are found
3190 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
3192 struct lustre_handle lockh;
3193 union ldlm_policy_data policy;
/* LCK_MINMODE means "any mode": match against all four modes at once */
3194 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
3195 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
3204 fid = &ll_i2info(inode)->lli_fid;
3205 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3206 ldlm_lockname[mode]);
/* TEST_LOCK: probe only, do not take references on matched locks */
3208 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* probe one inodebit at a time; stop early once all bits are found */
3209 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3210 policy.l_inodebits.bits = *bits & (1 << i);
3211 if (policy.l_inodebits.bits == 0)
3214 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3215 &policy, mode, &lockh)) {
3216 struct ldlm_lock *lock;
3218 lock = ldlm_handle2lock(&lockh);
3221 ~(lock->l_policy_data.l_inodebits.bits);
3222 LDLM_LOCK_PUT(lock);
3224 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match (and reference) a granted MD inodebits lock covering
 * @bits in one of the modes in @mode.  Returns the matched mode (0 if
 * none); on success *lockh holds a reference the caller must release
 * with ldlm_lock_decref().
 */
3231 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
3232 struct lustre_handle *lockh, __u64 flags,
3233 enum ldlm_mode mode)
3235 union ldlm_policy_data policy = { .l_inodebits = { bits } };
3240 fid = &ll_i2info(inode)->lli_fid;
3241 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3243 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3244 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the result of an inode revalidation RPC: -ENOENT on a
 * plain file/dir means the object was unlinked (handled quietly, with a
 * retry for striped directories that may just have a bad stripe); other
 * errors are logged (rate-limited for EACCES/EIDRM) and passed through.
 */
3249 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3251 /* Already unlinked. Just update nlink and return success */
3252 if (rc == -ENOENT) {
3254 /* If it is striped directory, and there is bad stripe
3255 * Let's revalidate the dentry again, instead of returning
3257 if (S_ISDIR(inode->i_mode) &&
3258 ll_i2info(inode)->lli_lsm_md != NULL)
3261 /* This path cannot be hit for regular files unless in
3262 * case of obscure races, so no need to to validate
3264 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3266 } else if (rc != 0) {
3267 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3268 "%s: revalidate FID "DFID" error: rc = %d\n",
3269 ll_get_fsname(inode->i_sb, NULL, 0),
3270 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate a dentry's MD attributes against the server for the lock
 * bits in @ibits.  Two paths:
 *  - ATTRFID-capable server: intent getattr/lookup by FID, which also
 *    revalidates the dentry (invalidating it if the file was unlinked);
 *  - otherwise: if no matching MD lock is cached, a plain md_getattr,
 *    requesting EA sizes too for regular files.
 */
3276 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3278 struct inode *inode = dentry->d_inode;
3279 struct ptlrpc_request *req = NULL;
3280 struct obd_export *exp;
3284 LASSERT(inode != NULL);
3286 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3287 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3289 exp = ll_i2mdexp(inode);
3291 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3292 * But under CMD case, it caused some lock issues, should be fixed
3293 * with new CMD ibits lock. See bug 12718 */
3294 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3295 struct lookup_intent oit = { .it_op = IT_GETATTR };
3296 struct md_op_data *op_data;
3298 if (ibits == MDS_INODELOCK_LOOKUP)
3299 oit.it_op = IT_LOOKUP;
3301 /* Call getattr by fid, so do not provide name at all. */
3302 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3303 dentry->d_inode, NULL, 0, 0,
3304 LUSTRE_OPC_ANY, NULL);
3305 if (IS_ERR(op_data))
3306 RETURN(PTR_ERR(op_data));
3308 rc = md_intent_lock(exp, op_data, &oit, &req,
3309 &ll_md_blocking_ast, 0);
3310 ll_finish_md_op_data(op_data);
3312 rc = ll_inode_revalidate_fini(inode, rc);
3316 rc = ll_revalidate_it_finish(req, &oit, dentry);
3318 ll_intent_release(&oit);
3322 /* Unlinked? Unhash dentry, so it is not picked up later by
3323 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3324 here to preserve get_cwd functionality on 2.6.
3326 if (!dentry->d_inode->i_nlink) {
3327 ll_lock_dcache(inode);
3328 d_lustre_invalidate(dentry, 0);
3329 ll_unlock_dcache(inode);
3332 ll_lookup_finish_locks(&oit, dentry);
/* fallback: only issue a getattr RPC if no cached MD lock covers ibits */
3333 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3334 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3335 u64 valid = OBD_MD_FLGETATTR;
3336 struct md_op_data *op_data;
/* regular files additionally need striping EA data */
3339 if (S_ISREG(inode->i_mode)) {
3340 rc = ll_get_default_mdsize(sbi, &ealen);
3343 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3346 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3347 0, ealen, LUSTRE_OPC_ANY,
3349 if (IS_ERR(op_data))
3350 RETURN(PTR_ERR(op_data));
3352 op_data->op_valid = valid;
3353 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3354 ll_finish_md_op_data(op_data);
3356 rc = ll_inode_revalidate_fini(inode, rc);
3360 rc = ll_prep_inode(&inode, req, NULL, NULL);
3363 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge the per-stripe MD attributes
 * (nlink, blocks, size, a/m/ctime) into the master inode via
 * md_merge_attr().  Requires lli_lsm_md to be set.
 */
3367 static int ll_merge_md_attr(struct inode *inode)
3369 struct cl_attr attr = { 0 };
3372 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3373 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3374 &attr, ll_md_blocking_ast);
3378 set_nlink(inode, attr.cat_nlink);
3379 inode->i_blocks = attr.cat_blocks;
3380 i_size_write(inode, attr.cat_size);
/* timestamps are cached in lli and copied to the inode by callers */
3382 ll_i2info(inode)->lli_atime = attr.cat_atime;
3383 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
3384 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/*
 * Full revalidation: MD attributes via __ll_inode_revalidate(), then
 * size.  Non-regular objects copy cached lli timestamps into the inode
 * (merging stripe attrs first for striped dirs); regular files glimpse
 * the size from the OSTs unless an HSM restore is in progress (the MDT
 * already supplied the correct size and holds the layout lock, so a
 * glimpse would just block until the restore finishes).
 */
3390 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3392 struct inode *inode = dentry->d_inode;
3396 rc = __ll_inode_revalidate(dentry, ibits);
3400 /* if object isn't regular file, don't validate size */
3401 if (!S_ISREG(inode->i_mode)) {
3402 if (S_ISDIR(inode->i_mode) &&
3403 ll_i2info(inode)->lli_lsm_md != NULL) {
3404 rc = ll_merge_md_attr(inode);
3409 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
3410 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
3411 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
3413 /* In case of restore, the MDT has the right size and has
3414 * already send it back without granting the layout lock,
3415 * inode is up-to-date so glimpse is useless.
3416 * Also to glimpse we need the layout, in case of a running
3417 * restore the MDT holds the layout lock so the glimpse will
3418 * block up to the end of restore (getattr will block)
3420 if (!ll_file_test_flag(ll_i2info(inode), LLIF_FILE_RESTORING))
3421 rc = ll_glimpse_size(inode);
/*
 * getattr handler: revalidate UPDATE|LOOKUP bits, then fill *stat from
 * the (now fresh) inode fields.  In 32-bit-API mode the inode number is
 * synthesized from the FID via cl_fid_build_ino().
 */
3426 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3428 struct inode *inode = de->d_inode;
3429 struct ll_sb_info *sbi = ll_i2sbi(inode);
3430 struct ll_inode_info *lli = ll_i2info(inode);
3433 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3434 MDS_INODELOCK_LOOKUP);
3435 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
/* fault-injection hook for getattr delay testing */
3440 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
3442 stat->dev = inode->i_sb->s_dev;
3443 if (ll_need_32bit_api(sbi))
3444 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3446 stat->ino = inode->i_ino;
3447 stat->mode = inode->i_mode;
3448 stat->uid = inode->i_uid;
3449 stat->gid = inode->i_gid;
3450 stat->rdev = inode->i_rdev;
3451 stat->atime = inode->i_atime;
3452 stat->mtime = inode->i_mtime;
3453 stat->ctime = inode->i_ctime;
3454 stat->blksize = 1 << inode->i_blkbits;
3456 stat->nlink = inode->i_nlink;
3457 stat->size = i_size_read(inode);
3458 stat->blocks = inode->i_blocks;
/*
 * fiemap handler: marshals the kernel's fiemap_extent_info into a
 * contiguous struct fiemap (header + extent array), copies the first
 * user extent in (ll_do_fiemap may continue from it), runs the mapping,
 * then copies flags/extent count and the mapped extents back out.
 */
3463 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3464 __u64 start, __u64 len)
3468 struct fiemap *fiemap;
3469 unsigned int extent_count = fieinfo->fi_extents_max;
3471 num_bytes = sizeof(*fiemap) + (extent_count *
3472 sizeof(struct fiemap_extent));
3473 OBD_ALLOC_LARGE(fiemap, num_bytes);
3478 fiemap->fm_flags = fieinfo->fi_flags;
3479 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3480 fiemap->fm_start = start;
3481 fiemap->fm_length = len;
/* seed with the caller's first extent (used for continuation) */
3482 if (extent_count > 0 &&
3483 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3484 sizeof(struct fiemap_extent)) != 0)
3485 GOTO(out, rc = -EFAULT);
3487 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3489 fieinfo->fi_flags = fiemap->fm_flags;
3490 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3491 if (extent_count > 0 &&
3492 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3493 fiemap->fm_mapped_extents *
3494 sizeof(struct fiemap_extent)) != 0)
3495 GOTO(out, rc = -EFAULT);
3497 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * get_acl handler: return a referenced duplicate of the cached POSIX
 * ACL under lli_lock.  The VFS permission-check path drops the
 * reference for us (see comment below).
 */
3501 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3503 struct ll_inode_info *lli = ll_i2info(inode);
3504 struct posix_acl *acl = NULL;
3507 spin_lock(&lli->lli_lock);
3508 /* VFS' acl_permission_check->check_acl will release the refcount */
3509 acl = posix_acl_dup(lli->lli_posix_acl);
3510 spin_unlock(&lli->lli_lock);
/*
 * ACL check callback for ll_generic_permission() on kernels without the
 * 2-arg generic_permission().  With CONFIG_FS_POSIX_ACL it fetches the
 * cached access ACL and asks posix_acl_permission(); RCU-walk lookups
 * bail out early (cannot block).  Without ACL support it is a stub.
 */
3515 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3517 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3518 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3520 ll_check_acl(struct inode *inode, int mask)
3523 # ifdef CONFIG_FS_POSIX_ACL
3524 struct posix_acl *acl;
3528 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
/* RCU walk: may not block, let the VFS retry in ref-walk mode */
3529 if (flags & IPERM_FLAG_RCU)
3532 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3537 rc = posix_acl_permission(inode, acl, mask);
3538 posix_acl_release(acl);
3541 # else /* !CONFIG_FS_POSIX_ACL */
3543 # endif /* CONFIG_FS_POSIX_ACL */
3545 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * permission handler (three kernel-API variants).  Revalidates the root
 * inode on first touch, applies root squash by overriding the task
 * credentials (fsuid/fsgid swapped to the squash ids and filesystem
 * capabilities lowered) for the duration of the check, then delegates
 * to lustre_check_remote_perm() for remote clients or
 * ll_generic_permission() otherwise, restoring credentials at the end.
 */
3547 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3548 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3550 # ifdef HAVE_INODE_PERMISION_2ARGS
3551 int ll_inode_permission(struct inode *inode, int mask)
3553 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3558 struct ll_sb_info *sbi;
3559 struct root_squash_info *squash;
3560 struct cred *cred = NULL;
3561 const struct cred *old_cred = NULL;
3563 bool squash_id = false;
/* RCU-walk lookups cannot block; punt back to the VFS */
3566 #ifdef MAY_NOT_BLOCK
3567 if (mask & MAY_NOT_BLOCK)
3569 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3570 if (flags & IPERM_FLAG_RCU)
3574 /* as root inode are NOT getting validated in lookup operation,
3575 * need to do it before permission check. */
3577 if (inode == inode->i_sb->s_root->d_inode) {
3578 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3579 MDS_INODELOCK_LOOKUP);
3584 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3585 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3587 /* squash fsuid/fsgid if needed */
3588 sbi = ll_i2sbi(inode);
3589 squash = &sbi->ll_squash;
3590 if (unlikely(squash->rsi_uid != 0 &&
3591 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
3592 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3596 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3597 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3598 squash->rsi_uid, squash->rsi_gid);
3600 /* update current process's credentials
3601 * and FS capability */
3602 cred = prepare_creds();
3606 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
3607 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* drop every filesystem-related capability bit for the squashed id */
3608 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3609 if ((1 << cap) & CFS_CAP_FS_MASK)
3610 cap_lower(cred->cap_effective, cap);
3612 old_cred = override_creds(cred);
3615 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3617 if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
3618 rc = lustre_check_remote_perm(inode, mask);
3620 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3622 /* restore current process's credentials and FS capability */
3624 revert_creds(old_cred);
3631 /* -o localflock - only provides locally consistent flock locks */
/* default file_operations: no .flock/.lock entries, so the kernel's
 * local (per-node) lock implementation is used */
3632 struct file_operations ll_file_operations = {
3633 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3634 # ifdef HAVE_SYNC_READ_WRITE
3635 .read = new_sync_read,
3636 .write = new_sync_write,
3638 .read_iter = ll_file_read_iter,
3639 .write_iter = ll_file_write_iter,
3640 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3641 .read = ll_file_read,
3642 .aio_read = ll_file_aio_read,
3643 .write = ll_file_write,
3644 .aio_write = ll_file_aio_write,
3645 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3646 .unlocked_ioctl = ll_file_ioctl,
3647 .open = ll_file_open,
3648 .release = ll_file_release,
3649 .mmap = ll_file_mmap,
3650 .llseek = ll_file_seek,
3651 .splice_read = ll_file_splice_read,
/* file_operations used with -o flock: same as the default table but
 * routes both flock(2) and fcntl locks through ll_file_flock() for
 * cluster-wide consistency */
3656 struct file_operations ll_file_operations_flock = {
3657 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3658 # ifdef HAVE_SYNC_READ_WRITE
3659 .read = new_sync_read,
3660 .write = new_sync_write,
3661 # endif /* HAVE_SYNC_READ_WRITE */
3662 .read_iter = ll_file_read_iter,
3663 .write_iter = ll_file_write_iter,
3664 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3665 .read = ll_file_read,
3666 .aio_read = ll_file_aio_read,
3667 .write = ll_file_write,
3668 .aio_write = ll_file_aio_write,
3669 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3670 .unlocked_ioctl = ll_file_ioctl,
3671 .open = ll_file_open,
3672 .release = ll_file_release,
3673 .mmap = ll_file_mmap,
3674 .llseek = ll_file_seek,
3675 .splice_read = ll_file_splice_read,
3678 .flock = ll_file_flock,
3679 .lock = ll_file_flock
3682 /* These are for -o noflock - to return ENOSYS on flock calls */
/* file_operations used with -o noflock: lock entries point at
 * ll_file_noflock, which rejects all lock requests */
3683 struct file_operations ll_file_operations_noflock = {
3684 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3685 # ifdef HAVE_SYNC_READ_WRITE
3686 .read = new_sync_read,
3687 .write = new_sync_write,
3688 # endif /* HAVE_SYNC_READ_WRITE */
3689 .read_iter = ll_file_read_iter,
3690 .write_iter = ll_file_write_iter,
3691 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3692 .read = ll_file_read,
3693 .aio_read = ll_file_aio_read,
3694 .write = ll_file_write,
3695 .aio_write = ll_file_aio_write,
3696 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3697 .unlocked_ioctl = ll_file_ioctl,
3698 .open = ll_file_open,
3699 .release = ll_file_release,
3700 .mmap = ll_file_mmap,
3701 .llseek = ll_file_seek,
3702 .splice_read = ll_file_splice_read,
3705 .flock = ll_file_noflock,
3706 .lock = ll_file_noflock
/* inode_operations for regular Lustre files */
3709 struct inode_operations ll_file_inode_operations = {
3710 .setattr = ll_setattr,
3711 .getattr = ll_getattr,
3712 .permission = ll_inode_permission,
3713 .setxattr = ll_setxattr,
3714 .getxattr = ll_getxattr,
3715 .listxattr = ll_listxattr,
3716 .removexattr = ll_removexattr,
3717 .fiemap = ll_fiemap,
3718 #ifdef HAVE_IOP_GET_ACL
3719 .get_acl = ll_get_acl,
3723 /* dynamic ioctl number support routins */
/* global registry of dynamically-registered ioctl handlers, protected
 * by a reader/writer semaphore */
3724 static struct llioc_ctl_data {
3725 struct rw_semaphore ioc_sem;
3726 struct list_head ioc_head;
3728 __RWSEM_INITIALIZER(llioc.ioc_sem),
3729 LIST_HEAD_INIT(llioc.ioc_head)
/* one registry entry: callback plus the ioctl numbers it handles
 * (iocd_cmd is a trailing variable-length array) */
3734 struct list_head iocd_list;
3735 unsigned int iocd_size;
3736 llioc_callback_t iocd_cb;
3737 unsigned int iocd_count;
3738 unsigned int iocd_cmd[0];
/*
 * Register a dynamic ioctl handler for @count ioctl numbers in @cmd.
 * Returns an opaque cookie (the allocated entry) used for
 * ll_iocontrol_unregister(), or NULL on bad arguments / allocation
 * failure.
 */
3741 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3744 struct llioc_data *in_data = NULL;
3747 if (cb == NULL || cmd == NULL ||
3748 count > LLIOC_MAX_CMD || count < 0)
3751 size = sizeof(*in_data) + count * sizeof(unsigned int);
3752 OBD_ALLOC(in_data, size);
3753 if (in_data == NULL)
3756 memset(in_data, 0, sizeof(*in_data));
3757 in_data->iocd_size = size;
3758 in_data->iocd_cb = cb;
3759 in_data->iocd_count = count;
3760 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
/* publish under the write lock */
3762 down_write(&llioc.ioc_sem);
3763 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3764 up_write(&llioc.ioc_sem);
/*
 * Unregister a dynamic ioctl handler by the cookie returned from
 * ll_iocontrol_register(); frees the entry.  Warns if the cookie is
 * not found in the registry.
 */
3769 void ll_iocontrol_unregister(void *magic)
3771 struct llioc_data *tmp;
3776 down_write(&llioc.ioc_sem);
3777 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* save size before unlink: tmp is freed after dropping the lock */
3779 unsigned int size = tmp->iocd_size;
3781 list_del(&tmp->iocd_list);
3782 up_write(&llioc.ioc_sem);
3784 OBD_FREE(tmp, size);
3788 up_write(&llioc.ioc_sem);
3790 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3793 EXPORT_SYMBOL(ll_iocontrol_register);
3794 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch an unrecognized ioctl to the dynamic handlers: scan every
 * registered entry under the read lock, invoke the callback of the
 * first one claiming @cmd, and stop once a callback returns LLIOC_STOP.
 * The callback's rc is returned through *rcp.
 */
3796 static enum llioc_iter
3797 ll_iocontrol_call(struct inode *inode, struct file *file,
3798 unsigned int cmd, unsigned long arg, int *rcp)
3800 enum llioc_iter ret = LLIOC_CONT;
3801 struct llioc_data *data;
3802 int rc = -EINVAL, i;
3804 down_read(&llioc.ioc_sem);
3805 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3806 for (i = 0; i < data->iocd_count; i++) {
3807 if (cmd != data->iocd_cmd[i])
3810 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3814 if (ret == LLIOC_STOP)
3817 up_read(&llioc.ioc_sem);
/*
 * Push a layout configuration into the cl_object stack via
 * cl_conf_set().  For OBJECT_CONF_SET the layout must be applied before
 * the layout lock is allowed to match (ldlm_lock_allow_match), after
 * which the cached layout generation is refreshed from the object.
 */
3824 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3826 struct ll_inode_info *lli = ll_i2info(inode);
3827 struct cl_object *obj = lli->lli_clob;
3836 env = cl_env_get(&refcheck);
3838 RETURN(PTR_ERR(env));
3840 rc = cl_conf_set(env, lli->lli_clob, conf);
3844 if (conf->coc_opc == OBJECT_CONF_SET) {
3845 struct ldlm_lock *lock = conf->coc_lock;
3846 struct cl_layout cl = {
3850 LASSERT(lock != NULL);
3851 LASSERT(ldlm_has_layout(lock));
3853 /* it can only be allowed to match after layout is
3854 * applied to inode otherwise false layout would be
3855 * seen. Applying layout shoud happen before dropping
3856 * the intent lock. */
3857 ldlm_lock_allow_match(lock);
3859 rc = cl_object_layout_get(env, obj, &cl);
3864 DFID": layout version change: %u -> %u\n",
3865 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3867 ll_layout_version_set(lli, cl.cl_layout_gen);
3871 cl_env_put(env, &refcheck);
3876 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/*
 * If the layout lock's LVB is empty (lock granted via completion AST
 * rather than right away), fetch the LOV EA with a getxattr RPC and
 * attach it to the lock as its LVB.  Races with a concurrent fetch are
 * resolved under lock_res_and_lock(): the loser frees its copy.
 */
3877 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3880 struct ll_sb_info *sbi = ll_i2sbi(inode);
3881 struct ptlrpc_request *req;
3882 struct mdt_body *body;
3889 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3890 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
3891 lock->l_lvb_data, lock->l_lvb_len);
/* layout already attached to the lock: nothing to fetch */
3893 if (lock->l_lvb_data != NULL)
3896 /* if layout lock was granted right away, the layout is returned
3897 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3898 * blocked and then granted via completion ast, we have to fetch
3899 * layout here. Please note that we can't use the LVB buffer in
3900 * completion AST because it doesn't have a large enough buffer */
3901 rc = ll_get_default_mdsize(sbi, &lmmsize);
3903 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
3904 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3909 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3911 GOTO(out, rc = -EPROTO);
3913 lmmsize = body->mbo_eadatasize;
3914 if (lmmsize == 0) /* empty layout */
3917 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3919 GOTO(out, rc = -EFAULT);
3921 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3922 if (lvbdata == NULL)
3923 GOTO(out, rc = -ENOMEM);
3925 memcpy(lvbdata, lmm, lmmsize);
/* install the LVB atomically; another thread may have won the race */
3926 lock_res_and_lock(lock);
3927 if (unlikely(lock->l_lvb_data == NULL)) {
3928 lock->l_lvb_type = LVB_T_LAYOUT;
3929 lock->l_lvb_data = lvbdata;
3930 lock->l_lvb_len = lmmsize;
3933 unlock_res_and_lock(lock);
3936 OBD_FREE_LARGE(lvbdata, lmmsize);
3941 ptlrpc_req_finished(req);
3946 * Apply the layout to the inode. Layout lock is held and will be released
/*
 * Given a held layout lock (handle @lockh, mode @mode): fetch the
 * layout into the lock's LVB if needed, then configure the cl_object
 * with OBJECT_CONF_SET.  The lock reference is always dropped before
 * returning.  If the configuration hits busy IO (-EBUSY), issue
 * OBJECT_CONF_WAIT so in-flight IO completes before the caller retries.
 */
3949 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
3950 struct inode *inode)
3952 struct ll_inode_info *lli = ll_i2info(inode);
3953 struct ll_sb_info *sbi = ll_i2sbi(inode);
3954 struct ldlm_lock *lock;
3955 struct cl_object_conf conf;
3958 bool wait_layout = false;
3961 LASSERT(lustre_handle_is_used(lockh));
3963 lock = ldlm_handle2lock(lockh);
3964 LASSERT(lock != NULL);
3965 LASSERT(ldlm_has_layout(lock));
3967 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
3968 PFID(&lli->lli_fid), inode);
3970 /* in case this is a caching lock and reinstate with new inode */
3971 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3973 lock_res_and_lock(lock);
3974 lvb_ready = ldlm_is_lvb_ready(lock);
3975 unlock_res_and_lock(lock);
3976 /* checking lvb_ready is racy but this is okay. The worst case is
3977 * that multi processes may configure the file on the same time. */
3982 rc = ll_layout_fetch(inode, lock);
3986 /* for layout lock, lmm is stored in lock's lvb.
3987 * lvb_data is immutable if the lock is held so it's safe to access it
3990 * set layout to file. Unlikely this will fail as old layout was
3991 * surely eliminated */
3992 memset(&conf, 0, sizeof conf);
3993 conf.coc_opc = OBJECT_CONF_SET;
3994 conf.coc_inode = inode;
3995 conf.coc_lock = lock;
3996 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
3997 conf.u.coc_layout.lb_len = lock->l_lvb_len;
3998 rc = ll_layout_conf(inode, &conf);
4000 /* refresh layout failed, need to wait */
4001 wait_layout = rc == -EBUSY;
4005 LDLM_LOCK_PUT(lock);
4006 ldlm_lock_decref(lockh, mode);
4008 /* wait for IO to complete if it's still being used. */
4010 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
4011 ll_get_fsname(inode->i_sb, NULL, 0),
4012 PFID(&lli->lli_fid), inode);
4014 memset(&conf, 0, sizeof conf);
4015 conf.coc_opc = OBJECT_CONF_WAIT;
4016 conf.coc_inode = inode;
4017 rc = ll_layout_conf(inode, &conf);
4021 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4022 ll_get_fsname(inode->i_sb, NULL, 0),
4023 PFID(&lli->lli_fid), rc);
/*
 * Acquire and apply the layout lock, assuming the per-inode layout
 * mutex is already held by the caller.  Fast path: match an already
 * cached lock via ll_take_md_lock().  Slow path: enqueue an IT_LAYOUT
 * intent on the MDT and then apply it with ll_layout_lock_set().
 */
4028 static int ll_layout_refresh_locked(struct inode *inode)
4030 struct ll_inode_info *lli = ll_i2info(inode);
4031 struct ll_sb_info *sbi = ll_i2sbi(inode);
4032 struct md_op_data *op_data;
4033 struct lookup_intent it;
4034 struct lustre_handle lockh;
4035 enum ldlm_mode mode;
4036 struct ldlm_enqueue_info einfo = {
4037 .ei_type = LDLM_IBITS,
4039 .ei_cb_bl = &ll_md_blocking_ast,
4040 .ei_cb_cp = &ldlm_completion_ast,
4046 /* mostly layout lock is caching on the local side, so try to match
4047 * it before grabbing layout lock mutex. */
4048 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4049 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4050 if (mode != 0) { /* hit cached lock */
4051 rc = ll_layout_lock_set(&lockh, mode, inode);
4058 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4059 0, 0, LUSTRE_OPC_ANY, NULL);
4060 if (IS_ERR(op_data))
4061 RETURN(PTR_ERR(op_data));
4063 /* have to enqueue one */
4064 memset(&it, 0, sizeof(it));
4065 it.it_op = IT_LAYOUT;
4066 lockh.cookie = 0ULL;
4068 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4069 ll_get_fsname(inode->i_sb, NULL, 0),
4070 PFID(&lli->lli_fid), inode);
4072 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, &it, op_data, &lockh, 0);
4073 if (it.it_request != NULL)
4074 ptlrpc_req_finished(it.it_request);
4075 it.it_request = NULL;
4077 ll_finish_md_op_data(op_data);
/* steal the lock mode from the intent; the handle stays in lockh */
4079 mode = it.it_lock_mode;
4080 it.it_lock_mode = 0;
4081 ll_intent_drop_lock(&it);
4084 /* set lock data in case this is a new lock */
4085 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4086 rc = ll_layout_lock_set(&lockh, mode, inode);
4095 * This function checks if there exists a LAYOUT lock on the client side,
4096 * or enqueues it if it doesn't have one in cache.
4098 * This function will not hold layout lock so it may be revoked any time after
4099 this function returns. Any operations that depend on the layout should be redone
4102 * This function should be called before lov_io_init() to get an uptodate
4103 * layout version, the caller should save the version number and after IO
4104 * is finished, this function should be called again to verify that layout
4105 * is not changed during IO time.
4107 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4109 struct ll_inode_info *lli = ll_i2info(inode);
4110 struct ll_sb_info *sbi = ll_i2sbi(inode);
4114 *gen = ll_layout_version_get(lli);
/* fast path: layout locking is disabled for this mount, or a valid
 * layout generation is already cached — nothing to refresh */
4115 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
/* layout locks only apply to regular files with a sane FID */
4119 LASSERT(fid_is_sane(ll_inode2fid(inode)));
4120 LASSERT(S_ISREG(inode->i_mode));
4122 /* take layout lock mutex to enqueue layout lock exclusively. */
4123 mutex_lock(&lli->lli_layout_mutex);
4125 rc = ll_layout_refresh_locked(inode);
/* re-read the generation, presumably updated by the refresh above —
 * NOTE(review): confirm ll_layout_refresh_locked() bumps it */
4129 *gen = ll_layout_version_get(lli);
4131 mutex_unlock(&lli->lli_layout_mutex);
4137 * This function sends a restore request to the MDT
4139 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4141 struct hsm_user_request *hur;
4145 len = sizeof(struct hsm_user_request) +
4146 sizeof(struct hsm_user_item);
4147 OBD_ALLOC(hur, len);
4151 hur->hur_request.hr_action = HUA_RESTORE;
4152 hur->hur_request.hr_archive_id = 0;
4153 hur->hur_request.hr_flags = 0;
4154 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4155 sizeof(hur->hur_user_item[0].hui_fid));
4156 hur->hur_user_item[0].hui_extent.offset = offset;
4157 hur->hur_user_item[0].hui_extent.length = length;
4158 hur->hur_request.hr_itemcount = 1;
4159 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,