4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2015, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <linux/pagemap.h>
46 #include <linux/file.h>
47 #include <linux/sched.h>
48 #include <linux/user_namespace.h>
49 #ifdef HAVE_UIDGID_HEADER
50 # include <linux/uidgid.h>
52 #include <lustre/ll_fiemap.h>
54 #include <lustre_ioctl.h>
55 #include <lustre_swab.h>
57 #include "cl_object.h"
58 #include "llite_internal.h"
59 #include "vvp_internal.h"
62 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
64 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
67 static enum llioc_iter
68 ll_iocontrol_call(struct inode *inode, struct file *file,
69 unsigned int cmd, unsigned long arg, int *rcp);
/*
 * Allocate a per-open-file ll_file_data from its slab cache and reset
 * its write-failure flag.  GFP_NOFS prevents reclaim from re-entering
 * the filesystem during the allocation.
 * NOTE(review): the NULL-allocation check and the return statement are
 * not visible in this chunk -- callers must handle a NULL result.
 */
static struct ll_file_data *ll_file_data_get(void)
	struct ll_file_data *fd;
	OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
	fd->fd_write_failed = false;
/* Return an ll_file_data to its slab cache; counterpart of ll_file_data_get(). */
static void ll_file_data_put(struct ll_file_data *fd)
	OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/**
 * Packs all the attributes into @op_data for the CLOSE rpc.
 *
 * Snapshots the inode's mode, timestamps, size and block count into
 * @op_data, marks them valid, and records the open handle @och being
 * closed so the MDT can match it.
 */
static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
			     struct obd_client_handle *och)
	ll_prep_md_op_data(op_data, inode, NULL, NULL,
			   0, 0, LUSTRE_OPC_ANY, NULL);

	/* Copy the client's latest view of the inode attributes. */
	op_data->op_attr.ia_mode = inode->i_mode;
	op_data->op_attr.ia_atime = inode->i_atime;
	op_data->op_attr.ia_mtime = inode->i_mtime;
	op_data->op_attr.ia_ctime = inode->i_ctime;
	op_data->op_attr.ia_size = i_size_read(inode);
	/* Flag every packed attribute as valid so the MDT applies them. */
	op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
				     ATTR_MTIME | ATTR_MTIME_SET |
				     ATTR_CTIME | ATTR_CTIME_SET;
	op_data->op_attr_blocks = inode->i_blocks;
	op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
	op_data->op_handle = och->och_fh;

	if (och->och_flags & FMODE_WRITE &&
	    ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
		/* For HSM: if inode data has been modified, pack it so that
		 * MDT can set data dirty flag in the archive. */
		op_data->op_bias |= MDS_DATA_MODIFIED;
/**
 * Perform a close, possibly with a bias.
 * The meaning of "data" depends on the value of "bias".
 *
 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
 * swap layouts with.
 */
static int ll_close_inode_openhandle(struct inode *inode,
				     struct obd_client_handle *och,
				     enum mds_op_bias bias, void *data)
	struct obd_export *md_exp = ll_i2mdexp(inode);
	const struct ll_inode_info *lli = ll_i2info(inode);
	struct md_op_data *op_data;
	struct ptlrpc_request *req = NULL;

	/* Sanity: the MDC export must still be connected to an OBD. */
	if (class_exp2obd(md_exp) == NULL) {
		CERROR("%s: invalid MDC connection handle closing "DFID"\n",
		       ll_get_fsname(inode->i_sb, NULL, 0),
		       PFID(&lli->lli_fid));

	OBD_ALLOC_PTR(op_data);
	/* We leak openhandle and request here on error, but not much to be
	 * done in OOM case since app won't retry close on error either. */
		GOTO(out, rc = -ENOMEM);

	ll_prepare_close(inode, op_data, och);
	/* Pack bias-specific payload (part of the switch is elided here). */
	case MDS_CLOSE_LAYOUT_SWAP:
		LASSERT(data != NULL);
		op_data->op_bias |= MDS_CLOSE_LAYOUT_SWAP;
		op_data->op_data_version = 0;
		op_data->op_lease_handle = och->och_lease_handle;
		/* @data is the inode whose layout we swap with. */
		op_data->op_fid2 = *ll_inode2fid(data);

	case MDS_HSM_RELEASE:
		LASSERT(data != NULL);
		op_data->op_bias |= MDS_HSM_RELEASE;
		/* @data is the data version to release against. */
		op_data->op_data_version = *(__u64 *)data;
		op_data->op_lease_handle = och->och_lease_handle;
		op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;

		LASSERT(data == NULL);

	rc = md_close(md_exp, op_data, och->och_mod, &req);
	if (rc != 0 && rc != -EINTR)
		CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
		       md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);

	/* For intent closes, check whether the MDT actually executed the
	 * intent (release/layout-swap) by inspecting the reply body. */
	    op_data->op_bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)) {
		struct mdt_body *body;

		body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
		if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))

	ll_finish_md_op_data(op_data);

	/* Handle is dead after close; poison it against reuse. */
	md_clear_open_replay_data(md_exp, och);
	och->och_fh.cookie = DEAD_HANDLE_MAGIC;

	ptlrpc_req_finished(req);	/* This is close request */
/*
 * Drop one reference on the MDS open handle matching @fmode and, when no
 * users remain, send the close RPC for that handle.
 */
int ll_md_real_close(struct inode *inode, fmode_t fmode)
	struct ll_inode_info *lli = ll_i2info(inode);
	struct obd_client_handle **och_p;
	struct obd_client_handle *och;

	/* Select the per-mode handle slot and its use count. */
	if (fmode & FMODE_WRITE) {
		och_p = &lli->lli_mds_write_och;
		och_usecount = &lli->lli_open_fd_write_count;
	} else if (fmode & FMODE_EXEC) {
		och_p = &lli->lli_mds_exec_och;
		och_usecount = &lli->lli_open_fd_exec_count;
		/* Fall-back branch (the "else {" line is elided here). */
		LASSERT(fmode & FMODE_READ);
		och_p = &lli->lli_mds_read_och;
		och_usecount = &lli->lli_open_fd_read_count;

	mutex_lock(&lli->lli_och_mutex);
	if (*och_usecount > 0) {
		/* There are still users of this handle, so skip
		 * freeing it. */
		mutex_unlock(&lli->lli_och_mutex);

	mutex_unlock(&lli->lli_och_mutex);

	/* There might be a race and this handle may already
	 * be closed. */
	rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/*
 * Per-file-descriptor close: release group lock and lease (if held),
 * drop this fd's reference on the MDS open handle, and free the
 * ll_file_data.  The MDS close RPC is skipped if we still hold a
 * granted OPEN DLM lock on the inode.
 */
static int ll_md_close(struct inode *inode, struct file *file)
	union ldlm_policy_data policy = {
		.l_inodebits	= { MDS_INODELOCK_OPEN },
	__u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct ll_inode_info *lli = ll_i2info(inode);
	struct lustre_handle lockh;
	enum ldlm_mode lockmode;

	/* clear group lock, if present */
	if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
		ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);

	if (fd->fd_lease_och != NULL) {

		/* Usually the lease is not released when the
		 * application crashed, we need to release here. */
		rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
		CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
		       PFID(&lli->lli_fid), rc, lease_broken);

		fd->fd_lease_och = NULL;

	if (fd->fd_och != NULL) {
		rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);

	/* Let's see if we have good enough OPEN lock on the file and if
	   we can skip talking to MDS */
	mutex_lock(&lli->lli_och_mutex);
	if (fd->fd_omode & FMODE_WRITE) {
		LASSERT(lli->lli_open_fd_write_count);
		lli->lli_open_fd_write_count--;
	} else if (fd->fd_omode & FMODE_EXEC) {
		LASSERT(lli->lli_open_fd_exec_count);
		lli->lli_open_fd_exec_count--;
		/* else branch (read case); "} else {" elided in this view. */
		LASSERT(lli->lli_open_fd_read_count);
		lli->lli_open_fd_read_count--;
	mutex_unlock(&lli->lli_och_mutex);

	/* LDLM_FL_TEST_LOCK above means this only checks, never grabs a ref. */
	if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
			   LDLM_IBITS, &policy, lockmode, &lockh))
		rc = ll_md_real_close(inode, fd->fd_omode);

	LUSTRE_FPRIVATE(file) = NULL;
	ll_file_data_put(fd);
/* While this returns an error code, fput() the caller does not, so we need
 * to make every effort to clean up all of our state here. Also, applications
 * rarely check close errors and even if an error is returned they will not
 * re-try the close call.
 */
int ll_file_release(struct inode *inode, struct file *file)
	struct ll_file_data *fd;
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ll_inode_info *lli = ll_i2info(inode);

	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
	       PFID(ll_inode2fid(inode)), inode);

#ifdef CONFIG_FS_POSIX_ACL
	/* Remote-client ACL bookkeeping applies only to the root inode. */
	if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
	    inode == inode->i_sb->s_root->d_inode) {
		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);

		if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
			fd->fd_flags &= ~LL_FILE_RMTACL;
			rct_del(&sbi->ll_rct, current_pid());
			et_search_free(&sbi->ll_et, current_pid());

	/* Don't count releases of the root dentry in stats. */
	if (inode->i_sb->s_root != file->f_path.dentry)
		ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
	fd = LUSTRE_FPRIVATE(file);

	/* The last ref on @file, maybe not the the owner pid of statahead,
	 * because parent and child process can share the same file handle. */
	if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
		ll_deauthorize_statahead(inode, fd);

	/* Root has no MDS open handle; just free fd and return early. */
	if (inode->i_sb->s_root == file->f_path.dentry) {
		LUSTRE_FPRIVATE(file) = NULL;
		ll_file_data_put(fd);

	if (!S_ISDIR(inode->i_mode)) {
		if (lli->lli_clob != NULL)
			/* Fold async write errors into lli_async_rc. */
			lov_read_and_clear_async_rc(lli->lli_clob);
		lli->lli_async_rc = 0;

	rc = ll_md_close(inode, file);

	if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
		libcfs_debug_dumplog();
/*
 * Issue an intent-open RPC to the MDS for @file, optionally packing
 * striping data (@lmm/@lmmsize), and install the resulting lock data
 * on the inode.  Requires MDS_OPEN_BY_FID in @itp->it_flags.
 */
static int ll_intent_file_open(struct file *file, void *lmm, int lmmsize,
			       struct lookup_intent *itp)
	struct dentry *de = file->f_path.dentry;
	struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
	struct dentry *parent = de->d_parent;
	const char *name = NULL;

	struct md_op_data *op_data;
	struct ptlrpc_request *req = NULL;

	LASSERT(parent != NULL);
	LASSERT(itp->it_flags & MDS_OPEN_BY_FID);

	/* if server supports open-by-fid, or file name is invalid, don't pack
	 * name in open request */
	if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
	    lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
		name = de->d_name.name;
		len = de->d_name.len;

	op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
				     name, len, 0, LUSTRE_OPC_ANY, NULL);
		RETURN(PTR_ERR(op_data));
	op_data->op_data = lmm;
	op_data->op_data_size = lmmsize;

	rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
			    &ll_md_blocking_ast, 0);
	ll_finish_md_op_data(op_data);
	/* reason for keep own exit path - don`t flood log
	 * with messages with -ESTALE errors.
	 */
	if (!it_disposition(itp, DISP_OPEN_OPEN) ||
	    it_open_error(DISP_OPEN_OPEN, itp))
	/* Server granted an open we no longer want; give it back. */
	ll_release_openhandle(de, itp);

	if (it_disposition(itp, DISP_LOOKUP_NEG))
		GOTO(out, rc = -ENOENT);

	if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
		rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
		CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);

	rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
	if (!rc && itp->it_lock_mode)
		ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);

	ptlrpc_req_finished(req);
	ll_intent_drop_lock(itp);
/*
 * Fill an obd_client_handle from the server's reply to an intent open:
 * copy the open handle, FID, lease-lock cookie and flags, then register
 * the handle for open replay after MDS recovery.
 */
static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
		       struct obd_client_handle *och)
	struct mdt_body *body;

	body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
	och->och_fh = body->mbo_handle;
	och->och_fid = body->mbo_fid1;
	/* Lease (if any) is carried as the intent's lock handle. */
	och->och_lease_handle.cookie = it->it_lock_handle;
	och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
	och->och_flags = it->it_flags;

	return md_set_open_replay_data(md_exp, och, it);
/*
 * Finish the client-local part of an open: optionally fill @och from the
 * intent reply, attach @fd to the file, and initialize readahead and
 * cl_io context state.
 */
static int ll_local_open(struct file *file, struct lookup_intent *it,
			 struct ll_file_data *fd, struct obd_client_handle *och)
	struct inode *inode = file->f_path.dentry->d_inode;

	LASSERT(!LUSTRE_FPRIVATE(file));

	rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);

	LUSTRE_FPRIVATE(file) = fd;
	ll_readahead_init(inode, &fd->fd_ras);
	/* Remember only the access-mode bits for later close bookkeeping. */
	fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);

	/* ll_cl_context initialize */
	rwlock_init(&fd->fd_lock);
	INIT_LIST_HEAD(&fd->fd_lccs);
/* Open a file, and (for the very first open) create objects on the OSTs at
 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
 * creation or open until ll_lov_setstripe() ioctl is called.
 *
 * If we already have the stripe MD locally then we don't request it in
 * md_open(), by passing a lmm_size = 0.
 *
 * It is up to the application to ensure no other processes open this file
 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 * used. We might be able to avoid races of that sort by getting lli_open_sem
 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 */
int ll_file_open(struct inode *inode, struct file *file)
	struct ll_inode_info *lli = ll_i2info(inode);
	struct lookup_intent *it, oit = { .it_op = IT_OPEN,
					  .it_flags = file->f_flags };
	struct obd_client_handle **och_p = NULL;
	__u64 *och_usecount = NULL;
	struct ll_file_data *fd;

	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
	       PFID(ll_inode2fid(inode)), inode, file->f_flags);

	/* An intent stashed here by ll_lookup/atomic_open, if any. */
	it = file->private_data; /* XXX: compat macro */
	file->private_data = NULL; /* prevent ll_local_open assertion */

	fd = ll_file_data_get();
		GOTO(out_openerr, rc = -ENOMEM);

	if (S_ISDIR(inode->i_mode))
		ll_authorize_statahead(inode, fd);

	/* Root is opened without an MDS round trip. */
	if (inode->i_sb->s_root == file->f_path.dentry) {
		LUSTRE_FPRIVATE(file) = fd;

	/* No usable intent: build our own (oit) from f_flags. */
	if (!it || !it->it_disposition) {
		/* Convert f_flags into access mode. We cannot use file->f_mode,
		 * because everything but O_ACCMODE mask was stripped from
		 * there */
		if ((oit.it_flags + 1) & O_ACCMODE)
		if (file->f_flags & O_TRUNC)
			oit.it_flags |= FMODE_WRITE;

		/* kernel only call f_op->open in dentry_open.  filp_open calls
		 * dentry_open after call to open_namei that checks permissions.
		 * Only nfsd_open call dentry_open directly without checking
		 * permissions and because of that this code below is safe. */
		if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
			oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;

		/* We do not want O_EXCL here, presumably we opened the file
		 * already? XXX - NFS implications? */
		oit.it_flags &= ~O_EXCL;

		/* bug20584, if "it_flags" contains O_CREAT, the file will be
		 * created if necessary, then "IT_CREAT" should be set to keep
		 * consistent with it */
		if (oit.it_flags & O_CREAT)
			oit.it_op |= IT_CREAT;

	/* Let's see if we have file open on MDS already. */
	if (it->it_flags & FMODE_WRITE) {
		och_p = &lli->lli_mds_write_och;
		och_usecount = &lli->lli_open_fd_write_count;
	} else if (it->it_flags & FMODE_EXEC) {
		och_p = &lli->lli_mds_exec_och;
		och_usecount = &lli->lli_open_fd_exec_count;
		/* else branch (read case); "} else {" elided in this view. */
		och_p = &lli->lli_mds_read_och;
		och_usecount = &lli->lli_open_fd_read_count;

	mutex_lock(&lli->lli_och_mutex);
	if (*och_p) { /* Open handle is present */
		if (it_disposition(it, DISP_OPEN_OPEN)) {
			/* Well, there's extra open request that we do not need,
			   let's close it somehow. This will decref request. */
			rc = it_open_error(DISP_OPEN_OPEN, it);
				mutex_unlock(&lli->lli_och_mutex);
				GOTO(out_openerr, rc);

			ll_release_openhandle(file->f_path.dentry, it);

		/* Reuse the existing handle; no new och needed. */
		rc = ll_local_open(file, it, fd, NULL);
			mutex_unlock(&lli->lli_och_mutex);
			GOTO(out_openerr, rc);
		/* No cached handle: we must create one. */
		LASSERT(*och_usecount == 0);
		if (!it->it_disposition) {
			/* We cannot just request lock handle now, new ELC code
			   means that one of other OPEN locks for this file
			   could be cancelled, and since blocking ast handler
			   would attempt to grab och_mutex as well, that would
			   result in a deadlock */
			mutex_unlock(&lli->lli_och_mutex);
			/*
			 * Normally called under two situations:
			 * 2. A race/condition on MDS resulting in no open
			 *    handle to be returned from LOOKUP|OPEN request,
			 *    for example if the target entry was a symlink.
			 *
			 * Always fetch MDS_OPEN_LOCK if this is not setstripe.
			 *
			 * Always specify MDS_OPEN_BY_FID because we don't want
			 * to get file with different fid.
			 */
			it->it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID;
			rc = ll_intent_file_open(file, NULL, 0, it);
				GOTO(out_openerr, rc);

		OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
			GOTO(out_och_free, rc = -ENOMEM);

		/* md_intent_lock() didn't get a request ref if there was an
		 * open error, so don't do cleanup on the request here
		 * (bug 3430) */
		/* XXX (green): Should not we bail out on any error here, not
		 * just open error? */
		rc = it_open_error(DISP_OPEN_OPEN, it);
			GOTO(out_och_free, rc);

		LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
			 "inode %p: disposition %x, status %d\n", inode,
			 it_disposition(it, ~0), it->it_status);

		rc = ll_local_open(file, it, fd, *och_p);
			GOTO(out_och_free, rc);

	mutex_unlock(&lli->lli_och_mutex);

	/* Must do this outside lli_och_mutex lock to prevent deadlock where
	   different kind of OPEN lock for this same inode gets cancelled
	   by ldlm_cancel_lru */
	if (!S_ISREG(inode->i_mode))
		GOTO(out_och_free, rc);

	cl_lov_delay_create_clear(&file->f_flags);
	GOTO(out_och_free, rc);

	/* Error/cleanup paths (labels elided in this view). */
		if (och_p && *och_p) {
			OBD_FREE(*och_p, sizeof (struct obd_client_handle));
			*och_p = NULL; /* OBD_FREE writes some magic there */

		mutex_unlock(&lli->lli_och_mutex);

		if (lli->lli_opendir_key == fd)
			ll_deauthorize_statahead(inode, fd);
			ll_file_data_put(fd);
		ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);

	if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
		/* Drop the request reference taken by the intent open. */
		ptlrpc_req_finished(it->it_request);
		it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for lease locks: on a blocking callback, cancel the lease
 * lock asynchronously; the CANCELING branch (elided here) runs when the
 * lock is actually being destroyed.
 */
static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
				    struct ldlm_lock_desc *desc, void *data, int flag)
	struct lustre_handle lockh;

	case LDLM_CB_BLOCKING:
		ldlm_lock2handle(lock, &lockh);
		rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
			CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);

	case LDLM_CB_CANCELING:
/**
 * Acquire a lease and open the file.
 *
 * @fmode must be exactly FMODE_READ or FMODE_WRITE.  If @file is given,
 * the lease is taken against that already-open file's handle; returns the
 * new obd_client_handle on success, an ERR_PTR on failure.
 */
static struct obd_client_handle *
ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
	struct lookup_intent it = { .it_op = IT_OPEN };
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct md_op_data *op_data;
	struct ptlrpc_request *req = NULL;
	struct lustre_handle old_handle = { 0 };
	struct obd_client_handle *och = NULL;

	if (fmode != FMODE_WRITE && fmode != FMODE_READ)
		RETURN(ERR_PTR(-EINVAL));

		struct ll_inode_info *lli = ll_i2info(inode);
		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
		struct obd_client_handle **och_p;

		/* Lease mode must match the file's open mode; no exec files. */
		if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
			RETURN(ERR_PTR(-EPERM));

		/* Get the openhandle of the file */
		mutex_lock(&lli->lli_och_mutex);
		/* Only one lease per file descriptor. */
		if (fd->fd_lease_och != NULL) {
			mutex_unlock(&lli->lli_och_mutex);

		if (fd->fd_och == NULL) {
			if (file->f_mode & FMODE_WRITE) {
				LASSERT(lli->lli_mds_write_och != NULL);
				och_p = &lli->lli_mds_write_och;
				och_usecount = &lli->lli_open_fd_write_count;
				/* read case; "} else {" elided in this view. */
				LASSERT(lli->lli_mds_read_och != NULL);
				och_p = &lli->lli_mds_read_och;
				och_usecount = &lli->lli_open_fd_read_count;
			/* Can take over the handle only if we are sole opener. */
			if (*och_usecount == 1) {

		mutex_unlock(&lli->lli_och_mutex);
		if (rc < 0) /* more than 1 opener */

		LASSERT(fd->fd_och != NULL);
		old_handle = fd->fd_och->och_fh;

		RETURN(ERR_PTR(-ENOMEM));

	op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
				     LUSTRE_OPC_ANY, NULL);
		GOTO(out, rc = PTR_ERR(op_data));

	/* To tell the MDT this openhandle is from the same owner */
	op_data->op_handle = old_handle;

	it.it_flags = fmode | open_flags;
	it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
	rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
			    &ll_md_blocking_lease_ast,
	/* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
	 * it can be cancelled which may mislead applications that the lease is
	 * broken;
	 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
	 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
	 * doesn't deal with openhandle, so normal openhandle will be leaked. */
			    LDLM_FL_NO_LRU | LDLM_FL_EXCL);
	ll_finish_md_op_data(op_data);
	ptlrpc_req_finished(req);
		GOTO(out_release_it, rc);

	if (it_disposition(&it, DISP_LOOKUP_NEG))
		GOTO(out_release_it, rc = -ENOENT);

	rc = it_open_error(DISP_OPEN_OPEN, &it);
		GOTO(out_release_it, rc);

	LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
	ll_och_fill(sbi->ll_md_exp, &it, och);

	if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
		GOTO(out_close, rc = -EOPNOTSUPP);

	/* already get lease, handle lease lock */
	ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
	if (it.it_lock_mode == 0 ||
	    it.it_lock_bits != MDS_INODELOCK_OPEN) {
		/* open lock must return for lease */
		CERROR(DFID "lease granted but no open lock, %d/"LPU64".\n",
		       PFID(ll_inode2fid(inode)), it.it_lock_mode,
		GOTO(out_close, rc = -EPROTO);

	ll_intent_release(&it);

	/* Error path: Cancel open lock */
	if (it.it_lock_mode != 0) {
		ldlm_lock_decref_and_cancel(&och->och_lease_handle,
		och->och_lease_handle.cookie = 0ULL;

	rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
		CERROR("%s: error closing file "DFID": %d\n",
		       ll_get_fsname(inode->i_sb, NULL, 0),
		       PFID(&ll_i2info(inode)->lli_fid), rc2);
	och = NULL; /* och has been freed in ll_close_inode_openhandle() */

	ll_intent_release(&it);
/**
 * Check whether a layout swap can be done between two inodes.
 *
 * Both must be regular files, writable by the caller, and live on the
 * same superblock (i.e. the same Lustre filesystem).
 *
 * \param[in] inode1	First inode to check
 * \param[in] inode2	Second inode to check
 *
 * \retval 0 on success, layout swap can be performed between both inodes
 * \retval negative error code if requirements are not met
 */
static int ll_check_swap_layouts_validity(struct inode *inode1,
					  struct inode *inode2)
	if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))

	if (inode_permission(inode1, MAY_WRITE) ||
	    inode_permission(inode2, MAY_WRITE))

	if (inode1->i_sb != inode2->i_sb)
/*
 * Close @och with the MDS_CLOSE_LAYOUT_SWAP bias, swapping layouts
 * between @inode and @inode2 atomically with the close.  Fails if the
 * two inodes are the same file (identical FIDs).
 */
static int ll_swap_layouts_close(struct obd_client_handle *och,
				 struct inode *inode, struct inode *inode2)
	const struct lu_fid	*fid1 = ll_inode2fid(inode);
	const struct lu_fid	*fid2;

	CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
	       ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));

	rc = ll_check_swap_layouts_validity(inode, inode2);
		GOTO(out_free_och, rc);

	/* We now know that inode2 is a lustre inode */
	fid2 = ll_inode2fid(inode2);

	/* Swapping a file with itself makes no sense. */
	rc = lu_fid_cmp(fid1, fid2);
		GOTO(out_free_och, rc = -EINVAL);

	/* Close the file and swap layouts between inode & inode2.
	 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
	 * because we still need it to pack l_remote_handle to MDT. */
	rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,

	och = NULL; /* freed in ll_close_inode_openhandle() */
/**
 * Release lease and close the file.
 * It will check if the lease has ever broken.
 *
 * If the lease lock is still granted it is cancelled here; @lease_broken
 * (optional) reports whether the lease had already been broken.
 */
static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
	struct ldlm_lock *lock;
	bool cancelled = true;

	lock = ldlm_handle2lock(&och->och_lease_handle);
		lock_res_and_lock(lock);
		cancelled = ldlm_is_cancel(lock);
		unlock_res_and_lock(lock);

	CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
	       PFID(&ll_i2info(inode)->lli_fid), cancelled);

	/* Lease still intact: cancel it ourselves before closing. */
		ldlm_cli_cancel(&och->och_lease_handle, 0);
	if (lease_broken != NULL)
		*lease_broken = cancelled;

	rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/*
 * Merge MDS-provided inode attributes with the size/blocks/timestamps
 * known by the OSTs (via the cl_object layer), keeping the most recent
 * timestamp from either source.  Runs under the inode size lock.
 */
int ll_merge_attr(const struct lu_env *env, struct inode *inode)
	struct ll_inode_info *lli = ll_i2info(inode);
	struct cl_object *obj = lli->lli_clob;
	struct cl_attr *attr = vvp_env_thread_attr(env);

	ll_inode_size_lock(inode);

	/* merge timestamps the most recently obtained from mds with
	   timestamps obtained from osts */
	LTIME_S(inode->i_atime) = lli->lli_atime;
	LTIME_S(inode->i_mtime) = lli->lli_mtime;
	LTIME_S(inode->i_ctime) = lli->lli_ctime;

	atime = LTIME_S(inode->i_atime);
	mtime = LTIME_S(inode->i_mtime);
	ctime = LTIME_S(inode->i_ctime);

	cl_object_attr_lock(obj);
	rc = cl_object_attr_get(env, obj, attr);
	cl_object_attr_unlock(obj);

		GOTO(out_size_unlock, rc);

	/* OST timestamps win when they are newer than the MDS's. */
	if (atime < attr->cat_atime)
		atime = attr->cat_atime;

	if (ctime < attr->cat_ctime)
		ctime = attr->cat_ctime;

	if (mtime < attr->cat_mtime)
		mtime = attr->cat_mtime;

	CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
	       PFID(&lli->lli_fid), attr->cat_size);

	/* Size and block count always come from the OSTs. */
	i_size_write(inode, attr->cat_size);
	inode->i_blocks = attr->cat_blocks;

	LTIME_S(inode->i_atime) = atime;
	LTIME_S(inode->i_mtime) = mtime;
	LTIME_S(inode->i_ctime) = ctime;

	ll_inode_size_unlock(inode);
/*
 * Decide whether atime updates should be suppressed for @file, checking
 * the open flags, inode flags, and mount/superblock options in turn.
 */
static bool file_is_noatime(const struct file *file)
	const struct vfsmount *mnt = file->f_path.mnt;
	const struct inode *inode = file->f_path.dentry->d_inode;

	/* Adapted from file_accessed() and touch_atime().*/
	if (file->f_flags & O_NOATIME)

	if (inode->i_flags & S_NOATIME)

	if (IS_NOATIME(inode))

	if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))

	if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))

	if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/*
 * Initialize a cl_io from the file's open flags: nonblocking/append/sync
 * behaviour, lock-request mode (never for nolock files, mandatory for
 * O_APPEND, otherwise "maybe"), and atime suppression.
 */
static void ll_io_init(struct cl_io *io, const struct file *file, int write)
	struct inode *inode = file->f_path.dentry->d_inode;

	io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
	/* Write path (guard elided): append/sync flags from f_flags. */
		io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
		io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
				      file->f_flags & O_DIRECT ||

	io->ci_obj = ll_i2info(inode)->lli_clob;
	io->ci_lockreq = CILR_MAYBE;
	if (ll_file_nolock(file)) {
		io->ci_lockreq = CILR_NEVER;
		io->ci_no_srvlock = 1;
	} else if (file->f_flags & O_APPEND) {
		io->ci_lockreq = CILR_MANDATORY;

	io->ci_noatime = file_is_noatime(file);
/*
 * Common driver for all read/write paths (normal and splice): set up a
 * cl_io, take the per-inode range lock where needed (writes and O_DIRECT
 * reads, see LU-6227), run the IO loop, account partial progress, and
 * restart the IO when the layout changed underneath us.  Returns bytes
 * transferred or a negative errno.
 */
ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
		   struct file *file, enum cl_io_type iot,
		   loff_t *ppos, size_t count)
	struct vvp_io *vio = vvp_env_io(env);
	struct inode *inode = file->f_path.dentry->d_inode;
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_file_data *fd  = LUSTRE_FPRIVATE(file);

	struct range_lock range;

	CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zu\n",
	       file->f_path.dentry->d_name.name, iot, *ppos, count);

	io = vvp_env_thread_io(env);
	ll_io_init(io, file, iot == CIT_WRITE);

	if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
		bool range_locked = false;

		/* O_APPEND writes must serialize against the whole file. */
		if (file->f_flags & O_APPEND)
			range_lock_init(&range, 0, LUSTRE_EOF);
			range_lock_init(&range, *ppos, *ppos + count - 1);

		vio->vui_fd  = LUSTRE_FPRIVATE(file);
		vio->vui_io_subtype = args->via_io_subtype;

		switch (vio->vui_io_subtype) {
			vio->vui_iter = args->u.normal.via_iter;
#ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
			vio->vui_tot_nrsegs = vio->vui_iter->nr_segs;
#endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
			vio->vui_iocb = args->u.normal.via_iocb;
			/* Direct IO reads must also take range lock,
			 * or multiple reads will try to work on the same pages
			 * See LU-6227 for details. */
			if (((iot == CIT_WRITE) ||
			    (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
			    !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
				CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
				rc = range_lock(&lli->lli_write_tree, &range);

				range_locked = true;

			/* Splice subtype (case label elided in this view). */
			vio->u.splice.vui_pipe = args->u.splice.via_pipe;
			vio->u.splice.vui_flags = args->u.splice.via_flags;

			CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);

		ll_cl_add(file, env, io);
		rc = cl_io_loop(env, io);
		ll_cl_remove(file, env);

			CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
			range_unlock(&lli->lli_write_tree, &range);
		/* cl_io_rw_init() handled IO */

	if (io->ci_nob > 0) {
		/* Account partial progress and advance the position. */
		result += io->ci_nob;
		count -= io->ci_nob;
		*ppos = io->u.ci_wr.wr.crw_pos; /* for splice */

		/* prepare IO restart */
		if (count > 0 && args->via_io_subtype == IO_NORMAL) {
			args->u.normal.via_iter = vio->vui_iter;
#ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
			args->u.normal.via_iter->nr_segs = vio->vui_tot_nrsegs;
#endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */

	cl_io_fini(env, io);

	/* Layout change mid-IO: loop back and retry the remainder. */
	if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
		       "%s: restart %s from %lld, count:%zu, result: %zd\n",
		       file->f_path.dentry->d_name.name,
		       iot == CIT_READ ? "read" : "write",
		       *ppos, count, result);

	if (iot == CIT_READ) {
			ll_stats_ops_tally(ll_i2sbi(inode),
					   LPROC_LL_READ_BYTES, result);
	} else if (iot == CIT_WRITE) {
			ll_stats_ops_tally(ll_i2sbi(inode),
					   LPROC_LL_WRITE_BYTES, result);
			fd->fd_write_failed = false;
		} else if (result == 0 && rc == 0) {
				fd->fd_write_failed = true;
				fd->fd_write_failed = false;
		} else if (rc != -ERESTARTSYS) {
			fd->fd_write_failed = true;

	CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);

	return result > 0 ? result : rc;
/*
 * Read from a file (through the page cache).
 *
 * iov_iter entry point: package the iterator and kiocb into vvp_io_args
 * and hand off to ll_file_io_generic() with CIT_READ.
 */
static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
	struct vvp_io_args *args;

	env = cl_env_get(&refcheck);
		return PTR_ERR(env);

	args = ll_env_args(env, IO_NORMAL);
	args->u.normal.via_iter = to;
	args->u.normal.via_iocb = iocb;

	result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
				    &iocb->ki_pos, iov_iter_count(to));
	cl_env_put(env, &refcheck);
/*
 * Write to a file (through the page cache).
 *
 * iov_iter entry point: mirror of ll_file_read_iter() for CIT_WRITE.
 */
static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
	struct vvp_io_args *args;

	env = cl_env_get(&refcheck);
		return PTR_ERR(env);

	args = ll_env_args(env, IO_NORMAL);
	args->u.normal.via_iter = from;
	args->u.normal.via_iocb = iocb;

	result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
				    &iocb->ki_pos, iov_iter_count(from));
	cl_env_put(env, &refcheck);
#ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
/*
 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
 *
 * Validate an iovec array: reject negative/overflowing cumulative
 * lengths, and truncate *nr_segs at the first inaccessible segment.
 * On return *count holds the total usable byte count.
 */
static int ll_file_get_iov_count(const struct iovec *iov,
				 unsigned long *nr_segs, size_t *count)
	for (seg = 0; seg < *nr_segs; seg++) {
		const struct iovec *iv = &iov[seg];

		/*
		 * If any segment has a negative length, or the cumulative
		 * length ever wraps negative then return -EINVAL.
		 */
		if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
		if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
		cnt -= iv->iov_len;	/* This segment is no good */
/*
 * Legacy aio_read entry (pre read_iter kernels): copy the caller's iovec
 * (the IO path may mutate it), build an iov_iter, and forward to
 * ll_file_read_iter().
 */
static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
				unsigned long nr_segs, loff_t pos)
	struct iovec *local_iov;
	struct iov_iter	*to;
	struct lu_env *env = NULL;

	result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);

	/* Single-segment fast path uses the per-env scratch iovec... */
		env = cl_env_get(&refcheck);
			RETURN(PTR_ERR(env));

		local_iov = &ll_env_info(env)->lti_local_iov;
	/* ...multi-segment needs a heap copy (branch elided here). */
		OBD_ALLOC(local_iov, sizeof(*iov) * nr_segs);
		if (local_iov == NULL)

	memcpy(local_iov, iov, sizeof(*iov) * nr_segs);

# ifdef HAVE_IOV_ITER_INIT_DIRECTION
	iov_iter_init(to, READ, local_iov, nr_segs, iov_count);
# else /* !HAVE_IOV_ITER_INIT_DIRECTION */
	iov_iter_init(to, local_iov, nr_segs, iov_count, 0);
# endif /* HAVE_IOV_ITER_INIT_DIRECTION */

	result = ll_file_read_iter(iocb, to);

		cl_env_put(env, &refcheck);
		OBD_FREE(local_iov, sizeof(*iov) * nr_segs);
/*
 * Synchronous read(2) entry point for pre-iter kernels.
 *
 * Wraps the user buffer in a one-element iovec and a sync kiocb (from the
 * per-env lti_kiocb) and forwards to ll_file_aio_read(); *ppos is updated
 * from the kiocb afterwards.  The ki_left/ki_nbytes #ifdefs track kernel
 * API changes for recording the request length.
 * NOTE(review): listing elides declarations, #endif and final RETURN.
 */
1300 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1304 struct iovec iov = { .iov_base = buf, .iov_len = count };
1305 struct kiocb *kiocb;
1310 env = cl_env_get(&refcheck);
1312 RETURN(PTR_ERR(env));
1314 kiocb = &ll_env_info(env)->lti_kiocb;
1315 init_sync_kiocb(kiocb, file);
1316 kiocb->ki_pos = *ppos;
1317 #ifdef HAVE_KIOCB_KI_LEFT
1318 kiocb->ki_left = count;
1319 #elif defined(HAVE_KI_NBYTES)
1320 kiocb->ki_nbytes = count;
1323 result = ll_file_aio_read(kiocb, &iov, 1, kiocb->ki_pos);
1324 *ppos = kiocb->ki_pos;
1326 cl_env_put(env, &refcheck);
1331 * Write to a file (through the page cache).
/*
 * Compatibility aio write entry — mirror image of ll_file_aio_read().
 *
 * Validates and privately copies the user iovec, allocates an iov_iter
 * ('from', via OBD_ALLOC_PTR), initializes it for WRITE and forwards to
 * ll_file_write_iter().  Both the iovec copy and the iter are released on
 * the (elided) exit path.
 * NOTE(review): listing elides declarations and several branch bodies.
 */
1334 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1335 unsigned long nr_segs, loff_t pos)
1337 struct iovec *local_iov;
1338 struct iov_iter *from;
1341 struct lu_env *env = NULL;
1345 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
/* Single segment: reuse env-embedded iovec; else copy the array. */
1350 env = cl_env_get(&refcheck);
1352 RETURN(PTR_ERR(env));
1354 local_iov = &ll_env_info(env)->lti_local_iov;
1357 OBD_ALLOC(local_iov, sizeof(*iov) * nr_segs);
1358 if (local_iov == NULL)
1361 memcpy(local_iov, iov, sizeof(*iov) * nr_segs);
1364 OBD_ALLOC_PTR(from);
1369 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1370 iov_iter_init(from, WRITE, local_iov, nr_segs, iov_count);
1371 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1372 iov_iter_init(from, local_iov, nr_segs, iov_count, 0);
1373 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1375 result = ll_file_write_iter(iocb, from);
1380 cl_env_put(env, &refcheck);
1382 OBD_FREE(local_iov, sizeof(*iov) * nr_segs);
/*
 * Synchronous write(2) entry point for pre-iter kernels.
 *
 * Wraps the user buffer in a one-element iovec and a sync kiocb and
 * forwards to ll_file_aio_write(); *ppos is refreshed from the kiocb.
 * NOTE(review): listing elides the iov_len initializer line, #endif and
 * final RETURN.
 */
1387 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1388 size_t count, loff_t *ppos)
1391 struct iovec iov = { .iov_base = (void __user *)buf,
1393 struct kiocb *kiocb;
1398 env = cl_env_get(&refcheck);
1400 RETURN(PTR_ERR(env));
1402 kiocb = &ll_env_info(env)->lti_kiocb;
1403 init_sync_kiocb(kiocb, file);
1404 kiocb->ki_pos = *ppos;
1405 #ifdef HAVE_KIOCB_KI_LEFT
1406 kiocb->ki_left = count;
1407 #elif defined(HAVE_KI_NBYTES)
1408 kiocb->ki_nbytes = count;
1411 result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos);
1412 *ppos = kiocb->ki_pos;
1414 cl_env_put(env, &refcheck);
1417 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1420 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read file operation: send file content (through the page cache)
 * into a pipe.  Uses the IO_SPLICE variant of vvp_io_args and runs the
 * transfer as a CIT_READ through ll_file_io_generic().
 * NOTE(review): listing elides declarations and final RETURN.
 */
1422 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1423 struct pipe_inode_info *pipe, size_t count,
1427 struct vvp_io_args *args;
1432 env = cl_env_get(&refcheck);
1434 RETURN(PTR_ERR(env));
1436 args = ll_env_args(env, IO_SPLICE);
1437 args->u.splice.via_pipe = pipe;
1438 args->u.splice.via_flags = flags;
1440 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1441 cl_env_put(env, &refcheck);
/*
 * Apply striping information (a lov_user_md EA) to a file by re-opening it
 * on the MDS with an open-by-FID intent carrying the layout, then
 * releasing the resulting open handle.  Runs under the inode size lock;
 * always clears the lov delay-create flag on exit.
 * NOTE(review): listing elides intent initializer lines and final RETURN.
 */
1445 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1446 __u64 flags, struct lov_user_md *lum,
1449 struct lookup_intent oit = {
1451 .it_flags = flags | MDS_OPEN_BY_FID,
1456 ll_inode_size_lock(inode);
1457 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1459 GOTO(out_unlock, rc);
/* The MDS open handle is only needed to install the layout — close it. */
1461 ll_release_openhandle(file->f_path.dentry, &oit);
1464 ll_inode_size_unlock(inode);
1465 ll_intent_release(&oit);
1466 cl_lov_delay_create_clear(&file->f_flags);
/*
 * Fetch the LOV EA (striping metadata) for 'filename' relative to 'inode'
 * from the MDS via md_getattr_name().
 *
 * On success *lmmp points into the ptlrpc reply buffer (caller keeps
 * *request alive and must release it), *lmm_size is the EA size.  Returns
 * -ENODATA if no striping EA exists, -EPROTO on unknown LOV magic.
 * On big-endian hosts the MDS-supplied little-endian EA is byte-swapped in
 * place to host order (objects are swabbed only for regular files, since
 * directory default EAs carry no lsm objects).
 * NOTE(review): listing elides declarations, error branches and the out:
 * label — this fragment is not complete code.
 */
1471 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1472 struct lov_mds_md **lmmp, int *lmm_size,
1473 struct ptlrpc_request **request)
1475 struct ll_sb_info *sbi = ll_i2sbi(inode);
1476 struct mdt_body *body;
1477 struct lov_mds_md *lmm = NULL;
1478 struct ptlrpc_request *req = NULL;
1479 struct md_op_data *op_data;
1482 rc = ll_get_default_mdsize(sbi, &lmmsize);
1486 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1487 strlen(filename), lmmsize,
1488 LUSTRE_OPC_ANY, NULL);
1489 if (IS_ERR(op_data))
1490 RETURN(PTR_ERR(op_data));
1492 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1493 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1494 ll_finish_md_op_data(op_data);
1496 CDEBUG(D_INFO, "md_getattr_name failed "
1497 "on %s: rc %d\n", filename, rc);
1501 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1502 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1504 lmmsize = body->mbo_eadatasize;
1506 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1508 GOTO(out, rc = -ENODATA);
1511 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1512 LASSERT(lmm != NULL);
1514 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1515 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1516 GOTO(out, rc = -EPROTO);
1520 * This is coming from the MDS, so is probably in
1521 * little endian. We convert it to host endian before
1522 * passing it to userspace.
/* True only on big-endian hosts: LOV_MAGIC differs from its LE form. */
1524 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1527 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1528 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1531 /* if function called for directory - we should
1532 * avoid swab not existent lsm objects */
1533 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1534 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1535 if (S_ISREG(body->mbo_mode))
1536 lustre_swab_lov_user_md_objects(
1537 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1539 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1540 lustre_swab_lov_user_md_v3(
1541 (struct lov_user_md_v3 *)lmm);
1542 if (S_ISREG(body->mbo_mode))
1543 lustre_swab_lov_user_md_objects(
1544 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1551 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: set a raw striping EA (including explicit
 * object info — MDS_OPEN_HAS_OBJS).  Root-only (CFS_CAP_SYS_ADMIN); copies
 * the fixed-size lov_user_md + one ost_data from userspace and applies it
 * via ll_lov_setstripe_ea_info().
 * NOTE(review): listing elides declarations and the final RETURN.
 */
1556 static int ll_lov_setea(struct inode *inode, struct file *file,
1559 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1560 struct lov_user_md *lump;
1561 int lum_size = sizeof(struct lov_user_md) +
1562 sizeof(struct lov_user_ost_data);
1566 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1569 OBD_ALLOC_LARGE(lump, lum_size);
1573 if (copy_from_user(lump, (struct lov_user_md __user *)arg, lum_size))
1574 GOTO(out_lump, rc = -EFAULT);
1576 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1579 OBD_FREE_LARGE(lump, lum_size);
/*
 * Copy the file's current striping layout to the userspace lov_user_md
 * buffer via cl_object_getstripe() on the cl_object of the inode.
 * NOTE(review): listing elides declarations and the final RETURN.
 */
1583 static int ll_file_getstripe(struct inode *inode,
1584 struct lov_user_md __user *lum)
1591 env = cl_env_get(&refcheck);
1593 RETURN(PTR_ERR(env));
1595 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum);
1596 cl_env_put(env, &refcheck);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy the user layout in, apply it with
 * ll_lov_setstripe_ea_info(), then refresh the layout generation and echo
 * the now-effective striping back to the user buffer (zeroing the user's
 * stripe_count first so 0-stripe requests read back correctly).
 * NOTE(review): listing elides declarations, error branches and RETURN.
 */
1600 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1603 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1604 struct lov_user_md *klum;
1606 __u64 flags = FMODE_WRITE;
1609 rc = ll_copy_user_md(lum, &klum);
1614 rc = ll_lov_setstripe_ea_info(inode, file, flags, klum, lum_size);
1618 put_user(0, &lum->lmm_stripe_count);
1620 ll_layout_refresh(inode, &gen);
1621 rc = ll_file_getstripe(inode, (struct lov_user_md __user *)arg);
1624 OBD_FREE(klum, lum_size);
/*
 * LL_IOC_GROUP_LOCK handler: take a cluster-wide group lock with group id
 * 'arg' on behalf of this file descriptor.
 *
 * Rejects gid 0 and no-lock mounts.  Checks/sets LL_FILE_GROUP_LOCKED
 * under lli_lock; because cl_get_grouplock() can block, the flag is
 * re-checked after acquisition and the lock dropped if another thread
 * raced in first.  The won grouplock is cached in fd->fd_grouplock.
 * NOTE(review): listing elides declarations, returns and some braces.
 */
1629 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1631 struct ll_inode_info *lli = ll_i2info(inode);
1632 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1633 struct ll_grouplock grouplock;
1638 CWARN("group id for group lock must not be 0\n");
1642 if (ll_file_nolock(file))
1643 RETURN(-EOPNOTSUPP);
1645 spin_lock(&lli->lli_lock);
1646 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1647 CWARN("group lock already existed with gid %lu\n",
1648 fd->fd_grouplock.lg_gid);
1649 spin_unlock(&lli->lli_lock);
1652 LASSERT(fd->fd_grouplock.lg_lock == NULL);
1653 spin_unlock(&lli->lli_lock);
/* May block; O_NONBLOCK makes the enqueue non-blocking. */
1655 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
1656 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* Re-check: another thread may have installed a group lock meanwhile. */
1660 spin_lock(&lli->lli_lock);
1661 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1662 spin_unlock(&lli->lli_lock);
1663 CERROR("another thread just won the race\n");
1664 cl_put_grouplock(&grouplock);
1668 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1669 fd->fd_grouplock = grouplock;
1670 spin_unlock(&lli->lli_lock);
1672 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK handler: release the group lock held by this file
 * descriptor, provided one is held and its gid matches 'arg'.  The cached
 * grouplock is detached from fd under lli_lock, then dropped outside it.
 * NOTE(review): listing elides declarations, returns and closing braces.
 */
1676 static int ll_put_grouplock(struct inode *inode, struct file *file,
1679 struct ll_inode_info *lli = ll_i2info(inode);
1680 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1681 struct ll_grouplock grouplock;
1684 spin_lock(&lli->lli_lock);
1685 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1686 spin_unlock(&lli->lli_lock);
1687 CWARN("no group lock held\n");
1691 LASSERT(fd->fd_grouplock.lg_lock != NULL);
1693 if (fd->fd_grouplock.lg_gid != arg) {
1694 CWARN("group lock %lu doesn't match current id %lu\n",
1695 arg, fd->fd_grouplock.lg_gid);
1696 spin_unlock(&lli->lli_lock);
/* Detach under the spinlock, release after dropping it. */
1700 grouplock = fd->fd_grouplock;
1701 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1702 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1703 spin_unlock(&lli->lli_lock);
1705 cl_put_grouplock(&grouplock);
1706 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1711 * Close inode open handle
1713 * \param dentry [in] dentry which contains the inode
1714 * \param it [in,out] intent which contains open info and result
1717 * \retval <0 failure
/*
 * Close the MDS open handle carried by a lookup intent (used when an
 * intent open succeeded but no struct file will keep the handle).
 *
 * No-ops for the fs root and for intents without DISP_OPEN_OPEN.  Fills an
 * obd_client_handle from the intent and closes it on the MDS, then drops
 * the enqueue-open request reference if the intent still holds one.
 * NOTE(review): listing elides declarations, out: label and RETURN.
 */
1719 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1721 struct inode *inode = dentry->d_inode;
1722 struct obd_client_handle *och;
1728 /* Root ? Do nothing. */
1729 if (dentry->d_inode->i_sb->s_root == dentry)
1732 /* No open handle to close? Move away */
1733 if (!it_disposition(it, DISP_OPEN_OPEN))
1736 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1738 OBD_ALLOC(och, sizeof(*och));
1740 GOTO(out, rc = -ENOMEM);
1742 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1744 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
1746 /* this one is in place of ll_file_open */
1747 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1748 ptlrpc_req_finished(it->it_request);
1749 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1755 * Get size for inode for which FIEMAP mapping is requested.
1756 * Make the FIEMAP get_info call and returns the result.
1757 * \param fiemap kernel buffer to hold extens
1758 * \param num_bytes kernel buffer size
/*
 * Execute a FIEMAP request for 'inode'.
 *
 * Rejects unsupported flags (echoing the supported mask back to the
 * caller), honors FIEMAP_FLAG_SYNC by flushing dirty pages first, glimpses
 * the size when the cached i_size is 0, and short-circuits with zero
 * extents for empty files.  The actual extent mapping is delegated to
 * cl_object_fiemap() with an ll_fiemap_info_key describing the object.
 * NOTE(review): listing elides declarations, some branches and RETURN.
 */
1760 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
1766 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
1769 /* Checks for fiemap flags */
1770 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1771 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1775 /* Check for FIEMAP_FLAG_SYNC */
1776 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1777 rc = filemap_fdatawrite(inode->i_mapping);
1782 env = cl_env_get(&refcheck);
1784 RETURN(PTR_ERR(env));
1786 if (i_size_read(inode) == 0) {
1787 rc = ll_glimpse_size(inode);
1792 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1793 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
1794 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
1796 /* If filesize is 0, then there would be no objects for mapping */
1797 if (fmkey.lfik_oa.o_size == 0) {
1798 fiemap->fm_mapped_extents = 0;
1802 fmkey.lfik_fiemap = *fiemap;
1804 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
1805 &fmkey, fiemap, &num_bytes);
1807 cl_env_put(env, &refcheck);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a path via the MDC.
 *
 * Permission: requires CFS_CAP_DAC_READ_SEARCH unless the mount allows
 * user fid2path.  Reads the user's requested path buffer length (capped at
 * PATH_MAX), allocates a gfout big enough for it, copies the request in,
 * stores the client root FID after the struct (fileset support; older
 * servers ignore it), issues the MDC ioctl and copies the result back.
 * NOTE(review): listing elides declarations, error branches and RETURN.
 */
1811 int ll_fid2path(struct inode *inode, void __user *arg)
1813 struct obd_export *exp = ll_i2mdexp(inode);
1814 const struct getinfo_fid2path __user *gfin = arg;
1816 struct getinfo_fid2path *gfout;
1822 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1823 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1826 /* Only need to get the buflen */
1827 if (get_user(pathlen, &gfin->gf_pathlen))
1830 if (pathlen > PATH_MAX)
1833 outsize = sizeof(*gfout) + pathlen;
1834 OBD_ALLOC(gfout, outsize);
1838 if (copy_from_user(gfout, arg, sizeof(*gfout)))
1839 GOTO(gf_free, rc = -EFAULT);
1840 /* append root FID after gfout to let MDT know the root FID so that it
1841 * can lookup the correct path, this is mainly for fileset.
1842 * old server without fileset mount support will ignore this. */
1843 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
1845 /* Call mdc_iocontrol */
1846 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1850 if (copy_to_user(arg, gfout, outsize))
1854 OBD_FREE(gfout, outsize);
1859 * Read the data_version for inode.
1861 * This value is computed using stripe object version on OST.
1862 * Version is computed using server side locking.
1864 * @param flags if do sync on the OST side;
1866 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1867 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/*
 * Compute the data_version of 'inode' by running a CIT_DATA_VERSION cl_io
 * loop over its stripe objects.  'flags' selects the OST-side flush mode
 * (LL_DV_RD_FLUSH / LL_DV_WR_FLUSH).  A file with no cl_object is treated
 * as version 0 (elided early-return branch).  The io is retried while
 * ci_need_restart is set, e.g. after a layout change.
 * NOTE(review): listing elides declarations, restart goto target and
 * final RETURN.
 */
1869 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
1871 struct cl_object *obj = ll_i2info(inode)->lli_clob;
1879 /* If no file object initialized, we consider its version is 0. */
1885 env = cl_env_get(&refcheck);
1887 RETURN(PTR_ERR(env));
1889 io = vvp_env_thread_io(env);
1891 io->u.ci_data_version.dv_data_version = 0;
1892 io->u.ci_data_version.dv_flags = flags;
1895 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
1896 result = cl_io_loop(env, io);
1898 result = io->ci_result;
1900 *data_version = io->u.ci_data_version.dv_data_version;
1902 cl_io_fini(env, io);
/* Layout may have changed mid-io: restart the whole operation. */
1904 if (unlikely(io->ci_need_restart))
1907 cl_env_put(env, &refcheck);
1913 * Trigger a HSM release request for the provided inode.
/*
 * Trigger an HSM release for 'inode': take a write lease with
 * MDS_OPEN_RELEASE, flush and grab the latest data_version (WR flush drops
 * cached pages), merge attributes, then close the handle with
 * MDS_HSM_RELEASE so the MDT punches the file while verifying the version.
 * On any error path the lease is closed without releasing.
 * NOTE(review): listing elides declarations, GOTO targets and RETURN.
 */
1915 int ll_hsm_release(struct inode *inode)
1917 struct cl_env_nest nest;
1919 struct obd_client_handle *och = NULL;
1920 __u64 data_version = 0;
1924 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1925 ll_get_fsname(inode->i_sb, NULL, 0),
1926 PFID(&ll_i2info(inode)->lli_fid));
1928 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
1930 GOTO(out, rc = PTR_ERR(och));
1932 /* Grab latest data_version and [am]time values */
1933 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
1937 env = cl_env_nested_get(&nest);
1939 GOTO(out, rc = PTR_ERR(env));
1941 ll_merge_attr(env, inode);
1942 cl_env_nested_put(&nest, env);
1944 /* Release the file.
1945 * NB: lease lock handle is released in mdc_hsm_release_pack() because
1946 * we still need it to pack l_remote_handle to MDT. */
1947 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
1953 if (och != NULL && !IS_ERR(och)) /* close the file */
1954 ll_lease_close(och, inode, NULL);
/*
 * Working state for ll_swap_layouts(): the two inodes being swapped plus
 * (elided fields) their data versions and check flags, kept locally so
 * they can be swapped into FID order without touching the user's args.
 */
1959 struct ll_swap_stack {
1962 struct inode *inode1;
1963 struct inode *inode2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS worker: atomically swap the layouts of the files
 * behind file1/file2 via an MDT ioctl.
 *
 * Sequence: validate the pair, record requested data-version checks,
 * order the two inodes by FID (swapping the per-file state accordingly to
 * avoid lock ordering deadlocks), optionally take a group lock on both to
 * flush dirty cache (gid != 0), verify the data versions have not changed
 * (-EAGAIN if they have), then send LL_IOC_LOV_SWAP_LAYOUTS to the MDT
 * with a mdc_swap_layouts piggybacked in op_data.  Group locks are dropped
 * and llss freed on all paths (elided labels putgl:/free:).
 * NOTE(review): listing elides declarations, several branch bodies and
 * gotos — treat as incomplete.
 */
1968 static int ll_swap_layouts(struct file *file1, struct file *file2,
1969 struct lustre_swap_layouts *lsl)
1971 struct mdc_swap_layouts msl;
1972 struct md_op_data *op_data;
1975 struct ll_swap_stack *llss = NULL;
1978 OBD_ALLOC_PTR(llss);
1982 llss->inode1 = file1->f_path.dentry->d_inode;
1983 llss->inode2 = file2->f_path.dentry->d_inode;
1985 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
1989 /* we use 2 bool because it is easier to swap than 2 bits */
1990 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1991 llss->check_dv1 = true;
1993 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1994 llss->check_dv2 = true;
1996 /* we cannot use lsl->sl_dvX directly because we may swap them */
1997 llss->dv1 = lsl->sl_dv1;
1998 llss->dv2 = lsl->sl_dv2;
2000 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2001 if (rc == 0) /* same file, done! */
2004 if (rc < 0) { /* sequentialize it */
2005 swap(llss->inode1, llss->inode2);
2007 swap(llss->dv1, llss->dv2);
2008 swap(llss->check_dv1, llss->check_dv2);
2012 if (gid != 0) { /* application asks to flush dirty cache */
2013 rc = ll_get_grouplock(llss->inode1, file1, gid);
2017 rc = ll_get_grouplock(llss->inode2, file2, gid);
2019 ll_put_grouplock(llss->inode1, file1, gid);
2024 /* ultimate check, before swaping the layouts we check if
2025 * dataversion has changed (if requested) */
2026 if (llss->check_dv1) {
2027 rc = ll_data_version(llss->inode1, &dv, 0);
2030 if (dv != llss->dv1)
2031 GOTO(putgl, rc = -EAGAIN);
2034 if (llss->check_dv2) {
2035 rc = ll_data_version(llss->inode2, &dv, 0);
2038 if (dv != llss->dv2)
2039 GOTO(putgl, rc = -EAGAIN);
2042 /* struct md_op_data is used to send the swap args to the mdt
2043 * only flags is missing, so we use struct mdc_swap_layouts
2044 * through the md_op_data->op_data */
2045 /* flags from user space have to be converted before they are send to
2046 * server, no flag is sent today, they are only used on the client */
2049 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2050 0, LUSTRE_OPC_ANY, &msl);
2051 if (IS_ERR(op_data))
2052 GOTO(free, rc = PTR_ERR(op_data));
2054 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2055 sizeof(*op_data), op_data, NULL);
2056 ll_finish_md_op_data(op_data);
2063 ll_put_grouplock(llss->inode2, file2, gid);
2064 ll_put_grouplock(llss->inode1, file1, gid);
/*
 * Set/clear HSM state flags on 'inode' via an MDT ioctl.
 *
 * Validates: masks must stay within HSM_FLAGS_MASK; flags outside
 * HSM_USER_MASK require CFS_CAP_SYS_ADMIN; an archive id, if given, must
 * not exceed LL_HSM_MAX_ARCHIVE.  The hsm_state_set is carried to the MDT
 * inside md_op_data (hss passed as op_data).
 * NOTE(review): listing elides declarations, error returns and RETURN.
 */
2074 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2076 struct md_op_data *op_data;
2080 /* Detect out-of range masks */
2081 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2084 /* Non-root users are forbidden to set or clear flags which are
2085 * NOT defined in HSM_USER_MASK. */
2086 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2087 !cfs_capable(CFS_CAP_SYS_ADMIN))
2090 /* Detect out-of range archive id */
2091 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2092 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2095 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2096 LUSTRE_OPC_ANY, hss);
2097 if (IS_ERR(op_data))
2098 RETURN(PTR_ERR(op_data));
2100 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2101 sizeof(*op_data), op_data, NULL);
2103 ll_finish_md_op_data(op_data);
/*
 * Import an HSM-archived file: mark it ARCHIVED|EXISTS|RELEASED with the
 * given archive id, then force the recorded mode/uid/gid/size/timestamps
 * onto the inode with ll_setattr_raw() under i_mutex.  Regular files only.
 * NOTE(review): listing elides declarations, alloc checks, out: label and
 * frees.
 */
2108 static int ll_hsm_import(struct inode *inode, struct file *file,
2109 struct hsm_user_import *hui)
2111 struct hsm_state_set *hss = NULL;
2112 struct iattr *attr = NULL;
2116 if (!S_ISREG(inode->i_mode))
2122 GOTO(out, rc = -ENOMEM);
2124 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2125 hss->hss_archive_id = hui->hui_archive_id;
2126 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2127 rc = ll_hsm_state_set(inode, hss);
2131 OBD_ALLOC_PTR(attr);
2133 GOTO(out, rc = -ENOMEM);
/* Rebuild the inode attributes from the import record. */
2135 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2136 attr->ia_mode |= S_IFREG;
2137 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2138 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2139 attr->ia_size = hui->hui_size;
2140 attr->ia_mtime.tv_sec = hui->hui_mtime;
2141 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2142 attr->ia_atime.tv_sec = hui->hui_atime;
2143 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2145 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2146 ATTR_UID | ATTR_GID |
2147 ATTR_MTIME | ATTR_MTIME_SET |
2148 ATTR_ATIME | ATTR_ATIME_SET;
2150 mutex_lock(&inode->i_mutex);
2152 rc = ll_setattr_raw(file->f_path.dentry, attr, true);
2156 mutex_unlock(&inode->i_mutex);
/* Map an open fmode to the LL_LEASE_{RD,WR}LCK bitmask reported by the
 * lease ioctls (FMODE_READ -> RDLCK, FMODE_WRITE -> WRLCK, combinable). */
2168 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2170 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2171 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * LL_IOC_FUTIMES_3 worker: set atime, mtime AND ctime (unlike utimes(2),
 * which cannot set ctime) from the ll_futimes_3 record.  Root-only
 * (CAP_SYS_ADMIN), regular files only; applied via ll_setattr_raw() under
 * i_mutex.
 * NOTE(review): listing elides parts of the iattr initializer (field
 * designators) and the error-return lines.
 */
2174 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2176 struct inode *inode = file->f_path.dentry->d_inode;
2178 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2179 ATTR_MTIME | ATTR_MTIME_SET |
2180 ATTR_CTIME | ATTR_CTIME_SET,
2182 .tv_sec = lfu->lfu_atime_sec,
2183 .tv_nsec = lfu->lfu_atime_nsec,
2186 .tv_sec = lfu->lfu_mtime_sec,
2187 .tv_nsec = lfu->lfu_mtime_nsec,
2190 .tv_sec = lfu->lfu_ctime_sec,
2191 .tv_nsec = lfu->lfu_ctime_nsec,
2197 if (!capable(CAP_SYS_ADMIN))
2200 if (!S_ISREG(inode->i_mode))
2203 mutex_lock(&inode->i_mutex);
2204 rc = ll_setattr_raw(file->f_path.dentry, &ia, false);
2205 mutex_unlock(&inode->i_mutex);
2211 * Give file access advices
2213 * The ladvise interface is similar to Linux fadvise() system call, except it
2214 * forwards the advices directly from Lustre client to server. The server side
2215 * codes will apply appropriate read-ahead and caching techniques for the
2216 * corresponding files.
2218 * A typical workload for ladvise is e.g. a bunch of different clients are
2219 * doing small random reads of a file, so prefetching pages into OSS cache
2220 * with big linear reads before the random IO is a net benefit. Fetching
2221 * all that data into each client cache with fadvise() may not be, due to
2222 * much more data being sent to the client.
/*
 * Forward one lu_ladvise advice to the server by running a CIT_LADVISE
 * cl_io: the advice's extent, type and flags are copied into the io's
 * ci_ladvise parameters and the io loop is executed.
 * NOTE(review): listing elides declarations and the final RETURN.
 */
2224 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2225 struct lu_ladvise *ladvise)
2227 struct cl_env_nest nest;
2230 struct cl_ladvise_io *lio;
2234 env = cl_env_nested_get(&nest);
2236 RETURN(PTR_ERR(env));
2238 io = vvp_env_thread_io(env);
2239 io->ci_obj = ll_i2info(inode)->lli_clob;
2241 /* initialize parameters for ladvise */
2242 lio = &io->u.ci_ladvise;
2243 lio->li_start = ladvise->lla_start;
2244 lio->li_end = ladvise->lla_end;
2245 lio->li_fid = ll_inode2fid(inode);
2246 lio->li_advice = ladvise->lla_advice;
2247 lio->li_flags = flags;
2249 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2250 rc = cl_io_loop(env, io);
2254 cl_io_fini(env, io);
2255 cl_env_nested_put(&nest, env);
/*
 * unlocked_ioctl entry point for regular files: dispatch every
 * llite-specific ioctl (flags, striping, group locks, HSM, leases,
 * fid2path, data version, ladvise, ...).  Unrecognized commands fall
 * through to registered ll_iocontrol handlers and finally to the OSC/DT
 * obd_iocontrol.
 * NOTE(review): this listing elides many lines (declarations, RETURNs,
 * braces, several case bodies) — the visible text is not complete code.
 */
2260 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2262 struct inode *inode = file->f_path.dentry->d_inode;
2263 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2267 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2268 PFID(ll_inode2fid(inode)), inode, cmd);
2269 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2271 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2272 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2276 case LL_IOC_GETFLAGS:
2277 /* Get the current value of the file flags */
2278 return put_user(fd->fd_flags, (int __user *)arg);
2279 case LL_IOC_SETFLAGS:
2280 case LL_IOC_CLRFLAGS:
2281 /* Set or clear specific file flags */
2282 /* XXX This probably needs checks to ensure the flags are
2283 * not abused, and to handle any flag side effects.
2285 if (get_user(flags, (int __user *) arg))
2288 if (cmd == LL_IOC_SETFLAGS) {
2289 if ((flags & LL_FILE_IGNORE_LOCK) &&
2290 !(file->f_flags & O_DIRECT)) {
2291 CERROR("%s: unable to disable locking on "
2292 "non-O_DIRECT file\n", current->comm);
2296 fd->fd_flags |= flags;
2298 fd->fd_flags &= ~flags;
2301 case LL_IOC_LOV_SETSTRIPE:
2302 RETURN(ll_lov_setstripe(inode, file, arg));
2303 case LL_IOC_LOV_SETEA:
2304 RETURN(ll_lov_setea(inode, file, arg));
2305 case LL_IOC_LOV_SWAP_LAYOUTS: {
2307 struct lustre_swap_layouts lsl;
2309 if (copy_from_user(&lsl, (char __user *)arg,
2310 sizeof(struct lustre_swap_layouts)))
2313 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
2316 file2 = fget(lsl.sl_fd);
2320 /* O_WRONLY or O_RDWR */
2321 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
2322 GOTO(out, rc = -EPERM);
2324 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
2325 struct inode *inode2;
2326 struct ll_inode_info *lli;
2327 struct obd_client_handle *och = NULL;
2329 if (lsl.sl_flags != SWAP_LAYOUTS_CLOSE)
2330 GOTO(out, rc = -EINVAL);
2332 lli = ll_i2info(inode);
2333 mutex_lock(&lli->lli_och_mutex);
2334 if (fd->fd_lease_och != NULL) {
2335 och = fd->fd_lease_och;
2336 fd->fd_lease_och = NULL;
2338 mutex_unlock(&lli->lli_och_mutex);
2340 GOTO(out, rc = -ENOLCK);
2341 inode2 = file2->f_path.dentry->d_inode;
2342 rc = ll_swap_layouts_close(och, inode, inode2);
2344 rc = ll_swap_layouts(file, file2, &lsl);
2350 case LL_IOC_LOV_GETSTRIPE:
2351 RETURN(ll_file_getstripe(inode,
2352 (struct lov_user_md __user *)arg));
2353 case FSFILT_IOC_GETFLAGS:
2354 case FSFILT_IOC_SETFLAGS:
2355 RETURN(ll_iocontrol(inode, file, cmd, arg));
2356 case FSFILT_IOC_GETVERSION_OLD:
2357 case FSFILT_IOC_GETVERSION:
2358 RETURN(put_user(inode->i_generation, (int __user *)arg));
2359 case LL_IOC_GROUP_LOCK:
2360 RETURN(ll_get_grouplock(inode, file, arg));
2361 case LL_IOC_GROUP_UNLOCK:
2362 RETURN(ll_put_grouplock(inode, file, arg));
2363 case IOC_OBD_STATFS:
2364 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2366 /* We need to special case any other ioctls we want to handle,
2367 * to send them to the MDS/OST as appropriate and to properly
2368 * network encode the arg field.
2369 case FSFILT_IOC_SETVERSION_OLD:
2370 case FSFILT_IOC_SETVERSION:
2372 case LL_IOC_FLUSHCTX:
2373 RETURN(ll_flush_ctx(inode));
2374 case LL_IOC_PATH2FID: {
2375 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2376 sizeof(struct lu_fid)))
2381 case LL_IOC_GETPARENT:
2382 RETURN(ll_getparent(file, (struct getparent __user *)arg));
2384 case OBD_IOC_FID2PATH:
2385 RETURN(ll_fid2path(inode, (void __user *)arg));
2386 case LL_IOC_DATA_VERSION: {
2387 struct ioc_data_version idv;
2390 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
2393 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2394 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2397 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2403 case LL_IOC_GET_MDTIDX: {
2406 mdtidx = ll_get_mdt_idx(inode);
2410 if (put_user((int)mdtidx, (int __user *)arg))
2415 case OBD_IOC_GETDTNAME:
2416 case OBD_IOC_GETMDNAME:
2417 RETURN(ll_get_obd_name(inode, cmd, arg));
2418 case LL_IOC_HSM_STATE_GET: {
2419 struct md_op_data *op_data;
2420 struct hsm_user_state *hus;
2427 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2428 LUSTRE_OPC_ANY, hus);
2429 if (IS_ERR(op_data)) {
2431 RETURN(PTR_ERR(op_data));
2434 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2437 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2440 ll_finish_md_op_data(op_data);
2444 case LL_IOC_HSM_STATE_SET: {
2445 struct hsm_state_set *hss;
2452 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2457 rc = ll_hsm_state_set(inode, hss);
2462 case LL_IOC_HSM_ACTION: {
2463 struct md_op_data *op_data;
2464 struct hsm_current_action *hca;
2471 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2472 LUSTRE_OPC_ANY, hca);
2473 if (IS_ERR(op_data)) {
2475 RETURN(PTR_ERR(op_data));
2478 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2481 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2484 ll_finish_md_op_data(op_data);
2488 case LL_IOC_SET_LEASE: {
2489 struct ll_inode_info *lli = ll_i2info(inode);
2490 struct obd_client_handle *och = NULL;
2495 case LL_LEASE_WRLCK:
2496 if (!(file->f_mode & FMODE_WRITE))
2498 fmode = FMODE_WRITE;
2500 case LL_LEASE_RDLCK:
2501 if (!(file->f_mode & FMODE_READ))
2505 case LL_LEASE_UNLCK:
2506 mutex_lock(&lli->lli_och_mutex);
2507 if (fd->fd_lease_och != NULL) {
2508 och = fd->fd_lease_och;
2509 fd->fd_lease_och = NULL;
2511 mutex_unlock(&lli->lli_och_mutex);
2516 fmode = och->och_flags;
2517 rc = ll_lease_close(och, inode, &lease_broken);
2524 RETURN(ll_lease_type_from_fmode(fmode));
2529 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2531 /* apply for lease */
2532 och = ll_lease_open(inode, file, fmode, 0);
2534 RETURN(PTR_ERR(och));
2537 mutex_lock(&lli->lli_och_mutex);
2538 if (fd->fd_lease_och == NULL) {
2539 fd->fd_lease_och = och;
2542 mutex_unlock(&lli->lli_och_mutex);
2544 /* impossible now that only excl is supported for now */
2545 ll_lease_close(och, inode, &lease_broken);
2550 case LL_IOC_GET_LEASE: {
2551 struct ll_inode_info *lli = ll_i2info(inode);
2552 struct ldlm_lock *lock = NULL;
2555 mutex_lock(&lli->lli_och_mutex);
2556 if (fd->fd_lease_och != NULL) {
2557 struct obd_client_handle *och = fd->fd_lease_och;
2559 lock = ldlm_handle2lock(&och->och_lease_handle);
2561 lock_res_and_lock(lock);
2562 if (!ldlm_is_cancel(lock))
2563 fmode = och->och_flags;
2565 unlock_res_and_lock(lock);
2566 LDLM_LOCK_PUT(lock);
2569 mutex_unlock(&lli->lli_och_mutex);
2571 RETURN(ll_lease_type_from_fmode(fmode));
2573 case LL_IOC_HSM_IMPORT: {
2574 struct hsm_user_import *hui;
2580 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
2585 rc = ll_hsm_import(inode, file, hui);
2590 case LL_IOC_FUTIMES_3: {
2591 struct ll_futimes_3 lfu;
2593 if (copy_from_user(&lfu,
2594 (const struct ll_futimes_3 __user *)arg,
2598 RETURN(ll_file_futimes_3(file, &lfu));
2600 case LL_IOC_LADVISE: {
2601 struct ladvise_hdr *ladvise_hdr;
2604 int alloc_size = sizeof(*ladvise_hdr);
2607 OBD_ALLOC_PTR(ladvise_hdr);
2608 if (ladvise_hdr == NULL)
2611 if (copy_from_user(ladvise_hdr,
2612 (const struct ladvise_hdr __user *)arg,
2614 GOTO(out_ladvise, rc = -EFAULT);
2616 if (ladvise_hdr->lah_magic != LADVISE_MAGIC ||
2617 ladvise_hdr->lah_count < 1)
2618 GOTO(out_ladvise, rc = -EINVAL);
2620 num_advise = ladvise_hdr->lah_count;
2621 if (num_advise >= LAH_COUNT_MAX)
2622 GOTO(out_ladvise, rc = -EFBIG);
/* Reallocate once the real advice count is known. */
2624 OBD_FREE_PTR(ladvise_hdr);
2625 alloc_size = offsetof(typeof(*ladvise_hdr),
2626 lah_advise[num_advise]);
2627 OBD_ALLOC(ladvise_hdr, alloc_size);
2628 if (ladvise_hdr == NULL)
2632 * TODO: submit multiple advices to one server in a single RPC
2634 if (copy_from_user(ladvise_hdr,
2635 (const struct ladvise_hdr __user *)arg,
2637 GOTO(out_ladvise, rc = -EFAULT);
2639 for (i = 0; i < num_advise; i++) {
2640 rc = ll_ladvise(inode, file, ladvise_hdr->lah_flags,
2641 &ladvise_hdr->lah_advise[i]);
2647 OBD_FREE(ladvise_hdr, alloc_size);
2654 ll_iocontrol_call(inode, file, cmd, arg, &err))
2657 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2658 (void __user *)arg));
2663 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Commit a computed seek offset to file->f_pos (local copy of the kernel
 * helper, for kernels without generic_file_llseek_size): rejects negative
 * offsets unless FMODE_UNSIGNED_OFFSET, rejects offsets beyond maxsize,
 * and resets f_version when the position actually changes.
 * NOTE(review): listing elides the -EINVAL returns and final return.
 */
2664 static inline loff_t
2665 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2667 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2669 if (offset > maxsize)
2672 if (offset != file->f_pos) {
2673 file->f_pos = offset;
2674 file->f_version = 0;
/*
 * Local copy of the kernel's generic_file_llseek_size() for kernels
 * lacking it: implements SEEK_END/SEEK_CUR/SEEK_HOLE/SEEK_DATA relative
 * arithmetic with bounds 'maxsize' and end-of-file 'eof', taking i_mutex
 * only for the SEEK_CUR read-modify-write of f_pos.  SEEK_CUR with offset
 * 0 is a pure position query and skips the update entirely.
 * NOTE(review): listing elides the switch statement lines and several
 * case bodies.
 */
2680 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2681 loff_t maxsize, loff_t eof)
2683 struct inode *inode = file->f_path.dentry->d_inode;
2691 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2692 * position-querying operation. Avoid rewriting the "same"
2693 * f_pos value back to the file because a concurrent read(),
2694 * write() or lseek() might have altered it
2699 * f_lock protects against read/modify/write race with other
2700 * SEEK_CURs. Note that parallel writes and reads behave
2703 mutex_lock(&inode->i_mutex);
2704 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2705 mutex_unlock(&inode->i_mutex);
2709 * In the generic case the entire file is data, so as long as
2710 * offset isn't at the end of the file then the offset is data.
2717 * There is a virtual hole at the end of the file, so as long as
2718 * offset isn't i_size or larger, return i_size.
2726 return llseek_execute(file, offset, maxsize);
/*
 * llseek file operation.  For SEEK_END/SEEK_HOLE/SEEK_DATA the file size
 * must be accurate cluster-wide, so ll_glimpse_size() is called first and
 * 'eof' is read from the refreshed i_size; then the generic size-aware
 * llseek helper does the actual arithmetic against ll_file_maxbytes().
 * NOTE(review): listing elides error handling after the glimpse and the
 * final RETURN.
 */
2730 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2732 struct inode *inode = file->f_path.dentry->d_inode;
2733 loff_t retval, eof = 0;
2736 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2737 (origin == SEEK_CUR) ? file->f_pos : 0);
2738 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2739 PFID(ll_inode2fid(inode)), inode, retval, retval,
2741 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2743 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2744 retval = ll_glimpse_size(inode);
2747 eof = i_size_read(inode);
2750 retval = ll_generic_file_llseek_size(file, offset, origin,
2751 ll_file_maxbytes(inode), eof);
/*
 * flush file operation (called on close(2)): report, without retrying,
 * any async writeback error recorded for this inode — from lli_async_rc
 * and from the per-stripe async rc collected by
 * lov_read_and_clear_async_rc().  A failure already reported to this fd
 * (fd_write_failed) is not reported twice.
 * NOTE(review): listing elides declarations, merging of 'err' into 'rc'
 * and the fd_write_failed update.
 */
2755 static int ll_flush(struct file *file, fl_owner_t id)
2757 struct inode *inode = file->f_path.dentry->d_inode;
2758 struct ll_inode_info *lli = ll_i2info(inode);
2759 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2762 LASSERT(!S_ISDIR(inode->i_mode));
2764 /* catch async errors that were recorded back when async writeback
2765 * failed for pages in this mapping. */
2766 rc = lli->lli_async_rc;
2767 lli->lli_async_rc = 0;
2768 if (lli->lli_clob != NULL) {
2769 err = lov_read_and_clear_async_rc(lli->lli_clob);
2774 /* The application has been told write failure already.
2775 * Do not report failure again. */
2776 if (fd->fd_write_failed)
2778 return rc ? -EIO : 0;
2782 * Called to make sure a portion of file has been written out.
2783 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2785 * Return how many pages have been written.
/*
 * Builds a CIT_FSYNC cl_io over [start, end] and runs it via cl_io_loop();
 * on success returns fio->fi_nr_written (pages written), otherwise the
 * io result. NOTE(review): lossy listing — lines elided; comments only.
 */
2787 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2788 enum cl_fsync_mode mode, int ignore_layout)
2790 struct cl_env_nest nest;
2793 struct cl_fsync_io *fio;
/* reject any mode outside the known fsync modes */
2797 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2798 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2801 env = cl_env_nested_get(&nest);
2803 RETURN(PTR_ERR(env));
2805 io = vvp_env_thread_io(env);
2806 io->ci_obj = ll_i2info(inode)->lli_clob;
2807 io->ci_ignore_layout = ignore_layout;
2809 /* initialize parameters for sync */
2810 fio = &io->u.ci_fsync;
2811 fio->fi_start = start;
2813 fio->fi_fid = ll_inode2fid(inode);
2814 fio->fi_mode = mode;
2815 fio->fi_nr_written = 0;
2817 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2818 result = cl_io_loop(env, io);
2820 result = io->ci_result;
/* report number of pages written when the sync itself succeeded */
2822 result = fio->fi_nr_written;
2823 cl_io_fini(env, io);
2824 cl_env_nested_put(&nest, env);
2830 * When dentry is provided (the 'else' case), *file->f_path.dentry may be
2831 * null and dentry must be used directly rather than pulled from
2832 * *file->f_path.dentry as is done otherwise.
/*
 * ll_fsync(): ->fsync handler; the three #ifdef variants match the kernel
 * API generations (4-arg range fsync, 2-arg, and the old dentry form).
 * Flushes dirty pages, harvests async writeback errors, issues an MDC
 * fsync RPC, and for regular files syncs the data via cl_sync_file_range
 * with CL_FSYNC_ALL, updating fd_write_failed accordingly.
 * NOTE(review): lossy listing — lines elided; comments only.
 */
2835 #ifdef HAVE_FILE_FSYNC_4ARGS
2836 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2838 struct dentry *dentry = file->f_path.dentry;
2839 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2840 int ll_fsync(struct file *file, int datasync)
2842 struct dentry *dentry = file->f_path.dentry;
2844 loff_t end = LLONG_MAX;
2846 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2849 loff_t end = LLONG_MAX;
2851 struct inode *inode = dentry->d_inode;
2852 struct ll_inode_info *lli = ll_i2info(inode);
2853 struct ptlrpc_request *req;
2857 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2858 PFID(ll_inode2fid(inode)), inode);
2859 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2861 #ifdef HAVE_FILE_FSYNC_4ARGS
/* modern kernels: write+wait the requested range, then serialize on i_mutex */
2862 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2863 mutex_lock(&inode->i_mutex);
2865 /* fsync's caller has already called _fdata{sync,write}, we want
2866 * that IO to finish before calling the osc and mdc sync methods */
2867 rc = filemap_fdatawait(inode->i_mapping);
2870 /* catch async errors that were recorded back when async writeback
2871 * failed for pages in this mapping. */
2872 if (!S_ISDIR(inode->i_mode)) {
2873 err = lli->lli_async_rc;
2874 lli->lli_async_rc = 0;
2877 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* sync metadata on the MDT */
2882 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
2886 ptlrpc_req_finished(req);
2888 if (S_ISREG(inode->i_mode)) {
2889 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2891 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2892 if (rc == 0 && err < 0)
2895 fd->fd_write_failed = true;
2897 fd->fd_write_failed = false;
2900 #ifdef HAVE_FILE_FSYNC_4ARGS
2901 mutex_unlock(&inode->i_mutex);
/*
 * ll_file_flock(): ->flock/->lock handler; translates a VFS file_lock
 * (FL_FLOCK or FL_POSIX) into an LDLM_FLOCK enqueue on the MDT, then
 * mirrors the server's decision into the local kernel lock tables
 * (locks_lock_file_wait() or the older flock/posix wait helpers).
 * On local bookkeeping failure for a non-unlock request, the remote lock
 * is rolled back by re-enqueueing with LCK_NL.
 * NOTE(review): lossy listing — lines elided; comments only.
 */
2907 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2909 struct inode *inode = file->f_path.dentry->d_inode;
2910 struct ll_sb_info *sbi = ll_i2sbi(inode);
2911 struct ldlm_enqueue_info einfo = {
2912 .ei_type = LDLM_FLOCK,
2913 .ei_cb_cp = ldlm_flock_completion_ast,
2914 .ei_cbdata = file_lock,
2916 struct md_op_data *op_data;
2917 struct lustre_handle lockh = { 0 };
2918 union ldlm_policy_data flock = { { 0 } };
/* remember caller's type — einfo.ei_mode is written into fl_type below */
2919 int fl_type = file_lock->fl_type;
2925 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
2926 PFID(ll_inode2fid(inode)), file_lock);
2928 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2930 if (file_lock->fl_flags & FL_FLOCK) {
2931 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2932 /* flocks are whole-file locks */
2933 flock.l_flock.end = OFFSET_MAX;
2934 /* For flocks owner is determined by the local file desctiptor*/
2935 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2936 } else if (file_lock->fl_flags & FL_POSIX) {
2937 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2938 flock.l_flock.start = file_lock->fl_start;
2939 flock.l_flock.end = file_lock->fl_end;
2943 flock.l_flock.pid = file_lock->fl_pid;
2945 /* Somewhat ugly workaround for svc lockd.
2946 * lockd installs custom fl_lmops->lm_compare_owner that checks
2947 * for the fl_owner to be the same (which it always is on local node
2948 * I guess between lockd processes) and then compares pid.
2949 * As such we assign pid to the owner field to make it all work,
2950 * conflict with normal locks is unlikely since pid space and
2951 * pointer space for current->files are not intersecting */
2952 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2953 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* map the fcntl lock type onto an LDLM mode (PR=read, PW=write, NL=unlock) */
2957 einfo.ei_mode = LCK_PR;
2960 /* An unlock request may or may not have any relation to
2961 * existing locks so we may not be able to pass a lock handle
2962 * via a normal ldlm_lock_cancel() request. The request may even
2963 * unlock a byte range in the middle of an existing lock. In
2964 * order to process an unlock request we need all of the same
2965 * information that is given with a normal read or write record
2966 * lock request. To avoid creating another ldlm unlock (cancel)
2967 * message we'll treat a LCK_NL flock request as an unlock. */
2968 einfo.ei_mode = LCK_NL;
2971 einfo.ei_mode = LCK_PW;
2974 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* non-blocking set requests translate to LDLM_FL_BLOCK_NOWAIT */
2989 flags = LDLM_FL_BLOCK_NOWAIT;
/* F_GETLK-style queries become test-only enqueues */
2995 flags = LDLM_FL_TEST_LOCK;
2998 CERROR("unknown fcntl lock command: %d\n", cmd);
3002 /* Save the old mode so that if the mode in the lock changes we
3003 * can decrement the appropriate reader or writer refcount. */
3004 file_lock->fl_type = einfo.ei_mode;
3006 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3007 LUSTRE_OPC_ANY, NULL);
3008 if (IS_ERR(op_data))
3009 RETURN(PTR_ERR(op_data));
3011 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
3012 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
3013 flock.l_flock.pid, flags, einfo.ei_mode,
3014 flock.l_flock.start, flock.l_flock.end);
3016 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data, &lockh,
3019 /* Restore the file lock type if not TEST lock. */
3020 if (!(flags & LDLM_FL_TEST_LOCK))
3021 file_lock->fl_type = fl_type;
3023 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
3024 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
3025 !(flags & LDLM_FL_TEST_LOCK))
3026 rc2 = locks_lock_file_wait(file, file_lock);
3028 if ((file_lock->fl_flags & FL_FLOCK) &&
3029 (rc == 0 || file_lock->fl_type == F_UNLCK))
3030 rc2 = flock_lock_file_wait(file, file_lock);
3031 if ((file_lock->fl_flags & FL_POSIX) &&
3032 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3033 !(flags & LDLM_FL_TEST_LOCK))
3034 rc2 = posix_lock_file_wait(file, file_lock);
3035 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
/* local bookkeeping failed: undo the remote lock with an NL enqueue */
3037 if (rc2 && file_lock->fl_type != F_UNLCK) {
3038 einfo.ei_mode = LCK_NL;
3039 md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data,
3044 ll_finish_md_op_data(op_data);
/*
 * ll_get_fid_by_name(): look up the FID of @name under @parent via an
 * MDC getattr-by-name RPC; fills *fid from the reply body and, when
 * @inode is non-NULL, instantiates the inode with ll_prep_inode().
 * Returns 0 on success or a negative errno.
 * NOTE(review): lossy listing — lines elided; comments only.
 */
3049 int ll_get_fid_by_name(struct inode *parent, const char *name,
3050 int namelen, struct lu_fid *fid,
3051 struct inode **inode)
3053 struct md_op_data *op_data = NULL;
3054 struct mdt_body *body;
3055 struct ptlrpc_request *req;
3059 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3060 LUSTRE_OPC_ANY, NULL);
3061 if (IS_ERR(op_data))
3062 RETURN(PTR_ERR(op_data));
/* only the FID and file type are needed from the MDT */
3064 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
3065 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3066 ll_finish_md_op_data(op_data);
3070 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3072 GOTO(out_req, rc = -EFAULT);
3074 *fid = body->mbo_fid1;
3077 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
3079 ptlrpc_req_finished(req);
/*
 * ll_migrate(): migrate directory entry @name under @parent to MDT @mdtidx
 * (the "lfs migrate -m" path). Resolves the child inode (dcache first, then
 * by-name RPC), refuses to migrate the filesystem root, skips the RPC when
 * the object is already on the target MDT, takes a write lease plus data
 * version for regular files, and performs the move as an MDS_RENAME_MIGRATE
 * rename RPC with CLI_MIGRATE.
 * NOTE(review): lossy listing — lines elided; comments only.
 */
3083 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3084 const char *name, int namelen)
3086 struct dentry *dchild = NULL;
3087 struct inode *child_inode = NULL;
3088 struct md_op_data *op_data;
3089 struct ptlrpc_request *request = NULL;
3090 struct obd_client_handle *och = NULL;
3092 struct mdt_body *body;
3094 __u64 data_version = 0;
3097 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3098 name, PFID(ll_inode2fid(parent)), mdtidx);
3100 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3101 0, LUSTRE_OPC_ANY, NULL);
3102 if (IS_ERR(op_data))
3103 RETURN(PTR_ERR(op_data));
3105 /* Get child FID first */
3106 qstr.hash = full_name_hash(name, namelen);
/* try the dcache before resorting to an RPC */
3109 dchild = d_lookup(file->f_path.dentry, &qstr);
3110 if (dchild != NULL) {
3111 if (dchild->d_inode != NULL)
3112 child_inode = igrab(dchild->d_inode);
3116 if (child_inode == NULL) {
3117 rc = ll_get_fid_by_name(parent, name, namelen,
3118 &op_data->op_fid3, &child_inode);
3123 if (child_inode == NULL)
3124 GOTO(out_free, rc = -EINVAL);
3127 * lfs migrate command needs to be blocked on the client
3128 * by checking the migrate FID against the FID of the
3131 if (child_inode == parent->i_sb->s_root->d_inode)
3132 GOTO(out_iput, rc = -EINVAL);
/* hold the child's i_mutex across version check and rename */
3134 mutex_lock(&child_inode->i_mutex);
3135 op_data->op_fid3 = *ll_inode2fid(child_inode);
3136 if (!fid_is_sane(&op_data->op_fid3)) {
3137 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
3138 ll_get_fsname(parent->i_sb, NULL, 0), name,
3139 PFID(&op_data->op_fid3));
3140 GOTO(out_unlock, rc = -EINVAL);
3143 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3145 GOTO(out_unlock, rc);
/* already at the destination MDT: nothing to do */
3148 CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name,
3149 PFID(&op_data->op_fid3), mdtidx);
3150 GOTO(out_unlock, rc = 0);
3153 if (S_ISREG(child_inode->i_mode)) {
/* take a write lease so concurrent writers invalidate the migration */
3154 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
3158 GOTO(out_unlock, rc);
3161 rc = ll_data_version(child_inode, &data_version,
3164 GOTO(out_close, rc);
3166 op_data->op_handle = och->och_fh;
3167 op_data->op_data = och->och_mod;
3168 op_data->op_data_version = data_version;
3169 op_data->op_lease_handle = och->och_lease_handle;
3170 op_data->op_bias |= MDS_RENAME_MIGRATE;
3173 op_data->op_mds = mdtidx;
3174 op_data->op_cli_flags = CLI_MIGRATE;
/* migration is expressed as a rename of the entry onto itself */
3175 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3176 namelen, name, namelen, &request);
3178 ll_update_times(request, parent);
3180 if (request != NULL) {
3181 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
3183 ptlrpc_req_finished(request);
3184 GOTO(out_close, rc = -EPROTO);
3187 /* If the server does release layout lock, then we cleanup
3188 * the client och here, otherwise release it in out_close: */
3190 body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
3191 obd_mod_put(och->och_mod);
3192 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
3194 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
3198 ptlrpc_req_finished(request);
3201 /* Try again if the file layout has changed. */
3202 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode)) {
3207 if (och != NULL) /* close the file */
3208 ll_lease_close(och, child_inode, NULL);
3210 clear_nlink(child_inode);
3212 mutex_unlock(&child_inode->i_mutex);
3216 ll_finish_md_op_data(op_data);
/*
 * ll_file_noflock(): stub used by the -o noflock file_operations tables;
 * body elided in this listing — presumably returns -ENOSYS (see the
 * "return ENOSYS on flock calls" comment before ll_file_operations_noflock);
 * confirm against the full source.
 */
3221 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3229 * test if some locks matching bits and l_req_mode are acquired
3230 * - bits can be in different locks
3231 * - if found clear the common lock bits in *bits
3232 * - the bits not found, are kept in *bits
3234 * \param bits [IN] searched lock bits [IN]
3235 * \param l_req_mode [IN] searched lock mode
3236 * \retval boolean, true iff all bits are found
/* NOTE(review): lossy listing — lines elided; comments only. */
3238 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
3240 struct lustre_handle lockh;
3241 union ldlm_policy_data policy;
/* LCK_MINMODE means "any mode": match against the union of all modes */
3242 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
3243 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
3252 fid = &ll_i2info(inode)->lli_fid;
3253 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3254 ldlm_lockname[mode]);
/* test-only match: don't take references or block */
3256 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* probe each requested inodebit individually */
3257 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3258 policy.l_inodebits.bits = *bits & (1 << i);
3259 if (policy.l_inodebits.bits == 0)
3262 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3263 &policy, mode, &lockh)) {
3264 struct ldlm_lock *lock;
3266 lock = ldlm_handle2lock(&lockh);
3269 ~(lock->l_policy_data.l_inodebits.bits);
3270 LDLM_LOCK_PUT(lock);
3272 *bits &= ~policy.l_inodebits.bits;
/*
 * ll_take_md_lock(): try to match (and take a reference on) a granted MDC
 * ibits lock covering @bits on @inode; returns the matched mode (0 if none)
 * and fills @lockh. Unlike ll_have_md_lock() this is not test-only, so the
 * caller owns a lock reference on success.
 * NOTE(review): lossy listing — lines elided; comments only.
 */
3279 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
3280 struct lustre_handle *lockh, __u64 flags,
3281 enum ldlm_mode mode)
3283 union ldlm_policy_data policy = { .l_inodebits = { bits } };
3288 fid = &ll_i2info(inode)->lli_fid;
3289 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3291 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3292 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * ll_inode_revalidate_fini(): post-process the result of a revalidation RPC.
 * -ENOENT on an already-unlinked inode is tolerated (striped directories
 * with a bad stripe get re-validated instead); other errors are logged,
 * with EACCES/EIDRM demoted to D_INFO since they are expected races.
 * NOTE(review): lossy listing — lines elided; comments only.
 */
3297 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3299 /* Already unlinked. Just update nlink and return success */
3300 if (rc == -ENOENT) {
3302 /* If it is striped directory, and there is bad stripe
3303 * Let's revalidate the dentry again, instead of returning
3305 if (S_ISDIR(inode->i_mode) &&
3306 ll_i2info(inode)->lli_lsm_md != NULL)
3309 /* This path cannot be hit for regular files unless in
3310 * case of obscure races, so no need to to validate
3312 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3314 } else if (rc != 0) {
3315 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3316 "%s: revalidate FID "DFID" error: rc = %d\n",
3317 ll_get_fsname(inode->i_sb, NULL, 0),
3318 PFID(ll_inode2fid(inode)), rc);
/*
 * __ll_inode_revalidate(): refresh @dentry's inode attributes from the MDT,
 * requesting the ibits in @ibits. Two paths:
 *  - servers with OBD_CONNECT_ATTRFID: intent lock (IT_GETATTR/IT_LOOKUP)
 *    by FID, invalidating the dentry if the object was unlinked;
 *  - otherwise: plain md_getattr, skipped entirely if a matching ibits
 *    lock is already cached locally (ll_have_md_lock).
 * NOTE(review): lossy listing — lines elided; comments only.
 */
3324 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3326 struct inode *inode = dentry->d_inode;
3327 struct ptlrpc_request *req = NULL;
3328 struct obd_export *exp;
3332 LASSERT(inode != NULL);
3334 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3335 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3337 exp = ll_i2mdexp(inode);
3339 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3340 * But under CMD case, it caused some lock issues, should be fixed
3341 * with new CMD ibits lock. See bug 12718 */
3342 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3343 struct lookup_intent oit = { .it_op = IT_GETATTR };
3344 struct md_op_data *op_data;
3346 if (ibits == MDS_INODELOCK_LOOKUP)
3347 oit.it_op = IT_LOOKUP;
3349 /* Call getattr by fid, so do not provide name at all. */
3350 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3351 dentry->d_inode, NULL, 0, 0,
3352 LUSTRE_OPC_ANY, NULL);
3353 if (IS_ERR(op_data))
3354 RETURN(PTR_ERR(op_data));
3356 rc = md_intent_lock(exp, op_data, &oit, &req,
3357 &ll_md_blocking_ast, 0);
3358 ll_finish_md_op_data(op_data);
3360 rc = ll_inode_revalidate_fini(inode, rc);
3364 rc = ll_revalidate_it_finish(req, &oit, dentry);
3366 ll_intent_release(&oit);
3370 /* Unlinked? Unhash dentry, so it is not picked up later by
3371 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3372 here to preserve get_cwd functionality on 2.6.
3374 if (!dentry->d_inode->i_nlink) {
3375 ll_lock_dcache(inode);
3376 d_lustre_invalidate(dentry, 0);
3377 ll_unlock_dcache(inode);
3380 ll_lookup_finish_locks(&oit, dentry);
/* no ATTRFID support: fall back to md_getattr unless already locked */
3381 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3382 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3383 u64 valid = OBD_MD_FLGETATTR;
3384 struct md_op_data *op_data;
3387 if (S_ISREG(inode->i_mode)) {
/* regular files also need EA (striping) data in the reply */
3388 rc = ll_get_default_mdsize(sbi, &ealen);
3391 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3394 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3395 0, ealen, LUSTRE_OPC_ANY,
3397 if (IS_ERR(op_data))
3398 RETURN(PTR_ERR(op_data));
3400 op_data->op_valid = valid;
3401 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3402 ll_finish_md_op_data(op_data);
3404 rc = ll_inode_revalidate_fini(inode, rc);
3408 rc = ll_prep_inode(&inode, req, NULL, NULL);
3411 ptlrpc_req_finished(req);
/*
 * ll_merge_md_attr(): for a striped directory, merge the per-stripe
 * attributes from all MDTs (md_merge_attr) and apply the combined
 * nlink/blocks/size/times to the local inode and ll_inode_info.
 * NOTE(review): lossy listing — lines elided; comments only.
 */
3415 static int ll_merge_md_attr(struct inode *inode)
3417 struct cl_attr attr = { 0 };
/* only meaningful when the directory has stripe metadata */
3420 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3421 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3422 &attr, ll_md_blocking_ast);
3426 set_nlink(inode, attr.cat_nlink);
3427 inode->i_blocks = attr.cat_blocks;
3428 i_size_write(inode, attr.cat_size);
3430 ll_i2info(inode)->lli_atime = attr.cat_atime;
3431 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
3432 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/*
 * ll_inode_revalidate(): full revalidation — metadata via
 * __ll_inode_revalidate(), then size/time refresh: striped directories
 * merge MDT attributes, non-regular inodes copy cached times, and regular
 * files glimpse the size from the OSTs unless an HSM restore is running
 * (the MDT already supplied the correct size in that case).
 * NOTE(review): lossy listing — lines elided; comments only.
 */
3438 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3440 struct inode *inode = dentry->d_inode;
3444 rc = __ll_inode_revalidate(dentry, ibits);
3448 /* if object isn't regular file, don't validate size */
3449 if (!S_ISREG(inode->i_mode)) {
3450 if (S_ISDIR(inode->i_mode) &&
3451 ll_i2info(inode)->lli_lsm_md != NULL) {
3452 rc = ll_merge_md_attr(inode);
3457 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
3458 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
3459 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
3461 /* In case of restore, the MDT has the right size and has
3462 * already send it back without granting the layout lock,
3463 * inode is up-to-date so glimpse is useless.
3464 * Also to glimpse we need the layout, in case of a running
3465 * restore the MDT holds the layout lock so the glimpse will
3466 * block up to the end of restore (getattr will block)
3468 if (!ll_file_test_flag(ll_i2info(inode), LLIF_FILE_RESTORING))
3469 rc = ll_glimpse_size(inode);
/*
 * ll_getattr(): ->getattr handler; revalidates UPDATE|LOOKUP ibits and then
 * copies the (now fresh) in-core inode attributes into @stat. 32-bit-API
 * clients get an ino synthesized from the FID via cl_fid_build_ino().
 * NOTE(review): lossy listing — lines elided; comments only.
 */
3474 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3476 struct inode *inode = de->d_inode;
3477 struct ll_sb_info *sbi = ll_i2sbi(inode);
3478 struct ll_inode_info *lli = ll_i2info(inode);
3481 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3482 MDS_INODELOCK_LOOKUP);
3483 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
/* fault-injection hook for getattr latency testing */
3488 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
3490 stat->dev = inode->i_sb->s_dev;
3491 if (ll_need_32bit_api(sbi))
3492 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3494 stat->ino = inode->i_ino;
3495 stat->mode = inode->i_mode;
3496 stat->uid = inode->i_uid;
3497 stat->gid = inode->i_gid;
3498 stat->rdev = inode->i_rdev;
3499 stat->atime = inode->i_atime;
3500 stat->mtime = inode->i_mtime;
3501 stat->ctime = inode->i_ctime;
3502 stat->blksize = 1 << inode->i_blkbits;
3504 stat->nlink = inode->i_nlink;
3505 stat->size = i_size_read(inode);
3506 stat->blocks = inode->i_blocks;
/*
 * ll_fiemap(): ->fiemap handler; marshals the kernel fiemap_extent_info
 * into a contiguous struct fiemap buffer (header + extents), runs
 * ll_do_fiemap(), and copies flags/extent results back to user space.
 * NOTE(review): lossy listing — lines elided; comments only.
 */
3511 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3512 __u64 start, __u64 len)
3516 struct fiemap *fiemap;
3517 unsigned int extent_count = fieinfo->fi_extents_max;
3519 num_bytes = sizeof(*fiemap) + (extent_count *
3520 sizeof(struct fiemap_extent));
3521 OBD_ALLOC_LARGE(fiemap, num_bytes);
3526 fiemap->fm_flags = fieinfo->fi_flags;
3527 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3528 fiemap->fm_start = start;
3529 fiemap->fm_length = len;
/* the first user extent may carry input (e.g. continuation) data */
3530 if (extent_count > 0 &&
3531 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3532 sizeof(struct fiemap_extent)) != 0)
3533 GOTO(out, rc = -EFAULT);
3535 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3537 fieinfo->fi_flags = fiemap->fm_flags;
3538 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3539 if (extent_count > 0 &&
3540 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3541 fiemap->fm_mapped_extents *
3542 sizeof(struct fiemap_extent)) != 0)
3543 GOTO(out, rc = -EFAULT);
3545 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * ll_get_acl(): return a referenced copy of the cached POSIX ACL stored in
 * ll_inode_info, under lli_lock. Caller (the VFS) releases the reference.
 */
3549 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3551 struct ll_inode_info *lli = ll_i2info(inode);
3552 struct posix_acl *acl = NULL;
3555 spin_lock(&lli->lli_lock);
3556 /* VFS' acl_permission_check->check_acl will release the refcount */
3557 acl = posix_acl_dup(lli->lli_posix_acl);
3558 spin_unlock(&lli->lli_lock);
/*
 * ll_check_acl(): ACL checker passed to generic_permission() on kernels
 * without the 2-arg variant; signature depends on kernel API generation.
 * With CONFIG_FS_POSIX_ACL it fetches the cached ACL and runs
 * posix_acl_permission(); RCU-walk mode bails out (4-arg variant).
 * NOTE(review): lossy listing — lines elided; comments only.
 */
3563 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3565 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3566 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3568 ll_check_acl(struct inode *inode, int mask)
3571 # ifdef CONFIG_FS_POSIX_ACL
3572 struct posix_acl *acl;
3576 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
/* cannot sleep during RCU path walk */
3577 if (flags & IPERM_FLAG_RCU)
3580 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3585 rc = posix_acl_permission(inode, acl, mask);
3586 posix_acl_release(acl);
3589 # else /* !CONFIG_FS_POSIX_ACL */
3591 # endif /* CONFIG_FS_POSIX_ACL */
3593 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * ll_inode_permission(): ->permission handler (three kernel-API variants).
 * Revalidates the root inode on first access, applies root squash by
 * temporarily overriding credentials (fsuid/fsgid squashed, FS capabilities
 * dropped), then delegates to either the remote-permission check
 * (LL_SBI_RMT_CLIENT) or generic permission with ll_check_acl.
 * NOTE(review): lossy listing — lines elided; comments only.
 */
3595 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3596 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3598 # ifdef HAVE_INODE_PERMISION_2ARGS
3599 int ll_inode_permission(struct inode *inode, int mask)
3601 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3606 struct ll_sb_info *sbi;
3607 struct root_squash_info *squash;
3608 struct cred *cred = NULL;
3609 const struct cred *old_cred = NULL;
3611 bool squash_id = false;
3614 #ifdef MAY_NOT_BLOCK
/* may not block during RCU path walk */
3615 if (mask & MAY_NOT_BLOCK)
3617 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3618 if (flags & IPERM_FLAG_RCU)
3622 /* as root inode are NOT getting validated in lookup operation,
3623 * need to do it before permission check. */
3625 if (inode == inode->i_sb->s_root->d_inode) {
3626 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3627 MDS_INODELOCK_LOOKUP);
3632 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3633 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3635 /* squash fsuid/fsgid if needed */
3636 sbi = ll_i2sbi(inode);
3637 squash = &sbi->ll_squash;
3638 if (unlikely(squash->rsi_uid != 0 &&
3639 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
3640 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3644 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3645 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3646 squash->rsi_uid, squash->rsi_gid);
3648 /* update current process's credentials
3649 * and FS capability */
3650 cred = prepare_creds();
3654 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
3655 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* drop every filesystem-related capability from the squashed creds */
3656 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3657 if ((1 << cap) & CFS_CAP_FS_MASK)
3658 cap_lower(cred->cap_effective, cap);
3660 old_cred = override_creds(cred);
3663 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3665 if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
3666 rc = lustre_check_remote_perm(inode, mask);
3668 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3670 /* restore current process's credentials and FS capability */
3672 revert_creds(old_cred);
3679 /* -o localflock - only provides locally consistent flock locks */
/* Default file_operations table: no .flock/.lock entries, so the kernel's
 * local lock tables apply (locally consistent only). Read/write entry
 * points depend on the kernel's iter-based I/O API availability. */
3680 struct file_operations ll_file_operations = {
3681 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3682 # ifdef HAVE_SYNC_READ_WRITE
3683 .read = new_sync_read,
3684 .write = new_sync_write,
3686 .read_iter = ll_file_read_iter,
3687 .write_iter = ll_file_write_iter,
3688 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3689 .read = ll_file_read,
3690 .aio_read = ll_file_aio_read,
3691 .write = ll_file_write,
3692 .aio_write = ll_file_aio_write,
3693 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3694 .unlocked_ioctl = ll_file_ioctl,
3695 .open = ll_file_open,
3696 .release = ll_file_release,
3697 .mmap = ll_file_mmap,
3698 .llseek = ll_file_seek,
3699 .splice_read = ll_file_splice_read,
/* file_operations used with cluster-coherent locking: identical to
 * ll_file_operations except .flock/.lock route through ll_file_flock,
 * which enqueues LDLM flock locks on the MDT. */
3704 struct file_operations ll_file_operations_flock = {
3705 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3706 # ifdef HAVE_SYNC_READ_WRITE
3707 .read = new_sync_read,
3708 .write = new_sync_write,
3709 # endif /* HAVE_SYNC_READ_WRITE */
3710 .read_iter = ll_file_read_iter,
3711 .write_iter = ll_file_write_iter,
3712 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3713 .read = ll_file_read,
3714 .aio_read = ll_file_aio_read,
3715 .write = ll_file_write,
3716 .aio_write = ll_file_aio_write,
3717 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3718 .unlocked_ioctl = ll_file_ioctl,
3719 .open = ll_file_open,
3720 .release = ll_file_release,
3721 .mmap = ll_file_mmap,
3722 .llseek = ll_file_seek,
3723 .splice_read = ll_file_splice_read,
3726 .flock = ll_file_flock,
3727 .lock = ll_file_flock
3730 /* These are for -o noflock - to return ENOSYS on flock calls */
/* Same table again, but .flock/.lock are wired to the ll_file_noflock
 * stub so lock requests fail instead of being granted locally. */
3731 struct file_operations ll_file_operations_noflock = {
3732 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3733 # ifdef HAVE_SYNC_READ_WRITE
3734 .read = new_sync_read,
3735 .write = new_sync_write,
3736 # endif /* HAVE_SYNC_READ_WRITE */
3737 .read_iter = ll_file_read_iter,
3738 .write_iter = ll_file_write_iter,
3739 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3740 .read = ll_file_read,
3741 .aio_read = ll_file_aio_read,
3742 .write = ll_file_write,
3743 .aio_write = ll_file_aio_write,
3744 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3745 .unlocked_ioctl = ll_file_ioctl,
3746 .open = ll_file_open,
3747 .release = ll_file_release,
3748 .mmap = ll_file_mmap,
3749 .llseek = ll_file_seek,
3750 .splice_read = ll_file_splice_read,
3753 .flock = ll_file_noflock,
3754 .lock = ll_file_noflock
/* inode_operations for regular Lustre files; .get_acl only exists on
 * kernels providing that inode-operation slot. */
3757 struct inode_operations ll_file_inode_operations = {
3758 .setattr = ll_setattr,
3759 .getattr = ll_getattr,
3760 .permission = ll_inode_permission,
3761 .setxattr = ll_setxattr,
3762 .getxattr = ll_getxattr,
3763 .listxattr = ll_listxattr,
3764 .removexattr = ll_removexattr,
3765 .fiemap = ll_fiemap,
3766 #ifdef HAVE_IOP_GET_ACL
3767 .get_acl = ll_get_acl,
3771 /* dynamic ioctl number support routins */
/* Global registry of dynamically registered ioctl handlers: a list of
 * llioc_data entries guarded by an rwsem. llioc_data carries the callback,
 * the number of commands and a flexible array of command numbers.
 * NOTE(review): lossy listing — lines elided; comments only. */
3772 static struct llioc_ctl_data {
3773 struct rw_semaphore ioc_sem;
3774 struct list_head ioc_head;
3776 __RWSEM_INITIALIZER(llioc.ioc_sem),
3777 LIST_HEAD_INIT(llioc.ioc_head)
3782 struct list_head iocd_list;
3783 unsigned int iocd_size;
3784 llioc_callback_t iocd_cb;
3785 unsigned int iocd_count;
3786 unsigned int iocd_cmd[0];
/*
 * ll_iocontrol_register(): register callback @cb for @count dynamic ioctl
 * numbers in @cmd. Allocates an llioc_data, copies the command list, and
 * appends it to the global registry under the write-side rwsem. The
 * returned pointer serves as the opaque handle for unregistration.
 * NOTE(review): lossy listing — lines elided; comments only.
 */
3789 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3792 struct llioc_data *in_data = NULL;
3795 if (cb == NULL || cmd == NULL ||
3796 count > LLIOC_MAX_CMD || count < 0)
3799 size = sizeof(*in_data) + count * sizeof(unsigned int);
3800 OBD_ALLOC(in_data, size);
3801 if (in_data == NULL)
3804 memset(in_data, 0, sizeof(*in_data));
3805 in_data->iocd_size = size;
3806 in_data->iocd_cb = cb;
3807 in_data->iocd_count = count;
3808 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3810 down_write(&llioc.ioc_sem);
3811 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3812 up_write(&llioc.ioc_sem);
/*
 * ll_iocontrol_unregister(): remove and free the registry entry identified
 * by @magic (the pointer returned by ll_iocontrol_register). Warns if no
 * matching entry is found.
 * NOTE(review): lossy listing — lines elided; comments only.
 */
3817 void ll_iocontrol_unregister(void *magic)
3819 struct llioc_data *tmp;
3824 down_write(&llioc.ioc_sem);
3825 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3827 unsigned int size = tmp->iocd_size;
3829 list_del(&tmp->iocd_list);
3830 up_write(&llioc.ioc_sem);
3832 OBD_FREE(tmp, size);
3836 up_write(&llioc.ioc_sem);
3838 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3841 EXPORT_SYMBOL(ll_iocontrol_register);
3842 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * ll_iocontrol_call(): dispatch @cmd to the registered dynamic-ioctl
 * callbacks under the read-side rwsem; iteration stops once a callback
 * returns LLIOC_STOP. The callback's rc is passed back through *rcp.
 * NOTE(review): lossy listing — lines elided; comments only.
 */
3844 static enum llioc_iter
3845 ll_iocontrol_call(struct inode *inode, struct file *file,
3846 unsigned int cmd, unsigned long arg, int *rcp)
3848 enum llioc_iter ret = LLIOC_CONT;
3849 struct llioc_data *data;
3850 int rc = -EINVAL, i;
3852 down_read(&llioc.ioc_sem);
3853 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3854 for (i = 0; i < data->iocd_count; i++) {
3855 if (cmd != data->iocd_cmd[i])
3858 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3862 if (ret == LLIOC_STOP)
3865 up_read(&llioc.ioc_sem);
/*
 * ll_layout_conf(): push a layout configuration into the cl_object stack
 * via cl_conf_set(). For OBJECT_CONF_SET the DLM layout lock is made
 * matchable only after the layout has been applied (so no stale layout
 * can be observed), and the cached layout generation is refreshed.
 * NOTE(review): lossy listing — lines elided; comments only.
 */
3872 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3874 struct ll_inode_info *lli = ll_i2info(inode);
3875 struct cl_object *obj = lli->lli_clob;
3876 struct cl_env_nest nest;
3884 env = cl_env_nested_get(&nest);
3886 RETURN(PTR_ERR(env));
3888 rc = cl_conf_set(env, lli->lli_clob, conf);
3892 if (conf->coc_opc == OBJECT_CONF_SET) {
3893 struct ldlm_lock *lock = conf->coc_lock;
3894 struct cl_layout cl = {
3898 LASSERT(lock != NULL);
3899 LASSERT(ldlm_has_layout(lock));
3901 /* it can only be allowed to match after layout is
3902 * applied to inode otherwise false layout would be
3903 * seen. Applying layout shoud happen before dropping
3904 * the intent lock. */
3905 ldlm_lock_allow_match(lock);
3907 rc = cl_object_layout_get(env, obj, &cl);
3912 DFID": layout version change: %u -> %u\n",
3913 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3915 ll_layout_version_set(lli, cl.cl_layout_gen);
3919 cl_env_nested_put(&nest, env);
3924 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/*
 * If the layout lock was granted via completion AST, its LVB buffer may
 * not hold the layout; fetch the LOV EA from the MDT with a getxattr RPC
 * and install it as the lock's LVB (LVB_T_LAYOUT) under the lock's
 * resource lock. Frees the temp buffer if another thread raced us in.
 * NOTE(review): lossy listing — lines elided; comments only.
 */
3925 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3928 struct ll_sb_info *sbi = ll_i2sbi(inode);
3929 struct ptlrpc_request *req;
3930 struct mdt_body *body;
3937 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3938 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
3939 lock->l_lvb_data, lock->l_lvb_len);
/* layout already attached to the lock: nothing to fetch */
3941 if (lock->l_lvb_data != NULL)
3944 /* if layout lock was granted right away, the layout is returned
3945 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3946 * blocked and then granted via completion ast, we have to fetch
3947 * layout here. Please note that we can't use the LVB buffer in
3948 * completion AST because it doesn't have a large enough buffer */
3949 rc = ll_get_default_mdsize(sbi, &lmmsize);
3951 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
3952 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3957 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3959 GOTO(out, rc = -EPROTO);
3961 lmmsize = body->mbo_eadatasize;
3962 if (lmmsize == 0) /* empty layout */
3965 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3967 GOTO(out, rc = -EFAULT);
3969 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3970 if (lvbdata == NULL)
3971 GOTO(out, rc = -ENOMEM);
3973 memcpy(lvbdata, lmm, lmmsize);
3974 lock_res_and_lock(lock);
/* install only if no one beat us to it; otherwise free our copy */
3975 if (unlikely(lock->l_lvb_data == NULL)) {
3976 lock->l_lvb_type = LVB_T_LAYOUT;
3977 lock->l_lvb_data = lvbdata;
3978 lock->l_lvb_len = lmmsize;
3981 unlock_res_and_lock(lock);
3984 OBD_FREE_LARGE(lvbdata, lmmsize);
3989 ptlrpc_req_finished(req);
3994 * Apply the layout to the inode. Layout lock is held and will be released
/*
 * ll_layout_lock_set(): with the layout lock in @lockh held (mode @mode),
 * fetch the layout if necessary and apply it to the cl_object via
 * OBJECT_CONF_SET. If applying fails with -EBUSY (I/O still using the old
 * layout), the lock is released and OBJECT_CONF_WAIT blocks until in-flight
 * I/O drains. The lock reference is always dropped before returning.
 * NOTE(review): lossy listing — lines elided; comments only.
 */
3997 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
3998 struct inode *inode)
4000 struct ll_inode_info *lli = ll_i2info(inode);
4001 struct ll_sb_info *sbi = ll_i2sbi(inode);
4002 struct ldlm_lock *lock;
4003 struct cl_object_conf conf;
4006 bool wait_layout = false;
4009 LASSERT(lustre_handle_is_used(lockh));
4011 lock = ldlm_handle2lock(lockh);
4012 LASSERT(lock != NULL);
4013 LASSERT(ldlm_has_layout(lock));
4015 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4016 PFID(&lli->lli_fid), inode);
4018 /* in case this is a caching lock and reinstate with new inode */
4019 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
4021 lock_res_and_lock(lock);
4022 lvb_ready = ldlm_is_lvb_ready(lock);
4023 unlock_res_and_lock(lock);
4024 /* checking lvb_ready is racy but this is okay. The worst case is
4025 * that multi processes may configure the file on the same time. */
4030 rc = ll_layout_fetch(inode, lock);
4034 /* for layout lock, lmm is stored in lock's lvb.
4035 * lvb_data is immutable if the lock is held so it's safe to access it
4038 * set layout to file. Unlikely this will fail as old layout was
4039 * surely eliminated */
4040 memset(&conf, 0, sizeof conf);
4041 conf.coc_opc = OBJECT_CONF_SET;
4042 conf.coc_inode = inode;
4043 conf.coc_lock = lock;
4044 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
4045 conf.u.coc_layout.lb_len = lock->l_lvb_len;
4046 rc = ll_layout_conf(inode, &conf);
4048 /* refresh layout failed, need to wait */
4049 wait_layout = rc == -EBUSY;
4053 LDLM_LOCK_PUT(lock);
4054 ldlm_lock_decref(lockh, mode);
4056 /* wait for IO to complete if it's still being used. */
4058 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
4059 ll_get_fsname(inode->i_sb, NULL, 0),
4060 PFID(&lli->lli_fid), inode);
4062 memset(&conf, 0, sizeof conf);
4063 conf.coc_opc = OBJECT_CONF_WAIT;
4064 conf.coc_inode = inode;
4065 rc = ll_layout_conf(inode, &conf);
4069 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4070 ll_get_fsname(inode->i_sb, NULL, 0),
4071 PFID(&lli->lli_fid), rc);
/*
 * Refresh the layout of \a inode while holding lli_layout_mutex
 * (caller's responsibility — see ll_layout_refresh()).
 *
 * First try to match an already-cached layout lock; if none is found,
 * enqueue a new IT_LAYOUT intent lock against the MDT and apply the
 * returned layout via ll_layout_lock_set().
 *
 * \param[in] inode  inode whose layout should be refreshed
 *
 * \retval 0 on success, negative errno on failure.
 *
 * NOTE(review): this excerpt omits several control-flow lines of the
 * original (rc declaration, early returns, retry handling).
 */
4076 static int ll_layout_refresh_locked(struct inode *inode)
4078 struct ll_inode_info *lli = ll_i2info(inode);
4079 struct ll_sb_info *sbi = ll_i2sbi(inode);
4080 struct md_op_data *op_data;
4081 struct lookup_intent it;
4082 struct lustre_handle lockh;
4083 enum ldlm_mode mode;
/* enqueue parameters: inode-bits lock with the standard llite
 * blocking and completion callbacks */
4084 struct ldlm_enqueue_info einfo = {
4085 .ei_type = LDLM_IBITS,
4087 .ei_cb_bl = &ll_md_blocking_ast,
4088 .ei_cb_cp = &ldlm_completion_ast,
4094 /* mostly the layout lock is cached on the local side, so try to match
4095 * it before grabbing the layout lock mutex. */
4096 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4097 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4098 if (mode != 0) { /* hit cached lock */
4099 rc = ll_layout_lock_set(&lockh, mode, inode);
/* no cached lock: prepare an MD operation for the enqueue */
4106 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4107 0, 0, LUSTRE_OPC_ANY, NULL);
4108 if (IS_ERR(op_data))
4109 RETURN(PTR_ERR(op_data));
4111 /* have to enqueue one */
4112 memset(&it, 0, sizeof(it));
4113 it.it_op = IT_LAYOUT;
4114 lockh.cookie = 0ULL;
4116 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4117 ll_get_fsname(inode->i_sb, NULL, 0),
4118 PFID(&lli->lli_fid), inode);
4120 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, &it, op_data, &lockh, 0);
/* the intent's RPC reply is not needed past this point */
4121 if (it.it_request != NULL)
4122 ptlrpc_req_finished(it.it_request);
4123 it.it_request = NULL;
4125 ll_finish_md_op_data(op_data);
/* transfer ownership of the granted lock mode out of the intent so
 * ll_intent_drop_lock() does not release it; we release it ourselves
 * via ll_layout_lock_set() below */
4127 mode = it.it_lock_mode;
4128 it.it_lock_mode = 0;
4129 ll_intent_drop_lock(&it);
4132 /* set lock data in case this is a new lock */
4133 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4134 rc = ll_layout_lock_set(&lockh, mode, inode);
4143 * This function checks if there exists a LAYOUT lock on the client side,
4144 * or enqueues it if it doesn't have one in cache.
4146 * This function will not hold the layout lock, so it may be revoked any time
4147 * after this function returns. Any operation that depends on the layout should
 * be redone
4150 * This function should be called before lov_io_init() to get an uptodate
4151 * layout version, the caller should save the version number and after IO
4152 * is finished, this function should be called again to verify that layout
4153 * is not changed during IO time.
/*
 * Ensure the client holds an up-to-date layout for \a inode and report
 * its generation (see the block comment above for the full contract).
 *
 * \param[in]  inode  regular file whose layout is checked/refreshed
 * \param[out] gen    current layout generation after the refresh
 *
 * \retval 0 on success, negative errno on failure.
 *
 * NOTE(review): this excerpt omits the early-return and RETURN lines
 * of the original body.
 */
4155 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4157 struct ll_inode_info *lli = ll_i2info(inode);
4158 struct ll_sb_info *sbi = ll_i2sbi(inode);
/* fast path: layout locking disabled on this mount, or the inode
 * already has a valid layout generation — nothing to refresh */
4162 *gen = ll_layout_version_get(lli);
4163 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
/* layout locks only make sense for sane FIDs on regular files */
4167 LASSERT(fid_is_sane(ll_inode2fid(inode)));
4168 LASSERT(S_ISREG(inode->i_mode));
4170 /* take the layout lock mutex to enqueue the layout lock exclusively. */
4171 mutex_lock(&lli->lli_layout_mutex);
4173 rc = ll_layout_refresh_locked(inode);
/* re-read the generation: the refresh may have installed a new layout */
4177 *gen = ll_layout_version_get(lli);
4179 mutex_unlock(&lli->lli_layout_mutex);
4185 * This function sends a restore request to the MDT
4187 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4189 struct hsm_user_request *hur;
4193 len = sizeof(struct hsm_user_request) +
4194 sizeof(struct hsm_user_item);
4195 OBD_ALLOC(hur, len);
4199 hur->hur_request.hr_action = HUA_RESTORE;
4200 hur->hur_request.hr_archive_id = 0;
4201 hur->hur_request.hr_flags = 0;
4202 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4203 sizeof(hur->hur_user_item[0].hui_fid));
4204 hur->hur_user_item[0].hui_extent.offset = offset;
4205 hur->hur_user_item[0].hui_extent.length = length;
4206 hur->hur_request.hr_itemcount = 1;
4207 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,