4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2015, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <linux/pagemap.h>
46 #include <linux/file.h>
47 #include <linux/sched.h>
48 #include <linux/user_namespace.h>
49 #ifdef HAVE_UIDGID_HEADER
50 # include <linux/uidgid.h>
52 #include <lustre/ll_fiemap.h>
54 #include <lustre_ioctl.h>
55 #include <lustre_swab.h>
57 #include "cl_object.h"
58 #include "llite_internal.h"
59 #include "vvp_internal.h"
62 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
64 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
67 static enum llioc_iter
68 ll_iocontrol_call(struct inode *inode, struct file *file,
69 unsigned int cmd, unsigned long arg, int *rcp);
/*
 * Allocate per-open-file private data from the ll_file_data_slab cache.
 * GFP_NOFS avoids recursing into the filesystem under memory pressure.
 * NOTE(review): source view is truncated here -- the NULL check on fd and
 * the return statement are not visible; confirm against the full file.
 */
71 static struct ll_file_data *ll_file_data_get(void)
73 struct ll_file_data *fd;
75 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
79 fd->fd_write_failed = false;
/* Release an ll_file_data previously obtained from ll_file_data_get(). */
84 static void ll_file_data_put(struct ll_file_data *fd)
87 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
91 * Packs all the attributes into @op_data for the CLOSE rpc.
93 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
94 struct obd_client_handle *och)
98 ll_prep_md_op_data(op_data, inode, NULL, NULL,
99 0, 0, LUSTRE_OPC_ANY, NULL);
/* Snapshot the client-side inode attributes so the MDT sees the final
 * state at close time. */
101 op_data->op_attr.ia_mode = inode->i_mode;
102 op_data->op_attr.ia_atime = inode->i_atime;
103 op_data->op_attr.ia_mtime = inode->i_mtime;
104 op_data->op_attr.ia_ctime = inode->i_ctime;
105 op_data->op_attr.ia_size = i_size_read(inode);
/* The *_SET flags tell the server to take these timestamps verbatim
 * rather than using its own clock. */
106 op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
107 ATTR_MTIME | ATTR_MTIME_SET |
108 ATTR_CTIME | ATTR_CTIME_SET;
109 op_data->op_attr_blocks = inode->i_blocks;
110 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
111 op_data->op_handle = och->och_fh;
113 if (och->och_flags & FMODE_WRITE &&
114 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
115 /* For HSM: if inode data has been modified, pack it so that
116 * MDT can set data dirty flag in the archive. */
117 op_data->op_bias |= MDS_DATA_MODIFIED;
123 * Perform a close, possibly with a bias.
124 * The meaning of "data" depends on the value of "bias".
126 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
127 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
130 static int ll_close_inode_openhandle(struct inode *inode,
131 struct obd_client_handle *och,
132 enum mds_op_bias bias, void *data)
134 struct obd_export *md_exp = ll_i2mdexp(inode);
135 const struct ll_inode_info *lli = ll_i2info(inode);
136 struct md_op_data *op_data;
137 struct ptlrpc_request *req = NULL;
/* Guard against a torn-down MDC connection (e.g. during umount). */
141 if (class_exp2obd(md_exp) == NULL) {
142 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
143 ll_get_fsname(inode->i_sb, NULL, 0),
144 PFID(&lli->lli_fid));
148 OBD_ALLOC_PTR(op_data);
149 /* We leak openhandle and request here on error, but not much to be
150 * done in OOM case since app won't retry close on error either. */
152 GOTO(out, rc = -ENOMEM);
154 ll_prepare_close(inode, op_data, och);
/* Bias-specific packing; each biased close carries the lease handle so
 * the MDT can validate the caller still owns the lease. */
156 case MDS_CLOSE_LAYOUT_SWAP:
157 LASSERT(data != NULL);
158 op_data->op_bias |= MDS_CLOSE_LAYOUT_SWAP;
159 op_data->op_data_version = 0;
160 op_data->op_lease_handle = och->och_lease_handle;
/* For layout swap, data is the second inode (see header comment). */
161 op_data->op_fid2 = *ll_inode2fid(data);
164 case MDS_HSM_RELEASE:
165 LASSERT(data != NULL);
166 op_data->op_bias |= MDS_HSM_RELEASE;
/* For HSM release, data is the expected data version (__u64). */
167 op_data->op_data_version = *(__u64 *)data;
168 op_data->op_lease_handle = och->och_lease_handle;
169 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
173 LASSERT(data == NULL);
177 rc = md_close(md_exp, op_data, och->och_mod, &req);
178 if (rc != 0 && rc != -EINTR)
179 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
180 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* For biased closes, check the reply body to learn whether the server
 * actually executed the close intent (lease may have been broken). */
183 op_data->op_bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)) {
184 struct mdt_body *body;
186 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
187 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
191 ll_finish_md_op_data(op_data);
195 md_clear_open_replay_data(md_exp, och);
/* Poison the handle so stale use is detectable. */
196 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
199 ptlrpc_req_finished(req); /* This is close request */
/*
 * Drop one mode-specific MDS open handle reference for @inode; when the
 * last user in this mode goes away, send the close RPC to the MDT.
 * NOTE(review): view is truncated -- the handle-detach lines between the
 * usecount check and the close call are not visible here.
 */
203 int ll_md_real_close(struct inode *inode, fmode_t fmode)
205 struct ll_inode_info *lli = ll_i2info(inode);
206 struct obd_client_handle **och_p;
207 struct obd_client_handle *och;
/* Select the per-mode handle slot and its use counter. */
212 if (fmode & FMODE_WRITE) {
213 och_p = &lli->lli_mds_write_och;
214 och_usecount = &lli->lli_open_fd_write_count;
215 } else if (fmode & FMODE_EXEC) {
216 och_p = &lli->lli_mds_exec_och;
217 och_usecount = &lli->lli_open_fd_exec_count;
219 LASSERT(fmode & FMODE_READ);
220 och_p = &lli->lli_mds_read_och;
221 och_usecount = &lli->lli_open_fd_read_count;
224 mutex_lock(&lli->lli_och_mutex);
225 if (*och_usecount > 0) {
226 /* There are still users of this handle, so skip
228 mutex_unlock(&lli->lli_och_mutex);
234 mutex_unlock(&lli->lli_och_mutex);
237 /* There might be a race and this handle may already
239 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/*
 * Per-struct-file close: release group lock, lease and extra openhandle
 * held by this file descriptor, drop the mode usecount, and close the
 * MDS open handle unless a matching OPEN lock lets us skip the RPC.
 */
245 static int ll_md_close(struct inode *inode, struct file *file)
247 union ldlm_policy_data policy = {
248 .l_inodebits = { MDS_INODELOCK_OPEN },
/* TEST_LOCK: we only probe for an existing granted OPEN lock below,
 * we do not take a new reference on it. */
250 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
251 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
252 struct ll_inode_info *lli = ll_i2info(inode);
253 struct lustre_handle lockh;
254 enum ldlm_mode lockmode;
258 /* clear group lock, if present */
259 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
260 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
262 if (fd->fd_lease_och != NULL) {
265 /* Usually the lease is not released when the
266 * application crashed, we need to release here. */
267 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
268 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
269 PFID(&lli->lli_fid), rc, lease_broken);
271 fd->fd_lease_och = NULL;
/* An extra openhandle (e.g. kept for a lease) is closed directly. */
274 if (fd->fd_och != NULL) {
275 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
280 /* Let's see if we have good enough OPEN lock on the file and if
281 we can skip talking to MDS */
282 mutex_lock(&lli->lli_och_mutex);
283 if (fd->fd_omode & FMODE_WRITE) {
285 LASSERT(lli->lli_open_fd_write_count);
286 lli->lli_open_fd_write_count--;
287 } else if (fd->fd_omode & FMODE_EXEC) {
289 LASSERT(lli->lli_open_fd_exec_count);
290 lli->lli_open_fd_exec_count--;
293 LASSERT(lli->lli_open_fd_read_count);
294 lli->lli_open_fd_read_count--;
296 mutex_unlock(&lli->lli_och_mutex);
/* No cached OPEN lock in the needed mode -> must do the real close. */
298 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
299 LDLM_IBITS, &policy, lockmode, &lockh))
300 rc = ll_md_real_close(inode, fd->fd_omode);
303 LUSTRE_FPRIVATE(file) = NULL;
304 ll_file_data_put(fd);
309 /* While this returns an error code, fput() the caller does not, so we need
310 * to make every effort to clean up all of our state here. Also, applications
311 * rarely check close errors and even if an error is returned they will not
312 * re-try the close call.
314 int ll_file_release(struct inode *inode, struct file *file)
316 struct ll_file_data *fd;
317 struct ll_sb_info *sbi = ll_i2sbi(inode);
318 struct ll_inode_info *lli = ll_i2info(inode);
322 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
323 PFID(ll_inode2fid(inode)), inode);
/* Remote-client ACL bookkeeping is torn down only for the root inode. */
325 #ifdef CONFIG_FS_POSIX_ACL
326 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
327 inode == inode->i_sb->s_root->d_inode) {
328 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
331 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
332 fd->fd_flags &= ~LL_FILE_RMTACL;
333 rct_del(&sbi->ll_rct, current_pid());
334 et_search_free(&sbi->ll_et, current_pid());
/* Root releases are not counted in the stats. */
339 if (inode->i_sb->s_root != file->f_path.dentry)
340 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
341 fd = LUSTRE_FPRIVATE(file);
344 /* The last ref on @file, maybe not the the owner pid of statahead,
345 * because parent and child process can share the same file handle. */
346 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
347 ll_deauthorize_statahead(inode, fd);
/* The root dentry has no MDS open handle to close; just free fd. */
349 if (inode->i_sb->s_root == file->f_path.dentry) {
350 LUSTRE_FPRIVATE(file) = NULL;
351 ll_file_data_put(fd);
/* Propagate deferred async write errors gathered by the LOV layer. */
355 if (!S_ISDIR(inode->i_mode)) {
356 if (lli->lli_clob != NULL)
357 lov_read_and_clear_async_rc(lli->lli_clob);
358 lli->lli_async_rc = 0;
361 rc = ll_md_close(inode, file);
363 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
364 libcfs_debug_dumplog();
/*
 * Send an intent-OPEN RPC to the MDS for @file, optionally packing
 * striping data (@lmm/@lmmsize); on success fill the inode from the
 * reply and attach the returned lock data.
 */
369 static int ll_intent_file_open(struct file *file, void *lmm, int lmmsize,
370 struct lookup_intent *itp)
372 struct dentry *de = file->f_path.dentry;
373 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
374 struct dentry *parent = de->d_parent;
375 const char *name = NULL;
377 struct md_op_data *op_data;
378 struct ptlrpc_request *req = NULL;
382 LASSERT(parent != NULL);
383 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
385 /* if server supports open-by-fid, or file name is invalid, don't pack
386 * name in open request */
387 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
388 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
389 name = de->d_name.name;
390 len = de->d_name.len;
393 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
394 name, len, 0, LUSTRE_OPC_ANY, NULL);
396 RETURN(PTR_ERR(op_data));
397 op_data->op_data = lmm;
398 op_data->op_data_size = lmmsize;
400 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
401 &ll_md_blocking_ast, 0);
402 ll_finish_md_op_data(op_data);
404 /* reason for keep own exit path - don`t flood log
405 * with messages with -ESTALE errors.
/* If the server opened a handle we won't use (no usable open
 * disposition), release it so it is not leaked on the MDT. */
407 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
408 it_open_error(DISP_OPEN_OPEN, itp))
410 ll_release_openhandle(de, itp);
414 if (it_disposition(itp, DISP_LOOKUP_NEG))
415 GOTO(out, rc = -ENOENT);
417 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
418 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
419 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
423 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
424 if (!rc && itp->it_lock_mode)
425 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
428 ptlrpc_req_finished(req);
429 ll_intent_drop_lock(itp);
/*
 * Populate @och from the MDT reply body carried by intent @it and
 * register the handle for open replay on recovery.
 */
434 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
435 struct obd_client_handle *och)
437 struct mdt_body *body;
439 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
440 och->och_fh = body->mbo_handle;
441 och->och_fid = body->mbo_fid1;
442 och->och_lease_handle.cookie = it->it_lock_handle;
443 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
444 och->och_flags = it->it_flags;
446 return md_set_open_replay_data(md_exp, och, it);
/*
 * Finish the client-local part of an open: optionally fill @och from the
 * intent, then attach @fd as the file's private data and initialize its
 * readahead state, lock and cl_io context list.
 */
449 static int ll_local_open(struct file *file, struct lookup_intent *it,
450 struct ll_file_data *fd, struct obd_client_handle *och)
452 struct inode *inode = file->f_path.dentry->d_inode;
455 LASSERT(!LUSTRE_FPRIVATE(file));
462 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
467 LUSTRE_FPRIVATE(file) = fd;
468 ll_readahead_init(inode, &fd->fd_ras);
/* Record the open mode; used at close to pick the right och slot. */
469 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
471 /* ll_cl_context initialize */
472 rwlock_init(&fd->fd_lock);
473 INIT_LIST_HEAD(&fd->fd_lccs);
478 /* Open a file, and (for the very first open) create objects on the OSTs at
479 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
480 * creation or open until ll_lov_setstripe() ioctl is called.
482 * If we already have the stripe MD locally then we don't request it in
483 * md_open(), by passing a lmm_size = 0.
485 * It is up to the application to ensure no other processes open this file
486 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
487 * used. We might be able to avoid races of that sort by getting lli_open_sem
488 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
489 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
491 int ll_file_open(struct inode *inode, struct file *file)
493 struct ll_inode_info *lli = ll_i2info(inode);
494 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
495 .it_flags = file->f_flags };
496 struct obd_client_handle **och_p = NULL;
497 __u64 *och_usecount = NULL;
498 struct ll_file_data *fd;
502 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
503 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* An intent prepared during lookup may be stashed in private_data. */
505 it = file->private_data; /* XXX: compat macro */
506 file->private_data = NULL; /* prevent ll_local_open assertion */
508 fd = ll_file_data_get();
510 GOTO(out_openerr, rc = -ENOMEM);
513 if (S_ISDIR(inode->i_mode))
514 ll_authorize_statahead(inode, fd);
/* Root dentry: no MDS open needed, just attach the private data. */
516 if (inode->i_sb->s_root == file->f_path.dentry) {
517 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup: build our own open intent (oit). */
521 if (!it || !it->it_disposition) {
522 /* Convert f_flags into access mode. We cannot use file->f_mode,
523 * because everything but O_ACCMODE mask was stripped from
/* O_ACCMODE trick: (flags + 1) maps O_RDONLY/O_WRONLY/O_RDWR onto
 * FMODE_READ/FMODE_WRITE bits. */
525 if ((oit.it_flags + 1) & O_ACCMODE)
527 if (file->f_flags & O_TRUNC)
528 oit.it_flags |= FMODE_WRITE;
530 /* kernel only call f_op->open in dentry_open. filp_open calls
531 * dentry_open after call to open_namei that checks permissions.
532 * Only nfsd_open call dentry_open directly without checking
533 * permissions and because of that this code below is safe. */
534 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
535 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
537 /* We do not want O_EXCL here, presumably we opened the file
538 * already? XXX - NFS implications? */
539 oit.it_flags &= ~O_EXCL;
541 /* bug20584, if "it_flags" contains O_CREAT, the file will be
542 * created if necessary, then "IT_CREAT" should be set to keep
543 * consistent with it */
544 if (oit.it_flags & O_CREAT)
545 oit.it_op |= IT_CREAT;
551 /* Let's see if we have file open on MDS already. */
552 if (it->it_flags & FMODE_WRITE) {
553 och_p = &lli->lli_mds_write_och;
554 och_usecount = &lli->lli_open_fd_write_count;
555 } else if (it->it_flags & FMODE_EXEC) {
556 och_p = &lli->lli_mds_exec_och;
557 och_usecount = &lli->lli_open_fd_exec_count;
559 och_p = &lli->lli_mds_read_och;
560 och_usecount = &lli->lli_open_fd_read_count;
563 mutex_lock(&lli->lli_och_mutex);
564 if (*och_p) { /* Open handle is present */
565 if (it_disposition(it, DISP_OPEN_OPEN)) {
566 /* Well, there's extra open request that we do not need,
567 let's close it somehow. This will decref request. */
568 rc = it_open_error(DISP_OPEN_OPEN, it);
570 mutex_unlock(&lli->lli_och_mutex);
571 GOTO(out_openerr, rc);
574 ll_release_openhandle(file->f_path.dentry, it);
/* Reuse the cached handle; NULL och means "share existing". */
578 rc = ll_local_open(file, it, fd, NULL);
581 mutex_unlock(&lli->lli_och_mutex);
582 GOTO(out_openerr, rc);
585 LASSERT(*och_usecount == 0);
586 if (!it->it_disposition) {
587 /* We cannot just request lock handle now, new ELC code
588 means that one of other OPEN locks for this file
589 could be cancelled, and since blocking ast handler
590 would attempt to grab och_mutex as well, that would
591 result in a deadlock */
592 mutex_unlock(&lli->lli_och_mutex);
594 * Normally called under two situations:
596 * 2. A race/condition on MDS resulting in no open
597 * handle to be returned from LOOKUP|OPEN request,
598 * for example if the target entry was a symlink.
600 * Always fetch MDS_OPEN_LOCK if this is not setstripe.
602 * Always specify MDS_OPEN_BY_FID because we don't want
603 * to get file with different fid.
605 it->it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID;
606 rc = ll_intent_file_open(file, NULL, 0, it);
608 GOTO(out_openerr, rc);
612 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
614 GOTO(out_och_free, rc = -ENOMEM);
618 /* md_intent_lock() didn't get a request ref if there was an
619 * open error, so don't do cleanup on the request here
621 /* XXX (green): Should not we bail out on any error here, not
622 * just open error? */
623 rc = it_open_error(DISP_OPEN_OPEN, it);
625 GOTO(out_och_free, rc);
627 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
628 "inode %p: disposition %x, status %d\n", inode,
629 it_disposition(it, ~0), it->it_status);
631 rc = ll_local_open(file, it, fd, *och_p);
633 GOTO(out_och_free, rc);
635 mutex_unlock(&lli->lli_och_mutex);
638 /* Must do this outside lli_och_mutex lock to prevent deadlock where
639 different kind of OPEN lock for this same inode gets cancelled
640 by ldlm_cancel_lru */
641 if (!S_ISREG(inode->i_mode))
642 GOTO(out_och_free, rc);
644 cl_lov_delay_create_clear(&file->f_flags);
645 GOTO(out_och_free, rc);
/* Error/cleanup paths below: free unused och, undo statahead
 * authorization, release fd and the extra intent request ref. */
649 if (och_p && *och_p) {
650 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
651 *och_p = NULL; /* OBD_FREE writes some magic there */
654 mutex_unlock(&lli->lli_och_mutex);
657 if (lli->lli_opendir_key == fd)
658 ll_deauthorize_statahead(inode, fd);
660 ll_file_data_put(fd);
662 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
665 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
666 ptlrpc_req_finished(it->it_request);
667 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for lease locks: on a blocking callback, cancel the lease
 * lock asynchronously; the CANCELING branch is visible below but its
 * body is not in this (truncated) view.
 */
673 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
674 struct ldlm_lock_desc *desc, void *data, int flag)
677 struct lustre_handle lockh;
681 case LDLM_CB_BLOCKING:
682 ldlm_lock2handle(lock, &lockh);
683 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
685 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
689 case LDLM_CB_CANCELING:
697 * Acquire a lease and open the file.
699 static struct obd_client_handle *
700 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
703 struct lookup_intent it = { .it_op = IT_OPEN };
704 struct ll_sb_info *sbi = ll_i2sbi(inode);
705 struct md_op_data *op_data;
706 struct ptlrpc_request *req = NULL;
707 struct lustre_handle old_handle = { 0 };
708 struct obd_client_handle *och = NULL;
/* Lease is only meaningful for plain read or plain write mode. */
713 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
714 RETURN(ERR_PTR(-EINVAL));
717 struct ll_inode_info *lli = ll_i2info(inode);
718 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
719 struct obd_client_handle **och_p;
/* Requested mode must be a subset of how the file was opened, and
 * exec opens cannot take a lease. */
722 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
723 RETURN(ERR_PTR(-EPERM));
725 /* Get the openhandle of the file */
727 mutex_lock(&lli->lli_och_mutex);
/* Only one lease per file descriptor. */
728 if (fd->fd_lease_och != NULL) {
729 mutex_unlock(&lli->lli_och_mutex);
733 if (fd->fd_och == NULL) {
734 if (file->f_mode & FMODE_WRITE) {
735 LASSERT(lli->lli_mds_write_och != NULL);
736 och_p = &lli->lli_mds_write_och;
737 och_usecount = &lli->lli_open_fd_write_count;
739 LASSERT(lli->lli_mds_read_och != NULL);
740 och_p = &lli->lli_mds_read_och;
741 och_usecount = &lli->lli_open_fd_read_count;
/* Can only take over the shared handle if we are the sole opener;
 * NOTE(review): takeover lines are not visible in this view. */
743 if (*och_usecount == 1) {
750 mutex_unlock(&lli->lli_och_mutex);
751 if (rc < 0) /* more than 1 opener */
754 LASSERT(fd->fd_och != NULL);
755 old_handle = fd->fd_och->och_fh;
760 RETURN(ERR_PTR(-ENOMEM));
762 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
763 LUSTRE_OPC_ANY, NULL);
765 GOTO(out, rc = PTR_ERR(op_data));
767 /* To tell the MDT this openhandle is from the same owner */
768 op_data->op_handle = old_handle;
770 it.it_flags = fmode | open_flags;
771 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
772 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
773 &ll_md_blocking_lease_ast,
774 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
775 * it can be cancelled which may mislead applications that the lease is
777 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
778 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
779 * doesn't deal with openhandle, so normal openhandle will be leaked. */
780 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
781 ll_finish_md_op_data(op_data);
782 ptlrpc_req_finished(req);
784 GOTO(out_release_it, rc);
786 if (it_disposition(&it, DISP_LOOKUP_NEG))
787 GOTO(out_release_it, rc = -ENOENT);
789 rc = it_open_error(DISP_OPEN_OPEN, &it);
791 GOTO(out_release_it, rc);
793 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
794 ll_och_fill(sbi->ll_md_exp, &it, och);
796 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
797 GOTO(out_close, rc = -EOPNOTSUPP);
799 /* already get lease, handle lease lock */
800 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
801 if (it.it_lock_mode == 0 ||
802 it.it_lock_bits != MDS_INODELOCK_OPEN) {
803 /* open lock must return for lease */
804 CERROR(DFID "lease granted but no open lock, %d/"LPU64".\n",
805 PFID(ll_inode2fid(inode)), it.it_lock_mode,
807 GOTO(out_close, rc = -EPROTO);
810 ll_intent_release(&it);
814 /* Cancel open lock */
815 if (it.it_lock_mode != 0) {
816 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
819 och->och_lease_handle.cookie = 0ULL;
821 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
823 CERROR("%s: error closing file "DFID": %d\n",
824 ll_get_fsname(inode->i_sb, NULL, 0),
825 PFID(&ll_i2info(inode)->lli_fid), rc2);
826 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
828 ll_intent_release(&it);
836 * Check whether a layout swap can be done between two inodes.
838 * \param[in] inode1 First inode to check
839 * \param[in] inode2 Second inode to check
841 * \retval 0 on success, layout swap can be performed between both inodes
842 * \retval negative error code if requirements are not met
844 static int ll_check_swap_layouts_validity(struct inode *inode1,
845 struct inode *inode2)
/* Both must be regular files, writable by the caller, and on the same
 * superblock (same Lustre filesystem). */
847 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
850 if (inode_permission(inode1, MAY_WRITE) ||
851 inode_permission(inode2, MAY_WRITE))
854 if (inode1->i_sb != inode2->i_sb)
/*
 * Close @och with the MDS_CLOSE_LAYOUT_SWAP bias so the MDT atomically
 * swaps the layouts of @inode and @inode2 as part of the close.
 */
860 static int ll_swap_layouts_close(struct obd_client_handle *och,
861 struct inode *inode, struct inode *inode2)
863 const struct lu_fid *fid1 = ll_inode2fid(inode);
864 const struct lu_fid *fid2;
868 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
869 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
871 rc = ll_check_swap_layouts_validity(inode, inode2);
873 GOTO(out_free_och, rc);
875 /* We now know that inode2 is a lustre inode */
876 fid2 = ll_inode2fid(inode2);
/* Swapping a file with itself is meaningless; reject equal FIDs. */
878 rc = lu_fid_cmp(fid1, fid2);
880 GOTO(out_free_och, rc = -EINVAL);
882 /* Close the file and swap layouts between inode & inode2.
883 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
884 * because we still need it to pack l_remote_handle to MDT. */
885 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
888 och = NULL; /* freed in ll_close_inode_openhandle() */
898 * Release lease and close the file.
899 * It will check if the lease has ever broken.
901 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
904 struct ldlm_lock *lock;
905 bool cancelled = true;
909 lock = ldlm_handle2lock(&och->och_lease_handle);
/* Inspect the lock under its resource lock to learn whether the lease
 * was already broken (cancelled) by a conflicting access. */
911 lock_res_and_lock(lock);
912 cancelled = ldlm_is_cancel(lock);
913 unlock_res_and_lock(lock);
917 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
918 PFID(&ll_i2info(inode)->lli_fid), cancelled);
/* If still intact, cancel the lease lock ourselves before closing. */
921 ldlm_cli_cancel(&och->och_lease_handle, 0);
922 if (lease_broken != NULL)
923 *lease_broken = cancelled;
925 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/*
 * Merge MDS-provided inode attributes with OST-side attributes (size,
 * blocks, timestamps): each timestamp keeps the most recent value, and
 * size/blocks are taken from the cl_object attributes.
 */
929 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
931 struct ll_inode_info *lli = ll_i2info(inode);
932 struct cl_object *obj = lli->lli_clob;
933 struct cl_attr *attr = vvp_env_thread_attr(env);
941 ll_inode_size_lock(inode);
943 /* merge timestamps the most recently obtained from mds with
944 timestamps obtained from osts */
945 LTIME_S(inode->i_atime) = lli->lli_atime;
946 LTIME_S(inode->i_mtime) = lli->lli_mtime;
947 LTIME_S(inode->i_ctime) = lli->lli_ctime;
949 atime = LTIME_S(inode->i_atime);
950 mtime = LTIME_S(inode->i_mtime);
951 ctime = LTIME_S(inode->i_ctime);
953 cl_object_attr_lock(obj);
954 rc = cl_object_attr_get(env, obj, attr);
955 cl_object_attr_unlock(obj);
958 GOTO(out_size_unlock, rc);
/* Keep whichever timestamp is newer, MDS copy or OST copy. */
960 if (atime < attr->cat_atime)
961 atime = attr->cat_atime;
963 if (ctime < attr->cat_ctime)
964 ctime = attr->cat_ctime;
966 if (mtime < attr->cat_mtime)
967 mtime = attr->cat_mtime;
969 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
970 PFID(&lli->lli_fid), attr->cat_size);
972 i_size_write(inode, attr->cat_size);
973 inode->i_blocks = attr->cat_blocks;
975 LTIME_S(inode->i_atime) = atime;
976 LTIME_S(inode->i_mtime) = mtime;
977 LTIME_S(inode->i_ctime) = ctime;
980 ll_inode_size_unlock(inode);
/*
 * Return true when atime updates should be suppressed for @file, by the
 * same criteria the VFS uses (O_NOATIME, S_NOATIME, mount flags, etc.).
 */
985 static bool file_is_noatime(const struct file *file)
987 const struct vfsmount *mnt = file->f_path.mnt;
988 const struct inode *inode = file->f_path.dentry->d_inode;
990 /* Adapted from file_accessed() and touch_atime().*/
991 if (file->f_flags & O_NOATIME)
994 if (inode->i_flags & S_NOATIME)
997 if (IS_NOATIME(inode))
1000 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1003 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1006 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/*
 * Initialize a cl_io for a read (@write == 0) or write (@write != 0) on
 * @file: carry over O_NONBLOCK/O_APPEND/O_SYNC/O_DIRECT semantics and
 * choose the DLM locking policy.
 */
1012 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
1014 struct inode *inode = file->f_path.dentry->d_inode;
1016 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1018 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1019 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1020 file->f_flags & O_DIRECT ||
1023 io->ci_obj = ll_i2info(inode)->lli_clob;
1024 io->ci_lockreq = CILR_MAYBE;
/* nolock files skip both client and server DLM locking; O_APPEND
 * requires a mandatory lock to serialize the end-of-file writes. */
1025 if (ll_file_nolock(file)) {
1026 io->ci_lockreq = CILR_NEVER;
1027 io->ci_no_srvlock = 1;
1028 } else if (file->f_flags & O_APPEND) {
1029 io->ci_lockreq = CILR_MANDATORY;
1032 io->ci_noatime = file_is_noatime(file);
/*
 * Common engine for all read/write paths (read_iter, write_iter, splice):
 * set up a cl_io, take the range lock where required, run the IO loop,
 * restart on short IO when needed, and tally stats.
 */
1036 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1037 struct file *file, enum cl_io_type iot,
1038 loff_t *ppos, size_t count)
1040 struct vvp_io *vio = vvp_env_io(env);
1041 struct inode *inode = file->f_path.dentry->d_inode;
1042 struct ll_inode_info *lli = ll_i2info(inode);
1043 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1047 struct range_lock range;
1051 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zu\n",
1052 file->f_path.dentry->d_name.name, iot, *ppos, count);
1055 io = vvp_env_thread_io(env);
1056 ll_io_init(io, file, iot == CIT_WRITE);
1058 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1059 bool range_locked = false;
/* O_APPEND writes land at EOF, so lock the whole file range. */
1061 if (file->f_flags & O_APPEND)
1062 range_lock_init(&range, 0, LUSTRE_EOF);
1064 range_lock_init(&range, *ppos, *ppos + count - 1);
1066 vio->vui_fd = LUSTRE_FPRIVATE(file);
1067 vio->vui_io_subtype = args->via_io_subtype;
1069 switch (vio->vui_io_subtype) {
1071 vio->vui_iter = args->u.normal.via_iter;
1072 vio->vui_iocb = args->u.normal.via_iocb;
1073 /* Direct IO reads must also take range lock,
1074 * or multiple reads will try to work on the same pages
1075 * See LU-6227 for details. */
1076 if (((iot == CIT_WRITE) ||
1077 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1078 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1079 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1081 rc = range_lock(&lli->lli_write_tree, &range);
1085 range_locked = true;
1089 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1090 vio->u.splice.vui_flags = args->u.splice.via_flags;
1093 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
/* Publish this (env, io) pair so lower layers can find it, then run
 * the actual IO state machine. */
1097 ll_cl_add(file, env, io);
1098 rc = cl_io_loop(env, io);
1099 ll_cl_remove(file, env);
1102 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1104 range_unlock(&lli->lli_write_tree, &range);
1107 /* cl_io_rw_init() handled IO */
/* Accumulate bytes moved and advance the caller's position. */
1111 if (io->ci_nob > 0) {
1112 result += io->ci_nob;
1113 count -= io->ci_nob;
1114 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1116 /* prepare IO restart */
1117 if (count > 0 && args->via_io_subtype == IO_NORMAL)
1118 args->u.normal.via_iter = vio->vui_iter;
1122 cl_io_fini(env, io);
/* ci_need_restart (e.g. layout change mid-IO) -> loop again with the
 * remaining count; the loop construct itself is not visible in this
 * truncated view. */
1124 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1126 "%s: restart %s from %lld, count:%zu, result: %zd\n",
1127 file->f_path.dentry->d_name.name,
1128 iot == CIT_READ ? "read" : "write",
1129 *ppos, count, result);
1133 if (iot == CIT_READ) {
1135 ll_stats_ops_tally(ll_i2sbi(inode),
1136 LPROC_LL_READ_BYTES, result);
1137 } else if (iot == CIT_WRITE) {
1139 ll_stats_ops_tally(ll_i2sbi(inode),
1140 LPROC_LL_WRITE_BYTES, result);
1141 fd->fd_write_failed = false;
1142 } else if (result == 0 && rc == 0) {
1145 fd->fd_write_failed = true;
1147 fd->fd_write_failed = false;
/* -ERESTARTSYS is a benign interruption, not a write failure. */
1148 } else if (rc != -ERESTARTSYS) {
1149 fd->fd_write_failed = true;
1153 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1155 return result > 0 ? result : rc;
1159 * Read from a file (through the page cache).
1161 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1163 struct vvp_io_args *args;
/* Grab a cl_env for this thread, package the iter/iocb into vvp args
 * and delegate to the common IO engine. */
1168 env = cl_env_get(&refcheck);
1170 return PTR_ERR(env);
1172 args = ll_env_args(env, IO_NORMAL);
1173 args->u.normal.via_iter = to;
1174 args->u.normal.via_iocb = iocb;
1176 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1177 &iocb->ki_pos, iov_iter_count(to));
1178 cl_env_put(env, &refcheck);
1183 * Write to a file (through the page cache).
1185 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1187 struct vvp_io_args *args;
/* Mirror of ll_file_read_iter() with CIT_WRITE. */
1192 env = cl_env_get(&refcheck);
1194 return PTR_ERR(env);
1196 args = ll_env_args(env, IO_NORMAL);
1197 args->u.normal.via_iter = from;
1198 args->u.normal.via_iocb = iocb;
1200 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1201 &iocb->ki_pos, iov_iter_count(from));
1202 cl_env_put(env, &refcheck);
1206 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1208 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1210 static int ll_file_get_iov_count(const struct iovec *iov,
1211 unsigned long *nr_segs, size_t *count)
/* Validate the iovec array and total its length into *count, trimming
 * *nr_segs at the first inaccessible segment (see kernel original). */
1216 for (seg = 0; seg < *nr_segs; seg++) {
1217 const struct iovec *iv = &iov[seg];
1220 * If any segment has a negative length, or the cumulative
1221 * length ever wraps negative then return -EINVAL.
1224 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1226 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1231 cnt -= iv->iov_len; /* This segment is no good */
/*
 * Legacy aio_read entry (pre read_iter kernels): build an iov_iter from
 * the iovec array and forward to ll_file_read_iter().
 */
1238 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1239 unsigned long nr_segs, loff_t pos)
1246 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
/* iov_iter_init() signature changed across kernels; pick per config. */
1250 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1251 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1252 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1253 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1254 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1256 result = ll_file_read_iter(iocb, &to);
/*
 * Legacy synchronous read entry: wrap the user buffer in a one-segment
 * iovec and a sync kiocb, then forward to ll_file_aio_read().
 */
1261 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1264 struct iovec iov = { .iov_base = buf, .iov_len = count };
1265 struct kiocb *kiocb;
1269 OBD_ALLOC_PTR(kiocb);
1273 init_sync_kiocb(kiocb, file);
1274 kiocb->ki_pos = *ppos;
/* Field holding the remaining byte count was renamed across kernels. */
1275 #ifdef HAVE_KIOCB_KI_LEFT
1276 kiocb->ki_left = count;
1277 #elif defined(HAVE_KI_NBYTES)
1278 kiocb->ki_nbytes = count;
1281 result = ll_file_aio_read(kiocb, &iov, 1, kiocb->ki_pos);
1282 *ppos = kiocb->ki_pos;
1284 OBD_FREE_PTR(kiocb);
1289 * Write to a file (through the page cache).
/*
 * aio_write compatibility shim, mirror of ll_file_aio_read(): validate the
 * iovec array, build an iov_iter and forward to ll_file_write_iter().
 */
1292 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1293 unsigned long nr_segs, loff_t pos)
1295 struct iov_iter from;
1300 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1304 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1305 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1306 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1307 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1308 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1310 result = ll_file_write_iter(iocb, &from);
/*
 * Synchronous write(2) path for old kernels.  Unlike ll_file_read() the
 * kiocb lives in the per-thread cl_env info (lti_kiocb) instead of a heap
 * allocation, so a cl_env reference is taken for the duration.
 */
1315 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1316 size_t count, loff_t *ppos)
/* Cast away __user const-ness: iovec carries a non-const base pointer. */
1319 struct iovec iov = { .iov_base = (void __user *)buf,
1321 struct kiocb *kiocb;
1326 env = cl_env_get(&refcheck);
1328 RETURN(PTR_ERR(env));
1330 kiocb = &ll_env_info(env)->lti_kiocb;
1331 init_sync_kiocb(kiocb, file);
1332 kiocb->ki_pos = *ppos;
1333 #ifdef HAVE_KIOCB_KI_LEFT
1334 kiocb->ki_left = count;
1335 #elif defined(HAVE_KI_NBYTES)
1336 kiocb->ki_nbytes = count;
1339 result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos);
1340 *ppos = kiocb->ki_pos;
1342 cl_env_put(env, &refcheck);
1345 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1348 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read entry point: package the pipe and flags into IO_SPLICE args
 * and run the generic read path (CIT_READ) through ll_file_io_generic().
 */
1350 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1351 struct pipe_inode_info *pipe, size_t count,
1355 struct vvp_io_args *args;
1360 env = cl_env_get(&refcheck);
1362 RETURN(PTR_ERR(env));
1364 args = ll_env_args(env, IO_SPLICE);
1365 args->u.splice.via_pipe = pipe;
1366 args->u.splice.via_flags = flags;
1368 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1369 cl_env_put(env, &refcheck);
/*
 * Apply striping EA (lov_user_md) to a file by re-opening it with an
 * intent carrying the layout, under the inode size lock.  The open handle
 * obtained for this purpose is released immediately; the O_LOV_DELAY_CREATE
 * flag is cleared on the way out.
 * NOTE(review): error paths between these lines are elided in this view.
 */
1373 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1374 __u64 flags, struct lov_user_md *lum,
1377 struct lookup_intent oit = {
/* MDS_OPEN_BY_FID: target the file by FID, not by name lookup. */
1379 .it_flags = flags | MDS_OPEN_BY_FID,
1384 ll_inode_size_lock(inode);
1385 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1387 GOTO(out_unlock, rc);
1389 ll_release_openhandle(file->f_path.dentry, &oit);
1392 ll_inode_size_unlock(inode);
1393 ll_intent_release(&oit);
1394 cl_lov_delay_create_clear(&file->f_flags);
/*
 * Fetch the LOV EA (striping metadata) for @filename from the MDS via
 * md_getattr_name().  On success *lmmp/*lmm_size describe the EA inside
 * the reply buffer and *request holds the reply (caller releases it).
 * The EA arrives in little-endian wire format and is swabbed to host
 * order before being handed to userspace.
 */
1399 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1400 struct lov_mds_md **lmmp, int *lmm_size,
1401 struct ptlrpc_request **request)
1403 struct ll_sb_info *sbi = ll_i2sbi(inode);
1404 struct mdt_body *body;
1405 struct lov_mds_md *lmm = NULL;
1406 struct ptlrpc_request *req = NULL;
1407 struct md_op_data *op_data;
/* Size the getattr reply buffer from the MDS default EA size. */
1410 rc = ll_get_default_mdsize(sbi, &lmmsize);
1414 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1415 strlen(filename), lmmsize,
1416 LUSTRE_OPC_ANY, NULL);
1417 if (IS_ERR(op_data))
1418 RETURN(PTR_ERR(op_data));
1420 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1421 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1422 ll_finish_md_op_data(op_data);
1424 CDEBUG(D_INFO, "md_getattr_name failed "
1425 "on %s: rc %d\n", filename, rc);
1429 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1430 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1432 lmmsize = body->mbo_eadatasize;
/* No striping EA on this object (or a zero-size one): nothing to return. */
1434 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1436 GOTO(out, rc = -ENODATA);
1439 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1440 LASSERT(lmm != NULL);
/* Only plain V1/V3 LOV layouts are understood here. */
1442 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1443 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1444 GOTO(out, rc = -EPROTO);
1448 * This is coming from the MDS, so is probably in
1449 * little endian. We convert it to host endian before
1450 * passing it to userspace.
/* Swab only on big-endian hosts, where host order != wire (LE) order. */
1452 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1455 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1456 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1459 /* if function called for directory - we should
1460 * avoid swab not existent lsm objects */
1461 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1462 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
/* Per-object entries exist only for regular files, not directories. */
1463 if (S_ISREG(body->mbo_mode))
1464 lustre_swab_lov_user_md_objects(
1465 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1467 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1468 lustre_swab_lov_user_md_v3(
1469 (struct lov_user_md_v3 *)lmm);
1470 if (S_ISREG(body->mbo_mode))
1471 lustre_swab_lov_user_md_objects(
1472 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1479 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA: privileged (CAP_SYS_ADMIN) path to set a raw striping
 * EA including pre-existing objects (MDS_OPEN_HAS_OBJS).  Copies the
 * lov_user_md from userspace into a kernel buffer and delegates to
 * ll_lov_setstripe_ea_info().
 */
1484 static int ll_lov_setea(struct inode *inode, struct file *file,
1487 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1488 struct lov_user_md *lump;
/* Fixed-size buffer: header plus a single OST object entry. */
1489 int lum_size = sizeof(struct lov_user_md) +
1490 sizeof(struct lov_user_ost_data);
1494 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1497 OBD_ALLOC_LARGE(lump, lum_size);
1501 if (copy_from_user(lump, (struct lov_user_md __user *)arg, lum_size))
1502 GOTO(out_lump, rc = -EFAULT);
1504 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1507 OBD_FREE_LARGE(lump, lum_size);
/*
 * Copy the file's current striping information to the userspace buffer
 * @lum via cl_object_getstripe() on the cl_object of the inode.
 */
1511 static int ll_file_getstripe(struct inode *inode,
1512 struct lov_user_md __user *lum)
1519 env = cl_env_get(&refcheck);
1521 RETURN(PTR_ERR(env));
1523 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum);
1524 cl_env_put(env, &refcheck);
/*
 * LL_IOC_LOV_SETSTRIPE: copy the user's lov_user_md into the kernel,
 * apply it through ll_lov_setstripe_ea_info(), then refresh the layout
 * and write the resulting striping back to the user buffer.
 * NOTE(review): the success/error branching around these calls is elided
 * from this view.
 */
1528 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1531 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1532 struct lov_user_md *klum;
1534 __u64 flags = FMODE_WRITE;
1537 rc = ll_copy_user_md(lum, &klum);
1542 rc = ll_lov_setstripe_ea_info(inode, file, flags, klum, lum_size);
/* Zero the user's stripe_count so getstripe below fills real values. */
1546 put_user(0, &lum->lmm_stripe_count);
1548 ll_layout_refresh(inode, &gen);
1549 rc = ll_file_getstripe(inode, (struct lov_user_md __user *)arg);
1552 OBD_FREE(klum, lum_size);
/*
 * LL_IOC_GROUP_LOCK: take a Lustre group lock with group id @arg on this
 * file descriptor.  The lli_lock spinlock guards fd_flags/fd_grouplock;
 * it is dropped across the (blocking) cl_get_grouplock() call and the
 * "already locked" race is re-checked afterwards.
 */
1557 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1559 struct ll_inode_info *lli = ll_i2info(inode);
1560 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1561 struct ll_grouplock grouplock;
/* gid 0 is reserved as "no group lock". */
1566 CWARN("group id for group lock must not be 0\n");
1570 if (ll_file_nolock(file))
1571 RETURN(-EOPNOTSUPP);
1573 spin_lock(&lli->lli_lock);
1574 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1575 CWARN("group lock already existed with gid %lu\n",
1576 fd->fd_grouplock.lg_gid);
1577 spin_unlock(&lli->lli_lock);
1580 LASSERT(fd->fd_grouplock.lg_lock == NULL);
/* Drop the spinlock: cl_get_grouplock() may block (O_NONBLOCK aside). */
1581 spin_unlock(&lli->lli_lock);
1583 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
1584 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* Re-check under the lock: another thread may have won meanwhile. */
1588 spin_lock(&lli->lli_lock);
1589 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1590 spin_unlock(&lli->lli_lock);
1591 CERROR("another thread just won the race\n");
1592 cl_put_grouplock(&grouplock);
1596 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1597 fd->fd_grouplock = grouplock;
1598 spin_unlock(&lli->lli_lock);
1600 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK: release the group lock held on this fd, verifying
 * that a lock is actually held and that its gid matches @arg.  State is
 * cleared under lli_lock; the actual lock release happens after the
 * spinlock is dropped.
 */
1604 static int ll_put_grouplock(struct inode *inode, struct file *file,
1607 struct ll_inode_info *lli = ll_i2info(inode);
1608 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1609 struct ll_grouplock grouplock;
1612 spin_lock(&lli->lli_lock);
1613 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1614 spin_unlock(&lli->lli_lock);
1615 CWARN("no group lock held\n");
1619 LASSERT(fd->fd_grouplock.lg_lock != NULL);
1621 if (fd->fd_grouplock.lg_gid != arg) {
1622 CWARN("group lock %lu doesn't match current id %lu\n",
1623 arg, fd->fd_grouplock.lg_gid);
1624 spin_unlock(&lli->lli_lock);
/* Take a local copy so the release can run outside the spinlock. */
1628 grouplock = fd->fd_grouplock;
1629 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1630 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1631 spin_unlock(&lli->lli_lock);
1633 cl_put_grouplock(&grouplock);
1634 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1639 * Close inode open handle
1641 * \param dentry [in] dentry which contains the inode
1642 * \param it [in,out] intent which contains open info and result
/* \retval 0 success (implied by the retval <0 line below). */
1645 * \retval <0 failure
1647 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1649 struct inode *inode = dentry->d_inode;
1650 struct obd_client_handle *och;
1656 /* Root ? Do nothing. */
1657 if (dentry->d_inode->i_sb->s_root == dentry)
1660 /* No open handle to close? Move away */
1661 if (!it_disposition(it, DISP_OPEN_OPEN))
1664 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1666 OBD_ALLOC(och, sizeof(*och));
1668 GOTO(out, rc = -ENOMEM);
/* Fill the client handle from the intent, then close it on the MDT. */
1670 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1672 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
1674 /* this one is in place of ll_file_open */
1675 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1676 ptlrpc_req_finished(it->it_request);
1677 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1683 * Get size for inode for which FIEMAP mapping is requested.
1684 * Make the FIEMAP get_info call and returns the result.
1685 * \param fiemap kernel buffer to hold extens
1686 * \param num_bytes kernel buffer size
1688 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
1694 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
1697 /* Checks for fiemap flags */
/* Report the unsupported flags back to the caller, per FIEMAP contract. */
1698 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1699 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1703 /* Check for FIEMAP_FLAG_SYNC */
1704 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1705 rc = filemap_fdatawrite(inode->i_mapping);
1710 env = cl_env_get(&refcheck);
1712 RETURN(PTR_ERR(env));
/* A zero cached size may just be stale: glimpse the OSTs to refresh it. */
1714 if (i_size_read(inode) == 0) {
1715 rc = ll_glimpse_size(inode);
1720 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1721 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
1722 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
1724 /* If filesize is 0, then there would be no objects for mapping */
1725 if (fmkey.lfik_oa.o_size == 0) {
1726 fiemap->fm_mapped_extents = 0;
1730 fmkey.lfik_fiemap = *fiemap;
1732 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
1733 &fmkey, fiemap, &num_bytes);
1735 cl_env_put(env, &refcheck);
/*
 * OBD_IOC_FID2PATH: resolve a FID to a pathname via the MDC.  The output
 * buffer is sized from the user-supplied gf_pathlen (capped at PATH_MAX)
 * and the whole getinfo_fid2path structure round-trips through
 * obd_iocontrol().  Access requires CAP_DAC_READ_SEARCH unless the
 * user_fid2path mount flag is set.
 */
1739 int ll_fid2path(struct inode *inode, void __user *arg)
1741 struct obd_export *exp = ll_i2mdexp(inode);
1742 const struct getinfo_fid2path __user *gfin = arg;
1744 struct getinfo_fid2path *gfout;
1750 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1751 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1754 /* Only need to get the buflen */
1755 if (get_user(pathlen, &gfin->gf_pathlen))
1758 if (pathlen > PATH_MAX)
1761 outsize = sizeof(*gfout) + pathlen;
1762 OBD_ALLOC(gfout, outsize);
1766 if (copy_from_user(gfout, arg, sizeof(*gfout)))
1767 GOTO(gf_free, rc = -EFAULT);
1768 /* append root FID after gfout to let MDT know the root FID so that it
1769 * can lookup the correct path, this is mainly for fileset.
1770 * old server without fileset mount support will ignore this. */
1771 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
1773 /* Call mdc_iocontrol */
1774 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1778 if (copy_to_user(arg, gfout, outsize))
1782 OBD_FREE(gfout, outsize);
1787 * Read the data_version for inode.
1789 * This value is computed using stripe object version on OST.
1790 * Version is computed using server side locking.
1792 * @param flags if do sync on the OST side;
1794 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1795 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
1797 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
1799 struct cl_object *obj = ll_i2info(inode)->lli_clob;
1807 /* If no file object initialized, we consider its version is 0. */
1813 env = cl_env_get(&refcheck);
1815 RETURN(PTR_ERR(env));
/* Run a CIT_DATA_VERSION cl_io to collect the version from the OSTs. */
1817 io = vvp_env_thread_io(env);
1819 io->u.ci_data_version.dv_data_version = 0;
1820 io->u.ci_data_version.dv_flags = flags;
1823 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
1824 result = cl_io_loop(env, io);
1826 result = io->ci_result;
1828 *data_version = io->u.ci_data_version.dv_data_version;
1830 cl_io_fini(env, io);
/* Layout may have changed mid-IO; the restart loop is elided here. */
1832 if (unlikely(io->ci_need_restart))
1835 cl_env_put(env, &refcheck);
1841 * Trigger a HSM release request for the provided inode.
1843 int ll_hsm_release(struct inode *inode)
1846 struct obd_client_handle *och = NULL;
1847 __u64 data_version = 0;
1852 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1853 ll_get_fsname(inode->i_sb, NULL, 0),
1854 PFID(&ll_i2info(inode)->lli_fid));
/* Take a write lease so no other client can modify during release. */
1856 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
1858 GOTO(out, rc = PTR_ERR(och));
1860 /* Grab latest data_version and [am]time values */
1861 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
1865 env = cl_env_get(&refcheck);
1867 GOTO(out, rc = PTR_ERR(env));
1869 ll_merge_attr(env, inode);
1870 cl_env_put(env, &refcheck);
1872 /* Release the file.
1873 * NB: lease lock handle is released in mdc_hsm_release_pack() because
1874 * we still need it to pack l_remote_handle to MDT. */
1875 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
1881 if (och != NULL && !IS_ERR(och)) /* close the file */
1882 ll_lease_close(och, inode, NULL);
/*
 * Working state for ll_swap_layouts(): the two inodes being swapped.
 * NOTE(review): additional members (data versions / check flags, judging
 * by usage below) are elided from this view.
 */
1887 struct ll_swap_stack {
1890 struct inode *inode1;
1891 struct inode *inode2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS worker: atomically swap the layouts of the two
 * files on the MDT.  Inodes are ordered by FID to serialize concurrent
 * swaps, an optional group lock flushes dirty cache, and optional
 * data-version checks abort with -EAGAIN if either file changed.
 */
1896 static int ll_swap_layouts(struct file *file1, struct file *file2,
1897 struct lustre_swap_layouts *lsl)
1899 struct mdc_swap_layouts msl;
1900 struct md_op_data *op_data;
1903 struct ll_swap_stack *llss = NULL;
1906 OBD_ALLOC_PTR(llss);
1910 llss->inode1 = file1->f_path.dentry->d_inode;
1911 llss->inode2 = file2->f_path.dentry->d_inode;
1913 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
1917 /* we use 2 bool because it is easier to swap than 2 bits */
1918 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1919 llss->check_dv1 = true;
1921 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1922 llss->check_dv2 = true;
1924 /* we cannot use lsl->sl_dvX directly because we may swap them */
1925 llss->dv1 = lsl->sl_dv1;
1926 llss->dv2 = lsl->sl_dv2;
1928 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1929 if (rc == 0) /* same file, done! */
/* Canonical FID order avoids ABBA deadlocks between concurrent swaps. */
1932 if (rc < 0) { /* sequentialize it */
1933 swap(llss->inode1, llss->inode2);
1935 swap(llss->dv1, llss->dv2);
1936 swap(llss->check_dv1, llss->check_dv2);
1940 if (gid != 0) { /* application asks to flush dirty cache */
1941 rc = ll_get_grouplock(llss->inode1, file1, gid);
1945 rc = ll_get_grouplock(llss->inode2, file2, gid);
1947 ll_put_grouplock(llss->inode1, file1, gid);
1952 /* ultimate check, before swaping the layouts we check if
1953 * dataversion has changed (if requested) */
1954 if (llss->check_dv1) {
1955 rc = ll_data_version(llss->inode1, &dv, 0);
1958 if (dv != llss->dv1)
1959 GOTO(putgl, rc = -EAGAIN);
1962 if (llss->check_dv2) {
1963 rc = ll_data_version(llss->inode2, &dv, 0);
1966 if (dv != llss->dv2)
1967 GOTO(putgl, rc = -EAGAIN);
1970 /* struct md_op_data is used to send the swap args to the mdt
1971 * only flags is missing, so we use struct mdc_swap_layouts
1972 * through the md_op_data->op_data */
1973 /* flags from user space have to be converted before they are send to
1974 * server, no flag is sent today, they are only used on the client */
1977 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
1978 0, LUSTRE_OPC_ANY, &msl);
1979 if (IS_ERR(op_data))
1980 GOTO(free, rc = PTR_ERR(op_data));
1982 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
1983 sizeof(*op_data), op_data, NULL);
1984 ll_finish_md_op_data(op_data);
/* putgl: drop group locks in reverse acquisition order. */
1991 ll_put_grouplock(llss->inode2, file2, gid);
1992 ll_put_grouplock(llss->inode1, file1, gid);
/*
 * Set/clear HSM state flags for @inode on the MDT.  Validates the masks
 * (in-range, non-root users restricted to HSM_USER_MASK) and the archive
 * id before sending LL_IOC_HSM_STATE_SET through obd_iocontrol().
 */
2002 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2004 struct md_op_data *op_data;
2008 /* Detect out-of range masks */
2009 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2012 /* Non-root users are forbidden to set or clear flags which are
2013 * NOT defined in HSM_USER_MASK. */
2014 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2015 !cfs_capable(CFS_CAP_SYS_ADMIN))
2018 /* Detect out-of range archive id */
2019 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2020 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2023 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2024 LUSTRE_OPC_ANY, hss);
2025 if (IS_ERR(op_data))
2026 RETURN(PTR_ERR(op_data));
2028 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2029 sizeof(*op_data), op_data, NULL);
2031 ll_finish_md_op_data(op_data);
/*
 * LL_IOC_HSM_IMPORT worker: mark a regular file as HSM-archived/released
 * (via ll_hsm_state_set) and then force its attributes (mode, owner,
 * size, [am]times) to the values recorded in @hui, under i_mutex.
 */
2036 static int ll_hsm_import(struct inode *inode, struct file *file,
2037 struct hsm_user_import *hui)
2039 struct hsm_state_set *hss = NULL;
2040 struct iattr *attr = NULL;
2044 if (!S_ISREG(inode->i_mode))
2050 GOTO(out, rc = -ENOMEM);
2052 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2053 hss->hss_archive_id = hui->hui_archive_id;
2054 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2055 rc = ll_hsm_state_set(inode, hss);
2059 OBD_ALLOC_PTR(attr);
2061 GOTO(out, rc = -ENOMEM);
/* Force S_IFREG; only permission bits are taken from hui_mode. */
2063 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2064 attr->ia_mode |= S_IFREG;
2065 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2066 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2067 attr->ia_size = hui->hui_size;
2068 attr->ia_mtime.tv_sec = hui->hui_mtime;
2069 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2070 attr->ia_atime.tv_sec = hui->hui_atime;
2071 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
/* ATTR_FORCE: bypass permission checks for this import-time setattr. */
2073 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2074 ATTR_UID | ATTR_GID |
2075 ATTR_MTIME | ATTR_MTIME_SET |
2076 ATTR_ATIME | ATTR_ATIME_SET;
2078 mutex_lock(&inode->i_mutex);
2080 rc = ll_setattr_raw(file->f_path.dentry, attr, true);
2084 mutex_unlock(&inode->i_mutex);
/*
 * Translate a kernel fmode_t into the Lustre lease-type bitmask
 * (LL_LEASE_RDLCK / LL_LEASE_WRLCK), one bit per access mode.
 */
2096 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2098 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2099 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * LL_IOC_FUTIMES_3 worker: set atime, mtime AND ctime of a regular file
 * from the ll_futimes_3 payload (unlike utimes(2), ctime is settable).
 * Requires CAP_SYS_ADMIN; performed via ll_setattr_raw() under i_mutex.
 */
2102 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2104 struct inode *inode = file->f_path.dentry->d_inode;
2106 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2107 ATTR_MTIME | ATTR_MTIME_SET |
2108 ATTR_CTIME | ATTR_CTIME_SET,
2110 .tv_sec = lfu->lfu_atime_sec,
2111 .tv_nsec = lfu->lfu_atime_nsec,
2114 .tv_sec = lfu->lfu_mtime_sec,
2115 .tv_nsec = lfu->lfu_mtime_nsec,
2118 .tv_sec = lfu->lfu_ctime_sec,
2119 .tv_nsec = lfu->lfu_ctime_nsec,
2125 if (!capable(CAP_SYS_ADMIN))
2128 if (!S_ISREG(inode->i_mode))
2131 mutex_lock(&inode->i_mutex);
2132 rc = ll_setattr_raw(file->f_path.dentry, &ia, false);
2133 mutex_unlock(&inode->i_mutex);
2139 * Give file access advices
2141 * The ladvise interface is similar to Linux fadvise() system call, except it
2142 * forwards the advices directly from Lustre client to server. The server side
2143 * codes will apply appropriate read-ahead and caching techniques for the
2144 * corresponding files.
2146 * A typical workload for ladvise is e.g. a bunch of different clients are
2147 * doing small random reads of a file, so prefetching pages into OSS cache
2148 * with big linear reads before the random IO is a net benefit. Fetching
2149 * all that data into each client cache with fadvise() may not be, due to
2150 * much more data being sent to the client.
2152 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2153 struct lu_ladvise *ladvise)
2157 struct cl_ladvise_io *lio;
2162 env = cl_env_get(&refcheck);
2164 RETURN(PTR_ERR(env));
/* Run one advice as a CIT_LADVISE cl_io against the file's cl_object. */
2166 io = vvp_env_thread_io(env);
2167 io->ci_obj = ll_i2info(inode)->lli_clob;
2169 /* initialize parameters for ladvise */
2170 lio = &io->u.ci_ladvise;
2171 lio->li_start = ladvise->lla_start;
2172 lio->li_end = ladvise->lla_end;
2173 lio->li_fid = ll_inode2fid(inode);
2174 lio->li_advice = ladvise->lla_advice;
2175 lio->li_flags = flags;
2177 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2178 rc = cl_io_loop(env, io);
2182 cl_io_fini(env, io);
2183 cl_env_put(env, &refcheck);
/*
 * Main ioctl dispatcher for regular files.  Handles Lustre-specific
 * commands (striping, group locks, HSM, leases, ladvise, fid2path, ...)
 * and falls through to ll_iocontrol_call()/obd_iocontrol() for anything
 * not recognized here.  NOTE(review): many RETURN statements, local
 * declarations and error branches are elided from this view; each case is
 * documented only as far as the visible code supports.
 */
2188 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2190 struct inode *inode = file->f_path.dentry->d_inode;
2191 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2195 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2196 PFID(ll_inode2fid(inode)), inode, cmd);
2197 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2199 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2200 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2204 case LL_IOC_GETFLAGS:
2205 /* Get the current value of the file flags */
2206 return put_user(fd->fd_flags, (int __user *)arg);
2207 case LL_IOC_SETFLAGS:
2208 case LL_IOC_CLRFLAGS:
2209 /* Set or clear specific file flags */
2210 /* XXX This probably needs checks to ensure the flags are
2211 * not abused, and to handle any flag side effects.
2213 if (get_user(flags, (int __user *) arg))
2216 if (cmd == LL_IOC_SETFLAGS) {
/* LL_FILE_IGNORE_LOCK is only sane together with O_DIRECT. */
2217 if ((flags & LL_FILE_IGNORE_LOCK) &&
2218 !(file->f_flags & O_DIRECT)) {
2219 CERROR("%s: unable to disable locking on "
2220 "non-O_DIRECT file\n", current->comm);
2224 fd->fd_flags |= flags;
2226 fd->fd_flags &= ~flags;
2229 case LL_IOC_LOV_SETSTRIPE:
2230 RETURN(ll_lov_setstripe(inode, file, arg));
2231 case LL_IOC_LOV_SETEA:
2232 RETURN(ll_lov_setea(inode, file, arg));
2233 case LL_IOC_LOV_SWAP_LAYOUTS: {
2235 struct lustre_swap_layouts lsl;
2237 if (copy_from_user(&lsl, (char __user *)arg,
2238 sizeof(struct lustre_swap_layouts)))
/* Both files must be writable to have their layouts swapped. */
2241 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
2244 file2 = fget(lsl.sl_fd);
2248 /* O_WRONLY or O_RDWR */
2249 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
2250 GOTO(out, rc = -EPERM);
2252 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
2253 struct inode *inode2;
2254 struct ll_inode_info *lli;
2255 struct obd_client_handle *och = NULL;
/* SWAP_LAYOUTS_CLOSE must be the only flag in this mode. */
2257 if (lsl.sl_flags != SWAP_LAYOUTS_CLOSE)
2258 GOTO(out, rc = -EINVAL);
2260 lli = ll_i2info(inode);
2261 mutex_lock(&lli->lli_och_mutex);
2262 if (fd->fd_lease_och != NULL) {
2263 och = fd->fd_lease_och;
2264 fd->fd_lease_och = NULL;
2266 mutex_unlock(&lli->lli_och_mutex);
2268 GOTO(out, rc = -ENOLCK);
2269 inode2 = file2->f_path.dentry->d_inode;
2270 rc = ll_swap_layouts_close(och, inode, inode2);
2272 rc = ll_swap_layouts(file, file2, &lsl);
2278 case LL_IOC_LOV_GETSTRIPE:
2279 RETURN(ll_file_getstripe(inode,
2280 (struct lov_user_md __user *)arg));
2281 case FSFILT_IOC_GETFLAGS:
2282 case FSFILT_IOC_SETFLAGS:
2283 RETURN(ll_iocontrol(inode, file, cmd, arg));
2284 case FSFILT_IOC_GETVERSION_OLD:
2285 case FSFILT_IOC_GETVERSION:
2286 RETURN(put_user(inode->i_generation, (int __user *)arg));
2287 case LL_IOC_GROUP_LOCK:
2288 RETURN(ll_get_grouplock(inode, file, arg));
2289 case LL_IOC_GROUP_UNLOCK:
2290 RETURN(ll_put_grouplock(inode, file, arg));
2291 case IOC_OBD_STATFS:
2292 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2294 /* We need to special case any other ioctls we want to handle,
2295 * to send them to the MDS/OST as appropriate and to properly
2296 * network encode the arg field.
2297 case FSFILT_IOC_SETVERSION_OLD:
2298 case FSFILT_IOC_SETVERSION:
2300 case LL_IOC_FLUSHCTX:
2301 RETURN(ll_flush_ctx(inode));
2302 case LL_IOC_PATH2FID: {
2303 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2304 sizeof(struct lu_fid)))
2309 case LL_IOC_GETPARENT:
2310 RETURN(ll_getparent(file, (struct getparent __user *)arg));
2312 case OBD_IOC_FID2PATH:
2313 RETURN(ll_fid2path(inode, (void __user *)arg));
2314 case LL_IOC_DATA_VERSION: {
2315 struct ioc_data_version idv;
2318 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
/* Only the two flush flags are meaningful from userspace. */
2321 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2322 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2325 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2331 case LL_IOC_GET_MDTIDX: {
2334 mdtidx = ll_get_mdt_idx(inode);
2338 if (put_user((int)mdtidx, (int __user *)arg))
2343 case OBD_IOC_GETDTNAME:
2344 case OBD_IOC_GETMDNAME:
2345 RETURN(ll_get_obd_name(inode, cmd, arg));
2346 case LL_IOC_HSM_STATE_GET: {
2347 struct md_op_data *op_data;
2348 struct hsm_user_state *hus;
2355 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2356 LUSTRE_OPC_ANY, hus);
2357 if (IS_ERR(op_data)) {
2359 RETURN(PTR_ERR(op_data));
2362 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2365 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2368 ll_finish_md_op_data(op_data);
2372 case LL_IOC_HSM_STATE_SET: {
2373 struct hsm_state_set *hss;
2380 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2385 rc = ll_hsm_state_set(inode, hss);
2390 case LL_IOC_HSM_ACTION: {
2391 struct md_op_data *op_data;
2392 struct hsm_current_action *hca;
2399 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2400 LUSTRE_OPC_ANY, hca);
2401 if (IS_ERR(op_data)) {
2403 RETURN(PTR_ERR(op_data));
2406 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2409 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2412 ll_finish_md_op_data(op_data);
2416 case LL_IOC_SET_LEASE: {
2417 struct ll_inode_info *lli = ll_i2info(inode);
2418 struct obd_client_handle *och = NULL;
/* Requested lease mode must be compatible with the open mode. */
2423 case LL_LEASE_WRLCK:
2424 if (!(file->f_mode & FMODE_WRITE))
2426 fmode = FMODE_WRITE;
2428 case LL_LEASE_RDLCK:
2429 if (!(file->f_mode & FMODE_READ))
2433 case LL_LEASE_UNLCK:
2434 mutex_lock(&lli->lli_och_mutex);
2435 if (fd->fd_lease_och != NULL) {
2436 och = fd->fd_lease_och;
2437 fd->fd_lease_och = NULL;
2439 mutex_unlock(&lli->lli_och_mutex);
2444 fmode = och->och_flags;
2445 rc = ll_lease_close(och, inode, &lease_broken);
2452 RETURN(ll_lease_type_from_fmode(fmode));
2457 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2459 /* apply for lease */
2460 och = ll_lease_open(inode, file, fmode, 0);
2462 RETURN(PTR_ERR(och));
/* Stash the lease handle on the fd unless one already exists. */
2465 mutex_lock(&lli->lli_och_mutex);
2466 if (fd->fd_lease_och == NULL) {
2467 fd->fd_lease_och = och;
2470 mutex_unlock(&lli->lli_och_mutex);
2472 /* impossible now that only excl is supported for now */
2473 ll_lease_close(och, inode, &lease_broken);
2478 case LL_IOC_GET_LEASE: {
2479 struct ll_inode_info *lli = ll_i2info(inode);
2480 struct ldlm_lock *lock = NULL;
2483 mutex_lock(&lli->lli_och_mutex);
2484 if (fd->fd_lease_och != NULL) {
2485 struct obd_client_handle *och = fd->fd_lease_och;
/* Lease is reported only if the underlying lock isn't cancelled. */
2487 lock = ldlm_handle2lock(&och->och_lease_handle);
2489 lock_res_and_lock(lock);
2490 if (!ldlm_is_cancel(lock))
2491 fmode = och->och_flags;
2493 unlock_res_and_lock(lock);
2494 LDLM_LOCK_PUT(lock);
2497 mutex_unlock(&lli->lli_och_mutex);
2499 RETURN(ll_lease_type_from_fmode(fmode));
2501 case LL_IOC_HSM_IMPORT: {
2502 struct hsm_user_import *hui;
2508 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
2513 rc = ll_hsm_import(inode, file, hui);
2518 case LL_IOC_FUTIMES_3: {
2519 struct ll_futimes_3 lfu;
2521 if (copy_from_user(&lfu,
2522 (const struct ll_futimes_3 __user *)arg,
2526 RETURN(ll_file_futimes_3(file, &lfu));
2528 case LL_IOC_LADVISE: {
2529 struct ladvise_hdr *ladvise_hdr;
2532 int alloc_size = sizeof(*ladvise_hdr);
/* First copy just the header to learn how many advices follow. */
2535 OBD_ALLOC_PTR(ladvise_hdr);
2536 if (ladvise_hdr == NULL)
2539 if (copy_from_user(ladvise_hdr,
2540 (const struct ladvise_hdr __user *)arg,
2542 GOTO(out_ladvise, rc = -EFAULT);
2544 if (ladvise_hdr->lah_magic != LADVISE_MAGIC ||
2545 ladvise_hdr->lah_count < 1)
2546 GOTO(out_ladvise, rc = -EINVAL);
2548 num_advise = ladvise_hdr->lah_count;
2549 if (num_advise >= LAH_COUNT_MAX)
2550 GOTO(out_ladvise, rc = -EFBIG);
/* Reallocate at full size and re-copy header plus advice array. */
2552 OBD_FREE_PTR(ladvise_hdr);
2553 alloc_size = offsetof(typeof(*ladvise_hdr),
2554 lah_advise[num_advise]);
2555 OBD_ALLOC(ladvise_hdr, alloc_size);
2556 if (ladvise_hdr == NULL)
2560 * TODO: submit multiple advices to one server in a single RPC
2562 if (copy_from_user(ladvise_hdr,
2563 (const struct ladvise_hdr __user *)arg,
2565 GOTO(out_ladvise, rc = -EFAULT);
2567 for (i = 0; i < num_advise; i++) {
2568 rc = ll_ladvise(inode, file, ladvise_hdr->lah_flags,
2569 &ladvise_hdr->lah_advise[i]);
2575 OBD_FREE(ladvise_hdr, alloc_size);
/* Unrecognized command: try registered handlers, then the OBD layer. */
2582 ll_iocontrol_call(inode, file, cmd, arg, &err))
2585 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2586 (void __user *)arg));
2591 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Validate and commit a new file position (compat copy for kernels
 * lacking generic_file_llseek_size): reject negative offsets unless
 * FMODE_UNSIGNED_OFFSET, reject offsets past maxsize, and reset
 * f_version when the position actually changes.
 */
2592 static inline loff_t
2593 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2595 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2597 if (offset > maxsize)
2600 if (offset != file->f_pos) {
2601 file->f_pos = offset;
2602 file->f_version = 0;
/*
 * Compat implementation of generic_file_llseek_size(): llseek with an
 * explicit max size and eof, including the SEEK_CUR fast path and the
 * SEEK_HOLE/SEEK_DATA virtual-hole semantics.  NOTE(review): the switch
 * statement and several branches are elided from this view.
 */
2608 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2609 loff_t maxsize, loff_t eof)
2611 struct inode *inode = file->f_path.dentry->d_inode;
2619 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2620 * position-querying operation. Avoid rewriting the "same"
2621 * f_pos value back to the file because a concurrent read(),
2622 * write() or lseek() might have altered it
2627 * f_lock protects against read/modify/write race with other
2628 * SEEK_CURs. Note that parallel writes and reads behave
/* i_mutex serializes the read-modify-write of f_pos for SEEK_CUR. */
2631 mutex_lock(&inode->i_mutex);
2632 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2633 mutex_unlock(&inode->i_mutex);
2637 * In the generic case the entire file is data, so as long as
2638 * offset isn't at the end of the file then the offset is data.
2645 * There is a virtual hole at the end of the file, so as long as
2646 * offset isn't i_size or larger, return i_size.
2654 return llseek_execute(file, offset, maxsize);
/*
 * llseek entry point: for SEEK_END/SEEK_HOLE/SEEK_DATA first glimpse the
 * real file size from the OSTs, then delegate to the generic llseek-size
 * helper bounded by ll_file_maxbytes().
 */
2658 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2660 struct inode *inode = file->f_path.dentry->d_inode;
2661 loff_t retval, eof = 0;
/* retval here is only the target offset computed for the debug trace. */
2664 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2665 (origin == SEEK_CUR) ? file->f_pos : 0);
2666 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2667 PFID(ll_inode2fid(inode)), inode, retval, retval,
2669 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2671 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2672 retval = ll_glimpse_size(inode);
2675 eof = i_size_read(inode);
2678 retval = ll_generic_file_llseek_size(file, offset, origin,
2679 ll_file_maxbytes(inode), eof);
/*
 * flush() entry point (called on close(2)): report any async writeback
 * error recorded for this inode/fd without issuing new IO.  Returns -EIO
 * once per failure; fd_write_failed suppresses duplicate reports.
 */
2683 static int ll_flush(struct file *file, fl_owner_t id)
2685 struct inode *inode = file->f_path.dentry->d_inode;
2686 struct ll_inode_info *lli = ll_i2info(inode);
2687 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2690 LASSERT(!S_ISDIR(inode->i_mode));
2692 /* catch async errors that were recorded back when async writeback
2693 * failed for pages in this mapping. */
2694 rc = lli->lli_async_rc;
2695 lli->lli_async_rc = 0;
2696 if (lli->lli_clob != NULL) {
2697 err = lov_read_and_clear_async_rc(lli->lli_clob);
2702 /* The application has been told write failure already.
2703 * Do not report failure again. */
2704 if (fd->fd_write_failed)
2706 return rc ? -EIO : 0;
2710 * Called to make sure a portion of file has been written out.
2711 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2713 * Return how many pages have been written.
2715 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2716 enum cl_fsync_mode mode, int ignore_layout)
2720 struct cl_fsync_io *fio;
2725 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2726 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2729 env = cl_env_get(&refcheck);
2731 RETURN(PTR_ERR(env));
/* Run a CIT_FSYNC cl_io over [start, end] on the file's cl_object. */
2733 io = vvp_env_thread_io(env);
2734 io->ci_obj = ll_i2info(inode)->lli_clob;
2735 io->ci_ignore_layout = ignore_layout;
2737 /* initialize parameters for sync */
2738 fio = &io->u.ci_fsync;
2739 fio->fi_start = start;
2741 fio->fi_fid = ll_inode2fid(inode);
2742 fio->fi_mode = mode;
2743 fio->fi_nr_written = 0;
2745 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2746 result = cl_io_loop(env, io);
2748 result = io->ci_result;
/* On success the page count written replaces the status code. */
2750 result = fio->fi_nr_written;
2751 cl_io_fini(env, io);
2752 cl_env_put(env, &refcheck);
2758 * When dentry is provided (the 'else' case), *file->f_path.dentry may be
2759 * null and dentry must be used directly rather than pulled from
2760 * *file->f_path.dentry as is done otherwise.
/*
 * fsync() entry point; the signature varies by kernel version
 * (4-arg range fsync / 2-arg / legacy 3-arg with dentry).  Flushes page
 * cache, propagates recorded async writeback errors, syncs metadata via
 * md_fsync() and data via cl_sync_file_range(CL_FSYNC_ALL), and updates
 * fd_write_failed accordingly.
 */
2763 #ifdef HAVE_FILE_FSYNC_4ARGS
2764 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2766 struct dentry *dentry = file->f_path.dentry;
2767 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2768 int ll_fsync(struct file *file, int datasync)
2770 struct dentry *dentry = file->f_path.dentry;
2772 loff_t end = LLONG_MAX;
2774 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2777 loff_t end = LLONG_MAX;
2779 struct inode *inode = dentry->d_inode;
2780 struct ll_inode_info *lli = ll_i2info(inode);
2781 struct ptlrpc_request *req;
2785 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2786 PFID(ll_inode2fid(inode)), inode);
2787 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2789 #ifdef HAVE_FILE_FSYNC_4ARGS
2790 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2791 mutex_lock(&inode->i_mutex);
2793 /* fsync's caller has already called _fdata{sync,write}, we want
2794 * that IO to finish before calling the osc and mdc sync methods */
2795 rc = filemap_fdatawait(inode->i_mapping);
2798 /* catch async errors that were recorded back when async writeback
2799 * failed for pages in this mapping. */
2800 if (!S_ISDIR(inode->i_mode)) {
2801 err = lli->lli_async_rc;
2802 lli->lli_async_rc = 0;
2805 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* Sync metadata on the MDT for this FID. */
2810 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
2814 ptlrpc_req_finished(req);
2816 if (S_ISREG(inode->i_mode)) {
2817 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2819 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2820 if (rc == 0 && err < 0)
/* Track write-failure state so ll_flush() reports it exactly once. */
2823 fd->fd_write_failed = true;
2825 fd->fd_write_failed = false;
2828 #ifdef HAVE_FILE_FSYNC_4ARGS
2829 mutex_unlock(&inode->i_mutex);
/*
 * ll_file_flock(): handle flock()/fcntl() advisory locking for both
 * FL_FLOCK and FL_POSIX locks.  Translates the VFS file_lock into an
 * LDLM flock enqueue on the MDT, then mirrors the result into the
 * local VFS lock bookkeeping; on local bookkeeping failure the remote
 * lock is dropped again with an LCK_NL (unlock) enqueue.
 */
2835 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2837 struct inode *inode = file->f_path.dentry->d_inode;
2838 struct ll_sb_info *sbi = ll_i2sbi(inode);
2839 struct ldlm_enqueue_info einfo = {
2840 .ei_type = LDLM_FLOCK,
2841 .ei_cb_cp = ldlm_flock_completion_ast,
2842 .ei_cbdata = file_lock,
2844 struct md_op_data *op_data;
2845 struct lustre_handle lockh = { 0 };
2846 union ldlm_policy_data flock = { { 0 } };
/* remember the requested type; it is clobbered below and restored */
2847 int fl_type = file_lock->fl_type;
2853 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
2854 PFID(ll_inode2fid(inode)), file_lock);
2856 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2858 if (file_lock->fl_flags & FL_FLOCK) {
2859 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2860 /* flocks are whole-file locks */
2861 flock.l_flock.end = OFFSET_MAX;
2862 /* For flocks owner is determined by the local file descriptor */
2863 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2864 } else if (file_lock->fl_flags & FL_POSIX) {
2865 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2866 flock.l_flock.start = file_lock->fl_start;
2867 flock.l_flock.end = file_lock->fl_end;
2871 flock.l_flock.pid = file_lock->fl_pid;
2873 /* Somewhat ugly workaround for svc lockd.
2874 * lockd installs custom fl_lmops->lm_compare_owner that checks
2875 * for the fl_owner to be the same (which it always is on local node
2876 * I guess between lockd processes) and then compares pid.
2877 * As such we assign pid to the owner field to make it all work,
2878 * conflict with normal locks is unlikely since pid space and
2879 * pointer space for current->files are not intersecting */
2880 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2881 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* map the fcntl lock type onto an LDLM lock mode */
2885 einfo.ei_mode = LCK_PR;
2888 /* An unlock request may or may not have any relation to
2889 * existing locks so we may not be able to pass a lock handle
2890 * via a normal ldlm_lock_cancel() request. The request may even
2891 * unlock a byte range in the middle of an existing lock. In
2892 * order to process an unlock request we need all of the same
2893 * information that is given with a normal read or write record
2894 * lock request. To avoid creating another ldlm unlock (cancel)
2895 * message we'll treat a LCK_NL flock request as an unlock. */
2896 einfo.ei_mode = LCK_NL;
2899 einfo.ei_mode = LCK_PW;
2902 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* map the fcntl command onto LDLM enqueue flags */
2917 flags = LDLM_FL_BLOCK_NOWAIT;
2923 flags = LDLM_FL_TEST_LOCK;
2926 CERROR("unknown fcntl lock command: %d\n", cmd);
2930 /* Save the old mode so that if the mode in the lock changes we
2931 * can decrement the appropriate reader or writer refcount. */
2932 file_lock->fl_type = einfo.ei_mode;
2934 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2935 LUSTRE_OPC_ANY, NULL);
2936 if (IS_ERR(op_data))
2937 RETURN(PTR_ERR(op_data));
2939 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
2940 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
2941 flock.l_flock.pid, flags, einfo.ei_mode,
2942 flock.l_flock.start, flock.l_flock.end);
2944 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data, &lockh,
2947 /* Restore the file lock type if not TEST lock. */
2948 if (!(flags & LDLM_FL_TEST_LOCK))
2949 file_lock->fl_type = fl_type;
/* mirror a successful (or unlock) result into the local VFS lock
 * tables; newer kernels provide a single locks_lock_file_wait() */
2951 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
2952 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
2953 !(flags & LDLM_FL_TEST_LOCK))
2954 rc2 = locks_lock_file_wait(file, file_lock);
2956 if ((file_lock->fl_flags & FL_FLOCK) &&
2957 (rc == 0 || file_lock->fl_type == F_UNLCK))
2958 rc2 = flock_lock_file_wait(file, file_lock);
2959 if ((file_lock->fl_flags & FL_POSIX) &&
2960 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2961 !(flags & LDLM_FL_TEST_LOCK))
2962 rc2 = posix_lock_file_wait(file, file_lock);
2963 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
/* local bookkeeping failed: release the server-side lock again */
2965 if (rc2 && file_lock->fl_type != F_UNLCK) {
2966 einfo.ei_mode = LCK_NL;
2967 md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data,
2972 ll_finish_md_op_data(op_data);
/*
 * Look up the FID of entry @name under directory @parent with a
 * getattr-by-name RPC to the MDS.  On success *fid is filled in and,
 * when @inode is non-NULL, the inode is instantiated from the reply.
 */
2977 int ll_get_fid_by_name(struct inode *parent, const char *name,
2978 int namelen, struct lu_fid *fid,
2979 struct inode **inode)
2981 struct md_op_data *op_data = NULL;
2982 struct mdt_body *body;
2983 struct ptlrpc_request *req;
2987 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
2988 LUSTRE_OPC_ANY, NULL);
2989 if (IS_ERR(op_data))
2990 RETURN(PTR_ERR(op_data));
/* only FID and file type are needed from the server */
2992 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
2993 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
2994 ll_finish_md_op_data(op_data);
2998 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3000 GOTO(out_req, rc = -EFAULT);
3002 *fid = body->mbo_fid1;
3005 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
3007 ptlrpc_req_finished(req);
/*
 * ll_migrate(): move entry @name under @parent to MDT @mdtidx using a
 * MDS_RENAME_MIGRATE rename RPC (source == target name, CLI_MIGRATE).
 * For regular files a write lease is taken and the data version is
 * recorded so the server can detect concurrent modification; the
 * lease/och is cleaned up here if the server executed the close
 * intent, otherwise on the out_close path (elided in this listing).
 */
3011 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3012 const char *name, int namelen)
3014 struct dentry *dchild = NULL;
3015 struct inode *child_inode = NULL;
3016 struct md_op_data *op_data;
3017 struct ptlrpc_request *request = NULL;
3018 struct obd_client_handle *och = NULL;
3020 struct mdt_body *body;
3022 __u64 data_version = 0;
3025 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3026 name, PFID(ll_inode2fid(parent)), mdtidx);
3028 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3029 0, LUSTRE_OPC_ANY, NULL);
3030 if (IS_ERR(op_data))
3031 RETURN(PTR_ERR(op_data));
3033 /* Get child FID first */
3034 qstr.hash = full_name_hash(name, namelen);
/* prefer the cached dentry; fall back to an MDS lookup below */
3037 dchild = d_lookup(file->f_path.dentry, &qstr);
3038 if (dchild != NULL) {
3039 if (dchild->d_inode != NULL)
3040 child_inode = igrab(dchild->d_inode);
3044 if (child_inode == NULL) {
3045 rc = ll_get_fid_by_name(parent, name, namelen,
3046 &op_data->op_fid3, &child_inode);
3051 if (child_inode == NULL)
3052 GOTO(out_free, rc = -EINVAL);
3055 * lfs migrate command needs to be blocked on the client
3056 * by checking the migrate FID against the FID of the
3059 if (child_inode == parent->i_sb->s_root->d_inode)
3060 GOTO(out_iput, rc = -EINVAL);
3062 mutex_lock(&child_inode->i_mutex);
3063 op_data->op_fid3 = *ll_inode2fid(child_inode);
3064 if (!fid_is_sane(&op_data->op_fid3)) {
3065 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
3066 ll_get_fsname(parent->i_sb, NULL, 0), name,
3067 PFID(&op_data->op_fid3));
3068 GOTO(out_unlock, rc = -EINVAL);
/* nothing to do if the file already lives on the target MDT */
3071 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3073 GOTO(out_unlock, rc);
3076 CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name,
3077 PFID(&op_data->op_fid3), mdtidx);
3078 GOTO(out_unlock, rc = 0);
3081 if (S_ISREG(child_inode->i_mode)) {
/* take a write lease so concurrent writers break the migration */
3082 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
3086 GOTO(out_unlock, rc);
3089 rc = ll_data_version(child_inode, &data_version,
3092 GOTO(out_close, rc);
3094 op_data->op_handle = och->och_fh;
3095 op_data->op_data = och->och_mod;
3096 op_data->op_data_version = data_version;
3097 op_data->op_lease_handle = och->och_lease_handle;
3098 op_data->op_bias |= MDS_RENAME_MIGRATE;
3101 op_data->op_mds = mdtidx;
3102 op_data->op_cli_flags = CLI_MIGRATE;
/* migration is implemented as a same-name rename to another MDT */
3103 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3104 namelen, name, namelen, &request);
3106 ll_update_times(request, parent);
3108 if (request != NULL) {
3109 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
3111 ptlrpc_req_finished(request);
3112 GOTO(out_close, rc = -EPROTO);
3115 /* If the server does release layout lock, then we cleanup
3116 * the client och here, otherwise release it in out_close: */
3118 body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
3119 obd_mod_put(och->och_mod);
3120 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
3122 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
3126 ptlrpc_req_finished(request);
3129 /* Try again if the file layout has changed. */
3130 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode)) {
3135 if (och != NULL) /* close the file */
3136 ll_lease_close(och, child_inode, NULL);
3138 clear_nlink(child_inode);
3140 mutex_unlock(&child_inode->i_mutex);
3144 ll_finish_md_op_data(op_data);
/* flock/lock handler for the -o noflock mount: rejects all requests
 * (body elided in this listing). */
3149 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3157 * test if some locks matching bits and l_req_mode are acquired
3158 * - bits can be in different locks
3159 * - if found clear the common lock bits in *bits
3160 * - the bits not found, are kept in *bits
3162 * \param bits [IN] searched lock bits
3163 * \param l_req_mode [IN] searched lock mode
3164 * \retval boolean, true iff all bits are found
/*
 * Test (without taking references) whether MD locks covering *bits are
 * already held on this inode.  Each inodebit is matched individually;
 * matched bits are cleared from *bits, unmatched bits remain set.
 * Returns true iff all requested bits were found.
 */
3166 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
3168 struct lustre_handle lockh;
3169 union ldlm_policy_data policy;
/* LCK_MINMODE means "any mode": match all reader/writer modes */
3170 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
3171 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
3180 fid = &ll_i2info(inode)->lli_fid;
3181 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3182 ldlm_lockname[mode]);
/* TEST_LOCK: match only, do not pin the lock */
3184 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
3185 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3186 policy.l_inodebits.bits = *bits & (1 << i);
3187 if (policy.l_inodebits.bits == 0)
3190 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3191 &policy, mode, &lockh)) {
3192 struct ldlm_lock *lock;
3194 lock = ldlm_handle2lock(&lockh);
3197 ~(lock->l_policy_data.l_inodebits.bits);
3198 LDLM_LOCK_PUT(lock);
3200 *bits &= ~policy.l_inodebits.bits;
/*
 * Match an existing MD lock with the given inodebits and mode(s) and,
 * unlike ll_have_md_lock(), take a reference on it (handle returned in
 * *lockh).  Returns the matched mode, or 0 if no lock matched.
 */
3207 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
3208 struct lustre_handle *lockh, __u64 flags,
3209 enum ldlm_mode mode)
3211 union ldlm_policy_data policy = { .l_inodebits = { bits } };
3216 fid = &ll_i2info(inode)->lli_fid;
3217 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3219 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3220 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process a revalidate RPC result: -ENOENT on an already-unlinked
 * inode is not an error (except for striped directories with bad
 * stripes, which get revalidated again); other failures are logged.
 */
3225 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3227 /* Already unlinked. Just update nlink and return success */
3228 if (rc == -ENOENT) {
3230 /* If it is striped directory, and there is bad stripe
3231 * Let's revalidate the dentry again, instead of returning
3233 if (S_ISDIR(inode->i_mode) &&
3234 ll_i2info(inode)->lli_lsm_md != NULL)
3237 /* This path cannot be hit for regular files unless in
3238 * case of obscure races, so no need to validate
3240 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3242 } else if (rc != 0) {
/* EACCES/EIDRM are expected races, log them quietly */
3243 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3244 "%s: revalidate FID "DFID" error: rc = %d\n",
3245 ll_get_fsname(inode->i_sb, NULL, 0),
3246 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate the inode attributes covered by @ibits.  If the server
 * supports getattr-by-FID (OBD_CONNECT_ATTRFID), an intent lock
 * (IT_GETATTR / IT_LOOKUP) is enqueued and the dentry refreshed from
 * the reply; otherwise a plain md_getattr() is issued only when no
 * covering MD lock is already cached locally.
 */
3252 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3254 struct inode *inode = dentry->d_inode;
3255 struct ptlrpc_request *req = NULL;
3256 struct obd_export *exp;
3260 LASSERT(inode != NULL);
3262 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3263 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3265 exp = ll_i2mdexp(inode);
3267 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3268 * But under CMD case, it caused some lock issues, should be fixed
3269 * with new CMD ibits lock. See bug 12718 */
3270 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3271 struct lookup_intent oit = { .it_op = IT_GETATTR };
3272 struct md_op_data *op_data;
3274 if (ibits == MDS_INODELOCK_LOOKUP)
3275 oit.it_op = IT_LOOKUP;
3277 /* Call getattr by fid, so do not provide name at all. */
3278 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3279 dentry->d_inode, NULL, 0, 0,
3280 LUSTRE_OPC_ANY, NULL);
3281 if (IS_ERR(op_data))
3282 RETURN(PTR_ERR(op_data));
3284 rc = md_intent_lock(exp, op_data, &oit, &req,
3285 &ll_md_blocking_ast, 0);
3286 ll_finish_md_op_data(op_data);
3288 rc = ll_inode_revalidate_fini(inode, rc);
3292 rc = ll_revalidate_it_finish(req, &oit, dentry);
3294 ll_intent_release(&oit);
3298 /* Unlinked? Unhash dentry, so it is not picked up later by
3299 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3300 here to preserve get_cwd functionality on 2.6.
3302 if (!dentry->d_inode->i_nlink) {
3303 ll_lock_dcache(inode);
3304 d_lustre_invalidate(dentry, 0);
3305 ll_unlock_dcache(inode);
3308 ll_lookup_finish_locks(&oit, dentry);
3309 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
/* no cached lock covers @ibits: fetch attributes with getattr */
3310 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3311 u64 valid = OBD_MD_FLGETATTR;
3312 struct md_op_data *op_data;
/* regular files also need striping (EA) information */
3315 if (S_ISREG(inode->i_mode)) {
3316 rc = ll_get_default_mdsize(sbi, &ealen);
3319 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3322 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3323 0, ealen, LUSTRE_OPC_ANY,
3325 if (IS_ERR(op_data))
3326 RETURN(PTR_ERR(op_data));
3328 op_data->op_valid = valid;
3329 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3330 ll_finish_md_op_data(op_data);
3332 rc = ll_inode_revalidate_fini(inode, rc);
3336 rc = ll_prep_inode(&inode, req, NULL, NULL);
3339 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge the attributes of all stripes
 * (nlink, blocks, size, a/m/ctime) into the master inode via
 * md_merge_attr().
 */
3343 static int ll_merge_md_attr(struct inode *inode)
3345 struct cl_attr attr = { 0 };
3348 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3349 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3350 &attr, ll_md_blocking_ast);
3354 set_nlink(inode, attr.cat_nlink);
3355 inode->i_blocks = attr.cat_blocks;
3356 i_size_write(inode, attr.cat_size);
/* cached timestamps are kept in the ll_inode_info, not the inode */
3358 ll_i2info(inode)->lli_atime = attr.cat_atime;
3359 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
3360 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/*
 * Revalidate metadata then, for regular files, refresh the size via a
 * glimpse; for striped directories merge stripe attributes instead.
 */
3366 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3368 struct inode *inode = dentry->d_inode;
3372 rc = __ll_inode_revalidate(dentry, ibits);
3376 /* if object isn't regular file, don't validate size */
3377 if (!S_ISREG(inode->i_mode)) {
3378 if (S_ISDIR(inode->i_mode) &&
3379 ll_i2info(inode)->lli_lsm_md != NULL) {
3380 rc = ll_merge_md_attr(inode);
/* propagate cached timestamps into the VFS inode */
3385 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
3386 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
3387 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
3389 /* In case of restore, the MDT has the right size and has
3390 * already send it back without granting the layout lock,
3391 * inode is up-to-date so glimpse is useless.
3392 * Also to glimpse we need the layout, in case of a running
3393 * restore the MDT holds the layout lock so the glimpse will
3394 * block up to the end of restore (getattr will block)
3396 if (!ll_file_test_flag(ll_i2info(inode), LLIF_FILE_RESTORING))
3397 rc = ll_glimpse_size(inode);
/*
 * ll_getattr(): VFS getattr entry point.  Revalidates UPDATE|LOOKUP
 * ibits, then fills *stat from the (now current) inode fields.
 */
3402 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3404 struct inode *inode = de->d_inode;
3405 struct ll_sb_info *sbi = ll_i2sbi(inode);
3406 struct ll_inode_info *lli = ll_i2info(inode);
3409 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3410 MDS_INODELOCK_LOOKUP);
3411 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
/* fault-injection hook for testing delayed getattr */
3416 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
3418 stat->dev = inode->i_sb->s_dev;
/* 32-bit userspace needs an ino that fits in 32 bits */
3419 if (ll_need_32bit_api(sbi))
3420 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3422 stat->ino = inode->i_ino;
3423 stat->mode = inode->i_mode;
3424 stat->uid = inode->i_uid;
3425 stat->gid = inode->i_gid;
3426 stat->rdev = inode->i_rdev;
3427 stat->atime = inode->i_atime;
3428 stat->mtime = inode->i_mtime;
3429 stat->ctime = inode->i_ctime;
3430 stat->blksize = 1 << inode->i_blkbits;
3432 stat->nlink = inode->i_nlink;
3433 stat->size = i_size_read(inode);
3434 stat->blocks = inode->i_blocks;
/*
 * ll_fiemap(): inode_operations->fiemap handler.  Marshals the kernel
 * fiemap_extent_info into a struct fiemap (copying any caller-supplied
 * first extent in), runs ll_do_fiemap(), and copies the mapped extents
 * back out to userspace.
 */
3439 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3440 __u64 start, __u64 len)
3444 struct fiemap *fiemap;
3445 unsigned int extent_count = fieinfo->fi_extents_max;
3447 num_bytes = sizeof(*fiemap) + (extent_count *
3448 sizeof(struct fiemap_extent));
3449 OBD_ALLOC_LARGE(fiemap, num_bytes);
3454 fiemap->fm_flags = fieinfo->fi_flags;
3455 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3456 fiemap->fm_start = start;
3457 fiemap->fm_length = len;
/* the first extent may carry FIEMAP_EXTENT_LAST continuation state */
3458 if (extent_count > 0 &&
3459 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3460 sizeof(struct fiemap_extent)) != 0)
3461 GOTO(out, rc = -EFAULT);
3463 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3465 fieinfo->fi_flags = fiemap->fm_flags;
3466 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3467 if (extent_count > 0 &&
3468 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3469 fiemap->fm_mapped_extents *
3470 sizeof(struct fiemap_extent)) != 0)
3471 GOTO(out, rc = -EFAULT);
3473 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * Return a referenced copy of the cached POSIX ACL for @inode.
 * @type is unused in the visible code — presumably only ACL_TYPE_ACCESS
 * is cached; TODO confirm against callers.
 */
3477 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3479 struct ll_inode_info *lli = ll_i2info(inode);
3480 struct posix_acl *acl = NULL;
3483 spin_lock(&lli->lli_lock);
3484 /* VFS' acl_permission_check->check_acl will release the refcount */
3485 acl = posix_acl_dup(lli->lli_posix_acl);
3486 spin_unlock(&lli->lli_lock);
/*
 * ll_check_acl(): ACL-check callback for ll_generic_permission() on
 * kernels without the 2-arg generic_permission().  Without
 * CONFIG_FS_POSIX_ACL it degenerates (body elided in this listing).
 */
3491 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3493 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3494 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3496 ll_check_acl(struct inode *inode, int mask)
3499 # ifdef CONFIG_FS_POSIX_ACL
3500 struct posix_acl *acl;
/* cannot take lli_lock / sleep in RCU walk mode */
3504 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3505 if (flags & IPERM_FLAG_RCU)
3508 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3513 rc = posix_acl_permission(inode, acl, mask);
3514 posix_acl_release(acl);
3517 # else /* !CONFIG_FS_POSIX_ACL */
3519 # endif /* CONFIG_FS_POSIX_ACL */
3521 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * ll_inode_permission(): VFS permission check (signature varies with
 * kernel API).  Revalidates the root inode on first access, applies
 * root-squash by temporarily overriding the process credentials, then
 * delegates to remote-permission or generic permission checking.
 */
3523 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3524 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3526 # ifdef HAVE_INODE_PERMISION_2ARGS
3527 int ll_inode_permission(struct inode *inode, int mask)
3529 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3534 struct ll_sb_info *sbi;
3535 struct root_squash_info *squash;
3536 struct cred *cred = NULL;
3537 const struct cred *old_cred = NULL;
3539 bool squash_id = false;
/* cannot block in RCU-walk mode; tell the VFS to retry in ref-walk */
3542 #ifdef MAY_NOT_BLOCK
3543 if (mask & MAY_NOT_BLOCK)
3545 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3546 if (flags & IPERM_FLAG_RCU)
3550 /* as root inode are NOT getting validated in lookup operation,
3551 * need to do it before permission check. */
3553 if (inode == inode->i_sb->s_root->d_inode) {
3554 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3555 MDS_INODELOCK_LOOKUP);
3560 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3561 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3563 /* squash fsuid/fsgid if needed */
3564 sbi = ll_i2sbi(inode);
3565 squash = &sbi->ll_squash;
3566 if (unlikely(squash->rsi_uid != 0 &&
3567 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
3568 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3572 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3573 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3574 squash->rsi_uid, squash->rsi_gid);
3576 /* update current process's credentials
3577 * and FS capability */
3578 cred = prepare_creds();
3582 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
3583 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* drop all filesystem-related capabilities while squashed */
3584 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3585 if ((1 << cap) & CFS_CAP_FS_MASK)
3586 cap_lower(cred->cap_effective, cap);
3588 old_cred = override_creds(cred);
3591 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3593 if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
3594 rc = lustre_check_remote_perm(inode, mask);
3596 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3598 /* restore current process's credentials and FS capability */
3600 revert_creds(old_cred);
/* Default file_operations: no .flock/.lock methods, so the kernel's
 * local (per-node) lock handling applies — i.e. -o localflock. */
3607 /* -o localflock - only provides locally consistent flock locks */
3608 struct file_operations ll_file_operations = {
3609 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3610 # ifdef HAVE_SYNC_READ_WRITE
3611 .read = new_sync_read,
3612 .write = new_sync_write,
3614 .read_iter = ll_file_read_iter,
3615 .write_iter = ll_file_write_iter,
3616 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3617 .read = ll_file_read,
3618 .aio_read = ll_file_aio_read,
3619 .write = ll_file_write,
3620 .aio_write = ll_file_aio_write,
3621 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3622 .unlocked_ioctl = ll_file_ioctl,
3623 .open = ll_file_open,
3624 .release = ll_file_release,
3625 .mmap = ll_file_mmap,
3626 .llseek = ll_file_seek,
3627 .splice_read = ll_file_splice_read,
/* file_operations for the default (cluster-coherent flock) mount:
 * identical to ll_file_operations but routes .flock/.lock through
 * ll_file_flock() for cluster-wide lock semantics. */
3632 struct file_operations ll_file_operations_flock = {
3633 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3634 # ifdef HAVE_SYNC_READ_WRITE
3635 .read = new_sync_read,
3636 .write = new_sync_write,
3637 # endif /* HAVE_SYNC_READ_WRITE */
3638 .read_iter = ll_file_read_iter,
3639 .write_iter = ll_file_write_iter,
3640 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3641 .read = ll_file_read,
3642 .aio_read = ll_file_aio_read,
3643 .write = ll_file_write,
3644 .aio_write = ll_file_aio_write,
3645 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3646 .unlocked_ioctl = ll_file_ioctl,
3647 .open = ll_file_open,
3648 .release = ll_file_release,
3649 .mmap = ll_file_mmap,
3650 .llseek = ll_file_seek,
3651 .splice_read = ll_file_splice_read,
3654 .flock = ll_file_flock,
3655 .lock = ll_file_flock
/* file_operations for -o noflock: .flock/.lock are wired to
 * ll_file_noflock(), which rejects all locking requests. */
3658 /* These are for -o noflock - to return ENOSYS on flock calls */
3659 struct file_operations ll_file_operations_noflock = {
3660 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3661 # ifdef HAVE_SYNC_READ_WRITE
3662 .read = new_sync_read,
3663 .write = new_sync_write,
3664 # endif /* HAVE_SYNC_READ_WRITE */
3665 .read_iter = ll_file_read_iter,
3666 .write_iter = ll_file_write_iter,
3667 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3668 .read = ll_file_read,
3669 .aio_read = ll_file_aio_read,
3670 .write = ll_file_write,
3671 .aio_write = ll_file_aio_write,
3672 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3673 .unlocked_ioctl = ll_file_ioctl,
3674 .open = ll_file_open,
3675 .release = ll_file_release,
3676 .mmap = ll_file_mmap,
3677 .llseek = ll_file_seek,
3678 .splice_read = ll_file_splice_read,
3681 .flock = ll_file_noflock,
3682 .lock = ll_file_noflock
/* inode_operations for regular files. */
3685 struct inode_operations ll_file_inode_operations = {
3686 .setattr = ll_setattr,
3687 .getattr = ll_getattr,
3688 .permission = ll_inode_permission,
3689 .setxattr = ll_setxattr,
3690 .getxattr = ll_getxattr,
3691 .listxattr = ll_listxattr,
3692 .removexattr = ll_removexattr,
3693 .fiemap = ll_fiemap,
/* only newer kernels fetch ACLs through .get_acl */
3694 #ifdef HAVE_IOP_GET_ACL
3695 .get_acl = ll_get_acl,
3699 /* dynamic ioctl number support routines */
/* Registry of dynamically registered ioctl handlers, protected by a
 * reader/writer semaphore. */
3700 static struct llioc_ctl_data {
3701 struct rw_semaphore ioc_sem;
3702 struct list_head ioc_head;
3704 __RWSEM_INITIALIZER(llioc.ioc_sem),
3705 LIST_HEAD_INIT(llioc.ioc_head)
/* One registered handler: callback plus the ioctl commands it claims.
 * iocd_cmd is a flexible-array-style trailing member. */
3710 struct list_head iocd_list;
3711 unsigned int iocd_size;
3712 llioc_callback_t iocd_cb;
3713 unsigned int iocd_count;
3714 unsigned int iocd_cmd[0];
/*
 * Register callback @cb for @count dynamic ioctl commands in @cmd.
 * Returns an opaque handle (the allocation itself — presumably used as
 * the magic passed to ll_iocontrol_unregister(); return statement is
 * elided in this listing).
 */
3717 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3720 struct llioc_data *in_data = NULL;
3723 if (cb == NULL || cmd == NULL ||
3724 count > LLIOC_MAX_CMD || count < 0)
/* one allocation holds the header and the trailing command array */
3727 size = sizeof(*in_data) + count * sizeof(unsigned int);
3728 OBD_ALLOC(in_data, size);
3729 if (in_data == NULL)
3732 memset(in_data, 0, sizeof(*in_data));
3733 in_data->iocd_size = size;
3734 in_data->iocd_cb = cb;
3735 in_data->iocd_count = count;
3736 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3738 down_write(&llioc.ioc_sem);
3739 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3740 up_write(&llioc.ioc_sem);
/*
 * Remove the ioctl handler registered under @magic (the pointer
 * returned by ll_iocontrol_register) and free it.  Warns if no
 * matching registration is found.
 */
3745 void ll_iocontrol_unregister(void *magic)
3747 struct llioc_data *tmp;
3752 down_write(&llioc.ioc_sem);
3753 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* size is read before the entry is freed */
3755 unsigned int size = tmp->iocd_size;
3757 list_del(&tmp->iocd_list);
3758 up_write(&llioc.ioc_sem);
3760 OBD_FREE(tmp, size);
3764 up_write(&llioc.ioc_sem);
3766 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3769 EXPORT_SYMBOL(ll_iocontrol_register);
3770 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch ioctl @cmd to registered dynamic handlers.  Handlers are
 * tried in registration order; iteration stops when one returns
 * LLIOC_STOP, and its result code is passed back via *rcp.
 */
3772 static enum llioc_iter
3773 ll_iocontrol_call(struct inode *inode, struct file *file,
3774 unsigned int cmd, unsigned long arg, int *rcp)
3776 enum llioc_iter ret = LLIOC_CONT;
3777 struct llioc_data *data;
3778 int rc = -EINVAL, i;
3780 down_read(&llioc.ioc_sem);
3781 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3782 for (i = 0; i < data->iocd_count; i++) {
3783 if (cmd != data->iocd_cmd[i])
3786 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3790 if (ret == LLIOC_STOP)
3793 up_read(&llioc.ioc_sem);
/*
 * Apply a layout configuration to the inode's cl_object.  For
 * OBJECT_CONF_SET the DLM layout lock is allowed to match only after
 * the layout is installed, and the cached layout generation is
 * refreshed from the object.
 */
3800 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3802 struct ll_inode_info *lli = ll_i2info(inode);
3803 struct cl_object *obj = lli->lli_clob;
3812 env = cl_env_get(&refcheck);
3814 RETURN(PTR_ERR(env));
3816 rc = cl_conf_set(env, lli->lli_clob, conf);
3820 if (conf->coc_opc == OBJECT_CONF_SET) {
3821 struct ldlm_lock *lock = conf->coc_lock;
3822 struct cl_layout cl = {
3826 LASSERT(lock != NULL);
3827 LASSERT(ldlm_has_layout(lock));
3829 /* it can only be allowed to match after layout is
3830 * applied to inode otherwise false layout would be
3831 * seen. Applying layout should happen before dropping
3832 * the intent lock. */
3833 ldlm_lock_allow_match(lock);
3835 rc = cl_object_layout_get(env, obj, &cl);
3840 DFID": layout version change: %u -> %u\n",
3841 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3843 ll_layout_version_set(lli, cl.cl_layout_gen);
3847 cl_env_put(env, &refcheck);
3852 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/*
 * If @lock carries no LVB layout data (lock granted via completion AST
 * rather than immediately), fetch the LOV EA from the MDT via
 * md_getxattr() and install a copy as the lock's LVB under the
 * resource lock.  A concurrent installer wins; our copy is freed.
 */
3853 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3856 struct ll_sb_info *sbi = ll_i2sbi(inode);
3857 struct ptlrpc_request *req;
3858 struct mdt_body *body;
3865 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3866 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
3867 lock->l_lvb_data, lock->l_lvb_len);
/* already have layout data attached to the lock — nothing to do */
3869 if (lock->l_lvb_data != NULL)
3872 /* if layout lock was granted right away, the layout is returned
3873 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3874 * blocked and then granted via completion ast, we have to fetch
3875 * layout here. Please note that we can't use the LVB buffer in
3876 * completion AST because it doesn't have a large enough buffer */
3877 rc = ll_get_default_mdsize(sbi, &lmmsize);
3879 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
3880 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3885 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3887 GOTO(out, rc = -EPROTO);
3889 lmmsize = body->mbo_eadatasize;
3890 if (lmmsize == 0) /* empty layout */
3893 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3895 GOTO(out, rc = -EFAULT);
/* keep a private copy; the reply buffer dies with the request */
3897 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3898 if (lvbdata == NULL)
3899 GOTO(out, rc = -ENOMEM);
3901 memcpy(lvbdata, lmm, lmmsize);
3902 lock_res_and_lock(lock);
3903 if (unlikely(lock->l_lvb_data == NULL)) {
3904 lock->l_lvb_type = LVB_T_LAYOUT;
3905 lock->l_lvb_data = lvbdata;
3906 lock->l_lvb_len = lmmsize;
3909 unlock_res_and_lock(lock);
/* another thread installed its copy first; drop ours */
3912 OBD_FREE_LARGE(lvbdata, lmmsize);
3917 ptlrpc_req_finished(req);
/*
 * Take the layout carried in the lock's LVB and configure the inode's
 * cl_object with it, then drop the lock reference.  If the new layout
 * cannot be applied because IO is still in flight (-EBUSY), issue an
 * OBJECT_CONF_WAIT so the caller blocks until outstanding IO finishes.
 */
3922 * Apply the layout to the inode. Layout lock is held and will be released
3925 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
3926 struct inode *inode)
3928 struct ll_inode_info *lli = ll_i2info(inode);
3929 struct ll_sb_info *sbi = ll_i2sbi(inode);
3930 struct ldlm_lock *lock;
3931 struct cl_object_conf conf;
3934 bool wait_layout = false;
3937 LASSERT(lustre_handle_is_used(lockh));
3939 lock = ldlm_handle2lock(lockh);
3940 LASSERT(lock != NULL);
3941 LASSERT(ldlm_has_layout(lock));
3943 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
3944 PFID(&lli->lli_fid), inode);
3946 /* in case this is a caching lock and reinstate with new inode */
3947 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3949 lock_res_and_lock(lock);
3950 lvb_ready = ldlm_is_lvb_ready(lock);
3951 unlock_res_and_lock(lock);
3952 /* checking lvb_ready is racy but this is okay. The worst case is
3953 * that multi processes may configure the file on the same time. */
/* make sure the lock actually carries layout data before using it */
3958 rc = ll_layout_fetch(inode, lock);
3962 /* for layout lock, lmm is stored in lock's lvb.
3963 * lvb_data is immutable if the lock is held so it's safe to access it
3966 * set layout to file. Unlikely this will fail as old layout was
3967 * surely eliminated */
3968 memset(&conf, 0, sizeof conf);
3969 conf.coc_opc = OBJECT_CONF_SET;
3970 conf.coc_inode = inode;
3971 conf.coc_lock = lock;
3972 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
3973 conf.u.coc_layout.lb_len = lock->l_lvb_len;
3974 rc = ll_layout_conf(inode, &conf);
3976 /* refresh layout failed, need to wait */
3977 wait_layout = rc == -EBUSY;
3981 LDLM_LOCK_PUT(lock);
3982 ldlm_lock_decref(lockh, mode);
3984 /* wait for IO to complete if it's still being used. */
3986 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
3987 ll_get_fsname(inode->i_sb, NULL, 0),
3988 PFID(&lli->lli_fid), inode);
3990 memset(&conf, 0, sizeof conf);
3991 conf.coc_opc = OBJECT_CONF_WAIT;
3992 conf.coc_inode = inode;
3993 rc = ll_layout_conf(inode, &conf);
3997 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
3998 ll_get_fsname(inode->i_sb, NULL, 0),
3999 PFID(&lli->lli_fid), rc);
/*
 * Refresh the layout while lli_layout_mutex is held by the caller.
 * First try to match a cached layout lock; on miss, enqueue an
 * IT_LAYOUT intent lock on the MDT and apply the returned layout via
 * ll_layout_lock_set().
 */
4004 static int ll_layout_refresh_locked(struct inode *inode)
4006 struct ll_inode_info *lli = ll_i2info(inode);
4007 struct ll_sb_info *sbi = ll_i2sbi(inode);
4008 struct md_op_data *op_data;
4009 struct lookup_intent it;
4010 struct lustre_handle lockh;
4011 enum ldlm_mode mode;
4012 struct ldlm_enqueue_info einfo = {
4013 .ei_type = LDLM_IBITS,
4015 .ei_cb_bl = &ll_md_blocking_ast,
4016 .ei_cb_cp = &ldlm_completion_ast,
4022 /* mostly layout lock is caching on the local side, so try to match
4023 * it before grabbing layout lock mutex. */
4024 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4025 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4026 if (mode != 0) { /* hit cached lock */
4027 rc = ll_layout_lock_set(&lockh, mode, inode);
4034 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4035 0, 0, LUSTRE_OPC_ANY, NULL);
4036 if (IS_ERR(op_data))
4037 RETURN(PTR_ERR(op_data));
4039 /* have to enqueue one */
4040 memset(&it, 0, sizeof(it));
4041 it.it_op = IT_LAYOUT;
4042 lockh.cookie = 0ULL;
4044 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4045 ll_get_fsname(inode->i_sb, NULL, 0),
4046 PFID(&lli->lli_fid), inode);
4048 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, &it, op_data, &lockh, 0);
4049 if (it.it_request != NULL)
4050 ptlrpc_req_finished(it.it_request);
4051 it.it_request = NULL;
4053 ll_finish_md_op_data(op_data);
/* transfer lock ownership from the intent to the local handle */
4055 mode = it.it_lock_mode;
4056 it.it_lock_mode = 0;
4057 ll_intent_drop_lock(&it);
4060 /* set lock data in case this is a new lock */
4061 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4062 rc = ll_layout_lock_set(&lockh, mode, inode);
4071 * This function checks whether a LAYOUT lock exists on the client side,
4072 * and enqueues one if there is none in the cache.
4074 * This function does not hold the layout lock, so the lock may be revoked
4075 * at any time after this function returns. Any operation that depends on
4075 * the layout should therefore be redone
4078 * This function should be called before lov_io_init() to get an up-to-date
4079 * layout version; the caller should save the version number, and after IO
4080 * is finished, call this function again to verify that the layout
4081 * was not changed during the IO.
/*
 * Public entry point: make sure the client has a valid layout for @inode
 * and report the current layout generation to the caller.
 *
 * \param[in]  inode  the file whose layout is wanted (must be a regular
 *                    file with a sane FID — asserted below)
 * \param[out] gen    set to the cached layout generation; refreshed after
 *                    a successful enqueue on the slow path
 *
 * NOTE(review): the early-return body after the fast-path check and the
 * final RETURN are elided in this view; presumably the function returns 0
 * when the cached generation is already valid — confirm against full source.
 */
4083 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4085 struct ll_inode_info *lli = ll_i2info(inode);
4086 struct ll_sb_info *sbi = ll_i2sbi(inode);
/* read the currently cached layout generation */
4090 *gen = ll_layout_version_get(lli);
/* fast path: layout lock disabled on this mount, or a valid generation
 * is already cached — no need to contact the MDT (branch body elided) */
4091 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
4095 LASSERT(fid_is_sane(ll_inode2fid(inode)));
4096 LASSERT(S_ISREG(inode->i_mode));
4098 /* take layout lock mutex to enqueue layout lock exclusively. */
4099 mutex_lock(&lli->lli_layout_mutex);
4101 rc = ll_layout_refresh_locked(inode);
/* re-read the generation: a successful refresh updates it */
4105 *gen = ll_layout_version_get(lli);
4107 mutex_unlock(&lli->lli_layout_mutex);
4113 * This function sends a restore request to the MDT
4115 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4117 struct hsm_user_request *hur;
4121 len = sizeof(struct hsm_user_request) +
4122 sizeof(struct hsm_user_item);
4123 OBD_ALLOC(hur, len);
4127 hur->hur_request.hr_action = HUA_RESTORE;
4128 hur->hur_request.hr_archive_id = 0;
4129 hur->hur_request.hr_flags = 0;
4130 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4131 sizeof(hur->hur_user_item[0].hui_fid));
4132 hur->hur_user_item[0].hui_extent.offset = offset;
4133 hur->hur_user_item[0].hui_extent.length = length;
4134 hur->hur_request.hr_itemcount = 1;
4135 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,