4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2013, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include <linux/sched.h>
49 #include "llite_internal.h"
50 #include <lustre/ll_fiemap.h>
51 #include <lustre_ioctl.h>
53 #include "cl_object.h"
/* Forward declarations for helpers defined later in this file.
 * NOTE(review): the return type of ll_put_grouplock() is on a line elided
 * from this excerpt. */
56 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
58 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
61 static enum llioc_iter
62 ll_iocontrol_call(struct inode *inode, struct file *file,
63 unsigned int cmd, unsigned long arg, int *rcp);
/*
 * Allocate a per-open ll_file_data from its dedicated slab cache and
 * initialize the fields set here; further init happens in ll_local_open().
 * NOTE(review): the allocation-failure check is not visible in this excerpt.
 */
65 static struct ll_file_data *ll_file_data_get(void)
67 struct ll_file_data *fd;
/* GFP_NOFS: do not re-enter the filesystem from memory reclaim. */
69 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
73 fd->fd_write_failed = false;
/* Return a ll_file_data to the slab cache (counterpart of
 * ll_file_data_get()). */
78 static void ll_file_data_put(struct ll_file_data *fd)
81 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Copy the inode's current attributes (mode, times, size, blocks, flags,
 * ioepoch) plus the open handle @fh and an MDS capability into @op_data,
 * ready to be sent to the MDS on close/setattr.
 */
84 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
85 struct lustre_handle *fh)
87 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
88 op_data->op_attr.ia_mode = inode->i_mode;
89 op_data->op_attr.ia_atime = inode->i_atime;
90 op_data->op_attr.ia_mtime = inode->i_mtime;
91 op_data->op_attr.ia_ctime = inode->i_ctime;
92 op_data->op_attr.ia_size = i_size_read(inode);
93 op_data->op_attr_blocks = inode->i_blocks;
/* ia_attr_flags lives in the Lustre-extended iattr wrapper, hence the cast. */
94 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
95 ll_inode_to_ext_flags(inode->i_flags);
96 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
98 op_data->op_handle = *fh;
99 op_data->op_capa1 = ll_mdscapa_get(inode);
/* Tell the MDS that cached data was modified so it updates timestamps. */
101 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
102 op_data->op_bias |= MDS_DATA_MODIFIED;
106 * Closes the IO epoch and packs all the attributes into @op_data for
/* the CLOSE rpc sent to the MDS for open handle @och. */
109 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
110 struct obd_client_handle *och)
/* Always send the mode and all three timestamps back on close. */
114 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
115 ATTR_MTIME | ATTR_MTIME_SET |
116 ATTR_CTIME | ATTR_CTIME_SET;
118 if (!(och->och_flags & FMODE_WRITE))
/* Without Size-on-MDS support (or for non-regular files) the client is
 * authoritative for size/blocks, so include them in the close. */
121 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
122 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* NOTE(review): &och here passes the address of the local pointer —
 * presumably ll_ioepoch_close() may clear it; confirm against its contract. */
124 ll_ioepoch_close(inode, op_data, &och, 0);
127 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
128 ll_prep_md_op_data(op_data, inode, NULL, NULL,
129 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Send a CLOSE RPC to the MDS for open handle @och of @inode.
 *
 * If @data_version is non-NULL this close is part of an HSM release:
 * MDS_HSM_RELEASE is set and the data version/lease handle are packed so
 * the MDS can swap the file's layout atomically.
 *
 * Consumes @och: its cookie is poisoned with DEAD_HANDLE_MAGIC below.
 * NOTE(review): several error/cleanup lines are elided from this excerpt.
 */
133 static int ll_close_inode_openhandle(struct obd_export *md_exp,
135 struct obd_client_handle *och,
136 const __u64 *data_version)
138 struct obd_export *exp = ll_i2mdexp(inode);
139 struct md_op_data *op_data;
140 struct ptlrpc_request *req = NULL;
141 struct obd_device *obd = class_exp2obd(exp);
148 * XXX: in case of LMV, is this correct to access
/* Bail out early if the MDC export has no device behind it. */
151 CERROR("Invalid MDC connection handle "LPX64"\n",
152 ll_i2mdexp(inode)->exp_handle.h_cookie);
156 OBD_ALLOC_PTR(op_data);
158 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
160 ll_prepare_close(inode, op_data, och);
161 if (data_version != NULL) {
162 /* Pass in data_version implies release. */
163 op_data->op_bias |= MDS_HSM_RELEASE;
164 op_data->op_data_version = *data_version;
165 op_data->op_lease_handle = och->och_lease_handle;
166 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* Remember whether this close ends the IO epoch before op_data is freed. */
168 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
169 rc = md_close(md_exp, op_data, och->och_mod, &req);
/* rc == -EAGAIN branch (presumably): MDS wants Size-on-MDS attributes. */
171 /* This close must have the epoch closed. */
172 LASSERT(epoch_close);
173 /* MDS has instructed us to obtain Size-on-MDS attribute from
174 * OSTs and send setattr to back to MDS. */
175 rc = ll_som_update(inode, op_data);
177 CERROR("%s: inode "DFID" mdc Size-on-MDS update"
178 " failed: rc = %d\n",
179 ll_i2mdexp(inode)->exp_obd->obd_name,
180 PFID(ll_inode2fid(inode)), rc);
184 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
185 ll_i2mdexp(inode)->exp_obd->obd_name,
186 PFID(ll_inode2fid(inode)), rc);
189 /* DATA_MODIFIED flag was successfully sent on close, cancel data
190 * modification flag. */
191 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
192 struct ll_inode_info *lli = ll_i2info(inode);
/* lli_flags is protected by lli_lock. */
194 spin_lock(&lli->lli_lock);
195 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
196 spin_unlock(&lli->lli_lock);
/* Destroy OST objects listed in the close reply, if any. */
200 rc = ll_objects_destroy(req, inode);
202 CERROR("%s: inode "DFID
203 " ll_objects destroy: rc = %d\n",
204 ll_i2mdexp(inode)->exp_obd->obd_name,
205 PFID(ll_inode2fid(inode)), rc);
/* For HSM release, verify the MDS actually released the file. */
208 if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
209 struct mdt_body *body;
210 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
211 if (!(body->mbo_valid & OBD_MD_FLRELEASED))
215 ll_finish_md_op_data(op_data);
/* SOM write handle closed without closing the epoch: queue DONE_WRITING. */
219 if (exp_connect_som(exp) && !epoch_close &&
220 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
221 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
223 md_clear_open_replay_data(md_exp, och);
224 /* Free @och if it is not waiting for DONE_WRITING. */
225 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
228 if (req) /* This is close request */
229 ptlrpc_req_finished(req);
/*
 * Close the MDS open handle for @inode matching open mode @fmode
 * (write/exec/read), but only if no other opener still uses it.
 */
233 int ll_md_real_close(struct inode *inode, fmode_t fmode)
235 struct ll_inode_info *lli = ll_i2info(inode);
236 struct obd_client_handle **och_p;
237 struct obd_client_handle *och;
/* Pick the per-mode handle slot and its use count. */
242 if (fmode & FMODE_WRITE) {
243 och_p = &lli->lli_mds_write_och;
244 och_usecount = &lli->lli_open_fd_write_count;
245 } else if (fmode & FMODE_EXEC) {
246 och_p = &lli->lli_mds_exec_och;
247 och_usecount = &lli->lli_open_fd_exec_count;
249 LASSERT(fmode & FMODE_READ);
250 och_p = &lli->lli_mds_read_och;
251 och_usecount = &lli->lli_open_fd_read_count;
254 mutex_lock(&lli->lli_och_mutex);
255 if (*och_usecount > 0) {
256 /* There are still users of this handle, so skip
258 mutex_unlock(&lli->lli_och_mutex);
/* Detach the handle under the mutex, then close it outside the lock. */
264 mutex_unlock(&lli->lli_och_mutex);
267 /* There might be a race and this handle may already
269 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-file-descriptor close: drop group locks and leases held by this fd,
 * decrement the per-mode open count, and close the MDS open handle via
 * ll_md_real_close() unless a cached OPEN lock lets us skip the RPC.
 * Frees the fd's ll_file_data at the end.
 */
276 static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
279 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
280 struct ll_inode_info *lli = ll_i2info(inode);
284 /* clear group lock, if present */
285 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
286 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
288 if (fd->fd_lease_och != NULL) {
291 /* Usually the lease is not released when the
292 * application crashed, we need to release here. */
293 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
294 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
295 PFID(&lli->lli_fid), rc, lease_broken);
297 fd->fd_lease_och = NULL;
/* Close the fd-private open handle acquired for a lease, if any. */
300 if (fd->fd_och != NULL) {
301 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
306 /* Let's see if we have good enough OPEN lock on the file and if
307 we can skip talking to MDS */
308 if (file->f_dentry->d_inode) { /* Can this ever be false? */
310 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
311 struct lustre_handle lockh;
312 struct inode *inode = file->f_dentry->d_inode;
313 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
/* Drop this fd's contribution to the per-mode open count. */
315 mutex_lock(&lli->lli_och_mutex);
316 if (fd->fd_omode & FMODE_WRITE) {
318 LASSERT(lli->lli_open_fd_write_count);
319 lli->lli_open_fd_write_count--;
320 } else if (fd->fd_omode & FMODE_EXEC) {
322 LASSERT(lli->lli_open_fd_exec_count);
323 lli->lli_open_fd_exec_count--;
326 LASSERT(lli->lli_open_fd_read_count);
327 lli->lli_open_fd_read_count--;
329 mutex_unlock(&lli->lli_och_mutex);
/* No matching cached OPEN lock: must send the close RPC now. */
331 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
332 LDLM_IBITS, &policy, lockmode,
334 rc = ll_md_real_close(file->f_dentry->d_inode,
338 CERROR("released file has negative dentry: file = %p, "
339 "dentry = %p, name = %s\n",
340 file, file->f_dentry, file->f_dentry->d_name.name);
344 LUSTRE_FPRIVATE(file) = NULL;
345 ll_file_data_put(fd);
346 ll_capa_close(inode);
351 /* While this returns an error code, fput() the caller does not, so we need
352 * to make every effort to clean up all of our state here. Also, applications
353 * rarely check close errors and even if an error is returned they will not
354 * re-try the close call.
/* VFS ->release() entry point for Lustre files and directories. */
356 int ll_file_release(struct inode *inode, struct file *file)
358 struct ll_file_data *fd;
359 struct ll_sb_info *sbi = ll_i2sbi(inode);
360 struct ll_inode_info *lli = ll_i2info(inode);
364 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
365 PFID(ll_inode2fid(inode)), inode);
367 #ifdef CONFIG_FS_POSIX_ACL
/* Remote-client ACL bookkeeping applies only to the filesystem root. */
368 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
369 inode == inode->i_sb->s_root->d_inode) {
370 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
373 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
374 fd->fd_flags &= ~LL_FILE_RMTACL;
375 rct_del(&sbi->ll_rct, current_pid());
376 et_search_free(&sbi->ll_et, current_pid());
/* Do not count root-dentry releases in the stats. */
381 if (inode->i_sb->s_root != file->f_dentry)
382 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
383 fd = LUSTRE_FPRIVATE(file);
386 /* The last ref on @file, maybe not the the owner pid of statahead,
387 * because parent and child process can share the same file handle. */
388 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
389 ll_deauthorize_statahead(inode, fd);
/* Root has no MDS open handle; just drop the fd data and return. */
391 if (inode->i_sb->s_root == file->f_dentry) {
392 LUSTRE_FPRIVATE(file) = NULL;
393 ll_file_data_put(fd);
/* Fold any deferred async write errors into this close's return code. */
397 if (!S_ISDIR(inode->i_mode)) {
398 if (lli->lli_clob != NULL)
399 lov_read_and_clear_async_rc(lli->lli_clob);
400 lli->lli_async_rc = 0;
403 rc = ll_md_close(sbi->ll_md_exp, inode, file);
405 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
406 libcfs_debug_dumplog();
/*
 * Send an OPEN intent to the MDS for @file, optionally packing striping
 * data (@lmm/@lmmsize), and install the resulting lock/inode data.
 * The name is omitted when the server supports open-by-fid or the dentry
 * name is invalid.
 */
411 static int ll_intent_file_open(struct file *file, void *lmm, int lmmsize,
412 struct lookup_intent *itp)
414 struct dentry *de = file->f_dentry;
415 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
416 struct dentry *parent = de->d_parent;
417 const char *name = NULL;
419 struct md_op_data *op_data;
420 struct ptlrpc_request *req = NULL;
424 LASSERT(parent != NULL);
425 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
427 /* if server supports open-by-fid, or file name is invalid, don't pack
428 * name in open request */
429 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
430 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
431 name = de->d_name.name;
432 len = de->d_name.len;
435 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
436 name, len, 0, LUSTRE_OPC_ANY, NULL);
438 RETURN(PTR_ERR(op_data));
439 op_data->op_data = lmm;
440 op_data->op_data_size = lmmsize;
442 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
443 &ll_md_blocking_ast, 0);
444 ll_finish_md_op_data(op_data);
446 /* reason for keep own exit path - don`t flood log
447 * with messages with -ESTALE errors.
/* On -ESTALE (presumably): drop any open handle we won but cannot use. */
449 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
450 it_open_error(DISP_OPEN_OPEN, itp))
452 ll_release_openhandle(de, itp);
456 if (it_disposition(itp, DISP_LOOKUP_NEG))
457 GOTO(out, rc = -ENOENT);
459 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
460 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
461 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Update the inode from the reply, then attach lock data if granted. */
465 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
466 if (!rc && itp->d.lustre.it_lock_mode)
467 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
470 ptlrpc_req_finished(req);
471 ll_intent_drop_lock(itp);
477 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
478 * not believe attributes if a few ioepoch holders exist. Attributes for
479 * previous ioepoch if new one is opened are also skipped by MDS.
481 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
/* Only record a change: a zero or repeated epoch is ignored. */
483 if (ioepoch && lli->lli_ioepoch != ioepoch) {
484 lli->lli_ioepoch = ioepoch;
485 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
486 ioepoch, PFID(&lli->lli_fid));
/*
 * Fill @och from the MDS open reply carried in intent @it (file handle,
 * fid, lease lock cookie, open flags) and register it for open replay.
 * Returns the md_set_open_replay_data() result.
 */
490 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
491 struct obd_client_handle *och)
493 struct ptlrpc_request *req = it->d.lustre.it_data;
494 struct mdt_body *body;
496 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
497 och->och_fh = body->mbo_handle;
498 och->och_fid = body->mbo_fid1;
499 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
500 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
501 och->och_flags = it->it_flags;
503 return md_set_open_replay_data(md_exp, och, it);
/*
 * Finish an open locally: optionally fill @och from the intent reply,
 * record the new ioepoch, and attach @fd to @file as its private data.
 * NOTE(review): @och may be NULL when an existing MDS handle is reused
 * (see the callers in ll_file_open()) — confirm, some lines are elided.
 */
506 static int ll_local_open(struct file *file, struct lookup_intent *it,
507 struct ll_file_data *fd, struct obd_client_handle *och)
509 struct inode *inode = file->f_dentry->d_inode;
510 struct ll_inode_info *lli = ll_i2info(inode);
513 LASSERT(!LUSTRE_FPRIVATE(file));
518 struct ptlrpc_request *req = it->d.lustre.it_data;
519 struct mdt_body *body;
522 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
526 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
527 ll_ioepoch_open(lli, body->mbo_ioepoch);
530 LUSTRE_FPRIVATE(file) = fd;
531 ll_readahead_init(inode, &fd->fd_ras);
/* Remember the effective open mode for the close-side accounting. */
532 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
534 /* ll_cl_context initialize */
535 rwlock_init(&fd->fd_lock);
536 INIT_LIST_HEAD(&fd->fd_lccs);
541 /* Open a file, and (for the very first open) create objects on the OSTs at
542 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
543 * creation or open until ll_lov_setstripe() ioctl is called.
545 * If we already have the stripe MD locally then we don't request it in
546 * md_open(), by passing a lmm_size = 0.
548 * It is up to the application to ensure no other processes open this file
549 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
550 * used. We might be able to avoid races of that sort by getting lli_open_sem
551 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
552 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/* VFS ->open() entry point. */
554 int ll_file_open(struct inode *inode, struct file *file)
556 struct ll_inode_info *lli = ll_i2info(inode);
557 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
558 .it_flags = file->f_flags };
559 struct obd_client_handle **och_p = NULL;
560 __u64 *och_usecount = NULL;
561 struct ll_file_data *fd;
565 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
566 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* An intent may have been stashed by the lookup path (atomic_open). */
568 it = file->private_data; /* XXX: compat macro */
569 file->private_data = NULL; /* prevent ll_local_open assertion */
571 fd = ll_file_data_get();
573 GOTO(out_openerr, rc = -ENOMEM);
576 if (S_ISDIR(inode->i_mode))
577 ll_authorize_statahead(inode, fd);
/* Root needs no MDS open handle; attach fd data and finish. */
579 if (inode->i_sb->s_root == file->f_dentry) {
580 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup: build our own OPEN intent in oit. */
584 if (!it || !it->d.lustre.it_disposition) {
585 /* Convert f_flags into access mode. We cannot use file->f_mode,
586 * because everything but O_ACCMODE mask was stripped from
588 if ((oit.it_flags + 1) & O_ACCMODE)
590 if (file->f_flags & O_TRUNC)
591 oit.it_flags |= FMODE_WRITE;
593 /* kernel only call f_op->open in dentry_open. filp_open calls
594 * dentry_open after call to open_namei that checks permissions.
595 * Only nfsd_open call dentry_open directly without checking
596 * permissions and because of that this code below is safe. */
597 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
598 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
600 /* We do not want O_EXCL here, presumably we opened the file
601 * already? XXX - NFS implications? */
602 oit.it_flags &= ~O_EXCL;
604 /* bug20584, if "it_flags" contains O_CREAT, the file will be
605 * created if necessary, then "IT_CREAT" should be set to keep
606 * consistent with it */
607 if (oit.it_flags & O_CREAT)
608 oit.it_op |= IT_CREAT;
614 /* Let's see if we have file open on MDS already. */
615 if (it->it_flags & FMODE_WRITE) {
616 och_p = &lli->lli_mds_write_och;
617 och_usecount = &lli->lli_open_fd_write_count;
618 } else if (it->it_flags & FMODE_EXEC) {
619 och_p = &lli->lli_mds_exec_och;
620 och_usecount = &lli->lli_open_fd_exec_count;
622 och_p = &lli->lli_mds_read_och;
623 och_usecount = &lli->lli_open_fd_read_count;
626 mutex_lock(&lli->lli_och_mutex);
627 if (*och_p) { /* Open handle is present */
628 if (it_disposition(it, DISP_OPEN_OPEN)) {
629 /* Well, there's extra open request that we do not need,
630 let's close it somehow. This will decref request. */
631 rc = it_open_error(DISP_OPEN_OPEN, it);
633 mutex_unlock(&lli->lli_och_mutex);
634 GOTO(out_openerr, rc);
637 ll_release_openhandle(file->f_dentry, it);
/* Reuse the existing MDS handle; och == NULL tells ll_local_open. */
641 rc = ll_local_open(file, it, fd, NULL);
644 mutex_unlock(&lli->lli_och_mutex);
645 GOTO(out_openerr, rc);
648 LASSERT(*och_usecount == 0);
649 if (!it->d.lustre.it_disposition) {
650 /* We cannot just request lock handle now, new ELC code
651 means that one of other OPEN locks for this file
652 could be cancelled, and since blocking ast handler
653 would attempt to grab och_mutex as well, that would
654 result in a deadlock */
655 mutex_unlock(&lli->lli_och_mutex);
657 * Normally called under two situations:
659 * 2. A race/condition on MDS resulting in no open
660 * handle to be returned from LOOKUP|OPEN request,
661 * for example if the target entry was a symlink.
663 * Always fetch MDS_OPEN_LOCK if this is not setstripe.
665 * Always specify MDS_OPEN_BY_FID because we don't want
666 * to get file with different fid.
668 it->it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID;
669 rc = ll_intent_file_open(file, NULL, 0, it);
671 GOTO(out_openerr, rc);
675 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
677 GOTO(out_och_free, rc = -ENOMEM);
681 /* md_intent_lock() didn't get a request ref if there was an
682 * open error, so don't do cleanup on the request here
684 /* XXX (green): Should not we bail out on any error here, not
685 * just open error? */
686 rc = it_open_error(DISP_OPEN_OPEN, it);
688 GOTO(out_och_free, rc);
690 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
691 "inode %p: disposition %x, status %d\n", inode,
692 it_disposition(it, ~0), it->d.lustre.it_status);
694 rc = ll_local_open(file, it, fd, *och_p);
696 GOTO(out_och_free, rc);
698 mutex_unlock(&lli->lli_och_mutex);
701 /* Must do this outside lli_och_mutex lock to prevent deadlock where
702 different kind of OPEN lock for this same inode gets cancelled
703 by ldlm_cancel_lru */
704 if (!S_ISREG(inode->i_mode))
705 GOTO(out_och_free, rc);
/* No striping yet and either delayed create or read-only open:
 * postpone OST object creation. */
709 if (!lli->lli_has_smd &&
710 (cl_is_lov_delay_create(file->f_flags) ||
711 (file->f_mode & FMODE_WRITE) == 0)) {
712 CDEBUG(D_INODE, "object creation was delayed\n");
713 GOTO(out_och_free, rc);
715 cl_lov_delay_create_clear(&file->f_flags);
716 GOTO(out_och_free, rc);
/* Error unwind: free the handle slot, drop statahead and fd data. */
720 if (och_p && *och_p) {
721 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
722 *och_p = NULL; /* OBD_FREE writes some magic there */
725 mutex_unlock(&lli->lli_och_mutex);
728 if (lli->lli_opendir_key == fd)
729 ll_deauthorize_statahead(inode, fd);
731 ll_file_data_put(fd);
733 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* Release the intent's request ref taken for DISP_ENQ_OPEN_REF. */
736 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
737 ptlrpc_req_finished(it->d.lustre.it_data);
738 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for lease locks: on LDLM_CB_BLOCKING cancel the lock
 * asynchronously (breaking the lease); LDLM_CB_CANCELING needs no work
 * here since the lease holder detects cancellation via ll_lease_close().
 */
744 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
745 struct ldlm_lock_desc *desc, void *data, int flag)
748 struct lustre_handle lockh;
752 case LDLM_CB_BLOCKING:
753 ldlm_lock2handle(lock, &lockh);
754 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
756 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
760 case LDLM_CB_CANCELING:
768 * Acquire a lease and open the file.
/* @fmode must be exactly FMODE_READ or FMODE_WRITE. When @file is given,
 * the existing open handle is reused (passed as op_handle) so the MDS can
 * match the lease to the same owner. Returns the new handle or ERR_PTR. */
770 static struct obd_client_handle *
771 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
774 struct lookup_intent it = { .it_op = IT_OPEN };
775 struct ll_sb_info *sbi = ll_i2sbi(inode);
776 struct md_op_data *op_data;
777 struct ptlrpc_request *req = NULL;
778 struct lustre_handle old_handle = { 0 };
779 struct obd_client_handle *och = NULL;
784 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
785 RETURN(ERR_PTR(-EINVAL));
/* file != NULL branch (presumably): borrow that fd's open handle. */
788 struct ll_inode_info *lli = ll_i2info(inode);
789 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
790 struct obd_client_handle **och_p;
793 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
794 RETURN(ERR_PTR(-EPERM));
796 /* Get the openhandle of the file */
798 mutex_lock(&lli->lli_och_mutex);
/* Only one lease per fd. */
799 if (fd->fd_lease_och != NULL) {
800 mutex_unlock(&lli->lli_och_mutex);
804 if (fd->fd_och == NULL) {
805 if (file->f_mode & FMODE_WRITE) {
806 LASSERT(lli->lli_mds_write_och != NULL);
807 och_p = &lli->lli_mds_write_och;
808 och_usecount = &lli->lli_open_fd_write_count;
810 LASSERT(lli->lli_mds_read_och != NULL);
811 och_p = &lli->lli_mds_read_och;
812 och_usecount = &lli->lli_open_fd_read_count;
/* The shared handle can be privatized only if we are its sole user. */
814 if (*och_usecount == 1) {
821 mutex_unlock(&lli->lli_och_mutex);
822 if (rc < 0) /* more than 1 opener */
825 LASSERT(fd->fd_och != NULL);
826 old_handle = fd->fd_och->och_fh;
831 RETURN(ERR_PTR(-ENOMEM));
833 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
834 LUSTRE_OPC_ANY, NULL);
836 GOTO(out, rc = PTR_ERR(op_data));
838 /* To tell the MDT this openhandle is from the same owner */
839 op_data->op_handle = old_handle;
841 it.it_flags = fmode | open_flags;
842 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
843 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
844 &ll_md_blocking_lease_ast,
845 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
846 * it can be cancelled which may mislead applications that the lease is
848 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
849 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
850 * doesn't deal with openhandle, so normal openhandle will be leaked. */
851 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
852 ll_finish_md_op_data(op_data);
853 ptlrpc_req_finished(req);
855 GOTO(out_release_it, rc);
857 if (it_disposition(&it, DISP_LOOKUP_NEG))
858 GOTO(out_release_it, rc = -ENOENT);
860 rc = it_open_error(DISP_OPEN_OPEN, &it);
862 GOTO(out_release_it, rc);
864 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
865 ll_och_fill(sbi->ll_md_exp, &it, och);
867 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
868 GOTO(out_close, rc = -EOPNOTSUPP);
870 /* already get lease, handle lease lock */
871 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
872 if (it.d.lustre.it_lock_mode == 0 ||
873 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
874 /* open lock must return for lease */
875 CERROR(DFID "lease granted but no open lock, %d/"LPU64".\n",
876 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
877 it.d.lustre.it_lock_bits);
878 GOTO(out_close, rc = -EPROTO);
881 ll_intent_release(&it);
/* Error unwind: cancel the open lock, close the handle, drop the intent. */
885 /* Cancel open lock */
886 if (it.d.lustre.it_lock_mode != 0) {
887 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
888 it.d.lustre.it_lock_mode);
889 it.d.lustre.it_lock_mode = 0;
890 och->och_lease_handle.cookie = 0ULL;
892 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
894 CERROR("%s: error closing file "DFID": %d\n",
895 ll_get_fsname(inode->i_sb, NULL, 0),
896 PFID(&ll_i2info(inode)->lli_fid), rc2);
897 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
899 ll_intent_release(&it);
907 * Release lease and close the file.
908 * It will check if the lease has ever broken.
/* On return, *lease_broken (if non-NULL) says whether the lease lock was
 * already cancelled before we released it. */
910 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
913 struct ldlm_lock *lock;
914 bool cancelled = true;
918 lock = ldlm_handle2lock(&och->och_lease_handle);
/* Check the cancel flag under the resource lock for a stable read. */
920 lock_res_and_lock(lock);
921 cancelled = ldlm_is_cancel(lock);
922 unlock_res_and_lock(lock);
926 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
927 PFID(&ll_i2info(inode)->lli_fid), cancelled);
/* Not yet broken (presumably): cancel the lease lock ourselves. */
930 ldlm_cli_cancel(&och->och_lease_handle, 0);
931 if (lease_broken != NULL)
932 *lease_broken = cancelled;
934 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
939 /* Fills the obdo with the attributes for the lsm */
/* Issue an async OST getattr for @lsm and wait for it; @ioepoch is packed
 * into the obdo and @dv_flags (LL_DV_RD_FLUSH/LL_DV_WR_FLUSH) request
 * server-side lock/flush semantics for data-version reads. */
940 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
941 struct obd_capa *capa, struct obdo *obdo,
942 __u64 ioepoch, int dv_flags)
944 struct ptlrpc_request_set *set;
945 struct obd_info oinfo = { { { 0 } } };
950 LASSERT(lsm != NULL);
954 oinfo.oi_oa->o_oi = lsm->lsm_oi;
955 oinfo.oi_oa->o_mode = S_IFREG;
956 oinfo.oi_oa->o_ioepoch = ioepoch;
957 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
958 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
959 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
960 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
961 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
962 OBD_MD_FLDATAVERSION;
963 oinfo.oi_capa = capa;
964 if (dv_flags & (LL_DV_WR_FLUSH | LL_DV_RD_FLUSH)) {
965 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
966 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
967 if (dv_flags & LL_DV_WR_FLUSH)
968 oinfo.oi_oa->o_flags |= OBD_FL_FLUSH;
971 set = ptlrpc_prep_set();
973 CERROR("cannot allocate ptlrpc set: rc = %d\n", -ENOMEM);
976 rc = obd_getattr_async(exp, &oinfo, set);
978 rc = ptlrpc_set_wait(set);
979 ptlrpc_set_destroy(set);
/* Keep only the attribute bits the caller may trust from the OSTs. */
982 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
983 OBD_MD_FLATIME | OBD_MD_FLMTIME |
984 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
985 OBD_MD_FLDATAVERSION | OBD_MD_FLFLAGS);
/* A requested write-flush that did not happen is an error (presumably
 * mapped to a failure code on a line elided from this excerpt). */
986 if (dv_flags & LL_DV_WR_FLUSH &&
987 !(oinfo.oi_oa->o_valid & OBD_MD_FLFLAGS &&
988 oinfo.oi_oa->o_flags & OBD_FL_FLUSH))
995 * Performs the getattr on the inode and updates its fields.
996 * If @sync != 0, perform the getattr under the server-side lock.
998 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
999 __u64 ioepoch, int sync)
1001 struct obd_capa *capa = ll_mdscapa_get(inode);
1002 struct lov_stripe_md *lsm;
/* Takes a ref on the inode's lsm; released below. */
1006 lsm = ccc_inode_lsm_get(inode);
1007 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
1008 capa, obdo, ioepoch, sync ? LL_DV_RD_FLUSH : 0);
1011 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
1013 obdo_refresh_inode(inode, obdo, obdo->o_valid);
1014 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
1015 " blksize %lu\n", POSTID(oi), i_size_read(inode),
1016 (unsigned long long)inode->i_blocks,
1017 1UL << inode->i_blkbits);
1019 ccc_inode_lsm_put(inode, lsm);
/*
 * Merge MDS-provided timestamps (cached in lli_lvb) with OST attributes
 * obtained via cl_object_attr_get(), keeping the most recent of each, and
 * update the inode's size/blocks/times under the inode size lock.
 */
1023 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
1025 struct ll_inode_info *lli = ll_i2info(inode);
1026 struct cl_object *obj = lli->lli_clob;
1027 struct cl_attr *attr = ccc_env_thread_attr(env);
1033 ll_inode_size_lock(inode);
1034 /* merge timestamps the most recently obtained from mds with
1035 timestamps obtained from osts */
1036 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
1037 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
1038 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
1040 lvb.lvb_size = i_size_read(inode);
1041 lvb.lvb_blocks = inode->i_blocks;
1042 lvb.lvb_mtime = LTIME_S(inode->i_mtime);
1043 lvb.lvb_atime = LTIME_S(inode->i_atime);
1044 lvb.lvb_ctime = LTIME_S(inode->i_ctime);
1046 cl_object_attr_lock(obj);
1047 rc = cl_object_attr_get(env, obj, attr);
1048 cl_object_attr_unlock(obj);
/* Take the newer of each timestamp: MDS value vs. OST value. */
1051 if (lvb.lvb_atime < attr->cat_atime)
1052 lvb.lvb_atime = attr->cat_atime;
1053 if (lvb.lvb_ctime < attr->cat_ctime)
1054 lvb.lvb_ctime = attr->cat_ctime;
1055 if (lvb.lvb_mtime < attr->cat_mtime)
1056 lvb.lvb_mtime = attr->cat_mtime;
1058 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
1059 PFID(&lli->lli_fid), attr->cat_size);
/* Size lock already held, so use the nolock variant. */
1060 cl_isize_write_nolock(inode, attr->cat_size);
1062 inode->i_blocks = attr->cat_blocks;
1064 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1065 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1066 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1068 ll_inode_size_unlock(inode);
/*
 * Fetch current OST attributes for @lsm and copy size/blocks/times into
 * the caller-supplied stat structure (ioctl glimpse path).
 */
1073 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1076 struct obdo obdo = { 0 };
1079 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1081 st->st_size = obdo.o_size;
1082 st->st_blocks = obdo.o_blocks;
1083 st->st_mtime = obdo.o_mtime;
1084 st->st_atime = obdo.o_atime;
1085 st->st_ctime = obdo.o_ctime;
/*
 * Return true if atime updates are disabled for this open file, checking
 * the same flag sources as the kernel's file_accessed()/touch_atime():
 * O_NOATIME, inode flags, mount flags, and the nodiratime cases for dirs.
 */
1090 static bool file_is_noatime(const struct file *file)
1092 const struct vfsmount *mnt = file->f_path.mnt;
1093 const struct inode *inode = file->f_path.dentry->d_inode;
1095 /* Adapted from file_accessed() and touch_atime().*/
1096 if (file->f_flags & O_NOATIME)
1099 if (inode->i_flags & S_NOATIME)
1102 if (IS_NOATIME(inode))
1105 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1108 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1111 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/*
 * Initialize a cl_io from the file's open flags: nonblocking/append/sync
 * write semantics, the cl_object target, the lock-request policy, and
 * the noatime setting.
 */
1117 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
1119 struct inode *inode = file->f_dentry->d_inode;
1121 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1123 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1124 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1125 file->f_flags & O_DIRECT ||
1128 io->ci_obj = ll_i2info(inode)->lli_clob;
1129 io->ci_lockreq = CILR_MAYBE;
/* nolock mount: never take DLM extent locks; O_APPEND requires them. */
1130 if (ll_file_nolock(file)) {
1131 io->ci_lockreq = CILR_NEVER;
1132 io->ci_no_srvlock = 1;
1133 } else if (file->f_flags & O_APPEND) {
1134 io->ci_lockreq = CILR_MANDATORY;
1137 io->ci_noatime = file_is_noatime(file);
/*
 * Common read/write engine: set up a cl_io of type @iot at *@ppos for
 * @count bytes, take the per-file range lock for normal writes, run the
 * cl_io loop, restart on short zero-progress IO, and tally stats.
 * NOTE(review): the return type and some control-flow lines are elided
 * from this excerpt.
 */
1141 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1142 struct file *file, enum cl_io_type iot,
1143 loff_t *ppos, size_t count)
1145 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
1146 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1149 struct range_lock range;
1152 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zd\n",
1153 file->f_dentry->d_name.name, iot, *ppos, count);
1156 io = ccc_env_thread_io(env);
1157 ll_io_init(io, file, iot == CIT_WRITE);
1159 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1160 struct vvp_io *vio = vvp_env_io(env);
1161 struct ccc_io *cio = ccc_env_io(env);
1162 bool range_locked = false;
/* O_APPEND writes to EOF, so lock the whole file range. */
1164 if (file->f_flags & O_APPEND)
1165 range_lock_init(&range, 0, LUSTRE_EOF);
1167 range_lock_init(&range, *ppos, *ppos + count - 1);
1168 cio->cui_fd = LUSTRE_FPRIVATE(file);
1169 vio->cui_io_subtype = args->via_io_subtype;
1171 switch (vio->cui_io_subtype) {
/* IO_NORMAL case: iovec-based read/write. */
1173 cio->cui_iov = args->u.normal.via_iov;
1174 cio->cui_nrsegs = args->u.normal.via_nrsegs;
1175 cio->cui_tot_nrsegs = cio->cui_nrsegs;
1176 cio->cui_iocb = args->u.normal.via_iocb;
/* Group-locked writes already serialize, so skip the range lock. */
1177 if ((iot == CIT_WRITE) &&
1178 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1179 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1181 result = range_lock(&lli->lli_write_tree,
1186 range_locked = true;
/* trunc_sem serializes normal IO against truncate. */
1188 down_read(&lli->lli_trunc_sem);
/* IO_SPLICE case: pipe-based transfer. */
1191 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1192 vio->u.splice.cui_flags = args->u.splice.via_flags;
1195 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
1199 ll_cl_add(file, env, io);
1200 result = cl_io_loop(env, io);
1201 ll_cl_remove(file, env);
1203 if (args->via_io_subtype == IO_NORMAL)
1204 up_read(&lli->lli_trunc_sem);
1206 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1208 range_unlock(&lli->lli_write_tree, &range);
1211 /* cl_io_rw_init() handled IO */
1212 result = io->ci_result;
/* Report bytes transferred and advance the file position. */
1215 if (io->ci_nob > 0) {
1216 result = io->ci_nob;
1217 *ppos = io->u.ci_wr.wr.crw_pos;
1221 cl_io_fini(env, io);
1222 /* If any bit been read/written (result != 0), we just return
1223 * short read/write instead of restart io. */
1224 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1225 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
1226 iot == CIT_READ ? "read" : "write",
1227 file->f_dentry->d_name.name, *ppos, count);
1228 LASSERTF(io->ci_nob == 0, "%zd\n", io->ci_nob);
/* Account transferred bytes and track write-failure state for fsync. */
1232 if (iot == CIT_READ) {
1234 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1235 LPROC_LL_READ_BYTES, result);
1236 } else if (iot == CIT_WRITE) {
1238 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1239 LPROC_LL_WRITE_BYTES, result);
1240 fd->fd_write_failed = false;
1241 } else if (result != -ERESTARTSYS) {
1242 fd->fd_write_failed = true;
1245 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1252 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array and compute the total byte count into *count,
 * possibly shortening *nr_segs at the first invalid segment.
 * XXX: exact copy of the kernel's __generic_file_aio_write_nolock helper.
 */
1254 static int ll_file_get_iov_count(const struct iovec *iov,
1255 unsigned long *nr_segs, size_t *count)
1260 for (seg = 0; seg < *nr_segs; seg++) {
1261 const struct iovec *iv = &iov[seg];
1264 * If any segment has a negative length, or the cumulative
1265 * length ever wraps negative then return -EINVAL.
/* cnt|iv->iov_len < 0 (as ssize_t) catches both a huge segment and a
 * cumulative overflow in one test. */
1268 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1270 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1275 cnt -= iv->iov_len; /* This segment is no good */
/*
 * aio_read entry point: validate the iovec, package it into vvp_io_args
 * and hand off to ll_file_io_generic() as a CIT_READ.
 * Returns bytes read or negative errno.
 */
1282 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1283 unsigned long nr_segs, loff_t pos)
1286 struct vvp_io_args *args;
1292 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1296 env = cl_env_get(&refcheck);
1298 RETURN(PTR_ERR(env));
1300 args = vvp_env_args(env, IO_NORMAL);
/* Cast drops const: vvp_io_args stores a mutable iovec pointer. */
1301 args->u.normal.via_iov = (struct iovec *)iov;
1302 args->u.normal.via_nrsegs = nr_segs;
1303 args->u.normal.via_iocb = iocb;
1305 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1306 &iocb->ki_pos, count);
1307 cl_env_put(env, &refcheck);
/*
 * Synchronous read(): build a one-segment iovec and a sync kiocb from
 * per-env scratch space, then delegate to ll_file_aio_read().
 * *ppos is updated from the kiocb on return.
 */
1311 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1315 struct iovec *local_iov;
1316 struct kiocb *kiocb;
1321 env = cl_env_get(&refcheck);
1323 RETURN(PTR_ERR(env));
1325 local_iov = &vvp_env_info(env)->vti_local_iov;
1326 kiocb = &vvp_env_info(env)->vti_kiocb;
1327 local_iov->iov_base = (void __user *)buf;
1328 local_iov->iov_len = count;
1329 init_sync_kiocb(kiocb, file);
1330 kiocb->ki_pos = *ppos;
/* Field name differs across kernel versions (ki_left vs ki_nbytes). */
1331 #ifdef HAVE_KIOCB_KI_LEFT
1332 kiocb->ki_left = count;
1334 kiocb->ki_nbytes = count;
1337 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1338 *ppos = kiocb->ki_pos;
1340 cl_env_put(env, &refcheck);
1345 * Write to a file (through the page cache).
/*
 * aio_write entry point: mirror of ll_file_aio_read() for CIT_WRITE.
 * Returns bytes written or negative errno.
 */
1348 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1349 unsigned long nr_segs, loff_t pos)
1352 struct vvp_io_args *args;
1358 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1362 env = cl_env_get(&refcheck);
1364 RETURN(PTR_ERR(env));
1366 args = vvp_env_args(env, IO_NORMAL);
/* Cast drops const: vvp_io_args stores a mutable iovec pointer. */
1367 args->u.normal.via_iov = (struct iovec *)iov;
1368 args->u.normal.via_nrsegs = nr_segs;
1369 args->u.normal.via_iocb = iocb;
1371 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1372 &iocb->ki_pos, count);
1373 cl_env_put(env, &refcheck);
/*
 * Synchronous write(): build a one-segment iovec and sync kiocb, then
 * delegate to ll_file_aio_write(); mirrors ll_file_read().
 */
1377 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1378 size_t count, loff_t *ppos)
1381 struct iovec *local_iov;
1382 struct kiocb *kiocb;
1387 env = cl_env_get(&refcheck);
1389 RETURN(PTR_ERR(env));
1391 local_iov = &vvp_env_info(env)->vti_local_iov;
1392 kiocb = &vvp_env_info(env)->vti_kiocb;
1393 local_iov->iov_base = (void __user *)buf;
1394 local_iov->iov_len = count;
1395 init_sync_kiocb(kiocb, file);
1396 kiocb->ki_pos = *ppos;
/* Field name differs across kernel versions (ki_left vs ki_nbytes). */
1397 #ifdef HAVE_KIOCB_KI_LEFT
1398 kiocb->ki_left = count;
1400 kiocb->ki_nbytes = count;
1403 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1404 *ppos = kiocb->ki_pos;
1406 cl_env_put(env, &refcheck);
1411 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read entry point: package the pipe and flags as IO_SPLICE args
 * and run a CIT_READ through ll_file_io_generic().
 */
1413 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1414 struct pipe_inode_info *pipe, size_t count,
1418 struct vvp_io_args *args;
1423 env = cl_env_get(&refcheck);
1425 RETURN(PTR_ERR(env));
1427 args = vvp_env_args(env, IO_SPLICE);
1428 args->u.splice.via_pipe = pipe;
1429 args->u.splice.via_flags = flags;
1431 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1432 cl_env_put(env, &refcheck);
1436 static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
1439 struct obd_export *exp = ll_i2dtexp(inode);
1440 struct obd_trans_info oti = { 0 };
1441 struct obdo *oa = NULL;
1444 struct lov_stripe_md *lsm = NULL, *lsm2;
1451 lsm = ccc_inode_lsm_get(inode);
1452 if (!lsm_has_objects(lsm))
1453 GOTO(out, rc = -ENOENT);
1455 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1456 (lsm->lsm_stripe_count));
1458 OBD_ALLOC_LARGE(lsm2, lsm_size);
1460 GOTO(out, rc = -ENOMEM);
1463 oa->o_nlink = ost_idx;
1464 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1465 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1466 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1467 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1468 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1469 memcpy(lsm2, lsm, lsm_size);
1470 ll_inode_size_lock(inode);
1471 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1472 ll_inode_size_unlock(inode);
1474 OBD_FREE_LARGE(lsm2, lsm_size);
1477 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_RECREATE_OBJ handler: copy a struct ll_recreate_obj from user
 * space and recreate the object by id on the requested OST.
 * Requires CAP_SYS_ADMIN.
 */
1482 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1484 struct ll_recreate_obj ucreat;
1488 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1491 if (copy_from_user(&ucreat, (struct ll_recreate_obj __user *)arg,
1495 ostid_set_seq_mdt0(&oi);
1496 ostid_set_id(&oi, ucreat.lrc_id);
1497 RETURN(ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx));
/*
 * LL_IOC_RECREATE_FID handler: copy a lu_fid from user space, convert it
 * to an ostid and recreate the object; the OST index is encoded in the
 * FID sequence. Requires CAP_SYS_ADMIN.
 */
1500 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1507 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1510 if (copy_from_user(&fid, (struct lu_fid __user *)arg, sizeof(fid)))
1513 fid_to_ostid(&fid, &oi);
/* OST index lives in bits 16..31 of the FID sequence for IDIF fids. */
1514 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1515 RETURN(ll_lov_recreate(inode, &oi, ost_idx));
/*
 * Apply a user-supplied striping EA (\a lum) to \a inode by re-opening
 * the file with an IT_OPEN intent carrying the layout. Fails with
 * -EEXIST if a layout is already present.
 */
1518 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1519 __u64 flags, struct lov_user_md *lum,
1522 struct lov_stripe_md *lsm = NULL;
1523 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1527 lsm = ccc_inode_lsm_get(inode);
1529 ccc_inode_lsm_put(inode, lsm);
1530 CDEBUG(D_IOCTL, "stripe already exists for inode "DFID"\n",
1531 PFID(ll_inode2fid(inode)));
1532 GOTO(out, rc = -EEXIST);
1535 ll_inode_size_lock(inode);
1536 oit.it_flags |= MDS_OPEN_BY_FID;
1537 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1539 GOTO(out_unlock, rc);
1540 rc = oit.d.lustre.it_status;
1542 GOTO(out_req_free, rc);
/* The open handle from the intent is not kept; close it right away. */
1544 ll_release_openhandle(file->f_dentry, &oit);
1547 ll_inode_size_unlock(inode);
1548 ll_intent_release(&oit);
1549 ccc_inode_lsm_put(inode, lsm);
1551 cl_lov_delay_create_clear(&file->f_flags);
1554 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA for \a filename (child of \a inode) from the MDS.
 * On success *lmmp points into the reply buffer of *request (caller
 * must keep the request until done with the EA) and *lmm_size is set.
 * The EA is byte-swapped to host endianness for V1/V3 magics.
 */
1558 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1559 struct lov_mds_md **lmmp, int *lmm_size,
1560 struct ptlrpc_request **request)
1562 struct ll_sb_info *sbi = ll_i2sbi(inode);
1563 struct mdt_body *body;
1564 struct lov_mds_md *lmm = NULL;
1565 struct ptlrpc_request *req = NULL;
1566 struct md_op_data *op_data;
1569 rc = ll_get_default_mdsize(sbi, &lmmsize);
1573 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1574 strlen(filename), lmmsize,
1575 LUSTRE_OPC_ANY, NULL);
1576 if (IS_ERR(op_data))
1577 RETURN(PTR_ERR(op_data));
1579 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1580 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1581 ll_finish_md_op_data(op_data);
1583 CDEBUG(D_INFO, "md_getattr_name failed "
1584 "on %s: rc %d\n", filename, rc);
1588 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1589 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1591 lmmsize = body->mbo_eadatasize;
1593 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1595 GOTO(out, rc = -ENODATA);
1598 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1599 LASSERT(lmm != NULL);
1601 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1602 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1603 GOTO(out, rc = -EPROTO);
1607 * This is coming from the MDS, so is probably in
1608 * little endian. We convert it to host endian before
1609 * passing it to userspace.
/* Swab only needed on big-endian hosts (LOV_MAGIC != its LE form). */
1611 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1614 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1615 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
/* If called on a directory, the EA is a default layout with no
 * per-object entries, so skip swabbing the (non-existent) objects. */
1618 /* if function called for directory - we should
1619 * avoid swab not existent lsm objects */
1620 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1621 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1622 if (S_ISREG(body->mbo_mode))
1623 lustre_swab_lov_user_md_objects(
1624 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1626 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1627 lustre_swab_lov_user_md_v3(
1628 (struct lov_user_md_v3 *)lmm);
1629 if (S_ISREG(body->mbo_mode))
1630 lustre_swab_lov_user_md_objects(
1631 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1638 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: copy a lov_user_md (plus one ost_data slot)
 * from user space and apply it via ll_lov_setstripe_ea_info() with
 * MDS_OPEN_HAS_OBJS. Requires CAP_SYS_ADMIN.
 */
1643 static int ll_lov_setea(struct inode *inode, struct file *file,
1646 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1647 struct lov_user_md *lump;
1648 int lum_size = sizeof(struct lov_user_md) +
1649 sizeof(struct lov_user_ost_data);
1653 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1656 OBD_ALLOC_LARGE(lump, lum_size);
1660 if (copy_from_user(lump, (struct lov_user_md __user *)arg, lum_size)) {
1661 OBD_FREE_LARGE(lump, lum_size);
1665 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1667 OBD_FREE_LARGE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE handler: read the user's lov_user_md (V1 first,
 * re-read as V3 if the magic says so), set the layout, then refresh the
 * layout generation and echo the resulting stripe info back to user
 * space via LL_IOC_LOV_GETSTRIPE.
 */
1671 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1674 struct lov_user_md_v3 lumv3;
/* lumv1 aliases the head of lumv3 — V1 is a prefix of V3. */
1675 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1676 struct lov_user_md_v1 __user *lumv1p =
1677 (struct lov_user_md_v1 __user *)arg;
1678 struct lov_user_md_v3 __user *lumv3p =
1679 (struct lov_user_md_v3 __user *)arg;
1681 __u64 flags = FMODE_WRITE;
1684 /* first try with v1 which is smaller than v3 */
1685 lum_size = sizeof(struct lov_user_md_v1);
1686 if (copy_from_user(lumv1, lumv1p, lum_size))
1689 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1690 lum_size = sizeof(struct lov_user_md_v3);
1691 if (copy_from_user(&lumv3, lumv3p, lum_size))
1695 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1697 struct lov_stripe_md *lsm;
1700 put_user(0, &lumv1p->lmm_stripe_count);
1702 ll_layout_refresh(inode, &gen);
1703 lsm = ccc_inode_lsm_get(inode);
1704 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1705 0, lsm, (void __user *)arg);
1706 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_LOV_GETSTRIPE handler: copy the inode's current striping to
 * the user buffer at \a arg via the LOV obd_iocontrol path.
 */
1711 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1713 struct lov_stripe_md *lsm;
1717 lsm = ccc_inode_lsm_get(inode);
1719 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1720 lsm, (void __user *)arg);
1721 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_GROUP_LOCK handler: take a cluster-wide group lock with gid
 * \a arg on \a inode and record it in the file descriptor. Racing
 * takers are detected under lli_lock both before and after the (possibly
 * blocking) cl_get_grouplock() call.
 */
1726 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1728 struct ll_inode_info *lli = ll_i2info(inode);
1729 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1730 struct ccc_grouplock grouplock;
1734 if (ll_file_nolock(file))
1735 RETURN(-EOPNOTSUPP);
1737 spin_lock(&lli->lli_lock);
1738 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1739 CWARN("group lock already existed with gid %lu\n",
1740 fd->fd_grouplock.cg_gid);
1741 spin_unlock(&lli->lli_lock);
1744 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1745 spin_unlock(&lli->lli_lock);
/* Drop the spinlock across the enqueue — it may block (O_NONBLOCK
 * controls whether the DLM request waits). */
1747 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1748 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1752 spin_lock(&lli->lli_lock);
/* Re-check: another thread may have installed a lock while we slept. */
1753 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1754 spin_unlock(&lli->lli_lock);
1755 CERROR("another thread just won the race\n");
1756 cl_put_grouplock(&grouplock);
1760 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1761 fd->fd_grouplock = grouplock;
1762 spin_unlock(&lli->lli_lock);
1764 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK handler: release the group lock with gid \a arg
 * held on this file descriptor. The fd state is cleared under lli_lock;
 * the DLM lock itself is dropped after the spinlock is released.
 */
1768 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1770 struct ll_inode_info *lli = ll_i2info(inode);
1771 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1772 struct ccc_grouplock grouplock;
1775 spin_lock(&lli->lli_lock);
1776 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1777 spin_unlock(&lli->lli_lock);
1778 CWARN("no group lock held\n");
1781 LASSERT(fd->fd_grouplock.cg_lock != NULL);
1783 if (fd->fd_grouplock.cg_gid != arg) {
1784 CWARN("group lock %lu doesn't match current id %lu\n",
1785 arg, fd->fd_grouplock.cg_gid);
1786 spin_unlock(&lli->lli_lock);
/* Snapshot then clear under the lock; release outside it. */
1790 grouplock = fd->fd_grouplock;
1791 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1792 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1793 spin_unlock(&lli->lli_lock);
1795 cl_put_grouplock(&grouplock);
1796 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1801 * Close inode open handle
1803 * \param dentry [in] dentry which contains the inode
1804 * \param it [in,out] intent which contains open info and result
1807 * \retval <0 failure
/*
 * Close the MDS open handle carried by a lookup intent when the handle
 * is not going to be kept (e.g. after a setstripe open). No-op for the
 * filesystem root and for intents with no open disposition.
 */
1809 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1811 struct inode *inode = dentry->d_inode;
1812 struct obd_client_handle *och;
1818 /* Root ? Do nothing. */
1819 if (dentry->d_inode->i_sb->s_root == dentry)
1822 /* No open handle to close? Move away */
1823 if (!it_disposition(it, DISP_OPEN_OPEN))
1826 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1828 OBD_ALLOC(och, sizeof(*och));
1830 GOTO(out, rc = -ENOMEM);
1832 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1834 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1837 /* this one is in place of ll_file_open */
1838 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1839 ptlrpc_req_finished(it->d.lustre.it_data);
1840 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1846 * Get size for inode for which FIEMAP mapping is requested.
1847 * Make the FIEMAP get_info call and returns the result.
/*
 * Perform the FIEMAP mapping for \a inode: validate flags, optionally
 * flush dirty pages (FIEMAP_FLAG_SYNC), then ask the LOV layer via
 * obd_get_info(KEY_FIEMAP). \a num_bytes bounds the in/out buffer.
 */
1849 static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1852 struct obd_export *exp = ll_i2dtexp(inode);
1853 struct lov_stripe_md *lsm = NULL;
1854 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1855 __u32 vallen = num_bytes;
1859 /* Checks for fiemap flags */
1860 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* Report back which flags we do not support by stripping the
 * supported ones from fm_flags before failing. */
1861 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1865 /* Check for FIEMAP_FLAG_SYNC */
1866 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1867 rc = filemap_fdatawrite(inode->i_mapping);
1872 lsm = ccc_inode_lsm_get(inode);
1876 /* If the stripe_count > 1 and the application does not understand
1877 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1879 if (lsm->lsm_stripe_count > 1 &&
1880 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1881 GOTO(out, rc = -EOPNOTSUPP);
1883 fm_key.oa.o_oi = lsm->lsm_oi;
1884 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1886 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1887 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1888 /* If filesize is 0, then there would be no objects for mapping */
1889 if (fm_key.oa.o_size == 0) {
1890 fiemap->fm_mapped_extents = 0;
1894 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1896 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1899 CERROR("obd_get_info failed: rc = %d\n", rc);
1902 ccc_inode_lsm_put(inode, lsm);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a path via the MDC.
 * The output buffer is sized from the user-supplied gf_pathlen (capped
 * at PATH_MAX) and copied back whole on success. Requires
 * CAP_DAC_READ_SEARCH unless the mount allows user fid2path.
 */
1906 int ll_fid2path(struct inode *inode, void __user *arg)
1908 struct obd_export *exp = ll_i2mdexp(inode);
1909 const struct getinfo_fid2path __user *gfin = arg;
1911 struct getinfo_fid2path *gfout;
1917 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1918 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1921 /* Only need to get the buflen */
1922 if (get_user(pathlen, &gfin->gf_pathlen))
/* Bound the allocation — pathlen is untrusted user input. */
1925 if (pathlen > PATH_MAX)
1928 outsize = sizeof(*gfout) + pathlen;
1929 OBD_ALLOC(gfout, outsize);
1933 if (copy_from_user(gfout, arg, sizeof(*gfout)))
1934 GOTO(gf_free, rc = -EFAULT);
1936 /* Call mdc_iocontrol */
1937 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1941 if (copy_to_user(arg, gfout, outsize))
1945 OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP handler: size a kernel fiemap buffer from the
 * user's fm_extent_count (with overflow check), copy the request in,
 * run ll_do_fiemap() and copy the header plus mapped extents back.
 */
1949 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1951 struct ll_user_fiemap *fiemap_s;
1952 size_t num_bytes, ret_bytes;
1953 unsigned int extent_count;
1956 /* Get the extent count so we can calculate the size of
1957 * required fiemap buffer */
1958 if (get_user(extent_count,
1959 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
/* Reject counts that would overflow the num_bytes computation. */
1963 (SIZE_MAX - sizeof(*fiemap_s)) / sizeof(struct ll_fiemap_extent))
1965 num_bytes = sizeof(*fiemap_s) + (extent_count *
1966 sizeof(struct ll_fiemap_extent));
1968 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1969 if (fiemap_s == NULL)
1972 /* get the fiemap value */
1973 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1975 GOTO(error, rc = -EFAULT);
1977 /* If fm_extent_count is non-zero, read the first extent since
1978 * it is used to calculate end_offset and device from previous
1981 if (copy_from_user(&fiemap_s->fm_extents[0],
1982 (char __user *)arg + sizeof(*fiemap_s),
1983 sizeof(struct ll_fiemap_extent)))
1984 GOTO(error, rc = -EFAULT);
1987 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
/* Copy back the header plus however many extents were mapped. */
1991 ret_bytes = sizeof(struct ll_user_fiemap);
1993 if (extent_count != 0)
1994 ret_bytes += (fiemap_s->fm_mapped_extents *
1995 sizeof(struct ll_fiemap_extent));
1997 if (copy_to_user((void __user *)arg, fiemap_s, ret_bytes))
2001 OBD_FREE_LARGE(fiemap_s, num_bytes);
2006 * Read the data_version for inode.
2008 * This value is computed using stripe object version on OST.
2009 * Version is computed using server side locking.
2011 * @param sync if do sync on the OST side;
2013 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2014 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/*
 * Read the data version for \a inode from the OSTs via ll_lsm_getattr().
 * A stripe-less file reports version 0. \a flags selects the OST-side
 * flush behaviour (LL_DV_RD_FLUSH / LL_DV_WR_FLUSH, see header comment).
 */
2016 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2018 struct lov_stripe_md *lsm = NULL;
2019 struct ll_sb_info *sbi = ll_i2sbi(inode);
2020 struct obdo *obdo = NULL;
2024 /* If no stripe, we consider version is 0. */
2025 lsm = ccc_inode_lsm_get(inode);
2026 if (!lsm_has_objects(lsm)) {
2028 CDEBUG(D_INODE, "No object for inode\n");
2032 OBD_ALLOC_PTR(obdo);
2034 GOTO(out, rc = -ENOMEM);
2036 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, flags);
/* Only trust o_data_version when the OST marked it valid. */
2038 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
2041 *data_version = obdo->o_data_version;
2047 ccc_inode_lsm_put(inode, lsm);
2052 * Trigger a HSM release request for the provided inode.
/*
 * HSM release: take a write lease on \a inode, flush and fetch the
 * latest data version, merge size/time attributes, then close the file
 * with MDS_HSM_RELEASE so the MDT can drop the OST objects.
 */
2054 int ll_hsm_release(struct inode *inode)
2056 struct cl_env_nest nest;
2058 struct obd_client_handle *och = NULL;
2059 __u64 data_version = 0;
2063 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2064 ll_get_fsname(inode->i_sb, NULL, 0),
2065 PFID(&ll_i2info(inode)->lli_fid));
2067 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2069 GOTO(out, rc = PTR_ERR(och));
2071 /* Grab latest data_version and [am]time values */
/* LL_DV_WR_FLUSH drops all cached pages so the version is final. */
2072 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2076 env = cl_env_nested_get(&nest);
2078 GOTO(out, rc = PTR_ERR(env));
2080 ll_merge_lvb(env, inode);
2081 cl_env_nested_put(&nest, env);
2083 /* Release the file.
2084 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2085 * we still need it to pack l_remote_handle to MDT. */
2086 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
2092 if (och != NULL && !IS_ERR(och)) /* close the file */
2093 ll_lease_close(och, inode, NULL);
/* Scratch state for ll_swap_layouts(): saved [am]times (ia1/ia2), the
 * two inodes in FID order, and per-inode data-version check requests. */
2098 struct ll_swap_stack {
2099 struct iattr ia1, ia2;
2101 struct inode *inode1, *inode2;
2102 bool check_dv1, check_dv2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS implementation: atomically swap the layouts
 * of two regular files on the same filesystem via the MDT. Optionally
 * flushes both files under a group lock, verifies data versions, and
 * restores mtime/atime afterwards if requested by lsl->sl_flags.
 * NOTE(review): several lines (gid assignment, RETURNs) are elided in
 * this excerpt.
 */
2105 static int ll_swap_layouts(struct file *file1, struct file *file2,
2106 struct lustre_swap_layouts *lsl)
2108 struct mdc_swap_layouts msl;
2109 struct md_op_data *op_data;
2112 struct ll_swap_stack *llss = NULL;
2115 OBD_ALLOC_PTR(llss);
2119 llss->inode1 = file1->f_dentry->d_inode;
2120 llss->inode2 = file2->f_dentry->d_inode;
2122 if (!S_ISREG(llss->inode2->i_mode))
2123 GOTO(free, rc = -EINVAL);
2125 if (inode_permission(llss->inode1, MAY_WRITE) ||
2126 inode_permission(llss->inode2, MAY_WRITE))
2127 GOTO(free, rc = -EPERM);
2129 if (llss->inode2->i_sb != llss->inode1->i_sb)
2130 GOTO(free, rc = -EXDEV);
2132 /* we use 2 bool because it is easier to swap than 2 bits */
2133 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2134 llss->check_dv1 = true;
2136 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2137 llss->check_dv2 = true;
2139 /* we cannot use lsl->sl_dvX directly because we may swap them */
2140 llss->dv1 = lsl->sl_dv1;
2141 llss->dv2 = lsl->sl_dv2;
2143 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2144 if (rc == 0) /* same file, done! */
/* Order the pair by FID to avoid ABBA deadlocks between two racing
 * swap requests; swap the per-inode bookkeeping to match. */
2147 if (rc < 0) { /* sequentialize it */
2148 swap(llss->inode1, llss->inode2);
2150 swap(llss->dv1, llss->dv2);
2151 swap(llss->check_dv1, llss->check_dv2);
2155 if (gid != 0) { /* application asks to flush dirty cache */
2156 rc = ll_get_grouplock(llss->inode1, file1, gid);
2160 rc = ll_get_grouplock(llss->inode2, file2, gid);
2162 ll_put_grouplock(llss->inode1, file1, gid);
2167 /* to be able to restore mtime and atime after swap
2168 * we need to first save them */
2170 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2171 llss->ia1.ia_mtime = llss->inode1->i_mtime;
2172 llss->ia1.ia_atime = llss->inode1->i_atime;
2173 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2174 llss->ia2.ia_mtime = llss->inode2->i_mtime;
2175 llss->ia2.ia_atime = llss->inode2->i_atime;
2176 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2179 /* ultimate check, before swaping the layouts we check if
2180 * dataversion has changed (if requested) */
2181 if (llss->check_dv1) {
2182 rc = ll_data_version(llss->inode1, &dv, 0);
2185 if (dv != llss->dv1)
2186 GOTO(putgl, rc = -EAGAIN);
2189 if (llss->check_dv2) {
2190 rc = ll_data_version(llss->inode2, &dv, 0);
2193 if (dv != llss->dv2)
2194 GOTO(putgl, rc = -EAGAIN);
2197 /* struct md_op_data is used to send the swap args to the mdt
2198 * only flags is missing, so we use struct mdc_swap_layouts
2199 * through the md_op_data->op_data */
2200 /* flags from user space have to be converted before they are send to
2201 * server, no flag is sent today, they are only used on the client */
2204 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2205 0, LUSTRE_OPC_ANY, &msl);
2206 if (IS_ERR(op_data))
2207 GOTO(free, rc = PTR_ERR(op_data));
2209 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2210 sizeof(*op_data), op_data, NULL);
2211 ll_finish_md_op_data(op_data);
2215 ll_put_grouplock(llss->inode2, file2, gid);
2216 ll_put_grouplock(llss->inode1, file1, gid);
2219 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2223 /* clear useless flags */
2224 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2225 llss->ia1.ia_valid &= ~ATTR_MTIME;
2226 llss->ia2.ia_valid &= ~ATTR_MTIME;
2229 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2230 llss->ia1.ia_valid &= ~ATTR_ATIME;
2231 llss->ia2.ia_valid &= ~ATTR_ATIME;
2234 /* update time if requested */
/* Note ia2 is applied to inode1 and ia1 to inode2 — the inodes may
 * have been swapped above, so the saved attrs follow their file. */
2236 if (llss->ia2.ia_valid != 0) {
2237 mutex_lock(&llss->inode1->i_mutex);
2238 rc = ll_setattr(file1->f_dentry, &llss->ia2);
2239 mutex_unlock(&llss->inode1->i_mutex);
2242 if (llss->ia1.ia_valid != 0) {
2245 mutex_lock(&llss->inode2->i_mutex);
2246 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2247 mutex_unlock(&llss->inode2->i_mutex);
/*
 * Set/clear HSM state flags for \a inode on the MDT. Flags outside
 * HSM_USER_MASK require CAP_SYS_ADMIN.
 */
2259 static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2261 struct md_op_data *op_data;
2264 /* Non-root users are forbidden to set or clear flags which are
2265 * NOT defined in HSM_USER_MASK. */
2266 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2267 !cfs_capable(CFS_CAP_SYS_ADMIN))
2270 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2271 LUSTRE_OPC_ANY, hss);
2272 if (IS_ERR(op_data))
2273 RETURN(PTR_ERR(op_data));
2275 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2276 sizeof(*op_data), op_data, NULL);
2278 ll_finish_md_op_data(op_data);
/*
 * HSM import: mark a freshly-created regular file as
 * ARCHIVED|EXISTS|RELEASED in archive hui->hui_archive_id, then restore
 * the original mode/uid/gid/size/times from \a hui via ll_setattr_raw().
 */
2283 static int ll_hsm_import(struct inode *inode, struct file *file,
2284 struct hsm_user_import *hui)
2286 struct hsm_state_set *hss = NULL;
2287 struct iattr *attr = NULL;
2291 if (!S_ISREG(inode->i_mode))
2297 GOTO(out, rc = -ENOMEM);
2299 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2300 hss->hss_archive_id = hui->hui_archive_id;
2301 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2302 rc = ll_hsm_state_set(inode, hss);
2306 OBD_ALLOC_PTR(attr);
2308 GOTO(out, rc = -ENOMEM);
/* Force S_IFREG and strip everything but the permission bits. */
2310 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2311 attr->ia_mode |= S_IFREG;
2312 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2313 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2314 attr->ia_size = hui->hui_size;
2315 attr->ia_mtime.tv_sec = hui->hui_mtime;
2316 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2317 attr->ia_atime.tv_sec = hui->hui_atime;
2318 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2320 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2321 ATTR_UID | ATTR_GID |
2322 ATTR_MTIME | ATTR_MTIME_SET |
2323 ATTR_ATIME | ATTR_ATIME_SET;
2325 mutex_lock(&inode->i_mutex);
2327 rc = ll_setattr_raw(file->f_dentry, attr, true);
2331 mutex_unlock(&inode->i_mutex);
2343 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2345 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2346 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * unlocked_ioctl handler for regular files: dispatches every llite
 * file-level ioctl (flags, striping, layout swap, group locks, fiemap,
 * fid2path, data version, HSM state/action/import, leases) and falls
 * through to registered llioc handlers / obd_iocontrol for the rest.
 * NOTE(review): many case labels and RETURNs are elided in this excerpt.
 */
2350 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2352 struct inode *inode = file->f_dentry->d_inode;
2353 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2357 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2358 PFID(ll_inode2fid(inode)), inode, cmd);
2359 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2361 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2362 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2366 case LL_IOC_GETFLAGS:
2367 /* Get the current value of the file flags */
2368 return put_user(fd->fd_flags, (int __user *)arg);
2369 case LL_IOC_SETFLAGS:
2370 case LL_IOC_CLRFLAGS:
2371 /* Set or clear specific file flags */
2372 /* XXX This probably needs checks to ensure the flags are
2373 * not abused, and to handle any flag side effects.
2375 if (get_user(flags, (int __user *) arg))
2378 if (cmd == LL_IOC_SETFLAGS) {
/* Disabling llite locking is only safe for O_DIRECT I/O. */
2379 if ((flags & LL_FILE_IGNORE_LOCK) &&
2380 !(file->f_flags & O_DIRECT)) {
2381 CERROR("%s: unable to disable locking on "
2382 "non-O_DIRECT file\n", current->comm);
2386 fd->fd_flags |= flags;
2388 fd->fd_flags &= ~flags;
2391 case LL_IOC_LOV_SETSTRIPE:
2392 RETURN(ll_lov_setstripe(inode, file, arg));
2393 case LL_IOC_LOV_SETEA:
2394 RETURN(ll_lov_setea(inode, file, arg));
2395 case LL_IOC_LOV_SWAP_LAYOUTS: {
2397 struct lustre_swap_layouts lsl;
2399 if (copy_from_user(&lsl, (char __user *)arg,
2400 sizeof(struct lustre_swap_layouts)))
/* Both files must be open for writing for a layout swap. */
2403 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2406 file2 = fget(lsl.sl_fd);
2411 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2412 rc = ll_swap_layouts(file, file2, &lsl);
2416 case LL_IOC_LOV_GETSTRIPE:
2417 RETURN(ll_lov_getstripe(inode, arg));
2418 case LL_IOC_RECREATE_OBJ:
2419 RETURN(ll_lov_recreate_obj(inode, arg));
2420 case LL_IOC_RECREATE_FID:
2421 RETURN(ll_lov_recreate_fid(inode, arg));
2422 case FSFILT_IOC_FIEMAP:
2423 RETURN(ll_ioctl_fiemap(inode, arg));
2424 case FSFILT_IOC_GETFLAGS:
2425 case FSFILT_IOC_SETFLAGS:
2426 RETURN(ll_iocontrol(inode, file, cmd, arg));
2427 case FSFILT_IOC_GETVERSION_OLD:
2428 case FSFILT_IOC_GETVERSION:
2429 RETURN(put_user(inode->i_generation, (int __user *)arg));
2430 case LL_IOC_GROUP_LOCK:
2431 RETURN(ll_get_grouplock(inode, file, arg));
2432 case LL_IOC_GROUP_UNLOCK:
2433 RETURN(ll_put_grouplock(inode, file, arg));
2434 case IOC_OBD_STATFS:
2435 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2437 /* We need to special case any other ioctls we want to handle,
2438 * to send them to the MDS/OST as appropriate and to properly
2439 * network encode the arg field.
2440 case FSFILT_IOC_SETVERSION_OLD:
2441 case FSFILT_IOC_SETVERSION:
2443 case LL_IOC_FLUSHCTX:
2444 RETURN(ll_flush_ctx(inode));
2445 case LL_IOC_PATH2FID: {
2446 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2447 sizeof(struct lu_fid)))
2452 case OBD_IOC_FID2PATH:
2453 RETURN(ll_fid2path(inode, (void __user *)arg));
2454 case LL_IOC_DATA_VERSION: {
2455 struct ioc_data_version idv;
2458 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
/* Only the flush-mode bits are honoured from user space. */
2461 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2462 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2465 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2471 case LL_IOC_GET_MDTIDX: {
2474 mdtidx = ll_get_mdt_idx(inode);
2478 if (put_user((int)mdtidx, (int __user *)arg))
2483 case OBD_IOC_GETDTNAME:
2484 case OBD_IOC_GETMDNAME:
2485 RETURN(ll_get_obd_name(inode, cmd, arg));
2486 case LL_IOC_HSM_STATE_GET: {
2487 struct md_op_data *op_data;
2488 struct hsm_user_state *hus;
2495 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2496 LUSTRE_OPC_ANY, hus);
2497 if (IS_ERR(op_data)) {
2499 RETURN(PTR_ERR(op_data));
2502 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2505 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2508 ll_finish_md_op_data(op_data);
2512 case LL_IOC_HSM_STATE_SET: {
2513 struct hsm_state_set *hss;
2520 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2525 rc = ll_hsm_state_set(inode, hss);
2530 case LL_IOC_HSM_ACTION: {
2531 struct md_op_data *op_data;
2532 struct hsm_current_action *hca;
2539 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2540 LUSTRE_OPC_ANY, hca);
2541 if (IS_ERR(op_data)) {
2543 RETURN(PTR_ERR(op_data));
2546 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2549 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2552 ll_finish_md_op_data(op_data);
2556 case LL_IOC_SET_LEASE: {
2557 struct ll_inode_info *lli = ll_i2info(inode);
2558 struct obd_client_handle *och = NULL;
/* Requested lease mode must be covered by the file's open mode. */
2563 case LL_LEASE_WRLCK:
2564 if (!(file->f_mode & FMODE_WRITE))
2566 fmode = FMODE_WRITE;
2568 case LL_LEASE_RDLCK:
2569 if (!(file->f_mode & FMODE_READ))
2573 case LL_LEASE_UNLCK:
2574 mutex_lock(&lli->lli_och_mutex);
2575 if (fd->fd_lease_och != NULL) {
2576 och = fd->fd_lease_och;
2577 fd->fd_lease_och = NULL;
2579 mutex_unlock(&lli->lli_och_mutex);
2584 fmode = och->och_flags;
2585 rc = ll_lease_close(och, inode, &lease_broken);
2592 RETURN(ll_lease_type_from_fmode(fmode));
2597 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2599 /* apply for lease */
2600 och = ll_lease_open(inode, file, fmode, 0);
2602 RETURN(PTR_ERR(och));
2605 mutex_lock(&lli->lli_och_mutex);
2606 if (fd->fd_lease_och == NULL) {
2607 fd->fd_lease_och = och;
2610 mutex_unlock(&lli->lli_och_mutex);
2612 /* impossible now that only excl is supported for now */
2613 ll_lease_close(och, inode, &lease_broken);
2618 case LL_IOC_GET_LEASE: {
2619 struct ll_inode_info *lli = ll_i2info(inode);
2620 struct ldlm_lock *lock = NULL;
2623 mutex_lock(&lli->lli_och_mutex);
2624 if (fd->fd_lease_och != NULL) {
2625 struct obd_client_handle *och = fd->fd_lease_och;
2627 lock = ldlm_handle2lock(&och->och_lease_handle);
2629 lock_res_and_lock(lock);
/* A lease being cancelled no longer counts as held. */
2630 if (!ldlm_is_cancel(lock))
2631 fmode = och->och_flags;
2633 unlock_res_and_lock(lock);
2634 LDLM_LOCK_PUT(lock);
2637 mutex_unlock(&lli->lli_och_mutex);
2639 RETURN(ll_lease_type_from_fmode(fmode));
2641 case LL_IOC_HSM_IMPORT: {
2642 struct hsm_user_import *hui;
2648 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
2653 rc = ll_hsm_import(inode, file, hui);
/* Unknown command: try registered llioc handlers, then pass it
 * down to the data stack via obd_iocontrol. */
2663 ll_iocontrol_call(inode, file, cmd, arg, &err))
2666 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2667 (void __user *)arg));
2672 #ifndef HAVE_FILE_LLSEEK_SIZE
/* Clamp @offset to the valid range and commit it to file->f_pos.
 * Local fallback, compiled only when the kernel does not provide
 * generic_file_llseek_size() (see the HAVE_FILE_LLSEEK_SIZE guard above).
 * NOTE(review): this excerpt elides lines (numbering gaps) — the error
 * returns for the two guard conditions are not visible here. */
2673 static inline loff_t
2674 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2676 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2678 if (offset > maxsize)
2681 if (offset != file->f_pos) {
2682 file->f_pos = offset;
2683 file->f_version = 0;
/* Local copy of the VFS generic_file_llseek_size(): seek relative to
 * SEEK_SET/SEEK_CUR/SEEK_END (and SEEK_DATA/SEEK_HOLE, judging by the
 * data/hole comments below) bounded by @maxsize, with @eof supplying the
 * file size. SEEK_CUR updates go through i_mutex to serialise against
 * concurrent seekers. NOTE(review): excerpt elides the switch/case lines
 * between the visible fragments. */
2689 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2690 loff_t maxsize, loff_t eof)
2692 struct inode *inode = file->f_dentry->d_inode;
2700 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2701 * position-querying operation. Avoid rewriting the "same"
2702 * f_pos value back to the file because a concurrent read(),
2703 * write() or lseek() might have altered it
2708 * f_lock protects against read/modify/write race with other
2709 * SEEK_CURs. Note that parallel writes and reads behave
2712 mutex_lock(&inode->i_mutex);
2713 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2714 mutex_unlock(&inode->i_mutex);
2718 * In the generic case the entire file is data, so as long as
2719 * offset isn't at the end of the file then the offset is data.
2726 * There is a virtual hole at the end of the file, so as long as
2727 * offset isn't i_size or larger, return i_size.
2735 return llseek_execute(file, offset, maxsize);
/* llseek entry point (.llseek in the file_operations tables below).
 * For SEEK_END/SEEK_HOLE/SEEK_DATA the client must first glimpse the
 * OST-side size so i_size_read() is current; then defers to
 * ll_generic_file_llseek_size() with the Lustre per-file max offset. */
2739 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2741 struct inode *inode = file->f_dentry->d_inode;
2742 loff_t retval, eof = 0;
2745 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2746 (origin == SEEK_CUR) ? file->f_pos : 0);
2747 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2748 PFID(ll_inode2fid(inode)), inode, retval, retval,
2750 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2752 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2753 retval = ll_glimpse_size(inode);
2756 eof = i_size_read(inode);
2759 retval = ll_generic_file_llseek_size(file, offset, origin,
2760 ll_file_maxbytes(inode), eof);
/* .flush handler: report (once) any asynchronous writeback error that was
 * recorded for this inode. Reads and clears lli_async_rc plus the
 * per-object async rc, but suppresses the error if this fd already saw a
 * write failure (fd_write_failed) so the application is not told twice. */
2764 static int ll_flush(struct file *file, fl_owner_t id)
2766 struct inode *inode = file->f_dentry->d_inode;
2767 struct ll_inode_info *lli = ll_i2info(inode);
2768 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2771 LASSERT(!S_ISDIR(inode->i_mode));
2773 /* catch async errors that were recorded back when async writeback
2774 * failed for pages in this mapping. */
2775 rc = lli->lli_async_rc;
2776 lli->lli_async_rc = 0;
2777 if (lli->lli_clob != NULL) {
2778 err = lov_read_and_clear_async_rc(lli->lli_clob);
2783 /* The application has been told write failure already.
2784 * Do not report failure again. */
2785 if (fd->fd_write_failed)
2787 return rc ? -EIO : 0;
2791 * Called to make sure a portion of file has been written out.
2792 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2794 * Return how many pages have been written.
/* Builds a CIT_FSYNC cl_io for [start, end] in @mode, runs it through the
 * cl_io loop and returns fio->fi_nr_written on success (io->ci_result
 * otherwise — error path elided in this excerpt). An OSS write capability
 * is attached to the fsync descriptor for the RPCs. */
2796 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2797 enum cl_fsync_mode mode, int ignore_layout)
2799 struct cl_env_nest nest;
2802 struct obd_capa *capa = NULL;
2803 struct cl_fsync_io *fio;
2807 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2808 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2811 env = cl_env_nested_get(&nest);
2813 RETURN(PTR_ERR(env));
2815 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2817 io = ccc_env_thread_io(env);
2818 io->ci_obj = cl_i2info(inode)->lli_clob;
2819 io->ci_ignore_layout = ignore_layout;
2821 /* initialize parameters for sync */
2822 fio = &io->u.ci_fsync;
2823 fio->fi_capa = capa;
2824 fio->fi_start = start;
2826 fio->fi_fid = ll_inode2fid(inode);
2827 fio->fi_mode = mode;
2828 fio->fi_nr_written = 0;
2830 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2831 result = cl_io_loop(env, io);
2833 result = io->ci_result;
2835 result = fio->fi_nr_written;
2836 cl_io_fini(env, io);
2837 cl_env_nested_put(&nest, env);
2845 * When dentry is provided (the 'else' case), *file->f_dentry may be
2846 * null and dentry must be used directly rather than pulled from
2847 * *file->f_dentry as is done otherwise.
/* fsync entry point; three kernel-compat prototypes selected by the
 * HAVE_FILE_FSYNC_* macros. Sequence: flush/wait page cache, harvest any
 * recorded async writeback errors, MDC-side md_fsync(), then for regular
 * files an OST-side CL_FSYNC_ALL via cl_sync_file_range(), updating
 * fd_write_failed accordingly. On 4-arg kernels i_mutex is held across
 * the whole operation. */
2850 #ifdef HAVE_FILE_FSYNC_4ARGS
2851 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2853 struct dentry *dentry = file->f_dentry;
2854 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2855 int ll_fsync(struct file *file, int datasync)
2857 struct dentry *dentry = file->f_dentry;
2859 loff_t end = LLONG_MAX;
2861 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2864 loff_t end = LLONG_MAX;
2866 struct inode *inode = dentry->d_inode;
2867 struct ll_inode_info *lli = ll_i2info(inode);
2868 struct ptlrpc_request *req;
2869 struct obd_capa *oc;
2873 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2874 PFID(ll_inode2fid(inode)), inode);
2875 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2877 #ifdef HAVE_FILE_FSYNC_4ARGS
2878 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2879 mutex_lock(&inode->i_mutex);
2881 /* fsync's caller has already called _fdata{sync,write}, we want
2882 * that IO to finish before calling the osc and mdc sync methods */
2883 rc = filemap_fdatawait(inode->i_mapping);
2886 /* catch async errors that were recorded back when async writeback
2887 * failed for pages in this mapping. */
2888 if (!S_ISDIR(inode->i_mode)) {
2889 err = lli->lli_async_rc;
2890 lli->lli_async_rc = 0;
2893 err = lov_read_and_clear_async_rc(lli->lli_clob);
2898 oc = ll_mdscapa_get(inode);
2899 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2905 ptlrpc_req_finished(req);
2907 if (S_ISREG(inode->i_mode)) {
2908 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2910 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2911 if (rc == 0 && err < 0)
2914 fd->fd_write_failed = true;
2916 fd->fd_write_failed = false;
2919 #ifdef HAVE_FILE_FSYNC_4ARGS
2920 mutex_unlock(&inode->i_mutex);
/* .lock/.flock handler: translate a VFS file_lock (POSIX fcntl or BSD
 * flock) into an LDLM_FLOCK enqueue against the MDT, then mirror the
 * result into the local lock tables (posix_lock_file_wait /
 * flock_lock_file_wait) so the VFS bookkeeping matches the cluster state.
 * NOTE(review): several case labels of the fl_type/cmd switches are
 * elided in this excerpt (numbering gaps). */
2926 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2928 struct inode *inode = file->f_dentry->d_inode;
2929 struct ll_sb_info *sbi = ll_i2sbi(inode);
2930 struct ldlm_enqueue_info einfo = {
2931 .ei_type = LDLM_FLOCK,
2932 .ei_cb_cp = ldlm_flock_completion_ast,
2933 .ei_cbdata = file_lock,
2935 struct md_op_data *op_data;
2936 struct lustre_handle lockh = {0};
2937 ldlm_policy_data_t flock = {{0}};
2938 int fl_type = file_lock->fl_type;
2944 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
2945 PFID(ll_inode2fid(inode)), file_lock);
2947 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2949 if (file_lock->fl_flags & FL_FLOCK) {
2950 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2951 /* flocks are whole-file locks */
2952 flock.l_flock.end = OFFSET_MAX;
2953 /* For flocks owner is determined by the local file desctiptor*/
2954 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2955 } else if (file_lock->fl_flags & FL_POSIX) {
2956 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2957 flock.l_flock.start = file_lock->fl_start;
2958 flock.l_flock.end = file_lock->fl_end;
2962 flock.l_flock.pid = file_lock->fl_pid;
2964 /* Somewhat ugly workaround for svc lockd.
2965 * lockd installs custom fl_lmops->lm_compare_owner that checks
2966 * for the fl_owner to be the same (which it always is on local node
2967 * I guess between lockd processes) and then compares pid.
2968 * As such we assign pid to the owner field to make it all work,
2969 * conflict with normal locks is unlikely since pid space and
2970 * pointer space for current->files are not intersecting */
2971 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2972 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
2976 einfo.ei_mode = LCK_PR;
2979 /* An unlock request may or may not have any relation to
2980 * existing locks so we may not be able to pass a lock handle
2981 * via a normal ldlm_lock_cancel() request. The request may even
2982 * unlock a byte range in the middle of an existing lock. In
2983 * order to process an unlock request we need all of the same
2984 * information that is given with a normal read or write record
2985 * lock request. To avoid creating another ldlm unlock (cancel)
2986 * message we'll treat a LCK_NL flock request as an unlock. */
2987 einfo.ei_mode = LCK_NL;
2990 einfo.ei_mode = LCK_PW;
2993 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
3008 flags = LDLM_FL_BLOCK_NOWAIT;
3014 flags = LDLM_FL_TEST_LOCK;
3017 CERROR("unknown fcntl lock command: %d\n", cmd);
3021 /* Save the old mode so that if the mode in the lock changes we
3022 * can decrement the appropriate reader or writer refcount. */
3023 file_lock->fl_type = einfo.ei_mode;
3025 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3026 LUSTRE_OPC_ANY, NULL);
3027 if (IS_ERR(op_data))
3028 RETURN(PTR_ERR(op_data));
3030 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
3031 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
3032 flock.l_flock.pid, flags, einfo.ei_mode,
3033 flock.l_flock.start, flock.l_flock.end);
3035 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data, &lockh,
3038 /* Restore the file lock type if not TEST lock. */
3039 if (!(flags & LDLM_FL_TEST_LOCK))
3040 file_lock->fl_type = fl_type;
3042 if ((file_lock->fl_flags & FL_FLOCK) &&
3043 (rc == 0 || file_lock->fl_type == F_UNLCK))
3044 rc2 = flock_lock_file_wait(file, file_lock);
3045 if ((file_lock->fl_flags & FL_POSIX) &&
3046 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3047 !(flags & LDLM_FL_TEST_LOCK))
3048 rc2 = posix_lock_file_wait(file, file_lock);
/* If the local VFS lock failed after the server granted ours, back the
 * server-side lock out again with an LCK_NL (unlock) enqueue. */
3050 if (rc2 && file_lock->fl_type != F_UNLCK) {
3051 einfo.ei_mode = LCK_NL;
3052 md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data,
3057 ll_finish_md_op_data(op_data);
/* Look up the FID of @name (length @namelen) under @parent via an
 * md_getattr_name RPC requesting only OBD_MD_FLID; on success copies the
 * FID out of the reply body into *fid. The reply request is released with
 * ptlrpc_req_finished(). */
3062 int ll_get_fid_by_name(struct inode *parent, const char *name,
3063 int namelen, struct lu_fid *fid)
3065 struct md_op_data *op_data = NULL;
3066 struct mdt_body *body;
3067 struct ptlrpc_request *req;
3071 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3072 LUSTRE_OPC_ANY, NULL);
3073 if (IS_ERR(op_data))
3074 RETURN(PTR_ERR(op_data));
3076 op_data->op_valid = OBD_MD_FLID;
3077 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3078 ll_finish_md_op_data(op_data);
3082 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3084 GOTO(out_req, rc = -EFAULT);
3086 *fid = body->mbo_fid1;
3088 ptlrpc_req_finished(req);
/* Migrate directory entry @name under @parent to MDT @mdtidx using a
 * CLI_MIGRATE md_rename onto itself. The child FID (op_fid3) is taken
 * from the dcache when a valid alias exists (invalidating the aliases and
 * grabbing the inode), otherwise fetched via ll_get_fid_by_name(). A
 * no-op if the file already lives on the target MDT. On success the
 * cached child inode's nlink is cleared so it is refetched.
 * NOTE(review): dchild dput/cleanup lines are elided in this excerpt. */
3092 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3093 const char *name, int namelen)
3095 struct dentry *dchild = NULL;
3096 struct inode *child_inode = NULL;
3097 struct md_op_data *op_data;
3098 struct ptlrpc_request *request = NULL;
3103 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3104 name, PFID(ll_inode2fid(parent)), mdtidx);
3106 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3107 0, LUSTRE_OPC_ANY, NULL);
3108 if (IS_ERR(op_data))
3109 RETURN(PTR_ERR(op_data));
3111 /* Get child FID first */
3112 qstr.hash = full_name_hash(name, namelen);
3115 dchild = d_lookup(file->f_dentry, &qstr);
3116 if (dchild != NULL && dchild->d_inode != NULL) {
3117 op_data->op_fid3 = *ll_inode2fid(dchild->d_inode);
3118 if (dchild->d_inode != NULL) {
3119 child_inode = igrab(dchild->d_inode);
3120 ll_invalidate_aliases(child_inode);
3124 rc = ll_get_fid_by_name(parent, name, namelen,
3130 if (!fid_is_sane(&op_data->op_fid3)) {
3131 CERROR("%s: migrate %s , but fid "DFID" is insane\n",
3132 ll_get_fsname(parent->i_sb, NULL, 0), name,
3133 PFID(&op_data->op_fid3));
3134 GOTO(out_free, rc = -EINVAL);
3137 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3142 CDEBUG(D_INFO, "%s:"DFID" is already on MDT%d.\n", name,
3143 PFID(&op_data->op_fid3), mdtidx);
3144 GOTO(out_free, rc = 0);
3147 op_data->op_mds = mdtidx;
3148 op_data->op_cli_flags = CLI_MIGRATE;
3149 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3150 namelen, name, namelen, &request);
3152 ll_update_times(request, parent);
3154 ptlrpc_req_finished(request);
3159 if (child_inode != NULL) {
3160 clear_nlink(child_inode);
3164 ll_finish_md_op_data(op_data);
/* Stub lock handler for the "-o noflock" mount option (wired into
 * ll_file_operations_noflock below); body elided in this excerpt —
 * presumably returns -ENOSYS per the table's comment. TODO confirm. */
3169 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3177 * test if some locks matching bits and l_req_mode are acquired
3178 * - bits can be in different locks
3179 * - if found clear the common lock bits in *bits
3180 * - the bits not found, are kept in *bits
3182 * \param bits [IN] searched lock bits [IN]
3183 * \param l_req_mode [IN] searched lock mode
3184 * \retval boolean, true iff all bits are found
/* Probes each requested inodebit individually with a non-consuming
 * LDLM_FL_TEST_LOCK match; LCK_MINMODE means "any of CR|CW|PR|PW". */
3186 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
3188 struct lustre_handle lockh;
3189 ldlm_policy_data_t policy;
3190 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
3191 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
3200 fid = &ll_i2info(inode)->lli_fid;
3201 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3202 ldlm_lockname[mode]);
3204 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
3205 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3206 policy.l_inodebits.bits = *bits & (1 << i);
3207 if (policy.l_inodebits.bits == 0)
3210 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3211 &policy, mode, &lockh)) {
3212 struct ldlm_lock *lock;
3214 lock = ldlm_handle2lock(&lockh);
3217 ~(lock->l_policy_data.l_inodebits.bits);
3218 LDLM_LOCK_PUT(lock);
3220 *bits &= ~policy.l_inodebits.bits;
/* Match (and, unlike ll_have_md_lock, actually take a reference on) an
 * MDC lock covering @bits on @inode; returns the matched mode with the
 * handle in *lockh. */
3227 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
3228 struct lustre_handle *lockh, __u64 flags,
3231 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
3236 fid = &ll_i2info(inode)->lli_fid;
3237 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3239 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3240 fid, LDLM_IBITS, &policy, mode, lockh);
/* Post-process a revalidate RPC result: -ENOENT on an already-unlinked
 * inode is converted to success (nlink update elided in this excerpt);
 * other errors are logged — quietly for expected -EACCES/-EIDRM — and
 * passed through. */
3245 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3247 /* Already unlinked. Just update nlink and return success */
3248 if (rc == -ENOENT) {
3250 /* This path cannot be hit for regular files unless in
3251 * case of obscure races, so no need to to validate
3253 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3255 } else if (rc != 0) {
3256 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3257 "%s: revalidate FID "DFID" error: rc = %d\n",
3258 ll_get_fsname(inode->i_sb, NULL, 0),
3259 PFID(ll_inode2fid(inode)), rc);
/* Refresh a dentry's MDS attributes. Two paths: (a) server supports
 * OBD_CONNECT_ATTRFID — take an IT_GETATTR/IT_LOOKUP intent lock by FID
 * and finish it into the dentry (unhashing it when the inode turned out
 * unlinked); (b) otherwise, if no covering ibits lock is cached locally,
 * do a plain md_getattr (requesting EA size for regular files) and apply
 * the reply with ll_prep_inode(). */
3265 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3267 struct inode *inode = dentry->d_inode;
3268 struct ptlrpc_request *req = NULL;
3269 struct obd_export *exp;
3273 LASSERT(inode != NULL);
3275 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3276 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3278 exp = ll_i2mdexp(inode);
3280 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3281 * But under CMD case, it caused some lock issues, should be fixed
3282 * with new CMD ibits lock. See bug 12718 */
3283 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3284 struct lookup_intent oit = { .it_op = IT_GETATTR };
3285 struct md_op_data *op_data;
3287 if (ibits == MDS_INODELOCK_LOOKUP)
3288 oit.it_op = IT_LOOKUP;
3290 /* Call getattr by fid, so do not provide name at all. */
3291 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3292 dentry->d_inode, NULL, 0, 0,
3293 LUSTRE_OPC_ANY, NULL);
3294 if (IS_ERR(op_data))
3295 RETURN(PTR_ERR(op_data));
3297 rc = md_intent_lock(exp, op_data, &oit, &req,
3298 &ll_md_blocking_ast, 0);
3299 ll_finish_md_op_data(op_data);
3301 rc = ll_inode_revalidate_fini(inode, rc);
3305 rc = ll_revalidate_it_finish(req, &oit, dentry);
3307 ll_intent_release(&oit);
3311 /* Unlinked? Unhash dentry, so it is not picked up later by
3312 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3313 here to preserve get_cwd functionality on 2.6.
3315 if (!dentry->d_inode->i_nlink)
3316 d_lustre_invalidate(dentry, 0);
3318 ll_lookup_finish_locks(&oit, dentry);
3319 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3320 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3321 obd_valid valid = OBD_MD_FLGETATTR;
3322 struct md_op_data *op_data;
3325 if (S_ISREG(inode->i_mode)) {
3326 rc = ll_get_default_mdsize(sbi, &ealen);
3329 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3332 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3333 0, ealen, LUSTRE_OPC_ANY,
3335 if (IS_ERR(op_data))
3336 RETURN(PTR_ERR(op_data));
3338 op_data->op_valid = valid;
3339 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
3340 * capa for this inode. Because we only keep capas of dirs
3342 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3343 ll_finish_md_op_data(op_data);
3345 rc = ll_inode_revalidate_fini(inode, rc);
3349 rc = ll_prep_inode(&inode, req, NULL, NULL);
3352 ptlrpc_req_finished(req);
/* For a striped directory, merge per-stripe attributes from all MDTs via
 * md_merge_attr() and cache size/nlink/times in the ll_inode_info. */
3356 static int ll_merge_md_attr(struct inode *inode)
3358 struct cl_attr attr = { 0 };
3361 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3362 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3367 ll_i2info(inode)->lli_stripe_dir_size = attr.cat_size;
3368 ll_i2info(inode)->lli_stripe_dir_nlink = attr.cat_nlink;
3370 ll_i2info(inode)->lli_lvb.lvb_atime = attr.cat_atime;
3371 ll_i2info(inode)->lli_lvb.lvb_mtime = attr.cat_mtime;
3372 ll_i2info(inode)->lli_lvb.lvb_ctime = attr.cat_ctime;
/* Revalidate MDS attributes, then bring size/times up to date: striped
 * directories get merged stripe attrs; other non-regular inodes copy the
 * cached LVB times; regular files glimpse the OST size unless an HSM
 * restore is in progress (in which case the MDT-provided size is already
 * authoritative and a glimpse would block on the layout lock). */
3378 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3380 struct inode *inode = dentry->d_inode;
3384 rc = __ll_inode_revalidate(dentry, ibits);
3388 /* if object isn't regular file, don't validate size */
3389 if (!S_ISREG(inode->i_mode)) {
3390 if (S_ISDIR(inode->i_mode) &&
3391 ll_i2info(inode)->lli_lsm_md != NULL) {
3392 rc = ll_merge_md_attr(inode);
3397 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
3398 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
3399 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
3401 /* In case of restore, the MDT has the right size and has
3402 * already send it back without granting the layout lock,
3403 * inode is up-to-date so glimpse is useless.
3404 * Also to glimpse we need the layout, in case of a running
3405 * restore the MDT holds the layout lock so the glimpse will
3406 * block up to the end of restore (getattr will block)
3408 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3409 rc = ll_glimpse_size(inode);
/* .getattr: revalidate UPDATE|LOOKUP ibits then fill *stat from the
 * inode. 32-bit-API clients get a squashed FID-derived ino; striped
 * directories report the merged stripe nlink/size cached by
 * ll_merge_md_attr(). */
3414 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3416 struct inode *inode = de->d_inode;
3417 struct ll_sb_info *sbi = ll_i2sbi(inode);
3418 struct ll_inode_info *lli = ll_i2info(inode);
3421 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3422 MDS_INODELOCK_LOOKUP);
3423 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3428 stat->dev = inode->i_sb->s_dev;
3429 if (ll_need_32bit_api(sbi))
3430 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3432 stat->ino = inode->i_ino;
3433 stat->mode = inode->i_mode;
3434 stat->uid = inode->i_uid;
3435 stat->gid = inode->i_gid;
3436 stat->rdev = inode->i_rdev;
3437 stat->atime = inode->i_atime;
3438 stat->mtime = inode->i_mtime;
3439 stat->ctime = inode->i_ctime;
3440 stat->blksize = 1 << inode->i_blkbits;
3441 stat->blocks = inode->i_blocks;
3443 if (S_ISDIR(inode->i_mode) &&
3444 ll_i2info(inode)->lli_lsm_md != NULL) {
3445 stat->nlink = lli->lli_stripe_dir_nlink;
3446 stat->size = lli->lli_stripe_dir_size;
3448 stat->nlink = inode->i_nlink;
3449 stat->size = i_size_read(inode);
/* .fiemap: marshal the kernel fiemap_extent_info into a Lustre
 * ll_user_fiemap (header + fi_extents_max extents), run ll_do_fiemap(),
 * and copy flags/mapped-extent data back. Note only the FIRST extent is
 * copied in (as a search hint); all mapped extents are copied out. */
3455 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3456 __u64 start, __u64 len)
3460 struct ll_user_fiemap *fiemap;
3461 unsigned int extent_count = fieinfo->fi_extents_max;
3463 num_bytes = sizeof(*fiemap) + (extent_count *
3464 sizeof(struct ll_fiemap_extent));
3465 OBD_ALLOC_LARGE(fiemap, num_bytes);
3470 fiemap->fm_flags = fieinfo->fi_flags;
3471 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3472 fiemap->fm_start = start;
3473 fiemap->fm_length = len;
3474 if (extent_count > 0)
3475 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3476 sizeof(struct ll_fiemap_extent));
3478 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3480 fieinfo->fi_flags = fiemap->fm_flags;
3481 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3482 if (extent_count > 0)
3483 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3484 fiemap->fm_mapped_extents *
3485 sizeof(struct ll_fiemap_extent));
3487 OBD_FREE_LARGE(fiemap, num_bytes);
/* Return a referenced copy of the cached POSIX ACL for @inode, taken
 * under lli_lock; the VFS permission path releases the reference. */
3491 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3493 struct ll_inode_info *lli = ll_i2info(inode);
3494 struct posix_acl *acl = NULL;
3497 spin_lock(&lli->lli_lock);
3498 /* VFS' acl_permission_check->check_acl will release the refcount */
3499 acl = posix_acl_dup(lli->lli_posix_acl);
3500 spin_unlock(&lli->lli_lock);
/* check_acl callback for kernels whose generic_permission() takes it as
 * an argument (not compiled when generic_permission has 2 args). With
 * CONFIG_FS_POSIX_ACL it evaluates the cached ACL; under RCU walk
 * (IPERM_FLAG_RCU) it must bail (return line elided in excerpt). Without
 * ACL support the fallback value is likewise elided here. */
3505 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3507 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3508 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3510 ll_check_acl(struct inode *inode, int mask)
3513 # ifdef CONFIG_FS_POSIX_ACL
3514 struct posix_acl *acl;
3518 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3519 if (flags & IPERM_FLAG_RCU)
3522 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3527 rc = posix_acl_permission(inode, acl, mask);
3528 posix_acl_release(acl);
3531 # else /* !CONFIG_FS_POSIX_ACL */
3533 # endif /* CONFIG_FS_POSIX_ACL */
3535 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/* .permission: three compat prototypes. Bails under RCU-walk, forces a
 * revalidate of the root inode (not validated by lookup), applies root
 * squash by overriding fsuid/fsgid and dropping FS capabilities when the
 * caller is root and squashing is configured, then delegates to either
 * the remote-permission check (RMT_CLIENT) or generic_permission. The
 * squashed credentials are reverted afterwards. */
3537 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3538 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3540 # ifdef HAVE_INODE_PERMISION_2ARGS
3541 int ll_inode_permission(struct inode *inode, int mask)
3543 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3548 struct ll_sb_info *sbi;
3549 struct root_squash_info *squash;
3550 struct cred *cred = NULL;
3551 const struct cred *old_cred = NULL;
3553 bool squash_id = false;
3556 #ifdef MAY_NOT_BLOCK
3557 if (mask & MAY_NOT_BLOCK)
3559 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3560 if (flags & IPERM_FLAG_RCU)
3564 /* as root inode are NOT getting validated in lookup operation,
3565 * need to do it before permission check. */
3567 if (inode == inode->i_sb->s_root->d_inode) {
3568 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3569 MDS_INODELOCK_LOOKUP);
3574 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3575 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3577 /* squash fsuid/fsgid if needed */
3578 sbi = ll_i2sbi(inode);
3579 squash = &sbi->ll_squash;
3580 if (unlikely(squash->rsi_uid != 0 &&
3581 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
3582 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3586 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3587 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3588 squash->rsi_uid, squash->rsi_gid);
3590 /* update current process's credentials
3591 * and FS capability */
3592 cred = prepare_creds();
3596 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
3597 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
3598 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3599 if ((1 << cap) & CFS_CAP_FS_MASK)
3600 cap_lower(cred->cap_effective, cap);
3602 old_cred = override_creds(cred);
3605 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3607 if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
3608 rc = lustre_check_remote_perm(inode, mask);
3610 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3612 /* restore current process's credentials and FS capability */
3614 revert_creds(old_cred);
3621 /* -o localflock - only provides locally consistent flock locks */
/* Default file_operations: no .lock/.flock, so the kernel falls back to
 * node-local POSIX/flock semantics (cluster-coherent locking requires
 * the _flock table below). */
3622 struct file_operations ll_file_operations = {
3623 .read = ll_file_read,
3624 .aio_read = ll_file_aio_read,
3625 .write = ll_file_write,
3626 .aio_write = ll_file_aio_write,
3627 .unlocked_ioctl = ll_file_ioctl,
3628 .open = ll_file_open,
3629 .release = ll_file_release,
3630 .mmap = ll_file_mmap,
3631 .llseek = ll_file_seek,
3632 .splice_read = ll_file_splice_read,
/* file_operations for "-o flock": identical to the default table but
 * routes .flock and .lock through ll_file_flock for cluster-wide
 * coherent locking via the MDT. */
3637 struct file_operations ll_file_operations_flock = {
3638 .read = ll_file_read,
3639 .aio_read = ll_file_aio_read,
3640 .write = ll_file_write,
3641 .aio_write = ll_file_aio_write,
3642 .unlocked_ioctl = ll_file_ioctl,
3643 .open = ll_file_open,
3644 .release = ll_file_release,
3645 .mmap = ll_file_mmap,
3646 .llseek = ll_file_seek,
3647 .splice_read = ll_file_splice_read,
3650 .flock = ll_file_flock,
3651 .lock = ll_file_flock
3654 /* These are for -o noflock - to return ENOSYS on flock calls */
3655 struct file_operations ll_file_operations_noflock = {
3656 .read = ll_file_read,
3657 .aio_read = ll_file_aio_read,
3658 .write = ll_file_write,
3659 .aio_write = ll_file_aio_write,
3660 .unlocked_ioctl = ll_file_ioctl,
3661 .open = ll_file_open,
3662 .release = ll_file_release,
3663 .mmap = ll_file_mmap,
3664 .llseek = ll_file_seek,
3665 .splice_read = ll_file_splice_read,
3668 .flock = ll_file_noflock,
3669 .lock = ll_file_noflock
/* inode_operations for regular files; .get_acl is only wired up on
 * kernels that have it in the inode_operations table. */
3672 struct inode_operations ll_file_inode_operations = {
3673 .setattr = ll_setattr,
3674 .getattr = ll_getattr,
3675 .permission = ll_inode_permission,
3676 .setxattr = ll_setxattr,
3677 .getxattr = ll_getxattr,
3678 .listxattr = ll_listxattr,
3679 .removexattr = ll_removexattr,
3680 .fiemap = ll_fiemap,
3681 #ifdef HAVE_IOP_GET_ACL
3682 .get_acl = ll_get_acl,
3686 /* dynamic ioctl number support routins */
/* Registry of dynamically registered ioctl handlers: an rwsem-protected
 * list of llioc_data entries, each carrying a callback plus the array of
 * ioctl command numbers it handles (flexible trailing array iocd_cmd). */
3687 static struct llioc_ctl_data {
3688 struct rw_semaphore ioc_sem;
3689 struct list_head ioc_head;
3691 __RWSEM_INITIALIZER(llioc.ioc_sem),
3692 LIST_HEAD_INIT(llioc.ioc_head)
3697 struct list_head iocd_list;
3698 unsigned int iocd_size;
3699 llioc_callback_t iocd_cb;
3700 unsigned int iocd_count;
3701 unsigned int iocd_cmd[0];
/* Register callback @cb for @count ioctl commands in @cmd. Allocates an
 * llioc_data (with trailing command array), links it into the registry
 * under the write rwsem, and returns the entry pointer as an opaque
 * "magic" cookie for ll_iocontrol_unregister(). Returns NULL on bad
 * arguments or allocation failure (return lines elided in excerpt). */
3704 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3707 struct llioc_data *in_data = NULL;
3710 if (cb == NULL || cmd == NULL ||
3711 count > LLIOC_MAX_CMD || count < 0)
3714 size = sizeof(*in_data) + count * sizeof(unsigned int);
3715 OBD_ALLOC(in_data, size);
3716 if (in_data == NULL)
3719 memset(in_data, 0, sizeof(*in_data));
3720 in_data->iocd_size = size;
3721 in_data->iocd_cb = cb;
3722 in_data->iocd_count = count;
3723 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3725 down_write(&llioc.ioc_sem);
3726 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3727 up_write(&llioc.ioc_sem);
/* Remove and free the registration identified by @magic (the cookie
 * returned by ll_iocontrol_register); warns if not found. The rwsem is
 * dropped before OBD_FREE on the found path. */
3732 void ll_iocontrol_unregister(void *magic)
3734 struct llioc_data *tmp;
3739 down_write(&llioc.ioc_sem);
3740 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3742 unsigned int size = tmp->iocd_size;
3744 list_del(&tmp->iocd_list);
3745 up_write(&llioc.ioc_sem);
3747 OBD_FREE(tmp, size);
3751 up_write(&llioc.ioc_sem);
3753 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3756 EXPORT_SYMBOL(ll_iocontrol_register);
3757 EXPORT_SYMBOL(ll_iocontrol_unregister);
/* Dispatch ioctl @cmd to the first registered handler that claims it;
 * iteration stops when a callback returns LLIOC_STOP. The handler's rc
 * is passed back through *rcp (assignment line elided in excerpt). */
3759 static enum llioc_iter
3760 ll_iocontrol_call(struct inode *inode, struct file *file,
3761 unsigned int cmd, unsigned long arg, int *rcp)
3763 enum llioc_iter ret = LLIOC_CONT;
3764 struct llioc_data *data;
3765 int rc = -EINVAL, i;
3767 down_read(&llioc.ioc_sem);
3768 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3769 for (i = 0; i < data->iocd_count; i++) {
3770 if (cmd != data->iocd_cmd[i])
3773 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3777 if (ret == LLIOC_STOP)
3780 up_read(&llioc.ioc_sem);
/* Push a layout configuration into the cl_object via cl_conf_set(). For
 * OBJECT_CONF_SET with a successful result, allow the layout lock to be
 * matched (only after the layout is applied, so no stale layout is seen),
 * record whether the file has striping objects, and bump the cached
 * layout generation. */
3787 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3789 struct ll_inode_info *lli = ll_i2info(inode);
3790 struct cl_env_nest nest;
3795 if (lli->lli_clob == NULL)
3798 env = cl_env_nested_get(&nest);
3800 RETURN(PTR_ERR(env));
3802 result = cl_conf_set(env, lli->lli_clob, conf);
3803 cl_env_nested_put(&nest, env);
3805 if (conf->coc_opc == OBJECT_CONF_SET) {
3806 struct ldlm_lock *lock = conf->coc_lock;
3808 LASSERT(lock != NULL);
3809 LASSERT(ldlm_has_layout(lock));
3811 struct lustre_md *md = conf->u.coc_md;
3812 __u32 gen = LL_LAYOUT_GEN_EMPTY;
3814 /* it can only be allowed to match after layout is
3815 * applied to inode otherwise false layout would be
3816 * seen. Applying layout shoud happen before dropping
3817 * the intent lock. */
3818 ldlm_lock_allow_match(lock);
3820 lli->lli_has_smd = lsm_has_objects(md->lsm);
3821 if (md->lsm != NULL)
3822 gen = md->lsm->lsm_layout_gen;
3825 DFID ": layout version change: %u -> %u\n",
3826 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3828 ll_layout_version_set(lli, gen);
3834 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/* No-op when the lock already carries a ready LVB. Otherwise fetch the
 * LOV EA by getxattr (the completion-AST LVB buffer may be too small),
 * copy it into a freshly allocated buffer, and install it as the lock's
 * l_lvb_data/l_lvb_len under the resource lock, freeing any previous
 * buffer. */
3835 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3838 struct ll_sb_info *sbi = ll_i2sbi(inode);
3839 struct obd_capa *oc;
3840 struct ptlrpc_request *req;
3841 struct mdt_body *body;
3848 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3849 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
3850 lock->l_lvb_data, lock->l_lvb_len);
3852 if ((lock->l_lvb_data != NULL) && ldlm_is_lvb_ready(lock))
3855 /* if layout lock was granted right away, the layout is returned
3856 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3857 * blocked and then granted via completion ast, we have to fetch
3858 * layout here. Please note that we can't use the LVB buffer in
3859 * completion AST because it doesn't have a large enough buffer */
3860 oc = ll_mdscapa_get(inode);
3861 rc = ll_get_default_mdsize(sbi, &lmmsize);
3863 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3864 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3870 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3872 GOTO(out, rc = -EPROTO);
3874 lmmsize = body->mbo_eadatasize;
3875 if (lmmsize == 0) /* empty layout */
3878 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3880 GOTO(out, rc = -EFAULT);
3882 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3883 if (lvbdata == NULL)
3884 GOTO(out, rc = -ENOMEM);
3886 memcpy(lvbdata, lmm, lmmsize);
3887 lock_res_and_lock(lock);
3888 if (lock->l_lvb_data != NULL)
3889 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3891 lock->l_lvb_data = lvbdata;
3892 lock->l_lvb_len = lmmsize;
3893 unlock_res_and_lock(lock);
3898 ptlrpc_req_finished(req);
3903 * Apply the layout to the inode. Layout lock is held and will be released
/* Takes the granted layout lock in @lockh/@mode, reinstates the lock's
 * inode association, fetches the LVB if needed, unpacks the striping MD
 * and applies it to the cl_object via ll_layout_conf(), returning the new
 * layout generation in *gen. If applying fails with -EBUSY (layout still
 * in use by IO), it drops the lock and issues an OBJECT_CONF_WAIT to
 * block until in-flight IO against the old layout drains. The lock ref
 * and the dlm ref are both released before returning. */
3906 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3907 struct inode *inode, __u32 *gen, bool reconf)
3909 struct ll_inode_info *lli = ll_i2info(inode);
3910 struct ll_sb_info *sbi = ll_i2sbi(inode);
3911 struct ldlm_lock *lock;
3912 struct lustre_md md = { NULL };
3913 struct cl_object_conf conf;
3916 bool wait_layout = false;
3919 LASSERT(lustre_handle_is_used(lockh));
3921 lock = ldlm_handle2lock(lockh);
3922 LASSERT(lock != NULL);
3923 LASSERT(ldlm_has_layout(lock));
3925 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured: %d",
3926 PFID(&lli->lli_fid), inode, reconf);
3928 /* in case this is a caching lock and reinstate with new inode */
3929 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3931 lock_res_and_lock(lock);
3932 lvb_ready = ldlm_is_lvb_ready(lock);
3933 unlock_res_and_lock(lock);
3934 /* checking lvb_ready is racy but this is okay. The worst case is
3935 * that multi processes may configure the file on the same time. */
3937 if (lvb_ready || !reconf) {
3940 /* layout_gen must be valid if layout lock is not
3941 * cancelled and stripe has already set */
3942 *gen = ll_layout_version_get(lli);
3948 rc = ll_layout_fetch(inode, lock);
3952 /* for layout lock, lmm is returned in lock's lvb.
3953 * lvb_data is immutable if the lock is held so it's safe to access it
3954 * without res lock. See the description in ldlm_lock_decref_internal()
3955 * for the condition to free lvb_data of layout lock */
3956 if (lock->l_lvb_data != NULL) {
3957 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3958 lock->l_lvb_data, lock->l_lvb_len);
3960 *gen = LL_LAYOUT_GEN_EMPTY;
3962 *gen = md.lsm->lsm_layout_gen;
3965 CERROR("%s: file "DFID" unpackmd error: %d\n",
3966 ll_get_fsname(inode->i_sb, NULL, 0),
3967 PFID(&lli->lli_fid), rc);
3973 /* set layout to file. Unlikely this will fail as old layout was
3974 * surely eliminated */
3975 memset(&conf, 0, sizeof conf);
3976 conf.coc_opc = OBJECT_CONF_SET;
3977 conf.coc_inode = inode;
3978 conf.coc_lock = lock;
3979 conf.u.coc_md = &md;
3980 rc = ll_layout_conf(inode, &conf);
3983 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3985 /* refresh layout failed, need to wait */
3986 wait_layout = rc == -EBUSY;
3990 LDLM_LOCK_PUT(lock);
3991 ldlm_lock_decref(lockh, mode);
3993 /* wait for IO to complete if it's still being used. */
3995 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
3996 ll_get_fsname(inode->i_sb, NULL, 0),
3997 PFID(&lli->lli_fid), inode);
3999 memset(&conf, 0, sizeof conf);
4000 conf.coc_opc = OBJECT_CONF_WAIT;
4001 conf.coc_inode = inode;
4002 rc = ll_layout_conf(inode, &conf);
4006 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4007 ll_get_fsname(inode->i_sb, NULL, 0),
4008 PFID(&lli->lli_fid), rc);
/*
 * NOTE(review): several source lines are elided in this listing (the
 * goto targets after the cached-lock hit, einfo's remaining fields and
 * the final return are not visible); comments below only cover the
 * visible code.
 */
4014 * This function checks if there exists a LAYOUT lock on the client side,
4015 * or enqueues it if it doesn't have one in cache.
4017 * This function will not hold layout lock so it may be revoked any time after
4018 * this function returns. Any operations depend on layout should be redone
4021 * This function should be called before lov_io_init() to get an uptodate
4022 * layout version, the caller should save the version number and after IO
4023 * is finished, this function should be called again to verify that layout
4024 * is not changed during IO time.
4026 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4028 struct ll_inode_info *lli = ll_i2info(inode);
4029 struct ll_sb_info *sbi = ll_i2sbi(inode);
4030 struct md_op_data *op_data;
4031 struct lookup_intent it;
4032 struct lustre_handle lockh;
/* Enqueue an inode-bits (IBITS) lock on the MDT, reusing the standard
 * metadata blocking/completion ASTs. */
4034 struct ldlm_enqueue_info einfo = {
4035 .ei_type = LDLM_IBITS,
4037 .ei_cb_bl = &ll_md_blocking_ast,
4038 .ei_cb_cp = &ldlm_completion_ast,
/* Fast path: bail out when layout locks are disabled on this mount, or
 * a layout generation other than LL_LAYOUT_GEN_NONE is already cached. */
4043 *gen = ll_layout_version_get(lli);
4044 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
4048 LASSERT(fid_is_sane(ll_inode2fid(inode)));
4049 LASSERT(S_ISREG(inode->i_mode));
4051 /* take layout lock mutex to enqueue layout lock exclusively. */
4052 mutex_lock(&lli->lli_layout_mutex);
4055 /* mostly layout lock is caching on the local side, so try to match
4056 * it before grabbing layout lock mutex. */
4057 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4058 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4059 if (mode != 0) { /* hit cached lock */
4060 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
4064 mutex_unlock(&lli->lli_layout_mutex);
4068 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4069 0, 0, LUSTRE_OPC_ANY, NULL);
4070 if (IS_ERR(op_data)) {
/* Failed to build the MD op; drop the mutex taken above and return. */
4071 mutex_unlock(&lli->lli_layout_mutex);
4072 RETURN(PTR_ERR(op_data));
4075 /* have to enqueue one */
4076 memset(&it, 0, sizeof(it));
4077 it.it_op = IT_LAYOUT;
4078 lockh.cookie = 0ULL;
4080 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4081 ll_get_fsname(inode->i_sb, NULL, 0),
4082 PFID(&lli->lli_fid), inode);
4084 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, &it, op_data, &lockh, 0);
/* The enqueue RPC reply is no longer needed; release the request. */
4085 if (it.d.lustre.it_data != NULL)
4086 ptlrpc_req_finished(it.d.lustre.it_data);
4087 it.d.lustre.it_data = NULL;
4089 ll_finish_md_op_data(op_data);
/* Take ownership of the granted lock out of the intent: clearing
 * it_lock_mode before ll_intent_drop_lock() keeps the lock referenced
 * via @lockh for ll_layout_lock_set() below. */
4091 mode = it.d.lustre.it_lock_mode;
4092 it.d.lustre.it_lock_mode = 0;
4093 ll_intent_drop_lock(&it);
4096 /* set lock data in case this is a new lock */
4097 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4098 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
4102 mutex_unlock(&lli->lli_layout_mutex);
4108 * This function send a restore request to the MDT
4110 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4112 struct hsm_user_request *hur;
4116 len = sizeof(struct hsm_user_request) +
4117 sizeof(struct hsm_user_item);
4118 OBD_ALLOC(hur, len);
4122 hur->hur_request.hr_action = HUA_RESTORE;
4123 hur->hur_request.hr_archive_id = 0;
4124 hur->hur_request.hr_flags = 0;
4125 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4126 sizeof(hur->hur_user_item[0].hui_fid));
4127 hur->hur_user_item[0].hui_extent.offset = offset;
4128 hur->hur_user_item[0].hui_extent.length = length;
4129 hur->hur_request.hr_itemcount = 1;
4130 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,