4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2013, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include <linux/sched.h>
49 #include "llite_internal.h"
50 #include <lustre/ll_fiemap.h>
51 #include <lustre_ioctl.h>
53 #include "cl_object.h"
56 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
58 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
61 static enum llioc_iter
62 ll_iocontrol_call(struct inode *inode, struct file *file,
63 unsigned int cmd, unsigned long arg, int *rcp);
/* Allocate and initialize a per-open ll_file_data from the slab cache.
 * GFP_NOFS avoids recursing into the filesystem during memory reclaim.
 * NOTE(review): this extraction has elided lines (no NULL check or
 * RETURN visible) -- verify against the full source. */
65 static struct ll_file_data *ll_file_data_get(void)
67 struct ll_file_data *fd;
69 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
73 fd->fd_write_failed = false;
/* Release a ll_file_data back to the slab cache; pairs with
 * ll_file_data_get(). */
78 static void ll_file_data_put(struct ll_file_data *fd)
81 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/* Pack the inode's current attributes (mode, times, size, blocks,
 * flags, ioepoch) and the open handle @fh into @op_data for an MDS
 * request. Also takes an MDS capability reference (op_capa1) and sets
 * MDS_DATA_MODIFIED when the inode carries LLIF_DATA_MODIFIED. */
84 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
85 struct lustre_handle *fh)
87 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
88 op_data->op_attr.ia_mode = inode->i_mode;
89 op_data->op_attr.ia_atime = inode->i_atime;
90 op_data->op_attr.ia_mtime = inode->i_mtime;
91 op_data->op_attr.ia_ctime = inode->i_ctime;
92 op_data->op_attr.ia_size = i_size_read(inode);
93 op_data->op_attr_blocks = inode->i_blocks;
/* iattr has no ia_attr_flags field upstream; Lustre overlays its own
 * struct ll_iattr to carry the ext-style flags. */
94 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
95 ll_inode_to_ext_flags(inode->i_flags);
96 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
98 op_data->op_handle = *fh;
99 op_data->op_capa1 = ll_mdscapa_get(inode);
101 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
102 op_data->op_bias |= MDS_DATA_MODIFIED;
106 * Closes the IO epoch and packs all the attributes into @op_data for
/* the CLOSE rpc. Size/blocks are only sent when SOM (size-on-MDS) is
 * not in effect or the file is not regular.
 * NOTE(review): elided lines in this extraction -- verify control flow
 * (the FMODE_WRITE early-out target is not visible). */
109 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
110 struct obd_client_handle *och)
114 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
115 ATTR_MTIME | ATTR_MTIME_SET |
116 ATTR_CTIME | ATTR_CTIME_SET;
118 if (!(och->och_flags & FMODE_WRITE))
121 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
122 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* May toggle MF_EPOCH_CLOSE in op_data->op_flags; checked by caller. */
124 ll_ioepoch_close(inode, op_data, &och, 0);
127 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
128 ll_prep_md_op_data(op_data, inode, NULL, NULL,
129 0, 0, LUSTRE_OPC_ANY, NULL);
/* Send a CLOSE rpc to the MDS for open handle @och. If @data_version
 * is non-NULL this is an HSM release close (MDS_HSM_RELEASE). Performs
 * Size-on-MDS update when the MDS requests it, clears the local
 * LLIF_DATA_MODIFIED flag on success, and destroys orphan OST objects.
 * Frees @och (handle poisoned with DEAD_HANDLE_MAGIC) unless it is
 * queued for DONE_WRITING.
 * NOTE(review): many lines are elided in this extraction (error paths,
 * labels, RETURNs) -- verify against the full source. */
133 static int ll_close_inode_openhandle(struct obd_export *md_exp,
135 struct obd_client_handle *och,
136 const __u64 *data_version)
138 struct obd_export *exp = ll_i2mdexp(inode);
139 struct md_op_data *op_data;
140 struct ptlrpc_request *req = NULL;
141 struct obd_device *obd = class_exp2obd(exp);
148 * XXX: in case of LMV, is this correct to access
151 CERROR("Invalid MDC connection handle "LPX64"\n",
152 ll_i2mdexp(inode)->exp_handle.h_cookie);
156 OBD_ALLOC_PTR(op_data);
158 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
160 ll_prepare_close(inode, op_data, och);
161 if (data_version != NULL) {
162 /* Pass in data_version implies release. */
163 op_data->op_bias |= MDS_HSM_RELEASE;
164 op_data->op_data_version = *data_version;
165 op_data->op_lease_handle = och->och_lease_handle;
166 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
168 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
169 rc = md_close(md_exp, op_data, och->och_mod, &req);
171 /* This close must have the epoch closed. */
172 LASSERT(epoch_close);
173 /* MDS has instructed us to obtain Size-on-MDS attribute from
174 * OSTs and send setattr to back to MDS. */
175 rc = ll_som_update(inode, op_data);
177 CERROR("%s: inode "DFID" mdc Size-on-MDS update"
178 " failed: rc = %d\n",
179 ll_i2mdexp(inode)->exp_obd->obd_name,
180 PFID(ll_inode2fid(inode)), rc);
184 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
185 ll_i2mdexp(inode)->exp_obd->obd_name,
186 PFID(ll_inode2fid(inode)), rc);
189 /* DATA_MODIFIED flag was successfully sent on close, cancel data
190 * modification flag. */
191 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
192 struct ll_inode_info *lli = ll_i2info(inode);
194 spin_lock(&lli->lli_lock);
195 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
196 spin_unlock(&lli->lli_lock);
200 rc = ll_objects_destroy(req, inode);
202 CERROR("%s: inode "DFID
203 " ll_objects destroy: rc = %d\n",
204 ll_i2mdexp(inode)->exp_obd->obd_name,
205 PFID(ll_inode2fid(inode)), rc);
208 if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
209 struct mdt_body *body;
210 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
/* Server did not actually release the file -- presumably maps to an
 * error return; the handling lines are elided here. */
211 if (!(body->mbo_valid & OBD_MD_FLRELEASED))
215 ll_finish_md_op_data(op_data);
/* SOM write close that did not close the epoch: defer attribute
 * update via the DONE_WRITING queue. */
219 if (exp_connect_som(exp) && !epoch_close &&
220 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
221 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
223 md_clear_open_replay_data(md_exp, och);
224 /* Free @och if it is not waiting for DONE_WRITING. */
225 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
228 if (req) /* This is close request */
229 ptlrpc_req_finished(req);
/* Close the MDS open handle of the given mode (write/exec/read) for
 * @inode, but only when no other local opens still use it
 * (och_usecount reaches zero under lli_och_mutex).
 * NOTE(review): the handle-swap and RETURN lines are elided in this
 * extraction -- verify the race comment at L149 against full source. */
233 int ll_md_real_close(struct inode *inode, fmode_t fmode)
235 struct ll_inode_info *lli = ll_i2info(inode);
236 struct obd_client_handle **och_p;
237 struct obd_client_handle *och;
/* Select the per-mode handle pointer and its use count. */
242 if (fmode & FMODE_WRITE) {
243 och_p = &lli->lli_mds_write_och;
244 och_usecount = &lli->lli_open_fd_write_count;
245 } else if (fmode & FMODE_EXEC) {
246 och_p = &lli->lli_mds_exec_och;
247 och_usecount = &lli->lli_open_fd_exec_count;
249 LASSERT(fmode & FMODE_READ);
250 och_p = &lli->lli_mds_read_och;
251 och_usecount = &lli->lli_open_fd_read_count;
254 mutex_lock(&lli->lli_och_mutex);
255 if (*och_usecount > 0) {
256 /* There are still users of this handle, so skip
258 mutex_unlock(&lli->lli_och_mutex);
264 mutex_unlock(&lli->lli_och_mutex);
267 /* There might be a race and this handle may already
269 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/* Per-struct-file close: drops group lock and lease if held, closes a
 * private fd_och handle, decrements the per-mode open counters, and --
 * unless a cached OPEN DLM lock lets us skip it -- performs the real
 * MDS close. Finally detaches and frees the ll_file_data.
 * NOTE(review): elided lines in this extraction (lockmode setup, lock
 * match arguments, RETURN) -- verify against the full source. */
276 static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
279 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
280 struct ll_inode_info *lli = ll_i2info(inode);
284 /* clear group lock, if present */
285 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
286 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
288 if (fd->fd_lease_och != NULL) {
291 /* Usually the lease is not released when the
292 * application crashed, we need to release here. */
293 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
294 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
295 PFID(&lli->lli_fid), rc, lease_broken);
297 fd->fd_lease_och = NULL;
300 if (fd->fd_och != NULL) {
301 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
306 /* Let's see if we have good enough OPEN lock on the file and if
307 we can skip talking to MDS */
308 if (file->f_dentry->d_inode) { /* Can this ever be false? */
310 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
311 struct lustre_handle lockh;
312 struct inode *inode = file->f_dentry->d_inode;
313 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
315 mutex_lock(&lli->lli_och_mutex);
316 if (fd->fd_omode & FMODE_WRITE) {
318 LASSERT(lli->lli_open_fd_write_count);
319 lli->lli_open_fd_write_count--;
320 } else if (fd->fd_omode & FMODE_EXEC) {
322 LASSERT(lli->lli_open_fd_exec_count);
323 lli->lli_open_fd_exec_count--;
326 LASSERT(lli->lli_open_fd_read_count);
327 lli->lli_open_fd_read_count--;
329 mutex_unlock(&lli->lli_och_mutex);
/* No matching OPEN lock cached -> must do the real MDS close. */
331 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
332 LDLM_IBITS, &policy, lockmode,
334 rc = ll_md_real_close(file->f_dentry->d_inode,
338 CERROR("Releasing a file %p with negative dentry %p. Name %s",
339 file, file->f_dentry, file->f_dentry->d_name.name);
343 LUSTRE_FPRIVATE(file) = NULL;
344 ll_file_data_put(fd);
345 ll_capa_close(inode);
350 /* While this returns an error code, fput() the caller does not, so we need
351 * to make every effort to clean up all of our state here. Also, applications
352 * rarely check close errors and even if an error is returned they will not
353 * re-try the close call.
355 int ll_file_release(struct inode *inode, struct file *file)
357 struct ll_file_data *fd;
358 struct ll_sb_info *sbi = ll_i2sbi(inode);
359 struct ll_inode_info *lli = ll_i2info(inode);
363 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
364 PFID(ll_inode2fid(inode)), inode);
/* Remote-client ACL bookkeeping only applies when releasing the fs
 * root on an RMT_CLIENT mount. */
366 #ifdef CONFIG_FS_POSIX_ACL
367 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
368 inode == inode->i_sb->s_root->d_inode) {
369 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
372 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
373 fd->fd_flags &= ~LL_FILE_RMTACL;
374 rct_del(&sbi->ll_rct, current_pid());
375 et_search_free(&sbi->ll_et, current_pid());
380 if (inode->i_sb->s_root != file->f_dentry)
381 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
382 fd = LUSTRE_FPRIVATE(file);
385 /* The last ref on @file, maybe not the the owner pid of statahead.
386 * Different processes can open the same dir, "ll_opendir_key" means:
387 * it is me that should stop the statahead thread. */
388 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
389 lli->lli_opendir_pid != 0)
390 ll_stop_statahead(inode, lli->lli_opendir_key);
/* Root dentry was never really "opened" on the MDS; just drop fd. */
392 if (inode->i_sb->s_root == file->f_dentry) {
393 LUSTRE_FPRIVATE(file) = NULL;
394 ll_file_data_put(fd);
398 if (!S_ISDIR(inode->i_mode)) {
399 if (lli->lli_clob != NULL)
400 lov_read_and_clear_async_rc(lli->lli_clob);
401 lli->lli_async_rc = 0;
404 rc = ll_md_close(sbi->ll_md_exp, inode, file);
406 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
407 libcfs_debug_dumplog();
/* Send an IT_OPEN intent lock request to the MDS for @file. The name is
 * only packed when the server lacks OBD_CONNECT_OPEN_BY_FID and the
 * dentry name is valid; NFS-export opens use the saved parent fid.
 * On success updates the inode from the reply and attaches lock data.
 * NOTE(review): elided lines in this extraction (rc declaration, error
 * branches, RETURN) -- verify against the full source. */
412 static int ll_intent_file_open(struct file *file, void *lmm, int lmmsize,
413 struct lookup_intent *itp)
415 struct dentry *de = file->f_dentry;
416 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
417 struct dentry *parent = de->d_parent;
418 const char *name = NULL;
420 struct md_op_data *op_data;
421 struct ptlrpc_request *req = NULL;
425 LASSERT(parent != NULL);
426 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
428 /* if server supports open-by-fid, or file name is invalid, don't pack
429 * name in open request */
430 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
431 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
432 name = de->d_name.name;
433 len = de->d_name.len;
436 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
437 name, len, 0, LUSTRE_OPC_ANY, NULL);
439 RETURN(PTR_ERR(op_data));
440 op_data->op_data = lmm;
441 op_data->op_data_size = lmmsize;
445 * Fixup for NFS export open.
447 * We're called in the context of NFS export, and parent
448 * unknown, use parent fid saved in lli_pfid which will
449 * be used by MDS to create data.
451 struct ll_inode_info *lli = ll_i2info(de->d_inode);
453 spin_lock(&lli->lli_lock);
454 op_data->op_fid1 = lli->lli_pfid;
455 spin_unlock(&lli->lli_lock);
457 LASSERT(fid_is_sane(&op_data->op_fid1));
458 /** We ignore parent's capability temporary. */
459 if (op_data->op_capa1 != NULL) {
460 capa_put(op_data->op_capa1);
461 op_data->op_capa1 = NULL;
465 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
466 &ll_md_blocking_ast, 0);
467 ll_finish_md_op_data(op_data);
469 /* reason for keep own exit path - don`t flood log
470 * with messages with -ESTALE errors.
472 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
473 it_open_error(DISP_OPEN_OPEN, itp))
475 ll_release_openhandle(de, itp);
479 if (it_disposition(itp, DISP_LOOKUP_NEG))
480 GOTO(out, rc = -ENOENT);
482 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
483 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
484 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
488 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
489 if (!rc && itp->d.lustre.it_lock_mode)
490 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
493 ptlrpc_req_finished(req);
494 ll_intent_drop_lock(itp);
500 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
501 * not believe attributes if a few ioepoch holders exist. Attributes for
502 * previous ioepoch if new one is opened are also skipped by MDS.
504 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
/* Only record a non-zero epoch that differs from the current one. */
506 if (ioepoch && lli->lli_ioepoch != ioepoch) {
507 lli->lli_ioepoch = ioepoch;
508 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
509 ioepoch, PFID(&lli->lli_fid));
/* Fill an obd_client_handle from the MDT reply body carried by the
 * intent: open handle, fid, lease lock cookie, magic and open flags.
 * Then registers the handle for open replay on @md_exp. */
513 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
514 struct obd_client_handle *och)
516 struct ptlrpc_request *req = it->d.lustre.it_data;
517 struct mdt_body *body;
519 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
520 och->och_fh = body->mbo_handle;
521 och->och_fid = body->mbo_fid1;
522 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
523 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
524 och->och_flags = it->it_flags;
526 return md_set_open_replay_data(md_exp, och, it);
/* Complete the client-side part of an open: fill @och from the intent
 * reply (when present), record the ioepoch, attach @fd as the file's
 * private data and initialize readahead and cl_io context state.
 * NOTE(review): elided lines in this extraction (och NULL branch, error
 * handling, RETURN) -- verify against the full source. */
529 static int ll_local_open(struct file *file, struct lookup_intent *it,
530 struct ll_file_data *fd, struct obd_client_handle *och)
532 struct inode *inode = file->f_dentry->d_inode;
533 struct ll_inode_info *lli = ll_i2info(inode);
536 LASSERT(!LUSTRE_FPRIVATE(file));
541 struct ptlrpc_request *req = it->d.lustre.it_data;
542 struct mdt_body *body;
545 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
549 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
550 ll_ioepoch_open(lli, body->mbo_ioepoch);
553 LUSTRE_FPRIVATE(file) = fd;
554 ll_readahead_init(inode, &fd->fd_ras);
555 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
557 /* ll_cl_context initialize */
558 rwlock_init(&fd->fd_lock);
559 INIT_LIST_HEAD(&fd->fd_lccs);
564 /* Open a file, and (for the very first open) create objects on the OSTs at
565 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
566 * creation or open until ll_lov_setstripe() ioctl is called.
568 * If we already have the stripe MD locally then we don't request it in
569 * md_open(), by passing a lmm_size = 0.
571 * It is up to the application to ensure no other processes open this file
572 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
573 * used. We might be able to avoid races of that sort by getting lli_open_sem
574 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
575 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
577 int ll_file_open(struct inode *inode, struct file *file)
579 struct ll_inode_info *lli = ll_i2info(inode);
580 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
581 .it_flags = file->f_flags };
582 struct obd_client_handle **och_p = NULL;
583 __u64 *och_usecount = NULL;
584 struct ll_file_data *fd;
585 int rc = 0, opendir_set = 0;
588 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
589 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* An intent may have been stashed on the file by the lookup path. */
591 it = file->private_data; /* XXX: compat macro */
592 file->private_data = NULL; /* prevent ll_local_open assertion */
594 fd = ll_file_data_get();
596 GOTO(out_openerr, rc = -ENOMEM);
/* Directory open: first opener of this dir becomes the statahead
 * owner (lli_opendir_key identifies who must stop the thread). */
599 if (S_ISDIR(inode->i_mode)) {
600 spin_lock(&lli->lli_sa_lock);
601 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
602 lli->lli_opendir_pid == 0) {
603 lli->lli_opendir_key = fd;
604 lli->lli_opendir_pid = current_pid();
607 spin_unlock(&lli->lli_sa_lock);
610 if (inode->i_sb->s_root == file->f_dentry) {
611 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent: build one (oit) from f_flags. */
615 if (!it || !it->d.lustre.it_disposition) {
616 /* Convert f_flags into access mode. We cannot use file->f_mode,
617 * because everything but O_ACCMODE mask was stripped from
619 if ((oit.it_flags + 1) & O_ACCMODE)
621 if (file->f_flags & O_TRUNC)
622 oit.it_flags |= FMODE_WRITE;
624 /* kernel only call f_op->open in dentry_open. filp_open calls
625 * dentry_open after call to open_namei that checks permissions.
626 * Only nfsd_open call dentry_open directly without checking
627 * permissions and because of that this code below is safe. */
628 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
629 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
631 /* We do not want O_EXCL here, presumably we opened the file
632 * already? XXX - NFS implications? */
633 oit.it_flags &= ~O_EXCL;
635 /* bug20584, if "it_flags" contains O_CREAT, the file will be
636 * created if necessary, then "IT_CREAT" should be set to keep
637 * consistent with it */
638 if (oit.it_flags & O_CREAT)
639 oit.it_op |= IT_CREAT;
645 /* Let's see if we have file open on MDS already. */
646 if (it->it_flags & FMODE_WRITE) {
647 och_p = &lli->lli_mds_write_och;
648 och_usecount = &lli->lli_open_fd_write_count;
649 } else if (it->it_flags & FMODE_EXEC) {
650 och_p = &lli->lli_mds_exec_och;
651 och_usecount = &lli->lli_open_fd_exec_count;
653 och_p = &lli->lli_mds_read_och;
654 och_usecount = &lli->lli_open_fd_read_count;
657 mutex_lock(&lli->lli_och_mutex);
658 if (*och_p) { /* Open handle is present */
659 if (it_disposition(it, DISP_OPEN_OPEN)) {
660 /* Well, there's extra open request that we do not need,
661 let's close it somehow. This will decref request. */
662 rc = it_open_error(DISP_OPEN_OPEN, it);
664 mutex_unlock(&lli->lli_och_mutex);
665 GOTO(out_openerr, rc);
668 ll_release_openhandle(file->f_dentry, it);
/* Reuse the existing MDS handle for this local open. */
672 rc = ll_local_open(file, it, fd, NULL);
675 mutex_unlock(&lli->lli_och_mutex);
676 GOTO(out_openerr, rc);
679 LASSERT(*och_usecount == 0);
680 if (!it->d.lustre.it_disposition) {
681 /* We cannot just request lock handle now, new ELC code
682 means that one of other OPEN locks for this file
683 could be cancelled, and since blocking ast handler
684 would attempt to grab och_mutex as well, that would
685 result in a deadlock */
686 mutex_unlock(&lli->lli_och_mutex);
688 * Normally called under two situations:
690 * 2. revalidate with IT_OPEN (revalidate doesn't
691 * execute this intent any more).
693 * Always fetch MDS_OPEN_LOCK if this is not setstripe.
695 * Always specify MDS_OPEN_BY_FID because we don't want
696 * to get file with different fid.
698 it->it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID;
699 rc = ll_intent_file_open(file, NULL, 0, it);
701 GOTO(out_openerr, rc);
705 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
707 GOTO(out_och_free, rc = -ENOMEM);
711 /* md_intent_lock() didn't get a request ref if there was an
712 * open error, so don't do cleanup on the request here
714 /* XXX (green): Should not we bail out on any error here, not
715 * just open error? */
716 rc = it_open_error(DISP_OPEN_OPEN, it);
718 GOTO(out_och_free, rc);
720 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
721 "inode %p: disposition %x, status %d\n", inode,
722 it_disposition(it, ~0), it->d.lustre.it_status);
724 rc = ll_local_open(file, it, fd, *och_p);
726 GOTO(out_och_free, rc);
728 mutex_unlock(&lli->lli_och_mutex);
731 /* Must do this outside lli_och_mutex lock to prevent deadlock where
732 different kind of OPEN lock for this same inode gets cancelled
733 by ldlm_cancel_lru */
734 if (!S_ISREG(inode->i_mode))
735 GOTO(out_och_free, rc);
/* O_LOV_DELAY_CREATE (or read-only open of an object-less file):
 * postpone OST object creation until setstripe. */
739 if (!lli->lli_has_smd &&
740 (cl_is_lov_delay_create(file->f_flags) ||
741 (file->f_mode & FMODE_WRITE) == 0)) {
742 CDEBUG(D_INODE, "object creation was delayed\n");
743 GOTO(out_och_free, rc);
745 cl_lov_delay_create_clear(&file->f_flags);
746 GOTO(out_och_free, rc);
/* Error/exit paths: free the handle, undo statahead ownership, drop
 * fd, account the open. Labels are elided in this extraction. */
750 if (och_p && *och_p) {
751 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
752 *och_p = NULL; /* OBD_FREE writes some magic there */
755 mutex_unlock(&lli->lli_och_mutex);
758 if (opendir_set != 0)
759 ll_stop_statahead(inode, lli->lli_opendir_key);
761 ll_file_data_put(fd);
763 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
766 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
767 ptlrpc_req_finished(it->d.lustre.it_data);
768 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* Blocking AST for lease locks: on LDLM_CB_BLOCKING, asynchronously
 * cancel the lock; the CANCELING branch body is elided here. Unlike
 * ll_md_blocking_ast() it does not touch any open handle. */
774 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
775 struct ldlm_lock_desc *desc, void *data, int flag)
778 struct lustre_handle lockh;
782 case LDLM_CB_BLOCKING:
783 ldlm_lock2handle(lock, &lockh);
784 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
786 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
790 case LDLM_CB_CANCELING:
798 * Acquire a lease and open the file.
/* Returns the lease open handle on success, ERR_PTR on failure.
 * @fmode must be exactly FMODE_READ or FMODE_WRITE. When @file is
 * given, the existing per-mode open handle (if this fd is its sole
 * user) is passed as op_handle so the MDT treats the lease open as
 * coming from the same owner.
 * NOTE(review): elided lines in this extraction (och allocation, rc
 * declarations, several branches) -- verify against the full source. */
800 static struct obd_client_handle *
801 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
804 struct lookup_intent it = { .it_op = IT_OPEN };
805 struct ll_sb_info *sbi = ll_i2sbi(inode);
806 struct md_op_data *op_data;
807 struct ptlrpc_request *req = NULL;
808 struct lustre_handle old_handle = { 0 };
809 struct obd_client_handle *och = NULL;
814 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
815 RETURN(ERR_PTR(-EINVAL));
818 struct ll_inode_info *lli = ll_i2info(inode);
819 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
820 struct obd_client_handle **och_p;
823 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
824 RETURN(ERR_PTR(-EPERM));
826 /* Get the openhandle of the file */
828 mutex_lock(&lli->lli_och_mutex);
/* Only one lease per fd. */
829 if (fd->fd_lease_och != NULL) {
830 mutex_unlock(&lli->lli_och_mutex);
834 if (fd->fd_och == NULL) {
835 if (file->f_mode & FMODE_WRITE) {
836 LASSERT(lli->lli_mds_write_och != NULL);
837 och_p = &lli->lli_mds_write_och;
838 och_usecount = &lli->lli_open_fd_write_count;
840 LASSERT(lli->lli_mds_read_och != NULL);
841 och_p = &lli->lli_mds_read_och;
842 och_usecount = &lli->lli_open_fd_read_count;
844 if (*och_usecount == 1) {
851 mutex_unlock(&lli->lli_och_mutex);
852 if (rc < 0) /* more than 1 opener */
855 LASSERT(fd->fd_och != NULL);
856 old_handle = fd->fd_och->och_fh;
861 RETURN(ERR_PTR(-ENOMEM));
863 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
864 LUSTRE_OPC_ANY, NULL);
866 GOTO(out, rc = PTR_ERR(op_data));
868 /* To tell the MDT this openhandle is from the same owner */
869 op_data->op_handle = old_handle;
871 it.it_flags = fmode | open_flags;
872 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
873 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
874 &ll_md_blocking_lease_ast,
875 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
876 * it can be cancelled which may mislead applications that the lease is
878 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
879 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
880 * doesn't deal with openhandle, so normal openhandle will be leaked. */
881 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
882 ll_finish_md_op_data(op_data);
883 ptlrpc_req_finished(req);
885 GOTO(out_release_it, rc);
887 if (it_disposition(&it, DISP_LOOKUP_NEG))
888 GOTO(out_release_it, rc = -ENOENT);
890 rc = it_open_error(DISP_OPEN_OPEN, &it);
892 GOTO(out_release_it, rc);
894 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
895 ll_och_fill(sbi->ll_md_exp, &it, och);
897 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
898 GOTO(out_close, rc = -EOPNOTSUPP);
900 /* already get lease, handle lease lock */
901 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
902 if (it.d.lustre.it_lock_mode == 0 ||
903 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
904 /* open lock must return for lease */
905 CERROR(DFID "lease granted but no open lock, %d/"LPU64".\n",
906 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
907 it.d.lustre.it_lock_bits);
908 GOTO(out_close, rc = -EPROTO);
911 ll_intent_release(&it);
/* out_close: drop lease lock and close the handle we just opened. */
915 /* Cancel open lock */
916 if (it.d.lustre.it_lock_mode != 0) {
917 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
918 it.d.lustre.it_lock_mode);
919 it.d.lustre.it_lock_mode = 0;
920 och->och_lease_handle.cookie = 0ULL;
922 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
924 CERROR("%s: error closing file "DFID": %d\n",
925 ll_get_fsname(inode->i_sb, NULL, 0),
926 PFID(&ll_i2info(inode)->lli_fid), rc2);
927 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
929 ll_intent_release(&it);
937 * Release lease and close the file.
938 * It will check if the lease has ever broken.
/* Cancels the lease lock (unless already cancelled by the server --
 * "broken"), reports that state through @lease_broken, then closes
 * the open handle on the MDS. */
940 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
943 struct ldlm_lock *lock;
944 bool cancelled = true;
948 lock = ldlm_handle2lock(&och->och_lease_handle);
950 lock_res_and_lock(lock);
951 cancelled = ldlm_is_cancel(lock);
952 unlock_res_and_lock(lock);
956 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
957 PFID(&ll_i2info(inode)->lli_fid), cancelled);
960 ldlm_cli_cancel(&och->och_lease_handle, 0);
961 if (lease_broken != NULL)
962 *lease_broken = cancelled;
964 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
969 /* Fills the obdo with the attributes for the lsm */
/* Asynchronous OST getattr over all stripes of @lsm, gathered via a
 * ptlrpc set. LL_DV_WR_FLUSH / LL_DV_RD_FLUSH request a server-side
 * lock (OBD_FL_SRVLOCK) to flush dirty data before sampling the data
 * version. On return, o_valid is masked down to the fields callers
 * may trust. NOTE(review): oinfo.oi_oa assignment and error-return
 * lines are elided in this extraction. */
970 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
971 struct obd_capa *capa, struct obdo *obdo,
972 __u64 ioepoch, int dv_flags)
974 struct ptlrpc_request_set *set;
975 struct obd_info oinfo = { { { 0 } } };
980 LASSERT(lsm != NULL);
984 oinfo.oi_oa->o_oi = lsm->lsm_oi;
985 oinfo.oi_oa->o_mode = S_IFREG;
986 oinfo.oi_oa->o_ioepoch = ioepoch;
987 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
988 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
989 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
990 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
991 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
992 OBD_MD_FLDATAVERSION;
993 oinfo.oi_capa = capa;
994 if (dv_flags & (LL_DV_WR_FLUSH | LL_DV_RD_FLUSH)) {
995 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
996 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
997 if (dv_flags & LL_DV_WR_FLUSH)
998 oinfo.oi_oa->o_flags |= OBD_FL_FLUSH;
1001 set = ptlrpc_prep_set();
1003 CERROR("can't allocate ptlrpc set\n");
1006 rc = obd_getattr_async(exp, &oinfo, set);
1008 rc = ptlrpc_set_wait(set);
1009 ptlrpc_set_destroy(set);
1012 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
1013 OBD_MD_FLATIME | OBD_MD_FLMTIME |
1014 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
1015 OBD_MD_FLDATAVERSION | OBD_MD_FLFLAGS);
/* WR_FLUSH was requested but the OST did not confirm the flush. */
1016 if (dv_flags & LL_DV_WR_FLUSH &&
1017 !(oinfo.oi_oa->o_valid & OBD_MD_FLFLAGS &&
1018 oinfo.oi_oa->o_flags & OBD_FL_FLUSH))
1025 * Performs the getattr on the inode and updates its fields.
1026 * If @sync != 0, perform the getattr under the server-side lock.
1028 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
1029 __u64 ioepoch, int sync)
1031 struct obd_capa *capa = ll_mdscapa_get(inode);
1032 struct lov_stripe_md *lsm;
1036 lsm = ccc_inode_lsm_get(inode);
1037 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
1038 capa, obdo, ioepoch, sync ? LL_DV_RD_FLUSH : 0);
/* On success (success branch is elided here), refresh the VFS inode
 * from the obdo and log the result. */
1041 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
1043 obdo_refresh_inode(inode, obdo, obdo->o_valid);
1044 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
1045 " blksize %lu\n", POSTID(oi), i_size_read(inode),
1046 (unsigned long long)inode->i_blocks,
1047 (unsigned long)ll_inode_blksize(inode));
1049 ccc_inode_lsm_put(inode, lsm);
/* Merge MDS-provided timestamps (lli_lvb) with the OST-side attributes
 * from the cl_object, taking the most recent of each, and update the
 * inode's size/blocks/times under the inode size lock. */
1053 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
1055 struct ll_inode_info *lli = ll_i2info(inode);
1056 struct cl_object *obj = lli->lli_clob;
1057 struct cl_attr *attr = ccc_env_thread_attr(env);
1063 ll_inode_size_lock(inode);
1064 /* merge timestamps the most recently obtained from mds with
1065 timestamps obtained from osts */
1066 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
1067 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
1068 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
1069 inode_init_lvb(inode, &lvb);
1071 cl_object_attr_lock(obj);
1072 rc = cl_object_attr_get(env, obj, attr);
1073 cl_object_attr_unlock(obj);
/* Keep the newer of MDS vs OST timestamps. */
1076 if (lvb.lvb_atime < attr->cat_atime)
1077 lvb.lvb_atime = attr->cat_atime;
1078 if (lvb.lvb_ctime < attr->cat_ctime)
1079 lvb.lvb_ctime = attr->cat_ctime;
1080 if (lvb.lvb_mtime < attr->cat_mtime)
1081 lvb.lvb_mtime = attr->cat_mtime;
1083 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
1084 PFID(&lli->lli_fid), attr->cat_size);
1085 cl_isize_write_nolock(inode, attr->cat_size);
1087 inode->i_blocks = attr->cat_blocks;
1089 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1090 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1091 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1093 ll_inode_size_unlock(inode);
/* Glimpse the OST attributes for @lsm and copy size/blocks/times into
 * the user-visible stat structure @st. NOTE(review): the success-check
 * branch around the copies is elided in this extraction. */
1098 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1101 struct obdo obdo = { 0 };
1104 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1106 st->st_size = obdo.o_size;
1107 st->st_blocks = obdo.o_blocks;
1108 st->st_mtime = obdo.o_mtime;
1109 st->st_atime = obdo.o_atime;
1110 st->st_ctime = obdo.o_ctime;
/* Decide whether atime updates should be suppressed for @file, checking
 * the open flags, inode flags, mount flags and sb flags in turn.
 * Adapted from the kernel's file_accessed()/touch_atime(); the per-case
 * "return true" lines are elided in this extraction. */
1115 static bool file_is_noatime(const struct file *file)
1117 const struct vfsmount *mnt = file->f_path.mnt;
1118 const struct inode *inode = file->f_path.dentry->d_inode;
1120 /* Adapted from file_accessed() and touch_atime().*/
1121 if (file->f_flags & O_NOATIME)
1124 if (inode->i_flags & S_NOATIME)
1127 if (IS_NOATIME(inode))
1130 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1133 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1136 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/* Initialize a cl_io for read (@write == 0) or write from the file's
 * open flags: nonblock/append/sync propagation, lock requirements
 * (never for nolock files, mandatory for O_APPEND) and noatime. */
1142 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
1144 struct inode *inode = file->f_dentry->d_inode;
1146 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1148 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1149 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1150 file->f_flags & O_DIRECT ||
1153 io->ci_obj = ll_i2info(inode)->lli_clob;
1154 io->ci_lockreq = CILR_MAYBE;
1155 if (ll_file_nolock(file)) {
1156 io->ci_lockreq = CILR_NEVER;
1157 io->ci_no_srvlock = 1;
1158 } else if (file->f_flags & O_APPEND) {
1159 io->ci_lockreq = CILR_MANDATORY;
1162 io->ci_noatime = file_is_noatime(file);
/* Common driver for read/write/splice IO through the cl_io machinery:
 * sets up the io, runs cl_io_loop(), restarts on short progress when
 * ci_need_restart is set, and accounts bytes read/written. Serializes
 * non-group-locked writes with lli_write_mutex and all normal IO with
 * lli_trunc_sem against truncate.
 * NOTE(review): elided lines in this extraction (declarations, case
 * labels, restart goto, RETURN) -- verify against the full source. */
1166 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1167 struct file *file, enum cl_io_type iot,
1168 loff_t *ppos, size_t count)
1170 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
1171 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1176 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zd\n",
1177 file->f_dentry->d_name.name, iot, *ppos, count);
1180 io = ccc_env_thread_io(env);
1181 ll_io_init(io, file, iot == CIT_WRITE);
1183 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1184 struct vvp_io *vio = vvp_env_io(env);
1185 struct ccc_io *cio = ccc_env_io(env);
1186 int write_mutex_locked = 0;
1188 cio->cui_fd = LUSTRE_FPRIVATE(file);
1189 vio->cui_io_subtype = args->via_io_subtype;
1191 ll_cl_add(file, env, io);
1193 switch (vio->cui_io_subtype) {
1195 cio->cui_iov = args->u.normal.via_iov;
1196 cio->cui_nrsegs = args->u.normal.via_nrsegs;
1197 cio->cui_tot_nrsegs = cio->cui_nrsegs;
1198 cio->cui_iocb = args->u.normal.via_iocb;
1199 if ((iot == CIT_WRITE) &&
1200 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1201 if (mutex_lock_interruptible(&lli->
1203 GOTO(out, result = -ERESTARTSYS);
1204 write_mutex_locked = 1;
1206 down_read(&lli->lli_trunc_sem);
1209 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1210 vio->u.splice.cui_flags = args->u.splice.via_flags;
1213 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
1216 result = cl_io_loop(env, io);
1217 if (args->via_io_subtype == IO_NORMAL)
1218 up_read(&lli->lli_trunc_sem);
1219 if (write_mutex_locked)
1220 mutex_unlock(&lli->lli_write_mutex);
1221 ll_cl_remove(file, env);
1223 /* cl_io_rw_init() handled IO */
1224 result = io->ci_result;
1227 if (io->ci_nob > 0) {
1228 result = io->ci_nob;
1229 *ppos = io->u.ci_wr.wr.crw_pos;
1233 cl_io_fini(env, io);
1234 /* If any bit been read/written (result != 0), we just return
1235 * short read/write instead of restart io. */
1236 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1237 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
1238 iot == CIT_READ ? "read" : "write",
1239 file->f_dentry->d_name.name, *ppos, count);
1240 LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
1244 if (iot == CIT_READ) {
1246 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1247 LPROC_LL_READ_BYTES, result);
1248 } else if (iot == CIT_WRITE) {
1250 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1251 LPROC_LL_WRITE_BYTES, result);
1252 fd->fd_write_failed = false;
1253 } else if (result != -ERESTARTSYS) {
1254 fd->fd_write_failed = true;
1257 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1264 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/* Validate an iovec array and compute the total byte count, mirroring
 * the kernel's __generic_file_aio_write_nolock checks: reject negative
 * segment lengths and cumulative overflow; on the first inaccessible
 * segment, truncate *nr_segs to the good prefix (elided lines
 * presumably set *count and return — TODO confirm). */
1266 static int ll_file_get_iov_count(const struct iovec *iov,
1267 unsigned long *nr_segs, size_t *count)
1272 for (seg = 0; seg < *nr_segs; seg++) {
1273 const struct iovec *iv = &iov[seg];
1276 * If any segment has a negative length, or the cumulative
1277 * length ever wraps negative then return -EINVAL.
/* (cnt|iv->iov_len) < 0 as ssize_t catches both cases in one test */
1280 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1282 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1287 cnt -= iv->iov_len; /* This segment is no good */
/* aio_read entry point: validate the iovec, fill IO_NORMAL vvp args
 * and hand off to ll_file_io_generic() with CIT_READ.  iocb->ki_pos is
 * advanced by the bytes read. */
1294 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1295 unsigned long nr_segs, loff_t pos)
1298 struct vvp_io_args *args;
1304 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1308 env = cl_env_get(&refcheck);
1310 RETURN(PTR_ERR(env));
1312 args = vvp_env_args(env, IO_NORMAL);
/* cast away const: the iovec is only read, not modified */
1313 args->u.normal.via_iov = (struct iovec *)iov;
1314 args->u.normal.via_nrsegs = nr_segs;
1315 args->u.normal.via_iocb = iocb;
1317 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1318 &iocb->ki_pos, count);
1319 cl_env_put(env, &refcheck);
/* Synchronous read(2) entry point: wrap the user buffer in a
 * single-segment iovec plus a sync kiocb (both from per-env scratch
 * storage) and delegate to ll_file_aio_read(); *ppos is updated from
 * the kiocb on return. */
1323 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1327 struct iovec *local_iov;
1328 struct kiocb *kiocb;
1333 env = cl_env_get(&refcheck);
1335 RETURN(PTR_ERR(env));
1337 local_iov = &vvp_env_info(env)->vti_local_iov;
1338 kiocb = &vvp_env_info(env)->vti_kiocb;
1339 local_iov->iov_base = (void __user *)buf;
1340 local_iov->iov_len = count;
1341 init_sync_kiocb(kiocb, file);
1342 kiocb->ki_pos = *ppos;
/* kernel-version compat: the remaining-byte field was renamed */
1343 #ifdef HAVE_KIOCB_KI_LEFT
1344 kiocb->ki_left = count;
1346 kiocb->ki_nbytes = count;
1349 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1350 *ppos = kiocb->ki_pos;
1352 cl_env_put(env, &refcheck);
1357 * Write to a file (through the page cache).
/* aio_write entry point: validate the iovec, fill IO_NORMAL vvp args
 * and hand off to ll_file_io_generic() with CIT_WRITE.  Mirrors
 * ll_file_aio_read(). */
1360 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1361 unsigned long nr_segs, loff_t pos)
1364 struct vvp_io_args *args;
1370 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1374 env = cl_env_get(&refcheck);
1376 RETURN(PTR_ERR(env));
1378 args = vvp_env_args(env, IO_NORMAL);
/* cast away const: the iovec is only read, not modified */
1379 args->u.normal.via_iov = (struct iovec *)iov;
1380 args->u.normal.via_nrsegs = nr_segs;
1381 args->u.normal.via_iocb = iocb;
1383 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1384 &iocb->ki_pos, count);
1385 cl_env_put(env, &refcheck);
/* Synchronous write(2) entry point: wrap the user buffer in a
 * single-segment iovec plus a sync kiocb and delegate to
 * ll_file_aio_write(); *ppos is updated from the kiocb on return. */
1389 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1393 struct iovec *local_iov;
1394 struct kiocb *kiocb;
1399 env = cl_env_get(&refcheck);
1401 RETURN(PTR_ERR(env));
1403 local_iov = &vvp_env_info(env)->vti_local_iov;
1404 kiocb = &vvp_env_info(env)->vti_kiocb;
1405 local_iov->iov_base = (void __user *)buf;
1406 local_iov->iov_len = count;
1407 init_sync_kiocb(kiocb, file);
1408 kiocb->ki_pos = *ppos;
/* kernel-version compat: the remaining-byte field was renamed */
1409 #ifdef HAVE_KIOCB_KI_LEFT
1410 kiocb->ki_left = count;
1412 kiocb->ki_nbytes = count;
1415 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1416 *ppos = kiocb->ki_pos;
1418 cl_env_put(env, &refcheck);
1423 * Send file content (through pagecache) somewhere with helper
/* splice_read entry point: run a CIT_READ through the generic I/O
 * path with IO_SPLICE args so pages flow into @pipe instead of an
 * iovec. */
1425 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1426 struct pipe_inode_info *pipe, size_t count,
1430 struct vvp_io_args *args;
1435 env = cl_env_get(&refcheck);
1437 RETURN(PTR_ERR(env));
1439 args = vvp_env_args(env, IO_SPLICE);
1440 args->u.splice.via_pipe = pipe;
1441 args->u.splice.via_flags = flags;
1443 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1444 cl_env_put(env, &refcheck);
/* Re-create the OST object(s) backing @inode (e.g. after OST data
 * loss): clone the inode's lov_stripe_md, build an obdo carrying
 * OBD_FL_RECREATE_OBJS plus the target object id/ost index, and issue
 * obd_create() under the inode size lock. */
1448 static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
1451 struct obd_export *exp = ll_i2dtexp(inode);
1452 struct obd_trans_info oti = { 0 };
1453 struct obdo *oa = NULL;
1456 struct lov_stripe_md *lsm = NULL, *lsm2;
1463 lsm = ccc_inode_lsm_get(inode);
1464 if (!lsm_has_objects(lsm))
1465 GOTO(out, rc = -ENOENT);
1467 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1468 (lsm->lsm_stripe_count));
1470 OBD_ALLOC_LARGE(lsm2, lsm_size);
1472 GOTO(out, rc = -ENOMEM);
/* o_nlink carries the target OST index for the recreate request */
1475 oa->o_nlink = ost_idx;
1476 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1477 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1478 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1479 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1480 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1481 memcpy(lsm2, lsm, lsm_size);
1482 ll_inode_size_lock(inode);
1483 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1484 ll_inode_size_unlock(inode);
1486 OBD_FREE_LARGE(lsm2, lsm_size);
1489 ccc_inode_lsm_put(inode, lsm);
/* LL_IOC_RECREATE_OBJ handler: root-only; copy the ll_recreate_obj
 * request from userspace, build an MDT0 ost_id from its object id and
 * recreate that object on the requested OST index. */
1494 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1496 struct ll_recreate_obj ucreat;
1500 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1503 if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1507 ostid_set_seq_mdt0(&oi);
1508 ostid_set_id(&oi, ucreat.lrc_id);
1509 RETURN(ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx));
/* LL_IOC_RECREATE_FID handler: root-only; copy a lu_fid from
 * userspace, convert it to an ost_id and derive the OST index from
 * bits 16..31 of the FID sequence. */
1512 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1519 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1522 if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1525 fid_to_ostid(&fid, &oi);
1526 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1527 RETURN(ll_lov_recreate(inode, &oi, ost_idx));
/* Apply a user-supplied striping EA (@lum) to @inode by re-opening the
 * file by FID with an IT_OPEN intent carrying the layout.  Fails with
 * -EEXIST if the inode already has stripes.  On success the transient
 * open handle is released and the delay-create flag is cleared. */
1530 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1531 __u64 flags, struct lov_user_md *lum,
1534 struct lov_stripe_md *lsm = NULL;
1535 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
/* a layout can only be set once; existing stripes are an error */
1539 lsm = ccc_inode_lsm_get(inode);
1541 ccc_inode_lsm_put(inode, lsm);
1542 CDEBUG(D_IOCTL, "stripe already exists for inode "DFID"\n",
1543 PFID(ll_inode2fid(inode)));
1544 GOTO(out, rc = -EEXIST);
1547 ll_inode_size_lock(inode);
1548 oit.it_flags |= MDS_OPEN_BY_FID;
1549 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1551 GOTO(out_unlock, rc);
1552 rc = oit.d.lustre.it_status;
1554 GOTO(out_req_free, rc);
/* we only needed the open to install the layout; close it again */
1556 ll_release_openhandle(file->f_dentry, &oit);
1559 ll_inode_size_unlock(inode);
1560 ll_intent_release(&oit);
1561 ccc_inode_lsm_put(inode, lsm);
1563 cl_lov_delay_create_clear(&file->f_flags);
/* out_req_free (label elided): drop the intent's request */
1566 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/* Fetch the LOV EA for @filename from the MDS via md_getattr_name(),
 * validate the magic (v1/v3 only) and, on little-endian-from-MDS data,
 * swab it to host endianness before returning the lmm buffer, its size
 * and the (still-referenced) request to the caller. */
1570 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1571 struct lov_mds_md **lmmp, int *lmm_size,
1572 struct ptlrpc_request **request)
1574 struct ll_sb_info *sbi = ll_i2sbi(inode);
1575 struct mdt_body *body;
1576 struct lov_mds_md *lmm = NULL;
1577 struct ptlrpc_request *req = NULL;
1578 struct md_op_data *op_data;
1581 rc = ll_get_default_mdsize(sbi, &lmmsize);
1585 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1586 strlen(filename), lmmsize,
1587 LUSTRE_OPC_ANY, NULL);
1588 if (IS_ERR(op_data))
1589 RETURN(PTR_ERR(op_data));
1591 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1592 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1593 ll_finish_md_op_data(op_data);
1595 CDEBUG(D_INFO, "md_getattr_name failed "
1596 "on %s: rc %d\n", filename, rc);
1600 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1601 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1603 lmmsize = body->mbo_eadatasize;
/* no EA on file/dir (or zero size — condition partly elided) */
1605 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1607 GOTO(out, rc = -ENODATA);
1610 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1611 LASSERT(lmm != NULL);
1613 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1614 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1615 GOTO(out, rc = -EPROTO);
1619 * This is coming from the MDS, so is probably in
1620 * little endian. We convert it to host endian before
1621 * passing it to userspace.
/* only swab when host order differs from wire (LE) order */
1623 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1626 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1627 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1630 /* if function called for directory - we should
1631 * avoid swab not existent lsm objects */
1632 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1633 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1634 if (S_ISREG(body->mbo_mode))
1635 lustre_swab_lov_user_md_objects(
1636 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1638 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1639 lustre_swab_lov_user_md_v3(
1640 (struct lov_user_md_v3 *)lmm);
1641 if (S_ISREG(body->mbo_mode))
1642 lustre_swab_lov_user_md_objects(
1643 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1650 *lmm_size = lmmsize;
/* LL_IOC_LOV_SETEA handler: root-only; copy a lov_user_md with one
 * trailing ost_data entry from userspace and install it via
 * ll_lov_setstripe_ea_info() with MDS_OPEN_HAS_OBJS (the EA already
 * names its objects). */
1655 static int ll_lov_setea(struct inode *inode, struct file *file,
1658 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1659 struct lov_user_md *lump;
1660 int lum_size = sizeof(struct lov_user_md) +
1661 sizeof(struct lov_user_ost_data);
1665 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1668 OBD_ALLOC_LARGE(lump, lum_size);
1672 if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1673 OBD_FREE_LARGE(lump, lum_size);
1677 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1679 OBD_FREE_LARGE(lump, lum_size);
/* LL_IOC_LOV_SETSTRIPE handler: copy a v1 lov_user_md first (it is a
 * prefix of v3), re-copy as v3 if the magic says so, install the
 * layout, then echo the resulting stripe info back to userspace via a
 * GETSTRIPE obd_iocontrol. */
1683 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1686 struct lov_user_md_v3 lumv3;
1687 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1688 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1689 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1691 __u64 flags = FMODE_WRITE;
1694 /* first try with v1 which is smaller than v3 */
1695 lum_size = sizeof(struct lov_user_md_v1);
1696 if (copy_from_user(lumv1, lumv1p, lum_size))
1699 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1700 lum_size = sizeof(struct lov_user_md_v3);
1701 if (copy_from_user(&lumv3, lumv3p, lum_size))
1705 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1707 struct lov_stripe_md *lsm;
/* pre-clear the user's stripe_count in case GETSTRIPE fails */
1710 put_user(0, &lumv1p->lmm_stripe_count);
1712 ll_layout_refresh(inode, &gen);
1713 lsm = ccc_inode_lsm_get(inode);
1714 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1715 0, lsm, (void *)arg);
1716 ccc_inode_lsm_put(inode, lsm);
/* LL_IOC_LOV_GETSTRIPE handler: grab the inode's stripe MD and let the
 * LOV layer copy it out to the userspace buffer at @arg. */
1721 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1723 struct lov_stripe_md *lsm;
1727 lsm = ccc_inode_lsm_get(inode);
1729 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1731 ccc_inode_lsm_put(inode, lsm);
/* LL_IOC_GROUP_LOCK handler: take a cl-layer group lock with gid @arg
 * on behalf of this open file.  Only one group lock per file
 * descriptor; racing acquirers are detected under lli_lock and the
 * loser drops its freshly-acquired lock. */
1736 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1738 struct ll_inode_info *lli = ll_i2info(inode);
1739 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1740 struct ccc_grouplock grouplock;
1744 if (ll_file_nolock(file))
1745 RETURN(-EOPNOTSUPP);
1747 spin_lock(&lli->lli_lock);
1748 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1749 CWARN("group lock already existed with gid %lu\n",
1750 fd->fd_grouplock.cg_gid);
1751 spin_unlock(&lli->lli_lock);
1754 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1755 spin_unlock(&lli->lli_lock);
/* enqueue outside the spinlock; may block unless O_NONBLOCK */
1757 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1758 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* re-check under the lock: another thread may have won meanwhile */
1762 spin_lock(&lli->lli_lock);
1763 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1764 spin_unlock(&lli->lli_lock);
1765 CERROR("another thread just won the race\n");
1766 cl_put_grouplock(&grouplock);
1770 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1771 fd->fd_grouplock = grouplock;
1772 spin_unlock(&lli->lli_lock);
1774 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/* LL_IOC_GROUP_UNLOCK handler: release the group lock held by this
 * file descriptor, but only if one is held and its gid matches @arg.
 * State is cleared under lli_lock; the cl lock is dropped outside. */
1778 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1780 struct ll_inode_info *lli = ll_i2info(inode);
1781 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1782 struct ccc_grouplock grouplock;
1785 spin_lock(&lli->lli_lock);
1786 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1787 spin_unlock(&lli->lli_lock);
1788 CWARN("no group lock held\n");
1791 LASSERT(fd->fd_grouplock.cg_lock != NULL);
1793 if (fd->fd_grouplock.cg_gid != arg) {
1794 CWARN("group lock %lu doesn't match current id %lu\n",
1795 arg, fd->fd_grouplock.cg_gid);
1796 spin_unlock(&lli->lli_lock);
/* detach the grouplock from the fd before dropping the spinlock */
1800 grouplock = fd->fd_grouplock;
1801 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1802 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1803 spin_unlock(&lli->lli_lock);
1805 cl_put_grouplock(&grouplock);
1806 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1811 * Close inode open handle
1813 * \param dentry [in] dentry which contains the inode
1814 * \param it [in,out] intent which contains open info and result
1817 * \retval <0 failure
/* Close the MDS open handle carried by a lookup intent (used when an
 * open was performed only to install/read a layout).  No-op for the
 * filesystem root or when the intent holds no open disposition. */
1819 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1821 struct inode *inode = dentry->d_inode;
1822 struct obd_client_handle *och;
1828 /* Root ? Do nothing. */
1829 if (dentry->d_inode->i_sb->s_root == dentry)
1832 /* No open handle to close? Move away */
1833 if (!it_disposition(it, DISP_OPEN_OPEN))
1836 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1838 OBD_ALLOC(och, sizeof(*och));
1840 GOTO(out, rc = -ENOMEM);
/* transfer the open handle from the intent into @och, then close */
1842 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1844 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1847 /* this one is in place of ll_file_open */
1848 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1849 ptlrpc_req_finished(it->d.lustre.it_data);
1850 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1856 * Get size for inode for which FIEMAP mapping is requested.
1857 * Make the FIEMAP get_info call and returns the result.
/* Execute a FIEMAP request for @inode: validate flags, optionally
 * flush dirty pages (FIEMAP_FLAG_SYNC), then ask the data export for
 * the extent mapping via KEY_FIEMAP obd_get_info.  @num_bytes bounds
 * the reply buffer. */
1859 static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1862 struct obd_export *exp = ll_i2dtexp(inode);
1863 struct lov_stripe_md *lsm = NULL;
1864 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1865 __u32 vallen = num_bytes;
1869 /* Checks for fiemap flags */
1870 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* report back which flags we do not support (strip compat ones) */
1871 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1875 /* Check for FIEMAP_FLAG_SYNC */
1876 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1877 rc = filemap_fdatawrite(inode->i_mapping);
1882 lsm = ccc_inode_lsm_get(inode);
1886 /* If the stripe_count > 1 and the application does not understand
1887 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1889 if (lsm->lsm_stripe_count > 1 &&
1890 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1891 GOTO(out, rc = -EOPNOTSUPP);
1893 fm_key.oa.o_oi = lsm->lsm_oi;
1894 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1896 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1897 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1898 /* If filesize is 0, then there would be no objects for mapping */
1899 if (fm_key.oa.o_size == 0) {
1900 fiemap->fm_mapped_extents = 0;
1904 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1906 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1909 CERROR("obd_get_info failed: rc = %d\n", rc);
1912 ccc_inode_lsm_put(inode, lsm);
/* OBD_IOC_FID2PATH handler: resolve a FID to a path via the MDC.
 * Reads the caller's getinfo_fid2path header to size the output
 * buffer (header + gf_pathlen), runs the iocontrol, and copies the
 * whole result back.  Requires DAC_READ_SEARCH unless the mount
 * allows user fid2path. */
1916 int ll_fid2path(struct inode *inode, void *arg)
1918 struct obd_export *exp = ll_i2mdexp(inode);
1919 struct getinfo_fid2path *gfout, *gfin;
1923 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1924 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1927 /* Need to get the buflen */
1928 OBD_ALLOC_PTR(gfin);
1931 if (copy_from_user(gfin, arg, sizeof(*gfin))) {
/* NOTE(review): gf_pathlen comes from userspace and sizes the
 * allocation below; any bound check is in elided lines — confirm */
1936 outsize = sizeof(*gfout) + gfin->gf_pathlen;
1937 OBD_ALLOC(gfout, outsize);
1938 if (gfout == NULL) {
1942 memcpy(gfout, gfin, sizeof(*gfout));
1945 /* Call mdc_iocontrol */
1946 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1950 if (copy_to_user(arg, gfout, outsize))
1954 OBD_FREE(gfout, outsize);
/* FSFILT_IOC_FIEMAP handler: size a kernel fiemap buffer from the
 * user's fm_extent_count, copy the request (and first extent, used as
 * the continuation point) in, run ll_do_fiemap(), and copy the header
 * plus mapped extents back out. */
1958 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1960 struct ll_user_fiemap *fiemap_s;
1961 size_t num_bytes, ret_bytes;
1962 unsigned int extent_count;
1965 /* Get the extent count so we can calculate the size of
1966 * required fiemap buffer */
1967 if (get_user(extent_count,
1968 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
/* NOTE(review): extent_count is user-controlled; overflow/bound
 * checks, if any, are in elided lines — confirm */
1970 num_bytes = sizeof(*fiemap_s) + (extent_count *
1971 sizeof(struct ll_fiemap_extent));
1973 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1974 if (fiemap_s == NULL)
1977 /* get the fiemap value */
1978 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1980 GOTO(error, rc = -EFAULT);
1982 /* If fm_extent_count is non-zero, read the first extent since
1983 * it is used to calculate end_offset and device from previous
1986 if (copy_from_user(&fiemap_s->fm_extents[0],
1987 (char __user *)arg + sizeof(*fiemap_s),
1988 sizeof(struct ll_fiemap_extent)))
1989 GOTO(error, rc = -EFAULT);
1992 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1996 ret_bytes = sizeof(struct ll_user_fiemap);
1998 if (extent_count != 0)
1999 ret_bytes += (fiemap_s->fm_mapped_extents *
2000 sizeof(struct ll_fiemap_extent));
2002 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
2006 OBD_FREE_LARGE(fiemap_s, num_bytes);
2011 * Read the data_version for inode.
2013 * This value is computed using stripe object version on OST.
2014 * Version is computed using server side locking.
2016 * @param sync if do sync on the OST side;
2018 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2019 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/* Compute the inode's data version from its OST objects via
 * ll_lsm_getattr().  @flags (LL_DV_RD_FLUSH/LL_DV_WR_FLUSH) controls
 * server-side flushing.  A stripeless inode reports version 0. */
2021 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2023 struct lov_stripe_md *lsm = NULL;
2024 struct ll_sb_info *sbi = ll_i2sbi(inode);
2025 struct obdo *obdo = NULL;
2029 /* If no stripe, we consider version is 0. */
2030 lsm = ccc_inode_lsm_get(inode);
2031 if (!lsm_has_objects(lsm)) {
2033 CDEBUG(D_INODE, "No object for inode\n");
2037 OBD_ALLOC_PTR(obdo);
2039 GOTO(out, rc = -ENOMEM);
2041 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, flags);
/* the OST must actually have returned a data version */
2043 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
2046 *data_version = obdo->o_data_version;
2052 ccc_inode_lsm_put(inode, lsm);
2057 * Trigger a HSM release request for the provided inode.
/* HSM release: free the OST objects of an archived file.  Takes a
 * write lease (MDS_OPEN_RELEASE), snapshots the data version with a
 * write flush, merges the latest size/time attributes, then closes the
 * open handle with the release request.  The lease is closed on error. */
2059 int ll_hsm_release(struct inode *inode)
2061 struct cl_env_nest nest;
2063 struct obd_client_handle *och = NULL;
2064 __u64 data_version = 0;
2068 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2069 ll_get_fsname(inode->i_sb, NULL, 0),
2070 PFID(&ll_i2info(inode)->lli_fid));
2072 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2074 GOTO(out, rc = PTR_ERR(och));
2076 /* Grab latest data_version and [am]time values */
2077 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2081 env = cl_env_nested_get(&nest);
2083 GOTO(out, rc = PTR_ERR(env));
2085 ll_merge_lvb(env, inode);
2086 cl_env_nested_put(&nest, env);
2088 /* Release the file.
2089 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2090 * we still need it to pack l_remote_handle to MDT. */
2091 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
2097 if (och != NULL && !IS_ERR(och)) /* close the file */
2098 ll_lease_close(och, inode, NULL);
/* Per-call scratch state for ll_swap_layouts(): the two inodes being
 * swapped (canonically ordered by FID), saved [am]time attrs for
 * restoration, and which data versions must still match. */
2103 struct ll_swap_stack {
2104 struct iattr ia1, ia2;
2106 struct inode *inode1, *inode2;
2107 bool check_dv1, check_dv2;
/* LL_IOC_LOV_SWAP_LAYOUTS implementation: atomically exchange the
 * layouts of two regular files on the same filesystem.  Steps:
 * permission/sanity checks, canonical FID ordering (to avoid lock
 * inversion), optional group locks to flush dirty cache, optional
 * data-version verification, the MDT swap iocontrol, and optional
 * restoration of mtime/atime. */
2110 static int ll_swap_layouts(struct file *file1, struct file *file2,
2111 struct lustre_swap_layouts *lsl)
2113 struct mdc_swap_layouts msl;
2114 struct md_op_data *op_data;
2117 struct ll_swap_stack *llss = NULL;
2120 OBD_ALLOC_PTR(llss);
2124 llss->inode1 = file1->f_dentry->d_inode;
2125 llss->inode2 = file2->f_dentry->d_inode;
2127 if (!S_ISREG(llss->inode2->i_mode))
2128 GOTO(free, rc = -EINVAL);
2130 if (inode_permission(llss->inode1, MAY_WRITE) ||
2131 inode_permission(llss->inode2, MAY_WRITE))
2132 GOTO(free, rc = -EPERM);
2134 if (llss->inode2->i_sb != llss->inode1->i_sb)
2135 GOTO(free, rc = -EXDEV);
2137 /* we use 2 bool because it is easier to swap than 2 bits */
2138 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2139 llss->check_dv1 = true;
2141 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2142 llss->check_dv2 = true;
2144 /* we cannot use lsl->sl_dvX directly because we may swap them */
2145 llss->dv1 = lsl->sl_dv1;
2146 llss->dv2 = lsl->sl_dv2;
2148 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2149 if (rc == 0) /* same file, done! */
2152 if (rc < 0) { /* sequentialize it */
2153 swap(llss->inode1, llss->inode2);
2155 swap(llss->dv1, llss->dv2);
2156 swap(llss->check_dv1, llss->check_dv2);
2160 if (gid != 0) { /* application asks to flush dirty cache */
2161 rc = ll_get_grouplock(llss->inode1, file1, gid);
2165 rc = ll_get_grouplock(llss->inode2, file2, gid);
/* undo the first group lock if the second failed */
2167 ll_put_grouplock(llss->inode1, file1, gid);
2172 /* to be able to restore mtime and atime after swap
2173 * we need to first save them */
2175 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2176 llss->ia1.ia_mtime = llss->inode1->i_mtime;
2177 llss->ia1.ia_atime = llss->inode1->i_atime;
2178 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2179 llss->ia2.ia_mtime = llss->inode2->i_mtime;
2180 llss->ia2.ia_atime = llss->inode2->i_atime;
2181 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2184 /* ultimate check, before swaping the layouts we check if
2185 * dataversion has changed (if requested) */
2186 if (llss->check_dv1) {
2187 rc = ll_data_version(llss->inode1, &dv, 0);
2190 if (dv != llss->dv1)
2191 GOTO(putgl, rc = -EAGAIN);
2194 if (llss->check_dv2) {
2195 rc = ll_data_version(llss->inode2, &dv, 0);
2198 if (dv != llss->dv2)
2199 GOTO(putgl, rc = -EAGAIN);
2202 /* struct md_op_data is used to send the swap args to the mdt
2203 * only flags is missing, so we use struct mdc_swap_layouts
2204 * through the md_op_data->op_data */
2205 /* flags from user space have to be converted before they are send to
2206 * server, no flag is sent today, they are only used on the client */
2209 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2210 0, LUSTRE_OPC_ANY, &msl);
2211 if (IS_ERR(op_data))
2212 GOTO(free, rc = PTR_ERR(op_data));
2214 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2215 sizeof(*op_data), op_data, NULL);
2216 ll_finish_md_op_data(op_data);
/* putgl (label elided): drop group locks in reverse order */
2220 ll_put_grouplock(llss->inode2, file2, gid);
2221 ll_put_grouplock(llss->inode1, file1, gid);
2224 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2228 /* clear useless flags */
2229 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2230 llss->ia1.ia_valid &= ~ATTR_MTIME;
2231 llss->ia2.ia_valid &= ~ATTR_MTIME;
2234 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2235 llss->ia1.ia_valid &= ~ATTR_ATIME;
2236 llss->ia2.ia_valid &= ~ATTR_ATIME;
2239 /* update time if requested */
/* note the cross-application: ia2 (saved from inode2) is applied to
 * file1 and vice versa, since the layouts were exchanged */
2241 if (llss->ia2.ia_valid != 0) {
2242 mutex_lock(&llss->inode1->i_mutex);
2243 rc = ll_setattr(file1->f_dentry, &llss->ia2);
2244 mutex_unlock(&llss->inode1->i_mutex);
2247 if (llss->ia1.ia_valid != 0) {
2250 mutex_lock(&llss->inode2->i_mutex);
2251 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2252 mutex_unlock(&llss->inode2->i_mutex);
/* Apply an HSM state-set request (@hss) to @inode via the MDC.
 * Non-root callers may only touch flags within HSM_USER_MASK. */
2264 static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2266 struct md_op_data *op_data;
2269 /* Non-root users are forbidden to set or clear flags which are
2270 * NOT defined in HSM_USER_MASK. */
2271 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2272 !cfs_capable(CFS_CAP_SYS_ADMIN))
2275 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2276 LUSTRE_OPC_ANY, hss);
2277 if (IS_ERR(op_data))
2278 RETURN(PTR_ERR(op_data));
2280 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2281 sizeof(*op_data), op_data, NULL);
2283 ll_finish_md_op_data(op_data);
/* HSM import: mark a regular file as archived+exists+released for the
 * given archive id, then force-restore its mode/uid/gid/size/times
 * from the user-supplied hsm_user_import record via ll_setattr_raw().
 * Only valid on regular files. */
2288 static int ll_hsm_import(struct inode *inode, struct file *file,
2289 struct hsm_user_import *hui)
2291 struct hsm_state_set *hss = NULL;
2292 struct iattr *attr = NULL;
2296 if (!S_ISREG(inode->i_mode))
2302 GOTO(out, rc = -ENOMEM);
/* first set the HSM flags: the file exists in the archive and its
 * data is released on Lustre */
2304 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2305 hss->hss_archive_id = hui->hui_archive_id;
2306 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2307 rc = ll_hsm_state_set(inode, hss);
2311 OBD_ALLOC_PTR(attr);
2313 GOTO(out, rc = -ENOMEM);
2315 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2316 attr->ia_mode |= S_IFREG;
2317 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2318 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2319 attr->ia_size = hui->hui_size;
2320 attr->ia_mtime.tv_sec = hui->hui_mtime;
2321 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2322 attr->ia_atime.tv_sec = hui->hui_atime;
2323 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
/* ATTR_FORCE bypasses permission checks for the restore */
2325 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2326 ATTR_UID | ATTR_GID |
2327 ATTR_MTIME | ATTR_MTIME_SET |
2328 ATTR_ATIME | ATTR_ATIME_SET;
2330 rc = ll_setattr_raw(file->f_dentry, attr, true);
/* Main ioctl dispatcher for regular files: per-fd flags, striping
 * (setstripe/setea/getstripe/swap), object recreation, fiemap,
 * grouplocks, FID/path queries, data version, HSM operations and
 * leases.  Unknown commands fall through to registered llioc handlers
 * and finally to the data export's obd_iocontrol. */
2345 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2347 struct inode *inode = file->f_dentry->d_inode;
2348 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2352 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2353 PFID(ll_inode2fid(inode)), inode, cmd);
2354 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2356 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2357 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2361 case LL_IOC_GETFLAGS:
2362 /* Get the current value of the file flags */
2363 return put_user(fd->fd_flags, (int *)arg);
2364 case LL_IOC_SETFLAGS:
2365 case LL_IOC_CLRFLAGS:
2366 /* Set or clear specific file flags */
2367 /* XXX This probably needs checks to ensure the flags are
2368 * not abused, and to handle any flag side effects.
2370 if (get_user(flags, (int *) arg))
2373 if (cmd == LL_IOC_SETFLAGS) {
/* lockless I/O is only safe with O_DIRECT */
2374 if ((flags & LL_FILE_IGNORE_LOCK) &&
2375 !(file->f_flags & O_DIRECT)) {
2376 CERROR("%s: unable to disable locking on "
2377 "non-O_DIRECT file\n", current->comm);
2381 fd->fd_flags |= flags;
2383 fd->fd_flags &= ~flags;
2386 case LL_IOC_LOV_SETSTRIPE:
2387 RETURN(ll_lov_setstripe(inode, file, arg));
2388 case LL_IOC_LOV_SETEA:
2389 RETURN(ll_lov_setea(inode, file, arg));
2390 case LL_IOC_LOV_SWAP_LAYOUTS: {
2392 struct lustre_swap_layouts lsl;
2394 if (copy_from_user(&lsl, (char *)arg,
2395 sizeof(struct lustre_swap_layouts)))
/* both files must be open for writing */
2398 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2401 file2 = fget(lsl.sl_fd);
2406 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2407 rc = ll_swap_layouts(file, file2, &lsl);
2411 case LL_IOC_LOV_GETSTRIPE:
2412 RETURN(ll_lov_getstripe(inode, arg));
2413 case LL_IOC_RECREATE_OBJ:
2414 RETURN(ll_lov_recreate_obj(inode, arg));
2415 case LL_IOC_RECREATE_FID:
2416 RETURN(ll_lov_recreate_fid(inode, arg));
2417 case FSFILT_IOC_FIEMAP:
2418 RETURN(ll_ioctl_fiemap(inode, arg));
2419 case FSFILT_IOC_GETFLAGS:
2420 case FSFILT_IOC_SETFLAGS:
2421 RETURN(ll_iocontrol(inode, file, cmd, arg));
2422 case FSFILT_IOC_GETVERSION_OLD:
2423 case FSFILT_IOC_GETVERSION:
2424 RETURN(put_user(inode->i_generation, (int *)arg));
2425 case LL_IOC_GROUP_LOCK:
2426 RETURN(ll_get_grouplock(inode, file, arg));
2427 case LL_IOC_GROUP_UNLOCK:
2428 RETURN(ll_put_grouplock(inode, file, arg));
2429 case IOC_OBD_STATFS:
2430 RETURN(ll_obd_statfs(inode, (void *)arg));
2432 /* We need to special case any other ioctls we want to handle,
2433 * to send them to the MDS/OST as appropriate and to properly
2434 * network encode the arg field.
2435 case FSFILT_IOC_SETVERSION_OLD:
2436 case FSFILT_IOC_SETVERSION:
2438 case LL_IOC_FLUSHCTX:
2439 RETURN(ll_flush_ctx(inode));
2440 case LL_IOC_PATH2FID: {
2441 if (copy_to_user((void *)arg, ll_inode2fid(inode),
2442 sizeof(struct lu_fid)))
2447 case OBD_IOC_FID2PATH:
2448 RETURN(ll_fid2path(inode, (void *)arg));
2449 case LL_IOC_DATA_VERSION: {
2450 struct ioc_data_version idv;
2453 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
/* reject any flags beyond the two supported flush modes */
2456 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2457 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2459 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2465 case LL_IOC_GET_MDTIDX: {
2468 mdtidx = ll_get_mdt_idx(inode);
2472 if (put_user((int)mdtidx, (int*)arg))
2477 case OBD_IOC_GETDTNAME:
2478 case OBD_IOC_GETMDNAME:
2479 RETURN(ll_get_obd_name(inode, cmd, arg));
2480 case LL_IOC_HSM_STATE_GET: {
2481 struct md_op_data *op_data;
2482 struct hsm_user_state *hus;
2489 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2490 LUSTRE_OPC_ANY, hus);
2491 if (IS_ERR(op_data)) {
2493 RETURN(PTR_ERR(op_data));
2496 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2499 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2502 ll_finish_md_op_data(op_data);
2506 case LL_IOC_HSM_STATE_SET: {
2507 struct hsm_state_set *hss;
2514 if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2519 rc = ll_hsm_state_set(inode, hss);
2524 case LL_IOC_HSM_ACTION: {
2525 struct md_op_data *op_data;
2526 struct hsm_current_action *hca;
2533 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2534 LUSTRE_OPC_ANY, hca);
2535 if (IS_ERR(op_data)) {
2537 RETURN(PTR_ERR(op_data));
2540 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2543 if (copy_to_user((char *)arg, hca, sizeof(*hca)))
2546 ll_finish_md_op_data(op_data);
2550 case LL_IOC_SET_LEASE: {
2551 struct ll_inode_info *lli = ll_i2info(inode);
2552 struct obd_client_handle *och = NULL;
/* requested lease mode must be covered by the file's open mode */
2558 if (!(file->f_mode & FMODE_WRITE))
2563 if (!(file->f_mode & FMODE_READ))
/* mode == 0 (unlock case, condition elided): detach and close any
 * existing lease on this fd */
2568 mutex_lock(&lli->lli_och_mutex);
2569 if (fd->fd_lease_och != NULL) {
2570 och = fd->fd_lease_och;
2571 fd->fd_lease_och = NULL;
2573 mutex_unlock(&lli->lli_och_mutex);
2576 mode = och->och_flags &(FMODE_READ|FMODE_WRITE);
2577 rc = ll_lease_close(och, inode, &lease_broken);
2578 if (rc == 0 && lease_broken)
2584 /* return the type of lease or error */
2585 RETURN(rc < 0 ? rc : (int)mode);
2590 CDEBUG(D_INODE, "Set lease with mode %d\n", mode);
2592 /* apply for lease */
2593 och = ll_lease_open(inode, file, mode, 0);
2595 RETURN(PTR_ERR(och));
2598 mutex_lock(&lli->lli_och_mutex);
2599 if (fd->fd_lease_och == NULL) {
2600 fd->fd_lease_och = och;
2603 mutex_unlock(&lli->lli_och_mutex);
2605 /* impossible now that only excl is supported for now */
2606 ll_lease_close(och, inode, &lease_broken);
2611 case LL_IOC_GET_LEASE: {
2612 struct ll_inode_info *lli = ll_i2info(inode);
2613 struct ldlm_lock *lock = NULL;
2616 mutex_lock(&lli->lli_och_mutex);
2617 if (fd->fd_lease_och != NULL) {
2618 struct obd_client_handle *och = fd->fd_lease_och;
/* report the lease mode only if the lock hasn't been cancelled */
2620 lock = ldlm_handle2lock(&och->och_lease_handle);
2622 lock_res_and_lock(lock);
2623 if (!ldlm_is_cancel(lock))
2624 rc = och->och_flags &
2625 (FMODE_READ | FMODE_WRITE);
2626 unlock_res_and_lock(lock);
2627 LDLM_LOCK_PUT(lock);
2630 mutex_unlock(&lli->lli_och_mutex);
2633 case LL_IOC_HSM_IMPORT: {
2634 struct hsm_user_import *hui;
2640 if (copy_from_user(hui, (void *)arg, sizeof(*hui))) {
2645 rc = ll_hsm_import(inode, file, hui);
/* default (label elided): try registered llioc handlers, then pass
 * the command through to the data export */
2655 ll_iocontrol_call(inode, file, cmd, arg, &err))
2658 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2664 #ifndef HAVE_FILE_LLSEEK_SIZE
/* Backport helper (compiled only without HAVE_FILE_LLSEEK_SIZE):
 * validate @offset against sign/maxsize limits and commit it to
 * file->f_pos, resetting f_version on a position change. */
2665 static inline loff_t
2666 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2668 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2670 if (offset > maxsize)
2673 if (offset != file->f_pos) {
2674 file->f_pos = offset;
2675 file->f_version = 0;
/*
 * generic_file_llseek_size() - compat copy of the kernel helper: compute the
 * new file position for the various SEEK_* origins, bounded by @maxsize and
 * using @eof as the effective end-of-file.
 * NOTE(review): the excerpt elides the switch/case heads between the visible
 * branches; code kept byte-identical.
 */
2681 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2682 loff_t maxsize, loff_t eof)
2684 struct inode *inode = file->f_dentry->d_inode;
2692 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2693 * position-querying operation. Avoid rewriting the "same"
2694 * f_pos value back to the file because a concurrent read(),
2695 * write() or lseek() might have altered it
2700 * f_lock protects against read/modify/write race with other
2701 * SEEK_CURs. Note that parallel writes and reads behave
/* SEEK_CUR: serialize the read-modify-write of f_pos via i_mutex */
2704 mutex_lock(&inode->i_mutex);
2705 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2706 mutex_unlock(&inode->i_mutex);
2710 * In the generic case the entire file is data, so as long as
2711 * offset isn't at the end of the file then the offset is data.
2718 * There is a virtual hole at the end of the file, so as long as
2719 * offset isn't i_size or larger, return i_size.
2727 return llseek_execute(file, offset, maxsize);
/*
 * ll_file_seek() - llseek entry point for Lustre regular files.  For
 * SEEK_END/SEEK_HOLE/SEEK_DATA the size must be fetched from the OSTs
 * first (ll_glimpse_size()); then defer to the generic llseek helper with
 * the Lustre maximum file size as the bound.
 * NOTE(review): error-handling lines between the visible statements are
 * elided in this excerpt; code kept byte-identical.
 */
2731 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2733 struct inode *inode = file->f_dentry->d_inode;
2734 loff_t retval, eof = 0;
/* retval computed here only for the debug trace below */
2737 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2738 (origin == SEEK_CUR) ? file->f_pos : 0);
2739 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2740 PFID(ll_inode2fid(inode)), inode, retval, retval,
2742 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
/* these origins need an up-to-date i_size from the OSTs */
2744 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2745 retval = ll_glimpse_size(inode);
2748 eof = i_size_read(inode);
2751 retval = ll_generic_file_llseek_size(file, offset, origin,
2752 ll_file_maxbytes(inode), eof);
/*
 * ll_flush() - ->flush handler: report (once) any asynchronous writeback
 * error recorded for this inode.  Returns -EIO if an unreported async
 * error is pending, 0 otherwise.
 * NOTE(review): lines between the visible statements are elided in this
 * excerpt; code kept byte-identical.
 */
2756 static int ll_flush(struct file *file, fl_owner_t id)
2758 struct inode *inode = file->f_dentry->d_inode;
2759 struct ll_inode_info *lli = ll_i2info(inode);
2760 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2763 LASSERT(!S_ISDIR(inode->i_mode));
2765 /* catch async errors that were recorded back when async writeback
2766 * failed for pages in this mapping. */
/* read-and-clear: the error is consumed by this flush */
2767 rc = lli->lli_async_rc;
2768 lli->lli_async_rc = 0;
2769 if (lli->lli_clob != NULL) {
2770 err = lov_read_and_clear_async_rc(lli->lli_clob);
2775 /* The application has been told write failure already.
2776 * Do not report failure again. */
2777 if (fd->fd_write_failed)
2779 return rc ? -EIO : 0;
/*
 * cl_sync_file_range() - run a CIT_FSYNC cl_io over [start, end] of the
 * inode's cl_object in the given cl_fsync_mode.  On success the number of
 * pages written (fio->fi_nr_written) is returned.
 * NOTE(review): several lines (declarations of env/io/result, fi_end
 * assignment, RETURN) are elided in this excerpt; code kept byte-identical.
 */
2783 * Called to make sure a portion of file has been written out.
2784 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2786 * Return how many pages have been written.
2788 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2789 enum cl_fsync_mode mode, int ignore_layout)
2791 struct cl_env_nest nest;
2794 struct obd_capa *capa = NULL;
2795 struct cl_fsync_io *fio;
/* reject unknown sync modes up front */
2799 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2800 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2803 env = cl_env_nested_get(&nest);
2805 RETURN(PTR_ERR(env));
2807 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2809 io = ccc_env_thread_io(env);
2810 io->ci_obj = cl_i2info(inode)->lli_clob;
2811 io->ci_ignore_layout = ignore_layout;
2813 /* initialize parameters for sync */
2814 fio = &io->u.ci_fsync;
2815 fio->fi_capa = capa;
2816 fio->fi_start = start;
2818 fio->fi_fid = ll_inode2fid(inode);
2819 fio->fi_mode = mode;
2820 fio->fi_nr_written = 0;
2822 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2823 result = cl_io_loop(env, io);
2825 result = io->ci_result;
/* on success report pages written, not 0 */
2827 result = fio->fi_nr_written;
2828 cl_io_fini(env, io);
2829 cl_env_nested_put(&nest, env);
/*
 * ll_fsync() - fsync entry point; signature differs per kernel
 * (HAVE_FILE_FSYNC_4ARGS / _2ARGS / dentry variant).  Flushes dirty pages,
 * collects recorded async writeback errors, issues MDS fsync (md_fsync)
 * and, for regular files, OST sync via cl_sync_file_range(CL_FSYNC_ALL),
 * tracking per-fd write failure state.
 * NOTE(review): this excerpt elides lines throughout (rc/err declarations,
 * error branches, RETURN); code kept byte-identical.
 */
2837 * When dentry is provided (the 'else' case), *file->f_dentry may be
2838 * null and dentry must be used directly rather than pulled from
2839 * *file->f_dentry as is done otherwise.
2842 #ifdef HAVE_FILE_FSYNC_4ARGS
2843 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2845 struct dentry *dentry = file->f_dentry;
2846 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2847 int ll_fsync(struct file *file, int datasync)
2849 struct dentry *dentry = file->f_dentry;
2851 loff_t end = LLONG_MAX;
2853 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2856 loff_t end = LLONG_MAX;
2858 struct inode *inode = dentry->d_inode;
2859 struct ll_inode_info *lli = ll_i2info(inode);
2860 struct ptlrpc_request *req;
2861 struct obd_capa *oc;
2865 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2866 PFID(ll_inode2fid(inode)), inode);
2867 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2869 #ifdef HAVE_FILE_FSYNC_4ARGS
/* on new kernels we must write out and lock i_mutex ourselves */
2870 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2871 mutex_lock(&inode->i_mutex);
2873 /* fsync's caller has already called _fdata{sync,write}, we want
2874 * that IO to finish before calling the osc and mdc sync methods */
2875 rc = filemap_fdatawait(inode->i_mapping);
2878 /* catch async errors that were recorded back when async writeback
2879 * failed for pages in this mapping. */
2880 if (!S_ISDIR(inode->i_mode)) {
2881 err = lli->lli_async_rc;
2882 lli->lli_async_rc = 0;
2885 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* sync metadata on the MDS */
2890 oc = ll_mdscapa_get(inode);
2891 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2897 ptlrpc_req_finished(req);
2899 if (S_ISREG(inode->i_mode)) {
2900 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2902 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2903 if (rc == 0 && err < 0)
/* remember/clear write-failure so ll_flush reports it once */
2906 fd->fd_write_failed = true;
2908 fd->fd_write_failed = false;
2911 #ifdef HAVE_FILE_FSYNC_4ARGS
2912 mutex_unlock(&inode->i_mutex);
/*
 * ll_file_flock() - ->flock/->lock handler: translate a VFS file_lock
 * (FL_FLOCK or FL_POSIX) into an LDLM_FLOCK enqueue on the MDS, then
 * mirror the result into the local lock tables via
 * flock_lock_file_wait()/posix_lock_file_wait().
 * NOTE(review): the switch statement heads for fl_type and cmd are elided
 * in this excerpt (only the case bodies are visible); code kept
 * byte-identical.
 */
2918 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2920 struct inode *inode = file->f_dentry->d_inode;
2921 struct ll_sb_info *sbi = ll_i2sbi(inode);
2922 struct ldlm_enqueue_info einfo = {
2923 .ei_type = LDLM_FLOCK,
2924 .ei_cb_cp = ldlm_flock_completion_ast,
2925 .ei_cbdata = file_lock,
2927 struct md_op_data *op_data;
2928 struct lustre_handle lockh = {0};
2929 ldlm_policy_data_t flock = {{0}};
2930 int fl_type = file_lock->fl_type;
2936 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
2937 PFID(ll_inode2fid(inode)), file_lock);
2939 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2941 if (file_lock->fl_flags & FL_FLOCK) {
2942 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2943 /* flocks are whole-file locks */
2944 flock.l_flock.end = OFFSET_MAX;
2945 /* For flocks owner is determined by the local file desctiptor*/
2946 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2947 } else if (file_lock->fl_flags & FL_POSIX) {
2948 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2949 flock.l_flock.start = file_lock->fl_start;
2950 flock.l_flock.end = file_lock->fl_end;
2954 flock.l_flock.pid = file_lock->fl_pid;
2956 /* Somewhat ugly workaround for svc lockd.
2957 * lockd installs custom fl_lmops->lm_compare_owner that checks
2958 * for the fl_owner to be the same (which it always is on local node
2959 * I guess between lockd processes) and then compares pid.
2960 * As such we assign pid to the owner field to make it all work,
2961 * conflict with normal locks is unlikely since pid space and
2962 * pointer space for current->files are not intersecting */
2963 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2964 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* map fl_type to an LDLM mode: RDLCK->PR, UNLCK->NL, WRLCK->PW */
2968 einfo.ei_mode = LCK_PR;
2971 /* An unlock request may or may not have any relation to
2972 * existing locks so we may not be able to pass a lock handle
2973 * via a normal ldlm_lock_cancel() request. The request may even
2974 * unlock a byte range in the middle of an existing lock. In
2975 * order to process an unlock request we need all of the same
2976 * information that is given with a normal read or write record
2977 * lock request. To avoid creating another ldlm unlock (cancel)
2978 * message we'll treat a LCK_NL flock request as an unlock. */
2979 einfo.ei_mode = LCK_NL;
2982 einfo.ei_mode = LCK_PW;
2985 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* map cmd to enqueue flags: non-blocking vs. TEST lock */
3000 flags = LDLM_FL_BLOCK_NOWAIT;
3006 flags = LDLM_FL_TEST_LOCK;
3009 CERROR("unknown fcntl lock command: %d\n", cmd);
3013 /* Save the old mode so that if the mode in the lock changes we
3014 * can decrement the appropriate reader or writer refcount. */
3015 file_lock->fl_type = einfo.ei_mode;
3017 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3018 LUSTRE_OPC_ANY, NULL);
3019 if (IS_ERR(op_data))
3020 RETURN(PTR_ERR(op_data));
3022 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
3023 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
3024 flock.l_flock.pid, flags, einfo.ei_mode,
3025 flock.l_flock.start, flock.l_flock.end);
3027 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data, &lockh,
3030 /* Restore the file lock type if not TEST lock. */
3031 if (!(flags & LDLM_FL_TEST_LOCK))
3032 file_lock->fl_type = fl_type;
3034 if ((file_lock->fl_flags & FL_FLOCK) &&
3035 (rc == 0 || file_lock->fl_type == F_UNLCK))
3036 rc2 = flock_lock_file_wait(file, file_lock);
3037 if ((file_lock->fl_flags & FL_POSIX) &&
3038 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3039 !(flags & LDLM_FL_TEST_LOCK))
3040 rc2 = posix_lock_file_wait(file, file_lock);
/* local bookkeeping failed: undo the server-side lock with LCK_NL */
3042 if (rc2 && file_lock->fl_type != F_UNLCK) {
3043 einfo.ei_mode = LCK_NL;
3044 md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data,
3049 ll_finish_md_op_data(op_data);
/*
 * ll_get_fid_by_name() - look up the FID of @name under @parent via a
 * getattr-by-name RPC to the MDS; stores the result in *fid.
 * NOTE(review): a few lines (error-return checks, out_req label, RETURN)
 * are elided in this excerpt; code kept byte-identical.
 */
3054 int ll_get_fid_by_name(struct inode *parent, const char *name,
3055 int namelen, struct lu_fid *fid)
3057 struct md_op_data *op_data = NULL;
3058 struct mdt_body *body;
3059 struct ptlrpc_request *req;
3063 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3064 LUSTRE_OPC_ANY, NULL);
3065 if (IS_ERR(op_data))
3066 RETURN(PTR_ERR(op_data));
/* only the FID is needed from the reply */
3068 op_data->op_valid = OBD_MD_FLID;
3069 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3070 ll_finish_md_op_data(op_data);
3074 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3076 GOTO(out_req, rc = -EFAULT);
3078 *fid = body->mbo_fid1;
3080 ptlrpc_req_finished(req);
/*
 * ll_migrate() - migrate directory entry @name under @parent to MDT
 * @mdtidx using a CLI_MIGRATE rename RPC.  Resolves the child FID either
 * from the dcache (invalidating local aliases) or via
 * ll_get_fid_by_name(), and skips the RPC when the object already lives
 * on the target MDT.
 * NOTE(review): lines are elided throughout this excerpt (qstr setup,
 * error branches, dput/iput cleanup, RETURN); code kept byte-identical.
 */
3084 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3085 const char *name, int namelen)
3087 struct dentry *dchild = NULL;
3088 struct inode *child_inode = NULL;
3089 struct md_op_data *op_data;
3090 struct ptlrpc_request *request = NULL;
3095 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3096 name, PFID(ll_inode2fid(parent)), mdtidx);
3098 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3099 0, LUSTRE_OPC_ANY, NULL);
3100 if (IS_ERR(op_data))
3101 RETURN(PTR_ERR(op_data));
3103 /* Get child FID first */
3104 qstr.hash = full_name_hash(name, namelen);
3107 dchild = d_lookup(file->f_dentry, &qstr);
3108 if (dchild != NULL && dchild->d_inode != NULL) {
3109 op_data->op_fid3 = *ll_inode2fid(dchild->d_inode);
3110 if (dchild->d_inode != NULL) {
/* drop cached aliases so stale state is not used post-migration */
3111 child_inode = igrab(dchild->d_inode);
3112 ll_invalidate_aliases(child_inode);
/* not in dcache: ask the MDS for the FID */
3116 rc = ll_get_fid_by_name(parent, name, namelen,
3122 if (!fid_is_sane(&op_data->op_fid3)) {
3123 CERROR("%s: migrate %s , but fid "DFID" is insane\n",
3124 ll_get_fsname(parent->i_sb, NULL, 0), name,
3125 PFID(&op_data->op_fid3));
3126 GOTO(out_free, rc = -EINVAL);
3129 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
/* already on the target MDT: nothing to do */
3134 CDEBUG(D_INFO, "%s:"DFID" is already on MDT%d.\n", name,
3135 PFID(&op_data->op_fid3), mdtidx);
3136 GOTO(out_free, rc = 0);
3139 op_data->op_mds = mdtidx;
3140 op_data->op_cli_flags = CLI_MIGRATE;
/* migration is implemented as a rename onto itself */
3141 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3142 namelen, name, namelen, &request);
3144 ll_update_times(request, parent);
3146 ptlrpc_req_finished(request);
3151 if (child_inode != NULL) {
3152 clear_nlink(child_inode);
3156 ll_finish_md_op_data(op_data);
/*
 * ll_file_noflock() - stub lock handler installed for -o noflock mounts;
 * body elided in this excerpt (presumably returns -ENOSYS per the
 * ll_file_operations_noflock comment below — confirm against full source).
 */
3161 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
/*
 * NOTE(review): loop-body lines (continue, found-bits accumulation, RETURN)
 * are elided in this excerpt; code kept byte-identical.
 */
3169 * test if some locks matching bits and l_req_mode are acquired
3170 * - bits can be in different locks
3171 * - if found clear the common lock bits in *bits
3172 * - the bits not found, are kept in *bits
3174 * \param bits [IN] searched lock bits [IN]
3175 * \param l_req_mode [IN] searched lock mode
3176 * \retval boolean, true iff all bits are found
3178 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
3180 struct lustre_handle lockh;
3181 ldlm_policy_data_t policy;
/* LCK_MINMODE means "any mode": match all four read/write modes */
3182 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
3183 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
3192 fid = &ll_i2info(inode)->lli_fid;
3193 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3194 ldlm_lockname[mode]);
/* TEST_LOCK: match without taking a reference on the lock */
3196 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
3197 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3198 policy.l_inodebits.bits = *bits & (1 << i);
3199 if (policy.l_inodebits.bits == 0)
3202 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3203 &policy, mode, &lockh)) {
3204 struct ldlm_lock *lock;
3206 lock = ldlm_handle2lock(&lockh);
3209 ~(lock->l_policy_data.l_inodebits.bits);
3210 LDLM_LOCK_PUT(lock);
3212 *bits &= ~policy.l_inodebits.bits;
/*
 * ll_take_md_lock() - try to match (and reference) an already-granted MDS
 * inodebits lock covering @bits; returns the matched mode via md_lock_match,
 * storing the handle in *lockh.
 * NOTE(review): the mode parameter/declaration lines and RETURN are elided
 * in this excerpt; code kept byte-identical.
 */
3219 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
3220 struct lustre_handle *lockh, __u64 flags,
3223 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
3228 fid = &ll_i2info(inode)->lli_fid;
3229 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3231 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3232 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * ll_inode_revalidate_fini() - post-process a revalidate RPC result:
 * -ENOENT on a non-regular/non-directory inode is treated as a benign
 * "already unlinked" case; other errors are logged (rate-limited for
 * EACCES/EIDRM).
 * NOTE(review): the success-path and return lines are elided in this
 * excerpt; code kept byte-identical.
 */
3237 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3239 /* Already unlinked. Just update nlink and return success */
3240 if (rc == -ENOENT) {
3242 /* This path cannot be hit for regular files unless in
3243 * case of obscure races, so no need to to validate
3245 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3247 } else if (rc != 0) {
3248 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3249 "%s: revalidate FID "DFID" error: rc = %d\n",
3250 ll_get_fsname(inode->i_sb, NULL, 0),
3251 PFID(ll_inode2fid(inode)), rc);
/*
 * __ll_inode_revalidate() - refresh inode attributes from the MDS.  With
 * OBD_CONNECT_ATTRFID the server supports getattr-by-FID via an intent
 * lock (IT_GETATTR/IT_LOOKUP); otherwise, if no matching ibits lock is
 * cached locally, fall back to a plain md_getattr RPC and re-prepare the
 * inode from the reply.
 * NOTE(review): error-handling/out-label lines are elided throughout this
 * excerpt; code kept byte-identical.
 */
3257 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3259 struct inode *inode = dentry->d_inode;
3260 struct ptlrpc_request *req = NULL;
3261 struct obd_export *exp;
3265 LASSERT(inode != NULL);
3267 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3268 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3270 exp = ll_i2mdexp(inode);
3272 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3273 * But under CMD case, it caused some lock issues, should be fixed
3274 * with new CMD ibits lock. See bug 12718 */
3275 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3276 struct lookup_intent oit = { .it_op = IT_GETATTR };
3277 struct md_op_data *op_data;
/* a pure LOOKUP revalidate can use the cheaper IT_LOOKUP intent */
3279 if (ibits == MDS_INODELOCK_LOOKUP)
3280 oit.it_op = IT_LOOKUP;
3282 /* Call getattr by fid, so do not provide name at all. */
3283 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3284 dentry->d_inode, NULL, 0, 0,
3285 LUSTRE_OPC_ANY, NULL);
3286 if (IS_ERR(op_data))
3287 RETURN(PTR_ERR(op_data));
3289 rc = md_intent_lock(exp, op_data, &oit, &req,
3290 &ll_md_blocking_ast, 0);
3291 ll_finish_md_op_data(op_data);
3293 rc = ll_inode_revalidate_fini(inode, rc);
3297 rc = ll_revalidate_it_finish(req, &oit, dentry);
3299 ll_intent_release(&oit);
3303 /* Unlinked? Unhash dentry, so it is not picked up later by
3304 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3305 here to preserve get_cwd functionality on 2.6.
3307 if (!dentry->d_inode->i_nlink)
3308 d_lustre_invalidate(dentry, 0);
3310 ll_lookup_finish_locks(&oit, dentry);
3311 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3312 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3313 obd_valid valid = OBD_MD_FLGETATTR;
3314 struct md_op_data *op_data;
/* for regular files also fetch striping (EA) data */
3317 if (S_ISREG(inode->i_mode)) {
3318 rc = ll_get_default_mdsize(sbi, &ealen);
3321 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3324 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3325 0, ealen, LUSTRE_OPC_ANY,
3327 if (IS_ERR(op_data))
3328 RETURN(PTR_ERR(op_data));
3330 op_data->op_valid = valid;
3331 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
3332 * capa for this inode. Because we only keep capas of dirs
3334 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3335 ll_finish_md_op_data(op_data);
3337 rc = ll_inode_revalidate_fini(inode, rc);
3341 rc = ll_prep_inode(&inode, req, NULL, NULL);
3344 ptlrpc_req_finished(req);
/*
 * ll_merge_md_attr() - for a striped directory, merge per-stripe attributes
 * from all MDTs (md_merge_attr) and cache the aggregated size/nlink and
 * a/m/ctime in the inode's ll_inode_info.
 * NOTE(review): the md_merge_attr argument tail and error check are elided
 * in this excerpt; code kept byte-identical.
 */
3348 static int ll_merge_md_attr(struct inode *inode)
3350 struct cl_attr attr = { 0 };
3353 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3354 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3359 ll_i2info(inode)->lli_stripe_dir_size = attr.cat_size;
3360 ll_i2info(inode)->lli_stripe_dir_nlink = attr.cat_nlink;
3362 ll_i2info(inode)->lli_lvb.lvb_atime = attr.cat_atime;
3363 ll_i2info(inode)->lli_lvb.lvb_mtime = attr.cat_mtime;
3364 ll_i2info(inode)->lli_lvb.lvb_ctime = attr.cat_ctime;
/*
 * ll_inode_revalidate() - revalidate attributes (__ll_inode_revalidate),
 * then refresh size/times: striped directories merge attributes across
 * MDTs, non-regular files copy cached LVB times into the inode, and
 * regular files glimpse the size from the OSTs unless an HSM restore is
 * in progress.
 * NOTE(review): return-type line, error checks and RETURN are elided in
 * this excerpt; code kept byte-identical.
 */
3370 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3372 struct inode *inode = dentry->d_inode;
3376 rc = __ll_inode_revalidate(dentry, ibits);
3380 /* if object isn't regular file, don't validate size */
3381 if (!S_ISREG(inode->i_mode)) {
3382 if (S_ISDIR(inode->i_mode) &&
3383 ll_i2info(inode)->lli_lsm_md != NULL) {
3384 rc = ll_merge_md_attr(inode);
3389 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
3390 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
3391 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
3393 /* In case of restore, the MDT has the right size and has
3394 * already send it back without granting the layout lock,
3395 * inode is up-to-date so glimpse is useless.
3396 * Also to glimpse we need the layout, in case of a running
3397 * restore the MDT holds the layout lock so the glimpse will
3398 * block up to the end of restore (getattr will block)
3400 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3401 rc = ll_glimpse_size(inode);
/*
 * ll_getattr() - stat(2) entry point: revalidate UPDATE|LOOKUP ibits,
 * then fill *stat from the (now fresh) inode; striped directories report
 * the merged cross-MDT size/nlink cached by ll_merge_md_attr().
 * NOTE(review): the error check after revalidate and the final return are
 * elided in this excerpt; code kept byte-identical.
 */
3406 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3408 struct inode *inode = de->d_inode;
3409 struct ll_sb_info *sbi = ll_i2sbi(inode);
3410 struct ll_inode_info *lli = ll_i2info(inode);
3413 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3414 MDS_INODELOCK_LOOKUP);
3415 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3420 stat->dev = inode->i_sb->s_dev;
/* 32-bit userspace needs an ino that fits in 32 bits */
3421 if (ll_need_32bit_api(sbi))
3422 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3424 stat->ino = inode->i_ino;
3425 stat->mode = inode->i_mode;
3426 stat->uid = inode->i_uid;
3427 stat->gid = inode->i_gid;
3428 stat->rdev = inode->i_rdev;
3429 stat->atime = inode->i_atime;
3430 stat->mtime = inode->i_mtime;
3431 stat->ctime = inode->i_ctime;
3432 stat->blksize = 1 << inode->i_blkbits;
3433 stat->blocks = inode->i_blocks;
3435 if (S_ISDIR(inode->i_mode) &&
3436 ll_i2info(inode)->lli_lsm_md != NULL) {
3437 stat->nlink = lli->lli_stripe_dir_nlink;
3438 stat->size = lli->lli_stripe_dir_size;
3440 stat->nlink = inode->i_nlink;
3441 stat->size = i_size_read(inode);
/*
 * ll_fiemap() - FIEMAP inode operation: marshal the kernel's
 * fiemap_extent_info into an ll_user_fiemap buffer, run ll_do_fiemap(),
 * and copy the mapped extents back.
 * NOTE(review): allocation-failure check and return lines are elided in
 * this excerpt; code kept byte-identical.  Note the first memcpy copies
 * only ONE extent in — presumably just fm_extents[0] seeds the request;
 * confirm against ll_do_fiemap's contract.
 */
3447 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3448 __u64 start, __u64 len)
3452 struct ll_user_fiemap *fiemap;
3453 unsigned int extent_count = fieinfo->fi_extents_max;
3455 num_bytes = sizeof(*fiemap) + (extent_count *
3456 sizeof(struct ll_fiemap_extent));
3457 OBD_ALLOC_LARGE(fiemap, num_bytes);
3462 fiemap->fm_flags = fieinfo->fi_flags;
3463 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3464 fiemap->fm_start = start;
3465 fiemap->fm_length = len;
3466 if (extent_count > 0)
3467 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3468 sizeof(struct ll_fiemap_extent));
3470 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3472 fieinfo->fi_flags = fiemap->fm_flags;
3473 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3474 if (extent_count > 0)
3475 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3476 fiemap->fm_mapped_extents *
3477 sizeof(struct ll_fiemap_extent));
3479 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * ll_get_acl() - return a referenced copy of the cached POSIX ACL under
 * lli_lock; the VFS releases the reference after permission checking.
 * NOTE(review): the return statement is elided in this excerpt; code kept
 * byte-identical.
 */
3483 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3485 struct ll_inode_info *lli = ll_i2info(inode);
3486 struct posix_acl *acl = NULL;
3489 spin_lock(&lli->lli_lock);
3490 /* VFS' acl_permission_check->check_acl will release the refcount */
3491 acl = posix_acl_dup(lli->lli_posix_acl);
3492 spin_unlock(&lli->lli_lock);
/*
 * ll_check_acl() - ACL permission callback for kernels without
 * generic_permission() 2-arg form; fetches the cached ACL and runs
 * posix_acl_permission() on it.  Compiled away to a stub when
 * CONFIG_FS_POSIX_ACL is off.
 * NOTE(review): -EAGAIN/-ECHILD return lines for the RCU path and the
 * null-ACL check are elided in this excerpt; code kept byte-identical.
 */
3497 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3499 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3500 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3502 ll_check_acl(struct inode *inode, int mask)
3505 # ifdef CONFIG_FS_POSIX_ACL
3506 struct posix_acl *acl;
3510 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3511 if (flags & IPERM_FLAG_RCU)
3514 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3519 rc = posix_acl_permission(inode, acl, mask);
3520 posix_acl_release(acl);
3523 # else /* !CONFIG_FS_POSIX_ACL */
3525 # endif /* CONFIG_FS_POSIX_ACL */
3527 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * ll_inode_permission() - ->permission handler (signature varies per
 * kernel).  Revalidates the root inode on first access, applies root
 * squashing by temporarily overriding the task's creds, then delegates to
 * remote-permission checking or generic_permission.
 * NOTE(review): RCU-walk return lines, error branches and cred cleanup are
 * elided in this excerpt; code kept byte-identical.
 */
3529 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3530 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3532 # ifdef HAVE_INODE_PERMISION_2ARGS
3533 int ll_inode_permission(struct inode *inode, int mask)
3535 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3540 struct ll_sb_info *sbi;
3541 struct root_squash_info *squash;
3542 struct cred *cred = NULL;
3543 const struct cred *old_cred = NULL;
3545 bool squash_id = false;
/* cannot block in RCU-walk mode; presumably returns -ECHILD (elided) */
3548 #ifdef MAY_NOT_BLOCK
3549 if (mask & MAY_NOT_BLOCK)
3551 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3552 if (flags & IPERM_FLAG_RCU)
3556 /* as root inode are NOT getting validated in lookup operation,
3557 * need to do it before permission check. */
3559 if (inode == inode->i_sb->s_root->d_inode) {
3560 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3561 MDS_INODELOCK_LOOKUP);
3566 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3567 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3569 /* squash fsuid/fsgid if needed */
3570 sbi = ll_i2sbi(inode);
3571 squash = &sbi->ll_squash;
3572 if (unlikely(squash->rsi_uid != 0 &&
3573 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
3574 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3578 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3579 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3580 squash->rsi_uid, squash->rsi_gid);
3582 /* update current process's credentials
3583 * and FS capability */
3584 cred = prepare_creds();
3588 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
3589 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* drop all filesystem-related capabilities from the squashed creds */
3590 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3591 if ((1 << cap) & CFS_CAP_FS_MASK)
3592 cap_lower(cred->cap_effective, cap);
3594 old_cred = override_creds(cred);
3597 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3599 if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
3600 rc = lustre_check_remote_perm(inode, mask);
3602 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3604 /* restore current process's credentials and FS capability */
3606 revert_creds(old_cred);
3613 /* -o localflock - only provides locally consistent flock locks */
/*
 * Default file operations: no .flock/.lock entries, so flock falls back
 * to the kernel's local (node-only) semantics.
 * NOTE(review): trailing members (fsync/flush/fiemap, closing brace) are
 * elided in this excerpt; code kept byte-identical.
 */
3614 struct file_operations ll_file_operations = {
3615 .read = ll_file_read,
3616 .aio_read = ll_file_aio_read,
3617 .write = ll_file_write,
3618 .aio_write = ll_file_aio_write,
3619 .unlocked_ioctl = ll_file_ioctl,
3620 .open = ll_file_open,
3621 .release = ll_file_release,
3622 .mmap = ll_file_mmap,
3623 .llseek = ll_file_seek,
3624 .splice_read = ll_file_splice_read,
/*
 * File operations for -o flock mounts: identical to the default table but
 * routes both BSD flock and POSIX locks through ll_file_flock() for
 * cluster-wide consistency.
 * NOTE(review): some trailing members and the closing brace are elided in
 * this excerpt; code kept byte-identical.
 */
3629 struct file_operations ll_file_operations_flock = {
3630 .read = ll_file_read,
3631 .aio_read = ll_file_aio_read,
3632 .write = ll_file_write,
3633 .aio_write = ll_file_aio_write,
3634 .unlocked_ioctl = ll_file_ioctl,
3635 .open = ll_file_open,
3636 .release = ll_file_release,
3637 .mmap = ll_file_mmap,
3638 .llseek = ll_file_seek,
3639 .splice_read = ll_file_splice_read,
3642 .flock = ll_file_flock,
3643 .lock = ll_file_flock
3646 /* These are for -o noflock - to return ENOSYS on flock calls */
/*
 * File operations for -o noflock mounts: lock/flock are stubbed out via
 * ll_file_noflock so applications get an explicit failure.
 * NOTE(review): some trailing members and the closing brace are elided in
 * this excerpt; code kept byte-identical.
 */
3647 struct file_operations ll_file_operations_noflock = {
3648 .read = ll_file_read,
3649 .aio_read = ll_file_aio_read,
3650 .write = ll_file_write,
3651 .aio_write = ll_file_aio_write,
3652 .unlocked_ioctl = ll_file_ioctl,
3653 .open = ll_file_open,
3654 .release = ll_file_release,
3655 .mmap = ll_file_mmap,
3656 .llseek = ll_file_seek,
3657 .splice_read = ll_file_splice_read,
3660 .flock = ll_file_noflock,
3661 .lock = ll_file_noflock
/*
 * Inode operations for regular Lustre files: attribute, xattr, ACL and
 * fiemap handlers defined in this file / llite.
 * NOTE(review): the #endif and closing brace are elided in this excerpt;
 * code kept byte-identical.
 */
3664 struct inode_operations ll_file_inode_operations = {
3665 .setattr = ll_setattr,
3666 .getattr = ll_getattr,
3667 .permission = ll_inode_permission,
3668 .setxattr = ll_setxattr,
3669 .getxattr = ll_getxattr,
3670 .listxattr = ll_listxattr,
3671 .removexattr = ll_removexattr,
3672 .fiemap = ll_fiemap,
3673 #ifdef HAVE_IOP_GET_ACL
3674 .get_acl = ll_get_acl,
3678 /* dynamic ioctl number support routins */
/*
 * llioc: global registry of dynamically-registered ioctl handlers,
 * protected by an rwsem.  Each registered entry (struct llioc_data) owns a
 * callback plus a flexible array of the ioctl command numbers it serves.
 * NOTE(review): the registry's initializer tail and struct head for
 * llioc_data are partly elided in this excerpt; code kept byte-identical.
 */
3679 static struct llioc_ctl_data {
3680 struct rw_semaphore ioc_sem;
3681 struct list_head ioc_head;
3683 __RWSEM_INITIALIZER(llioc.ioc_sem),
3684 LIST_HEAD_INIT(llioc.ioc_head)
3689 struct list_head iocd_list;
3690 unsigned int iocd_size;
3691 llioc_callback_t iocd_cb;
3692 unsigned int iocd_count;
/* flexible array of command numbers, iocd_count entries */
3693 unsigned int iocd_cmd[0];
/*
 * ll_iocontrol_register() - register a callback for @count dynamic ioctl
 * command numbers; returns an opaque handle (the allocation itself,
 * judging by the unregister path) used later to unregister.
 * NOTE(review): NULL-return lines and the final return are elided in this
 * excerpt; code kept byte-identical.
 */
3696 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3699 struct llioc_data *in_data = NULL;
3702 if (cb == NULL || cmd == NULL ||
3703 count > LLIOC_MAX_CMD || count < 0)
3706 size = sizeof(*in_data) + count * sizeof(unsigned int);
3707 OBD_ALLOC(in_data, size);
3708 if (in_data == NULL)
3711 memset(in_data, 0, sizeof(*in_data));
3712 in_data->iocd_size = size;
3713 in_data->iocd_cb = cb;
3714 in_data->iocd_count = count;
3715 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3717 down_write(&llioc.ioc_sem);
3718 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3719 up_write(&llioc.ioc_sem);
/*
 * ll_iocontrol_unregister() - remove and free the registration identified
 * by @magic (the pointer returned from ll_iocontrol_register); warns if it
 * is not found.
 * NOTE(review): the magic-NULL check and match comparison lines are elided
 * in this excerpt; code kept byte-identical.
 */
3724 void ll_iocontrol_unregister(void *magic)
3726 struct llioc_data *tmp;
3731 down_write(&llioc.ioc_sem);
3732 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3734 unsigned int size = tmp->iocd_size;
/* drop the lock before freeing; entry is already unlinked */
3736 list_del(&tmp->iocd_list);
3737 up_write(&llioc.ioc_sem);
3739 OBD_FREE(tmp, size);
3743 up_write(&llioc.ioc_sem);
3745 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3748 EXPORT_SYMBOL(ll_iocontrol_register);
3749 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * ll_iocontrol_call() - dispatch @cmd to the first registered dynamic
 * ioctl handler that claims it; the handler's return controls iteration
 * (LLIOC_STOP ends it) and *rcp receives the handler's result.
 * NOTE(review): the inner-loop exit and final return lines are elided in
 * this excerpt; code kept byte-identical.
 */
3751 static enum llioc_iter
3752 ll_iocontrol_call(struct inode *inode, struct file *file,
3753 unsigned int cmd, unsigned long arg, int *rcp)
3755 enum llioc_iter ret = LLIOC_CONT;
3756 struct llioc_data *data;
3757 int rc = -EINVAL, i;
3759 down_read(&llioc.ioc_sem);
3760 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3761 for (i = 0; i < data->iocd_count; i++) {
3762 if (cmd != data->iocd_cmd[i])
3765 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3769 if (ret == LLIOC_STOP)
3772 up_read(&llioc.ioc_sem);
/*
 * ll_layout_conf() - apply a layout configuration to the inode's
 * cl_object via cl_conf_set().  For OBJECT_CONF_SET the new layout
 * generation is recorded and the layout lock is allowed to match only
 * after the layout has been applied (so no stale layout is observed).
 * NOTE(review): RETURN lines and some branch heads are elided in this
 * excerpt; code kept byte-identical.
 */
3779 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3781 struct ll_inode_info *lli = ll_i2info(inode);
3782 struct cl_env_nest nest;
3787 if (lli->lli_clob == NULL)
3790 env = cl_env_nested_get(&nest);
3792 RETURN(PTR_ERR(env));
3794 result = cl_conf_set(env, lli->lli_clob, conf);
3795 cl_env_nested_put(&nest, env);
3797 if (conf->coc_opc == OBJECT_CONF_SET) {
3798 struct ldlm_lock *lock = conf->coc_lock;
3800 LASSERT(lock != NULL);
3801 LASSERT(ldlm_has_layout(lock));
3803 struct lustre_md *md = conf->u.coc_md;
3804 __u32 gen = LL_LAYOUT_GEN_EMPTY;
3806 /* it can only be allowed to match after layout is
3807 * applied to inode otherwise false layout would be
3808 * seen. Applying layout shoud happen before dropping
3809 * the intent lock. */
3810 ldlm_lock_allow_match(lock);
3812 lli->lli_has_smd = lsm_has_objects(md->lsm);
3813 if (md->lsm != NULL)
3814 gen = md->lsm->lsm_layout_gen;
3817 DFID ": layout version change: %u -> %u\n",
3818 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3820 ll_layout_version_set(lli, gen);
3826 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/*
 * ll_layout_fetch() - if the layout lock's LVB is not already populated,
 * fetch the LOV EA (trusted.lov) from the MDT via md_getxattr and install
 * it as the lock's l_lvb_data so the layout can be unpacked locally.
 * NOTE(review): error checks and the out label are elided in this excerpt;
 * code kept byte-identical.
 */
3827 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3830 struct ll_sb_info *sbi = ll_i2sbi(inode);
3831 struct obd_capa *oc;
3832 struct ptlrpc_request *req;
3833 struct mdt_body *body;
3840 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3841 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
3842 lock->l_lvb_data, lock->l_lvb_len);
/* LVB already present and ready — nothing to fetch */
3844 if ((lock->l_lvb_data != NULL) && ldlm_is_lvb_ready(lock))
3847 /* if layout lock was granted right away, the layout is returned
3848 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3849 * blocked and then granted via completion ast, we have to fetch
3850 * layout here. Please note that we can't use the LVB buffer in
3851 * completion AST because it doesn't have a large enough buffer */
3852 oc = ll_mdscapa_get(inode);
3853 rc = ll_get_default_mdsize(sbi, &lmmsize);
3855 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3856 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3862 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3864 GOTO(out, rc = -EPROTO);
3866 lmmsize = body->mbo_eadatasize;
3867 if (lmmsize == 0) /* empty layout */
3870 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3872 GOTO(out, rc = -EFAULT);
3874 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3875 if (lvbdata == NULL)
3876 GOTO(out, rc = -ENOMEM);
/* swap the fetched layout into the lock's LVB under the resource lock */
3878 memcpy(lvbdata, lmm, lmmsize);
3879 lock_res_and_lock(lock);
3880 if (lock->l_lvb_data != NULL)
3881 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3883 lock->l_lvb_data = lvbdata;
3884 lock->l_lvb_len = lmmsize;
3885 unlock_res_and_lock(lock);
3890 ptlrpc_req_finished(req);
3895 * Apply the layout to the inode. Layout lock is held and will be released
/*
 * ll_layout_lock_set() - given a granted layout lock handle, fetch the
 * layout LVB if needed, unpack it into an lsm, apply it to the inode's
 * cl_object (OBJECT_CONF_SET) and record the layout generation in *gen.
 * If the object is busy (-EBUSY) wait for in-flight IO via
 * OBJECT_CONF_WAIT before the caller retries.
 * NOTE(review): several control-flow lines (out label, retry path) are
 * elided in this excerpt; code kept byte-identical.
 */
3898 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3899 struct inode *inode, __u32 *gen, bool reconf)
3901 struct ll_inode_info *lli = ll_i2info(inode);
3902 struct ll_sb_info *sbi = ll_i2sbi(inode);
3903 struct ldlm_lock *lock;
3904 struct lustre_md md = { NULL };
3905 struct cl_object_conf conf;
3908 bool wait_layout = false;
3911 LASSERT(lustre_handle_is_used(lockh));
3913 lock = ldlm_handle2lock(lockh);
3914 LASSERT(lock != NULL);
3915 LASSERT(ldlm_has_layout(lock));
3917 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured: %d\n",
3918 PFID(&lli->lli_fid), inode, reconf);
3920 /* in case this is a caching lock and reinstate with new inode */
3921 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3923 lock_res_and_lock(lock);
3924 lvb_ready = ldlm_is_lvb_ready(lock);
3925 unlock_res_and_lock(lock);
3926 /* checking lvb_ready is racy but this is okay. The worst case is
3927 * that multi processes may configure the file on the same time. */
3929 if (lvb_ready || !reconf) {
3932 /* layout_gen must be valid if layout lock is not
3933 * cancelled and stripe has already set */
3934 *gen = ll_layout_version_get(lli);
3940 rc = ll_layout_fetch(inode, lock);
3944 /* for layout lock, lmm is returned in lock's lvb.
3945 * lvb_data is immutable if the lock is held so it's safe to access it
3946 * without res lock. See the description in ldlm_lock_decref_internal()
3947 * for the condition to free lvb_data of layout lock */
3948 if (lock->l_lvb_data != NULL) {
3949 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3950 lock->l_lvb_data, lock->l_lvb_len);
3952 *gen = LL_LAYOUT_GEN_EMPTY;
3954 *gen = md.lsm->lsm_layout_gen;
3957 CERROR("%s: file "DFID" unpackmd error: %d\n",
3958 ll_get_fsname(inode->i_sb, NULL, 0),
3959 PFID(&lli->lli_fid), rc);
3965 /* set layout to file. Unlikely this will fail as old layout was
3966 * surely eliminated */
3967 memset(&conf, 0, sizeof conf);
3968 conf.coc_opc = OBJECT_CONF_SET;
3969 conf.coc_inode = inode;
3970 conf.coc_lock = lock;
3971 conf.u.coc_md = &md;
3972 rc = ll_layout_conf(inode, &conf);
3975 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3977 /* refresh layout failed, need to wait */
3978 wait_layout = rc == -EBUSY;
3982 LDLM_LOCK_PUT(lock);
3983 ldlm_lock_decref(lockh, mode);
3985 /* wait for IO to complete if it's still being used. */
3987 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
3988 ll_get_fsname(inode->i_sb, NULL, 0),
3989 PFID(&lli->lli_fid), inode);
3991 memset(&conf, 0, sizeof conf);
3992 conf.coc_opc = OBJECT_CONF_WAIT;
3993 conf.coc_inode = inode;
3994 rc = ll_layout_conf(inode, &conf);
3998 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
3999 ll_get_fsname(inode->i_sb, NULL, 0),
4000 PFID(&lli->lli_fid), rc);
4006 * This function checks if there exists a LAYOUT lock on the client side,
4007 * or enqueues it if it doesn't have one in cache.
4009 * This function will not hold the layout lock, so it may be revoked any time
4010 * after this function returns. Any operation that depends on the layout should be redone
4013 * This function should be called before lov_io_init() to get an uptodate
4014 * layout version, the caller should save the version number and after IO
4015 * is finished, this function should be called again to verify that layout
4016 * is not changed during IO time.
/*
 * Refresh the layout of @inode and return the current layout generation
 * number in @gen.
 *
 * Serializes on lli_layout_mutex so only one thread enqueues the layout
 * lock at a time.  First tries to match an already-cached DLM lock; only
 * if that misses does it send an IT_LAYOUT intent enqueue to the MDS.
 *
 * \param[in]  inode  the regular file whose layout is refreshed
 * \param[out] gen    current layout generation, or LL_LAYOUT_GEN_* sentinel
 *
 * \retval 0 on success, negative errno on failure.
 *
 * NOTE(review): several lines of this function (retry label, RETURNs,
 * declarations of "mode"/"rc") fall outside this excerpt — comments below
 * describe only what is visible here.
 */
4018 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4020 struct ll_inode_info *lli = ll_i2info(inode);
4021 struct ll_sb_info *sbi = ll_i2sbi(inode);
4022 struct md_op_data *op_data;
4023 struct lookup_intent it;
4024 struct lustre_handle lockh;
/* Enqueue parameters for the layout lock: an inode-bits (IBITS) lock
 * using the standard MD blocking/completion callbacks. */
4026 struct ldlm_enqueue_info einfo = {
4027 .ei_type = LDLM_IBITS,
4029 .ei_cb_bl = &ll_md_blocking_ast,
4030 .ei_cb_cp = &ldlm_completion_ast,
/* Fast path: nothing to do if layout locks are disabled for this mount,
 * or a valid generation is already cached on the inode. */
4035 *gen = ll_layout_version_get(lli);
4036 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
/* sanity: layout locks only make sense for regular files with sane FIDs */
4040 LASSERT(fid_is_sane(ll_inode2fid(inode)));
4041 LASSERT(S_ISREG(inode->i_mode));
4043 /* take layout lock mutex to enqueue layout lock exclusively. */
4044 mutex_lock(&lli->lli_layout_mutex);
4047 /* mostly layout lock is caching on the local side, so try to match
4048 * it before grabbing layout lock mutex. */
4049 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4050 LCK_CR | LCK_CW | LCK_PR | LCK_PW)
4051 if (mode != 0) { /* hit cached lock */
4052 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
4056 mutex_unlock(&lli->lli_layout_mutex);
/* Cache miss: prepare an MD operation and enqueue the lock at the MDS. */
4060 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4061 0, 0, LUSTRE_OPC_ANY, NULL);
4062 if (IS_ERR(op_data)) {
4063 mutex_unlock(&lli->lli_layout_mutex);
4064 RETURN(PTR_ERR(op_data));
4067 /* have to enqueue one */
4068 memset(&it, 0, sizeof(it));
4069 it.it_op = IT_LAYOUT;
4070 lockh.cookie = 0ULL;
4072 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)\n",
4073 ll_get_fsname(inode->i_sb, NULL, 0),
4074 PFID(&lli->lli_fid), inode);
4076 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, &it, op_data, &lockh, 0);
/* the intent RPC is no longer needed once the lock handle is in lockh;
 * release the request before processing the result. */
4077 if (it.d.lustre.it_data != NULL)
4078 ptlrpc_req_finished(it.d.lustre.it_data);
4079 it.d.lustre.it_data = NULL;
4081 ll_finish_md_op_data(op_data);
/* take the lock mode out of the intent so ll_intent_drop_lock() does not
 * release the lock; it is consumed via lockh by ll_layout_lock_set(). */
4083 mode = it.d.lustre.it_lock_mode;
4084 it.d.lustre.it_lock_mode = 0;
4085 ll_intent_drop_lock(&it);
4088 /* set lock data in case this is a new lock */
4089 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4090 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
4094 mutex_unlock(&lli->lli_layout_mutex);
4100 * This function sends a restore request to the MDT
4102 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4104 struct hsm_user_request *hur;
4108 len = sizeof(struct hsm_user_request) +
4109 sizeof(struct hsm_user_item);
4110 OBD_ALLOC(hur, len);
4114 hur->hur_request.hr_action = HUA_RESTORE;
4115 hur->hur_request.hr_archive_id = 0;
4116 hur->hur_request.hr_flags = 0;
4117 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4118 sizeof(hur->hur_user_item[0].hui_fid));
4119 hur->hur_user_item[0].hui_extent.offset = offset;
4120 hur->hur_user_item[0].hui_extent.length = length;
4121 hur->hur_request.hr_itemcount = 1;
4122 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,