4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2013, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include <linux/sched.h>
49 #include "llite_internal.h"
50 #include <lustre/ll_fiemap.h>
51 #include <lustre_ioctl.h>
53 #include "cl_object.h"
/* Forward declarations for static helpers defined later in this file. */
56 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
58 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
61 static enum llioc_iter
62 ll_iocontrol_call(struct inode *inode, struct file *file,
63 unsigned int cmd, unsigned long arg, int *rcp);
/*
 * Allocate and initialize a per-open-file private data structure
 * (struct ll_file_data) from its dedicated slab cache.  GFP_NOFS is
 * used so the allocation cannot recurse back into the filesystem
 * during memory reclaim.
 */
65 static struct ll_file_data *ll_file_data_get(void)
67 struct ll_file_data *fd;
69 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
73 fd->fd_write_failed = false; /* no write failure recorded yet for this open */
/* Release per-open-file private data back to the ll_file_data slab cache. */
78 static void ll_file_data_put(struct ll_file_data *fd)
81 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Pack the current inode attributes into @op_data for transmission to
 * the MDS, typically at close time: fid, mode, a/m/ctime, size, blocks,
 * flags, the current IO epoch, the open handle @fh and the MDS
 * capability.  If the inode was flagged LLIF_DATA_MODIFIED, the
 * MDS_DATA_MODIFIED bias is set so the server learns data changed.
 */
84 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
85 struct lustre_handle *fh)
87 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
88 op_data->op_attr.ia_mode = inode->i_mode;
89 op_data->op_attr.ia_atime = inode->i_atime;
90 op_data->op_attr.ia_mtime = inode->i_mtime;
91 op_data->op_attr.ia_ctime = inode->i_ctime;
92 op_data->op_attr.ia_size = i_size_read(inode);
93 op_data->op_attr_blocks = inode->i_blocks;
94 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
95 ll_inode_to_ext_flags(inode->i_flags);
96 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
98 op_data->op_handle = *fh;
99 op_data->op_capa1 = ll_mdscapa_get(inode); /* takes a capability reference */
101 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
102 op_data->op_bias |= MDS_DATA_MODIFIED;
/*
106 * Closes the IO epoch and packs all the attributes into @op_data for
 * the subsequent md_close() RPC.  Size/blocks are only sent when the
 * handle was writable and either SOM is unsupported or the file is not
 * regular; otherwise the MDS obtains them via the SOM protocol.
 */
109 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
110 struct obd_client_handle *och)
114 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
115 ATTR_MTIME | ATTR_MTIME_SET |
116 ATTR_CTIME | ATTR_CTIME_SET;
118 if (!(och->och_flags & FMODE_WRITE))
121 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
122 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
124 ll_ioepoch_close(inode, op_data, &och, 0);
127 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
128 ll_prep_md_op_data(op_data, inode, NULL, NULL,
129 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Send a close RPC to the MDS for the open handle @och on @inode.
 * If @data_version is non-NULL this close is an HSM release: the data
 * version and lease handle are packed and MDS_HSM_RELEASE is set.
 * Handles the SOM update path, clears LLIF_DATA_MODIFIED on success,
 * destroys OST objects if instructed, and finally clears the open
 * replay data and poisons the handle cookie.
 */
133 static int ll_close_inode_openhandle(struct obd_export *md_exp,
135 struct obd_client_handle *och,
136 const __u64 *data_version)
138 struct obd_export *exp = ll_i2mdexp(inode);
139 struct md_op_data *op_data;
140 struct ptlrpc_request *req = NULL;
141 struct obd_device *obd = class_exp2obd(exp);
148 * XXX: in case of LMV, is this correct to access
151 CERROR("Invalid MDC connection handle "LPX64"\n",
152 ll_i2mdexp(inode)->exp_handle.h_cookie);
156 OBD_ALLOC_PTR(op_data);
158 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
160 ll_prepare_close(inode, op_data, och);
161 if (data_version != NULL) {
162 /* Pass in data_version implies release. */
163 op_data->op_bias |= MDS_HSM_RELEASE;
164 op_data->op_data_version = *data_version;
165 op_data->op_lease_handle = och->och_lease_handle;
166 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
168 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
169 rc = md_close(md_exp, op_data, och->och_mod, &req);
171 /* This close must have the epoch closed. */
172 LASSERT(epoch_close);
173 /* MDS has instructed us to obtain Size-on-MDS attribute from
174 * OSTs and send setattr to back to MDS. */
175 rc = ll_som_update(inode, op_data);
177 CERROR("%s: inode "DFID" mdc Size-on-MDS update"
178 " failed: rc = %d\n",
179 ll_i2mdexp(inode)->exp_obd->obd_name,
180 PFID(ll_inode2fid(inode)), rc);
184 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
185 ll_i2mdexp(inode)->exp_obd->obd_name,
186 PFID(ll_inode2fid(inode)), rc);
189 /* DATA_MODIFIED flag was successfully sent on close, cancel data
190 * modification flag. */
191 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
192 struct ll_inode_info *lli = ll_i2info(inode);
194 spin_lock(&lli->lli_lock);
195 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
196 spin_unlock(&lli->lli_lock);
200 rc = ll_objects_destroy(req, inode);
202 CERROR("%s: inode "DFID
203 " ll_objects destroy: rc = %d\n",
204 ll_i2mdexp(inode)->exp_obd->obd_name,
205 PFID(ll_inode2fid(inode)), rc);
/* For HSM release, verify the server actually released the file. */
208 if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
209 struct mdt_body *body;
210 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
211 if (!(body->mbo_valid & OBD_MD_FLRELEASED))
215 ll_finish_md_op_data(op_data);
/* SOM-enabled regular file closed without the epoch being closed:
 * queue DONE_WRITING so the epoch gets closed asynchronously. */
219 if (exp_connect_som(exp) && !epoch_close &&
220 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
221 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
223 md_clear_open_replay_data(md_exp, och);
224 /* Free @och if it is not waiting for DONE_WRITING. */
225 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
228 if (req) /* This is close request */
229 ptlrpc_req_finished(req);
/*
 * Close the MDS open handle matching @fmode (write, exec or read) on
 * @inode, but only when no other users of that handle remain
 * (tracked by the corresponding lli_open_fd_*_count under
 * lli_och_mutex).  The actual close RPC is issued outside the mutex,
 * racing closers are expected to be harmless (see comment below).
 */
233 int ll_md_real_close(struct inode *inode, fmode_t fmode)
235 struct ll_inode_info *lli = ll_i2info(inode);
236 struct obd_client_handle **och_p;
237 struct obd_client_handle *och;
/* Select the open-handle slot and use counter for this open mode. */
242 if (fmode & FMODE_WRITE) {
243 och_p = &lli->lli_mds_write_och;
244 och_usecount = &lli->lli_open_fd_write_count;
245 } else if (fmode & FMODE_EXEC) {
246 och_p = &lli->lli_mds_exec_och;
247 och_usecount = &lli->lli_open_fd_exec_count;
249 LASSERT(fmode & FMODE_READ);
250 och_p = &lli->lli_mds_read_och;
251 och_usecount = &lli->lli_open_fd_read_count;
254 mutex_lock(&lli->lli_och_mutex);
255 if (*och_usecount > 0) {
256 /* There are still users of this handle, so skip
258 mutex_unlock(&lli->lli_och_mutex);
264 mutex_unlock(&lli->lli_och_mutex);
267 /* There might be a race and this handle may already
269 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-file-descriptor close: drops any group lock, cleans up a leaked
 * lease handle, closes fd-private open handles, drops the use count
 * for this open mode and, if no matching OPEN lock is cached, issues
 * the real MDS close via ll_md_real_close().  Finally frees the
 * ll_file_data and closes the capability.
 */
276 static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
279 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
280 struct ll_inode_info *lli = ll_i2info(inode);
284 /* clear group lock, if present */
285 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
286 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
288 if (fd->fd_lease_och != NULL) {
291 /* Usually the lease is not released when the
292 * application crashed, we need to release here. */
293 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
294 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
295 PFID(&lli->lli_fid), rc, lease_broken);
297 fd->fd_lease_och = NULL;
/* An fd-private open handle (e.g. from a lease) is closed directly. */
300 if (fd->fd_och != NULL) {
301 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
306 /* Let's see if we have good enough OPEN lock on the file and if
307 we can skip talking to MDS */
308 if (file->f_dentry->d_inode) { /* Can this ever be false? */
310 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
311 struct lustre_handle lockh;
312 struct inode *inode = file->f_dentry->d_inode;
313 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
/* Drop the use count for the mode this descriptor was opened in. */
315 mutex_lock(&lli->lli_och_mutex);
316 if (fd->fd_omode & FMODE_WRITE) {
318 LASSERT(lli->lli_open_fd_write_count);
319 lli->lli_open_fd_write_count--;
320 } else if (fd->fd_omode & FMODE_EXEC) {
322 LASSERT(lli->lli_open_fd_exec_count);
323 lli->lli_open_fd_exec_count--;
326 LASSERT(lli->lli_open_fd_read_count);
327 lli->lli_open_fd_read_count--;
329 mutex_unlock(&lli->lli_och_mutex);
/* No cached OPEN ibits lock -> must tell the MDS about the close. */
331 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
332 LDLM_IBITS, &policy, lockmode,
334 rc = ll_md_real_close(file->f_dentry->d_inode,
338 CERROR("released file has negative dentry: file = %p, "
339 "dentry = %p, name = %s\n",
340 file, file->f_dentry, file->f_dentry->d_name.name);
344 LUSTRE_FPRIVATE(file) = NULL;
345 ll_file_data_put(fd);
346 ll_capa_close(inode);
/* While this returns an error code, fput() the caller does not, so we need
352 * to make every effort to clean up all of our state here. Also, applications
353 * rarely check close errors and even if an error is returned they will not
354 * re-try the close call.
 *
 * VFS ->release() entry point for Lustre files: handles remote-client
 * ACL cleanup, statahead deauthorization, the root-inode fast path,
 * async write error propagation, and finally the MDS close via
 * ll_md_close().
 */
356 int ll_file_release(struct inode *inode, struct file *file)
358 struct ll_file_data *fd;
359 struct ll_sb_info *sbi = ll_i2sbi(inode);
360 struct ll_inode_info *lli = ll_i2info(inode);
364 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
365 PFID(ll_inode2fid(inode)), inode);
367 #ifdef CONFIG_FS_POSIX_ACL
/* Remote-client mode: drop per-pid remote ACL state when the root
 * inode is released with LL_FILE_RMTACL set. */
368 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
369 inode == inode->i_sb->s_root->d_inode) {
370 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
373 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
374 fd->fd_flags &= ~LL_FILE_RMTACL;
375 rct_del(&sbi->ll_rct, current_pid());
376 et_search_free(&sbi->ll_et, current_pid());
381 if (inode->i_sb->s_root != file->f_dentry)
382 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
383 fd = LUSTRE_FPRIVATE(file);
386 /* The last ref on @file, maybe not the the owner pid of statahead,
387 * because parent and child process can share the same file handle. */
388 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
389 ll_deauthorize_statahead(inode, fd);
/* Root inode: no MDS open handle to close, just free the fd data. */
391 if (inode->i_sb->s_root == file->f_dentry) {
392 LUSTRE_FPRIVATE(file) = NULL;
393 ll_file_data_put(fd);
/* Surface any asynchronous write errors recorded on the cl_object. */
397 if (!S_ISDIR(inode->i_mode)) {
398 if (lli->lli_clob != NULL)
399 lov_read_and_clear_async_rc(lli->lli_clob);
400 lli->lli_async_rc = 0;
403 rc = ll_md_close(sbi->ll_md_exp, inode, file);
405 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
406 libcfs_debug_dumplog();
/*
 * Issue an intent-based open RPC to the MDS for @file, optionally
 * packing the striping data @lmm/@lmmsize into the request.  The name
 * is only packed when the server does not support open-by-fid and the
 * dentry name is valid.  On success the resulting MDS reply is used to
 * update the inode and cache the returned lock.
 */
411 static int ll_intent_file_open(struct file *file, void *lmm, int lmmsize,
412 struct lookup_intent *itp)
414 struct dentry *de = file->f_dentry;
415 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
416 struct dentry *parent = de->d_parent;
417 const char *name = NULL;
419 struct md_op_data *op_data;
420 struct ptlrpc_request *req = NULL;
424 LASSERT(parent != NULL);
425 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
427 /* if server supports open-by-fid, or file name is invalid, don't pack
428 * name in open request */
429 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
430 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
431 name = de->d_name.name;
432 len = de->d_name.len;
435 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
436 name, len, 0, LUSTRE_OPC_ANY, NULL);
438 RETURN(PTR_ERR(op_data));
439 op_data->op_data = lmm;
440 op_data->op_data_size = lmmsize;
442 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
443 &ll_md_blocking_ast, 0);
444 ll_finish_md_op_data(op_data);
446 /* reason for keep own exit path - don`t flood log
447 * with messages with -ESTALE errors.
 */
/* Server granted an open we no longer need: release the handle. */
449 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
450 it_open_error(DISP_OPEN_OPEN, itp))
452 ll_release_openhandle(de, itp);
456 if (it_disposition(itp, DISP_LOOKUP_NEG))
457 GOTO(out, rc = -ENOENT);
459 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
460 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
461 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Refresh the inode from the reply and attach the granted lock. */
465 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
466 if (!rc && itp->d.lustre.it_lock_mode)
467 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
470 ptlrpc_req_finished(req);
471 ll_intent_drop_lock(itp);
/*
477 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
478 * not believe attributes if a few ioepoch holders exist. Attributes for
479 * previous ioepoch if new one is opened are also skipped by MDS.
 */
481 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
483 if (ioepoch && lli->lli_ioepoch != ioepoch) {
484 lli->lli_ioepoch = ioepoch;
485 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
486 ioepoch, PFID(&lli->lli_fid));
/*
 * Fill an obd_client_handle from the MDS reply carried by the intent:
 * copies the open handle, fid, lease lock cookie and open flags, sets
 * the handle magic, and registers the open for replay on recovery.
 */
490 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
491 struct obd_client_handle *och)
493 struct ptlrpc_request *req = it->d.lustre.it_data;
494 struct mdt_body *body;
496 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
497 och->och_fh = body->mbo_handle;
498 och->och_fid = body->mbo_fid1;
499 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
500 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
501 och->och_flags = it->it_flags;
503 return md_set_open_replay_data(md_exp, och, it);
/*
 * Complete the client-local part of an open: optionally fill @och from
 * the intent reply and record the new IO epoch, then attach @fd as the
 * file's private data and initialize readahead, the saved open mode,
 * and the cl_io context (fd_lock/fd_lccs).
 */
506 static int ll_local_open(struct file *file, struct lookup_intent *it,
507 struct ll_file_data *fd, struct obd_client_handle *och)
509 struct inode *inode = file->f_dentry->d_inode;
510 struct ll_inode_info *lli = ll_i2info(inode);
513 LASSERT(!LUSTRE_FPRIVATE(file));
518 struct ptlrpc_request *req = it->d.lustre.it_data;
519 struct mdt_body *body;
522 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
526 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
527 ll_ioepoch_open(lli, body->mbo_ioepoch);
530 LUSTRE_FPRIVATE(file) = fd;
531 ll_readahead_init(inode, &fd->fd_ras);
532 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
534 /* ll_cl_context initialize */
535 rwlock_init(&fd->fd_lock);
536 INIT_LIST_HEAD(&fd->fd_lccs);
/* Open a file, and (for the very first open) create objects on the OSTs at
542 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
543 * creation or open until ll_lov_setstripe() ioctl is called.
 *
545 * If we already have the stripe MD locally then we don't request it in
546 * md_open(), by passing a lmm_size = 0.
 *
548 * It is up to the application to ensure no other processes open this file
549 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
550 * used. We might be able to avoid races of that sort by getting lli_open_sem
551 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
552 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 */
554 int ll_file_open(struct inode *inode, struct file *file)
556 struct ll_inode_info *lli = ll_i2info(inode);
557 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
558 .it_flags = file->f_flags };
559 struct obd_client_handle **och_p = NULL;
560 __u64 *och_usecount = NULL;
561 struct ll_file_data *fd;
565 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
566 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* An intent may have been stashed by the lookup path. */
568 it = file->private_data; /* XXX: compat macro */
569 file->private_data = NULL; /* prevent ll_local_open assertion */
571 fd = ll_file_data_get();
573 GOTO(out_openerr, rc = -ENOMEM);
576 if (S_ISDIR(inode->i_mode))
577 ll_authorize_statahead(inode, fd);
/* Root inode needs no MDS open handle: attach fd data and return. */
579 if (inode->i_sb->s_root == file->f_dentry) {
580 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent: build our own IT_OPEN intent from f_flags. */
584 if (!it || !it->d.lustre.it_disposition) {
585 /* Convert f_flags into access mode. We cannot use file->f_mode,
586 * because everything but O_ACCMODE mask was stripped from
 */
588 if ((oit.it_flags + 1) & O_ACCMODE)
590 if (file->f_flags & O_TRUNC)
591 oit.it_flags |= FMODE_WRITE;
593 /* kernel only call f_op->open in dentry_open. filp_open calls
594 * dentry_open after call to open_namei that checks permissions.
595 * Only nfsd_open call dentry_open directly without checking
596 * permissions and because of that this code below is safe. */
597 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
598 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
600 /* We do not want O_EXCL here, presumably we opened the file
601 * already? XXX - NFS implications? */
602 oit.it_flags &= ~O_EXCL;
604 /* bug20584, if "it_flags" contains O_CREAT, the file will be
605 * created if necessary, then "IT_CREAT" should be set to keep
606 * consistent with it */
607 if (oit.it_flags & O_CREAT)
608 oit.it_op |= IT_CREAT;
614 /* Let's see if we have file open on MDS already. */
615 if (it->it_flags & FMODE_WRITE) {
616 och_p = &lli->lli_mds_write_och;
617 och_usecount = &lli->lli_open_fd_write_count;
618 } else if (it->it_flags & FMODE_EXEC) {
619 och_p = &lli->lli_mds_exec_och;
620 och_usecount = &lli->lli_open_fd_exec_count;
622 och_p = &lli->lli_mds_read_och;
623 och_usecount = &lli->lli_open_fd_read_count;
626 mutex_lock(&lli->lli_och_mutex);
627 if (*och_p) { /* Open handle is present */
628 if (it_disposition(it, DISP_OPEN_OPEN)) {
629 /* Well, there's extra open request that we do not need,
630 let's close it somehow. This will decref request. */
631 rc = it_open_error(DISP_OPEN_OPEN, it);
633 mutex_unlock(&lli->lli_och_mutex);
634 GOTO(out_openerr, rc);
637 ll_release_openhandle(file->f_dentry, it);
/* Reuse the cached open handle for this open mode. */
641 rc = ll_local_open(file, it, fd, NULL);
644 mutex_unlock(&lli->lli_och_mutex);
645 GOTO(out_openerr, rc);
648 LASSERT(*och_usecount == 0);
649 if (!it->d.lustre.it_disposition) {
650 /* We cannot just request lock handle now, new ELC code
651 means that one of other OPEN locks for this file
652 could be cancelled, and since blocking ast handler
653 would attempt to grab och_mutex as well, that would
654 result in a deadlock */
655 mutex_unlock(&lli->lli_och_mutex);
/*
657 * Normally called under two situations:
 *
659 * 2. A race/condition on MDS resulting in no open
660 * handle to be returned from LOOKUP|OPEN request,
661 * for example if the target entry was a symlink.
 *
663 * Always fetch MDS_OPEN_LOCK if this is not setstripe.
 *
665 * Always specify MDS_OPEN_BY_FID because we don't want
666 * to get file with different fid.
 */
668 it->it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID;
669 rc = ll_intent_file_open(file, NULL, 0, it);
671 GOTO(out_openerr, rc);
675 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
677 GOTO(out_och_free, rc = -ENOMEM);
681 /* md_intent_lock() didn't get a request ref if there was an
682 * open error, so don't do cleanup on the request here
 */
684 /* XXX (green): Should not we bail out on any error here, not
685 * just open error? */
686 rc = it_open_error(DISP_OPEN_OPEN, it);
688 GOTO(out_och_free, rc);
690 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
691 "inode %p: disposition %x, status %d\n", inode,
692 it_disposition(it, ~0), it->d.lustre.it_status);
694 rc = ll_local_open(file, it, fd, *och_p);
696 GOTO(out_och_free, rc);
698 mutex_unlock(&lli->lli_och_mutex);
701 /* Must do this outside lli_och_mutex lock to prevent deadlock where
702 different kind of OPEN lock for this same inode gets cancelled
703 by ldlm_cancel_lru */
704 if (!S_ISREG(inode->i_mode))
705 GOTO(out_och_free, rc);
/* Delay OST object creation for O_LOV_DELAY_CREATE or read-only
 * opens when the file has no striping yet. */
709 if (!lli->lli_has_smd &&
710 (cl_is_lov_delay_create(file->f_flags) ||
711 (file->f_mode & FMODE_WRITE) == 0)) {
712 CDEBUG(D_INODE, "object creation was delayed\n");
713 GOTO(out_och_free, rc);
715 cl_lov_delay_create_clear(&file->f_flags);
716 GOTO(out_och_free, rc);
/* Error/cleanup paths below (labels elided from this view). */
720 if (och_p && *och_p) {
721 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
722 *och_p = NULL; /* OBD_FREE writes some magic there */
725 mutex_unlock(&lli->lli_och_mutex);
728 if (lli->lli_opendir_key == fd)
729 ll_deauthorize_statahead(inode, fd);
731 ll_file_data_put(fd);
733 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
736 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
737 ptlrpc_req_finished(it->d.lustre.it_data);
738 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for lease locks: on LDLM_CB_BLOCKING cancel the lease
 * lock asynchronously; the LDLM_CB_CANCELING case (body elided from
 * this view) handles the actual cancellation notification.
 */
744 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
745 struct ldlm_lock_desc *desc, void *data, int flag)
748 struct lustre_handle lockh;
752 case LDLM_CB_BLOCKING:
753 ldlm_lock2handle(lock, &lockh);
754 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
756 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
760 case LDLM_CB_CANCELING:
/*
768 * Acquire a lease and open the file.
 *
 * @fmode must be exactly FMODE_READ or FMODE_WRITE.  If @file is given,
 * the lease is taken over that descriptor's existing open handle (only
 * allowed when this descriptor is the sole opener); otherwise a fresh
 * open is performed.  Returns the new obd_client_handle or an ERR_PTR.
 */
770 static struct obd_client_handle *
771 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
774 struct lookup_intent it = { .it_op = IT_OPEN };
775 struct ll_sb_info *sbi = ll_i2sbi(inode);
776 struct md_op_data *op_data;
777 struct ptlrpc_request *req = NULL;
778 struct lustre_handle old_handle = { 0 };
779 struct obd_client_handle *och = NULL;
784 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
785 RETURN(ERR_PTR(-EINVAL));
788 struct ll_inode_info *lli = ll_i2info(inode);
789 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
790 struct obd_client_handle **och_p;
/* The requested mode must match the descriptor, and exec opens
 * cannot carry a lease. */
793 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
794 RETURN(ERR_PTR(-EPERM));
796 /* Get the openhandle of the file */
798 mutex_lock(&lli->lli_och_mutex);
799 if (fd->fd_lease_och != NULL) { /* already holds a lease */
800 mutex_unlock(&lli->lli_och_mutex);
804 if (fd->fd_och == NULL) {
805 if (file->f_mode & FMODE_WRITE) {
806 LASSERT(lli->lli_mds_write_och != NULL);
807 och_p = &lli->lli_mds_write_och;
808 och_usecount = &lli->lli_open_fd_write_count;
810 LASSERT(lli->lli_mds_read_och != NULL);
811 och_p = &lli->lli_mds_read_och;
812 och_usecount = &lli->lli_open_fd_read_count;
814 if (*och_usecount == 1) {
821 mutex_unlock(&lli->lli_och_mutex);
822 if (rc < 0) /* more than 1 opener */
825 LASSERT(fd->fd_och != NULL);
826 old_handle = fd->fd_och->och_fh;
831 RETURN(ERR_PTR(-ENOMEM));
833 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
834 LUSTRE_OPC_ANY, NULL);
836 GOTO(out, rc = PTR_ERR(op_data));
838 /* To tell the MDT this openhandle is from the same owner */
839 op_data->op_handle = old_handle;
841 it.it_flags = fmode | open_flags;
842 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
843 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
844 &ll_md_blocking_lease_ast,
845 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
846 * it can be cancelled which may mislead applications that the lease is
 *
848 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
849 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
850 * doesn't deal with openhandle, so normal openhandle will be leaked. */
851 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
852 ll_finish_md_op_data(op_data);
853 ptlrpc_req_finished(req);
855 GOTO(out_release_it, rc);
857 if (it_disposition(&it, DISP_LOOKUP_NEG))
858 GOTO(out_release_it, rc = -ENOENT);
860 rc = it_open_error(DISP_OPEN_OPEN, &it);
862 GOTO(out_release_it, rc);
864 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
865 ll_och_fill(sbi->ll_md_exp, &it, och);
867 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
868 GOTO(out_close, rc = -EOPNOTSUPP);
870 /* already get lease, handle lease lock */
871 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
872 if (it.d.lustre.it_lock_mode == 0 ||
873 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
874 /* open lock must return for lease */
875 CERROR(DFID "lease granted but no open lock, %d/"LPU64".\n",
876 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
877 it.d.lustre.it_lock_bits);
878 GOTO(out_close, rc = -EPROTO);
881 ll_intent_release(&it);
/* Error path: cancel the lease lock and close the handle. */
885 /* Cancel open lock */
886 if (it.d.lustre.it_lock_mode != 0) {
887 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
888 it.d.lustre.it_lock_mode);
889 it.d.lustre.it_lock_mode = 0;
890 och->och_lease_handle.cookie = 0ULL;
892 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
894 CERROR("%s: error closing file "DFID": %d\n",
895 ll_get_fsname(inode->i_sb, NULL, 0),
896 PFID(&ll_i2info(inode)->lli_fid), rc2);
897 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
899 ll_intent_release(&it);
/*
907 * Release lease and close the file.
908 * It will check if the lease has ever broken.
 *
 * If the lease lock is still resolvable and was not already cancelled,
 * it is cancelled here; @lease_broken (if non-NULL) reports whether the
 * lease had been broken before this call.
 */
910 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
913 struct ldlm_lock *lock;
914 bool cancelled = true; /* assume broken if the lock is gone */
918 lock = ldlm_handle2lock(&och->och_lease_handle);
920 lock_res_and_lock(lock);
921 cancelled = ldlm_is_cancel(lock);
922 unlock_res_and_lock(lock);
926 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
927 PFID(&ll_i2info(inode)->lli_fid), cancelled);
930 ldlm_cli_cancel(&och->och_lease_handle, 0);
931 if (lease_broken != NULL)
932 *lease_broken = cancelled;
934 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
/* Fills the obdo with the attributes for the lsm */
/*
 * Issue an async getattr to the data export for the stripes in @lsm and
 * collect the result in @obdo.  @dv_flags (LL_DV_RD_FLUSH /
 * LL_DV_WR_FLUSH) request server-side lock flushing for data-version
 * purposes; with LL_DV_WR_FLUSH the reply must confirm OBD_FL_FLUSH.
 */
940 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
941 struct obd_capa *capa, struct obdo *obdo,
942 __u64 ioepoch, int dv_flags)
944 struct ptlrpc_request_set *set;
945 struct obd_info oinfo = { { { 0 } } };
950 LASSERT(lsm != NULL);
954 oinfo.oi_oa->o_oi = lsm->lsm_oi;
955 oinfo.oi_oa->o_mode = S_IFREG;
956 oinfo.oi_oa->o_ioepoch = ioepoch;
957 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
958 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
959 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
960 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
961 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
962 OBD_MD_FLDATAVERSION;
963 oinfo.oi_capa = capa;
964 if (dv_flags & (LL_DV_WR_FLUSH | LL_DV_RD_FLUSH)) {
965 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
966 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
967 if (dv_flags & LL_DV_WR_FLUSH)
968 oinfo.oi_oa->o_flags |= OBD_FL_FLUSH;
971 set = ptlrpc_prep_set();
973 CERROR("cannot allocate ptlrpc set: rc = %d\n", -ENOMEM);
976 rc = obd_getattr_async(exp, &oinfo, set);
978 rc = ptlrpc_set_wait(set);
979 ptlrpc_set_destroy(set);
/* Keep only attributes the caller may trust from the OSTs. */
982 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
983 OBD_MD_FLATIME | OBD_MD_FLMTIME |
984 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
985 OBD_MD_FLDATAVERSION | OBD_MD_FLFLAGS);
/* A requested write flush must be confirmed in the reply flags. */
986 if (dv_flags & LL_DV_WR_FLUSH &&
987 !(oinfo.oi_oa->o_valid & OBD_MD_FLFLAGS &&
988 oinfo.oi_oa->o_flags & OBD_FL_FLUSH))
/*
995 * Performs the getattr on the inode and updates its fields.
996 * If @sync != 0, perform the getattr under the server-side lock.
 */
998 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
999 __u64 ioepoch, int sync)
1001 struct obd_capa *capa = ll_mdscapa_get(inode);
1002 struct lov_stripe_md *lsm;
1006 lsm = ccc_inode_lsm_get(inode);
1007 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
1008 capa, obdo, ioepoch, sync ? LL_DV_RD_FLUSH : 0);
1011 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
1013 obdo_refresh_inode(inode, obdo, obdo->o_valid);
1014 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
1015 " blksize %lu\n", POSTID(oi), i_size_read(inode),
1016 (unsigned long long)inode->i_blocks,
1017 1UL << inode->i_blkbits);
1019 ccc_inode_lsm_put(inode, lsm);
/*
 * Merge MDS-provided timestamps with OST attributes under the inode
 * size lock: each timestamp keeps the newer of the two sources, and
 * size/blocks are taken from the cl_object attributes.
 */
1023 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
1025 struct ll_inode_info *lli = ll_i2info(inode);
1026 struct cl_object *obj = lli->lli_clob;
1027 struct cl_attr *attr = ccc_env_thread_attr(env);
1033 ll_inode_size_lock(inode);
1034 /* merge timestamps the most recently obtained from mds with
1035 timestamps obtained from osts */
1036 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
1037 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
1038 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
1040 lvb.lvb_size = i_size_read(inode);
1041 lvb.lvb_blocks = inode->i_blocks;
1042 lvb.lvb_mtime = LTIME_S(inode->i_mtime);
1043 lvb.lvb_atime = LTIME_S(inode->i_atime);
1044 lvb.lvb_ctime = LTIME_S(inode->i_ctime);
1046 cl_object_attr_lock(obj);
1047 rc = cl_object_attr_get(env, obj, attr);
1048 cl_object_attr_unlock(obj);
/* Take the newer of MDS vs OST timestamps. */
1051 if (lvb.lvb_atime < attr->cat_atime)
1052 lvb.lvb_atime = attr->cat_atime;
1053 if (lvb.lvb_ctime < attr->cat_ctime)
1054 lvb.lvb_ctime = attr->cat_ctime;
1055 if (lvb.lvb_mtime < attr->cat_mtime)
1056 lvb.lvb_mtime = attr->cat_mtime;
1058 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
1059 PFID(&lli->lli_fid), attr->cat_size);
1060 cl_isize_write_nolock(inode, attr->cat_size);
1062 inode->i_blocks = attr->cat_blocks;
1064 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1065 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1066 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1068 ll_inode_size_unlock(inode);
/*
 * Glimpse the OST attributes for @lsm and copy size, blocks and
 * timestamps into the caller-supplied stat structure @st.
 */
1073 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1076 struct obdo obdo = { 0 };
1079 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1081 st->st_size = obdo.o_size;
1082 st->st_blocks = obdo.o_blocks;
1083 st->st_mtime = obdo.o_mtime;
1084 st->st_atime = obdo.o_atime;
1085 st->st_ctime = obdo.o_ctime;
/*
 * Decide whether atime updates should be suppressed for @file,
 * checking the open flags, inode flags, mount flags and superblock
 * flags in the same order as the kernel's file_accessed()/touch_atime().
 */
1090 static bool file_is_noatime(const struct file *file)
1092 const struct vfsmount *mnt = file->f_path.mnt;
1093 const struct inode *inode = file->f_path.dentry->d_inode;
1095 /* Adapted from file_accessed() and touch_atime().*/
1096 if (file->f_flags & O_NOATIME)
1099 if (inode->i_flags & S_NOATIME)
1102 if (IS_NOATIME(inode))
1105 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1108 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1111 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/*
 * Initialize a cl_io for a read (@write == 0) or write (@write != 0)
 * on @file: nonblocking/append/sync flags from f_flags, the lock
 * requirement (never for nolock files, mandatory for O_APPEND,
 * otherwise "maybe"), and the noatime decision.
 */
1117 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
1119 struct inode *inode = file->f_dentry->d_inode;
1121 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1123 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1124 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1125 file->f_flags & O_DIRECT ||
1128 io->ci_obj = ll_i2info(inode)->lli_clob;
1129 io->ci_lockreq = CILR_MAYBE;
1130 if (ll_file_nolock(file)) {
1131 io->ci_lockreq = CILR_NEVER;
1132 io->ci_no_srvlock = 1;
1133 } else if (file->f_flags & O_APPEND) {
1134 io->ci_lockreq = CILR_MANDATORY;
1137 io->ci_noatime = file_is_noatime(file);
/*
 * Generic read/write engine: sets up a cl_io for @iot (CIT_READ or
 * CIT_WRITE) at *@ppos for @count bytes, takes the per-file range lock
 * for normal (non-group-locked) writes, runs the cl_io loop, handles
 * the restart-on-zero-progress case, and tallies read/write byte
 * statistics.  On success *@ppos is advanced and the byte count is
 * returned.
 */
1141 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1142 struct file *file, enum cl_io_type iot,
1143 loff_t *ppos, size_t count)
1145 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
1146 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1149 struct range_lock range;
1152 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zd\n",
1153 file->f_dentry->d_name.name, iot, *ppos, count);
1156 io = ccc_env_thread_io(env);
1157 ll_io_init(io, file, iot == CIT_WRITE);
1159 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1160 struct vvp_io *vio = vvp_env_io(env);
1161 struct ccc_io *cio = ccc_env_io(env);
1162 bool range_locked = false;
/* O_APPEND writes must lock to EOF since the start offset can move. */
1164 if (file->f_flags & O_APPEND)
1165 range_lock_init(&range, 0, LUSTRE_EOF);
1167 range_lock_init(&range, *ppos, *ppos + count - 1);
1168 cio->cui_fd = LUSTRE_FPRIVATE(file);
1169 vio->cui_io_subtype = args->via_io_subtype;
1171 switch (vio->cui_io_subtype) {
1173 cio->cui_iov = args->u.normal.via_iov;
1174 cio->cui_nrsegs = args->u.normal.via_nrsegs;
1175 cio->cui_tot_nrsegs = cio->cui_nrsegs;
1176 cio->cui_iocb = args->u.normal.via_iocb;
/* Serialize overlapping writes unless the group lock covers them. */
1177 if ((iot == CIT_WRITE) &&
1178 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1179 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1181 result = range_lock(&lli->lli_write_tree,
1186 range_locked = true;
1188 down_read(&lli->lli_trunc_sem);
1191 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1192 vio->u.splice.cui_flags = args->u.splice.via_flags;
1195 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
1199 ll_cl_add(file, env, io);
1200 result = cl_io_loop(env, io);
1201 ll_cl_remove(file, env);
1203 if (args->via_io_subtype == IO_NORMAL)
1204 up_read(&lli->lli_trunc_sem);
1206 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1208 range_unlock(&lli->lli_write_tree, &range);
1211 /* cl_io_rw_init() handled IO */
1212 result = io->ci_result;
1215 if (io->ci_nob > 0) {
1216 result = io->ci_nob;
1217 *ppos = io->u.ci_wr.wr.crw_pos;
1221 cl_io_fini(env, io);
1222 /* If any bit been read/written (result != 0), we just return
1223 * short read/write instead of restart io. */
1224 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1225 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
1226 iot == CIT_READ ? "read" : "write",
1227 file->f_dentry->d_name.name, *ppos, count);
1228 LASSERTF(io->ci_nob == 0, "%zd\n", io->ci_nob);
1232 if (iot == CIT_READ) {
1234 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1235 LPROC_LL_READ_BYTES, result);
1236 } else if (iot == CIT_WRITE) {
1238 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1239 LPROC_LL_WRITE_BYTES, result);
1240 fd->fd_write_failed = false;
1241 } else if (result != -ERESTARTSYS) {
1242 fd->fd_write_failed = true;
1245 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
/*
 * Validate a user iovec array and compute the total byte count.
 * Rejects any segment with a negative length or a cumulative length that
 * wraps negative (-EINVAL); segments failing access_ok() truncate the
 * count at that point (some lines of this logic are elided in this view).
 */
1252 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1254 static int ll_file_get_iov_count(const struct iovec *iov,
1255 unsigned long *nr_segs, size_t *count)
1260 for (seg = 0; seg < *nr_segs; seg++) {
1261 const struct iovec *iv = &iov[seg];
1264 * If any segment has a negative length, or the cumulative
1265 * length ever wraps negative then return -EINVAL.
1268 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1270 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1275 cnt -= iv->iov_len; /* This segment is no good */
/*
 * AIO read entry point (file_operations ->aio_read).  Validates the
 * iovec, fills vvp_io_args for IO_NORMAL, and delegates to
 * ll_file_io_generic() with CIT_READ, updating iocb->ki_pos.
 */
1282 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1283 unsigned long nr_segs, loff_t pos)
1286 struct vvp_io_args *args;
1292 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1296 env = cl_env_get(&refcheck);
1298 RETURN(PTR_ERR(env));
1300 args = vvp_env_args(env, IO_NORMAL);
/* Cast away const: args struct stores a mutable iovec pointer. */
1301 args->u.normal.via_iov = (struct iovec *)iov;
1302 args->u.normal.via_nrsegs = nr_segs;
1303 args->u.normal.via_iocb = iocb;
1305 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1306 &iocb->ki_pos, count);
1307 cl_env_put(env, &refcheck);
/*
 * Synchronous read (file_operations ->read).  Builds a one-segment
 * iovec and a sync kiocb in per-thread env storage, then reuses the
 * aio_read path; *ppos is updated from the kiocb afterwards.
 */
1311 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1315 struct iovec *local_iov;
1316 struct kiocb *kiocb;
1321 env = cl_env_get(&refcheck);
1323 RETURN(PTR_ERR(env));
1325 local_iov = &vvp_env_info(env)->vti_local_iov;
1326 kiocb = &vvp_env_info(env)->vti_kiocb;
1327 local_iov->iov_base = (void __user *)buf;
1328 local_iov->iov_len = count;
1329 init_sync_kiocb(kiocb, file);
1330 kiocb->ki_pos = *ppos;
/* Kernel-version compat: field name for remaining byte count differs. */
1331 #ifdef HAVE_KIOCB_KI_LEFT
1332 kiocb->ki_left = count;
1334 kiocb->ki_nbytes = count;
1337 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1338 *ppos = kiocb->ki_pos;
1340 cl_env_put(env, &refcheck);
/*
 * AIO write entry point (file_operations ->aio_write).  Mirrors
 * ll_file_aio_read(): validates the iovec, fills IO_NORMAL args and
 * delegates to ll_file_io_generic() with CIT_WRITE.
 */
1345 * Write to a file (through the page cache).
1348 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1349 unsigned long nr_segs, loff_t pos)
1352 struct vvp_io_args *args;
1358 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1362 env = cl_env_get(&refcheck);
1364 RETURN(PTR_ERR(env));
1366 args = vvp_env_args(env, IO_NORMAL);
1367 args->u.normal.via_iov = (struct iovec *)iov;
1368 args->u.normal.via_nrsegs = nr_segs;
1369 args->u.normal.via_iocb = iocb;
1371 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1372 &iocb->ki_pos, count);
1373 cl_env_put(env, &refcheck);
/*
 * Synchronous write (file_operations ->write).  Same wrapper pattern as
 * ll_file_read(): one-segment iovec + sync kiocb, then aio_write path.
 */
1377 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1381 struct iovec *local_iov;
1382 struct kiocb *kiocb;
1387 env = cl_env_get(&refcheck);
1389 RETURN(PTR_ERR(env));
1391 local_iov = &vvp_env_info(env)->vti_local_iov;
1392 kiocb = &vvp_env_info(env)->vti_kiocb;
1393 local_iov->iov_base = (void __user *)buf;
1394 local_iov->iov_len = count;
1395 init_sync_kiocb(kiocb, file);
1396 kiocb->ki_pos = *ppos;
/* Kernel-version compat: field name for remaining byte count differs. */
1397 #ifdef HAVE_KIOCB_KI_LEFT
1398 kiocb->ki_left = count;
1400 kiocb->ki_nbytes = count;
1403 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1404 *ppos = kiocb->ki_pos;
1406 cl_env_put(env, &refcheck);
/*
 * splice_read entry point: read file data into a pipe.  Uses the
 * IO_SPLICE args variant and the common CIT_READ generic path.
 */
1411 * Send file content (through pagecache) somewhere with helper
1413 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1414 struct pipe_inode_info *pipe, size_t count,
1418 struct vvp_io_args *args;
1423 env = cl_env_get(&refcheck);
1425 RETURN(PTR_ERR(env));
1427 args = vvp_env_args(env, IO_SPLICE);
1428 args->u.splice.via_pipe = pipe;
1429 args->u.splice.via_flags = flags;
1431 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1432 cl_env_put(env, &refcheck);
/*
 * Re-create the OST object(s) for @inode on @ost_idx (presumably after
 * object loss — TODO confirm with callers).  Copies the current stripe
 * md, marks the obdo with OBD_FL_RECREATE_OBJS, and calls obd_create()
 * under the inode size lock.  Returns 0 or negative errno.
 */
1436 static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
1439 struct obd_export *exp = ll_i2dtexp(inode);
1440 struct obd_trans_info oti = { 0 };
1441 struct obdo *oa = NULL;
1444 struct lov_stripe_md *lsm = NULL, *lsm2;
1451 lsm = ccc_inode_lsm_get(inode);
1452 if (!lsm_has_objects(lsm))
1453 GOTO(out, rc = -ENOENT);
1455 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1456 (lsm->lsm_stripe_count));
1458 OBD_ALLOC_LARGE(lsm2, lsm_size);
1460 GOTO(out, rc = -ENOMEM);
/* Target OST index is smuggled through o_nlink for the recreate RPC. */
1463 oa->o_nlink = ost_idx;
1464 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1465 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1466 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1467 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1468 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1469 memcpy(lsm2, lsm, lsm_size);
1470 ll_inode_size_lock(inode);
1471 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1472 ll_inode_size_unlock(inode);
1474 OBD_FREE_LARGE(lsm2, lsm_size);
1477 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_RECREATE_OBJ ioctl handler: admin-only.  Copies an
 * ll_recreate_obj from user space and recreates the object by id.
 */
1482 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1484 struct ll_recreate_obj ucreat;
1488 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1491 if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1495 ostid_set_seq_mdt0(&oi);
1496 ostid_set_id(&oi, ucreat.lrc_id);
1497 RETURN(ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx));
/*
 * LL_IOC_RECREATE_FID ioctl handler: admin-only.  Copies a lu_fid from
 * user space, converts it to an ost_id, and derives the OST index from
 * the fid sequence before recreating the object.
 */
1500 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1507 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1510 if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1513 fid_to_ostid(&fid, &oi);
/* OST index is encoded in bits 16..31 of the fid sequence here. */
1514 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1515 RETURN(ll_lov_recreate(inode, &oi, ost_idx));
/*
 * Apply a user-supplied striping EA (@lum, @lum_size) to @inode via an
 * IT_OPEN intent with MDS_OPEN_BY_FID.  Fails with -EEXIST if the file
 * already has a stripe; the open handle obtained for the intent is
 * released before returning.
 */
1518 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1519 __u64 flags, struct lov_user_md *lum,
1522 struct lov_stripe_md *lsm = NULL;
1523 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1527 lsm = ccc_inode_lsm_get(inode);
1529 ccc_inode_lsm_put(inode, lsm);
1530 CDEBUG(D_IOCTL, "stripe already exists for inode "DFID"\n",
1531 PFID(ll_inode2fid(inode)));
1532 GOTO(out, rc = -EEXIST);
1535 ll_inode_size_lock(inode);
1536 oit.it_flags |= MDS_OPEN_BY_FID;
1537 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1539 GOTO(out_unlock, rc);
1540 rc = oit.d.lustre.it_status;
1542 GOTO(out_req_free, rc);
/* Intent open succeeded: close the MDS open handle right away. */
1544 ll_release_openhandle(file->f_dentry, &oit);
1547 ll_inode_size_unlock(inode);
1548 ll_intent_release(&oit);
1549 ccc_inode_lsm_put(inode, lsm);
1551 cl_lov_delay_create_clear(&file->f_flags);
1554 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA for @filename (looked up relative to directory
 * @inode) via md_getattr_name().  On success *lmmp/*lmm_size point into
 * the reply buffer (caller must keep *request until done with *lmmp).
 * On little-endian hosts the MDS wire format is already native; on
 * big-endian hosts the EA is swabbed to host order before returning.
 */
1558 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1559 struct lov_mds_md **lmmp, int *lmm_size,
1560 struct ptlrpc_request **request)
1562 struct ll_sb_info *sbi = ll_i2sbi(inode);
1563 struct mdt_body *body;
1564 struct lov_mds_md *lmm = NULL;
1565 struct ptlrpc_request *req = NULL;
1566 struct md_op_data *op_data;
1569 rc = ll_get_default_mdsize(sbi, &lmmsize);
1573 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1574 strlen(filename), lmmsize,
1575 LUSTRE_OPC_ANY, NULL);
1576 if (IS_ERR(op_data))
1577 RETURN(PTR_ERR(op_data));
1579 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1580 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1581 ll_finish_md_op_data(op_data);
1583 CDEBUG(D_INFO, "md_getattr_name failed "
1584 "on %s: rc %d\n", filename, rc);
1588 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1589 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1591 lmmsize = body->mbo_eadatasize;
/* No EA present (or empty) means no striping to return. */
1593 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1595 GOTO(out, rc = -ENODATA);
1598 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1599 LASSERT(lmm != NULL);
1601 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1602 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1603 GOTO(out, rc = -EPROTO);
1607 * This is coming from the MDS, so is probably in
1608 * little endian. We convert it to host endian before
1609 * passing it to userspace.
/* True only on big-endian hosts: wire (LE) != host order, so swab. */
1611 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1614 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1615 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1618 /* if function called for directory - we should
1619 * avoid swab not existent lsm objects */
1620 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1621 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1622 if (S_ISREG(body->mbo_mode))
1623 lustre_swab_lov_user_md_objects(
1624 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1626 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1627 lustre_swab_lov_user_md_v3(
1628 (struct lov_user_md_v3 *)lmm);
1629 if (S_ISREG(body->mbo_mode))
1630 lustre_swab_lov_user_md_objects(
1631 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1638 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA ioctl handler: admin-only.  Copies a lov_user_md
 * (with one ost_data entry) from user space and applies it through
 * ll_lov_setstripe_ea_info() with MDS_OPEN_HAS_OBJS.
 */
1643 static int ll_lov_setea(struct inode *inode, struct file *file,
1646 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1647 struct lov_user_md *lump;
1648 int lum_size = sizeof(struct lov_user_md) +
1649 sizeof(struct lov_user_ost_data);
1653 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1656 OBD_ALLOC_LARGE(lump, lum_size);
1660 if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1661 OBD_FREE_LARGE(lump, lum_size);
1665 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1667 OBD_FREE_LARGE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE ioctl handler.  Copies the user lov_user_md
 * (first as v1, re-copied as v3 if the magic says so), applies it via
 * ll_lov_setstripe_ea_info(), then on success refreshes the layout and
 * returns the resulting stripe info back to user space.
 */
1671 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1674 struct lov_user_md_v3 lumv3;
1675 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1676 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1677 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1679 __u64 flags = FMODE_WRITE;
1682 /* first try with v1 which is smaller than v3 */
1683 lum_size = sizeof(struct lov_user_md_v1);
1684 if (copy_from_user(lumv1, lumv1p, lum_size))
1687 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1688 lum_size = sizeof(struct lov_user_md_v3);
1689 if (copy_from_user(&lumv3, lumv3p, lum_size))
1693 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1695 struct lov_stripe_md *lsm;
/* Zero the user's stripe_count so GETSTRIPE below fills it in. */
1698 put_user(0, &lumv1p->lmm_stripe_count);
1700 ll_layout_refresh(inode, &gen);
1701 lsm = ccc_inode_lsm_get(inode);
1702 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1703 0, lsm, (void *)arg);
1704 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_LOV_GETSTRIPE ioctl handler: copy the inode's current striping
 * back to user space via the data export's iocontrol.
 */
1709 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1711 struct lov_stripe_md *lsm;
1715 lsm = ccc_inode_lsm_get(inode);
1717 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1719 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_GROUP_LOCK: take a group lock with group id @arg on behalf of
 * this open file.  At most one group lock per file descriptor; a second
 * request (including one lost to a race) is rejected.
 */
1724 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1726 struct ll_inode_info *lli = ll_i2info(inode);
1727 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1728 struct ccc_grouplock grouplock;
1732 if (ll_file_nolock(file))
1733 RETURN(-EOPNOTSUPP);
1735 spin_lock(&lli->lli_lock);
1736 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1737 CWARN("group lock already existed with gid %lu\n",
1738 fd->fd_grouplock.cg_gid);
1739 spin_unlock(&lli->lli_lock);
1742 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1743 spin_unlock(&lli->lli_lock);
/* The enqueue may block, so lli_lock is dropped around it and the
 * "already locked" check is repeated afterwards. */
1745 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1746 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1750 spin_lock(&lli->lli_lock);
1751 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1752 spin_unlock(&lli->lli_lock);
1753 CERROR("another thread just won the race\n");
1754 cl_put_grouplock(&grouplock);
1758 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1759 fd->fd_grouplock = grouplock;
1760 spin_unlock(&lli->lli_lock);
1762 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK: release the group lock with group id @arg held
 * on this file descriptor.  Fails if no group lock is held or the gid
 * does not match; the cl lock is dropped outside lli_lock.
 */
1766 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1768 struct ll_inode_info *lli = ll_i2info(inode);
1769 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1770 struct ccc_grouplock grouplock;
1773 spin_lock(&lli->lli_lock);
1774 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1775 spin_unlock(&lli->lli_lock);
1776 CWARN("no group lock held\n");
1779 LASSERT(fd->fd_grouplock.cg_lock != NULL);
1781 if (fd->fd_grouplock.cg_gid != arg) {
1782 CWARN("group lock %lu doesn't match current id %lu\n",
1783 arg, fd->fd_grouplock.cg_gid);
1784 spin_unlock(&lli->lli_lock);
/* Detach under the spinlock; release the cl lock after dropping it. */
1788 grouplock = fd->fd_grouplock;
1789 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1790 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1791 spin_unlock(&lli->lli_lock);
1793 cl_put_grouplock(&grouplock);
1794 CDEBUG(D_INFO, "group lock %lu released\n", arg);
/*
 * Close the MDS open handle carried by a lookup intent (used when an
 * intent open succeeded but no struct file was created).  No-ops for
 * the root dentry or when the intent holds no open disposition.
 */
1799 * Close inode open handle
1801 * \param dentry [in] dentry which contains the inode
1802 * \param it [in,out] intent which contains open info and result
1805 * \retval <0 failure
1807 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1809 struct inode *inode = dentry->d_inode;
1810 struct obd_client_handle *och;
1816 /* Root ? Do nothing. */
1817 if (dentry->d_inode->i_sb->s_root == dentry)
1820 /* No open handle to close? Move away */
1821 if (!it_disposition(it, DISP_OPEN_OPEN))
1824 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1826 OBD_ALLOC(och, sizeof(*och));
1828 GOTO(out, rc = -ENOMEM);
1830 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1832 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1835 /* this one is in place of ll_file_open */
1836 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1837 ptlrpc_req_finished(it->d.lustre.it_data);
1838 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Execute a FIEMAP request: validate flags, optionally flush dirty
 * pages (FIEMAP_FLAG_SYNC), then hand the request to the data export
 * via obd_get_info(KEY_FIEMAP).  @num_bytes is the size of @fiemap
 * including its extent array.
 */
1844 * Get size for inode for which FIEMAP mapping is requested.
1845 * Make the FIEMAP get_info call and returns the result.
1847 static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1850 struct obd_export *exp = ll_i2dtexp(inode);
1851 struct lov_stripe_md *lsm = NULL;
1852 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1853 __u32 vallen = num_bytes;
1857 /* Checks for fiemap flags */
1858 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* Report back which flags we do NOT support. */
1859 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1863 /* Check for FIEMAP_FLAG_SYNC */
1864 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1865 rc = filemap_fdatawrite(inode->i_mapping);
1870 lsm = ccc_inode_lsm_get(inode);
1874 /* If the stripe_count > 1 and the application does not understand
1875 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1877 if (lsm->lsm_stripe_count > 1 &&
1878 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1879 GOTO(out, rc = -EOPNOTSUPP);
1881 fm_key.oa.o_oi = lsm->lsm_oi;
1882 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1884 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1885 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1886 /* If filesize is 0, then there would be no objects for mapping */
1887 if (fm_key.oa.o_size == 0) {
1888 fiemap->fm_mapped_extents = 0;
1892 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1894 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1897 CERROR("obd_get_info failed: rc = %d\n", rc);
1900 ccc_inode_lsm_put(inode, lsm);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a path via the MDC.
 * Requires CAP_DAC_READ_SEARCH unless the mount allows user fid2path.
 * The user-supplied gf_pathlen bounds the reply buffer (capped at
 * PATH_MAX).
 */
1904 int ll_fid2path(struct inode *inode, void __user *arg)
1906 struct obd_export *exp = ll_i2mdexp(inode);
1907 const struct getinfo_fid2path __user *gfin = arg;
1909 struct getinfo_fid2path *gfout;
1915 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1916 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1919 /* Only need to get the buflen */
1920 if (get_user(pathlen, &gfin->gf_pathlen))
1923 if (pathlen > PATH_MAX)
1926 outsize = sizeof(*gfout) + pathlen;
1927 OBD_ALLOC(gfout, outsize);
1931 if (copy_from_user(gfout, arg, sizeof(*gfout)))
1932 GOTO(gf_free, rc = -EFAULT);
1934 /* Call mdc_iocontrol */
1935 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1939 if (copy_to_user(arg, gfout, outsize))
1943 OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP handler: size the kernel fiemap buffer from the
 * user's fm_extent_count (overflow-checked), copy the request (and,
 * when extent_count != 0, the first extent) in, run ll_do_fiemap(),
 * and copy the header plus mapped extents back out.
 */
1947 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1949 struct ll_user_fiemap *fiemap_s;
1950 size_t num_bytes, ret_bytes;
1951 unsigned int extent_count;
1954 /* Get the extent count so we can calculate the size of
1955 * required fiemap buffer */
1956 if (get_user(extent_count,
1957 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
/* Reject counts whose buffer size would overflow size_t. */
1961 (SIZE_MAX - sizeof(*fiemap_s)) / sizeof(struct ll_fiemap_extent))
1963 num_bytes = sizeof(*fiemap_s) + (extent_count *
1964 sizeof(struct ll_fiemap_extent));
1966 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1967 if (fiemap_s == NULL)
1970 /* get the fiemap value */
1971 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1973 GOTO(error, rc = -EFAULT);
1975 /* If fm_extent_count is non-zero, read the first extent since
1976 * it is used to calculate end_offset and device from previous
1979 if (copy_from_user(&fiemap_s->fm_extents[0],
1980 (char __user *)arg + sizeof(*fiemap_s),
1981 sizeof(struct ll_fiemap_extent)))
1982 GOTO(error, rc = -EFAULT);
1985 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1989 ret_bytes = sizeof(struct ll_user_fiemap);
1991 if (extent_count != 0)
1992 ret_bytes += (fiemap_s->fm_mapped_extents *
1993 sizeof(struct ll_fiemap_extent));
1995 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1999 OBD_FREE_LARGE(fiemap_s, num_bytes);
/*
 * Return the inode's data version in *data_version.  A stripeless file
 * reports version 0; otherwise the version comes from an OST getattr.
 * -ENOTSUPP if the OST did not return OBD_MD_FLDATAVERSION.
 */
2004 * Read the data_version for inode.
2006 * This value is computed using stripe object version on OST.
2007 * Version is computed using server side locking.
2009 * @param sync if do sync on the OST side;
2011 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2012 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
2014 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2016 struct lov_stripe_md *lsm = NULL;
2017 struct ll_sb_info *sbi = ll_i2sbi(inode);
2018 struct obdo *obdo = NULL;
2022 /* If no stripe, we consider version is 0. */
2023 lsm = ccc_inode_lsm_get(inode);
2024 if (!lsm_has_objects(lsm)) {
2026 CDEBUG(D_INODE, "No object for inode\n");
2030 OBD_ALLOC_PTR(obdo);
2032 GOTO(out, rc = -ENOMEM);
2034 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, flags);
2036 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
2039 *data_version = obdo->o_data_version;
2045 ccc_inode_lsm_put(inode, lsm);
/*
 * HSM release: open a write lease with MDS_OPEN_RELEASE, grab the
 * latest data version (flushing via LL_DV_WR_FLUSH), merge attributes,
 * then close the handle carrying the release to the MDT.  On any error
 * the lease is closed without releasing the file.
 */
2050 * Trigger a HSM release request for the provided inode.
2052 int ll_hsm_release(struct inode *inode)
2054 struct cl_env_nest nest;
2056 struct obd_client_handle *och = NULL;
2057 __u64 data_version = 0;
2061 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2062 ll_get_fsname(inode->i_sb, NULL, 0),
2063 PFID(&ll_i2info(inode)->lli_fid));
2065 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2067 GOTO(out, rc = PTR_ERR(och));
2069 /* Grab latest data_version and [am]time values */
2070 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2074 env = cl_env_nested_get(&nest);
2076 GOTO(out, rc = PTR_ERR(env));
2078 ll_merge_lvb(env, inode);
2079 cl_env_nested_put(&nest, env);
2081 /* Release the file.
2082 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2083 * we still need it to pack l_remote_handle to MDT. */
2084 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
2090 if (och != NULL && !IS_ERR(och)) /* close the file */
2091 ll_lease_close(och, inode, NULL);
/*
 * Scratch state for ll_swap_layouts(): the two inodes being swapped,
 * saved [am]time attrs to restore afterwards, and the data-version
 * check flags/values (some members are elided in this view).
 */
2096 struct ll_swap_stack {
2097 struct iattr ia1, ia2;
2099 struct inode *inode1, *inode2;
2100 bool check_dv1, check_dv2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS: atomically swap the layouts of the two open
 * files on the MDT.  Orders the inodes by FID to avoid deadlock, takes
 * group locks when a gid is supplied (to flush dirty cache), verifies
 * data versions when requested, sends the swap via obd_iocontrol, and
 * optionally restores the saved mtime/atime afterwards.
 */
2103 static int ll_swap_layouts(struct file *file1, struct file *file2,
2104 struct lustre_swap_layouts *lsl)
2106 struct mdc_swap_layouts msl;
2107 struct md_op_data *op_data;
2110 struct ll_swap_stack *llss = NULL;
2113 OBD_ALLOC_PTR(llss);
2117 llss->inode1 = file1->f_dentry->d_inode;
2118 llss->inode2 = file2->f_dentry->d_inode;
2120 if (!S_ISREG(llss->inode2->i_mode))
2121 GOTO(free, rc = -EINVAL);
2123 if (inode_permission(llss->inode1, MAY_WRITE) ||
2124 inode_permission(llss->inode2, MAY_WRITE))
2125 GOTO(free, rc = -EPERM);
2127 if (llss->inode2->i_sb != llss->inode1->i_sb)
2128 GOTO(free, rc = -EXDEV);
2130 /* we use 2 bool because it is easier to swap than 2 bits */
2131 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2132 llss->check_dv1 = true;
2134 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2135 llss->check_dv2 = true;
2137 /* we cannot use lsl->sl_dvX directly because we may swap them */
2138 llss->dv1 = lsl->sl_dv1;
2139 llss->dv2 = lsl->sl_dv2;
2141 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2142 if (rc == 0) /* same file, done! */
/* Canonical ordering by FID prevents AB/BA lock ordering deadlocks. */
2145 if (rc < 0) { /* sequentialize it */
2146 swap(llss->inode1, llss->inode2);
2148 swap(llss->dv1, llss->dv2);
2149 swap(llss->check_dv1, llss->check_dv2);
2153 if (gid != 0) { /* application asks to flush dirty cache */
2154 rc = ll_get_grouplock(llss->inode1, file1, gid);
2158 rc = ll_get_grouplock(llss->inode2, file2, gid);
2160 ll_put_grouplock(llss->inode1, file1, gid);
2165 /* to be able to restore mtime and atime after swap
2166 * we need to first save them */
2168 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2169 llss->ia1.ia_mtime = llss->inode1->i_mtime;
2170 llss->ia1.ia_atime = llss->inode1->i_atime;
2171 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2172 llss->ia2.ia_mtime = llss->inode2->i_mtime;
2173 llss->ia2.ia_atime = llss->inode2->i_atime;
2174 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2177 /* ultimate check, before swaping the layouts we check if
2178 * dataversion has changed (if requested) */
2179 if (llss->check_dv1) {
2180 rc = ll_data_version(llss->inode1, &dv, 0);
2183 if (dv != llss->dv1)
2184 GOTO(putgl, rc = -EAGAIN);
2187 if (llss->check_dv2) {
2188 rc = ll_data_version(llss->inode2, &dv, 0);
2191 if (dv != llss->dv2)
2192 GOTO(putgl, rc = -EAGAIN);
2195 /* struct md_op_data is used to send the swap args to the mdt
2196 * only flags is missing, so we use struct mdc_swap_layouts
2197 * through the md_op_data->op_data */
2198 /* flags from user space have to be converted before they are send to
2199 * server, no flag is sent today, they are only used on the client */
2202 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2203 0, LUSTRE_OPC_ANY, &msl);
2204 if (IS_ERR(op_data))
2205 GOTO(free, rc = PTR_ERR(op_data));
2207 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2208 sizeof(*op_data), op_data, NULL);
2209 ll_finish_md_op_data(op_data);
2213 ll_put_grouplock(llss->inode2, file2, gid);
2214 ll_put_grouplock(llss->inode1, file1, gid);
2217 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2221 /* clear useless flags */
2222 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2223 llss->ia1.ia_valid &= ~ATTR_MTIME;
2224 llss->ia2.ia_valid &= ~ATTR_MTIME;
2227 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2228 llss->ia1.ia_valid &= ~ATTR_ATIME;
2229 llss->ia2.ia_valid &= ~ATTR_ATIME;
2232 /* update time if requested */
/* Note the cross-application: inode1 gets ia2's times and vice versa,
 * because the layouts (and with them the data) have been swapped. */
2234 if (llss->ia2.ia_valid != 0) {
2235 mutex_lock(&llss->inode1->i_mutex);
2236 rc = ll_setattr(file1->f_dentry, &llss->ia2);
2237 mutex_unlock(&llss->inode1->i_mutex);
2240 if (llss->ia1.ia_valid != 0) {
2243 mutex_lock(&llss->inode2->i_mutex);
2244 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2245 mutex_unlock(&llss->inode2->i_mutex);
/*
 * Set/clear HSM state flags on @inode via LL_IOC_HSM_STATE_SET to the
 * MDC.  Flags outside HSM_USER_MASK require CAP_SYS_ADMIN.
 */
2257 static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2259 struct md_op_data *op_data;
2262 /* Non-root users are forbidden to set or clear flags which are
2263 * NOT defined in HSM_USER_MASK. */
2264 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2265 !cfs_capable(CFS_CAP_SYS_ADMIN))
2268 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2269 LUSTRE_OPC_ANY, hss);
2270 if (IS_ERR(op_data))
2271 RETURN(PTR_ERR(op_data));
2273 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2274 sizeof(*op_data), op_data, NULL);
2276 ll_finish_md_op_data(op_data);
/*
 * HSM import: mark a regular file ARCHIVED|EXISTS|RELEASED for the
 * given archive id, then force-set the mode, ownership, size and
 * [am]times supplied in @hui via ll_setattr_raw() under i_mutex.
 */
2281 static int ll_hsm_import(struct inode *inode, struct file *file,
2282 struct hsm_user_import *hui)
2284 struct hsm_state_set *hss = NULL;
2285 struct iattr *attr = NULL;
2289 if (!S_ISREG(inode->i_mode))
2295 GOTO(out, rc = -ENOMEM);
2297 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2298 hss->hss_archive_id = hui->hui_archive_id;
2299 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2300 rc = ll_hsm_state_set(inode, hss);
2304 OBD_ALLOC_PTR(attr);
2306 GOTO(out, rc = -ENOMEM);
/* Only permission bits are taken from user; type is forced to S_IFREG. */
2308 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2309 attr->ia_mode |= S_IFREG;
2310 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2311 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2312 attr->ia_size = hui->hui_size;
2313 attr->ia_mtime.tv_sec = hui->hui_mtime;
2314 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2315 attr->ia_atime.tv_sec = hui->hui_atime;
2316 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2318 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2319 ATTR_UID | ATTR_GID |
2320 ATTR_MTIME | ATTR_MTIME_SET |
2321 ATTR_ATIME | ATTR_ATIME_SET;
2323 mutex_lock(&inode->i_mutex);
2325 rc = ll_setattr_raw(file->f_dentry, attr, true);
2329 mutex_unlock(&inode->i_mutex);
/* Map an fmode_t to the LL_LEASE_{RD,WR}LCK bit mask reported to user
 * space by the lease ioctls. */
2341 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2343 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2344 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * Main ioctl dispatcher for regular files (file_operations
 * ->unlocked_ioctl).  Handles striping, grouplock, fiemap, fid/path,
 * HSM, lease and layout-swap commands locally; unrecognized commands
 * fall through to ll_iocontrol_call() / obd_iocontrol().
 * NOTE(review): many case labels and RETURN lines are elided from this
 * view; the numeric prefixes are original file line numbers.
 */
2348 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2350 struct inode *inode = file->f_dentry->d_inode;
2351 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2355 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2356 PFID(ll_inode2fid(inode)), inode, cmd);
2357 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2359 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2360 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2364 case LL_IOC_GETFLAGS:
2365 /* Get the current value of the file flags */
2366 return put_user(fd->fd_flags, (int *)arg);
2367 case LL_IOC_SETFLAGS:
2368 case LL_IOC_CLRFLAGS:
2369 /* Set or clear specific file flags */
2370 /* XXX This probably needs checks to ensure the flags are
2371 * not abused, and to handle any flag side effects.
2373 if (get_user(flags, (int *) arg))
2376 if (cmd == LL_IOC_SETFLAGS) {
2377 if ((flags & LL_FILE_IGNORE_LOCK) &&
2378 !(file->f_flags & O_DIRECT)) {
2379 CERROR("%s: unable to disable locking on "
2380 "non-O_DIRECT file\n", current->comm);
2384 fd->fd_flags |= flags;
2386 fd->fd_flags &= ~flags;
2389 case LL_IOC_LOV_SETSTRIPE:
2390 RETURN(ll_lov_setstripe(inode, file, arg));
2391 case LL_IOC_LOV_SETEA:
2392 RETURN(ll_lov_setea(inode, file, arg));
2393 case LL_IOC_LOV_SWAP_LAYOUTS: {
2395 struct lustre_swap_layouts lsl;
2397 if (copy_from_user(&lsl, (char *)arg,
2398 sizeof(struct lustre_swap_layouts)))
/* Both files must be writable for a layout swap. */
2401 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2404 file2 = fget(lsl.sl_fd);
2409 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2410 rc = ll_swap_layouts(file, file2, &lsl);
2414 case LL_IOC_LOV_GETSTRIPE:
2415 RETURN(ll_lov_getstripe(inode, arg));
2416 case LL_IOC_RECREATE_OBJ:
2417 RETURN(ll_lov_recreate_obj(inode, arg));
2418 case LL_IOC_RECREATE_FID:
2419 RETURN(ll_lov_recreate_fid(inode, arg));
2420 case FSFILT_IOC_FIEMAP:
2421 RETURN(ll_ioctl_fiemap(inode, arg));
2422 case FSFILT_IOC_GETFLAGS:
2423 case FSFILT_IOC_SETFLAGS:
2424 RETURN(ll_iocontrol(inode, file, cmd, arg));
2425 case FSFILT_IOC_GETVERSION_OLD:
2426 case FSFILT_IOC_GETVERSION:
2427 RETURN(put_user(inode->i_generation, (int *)arg));
2428 case LL_IOC_GROUP_LOCK:
2429 RETURN(ll_get_grouplock(inode, file, arg));
2430 case LL_IOC_GROUP_UNLOCK:
2431 RETURN(ll_put_grouplock(inode, file, arg));
2432 case IOC_OBD_STATFS:
2433 RETURN(ll_obd_statfs(inode, (void *)arg));
2435 /* We need to special case any other ioctls we want to handle,
2436 * to send them to the MDS/OST as appropriate and to properly
2437 * network encode the arg field.
2438 case FSFILT_IOC_SETVERSION_OLD:
2439 case FSFILT_IOC_SETVERSION:
2441 case LL_IOC_FLUSHCTX:
2442 RETURN(ll_flush_ctx(inode));
2443 case LL_IOC_PATH2FID: {
2444 if (copy_to_user((void *)arg, ll_inode2fid(inode),
2445 sizeof(struct lu_fid)))
2450 case OBD_IOC_FID2PATH:
2451 RETURN(ll_fid2path(inode, (void *)arg));
2452 case LL_IOC_DATA_VERSION: {
2453 struct ioc_data_version idv;
2456 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
/* Sanitize user flags to the two supported flush modes. */
2459 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2460 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2462 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2468 case LL_IOC_GET_MDTIDX: {
2471 mdtidx = ll_get_mdt_idx(inode);
2475 if (put_user((int)mdtidx, (int*)arg))
2480 case OBD_IOC_GETDTNAME:
2481 case OBD_IOC_GETMDNAME:
2482 RETURN(ll_get_obd_name(inode, cmd, arg));
2483 case LL_IOC_HSM_STATE_GET: {
2484 struct md_op_data *op_data;
2485 struct hsm_user_state *hus;
2492 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2493 LUSTRE_OPC_ANY, hus);
2494 if (IS_ERR(op_data)) {
2496 RETURN(PTR_ERR(op_data));
2499 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2502 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2505 ll_finish_md_op_data(op_data);
2509 case LL_IOC_HSM_STATE_SET: {
2510 struct hsm_state_set *hss;
2517 if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2522 rc = ll_hsm_state_set(inode, hss);
2527 case LL_IOC_HSM_ACTION: {
2528 struct md_op_data *op_data;
2529 struct hsm_current_action *hca;
2536 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2537 LUSTRE_OPC_ANY, hca);
2538 if (IS_ERR(op_data)) {
2540 RETURN(PTR_ERR(op_data));
2543 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2546 if (copy_to_user((char *)arg, hca, sizeof(*hca)))
2549 ll_finish_md_op_data(op_data);
2553 case LL_IOC_SET_LEASE: {
2554 struct ll_inode_info *lli = ll_i2info(inode);
2555 struct obd_client_handle *och = NULL;
/* Requested lease mode must be covered by the file's open mode. */
2560 case LL_LEASE_WRLCK:
2561 if (!(file->f_mode & FMODE_WRITE))
2563 fmode = FMODE_WRITE;
2565 case LL_LEASE_RDLCK:
2566 if (!(file->f_mode & FMODE_READ))
2570 case LL_LEASE_UNLCK:
2571 mutex_lock(&lli->lli_och_mutex);
2572 if (fd->fd_lease_och != NULL) {
2573 och = fd->fd_lease_och;
2574 fd->fd_lease_och = NULL;
2576 mutex_unlock(&lli->lli_och_mutex);
2581 fmode = och->och_flags;
2582 rc = ll_lease_close(och, inode, &lease_broken);
2589 RETURN(ll_lease_type_from_fmode(fmode));
2594 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2596 /* apply for lease */
2597 och = ll_lease_open(inode, file, fmode, 0);
2599 RETURN(PTR_ERR(och));
2602 mutex_lock(&lli->lli_och_mutex);
2603 if (fd->fd_lease_och == NULL) {
2604 fd->fd_lease_och = och;
2607 mutex_unlock(&lli->lli_och_mutex);
2609 /* impossible now that only excl is supported for now */
2610 ll_lease_close(och, inode, &lease_broken);
2615 case LL_IOC_GET_LEASE: {
2616 struct ll_inode_info *lli = ll_i2info(inode);
2617 struct ldlm_lock *lock = NULL;
2620 mutex_lock(&lli->lli_och_mutex);
2621 if (fd->fd_lease_och != NULL) {
2622 struct obd_client_handle *och = fd->fd_lease_och;
2624 lock = ldlm_handle2lock(&och->och_lease_handle);
2626 lock_res_and_lock(lock);
/* Only report the lease if its DLM lock is not being cancelled. */
2627 if (!ldlm_is_cancel(lock))
2628 fmode = och->och_flags;
2630 unlock_res_and_lock(lock);
2631 LDLM_LOCK_PUT(lock);
2634 mutex_unlock(&lli->lli_och_mutex);
2636 RETURN(ll_lease_type_from_fmode(fmode));
2638 case LL_IOC_HSM_IMPORT: {
2639 struct hsm_user_import *hui;
2645 if (copy_from_user(hui, (void *)arg, sizeof(*hui))) {
2650 rc = ll_hsm_import(inode, file, hui);
/* Fall-through default: offer the command to registered handlers,
 * then to the data export's iocontrol. */
2660 ll_iocontrol_call(inode, file, cmd, arg, &err))
2663 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
/*
 * Compat helper (only built when the kernel lacks generic
 * file_llseek_size): validate @offset against the file mode and
 * @maxsize, then commit it to f_pos, resetting f_version.
 */
2669 #ifndef HAVE_FILE_LLSEEK_SIZE
2670 static inline loff_t
2671 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2673 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2675 if (offset > maxsize)
2678 if (offset != file->f_pos) {
2679 file->f_pos = offset;
2680 file->f_version = 0;
/*
 * Compat copy of the kernel's generic_file_llseek_size(): computes the
 * new position for the supported origins against caller-supplied
 * @maxsize and @eof, then applies it via llseek_execute().
 * NOTE(review): the switch labels and several statements are elided in
 * this chunk; only comments added, code untouched.
 */
2686 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2687 loff_t maxsize, loff_t eof)
2689 struct inode *inode = file->f_dentry->d_inode;
2697 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2698 * position-querying operation. Avoid rewriting the "same"
2699 * f_pos value back to the file because a concurrent read(),
2700 * write() or lseek() might have altered it
2705 * f_lock protects against read/modify/write race with other
2706 * SEEK_CURs. Note that parallel writes and reads behave
2709 mutex_lock(&inode->i_mutex);
2710 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2711 mutex_unlock(&inode->i_mutex);
2715 * In the generic case the entire file is data, so as long as
2716 * offset isn't at the end of the file then the offset is data.
2723 * There is a virtual hole at the end of the file, so as long as
2724 * offset isn't i_size or larger, return i_size.
2732 return llseek_execute(file, offset, maxsize);
/*
 * llseek entry point for Lustre files.  For SEEK_END/SEEK_HOLE/SEEK_DATA
 * the cluster-wide size must first be refreshed via ll_glimpse_size();
 * repositioning itself is delegated to ll_generic_file_llseek_size()
 * bounded by the filesystem's maximum byte limit.
 */
2736 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2738 struct inode *inode = file->f_dentry->d_inode;
2739 loff_t retval, eof = 0;
2742 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2743 (origin == SEEK_CUR) ? file->f_pos : 0);
2744 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2745 PFID(ll_inode2fid(inode)), inode, retval, retval,
2747 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2749 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2750 retval = ll_glimpse_size(inode);
2753 eof = i_size_read(inode);
2756 retval = ll_generic_file_llseek_size(file, offset, origin,
2757 ll_file_maxbytes(inode), eof);
/*
 * flush() handler: surface any asynchronous writeback error recorded on
 * the inode (lli_async_rc plus the per-object async rc) to the
 * application as -EIO, clearing the saved state; fd_write_failed
 * suppresses reporting the same failure to this opener twice.
 */
2761 static int ll_flush(struct file *file, fl_owner_t id)
2763 struct inode *inode = file->f_dentry->d_inode;
2764 struct ll_inode_info *lli = ll_i2info(inode);
2765 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2768 LASSERT(!S_ISDIR(inode->i_mode));
2770 /* catch async errors that were recorded back when async writeback
2771 * failed for pages in this mapping. */
2772 rc = lli->lli_async_rc;
2773 lli->lli_async_rc = 0;
2774 if (lli->lli_clob != NULL) {
2775 err = lov_read_and_clear_async_rc(lli->lli_clob);
2780 /* The application has been told write failure already.
2781 * Do not report failure again. */
2782 if (fd->fd_write_failed)
2784 return rc ? -EIO : 0;
2788 * Called to make sure a portion of file has been written out.
2789 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2791 * Return how many pages have been written.
/*
 * Implementation: builds a CIT_FSYNC cl_io over [start, end] with the
 * requested cl_fsync_mode, attaches the OSS write capability via
 * fio->fi_capa, runs the io loop, and on success returns
 * fio->fi_nr_written.  Rejects any mode outside the four known values.
 */
2793 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2794 enum cl_fsync_mode mode, int ignore_layout)
2796 struct cl_env_nest nest;
2799 struct obd_capa *capa = NULL;
2800 struct cl_fsync_io *fio;
2804 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2805 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2808 env = cl_env_nested_get(&nest);
2810 RETURN(PTR_ERR(env));
2812 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2814 io = ccc_env_thread_io(env);
2815 io->ci_obj = cl_i2info(inode)->lli_clob;
2816 io->ci_ignore_layout = ignore_layout;
2818 /* initialize parameters for sync */
2819 fio = &io->u.ci_fsync;
2820 fio->fi_capa = capa;
2821 fio->fi_start = start;
2823 fio->fi_fid = ll_inode2fid(inode);
2824 fio->fi_mode = mode;
2825 fio->fi_nr_written = 0;
2827 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2828 result = cl_io_loop(env, io);
2830 result = io->ci_result;
2832 result = fio->fi_nr_written;
2833 cl_io_fini(env, io);
2834 cl_env_nested_put(&nest, env);
2842 * When dentry is provided (the 'else' case), *file->f_dentry may be
2843 * null and dentry must be used directly rather than pulled from
2844 * *file->f_dentry as is done otherwise.
/*
 * fsync() handler with three compile-time signatures (4-arg, 2-arg, and
 * the legacy dentry variant).  Sequence: flush dirty pages, collect any
 * recorded async writeback error, send MDS fsync (md_fsync), and for
 * regular files run cl_sync_file_range(CL_FSYNC_ALL) over [start, end],
 * tracking success/failure in fd->fd_write_failed.
 */
2847 #ifdef HAVE_FILE_FSYNC_4ARGS
2848 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2850 struct dentry *dentry = file->f_dentry;
2851 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2852 int ll_fsync(struct file *file, int datasync)
2854 struct dentry *dentry = file->f_dentry;
2856 loff_t end = LLONG_MAX;
2858 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2861 loff_t end = LLONG_MAX;
2863 struct inode *inode = dentry->d_inode;
2864 struct ll_inode_info *lli = ll_i2info(inode);
2865 struct ptlrpc_request *req;
2866 struct obd_capa *oc;
2870 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2871 PFID(ll_inode2fid(inode)), inode);
2872 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2874 #ifdef HAVE_FILE_FSYNC_4ARGS
2875 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2876 mutex_lock(&inode->i_mutex);
2878 /* fsync's caller has already called _fdata{sync,write}, we want
2879 * that IO to finish before calling the osc and mdc sync methods */
2880 rc = filemap_fdatawait(inode->i_mapping);
2883 /* catch async errors that were recorded back when async writeback
2884 * failed for pages in this mapping. */
2885 if (!S_ISDIR(inode->i_mode)) {
2886 err = lli->lli_async_rc;
2887 lli->lli_async_rc = 0;
2890 err = lov_read_and_clear_async_rc(lli->lli_clob);
2895 oc = ll_mdscapa_get(inode);
2896 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2902 ptlrpc_req_finished(req);
2904 if (S_ISREG(inode->i_mode)) {
2905 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2907 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2908 if (rc == 0 && err < 0)
2911 fd->fd_write_failed = true;
2913 fd->fd_write_failed = false;
2916 #ifdef HAVE_FILE_FSYNC_4ARGS
2917 mutex_unlock(&inode->i_mutex);
/*
 * flock()/fcntl() byte-range lock handler.  Translates a VFS file_lock
 * (FL_FLOCK or FL_POSIX) into an LDLM_FLOCK enqueue against the MDS:
 * F_RDLCK -> LCK_PR, F_WRLCK -> LCK_PW, F_UNLCK -> LCK_NL (unlock is
 * modelled as an NL enqueue, see the long comment below).  On success
 * the lock is mirrored locally via flock_lock_file_wait() /
 * posix_lock_file_wait(); if that mirror fails the server lock is
 * rolled back with a compensating LCK_NL enqueue.
 */
2923 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2925 struct inode *inode = file->f_dentry->d_inode;
2926 struct ll_sb_info *sbi = ll_i2sbi(inode);
2927 struct ldlm_enqueue_info einfo = {
2928 .ei_type = LDLM_FLOCK,
2929 .ei_cb_cp = ldlm_flock_completion_ast,
2930 .ei_cbdata = file_lock,
2932 struct md_op_data *op_data;
2933 struct lustre_handle lockh = {0};
2934 ldlm_policy_data_t flock = {{0}};
2935 int fl_type = file_lock->fl_type;
2941 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
2942 PFID(ll_inode2fid(inode)), file_lock);
2944 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2946 if (file_lock->fl_flags & FL_FLOCK) {
2947 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2948 /* flocks are whole-file locks */
2949 flock.l_flock.end = OFFSET_MAX;
2950 /* For flocks owner is determined by the local file desctiptor*/
2951 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2952 } else if (file_lock->fl_flags & FL_POSIX) {
2953 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2954 flock.l_flock.start = file_lock->fl_start;
2955 flock.l_flock.end = file_lock->fl_end;
2959 flock.l_flock.pid = file_lock->fl_pid;
2961 /* Somewhat ugly workaround for svc lockd.
2962 * lockd installs custom fl_lmops->lm_compare_owner that checks
2963 * for the fl_owner to be the same (which it always is on local node
2964 * I guess between lockd processes) and then compares pid.
2965 * As such we assign pid to the owner field to make it all work,
2966 * conflict with normal locks is unlikely since pid space and
2967 * pointer space for current->files are not intersecting */
2968 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2969 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
2973 einfo.ei_mode = LCK_PR;
2976 /* An unlock request may or may not have any relation to
2977 * existing locks so we may not be able to pass a lock handle
2978 * via a normal ldlm_lock_cancel() request. The request may even
2979 * unlock a byte range in the middle of an existing lock. In
2980 * order to process an unlock request we need all of the same
2981 * information that is given with a normal read or write record
2982 * lock request. To avoid creating another ldlm unlock (cancel)
2983 * message we'll treat a LCK_NL flock request as an unlock. */
2984 einfo.ei_mode = LCK_NL;
2987 einfo.ei_mode = LCK_PW;
2990 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
3005 flags = LDLM_FL_BLOCK_NOWAIT;
3011 flags = LDLM_FL_TEST_LOCK;
3014 CERROR("unknown fcntl lock command: %d\n", cmd);
3018 /* Save the old mode so that if the mode in the lock changes we
3019 * can decrement the appropriate reader or writer refcount. */
3020 file_lock->fl_type = einfo.ei_mode;
3022 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3023 LUSTRE_OPC_ANY, NULL);
3024 if (IS_ERR(op_data))
3025 RETURN(PTR_ERR(op_data));
3027 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
3028 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
3029 flock.l_flock.pid, flags, einfo.ei_mode,
3030 flock.l_flock.start, flock.l_flock.end);
3032 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data, &lockh,
3035 /* Restore the file lock type if not TEST lock. */
3036 if (!(flags & LDLM_FL_TEST_LOCK))
3037 file_lock->fl_type = fl_type;
3039 if ((file_lock->fl_flags & FL_FLOCK) &&
3040 (rc == 0 || file_lock->fl_type == F_UNLCK))
3041 rc2 = flock_lock_file_wait(file, file_lock);
3042 if ((file_lock->fl_flags & FL_POSIX) &&
3043 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3044 !(flags & LDLM_FL_TEST_LOCK))
3045 rc2 = posix_lock_file_wait(file, file_lock);
3047 if (rc2 && file_lock->fl_type != F_UNLCK) {
3048 einfo.ei_mode = LCK_NL;
3049 md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data,
3054 ll_finish_md_op_data(op_data);
/*
 * Look up the FID of @name under directory @parent via an MDS
 * getattr-by-name RPC (only OBD_MD_FLID is requested).  On success the
 * FID from the reply body is copied into *fid.
 */
3059 int ll_get_fid_by_name(struct inode *parent, const char *name,
3060 int namelen, struct lu_fid *fid)
3062 struct md_op_data *op_data = NULL;
3063 struct mdt_body *body;
3064 struct ptlrpc_request *req;
3068 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3069 LUSTRE_OPC_ANY, NULL);
3070 if (IS_ERR(op_data))
3071 RETURN(PTR_ERR(op_data));
3073 op_data->op_valid = OBD_MD_FLID;
3074 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3075 ll_finish_md_op_data(op_data);
3079 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3081 GOTO(out_req, rc = -EFAULT);
3083 *fid = body->mbo_fid1;
3085 ptlrpc_req_finished(req);
/*
 * Migrate directory entry @name under @parent to MDT @mdtidx.  Resolves
 * the child FID (from the dcache if possible, otherwise via
 * ll_get_fid_by_name()), short-circuits when the object already lives
 * on the target MDT, and otherwise issues a CLI_MIGRATE rename RPC with
 * identical source and target names.
 */
3089 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3090 const char *name, int namelen)
3092 struct dentry *dchild = NULL;
3093 struct inode *child_inode = NULL;
3094 struct md_op_data *op_data;
3095 struct ptlrpc_request *request = NULL;
3100 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3101 name, PFID(ll_inode2fid(parent)), mdtidx);
3103 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3104 0, LUSTRE_OPC_ANY, NULL);
3105 if (IS_ERR(op_data))
3106 RETURN(PTR_ERR(op_data));
3108 /* Get child FID first */
3109 qstr.hash = full_name_hash(name, namelen);
3112 dchild = d_lookup(file->f_dentry, &qstr);
3113 if (dchild != NULL && dchild->d_inode != NULL) {
3114 op_data->op_fid3 = *ll_inode2fid(dchild->d_inode);
/* NOTE(review): inner d_inode check below duplicates the outer one on
 * line 3113 and is always true here; left as-is because surrounding
 * lines are elided in this chunk. */
3115 if (dchild->d_inode != NULL) {
3116 child_inode = igrab(dchild->d_inode);
3117 ll_invalidate_aliases(child_inode);
3121 rc = ll_get_fid_by_name(parent, name, namelen,
3127 if (!fid_is_sane(&op_data->op_fid3)) {
3128 CERROR("%s: migrate %s , but fid "DFID" is insane\n",
3129 ll_get_fsname(parent->i_sb, NULL, 0), name,
3130 PFID(&op_data->op_fid3));
3131 GOTO(out_free, rc = -EINVAL);
3134 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3139 CDEBUG(D_INFO, "%s:"DFID" is already on MDT%d.\n", name,
3140 PFID(&op_data->op_fid3), mdtidx);
3141 GOTO(out_free, rc = 0);
3144 op_data->op_mds = mdtidx;
3145 op_data->op_cli_flags = CLI_MIGRATE;
3146 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3147 namelen, name, namelen, &request);
3149 ll_update_times(request, parent);
3151 ptlrpc_req_finished(request);
3156 if (child_inode != NULL) {
3157 clear_nlink(child_inode);
3161 ll_finish_md_op_data(op_data);
3166 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3174 * test if some locks matching bits and l_req_mode are acquired
3175 * - bits can be in different locks
3176 * - if found clear the common lock bits in *bits
3177 * - the bits not found, are kept in *bits
3179 * \param bits [IN] searched lock bits [IN]
3180 * \param l_req_mode [IN] searched lock mode
3181 * \retval boolean, true iff all bits are found
/*
 * Implementation: probes each inodebit separately with a non-blocking
 * LDLM_FL_TEST_LOCK match; LCK_MINMODE expands to any of CR/CW/PR/PW.
 * Matched bits are cleared from *bits (using the matched lock's own bit
 * mask when the handle resolves, otherwise the probed policy bits).
 */
3183 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
3185 struct lustre_handle lockh;
3186 ldlm_policy_data_t policy;
3187 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
3188 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
3197 fid = &ll_i2info(inode)->lli_fid;
3198 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3199 ldlm_lockname[mode]);
3201 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
3202 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3203 policy.l_inodebits.bits = *bits & (1 << i);
3204 if (policy.l_inodebits.bits == 0)
3207 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3208 &policy, mode, &lockh)) {
3209 struct ldlm_lock *lock;
3211 lock = ldlm_handle2lock(&lockh);
3214 ~(lock->l_policy_data.l_inodebits.bits);
3215 LDLM_LOCK_PUT(lock);
3217 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match (and take a reference on) an existing granted MDS ibits
 * lock covering @bits; the matched handle is returned in *lockh.
 * Returns the matched ldlm mode (0 when nothing matched).
 */
3224 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
3225 struct lustre_handle *lockh, __u64 flags,
3228 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
3233 fid = &ll_i2info(inode)->lli_fid;
3234 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3236 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3237 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the rc of a revalidate RPC: -ENOENT on a non-regular,
 * non-directory inode is treated as "already unlinked" (success after
 * nlink update); other errors are logged (EACCES/EIDRM at D_INFO,
 * everything else at D_ERROR) and propagated.
 */
3242 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3244 /* Already unlinked. Just update nlink and return success */
3245 if (rc == -ENOENT) {
3247 /* This path cannot be hit for regular files unless in
3248 * case of obscure races, so no need to to validate
3250 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3252 } else if (rc != 0) {
3253 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3254 "%s: revalidate FID "DFID" error: rc = %d\n",
3255 ll_get_fsname(inode->i_sb, NULL, 0),
3256 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate a dentry's inode attributes against the MDS.  Two paths:
 * with OBD_CONNECT_ATTRFID, an intent lock (IT_GETATTR, or IT_LOOKUP
 * when only the LOOKUP bit is wanted) is enqueued by FID; otherwise, if
 * no suitable cached ibits lock covers @ibits, a plain md_getattr is
 * issued (requesting EA size for regular files) and the reply is fed to
 * ll_prep_inode().  Unlinked dentries are invalidated rather than
 * d_drop'ed to preserve getcwd() behaviour.
 */
3262 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3264 struct inode *inode = dentry->d_inode;
3265 struct ptlrpc_request *req = NULL;
3266 struct obd_export *exp;
3270 LASSERT(inode != NULL);
3272 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3273 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3275 exp = ll_i2mdexp(inode);
3277 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3278 * But under CMD case, it caused some lock issues, should be fixed
3279 * with new CMD ibits lock. See bug 12718 */
3280 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3281 struct lookup_intent oit = { .it_op = IT_GETATTR };
3282 struct md_op_data *op_data;
3284 if (ibits == MDS_INODELOCK_LOOKUP)
3285 oit.it_op = IT_LOOKUP;
3287 /* Call getattr by fid, so do not provide name at all. */
3288 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3289 dentry->d_inode, NULL, 0, 0,
3290 LUSTRE_OPC_ANY, NULL);
3291 if (IS_ERR(op_data))
3292 RETURN(PTR_ERR(op_data));
3294 rc = md_intent_lock(exp, op_data, &oit, &req,
3295 &ll_md_blocking_ast, 0);
3296 ll_finish_md_op_data(op_data);
3298 rc = ll_inode_revalidate_fini(inode, rc);
3302 rc = ll_revalidate_it_finish(req, &oit, dentry);
3304 ll_intent_release(&oit);
3308 /* Unlinked? Unhash dentry, so it is not picked up later by
3309 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3310 here to preserve get_cwd functionality on 2.6.
3312 if (!dentry->d_inode->i_nlink)
3313 d_lustre_invalidate(dentry, 0);
3315 ll_lookup_finish_locks(&oit, dentry);
3316 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3317 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3318 obd_valid valid = OBD_MD_FLGETATTR;
3319 struct md_op_data *op_data;
3322 if (S_ISREG(inode->i_mode)) {
3323 rc = ll_get_default_mdsize(sbi, &ealen);
3326 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3329 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3330 0, ealen, LUSTRE_OPC_ANY,
3332 if (IS_ERR(op_data))
3333 RETURN(PTR_ERR(op_data));
3335 op_data->op_valid = valid;
3336 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
3337 * capa for this inode. Because we only keep capas of dirs
3339 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3340 ll_finish_md_op_data(op_data);
3342 rc = ll_inode_revalidate_fini(inode, rc);
3346 rc = ll_prep_inode(&inode, req, NULL, NULL);
3349 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge per-stripe attributes from all MDTs
 * (md_merge_attr) into the local inode info: cached stripe-dir size and
 * nlink, plus a/m/ctime in the lvb.
 */
3353 static int ll_merge_md_attr(struct inode *inode)
3355 struct cl_attr attr = { 0 };
3358 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3359 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3364 ll_i2info(inode)->lli_stripe_dir_size = attr.cat_size;
3365 ll_i2info(inode)->lli_stripe_dir_nlink = attr.cat_nlink;
3367 ll_i2info(inode)->lli_lvb.lvb_atime = attr.cat_atime;
3368 ll_i2info(inode)->lli_lvb.lvb_mtime = attr.cat_mtime;
3369 ll_i2info(inode)->lli_lvb.lvb_ctime = attr.cat_ctime;
/*
 * Full revalidate: refresh MDS attributes via __ll_inode_revalidate(),
 * then for non-regular files copy cached lvb timestamps into the inode
 * (merging striped-dir attributes first when applicable); for regular
 * files glimpse the OST size unless an HSM restore is in progress, in
 * which case the MDT-provided size is already authoritative.
 */
3375 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3377 struct inode *inode = dentry->d_inode;
3381 rc = __ll_inode_revalidate(dentry, ibits);
3385 /* if object isn't regular file, don't validate size */
3386 if (!S_ISREG(inode->i_mode)) {
3387 if (S_ISDIR(inode->i_mode) &&
3388 ll_i2info(inode)->lli_lsm_md != NULL) {
3389 rc = ll_merge_md_attr(inode);
3394 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
3395 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
3396 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
3398 /* In case of restore, the MDT has the right size and has
3399 * already send it back without granting the layout lock,
3400 * inode is up-to-date so glimpse is useless.
3401 * Also to glimpse we need the layout, in case of a running
3402 * restore the MDT holds the layout lock so the glimpse will
3403 * block up to the end of restore (getattr will block)
3405 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3406 rc = ll_glimpse_size(inode);
/*
 * getattr() handler: revalidate UPDATE|LOOKUP ibits, then fill *stat
 * from the inode.  Uses cl_fid_build_ino() for 32-bit-API clients, and
 * for striped directories substitutes the merged nlink/size cached by
 * ll_merge_md_attr().
 */
3411 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3413 struct inode *inode = de->d_inode;
3414 struct ll_sb_info *sbi = ll_i2sbi(inode);
3415 struct ll_inode_info *lli = ll_i2info(inode);
3418 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3419 MDS_INODELOCK_LOOKUP);
3420 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3425 stat->dev = inode->i_sb->s_dev;
3426 if (ll_need_32bit_api(sbi))
3427 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3429 stat->ino = inode->i_ino;
3430 stat->mode = inode->i_mode;
3431 stat->uid = inode->i_uid;
3432 stat->gid = inode->i_gid;
3433 stat->rdev = inode->i_rdev;
3434 stat->atime = inode->i_atime;
3435 stat->mtime = inode->i_mtime;
3436 stat->ctime = inode->i_ctime;
3437 stat->blksize = 1 << inode->i_blkbits;
3438 stat->blocks = inode->i_blocks;
3440 if (S_ISDIR(inode->i_mode) &&
3441 ll_i2info(inode)->lli_lsm_md != NULL) {
3442 stat->nlink = lli->lli_stripe_dir_nlink;
3443 stat->size = lli->lli_stripe_dir_size;
3445 stat->nlink = inode->i_nlink;
3446 stat->size = i_size_read(inode);
/*
 * FIEMAP handler: marshals the kernel fiemap_extent_info into a Lustre
 * ll_user_fiemap (including the caller's extent buffer when
 * fi_extents_max > 0), runs ll_do_fiemap(), then copies flags, mapped
 * extent count, and extents back.
 * NOTE(review): the inbound memcpy on line 3472 copies only one
 * ll_fiemap_extent regardless of extent_count — presumably only the
 * first extent seeds the request; confirm against ll_do_fiemap().
 */
3452 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3453 __u64 start, __u64 len)
3457 struct ll_user_fiemap *fiemap;
3458 unsigned int extent_count = fieinfo->fi_extents_max;
3460 num_bytes = sizeof(*fiemap) + (extent_count *
3461 sizeof(struct ll_fiemap_extent));
3462 OBD_ALLOC_LARGE(fiemap, num_bytes);
3467 fiemap->fm_flags = fieinfo->fi_flags;
3468 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3469 fiemap->fm_start = start;
3470 fiemap->fm_length = len;
3471 if (extent_count > 0)
3472 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3473 sizeof(struct ll_fiemap_extent));
3475 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3477 fieinfo->fi_flags = fiemap->fm_flags;
3478 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3479 if (extent_count > 0)
3480 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3481 fiemap->fm_mapped_extents *
3482 sizeof(struct ll_fiemap_extent));
3484 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * Return a referenced copy of the cached POSIX ACL for @inode, taken
 * under lli_lock; the VFS releases the reference (see comment below).
 */
3488 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3490 struct ll_inode_info *lli = ll_i2info(inode);
3491 struct posix_acl *acl = NULL;
3494 spin_lock(&lli->lli_lock);
3495 /* VFS' acl_permission_check->check_acl will release the refcount */
3496 acl = posix_acl_dup(lli->lli_posix_acl);
3497 spin_unlock(&lli->lli_lock);
/*
 * ACL check callback for older generic_permission() signatures.
 * Bails out in RCU-walk mode (IPERM_FLAG_RCU) where blocking is not
 * allowed, otherwise fetches the cached ACL and runs
 * posix_acl_permission().  Compiled away when CONFIG_FS_POSIX_ACL is
 * off or when generic_permission takes only two arguments.
 */
3502 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3504 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3505 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3507 ll_check_acl(struct inode *inode, int mask)
3510 # ifdef CONFIG_FS_POSIX_ACL
3511 struct posix_acl *acl;
3515 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3516 if (flags & IPERM_FLAG_RCU)
3519 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3524 rc = posix_acl_permission(inode, acl, mask);
3525 posix_acl_release(acl);
3528 # else /* !CONFIG_FS_POSIX_ACL */
3530 # endif /* CONFIG_FS_POSIX_ACL */
3532 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * permission() handler (three compile-time signatures).  Skips work in
 * RCU/may-not-block mode; revalidates the root inode on first access;
 * applies root-squash by overriding the task credentials (fsuid/fsgid
 * and filesystem capabilities) before delegating to either the remote
 * permission check (LL_SBI_RMT_CLIENT) or generic permission with
 * ll_check_acl, then restores the original credentials.
 */
3534 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3535 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3537 # ifdef HAVE_INODE_PERMISION_2ARGS
3538 int ll_inode_permission(struct inode *inode, int mask)
3540 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3545 struct ll_sb_info *sbi;
3546 struct root_squash_info *squash;
3547 struct cred *cred = NULL;
3548 const struct cred *old_cred = NULL;
3550 bool squash_id = false;
3553 #ifdef MAY_NOT_BLOCK
3554 if (mask & MAY_NOT_BLOCK)
3556 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3557 if (flags & IPERM_FLAG_RCU)
3561 /* as root inode are NOT getting validated in lookup operation,
3562 * need to do it before permission check. */
3564 if (inode == inode->i_sb->s_root->d_inode) {
3565 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3566 MDS_INODELOCK_LOOKUP);
3571 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3572 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3574 /* squash fsuid/fsgid if needed */
3575 sbi = ll_i2sbi(inode);
3576 squash = &sbi->ll_squash;
3577 if (unlikely(squash->rsi_uid != 0 &&
3578 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
3579 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3583 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3584 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3585 squash->rsi_uid, squash->rsi_gid);
3587 /* update current process's credentials
3588 * and FS capability */
3589 cred = prepare_creds();
3593 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
3594 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
3595 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3596 if ((1 << cap) & CFS_CAP_FS_MASK)
3597 cap_lower(cred->cap_effective, cap);
3599 old_cred = override_creds(cred);
3602 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3604 if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
3605 rc = lustre_check_remote_perm(inode, mask);
3607 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3609 /* restore current process's credentials and FS capability */
3611 revert_creds(old_cred);
3618 /* -o localflock - only provides locally consistent flock locks */
/* Default file_operations table: no .flock/.lock entries, so the VFS
 * handles flock locally (client-local consistency only). */
3619 struct file_operations ll_file_operations = {
3620 .read = ll_file_read,
3621 .aio_read = ll_file_aio_read,
3622 .write = ll_file_write,
3623 .aio_write = ll_file_aio_write,
3624 .unlocked_ioctl = ll_file_ioctl,
3625 .open = ll_file_open,
3626 .release = ll_file_release,
3627 .mmap = ll_file_mmap,
3628 .llseek = ll_file_seek,
3629 .splice_read = ll_file_splice_read,
/* file_operations for "-o flock" mounts: same as the default table but
 * wires .flock and .lock to ll_file_flock for cluster-coherent locks. */
3634 struct file_operations ll_file_operations_flock = {
3635 .read = ll_file_read,
3636 .aio_read = ll_file_aio_read,
3637 .write = ll_file_write,
3638 .aio_write = ll_file_aio_write,
3639 .unlocked_ioctl = ll_file_ioctl,
3640 .open = ll_file_open,
3641 .release = ll_file_release,
3642 .mmap = ll_file_mmap,
3643 .llseek = ll_file_seek,
3644 .splice_read = ll_file_splice_read,
3647 .flock = ll_file_flock,
3648 .lock = ll_file_flock
3651 /* These are for -o noflock - to return ENOSYS on flock calls */
3652 struct file_operations ll_file_operations_noflock = {
3653 .read = ll_file_read,
3654 .aio_read = ll_file_aio_read,
3655 .write = ll_file_write,
3656 .aio_write = ll_file_aio_write,
3657 .unlocked_ioctl = ll_file_ioctl,
3658 .open = ll_file_open,
3659 .release = ll_file_release,
3660 .mmap = ll_file_mmap,
3661 .llseek = ll_file_seek,
3662 .splice_read = ll_file_splice_read,
/* lock calls are routed to the rejecting stub ll_file_noflock */
3665 .flock = ll_file_noflock,
3666 .lock = ll_file_noflock
/* inode_operations for regular Lustre files: attribute, xattr, fiemap
 * and (when the kernel supports .get_acl) POSIX-ACL entry points. */
3669 struct inode_operations ll_file_inode_operations = {
3670 .setattr = ll_setattr,
3671 .getattr = ll_getattr,
3672 .permission = ll_inode_permission,
3673 .setxattr = ll_setxattr,
3674 .getxattr = ll_getxattr,
3675 .listxattr = ll_listxattr,
3676 .removexattr = ll_removexattr,
3677 .fiemap = ll_fiemap,
3678 #ifdef HAVE_IOP_GET_ACL
3679 .get_acl = ll_get_acl,
3683 /* dynamic ioctl number support routins */
/* Global registry of dynamically registered ioctl handlers, protected
 * by a read-write semaphore. */
3684 static struct llioc_ctl_data {
3685 struct rw_semaphore ioc_sem;
3686 struct list_head ioc_head;
3688 __RWSEM_INITIALIZER(llioc.ioc_sem),
3689 LIST_HEAD_INIT(llioc.ioc_head)
/* One registered handler: its callback plus the ioctl command numbers
 * it serves, stored inline via the trailing flexible-style array. */
3694 struct list_head iocd_list;
3695 unsigned int iocd_size;
3696 llioc_callback_t iocd_cb;
3697 unsigned int iocd_count;
3698 unsigned int iocd_cmd[0];
/*
 * Register a dynamic ioctl handler: allocate an llioc_data holding the
 * callback and its @count command numbers, append it to the global list
 * under the write semaphore, and return the entry as an opaque magic
 * cookie for ll_iocontrol_unregister().
 */
3701 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3704 struct llioc_data *in_data = NULL;
3707 if (cb == NULL || cmd == NULL ||
3708 count > LLIOC_MAX_CMD || count < 0)
3711 size = sizeof(*in_data) + count * sizeof(unsigned int);
3712 OBD_ALLOC(in_data, size);
3713 if (in_data == NULL)
3716 memset(in_data, 0, sizeof(*in_data));
3717 in_data->iocd_size = size;
3718 in_data->iocd_cb = cb;
3719 in_data->iocd_count = count;
3720 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3722 down_write(&llioc.ioc_sem);
3723 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3724 up_write(&llioc.ioc_sem);
/*
 * Unregister a handler previously returned by ll_iocontrol_register():
 * find the matching entry under the write semaphore, unlink and free
 * it; warn if the magic cookie is unknown.
 */
3729 void ll_iocontrol_unregister(void *magic)
3731 struct llioc_data *tmp;
3736 down_write(&llioc.ioc_sem);
3737 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3739 unsigned int size = tmp->iocd_size;
3741 list_del(&tmp->iocd_list);
3742 up_write(&llioc.ioc_sem);
3744 OBD_FREE(tmp, size);
3748 up_write(&llioc.ioc_sem);
3750 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3753 EXPORT_SYMBOL(ll_iocontrol_register);
3754 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch an unrecognized ioctl to the registered dynamic handlers:
 * under the read semaphore, walk every registration looking for a
 * matching command number and invoke its callback; stop as soon as a
 * callback returns LLIOC_STOP.  The callback's rc is stored via *rcp
 * (defaults to -EINVAL when no handler claims the command).
 */
3756 static enum llioc_iter
3757 ll_iocontrol_call(struct inode *inode, struct file *file,
3758 unsigned int cmd, unsigned long arg, int *rcp)
3760 enum llioc_iter ret = LLIOC_CONT;
3761 struct llioc_data *data;
3762 int rc = -EINVAL, i;
3764 down_read(&llioc.ioc_sem);
3765 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3766 for (i = 0; i < data->iocd_count; i++) {
3767 if (cmd != data->iocd_cmd[i])
3770 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3774 if (ret == LLIOC_STOP)
3777 up_read(&llioc.ioc_sem);
/*
 * Push a layout configuration into the cl_object stack via
 * cl_conf_set().  For OBJECT_CONF_SET with a layout lock, the lock is
 * only allowed to match after the layout has been applied (to avoid
 * exposing a stale layout), and the cached layout generation is updated
 * from the new lsm (LL_LAYOUT_GEN_EMPTY when there is none).
 */
3784 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3786 struct ll_inode_info *lli = ll_i2info(inode);
3787 struct cl_env_nest nest;
3792 if (lli->lli_clob == NULL)
3795 env = cl_env_nested_get(&nest);
3797 RETURN(PTR_ERR(env));
3799 result = cl_conf_set(env, lli->lli_clob, conf);
3800 cl_env_nested_put(&nest, env);
3802 if (conf->coc_opc == OBJECT_CONF_SET) {
3803 struct ldlm_lock *lock = conf->coc_lock;
3805 LASSERT(lock != NULL);
3806 LASSERT(ldlm_has_layout(lock));
3808 struct lustre_md *md = conf->u.coc_md;
3809 __u32 gen = LL_LAYOUT_GEN_EMPTY;
3811 /* it can only be allowed to match after layout is
3812 * applied to inode otherwise false layout would be
3813 * seen. Applying layout shoud happen before dropping
3814 * the intent lock. */
3815 ldlm_lock_allow_match(lock);
3817 lli->lli_has_smd = lsm_has_objects(md->lsm);
3818 if (md->lsm != NULL)
3819 gen = md->lsm->lsm_layout_gen;
3822 DFID ": layout version change: %u -> %u\n",
3823 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3825 ll_layout_version_set(lli, gen);
3831 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/*
 * If the lock's LVB does not already carry a ready layout, fetch the
 * LOV EA from the MDT (md_getxattr of XATTR_NAME_LOV), copy it into a
 * freshly allocated buffer, and install it as the lock's l_lvb_data /
 * l_lvb_len under the resource lock, freeing any previous buffer.
 */
3832 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3835 struct ll_sb_info *sbi = ll_i2sbi(inode);
3836 struct obd_capa *oc;
3837 struct ptlrpc_request *req;
3838 struct mdt_body *body;
3845 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3846 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
3847 lock->l_lvb_data, lock->l_lvb_len);
3849 if ((lock->l_lvb_data != NULL) && ldlm_is_lvb_ready(lock))
3852 /* if layout lock was granted right away, the layout is returned
3853 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3854 * blocked and then granted via completion ast, we have to fetch
3855 * layout here. Please note that we can't use the LVB buffer in
3856 * completion AST because it doesn't have a large enough buffer */
3857 oc = ll_mdscapa_get(inode);
3858 rc = ll_get_default_mdsize(sbi, &lmmsize);
3860 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3861 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3867 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3869 GOTO(out, rc = -EPROTO);
3871 lmmsize = body->mbo_eadatasize;
3872 if (lmmsize == 0) /* empty layout */
3875 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3877 GOTO(out, rc = -EFAULT);
3879 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3880 if (lvbdata == NULL)
3881 GOTO(out, rc = -ENOMEM);
3883 memcpy(lvbdata, lmm, lmmsize);
3884 lock_res_and_lock(lock);
3885 if (lock->l_lvb_data != NULL)
3886 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3888 lock->l_lvb_data = lvbdata;
3889 lock->l_lvb_len = lmmsize;
3890 unlock_res_and_lock(lock);
3895 ptlrpc_req_finished(req);
3900 * Apply the layout to the inode. Layout lock is held and will be released
/*
 * Implementation: bind the caching lock to the inode, fetch the layout
 * into the lock's LVB if needed (ll_layout_fetch), unpack it into an
 * lsm, and push it into the cl_object via ll_layout_conf(OBJECT_CONF_SET),
 * recording the new layout generation in *gen.  If the reconfiguration
 * returns -EBUSY the lock is dropped first and the function waits for
 * in-flight IO via an OBJECT_CONF_WAIT configuration.
 */
3903 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3904 struct inode *inode, __u32 *gen, bool reconf)
3906 struct ll_inode_info *lli = ll_i2info(inode);
3907 struct ll_sb_info *sbi = ll_i2sbi(inode);
3908 struct ldlm_lock *lock;
3909 struct lustre_md md = { NULL };
3910 struct cl_object_conf conf;
3913 bool wait_layout = false;
3916 LASSERT(lustre_handle_is_used(lockh));
3918 lock = ldlm_handle2lock(lockh);
3919 LASSERT(lock != NULL);
3920 LASSERT(ldlm_has_layout(lock));
3922 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured: %d",
3923 PFID(&lli->lli_fid), inode, reconf);
3925 /* in case this is a caching lock and reinstate with new inode */
3926 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3928 lock_res_and_lock(lock);
3929 lvb_ready = ldlm_is_lvb_ready(lock);
3930 unlock_res_and_lock(lock);
3931 /* checking lvb_ready is racy but this is okay. The worst case is
3932 * that multi processes may configure the file on the same time. */
3934 if (lvb_ready || !reconf) {
3937 /* layout_gen must be valid if layout lock is not
3938 * cancelled and stripe has already set */
3939 *gen = ll_layout_version_get(lli);
3945 rc = ll_layout_fetch(inode, lock);
3949 /* for layout lock, lmm is returned in lock's lvb.
3950 * lvb_data is immutable if the lock is held so it's safe to access it
3951 * without res lock. See the description in ldlm_lock_decref_internal()
3952 * for the condition to free lvb_data of layout lock */
3953 if (lock->l_lvb_data != NULL) {
3954 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3955 lock->l_lvb_data, lock->l_lvb_len);
3957 *gen = LL_LAYOUT_GEN_EMPTY;
3959 *gen = md.lsm->lsm_layout_gen;
3962 CERROR("%s: file "DFID" unpackmd error: %d\n",
3963 ll_get_fsname(inode->i_sb, NULL, 0),
3964 PFID(&lli->lli_fid), rc);
3970 /* set layout to file. Unlikely this will fail as old layout was
3971 * surely eliminated */
3972 memset(&conf, 0, sizeof conf);
3973 conf.coc_opc = OBJECT_CONF_SET;
3974 conf.coc_inode = inode;
3975 conf.coc_lock = lock;
3976 conf.u.coc_md = &md;
3977 rc = ll_layout_conf(inode, &conf);
3980 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3982 /* refresh layout failed, need to wait */
3983 wait_layout = rc == -EBUSY;
3987 LDLM_LOCK_PUT(lock);
3988 ldlm_lock_decref(lockh, mode);
3990 /* wait for IO to complete if it's still being used. */
3992 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
3993 ll_get_fsname(inode->i_sb, NULL, 0),
3994 PFID(&lli->lli_fid), inode);
3996 memset(&conf, 0, sizeof conf);
3997 conf.coc_opc = OBJECT_CONF_WAIT;
3998 conf.coc_inode = inode;
3999 rc = ll_layout_conf(inode, &conf);
4003 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4004 ll_get_fsname(inode->i_sb, NULL, 0),
4005 PFID(&lli->lli_fid), rc);
4011 * This function checks if there exists a LAYOUT lock on the client side,
4012 * or enqueues it if it doesn't have one in cache.
4014 * This function does not hold the layout lock, so the lock may be revoked at
4015 * any time after this function returns. Any operation that depends on the layout should be redone
4018 * This function should be called before lov_io_init() to get an uptodate
4019 * layout version, the caller should save the version number and after IO
4020 * is finished, this function should be called again to verify that the layout
4021 * was not changed during the IO.
/*
 * Ensure this inode's layout lock is valid (matching a cached lock or
 * enqueuing a new one) and return the current layout generation in *gen.
 *
 * NOTE(review): several original source lines are elided from this view
 * (opening brace, rc/mode declarations, GOTO/RETURN paths, any retry
 * label).  The comments below describe only the visible statements.
 */
4023 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4025 struct ll_inode_info *lli = ll_i2info(inode);
4026 struct ll_sb_info *sbi = ll_i2sbi(inode);
4027 struct md_op_data *op_data;
4028 struct lookup_intent it;
4029 struct lustre_handle lockh;
/* Enqueue parameters: the layout is protected by an inode-bits (IBITS)
 * lock using the standard MD blocking/completion callbacks. */
4031 struct ldlm_enqueue_info einfo = {
4032 .ei_type = LDLM_IBITS,
4034 .ei_cb_bl = &ll_md_blocking_ast,
4035 .ei_cb_cp = &ldlm_completion_ast,
/* Fast path: if layout locks are disabled for this mount, or a valid
 * generation is already cached, no enqueue is necessary. */
4040 *gen = ll_layout_version_get(lli);
4041 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
/* Layout locks only apply to sane, regular files. */
4045 LASSERT(fid_is_sane(ll_inode2fid(inode)));
4046 LASSERT(S_ISREG(inode->i_mode));
4048 /* take layout lock mutex to enqueue layout lock exclusively. */
4049 mutex_lock(&lli->lli_layout_mutex);
4052 /* mostly layout lock is caching on the local side, so try to match
4053 * it before grabbing layout lock mutex. */
/* Try to reuse an already-cached MDS lock covering the LAYOUT bit, in
 * any of CR/CW/PR/PW modes, before paying for a new enqueue. */
4054 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4055 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4056 if (mode != 0) { /* hit cached lock */
/* Apply the layout carried by the cached lock; final argument "true"
 * allows reconfiguring an already-set layout. */
4057 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
4061 mutex_unlock(&lli->lli_layout_mutex);
/* No cached lock: build op_data for a fresh layout enqueue. */
4065 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4066 0, 0, LUSTRE_OPC_ANY, NULL);
4067 if (IS_ERR(op_data)) {
/* Drop the layout mutex on the error path before returning. */
4068 mutex_unlock(&lli->lli_layout_mutex);
4069 RETURN(PTR_ERR(op_data));
4072 /* have to enqueue one */
4073 memset(&it, 0, sizeof(it));
4074 it.it_op = IT_LAYOUT;
4075 lockh.cookie = 0ULL;
4077 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4078 ll_get_fsname(inode->i_sb, NULL, 0),
4079 PFID(&lli->lli_fid), inode);
/* Enqueue the layout lock on the MDT; the layout itself is returned in
 * the lock's LVB (see ll_layout_lock_set for how it is consumed). */
4081 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, &it, op_data, &lockh, 0);
/* The enqueue reply request is no longer needed once done. */
4082 if (it.d.lustre.it_data != NULL)
4083 ptlrpc_req_finished(it.d.lustre.it_data);
4084 it.d.lustre.it_data = NULL;
4086 ll_finish_md_op_data(op_data);
/* Take the lock mode out of the intent and clear it — presumably so
 * ll_intent_drop_lock() below does not release the lock reference we
 * still need; verify against ll_intent_drop_lock(). */
4088 mode = it.d.lustre.it_lock_mode;
4089 it.d.lustre.it_lock_mode = 0;
4090 ll_intent_drop_lock(&it);
4093 /* set lock data in case this is a new lock */
4094 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
/* Install the freshly enqueued layout and report its generation. */
4095 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
4099 mutex_unlock(&lli->lli_layout_mutex);
4105 * This function send a restore request to the MDT
4107 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4109 struct hsm_user_request *hur;
4113 len = sizeof(struct hsm_user_request) +
4114 sizeof(struct hsm_user_item);
4115 OBD_ALLOC(hur, len);
4119 hur->hur_request.hr_action = HUA_RESTORE;
4120 hur->hur_request.hr_archive_id = 0;
4121 hur->hur_request.hr_flags = 0;
4122 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4123 sizeof(hur->hur_user_item[0].hui_fid));
4124 hur->hur_user_item[0].hui_extent.offset = offset;
4125 hur->hur_user_item[0].hui_extent.length = length;
4126 hur->hur_request.hr_itemcount = 1;
4127 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,