4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2013, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include <linux/sched.h>
49 #include "llite_internal.h"
50 #include <lustre/ll_fiemap.h>
51 #include <lustre_ioctl.h>
53 #include "cl_object.h"
56 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
58 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
61 static enum llioc_iter
62 ll_iocontrol_call(struct inode *inode, struct file *file,
63 unsigned int cmd, unsigned long arg, int *rcp);
/*
 * Allocate a fresh per-open-file ll_file_data from ll_file_data_slab
 * using GFP_NOFS (safe to call from filesystem context).
 * NOTE(review): this view is truncated — the NULL check on the slab
 * allocation and the RETURN of @fd are not visible here; confirm in
 * the full source.
 */
65 static struct ll_file_data *ll_file_data_get(void)
67 	struct ll_file_data *fd;
69 	OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
	/* Start every descriptor with a clean write-error state. */
73 	fd->fd_write_failed = false;
/*
 * Return a ll_file_data to ll_file_data_slab. Counterpart of
 * ll_file_data_get(). NOTE(review): any LASSERT/guard between the
 * signature and the free is not visible in this truncated view.
 */
78 static void ll_file_data_put(struct ll_file_data *fd)
81 	OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Copy the inode's current attributes (mode, times, size, blocks,
 * flags, ioepoch) plus the open handle @fh and an MDS capability into
 * @op_data, in preparation for a close/setattr RPC to the MDS.
 */
84 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
85 struct lustre_handle *fh)
87 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
88 op_data->op_attr.ia_mode = inode->i_mode;
89 op_data->op_attr.ia_atime = inode->i_atime;
90 op_data->op_attr.ia_mtime = inode->i_mtime;
91 op_data->op_attr.ia_ctime = inode->i_ctime;
92 op_data->op_attr.ia_size = i_size_read(inode);
93 op_data->op_attr_blocks = inode->i_blocks;
/* Translate in-kernel inode flags to the on-wire (ext) flag encoding. */
94 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
95 ll_inode_to_ext_flags(inode->i_flags);
96 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
98 op_data->op_handle = *fh;
99 op_data->op_capa1 = ll_mdscapa_get(inode);
/* Tell the MDS that cached data was modified under this open. */
101 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
102 op_data->op_bias |= MDS_DATA_MODIFIED;
106 * Closes the IO epoch and packs all the attributes into @op_data for
 * the close RPC. Size/blocks are only sent when SOM (size-on-MDS) is
 * not in use for this file; for write opens the epoch is closed via
 * ll_ioepoch_close().
109 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
110 struct obd_client_handle *och)
/* Always send mode and all three timestamps with the close. */
114 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
115 ATTR_MTIME | ATTR_MTIME_SET |
116 ATTR_CTIME | ATTR_CTIME_SET;
/* NOTE(review): the branch bodies between these conditions are
 * truncated in this view — confirm control flow in the full source. */
118 if (!(och->och_flags & FMODE_WRITE))
121 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
122 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
124 ll_ioepoch_close(inode, op_data, &och, 0);
127 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
128 ll_prep_md_op_data(op_data, inode, NULL, NULL,
129 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Send the MDS close RPC for open handle @och on @inode and clean up
 * replay data. If @data_version is non-NULL this close is an HSM
 * release (MDS_HSM_RELEASE). Returns 0 or a negative errno.
 * Frees @och unless it must wait for DONE_WRITING (SOM case).
 * NOTE(review): several lines (braces, RETURNs, some conditions) are
 * missing from this truncated view; comments below describe only what
 * is visible.
 */
133 static int ll_close_inode_openhandle(struct obd_export *md_exp,
135 struct obd_client_handle *och,
136 const __u64 *data_version)
138 struct obd_export *exp = ll_i2mdexp(inode);
139 struct md_op_data *op_data;
140 struct ptlrpc_request *req = NULL;
141 struct obd_device *obd = class_exp2obd(exp);
148 * XXX: in case of LMV, is this correct to access
151 CERROR("Invalid MDC connection handle "LPX64"\n",
152 ll_i2mdexp(inode)->exp_handle.h_cookie);
156 OBD_ALLOC_PTR(op_data);
/* Pre-existing known defect, acknowledged in-tree: on allocation
 * failure the openhandle and request are leaked. */
158 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
160 ll_prepare_close(inode, op_data, och);
161 if (data_version != NULL) {
162 /* Pass in data_version implies release. */
163 op_data->op_bias |= MDS_HSM_RELEASE;
164 op_data->op_data_version = *data_version;
165 op_data->op_lease_handle = och->och_lease_handle;
166 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
168 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
169 rc = md_close(md_exp, op_data, och->och_mod, &req);
171 /* This close must have the epoch closed. */
172 LASSERT(epoch_close);
173 /* MDS has instructed us to obtain Size-on-MDS attribute from
174 * OSTs and send setattr to back to MDS. */
175 rc = ll_som_update(inode, op_data);
177 CERROR("%s: inode "DFID" mdc Size-on-MDS update"
178 " failed: rc = %d\n",
179 ll_i2mdexp(inode)->exp_obd->obd_name,
180 PFID(ll_inode2fid(inode)), rc);
184 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
185 ll_i2mdexp(inode)->exp_obd->obd_name,
186 PFID(ll_inode2fid(inode)), rc);
189 /* DATA_MODIFIED flag was successfully sent on close, cancel data
190 * modification flag. */
191 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
192 struct ll_inode_info *lli = ll_i2info(inode);
194 spin_lock(&lli->lli_lock);
195 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
196 spin_unlock(&lli->lli_lock);
/* Destroy OST objects the MDS told us to clean up in the close reply. */
200 rc = ll_objects_destroy(req, inode);
202 CERROR("%s: inode "DFID
203 " ll_objects destroy: rc = %d\n",
204 ll_i2mdexp(inode)->exp_obd->obd_name,
205 PFID(ll_inode2fid(inode)), rc);
208 if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
209 struct mdt_body *body;
210 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
211 if (!(body->mbo_valid & OBD_MD_FLRELEASED))
215 ll_finish_md_op_data(op_data);
/* SOM write-open whose epoch is still open: defer the final close
 * until DONE_WRITING is sent. */
219 if (exp_connect_som(exp) && !epoch_close &&
220 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
221 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
223 md_clear_open_replay_data(md_exp, och);
224 /* Free @och if it is not waiting for DONE_WRITING. */
225 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
228 if (req) /* This is close request */
229 ptlrpc_req_finished(req);
/*
 * Drop the per-inode MDS open handle matching @fmode (write, exec or
 * read). The handle is only closed on the MDS when its use count has
 * reached zero; otherwise this is a no-op for this caller.
 * NOTE(review): the handle-detach logic between the usecount check and
 * ll_close_inode_openhandle() is truncated in this view.
 */
233 int ll_md_real_close(struct inode *inode, fmode_t fmode)
235 struct ll_inode_info *lli = ll_i2info(inode);
236 struct obd_client_handle **och_p;
237 struct obd_client_handle *och;
/* Select the handle slot and use counter for this open mode. */
242 if (fmode & FMODE_WRITE) {
243 och_p = &lli->lli_mds_write_och;
244 och_usecount = &lli->lli_open_fd_write_count;
245 } else if (fmode & FMODE_EXEC) {
246 och_p = &lli->lli_mds_exec_och;
247 och_usecount = &lli->lli_open_fd_exec_count;
249 LASSERT(fmode & FMODE_READ);
250 och_p = &lli->lli_mds_read_och;
251 och_usecount = &lli->lli_open_fd_read_count;
254 mutex_lock(&lli->lli_och_mutex);
255 if (*och_usecount > 0) {
256 /* There are still users of this handle, so skip
258 mutex_unlock(&lli->lli_och_mutex);
264 mutex_unlock(&lli->lli_och_mutex);
267 /* There might be a race and this handle may already
269 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-struct-file close: release group lock, lease and private open
 * handle held by this file descriptor, decrement the per-mode open
 * counters, and — unless a matching OPEN DLM lock lets us skip it —
 * send the real close to the MDS via ll_md_real_close().
 * Finally detaches and frees the ll_file_data.
 */
276 static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
279 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
280 struct ll_inode_info *lli = ll_i2info(inode);
284 /* clear group lock, if present */
285 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
286 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
288 if (fd->fd_lease_och != NULL) {
291 /* Usually the lease is not released when the
292 * application crashed, we need to release here. */
293 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
294 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
295 PFID(&lli->lli_fid), rc, lease_broken);
297 fd->fd_lease_och = NULL;
/* Close the private open handle taken over for a lease, if any. */
300 if (fd->fd_och != NULL) {
301 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
306 /* Let's see if we have good enough OPEN lock on the file and if
307 we can skip talking to MDS */
308 if (file->f_dentry->d_inode) { /* Can this ever be false? */
310 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
311 struct lustre_handle lockh;
312 struct inode *inode = file->f_dentry->d_inode;
313 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
/* Drop this fd's reference on the per-mode open counter. */
315 mutex_lock(&lli->lli_och_mutex);
316 if (fd->fd_omode & FMODE_WRITE) {
318 LASSERT(lli->lli_open_fd_write_count);
319 lli->lli_open_fd_write_count--;
320 } else if (fd->fd_omode & FMODE_EXEC) {
322 LASSERT(lli->lli_open_fd_exec_count);
323 lli->lli_open_fd_exec_count--;
326 LASSERT(lli->lli_open_fd_read_count);
327 lli->lli_open_fd_read_count--;
329 mutex_unlock(&lli->lli_och_mutex);
/* No cached OPEN ibits lock: must do the real MDS close now. */
331 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
332 LDLM_IBITS, &policy, lockmode,
334 rc = ll_md_real_close(file->f_dentry->d_inode,
338 CERROR("released file has negative dentry: file = %p, "
339 "dentry = %p, name = %s\n",
340 file, file->f_dentry, file->f_dentry->d_name.name);
344 LUSTRE_FPRIVATE(file) = NULL;
345 ll_file_data_put(fd);
346 ll_capa_close(inode);
351 /* While this returns an error code, fput() the caller does not, so we need
352 * to make every effort to clean up all of our state here. Also, applications
353 * rarely check close errors and even if an error is returned they will not
354 * re-try the close call.
 *
 * VFS ->release() entry point: tears down remote-ACL state, statahead
 * authorization, async write errors and the MDS open handle for @file.
 */
356 int ll_file_release(struct inode *inode, struct file *file)
358 struct ll_file_data *fd;
359 struct ll_sb_info *sbi = ll_i2sbi(inode);
360 struct ll_inode_info *lli = ll_i2info(inode);
364 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
365 PFID(ll_inode2fid(inode)), inode);
367 #ifdef CONFIG_FS_POSIX_ACL
/* Remote-client ACL cleanup applies only to the root inode. */
368 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
369 inode == inode->i_sb->s_root->d_inode) {
370 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
373 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
374 fd->fd_flags &= ~LL_FILE_RMTACL;
375 rct_del(&sbi->ll_rct, current_pid());
376 et_search_free(&sbi->ll_et, current_pid());
381 if (inode->i_sb->s_root != file->f_dentry)
382 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
383 fd = LUSTRE_FPRIVATE(file);
386 /* The last ref on @file, maybe not the the owner pid of statahead,
387 * because parent and child process can share the same file handle. */
388 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
389 ll_deauthorize_statahead(inode, fd);
/* The root inode has no MDS open handle: just drop the fd. */
391 if (inode->i_sb->s_root == file->f_dentry) {
392 LUSTRE_FPRIVATE(file) = NULL;
393 ll_file_data_put(fd);
/* Surface any deferred async write error to this close() caller. */
397 if (!S_ISDIR(inode->i_mode)) {
398 if (lli->lli_clob != NULL)
399 lov_read_and_clear_async_rc(lli->lli_clob);
400 lli->lli_async_rc = 0;
403 rc = ll_md_close(sbi->ll_md_exp, inode, file);
405 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
406 libcfs_debug_dumplog();
/*
 * Issue an intent-based OPEN to the MDS for @file (optionally carrying
 * striping data @lmm/@lmmsize), then instantiate the inode from the
 * reply and attach the returned DLM lock. Returns 0 or negative errno.
 */
411 static int ll_intent_file_open(struct file *file, void *lmm, int lmmsize,
412 struct lookup_intent *itp)
414 struct dentry *de = file->f_dentry;
415 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
416 struct dentry *parent = de->d_parent;
417 const char *name = NULL;
419 struct md_op_data *op_data;
420 struct ptlrpc_request *req = NULL;
424 LASSERT(parent != NULL);
425 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
427 /* if server supports open-by-fid, or file name is invalid, don't pack
428 * name in open request */
429 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
430 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
431 name = de->d_name.name;
432 len = de->d_name.len;
435 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
436 name, len, 0, LUSTRE_OPC_ANY, NULL);
438 RETURN(PTR_ERR(op_data));
439 op_data->op_data = lmm;
440 op_data->op_data_size = lmmsize;
442 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
443 &ll_md_blocking_ast, 0);
444 ll_finish_md_op_data(op_data);
446 /* reason for keep own exit path - don`t flood log
447 * with messages with -ESTALE errors.
 * NOTE(review): the -ESTALE handling branch is truncated here. */
449 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
450 it_open_error(DISP_OPEN_OPEN, itp))
452 ll_release_openhandle(de, itp);
456 if (it_disposition(itp, DISP_LOOKUP_NEG))
457 GOTO(out, rc = -ENOENT);
459 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
460 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
461 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Update/create the inode from the reply, then record lock data. */
465 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
466 if (!rc && itp->d.lustre.it_lock_mode)
467 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
470 ptlrpc_req_finished(req);
471 ll_intent_drop_lock(itp);
477 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
478 * not believe attributes if a few ioepoch holders exist. Attributes for
479 * previous ioepoch if new one is opened are also skipped by MDS.
481 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
/* Only record a non-zero epoch that differs from the current one. */
483 if (ioepoch && lli->lli_ioepoch != ioepoch) {
484 lli->lli_ioepoch = ioepoch;
485 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
486 ioepoch, PFID(&lli->lli_fid));
/*
 * Populate @och (open handle, fid, lease-lock cookie, magic, flags)
 * from the MDS reply carried in @it, and register open replay data so
 * the open can be replayed after MDS recovery. Returns the
 * md_set_open_replay_data() result.
 */
490 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
491 struct obd_client_handle *och)
493 struct ptlrpc_request *req = it->d.lustre.it_data;
494 struct mdt_body *body;
496 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
497 och->och_fh = body->mbo_handle;
498 och->och_fid = body->mbo_fid1;
499 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
500 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
501 och->och_flags = it->it_flags;
503 return md_set_open_replay_data(md_exp, och, it);
/*
 * Finish the client-side part of an open: fill @och from the intent
 * reply (when @och is non-NULL), record the ioepoch, attach @fd as the
 * file's private data, and initialize readahead and cl-context state.
 */
506 static int ll_local_open(struct file *file, struct lookup_intent *it,
507 struct ll_file_data *fd, struct obd_client_handle *och)
509 struct inode *inode = file->f_dentry->d_inode;
510 struct ll_inode_info *lli = ll_i2info(inode);
513 LASSERT(!LUSTRE_FPRIVATE(file));
518 struct ptlrpc_request *req = it->d.lustre.it_data;
519 struct mdt_body *body;
522 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
526 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
527 ll_ioepoch_open(lli, body->mbo_ioepoch);
530 LUSTRE_FPRIVATE(file) = fd;
531 ll_readahead_init(inode, &fd->fd_ras);
/* Remember the effective open mode for close-time accounting. */
532 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
534 /* ll_cl_context initialize */
535 rwlock_init(&fd->fd_lock);
536 INIT_LIST_HEAD(&fd->fd_lccs);
541 /* Open a file, and (for the very first open) create objects on the OSTs at
542 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
543 * creation or open until ll_lov_setstripe() ioctl is called.
545 * If we already have the stripe MD locally then we don't request it in
546 * md_open(), by passing a lmm_size = 0.
548 * It is up to the application to ensure no other processes open this file
549 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
550 * used. We might be able to avoid races of that sort by getting lli_open_sem
551 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
552 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 *
 * VFS ->open() entry point. Reuses an existing per-mode MDS open handle
 * when one is cached on the inode; otherwise performs an intent open.
 * NOTE(review): this view is truncated — several branch bodies, GOTO
 * labels and RETURNs are missing; comments describe visible code only.
 */
554 int ll_file_open(struct inode *inode, struct file *file)
556 struct ll_inode_info *lli = ll_i2info(inode);
557 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
558 .it_flags = file->f_flags };
559 struct obd_client_handle **och_p = NULL;
560 __u64 *och_usecount = NULL;
561 struct ll_file_data *fd;
565 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
566 PFID(ll_inode2fid(inode)), inode, file->f_flags);
568 it = file->private_data; /* XXX: compat macro */
569 file->private_data = NULL; /* prevent ll_local_open assertion */
571 fd = ll_file_data_get();
573 GOTO(out_openerr, rc = -ENOMEM);
576 if (S_ISDIR(inode->i_mode))
577 ll_authorize_statahead(inode, fd);
/* The root dentry never takes an MDS open handle. */
579 if (inode->i_sb->s_root == file->f_dentry) {
580 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup: build our own IT_OPEN intent. */
584 if (!it || !it->d.lustre.it_disposition) {
585 /* Convert f_flags into access mode. We cannot use file->f_mode,
586 * because everything but O_ACCMODE mask was stripped from
588 if ((oit.it_flags + 1) & O_ACCMODE)
590 if (file->f_flags & O_TRUNC)
591 oit.it_flags |= FMODE_WRITE;
593 /* kernel only call f_op->open in dentry_open. filp_open calls
594 * dentry_open after call to open_namei that checks permissions.
595 * Only nfsd_open call dentry_open directly without checking
596 * permissions and because of that this code below is safe. */
597 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
598 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
600 /* We do not want O_EXCL here, presumably we opened the file
601 * already? XXX - NFS implications? */
602 oit.it_flags &= ~O_EXCL;
604 /* bug20584, if "it_flags" contains O_CREAT, the file will be
605 * created if necessary, then "IT_CREAT" should be set to keep
606 * consistent with it */
607 if (oit.it_flags & O_CREAT)
608 oit.it_op |= IT_CREAT;
614 /* Let's see if we have file open on MDS already. */
615 if (it->it_flags & FMODE_WRITE) {
616 och_p = &lli->lli_mds_write_och;
617 och_usecount = &lli->lli_open_fd_write_count;
618 } else if (it->it_flags & FMODE_EXEC) {
619 och_p = &lli->lli_mds_exec_och;
620 och_usecount = &lli->lli_open_fd_exec_count;
622 och_p = &lli->lli_mds_read_och;
623 och_usecount = &lli->lli_open_fd_read_count;
626 mutex_lock(&lli->lli_och_mutex);
627 if (*och_p) { /* Open handle is present */
628 if (it_disposition(it, DISP_OPEN_OPEN)) {
629 /* Well, there's extra open request that we do not need,
630 let's close it somehow. This will decref request. */
631 rc = it_open_error(DISP_OPEN_OPEN, it);
633 mutex_unlock(&lli->lli_och_mutex);
634 GOTO(out_openerr, rc);
637 ll_release_openhandle(file->f_dentry, it);
/* Reuse the cached handle: local-only open, no extra MDS RPC. */
641 rc = ll_local_open(file, it, fd, NULL);
644 mutex_unlock(&lli->lli_och_mutex);
645 GOTO(out_openerr, rc);
648 LASSERT(*och_usecount == 0);
649 if (!it->d.lustre.it_disposition) {
650 /* We cannot just request lock handle now, new ELC code
651 means that one of other OPEN locks for this file
652 could be cancelled, and since blocking ast handler
653 would attempt to grab och_mutex as well, that would
654 result in a deadlock */
655 mutex_unlock(&lli->lli_och_mutex);
657 * Normally called under two situations:
659 * 2. A race/condition on MDS resulting in no open
660 * handle to be returned from LOOKUP|OPEN request,
661 * for example if the target entry was a symlink.
663 * Always fetch MDS_OPEN_LOCK if this is not setstripe.
665 * Always specify MDS_OPEN_BY_FID because we don't want
666 * to get file with different fid.
668 it->it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID;
669 rc = ll_intent_file_open(file, NULL, 0, it);
671 GOTO(out_openerr, rc);
675 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
677 GOTO(out_och_free, rc = -ENOMEM);
681 /* md_intent_lock() didn't get a request ref if there was an
682 * open error, so don't do cleanup on the request here
684 /* XXX (green): Should not we bail out on any error here, not
685 * just open error? */
686 rc = it_open_error(DISP_OPEN_OPEN, it);
688 GOTO(out_och_free, rc);
690 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
691 "inode %p: disposition %x, status %d\n", inode,
692 it_disposition(it, ~0), it->d.lustre.it_status);
694 rc = ll_local_open(file, it, fd, *och_p);
696 GOTO(out_och_free, rc);
698 mutex_unlock(&lli->lli_och_mutex);
701 /* Must do this outside lli_och_mutex lock to prevent deadlock where
702 different kind of OPEN lock for this same inode gets cancelled
703 by ldlm_cancel_lru */
704 if (!S_ISREG(inode->i_mode))
705 GOTO(out_och_free, rc);
/* O_LOV_DELAY_CREATE or read-only open of an unstriped file:
 * postpone OST object creation until setstripe or first write. */
709 if (!lli->lli_has_smd &&
710 (cl_is_lov_delay_create(file->f_flags) ||
711 (file->f_mode & FMODE_WRITE) == 0)) {
712 CDEBUG(D_INODE, "object creation was delayed\n");
713 GOTO(out_och_free, rc);
715 cl_lov_delay_create_clear(&file->f_flags);
716 GOTO(out_och_free, rc);
/* Error-path cleanup below (out_och_free / out_openerr labels are
 * not visible in this truncated view). */
720 if (och_p && *och_p) {
721 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
722 *och_p = NULL; /* OBD_FREE writes some magic there */
725 mutex_unlock(&lli->lli_och_mutex);
728 if (lli->lli_opendir_key == fd)
729 ll_deauthorize_statahead(inode, fd);
731 ll_file_data_put(fd);
733 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
736 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
737 ptlrpc_req_finished(it->d.lustre.it_data);
738 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for lease DLM locks: on a blocking callback, cancel the
 * lease lock asynchronously. Unlike ll_md_blocking_ast() it does not
 * manage the open handle (see the comment in ll_lease_open()).
 * NOTE(review): the LDLM_CB_CANCELING arm is truncated in this view.
 */
744 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
745 struct ldlm_lock_desc *desc, void *data, int flag)
748 struct lustre_handle lockh;
752 case LDLM_CB_BLOCKING:
753 ldlm_lock2handle(lock, &lockh);
754 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
756 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
760 case LDLM_CB_CANCELING:
768 * Acquire a lease and open the file.
 *
 * When @file is non-NULL, tries to convert that descriptor's existing
 * MDS open handle into the lease open (requires this fd to be the sole
 * opener in its mode). Sends an IT_OPEN intent with MDS_OPEN_LEASE and
 * returns the new obd_client_handle, or an ERR_PTR on failure.
 * NOTE(review): allocation of @och and several branch bodies are
 * truncated in this view.
770 static struct obd_client_handle *
771 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
774 struct lookup_intent it = { .it_op = IT_OPEN };
775 struct ll_sb_info *sbi = ll_i2sbi(inode);
776 struct md_op_data *op_data;
777 struct ptlrpc_request *req = NULL;
778 struct lustre_handle old_handle = { 0 };
779 struct obd_client_handle *och = NULL;
/* A lease is either read or write, never both and never exec. */
784 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
785 RETURN(ERR_PTR(-EINVAL));
788 struct ll_inode_info *lli = ll_i2info(inode);
789 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
790 struct obd_client_handle **och_p;
793 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
794 RETURN(ERR_PTR(-EPERM));
796 /* Get the openhandle of the file */
798 mutex_lock(&lli->lli_och_mutex);
/* Only one lease per file descriptor. */
799 if (fd->fd_lease_och != NULL) {
800 mutex_unlock(&lli->lli_och_mutex);
804 if (fd->fd_och == NULL) {
805 if (file->f_mode & FMODE_WRITE) {
806 LASSERT(lli->lli_mds_write_och != NULL);
807 och_p = &lli->lli_mds_write_och;
808 och_usecount = &lli->lli_open_fd_write_count;
810 LASSERT(lli->lli_mds_read_och != NULL);
811 och_p = &lli->lli_mds_read_och;
812 och_usecount = &lli->lli_open_fd_read_count;
/* Take over the shared handle only if we are its sole user. */
814 if (*och_usecount == 1) {
821 mutex_unlock(&lli->lli_och_mutex);
822 if (rc < 0) /* more than 1 opener */
825 LASSERT(fd->fd_och != NULL);
826 old_handle = fd->fd_och->och_fh;
831 RETURN(ERR_PTR(-ENOMEM));
833 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
834 LUSTRE_OPC_ANY, NULL);
836 GOTO(out, rc = PTR_ERR(op_data));
838 /* To tell the MDT this openhandle is from the same owner */
839 op_data->op_handle = old_handle;
841 it.it_flags = fmode | open_flags;
842 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
843 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
844 &ll_md_blocking_lease_ast,
845 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
846 * it can be cancelled which may mislead applications that the lease is
848 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
849 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
850 * doesn't deal with openhandle, so normal openhandle will be leaked. */
851 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
852 ll_finish_md_op_data(op_data);
853 ptlrpc_req_finished(req);
855 GOTO(out_release_it, rc);
857 if (it_disposition(&it, DISP_LOOKUP_NEG))
858 GOTO(out_release_it, rc = -ENOENT);
860 rc = it_open_error(DISP_OPEN_OPEN, &it);
862 GOTO(out_release_it, rc);
864 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
865 ll_och_fill(sbi->ll_md_exp, &it, och);
/* A server that granted the open without DISP_OPEN_LEASE predates
 * lease support. */
867 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
868 GOTO(out_close, rc = -EOPNOTSUPP);
870 /* already get lease, handle lease lock */
871 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
872 if (it.d.lustre.it_lock_mode == 0 ||
873 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
874 /* open lock must return for lease */
875 CERROR(DFID "lease granted but no open lock, %d/"LPU64".\n",
876 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
877 it.d.lustre.it_lock_bits);
878 GOTO(out_close, rc = -EPROTO);
881 ll_intent_release(&it);
885 /* Cancel open lock */
886 if (it.d.lustre.it_lock_mode != 0) {
887 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
888 it.d.lustre.it_lock_mode);
889 it.d.lustre.it_lock_mode = 0;
890 och->och_lease_handle.cookie = 0ULL;
892 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
894 CERROR("%s: error closing file "DFID": %d\n",
895 ll_get_fsname(inode->i_sb, NULL, 0),
896 PFID(&ll_i2info(inode)->lli_fid), rc2);
897 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
899 ll_intent_release(&it);
907 * Release lease and close the file.
908 * It will check if the lease has ever broken.
 *
 * If the lease lock was never cancelled (application still held it),
 * cancel it explicitly before closing the open handle. *@lease_broken
 * reports whether the lease had been broken by a conflicting access.
910 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
913 struct ldlm_lock *lock;
914 bool cancelled = true;
/* Inspect the lease lock's cancel flag under the resource lock.
 * NOTE(review): the NULL check on ldlm_handle2lock() and the
 * LDLM_LOCK_PUT are not visible in this truncated view. */
918 lock = ldlm_handle2lock(&och->och_lease_handle);
920 lock_res_and_lock(lock);
921 cancelled = ldlm_is_cancel(lock);
922 unlock_res_and_lock(lock);
926 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
927 PFID(&ll_i2info(inode)->lli_fid), cancelled);
930 ldlm_cli_cancel(&och->och_lease_handle, 0);
931 if (lease_broken != NULL)
932 *lease_broken = cancelled;
934 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
939 /* Fills the obdo with the attributes for the lsm */
 /*
  * Issue an async getattr to the OSTs for stripe metadata @lsm and wait
  * for completion. @dv_flags (LL_DV_RD_FLUSH/LL_DV_WR_FLUSH) request
  * server-side lock/flush semantics for data-version reads.
  */
940 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
941 struct obd_capa *capa, struct obdo *obdo,
942 __u64 ioepoch, int dv_flags)
944 struct ptlrpc_request_set *set;
945 struct obd_info oinfo = { { { 0 } } };
950 LASSERT(lsm != NULL);
954 oinfo.oi_oa->o_oi = lsm->lsm_oi;
955 oinfo.oi_oa->o_mode = S_IFREG;
956 oinfo.oi_oa->o_ioepoch = ioepoch;
957 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
958 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
959 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
960 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
961 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
962 OBD_MD_FLDATAVERSION;
963 oinfo.oi_capa = capa;
/* Flush requests take a server-side lock on the OSTs. */
964 if (dv_flags & (LL_DV_WR_FLUSH | LL_DV_RD_FLUSH)) {
965 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
966 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
967 if (dv_flags & LL_DV_WR_FLUSH)
968 oinfo.oi_oa->o_flags |= OBD_FL_FLUSH;
971 set = ptlrpc_prep_set();
973 CERROR("cannot allocate ptlrpc set: rc = %d\n", -ENOMEM);
976 rc = obd_getattr_async(exp, &oinfo, set);
978 rc = ptlrpc_set_wait(set);
979 ptlrpc_set_destroy(set);
/* Keep only the attributes the caller may trust from this reply. */
982 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
983 OBD_MD_FLATIME | OBD_MD_FLMTIME |
984 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
985 OBD_MD_FLDATAVERSION | OBD_MD_FLFLAGS);
/* A requested write-flush that the server did not confirm is an
 * error; the branch body is truncated in this view. */
986 if (dv_flags & LL_DV_WR_FLUSH &&
987 !(oinfo.oi_oa->o_valid & OBD_MD_FLFLAGS &&
988 oinfo.oi_oa->o_flags & OBD_FL_FLUSH))
995 * Performs the getattr on the inode and updates its fields.
996 * If @sync != 0, perform the getattr under the server-side lock.
998 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
999 __u64 ioepoch, int sync)
1001 struct obd_capa *capa = ll_mdscapa_get(inode);
1002 struct lov_stripe_md *lsm;
1006 lsm = ccc_inode_lsm_get(inode);
1007 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
1008 capa, obdo, ioepoch, sync ? LL_DV_RD_FLUSH : 0);
1011 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
/* On success, fold the OST attributes back into the VFS inode. */
1013 obdo_refresh_inode(inode, obdo, obdo->o_valid);
1014 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
1015 " blksize %lu\n", POSTID(oi), i_size_read(inode),
1016 (unsigned long long)inode->i_blocks,
1017 1UL << inode->i_blkbits);
1019 ccc_inode_lsm_put(inode, lsm);
/*
 * Merge the MDS-provided timestamps cached in lli_lvb with the
 * attributes obtained from the OSTs via the cl_object layer, taking
 * the newest of each timestamp, and update the inode's size/blocks
 * under the inode size lock.
 */
1023 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
1025 struct ll_inode_info *lli = ll_i2info(inode);
1026 struct cl_object *obj = lli->lli_clob;
1027 struct cl_attr *attr = ccc_env_thread_attr(env);
1033 ll_inode_size_lock(inode);
1034 /* merge timestamps the most recently obtained from mds with
1035 timestamps obtained from osts */
1036 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
1037 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
1038 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
1039 inode_init_lvb(inode, &lvb);
1041 cl_object_attr_lock(obj);
1042 rc = cl_object_attr_get(env, obj, attr);
1043 cl_object_attr_unlock(obj);
/* Take the most recent of each timestamp (MDS vs. OST). */
1046 if (lvb.lvb_atime < attr->cat_atime)
1047 lvb.lvb_atime = attr->cat_atime;
1048 if (lvb.lvb_ctime < attr->cat_ctime)
1049 lvb.lvb_ctime = attr->cat_ctime;
1050 if (lvb.lvb_mtime < attr->cat_mtime)
1051 lvb.lvb_mtime = attr->cat_mtime;
1053 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
1054 PFID(&lli->lli_fid), attr->cat_size);
1055 cl_isize_write_nolock(inode, attr->cat_size);
1057 inode->i_blocks = attr->cat_blocks;
1059 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1060 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1061 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1063 ll_inode_size_unlock(inode);
/*
 * Glimpse helper for ioctls: fetch current OST attributes for @lsm and
 * copy size/blocks/timestamps into the caller-supplied struct stat.
 */
1068 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1071 struct obdo obdo = { 0 };
1074 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1076 st->st_size = obdo.o_size;
1077 st->st_blocks = obdo.o_blocks;
1078 st->st_mtime = obdo.o_mtime;
1079 st->st_atime = obdo.o_atime;
1080 st->st_ctime = obdo.o_ctime;
/*
 * Decide whether atime updates should be suppressed for @file, checking
 * the open flags, inode flags, mount options and superblock flags.
 * NOTE(review): the individual "return true/false" lines between the
 * conditions are truncated in this view.
 */
1085 static bool file_is_noatime(const struct file *file)
1087 const struct vfsmount *mnt = file->f_path.mnt;
1088 const struct inode *inode = file->f_path.dentry->d_inode;
1090 /* Adapted from file_accessed() and touch_atime().*/
1091 if (file->f_flags & O_NOATIME)
1094 if (inode->i_flags & S_NOATIME)
1097 if (IS_NOATIME(inode))
1100 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1103 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1106 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/*
 * Initialize a cl_io for a read (@write == 0) or write (@write != 0)
 * on @file: nonblocking/append/sync flags, the cl_object, the lock
 * request mode (never for nolock files, mandatory for O_APPEND) and
 * the noatime decision.
 */
1112 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
1114 struct inode *inode = file->f_dentry->d_inode;
1116 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1118 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1119 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1120 file->f_flags & O_DIRECT ||
1123 io->ci_obj = ll_i2info(inode)->lli_clob;
1124 io->ci_lockreq = CILR_MAYBE;
1125 if (ll_file_nolock(file)) {
1126 io->ci_lockreq = CILR_NEVER;
1127 io->ci_no_srvlock = 1;
1128 } else if (file->f_flags & O_APPEND) {
1129 io->ci_lockreq = CILR_MANDATORY;
1132 io->ci_noatime = file_is_noatime(file);
/*
 * Common driver for all read/write paths (normal iov and splice):
 * builds a cl_io, runs the cl_io loop, handles the lli_write_mutex /
 * lli_trunc_sem locking for normal writes, supports one restart on
 * lock conflict when nothing was transferred, and tallies per-sb
 * read/write statistics. Returns bytes transferred or negative errno.
 * NOTE(review): braces, the restart GOTO target and RETURN are
 * truncated in this view.
 */
1136 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1137 struct file *file, enum cl_io_type iot,
1138 loff_t *ppos, size_t count)
1140 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
1141 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1146 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zd\n",
1147 file->f_dentry->d_name.name, iot, *ppos, count);
1150 io = ccc_env_thread_io(env);
1151 ll_io_init(io, file, iot == CIT_WRITE);
1153 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1154 struct vvp_io *vio = vvp_env_io(env);
1155 struct ccc_io *cio = ccc_env_io(env);
1156 int write_mutex_locked = 0;
1158 cio->cui_fd = LUSTRE_FPRIVATE(file);
1159 vio->cui_io_subtype = args->via_io_subtype;
1161 switch (vio->cui_io_subtype) {
1163 cio->cui_iov = args->u.normal.via_iov;
1164 cio->cui_nrsegs = args->u.normal.via_nrsegs;
1165 cio->cui_tot_nrsegs = cio->cui_nrsegs;
1166 cio->cui_iocb = args->u.normal.via_iocb;
/* Serialize writes unless the caller holds a group lock. */
1167 if ((iot == CIT_WRITE) &&
1168 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1169 if (mutex_lock_interruptible(&lli->
1171 GOTO(out, result = -ERESTARTSYS);
1172 write_mutex_locked = 1;
1174 down_read(&lli->lli_trunc_sem);
1177 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1178 vio->u.splice.cui_flags = args->u.splice.via_flags;
1181 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
1185 ll_cl_add(file, env, io);
1186 result = cl_io_loop(env, io);
1187 ll_cl_remove(file, env);
1189 if (args->via_io_subtype == IO_NORMAL)
1190 up_read(&lli->lli_trunc_sem);
1191 if (write_mutex_locked)
1192 mutex_unlock(&lli->lli_write_mutex);
1194 /* cl_io_rw_init() handled IO */
1195 result = io->ci_result;
1198 if (io->ci_nob > 0) {
1199 result = io->ci_nob;
1200 *ppos = io->u.ci_wr.wr.crw_pos;
1204 cl_io_fini(env, io);
1205 /* If any bit been read/written (result != 0), we just return
1206 * short read/write instead of restart io. */
1207 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1208 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
1209 iot == CIT_READ ? "read" : "write",
1210 file->f_dentry->d_name.name, *ppos, count);
1211 LASSERTF(io->ci_nob == 0, "%zd\n", io->ci_nob);
1215 if (iot == CIT_READ) {
1217 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1218 LPROC_LL_READ_BYTES, result);
1219 } else if (iot == CIT_WRITE) {
1221 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1222 LPROC_LL_WRITE_BYTES, result);
/* Track write failure state for fsync error reporting; an
 * interrupted write (-ERESTARTSYS) is not counted as a failure. */
1223 fd->fd_write_failed = false;
1224 } else if (result != -ERESTARTSYS) {
1225 fd->fd_write_failed = true;
1228 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
/*
 * Validate a userspace iovec before starting read/write I/O.
 *
 * Sums all segment lengths (into the elided *count output) while
 * rejecting negative lengths or signed wrap of the cumulative total,
 * and verifies each segment base with access_ok().
 * NOTE(review): interior lines are elided from this view; presumably
 * *nr_segs/*count are truncated at the first bad segment, as in the
 * kernel original named below — confirm against the full source.
 */
1235  * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1237 static int ll_file_get_iov_count(const struct iovec *iov,
1238 unsigned long *nr_segs, size_t *count)
1243 for (seg = 0; seg < *nr_segs; seg++) {
1244 const struct iovec *iv = &iov[seg];
1247 * If any segment has a negative length, or the cumulative
1248 * length ever wraps negative then return -EINVAL.
/* OR-ing the running total with the new length catches either value
 * having its sign bit set in a single signed comparison. */
1251 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1253 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1258 cnt -= iv->iov_len; /* This segment is no good */
/*
 * AIO read entry point.
 *
 * Validates the user iovec via ll_file_get_iov_count(), fills a
 * vvp_io_args (IO_NORMAL) from a per-thread cl_env, and drives the
 * generic client I/O path with CIT_READ.  The file position is taken
 * from (and advanced through) iocb->ki_pos, not the 'pos' argument.
 */
1265 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1266 unsigned long nr_segs, loff_t pos)
1269 struct vvp_io_args *args;
1275 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1279 env = cl_env_get(&refcheck);
1281 RETURN(PTR_ERR(env));
1283 args = vvp_env_args(env, IO_NORMAL);
/* const is cast away here; the lower layers take a mutable iovec
 * pointer but (presumably) do not modify caller data — TODO confirm. */
1284 args->u.normal.via_iov = (struct iovec *)iov;
1285 args->u.normal.via_nrsegs = nr_segs;
1286 args->u.normal.via_iocb = iocb;
1288 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1289 &iocb->ki_pos, count);
1290 cl_env_put(env, &refcheck);
/*
 * Synchronous read(2) entry point.
 *
 * Wraps the user buffer into a single-segment iovec and a synchronous
 * kiocb kept in the per-thread vvp env, then delegates to
 * ll_file_aio_read().  *ppos is updated from kiocb->ki_pos afterwards.
 * HAVE_KIOCB_KI_LEFT selects between the older ki_left and newer
 * ki_nbytes field names across kernel versions.
 */
1294 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1298 struct iovec *local_iov;
1299 struct kiocb *kiocb;
1304 env = cl_env_get(&refcheck);
1306 RETURN(PTR_ERR(env));
1308 local_iov = &vvp_env_info(env)->vti_local_iov;
1309 kiocb = &vvp_env_info(env)->vti_kiocb;
1310 local_iov->iov_base = (void __user *)buf;
1311 local_iov->iov_len = count;
1312 init_sync_kiocb(kiocb, file);
1313 kiocb->ki_pos = *ppos;
1314 #ifdef HAVE_KIOCB_KI_LEFT
1315 kiocb->ki_left = count;
1317 kiocb->ki_nbytes = count;
1320 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1321 *ppos = kiocb->ki_pos;
1323 cl_env_put(env, &refcheck);
/*
 * AIO write entry point: mirror of ll_file_aio_read() with CIT_WRITE.
 * Validates the iovec, sets up IO_NORMAL vvp_io_args, and runs the
 * generic client I/O path; position is carried in iocb->ki_pos.
 */
1328  * Write to a file (through the page cache).
1331 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1332 unsigned long nr_segs, loff_t pos)
1335 struct vvp_io_args *args;
1341 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1345 env = cl_env_get(&refcheck);
1347 RETURN(PTR_ERR(env));
1349 args = vvp_env_args(env, IO_NORMAL);
/* const cast as in ll_file_aio_read() — lower layers take a mutable
 * iovec pointer. */
1350 args->u.normal.via_iov = (struct iovec *)iov;
1351 args->u.normal.via_nrsegs = nr_segs;
1352 args->u.normal.via_iocb = iocb;
1354 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1355 &iocb->ki_pos, count);
1356 cl_env_put(env, &refcheck);
/*
 * Synchronous write(2) entry point.
 *
 * Same pattern as ll_file_read(): wrap the user buffer in a one-segment
 * iovec plus a sync kiocb from the vvp env and hand off to
 * ll_file_aio_write(); *ppos is refreshed from kiocb->ki_pos on return.
 */
1360 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1364 struct iovec *local_iov;
1365 struct kiocb *kiocb;
1370 env = cl_env_get(&refcheck);
1372 RETURN(PTR_ERR(env));
1374 local_iov = &vvp_env_info(env)->vti_local_iov;
1375 kiocb = &vvp_env_info(env)->vti_kiocb;
1376 local_iov->iov_base = (void __user *)buf;
1377 local_iov->iov_len = count;
1378 init_sync_kiocb(kiocb, file);
1379 kiocb->ki_pos = *ppos;
1380 #ifdef HAVE_KIOCB_KI_LEFT
1381 kiocb->ki_left = count;
1383 kiocb->ki_nbytes = count;
1386 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1387 *ppos = kiocb->ki_pos;
1389 cl_env_put(env, &refcheck);
/*
 * splice_read entry point: reads file data into a pipe.  Uses the
 * IO_SPLICE variant of vvp_io_args (pipe + flags) and runs the generic
 * client I/O path as a CIT_READ.
 */
1394  * Send file content (through pagecache) somewhere with helper
1396 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1397 struct pipe_inode_info *pipe, size_t count,
1401 struct vvp_io_args *args;
1406 env = cl_env_get(&refcheck);
1408 RETURN(PTR_ERR(env));
1410 args = vvp_env_args(env, IO_SPLICE);
1411 args->u.splice.via_pipe = pipe;
1412 args->u.splice.via_flags = flags;
1414 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1415 cl_env_put(env, &refcheck);
/*
 * Recreate the OST object(s) of an inode on a given OST index.
 *
 * Builds a private copy of the inode's lov_stripe_md and an obdo
 * carrying OBD_FL_RECREATE_OBJS, then calls obd_create() under the
 * inode size lock.  Returns -ENOENT if the file has no objects.
 * NOTE(review): allocation/validation of 'oa' and the lsm2 obdo id
 * setup are in elided lines — the o_nlink field visibly carries the
 * target ost_idx (an overloaded use).
 */
1419 static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
1422 struct obd_export *exp = ll_i2dtexp(inode);
1423 struct obd_trans_info oti = { 0 };
1424 struct obdo *oa = NULL;
1427 struct lov_stripe_md *lsm = NULL, *lsm2;
1434 lsm = ccc_inode_lsm_get(inode);
1435 if (!lsm_has_objects(lsm))
1436 GOTO(out, rc = -ENOENT);
/* lsm2 must hold the header plus one lov_oinfo per stripe. */
1438 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1439 (lsm->lsm_stripe_count));
1441 OBD_ALLOC_LARGE(lsm2, lsm_size);
1443 GOTO(out, rc = -ENOMEM);
1446 oa->o_nlink = ost_idx;
1447 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1448 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1449 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1450 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1451 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1452 memcpy(lsm2, lsm, lsm_size);
/* size lock serializes against concurrent truncate/size updates while
 * the objects are recreated. */
1453 ll_inode_size_lock(inode);
1454 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1455 ll_inode_size_unlock(inode);
1457 OBD_FREE_LARGE(lsm2, lsm_size);
1460 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_RECREATE_OBJ handler: copy a ll_recreate_obj request from
 * userspace, build an MDT0-sequence ost_id from its object id, and
 * recreate the object on the requested OST index.  Root-only
 * (CFS_CAP_SYS_ADMIN).
 */
1465 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1467 struct ll_recreate_obj ucreat;
1471 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1474 if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1478 ostid_set_seq_mdt0(&oi);
1479 ostid_set_id(&oi, ucreat.lrc_id);
1480 RETURN(ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx));
/*
 * LL_IOC_RECREATE_FID handler: copy a lu_fid from userspace, convert
 * it to an ost_id, and recreate the object.  The OST index is encoded
 * in bits 16..31 of the FID sequence.  Root-only (CFS_CAP_SYS_ADMIN).
 */
1483 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1490 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1493 if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1496 fid_to_ostid(&fid, &oi);
1497 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1498 RETURN(ll_lov_recreate(inode, &oi, ost_idx));
/*
 * Apply a user-supplied striping EA (lov_user_md) to a file by
 * re-opening it with an IT_OPEN intent carrying the layout.
 *
 * Fails with -EEXIST if the inode already has a layout.  The open is
 * done by FID (MDS_OPEN_BY_FID) under the inode size lock; the open
 * handle obtained as a side effect is released immediately via
 * ll_release_openhandle().  On all paths the delay-create flag is
 * cleared from file->f_flags.
 */
1501 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1502 __u64 flags, struct lov_user_md *lum,
1505 struct lov_stripe_md *lsm = NULL;
1506 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1510 lsm = ccc_inode_lsm_get(inode);
1512 ccc_inode_lsm_put(inode, lsm);
1513 CDEBUG(D_IOCTL, "stripe already exists for inode "DFID"\n",
1514 PFID(ll_inode2fid(inode)));
1515 GOTO(out, rc = -EEXIST);
1518 ll_inode_size_lock(inode);
1519 oit.it_flags |= MDS_OPEN_BY_FID;
1520 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1522 GOTO(out_unlock, rc);
/* it_status carries the MDS-side open result. */
1523 rc = oit.d.lustre.it_status;
1525 GOTO(out_req_free, rc);
1527 ll_release_openhandle(file->f_dentry, &oit);
1530 ll_inode_size_unlock(inode);
1531 ll_intent_release(&oit);
1532 ccc_inode_lsm_put(inode, lsm);
1534 cl_lov_delay_create_clear(&file->f_flags);
/* out_req_free path: drop the intent's request before unwinding. */
1537 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA for 'filename' (relative to 'inode') from the MDS.
 *
 * On success *lmmp points into the reply buffer of *request (caller
 * must keep the request until done with the EA and then release it),
 * and *lmm_size is set.  Only LOV_MAGIC_V1/V3 layouts are accepted
 * (-EPROTO otherwise); -ENODATA if no EA is present.  On little-endian
 * mismatch the EA and, for regular files, its per-object array are
 * byte-swapped to host order before being returned.
 */
1541 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1542 struct lov_mds_md **lmmp, int *lmm_size,
1543 struct ptlrpc_request **request)
1545 struct ll_sb_info *sbi = ll_i2sbi(inode);
1546 struct mdt_body *body;
1547 struct lov_mds_md *lmm = NULL;
1548 struct ptlrpc_request *req = NULL;
1549 struct md_op_data *op_data;
1552 rc = ll_get_default_mdsize(sbi, &lmmsize);
1556 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1557 strlen(filename), lmmsize,
1558 LUSTRE_OPC_ANY, NULL);
1559 if (IS_ERR(op_data))
1560 RETURN(PTR_ERR(op_data));
1562 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1563 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1564 ll_finish_md_op_data(op_data);
1566 CDEBUG(D_INFO, "md_getattr_name failed "
1567 "on %s: rc %d\n", filename, rc);
1571 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1572 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1574 lmmsize = body->mbo_eadatasize;
1576 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1578 GOTO(out, rc = -ENODATA);
1581 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1582 LASSERT(lmm != NULL);
1584 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1585 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1586 GOTO(out, rc = -EPROTO);
1590  * This is coming from the MDS, so is probably in
1591  * little endian. We convert it to host endian before
1592  * passing it to userspace.
/* Only swab on big-endian hosts: the condition below is false when
 * host order already matches the wire (little-endian) order. */
1594 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1597 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1598 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1601 /* if function called for directory - we should
1602  * avoid swab not existent lsm objects */
1603 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1604 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1605 if (S_ISREG(body->mbo_mode))
1606 lustre_swab_lov_user_md_objects(
1607 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1609 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1610 lustre_swab_lov_user_md_v3(
1611 (struct lov_user_md_v3 *)lmm);
1612 if (S_ISREG(body->mbo_mode))
1613 lustre_swab_lov_user_md_objects(
1614 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1621 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: copy a lov_user_md (with one ost_data
 * entry) from userspace and apply it via ll_lov_setstripe_ea_info()
 * with MDS_OPEN_HAS_OBJS | FMODE_WRITE.  Root-only
 * (CFS_CAP_SYS_ADMIN).  The temporary buffer is freed on all paths.
 */
1626 static int ll_lov_setea(struct inode *inode, struct file *file,
1629 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1630 struct lov_user_md *lump;
1631 int lum_size = sizeof(struct lov_user_md) +
1632 sizeof(struct lov_user_ost_data);
1636 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1639 OBD_ALLOC_LARGE(lump, lum_size);
1643 if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1644 OBD_FREE_LARGE(lump, lum_size);
1648 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1650 OBD_FREE_LARGE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE handler.
 *
 * Reads the user layout first as a v1 lov_user_md (the smaller form),
 * then re-reads as v3 if the magic says so, and applies it via
 * ll_lov_setstripe_ea_info().  On success the user's stripe_count is
 * zeroed (put_user), the layout generation is refreshed, and a
 * GETSTRIPE iocontrol copies the resulting layout back to userspace.
 */
1654 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1657 struct lov_user_md_v3 lumv3;
1658 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1659 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1660 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1662 __u64 flags = FMODE_WRITE;
1665 /* first try with v1 which is smaller than v3 */
1666 lum_size = sizeof(struct lov_user_md_v1);
1667 if (copy_from_user(lumv1, lumv1p, lum_size))
1670 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1671 lum_size = sizeof(struct lov_user_md_v3);
1672 if (copy_from_user(&lumv3, lumv3p, lum_size))
1676 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1678 struct lov_stripe_md *lsm;
/* NOTE(review): put_user return value is not checked here. */
1681 put_user(0, &lumv1p->lmm_stripe_count);
1683 ll_layout_refresh(inode, &gen);
1684 lsm = ccc_inode_lsm_get(inode);
1685 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1686 0, lsm, (void *)arg);
1687 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_LOV_GETSTRIPE handler: pin the inode's stripe metadata and
 * let the LOV iocontrol copy the layout out to the user buffer in
 * 'arg'; the lsm reference is dropped afterwards.
 */
1692 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1694 struct lov_stripe_md *lsm;
1698 lsm = ccc_inode_lsm_get(inode);
1700 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1702 ccc_inode_lsm_put(inode, lsm);
/*
 * Take a group lock (gid = 'arg') on behalf of this open file.
 *
 * Refuses if the file is nolock (-EOPNOTSUPP) or if this fd already
 * holds a group lock.  The DLM lock is acquired outside lli_lock via
 * cl_get_grouplock(); the flag and handle are then published under
 * lli_lock, re-checking for a concurrent winner and dropping our lock
 * if another thread got there first.
 */
1707 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1709 struct ll_inode_info *lli = ll_i2info(inode);
1710 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1711 struct ccc_grouplock grouplock;
1715 if (ll_file_nolock(file))
1716 RETURN(-EOPNOTSUPP);
1718 spin_lock(&lli->lli_lock);
1719 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1720 CWARN("group lock already existed with gid %lu\n",
1721 fd->fd_grouplock.cg_gid);
1722 spin_unlock(&lli->lli_lock);
1725 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1726 spin_unlock(&lli->lli_lock);
/* Blocking enqueue — must not be done under the spinlock.  O_NONBLOCK
 * makes the enqueue non-blocking. */
1728 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1729 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1733 spin_lock(&lli->lli_lock);
1734 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1735 spin_unlock(&lli->lli_lock);
1736 CERROR("another thread just won the race\n");
1737 cl_put_grouplock(&grouplock);
1741 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1742 fd->fd_grouplock = grouplock;
1743 spin_unlock(&lli->lli_lock);
1745 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * Release the group lock held by this open file.
 *
 * Under lli_lock: verify a group lock is held and that its gid matches
 * 'arg', then clear the fd state; the actual DLM lock is dropped via
 * cl_put_grouplock() after the spinlock is released (a local copy of
 * the handle is taken first).
 */
1749 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1751 struct ll_inode_info *lli = ll_i2info(inode);
1752 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1753 struct ccc_grouplock grouplock;
1756 spin_lock(&lli->lli_lock);
1757 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1758 spin_unlock(&lli->lli_lock);
1759 CWARN("no group lock held\n");
1762 LASSERT(fd->fd_grouplock.cg_lock != NULL);
1764 if (fd->fd_grouplock.cg_gid != arg) {
1765 CWARN("group lock %lu doesn't match current id %lu\n",
1766 arg, fd->fd_grouplock.cg_gid);
1767 spin_unlock(&lli->lli_lock);
1771 grouplock = fd->fd_grouplock;
1772 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1773 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1774 spin_unlock(&lli->lli_lock);
1776 cl_put_grouplock(&grouplock);
1777 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1782  * Close inode open handle
1784  * \param dentry [in] dentry which contains the inode
1785  * \param it [in,out] intent which contains open info and result
 *
 * No-op for the filesystem root and for intents without an
 * DISP_OPEN_OPEN disposition.  Otherwise allocates a transient
 * obd_client_handle filled from the intent and closes it on the MDS;
 * any request pinned by DISP_ENQ_OPEN_REF is released afterwards.
 *
1788  * \retval <0 failure
1790 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1792 struct inode *inode = dentry->d_inode;
1793 struct obd_client_handle *och;
1799 /* Root ? Do nothing. */
1800 if (dentry->d_inode->i_sb->s_root == dentry)
1803 /* No open handle to close? Move away */
1804 if (!it_disposition(it, DISP_OPEN_OPEN))
1807 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1809 OBD_ALLOC(och, sizeof(*och));
1811 GOTO(out, rc = -ENOMEM);
1813 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1815 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1818 /* this one is in place of ll_file_open */
1819 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1820 ptlrpc_req_finished(it->d.lustre.it_data);
1821 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1827  * Get size for inode for which FIEMAP mapping is requested.
1828  * Make the FIEMAP get_info call and returns the result.
 *
 * Rejects unsupported fiemap flags (reporting the supported set back
 * in fm_flags), honours FIEMAP_FLAG_SYNC by flushing dirty pages
 * first, and requires FIEMAP_FLAG_DEVICE_ORDER for striped files
 * (stripe_count > 1).  The actual extent mapping is obtained from the
 * data export via obd_get_info(KEY_FIEMAP); a zero-size file maps to
 * zero extents without a server call.
 *
1830 static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1833 struct obd_export *exp = ll_i2dtexp(inode);
1834 struct lov_stripe_md *lsm = NULL;
1835 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1836 __u32 vallen = num_bytes;
1840 /* Checks for fiemap flags */
1841 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* Report back which flags we do support, per FIEMAP convention. */
1842 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1846 /* Check for FIEMAP_FLAG_SYNC */
1847 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1848 rc = filemap_fdatawrite(inode->i_mapping);
1853 lsm = ccc_inode_lsm_get(inode);
1857 /* If the stripe_count > 1 and the application does not understand
1858  * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1860 if (lsm->lsm_stripe_count > 1 &&
1861 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1862 GOTO(out, rc = -EOPNOTSUPP);
1864 fm_key.oa.o_oi = lsm->lsm_oi;
1865 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1867 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1868 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1869 /* If filesize is 0, then there would be no objects for mapping */
1870 if (fm_key.oa.o_size == 0) {
1871 fiemap->fm_mapped_extents = 0;
1875 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1877 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1880 CERROR("obd_get_info failed: rc = %d\n", rc);
1883 ccc_inode_lsm_put(inode, lsm);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a path name via the MDC.
 *
 * Permitted for CAP_DAC_READ_SEARCH or when the LL_SBI_USER_FID2PATH
 * mount flag is set.  The user-provided gf_pathlen (capped at
 * PATH_MAX) sizes the variable-length output buffer; the filled
 * getinfo_fid2path is copied back whole.
 */
1887 int ll_fid2path(struct inode *inode, void __user *arg)
1889 struct obd_export *exp = ll_i2mdexp(inode);
1890 const struct getinfo_fid2path __user *gfin = arg;
1892 struct getinfo_fid2path *gfout;
1898 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1899 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1902 /* Only need to get the buflen */
1903 if (get_user(pathlen, &gfin->gf_pathlen))
1906 if (pathlen > PATH_MAX)
1909 outsize = sizeof(*gfout) + pathlen;
1910 OBD_ALLOC(gfout, outsize);
1914 if (copy_from_user(gfout, arg, sizeof(*gfout)))
1915 GOTO(gf_free, rc = -EFAULT);
1917 /* Call mdc_iocontrol */
1918 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1922 if (copy_to_user(arg, gfout, outsize))
1926 OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP handler: marshal a variable-length fiemap request
 * between userspace and ll_do_fiemap().
 *
 * Sizes the kernel buffer from the user's fm_extent_count (with an
 * explicit SIZE_MAX overflow guard), copies the header in — plus the
 * first extent when fm_extent_count != 0, since it seeds end_offset /
 * device for continued mappings — runs the mapping, and copies the
 * header and mapped extents back out.
 */
1930 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1932 struct ll_user_fiemap *fiemap_s;
1933 size_t num_bytes, ret_bytes;
1934 unsigned int extent_count;
1937 /* Get the extent count so we can calculate the size of
1938  * required fiemap buffer */
1939 if (get_user(extent_count,
1940 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1944 (SIZE_MAX - sizeof(*fiemap_s)) / sizeof(struct ll_fiemap_extent))
1946 num_bytes = sizeof(*fiemap_s) + (extent_count *
1947 sizeof(struct ll_fiemap_extent));
1949 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1950 if (fiemap_s == NULL)
1953 /* get the fiemap value */
1954 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1956 GOTO(error, rc = -EFAULT);
1958 /* If fm_extent_count is non-zero, read the first extent since
1959  * it is used to calculate end_offset and device from previous
1962 if (copy_from_user(&fiemap_s->fm_extents[0],
1963 (char __user *)arg + sizeof(*fiemap_s),
1964 sizeof(struct ll_fiemap_extent)))
1965 GOTO(error, rc = -EFAULT);
1968 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1972 ret_bytes = sizeof(struct ll_user_fiemap);
1974 if (extent_count != 0)
1975 ret_bytes += (fiemap_s->fm_mapped_extents *
1976 sizeof(struct ll_fiemap_extent));
1978 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1982 OBD_FREE_LARGE(fiemap_s, num_bytes);
1987  * Read the data_version for inode.
1989  * This value is computed using stripe object version on OST.
1990  * Version is computed using server side locking.
1992  * @param sync if do sync on the OST side;
1994  *             LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1995  *             LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
 *
 * A file with no stripe objects reports version 0 (success).  The
 * version is fetched via ll_lsm_getattr() into a temporary obdo; it is
 * only copied out if the OST set OBD_MD_FLDATAVERSION in o_valid.
 *
1997 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
1999 struct lov_stripe_md *lsm = NULL;
2000 struct ll_sb_info *sbi = ll_i2sbi(inode);
2001 struct obdo *obdo = NULL;
2005 /* If no stripe, we consider version is 0. */
2006 lsm = ccc_inode_lsm_get(inode);
2007 if (!lsm_has_objects(lsm)) {
2009 CDEBUG(D_INODE, "No object for inode\n");
2013 OBD_ALLOC_PTR(obdo);
2015 GOTO(out, rc = -ENOMEM);
2017 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, flags);
2019 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
2022 *data_version = obdo->o_data_version;
2028 ccc_inode_lsm_put(inode, lsm);
2033  * Trigger a HSM release request for the provided inode.
 *
 * Takes a write lease (MDS_OPEN_RELEASE) on the file, grabs the latest
 * data_version with LL_DV_WR_FLUSH (dropping cached pages under LCK_PW
 * on the OSTs), merges attribute data via ll_merge_lvb() in a nested
 * cl_env, and then closes the open handle carrying the release to the
 * MDT.  The lease lock handle itself is released during the close pack
 * (see the in-body comment).  On error paths the lease, if obtained,
 * is closed via ll_lease_close().
 *
2035 int ll_hsm_release(struct inode *inode)
2037 struct cl_env_nest nest;
2039 struct obd_client_handle *och = NULL;
2040 __u64 data_version = 0;
2044 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2045 ll_get_fsname(inode->i_sb, NULL, 0),
2046 PFID(&ll_i2info(inode)->lli_fid));
2048 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2050 GOTO(out, rc = PTR_ERR(och));
2052 /* Grab latest data_version and [am]time values */
2053 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2057 env = cl_env_nested_get(&nest);
2059 GOTO(out, rc = PTR_ERR(env));
2061 ll_merge_lvb(env, inode);
2062 cl_env_nested_put(&nest, env);
2064 /* Release the file.
2065  * NB: lease lock handle is released in mdc_hsm_release_pack() because
2066  * we still need it to pack l_remote_handle to MDT. */
2067 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
2073 if (och != NULL && !IS_ERR(och)) /* close the file */
2074 ll_lease_close(och, inode, NULL);
/*
 * Scratch state for ll_swap_layouts(): saved [am]time attrs, the two
 * inodes (possibly reordered by FID to serialize locking), and the
 * data-version check flags/values, which may be swapped alongside the
 * inodes.
 */
2079 struct ll_swap_stack {
2080 struct iattr ia1, ia2;
2082 struct inode *inode1, *inode2;
2083 bool check_dv1, check_dv2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS implementation: atomically exchange the
 * layouts of two regular files on the same filesystem.
 *
 * Steps: validate (regular file, MAY_WRITE on both, same sb), order
 * the pair by FID so lock acquisition is deterministic (swapping the
 * matching dv/check flags too), optionally take a group lock on both
 * to flush dirty cache, verify requested data versions still match
 * (-EAGAIN otherwise), send the swap to the MDT through
 * obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS), then drop the group locks
 * and restore mtime/atime if SWAP_LAYOUTS_KEEP_* was requested.
 */
2086 static int ll_swap_layouts(struct file *file1, struct file *file2,
2087 struct lustre_swap_layouts *lsl)
2089 struct mdc_swap_layouts msl;
2090 struct md_op_data *op_data;
2093 struct ll_swap_stack *llss = NULL;
2096 OBD_ALLOC_PTR(llss);
2100 llss->inode1 = file1->f_dentry->d_inode;
2101 llss->inode2 = file2->f_dentry->d_inode;
2103 if (!S_ISREG(llss->inode2->i_mode))
2104 GOTO(free, rc = -EINVAL);
2106 if (inode_permission(llss->inode1, MAY_WRITE) ||
2107 inode_permission(llss->inode2, MAY_WRITE))
2108 GOTO(free, rc = -EPERM);
2110 if (llss->inode2->i_sb != llss->inode1->i_sb)
2111 GOTO(free, rc = -EXDEV);
2113 /* we use 2 bool because it is easier to swap than 2 bits */
2114 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2115 llss->check_dv1 = true;
2117 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2118 llss->check_dv2 = true;
2120 /* we cannot use lsl->sl_dvX directly because we may swap them */
2121 llss->dv1 = lsl->sl_dv1;
2122 llss->dv2 = lsl->sl_dv2;
2124 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2125 if (rc == 0) /* same file, done! */
/* Canonical ordering by FID avoids lock-order inversion between two
 * concurrent swaps of the same pair. */
2128 if (rc < 0) { /* sequentialize it */
2129 swap(llss->inode1, llss->inode2);
2131 swap(llss->dv1, llss->dv2);
2132 swap(llss->check_dv1, llss->check_dv2);
2136 if (gid != 0) { /* application asks to flush dirty cache */
2137 rc = ll_get_grouplock(llss->inode1, file1, gid);
2141 rc = ll_get_grouplock(llss->inode2, file2, gid);
2143 ll_put_grouplock(llss->inode1, file1, gid);
2148 /* to be able to restore mtime and atime after swap
2149  * we need to first save them */
2151 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2152 llss->ia1.ia_mtime = llss->inode1->i_mtime;
2153 llss->ia1.ia_atime = llss->inode1->i_atime;
2154 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2155 llss->ia2.ia_mtime = llss->inode2->i_mtime;
2156 llss->ia2.ia_atime = llss->inode2->i_atime;
2157 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2160 /* ultimate check, before swaping the layouts we check if
2161  * dataversion has changed (if requested) */
2162 if (llss->check_dv1) {
2163 rc = ll_data_version(llss->inode1, &dv, 0);
2166 if (dv != llss->dv1)
2167 GOTO(putgl, rc = -EAGAIN);
2170 if (llss->check_dv2) {
2171 rc = ll_data_version(llss->inode2, &dv, 0);
2174 if (dv != llss->dv2)
2175 GOTO(putgl, rc = -EAGAIN);
2178 /* struct md_op_data is used to send the swap args to the mdt
2179  * only flags is missing, so we use struct mdc_swap_layouts
2180  * through the md_op_data->op_data */
2181 /* flags from user space have to be converted before they are send to
2182  * server, no flag is sent today, they are only used on the client */
2185 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2186 0, LUSTRE_OPC_ANY, &msl);
2187 if (IS_ERR(op_data))
2188 GOTO(free, rc = PTR_ERR(op_data));
2190 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2191 sizeof(*op_data), op_data, NULL);
2192 ll_finish_md_op_data(op_data);
2196 ll_put_grouplock(llss->inode2, file2, gid);
2197 ll_put_grouplock(llss->inode1, file1, gid);
2200 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2204 /* clear useless flags */
2205 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2206 llss->ia1.ia_valid &= ~ATTR_MTIME;
2207 llss->ia2.ia_valid &= ~ATTR_MTIME;
2210 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2211 llss->ia1.ia_valid &= ~ATTR_ATIME;
2212 llss->ia2.ia_valid &= ~ATTR_ATIME;
/* Note the layouts were exchanged, so file1 now receives the times
 * saved from inode2 (ia2) and vice versa. */
2215 /* update time if requested */
2217 if (llss->ia2.ia_valid != 0) {
2218 mutex_lock(&llss->inode1->i_mutex);
2219 rc = ll_setattr(file1->f_dentry, &llss->ia2);
2220 mutex_unlock(&llss->inode1->i_mutex);
2223 if (llss->ia1.ia_valid != 0) {
2226 mutex_lock(&llss->inode2->i_mutex);
2227 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2228 mutex_unlock(&llss->inode2->i_mutex);
/*
 * Send an HSM state-set request (hss) for 'inode' to the MDT.
 * Non-root callers may only touch flags within HSM_USER_MASK.
 */
2240 static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2242 struct md_op_data *op_data;
2245 /* Non-root users are forbidden to set or clear flags which are
2246  * NOT defined in HSM_USER_MASK. */
2247 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2248 !cfs_capable(CFS_CAP_SYS_ADMIN))
2251 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2252 LUSTRE_OPC_ANY, hss);
2253 if (IS_ERR(op_data))
2254 RETURN(PTR_ERR(op_data));
2256 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2257 sizeof(*op_data), op_data, NULL);
2259 ll_finish_md_op_data(op_data);
/*
 * Import a file into HSM: mark it ARCHIVED|EXISTS|RELEASED in the
 * requested archive, then restore the user-supplied attributes
 * (mode/uid/gid/size/mtime/atime) on the inode via ll_setattr_raw()
 * with ATTR_FORCE, under i_mutex.  Regular files only.
 */
2264 static int ll_hsm_import(struct inode *inode, struct file *file,
2265 struct hsm_user_import *hui)
2267 struct hsm_state_set *hss = NULL;
2268 struct iattr *attr = NULL;
2272 if (!S_ISREG(inode->i_mode))
2278 GOTO(out, rc = -ENOMEM);
2280 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2281 hss->hss_archive_id = hui->hui_archive_id;
2282 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2283 rc = ll_hsm_state_set(inode, hss);
2287 OBD_ALLOC_PTR(attr);
2289 GOTO(out, rc = -ENOMEM);
/* Force a regular-file mode; only permission bits come from the user. */
2291 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2292 attr->ia_mode |= S_IFREG;
2293 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2294 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2295 attr->ia_size = hui->hui_size;
2296 attr->ia_mtime.tv_sec = hui->hui_mtime;
2297 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2298 attr->ia_atime.tv_sec = hui->hui_atime;
2299 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2301 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2302 ATTR_UID | ATTR_GID |
2303 ATTR_MTIME | ATTR_MTIME_SET |
2304 ATTR_ATIME | ATTR_ATIME_SET;
2306 mutex_lock(&inode->i_mutex);
2308 rc = ll_setattr_raw(file->f_dentry, attr, true);
2312 mutex_unlock(&inode->i_mutex);
/* Map an open fmode to the LL_LEASE_{RD,WR}LCK bits reported to
 * userspace by the lease ioctls. */
2324 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2326 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2327 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * Main ioctl dispatcher for regular Lustre files.
 *
 * Handles fd flag get/set, LOV striping (set/get/setea/swap/recreate),
 * fiemap, group locks, FID<->path translation, data version, MDT
 * index, HSM state/action/import, and lease set/get.  Unknown commands
 * fall through to the registered ll_iocontrol_call() handlers and
 * finally to the data export's obd_iocontrol().  Many case bodies have
 * elided lines in this view (allocations, RETURNs, closing braces).
 */
2331 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2333 struct inode *inode = file->f_dentry->d_inode;
2334 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2338 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2339 PFID(ll_inode2fid(inode)), inode, cmd);
2340 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2342 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2343 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2347 case LL_IOC_GETFLAGS:
2348 /* Get the current value of the file flags */
2349 return put_user(fd->fd_flags, (int *)arg);
2350 case LL_IOC_SETFLAGS:
2351 case LL_IOC_CLRFLAGS:
2352 /* Set or clear specific file flags */
2353 /* XXX This probably needs checks to ensure the flags are
2354  * not abused, and to handle any flag side effects.
2356 if (get_user(flags, (int *) arg))
2359 if (cmd == LL_IOC_SETFLAGS) {
/* IGNORE_LOCK is only sane for O_DIRECT files: otherwise cached
 * pages would be accessed without lock coverage. */
2360 if ((flags & LL_FILE_IGNORE_LOCK) &&
2361 !(file->f_flags & O_DIRECT)) {
2362 CERROR("%s: unable to disable locking on "
2363 "non-O_DIRECT file\n", current->comm);
2367 fd->fd_flags |= flags;
2369 fd->fd_flags &= ~flags;
2372 case LL_IOC_LOV_SETSTRIPE:
2373 RETURN(ll_lov_setstripe(inode, file, arg));
2374 case LL_IOC_LOV_SETEA:
2375 RETURN(ll_lov_setea(inode, file, arg));
2376 case LL_IOC_LOV_SWAP_LAYOUTS: {
2378 struct lustre_swap_layouts lsl;
2380 if (copy_from_user(&lsl, (char *)arg,
2381 sizeof(struct lustre_swap_layouts)))
/* Both files must be open for writing to swap layouts. */
2384 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2387 file2 = fget(lsl.sl_fd);
2392 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2393 rc = ll_swap_layouts(file, file2, &lsl);
2397 case LL_IOC_LOV_GETSTRIPE:
2398 RETURN(ll_lov_getstripe(inode, arg));
2399 case LL_IOC_RECREATE_OBJ:
2400 RETURN(ll_lov_recreate_obj(inode, arg));
2401 case LL_IOC_RECREATE_FID:
2402 RETURN(ll_lov_recreate_fid(inode, arg));
2403 case FSFILT_IOC_FIEMAP:
2404 RETURN(ll_ioctl_fiemap(inode, arg));
2405 case FSFILT_IOC_GETFLAGS:
2406 case FSFILT_IOC_SETFLAGS:
2407 RETURN(ll_iocontrol(inode, file, cmd, arg));
2408 case FSFILT_IOC_GETVERSION_OLD:
2409 case FSFILT_IOC_GETVERSION:
2410 RETURN(put_user(inode->i_generation, (int *)arg));
2411 case LL_IOC_GROUP_LOCK:
2412 RETURN(ll_get_grouplock(inode, file, arg));
2413 case LL_IOC_GROUP_UNLOCK:
2414 RETURN(ll_put_grouplock(inode, file, arg));
2415 case IOC_OBD_STATFS:
2416 RETURN(ll_obd_statfs(inode, (void *)arg));
2418 /* We need to special case any other ioctls we want to handle,
2419  * to send them to the MDS/OST as appropriate and to properly
2420  * network encode the arg field.
2421 case FSFILT_IOC_SETVERSION_OLD:
2422 case FSFILT_IOC_SETVERSION:
2424 case LL_IOC_FLUSHCTX:
2425 RETURN(ll_flush_ctx(inode));
2426 case LL_IOC_PATH2FID: {
2427 if (copy_to_user((void *)arg, ll_inode2fid(inode),
2428 sizeof(struct lu_fid)))
2433 case OBD_IOC_FID2PATH:
2434 RETURN(ll_fid2path(inode, (void *)arg));
2435 case LL_IOC_DATA_VERSION: {
2436 struct ioc_data_version idv;
2439 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
/* Only the two flush flags are meaningful; mask everything else. */
2442 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2443 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2445 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2451 case LL_IOC_GET_MDTIDX: {
2454 mdtidx = ll_get_mdt_idx(inode);
2458 if (put_user((int)mdtidx, (int*)arg))
2463 case OBD_IOC_GETDTNAME:
2464 case OBD_IOC_GETMDNAME:
2465 RETURN(ll_get_obd_name(inode, cmd, arg));
2466 case LL_IOC_HSM_STATE_GET: {
2467 struct md_op_data *op_data;
2468 struct hsm_user_state *hus;
2475 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2476 LUSTRE_OPC_ANY, hus);
2477 if (IS_ERR(op_data)) {
2479 RETURN(PTR_ERR(op_data));
2482 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2485 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2488 ll_finish_md_op_data(op_data);
2492 case LL_IOC_HSM_STATE_SET: {
2493 struct hsm_state_set *hss;
2500 if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2505 rc = ll_hsm_state_set(inode, hss);
2510 case LL_IOC_HSM_ACTION: {
2511 struct md_op_data *op_data;
2512 struct hsm_current_action *hca;
2519 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2520 LUSTRE_OPC_ANY, hca);
2521 if (IS_ERR(op_data)) {
2523 RETURN(PTR_ERR(op_data));
2526 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2529 if (copy_to_user((char *)arg, hca, sizeof(*hca)))
2532 ll_finish_md_op_data(op_data);
2536 case LL_IOC_SET_LEASE: {
2537 struct ll_inode_info *lli = ll_i2info(inode);
2538 struct obd_client_handle *och = NULL;
/* Requested lease mode must be compatible with the open mode. */
2543 case LL_LEASE_WRLCK:
2544 if (!(file->f_mode & FMODE_WRITE))
2546 fmode = FMODE_WRITE;
2548 case LL_LEASE_RDLCK:
2549 if (!(file->f_mode & FMODE_READ))
2553 case LL_LEASE_UNLCK:
2554 mutex_lock(&lli->lli_och_mutex);
2555 if (fd->fd_lease_och != NULL) {
2556 och = fd->fd_lease_och;
2557 fd->fd_lease_och = NULL;
2559 mutex_unlock(&lli->lli_och_mutex);
2564 fmode = och->och_flags;
2565 rc = ll_lease_close(och, inode, &lease_broken);
/* Report the lease type that was just released. */
2572 RETURN(ll_lease_type_from_fmode(fmode));
2577 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2579 /* apply for lease */
2580 och = ll_lease_open(inode, file, fmode, 0);
2582 RETURN(PTR_ERR(och));
2585 mutex_lock(&lli->lli_och_mutex);
2586 if (fd->fd_lease_och == NULL) {
2587 fd->fd_lease_och = och;
2590 mutex_unlock(&lli->lli_och_mutex);
2592 /* impossible now that only excl is supported for now */
2593 ll_lease_close(och, inode, &lease_broken);
2598 case LL_IOC_GET_LEASE: {
2599 struct ll_inode_info *lli = ll_i2info(inode);
2600 struct ldlm_lock *lock = NULL;
2603 mutex_lock(&lli->lli_och_mutex);
2604 if (fd->fd_lease_och != NULL) {
2605 struct obd_client_handle *och = fd->fd_lease_och;
2607 lock = ldlm_handle2lock(&och->och_lease_handle);
2609 lock_res_and_lock(lock);
/* A cancelled lock means the lease is already broken. */
2610 if (!ldlm_is_cancel(lock))
2611 fmode = och->och_flags;
2613 unlock_res_and_lock(lock);
2614 LDLM_LOCK_PUT(lock);
2617 mutex_unlock(&lli->lli_och_mutex);
2619 RETURN(ll_lease_type_from_fmode(fmode));
2621 case LL_IOC_HSM_IMPORT: {
2622 struct hsm_user_import *hui;
2628 if (copy_from_user(hui, (void *)arg, sizeof(*hui))) {
2633 rc = ll_hsm_import(inode, file, hui);
/* default: try dynamically registered handlers, then the OSC/LOV. */
2643 ll_iocontrol_call(inode, file, cmd, arg, &err))
2646 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
/* Compat implementations for kernels lacking generic_file_llseek_size. */
2652 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Validate 'offset' against [0, maxsize] (negative allowed only with
 * FMODE_UNSIGNED_OFFSET) and commit it to file->f_pos, resetting
 * f_version when the position actually changes.
 */
2653 static inline loff_t
2654 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2656 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2658 if (offset > maxsize)
2661 if (offset != file->f_pos) {
2662 file->f_pos = offset;
2663 file->f_version = 0;
/*
 * Backported generic llseek with explicit maxsize/eof: handles
 * SEEK_SET/CUR (and, per the comments, SEEK_DATA/SEEK_HOLE semantics
 * against 'eof' in elided branches).  SEEK_CUR(0) is special-cased so
 * position queries never rewrite f_pos; other SEEK_CUR updates are
 * serialized under i_mutex.
 */
2669 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2670 loff_t maxsize, loff_t eof)
2672 struct inode *inode = file->f_dentry->d_inode;
2680  * Here we special-case the lseek(fd, 0, SEEK_CUR)
2681  * position-querying operation. Avoid rewriting the "same"
2682  * f_pos value back to the file because a concurrent read(),
2683  * write() or lseek() might have altered it
2688  * f_lock protects against read/modify/write race with other
2689  * SEEK_CURs. Note that parallel writes and reads behave
2692 mutex_lock(&inode->i_mutex);
2693 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2694 mutex_unlock(&inode->i_mutex);
2698  * In the generic case the entire file is data, so as long as
2699  * offset isn't at the end of the file then the offset is data.
2706  * There is a virtual hole at the end of the file, so as long as
2707  * offset isn't i_size or larger, return i_size.
2715 return llseek_execute(file, offset, maxsize);
/*
 * llseek() entry point.  For SEEK_END/SEEK_HOLE/SEEK_DATA the size must
 * be fetched from the OSTs (glimpse) before the generic seek logic runs.
 */
2719 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2721 struct inode *inode = file->f_dentry->d_inode;
2722 loff_t retval, eof = 0;
/* compute the absolute target purely for the debug message below */
2725 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2726 (origin == SEEK_CUR) ? file->f_pos : 0);
2727 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2728 PFID(ll_inode2fid(inode)), inode, retval, retval,
2730 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
/* these origins need an up-to-date file size from the OSTs */
2732 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2733 retval = ll_glimpse_size(inode);
2736 eof = i_size_read(inode);
2739 retval = ll_generic_file_llseek_size(file, offset, origin,
2740 ll_file_maxbytes(inode), eof);
/*
 * flush() entry point, called on every close of a file descriptor.
 * Does not force data out; it only reports asynchronous write errors
 * that were recorded earlier so close() can return them to userspace.
 */
2744 static int ll_flush(struct file *file, fl_owner_t id)
2746 struct inode *inode = file->f_dentry->d_inode;
2747 struct ll_inode_info *lli = ll_i2info(inode);
2748 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2751 LASSERT(!S_ISDIR(inode->i_mode));
2753 /* catch async errors that were recorded back when async writeback
2754 * failed for pages in this mapping. */
2755 rc = lli->lli_async_rc;
/* consume the error: report it once, then clear it */
2756 lli->lli_async_rc = 0;
2757 if (lli->lli_clob != NULL) {
2758 err = lov_read_and_clear_async_rc(lli->lli_clob);
2763 /* The application has been told write failure already.
2764 * Do not report failure again. */
2765 if (fd->fd_write_failed)
2767 return rc ? -EIO : 0;
2771  * Called to make sure a portion of a file has been written out.
2772  * If @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to the OST.
2774  * Return how many pages have been written.
/*
 * Run a CIT_FSYNC cl_io over [start, end] of @inode with the given
 * @mode; on success returns the number of pages written (fi_nr_written),
 * otherwise a negative errno.
 */
2776 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2777 enum cl_fsync_mode mode, int ignore_layout)
2779 struct cl_env_nest nest;
2782 struct obd_capa *capa = NULL;
2783 struct cl_fsync_io *fio;
/* reject any mode this io type does not implement */
2787 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2788 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2791 env = cl_env_nested_get(&nest);
2793 RETURN(PTR_ERR(env));
2795 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2797 io = ccc_env_thread_io(env);
2798 io->ci_obj = cl_i2info(inode)->lli_clob;
2799 io->ci_ignore_layout = ignore_layout;
2801 /* initialize parameters for sync */
2802 fio = &io->u.ci_fsync;
2803 fio->fi_capa = capa;
2804 fio->fi_start = start;
2806 fio->fi_fid = ll_inode2fid(inode);
2807 fio->fi_mode = mode;
2808 fio->fi_nr_written = 0;
2810 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2811 result = cl_io_loop(env, io);
2813 result = io->ci_result;
/* success: report pages written rather than 0 */
2815 result = fio->fi_nr_written;
2816 cl_io_fini(env, io);
2817 cl_env_nested_put(&nest, env);
2825 * When dentry is provided (the 'else' case), *file->f_dentry may be
2826 * null and dentry must be used directly rather than pulled from
2827 * *file->f_dentry as is done otherwise.
2830 #ifdef HAVE_FILE_FSYNC_4ARGS
/*
 * fsync() entry point.  The three prototype variants below cover the
 * different kernel ->fsync() signatures; each normalizes to a
 * (dentry, start, end, datasync) view of the request.
 */
2831 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2833 struct dentry *dentry = file->f_dentry;
2834 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2835 int ll_fsync(struct file *file, int datasync)
2837 struct dentry *dentry = file->f_dentry;
/* older kernels have no range: sync the whole file */
2839 loff_t end = LLONG_MAX;
2841 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2844 loff_t end = LLONG_MAX;
2846 struct inode *inode = dentry->d_inode;
2847 struct ll_inode_info *lli = ll_i2info(inode);
2848 struct ptlrpc_request *req;
2849 struct obd_capa *oc;
2853 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2854 PFID(ll_inode2fid(inode)), inode);
2855 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2857 #ifdef HAVE_FILE_FSYNC_4ARGS
/* on 4-args kernels we must flush the range ourselves, under i_mutex */
2858 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2859 mutex_lock(&inode->i_mutex);
2861 /* fsync's caller has already called _fdata{sync,write}, we want
2862 * that IO to finish before calling the osc and mdc sync methods */
2863 rc = filemap_fdatawait(inode->i_mapping);
2866 /* catch async errors that were recorded back when async writeback
2867 * failed for pages in this mapping. */
2868 if (!S_ISDIR(inode->i_mode)) {
2869 err = lli->lli_async_rc;
2870 lli->lli_async_rc = 0;
2873 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* sync the metadata side through the MDS */
2878 oc = ll_mdscapa_get(inode);
2879 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2885 ptlrpc_req_finished(req);
/* regular files also need their OST data synced */
2887 if (S_ISREG(inode->i_mode)) {
2888 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2890 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2891 if (rc == 0 && err < 0)
/* remember the outcome so ll_flush() does not double-report */
2894 fd->fd_write_failed = true;
2896 fd->fd_write_failed = false;
2899 #ifdef HAVE_FILE_FSYNC_4ARGS
2900 mutex_unlock(&inode->i_mutex);
/*
 * flock()/fcntl() lock entry point: translate the VFS file_lock request
 * into an LDLM_FLOCK enqueue against the MDS, then mirror the result
 * into the local VFS lock tables so the kernel's bookkeeping matches.
 */
2906 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2908 struct inode *inode = file->f_dentry->d_inode;
2909 struct ll_sb_info *sbi = ll_i2sbi(inode);
2910 struct ldlm_enqueue_info einfo = {
2911 .ei_type = LDLM_FLOCK,
2912 .ei_cb_cp = ldlm_flock_completion_ast,
2913 .ei_cbdata = file_lock,
2915 struct md_op_data *op_data;
2916 struct lustre_handle lockh = {0};
2917 ldlm_policy_data_t flock = {{0}};
/* remember the caller's lock type; einfo.ei_mode temporarily overwrites it */
2918 int fl_type = file_lock->fl_type;
2924 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
2925 PFID(ll_inode2fid(inode)), file_lock);
2927 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2929 if (file_lock->fl_flags & FL_FLOCK) {
2930 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2931 /* flocks are whole-file locks */
2932 flock.l_flock.end = OFFSET_MAX;
2933 /* For flocks owner is determined by the local file descriptor */
2934 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2935 } else if (file_lock->fl_flags & FL_POSIX) {
2936 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2937 flock.l_flock.start = file_lock->fl_start;
2938 flock.l_flock.end = file_lock->fl_end;
2942 flock.l_flock.pid = file_lock->fl_pid;
2944 /* Somewhat ugly workaround for svc lockd.
2945 * lockd installs custom fl_lmops->lm_compare_owner that checks
2946 * for the fl_owner to be the same (which it always is on local node
2947 * I guess between lockd processes) and then compares pid.
2948 * As such we assign pid to the owner field to make it all work,
2949 * conflict with normal locks is unlikely since pid space and
2950 * pointer space for current->files are not intersecting */
2951 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2952 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* read locks map to LCK_PR */
2956 einfo.ei_mode = LCK_PR;
2959 /* An unlock request may or may not have any relation to
2960 * existing locks so we may not be able to pass a lock handle
2961 * via a normal ldlm_lock_cancel() request. The request may even
2962 * unlock a byte range in the middle of an existing lock. In
2963 * order to process an unlock request we need all of the same
2964 * information that is given with a normal read or write record
2965 * lock request. To avoid creating another ldlm unlock (cancel)
2966 * message we'll treat a LCK_NL flock request as an unlock. */
2967 einfo.ei_mode = LCK_NL;
/* write locks map to LCK_PW */
2970 einfo.ei_mode = LCK_PW;
2973 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* non-blocking set request */
2988 flags = LDLM_FL_BLOCK_NOWAIT;
/* GETLK-style probes only test, they never grant */
2994 flags = LDLM_FL_TEST_LOCK;
2997 CERROR("unknown fcntl lock command: %d\n", cmd);
3001 /* Save the old mode so that if the mode in the lock changes we
3002 * can decrement the appropriate reader or writer refcount. */
3003 file_lock->fl_type = einfo.ei_mode;
3005 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3006 LUSTRE_OPC_ANY, NULL);
3007 if (IS_ERR(op_data))
3008 RETURN(PTR_ERR(op_data));
3010 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
3011 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
3012 flock.l_flock.pid, flags, einfo.ei_mode,
3013 flock.l_flock.start, flock.l_flock.end);
3015 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data, &lockh,
3018 /* Restore the file lock type if not TEST lock. */
3019 if (!(flags & LDLM_FL_TEST_LOCK))
3020 file_lock->fl_type = fl_type;
/* mirror a successful server-side result into the local VFS tables */
3022 if ((file_lock->fl_flags & FL_FLOCK) &&
3023 (rc == 0 || file_lock->fl_type == F_UNLCK))
3024 rc2 = flock_lock_file_wait(file, file_lock);
3025 if ((file_lock->fl_flags & FL_POSIX) &&
3026 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3027 !(flags & LDLM_FL_TEST_LOCK))
3028 rc2 = posix_lock_file_wait(file, file_lock);
/* local bookkeeping failed: undo the server lock with an LCK_NL request */
3030 if (rc2 && file_lock->fl_type != F_UNLCK) {
3031 einfo.ei_mode = LCK_NL;
3032 md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data,
3037 ll_finish_md_op_data(op_data);
/*
 * Look up the FID of @name (length @namelen) under directory @parent via
 * a getattr-by-name RPC to the MDS; store the result in *fid.
 * Returns 0 on success or a negative errno.
 */
3042 int ll_get_fid_by_name(struct inode *parent, const char *name,
3043 int namelen, struct lu_fid *fid)
3045 struct md_op_data *op_data = NULL;
3046 struct mdt_body *body;
3047 struct ptlrpc_request *req;
3051 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3052 LUSTRE_OPC_ANY, NULL);
3053 if (IS_ERR(op_data))
3054 RETURN(PTR_ERR(op_data));
/* only the FID is needed from the reply */
3056 op_data->op_valid = OBD_MD_FLID;
3057 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3058 ll_finish_md_op_data(op_data);
3062 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3064 GOTO(out_req, rc = -EFAULT);
3066 *fid = body->mbo_fid1;
3068 ptlrpc_req_finished(req);
/*
 * Migrate the entry @name under @parent to MDT index @mdtidx by issuing
 * a rename-to-self with CLI_MIGRATE set.  Any cached child dentry is
 * invalidated first so stale metadata is not reused after the move.
 */
3072 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3073 const char *name, int namelen)
3075 struct dentry *dchild = NULL;
3076 struct inode *child_inode = NULL;
3077 struct md_op_data *op_data;
3078 struct ptlrpc_request *request = NULL;
3083 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3084 name, PFID(ll_inode2fid(parent)), mdtidx);
3086 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3087 0, LUSTRE_OPC_ANY, NULL);
3088 if (IS_ERR(op_data))
3089 RETURN(PTR_ERR(op_data));
3091 /* Get child FID first */
3092 qstr.hash = full_name_hash(name, namelen);
/* prefer the dcache; fall back to an MDS lookup below when it misses */
3095 dchild = d_lookup(file->f_dentry, &qstr);
3096 if (dchild != NULL && dchild->d_inode != NULL) {
3097 op_data->op_fid3 = *ll_inode2fid(dchild->d_inode);
/* NOTE(review): this inner d_inode check repeats the condition two
 * lines up and so always succeeds here — looks redundant; confirm
 * against the unelided function before simplifying */
3098 if (dchild->d_inode != NULL) {
3099 child_inode = igrab(dchild->d_inode);
3100 ll_invalidate_aliases(child_inode);
3104 rc = ll_get_fid_by_name(parent, name, namelen,
3110 if (!fid_is_sane(&op_data->op_fid3)) {
3111 CERROR("%s: migrate %s , but fid "DFID" is insane\n",
3112 ll_get_fsname(parent->i_sb, NULL, 0), name,
3113 PFID(&op_data->op_fid3));
3114 GOTO(out_free, rc = -EINVAL);
3117 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
/* nothing to do if the object already lives on the target MDT */
3122 CDEBUG(D_INFO, "%s:"DFID" is already on MDT%d.\n", name,
3123 PFID(&op_data->op_fid3), mdtidx);
3124 GOTO(out_free, rc = 0);
3127 op_data->op_mds = mdtidx;
3128 op_data->op_cli_flags = CLI_MIGRATE;
/* migration is implemented as a rename onto the same name */
3129 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3130 namelen, name, namelen, &request);
3132 ll_update_times(request, parent);
3134 ptlrpc_req_finished(request);
3139 if (child_inode != NULL) {
/* drop the stale local inode; the object was recreated on the new MDT */
3140 clear_nlink(child_inode);
3144 ll_finish_md_op_data(op_data);
/* Lock handler installed by ll_file_operations_noflock (-o noflock);
 * presumably rejects all userspace locking requests — body not visible
 * here, confirm against the full source. */
3149 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3157 * test if some locks matching bits and l_req_mode are acquired
3158 * - bits can be in different locks
3159 * - if found clear the common lock bits in *bits
3160 * - the bits not found, are kept in *bits
3162  * \param bits [IN] searched lock bits
3163 * \param l_req_mode [IN] searched lock mode
3164 * \retval boolean, true iff all bits are found
/*
 * Test whether cached MDS locks cover the inodebits in *bits at (or
 * above) @l_req_mode.  Bits found are cleared from *bits; bits left set
 * on return were not matched.  Returns true iff all bits were found.
 */
3166 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
3168 struct lustre_handle lockh;
3169 ldlm_policy_data_t policy;
/* LCK_MINMODE means "any mode": match against all four normal modes */
3170 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
3171 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
3180 fid = &ll_i2info(inode)->lli_fid;
3181 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3182 ldlm_lockname[mode]);
/* TEST_LOCK: probe only, do not take references on matched locks */
3184 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
3185 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3186 policy.l_inodebits.bits = *bits & (1 << i);
3187 if (policy.l_inodebits.bits == 0)
3190 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3191 &policy, mode, &lockh)) {
3192 struct ldlm_lock *lock;
3194 lock = ldlm_handle2lock(&lockh);
/* a single matched lock may carry several bits; clear all of them */
3197 ~(lock->l_policy_data.l_inodebits.bits);
3198 LDLM_LOCK_PUT(lock);
3200 *bits &= ~policy.l_inodebits.bits;
/*
 * Match (and reference) a cached MDS inodebits lock covering @bits in
 * one of the modes in @mode; the matched handle is stored in *lockh.
 * Returns the matched mode, or 0 when no suitable lock is cached.
 */
3207 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
3208 struct lustre_handle *lockh, __u64 flags,
3211 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
3216 fid = &ll_i2info(inode)->lli_fid;
3217 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3219 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3220 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the result of a revalidation RPC: -ENOENT on a
 * non-regular, non-directory inode is normalized to success (the object
 * was simply unlinked); other errors are logged and passed through.
 */
3225 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3227 /* Already unlinked. Just update nlink and return success */
3228 if (rc == -ENOENT) {
3230 /* This path cannot be hit for regular files unless in
3231 * case of obscure races, so no need to validate
3233 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3235 } else if (rc != 0) {
/* expected permission/identity errors stay at D_INFO; the rest are errors */
3236 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3237 "%s: revalidate FID "DFID" error: rc = %d\n",
3238 ll_get_fsname(inode->i_sb, NULL, 0),
3239 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate @dentry's inode attributes covered by the inodebits in
 * @ibits.  Uses an intent-lock getattr when the server supports
 * OBD_CONNECT_ATTRFID, otherwise falls back to a plain md_getattr when
 * no suitable MDS lock is already cached.
 */
3245 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3247 struct inode *inode = dentry->d_inode;
3248 struct ptlrpc_request *req = NULL;
3249 struct obd_export *exp;
3253 LASSERT(inode != NULL);
3255 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3256 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3258 exp = ll_i2mdexp(inode);
3260 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3261 * But under CMD case, it caused some lock issues, should be fixed
3262 * with new CMD ibits lock. See bug 12718 */
3263 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3264 struct lookup_intent oit = { .it_op = IT_GETATTR };
3265 struct md_op_data *op_data;
/* lookup-only revalidation needs only the LOOKUP intent */
3267 if (ibits == MDS_INODELOCK_LOOKUP)
3268 oit.it_op = IT_LOOKUP;
3270 /* Call getattr by fid, so do not provide name at all. */
3271 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3272 dentry->d_inode, NULL, 0, 0,
3273 LUSTRE_OPC_ANY, NULL);
3274 if (IS_ERR(op_data))
3275 RETURN(PTR_ERR(op_data));
3277 rc = md_intent_lock(exp, op_data, &oit, &req,
3278 &ll_md_blocking_ast, 0);
3279 ll_finish_md_op_data(op_data);
3281 rc = ll_inode_revalidate_fini(inode, rc);
3285 rc = ll_revalidate_it_finish(req, &oit, dentry);
3287 ll_intent_release(&oit);
3291 /* Unlinked? Unhash dentry, so it is not picked up later by
3292 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3293 here to preserve get_cwd functionality on 2.6.
3295 if (!dentry->d_inode->i_nlink)
3296 d_lustre_invalidate(dentry, 0);
3298 ll_lookup_finish_locks(&oit, dentry);
/* no ATTRFID support: only issue a getattr when no covering lock is cached */
3299 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3300 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3301 obd_valid valid = OBD_MD_FLGETATTR;
3302 struct md_op_data *op_data;
/* regular files also need striping (EA) data in the reply */
3305 if (S_ISREG(inode->i_mode)) {
3306 rc = ll_get_default_mdsize(sbi, &ealen);
3309 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3312 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3313 0, ealen, LUSTRE_OPC_ANY,
3315 if (IS_ERR(op_data))
3316 RETURN(PTR_ERR(op_data));
3318 op_data->op_valid = valid;
3319 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
3320 * capa for this inode. Because we only keep capas of dirs
3322 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3323 ll_finish_md_op_data(op_data);
3325 rc = ll_inode_revalidate_fini(inode, rc);
3329 rc = ll_prep_inode(&inode, req, NULL, NULL);
3332 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge the per-stripe attributes from all
 * MDTs into the local inode info (size, nlink, and a/m/ctime).
 */
3336 static int ll_merge_md_attr(struct inode *inode)
3338 struct cl_attr attr = { 0 };
/* only meaningful for striped dirs, which must carry a lsm_md */
3341 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3342 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3347 ll_i2info(inode)->lli_stripe_dir_size = attr.cat_size;
3348 ll_i2info(inode)->lli_stripe_dir_nlink = attr.cat_nlink;
3350 ll_i2info(inode)->lli_lvb.lvb_atime = attr.cat_atime;
3351 ll_i2info(inode)->lli_lvb.lvb_mtime = attr.cat_mtime;
3352 ll_i2info(inode)->lli_lvb.lvb_ctime = attr.cat_ctime;
/*
 * Full revalidation: refresh MDS attributes via __ll_inode_revalidate(),
 * then refresh size — merged stripe attributes for striped directories,
 * or an OST glimpse for regular files.
 */
3358 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3360 struct inode *inode = dentry->d_inode;
3364 rc = __ll_inode_revalidate(dentry, ibits);
3368 /* if object isn't regular file, don't validate size */
3369 if (!S_ISREG(inode->i_mode)) {
/* striped directory: size/nlink live on several MDTs, merge them */
3370 if (S_ISDIR(inode->i_mode) &&
3371 ll_i2info(inode)->lli_lsm_md != NULL) {
3372 rc = ll_merge_md_attr(inode);
3377 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
3378 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
3379 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
3381 /* In case of restore, the MDT has the right size and has
3382 * already send it back without granting the layout lock,
3383 * inode is up-to-date so glimpse is useless.
3384 * Also to glimpse we need the layout, in case of a running
3385 * restore the MDT holds the layout lock so the glimpse will
3386 * block up to the end of restore (getattr will block)
3388 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3389 rc = ll_glimpse_size(inode);
/*
 * getattr() entry point: revalidate UPDATE|LOOKUP bits, then copy the
 * (now fresh) inode attributes into *stat.
 */
3394 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3396 struct inode *inode = de->d_inode;
3397 struct ll_sb_info *sbi = ll_i2sbi(inode);
3398 struct ll_inode_info *lli = ll_i2info(inode);
3401 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3402 MDS_INODELOCK_LOOKUP);
3403 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3408 stat->dev = inode->i_sb->s_dev;
/* 32-bit userspace needs an ino that fits in 32 bits */
3409 if (ll_need_32bit_api(sbi))
3410 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3412 stat->ino = inode->i_ino;
3413 stat->mode = inode->i_mode;
3414 stat->uid = inode->i_uid;
3415 stat->gid = inode->i_gid;
3416 stat->rdev = inode->i_rdev;
3417 stat->atime = inode->i_atime;
3418 stat->mtime = inode->i_mtime;
3419 stat->ctime = inode->i_ctime;
3420 stat->blksize = 1 << inode->i_blkbits;
3421 stat->blocks = inode->i_blocks;
/* striped dirs report the merged cross-MDT size/nlink */
3423 if (S_ISDIR(inode->i_mode) &&
3424 ll_i2info(inode)->lli_lsm_md != NULL) {
3425 stat->nlink = lli->lli_stripe_dir_nlink;
3426 stat->size = lli->lli_stripe_dir_size;
3428 stat->nlink = inode->i_nlink;
3429 stat->size = i_size_read(inode);
/*
 * fiemap() entry point: marshal the VFS fiemap_extent_info into a
 * Lustre ll_user_fiemap buffer, run ll_do_fiemap(), and copy the mapped
 * extents back.
 */
3435 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3436 __u64 start, __u64 len)
3440 struct ll_user_fiemap *fiemap;
3441 unsigned int extent_count = fieinfo->fi_extents_max;
/* header plus room for the caller's maximum extent count */
3443 num_bytes = sizeof(*fiemap) + (extent_count *
3444 sizeof(struct ll_fiemap_extent));
3445 OBD_ALLOC_LARGE(fiemap, num_bytes);
3450 fiemap->fm_flags = fieinfo->fi_flags;
3451 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3452 fiemap->fm_start = start;
3453 fiemap->fm_length = len;
/* seed with the first user extent (continuation support) */
3454 if (extent_count > 0)
3455 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3456 sizeof(struct ll_fiemap_extent));
3458 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3460 fieinfo->fi_flags = fiemap->fm_flags;
3461 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3462 if (extent_count > 0)
3463 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3464 fiemap->fm_mapped_extents *
3465 sizeof(struct ll_fiemap_extent));
3467 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * Return a referenced copy of the cached POSIX ACL for @inode.
 * (@type is currently not consulted by the visible code.)
 */
3471 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3473 struct ll_inode_info *lli = ll_i2info(inode);
3474 struct posix_acl *acl = NULL;
/* lli_lock guards lli_posix_acl against concurrent update */
3477 spin_lock(&lli->lli_lock);
3478 /* VFS' acl_permission_check->check_acl will release the refcount */
3479 acl = posix_acl_dup(lli->lli_posix_acl);
3480 spin_unlock(&lli->lli_lock);
3485 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
/*
 * ACL check callback passed to ll_generic_permission(); only built on
 * kernels whose generic_permission() still takes a check_acl hook.
 */
3487 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3488 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3490 ll_check_acl(struct inode *inode, int mask)
3493 # ifdef CONFIG_FS_POSIX_ACL
3494 struct posix_acl *acl;
3498 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
/* ACL lookup may block: not allowed in RCU-walk mode */
3499 if (flags & IPERM_FLAG_RCU)
3502 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3507 rc = posix_acl_permission(inode, acl, mask);
3508 posix_acl_release(acl);
3511 # else /* !CONFIG_FS_POSIX_ACL */
3513 # endif /* CONFIG_FS_POSIX_ACL */
3515 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
3517 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
/*
 * permission() entry point (three kernel signature variants).
 * Revalidates the root inode on first use, applies root-squash to the
 * current credentials when configured, then delegates the actual check
 * to remote-permission or generic-permission logic.
 */
3518 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3520 # ifdef HAVE_INODE_PERMISION_2ARGS
3521 int ll_inode_permission(struct inode *inode, int mask)
3523 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3528 struct ll_sb_info *sbi;
3529 struct root_squash_info *squash;
3530 struct cred *cred = NULL;
3531 const struct cred *old_cred = NULL;
3533 bool squash_id = false;
3536 #ifdef MAY_NOT_BLOCK
/* this path may block (RPCs); bail out of RCU-walk permission checks */
3537 if (mask & MAY_NOT_BLOCK)
3539 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3540 if (flags & IPERM_FLAG_RCU)
3544 /* as the root inode is NOT validated during lookup, it must be
3545 * revalidated here before the permission check. */
3547 if (inode == inode->i_sb->s_root->d_inode) {
3548 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3549 MDS_INODELOCK_LOOKUP);
3554 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3555 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3557 /* squash fsuid/fsgid if needed */
3558 sbi = ll_i2sbi(inode);
3559 squash = &sbi->ll_squash;
/* squash only applies to root, and only when not disabled per-mount */
3560 if (unlikely(squash->rsi_uid != 0 &&
3561 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
3562 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3566 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3567 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3568 squash->rsi_uid, squash->rsi_gid);
3570 /* update current process's credentials
3571 * and FS capability */
3572 cred = prepare_creds();
3576 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
3577 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* drop every filesystem-related capability from the squashed creds */
3578 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3579 if ((1 << cap) & CFS_CAP_FS_MASK)
3580 cap_lower(cred->cap_effective, cap);
3582 old_cred = override_creds(cred);
3585 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3587 if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
3588 rc = lustre_check_remote_perm(inode, mask);
3590 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3592 /* restore current process's credentials and FS capability */
3594 revert_creds(old_cred);
3601 /* -o localflock - only provides locally consistent flock locks */
/* Default file ops: no .flock/.lock handlers, so the kernel falls back
 * to purely local (single-node) lock semantics. */
3602 struct file_operations ll_file_operations = {
3603 .read = ll_file_read,
3604 .aio_read = ll_file_aio_read,
3605 .write = ll_file_write,
3606 .aio_write = ll_file_aio_write,
3607 .unlocked_ioctl = ll_file_ioctl,
3608 .open = ll_file_open,
3609 .release = ll_file_release,
3610 .mmap = ll_file_mmap,
3611 .llseek = ll_file_seek,
3612 .splice_read = ll_file_splice_read,
/* File ops for -o flock: identical to the default table but routes
 * flock()/fcntl() through ll_file_flock for cluster-wide consistency. */
3617 struct file_operations ll_file_operations_flock = {
3618 .read = ll_file_read,
3619 .aio_read = ll_file_aio_read,
3620 .write = ll_file_write,
3621 .aio_write = ll_file_aio_write,
3622 .unlocked_ioctl = ll_file_ioctl,
3623 .open = ll_file_open,
3624 .release = ll_file_release,
3625 .mmap = ll_file_mmap,
3626 .llseek = ll_file_seek,
3627 .splice_read = ll_file_splice_read,
3630 .flock = ll_file_flock,
3631 .lock = ll_file_flock
3634 /* These are for -o noflock - to return ENOSYS on flock calls */
3635 struct file_operations ll_file_operations_noflock = {
3636 .read = ll_file_read,
3637 .aio_read = ll_file_aio_read,
3638 .write = ll_file_write,
3639 .aio_write = ll_file_aio_write,
3640 .unlocked_ioctl = ll_file_ioctl,
3641 .open = ll_file_open,
3642 .release = ll_file_release,
3643 .mmap = ll_file_mmap,
3644 .llseek = ll_file_seek,
3645 .splice_read = ll_file_splice_read,
/* both lock entry points reject the request via ll_file_noflock */
3648 .flock = ll_file_noflock,
3649 .lock = ll_file_noflock
/* Inode ops for regular files: attribute, xattr, permission, fiemap and
 * (when the kernel supports it) get_acl handlers. */
3652 struct inode_operations ll_file_inode_operations = {
3653 .setattr = ll_setattr,
3654 .getattr = ll_getattr,
3655 .permission = ll_inode_permission,
3656 .setxattr = ll_setxattr,
3657 .getxattr = ll_getxattr,
3658 .listxattr = ll_listxattr,
3659 .removexattr = ll_removexattr,
3660 .fiemap = ll_fiemap,
3661 #ifdef HAVE_IOP_GET_ACL
3662 .get_acl = ll_get_acl,
3666 /* dynamic ioctl number support routines */
/* Global registry of dynamically registered ioctl handlers, guarded by
 * ioc_sem (readers iterate, writers register/unregister). */
3667 static struct llioc_ctl_data {
3668 struct rw_semaphore ioc_sem;
3669 struct list_head ioc_head;
3671 __RWSEM_INITIALIZER(llioc.ioc_sem),
3672 LIST_HEAD_INIT(llioc.ioc_head)
/* linkage on llioc.ioc_head */
3677 struct list_head iocd_list;
/* total allocation size, kept so unregister can free correctly */
3678 unsigned int iocd_size;
3679 llioc_callback_t iocd_cb;
3680 unsigned int iocd_count;
/* trailing array of iocd_count ioctl command numbers */
3681 unsigned int iocd_cmd[0];
/*
 * Register callback @cb for @count dynamic ioctl numbers in @cmd.
 * Returns an opaque handle (the registration record) to be passed to
 * ll_iocontrol_unregister(), or NULL on bad arguments / allocation
 * failure.
 */
3684 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3687 struct llioc_data *in_data = NULL;
3690 if (cb == NULL || cmd == NULL ||
3691 count > LLIOC_MAX_CMD || count < 0)
3694 size = sizeof(*in_data) + count * sizeof(unsigned int);
3695 OBD_ALLOC(in_data, size);
3696 if (in_data == NULL)
3699 memset(in_data, 0, sizeof(*in_data));
3700 in_data->iocd_size = size;
3701 in_data->iocd_cb = cb;
3702 in_data->iocd_count = count;
3703 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
/* publish under the write lock so readers never see a partial record */
3705 down_write(&llioc.ioc_sem);
3706 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3707 up_write(&llioc.ioc_sem);
/*
 * Remove and free the registration identified by @magic (the pointer
 * returned by ll_iocontrol_register()); warns if it is not found.
 */
3712 void ll_iocontrol_unregister(void *magic)
3714 struct llioc_data *tmp;
3719 down_write(&llioc.ioc_sem);
3720 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* capture the size before freeing the record that stores it */
3722 unsigned int size = tmp->iocd_size;
3724 list_del(&tmp->iocd_list);
3725 up_write(&llioc.ioc_sem);
3727 OBD_FREE(tmp, size);
3731 up_write(&llioc.ioc_sem);
3733 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3736 EXPORT_SYMBOL(ll_iocontrol_register);
3737 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch @cmd to the registered dynamic-ioctl callbacks in order,
 * stopping at the first one that returns LLIOC_STOP; the callback's
 * result code is passed back through *rcp.
 */
3739 static enum llioc_iter
3740 ll_iocontrol_call(struct inode *inode, struct file *file,
3741 unsigned int cmd, unsigned long arg, int *rcp)
3743 enum llioc_iter ret = LLIOC_CONT;
3744 struct llioc_data *data;
3745 int rc = -EINVAL, i;
3747 down_read(&llioc.ioc_sem);
3748 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3749 for (i = 0; i < data->iocd_count; i++) {
3750 if (cmd != data->iocd_cmd[i])
3753 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3757 if (ret == LLIOC_STOP)
3760 up_read(&llioc.ioc_sem);
/*
 * Push a layout configuration @conf down to the cl_object stack of
 * @inode.  For OBJECT_CONF_SET this also allows the layout lock to be
 * matched and records the new layout generation.
 */
3767 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3769 struct ll_inode_info *lli = ll_i2info(inode);
3770 struct cl_env_nest nest;
/* nothing to configure without a cl_object */
3775 if (lli->lli_clob == NULL)
3778 env = cl_env_nested_get(&nest);
3780 RETURN(PTR_ERR(env));
3782 result = cl_conf_set(env, lli->lli_clob, conf);
3783 cl_env_nested_put(&nest, env);
3785 if (conf->coc_opc == OBJECT_CONF_SET) {
3786 struct ldlm_lock *lock = conf->coc_lock;
3788 LASSERT(lock != NULL);
3789 LASSERT(ldlm_has_layout(lock));
3791 struct lustre_md *md = conf->u.coc_md;
3792 __u32 gen = LL_LAYOUT_GEN_EMPTY;
3794 /* it can only be allowed to match after layout is
3795 * applied to inode otherwise false layout would be
3796 * seen. Applying layout shoud happen before dropping
3797 * the intent lock. */
3798 ldlm_lock_allow_match(lock);
3800 lli->lli_has_smd = lsm_has_objects(md->lsm);
3801 if (md->lsm != NULL)
3802 gen = md->lsm->lsm_layout_gen;
3805 DFID ": layout version change: %u -> %u\n",
3806 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3808 ll_layout_version_set(lli, gen);
3814 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
3815 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3818 struct ll_sb_info *sbi = ll_i2sbi(inode);
3819 struct obd_capa *oc;
3820 struct ptlrpc_request *req;
3821 struct mdt_body *body;
3828 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3829 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
3830 lock->l_lvb_data, lock->l_lvb_len);
/* the lock already carries a usable layout: nothing to fetch */
3832 if ((lock->l_lvb_data != NULL) && ldlm_is_lvb_ready(lock))
3835 /* if layout lock was granted right away, the layout is returned
3836 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3837 * blocked and then granted via completion ast, we have to fetch
3838 * layout here. Please note that we can't use the LVB buffer in
3839 * completion AST because it doesn't have a large enough buffer */
3840 oc = ll_mdscapa_get(inode);
3841 rc = ll_get_default_mdsize(sbi, &lmmsize);
/* pull the LOV EA (the layout) from the MDS with a getxattr RPC */
3843 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3844 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3850 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3852 GOTO(out, rc = -EPROTO);
3854 lmmsize = body->mbo_eadatasize;
3855 if (lmmsize == 0) /* empty layout */
3858 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3860 GOTO(out, rc = -EFAULT);
/* copy into a private buffer: the RPC reply is freed below */
3862 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3863 if (lvbdata == NULL)
3864 GOTO(out, rc = -ENOMEM);
3866 memcpy(lvbdata, lmm, lmmsize);
/* swap the fetched layout into the lock's LVB under the resource lock */
3867 lock_res_and_lock(lock);
3868 if (lock->l_lvb_data != NULL)
3869 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3871 lock->l_lvb_data = lvbdata;
3872 lock->l_lvb_len = lmmsize;
3873 unlock_res_and_lock(lock);
3878 ptlrpc_req_finished(req);
3883  * Apply the layout to the inode. Layout lock is held and will be released
/*
 * Given a held layout lock (@lockh/@mode), make sure its layout is
 * fetched, unpack it, and configure the inode's cl_object with it,
 * returning the resulting layout generation in *gen.  With @reconf the
 * configuration is redone even if an LVB is already cached; on -EBUSY
 * the function waits for in-flight IO before the caller retries.
 */
3886 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3887 struct inode *inode, __u32 *gen, bool reconf)
3889 struct ll_inode_info *lli = ll_i2info(inode);
3890 struct ll_sb_info *sbi = ll_i2sbi(inode);
3891 struct ldlm_lock *lock;
3892 struct lustre_md md = { NULL };
3893 struct cl_object_conf conf;
3896 bool wait_layout = false;
3899 LASSERT(lustre_handle_is_used(lockh));
3901 lock = ldlm_handle2lock(lockh);
3902 LASSERT(lock != NULL);
3903 LASSERT(ldlm_has_layout(lock));
3905 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured: %d",
3906 PFID(&lli->lli_fid), inode, reconf);
3908 /* in case this is a caching lock and reinstate with new inode */
3909 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3911 lock_res_and_lock(lock);
3912 lvb_ready = ldlm_is_lvb_ready(lock);
3913 unlock_res_and_lock(lock);
3914 /* checking lvb_ready is racy but this is okay. The worst case is
3915 * that multi processes may configure the file on the same time. */
3917 if (lvb_ready || !reconf) {
3920 /* layout_gen must be valid if layout lock is not
3921 * cancelled and stripe has already set */
3922 *gen = ll_layout_version_get(lli);
3928 rc = ll_layout_fetch(inode, lock);
3932 /* for layout lock, lmm is returned in lock's lvb.
3933 * lvb_data is immutable if the lock is held so it's safe to access it
3934 * without res lock. See the description in ldlm_lock_decref_internal()
3935 * for the condition to free lvb_data of layout lock */
3936 if (lock->l_lvb_data != NULL) {
3937 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3938 lock->l_lvb_data, lock->l_lvb_len);
3940 *gen = LL_LAYOUT_GEN_EMPTY;
3942 *gen = md.lsm->lsm_layout_gen;
3945 CERROR("%s: file "DFID" unpackmd error: %d\n",
3946 ll_get_fsname(inode->i_sb, NULL, 0),
3947 PFID(&lli->lli_fid), rc);
3953 /* set layout to file. Unlikely this will fail as old layout was
3954 * surely eliminated */
3955 memset(&conf, 0, sizeof conf);
3956 conf.coc_opc = OBJECT_CONF_SET;
3957 conf.coc_inode = inode;
3958 conf.coc_lock = lock;
3959 conf.u.coc_md = &md;
3960 rc = ll_layout_conf(inode, &conf);
3963 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3965 /* refresh layout failed, need to wait */
3966 wait_layout = rc == -EBUSY;
/* release our references; a retry will re-match or re-enqueue */
3970 LDLM_LOCK_PUT(lock);
3971 ldlm_lock_decref(lockh, mode);
3973 /* wait for IO to complete if it's still being used. */
3975 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
3976 ll_get_fsname(inode->i_sb, NULL, 0),
3977 PFID(&lli->lli_fid), inode);
3979 memset(&conf, 0, sizeof conf);
3980 conf.coc_opc = OBJECT_CONF_WAIT;
3981 conf.coc_inode = inode;
3982 rc = ll_layout_conf(inode, &conf);
3986 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
3987 ll_get_fsname(inode->i_sb, NULL, 0),
3988 PFID(&lli->lli_fid), rc);
/**
 * Make sure this client has an up-to-date layout version for \a inode.
 *
 * This function checks if there exists a LAYOUT lock on the client side,
 * or enqueues it if it doesn't have one in cache.
 *
 * This function will not hold the layout lock, so it may be revoked any time
 * after this function returns.  Any operations that depend on the layout
 * should be redone in that case.
 *
 * This function should be called before lov_io_init() to get an up-to-date
 * layout version; the caller should save the version number, and after IO
 * is finished, this function should be called again to verify that the
 * layout was not changed during IO time.
 *
 * \param[in]  inode	inode whose layout version is wanted
 * \param[out] gen	layout generation currently known by this client
 *
 * \retval 0 on success, negative errno on failure.
 */
int ll_layout_refresh(struct inode *inode, __u32 *gen)
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct md_op_data *op_data;
	struct lookup_intent it;
	struct lustre_handle lockh;
	/* enqueue parameters: inode-bits lock with the standard llite
	 * blocking/completion callbacks.  NOTE(review): the rest of this
	 * initializer (and the rc/mode declarations) are on lines elided
	 * from this excerpt. */
	struct ldlm_enqueue_info einfo = {
		.ei_type = LDLM_IBITS,
		.ei_cb_bl = &ll_md_blocking_ast,
		.ei_cb_cp = &ldlm_completion_ast,

	/* Fast path: layout locks disabled for this mount, or we already
	 * hold a valid (non-NONE) layout generation. */
	*gen = ll_layout_version_get(lli);
	if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)

	/* sanity: layout locks apply only to regular files with sane FIDs */
	LASSERT(fid_is_sane(ll_inode2fid(inode)));
	LASSERT(S_ISREG(inode->i_mode));

	/* take layout lock mutex to enqueue layout lock exclusively. */
	mutex_lock(&lli->lli_layout_mutex);

	/* mostly layout lock is caching on the local side, so try to match
	 * it before grabbing layout lock mutex.
	 * NOTE(review): the mutex is in fact taken just above; this comment
	 * appears stale relative to the current code order. */
	mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
			       LCK_CR | LCK_CW | LCK_PR | LCK_PW);
	if (mode != 0) { /* hit cached lock */
		rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);

		mutex_unlock(&lli->lli_layout_mutex);

	op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
				     0, 0, LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data)) {
		/* drop the mutex before bailing out with the errno */
		mutex_unlock(&lli->lli_layout_mutex);
		RETURN(PTR_ERR(op_data));

	/* have to enqueue one */
	memset(&it, 0, sizeof(it));
	it.it_op = IT_LAYOUT;
	lockh.cookie = 0ULL;

	LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
			  ll_get_fsname(inode->i_sb, NULL, 0),
			  PFID(&lli->lli_fid), inode);

	rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, &it, op_data, &lockh, 0);
	/* release the enqueue RPC; the lock handle itself stays valid */
	if (it.d.lustre.it_data != NULL)
		ptlrpc_req_finished(it.d.lustre.it_data);
	it.d.lustre.it_data = NULL;

	ll_finish_md_op_data(op_data);

	/* save the granted mode and clear it in the intent so that
	 * ll_intent_drop_lock() below does not drop the lock reference
	 * we are about to hand to ll_layout_lock_set() */
	mode = it.d.lustre.it_lock_mode;
	it.d.lustre.it_lock_mode = 0;
	ll_intent_drop_lock(&it);

	/* set lock data in case this is a new lock */
	ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
	rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);

	mutex_unlock(&lli->lli_layout_mutex);
4088 * This function send a restore request to the MDT
4090 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4092 struct hsm_user_request *hur;
4096 len = sizeof(struct hsm_user_request) +
4097 sizeof(struct hsm_user_item);
4098 OBD_ALLOC(hur, len);
4102 hur->hur_request.hr_action = HUA_RESTORE;
4103 hur->hur_request.hr_archive_id = 0;
4104 hur->hur_request.hr_flags = 0;
4105 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4106 sizeof(hur->hur_user_item[0].hui_fid));
4107 hur->hur_user_item[0].hui_extent.offset = offset;
4108 hur->hur_user_item[0].hui_extent.length = length;
4109 hur->hur_request.hr_itemcount = 1;
4110 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,