4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2013, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include <linux/sched.h>
49 #include "llite_internal.h"
50 #include <lustre/ll_fiemap.h>
51 #include <lustre_ioctl.h>
53 #include "cl_object.h"
/*
 * Forward declarations for static helpers defined later in this file.
 * NOTE(review): this excerpt appears line-sampled (the embedded line
 * numbering is not contiguous), so some declaration lines — e.g. the
 * return type of ll_put_grouplock and the tail of the ll_lease_close
 * prototype — are not visible here; confirm against the full source.
 */
56 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
58 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
61 static enum llioc_iter
62 ll_iocontrol_call(struct inode *inode, struct file *file,
63 unsigned int cmd, unsigned long arg, int *rcp);
/*
 * Allocate a per-open ll_file_data from its slab cache and initialize
 * the write-failure flag.  GFP_NOFS is used to avoid recursing back
 * into the filesystem under memory pressure.
 * NOTE(review): the NULL-check and return of @fd are not visible in
 * this excerpt — presumably allocation failure returns NULL; confirm.
 */
65 static struct ll_file_data *ll_file_data_get(void)
67 struct ll_file_data *fd;
69 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
73 fd->fd_write_failed = false;
/* Release a ll_file_data previously obtained from ll_file_data_get()
 * back to its slab cache. */
78 static void ll_file_data_put(struct ll_file_data *fd)
81 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Pack the current inode state into @op_data for an MDS request:
 * FID, mode, a/m/ctime, size, block count, external flag bits, the
 * current IO epoch, the open file handle @fh and the MDS capability.
 * If client-side data modification is pending (LLIF_DATA_MODIFIED),
 * the MDS_DATA_MODIFIED bias is added so the MDS knows to keep the
 * corresponding flag semantics on close.
 */
84 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
85 struct lustre_handle *fh)
87 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
88 op_data->op_attr.ia_mode = inode->i_mode;
89 op_data->op_attr.ia_atime = inode->i_atime;
90 op_data->op_attr.ia_mtime = inode->i_mtime;
91 op_data->op_attr.ia_ctime = inode->i_ctime;
92 op_data->op_attr.ia_size = i_size_read(inode);
93 op_data->op_attr_blocks = inode->i_blocks;
94 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
95 ll_inode_to_ext_flags(inode->i_flags);
96 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
98 op_data->op_handle = *fh;
99 op_data->op_capa1 = ll_mdscapa_get(inode);
101 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
102 op_data->op_bias |= MDS_DATA_MODIFIED;
/*
 * Closes the IO epoch and packs all the attributes into @op_data for
 * the MDS close request.  Size/blocks are only sent when the handle
 * was writable and either SOM is not supported by the MDT export or
 * the inode is not a regular file; otherwise the MDS obtains them via
 * the Size-on-MDS protocol.
 */
109 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
110 struct obd_client_handle *och)
114 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
115 ATTR_MTIME | ATTR_MTIME_SET |
116 ATTR_CTIME | ATTR_CTIME_SET;
118 if (!(och->och_flags & FMODE_WRITE))
121 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
122 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* Close the epoch (may pass &och so the epoch code can consume it). */
124 ll_ioepoch_close(inode, op_data, &och, 0);
127 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
128 ll_prep_md_op_data(op_data, inode, NULL, NULL,
129 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Send the MDS close RPC for open handle @och on @inode.
 *
 * When @data_version is non-NULL this is an HSM release: the data
 * version, lease handle and size/blocks are packed and MDS_HSM_RELEASE
 * biased.  On an epoch close the MDS may ask us to fetch Size-on-MDS
 * attributes from the OSTs and send a follow-up setattr.  On success
 * the client-side LLIF_DATA_MODIFIED flag is cleared if the bias was
 * acknowledged, and destroyed objects are cleaned up from the reply.
 * Finally the open replay data is cleared and @och's fh cookie is
 * poisoned with DEAD_HANDLE_MAGIC.
 * NOTE(review): error/return paths are partially elided in this
 * excerpt (see the XXX about leaking openhandle on -ENOMEM).
 */
133 static int ll_close_inode_openhandle(struct obd_export *md_exp,
135 struct obd_client_handle *och,
136 const __u64 *data_version)
138 struct obd_export *exp = ll_i2mdexp(inode);
139 struct md_op_data *op_data;
140 struct ptlrpc_request *req = NULL;
141 struct obd_device *obd = class_exp2obd(exp);
148 * XXX: in case of LMV, is this correct to access
151 CERROR("Invalid MDC connection handle "LPX64"\n",
152 ll_i2mdexp(inode)->exp_handle.h_cookie);
156 OBD_ALLOC_PTR(op_data);
158 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
160 ll_prepare_close(inode, op_data, och);
161 if (data_version != NULL) {
162 /* Pass in data_version implies release. */
163 op_data->op_bias |= MDS_HSM_RELEASE;
164 op_data->op_data_version = *data_version;
165 op_data->op_lease_handle = och->och_lease_handle;
166 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
168 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
169 rc = md_close(md_exp, op_data, och->och_mod, &req);
171 /* This close must have the epoch closed. */
172 LASSERT(epoch_close);
173 /* MDS has instructed us to obtain Size-on-MDS attribute from
174 * OSTs and send setattr to back to MDS. */
175 rc = ll_som_update(inode, op_data);
177 CERROR("%s: inode "DFID" mdc Size-on-MDS update"
178 " failed: rc = %d\n",
179 ll_i2mdexp(inode)->exp_obd->obd_name,
180 PFID(ll_inode2fid(inode)), rc);
184 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
185 ll_i2mdexp(inode)->exp_obd->obd_name,
186 PFID(ll_inode2fid(inode)), rc);
189 /* DATA_MODIFIED flag was successfully sent on close, cancel data
190 * modification flag. */
191 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
192 struct ll_inode_info *lli = ll_i2info(inode);
194 spin_lock(&lli->lli_lock);
195 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
196 spin_unlock(&lli->lli_lock);
200 rc = ll_objects_destroy(req, inode);
202 CERROR("%s: inode "DFID
203 " ll_objects destroy: rc = %d\n",
204 ll_i2mdexp(inode)->exp_obd->obd_name,
205 PFID(ll_inode2fid(inode)), rc);
/* For HSM release, verify the MDT actually released the file. */
208 if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
209 struct mdt_body *body;
210 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
211 if (!(body->mbo_valid & OBD_MD_FLRELEASED))
215 ll_finish_md_op_data(op_data);
/* SOM without a closed epoch: defer the final DONE_WRITING. */
219 if (exp_connect_som(exp) && !epoch_close &&
220 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
221 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
223 md_clear_open_replay_data(md_exp, och);
224 /* Free @och if it is not waiting for DONE_WRITING. */
225 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
228 if (req) /* This is close request */
229 ptlrpc_req_finished(req);
/*
 * Really close the MDS open handle of the given open mode.
 * Selects the write/exec/read handle slot and its use count from the
 * inode; if other users still hold the handle the close is skipped,
 * otherwise the handle is detached under lli_och_mutex and closed on
 * the MDS via ll_close_inode_openhandle().
 */
233 int ll_md_real_close(struct inode *inode, fmode_t fmode)
235 struct ll_inode_info *lli = ll_i2info(inode);
236 struct obd_client_handle **och_p;
237 struct obd_client_handle *och;
242 if (fmode & FMODE_WRITE) {
243 och_p = &lli->lli_mds_write_och;
244 och_usecount = &lli->lli_open_fd_write_count;
245 } else if (fmode & FMODE_EXEC) {
246 och_p = &lli->lli_mds_exec_och;
247 och_usecount = &lli->lli_open_fd_exec_count;
249 LASSERT(fmode & FMODE_READ);
250 och_p = &lli->lli_mds_read_och;
251 och_usecount = &lli->lli_open_fd_read_count;
254 mutex_lock(&lli->lli_och_mutex);
255 if (*och_usecount > 0) {
256 /* There are still users of this handle, so skip
258 mutex_unlock(&lli->lli_och_mutex);
264 mutex_unlock(&lli->lli_och_mutex);
267 /* There might be a race and this handle may already
269 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-file-descriptor MDS close path, called from ll_file_release().
 * Drops the group lock and any leftover lease, closes a private open
 * handle (fd_och) if present, decrements the open-mode use count, and
 * only issues the real MDS close when no cached OPEN DLM lock lets us
 * skip the RPC.  Always detaches and frees the ll_file_data and closes
 * the capability.
 */
276 static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
279 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
280 struct ll_inode_info *lli = ll_i2info(inode);
284 /* clear group lock, if present */
285 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
286 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
288 if (fd->fd_lease_och != NULL) {
291 /* Usually the lease is not released when the
292 * application crashed, we need to release here. */
293 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
294 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
295 PFID(&lli->lli_fid), rc, lease_broken);
297 fd->fd_lease_och = NULL;
300 if (fd->fd_och != NULL) {
301 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
306 /* Let's see if we have good enough OPEN lock on the file and if
307 we can skip talking to MDS */
308 if (file->f_dentry->d_inode) { /* Can this ever be false? */
310 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
311 struct lustre_handle lockh;
312 struct inode *inode = file->f_dentry->d_inode;
313 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
315 mutex_lock(&lli->lli_och_mutex);
316 if (fd->fd_omode & FMODE_WRITE) {
318 LASSERT(lli->lli_open_fd_write_count);
319 lli->lli_open_fd_write_count--;
320 } else if (fd->fd_omode & FMODE_EXEC) {
322 LASSERT(lli->lli_open_fd_exec_count);
323 lli->lli_open_fd_exec_count--;
326 LASSERT(lli->lli_open_fd_read_count);
327 lli->lli_open_fd_read_count--;
329 mutex_unlock(&lli->lli_och_mutex);
/* No matching OPEN lock cached: do the real close on the MDS. */
331 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
332 LDLM_IBITS, &policy, lockmode,
334 rc = ll_md_real_close(file->f_dentry->d_inode,
338 CERROR("Releasing a file %p with negative dentry %p. Name %s",
339 file, file->f_dentry, file->f_dentry->d_name.name);
343 LUSTRE_FPRIVATE(file) = NULL;
344 ll_file_data_put(fd);
345 ll_capa_close(inode);
/* While this returns an error code, fput() the caller does not, so we need
 * to make every effort to clean up all of our state here. Also, applications
 * rarely check close errors and even if an error is returned they will not
 * re-try the close call.
 */
/* VFS ->release() handler: tears down remote-ACL state, stops the
 * statahead thread if this fd started it, short-circuits for the root
 * dentry, clears async write errors, then does the MDS close. */
355 int ll_file_release(struct inode *inode, struct file *file)
357 struct ll_file_data *fd;
358 struct ll_sb_info *sbi = ll_i2sbi(inode);
359 struct ll_inode_info *lli = ll_i2info(inode);
363 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
364 PFID(ll_inode2fid(inode)), inode);
366 #ifdef CONFIG_FS_POSIX_ACL
367 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
368 inode == inode->i_sb->s_root->d_inode) {
369 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
372 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
373 fd->fd_flags &= ~LL_FILE_RMTACL;
374 rct_del(&sbi->ll_rct, current_pid());
375 et_search_free(&sbi->ll_et, current_pid());
380 if (inode->i_sb->s_root != file->f_dentry)
381 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
382 fd = LUSTRE_FPRIVATE(file);
385 /* The last ref on @file, maybe not the the owner pid of statahead.
386 * Different processes can open the same dir, "ll_opendir_key" means:
387 * it is me that should stop the statahead thread. */
388 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
389 lli->lli_opendir_pid != 0)
390 ll_stop_statahead(inode, lli->lli_opendir_key);
392 if (inode->i_sb->s_root == file->f_dentry) {
393 LUSTRE_FPRIVATE(file) = NULL;
394 ll_file_data_put(fd);
398 if (!S_ISDIR(inode->i_mode)) {
399 if (lli->lli_clob != NULL)
400 lov_read_and_clear_async_rc(lli->lli_clob);
401 lli->lli_async_rc = 0;
404 rc = ll_md_close(sbi->ll_md_exp, inode, file);
406 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
407 libcfs_debug_dumplog();
/*
 * Issue an MDS intent-open RPC for @file.  @lmm/@lmmsize carry striping
 * parameters when the open is only setting stripe info; in the normal
 * case (both zero) an OPEN lock is requested.  On success the inode is
 * refreshed from the reply and the returned lock data is attached.
 */
412 static int ll_intent_file_open(struct file *file, void *lmm,
413 int lmmsize, struct lookup_intent *itp)
415 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
416 struct dentry *parent = file->f_dentry->d_parent;
417 const char *name = file->f_dentry->d_name.name;
418 const int len = file->f_dentry->d_name.len;
419 struct md_op_data *op_data;
420 struct ptlrpc_request *req = NULL;
421 __u32 opc = LUSTRE_OPC_ANY;
428 /* Usually we come here only for NFSD, and we want open lock.
429 But we can also get here with pre 2.6.15 patchless kernels, and in
430 that case that lock is also ok */
431 /* We can also get here if there was cached open handle in revalidate_it
432 * but it disappeared while we were getting from there to ll_file_open.
433 * But this means this file was closed and immediatelly opened which
434 * makes a good candidate for using OPEN lock */
435 /* If lmmsize & lmm are not 0, we are just setting stripe info
436 * parameters. No need for the open lock */
437 if (lmm == NULL && lmmsize == 0) {
438 itp->it_flags |= MDS_OPEN_LOCK;
439 if (itp->it_flags & FMODE_WRITE)
440 opc = LUSTRE_OPC_CREATE;
443 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
444 file->f_dentry->d_inode, name, len,
447 RETURN(PTR_ERR(op_data));
449 op_data->op_data = lmm;
450 op_data->op_data_size = lmmsize;
452 itp->it_flags |= MDS_OPEN_BY_FID;
453 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
454 &ll_md_blocking_ast, 0);
455 ll_finish_md_op_data(op_data);
457 /* reason for keep own exit path - don`t flood log
458 * with messages with -ESTALE errors.
/* On enqueue failure, drop any openhandle the server granted. */
460 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
461 it_open_error(DISP_OPEN_OPEN, itp))
463 ll_release_openhandle(file->f_dentry, itp);
467 if (it_disposition(itp, DISP_LOOKUP_NEG))
468 GOTO(out, rc = -ENOENT);
470 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
471 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
472 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
476 rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
477 if (!rc && itp->d.lustre.it_lock_mode)
478 ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
482 ptlrpc_req_finished(req);
483 ll_intent_drop_lock(itp);
/*
 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
 * not believe attributes if a few ioepoch holders exist. Attributes for
 * previous ioepoch if new one is opened are also skipped by MDS.
 * A zero @ioepoch or an unchanged value is a no-op.
 */
493 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
495 if (ioepoch && lli->lli_ioepoch != ioepoch) {
496 lli->lli_ioepoch = ioepoch;
497 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
498 ioepoch, PFID(&lli->lli_fid));
/*
 * Fill an obd_client_handle from the MDT reply body of an open intent
 * (file handle, FID, lease lock cookie, open flags) and register it
 * with the MD layer for open replay after recovery.
 */
502 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
503 struct obd_client_handle *och)
505 struct ptlrpc_request *req = it->d.lustre.it_data;
506 struct mdt_body *body;
508 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
509 och->och_fh = body->mbo_handle;
510 och->och_fid = body->mbo_fid1;
511 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
512 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
513 och->och_flags = it->it_flags;
515 return md_set_open_replay_data(md_exp, och, it);
/*
 * Finish the client-local part of an open: when an MDS open RPC was
 * performed (@och != NULL, per the elided condition) fill @och from
 * the intent and open the IO epoch returned by the MDT, then attach
 * @fd as the file's private data, init readahead state and record the
 * effective open mode.
 */
518 static int ll_local_open(struct file *file, struct lookup_intent *it,
519 struct ll_file_data *fd, struct obd_client_handle *och)
521 struct inode *inode = file->f_dentry->d_inode;
522 struct ll_inode_info *lli = ll_i2info(inode);
525 LASSERT(!LUSTRE_FPRIVATE(file));
530 struct ptlrpc_request *req = it->d.lustre.it_data;
531 struct mdt_body *body;
534 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
538 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
539 ll_ioepoch_open(lli, body->mbo_ioepoch);
542 LUSTRE_FPRIVATE(file) = fd;
543 ll_readahead_init(inode, &fd->fd_ras);
544 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
/* Open a file, and (for the very first open) create objects on the OSTs at
 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
 * creation or open until ll_lov_setstripe() ioctl is called.
 *
 * If we already have the stripe MD locally then we don't request it in
 * md_open(), by passing a lmm_size = 0.
 *
 * It is up to the application to ensure no other processes open this file
 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 * used. We might be able to avoid races of that sort by getting lli_open_sem
 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 */
/* VFS ->open() handler.  Reuses a cached MDS open handle of matching
 * mode when one exists; otherwise builds an IT_OPEN intent (outside
 * lli_och_mutex to avoid a blocking-AST deadlock) and performs the
 * MDS open.  Directory opens may also claim statahead ownership. */
562 int ll_file_open(struct inode *inode, struct file *file)
564 struct ll_inode_info *lli = ll_i2info(inode);
565 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
566 .it_flags = file->f_flags };
567 struct obd_client_handle **och_p = NULL;
568 __u64 *och_usecount = NULL;
569 struct ll_file_data *fd;
570 int rc = 0, opendir_set = 0;
573 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
574 PFID(ll_inode2fid(inode)), inode, file->f_flags);
576 it = file->private_data; /* XXX: compat macro */
577 file->private_data = NULL; /* prevent ll_local_open assertion */
579 fd = ll_file_data_get();
581 GOTO(out_openerr, rc = -ENOMEM);
/* First opener of a directory claims statahead ownership. */
584 if (S_ISDIR(inode->i_mode)) {
585 spin_lock(&lli->lli_sa_lock);
586 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
587 lli->lli_opendir_pid == 0) {
588 lli->lli_opendir_key = fd;
589 lli->lli_opendir_pid = current_pid();
592 spin_unlock(&lli->lli_sa_lock);
595 if (inode->i_sb->s_root == file->f_dentry) {
596 LUSTRE_FPRIVATE(file) = fd;
600 if (!it || !it->d.lustre.it_disposition) {
601 /* Convert f_flags into access mode. We cannot use file->f_mode,
602 * because everything but O_ACCMODE mask was stripped from
604 if ((oit.it_flags + 1) & O_ACCMODE)
606 if (file->f_flags & O_TRUNC)
607 oit.it_flags |= FMODE_WRITE;
609 /* kernel only call f_op->open in dentry_open. filp_open calls
610 * dentry_open after call to open_namei that checks permissions.
611 * Only nfsd_open call dentry_open directly without checking
612 * permissions and because of that this code below is safe. */
613 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
614 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
616 /* We do not want O_EXCL here, presumably we opened the file
617 * already? XXX - NFS implications? */
618 oit.it_flags &= ~O_EXCL;
620 /* bug20584, if "it_flags" contains O_CREAT, the file will be
621 * created if necessary, then "IT_CREAT" should be set to keep
622 * consistent with it */
623 if (oit.it_flags & O_CREAT)
624 oit.it_op |= IT_CREAT;
630 /* Let's see if we have file open on MDS already. */
631 if (it->it_flags & FMODE_WRITE) {
632 och_p = &lli->lli_mds_write_och;
633 och_usecount = &lli->lli_open_fd_write_count;
634 } else if (it->it_flags & FMODE_EXEC) {
635 och_p = &lli->lli_mds_exec_och;
636 och_usecount = &lli->lli_open_fd_exec_count;
638 och_p = &lli->lli_mds_read_och;
639 och_usecount = &lli->lli_open_fd_read_count;
642 mutex_lock(&lli->lli_och_mutex);
643 if (*och_p) { /* Open handle is present */
644 if (it_disposition(it, DISP_OPEN_OPEN)) {
645 /* Well, there's extra open request that we do not need,
646 let's close it somehow. This will decref request. */
647 rc = it_open_error(DISP_OPEN_OPEN, it);
649 mutex_unlock(&lli->lli_och_mutex);
650 GOTO(out_openerr, rc);
653 ll_release_openhandle(file->f_dentry, it);
657 rc = ll_local_open(file, it, fd, NULL);
660 mutex_unlock(&lli->lli_och_mutex);
661 GOTO(out_openerr, rc);
664 LASSERT(*och_usecount == 0);
665 if (!it->d.lustre.it_disposition) {
666 /* We cannot just request lock handle now, new ELC code
667 means that one of other OPEN locks for this file
668 could be cancelled, and since blocking ast handler
669 would attempt to grab och_mutex as well, that would
670 result in a deadlock */
671 mutex_unlock(&lli->lli_och_mutex);
672 it->it_create_mode |= M_CHECK_STALE;
673 rc = ll_intent_file_open(file, NULL, 0, it);
674 it->it_create_mode &= ~M_CHECK_STALE;
676 GOTO(out_openerr, rc);
680 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
682 GOTO(out_och_free, rc = -ENOMEM);
686 /* md_intent_lock() didn't get a request ref if there was an
687 * open error, so don't do cleanup on the request here
689 /* XXX (green): Should not we bail out on any error here, not
690 * just open error? */
691 rc = it_open_error(DISP_OPEN_OPEN, it);
693 GOTO(out_och_free, rc);
695 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
696 "inode %p: disposition %x, status %d\n", inode,
697 it_disposition(it, ~0), it->d.lustre.it_status);
699 rc = ll_local_open(file, it, fd, *och_p);
701 GOTO(out_och_free, rc);
703 mutex_unlock(&lli->lli_och_mutex);
706 /* Must do this outside lli_och_mutex lock to prevent deadlock where
707 different kind of OPEN lock for this same inode gets cancelled
708 by ldlm_cancel_lru */
709 if (!S_ISREG(inode->i_mode))
710 GOTO(out_och_free, rc);
/* Delay OST object creation for O_LOV_DELAY_CREATE / read-only opens
 * on files that have no striping yet. */
714 if (!lli->lli_has_smd &&
715 (cl_is_lov_delay_create(file->f_flags) ||
716 (file->f_mode & FMODE_WRITE) == 0)) {
717 CDEBUG(D_INODE, "object creation was delayed\n");
718 GOTO(out_och_free, rc);
720 cl_lov_delay_create_clear(&file->f_flags);
721 GOTO(out_och_free, rc);
725 if (och_p && *och_p) {
726 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
727 *och_p = NULL; /* OBD_FREE writes some magic there */
730 mutex_unlock(&lli->lli_och_mutex);
733 if (opendir_set != 0)
734 ll_stop_statahead(inode, lli->lli_opendir_key);
736 ll_file_data_put(fd);
738 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
741 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
742 ptlrpc_req_finished(it->d.lustre.it_data);
743 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for lease DLM locks: on a blocking callback, cancel the
 * lease lock asynchronously (the lease is thereby "broken"); the
 * CANCELING branch is elided in this excerpt.  Unlike the normal
 * ll_md_blocking_ast this handler does no openhandle cleanup — see the
 * LDLM_FL_EXCL comment in ll_lease_open().
 */
749 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
750 struct ldlm_lock_desc *desc, void *data, int flag)
753 struct lustre_handle lockh;
757 case LDLM_CB_BLOCKING:
758 ldlm_lock2handle(lock, &lockh);
759 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
761 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
765 case LDLM_CB_CANCELING:
/*
 * Acquire a lease and open the file.
 *
 * @fmode must be exactly FMODE_READ or FMODE_WRITE.  When @file is
 * given, the lease reuses the existing open handle (and the MDT is
 * told via op_handle that it belongs to the same owner) — only a sole
 * opener may take a lease.  Returns the new obd_client_handle or an
 * ERR_PTR; on any failure after the open succeeded, the open lock is
 * cancelled and the openhandle closed.
 */
775 static struct obd_client_handle *
776 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
779 struct lookup_intent it = { .it_op = IT_OPEN };
780 struct ll_sb_info *sbi = ll_i2sbi(inode);
781 struct md_op_data *op_data;
782 struct ptlrpc_request *req = NULL;
783 struct lustre_handle old_handle = { 0 };
784 struct obd_client_handle *och = NULL;
789 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
790 RETURN(ERR_PTR(-EINVAL));
793 struct ll_inode_info *lli = ll_i2info(inode);
794 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
795 struct obd_client_handle **och_p;
798 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
799 RETURN(ERR_PTR(-EPERM));
801 /* Get the openhandle of the file */
803 mutex_lock(&lli->lli_och_mutex);
804 if (fd->fd_lease_och != NULL) {
805 mutex_unlock(&lli->lli_och_mutex);
/* No private handle yet: steal the shared one if we are the
 * only opener of this mode. */
809 if (fd->fd_och == NULL) {
810 if (file->f_mode & FMODE_WRITE) {
811 LASSERT(lli->lli_mds_write_och != NULL);
812 och_p = &lli->lli_mds_write_och;
813 och_usecount = &lli->lli_open_fd_write_count;
815 LASSERT(lli->lli_mds_read_och != NULL);
816 och_p = &lli->lli_mds_read_och;
817 och_usecount = &lli->lli_open_fd_read_count;
819 if (*och_usecount == 1) {
826 mutex_unlock(&lli->lli_och_mutex);
827 if (rc < 0) /* more than 1 opener */
830 LASSERT(fd->fd_och != NULL);
831 old_handle = fd->fd_och->och_fh;
836 RETURN(ERR_PTR(-ENOMEM));
838 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
839 LUSTRE_OPC_ANY, NULL);
841 GOTO(out, rc = PTR_ERR(op_data));
843 /* To tell the MDT this openhandle is from the same owner */
844 op_data->op_handle = old_handle;
846 it.it_flags = fmode | open_flags;
847 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
848 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
849 &ll_md_blocking_lease_ast,
850 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
851 * it can be cancelled which may mislead applications that the lease is
853 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
854 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
855 * doesn't deal with openhandle, so normal openhandle will be leaked. */
856 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
857 ll_finish_md_op_data(op_data);
858 ptlrpc_req_finished(req);
860 GOTO(out_release_it, rc);
862 if (it_disposition(&it, DISP_LOOKUP_NEG))
863 GOTO(out_release_it, rc = -ENOENT);
865 rc = it_open_error(DISP_OPEN_OPEN, &it);
867 GOTO(out_release_it, rc);
869 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
870 ll_och_fill(sbi->ll_md_exp, &it, och);
872 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
873 GOTO(out_close, rc = -EOPNOTSUPP);
875 /* already get lease, handle lease lock */
876 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
877 if (it.d.lustre.it_lock_mode == 0 ||
878 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
879 /* open lock must return for lease */
880 CERROR(DFID "lease granted but no open lock, %d/"LPU64".\n",
881 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
882 it.d.lustre.it_lock_bits);
883 GOTO(out_close, rc = -EPROTO);
886 ll_intent_release(&it);
890 /* Cancel open lock */
891 if (it.d.lustre.it_lock_mode != 0) {
892 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
893 it.d.lustre.it_lock_mode);
894 it.d.lustre.it_lock_mode = 0;
895 och->och_lease_handle.cookie = 0ULL;
897 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
899 CERROR("%s: error closing file "DFID": %d\n",
900 ll_get_fsname(inode->i_sb, NULL, 0),
901 PFID(&ll_i2info(inode)->lli_fid), rc2);
902 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
904 ll_intent_release(&it);
/*
 * Release lease and close the file.
 * It will check if the lease has ever broken.
 * If the lease lock was not already cancelled it is cancelled here;
 * *lease_broken (optional) reports whether it had been broken before.
 * Returns the result of closing the MDS openhandle.
 */
915 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
918 struct ldlm_lock *lock;
919 bool cancelled = true;
923 lock = ldlm_handle2lock(&och->och_lease_handle);
925 lock_res_and_lock(lock);
926 cancelled = ldlm_is_cancel(lock);
927 unlock_res_and_lock(lock);
931 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
932 PFID(&ll_i2info(inode)->lli_fid), cancelled);
935 ldlm_cli_cancel(&och->och_lease_handle, 0);
936 if (lease_broken != NULL)
937 *lease_broken = cancelled;
939 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
/* Fills the obdo with the attributes for the lsm */
/* Issues an async getattr to the OSTs for every stripe and waits for
 * the set; @dv_flags may request a server-side read/write flush
 * (OBD_FL_SRVLOCK / OBD_FL_FLUSH) for data-version stability.  Only
 * the merged-attribute valid bits are kept in the result. */
945 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
946 struct obd_capa *capa, struct obdo *obdo,
947 __u64 ioepoch, int dv_flags)
949 struct ptlrpc_request_set *set;
950 struct obd_info oinfo = { { { 0 } } };
955 LASSERT(lsm != NULL);
959 oinfo.oi_oa->o_oi = lsm->lsm_oi;
960 oinfo.oi_oa->o_mode = S_IFREG;
961 oinfo.oi_oa->o_ioepoch = ioepoch;
962 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
963 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
964 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
965 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
966 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
967 OBD_MD_FLDATAVERSION;
968 oinfo.oi_capa = capa;
969 if (dv_flags & (LL_DV_WR_FLUSH | LL_DV_RD_FLUSH)) {
970 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
971 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
972 if (dv_flags & LL_DV_WR_FLUSH)
973 oinfo.oi_oa->o_flags |= OBD_FL_FLUSH;
976 set = ptlrpc_prep_set();
978 CERROR("can't allocate ptlrpc set\n");
981 rc = obd_getattr_async(exp, &oinfo, set);
983 rc = ptlrpc_set_wait(set);
984 ptlrpc_set_destroy(set);
987 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
988 OBD_MD_FLATIME | OBD_MD_FLMTIME |
989 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
990 OBD_MD_FLDATAVERSION | OBD_MD_FLFLAGS);
/* A requested write flush must be confirmed by the OST reply. */
991 if (dv_flags & LL_DV_WR_FLUSH &&
992 !(oinfo.oi_oa->o_valid & OBD_MD_FLFLAGS &&
993 oinfo.oi_oa->o_flags & OBD_FL_FLUSH))
/*
 * Performs the getattr on the inode and updates its fields.
 * If @sync != 0, perform the getattr under the server-side lock.
 * Fetches the stripe attributes via ll_lsm_getattr() and refreshes the
 * in-core inode from the returned obdo.
 */
1003 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
1004 __u64 ioepoch, int sync)
1006 struct obd_capa *capa = ll_mdscapa_get(inode);
1007 struct lov_stripe_md *lsm;
1011 lsm = ccc_inode_lsm_get(inode);
1012 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
1013 capa, obdo, ioepoch, sync ? LL_DV_RD_FLUSH : 0);
1016 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
1018 obdo_refresh_inode(inode, obdo, obdo->o_valid);
1019 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
1020 " blksize %lu\n", POSTID(oi), i_size_read(inode),
1021 (unsigned long long)inode->i_blocks,
1022 (unsigned long)ll_inode_blksize(inode));
1024 ccc_inode_lsm_put(inode, lsm);
/*
 * Merge the MDS-provided timestamps cached in lli_lvb with the
 * OST-side attributes from the cl_object, taking the newest of each
 * time and updating i_size/i_blocks, all under the inode size lock.
 */
1028 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
1030 struct ll_inode_info *lli = ll_i2info(inode);
1031 struct cl_object *obj = lli->lli_clob;
1032 struct cl_attr *attr = ccc_env_thread_attr(env);
1038 ll_inode_size_lock(inode);
1039 /* merge timestamps the most recently obtained from mds with
1040 timestamps obtained from osts */
1041 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
1042 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
1043 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
1044 inode_init_lvb(inode, &lvb);
1046 cl_object_attr_lock(obj);
1047 rc = cl_object_attr_get(env, obj, attr);
1048 cl_object_attr_unlock(obj);
1051 if (lvb.lvb_atime < attr->cat_atime)
1052 lvb.lvb_atime = attr->cat_atime;
1053 if (lvb.lvb_ctime < attr->cat_ctime)
1054 lvb.lvb_ctime = attr->cat_ctime;
1055 if (lvb.lvb_mtime < attr->cat_mtime)
1056 lvb.lvb_mtime = attr->cat_mtime;
1058 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
1059 PFID(&lli->lli_fid), attr->cat_size);
1060 cl_isize_write_nolock(inode, attr->cat_size);
1062 inode->i_blocks = attr->cat_blocks;
1064 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1065 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1066 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1068 ll_inode_size_unlock(inode);
/*
 * Glimpse the current OST attributes for @lsm (no flush, no capa) and
 * copy size/blocks/timestamps into the caller-supplied stat buffer.
 */
1073 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1076 struct obdo obdo = { 0 };
1079 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1081 st->st_size = obdo.o_size;
1082 st->st_blocks = obdo.o_blocks;
1083 st->st_mtime = obdo.o_mtime;
1084 st->st_atime = obdo.o_atime;
1085 st->st_ctime = obdo.o_ctime;
/*
 * Return true when atime updates should be skipped for @file,
 * mirroring the checks done by the kernel's file_accessed() and
 * touch_atime(): O_NOATIME, S_NOATIME/IS_NOATIME on the inode, and
 * the mount/superblock noatime/nodiratime/read-only flags.
 */
1092 static bool file_is_noatime(const struct file *file)
1092 const struct vfsmount *mnt = file->f_path.mnt;
1093 const struct inode *inode = file->f_path.dentry->d_inode;
1095 /* Adapted from file_accessed() and touch_atime().*/
1096 if (file->f_flags & O_NOATIME)
1099 if (inode->i_flags & S_NOATIME)
1102 if (IS_NOATIME(inode))
1105 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1108 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1111 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/*
 * Initialize a cl_io for a read (@write == 0) or write on @file:
 * translate O_NONBLOCK/O_APPEND/O_SYNC/O_DIRECT into the io request,
 * pick the lock policy (never for nolock files, mandatory for append,
 * otherwise maybe) and record whether atime updates are suppressed.
 */
1117 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
1119 struct inode *inode = file->f_dentry->d_inode;
1121 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1123 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1124 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1125 file->f_flags & O_DIRECT ||
1128 io->ci_obj = ll_i2info(inode)->lli_clob;
1129 io->ci_lockreq = CILR_MAYBE;
1130 if (ll_file_nolock(file)) {
1131 io->ci_lockreq = CILR_NEVER;
1132 io->ci_no_srvlock = 1;
1133 } else if (file->f_flags & O_APPEND) {
1134 io->ci_lockreq = CILR_MANDATORY;
1137 io->ci_noatime = file_is_noatime(file);
/*
 * Common entry for all read/write variants.  Sets up a cl_io for @iot
 * at *@ppos/@count, feeds it either normal iovec arguments or splice
 * arguments from @args, serializes writes against truncate via
 * lli_write_mutex / lli_trunc_sem, runs the cl_io loop, updates *ppos,
 * accounts read/write bytes in stats, and tracks fd_write_failed.
 * NOTE(review): the restart path (ci_need_restart with partial result)
 * is only partially visible in this excerpt.
 */
1141 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1142 struct file *file, enum cl_io_type iot,
1143 loff_t *ppos, size_t count)
1145 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
1146 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1151 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zd\n",
1152 file->f_dentry->d_name.name, iot, *ppos, count);
1155 io = ccc_env_thread_io(env);
1156 ll_io_init(io, file, iot == CIT_WRITE);
1158 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1159 struct vvp_io *vio = vvp_env_io(env);
1160 struct ccc_io *cio = ccc_env_io(env);
1161 int write_mutex_locked = 0;
1163 cio->cui_fd = LUSTRE_FPRIVATE(file);
1164 vio->cui_io_subtype = args->via_io_subtype;
1166 switch (vio->cui_io_subtype) {
1168 cio->cui_iov = args->u.normal.via_iov;
1169 cio->cui_nrsegs = args->u.normal.via_nrsegs;
1170 cio->cui_tot_nrsegs = cio->cui_nrsegs;
1171 cio->cui_iocb = args->u.normal.via_iocb;
1172 if ((iot == CIT_WRITE) &&
1173 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1174 if (mutex_lock_interruptible(&lli->
1176 GOTO(out, result = -ERESTARTSYS);
1177 write_mutex_locked = 1;
1179 down_read(&lli->lli_trunc_sem);
1182 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1183 vio->u.splice.cui_flags = args->u.splice.via_flags;
1186 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
1189 result = cl_io_loop(env, io);
1190 if (args->via_io_subtype == IO_NORMAL)
1191 up_read(&lli->lli_trunc_sem);
1192 if (write_mutex_locked)
1193 mutex_unlock(&lli->lli_write_mutex);
1195 /* cl_io_rw_init() handled IO */
1196 result = io->ci_result;
1199 if (io->ci_nob > 0) {
1200 result = io->ci_nob;
1201 *ppos = io->u.ci_wr.wr.crw_pos;
1205 cl_io_fini(env, io);
1206 /* If any bit been read/written (result != 0), we just return
1207 * short read/write instead of restart io. */
1208 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1209 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
1210 iot == CIT_READ ? "read" : "write",
1211 file->f_dentry->d_name.name, *ppos, count);
1212 LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
1216 if (iot == CIT_READ) {
1218 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1219 LPROC_LL_READ_BYTES, result);
1220 } else if (iot == CIT_WRITE) {
1222 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1223 LPROC_LL_WRITE_BYTES, result);
1224 fd->fd_write_failed = false;
1225 } else if (result != -ERESTARTSYS) {
1226 fd->fd_write_failed = true;
1229 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
/*
 * Validate an iovec array and compute its total byte count.
 * Rejects negative segment lengths / cumulative overflow with -EINVAL and
 * verifies user-space readability of each segment via access_ok().
 */
1236 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1238 static int ll_file_get_iov_count(const struct iovec *iov,
1239 unsigned long *nr_segs, size_t *count)
1244 for (seg = 0; seg < *nr_segs; seg++) {
1245 const struct iovec *iv = &iov[seg];
1248 * If any segment has a negative length, or the cumulative
1249 * length ever wraps negative then return -EINVAL.
1252 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1254 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
/* Inaccessible segment: truncate the count at this point. */
1259 cnt -= iv->iov_len; /* This segment is no good */
/*
 * aio_read entry point: validate the iovec, fill vvp_io_args for IO_NORMAL
 * and delegate to ll_file_io_generic() with CIT_READ.
 */
1266 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1267 unsigned long nr_segs, loff_t pos)
1270 struct vvp_io_args *args;
1276 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1280 env = cl_env_get(&refcheck);
1282 RETURN(PTR_ERR(env));
1284 args = vvp_env_args(env, IO_NORMAL);
/* Cast drops const; args take a mutable iovec pointer. */
1285 args->u.normal.via_iov = (struct iovec *)iov;
1286 args->u.normal.via_nrsegs = nr_segs;
1287 args->u.normal.via_iocb = iocb;
1289 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1290 &iocb->ki_pos, count);
1291 cl_env_put(env, &refcheck);
/*
 * Synchronous read(2) entry point: wrap the user buffer in a single-segment
 * iovec plus a sync kiocb and forward to ll_file_aio_read().
 */
1295 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1299 struct iovec *local_iov;
1300 struct kiocb *kiocb;
1305 env = cl_env_get(&refcheck);
1307 RETURN(PTR_ERR(env));
/* Use per-thread scratch iovec/kiocb from the cl environment. */
1309 local_iov = &vvp_env_info(env)->vti_local_iov;
1310 kiocb = &vvp_env_info(env)->vti_kiocb;
1311 local_iov->iov_base = (void __user *)buf;
1312 local_iov->iov_len = count;
1313 init_sync_kiocb(kiocb, file);
1314 kiocb->ki_pos = *ppos;
/* Kernel-version compatibility: field holding the byte count differs. */
1315 #ifdef HAVE_KIOCB_KI_LEFT
1316 kiocb->ki_left = count;
1318 kiocb->ki_nbytes = count;
1321 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1322 *ppos = kiocb->ki_pos;
1324 cl_env_put(env, &refcheck);
/*
 * aio_write entry point: validate the iovec, fill vvp_io_args for IO_NORMAL
 * and delegate to ll_file_io_generic() with CIT_WRITE.
 */
1329 * Write to a file (through the page cache).
1332 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1333 unsigned long nr_segs, loff_t pos)
1336 struct vvp_io_args *args;
1342 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1346 env = cl_env_get(&refcheck);
1348 RETURN(PTR_ERR(env));
1350 args = vvp_env_args(env, IO_NORMAL);
/* Cast drops const; args take a mutable iovec pointer. */
1351 args->u.normal.via_iov = (struct iovec *)iov;
1352 args->u.normal.via_nrsegs = nr_segs;
1353 args->u.normal.via_iocb = iocb;
1355 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1356 &iocb->ki_pos, count);
1357 cl_env_put(env, &refcheck);
/*
 * Synchronous write(2) entry point: wrap the user buffer in a single-segment
 * iovec plus a sync kiocb and forward to ll_file_aio_write().
 * Mirrors ll_file_read() above.
 */
1361 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1365 struct iovec *local_iov;
1366 struct kiocb *kiocb;
1371 env = cl_env_get(&refcheck);
1373 RETURN(PTR_ERR(env));
/* Use per-thread scratch iovec/kiocb from the cl environment. */
1375 local_iov = &vvp_env_info(env)->vti_local_iov;
1376 kiocb = &vvp_env_info(env)->vti_kiocb;
1377 local_iov->iov_base = (void __user *)buf;
1378 local_iov->iov_len = count;
1379 init_sync_kiocb(kiocb, file);
1380 kiocb->ki_pos = *ppos;
/* Kernel-version compatibility: field holding the byte count differs. */
1381 #ifdef HAVE_KIOCB_KI_LEFT
1382 kiocb->ki_left = count;
1384 kiocb->ki_nbytes = count;
1387 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1388 *ppos = kiocb->ki_pos;
1390 cl_env_put(env, &refcheck);
/*
 * splice_read entry point: fill vvp_io_args for IO_SPLICE (target pipe and
 * flags) and run a CIT_READ through ll_file_io_generic().
 */
1395 * Send file content (through pagecache) somewhere with helper
1397 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1398 struct pipe_inode_info *pipe, size_t count,
1402 struct vvp_io_args *args;
1407 env = cl_env_get(&refcheck);
1409 RETURN(PTR_ERR(env));
1411 args = vvp_env_args(env, IO_SPLICE);
1412 args->u.splice.via_pipe = pipe;
1413 args->u.splice.via_flags = flags;
1415 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1416 cl_env_put(env, &refcheck);
/*
 * Ask the data export to recreate the OST objects of @inode (identified by
 * @oi, on stripe @ost_idx) by re-issuing obd_create() with a copy of the
 * current striping (lsm2) and OBD_FL_RECREATE_OBJS set.
 */
1420 static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
1423 struct obd_export *exp = ll_i2dtexp(inode);
1424 struct obd_trans_info oti = { 0 };
1425 struct obdo *oa = NULL;
1428 struct lov_stripe_md *lsm = NULL, *lsm2;
1435 lsm = ccc_inode_lsm_get(inode);
/* Nothing to recreate if the file has no OST objects yet. */
1436 if (!lsm_has_objects(lsm))
1437 GOTO(out, rc = -ENOENT);
1439 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1440 (lsm->lsm_stripe_count));
1442 OBD_ALLOC_LARGE(lsm2, lsm_size);
1444 GOTO(out, rc = -ENOMEM);
/* o_nlink is (ab)used here to carry the target OST index. */
1447 oa->o_nlink = ost_idx;
1448 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1449 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1450 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1451 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1452 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1453 memcpy(lsm2, lsm, lsm_size);
/* Hold the size lock across create so the layout copy stays consistent. */
1454 ll_inode_size_lock(inode);
1455 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1456 ll_inode_size_unlock(inode);
1458 OBD_FREE_LARGE(lsm2, lsm_size);
1461 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_RECREATE_OBJ handler: root-only; copies a ll_recreate_obj request
 * from user space and recreates the object by MDT0-sequence object id.
 */
1466 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1468 struct ll_recreate_obj ucreat;
1472 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1475 if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1479 ostid_set_seq_mdt0(&oi);
1480 ostid_set_id(&oi, ucreat.lrc_id);
1481 RETURN(ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx));
/*
 * LL_IOC_RECREATE_FID handler: root-only; converts a user-supplied lu_fid to
 * an ost_id and derives the OST index from bits 16-31 of the fid sequence.
 */
1484 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1491 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1494 if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1497 fid_to_ostid(&fid, &oi);
1498 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1499 RETURN(ll_lov_recreate(inode, &oi, ost_idx));
/*
 * Set the striping EA on @inode by performing an IT_OPEN intent with the
 * user-supplied lov_user_md. Fails with -EEXIST if a layout already exists.
 * The open handle obtained by the intent is released before returning.
 */
1502 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1503 __u64 flags, struct lov_user_md *lum,
1506 struct lov_stripe_md *lsm = NULL;
1507 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1511 lsm = ccc_inode_lsm_get(inode);
1513 ccc_inode_lsm_put(inode, lsm);
1514 CDEBUG(D_IOCTL, "stripe already exists for inode "DFID"\n",
1515 PFID(ll_inode2fid(inode)));
1516 GOTO(out, rc = -EEXIST);
1519 ll_inode_size_lock(inode);
1520 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1522 GOTO(out_unlock, rc);
1523 rc = oit.d.lustre.it_status;
1525 GOTO(out_req_free, rc);
/* The intent open succeeded; drop the server open handle right away. */
1527 ll_release_openhandle(file->f_dentry, &oit);
1530 ll_inode_size_unlock(inode);
1531 ll_intent_release(&oit);
1532 ccc_inode_lsm_put(inode, lsm);
1534 cl_lov_delay_create_clear(&file->f_flags);
1537 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA of @filename (child of @inode) from the MDS via
 * md_getattr_name(), convert it from little-endian wire format to host
 * endianness, and return it through *lmmp / *lmm_size. The caller owns
 * *request and must finish it.
 */
1541 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1542 struct lov_mds_md **lmmp, int *lmm_size,
1543 struct ptlrpc_request **request)
1545 struct ll_sb_info *sbi = ll_i2sbi(inode);
1546 struct mdt_body *body;
1547 struct lov_mds_md *lmm = NULL;
1548 struct ptlrpc_request *req = NULL;
1549 struct md_op_data *op_data;
1552 rc = ll_get_default_mdsize(sbi, &lmmsize);
1556 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1557 strlen(filename), lmmsize,
1558 LUSTRE_OPC_ANY, NULL);
1559 if (IS_ERR(op_data))
1560 RETURN(PTR_ERR(op_data));
1562 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1563 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1564 ll_finish_md_op_data(op_data);
1566 CDEBUG(D_INFO, "md_getattr_name failed "
1567 "on %s: rc %d\n", filename, rc);
1571 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1572 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1574 lmmsize = body->mbo_eadatasize;
1576 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1578 GOTO(out, rc = -ENODATA);
1581 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1582 LASSERT(lmm != NULL);
/* Only plain v1/v3 layouts are supported here. */
1584 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1585 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1586 GOTO(out, rc = -EPROTO);
1590 * This is coming from the MDS, so is probably in
1591 * little endian. We convert it to host endian before
1592 * passing it to userspace.
1594 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1597 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1598 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1601 /* if function called for directory - we should
1602 * avoid swab not existent lsm objects */
1603 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1604 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1605 if (S_ISREG(body->mbo_mode))
1606 lustre_swab_lov_user_md_objects(
1607 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1609 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1610 lustre_swab_lov_user_md_v3(
1611 (struct lov_user_md_v3 *)lmm);
1612 if (S_ISREG(body->mbo_mode))
1613 lustre_swab_lov_user_md_objects(
1614 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1621 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: root-only. Copies a lov_user_md (with one
 * ost_data slot) from user space and applies it via
 * ll_lov_setstripe_ea_info() with MDS_OPEN_HAS_OBJS | FMODE_WRITE.
 */
1626 static int ll_lov_setea(struct inode *inode, struct file *file,
1629 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1630 struct lov_user_md *lump;
1631 int lum_size = sizeof(struct lov_user_md) +
1632 sizeof(struct lov_user_ost_data);
1636 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1639 OBD_ALLOC_LARGE(lump, lum_size);
1643 if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1644 OBD_FREE_LARGE(lump, lum_size);
1648 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1650 OBD_FREE_LARGE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy a v1 lov_user_md first (it is a prefix
 * of v3), re-copy as v3 if the magic says so, apply the layout, then echo
 * the resulting striping back to the user buffer via LL_IOC_LOV_GETSTRIPE.
 */
1654 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1657 struct lov_user_md_v3 lumv3;
1658 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1659 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1660 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1662 __u64 flags = FMODE_WRITE;
1665 /* first try with v1 which is smaller than v3 */
1666 lum_size = sizeof(struct lov_user_md_v1);
1667 if (copy_from_user(lumv1, lumv1p, lum_size))
1670 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1671 lum_size = sizeof(struct lov_user_md_v3);
1672 if (copy_from_user(&lumv3, lumv3p, lum_size))
1676 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1678 struct lov_stripe_md *lsm;
/* Clear the user's stripe_count before echoing back the real layout. */
1681 put_user(0, &lumv1p->lmm_stripe_count);
1683 ll_layout_refresh(inode, &gen);
1684 lsm = ccc_inode_lsm_get(inode);
1685 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1686 0, lsm, (void *)arg);
1687 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_LOV_GETSTRIPE handler: hand the inode's current stripe metadata to
 * the data export's iocontrol, which copies it out to the user buffer.
 */
1692 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1694 struct lov_stripe_md *lsm;
1698 lsm = ccc_inode_lsm_get(inode);
1700 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1702 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_GROUP_LOCK handler: acquire a cl-layer group lock with gid @arg and
 * record it in the file descriptor. Only one group lock per fd; a race with
 * another thread acquiring it between the two lli_lock sections is detected
 * and the freshly obtained lock is dropped.
 */
1707 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1709 struct ll_inode_info *lli = ll_i2info(inode);
1710 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1711 struct ccc_grouplock grouplock;
1715 if (ll_file_nolock(file))
1716 RETURN(-EOPNOTSUPP);
1718 spin_lock(&lli->lli_lock);
1719 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1720 CWARN("group lock already existed with gid %lu\n",
1721 fd->fd_grouplock.cg_gid);
1722 spin_unlock(&lli->lli_lock);
1725 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1726 spin_unlock(&lli->lli_lock);
/* Enqueue outside lli_lock; may block unless O_NONBLOCK is set. */
1728 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1729 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1733 spin_lock(&lli->lli_lock);
1734 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1735 spin_unlock(&lli->lli_lock);
1736 CERROR("another thread just won the race\n");
1737 cl_put_grouplock(&grouplock);
1741 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1742 fd->fd_grouplock = grouplock;
1743 spin_unlock(&lli->lli_lock);
1745 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK handler: release the group lock with gid @arg held on
 * this fd. Warns and fails if no group lock is held or the gid mismatches.
 * The lock is dropped outside lli_lock.
 */
1749 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1751 struct ll_inode_info *lli = ll_i2info(inode);
1752 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1753 struct ccc_grouplock grouplock;
1756 spin_lock(&lli->lli_lock);
1757 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1758 spin_unlock(&lli->lli_lock);
1759 CWARN("no group lock held\n");
1762 LASSERT(fd->fd_grouplock.cg_lock != NULL);
1764 if (fd->fd_grouplock.cg_gid != arg) {
1765 CWARN("group lock %lu doesn't match current id %lu\n",
1766 arg, fd->fd_grouplock.cg_gid);
1767 spin_unlock(&lli->lli_lock);
/* Take a local copy so the lock can be put after dropping lli_lock. */
1771 grouplock = fd->fd_grouplock;
1772 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1773 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1774 spin_unlock(&lli->lli_lock);
1776 cl_put_grouplock(&grouplock);
1777 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1782 * Close inode open handle
1784 * \param dentry [in] dentry which contains the inode
1785 * \param it [in,out] intent which contains open info and result
1788 * \retval <0 failure
1790 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1792 struct inode *inode = dentry->d_inode;
1793 struct obd_client_handle *och;
1799 /* Root ? Do nothing. */
1800 if (dentry->d_inode->i_sb->s_root == dentry)
1803 /* No open handle to close? Move away */
1804 if (!it_disposition(it, DISP_OPEN_OPEN))
1807 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
/* Build a transient och from the intent and close it on the MDS. */
1809 OBD_ALLOC(och, sizeof(*och));
1811 GOTO(out, rc = -ENOMEM);
1813 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1815 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1818 /* this one is in place of ll_file_open */
1819 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1820 ptlrpc_req_finished(it->d.lustre.it_data);
1821 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1827 * Get size for inode for which FIEMAP mapping is requested.
1828 * Make the FIEMAP get_info call and returns the result.
1830 static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1833 struct obd_export *exp = ll_i2dtexp(inode);
1834 struct lov_stripe_md *lsm = NULL;
1835 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1836 __u32 vallen = num_bytes;
1840 /* Checks for fiemap flags */
/* Report the unsupported flags back to the caller, per fiemap convention. */
1841 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1842 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1846 /* Check for FIEMAP_FLAG_SYNC */
1847 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1848 rc = filemap_fdatawrite(inode->i_mapping);
1853 lsm = ccc_inode_lsm_get(inode);
1857 /* If the stripe_count > 1 and the application does not understand
1858 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1860 if (lsm->lsm_stripe_count > 1 &&
1861 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1862 GOTO(out, rc = -EOPNOTSUPP);
1864 fm_key.oa.o_oi = lsm->lsm_oi;
1865 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1867 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1868 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1869 /* If filesize is 0, then there would be no objects for mapping */
1870 if (fm_key.oa.o_size == 0) {
1871 fiemap->fm_mapped_extents = 0;
1875 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
/* Ask the LOV/OSC layers to fill in the extent mapping. */
1877 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1880 CERROR("obd_get_info failed: rc = %d\n", rc);
1883 ccc_inode_lsm_put(inode, lsm);
/*
 * OBD_IOC_FID2PATH handler: copy in a getinfo_fid2path header to learn the
 * requested path buffer length, allocate an output buffer of that size, let
 * the MDC resolve the fid to a path, and copy the result back out.
 * Permitted for CAP_DAC_READ_SEARCH or when the mount allows user fid2path.
 */
1887 int ll_fid2path(struct inode *inode, void *arg)
1889 struct obd_export *exp = ll_i2mdexp(inode);
1890 struct getinfo_fid2path *gfout, *gfin;
1894 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1895 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1898 /* Need to get the buflen */
1899 OBD_ALLOC_PTR(gfin);
1902 if (copy_from_user(gfin, arg, sizeof(*gfin))) {
/* gf_pathlen comes from user space; sizes the variable-length output. */
1907 outsize = sizeof(*gfout) + gfin->gf_pathlen;
1908 OBD_ALLOC(gfout, outsize);
1909 if (gfout == NULL) {
1913 memcpy(gfout, gfin, sizeof(*gfout));
1916 /* Call mdc_iocontrol */
1917 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1921 if (copy_to_user(arg, gfout, outsize))
1925 OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP handler: size the fiemap buffer from the user's
 * fm_extent_count, copy the request (and, when continuing a mapping, the
 * first extent) in, run ll_do_fiemap(), and copy the header plus mapped
 * extents back to user space.
 */
1929 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1931 struct ll_user_fiemap *fiemap_s;
1932 size_t num_bytes, ret_bytes;
1933 unsigned int extent_count;
1936 /* Get the extent count so we can calculate the size of
1937 * required fiemap buffer */
1938 if (get_user(extent_count,
1939 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
/* NOTE(review): extent_count is user-controlled and sizes the allocation;
 * presumably bounded elsewhere - verify against overflow. */
1941 num_bytes = sizeof(*fiemap_s) + (extent_count *
1942 sizeof(struct ll_fiemap_extent));
1944 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1945 if (fiemap_s == NULL)
1948 /* get the fiemap value */
1949 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1951 GOTO(error, rc = -EFAULT);
1953 /* If fm_extent_count is non-zero, read the first extent since
1954 * it is used to calculate end_offset and device from previous
1957 if (copy_from_user(&fiemap_s->fm_extents[0],
1958 (char __user *)arg + sizeof(*fiemap_s),
1959 sizeof(struct ll_fiemap_extent)))
1960 GOTO(error, rc = -EFAULT);
1963 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1967 ret_bytes = sizeof(struct ll_user_fiemap);
1969 if (extent_count != 0)
1970 ret_bytes += (fiemap_s->fm_mapped_extents *
1971 sizeof(struct ll_fiemap_extent));
1973 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1977 OBD_FREE_LARGE(fiemap_s, num_bytes);
1982 * Read the data_version for inode.
1984 * This value is computed using stripe object version on OST.
1985 * Version is computed using server side locking.
1987 * @param sync if do sync on the OST side;
1989 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1990 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
1992 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
1994 struct lov_stripe_md *lsm = NULL;
1995 struct ll_sb_info *sbi = ll_i2sbi(inode);
1996 struct obdo *obdo = NULL;
2000 /* If no stripe, we consider version is 0. */
2001 lsm = ccc_inode_lsm_get(inode);
2002 if (!lsm_has_objects(lsm)) {
2004 CDEBUG(D_INODE, "No object for inode\n");
2008 OBD_ALLOC_PTR(obdo);
2010 GOTO(out, rc = -ENOMEM);
/* Getattr against the OSTs; the version comes back in the obdo. */
2012 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, flags);
2014 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
2017 *data_version = obdo->o_data_version;
2023 ccc_inode_lsm_put(inode, lsm);
2028 * Trigger a HSM release request for the provided inode.
2030 int ll_hsm_release(struct inode *inode)
2032 struct cl_env_nest nest;
2034 struct obd_client_handle *och = NULL;
2035 __u64 data_version = 0;
2039 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2040 ll_get_fsname(inode->i_sb, NULL, 0),
2041 PFID(&ll_i2info(inode)->lli_fid));
/* Take a write lease so no other client can modify during the release. */
2043 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2045 GOTO(out, rc = PTR_ERR(och));
2047 /* Grab latest data_version and [am]time values */
2048 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2052 env = cl_env_nested_get(&nest);
2054 GOTO(out, rc = PTR_ERR(env));
2056 ll_merge_lvb(env, inode);
2057 cl_env_nested_put(&nest, env);
2059 /* Release the file.
2060 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2061 * we still need it to pack l_remote_handle to MDT. */
2062 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
2068 if (och != NULL && !IS_ERR(och)) /* close the file */
2069 ll_lease_close(och, inode, NULL);
/* Scratch state for ll_swap_layouts(): saved [am]times, data versions and
 * the two inodes, kept together so the pair can be swapped/ordered easily. */
2074 struct ll_swap_stack {
2075 struct iattr ia1, ia2;
2077 struct inode *inode1, *inode2;
2078 bool check_dv1, check_dv2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS implementation: atomically swap the layouts of the
 * two files on the MDT. Validates permissions and same-filesystem, orders the
 * pair by fid to avoid deadlock, optionally takes group locks to flush dirty
 * cache, optionally verifies data versions, then issues the swap RPC and
 * restores mtime/atime if requested.
 */
2081 static int ll_swap_layouts(struct file *file1, struct file *file2,
2082 struct lustre_swap_layouts *lsl)
2084 struct mdc_swap_layouts msl;
2085 struct md_op_data *op_data;
2088 struct ll_swap_stack *llss = NULL;
2091 OBD_ALLOC_PTR(llss);
2095 llss->inode1 = file1->f_dentry->d_inode;
2096 llss->inode2 = file2->f_dentry->d_inode;
2098 if (!S_ISREG(llss->inode2->i_mode))
2099 GOTO(free, rc = -EINVAL);
2101 if (inode_permission(llss->inode1, MAY_WRITE) ||
2102 inode_permission(llss->inode2, MAY_WRITE))
2103 GOTO(free, rc = -EPERM);
2105 if (llss->inode2->i_sb != llss->inode1->i_sb)
2106 GOTO(free, rc = -EXDEV);
2108 /* we use 2 bool because it is easier to swap than 2 bits */
2109 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2110 llss->check_dv1 = true;
2112 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2113 llss->check_dv2 = true;
2115 /* we cannot use lsl->sl_dvX directly because we may swap them */
2116 llss->dv1 = lsl->sl_dv1;
2117 llss->dv2 = lsl->sl_dv2;
2119 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2120 if (rc == 0) /* same file, done! */
/* Order the pair by fid so concurrent swaps cannot deadlock. */
2123 if (rc < 0) { /* sequentialize it */
2124 swap(llss->inode1, llss->inode2);
2126 swap(llss->dv1, llss->dv2);
2127 swap(llss->check_dv1, llss->check_dv2);
2131 if (gid != 0) { /* application asks to flush dirty cache */
2132 rc = ll_get_grouplock(llss->inode1, file1, gid);
2136 rc = ll_get_grouplock(llss->inode2, file2, gid);
2138 ll_put_grouplock(llss->inode1, file1, gid);
2143 /* to be able to restore mtime and atime after swap
2144 * we need to first save them */
2146 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2147 llss->ia1.ia_mtime = llss->inode1->i_mtime;
2148 llss->ia1.ia_atime = llss->inode1->i_atime;
2149 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2150 llss->ia2.ia_mtime = llss->inode2->i_mtime;
2151 llss->ia2.ia_atime = llss->inode2->i_atime;
2152 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2155 /* ultimate check, before swaping the layouts we check if
2156 * dataversion has changed (if requested) */
2157 if (llss->check_dv1) {
2158 rc = ll_data_version(llss->inode1, &dv, 0);
2161 if (dv != llss->dv1)
2162 GOTO(putgl, rc = -EAGAIN);
2165 if (llss->check_dv2) {
2166 rc = ll_data_version(llss->inode2, &dv, 0);
2169 if (dv != llss->dv2)
2170 GOTO(putgl, rc = -EAGAIN);
2173 /* struct md_op_data is used to send the swap args to the mdt
2174 * only flags is missing, so we use struct mdc_swap_layouts
2175 * through the md_op_data->op_data */
2176 /* flags from user space have to be converted before they are send to
2177 * server, no flag is sent today, they are only used on the client */
2180 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2181 0, LUSTRE_OPC_ANY, &msl);
2182 if (IS_ERR(op_data))
2183 GOTO(free, rc = PTR_ERR(op_data));
2185 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2186 sizeof(*op_data), op_data, NULL);
2187 ll_finish_md_op_data(op_data);
2191 ll_put_grouplock(llss->inode2, file2, gid);
2192 ll_put_grouplock(llss->inode1, file1, gid);
2195 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2199 /* clear useless flags */
2200 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2201 llss->ia1.ia_valid &= ~ATTR_MTIME;
2202 llss->ia2.ia_valid &= ~ATTR_MTIME;
2205 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2206 llss->ia1.ia_valid &= ~ATTR_ATIME;
2207 llss->ia2.ia_valid &= ~ATTR_ATIME;
2210 /* update time if requested */
/* Note ia1/ia2 were saved pre-swap, so each is applied to the other file. */
2212 if (llss->ia2.ia_valid != 0) {
2213 mutex_lock(&llss->inode1->i_mutex);
2214 rc = ll_setattr(file1->f_dentry, &llss->ia2);
2215 mutex_unlock(&llss->inode1->i_mutex);
2218 if (llss->ia1.ia_valid != 0) {
2221 mutex_lock(&llss->inode2->i_mutex);
2222 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2223 mutex_unlock(&llss->inode2->i_mutex);
/*
 * Apply an HSM state set/clear mask to @inode by sending
 * LL_IOC_HSM_STATE_SET to the MDC. Flags outside HSM_USER_MASK require
 * CAP_SYS_ADMIN.
 */
2235 static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2237 struct md_op_data *op_data;
2240 /* Non-root users are forbidden to set or clear flags which are
2241 * NOT defined in HSM_USER_MASK. */
2242 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2243 !cfs_capable(CFS_CAP_SYS_ADMIN))
2246 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2247 LUSTRE_OPC_ANY, hss);
2248 if (IS_ERR(op_data))
2249 RETURN(PTR_ERR(op_data));
2251 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2252 sizeof(*op_data), op_data, NULL);
2254 ll_finish_md_op_data(op_data);
/*
 * LL_IOC_HSM_IMPORT handler: mark a regular file as archived/exists/released
 * in HSM state, then restore the user-supplied mode, uid/gid, size and
 * [am]times via ll_setattr_raw() so the imported file matches its archive
 * copy.
 */
2259 static int ll_hsm_import(struct inode *inode, struct file *file,
2260 struct hsm_user_import *hui)
2262 struct hsm_state_set *hss = NULL;
2263 struct iattr *attr = NULL;
2267 if (!S_ISREG(inode->i_mode))
2273 GOTO(out, rc = -ENOMEM);
2275 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2276 hss->hss_archive_id = hui->hui_archive_id;
2277 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2278 rc = ll_hsm_state_set(inode, hss);
2282 OBD_ALLOC_PTR(attr);
2284 GOTO(out, rc = -ENOMEM);
/* Force a regular-file mode; only rwx bits are taken from the request. */
2286 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2287 attr->ia_mode |= S_IFREG;
2288 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2289 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2290 attr->ia_size = hui->hui_size;
2291 attr->ia_mtime.tv_sec = hui->hui_mtime;
2292 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2293 attr->ia_atime.tv_sec = hui->hui_atime;
2294 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2296 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2297 ATTR_UID | ATTR_GID |
2298 ATTR_MTIME | ATTR_MTIME_SET |
2299 ATTR_ATIME | ATTR_ATIME_SET;
2301 rc = ll_setattr_raw(file->f_dentry, attr, true);
/*
 * unlocked_ioctl entry point for regular files: dispatch Lustre-specific
 * ioctls (striping, group locks, HSM, leases, fiemap, fid2path, ...) and
 * fall through to the per-device iocontrol handlers for anything else.
 */
2316 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2318 struct inode *inode = file->f_dentry->d_inode;
2319 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2323 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2324 PFID(ll_inode2fid(inode)), inode, cmd);
2325 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2327 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2328 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2332 case LL_IOC_GETFLAGS:
2333 /* Get the current value of the file flags */
2334 return put_user(fd->fd_flags, (int *)arg);
2335 case LL_IOC_SETFLAGS:
2336 case LL_IOC_CLRFLAGS:
2337 /* Set or clear specific file flags */
2338 /* XXX This probably needs checks to ensure the flags are
2339 * not abused, and to handle any flag side effects.
2341 if (get_user(flags, (int *) arg))
2344 if (cmd == LL_IOC_SETFLAGS) {
/* Disabling locking is only meaningful for O_DIRECT I/O. */
2345 if ((flags & LL_FILE_IGNORE_LOCK) &&
2346 !(file->f_flags & O_DIRECT)) {
2347 CERROR("%s: unable to disable locking on "
2348 "non-O_DIRECT file\n", current->comm);
2352 fd->fd_flags |= flags;
2354 fd->fd_flags &= ~flags;
2357 case LL_IOC_LOV_SETSTRIPE:
2358 RETURN(ll_lov_setstripe(inode, file, arg));
2359 case LL_IOC_LOV_SETEA:
2360 RETURN(ll_lov_setea(inode, file, arg));
2361 case LL_IOC_LOV_SWAP_LAYOUTS: {
2363 struct lustre_swap_layouts lsl;
2365 if (copy_from_user(&lsl, (char *)arg,
2366 sizeof(struct lustre_swap_layouts)))
/* Both files must be open for writing for a layout swap. */
2369 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2372 file2 = fget(lsl.sl_fd);
2377 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2378 rc = ll_swap_layouts(file, file2, &lsl);
2382 case LL_IOC_LOV_GETSTRIPE:
2383 RETURN(ll_lov_getstripe(inode, arg));
2384 case LL_IOC_RECREATE_OBJ:
2385 RETURN(ll_lov_recreate_obj(inode, arg));
2386 case LL_IOC_RECREATE_FID:
2387 RETURN(ll_lov_recreate_fid(inode, arg));
2388 case FSFILT_IOC_FIEMAP:
2389 RETURN(ll_ioctl_fiemap(inode, arg));
2390 case FSFILT_IOC_GETFLAGS:
2391 case FSFILT_IOC_SETFLAGS:
2392 RETURN(ll_iocontrol(inode, file, cmd, arg));
2393 case FSFILT_IOC_GETVERSION_OLD:
2394 case FSFILT_IOC_GETVERSION:
2395 RETURN(put_user(inode->i_generation, (int *)arg));
2396 case LL_IOC_GROUP_LOCK:
2397 RETURN(ll_get_grouplock(inode, file, arg));
2398 case LL_IOC_GROUP_UNLOCK:
2399 RETURN(ll_put_grouplock(inode, file, arg));
2400 case IOC_OBD_STATFS:
2401 RETURN(ll_obd_statfs(inode, (void *)arg));
2403 /* We need to special case any other ioctls we want to handle,
2404 * to send them to the MDS/OST as appropriate and to properly
2405 * network encode the arg field.
2406 case FSFILT_IOC_SETVERSION_OLD:
2407 case FSFILT_IOC_SETVERSION:
2409 case LL_IOC_FLUSHCTX:
2410 RETURN(ll_flush_ctx(inode));
2411 case LL_IOC_PATH2FID: {
2412 if (copy_to_user((void *)arg, ll_inode2fid(inode),
2413 sizeof(struct lu_fid)))
2418 case OBD_IOC_FID2PATH:
2419 RETURN(ll_fid2path(inode, (void *)arg));
2420 case LL_IOC_DATA_VERSION: {
2421 struct ioc_data_version idv;
2424 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
/* Only the two flush flags are honoured from user space. */
2427 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2428 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2430 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2436 case LL_IOC_GET_MDTIDX: {
2439 mdtidx = ll_get_mdt_idx(inode);
2443 if (put_user((int)mdtidx, (int*)arg))
2448 case OBD_IOC_GETDTNAME:
2449 case OBD_IOC_GETMDNAME:
2450 RETURN(ll_get_obd_name(inode, cmd, arg));
2451 case LL_IOC_HSM_STATE_GET: {
2452 struct md_op_data *op_data;
2453 struct hsm_user_state *hus;
2460 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2461 LUSTRE_OPC_ANY, hus);
2462 if (IS_ERR(op_data)) {
2464 RETURN(PTR_ERR(op_data));
2467 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2470 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2473 ll_finish_md_op_data(op_data);
2477 case LL_IOC_HSM_STATE_SET: {
2478 struct hsm_state_set *hss;
2485 if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2490 rc = ll_hsm_state_set(inode, hss);
2495 case LL_IOC_HSM_ACTION: {
2496 struct md_op_data *op_data;
2497 struct hsm_current_action *hca;
2504 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2505 LUSTRE_OPC_ANY, hca);
2506 if (IS_ERR(op_data)) {
2508 RETURN(PTR_ERR(op_data));
2511 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2514 if (copy_to_user((char *)arg, hca, sizeof(*hca)))
2517 ll_finish_md_op_data(op_data);
2521 case LL_IOC_SET_LEASE: {
2522 struct ll_inode_info *lli = ll_i2info(inode);
2523 struct obd_client_handle *och = NULL;
/* A write lease requires FMODE_WRITE; a read lease requires FMODE_READ. */
2529 if (!(file->f_mode & FMODE_WRITE))
2534 if (!(file->f_mode & FMODE_READ))
2539 mutex_lock(&lli->lli_och_mutex);
2540 if (fd->fd_lease_och != NULL) {
2541 och = fd->fd_lease_och;
2542 fd->fd_lease_och = NULL;
2544 mutex_unlock(&lli->lli_och_mutex);
2547 mode = och->och_flags &(FMODE_READ|FMODE_WRITE);
2548 rc = ll_lease_close(och, inode, &lease_broken);
2549 if (rc == 0 && lease_broken)
2555 /* return the type of lease or error */
2556 RETURN(rc < 0 ? rc : (int)mode);
2561 CDEBUG(D_INODE, "Set lease with mode %d\n", mode);
2563 /* apply for lease */
2564 och = ll_lease_open(inode, file, mode, 0);
2566 RETURN(PTR_ERR(och));
2569 mutex_lock(&lli->lli_och_mutex);
2570 if (fd->fd_lease_och == NULL) {
2571 fd->fd_lease_och = och;
2574 mutex_unlock(&lli->lli_och_mutex);
2576 /* impossible now that only excl is supported for now */
2577 ll_lease_close(och, inode, &lease_broken);
2582 case LL_IOC_GET_LEASE: {
2583 struct ll_inode_info *lli = ll_i2info(inode);
2584 struct ldlm_lock *lock = NULL;
2587 mutex_lock(&lli->lli_och_mutex);
2588 if (fd->fd_lease_och != NULL) {
2589 struct obd_client_handle *och = fd->fd_lease_och;
/* Report the lease mode only while the lease lock is not cancelled. */
2591 lock = ldlm_handle2lock(&och->och_lease_handle);
2593 lock_res_and_lock(lock);
2594 if (!ldlm_is_cancel(lock))
2595 rc = och->och_flags &
2596 (FMODE_READ | FMODE_WRITE);
2597 unlock_res_and_lock(lock);
2598 LDLM_LOCK_PUT(lock);
2601 mutex_unlock(&lli->lli_och_mutex);
2604 case LL_IOC_HSM_IMPORT: {
2605 struct hsm_user_import *hui;
2611 if (copy_from_user(hui, (void *)arg, sizeof(*hui))) {
2616 rc = ll_hsm_import(inode, file, hui);
/* Unknown cmd: offer it to registered iocontrol handlers, then the OSC. */
2626 ll_iocontrol_call(inode, file, cmd, arg, &err))
2629 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2635 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Commit an lseek result: validate @offset against [0, maxsize] (negative
 * allowed only with FMODE_UNSIGNED_OFFSET) and store it in f_pos,
 * invalidating f_version. Compat helper used when the kernel lacks
 * generic_file_llseek_size().
 */
2636 static inline loff_t
2637 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2639 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2641 if (offset > maxsize)
2644 if (offset != file->f_pos) {
2645 file->f_pos = offset;
2646 file->f_version = 0;
/*
 * Compat copy of the kernel's generic_file_llseek_size(): handle SEEK_CUR
 * without rewriting an unchanged f_pos, and SEEK_DATA/SEEK_HOLE against a
 * file that is all data with a virtual hole at @eof.
 */
2652 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2653 loff_t maxsize, loff_t eof)
2655 struct inode *inode = file->f_dentry->d_inode;
2663 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2664 * position-querying operation. Avoid rewriting the "same"
2665 * f_pos value back to the file because a concurrent read(),
2666 * write() or lseek() might have altered it
2671 * f_lock protects against read/modify/write race with other
2672 * SEEK_CURs. Note that parallel writes and reads behave
2675 mutex_lock(&inode->i_mutex);
2676 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2677 mutex_unlock(&inode->i_mutex);
2681 * In the generic case the entire file is data, so as long as
2682 * offset isn't at the end of the file then the offset is data.
2689 * There is a virtual hole at the end of the file, so as long as
2690 * offset isn't i_size or larger, return i_size.
2698 return llseek_execute(file, offset, maxsize);
/*
 * llseek entry point for Lustre files.  For SEEK_END/SEEK_HOLE/SEEK_DATA
 * the cluster-wide size must be fetched first via a glimpse RPC before
 * delegating to the (possibly compat) generic_file_llseek_size().
 */
2702 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2704 struct inode *inode = file->f_dentry->d_inode;
2705 loff_t retval, eof = 0;
/* retval here is only the provisional target position, used for tracing. */
2708 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2709 (origin == SEEK_CUR) ? file->f_pos : 0);
2710 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2711 PFID(ll_inode2fid(inode)), inode, retval, retval,
2713 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
/* These origins depend on i_size; refresh it from the OSTs first. */
2715 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2716 retval = ll_glimpse_size(inode);
2719 eof = i_size_read(inode);
2722 retval = ll_generic_file_llseek_size(file, offset, origin,
2723 ll_file_maxbytes(inode), eof);
/*
 * flush() handler (called on close(2) of every fd): does not write any
 * data itself, only reports asynchronous writeback errors recorded
 * earlier for this inode, collapsing any error to -EIO.
 */
2727 static int ll_flush(struct file *file, fl_owner_t id)
2729 struct inode *inode = file->f_dentry->d_inode;
2730 struct ll_inode_info *lli = ll_i2info(inode);
2731 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2734 LASSERT(!S_ISDIR(inode->i_mode));
2736 /* catch async errors that were recorded back when async writeback
2737 * failed for pages in this mapping. */
/* Consume-and-clear: the async error is reported at most once. */
2738 rc = lli->lli_async_rc;
2739 lli->lli_async_rc = 0;
2740 if (lli->lli_clob != NULL) {
2741 err = lov_read_and_clear_async_rc(lli->lli_clob);
2746 /* The application has been told write failure already.
2747 * Do not report failure again. */
2748 if (fd->fd_write_failed)
2750 return rc ? -EIO : 0;
2754 * Called to make sure a portion of file has been written out.
2755 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2757 * Return how many pages have been written.
/*
 * Builds and runs a CIT_FSYNC cl_io over [start, end] of @inode.
 * @ignore_layout lets the sync proceed even during a layout change.
 * Returns fio->fi_nr_written on success, negative errno on failure.
 */
2759 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2760 enum cl_fsync_mode mode, int ignore_layout)
2762 struct cl_env_nest nest;
2765 struct obd_capa *capa = NULL;
2766 struct cl_fsync_io *fio;
/* Reject any mode outside the four supported fsync modes. */
2770 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2771 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2774 env = cl_env_nested_get(&nest);
2776 RETURN(PTR_ERR(env));
/* OSS write capability accompanies the sync RPCs (if capas enabled). */
2778 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2780 io = ccc_env_thread_io(env);
2781 io->ci_obj = cl_i2info(inode)->lli_clob;
2782 io->ci_ignore_layout = ignore_layout;
2784 /* initialize parameters for sync */
2785 fio = &io->u.ci_fsync;
2786 fio->fi_capa = capa;
2787 fio->fi_start = start;
2789 fio->fi_fid = ll_inode2fid(inode);
2790 fio->fi_mode = mode;
2791 fio->fi_nr_written = 0;
2793 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2794 result = cl_io_loop(env, io);
2796 result = io->ci_result;
/* On success report the page count accumulated by the fsync io. */
2798 result = fio->fi_nr_written;
2799 cl_io_fini(env, io);
2800 cl_env_nested_put(&nest, env);
2808 * When dentry is provided (the 'else' case), *file->f_dentry may be
2809 * null and dentry must be used directly rather than pulled from
2810 * *file->f_dentry as is done otherwise.
/*
 * fsync()/fdatasync() handler.  Three kernel-ABI variants are selected by
 * the HAVE_FILE_FSYNC_* configure checks; older ones default start/end to
 * the whole file.  Flow: flush/wait page cache, fold in recorded async
 * writeback errors, MDC md_fsync for metadata, then for regular files an
 * OSC-level cl_sync_file_range(CL_FSYNC_ALL) which also updates
 * fd_write_failed so ll_flush() does not double-report.
 */
2813 #ifdef HAVE_FILE_FSYNC_4ARGS
2814 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2816 struct dentry *dentry = file->f_dentry;
2817 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2818 int ll_fsync(struct file *file, int datasync)
2820 struct dentry *dentry = file->f_dentry;
2822 loff_t end = LLONG_MAX;
2824 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2827 loff_t end = LLONG_MAX;
2829 struct inode *inode = dentry->d_inode;
2830 struct ll_inode_info *lli = ll_i2info(inode);
2831 struct ptlrpc_request *req;
2832 struct obd_capa *oc;
2836 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2837 PFID(ll_inode2fid(inode)), inode);
2838 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2840 #ifdef HAVE_FILE_FSYNC_4ARGS
/* 4-arg kernels call us without i_mutex; write+wait, then take it. */
2841 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2842 mutex_lock(&inode->i_mutex);
2844 /* fsync's caller has already called _fdata{sync,write}, we want
2845 * that IO to finish before calling the osc and mdc sync methods */
2846 rc = filemap_fdatawait(inode->i_mapping);
2849 /* catch async errors that were recorded back when async writeback
2850 * failed for pages in this mapping. */
2851 if (!S_ISDIR(inode->i_mode)) {
2852 err = lli->lli_async_rc;
2853 lli->lli_async_rc = 0;
2856 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* Metadata sync via the MDT (capability released by callee/finish path). */
2861 oc = ll_mdscapa_get(inode);
2862 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2868 ptlrpc_req_finished(req);
2870 if (S_ISREG(inode->i_mode)) {
2871 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2873 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2874 if (rc == 0 && err < 0)
/* Track per-fd write status so ll_flush() reports each failure once. */
2877 fd->fd_write_failed = true;
2879 fd->fd_write_failed = false;
2882 #ifdef HAVE_FILE_FSYNC_4ARGS
2883 mutex_unlock(&inode->i_mutex);
/*
 * flock()/fcntl() lock handler.  Translates a kernel struct file_lock
 * into an LDLM_FLOCK enqueue against the MDT, then mirrors the result
 * into the local lock tables (flock_lock_file_wait / posix_lock_file_wait)
 * so the VFS sees a consistent state.  If the local update fails, the
 * server-side lock is rolled back with an LCK_NL (unlock) enqueue.
 */
2889 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2891 struct inode *inode = file->f_dentry->d_inode;
2892 struct ll_sb_info *sbi = ll_i2sbi(inode);
2893 struct ldlm_enqueue_info einfo = {
2894 .ei_type = LDLM_FLOCK,
2895 .ei_cb_cp = ldlm_flock_completion_ast,
2896 .ei_cbdata = file_lock,
2898 struct md_op_data *op_data;
2899 struct lustre_handle lockh = {0};
2900 ldlm_policy_data_t flock = {{0}};
/* Remember the caller's type: einfo.ei_mode temporarily overwrites it below. */
2901 int fl_type = file_lock->fl_type;
2907 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
2908 PFID(ll_inode2fid(inode)), file_lock);
2910 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2912 if (file_lock->fl_flags & FL_FLOCK) {
2913 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2914 /* flocks are whole-file locks */
2915 flock.l_flock.end = OFFSET_MAX;
2916 /* For flocks owner is determined by the local file descriptor */
2917 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2918 } else if (file_lock->fl_flags & FL_POSIX) {
2919 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2920 flock.l_flock.start = file_lock->fl_start;
2921 flock.l_flock.end = file_lock->fl_end;
2925 flock.l_flock.pid = file_lock->fl_pid;
2927 /* Somewhat ugly workaround for svc lockd.
2928 * lockd installs custom fl_lmops->lm_compare_owner that checks
2929 * for the fl_owner to be the same (which it always is on local node
2930 * I guess between lockd processes) and then compares pid.
2931 * As such we assign pid to the owner field to make it all work,
2932 * conflict with normal locks is unlikely since pid space and
2933 * pointer space for current->files are not intersecting */
2934 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2935 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* Map F_RDLCK -> PR, F_UNLCK -> NL, F_WRLCK -> PW (switch lines elided). */
2939 einfo.ei_mode = LCK_PR;
2942 /* An unlock request may or may not have any relation to
2943 * existing locks so we may not be able to pass a lock handle
2944 * via a normal ldlm_lock_cancel() request. The request may even
2945 * unlock a byte range in the middle of an existing lock. In
2946 * order to process an unlock request we need all of the same
2947 * information that is given with a normal read or write record
2948 * lock request. To avoid creating another ldlm unlock (cancel)
2949 * message we'll treat a LCK_NL flock request as an unlock. */
2950 einfo.ei_mode = LCK_NL;
2953 einfo.ei_mode = LCK_PW;
2956 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* Non-blocking set -> BLOCK_NOWAIT; GETLK -> TEST_LOCK (probe only). */
2971 flags = LDLM_FL_BLOCK_NOWAIT;
2977 flags = LDLM_FL_TEST_LOCK;
2980 CERROR("unknown fcntl lock command: %d\n", cmd);
2984 /* Save the old mode so that if the mode in the lock changes we
2985 * can decrement the appropriate reader or writer refcount. */
2986 file_lock->fl_type = einfo.ei_mode;
2988 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2989 LUSTRE_OPC_ANY, NULL);
2990 if (IS_ERR(op_data))
2991 RETURN(PTR_ERR(op_data));
2993 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
2994 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
2995 flock.l_flock.pid, flags, einfo.ei_mode,
2996 flock.l_flock.start, flock.l_flock.end);
/* Server-side enqueue; for GETLK the answer comes back in *file_lock. */
2998 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data, &lockh,
3001 /* Restore the file lock type if not TEST lock. */
3002 if (!(flags & LDLM_FL_TEST_LOCK))
3003 file_lock->fl_type = fl_type;
3005 if ((file_lock->fl_flags & FL_FLOCK) &&
3006 (rc == 0 || file_lock->fl_type == F_UNLCK))
3007 rc2 = flock_lock_file_wait(file, file_lock);
3008 if ((file_lock->fl_flags & FL_POSIX) &&
3009 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3010 !(flags & LDLM_FL_TEST_LOCK))
3011 rc2 = posix_lock_file_wait(file, file_lock);
/* Local bookkeeping failed: undo the server lock with an NL enqueue. */
3013 if (rc2 && file_lock->fl_type != F_UNLCK) {
3014 einfo.ei_mode = LCK_NL;
3015 md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data,
3020 ll_finish_md_op_data(op_data);
/*
 * Look up the FID of @name (length @namelen) under directory @parent via
 * an MDS getattr-by-name RPC.  On success *fid is filled from the reply's
 * mdt_body.  Returns 0 or negative errno.
 */
3025 int ll_get_fid_by_name(struct inode *parent, const char *name,
3026 int namelen, struct lu_fid *fid)
3028 struct md_op_data *op_data = NULL;
3029 struct mdt_body *body;
3030 struct ptlrpc_request *req;
3034 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3035 LUSTRE_OPC_ANY, NULL);
3036 if (IS_ERR(op_data))
3037 RETURN(PTR_ERR(op_data));
/* Only the FID is needed from the getattr reply. */
3039 op_data->op_valid = OBD_MD_FLID;
3040 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3041 ll_finish_md_op_data(op_data);
3045 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3047 GOTO(out_req, rc = -EFAULT);
3049 *fid = body->mbo_fid1;
3051 ptlrpc_req_finished(req);
/*
 * Migrate entry @name under @parent to MDT @mdtidx (DNE striped-dir
 * rebalancing).  Resolves the child FID (from the dcache if possible,
 * else by name lookup RPC), skips the rename if the object already lives
 * on the target MDT, then issues a CLI_MIGRATE md_rename.  Local aliases
 * of the child are invalidated and its nlink cleared so the inode is
 * re-fetched after migration.
 */
3055 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3056 const char *name, int namelen)
3058 struct dentry *dchild = NULL;
3059 struct inode *child_inode = NULL;
3060 struct md_op_data *op_data;
3061 struct ptlrpc_request *request = NULL;
3066 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3067 name, PFID(ll_inode2fid(parent)), mdtidx);
3069 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3070 0, LUSTRE_OPC_ANY, NULL);
3071 if (IS_ERR(op_data))
3072 RETURN(PTR_ERR(op_data));
3074 /* Get child FID first */
3075 qstr.hash = full_name_hash(name, namelen);
3078 dchild = d_lookup(file->f_dentry, &qstr);
3079 if (dchild != NULL && dchild->d_inode != NULL) {
3080 op_data->op_fid3 = *ll_inode2fid(dchild->d_inode);
/* NOTE(review): this inner d_inode check is redundant — the outer
 * condition above already guarantees dchild->d_inode != NULL. */
3081 if (dchild->d_inode != NULL) {
3082 child_inode = igrab(dchild->d_inode);
3083 ll_invalidate_aliases(child_inode);
/* Not in dcache: ask the MDS for the FID by name. */
3087 rc = ll_get_fid_by_name(parent, name, namelen,
3093 if (!fid_is_sane(&op_data->op_fid3)) {
3094 CERROR("%s: migrate %s , but fid "DFID" is insane\n",
3095 ll_get_fsname(parent->i_sb, NULL, 0), name,
3096 PFID(&op_data->op_fid3));
3097 GOTO(out_free, rc = -EINVAL);
/* Already on the target MDT? Then there is nothing to migrate. */
3100 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3105 CDEBUG(D_INFO, "%s:"DFID" is already on MDT%d.\n", name,
3106 PFID(&op_data->op_fid3), mdtidx);
3107 GOTO(out_free, rc = 0);
3110 op_data->op_mds = mdtidx;
/* Migration is implemented server-side as a special same-name rename. */
3111 op_data->op_cli_flags = CLI_MIGRATE;
3112 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3113 namelen, name, namelen, &request);
3115 ll_update_times(request, parent);
3117 ptlrpc_req_finished(request);
/* Drop the stale local inode; it will be re-instantiated on next lookup. */
3122 if (child_inode != NULL) {
3123 clear_nlink(child_inode);
3127 ll_finish_md_op_data(op_data);
/*
 * .flock/.lock handler for "-o noflock" mounts; presumably returns
 * -ENOSYS per the table comment below — body not visible in this extract.
 */
3132 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3140 * test if some locks matching bits and l_req_mode are acquired
3141 * - bits can be in different locks
3142 * - if found clear the common lock bits in *bits
3143 * - the bits not found, are kept in *bits
3145 * \param bits [IN] searched lock bits [IN]
3146 * \param l_req_mode [IN] searched lock mode
3147 * \retval boolean, true iff all bits are found
/* LCK_MINMODE means "any mode": expand to the union of CR/CW/PR/PW. */
3149 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
3151 struct lustre_handle lockh;
3152 ldlm_policy_data_t policy;
3153 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
3154 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
3163 fid = &ll_i2info(inode)->lli_fid;
3164 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3165 ldlm_lockname[mode]);
/* TEST_LOCK: probe for a matching granted lock without taking a ref. */
3167 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* Probe one inodebit at a time; stop early once all bits are matched. */
3168 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3169 policy.l_inodebits.bits = *bits & (1 << i);
3170 if (policy.l_inodebits.bits == 0)
3173 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3174 &policy, mode, &lockh)) {
3175 struct ldlm_lock *lock;
3177 lock = ldlm_handle2lock(&lockh);
/* A matched lock may carry more bits than probed; clear them all. */
3180 ~(lock->l_policy_data.l_inodebits.bits);
3181 LDLM_LOCK_PUT(lock);
3183 *bits &= ~policy.l_inodebits.bits;
/*
 * Match (and reference) a granted MD lock on @inode covering inodebits
 * @bits in one of the modes in @mode; the matched handle is returned in
 * *lockh.  Returns the matched mode, or 0 if no lock matched.
 */
3190 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
3191 struct lustre_handle *lockh, __u64 flags,
3194 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
3199 fid = &ll_i2info(inode)->lli_fid;
3200 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3202 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3203 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Common tail for revalidation: translate the RPC result.  -ENOENT on a
 * non-regular, non-directory inode is treated as "already unlinked" and
 * squashed to success; other errors are logged (quietly for expected
 * EACCES/EIDRM) and propagated.
 */
3208 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3210 /* Already unlinked. Just update nlink and return success */
3211 if (rc == -ENOENT) {
3213 /* This path cannot be hit for regular files unless in
3214 * case of obscure races, so no need to to validate
3216 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3218 } else if (rc != 0) {
3219 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3220 "%s: revalidate FID "DFID" error: rc = %d\n",
3221 ll_get_fsname(inode->i_sb, NULL, 0),
3222 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate the MD attributes of @dentry's inode for the lock bits in
 * @ibits.  Two strategies: with OBD_CONNECT_ATTRFID, run a getattr/lookup
 * intent by FID (also refreshing the dcache entry); otherwise, if no
 * matching MD lock is cached, do a plain md_getattr and rebuild the inode
 * from the reply.  Returns 0 or negative errno.
 */
3228 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3230 struct inode *inode = dentry->d_inode;
3231 struct ptlrpc_request *req = NULL;
3232 struct obd_export *exp;
3236 LASSERT(inode != NULL);
3238 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3239 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3241 exp = ll_i2mdexp(inode);
3243 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3244 * But under CMD case, it caused some lock issues, should be fixed
3245 * with new CMD ibits lock. See bug 12718 */
3246 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3247 struct lookup_intent oit = { .it_op = IT_GETATTR };
3248 struct md_op_data *op_data;
/* Pure LOOKUP bit: a lighter IT_LOOKUP intent is sufficient. */
3250 if (ibits == MDS_INODELOCK_LOOKUP)
3251 oit.it_op = IT_LOOKUP;
3253 /* Call getattr by fid, so do not provide name at all. */
3254 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3255 dentry->d_inode, NULL, 0, 0,
3256 LUSTRE_OPC_ANY, NULL);
3257 if (IS_ERR(op_data))
3258 RETURN(PTR_ERR(op_data));
3260 oit.it_create_mode |= M_CHECK_STALE;
3261 rc = md_intent_lock(exp, op_data, &oit, &req,
3262 &ll_md_blocking_ast, 0);
3263 ll_finish_md_op_data(op_data);
3264 oit.it_create_mode &= ~M_CHECK_STALE;
3266 rc = ll_inode_revalidate_fini(inode, rc);
3270 rc = ll_revalidate_it_finish(req, &oit, dentry);
3272 ll_intent_release(&oit);
3276 /* Unlinked? Unhash dentry, so it is not picked up later by
3277 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3278 here to preserve get_cwd functionality on 2.6.
3280 if (!dentry->d_inode->i_nlink)
3281 d_lustre_invalidate(dentry, 0);
3283 ll_lookup_finish_locks(&oit, dentry);
/* No ATTRFID: only hit the wire if no cached MD lock covers @ibits. */
3284 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3285 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3286 obd_valid valid = OBD_MD_FLGETATTR;
3287 struct md_op_data *op_data;
/* Regular files also need striping EA; size the reply buffer for it. */
3290 if (S_ISREG(inode->i_mode)) {
3291 rc = ll_get_default_mdsize(sbi, &ealen);
3294 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3297 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3298 0, ealen, LUSTRE_OPC_ANY,
3300 if (IS_ERR(op_data))
3301 RETURN(PTR_ERR(op_data));
3303 op_data->op_valid = valid;
3304 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
3305 * capa for this inode. Because we only keep capas of dirs
3307 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3308 ll_finish_md_op_data(op_data);
3310 rc = ll_inode_revalidate_fini(inode, rc);
3314 rc = ll_prep_inode(&inode, req, NULL, NULL);
3317 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge the per-stripe attributes from all MDTs
 * (via md_merge_attr) into the inode's cached size/nlink and a/m/ctime
 * LVB fields.  Caller must ensure lli_lsm_md is set.
 */
3321 static int ll_merge_md_attr(struct inode *inode)
3323 struct cl_attr attr = { 0 };
3326 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3327 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3332 ll_i2info(inode)->lli_stripe_dir_size = attr.cat_size;
3333 ll_i2info(inode)->lli_stripe_dir_nlink = attr.cat_nlink;
3335 ll_i2info(inode)->lli_lvb.lvb_atime = attr.cat_atime;
3336 ll_i2info(inode)->lli_lvb.lvb_mtime = attr.cat_mtime;
3337 ll_i2info(inode)->lli_lvb.lvb_ctime = attr.cat_ctime;
/*
 * Full revalidation: refresh MD attributes via __ll_inode_revalidate(),
 * then for non-regular objects copy cached LVB timestamps into the inode
 * (merging striped-dir attrs first), and for regular files glimpse the
 * size from the OSTs unless a restore is in progress.
 */
3343 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3345 struct inode *inode = dentry->d_inode;
3349 rc = __ll_inode_revalidate(dentry, ibits);
3353 /* if object isn't regular file, don't validate size */
3354 if (!S_ISREG(inode->i_mode)) {
3355 if (S_ISDIR(inode->i_mode) &&
3356 ll_i2info(inode)->lli_lsm_md != NULL) {
3357 rc = ll_merge_md_attr(inode);
3362 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
3363 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
3364 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
3366 /* In case of restore, the MDT has the right size and has
3367 * already send it back without granting the layout lock,
3368 * inode is up-to-date so glimpse is useless.
3369 * Also to glimpse we need the layout, in case of a running
3370 * restore the MDT holds the layout lock so the glimpse will
3371 * block up to the end of restore (getattr will block)
3373 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3374 rc = ll_glimpse_size(inode);
/*
 * getattr(2) handler: revalidate UPDATE|LOOKUP bits, then fill *stat
 * from the (now fresh) inode.  Striped directories report the merged
 * cross-MDT nlink/size cached by ll_merge_md_attr().
 */
3379 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3381 struct inode *inode = de->d_inode;
3382 struct ll_sb_info *sbi = ll_i2sbi(inode);
3383 struct ll_inode_info *lli = ll_i2info(inode);
3386 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3387 MDS_INODELOCK_LOOKUP);
3388 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3393 stat->dev = inode->i_sb->s_dev;
/* 32-bit userland gets a squashed ino derived from the FID. */
3394 if (ll_need_32bit_api(sbi))
3395 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3397 stat->ino = inode->i_ino;
3398 stat->mode = inode->i_mode;
3399 stat->uid = inode->i_uid;
3400 stat->gid = inode->i_gid;
3401 stat->rdev = inode->i_rdev;
3402 stat->atime = inode->i_atime;
3403 stat->mtime = inode->i_mtime;
3404 stat->ctime = inode->i_ctime;
3405 stat->blksize = 1 << inode->i_blkbits;
3406 stat->blocks = inode->i_blocks;
3408 if (S_ISDIR(inode->i_mode) &&
3409 ll_i2info(inode)->lli_lsm_md != NULL) {
3410 stat->nlink = lli->lli_stripe_dir_nlink;
3411 stat->size = lli->lli_stripe_dir_size;
3413 stat->nlink = inode->i_nlink;
3414 stat->size = i_size_read(inode);
/*
 * fiemap inode operation: marshal the kernel's fiemap_extent_info into a
 * Lustre ll_user_fiemap (header + extent array), run ll_do_fiemap(), and
 * copy flags/extent results back.  The buffer is sized for
 * fi_extents_max extents and allocated with OBD_ALLOC_LARGE.
 */
3420 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3421 __u64 start, __u64 len)
3425 struct ll_user_fiemap *fiemap;
3426 unsigned int extent_count = fieinfo->fi_extents_max;
3428 num_bytes = sizeof(*fiemap) + (extent_count *
3429 sizeof(struct ll_fiemap_extent));
3430 OBD_ALLOC_LARGE(fiemap, num_bytes);
3435 fiemap->fm_flags = fieinfo->fi_flags;
3436 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3437 fiemap->fm_start = start;
3438 fiemap->fm_length = len;
/* Only the first extent is seeded in (continuation cookie for FIEMAP);
 * note this intentionally copies a single struct, not the whole array. */
3439 if (extent_count > 0)
3440 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3441 sizeof(struct ll_fiemap_extent));
3443 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3445 fieinfo->fi_flags = fiemap->fm_flags;
3446 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3447 if (extent_count > 0)
3448 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3449 fiemap->fm_mapped_extents *
3450 sizeof(struct ll_fiemap_extent));
3452 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * get_acl inode operation: return a referenced copy of the cached POSIX
 * ACL under lli_lock.  @type is unused here — only the cached access ACL
 * is duplicated.
 */
3456 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3458 struct ll_inode_info *lli = ll_i2info(inode);
3459 struct posix_acl *acl = NULL;
3462 spin_lock(&lli->lli_lock);
3463 /* VFS' acl_permission_check->check_acl will release the refcount */
3464 acl = posix_acl_dup(lli->lli_posix_acl);
3465 spin_unlock(&lli->lli_lock);
3470 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
/*
 * check_acl callback for generic_permission() on kernels whose
 * generic_permission takes more than two args.  Without CONFIG_FS_POSIX_ACL
 * it presumably degenerates to a fixed return (lines not visible).  RCU
 * walk cannot sleep, so it is bounced with IPERM_FLAG_RCU.
 */
3472 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3473 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3475 ll_check_acl(struct inode *inode, int mask)
3478 # ifdef CONFIG_FS_POSIX_ACL
3479 struct posix_acl *acl;
3483 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3484 if (flags & IPERM_FLAG_RCU)
3487 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3492 rc = posix_acl_permission(inode, acl, mask);
3493 posix_acl_release(acl);
3496 # else /* !CONFIG_FS_POSIX_ACL */
3498 # endif /* CONFIG_FS_POSIX_ACL */
3500 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * permission inode operation (three kernel-ABI variants).  Flow:
 * bounce RCU-walk lookups, revalidate the root inode on first access,
 * apply root-squash by temporarily overriding fsuid/fsgid and dropping
 * FS capabilities, then delegate to remote-perm check or
 * generic_permission with ll_check_acl.  Credentials are restored before
 * return.
 */
3502 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3503 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3505 # ifdef HAVE_INODE_PERMISION_2ARGS
3506 int ll_inode_permission(struct inode *inode, int mask)
3508 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3513 struct ll_sb_info *sbi;
3514 struct root_squash_info *squash;
3515 struct cred *cred = NULL;
3516 const struct cred *old_cred = NULL;
3518 bool squash_id = false;
/* This handler may block (RPCs); refuse RCU-walk and let VFS retry. */
3521 #ifdef MAY_NOT_BLOCK
3522 if (mask & MAY_NOT_BLOCK)
3524 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3525 if (flags & IPERM_FLAG_RCU)
3529 /* as root inode are NOT getting validated in lookup operation,
3530 * need to do it before permission check. */
3532 if (inode == inode->i_sb->s_root->d_inode) {
3533 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3534 MDS_INODELOCK_LOOKUP);
3539 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3540 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3542 /* squash fsuid/fsgid if needed */
3543 sbi = ll_i2sbi(inode);
3544 squash = &sbi->ll_squash;
/* Squash only when configured, caller is root, and nosquash not set. */
3545 if (unlikely(squash->rsi_uid != 0 &&
3546 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
3547 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3551 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3552 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3553 squash->rsi_uid, squash->rsi_gid);
3555 /* update current process's credentials
3556 * and FS capability */
3557 cred = prepare_creds();
3561 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
3562 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* Drop every filesystem-related capability from the squashed creds. */
3563 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3564 if ((1 << cap) & CFS_CAP_FS_MASK)
3565 cap_lower(cred->cap_effective, cap);
3567 old_cred = override_creds(cred);
3570 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3572 if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
3573 rc = lustre_check_remote_perm(inode, mask);
3575 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3577 /* restore current process's credentials and FS capability */
3579 revert_creds(old_cred);
3586 /* -o localflock - only provides locally consistent flock locks */
/* Default file_operations: no .flock/.lock, so the VFS falls back to
 * purely local (single-client) lock semantics. */
3587 struct file_operations ll_file_operations = {
3588 .read = ll_file_read,
3589 .aio_read = ll_file_aio_read,
3590 .write = ll_file_write,
3591 .aio_write = ll_file_aio_write,
3592 .unlocked_ioctl = ll_file_ioctl,
3593 .open = ll_file_open,
3594 .release = ll_file_release,
3595 .mmap = ll_file_mmap,
3596 .llseek = ll_file_seek,
3597 .splice_read = ll_file_splice_read,
/* file_operations for "-o flock": cluster-coherent BSD flock and POSIX
 * fcntl locks, both routed through ll_file_flock (LDLM_FLOCK enqueue). */
3602 struct file_operations ll_file_operations_flock = {
3603 .read = ll_file_read,
3604 .aio_read = ll_file_aio_read,
3605 .write = ll_file_write,
3606 .aio_write = ll_file_aio_write,
3607 .unlocked_ioctl = ll_file_ioctl,
3608 .open = ll_file_open,
3609 .release = ll_file_release,
3610 .mmap = ll_file_mmap,
3611 .llseek = ll_file_seek,
3612 .splice_read = ll_file_splice_read,
3615 .flock = ll_file_flock,
3616 .lock = ll_file_flock
3619 /* These are for -o noflock - to return ENOSYS on flock calls */
3620 struct file_operations ll_file_operations_noflock = {
3621 .read = ll_file_read,
3622 .aio_read = ll_file_aio_read,
3623 .write = ll_file_write,
3624 .aio_write = ll_file_aio_write,
3625 .unlocked_ioctl = ll_file_ioctl,
3626 .open = ll_file_open,
3627 .release = ll_file_release,
3628 .mmap = ll_file_mmap,
3629 .llseek = ll_file_seek,
3630 .splice_read = ll_file_splice_read,
3633 .flock = ll_file_noflock,
3634 .lock = ll_file_noflock
/* inode_operations shared by all regular Lustre files. */
3637 struct inode_operations ll_file_inode_operations = {
3638 .setattr = ll_setattr,
3639 .getattr = ll_getattr,
3640 .permission = ll_inode_permission,
3641 .setxattr = ll_setxattr,
3642 .getxattr = ll_getxattr,
3643 .listxattr = ll_listxattr,
3644 .removexattr = ll_removexattr,
3645 .fiemap = ll_fiemap,
3646 #ifdef HAVE_IOP_GET_ACL
3647 .get_acl = ll_get_acl,
3651 /* dynamic ioctl number support routines */
/* Global registry of dynamically-registered ioctl handlers: an rwsem-
 * protected list of llioc_data entries, each carrying a callback and the
 * array of ioctl command numbers it services (flexible trailing array). */
3652 static struct llioc_ctl_data {
3653 struct rw_semaphore ioc_sem;
3654 struct list_head ioc_head;
3656 __RWSEM_INITIALIZER(llioc.ioc_sem),
3657 LIST_HEAD_INIT(llioc.ioc_head)
3662 struct list_head iocd_list;
/* iocd_size: total allocation size, kept so OBD_FREE can match it. */
3663 unsigned int iocd_size;
3664 llioc_callback_t iocd_cb;
3665 unsigned int iocd_count;
3666 unsigned int iocd_cmd[0];
/*
 * Register a dynamic ioctl handler: @cb services the @count commands in
 * @cmd[].  Returns an opaque cookie (the allocation itself) for later
 * ll_iocontrol_unregister(), or NULL on bad args / allocation failure.
 */
3669 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3672 struct llioc_data *in_data = NULL;
3675 if (cb == NULL || cmd == NULL ||
3676 count > LLIOC_MAX_CMD || count < 0)
3679 size = sizeof(*in_data) + count * sizeof(unsigned int);
3680 OBD_ALLOC(in_data, size);
3681 if (in_data == NULL)
3684 memset(in_data, 0, sizeof(*in_data));
3685 in_data->iocd_size = size;
3686 in_data->iocd_cb = cb;
3687 in_data->iocd_count = count;
3688 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
/* Publish under the write lock of the global registry. */
3690 down_write(&llioc.ioc_sem);
3691 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3692 up_write(&llioc.ioc_sem);
/*
 * Unregister a handler by the cookie returned from
 * ll_iocontrol_register(); warns if the cookie is not found.
 */
3697 void ll_iocontrol_unregister(void *magic)
3699 struct llioc_data *tmp;
3704 down_write(&llioc.ioc_sem);
3705 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* Save the size before freeing: OBD_FREE needs the original length. */
3707 unsigned int size = tmp->iocd_size;
3709 list_del(&tmp->iocd_list);
3710 up_write(&llioc.ioc_sem);
3712 OBD_FREE(tmp, size);
3716 up_write(&llioc.ioc_sem);
3718 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3721 EXPORT_SYMBOL(ll_iocontrol_register);
3722 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch @cmd to the registered dynamic ioctl handlers in registration
 * order; the first callback returning LLIOC_STOP terminates the scan and
 * its *rcp result stands.  Returns the last callback verdict (LLIOC_CONT
 * if nothing claimed the command).
 */
3724 static enum llioc_iter
3725 ll_iocontrol_call(struct inode *inode, struct file *file,
3726 unsigned int cmd, unsigned long arg, int *rcp)
3728 enum llioc_iter ret = LLIOC_CONT;
3729 struct llioc_data *data;
3730 int rc = -EINVAL, i;
3732 down_read(&llioc.ioc_sem);
3733 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3734 for (i = 0; i < data->iocd_count; i++) {
3735 if (cmd != data->iocd_cmd[i])
3738 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3742 if (ret == LLIOC_STOP)
3745 up_read(&llioc.ioc_sem);
/*
 * Push a layout configuration @conf into the cl_object stack of @inode
 * via cl_conf_set().  For OBJECT_CONF_SET with a layout lock, the lock is
 * re-enabled for matching only after the layout is applied, and the
 * client's cached layout generation is updated from the new LSM.
 */
3752 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3754 struct ll_inode_info *lli = ll_i2info(inode);
3755 struct cl_env_nest nest;
/* No cl_object yet (e.g. special files): nothing to configure. */
3760 if (lli->lli_clob == NULL)
3763 env = cl_env_nested_get(&nest);
3765 RETURN(PTR_ERR(env));
3767 result = cl_conf_set(env, lli->lli_clob, conf);
3768 cl_env_nested_put(&nest, env);
3770 if (conf->coc_opc == OBJECT_CONF_SET) {
3771 struct ldlm_lock *lock = conf->coc_lock;
3773 LASSERT(lock != NULL);
3774 LASSERT(ldlm_has_layout(lock));
3776 struct lustre_md *md = conf->u.coc_md;
3777 __u32 gen = LL_LAYOUT_GEN_EMPTY;
3779 /* it can only be allowed to match after layout is
3780 * applied to inode otherwise false layout would be
3781 * seen. Applying layout shoud happen before dropping
3782 * the intent lock. */
3783 ldlm_lock_allow_match(lock);
3785 lli->lli_has_smd = lsm_has_objects(md->lsm);
3786 if (md->lsm != NULL)
3787 gen = md->lsm->lsm_layout_gen;
3790 DFID ": layout version change: %u -> %u\n",
3791 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3793 ll_layout_version_set(lli, gen);
3799 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/*
 * If @lock's LVB does not already carry a ready layout, fetch the LOV EA
 * (trusted.lov) from the MDT via md_getxattr and install a private copy
 * as the lock's l_lvb_data/l_lvb_len (replacing any stale buffer under
 * the resource lock).  Returns 0 or negative errno.
 */
3800 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3803 struct ll_sb_info *sbi = ll_i2sbi(inode);
3804 struct obd_capa *oc;
3805 struct ptlrpc_request *req;
3806 struct mdt_body *body;
3813 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3814 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
3815 lock->l_lvb_data, lock->l_lvb_len);
/* Fast path: layout already delivered in the lock's LVB. */
3817 if ((lock->l_lvb_data != NULL) && ldlm_is_lvb_ready(lock))
3820 /* if layout lock was granted right away, the layout is returned
3821 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3822 * blocked and then granted via completion ast, we have to fetch
3823 * layout here. Please note that we can't use the LVB buffer in
3824 * completion AST because it doesn't have a large enough buffer */
3825 oc = ll_mdscapa_get(inode);
3826 rc = ll_get_default_mdsize(sbi, &lmmsize);
3828 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3829 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3835 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3837 GOTO(out, rc = -EPROTO);
3839 lmmsize = body->mbo_eadatasize;
3840 if (lmmsize == 0) /* empty layout */
3843 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3845 GOTO(out, rc = -EFAULT);
/* Copy into a buffer owned by the lock; the reply buffer is transient. */
3847 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3848 if (lvbdata == NULL)
3849 GOTO(out, rc = -ENOMEM);
3851 memcpy(lvbdata, lmm, lmmsize);
3852 lock_res_and_lock(lock);
3853 if (lock->l_lvb_data != NULL)
3854 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3856 lock->l_lvb_data = lvbdata;
3857 lock->l_lvb_len = lmmsize;
3858 unlock_res_and_lock(lock);
3863 ptlrpc_req_finished(req);
3868 * Apply the layout to the inode. Layout lock is held and will be released
/*
 * Given a held layout lock (@lockh/@mode): fetch the layout into the
 * lock's LVB if needed, unpack it into an LSM, configure the cl_object
 * via ll_layout_conf(OBJECT_CONF_SET), and report the resulting layout
 * generation in *gen.  If the object is still busy with IO under the old
 * layout (-EBUSY), wait for it via OBJECT_CONF_WAIT after releasing the
 * lock.  @reconf selects whether an already-ready LVB short-circuits.
 */
3871 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3872 struct inode *inode, __u32 *gen, bool reconf)
3874 struct ll_inode_info *lli = ll_i2info(inode);
3875 struct ll_sb_info *sbi = ll_i2sbi(inode);
3876 struct ldlm_lock *lock;
3877 struct lustre_md md = { NULL };
3878 struct cl_object_conf conf;
3881 bool wait_layout = false;
3884 LASSERT(lustre_handle_is_used(lockh));
3886 lock = ldlm_handle2lock(lockh);
3887 LASSERT(lock != NULL);
3888 LASSERT(ldlm_has_layout(lock));
3890 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured: %d\n",
3891 PFID(&lli->lli_fid), inode, reconf);
3893 /* in case this is a caching lock and reinstate with new inode */
3894 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3896 lock_res_and_lock(lock);
3897 lvb_ready = ldlm_is_lvb_ready(lock);
3898 unlock_res_and_lock(lock);
3899 /* checking lvb_ready is racy but this is okay. The worst case is
3900 * that multi processes may configure the file on the same time. */
3902 if (lvb_ready || !reconf) {
3905 /* layout_gen must be valid if layout lock is not
3906 * cancelled and stripe has already set */
3907 *gen = ll_layout_version_get(lli);
3913 rc = ll_layout_fetch(inode, lock);
3917 /* for layout lock, lmm is returned in lock's lvb.
3918 * lvb_data is immutable if the lock is held so it's safe to access it
3919 * without res lock. See the description in ldlm_lock_decref_internal()
3920 * for the condition to free lvb_data of layout lock */
3921 if (lock->l_lvb_data != NULL) {
3922 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3923 lock->l_lvb_data, lock->l_lvb_len);
/* Empty LVB -> file has no striping yet; report an empty generation. */
3925 *gen = LL_LAYOUT_GEN_EMPTY;
3927 *gen = md.lsm->lsm_layout_gen;
3930 CERROR("%s: file "DFID" unpackmd error: %d\n",
3931 ll_get_fsname(inode->i_sb, NULL, 0),
3932 PFID(&lli->lli_fid), rc);
3938 /* set layout to file. Unlikely this will fail as old layout was
3939 * surely eliminated */
3940 memset(&conf, 0, sizeof conf);
3941 conf.coc_opc = OBJECT_CONF_SET;
3942 conf.coc_inode = inode;
3943 conf.coc_lock = lock;
3944 conf.u.coc_md = &md;
3945 rc = ll_layout_conf(inode, &conf);
3948 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3950 /* refresh layout failed, need to wait */
3951 wait_layout = rc == -EBUSY;
/* Drop our lock references before any (blocking) wait below. */
3955 LDLM_LOCK_PUT(lock);
3956 ldlm_lock_decref(lockh, mode);
3958 /* wait for IO to complete if it's still being used. */
3960 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
3961 ll_get_fsname(inode->i_sb, NULL, 0),
3962 PFID(&lli->lli_fid), inode);
3964 memset(&conf, 0, sizeof conf);
3965 conf.coc_opc = OBJECT_CONF_WAIT;
3966 conf.coc_inode = inode;
3967 rc = ll_layout_conf(inode, &conf);
3971 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
3972 ll_get_fsname(inode->i_sb, NULL, 0),
3973 PFID(&lli->lli_fid), rc);
3979 * This function checks if there exists a LAYOUT lock on the client side,
3980 * or enqueues it if it doesn't have one in cache.
3982 * This function will not hold the layout lock, so it may be revoked any time after
3983 * this function returns. Any operations that depend on the layout should be redone.
3986 * This function should be called before lov_io_init() to get an uptodate
3987 * layout version, the caller should save the version number and after IO
3988 * is finished, this function should be called again to verify that layout
3989 * is not changed during IO time.
/* Fetch an up-to-date layout generation for @inode into *gen, enqueuing a
 * LAYOUT DLM lock from the MDS when no usable lock is cached locally.
 * NOTE(review): this listing appears to have dropped lines (e.g. the
 * declarations of "mode" and "rc", several braces and RETURN statements,
 * and the -EAGAIN retry path) -- verify the flow below against the
 * complete source before relying on it. */
3991 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3993 struct ll_inode_info *lli = ll_i2info(inode);
3994 struct ll_sb_info *sbi = ll_i2sbi(inode);
3995 struct md_op_data *op_data;
3996 struct lookup_intent it;
3997 struct lustre_handle lockh;
/* enqueue parameters: inode-bits lock with the standard llite blocking
 * and completion callbacks */
3999 struct ldlm_enqueue_info einfo = {
4000 .ei_type = LDLM_IBITS,
4002 .ei_cb_bl = &ll_md_blocking_ast,
4003 .ei_cb_cp = &ldlm_completion_ast,
/* fast path: layout locks disabled for this mount, or a valid layout
 * generation is already cached -- nothing to refresh */
4008 *gen = ll_layout_version_get(lli);
4009 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
/* layout locks only apply to regular files with sane FIDs */
4013 LASSERT(fid_is_sane(ll_inode2fid(inode)));
4014 LASSERT(S_ISREG(inode->i_mode));
4016 /* take layout lock mutex to enqueue layout lock exclusively. */
4017 mutex_lock(&lli->lli_layout_mutex);
4020 /* mostly layout lock is caching on the local side, so try to match
4021 * it before grabbing layout lock mutex. */
4022 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4023 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4024 if (mode != 0) { /* hit cached lock */
4025 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
4029 mutex_unlock(&lli->lli_layout_mutex);
/* no cached lock: build an MD op to enqueue a fresh layout lock */
4033 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4034 0, 0, LUSTRE_OPC_ANY, NULL);
4035 if (IS_ERR(op_data)) {
/* must drop the mutex on the error path too */
4036 mutex_unlock(&lli->lli_layout_mutex);
4037 RETURN(PTR_ERR(op_data));
4040 /* have to enqueue one */
4041 memset(&it, 0, sizeof(it));
4042 it.it_op = IT_LAYOUT;
4043 lockh.cookie = 0ULL;
4045 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)\n",
4046 ll_get_fsname(inode->i_sb, NULL, 0),
4047 PFID(&lli->lli_fid), inode);
/* enqueue via the MD export; the intent's request reference is released
 * here because only the lock itself is needed from now on */
4049 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, &it, op_data, &lockh, 0);
4050 if (it.d.lustre.it_data != NULL)
4051 ptlrpc_req_finished(it.d.lustre.it_data);
4052 it.d.lustre.it_data = NULL;
4054 ll_finish_md_op_data(op_data);
/* take over the lock reference from the intent so ll_intent_drop_lock()
 * below does not release it; presumably ll_layout_lock_set() consumes
 * it -- TODO(review) confirm against ll_layout_lock_set */
4056 mode = it.d.lustre.it_lock_mode;
4057 it.d.lustre.it_lock_mode = 0;
4058 ll_intent_drop_lock(&it);
4061 /* set lock data in case this is a new lock */
4062 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4063 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
4067 mutex_unlock(&lli->lli_layout_mutex);
4073 * This function sends a restore request to the MDT
4075 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4077 struct hsm_user_request *hur;
4081 len = sizeof(struct hsm_user_request) +
4082 sizeof(struct hsm_user_item);
4083 OBD_ALLOC(hur, len);
4087 hur->hur_request.hr_action = HUA_RESTORE;
4088 hur->hur_request.hr_archive_id = 0;
4089 hur->hur_request.hr_flags = 0;
4090 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4091 sizeof(hur->hur_user_item[0].hui_fid));
4092 hur->hur_user_item[0].hui_extent.offset = offset;
4093 hur->hur_user_item[0].hui_extent.length = length;
4094 hur->hur_request.hr_itemcount = 1;
4095 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,