4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2013, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include <linux/sched.h>
49 #include "llite_internal.h"
50 #include <lustre/ll_fiemap.h>
51 #include <lustre_ioctl.h>
53 #include "cl_object.h"
/* Forward declarations for static helpers defined later in this file.
 * NOTE(review): this excerpt is elided — return types / continuation
 * lines of some prototypes are missing. */
56 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
58 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
61 static enum llioc_iter
62 ll_iocontrol_call(struct inode *inode, struct file *file,
63 unsigned int cmd, unsigned long arg, int *rcp);
/* Allocate a per-open ll_file_data from its slab cache and initialise it.
 * GFP_NOFS prevents memory reclaim from re-entering the filesystem.
 * NOTE(review): the allocation-failure check and the return statement
 * appear elided in this excerpt — presumably returns fd (or NULL). */
65 static struct ll_file_data *ll_file_data_get(void)
67 struct ll_file_data *fd;
69 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
73 fd->fd_write_failed = false;
/* Return a ll_file_data to the slab cache (counterpart of
 * ll_file_data_get()). */
78 static void ll_file_data_put(struct ll_file_data *fd)
81 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/* Pack the inode's current attributes (mode, a/m/ctime, size, blocks,
 * ext-style flags), its IO epoch, the given open handle @fh and the MDS
 * capability into @op_data, ready for an MDS RPC (e.g. close/setattr).
 * If the inode was modified while the MDS was unaware, request the
 * MDS_DATA_MODIFIED bias so the server refreshes its state. */
84 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
85 struct lustre_handle *fh)
87 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
88 op_data->op_attr.ia_mode = inode->i_mode;
89 op_data->op_attr.ia_atime = inode->i_atime;
90 op_data->op_attr.ia_mtime = inode->i_mtime;
91 op_data->op_attr.ia_ctime = inode->i_ctime;
92 op_data->op_attr.ia_size = i_size_read(inode);
93 op_data->op_attr_blocks = inode->i_blocks;
94 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
95 ll_inode_to_ext_flags(inode->i_flags);
96 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
98 op_data->op_handle = *fh;
99 op_data->op_capa1 = ll_mdscapa_get(inode);
101 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
102 op_data->op_bias |= MDS_DATA_MODIFIED;
106 * Closes the IO epoch and packs all the attributes into @op_data for
/* the close RPC.  Size/blocks are only marked valid when Size-on-MDS is
 * not in use (or the file is not regular), since otherwise the size is
 * obtained from the OSTs afterwards.
 * NOTE(review): lines are elided in this excerpt; ll_ioepoch_close()
 * takes &och and may clear it, which the missing lines likely check. */
109 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
110 struct obd_client_handle *och)
114 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
115 ATTR_MTIME | ATTR_MTIME_SET |
116 ATTR_CTIME | ATTR_CTIME_SET;
118 if (!(och->och_flags & FMODE_WRITE))
121 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
122 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
124 ll_ioepoch_close(inode, op_data, &och, 0);
127 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
128 ll_prep_md_op_data(op_data, inode, NULL, NULL,
129 0, 0, LUSTRE_OPC_ANY, NULL);
/* Send an MDS_CLOSE RPC for the open handle @och on @inode.
 * If @data_version is non-NULL this close is part of an HSM release:
 * the MDS_HSM_RELEASE bias, the data version and the lease handle are
 * packed so the MDT can atomically release the file's data objects.
 * On success for a modified file, clears LLIF_DATA_MODIFIED; may queue
 * DONE_WRITING handling when the IO epoch was not closed here.
 * NOTE(review): this excerpt is elided — error-branch braces, RETURNs
 * and several statements are missing between the visible lines. */
133 static int ll_close_inode_openhandle(struct obd_export *md_exp,
135 struct obd_client_handle *och,
136 const __u64 *data_version)
138 struct obd_export *exp = ll_i2mdexp(inode);
139 struct md_op_data *op_data;
140 struct ptlrpc_request *req = NULL;
141 struct obd_device *obd = class_exp2obd(exp);
148 * XXX: in case of LMV, is this correct to access
151 CERROR("Invalid MDC connection handle "LPX64"\n",
152 ll_i2mdexp(inode)->exp_handle.h_cookie);
156 OBD_ALLOC_PTR(op_data);
158 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
160 ll_prepare_close(inode, op_data, och);
161 if (data_version != NULL) {
162 /* Pass in data_version implies release. */
163 op_data->op_bias |= MDS_HSM_RELEASE;
164 op_data->op_data_version = *data_version;
165 op_data->op_lease_handle = och->och_lease_handle;
166 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
168 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
169 rc = md_close(md_exp, op_data, och->och_mod, &req);
171 /* This close must have the epoch closed. */
172 LASSERT(epoch_close);
173 /* MDS has instructed us to obtain Size-on-MDS attribute from
174 * OSTs and send setattr to back to MDS. */
175 rc = ll_som_update(inode, op_data);
177 CERROR("%s: inode "DFID" mdc Size-on-MDS update"
178 " failed: rc = %d\n",
179 ll_i2mdexp(inode)->exp_obd->obd_name,
180 PFID(ll_inode2fid(inode)), rc);
184 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
185 ll_i2mdexp(inode)->exp_obd->obd_name,
186 PFID(ll_inode2fid(inode)), rc);
189 /* DATA_MODIFIED flag was successfully sent on close, cancel data
190 * modification flag. */
191 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
192 struct ll_inode_info *lli = ll_i2info(inode);
194 spin_lock(&lli->lli_lock);
195 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
196 spin_unlock(&lli->lli_lock);
200 rc = ll_objects_destroy(req, inode);
202 CERROR("%s: inode "DFID
203 " ll_objects destroy: rc = %d\n",
204 ll_i2mdexp(inode)->exp_obd->obd_name,
205 PFID(ll_inode2fid(inode)), rc);
208 if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
209 struct mdt_body *body;
210 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
211 if (!(body->valid & OBD_MD_FLRELEASED))
215 ll_finish_md_op_data(op_data);
/* If SOM is enabled and the epoch stayed open on a written regular
 * file, defer the final attribute update to the DONE_WRITING path. */
219 if (exp_connect_som(exp) && !epoch_close &&
220 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
221 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
223 md_clear_open_replay_data(md_exp, och);
224 /* Free @och if it is not waiting for DONE_WRITING. */
225 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
228 if (req) /* This is close request */
229 ptlrpc_req_finished(req);
/* Close the MDS open handle of the given access mode (write/exec/read)
 * for @inode, but only when no other opener of that mode remains.
 * Selects the per-mode handle pointer and use count under
 * lli_och_mutex; if the count is still positive the close is skipped.
 * NOTE(review): excerpt is elided — the handle-stealing under the mutex
 * and the RETURN path are not visible here. */
233 int ll_md_real_close(struct inode *inode, fmode_t fmode)
235 struct ll_inode_info *lli = ll_i2info(inode);
236 struct obd_client_handle **och_p;
237 struct obd_client_handle *och;
242 if (fmode & FMODE_WRITE) {
243 och_p = &lli->lli_mds_write_och;
244 och_usecount = &lli->lli_open_fd_write_count;
245 } else if (fmode & FMODE_EXEC) {
246 och_p = &lli->lli_mds_exec_och;
247 och_usecount = &lli->lli_open_fd_exec_count;
249 LASSERT(fmode & FMODE_READ);
250 och_p = &lli->lli_mds_read_och;
251 och_usecount = &lli->lli_open_fd_read_count;
254 mutex_lock(&lli->lli_och_mutex);
255 if (*och_usecount > 0) {
256 /* There are still users of this handle, so skip
258 mutex_unlock(&lli->lli_och_mutex);
264 mutex_unlock(&lli->lli_och_mutex);
267 /* There might be a race and this handle may already
269 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/* Per-file-descriptor close path: drop any group lock, clean up a
 * leftover lease or private open handle, decrement the per-mode open
 * count, and — unless a matching OPEN DLM lock lets us skip the RPC —
 * call ll_md_real_close() to close the MDS handle.  Finally detaches
 * and frees the ll_file_data.
 * NOTE(review): excerpt is elided — lockmode setup, some braces and the
 * RETURN are missing between the visible lines. */
276 static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
279 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
280 struct ll_inode_info *lli = ll_i2info(inode);
284 /* clear group lock, if present */
285 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
286 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
288 if (fd->fd_lease_och != NULL) {
291 /* Usually the lease is not released when the
292 * application crashed, we need to release here. */
293 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
294 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
295 PFID(&lli->lli_fid), rc, lease_broken);
297 fd->fd_lease_och = NULL;
300 if (fd->fd_och != NULL) {
301 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
306 /* Let's see if we have good enough OPEN lock on the file and if
307 we can skip talking to MDS */
308 if (file->f_dentry->d_inode) { /* Can this ever be false? */
310 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
311 struct lustre_handle lockh;
312 struct inode *inode = file->f_dentry->d_inode;
313 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
315 mutex_lock(&lli->lli_och_mutex);
316 if (fd->fd_omode & FMODE_WRITE) {
318 LASSERT(lli->lli_open_fd_write_count);
319 lli->lli_open_fd_write_count--;
320 } else if (fd->fd_omode & FMODE_EXEC) {
322 LASSERT(lli->lli_open_fd_exec_count);
323 lli->lli_open_fd_exec_count--;
326 LASSERT(lli->lli_open_fd_read_count);
327 lli->lli_open_fd_read_count--;
329 mutex_unlock(&lli->lli_och_mutex);
331 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
332 LDLM_IBITS, &policy, lockmode,
334 rc = ll_md_real_close(file->f_dentry->d_inode,
338 CERROR("Releasing a file %p with negative dentry %p. Name %s",
339 file, file->f_dentry, file->f_dentry->d_name.name);
343 LUSTRE_FPRIVATE(file) = NULL;
344 ll_file_data_put(fd);
345 ll_capa_close(inode);
350 /* While this returns an error code, fput() the caller does not, so we need
351 * to make every effort to clean up all of our state here. Also, applications
352 * rarely check close errors and even if an error is returned they will not
353 * re-try the close call.
/* VFS ->release() hook: remote-ACL teardown for the root inode, LPROC
 * accounting, statahead stop for directories, async-rc harvesting for
 * regular files, then ll_md_close() for the real close.  The root
 * dentry short-circuits without an MDS close.
 * NOTE(review): excerpt is elided — early RETURNs and some braces are
 * missing between the visible lines. */
355 int ll_file_release(struct inode *inode, struct file *file)
357 struct ll_file_data *fd;
358 struct ll_sb_info *sbi = ll_i2sbi(inode);
359 struct ll_inode_info *lli = ll_i2info(inode);
363 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
364 PFID(ll_inode2fid(inode)), inode);
366 #ifdef CONFIG_FS_POSIX_ACL
367 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
368 inode == inode->i_sb->s_root->d_inode) {
369 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
372 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
373 fd->fd_flags &= ~LL_FILE_RMTACL;
374 rct_del(&sbi->ll_rct, current_pid());
375 et_search_free(&sbi->ll_et, current_pid());
380 if (inode->i_sb->s_root != file->f_dentry)
381 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
382 fd = LUSTRE_FPRIVATE(file);
385 /* The last ref on @file, maybe not the the owner pid of statahead.
386 * Different processes can open the same dir, "ll_opendir_key" means:
387 * it is me that should stop the statahead thread. */
388 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
389 lli->lli_opendir_pid != 0)
390 ll_stop_statahead(inode, lli->lli_opendir_key);
392 if (inode->i_sb->s_root == file->f_dentry) {
393 LUSTRE_FPRIVATE(file) = NULL;
394 ll_file_data_put(fd);
398 if (!S_ISDIR(inode->i_mode)) {
399 if (lli->lli_clob != NULL)
400 lov_read_and_clear_async_rc(lli->lli_clob);
401 lli->lli_async_rc = 0;
404 rc = ll_md_close(sbi->ll_md_exp, inode, file);
406 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
407 libcfs_debug_dumplog();
/* Send an IT_OPEN intent to the MDS for @file, optionally carrying
 * striping data (@lmm/@lmmsize) when used for setstripe.  Requests an
 * OPEN lock only for plain opens (lmm == NULL && lmmsize == 0); on a
 * successful enqueue, updates the inode from the reply and records the
 * granted lock, dropping it before returning.
 * NOTE(review): excerpt is elided — the IS_ERR(op_data) check, -ESTALE
 * fast path and cleanup labels are partially missing. */
412 static int ll_intent_file_open(struct file *file, void *lmm,
413 int lmmsize, struct lookup_intent *itp)
415 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
416 struct dentry *parent = file->f_dentry->d_parent;
417 const char *name = file->f_dentry->d_name.name;
418 const int len = file->f_dentry->d_name.len;
419 struct md_op_data *op_data;
420 struct ptlrpc_request *req = NULL;
421 __u32 opc = LUSTRE_OPC_ANY;
428 /* Usually we come here only for NFSD, and we want open lock.
429 But we can also get here with pre 2.6.15 patchless kernels, and in
430 that case that lock is also ok */
431 /* We can also get here if there was cached open handle in revalidate_it
432 * but it disappeared while we were getting from there to ll_file_open.
433 * But this means this file was closed and immediatelly opened which
434 * makes a good candidate for using OPEN lock */
435 /* If lmmsize & lmm are not 0, we are just setting stripe info
436 * parameters. No need for the open lock */
437 if (lmm == NULL && lmmsize == 0) {
438 itp->it_flags |= MDS_OPEN_LOCK;
439 if (itp->it_flags & FMODE_WRITE)
440 opc = LUSTRE_OPC_CREATE;
443 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
444 file->f_dentry->d_inode, name, len,
447 RETURN(PTR_ERR(op_data));
449 op_data->op_data = lmm;
450 op_data->op_data_size = lmmsize;
452 itp->it_flags |= MDS_OPEN_BY_FID;
453 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
454 &ll_md_blocking_ast, 0);
455 ll_finish_md_op_data(op_data);
457 /* reason for keep own exit path - don`t flood log
458 * with messages with -ESTALE errors.
460 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
461 it_open_error(DISP_OPEN_OPEN, itp))
463 ll_release_openhandle(file->f_dentry, itp);
467 if (it_disposition(itp, DISP_LOOKUP_NEG))
468 GOTO(out, rc = -ENOENT);
470 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
471 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
472 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
476 rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
477 if (!rc && itp->d.lustre.it_lock_mode)
478 ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
482 ptlrpc_req_finished(req);
483 ll_intent_drop_lock(itp);
489 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
490 * not believe attributes if a few ioepoch holders exist. Attributes for
491 * previous ioepoch if new one is opened are also skipped by MDS.
493 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
/* Only record a non-zero epoch that differs from the current one. */
495 if (ioepoch && lli->lli_ioepoch != ioepoch) {
496 lli->lli_ioepoch = ioepoch;
497 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
498 ioepoch, PFID(&lli->lli_fid));
/* Fill an obd_client_handle from the MDS open reply carried in @it:
 * file handle, FID, lease lock handle and requested open flags.
 * Registers the open with the replay machinery; returns its status. */
502 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
503 struct obd_client_handle *och)
505 struct ptlrpc_request *req = it->d.lustre.it_data;
506 struct mdt_body *body;
508 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
509 och->och_fh = body->handle;
510 och->och_fid = body->fid1;
511 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
512 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
513 och->och_flags = it->it_flags;
515 return md_set_open_replay_data(md_exp, och, it);
/* Finish an open locally: optionally fill @och from the intent reply
 * (and pick up the IO epoch from the MDT body), then attach @fd as the
 * file's private data and initialise read-ahead state.
 * NOTE(review): excerpt is elided — the "if (och)" guard and the error
 * return after ll_och_fill() are not visible here. */
518 static int ll_local_open(struct file *file, struct lookup_intent *it,
519 struct ll_file_data *fd, struct obd_client_handle *och)
521 struct inode *inode = file->f_dentry->d_inode;
522 struct ll_inode_info *lli = ll_i2info(inode);
525 LASSERT(!LUSTRE_FPRIVATE(file));
530 struct ptlrpc_request *req = it->d.lustre.it_data;
531 struct mdt_body *body;
534 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
538 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
539 ll_ioepoch_open(lli, body->ioepoch);
542 LUSTRE_FPRIVATE(file) = fd;
543 ll_readahead_init(inode, &fd->fd_ras);
/* Remember the access mode this descriptor was opened with so the
 * close path can decrement the matching per-mode open count. */
544 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
549 /* Open a file, and (for the very first open) create objects on the OSTs at
550 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
551 * creation or open until ll_lov_setstripe() ioctl is called.
553 * If we already have the stripe MD locally then we don't request it in
554 * md_open(), by passing a lmm_size = 0.
556 * It is up to the application to ensure no other processes open this file
557 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
558 * used. We might be able to avoid races of that sort by getting lli_open_sem
559 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
560 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/* VFS ->open() hook.  Reuses an existing per-mode MDS open handle when
 * possible; otherwise performs an intent open (possibly synthesised
 * here when no cached intent was passed in via file->private_data).
 * NOTE(review): this excerpt is elided throughout — NULL checks,
 * RETURNs, label definitions (out_och_free/out_openerr) and several
 * braces are missing between the visible lines. */
562 int ll_file_open(struct inode *inode, struct file *file)
564 struct ll_inode_info *lli = ll_i2info(inode);
565 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
566 .it_flags = file->f_flags };
567 struct obd_client_handle **och_p = NULL;
568 __u64 *och_usecount = NULL;
569 struct ll_file_data *fd;
570 int rc = 0, opendir_set = 0;
573 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
574 PFID(ll_inode2fid(inode)), inode, file->f_flags);
576 it = file->private_data; /* XXX: compat macro */
577 file->private_data = NULL; /* prevent ll_local_open assertion */
579 fd = ll_file_data_get();
581 GOTO(out_openerr, rc = -ENOMEM);
/* For directories, try to become the statahead owner of this inode. */
584 if (S_ISDIR(inode->i_mode)) {
585 spin_lock(&lli->lli_sa_lock);
586 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
587 lli->lli_opendir_pid == 0) {
588 lli->lli_opendir_key = fd;
589 lli->lli_opendir_pid = current_pid();
592 spin_unlock(&lli->lli_sa_lock);
595 if (inode->i_sb->s_root == file->f_dentry) {
596 LUSTRE_FPRIVATE(file) = fd;
600 if (!it || !it->d.lustre.it_disposition) {
601 /* Convert f_flags into access mode. We cannot use file->f_mode,
602 * because everything but O_ACCMODE mask was stripped from
604 if ((oit.it_flags + 1) & O_ACCMODE)
606 if (file->f_flags & O_TRUNC)
607 oit.it_flags |= FMODE_WRITE;
609 /* kernel only call f_op->open in dentry_open. filp_open calls
610 * dentry_open after call to open_namei that checks permissions.
611 * Only nfsd_open call dentry_open directly without checking
612 * permissions and because of that this code below is safe. */
613 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
614 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
616 /* We do not want O_EXCL here, presumably we opened the file
617 * already? XXX - NFS implications? */
618 oit.it_flags &= ~O_EXCL;
620 /* bug20584, if "it_flags" contains O_CREAT, the file will be
621 * created if necessary, then "IT_CREAT" should be set to keep
622 * consistent with it */
623 if (oit.it_flags & O_CREAT)
624 oit.it_op |= IT_CREAT;
630 /* Let's see if we have file open on MDS already. */
631 if (it->it_flags & FMODE_WRITE) {
632 och_p = &lli->lli_mds_write_och;
633 och_usecount = &lli->lli_open_fd_write_count;
634 } else if (it->it_flags & FMODE_EXEC) {
635 och_p = &lli->lli_mds_exec_och;
636 och_usecount = &lli->lli_open_fd_exec_count;
638 och_p = &lli->lli_mds_read_och;
639 och_usecount = &lli->lli_open_fd_read_count;
642 mutex_lock(&lli->lli_och_mutex);
643 if (*och_p) { /* Open handle is present */
644 if (it_disposition(it, DISP_OPEN_OPEN)) {
645 /* Well, there's extra open request that we do not need,
646 let's close it somehow. This will decref request. */
647 rc = it_open_error(DISP_OPEN_OPEN, it);
649 mutex_unlock(&lli->lli_och_mutex);
650 GOTO(out_openerr, rc);
653 ll_release_openhandle(file->f_dentry, it);
657 rc = ll_local_open(file, it, fd, NULL);
660 mutex_unlock(&lli->lli_och_mutex);
661 GOTO(out_openerr, rc);
664 LASSERT(*och_usecount == 0);
665 if (!it->d.lustre.it_disposition) {
666 /* We cannot just request lock handle now, new ELC code
667 means that one of other OPEN locks for this file
668 could be cancelled, and since blocking ast handler
669 would attempt to grab och_mutex as well, that would
670 result in a deadlock */
671 mutex_unlock(&lli->lli_och_mutex);
672 it->it_create_mode |= M_CHECK_STALE;
673 rc = ll_intent_file_open(file, NULL, 0, it);
674 it->it_create_mode &= ~M_CHECK_STALE;
676 GOTO(out_openerr, rc);
680 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
682 GOTO(out_och_free, rc = -ENOMEM);
686 /* md_intent_lock() didn't get a request ref if there was an
687 * open error, so don't do cleanup on the request here
689 /* XXX (green): Should not we bail out on any error here, not
690 * just open error? */
691 rc = it_open_error(DISP_OPEN_OPEN, it);
693 GOTO(out_och_free, rc);
695 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
696 "inode %p: disposition %x, status %d\n", inode,
697 it_disposition(it, ~0), it->d.lustre.it_status);
699 rc = ll_local_open(file, it, fd, *och_p);
701 GOTO(out_och_free, rc);
703 mutex_unlock(&lli->lli_och_mutex);
706 /* Must do this outside lli_och_mutex lock to prevent deadlock where
707 different kind of OPEN lock for this same inode gets cancelled
708 by ldlm_cancel_lru */
709 if (!S_ISREG(inode->i_mode))
710 GOTO(out_och_free, rc);
/* O_LOV_DELAY_CREATE (or read-only open of an object-less file):
 * skip object creation until setstripe or first write. */
714 if (!lli->lli_has_smd &&
715 (cl_is_lov_delay_create(file->f_flags) ||
716 (file->f_mode & FMODE_WRITE) == 0)) {
717 CDEBUG(D_INODE, "object creation was delayed\n");
718 GOTO(out_och_free, rc);
720 cl_lov_delay_create_clear(&file->f_flags);
721 GOTO(out_och_free, rc);
725 if (och_p && *och_p) {
726 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
727 *och_p = NULL; /* OBD_FREE writes some magic there */
730 mutex_unlock(&lli->lli_och_mutex);
733 if (opendir_set != 0)
734 ll_stop_statahead(inode, lli->lli_opendir_key);
736 ll_file_data_put(fd);
738 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
741 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
742 ptlrpc_req_finished(it->d.lustre.it_data);
743 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* Blocking AST for lease locks: on a blocking callback, cancel the
 * lease lock asynchronously (the lease is thereby broken); the
 * CANCELING case visible below presumably needs no handle cleanup.
 * NOTE(review): excerpt is elided — the switch statement, default case
 * and RETURN are not fully visible. */
749 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
750 struct ldlm_lock_desc *desc, void *data, int flag)
753 struct lustre_handle lockh;
757 case LDLM_CB_BLOCKING:
758 ldlm_lock2handle(lock, &lockh);
759 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
761 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
765 case LDLM_CB_CANCELING:
773 * Acquire a lease and open the file.
/* Requests an MDS lease (read or write only) via an IT_OPEN intent with
 * MDS_OPEN_LEASE.  When @file is given, the existing open handle is
 * passed as op_handle so the MDT can treat it as the same owner; this
 * requires the caller to be the sole opener of that mode.  Returns the
 * obd_client_handle holding the lease lock, or an ERR_PTR.
 * NOTE(review): excerpt is elided — och allocation, handle-stealing of
 * fd_och, and the RETURN/label lines are partially missing. */
775 static struct obd_client_handle *
776 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
779 struct lookup_intent it = { .it_op = IT_OPEN };
780 struct ll_sb_info *sbi = ll_i2sbi(inode);
781 struct md_op_data *op_data;
782 struct ptlrpc_request *req = NULL;
783 struct lustre_handle old_handle = { 0 };
784 struct obd_client_handle *och = NULL;
789 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
790 RETURN(ERR_PTR(-EINVAL));
793 struct ll_inode_info *lli = ll_i2info(inode);
794 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
795 struct obd_client_handle **och_p;
798 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
799 RETURN(ERR_PTR(-EPERM));
801 /* Get the openhandle of the file */
803 mutex_lock(&lli->lli_och_mutex);
804 if (fd->fd_lease_och != NULL) {
805 mutex_unlock(&lli->lli_och_mutex);
809 if (fd->fd_och == NULL) {
810 if (file->f_mode & FMODE_WRITE) {
811 LASSERT(lli->lli_mds_write_och != NULL);
812 och_p = &lli->lli_mds_write_och;
813 och_usecount = &lli->lli_open_fd_write_count;
815 LASSERT(lli->lli_mds_read_och != NULL);
816 och_p = &lli->lli_mds_read_och;
817 och_usecount = &lli->lli_open_fd_read_count;
819 if (*och_usecount == 1) {
826 mutex_unlock(&lli->lli_och_mutex);
827 if (rc < 0) /* more than 1 opener */
830 LASSERT(fd->fd_och != NULL);
831 old_handle = fd->fd_och->och_fh;
836 RETURN(ERR_PTR(-ENOMEM));
838 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
839 LUSTRE_OPC_ANY, NULL);
841 GOTO(out, rc = PTR_ERR(op_data));
843 /* To tell the MDT this openhandle is from the same owner */
844 op_data->op_handle = old_handle;
846 it.it_flags = fmode | open_flags;
847 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
848 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
849 &ll_md_blocking_lease_ast,
850 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
851 * it can be cancelled which may mislead applications that the lease is
853 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
854 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
855 * doesn't deal with openhandle, so normal openhandle will be leaked. */
856 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
857 ll_finish_md_op_data(op_data);
858 ptlrpc_req_finished(req);
860 GOTO(out_release_it, rc);
862 if (it_disposition(&it, DISP_LOOKUP_NEG))
863 GOTO(out_release_it, rc = -ENOENT);
865 rc = it_open_error(DISP_OPEN_OPEN, &it);
867 GOTO(out_release_it, rc);
869 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
870 ll_och_fill(sbi->ll_md_exp, &it, och);
872 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
873 GOTO(out_close, rc = -EOPNOTSUPP);
875 /* already get lease, handle lease lock */
876 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
877 if (it.d.lustre.it_lock_mode == 0 ||
878 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
879 /* open lock must return for lease */
880 CERROR(DFID "lease granted but no open lock, %d/"LPU64".\n",
881 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
882 it.d.lustre.it_lock_bits);
883 GOTO(out_close, rc = -EPROTO);
886 ll_intent_release(&it);
890 /* Cancel open lock */
891 if (it.d.lustre.it_lock_mode != 0) {
892 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
893 it.d.lustre.it_lock_mode);
894 it.d.lustre.it_lock_mode = 0;
895 och->och_lease_handle.cookie = 0ULL;
897 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
899 CERROR("%s: error closing file "DFID": %d\n",
900 ll_get_fsname(inode->i_sb, NULL, 0),
901 PFID(&ll_i2info(inode)->lli_fid), rc2);
902 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
904 ll_intent_release(&it);
912 * Release lease and close the file.
913 * It will check if the lease has ever broken.
/* If the lease lock still exists and was not already cancelled by the
 * blocking AST, cancel it here; report broken/not-broken through
 * @lease_broken, then close the MDS open handle. */
915 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
918 struct ldlm_lock *lock;
919 bool cancelled = true;
923 lock = ldlm_handle2lock(&och->och_lease_handle);
925 lock_res_and_lock(lock);
926 cancelled = ldlm_is_cancel(lock);
927 unlock_res_and_lock(lock);
931 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
932 PFID(&ll_i2info(inode)->lli_fid), cancelled);
935 ldlm_cli_cancel(&och->och_lease_handle, 0);
936 if (lease_broken != NULL)
937 *lease_broken = cancelled;
939 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
944 /* Fills the obdo with the attributes for the lsm */
/* Issue an async OST getattr over all stripes of @lsm and wait for it.
 * @ioepoch tags the request; dv_flags may request a server-side lock
 * (OBD_FL_SRVLOCK) and, for LL_DV_WR_FLUSH, a flush of dirty data.
 * NOTE(review): excerpt is elided — oinfo.oi_md assignment, oi_oa
 * setup, set NULL check and the final valid-flag verification branch
 * are partially missing. */
945 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
946 struct obd_capa *capa, struct obdo *obdo,
947 __u64 ioepoch, int dv_flags)
949 struct ptlrpc_request_set *set;
950 struct obd_info oinfo = { { { 0 } } };
955 LASSERT(lsm != NULL);
959 oinfo.oi_oa->o_oi = lsm->lsm_oi;
960 oinfo.oi_oa->o_mode = S_IFREG;
961 oinfo.oi_oa->o_ioepoch = ioepoch;
962 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
963 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
964 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
965 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
966 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
967 OBD_MD_FLDATAVERSION;
968 oinfo.oi_capa = capa;
969 if (dv_flags & (LL_DV_WR_FLUSH | LL_DV_RD_FLUSH)) {
970 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
971 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
972 if (dv_flags & LL_DV_WR_FLUSH)
973 oinfo.oi_oa->o_flags |= OBD_FL_FLUSH;
976 set = ptlrpc_prep_set();
978 CERROR("can't allocate ptlrpc set\n");
981 rc = obd_getattr_async(exp, &oinfo, set);
983 rc = ptlrpc_set_wait(set);
984 ptlrpc_set_destroy(set);
/* Keep only the attribute bits callers of this helper rely on. */
987 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
988 OBD_MD_FLATIME | OBD_MD_FLMTIME |
989 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
990 OBD_MD_FLDATAVERSION | OBD_MD_FLFLAGS);
991 if (dv_flags & LL_DV_WR_FLUSH &&
992 !(oinfo.oi_oa->o_valid & OBD_MD_FLFLAGS &&
993 oinfo.oi_oa->o_flags & OBD_FL_FLUSH))
1000 * Performs the getattr on the inode and updates its fields.
1001 * If @sync != 0, perform the getattr under the server-side lock.
1003 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
1004 __u64 ioepoch, int sync)
1006 struct obd_capa *capa = ll_mdscapa_get(inode);
1007 struct lov_stripe_md *lsm;
1011 lsm = ccc_inode_lsm_get(inode);
1012 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
1013 capa, obdo, ioepoch, sync ? LL_DV_RD_FLUSH : 0);
/* On success, refresh the VFS inode from the returned obdo and log
 * the resulting size/blocks for debugging. */
1016 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
1018 obdo_refresh_inode(inode, obdo, obdo->o_valid);
1019 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
1020 " blksize %lu\n", POSTID(oi), i_size_read(inode),
1021 (unsigned long long)inode->i_blocks,
1022 (unsigned long)ll_inode_blksize(inode));
1024 ccc_inode_lsm_put(inode, lsm);
/* Merge MDS-provided timestamps with OST attributes into the inode,
 * keeping the most recent of each timestamp, and update size/blocks
 * from the cl_object attributes — all under the inode size lock.
 * NOTE(review): excerpt is elided — lvb declaration, the attr-get
 * success check and the RETURN are not fully visible. */
1028 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
1030 struct ll_inode_info *lli = ll_i2info(inode);
1031 struct cl_object *obj = lli->lli_clob;
1032 struct cl_attr *attr = ccc_env_thread_attr(env);
1038 ll_inode_size_lock(inode);
1039 /* merge timestamps the most recently obtained from mds with
1040 timestamps obtained from osts */
1041 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
1042 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
1043 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
1044 inode_init_lvb(inode, &lvb);
1046 cl_object_attr_lock(obj);
1047 rc = cl_object_attr_get(env, obj, attr);
1048 cl_object_attr_unlock(obj);
/* Take the newer of MDS vs OST timestamps for each field. */
1051 if (lvb.lvb_atime < attr->cat_atime)
1052 lvb.lvb_atime = attr->cat_atime;
1053 if (lvb.lvb_ctime < attr->cat_ctime)
1054 lvb.lvb_ctime = attr->cat_ctime;
1055 if (lvb.lvb_mtime < attr->cat_mtime)
1056 lvb.lvb_mtime = attr->cat_mtime;
1058 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
1059 PFID(&lli->lli_fid), attr->cat_size);
1060 cl_isize_write_nolock(inode, attr->cat_size);
1062 inode->i_blocks = attr->cat_blocks;
1064 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1065 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1066 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1068 ll_inode_size_unlock(inode);
/* Glimpse helper for ioctls: fetch current OST attributes for @lsm and
 * copy size/blocks/times into the caller's stat structure @st.
 * NOTE(review): excerpt is elided — the rc check before the st_*
 * assignments and the RETURN are not visible. */
1073 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1076 struct obdo obdo = { 0 };
1079 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1081 st->st_size = obdo.o_size;
1082 st->st_blocks = obdo.o_blocks;
1083 st->st_mtime = obdo.o_mtime;
1084 st->st_atime = obdo.o_atime;
1085 st->st_ctime = obdo.o_ctime;
/* Decide whether atime updates should be suppressed for @file, using
 * the same checks as the kernel's file_accessed()/touch_atime().
 * NOTE(review): excerpt is elided — each condition's "return true;"
 * and the final "return false;" are missing between visible lines. */
1090 static bool file_is_noatime(const struct file *file)
1092 const struct vfsmount *mnt = file->f_path.mnt;
1093 const struct inode *inode = file->f_path.dentry->d_inode;
1095 /* Adapted from file_accessed() and touch_atime().*/
1096 if (file->f_flags & O_NOATIME)
1099 if (inode->i_flags & S_NOATIME)
1102 if (IS_NOATIME(inode))
1105 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1108 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1111 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/* Initialise a cl_io for a read or write on @file: non-blocking and
 * append/sync flags from f_flags, lock policy (never for no-lock files,
 * mandatory for O_APPEND, otherwise "maybe"), and the noatime hint. */
1117 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
1119 struct inode *inode = file->f_dentry->d_inode;
1121 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1123 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1124 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1125 file->f_flags & O_DIRECT ||
1128 io->ci_obj = ll_i2info(inode)->lli_clob;
1129 io->ci_lockreq = CILR_MAYBE;
1130 if (ll_file_nolock(file)) {
1131 io->ci_lockreq = CILR_NEVER;
1132 io->ci_no_srvlock = 1;
1133 } else if (file->f_flags & O_APPEND) {
1134 io->ci_lockreq = CILR_MANDATORY;
1137 io->ci_noatime = file_is_noatime(file);
/* Common driver for read/write/splice IO: builds a cl_io of type @iot
 * starting at *ppos for @count bytes, runs cl_io_loop(), and updates
 * *ppos and the per-mount byte statistics.  Serialises writes against
 * truncate via lli_write_mutex / lli_trunc_sem.  Restarts (up to once,
 * presumably via a loop elided here) when ci_need_restart is set and
 * nothing was transferred.
 * NOTE(review): excerpt is elided — env/result declarations, the
 * restart loop construct and the final RETURN are missing. */
1141 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1142 struct file *file, enum cl_io_type iot,
1143 loff_t *ppos, size_t count)
1145 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
1146 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1151 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zd\n",
1152 file->f_dentry->d_name.name, iot, *ppos, count);
1155 io = ccc_env_thread_io(env);
1156 ll_io_init(io, file, iot == CIT_WRITE);
1158 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1159 struct vvp_io *vio = vvp_env_io(env);
1160 struct ccc_io *cio = ccc_env_io(env);
1161 int write_mutex_locked = 0;
1163 cio->cui_fd = LUSTRE_FPRIVATE(file);
1164 vio->cui_io_subtype = args->via_io_subtype;
1166 switch (vio->cui_io_subtype) {
1168 cio->cui_iov = args->u.normal.via_iov;
1169 cio->cui_nrsegs = args->u.normal.via_nrsegs;
1170 cio->cui_tot_nrsegs = cio->cui_nrsegs;
1171 cio->cui_iocb = args->u.normal.via_iocb;
1172 if ((iot == CIT_WRITE) &&
1173 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1174 if (mutex_lock_interruptible(&lli->
1176 GOTO(out, result = -ERESTARTSYS);
1177 write_mutex_locked = 1;
1179 down_read(&lli->lli_trunc_sem);
1182 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1183 vio->u.splice.cui_flags = args->u.splice.via_flags;
1186 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
1189 result = cl_io_loop(env, io);
1190 if (args->via_io_subtype == IO_NORMAL)
1191 up_read(&lli->lli_trunc_sem);
1192 if (write_mutex_locked)
1193 mutex_unlock(&lli->lli_write_mutex);
1195 /* cl_io_rw_init() handled IO */
1196 result = io->ci_result;
1199 if (io->ci_nob > 0) {
1200 result = io->ci_nob;
1201 *ppos = io->u.ci_wr.wr.crw_pos;
1205 cl_io_fini(env, io);
1206 /* If any bit been read/written (result != 0), we just return
1207 * short read/write instead of restart io. */
1208 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1209 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
1210 iot == CIT_READ ? "read" : "write",
1211 file->f_dentry->d_name.name, *ppos, count);
1212 LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
1216 if (iot == CIT_READ) {
1218 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1219 LPROC_LL_READ_BYTES, result);
1220 } else if (iot == CIT_WRITE) {
1222 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1223 LPROC_LL_WRITE_BYTES, result);
1224 fd->fd_write_failed = false;
1225 } else if (result != -ERESTARTSYS) {
1226 fd->fd_write_failed = true;
1229 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
/*
 * Validate a user iovec array and compute the total byte count.
 *
 * Rejects any segment with a negative length or a cumulative length
 * that wraps past SSIZE_MAX; a segment that fails access_ok() truncates
 * the transfer at that point (the running count is reduced by the bad
 * segment's length).  Returns 0 with *count set on success.
 *
 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
 */
1238 static int ll_file_get_iov_count(const struct iovec *iov,
1239 unsigned long *nr_segs, size_t *count)
1244 for (seg = 0; seg < *nr_segs; seg++) {
1245 const struct iovec *iv = &iov[seg];
1248 * If any segment has a negative length, or the cumulative
1249 * length ever wraps negative then return -EINVAL.
/* or-ing both values makes one signedness test catch either one
 * being negative/wrapped */
1252 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1254 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1259 cnt -= iv->iov_len; /* This segment is no good */
/*
 * aio_read entry point: validate the iovec, fill a per-thread
 * vvp_io_args with the normal-IO description and hand off to
 * ll_file_io_generic() with CIT_READ.  Position is taken from and
 * written back through iocb->ki_pos.
 */
1266 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1267 unsigned long nr_segs, loff_t pos)
1270 struct vvp_io_args *args;
1276 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1280 env = cl_env_get(&refcheck);
1282 RETURN(PTR_ERR(env));
1284 args = vvp_env_args(env, IO_NORMAL);
/* cast drops const: ll_file_io_generic's context stores a mutable
 * iovec pointer; the segments themselves are only read here */
1285 args->u.normal.via_iov = (struct iovec *)iov;
1286 args->u.normal.via_nrsegs = nr_segs;
1287 args->u.normal.via_iocb = iocb;
1289 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1290 &iocb->ki_pos, count);
1291 cl_env_put(env, &refcheck);
/*
 * Synchronous read(): wrap the user buffer in a single-segment iovec
 * and a sync kiocb (both from per-thread vvp storage), then reuse the
 * aio path.  *ppos is advanced to the kiocb's final position.
 */
1295 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1299 struct iovec *local_iov;
1300 struct kiocb *kiocb;
1305 env = cl_env_get(&refcheck);
1307 RETURN(PTR_ERR(env));
1309 local_iov = &vvp_env_info(env)->vti_local_iov;
1310 kiocb = &vvp_env_info(env)->vti_kiocb;
1311 local_iov->iov_base = (void __user *)buf;
1312 local_iov->iov_len = count;
1313 init_sync_kiocb(kiocb, file);
1314 kiocb->ki_pos = *ppos;
/* kernels differ in how the remaining-byte field is named */
1315 #ifdef HAVE_KIOCB_KI_LEFT
1316 kiocb->ki_left = count;
1318 kiocb->ki_nbytes = count;
1321 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1322 *ppos = kiocb->ki_pos;
1324 cl_env_put(env, &refcheck);
/*
 * Write to a file (through the page cache).
 *
 * aio_write entry point: mirror image of ll_file_aio_read() — validate
 * the iovec, describe the normal IO in vvp_io_args and run
 * ll_file_io_generic() with CIT_WRITE at iocb->ki_pos.
 */
1332 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1333 unsigned long nr_segs, loff_t pos)
1336 struct vvp_io_args *args;
1342 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1346 env = cl_env_get(&refcheck);
1348 RETURN(PTR_ERR(env));
1350 args = vvp_env_args(env, IO_NORMAL);
1351 args->u.normal.via_iov = (struct iovec *)iov;
1352 args->u.normal.via_nrsegs = nr_segs;
1353 args->u.normal.via_iocb = iocb;
1355 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1356 &iocb->ki_pos, count);
1357 cl_env_put(env, &refcheck);
/*
 * Synchronous write(): wrap the user buffer in a one-segment iovec and
 * a sync kiocb, then delegate to ll_file_aio_write().  Same pattern as
 * ll_file_read().
 */
1361 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1365 struct iovec *local_iov;
1366 struct kiocb *kiocb;
1371 env = cl_env_get(&refcheck);
1373 RETURN(PTR_ERR(env));
1375 local_iov = &vvp_env_info(env)->vti_local_iov;
1376 kiocb = &vvp_env_info(env)->vti_kiocb;
1377 local_iov->iov_base = (void __user *)buf;
1378 local_iov->iov_len = count;
1379 init_sync_kiocb(kiocb, file);
1380 kiocb->ki_pos = *ppos;
/* kernels differ in how the remaining-byte field is named */
1381 #ifdef HAVE_KIOCB_KI_LEFT
1382 kiocb->ki_left = count;
1384 kiocb->ki_nbytes = count;
1387 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1388 *ppos = kiocb->ki_pos;
1390 cl_env_put(env, &refcheck);
/*
 * Send file content (through pagecache) somewhere with helper
 *
 * splice_read entry point: describe the destination pipe/flags in the
 * IO_SPLICE variant of vvp_io_args and run the generic IO engine as a
 * CIT_READ at *ppos.
 */
1397 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1398 struct pipe_inode_info *pipe, size_t count,
1402 struct vvp_io_args *args;
1407 env = cl_env_get(&refcheck);
1409 RETURN(PTR_ERR(env));
1411 args = vvp_env_args(env, IO_SPLICE);
1412 args->u.splice.via_pipe = pipe;
1413 args->u.splice.via_flags = flags;
1415 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1416 cl_env_put(env, &refcheck);
/*
 * Re-create a file's OST object (identified by @oi / @ost_idx) via
 * obd_create() with the OBD_FL_RECREATE_OBJS flag set.  Works on a
 * private copy of the inode's stripe metadata; -ENOENT if the file has
 * no objects.  The inode size lock is held around the create.
 */
1420 static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
1423 struct obd_export *exp = ll_i2dtexp(inode);
1424 struct obd_trans_info oti = { 0 };
1425 struct obdo *oa = NULL;
1428 struct lov_stripe_md *lsm = NULL, *lsm2;
1435 lsm = ccc_inode_lsm_get(inode);
1436 if (!lsm_has_objects(lsm))
1437 GOTO(out, rc = -ENOENT);
/* stripe md is a header plus one lov_oinfo per stripe */
1439 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1440 (lsm->lsm_stripe_count));
1442 OBD_ALLOC_LARGE(lsm2, lsm_size);
1444 GOTO(out, rc = -ENOMEM);
/* target OST index is carried to the LOV layer in o_nlink
 * (repurposed field for the recreate path) */
1447 oa->o_nlink = ost_idx;
1448 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1449 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1450 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1451 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1452 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1453 memcpy(lsm2, lsm, lsm_size);
1454 ll_inode_size_lock(inode);
1455 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1456 ll_inode_size_unlock(inode);
1458 OBD_FREE_LARGE(lsm2, lsm_size);
1461 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_RECREATE_OBJ handler: copy a struct ll_recreate_obj from
 * userspace, build an MDT0-sequence ost_id from lrc_id and recreate the
 * object on lrc_ost_idx.  Requires CAP_SYS_ADMIN.
 */
1466 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1468 struct ll_recreate_obj ucreat;
1472 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1475 if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1479 ostid_set_seq_mdt0(&oi);
1480 ostid_set_id(&oi, ucreat.lrc_id);
1481 RETURN(ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx));
/*
 * LL_IOC_RECREATE_FID handler: copy a lu_fid from userspace, convert it
 * to an ost_id, and recreate the object.  The OST index is recovered
 * from bits 16..31 of the FID sequence.  Requires CAP_SYS_ADMIN.
 */
1484 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1491 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1494 if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1497 fid_to_ostid(&fid, &oi);
1498 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1499 RETURN(ll_lov_recreate(inode, &oi, ost_idx));
/*
 * Set striping (LOV EA) on a file by re-opening it with an IT_OPEN
 * intent carrying the lov_user_md.  Fails with -EEXIST if the inode
 * already has stripe metadata.  The inode size lock is held across the
 * intent open; the open handle obtained as a side effect is released
 * immediately via ll_release_openhandle().
 */
1502 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1503 __u64 flags, struct lov_user_md *lum,
1506 struct lov_stripe_md *lsm = NULL;
1507 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
/* striping can be set only once; bail out if it is already there */
1511 lsm = ccc_inode_lsm_get(inode);
1513 ccc_inode_lsm_put(inode, lsm);
1514 CDEBUG(D_IOCTL, "stripe already exists for inode "DFID"\n",
1515 PFID(ll_inode2fid(inode)));
1516 GOTO(out, rc = -EEXIST);
1519 ll_inode_size_lock(inode);
1520 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1522 GOTO(out_unlock, rc);
1523 rc = oit.d.lustre.it_status;
1525 GOTO(out_req_free, rc);
/* we only needed the open to carry the setstripe EA; close it */
1527 ll_release_openhandle(file->f_dentry, &oit);
1530 ll_inode_size_unlock(inode);
1531 ll_intent_release(&oit);
1532 ccc_inode_lsm_put(inode, lsm);
1534 cl_lov_delay_create_clear(&file->f_flags);
1537 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA of @filename (child of @inode) from the MDS.
 *
 * On success *lmmp points into the reply buffer of *request (caller
 * must keep the request until done with the data and then release it),
 * and *lmm_size is the EA size.  If this client's byte order differs
 * from the wire (little-endian) format, the EA is swabbed in place to
 * host endianness before being returned; per-object entries are only
 * swabbed for regular files, since directory default EAs carry no
 * objects.
 */
1541 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1542 struct lov_mds_md **lmmp, int *lmm_size,
1543 struct ptlrpc_request **request)
1545 struct ll_sb_info *sbi = ll_i2sbi(inode);
1546 struct mdt_body *body;
1547 struct lov_mds_md *lmm = NULL;
1548 struct ptlrpc_request *req = NULL;
1549 struct md_op_data *op_data;
1552 rc = ll_get_default_mdsize(sbi, &lmmsize);
1556 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1557 strlen(filename), lmmsize,
1558 LUSTRE_OPC_ANY, NULL);
1559 if (IS_ERR(op_data))
1560 RETURN(PTR_ERR(op_data));
1562 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1563 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1564 ll_finish_md_op_data(op_data);
1566 CDEBUG(D_INFO, "md_getattr_name failed "
1567 "on %s: rc %d\n", filename, rc);
1571 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1572 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1574 lmmsize = body->eadatasize;
/* no EA bits in the reply, or zero-sized EA -> no striping to return */
1576 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1578 GOTO(out, rc = -ENODATA);
1581 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1582 LASSERT(lmm != NULL);
1584 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1585 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1586 GOTO(out, rc = -EPROTO);
1590 * This is coming from the MDS, so is probably in
1591 * little endian. We convert it to host endian before
1592 * passing it to userspace.
/* true only on big-endian hosts: wire format already matches
 * little-endian hosts, so no swab is needed there */
1594 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1597 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1598 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1601 /* if function called for directory - we should
1602 * avoid swab not existent lsm objects */
1603 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1604 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1605 if (S_ISREG(body->mode))
1606 lustre_swab_lov_user_md_objects(
1607 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1609 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1610 lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1611 if (S_ISREG(body->mode))
1612 lustre_swab_lov_user_md_objects(
1613 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1620 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: copy a lov_user_md (plus one ost_data slot)
 * from userspace and apply it with MDS_OPEN_HAS_OBJS, i.e. the caller
 * supplies pre-existing objects.  Requires CAP_SYS_ADMIN.
 */
1625 static int ll_lov_setea(struct inode *inode, struct file *file,
1628 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1629 struct lov_user_md *lump;
1630 int lum_size = sizeof(struct lov_user_md) +
1631 sizeof(struct lov_user_ost_data);
1635 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1638 OBD_ALLOC_LARGE(lump, lum_size);
1642 if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1643 OBD_FREE_LARGE(lump, lum_size);
1647 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1649 OBD_FREE_LARGE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE handler: read the user's lov_user_md (first as
 * the smaller v1 layout, re-reading as v3 if the magic says so) and set
 * the striping.  On success the new layout generation is refreshed and
 * the resulting stripe info is copied back to the user's buffer via a
 * GETSTRIPE iocontrol call.
 */
1653 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1656 struct lov_user_md_v3 lumv3;
1657 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1658 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1659 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1661 __u64 flags = FMODE_WRITE;
1664 /* first try with v1 which is smaller than v3 */
1665 lum_size = sizeof(struct lov_user_md_v1);
1666 if (copy_from_user(lumv1, lumv1p, lum_size))
1669 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1670 lum_size = sizeof(struct lov_user_md_v3);
1671 if (copy_from_user(&lumv3, lumv3p, lum_size))
1675 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1677 struct lov_stripe_md *lsm;
/* tell the caller the request's stripe_count was consumed */
1680 put_user(0, &lumv1p->lmm_stripe_count);
1682 ll_layout_refresh(inode, &gen);
1683 lsm = ccc_inode_lsm_get(inode);
1684 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1685 0, lsm, (void *)arg);
1686 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_LOV_GETSTRIPE handler: pass the inode's stripe metadata to the
 * data export's iocontrol, which copies the layout to the user buffer
 * at @arg.
 */
1691 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1693 struct lov_stripe_md *lsm;
1697 lsm = ccc_inode_lsm_get(inode);
1699 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1701 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_GROUP_LOCK handler: take a cluster-wide group lock with group
 * id @arg on behalf of this file descriptor.  Only one group lock per
 * fd; lli_lock protects the fd_flags/fd_grouplock pair.  The lock is
 * acquired outside lli_lock, so a losing racer drops its freshly
 * acquired lock.
 */
1706 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1708 struct ll_inode_info *lli = ll_i2info(inode);
1709 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1710 struct ccc_grouplock grouplock;
1714 if (ll_file_nolock(file))
1715 RETURN(-EOPNOTSUPP);
1717 spin_lock(&lli->lli_lock);
1718 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1719 CWARN("group lock already existed with gid %lu\n",
1720 fd->fd_grouplock.cg_gid);
1721 spin_unlock(&lli->lli_lock);
1724 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1725 spin_unlock(&lli->lli_lock);
/* the actual lock enqueue can block, so do it unlocked */
1727 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1728 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1732 spin_lock(&lli->lli_lock);
1733 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1734 spin_unlock(&lli->lli_lock);
1735 CERROR("another thread just won the race\n");
1736 cl_put_grouplock(&grouplock);
1740 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1741 fd->fd_grouplock = grouplock;
1742 spin_unlock(&lli->lli_lock);
1744 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK handler: release the group lock held by this fd,
 * verifying that one is held and that its gid matches @arg.  State is
 * cleared under lli_lock; the lock itself is dropped afterwards so the
 * (potentially blocking) release happens unlocked.
 */
1748 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1750 struct ll_inode_info *lli = ll_i2info(inode);
1751 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1752 struct ccc_grouplock grouplock;
1755 spin_lock(&lli->lli_lock);
1756 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1757 spin_unlock(&lli->lli_lock);
1758 CWARN("no group lock held\n");
1761 LASSERT(fd->fd_grouplock.cg_lock != NULL);
1763 if (fd->fd_grouplock.cg_gid != arg) {
1764 CWARN("group lock %lu doesn't match current id %lu\n",
1765 arg, fd->fd_grouplock.cg_gid);
1766 spin_unlock(&lli->lli_lock);
/* detach the grouplock from the fd before releasing it */
1770 grouplock = fd->fd_grouplock;
1771 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1772 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1773 spin_unlock(&lli->lli_lock);
1775 cl_put_grouplock(&grouplock);
1776 CDEBUG(D_INFO, "group lock %lu released\n", arg);
/*
 * Close inode open handle
 *
 * \param dentry [in] dentry which contains the inode
 * \param it [in,out] intent which contains open info and result
 *
 * \retval <0 failure
 */
1789 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1791 struct inode *inode = dentry->d_inode;
1792 struct obd_client_handle *och;
1798 /* Root ? Do nothing. */
1799 if (dentry->d_inode->i_sb->s_root == dentry)
1802 /* No open handle to close? Move away */
1803 if (!it_disposition(it, DISP_OPEN_OPEN))
1806 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
/* build a client handle from the intent result so the MDT-side open
 * can be closed through the usual path */
1808 OBD_ALLOC(och, sizeof(*och));
1810 GOTO(out, rc = -ENOMEM);
1812 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1814 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1817 /* this one is in place of ll_file_open */
1818 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1819 ptlrpc_req_finished(it->d.lustre.it_data);
1820 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Get size for inode for which FIEMAP mapping is requested.
 * Make the FIEMAP get_info call and returns the result.
 *
 * The extent mapping is fetched through obd_get_info(KEY_FIEMAP) on the
 * data export; @num_bytes is the size of the caller's fiemap buffer
 * (header + extent array).  Unsupported fm_flags are reported back to
 * the caller by masking them into fiemap->fm_flags.
 */
1829 static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1832 struct obd_export *exp = ll_i2dtexp(inode);
1833 struct lov_stripe_md *lsm = NULL;
1834 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1835 __u32 vallen = num_bytes;
1839 /* Checks for fiemap flags */
1840 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* leave only the unsupported bits so the caller can see them */
1841 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1845 /* Check for FIEMAP_FLAG_SYNC */
1846 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1847 rc = filemap_fdatawrite(inode->i_mapping);
1852 lsm = ccc_inode_lsm_get(inode);
1856 /* If the stripe_count > 1 and the application does not understand
1857 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1859 if (lsm->lsm_stripe_count > 1 &&
1860 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1861 GOTO(out, rc = -EOPNOTSUPP);
1863 fm_key.oa.o_oi = lsm->lsm_oi;
1864 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1866 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1867 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1868 /* If filesize is 0, then there would be no objects for mapping */
1869 if (fm_key.oa.o_size == 0) {
1870 fiemap->fm_mapped_extents = 0;
1874 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1876 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1879 CERROR("obd_get_info failed: rc = %d\n", rc);
1882 ccc_inode_lsm_put(inode, lsm);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a path via the MDC.
 * Reads the user's getinfo_fid2path header (to learn gf_pathlen),
 * allocates an output buffer large enough for the path, calls the MD
 * export's iocontrol and copies the whole result back.  Permitted for
 * root or, when LL_SBI_USER_FID2PATH is set, for ordinary users.
 */
1886 int ll_fid2path(struct inode *inode, void *arg)
1888 struct obd_export *exp = ll_i2mdexp(inode);
1889 struct getinfo_fid2path *gfout, *gfin;
1893 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1894 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1897 /* Need to get the buflen */
1898 OBD_ALLOC_PTR(gfin);
1901 if (copy_from_user(gfin, arg, sizeof(*gfin))) {
/* output = fixed header + user-requested path length */
1906 outsize = sizeof(*gfout) + gfin->gf_pathlen;
1907 OBD_ALLOC(gfout, outsize);
1908 if (gfout == NULL) {
1912 memcpy(gfout, gfin, sizeof(*gfout));
1915 /* Call mdc_iocontrol */
1916 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1920 if (copy_to_user(arg, gfout, outsize))
1924 OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP handler: size a kernel fiemap buffer from the
 * user's fm_extent_count, copy the request (and, when extents are
 * requested, the first extent, which seeds continuation state) in from
 * userspace, run ll_do_fiemap() and copy the header plus the mapped
 * extents back out.
 */
1928 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1930 struct ll_user_fiemap *fiemap_s;
1931 size_t num_bytes, ret_bytes;
1932 unsigned int extent_count;
1935 /* Get the extent count so we can calculate the size of
1936 * required fiemap buffer */
1937 if (get_user(extent_count,
1938 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
/* NOTE(review): extent_count comes from userspace; the multiplication
 * below is unchecked in this excerpt — confirm overflow handling
 * upstream of this call */
1940 num_bytes = sizeof(*fiemap_s) + (extent_count *
1941 sizeof(struct ll_fiemap_extent));
1943 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1944 if (fiemap_s == NULL)
1947 /* get the fiemap value */
1948 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1950 GOTO(error, rc = -EFAULT);
1952 /* If fm_extent_count is non-zero, read the first extent since
1953 * it is used to calculate end_offset and device from previous
1956 if (copy_from_user(&fiemap_s->fm_extents[0],
1957 (char __user *)arg + sizeof(*fiemap_s),
1958 sizeof(struct ll_fiemap_extent)))
1959 GOTO(error, rc = -EFAULT);
1962 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
/* copy back header plus however many extents were actually mapped */
1966 ret_bytes = sizeof(struct ll_user_fiemap);
1968 if (extent_count != 0)
1969 ret_bytes += (fiemap_s->fm_mapped_extents *
1970 sizeof(struct ll_fiemap_extent));
1972 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1976 OBD_FREE_LARGE(fiemap_s, num_bytes);
/*
 * Read the data_version for inode.
 *
 * This value is computed using stripe object version on OST.
 * Version is computed using server side locking.
 *
 * @param sync if do sync on the OST side;
 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
 *
 * A file with no stripe objects reports version 0 and success.
 */
1991 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
1993 struct lov_stripe_md *lsm = NULL;
1994 struct ll_sb_info *sbi = ll_i2sbi(inode);
1995 struct obdo *obdo = NULL;
1999 /* If no stripe, we consider version is 0. */
2000 lsm = ccc_inode_lsm_get(inode);
2001 if (!lsm_has_objects(lsm)) {
2003 CDEBUG(D_INODE, "No object for inode\n");
2007 OBD_ALLOC_PTR(obdo);
2009 GOTO(out, rc = -ENOMEM);
2011 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, flags);
/* only trust o_data_version if the OST reply marked it valid */
2013 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
2016 *data_version = obdo->o_data_version;
2022 ccc_inode_lsm_put(inode, lsm);
/*
 * Trigger a HSM release request for the provided inode.
 *
 * Takes a write lease on the file (MDS_OPEN_RELEASE), grabs the final
 * data_version after flushing all cached pages (LL_DV_WR_FLUSH), merges
 * attributes, and closes the handle with the release request.  On the
 * error path the lease is closed explicitly.
 */
2029 int ll_hsm_release(struct inode *inode)
2031 struct cl_env_nest nest;
2033 struct obd_client_handle *och = NULL;
2034 __u64 data_version = 0;
2038 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2039 ll_get_fsname(inode->i_sb, NULL, 0),
2040 PFID(&ll_i2info(inode)->lli_fid));
2042 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2044 GOTO(out, rc = PTR_ERR(och));
2046 /* Grab latest data_version and [am]time values */
2047 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2051 env = cl_env_nested_get(&nest);
2053 GOTO(out, rc = PTR_ERR(env));
2055 ll_merge_lvb(env, inode);
2056 cl_env_nested_put(&nest, env);
2058 /* Release the file.
2059 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2060 * we still need it to pack l_remote_handle to MDT. */
2061 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
2067 if (och != NULL && !IS_ERR(och)) /* close the file */
2068 ll_lease_close(och, inode, NULL);
/* Per-call state for ll_swap_layouts(): both inodes, the saved
 * [am]times to restore afterwards (ia1/ia2), and the data-version
 * check requests/values for each side. */
2073 struct ll_swap_stack {
2074 struct iattr ia1, ia2;
2076 struct inode *inode1, *inode2;
2077 bool check_dv1, check_dv2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS worker: atomically exchange the layouts of
 * @file1 and @file2 on the MDT.
 *
 * Steps: validate both files (regular, writable, same filesystem),
 * order the pair by FID to avoid lock inversions, optionally take group
 * locks to flush dirty cache, optionally verify each side's
 * data_version has not changed, send the swap via obd_iocontrol() on
 * the MD export, then restore mtime/atime where requested.
 */
2080 static int ll_swap_layouts(struct file *file1, struct file *file2,
2081 struct lustre_swap_layouts *lsl)
2083 struct mdc_swap_layouts msl;
2084 struct md_op_data *op_data;
2087 struct ll_swap_stack *llss = NULL;
2090 OBD_ALLOC_PTR(llss);
2094 llss->inode1 = file1->f_dentry->d_inode;
2095 llss->inode2 = file2->f_dentry->d_inode;
2097 if (!S_ISREG(llss->inode2->i_mode))
2098 GOTO(free, rc = -EINVAL);
2100 if (inode_permission(llss->inode1, MAY_WRITE) ||
2101 inode_permission(llss->inode2, MAY_WRITE))
2102 GOTO(free, rc = -EPERM);
2104 if (llss->inode2->i_sb != llss->inode1->i_sb)
2105 GOTO(free, rc = -EXDEV);
2107 /* we use 2 bool because it is easier to swap than 2 bits */
2108 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2109 llss->check_dv1 = true;
2111 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2112 llss->check_dv2 = true;
2114 /* we cannot use lsl->sl_dvX directly because we may swap them */
2115 llss->dv1 = lsl->sl_dv1;
2116 llss->dv2 = lsl->sl_dv2;
2118 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2119 if (rc == 0) /* same file, done! */
/* canonical FID order prevents deadlock between two concurrent swaps
 * of the same pair in opposite argument order */
2122 if (rc < 0) { /* sequentialize it */
2123 swap(llss->inode1, llss->inode2);
2125 swap(llss->dv1, llss->dv2);
2126 swap(llss->check_dv1, llss->check_dv2);
2130 if (gid != 0) { /* application asks to flush dirty cache */
2131 rc = ll_get_grouplock(llss->inode1, file1, gid);
2135 rc = ll_get_grouplock(llss->inode2, file2, gid);
2137 ll_put_grouplock(llss->inode1, file1, gid);
2142 /* to be able to restore mtime and atime after swap
2143 * we need to first save them */
2145 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2146 llss->ia1.ia_mtime = llss->inode1->i_mtime;
2147 llss->ia1.ia_atime = llss->inode1->i_atime;
2148 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2149 llss->ia2.ia_mtime = llss->inode2->i_mtime;
2150 llss->ia2.ia_atime = llss->inode2->i_atime;
2151 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2154 /* ultimate check, before swaping the layouts we check if
2155 * dataversion has changed (if requested) */
2156 if (llss->check_dv1) {
2157 rc = ll_data_version(llss->inode1, &dv, 0);
2160 if (dv != llss->dv1)
2161 GOTO(putgl, rc = -EAGAIN);
2164 if (llss->check_dv2) {
2165 rc = ll_data_version(llss->inode2, &dv, 0);
2168 if (dv != llss->dv2)
2169 GOTO(putgl, rc = -EAGAIN);
2172 /* struct md_op_data is used to send the swap args to the mdt
2173 * only flags is missing, so we use struct mdc_swap_layouts
2174 * through the md_op_data->op_data */
2175 /* flags from user space have to be converted before they are send to
2176 * server, no flag is sent today, they are only used on the client */
2179 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2180 0, LUSTRE_OPC_ANY, &msl);
2181 if (IS_ERR(op_data))
2182 GOTO(free, rc = PTR_ERR(op_data));
2184 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2185 sizeof(*op_data), op_data, NULL);
2186 ll_finish_md_op_data(op_data);
2190 ll_put_grouplock(llss->inode2, file2, gid);
2191 ll_put_grouplock(llss->inode1, file1, gid);
2194 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2198 /* clear useless flags */
2199 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2200 llss->ia1.ia_valid &= ~ATTR_MTIME;
2201 llss->ia2.ia_valid &= ~ATTR_MTIME;
2204 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2205 llss->ia1.ia_valid &= ~ATTR_ATIME;
2206 llss->ia2.ia_valid &= ~ATTR_ATIME;
2209 /* update time if requested */
/* the layouts were exchanged, so inode1 receives inode2's saved
 * times and vice versa */
2211 if (llss->ia2.ia_valid != 0) {
2212 mutex_lock(&llss->inode1->i_mutex);
2213 rc = ll_setattr(file1->f_dentry, &llss->ia2);
2214 mutex_unlock(&llss->inode1->i_mutex);
2217 if (llss->ia1.ia_valid != 0) {
2220 mutex_lock(&llss->inode2->i_mutex);
2221 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2222 mutex_unlock(&llss->inode2->i_mutex);
/*
 * Apply a HSM state set/clear request to @inode by forwarding the
 * hsm_state_set through md_op_data to the MD export's iocontrol.
 * Flags outside HSM_USER_MASK are reserved to CAP_SYS_ADMIN.
 */
2234 static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2236 struct md_op_data *op_data;
2239 /* Non-root users are forbidden to set or clear flags which are
2240 * NOT defined in HSM_USER_MASK. */
2241 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2242 !cfs_capable(CFS_CAP_SYS_ADMIN))
2245 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2246 LUSTRE_OPC_ANY, hss);
2247 if (IS_ERR(op_data))
2248 RETURN(PTR_ERR(op_data));
2250 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2251 sizeof(*op_data), op_data, NULL);
2253 ll_finish_md_op_data(op_data);
/*
 * Import a file into HSM: mark it HS_ARCHIVED|HS_EXISTS|HS_RELEASED in
 * the given archive, then restore the metadata recorded in the
 * hsm_user_import request (mode, uid/gid, size, [am]times) through
 * ll_setattr_raw().  Regular files only.
 */
2258 static int ll_hsm_import(struct inode *inode, struct file *file,
2259 struct hsm_user_import *hui)
2261 struct hsm_state_set *hss = NULL;
2262 struct iattr *attr = NULL;
2266 if (!S_ISREG(inode->i_mode))
2272 GOTO(out, rc = -ENOMEM);
2274 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2275 hss->hss_archive_id = hui->hui_archive_id;
2276 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2277 rc = ll_hsm_state_set(inode, hss);
2281 OBD_ALLOC_PTR(attr);
2283 GOTO(out, rc = -ENOMEM);
/* keep only permission bits from the request and force regular-file
 * type */
2285 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2286 attr->ia_mode |= S_IFREG;
2287 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2288 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2289 attr->ia_size = hui->hui_size;
2290 attr->ia_mtime.tv_sec = hui->hui_mtime;
2291 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2292 attr->ia_atime.tv_sec = hui->hui_atime;
2293 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2295 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2296 ATTR_UID | ATTR_GID |
2297 ATTR_MTIME | ATTR_MTIME_SET |
2298 ATTR_ATIME | ATTR_ATIME_SET;
2300 rc = ll_setattr_raw(file->f_dentry, attr, true);
/*
 * unlocked_ioctl entry point for regular files: dispatch each llite/
 * OBD ioctl to its handler.  Anything unrecognized is first offered to
 * registered ll_iocontrol_call() hooks and finally to the data export's
 * obd_iocontrol().
 */
2315 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2317 struct inode *inode = file->f_dentry->d_inode;
2318 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2322 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2323 PFID(ll_inode2fid(inode)), inode, cmd);
2324 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2326 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2327 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2331 case LL_IOC_GETFLAGS:
2332 /* Get the current value of the file flags */
2333 return put_user(fd->fd_flags, (int *)arg);
2334 case LL_IOC_SETFLAGS:
2335 case LL_IOC_CLRFLAGS:
2336 /* Set or clear specific file flags */
2337 /* XXX This probably needs checks to ensure the flags are
2338 * not abused, and to handle any flag side effects.
2340 if (get_user(flags, (int *) arg))
2343 if (cmd == LL_IOC_SETFLAGS) {
/* lockless IO is only safe when the page cache is bypassed */
2344 if ((flags & LL_FILE_IGNORE_LOCK) &&
2345 !(file->f_flags & O_DIRECT)) {
2346 CERROR("%s: unable to disable locking on "
2347 "non-O_DIRECT file\n", current->comm);
2351 fd->fd_flags |= flags;
2353 fd->fd_flags &= ~flags;
2356 case LL_IOC_LOV_SETSTRIPE:
2357 RETURN(ll_lov_setstripe(inode, file, arg));
2358 case LL_IOC_LOV_SETEA:
2359 RETURN(ll_lov_setea(inode, file, arg));
2360 case LL_IOC_LOV_SWAP_LAYOUTS: {
2362 struct lustre_swap_layouts lsl;
2364 if (copy_from_user(&lsl, (char *)arg,
2365 sizeof(struct lustre_swap_layouts)))
/* both fds must be open for write for a layout swap */
2368 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2371 file2 = fget(lsl.sl_fd);
2376 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2377 rc = ll_swap_layouts(file, file2, &lsl);
2381 case LL_IOC_LOV_GETSTRIPE:
2382 RETURN(ll_lov_getstripe(inode, arg));
2383 case LL_IOC_RECREATE_OBJ:
2384 RETURN(ll_lov_recreate_obj(inode, arg));
2385 case LL_IOC_RECREATE_FID:
2386 RETURN(ll_lov_recreate_fid(inode, arg));
2387 case FSFILT_IOC_FIEMAP:
2388 RETURN(ll_ioctl_fiemap(inode, arg));
2389 case FSFILT_IOC_GETFLAGS:
2390 case FSFILT_IOC_SETFLAGS:
2391 RETURN(ll_iocontrol(inode, file, cmd, arg));
2392 case FSFILT_IOC_GETVERSION_OLD:
2393 case FSFILT_IOC_GETVERSION:
2394 RETURN(put_user(inode->i_generation, (int *)arg));
2395 case LL_IOC_GROUP_LOCK:
2396 RETURN(ll_get_grouplock(inode, file, arg));
2397 case LL_IOC_GROUP_UNLOCK:
2398 RETURN(ll_put_grouplock(inode, file, arg));
2399 case IOC_OBD_STATFS:
2400 RETURN(ll_obd_statfs(inode, (void *)arg));
2402 /* We need to special case any other ioctls we want to handle,
2403 * to send them to the MDS/OST as appropriate and to properly
2404 * network encode the arg field.
2405 case FSFILT_IOC_SETVERSION_OLD:
2406 case FSFILT_IOC_SETVERSION:
2408 case LL_IOC_FLUSHCTX:
2409 RETURN(ll_flush_ctx(inode));
2410 case LL_IOC_PATH2FID: {
2411 if (copy_to_user((void *)arg, ll_inode2fid(inode),
2412 sizeof(struct lu_fid)))
2417 case OBD_IOC_FID2PATH:
2418 RETURN(ll_fid2path(inode, (void *)arg));
2419 case LL_IOC_DATA_VERSION: {
2420 struct ioc_data_version idv;
2423 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
/* sanitize user-supplied flags to the two supported flush modes */
2426 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2427 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2429 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2435 case LL_IOC_GET_MDTIDX: {
2438 mdtidx = ll_get_mdt_idx(inode);
2442 if (put_user((int)mdtidx, (int*)arg))
2447 case OBD_IOC_GETDTNAME:
2448 case OBD_IOC_GETMDNAME:
2449 RETURN(ll_get_obd_name(inode, cmd, arg));
2450 case LL_IOC_HSM_STATE_GET: {
2451 struct md_op_data *op_data;
2452 struct hsm_user_state *hus;
2459 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2460 LUSTRE_OPC_ANY, hus);
2461 if (IS_ERR(op_data)) {
2463 RETURN(PTR_ERR(op_data));
2466 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2469 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2472 ll_finish_md_op_data(op_data);
2476 case LL_IOC_HSM_STATE_SET: {
2477 struct hsm_state_set *hss;
2484 if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2489 rc = ll_hsm_state_set(inode, hss);
2494 case LL_IOC_HSM_ACTION: {
2495 struct md_op_data *op_data;
2496 struct hsm_current_action *hca;
2503 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2504 LUSTRE_OPC_ANY, hca);
2505 if (IS_ERR(op_data)) {
2507 RETURN(PTR_ERR(op_data));
2510 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2513 if (copy_to_user((char *)arg, hca, sizeof(*hca)))
2516 ll_finish_md_op_data(op_data);
2520 case LL_IOC_SET_LEASE: {
2521 struct ll_inode_info *lli = ll_i2info(inode);
2522 struct obd_client_handle *och = NULL;
/* the requested lease mode must be covered by how the fd was opened */
2528 if (!(file->f_mode & FMODE_WRITE))
2533 if (!(file->f_mode & FMODE_READ))
/* release any lease already attached to this fd first */
2538 mutex_lock(&lli->lli_och_mutex);
2539 if (fd->fd_lease_och != NULL) {
2540 och = fd->fd_lease_och;
2541 fd->fd_lease_och = NULL;
2543 mutex_unlock(&lli->lli_och_mutex);
2546 mode = och->och_flags &(FMODE_READ|FMODE_WRITE);
2547 rc = ll_lease_close(och, inode, &lease_broken);
2548 if (rc == 0 && lease_broken)
2554 /* return the type of lease or error */
2555 RETURN(rc < 0 ? rc : (int)mode);
2560 CDEBUG(D_INODE, "Set lease with mode %d\n", mode);
2562 /* apply for lease */
2563 och = ll_lease_open(inode, file, mode, 0);
2565 RETURN(PTR_ERR(och));
/* attach the new lease to the fd unless another racer already did */
2568 mutex_lock(&lli->lli_och_mutex);
2569 if (fd->fd_lease_och == NULL) {
2570 fd->fd_lease_och = och;
2573 mutex_unlock(&lli->lli_och_mutex);
2575 /* impossible now that only excl is supported for now */
2576 ll_lease_close(och, inode, &lease_broken);
2581 case LL_IOC_GET_LEASE: {
2582 struct ll_inode_info *lli = ll_i2info(inode);
2583 struct ldlm_lock *lock = NULL;
2586 mutex_lock(&lli->lli_och_mutex);
2587 if (fd->fd_lease_och != NULL) {
2588 struct obd_client_handle *och = fd->fd_lease_och;
2590 lock = ldlm_handle2lock(&och->och_lease_handle);
2592 lock_res_and_lock(lock);
/* report the lease mode only while its DLM lock is still valid */
2593 if (!ldlm_is_cancel(lock))
2594 rc = och->och_flags &
2595 (FMODE_READ | FMODE_WRITE);
2596 unlock_res_and_lock(lock);
2597 LDLM_LOCK_PUT(lock);
2600 mutex_unlock(&lli->lli_och_mutex);
2603 case LL_IOC_HSM_IMPORT: {
2604 struct hsm_user_import *hui;
2610 if (copy_from_user(hui, (void *)arg, sizeof(*hui))) {
2615 rc = ll_hsm_import(inode, file, hui);
/* not ours: offer to dynamically registered handlers, then fall
 * through to the data export's generic iocontrol */
2625 ll_iocontrol_call(inode, file, cmd, arg, &err))
2628 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
/*
 * Compatibility copies of the kernel's llseek helpers, built only when
 * the running kernel does not export generic_file_llseek_size().
 */
2634 #ifndef HAVE_FILE_LLSEEK_SIZE
/* Validate @offset against [0, maxsize] and commit it to f_pos,
 * resetting f_version when the position actually changes. */
2635 static inline loff_t
2636 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2638 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2640 if (offset > maxsize)
2643 if (offset != file->f_pos) {
2644 file->f_pos = offset;
2645 file->f_version = 0;
/* llseek for in-memory/pagecache files: handle SEEK_CUR relative moves
 * under i_mutex, and SEEK_DATA/SEEK_HOLE against the supplied @eof,
 * clamping the result to @maxsize via llseek_execute(). */
2651 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2652 loff_t maxsize, loff_t eof)
2654 struct inode *inode = file->f_dentry->d_inode;
2662 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2663 * position-querying operation. Avoid rewriting the "same"
2664 * f_pos value back to the file because a concurrent read(),
2665 * write() or lseek() might have altered it
2670 * f_lock protects against read/modify/write race with other
2671 * SEEK_CURs. Note that parallel writes and reads behave
2674 mutex_lock(&inode->i_mutex);
2675 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2676 mutex_unlock(&inode->i_mutex);
2680 * In the generic case the entire file is data, so as long as
2681 * offset isn't at the end of the file then the offset is data.
2688 * There is a virtual hole at the end of the file, so as long as
2689 * offset isn't i_size or larger, return i_size.
2697 return llseek_execute(file, offset, maxsize);
/* llseek entry point for Lustre regular files.  For end/hole/data-relative
 * seeks the cluster-wide size must be fetched first via ll_glimpse_size()
 * before delegating to the generic llseek-size helper. */
2701 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2703 struct inode *inode = file->f_dentry->d_inode;
2704 loff_t retval, eof = 0;
/* Computed only for the debug trace below; the real result comes from
 * ll_generic_file_llseek_size(). */
2707 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2708 (origin == SEEK_CUR) ? file->f_pos : 0);
2709 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2710 PFID(ll_inode2fid(inode)), inode, retval, retval,
2712 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
/* These origins depend on the current file size, which on Lustre requires
 * a glimpse RPC to the OSTs to be accurate. */
2714 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2715 retval = ll_glimpse_size(inode);
2718 eof = i_size_read(inode);
2721 retval = ll_generic_file_llseek_size(file, offset, origin,
2722 ll_file_maxbytes(inode), eof);
/* .flush handler (called on close(2)): report any asynchronous writeback
 * error recorded against this inode, once, to the closing process. */
2726 static int ll_flush(struct file *file, fl_owner_t id)
2728 struct inode *inode = file->f_dentry->d_inode;
2729 struct ll_inode_info *lli = ll_i2info(inode);
2730 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2733 LASSERT(!S_ISDIR(inode->i_mode));
2735 /* catch async errors that were recorded back when async writeback
2736 * failed for pages in this mapping. */
/* Consume-and-clear: the stashed async rc is reported at most once. */
2737 rc = lli->lli_async_rc;
2738 lli->lli_async_rc = 0;
2739 if (lli->lli_clob != NULL) {
2740 err = lov_read_and_clear_async_rc(lli->lli_clob);
2745 /* The application has been told write failure already.
2746 * Do not report failure again. */
2747 if (fd->fd_write_failed)
/* Any recorded failure is collapsed to -EIO for the caller. */
2749 return rc ? -EIO : 0;
2753 * Called to make sure a portion of file has been written out.
2754 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2756 * Return how many pages have been written.
2758 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2759 enum cl_fsync_mode mode, int ignore_layout)
2761 struct cl_env_nest nest;
2764 struct obd_capa *capa = NULL;
2765 struct cl_fsync_io *fio;
/* Reject any mode outside the known cl_fsync_mode set. */
2769 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2770 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2773 env = cl_env_nested_get(&nest);
2775 RETURN(PTR_ERR(env));
/* OSS write capability is attached to the fsync IO descriptor below. */
2777 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2779 io = ccc_env_thread_io(env);
2780 io->ci_obj = cl_i2info(inode)->lli_clob;
2781 io->ci_ignore_layout = ignore_layout;
2783 /* initialize parameters for sync */
2784 fio = &io->u.ci_fsync;
2785 fio->fi_capa = capa;
2786 fio->fi_start = start;
2788 fio->fi_fid = ll_inode2fid(inode);
2789 fio->fi_mode = mode;
2790 fio->fi_nr_written = 0;
/* Run the CIT_FSYNC io; on init failure fall back to ci_result. */
2792 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2793 result = cl_io_loop(env, io);
2795 result = io->ci_result;
/* On success the page count written is the return value. */
2797 result = fio->fi_nr_written;
2798 cl_io_fini(env, io);
2799 cl_env_nested_put(&nest, env);
2807 * When dentry is provided (the 'else' case), *file->f_dentry may be
2808 * null and dentry must be used directly rather than pulled from
2809 * *file->f_dentry as is done otherwise.
/* fsync entry point; the three #ifdef arms adapt to the kernel's fsync
 * prototype (4-arg range fsync, 2-arg, or legacy dentry-based). */
2812 #ifdef HAVE_FILE_FSYNC_4ARGS
2813 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2815 struct dentry *dentry = file->f_dentry;
2816 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2817 int ll_fsync(struct file *file, int datasync)
2819 struct dentry *dentry = file->f_dentry;
2821 loff_t end = LLONG_MAX;
2823 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2826 loff_t end = LLONG_MAX;
2828 struct inode *inode = dentry->d_inode;
2829 struct ll_inode_info *lli = ll_i2info(inode);
2830 struct ptlrpc_request *req;
2831 struct obd_capa *oc;
2835 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2836 PFID(ll_inode2fid(inode)), inode);
2837 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
/* 4-arg kernels: flush and wait the range ourselves, under i_mutex. */
2839 #ifdef HAVE_FILE_FSYNC_4ARGS
2840 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2841 mutex_lock(&inode->i_mutex);
2843 /* fsync's caller has already called _fdata{sync,write}, we want
2844 * that IO to finish before calling the osc and mdc sync methods */
2845 rc = filemap_fdatawait(inode->i_mapping);
2848 /* catch async errors that were recorded back when async writeback
2849 * failed for pages in this mapping. */
2850 if (!S_ISDIR(inode->i_mode)) {
2851 err = lli->lli_async_rc;
2852 lli->lli_async_rc = 0;
2855 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* Sync metadata on the MDT (capability-protected). */
2860 oc = ll_mdscapa_get(inode);
2861 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2867 ptlrpc_req_finished(req);
/* Regular files: sync data on the OSTs and record per-fd write status
 * so a later ll_flush() does not double-report the failure. */
2869 if (S_ISREG(inode->i_mode)) {
2870 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2872 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2873 if (rc == 0 && err < 0)
2876 fd->fd_write_failed = true;
2878 fd->fd_write_failed = false;
2881 #ifdef HAVE_FILE_FSYNC_4ARGS
2882 mutex_unlock(&inode->i_mutex);
/* Handle flock(2) (FL_FLOCK) and POSIX fcntl(2) (FL_POSIX) lock requests by
 * translating them into an LDLM_FLOCK enqueue to the MDT, then mirroring the
 * result into the local kernel lock tables. */
2888 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2890 struct inode *inode = file->f_dentry->d_inode;
2891 struct ll_sb_info *sbi = ll_i2sbi(inode);
2892 struct ldlm_enqueue_info einfo = {
2893 .ei_type = LDLM_FLOCK,
2894 .ei_cb_cp = ldlm_flock_completion_ast,
2895 .ei_cbdata = file_lock,
2897 struct md_op_data *op_data;
2898 struct lustre_handle lockh = {0};
2899 ldlm_policy_data_t flock = {{0}};
/* Saved so the caller-visible fl_type can be restored after the enqueue
 * (see the "Restore the file lock type" comment below). */
2900 int fl_type = file_lock->fl_type;
2906 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
2907 PFID(ll_inode2fid(inode)), file_lock);
2909 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2911 if (file_lock->fl_flags & FL_FLOCK) {
2912 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2913 /* flocks are whole-file locks */
2914 flock.l_flock.end = OFFSET_MAX;
2915 /* For flocks owner is determined by the local file desctiptor*/
2916 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2917 } else if (file_lock->fl_flags & FL_POSIX) {
2918 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2919 flock.l_flock.start = file_lock->fl_start;
2920 flock.l_flock.end = file_lock->fl_end;
2924 flock.l_flock.pid = file_lock->fl_pid;
2926 /* Somewhat ugly workaround for svc lockd.
2927 * lockd installs custom fl_lmops->lm_compare_owner that checks
2928 * for the fl_owner to be the same (which it always is on local node
2929 * I guess between lockd processes) and then compares pid.
2930 * As such we assign pid to the owner field to make it all work,
2931 * conflict with normal locks is unlikely since pid space and
2932 * pointer space for current->files are not intersecting */
2933 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2934 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* Map POSIX lock type to LDLM mode: read -> PR, unlock -> NL, write -> PW.
 * (The switch statement itself is elided from this listing.) */
2938 einfo.ei_mode = LCK_PR;
2941 /* An unlock request may or may not have any relation to
2942 * existing locks so we may not be able to pass a lock handle
2943 * via a normal ldlm_lock_cancel() request. The request may even
2944 * unlock a byte range in the middle of an existing lock. In
2945 * order to process an unlock request we need all of the same
2946 * information that is given with a normal read or write record
2947 * lock request. To avoid creating another ldlm unlock (cancel)
2948 * message we'll treat a LCK_NL flock request as an unlock. */
2949 einfo.ei_mode = LCK_NL;
2952 einfo.ei_mode = LCK_PW;
2955 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* Map the fcntl command to enqueue flags: non-blocking set requests use
 * BLOCK_NOWAIT, F_GETLK-style queries use TEST_LOCK. */
2970 flags = LDLM_FL_BLOCK_NOWAIT;
2976 flags = LDLM_FL_TEST_LOCK;
2979 CERROR("unknown fcntl lock command: %d\n", cmd);
2983 /* Save the old mode so that if the mode in the lock changes we
2984 * can decrement the appropriate reader or writer refcount. */
2985 file_lock->fl_type = einfo.ei_mode;
2987 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2988 LUSTRE_OPC_ANY, NULL);
2989 if (IS_ERR(op_data))
2990 RETURN(PTR_ERR(op_data));
2992 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
2993 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
2994 flock.l_flock.pid, flags, einfo.ei_mode,
2995 flock.l_flock.start, flock.l_flock.end);
2997 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data, &lockh,
3000 /* Restore the file lock type if not TEST lock. */
3001 if (!(flags & LDLM_FL_TEST_LOCK))
3002 file_lock->fl_type = fl_type;
/* Mirror a successful server-side lock/unlock into the local kernel
 * flock/posix lock tables so the VFS bookkeeping stays consistent. */
3004 if ((file_lock->fl_flags & FL_FLOCK) &&
3005 (rc == 0 || file_lock->fl_type == F_UNLCK))
3006 rc2 = flock_lock_file_wait(file, file_lock);
3007 if ((file_lock->fl_flags & FL_POSIX) &&
3008 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3009 !(flags & LDLM_FL_TEST_LOCK))
3010 rc2 = posix_lock_file_wait(file, file_lock);
/* Local bookkeeping failed after the server granted the lock: undo the
 * server-side lock by enqueueing an LCK_NL (unlock) request. */
3012 if (rc2 && file_lock->fl_type != F_UNLCK) {
3013 einfo.ei_mode = LCK_NL;
3014 md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data,
3019 ll_finish_md_op_data(op_data);
/* Look up the FID of @name under directory @parent via a getattr-by-name
 * RPC to the MDT.
 * NOTE(review): the line storing the FID from @body into *fid is elided
 * from this listing. */
3024 int ll_get_fid_by_name(struct inode *parent, const char *name,
3025 int namelen, struct lu_fid *fid)
3027 struct md_op_data *op_data = NULL;
3028 struct mdt_body *body;
3029 struct ptlrpc_request *req;
3033 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3034 LUSTRE_OPC_ANY, NULL);
3035 if (IS_ERR(op_data))
3036 RETURN(PTR_ERR(op_data));
/* Only the FID is needed from the reply. */
3038 op_data->op_valid = OBD_MD_FLID;
3039 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3040 ll_finish_md_op_data(op_data);
3044 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3046 GOTO(out_req, rc = -EFAULT);
3050 ptlrpc_req_finished(req);
/* Migrate directory entry @name under @parent to MDT @mdtidx by issuing a
 * CLI_MIGRATE rename RPC with identical source and target names. */
3054 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3055 const char *name, int namelen)
3057 struct dentry *dchild = NULL;
3058 struct inode *child_inode = NULL;
3059 struct md_op_data *op_data;
3060 struct ptlrpc_request *request = NULL;
3065 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3066 name, PFID(ll_inode2fid(parent)), mdtidx);
3068 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3069 0, LUSTRE_OPC_ANY, NULL);
3070 if (IS_ERR(op_data))
3071 RETURN(PTR_ERR(op_data));
3073 /* Get child FID first */
/* Prefer the dcache; fall back to a getattr-by-name RPC below. */
3074 qstr.hash = full_name_hash(name, namelen);
3077 dchild = d_lookup(file->f_dentry, &qstr);
3078 if (dchild != NULL && dchild->d_inode != NULL) {
3079 op_data->op_fid3 = *ll_inode2fid(dchild->d_inode);
/* NOTE(review): this inner d_inode NULL check is redundant — the outer
 * condition on line 3078 already guarantees it is non-NULL. */
3080 if (dchild->d_inode != NULL) {
3081 child_inode = igrab(dchild->d_inode);
3082 ll_invalidate_aliases(child_inode);
3086 rc = ll_get_fid_by_name(parent, name, namelen,
3092 if (!fid_is_sane(&op_data->op_fid3)) {
3093 CERROR("%s: migrate %s , but fid "DFID" is insane\n",
3094 ll_get_fsname(parent->i_sb, NULL, 0), name,
3095 PFID(&op_data->op_fid3));
3096 GOTO(out_free, rc = -EINVAL);
/* Nothing to do if the child already lives on the target MDT. */
3099 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3104 CDEBUG(D_INFO, "%s:"DFID" is already on MDT%d.\n", name,
3105 PFID(&op_data->op_fid3), mdtidx);
3106 GOTO(out_free, rc = 0);
/* Migration is implemented as a same-name rename with CLI_MIGRATE set. */
3109 op_data->op_mds = mdtidx;
3110 op_data->op_cli_flags = CLI_MIGRATE;
3111 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3112 namelen, name, namelen, &request);
3114 ll_update_times(request, parent);
3116 ptlrpc_req_finished(request);
/* Drop the stale local inode for the migrated child. */
3121 if (child_inode != NULL) {
3122 clear_nlink(child_inode);
3126 ll_finish_md_op_data(op_data);
/* flock/lock handler for the -o noflock mount flavor; per the table comment
 * below it returns ENOSYS to callers (body elided from this listing). */
3131 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3139 * test if some locks matching bits and l_req_mode are acquired
3140 * - bits can be in different locks
3141 * - if found clear the common lock bits in *bits
3142 * - the bits not found, are kept in *bits
3144 * \param bits [IN] searched lock bits [IN]
3145 * \param l_req_mode [IN] searched lock mode
3146 * \retval boolean, true iff all bits are found
3148 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
3150 struct lustre_handle lockh;
3151 ldlm_policy_data_t policy;
/* LCK_MINMODE means "any mode": match against the union of CR/CW/PR/PW. */
3152 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
3153 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
3162 fid = &ll_i2info(inode)->lli_fid;
3163 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3164 ldlm_lockname[mode]);
/* TEST_LOCK: probe only, do not take a reference on the matched lock. */
3166 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* Probe one inodebit at a time so bits found in different locks all count. */
3167 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3168 policy.l_inodebits.bits = *bits & (1 << i);
3169 if (policy.l_inodebits.bits == 0)
3172 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3173 &policy, mode, &lockh)) {
3174 struct ldlm_lock *lock;
3176 lock = ldlm_handle2lock(&lockh);
3179 ~(lock->l_policy_data.l_inodebits.bits);
3180 LDLM_LOCK_PUT(lock);
3182 *bits &= ~policy.l_inodebits.bits;
/* Match (and reference) a granted MD lock covering inodebits @bits on
 * @inode; returns the matched mode, with the handle in *lockh. */
3189 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
3190 struct lustre_handle *lockh, __u64 flags,
3193 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
3198 fid = &ll_i2info(inode)->lli_fid;
3199 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3201 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3202 fid, LDLM_IBITS, &policy, mode, lockh);
/* Post-process the rc of a revalidate RPC: -ENOENT on a non-regular,
 * non-directory inode is treated as a benign "already unlinked" case;
 * other errors are logged (EACCES/EIDRM at low verbosity). */
3207 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3209 /* Already unlinked. Just update nlink and return success */
3210 if (rc == -ENOENT) {
3212 /* This path cannot be hit for regular files unless in
3213 * case of obscure races, so no need to to validate
3215 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3217 } else if (rc != 0) {
3218 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3219 "%s: revalidate FID "DFID" error: rc = %d\n",
3220 ll_get_fsname(inode->i_sb, NULL, 0),
3221 PFID(ll_inode2fid(inode)), rc);
/* Revalidate @dentry's inode attributes with the MDT for lock bits @ibits.
 * Two strategies: an intent lock (IT_GETATTR/IT_LOOKUP) when the server
 * supports OBD_CONNECT_ATTRFID, otherwise a plain getattr when we do not
 * already hold a matching MD lock. */
3227 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3229 struct inode *inode = dentry->d_inode;
3230 struct ptlrpc_request *req = NULL;
3231 struct obd_export *exp;
3235 LASSERT(inode != NULL);
3237 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3238 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3240 exp = ll_i2mdexp(inode);
3242 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3243 * But under CMD case, it caused some lock issues, should be fixed
3244 * with new CMD ibits lock. See bug 12718 */
3245 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3246 struct lookup_intent oit = { .it_op = IT_GETATTR };
3247 struct md_op_data *op_data;
/* Pure LOOKUP bit requests downgrade to the lighter IT_LOOKUP intent. */
3249 if (ibits == MDS_INODELOCK_LOOKUP)
3250 oit.it_op = IT_LOOKUP;
3252 /* Call getattr by fid, so do not provide name at all. */
3253 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3254 dentry->d_inode, NULL, 0, 0,
3255 LUSTRE_OPC_ANY, NULL);
3256 if (IS_ERR(op_data))
3257 RETURN(PTR_ERR(op_data));
3259 oit.it_create_mode |= M_CHECK_STALE;
3260 rc = md_intent_lock(exp, op_data, &oit, &req,
3261 &ll_md_blocking_ast, 0);
3262 ll_finish_md_op_data(op_data);
3263 oit.it_create_mode &= ~M_CHECK_STALE;
3265 rc = ll_inode_revalidate_fini(inode, rc);
3269 rc = ll_revalidate_it_finish(req, &oit, dentry);
3271 ll_intent_release(&oit);
3275 /* Unlinked? Unhash dentry, so it is not picked up later by
3276 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3277 here to preserve get_cwd functionality on 2.6.
3279 if (!dentry->d_inode->i_nlink)
3280 d_lustre_invalidate(dentry, 0);
3282 ll_lookup_finish_locks(&oit, dentry);
/* No ATTRFID support: only issue a getattr when no MD lock already
 * covers the requested ibits. */
3283 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3284 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3285 obd_valid valid = OBD_MD_FLGETATTR;
3286 struct md_op_data *op_data;
/* Regular files also need striping (EA) data in the reply. */
3289 if (S_ISREG(inode->i_mode)) {
3290 rc = ll_get_default_mdsize(sbi, &ealen);
3293 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3296 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3297 0, ealen, LUSTRE_OPC_ANY,
3299 if (IS_ERR(op_data))
3300 RETURN(PTR_ERR(op_data));
3302 op_data->op_valid = valid;
3303 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
3304 * capa for this inode. Because we only keep capas of dirs
3306 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3307 ll_finish_md_op_data(op_data);
3309 rc = ll_inode_revalidate_fini(inode, rc);
3313 rc = ll_prep_inode(&inode, req, NULL, NULL);
3316 ptlrpc_req_finished(req);
/* For a striped directory, merge the per-stripe attributes from all MDTs
 * (via md_merge_attr) into the llite inode info: size, nlink and the
 * a/m/ctime triple cached in lli_lvb. */
3320 static int ll_merge_md_attr(struct inode *inode)
3322 struct cl_attr attr = { 0 };
3325 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3326 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3331 ll_i2info(inode)->lli_stripe_dir_size = attr.cat_size;
3332 ll_i2info(inode)->lli_stripe_dir_nlink = attr.cat_nlink;
3334 ll_i2info(inode)->lli_lvb.lvb_atime = attr.cat_atime;
3335 ll_i2info(inode)->lli_lvb.lvb_mtime = attr.cat_mtime;
3336 ll_i2info(inode)->lli_lvb.lvb_ctime = attr.cat_ctime;
/* Full revalidation: refresh MD attributes via __ll_inode_revalidate(),
 * then refresh size — merged stripe attrs for striped directories,
 * glimpse for regular files (unless an HSM restore is in progress). */
3342 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3344 struct inode *inode = dentry->d_inode;
3348 rc = __ll_inode_revalidate(dentry, ibits);
3352 /* if object isn't regular file, don't validate size */
3353 if (!S_ISREG(inode->i_mode)) {
3354 if (S_ISDIR(inode->i_mode) &&
3355 ll_i2info(inode)->lli_lsm_md != NULL) {
3356 rc = ll_merge_md_attr(inode);
/* Propagate the (possibly merged) cached timestamps into the inode. */
3361 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
3362 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
3363 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
3365 /* In case of restore, the MDT has the right size and has
3366 * already send it back without granting the layout lock,
3367 * inode is up-to-date so glimpse is useless.
3368 * Also to glimpse we need the layout, in case of a running
3369 * restore the MDT holds the layout lock so the glimpse will
3370 * block up to the end of restore (getattr will block)
3372 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3373 rc = ll_glimpse_size(inode);
/* .getattr handler: revalidate UPDATE|LOOKUP bits with the MDT, then fill
 * *stat from the (now fresh) inode fields. */
3378 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3380 struct inode *inode = de->d_inode;
3381 struct ll_sb_info *sbi = ll_i2sbi(inode);
3382 struct ll_inode_info *lli = ll_i2info(inode);
3385 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3386 MDS_INODELOCK_LOOKUP);
3387 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3392 stat->dev = inode->i_sb->s_dev;
/* 32-bit userspace needs an ino that fits in 32 bits; build it from the
 * FID instead of using the native i_ino. */
3393 if (ll_need_32bit_api(sbi))
3394 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3396 stat->ino = inode->i_ino;
3397 stat->mode = inode->i_mode;
3398 stat->uid = inode->i_uid;
3399 stat->gid = inode->i_gid;
3400 stat->rdev = inode->i_rdev;
3401 stat->atime = inode->i_atime;
3402 stat->mtime = inode->i_mtime;
3403 stat->ctime = inode->i_ctime;
3404 stat->blksize = 1 << inode->i_blkbits;
3405 stat->blocks = inode->i_blocks;
/* Striped directories report the merged cross-MDT nlink/size cached by
 * ll_merge_md_attr(). */
3407 if (S_ISDIR(inode->i_mode) &&
3408 ll_i2info(inode)->lli_lsm_md != NULL) {
3409 stat->nlink = lli->lli_stripe_dir_nlink;
3410 stat->size = lli->lli_stripe_dir_size;
3412 stat->nlink = inode->i_nlink;
3413 stat->size = i_size_read(inode);
/* .fiemap handler: marshal the kernel fiemap_extent_info into a Lustre
 * ll_user_fiemap buffer, run ll_do_fiemap(), and copy the mapped extents
 * back into the caller's extent array. */
3419 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3420 __u64 start, __u64 len)
3424 struct ll_user_fiemap *fiemap;
3425 unsigned int extent_count = fieinfo->fi_extents_max;
/* Buffer holds the header plus all requested extent slots. */
3427 num_bytes = sizeof(*fiemap) + (extent_count *
3428 sizeof(struct ll_fiemap_extent));
3429 OBD_ALLOC_LARGE(fiemap, num_bytes);
3434 fiemap->fm_flags = fieinfo->fi_flags;
3435 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3436 fiemap->fm_start = start;
3437 fiemap->fm_length = len;
/* Seed only the first extent from the caller (continuation cookie). */
3438 if (extent_count > 0)
3439 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3440 sizeof(struct ll_fiemap_extent))
3442 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3444 fieinfo->fi_flags = fiemap->fm_flags;
3445 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3446 if (extent_count > 0)
3447 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3448 fiemap->fm_mapped_extents *
3449 sizeof(struct ll_fiemap_extent));
3451 OBD_FREE_LARGE(fiemap, num_bytes);
/* .get_acl handler: return a referenced copy of the cached POSIX ACL,
 * taken under lli_lock.  Only ACL_TYPE_ACCESS is meaningful here
 * (lli_posix_acl is the only ACL cached). */
3455 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3457 struct ll_inode_info *lli = ll_i2info(inode);
3458 struct posix_acl *acl = NULL;
3461 spin_lock(&lli->lli_lock);
3462 /* VFS' acl_permission_check->check_acl will release the refcount */
3463 acl = posix_acl_dup(lli->lli_posix_acl);
3464 spin_unlock(&lli->lli_lock);
/* ACL permission callback for kernels whose generic_permission() takes a
 * check_acl hook; the two prototypes match the 4-arg / 3-arg kernel APIs.
 * Compiled out when CONFIG_FS_POSIX_ACL is off. */
3469 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3471 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3472 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3474 ll_check_acl(struct inode *inode, int mask)
3477 # ifdef CONFIG_FS_POSIX_ACL
3478 struct posix_acl *acl;
/* RCU-walk mode cannot block on the spinlock in ll_get_acl(). */
3482 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3483 if (flags & IPERM_FLAG_RCU)
3486 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3491 rc = posix_acl_permission(inode, acl, mask);
3492 posix_acl_release(acl);
3495 # else /* !CONFIG_FS_POSIX_ACL */
3497 # endif /* CONFIG_FS_POSIX_ACL */
3499 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/* .permission handler.  Beyond generic permission checking it:
 *  - revalidates the root inode (not validated during lookup),
 *  - applies root squash by temporarily overriding credentials,
 *  - defers to remote-permission checking for LL_SBI_RMT_CLIENT mounts. */
3501 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3502 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3504 # ifdef HAVE_INODE_PERMISION_2ARGS
3505 int ll_inode_permission(struct inode *inode, int mask)
3507 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3512 struct ll_sb_info *sbi;
3513 struct root_squash_info *squash;
3514 struct cred *cred = NULL;
3515 const struct cred *old_cred = NULL;
3517 bool squash_id = false;
/* RCU-walk (non-blocking) lookups cannot do the RPCs below. */
3520 #ifdef MAY_NOT_BLOCK
3521 if (mask & MAY_NOT_BLOCK)
3523 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3524 if (flags & IPERM_FLAG_RCU)
3528 /* as root inode are NOT getting validated in lookup operation,
3529 * need to do it before permission check. */
3531 if (inode == inode->i_sb->s_root->d_inode) {
3532 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3533 MDS_INODELOCK_LOOKUP);
3538 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3539 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3541 /* squash fsuid/fsgid if needed */
3542 sbi = ll_i2sbi(inode);
3543 squash = &sbi->ll_squash;
/* Squash only when configured, the caller is root, and the mount does
 * not carry the norootsquash override. */
3544 if (unlikely(squash->rsi_uid != 0 &&
3545 current_fsuid() == 0 &&
3546 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3550 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3551 current_fsuid(), current_fsgid(),
3552 squash->rsi_uid, squash->rsi_gid);
3554 /* update current process's credentials
3555 * and FS capability */
3556 cred = prepare_creds();
3560 cred->fsuid = squash->rsi_uid;
3561 cred->fsgid = squash->rsi_gid;
/* Drop every filesystem-related capability from the squashed creds. */
3562 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3563 if ((1 << cap) & CFS_CAP_FS_MASK)
3564 cap_lower(cred->cap_effective, cap);
3566 old_cred = override_creds(cred);
3569 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3571 if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
3572 rc = lustre_check_remote_perm(inode, mask);
3574 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3576 /* restore current process's credentials and FS capability */
3578 revert_creds(old_cred);
3585 /* -o localflock - only provides locally consistent flock locks */
/* Default file_operations: no .flock/.lock entries, so the kernel falls
 * back to its local (single-node) lock handling. */
3586 struct file_operations ll_file_operations = {
3587 .read = ll_file_read,
3588 .aio_read = ll_file_aio_read,
3589 .write = ll_file_write,
3590 .aio_write = ll_file_aio_write,
3591 .unlocked_ioctl = ll_file_ioctl,
3592 .open = ll_file_open,
3593 .release = ll_file_release,
3594 .mmap = ll_file_mmap,
3595 .llseek = ll_file_seek,
3596 .splice_read = ll_file_splice_read,
/* -o flock variant: identical to ll_file_operations except that both
 * flock(2) and fcntl(2) locking route through ll_file_flock() for
 * cluster-wide consistency. */
3601 struct file_operations ll_file_operations_flock = {
3602 .read = ll_file_read,
3603 .aio_read = ll_file_aio_read,
3604 .write = ll_file_write,
3605 .aio_write = ll_file_aio_write,
3606 .unlocked_ioctl = ll_file_ioctl,
3607 .open = ll_file_open,
3608 .release = ll_file_release,
3609 .mmap = ll_file_mmap,
3610 .llseek = ll_file_seek,
3611 .splice_read = ll_file_splice_read,
3614 .flock = ll_file_flock,
3615 .lock = ll_file_flock
3618 /* These are for -o noflock - to return ENOSYS on flock calls */
3619 struct file_operations ll_file_operations_noflock = {
3620 .read = ll_file_read,
3621 .aio_read = ll_file_aio_read,
3622 .write = ll_file_write,
3623 .aio_write = ll_file_aio_write,
3624 .unlocked_ioctl = ll_file_ioctl,
3625 .open = ll_file_open,
3626 .release = ll_file_release,
3627 .mmap = ll_file_mmap,
3628 .llseek = ll_file_seek,
3629 .splice_read = ll_file_splice_read,
3632 .flock = ll_file_noflock,
3633 .lock = ll_file_noflock
/* inode_operations for regular Lustre files; .get_acl is only wired up on
 * kernels that have the hook (HAVE_IOP_GET_ACL). */
3636 struct inode_operations ll_file_inode_operations = {
3637 .setattr = ll_setattr,
3638 .getattr = ll_getattr,
3639 .permission = ll_inode_permission,
3640 .setxattr = ll_setxattr,
3641 .getxattr = ll_getxattr,
3642 .listxattr = ll_listxattr,
3643 .removexattr = ll_removexattr,
3644 .fiemap = ll_fiemap,
3645 #ifdef HAVE_IOP_GET_ACL
3646 .get_acl = ll_get_acl,
3650 /* dynamic ioctl number support routins */
/* Global registry of dynamically-registered ioctl handlers: a list of
 * llioc_data entries guarded by an rw_semaphore (readers dispatch,
 * writers register/unregister). */
3651 static struct llioc_ctl_data {
3652 struct rw_semaphore ioc_sem;
3653 struct list_head ioc_head;
3655 __RWSEM_INITIALIZER(llioc.ioc_sem),
3656 LIST_HEAD_INIT(llioc.ioc_head)
/* One registered handler: callback plus the ioctl commands it accepts,
 * stored in a trailing variable-length array. */
3661 struct list_head iocd_list;
3662 unsigned int iocd_size;
3663 llioc_callback_t iocd_cb;
3664 unsigned int iocd_count;
3665 unsigned int iocd_cmd[0];
/* Register a dynamic ioctl handler @cb for the @count commands in @cmd.
 * Returns an opaque cookie (the allocation itself) for later
 * ll_iocontrol_unregister(), or NULL on bad args / allocation failure. */
3668 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3671 struct llioc_data *in_data = NULL;
3674 if (cb == NULL || cmd == NULL ||
3675 count > LLIOC_MAX_CMD || count < 0)
/* Trailing iocd_cmd[] array makes the allocation size variable. */
3678 size = sizeof(*in_data) + count * sizeof(unsigned int);
3679 OBD_ALLOC(in_data, size);
3680 if (in_data == NULL)
3683 memset(in_data, 0, sizeof(*in_data));
3684 in_data->iocd_size = size;
3685 in_data->iocd_cb = cb;
3686 in_data->iocd_count = count;
3687 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
/* Publish under the write side of the registry semaphore. */
3689 down_write(&llioc.ioc_sem);
3690 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3691 up_write(&llioc.ioc_sem);
/* Remove and free the handler previously returned by
 * ll_iocontrol_register(); warns if @magic is not found in the list. */
3696 void ll_iocontrol_unregister(void *magic)
3698 struct llioc_data *tmp;
3703 down_write(&llioc.ioc_sem);
3704 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* Size is captured before the entry is freed below. */
3706 unsigned int size = tmp->iocd_size;
3708 list_del(&tmp->iocd_list);
3709 up_write(&llioc.ioc_sem);
3711 OBD_FREE(tmp, size);
3715 up_write(&llioc.ioc_sem);
3717 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3720 EXPORT_SYMBOL(ll_iocontrol_register);
3721 EXPORT_SYMBOL(ll_iocontrol_unregister);
/* Dispatch ioctl @cmd through the dynamic handler registry: walk every
 * registered handler (read-locked), invoke the first whose command table
 * contains @cmd, and stop if it returns LLIOC_STOP.  The handler's rc is
 * passed back through *rcp. */
3723 static enum llioc_iter
3724 ll_iocontrol_call(struct inode *inode, struct file *file,
3725 unsigned int cmd, unsigned long arg, int *rcp)
3727 enum llioc_iter ret = LLIOC_CONT;
3728 struct llioc_data *data;
3729 int rc = -EINVAL, i;
3731 down_read(&llioc.ioc_sem);
3732 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3733 for (i = 0; i < data->iocd_count; i++) {
3734 if (cmd != data->iocd_cmd[i])
3737 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3741 if (ret == LLIOC_STOP)
3744 up_read(&llioc.ioc_sem);
/* Push a layout configuration into the cl_object stack via cl_conf_set().
 * For OBJECT_CONF_SET this also opens the layout lock for matching and
 * bumps the cached layout generation. */
3751 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3753 struct ll_inode_info *lli = ll_i2info(inode);
3754 struct cl_env_nest nest;
3759 if (lli->lli_clob == NULL)
3762 env = cl_env_nested_get(&nest);
3764 RETURN(PTR_ERR(env));
3766 result = cl_conf_set(env, lli->lli_clob, conf);
3767 cl_env_nested_put(&nest, env);
3769 if (conf->coc_opc == OBJECT_CONF_SET) {
3770 struct ldlm_lock *lock = conf->coc_lock;
3772 LASSERT(lock != NULL);
3773 LASSERT(ldlm_has_layout(lock));
3775 struct lustre_md *md = conf->u.coc_md;
/* No stripe metadata means the "empty" layout generation. */
3776 __u32 gen = LL_LAYOUT_GEN_EMPTY;
3778 /* it can only be allowed to match after layout is
3779 * applied to inode otherwise false layout would be
3780 * seen. Applying layout shoud happen before dropping
3781 * the intent lock. */
3782 ldlm_lock_allow_match(lock);
3784 lli->lli_has_smd = lsm_has_objects(md->lsm);
3785 if (md->lsm != NULL)
3786 gen = md->lsm->lsm_layout_gen;
3789 DFID ": layout version change: %u -> %u\n",
3790 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3792 ll_layout_version_set(lli, gen);
3798 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
3799 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3802 struct ll_sb_info *sbi = ll_i2sbi(inode);
3803 struct obd_capa *oc;
3804 struct ptlrpc_request *req;
3805 struct mdt_body *body;
3812 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3813 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
3814 lock->l_lvb_data, lock->l_lvb_len);
/* Nothing to do if the lock already carries a ready LVB. */
3816 if ((lock->l_lvb_data != NULL) && ldlm_is_lvb_ready(lock))
3819 /* if layout lock was granted right away, the layout is returned
3820 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3821 * blocked and then granted via completion ast, we have to fetch
3822 * layout here. Please note that we can't use the LVB buffer in
3823 * completion AST because it doesn't have a large enough buffer */
3824 oc = ll_mdscapa_get(inode);
3825 rc = ll_get_default_mdsize(sbi, &lmmsize);
3827 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3828 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3834 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3836 GOTO(out, rc = -EPROTO);
3838 lmmsize = body->eadatasize;
3839 if (lmmsize == 0) /* empty layout */
3842 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3844 GOTO(out, rc = -EFAULT);
/* Copy the LOV EA into a fresh buffer and install it as the lock's LVB,
 * freeing any previous LVB, all under the resource lock. */
3846 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3847 if (lvbdata == NULL)
3848 GOTO(out, rc = -ENOMEM);
3850 memcpy(lvbdata, lmm, lmmsize);
3851 lock_res_and_lock(lock);
3852 if (lock->l_lvb_data != NULL)
3853 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3855 lock->l_lvb_data = lvbdata;
3856 lock->l_lvb_len = lmmsize;
3857 unlock_res_and_lock(lock);
3862 ptlrpc_req_finished(req);
3867 * Apply the layout to the inode. Layout lock is held and will be released
/* Takes the layout carried in the lock's LVB, unpacks it and configures the
 * cl_object; *gen receives the resulting layout generation.  If @reconf and
 * the object is busy (-EBUSY), waits for in-flight IO and retries via the
 * "again" path (elided from this listing). */
3870 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3871 struct inode *inode, __u32 *gen, bool reconf)
3873 struct ll_inode_info *lli = ll_i2info(inode);
3874 struct ll_sb_info *sbi = ll_i2sbi(inode);
3875 struct ldlm_lock *lock;
3876 struct lustre_md md = { NULL };
3877 struct cl_object_conf conf;
3880 bool wait_layout = false;
3883 LASSERT(lustre_handle_is_used(lockh));
3885 lock = ldlm_handle2lock(lockh);
3886 LASSERT(lock != NULL);
3887 LASSERT(ldlm_has_layout(lock));
3889 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured: %d\n",
3890 PFID(&lli->lli_fid), inode, reconf);
3892 /* in case this is a caching lock and reinstate with new inode */
3893 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3895 lock_res_and_lock(lock);
3896 lvb_ready = ldlm_is_lvb_ready(lock);
3897 unlock_res_and_lock(lock);
3898 /* checking lvb_ready is racy but this is okay. The worst case is
3899 * that multi processes may configure the file on the same time. */
3901 if (lvb_ready || !reconf) {
3904 /* layout_gen must be valid if layout lock is not
3905 * cancelled and stripe has already set */
3906 *gen = ll_layout_version_get(lli);
3912 rc = ll_layout_fetch(inode, lock);
3916 /* for layout lock, lmm is returned in lock's lvb.
3917 * lvb_data is immutable if the lock is held so it's safe to access it
3918 * without res lock. See the description in ldlm_lock_decref_internal()
3919 * for the condition to free lvb_data of layout lock */
3920 if (lock->l_lvb_data != NULL) {
3921 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3922 lock->l_lvb_data, lock->l_lvb_len);
3924 *gen = LL_LAYOUT_GEN_EMPTY;
3926 *gen = md.lsm->lsm_layout_gen;
3929 CERROR("%s: file "DFID" unpackmd error: %d\n",
3930 ll_get_fsname(inode->i_sb, NULL, 0),
3931 PFID(&lli->lli_fid), rc);
3937 /* set layout to file. Unlikely this will fail as old layout was
3938 * surely eliminated */
3939 memset(&conf, 0, sizeof conf);
3940 conf.coc_opc = OBJECT_CONF_SET;
3941 conf.coc_inode = inode;
3942 conf.coc_lock = lock;
3943 conf.u.coc_md = &md;
3944 rc = ll_layout_conf(inode, &conf);
/* The unpacked lsm was copied into the cl_object; free our copy. */
3947 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3949 /* refresh layout failed, need to wait */
3950 wait_layout = rc == -EBUSY;
/* Drop our reference and the caller's enqueue reference before any wait. */
3954 LDLM_LOCK_PUT(lock);
3955 ldlm_lock_decref(lockh, mode);
3957 /* wait for IO to complete if it's still being used. */
3959 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
3960 ll_get_fsname(inode->i_sb, NULL, 0),
3961 PFID(&lli->lli_fid), inode);
/* OBJECT_CONF_WAIT blocks until outstanding IO against the old layout
 * drains, then the caller can retry the configuration. */
3963 memset(&conf, 0, sizeof conf);
3964 conf.coc_opc = OBJECT_CONF_WAIT;
3965 conf.coc_inode = inode;
3966 rc = ll_layout_conf(inode, &conf);
3970 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
3971 ll_get_fsname(inode->i_sb, NULL, 0),
3972 PFID(&lli->lli_fid), rc);
3978 * This function checks if there exists a LAYOUT lock on the client side,
3979 * or enqueues it if it doesn't have one in cache.
3981 * This function will not hold the layout lock, so it may be revoked any time after
3982 * this function returns. Any operation that depends on the layout should be redone
3985 * This function should be called before lov_io_init() to get an uptodate
3986 * layout version, the caller should save the version number and after IO
3987 * is finished, this function should be called again to verify that layout
3988 * is not changed during IO time.
3990 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3992 struct ll_inode_info *lli = ll_i2info(inode);
3993 struct ll_sb_info *sbi = ll_i2sbi(inode);
3994 struct md_op_data *op_data;
3995 struct lookup_intent it;
3996 struct lustre_handle lockh;
3998 struct ldlm_enqueue_info einfo = {
3999 .ei_type = LDLM_IBITS,
4001 .ei_cb_bl = &ll_md_blocking_ast,
4002 .ei_cb_cp = &ldlm_completion_ast,
4007 *gen = ll_layout_version_get(lli);
4008 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
4012 LASSERT(fid_is_sane(ll_inode2fid(inode)));
4013 LASSERT(S_ISREG(inode->i_mode));
4015 /* take layout lock mutex to enqueue layout lock exclusively. */
4016 mutex_lock(&lli->lli_layout_mutex);
4019 /* mostly layout lock is caching on the local side, so try to match
4020 * it before grabbing layout lock mutex. */
4021 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4022 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4023 if (mode != 0) { /* hit cached lock */
4024 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
4028 mutex_unlock(&lli->lli_layout_mutex);
4032 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4033 0, 0, LUSTRE_OPC_ANY, NULL);
4034 if (IS_ERR(op_data)) {
4035 mutex_unlock(&lli->lli_layout_mutex);
4036 RETURN(PTR_ERR(op_data));
4039 /* have to enqueue one */
4040 memset(&it, 0, sizeof(it));
4041 it.it_op = IT_LAYOUT;
4042 lockh.cookie = 0ULL;
4044 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)\n",
4045 ll_get_fsname(inode->i_sb, NULL, 0),
4046 PFID(&lli->lli_fid), inode);
4048 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, &it, op_data, &lockh, 0);
4049 if (it.d.lustre.it_data != NULL)
4050 ptlrpc_req_finished(it.d.lustre.it_data);
4051 it.d.lustre.it_data = NULL;
4053 ll_finish_md_op_data(op_data);
4055 mode = it.d.lustre.it_lock_mode;
4056 it.d.lustre.it_lock_mode = 0;
4057 ll_intent_drop_lock(&it);
4060 /* set lock data in case this is a new lock */
4061 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4062 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
4066 mutex_unlock(&lli->lli_layout_mutex);
4072 * This function sends a restore request to the MDT
4074 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4076 struct hsm_user_request *hur;
4080 len = sizeof(struct hsm_user_request) +
4081 sizeof(struct hsm_user_item);
4082 OBD_ALLOC(hur, len);
4086 hur->hur_request.hr_action = HUA_RESTORE;
4087 hur->hur_request.hr_archive_id = 0;
4088 hur->hur_request.hr_flags = 0;
4089 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4090 sizeof(hur->hur_user_item[0].hui_fid));
4091 hur->hur_user_item[0].hui_extent.offset = offset;
4092 hur->hur_user_item[0].hui_extent.length = length;
4093 hur->hur_request.hr_itemcount = 1;
4094 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,