4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2013, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
51 #include "cl_object.h"
54 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
56 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
59 static enum llioc_iter
60 ll_iocontrol_call(struct inode *inode, struct file *file,
61 unsigned int cmd, unsigned long arg, int *rcp);
/* Allocate per-file-descriptor private state (struct ll_file_data) from the
 * ll_file_data_slab cache with GFP_NOFS (safe under fs reclaim) and reset the
 * write-failure flag.
 * NOTE(review): listing is an excerpt — NULL-check of @fd and the return
 * statement are not visible here; presumably fd is returned (NULL on OOM). */
63 static struct ll_file_data *ll_file_data_get(void)
65 struct ll_file_data *fd;
67 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
71 fd->fd_write_failed = false;
/* Release a struct ll_file_data back to its slab cache.
 * Counterpart of ll_file_data_get(). */
76 static void ll_file_data_put(struct ll_file_data *fd)
79 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/* Copy the client-cached inode attributes (mode, a/m/ctime, size, blocks,
 * flags, ioepoch) plus the MDS open handle @fh and an MDS capability into
 * @op_data, so they can be shipped to the MDS (e.g. on close).
 * If the inode has locally-modified data (LLIF_DATA_MODIFIED), also set the
 * MDS_DATA_MODIFIED bias so the server knows attributes may be stale. */
82 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
83 struct lustre_handle *fh)
85 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
86 op_data->op_attr.ia_mode = inode->i_mode;
87 op_data->op_attr.ia_atime = inode->i_atime;
88 op_data->op_attr.ia_mtime = inode->i_mtime;
89 op_data->op_attr.ia_ctime = inode->i_ctime;
90 op_data->op_attr.ia_size = i_size_read(inode);
91 op_data->op_attr_blocks = inode->i_blocks;
/* ia_attr_flags lives in the Lustre-extended iattr wrapper, hence the cast. */
92 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
93 ll_inode_to_ext_flags(inode->i_flags);
94 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
96 op_data->op_handle = *fh;
97 op_data->op_capa1 = ll_mdscapa_get(inode);
99 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
100 op_data->op_bias |= MDS_DATA_MODIFIED;
104 * Closes the IO epoch and packs all the attributes into @op_data for
/* Prepare @op_data for a close RPC on @och: mark which attributes are valid,
 * close the IO epoch, and pack the cached inode attributes + open handle. */
107 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
108 struct obd_client_handle *och)
112 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
113 ATTR_MTIME | ATTR_MTIME_SET |
114 ATTR_CTIME | ATTR_CTIME_SET;
/* NOTE(review): excerpt — the body of this !FMODE_WRITE branch is not
 * visible here; presumably it skips the SOM/size handling below. */
116 if (!(och->och_flags & FMODE_WRITE))
/* Without Size-on-MDS support (or for non-regular files) the MDS needs the
 * client's size/blocks on close. */
119 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
120 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
122 ll_ioepoch_close(inode, op_data, &och, 0);
125 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
126 ll_prep_md_op_data(op_data, inode, NULL, NULL,
127 0, 0, LUSTRE_OPC_ANY, NULL);
/* Send a close RPC to the MDS for open handle @och on @inode.
 * If @data_version is non-NULL this is an HSM release close (MDS_HSM_RELEASE):
 * the data version and lease handle are packed so the MDS can verify the file
 * was not modified since the version was sampled.
 * On success, clears LLIF_DATA_MODIFIED if the modified flag was delivered,
 * destroys OST objects if the server asked for it, clears open replay data
 * and poisons the handle cookie.
 * NOTE(review): excerpt — several branches/labels (error paths, out:) are not
 * visible in this listing; comments below describe only the visible logic. */
131 static int ll_close_inode_openhandle(struct obd_export *md_exp,
133 struct obd_client_handle *och,
134 const __u64 *data_version)
136 struct obd_export *exp = ll_i2mdexp(inode);
137 struct md_op_data *op_data;
138 struct ptlrpc_request *req = NULL;
139 struct obd_device *obd = class_exp2obd(exp);
146 * XXX: in case of LMV, is this correct to access
149 CERROR("Invalid MDC connection handle "LPX64"\n",
150 ll_i2mdexp(inode)->exp_handle.h_cookie);
154 OBD_ALLOC_PTR(op_data);
156 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
158 ll_prepare_close(inode, op_data, och);
159 if (data_version != NULL) {
160 /* Pass in data_version implies release. */
161 op_data->op_bias |= MDS_HSM_RELEASE;
162 op_data->op_data_version = *data_version;
163 op_data->op_lease_handle = och->och_lease_handle;
164 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* Remember whether this close ends the IO epoch before sending the RPC. */
166 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
167 rc = md_close(md_exp, op_data, och->och_mod, &req);
169 /* This close must have the epoch closed. */
170 LASSERT(epoch_close);
171 /* MDS has instructed us to obtain Size-on-MDS attribute from
172 * OSTs and send setattr to back to MDS. */
173 rc = ll_som_update(inode, op_data);
175 CERROR("%s: inode "DFID" mdc Size-on-MDS update"
176 " failed: rc = %d\n",
177 ll_i2mdexp(inode)->exp_obd->obd_name,
178 PFID(ll_inode2fid(inode)), rc);
182 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
183 ll_i2mdexp(inode)->exp_obd->obd_name,
184 PFID(ll_inode2fid(inode)), rc);
187 /* DATA_MODIFIED flag was successfully sent on close, cancel data
188 * modification flag. */
189 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
190 struct ll_inode_info *lli = ll_i2info(inode);
192 spin_lock(&lli->lli_lock);
193 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
194 spin_unlock(&lli->lli_lock);
198 rc = ll_objects_destroy(req, inode);
200 CERROR("%s: inode "DFID
201 " ll_objects destroy: rc = %d\n",
202 ll_i2mdexp(inode)->exp_obd->obd_name,
203 PFID(ll_inode2fid(inode)), rc);
/* For an HSM release close, check the server actually released the file. */
206 if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
207 struct mdt_body *body;
208 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
209 if (!(body->valid & OBD_MD_FLRELEASED))
213 ll_finish_md_op_data(op_data);
/* If SOM is enabled and the epoch is still open for a written regular file,
 * queue a DONE_WRITING to finish the epoch later. */
217 if (exp_connect_som(exp) && !epoch_close &&
218 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
219 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
221 md_clear_open_replay_data(md_exp, och);
222 /* Free @och if it is not waiting for DONE_WRITING. */
223 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
226 if (req) /* This is close request */
227 ptlrpc_req_finished(req);
/* Really close the MDS open handle for the given open mode (write/exec/read)
 * if no other local users of that handle remain. Selects the per-mode handle
 * pointer and use-count under lli_och_mutex; if the use count is still
 * positive, the close is skipped.
 * NOTE(review): excerpt — the handle-stealing and final-close lines between
 * the mutex drop and ll_close_inode_openhandle() are not fully visible. */
231 int ll_md_real_close(struct inode *inode, fmode_t fmode)
233 struct ll_inode_info *lli = ll_i2info(inode);
234 struct obd_client_handle **och_p;
235 struct obd_client_handle *och;
240 if (fmode & FMODE_WRITE) {
241 och_p = &lli->lli_mds_write_och;
242 och_usecount = &lli->lli_open_fd_write_count;
243 } else if (fmode & FMODE_EXEC) {
244 och_p = &lli->lli_mds_exec_och;
245 och_usecount = &lli->lli_open_fd_exec_count;
247 LASSERT(fmode & FMODE_READ);
248 och_p = &lli->lli_mds_read_och;
249 och_usecount = &lli->lli_open_fd_read_count;
252 mutex_lock(&lli->lli_och_mutex);
253 if (*och_usecount > 0) {
254 /* There are still users of this handle, so skip
256 mutex_unlock(&lli->lli_och_mutex);
262 mutex_unlock(&lli->lli_och_mutex);
265 /* There might be a race and this handle may already
267 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/* Per-file-descriptor close: drop group lock and lease if held, close any
 * fd-private open handle, decrement the per-mode open count, and decide
 * (via an OPEN DLM lock match) whether an MDS close RPC is needed at all.
 * Finally detaches and frees the ll_file_data. */
274 static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
277 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
278 struct ll_inode_info *lli = ll_i2info(inode);
282 /* clear group lock, if present */
283 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
284 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
286 if (fd->fd_lease_och != NULL) {
289 /* Usually the lease is not released when the
290 * application crashed, we need to release here. */
291 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
292 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
293 PFID(&lli->lli_fid), rc, lease_broken);
295 fd->fd_lease_och = NULL;
298 if (fd->fd_och != NULL) {
299 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
304 /* Let's see if we have good enough OPEN lock on the file and if
305 we can skip talking to MDS */
306 if (file->f_dentry->d_inode) { /* Can this ever be false? */
308 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
309 struct lustre_handle lockh;
310 struct inode *inode = file->f_dentry->d_inode;
311 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
/* Decrement the open count matching the mode this fd was opened with. */
313 mutex_lock(&lli->lli_och_mutex);
314 if (fd->fd_omode & FMODE_WRITE) {
316 LASSERT(lli->lli_open_fd_write_count);
317 lli->lli_open_fd_write_count--;
318 } else if (fd->fd_omode & FMODE_EXEC) {
320 LASSERT(lli->lli_open_fd_exec_count);
321 lli->lli_open_fd_exec_count--;
324 LASSERT(lli->lli_open_fd_read_count);
325 lli->lli_open_fd_read_count--;
327 mutex_unlock(&lli->lli_och_mutex);
/* No cached OPEN lock -> must do the real close against the MDS. */
329 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
330 LDLM_IBITS, &policy, lockmode,
332 rc = ll_md_real_close(file->f_dentry->d_inode,
336 CERROR("Releasing a file %p with negative dentry %p. Name %s",
337 file, file->f_dentry, file->f_dentry->d_name.name);
341 LUSTRE_FPRIVATE(file) = NULL;
342 ll_file_data_put(fd);
343 ll_capa_close(inode);
348 /* While this returns an error code, fput() the caller does not, so we need
349 * to make every effort to clean up all of our state here. Also, applications
350 * rarely check close errors and even if an error is returned they will not
351 * re-try the close call.
353 int ll_file_release(struct inode *inode, struct file *file)
355 struct ll_file_data *fd;
356 struct ll_sb_info *sbi = ll_i2sbi(inode);
357 struct ll_inode_info *lli = ll_i2info(inode);
361 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
362 PFID(ll_inode2fid(inode)), inode);
/* Remote-client ACL bookkeeping only applies to the root inode. */
364 #ifdef CONFIG_FS_POSIX_ACL
365 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
366 inode == inode->i_sb->s_root->d_inode) {
367 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
370 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
371 fd->fd_flags &= ~LL_FILE_RMTACL;
372 rct_del(&sbi->ll_rct, current_pid());
373 et_search_free(&sbi->ll_et, current_pid());
/* Don't count releases of the root dentry in the stats. */
378 if (inode->i_sb->s_root != file->f_dentry)
379 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
380 fd = LUSTRE_FPRIVATE(file);
383 /* The last ref on @file, maybe not the the owner pid of statahead.
384 * Different processes can open the same dir, "ll_opendir_key" means:
385 * it is me that should stop the statahead thread. */
386 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
387 lli->lli_opendir_pid != 0)
388 ll_stop_statahead(inode, lli->lli_opendir_key);
/* Root is never opened on the MDS, so only free local state for it. */
390 if (inode->i_sb->s_root == file->f_dentry) {
391 LUSTRE_FPRIVATE(file) = NULL;
392 ll_file_data_put(fd);
/* Flush deferred async write errors so close() can report them. */
396 if (!S_ISDIR(inode->i_mode)) {
397 if (lli->lli_clob != NULL)
398 lov_read_and_clear_async_rc(lli->lli_clob);
399 lli->lli_async_rc = 0;
402 rc = ll_md_close(sbi->ll_md_exp, inode, file);
404 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
405 libcfs_debug_dumplog();
/* Send an open intent to the MDS for @file (used when no cached open handle
 * is available, e.g. NFS export or a handle that vanished after revalidate).
 * @lmm/@lmmsize carry striping info when setting stripe parameters; when both
 * are empty we request an OPEN lock as well. On success the intent carries
 * the open disposition/lock; inode fields are refreshed from the reply. */
410 static int ll_intent_file_open(struct file *file, void *lmm,
411 int lmmsize, struct lookup_intent *itp)
413 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
414 struct dentry *parent = file->f_dentry->d_parent;
415 const char *name = file->f_dentry->d_name.name;
416 const int len = file->f_dentry->d_name.len;
417 struct md_op_data *op_data;
418 struct ptlrpc_request *req;
419 __u32 opc = LUSTRE_OPC_ANY;
426 /* Usually we come here only for NFSD, and we want open lock.
427 But we can also get here with pre 2.6.15 patchless kernels, and in
428 that case that lock is also ok */
429 /* We can also get here if there was cached open handle in revalidate_it
430 * but it disappeared while we were getting from there to ll_file_open.
431 * But this means this file was closed and immediatelly opened which
432 * makes a good candidate for using OPEN lock */
433 /* If lmmsize & lmm are not 0, we are just setting stripe info
434 * parameters. No need for the open lock */
435 if (lmm == NULL && lmmsize == 0) {
436 itp->it_flags |= MDS_OPEN_LOCK;
437 if (itp->it_flags & FMODE_WRITE)
438 opc = LUSTRE_OPC_CREATE;
441 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
442 file->f_dentry->d_inode, name, len,
445 RETURN(PTR_ERR(op_data));
/* Inode is known, so open by FID rather than by name. */
447 itp->it_flags |= MDS_OPEN_BY_FID;
448 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
449 0 /*unused */, &req, ll_md_blocking_ast, 0);
450 ll_finish_md_op_data(op_data);
452 /* reason for keep own exit path - don`t flood log
453 * with messages with -ESTALE errors.
455 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
456 it_open_error(DISP_OPEN_OPEN, itp))
458 ll_release_openhandle(file->f_dentry, itp);
462 if (it_disposition(itp, DISP_LOOKUP_NEG))
463 GOTO(out, rc = -ENOENT);
465 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
466 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
467 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Refresh the inode from the MDS reply; propagate lock data on success. */
471 rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
472 if (!rc && itp->d.lustre.it_lock_mode)
473 ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
477 ptlrpc_req_finished(req);
478 ll_intent_drop_lock(itp);
484 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
485 * not believe attributes if a few ioepoch holders exist. Attributes for
486 * previous ioepoch if new one is opened are also skipped by MDS.
488 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
/* Only record a new, non-zero epoch; re-opens with the same epoch are no-ops. */
490 if (ioepoch && lli->lli_ioepoch != ioepoch) {
491 lli->lli_ioepoch = ioepoch;
492 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
493 ioepoch, PFID(&lli->lli_fid));
/* Populate an obd_client_handle from the MDS reply carried in intent @it
 * (open handle, FID, lease-lock cookie, open flags), then register it for
 * open replay so the handle survives MDS recovery. */
497 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
498 struct obd_client_handle *och)
500 struct ptlrpc_request *req = it->d.lustre.it_data;
501 struct mdt_body *body;
503 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
504 och->och_fh = body->handle;
505 och->och_fid = body->fid1;
506 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
507 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
508 och->och_flags = it->it_flags;
510 return md_set_open_replay_data(md_exp, och, it);
/* Finish a local open: if @och is given, fill it from the intent reply and
 * open the IO epoch from the reply body; then attach @fd as the file's
 * private data, init readahead state and record the open mode.
 * NOTE(review): excerpt — the `if (och)` guard and error handling around
 * ll_och_fill() are not visible in this listing. */
513 static int ll_local_open(struct file *file, struct lookup_intent *it,
514 struct ll_file_data *fd, struct obd_client_handle *och)
516 struct inode *inode = file->f_dentry->d_inode;
517 struct ll_inode_info *lli = ll_i2info(inode);
520 LASSERT(!LUSTRE_FPRIVATE(file));
525 struct ptlrpc_request *req = it->d.lustre.it_data;
526 struct mdt_body *body;
529 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
533 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
534 ll_ioepoch_open(lli, body->ioepoch);
537 LUSTRE_FPRIVATE(file) = fd;
538 ll_readahead_init(inode, &fd->fd_ras);
539 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
544 /* Open a file, and (for the very first open) create objects on the OSTs at
545 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
546 * creation or open until ll_lov_setstripe() ioctl is called.
548 * If we already have the stripe MD locally then we don't request it in
549 * md_open(), by passing a lmm_size = 0.
551 * It is up to the application to ensure no other processes open this file
552 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
553 * used. We might be able to avoid races of that sort by getting lli_open_sem
554 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
555 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
557 int ll_file_open(struct inode *inode, struct file *file)
559 struct ll_inode_info *lli = ll_i2info(inode);
560 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
561 .it_flags = file->f_flags };
562 struct obd_client_handle **och_p = NULL;
563 __u64 *och_usecount = NULL;
564 struct ll_file_data *fd;
565 int rc = 0, opendir_set = 0;
568 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
569 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* The lookup intent (if any) was stashed in private_data by the lookup path. */
571 it = file->private_data; /* XXX: compat macro */
572 file->private_data = NULL; /* prevent ll_local_open assertion */
574 fd = ll_file_data_get();
576 GOTO(out_openerr, rc = -ENOMEM);
/* First opener of a directory becomes the statahead owner. */
579 if (S_ISDIR(inode->i_mode)) {
580 spin_lock(&lli->lli_sa_lock);
581 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
582 lli->lli_opendir_pid == 0) {
583 lli->lli_opendir_key = fd;
584 lli->lli_opendir_pid = current_pid();
587 spin_unlock(&lli->lli_sa_lock);
/* Root inode is special: no MDS open, just attach local state. */
590 if (inode->i_sb->s_root == file->f_dentry) {
591 LUSTRE_FPRIVATE(file) = fd;
/* No intent (or no disposition): build our own IT_OPEN intent from f_flags. */
595 if (!it || !it->d.lustre.it_disposition) {
596 /* Convert f_flags into access mode. We cannot use file->f_mode,
597 * because everything but O_ACCMODE mask was stripped from
599 if ((oit.it_flags + 1) & O_ACCMODE)
601 if (file->f_flags & O_TRUNC)
602 oit.it_flags |= FMODE_WRITE;
604 /* kernel only call f_op->open in dentry_open. filp_open calls
605 * dentry_open after call to open_namei that checks permissions.
606 * Only nfsd_open call dentry_open directly without checking
607 * permissions and because of that this code below is safe. */
608 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
609 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
611 /* We do not want O_EXCL here, presumably we opened the file
612 * already? XXX - NFS implications? */
613 oit.it_flags &= ~O_EXCL;
615 /* bug20584, if "it_flags" contains O_CREAT, the file will be
616 * created if necessary, then "IT_CREAT" should be set to keep
617 * consistent with it */
618 if (oit.it_flags & O_CREAT)
619 oit.it_op |= IT_CREAT;
625 /* Let's see if we have file open on MDS already. */
626 if (it->it_flags & FMODE_WRITE) {
627 och_p = &lli->lli_mds_write_och;
628 och_usecount = &lli->lli_open_fd_write_count;
629 } else if (it->it_flags & FMODE_EXEC) {
630 och_p = &lli->lli_mds_exec_och;
631 och_usecount = &lli->lli_open_fd_exec_count;
633 och_p = &lli->lli_mds_read_och;
634 och_usecount = &lli->lli_open_fd_read_count;
637 mutex_lock(&lli->lli_och_mutex);
638 if (*och_p) { /* Open handle is present */
639 if (it_disposition(it, DISP_OPEN_OPEN)) {
640 /* Well, there's extra open request that we do not need,
641 let's close it somehow. This will decref request. */
642 rc = it_open_error(DISP_OPEN_OPEN, it);
644 mutex_unlock(&lli->lli_och_mutex);
645 GOTO(out_openerr, rc);
648 ll_release_openhandle(file->f_dentry, it);
/* Reuse the existing MDS handle for this local open. */
652 rc = ll_local_open(file, it, fd, NULL);
655 mutex_unlock(&lli->lli_och_mutex);
656 GOTO(out_openerr, rc);
659 LASSERT(*och_usecount == 0);
660 if (!it->d.lustre.it_disposition) {
661 /* We cannot just request lock handle now, new ELC code
662 means that one of other OPEN locks for this file
663 could be cancelled, and since blocking ast handler
664 would attempt to grab och_mutex as well, that would
665 result in a deadlock */
666 mutex_unlock(&lli->lli_och_mutex);
667 it->it_create_mode |= M_CHECK_STALE;
668 rc = ll_intent_file_open(file, NULL, 0, it);
669 it->it_create_mode &= ~M_CHECK_STALE;
671 GOTO(out_openerr, rc);
675 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
677 GOTO(out_och_free, rc = -ENOMEM);
681 /* md_intent_lock() didn't get a request ref if there was an
682 * open error, so don't do cleanup on the request here
684 /* XXX (green): Should not we bail out on any error here, not
685 * just open error? */
686 rc = it_open_error(DISP_OPEN_OPEN, it);
688 GOTO(out_och_free, rc);
690 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
691 "inode %p: disposition %x, status %d\n", inode,
692 it_disposition(it, ~0), it->d.lustre.it_status);
694 rc = ll_local_open(file, it, fd, *och_p);
696 GOTO(out_och_free, rc);
698 mutex_unlock(&lli->lli_och_mutex);
701 /* Must do this outside lli_och_mutex lock to prevent deadlock where
702 different kind of OPEN lock for this same inode gets cancelled
703 by ldlm_cancel_lru */
704 if (!S_ISREG(inode->i_mode))
705 GOTO(out_och_free, rc);
/* Delay OST object creation for O_LOV_DELAY_CREATE / read-only opens. */
709 if (!lli->lli_has_smd &&
710 (cl_is_lov_delay_create(file->f_flags) ||
711 (file->f_mode & FMODE_WRITE) == 0)) {
712 CDEBUG(D_INODE, "object creation was delayed\n");
713 GOTO(out_och_free, rc);
715 cl_lov_delay_create_clear(&file->f_flags);
716 GOTO(out_och_free, rc);
/* Error/cleanup paths: free the och, stop statahead, drop fd, tally stats. */
720 if (och_p && *och_p) {
721 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
722 *och_p = NULL; /* OBD_FREE writes some magic there */
725 mutex_unlock(&lli->lli_och_mutex);
728 if (opendir_set != 0)
729 ll_stop_statahead(inode, lli->lli_opendir_key);
731 ll_file_data_put(fd);
733 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
736 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
737 ptlrpc_req_finished(it->d.lustre.it_data);
738 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* Blocking AST for a lease lock: on LDLM_CB_BLOCKING, asynchronously cancel
 * the lease lock (the lease is thereby "broken"); LDLM_CB_CANCELING handling
 * is not visible in this excerpt. Unlike ll_md_blocking_ast, this callback
 * does not touch the open handle. */
744 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
745 struct ldlm_lock_desc *desc, void *data, int flag)
748 struct lustre_handle lockh;
752 case LDLM_CB_BLOCKING:
753 ldlm_lock2handle(lock, &lockh);
754 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
756 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
760 case LDLM_CB_CANCELING:
768 * Acquire a lease and open the file.
/* Open @inode with a lease of mode @fmode (FMODE_READ or FMODE_WRITE only).
 * If @file is given, the existing open handle is reused (the caller must be
 * the sole opener in the matching mode) and its handle is passed to the MDT
 * as proof of same-owner. The lease lock is kept out of the LRU
 * (LDLM_FL_NO_LRU) and made exclusive (LDLM_FL_EXCL) so normal opens cannot
 * match it. Returns the och on success, ERR_PTR on failure. */
770 static struct obd_client_handle *
771 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
774 struct lookup_intent it = { .it_op = IT_OPEN };
775 struct ll_sb_info *sbi = ll_i2sbi(inode);
776 struct md_op_data *op_data;
777 struct ptlrpc_request *req;
778 struct lustre_handle old_handle = { 0 };
779 struct obd_client_handle *och = NULL;
784 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
785 RETURN(ERR_PTR(-EINVAL));
788 struct ll_inode_info *lli = ll_i2info(inode);
789 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
790 struct obd_client_handle **och_p;
/* Lease mode must be a subset of the file's open mode; exec is excluded. */
793 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
794 RETURN(ERR_PTR(-EPERM));
796 /* Get the openhandle of the file */
798 mutex_lock(&lli->lli_och_mutex);
799 if (fd->fd_lease_och != NULL) {
800 mutex_unlock(&lli->lli_och_mutex);
804 if (fd->fd_och == NULL) {
805 if (file->f_mode & FMODE_WRITE) {
806 LASSERT(lli->lli_mds_write_och != NULL);
807 och_p = &lli->lli_mds_write_och;
808 och_usecount = &lli->lli_open_fd_write_count;
810 LASSERT(lli->lli_mds_read_och != NULL);
811 och_p = &lli->lli_mds_read_och;
812 och_usecount = &lli->lli_open_fd_read_count;
/* NOTE(review): excerpt — the *och_usecount == 1 body (handle steal)
 * is not visible here. */
814 if (*och_usecount == 1) {
821 mutex_unlock(&lli->lli_och_mutex);
822 if (rc < 0) /* more than 1 opener */
825 LASSERT(fd->fd_och != NULL);
826 old_handle = fd->fd_och->och_fh;
831 RETURN(ERR_PTR(-ENOMEM));
833 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
834 LUSTRE_OPC_ANY, NULL);
836 GOTO(out, rc = PTR_ERR(op_data));
838 /* To tell the MDT this openhandle is from the same owner */
839 op_data->op_handle = old_handle;
841 it.it_flags = fmode | open_flags;
842 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
843 rc = md_intent_lock(sbi->ll_md_exp, op_data, NULL, 0, &it, 0, &req,
844 ll_md_blocking_lease_ast,
845 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
846 * it can be cancelled which may mislead applications that the lease is
848 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
849 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
850 * doesn't deal with openhandle, so normal openhandle will be leaked. */
851 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
852 ll_finish_md_op_data(op_data);
853 ptlrpc_req_finished(req);
855 GOTO(out_release_it, rc);
857 if (it_disposition(&it, DISP_LOOKUP_NEG))
858 GOTO(out_release_it, rc = -ENOENT);
860 rc = it_open_error(DISP_OPEN_OPEN, &it);
862 GOTO(out_release_it, rc);
864 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
865 ll_och_fill(sbi->ll_md_exp, &it, och);
867 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
868 GOTO(out_close, rc = -EOPNOTSUPP);
870 /* already get lease, handle lease lock */
871 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
872 if (it.d.lustre.it_lock_mode == 0 ||
873 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
874 /* open lock must return for lease */
875 CERROR(DFID "lease granted but no open lock, %d/%Lu.\n",
876 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
877 it.d.lustre.it_lock_bits);
878 GOTO(out_close, rc = -EPROTO);
881 ll_intent_release(&it);
/* Error path: drop the open lock (if granted), close the handle, release
 * the intent, and return the error as an ERR_PTR. */
885 /* Cancel open lock */
886 if (it.d.lustre.it_lock_mode != 0) {
887 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
888 it.d.lustre.it_lock_mode);
889 it.d.lustre.it_lock_mode = 0;
890 och->och_lease_handle.cookie = 0ULL;
892 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
894 CERROR("%s: error closing file "DFID": %d\n",
895 ll_get_fsname(inode->i_sb, NULL, 0),
896 PFID(&ll_i2info(inode)->lli_fid), rc2);
897 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
899 ll_intent_release(&it);
907 * Release lease and close the file.
908 * It will check if the lease has ever broken.
910 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
913 struct ldlm_lock *lock;
/* Default to "broken" when the lock can no longer be resolved locally. */
914 bool cancelled = true;
918 lock = ldlm_handle2lock(&och->och_lease_handle);
920 lock_res_and_lock(lock);
921 cancelled = ldlm_is_cancel(lock);
922 unlock_res_and_lock(lock);
926 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
927 PFID(&ll_i2info(inode)->lli_fid), cancelled);
/* Lease still intact: cancel it ourselves before closing the handle. */
930 ldlm_cli_cancel(&och->och_lease_handle, 0);
931 if (lease_broken != NULL)
932 *lease_broken = cancelled;
934 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
939 /* Fills the obdo with the attributes for the lsm */
/* Issue an async getattr to the OSTs for every stripe of @lsm and merge the
 * results into @obdo. @dv_flags (LL_DV_RD_FLUSH/LL_DV_WR_FLUSH) request
 * server-side cache flushes for data-version reads; with LL_DV_WR_FLUSH the
 * reply must confirm the flush (OBD_FL_FLUSH) or the call fails. */
940 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
941 struct obd_capa *capa, struct obdo *obdo,
942 __u64 ioepoch, int dv_flags)
944 struct ptlrpc_request_set *set;
945 struct obd_info oinfo = { { { 0 } } };
950 LASSERT(lsm != NULL);
954 oinfo.oi_oa->o_oi = lsm->lsm_oi;
955 oinfo.oi_oa->o_mode = S_IFREG;
956 oinfo.oi_oa->o_ioepoch = ioepoch;
957 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
958 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
959 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
960 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
961 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
962 OBD_MD_FLDATAVERSION;
963 oinfo.oi_capa = capa;
964 if (dv_flags & (LL_DV_WR_FLUSH | LL_DV_RD_FLUSH)) {
965 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
966 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
967 if (dv_flags & LL_DV_WR_FLUSH)
968 oinfo.oi_oa->o_flags |= OBD_FL_FLUSH;
971 set = ptlrpc_prep_set();
973 CERROR("can't allocate ptlrpc set\n");
976 rc = obd_getattr_async(exp, &oinfo, set);
978 rc = ptlrpc_set_wait(set);
979 ptlrpc_set_destroy(set);
/* Keep only the attribute bits the caller may rely on. */
982 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
983 OBD_MD_FLATIME | OBD_MD_FLMTIME |
984 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
985 OBD_MD_FLDATAVERSION | OBD_MD_FLFLAGS);
/* Writer flush was requested but the server did not confirm it. */
986 if (dv_flags & LL_DV_WR_FLUSH &&
987 !(oinfo.oi_oa->o_valid & OBD_MD_FLFLAGS &&
988 oinfo.oi_oa->o_flags & OBD_FL_FLUSH))
995 * Performs the getattr on the inode and updates its fields.
996 * If @sync != 0, perform the getattr under the server-side lock.
998 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
999 __u64 ioepoch, int sync)
1001 struct obd_capa *capa = ll_mdscapa_get(inode);
1002 struct lov_stripe_md *lsm;
1006 lsm = ccc_inode_lsm_get(inode);
1007 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
1008 capa, obdo, ioepoch, sync ? LL_DV_RD_FLUSH : 0);
1011 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
/* Fold the OST attributes back into the VFS inode. */
1013 obdo_refresh_inode(inode, obdo, obdo->o_valid);
1014 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
1015 " blksize %lu\n", POSTID(oi), i_size_read(inode),
1016 (unsigned long long)inode->i_blocks,
1017 (unsigned long)ll_inode_blksize(inode));
1019 ccc_inode_lsm_put(inode, lsm);
/* Merge MDS-provided timestamps with OST-provided attributes into the VFS
 * inode, under the inode size lock: seed the lvb from lli_lvb, fetch cl
 * attributes from the object, take the max of each timestamp, and update
 * i_size/i_blocks and the timestamps accordingly. */
1023 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
1025 struct ll_inode_info *lli = ll_i2info(inode);
1026 struct cl_object *obj = lli->lli_clob;
1027 struct cl_attr *attr = ccc_env_thread_attr(env);
1033 ll_inode_size_lock(inode);
1034 /* merge timestamps the most recently obtained from mds with
1035 timestamps obtained from osts */
1036 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
1037 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
1038 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
1039 inode_init_lvb(inode, &lvb);
1041 cl_object_attr_lock(obj);
1042 rc = cl_object_attr_get(env, obj, attr);
1043 cl_object_attr_unlock(obj);
/* Keep the most recent timestamp from either source. */
1046 if (lvb.lvb_atime < attr->cat_atime)
1047 lvb.lvb_atime = attr->cat_atime;
1048 if (lvb.lvb_ctime < attr->cat_ctime)
1049 lvb.lvb_ctime = attr->cat_ctime;
1050 if (lvb.lvb_mtime < attr->cat_mtime)
1051 lvb.lvb_mtime = attr->cat_mtime;
1053 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
1054 PFID(&lli->lli_fid), attr->cat_size);
1055 cl_isize_write_nolock(inode, attr->cat_size);
1057 inode->i_blocks = attr->cat_blocks;
1059 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1060 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1061 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1063 ll_inode_size_unlock(inode);
/* Glimpse ioctl helper: fetch current OST attributes for @lsm and copy
 * size/blocks/timestamps into the user-visible stat structure @st. */
1068 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1071 struct obdo obdo = { 0 };
1074 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1076 st->st_size = obdo.o_size;
1077 st->st_blocks = obdo.o_blocks;
1078 st->st_mtime = obdo.o_mtime;
1079 st->st_atime = obdo.o_atime;
1080 st->st_ctime = obdo.o_ctime;
/* Return true if atime updates should be suppressed for @file, checking the
 * same conditions as the kernel's file_accessed()/touch_atime(): O_NOATIME,
 * inode S_NOATIME/IS_NOATIME, mount noatime/read-only, and nodiratime (mount
 * or sb flag) for directories. */
1085 static bool file_is_noatime(const struct file *file)
1087 const struct vfsmount *mnt = file->f_path.mnt;
1088 const struct inode *inode = file->f_path.dentry->d_inode;
1090 /* Adapted from file_accessed() and touch_atime().*/
1091 if (file->f_flags & O_NOATIME)
1094 if (inode->i_flags & S_NOATIME)
1097 if (IS_NOATIME(inode))
1100 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1103 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1106 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/* Initialize a cl_io for read/write on @file from the open flags:
 * nonblock/append/sync flags, the cl object, the lock-request policy
 * (never for nolock files, mandatory for O_APPEND, maybe otherwise) and
 * the noatime decision. */
1112 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
1114 struct inode *inode = file->f_dentry->d_inode;
1116 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1118 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1119 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1120 file->f_flags & O_DIRECT ||
1123 io->ci_obj = ll_i2info(inode)->lli_clob;
1124 io->ci_lockreq = CILR_MAYBE;
1125 if (ll_file_nolock(file)) {
1126 io->ci_lockreq = CILR_NEVER;
1127 io->ci_no_srvlock = 1;
1128 } else if (file->f_flags & O_APPEND) {
1129 io->ci_lockreq = CILR_MANDATORY;
1132 io->ci_noatime = file_is_noatime(file);
/* Common driver for file reads and writes (normal and splice): set up a
 * cl_io, take lli_write_mutex for non-grouplock writes and lli_trunc_sem for
 * normal IO, run the cl_io loop, handle restart for short transfers, update
 * *ppos and per-sb stats, and track write failure on the fd.
 * NOTE(review): excerpt — case labels of the io-subtype switch and the
 * restart goto are not visible in this listing. */
1136 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1137 struct file *file, enum cl_io_type iot,
1138 loff_t *ppos, size_t count)
1140 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
1141 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1146 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zd\n",
1147 file->f_dentry->d_name.name, iot, *ppos, count);
1150 io = ccc_env_thread_io(env);
1151 ll_io_init(io, file, iot == CIT_WRITE);
1153 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1154 struct vvp_io *vio = vvp_env_io(env);
1155 struct ccc_io *cio = ccc_env_io(env);
1156 int write_mutex_locked = 0;
1158 cio->cui_fd = LUSTRE_FPRIVATE(file);
1159 vio->cui_io_subtype = args->via_io_subtype;
1161 switch (vio->cui_io_subtype) {
1163 cio->cui_iov = args->u.normal.via_iov;
1164 cio->cui_nrsegs = args->u.normal.via_nrsegs;
1165 cio->cui_tot_nrsegs = cio->cui_nrsegs;
1166 cio->cui_iocb = args->u.normal.via_iocb;
/* Serialize non-grouplock writes against each other. */
1167 if ((iot == CIT_WRITE) &&
1168 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1169 if (mutex_lock_interruptible(&lli->
1171 GOTO(out, result = -ERESTARTSYS);
1172 write_mutex_locked = 1;
1174 down_read(&lli->lli_trunc_sem);
1177 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1178 vio->u.splice.cui_flags = args->u.splice.via_flags;
1181 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
1184 result = cl_io_loop(env, io);
1185 if (args->via_io_subtype == IO_NORMAL)
1186 up_read(&lli->lli_trunc_sem);
1187 if (write_mutex_locked)
1188 mutex_unlock(&lli->lli_write_mutex);
1190 /* cl_io_rw_init() handled IO */
1191 result = io->ci_result;
1194 if (io->ci_nob > 0) {
1195 result = io->ci_nob;
1196 *ppos = io->u.ci_wr.wr.crw_pos;
1200 cl_io_fini(env, io);
1201 /* If any bit been read/written (result != 0), we just return
1202 * short read/write instead of restart io. */
1203 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1204 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
1205 iot == CIT_READ ? "read" : "write",
1206 file->f_dentry->d_name.name, *ppos, count);
1207 LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
1211 if (iot == CIT_READ) {
1213 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1214 LPROC_LL_READ_BYTES, result);
1215 } else if (iot == CIT_WRITE) {
1217 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1218 LPROC_LL_WRITE_BYTES, result);
1219 fd->fd_write_failed = false;
1220 } else if (result != -ERESTARTSYS) {
1221 fd->fd_write_failed = true;
1224 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1231 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate a user iovec array and compute the total byte count.
 * Segments after the first access_ok() failure are truncated
 * (nr_segs is trimmed), mirroring the kernel helper this copies.
 * NOTE(review): partial extract -- the declarations of 'seg'/'cnt'
 * and the return paths are not visible here.
 */
1233 static int ll_file_get_iov_count(const struct iovec *iov,
1234 unsigned long *nr_segs, size_t *count)
1239 for (seg = 0; seg < *nr_segs; seg++) {
1240 const struct iovec *iv = &iov[seg];
1243 * If any segment has a negative length, or the cumulative
1244 * length ever wraps negative then return -EINVAL.
1247 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1249 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1254 cnt -= iv->iov_len; /* This segment is no good */
/*
 * ->aio_read handler: validate the iovec, package it into vvp_io_args
 * and hand off to ll_file_io_generic() with CIT_READ.
 * The iocb's ki_pos is updated in place by the generic path.
 */
1261 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1262 unsigned long nr_segs, loff_t pos)
1265 struct vvp_io_args *args;
1271 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1275 env = cl_env_get(&refcheck);
1277 RETURN(PTR_ERR(env));
1279 args = vvp_env_args(env, IO_NORMAL);
/* cast drops const: the args struct stores a mutable iovec pointer */
1280 args->u.normal.via_iov = (struct iovec *)iov;
1281 args->u.normal.via_nrsegs = nr_segs;
1282 args->u.normal.via_iocb = iocb;
1284 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1285 &iocb->ki_pos, count);
1286 cl_env_put(env, &refcheck);
/*
 * ->read handler: wrap the user buffer in a single-segment iovec and a
 * synchronous kiocb (both stored in per-env scratch space), then reuse
 * the aio_read path.  *ppos is updated from the kiocb afterwards.
 */
1290 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1294 struct iovec *local_iov;
1295 struct kiocb *kiocb;
1300 env = cl_env_get(&refcheck);
1302 RETURN(PTR_ERR(env));
1304 local_iov = &vvp_env_info(env)->vti_local_iov;
1305 kiocb = &vvp_env_info(env)->vti_kiocb;
1306 local_iov->iov_base = (void __user *)buf;
1307 local_iov->iov_len = count;
1308 init_sync_kiocb(kiocb, file);
1309 kiocb->ki_pos = *ppos;
/* kernels renamed ki_left -> ki_nbytes; pick the field this kernel has */
1310 #ifdef HAVE_KIOCB_KI_LEFT
1311 kiocb->ki_left = count;
1313 kiocb->ki_nbytes = count;
1316 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1317 *ppos = kiocb->ki_pos;
1319 cl_env_put(env, &refcheck);
1324 * Write to a file (through the page cache).
/*
 * ->aio_write handler: mirror image of ll_file_aio_read() -- validate
 * the iovec, fill vvp_io_args and run ll_file_io_generic() as CIT_WRITE.
 */
1327 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1328 unsigned long nr_segs, loff_t pos)
1331 struct vvp_io_args *args;
1337 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1341 env = cl_env_get(&refcheck);
1343 RETURN(PTR_ERR(env));
1345 args = vvp_env_args(env, IO_NORMAL);
/* cast drops const: the args struct stores a mutable iovec pointer */
1346 args->u.normal.via_iov = (struct iovec *)iov;
1347 args->u.normal.via_nrsegs = nr_segs;
1348 args->u.normal.via_iocb = iocb;
1350 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1351 &iocb->ki_pos, count);
1352 cl_env_put(env, &refcheck);
/*
 * ->write handler: synchronous counterpart of ll_file_aio_write(),
 * built the same way as ll_file_read() -- one local iovec plus a sync
 * kiocb from the env scratch area, position copied back on completion.
 */
1356 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1360 struct iovec *local_iov;
1361 struct kiocb *kiocb;
1366 env = cl_env_get(&refcheck);
1368 RETURN(PTR_ERR(env));
1370 local_iov = &vvp_env_info(env)->vti_local_iov;
1371 kiocb = &vvp_env_info(env)->vti_kiocb;
1372 local_iov->iov_base = (void __user *)buf;
1373 local_iov->iov_len = count;
1374 init_sync_kiocb(kiocb, file);
1375 kiocb->ki_pos = *ppos;
/* kernels renamed ki_left -> ki_nbytes; pick the field this kernel has */
1376 #ifdef HAVE_KIOCB_KI_LEFT
1377 kiocb->ki_left = count;
1379 kiocb->ki_nbytes = count;
1382 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1383 *ppos = kiocb->ki_pos;
1385 cl_env_put(env, &refcheck);
1390 * Send file content (through pagecache) somewhere with helper
/*
 * ->splice_read handler: set up IO_SPLICE args (pipe + flags) and run
 * the common read path; byte accounting is done by ll_file_io_generic().
 */
1392 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1393 struct pipe_inode_info *pipe, size_t count,
1397 struct vvp_io_args *args;
1402 env = cl_env_get(&refcheck);
1404 RETURN(PTR_ERR(env));
1406 args = vvp_env_args(env, IO_SPLICE);
1407 args->u.splice.via_pipe = pipe;
1408 args->u.splice.via_flags = flags;
1410 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1411 cl_env_put(env, &refcheck);
/*
 * Re-create the OST object(s) for @inode identified by @oi on @ost_idx.
 * Duplicates the in-memory lsm, marks the obdo with
 * OBD_FL_RECREATE_OBJS and asks the data export to (re)create the
 * objects under the inode size lock.
 * NOTE(review): partial extract -- oa allocation and the out/error
 * labels are not visible here.
 */
1415 static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
1418 struct obd_export *exp = ll_i2dtexp(inode);
1419 struct obd_trans_info oti = { 0 };
1420 struct obdo *oa = NULL;
1423 struct lov_stripe_md *lsm = NULL, *lsm2;
1430 lsm = ccc_inode_lsm_get(inode);
/* nothing to recreate for an inode without OST objects */
1431 if (!lsm_has_objects(lsm))
1432 GOTO(out, rc = -ENOENT);
1434 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1435 (lsm->lsm_stripe_count));
1437 OBD_ALLOC_LARGE(lsm2, lsm_size);
1439 GOTO(out, rc = -ENOMEM);
/* o_nlink carries the target OST index for the recreate request */
1442 oa->o_nlink = ost_idx;
1443 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1444 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1445 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1446 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1447 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1448 memcpy(lsm2, lsm, lsm_size);
1449 ll_inode_size_lock(inode);
1450 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1451 ll_inode_size_unlock(inode);
1453 OBD_FREE_LARGE(lsm2, lsm_size);
1456 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_RECREATE_OBJ: copy an ll_recreate_obj request from userspace
 * (admin only), build an MDT0-sequence ost_id from the object id and
 * delegate to ll_lov_recreate().
 */
1461 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1463 struct ll_recreate_obj ucreat;
1467 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1470 if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1474 ostid_set_seq_mdt0(&oi);
1475 ostid_set_id(&oi, ucreat.lrc_id);
1476 RETURN(ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx));
/*
 * LL_IOC_RECREATE_FID: like ll_lov_recreate_obj() but the target object
 * is named by a FID; the OST index is decoded from the FID sequence.
 */
1479 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1486 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1489 if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1492 fid_to_ostid(&fid, &oi);
/* IDIF FIDs embed the OST index in bits 16..31 of the sequence */
1493 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1494 RETURN(ll_lov_recreate(inode, &oi, ost_idx));
/*
 * Apply a user-supplied striping EA (@lum, @lum_size bytes) to @inode
 * by re-opening the file with an IT_OPEN intent carrying the layout.
 * Fails with -EEXIST if the inode already has a layout.
 * NOTE(review): partial extract -- the out/out_unlock/out_req_free
 * label lines themselves are not visible here.
 */
1497 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1498 __u64 flags, struct lov_user_md *lum,
1501 struct lov_stripe_md *lsm = NULL;
1502 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1506 lsm = ccc_inode_lsm_get(inode);
/* a layout can only be set once; an existing lsm means EEXIST */
1508 ccc_inode_lsm_put(inode, lsm);
1509 CDEBUG(D_IOCTL, "stripe already exists for inode "DFID"\n",
1510 PFID(ll_inode2fid(inode)));
1511 GOTO(out, rc = -EEXIST);
1514 ll_inode_size_lock(inode);
1515 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1517 GOTO(out_unlock, rc);
1518 rc = oit.d.lustre.it_status;
1520 GOTO(out_req_free, rc);
/* the open handle from the intent is not needed; close it again */
1522 ll_release_openhandle(file->f_dentry, &oit);
1525 ll_inode_size_unlock(inode);
1526 ll_intent_release(&oit);
1527 ccc_inode_lsm_put(inode, lsm);
1529 cl_lov_delay_create_clear(&file->f_flags);
1532 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA (striping info) for @filename (a child of directory
 * @inode) from the MDS via md_getattr_name().
 *
 * \param lmmp    [out] pointer into the reply buffer holding the EA
 * \param lmm_size[out] size of the EA
 * \param request [out] the ptlrpc request the EA lives in; presumably
 *                the caller releases it -- confirm against callers.
 *
 * On-wire data is little endian; it is swabbed to host order here
 * before being handed back (directories skip swabbing the per-object
 * array since those entries do not exist for a default layout).
 */
1536 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1537 struct lov_mds_md **lmmp, int *lmm_size,
1538 struct ptlrpc_request **request)
1540 struct ll_sb_info *sbi = ll_i2sbi(inode);
1541 struct mdt_body *body;
1542 struct lov_mds_md *lmm = NULL;
1543 struct ptlrpc_request *req = NULL;
1544 struct md_op_data *op_data;
1547 rc = ll_get_default_mdsize(sbi, &lmmsize);
1551 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1552 strlen(filename), lmmsize,
1553 LUSTRE_OPC_ANY, NULL);
1554 if (IS_ERR(op_data))
1555 RETURN(PTR_ERR(op_data));
1557 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1558 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1559 ll_finish_md_op_data(op_data);
1561 CDEBUG(D_INFO, "md_getattr_name failed "
1562 "on %s: rc %d\n", filename, rc);
1566 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1567 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1569 lmmsize = body->eadatasize;
/* no EA bits valid or empty EA -> the file simply has no striping */
1571 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1573 GOTO(out, rc = -ENODATA);
1576 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1577 LASSERT(lmm != NULL);
/* only plain V1/V3 layouts are understood here */
1579 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1580 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1581 GOTO(out, rc = -EPROTO);
1585 * This is coming from the MDS, so is probably in
1586 * little endian. We convert it to host endian before
1587 * passing it to userspace.
/* swab only needed on big-endian hosts (LOV_MAGIC differs from LE form) */
1589 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1592 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1593 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1596 /* if function called for directory - we should
1597 * avoid swab not existent lsm objects */
1598 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1599 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1600 if (S_ISREG(body->mode))
1601 lustre_swab_lov_user_md_objects(
1602 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1604 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1605 lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1606 if (S_ISREG(body->mode))
1607 lustre_swab_lov_user_md_objects(
1608 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1615 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA: admin-only path to set a striping EA that includes
 * explicit object data (MDS_OPEN_HAS_OBJS).  Copies the lov_user_md
 * from userspace and applies it via ll_lov_setstripe_ea_info().
 */
1620 static int ll_lov_setea(struct inode *inode, struct file *file,
1623 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1624 struct lov_user_md *lump;
/* fixed size: header plus exactly one lov_user_ost_data entry */
1625 int lum_size = sizeof(struct lov_user_md) +
1626 sizeof(struct lov_user_ost_data);
1630 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1633 OBD_ALLOC_LARGE(lump, lum_size);
1637 if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1638 OBD_FREE_LARGE(lump, lum_size);
1642 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1644 OBD_FREE_LARGE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE: copy the user's striping request (v1 first,
 * re-copy as v3 if the magic says so), apply it, and on success echo
 * the resulting layout back into the user's buffer.
 */
1648 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1651 struct lov_user_md_v3 lumv3;
1652 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1653 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1654 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1656 __u64 flags = FMODE_WRITE;
1659 /* first try with v1 which is smaller than v3 */
1660 lum_size = sizeof(struct lov_user_md_v1);
1661 if (copy_from_user(lumv1, lumv1p, lum_size))
1664 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1665 lum_size = sizeof(struct lov_user_md_v3);
1666 if (copy_from_user(&lumv3, lumv3p, lum_size))
1670 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1672 struct lov_stripe_md *lsm;
/* zero the user's stripe count first in case GETSTRIPE fails below */
1675 put_user(0, &lumv1p->lmm_stripe_count);
1677 ll_layout_refresh(inode, &gen);
1678 lsm = ccc_inode_lsm_get(inode);
1679 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1680 0, lsm, (void *)arg);
1681 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_LOV_GETSTRIPE: hand the inode's current lsm to the LOV layer,
 * which serializes the striping info into the user buffer at @arg.
 */
1686 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1688 struct lov_stripe_md *lsm;
1692 lsm = ccc_inode_lsm_get(inode);
1694 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1696 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_GROUP_LOCK: take a group (GID) lock on the file for this fd.
 * lli_lock guards fd_flags/fd_grouplock; the lock itself is acquired
 * outside the spinlock, so a lost race with another thread is detected
 * afterwards and the redundant grouplock is dropped.
 */
1701 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1703 struct ll_inode_info *lli = ll_i2info(inode);
1704 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1705 struct ccc_grouplock grouplock;
1709 if (ll_file_nolock(file))
1710 RETURN(-EOPNOTSUPP);
1712 spin_lock(&lli->lli_lock);
/* only one group lock per file descriptor is allowed */
1713 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1714 CWARN("group lock already existed with gid %lu\n",
1715 fd->fd_grouplock.cg_gid);
1716 spin_unlock(&lli->lli_lock);
1719 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1720 spin_unlock(&lli->lli_lock);
/* may block unless the file was opened O_NONBLOCK */
1722 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1723 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1727 spin_lock(&lli->lli_lock);
1728 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1729 spin_unlock(&lli->lli_lock);
1730 CERROR("another thread just won the race\n");
1731 cl_put_grouplock(&grouplock);
1735 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1736 fd->fd_grouplock = grouplock;
1737 spin_unlock(&lli->lli_lock);
1739 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK: release the group lock held on this fd, after
 * verifying one is held and its gid matches @arg.  The grouplock is
 * copied out and cleared under lli_lock, then released outside it.
 */
1743 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1745 struct ll_inode_info *lli = ll_i2info(inode);
1746 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1747 struct ccc_grouplock grouplock;
1750 spin_lock(&lli->lli_lock);
1751 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1752 spin_unlock(&lli->lli_lock);
1753 CWARN("no group lock held\n");
1756 LASSERT(fd->fd_grouplock.cg_lock != NULL);
1758 if (fd->fd_grouplock.cg_gid != arg) {
1759 CWARN("group lock %lu doesn't match current id %lu\n",
1760 arg, fd->fd_grouplock.cg_gid);
1761 spin_unlock(&lli->lli_lock);
1765 grouplock = fd->fd_grouplock;
1766 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1767 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1768 spin_unlock(&lli->lli_lock);
/* drop the DLM reference outside the spinlock */
1770 cl_put_grouplock(&grouplock);
1771 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1776 * Close inode open handle
1778 * \param dentry [in] dentry which contains the inode
1779 * \param it [in,out] intent which contains open info and result
1782 * \retval <0 failure
/*
 * Closes the MDS open handle that an IT_OPEN intent left behind when
 * the caller does not intend to keep the file open (e.g. setstripe).
 * No-op for the filesystem root or when the intent carries no open.
 */
1784 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1786 struct inode *inode = dentry->d_inode;
1787 struct obd_client_handle *och;
1793 /* Root ? Do nothing. */
1794 if (dentry->d_inode->i_sb->s_root == dentry)
1797 /* No open handle to close? Move away */
1798 if (!it_disposition(it, DISP_OPEN_OPEN))
1801 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1803 OBD_ALLOC(och, sizeof(*och));
1805 GOTO(out, rc = -ENOMEM);
1807 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1809 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1812 /* this one is in place of ll_file_open */
1813 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1814 ptlrpc_req_finished(it->d.lustre.it_data);
1815 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1821 * Get size for inode for which FIEMAP mapping is requested.
1822 * Make the FIEMAP get_info call and returns the result.
/*
 * Core FIEMAP implementation: validates flags, optionally flushes dirty
 * pages (FIEMAP_FLAG_SYNC), then asks the data export (KEY_FIEMAP) to
 * fill @fiemap (@num_bytes is the size of the whole user buffer).
 */
1824 static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1827 struct obd_export *exp = ll_i2dtexp(inode);
1828 struct lov_stripe_md *lsm = NULL;
1829 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1830 __u32 vallen = num_bytes;
1834 /* Checks for fiemap flags */
1835 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* report back which flags we do not support */
1836 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1840 /* Check for FIEMAP_FLAG_SYNC */
1841 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1842 rc = filemap_fdatawrite(inode->i_mapping);
1847 lsm = ccc_inode_lsm_get(inode);
1851 /* If the stripe_count > 1 and the application does not understand
1852 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1854 if (lsm->lsm_stripe_count > 1 &&
1855 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1856 GOTO(out, rc = -EOPNOTSUPP);
1858 fm_key.oa.o_oi = lsm->lsm_oi;
1859 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1861 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1862 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1863 /* If filesize is 0, then there would be no objects for mapping */
1864 if (fm_key.oa.o_size == 0) {
1865 fiemap->fm_mapped_extents = 0;
1869 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1871 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1874 CERROR("obd_get_info failed: rc = %d\n", rc);
1877 ccc_inode_lsm_put(inode, lsm);
/*
 * OBD_IOC_FID2PATH: resolve a FID to a pathname via the MDC.  The user
 * passes a getinfo_fid2path header whose gf_pathlen sizes the output
 * buffer; the whole (header + path) result is copied back to @arg.
 * Restricted unless the mount allows user fid2path or the caller has
 * DAC_READ_SEARCH.
 */
1881 int ll_fid2path(struct inode *inode, void *arg)
1883 struct obd_export *exp = ll_i2mdexp(inode);
1884 struct getinfo_fid2path *gfout, *gfin;
1888 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1889 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1892 /* Need to get the buflen */
1893 OBD_ALLOC_PTR(gfin);
1896 if (copy_from_user(gfin, arg, sizeof(*gfin))) {
/* output buffer = fixed header + caller-requested path length */
1901 outsize = sizeof(*gfout) + gfin->gf_pathlen;
1902 OBD_ALLOC(gfout, outsize);
1903 if (gfout == NULL) {
1907 memcpy(gfout, gfin, sizeof(*gfout));
1910 /* Call mdc_iocontrol */
1911 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1915 if (copy_to_user(arg, gfout, outsize))
1919 OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP: userspace entry for FIEMAP.  Sizes a kernel buffer
 * from the user's fm_extent_count, copies the request (and, if extents
 * were requested, the first extent used as a continuation cookie) in,
 * runs ll_do_fiemap() and copies header + mapped extents back out.
 */
1923 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1925 struct ll_user_fiemap *fiemap_s;
1926 size_t num_bytes, ret_bytes;
1927 unsigned int extent_count;
1930 /* Get the extent count so we can calculate the size of
1931 * required fiemap buffer */
1932 if (get_user(extent_count,
1933 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
/* NOTE(review): extent_count * sizeof(extent) could overflow for huge
 * user-supplied counts -- confirm against upstream hardening */
1935 num_bytes = sizeof(*fiemap_s) + (extent_count *
1936 sizeof(struct ll_fiemap_extent));
1938 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1939 if (fiemap_s == NULL)
1942 /* get the fiemap value */
1943 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1945 GOTO(error, rc = -EFAULT);
1947 /* If fm_extent_count is non-zero, read the first extent since
1948 * it is used to calculate end_offset and device from previous
1951 if (copy_from_user(&fiemap_s->fm_extents[0],
1952 (char __user *)arg + sizeof(*fiemap_s),
1953 sizeof(struct ll_fiemap_extent)))
1954 GOTO(error, rc = -EFAULT);
1957 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1961 ret_bytes = sizeof(struct ll_user_fiemap);
1963 if (extent_count != 0)
1964 ret_bytes += (fiemap_s->fm_mapped_extents *
1965 sizeof(struct ll_fiemap_extent));
1967 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1971 OBD_FREE_LARGE(fiemap_s, num_bytes);
1976 * Read the data_version for inode.
1978 * This value is computed using stripe object version on OST.
1979 * Version is computed using server side locking.
1981 * @param sync if do sync on the OST side;
1983 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1984 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/*
 * Returns 0 with *data_version filled, or a negative errno.  Files
 * without OST objects are treated as version 0 (see below).
 */
1986 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
1988 struct lov_stripe_md *lsm = NULL;
1989 struct ll_sb_info *sbi = ll_i2sbi(inode);
1990 struct obdo *obdo = NULL;
1994 /* If no stripe, we consider version is 0. */
1995 lsm = ccc_inode_lsm_get(inode);
1996 if (!lsm_has_objects(lsm)) {
1998 CDEBUG(D_INODE, "No object for inode\n");
2002 OBD_ALLOC_PTR(obdo);
2004 GOTO(out, rc = -ENOMEM);
2006 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, flags);
/* the OSTs must actually report a data version for it to be usable */
2008 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
2011 *data_version = obdo->o_data_version;
2017 ccc_inode_lsm_put(inode, lsm);
2022 * Trigger a HSM release request for the provided inode.
/*
 * Takes a write lease, flushes and records the latest data_version,
 * merges OST attributes into the inode, then closes the open handle
 * with MDS_OPEN_RELEASE so the MDT can mark the file released.
 */
2024 int ll_hsm_release(struct inode *inode)
2026 struct cl_env_nest nest;
2028 struct obd_client_handle *och = NULL;
2029 __u64 data_version = 0;
2033 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2034 ll_get_fsname(inode->i_sb, NULL, 0),
2035 PFID(&ll_i2info(inode)->lli_fid));
2037 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2039 GOTO(out, rc = PTR_ERR(och));
2041 /* Grab latest data_version and [am]time values */
2042 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2046 env = cl_env_nested_get(&nest);
2048 GOTO(out, rc = PTR_ERR(env));
2050 ll_merge_lvb(env, inode);
2051 cl_env_nested_put(&nest, env);
2053 /* Release the file.
2054 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2055 * we still need it to pack l_remote_handle to MDT. */
2056 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
/* error path: if the lease was taken, close it before returning */
2062 if (och != NULL && !IS_ERR(och)) /* close the file */
2063 ll_lease_close(och, inode, NULL);
/* Scratch state for ll_swap_layouts(): both inodes, their saved
 * [am]times (for SWAP_LAYOUTS_KEEP_*), and data-version check flags. */
2068 struct ll_swap_stack {
2069 struct iattr ia1, ia2;
2071 struct inode *inode1, *inode2;
2072 bool check_dv1, check_dv2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS: atomically exchange the layouts of @file1
 * and @file2 on the MDT.
 *
 * Steps: permission/same-fs checks, FID-order canonicalization (so two
 * concurrent swaps cannot deadlock), optional group locks to flush
 * dirty cache, optional data-version checks, the swap RPC itself, and
 * optional restoration of mtime/atime.
 * NOTE(review): partial extract -- gid/dv declarations and several
 * labels are outside the visible lines.
 */
2075 static int ll_swap_layouts(struct file *file1, struct file *file2,
2076 struct lustre_swap_layouts *lsl)
2078 struct mdc_swap_layouts msl;
2079 struct md_op_data *op_data;
2082 struct ll_swap_stack *llss = NULL;
2085 OBD_ALLOC_PTR(llss);
2089 llss->inode1 = file1->f_dentry->d_inode;
2090 llss->inode2 = file2->f_dentry->d_inode;
2092 if (!S_ISREG(llss->inode2->i_mode))
2093 GOTO(free, rc = -EINVAL);
2095 if (inode_permission(llss->inode1, MAY_WRITE) ||
2096 inode_permission(llss->inode2, MAY_WRITE))
2097 GOTO(free, rc = -EPERM);
2099 if (llss->inode2->i_sb != llss->inode1->i_sb)
2100 GOTO(free, rc = -EXDEV);
2102 /* we use 2 bool because it is easier to swap than 2 bits */
2103 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2104 llss->check_dv1 = true;
2106 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2107 llss->check_dv2 = true;
2109 /* we cannot use lsl->sl_dvX directly because we may swap them */
2110 llss->dv1 = lsl->sl_dv1;
2111 llss->dv2 = lsl->sl_dv2;
2113 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2114 if (rc == 0) /* same file, done! */
/* order the pair by FID so lock acquisition order is globally stable */
2117 if (rc < 0) { /* sequentialize it */
2118 swap(llss->inode1, llss->inode2);
2120 swap(llss->dv1, llss->dv2);
2121 swap(llss->check_dv1, llss->check_dv2);
2125 if (gid != 0) { /* application asks to flush dirty cache */
2126 rc = ll_get_grouplock(llss->inode1, file1, gid);
2130 rc = ll_get_grouplock(llss->inode2, file2, gid);
2132 ll_put_grouplock(llss->inode1, file1, gid);
2137 /* to be able to restore mtime and atime after swap
2138 * we need to first save them */
2140 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2141 llss->ia1.ia_mtime = llss->inode1->i_mtime;
2142 llss->ia1.ia_atime = llss->inode1->i_atime;
2143 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2144 llss->ia2.ia_mtime = llss->inode2->i_mtime;
2145 llss->ia2.ia_atime = llss->inode2->i_atime;
2146 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2149 /* ultimate check, before swaping the layouts we check if
2150 * dataversion has changed (if requested) */
2151 if (llss->check_dv1) {
2152 rc = ll_data_version(llss->inode1, &dv, 0);
2155 if (dv != llss->dv1)
2156 GOTO(putgl, rc = -EAGAIN);
2159 if (llss->check_dv2) {
2160 rc = ll_data_version(llss->inode2, &dv, 0);
2163 if (dv != llss->dv2)
2164 GOTO(putgl, rc = -EAGAIN);
2167 /* struct md_op_data is used to send the swap args to the mdt
2168 * only flags is missing, so we use struct mdc_swap_layouts
2169 * through the md_op_data->op_data */
2170 /* flags from user space have to be converted before they are send to
2171 * server, no flag is sent today, they are only used on the client */
2174 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2175 0, LUSTRE_OPC_ANY, &msl);
2176 if (IS_ERR(op_data))
2177 GOTO(free, rc = PTR_ERR(op_data));
2179 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2180 sizeof(*op_data), op_data, NULL);
2181 ll_finish_md_op_data(op_data);
/* putgl: drop the group locks in reverse acquisition order */
2185 ll_put_grouplock(llss->inode2, file2, gid);
2186 ll_put_grouplock(llss->inode1, file1, gid);
2189 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2193 /* clear useless flags */
2194 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2195 llss->ia1.ia_valid &= ~ATTR_MTIME;
2196 llss->ia2.ia_valid &= ~ATTR_MTIME;
2199 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2200 llss->ia1.ia_valid &= ~ATTR_ATIME;
2201 llss->ia2.ia_valid &= ~ATTR_ATIME;
2204 /* update time if requested */
/* the layouts were exchanged, so ia2's saved times go to inode1 and
 * vice versa */
2206 if (llss->ia2.ia_valid != 0) {
2207 mutex_lock(&llss->inode1->i_mutex);
2208 rc = ll_setattr(file1->f_dentry, &llss->ia2);
2209 mutex_unlock(&llss->inode1->i_mutex);
2212 if (llss->ia1.ia_valid != 0) {
2215 mutex_lock(&llss->inode2->i_mutex);
2216 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2217 mutex_unlock(&llss->inode2->i_mutex);
/*
 * Send an HSM_STATE_SET request for @inode to the MDT.  Changing flags
 * outside HSM_USER_MASK requires CAP_SYS_ADMIN.
 */
2229 static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2231 struct md_op_data *op_data;
2234 /* Non-root users are forbidden to set or clear flags which are
2235 * NOT defined in HSM_USER_MASK. */
2236 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2237 !cfs_capable(CFS_CAP_SYS_ADMIN))
2240 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2241 LUSTRE_OPC_ANY, hss);
2242 if (IS_ERR(op_data))
2243 RETURN(PTR_ERR(op_data));
2245 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2246 sizeof(*op_data), op_data, NULL);
2248 ll_finish_md_op_data(op_data);
/*
 * LL_IOC_HSM_IMPORT: register an already-archived file with HSM.
 * Marks the file ARCHIVED|EXISTS|RELEASED for the given archive id,
 * then forces mode/uid/gid/size/times from @hui onto the inode so it
 * matches the archived copy.
 */
2253 static int ll_hsm_import(struct inode *inode, struct file *file,
2254 struct hsm_user_import *hui)
2256 struct hsm_state_set *hss = NULL;
2257 struct iattr *attr = NULL;
2261 if (!S_ISREG(inode->i_mode))
2267 GOTO(out, rc = -ENOMEM);
2269 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2270 hss->hss_archive_id = hui->hui_archive_id;
2271 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2272 rc = ll_hsm_state_set(inode, hss);
2276 OBD_ALLOC_PTR(attr);
2278 GOTO(out, rc = -ENOMEM);
/* force a regular-file mode; only permission bits come from the user */
2280 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2281 attr->ia_mode |= S_IFREG;
2282 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2283 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2284 attr->ia_size = hui->hui_size;
2285 attr->ia_mtime.tv_sec = hui->hui_mtime;
2286 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2287 attr->ia_atime.tv_sec = hui->hui_atime;
2288 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2290 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2291 ATTR_UID | ATTR_GID |
2292 ATTR_MTIME | ATTR_MTIME_SET |
2293 ATTR_ATIME | ATTR_ATIME_SET;
2295 rc = ll_setattr_raw(file->f_dentry, attr, true);
/*
 * ->unlocked_ioctl handler: dispatch every llite file ioctl.  Most
 * cases are thin wrappers that copy arguments across the user boundary
 * and delegate to the helpers above; unrecognized commands fall through
 * to the ll_iocontrol_call() registry and finally to the data export.
 * NOTE(review): partial extract -- RETURN statements and some braces
 * between cases are not visible here.
 */
2310 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2312 struct inode *inode = file->f_dentry->d_inode;
2313 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2317 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2318 PFID(ll_inode2fid(inode)), inode, cmd);
2319 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2321 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2322 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2326 case LL_IOC_GETFLAGS:
2327 /* Get the current value of the file flags */
2328 return put_user(fd->fd_flags, (int *)arg);
2329 case LL_IOC_SETFLAGS:
2330 case LL_IOC_CLRFLAGS:
2331 /* Set or clear specific file flags */
2332 /* XXX This probably needs checks to ensure the flags are
2333 * not abused, and to handle any flag side effects.
2335 if (get_user(flags, (int *) arg))
2338 if (cmd == LL_IOC_SETFLAGS) {
/* IGNORE_LOCK only makes sense with O_DIRECT I/O */
2339 if ((flags & LL_FILE_IGNORE_LOCK) &&
2340 !(file->f_flags & O_DIRECT)) {
2341 CERROR("%s: unable to disable locking on "
2342 "non-O_DIRECT file\n", current->comm);
2346 fd->fd_flags |= flags;
2348 fd->fd_flags &= ~flags;
2351 case LL_IOC_LOV_SETSTRIPE:
2352 RETURN(ll_lov_setstripe(inode, file, arg));
2353 case LL_IOC_LOV_SETEA:
2354 RETURN(ll_lov_setea(inode, file, arg));
2355 case LL_IOC_LOV_SWAP_LAYOUTS: {
2357 struct lustre_swap_layouts lsl;
2359 if (copy_from_user(&lsl, (char *)arg,
2360 sizeof(struct lustre_swap_layouts)))
/* both files must be writable for a layout swap */
2363 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2366 file2 = fget(lsl.sl_fd);
2371 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2372 rc = ll_swap_layouts(file, file2, &lsl);
2376 case LL_IOC_LOV_GETSTRIPE:
2377 RETURN(ll_lov_getstripe(inode, arg));
2378 case LL_IOC_RECREATE_OBJ:
2379 RETURN(ll_lov_recreate_obj(inode, arg));
2380 case LL_IOC_RECREATE_FID:
2381 RETURN(ll_lov_recreate_fid(inode, arg));
2382 case FSFILT_IOC_FIEMAP:
2383 RETURN(ll_ioctl_fiemap(inode, arg));
2384 case FSFILT_IOC_GETFLAGS:
2385 case FSFILT_IOC_SETFLAGS:
2386 RETURN(ll_iocontrol(inode, file, cmd, arg));
2387 case FSFILT_IOC_GETVERSION_OLD:
2388 case FSFILT_IOC_GETVERSION:
2389 RETURN(put_user(inode->i_generation, (int *)arg));
2390 case LL_IOC_GROUP_LOCK:
2391 RETURN(ll_get_grouplock(inode, file, arg));
2392 case LL_IOC_GROUP_UNLOCK:
2393 RETURN(ll_put_grouplock(inode, file, arg));
2394 case IOC_OBD_STATFS:
2395 RETURN(ll_obd_statfs(inode, (void *)arg));
2397 /* We need to special case any other ioctls we want to handle,
2398 * to send them to the MDS/OST as appropriate and to properly
2399 * network encode the arg field.
2400 case FSFILT_IOC_SETVERSION_OLD:
2401 case FSFILT_IOC_SETVERSION:
2403 case LL_IOC_FLUSHCTX:
2404 RETURN(ll_flush_ctx(inode));
2405 case LL_IOC_PATH2FID: {
2406 if (copy_to_user((void *)arg, ll_inode2fid(inode),
2407 sizeof(struct lu_fid)))
2412 case OBD_IOC_FID2PATH:
2413 RETURN(ll_fid2path(inode, (void *)arg));
2414 case LL_IOC_DATA_VERSION: {
2415 struct ioc_data_version idv;
2418 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
/* sanitize: only the two flush flags are meaningful */
2421 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2422 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2424 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2430 case LL_IOC_GET_MDTIDX: {
2433 mdtidx = ll_get_mdt_idx(inode);
2437 if (put_user((int)mdtidx, (int*)arg))
2442 case OBD_IOC_GETDTNAME:
2443 case OBD_IOC_GETMDNAME:
2444 RETURN(ll_get_obd_name(inode, cmd, arg));
2445 case LL_IOC_HSM_STATE_GET: {
2446 struct md_op_data *op_data;
2447 struct hsm_user_state *hus;
2454 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2455 LUSTRE_OPC_ANY, hus);
2456 if (IS_ERR(op_data)) {
2458 RETURN(PTR_ERR(op_data));
2461 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2464 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2467 ll_finish_md_op_data(op_data);
2471 case LL_IOC_HSM_STATE_SET: {
2472 struct hsm_state_set *hss;
2479 if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2484 rc = ll_hsm_state_set(inode, hss);
2489 case LL_IOC_HSM_ACTION: {
2490 struct md_op_data *op_data;
2491 struct hsm_current_action *hca;
2498 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2499 LUSTRE_OPC_ANY, hca);
2500 if (IS_ERR(op_data)) {
2502 RETURN(PTR_ERR(op_data));
2505 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2508 if (copy_to_user((char *)arg, hca, sizeof(*hca)))
2511 ll_finish_md_op_data(op_data);
2515 case LL_IOC_SET_LEASE: {
2516 struct ll_inode_info *lli = ll_i2info(inode);
2517 struct obd_client_handle *och = NULL;
/* lease mode must match the file's own open mode */
2523 if (!(file->f_mode & FMODE_WRITE))
2528 if (!(file->f_mode & FMODE_READ))
2533 mutex_lock(&lli->lli_och_mutex);
2534 if (fd->fd_lease_och != NULL) {
2535 och = fd->fd_lease_och;
2536 fd->fd_lease_och = NULL;
2538 mutex_unlock(&lli->lli_och_mutex);
2541 mode = och->och_flags &(FMODE_READ|FMODE_WRITE);
2542 rc = ll_lease_close(och, inode, &lease_broken);
2543 if (rc == 0 && lease_broken)
2549 /* return the type of lease or error */
2550 RETURN(rc < 0 ? rc : (int)mode);
2555 CDEBUG(D_INODE, "Set lease with mode %d\n", mode);
2557 /* apply for lease */
2558 och = ll_lease_open(inode, file, mode, 0);
2560 RETURN(PTR_ERR(och));
2563 mutex_lock(&lli->lli_och_mutex);
2564 if (fd->fd_lease_och == NULL) {
2565 fd->fd_lease_och = och;
2568 mutex_unlock(&lli->lli_och_mutex);
2570 /* impossible now that only excl is supported for now */
2571 ll_lease_close(och, inode, &lease_broken);
2576 case LL_IOC_GET_LEASE: {
2577 struct ll_inode_info *lli = ll_i2info(inode);
2578 struct ldlm_lock *lock = NULL;
2581 mutex_lock(&lli->lli_och_mutex);
2582 if (fd->fd_lease_och != NULL) {
2583 struct obd_client_handle *och = fd->fd_lease_och;
2585 lock = ldlm_handle2lock(&och->och_lease_handle);
2587 lock_res_and_lock(lock);
/* a cancelled lease lock no longer counts as a held lease */
2588 if (!ldlm_is_cancel(lock))
2589 rc = och->och_flags &
2590 (FMODE_READ | FMODE_WRITE);
2591 unlock_res_and_lock(lock);
2592 LDLM_LOCK_PUT(lock);
2595 mutex_unlock(&lli->lli_och_mutex);
2598 case LL_IOC_HSM_IMPORT: {
2599 struct hsm_user_import *hui;
2605 if (copy_from_user(hui, (void *)arg, sizeof(*hui))) {
2610 rc = ll_hsm_import(inode, file, hui);
/* default: try registered ioctl handlers, then the data export */
2620 ll_iocontrol_call(inode, file, cmd, arg, &err))
2623 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
/* Compat helpers for kernels lacking generic_file_llseek_size(). */
2629 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Validate @offset against [0, maxsize] (negative allowed only with
 * FMODE_UNSIGNED_OFFSET) and commit it to file->f_pos, resetting
 * f_version when the position actually changes.
 */
2630 static inline loff_t
2631 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2633 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2635 if (offset > maxsize)
2638 if (offset != file->f_pos) {
2639 file->f_pos = offset;
2640 file->f_version = 0;
/*
 * Backport of the kernel's generic_file_llseek_size(): handles
 * SEEK_SET/CUR/END plus SEEK_DATA/SEEK_HOLE against a caller-supplied
 * @eof, clamped to @maxsize.  NOTE(review): partial extract -- the
 * switch statement and several case bodies are not visible here.
 */
2646 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2647 loff_t maxsize, loff_t eof)
2649 struct inode *inode = file->f_dentry->d_inode;
2657 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2658 * position-querying operation. Avoid rewriting the "same"
2659 * f_pos value back to the file because a concurrent read(),
2660 * write() or lseek() might have altered it
2665 * f_lock protects against read/modify/write race with other
2666 * SEEK_CURs. Note that parallel writes and reads behave
2669 mutex_lock(&inode->i_mutex);
2670 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2671 mutex_unlock(&inode->i_mutex);
2675 * In the generic case the entire file is data, so as long as
2676 * offset isn't at the end of the file then the offset is data.
2683 * There is a virtual hole at the end of the file, so as long as
2684 * offset isn't i_size or larger, return i_size.
2692 return llseek_execute(file, offset, maxsize);
/*
 * llseek() entry point for Lustre files.
 *
 * For SEEK_END/SEEK_HOLE/SEEK_DATA the current file size must be known,
 * so glimpse the size from the OSTs (ll_glimpse_size()) before handing
 * off to ll_generic_file_llseek_size() with the filesystem's maximum
 * byte limit.
 */
2696 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2698 struct inode *inode = file->f_dentry->d_inode;
2699 loff_t retval, eof = 0;
/* precompute target only for the trace message below */
2702 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2703 (origin == SEEK_CUR) ? file->f_pos : 0);
2704 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2705 PFID(ll_inode2fid(inode)), inode, retval, retval,
2707 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2709 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
/* need an up-to-date size from the OSTs for these origins */
2710 retval = ll_glimpse_size(inode);
2713 eof = i_size_read(inode);
2716 retval = ll_generic_file_llseek_size(file, offset, origin,
2717 ll_file_maxbytes(inode), eof);
/*
 * flush() entry point, called on every close() of a file descriptor.
 *
 * Does not force dirty pages out; it only harvests asynchronous
 * writeback errors recorded earlier (lli_async_rc and the per-object
 * rc collected by lov_read_and_clear_async_rc()) and reports them as
 * -EIO, unless the error was already reported to this descriptor
 * (fd->fd_write_failed).
 */
2721 static int ll_flush(struct file *file, fl_owner_t id)
2723 struct inode *inode = file->f_dentry->d_inode;
2724 struct ll_inode_info *lli = ll_i2info(inode);
2725 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2728 LASSERT(!S_ISDIR(inode->i_mode));
2730 /* catch async errors that were recorded back when async writeback
2731 * failed for pages in this mapping. */
2732 rc = lli->lli_async_rc;
2733 lli->lli_async_rc = 0;
2734 if (lli->lli_clob != NULL) {
2735 err = lov_read_and_clear_async_rc(lli->lli_clob);
2740 /* The application has been told write failure already.
2741 * Do not report failure again. */
2742 if (fd->fd_write_failed)
2744 return rc ? -EIO : 0;
2748 * Called to make sure a portion of file has been written out.
2749 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2751 * Return how many pages have been written.
/*
 * Builds a CIT_FSYNC cl_io for [start, end] with the given sync @mode
 * and runs it through the cl_io loop; @ignore_layout skips layout-lock
 * validation.  Returns fio->fi_nr_written on success (negative errno on
 * failure — the error path is elided in this excerpt).
 */
2753 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2754 enum cl_fsync_mode mode, int ignore_layout)
2756 struct cl_env_nest nest;
2759 struct obd_capa *capa = NULL;
2760 struct cl_fsync_io *fio;
/* reject any mode outside the four supported fsync modes */
2764 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2765 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2768 env = cl_env_nested_get(&nest);
2770 RETURN(PTR_ERR(env));
/* capability for OSS writes, attached to the fsync io below */
2772 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2774 io = ccc_env_thread_io(env);
2775 io->ci_obj = cl_i2info(inode)->lli_clob;
2776 io->ci_ignore_layout = ignore_layout;
2778 /* initialize parameters for sync */
2779 fio = &io->u.ci_fsync;
2780 fio->fi_capa = capa;
2781 fio->fi_start = start;
2783 fio->fi_fid = ll_inode2fid(inode);
2784 fio->fi_mode = mode;
2785 fio->fi_nr_written = 0;
2787 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2788 result = cl_io_loop(env, io);
2790 result = io->ci_result;
/* success: report the number of pages written by this sync */
2792 result = fio->fi_nr_written;
2793 cl_io_fini(env, io);
2794 cl_env_nested_put(&nest, env);
2802 * When dentry is provided (the 'else' case), *file->f_dentry may be
2803 * null and dentry must be used directly rather than pulled from
2804 * *file->f_dentry as is done otherwise.
/*
 * fsync() entry point; the three prototypes cover the kernel API
 * variants (4-arg with range, 2-arg, and the old dentry-taking form —
 * the latter two sync the whole file, end = LLONG_MAX).
 *
 * Sequence: flush/wait page cache, harvest recorded async writeback
 * errors, md_fsync() to the MDT, then cl_sync_file_range(CL_FSYNC_ALL)
 * for regular files, updating fd->fd_write_failed accordingly.
 */
2807 #ifdef HAVE_FILE_FSYNC_4ARGS
2808 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2810 struct dentry *dentry = file->f_dentry;
2811 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2812 int ll_fsync(struct file *file, int datasync)
2814 struct dentry *dentry = file->f_dentry;
2816 loff_t end = LLONG_MAX;
2818 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2821 loff_t end = LLONG_MAX;
2823 struct inode *inode = dentry->d_inode;
2824 struct ll_inode_info *lli = ll_i2info(inode);
2825 struct ptlrpc_request *req;
2826 struct obd_capa *oc;
2830 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2831 PFID(ll_inode2fid(inode)), inode);
2832 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2834 #ifdef HAVE_FILE_FSYNC_4ARGS
/* 4-arg kernels: write out the requested range, then take i_mutex */
2835 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2836 mutex_lock(&inode->i_mutex);
2838 /* fsync's caller has already called _fdata{sync,write}, we want
2839 * that IO to finish before calling the osc and mdc sync methods */
2840 rc = filemap_fdatawait(inode->i_mapping);
2843 /* catch async errors that were recorded back when async writeback
2844 * failed for pages in this mapping. */
2845 if (!S_ISDIR(inode->i_mode)) {
2846 err = lli->lli_async_rc;
2847 lli->lli_async_rc = 0;
2850 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* sync metadata with the MDT (capa passed for security, if any) */
2855 oc = ll_mdscapa_get(inode);
2856 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2862 ptlrpc_req_finished(req);
2864 if (S_ISREG(inode->i_mode)) {
2865 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
/* sync file data to the OSTs and track per-fd write failure state */
2867 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2868 if (rc == 0 && err < 0)
2871 fd->fd_write_failed = true;
2873 fd->fd_write_failed = false;
2876 #ifdef HAVE_FILE_FSYNC_4ARGS
2877 mutex_unlock(&inode->i_mutex);
/*
 * flock()/fcntl() lock entry point: translate a kernel file_lock into an
 * LDLM_FLOCK enqueue on the MDT, then mirror the granted/released lock
 * into the local lock lists (flock_lock_file_wait()/
 * posix_lock_file_wait()) so the VFS view stays consistent.  If the
 * local step fails, the server lock is rolled back with an LCK_NL
 * (unlock) enqueue.
 */
2883 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2885 struct inode *inode = file->f_dentry->d_inode;
2886 struct ll_sb_info *sbi = ll_i2sbi(inode);
2887 struct ldlm_enqueue_info einfo = {
2888 .ei_type = LDLM_FLOCK,
2889 .ei_cb_cp = ldlm_flock_completion_ast,
2890 .ei_cbdata = file_lock,
2892 struct md_op_data *op_data;
2893 struct lustre_handle lockh = {0};
2894 ldlm_policy_data_t flock = {{0}};
2900 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
2901 PFID(ll_inode2fid(inode)), file_lock);
2903 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2905 if (file_lock->fl_flags & FL_FLOCK) {
2906 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2907 /* flocks are whole-file locks */
2908 flock.l_flock.end = OFFSET_MAX;
2909 /* For flocks owner is determined by the local file desctiptor*/
2910 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2911 } else if (file_lock->fl_flags & FL_POSIX) {
/* POSIX byte-range lock: copy owner and range verbatim */
2912 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2913 flock.l_flock.start = file_lock->fl_start;
2914 flock.l_flock.end = file_lock->fl_end;
2918 flock.l_flock.pid = file_lock->fl_pid;
2920 /* Somewhat ugly workaround for svc lockd.
2921 * lockd installs custom fl_lmops->lm_compare_owner that checks
2922 * for the fl_owner to be the same (which it always is on local node
2923 * I guess between lockd processes) and then compares pid.
2924 * As such we assign pid to the owner field to make it all work,
2925 * conflict with normal locks is unlikely since pid space and
2926 * pointer space for current->files are not intersecting */
2927 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2928 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* map fcntl lock type -> LDLM mode (PR=read, PW=write, NL=unlock) */
2930 switch (file_lock->fl_type) {
2932 einfo.ei_mode = LCK_PR;
2935 /* An unlock request may or may not have any relation to
2936 * existing locks so we may not be able to pass a lock handle
2937 * via a normal ldlm_lock_cancel() request. The request may even
2938 * unlock a byte range in the middle of an existing lock. In
2939 * order to process an unlock request we need all of the same
2940 * information that is given with a normal read or write record
2941 * lock request. To avoid creating another ldlm unlock (cancel)
2942 * message we'll treat a LCK_NL flock request as an unlock. */
2943 einfo.ei_mode = LCK_NL;
2946 einfo.ei_mode = LCK_PW;
2949 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2950 file_lock->fl_type);
/* non-blocking variants: fail instead of waiting for conflicts */
2965 flags = LDLM_FL_BLOCK_NOWAIT;
/* F_GETLK family: test-only, no lock is actually taken */
2971 flags = LDLM_FL_TEST_LOCK;
2972 /* Save the old mode so that if the mode in the lock changes we
2973 * can decrement the appropriate reader or writer refcount. */
2974 file_lock->fl_type = einfo.ei_mode;
2977 CERROR("unknown fcntl lock command: %d\n", cmd);
2981 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2982 LUSTRE_OPC_ANY, NULL);
2983 if (IS_ERR(op_data))
2984 RETURN(PTR_ERR(op_data));
2986 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
2987 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
2988 flock.l_flock.pid, flags, einfo.ei_mode,
2989 flock.l_flock.start, flock.l_flock.end);
/* enqueue the flock on the MDT; may block for F_SETLKW */
2991 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2992 op_data, &lockh, &flock, 0, NULL /* req */, flags);
2994 if ((file_lock->fl_flags & FL_FLOCK) &&
2995 (rc == 0 || file_lock->fl_type == F_UNLCK))
2996 rc2 = flock_lock_file_wait(file, file_lock);
2997 if ((file_lock->fl_flags & FL_POSIX) &&
2998 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2999 !(flags & LDLM_FL_TEST_LOCK))
3000 rc2 = posix_lock_file_wait(file, file_lock);
/* local bookkeeping failed: release the server-side lock again */
3002 if (rc2 && file_lock->fl_type != F_UNLCK) {
3003 einfo.ei_mode = LCK_NL;
3004 md_enqueue(sbi->ll_md_exp, &einfo, NULL,
3005 op_data, &lockh, &flock, 0, NULL /* req */, flags);
3009 ll_finish_md_op_data(op_data);
/*
 * Look up the FID of @name (length @namelen) under directory @parent via
 * an MDS getattr-by-name RPC.  On success the FID is presumably copied
 * from the reply body into *fid (the copy statement is elided here —
 * TODO confirm).  Returns 0 or negative errno.
 */
3014 int ll_get_fid_by_name(struct inode *parent, const char *name,
3015 int namelen, struct lu_fid *fid)
3017 struct md_op_data *op_data = NULL;
3018 struct mdt_body *body;
3019 struct ptlrpc_request *req;
3023 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3024 LUSTRE_OPC_ANY, NULL);
3025 if (IS_ERR(op_data))
3026 RETURN(PTR_ERR(op_data));
/* we only need the FID back, not full attributes */
3028 op_data->op_valid = OBD_MD_FLID;
3029 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3030 ll_finish_md_op_data(op_data);
3034 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3036 GOTO(out_req, rc = -EFAULT);
3040 ptlrpc_req_finished(req);
/*
 * Migrate directory entry @name under @parent to MDT @mdtidx using a
 * rename-to-self with CLI_MIGRATE.  The child's FID is taken from the
 * dcache when possible (invalidating its aliases), otherwise fetched
 * with ll_get_fid_by_name().  Already-on-target is reported as success.
 */
3044 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3045 const char *name, int namelen)
3047 struct dentry *dchild = NULL;
3048 struct inode *child_inode = NULL;
3049 struct md_op_data *op_data;
3050 struct ptlrpc_request *request = NULL;
3055 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3056 name, PFID(ll_inode2fid(parent)), mdtidx);
3058 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3059 0, LUSTRE_OPC_ANY, NULL);
3060 if (IS_ERR(op_data))
3061 RETURN(PTR_ERR(op_data));
3063 /* Get child FID first */
3064 qstr.hash = full_name_hash(name, namelen);
3067 dchild = d_lookup(file->f_dentry, &qstr);
3068 if (dchild != NULL && dchild->d_inode != NULL) {
3069 op_data->op_fid3 = *ll_inode2fid(dchild->d_inode);
/* NOTE(review): d_inode was already checked non-NULL by the outer
 * condition, so this inner test is redundant as visible here. */
3070 if (dchild->d_inode != NULL) {
3071 child_inode = igrab(dchild->d_inode);
3072 ll_invalidate_aliases(child_inode);
/* no usable dcache entry: resolve the FID via the MDS */
3076 rc = ll_get_fid_by_name(parent, name, namelen,
3082 if (!fid_is_sane(&op_data->op_fid3)) {
3083 CERROR("%s: migrate %s , but fid "DFID" is insane\n",
3084 ll_get_fsname(parent->i_sb, NULL, 0), name,
3085 PFID(&op_data->op_fid3));
3086 GOTO(out_free, rc = -EINVAL);
3089 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
/* nothing to do if the child already lives on the target MDT */
3094 CDEBUG(D_INFO, "%s:"DFID" is already on MDT%d.\n", name,
3095 PFID(&op_data->op_fid3), mdtidx);
3096 GOTO(out_free, rc = 0);
/* migration is implemented as a same-name rename with CLI_MIGRATE */
3099 op_data->op_mds = mdtidx;
3100 op_data->op_cli_flags = CLI_MIGRATE;
3101 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3102 namelen, name, namelen, &request);
3104 ll_update_times(request, parent);
3106 ptlrpc_req_finished(request);
3111 if (child_inode != NULL) {
/* drop the stale local inode; a fresh one is fetched on next lookup */
3112 clear_nlink(child_inode);
3116 ll_finish_md_op_data(op_data);
/*
 * flock/lock handler wired into ll_file_operations_noflock for
 * "-o noflock" mounts; per the table's comment it presumably returns
 * -ENOSYS (body not visible here — TODO confirm).
 */
3121 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3129 * test if some locks matching bits and l_req_mode are acquired
3130 * - bits can be in different locks
3131 * - if found clear the common lock bits in *bits
3132 * - the bits not found, are kept in *bits
3134 * \param bits [IN] searched lock bits [IN]
3135 * \param l_req_mode [IN] searched lock mode
3136 * \retval boolean, true iff all bits are found
3138 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
3140 struct lustre_handle lockh;
3141 ldlm_policy_data_t policy;
/* LCK_MINMODE means "any mode": match all four read/write modes */
3142 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
3143 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
3152 fid = &ll_i2info(inode)->lli_fid;
3153 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3154 ldlm_lockname[mode]);
/* TEST_LOCK: only probe for a matching lock, do not take a reference */
3156 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
3157 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
/* probe one inodebit at a time */
3158 policy.l_inodebits.bits = *bits & (1 << i);
3159 if (policy.l_inodebits.bits == 0)
3162 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3163 &policy, mode, &lockh)) {
3164 struct ldlm_lock *lock;
3166 lock = ldlm_handle2lock(&lockh);
/* clear every bit covered by the matched lock, not just bit i */
3169 ~(lock->l_policy_data.l_inodebits.bits);
3170 LDLM_LOCK_PUT(lock);
3172 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match (and take a reference on) a cached MD lock covering
 * @bits on @inode.  Returns the matched mode (0 if none); on success
 * *lockh holds a referenced lock the caller must release.
 */
3179 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
3180 struct lustre_handle *lockh, __u64 flags,
3183 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
3188 fid = &ll_i2info(inode)->lli_fid;
3189 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3191 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3192 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the result of an inode revalidation RPC: -ENOENT
 * (object unlinked on the server) is converted to success after
 * fixing up nlink; other errors are logged (quietly for expected
 * -EACCES/-EIDRM) and passed through.
 */
3197 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3199 /* Already unlinked. Just update nlink and return success */
3200 if (rc == -ENOENT) {
3202 /* This path cannot be hit for regular files unless in
3203 * case of obscure races, so no need to to validate
3205 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3207 } else if (rc != 0) {
3208 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3209 "%s: revalidate FID "DFID" error: rc = %d\n",
3210 ll_get_fsname(inode->i_sb, NULL, 0),
3211 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate @dentry's inode attributes covered by the DLM bits @ibits.
 *
 * Two paths: with OBD_CONNECT_ATTRFID, issue an intent getattr/lookup
 * by FID (md_intent_lock) and finish via ll_revalidate_it_finish();
 * without it, fall back to a plain md_getattr() — but only if no
 * matching MD lock is already cached (ll_have_md_lock).
 */
3217 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3219 struct inode *inode = dentry->d_inode;
3220 struct ptlrpc_request *req = NULL;
3221 struct obd_export *exp;
3225 LASSERT(inode != NULL);
3227 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3228 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3230 exp = ll_i2mdexp(inode);
3232 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3233 * But under CMD case, it caused some lock issues, should be fixed
3234 * with new CMD ibits lock. See bug 12718 */
3235 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3236 struct lookup_intent oit = { .it_op = IT_GETATTR };
3237 struct md_op_data *op_data;
/* LOOKUP-only revalidation can use the cheaper IT_LOOKUP intent */
3239 if (ibits == MDS_INODELOCK_LOOKUP)
3240 oit.it_op = IT_LOOKUP;
3242 /* Call getattr by fid, so do not provide name at all. */
3243 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3244 dentry->d_inode, NULL, 0, 0,
3245 LUSTRE_OPC_ANY, NULL);
3246 if (IS_ERR(op_data))
3247 RETURN(PTR_ERR(op_data));
3249 oit.it_create_mode |= M_CHECK_STALE;
3250 rc = md_intent_lock(exp, op_data, NULL, 0,
3251 /* we are not interested in name
3254 ll_md_blocking_ast, 0);
3255 ll_finish_md_op_data(op_data);
3256 oit.it_create_mode &= ~M_CHECK_STALE;
3258 rc = ll_inode_revalidate_fini(inode, rc);
3262 rc = ll_revalidate_it_finish(req, &oit, dentry);
3264 ll_intent_release(&oit);
3268 /* Unlinked? Unhash dentry, so it is not picked up later by
3269 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3270 here to preserve get_cwd functionality on 2.6.
3272 if (!dentry->d_inode->i_nlink)
3273 d_lustre_invalidate(dentry, 0);
3275 ll_lookup_finish_locks(&oit, dentry);
3276 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3277 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3278 obd_valid valid = OBD_MD_FLGETATTR;
3279 struct md_op_data *op_data;
/* regular files also need striping EA sized per default mdsize */
3282 if (S_ISREG(inode->i_mode)) {
3283 rc = ll_get_default_mdsize(sbi, &ealen);
3286 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3289 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3290 0, ealen, LUSTRE_OPC_ANY,
3292 if (IS_ERR(op_data))
3293 RETURN(PTR_ERR(op_data));
3295 op_data->op_valid = valid;
3296 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
3297 * capa for this inode. Because we only keep capas of dirs
3299 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3300 ll_finish_md_op_data(op_data);
3302 rc = ll_inode_revalidate_fini(inode, rc);
/* refresh the in-core inode from the getattr reply */
3306 rc = ll_prep_inode(&inode, req, NULL, NULL);
3309 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge the per-stripe attributes from all
 * MDTs (md_merge_attr) and cache the aggregated size/nlink and
 * a/m/ctime in the inode's ll_inode_info.
 */
3313 static int ll_merge_md_attr(struct inode *inode)
3315 struct cl_attr attr = { 0 };
3318 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3319 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3324 ll_i2info(inode)->lli_stripe_dir_size = attr.cat_size;
3325 ll_i2info(inode)->lli_stripe_dir_nlink = attr.cat_nlink;
3327 ll_i2info(inode)->lli_lvb.lvb_atime = attr.cat_atime;
3328 ll_i2info(inode)->lli_lvb.lvb_mtime = attr.cat_mtime;
3329 ll_i2info(inode)->lli_lvb.lvb_ctime = attr.cat_ctime;
/*
 * Revalidate metadata (via __ll_inode_revalidate) and then bring the
 * size up to date: striped directories merge per-MDT attributes,
 * non-regular inodes take cached LVB timestamps, and regular files
 * glimpse size from the OSTs — unless an HSM restore is in progress,
 * when the MDT-provided size is already authoritative.
 */
3335 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3337 struct inode *inode = dentry->d_inode;
3341 rc = __ll_inode_revalidate(dentry, ibits);
3345 /* if object isn't regular file, don't validate size */
3346 if (!S_ISREG(inode->i_mode)) {
3347 if (S_ISDIR(inode->i_mode) &&
3348 ll_i2info(inode)->lli_lsm_md != NULL) {
/* striped dir: aggregate size/nlink/times across MDTs */
3349 rc = ll_merge_md_attr(inode);
3354 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
3355 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
3356 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
3358 /* In case of restore, the MDT has the right size and has
3359 * already send it back without granting the layout lock,
3360 * inode is up-to-date so glimpse is useless.
3361 * Also to glimpse we need the layout, in case of a running
3362 * restore the MDT holds the layout lock so the glimpse will
3363 * block up to the end of restore (getattr will block)
3365 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3366 rc = ll_glimpse_size(inode);
/*
 * getattr() entry point: revalidate UPDATE|LOOKUP bits, then fill
 * *stat from the in-core inode.  Striped directories report the merged
 * size/nlink cached by ll_merge_md_attr(); 32-bit-API clients get a
 * squashed inode number derived from the FID.
 */
3371 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3373 struct inode *inode = de->d_inode;
3374 struct ll_sb_info *sbi = ll_i2sbi(inode);
3375 struct ll_inode_info *lli = ll_i2info(inode);
3378 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3379 MDS_INODELOCK_LOOKUP);
3380 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3385 stat->dev = inode->i_sb->s_dev;
/* 32-bit userspace: derive a 32-bit ino from the 128-bit FID */
3386 if (ll_need_32bit_api(sbi))
3387 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3389 stat->ino = inode->i_ino;
3390 stat->mode = inode->i_mode;
3391 stat->uid = inode->i_uid;
3392 stat->gid = inode->i_gid;
3393 stat->rdev = inode->i_rdev;
3394 stat->atime = inode->i_atime;
3395 stat->mtime = inode->i_mtime;
3396 stat->ctime = inode->i_ctime;
3397 stat->blksize = 1 << inode->i_blkbits;
3398 stat->blocks = inode->i_blocks;
3400 if (S_ISDIR(inode->i_mode) &&
3401 ll_i2info(inode)->lli_lsm_md != NULL) {
/* striped dir: use the merged values, not the local inode's */
3402 stat->nlink = lli->lli_stripe_dir_nlink;
3403 stat->size = lli->lli_stripe_dir_size;
3405 stat->nlink = inode->i_nlink;
3406 stat->size = i_size_read(inode);
/*
 * fiemap() inode operation: marshal the kernel fiemap_extent_info into
 * a Lustre ll_user_fiemap buffer (header + extent array), run
 * ll_do_fiemap(), and copy flags/extents back on success.
 */
3412 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3413 __u64 start, __u64 len)
3417 struct ll_user_fiemap *fiemap;
3418 unsigned int extent_count = fieinfo->fi_extents_max;
3420 num_bytes = sizeof(*fiemap) + (extent_count *
3421 sizeof(struct ll_fiemap_extent));
3422 OBD_ALLOC_LARGE(fiemap, num_bytes);
3427 fiemap->fm_flags = fieinfo->fi_flags;
3428 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3429 fiemap->fm_start = start;
3430 fiemap->fm_length = len;
/* seed only the first extent from userspace (FIEMAP continuation) */
3431 if (extent_count > 0)
3432 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3433 sizeof(struct ll_fiemap_extent));
3435 rc = ll_do_fiemap(inode, fiemap, num_bytes);
/* copy results back: flags, mapped count, and all mapped extents */
3437 fieinfo->fi_flags = fiemap->fm_flags;
3438 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3439 if (extent_count > 0)
3440 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3441 fiemap->fm_mapped_extents *
3442 sizeof(struct ll_fiemap_extent));
3444 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * get_acl() inode operation: return a referenced duplicate of the
 * cached POSIX ACL (lli_posix_acl) under lli_lock; the VFS releases
 * the reference after its permission check.
 */
3448 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3450 struct ll_inode_info *lli = ll_i2info(inode);
3451 struct posix_acl *acl = NULL;
3454 spin_lock(&lli->lli_lock);
3455 /* VFS' acl_permission_check->check_acl will release the refcount */
3456 acl = posix_acl_dup(lli->lli_posix_acl);
3457 spin_unlock(&lli->lli_lock);
3462 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
/*
 * ACL callback for generic_permission() on older kernels (two prototype
 * variants).  RCU-walk lookups cannot block to fetch the ACL, so they
 * bail out early on IPERM_FLAG_RCU; otherwise check @mask against the
 * cached access ACL.  Without CONFIG_FS_POSIX_ACL this compiles to a
 * stub (return value elided here).
 */
3464 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3465 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3467 ll_check_acl(struct inode *inode, int mask)
3470 # ifdef CONFIG_FS_POSIX_ACL
3471 struct posix_acl *acl;
3475 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
/* can't sleep to fetch the ACL during RCU path walk */
3476 if (flags & IPERM_FLAG_RCU)
3479 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3484 rc = posix_acl_permission(inode, acl, mask);
3485 posix_acl_release(acl);
3488 # else /* !CONFIG_FS_POSIX_ACL */
3490 # endif /* CONFIG_FS_POSIX_ACL */
3492 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * permission() inode operation (three kernel-API variants).  Refuses
 * RCU-walk (may need RPCs), revalidates the root inode on first use,
 * delegates to lustre_check_remote_perm() for remote-client mounts,
 * and otherwise runs the generic permission check with ll_check_acl.
 */
3494 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3495 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3497 # ifdef HAVE_INODE_PERMISION_2ARGS
3498 int ll_inode_permission(struct inode *inode, int mask)
3500 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
/* permission checks may issue RPCs; punt RCU-walk to ref-walk */
3507 #ifdef MAY_NOT_BLOCK
3508 if (mask & MAY_NOT_BLOCK)
3510 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3511 if (flags & IPERM_FLAG_RCU)
3515 /* as root inode are NOT getting validated in lookup operation,
3516 * need to do it before permission check. */
3518 if (inode == inode->i_sb->s_root->d_inode) {
3519 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3520 MDS_INODELOCK_LOOKUP);
3525 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3526 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3528 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
3529 return lustre_check_remote_perm(inode, mask);
3531 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
3532 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3537 /* -o localflock - only provides locally consistent flock locks */
/* Default file_operations: no .flock/.lock entries, so the VFS falls
 * back to local-only (per-node) flock semantics. */
3538 struct file_operations ll_file_operations = {
3539 .read = ll_file_read,
3540 .aio_read = ll_file_aio_read,
3541 .write = ll_file_write,
3542 .aio_write = ll_file_aio_write,
3543 .unlocked_ioctl = ll_file_ioctl,
3544 .open = ll_file_open,
3545 .release = ll_file_release,
3546 .mmap = ll_file_mmap,
3547 .llseek = ll_file_seek,
3548 .splice_read = ll_file_splice_read,
/* file_operations for "-o flock" mounts: identical to the default
 * table plus cluster-coherent .flock/.lock via ll_file_flock(). */
3553 struct file_operations ll_file_operations_flock = {
3554 .read = ll_file_read,
3555 .aio_read = ll_file_aio_read,
3556 .write = ll_file_write,
3557 .aio_write = ll_file_aio_write,
3558 .unlocked_ioctl = ll_file_ioctl,
3559 .open = ll_file_open,
3560 .release = ll_file_release,
3561 .mmap = ll_file_mmap,
3562 .llseek = ll_file_seek,
3563 .splice_read = ll_file_splice_read,
3566 .flock = ll_file_flock,
3567 .lock = ll_file_flock
3570 /* These are for -o noflock - to return ENOSYS on flock calls */
3571 struct file_operations ll_file_operations_noflock = {
3572 .read = ll_file_read,
3573 .aio_read = ll_file_aio_read,
3574 .write = ll_file_write,
3575 .aio_write = ll_file_aio_write,
3576 .unlocked_ioctl = ll_file_ioctl,
3577 .open = ll_file_open,
3578 .release = ll_file_release,
3579 .mmap = ll_file_mmap,
3580 .llseek = ll_file_seek,
3581 .splice_read = ll_file_splice_read,
3584 .flock = ll_file_noflock,
3585 .lock = ll_file_noflock
/* inode_operations shared by all three file_operations tables above. */
3588 struct inode_operations ll_file_inode_operations = {
3589 .setattr = ll_setattr,
3590 .getattr = ll_getattr,
3591 .permission = ll_inode_permission,
3592 .setxattr = ll_setxattr,
3593 .getxattr = ll_getxattr,
3594 .listxattr = ll_listxattr,
3595 .removexattr = ll_removexattr,
3596 .fiemap = ll_fiemap,
3597 #ifdef HAVE_IOP_GET_ACL
3598 .get_acl = ll_get_acl,
3602 /* dynamic ioctl number support routins */
/* Registry of dynamically registered ioctl handlers: a list of
 * llioc_data entries protected by an rwsem (readers: dispatch in
 * ll_iocontrol_call; writers: register/unregister). */
3603 static struct llioc_ctl_data {
3604 struct rw_semaphore ioc_sem;
3605 cfs_list_t ioc_head;
3607 __RWSEM_INITIALIZER(llioc.ioc_sem),
3608 CFS_LIST_HEAD_INIT(llioc.ioc_head)
/* One registration: a callback plus the ioctl command numbers it
 * handles (iocd_cmd is a trailing variable-length array). */
3613 cfs_list_t iocd_list;
3614 unsigned int iocd_size;
3615 llioc_callback_t iocd_cb;
3616 unsigned int iocd_count;
3617 unsigned int iocd_cmd[0];
/*
 * Register a dynamic ioctl handler: callback @cb serving @count command
 * numbers from @cmd.  Returns an opaque handle (the allocated entry)
 * used as the magic for ll_iocontrol_unregister(); NULL-return error
 * paths are elided in this excerpt.
 */
3620 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3623 struct llioc_data *in_data = NULL;
3626 if (cb == NULL || cmd == NULL ||
3627 count > LLIOC_MAX_CMD || count < 0)
/* one allocation: header plus trailing command array */
3630 size = sizeof(*in_data) + count * sizeof(unsigned int);
3631 OBD_ALLOC(in_data, size);
3632 if (in_data == NULL)
3635 memset(in_data, 0, sizeof(*in_data));
3636 in_data->iocd_size = size;
3637 in_data->iocd_cb = cb;
3638 in_data->iocd_count = count;
3639 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3641 down_write(&llioc.ioc_sem);
3642 cfs_list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3643 up_write(&llioc.ioc_sem);
/*
 * Remove and free the registration identified by @magic (the pointer
 * returned from ll_iocontrol_register()).  Unknown magic is only
 * warned about.
 */
3648 void ll_iocontrol_unregister(void *magic)
3650 struct llioc_data *tmp;
3655 down_write(&llioc.ioc_sem);
3656 cfs_list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3658 unsigned int size = tmp->iocd_size;
3660 cfs_list_del(&tmp->iocd_list);
/* drop the sem before freeing; list walk ends here anyway */
3661 up_write(&llioc.ioc_sem);
3663 OBD_FREE(tmp, size);
3667 up_write(&llioc.ioc_sem);
3669 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3672 EXPORT_SYMBOL(ll_iocontrol_register);
3673 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch @cmd to the registered dynamic ioctl handlers in order;
 * stops at the first callback returning LLIOC_STOP and reports its rc
 * via *rcp (default -EINVAL).  Returns the iteration verdict so
 * ll_file_ioctl can tell whether the cmd was consumed.
 */
3675 static enum llioc_iter
3676 ll_iocontrol_call(struct inode *inode, struct file *file,
3677 unsigned int cmd, unsigned long arg, int *rcp)
3679 enum llioc_iter ret = LLIOC_CONT;
3680 struct llioc_data *data;
3681 int rc = -EINVAL, i;
3683 down_read(&llioc.ioc_sem);
3684 cfs_list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3685 for (i = 0; i < data->iocd_count; i++) {
3686 if (cmd != data->iocd_cmd[i])
/* this registration claims cmd: invoke its callback */
3689 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3693 if (ret == LLIOC_STOP)
3696 up_read(&llioc.ioc_sem);
/*
 * Push a layout configuration into the cl_object stack (cl_conf_set).
 * For OBJECT_CONF_SET with a layout lock, also allow the lock to be
 * matched only after the layout is applied, refresh lli_has_smd, and
 * bump the cached layout generation.
 */
3703 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3705 struct ll_inode_info *lli = ll_i2info(inode);
3706 struct cl_env_nest nest;
3711 if (lli->lli_clob == NULL)
3714 env = cl_env_nested_get(&nest);
3716 RETURN(PTR_ERR(env));
3718 result = cl_conf_set(env, lli->lli_clob, conf);
3719 cl_env_nested_put(&nest, env);
3721 if (conf->coc_opc == OBJECT_CONF_SET) {
3722 struct ldlm_lock *lock = conf->coc_lock;
3724 LASSERT(lock != NULL);
3725 LASSERT(ldlm_has_layout(lock));
3727 struct lustre_md *md = conf->u.coc_md;
3728 __u32 gen = LL_LAYOUT_GEN_EMPTY;
3730 /* it can only be allowed to match after layout is
3731 * applied to inode otherwise false layout would be
3732 * seen. Applying layout shoud happen before dropping
3733 * the intent lock. */
3734 ldlm_lock_allow_match(lock);
3736 lli->lli_has_smd = lsm_has_objects(md->lsm);
3737 if (md->lsm != NULL)
3738 gen = md->lsm->lsm_layout_gen;
3741 DFID ": layout version change: %u -> %u\n",
3742 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3744 ll_layout_version_set(lli, gen);
3750 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/*
 * If @lock's LVB does not already carry a ready layout, fetch the LOV
 * EA from the MDT via md_getxattr(XATTR_NAME_LOV) and install a copy
 * as the lock's l_lvb_data/l_lvb_len under the resource lock.
 */
3751 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3754 struct ll_sb_info *sbi = ll_i2sbi(inode);
3755 struct obd_capa *oc;
3756 struct ptlrpc_request *req;
3757 struct mdt_body *body;
3764 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3765 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
3766 lock->l_lvb_data, lock->l_lvb_len);
/* layout already carried by the lock's LVB: nothing to fetch */
3768 if ((lock->l_lvb_data != NULL) && ldlm_is_lvb_ready(lock))
3771 /* if layout lock was granted right away, the layout is returned
3772 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3773 * blocked and then granted via completion ast, we have to fetch
3774 * layout here. Please note that we can't use the LVB buffer in
3775 * completion AST because it doesn't have a large enough buffer */
3776 oc = ll_mdscapa_get(inode);
3777 rc = ll_get_default_mdsize(sbi, &lmmsize);
3779 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3780 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3786 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3788 GOTO(out, rc = -EPROTO);
3790 lmmsize = body->eadatasize;
3791 if (lmmsize == 0) /* empty layout */
3794 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3796 GOTO(out, rc = -EFAULT);
/* copy the EA out of the reply buffer before the request is freed */
3798 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3799 if (lvbdata == NULL)
3800 GOTO(out, rc = -ENOMEM);
3802 memcpy(lvbdata, lmm, lmmsize);
3803 lock_res_and_lock(lock);
/* replace any stale LVB buffer atomically under the res lock */
3804 if (lock->l_lvb_data != NULL)
3805 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3807 lock->l_lvb_data = lvbdata;
3808 lock->l_lvb_len = lmmsize;
3809 unlock_res_and_lock(lock);
3814 ptlrpc_req_finished(req);
3819 * Apply the layout to the inode. Layout lock is held and will be released
/*
 * Given a held layout lock (@lockh/@mode), apply its layout to @inode:
 * fetch the LOV EA if needed (ll_layout_fetch), unpack it, push it into
 * the cl_object via ll_layout_conf(OBJECT_CONF_SET), and report the new
 * layout generation through *gen.  If applying fails with -EBUSY
 * (in-flight IO against the old layout), wait via OBJECT_CONF_WAIT
 * after dropping the lock.  The lock reference is always released.
 */
3822 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3823 struct inode *inode, __u32 *gen, bool reconf)
3825 struct ll_inode_info *lli = ll_i2info(inode);
3826 struct ll_sb_info *sbi = ll_i2sbi(inode);
3827 struct ldlm_lock *lock;
3828 struct lustre_md md = { NULL };
3829 struct cl_object_conf conf;
3832 bool wait_layout = false;
3835 LASSERT(lustre_handle_is_used(lockh));
3837 lock = ldlm_handle2lock(lockh);
3838 LASSERT(lock != NULL);
3839 LASSERT(ldlm_has_layout(lock));
3841 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured: %d\n",
3842 PFID(&lli->lli_fid), inode, reconf);
3844 /* in case this is a caching lock and reinstate with new inode */
3845 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3847 lock_res_and_lock(lock);
3848 lvb_ready = ldlm_is_lvb_ready(lock);
3849 unlock_res_and_lock(lock);
3850 /* checking lvb_ready is racy but this is okay. The worst case is
3851 * that multi processes may configure the file on the same time. */
3853 if (lvb_ready || !reconf) {
3856 /* layout_gen must be valid if layout lock is not
3857 * cancelled and stripe has already set */
3858 *gen = ll_layout_version_get(lli);
3864 rc = ll_layout_fetch(inode, lock);
3868 /* for layout lock, lmm is returned in lock's lvb.
3869 * lvb_data is immutable if the lock is held so it's safe to access it
3870 * without res lock. See the description in ldlm_lock_decref_internal()
3871 * for the condition to free lvb_data of layout lock */
3872 if (lock->l_lvb_data != NULL) {
3873 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3874 lock->l_lvb_data, lock->l_lvb_len);
/* no stripe objects: report the "empty layout" generation */
3876 *gen = LL_LAYOUT_GEN_EMPTY;
3878 *gen = md.lsm->lsm_layout_gen;
3881 CERROR("%s: file "DFID" unpackmd error: %d\n",
3882 ll_get_fsname(inode->i_sb, NULL, 0),
3883 PFID(&lli->lli_fid), rc);
3889 /* set layout to file. Unlikely this will fail as old layout was
3890 * surely eliminated */
3891 memset(&conf, 0, sizeof conf);
3892 conf.coc_opc = OBJECT_CONF_SET;
3893 conf.coc_inode = inode;
3894 conf.coc_lock = lock;
3895 conf.u.coc_md = &md;
3896 rc = ll_layout_conf(inode, &conf);
3899 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3901 /* refresh layout failed, need to wait */
3902 wait_layout = rc == -EBUSY;
3906 LDLM_LOCK_PUT(lock);
3907 ldlm_lock_decref(lockh, mode);
3909 /* wait for IO to complete if it's still being used. */
3911 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
3912 ll_get_fsname(inode->i_sb, NULL, 0),
3913 PFID(&lli->lli_fid), inode);
3915 memset(&conf, 0, sizeof conf);
3916 conf.coc_opc = OBJECT_CONF_WAIT;
3917 conf.coc_inode = inode;
3918 rc = ll_layout_conf(inode, &conf);
3922 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
3923 ll_get_fsname(inode->i_sb, NULL, 0),
3924 PFID(&lli->lli_fid), rc);
3930 * This function checks if there exists a LAYOUT lock on the client side,
3931 * or enqueues it if it doesn't have one in cache.
3933 * This function will not hold layout lock so it may be revoked any time after
3934 * this function returns. Any operations depend on layout should be redone
3937 * This function should be called before lov_io_init() to get an uptodate
3938 * layout version, the caller should save the version number and after IO
3939 * is finished, this function should be called again to verify that layout
3940 * is not changed during IO time.
3942 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3944 struct ll_inode_info *lli = ll_i2info(inode);
3945 struct ll_sb_info *sbi = ll_i2sbi(inode);
3946 struct md_op_data *op_data;
3947 struct lookup_intent it;
3948 struct lustre_handle lockh;
3950 struct ldlm_enqueue_info einfo = {
3951 .ei_type = LDLM_IBITS,
3953 .ei_cb_bl = ll_md_blocking_ast,
3954 .ei_cb_cp = ldlm_completion_ast,
/* fast path: layout lock disabled, or generation already known */
3959 *gen = ll_layout_version_get(lli);
3960 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
3964 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3965 LASSERT(S_ISREG(inode->i_mode));
3967 /* take layout lock mutex to enqueue layout lock exclusively. */
3968 mutex_lock(&lli->lli_layout_mutex);
3971 /* mostly layout lock is caching on the local side, so try to match
3972 * it before grabbing layout lock mutex. */
3973 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3974 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3975 if (mode != 0) { /* hit cached lock */
3976 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3980 mutex_unlock(&lli->lli_layout_mutex);
3984 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3985 0, 0, LUSTRE_OPC_ANY, NULL);
3986 if (IS_ERR(op_data)) {
3987 mutex_unlock(&lli->lli_layout_mutex);
3988 RETURN(PTR_ERR(op_data));
3991 /* have to enqueue one */
3992 memset(&it, 0, sizeof(it));
3993 it.it_op = IT_LAYOUT;
3994 lockh.cookie = 0ULL;
3996 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)\n",
3997 ll_get_fsname(inode->i_sb, NULL, 0),
3998 PFID(&lli->lli_fid), inode);
4000 rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
/* drop the intent's request/lock bookkeeping; we keep lockh+mode */
4002 if (it.d.lustre.it_data != NULL)
4003 ptlrpc_req_finished(it.d.lustre.it_data);
4004 it.d.lustre.it_data = NULL;
4006 ll_finish_md_op_data(op_data);
4008 mode = it.d.lustre.it_lock_mode;
4009 it.d.lustre.it_lock_mode = 0;
4010 ll_intent_drop_lock(&it);
4013 /* set lock data in case this is a new lock */
4014 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4015 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
4019 mutex_unlock(&lli->lli_layout_mutex);
4025 * This function sends a restore request to the MDT
4027 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4029 struct hsm_user_request *hur;
4033 len = sizeof(struct hsm_user_request) +
4034 sizeof(struct hsm_user_item);
4035 OBD_ALLOC(hur, len);
4039 hur->hur_request.hr_action = HUA_RESTORE;
4040 hur->hur_request.hr_archive_id = 0;
4041 hur->hur_request.hr_flags = 0;
4042 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4043 sizeof(hur->hur_user_item[0].hui_fid));
4044 hur->hur_user_item[0].hui_extent.offset = offset;
4045 hur->hur_user_item[0].hui_extent.length = length;
4046 hur->hur_request.hr_itemcount = 1;
4047 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,