4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2013, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
51 #include "cl_object.h"
/*
 * Forward declarations for helpers defined later in this file.
 * NOTE(review): this extraction is missing lines (e.g. the return type of
 * ll_put_grouplock and trailing parameters of ll_lease_close) — confirm
 * against the complete llite/file.c.
 */
54 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
56 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
59 static enum llioc_iter
60 ll_iocontrol_call(struct inode *inode, struct file *file,
61 unsigned int cmd, unsigned long arg, int *rcp);
/*
 * Allocate a per-open-file ll_file_data from its slab cache.
 * GFP_NOFS avoids recursing back into the filesystem under memory pressure.
 * NOTE(review): the allocation-failure check and return statement are not
 * visible in this extraction.
 */
63 static struct ll_file_data *ll_file_data_get(void)
65 struct ll_file_data *fd;
67 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS)/* may yield NULL */;
71 fd->fd_write_failed = false;
/* Return a ll_file_data structure to its slab cache. */
76 static void ll_file_data_put(struct ll_file_data *fd)
79 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Copy the inode's current attributes (mode, times, size, blocks, flags,
 * ioepoch) plus the open handle @fh and an MDS capability into @op_data,
 * in preparation for a close/setattr RPC to the MDS.
 *
 * \param inode    inode whose attributes are packed
 * \param op_data  MD operation descriptor being filled in
 * \param fh       open file handle copied into op_data->op_handle
 */
82 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
83 struct lustre_handle *fh)
85 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
86 op_data->op_attr.ia_mode = inode->i_mode;
87 op_data->op_attr.ia_atime = inode->i_atime;
88 op_data->op_attr.ia_mtime = inode->i_mtime;
89 op_data->op_attr.ia_ctime = inode->i_ctime;
90 op_data->op_attr.ia_size = i_size_read(inode);
91 op_data->op_attr_blocks = inode->i_blocks;
/* ia_attr_flags lives in the Lustre-extended iattr, hence the cast. */
92 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
93 ll_inode_to_ext_flags(inode->i_flags);
94 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
96 op_data->op_handle = *fh;
97 op_data->op_capa1 = ll_mdscapa_get(inode);
/* Tell the MDS data was modified so it refreshes SOM attributes. */
99 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
100 op_data->op_bias |= MDS_DATA_MODIFIED;
104 * Closes the IO epoch and packs all the attributes into @op_data for
/* the close RPC.  Size/blocks are only sent when SOM is disabled or the
 * file is not a regular file (extraction is missing some lines here). */
107 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
108 struct obd_client_handle *och)
112 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
113 ATTR_MTIME | ATTR_MTIME_SET |
114 ATTR_CTIME | ATTR_CTIME_SET;
116 if (!(och->och_flags & FMODE_WRITE))
/* Without Size-on-MDS support the client must supply size/blocks itself. */
119 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
120 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
122 ll_ioepoch_close(inode, op_data, &och, 0);
125 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
126 ll_prep_md_op_data(op_data, inode, NULL, NULL,
127 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Send an MDS_CLOSE RPC for the open handle @och and tear down its replay
 * data.  When @data_version is non-NULL the close doubles as an HSM release.
 * NOTE(review): this extraction is missing lines (variable declarations such
 * as rc/epoch_close/inode, some braces and returns) — comments below describe
 * only what the visible code shows.
 */
131 static int ll_close_inode_openhandle(struct obd_export *md_exp,
133 struct obd_client_handle *och,
134 const __u64 *data_version)
136 struct obd_export *exp = ll_i2mdexp(inode);
137 struct md_op_data *op_data;
138 struct ptlrpc_request *req = NULL;
139 struct obd_device *obd = class_exp2obd(exp);
146 * XXX: in case of LMV, is this correct to access
149 CERROR("Invalid MDC connection handle "LPX64"\n",
150 ll_i2mdexp(inode)->exp_handle.h_cookie);
154 OBD_ALLOC_PTR(op_data);
156 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
158 ll_prepare_close(inode, op_data, och);
159 if (data_version != NULL) {
160 /* Pass in data_version implies release. */
161 op_data->op_bias |= MDS_HSM_RELEASE;
162 op_data->op_data_version = *data_version;
163 op_data->op_lease_handle = och->och_lease_handle;
164 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
166 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
167 rc = md_close(md_exp, op_data, och->och_mod, &req);
169 /* This close must have the epoch closed. */
170 LASSERT(epoch_close);
171 /* MDS has instructed us to obtain Size-on-MDS attribute from
172 * OSTs and send setattr to back to MDS. */
173 rc = ll_som_update(inode, op_data);
175 CERROR("%s: inode "DFID" mdc Size-on-MDS update"
176 " failed: rc = %d\n",
177 ll_i2mdexp(inode)->exp_obd->obd_name,
178 PFID(ll_inode2fid(inode)), rc);
182 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
183 ll_i2mdexp(inode)->exp_obd->obd_name,
184 PFID(ll_inode2fid(inode)), rc);
187 /* DATA_MODIFIED flag was successfully sent on close, cancel data
188 * modification flag. */
189 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
190 struct ll_inode_info *lli = ll_i2info(inode);
192 spin_lock(&lli->lli_lock);
193 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
194 spin_unlock(&lli->lli_lock);
/* Destroy OST objects the MDS told us to clean up on last unlink/close. */
198 rc = ll_objects_destroy(req, inode);
200 CERROR("%s: inode "DFID
201 " ll_objects destroy: rc = %d\n",
202 ll_i2mdexp(inode)->exp_obd->obd_name,
203 PFID(ll_inode2fid(inode)), rc);
/* For HSM release, verify the server actually released the file. */
206 if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
207 struct mdt_body *body;
208 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
209 if (!(body->valid & OBD_MD_FLRELEASED))
213 ll_finish_md_op_data(op_data);
/* With SOM and an unfinished epoch, defer the final DONE_WRITING. */
217 if (exp_connect_som(exp) && !epoch_close &&
218 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
219 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
221 md_clear_open_replay_data(md_exp, och);
222 /* Free @och if it is not waiting for DONE_WRITING. */
223 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
226 if (req) /* This is close request */
227 ptlrpc_req_finished(req);
/*
 * Close the MDS open handle of @inode that matches open mode @fmode
 * (write/exec/read), but only if no other local users still hold it.
 * NOTE(review): extraction is missing lines (och_usecount declaration,
 * handle-swap under the mutex, final return).
 */
231 int ll_md_real_close(struct inode *inode, fmode_t fmode)
233 struct ll_inode_info *lli = ll_i2info(inode);
234 struct obd_client_handle **och_p;
235 struct obd_client_handle *och;
/* Select which of the three per-mode handles/usecounts applies. */
240 if (fmode & FMODE_WRITE) {
241 och_p = &lli->lli_mds_write_och;
242 och_usecount = &lli->lli_open_fd_write_count;
243 } else if (fmode & FMODE_EXEC) {
244 och_p = &lli->lli_mds_exec_och;
245 och_usecount = &lli->lli_open_fd_exec_count;
247 LASSERT(fmode & FMODE_READ);
248 och_p = &lli->lli_mds_read_och;
249 och_usecount = &lli->lli_open_fd_read_count;
252 mutex_lock(&lli->lli_och_mutex);
253 if (*och_usecount > 0) {
254 /* There are still users of this handle, so skip
256 mutex_unlock(&lli->lli_och_mutex);
262 mutex_unlock(&lli->lli_och_mutex);
265 /* There might be a race and this handle may already
267 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-struct-file close path: drop group lock and lease if held, release
 * the fd's private open handle, decrement the per-mode open counters, and
 * close the MDS handle unless a cached OPEN DLM lock lets us skip the RPC.
 * NOTE(review): extraction is missing lines (rc/lockmode declarations,
 * lock-match arguments, several braces).
 */
274 static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
277 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
278 struct ll_inode_info *lli = ll_i2info(inode);
282 /* clear group lock, if present */
283 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
284 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
286 if (fd->fd_lease_och != NULL) {
289 /* Usually the lease is not released when the
290 * application crashed, we need to release here. */
291 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
292 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
293 PFID(&lli->lli_fid), rc, lease_broken);
295 fd->fd_lease_och = NULL;
298 if (fd->fd_och != NULL) {
299 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
304 /* Let's see if we have good enough OPEN lock on the file and if
305 we can skip talking to MDS */
306 if (file->f_dentry->d_inode) { /* Can this ever be false? */
308 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
309 struct lustre_handle lockh;
310 struct inode *inode = file->f_dentry->d_inode;
311 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
/* Drop this fd's reference on the matching per-mode open counter. */
313 mutex_lock(&lli->lli_och_mutex);
314 if (fd->fd_omode & FMODE_WRITE) {
316 LASSERT(lli->lli_open_fd_write_count);
317 lli->lli_open_fd_write_count--;
318 } else if (fd->fd_omode & FMODE_EXEC) {
320 LASSERT(lli->lli_open_fd_exec_count);
321 lli->lli_open_fd_exec_count--;
324 LASSERT(lli->lli_open_fd_read_count);
325 lli->lli_open_fd_read_count--;
327 mutex_unlock(&lli->lli_och_mutex);
/* No cached OPEN lock: really close the MDS handle now. */
329 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
330 LDLM_IBITS, &policy, lockmode,
332 rc = ll_md_real_close(file->f_dentry->d_inode,
336 CERROR("Releasing a file %p with negative dentry %p. Name %s",
337 file, file->f_dentry, file->f_dentry->d_name.name);
341 LUSTRE_FPRIVATE(file) = NULL;
342 ll_file_data_put(fd);
343 ll_capa_close(inode);
348 /* While this returns an error code, fput() the caller does not, so we need
349 * to make every effort to clean up all of our state here. Also, applications
350 * rarely check close errors and even if an error is returned they will not
351 * re-try the close call.
/* VFS ->release() entry point for Lustre files and directories.
 * NOTE(review): extraction is missing lines (rc declaration, root-dentry
 * early return, braces). */
353 int ll_file_release(struct inode *inode, struct file *file)
355 struct ll_file_data *fd;
356 struct ll_sb_info *sbi = ll_i2sbi(inode);
357 struct ll_inode_info *lli = ll_i2info(inode);
361 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
362 PFID(ll_inode2fid(inode)), inode);
364 #ifdef CONFIG_FS_POSIX_ACL
/* Remote-client ACL state is tracked per-pid on the root inode only. */
365 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
366 inode == inode->i_sb->s_root->d_inode) {
367 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
370 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
371 fd->fd_flags &= ~LL_FILE_RMTACL;
372 rct_del(&sbi->ll_rct, current_pid());
373 et_search_free(&sbi->ll_et, current_pid());
378 if (inode->i_sb->s_root != file->f_dentry)
379 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
380 fd = LUSTRE_FPRIVATE(file);
383 /* The last ref on @file, maybe not the the owner pid of statahead.
384 * Different processes can open the same dir, "ll_opendir_key" means:
385 * it is me that should stop the statahead thread. */
386 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
387 lli->lli_opendir_pid != 0)
388 ll_stop_statahead(inode, lli->lli_opendir_key);
/* The root dentry never had an MDS open handle; just free the fd. */
390 if (inode->i_sb->s_root == file->f_dentry) {
391 LUSTRE_FPRIVATE(file) = NULL;
392 ll_file_data_put(fd);
/* Fetch and clear any asynchronous write error for regular files. */
396 if (!S_ISDIR(inode->i_mode)) {
397 if (lli->lli_clob != NULL)
398 lov_read_and_clear_async_rc(lli->lli_clob);
399 lli->lli_async_rc = 0;
402 rc = ll_md_close(sbi->ll_md_exp, inode, file);
404 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
405 libcfs_debug_dumplog();
/*
 * Issue an IT_OPEN intent lock to the MDS for @file, optionally carrying
 * striping info (@lmm/@lmmsize) when setting stripe parameters.  On success
 * the new inode attributes and lock data are installed on the dentry.
 * NOTE(review): extraction is missing lines (rc declaration, NULL checks,
 * labels for the GOTOs).
 */
410 static int ll_intent_file_open(struct file *file, void *lmm,
411 int lmmsize, struct lookup_intent *itp)
413 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
414 struct dentry *parent = file->f_dentry->d_parent;
415 const char *name = file->f_dentry->d_name.name;
416 const int len = file->f_dentry->d_name.len;
417 struct md_op_data *op_data;
418 struct ptlrpc_request *req;
419 __u32 opc = LUSTRE_OPC_ANY;
426 /* Usually we come here only for NFSD, and we want open lock.
427 But we can also get here with pre 2.6.15 patchless kernels, and in
428 that case that lock is also ok */
429 /* We can also get here if there was cached open handle in revalidate_it
430 * but it disappeared while we were getting from there to ll_file_open.
431 * But this means this file was closed and immediately opened which
432 * makes a good candidate for using OPEN lock */
433 /* If lmmsize & lmm are not 0, we are just setting stripe info
434 * parameters. No need for the open lock */
435 if (lmm == NULL && lmmsize == 0) {
436 itp->it_flags |= MDS_OPEN_LOCK;
437 if (itp->it_flags & FMODE_WRITE)
438 opc = LUSTRE_OPC_CREATE;
441 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
442 file->f_dentry->d_inode, name, len,
445 RETURN(PTR_ERR(op_data));
447 itp->it_flags |= MDS_OPEN_BY_FID;
448 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
449 0 /*unused */, &req, ll_md_blocking_ast, 0);
450 ll_finish_md_op_data(op_data);
452 /* reason for keep own exit path - don't flood log
453 * with messages with -ESTALE errors.
/* -ESTALE: drop any granted open handle quietly and bail out. */
455 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
456 it_open_error(DISP_OPEN_OPEN, itp))
458 ll_release_openhandle(file->f_dentry, itp);
462 if (it_disposition(itp, DISP_LOOKUP_NEG))
463 GOTO(out, rc = -ENOENT);
465 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
466 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
467 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
471 rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
472 if (!rc && itp->d.lustre.it_lock_mode)
473 ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
477 ptlrpc_req_finished(req);
478 ll_intent_drop_lock(itp);
484 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
485 * not believe attributes if a few ioepoch holders exist. Attributes for
486 * previous ioepoch if new one is opened are also skipped by MDS.
488 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
/* Only update when a real (non-zero) epoch differs from the cached one. */
490 if (ioepoch && lli->lli_ioepoch != ioepoch) {
491 lli->lli_ioepoch = ioepoch;
492 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
493 ioepoch, PFID(&lli->lli_fid));
/*
 * Populate an obd_client_handle from the MDS open reply carried in the
 * intent @it (file handle, FID, lease lock handle, open flags), then
 * register the handle for open replay.  Returns the replay-data rc.
 */
497 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
498 struct obd_client_handle *och)
500 struct ptlrpc_request *req = it->d.lustre.it_data;
501 struct mdt_body *body;
503 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
504 och->och_fh = body->handle;
505 och->och_fid = body->fid1;
506 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
507 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
508 och->och_flags = it->it_flags;
510 return md_set_open_replay_data(md_exp, och, it);
/*
 * Finish a local open: fill @och from the intent reply (when provided),
 * record the new ioepoch, attach @fd as the file's private data and
 * remember the effective open mode.  NOTE(review): extraction is missing
 * lines (rc declaration, the "if (och)" guard, error handling, return).
 */
513 static int ll_local_open(struct file *file, struct lookup_intent *it,
514 struct ll_file_data *fd, struct obd_client_handle *och)
516 struct inode *inode = file->f_dentry->d_inode;
517 struct ll_inode_info *lli = ll_i2info(inode);
520 LASSERT(!LUSTRE_FPRIVATE(file));
525 struct ptlrpc_request *req = it->d.lustre.it_data;
526 struct mdt_body *body;
529 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
533 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
534 ll_ioepoch_open(lli, body->ioepoch);
537 LUSTRE_FPRIVATE(file) = fd;
538 ll_readahead_init(inode, &fd->fd_ras);
539 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
544 /* Open a file, and (for the very first open) create objects on the OSTs at
545 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
546 * creation or open until ll_lov_setstripe() ioctl is called.
548 * If we already have the stripe MD locally then we don't request it in
549 * md_open(), by passing a lmm_size = 0.
551 * It is up to the application to ensure no other processes open this file
552 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
553 * used. We might be able to avoid races of that sort by getting lli_open_sem
554 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
555 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/* VFS ->open() entry point.  NOTE(review): this extraction is missing many
 * lines (labels, "restart:" loop, several braces and returns); the comments
 * below describe only the visible control flow. */
557 int ll_file_open(struct inode *inode, struct file *file)
559 struct ll_inode_info *lli = ll_i2info(inode);
560 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
561 .it_flags = file->f_flags };
562 struct obd_client_handle **och_p = NULL;
563 __u64 *och_usecount = NULL;
564 struct ll_file_data *fd;
565 int rc = 0, opendir_set = 0;
568 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
569 PFID(ll_inode2fid(inode)), inode, file->f_flags);
571 it = file->private_data; /* XXX: compat macro */
572 file->private_data = NULL; /* prevent ll_local_open assertion */
574 fd = ll_file_data_get();
576 GOTO(out_openerr, rc = -ENOMEM);
/* First opener of a directory claims the statahead key for this fd. */
579 if (S_ISDIR(inode->i_mode)) {
580 spin_lock(&lli->lli_sa_lock);
581 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
582 lli->lli_opendir_pid == 0) {
583 lli->lli_opendir_key = fd;
584 lli->lli_opendir_pid = current_pid();
587 spin_unlock(&lli->lli_sa_lock);
/* Root dentry: no MDS open handle needed, just stash the fd. */
590 if (inode->i_sb->s_root == file->f_dentry) {
591 LUSTRE_FPRIVATE(file) = fd;
595 if (!it || !it->d.lustre.it_disposition) {
596 /* Convert f_flags into access mode. We cannot use file->f_mode,
597 * because everything but O_ACCMODE mask was stripped from
/* O_ACCMODE trick: (flags + 1) maps O_RDONLY/O_WRONLY/O_RDWR onto
 * FMODE_READ/FMODE_WRITE bits. */
599 if ((oit.it_flags + 1) & O_ACCMODE)
601 if (file->f_flags & O_TRUNC)
602 oit.it_flags |= FMODE_WRITE;
604 /* kernel only call f_op->open in dentry_open. filp_open calls
605 * dentry_open after call to open_namei that checks permissions.
606 * Only nfsd_open call dentry_open directly without checking
607 * permissions and because of that this code below is safe. */
608 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
609 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
611 /* We do not want O_EXCL here, presumably we opened the file
612 * already? XXX - NFS implications? */
613 oit.it_flags &= ~O_EXCL;
615 /* bug20584, if "it_flags" contains O_CREAT, the file will be
616 * created if necessary, then "IT_CREAT" should be set to keep
617 * consistent with it */
618 if (oit.it_flags & O_CREAT)
619 oit.it_op |= IT_CREAT;
625 /* Let's see if we have file open on MDS already. */
626 if (it->it_flags & FMODE_WRITE) {
627 och_p = &lli->lli_mds_write_och;
628 och_usecount = &lli->lli_open_fd_write_count;
629 } else if (it->it_flags & FMODE_EXEC) {
630 och_p = &lli->lli_mds_exec_och;
631 och_usecount = &lli->lli_open_fd_exec_count;
633 och_p = &lli->lli_mds_read_och;
634 och_usecount = &lli->lli_open_fd_read_count;
637 mutex_lock(&lli->lli_och_mutex);
638 if (*och_p) { /* Open handle is present */
639 if (it_disposition(it, DISP_OPEN_OPEN)) {
640 /* Well, there's extra open request that we do not need,
641 let's close it somehow. This will decref request. */
642 rc = it_open_error(DISP_OPEN_OPEN, it);
644 mutex_unlock(&lli->lli_och_mutex);
645 GOTO(out_openerr, rc);
648 ll_release_openhandle(file->f_dentry, it);
/* Reuse the existing MDS handle for this local open. */
652 rc = ll_local_open(file, it, fd, NULL);
655 mutex_unlock(&lli->lli_och_mutex);
656 GOTO(out_openerr, rc);
659 LASSERT(*och_usecount == 0);
660 if (!it->d.lustre.it_disposition) {
661 /* We cannot just request lock handle now, new ELC code
662 means that one of other OPEN locks for this file
663 could be cancelled, and since blocking ast handler
664 would attempt to grab och_mutex as well, that would
665 result in a deadlock */
666 mutex_unlock(&lli->lli_och_mutex);
667 it->it_create_mode |= M_CHECK_STALE;
668 rc = ll_intent_file_open(file, NULL, 0, it);
669 it->it_create_mode &= ~M_CHECK_STALE;
671 GOTO(out_openerr, rc);
675 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
677 GOTO(out_och_free, rc = -ENOMEM);
681 /* md_intent_lock() didn't get a request ref if there was an
682 * open error, so don't do cleanup on the request here
684 /* XXX (green): Should not we bail out on any error here, not
685 * just open error? */
686 rc = it_open_error(DISP_OPEN_OPEN, it);
688 GOTO(out_och_free, rc);
690 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
691 "inode %p: disposition %x, status %d\n", inode,
692 it_disposition(it, ~0), it->d.lustre.it_status);
694 rc = ll_local_open(file, it, fd, *och_p);
696 GOTO(out_och_free, rc);
698 mutex_unlock(&lli->lli_och_mutex);
701 /* Must do this outside lli_och_mutex lock to prevent deadlock where
702 different kind of OPEN lock for this same inode gets cancelled
703 by ldlm_cancel_lru */
704 if (!S_ISREG(inode->i_mode))
705 GOTO(out_och_free, rc);
/* O_LOV_DELAY_CREATE (or read-only open of an object-less file):
 * postpone OST object creation until ll_lov_setstripe(). */
709 if (!lli->lli_has_smd &&
710 (cl_is_lov_delay_create(file->f_flags) ||
711 (file->f_mode & FMODE_WRITE) == 0)) {
712 CDEBUG(D_INODE, "object creation was delayed\n");
713 GOTO(out_och_free, rc);
715 cl_lov_delay_create_clear(&file->f_flags);
716 GOTO(out_och_free, rc);
/* Error-path cleanup (labels not visible in this extraction). */
720 if (och_p && *och_p) {
721 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
722 *och_p = NULL; /* OBD_FREE writes some magic there */
725 mutex_unlock(&lli->lli_och_mutex);
728 if (opendir_set != 0)
729 ll_stop_statahead(inode, lli->lli_opendir_key);
731 ll_file_data_put(fd);
733 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
736 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
737 ptlrpc_req_finished(it->d.lustre.it_data);
738 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * DLM blocking AST for lease locks: on a blocking callback, cancel the
 * lease lock asynchronously; the CANCELING case body is not visible in
 * this extraction.
 */
744 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
745 struct ldlm_lock_desc *desc, void *data, int flag)
748 struct lustre_handle lockh;
752 case LDLM_CB_BLOCKING:
753 ldlm_lock2handle(lock, &lockh);
754 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
756 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
760 case LDLM_CB_CANCELING:
768 * Acquire a lease and open the file.
/* \param fmode       FMODE_READ or FMODE_WRITE (exactly one)
 * \param open_flags  extra MDS open flags OR'ed into the intent
 * \retval obd_client_handle on success, ERR_PTR on failure.
 * NOTE(review): extraction is missing lines (rc/rc2/och_usecount
 * declarations, och allocation, fd->fd_och hand-over, labels). */
770 static struct obd_client_handle *
771 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
774 struct lookup_intent it = { .it_op = IT_OPEN };
775 struct ll_sb_info *sbi = ll_i2sbi(inode);
776 struct md_op_data *op_data;
777 struct ptlrpc_request *req;
778 struct lustre_handle old_handle = { 0 };
779 struct obd_client_handle *och = NULL;
784 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
785 RETURN(ERR_PTR(-EINVAL));
788 struct ll_inode_info *lli = ll_i2info(inode);
789 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
790 struct obd_client_handle **och_p;
/* Lease mode must be a subset of the file's open mode; no exec files. */
793 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
794 RETURN(ERR_PTR(-EPERM));
796 /* Get the openhandle of the file */
798 mutex_lock(&lli->lli_och_mutex);
799 if (fd->fd_lease_och != NULL) {
800 mutex_unlock(&lli->lli_och_mutex);
/* Steal the shared per-mode handle only if we are its sole user. */
804 if (fd->fd_och == NULL) {
805 if (file->f_mode & FMODE_WRITE) {
806 LASSERT(lli->lli_mds_write_och != NULL);
807 och_p = &lli->lli_mds_write_och;
808 och_usecount = &lli->lli_open_fd_write_count;
810 LASSERT(lli->lli_mds_read_och != NULL);
811 och_p = &lli->lli_mds_read_och;
812 och_usecount = &lli->lli_open_fd_read_count;
814 if (*och_usecount == 1) {
821 mutex_unlock(&lli->lli_och_mutex);
822 if (rc < 0) /* more than 1 opener */
825 LASSERT(fd->fd_och != NULL);
826 old_handle = fd->fd_och->och_fh;
831 RETURN(ERR_PTR(-ENOMEM));
833 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
834 LUSTRE_OPC_ANY, NULL);
836 GOTO(out, rc = PTR_ERR(op_data));
838 /* To tell the MDT this openhandle is from the same owner */
839 op_data->op_handle = old_handle;
841 it.it_flags = fmode | open_flags;
842 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
843 rc = md_intent_lock(sbi->ll_md_exp, op_data, NULL, 0, &it, 0, &req,
844 ll_md_blocking_lease_ast,
845 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
846 * it can be cancelled which may mislead applications that the lease is
848 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
849 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
850 * doesn't deal with openhandle, so normal openhandle will be leaked. */
851 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
852 ll_finish_md_op_data(op_data);
853 ptlrpc_req_finished(req);
855 GOTO(out_release_it, rc);
857 if (it_disposition(&it, DISP_LOOKUP_NEG))
858 GOTO(out_release_it, rc = -ENOENT);
860 rc = it_open_error(DISP_OPEN_OPEN, &it);
862 GOTO(out_release_it, rc);
864 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
865 ll_och_fill(sbi->ll_md_exp, &it, och);
867 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
868 GOTO(out_close, rc = -EOPNOTSUPP);
870 /* already get lease, handle lease lock */
871 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
872 if (it.d.lustre.it_lock_mode == 0 ||
873 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
874 /* open lock must return for lease */
875 CERROR(DFID "lease granted but no open lock, %d/%Lu.\n",
876 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
877 it.d.lustre.it_lock_bits);
878 GOTO(out_close, rc = -EPROTO);
881 ll_intent_release(&it);
/* Error paths: undo the open lock, the openhandle, then the intent. */
885 /* Cancel open lock */
886 if (it.d.lustre.it_lock_mode != 0) {
887 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
888 it.d.lustre.it_lock_mode);
889 it.d.lustre.it_lock_mode = 0;
890 och->och_lease_handle.cookie = 0ULL;
892 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
894 CERROR("%s: error closing file "DFID": %d\n",
895 ll_get_fsname(inode->i_sb, NULL, 0),
896 PFID(&ll_i2info(inode)->lli_fid), rc2);
897 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
899 ll_intent_release(&it);
907 * Release lease and close the file.
908 * It will check if the lease has ever broken.
/* \param lease_broken  out (may be NULL): set true if the lease lock was
 *                      already cancelled before we released it. */
910 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
913 struct ldlm_lock *lock;
914 bool cancelled = true;
918 lock = ldlm_handle2lock(&och->och_lease_handle);
/* Inspect the cancel flag under the resource lock to get a stable view. */
920 lock_res_and_lock(lock);
921 cancelled = ldlm_is_cancel(lock);
922 unlock_res_and_lock(lock);
926 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
927 PFID(&ll_i2info(inode)->lli_fid), cancelled);
930 ldlm_cli_cancel(&och->och_lease_handle, 0);
931 if (lease_broken != NULL)
932 *lease_broken = cancelled;
934 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
939 /* Fills the obdo with the attributes for the lsm */
/* Issue an asynchronous OST getattr across all stripes of @lsm and wait
 * for it; @dv_flags may request server-side read/write flushes for data
 * version.  NOTE(review): extraction is missing lines (rc declaration,
 * oinfo.oi_oa/oi_md setup, error returns). */
940 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
941 struct obd_capa *capa, struct obdo *obdo,
942 __u64 ioepoch, int dv_flags)
944 struct ptlrpc_request_set *set;
945 struct obd_info oinfo = { { { 0 } } };
950 LASSERT(lsm != NULL);
954 oinfo.oi_oa->o_oi = lsm->lsm_oi;
955 oinfo.oi_oa->o_mode = S_IFREG;
956 oinfo.oi_oa->o_ioepoch = ioepoch;
957 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
958 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
959 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
960 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
961 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
962 OBD_MD_FLDATAVERSION;
963 oinfo.oi_capa = capa;
/* SRVLOCK asks the OST to take the lock server-side for the flush. */
964 if (dv_flags & (LL_DV_WR_FLUSH | LL_DV_RD_FLUSH)) {
965 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
966 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
967 if (dv_flags & LL_DV_WR_FLUSH)
968 oinfo.oi_oa->o_flags |= OBD_FL_FLUSH;
971 set = ptlrpc_prep_set();
973 CERROR("can't allocate ptlrpc set\n");
976 rc = obd_getattr_async(exp, &oinfo, set);
978 rc = ptlrpc_set_wait(set);
979 ptlrpc_set_destroy(set);
/* Only propagate the attribute bits the caller can trust. */
982 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
983 OBD_MD_FLATIME | OBD_MD_FLMTIME |
984 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
985 OBD_MD_FLDATAVERSION | OBD_MD_FLFLAGS);
/* A requested write-flush that the server did not confirm is an error. */
986 if (dv_flags & LL_DV_WR_FLUSH &&
987 !(oinfo.oi_oa->o_valid & OBD_MD_FLFLAGS &&
988 oinfo.oi_oa->o_flags & OBD_FL_FLUSH))
995 * Performs the getattr on the inode and updates its fields.
996 * If @sync != 0, perform the getattr under the server-side lock.
998 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
999 __u64 ioepoch, int sync)
1001 struct obd_capa *capa = ll_mdscapa_get(inode);
1002 struct lov_stripe_md *lsm;
1006 lsm = ccc_inode_lsm_get(inode);
1007 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
1008 capa, obdo, ioepoch, sync ? LL_DV_RD_FLUSH : 0);
1011 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
/* Push the freshly fetched OST attributes into the VFS inode. */
1013 obdo_refresh_inode(inode, obdo, obdo->o_valid);
1014 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
1015 " blksize %lu\n", POSTID(oi), i_size_read(inode),
1016 (unsigned long long)inode->i_blocks,
1017 (unsigned long)ll_inode_blksize(inode));
1019 ccc_inode_lsm_put(inode, lsm);
/*
 * Merge MDS-provided timestamps with OST-side attributes (cl_object_attr)
 * into the inode, under the inode size lock.  Size and blocks come from
 * the cl_object; each timestamp takes the maximum of the two sources.
 * NOTE(review): extraction is missing lines (lvb declaration, rc == 0
 * guard, return).
 */
1023 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
1025 struct ll_inode_info *lli = ll_i2info(inode);
1026 struct cl_object *obj = lli->lli_clob;
1027 struct cl_attr *attr = ccc_env_thread_attr(env);
1033 ll_inode_size_lock(inode);
1034 /* merge timestamps the most recently obtained from mds with
1035 timestamps obtained from osts */
1036 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
1037 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
1038 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
1039 inode_init_lvb(inode, &lvb);
1041 cl_object_attr_lock(obj);
1042 rc = cl_object_attr_get(env, obj, attr);
1043 cl_object_attr_unlock(obj);
/* Keep the most recent of MDS vs. OST timestamps. */
1046 if (lvb.lvb_atime < attr->cat_atime)
1047 lvb.lvb_atime = attr->cat_atime;
1048 if (lvb.lvb_ctime < attr->cat_ctime)
1049 lvb.lvb_ctime = attr->cat_ctime;
1050 if (lvb.lvb_mtime < attr->cat_mtime)
1051 lvb.lvb_mtime = attr->cat_mtime;
1053 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
1054 PFID(&lli->lli_fid), attr->cat_size);
1055 cl_isize_write_nolock(inode, attr->cat_size);
1057 inode->i_blocks = attr->cat_blocks;
1059 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1060 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1061 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1063 ll_inode_size_unlock(inode);
/*
 * Fetch current OST attributes for @lsm and copy size/blocks/times into
 * the user-visible stat buffer @st.  NOTE(review): the rc check and return
 * are not visible in this extraction.
 */
1068 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1071 struct obdo obdo = { 0 };
1074 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1076 st->st_size = obdo.o_size;
1077 st->st_blocks = obdo.o_blocks;
1078 st->st_mtime = obdo.o_mtime;
1079 st->st_atime = obdo.o_atime;
1080 st->st_ctime = obdo.o_ctime;
/*
 * Decide whether atime updates should be suppressed for @file, checking
 * the open flags, inode flags, mount flags and superblock flags in turn.
 * NOTE(review): the "return true/false" lines between the checks are not
 * visible in this extraction.
 */
1085 static bool file_is_noatime(const struct file *file)
1087 const struct vfsmount *mnt = file->f_path.mnt;
1088 const struct inode *inode = file->f_path.dentry->d_inode;
1090 /* Adapted from file_accessed() and touch_atime().*/
1091 if (file->f_flags & O_NOATIME)
1094 if (inode->i_flags & S_NOATIME)
1097 if (IS_NOATIME(inode))
1100 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1103 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1106 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/*
 * Initialize a cl_io for a read (@write == 0) or write (@write != 0) on
 * @file: non-blocking/append/sync flags, lock policy (never for nolock
 * files, mandatory for O_APPEND, otherwise "maybe") and noatime.
 */
1112 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
1114 struct inode *inode = file->f_dentry->d_inode;
1116 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1118 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1119 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1120 file->f_flags & O_DIRECT ||
1123 io->ci_obj = ll_i2info(inode)->lli_clob;
1124 io->ci_lockreq = CILR_MAYBE;
1125 if (ll_file_nolock(file)) {
1126 io->ci_lockreq = CILR_NEVER;
1127 io->ci_no_srvlock = 1;
1128 } else if (file->f_flags & O_APPEND) {
1129 io->ci_lockreq = CILR_MANDATORY;
1132 io->ci_noatime = file_is_noatime(file);
/*
 * Common engine for all read/write paths (normal and splice): set up a
 * cl_io, take the write mutex / truncation semaphore as needed, run the
 * IO loop, update *ppos and per-mount stats, and restart short IOs that
 * transferred nothing.  NOTE(review): extraction is missing lines (return
 * type, result/io declarations, "restart:" label, case labels, braces).
 */
1136 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1137 struct file *file, enum cl_io_type iot,
1138 loff_t *ppos, size_t count)
1140 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
1141 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1146 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zd\n",
1147 file->f_dentry->d_name.name, iot, *ppos, count);
1150 io = ccc_env_thread_io(env);
1151 ll_io_init(io, file, iot == CIT_WRITE);
1153 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1154 struct vvp_io *vio = vvp_env_io(env);
1155 struct ccc_io *cio = ccc_env_io(env);
1156 int write_mutex_locked = 0;
1158 cio->cui_fd = LUSTRE_FPRIVATE(file);
1159 vio->cui_io_subtype = args->via_io_subtype;
1161 switch (vio->cui_io_subtype) {
1163 cio->cui_iov = args->u.normal.via_iov;
1164 cio->cui_nrsegs = args->u.normal.via_nrsegs;
1165 cio->cui_tot_nrsegs = cio->cui_nrsegs;
1166 cio->cui_iocb = args->u.normal.via_iocb;
/* Serialize writers unless the group lock already provides exclusion. */
1167 if ((iot == CIT_WRITE) &&
1168 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1169 if (mutex_lock_interruptible(&lli->
1171 GOTO(out, result = -ERESTARTSYS);
1172 write_mutex_locked = 1;
1174 down_read(&lli->lli_trunc_sem);
1177 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1178 vio->u.splice.cui_flags = args->u.splice.via_flags;
1181 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
1184 result = cl_io_loop(env, io);
1185 if (args->via_io_subtype == IO_NORMAL)
1186 up_read(&lli->lli_trunc_sem);
1187 if (write_mutex_locked)
1188 mutex_unlock(&lli->lli_write_mutex);
1190 /* cl_io_rw_init() handled IO */
1191 result = io->ci_result;
1194 if (io->ci_nob > 0) {
1195 result = io->ci_nob;
1196 *ppos = io->u.ci_wr.wr.crw_pos;
1200 cl_io_fini(env, io);
1201 /* If any bit been read/written (result != 0), we just return
1202 * short read/write instead of restart io. */
1203 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1204 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
1205 iot == CIT_READ ? "read" : "write",
1206 file->f_dentry->d_name.name, *ppos, count);
1207 LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
1211 if (iot == CIT_READ) {
1213 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1214 LPROC_LL_READ_BYTES, result);
1215 } else if (iot == CIT_WRITE) {
1217 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1218 LPROC_LL_WRITE_BYTES, result);
1219 fd->fd_write_failed = false;
1220 } else if (result != -ERESTARTSYS) {
1221 fd->fd_write_failed = true;
1224 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1231 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate a user iovec array and accumulate the total byte count.
 * A segment with a negative length, or a cumulative length that wraps
 * negative, yields -EINVAL; an inaccessible segment truncates the
 * count and *nr_segs at that point (see the XXX note above: this is
 * copied from the kernel's __generic_file_aio_write_nolock()).
 */
1233 static int ll_file_get_iov_count(const struct iovec *iov,
1234 unsigned long *nr_segs, size_t *count)
1239 for (seg = 0; seg < *nr_segs; seg++) {
1240 const struct iovec *iv = &iov[seg];
1243 * If any segment has a negative length, or the cumulative
1244 * length ever wraps negative then return -EINVAL.
1247 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
/* access_ok: segment readable from user space; otherwise fall
 * through and shorten the request at this segment. */
1249 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1254 cnt -= iv->iov_len; /* This segment is no good */
/*
 * AIO read entry point: validate the iovec, package it into per-env
 * vvp_io_args (IO_NORMAL) and run the common IO path with CIT_READ.
 * Returns bytes read or a negative errno.
 */
1261 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1262 unsigned long nr_segs, loff_t pos)
1265 struct vvp_io_args *args;
1271 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1275 env = cl_env_get(&refcheck);
1277 RETURN(PTR_ERR(env));
1279 args = vvp_env_args(env, IO_NORMAL);
/* cast drops const: args stores a mutable iovec pointer */
1280 args->u.normal.via_iov = (struct iovec *)iov;
1281 args->u.normal.via_nrsegs = nr_segs;
1282 args->u.normal.via_iocb = iocb;
1284 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1285 &iocb->ki_pos, count);
1286 cl_env_put(env, &refcheck);
/*
 * Synchronous read: wrap the user buffer in a single-segment iovec and
 * a sync kiocb (kept in per-env scratch space), then delegate to
 * ll_file_aio_read().  *ppos is updated from the kiocb on return.
 */
1290 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1294 struct iovec *local_iov;
1295 struct kiocb *kiocb;
1300 env = cl_env_get(&refcheck);
1302 RETURN(PTR_ERR(env));
1304 local_iov = &vvp_env_info(env)->vti_local_iov;
1305 kiocb = &vvp_env_info(env)->vti_kiocb;
1306 local_iov->iov_base = (void __user *)buf;
1307 local_iov->iov_len = count;
1308 init_sync_kiocb(kiocb, file);
1309 kiocb->ki_pos = *ppos;
/* field name for the remaining-bytes counter differs across kernels */
1310 #ifdef HAVE_KIOCB_KI_LEFT
1311 kiocb->ki_left = count;
1313 kiocb->ki_nbytes = count;
1316 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1317 *ppos = kiocb->ki_pos;
1319 cl_env_put(env, &refcheck);
1324 * Write to a file (through the page cache).
/*
 * AIO write entry point: mirror of ll_file_aio_read() — validate the
 * iovec, fill IO_NORMAL args and run the common path with CIT_WRITE.
 * Returns bytes written or a negative errno.
 */
1327 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1328 unsigned long nr_segs, loff_t pos)
1331 struct vvp_io_args *args;
1337 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1341 env = cl_env_get(&refcheck);
1343 RETURN(PTR_ERR(env));
1345 args = vvp_env_args(env, IO_NORMAL);
1346 args->u.normal.via_iov = (struct iovec *)iov;
1347 args->u.normal.via_nrsegs = nr_segs;
1348 args->u.normal.via_iocb = iocb;
1350 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1351 &iocb->ki_pos, count);
1352 cl_env_put(env, &refcheck);
/*
 * Synchronous write: mirror of ll_file_read() — build a single-segment
 * iovec plus a sync kiocb from per-env scratch space and delegate to
 * ll_file_aio_write().  *ppos is updated from the kiocb on return.
 */
1356 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1360 struct iovec *local_iov;
1361 struct kiocb *kiocb;
1366 env = cl_env_get(&refcheck);
1368 RETURN(PTR_ERR(env));
1370 local_iov = &vvp_env_info(env)->vti_local_iov;
1371 kiocb = &vvp_env_info(env)->vti_kiocb;
1372 local_iov->iov_base = (void __user *)buf;
1373 local_iov->iov_len = count;
1374 init_sync_kiocb(kiocb, file);
1375 kiocb->ki_pos = *ppos;
/* field name for the remaining-bytes counter differs across kernels */
1376 #ifdef HAVE_KIOCB_KI_LEFT
1377 kiocb->ki_left = count;
1379 kiocb->ki_nbytes = count;
1382 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1383 *ppos = kiocb->ki_pos;
1385 cl_env_put(env, &refcheck);
1390 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read implementation: run a CIT_READ io with IO_SPLICE args so
 * pages are fed directly into @pipe instead of a user iovec.
 */
1392 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1393 struct pipe_inode_info *pipe, size_t count,
1397 struct vvp_io_args *args;
1402 env = cl_env_get(&refcheck);
1404 RETURN(PTR_ERR(env));
1406 args = vvp_env_args(env, IO_SPLICE);
1407 args->u.splice.via_pipe = pipe;
1408 args->u.splice.via_flags = flags;
1410 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1411 cl_env_put(env, &refcheck);
/*
 * Ask the data stack (LOV/OSC) to recreate the OST objects backing
 * @inode, identified by @oi, on the given OST index.  A copy of the
 * inode's stripe md is handed to obd_create() with the
 * OBD_FL_RECREATE_OBJS flag set, under the inode size lock.
 * Returns 0 or a negative errno (-ENOENT if the file has no objects).
 */
1415 static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
1418 struct obd_export *exp = ll_i2dtexp(inode);
1419 struct obd_trans_info oti = { 0 };
1420 struct obdo *oa = NULL;
1423 struct lov_stripe_md *lsm = NULL, *lsm2;
1430 lsm = ccc_inode_lsm_get(inode);
1431 if (!lsm_has_objects(lsm))
1432 GOTO(out, rc = -ENOENT);
/* stripe md size = header plus one lov_oinfo per stripe */
1434 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1435 (lsm->lsm_stripe_count));
1437 OBD_ALLOC_LARGE(lsm2, lsm_size);
1439 GOTO(out, rc = -ENOMEM);
/* o_nlink carries the target OST index for the recreate request */
1442 oa->o_nlink = ost_idx;
1443 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1444 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1445 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1446 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1447 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1448 memcpy(lsm2, lsm, lsm_size);
1449 ll_inode_size_lock(inode);
1450 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1451 ll_inode_size_unlock(inode);
1453 OBD_FREE_LARGE(lsm2, lsm_size);
1456 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_RECREATE_OBJ handler: copy a ll_recreate_obj request from
 * user space and recreate the object by (mdt0-sequence) object id on
 * the requested OST index.  Admin-only (CFS_CAP_SYS_ADMIN).
 */
1461 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1463 struct ll_recreate_obj ucreat;
1467 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1470 if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1474 ostid_set_seq_mdt0(&oi);
1475 ostid_set_id(&oi, ucreat.lrc_id);
1476 RETURN(ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx));
/*
 * LL_IOC_RECREATE_FID handler: recreate an object identified by a
 * FID supplied from user space.  The OST index is extracted from the
 * FID sequence number.  Admin-only (CFS_CAP_SYS_ADMIN).
 */
1479 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1486 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1489 if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1492 fid_to_ostid(&fid, &oi);
/* IDIF FIDs encode the OST index in bits 16..31 of the sequence */
1493 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1494 RETURN(ll_lov_recreate(inode, &oi, ost_idx));
/*
 * Apply a striping EA (@lum, @lum_size) to @inode by re-opening the
 * file with an IT_OPEN intent that carries the layout.  Fails with
 * -EEXIST if the file already has a stripe md.  The open handle
 * obtained for the operation is released before returning, and the
 * O_LOV_DELAY_CREATE state is cleared from the file flags.
 */
1497 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1498 __u64 flags, struct lov_user_md *lum,
1501 struct lov_stripe_md *lsm = NULL;
1502 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
/* a layout can only be set once: existing stripe md means -EEXIST */
1506 lsm = ccc_inode_lsm_get(inode);
1508 ccc_inode_lsm_put(inode, lsm);
1509 CDEBUG(D_IOCTL, "stripe already exists for inode "DFID"\n",
1510 PFID(ll_inode2fid(inode)));
1511 GOTO(out, rc = -EEXIST);
1514 ll_inode_size_lock(inode);
1515 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1517 GOTO(out_unlock, rc);
1518 rc = oit.d.lustre.it_status;
1520 GOTO(out_req_free, rc);
/* the open handle was only needed to carry the layout; drop it */
1522 ll_release_openhandle(file->f_dentry, &oit);
1525 ll_inode_size_unlock(inode);
1526 ll_intent_release(&oit);
1527 ccc_inode_lsm_put(inode, lsm);
1529 cl_lov_delay_create_clear(&file->f_flags);
1532 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA of @filename (relative to directory @inode) from
 * the MDS via md_getattr_name().  On success *lmmp points into the
 * reply buffer (caller keeps *request alive and must finish it),
 * and *lmm_size is set.  The EA is byte-swapped to host endianness on
 * big-endian machines; object entries are only swabbed for regular
 * files since directory default EAs carry no objects.
 */
1536 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1537 struct lov_mds_md **lmmp, int *lmm_size,
1538 struct ptlrpc_request **request)
1540 struct ll_sb_info *sbi = ll_i2sbi(inode);
1541 struct mdt_body *body;
1542 struct lov_mds_md *lmm = NULL;
1543 struct ptlrpc_request *req = NULL;
1544 struct md_op_data *op_data;
1547 rc = ll_get_default_mdsize(sbi, &lmmsize);
1551 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1552 strlen(filename), lmmsize,
1553 LUSTRE_OPC_ANY, NULL);
1554 if (IS_ERR(op_data))
1555 RETURN(PTR_ERR(op_data));
1557 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1558 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1559 ll_finish_md_op_data(op_data);
1561 CDEBUG(D_INFO, "md_getattr_name failed "
1562 "on %s: rc %d\n", filename, rc);
1566 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1567 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1569 lmmsize = body->eadatasize;
/* no EA in the reply means the file/dir has no striping */
1571 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1573 GOTO(out, rc = -ENODATA);
1576 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1577 LASSERT(lmm != NULL);
1579 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1580 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1581 GOTO(out, rc = -EPROTO);
1585 * This is coming from the MDS, so is probably in
1586 * little endian. We convert it to host endian before
1587 * passing it to userspace.
/* true only on big-endian hosts: on LE this comparison is false
 * and no swabbing is needed */
1589 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1592 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1593 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1596 /* if function called for directory - we should
1597 * avoid swab not existent lsm objects */
1598 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1599 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1600 if (S_ISREG(body->mode))
1601 lustre_swab_lov_user_md_objects(
1602 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1604 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1605 lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1606 if (S_ISREG(body->mode))
1607 lustre_swab_lov_user_md_objects(
1608 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1615 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: copy a lov_user_md (with one ost_data
 * entry) from user space and apply it via ll_lov_setstripe_ea_info()
 * with MDS_OPEN_HAS_OBJS | FMODE_WRITE.  Admin-only.
 */
1620 static int ll_lov_setea(struct inode *inode, struct file *file,
1623 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1624 struct lov_user_md *lump;
1625 int lum_size = sizeof(struct lov_user_md) +
1626 sizeof(struct lov_user_ost_data);
1630 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1633 OBD_ALLOC_LARGE(lump, lum_size);
1637 if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1638 OBD_FREE_LARGE(lump, lum_size);
1642 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1644 OBD_FREE_LARGE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE handler: read the user's lov_user_md (v1 first,
 * re-read as v3 if the magic says so), apply it with FMODE_WRITE, and
 * on success refresh the layout and copy the resulting striping back
 * to the user buffer via LL_IOC_LOV_GETSTRIPE.
 */
1648 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1651 struct lov_user_md_v3 lumv3;
1652 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1653 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1654 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1656 __u64 flags = FMODE_WRITE;
1659 /* first try with v1 which is smaller than v3 */
1660 lum_size = sizeof(struct lov_user_md_v1);
1661 if (copy_from_user(lumv1, lumv1p, lum_size))
1664 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1665 lum_size = sizeof(struct lov_user_md_v3);
1666 if (copy_from_user(&lumv3, lumv3p, lum_size))
1670 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1672 struct lov_stripe_md *lsm;
/* tell the caller how many stripes were actually created */
1675 put_user(0, &lumv1p->lmm_stripe_count);
1677 ll_layout_refresh(inode, &gen);
1678 lsm = ccc_inode_lsm_get(inode);
1679 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1680 0, lsm, (void *)arg);
1681 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_LOV_GETSTRIPE handler: hand the inode's stripe md to the data
 * export's iocontrol, which copies the striping info to user space.
 */
1686 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1688 struct lov_stripe_md *lsm;
1692 lsm = ccc_inode_lsm_get(inode);
1694 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1696 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_GROUP_LOCK handler: take a cluster-wide group lock with gid
 * @arg on behalf of this file descriptor.  Only one group lock per fd;
 * fd_flags/fd_grouplock are protected by lli_lock, and the race with a
 * concurrent locker is re-checked after cl_get_grouplock() returns.
 */
1701 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1703 struct ll_inode_info *lli = ll_i2info(inode);
1704 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1705 struct ccc_grouplock grouplock;
1709 if (ll_file_nolock(file))
1710 RETURN(-EOPNOTSUPP);
1712 spin_lock(&lli->lli_lock);
1713 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1714 CWARN("group lock already existed with gid %lu\n",
1715 fd->fd_grouplock.cg_gid);
1716 spin_unlock(&lli->lli_lock);
1719 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1720 spin_unlock(&lli->lli_lock);
/* may block (unless O_NONBLOCK), so lli_lock is dropped first */
1722 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1723 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* re-check under the lock: another thread may have won meanwhile */
1727 spin_lock(&lli->lli_lock);
1728 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1729 spin_unlock(&lli->lli_lock);
1730 CERROR("another thread just won the race\n");
1731 cl_put_grouplock(&grouplock);
1735 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1736 fd->fd_grouplock = grouplock;
1737 spin_unlock(&lli->lli_lock);
1739 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK handler: release the group lock held by this fd,
 * after verifying one is held and its gid matches @arg.  The grouplock
 * is copied out and cleared under lli_lock, then released outside it.
 */
1743 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1745 struct ll_inode_info *lli = ll_i2info(inode);
1746 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1747 struct ccc_grouplock grouplock;
1750 spin_lock(&lli->lli_lock);
1751 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1752 spin_unlock(&lli->lli_lock);
1753 CWARN("no group lock held\n");
1756 LASSERT(fd->fd_grouplock.cg_lock != NULL);
1758 if (fd->fd_grouplock.cg_gid != arg) {
1759 CWARN("group lock %lu doesn't match current id %lu\n",
1760 arg, fd->fd_grouplock.cg_gid);
1761 spin_unlock(&lli->lli_lock);
/* detach from the fd under the lock, release after dropping it */
1765 grouplock = fd->fd_grouplock;
1766 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1767 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1768 spin_unlock(&lli->lli_lock);
1770 cl_put_grouplock(&grouplock);
1771 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1776 * Close inode open handle
1778 * \param dentry [in] dentry which contains the inode
1779 * \param it [in,out] intent which contains open info and result
1782 * \retval <0 failure
/*
 * Close the MDS open handle carried by intent @it for @dentry's inode
 * (see the doxygen block above).  No-op for the fs root or when the
 * intent holds no DISP_OPEN_OPEN disposition.  Also drops the enqueue
 * open reference (DISP_ENQ_OPEN_REF) if the intent still holds one.
 */
1784 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1786 struct inode *inode = dentry->d_inode;
1787 struct obd_client_handle *och;
1793 /* Root ? Do nothing. */
1794 if (dentry->d_inode->i_sb->s_root == dentry)
1797 /* No open handle to close? Move away */
1798 if (!it_disposition(it, DISP_OPEN_OPEN))
1801 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1803 OBD_ALLOC(och, sizeof(*och));
1805 GOTO(out, rc = -ENOMEM);
1807 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1809 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1812 /* this one is in place of ll_file_open */
1813 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1814 ptlrpc_req_finished(it->d.lustre.it_data);
1815 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1821 * Get size for inode for which FIEMAP mapping is requested.
1822 * Make the FIEMAP get_info call and returns the result.
/*
 * Perform the FIEMAP mapping for @inode: validate the flags, honour
 * FIEMAP_FLAG_SYNC by flushing dirty pages, then ask the data export
 * via obd_get_info(KEY_FIEMAP) to fill @fiemap (whose extent buffer is
 * @num_bytes long).  Multi-stripe files require the caller to accept
 * FIEMAP_FLAG_DEVICE_ORDER, otherwise -EOPNOTSUPP.
 */
1824 static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1827 struct obd_export *exp = ll_i2dtexp(inode);
1828 struct lov_stripe_md *lsm = NULL;
1829 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1830 int vallen = num_bytes;
1834 /* Checks for fiemap flags */
1835 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* report back which flags we do support, then fail */
1836 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1840 /* Check for FIEMAP_FLAG_SYNC */
1841 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1842 rc = filemap_fdatawrite(inode->i_mapping);
1847 lsm = ccc_inode_lsm_get(inode);
1851 /* If the stripe_count > 1 and the application does not understand
1852 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1854 if (lsm->lsm_stripe_count > 1 &&
1855 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1856 GOTO(out, rc = -EOPNOTSUPP);
1858 fm_key.oa.o_oi = lsm->lsm_oi;
1859 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1861 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1862 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1863 /* If filesize is 0, then there would be no objects for mapping */
1864 if (fm_key.oa.o_size == 0) {
1865 fiemap->fm_mapped_extents = 0;
1869 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1871 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1874 CERROR("obd_get_info failed: rc = %d\n", rc);
1877 ccc_inode_lsm_put(inode, lsm);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a path via the MDC.
 * The user-supplied getinfo_fid2path header is copied in to learn
 * gf_pathlen, a reply buffer of that size is allocated, and the filled
 * structure (header + path) is copied back.  Restricted to users with
 * CFS_CAP_DAC_READ_SEARCH unless the sb allows user fid2path.
 */
1881 int ll_fid2path(struct inode *inode, void *arg)
1883 struct obd_export *exp = ll_i2mdexp(inode);
1884 struct getinfo_fid2path *gfout, *gfin;
1888 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1889 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1892 /* Need to get the buflen */
1893 OBD_ALLOC_PTR(gfin);
1896 if (copy_from_user(gfin, arg, sizeof(*gfin))) {
/* output buffer = fixed header + user-requested path length */
1901 outsize = sizeof(*gfout) + gfin->gf_pathlen;
1902 OBD_ALLOC(gfout, outsize);
1903 if (gfout == NULL) {
1907 memcpy(gfout, gfin, sizeof(*gfout));
1910 /* Call mdc_iocontrol */
1911 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1915 if (copy_to_user(arg, gfout, outsize))
1919 OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP handler: size a kernel fiemap buffer from the
 * user's fm_extent_count, copy the request (and, for continuation
 * requests, the first extent) in, run ll_do_fiemap(), and copy the
 * header plus mapped extents back to user space.
 */
1923 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1925 struct ll_user_fiemap *fiemap_s;
1926 size_t num_bytes, ret_bytes;
1927 unsigned int extent_count;
1930 /* Get the extent count so we can calculate the size of
1931 * required fiemap buffer */
1932 if (get_user(extent_count,
1933 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1935 num_bytes = sizeof(*fiemap_s) + (extent_count *
1936 sizeof(struct ll_fiemap_extent));
1938 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1939 if (fiemap_s == NULL)
1942 /* get the fiemap value */
1943 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1945 GOTO(error, rc = -EFAULT);
1947 /* If fm_extent_count is non-zero, read the first extent since
1948 * it is used to calculate end_offset and device from previous
1951 if (copy_from_user(&fiemap_s->fm_extents[0],
1952 (char __user *)arg + sizeof(*fiemap_s),
1953 sizeof(struct ll_fiemap_extent)))
1954 GOTO(error, rc = -EFAULT);
1957 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
/* copy back the header plus however many extents were mapped */
1961 ret_bytes = sizeof(struct ll_user_fiemap);
1963 if (extent_count != 0)
1964 ret_bytes += (fiemap_s->fm_mapped_extents *
1965 sizeof(struct ll_fiemap_extent));
1967 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1971 OBD_FREE_LARGE(fiemap_s, num_bytes);
1976 * Read the data_version for inode.
1978 * This value is computed using stripe object version on OST.
1979 * Version is computed using server side locking.
1981 * @param sync if do sync on the OST side;
1983 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1984 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/*
 * Read the data version of @inode (see the doxygen block above).
 * Files without objects report version 0; otherwise the version is
 * obtained from the OSTs via ll_lsm_getattr() with the given flush
 * @flags (LL_DV_RD_FLUSH / LL_DV_WR_FLUSH).
 */
1986 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
1988 struct lov_stripe_md *lsm = NULL;
1989 struct ll_sb_info *sbi = ll_i2sbi(inode);
1990 struct obdo *obdo = NULL;
1994 /* If no stripe, we consider version is 0. */
1995 lsm = ccc_inode_lsm_get(inode);
1996 if (!lsm_has_objects(lsm)) {
1998 CDEBUG(D_INODE, "No object for inode\n");
2002 OBD_ALLOC_PTR(obdo);
2004 GOTO(out, rc = -ENOMEM);
2006 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, flags);
/* the OSTs must actually report a version for the result to be valid */
2008 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
2011 *data_version = obdo->o_data_version;
2017 ccc_inode_lsm_put(inode, lsm);
2022 * Trigger a HSM release request for the provided inode.
/*
 * Trigger an HSM release for @inode: take a write lease opened with
 * MDS_OPEN_RELEASE, flush and grab the latest data version, merge
 * attributes from the OST lvb, then close the handle with the release
 * request.  The lease lock handle is kept until the close is packed
 * (see comment below).  On any error path the lease is closed.
 */
2024 int ll_hsm_release(struct inode *inode)
2026 struct cl_env_nest nest;
2028 struct obd_client_handle *och = NULL;
2029 __u64 data_version = 0;
2033 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2034 ll_get_fsname(inode->i_sb, NULL, 0),
2035 PFID(&ll_i2info(inode)->lli_fid));
2037 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2039 GOTO(out, rc = PTR_ERR(och));
2041 /* Grab latest data_version and [am]time values */
2042 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2046 env = cl_env_nested_get(&nest);
2048 GOTO(out, rc = PTR_ERR(env));
2050 ll_merge_lvb(env, inode);
2051 cl_env_nested_put(&nest, env);
2053 /* Release the file.
2054 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2055 * we still need it to pack l_remote_handle to MDT. */
2056 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
2062 if (och != NULL && !IS_ERR(och)) /* close the file */
2063 ll_lease_close(och, inode, NULL);
/* Scratch state for ll_swap_layouts(): saved [am]times to restore
 * after the swap (ia1/ia2), the two inodes involved, and whether each
 * side's data version must be verified before swapping. */
2068 struct ll_swap_stack {
2069 struct iattr ia1, ia2;
2071 struct inode *inode1, *inode2;
2072 bool check_dv1, check_dv2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS worker: swap the layouts of file1 and file2
 * on the MDT.  Both files must be regular, writable and on the same
 * filesystem.  The pair is ordered by FID to serialize concurrent
 * swaps; an optional group lock (gid) flushes dirty cache on both
 * sides; optional data-version checks abort with -EAGAIN if either
 * file changed; and mtime/atime are restored afterwards when the
 * SWAP_LAYOUTS_KEEP_* flags request it.
 */
2075 static int ll_swap_layouts(struct file *file1, struct file *file2,
2076 struct lustre_swap_layouts *lsl)
2078 struct mdc_swap_layouts msl;
2079 struct md_op_data *op_data;
2082 struct ll_swap_stack *llss = NULL;
2085 OBD_ALLOC_PTR(llss);
2089 llss->inode1 = file1->f_dentry->d_inode;
2090 llss->inode2 = file2->f_dentry->d_inode;
2092 if (!S_ISREG(llss->inode2->i_mode))
2093 GOTO(free, rc = -EINVAL);
2095 if (inode_permission(llss->inode1, MAY_WRITE) ||
2096 inode_permission(llss->inode2, MAY_WRITE))
2097 GOTO(free, rc = -EPERM);
2099 if (llss->inode2->i_sb != llss->inode1->i_sb)
2100 GOTO(free, rc = -EXDEV);
2102 /* we use 2 bool because it is easier to swap than 2 bits */
2103 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2104 llss->check_dv1 = true;
2106 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2107 llss->check_dv2 = true;
2109 /* we cannot use lsl->sl_dvX directly because we may swap them */
2110 llss->dv1 = lsl->sl_dv1;
2111 llss->dv2 = lsl->sl_dv2;
2113 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2114 if (rc == 0) /* same file, done! */
/* order the pair by FID so two concurrent swaps cannot deadlock */
2117 if (rc < 0) { /* sequentialize it */
2118 swap(llss->inode1, llss->inode2);
2120 swap(llss->dv1, llss->dv2);
2121 swap(llss->check_dv1, llss->check_dv2);
2125 if (gid != 0) { /* application asks to flush dirty cache */
2126 rc = ll_get_grouplock(llss->inode1, file1, gid);
2130 rc = ll_get_grouplock(llss->inode2, file2, gid);
2132 ll_put_grouplock(llss->inode1, file1, gid);
2137 /* to be able to restore mtime and atime after swap
2138 * we need to first save them */
2140 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2141 llss->ia1.ia_mtime = llss->inode1->i_mtime;
2142 llss->ia1.ia_atime = llss->inode1->i_atime;
2143 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2144 llss->ia2.ia_mtime = llss->inode2->i_mtime;
2145 llss->ia2.ia_atime = llss->inode2->i_atime;
2146 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2149 /* ultimate check, before swaping the layouts we check if
2150 * dataversion has changed (if requested) */
2151 if (llss->check_dv1) {
2152 rc = ll_data_version(llss->inode1, &dv, 0);
2155 if (dv != llss->dv1)
2156 GOTO(putgl, rc = -EAGAIN);
2159 if (llss->check_dv2) {
2160 rc = ll_data_version(llss->inode2, &dv, 0);
2163 if (dv != llss->dv2)
2164 GOTO(putgl, rc = -EAGAIN);
2167 /* struct md_op_data is used to send the swap args to the mdt
2168 * only flags is missing, so we use struct mdc_swap_layouts
2169 * through the md_op_data->op_data */
2170 /* flags from user space have to be converted before they are send to
2171 * server, no flag is sent today, they are only used on the client */
2174 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2175 0, LUSTRE_OPC_ANY, &msl);
2176 if (IS_ERR(op_data))
2177 GOTO(free, rc = PTR_ERR(op_data));
2179 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2180 sizeof(*op_data), op_data, NULL);
2181 ll_finish_md_op_data(op_data);
2185 ll_put_grouplock(llss->inode2, file2, gid);
2186 ll_put_grouplock(llss->inode1, file1, gid);
2189 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2193 /* clear useless flags */
2194 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2195 llss->ia1.ia_valid &= ~ATTR_MTIME;
2196 llss->ia2.ia_valid &= ~ATTR_MTIME;
2199 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2200 llss->ia1.ia_valid &= ~ATTR_ATIME;
2201 llss->ia2.ia_valid &= ~ATTR_ATIME;
2204 /* update time if requested */
/* note the cross-application: ia2 (saved from inode2) is applied to
 * file1 and ia1 to file2, matching the swapped layouts */
2206 if (llss->ia2.ia_valid != 0) {
2207 mutex_lock(&llss->inode1->i_mutex);
2208 rc = ll_setattr(file1->f_dentry, &llss->ia2);
2209 mutex_unlock(&llss->inode1->i_mutex);
2212 if (llss->ia1.ia_valid != 0) {
2215 mutex_lock(&llss->inode2->i_mutex);
2216 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2217 mutex_unlock(&llss->inode2->i_mutex);
/*
 * Set/clear HSM state flags on @inode via the MDC.  Flags outside
 * HSM_USER_MASK require CFS_CAP_SYS_ADMIN; the hsm_state_set request
 * is carried through md_op_data->op_data to the server.
 */
2229 static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2231 struct md_op_data *op_data;
2234 /* Non-root users are forbidden to set or clear flags which are
2235 * NOT defined in HSM_USER_MASK. */
2236 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2237 !cfs_capable(CFS_CAP_SYS_ADMIN))
2240 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2241 LUSTRE_OPC_ANY, hss);
2242 if (IS_ERR(op_data))
2243 RETURN(PTR_ERR(op_data));
2245 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2246 sizeof(*op_data), op_data, NULL);
2248 ll_finish_md_op_data(op_data);
/*
 * Import an HSM-archived file: mark the (regular) file as
 * ARCHIVED|EXISTS|RELEASED in the given archive, then restore the
 * user-supplied mode, ownership, size and timestamps via
 * ll_setattr_raw().  Used by LL_IOC_HSM_IMPORT.
 */
2253 static int ll_hsm_import(struct inode *inode, struct file *file,
2254 struct hsm_user_import *hui)
2256 struct hsm_state_set *hss = NULL;
2257 struct iattr *attr = NULL;
2261 if (!S_ISREG(inode->i_mode))
2267 GOTO(out, rc = -ENOMEM);
/* set HSM state: archived copy exists, file data is released */
2269 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2270 hss->hss_archive_id = hui->hui_archive_id;
2271 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2272 rc = ll_hsm_state_set(inode, hss);
2276 OBD_ALLOC_PTR(attr);
2278 GOTO(out, rc = -ENOMEM);
2280 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2281 attr->ia_mode |= S_IFREG;
2282 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2283 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2284 attr->ia_size = hui->hui_size;
2285 attr->ia_mtime.tv_sec = hui->hui_mtime;
2286 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2287 attr->ia_atime.tv_sec = hui->hui_atime;
2288 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2290 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2291 ATTR_UID | ATTR_GID |
2292 ATTR_MTIME | ATTR_MTIME_SET |
2293 ATTR_ATIME | ATTR_ATIME_SET;
2295 rc = ll_setattr_raw(file->f_dentry, attr, true);
/*
 * Main ioctl dispatcher for regular Lustre files.  Handles file-flag
 * manipulation, striping (setstripe/getstripe/setea/swap_layouts),
 * object recreation, fiemap, group locks, FID/path queries, data
 * version, HSM state/action/import, and lease get/set.  Anything not
 * handled here falls through to the registered llioc handlers and
 * finally to the data export's obd_iocontrol().
 */
2310 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2312 struct inode *inode = file->f_dentry->d_inode;
2313 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2317 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2318 PFID(ll_inode2fid(inode)), inode, cmd);
2319 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2321 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2322 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2326 case LL_IOC_GETFLAGS:
2327 /* Get the current value of the file flags */
2328 return put_user(fd->fd_flags, (int *)arg);
2329 case LL_IOC_SETFLAGS:
2330 case LL_IOC_CLRFLAGS:
2331 /* Set or clear specific file flags */
2332 /* XXX This probably needs checks to ensure the flags are
2333 * not abused, and to handle any flag side effects.
2335 if (get_user(flags, (int *) arg))
2338 if (cmd == LL_IOC_SETFLAGS) {
/* lockless IO is only allowed on O_DIRECT files */
2339 if ((flags & LL_FILE_IGNORE_LOCK) &&
2340 !(file->f_flags & O_DIRECT)) {
2341 CERROR("%s: unable to disable locking on "
2342 "non-O_DIRECT file\n", current->comm);
2346 fd->fd_flags |= flags;
2348 fd->fd_flags &= ~flags;
2351 case LL_IOC_LOV_SETSTRIPE:
2352 RETURN(ll_lov_setstripe(inode, file, arg));
2353 case LL_IOC_LOV_SETEA:
2354 RETURN(ll_lov_setea(inode, file, arg));
2355 case LL_IOC_LOV_SWAP_LAYOUTS: {
2357 struct lustre_swap_layouts lsl;
2359 if (copy_from_user(&lsl, (char *)arg,
2360 sizeof(struct lustre_swap_layouts)))
/* both fds must be writable for a layout swap */
2363 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2366 file2 = fget(lsl.sl_fd);
2371 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2372 rc = ll_swap_layouts(file, file2, &lsl);
2376 case LL_IOC_LOV_GETSTRIPE:
2377 RETURN(ll_lov_getstripe(inode, arg));
2378 case LL_IOC_RECREATE_OBJ:
2379 RETURN(ll_lov_recreate_obj(inode, arg));
2380 case LL_IOC_RECREATE_FID:
2381 RETURN(ll_lov_recreate_fid(inode, arg));
2382 case FSFILT_IOC_FIEMAP:
2383 RETURN(ll_ioctl_fiemap(inode, arg));
2384 case FSFILT_IOC_GETFLAGS:
2385 case FSFILT_IOC_SETFLAGS:
2386 RETURN(ll_iocontrol(inode, file, cmd, arg));
2387 case FSFILT_IOC_GETVERSION_OLD:
2388 case FSFILT_IOC_GETVERSION:
2389 RETURN(put_user(inode->i_generation, (int *)arg));
2390 case LL_IOC_GROUP_LOCK:
2391 RETURN(ll_get_grouplock(inode, file, arg));
2392 case LL_IOC_GROUP_UNLOCK:
2393 RETURN(ll_put_grouplock(inode, file, arg));
2394 case IOC_OBD_STATFS:
2395 RETURN(ll_obd_statfs(inode, (void *)arg));
2397 /* We need to special case any other ioctls we want to handle,
2398 * to send them to the MDS/OST as appropriate and to properly
2399 * network encode the arg field.
2400 case FSFILT_IOC_SETVERSION_OLD:
2401 case FSFILT_IOC_SETVERSION:
2403 case LL_IOC_FLUSHCTX:
2404 RETURN(ll_flush_ctx(inode));
2405 case LL_IOC_PATH2FID: {
2406 if (copy_to_user((void *)arg, ll_inode2fid(inode),
2407 sizeof(struct lu_fid)))
2412 case OBD_IOC_FID2PATH:
2413 RETURN(ll_fid2path(inode, (void *)arg));
2414 case LL_IOC_DATA_VERSION: {
2415 struct ioc_data_version idv;
2418 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
/* only the defined flush flags are honoured */
2421 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2422 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2424 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2430 case LL_IOC_GET_MDTIDX: {
2433 mdtidx = ll_get_mdt_idx(inode);
2437 if (put_user((int)mdtidx, (int*)arg))
2442 case OBD_IOC_GETDTNAME:
2443 case OBD_IOC_GETMDNAME:
2444 RETURN(ll_get_obd_name(inode, cmd, arg));
2445 case LL_IOC_HSM_STATE_GET: {
2446 struct md_op_data *op_data;
2447 struct hsm_user_state *hus;
2454 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2455 LUSTRE_OPC_ANY, hus);
2456 if (IS_ERR(op_data)) {
2458 RETURN(PTR_ERR(op_data));
2461 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2464 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2467 ll_finish_md_op_data(op_data);
2471 case LL_IOC_HSM_STATE_SET: {
2472 struct hsm_state_set *hss;
2479 if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2484 rc = ll_hsm_state_set(inode, hss);
2489 case LL_IOC_HSM_ACTION: {
2490 struct md_op_data *op_data;
2491 struct hsm_current_action *hca;
2498 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2499 LUSTRE_OPC_ANY, hca);
2500 if (IS_ERR(op_data)) {
2502 RETURN(PTR_ERR(op_data));
2505 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2508 if (copy_to_user((char *)arg, hca, sizeof(*hca)))
2511 ll_finish_md_op_data(op_data);
2515 case LL_IOC_SET_LEASE: {
2516 struct ll_inode_info *lli = ll_i2info(inode);
2517 struct obd_client_handle *och = NULL;
/* the requested lease mode may not exceed the file's open mode */
2523 if (!(file->f_mode & FMODE_WRITE))
2528 if (!(file->f_mode & FMODE_READ))
/* lease-release branch: detach and close an existing lease */
2533 mutex_lock(&lli->lli_och_mutex);
2534 if (fd->fd_lease_och != NULL) {
2535 och = fd->fd_lease_och;
2536 fd->fd_lease_och = NULL;
2538 mutex_unlock(&lli->lli_och_mutex);
2541 mode = och->och_flags &(FMODE_READ|FMODE_WRITE);
2542 rc = ll_lease_close(och, inode, &lease_broken);
2543 if (rc == 0 && lease_broken)
2549 /* return the type of lease or error */
2550 RETURN(rc < 0 ? rc : (int)mode);
2555 CDEBUG(D_INODE, "Set lease with mode %d\n", mode);
2557 /* apply for lease */
2558 och = ll_lease_open(inode, file, mode, 0);
2560 RETURN(PTR_ERR(och));
2563 mutex_lock(&lli->lli_och_mutex);
2564 if (fd->fd_lease_och == NULL) {
2565 fd->fd_lease_och = och;
2568 mutex_unlock(&lli->lli_och_mutex);
2570 /* impossible now that only excl is supported for now */
2571 ll_lease_close(och, inode, &lease_broken);
2576 case LL_IOC_GET_LEASE: {
2577 struct ll_inode_info *lli = ll_i2info(inode);
2578 struct ldlm_lock *lock = NULL;
2581 mutex_lock(&lli->lli_och_mutex);
2582 if (fd->fd_lease_och != NULL) {
2583 struct obd_client_handle *och = fd->fd_lease_och;
2585 lock = ldlm_handle2lock(&och->och_lease_handle);
2587 lock_res_and_lock(lock);
/* a cancelled lease lock no longer counts as held */
2588 if (!ldlm_is_cancel(lock))
2589 rc = och->och_flags &
2590 (FMODE_READ | FMODE_WRITE);
2591 unlock_res_and_lock(lock);
2592 LDLM_LOCK_PUT(lock);
2595 mutex_unlock(&lli->lli_och_mutex);
2598 case LL_IOC_HSM_IMPORT: {
2599 struct hsm_user_import *hui;
2605 if (copy_from_user(hui, (void *)arg, sizeof(*hui))) {
2610 rc = ll_hsm_import(inode, file, hui);
/* unhandled commands: try registered handlers, then the OSC/LOV */
2620 ll_iocontrol_call(inode, file, cmd, arg, &err))
2623 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2629 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Commit a computed seek offset to file->f_pos after bounds checks:
 * negative offsets need FMODE_UNSIGNED_OFFSET, and offsets beyond
 * @maxsize are rejected.  f_version is reset when the position moves.
 * (Local fallback for kernels without generic_file_llseek_size().)
 */
2630 static inline loff_t
2631 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2633 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2635 if (offset > maxsize)
2638 if (offset != file->f_pos) {
2639 file->f_pos = offset;
2640 file->f_version = 0;
/*
 * Backported generic_file_llseek_size(): llseek supporting SEEK_SET/
 * CUR/END plus SEEK_HOLE/SEEK_DATA against a caller-supplied @eof,
 * bounded by @maxsize.  SEEK_CUR is handled specially (see comments
 * below) to avoid racing with concurrent f_pos updates.
 */
2646 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2647 loff_t maxsize, loff_t eof)
2649 struct inode *inode = file->f_dentry->d_inode;
2657 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2658 * position-querying operation. Avoid rewriting the "same"
2659 * f_pos value back to the file because a concurrent read(),
2660 * write() or lseek() might have altered it
2665 * f_lock protects against read/modify/write race with other
2666 * SEEK_CURs. Note that parallel writes and reads behave
2669 mutex_lock(&inode->i_mutex);
2670 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2671 mutex_unlock(&inode->i_mutex);
2675 * In the generic case the entire file is data, so as long as
2676 * offset isn't at the end of the file then the offset is data.
2683 * There is a virtual hole at the end of the file, so as long as
2684 * offset isn't i_size or larger, return i_size.
2692 return llseek_execute(file, offset, maxsize);
/*
 * llseek entry point for Lustre regular files.  For size-relative seeks
 * (SEEK_END/SEEK_HOLE/SEEK_DATA) it first refreshes the cluster-wide file
 * size with ll_glimpse_size() before delegating to the generic size-aware
 * llseek helper, capped at ll_file_maxbytes().
 */
2696 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2698 struct inode *inode = file->f_dentry->d_inode;
2699 loff_t retval, eof = 0;
/* retval here is only a provisional target position used for tracing. */
2702 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2703 (origin == SEEK_CUR) ? file->f_pos : 0);
2704 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2705 PFID(ll_inode2fid(inode)), inode, retval, retval,
2707 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
/* Size-dependent seeks need an up-to-date size from the OSTs (glimpse). */
2709 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2710 retval = ll_glimpse_size(inode);
2713 eof = i_size_read(inode);
2716 retval = ll_generic_file_llseek_size(file, offset, origin,
2717 ll_file_maxbytes(inode), eof);
/*
 * flush() handler (called on every close of a file descriptor).  Does not
 * force dirty pages out; it only harvests previously recorded async
 * writeback errors so close() can report them, unless the application was
 * already told about a write failure (fd_write_failed).
 */
2721 static int ll_flush(struct file *file, fl_owner_t id)
2723 struct inode *inode = file->f_dentry->d_inode;
2724 struct ll_inode_info *lli = ll_i2info(inode);
2725 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2728 LASSERT(!S_ISDIR(inode->i_mode));
2730 /* catch async errors that were recorded back when async writeback
2731 * failed for pages in this mapping. */
/* Consume the latched async error (read-and-clear semantics). */
2732 rc = lli->lli_async_rc;
2733 lli->lli_async_rc = 0;
2734 if (lli->lli_clob != NULL) {
2735 err = lov_read_and_clear_async_rc(lli->lli_clob);
2740 /* The application has been told write failure already.
2741 * Do not report failure again. */
2742 if (fd->fd_write_failed)
/* Any pending error is reported to userspace as EIO. */
2744 return rc ? -EIO : 0;
/**
 * Called to make sure a portion of file has been written out.
 * if @local_only is not true, it will send OST_SYNC RPCs to ost.
 *
 * Return how many pages have been written.
 */
2748 * Called to make sure a portion of file has been written out.
2749 * if @local_only is not true, it will send OST_SYNC RPCs to ost.
2751 * Return how many pages have been written.
2753 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2754 enum cl_fsync_mode mode, int ignore_layout)
2756 struct cl_env_nest nest;
2759 struct obd_capa *capa = NULL;
2760 struct cl_fsync_io *fio;
/* Reject any mode outside the four defined fsync modes. */
2764 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2765 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2768 env = cl_env_nested_get(&nest);
2770 RETURN(PTR_ERR(env));
/* OSS write capability is attached to the fsync io parameters below. */
2772 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2774 io = ccc_env_thread_io(env);
2775 io->ci_obj = cl_i2info(inode)->lli_clob;
2776 io->ci_ignore_layout = ignore_layout;
2778 /* initialize parameters for sync */
2779 fio = &io->u.ci_fsync;
2780 fio->fi_capa = capa;
2781 fio->fi_start = start;
2783 fio->fi_fid = ll_inode2fid(inode);
2784 fio->fi_mode = mode;
2785 fio->fi_nr_written = 0;
/* Run a CIT_FSYNC io; on success the result is the page count written. */
2787 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2788 result = cl_io_loop(env, io);
2790 result = io->ci_result;
2792 result = fio->fi_nr_written;
2793 cl_io_fini(env, io);
2794 cl_env_nested_put(&nest, env);
/*
 * fsync() handler, with per-kernel-version prototypes selected by the
 * HAVE_FILE_FSYNC_* configure checks.  Flushes page cache, harvests async
 * writeback errors, syncs metadata via md_fsync(), and for regular files
 * syncs data to the OSTs via cl_sync_file_range(), tracking fd_write_failed.
 */
2802 * When dentry is provided (the 'else' case), *file->f_dentry may be
2803 * null and dentry must be used directly rather than pulled from
2804 * *file->f_dentry as is done otherwise.
2807 #ifdef HAVE_FILE_FSYNC_4ARGS
2808 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2810 struct dentry *dentry = file->f_dentry;
2811 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2812 int ll_fsync(struct file *file, int datasync)
2814 struct dentry *dentry = file->f_dentry;
2816 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2819 struct inode *inode = dentry->d_inode;
2820 struct ll_inode_info *lli = ll_i2info(inode);
2821 struct ptlrpc_request *req;
2822 struct obd_capa *oc;
2826 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2827 PFID(ll_inode2fid(inode)), inode);
2828 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2830 #ifdef HAVE_FILE_FSYNC_4ARGS
/* 4-arg kernels: we must write/wait the range and take i_mutex ourselves. */
2831 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2832 mutex_lock(&inode->i_mutex);
2834 /* fsync's caller has already called _fdata{sync,write}, we want
2835 * that IO to finish before calling the osc and mdc sync methods */
2836 rc = filemap_fdatawait(inode->i_mapping);
2839 /* catch async errors that were recorded back when async writeback
2840 * failed for pages in this mapping. */
2841 if (!S_ISDIR(inode->i_mode)) {
2842 err = lli->lli_async_rc;
2843 lli->lli_async_rc = 0;
2846 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* Sync metadata on the MDT (capability released after the RPC). */
2851 oc = ll_mdscapa_get(inode);
2852 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2858 ptlrpc_req_finished(req);
2860 if (S_ISREG(inode->i_mode)) {
2861 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
/* Sync the whole file's data to the OSTs and record success/failure
 * on the fd so ll_flush() does not double-report errors. */
2863 err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
2865 if (rc == 0 && err < 0)
2868 fd->fd_write_failed = true;
2870 fd->fd_write_failed = false;
2873 #ifdef HAVE_FILE_FSYNC_4ARGS
2874 mutex_unlock(&inode->i_mutex);
/*
 * flock()/fcntl() lock handler.  Translates a kernel struct file_lock
 * (FL_FLOCK or FL_POSIX) into an LDLM_FLOCK enqueue on the MDT, then
 * mirrors the granted/released lock into the local kernel lock lists so
 * the VFS sees a consistent state.
 */
2880 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2882 struct inode *inode = file->f_dentry->d_inode;
2883 struct ll_sb_info *sbi = ll_i2sbi(inode);
2884 struct ldlm_enqueue_info einfo = {
2885 .ei_type = LDLM_FLOCK,
2886 .ei_cb_cp = ldlm_flock_completion_ast,
2887 .ei_cbdata = file_lock,
2889 struct md_op_data *op_data;
2890 struct lustre_handle lockh = {0};
2891 ldlm_policy_data_t flock = {{0}};
2897 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
2898 PFID(ll_inode2fid(inode)), file_lock);
2900 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2902 if (file_lock->fl_flags & FL_FLOCK) {
2903 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2904 /* flocks are whole-file locks */
2905 flock.l_flock.end = OFFSET_MAX;
2906 /* For flocks owner is determined by the local file desctiptor*/
2907 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2908 } else if (file_lock->fl_flags & FL_POSIX) {
2909 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2910 flock.l_flock.start = file_lock->fl_start;
2911 flock.l_flock.end = file_lock->fl_end;
2915 flock.l_flock.pid = file_lock->fl_pid;
2917 /* Somewhat ugly workaround for svc lockd.
2918 * lockd installs custom fl_lmops->lm_compare_owner that checks
2919 * for the fl_owner to be the same (which it always is on local node
2920 * I guess between lockd processes) and then compares pid.
2921 * As such we assign pid to the owner field to make it all work,
2922 * conflict with normal locks is unlikely since pid space and
2923 * pointer space for current->files are not intersecting */
2924 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2925 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* Map the fcntl lock type onto an LDLM lock mode:
 * F_RDLCK -> PR, F_WRLCK -> PW, F_UNLCK -> NL (see comment below). */
2927 switch (file_lock->fl_type) {
2929 einfo.ei_mode = LCK_PR;
2932 /* An unlock request may or may not have any relation to
2933 * existing locks so we may not be able to pass a lock handle
2934 * via a normal ldlm_lock_cancel() request. The request may even
2935 * unlock a byte range in the middle of an existing lock. In
2936 * order to process an unlock request we need all of the same
2937 * information that is given with a normal read or write record
2938 * lock request. To avoid creating another ldlm unlock (cancel)
2939 * message we'll treat a LCK_NL flock request as an unlock. */
2940 einfo.ei_mode = LCK_NL;
2943 einfo.ei_mode = LCK_PW;
2946 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2947 file_lock->fl_type);
/* Map the command: non-blocking set -> BLOCK_NOWAIT, GETLK -> TEST_LOCK.
 * (Several case labels are not visible in this excerpt.) */
2962 flags = LDLM_FL_BLOCK_NOWAIT;
2968 flags = LDLM_FL_TEST_LOCK;
2969 /* Save the old mode so that if the mode in the lock changes we
2970 * can decrement the appropriate reader or writer refcount. */
2971 file_lock->fl_type = einfo.ei_mode;
2974 CERROR("unknown fcntl lock command: %d\n", cmd);
2978 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2979 LUSTRE_OPC_ANY, NULL);
2980 if (IS_ERR(op_data))
2981 RETURN(PTR_ERR(op_data));
2983 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
2984 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
2985 flock.l_flock.pid, flags, einfo.ei_mode,
2986 flock.l_flock.start, flock.l_flock.end);
2988 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2989 op_data, &lockh, &flock, 0, NULL /* req */, flags);
/* Mirror the server result into the local kernel lock bookkeeping. */
2991 if ((file_lock->fl_flags & FL_FLOCK) &&
2992 (rc == 0 || file_lock->fl_type == F_UNLCK))
2993 rc2 = flock_lock_file_wait(file, file_lock);
2994 if ((file_lock->fl_flags & FL_POSIX) &&
2995 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2996 !(flags & LDLM_FL_TEST_LOCK))
2997 rc2 = posix_lock_file_wait(file, file_lock);
/* Local bookkeeping failed after a successful server lock: undo the
 * server-side lock by re-enqueueing it as LCK_NL (unlock). */
2999 if (rc2 && file_lock->fl_type != F_UNLCK) {
3000 einfo.ei_mode = LCK_NL;
3001 md_enqueue(sbi->ll_md_exp, &einfo, NULL,
3002 op_data, &lockh, &flock, 0, NULL /* req */, flags);
3006 ll_finish_md_op_data(op_data);
/*
 * Look up the FID of @name under @parent via an MDS getattr-by-name RPC.
 * On success *fid is filled from the reply body (the assignment line is
 * not visible in this excerpt).
 */
3011 int ll_get_fid_by_name(struct inode *parent, const char *name,
3012 int namelen, struct lu_fid *fid)
3014 struct md_op_data *op_data = NULL;
3015 struct mdt_body *body;
3016 struct ptlrpc_request *req;
3020 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3021 LUSTRE_OPC_ANY, NULL);
3022 if (IS_ERR(op_data))
3023 RETURN(PTR_ERR(op_data));
/* Only the FID is needed from the getattr reply. */
3025 op_data->op_valid = OBD_MD_FLID;
3026 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3027 ll_finish_md_op_data(op_data);
3031 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3033 GOTO(out_req, rc = -EFAULT);
3037 ptlrpc_req_finished(req);
/*
 * Migrate the entry @name under directory @parent to MDT @mdtidx.
 * Resolves the child's FID (from the dcache if possible, otherwise via
 * ll_get_fid_by_name()), skips the rename if the child already lives on
 * the target MDT, and otherwise issues a CLI_MIGRATE md_rename().
 */
3041 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3042 const char *name, int namelen)
3044 struct dentry *dchild = NULL;
3045 struct inode *child_inode = NULL;
3046 struct md_op_data *op_data;
3047 struct ptlrpc_request *request = NULL;
3052 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3053 name, PFID(ll_inode2fid(parent)), mdtidx);
3055 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3056 0, LUSTRE_OPC_ANY, NULL);
3057 if (IS_ERR(op_data))
3058 RETURN(PTR_ERR(op_data));
3060 /* Get child FID first */
3061 qstr.hash = full_name_hash(name, namelen);
/* Prefer the cached dentry: grab the inode and drop stale aliases so
 * the migrated entry is re-looked-up afterwards. */
3064 dchild = d_lookup(file->f_dentry, &qstr);
3065 if (dchild != NULL && dchild->d_inode != NULL) {
3066 op_data->op_fid3 = *ll_inode2fid(dchild->d_inode);
3067 if (dchild->d_inode != NULL) {
3068 child_inode = igrab(dchild->d_inode);
3069 ll_invalidate_aliases(child_inode);
/* Not in the dcache: ask the MDS for the FID by name. */
3073 rc = ll_get_fid_by_name(parent, name, namelen,
3079 if (!fid_is_sane(&op_data->op_fid3)) {
3080 CERROR("%s: migrate %s , but fid "DFID" is insane\n",
3081 ll_get_fsname(parent->i_sb, NULL, 0), name,
3082 PFID(&op_data->op_fid3));
3083 GOTO(out_free, rc = -EINVAL);
/* Nothing to do if the child is already on the requested MDT. */
3086 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3091 CDEBUG(D_INFO, "%s:"DFID" is already on MDT%d.\n", name,
3092 PFID(&op_data->op_fid3), mdtidx);
3093 GOTO(out_free, rc = 0);
/* Migration is implemented as a same-name rename with CLI_MIGRATE. */
3096 op_data->op_mds = mdtidx;
3097 op_data->op_cli_flags = CLI_MIGRATE;
3098 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3099 namelen, name, namelen, &request);
3101 ll_update_times(request, parent);
3103 ptlrpc_req_finished(request);
/* The old local inode no longer represents the entry; drop its nlink. */
3108 if (child_inode != NULL) {
3109 clear_nlink(child_inode);
3113 ll_finish_md_op_data(op_data);
3118 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
/**
 * test if some locks matching bits and l_req_mode are acquired
 * - bits can be in different locks
 * - if found clear the common lock bits in *bits
 * - the bits not found, are kept in *bits
 *
 * \param bits [IN] searched lock bits [IN]
 * \param l_req_mode [IN] searched lock mode
 * \retval boolean, true iff all bits are found
 */
3126 * test if some locks matching bits and l_req_mode are acquired
3127 * - bits can be in different locks
3128 * - if found clear the common lock bits in *bits
3129 * - the bits not found, are kept in *bits
3131 * \param bits [IN] searched lock bits [IN]
3132 * \param l_req_mode [IN] searched lock mode
3133 * \retval boolean, true iff all bits are found
3135 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
3137 struct lustre_handle lockh;
3138 ldlm_policy_data_t policy;
/* LCK_MINMODE means "any mode": match any of CR/CW/PR/PW. */
3139 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
3140 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
3149 fid = &ll_i2info(inode)->lli_fid;
3150 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3151 ldlm_lockname[mode]);
/* TEST_LOCK: match without taking a reference on the found lock. */
3153 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* Probe one inodebit at a time; stop early once all bits are matched. */
3154 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3155 policy.l_inodebits.bits = *bits & (1 << i);
3156 if (policy.l_inodebits.bits == 0)
3159 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3160 &policy, mode, &lockh)) {
3161 struct ldlm_lock *lock;
3163 lock = ldlm_handle2lock(&lockh);
3166 ~(lock->l_policy_data.l_inodebits.bits);
3167 LDLM_LOCK_PUT(lock);
3169 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match (and take a reference on) a granted MDS inodebits lock
 * covering @bits in one of the modes in @mode; the matched handle is
 * returned in @lockh.  Returns the matched mode, or 0 if none.
 */
3176 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
3177 struct lustre_handle *lockh, __u64 flags,
3180 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
3185 fid = &ll_i2info(inode)->lli_fid;
3186 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3188 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3189 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the result of an inode revalidation RPC.  -ENOENT on a
 * non-regular/non-directory inode is treated as "already unlinked" and
 * turned into success; other errors are logged (quietly for EACCES/EIDRM).
 */
3194 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3196 /* Already unlinked. Just update nlink and return success */
3197 if (rc == -ENOENT) {
3199 /* This path cannot be hit for regular files unless in
3200 * case of obscure races, so no need to to validate
3202 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3204 } else if (rc != 0) {
3205 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3206 "%s: revalidate FID "DFID" error: rc = %d\n",
3207 ll_get_fsname(inode->i_sb, NULL, 0),
3208 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate a dentry's inode attributes against the MDS for the lock
 * bits in @ibits.  With OBD_CONNECT_ATTRFID the revalidation is done via
 * an intent getattr-by-FID (md_intent_lock); otherwise, if no matching
 * MD lock is cached locally, a plain md_getattr() refreshes the inode
 * through ll_prep_inode().
 */
3214 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3216 struct inode *inode = dentry->d_inode;
3217 struct ptlrpc_request *req = NULL;
3218 struct obd_export *exp;
3222 LASSERT(inode != NULL);
3224 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3225 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3227 exp = ll_i2mdexp(inode);
3229 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3230 * But under CMD case, it caused some lock issues, should be fixed
3231 * with new CMD ibits lock. See bug 12718 */
3232 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3233 struct lookup_intent oit = { .it_op = IT_GETATTR };
3234 struct md_op_data *op_data;
/* Only the LOOKUP bit requested: a cheaper IT_LOOKUP intent suffices. */
3236 if (ibits == MDS_INODELOCK_LOOKUP)
3237 oit.it_op = IT_LOOKUP;
3239 /* Call getattr by fid, so do not provide name at all. */
3240 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3241 dentry->d_inode, NULL, 0, 0,
3242 LUSTRE_OPC_ANY, NULL);
3243 if (IS_ERR(op_data))
3244 RETURN(PTR_ERR(op_data));
3246 oit.it_create_mode |= M_CHECK_STALE;
3247 rc = md_intent_lock(exp, op_data, NULL, 0,
3248 /* we are not interested in name
3251 ll_md_blocking_ast, 0);
3252 ll_finish_md_op_data(op_data);
3253 oit.it_create_mode &= ~M_CHECK_STALE;
3255 rc = ll_inode_revalidate_fini(inode, rc);
3259 rc = ll_revalidate_it_finish(req, &oit, dentry);
3261 ll_intent_release(&oit);
3265 /* Unlinked? Unhash dentry, so it is not picked up later by
3266 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3267 here to preserve get_cwd functionality on 2.6.
3269 if (!dentry->d_inode->i_nlink)
3270 d_lustre_invalidate(dentry, 0);
3272 ll_lookup_finish_locks(&oit, dentry);
3273 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3274 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3275 obd_valid valid = OBD_MD_FLGETATTR;
3276 struct md_op_data *op_data;
/* Regular files also need striping (EA) data sized to the MDS default. */
3279 if (S_ISREG(inode->i_mode)) {
3280 rc = ll_get_default_mdsize(sbi, &ealen);
3283 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3286 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3287 0, ealen, LUSTRE_OPC_ANY,
3289 if (IS_ERR(op_data))
3290 RETURN(PTR_ERR(op_data));
3292 op_data->op_valid = valid;
3293 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
3294 * capa for this inode. Because we only keep capas of dirs
3296 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3297 ll_finish_md_op_data(op_data);
3299 rc = ll_inode_revalidate_fini(inode, rc);
3303 rc = ll_prep_inode(&inode, req, NULL, NULL);
3306 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge the per-stripe attributes from all MDTs
 * (via md_merge_attr) into the inode: aggregated size/nlink plus the
 * most recent a/m/ctime into lli_lvb.
 */
3310 static int ll_merge_md_attr(struct inode *inode)
3312 struct cl_attr attr = { 0 };
3315 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3316 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3321 ll_i2info(inode)->lli_stripe_dir_size = attr.cat_size;
3322 ll_i2info(inode)->lli_stripe_dir_nlink = attr.cat_nlink;
3324 ll_i2info(inode)->lli_lvb.lvb_atime = attr.cat_atime;
3325 ll_i2info(inode)->lli_lvb.lvb_mtime = attr.cat_mtime;
3326 ll_i2info(inode)->lli_lvb.lvb_ctime = attr.cat_ctime;
/*
 * Full revalidation wrapper: refresh MDS attributes via
 * __ll_inode_revalidate(), then refresh size — merged stripe attributes
 * for striped directories, glimpse for regular files (skipped during an
 * HSM restore, when the MDT already supplied the correct size).
 */
3332 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3334 struct inode *inode = dentry->d_inode;
3338 rc = __ll_inode_revalidate(dentry, ibits);
3342 /* if object isn't regular file, don't validate size */
3343 if (!S_ISREG(inode->i_mode)) {
3344 if (S_ISDIR(inode->i_mode) &&
3345 ll_i2info(inode)->lli_lsm_md != NULL) {
3346 rc = ll_merge_md_attr(inode);
/* Copy the (possibly merged) lvb timestamps into the inode. */
3351 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
3352 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
3353 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
3355 /* In case of restore, the MDT has the right size and has
3356 * already send it back without granting the layout lock,
3357 * inode is up-to-date so glimpse is useless.
3358 * Also to glimpse we need the layout, in case of a running
3359 * restore the MDT holds the layout lock so the glimpse will
3360 * block up to the end of restore (getattr will block)
3362 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3363 rc = ll_glimpse_size(inode);
/*
 * getattr() inode operation: revalidate UPDATE|LOOKUP bits, then fill
 * struct kstat from the (now fresh) inode.  Striped directories report
 * the merged nlink/size cached by ll_merge_md_attr().
 */
3368 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3370 struct inode *inode = de->d_inode;
3371 struct ll_sb_info *sbi = ll_i2sbi(inode);
3372 struct ll_inode_info *lli = ll_i2info(inode);
3375 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3376 MDS_INODELOCK_LOOKUP);
3377 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3382 stat->dev = inode->i_sb->s_dev;
/* 32-bit userspace needs an ino that fits in 32 bits. */
3383 if (ll_need_32bit_api(sbi))
3384 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3386 stat->ino = inode->i_ino;
3387 stat->mode = inode->i_mode;
3388 stat->uid = inode->i_uid;
3389 stat->gid = inode->i_gid;
3390 stat->rdev = inode->i_rdev;
3391 stat->atime = inode->i_atime;
3392 stat->mtime = inode->i_mtime;
3393 stat->ctime = inode->i_ctime;
3394 stat->blksize = 1 << inode->i_blkbits;
3395 stat->blocks = inode->i_blocks;
/* Striped dirs: use the attributes merged across all stripes. */
3397 if (S_ISDIR(inode->i_mode) &&
3398 ll_i2info(inode)->lli_lsm_md != NULL) {
3399 stat->nlink = lli->lli_stripe_dir_nlink;
3400 stat->size = lli->lli_stripe_dir_size;
3402 stat->nlink = inode->i_nlink;
3403 stat->size = i_size_read(inode);
/*
 * fiemap() inode operation: marshal the kernel's fiemap_extent_info into
 * a Lustre ll_user_fiemap, run ll_do_fiemap(), and copy the mapped
 * extents back.  The fiemap buffer includes space for all requested
 * extents.
 */
3409 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3410 __u64 start, __u64 len)
3414 struct ll_user_fiemap *fiemap;
3415 unsigned int extent_count = fieinfo->fi_extents_max;
3417 num_bytes = sizeof(*fiemap) + (extent_count *
3418 sizeof(struct ll_fiemap_extent));
3419 OBD_ALLOC_LARGE(fiemap, num_bytes);
3424 fiemap->fm_flags = fieinfo->fi_flags;
3425 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3426 fiemap->fm_start = start;
3427 fiemap->fm_length = len;
/* NOTE(review): only one extent is copied in here — presumably to seed
 * fm_extents[0] for FIEMAP continuation; confirm against ll_do_fiemap(). */
3428 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3429 sizeof(struct ll_fiemap_extent));
3431 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3433 fieinfo->fi_flags = fiemap->fm_flags;
3434 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3435 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3436 fiemap->fm_mapped_extents * sizeof(struct ll_fiemap_extent));
3438 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * get_acl() inode operation: return a referenced copy of the cached
 * POSIX ACL under lli_lock.  The VFS releases the reference (see the
 * comment below).
 */
3442 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3444 struct ll_inode_info *lli = ll_i2info(inode);
3445 struct posix_acl *acl = NULL;
3448 spin_lock(&lli->lli_lock);
3449 /* VFS' acl_permission_check->check_acl will release the refcount */
3450 acl = posix_acl_dup(lli->lli_posix_acl);
3451 spin_unlock(&lli->lli_lock);
/*
 * ACL-check callback for ll_generic_permission() on kernels without the
 * 2-arg generic_permission().  Refuses to run under RCU walk (cannot
 * block), then checks @mask against the cached POSIX access ACL.
 * Without CONFIG_FS_POSIX_ACL this compiles to a stub (return value not
 * visible in this excerpt).
 */
3456 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3458 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3459 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3461 ll_check_acl(struct inode *inode, int mask)
3464 # ifdef CONFIG_FS_POSIX_ACL
3465 struct posix_acl *acl;
3469 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3470 if (flags & IPERM_FLAG_RCU)
3473 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3478 rc = posix_acl_permission(inode, acl, mask);
3479 posix_acl_release(acl);
3482 # else /* !CONFIG_FS_POSIX_ACL */
3484 # endif /* CONFIG_FS_POSIX_ACL */
3486 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * permission() inode operation (prototype varies by kernel version).
 * Bails out of RCU-walk (may block), revalidates the root inode before
 * checking it, delegates to remote permission checking for RMT_CLIENT
 * mounts, and otherwise uses ll_generic_permission() with ll_check_acl.
 */
3488 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3489 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3491 # ifdef HAVE_INODE_PERMISION_2ARGS
3492 int ll_inode_permission(struct inode *inode, int mask)
3494 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
/* Cannot service permission checks under RCU walk — we may block. */
3501 #ifdef MAY_NOT_BLOCK
3502 if (mask & MAY_NOT_BLOCK)
3504 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3505 if (flags & IPERM_FLAG_RCU)
3509 /* as root inode are NOT getting validated in lookup operation,
3510 * need to do it before permission check. */
3512 if (inode == inode->i_sb->s_root->d_inode) {
3513 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3514 MDS_INODELOCK_LOOKUP);
3519 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3520 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
/* Remote-client mounts check permissions on the MDS side. */
3522 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
3523 return lustre_check_remote_perm(inode, mask);
3525 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
3526 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3531 /* -o localflock - only provides locally consistent flock locks */
/* Default file operations: no .flock/.lock entries, so flock/fcntl locks
 * fall back to the kernel's local implementation (-o localflock). */
3532 struct file_operations ll_file_operations = {
3533 .read = ll_file_read,
3534 .aio_read = ll_file_aio_read,
3535 .write = ll_file_write,
3536 .aio_write = ll_file_aio_write,
3537 .unlocked_ioctl = ll_file_ioctl,
3538 .open = ll_file_open,
3539 .release = ll_file_release,
3540 .mmap = ll_file_mmap,
3541 .llseek = ll_file_seek,
3542 .splice_read = ll_file_splice_read,
/* -o flock: cluster-coherent flock/fcntl locks via ll_file_flock(),
 * which enqueues LDLM_FLOCK locks on the MDT. */
3547 struct file_operations ll_file_operations_flock = {
3548 .read = ll_file_read,
3549 .aio_read = ll_file_aio_read,
3550 .write = ll_file_write,
3551 .aio_write = ll_file_aio_write,
3552 .unlocked_ioctl = ll_file_ioctl,
3553 .open = ll_file_open,
3554 .release = ll_file_release,
3555 .mmap = ll_file_mmap,
3556 .llseek = ll_file_seek,
3557 .splice_read = ll_file_splice_read,
3560 .flock = ll_file_flock,
3561 .lock = ll_file_flock
/* These are for -o noflock - to return ENOSYS on flock calls */
3564 /* These are for -o noflock - to return ENOSYS on flock calls */
3565 struct file_operations ll_file_operations_noflock = {
3566 .read = ll_file_read,
3567 .aio_read = ll_file_aio_read,
3568 .write = ll_file_write,
3569 .aio_write = ll_file_aio_write,
3570 .unlocked_ioctl = ll_file_ioctl,
3571 .open = ll_file_open,
3572 .release = ll_file_release,
3573 .mmap = ll_file_mmap,
3574 .llseek = ll_file_seek,
3575 .splice_read = ll_file_splice_read,
3578 .flock = ll_file_noflock,
3579 .lock = ll_file_noflock
/* Inode operations for regular files: attribute, xattr, permission,
 * fiemap and (when the kernel supports .get_acl) ACL handlers. */
3582 struct inode_operations ll_file_inode_operations = {
3583 .setattr = ll_setattr,
3584 .getattr = ll_getattr,
3585 .permission = ll_inode_permission,
3586 .setxattr = ll_setxattr,
3587 .getxattr = ll_getxattr,
3588 .listxattr = ll_listxattr,
3589 .removexattr = ll_removexattr,
3590 .fiemap = ll_fiemap,
3591 #ifdef HAVE_IOP_GET_ACL
3592 .get_acl = ll_get_acl,
/* dynamic ioctl number support routins */
3596 /* dynamic ioctl number support routins */
/* Global registry of dynamically registered ioctl handlers, protected by
 * ioc_sem (readers: dispatch; writers: register/unregister). */
3597 static struct llioc_ctl_data {
3598 struct rw_semaphore ioc_sem;
3599 cfs_list_t ioc_head;
3601 __RWSEM_INITIALIZER(llioc.ioc_sem),
3602 CFS_LIST_HEAD_INIT(llioc.ioc_head)
/* One registered handler: its callback plus the ioctl commands it serves
 * (iocd_cmd is a flexible-style trailing array of iocd_count entries). */
3607 cfs_list_t iocd_list;
3608 unsigned int iocd_size;
3609 llioc_callback_t iocd_cb;
3610 unsigned int iocd_count;
3611 unsigned int iocd_cmd[0];
/*
 * Register a dynamic ioctl handler @cb for the @count commands in @cmd.
 * Returns an opaque handle (the allocated llioc_data, used as the magic
 * for ll_iocontrol_unregister), or NULL on bad arguments / allocation
 * failure.
 */
3614 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3617 struct llioc_data *in_data = NULL;
3620 if (cb == NULL || cmd == NULL ||
3621 count > LLIOC_MAX_CMD || count < 0)
/* One allocation holds the header plus the trailing command array. */
3624 size = sizeof(*in_data) + count * sizeof(unsigned int);
3625 OBD_ALLOC(in_data, size);
3626 if (in_data == NULL)
3629 memset(in_data, 0, sizeof(*in_data));
3630 in_data->iocd_size = size;
3631 in_data->iocd_cb = cb;
3632 in_data->iocd_count = count;
3633 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3635 down_write(&llioc.ioc_sem);
3636 cfs_list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3637 up_write(&llioc.ioc_sem);
/*
 * Unregister a dynamic ioctl handler previously returned by
 * ll_iocontrol_register(); @magic is that handle.  Warns if the handle
 * is not found in the registry.
 */
3642 void ll_iocontrol_unregister(void *magic)
3644 struct llioc_data *tmp;
3649 down_write(&llioc.ioc_sem);
3650 cfs_list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* Size is saved before unlinking because tmp is freed after unlock. */
3652 unsigned int size = tmp->iocd_size;
3654 cfs_list_del(&tmp->iocd_list);
3655 up_write(&llioc.ioc_sem);
3657 OBD_FREE(tmp, size);
3661 up_write(&llioc.ioc_sem);
3663 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3666 EXPORT_SYMBOL(ll_iocontrol_register);
3667 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch an unrecognized ioctl @cmd to the dynamically registered
 * handlers.  Iterates the registry under a read lock; the first handler
 * claiming the command decides via its return value whether iteration
 * stops (LLIOC_STOP).  *rcp receives the handler's result.
 */
3669 static enum llioc_iter
3670 ll_iocontrol_call(struct inode *inode, struct file *file,
3671 unsigned int cmd, unsigned long arg, int *rcp)
3673 enum llioc_iter ret = LLIOC_CONT;
3674 struct llioc_data *data;
3675 int rc = -EINVAL, i;
3677 down_read(&llioc.ioc_sem);
3678 cfs_list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3679 for (i = 0; i < data->iocd_count; i++) {
3680 if (cmd != data->iocd_cmd[i])
3683 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3687 if (ret == LLIOC_STOP)
3690 up_read(&llioc.ioc_sem);
/*
 * Push a layout configuration into the cl_object for @inode.  For
 * OBJECT_CONF_SET, also allow lock matching on the layout lock (only
 * after the layout is applied, to avoid exposing a stale layout) and
 * record the new layout generation in the inode.
 */
3697 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3699 struct ll_inode_info *lli = ll_i2info(inode);
3700 struct cl_env_nest nest;
3705 if (lli->lli_clob == NULL)
3708 env = cl_env_nested_get(&nest);
3710 RETURN(PTR_ERR(env));
3712 result = cl_conf_set(env, lli->lli_clob, conf);
3713 cl_env_nested_put(&nest, env);
3715 if (conf->coc_opc == OBJECT_CONF_SET) {
3716 struct ldlm_lock *lock = conf->coc_lock;
3718 LASSERT(lock != NULL);
3719 LASSERT(ldlm_has_layout(lock));
3721 struct lustre_md *md = conf->u.coc_md;
/* No stripe metadata means an empty layout generation. */
3722 __u32 gen = LL_LAYOUT_GEN_EMPTY;
3724 /* it can only be allowed to match after layout is
3725 * applied to inode otherwise false layout would be
3726 * seen. Applying layout shoud happen before dropping
3727 * the intent lock. */
3728 ldlm_lock_allow_match(lock);
3730 lli->lli_has_smd = lsm_has_objects(md->lsm);
3731 if (md->lsm != NULL)
3732 gen = md->lsm->lsm_layout_gen;
3735 DFID ": layout version change: %u -> %u\n",
3736 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3738 ll_layout_version_set(lli, gen);
/* Fetch layout from MDT with getxattr request, if it's not ready yet */
3744 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
3745 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3748 struct ll_sb_info *sbi = ll_i2sbi(inode);
3749 struct obd_capa *oc;
3750 struct ptlrpc_request *req;
3751 struct mdt_body *body;
3758 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3759 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
3760 lock->l_lvb_data, lock->l_lvb_len);
/* Already have an LVB-delivered layout: nothing to fetch. */
3762 if ((lock->l_lvb_data != NULL) && ldlm_is_lvb_ready(lock))
3765 /* if layout lock was granted right away, the layout is returned
3766 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3767 * blocked and then granted via completion ast, we have to fetch
3768 * layout here. Please note that we can't use the LVB buffer in
3769 * completion AST because it doesn't have a large enough buffer */
3770 oc = ll_mdscapa_get(inode);
3771 rc = ll_get_default_mdsize(sbi, &lmmsize);
/* Fetch the LOV EA (the layout) by xattr name from the MDT. */
3773 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3774 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3780 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3782 GOTO(out, rc = -EPROTO);
3784 lmmsize = body->eadatasize;
3785 if (lmmsize == 0) /* empty layout */
3788 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3790 GOTO(out, rc = -EFAULT);
/* Copy the layout into a buffer that outlives the RPC and install it
 * as the lock's LVB, replacing any previous buffer. */
3792 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3793 if (lvbdata == NULL)
3794 GOTO(out, rc = -ENOMEM);
3796 memcpy(lvbdata, lmm, lmmsize);
3797 lock_res_and_lock(lock);
3798 if (lock->l_lvb_data != NULL)
3799 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3801 lock->l_lvb_data = lvbdata;
3802 lock->l_lvb_len = lmmsize;
3803 unlock_res_and_lock(lock);
3808 ptlrpc_req_finished(req);
/*
 * Apply the layout carried by the layout lock @lockh to @inode and return
 * the resulting layout generation in *gen.  Fetches the layout from the
 * MDT if the LVB is not ready, unpacks it, configures the cl_object, and
 * finally drops the lock reference.  If the reconfiguration returns
 * -EBUSY the function waits for in-flight IO (OBJECT_CONF_WAIT) before
 * the caller retries.
 */
3813 * Apply the layout to the inode. Layout lock is held and will be released
3816 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3817 struct inode *inode, __u32 *gen, bool reconf)
3819 struct ll_inode_info *lli = ll_i2info(inode);
3820 struct ll_sb_info *sbi = ll_i2sbi(inode);
3821 struct ldlm_lock *lock;
3822 struct lustre_md md = { NULL };
3823 struct cl_object_conf conf;
3826 bool wait_layout = false;
3829 LASSERT(lustre_handle_is_used(lockh));
3831 lock = ldlm_handle2lock(lockh);
3832 LASSERT(lock != NULL);
3833 LASSERT(ldlm_has_layout(lock));
3835 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured: %d\n",
3836 PFID(&lli->lli_fid), inode, reconf);
3838 /* in case this is a caching lock and reinstate with new inode */
3839 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3841 lock_res_and_lock(lock);
3842 lvb_ready = ldlm_is_lvb_ready(lock);
3843 unlock_res_and_lock(lock);
3844 /* checking lvb_ready is racy but this is okay. The worst case is
3845 * that multi processes may configure the file on the same time. */
3847 if (lvb_ready || !reconf) {
3850 /* layout_gen must be valid if layout lock is not
3851 * cancelled and stripe has already set */
3852 *gen = ll_layout_version_get(lli);
/* Make sure the layout is in the lock's LVB before unpacking. */
3858 rc = ll_layout_fetch(inode, lock);
3862 /* for layout lock, lmm is returned in lock's lvb.
3863 * lvb_data is immutable if the lock is held so it's safe to access it
3864 * without res lock. See the description in ldlm_lock_decref_internal()
3865 * for the condition to free lvb_data of layout lock */
3866 if (lock->l_lvb_data != NULL) {
3867 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3868 lock->l_lvb_data, lock->l_lvb_len);
3870 *gen = LL_LAYOUT_GEN_EMPTY;
3872 *gen = md.lsm->lsm_layout_gen;
3875 CERROR("%s: file "DFID" unpackmd error: %d\n",
3876 ll_get_fsname(inode->i_sb, NULL, 0),
3877 PFID(&lli->lli_fid), rc);
3883 /* set layout to file. Unlikely this will fail as old layout was
3884 * surely eliminated */
3885 memset(&conf, 0, sizeof conf);
3886 conf.coc_opc = OBJECT_CONF_SET;
3887 conf.coc_inode = inode;
3888 conf.coc_lock = lock;
3889 conf.u.coc_md = &md;
3890 rc = ll_layout_conf(inode, &conf);
3893 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3895 /* refresh layout failed, need to wait */
3896 wait_layout = rc == -EBUSY;
/* Done with the lock: drop our reference(s) before possibly waiting. */
3900 LDLM_LOCK_PUT(lock);
3901 ldlm_lock_decref(lockh, mode);
3903 /* wait for IO to complete if it's still being used. */
3905 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
3906 ll_get_fsname(inode->i_sb, NULL, 0),
3907 PFID(&lli->lli_fid), inode);
3909 memset(&conf, 0, sizeof conf);
3910 conf.coc_opc = OBJECT_CONF_WAIT;
3911 conf.coc_inode = inode;
3912 rc = ll_layout_conf(inode, &conf);
3916 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
3917 ll_get_fsname(inode->i_sb, NULL, 0),
3918 PFID(&lli->lli_fid), rc);
/**
 * This function checks if there exists a LAYOUT lock on the client side,
 * or enqueues it if it doesn't have one in cache.
 *
 * This function will not hold layout lock so it may be revoked any time after
 * this function returns. Any operations depend on layout should be redone.
 *
 * This function should be called before lov_io_init() to get an uptodate
 * layout version, the caller should save the version number and after IO
 * is finished, this function should be called again to verify that layout
 * is not changed during IO time.
 */
3924 * This function checks if there exists a LAYOUT lock on the client side,
3925 * or enqueues it if it doesn't have one in cache.
3927 * This function will not hold layout lock so it may be revoked any time after
3928 * this function returns. Any operations depend on layout should be redone
3931 * This function should be called before lov_io_init() to get an uptodate
3932 * layout version, the caller should save the version number and after IO
3933 * is finished, this function should be called again to verify that layout
3934 * is not changed during IO time.
3936 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3938 struct ll_inode_info *lli = ll_i2info(inode);
3939 struct ll_sb_info *sbi = ll_i2sbi(inode);
3940 struct md_op_data *op_data;
3941 struct lookup_intent it;
3942 struct lustre_handle lockh;
3944 struct ldlm_enqueue_info einfo = {
3945 .ei_type = LDLM_IBITS,
3947 .ei_cb_bl = ll_md_blocking_ast,
3948 .ei_cb_cp = ldlm_completion_ast,
3953 *gen = ll_layout_version_get(lli);
/* Fast path: layout locks disabled, or we already have a valid gen. */
3954 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
3958 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3959 LASSERT(S_ISREG(inode->i_mode));
3961 /* take layout lock mutex to enqueue layout lock exclusively. */
3962 mutex_lock(&lli->lli_layout_mutex);
3965 /* mostly layout lock is caching on the local side, so try to match
3966 * it before grabbing layout lock mutex. */
3967 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3968 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3969 if (mode != 0) { /* hit cached lock */
3970 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3974 mutex_unlock(&lli->lli_layout_mutex);
3978 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3979 0, 0, LUSTRE_OPC_ANY, NULL);
3980 if (IS_ERR(op_data)) {
3981 mutex_unlock(&lli->lli_layout_mutex);
3982 RETURN(PTR_ERR(op_data));
3985 /* have to enqueue one */
3986 memset(&it, 0, sizeof(it));
3987 it.it_op = IT_LAYOUT;
3988 lockh.cookie = 0ULL;
3990 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)\n",
3991 ll_get_fsname(inode->i_sb, NULL, 0),
3992 PFID(&lli->lli_fid), inode);
3994 rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
/* The enqueue reply request is not needed past this point. */
3996 if (it.d.lustre.it_data != NULL)
3997 ptlrpc_req_finished(it.d.lustre.it_data);
3998 it.d.lustre.it_data = NULL;
4000 ll_finish_md_op_data(op_data);
/* Transfer lock-mode ownership from the intent to our local variable
 * before dropping the intent's lock reference. */
4002 mode = it.d.lustre.it_lock_mode;
4003 it.d.lustre.it_lock_mode = 0;
4004 ll_intent_drop_lock(&it);
4007 /* set lock data in case this is a new lock */
4008 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4009 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
4013 mutex_unlock(&lli->lli_layout_mutex);
4019 * This function sends a restore request to the MDT
4021 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4023 struct hsm_user_request *hur;
4027 len = sizeof(struct hsm_user_request) +
4028 sizeof(struct hsm_user_item);
4029 OBD_ALLOC(hur, len);
4033 hur->hur_request.hr_action = HUA_RESTORE;
4034 hur->hur_request.hr_archive_id = 0;
4035 hur->hur_request.hr_flags = 0;
4036 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4037 sizeof(hur->hur_user_item[0].hui_fid));
4038 hur->hur_user_item[0].hui_extent.offset = offset;
4039 hur->hur_user_item[0].hui_extent.length = length;
4040 hur->hur_request.hr_itemcount = 1;
4041 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,