4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2013, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
50 #include <lustre_ioctl.h>
52 #include "cl_object.h"
55 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
57 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
60 static enum llioc_iter
61 ll_iocontrol_call(struct inode *inode, struct file *file,
62 unsigned int cmd, unsigned long arg, int *rcp);
/*
 * Allocate a per-open ll_file_data from its dedicated slab (GFP_NOFS to
 * avoid recursing into the filesystem under memory pressure) and reset
 * the write-failure flag.
 * NOTE(review): this extraction omits interleaved source lines (the
 * numeric prefixes are original line numbers); the NULL check after the
 * allocation and the return statement are not visible here.
 */
64 static struct ll_file_data *ll_file_data_get(void)
66 struct ll_file_data *fd;
68 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
72 fd->fd_write_failed = false;
/*
 * Release a ll_file_data previously obtained from ll_file_data_get()
 * back to the ll_file_data_slab cache.
 */
77 static void ll_file_data_put(struct ll_file_data *fd)
80 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Copy the inode's current attributes (mode, times, size, blocks, flags,
 * ioepoch) plus the given open handle @fh and the MDS capability into
 * @op_data, in preparation for an MDS close/setattr request.  If the
 * inode has locally modified data (LLIF_DATA_MODIFIED), also request the
 * MDS_DATA_MODIFIED bias so the server learns about it on close.
 */
83 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
84 struct lustre_handle *fh)
86 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
87 op_data->op_attr.ia_mode = inode->i_mode;
88 op_data->op_attr.ia_atime = inode->i_atime;
89 op_data->op_attr.ia_mtime = inode->i_mtime;
90 op_data->op_attr.ia_ctime = inode->i_ctime;
91 op_data->op_attr.ia_size = i_size_read(inode);
92 op_data->op_attr_blocks = inode->i_blocks;
/* ia_attr_flags lives in the Lustre-extended iattr; translate the VFS
 * inode flags to their on-the-wire (ext) representation. */
93 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
94 ll_inode_to_ext_flags(inode->i_flags);
95 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
97 op_data->op_handle = *fh;
98 op_data->op_capa1 = ll_mdscapa_get(inode);
100 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
101 op_data->op_bias |= MDS_DATA_MODIFIED;
/*
105 * Closes the IO epoch and packs all the attributes into @op_data for
 * the CLOSE rpc.  Mode/time attributes are always sent; size/blocks are
 * only sent when SOM (size-on-MDS) is not in effect or the inode is not
 * a regular file, since otherwise the MDS obtains them separately.
 * NOTE(review): lines are missing from this extraction; the early-return
 * path for non-FMODE_WRITE handles is not fully visible.
 */
108 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
109 struct obd_client_handle *och)
113 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
114 ATTR_MTIME | ATTR_MTIME_SET |
115 ATTR_CTIME | ATTR_CTIME_SET;
117 if (!(och->och_flags & FMODE_WRITE))
120 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
121 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* &och by reference: ll_ioepoch_close() may take over the handle when
 * the inode still has dirty pages (DONE_WRITING pending). */
123 ll_ioepoch_close(inode, op_data, &och, 0);
126 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
127 ll_prep_md_op_data(op_data, inode, NULL, NULL,
128 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Send a CLOSE rpc to the MDS for the open handle @och on @inode.
 * If @data_version is non-NULL this close is an HSM release: the data
 * version and lease handle are packed so the MDT can verify the file
 * was not modified before releasing it.  On success, clears the local
 * LLIF_DATA_MODIFIED flag if the server acknowledged it, destroys any
 * orphan OST objects listed in the close reply, and poisons the handle
 * cookie.  Presumably returns 0 or a negative errno — the return path
 * is not visible in this extraction; confirm against the full source.
 */
132 static int ll_close_inode_openhandle(struct obd_export *md_exp,
134 struct obd_client_handle *och,
135 const __u64 *data_version)
137 struct obd_export *exp = ll_i2mdexp(inode);
138 struct md_op_data *op_data;
139 struct ptlrpc_request *req = NULL;
140 struct obd_device *obd = class_exp2obd(exp);
147 * XXX: in case of LMV, is this correct to access
150 CERROR("Invalid MDC connection handle "LPX64"\n",
151 ll_i2mdexp(inode)->exp_handle.h_cookie);
155 OBD_ALLOC_PTR(op_data);
157 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
159 ll_prepare_close(inode, op_data, och);
160 if (data_version != NULL) {
161 /* Pass in data_version implies release. */
162 op_data->op_bias |= MDS_HSM_RELEASE;
163 op_data->op_data_version = *data_version;
164 op_data->op_lease_handle = och->och_lease_handle;
165 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* Remember whether ll_prepare_close() decided to close the IO epoch:
 * needed below to know if a Size-on-MDS update may be requested. */
167 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
168 rc = md_close(md_exp, op_data, och->och_mod, &req);
170 /* This close must have the epoch closed. */
171 LASSERT(epoch_close);
172 /* MDS has instructed us to obtain Size-on-MDS attribute from
173 * OSTs and send setattr to back to MDS. */
174 rc = ll_som_update(inode, op_data);
176 CERROR("%s: inode "DFID" mdc Size-on-MDS update"
177 " failed: rc = %d\n",
178 ll_i2mdexp(inode)->exp_obd->obd_name,
179 PFID(ll_inode2fid(inode)), rc);
183 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
184 ll_i2mdexp(inode)->exp_obd->obd_name,
185 PFID(ll_inode2fid(inode)), rc);
188 /* DATA_MODIFIED flag was successfully sent on close, cancel data
189 * modification flag. */
190 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
191 struct ll_inode_info *lli = ll_i2info(inode);
193 spin_lock(&lli->lli_lock);
194 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
195 spin_unlock(&lli->lli_lock);
/* The close reply may carry a list of OST objects to destroy
 * (e.g. after an open-unlinked file is finally closed). */
199 rc = ll_objects_destroy(req, inode);
201 CERROR("%s: inode "DFID
202 " ll_objects destroy: rc = %d\n",
203 ll_i2mdexp(inode)->exp_obd->obd_name,
204 PFID(ll_inode2fid(inode)), rc);
/* For HSM release, verify the server actually released the file. */
207 if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
208 struct mdt_body *body;
209 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
210 if (!(body->valid & OBD_MD_FLRELEASED))
214 ll_finish_md_op_data(op_data);
/* SOM with an unclosed epoch on a written regular file: queue the
 * deferred DONE_WRITING work instead of freeing @och now. */
218 if (exp_connect_som(exp) && !epoch_close &&
219 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
220 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
222 md_clear_open_replay_data(md_exp, och);
223 /* Free @och if it is not waiting for DONE_WRITING. */
224 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
227 if (req) /* This is close request */
228 ptlrpc_req_finished(req);
/*
 * Drop one reference on the per-mode (write/exec/read) cached MDS open
 * handle of @inode and, when the use count reaches zero, actually send
 * the CLOSE to the MDS via ll_close_inode_openhandle().
 * NOTE(review): lines are missing from this extraction — the use-count
 * decrement and the och detach under lli_och_mutex are not visible.
 */
232 int ll_md_real_close(struct inode *inode, fmode_t fmode)
234 struct ll_inode_info *lli = ll_i2info(inode);
235 struct obd_client_handle **och_p;
236 struct obd_client_handle *och;
/* Select the handle slot matching the open mode; exactly one of
 * FMODE_WRITE / FMODE_EXEC / FMODE_READ is expected. */
241 if (fmode & FMODE_WRITE) {
242 och_p = &lli->lli_mds_write_och;
243 och_usecount = &lli->lli_open_fd_write_count;
244 } else if (fmode & FMODE_EXEC) {
245 och_p = &lli->lli_mds_exec_och;
246 och_usecount = &lli->lli_open_fd_exec_count;
248 LASSERT(fmode & FMODE_READ);
249 och_p = &lli->lli_mds_read_och;
250 och_usecount = &lli->lli_open_fd_read_count;
253 mutex_lock(&lli->lli_och_mutex);
254 if (*och_usecount > 0) {
255 /* There are still users of this handle, so skip
257 mutex_unlock(&lli->lli_och_mutex);
263 mutex_unlock(&lli->lli_och_mutex);
266 /* There might be a race and this handle may already
268 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-file-descriptor close: releases group lock and lease if the
 * application left them behind, closes a private open handle (fd_och),
 * drops this fd's reference on the shared per-mode open handle, and —
 * unless a cached OPEN DLM lock lets us skip it — tells the MDS via
 * ll_md_real_close().  Finally frees the ll_file_data and the capa.
 */
275 static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
278 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
279 struct ll_inode_info *lli = ll_i2info(inode);
283 /* clear group lock, if present */
284 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
285 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
287 if (fd->fd_lease_och != NULL) {
290 /* Usually the lease is not released when the
291 * application crashed, we need to release here. */
292 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
293 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
294 PFID(&lli->lli_fid), rc, lease_broken);
296 fd->fd_lease_och = NULL;
/* fd_och is a handle private to this descriptor (lease conversion);
 * close it directly rather than through the shared per-mode slots. */
299 if (fd->fd_och != NULL) {
300 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
305 /* Let's see if we have good enough OPEN lock on the file and if
306 we can skip talking to MDS */
307 if (file->f_dentry->d_inode) { /* Can this ever be false? */
309 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
310 struct lustre_handle lockh;
311 struct inode *inode = file->f_dentry->d_inode;
312 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
314 mutex_lock(&lli->lli_och_mutex);
315 if (fd->fd_omode & FMODE_WRITE) {
317 LASSERT(lli->lli_open_fd_write_count);
318 lli->lli_open_fd_write_count--;
319 } else if (fd->fd_omode & FMODE_EXEC) {
321 LASSERT(lli->lli_open_fd_exec_count);
322 lli->lli_open_fd_exec_count--;
325 LASSERT(lli->lli_open_fd_read_count);
326 lli->lli_open_fd_read_count--;
328 mutex_unlock(&lli->lli_och_mutex);
/* LDLM_FL_TEST_LOCK: only probes for a matching granted OPEN
 * ibits lock; if none, the real close must go to the MDS. */
330 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
331 LDLM_IBITS, &policy, lockmode,
333 rc = ll_md_real_close(file->f_dentry->d_inode,
337 CERROR("Releasing a file %p with negative dentry %p. Name %s",
338 file, file->f_dentry, file->f_dentry->d_name.name);
342 LUSTRE_FPRIVATE(file) = NULL;
343 ll_file_data_put(fd);
344 ll_capa_close(inode);
349 /* While this returns an error code, fput() the caller does not, so we need
350 * to make every effort to clean up all of our state here. Also, applications
351 * rarely check close errors and even if an error is returned they will not
352 * re-try the close call.
 *
 * VFS ->release() entry point: stops statahead if this fd started it,
 * handles the remote-client ACL bookkeeping for the root inode, clears
 * async write errors on regular files, then performs the MD close.
 */
354 int ll_file_release(struct inode *inode, struct file *file)
356 struct ll_file_data *fd;
357 struct ll_sb_info *sbi = ll_i2sbi(inode);
358 struct ll_inode_info *lli = ll_i2info(inode);
362 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
363 PFID(ll_inode2fid(inode)), inode);
365 #ifdef CONFIG_FS_POSIX_ACL
/* Remote-client mode keeps per-pid ACL state rooted at the fs root;
 * drop this process's entries when the root fd is released. */
366 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
367 inode == inode->i_sb->s_root->d_inode) {
368 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
371 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
372 fd->fd_flags &= ~LL_FILE_RMTACL;
373 rct_del(&sbi->ll_rct, current_pid());
374 et_search_free(&sbi->ll_et, current_pid());
379 if (inode->i_sb->s_root != file->f_dentry)
380 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
381 fd = LUSTRE_FPRIVATE(file);
384 /* The last ref on @file, maybe not the the owner pid of statahead.
385 * Different processes can open the same dir, "ll_opendir_key" means:
386 * it is me that should stop the statahead thread. */
387 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
388 lli->lli_opendir_pid != 0)
389 ll_stop_statahead(inode, lli->lli_opendir_key);
/* The root dentry never has an MDS open handle; just free the fd. */
391 if (inode->i_sb->s_root == file->f_dentry) {
392 LUSTRE_FPRIVATE(file) = NULL;
393 ll_file_data_put(fd);
397 if (!S_ISDIR(inode->i_mode)) {
398 if (lli->lli_clob != NULL)
399 lov_read_and_clear_async_rc(lli->lli_clob);
400 lli->lli_async_rc = 0;
403 rc = ll_md_close(sbi->ll_md_exp, inode, file);
405 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
406 libcfs_debug_dumplog();
/*
 * Perform an IT_OPEN intent enqueue against the MDS for @file, used on
 * the paths where the dentry-open fast path did not supply a cached
 * intent (NFSD, re-open after revalidate lost the handle, or setstripe).
 * On success the reply is used to (re)initialise the inode and install
 * the intent lock.  @lmm/@lmmsize carry striping info when this is a
 * setstripe-style open.
 */
411 static int ll_intent_file_open(struct file *file, void *lmm,
412 int lmmsize, struct lookup_intent *itp)
414 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
415 struct dentry *parent = file->f_dentry->d_parent;
416 const char *name = file->f_dentry->d_name.name;
417 const int len = file->f_dentry->d_name.len;
418 struct md_op_data *op_data;
419 struct ptlrpc_request *req;
420 __u32 opc = LUSTRE_OPC_ANY;
427 /* Usually we come here only for NFSD, and we want open lock.
428 But we can also get here with pre 2.6.15 patchless kernels, and in
429 that case that lock is also ok */
430 /* We can also get here if there was cached open handle in revalidate_it
431 * but it disappeared while we were getting from there to ll_file_open.
432 * But this means this file was closed and immediately opened which
433 * makes a good candidate for using OPEN lock */
434 /* If lmmsize & lmm are not 0, we are just setting stripe info
435 * parameters. No need for the open lock */
436 if (lmm == NULL && lmmsize == 0) {
437 itp->it_flags |= MDS_OPEN_LOCK;
438 if (itp->it_flags & FMODE_WRITE)
439 opc = LUSTRE_OPC_CREATE;
442 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
443 file->f_dentry->d_inode, name, len,
446 RETURN(PTR_ERR(op_data));
448 itp->it_flags |= MDS_OPEN_BY_FID;
449 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
450 0 /*unused */, &req, ll_md_blocking_ast, 0);
451 ll_finish_md_op_data(op_data);
453 /* reason for keep own exit path - don`t flood log
454 * with messages with -ESTALE errors.
 */
456 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
457 it_open_error(DISP_OPEN_OPEN, itp))
459 ll_release_openhandle(file->f_dentry, itp);
463 if (it_disposition(itp, DISP_LOOKUP_NEG))
464 GOTO(out, rc = -ENOENT);
466 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
467 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
468 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Refresh the inode from the reply and, if a lock was granted,
 * attach the lock data to the inode for later matching. */
472 rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
473 if (!rc && itp->d.lustre.it_lock_mode)
474 ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
478 ptlrpc_req_finished(req);
479 ll_intent_drop_lock(itp);
/*
485 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
486 * not believe attributes if a few ioepoch holders exist. Attributes for
487 * previous ioepoch if new one is opened are also skipped by MDS.
 */
489 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
491 if (ioepoch && lli->lli_ioepoch != ioepoch) {
492 lli->lli_ioepoch = ioepoch;
493 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
494 ioepoch, PFID(&lli->lli_fid));
/*
 * Fill an obd_client_handle from the MDT_BODY in the intent's reply:
 * server open handle, FID, lease lock cookie and open flags, then
 * register the open for replay so it survives MDS recovery.
 * Returns the md_set_open_replay_data() result.
 */
498 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
499 struct obd_client_handle *och)
501 struct ptlrpc_request *req = it->d.lustre.it_data;
502 struct mdt_body *body;
504 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
505 och->och_fh = body->handle;
506 och->och_fid = body->fid1;
507 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
508 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
509 och->och_flags = it->it_flags;
511 return md_set_open_replay_data(md_exp, och, it);
/*
 * Finish the client-local part of an open: optionally fill @och from the
 * intent reply (when this fd creates the shared per-mode handle), record
 * the reply's ioepoch, install @fd as file->private_data, initialise
 * readahead state and remember the effective open mode.
 */
514 static int ll_local_open(struct file *file, struct lookup_intent *it,
515 struct ll_file_data *fd, struct obd_client_handle *och)
517 struct inode *inode = file->f_dentry->d_inode;
518 struct ll_inode_info *lli = ll_i2info(inode);
521 LASSERT(!LUSTRE_FPRIVATE(file));
526 struct ptlrpc_request *req = it->d.lustre.it_data;
527 struct mdt_body *body;
530 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
534 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
535 ll_ioepoch_open(lli, body->ioepoch);
538 LUSTRE_FPRIVATE(file) = fd;
539 ll_readahead_init(inode, &fd->fd_ras);
540 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
545 /* Open a file, and (for the very first open) create objects on the OSTs at
546 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
547 * creation or open until ll_lov_setstripe() ioctl is called.
549 * If we already have the stripe MD locally then we don't request it in
550 * md_open(), by passing a lmm_size = 0.
552 * It is up to the application to ensure no other processes open this file
553 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
554 * used. We might be able to avoid races of that sort by getting lli_open_sem
555 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
556 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 *
 * VFS ->open() entry point.  Reuses a cached per-mode MDS open handle
 * when one exists; otherwise performs (or consumes the dentry-open's)
 * intent open under lli_och_mutex.  Also arms directory statahead for
 * the first opener of a directory.
 */
558 int ll_file_open(struct inode *inode, struct file *file)
560 struct ll_inode_info *lli = ll_i2info(inode);
561 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
562 .it_flags = file->f_flags };
563 struct obd_client_handle **och_p = NULL;
564 __u64 *och_usecount = NULL;
565 struct ll_file_data *fd;
566 int rc = 0, opendir_set = 0;
569 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
570 PFID(ll_inode2fid(inode)), inode, file->f_flags);
572 it = file->private_data; /* XXX: compat macro */
573 file->private_data = NULL; /* prevent ll_local_open assertion */
575 fd = ll_file_data_get();
577 GOTO(out_openerr, rc = -ENOMEM);
/* First opener of a directory claims the statahead slot; fd doubles
 * as the "opendir key" identifying who must stop the thread later. */
580 if (S_ISDIR(inode->i_mode)) {
581 spin_lock(&lli->lli_sa_lock);
582 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
583 lli->lli_opendir_pid == 0) {
584 lli->lli_opendir_key = fd;
585 lli->lli_opendir_pid = current_pid();
588 spin_unlock(&lli->lli_sa_lock);
591 if (inode->i_sb->s_root == file->f_dentry) {
592 LUSTRE_FPRIVATE(file) = fd;
/* No intent from dentry_open (e.g. NFSD path): build one locally. */
596 if (!it || !it->d.lustre.it_disposition) {
597 /* Convert f_flags into access mode. We cannot use file->f_mode,
598 * because everything but O_ACCMODE mask was stripped from
 * it (comment truncated in this extraction) */
600 if ((oit.it_flags + 1) & O_ACCMODE)
602 if (file->f_flags & O_TRUNC)
603 oit.it_flags |= FMODE_WRITE;
605 /* kernel only call f_op->open in dentry_open. filp_open calls
606 * dentry_open after call to open_namei that checks permissions.
607 * Only nfsd_open call dentry_open directly without checking
608 * permissions and because of that this code below is safe. */
609 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
610 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
612 /* We do not want O_EXCL here, presumably we opened the file
613 * already? XXX - NFS implications? */
614 oit.it_flags &= ~O_EXCL;
616 /* bug20584, if "it_flags" contains O_CREAT, the file will be
617 * created if necessary, then "IT_CREAT" should be set to keep
618 * consistent with it */
619 if (oit.it_flags & O_CREAT)
620 oit.it_op |= IT_CREAT;
626 /* Let's see if we have file open on MDS already. */
627 if (it->it_flags & FMODE_WRITE) {
628 och_p = &lli->lli_mds_write_och;
629 och_usecount = &lli->lli_open_fd_write_count;
630 } else if (it->it_flags & FMODE_EXEC) {
631 och_p = &lli->lli_mds_exec_och;
632 och_usecount = &lli->lli_open_fd_exec_count;
634 och_p = &lli->lli_mds_read_och;
635 och_usecount = &lli->lli_open_fd_read_count;
638 mutex_lock(&lli->lli_och_mutex);
639 if (*och_p) { /* Open handle is present */
640 if (it_disposition(it, DISP_OPEN_OPEN)) {
641 /* Well, there's extra open request that we do not need,
642 let's close it somehow. This will decref request. */
643 rc = it_open_error(DISP_OPEN_OPEN, it);
645 mutex_unlock(&lli->lli_och_mutex);
646 GOTO(out_openerr, rc);
649 ll_release_openhandle(file->f_dentry, it);
653 rc = ll_local_open(file, it, fd, NULL);
656 mutex_unlock(&lli->lli_och_mutex);
657 GOTO(out_openerr, rc);
660 LASSERT(*och_usecount == 0);
661 if (!it->d.lustre.it_disposition) {
662 /* We cannot just request lock handle now, new ELC code
663 means that one of other OPEN locks for this file
664 could be cancelled, and since blocking ast handler
665 would attempt to grab och_mutex as well, that would
666 result in a deadlock */
667 mutex_unlock(&lli->lli_och_mutex);
668 it->it_create_mode |= M_CHECK_STALE;
669 rc = ll_intent_file_open(file, NULL, 0, it);
670 it->it_create_mode &= ~M_CHECK_STALE;
672 GOTO(out_openerr, rc);
676 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
678 GOTO(out_och_free, rc = -ENOMEM);
682 /* md_intent_lock() didn't get a request ref if there was an
683 * open error, so don't do cleanup on the request here
 */
685 /* XXX (green): Should not we bail out on any error here, not
686 * just open error? */
687 rc = it_open_error(DISP_OPEN_OPEN, it);
689 GOTO(out_och_free, rc);
691 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
692 "inode %p: disposition %x, status %d\n", inode,
693 it_disposition(it, ~0), it->d.lustre.it_status);
695 rc = ll_local_open(file, it, fd, *och_p);
697 GOTO(out_och_free, rc);
699 mutex_unlock(&lli->lli_och_mutex);
702 /* Must do this outside lli_och_mutex lock to prevent deadlock where
703 different kind of OPEN lock for this same inode gets cancelled
704 by ldlm_cancel_lru */
705 if (!S_ISREG(inode->i_mode))
706 GOTO(out_och_free, rc);
/* O_LOV_DELAY_CREATE (or read-only open of an object-less file):
 * postpone OST object creation until setstripe/first write. */
710 if (!lli->lli_has_smd &&
711 (cl_is_lov_delay_create(file->f_flags) ||
712 (file->f_mode & FMODE_WRITE) == 0)) {
713 CDEBUG(D_INODE, "object creation was delayed\n");
714 GOTO(out_och_free, rc);
716 cl_lov_delay_create_clear(&file->f_flags);
717 GOTO(out_och_free, rc);
/* Error/exit paths: free the unused handle slot, undo statahead
 * registration, free fd, drop the extra intent request reference. */
721 if (och_p && *och_p) {
722 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
723 *och_p = NULL; /* OBD_FREE writes some magic there */
726 mutex_unlock(&lli->lli_och_mutex);
729 if (opendir_set != 0)
730 ll_stop_statahead(inode, lli->lli_opendir_key);
732 ll_file_data_put(fd);
734 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
737 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
738 ptlrpc_req_finished(it->d.lustre.it_data);
739 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for lease DLM locks: on a blocking callback, cancel the
 * lease lock asynchronously (this is what "breaks" the lease); the
 * CANCELING branch body is not visible in this extraction.
 */
745 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
746 struct ldlm_lock_desc *desc, void *data, int flag)
749 struct lustre_handle lockh;
753 case LDLM_CB_BLOCKING:
754 ldlm_lock2handle(lock, &lockh);
755 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
757 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
761 case LDLM_CB_CANCELING:
/*
769 * Acquire a lease and open the file.
 *
 * @fmode must be exactly FMODE_READ or FMODE_WRITE.  When @file is
 * given, the existing openhandle of that descriptor is passed to the
 * MDT (op_handle = old_handle) so the lease attaches to the same open;
 * this requires the caller to be the sole opener in that mode.
 * Returns the new obd_client_handle on success or an ERR_PTR.  On
 * failure after the open succeeded, the open lock is cancelled and the
 * handle closed again (out_close path).
 */
771 static struct obd_client_handle *
772 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
775 struct lookup_intent it = { .it_op = IT_OPEN };
776 struct ll_sb_info *sbi = ll_i2sbi(inode);
777 struct md_op_data *op_data;
778 struct ptlrpc_request *req;
779 struct lustre_handle old_handle = { 0 };
780 struct obd_client_handle *och = NULL;
785 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
786 RETURN(ERR_PTR(-EINVAL));
789 struct ll_inode_info *lli = ll_i2info(inode);
790 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
791 struct obd_client_handle **och_p;
/* The fd must already be open in the requested mode, and exec
 * opens cannot take leases. */
794 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
795 RETURN(ERR_PTR(-EPERM));
797 /* Get the openhandle of the file */
799 mutex_lock(&lli->lli_och_mutex);
800 if (fd->fd_lease_och != NULL) {
801 mutex_unlock(&lli->lli_och_mutex);
805 if (fd->fd_och == NULL) {
806 if (file->f_mode & FMODE_WRITE) {
807 LASSERT(lli->lli_mds_write_och != NULL);
808 och_p = &lli->lli_mds_write_och;
809 och_usecount = &lli->lli_open_fd_write_count;
811 LASSERT(lli->lli_mds_read_och != NULL);
812 och_p = &lli->lli_mds_read_och;
813 och_usecount = &lli->lli_open_fd_read_count;
815 if (*och_usecount == 1) {
822 mutex_unlock(&lli->lli_och_mutex);
823 if (rc < 0) /* more than 1 opener */
826 LASSERT(fd->fd_och != NULL);
827 old_handle = fd->fd_och->och_fh;
832 RETURN(ERR_PTR(-ENOMEM));
834 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
835 LUSTRE_OPC_ANY, NULL);
837 GOTO(out, rc = PTR_ERR(op_data));
839 /* To tell the MDT this openhandle is from the same owner */
840 op_data->op_handle = old_handle;
842 it.it_flags = fmode | open_flags;
843 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
844 rc = md_intent_lock(sbi->ll_md_exp, op_data, NULL, 0, &it, 0, &req,
845 ll_md_blocking_lease_ast,
846 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
847 * it can be cancelled which may mislead applications that the lease is
 * still valid.
849 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
850 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
851 * doesn't deal with openhandle, so normal openhandle will be leaked. */
852 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
853 ll_finish_md_op_data(op_data);
854 ptlrpc_req_finished(req);
856 GOTO(out_release_it, rc);
858 if (it_disposition(&it, DISP_LOOKUP_NEG))
859 GOTO(out_release_it, rc = -ENOENT);
861 rc = it_open_error(DISP_OPEN_OPEN, &it);
863 GOTO(out_release_it, rc);
865 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
866 ll_och_fill(sbi->ll_md_exp, &it, och);
868 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
869 GOTO(out_close, rc = -EOPNOTSUPP);
871 /* already get lease, handle lease lock */
872 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
873 if (it.d.lustre.it_lock_mode == 0 ||
874 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
875 /* open lock must return for lease */
876 CERROR(DFID "lease granted but no open lock, %d/"LPU64".\n",
877 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
878 it.d.lustre.it_lock_bits);
879 GOTO(out_close, rc = -EPROTO);
882 ll_intent_release(&it);
886 /* Cancel open lock */
887 if (it.d.lustre.it_lock_mode != 0) {
888 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
889 it.d.lustre.it_lock_mode);
890 it.d.lustre.it_lock_mode = 0;
891 och->och_lease_handle.cookie = 0ULL;
893 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
895 CERROR("%s: error closing file "DFID": %d\n",
896 ll_get_fsname(inode->i_sb, NULL, 0),
897 PFID(&ll_i2info(inode)->lli_fid), rc2);
898 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
900 ll_intent_release(&it);
/*
908 * Release lease and close the file.
909 * It will check if the lease has ever broken.
 *
 * "Broken" means the lease DLM lock was already cancelled (e.g. by a
 * conflicting open); otherwise the lock is cancelled here.  The result
 * is reported through *@lease_broken when non-NULL, then the open
 * handle is closed on the MDS.
 */
911 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
914 struct ldlm_lock *lock;
915 bool cancelled = true;
919 lock = ldlm_handle2lock(&och->och_lease_handle);
921 lock_res_and_lock(lock);
922 cancelled = ldlm_is_cancel(lock);
923 unlock_res_and_lock(lock);
927 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
928 PFID(&ll_i2info(inode)->lli_fid), cancelled);
931 ldlm_cli_cancel(&och->och_lease_handle, 0);
932 if (lease_broken != NULL)
933 *lease_broken = cancelled;
935 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
940 /* Fills the obdo with the attributes for the lsm */
/*
 * Issues an async getattr over all stripes of @lsm via a ptlrpc set and
 * waits for completion.  @dv_flags (LL_DV_RD_FLUSH / LL_DV_WR_FLUSH)
 * request server-side lock flushing so the returned data version is
 * stable; with WR_FLUSH the reply must confirm OBD_FL_FLUSH or the
 * (not fully visible) tail returns an error.
 */
941 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
942 struct obd_capa *capa, struct obdo *obdo,
943 __u64 ioepoch, int dv_flags)
945 struct ptlrpc_request_set *set;
946 struct obd_info oinfo = { { { 0 } } };
951 LASSERT(lsm != NULL);
955 oinfo.oi_oa->o_oi = lsm->lsm_oi;
956 oinfo.oi_oa->o_mode = S_IFREG;
957 oinfo.oi_oa->o_ioepoch = ioepoch;
958 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
959 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
960 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
961 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
962 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
963 OBD_MD_FLDATAVERSION;
964 oinfo.oi_capa = capa;
965 if (dv_flags & (LL_DV_WR_FLUSH | LL_DV_RD_FLUSH)) {
966 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
967 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
968 if (dv_flags & LL_DV_WR_FLUSH)
969 oinfo.oi_oa->o_flags |= OBD_FL_FLUSH;
972 set = ptlrpc_prep_set();
974 CERROR("can't allocate ptlrpc set\n");
977 rc = obd_getattr_async(exp, &oinfo, set);
979 rc = ptlrpc_set_wait(set);
980 ptlrpc_set_destroy(set);
/* Keep only the attribute bits callers of this helper consume. */
983 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
984 OBD_MD_FLATIME | OBD_MD_FLMTIME |
985 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
986 OBD_MD_FLDATAVERSION | OBD_MD_FLFLAGS);
987 if (dv_flags & LL_DV_WR_FLUSH &&
988 !(oinfo.oi_oa->o_valid & OBD_MD_FLFLAGS &&
989 oinfo.oi_oa->o_flags & OBD_FL_FLUSH))
/*
996 * Performs the getattr on the inode and updates its fields.
997 * If @sync != 0, perform the getattr under the server-side lock.
 */
999 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
1000 __u64 ioepoch, int sync)
1002 struct obd_capa *capa = ll_mdscapa_get(inode);
1003 struct lov_stripe_md *lsm;
1007 lsm = ccc_inode_lsm_get(inode);
1008 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
1009 capa, obdo, ioepoch, sync ? LL_DV_RD_FLUSH : 0);
1012 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
/* On success, refresh the VFS inode from the aggregated obdo. */
1014 obdo_refresh_inode(inode, obdo, obdo->o_valid);
1015 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
1016 " blksize %lu\n", POSTID(oi), i_size_read(inode),
1017 (unsigned long long)inode->i_blocks,
1018 (unsigned long)ll_inode_blksize(inode));
1020 ccc_inode_lsm_put(inode, lsm);
/*
 * Merge MDS-provided timestamps (cached in lli_lvb) with OST-side
 * attributes from the cl_object layer, under the inode size lock.
 * The newest of each timestamp wins; size and blocks come from the
 * cl_attr.  Updates i_size (via cl_isize_write_nolock), i_blocks and
 * the inode timestamps.
 */
1024 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
1026 struct ll_inode_info *lli = ll_i2info(inode);
1027 struct cl_object *obj = lli->lli_clob;
1028 struct cl_attr *attr = ccc_env_thread_attr(env);
1034 ll_inode_size_lock(inode);
1035 /* merge timestamps the most recently obtained from mds with
1036 timestamps obtained from osts */
1037 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
1038 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
1039 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
1040 inode_init_lvb(inode, &lvb);
1042 cl_object_attr_lock(obj);
1043 rc = cl_object_attr_get(env, obj, attr);
1044 cl_object_attr_unlock(obj);
/* Take the maximum of the MDS-derived and OST-derived timestamps. */
1047 if (lvb.lvb_atime < attr->cat_atime)
1048 lvb.lvb_atime = attr->cat_atime;
1049 if (lvb.lvb_ctime < attr->cat_ctime)
1050 lvb.lvb_ctime = attr->cat_ctime;
1051 if (lvb.lvb_mtime < attr->cat_mtime)
1052 lvb.lvb_mtime = attr->cat_mtime;
1054 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
1055 PFID(&lli->lli_fid), attr->cat_size);
1056 cl_isize_write_nolock(inode, attr->cat_size);
1058 inode->i_blocks = attr->cat_blocks;
1060 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1061 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1062 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1064 ll_inode_size_unlock(inode);
/*
 * Glimpse helper for ioctl paths: fetch OST attributes for @lsm into a
 * temporary obdo and copy size/blocks/times into the caller's stat
 * buffer @st.  Error handling around the getattr call is not visible
 * in this extraction.
 */
1069 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1072 struct obdo obdo = { 0 };
1075 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1077 st->st_size = obdo.o_size;
1078 st->st_blocks = obdo.o_blocks;
1079 st->st_mtime = obdo.o_mtime;
1080 st->st_atime = obdo.o_atime;
1081 st->st_ctime = obdo.o_ctime;
/*
 * Decide whether reads through @file should skip atime updates, by the
 * same criteria the VFS uses (O_NOATIME, S_NOATIME, mount flags,
 * nodiratime on directories).  The `return true` lines between the
 * checks are omitted by this extraction.
 */
1086 static bool file_is_noatime(const struct file *file)
1088 const struct vfsmount *mnt = file->f_path.mnt;
1089 const struct inode *inode = file->f_path.dentry->d_inode;
1091 /* Adapted from file_accessed() and touch_atime().*/
1092 if (file->f_flags & O_NOATIME)
1095 if (inode->i_flags & S_NOATIME)
1098 if (IS_NOATIME(inode))
1101 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1104 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1107 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/*
 * Initialise a cl_io for a read (@write == 0) or write (@write != 0) on
 * @file: propagate O_NONBLOCK/O_APPEND/O_SYNC/O_DIRECT into the io
 * descriptor, choose the lock mode (never for nolock files, mandatory
 * for append, maybe otherwise) and the noatime policy.
 */
1113 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
1115 struct inode *inode = file->f_dentry->d_inode;
1117 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1119 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1120 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1121 file->f_flags & O_DIRECT ||
1124 io->ci_obj = ll_i2info(inode)->lli_clob;
1125 io->ci_lockreq = CILR_MAYBE;
1126 if (ll_file_nolock(file)) {
1127 io->ci_lockreq = CILR_NEVER;
1128 io->ci_no_srvlock = 1;
1129 } else if (file->f_flags & O_APPEND) {
1130 io->ci_lockreq = CILR_MANDATORY;
1133 io->ci_noatime = file_is_noatime(file);
/*
 * Generic read/write engine shared by the aio/splice entry points.
 * Sets up a cl_io for @iot (CIT_READ/CIT_WRITE) at *@ppos for @count
 * bytes, wires in the iovec or splice arguments from @args, runs the
 * cl_io loop (serialised against truncate via lli_trunc_sem, and
 * against concurrent writes via lli_write_mutex for non-group-locked
 * writes), advances *@ppos and tallies read/write byte statistics.
 * A restart with partial progress returns the short count instead of
 * restarting.
 */
1137 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1138 struct file *file, enum cl_io_type iot,
1139 loff_t *ppos, size_t count)
1141 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
1142 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1147 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zd\n",
1148 file->f_dentry->d_name.name, iot, *ppos, count);
1151 io = ccc_env_thread_io(env);
1152 ll_io_init(io, file, iot == CIT_WRITE);
1154 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1155 struct vvp_io *vio = vvp_env_io(env);
1156 struct ccc_io *cio = ccc_env_io(env);
1157 int write_mutex_locked = 0;
1159 cio->cui_fd = LUSTRE_FPRIVATE(file);
1160 vio->cui_io_subtype = args->via_io_subtype;
1162 switch (vio->cui_io_subtype) {
1164 cio->cui_iov = args->u.normal.via_iov;
1165 cio->cui_nrsegs = args->u.normal.via_nrsegs;
1166 cio->cui_tot_nrsegs = cio->cui_nrsegs;
1167 cio->cui_iocb = args->u.normal.via_iocb;
/* Non-group-locked writes are serialised per inode;
 * interruptible so a signal aborts cleanly. */
1168 if ((iot == CIT_WRITE) &&
1169 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1170 if (mutex_lock_interruptible(&lli->
1172 GOTO(out, result = -ERESTARTSYS);
1173 write_mutex_locked = 1;
1175 down_read(&lli->lli_trunc_sem);
1178 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1179 vio->u.splice.cui_flags = args->u.splice.via_flags;
1182 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
1185 result = cl_io_loop(env, io);
1186 if (args->via_io_subtype == IO_NORMAL)
1187 up_read(&lli->lli_trunc_sem);
1188 if (write_mutex_locked)
1189 mutex_unlock(&lli->lli_write_mutex);
1191 /* cl_io_rw_init() handled IO */
1192 result = io->ci_result;
1195 if (io->ci_nob > 0) {
1196 result = io->ci_nob;
1197 *ppos = io->u.ci_wr.wr.crw_pos;
1201 cl_io_fini(env, io);
1202 /* If any bit been read/written (result != 0), we just return
1203 * short read/write instead of restart io. */
1204 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1205 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
1206 iot == CIT_READ ? "read" : "write",
1207 file->f_dentry->d_name.name, *ppos, count);
1208 LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
1212 if (iot == CIT_READ) {
1214 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1215 LPROC_LL_READ_BYTES, result);
1216 } else if (iot == CIT_WRITE) {
1218 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1219 LPROC_LL_WRITE_BYTES, result);
1220 fd->fd_write_failed = false;
1221 } else if (result != -ERESTARTSYS) {
1222 fd->fd_write_failed = true;
1225 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1232 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate a user-space iovec array and accumulate the total transfer
 * size into *count (per the comment above, copied from the kernel's
 * __generic_file_aio_write_nolock()).
 *
 * NOTE(review): listing is elided here; the access_ok() failure
 * handling and the function's return are not visible.
 */
1234 static int ll_file_get_iov_count(const struct iovec *iov,
1235 unsigned long *nr_segs, size_t *count)
1240 for (seg = 0; seg < *nr_segs; seg++) {
1241 const struct iovec *iv = &iov[seg];
1244 * If any segment has a negative length, or the cumulative
1245 * length ever wraps negative then return -EINVAL.
1248 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1250 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1255 cnt -= iv->iov_len; /* This segment is no good */
/*
 * AIO read entry point: validate the iovec, package it into
 * vvp_io_args and dispatch to ll_file_io_generic() as CIT_READ.
 */
1262 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1263 unsigned long nr_segs, loff_t pos)
1266 struct vvp_io_args *args;
1272 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1276 env = cl_env_get(&refcheck);
1278 RETURN(PTR_ERR(env));
1280 args = vvp_env_args(env, IO_NORMAL);
/* cast drops const: via_iov stores a mutable iovec pointer */
1281 args->u.normal.via_iov = (struct iovec *)iov;
1282 args->u.normal.via_nrsegs = nr_segs;
1283 args->u.normal.via_iocb = iocb;
1285 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1286 &iocb->ki_pos, count);
1287 cl_env_put(env, &refcheck);
/*
 * Synchronous read(2) path: build a one-segment iovec and a sync kiocb
 * in per-thread vvp storage, then reuse the AIO read path and copy the
 * resulting position back to *ppos.
 */
1291 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1295 struct iovec *local_iov;
1296 struct kiocb *kiocb;
1301 env = cl_env_get(&refcheck);
1303 RETURN(PTR_ERR(env));
1305 local_iov = &vvp_env_info(env)->vti_local_iov;
1306 kiocb = &vvp_env_info(env)->vti_kiocb;
1307 local_iov->iov_base = (void __user *)buf;
1308 local_iov->iov_len = count;
1309 init_sync_kiocb(kiocb, file);
1310 kiocb->ki_pos = *ppos;
/* the kiocb byte-count field name differs across kernel versions */
1311 #ifdef HAVE_KIOCB_KI_LEFT
1312 kiocb->ki_left = count;
1314 kiocb->ki_nbytes = count;
1317 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1318 *ppos = kiocb->ki_pos;
1320 cl_env_put(env, &refcheck);
1325 * Write to a file (through the page cache).
/*
 * AIO write entry point: mirror of ll_file_aio_read() with CIT_WRITE.
 */
1328 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1329 unsigned long nr_segs, loff_t pos)
1332 struct vvp_io_args *args;
1338 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1342 env = cl_env_get(&refcheck);
1344 RETURN(PTR_ERR(env));
1346 args = vvp_env_args(env, IO_NORMAL);
/* cast drops const: via_iov stores a mutable iovec pointer */
1347 args->u.normal.via_iov = (struct iovec *)iov;
1348 args->u.normal.via_nrsegs = nr_segs;
1349 args->u.normal.via_iocb = iocb;
1351 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1352 &iocb->ki_pos, count);
1353 cl_env_put(env, &refcheck);
/*
 * Synchronous write(2) path: one-segment iovec plus sync kiocb, then
 * reuse the AIO write path; mirrors ll_file_read().
 */
1357 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1361 struct iovec *local_iov;
1362 struct kiocb *kiocb;
1367 env = cl_env_get(&refcheck);
1369 RETURN(PTR_ERR(env));
1371 local_iov = &vvp_env_info(env)->vti_local_iov;
1372 kiocb = &vvp_env_info(env)->vti_kiocb;
1373 local_iov->iov_base = (void __user *)buf;
1374 local_iov->iov_len = count;
1375 init_sync_kiocb(kiocb, file);
1376 kiocb->ki_pos = *ppos;
/* the kiocb byte-count field name differs across kernel versions */
1377 #ifdef HAVE_KIOCB_KI_LEFT
1378 kiocb->ki_left = count;
1380 kiocb->ki_nbytes = count;
1383 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1384 *ppos = kiocb->ki_pos;
1386 cl_env_put(env, &refcheck);
1391 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read entry point: package the pipe and flags into
 * IO_SPLICE vvp_io_args and dispatch as CIT_READ.
 */
1393 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1394 struct pipe_inode_info *pipe, size_t count,
1398 struct vvp_io_args *args;
1403 env = cl_env_get(&refcheck);
1405 RETURN(PTR_ERR(env));
1407 args = vvp_env_args(env, IO_SPLICE);
1408 args->u.splice.via_pipe = pipe;
1409 args->u.splice.via_flags = flags;
1411 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1412 cl_env_put(env, &refcheck);
/*
 * Re-create the OST objects for @inode: clone the in-memory stripe MD,
 * mark the obdo with OBD_FL_RECREATE_OBJS and issue obd_create() under
 * the inode size lock.  Fails with -ENOENT if the file has no objects.
 *
 * NOTE(review): the obdo allocation and error checks are elided in
 * this listing.
 */
1416 static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
1419 struct obd_export *exp = ll_i2dtexp(inode);
1420 struct obd_trans_info oti = { 0 };
1421 struct obdo *oa = NULL;
1424 struct lov_stripe_md *lsm = NULL, *lsm2;
1431 lsm = ccc_inode_lsm_get(inode);
1432 if (!lsm_has_objects(lsm))
1433 GOTO(out, rc = -ENOENT);
/* stripe MD is followed by one lov_oinfo per stripe */
1435 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1436 (lsm->lsm_stripe_count));
1438 OBD_ALLOC_LARGE(lsm2, lsm_size);
1440 GOTO(out, rc = -ENOMEM);
/* o_nlink carries the target OST index for the recreate request */
1443 oa->o_nlink = ost_idx;
1444 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1445 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1446 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1447 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1448 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1449 memcpy(lsm2, lsm, lsm_size);
1450 ll_inode_size_lock(inode);
1451 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1452 ll_inode_size_unlock(inode);
1454 OBD_FREE_LARGE(lsm2, lsm_size);
1457 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_RECREATE_OBJ handler (admin-only): copy the user request and
 * rebuild an MDT0-sequence ost_id from the supplied object id.
 */
1462 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1464 struct ll_recreate_obj ucreat;
1468 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1471 if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1475 ostid_set_seq_mdt0(&oi);
1476 ostid_set_id(&oi, ucreat.lrc_id);
1477 RETURN(ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx));
/*
 * LL_IOC_RECREATE_FID handler (admin-only): derive the ost_id from a
 * user-supplied FID; the OST index is taken from bits 16..31 of the
 * FID sequence.
 */
1480 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1487 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1490 if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1493 fid_to_ostid(&fid, &oi);
1494 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1495 RETURN(ll_lov_recreate(inode, &oi, ost_idx));
/*
 * Set the striping EA on @inode by re-opening it with an IT_OPEN
 * intent that carries @lum; refuses with -EEXIST when a layout is
 * already instantiated.  The transient open handle created for the
 * setstripe is released immediately.
 */
1498 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1499 __u64 flags, struct lov_user_md *lum,
1502 struct lov_stripe_md *lsm = NULL;
1503 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1507 lsm = ccc_inode_lsm_get(inode);
1509 ccc_inode_lsm_put(inode, lsm);
1510 CDEBUG(D_IOCTL, "stripe already exists for inode "DFID"\n",
1511 PFID(ll_inode2fid(inode)));
1512 GOTO(out, rc = -EEXIST);
1515 ll_inode_size_lock(inode);
1516 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1518 GOTO(out_unlock, rc);
1519 rc = oit.d.lustre.it_status;
1521 GOTO(out_req_free, rc);
/* drop the MDS open handle that carried the setstripe intent */
1523 ll_release_openhandle(file->f_dentry, &oit);
1526 ll_inode_size_unlock(inode);
1527 ll_intent_release(&oit);
1528 ccc_inode_lsm_put(inode, lsm);
1530 cl_lov_delay_create_clear(&file->f_flags);
1533 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA for @filename (looked up relative to @inode) from
 * the MDS via md_getattr_name().  On success *lmmp points into the
 * reply buffer of *request (caller must keep/free the request) and
 * *lmm_size is the EA size.  On big-endian clients the EA is swabbed
 * to host order before being handed to userspace.
 */
1537 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1538 struct lov_mds_md **lmmp, int *lmm_size,
1539 struct ptlrpc_request **request)
1541 struct ll_sb_info *sbi = ll_i2sbi(inode);
1542 struct mdt_body *body;
1543 struct lov_mds_md *lmm = NULL;
1544 struct ptlrpc_request *req = NULL;
1545 struct md_op_data *op_data;
1548 rc = ll_get_default_mdsize(sbi, &lmmsize);
1552 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1553 strlen(filename), lmmsize,
1554 LUSTRE_OPC_ANY, NULL);
1555 if (IS_ERR(op_data))
1556 RETURN(PTR_ERR(op_data));
1558 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1559 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1560 ll_finish_md_op_data(op_data);
1562 CDEBUG(D_INFO, "md_getattr_name failed "
1563 "on %s: rc %d\n", filename, rc);
1567 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1568 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1570 lmmsize = body->eadatasize;
/* no EA present: nothing to return */
1572 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1574 GOTO(out, rc = -ENODATA);
1577 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1578 LASSERT(lmm != NULL);
1580 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1581 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1582 GOTO(out, rc = -EPROTO);
1586 * This is coming from the MDS, so is probably in
1587 * little endian. We convert it to host endian before
1588 * passing it to userspace.
1590 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1593 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1594 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1597 /* if function called for directory - we should
1598 * avoid swab not existent lsm objects */
1599 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1600 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1601 if (S_ISREG(body->mode))
1602 lustre_swab_lov_user_md_objects(
1603 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1605 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1606 lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1607 if (S_ISREG(body->mode))
1608 lustre_swab_lov_user_md_objects(
1609 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1616 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler (admin-only): copy a lov_user_md (v1 plus a
 * single lov_user_ost_data) from userspace and apply it via
 * ll_lov_setstripe_ea_info() with MDS_OPEN_HAS_OBJS.
 */
1621 static int ll_lov_setea(struct inode *inode, struct file *file,
1624 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1625 struct lov_user_md *lump;
1626 int lum_size = sizeof(struct lov_user_md) +
1627 sizeof(struct lov_user_ost_data);
1631 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1634 OBD_ALLOC_LARGE(lump, lum_size);
1638 if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1639 OBD_FREE_LARGE(lump, lum_size);
1643 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1645 OBD_FREE_LARGE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy the user's lov_user_md (v1 first,
 * then the larger v3 if the magic says so), set the stripe EA, then
 * refresh the layout and echo the resulting striping back to the
 * caller's buffer via LL_IOC_LOV_GETSTRIPE.
 */
1652 struct lov_user_md_v3 lumv3;
1653 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1654 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1655 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1657 __u64 flags = FMODE_WRITE;
1660 /* first try with v1 which is smaller than v3 */
1661 lum_size = sizeof(struct lov_user_md_v1);
1662 if (copy_from_user(lumv1, lumv1p, lum_size))
1665 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1666 lum_size = sizeof(struct lov_user_md_v3);
1667 if (copy_from_user(&lumv3, lumv3p, lum_size))
1671 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1673 struct lov_stripe_md *lsm;
/* tell userspace the default striping was used */
1676 put_user(0, &lumv1p->lmm_stripe_count);
1678 ll_layout_refresh(inode, &gen);
1679 lsm = ccc_inode_lsm_get(inode);
1680 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1681 0, lsm, (void *)arg);
1682 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_LOV_GETSTRIPE handler: hand the current stripe MD to the LOV
 * layer, which copies the striping information out to userspace.
 */
1687 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1689 struct lov_stripe_md *lsm;
1693 lsm = ccc_inode_lsm_get(inode);
1695 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1697 ccc_inode_lsm_put(inode, lsm);
/*
 * Take a group (GID) lock on the file.  fd_flags/fd_grouplock are
 * protected by lli_lock; since cl_get_grouplock() is called outside
 * the spinlock, a losing racer drops its freshly acquired lock.
 */
1702 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1704 struct ll_inode_info *lli = ll_i2info(inode);
1705 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1706 struct ccc_grouplock grouplock;
1710 if (ll_file_nolock(file))
1711 RETURN(-EOPNOTSUPP);
1713 spin_lock(&lli->lli_lock);
/* only one group lock per file descriptor */
1714 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1715 CWARN("group lock already existed with gid %lu\n",
1716 fd->fd_grouplock.cg_gid);
1717 spin_unlock(&lli->lli_lock);
1720 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1721 spin_unlock(&lli->lli_lock);
/* may block unless O_NONBLOCK was given at open time */
1723 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1724 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1728 spin_lock(&lli->lli_lock);
1729 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1730 spin_unlock(&lli->lli_lock);
1731 CERROR("another thread just won the race\n");
1732 cl_put_grouplock(&grouplock);
1736 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1737 fd->fd_grouplock = grouplock;
1738 spin_unlock(&lli->lli_lock);
1740 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * Release the group lock held on @file, verifying under lli_lock that
 * one is held and that its GID matches @arg; the actual lock release
 * happens outside the spinlock.
 */
1744 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1746 struct ll_inode_info *lli = ll_i2info(inode);
1747 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1748 struct ccc_grouplock grouplock;
1751 spin_lock(&lli->lli_lock);
1752 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1753 spin_unlock(&lli->lli_lock);
1754 CWARN("no group lock held\n");
1757 LASSERT(fd->fd_grouplock.cg_lock != NULL);
1759 if (fd->fd_grouplock.cg_gid != arg) {
1760 CWARN("group lock %lu doesn't match current id %lu\n",
1761 arg, fd->fd_grouplock.cg_gid);
1762 spin_unlock(&lli->lli_lock);
/* detach state under the lock, release the DLM lock after */
1766 grouplock = fd->fd_grouplock;
1767 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1768 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1769 spin_unlock(&lli->lli_lock);
1771 cl_put_grouplock(&grouplock);
1772 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1777 * Close inode open handle
1779 * \param dentry [in] dentry which contains the inode
1780 * \param it [in,out] intent which contains open info and result
1783 * \retval <0 failure
/*
 * Closes the MDS open handle carried in @it (DISP_OPEN_OPEN); no-op
 * for the filesystem root or when no open handle exists.
 */
1785 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1787 struct inode *inode = dentry->d_inode;
1788 struct obd_client_handle *och;
1794 /* Root ? Do nothing. */
1795 if (dentry->d_inode->i_sb->s_root == dentry)
1798 /* No open handle to close? Move away */
1799 if (!it_disposition(it, DISP_OPEN_OPEN))
1802 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1804 OBD_ALLOC(och, sizeof(*och));
1806 GOTO(out, rc = -ENOMEM);
1808 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1810 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1813 /* this one is in place of ll_file_open */
1814 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1815 ptlrpc_req_finished(it->d.lustre.it_data);
1816 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1822 * Get size for inode for which FIEMAP mapping is requested.
1823 * Make the FIEMAP get_info call and returns the result.
1825 static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1828 struct obd_export *exp = ll_i2dtexp(inode);
1829 struct lov_stripe_md *lsm = NULL;
1830 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1831 __u32 vallen = num_bytes;
1835 /* Checks for fiemap flags */
/* report the unsupported flags back to the caller */
1836 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1837 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1841 /* Check for FIEMAP_FLAG_SYNC */
1842 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1843 rc = filemap_fdatawrite(inode->i_mapping);
1848 lsm = ccc_inode_lsm_get(inode);
1852 /* If the stripe_count > 1 and the application does not understand
1853 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1855 if (lsm->lsm_stripe_count > 1 &&
1856 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1857 GOTO(out, rc = -EOPNOTSUPP);
1859 fm_key.oa.o_oi = lsm->lsm_oi;
1860 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1862 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1863 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1864 /* If filesize is 0, then there would be no objects for mapping */
1865 if (fm_key.oa.o_size == 0) {
1866 fiemap->fm_mapped_extents = 0;
1870 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
/* the OSC/LOV layer fills @fiemap (via @vallen-sized buffer) */
1872 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1875 CERROR("obd_get_info failed: rc = %d\n", rc);
1878 ccc_inode_lsm_put(inode, lsm);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a path through the MDC.
 * Permitted for CAP_DAC_READ_SEARCH or when the mount allows user
 * fid2path.  The output buffer is sized from the user's gf_pathlen.
 */
1882 int ll_fid2path(struct inode *inode, void *arg)
1884 struct obd_export *exp = ll_i2mdexp(inode);
1885 struct getinfo_fid2path *gfout, *gfin;
1889 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1890 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1893 /* Need to get the buflen */
1894 OBD_ALLOC_PTR(gfin);
1897 if (copy_from_user(gfin, arg, sizeof(*gfin))) {
1902 outsize = sizeof(*gfout) + gfin->gf_pathlen;
1903 OBD_ALLOC(gfout, outsize);
1904 if (gfout == NULL) {
1908 memcpy(gfout, gfin, sizeof(*gfout));
1911 /* Call mdc_iocontrol */
1912 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1916 if (copy_to_user(arg, gfout, outsize))
1920 OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP handler: size the kernel fiemap buffer from the
 * user's fm_extent_count, copy the request (and the first extent, used
 * for continuation), run ll_do_fiemap() and copy the result back.
 */
1924 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1926 struct ll_user_fiemap *fiemap_s;
1927 size_t num_bytes, ret_bytes;
1928 unsigned int extent_count;
1931 /* Get the extent count so we can calculate the size of
1932 * required fiemap buffer */
1933 if (get_user(extent_count,
1934 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1936 num_bytes = sizeof(*fiemap_s) + (extent_count *
1937 sizeof(struct ll_fiemap_extent));
1939 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1940 if (fiemap_s == NULL)
1943 /* get the fiemap value */
1944 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1946 GOTO(error, rc = -EFAULT);
1948 /* If fm_extent_count is non-zero, read the first extent since
1949 * it is used to calculate end_offset and device from previous
1952 if (copy_from_user(&fiemap_s->fm_extents[0],
1953 (char __user *)arg + sizeof(*fiemap_s),
1954 sizeof(struct ll_fiemap_extent)))
1955 GOTO(error, rc = -EFAULT);
1958 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
/* copy back header plus however many extents were mapped */
1962 ret_bytes = sizeof(struct ll_user_fiemap);
1964 if (extent_count != 0)
1965 ret_bytes += (fiemap_s->fm_mapped_extents *
1966 sizeof(struct ll_fiemap_extent));
1968 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1972 OBD_FREE_LARGE(fiemap_s, num_bytes);
1977 * Read the data_version for inode.
1979 * This value is computed using stripe object version on OST.
1980 * Version is computed using server side locking.
1982 * @param sync if do sync on the OST side;
1984 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1985 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
1987 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
1989 struct lov_stripe_md *lsm = NULL;
1990 struct ll_sb_info *sbi = ll_i2sbi(inode);
1991 struct obdo *obdo = NULL;
1995 /* If no stripe, we consider version is 0. */
1996 lsm = ccc_inode_lsm_get(inode);
1997 if (!lsm_has_objects(lsm)) {
1999 CDEBUG(D_INODE, "No object for inode\n");
2003 OBD_ALLOC_PTR(obdo);
2005 GOTO(out, rc = -ENOMEM);
/* OST-side getattr; OBD_MD_FLDATAVERSION must come back valid */
2007 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, flags);
2009 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
2012 *data_version = obdo->o_data_version;
2018 ccc_inode_lsm_put(inode, lsm);
2023 * Trigger a HSM release request for the provided inode.
2025 int ll_hsm_release(struct inode *inode)
2027 struct cl_env_nest nest;
2029 struct obd_client_handle *och = NULL;
2030 __u64 data_version = 0;
2034 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2035 ll_get_fsname(inode->i_sb, NULL, 0),
2036 PFID(&ll_i2info(inode)->lli_fid));
/* take a write lease so no other opener races the release */
2038 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2040 GOTO(out, rc = PTR_ERR(och));
2042 /* Grab latest data_version and [am]time values */
2043 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2047 env = cl_env_nested_get(&nest);
2049 GOTO(out, rc = PTR_ERR(env));
2051 ll_merge_lvb(env, inode);
2052 cl_env_nested_put(&nest, env);
2054 /* Release the file.
2055 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2056 * we still need it to pack l_remote_handle to MDT. */
2057 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
2063 if (och != NULL && !IS_ERR(och)) /* close the file */
2064 ll_lease_close(och, inode, NULL);
/* Scratch state for ll_swap_layouts(): both inodes, saved [am]times
 * for restore, and which data versions must be verified. */
2069 struct ll_swap_stack {
2070 struct iattr ia1, ia2;
2072 struct inode *inode1, *inode2;
2073 bool check_dv1, check_dv2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS back end: atomically swap the layouts of
 * @file1 and @file2 on the MDT, optionally under a group lock (to
 * flush dirty cache), optionally verifying data versions first and
 * restoring mtime/atime afterwards.
 */
2076 static int ll_swap_layouts(struct file *file1, struct file *file2,
2077 struct lustre_swap_layouts *lsl)
2079 struct mdc_swap_layouts msl;
2080 struct md_op_data *op_data;
2083 struct ll_swap_stack *llss = NULL;
2086 OBD_ALLOC_PTR(llss);
2090 llss->inode1 = file1->f_dentry->d_inode;
2091 llss->inode2 = file2->f_dentry->d_inode;
2093 if (!S_ISREG(llss->inode2->i_mode))
2094 GOTO(free, rc = -EINVAL);
2096 if (inode_permission(llss->inode1, MAY_WRITE) ||
2097 inode_permission(llss->inode2, MAY_WRITE))
2098 GOTO(free, rc = -EPERM);
2100 if (llss->inode2->i_sb != llss->inode1->i_sb)
2101 GOTO(free, rc = -EXDEV);
2103 /* we use 2 bool because it is easier to swap than 2 bits */
2104 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2105 llss->check_dv1 = true;
2107 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2108 llss->check_dv2 = true;
2110 /* we cannot use lsl->sl_dvX directly because we may swap them */
2111 llss->dv1 = lsl->sl_dv1;
2112 llss->dv2 = lsl->sl_dv2;
2114 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2115 if (rc == 0) /* same file, done! */
/* order by FID so two concurrent swaps cannot deadlock on locks */
2118 if (rc < 0) { /* sequentialize it */
2119 swap(llss->inode1, llss->inode2);
2121 swap(llss->dv1, llss->dv2);
2122 swap(llss->check_dv1, llss->check_dv2);
2126 if (gid != 0) { /* application asks to flush dirty cache */
2127 rc = ll_get_grouplock(llss->inode1, file1, gid);
2131 rc = ll_get_grouplock(llss->inode2, file2, gid);
2133 ll_put_grouplock(llss->inode1, file1, gid);
2138 /* to be able to restore mtime and atime after swap
2139 * we need to first save them */
2141 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2142 llss->ia1.ia_mtime = llss->inode1->i_mtime;
2143 llss->ia1.ia_atime = llss->inode1->i_atime;
2144 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2145 llss->ia2.ia_mtime = llss->inode2->i_mtime;
2146 llss->ia2.ia_atime = llss->inode2->i_atime;
2147 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2150 /* ultimate check, before swaping the layouts we check if
2151 * dataversion has changed (if requested) */
2152 if (llss->check_dv1) {
2153 rc = ll_data_version(llss->inode1, &dv, 0);
2156 if (dv != llss->dv1)
2157 GOTO(putgl, rc = -EAGAIN);
2160 if (llss->check_dv2) {
2161 rc = ll_data_version(llss->inode2, &dv, 0);
2164 if (dv != llss->dv2)
2165 GOTO(putgl, rc = -EAGAIN);
2168 /* struct md_op_data is used to send the swap args to the mdt
2169 * only flags is missing, so we use struct mdc_swap_layouts
2170 * through the md_op_data->op_data */
2171 /* flags from user space have to be converted before they are send to
2172 * server, no flag is sent today, they are only used on the client */
2175 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2176 0, LUSTRE_OPC_ANY, &msl);
2177 if (IS_ERR(op_data))
2178 GOTO(free, rc = PTR_ERR(op_data));
2180 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2181 sizeof(*op_data), op_data, NULL);
2182 ll_finish_md_op_data(op_data);
2186 ll_put_grouplock(llss->inode2, file2, gid);
2187 ll_put_grouplock(llss->inode1, file1, gid);
2190 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2194 /* clear useless flags */
2195 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2196 llss->ia1.ia_valid &= ~ATTR_MTIME;
2197 llss->ia2.ia_valid &= ~ATTR_MTIME;
2200 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2201 llss->ia1.ia_valid &= ~ATTR_ATIME;
2202 llss->ia2.ia_valid &= ~ATTR_ATIME;
2205 /* update time if requested */
/* note: ia1/ia2 are applied to the opposite inode because the
 * layouts (and saved times) were swapped above */
2207 if (llss->ia2.ia_valid != 0) {
2208 mutex_lock(&llss->inode1->i_mutex);
2209 rc = ll_setattr(file1->f_dentry, &llss->ia2);
2210 mutex_unlock(&llss->inode1->i_mutex);
2213 if (llss->ia1.ia_valid != 0) {
2216 mutex_lock(&llss->inode2->i_mutex);
2217 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2218 mutex_unlock(&llss->inode2->i_mutex);
/*
 * Send an HSM state-set request for @inode to the MDT; non-root
 * callers may only touch flags within HSM_USER_MASK.
 */
2230 static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2232 struct md_op_data *op_data;
2235 /* Non-root users are forbidden to set or clear flags which are
2236 * NOT defined in HSM_USER_MASK. */
2237 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2238 !cfs_capable(CFS_CAP_SYS_ADMIN))
2241 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2242 LUSTRE_OPC_ANY, hss);
2243 if (IS_ERR(op_data))
2244 RETURN(PTR_ERR(op_data));
2246 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2247 sizeof(*op_data), op_data, NULL);
2249 ll_finish_md_op_data(op_data);
/*
 * Import a file already present in the HSM archive: mark it
 * archived/exists/released on the MDT, then restore mode, ownership,
 * size and timestamps from the user-supplied hsm_user_import.
 */
2254 static int ll_hsm_import(struct inode *inode, struct file *file,
2255 struct hsm_user_import *hui)
2257 struct hsm_state_set *hss = NULL;
2258 struct iattr *attr = NULL;
2262 if (!S_ISREG(inode->i_mode))
2268 GOTO(out, rc = -ENOMEM);
2270 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2271 hss->hss_archive_id = hui->hui_archive_id;
2272 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2273 rc = ll_hsm_state_set(inode, hss);
2277 OBD_ALLOC_PTR(attr);
2279 GOTO(out, rc = -ENOMEM);
/* force a regular-file mode; only rwx bits are taken from the user */
2281 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2282 attr->ia_mode |= S_IFREG;
2283 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2284 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2285 attr->ia_size = hui->hui_size;
2286 attr->ia_mtime.tv_sec = hui->hui_mtime;
2287 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2288 attr->ia_atime.tv_sec = hui->hui_atime;
2289 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2291 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2292 ATTR_UID | ATTR_GID |
2293 ATTR_MTIME | ATTR_MTIME_SET |
2294 ATTR_ATIME | ATTR_ATIME_SET;
2296 rc = ll_setattr_raw(file->f_dentry, attr, true);
/*
 * unlocked_ioctl handler for regular files: dispatches every
 * Lustre-specific ioctl (striping, group locks, HSM, leases, FIEMAP,
 * fid2path, ...) and falls back to per-OBD ioctl handlers.
 *
 * NOTE(review): listing is elided; several case labels and returns
 * between the visible lines are omitted here.
 */
2311 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2313 struct inode *inode = file->f_dentry->d_inode;
2314 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2318 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2319 PFID(ll_inode2fid(inode)), inode, cmd);
2320 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2322 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2323 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2327 case LL_IOC_GETFLAGS:
2328 /* Get the current value of the file flags */
2329 return put_user(fd->fd_flags, (int *)arg);
2330 case LL_IOC_SETFLAGS:
2331 case LL_IOC_CLRFLAGS:
2332 /* Set or clear specific file flags */
2333 /* XXX This probably needs checks to ensure the flags are
2334 * not abused, and to handle any flag side effects.
2336 if (get_user(flags, (int *) arg))
2339 if (cmd == LL_IOC_SETFLAGS) {
2340 if ((flags & LL_FILE_IGNORE_LOCK) &&
2341 !(file->f_flags & O_DIRECT)) {
2342 CERROR("%s: unable to disable locking on "
2343 "non-O_DIRECT file\n", current->comm);
2347 fd->fd_flags |= flags;
2349 fd->fd_flags &= ~flags;
2352 case LL_IOC_LOV_SETSTRIPE:
2353 RETURN(ll_lov_setstripe(inode, file, arg));
2354 case LL_IOC_LOV_SETEA:
2355 RETURN(ll_lov_setea(inode, file, arg));
2356 case LL_IOC_LOV_SWAP_LAYOUTS: {
2358 struct lustre_swap_layouts lsl;
2360 if (copy_from_user(&lsl, (char *)arg,
2361 sizeof(struct lustre_swap_layouts)))
/* both files must be open for write for a layout swap */
2364 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2367 file2 = fget(lsl.sl_fd);
2372 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2373 rc = ll_swap_layouts(file, file2, &lsl);
2377 case LL_IOC_LOV_GETSTRIPE:
2378 RETURN(ll_lov_getstripe(inode, arg));
2379 case LL_IOC_RECREATE_OBJ:
2380 RETURN(ll_lov_recreate_obj(inode, arg));
2381 case LL_IOC_RECREATE_FID:
2382 RETURN(ll_lov_recreate_fid(inode, arg));
2383 case FSFILT_IOC_FIEMAP:
2384 RETURN(ll_ioctl_fiemap(inode, arg));
2385 case FSFILT_IOC_GETFLAGS:
2386 case FSFILT_IOC_SETFLAGS:
2387 RETURN(ll_iocontrol(inode, file, cmd, arg));
2388 case FSFILT_IOC_GETVERSION_OLD:
2389 case FSFILT_IOC_GETVERSION:
2390 RETURN(put_user(inode->i_generation, (int *)arg));
2391 case LL_IOC_GROUP_LOCK:
2392 RETURN(ll_get_grouplock(inode, file, arg));
2393 case LL_IOC_GROUP_UNLOCK:
2394 RETURN(ll_put_grouplock(inode, file, arg));
2395 case IOC_OBD_STATFS:
2396 RETURN(ll_obd_statfs(inode, (void *)arg));
2398 /* We need to special case any other ioctls we want to handle,
2399 * to send them to the MDS/OST as appropriate and to properly
2400 * network encode the arg field.
2401 case FSFILT_IOC_SETVERSION_OLD:
2402 case FSFILT_IOC_SETVERSION:
2404 case LL_IOC_FLUSHCTX:
2405 RETURN(ll_flush_ctx(inode));
2406 case LL_IOC_PATH2FID: {
2407 if (copy_to_user((void *)arg, ll_inode2fid(inode),
2408 sizeof(struct lu_fid)))
2413 case OBD_IOC_FID2PATH:
2414 RETURN(ll_fid2path(inode, (void *)arg));
2415 case LL_IOC_DATA_VERSION: {
2416 struct ioc_data_version idv;
2419 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
/* only the two flush flags are honored from userspace */
2422 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2423 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2425 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2431 case LL_IOC_GET_MDTIDX: {
2434 mdtidx = ll_get_mdt_idx(inode);
2438 if (put_user((int)mdtidx, (int*)arg))
2443 case OBD_IOC_GETDTNAME:
2444 case OBD_IOC_GETMDNAME:
2445 RETURN(ll_get_obd_name(inode, cmd, arg));
2446 case LL_IOC_HSM_STATE_GET: {
2447 struct md_op_data *op_data;
2448 struct hsm_user_state *hus;
2455 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2456 LUSTRE_OPC_ANY, hus);
2457 if (IS_ERR(op_data)) {
2459 RETURN(PTR_ERR(op_data));
2462 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2465 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2468 ll_finish_md_op_data(op_data);
2472 case LL_IOC_HSM_STATE_SET: {
2473 struct hsm_state_set *hss;
2480 if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2485 rc = ll_hsm_state_set(inode, hss);
2490 case LL_IOC_HSM_ACTION: {
2491 struct md_op_data *op_data;
2492 struct hsm_current_action *hca;
2499 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2500 LUSTRE_OPC_ANY, hca);
2501 if (IS_ERR(op_data)) {
2503 RETURN(PTR_ERR(op_data));
2506 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2509 if (copy_to_user((char *)arg, hca, sizeof(*hca)))
2512 ll_finish_md_op_data(op_data);
2516 case LL_IOC_SET_LEASE: {
2517 struct ll_inode_info *lli = ll_i2info(inode);
2518 struct obd_client_handle *och = NULL;
/* lease mode must be compatible with the file's open mode */
2524 if (!(file->f_mode & FMODE_WRITE))
2529 if (!(file->f_mode & FMODE_READ))
2534 mutex_lock(&lli->lli_och_mutex);
2535 if (fd->fd_lease_och != NULL) {
2536 och = fd->fd_lease_och;
2537 fd->fd_lease_och = NULL;
2539 mutex_unlock(&lli->lli_och_mutex);
2542 mode = och->och_flags &(FMODE_READ|FMODE_WRITE);
2543 rc = ll_lease_close(och, inode, &lease_broken);
2544 if (rc == 0 && lease_broken)
2550 /* return the type of lease or error */
2551 RETURN(rc < 0 ? rc : (int)mode);
2556 CDEBUG(D_INODE, "Set lease with mode %d\n", mode);
2558 /* apply for lease */
2559 och = ll_lease_open(inode, file, mode, 0);
2561 RETURN(PTR_ERR(och));
2564 mutex_lock(&lli->lli_och_mutex);
2565 if (fd->fd_lease_och == NULL) {
2566 fd->fd_lease_och = och;
2569 mutex_unlock(&lli->lli_och_mutex);
2571 /* impossible now that only excl is supported for now */
2572 ll_lease_close(och, inode, &lease_broken);
2577 case LL_IOC_GET_LEASE: {
2578 struct ll_inode_info *lli = ll_i2info(inode);
2579 struct ldlm_lock *lock = NULL;
2582 mutex_lock(&lli->lli_och_mutex);
2583 if (fd->fd_lease_och != NULL) {
2584 struct obd_client_handle *och = fd->fd_lease_och;
2586 lock = ldlm_handle2lock(&och->och_lease_handle);
2588 lock_res_and_lock(lock);
/* report the lease mode only while the lock is not cancelled */
2589 if (!ldlm_is_cancel(lock))
2590 rc = och->och_flags &
2591 (FMODE_READ | FMODE_WRITE);
2592 unlock_res_and_lock(lock);
2593 LDLM_LOCK_PUT(lock);
2596 mutex_unlock(&lli->lli_och_mutex);
2599 case LL_IOC_HSM_IMPORT: {
2600 struct hsm_user_import *hui;
2606 if (copy_from_user(hui, (void *)arg, sizeof(*hui))) {
2611 rc = ll_hsm_import(inode, file, hui);
/* unknown cmd: offer it to registered handlers, then the OBD */
2621 ll_iocontrol_call(inode, file, cmd, arg, &err))
2624 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2630 #ifndef HAVE_FILE_LLSEEK_SIZE
/* Fallback for kernels without generic_file_llseek_size(): validate
 * @offset against the file mode and @maxsize, then commit it to
 * f_pos (resetting f_version) if it actually changed. */
2631 static inline loff_t
2632 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2634 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2636 if (offset > maxsize)
2639 if (offset != file->f_pos) {
2640 file->f_pos = offset;
2641 file->f_version = 0;
/*
 * Local copy of generic_file_llseek_size() (kernels lacking it):
 * compute the new offset for SEEK_SET/CUR/END/DATA/HOLE relative to
 * @maxsize and the supplied @eof, serializing SEEK_CUR updates on
 * i_mutex.
 */
2647 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2648 loff_t maxsize, loff_t eof)
2650 struct inode *inode = file->f_dentry->d_inode;
2658 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2659 * position-querying operation. Avoid rewriting the "same"
2660 * f_pos value back to the file because a concurrent read(),
2661 * write() or lseek() might have altered it
2666 * f_lock protects against read/modify/write race with other
2667 * SEEK_CURs. Note that parallel writes and reads behave
2670 mutex_lock(&inode->i_mutex);
2671 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2672 mutex_unlock(&inode->i_mutex);
2676 * In the generic case the entire file is data, so as long as
2677 * offset isn't at the end of the file then the offset is data.
2684 * There is a virtual hole at the end of the file, so as long as
2685 * offset isn't i_size or larger, return i_size.
2693 return llseek_execute(file, offset, maxsize);
/*
 * llseek() entry point for Lustre regular files.
 * For SEEK_END/SEEK_HOLE/SEEK_DATA the cluster-wide size must be
 * current, so glimpse the size from the OSTs before seeking; then
 * defer to ll_generic_file_llseek_size() bounded by the filesystem
 * maximum file size.
 */
2697 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2699 struct inode *inode = file->f_dentry->d_inode;
2700 loff_t retval, eof = 0;
/* retval here is only the debug-trace target offset, not the result */
2703 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2704 (origin == SEEK_CUR) ? file->f_pos : 0);
2705 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2706 PFID(ll_inode2fid(inode)), inode, retval, retval,
2708 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
/* these origins depend on an up-to-date i_size: glimpse first */
2710 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2711 retval = ll_glimpse_size(inode);
2714 eof = i_size_read(inode);
2717 retval = ll_generic_file_llseek_size(file, offset, origin,
2718 ll_file_maxbytes(inode), eof);
/*
 * flush() handler (called on every close of a file descriptor).
 * Collects async writeback errors recorded on the inode and in the
 * cl_object, and reports -EIO once per descriptor: if the descriptor
 * already saw a write failure (fd_write_failed) the error is not
 * reported again.
 */
2722 static int ll_flush(struct file *file, fl_owner_t id)
2724 struct inode *inode = file->f_dentry->d_inode;
2725 struct ll_inode_info *lli = ll_i2info(inode);
2726 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2729 LASSERT(!S_ISDIR(inode->i_mode));
2731 /* catch async errors that were recorded back when async writeback
2732 * failed for pages in this mapping. */
2733 rc = lli->lli_async_rc;
2734 lli->lli_async_rc = 0;
2735 if (lli->lli_clob != NULL) {
2736 err = lov_read_and_clear_async_rc(lli->lli_clob);
2741 /* The application has been told write failure already.
2742 * Do not report failure again. */
2743 if (fd->fd_write_failed)
2745 return rc ? -EIO : 0;
/**
 * Ensure pages of [start, end] of @inode have been written out.
 */
2749 * Called to make sure a portion of file has been written out.
2750 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2752 * Return how many pages have been written.
2754 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2755 enum cl_fsync_mode mode, int ignore_layout)
2757 struct cl_env_nest nest;
2760 struct obd_capa *capa = NULL;
2761 struct cl_fsync_io *fio;
/* reject modes outside the known cl_fsync_mode set */
2765 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2766 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2769 env = cl_env_nested_get(&nest);
2771 RETURN(PTR_ERR(env));
/* OSS write capability accompanies the sync I/O (fi_capa below) */
2773 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2775 io = ccc_env_thread_io(env);
2776 io->ci_obj = cl_i2info(inode)->lli_clob;
2777 io->ci_ignore_layout = ignore_layout;
2779 /* initialize parameters for sync */
2780 fio = &io->u.ci_fsync;
2781 fio->fi_capa = capa;
2782 fio->fi_start = start;
2784 fio->fi_fid = ll_inode2fid(inode);
2785 fio->fi_mode = mode;
2786 fio->fi_nr_written = 0;
/* run the CIT_FSYNC io; on success report number of pages written */
2788 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2789 result = cl_io_loop(env, io);
2791 result = io->ci_result;
2793 result = fio->fi_nr_written;
2794 cl_io_fini(env, io);
2795 cl_env_nested_put(&nest, env);
/*
 * fsync()/fdatasync() handler.  Three kernel-ABI variants are compiled
 * from the same body via HAVE_FILE_FSYNC_4ARGS / HAVE_FILE_FSYNC_2ARGS:
 * only the 4-arg variant receives a [start,end] range and takes
 * i_mutex itself; the others sync the whole file (end = LLONG_MAX).
 * Flushes dirty pages, collects recorded async write errors, issues an
 * MDS fsync (md_fsync) and, for regular files, a full OST sync via
 * cl_sync_file_range(CL_FSYNC_ALL); fd_write_failed tracks the result.
 */
2803 * When dentry is provided (the 'else' case), *file->f_dentry may be
2804 * null and dentry must be used directly rather than pulled from
2805 * *file->f_dentry as is done otherwise.
2808 #ifdef HAVE_FILE_FSYNC_4ARGS
2809 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2811 struct dentry *dentry = file->f_dentry;
2812 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2813 int ll_fsync(struct file *file, int datasync)
2815 struct dentry *dentry = file->f_dentry;
2817 loff_t end = LLONG_MAX;
2819 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2822 loff_t end = LLONG_MAX;
2824 struct inode *inode = dentry->d_inode;
2825 struct ll_inode_info *lli = ll_i2info(inode);
2826 struct ptlrpc_request *req;
2827 struct obd_capa *oc;
2831 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2832 PFID(ll_inode2fid(inode)), inode);
2833 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2835 #ifdef HAVE_FILE_FSYNC_4ARGS
/* 4-arg kernels: we must write+wait the range and lock i_mutex here */
2836 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2837 mutex_lock(&inode->i_mutex);
2839 /* fsync's caller has already called _fdata{sync,write}, we want
2840 * that IO to finish before calling the osc and mdc sync methods */
2841 rc = filemap_fdatawait(inode->i_mapping);
2844 /* catch async errors that were recorded back when async writeback
2845 * failed for pages in this mapping. */
2846 if (!S_ISDIR(inode->i_mode)) {
2847 err = lli->lli_async_rc;
2848 lli->lli_async_rc = 0;
2851 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* sync metadata on the MDS, using an MDS capability if available */
2856 oc = ll_mdscapa_get(inode);
2857 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2863 ptlrpc_req_finished(req);
/* regular files: sync file data to the OSTs as well */
2865 if (S_ISREG(inode->i_mode)) {
2866 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2868 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2869 if (rc == 0 && err < 0)
2872 fd->fd_write_failed = true;
2874 fd->fd_write_failed = false;
2877 #ifdef HAVE_FILE_FSYNC_4ARGS
2878 mutex_unlock(&inode->i_mutex);
/*
 * flock()/fcntl() lock handler: translates a kernel file_lock into an
 * LDLM_FLOCK enqueue on the MDS, then mirrors the granted/released
 * lock into the local lock lists (flock_lock_file_wait /
 * posix_lock_file_wait) so the VFS bookkeeping matches the cluster
 * state.  If the local step fails, the just-acquired server lock is
 * released again by re-enqueueing with LCK_NL (unlock).
 * NOTE(review): the switch statements on fl_type and cmd are partially
 * elided in this excerpt; only selected case bodies are visible.
 */
2884 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2886 struct inode *inode = file->f_dentry->d_inode;
2887 struct ll_sb_info *sbi = ll_i2sbi(inode);
2888 struct ldlm_enqueue_info einfo = {
2889 .ei_type = LDLM_FLOCK,
2890 .ei_cb_cp = ldlm_flock_completion_ast,
2891 .ei_cbdata = file_lock,
2893 struct md_op_data *op_data;
2894 struct lustre_handle lockh = {0};
2895 ldlm_policy_data_t flock = {{0}};
/* saved so the TEST_LOCK path can restore it after the enqueue */
2896 int fl_type = file_lock->fl_type;
2902 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
2903 PFID(ll_inode2fid(inode)), file_lock);
2905 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2907 if (file_lock->fl_flags & FL_FLOCK) {
2908 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2909 /* flocks are whole-file locks */
2910 flock.l_flock.end = OFFSET_MAX;
2911 /* For flocks owner is determined by the local file desctiptor*/
2912 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2913 } else if (file_lock->fl_flags & FL_POSIX) {
2914 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2915 flock.l_flock.start = file_lock->fl_start;
2916 flock.l_flock.end = file_lock->fl_end;
2920 flock.l_flock.pid = file_lock->fl_pid;
2922 /* Somewhat ugly workaround for svc lockd.
2923 * lockd installs custom fl_lmops->lm_compare_owner that checks
2924 * for the fl_owner to be the same (which it always is on local node
2925 * I guess between lockd processes) and then compares pid.
2926 * As such we assign pid to the owner field to make it all work,
2927 * conflict with normal locks is unlikely since pid space and
2928 * pointer space for current->files are not intersecting */
2929 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2930 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* map fcntl lock types onto LDLM modes: RD->PR, UNLCK->NL, WR->PW */
2934 einfo.ei_mode = LCK_PR;
2937 /* An unlock request may or may not have any relation to
2938 * existing locks so we may not be able to pass a lock handle
2939 * via a normal ldlm_lock_cancel() request. The request may even
2940 * unlock a byte range in the middle of an existing lock. In
2941 * order to process an unlock request we need all of the same
2942 * information that is given with a normal read or write record
2943 * lock request. To avoid creating another ldlm unlock (cancel)
2944 * message we'll treat a LCK_NL flock request as an unlock. */
2945 einfo.ei_mode = LCK_NL;
2948 einfo.ei_mode = LCK_PW;
2951 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* non-blocking variants map to BLOCK_NOWAIT; GETLK to TEST_LOCK */
2966 flags = LDLM_FL_BLOCK_NOWAIT;
2972 flags = LDLM_FL_TEST_LOCK;
2975 CERROR("unknown fcntl lock command: %d\n", cmd);
2979 /* Save the old mode so that if the mode in the lock changes we
2980 * can decrement the appropriate reader or writer refcount. */
2981 file_lock->fl_type = einfo.ei_mode;
2983 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2984 LUSTRE_OPC_ANY, NULL);
2985 if (IS_ERR(op_data))
2986 RETURN(PTR_ERR(op_data));
2988 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
2989 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
2990 flock.l_flock.pid, flags, einfo.ei_mode,
2991 flock.l_flock.start, flock.l_flock.end);
2993 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2994 op_data, &lockh, &flock, 0, NULL /* req */, flags);
2996 /* Restore the file lock type if not TEST lock. */
2997 if (!(flags & LDLM_FL_TEST_LOCK))
2998 file_lock->fl_type = fl_type;
3000 if ((file_lock->fl_flags & FL_FLOCK) &&
3001 (rc == 0 || file_lock->fl_type == F_UNLCK))
3002 rc2 = flock_lock_file_wait(file, file_lock);
3003 if ((file_lock->fl_flags & FL_POSIX) &&
3004 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3005 !(flags & LDLM_FL_TEST_LOCK))
3006 rc2 = posix_lock_file_wait(file, file_lock);
/* local bookkeeping failed: release the server-side lock again */
3008 if (rc2 && file_lock->fl_type != F_UNLCK) {
3009 einfo.ei_mode = LCK_NL;
3010 md_enqueue(sbi->ll_md_exp, &einfo, NULL,
3011 op_data, &lockh, &flock, 0, NULL /* req */, flags);
3015 ll_finish_md_op_data(op_data);
/*
 * Look up the FID of @name (length @namelen) under @parent via an MDS
 * getattr-by-name request (op_valid = OBD_MD_FLID only).
 * NOTE(review): the lines copying body->fid1 (or equivalent) into
 * @fid are elided in this excerpt — confirm against the full source.
 */
3020 int ll_get_fid_by_name(struct inode *parent, const char *name,
3021 int namelen, struct lu_fid *fid)
3023 struct md_op_data *op_data = NULL;
3024 struct mdt_body *body;
3025 struct ptlrpc_request *req;
3029 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3030 LUSTRE_OPC_ANY, NULL);
3031 if (IS_ERR(op_data))
3032 RETURN(PTR_ERR(op_data));
/* only the FID is wanted from the getattr reply */
3034 op_data->op_valid = OBD_MD_FLID;
3035 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3036 ll_finish_md_op_data(op_data);
3040 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3042 GOTO(out_req, rc = -EFAULT);
3046 ptlrpc_req_finished(req);
/*
 * Migrate directory entry @name under @parent to MDT @mdtidx.
 * Resolves the child FID (from the dcache if possible, otherwise via
 * ll_get_fid_by_name()), skips the migration if the object already
 * lives on the target MDT, and otherwise issues a rename-to-self with
 * CLI_MIGRATE set.  A cached child inode has its aliases invalidated
 * and, on the out path, its nlink cleared.
 */
3050 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3051 const char *name, int namelen)
3053 struct dentry *dchild = NULL;
3054 struct inode *child_inode = NULL;
3055 struct md_op_data *op_data;
3056 struct ptlrpc_request *request = NULL;
3061 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3062 name, PFID(ll_inode2fid(parent)), mdtidx);
3064 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3065 0, LUSTRE_OPC_ANY, NULL);
3066 if (IS_ERR(op_data))
3067 RETURN(PTR_ERR(op_data));
3069 /* Get child FID first */
3070 qstr.hash = full_name_hash(name, namelen);
3073 dchild = d_lookup(file->f_dentry, &qstr);
3074 if (dchild != NULL && dchild->d_inode != NULL) {
3075 op_data->op_fid3 = *ll_inode2fid(dchild->d_inode);
/* NOTE(review): this inner NULL check repeats the outer condition */
3076 if (dchild->d_inode != NULL) {
3077 child_inode = igrab(dchild->d_inode);
3078 ll_invalidate_aliases(child_inode);
/* not in dcache: ask the MDS for the FID by name */
3082 rc = ll_get_fid_by_name(parent, name, namelen,
3088 if (!fid_is_sane(&op_data->op_fid3)) {
3089 CERROR("%s: migrate %s , but fid "DFID" is insane\n",
3090 ll_get_fsname(parent->i_sb, NULL, 0), name,
3091 PFID(&op_data->op_fid3));
3092 GOTO(out_free, rc = -EINVAL);
3095 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
/* already on the requested MDT: nothing to do */
3100 CDEBUG(D_INFO, "%s:"DFID" is already on MDT%d.\n", name,
3101 PFID(&op_data->op_fid3), mdtidx);
3102 GOTO(out_free, rc = 0);
/* migration is implemented as a rename-to-self with CLI_MIGRATE */
3105 op_data->op_mds = mdtidx;
3106 op_data->op_cli_flags = CLI_MIGRATE;
3107 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3108 namelen, name, namelen, &request);
3110 ll_update_times(request, parent);
3112 ptlrpc_req_finished(request);
3117 if (child_inode != NULL) {
3118 clear_nlink(child_inode);
3122 ll_finish_md_op_data(op_data);
3127 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
/**
 * Non-intrusively (LDLM_FL_TEST_LOCK) probe cached MDS inodebits locks.
 */
3135 * test if some locks matching bits and l_req_mode are acquired
3136 * - bits can be in different locks
3137 * - if found clear the common lock bits in *bits
3138 * - the bits not found, are kept in *bits
3140 * \param bits [IN] searched lock bits [IN]
3141 * \param l_req_mode [IN] searched lock mode
3142 * \retval boolean, true iff all bits are found
3144 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
3146 struct lustre_handle lockh;
3147 ldlm_policy_data_t policy;
/* LCK_MINMODE means "any mode": expand to the full mode mask */
3148 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
3149 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
3158 fid = &ll_i2info(inode)->lli_fid;
3159 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3160 ldlm_lockname[mode]);
/* TEST_LOCK: match without taking a reference on the found lock */
3162 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* probe each requested bit individually; stop once all bits found */
3163 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3164 policy.l_inodebits.bits = *bits & (1 << i);
3165 if (policy.l_inodebits.bits == 0)
3168 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3169 &policy, mode, &lockh)) {
3170 struct ldlm_lock *lock;
3172 lock = ldlm_handle2lock(&lockh);
3175 ~(lock->l_policy_data.l_inodebits.bits);
3176 LDLM_LOCK_PUT(lock);
3178 *bits &= ~policy.l_inodebits.bits;
/*
 * Match (and take a reference on) a cached MDS inodebits lock covering
 * @bits; the matched handle is returned in @lockh.  Returns the lock
 * mode on success, 0 on no match (per md_lock_match semantics).
 */
3185 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
3186 struct lustre_handle *lockh, __u64 flags,
3189 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
3194 fid = &ll_i2info(inode)->lli_fid;
3195 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3197 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3198 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the result of an inode revalidation RPC: translate
 * -ENOENT on non-regular/non-directory objects into success (the
 * object was simply unlinked), and log any other failure.
 */
3203 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3205 /* Already unlinked. Just update nlink and return success */
3206 if (rc == -ENOENT) {
3208 /* This path cannot be hit for regular files unless in
3209 * case of obscure races, so no need to to validate
3211 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3213 } else if (rc != 0) {
/* expected races (-EACCES/-EIDRM) log quietly, others loudly */
3214 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3215 "%s: revalidate FID "DFID" error: rc = %d\n",
3216 ll_get_fsname(inode->i_sb, NULL, 0),
3217 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate @dentry's inode attributes covered by lock bits @ibits.
 * Two strategies: when the server supports OBD_CONNECT_ATTRFID, issue
 * an intent getattr/lookup by FID (md_intent_lock); otherwise, if no
 * matching MDS lock is cached locally, issue a plain md_getattr and
 * refresh the inode from the reply via ll_prep_inode().
 */
3223 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3225 struct inode *inode = dentry->d_inode;
3226 struct ptlrpc_request *req = NULL;
3227 struct obd_export *exp;
3231 LASSERT(inode != NULL);
3233 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3234 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3236 exp = ll_i2mdexp(inode);
3238 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3239 * But under CMD case, it caused some lock issues, should be fixed
3240 * with new CMD ibits lock. See bug 12718 */
3241 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3242 struct lookup_intent oit = { .it_op = IT_GETATTR };
3243 struct md_op_data *op_data;
/* lookup-only revalidation needs just IT_LOOKUP */
3245 if (ibits == MDS_INODELOCK_LOOKUP)
3246 oit.it_op = IT_LOOKUP;
3248 /* Call getattr by fid, so do not provide name at all. */
3249 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3250 dentry->d_inode, NULL, 0, 0,
3251 LUSTRE_OPC_ANY, NULL);
3252 if (IS_ERR(op_data))
3253 RETURN(PTR_ERR(op_data));
3255 oit.it_create_mode |= M_CHECK_STALE;
3256 rc = md_intent_lock(exp, op_data, NULL, 0,
3257 /* we are not interested in name
3260 ll_md_blocking_ast, 0);
3261 ll_finish_md_op_data(op_data);
3262 oit.it_create_mode &= ~M_CHECK_STALE;
3264 rc = ll_inode_revalidate_fini(inode, rc);
3268 rc = ll_revalidate_it_finish(req, &oit, dentry);
3270 ll_intent_release(&oit);
3274 /* Unlinked? Unhash dentry, so it is not picked up later by
3275 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3276 here to preserve get_cwd functionality on 2.6.
3278 if (!dentry->d_inode->i_nlink)
3279 d_lustre_invalidate(dentry, 0);
3281 ll_lookup_finish_locks(&oit, dentry);
/* no ATTRFID: only RPC if no suitable lock is already cached */
3282 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3283 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3284 obd_valid valid = OBD_MD_FLGETATTR;
3285 struct md_op_data *op_data;
/* regular files also need striping EA info sized to mdsize */
3288 if (S_ISREG(inode->i_mode)) {
3289 rc = ll_get_default_mdsize(sbi, &ealen);
3292 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3295 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3296 0, ealen, LUSTRE_OPC_ANY,
3298 if (IS_ERR(op_data))
3299 RETURN(PTR_ERR(op_data));
3301 op_data->op_valid = valid;
3302 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
3303 * capa for this inode. Because we only keep capas of dirs
3305 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3306 ll_finish_md_op_data(op_data);
3308 rc = ll_inode_revalidate_fini(inode, rc);
3312 rc = ll_prep_inode(&inode, req, NULL, NULL);
3315 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge the attributes of all stripes (via
 * md_merge_attr on lli_lsm_md) and cache the merged size, nlink and
 * a/m/ctime values in the inode's ll_inode_info.
 */
3319 static int ll_merge_md_attr(struct inode *inode)
3321 struct cl_attr attr = { 0 };
3324 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3325 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3330 ll_i2info(inode)->lli_stripe_dir_size = attr.cat_size;
3331 ll_i2info(inode)->lli_stripe_dir_nlink = attr.cat_nlink;
3333 ll_i2info(inode)->lli_lvb.lvb_atime = attr.cat_atime;
3334 ll_i2info(inode)->lli_lvb.lvb_mtime = attr.cat_mtime;
3335 ll_i2info(inode)->lli_lvb.lvb_ctime = attr.cat_ctime;
/*
 * Full revalidation: refresh MDS attributes via __ll_inode_revalidate,
 * then update timestamps from the cached LVB.  Striped directories get
 * merged per-stripe attributes; regular files get an OST size glimpse
 * unless an HSM restore is running (the MDT already returned the right
 * size and holds the layout lock, so a glimpse would block).
 */
3341 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3343 struct inode *inode = dentry->d_inode;
3347 rc = __ll_inode_revalidate(dentry, ibits);
3351 /* if object isn't regular file, don't validate size */
3352 if (!S_ISREG(inode->i_mode)) {
3353 if (S_ISDIR(inode->i_mode) &&
3354 ll_i2info(inode)->lli_lsm_md != NULL) {
3355 rc = ll_merge_md_attr(inode);
3360 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
3361 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
3362 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
3364 /* In case of restore, the MDT has the right size and has
3365 * already send it back without granting the layout lock,
3366 * inode is up-to-date so glimpse is useless.
3367 * Also to glimpse we need the layout, in case of a running
3368 * restore the MDT holds the layout lock so the glimpse will
3369 * block up to the end of restore (getattr will block)
3371 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3372 rc = ll_glimpse_size(inode);
/*
 * getattr() handler: revalidate UPDATE|LOOKUP bits, then fill *stat
 * from the inode.  32-bit API clients get a squashed inode number
 * built from the FID; striped directories report the merged nlink and
 * size cached by ll_merge_md_attr().
 */
3377 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3379 struct inode *inode = de->d_inode;
3380 struct ll_sb_info *sbi = ll_i2sbi(inode);
3381 struct ll_inode_info *lli = ll_i2info(inode);
3384 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3385 MDS_INODELOCK_LOOKUP);
3386 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3391 stat->dev = inode->i_sb->s_dev;
/* 32-bit userspace needs an ino that fits in 32 bits */
3392 if (ll_need_32bit_api(sbi))
3393 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3395 stat->ino = inode->i_ino;
3396 stat->mode = inode->i_mode;
3397 stat->uid = inode->i_uid;
3398 stat->gid = inode->i_gid;
3399 stat->rdev = inode->i_rdev;
3400 stat->atime = inode->i_atime;
3401 stat->mtime = inode->i_mtime;
3402 stat->ctime = inode->i_ctime;
3403 stat->blksize = 1 << inode->i_blkbits;
3404 stat->blocks = inode->i_blocks;
/* striped dirs report merged per-stripe nlink/size */
3406 if (S_ISDIR(inode->i_mode) &&
3407 ll_i2info(inode)->lli_lsm_md != NULL) {
3408 stat->nlink = lli->lli_stripe_dir_nlink;
3409 stat->size = lli->lli_stripe_dir_size;
3411 stat->nlink = inode->i_nlink;
3412 stat->size = i_size_read(inode);
/*
 * fiemap() inode operation: marshal the kernel fiemap_extent_info into
 * a Lustre ll_user_fiemap (including the user's extent array), run
 * ll_do_fiemap(), and copy flags/extents back.
 * NOTE(review): no overflow check is visible on the num_bytes
 * computation; extent_count comes from userspace — confirm bounds are
 * enforced by the caller/ioctl layer.
 */
3418 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3419 __u64 start, __u64 len)
3423 struct ll_user_fiemap *fiemap;
3424 unsigned int extent_count = fieinfo->fi_extents_max;
3426 num_bytes = sizeof(*fiemap) + (extent_count *
3427 sizeof(struct ll_fiemap_extent));
3428 OBD_ALLOC_LARGE(fiemap, num_bytes);
3433 fiemap->fm_flags = fieinfo->fi_flags;
3434 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3435 fiemap->fm_start = start;
3436 fiemap->fm_length = len;
/* seed only the first extent from the caller (continuation support) */
3437 if (extent_count > 0)
3438 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3439 sizeof(struct ll_fiemap_extent))_;
3441 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3443 fieinfo->fi_flags = fiemap->fm_flags;
3444 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3445 if (extent_count > 0)
3446 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3447 fiemap->fm_mapped_extents *
3448 sizeof(struct ll_fiemap_extent));
3450 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * Return a referenced copy of the cached POSIX ACL for @inode.
 * lli_lock guards lli_posix_acl; the VFS releases the reference taken
 * by posix_acl_dup().  @type is not examined in the visible code.
 */
3454 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3456 struct ll_inode_info *lli = ll_i2info(inode);
3457 struct posix_acl *acl = NULL;
3460 spin_lock(&lli->lli_lock);
3461 /* VFS' acl_permission_check->check_acl will release the refcount */
3462 acl = posix_acl_dup(lli->lli_posix_acl);
3463 spin_unlock(&lli->lli_lock);
/*
 * ACL-check callback for generic_permission() on kernels whose
 * generic_permission() takes a check_acl callback (i.e. not the
 * 2-argument variant).  Under RCU-walk (IPERM_FLAG_RCU) it bails out
 * rather than sleep; otherwise it evaluates the cached ACL with
 * posix_acl_permission().  Without CONFIG_FS_POSIX_ACL the function
 * body is stubbed out (elided branch).
 */
3468 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3470 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3471 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3473 ll_check_acl(struct inode *inode, int mask)
3476 # ifdef CONFIG_FS_POSIX_ACL
3477 struct posix_acl *acl;
3481 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
/* cannot take lli_lock / sleep during RCU path walk */
3482 if (flags & IPERM_FLAG_RCU)
3485 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3490 rc = posix_acl_permission(inode, acl, mask);
3491 posix_acl_release(acl);
3494 # else /* !CONFIG_FS_POSIX_ACL */
3496 # endif /* CONFIG_FS_POSIX_ACL */
3498 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * permission() handler (three kernel-ABI signatures).  Skips work under
 * RCU-walk, revalidates the root inode (which lookup never revalidates),
 * defers to remote-permission checking on RMT_CLIENT mounts, and
 * otherwise runs the generic permission check with ll_check_acl.
 */
3500 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3501 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3503 # ifdef HAVE_INODE_PERMISION_2ARGS
3504 int ll_inode_permission(struct inode *inode, int mask)
3506 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
/* bail out of RCU-walk: the revalidation below can sleep */
3513 #ifdef MAY_NOT_BLOCK
3514 if (mask & MAY_NOT_BLOCK)
3516 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3517 if (flags & IPERM_FLAG_RCU)
3521 /* as root inode are NOT getting validated in lookup operation,
3522 * need to do it before permission check. */
3524 if (inode == inode->i_sb->s_root->d_inode) {
3525 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3526 MDS_INODELOCK_LOOKUP);
3531 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3532 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
/* remote-client mounts check permission on the server side */
3534 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
3535 return lustre_check_remote_perm(inode, mask);
3537 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
3538 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
/*
 * Default file_operations table: no .flock/.lock entries, so flock()
 * falls back to kernel-local semantics (see the -o localflock note).
 */
3543 /* -o localflock - only provides locally consistent flock locks */
3544 struct file_operations ll_file_operations = {
3545 .read = ll_file_read,
3546 .aio_read = ll_file_aio_read,
3547 .write = ll_file_write,
3548 .aio_write = ll_file_aio_write,
3549 .unlocked_ioctl = ll_file_ioctl,
3550 .open = ll_file_open,
3551 .release = ll_file_release,
3552 .mmap = ll_file_mmap,
3553 .llseek = ll_file_seek,
3554 .splice_read = ll_file_splice_read,
/*
 * file_operations for "-o flock" mounts: identical to the default
 * table but routes both flock() and fcntl() locking through
 * ll_file_flock for cluster-wide consistency.
 */
3559 struct file_operations ll_file_operations_flock = {
3560 .read = ll_file_read,
3561 .aio_read = ll_file_aio_read,
3562 .write = ll_file_write,
3563 .aio_write = ll_file_aio_write,
3564 .unlocked_ioctl = ll_file_ioctl,
3565 .open = ll_file_open,
3566 .release = ll_file_release,
3567 .mmap = ll_file_mmap,
3568 .llseek = ll_file_seek,
3569 .splice_read = ll_file_splice_read,
3572 .flock = ll_file_flock,
3573 .lock = ll_file_flock
/*
 * file_operations for "-o noflock" mounts: flock()/fcntl() locking is
 * wired to ll_file_noflock, which reports ENOSYS.
 */
3576 /* These are for -o noflock - to return ENOSYS on flock calls */
3577 struct file_operations ll_file_operations_noflock = {
3578 .read = ll_file_read,
3579 .aio_read = ll_file_aio_read,
3580 .write = ll_file_write,
3581 .aio_write = ll_file_aio_write,
3582 .unlocked_ioctl = ll_file_ioctl,
3583 .open = ll_file_open,
3584 .release = ll_file_release,
3585 .mmap = ll_file_mmap,
3586 .llseek = ll_file_seek,
3587 .splice_read = ll_file_splice_read,
3590 .flock = ll_file_noflock,
3591 .lock = ll_file_noflock
/*
 * inode_operations for regular files; .get_acl is only present on
 * kernels that have the get_acl inode operation (HAVE_IOP_GET_ACL).
 */
3594 struct inode_operations ll_file_inode_operations = {
3595 .setattr = ll_setattr,
3596 .getattr = ll_getattr,
3597 .permission = ll_inode_permission,
3598 .setxattr = ll_setxattr,
3599 .getxattr = ll_getxattr,
3600 .listxattr = ll_listxattr,
3601 .removexattr = ll_removexattr,
3602 .fiemap = ll_fiemap,
3603 #ifdef HAVE_IOP_GET_ACL
3604 .get_acl = ll_get_acl,
/* dynamic ioctl number support routines */
/*
 * Global registry of dynamically registered ioctl handlers: a list of
 * llioc_data entries protected by an rw_semaphore.  Each entry carries
 * its callback, its total allocation size (iocd_size, used when
 * freeing), and a trailing array of iocd_count command numbers.
 */
3609 static struct llioc_ctl_data {
3610 struct rw_semaphore ioc_sem;
3611 struct list_head ioc_head;
3613 __RWSEM_INITIALIZER(llioc.ioc_sem),
3614 LIST_HEAD_INIT(llioc.ioc_head)
3619 struct list_head iocd_list;
3620 unsigned int iocd_size;
3621 llioc_callback_t iocd_cb;
3622 unsigned int iocd_count;
/* trailing variable-length array of registered command numbers */
3623 unsigned int iocd_cmd[0];
/*
 * Register callback @cb for @count ioctl command numbers in @cmd.
 * Allocates an llioc_data with the command array inline, links it into
 * the global registry under ioc_sem, and (per the elided return)
 * hands back an opaque cookie for ll_iocontrol_unregister().
 */
3626 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3629 struct llioc_data *in_data = NULL;
3632 if (cb == NULL || cmd == NULL ||
3633 count > LLIOC_MAX_CMD || count < 0)
3636 size = sizeof(*in_data) + count * sizeof(unsigned int);
3637 OBD_ALLOC(in_data, size);
3638 if (in_data == NULL)
3641 memset(in_data, 0, sizeof(*in_data));
3642 in_data->iocd_size = size;
3643 in_data->iocd_cb = cb;
3644 in_data->iocd_count = count;
3645 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3647 down_write(&llioc.ioc_sem);
3648 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3649 up_write(&llioc.ioc_sem);
/*
 * Remove and free the registry entry identified by @magic (the cookie
 * returned by ll_iocontrol_register).  The entry's stored iocd_size is
 * used for the matching OBD_FREE.  An unknown cookie only logs a
 * warning.
 */
3654 void ll_iocontrol_unregister(void *magic)
3656 struct llioc_data *tmp;
3661 down_write(&llioc.ioc_sem);
3662 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3664 unsigned int size = tmp->iocd_size;
3666 list_del(&tmp->iocd_list);
3667 up_write(&llioc.ioc_sem);
3669 OBD_FREE(tmp, size);
3673 up_write(&llioc.ioc_sem);
3675 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3678 EXPORT_SYMBOL(ll_iocontrol_register);
3679 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch an unrecognized ioctl through the dynamic registry: walk
 * every registered entry under the read lock, invoke the callback of
 * each entry that lists @cmd, and stop on LLIOC_STOP.  The callback's
 * rc (default -EINVAL) is passed back via *rcp.
 */
3681 static enum llioc_iter
3682 ll_iocontrol_call(struct inode *inode, struct file *file,
3683 unsigned int cmd, unsigned long arg, int *rcp)
3685 enum llioc_iter ret = LLIOC_CONT;
3686 struct llioc_data *data;
3687 int rc = -EINVAL, i;
3689 down_read(&llioc.ioc_sem);
3690 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3691 for (i = 0; i < data->iocd_count; i++) {
3692 if (cmd != data->iocd_cmd[i])
3695 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
/* a callback returning LLIOC_STOP ends the dispatch */
3699 if (ret == LLIOC_STOP)
3702 up_read(&llioc.ioc_sem);
/*
 * Push a layout configuration into the cl_object stack (cl_conf_set).
 * For OBJECT_CONF_SET the layout lock must allow matching only after
 * the layout is applied, so ldlm_lock_allow_match() is called here;
 * the cached layout generation (lli) is then updated from the new lsm
 * (LL_LAYOUT_GEN_EMPTY when there is no lsm).
 */
3709 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3711 struct ll_inode_info *lli = ll_i2info(inode);
3712 struct cl_env_nest nest;
3717 if (lli->lli_clob == NULL)
3720 env = cl_env_nested_get(&nest);
3722 RETURN(PTR_ERR(env));
3724 result = cl_conf_set(env, lli->lli_clob, conf);
3725 cl_env_nested_put(&nest, env);
3727 if (conf->coc_opc == OBJECT_CONF_SET) {
3728 struct ldlm_lock *lock = conf->coc_lock;
3730 LASSERT(lock != NULL);
3731 LASSERT(ldlm_has_layout(lock));
3733 struct lustre_md *md = conf->u.coc_md;
3734 __u32 gen = LL_LAYOUT_GEN_EMPTY;
3736 /* it can only be allowed to match after layout is
3737 * applied to inode otherwise false layout would be
3738 * seen. Applying layout shoud happen before dropping
3739 * the intent lock. */
3740 ldlm_lock_allow_match(lock);
3742 lli->lli_has_smd = lsm_has_objects(md->lsm);
3743 if (md->lsm != NULL)
3744 gen = md->lsm->lsm_layout_gen;
3747 DFID ": layout version change: %u -> %u\n",
3748 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3750 ll_layout_version_set(lli, gen);
3756 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/*
 * If the layout lock was granted immediately, the layout arrived in
 * the DLM LVB of the reply and nothing is done here.  Otherwise fetch
 * the LOV EA via md_getxattr(XATTR_NAME_LOV), copy it into a freshly
 * allocated buffer and install it as the lock's l_lvb_data under the
 * resource lock (freeing any stale buffer first).
 */
3757 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3760 struct ll_sb_info *sbi = ll_i2sbi(inode);
3761 struct obd_capa *oc;
3762 struct ptlrpc_request *req;
3763 struct mdt_body *body;
3770 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3771 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
3772 lock->l_lvb_data, lock->l_lvb_len);
/* layout already delivered in the LVB: nothing to fetch */
3774 if ((lock->l_lvb_data != NULL) && ldlm_is_lvb_ready(lock))
3777 /* if layout lock was granted right away, the layout is returned
3778 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3779 * blocked and then granted via completion ast, we have to fetch
3780 * layout here. Please note that we can't use the LVB buffer in
3781 * completion AST because it doesn't have a large enough buffer */
3782 oc = ll_mdscapa_get(inode);
3783 rc = ll_get_default_mdsize(sbi, &lmmsize);
3785 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3786 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3792 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3794 GOTO(out, rc = -EPROTO);
3796 lmmsize = body->eadatasize;
3797 if (lmmsize == 0) /* empty layout */
3800 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3802 GOTO(out, rc = -EFAULT);
3804 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3805 if (lvbdata == NULL)
3806 GOTO(out, rc = -ENOMEM);
/* install the fetched layout as the lock's LVB, replacing any old one */
3808 memcpy(lvbdata, lmm, lmmsize);
3809 lock_res_and_lock(lock);
3810 if (lock->l_lvb_data != NULL)
3811 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3813 lock->l_lvb_data = lvbdata;
3814 lock->l_lvb_len = lmmsize;
3815 unlock_res_and_lock(lock);
3820 ptlrpc_req_finished(req);
/**
 * Apply the layout carried by the held layout lock to the inode.
 * Fetches the layout (ll_layout_fetch) if the LVB isn't ready,
 * unpacks it into an lsm, configures the cl_object via ll_layout_conf
 * (OBJECT_CONF_SET), and reports the resulting layout generation in
 * *gen.  If reconfiguration hit -EBUSY, waits for in-flight IO with
 * OBJECT_CONF_WAIT after dropping the lock.  The lock reference is
 * always released before returning.
 */
3825 * Apply the layout to the inode. Layout lock is held and will be released
3828 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3829 struct inode *inode, __u32 *gen, bool reconf)
3831 struct ll_inode_info *lli = ll_i2info(inode);
3832 struct ll_sb_info *sbi = ll_i2sbi(inode);
3833 struct ldlm_lock *lock;
3834 struct lustre_md md = { NULL };
3835 struct cl_object_conf conf;
3838 bool wait_layout = false;
3841 LASSERT(lustre_handle_is_used(lockh));
3843 lock = ldlm_handle2lock(lockh);
3844 LASSERT(lock != NULL);
3845 LASSERT(ldlm_has_layout(lock));
3847 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured: %d\n",
3848 PFID(&lli->lli_fid), inode, reconf);
3850 /* in case this is a caching lock and reinstate with new inode */
3851 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3853 lock_res_and_lock(lock);
3854 lvb_ready = ldlm_is_lvb_ready(lock);
3855 unlock_res_and_lock(lock);
3856 /* checking lvb_ready is racy but this is okay. The worst case is
3857 * that multi processes may configure the file on the same time. */
3859 if (lvb_ready || !reconf) {
3862 /* layout_gen must be valid if layout lock is not
3863 * cancelled and stripe has already set */
3864 *gen = ll_layout_version_get(lli);
3870 rc = ll_layout_fetch(inode, lock);
3874 /* for layout lock, lmm is returned in lock's lvb.
3875 * lvb_data is immutable if the lock is held so it's safe to access it
3876 * without res lock. See the description in ldlm_lock_decref_internal()
3877 * for the condition to free lvb_data of layout lock */
3878 if (lock->l_lvb_data != NULL) {
3879 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3880 lock->l_lvb_data, lock->l_lvb_len);
3882 *gen = LL_LAYOUT_GEN_EMPTY;
3884 *gen = md.lsm->lsm_layout_gen;
3887 CERROR("%s: file "DFID" unpackmd error: %d\n",
3888 ll_get_fsname(inode->i_sb, NULL, 0),
3889 PFID(&lli->lli_fid), rc);
3895 /* set layout to file. Unlikely this will fail as old layout was
3896 * surely eliminated */
3897 memset(&conf, 0, sizeof conf);
3898 conf.coc_opc = OBJECT_CONF_SET;
3899 conf.coc_inode = inode;
3900 conf.coc_lock = lock;
3901 conf.u.coc_md = &md;
3902 rc = ll_layout_conf(inode, &conf);
3905 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3907 /* refresh layout failed, need to wait */
3908 wait_layout = rc == -EBUSY;
/* drop the lock reference before any waiting */
3912 LDLM_LOCK_PUT(lock);
3913 ldlm_lock_decref(lockh, mode);
3915 /* wait for IO to complete if it's still being used. */
3917 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
3918 ll_get_fsname(inode->i_sb, NULL, 0),
3919 PFID(&lli->lli_fid), inode);
3921 memset(&conf, 0, sizeof conf);
3922 conf.coc_opc = OBJECT_CONF_WAIT;
3923 conf.coc_inode = inode;
3924 rc = ll_layout_conf(inode, &conf);
3928 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
3929 ll_get_fsname(inode->i_sb, NULL, 0),
3930 PFID(&lli->lli_fid), rc);
/**
 * Ensure an up-to-date layout generation in *gen, enqueueing a layout
 * lock (IT_LAYOUT) if the cached generation is LL_LAYOUT_GEN_NONE.
 */
3936 * This function checks if there exists a LAYOUT lock on the client side,
3937 * or enqueues it if it doesn't have one in cache.
3939 * This function will not hold layout lock so it may be revoked any time after
3940 * this function returns. Any operations depend on layout should be redone
3943 * This function should be called before lov_io_init() to get an uptodate
3944 * layout version, the caller should save the version number and after IO
3945 * is finished, this function should be called again to verify that layout
3946 * is not changed during IO time.
3948 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3950 struct ll_inode_info *lli = ll_i2info(inode);
3951 struct ll_sb_info *sbi = ll_i2sbi(inode);
3952 struct md_op_data *op_data;
3953 struct lookup_intent it;
3954 struct lustre_handle lockh;
3956 struct ldlm_enqueue_info einfo = {
3957 .ei_type = LDLM_IBITS,
3959 .ei_cb_bl = ll_md_blocking_ast,
3960 .ei_cb_cp = ldlm_completion_ast,
/* fast path: layout lock disabled, or generation already known */
3965 *gen = ll_layout_version_get(lli);
3966 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
3970 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3971 LASSERT(S_ISREG(inode->i_mode));
3973 /* take layout lock mutex to enqueue layout lock exclusively. */
3974 mutex_lock(&lli->lli_layout_mutex);
3977 /* mostly layout lock is caching on the local side, so try to match
3978 * it before grabbing layout lock mutex. */
3979 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3980 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3981 if (mode != 0) { /* hit cached lock */
3982 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3986 mutex_unlock(&lli->lli_layout_mutex);
3990 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3991 0, 0, LUSTRE_OPC_ANY, NULL);
3992 if (IS_ERR(op_data)) {
3993 mutex_unlock(&lli->lli_layout_mutex);
3994 RETURN(PTR_ERR(op_data));
3997 /* have to enqueue one */
3998 memset(&it, 0, sizeof(it));
3999 it.it_op = IT_LAYOUT;
4000 lockh.cookie = 0ULL;
4002 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)\n",
4003 ll_get_fsname(inode->i_sb, NULL, 0),
4004 PFID(&lli->lli_fid), inode);
4006 rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
/* the intent request itself is not needed once the lock is granted */
4008 if (it.d.lustre.it_data != NULL)
4009 ptlrpc_req_finished(it.d.lustre.it_data);
4010 it.d.lustre.it_data = NULL;
4012 ll_finish_md_op_data(op_data);
4014 mode = it.d.lustre.it_lock_mode;
4015 it.d.lustre.it_lock_mode = 0;
4016 ll_intent_drop_lock(&it);
4019 /* set lock data in case this is a new lock */
4020 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4021 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
4025 mutex_unlock(&lli->lli_layout_mutex);
4031 * This function sends a restore request to the MDT
4033 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4035 struct hsm_user_request *hur;
4039 len = sizeof(struct hsm_user_request) +
4040 sizeof(struct hsm_user_item);
4041 OBD_ALLOC(hur, len);
4045 hur->hur_request.hr_action = HUA_RESTORE;
4046 hur->hur_request.hr_archive_id = 0;
4047 hur->hur_request.hr_flags = 0;
4048 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4049 sizeof(hur->hur_user_item[0].hui_fid));
4050 hur->hur_user_item[0].hui_extent.offset = offset;
4051 hur->hur_user_item[0].hui_extent.length = length;
4052 hur->hur_request.hr_itemcount = 1;
4053 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,