4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2013, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
50 #include <lustre_ioctl.h>
52 #include "cl_object.h"
55 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
57 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
60 static enum llioc_iter
61 ll_iocontrol_call(struct inode *inode, struct file *file,
62 unsigned int cmd, unsigned long arg, int *rcp);
/*
 * Allocate a per-open-file ll_file_data from the ll_file_data_slab cache.
 * GFP_NOFS is used so allocation cannot recurse into the filesystem under
 * memory pressure.  NOTE(review): this extract elides lines between the
 * ones shown (e.g. the allocation-failure check and the return).
 */
64 static struct ll_file_data *ll_file_data_get(void)
66 struct ll_file_data *fd;
68 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
72 fd->fd_write_failed = false;
/* Return a ll_file_data to its slab cache (counterpart of
 * ll_file_data_get()). */
77 static void ll_file_data_put(struct ll_file_data *fd)
80 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Pack the inode's current attributes (mode, a/m/ctime, size, blocks,
 * flags), its FID, IO epoch, MDS capability and the open handle @fh into
 * @op_data for an MDS request.  Also propagates the LLIF_DATA_MODIFIED
 * hint as the MDS_DATA_MODIFIED bias.  NOTE(review): some lines are
 * elided in this extract.
 */
83 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
84 struct lustre_handle *fh)
86 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
87 op_data->op_attr.ia_mode = inode->i_mode;
88 op_data->op_attr.ia_atime = inode->i_atime;
89 op_data->op_attr.ia_mtime = inode->i_mtime;
90 op_data->op_attr.ia_ctime = inode->i_ctime;
91 op_data->op_attr.ia_size = i_size_read(inode);
92 op_data->op_attr_blocks = inode->i_blocks;
/* i_flags are kernel-internal; convert to the on-wire (ext) flag layout. */
93 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
94 ll_inode_to_ext_flags(inode->i_flags);
95 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
97 op_data->op_handle = *fh;
98 op_data->op_capa1 = ll_mdscapa_get(inode);
100 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
101 op_data->op_bias |= MDS_DATA_MODIFIED;
105 * Closes the IO epoch and packs all the attributes into @op_data for
/*
 * the close RPC.  Marks mode/time attributes valid unconditionally;
 * size/blocks are only sent when Size-on-MDS is not in use (or the inode
 * is not a regular file), since otherwise the MDS obtains them later.
 * NOTE(review): lines are elided in this extract (e.g. the early return
 * for the !FMODE_WRITE case before ll_ioepoch_close()).
 */
108 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
109 struct obd_client_handle *och)
113 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
114 ATTR_MTIME | ATTR_MTIME_SET |
115 ATTR_CTIME | ATTR_CTIME_SET;
117 if (!(och->och_flags & FMODE_WRITE))
120 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
121 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
123 ll_ioepoch_close(inode, op_data, &och, 0);
126 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
127 ll_prep_md_op_data(op_data, inode, NULL, NULL,
128 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Send the MDS close RPC for open handle @och.  If @data_version is
 * non-NULL this is an HSM release close (MDS_HSM_RELEASE bias).  Handles
 * the Size-on-MDS update path, clears the LLIF_DATA_MODIFIED flag on a
 * successful close, destroys unlinked objects, and finally releases the
 * op_data, replay data and close request.  NOTE(review): many lines are
 * elided in this extract (ENTRY/EXIT, error branches, returns).
 */
132 static int ll_close_inode_openhandle(struct obd_export *md_exp,
134 struct obd_client_handle *och,
135 const __u64 *data_version)
137 struct obd_export *exp = ll_i2mdexp(inode);
138 struct md_op_data *op_data;
139 struct ptlrpc_request *req = NULL;
140 struct obd_device *obd = class_exp2obd(exp);
147 * XXX: in case of LMV, is this correct to access
150 CERROR("Invalid MDC connection handle "LPX64"\n",
151 ll_i2mdexp(inode)->exp_handle.h_cookie);
155 OBD_ALLOC_PTR(op_data);
157 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
159 ll_prepare_close(inode, op_data, och);
160 if (data_version != NULL) {
161 /* Pass in data_version implies release. */
162 op_data->op_bias |= MDS_HSM_RELEASE;
163 op_data->op_data_version = *data_version;
164 op_data->op_lease_handle = och->och_lease_handle;
165 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
167 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
168 rc = md_close(md_exp, op_data, och->och_mod, &req);
170 /* This close must have the epoch closed. */
171 LASSERT(epoch_close);
172 /* MDS has instructed us to obtain Size-on-MDS attribute from
173 * OSTs and send setattr back to MDS. */
174 rc = ll_som_update(inode, op_data);
176 CERROR("%s: inode "DFID" mdc Size-on-MDS update"
177 " failed: rc = %d\n",
178 ll_i2mdexp(inode)->exp_obd->obd_name,
179 PFID(ll_inode2fid(inode)), rc);
183 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
184 ll_i2mdexp(inode)->exp_obd->obd_name,
185 PFID(ll_inode2fid(inode)), rc);
188 /* DATA_MODIFIED flag was successfully sent on close, cancel data
189 * modification flag. */
190 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
191 struct ll_inode_info *lli = ll_i2info(inode);
193 spin_lock(&lli->lli_lock);
194 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
195 spin_unlock(&lli->lli_lock);
199 rc = ll_objects_destroy(req, inode);
201 CERROR("%s: inode "DFID
202 " ll_objects destroy: rc = %d\n",
203 ll_i2mdexp(inode)->exp_obd->obd_name,
204 PFID(ll_inode2fid(inode)), rc);
/* For an HSM release, verify the MDS actually released the file. */
207 if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
208 struct mdt_body *body;
209 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
210 if (!(body->valid & OBD_MD_FLRELEASED))
214 ll_finish_md_op_data(op_data);
/* SOM open epoch not yet closed: defer DONE_WRITING handling. */
218 if (exp_connect_som(exp) && !epoch_close &&
219 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
220 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
222 md_clear_open_replay_data(md_exp, och);
223 /* Free @och if it is not waiting for DONE_WRITING. */
224 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
227 if (req) /* This is close request */
228 ptlrpc_req_finished(req);
/*
 * Close the MDS open handle of the given mode (write/exec/read) if no
 * other opener still uses it.  Selects the per-mode handle and use-count
 * under lli_och_mutex; if users remain, the close is skipped.
 * NOTE(review): lines are elided in this extract (handle swap/NULLing
 * between the mutex sections, and the final return).
 */
232 int ll_md_real_close(struct inode *inode, fmode_t fmode)
234 struct ll_inode_info *lli = ll_i2info(inode);
235 struct obd_client_handle **och_p;
236 struct obd_client_handle *och;
241 if (fmode & FMODE_WRITE) {
242 och_p = &lli->lli_mds_write_och;
243 och_usecount = &lli->lli_open_fd_write_count;
244 } else if (fmode & FMODE_EXEC) {
245 och_p = &lli->lli_mds_exec_och;
246 och_usecount = &lli->lli_open_fd_exec_count;
248 LASSERT(fmode & FMODE_READ);
249 och_p = &lli->lli_mds_read_och;
250 och_usecount = &lli->lli_open_fd_read_count;
253 mutex_lock(&lli->lli_och_mutex);
254 if (*och_usecount > 0) {
255 /* There are still users of this handle, so skip
257 mutex_unlock(&lli->lli_och_mutex);
263 mutex_unlock(&lli->lli_och_mutex);
266 /* There might be a race and this handle may already
268 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-file close: drop group lock and lease if held, close any private
 * och, decrement the per-mode open counts, and talk to the MDS only when
 * no matching OPEN lock lets us skip it.  Finally frees the ll_file_data
 * and releases the capability.  NOTE(review): lines are elided in this
 * extract (lockmode setup, md_lock_match() tail, returns).
 */
275 static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
278 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
279 struct ll_inode_info *lli = ll_i2info(inode);
283 /* clear group lock, if present */
284 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
285 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
287 if (fd->fd_lease_och != NULL) {
290 /* Usually the lease is not released when the
291 * application crashed, we need to release here. */
292 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
293 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
294 PFID(&lli->lli_fid), rc, lease_broken);
296 fd->fd_lease_och = NULL;
299 if (fd->fd_och != NULL) {
300 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
305 /* Let's see if we have good enough OPEN lock on the file and if
306 we can skip talking to MDS */
307 if (file->f_dentry->d_inode) { /* Can this ever be false? */
309 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
310 struct lustre_handle lockh;
311 struct inode *inode = file->f_dentry->d_inode;
312 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
314 mutex_lock(&lli->lli_och_mutex);
315 if (fd->fd_omode & FMODE_WRITE) {
317 LASSERT(lli->lli_open_fd_write_count);
318 lli->lli_open_fd_write_count--;
319 } else if (fd->fd_omode & FMODE_EXEC) {
321 LASSERT(lli->lli_open_fd_exec_count);
322 lli->lli_open_fd_exec_count--;
325 LASSERT(lli->lli_open_fd_read_count);
326 lli->lli_open_fd_read_count--;
328 mutex_unlock(&lli->lli_och_mutex);
/* No cached OPEN lock of the right mode: must do a real MDS close. */
330 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
331 LDLM_IBITS, &policy, lockmode,
333 rc = ll_md_real_close(file->f_dentry->d_inode,
337 CERROR("Releasing a file %p with negative dentry %p. Name %s",
338 file, file->f_dentry, file->f_dentry->d_name.name);
342 LUSTRE_FPRIVATE(file) = NULL;
343 ll_file_data_put(fd);
344 ll_capa_close(inode);
349 /* While this returns an error code, fput() the caller does not, so we need
350 * to make every effort to clean up all of our state here. Also, applications
351 * rarely check close errors and even if an error is returned they will not
352 * re-try the close call.
/*
 * VFS ->release() entry point.  Cleans up remote-ACL state, statahead,
 * and either drops the root's ll_file_data directly or goes through
 * ll_md_close().  NOTE(review): lines are elided in this extract
 * (ENTRY/RETURN, some braces and NULL checks).
 */
354 int ll_file_release(struct inode *inode, struct file *file)
356 struct ll_file_data *fd;
357 struct ll_sb_info *sbi = ll_i2sbi(inode);
358 struct ll_inode_info *lli = ll_i2info(inode);
362 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
363 PFID(ll_inode2fid(inode)), inode);
365 #ifdef CONFIG_FS_POSIX_ACL
/* Remote-client ACL cleanup applies only to the filesystem root. */
366 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
367 inode == inode->i_sb->s_root->d_inode) {
368 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
371 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
372 fd->fd_flags &= ~LL_FILE_RMTACL;
373 rct_del(&sbi->ll_rct, current_pid());
374 et_search_free(&sbi->ll_et, current_pid());
379 if (inode->i_sb->s_root != file->f_dentry)
380 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
381 fd = LUSTRE_FPRIVATE(file);
384 /* The last ref on @file, maybe not the owner pid of statahead.
385 * Different processes can open the same dir, "ll_opendir_key" means:
386 * it is me that should stop the statahead thread. */
387 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
388 lli->lli_opendir_pid != 0)
389 ll_stop_statahead(inode, lli->lli_opendir_key);
/* The root dentry never had an MDS open handle: just drop fd. */
391 if (inode->i_sb->s_root == file->f_dentry) {
392 LUSTRE_FPRIVATE(file) = NULL;
393 ll_file_data_put(fd);
397 if (!S_ISDIR(inode->i_mode)) {
398 if (lli->lli_clob != NULL)
399 lov_read_and_clear_async_rc(lli->lli_clob);
400 lli->lli_async_rc = 0;
403 rc = ll_md_close(sbi->ll_md_exp, inode, file);
405 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
406 libcfs_debug_dumplog();
/*
 * Perform an IT_OPEN intent lock RPC to the MDS for @file, optionally
 * carrying striping info (@lmm/@lmmsize) when setting stripe parameters.
 * On success the inode is refreshed from the reply and lock data is set.
 * NOTE(review): lines are elided in this extract (GOTO targets, the
 * negative-dentry checks, ENTRY/RETURN).
 */
411 static int ll_intent_file_open(struct file *file, void *lmm,
412 int lmmsize, struct lookup_intent *itp)
414 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
415 struct dentry *parent = file->f_dentry->d_parent;
416 const char *name = file->f_dentry->d_name.name;
417 const int len = file->f_dentry->d_name.len;
418 struct md_op_data *op_data;
419 struct ptlrpc_request *req = NULL;
420 __u32 opc = LUSTRE_OPC_ANY;
427 /* Usually we come here only for NFSD, and we want open lock.
428 But we can also get here with pre 2.6.15 patchless kernels, and in
429 that case that lock is also ok */
430 /* We can also get here if there was cached open handle in revalidate_it
431 * but it disappeared while we were getting from there to ll_file_open.
432 * But this means this file was closed and immediately opened which
433 * makes a good candidate for using OPEN lock */
434 /* If lmmsize & lmm are not 0, we are just setting stripe info
435 * parameters. No need for the open lock */
436 if (lmm == NULL && lmmsize == 0) {
437 itp->it_flags |= MDS_OPEN_LOCK;
438 if (itp->it_flags & FMODE_WRITE)
439 opc = LUSTRE_OPC_CREATE;
442 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
443 file->f_dentry->d_inode, name, len,
446 RETURN(PTR_ERR(op_data));
448 op_data->op_data = lmm;
449 op_data->op_data_size = lmmsize;
451 itp->it_flags |= MDS_OPEN_BY_FID;
452 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
453 &ll_md_blocking_ast, 0);
454 ll_finish_md_op_data(op_data);
456 /* reason for keep own exit path - don't flood log
457 * with messages with -ESTALE errors.
/* Open succeeded but with an error disposition: release the handle. */
459 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
460 it_open_error(DISP_OPEN_OPEN, itp))
462 ll_release_openhandle(file->f_dentry, itp);
466 if (it_disposition(itp, DISP_LOOKUP_NEG))
467 GOTO(out, rc = -ENOENT);
469 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
470 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
471 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
475 rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
476 if (!rc && itp->d.lustre.it_lock_mode)
477 ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
481 ptlrpc_req_finished(req);
482 ll_intent_drop_lock(itp);
488 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
489 * not believe attributes if a few ioepoch holders exist. Attributes for
490 * previous ioepoch if new one is opened are also skipped by MDS.
492 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
/* Only update when a non-zero epoch differs from the cached one. */
494 if (ioepoch && lli->lli_ioepoch != ioepoch) {
495 lli->lli_ioepoch = ioepoch;
496 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
497 ioepoch, PFID(&lli->lli_fid));
/*
 * Fill an obd_client_handle from the MDT reply body carried by the
 * intent: open handle, FID, lease lock handle, magic, and the open
 * flags.  Then register the open for replay with the MD layer.
 * Returns the md_set_open_replay_data() result.
 */
501 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
502 struct obd_client_handle *och)
504 struct ptlrpc_request *req = it->d.lustre.it_data;
505 struct mdt_body *body;
507 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
508 och->och_fh = body->handle;
509 och->och_fid = body->fid1;
510 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
511 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
512 och->och_flags = it->it_flags;
514 return md_set_open_replay_data(md_exp, och, it);
/*
 * Finish the client-side part of an open: optionally fill @och from the
 * intent reply (and pick up the IO epoch from the MDT body), then attach
 * @fd as the file's private data and record the effective open mode.
 * NOTE(review): lines are elided in this extract (the och != NULL guard,
 * error handling after ll_och_fill(), RETURN).
 */
517 static int ll_local_open(struct file *file, struct lookup_intent *it,
518 struct ll_file_data *fd, struct obd_client_handle *och)
520 struct inode *inode = file->f_dentry->d_inode;
521 struct ll_inode_info *lli = ll_i2info(inode);
524 LASSERT(!LUSTRE_FPRIVATE(file));
529 struct ptlrpc_request *req = it->d.lustre.it_data;
530 struct mdt_body *body;
533 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
537 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
538 ll_ioepoch_open(lli, body->ioepoch);
541 LUSTRE_FPRIVATE(file) = fd;
542 ll_readahead_init(inode, &fd->fd_ras);
543 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
548 /* Open a file, and (for the very first open) create objects on the OSTs at
549 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
550 * creation or open until ll_lov_setstripe() ioctl is called.
552 * If we already have the stripe MD locally then we don't request it in
553 * md_open(), by passing a lmm_size = 0.
555 * It is up to the application to ensure no other processes open this file
556 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
557 * used. We might be able to avoid races of that sort by getting lli_open_sem
558 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
559 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/*
 * NOTE(review): this extract elides many lines (ENTRY/RETURN, braces,
 * retry/"restart" paths, out_och_free / out_openerr label lines);
 * comments below describe only the visible logic.
 */
561 int ll_file_open(struct inode *inode, struct file *file)
563 struct ll_inode_info *lli = ll_i2info(inode);
564 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
565 .it_flags = file->f_flags };
566 struct obd_client_handle **och_p = NULL;
567 __u64 *och_usecount = NULL;
568 struct ll_file_data *fd;
569 int rc = 0, opendir_set = 0;
572 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
573 PFID(ll_inode2fid(inode)), inode, file->f_flags);
575 it = file->private_data; /* XXX: compat macro */
576 file->private_data = NULL; /* prevent ll_local_open assertion */
578 fd = ll_file_data_get();
580 GOTO(out_openerr, rc = -ENOMEM);
/* First opener of a directory claims the statahead key. */
583 if (S_ISDIR(inode->i_mode)) {
584 spin_lock(&lli->lli_sa_lock);
585 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
586 lli->lli_opendir_pid == 0) {
587 lli->lli_opendir_key = fd;
588 lli->lli_opendir_pid = current_pid();
591 spin_unlock(&lli->lli_sa_lock);
/* Root needs no MDS open handle: attach fd and return. */
594 if (inode->i_sb->s_root == file->f_dentry) {
595 LUSTRE_FPRIVATE(file) = fd;
599 if (!it || !it->d.lustre.it_disposition) {
600 /* Convert f_flags into access mode. We cannot use file->f_mode,
601 * because everything but O_ACCMODE mask was stripped from
603 if ((oit.it_flags + 1) & O_ACCMODE)
605 if (file->f_flags & O_TRUNC)
606 oit.it_flags |= FMODE_WRITE;
608 /* kernel only call f_op->open in dentry_open. filp_open calls
609 * dentry_open after call to open_namei that checks permissions.
610 * Only nfsd_open call dentry_open directly without checking
611 * permissions and because of that this code below is safe. */
612 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
613 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
615 /* We do not want O_EXCL here, presumably we opened the file
616 * already? XXX - NFS implications? */
617 oit.it_flags &= ~O_EXCL;
619 /* bug20584, if "it_flags" contains O_CREAT, the file will be
620 * created if necessary, then "IT_CREAT" should be set to keep
621 * consistent with it */
622 if (oit.it_flags & O_CREAT)
623 oit.it_op |= IT_CREAT;
629 /* Let's see if we have file open on MDS already. */
630 if (it->it_flags & FMODE_WRITE) {
631 och_p = &lli->lli_mds_write_och;
632 och_usecount = &lli->lli_open_fd_write_count;
633 } else if (it->it_flags & FMODE_EXEC) {
634 och_p = &lli->lli_mds_exec_och;
635 och_usecount = &lli->lli_open_fd_exec_count;
637 och_p = &lli->lli_mds_read_och;
638 och_usecount = &lli->lli_open_fd_read_count;
641 mutex_lock(&lli->lli_och_mutex);
642 if (*och_p) { /* Open handle is present */
643 if (it_disposition(it, DISP_OPEN_OPEN)) {
644 /* Well, there's extra open request that we do not need,
645 let's close it somehow. This will decref request. */
646 rc = it_open_error(DISP_OPEN_OPEN, it);
648 mutex_unlock(&lli->lli_och_mutex);
649 GOTO(out_openerr, rc);
652 ll_release_openhandle(file->f_dentry, it);
656 rc = ll_local_open(file, it, fd, NULL);
659 mutex_unlock(&lli->lli_och_mutex);
660 GOTO(out_openerr, rc);
663 LASSERT(*och_usecount == 0);
664 if (!it->d.lustre.it_disposition) {
665 /* We cannot just request lock handle now, new ELC code
666 means that one of other OPEN locks for this file
667 could be cancelled, and since blocking ast handler
668 would attempt to grab och_mutex as well, that would
669 result in a deadlock */
670 mutex_unlock(&lli->lli_och_mutex);
671 it->it_create_mode |= M_CHECK_STALE;
672 rc = ll_intent_file_open(file, NULL, 0, it);
673 it->it_create_mode &= ~M_CHECK_STALE;
675 GOTO(out_openerr, rc);
679 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
681 GOTO(out_och_free, rc = -ENOMEM);
685 /* md_intent_lock() didn't get a request ref if there was an
686 * open error, so don't do cleanup on the request here
688 /* XXX (green): Should not we bail out on any error here, not
689 * just open error? */
690 rc = it_open_error(DISP_OPEN_OPEN, it);
692 GOTO(out_och_free, rc);
694 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
695 "inode %p: disposition %x, status %d\n", inode,
696 it_disposition(it, ~0), it->d.lustre.it_status);
698 rc = ll_local_open(file, it, fd, *och_p);
700 GOTO(out_och_free, rc);
702 mutex_unlock(&lli->lli_och_mutex);
705 /* Must do this outside lli_och_mutex lock to prevent deadlock where
706 different kind of OPEN lock for this same inode gets cancelled
707 by ldlm_cancel_lru */
708 if (!S_ISREG(inode->i_mode))
709 GOTO(out_och_free, rc);
/* Delay OST object creation for O_LOV_DELAY_CREATE or read-only opens
 * when no striping is set yet. */
713 if (!lli->lli_has_smd &&
714 (cl_is_lov_delay_create(file->f_flags) ||
715 (file->f_mode & FMODE_WRITE) == 0)) {
716 CDEBUG(D_INODE, "object creation was delayed\n");
717 GOTO(out_och_free, rc);
719 cl_lov_delay_create_clear(&file->f_flags);
720 GOTO(out_och_free, rc);
724 if (och_p && *och_p) {
725 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
726 *och_p = NULL; /* OBD_FREE writes some magic there */
729 mutex_unlock(&lli->lli_och_mutex);
732 if (opendir_set != 0)
733 ll_stop_statahead(inode, lli->lli_opendir_key);
735 ll_file_data_put(fd);
737 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* Drop the extra request reference taken for DISP_ENQ_OPEN_REF. */
740 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
741 ptlrpc_req_finished(it->d.lustre.it_data);
742 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for lease locks: on LDLM_CB_BLOCKING cancel the lock
 * asynchronously (which marks the lease broken); the CANCELING case is
 * visible but its body is elided in this extract.
 */
748 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
749 struct ldlm_lock_desc *desc, void *data, int flag)
752 struct lustre_handle lockh;
756 case LDLM_CB_BLOCKING:
757 ldlm_lock2handle(lock, &lockh);
758 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
760 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
764 case LDLM_CB_CANCELING:
772 * Acquire a lease and open the file.
/*
 * Opens @inode with MDS_OPEN_LEASE via an IT_OPEN intent; when @file is
 * given, reuses the existing open handle of the same owner (old_handle)
 * so the MDT can match them.  Returns the new obd_client_handle or an
 * ERR_PTR.  On failure after the open succeeded, falls through to
 * out_close which cancels the open lock and closes the handle.
 * NOTE(review): lines are elided in this extract (och allocation, several
 * braces, label lines, the final ERR_PTR return).
 */
774 static struct obd_client_handle *
775 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
778 struct lookup_intent it = { .it_op = IT_OPEN };
779 struct ll_sb_info *sbi = ll_i2sbi(inode);
780 struct md_op_data *op_data;
781 struct ptlrpc_request *req = NULL;
782 struct lustre_handle old_handle = { 0 };
783 struct obd_client_handle *och = NULL;
/* A lease is exclusively read or write, never both / exec. */
788 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
789 RETURN(ERR_PTR(-EINVAL));
792 struct ll_inode_info *lli = ll_i2info(inode);
793 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
794 struct obd_client_handle **och_p;
797 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
798 RETURN(ERR_PTR(-EPERM));
800 /* Get the openhandle of the file */
802 mutex_lock(&lli->lli_och_mutex);
/* Only one lease per file descriptor is allowed. */
803 if (fd->fd_lease_och != NULL) {
804 mutex_unlock(&lli->lli_och_mutex);
808 if (fd->fd_och == NULL) {
809 if (file->f_mode & FMODE_WRITE) {
810 LASSERT(lli->lli_mds_write_och != NULL);
811 och_p = &lli->lli_mds_write_och;
812 och_usecount = &lli->lli_open_fd_write_count;
814 LASSERT(lli->lli_mds_read_och != NULL);
815 och_p = &lli->lli_mds_read_och;
816 och_usecount = &lli->lli_open_fd_read_count;
818 if (*och_usecount == 1) {
825 mutex_unlock(&lli->lli_och_mutex);
826 if (rc < 0) /* more than 1 opener */
829 LASSERT(fd->fd_och != NULL);
830 old_handle = fd->fd_och->och_fh;
835 RETURN(ERR_PTR(-ENOMEM));
837 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
838 LUSTRE_OPC_ANY, NULL);
840 GOTO(out, rc = PTR_ERR(op_data));
842 /* To tell the MDT this openhandle is from the same owner */
843 op_data->op_handle = old_handle;
845 it.it_flags = fmode | open_flags;
846 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
847 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
848 &ll_md_blocking_lease_ast,
849 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
850 * it can be cancelled which may mislead applications that the lease is
852 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
853 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
854 * doesn't deal with openhandle, so normal openhandle will be leaked. */
855 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
856 ll_finish_md_op_data(op_data);
857 ptlrpc_req_finished(req);
859 GOTO(out_release_it, rc);
861 if (it_disposition(&it, DISP_LOOKUP_NEG))
862 GOTO(out_release_it, rc = -ENOENT);
864 rc = it_open_error(DISP_OPEN_OPEN, &it);
866 GOTO(out_release_it, rc);
868 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
869 ll_och_fill(sbi->ll_md_exp, &it, och);
871 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
872 GOTO(out_close, rc = -EOPNOTSUPP);
874 /* already get lease, handle lease lock */
875 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
876 if (it.d.lustre.it_lock_mode == 0 ||
877 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
878 /* open lock must return for lease */
879 CERROR(DFID "lease granted but no open lock, %d/"LPU64".\n",
880 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
881 it.d.lustre.it_lock_bits);
882 GOTO(out_close, rc = -EPROTO);
885 ll_intent_release(&it);
889 /* Cancel open lock */
890 if (it.d.lustre.it_lock_mode != 0) {
891 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
892 it.d.lustre.it_lock_mode);
893 it.d.lustre.it_lock_mode = 0;
894 och->och_lease_handle.cookie = 0ULL;
896 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
898 CERROR("%s: error closing file "DFID": %d\n",
899 ll_get_fsname(inode->i_sb, NULL, 0),
900 PFID(&ll_i2info(inode)->lli_fid), rc2);
901 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
903 ll_intent_release(&it);
911 * Release lease and close the file.
912 * It will check if the lease has ever broken.
/*
 * Checks the lease lock's cancel bit to report breakage via
 * @lease_broken, cancels the lease lock if still intact, then closes the
 * open handle.  NOTE(review): lines are elided in this extract (the
 * lock != NULL guard, LDLM_LOCK_PUT, the cancelled branch, return).
 */
914 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
917 struct ldlm_lock *lock;
918 bool cancelled = true;
922 lock = ldlm_handle2lock(&och->och_lease_handle);
924 lock_res_and_lock(lock);
925 cancelled = ldlm_is_cancel(lock);
926 unlock_res_and_lock(lock);
930 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
931 PFID(&ll_i2info(inode)->lli_fid), cancelled);
934 ldlm_cli_cancel(&och->och_lease_handle, 0);
935 if (lease_broken != NULL)
936 *lease_broken = cancelled;
938 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
943 /* Fills the obdo with the attributes for the lsm */
/*
 * Issue an async OST getattr over all stripes of @lsm and merge the
 * result into @obdo.  LL_DV_WR_FLUSH / LL_DV_RD_FLUSH request a
 * server-side lock (and a flush for WR) so the data version is stable.
 * NOTE(review): lines are elided in this extract (oi_oa assignment,
 * set == NULL handling, the -EOPNOTSUPP tail, return).
 */
944 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
945 struct obd_capa *capa, struct obdo *obdo,
946 __u64 ioepoch, int dv_flags)
948 struct ptlrpc_request_set *set;
949 struct obd_info oinfo = { { { 0 } } };
954 LASSERT(lsm != NULL);
958 oinfo.oi_oa->o_oi = lsm->lsm_oi;
959 oinfo.oi_oa->o_mode = S_IFREG;
960 oinfo.oi_oa->o_ioepoch = ioepoch;
961 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
962 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
963 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
964 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
965 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
966 OBD_MD_FLDATAVERSION;
967 oinfo.oi_capa = capa;
/* Flush flags mean the getattr must be done under a server lock. */
968 if (dv_flags & (LL_DV_WR_FLUSH | LL_DV_RD_FLUSH)) {
969 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
970 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
971 if (dv_flags & LL_DV_WR_FLUSH)
972 oinfo.oi_oa->o_flags |= OBD_FL_FLUSH;
975 set = ptlrpc_prep_set();
977 CERROR("can't allocate ptlrpc set\n");
980 rc = obd_getattr_async(exp, &oinfo, set);
982 rc = ptlrpc_set_wait(set);
983 ptlrpc_set_destroy(set);
/* Keep only the attribute bits the caller can trust from the OSTs. */
986 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
987 OBD_MD_FLATIME | OBD_MD_FLMTIME |
988 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
989 OBD_MD_FLDATAVERSION | OBD_MD_FLFLAGS);
990 if (dv_flags & LL_DV_WR_FLUSH &&
991 !(oinfo.oi_oa->o_valid & OBD_MD_FLFLAGS &&
992 oinfo.oi_oa->o_flags & OBD_FL_FLUSH))
999 * Performs the getattr on the inode and updates its fields.
1000 * If @sync != 0, perform the getattr under the server-side lock.
/*
 * Wraps ll_lsm_getattr() with the inode's lsm and MDS capability, then
 * refreshes the inode from the returned obdo.  NOTE(review): lines are
 * elided in this extract (error branch, capa_put, return).
 */
1002 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
1003 __u64 ioepoch, int sync)
1005 struct obd_capa *capa = ll_mdscapa_get(inode);
1006 struct lov_stripe_md *lsm;
1010 lsm = ccc_inode_lsm_get(inode);
1011 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
1012 capa, obdo, ioepoch, sync ? LL_DV_RD_FLUSH : 0);
1015 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
1017 obdo_refresh_inode(inode, obdo, obdo->o_valid);
1018 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
1019 " blksize %lu\n", POSTID(oi), i_size_read(inode),
1020 (unsigned long long)inode->i_blocks,
1021 (unsigned long)ll_inode_blksize(inode));
1023 ccc_inode_lsm_put(inode, lsm);
/*
 * Merge MDS-provided timestamps (cached in lli_lvb) with attributes
 * obtained from the OSTs via the cl_object layer, keeping the newest
 * time of each kind, and update i_size/i_blocks under the inode size
 * lock.  NOTE(review): lines are elided in this extract (declarations
 * of lvb/rc, the rc == 0 guard, return).
 */
1027 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
1029 struct ll_inode_info *lli = ll_i2info(inode);
1030 struct cl_object *obj = lli->lli_clob;
1031 struct cl_attr *attr = ccc_env_thread_attr(env);
1037 ll_inode_size_lock(inode);
1038 /* merge timestamps the most recently obtained from mds with
1039 timestamps obtained from osts */
1040 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
1041 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
1042 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
1043 inode_init_lvb(inode, &lvb);
1045 cl_object_attr_lock(obj);
1046 rc = cl_object_attr_get(env, obj, attr);
1047 cl_object_attr_unlock(obj);
/* Prefer the newer timestamp from either source. */
1050 if (lvb.lvb_atime < attr->cat_atime)
1051 lvb.lvb_atime = attr->cat_atime;
1052 if (lvb.lvb_ctime < attr->cat_ctime)
1053 lvb.lvb_ctime = attr->cat_ctime;
1054 if (lvb.lvb_mtime < attr->cat_mtime)
1055 lvb.lvb_mtime = attr->cat_mtime;
1057 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
1058 PFID(&lli->lli_fid), attr->cat_size);
1059 cl_isize_write_nolock(inode, attr->cat_size);
1061 inode->i_blocks = attr->cat_blocks;
1063 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1064 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1065 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1067 ll_inode_size_unlock(inode);
/*
 * Glimpse ioctl helper: fetch OST attributes for @lsm (no capability,
 * no flush) and copy size/blocks/times into the user-visible stat
 * structure.  NOTE(review): the rc == 0 guard and return are elided in
 * this extract.
 */
1072 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1075 struct obdo obdo = { 0 };
1078 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1080 st->st_size = obdo.o_size;
1081 st->st_blocks = obdo.o_blocks;
1082 st->st_mtime = obdo.o_mtime;
1083 st->st_atime = obdo.o_atime;
1084 st->st_ctime = obdo.o_ctime;
/*
 * Decide whether atime updates should be suppressed for this open,
 * mirroring the kernel's file_accessed()/touch_atime() checks (O_NOATIME,
 * S_NOATIME, mount/superblock noatime and nodiratime flags).
 * NOTE(review): the "return true/false" lines between the checks are
 * elided in this extract.
 */
1089 static bool file_is_noatime(const struct file *file)
1091 const struct vfsmount *mnt = file->f_path.mnt;
1092 const struct inode *inode = file->f_path.dentry->d_inode;
1094 /* Adapted from file_accessed() and touch_atime().*/
1095 if (file->f_flags & O_NOATIME)
1098 if (inode->i_flags & S_NOATIME)
1101 if (IS_NOATIME(inode))
1104 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1107 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1110 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/*
 * Initialise a cl_io for a read (@write == 0) or write (@write != 0) on
 * @file: nonblocking/append/sync flags, lock policy (never for nolock
 * files, mandatory for O_APPEND, maybe otherwise) and noatime handling.
 * NOTE(review): some lines are elided in this extract (e.g. the write
 * branch opening and part of the wr_sync condition).
 */
1116 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
1118 struct inode *inode = file->f_dentry->d_inode;
1120 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1122 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1123 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1124 file->f_flags & O_DIRECT ||
1127 io->ci_obj = ll_i2info(inode)->lli_clob;
1128 io->ci_lockreq = CILR_MAYBE;
1129 if (ll_file_nolock(file)) {
1130 io->ci_lockreq = CILR_NEVER;
1131 io->ci_no_srvlock = 1;
1132 } else if (file->f_flags & O_APPEND) {
1133 io->ci_lockreq = CILR_MANDATORY;
1136 io->ci_noatime = file_is_noatime(file);
/*
 * Common read/write engine: set up a cl_io of type @iot at *@ppos for
 * @count bytes, run cl_io_loop(), and handle short-IO restart and
 * per-mount statistics.  Takes lli_write_mutex for non-grouplock writes
 * and lli_trunc_sem for normal IO.  NOTE(review): many lines are elided
 * in this extract (the return type line, restart loop, IO_SPLICE case
 * label, out label, returns).
 */
1140 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1141 struct file *file, enum cl_io_type iot,
1142 loff_t *ppos, size_t count)
1144 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
1145 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1150 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zd\n",
1151 file->f_dentry->d_name.name, iot, *ppos, count);
1154 io = ccc_env_thread_io(env);
1155 ll_io_init(io, file, iot == CIT_WRITE);
1157 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1158 struct vvp_io *vio = vvp_env_io(env);
1159 struct ccc_io *cio = ccc_env_io(env);
1160 int write_mutex_locked = 0;
1162 cio->cui_fd = LUSTRE_FPRIVATE(file);
1163 vio->cui_io_subtype = args->via_io_subtype;
1165 switch (vio->cui_io_subtype) {
1167 cio->cui_iov = args->u.normal.via_iov;
1168 cio->cui_nrsegs = args->u.normal.via_nrsegs;
1169 cio->cui_tot_nrsegs = cio->cui_nrsegs;
1170 cio->cui_iocb = args->u.normal.via_iocb;
/* Serialize writes unless the group lock already provides exclusion. */
1171 if ((iot == CIT_WRITE) &&
1172 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1173 if (mutex_lock_interruptible(&lli->
1175 GOTO(out, result = -ERESTARTSYS);
1176 write_mutex_locked = 1;
1178 down_read(&lli->lli_trunc_sem);
1181 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1182 vio->u.splice.cui_flags = args->u.splice.via_flags;
1185 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
1188 result = cl_io_loop(env, io);
1189 if (args->via_io_subtype == IO_NORMAL)
1190 up_read(&lli->lli_trunc_sem);
1191 if (write_mutex_locked)
1192 mutex_unlock(&lli->lli_write_mutex);
1194 /* cl_io_rw_init() handled IO */
1195 result = io->ci_result;
1198 if (io->ci_nob > 0) {
1199 result = io->ci_nob;
1200 *ppos = io->u.ci_wr.wr.crw_pos;
1204 cl_io_fini(env, io);
1205 /* If any bit been read/written (result != 0), we just return
1206 * short read/write instead of restart io. */
1207 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1208 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
1209 iot == CIT_READ ? "read" : "write",
1210 file->f_dentry->d_name.name, *ppos, count);
1211 LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
1215 if (iot == CIT_READ) {
1217 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1218 LPROC_LL_READ_BYTES, result);
1219 } else if (iot == CIT_WRITE) {
1221 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1222 LPROC_LL_WRITE_BYTES, result);
1223 fd->fd_write_failed = false;
1224 } else if (result != -ERESTARTSYS) {
1225 fd->fd_write_failed = true;
1228 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1235 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array and compute the total byte count.
 * Rejects negative segment lengths or a cumulative length that wraps
 * negative; on an inaccessible user segment the count is truncated at
 * the last good segment (standard kernel semantics for partial iovecs).
 *
 * NOTE(review): interior lines (declarations, final return) are elided
 * from this view.
 */
1237 static int ll_file_get_iov_count(const struct iovec *iov,
1238 unsigned long *nr_segs, size_t *count)
1243 for (seg = 0; seg < *nr_segs; seg++) {
1244 const struct iovec *iv = &iov[seg];
1247 * If any segment has a negative length, or the cumulative
1248 * length ever wraps negative then return -EINVAL.
1251 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1253 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1258 cnt -= iv->iov_len; /* This segment is no good */
/*
 * AIO read entry point: validate the iovec, package it into
 * vvp_io_args (IO_NORMAL) and hand off to ll_file_io_generic()
 * with CIT_READ.
 */
1265 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1266 unsigned long nr_segs, loff_t pos)
1269 struct vvp_io_args *args;
1275 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1279 env = cl_env_get(&refcheck);
1281 RETURN(PTR_ERR(env));
1283 args = vvp_env_args(env, IO_NORMAL);
/* cast away const: the iov is not modified here, only stored */
1284 args->u.normal.via_iov = (struct iovec *)iov;
1285 args->u.normal.via_nrsegs = nr_segs;
1286 args->u.normal.via_iocb = iocb;
1288 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1289 &iocb->ki_pos, count);
1290 cl_env_put(env, &refcheck);
/*
 * Synchronous read entry point: build a single-segment iovec and a
 * sync kiocb in per-thread env storage, then reuse the AIO path.
 */
1294 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1298 struct iovec *local_iov;
1299 struct kiocb *kiocb;
1304 env = cl_env_get(&refcheck);
1306 RETURN(PTR_ERR(env));
1308 local_iov = &vvp_env_info(env)->vti_local_iov;
1309 kiocb = &vvp_env_info(env)->vti_kiocb;
1310 local_iov->iov_base = (void __user *)buf;
1311 local_iov->iov_len = count;
1312 init_sync_kiocb(kiocb, file);
1313 kiocb->ki_pos = *ppos;
/* kiocb byte-count field was renamed across kernel versions */
1314 #ifdef HAVE_KIOCB_KI_LEFT
1315 kiocb->ki_left = count;
1317 kiocb->ki_nbytes = count;
1320 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
/* propagate the position advanced by the IO back to the caller */
1321 *ppos = kiocb->ki_pos;
1323 cl_env_put(env, &refcheck);
1328 * Write to a file (through the page cache).
/*
 * AIO write entry point: mirror of ll_file_aio_read() with CIT_WRITE.
 */
1331 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1332 unsigned long nr_segs, loff_t pos)
1335 struct vvp_io_args *args;
1341 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1345 env = cl_env_get(&refcheck);
1347 RETURN(PTR_ERR(env));
1349 args = vvp_env_args(env, IO_NORMAL);
/* cast away const: the iov is not modified here, only stored */
1350 args->u.normal.via_iov = (struct iovec *)iov;
1351 args->u.normal.via_nrsegs = nr_segs;
1352 args->u.normal.via_iocb = iocb;
1354 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1355 &iocb->ki_pos, count);
1356 cl_env_put(env, &refcheck);
/*
 * Synchronous write entry point: mirror of ll_file_read(), building a
 * one-segment iovec and sync kiocb, then delegating to the AIO path.
 */
1360 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1364 struct iovec *local_iov;
1365 struct kiocb *kiocb;
1370 env = cl_env_get(&refcheck);
1372 RETURN(PTR_ERR(env));
1374 local_iov = &vvp_env_info(env)->vti_local_iov;
1375 kiocb = &vvp_env_info(env)->vti_kiocb;
1376 local_iov->iov_base = (void __user *)buf;
1377 local_iov->iov_len = count;
1378 init_sync_kiocb(kiocb, file);
1379 kiocb->ki_pos = *ppos;
/* kiocb byte-count field was renamed across kernel versions */
1380 #ifdef HAVE_KIOCB_KI_LEFT
1381 kiocb->ki_left = count;
1383 kiocb->ki_nbytes = count;
1386 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
/* propagate the position advanced by the IO back to the caller */
1387 *ppos = kiocb->ki_pos;
1389 cl_env_put(env, &refcheck);
1394 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read(): package the pipe and flags into IO_SPLICE args and
 * run a CIT_READ through ll_file_io_generic().
 */
1396 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1397 struct pipe_inode_info *pipe, size_t count,
1401 struct vvp_io_args *args;
1406 env = cl_env_get(&refcheck);
1408 RETURN(PTR_ERR(env));
1410 args = vvp_env_args(env, IO_SPLICE);
1411 args->u.splice.via_pipe = pipe;
1412 args->u.splice.via_flags = flags;
1414 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1415 cl_env_put(env, &refcheck);
/*
 * Ask the data (OST) layer to recreate the objects of this inode's
 * layout: duplicate the stripe md, fill an obdo describing the object
 * id/ost index and call obd_create() with OBD_FL_RECREATE_OBJS set.
 *
 * NOTE(review): allocation of 'oa' and several error paths are elided
 * from this view.
 */
1419 static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
1422 struct obd_export *exp = ll_i2dtexp(inode);
1423 struct obd_trans_info oti = { 0 };
1424 struct obdo *oa = NULL;
1427 struct lov_stripe_md *lsm = NULL, *lsm2;
1434 lsm = ccc_inode_lsm_get(inode);
/* nothing to recreate for a file with no OST objects */
1435 if (!lsm_has_objects(lsm))
1436 GOTO(out, rc = -ENOENT);
1438 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1439 (lsm->lsm_stripe_count));
1441 OBD_ALLOC_LARGE(lsm2, lsm_size);
1443 GOTO(out, rc = -ENOMEM);
/* o_nlink carries the target OST index for the recreate request */
1446 oa->o_nlink = ost_idx;
1447 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1448 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1449 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1450 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1451 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1452 memcpy(lsm2, lsm, lsm_size);
/* size lock held across obd_create to keep the layout stable */
1453 ll_inode_size_lock(inode);
1454 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1455 ll_inode_size_unlock(inode);
1457 OBD_FREE_LARGE(lsm2, lsm_size);
1460 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_RECREATE_OBJ handler: copy an ll_recreate_obj request from
 * userspace and recreate the object by (mdt0 sequence, id) on the
 * requested OST index. Root only.
 */
1465 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1467 struct ll_recreate_obj ucreat;
1471 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1474 if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1478 ostid_set_seq_mdt0(&oi);
1479 ostid_set_id(&oi, ucreat.lrc_id);
1480 RETURN(ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx));
/*
 * LL_IOC_RECREATE_FID handler: copy a lu_fid from userspace, convert
 * it to an ost_id, and recreate the object. The OST index is encoded
 * in bits 16..31 of the FID sequence. Root only.
 */
1483 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1490 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1493 if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1496 fid_to_ostid(&fid, &oi);
1497 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1498 RETURN(ll_lov_recreate(inode, &oi, ost_idx));
/*
 * Set the striping EA on a file by performing an MDS open with the
 * given lov_user_md. Fails with -EEXIST if the file already has a
 * layout. The open handle created as a side effect is released again
 * via ll_release_openhandle().
 */
1501 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1502 __u64 flags, struct lov_user_md *lum,
1505 struct lov_stripe_md *lsm = NULL;
1506 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1510 lsm = ccc_inode_lsm_get(inode);
/* a layout may only be set once; refuse if one already exists */
1512 ccc_inode_lsm_put(inode, lsm);
1513 CDEBUG(D_IOCTL, "stripe already exists for inode "DFID"\n",
1514 PFID(ll_inode2fid(inode)));
1515 GOTO(out, rc = -EEXIST);
1518 ll_inode_size_lock(inode);
1519 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1521 GOTO(out_unlock, rc);
1522 rc = oit.d.lustre.it_status;
1524 GOTO(out_req_free, rc);
/* we only wanted the layout set; close the handle the open created */
1526 ll_release_openhandle(file->f_dentry, &oit);
1529 ll_inode_size_unlock(inode);
1530 ll_intent_release(&oit);
1531 ccc_inode_lsm_put(inode, lsm);
1533 cl_lov_delay_create_clear(&file->f_flags);
1536 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA (striping information) for @filename from the MDS
 * via md_getattr_name(), validate its magic, and byte-swap it to host
 * endianness for userspace. On success *lmmp/*lmm_size point into the
 * reply buffer and *request must be released by the caller.
 */
1540 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1541 struct lov_mds_md **lmmp, int *lmm_size,
1542 struct ptlrpc_request **request)
1544 struct ll_sb_info *sbi = ll_i2sbi(inode);
1545 struct mdt_body *body;
1546 struct lov_mds_md *lmm = NULL;
1547 struct ptlrpc_request *req = NULL;
1548 struct md_op_data *op_data;
1551 rc = ll_get_default_mdsize(sbi, &lmmsize);
1555 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1556 strlen(filename), lmmsize,
1557 LUSTRE_OPC_ANY, NULL);
1558 if (IS_ERR(op_data))
1559 RETURN(PTR_ERR(op_data));
1561 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1562 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1563 ll_finish_md_op_data(op_data);
1565 CDEBUG(D_INFO, "md_getattr_name failed "
1566 "on %s: rc %d\n", filename, rc);
1570 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1571 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1573 lmmsize = body->eadatasize;
/* no EA data in the reply means the file has no striping */
1575 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1577 GOTO(out, rc = -ENODATA);
1580 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1581 LASSERT(lmm != NULL);
/* only plain V1/V3 layouts are understood here */
1583 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1584 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1585 GOTO(out, rc = -EPROTO);
1589 * This is coming from the MDS, so is probably in
1590 * little endian. We convert it to host endian before
1591 * passing it to userspace.
/* swab only needed on big-endian hosts */
1593 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1596 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1597 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1600 /* if function called for directory - we should
1601 * avoid swab not existent lsm objects */
1602 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1603 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1604 if (S_ISREG(body->mode))
1605 lustre_swab_lov_user_md_objects(
1606 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1608 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1609 lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1610 if (S_ISREG(body->mode))
1611 lustre_swab_lov_user_md_objects(
1612 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1619 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: copy a lov_user_md (with one ost_data
 * entry) from userspace and set it as the file's striping EA with
 * MDS_OPEN_HAS_OBJS, i.e. the objects already exist. Root only.
 */
1624 static int ll_lov_setea(struct inode *inode, struct file *file,
1627 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1628 struct lov_user_md *lump;
1629 int lum_size = sizeof(struct lov_user_md) +
1630 sizeof(struct lov_user_ost_data);
1634 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1637 OBD_ALLOC_LARGE(lump, lum_size);
1641 if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1642 OBD_FREE_LARGE(lump, lum_size);
1646 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1648 OBD_FREE_LARGE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE handler: read a v1 lov_user_md first and
 * re-read as v3 if the magic says so, set the stripe EA, then on
 * success refresh the layout and return the resulting stripe info
 * back to userspace through the same buffer.
 */
1652 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1655 struct lov_user_md_v3 lumv3;
1656 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1657 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1658 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1660 __u64 flags = FMODE_WRITE;
1663 /* first try with v1 which is smaller than v3 */
1664 lum_size = sizeof(struct lov_user_md_v1);
1665 if (copy_from_user(lumv1, lumv1p, lum_size))
1668 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1669 lum_size = sizeof(struct lov_user_md_v3);
1670 if (copy_from_user(&lumv3, lumv3p, lum_size))
1674 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1676 struct lov_stripe_md *lsm;
/* zero the user's stripe count so GETSTRIPE below fills it in */
1679 put_user(0, &lumv1p->lmm_stripe_count);
1681 ll_layout_refresh(inode, &gen);
1682 lsm = ccc_inode_lsm_get(inode);
1683 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1684 0, lsm, (void *)arg);
1685 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_LOV_GETSTRIPE handler: hold the inode's stripe md while the
 * data export copies the striping information out to userspace.
 */
1690 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1692 struct lov_stripe_md *lsm;
1696 lsm = ccc_inode_lsm_get(inode);
1698 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1700 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_GROUP_LOCK handler: take a cluster-wide group lock with gid
 * @arg on this file. Only one group lock per file descriptor; the
 * check-acquire-recheck dance below handles the race between two
 * threads requesting the lock on the same fd concurrently.
 */
1705 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1707 struct ll_inode_info *lli = ll_i2info(inode);
1708 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1709 struct ccc_grouplock grouplock;
1713 if (ll_file_nolock(file))
1714 RETURN(-EOPNOTSUPP);
/* first check under the lock: already group-locked? */
1716 spin_lock(&lli->lli_lock);
1717 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1718 CWARN("group lock already existed with gid %lu\n",
1719 fd->fd_grouplock.cg_gid);
1720 spin_unlock(&lli->lli_lock);
1723 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1724 spin_unlock(&lli->lli_lock);
/* acquire outside the spinlock: cl_get_grouplock() can block */
1726 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1727 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* recheck: another thread may have won while we were blocked */
1731 spin_lock(&lli->lli_lock);
1732 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1733 spin_unlock(&lli->lli_lock);
1734 CERROR("another thread just won the race\n");
1735 cl_put_grouplock(&grouplock);
1739 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1740 fd->fd_grouplock = grouplock;
1741 spin_unlock(&lli->lli_lock);
1743 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK handler: release the group lock with gid @arg
 * held on this file descriptor. The fd state is cleared under the
 * spinlock; the actual lock release happens outside it.
 */
1747 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1749 struct ll_inode_info *lli = ll_i2info(inode);
1750 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1751 struct ccc_grouplock grouplock;
1754 spin_lock(&lli->lli_lock);
1755 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1756 spin_unlock(&lli->lli_lock);
1757 CWARN("no group lock held\n");
1760 LASSERT(fd->fd_grouplock.cg_lock != NULL);
/* the caller must release with the same gid it locked with */
1762 if (fd->fd_grouplock.cg_gid != arg) {
1763 CWARN("group lock %lu doesn't match current id %lu\n",
1764 arg, fd->fd_grouplock.cg_gid);
1765 spin_unlock(&lli->lli_lock);
/* detach the grouplock from the fd before dropping the spinlock */
1769 grouplock = fd->fd_grouplock;
1770 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1771 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1772 spin_unlock(&lli->lli_lock);
1774 cl_put_grouplock(&grouplock);
1775 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1780 * Close inode open handle
1782 * \param dentry [in] dentry which contains the inode
1783 * \param it [in,out] intent which contains open info and result
1786 * \retval <0 failure
1788 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1790 struct inode *inode = dentry->d_inode;
1791 struct obd_client_handle *och;
1797 /* Root ? Do nothing. */
1798 if (dentry->d_inode->i_sb->s_root == dentry)
1801 /* No open handle to close? Move away */
1802 if (!it_disposition(it, DISP_OPEN_OPEN))
1805 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1807 OBD_ALLOC(och, sizeof(*och));
1809 GOTO(out, rc = -ENOMEM);
/* fill the client handle from the intent, then close it at the MDS */
1811 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1813 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1816 /* this one is in place of ll_file_open */
1817 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1818 ptlrpc_req_finished(it->d.lustre.it_data);
1819 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1825 * Get size for inode for which FIEMAP mapping is requested.
1826 * Make the FIEMAP get_info call and returns the result.
1828 static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1831 struct obd_export *exp = ll_i2dtexp(inode);
1832 struct lov_stripe_md *lsm = NULL;
1833 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1834 __u32 vallen = num_bytes;
1838 /* Checks for fiemap flags */
/* report unsupported flags back through fm_flags per FIEMAP ABI */
1839 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1840 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1844 /* Check for FIEMAP_FLAG_SYNC */
1845 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1846 rc = filemap_fdatawrite(inode->i_mapping);
1851 lsm = ccc_inode_lsm_get(inode);
1855 /* If the stripe_count > 1 and the application does not understand
1856 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1858 if (lsm->lsm_stripe_count > 1 &&
1859 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1860 GOTO(out, rc = -EOPNOTSUPP);
1862 fm_key.oa.o_oi = lsm->lsm_oi;
1863 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1865 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1866 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1867 /* If filesize is 0, then there would be no objects for mapping */
1868 if (fm_key.oa.o_size == 0) {
1869 fiemap->fm_mapped_extents = 0;
/* ship the fiemap request to the OSTs via the KEY_FIEMAP get_info */
1873 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1875 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1878 CERROR("obd_get_info failed: rc = %d\n", rc);
1881 ccc_inode_lsm_put(inode, lsm);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a path via the MDC.
 * Reads a getinfo_fid2path header from userspace to learn the
 * requested path buffer length, allocates the full-size output
 * structure, and copies the result back. Requires DAC_READ_SEARCH
 * capability unless the mount allows user fid2path.
 */
1885 int ll_fid2path(struct inode *inode, void *arg)
1887 struct obd_export *exp = ll_i2mdexp(inode);
1888 struct getinfo_fid2path *gfout, *gfin;
1892 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1893 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1896 /* Need to get the buflen */
1897 OBD_ALLOC_PTR(gfin);
1900 if (copy_from_user(gfin, arg, sizeof(*gfin))) {
/* NOTE(review): gf_pathlen comes from userspace unchecked in the
 * visible lines — presumably validated elsewhere; confirm. */
1905 outsize = sizeof(*gfout) + gfin->gf_pathlen;
1906 OBD_ALLOC(gfout, outsize);
1907 if (gfout == NULL) {
1911 memcpy(gfout, gfin, sizeof(*gfout));
1914 /* Call mdc_iocontrol */
1915 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1919 if (copy_to_user(arg, gfout, outsize))
1923 OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP handler: size the kernel fiemap buffer from the
 * user-supplied fm_extent_count, copy the request (and the priming
 * first extent, used for continuation) in, run ll_do_fiemap(), and
 * copy the header plus mapped extents back out.
 */
1927 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1929 struct ll_user_fiemap *fiemap_s;
1930 size_t num_bytes, ret_bytes;
1931 unsigned int extent_count;
1934 /* Get the extent count so we can calculate the size of
1935 * required fiemap buffer */
1936 if (get_user(extent_count,
1937 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
/* NOTE(review): extent_count is user-controlled and this multiply
 * is unchecked in the visible lines — overflow check may be in the
 * elided lines; confirm. */
1939 num_bytes = sizeof(*fiemap_s) + (extent_count *
1940 sizeof(struct ll_fiemap_extent));
1942 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1943 if (fiemap_s == NULL)
1946 /* get the fiemap value */
1947 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1949 GOTO(error, rc = -EFAULT);
1951 /* If fm_extent_count is non-zero, read the first extent since
1952 * it is used to calculate end_offset and device from previous
1955 if (copy_from_user(&fiemap_s->fm_extents[0],
1956 (char __user *)arg + sizeof(*fiemap_s),
1957 sizeof(struct ll_fiemap_extent)))
1958 GOTO(error, rc = -EFAULT);
1961 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
/* copy back only the header plus the extents actually mapped */
1965 ret_bytes = sizeof(struct ll_user_fiemap);
1967 if (extent_count != 0)
1968 ret_bytes += (fiemap_s->fm_mapped_extents *
1969 sizeof(struct ll_fiemap_extent));
1971 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1975 OBD_FREE_LARGE(fiemap_s, num_bytes);
1980 * Read the data_version for inode.
1982 * This value is computed using stripe object version on OST.
1983 * Version is computed using server side locking.
1985 * @param sync if do sync on the OST side;
1987 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1988 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
1990 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
1992 struct lov_stripe_md *lsm = NULL;
1993 struct ll_sb_info *sbi = ll_i2sbi(inode);
1994 struct obdo *obdo = NULL;
1998 /* If no stripe, we consider version is 0. */
1999 lsm = ccc_inode_lsm_get(inode);
2000 if (!lsm_has_objects(lsm)) {
2002 CDEBUG(D_INODE, "No object for inode\n");
2006 OBD_ALLOC_PTR(obdo);
2008 GOTO(out, rc = -ENOMEM);
/* query OST attributes; the data version comes back in the obdo */
2010 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, flags);
2012 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
2015 *data_version = obdo->o_data_version;
2021 ccc_inode_lsm_put(inode, lsm);
2026 * Trigger a HSM release request for the provided inode.
1028 int ll_hsm_release(struct inode *inode)
2030 struct cl_env_nest nest;
2032 struct obd_client_handle *och = NULL;
2033 __u64 data_version = 0;
2037 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2038 ll_get_fsname(inode->i_sb, NULL, 0),
2039 PFID(&ll_i2info(inode)->lli_fid));
/* take a write lease so no other client can modify during release */
2041 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2043 GOTO(out, rc = PTR_ERR(och));
2045 /* Grab latest data_version and [am]time values */
2046 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2050 env = cl_env_nested_get(&nest);
2052 GOTO(out, rc = PTR_ERR(env));
/* merge OST attributes into the inode before releasing */
2054 ll_merge_lvb(env, inode);
2055 cl_env_nested_put(&nest, env);
2057 /* Release the file.
2058 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2059 * we still need it to pack l_remote_handle to MDT. */
2060 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
2066 if (och != NULL && !IS_ERR(och)) /* close the file */
2067 ll_lease_close(och, inode, NULL);
/* State saved across a layout swap: pending [am]time restores,
 * the two inodes, and whether each side's data version must be
 * verified before the swap is committed. */
2072 struct ll_swap_stack {
2073 struct iattr ia1, ia2;
2075 struct inode *inode1, *inode2;
2076 bool check_dv1, check_dv2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS implementation: atomically exchange the
 * layouts of two files at the MDT. Orders the two inodes by FID to
 * avoid deadlock, optionally flushes dirty cache via group locks,
 * verifies data versions if requested, performs the swap through
 * obd_iocontrol, and finally restores mtime/atime if asked to.
 */
2079 static int ll_swap_layouts(struct file *file1, struct file *file2,
2080 struct lustre_swap_layouts *lsl)
2082 struct mdc_swap_layouts msl;
2083 struct md_op_data *op_data;
2086 struct ll_swap_stack *llss = NULL;
2089 OBD_ALLOC_PTR(llss);
2093 llss->inode1 = file1->f_dentry->d_inode;
2094 llss->inode2 = file2->f_dentry->d_inode;
2096 if (!S_ISREG(llss->inode2->i_mode))
2097 GOTO(free, rc = -EINVAL);
2099 if (inode_permission(llss->inode1, MAY_WRITE) ||
2100 inode_permission(llss->inode2, MAY_WRITE))
2101 GOTO(free, rc = -EPERM);
/* both files must live on the same Lustre filesystem */
2103 if (llss->inode2->i_sb != llss->inode1->i_sb)
2104 GOTO(free, rc = -EXDEV);
2106 /* we use 2 bool because it is easier to swap than 2 bits */
2107 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2108 llss->check_dv1 = true;
2110 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2111 llss->check_dv2 = true;
2113 /* we cannot use lsl->sl_dvX directly because we may swap them */
2114 llss->dv1 = lsl->sl_dv1;
2115 llss->dv2 = lsl->sl_dv2;
2117 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2118 if (rc == 0) /* same file, done! */
/* always operate in FID order so concurrent swaps cannot deadlock */
2121 if (rc < 0) { /* sequentialize it */
2122 swap(llss->inode1, llss->inode2);
2124 swap(llss->dv1, llss->dv2);
2125 swap(llss->check_dv1, llss->check_dv2);
2129 if (gid != 0) { /* application asks to flush dirty cache */
2130 rc = ll_get_grouplock(llss->inode1, file1, gid);
2134 rc = ll_get_grouplock(llss->inode2, file2, gid);
2136 ll_put_grouplock(llss->inode1, file1, gid);
2141 /* to be able to restore mtime and atime after swap
2142 * we need to first save them */
2144 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2145 llss->ia1.ia_mtime = llss->inode1->i_mtime;
2146 llss->ia1.ia_atime = llss->inode1->i_atime;
2147 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2148 llss->ia2.ia_mtime = llss->inode2->i_mtime;
2149 llss->ia2.ia_atime = llss->inode2->i_atime;
2150 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2153 /* ultimate check, before swaping the layouts we check if
2154 * dataversion has changed (if requested) */
2155 if (llss->check_dv1) {
2156 rc = ll_data_version(llss->inode1, &dv, 0);
2159 if (dv != llss->dv1)
2160 GOTO(putgl, rc = -EAGAIN);
2163 if (llss->check_dv2) {
2164 rc = ll_data_version(llss->inode2, &dv, 0);
2167 if (dv != llss->dv2)
2168 GOTO(putgl, rc = -EAGAIN);
2171 /* struct md_op_data is used to send the swap args to the mdt
2172 * only flags is missing, so we use struct mdc_swap_layouts
2173 * through the md_op_data->op_data */
2174 /* flags from user space have to be converted before they are send to
2175 * server, no flag is sent today, they are only used on the client */
2178 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2179 0, LUSTRE_OPC_ANY, &msl);
2180 if (IS_ERR(op_data))
2181 GOTO(free, rc = PTR_ERR(op_data));
2183 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2184 sizeof(*op_data), op_data, NULL);
2185 ll_finish_md_op_data(op_data);
/* release group locks in reverse acquisition order */
2189 ll_put_grouplock(llss->inode2, file2, gid);
2190 ll_put_grouplock(llss->inode1, file1, gid);
2193 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2197 /* clear useless flags */
2198 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2199 llss->ia1.ia_valid &= ~ATTR_MTIME;
2200 llss->ia2.ia_valid &= ~ATTR_MTIME;
2203 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2204 llss->ia1.ia_valid &= ~ATTR_ATIME;
2205 llss->ia2.ia_valid &= ~ATTR_ATIME;
2208 /* update time if requested */
/* note: ia2 goes to inode1 and vice versa — times follow the data */
2210 if (llss->ia2.ia_valid != 0) {
2211 mutex_lock(&llss->inode1->i_mutex);
2212 rc = ll_setattr(file1->f_dentry, &llss->ia2);
2213 mutex_unlock(&llss->inode1->i_mutex);
2216 if (llss->ia1.ia_valid != 0) {
2219 mutex_lock(&llss->inode2->i_mutex);
2220 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2221 mutex_unlock(&llss->inode2->i_mutex);
/*
 * Send an HSM state set/clear request to the MDT for this inode.
 * Flags outside HSM_USER_MASK require CAP_SYS_ADMIN.
 */
2233 static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2235 struct md_op_data *op_data;
2238 /* Non-root users are forbidden to set or clear flags which are
2239 * NOT defined in HSM_USER_MASK. */
2240 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2241 !cfs_capable(CFS_CAP_SYS_ADMIN))
2244 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2245 LUSTRE_OPC_ANY, hss);
2246 if (IS_ERR(op_data))
2247 RETURN(PTR_ERR(op_data));
2249 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2250 sizeof(*op_data), op_data, NULL);
2252 ll_finish_md_op_data(op_data);
/*
 * Import a file from HSM archive: mark it ARCHIVED|EXISTS|RELEASED
 * with the given archive id, then restore the saved mode, ownership,
 * size and timestamps from the hsm_user_import record via setattr.
 */
2257 static int ll_hsm_import(struct inode *inode, struct file *file,
2258 struct hsm_user_import *hui)
2260 struct hsm_state_set *hss = NULL;
2261 struct iattr *attr = NULL;
2265 if (!S_ISREG(inode->i_mode))
2271 GOTO(out, rc = -ENOMEM);
2273 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2274 hss->hss_archive_id = hui->hui_archive_id;
2275 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2276 rc = ll_hsm_state_set(inode, hss);
2280 OBD_ALLOC_PTR(attr);
2282 GOTO(out, rc = -ENOMEM);
/* restore archived attributes; force mode to a regular file */
2284 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2285 attr->ia_mode |= S_IFREG;
2286 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2287 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2288 attr->ia_size = hui->hui_size;
2289 attr->ia_mtime.tv_sec = hui->hui_mtime;
2290 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2291 attr->ia_atime.tv_sec = hui->hui_atime;
2292 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2294 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2295 ATTR_UID | ATTR_GID |
2296 ATTR_MTIME | ATTR_MTIME_SET |
2297 ATTR_ATIME | ATTR_ATIME_SET;
2299 rc = ll_setattr_raw(file->f_dentry, attr, true);
/*
 * Main ioctl dispatcher for regular Lustre files. Handles Lustre-
 * specific commands (striping, group locks, HSM, leases, fid2path,
 * fiemap, data version, layout swap) locally or by forwarding to the
 * MD/DT exports; unrecognized commands fall through to the dynamic
 * ll_iocontrol_call() registry and finally to the OSC/data export.
 */
2314 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2316 struct inode *inode = file->f_dentry->d_inode;
2317 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2321 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2322 PFID(ll_inode2fid(inode)), inode, cmd);
2323 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2325 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2326 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2330 case LL_IOC_GETFLAGS:
2331 /* Get the current value of the file flags */
2332 return put_user(fd->fd_flags, (int *)arg);
2333 case LL_IOC_SETFLAGS:
2334 case LL_IOC_CLRFLAGS:
2335 /* Set or clear specific file flags */
2336 /* XXX This probably needs checks to ensure the flags are
2337 * not abused, and to handle any flag side effects.
2339 if (get_user(flags, (int *) arg))
2342 if (cmd == LL_IOC_SETFLAGS) {
/* locking can only be disabled for O_DIRECT files */
2343 if ((flags & LL_FILE_IGNORE_LOCK) &&
2344 !(file->f_flags & O_DIRECT)) {
2345 CERROR("%s: unable to disable locking on "
2346 "non-O_DIRECT file\n", current->comm);
2350 fd->fd_flags |= flags;
2352 fd->fd_flags &= ~flags;
2355 case LL_IOC_LOV_SETSTRIPE:
2356 RETURN(ll_lov_setstripe(inode, file, arg));
2357 case LL_IOC_LOV_SETEA:
2358 RETURN(ll_lov_setea(inode, file, arg));
2359 case LL_IOC_LOV_SWAP_LAYOUTS: {
2361 struct lustre_swap_layouts lsl;
2363 if (copy_from_user(&lsl, (char *)arg,
2364 sizeof(struct lustre_swap_layouts)))
/* both files must be writable for a layout swap */
2367 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2370 file2 = fget(lsl.sl_fd);
2375 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2376 rc = ll_swap_layouts(file, file2, &lsl);
2380 case LL_IOC_LOV_GETSTRIPE:
2381 RETURN(ll_lov_getstripe(inode, arg));
2382 case LL_IOC_RECREATE_OBJ:
2383 RETURN(ll_lov_recreate_obj(inode, arg));
2384 case LL_IOC_RECREATE_FID:
2385 RETURN(ll_lov_recreate_fid(inode, arg));
2386 case FSFILT_IOC_FIEMAP:
2387 RETURN(ll_ioctl_fiemap(inode, arg));
2388 case FSFILT_IOC_GETFLAGS:
2389 case FSFILT_IOC_SETFLAGS:
2390 RETURN(ll_iocontrol(inode, file, cmd, arg));
2391 case FSFILT_IOC_GETVERSION_OLD:
2392 case FSFILT_IOC_GETVERSION:
2393 RETURN(put_user(inode->i_generation, (int *)arg));
2394 case LL_IOC_GROUP_LOCK:
2395 RETURN(ll_get_grouplock(inode, file, arg));
2396 case LL_IOC_GROUP_UNLOCK:
2397 RETURN(ll_put_grouplock(inode, file, arg));
2398 case IOC_OBD_STATFS:
2399 RETURN(ll_obd_statfs(inode, (void *)arg));
2401 /* We need to special case any other ioctls we want to handle,
2402 * to send them to the MDS/OST as appropriate and to properly
2403 * network encode the arg field.
2404 case FSFILT_IOC_SETVERSION_OLD:
2405 case FSFILT_IOC_SETVERSION:
2407 case LL_IOC_FLUSHCTX:
2408 RETURN(ll_flush_ctx(inode));
2409 case LL_IOC_PATH2FID: {
2410 if (copy_to_user((void *)arg, ll_inode2fid(inode),
2411 sizeof(struct lu_fid)))
2416 case OBD_IOC_FID2PATH:
2417 RETURN(ll_fid2path(inode, (void *)arg));
2418 case LL_IOC_DATA_VERSION: {
2419 struct ioc_data_version idv;
2422 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
/* mask to the flags ll_data_version() understands */
2425 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2426 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2428 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2434 case LL_IOC_GET_MDTIDX: {
2437 mdtidx = ll_get_mdt_idx(inode);
2441 if (put_user((int)mdtidx, (int*)arg))
2446 case OBD_IOC_GETDTNAME:
2447 case OBD_IOC_GETMDNAME:
2448 RETURN(ll_get_obd_name(inode, cmd, arg));
2449 case LL_IOC_HSM_STATE_GET: {
2450 struct md_op_data *op_data;
2451 struct hsm_user_state *hus;
2458 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2459 LUSTRE_OPC_ANY, hus);
2460 if (IS_ERR(op_data)) {
2462 RETURN(PTR_ERR(op_data));
2465 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2468 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2471 ll_finish_md_op_data(op_data);
2475 case LL_IOC_HSM_STATE_SET: {
2476 struct hsm_state_set *hss;
2483 if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2488 rc = ll_hsm_state_set(inode, hss);
2493 case LL_IOC_HSM_ACTION: {
2494 struct md_op_data *op_data;
2495 struct hsm_current_action *hca;
2502 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2503 LUSTRE_OPC_ANY, hca);
2504 if (IS_ERR(op_data)) {
2506 RETURN(PTR_ERR(op_data));
2509 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2512 if (copy_to_user((char *)arg, hca, sizeof(*hca)))
2515 ll_finish_md_op_data(op_data);
2519 case LL_IOC_SET_LEASE: {
2520 struct ll_inode_info *lli = ll_i2info(inode);
2521 struct obd_client_handle *och = NULL;
/* the requested lease mode must not exceed the open mode */
2527 if (!(file->f_mode & FMODE_WRITE))
2532 if (!(file->f_mode & FMODE_READ))
/* unlock case: detach and close any lease held on this fd */
2537 mutex_lock(&lli->lli_och_mutex);
2538 if (fd->fd_lease_och != NULL) {
2539 och = fd->fd_lease_och;
2540 fd->fd_lease_och = NULL;
2542 mutex_unlock(&lli->lli_och_mutex);
2545 mode = och->och_flags &(FMODE_READ|FMODE_WRITE);
2546 rc = ll_lease_close(och, inode, &lease_broken);
2547 if (rc == 0 && lease_broken)
2553 /* return the type of lease or error */
2554 RETURN(rc < 0 ? rc : (int)mode);
2559 CDEBUG(D_INODE, "Set lease with mode %d\n", mode);
2561 /* apply for lease */
2562 och = ll_lease_open(inode, file, mode, 0);
2564 RETURN(PTR_ERR(och));
/* install the lease unless one raced in ahead of us */
2567 mutex_lock(&lli->lli_och_mutex);
2568 if (fd->fd_lease_och == NULL) {
2569 fd->fd_lease_och = och;
2572 mutex_unlock(&lli->lli_och_mutex);
2574 /* impossible now that only excl is supported for now */
2575 ll_lease_close(och, inode, &lease_broken);
2580 case LL_IOC_GET_LEASE: {
2581 struct ll_inode_info *lli = ll_i2info(inode);
2582 struct ldlm_lock *lock = NULL;
2585 mutex_lock(&lli->lli_och_mutex);
2586 if (fd->fd_lease_och != NULL) {
2587 struct obd_client_handle *och = fd->fd_lease_och;
/* report lease mode only if the lease lock is still valid */
2589 lock = ldlm_handle2lock(&och->och_lease_handle);
2591 lock_res_and_lock(lock);
2592 if (!ldlm_is_cancel(lock))
2593 rc = och->och_flags &
2594 (FMODE_READ | FMODE_WRITE);
2595 unlock_res_and_lock(lock);
2596 LDLM_LOCK_PUT(lock);
2599 mutex_unlock(&lli->lli_och_mutex);
2602 case LL_IOC_HSM_IMPORT: {
2603 struct hsm_user_import *hui;
2609 if (copy_from_user(hui, (void *)arg, sizeof(*hui))) {
2614 rc = ll_hsm_import(inode, file, hui);
/* unhandled: try dynamically registered handlers, then the OSC */
2624 ll_iocontrol_call(inode, file, cmd, arg, &err))
2627 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2633 #ifndef HAVE_FILE_LLSEEK_SIZE
/* Commit an lseek result: validate the new offset against 0 and
 * @maxsize, then store it in f_pos (resetting f_version) if changed.
 * Local copy for kernels lacking generic_file_llseek_size(). */
2634 static inline loff_t
2635 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2637 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2639 if (offset > maxsize)
2642 if (offset != file->f_pos) {
2643 file->f_pos = offset;
2644 file->f_version = 0;
/*
 * Compat copy of the kernel's generic_file_llseek_size(): compute a
 * new file position for SEEK_SET/CUR/END (and SEEK_HOLE/DATA given
 * @eof), bounded by @maxsize. Only compiled when the kernel does not
 * provide its own.
 */
2650 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2651 loff_t maxsize, loff_t eof)
2653 struct inode *inode = file->f_dentry->d_inode;
2661 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2662 * position-querying operation. Avoid rewriting the "same"
2663 * f_pos value back to the file because a concurrent read(),
2664 * write() or lseek() might have altered it
2669 * f_lock protects against read/modify/write race with other
2670 * SEEK_CURs. Note that parallel writes and reads behave
/* i_mutex serializes the read-modify-write of f_pos for SEEK_CUR */
2673 mutex_lock(&inode->i_mutex);
2674 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2675 mutex_unlock(&inode->i_mutex);
2679 * In the generic case the entire file is data, so as long as
2680 * offset isn't at the end of the file then the offset is data.
2687 * There is a virtual hole at the end of the file, so as long as
2688 * offset isn't i_size or larger, return i_size.
2696 return llseek_execute(file, offset, maxsize);
/*
 * llseek() entry point. For SEEK_END/SEEK_HOLE/SEEK_DATA the file size
 * must be current, so glimpse the OSTs first; then delegate the actual
 * offset computation to ll_generic_file_llseek_size() bounded by the
 * filesystem's maximum file size.
 */
2700 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2702 struct inode *inode = file->f_dentry->d_inode;
2703 loff_t retval, eof = 0;
/* retval here is only the provisional target offset used for tracing. */
2706 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2707 (origin == SEEK_CUR) ? file->f_pos : 0);
2708 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2709 PFID(ll_inode2fid(inode)), inode, retval, retval,
2711 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
/* These origins depend on an up-to-date i_size; fetch it from the OSTs. */
2713 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2714 retval = ll_glimpse_size(inode);
2717 eof = i_size_read(inode);
2720 retval = ll_generic_file_llseek_size(file, offset, origin,
2721 ll_file_maxbytes(inode), eof);
/*
 * flush() entry point (called on close(2) of every file descriptor).
 * Does not force writeback; it only reports asynchronous write errors
 * recorded earlier for this inode so the application sees them.
 */
2725 static int ll_flush(struct file *file, fl_owner_t id)
2727 struct inode *inode = file->f_dentry->d_inode;
2728 struct ll_inode_info *lli = ll_i2info(inode);
2729 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2732 LASSERT(!S_ISDIR(inode->i_mode));
2734 /* catch async errors that were recorded back when async writeback
2735 * failed for pages in this mapping. */
/* Consume-and-clear: the error is reported at most once per recording. */
2736 rc = lli->lli_async_rc;
2737 lli->lli_async_rc = 0;
2738 if (lli->lli_clob != NULL) {
2739 err = lov_read_and_clear_async_rc(lli->lli_clob);
2744 /* The application has been told write failure already.
2745 * Do not report failure again. */
2746 if (fd->fd_write_failed)
/* Collapse any error to -EIO for the VFS. */
2748 return rc ? -EIO : 0;
2752 * Called to make sure a portion of file has been written out.
2753 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2755 * Return how many pages have been written.
2757 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2758 enum cl_fsync_mode mode, int ignore_layout)
2760 struct cl_env_nest nest;
2763 struct obd_capa *capa = NULL;
2764 struct cl_fsync_io *fio;
/* Reject any mode outside the known cl_fsync_mode set. */
2768 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2769 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2772 env = cl_env_nested_get(&nest);
2774 RETURN(PTR_ERR(env));
/* OSS write capability accompanies the sync request (fio->fi_capa). */
2776 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2778 io = ccc_env_thread_io(env);
2779 io->ci_obj = cl_i2info(inode)->lli_clob;
2780 io->ci_ignore_layout = ignore_layout;
2782 /* initialize parameters for sync */
2783 fio = &io->u.ci_fsync;
2784 fio->fi_capa = capa;
2785 fio->fi_start = start;
2787 fio->fi_fid = ll_inode2fid(inode);
2788 fio->fi_mode = mode;
2789 fio->fi_nr_written = 0;
/* Run the CIT_FSYNC io; on init failure fall back to ci_result. */
2791 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2792 result = cl_io_loop(env, io);
2794 result = io->ci_result;
/* On success the return value is the page count, not 0. */
2796 result = fio->fi_nr_written;
2797 cl_io_fini(env, io);
2798 cl_env_nested_put(&nest, env);
2806 * When dentry is provided (the 'else' case), *file->f_dentry may be
2807 * null and dentry must be used directly rather than pulled from
2808 * *file->f_dentry as is done otherwise.
/*
 * fsync() entry point, compiled in one of three signatures depending on
 * the kernel API (4-arg range fsync, 2-arg, or legacy dentry variant).
 * Flushes dirty pages, clears recorded async write errors, syncs MDT
 * metadata via md_fsync(), and for regular files syncs OST data with
 * cl_sync_file_range(CL_FSYNC_ALL).
 */
2811 #ifdef HAVE_FILE_FSYNC_4ARGS
2812 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2814 struct dentry *dentry = file->f_dentry;
2815 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2816 int ll_fsync(struct file *file, int datasync)
2818 struct dentry *dentry = file->f_dentry;
/* Older variants have no range arguments: sync the whole file. */
2820 loff_t end = LLONG_MAX;
2822 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2825 loff_t end = LLONG_MAX;
2827 struct inode *inode = dentry->d_inode;
2828 struct ll_inode_info *lli = ll_i2info(inode);
2829 struct ptlrpc_request *req;
2830 struct obd_capa *oc;
2834 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2835 PFID(ll_inode2fid(inode)), inode);
2836 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2838 #ifdef HAVE_FILE_FSYNC_4ARGS
2839 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2840 mutex_lock(&inode->i_mutex);
2842 /* fsync's caller has already called _fdata{sync,write}, we want
2843 * that IO to finish before calling the osc and mdc sync methods */
2844 rc = filemap_fdatawait(inode->i_mapping);
2847 /* catch async errors that were recorded back when async writeback
2848 * failed for pages in this mapping. */
2849 if (!S_ISDIR(inode->i_mode)) {
2850 err = lli->lli_async_rc;
2851 lli->lli_async_rc = 0;
2854 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* Metadata sync on the MDT, authorized by the MDS capability. */
2859 oc = ll_mdscapa_get(inode);
2860 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2866 ptlrpc_req_finished(req);
2868 if (S_ISREG(inode->i_mode)) {
2869 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2871 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2872 if (rc == 0 && err < 0)
/* Track per-fd write failure so ll_flush() won't double-report. */
2875 fd->fd_write_failed = true;
2877 fd->fd_write_failed = false;
2880 #ifdef HAVE_FILE_FSYNC_4ARGS
2881 mutex_unlock(&inode->i_mutex);
/*
 * flock()/fcntl() lock entry point shared by .flock and .lock ops.
 * Translates the kernel's struct file_lock into an LDLM_FLOCK enqueue
 * against the MDT, then mirrors the result into the local lock tables
 * (flock_lock_file_wait / posix_lock_file_wait) so the VFS bookkeeping
 * stays consistent with the cluster-wide state.
 */
2887 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2889 struct inode *inode = file->f_dentry->d_inode;
2890 struct ll_sb_info *sbi = ll_i2sbi(inode);
2891 struct ldlm_enqueue_info einfo = {
2892 .ei_type = LDLM_FLOCK,
2893 .ei_cb_cp = ldlm_flock_completion_ast,
2894 .ei_cbdata = file_lock,
2896 struct md_op_data *op_data;
2897 struct lustre_handle lockh = {0};
2898 ldlm_policy_data_t flock = {{0}};
/* fl_type is saved because einfo.ei_mode is written into it below and
 * must be restored for non-TEST requests. */
2899 int fl_type = file_lock->fl_type;
2905 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
2906 PFID(ll_inode2fid(inode)), file_lock);
2908 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2910 if (file_lock->fl_flags & FL_FLOCK) {
2911 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2912 /* flocks are whole-file locks */
2913 flock.l_flock.end = OFFSET_MAX;
2914 /* For flocks owner is determined by the local file desctiptor*/
2915 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2916 } else if (file_lock->fl_flags & FL_POSIX) {
2917 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2918 flock.l_flock.start = file_lock->fl_start;
2919 flock.l_flock.end = file_lock->fl_end;
2923 flock.l_flock.pid = file_lock->fl_pid;
2925 /* Somewhat ugly workaround for svc lockd.
2926 * lockd installs custom fl_lmops->lm_compare_owner that checks
2927 * for the fl_owner to be the same (which it always is on local node
2928 * I guess between lockd processes) and then compares pid.
2929 * As such we assign pid to the owner field to make it all work,
2930 * conflict with normal locks is unlikely since pid space and
2931 * pointer space for current->files are not intersecting */
2932 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2933 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* Map fcntl lock types to LDLM modes: RDLCK->PR, WRLCK->PW, UNLCK->NL. */
2937 einfo.ei_mode = LCK_PR;
2940 /* An unlock request may or may not have any relation to
2941 * existing locks so we may not be able to pass a lock handle
2942 * via a normal ldlm_lock_cancel() request. The request may even
2943 * unlock a byte range in the middle of an existing lock. In
2944 * order to process an unlock request we need all of the same
2945 * information that is given with a normal read or write record
2946 * lock request. To avoid creating another ldlm unlock (cancel)
2947 * message we'll treat a LCK_NL flock request as an unlock. */
2948 einfo.ei_mode = LCK_NL;
2951 einfo.ei_mode = LCK_PW;
2954 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* Non-blocking set -> BLOCK_NOWAIT; F_GETLK family -> TEST_LOCK. */
2969 flags = LDLM_FL_BLOCK_NOWAIT;
2975 flags = LDLM_FL_TEST_LOCK;
2978 CERROR("unknown fcntl lock command: %d\n", cmd);
2982 /* Save the old mode so that if the mode in the lock changes we
2983 * can decrement the appropriate reader or writer refcount. */
2984 file_lock->fl_type = einfo.ei_mode;
2986 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2987 LUSTRE_OPC_ANY, NULL);
2988 if (IS_ERR(op_data))
2989 RETURN(PTR_ERR(op_data));
2991 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
2992 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
2993 flock.l_flock.pid, flags, einfo.ei_mode,
2994 flock.l_flock.start, flock.l_flock.end);
/* The actual cluster-wide enqueue on the MDT. */
2996 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data, &lockh,
2999 /* Restore the file lock type if not TEST lock. */
3000 if (!(flags & LDLM_FL_TEST_LOCK))
3001 file_lock->fl_type = fl_type;
/* Mirror the server-side result into the local VFS lock lists. */
3003 if ((file_lock->fl_flags & FL_FLOCK) &&
3004 (rc == 0 || file_lock->fl_type == F_UNLCK))
3005 rc2 = flock_lock_file_wait(file, file_lock);
3006 if ((file_lock->fl_flags & FL_POSIX) &&
3007 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3008 !(flags & LDLM_FL_TEST_LOCK))
3009 rc2 = posix_lock_file_wait(file, file_lock);
/* Local bookkeeping failed: undo the server lock with an NL enqueue. */
3011 if (rc2 && file_lock->fl_type != F_UNLCK) {
3012 einfo.ei_mode = LCK_NL;
3013 md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data,
3018 ll_finish_md_op_data(op_data);
/*
 * Look up the FID of @name (length @namelen) under directory @parent by
 * issuing a getattr-by-name to the MDT; the FID is extracted from the
 * reply's MDT body. NOTE(review): the line copying body->fid1 (or
 * equivalent) into *fid is not visible in this chunk.
 */
3023 int ll_get_fid_by_name(struct inode *parent, const char *name,
3024 int namelen, struct lu_fid *fid)
3026 struct md_op_data *op_data = NULL;
3027 struct mdt_body *body;
3028 struct ptlrpc_request *req;
3032 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3033 LUSTRE_OPC_ANY, NULL);
3034 if (IS_ERR(op_data))
3035 RETURN(PTR_ERR(op_data));
/* Only the FID is needed from the reply. */
3037 op_data->op_valid = OBD_MD_FLID;
3038 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3039 ll_finish_md_op_data(op_data);
3043 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3045 GOTO(out_req, rc = -EFAULT);
3049 ptlrpc_req_finished(req);
/*
 * Migrate directory entry @name under @parent to MDT @mdtidx using the
 * CLI_MIGRATE rename path. The child's FID is resolved from the dcache
 * when possible (invalidating local aliases), otherwise via
 * ll_get_fid_by_name(); migration is skipped if the object already
 * lives on the target MDT.
 */
3053 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3054 const char *name, int namelen)
3056 struct dentry *dchild = NULL;
3057 struct inode *child_inode = NULL;
3058 struct md_op_data *op_data;
3059 struct ptlrpc_request *request = NULL;
3064 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3065 name, PFID(ll_inode2fid(parent)), mdtidx);
3067 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3068 0, LUSTRE_OPC_ANY, NULL);
3069 if (IS_ERR(op_data))
3070 RETURN(PTR_ERR(op_data));
3072 /* Get child FID first */
3073 qstr.hash = full_name_hash(name, namelen);
3076 dchild = d_lookup(file->f_dentry, &qstr);
3077 if (dchild != NULL && dchild->d_inode != NULL) {
3078 op_data->op_fid3 = *ll_inode2fid(dchild->d_inode);
3079 if (dchild->d_inode != NULL) {
/* Hold a reference across the migration; aliases are dropped so
 * no stale dentries survive the FID move. */
3080 child_inode = igrab(dchild->d_inode);
3081 ll_invalidate_aliases(child_inode);
/* Not in dcache: ask the MDT for the FID. */
3085 rc = ll_get_fid_by_name(parent, name, namelen,
3091 if (!fid_is_sane(&op_data->op_fid3)) {
3092 CERROR("%s: migrate %s , but fid "DFID" is insane\n",
3093 ll_get_fsname(parent->i_sb, NULL, 0), name,
3094 PFID(&op_data->op_fid3));
3095 GOTO(out_free, rc = -EINVAL);
3098 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
/* Already on the requested MDT — nothing to do. */
3103 CDEBUG(D_INFO, "%s:"DFID" is already on MDT%d.\n", name,
3104 PFID(&op_data->op_fid3), mdtidx);
3105 GOTO(out_free, rc = 0);
/* Migration is implemented as a same-name rename flagged CLI_MIGRATE. */
3108 op_data->op_mds = mdtidx;
3109 op_data->op_cli_flags = CLI_MIGRATE;
3110 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3111 namelen, name, namelen, &request);
3113 ll_update_times(request, parent);
3115 ptlrpc_req_finished(request);
3120 if (child_inode != NULL) {
/* Old inode is dead after migration; drop its link count. */
3121 clear_nlink(child_inode);
3125 ll_finish_md_op_data(op_data);
/* Stub for the -o noflock mount option: both .flock and .lock point
 * here to refuse file locking (body not visible in this chunk;
 * presumably returns -ENOSYS per the comment above the noflock fops). */
3130 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3138 * test if some locks matching bits and l_req_mode are acquired
3139 * - bits can be in different locks
3140 * - if found clear the common lock bits in *bits
3141 * - the bits not found, are kept in *bits
3143 * \param bits [IN] searched lock bits [IN]
3144 * \param l_req_mode [IN] searched lock mode
3145 * \retval boolean, true iff all bits are found
3147 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
3149 struct lustre_handle lockh;
3150 ldlm_policy_data_t policy;
/* LCK_MINMODE means "any mode": match against all four normal modes. */
3151 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
3152 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
3161 fid = &ll_i2info(inode)->lli_fid;
3162 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3163 ldlm_lockname[mode]);
/* TEST_LOCK: match without taking a reference on the found lock. */
3165 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* Probe each requested inodebit individually. */
3166 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3167 policy.l_inodebits.bits = *bits & (1 << i);
3168 if (policy.l_inodebits.bits == 0)
3171 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3172 &policy, mode, &lockh)) {
3173 struct ldlm_lock *lock;
3175 lock = ldlm_handle2lock(&lockh);
/* Clear every bit the matched lock covers, not just the probe bit. */
3178 ~(lock->l_policy_data.l_inodebits.bits);
3179 LDLM_LOCK_PUT(lock);
3181 *bits &= ~policy.l_inodebits.bits;
/*
 * Match (and take a reference on) a cached MDT inodebits lock covering
 * @bits in one of the modes in @mode; the handle is returned in @lockh.
 * Returns the matched mode, or 0 when no suitable lock is cached.
 */
3188 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
3189 struct lustre_handle *lockh, __u64 flags,
3192 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
3197 fid = &ll_i2info(inode)->lli_fid;
3198 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3200 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3201 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Common tail for inode revalidation: -ENOENT on a non-regular,
 * non-directory inode is translated to success (the object was simply
 * unlinked); other errors are logged, rate-limited for the expected
 * -EACCES/-EIDRM cases.
 */
3206 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3208 /* Already unlinked. Just update nlink and return success */
3209 if (rc == -ENOENT) {
3211 /* This path cannot be hit for regular files unless in
3212 * case of obscure races, so no need to to validate
3214 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3216 } else if (rc != 0) {
3217 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3218 "%s: revalidate FID "DFID" error: rc = %d\n",
3219 ll_get_fsname(inode->i_sb, NULL, 0),
3220 PFID(ll_inode2fid(inode)), rc);
/*
 * Refresh the inode's metadata from the MDT. Two strategies:
 *  - OBD_CONNECT_ATTRFID servers: intent-based getattr/lookup by FID
 *    (md_intent_lock), which also refreshes dcache state;
 *  - otherwise: plain md_getattr(), but only when no matching ibits
 *    lock is already cached locally (ll_have_md_lock).
 */
3226 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3228 struct inode *inode = dentry->d_inode;
3229 struct ptlrpc_request *req = NULL;
3230 struct obd_export *exp;
3234 LASSERT(inode != NULL);
3236 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3237 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3239 exp = ll_i2mdexp(inode);
3241 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3242 * But under CMD case, it caused some lock issues, should be fixed
3243 * with new CMD ibits lock. See bug 12718 */
3244 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3245 struct lookup_intent oit = { .it_op = IT_GETATTR };
3246 struct md_op_data *op_data;
/* Pure name check: downgrade the intent to IT_LOOKUP. */
3248 if (ibits == MDS_INODELOCK_LOOKUP)
3249 oit.it_op = IT_LOOKUP;
3251 /* Call getattr by fid, so do not provide name at all. */
3252 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3253 dentry->d_inode, NULL, 0, 0,
3254 LUSTRE_OPC_ANY, NULL);
3255 if (IS_ERR(op_data))
3256 RETURN(PTR_ERR(op_data));
/* M_CHECK_STALE is only set for the duration of the intent call. */
3258 oit.it_create_mode |= M_CHECK_STALE;
3259 rc = md_intent_lock(exp, op_data, &oit, &req,
3260 &ll_md_blocking_ast, 0);
3261 ll_finish_md_op_data(op_data);
3262 oit.it_create_mode &= ~M_CHECK_STALE;
3264 rc = ll_inode_revalidate_fini(inode, rc);
3268 rc = ll_revalidate_it_finish(req, &oit, dentry);
3270 ll_intent_release(&oit);
3274 /* Unlinked? Unhash dentry, so it is not picked up later by
3275 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3276 here to preserve get_cwd functionality on 2.6.
3278 if (!dentry->d_inode->i_nlink)
3279 d_lustre_invalidate(dentry, 0);
3281 ll_lookup_finish_locks(&oit, dentry);
3282 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3283 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3284 obd_valid valid = OBD_MD_FLGETATTR;
3285 struct md_op_data *op_data;
/* Regular files additionally need striping EA data in the reply. */
3288 if (S_ISREG(inode->i_mode)) {
3289 rc = ll_get_default_mdsize(sbi, &ealen);
3292 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3295 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3296 0, ealen, LUSTRE_OPC_ANY,
3298 if (IS_ERR(op_data))
3299 RETURN(PTR_ERR(op_data));
3301 op_data->op_valid = valid;
3302 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
3303 * capa for this inode. Because we only keep capas of dirs
3305 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3306 ll_finish_md_op_data(op_data);
3308 rc = ll_inode_revalidate_fini(inode, rc);
3312 rc = ll_prep_inode(&inode, req, NULL, NULL);
3315 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge the per-stripe MDT attributes
 * (size, nlink, a/m/ctime) into the client inode info via
 * md_merge_attr() on the cached lsm_md.
 */
3319 static int ll_merge_md_attr(struct inode *inode)
3321 struct cl_attr attr = { 0 };
3324 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3325 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
/* Cache merged results; ll_getattr() reads these for striped dirs. */
3330 ll_i2info(inode)->lli_stripe_dir_size = attr.cat_size;
3331 ll_i2info(inode)->lli_stripe_dir_nlink = attr.cat_nlink;
3333 ll_i2info(inode)->lli_lvb.lvb_atime = attr.cat_atime;
3334 ll_i2info(inode)->lli_lvb.lvb_mtime = attr.cat_mtime;
3335 ll_i2info(inode)->lli_lvb.lvb_ctime = attr.cat_ctime;
/*
 * Full revalidation: refresh MDT attributes via __ll_inode_revalidate(),
 * then update timestamps and — for regular files — glimpse the OSTs for
 * an accurate size, unless an HSM restore is in progress.
 */
3341 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3343 struct inode *inode = dentry->d_inode;
3347 rc = __ll_inode_revalidate(dentry, ibits);
3351 /* if object isn't regular file, don't validate size */
3352 if (!S_ISREG(inode->i_mode)) {
/* Striped directories need their stripes' attributes merged first. */
3353 if (S_ISDIR(inode->i_mode) &&
3354 ll_i2info(inode)->lli_lsm_md != NULL) {
3355 rc = ll_merge_md_attr(inode);
3360 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
3361 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
3362 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
3364 /* In case of restore, the MDT has the right size and has
3365 * already send it back without granting the layout lock,
3366 * inode is up-to-date so glimpse is useless.
3367 * Also to glimpse we need the layout, in case of a running
3368 * restore the MDT holds the layout lock so the glimpse will
3369 * block up to the end of restore (getattr will block)
3371 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3372 rc = ll_glimpse_size(inode);
/*
 * getattr() entry point: revalidate UPDATE|LOOKUP ibits, then copy the
 * inode attributes into @stat. Striped directories report the merged
 * size/nlink cached by ll_merge_md_attr() instead of the local inode's.
 */
3377 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3379 struct inode *inode = de->d_inode;
3380 struct ll_sb_info *sbi = ll_i2sbi(inode);
3381 struct ll_inode_info *lli = ll_i2info(inode);
3384 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3385 MDS_INODELOCK_LOOKUP);
3386 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3391 stat->dev = inode->i_sb->s_dev;
/* 32-bit userspace gets a FID-derived 32-bit inode number. */
3392 if (ll_need_32bit_api(sbi))
3393 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3395 stat->ino = inode->i_ino;
3396 stat->mode = inode->i_mode;
3397 stat->uid = inode->i_uid;
3398 stat->gid = inode->i_gid;
3399 stat->rdev = inode->i_rdev;
3400 stat->atime = inode->i_atime;
3401 stat->mtime = inode->i_mtime;
3402 stat->ctime = inode->i_ctime;
3403 stat->blksize = 1 << inode->i_blkbits;
3404 stat->blocks = inode->i_blocks;
3406 if (S_ISDIR(inode->i_mode) &&
3407 ll_i2info(inode)->lli_lsm_md != NULL) {
3408 stat->nlink = lli->lli_stripe_dir_nlink;
3409 stat->size = lli->lli_stripe_dir_size;
3411 stat->nlink = inode->i_nlink;
3412 stat->size = i_size_read(inode);
/*
 * fiemap inode operation: marshal the kernel's fiemap_extent_info into
 * a Lustre ll_user_fiemap buffer, run ll_do_fiemap(), and copy the
 * mapped extents back. The buffer is sized for fi_extents_max extents.
 */
3418 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3419 __u64 start, __u64 len)
3423 struct ll_user_fiemap *fiemap;
3424 unsigned int extent_count = fieinfo->fi_extents_max;
3426 num_bytes = sizeof(*fiemap) + (extent_count *
3427 sizeof(struct ll_fiemap_extent));
3428 OBD_ALLOC_LARGE(fiemap, num_bytes);
3433 fiemap->fm_flags = fieinfo->fi_flags;
3434 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3435 fiemap->fm_start = start;
3436 fiemap->fm_length = len;
/* Seed only the first extent from userspace (continuation support). */
3437 if (extent_count > 0)
3438 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3439 sizeof(struct ll_fiemap_extent));
3441 rc = ll_do_fiemap(inode, fiemap, num_bytes);
/* Copy results back: flags, mapped count, and the extent array. */
3443 fieinfo->fi_flags = fiemap->fm_flags;
3444 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3445 if (extent_count > 0)
3446 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3447 fiemap->fm_mapped_extents *
3448 sizeof(struct ll_fiemap_extent));
3450 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * get_acl inode operation: return a duplicated reference to the cached
 * POSIX ACL under lli_lock; the VFS releases the refcount.
 */
3454 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3456 struct ll_inode_info *lli = ll_i2info(inode);
3457 struct posix_acl *acl = NULL;
3460 spin_lock(&lli->lli_lock);
3461 /* VFS' acl_permission_check->check_acl will release the refcount */
3462 acl = posix_acl_dup(lli->lli_posix_acl);
3463 spin_unlock(&lli->lli_lock);
/*
 * ACL callback for kernels whose generic_permission() takes a check_acl
 * function pointer (i.e. lacks the 2-arg form). Evaluates the cached
 * POSIX ACL against @mask; without CONFIG_FS_POSIX_ACL it is a stub.
 */
3468 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3470 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3471 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3473 ll_check_acl(struct inode *inode, int mask)
3476 # ifdef CONFIG_FS_POSIX_ACL
3477 struct posix_acl *acl;
3481 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
/* RCU walk cannot block on the ACL spinlock; bail out. */
3482 if (flags & IPERM_FLAG_RCU)
3485 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3490 rc = posix_acl_permission(inode, acl, mask);
3491 posix_acl_release(acl);
3494 # else /* !CONFIG_FS_POSIX_ACL */
3496 # endif /* CONFIG_FS_POSIX_ACL */
3498 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * permission() inode operation (three kernel-API variants). Revalidates
 * the root inode on first access, defers remote-client checks to
 * lustre_check_remote_perm(), otherwise falls through to the generic
 * permission check with ll_check_acl.
 */
3500 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3501 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3503 # ifdef HAVE_INODE_PERMISION_2ARGS
3504 int ll_inode_permission(struct inode *inode, int mask)
3506 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
/* RCU-walk mode cannot issue RPCs; ask the VFS to retry in ref-walk. */
3513 #ifdef MAY_NOT_BLOCK
3514 if (mask & MAY_NOT_BLOCK)
3516 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3517 if (flags & IPERM_FLAG_RCU)
3521 /* as root inode are NOT getting validated in lookup operation,
3522 * need to do it before permission check. */
3524 if (inode == inode->i_sb->s_root->d_inode) {
3525 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3526 MDS_INODELOCK_LOOKUP);
3531 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3532 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3534 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
3535 return lustre_check_remote_perm(inode, mask);
3537 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
3538 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3543 /* -o localflock - only provides locally consistent flock locks */
/* Default file_operations: no .flock/.lock entries, so the kernel's
 * built-in (node-local) flock handling applies. */
3544 struct file_operations ll_file_operations = {
3545 .read = ll_file_read,
3546 .aio_read = ll_file_aio_read,
3547 .write = ll_file_write,
3548 .aio_write = ll_file_aio_write,
3549 .unlocked_ioctl = ll_file_ioctl,
3550 .open = ll_file_open,
3551 .release = ll_file_release,
3552 .mmap = ll_file_mmap,
3553 .llseek = ll_file_seek,
3554 .splice_read = ll_file_splice_read,
/* file_operations for -o flock: identical to the default set, plus
 * cluster-coherent locking through ll_file_flock for both flock(2)
 * and fcntl(2) record locks. */
3559 struct file_operations ll_file_operations_flock = {
3560 .read = ll_file_read,
3561 .aio_read = ll_file_aio_read,
3562 .write = ll_file_write,
3563 .aio_write = ll_file_aio_write,
3564 .unlocked_ioctl = ll_file_ioctl,
3565 .open = ll_file_open,
3566 .release = ll_file_release,
3567 .mmap = ll_file_mmap,
3568 .llseek = ll_file_seek,
3569 .splice_read = ll_file_splice_read,
3572 .flock = ll_file_flock,
3573 .lock = ll_file_flock
3576 /* These are for -o noflock - to return ENOSYS on flock calls */
3577 struct file_operations ll_file_operations_noflock = {
3578 .read = ll_file_read,
3579 .aio_read = ll_file_aio_read,
3580 .write = ll_file_write,
3581 .aio_write = ll_file_aio_write,
3582 .unlocked_ioctl = ll_file_ioctl,
3583 .open = ll_file_open,
3584 .release = ll_file_release,
3585 .mmap = ll_file_mmap,
3586 .llseek = ll_file_seek,
3587 .splice_read = ll_file_splice_read,
/* Locking entry points are stubbed out with ll_file_noflock. */
3590 .flock = ll_file_noflock,
3591 .lock = ll_file_noflock
/* inode_operations shared by regular files regardless of lock mode. */
3594 struct inode_operations ll_file_inode_operations = {
3595 .setattr = ll_setattr,
3596 .getattr = ll_getattr,
3597 .permission = ll_inode_permission,
3598 .setxattr = ll_setxattr,
3599 .getxattr = ll_getxattr,
3600 .listxattr = ll_listxattr,
3601 .removexattr = ll_removexattr,
3602 .fiemap = ll_fiemap,
/* .get_acl only exists on kernels with that inode operation. */
3603 #ifdef HAVE_IOP_GET_ACL
3604 .get_acl = ll_get_acl,
3608 /* dynamic ioctl number support routins */
/* Registry of dynamically registered ioctl handlers: a list of
 * llioc_data entries protected by a read/write semaphore. */
3609 static struct llioc_ctl_data {
3610 struct rw_semaphore ioc_sem;
3611 cfs_list_t ioc_head;
3613 __RWSEM_INITIALIZER(llioc.ioc_sem),
3614 CFS_LIST_HEAD_INIT(llioc.ioc_head)
/* One registered ioctl handler: callback plus the list of ioctl
 * command numbers it serves (flexible trailing array iocd_cmd). */
3619 cfs_list_t iocd_list;
3620 unsigned int iocd_size;
3621 llioc_callback_t iocd_cb;
3622 unsigned int iocd_count;
3623 unsigned int iocd_cmd[0];
/*
 * Register a dynamic ioctl handler @cb for @count command numbers in
 * @cmd. Returns an opaque cookie (the allocation itself) used later by
 * ll_iocontrol_unregister(); NULL-equivalent return paths are elided
 * from this chunk.
 */
3626 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3629 struct llioc_data *in_data = NULL;
3632 if (cb == NULL || cmd == NULL ||
3633 count > LLIOC_MAX_CMD || count < 0)
/* Size includes the flexible iocd_cmd[] array. */
3636 size = sizeof(*in_data) + count * sizeof(unsigned int);
3637 OBD_ALLOC(in_data, size);
3638 if (in_data == NULL)
3641 memset(in_data, 0, sizeof(*in_data));
3642 in_data->iocd_size = size;
3643 in_data->iocd_cb = cb;
3644 in_data->iocd_count = count;
3645 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3647 down_write(&llioc.ioc_sem);
3648 cfs_list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3649 up_write(&llioc.ioc_sem);
/*
 * Remove and free the handler previously returned by
 * ll_iocontrol_register(); warns when @magic matches no entry.
 */
3654 void ll_iocontrol_unregister(void *magic)
3656 struct llioc_data *tmp;
3661 down_write(&llioc.ioc_sem);
3662 cfs_list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3664 unsigned int size = tmp->iocd_size;
3666 cfs_list_del(&tmp->iocd_list);
/* Drop the semaphore before freeing; the entry is already unlinked. */
3667 up_write(&llioc.ioc_sem);
3669 OBD_FREE(tmp, size);
3673 up_write(&llioc.ioc_sem);
3675 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3678 EXPORT_SYMBOL(ll_iocontrol_register);
3679 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch @cmd to the registered dynamic ioctl handlers in order,
 * stopping when one returns LLIOC_STOP; the handler's status is
 * returned through @rcp (default -EINVAL when nobody claims it).
 */
3681 static enum llioc_iter
3682 ll_iocontrol_call(struct inode *inode, struct file *file,
3683 unsigned int cmd, unsigned long arg, int *rcp)
3685 enum llioc_iter ret = LLIOC_CONT;
3686 struct llioc_data *data;
3687 int rc = -EINVAL, i;
3689 down_read(&llioc.ioc_sem);
3690 cfs_list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3691 for (i = 0; i < data->iocd_count; i++) {
3692 if (cmd != data->iocd_cmd[i])
3695 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3699 if (ret == LLIOC_STOP)
3702 up_read(&llioc.ioc_sem);
/*
 * Push a layout configuration @conf into the cl_object stack via
 * cl_conf_set(). For OBJECT_CONF_SET the layout lock is made matchable
 * only after the layout is applied, and the cached layout generation is
 * updated from the new LSM.
 */
3709 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3711 struct ll_inode_info *lli = ll_i2info(inode);
3712 struct cl_env_nest nest;
3717 if (lli->lli_clob == NULL)
3720 env = cl_env_nested_get(&nest);
3722 RETURN(PTR_ERR(env));
3724 result = cl_conf_set(env, lli->lli_clob, conf);
3725 cl_env_nested_put(&nest, env);
3727 if (conf->coc_opc == OBJECT_CONF_SET) {
3728 struct ldlm_lock *lock = conf->coc_lock;
3730 LASSERT(lock != NULL);
3731 LASSERT(ldlm_has_layout(lock));
3733 struct lustre_md *md = conf->u.coc_md;
3734 __u32 gen = LL_LAYOUT_GEN_EMPTY;
3736 /* it can only be allowed to match after layout is
3737 * applied to inode otherwise false layout would be
3738 * seen. Applying layout shoud happen before dropping
3739 * the intent lock. */
3740 ldlm_lock_allow_match(lock);
3742 lli->lli_has_smd = lsm_has_objects(md->lsm);
3743 if (md->lsm != NULL)
3744 gen = md->lsm->lsm_layout_gen;
3747 DFID ": layout version change: %u -> %u\n",
3748 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3750 ll_layout_version_set(lli, gen);
3756 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
3757 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3760 struct ll_sb_info *sbi = ll_i2sbi(inode);
3761 struct obd_capa *oc;
3762 struct ptlrpc_request *req;
3763 struct mdt_body *body;
3770 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3771 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
3772 lock->l_lvb_data, lock->l_lvb_len);
/* LVB already populated and ready: nothing to fetch. */
3774 if ((lock->l_lvb_data != NULL) && ldlm_is_lvb_ready(lock))
3777 /* if layout lock was granted right away, the layout is returned
3778 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3779 * blocked and then granted via completion ast, we have to fetch
3780 * layout here. Please note that we can't use the LVB buffer in
3781 * completion AST because it doesn't have a large enough buffer */
3782 oc = ll_mdscapa_get(inode);
3783 rc = ll_get_default_mdsize(sbi, &lmmsize);
/* Fetch the striping EA (trusted.lov) directly from the MDT. */
3785 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3786 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3792 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3794 GOTO(out, rc = -EPROTO);
3796 lmmsize = body->eadatasize;
3797 if (lmmsize == 0) /* empty layout */
3800 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3802 GOTO(out, rc = -EFAULT);
/* Install a private copy as the lock's LVB under the resource lock,
 * freeing any stale buffer first. */
3804 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3805 if (lvbdata == NULL)
3806 GOTO(out, rc = -ENOMEM);
3808 memcpy(lvbdata, lmm, lmmsize);
3809 lock_res_and_lock(lock);
3810 if (lock->l_lvb_data != NULL)
3811 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3813 lock->l_lvb_data = lvbdata;
3814 lock->l_lvb_len = lmmsize;
3815 unlock_res_and_lock(lock);
3820 ptlrpc_req_finished(req);
3825 * Apply the layout to the inode. Layout lock is held and will be released
/*
 * (continued) Fetches the layout into the lock's LVB if needed, unpacks
 * it into an LSM, applies it to the cl_object via ll_layout_conf(), and
 * reports the resulting layout generation in *gen. On -EBUSY the caller
 * must wait for in-flight IO (OBJECT_CONF_WAIT) and retry.
 */
3828 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3829 struct inode *inode, __u32 *gen, bool reconf)
3831 struct ll_inode_info *lli = ll_i2info(inode);
3832 struct ll_sb_info *sbi = ll_i2sbi(inode);
3833 struct ldlm_lock *lock;
3834 struct lustre_md md = { NULL };
3835 struct cl_object_conf conf;
3838 bool wait_layout = false;
3841 LASSERT(lustre_handle_is_used(lockh));
3843 lock = ldlm_handle2lock(lockh);
3844 LASSERT(lock != NULL);
3845 LASSERT(ldlm_has_layout(lock));
3847 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured: %d\n",
3848 PFID(&lli->lli_fid), inode, reconf);
3850 /* in case this is a caching lock and reinstate with new inode */
3851 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3853 lock_res_and_lock(lock);
3854 lvb_ready = ldlm_is_lvb_ready(lock);
3855 unlock_res_and_lock(lock);
3856 /* checking lvb_ready is racy but this is okay. The worst case is
3857 * that multi processes may configure the file on the same time. */
3859 if (lvb_ready || !reconf) {
3862 /* layout_gen must be valid if layout lock is not
3863 * cancelled and stripe has already set */
3864 *gen = ll_layout_version_get(lli);
3870 rc = ll_layout_fetch(inode, lock);
3874 /* for layout lock, lmm is returned in lock's lvb.
3875 * lvb_data is immutable if the lock is held so it's safe to access it
3876 * without res lock. See the description in ldlm_lock_decref_internal()
3877 * for the condition to free lvb_data of layout lock */
3878 if (lock->l_lvb_data != NULL) {
3879 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3880 lock->l_lvb_data, lock->l_lvb_len);
3882 *gen = LL_LAYOUT_GEN_EMPTY;
3884 *gen = md.lsm->lsm_layout_gen;
3887 CERROR("%s: file "DFID" unpackmd error: %d\n",
3888 ll_get_fsname(inode->i_sb, NULL, 0),
3889 PFID(&lli->lli_fid), rc);
3895 /* set layout to file. Unlikely this will fail as old layout was
3896 * surely eliminated */
3897 memset(&conf, 0, sizeof conf);
3898 conf.coc_opc = OBJECT_CONF_SET;
3899 conf.coc_inode = inode;
3900 conf.coc_lock = lock;
3901 conf.u.coc_md = &md;
3902 rc = ll_layout_conf(inode, &conf);
3905 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3907 /* refresh layout failed, need to wait */
3908 wait_layout = rc == -EBUSY;
3912 LDLM_LOCK_PUT(lock);
3913 ldlm_lock_decref(lockh, mode);
3915 /* wait for IO to complete if it's still being used. */
3917 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
3918 ll_get_fsname(inode->i_sb, NULL, 0),
3919 PFID(&lli->lli_fid), inode);
3921 memset(&conf, 0, sizeof conf);
3922 conf.coc_opc = OBJECT_CONF_WAIT;
3923 conf.coc_inode = inode;
3924 rc = ll_layout_conf(inode, &conf);
3928 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
3929 ll_get_fsname(inode->i_sb, NULL, 0),
3930 PFID(&lli->lli_fid), rc);
3936 * This function checks if there exists a LAYOUT lock on the client side,
3937 * or enqueues it if it doesn't have one in cache.
3939 * This function will not hold layout lock so it may be revoked any time after
3940 * this function returns. Any operations depend on layout should be redone
3943 * This function should be called before lov_io_init() to get an uptodate
3944 * layout version, the caller should save the version number and after IO
3945 * is finished, this function should be called again to verify that layout
3946 * is not changed during IO time.
3948 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3950 struct ll_inode_info *lli = ll_i2info(inode);
3951 struct ll_sb_info *sbi = ll_i2sbi(inode);
3952 struct md_op_data *op_data;
3953 struct lookup_intent it;
3954 struct lustre_handle lockh;
3956 struct ldlm_enqueue_info einfo = {
3957 .ei_type = LDLM_IBITS,
3959 .ei_cb_bl = &ll_md_blocking_ast,
3960 .ei_cb_cp = &ldlm_completion_ast,
/* Fast path: feature disabled or a valid generation is already cached. */
3965 *gen = ll_layout_version_get(lli);
3966 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
3970 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3971 LASSERT(S_ISREG(inode->i_mode));
3973 /* take layout lock mutex to enqueue layout lock exclusively. */
3974 mutex_lock(&lli->lli_layout_mutex);
3977 /* mostly layout lock is caching on the local side, so try to match
3978 * it before grabbing layout lock mutex. */
3979 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3980 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3981 if (mode != 0) { /* hit cached lock */
3982 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3986 mutex_unlock(&lli->lli_layout_mutex);
3990 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3991 0, 0, LUSTRE_OPC_ANY, NULL);
3992 if (IS_ERR(op_data)) {
3993 mutex_unlock(&lli->lli_layout_mutex);
3994 RETURN(PTR_ERR(op_data));
3997 /* have to enqueue one */
3998 memset(&it, 0, sizeof(it));
3999 it.it_op = IT_LAYOUT;
4000 lockh.cookie = 0ULL;
4002 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)\n",
4003 ll_get_fsname(inode->i_sb, NULL, 0),
4004 PFID(&lli->lli_fid), inode);
4006 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, &it, op_data, &lockh, 0);
/* The intent's request is not needed once the lock state is captured. */
4007 if (it.d.lustre.it_data != NULL)
4008 ptlrpc_req_finished(it.d.lustre.it_data);
4009 it.d.lustre.it_data = NULL;
4011 ll_finish_md_op_data(op_data);
/* Transfer the lock reference from the intent to the local handle. */
4013 mode = it.d.lustre.it_lock_mode;
4014 it.d.lustre.it_lock_mode = 0;
4015 ll_intent_drop_lock(&it);
4018 /* set lock data in case this is a new lock */
4019 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4020 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
4024 mutex_unlock(&lli->lli_layout_mutex);
4030 * This function sends a restore request to the MDT
4032 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4034 struct hsm_user_request *hur;
4038 len = sizeof(struct hsm_user_request) +
4039 sizeof(struct hsm_user_item);
4040 OBD_ALLOC(hur, len);
4044 hur->hur_request.hr_action = HUA_RESTORE;
4045 hur->hur_request.hr_archive_id = 0;
4046 hur->hur_request.hr_flags = 0;
4047 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4048 sizeof(hur->hur_user_item[0].hui_fid));
4049 hur->hur_user_item[0].hui_extent.offset = offset;
4050 hur->hur_user_item[0].hui_extent.length = length;
4051 hur->hur_request.hr_itemcount = 1;
4052 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,