4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2013, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
51 #include "cl_object.h"
/*
 * Allocate a per-open ll_file_data from the ll_file_data_slab cache
 * (allocated with __GFP_IO) and clear its write-failure flag.
 * NOTE(review): the allocation-failure check and return statement are
 * elided in this extraction -- confirm against the full file.
 */
53 struct ll_file_data *ll_file_data_get(void)
55 struct ll_file_data *fd;
57 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, __GFP_IO);
61 fd->fd_write_failed = false;
/* Return @fd to the ll_file_data_slab cache (counterpart of
 * ll_file_data_get()). */
66 static void ll_file_data_put(struct ll_file_data *fd)
69 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Pack the inode's current attributes (mode, a/m/ctime, size, blocks,
 * ext flags), its IO epoch, an MDS capability, and the open handle @fh
 * into @op_data for a subsequent MDS request.  If the inode carries the
 * LLIF_DATA_MODIFIED flag, also request the MDS_DATA_MODIFIED bias so
 * the server learns the data was dirtied.
 */
72 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
73 struct lustre_handle *fh)
75 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
76 op_data->op_attr.ia_mode = inode->i_mode;
77 op_data->op_attr.ia_atime = inode->i_atime;
78 op_data->op_attr.ia_mtime = inode->i_mtime;
79 op_data->op_attr.ia_ctime = inode->i_ctime;
80 op_data->op_attr.ia_size = i_size_read(inode);
81 op_data->op_attr_blocks = inode->i_blocks;
/* iattr has no flags field of its own; ll_iattr overlays one here. */
82 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
83 ll_inode_to_ext_flags(inode->i_flags);
84 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
86 op_data->op_handle = *fh;
87 op_data->op_capa1 = ll_mdscapa_get(inode);
89 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
90 op_data->op_bias |= MDS_DATA_MODIFIED;
/*
94 * Closes the IO epoch and packs all the attributes into @op_data for
 * the CLOSE rpc.  Always marks mode/times as valid; size and blocks are
 * added only for a write handle when SoM is unsupported or the inode is
 * not a regular file.  NOTE(review): a branch for non-write handles
 * (line 106) is elided here -- confirm its body in the full file.
 */
97 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
98 struct obd_client_handle *och)
102 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
103 ATTR_MTIME | ATTR_MTIME_SET |
104 ATTR_CTIME | ATTR_CTIME_SET;
106 if (!(och->och_flags & FMODE_WRITE))
109 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
110 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
112 ll_ioepoch_close(inode, op_data, &och, 0);
115 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
116 ll_prep_md_op_data(op_data, inode, NULL, NULL,
117 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Send a CLOSE rpc to the MDS for open handle @och on @inode.
 * If @data_version is non-NULL this is an HSM release close: the
 * MDS_HSM_RELEASE bias, data version and lease handle are packed so
 * the server can atomically verify and release the file data.
 * On a SoM-enabled export a Size-on-MDS setattr may follow the close.
 * NOTE(review): many intermediate lines (error paths, RETURNs, local
 * declarations such as rc/epoch_close) are elided in this extraction.
 */
121 static int ll_close_inode_openhandle(struct obd_export *md_exp,
123 struct obd_client_handle *och,
124 const __u64 *data_version)
126 struct obd_export *exp = ll_i2mdexp(inode);
127 struct md_op_data *op_data;
128 struct ptlrpc_request *req = NULL;
129 struct obd_device *obd = class_exp2obd(exp);
136 * XXX: in case of LMV, is this correct to access
139 CERROR("Invalid MDC connection handle "LPX64"\n",
140 ll_i2mdexp(inode)->exp_handle.h_cookie);
144 OBD_ALLOC_PTR(op_data);
146 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
148 ll_prepare_close(inode, op_data, och);
149 if (data_version != NULL) {
150 /* Pass in data_version implies release. */
151 op_data->op_bias |= MDS_HSM_RELEASE;
152 op_data->op_data_version = *data_version;
153 op_data->op_lease_handle = och->och_lease_handle;
154 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
156 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
157 rc = md_close(md_exp, op_data, och->och_mod, &req);
159 /* This close must have the epoch closed. */
160 LASSERT(epoch_close);
161 /* MDS has instructed us to obtain Size-on-MDS attribute from
162 * OSTs and send setattr to back to MDS. */
163 rc = ll_som_update(inode, op_data);
165 CERROR("inode %lu mdc Size-on-MDS update failed: "
166 "rc = %d\n", inode->i_ino, rc);
170 CERROR("inode %lu mdc close failed: rc = %d\n",
174 /* DATA_MODIFIED flag was successfully sent on close, cancel data
175 * modification flag. */
176 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
177 struct ll_inode_info *lli = ll_i2info(inode);
179 spin_lock(&lli->lli_lock);
180 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
181 spin_unlock(&lli->lli_lock);
185 rc = ll_objects_destroy(req, inode);
187 CERROR("inode %lu ll_objects destroy: rc = %d\n",
/* For HSM release, the server confirms via OBD_MD_FLRELEASED. */
191 if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
192 struct mdt_body *body;
193 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
194 if (!(body->valid & OBD_MD_FLRELEASED))
198 ll_finish_md_op_data(op_data);
/* With SoM and an unfinished epoch on a written regular file,
 * defer the final attribute update via DONE_WRITING. */
202 if (exp_connect_som(exp) && !epoch_close &&
203 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
204 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
206 md_clear_open_replay_data(md_exp, och);
207 /* Free @och if it is not waiting for DONE_WRITING. */
208 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
211 if (req) /* This is close request */
212 ptlrpc_req_finished(req);
/*
 * Actually close the MDS open handle of the given mode (@flags is one
 * of FMODE_WRITE/FMODE_EXEC/FMODE_READ) once no other opener of that
 * mode remains.  The handle pointer and its use count are selected per
 * mode under lli_och_mutex; if users remain, the close is skipped.
 * NOTE(review): the lines detaching *och_p under the mutex are elided
 * in this extraction -- confirm in the full file.
 */
216 int ll_md_real_close(struct inode *inode, int flags)
218 struct ll_inode_info *lli = ll_i2info(inode);
219 struct obd_client_handle **och_p;
220 struct obd_client_handle *och;
225 if (flags & FMODE_WRITE) {
226 och_p = &lli->lli_mds_write_och;
227 och_usecount = &lli->lli_open_fd_write_count;
228 } else if (flags & FMODE_EXEC) {
229 och_p = &lli->lli_mds_exec_och;
230 och_usecount = &lli->lli_open_fd_exec_count;
232 LASSERT(flags & FMODE_READ);
233 och_p = &lli->lli_mds_read_och;
234 och_usecount = &lli->lli_open_fd_read_count;
237 mutex_lock(&lli->lli_och_mutex);
238 if (*och_usecount) { /* There are still users of this handle, so
240 mutex_unlock(&lli->lli_och_mutex);
245 mutex_unlock(&lli->lli_och_mutex);
247 if (och) { /* There might be a race and somebody have freed this och
249 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-file-descriptor close: drop any group lock, clean up a leaked
 * lease and private open handle, decrement the per-mode open counter,
 * and call ll_md_real_close() unless a cached OPEN DLM lock lets us
 * skip the MDS round trip.  Finally detach and free the ll_file_data
 * and close the capability.
 * NOTE(review): several lines (lease_broken declaration, lockmode
 * setup, md_lock_match arguments) are elided in this extraction.
 */
256 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
259 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
260 struct ll_inode_info *lli = ll_i2info(inode);
264 /* clear group lock, if present */
265 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
266 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
268 if (fd->fd_lease_och != NULL) {
271 /* Usually the lease is not released when the
272 * application crashed, we need to release here. */
273 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
274 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
275 PFID(&lli->lli_fid), rc, lease_broken);
277 fd->fd_lease_och = NULL;
280 if (fd->fd_och != NULL) {
281 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
286 /* Let's see if we have good enough OPEN lock on the file and if
287 we can skip talking to MDS */
288 if (file->f_dentry->d_inode) { /* Can this ever be false? */
290 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
291 struct lustre_handle lockh;
292 struct inode *inode = file->f_dentry->d_inode;
293 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
295 mutex_lock(&lli->lli_och_mutex);
296 if (fd->fd_omode & FMODE_WRITE) {
298 LASSERT(lli->lli_open_fd_write_count);
299 lli->lli_open_fd_write_count--;
300 } else if (fd->fd_omode & FMODE_EXEC) {
302 LASSERT(lli->lli_open_fd_exec_count);
303 lli->lli_open_fd_exec_count--;
306 LASSERT(lli->lli_open_fd_read_count);
307 lli->lli_open_fd_read_count--;
309 mutex_unlock(&lli->lli_och_mutex);
/* No matching OPEN lock cached: must tell the MDS now. */
311 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
312 LDLM_IBITS, &policy, lockmode,
314 rc = ll_md_real_close(file->f_dentry->d_inode,
318 CERROR("Releasing a file %p with negative dentry %p. Name %s",
319 file, file->f_dentry, file->f_dentry->d_name.name);
323 LUSTRE_FPRIVATE(file) = NULL;
324 ll_file_data_put(fd);
325 ll_capa_close(inode);
330 /* While this returns an error code, fput() the caller does not, so we need
331 * to make every effort to clean up all of our state here. Also, applications
332 * rarely check close errors and even if an error is returned they will not
333 * re-try the close call.
 *
 * VFS ->release() hook: tear down remote-ACL state for the root inode,
 * stop a statahead thread we own, clear async write errors on regular
 * files, and hand off to ll_md_close().  The root dentry itself only
 * frees its ll_file_data without an MDS close.
 */
335 int ll_file_release(struct inode *inode, struct file *file)
337 struct ll_file_data *fd;
338 struct ll_sb_info *sbi = ll_i2sbi(inode);
339 struct ll_inode_info *lli = ll_i2info(inode);
343 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
344 inode->i_generation, inode);
346 #ifdef CONFIG_FS_POSIX_ACL
/* Remote-client ACL bookkeeping applies only to the root inode. */
347 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
348 inode == inode->i_sb->s_root->d_inode) {
349 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
352 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
353 fd->fd_flags &= ~LL_FILE_RMTACL;
354 rct_del(&sbi->ll_rct, current_pid());
355 et_search_free(&sbi->ll_et, current_pid());
360 if (inode->i_sb->s_root != file->f_dentry)
361 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
362 fd = LUSTRE_FPRIVATE(file);
365 /* The last ref on @file, maybe not the owner pid of statahead.
366 * Different processes can open the same dir, "ll_opendir_key" means:
367 * it is me that should stop the statahead thread. */
368 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
369 lli->lli_opendir_pid != 0)
370 ll_stop_statahead(inode, lli->lli_opendir_key);
372 if (inode->i_sb->s_root == file->f_dentry) {
373 LUSTRE_FPRIVATE(file) = NULL;
374 ll_file_data_put(fd);
378 if (!S_ISDIR(inode->i_mode)) {
379 lov_read_and_clear_async_rc(lli->lli_clob);
380 lli->lli_async_rc = 0;
383 rc = ll_md_close(sbi->ll_md_exp, inode, file);
385 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
386 libcfs_debug_dumplog();
/*
 * Enqueue an IT_OPEN intent lock with the MDS for @file, optionally
 * carrying striping info (@lmm/@lmmsize) when setting stripe
 * parameters.  On success the returned lock data is attached to the
 * inode; the intent's request reference is released before return.
 * NOTE(review): error-path lines (GOTO targets, RETURN) are elided in
 * this extraction.
 */
391 static int ll_intent_file_open(struct file *file, void *lmm,
392 int lmmsize, struct lookup_intent *itp)
394 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
395 struct dentry *parent = file->f_dentry->d_parent;
396 const char *name = file->f_dentry->d_name.name;
397 const int len = file->f_dentry->d_name.len;
398 struct md_op_data *op_data;
399 struct ptlrpc_request *req;
400 __u32 opc = LUSTRE_OPC_ANY;
407 /* Usually we come here only for NFSD, and we want open lock.
408 But we can also get here with pre 2.6.15 patchless kernels, and in
409 that case that lock is also ok */
410 /* We can also get here if there was cached open handle in revalidate_it
411 * but it disappeared while we were getting from there to ll_file_open.
412 * But this means this file was closed and immediately opened which
413 * makes a good candidate for using OPEN lock */
414 /* If lmmsize & lmm are not 0, we are just setting stripe info
415 * parameters. No need for the open lock */
416 if (lmm == NULL && lmmsize == 0) {
417 itp->it_flags |= MDS_OPEN_LOCK;
418 if (itp->it_flags & FMODE_WRITE)
419 opc = LUSTRE_OPC_CREATE;
422 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
423 file->f_dentry->d_inode, name, len,
426 RETURN(PTR_ERR(op_data));
428 itp->it_flags |= MDS_OPEN_BY_FID;
429 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
430 0 /*unused */, &req, ll_md_blocking_ast, 0);
431 ll_finish_md_op_data(op_data);
433 /* reason for keep own exit path - don't flood log
434 * with messages with -ESTALE errors.
 */
436 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
437 it_open_error(DISP_OPEN_OPEN, itp))
439 ll_release_openhandle(file->f_dentry, itp);
443 if (it_disposition(itp, DISP_LOOKUP_NEG))
444 GOTO(out, rc = -ENOENT);
446 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
447 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
448 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
452 rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
453 if (!rc && itp->d.lustre.it_lock_mode)
454 ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
/* Drop the intent's request ref and lock before returning. */
458 ptlrpc_req_finished(itp->d.lustre.it_data);
459 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
460 ll_intent_drop_lock(itp);
/*
466 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
467 * not believe attributes if a few ioepoch holders exist. Attributes for
468 * previous ioepoch if new one is opened are also skipped by MDS.
 */
470 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
/* Only record a non-zero epoch that differs from the current one. */
472 if (ioepoch && lli->lli_ioepoch != ioepoch) {
473 lli->lli_ioepoch = ioepoch;
474 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
475 ioepoch, PFID(&lli->lli_fid));
/*
 * Populate @och from the MDT_BODY of the intent's open reply: file
 * handle, fid, lease-lock cookie, magic and open flags.  Registers the
 * open for replay and returns md_set_open_replay_data()'s result.
 */
479 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
480 struct obd_client_handle *och)
482 struct ptlrpc_request *req = it->d.lustre.it_data;
483 struct mdt_body *body;
485 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
486 och->och_fh = body->handle;
487 och->och_fid = body->fid1;
488 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
489 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
490 och->och_flags = it->it_flags;
492 return md_set_open_replay_data(md_exp, och, req);
/*
 * Finish a local open: when @och is supplied, fill it from the intent
 * reply and open the returned IO epoch on the inode; then attach @fd
 * as the file's private data, initialize readahead state and record
 * the open mode.  NOTE(review): the conditional guarding the och
 * branch and the return are elided in this extraction.
 */
495 int ll_local_open(struct file *file, struct lookup_intent *it,
496 struct ll_file_data *fd, struct obd_client_handle *och)
498 struct inode *inode = file->f_dentry->d_inode;
499 struct ll_inode_info *lli = ll_i2info(inode);
502 LASSERT(!LUSTRE_FPRIVATE(file));
507 struct ptlrpc_request *req = it->d.lustre.it_data;
508 struct mdt_body *body;
511 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
515 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
516 ll_ioepoch_open(lli, body->ioepoch);
519 LUSTRE_FPRIVATE(file) = fd;
520 ll_readahead_init(inode, &fd->fd_ras);
521 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
526 /* Open a file, and (for the very first open) create objects on the OSTs at
527 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
528 * creation or open until ll_lov_setstripe() ioctl is called.
 *
530 * If we already have the stripe MD locally then we don't request it in
531 * md_open(), by passing a lmm_size = 0.
 *
533 * It is up to the application to ensure no other processes open this file
534 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
535 * used. We might be able to avoid races of that sort by getting lli_open_sem
536 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
537 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 *
 * NOTE(review): this extraction elides numerous lines of ll_file_open
 * (retry label, error paths, och allocation checks); comments below are
 * grounded only in the visible code.
 */
539 int ll_file_open(struct inode *inode, struct file *file)
541 struct ll_inode_info *lli = ll_i2info(inode);
542 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
543 .it_flags = file->f_flags };
544 struct obd_client_handle **och_p = NULL;
545 __u64 *och_usecount = NULL;
546 struct ll_file_data *fd;
547 int rc = 0, opendir_set = 0;
550 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
551 inode->i_generation, inode, file->f_flags);
553 it = file->private_data; /* XXX: compat macro */
554 file->private_data = NULL; /* prevent ll_local_open assertion */
556 fd = ll_file_data_get();
558 GOTO(out_openerr, rc = -ENOMEM);
/* First opener of a directory becomes the statahead owner. */
561 if (S_ISDIR(inode->i_mode)) {
562 spin_lock(&lli->lli_sa_lock);
563 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
564 lli->lli_opendir_pid == 0) {
565 lli->lli_opendir_key = fd;
566 lli->lli_opendir_pid = current_pid();
569 spin_unlock(&lli->lli_sa_lock);
/* Root dentry: no MDS open needed, just attach the fd. */
572 if (inode->i_sb->s_root == file->f_dentry) {
573 LUSTRE_FPRIVATE(file) = fd;
577 if (!it || !it->d.lustre.it_disposition) {
578 /* Convert f_flags into access mode. We cannot use file->f_mode,
579 * because everything but O_ACCMODE mask was stripped from
 * it (elided continuation). */
581 if ((oit.it_flags + 1) & O_ACCMODE)
583 if (file->f_flags & O_TRUNC)
584 oit.it_flags |= FMODE_WRITE;
586 /* kernel only call f_op->open in dentry_open. filp_open calls
587 * dentry_open after call to open_namei that checks permissions.
588 * Only nfsd_open call dentry_open directly without checking
589 * permissions and because of that this code below is safe. */
590 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
591 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
593 /* We do not want O_EXCL here, presumably we opened the file
594 * already? XXX - NFS implications? */
595 oit.it_flags &= ~O_EXCL;
597 /* bug20584, if "it_flags" contains O_CREAT, the file will be
598 * created if necessary, then "IT_CREAT" should be set to keep
599 * consistent with it */
600 if (oit.it_flags & O_CREAT)
601 oit.it_op |= IT_CREAT;
607 /* Let's see if we have file open on MDS already. */
608 if (it->it_flags & FMODE_WRITE) {
609 och_p = &lli->lli_mds_write_och;
610 och_usecount = &lli->lli_open_fd_write_count;
611 } else if (it->it_flags & FMODE_EXEC) {
612 och_p = &lli->lli_mds_exec_och;
613 och_usecount = &lli->lli_open_fd_exec_count;
615 och_p = &lli->lli_mds_read_och;
616 och_usecount = &lli->lli_open_fd_read_count;
619 mutex_lock(&lli->lli_och_mutex);
620 if (*och_p) { /* Open handle is present */
621 if (it_disposition(it, DISP_OPEN_OPEN)) {
622 /* Well, there's extra open request that we do not need,
623 let's close it somehow. This will decref request. */
624 rc = it_open_error(DISP_OPEN_OPEN, it);
626 mutex_unlock(&lli->lli_och_mutex);
627 GOTO(out_openerr, rc);
630 ll_release_openhandle(file->f_dentry, it);
/* Reuse the existing MDS handle; no och needed locally. */
634 rc = ll_local_open(file, it, fd, NULL);
637 mutex_unlock(&lli->lli_och_mutex);
638 GOTO(out_openerr, rc);
641 LASSERT(*och_usecount == 0);
642 if (!it->d.lustre.it_disposition) {
643 /* We cannot just request lock handle now, new ELC code
644 means that one of other OPEN locks for this file
645 could be cancelled, and since blocking ast handler
646 would attempt to grab och_mutex as well, that would
647 result in a deadlock */
648 mutex_unlock(&lli->lli_och_mutex);
649 it->it_create_mode |= M_CHECK_STALE;
650 rc = ll_intent_file_open(file, NULL, 0, it);
651 it->it_create_mode &= ~M_CHECK_STALE;
653 GOTO(out_openerr, rc);
657 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
659 GOTO(out_och_free, rc = -ENOMEM);
663 /* md_intent_lock() didn't get a request ref if there was an
664 * open error, so don't do cleanup on the request here
 * (elided continuation). */
666 /* XXX (green): Should not we bail out on any error here, not
667 * just open error? */
668 rc = it_open_error(DISP_OPEN_OPEN, it);
670 GOTO(out_och_free, rc);
672 LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
674 rc = ll_local_open(file, it, fd, *och_p);
676 GOTO(out_och_free, rc);
678 mutex_unlock(&lli->lli_och_mutex);
681 /* Must do this outside lli_och_mutex lock to prevent deadlock where
682 different kind of OPEN lock for this same inode gets cancelled
683 by ldlm_cancel_lru */
684 if (!S_ISREG(inode->i_mode))
685 GOTO(out_och_free, rc);
689 if (!lli->lli_has_smd) {
690 if (file->f_flags & O_LOV_DELAY_CREATE ||
691 !(file->f_mode & FMODE_WRITE)) {
692 CDEBUG(D_INODE, "object creation was delayed\n");
693 GOTO(out_och_free, rc);
696 file->f_flags &= ~O_LOV_DELAY_CREATE;
697 GOTO(out_och_free, rc);
/* Cleanup paths (labels elided in this extraction). */
701 if (och_p && *och_p) {
702 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
703 *och_p = NULL; /* OBD_FREE writes some magic there */
706 mutex_unlock(&lli->lli_och_mutex);
709 if (opendir_set != 0)
710 ll_stop_statahead(inode, lli->lli_opendir_key);
712 ll_file_data_put(fd);
714 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
717 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
718 ptlrpc_req_finished(it->d.lustre.it_data);
719 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for lease locks: on LDLM_CB_BLOCKING, cancel the lock
 * asynchronously (unlike ll_md_blocking_ast, no openhandle handling is
 * done here).  NOTE(review): the LDLM_CB_CANCELING branch body and
 * return are elided in this extraction.
 */
725 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
726 struct ldlm_lock_desc *desc, void *data, int flag)
729 struct lustre_handle lockh;
733 case LDLM_CB_BLOCKING:
734 ldlm_lock2handle(lock, &lockh);
735 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
737 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
741 case LDLM_CB_CANCELING:
/*
749 * Acquire a lease and open the file.
 *
 * @fmode must be exactly FMODE_READ or FMODE_WRITE.  When @file is
 * given, the existing openhandle (sole opener only) is passed to the
 * MDT so the lease attaches to the same owner.  On success returns an
 * och holding the lease; on failure an ERR_PTR.  The lease lock is
 * enqueued with LDLM_FL_NO_LRU | LDLM_FL_EXCL (see comment at the
 * md_intent_lock() call).
 * NOTE(review): several error-path and bookkeeping lines are elided in
 * this extraction.
 */
751 struct obd_client_handle *ll_lease_open(struct inode *inode, struct file *file,
752 fmode_t fmode, __u64 open_flags)
754 struct lookup_intent it = { .it_op = IT_OPEN };
755 struct ll_sb_info *sbi = ll_i2sbi(inode);
756 struct md_op_data *op_data;
757 struct ptlrpc_request *req;
758 struct lustre_handle old_handle = { 0 };
759 struct obd_client_handle *och = NULL;
764 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
765 RETURN(ERR_PTR(-EINVAL));
768 struct ll_inode_info *lli = ll_i2info(inode);
769 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
770 struct obd_client_handle **och_p;
/* Requested mode must be a subset of the file's open mode. */
773 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
774 RETURN(ERR_PTR(-EPERM));
776 /* Get the openhandle of the file */
778 mutex_lock(&lli->lli_och_mutex);
779 if (fd->fd_lease_och != NULL) {
780 mutex_unlock(&lli->lli_och_mutex);
784 if (fd->fd_och == NULL) {
785 if (file->f_mode & FMODE_WRITE) {
786 LASSERT(lli->lli_mds_write_och != NULL);
787 och_p = &lli->lli_mds_write_och;
788 och_usecount = &lli->lli_open_fd_write_count;
790 LASSERT(lli->lli_mds_read_och != NULL);
791 och_p = &lli->lli_mds_read_och;
792 och_usecount = &lli->lli_open_fd_read_count;
794 if (*och_usecount == 1) {
801 mutex_unlock(&lli->lli_och_mutex);
802 if (rc < 0) /* more than 1 opener */
805 LASSERT(fd->fd_och != NULL);
806 old_handle = fd->fd_och->och_fh;
811 RETURN(ERR_PTR(-ENOMEM));
813 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
814 LUSTRE_OPC_ANY, NULL);
816 GOTO(out, rc = PTR_ERR(op_data));
818 /* To tell the MDT this openhandle is from the same owner */
819 op_data->op_handle = old_handle;
821 it.it_flags = fmode | open_flags;
822 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
823 rc = md_intent_lock(sbi->ll_md_exp, op_data, NULL, 0, &it, 0, &req,
824 ll_md_blocking_lease_ast,
825 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
826 * it can be cancelled which may mislead applications that the lease is
 * still valid (elided continuation).
828 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
829 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
830 * doesn't deal with openhandle, so normal openhandle will be leaked. */
831 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
832 ll_finish_md_op_data(op_data);
834 ptlrpc_req_finished(req);
835 it_clear_disposition(&it, DISP_ENQ_COMPLETE);
838 GOTO(out_release_it, rc);
840 if (it_disposition(&it, DISP_LOOKUP_NEG))
841 GOTO(out_release_it, rc = -ENOENT);
843 rc = it_open_error(DISP_OPEN_OPEN, &it);
845 GOTO(out_release_it, rc);
847 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
848 ll_och_fill(sbi->ll_md_exp, &it, och);
850 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
851 GOTO(out_close, rc = -EOPNOTSUPP);
853 /* already get lease, handle lease lock */
854 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
855 if (it.d.lustre.it_lock_mode == 0 ||
856 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
857 /* open lock must return for lease */
858 CERROR(DFID "lease granted but no open lock, %d/%Lu.\n",
859 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
860 it.d.lustre.it_lock_bits);
861 GOTO(out_close, rc = -EPROTO);
864 ll_intent_release(&it);
/* Error path: drop the lease lock, close the handle, release it. */
868 /* Cancel open lock */
869 if (it.d.lustre.it_lock_mode != 0) {
870 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
871 it.d.lustre.it_lock_mode);
872 it.d.lustre.it_lock_mode = 0;
873 och->och_lease_handle.cookie = 0ULL;
875 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
877 CERROR("%s: error closing file "DFID": %d\n",
878 ll_get_fsname(inode->i_sb, NULL, 0),
879 PFID(&ll_i2info(inode)->lli_fid), rc2);
880 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
882 ll_intent_release(&it);
888 EXPORT_SYMBOL(ll_lease_open);
/*
891 * Release lease and close the file.
892 * It will check if the lease has ever broken.
 *
 * Looks up the lease lock to see whether it was already cancelled
 * (lease broken); if not, cancels it explicitly.  The broken state is
 * reported through @lease_broken when non-NULL, and the openhandle is
 * closed via ll_close_inode_openhandle().
 */
894 int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
897 struct ldlm_lock *lock;
898 bool cancelled = true;
902 lock = ldlm_handle2lock(&och->och_lease_handle);
904 lock_res_and_lock(lock);
905 cancelled = ldlm_is_cancel(lock);
906 unlock_res_and_lock(lock);
910 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
911 PFID(&ll_i2info(inode)->lli_fid), cancelled);
/* Lease still intact: cancel it ourselves before closing. */
914 ldlm_cli_cancel(&och->och_lease_handle, 0);
915 if (lease_broken != NULL)
916 *lease_broken = cancelled;
918 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
922 EXPORT_SYMBOL(ll_lease_close);
924 /* Fills the obdo with the attributes for the lsm */
/*
 * Issue an async OST getattr over all stripes of @lsm and collect the
 * result into @obdo.  @dv_flags (LL_DV_RD_FLUSH/LL_DV_WR_FLUSH)
 * request server-side lock flushing for data-version reads; with
 * LL_DV_WR_FLUSH the reply must confirm OBD_FL_FLUSH or an error is
 * returned (error value elided in this extraction).
 */
925 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
926 struct obd_capa *capa, struct obdo *obdo,
927 __u64 ioepoch, int dv_flags)
929 struct ptlrpc_request_set *set;
930 struct obd_info oinfo = { { { 0 } } };
935 LASSERT(lsm != NULL);
939 oinfo.oi_oa->o_oi = lsm->lsm_oi;
940 oinfo.oi_oa->o_mode = S_IFREG;
941 oinfo.oi_oa->o_ioepoch = ioepoch;
942 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
943 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
944 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
945 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
946 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
947 OBD_MD_FLDATAVERSION;
948 oinfo.oi_capa = capa;
949 if (dv_flags & (LL_DV_WR_FLUSH | LL_DV_RD_FLUSH)) {
950 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
951 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
952 if (dv_flags & LL_DV_WR_FLUSH)
953 oinfo.oi_oa->o_flags |= OBD_FL_FLUSH;
956 set = ptlrpc_prep_set();
958 CERROR("can't allocate ptlrpc set\n");
961 rc = obd_getattr_async(exp, &oinfo, set);
963 rc = ptlrpc_set_wait(set);
964 ptlrpc_set_destroy(set);
/* Keep only the attribute bits callers are allowed to trust. */
967 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
968 OBD_MD_FLATIME | OBD_MD_FLMTIME |
969 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
970 OBD_MD_FLDATAVERSION | OBD_MD_FLFLAGS);
971 if (dv_flags & LL_DV_WR_FLUSH &&
972 !(oinfo.oi_oa->o_valid & OBD_MD_FLFLAGS &&
973 oinfo.oi_oa->o_flags & OBD_FL_FLUSH))
/*
980 * Performs the getattr on the inode and updates its fields.
981 * If @sync != 0, perform the getattr under the server-side lock.
 */
983 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
984 __u64 ioepoch, int sync)
986 struct obd_capa *capa = ll_mdscapa_get(inode);
987 struct lov_stripe_md *lsm;
991 lsm = ccc_inode_lsm_get(inode);
992 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
993 capa, obdo, ioepoch, sync ? LL_DV_RD_FLUSH : 0);
996 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
/* Refresh the VFS inode from the merged OST attributes. */
998 obdo_refresh_inode(inode, obdo, obdo->o_valid);
999 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
1000 " blksize %lu\n", POSTID(oi), i_size_read(inode),
1001 (unsigned long long)inode->i_blocks,
1002 (unsigned long)ll_inode_blksize(inode));
1004 ccc_inode_lsm_put(inode, lsm);
/*
 * Merge MDS-provided timestamps (cached in lli_lvb) with attributes
 * obtained from the OSTs via cl_object_attr_get(), keeping the most
 * recent of each, then update i_size/i_blocks and the inode times
 * under the inode size lock.
 */
1008 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
1010 struct ll_inode_info *lli = ll_i2info(inode);
1011 struct cl_object *obj = lli->lli_clob;
1012 struct cl_attr *attr = ccc_env_thread_attr(env);
1018 ll_inode_size_lock(inode);
1019 /* merge timestamps the most recently obtained from mds with
1020 timestamps obtained from osts */
1021 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
1022 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
1023 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
1024 inode_init_lvb(inode, &lvb);
1026 cl_object_attr_lock(obj);
1027 rc = cl_object_attr_get(env, obj, attr);
1028 cl_object_attr_unlock(obj);
/* Keep the newer of the MDS and OST timestamps. */
1031 if (lvb.lvb_atime < attr->cat_atime)
1032 lvb.lvb_atime = attr->cat_atime;
1033 if (lvb.lvb_ctime < attr->cat_ctime)
1034 lvb.lvb_ctime = attr->cat_ctime;
1035 if (lvb.lvb_mtime < attr->cat_mtime)
1036 lvb.lvb_mtime = attr->cat_mtime;
1038 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
1039 PFID(&lli->lli_fid), attr->cat_size);
1040 cl_isize_write_nolock(inode, attr->cat_size);
1042 inode->i_blocks = attr->cat_blocks;
1044 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1045 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1046 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1048 ll_inode_size_unlock(inode);
/*
 * Glimpse helper: fetch current size/blocks/times for @lsm from the
 * OSTs (no capa, no epoch, no flush flags) and copy them into the
 * caller-provided stat structure.
 */
1053 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1056 struct obdo obdo = { 0 };
1059 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1061 st->st_size = obdo.o_size;
1062 st->st_blocks = obdo.o_blocks;
1063 st->st_mtime = obdo.o_mtime;
1064 st->st_atime = obdo.o_atime;
1065 st->st_ctime = obdo.o_ctime;
/*
 * Return true when access-time updates should be suppressed for
 * @file, checking the open flags, inode flags, mount flags and
 * superblock flags in turn (adapted from the kernel's
 * file_accessed()/touch_atime() logic).
 */
1070 static bool file_is_noatime(const struct file *file)
1072 const struct vfsmount *mnt = file->f_path.mnt;
1073 const struct inode *inode = file->f_path.dentry->d_inode;
1075 /* Adapted from file_accessed() and touch_atime().*/
1076 if (file->f_flags & O_NOATIME)
1079 if (inode->i_flags & S_NOATIME)
1082 if (IS_NOATIME(inode))
1085 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1088 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1091 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/*
 * Initialize a cl_io for a read (@write == 0) or write (@write != 0)
 * on @file: propagate O_NONBLOCK/O_APPEND/O_SYNC/O_DIRECT semantics,
 * pick the lock-request mode (never for nolock files, mandatory for
 * append, otherwise "maybe"), and record the noatime decision.
 */
1097 void ll_io_init(struct cl_io *io, const struct file *file, int write)
1099 struct inode *inode = file->f_dentry->d_inode;
1101 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1103 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1104 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1105 file->f_flags & O_DIRECT ||
1108 io->ci_obj = ll_i2info(inode)->lli_clob;
1109 io->ci_lockreq = CILR_MAYBE;
1110 if (ll_file_nolock(file)) {
1111 io->ci_lockreq = CILR_NEVER;
1112 io->ci_no_srvlock = 1;
1113 } else if (file->f_flags & O_APPEND) {
1114 io->ci_lockreq = CILR_MANDATORY;
1117 io->ci_noatime = file_is_noatime(file);
/*
 * Common engine for all read/write paths (normal, sendfile, splice):
 * sets up a cl_io, takes lli_write_mutex for non-grouplock writes (or
 * lli_trunc_sem for reads), runs the cl_io loop, updates *ppos and
 * I/O statistics, and restarts the io once if nothing was transferred
 * but a restart was requested.
 * NOTE(review): case labels for the io-subtype switch and several
 * declarations are elided in this extraction.
 */
1121 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1122 struct file *file, enum cl_io_type iot,
1123 loff_t *ppos, size_t count)
1125 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
1126 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1132 io = ccc_env_thread_io(env);
1133 ll_io_init(io, file, iot == CIT_WRITE);
1135 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1136 struct vvp_io *vio = vvp_env_io(env);
1137 struct ccc_io *cio = ccc_env_io(env);
1138 int write_mutex_locked = 0;
1140 cio->cui_fd = LUSTRE_FPRIVATE(file);
1141 vio->cui_io_subtype = args->via_io_subtype;
1143 switch (vio->cui_io_subtype) {
1145 cio->cui_iov = args->u.normal.via_iov;
1146 cio->cui_nrsegs = args->u.normal.via_nrsegs;
1147 cio->cui_tot_nrsegs = cio->cui_nrsegs;
1148 cio->cui_iocb = args->u.normal.via_iocb;
/* Serialize writers (unless group-locked); readers only
 * take the truncate semaphore. */
1149 if ((iot == CIT_WRITE) &&
1150 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1151 if (mutex_lock_interruptible(&lli->
1153 GOTO(out, result = -ERESTARTSYS);
1154 write_mutex_locked = 1;
1155 } else if (iot == CIT_READ) {
1156 down_read(&lli->lli_trunc_sem);
1160 vio->u.sendfile.cui_actor = args->u.sendfile.via_actor;
1161 vio->u.sendfile.cui_target = args->u.sendfile.via_target;
1164 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1165 vio->u.splice.cui_flags = args->u.splice.via_flags;
1168 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
1171 result = cl_io_loop(env, io);
1172 if (write_mutex_locked)
1173 mutex_unlock(&lli->lli_write_mutex);
1174 else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
1175 up_read(&lli->lli_trunc_sem);
1177 /* cl_io_rw_init() handled IO */
1178 result = io->ci_result;
1181 if (io->ci_nob > 0) {
1182 result = io->ci_nob;
1183 *ppos = io->u.ci_wr.wr.crw_pos;
1187 cl_io_fini(env, io);
1188 /* If any bit been read/written (result != 0), we just return
1189 * short read/write instead of restart io. */
1190 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1191 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
1192 iot == CIT_READ ? "read" : "write",
1193 file->f_dentry->d_name.name, *ppos, count);
1194 LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
1198 if (iot == CIT_READ) {
1200 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1201 LPROC_LL_READ_BYTES, result);
1202 } else if (iot == CIT_WRITE) {
1204 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1205 LPROC_LL_WRITE_BYTES, result);
1206 fd->fd_write_failed = false;
1207 } else if (result != -ERESTARTSYS) {
1208 fd->fd_write_failed = true;
1217 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate a user-supplied iovec array before starting I/O.
 *
 * Rejects any segment whose length is negative or whose cumulative
 * length wraps, and (per the kernel original this was copied from)
 * stops counting at the first segment that fails access_ok().
 * NOTE(review): several interior lines are elided in this view
 * (the early returns and the final *count/*nr_segs stores) — confirm
 * against the full source before relying on exact semantics.
 */
1219 static int ll_file_get_iov_count(const struct iovec *iov,
1220 unsigned long *nr_segs, size_t *count)
1225 for (seg = 0; seg < *nr_segs; seg++) {
1226 const struct iovec *iv = &iov[seg];
/*
1229 * If any segment has a negative length, or the cumulative
1230 * length ever wraps negative then return -EINVAL.
*/
1233 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1235 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1240 cnt -= iv->iov_len; /* This segment is no good */
/*
 * AIO read entry point.
 *
 * Validates the iovec, fills the per-environment vvp_io_args for a
 * normal (iovec-based) I/O, and delegates the actual work to
 * ll_file_io_generic() with CIT_READ. The file position is taken from
 * and written back through iocb->ki_pos.
 */
1247 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1248 unsigned long nr_segs, loff_t pos)
1251 struct vvp_io_args *args;
/* reject malformed/inaccessible iovecs before acquiring a cl_env */
1257 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1261 env = cl_env_get(&refcheck);
1263 RETURN(PTR_ERR(env));
1265 args = vvp_env_args(env, IO_NORMAL);
/* const is cast away because vvp_io_args stores a plain iovec pointer */
1266 args->u.normal.via_iov = (struct iovec *)iov;
1267 args->u.normal.via_nrsegs = nr_segs;
1268 args->u.normal.via_iocb = iocb;
1270 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1271 &iocb->ki_pos, count);
1272 cl_env_put(env, &refcheck);
/*
 * Synchronous read(2) path.
 *
 * Wraps the user buffer in a single-segment iovec and a sync kiocb —
 * both kept in the per-thread vvp env scratch area — then delegates to
 * ll_file_aio_read() and propagates the updated position to *ppos.
 */
1276 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1280 struct iovec *local_iov;
1281 struct kiocb *kiocb;
1286 env = cl_env_get(&refcheck);
1288 RETURN(PTR_ERR(env));
/* scratch iovec/kiocb live in the cl_env thread info, not on the stack */
1290 local_iov = &vvp_env_info(env)->vti_local_iov;
1291 kiocb = &vvp_env_info(env)->vti_kiocb;
1292 local_iov->iov_base = (void __user *)buf;
1293 local_iov->iov_len = count;
1294 init_sync_kiocb(kiocb, file);
1295 kiocb->ki_pos = *ppos;
1296 kiocb->ki_left = count;
1298 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
/* hand the advanced file position back to the caller */
1299 *ppos = kiocb->ki_pos;
1301 cl_env_put(env, &refcheck);
1306 * Write to a file (through the page cache).
/*
 * AIO write entry point (mirror of ll_file_aio_read).
 *
 * Validates the iovec, fills vvp_io_args for a normal iovec-based I/O
 * and runs it through ll_file_io_generic() with CIT_WRITE, updating
 * iocb->ki_pos in place.
 */
1309 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1310 unsigned long nr_segs, loff_t pos)
1313 struct vvp_io_args *args;
/* validate the user iovec up front */
1319 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1323 env = cl_env_get(&refcheck);
1325 RETURN(PTR_ERR(env));
1327 args = vvp_env_args(env, IO_NORMAL);
/* const is cast away because vvp_io_args stores a plain iovec pointer */
1328 args->u.normal.via_iov = (struct iovec *)iov;
1329 args->u.normal.via_nrsegs = nr_segs;
1330 args->u.normal.via_iocb = iocb;
1332 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1333 &iocb->ki_pos, count);
1334 cl_env_put(env, &refcheck);
/*
 * Synchronous write(2) path (mirror of ll_file_read).
 *
 * Builds a one-segment iovec and a sync kiocb from the per-thread vvp
 * env scratch area, calls ll_file_aio_write() and writes the advanced
 * position back to *ppos.
 */
1338 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1342 struct iovec *local_iov;
1343 struct kiocb *kiocb;
1348 env = cl_env_get(&refcheck);
1350 RETURN(PTR_ERR(env));
/* scratch iovec/kiocb kept in the cl_env thread info */
1352 local_iov = &vvp_env_info(env)->vti_local_iov;
1353 kiocb = &vvp_env_info(env)->vti_kiocb;
1354 local_iov->iov_base = (void __user *)buf;
1355 local_iov->iov_len = count;
1356 init_sync_kiocb(kiocb, file);
1357 kiocb->ki_pos = *ppos;
1358 kiocb->ki_left = count;
1360 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
/* hand the advanced file position back to the caller */
1361 *ppos = kiocb->ki_pos;
1363 cl_env_put(env, &refcheck);
1368 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read() file operation: feed file data (through the page
 * cache) into a pipe. Uses the IO_SPLICE variant of vvp_io_args and
 * runs as a CIT_READ through ll_file_io_generic().
 */
1370 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1371 struct pipe_inode_info *pipe, size_t count,
1375 struct vvp_io_args *args;
1380 env = cl_env_get(&refcheck);
1382 RETURN(PTR_ERR(env));
1384 args = vvp_env_args(env, IO_SPLICE);
1385 args->u.splice.via_pipe = pipe;
1386 args->u.splice.via_flags = flags;
1388 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1389 cl_env_put(env, &refcheck);
/*
 * Ask the data stack to recreate an OST object for this inode.
 *
 * Builds a scratch copy of the inode's lov_stripe_md plus an obdo
 * describing the object (oi/ost_idx come from the caller), then issues
 * obd_create() with OBD_FL_RECREATE_OBJS under the inode size lock.
 * Fails with -ENOENT if the file has no objects.
 * NOTE(review): the obdo allocation and 'oa->o_oi = *oi' lines are
 * elided in this view — confirm against the full source.
 */
1393 static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
1396 struct obd_export *exp = ll_i2dtexp(inode);
1397 struct obd_trans_info oti = { 0 };
1398 struct obdo *oa = NULL;
1401 struct lov_stripe_md *lsm = NULL, *lsm2;
1408 lsm = ccc_inode_lsm_get(inode);
1409 if (!lsm_has_objects(lsm))
1410 GOTO(out, rc = -ENOENT);
/* size of the lsm including the per-stripe lov_oinfo array */
1412 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1413 (lsm->lsm_stripe_count));
1415 OBD_ALLOC_LARGE(lsm2, lsm_size);
1417 GOTO(out, rc = -ENOMEM);
/* o_nlink carries the target OST index for the recreate request */
1420 oa->o_nlink = ost_idx;
1421 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1422 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1423 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1424 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1425 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1426 memcpy(lsm2, lsm, lsm_size);
/* serialize against size changes while the object is recreated */
1427 ll_inode_size_lock(inode);
1428 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1429 ll_inode_size_unlock(inode);
1431 OBD_FREE_LARGE(lsm2, lsm_size);
1434 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_RECREATE_OBJ ioctl helper: copy the ll_recreate_obj request
 * from user space and recreate the object by (mdt0) object id.
 * Root only (CFS_CAP_SYS_ADMIN).
 */
1439 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1441 struct ll_recreate_obj ucreat;
1445 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1448 if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
/* request addresses the object by MDT0 sequence + id */
1452 ostid_set_seq_mdt0(&oi);
1453 ostid_set_id(&oi, ucreat.lrc_id);
1454 RETURN(ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx));
/*
 * LL_IOC_RECREATE_FID ioctl helper: copy a lu_fid from user space,
 * convert it to an ost_id and recreate that object. The OST index is
 * packed into bits 16..31 of the FID sequence. Root only.
 */
1457 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1464 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1467 if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1470 fid_to_ostid(&fid, &oi);
/* OST index is carried in bits 16..31 of the fid sequence */
1471 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1472 RETURN(ll_lov_recreate(inode, &oi, ost_idx));
/*
 * Set the striping EA on a file by re-opening it with an IT_OPEN
 * intent carrying the lov_user_md.
 *
 * If a layout already exists the request is refused (striping can only
 * be set once). On success the transient open handle obtained for the
 * intent is released again via ll_release_openhandle().
 */
1475 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1476 __u64 flags, struct lov_user_md *lum,
1479 struct lov_stripe_md *lsm = NULL;
1480 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
/* a non-NULL lsm means the file is already striped: refuse */
1484 lsm = ccc_inode_lsm_get(inode);
1486 ccc_inode_lsm_put(inode, lsm);
1487 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1492 ll_inode_size_lock(inode);
1493 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1496 rc = oit.d.lustre.it_status;
1498 GOTO(out_req_free, rc);
/* drop the open handle created purely for the setstripe intent */
1500 ll_release_openhandle(file->f_dentry, &oit);
1503 ll_inode_size_unlock(inode);
1504 ll_intent_release(&oit);
1505 ccc_inode_lsm_put(inode, lsm);
1508 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA (striping information) for @filename from the MDS.
 *
 * On success *lmmp points into the reply buffer of *request (caller
 * must keep the request until done with the lmm and then finish it),
 * and *lmm_size is the EA size. The EA is byte-swapped to host endian
 * on big-endian clients since the MDS sends it little-endian.
 */
1512 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1513 struct lov_mds_md **lmmp, int *lmm_size,
1514 struct ptlrpc_request **request)
1516 struct ll_sb_info *sbi = ll_i2sbi(inode);
1517 struct mdt_body *body;
1518 struct lov_mds_md *lmm = NULL;
1519 struct ptlrpc_request *req = NULL;
1520 struct md_op_data *op_data;
/* ask for the largest EA the MDS may return */
1523 rc = ll_get_max_mdsize(sbi, &lmmsize);
1527 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1528 strlen(filename), lmmsize,
1529 LUSTRE_OPC_ANY, NULL);
1530 if (IS_ERR(op_data))
1531 RETURN(PTR_ERR(op_data));
1533 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1534 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1535 ll_finish_md_op_data(op_data);
1537 CDEBUG(D_INFO, "md_getattr_name failed "
1538 "on %s: rc %d\n", filename, rc);
1542 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1543 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1545 lmmsize = body->eadatasize;
/* no striping EA present (or empty) -> -ENODATA */
1547 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1549 GOTO(out, rc = -ENODATA);
1552 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1553 LASSERT(lmm != NULL);
/* only plain V1/V3 layouts are understood here */
1555 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1556 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1557 GOTO(out, rc = -EPROTO);
/*
1561 * This is coming from the MDS, so is probably in
1562 * little endian. We convert it to host endian before
1563 * passing it to userspace.
*/
1565 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1568 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1569 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
/*
1572 * if function called for directory - we should
1573 * avoid swab not existent lsm objects
*/
1574 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1575 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1576 if (S_ISREG(body->mode))
1577 lustre_swab_lov_user_md_objects(
1578 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1580 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1581 lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1582 if (S_ISREG(body->mode))
1583 lustre_swab_lov_user_md_objects(
1584 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1591 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA ioctl helper: copy a lov_user_md (with one
 * lov_user_ost_data entry) from user space and apply it via
 * ll_lov_setstripe_ea_info() with MDS_OPEN_HAS_OBJS. Root only.
 */
1596 static int ll_lov_setea(struct inode *inode, struct file *file,
1599 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1600 struct lov_user_md *lump;
1601 int lum_size = sizeof(struct lov_user_md) +
1602 sizeof(struct lov_user_ost_data);
1606 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1609 OBD_ALLOC_LARGE(lump, lum_size);
1613 if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1614 OBD_FREE_LARGE(lump, lum_size);
1618 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1620 OBD_FREE_LARGE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE ioctl helper.
 *
 * Copies the user's lov_user_md (first as V1, re-copied as V3 if the
 * magic says so), applies it through ll_lov_setstripe_ea_info(), then
 * refreshes the layout and echoes the resulting stripe info back to
 * user space via LL_IOC_LOV_GETSTRIPE.
 */
1624 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1627 struct lov_user_md_v3 lumv3;
1628 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1629 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1630 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1632 __u64 flags = FMODE_WRITE;
/* first try with v1 which is smaller than v3 */
1636 lum_size = sizeof(struct lov_user_md_v1);
1637 if (copy_from_user(lumv1, lumv1p, lum_size))
/* v3 request: re-copy the full v3 structure */
1640 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1641 lum_size = sizeof(struct lov_user_md_v3);
1642 if (copy_from_user(&lumv3, lumv3p, lum_size))
1646 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1648 struct lov_stripe_md *lsm;
/* cap the echo-back so only the header is copied out */
1651 put_user(0, &lumv1p->lmm_stripe_count);
1653 ll_layout_refresh(inode, &gen);
1654 lsm = ccc_inode_lsm_get(inode);
1655 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1656 0, lsm, (void *)arg);
1657 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_LOV_GETSTRIPE ioctl helper: grab the inode's layout and let
 * the LOV layer copy the striping information out to user space.
 */
1662 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1664 struct lov_stripe_md *lsm;
1668 lsm = ccc_inode_lsm_get(inode);
1670 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1672 ccc_inode_lsm_put(inode, lsm);
/*
 * Take a group lock (gid = @arg) on behalf of this file descriptor.
 *
 * Only one group lock per fd is allowed; lli_lock protects fd_flags /
 * fd_grouplock. The DLM lock itself is acquired outside the spinlock,
 * so the flags are re-checked afterwards in case another thread won
 * the race meanwhile.
 */
1676 int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1678 struct ll_inode_info *lli = ll_i2info(inode);
1679 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1680 struct ccc_grouplock grouplock;
1684 if (ll_file_nolock(file))
1685 RETURN(-EOPNOTSUPP);
1687 spin_lock(&lli->lli_lock);
1688 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1689 CWARN("group lock already existed with gid %lu\n",
1690 fd->fd_grouplock.cg_gid);
1691 spin_unlock(&lli->lli_lock);
1694 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1695 spin_unlock(&lli->lli_lock);
/* lock acquisition may block; done outside lli_lock */
1697 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1698 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* re-check: another thread may have taken the group lock meanwhile */
1702 spin_lock(&lli->lli_lock);
1703 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1704 spin_unlock(&lli->lli_lock);
1705 CERROR("another thread just won the race\n");
1706 cl_put_grouplock(&grouplock);
1710 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1711 fd->fd_grouplock = grouplock;
1712 spin_unlock(&lli->lli_lock);
1714 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * Release the group lock held by this file descriptor, after checking
 * that one is held and that its gid matches @arg. State is cleared
 * under lli_lock; the DLM lock is dropped after the spinlock is
 * released.
 */
1718 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1720 struct ll_inode_info *lli = ll_i2info(inode);
1721 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1722 struct ccc_grouplock grouplock;
1725 spin_lock(&lli->lli_lock);
1726 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1727 spin_unlock(&lli->lli_lock);
1728 CWARN("no group lock held\n");
1731 LASSERT(fd->fd_grouplock.cg_lock != NULL);
/* refuse to drop a lock with a different gid than requested */
1733 if (fd->fd_grouplock.cg_gid != arg) {
1734 CWARN("group lock %lu doesn't match current id %lu\n",
1735 arg, fd->fd_grouplock.cg_gid);
1736 spin_unlock(&lli->lli_lock);
/* detach the grouplock from the fd before unlocking */
1740 grouplock = fd->fd_grouplock;
1741 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1742 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1743 spin_unlock(&lli->lli_lock);
1745 cl_put_grouplock(&grouplock);
1746 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1751 * Close inode open handle
1753 * \param dentry [in] dentry which contains the inode
1754 * \param it [in,out] intent which contains open info and result
1757 * \retval <0 failure
/*
 * Close the MDS open handle that an open intent produced but that will
 * not be turned into a real file handle (e.g. after setstripe). No-op
 * for the filesystem root or when the intent carries no open handle.
 */
1759 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1761 struct inode *inode = dentry->d_inode;
1762 struct obd_client_handle *och;
/* Root ? Do nothing. */
1769 if (dentry->d_inode->i_sb->s_root == dentry)
/* No open handle to close? Move away */
1773 if (!it_disposition(it, DISP_OPEN_OPEN))
1776 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1778 OBD_ALLOC(och, sizeof(*och));
1780 GOTO(out, rc = -ENOMEM);
1782 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1784 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/* this one is in place of ll_file_open */
1788 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1789 ptlrpc_req_finished(it->d.lustre.it_data);
1790 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1796 * Get size for inode for which FIEMAP mapping is requested.
1797 * Make the FIEMAP get_info call and returns the result.
/*
 * Get size for inode for which FIEMAP mapping is requested.
 * Make the FIEMAP get_info call and returns the result.
 *
 * Unsupported fiemap flags are stripped and reported back to the
 * caller; FIEMAP_FLAG_SYNC triggers a writeback first. Files striped
 * over more than one OST require FIEMAP_FLAG_DEVICE_ORDER from the
 * caller, otherwise the extents could not be interpreted correctly.
 */
1799 int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1802 struct obd_export *exp = ll_i2dtexp(inode);
1803 struct lov_stripe_md *lsm = NULL;
1804 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1805 int vallen = num_bytes;
/* Checks for fiemap flags */
1810 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1811 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
/* Check for FIEMAP_FLAG_SYNC */
1816 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1817 rc = filemap_fdatawrite(inode->i_mapping);
1822 lsm = ccc_inode_lsm_get(inode);
/*
1826 * If the stripe_count > 1 and the application does not understand
1827 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
*/
1829 if (lsm->lsm_stripe_count > 1 &&
1830 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1831 GOTO(out, rc = -EOPNOTSUPP);
1833 fm_key.oa.o_oi = lsm->lsm_oi;
1834 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1836 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1837 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
/* If filesize is 0, then there would be no objects for mapping */
1839 if (fm_key.oa.o_size == 0) {
1840 fiemap->fm_mapped_extents = 0;
1844 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
/* the LOV/OSC layers fill the extents into the caller's buffer */
1846 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1849 CERROR("obd_get_info failed: rc = %d\n", rc);
1852 ccc_inode_lsm_put(inode, lsm);
/*
 * OBD_IOC_FID2PATH ioctl: resolve a FID to a path via the MDC.
 *
 * Copies the getinfo_fid2path header in first to learn gf_pathlen,
 * allocates an output buffer of that size, lets the MDC fill it and
 * copies the whole thing back out. Requires DAC_READ_SEARCH or the
 * user_fid2path mount flag.
 */
1856 int ll_fid2path(struct inode *inode, void *arg)
1858 struct obd_export *exp = ll_i2mdexp(inode);
1859 struct getinfo_fid2path *gfout, *gfin;
1863 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1864 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
/* Need to get the buflen */
1868 OBD_ALLOC_PTR(gfin);
1871 if (copy_from_user(gfin, arg, sizeof(*gfin))) {
/* output buffer = header + user-requested path length */
1876 outsize = sizeof(*gfout) + gfin->gf_pathlen;
1877 OBD_ALLOC(gfout, outsize);
1878 if (gfout == NULL) {
1882 memcpy(gfout, gfin, sizeof(*gfout));
/* Call mdc_iocontrol */
1886 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1890 if (copy_to_user(arg, gfout, outsize))
1894 OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP ioctl front-end.
 *
 * Reads fm_extent_count first to size the kernel fiemap buffer, copies
 * the request (and, if extents were requested, the first extent which
 * seeds end_offset/device for a continued mapping) in from user space,
 * runs ll_do_fiemap() and copies the header plus mapped extents back.
 */
1898 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1900 struct ll_user_fiemap *fiemap_s;
1901 size_t num_bytes, ret_bytes;
1902 unsigned int extent_count;
/*
1905 * Get the extent count so we can calculate the size of
1906 * required fiemap buffer
*/
1907 if (get_user(extent_count,
1908 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1910 num_bytes = sizeof(*fiemap_s) + (extent_count *
1911 sizeof(struct ll_fiemap_extent));
1913 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1914 if (fiemap_s == NULL)
/* get the fiemap value */
1918 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1920 GOTO(error, rc = -EFAULT);
/*
1922 * If fm_extent_count is non-zero, read the first extent since
1923 * it is used to calculate end_offset and device from previous
 * fiemap call.
*/
1926 if (copy_from_user(&fiemap_s->fm_extents[0],
1927 (char __user *)arg + sizeof(*fiemap_s),
1928 sizeof(struct ll_fiemap_extent)))
1929 GOTO(error, rc = -EFAULT);
1932 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
/* copy the header back, plus extents only if any were requested */
1936 ret_bytes = sizeof(struct ll_user_fiemap);
1938 if (extent_count != 0)
1939 ret_bytes += (fiemap_s->fm_mapped_extents *
1940 sizeof(struct ll_fiemap_extent));
1942 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1946 OBD_FREE_LARGE(fiemap_s, num_bytes);
1951 * Read the data_version for inode.
1953 * This value is computed using stripe object version on OST.
1954 * Version is computed using server side locking.
1956 * @param sync if do sync on the OST side;
1958 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1959 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/*
 * Read the data_version of @inode from its OST objects (see the block
 * comment above). Files without objects report version 0. @flags may
 * request LL_DV_RD_FLUSH / LL_DV_WR_FLUSH semantics on the OST side.
 */
1961 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
1963 struct lov_stripe_md *lsm = NULL;
1964 struct ll_sb_info *sbi = ll_i2sbi(inode);
1965 struct obdo *obdo = NULL;
/* If no stripe, we consider version is 0. */
1970 lsm = ccc_inode_lsm_get(inode);
1971 if (!lsm_has_objects(lsm)) {
1973 CDEBUG(D_INODE, "No object for inode\n");
1977 OBD_ALLOC_PTR(obdo);
1979 GOTO(out, rc = -ENOMEM);
1981 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, flags);
/* the OSTs must have actually returned a version */
1983 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1986 *data_version = obdo->o_data_version;
1992 ccc_inode_lsm_put(inode, lsm);
1997 * Trigger a HSM release request for the provided inode.
/*
 * Trigger a HSM release for @inode: take a write lease with
 * MDS_OPEN_RELEASE, flush and record the current data_version, merge
 * the latest size/time attributes, then close the handle with the
 * release flag so the MDT can drop the OST objects.
 */
1999 int ll_hsm_release(struct inode *inode)
2001 struct cl_env_nest nest;
2003 struct obd_client_handle *och = NULL;
2004 __u64 data_version = 0;
2008 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2009 ll_get_fsname(inode->i_sb, NULL, 0),
2010 PFID(&ll_i2info(inode)->lli_fid));
/* exclusive lease guards against concurrent modification */
2012 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2014 GOTO(out, rc = PTR_ERR(och));
/* Grab latest data_version and [am]time values */
2017 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2021 env = cl_env_nested_get(&nest);
2023 GOTO(out, rc = PTR_ERR(env));
2025 ll_merge_lvb(env, inode);
2026 cl_env_nested_put(&nest, env);
/*
2028 * Release the file.
2029 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2030 * we still need it to pack l_remote_handle to MDT.
*/
2031 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
/* on the error path the lease (if obtained) still closes the file */
2037 if (och != NULL && !IS_ERR(och)) /* close the file */
2038 ll_lease_close(och, inode, NULL);
/*
 * Per-call state for ll_swap_layouts(): saved [am]time attrs for both
 * files, the two inodes (possibly reordered into FID order) and which
 * data-version checks the caller requested.
 */
2043 struct ll_swap_stack {
2044 struct iattr ia1, ia2;
2046 struct inode *inode1, *inode2;
2047 bool check_dv1, check_dv2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS: atomically swap the layouts of two regular
 * files on the same filesystem.
 *
 * Sequence: permission/sanity checks, canonical FID ordering (so the
 * MDT sees a deterministic pair), optional group locks to flush dirty
 * cache, optional data-version checks, the MDC swap ioctl, and finally
 * restoration of mtime/atime if the caller asked to keep them.
 */
2050 static int ll_swap_layouts(struct file *file1, struct file *file2,
2051 struct lustre_swap_layouts *lsl)
2053 struct mdc_swap_layouts msl;
2054 struct md_op_data *op_data;
2057 struct ll_swap_stack *llss = NULL;
2060 OBD_ALLOC_PTR(llss);
2064 llss->inode1 = file1->f_dentry->d_inode;
2065 llss->inode2 = file2->f_dentry->d_inode;
/* only regular files can have their layout swapped */
2067 if (!S_ISREG(llss->inode2->i_mode))
2068 GOTO(free, rc = -EINVAL);
2070 if (inode_permission(llss->inode1, MAY_WRITE) ||
2071 inode_permission(llss->inode2, MAY_WRITE))
2072 GOTO(free, rc = -EPERM);
2074 if (llss->inode2->i_sb != llss->inode1->i_sb)
2075 GOTO(free, rc = -EXDEV);
/* we use 2 bool because it is easier to swap than 2 bits */
2078 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2079 llss->check_dv1 = true;
2081 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2082 llss->check_dv2 = true;
/* we cannot use lsl->sl_dvX directly because we may swap them */
2085 llss->dv1 = lsl->sl_dv1;
2086 llss->dv2 = lsl->sl_dv2;
2088 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2089 if (rc == 0) /* same file, done! */
/* order the pair by FID so both, inodes and checks, stay paired */
2092 if (rc < 0) { /* sequentialize it */
2093 swap(llss->inode1, llss->inode2);
2095 swap(llss->dv1, llss->dv2);
2096 swap(llss->check_dv1, llss->check_dv2);
2100 if (gid != 0) { /* application asks to flush dirty cache */
2101 rc = ll_get_grouplock(llss->inode1, file1, gid);
2105 rc = ll_get_grouplock(llss->inode2, file2, gid);
2107 ll_put_grouplock(llss->inode1, file1, gid);
/*
2112 * to be able to restore mtime and atime after swap
2113 * we need to first save them
*/
2115 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2116 llss->ia1.ia_mtime = llss->inode1->i_mtime;
2117 llss->ia1.ia_atime = llss->inode1->i_atime;
2118 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2119 llss->ia2.ia_mtime = llss->inode2->i_mtime;
2120 llss->ia2.ia_atime = llss->inode2->i_atime;
2121 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
/*
2124 * ultimate check, before swaping the layouts we check if
2125 * dataversion has changed (if requested)
*/
2126 if (llss->check_dv1) {
2127 rc = ll_data_version(llss->inode1, &dv, 0);
2130 if (dv != llss->dv1)
2131 GOTO(putgl, rc = -EAGAIN);
2134 if (llss->check_dv2) {
2135 rc = ll_data_version(llss->inode2, &dv, 0);
2138 if (dv != llss->dv2)
2139 GOTO(putgl, rc = -EAGAIN);
/*
2142 * struct md_op_data is used to send the swap args to the mdt
2143 * only flags is missing, so we use struct mdc_swap_layouts
2144 * through the md_op_data->op_data
*/
/*
2145 * flags from user space have to be converted before they are send to
2146 * server, no flag is sent today, they are only used on the client
*/
2149 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2150 0, LUSTRE_OPC_ANY, &msl);
2151 if (IS_ERR(op_data))
2152 GOTO(free, rc = PTR_ERR(op_data));
2154 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2155 sizeof(*op_data), op_data, NULL);
2156 ll_finish_md_op_data(op_data);
/* drop the group locks in reverse acquisition order */
2160 ll_put_grouplock(llss->inode2, file2, gid);
2161 ll_put_grouplock(llss->inode1, file1, gid);
/* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
/* clear useless flags */
2169 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2170 llss->ia1.ia_valid &= ~ATTR_MTIME;
2171 llss->ia2.ia_valid &= ~ATTR_MTIME;
2174 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2175 llss->ia1.ia_valid &= ~ATTR_ATIME;
2176 llss->ia2.ia_valid &= ~ATTR_ATIME;
/* update time if requested */
/* note: ia2 applies to inode1 (and vice versa) — layouts swapped */
2181 if (llss->ia2.ia_valid != 0) {
2182 mutex_lock(&llss->inode1->i_mutex);
2183 rc = ll_setattr(file1->f_dentry, &llss->ia2);
2184 mutex_unlock(&llss->inode1->i_mutex);
2187 if (llss->ia1.ia_valid != 0) {
2190 mutex_lock(&llss->inode2->i_mutex);
2191 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2192 mutex_unlock(&llss->inode2->i_mutex);
/*
 * Send a HSM state-set request for @inode to the MDT, carrying @hss
 * through md_op_data->op_data. Flags outside HSM_USER_MASK require
 * CFS_CAP_SYS_ADMIN.
 */
2204 static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2206 struct md_op_data *op_data;
/*
2209 * Non-root users are forbidden to set or clear flags which are
2210 * NOT defined in HSM_USER_MASK.
*/
2211 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2212 !cfs_capable(CFS_CAP_SYS_ADMIN))
2215 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2216 LUSTRE_OPC_ANY, hss);
2217 if (IS_ERR(op_data))
2218 RETURN(PTR_ERR(op_data));
2220 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2221 sizeof(*op_data), op_data, NULL);
2223 ll_finish_md_op_data(op_data);
/*
 * LL_IOC_HSM_IMPORT: register a file that already exists in the HSM
 * archive. Marks it ARCHIVED|EXISTS|RELEASED with the given archive
 * id, then force-sets mode/uid/gid/size/times from the import request
 * via ll_setattr_raw().
 */
2228 static int ll_hsm_import(struct inode *inode, struct file *file,
2229 struct hsm_user_import *hui)
2231 struct hsm_state_set *hss = NULL;
2232 struct iattr *attr = NULL;
2236 if (!S_ISREG(inode->i_mode))
2242 GOTO(out, rc = -ENOMEM);
/* imported file is recorded as archived + released on the MDT */
2244 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2245 hss->hss_archive_id = hui->hui_archive_id;
2246 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2247 rc = ll_hsm_state_set(inode, hss);
2251 OBD_ALLOC_PTR(attr);
2253 GOTO(out, rc = -ENOMEM);
/* restore the attributes recorded in the archive copy */
2255 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2256 attr->ia_mode |= S_IFREG;
2257 attr->ia_uid = hui->hui_uid;
2258 attr->ia_gid = hui->hui_gid;
2259 attr->ia_size = hui->hui_size;
2260 attr->ia_mtime.tv_sec = hui->hui_mtime;
2261 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2262 attr->ia_atime.tv_sec = hui->hui_atime;
2263 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2265 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2266 ATTR_UID | ATTR_GID |
2267 ATTR_MTIME | ATTR_MTIME_SET |
2268 ATTR_ATIME | ATTR_ATIME_SET;
2270 rc = ll_setattr_raw(file->f_dentry, attr, true);
/*
 * ioctl() file operation: dispatch the llite-specific ioctls (flags,
 * striping, layout swap, fiemap, group locks, HSM, leases, …) and fall
 * through to ll_iocontrol_call()/obd_iocontrol() for anything else.
 */
2284 long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2286 struct inode *inode = file->f_dentry->d_inode;
2287 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2291 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2292 inode->i_generation, inode, cmd);
2293 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
/* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2296 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2300 case LL_IOC_GETFLAGS:
/* Get the current value of the file flags */
2302 return put_user(fd->fd_flags, (int *)arg);
2303 case LL_IOC_SETFLAGS:
2304 case LL_IOC_CLRFLAGS:
/* Set or clear specific file flags */
/*
2306 * XXX This probably needs checks to ensure the flags are
2307 * not abused, and to handle any flag side effects.
*/
2309 if (get_user(flags, (int *) arg))
2312 if (cmd == LL_IOC_SETFLAGS) {
/* IGNORE_LOCK is only meaningful with O_DIRECT I/O */
2313 if ((flags & LL_FILE_IGNORE_LOCK) &&
2314 !(file->f_flags & O_DIRECT)) {
2315 CERROR("%s: unable to disable locking on "
2316 "non-O_DIRECT file\n", current->comm);
2320 fd->fd_flags |= flags;
2322 fd->fd_flags &= ~flags;
2325 case LL_IOC_LOV_SETSTRIPE:
2326 RETURN(ll_lov_setstripe(inode, file, arg));
2327 case LL_IOC_LOV_SETEA:
2328 RETURN(ll_lov_setea(inode, file, arg));
2329 case LL_IOC_LOV_SWAP_LAYOUTS: {
2331 struct lustre_swap_layouts lsl;
2333 if (copy_from_user(&lsl, (char *)arg,
2334 sizeof(struct lustre_swap_layouts)))
/* both fds must be writable for a layout swap */
2337 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2340 file2 = fget(lsl.sl_fd);
2345 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2346 rc = ll_swap_layouts(file, file2, &lsl);
2350 case LL_IOC_LOV_GETSTRIPE:
2351 RETURN(ll_lov_getstripe(inode, arg));
2352 case LL_IOC_RECREATE_OBJ:
2353 RETURN(ll_lov_recreate_obj(inode, arg));
2354 case LL_IOC_RECREATE_FID:
2355 RETURN(ll_lov_recreate_fid(inode, arg));
2356 case FSFILT_IOC_FIEMAP:
2357 RETURN(ll_ioctl_fiemap(inode, arg));
2358 case FSFILT_IOC_GETFLAGS:
2359 case FSFILT_IOC_SETFLAGS:
2360 RETURN(ll_iocontrol(inode, file, cmd, arg));
2361 case FSFILT_IOC_GETVERSION_OLD:
2362 case FSFILT_IOC_GETVERSION:
2363 RETURN(put_user(inode->i_generation, (int *)arg));
2364 case LL_IOC_GROUP_LOCK:
2365 RETURN(ll_get_grouplock(inode, file, arg));
2366 case LL_IOC_GROUP_UNLOCK:
2367 RETURN(ll_put_grouplock(inode, file, arg));
2368 case IOC_OBD_STATFS:
2369 RETURN(ll_obd_statfs(inode, (void *)arg));
/*
2371 * We need to special case any other ioctls we want to handle,
2372 * to send them to the MDS/OST as appropriate and to properly
2373 * network encode the arg field.
2374 case FSFILT_IOC_SETVERSION_OLD:
2375 case FSFILT_IOC_SETVERSION:
*/
2377 case LL_IOC_FLUSHCTX:
2378 RETURN(ll_flush_ctx(inode));
2379 case LL_IOC_PATH2FID: {
2380 if (copy_to_user((void *)arg, ll_inode2fid(inode),
2381 sizeof(struct lu_fid)))
2386 case OBD_IOC_FID2PATH:
2387 RETURN(ll_fid2path(inode, (void *)arg));
2388 case LL_IOC_DATA_VERSION: {
2389 struct ioc_data_version idv;
2392 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
/* accept only the flush flags defined for this ioctl */
2395 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2396 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2398 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2404 case LL_IOC_GET_MDTIDX: {
2407 mdtidx = ll_get_mdt_idx(inode);
2411 if (put_user((int)mdtidx, (int*)arg))
2416 case OBD_IOC_GETDTNAME:
2417 case OBD_IOC_GETMDNAME:
2418 RETURN(ll_get_obd_name(inode, cmd, arg));
2419 case LL_IOC_HSM_STATE_GET: {
2420 struct md_op_data *op_data;
2421 struct hsm_user_state *hus;
2428 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2429 LUSTRE_OPC_ANY, hus);
2430 if (IS_ERR(op_data)) {
2432 RETURN(PTR_ERR(op_data));
2435 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2438 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2441 ll_finish_md_op_data(op_data);
2445 case LL_IOC_HSM_STATE_SET: {
2446 struct hsm_state_set *hss;
2453 if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2458 rc = ll_hsm_state_set(inode, hss);
2463 case LL_IOC_HSM_ACTION: {
2464 struct md_op_data *op_data;
2465 struct hsm_current_action *hca;
2472 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2473 LUSTRE_OPC_ANY, hca);
2474 if (IS_ERR(op_data)) {
2476 RETURN(PTR_ERR(op_data));
2479 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2482 if (copy_to_user((char *)arg, hca, sizeof(*hca)))
2485 ll_finish_md_op_data(op_data);
2489 case LL_IOC_SET_LEASE: {
2490 struct ll_inode_info *lli = ll_i2info(inode);
2491 struct obd_client_handle *och = NULL;
/* requested lease mode must match the fd's open mode */
2497 if (!(file->f_mode & FMODE_WRITE))
2502 if (!(file->f_mode & FMODE_READ))
2507 mutex_lock(&lli->lli_och_mutex);
2508 if (fd->fd_lease_och != NULL) {
2509 och = fd->fd_lease_och;
2510 fd->fd_lease_och = NULL;
2512 mutex_unlock(&lli->lli_och_mutex);
/* closing an existing lease: report its mode, or 0 if broken */
2515 mode = och->och_flags &(FMODE_READ|FMODE_WRITE);
2516 rc = ll_lease_close(och, inode, &lease_broken);
2517 if (rc == 0 && lease_broken)
/* return the type of lease or error */
2524 RETURN(rc < 0 ? rc : (int)mode);
2529 CDEBUG(D_INODE, "Set lease with mode %d\n", mode);
/* apply for lease */
2532 och = ll_lease_open(inode, file, mode, 0);
2534 RETURN(PTR_ERR(och));
2537 mutex_lock(&lli->lli_och_mutex);
2538 if (fd->fd_lease_och == NULL) {
2539 fd->fd_lease_och = och;
2542 mutex_unlock(&lli->lli_och_mutex);
/* impossible now that only excl is supported for now */
2545 ll_lease_close(och, inode, &lease_broken);
2550 case LL_IOC_GET_LEASE: {
2551 struct ll_inode_info *lli = ll_i2info(inode);
2552 struct ldlm_lock *lock = NULL;
2555 mutex_lock(&lli->lli_och_mutex);
2556 if (fd->fd_lease_och != NULL) {
2557 struct obd_client_handle *och = fd->fd_lease_och;
2559 lock = ldlm_handle2lock(&och->och_lease_handle);
2561 lock_res_and_lock(lock);
/* a cancelled lock means the lease was already broken */
2562 if (!ldlm_is_cancel(lock))
2563 rc = och->och_flags &
2564 (FMODE_READ | FMODE_WRITE);
2565 unlock_res_and_lock(lock);
2566 LDLM_LOCK_PUT(lock);
2569 mutex_unlock(&lli->lli_och_mutex);
2572 case LL_IOC_HSM_IMPORT: {
2573 struct hsm_user_import *hui;
2579 if (copy_from_user(hui, (void *)arg, sizeof(*hui))) {
2584 rc = ll_hsm_import(inode, file, hui);
/* unknown cmd: try registered handlers, then the data stack */
2593 ll_iocontrol_call(inode, file, cmd, arg, &err))
2596 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
/* Fallback lseek helpers for kernels without generic_file_llseek_size(). */
2602 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Clamp-check @offset against [0, maxsize] (negative offsets are only
 * allowed with FMODE_UNSIGNED_OFFSET), then commit it to f_pos,
 * resetting f_version on a position change.
 */
2603 static inline loff_t
2604 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2606 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2608 if (offset > maxsize)
2611 if (offset != file->f_pos) {
2612 file->f_pos = offset;
2613 file->f_version = 0;
/*
 * Local copy of the kernel's generic_file_llseek_size(): handle the
 * standard SEEK_* origins against a caller-supplied @maxsize and @eof
 * instead of i_size, using llseek_execute() to commit the position.
 */
2619 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2620 loff_t maxsize, loff_t eof)
2622 struct inode *inode = file->f_dentry->d_inode;
/*
2630 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2631 * position-querying operation. Avoid rewriting the "same"
2632 * f_pos value back to the file because a concurrent read(),
2633 * write() or lseek() might have altered it
*/
/*
2638 * f_lock protects against read/modify/write race with other
2639 * SEEK_CURs. Note that parallel writes and reads behave
 * like SEEK_SET.
*/
2642 mutex_lock(&inode->i_mutex);
2643 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2644 mutex_unlock(&inode->i_mutex);
/*
2648 * In the generic case the entire file is data, so as long as
2649 * offset isn't at the end of the file then the offset is data.
*/
/*
2656 * There is a virtual hole at the end of the file, so as long as
2657 * offset isn't i_size or larger, return i_size.
*/
2665 return llseek_execute(file, offset, maxsize);
/*
 * llseek() file operation. For SEEK_END/SEEK_HOLE/SEEK_DATA a glimpse
 * is needed first so i_size is up to date, then the work is delegated
 * to ll_generic_file_llseek_size() bounded by the client's maxbytes.
 */
2669 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2671 struct inode *inode = file->f_dentry->d_inode;
2672 loff_t retval, eof = 0;
/* absolute target, for the trace message below */
2675 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2676 (origin == SEEK_CUR) ? file->f_pos : 0);
2677 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2678 inode->i_ino, inode->i_generation, inode, retval, retval,
2680 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
/* these origins need an up-to-date file size from the OSTs */
2682 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2683 retval = ll_glimpse_size(inode);
2686 eof = i_size_read(inode);
2689 retval = ll_generic_file_llseek_size(file, offset, origin,
2690 ll_file_maxbytes(inode), eof);
/* flush() file operation: report any asynchronous writeback error that
 * was recorded for this inode since the last flush/close.  Does not
 * start new writeback itself — it only harvests recorded errors. */
2694 int ll_flush(struct file *file, fl_owner_t id)
2696 struct inode *inode = file->f_dentry->d_inode;
2697 struct ll_inode_info *lli = ll_i2info(inode);
2698 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2701 LASSERT(!S_ISDIR(inode->i_mode));
2703 /* catch async errors that were recorded back when async writeback
2704 * failed for pages in this mapping. */
/* Read-and-clear: the error is consumed once it is reported. */
2705 rc = lli->lli_async_rc;
2706 lli->lli_async_rc = 0;
2707 err = lov_read_and_clear_async_rc(lli->lli_clob);
2711 /* The application has been told write failure already.
2712 * Do not report failure again. */
2713 if (fd->fd_write_failed)
/* Collapse any recorded error to the generic -EIO for userspace. */
2715 return rc ? -EIO : 0;
2719 * Called to make sure a portion of file has been written out.
* If @mode is not CL_FSYNC_NONE, OST_SYNC RPCs are sent to the OSTs
* (the parameter list below has no @local_only; mode selects the
* sync/discard behaviour).
2722 * Return how many pages have been written.
2724 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2725 enum cl_fsync_mode mode, int ignore_layout)
2727 struct cl_env_nest nest;
2730 struct obd_capa *capa = NULL;
2731 struct cl_fsync_io *fio;
/* Reject any mode outside the known enum values. */
2735 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2736 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2739 env = cl_env_nested_get(&nest);
2741 RETURN(PTR_ERR(env));
/* OSS write capability accompanies the sync request when enabled. */
2743 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2745 io = ccc_env_thread_io(env);
2746 io->ci_obj = cl_i2info(inode)->lli_clob;
2747 io->ci_ignore_layout = ignore_layout;
2749 /* initialize parameters for sync */
2750 fio = &io->u.ci_fsync;
2751 fio->fi_capa = capa;
2752 fio->fi_start = start;
2754 fio->fi_fid = ll_inode2fid(inode);
2755 fio->fi_mode = mode;
2756 fio->fi_nr_written = 0;
/* Run the CIT_FSYNC io; on success the page count replaces result. */
2758 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2759 result = cl_io_loop(env, io);
2761 result = io->ci_result;
2763 result = fio->fi_nr_written;
2764 cl_io_fini(env, io);
2765 cl_env_nested_put(&nest, env);
/* fsync() entry point, compiled for three kernel prototypes via
 * HAVE_FILE_FSYNC_{4,2}ARGS.  Flushes dirty pages, harvests recorded
 * async writeback errors, syncs metadata via md_fsync(), and for
 * datasync on regular files forces an OST-side range sync. */
2773 * When dentry is provided (the 'else' case), *file->f_dentry may be
2774 * null and dentry must be used directly rather than pulled from
2775 * *file->f_dentry as is done otherwise.
2778 #ifdef HAVE_FILE_FSYNC_4ARGS
2779 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2781 struct dentry *dentry = file->f_dentry;
2782 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2783 int ll_fsync(struct file *file, int datasync)
2785 struct dentry *dentry = file->f_dentry;
2787 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2790 struct inode *inode = dentry->d_inode;
2791 struct ll_inode_info *lli = ll_i2info(inode);
2792 struct ptlrpc_request *req;
2793 struct obd_capa *oc;
2797 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2798 inode->i_generation, inode);
2799 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
/* On 4-arg kernels the VFS no longer pre-writes the range, so do it
 * here and hold i_mutex across the sync (released near the end). */
2801 #ifdef HAVE_FILE_FSYNC_4ARGS
2802 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2803 mutex_lock(&inode->i_mutex);
2805 /* fsync's caller has already called _fdata{sync,write}, we want
2806 * that IO to finish before calling the osc and mdc sync methods */
2807 rc = filemap_fdatawait(inode->i_mapping);
2810 /* catch async errors that were recorded back when async writeback
2811 * failed for pages in this mapping. */
2812 if (!S_ISDIR(inode->i_mode)) {
2813 err = lli->lli_async_rc;
2814 lli->lli_async_rc = 0;
2817 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* Metadata sync to the MDT, with an MDS capability if configured. */
2822 oc = ll_mdscapa_get(inode);
2823 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2829 ptlrpc_req_finished(req);
/* For datasync on regular files, force data out to the OSTs and
 * update fd_write_failed so ll_flush() does not double-report. */
2831 if (datasync && S_ISREG(inode->i_mode)) {
2832 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2834 err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
2836 if (rc == 0 && err < 0)
2839 fd->fd_write_failed = true;
2841 fd->fd_write_failed = false;
2844 #ifdef HAVE_FILE_FSYNC_4ARGS
2845 mutex_unlock(&inode->i_mutex);
/* flock()/fcntl() byte-range lock handler.  Translates a VFS
 * file_lock into an LDLM_FLOCK enqueue against the MDT, then mirrors
 * a granted lock into the kernel's local lock tables so the VFS sees
 * consistent state.  Handles both BSD flock (FL_FLOCK) and POSIX
 * (FL_POSIX) locks. */
2850 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2852 struct inode *inode = file->f_dentry->d_inode;
2853 struct ll_sb_info *sbi = ll_i2sbi(inode);
2854 struct ldlm_enqueue_info einfo = {
2855 .ei_type = LDLM_FLOCK,
2856 .ei_cb_cp = ldlm_flock_completion_ast,
2857 .ei_cbdata = file_lock,
2859 struct md_op_data *op_data;
2860 struct lustre_handle lockh = {0};
2861 ldlm_policy_data_t flock = {{0}};
2867 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2868 inode->i_ino, file_lock);
2870 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
/* Fill the flock policy: owner and range depend on the lock family. */
2872 if (file_lock->fl_flags & FL_FLOCK) {
2873 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2874 /* flocks are whole-file locks */
2875 flock.l_flock.end = OFFSET_MAX;
2876 /* For flocks owner is determined by the local file desctiptor*/
2877 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2878 } else if (file_lock->fl_flags & FL_POSIX) {
2879 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2880 flock.l_flock.start = file_lock->fl_start;
2881 flock.l_flock.end = file_lock->fl_end;
2885 flock.l_flock.pid = file_lock->fl_pid;
2887 /* Somewhat ugly workaround for svc lockd.
2888 * lockd installs custom fl_lmops->lm_compare_owner that checks
2889 * for the fl_owner to be the same (which it always is on local node
2890 * I guess between lockd processes) and then compares pid.
2891 * As such we assign pid to the owner field to make it all work,
2892 * conflict with normal locks is unlikely since pid space and
2893 * pointer space for current->files are not intersecting */
2894 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2895 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* Map the POSIX lock type to an LDLM mode: read -> PR, unlock -> NL,
 * write -> PW. */
2897 switch (file_lock->fl_type) {
2899 einfo.ei_mode = LCK_PR;
2902 /* An unlock request may or may not have any relation to
2903 * existing locks so we may not be able to pass a lock handle
2904 * via a normal ldlm_lock_cancel() request. The request may even
2905 * unlock a byte range in the middle of an existing lock. In
2906 * order to process an unlock request we need all of the same
2907 * information that is given with a normal read or write record
2908 * lock request. To avoid creating another ldlm unlock (cancel)
2909 * message we'll treat a LCK_NL flock request as an unlock. */
2910 einfo.ei_mode = LCK_NL;
2913 einfo.ei_mode = LCK_PW;
2916 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2917 file_lock->fl_type);
/* cmd switch (partially elided): non-blocking set uses BLOCK_NOWAIT,
 * F_GETLK-style queries use TEST_LOCK. */
2932 flags = LDLM_FL_BLOCK_NOWAIT;
2938 flags = LDLM_FL_TEST_LOCK;
2939 /* Save the old mode so that if the mode in the lock changes we
2940 * can decrement the appropriate reader or writer refcount. */
2941 file_lock->fl_type = einfo.ei_mode;
2944 CERROR("unknown fcntl lock command: %d\n", cmd);
2948 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2949 LUSTRE_OPC_ANY, NULL);
2950 if (IS_ERR(op_data))
2951 RETURN(PTR_ERR(op_data));
2953 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#llx, mode=%u, "
2954 "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2955 flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
/* Take (or test) the distributed lock on the MDT. */
2957 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2958 op_data, &lockh, &flock, 0, NULL /* req */, flags);
/* Mirror a granted lock (or an unlock) into the local VFS tables. */
2960 if ((file_lock->fl_flags & FL_FLOCK) &&
2961 (rc == 0 || file_lock->fl_type == F_UNLCK))
2962 rc2 = flock_lock_file_wait(file, file_lock);
2963 if ((file_lock->fl_flags & FL_POSIX) &&
2964 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2965 !(flags & LDLM_FL_TEST_LOCK))
2966 rc2 = posix_lock_file_wait(file, file_lock);
/* Local bookkeeping failed after a server grant: release the server
 * lock again by enqueueing NL (treated as unlock, see above). */
2968 if (rc2 && file_lock->fl_type != F_UNLCK) {
2969 einfo.ei_mode = LCK_NL;
2970 md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2971 op_data, &lockh, &flock, 0, NULL /* req */, flags);
2975 ll_finish_md_op_data(op_data);
/* Lock handler installed for "-o noflock" mounts: refuses lock
 * requests (body elided here; presumably returns -ENOSYS, matching the
 * "return ENOSYS on flock calls" note at ll_file_operations_noflock —
 * TODO confirm against the full source). */
2980 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2988 * test if some locks matching bits and l_req_mode are acquired
2989 * - bits can be in different locks
2990 * - if found clear the common lock bits in *bits
2991 * - the bits not found, are kept in *bits
2993 * \param bits [IN/OUT] searched lock bits; found bits are cleared
2994 * \param l_req_mode [IN] searched lock mode
2995 * \retval boolean, true iff all bits are found
2997 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
2999 struct lustre_handle lockh;
3000 ldlm_policy_data_t policy;
/* LCK_MINMODE means "any mode": match against the union of modes. */
3001 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
3002 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
3011 fid = &ll_i2info(inode)->lli_fid;
3012 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3013 ldlm_lockname[mode]);
/* TEST_LOCK: match without taking a reference on the found lock. */
3015 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* Probe each requested inodebit individually. */
3016 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3017 policy.l_inodebits.bits = *bits & (1 << i);
3018 if (policy.l_inodebits.bits == 0)
3021 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3022 &policy, mode, &lockh)) {
3023 struct ldlm_lock *lock;
3025 lock = ldlm_handle2lock(&lockh);
3028 ~(lock->l_policy_data.l_inodebits.bits);
3029 LDLM_LOCK_PUT(lock);
3031 *bits &= ~policy.l_inodebits.bits;
/* Try to match (and take a reference on) a cached MD lock covering
 * @bits on @inode.  Returns the matched lock mode (0 if none); on
 * success @lockh holds a referenced lock handle the caller must
 * eventually release. */
3038 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
3039 struct lustre_handle *lockh, __u64 flags,
3042 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
3047 fid = &ll_i2info(inode)->lli_fid;
3048 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
/* No TEST_LOCK here: a successful match pins the lock for the caller. */
3050 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3051 fid, LDLM_IBITS, &policy, mode, lockh);
/* Post-process a revalidation RPC result: -ENOENT on an already
 * unlinked inode is downgraded to success, other errors are logged. */
3056 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3058 /* Already unlinked. Just update nlink and return success */
3059 if (rc == -ENOENT) {
3061 /* This path cannot be hit for regular files unless in
3062 * case of obscure races, so no need to validate
3064 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3066 } else if (rc != 0) {
3067 CERROR("%s: revalidate FID "DFID" error: rc = %d\n",
3068 ll_get_fsname(inode->i_sb, NULL, 0),
3069 PFID(ll_inode2fid(inode)), rc);
/* Revalidate a dentry's MD attributes against the MDT.  Two paths:
 * with OBD_CONNECT_ATTRFID an intent getattr-by-FID is used; otherwise,
 * if no covering MD lock is cached, a plain md_getattr RPC refreshes
 * the inode. */
3075 int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
3078 struct inode *inode = dentry->d_inode;
3079 struct ptlrpc_request *req = NULL;
3080 struct obd_export *exp;
3084 LASSERT(inode != NULL);
3086 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
3087 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
3089 exp = ll_i2mdexp(inode);
3091 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3092 * But under CMD case, it caused some lock issues, should be fixed
3093 * with new CMD ibits lock. See bug 12718 */
3094 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3095 struct lookup_intent oit = { .it_op = IT_GETATTR };
3096 struct md_op_data *op_data;
/* Only the LOOKUP bit requested: a lighter IT_LOOKUP intent suffices. */
3098 if (ibits == MDS_INODELOCK_LOOKUP)
3099 oit.it_op = IT_LOOKUP;
3101 /* Call getattr by fid, so do not provide name at all. */
3102 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
3103 dentry->d_inode, NULL, 0, 0,
3104 LUSTRE_OPC_ANY, NULL);
3105 if (IS_ERR(op_data))
3106 RETURN(PTR_ERR(op_data));
/* M_CHECK_STALE tells the MDT to verify the inode is still valid. */
3108 oit.it_create_mode |= M_CHECK_STALE;
3109 rc = md_intent_lock(exp, op_data, NULL, 0,
3110 /* we are not interested in name
3113 ll_md_blocking_ast, 0);
3114 ll_finish_md_op_data(op_data);
3115 oit.it_create_mode &= ~M_CHECK_STALE;
3117 rc = ll_inode_revalidate_fini(inode, rc);
3121 rc = ll_revalidate_it_finish(req, &oit, dentry);
3123 ll_intent_release(&oit);
3127 /* Unlinked? Unhash dentry, so it is not picked up later by
3128 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3129 here to preserve get_cwd functionality on 2.6.
3131 if (!dentry->d_inode->i_nlink)
3132 d_lustre_invalidate(dentry, 0);
3134 ll_lookup_finish_locks(&oit, dentry);
/* No ATTRFID support: only issue a getattr when the needed ibits are
 * not already covered by a cached MD lock. */
3135 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3136 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3137 obd_valid valid = OBD_MD_FLGETATTR;
3138 struct md_op_data *op_data;
/* Regular files also need striping EA; size the reply buffer for it. */
3141 if (S_ISREG(inode->i_mode)) {
3142 rc = ll_get_max_mdsize(sbi, &ealen);
3145 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3148 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3149 0, ealen, LUSTRE_OPC_ANY,
3151 if (IS_ERR(op_data))
3152 RETURN(PTR_ERR(op_data));
3154 op_data->op_valid = valid;
3155 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
3156 * capa for this inode. Because we only keep capas of dirs
3158 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3159 ll_finish_md_op_data(op_data);
3161 rc = ll_inode_revalidate_fini(inode, rc);
/* Apply the fresh attributes from the reply to the inode. */
3165 rc = ll_prep_inode(&inode, req, NULL, NULL);
3168 ptlrpc_req_finished(req);
/* Full revalidation: refresh MD attributes, then for regular files
 * refresh the size from the OSTs with a glimpse (unless a restore is
 * in progress, when the MDT-provided size is already authoritative). */
3172 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
3175 struct inode *inode = dentry->d_inode;
3179 rc = __ll_inode_revalidate_it(dentry, it, ibits);
3183 /* if object isn't regular file, don't validate size */
3184 if (!S_ISREG(inode->i_mode)) {
/* Non-regular files take their timestamps straight from the LVB. */
3185 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
3186 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
3187 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
3189 /* In case of restore, the MDT has the right size and has
3190 * already send it back without granting the layout lock,
3191 * inode is up-to-date so glimpse is useless.
3192 * Also to glimpse we need the layout, in case of a running
3193 * restore the MDT holds the layout lock so the glimpse will
3194 * block up to the end of restore (getattr will block)
3196 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3197 rc = ll_glimpse_size(inode);
/* getattr with an explicit lookup intent: revalidate UPDATE|LOOKUP
 * ibits, then fill *stat from the (now fresh) inode fields. */
3202 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
3203 struct lookup_intent *it, struct kstat *stat)
3205 struct inode *inode = de->d_inode;
3206 struct ll_sb_info *sbi = ll_i2sbi(inode);
3207 struct ll_inode_info *lli = ll_i2info(inode);
3210 res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
3211 MDS_INODELOCK_LOOKUP);
3212 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3217 stat->dev = inode->i_sb->s_dev;
/* 32-bit userspace gets a squashed FID-derived inode number. */
3218 if (ll_need_32bit_api(sbi))
3219 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3221 stat->ino = inode->i_ino;
3222 stat->mode = inode->i_mode;
3223 stat->nlink = inode->i_nlink;
3224 stat->uid = inode->i_uid;
3225 stat->gid = inode->i_gid;
3226 stat->rdev = inode->i_rdev;
3227 stat->atime = inode->i_atime;
3228 stat->mtime = inode->i_mtime;
3229 stat->ctime = inode->i_ctime;
3230 stat->blksize = 1 << inode->i_blkbits;
3232 stat->size = i_size_read(inode);
3233 stat->blocks = inode->i_blocks;
3237 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3239 struct lookup_intent it = { .it_op = IT_GETATTR };
3241 return ll_getattr_it(mnt, de, &it, stat);
3244 int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3245 __u64 start, __u64 len)
3249 struct ll_user_fiemap *fiemap;
3250 unsigned int extent_count = fieinfo->fi_extents_max;
3252 num_bytes = sizeof(*fiemap) + (extent_count *
3253 sizeof(struct ll_fiemap_extent));
3254 OBD_ALLOC_LARGE(fiemap, num_bytes);
3259 fiemap->fm_flags = fieinfo->fi_flags;
3260 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3261 fiemap->fm_start = start;
3262 fiemap->fm_length = len;
3263 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3264 sizeof(struct ll_fiemap_extent));
3266 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3268 fieinfo->fi_flags = fiemap->fm_flags;
3269 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3270 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3271 fiemap->fm_mapped_extents * sizeof(struct ll_fiemap_extent));
3273 OBD_FREE_LARGE(fiemap, num_bytes);
/* ->get_acl handler: return a referenced copy of the cached POSIX ACL
 * for @inode (only ACL_TYPE_ACCESS appears to be cached here). */
3277 struct posix_acl * ll_get_acl(struct inode *inode, int type)
3279 struct ll_inode_info *lli = ll_i2info(inode);
3280 struct posix_acl *acl = NULL;
/* lli_lock guards lli_posix_acl against concurrent setxattr updates. */
3283 spin_lock(&lli->lli_lock);
3284 /* VFS' acl_permission_check->check_acl will release the refcount */
3285 acl = posix_acl_dup(lli->lli_posix_acl);
3286 spin_unlock(&lli->lli_lock);
/* ACL permission callback for kernels whose generic_permission() takes
 * a check_acl hook (pre-2-arg era).  Prototype varies with the
 * 4-args/2-args kernel API; the RCU path bails out because the cached
 * ACL cannot be safely consulted without sleeping. */
3291 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3293 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3294 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3296 ll_check_acl(struct inode *inode, int mask)
3299 # ifdef CONFIG_FS_POSIX_ACL
3300 struct posix_acl *acl;
3304 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
/* Under RCU walk we may not block; caller retries in ref-walk mode. */
3305 if (flags & IPERM_FLAG_RCU)
3308 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3313 rc = posix_acl_permission(inode, acl, mask);
3314 posix_acl_release(acl);
3317 # else /* !CONFIG_FS_POSIX_ACL */
3319 # endif /* CONFIG_FS_POSIX_ACL */
3321 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/* ->permission handler.  Revalidates the root inode on first touch,
 * defers to remote-permission checking on RMT_CLIENT mounts, otherwise
 * runs the generic permission check with the Lustre ACL callback.
 * Three prototypes are supported via kernel-API #ifdefs. */
3323 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3324 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3326 # ifdef HAVE_INODE_PERMISION_2ARGS
3327 int ll_inode_permission(struct inode *inode, int mask)
3329 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
/* In RCU walk mode we cannot sleep (RPCs below may block): bail out. */
3336 #ifdef MAY_NOT_BLOCK
3337 if (mask & MAY_NOT_BLOCK)
3339 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3340 if (flags & IPERM_FLAG_RCU)
3344 /* as root inode are NOT getting validated in lookup operation,
3345 * need to do it before permission check. */
3347 if (inode == inode->i_sb->s_root->d_inode) {
3348 struct lookup_intent it = { .it_op = IT_LOOKUP };
3350 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
3351 MDS_INODELOCK_LOOKUP);
3356 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
3357 inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
/* Remote-client mounts ask the MDT to evaluate permissions. */
3359 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
3360 return lustre_check_remote_perm(inode, mask);
3362 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
3363 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3368 /* -o localflock - only provides locally consistent flock locks */
/* Default file_operations table: no .flock/.lock entries, so the VFS
 * falls back to purely local (node-consistent) locking. */
3369 struct file_operations ll_file_operations = {
3370 .read = ll_file_read,
3371 .aio_read = ll_file_aio_read,
3372 .write = ll_file_write,
3373 .aio_write = ll_file_aio_write,
3374 .unlocked_ioctl = ll_file_ioctl,
3375 .open = ll_file_open,
3376 .release = ll_file_release,
3377 .mmap = ll_file_mmap,
3378 .llseek = ll_file_seek,
3379 .splice_read = ll_file_splice_read,
/* file_operations for "-o flock" mounts: identical to the default
 * table but routes flock()/fcntl() locks through ll_file_flock() for
 * cluster-wide consistency. */
3384 struct file_operations ll_file_operations_flock = {
3385 .read = ll_file_read,
3386 .aio_read = ll_file_aio_read,
3387 .write = ll_file_write,
3388 .aio_write = ll_file_aio_write,
3389 .unlocked_ioctl = ll_file_ioctl,
3390 .open = ll_file_open,
3391 .release = ll_file_release,
3392 .mmap = ll_file_mmap,
3393 .llseek = ll_file_seek,
3394 .splice_read = ll_file_splice_read,
3397 .flock = ll_file_flock,
3398 .lock = ll_file_flock
3401 /* These are for -o noflock - to return ENOSYS on flock calls */
/* file_operations for "-o noflock" mounts: lock entry points are wired
 * to ll_file_noflock so applications get an explicit refusal. */
3402 struct file_operations ll_file_operations_noflock = {
3403 .read = ll_file_read,
3404 .aio_read = ll_file_aio_read,
3405 .write = ll_file_write,
3406 .aio_write = ll_file_aio_write,
3407 .unlocked_ioctl = ll_file_ioctl,
3408 .open = ll_file_open,
3409 .release = ll_file_release,
3410 .mmap = ll_file_mmap,
3411 .llseek = ll_file_seek,
3412 .splice_read = ll_file_splice_read,
3415 .flock = ll_file_noflock,
3416 .lock = ll_file_noflock
/* inode_operations for regular Lustre files.  .get_acl is only present
 * on kernels that have the hook (HAVE_IOP_GET_ACL). */
3419 struct inode_operations ll_file_inode_operations = {
3420 .setattr = ll_setattr,
3421 .getattr = ll_getattr,
3422 .permission = ll_inode_permission,
3423 .setxattr = ll_setxattr,
3424 .getxattr = ll_getxattr,
3425 .listxattr = ll_listxattr,
3426 .removexattr = ll_removexattr,
3427 .fiemap = ll_fiemap,
3428 #ifdef HAVE_IOP_GET_ACL
3429 .get_acl = ll_get_acl,
3433 /* dynamic ioctl number support routins */
/* Registry of dynamically registered ioctl handlers: a list of
 * llioc_data entries protected by a reader/writer semaphore. */
3434 static struct llioc_ctl_data {
3435 struct rw_semaphore ioc_sem;
3436 cfs_list_t ioc_head;
3438 __RWSEM_INITIALIZER(llioc.ioc_sem),
3439 CFS_LIST_HEAD_INIT(llioc.ioc_head)
/* One registration: callback plus a trailing flexible array of the
 * ioctl command numbers it handles (iocd_cmd[0] idiom). */
3444 cfs_list_t iocd_list;
3445 unsigned int iocd_size;
3446 llioc_callback_t iocd_cb;
3447 unsigned int iocd_count;
3448 unsigned int iocd_cmd[0];
/* Register a dynamic ioctl handler: allocate an llioc_data carrying
 * @cb and a copy of the @count command numbers, and append it to the
 * global list.  The returned pointer is the opaque "magic" used by
 * ll_iocontrol_unregister(). */
3451 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3454 struct llioc_data *in_data = NULL;
3457 if (cb == NULL || cmd == NULL ||
3458 count > LLIOC_MAX_CMD || count < 0)
3461 size = sizeof(*in_data) + count * sizeof(unsigned int);
3462 OBD_ALLOC(in_data, size);
3463 if (in_data == NULL)
/* NOTE(review): OBD_ALLOC normally zero-fills, which would make this
 * memset redundant — harmless either way; confirm macro semantics. */
3466 memset(in_data, 0, sizeof(*in_data));
3467 in_data->iocd_size = size;
3468 in_data->iocd_cb = cb;
3469 in_data->iocd_count = count;
3470 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
/* Writer lock: registrations are rare, lookups take the read side. */
3472 down_write(&llioc.ioc_sem);
3473 cfs_list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3474 up_write(&llioc.ioc_sem);
/* Remove a previously registered ioctl handler identified by the
 * opaque @magic cookie returned from ll_iocontrol_register(), freeing
 * its entry.  Warns if no matching registration is found. */
3479 void ll_iocontrol_unregister(void *magic)
3481 struct llioc_data *tmp;
3486 down_write(&llioc.ioc_sem);
3487 cfs_list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* Size is captured before unlink so the free uses the right length. */
3489 unsigned int size = tmp->iocd_size;
3491 cfs_list_del(&tmp->iocd_list);
/* Drop the semaphore before freeing; the entry is already unlinked. */
3492 up_write(&llioc.ioc_sem);
3494 OBD_FREE(tmp, size);
3498 up_write(&llioc.ioc_sem);
3500 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3503 EXPORT_SYMBOL(ll_iocontrol_register);
3504 EXPORT_SYMBOL(ll_iocontrol_unregister);
/* Dispatch an unrecognized ioctl to the dynamically registered
 * handlers.  Walks every registration looking for @cmd; a callback
 * returning LLIOC_STOP ends the search and its rc (via *rcp) becomes
 * the ioctl result. */
3506 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
3507 unsigned int cmd, unsigned long arg, int *rcp)
3509 enum llioc_iter ret = LLIOC_CONT;
3510 struct llioc_data *data;
3511 int rc = -EINVAL, i;
/* Read side only: dispatch never mutates the registry. */
3513 down_read(&llioc.ioc_sem);
3514 cfs_list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3515 for (i = 0; i < data->iocd_count; i++) {
3516 if (cmd != data->iocd_cmd[i])
3519 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3523 if (ret == LLIOC_STOP)
3526 up_read(&llioc.ioc_sem);
/* Push a layout configuration change down to the cl_object stack.
 * For OBJECT_CONF_SET, the layout lock is only allowed to match other
 * requests after the new layout has been applied, so stale layouts are
 * never observable. */
3533 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3535 struct ll_inode_info *lli = ll_i2info(inode);
3536 struct cl_env_nest nest;
/* No cl_object yet means nothing to (re)configure. */
3541 if (lli->lli_clob == NULL)
3544 env = cl_env_nested_get(&nest);
3546 RETURN(PTR_ERR(env));
3548 result = cl_conf_set(env, lli->lli_clob, conf);
3549 cl_env_nested_put(&nest, env);
3551 if (conf->coc_opc == OBJECT_CONF_SET) {
3552 struct ldlm_lock *lock = conf->coc_lock;
3554 LASSERT(lock != NULL);
3555 LASSERT(ldlm_has_layout(lock));
3557 /* it can only be allowed to match after layout is
3558 * applied to inode otherwise false layout would be
3559 * seen. Applying layout shoud happen before dropping
3560 * the intent lock. */
3561 ldlm_lock_allow_match(lock);
3567 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
3568 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3571 struct ll_sb_info *sbi = ll_i2sbi(inode);
3572 struct obd_capa *oc;
3573 struct ptlrpc_request *req;
3574 struct mdt_body *body;
3581 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3582 PFID(ll_inode2fid(inode)), !!(lock->l_flags & LDLM_FL_LVB_READY),
3583 lock->l_lvb_data, lock->l_lvb_len);
/* LVB already populated and ready: nothing to fetch. */
3585 if ((lock->l_lvb_data != NULL) && (lock->l_flags & LDLM_FL_LVB_READY))
3588 /* if layout lock was granted right away, the layout is returned
3589 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3590 * blocked and then granted via completion ast, we have to fetch
3591 * layout here. Please note that we can't use the LVB buffer in
3592 * completion AST because it doesn't have a large enough buffer */
3593 oc = ll_mdscapa_get(inode);
3594 rc = ll_get_max_mdsize(sbi, &lmmsize);
/* Retrieve the LOV EA (the layout) directly with getxattr. */
3596 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3597 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3603 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
/* Sanity-check the reply: EA must fit in the negotiated max size. */
3604 if (body == NULL || body->eadatasize > lmmsize)
3605 GOTO(out, rc = -EPROTO);
3607 lmmsize = body->eadatasize;
3608 if (lmmsize == 0) /* empty layout */
3611 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3613 GOTO(out, rc = -EFAULT);
/* Copy the EA into a private buffer and install it as the lock LVB,
 * replacing (and freeing) any stale LVB under the resource lock. */
3615 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3616 if (lvbdata == NULL)
3617 GOTO(out, rc = -ENOMEM);
3619 memcpy(lvbdata, lmm, lmmsize);
3620 lock_res_and_lock(lock);
3621 if (lock->l_lvb_data != NULL)
3622 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3624 lock->l_lvb_data = lvbdata;
3625 lock->l_lvb_len = lmmsize;
3626 unlock_res_and_lock(lock);
3631 ptlrpc_req_finished(req);
3636 * Apply the layout to the inode. Layout lock is held and will be released
* on exit (the ldlm_lock_decref() below); @mode is the mode it was
* matched/granted with.
3639 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3640 struct inode *inode, __u32 *gen, bool reconf)
3642 struct ll_inode_info *lli = ll_i2info(inode);
3643 struct ll_sb_info *sbi = ll_i2sbi(inode);
3644 struct ldlm_lock *lock;
3645 struct lustre_md md = { NULL };
3646 struct cl_object_conf conf;
3649 bool wait_layout = false;
3652 LASSERT(lustre_handle_is_used(lockh));
3654 lock = ldlm_handle2lock(lockh);
3655 LASSERT(lock != NULL);
3656 LASSERT(ldlm_has_layout(lock));
3658 LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
3659 inode, PFID(&lli->lli_fid), reconf);
3661 /* in case this is a caching lock and reinstate with new inode */
3662 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3664 lock_res_and_lock(lock);
3665 lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
3666 unlock_res_and_lock(lock);
3667 /* checking lvb_ready is racy but this is okay. The worst case is
3668 * that multi processes may configure the file on the same time. */
/* Fast path: layout already applied (or caller doesn't want reconf);
 * just report the current generation. */
3670 if (lvb_ready || !reconf) {
3673 /* layout_gen must be valid if layout lock is not
3674 * cancelled and stripe has already set */
3675 *gen = lli->lli_layout_gen;
/* Slow path: pull the layout EA into the lock's LVB if needed. */
3681 rc = ll_layout_fetch(inode, lock);
3685 /* for layout lock, lmm is returned in lock's lvb.
3686 * lvb_data is immutable if the lock is held so it's safe to access it
3687 * without res lock. See the description in ldlm_lock_decref_internal()
3688 * for the condition to free lvb_data of layout lock */
3689 if (lock->l_lvb_data != NULL) {
3690 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3691 lock->l_lvb_data, lock->l_lvb_len);
3693 *gen = LL_LAYOUT_GEN_EMPTY;
3695 *gen = md.lsm->lsm_layout_gen;
3698 CERROR("%s: file "DFID" unpackmd error: %d\n",
3699 ll_get_fsname(inode->i_sb, NULL, 0),
3700 PFID(&lli->lli_fid), rc);
3706 /* set layout to file. Unlikely this will fail as old layout was
3707 * surely eliminated */
3708 memset(&conf, 0, sizeof conf);
3709 conf.coc_opc = OBJECT_CONF_SET;
3710 conf.coc_inode = inode;
3711 conf.coc_lock = lock;
3712 conf.u.coc_md = &md;
3713 rc = ll_layout_conf(inode, &conf);
3716 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3718 /* refresh layout failed, need to wait */
3719 wait_layout = rc == -EBUSY;
/* Release the layout lock reference taken by our caller's match. */
3723 LDLM_LOCK_PUT(lock);
3724 ldlm_lock_decref(lockh, mode);
3726 /* wait for IO to complete if it's still being used. */
3728 CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
3729 ll_get_fsname(inode->i_sb, NULL, 0),
3730 inode, PFID(&lli->lli_fid));
/* OBJECT_CONF_WAIT blocks until in-flight IO under the old layout
 * drains, after which the caller will retry the refresh. */
3732 memset(&conf, 0, sizeof conf);
3733 conf.coc_opc = OBJECT_CONF_WAIT;
3734 conf.coc_inode = inode;
3735 rc = ll_layout_conf(inode, &conf);
3739 CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
3740 PFID(&lli->lli_fid), rc);
3746 * This function checks if there exists a LAYOUT lock on the client side,
3747 * or enqueues it if it doesn't have one in cache.
3749 * This function will not hold layout lock so it may be revoked any time after
3750 * this function returns. Any operations depend on layout should be redone
* in that case (under a fresh layout lock).
3753 * This function should be called before lov_io_init() to get an uptodate
3754 * layout version, the caller should save the version number and after IO
3755 * is finished, this function should be called again to verify that layout
3756 * is not changed during IO time.
3758 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3760 struct ll_inode_info *lli = ll_i2info(inode);
3761 struct ll_sb_info *sbi = ll_i2sbi(inode);
3762 struct md_op_data *op_data;
3763 struct lookup_intent it;
3764 struct lustre_handle lockh;
3766 struct ldlm_enqueue_info einfo = {
3767 .ei_type = LDLM_IBITS,
3769 .ei_cb_bl = ll_md_blocking_ast,
3770 .ei_cb_cp = ldlm_completion_ast,
3775 *gen = lli->lli_layout_gen;
/* Layout locking disabled on this mount: current gen is good enough. */
3776 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
3780 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3781 LASSERT(S_ISREG(inode->i_mode));
3783 /* mostly layout lock is caching on the local side, so try to match
3784 * it before grabbing layout lock mutex. */
3785 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3786 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3787 if (mode != 0) { /* hit cached lock */
3788 rc = ll_layout_lock_set(&lockh, mode, inode, gen, false);
3792 /* better hold lli_layout_mutex to try again otherwise
3793 * it will have starvation problem. */
3796 /* take layout lock mutex to enqueue layout lock exclusively. */
3797 mutex_lock(&lli->lli_layout_mutex);
3800 /* try again. Maybe somebody else has done this. */
3801 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3802 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3803 if (mode != 0) { /* hit cached lock */
/* reconf=true: another thread may have changed the layout meanwhile. */
3804 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3808 mutex_unlock(&lli->lli_layout_mutex);
3812 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3813 0, 0, LUSTRE_OPC_ANY, NULL);
3814 if (IS_ERR(op_data)) {
3815 mutex_unlock(&lli->lli_layout_mutex);
3816 RETURN(PTR_ERR(op_data));
3819 /* have to enqueue one */
3820 memset(&it, 0, sizeof(it));
3821 it.it_op = IT_LAYOUT;
3822 lockh.cookie = 0ULL;
3824 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
3825 ll_get_fsname(inode->i_sb, NULL, 0), inode,
3826 PFID(&lli->lli_fid));
3828 rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
/* The intent request itself is not needed once the lock is granted. */
3830 if (it.d.lustre.it_data != NULL)
3831 ptlrpc_req_finished(it.d.lustre.it_data);
3832 it.d.lustre.it_data = NULL;
3834 ll_finish_md_op_data(op_data);
/* Transfer lock ownership out of the intent before dropping it. */
3836 mode = it.d.lustre.it_lock_mode;
3837 it.d.lustre.it_lock_mode = 0;
3838 ll_intent_drop_lock(&it);
3841 /* set lock data in case this is a new lock */
3842 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3843 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3847 mutex_unlock(&lli->lli_layout_mutex);
3853 * This function send a restore request to the MDT
3855 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
3857 struct hsm_user_request *hur;
3861 len = sizeof(struct hsm_user_request) +
3862 sizeof(struct hsm_user_item);
3863 OBD_ALLOC(hur, len);
3867 hur->hur_request.hr_action = HUA_RESTORE;
3868 hur->hur_request.hr_archive_id = 0;
3869 hur->hur_request.hr_flags = 0;
3870 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
3871 sizeof(hur->hur_user_item[0].hui_fid));
3872 hur->hur_user_item[0].hui_extent.offset = offset;
3873 hur->hur_user_item[0].hui_extent.length = length;
3874 hur->hur_request.hr_itemcount = 1;
3875 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,