4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2013, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
51 #include "cl_object.h"
/* NOTE(review): this copy of the file is sampled — the embedded original
 * line numbers skip values, so statements (braces, NULL checks, returns)
 * are missing inside every function below.  Comments hedge accordingly. */

/*
 * Allocate a per-open-file ll_file_data from the dedicated slab cache.
 * NOTE(review): the allocation-failure check and the return of @fd are
 * not visible here — presumably NULL is returned on OOM; confirm against
 * the full source.
 */
53 struct ll_file_data *ll_file_data_get(void)
55 struct ll_file_data *fd;
57 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
61 fd->fd_write_failed = false;
/* Return a ll_file_data to its slab cache (counterpart of
 * ll_file_data_get()). */
66 static void ll_file_data_put(struct ll_file_data *fd)
69 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Copy the inode's current attributes (mode, times, size, blocks, flags),
 * its I/O epoch, the given open handle @fh, and the MDS capability into
 * @op_data, in preparation for sending an MDS operation (e.g. close).
 * Also sets MDS_DATA_MODIFIED in op_bias when the inode carries the
 * LLIF_DATA_MODIFIED flag.
 */
72 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
73 struct lustre_handle *fh)
75 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
76 op_data->op_attr.ia_mode = inode->i_mode;
77 op_data->op_attr.ia_atime = inode->i_atime;
78 op_data->op_attr.ia_mtime = inode->i_mtime;
79 op_data->op_attr.ia_ctime = inode->i_ctime;
80 op_data->op_attr.ia_size = i_size_read(inode);
81 op_data->op_attr_blocks = inode->i_blocks;
/* ia_attr_flags lives in the Lustre-extended iattr; translate the kernel
 * inode flags to Lustre's on-wire "ext" flag representation. */
82 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
83 ll_inode_to_ext_flags(inode->i_flags);
84 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
86 op_data->op_handle = *fh;
87 op_data->op_capa1 = ll_mdscapa_get(inode);
/* Tell the MDS the data was modified so it can update ctime/mtime
 * authoritatively on close. */
89 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
90 op_data->op_bias |= MDS_DATA_MODIFIED;
/*
 * Closes the IO epoch and packs all the attributes into @op_data for
 * a subsequent md_close() RPC.
 * For write opens on SOM-capable mounts, size/blocks validity depends on
 * the epoch-close outcome; for non-write opens the attribute set is just
 * mode + times.
 */
97 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
98 struct obd_client_handle *och)
102 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
103 ATTR_MTIME | ATTR_MTIME_SET |
104 ATTR_CTIME | ATTR_CTIME_SET;
106 if (!(och->och_flags & FMODE_WRITE))
/* Without Size-on-MDS support (or for non-regular files) the client's
 * size/blocks are sent directly. */
109 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
110 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* NOTE(review): &och is passed here although och is already a pointer;
 * lines are missing around this call in this copy — verify the intended
 * argument (likely ll_ioepoch_close(inode, op_data, &och, 0) matches the
 * callee's struct obd_client_handle ** parameter). */
112 ll_ioepoch_close(inode, op_data, &och, 0);
115 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
116 ll_prep_md_op_data(op_data, inode, NULL, NULL,
117 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Send an MDS close RPC for the open handle @och on @inode.
 *
 * If @data_version is non-NULL this is an HSM release: the version is
 * packed into the request together with the lease handle, and the reply
 * is checked for OBD_MD_FLRELEASED.
 * On MDS request for Size-on-MDS update (per the missing branch around
 * md_close()'s return), ll_som_update() fetches attributes from the OSTs
 * and sends them back.  The request and @op_data are released before
 * returning; @och is marked dead unless DONE_WRITING is still pending.
 */
121 static int ll_close_inode_openhandle(struct obd_export *md_exp,
123 struct obd_client_handle *och,
124 const __u64 *data_version)
126 struct obd_export *exp = ll_i2mdexp(inode);
127 struct md_op_data *op_data;
128 struct ptlrpc_request *req = NULL;
129 struct obd_device *obd = class_exp2obd(exp);
/* Sanity check on the MDC export; with LMV the correctness of reaching
 * the underlying MDC handle this way is an open question (see XXX). */
136 * XXX: in case of LMV, is this correct to access
139 CERROR("Invalid MDC connection handle "LPX64"\n",
140 ll_i2mdexp(inode)->exp_handle.h_cookie);
144 OBD_ALLOC_PTR(op_data);
146 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
148 ll_prepare_close(inode, op_data, och);
149 if (data_version != NULL) {
150 /* Pass in data_version implies release. */
151 op_data->op_bias |= MDS_HSM_RELEASE;
152 op_data->op_data_version = *data_version;
153 op_data->op_lease_handle = och->och_lease_handle;
154 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
156 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
157 rc = md_close(md_exp, op_data, och->och_mod, &req);
/* NOTE(review): the branch condition selecting this path (presumably
 * rc == -EAGAIN from md_close) is missing from this copy. */
159 /* This close must have the epoch closed. */
160 LASSERT(epoch_close);
161 /* MDS has instructed us to obtain Size-on-MDS attribute from
162 * OSTs and send setattr to back to MDS. */
163 rc = ll_som_update(inode, op_data);
165 CERROR("%s: inode "DFID" mdc Size-on-MDS update"
166 " failed: rc = %d\n",
167 ll_i2mdexp(inode)->exp_obd->obd_name,
168 PFID(ll_inode2fid(inode)), rc);
172 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
173 ll_i2mdexp(inode)->exp_obd->obd_name,
174 PFID(ll_inode2fid(inode)), rc);
177 /* DATA_MODIFIED flag was successfully sent on close, cancel data
178 * modification flag. */
179 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
180 struct ll_inode_info *lli = ll_i2info(inode);
182 spin_lock(&lli->lli_lock);
183 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
184 spin_unlock(&lli->lli_lock);
/* Destroy OST objects listed in the close reply (unlink case). */
188 rc = ll_objects_destroy(req, inode);
190 CERROR("%s: inode "DFID
191 " ll_objects destroy: rc = %d\n",
192 ll_i2mdexp(inode)->exp_obd->obd_name,
193 PFID(ll_inode2fid(inode)), rc);
/* For HSM release, verify the server actually released the file. */
196 if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
197 struct mdt_body *body;
198 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
199 if (!(body->valid & OBD_MD_FLRELEASED))
203 ll_finish_md_op_data(op_data);
/* If the epoch did not close here, DONE_WRITING must follow later. */
207 if (exp_connect_som(exp) && !epoch_close &&
208 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
209 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
211 md_clear_open_replay_data(md_exp, och);
212 /* Free @och if it is not waiting for DONE_WRITING. */
213 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
216 if (req) /* This is close request */
217 ptlrpc_req_finished(req);
/*
 * Close the MDS open handle of the given open mode (write/exec/read) on
 * @inode, unless other users of the same handle remain.  The per-mode
 * handle pointer and use count live in ll_inode_info and are protected
 * by lli_och_mutex; the actual close RPC is issued outside the mutex.
 */
221 int ll_md_real_close(struct inode *inode, fmode_t fmode)
223 struct ll_inode_info *lli = ll_i2info(inode);
224 struct obd_client_handle **och_p;
225 struct obd_client_handle *och;
/* Select the handle slot and use counter matching the open mode. */
230 if (fmode & FMODE_WRITE) {
231 och_p = &lli->lli_mds_write_och;
232 och_usecount = &lli->lli_open_fd_write_count;
233 } else if (fmode & FMODE_EXEC) {
234 och_p = &lli->lli_mds_exec_och;
235 och_usecount = &lli->lli_open_fd_exec_count;
237 LASSERT(fmode & FMODE_READ);
238 och_p = &lli->lli_mds_read_och;
239 och_usecount = &lli->lli_open_fd_read_count;
242 mutex_lock(&lli->lli_och_mutex);
243 if (*och_usecount > 0) {
244 /* There are still users of this handle, so skip
246 mutex_unlock(&lli->lli_och_mutex);
/* NOTE(review): the detach of *och_p under the mutex is missing in this
 * copy; presumably och = *och_p; *och_p = NULL before unlocking. */
252 mutex_unlock(&lli->lli_och_mutex);
255 /* There might be a race and this handle may already
257 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-struct-file close path: drop group lock and lease if held, close a
 * private open handle (fd_och), decrement the per-mode open counter, and
 * either keep the MDS open handle cached (if a matching OPEN lock is
 * granted) or do the real MDS close via ll_md_real_close().  Finally
 * frees the ll_file_data and closes the capability.
 */
264 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
267 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
268 struct ll_inode_info *lli = ll_i2info(inode);
272 /* clear group lock, if present */
273 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
274 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
276 if (fd->fd_lease_och != NULL) {
279 /* Usually the lease is not released when the
280 * application crashed, we need to release here. */
281 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
282 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
283 PFID(&lli->lli_fid), rc, lease_broken);
285 fd->fd_lease_och = NULL;
/* fd_och is a private open handle (lease-related); close it directly. */
288 if (fd->fd_och != NULL) {
289 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
294 /* Let's see if we have good enough OPEN lock on the file and if
295 we can skip talking to MDS */
296 if (file->f_dentry->d_inode) { /* Can this ever be false? */
298 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
299 struct lustre_handle lockh;
300 struct inode *inode = file->f_dentry->d_inode;
301 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
303 mutex_lock(&lli->lli_och_mutex);
/* Decrement the open count for this descriptor's open mode.
 * NOTE(review): the lockmode assignments (LCK_CW/CR/PR per branch) are
 * missing from this copy. */
304 if (fd->fd_omode & FMODE_WRITE) {
306 LASSERT(lli->lli_open_fd_write_count);
307 lli->lli_open_fd_write_count--;
308 } else if (fd->fd_omode & FMODE_EXEC) {
310 LASSERT(lli->lli_open_fd_exec_count);
311 lli->lli_open_fd_exec_count--;
314 LASSERT(lli->lli_open_fd_read_count);
315 lli->lli_open_fd_read_count--;
317 mutex_unlock(&lli->lli_och_mutex);
/* Without a granted OPEN ibits lock, the handle cannot be cached:
 * do the real close now. */
319 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
320 LDLM_IBITS, &policy, lockmode,
322 rc = ll_md_real_close(file->f_dentry->d_inode,
326 CERROR("Releasing a file %p with negative dentry %p. Name %s",
327 file, file->f_dentry, file->f_dentry->d_name.name);
331 LUSTRE_FPRIVATE(file) = NULL;
332 ll_file_data_put(fd);
333 ll_capa_close(inode);
/* While this returns an error code, fput() the caller does not, so we need
 * to make every effort to clean up all of our state here. Also, applications
 * rarely check close errors and even if an error is returned they will not
 * re-try the close call.
 *
 * VFS ->release() entry point: tears down remote-ACL session state for
 * the root inode, stops the statahead thread if this descriptor owns it,
 * clears async write errors into the return code path, and delegates the
 * MDS close to ll_md_close().  The root directory is special-cased: its
 * ll_file_data is freed without an MDS close.
 */
343 int ll_file_release(struct inode *inode, struct file *file)
345 struct ll_file_data *fd;
346 struct ll_sb_info *sbi = ll_i2sbi(inode);
347 struct ll_inode_info *lli = ll_i2info(inode);
351 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
352 PFID(ll_inode2fid(inode)), inode);
354 #ifdef CONFIG_FS_POSIX_ACL
/* Remote-client ACL bookkeeping applies only to the filesystem root. */
355 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
356 inode == inode->i_sb->s_root->d_inode) {
357 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
360 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
361 fd->fd_flags &= ~LL_FILE_RMTACL;
362 rct_del(&sbi->ll_rct, current_pid());
363 et_search_free(&sbi->ll_et, current_pid());
/* Don't count releases of the root itself in the stats. */
368 if (inode->i_sb->s_root != file->f_dentry)
369 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
370 fd = LUSTRE_FPRIVATE(file);
373 /* The last ref on @file, maybe not the the owner pid of statahead.
374 * Different processes can open the same dir, "ll_opendir_key" means:
375 * it is me that should stop the statahead thread. */
376 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
377 lli->lli_opendir_pid != 0)
378 ll_stop_statahead(inode, lli->lli_opendir_key)/* root: no MDS close needed, just drop the private data */;
380 if (inode->i_sb->s_root == file->f_dentry) {
381 LUSTRE_FPRIVATE(file) = NULL;
382 ll_file_data_put(fd);
/* Fold any asynchronous write error into this close's return code. */
386 if (!S_ISDIR(inode->i_mode)) {
387 if (lli->lli_clob != NULL)
388 lov_read_and_clear_async_rc(lli->lli_clob);
389 lli->lli_async_rc = 0;
392 rc = ll_md_close(sbi->ll_md_exp, inode, file);
394 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
395 libcfs_debug_dumplog();
/*
 * Issue an IT_OPEN intent lock request to the MDS for @file, optionally
 * carrying striping info (@lmm/@lmmsize) when setting stripe parameters.
 * On success the new inode attributes from the reply are applied via
 * ll_prep_inode() and any granted lock is attached to the inode.
 */
400 static int ll_intent_file_open(struct file *file, void *lmm,
401 int lmmsize, struct lookup_intent *itp)
403 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
404 struct dentry *parent = file->f_dentry->d_parent;
405 const char *name = file->f_dentry->d_name.name;
406 const int len = file->f_dentry->d_name.len;
407 struct md_op_data *op_data;
408 struct ptlrpc_request *req;
409 __u32 opc = LUSTRE_OPC_ANY;
416 /* Usually we come here only for NFSD, and we want open lock.
417 But we can also get here with pre 2.6.15 patchless kernels, and in
418 that case that lock is also ok */
419 /* We can also get here if there was cached open handle in revalidate_it
420 * but it disappeared while we were getting from there to ll_file_open.
421 * But this means this file was closed and immediatelly opened which
422 * makes a good candidate for using OPEN lock */
423 /* If lmmsize & lmm are not 0, we are just setting stripe info
424 * parameters. No need for the open lock */
425 if (lmm == NULL && lmmsize == 0) {
426 itp->it_flags |= MDS_OPEN_LOCK;
427 if (itp->it_flags & FMODE_WRITE)
428 opc = LUSTRE_OPC_CREATE;
431 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
432 file->f_dentry->d_inode, name, len,
435 RETURN(PTR_ERR(op_data));
/* We already know the FID of the target; tell the MDS to open by FID. */
437 itp->it_flags |= MDS_OPEN_BY_FID;
438 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
439 0 /*unused */, &req, ll_md_blocking_ast, 0);
440 ll_finish_md_op_data(op_data);
/* NOTE(review): the -ESTALE branch condition is missing in this copy;
 * the following handles that special exit path. */
442 /* reason for keep own exit path - don`t flood log
443 * with messages with -ESTALE errors.
445 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
446 it_open_error(DISP_OPEN_OPEN, itp))
448 ll_release_openhandle(file->f_dentry, itp);
452 if (it_disposition(itp, DISP_LOOKUP_NEG))
453 GOTO(out, rc = -ENOENT);
455 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
456 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
457 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Apply reply attributes to the inode and record the granted lock. */
461 rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
462 if (!rc && itp->d.lustre.it_lock_mode)
463 ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
467 ptlrpc_req_finished(req);
468 ll_intent_drop_lock(itp);
/*
 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
 * not believe attributes if a few ioepoch holders exist. Attributes for
 * previous ioepoch if new one is opened are also skipped by MDS.
 * A zero @ioepoch or an unchanged epoch is a no-op.
 */
478 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
480 if (ioepoch && lli->lli_ioepoch != ioepoch) {
481 lli->lli_ioepoch = ioepoch;
482 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
483 ioepoch, PFID(&lli->lli_fid));
/*
 * Populate an obd_client_handle from the MDS open reply carried in the
 * intent: server file handle, FID, lease lock handle, and open flags.
 * Registers the open for replay via md_set_open_replay_data() and
 * returns its result.
 */
487 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
488 struct obd_client_handle *och)
490 struct ptlrpc_request *req = it->d.lustre.it_data;
491 struct mdt_body *body;
493 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
494 och->och_fh = body->handle;
495 och->och_fid = body->fid1;
496 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
497 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
498 och->och_flags = it->it_flags;
500 return md_set_open_replay_data(md_exp, och, it);
/*
 * Complete the client-local part of an open: fill @och from the intent
 * (when provided), apply the reply's ioepoch to the inode, attach @fd as
 * the file's private data, and initialize readahead state.  fd_omode
 * records the effective open mode for the later close path.
 */
503 int ll_local_open(struct file *file, struct lookup_intent *it,
504 struct ll_file_data *fd, struct obd_client_handle *och)
506 struct inode *inode = file->f_dentry->d_inode;
507 struct ll_inode_info *lli = ll_i2info(inode);
510 LASSERT(!LUSTRE_FPRIVATE(file));
/* NOTE(review): the guard selecting this branch (presumably `if (och)`)
 * is missing from this copy. */
515 struct ptlrpc_request *req = it->d.lustre.it_data;
516 struct mdt_body *body;
519 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
523 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
524 ll_ioepoch_open(lli, body->ioepoch);
527 LUSTRE_FPRIVATE(file) = fd;
528 ll_readahead_init(inode, &fd->fd_ras);
529 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
/* Open a file, and (for the very first open) create objects on the OSTs at
 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
 * creation or open until ll_lov_setstripe() ioctl is called.
 *
 * If we already have the stripe MD locally then we don't request it in
 * md_open(), by passing a lmm_size = 0.
 *
 * It is up to the application to ensure no other processes open this file
 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 * used. We might be able to avoid races of that sort by getting lli_open_sem
 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 *
 * VFS ->open() entry point.  Reuses a cached per-mode MDS open handle
 * when one exists; otherwise issues an intent open (outside lli_och_mutex
 * to avoid deadlock with the blocking AST) and caches the new handle.
 */
547 int ll_file_open(struct inode *inode, struct file *file)
549 struct ll_inode_info *lli = ll_i2info(inode);
550 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
551 .it_flags = file->f_flags };
552 struct obd_client_handle **och_p = NULL;
553 __u64 *och_usecount = NULL;
554 struct ll_file_data *fd;
555 int rc = 0, opendir_set = 0;
558 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
559 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* An intent may have been stashed by the lookup path. */
561 it = file->private_data; /* XXX: compat macro */
562 file->private_data = NULL; /* prevent ll_local_open assertion */
564 fd = ll_file_data_get();
566 GOTO(out_openerr, rc = -ENOMEM);
/* First opener of a directory claims statahead ownership. */
569 if (S_ISDIR(inode->i_mode)) {
570 spin_lock(&lli->lli_sa_lock);
571 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
572 lli->lli_opendir_pid == 0) {
573 lli->lli_opendir_key = fd;
574 lli->lli_opendir_pid = current_pid();
577 spin_unlock(&lli->lli_sa_lock);
/* Root is opened without talking to the MDS. */
580 if (inode->i_sb->s_root == file->f_dentry) {
581 LUSTRE_FPRIVATE(file) = fd;
585 if (!it || !it->d.lustre.it_disposition) {
586 /* Convert f_flags into access mode. We cannot use file->f_mode,
587 * because everything but O_ACCMODE mask was stripped from
589 if ((oit.it_flags + 1) & O_ACCMODE)
591 if (file->f_flags & O_TRUNC)
592 oit.it_flags |= FMODE_WRITE;
594 /* kernel only call f_op->open in dentry_open. filp_open calls
595 * dentry_open after call to open_namei that checks permissions.
596 * Only nfsd_open call dentry_open directly without checking
597 * permissions and because of that this code below is safe. */
598 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
599 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
601 /* We do not want O_EXCL here, presumably we opened the file
602 * already? XXX - NFS implications? */
603 oit.it_flags &= ~O_EXCL;
605 /* bug20584, if "it_flags" contains O_CREAT, the file will be
606 * created if necessary, then "IT_CREAT" should be set to keep
607 * consistent with it */
608 if (oit.it_flags & O_CREAT)
609 oit.it_op |= IT_CREAT;
615 /* Let's see if we have file open on MDS already. */
616 if (it->it_flags & FMODE_WRITE) {
617 och_p = &lli->lli_mds_write_och;
618 och_usecount = &lli->lli_open_fd_write_count;
619 } else if (it->it_flags & FMODE_EXEC) {
620 och_p = &lli->lli_mds_exec_och;
621 och_usecount = &lli->lli_open_fd_exec_count;
623 och_p = &lli->lli_mds_read_och;
624 och_usecount = &lli->lli_open_fd_read_count;
627 mutex_lock(&lli->lli_och_mutex);
628 if (*och_p) { /* Open handle is present */
629 if (it_disposition(it, DISP_OPEN_OPEN)) {
630 /* Well, there's extra open request that we do not need,
631 let's close it somehow. This will decref request. */
632 rc = it_open_error(DISP_OPEN_OPEN, it);
634 mutex_unlock(&lli->lli_och_mutex);
635 GOTO(out_openerr, rc);
638 ll_release_openhandle(file->f_dentry, it);
/* Reuse the cached handle; och == NULL tells ll_local_open so. */
642 rc = ll_local_open(file, it, fd, NULL);
645 mutex_unlock(&lli->lli_och_mutex);
646 GOTO(out_openerr, rc);
649 LASSERT(*och_usecount == 0);
650 if (!it->d.lustre.it_disposition) {
651 /* We cannot just request lock handle now, new ELC code
652 means that one of other OPEN locks for this file
653 could be cancelled, and since blocking ast handler
654 would attempt to grab och_mutex as well, that would
655 result in a deadlock */
656 mutex_unlock(&lli->lli_och_mutex);
657 it->it_create_mode |= M_CHECK_STALE;
658 rc = ll_intent_file_open(file, NULL, 0, it);
659 it->it_create_mode &= ~M_CHECK_STALE;
661 GOTO(out_openerr, rc);
/* NOTE(review): the retry jump back to the mutex-held section (goto
 * restart) present in the upstream code is missing from this copy. */
665 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
667 GOTO(out_och_free, rc = -ENOMEM);
671 /* md_intent_lock() didn't get a request ref if there was an
672 * open error, so don't do cleanup on the request here
674 /* XXX (green): Should not we bail out on any error here, not
675 * just open error? */
676 rc = it_open_error(DISP_OPEN_OPEN, it);
678 GOTO(out_och_free, rc);
680 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
681 "inode %p: disposition %x, status %d\n", inode,
682 it_disposition(it, ~0), it->d.lustre.it_status);
684 rc = ll_local_open(file, it, fd, *och_p);
686 GOTO(out_och_free, rc);
688 mutex_unlock(&lli->lli_och_mutex);
691 /* Must do this outside lli_och_mutex lock to prevent deadlock where
692 different kind of OPEN lock for this same inode gets cancelled
693 by ldlm_cancel_lru */
694 if (!S_ISREG(inode->i_mode))
695 GOTO(out_och_free, rc);
/* Delay OST object creation for O_LOV_DELAY_CREATE / read-only opens
 * when no striping exists yet. */
699 if (!lli->lli_has_smd &&
700 (cl_is_lov_delay_create(file->f_flags) ||
701 (file->f_mode & FMODE_WRITE) == 0)) {
702 CDEBUG(D_INODE, "object creation was delayed\n");
703 GOTO(out_och_free, rc);
705 cl_lov_delay_create_clear(&file->f_flags);
706 GOTO(out_och_free, rc);
/* Error/common exit: release the freshly allocated handle slot. */
710 if (och_p && *och_p) {
711 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
712 *och_p = NULL; /* OBD_FREE writes some magic there */
715 mutex_unlock(&lli->lli_och_mutex);
718 if (opendir_set != 0)
719 ll_stop_statahead(inode, lli->lli_opendir_key);
721 ll_file_data_put(fd);
723 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* Drop the intent's request reference if still held. */
726 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
727 ptlrpc_req_finished(it->d.lustre.it_data);
728 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for lease locks: on a blocking callback, cancel the lease
 * lock asynchronously (the application learns of the broken lease later
 * via ll_lease_close()).  The CANCELING case has no extra work visible
 * in this copy.
 */
734 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
735 struct ldlm_lock_desc *desc, void *data, int flag)
738 struct lustre_handle lockh;
742 case LDLM_CB_BLOCKING:
743 ldlm_lock2handle(lock, &lockh);
744 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
746 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
750 case LDLM_CB_CANCELING:
/*
 * Acquire a lease and open the file.
 *
 * @fmode must be exactly FMODE_READ or FMODE_WRITE.  When @file is given
 * (lease on an already-open descriptor), the existing open handle is
 * located — and, when this descriptor is the sole opener, presumably
 * transferred to fd_och — so the MDT can match the lease to the same
 * owner via op_handle.  Returns the new obd_client_handle or ERR_PTR().
 */
760 struct obd_client_handle *ll_lease_open(struct inode *inode, struct file *file,
761 fmode_t fmode, __u64 open_flags)
763 struct lookup_intent it = { .it_op = IT_OPEN };
764 struct ll_sb_info *sbi = ll_i2sbi(inode);
765 struct md_op_data *op_data;
766 struct ptlrpc_request *req;
767 struct lustre_handle old_handle = { 0 };
768 struct obd_client_handle *och = NULL;
773 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
774 RETURN(ERR_PTR(-EINVAL));
/* NOTE(review): guard `if (file != NULL)` is missing in this copy. */
777 struct ll_inode_info *lli = ll_i2info(inode);
778 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
779 struct obd_client_handle **och_p;
/* Requested mode must be covered by the descriptor's open mode. */
782 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
783 RETURN(ERR_PTR(-EPERM));
785 /* Get the openhandle of the file */
787 mutex_lock(&lli->lli_och_mutex);
788 if (fd->fd_lease_och != NULL) {
789 mutex_unlock(&lli->lli_och_mutex);
793 if (fd->fd_och == NULL) {
794 if (file->f_mode & FMODE_WRITE) {
795 LASSERT(lli->lli_mds_write_och != NULL);
796 och_p = &lli->lli_mds_write_och;
797 och_usecount = &lli->lli_open_fd_write_count;
799 LASSERT(lli->lli_mds_read_och != NULL);
800 och_p = &lli->lli_mds_read_och;
801 och_usecount = &lli->lli_open_fd_read_count;
/* Only a single opener may convert the shared handle into a
 * private one (lines doing the transfer are missing here). */
803 if (*och_usecount == 1) {
810 mutex_unlock(&lli->lli_och_mutex);
811 if (rc < 0) /* more than 1 opener */
814 LASSERT(fd->fd_och != NULL);
815 old_handle = fd->fd_och->och_fh;
/* NOTE(review): OBD_ALLOC_PTR(och) presumably precedes this check. */
820 RETURN(ERR_PTR(-ENOMEM));
822 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
823 LUSTRE_OPC_ANY, NULL);
825 GOTO(out, rc = PTR_ERR(op_data));
827 /* To tell the MDT this openhandle is from the same owner */
828 op_data->op_handle = old_handle;
830 it.it_flags = fmode | open_flags;
831 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
832 rc = md_intent_lock(sbi->ll_md_exp, op_data, NULL, 0, &it, 0, &req,
833 ll_md_blocking_lease_ast,
834 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
835 * it can be cancelled which may mislead applications that the lease is
837 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
838 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
839 * doesn't deal with openhandle, so normal openhandle will be leaked. */
840 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
841 ll_finish_md_op_data(op_data);
842 ptlrpc_req_finished(req);
844 GOTO(out_release_it, rc);
846 if (it_disposition(&it, DISP_LOOKUP_NEG))
847 GOTO(out_release_it, rc = -ENOENT);
849 rc = it_open_error(DISP_OPEN_OPEN, &it);
851 GOTO(out_release_it, rc);
853 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
854 ll_och_fill(sbi->ll_md_exp, &it, och);
/* A server without lease support won't set DISP_OPEN_LEASE. */
856 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
857 GOTO(out_close, rc = -EOPNOTSUPP);
859 /* already get lease, handle lease lock */
860 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
861 if (it.d.lustre.it_lock_mode == 0 ||
862 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
863 /* open lock must return for lease */
864 CERROR(DFID "lease granted but no open lock, %d/%Lu.\n",
865 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
866 it.d.lustre.it_lock_bits);
867 GOTO(out_close, rc = -EPROTO);
870 ll_intent_release(&it);
/* Error path: drop the open lock (if any) and close the handle. */
874 /* Cancel open lock */
875 if (it.d.lustre.it_lock_mode != 0) {
876 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
877 it.d.lustre.it_lock_mode);
878 it.d.lustre.it_lock_mode = 0;
879 och->och_lease_handle.cookie = 0ULL;
881 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
883 CERROR("%s: error closing file "DFID": %d\n",
884 ll_get_fsname(inode->i_sb, NULL, 0),
885 PFID(&ll_i2info(inode)->lli_fid), rc2);
886 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
888 ll_intent_release(&it);
894 EXPORT_SYMBOL(ll_lease_open);
/*
 * Release lease and close the file.
 * It will check if the lease has ever broken.
 *
 * If the lease lock still exists and was not already cancelled, the
 * client cancels it here; *@lease_broken (if non-NULL) reports whether
 * the lease had been broken before the close.  Returns the result of
 * the final ll_close_inode_openhandle().
 */
900 int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
903 struct ldlm_lock *lock;
904 bool cancelled = true;
908 lock = ldlm_handle2lock(&och->och_lease_handle);
/* NOTE(review): the `if (lock != NULL)` guard and the lock put are
 * missing from this copy. */
910 lock_res_and_lock(lock);
911 cancelled = ldlm_is_cancel(lock);
912 unlock_res_and_lock(lock);
916 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
917 PFID(&ll_i2info(inode)->lli_fid), cancelled);
920 ldlm_cli_cancel(&och->och_lease_handle, 0);
921 if (lease_broken != NULL)
922 *lease_broken = cancelled;
924 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
928 EXPORT_SYMBOL(ll_lease_close);
/* Fills the obdo with the attributes for the lsm.
 *
 * Performs an async OST getattr over all stripes (obd_getattr_async via a
 * ptlrpc set) and waits for completion.  @dv_flags may request server-side
 * read/write flush (LL_DV_RD_FLUSH / LL_DV_WR_FLUSH) for data-version
 * stability; on success, o_valid is masked down to the fields the caller
 * may trust. */
931 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
932 struct obd_capa *capa, struct obdo *obdo,
933 __u64 ioepoch, int dv_flags)
935 struct ptlrpc_request_set *set;
936 struct obd_info oinfo = { { { 0 } } };
941 LASSERT(lsm != NULL);
945 oinfo.oi_oa->o_oi = lsm->lsm_oi;
946 oinfo.oi_oa->o_mode = S_IFREG;
947 oinfo.oi_oa->o_ioepoch = ioepoch;
948 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
949 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
950 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
951 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
952 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
953 OBD_MD_FLDATAVERSION;
954 oinfo.oi_capa = capa;
/* Flush requests take a server-side lock on the OSTs. */
955 if (dv_flags & (LL_DV_WR_FLUSH | LL_DV_RD_FLUSH)) {
956 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
957 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
958 if (dv_flags & LL_DV_WR_FLUSH)
959 oinfo.oi_oa->o_flags |= OBD_FL_FLUSH;
962 set = ptlrpc_prep_set();
964 CERROR("can't allocate ptlrpc set\n");
967 rc = obd_getattr_async(exp, &oinfo, set);
969 rc = ptlrpc_set_wait(set);
970 ptlrpc_set_destroy(set);
/* Only these fields are meaningful to callers after the merge. */
973 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
974 OBD_MD_FLATIME | OBD_MD_FLMTIME |
975 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
976 OBD_MD_FLDATAVERSION | OBD_MD_FLFLAGS);
/* A requested write flush that the server did not confirm is an error
 * (the error value assigned here is missing from this copy). */
977 if (dv_flags & LL_DV_WR_FLUSH &&
978 !(oinfo.oi_oa->o_valid & OBD_MD_FLFLAGS &&
979 oinfo.oi_oa->o_flags & OBD_FL_FLUSH))
/*
 * Performs the getattr on the inode and updates its fields.
 * If @sync != 0, perform the getattr under the server-side lock.
 *
 * Fetches OST attributes via ll_lsm_getattr() and refreshes the inode
 * (obdo_refresh_inode); the MDS capability and lsm references taken here
 * are released before returning.
 */
989 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
990 __u64 ioepoch, int sync)
992 struct obd_capa *capa = ll_mdscapa_get(inode);
993 struct lov_stripe_md *lsm;
997 lsm = ccc_inode_lsm_get(inode);
998 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
999 capa, obdo, ioepoch, sync ? LL_DV_RD_FLUSH : 0);
/* NOTE(review): capa_put() and the rc==0 guard are among the lines
 * missing from this copy. */
1002 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
1004 obdo_refresh_inode(inode, obdo, obdo->o_valid);
1005 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
1006 " blksize %lu\n", POSTID(oi), i_size_read(inode),
1007 (unsigned long long)inode->i_blocks,
1008 (unsigned long)ll_inode_blksize(inode));
1010 ccc_inode_lsm_put(inode, lsm);
/*
 * Merge MDS-provided timestamps (cached in lli_lvb) with the attributes
 * obtained from the cl_object (OST side), keeping the most recent of
 * each, then apply the merged size/blocks/times to the inode under the
 * inode size lock.
 */
1014 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
1016 struct ll_inode_info *lli = ll_i2info(inode);
1017 struct cl_object *obj = lli->lli_clob;
1018 struct cl_attr *attr = ccc_env_thread_attr(env);
1024 ll_inode_size_lock(inode);
1025 /* merge timestamps the most recently obtained from mds with
1026 timestamps obtained from osts */
1027 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
1028 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
1029 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
1030 inode_init_lvb(inode, &lvb);
1032 cl_object_attr_lock(obj);
1033 rc = cl_object_attr_get(env, obj, attr);
1034 cl_object_attr_unlock(obj);
/* Keep the newer of MDS vs OST timestamps for each field. */
1037 if (lvb.lvb_atime < attr->cat_atime)
1038 lvb.lvb_atime = attr->cat_atime;
1039 if (lvb.lvb_ctime < attr->cat_ctime)
1040 lvb.lvb_ctime = attr->cat_ctime;
1041 if (lvb.lvb_mtime < attr->cat_mtime)
1042 lvb.lvb_mtime = attr->cat_mtime;
1044 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
1045 PFID(&lli->lli_fid), attr->cat_size);
1046 cl_isize_write_nolock(inode, attr->cat_size);
1048 inode->i_blocks = attr->cat_blocks;
1050 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1051 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1052 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1054 ll_inode_size_unlock(inode);
/*
 * Glimpse-size ioctl helper: fetch current OST attributes for @lsm and
 * copy size/blocks/times into the caller-provided stat structure (st).
 */
1059 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1062 struct obdo obdo = { 0 };
1065 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1067 st->st_size = obdo.o_size;
1068 st->st_blocks = obdo.o_blocks;
1069 st->st_mtime = obdo.o_mtime;
1070 st->st_atime = obdo.o_atime;
1071 st->st_ctime = obdo.o_ctime;
/*
 * Decide whether reads through @file should skip atime updates,
 * mirroring the kernel's file_accessed()/touch_atime() checks: the
 * O_NOATIME open flag, per-inode/superblock noatime flags, and mount
 * options (noatime, read-only, nodiratime for directories).
 */
1076 static bool file_is_noatime(const struct file *file)
1078 const struct vfsmount *mnt = file->f_path.mnt;
1079 const struct inode *inode = file->f_path.dentry->d_inode;
1081 /* Adapted from file_accessed() and touch_atime().*/
1082 if (file->f_flags & O_NOATIME)
1085 if (inode->i_flags & S_NOATIME)
1088 if (IS_NOATIME(inode))
1091 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1094 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1097 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/*
 * Initialize a cl_io for a read (@write == 0) or write (@write != 0) on
 * @file: nonblocking/append/sync flags from f_flags, lock requirements
 * (never for nolock files, mandatory for O_APPEND, otherwise "maybe"),
 * and the noatime decision from file_is_noatime().
 */
1103 void ll_io_init(struct cl_io *io, const struct file *file, int write)
1105 struct inode *inode = file->f_dentry->d_inode;
1107 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1109 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1110 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1111 file->f_flags & O_DIRECT ||
1114 io->ci_obj = ll_i2info(inode)->lli_clob;
1115 io->ci_lockreq = CILR_MAYBE;
1116 if (ll_file_nolock(file)) {
1117 io->ci_lockreq = CILR_NEVER;
1118 io->ci_no_srvlock = 1;
1119 } else if (file->f_flags & O_APPEND) {
1120 io->ci_lockreq = CILR_MANDATORY;
1123 io->ci_noatime = file_is_noatime(file);
/*
 * Common read/write engine: set up a cl_io for @iot (CIT_READ/CIT_WRITE)
 * starting at *@ppos for @count bytes, run the cl_io loop, and return
 * bytes transferred (updating *@ppos) or a negative error.
 *
 * For normal (iovec) I/O, writes outside a group lock serialize on
 * lli_write_mutex, and all normal I/O holds lli_trunc_sem for reading to
 * exclude concurrent truncate.  On a zero-byte restartable result the
 * I/O is retried (restart loop lines are missing from this copy).
 * Read/write byte counts are tallied into lprocfs stats, and
 * fd_write_failed tracks write outcome for later fsync error reporting.
 */
1127 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1128 struct file *file, enum cl_io_type iot,
1129 loff_t *ppos, size_t count)
1131 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
1132 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1137 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zd\n",
1138 file->f_dentry->d_name.name, iot, *ppos, count);
1141 io = ccc_env_thread_io(env);
1142 ll_io_init(io, file, iot == CIT_WRITE);
1144 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1145 struct vvp_io *vio = vvp_env_io(env);
1146 struct ccc_io *cio = ccc_env_io(env);
1147 int write_mutex_locked = 0;
1149 cio->cui_fd = LUSTRE_FPRIVATE(file);
1150 vio->cui_io_subtype = args->via_io_subtype;
1152 switch (vio->cui_io_subtype) {
/* Normal iovec-based I/O path (case label line missing here). */
1154 cio->cui_iov = args->u.normal.via_iov;
1155 cio->cui_nrsegs = args->u.normal.via_nrsegs;
1156 cio->cui_tot_nrsegs = cio->cui_nrsegs;
1157 cio->cui_iocb = args->u.normal.via_iocb;
1158 if ((iot == CIT_WRITE) &&
1159 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1160 if (mutex_lock_interruptible(&lli->
1162 GOTO(out, result = -ERESTARTSYS);
1163 write_mutex_locked = 1;
1165 down_read(&lli->lli_trunc_sem);
/* Splice I/O path (case label line missing here). */
1168 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1169 vio->u.splice.cui_flags = args->u.splice.via_flags;
1172 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
1175 result = cl_io_loop(env, io);
1176 if (args->via_io_subtype == IO_NORMAL)
1177 up_read(&lli->lli_trunc_sem);
1178 if (write_mutex_locked)
1179 mutex_unlock(&lli->lli_write_mutex);
1181 /* cl_io_rw_init() handled IO */
1182 result = io->ci_result;
1185 if (io->ci_nob > 0) {
1186 result = io->ci_nob;
1187 *ppos = io->u.ci_wr.wr.crw_pos;
1191 cl_io_fini(env, io);
1192 /* If any bit been read/written (result != 0), we just return
1193 * short read/write instead of restart io. */
1194 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1195 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
1196 iot == CIT_READ ? "read" : "write",
1197 file->f_dentry->d_name.name, *ppos, count);
1198 LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
1202 if (iot == CIT_READ) {
1204 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1205 LPROC_LL_READ_BYTES, result);
1206 } else if (iot == CIT_WRITE) {
1208 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1209 LPROC_LL_WRITE_BYTES, result);
1210 fd->fd_write_failed = false;
1211 } else if (result != -ERESTARTSYS) {
1212 fd->fd_write_failed = true;
1215 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
/*
 * Validate a userspace iovec array and compute the total transfer size.
 * Rejects negative or wrapping segment lengths; trims *nr_segs at the
 * first segment that fails access_ok() so a short transfer is allowed.
 */
1222 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1224 static int ll_file_get_iov_count(const struct iovec *iov,
1225 unsigned long *nr_segs, size_t *count)
/* Walk each segment, accumulating the running byte count. */
1230 for (seg = 0; seg < *nr_segs; seg++) {
1231 const struct iovec *iv = &iov[seg];
1234 * If any segment has a negative length, or the cumulative
1235 * length ever wraps negative then return -EINVAL.
/* OR-ing count and length and testing the sign catches both cases */
1238 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
/* Accessible segment: keep scanning (inaccessible ones end the walk). */
1240 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1245 cnt -= iv->iov_len; /* This segment is no good */
/*
 * AIO read entry point: validate the user iovec, fill vvp_io_args in a
 * cl_env and delegate to ll_file_io_generic() as a CIT_READ operation.
 */
1252 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1253 unsigned long nr_segs, loff_t pos)
1256 struct vvp_io_args *args;
/* Reject malformed iovecs and compute total byte count up front. */
1262 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1266 env = cl_env_get(&refcheck);
1268 RETURN(PTR_ERR(env));
1270 args = vvp_env_args(env, IO_NORMAL);
/* cast away const: the generic IO path stores a mutable iov pointer */
1271 args->u.normal.via_iov = (struct iovec *)iov;
1272 args->u.normal.via_nrsegs = nr_segs;
1273 args->u.normal.via_iocb = iocb;
/* ki_pos (not the 'pos' argument) is the authoritative file offset */
1275 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1276 &iocb->ki_pos, count);
1277 cl_env_put(env, &refcheck);
/*
 * Synchronous read(): build a single-segment iovec and a sync kiocb in
 * per-env scratch space, then reuse the AIO read path.
 */
1281 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1285 struct iovec *local_iov;
1286 struct kiocb *kiocb;
1291 env = cl_env_get(&refcheck);
1293 RETURN(PTR_ERR(env));
/* scratch iovec/kiocb live in the cl_env, avoiding stack allocation */
1295 local_iov = &vvp_env_info(env)->vti_local_iov;
1296 kiocb = &vvp_env_info(env)->vti_kiocb;
1297 local_iov->iov_base = (void __user *)buf;
1298 local_iov->iov_len = count;
1299 init_sync_kiocb(kiocb, file);
1300 kiocb->ki_pos = *ppos;
/* field name differs across kernel versions: ki_left vs ki_nbytes */
1301 #ifdef HAVE_KIOCB_KI_LEFT
1302 kiocb->ki_left = count;
1304 kiocb->ki_nbytes = count;
1307 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
/* propagate the updated file position back to the caller */
1308 *ppos = kiocb->ki_pos;
1310 cl_env_put(env, &refcheck);
/*
 * AIO write entry point: mirrors ll_file_aio_read() but issues CIT_WRITE
 * through ll_file_io_generic().
 */
1315 * Write to a file (through the page cache).
1318 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1319 unsigned long nr_segs, loff_t pos)
1322 struct vvp_io_args *args;
/* Validate segments and compute the total byte count. */
1328 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1332 env = cl_env_get(&refcheck);
1334 RETURN(PTR_ERR(env));
1336 args = vvp_env_args(env, IO_NORMAL);
/* cast away const: generic IO path stores a mutable iov pointer */
1337 args->u.normal.via_iov = (struct iovec *)iov;
1338 args->u.normal.via_nrsegs = nr_segs;
1339 args->u.normal.via_iocb = iocb;
1341 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1342 &iocb->ki_pos, count);
1343 cl_env_put(env, &refcheck);
/*
 * Synchronous write(): build a single-segment iovec and a sync kiocb in
 * per-env scratch space, then reuse the AIO write path.
 */
1347 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1351 struct iovec *local_iov;
1352 struct kiocb *kiocb;
1357 env = cl_env_get(&refcheck);
1359 RETURN(PTR_ERR(env));
/* scratch iovec/kiocb live in the cl_env, avoiding stack allocation */
1361 local_iov = &vvp_env_info(env)->vti_local_iov;
1362 kiocb = &vvp_env_info(env)->vti_kiocb;
1363 local_iov->iov_base = (void __user *)buf;
1364 local_iov->iov_len = count;
1365 init_sync_kiocb(kiocb, file);
1366 kiocb->ki_pos = *ppos;
/* field name differs across kernel versions: ki_left vs ki_nbytes */
1367 #ifdef HAVE_KIOCB_KI_LEFT
1368 kiocb->ki_left = count;
1370 kiocb->ki_nbytes = count;
1373 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
/* propagate the updated file position back to the caller */
1374 *ppos = kiocb->ki_pos;
1376 cl_env_put(env, &refcheck);
/*
 * splice_read(): route pagecache data into a pipe by running the generic
 * IO path with an IO_SPLICE args subtype instead of a user iovec.
 */
1381 * Send file content (through pagecache) somewhere with helper
1383 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1384 struct pipe_inode_info *pipe, size_t count,
1388 struct vvp_io_args *args;
1393 env = cl_env_get(&refcheck);
1395 RETURN(PTR_ERR(env));
1397 args = vvp_env_args(env, IO_SPLICE);
1398 args->u.splice.via_pipe = pipe;
1399 args->u.splice.via_flags = flags;
/* splice is a read as far as the cl_io machinery is concerned */
1401 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1402 cl_env_put(env, &refcheck);
/*
 * Ask the data export to recreate a lost OST object for this inode.
 * Clones the inode's striping (lsm) into a scratch copy, marks the obdo
 * with OBD_FL_RECREATE_OBJS and issues obd_create() under the inode
 * size lock.  NOTE(review): o_nlink carries the target OST index here —
 * an overloaded field, not a link count.
 */
1406 static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
1409 struct obd_export *exp = ll_i2dtexp(inode);
1410 struct obd_trans_info oti = { 0 };
1411 struct obdo *oa = NULL;
1414 struct lov_stripe_md *lsm = NULL, *lsm2;
1421 lsm = ccc_inode_lsm_get(inode);
/* nothing to recreate if the file has no striping objects */
1422 if (!lsm_has_objects(lsm))
1423 GOTO(out, rc = -ENOENT);
/* lsm is a header plus one lov_oinfo per stripe */
1425 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1426 (lsm->lsm_stripe_count));
1428 OBD_ALLOC_LARGE(lsm2, lsm_size);
1430 GOTO(out, rc = -ENOMEM);
1433 oa->o_nlink = ost_idx;
1434 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1435 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1436 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1437 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1438 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
/* work on a copy so the live lsm is untouched if create fails */
1439 memcpy(lsm2, lsm, lsm_size);
1440 ll_inode_size_lock(inode);
1441 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1442 ll_inode_size_unlock(inode);
1444 OBD_FREE_LARGE(lsm2, lsm_size);
1447 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_RECREATE_OBJ handler: copy the object id / OST index request
 * from userspace and recreate that object.  Admin-only (CAP_SYS_ADMIN).
 */
1452 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1454 struct ll_recreate_obj ucreat;
1458 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1461 if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
/* build an MDT0-sequence ost_id from the user-supplied object id */
1465 ostid_set_seq_mdt0(&oi);
1466 ostid_set_id(&oi, ucreat.lrc_id);
1467 RETURN(ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx));
/*
 * LL_IOC_RECREATE_FID handler: copy a FID from userspace, convert it to
 * an ost_id and recreate the object.  Admin-only (CAP_SYS_ADMIN).
 */
1470 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1477 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1480 if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1483 fid_to_ostid(&fid, &oi);
/* the target OST index is encoded in bits 16..31 of the FID sequence */
1484 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1485 RETURN(ll_lov_recreate(inode, &oi, ost_idx));
/*
 * Apply a user-supplied striping EA (lov_user_md) to a file by opening
 * it with an IT_OPEN intent carrying the layout.  Fails with -EEXIST if
 * the file already has a layout; the open handle created as a side
 * effect is released immediately.
 */
1488 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1489 __u64 flags, struct lov_user_md *lum,
1492 struct lov_stripe_md *lsm = NULL;
1493 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
/* a layout can only be set once; existing striping is an error */
1497 lsm = ccc_inode_lsm_get(inode);
1499 ccc_inode_lsm_put(inode, lsm);
1500 CDEBUG(D_IOCTL, "stripe already exists for inode "DFID"\n",
1501 PFID(ll_inode2fid(inode)));
1502 GOTO(out, rc = -EEXIST);
1505 ll_inode_size_lock(inode);
1506 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1508 GOTO(out_unlock, rc);
1509 rc = oit.d.lustre.it_status;
1511 GOTO(out_req_free, rc);
/* we only wanted the layout set; drop the open handle right away */
1513 ll_release_openhandle(file->f_dentry, &oit);
1516 ll_inode_size_unlock(inode);
1517 ll_intent_release(&oit);
1518 ccc_inode_lsm_put(inode, lsm);
1520 cl_lov_delay_create_clear(&file->f_flags);
1523 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA (striping information) for @filename from the MDS.
 * On success *lmmp points into the reply buffer (the caller must keep
 * *request alive and finish it when done) and *lmm_size is set.
 * The EA is converted from wire (little-endian) to host byte order.
 */
1527 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1528 struct lov_mds_md **lmmp, int *lmm_size,
1529 struct ptlrpc_request **request)
1531 struct ll_sb_info *sbi = ll_i2sbi(inode);
1532 struct mdt_body *body;
1533 struct lov_mds_md *lmm = NULL;
1534 struct ptlrpc_request *req = NULL;
1535 struct md_op_data *op_data;
/* size the getattr reply buffer for the largest possible MD EA */
1538 rc = ll_get_default_mdsize(sbi, &lmmsize);
1542 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1543 strlen(filename), lmmsize,
1544 LUSTRE_OPC_ANY, NULL);
1545 if (IS_ERR(op_data))
1546 RETURN(PTR_ERR(op_data));
1548 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1549 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1550 ll_finish_md_op_data(op_data);
1552 CDEBUG(D_INFO, "md_getattr_name failed "
1553 "on %s: rc %d\n", filename, rc);
1557 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1558 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1560 lmmsize = body->eadatasize;
/* no striping EA present on this file/directory */
1562 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1564 GOTO(out, rc = -ENODATA);
1567 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1568 LASSERT(lmm != NULL);
/* only plain V1/V3 LOV EAs are understood here */
1570 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1571 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1572 GOTO(out, rc = -EPROTO);
1576 * This is coming from the MDS, so is probably in
1577 * little endian. We convert it to host endian before
1578 * passing it to userspace.
/* on big-endian hosts the wire format needs byte-swapping */
1580 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1583 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1584 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1587 /* if function called for directory - we should
1588 * avoid swabbing non-existent lsm objects */
1589 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1590 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
/* directories carry a default layout with no per-object entries */
1591 if (S_ISREG(body->mode))
1592 lustre_swab_lov_user_md_objects(
1593 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1595 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1596 lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1597 if (S_ISREG(body->mode))
1598 lustre_swab_lov_user_md_objects(
1599 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1606 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: copy a lov_user_md (with one OST data
 * entry) from userspace and apply it via ll_lov_setstripe_ea_info()
 * with MDS_OPEN_HAS_OBJS.  Admin-only (CAP_SYS_ADMIN).
 */
1611 static int ll_lov_setea(struct inode *inode, struct file *file,
1614 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1615 struct lov_user_md *lump;
/* buffer holds the header plus a single lov_user_ost_data entry */
1616 int lum_size = sizeof(struct lov_user_md) +
1617 sizeof(struct lov_user_ost_data);
1621 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1624 OBD_ALLOC_LARGE(lump, lum_size);
1628 if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1629 OBD_FREE_LARGE(lump, lum_size);
1633 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1635 OBD_FREE_LARGE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE handler: read a v1 lov_user_md from userspace,
 * re-read as v3 if the magic says so, apply the layout, then refresh
 * the layout generation and echo the resulting striping back to the
 * caller via LL_IOC_LOV_GETSTRIPE.
 */
1639 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1642 struct lov_user_md_v3 lumv3;
/* lumv3 is large enough to alias as v1; lumv1 points into it */
1643 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1644 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1645 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1647 __u64 flags = FMODE_WRITE;
1650 /* first try with v1 which is smaller than v3 */
1651 lum_size = sizeof(struct lov_user_md_v1);
1652 if (copy_from_user(lumv1, lumv1p, lum_size))
/* magic says v3: re-copy the full v3 structure */
1655 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1656 lum_size = sizeof(struct lov_user_md_v3);
1657 if (copy_from_user(&lumv3, lumv3p, lum_size))
1661 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1663 struct lov_stripe_md *lsm;
/* return value of put_user deliberately ignored (best effort) */
1666 put_user(0, &lumv1p->lmm_stripe_count);
1668 ll_layout_refresh(inode, &gen);
1669 lsm = ccc_inode_lsm_get(inode);
1670 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1671 0, lsm, (void *)arg);
1672 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_LOV_GETSTRIPE handler: hand the inode's lsm to the data export
 * ioctl, which copies the striping description out to userspace.
 */
1677 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1679 struct lov_stripe_md *lsm;
1683 lsm = ccc_inode_lsm_get(inode);
1685 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1687 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_GROUP_LOCK handler: take a cluster-wide group lock with gid
 * @arg on this file.  The fd's flags/grouplock fields are guarded by
 * lli->lli_lock; the lock is dropped around the (blocking) grouplock
 * enqueue, so a lost race with another thread is detected afterwards.
 */
1691 int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1693 struct ll_inode_info *lli = ll_i2info(inode);
1694 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1695 struct ccc_grouplock grouplock;
/* group locks are meaningless when this mount does no locking */
1699 if (ll_file_nolock(file))
1700 RETURN(-EOPNOTSUPP);
1702 spin_lock(&lli->lli_lock);
1703 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1704 CWARN("group lock already existed with gid %lu\n",
1705 fd->fd_grouplock.cg_gid);
1706 spin_unlock(&lli->lli_lock);
1709 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1710 spin_unlock(&lli->lli_lock);
/* may block unless O_NONBLOCK was requested on the file */
1712 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1713 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1717 spin_lock(&lli->lli_lock);
/* another thread may have installed a group lock while we enqueued */
1718 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1719 spin_unlock(&lli->lli_lock);
1720 CERROR("another thread just won the race\n");
1721 cl_put_grouplock(&grouplock);
1725 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1726 fd->fd_grouplock = grouplock;
1727 spin_unlock(&lli->lli_lock);
1729 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK handler: release the group lock with gid @arg
 * held on this fd.  Fails if no group lock is held or the gid does not
 * match; the actual release happens outside lli->lli_lock.
 */
1733 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1735 struct ll_inode_info *lli = ll_i2info(inode);
1736 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1737 struct ccc_grouplock grouplock;
1740 spin_lock(&lli->lli_lock);
1741 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1742 spin_unlock(&lli->lli_lock);
1743 CWARN("no group lock held\n");
1746 LASSERT(fd->fd_grouplock.cg_lock != NULL);
/* the caller must release the same gid it acquired */
1748 if (fd->fd_grouplock.cg_gid != arg) {
1749 CWARN("group lock %lu doesn't match current id %lu\n",
1750 arg, fd->fd_grouplock.cg_gid);
1751 spin_unlock(&lli->lli_lock);
/* detach the grouplock from the fd before releasing it unlocked */
1755 grouplock = fd->fd_grouplock;
1756 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1757 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1758 spin_unlock(&lli->lli_lock);
1760 cl_put_grouplock(&grouplock);
1761 CDEBUG(D_INFO, "group lock %lu released\n", arg);
/*
 * Close the MDS open handle that an IT_OPEN intent created, when the
 * caller only needed the intent's side effects (e.g. layout creation)
 * and not a long-lived open.  No-op for the filesystem root or when the
 * intent did not actually perform an open.
 */
1766 * Close inode open handle
1768 * \param dentry [in] dentry which contains the inode
1769 * \param it [in,out] intent which contains open info and result
1772 * \retval <0 failure
1774 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1776 struct inode *inode = dentry->d_inode;
1777 struct obd_client_handle *och;
1783 /* Root ? Do nothing. */
1784 if (dentry->d_inode->i_sb->s_root == dentry)
1787 /* No open handle to close? Move away */
1788 if (!it_disposition(it, DISP_OPEN_OPEN))
1791 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1793 OBD_ALLOC(och, sizeof(*och));
1795 GOTO(out, rc = -ENOMEM);
/* transfer the open handle from the intent into och, then close it */
1797 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1799 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1802 /* this one is in place of ll_file_open */
1803 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1804 ptlrpc_req_finished(it->d.lustre.it_data);
1805 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Execute a FIEMAP request for @inode: validate the flags, optionally
 * flush dirty pages (FIEMAP_FLAG_SYNC), then forward the request to the
 * data export via obd_get_info(KEY_FIEMAP).  @num_bytes is the size of
 * the caller-supplied fiemap buffer including extents.
 */
1811 * Get size for inode for which FIEMAP mapping is requested.
1812 * Make the FIEMAP get_info call and returns the result.
1814 int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1817 struct obd_export *exp = ll_i2dtexp(inode);
1818 struct lov_stripe_md *lsm = NULL;
1819 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1820 int vallen = num_bytes;
1824 /* Checks for fiemap flags */
/* report back the unsupported flags, as the fiemap ABI requires */
1825 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1826 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1830 /* Check for FIEMAP_FLAG_SYNC */
1831 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1832 rc = filemap_fdatawrite(inode->i_mapping);
1837 lsm = ccc_inode_lsm_get(inode);
1841 /* If the stripe_count > 1 and the application does not understand
1842 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1844 if (lsm->lsm_stripe_count > 1 &&
1845 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1846 GOTO(out, rc = -EOPNOTSUPP);
1848 fm_key.oa.o_oi = lsm->lsm_oi;
1849 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1851 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1852 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1853 /* If filesize is 0, then there would be no objects for mapping */
1854 if (fm_key.oa.o_size == 0) {
1855 fiemap->fm_mapped_extents = 0;
1859 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1861 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1864 CERROR("obd_get_info failed: rc = %d\n", rc);
1867 ccc_inode_lsm_put(inode, lsm);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a path via the MD export.
 * Copies the user's getinfo_fid2path header in, allocates an output
 * buffer sized by the requested gf_pathlen, and copies the result back.
 * Restricted to CAP_DAC_READ_SEARCH unless user fid2path is enabled.
 */
1871 int ll_fid2path(struct inode *inode, void *arg)
1873 struct obd_export *exp = ll_i2mdexp(inode);
1874 struct getinfo_fid2path *gfout, *gfin;
1878 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1879 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1882 /* Need to get the buflen */
1883 OBD_ALLOC_PTR(gfin);
1886 if (copy_from_user(gfin, arg, sizeof(*gfin))) {
/* output buffer = fixed header + user-requested path length */
1891 outsize = sizeof(*gfout) + gfin->gf_pathlen;
1892 OBD_ALLOC(gfout, outsize);
1893 if (gfout == NULL) {
1897 memcpy(gfout, gfin, sizeof(*gfout));
1900 /* Call mdc_iocontrol */
1901 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1905 if (copy_to_user(arg, gfout, outsize))
1909 OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP handler: size and allocate a kernel fiemap buffer
 * from the user's fm_extent_count, copy the request (and, when extents
 * are requested, the first extent used as a continuation cursor) in,
 * run ll_do_fiemap() and copy the header plus mapped extents back.
 */
1913 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1915 struct ll_user_fiemap *fiemap_s;
1916 size_t num_bytes, ret_bytes;
1917 unsigned int extent_count;
1920 /* Get the extent count so we can calculate the size of
1921 * required fiemap buffer */
1922 if (get_user(extent_count,
1923 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
/* NOTE(review): extent_count is user-controlled; the multiplication
 * below is unchecked here — presumably bounded elsewhere, confirm */
1925 num_bytes = sizeof(*fiemap_s) + (extent_count *
1926 sizeof(struct ll_fiemap_extent));
1928 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1929 if (fiemap_s == NULL)
1932 /* get the fiemap value */
1933 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1935 GOTO(error, rc = -EFAULT);
1937 /* If fm_extent_count is non-zero, read the first extent since
1938 * it is used to calculate end_offset and device from previous
1941 if (copy_from_user(&fiemap_s->fm_extents[0],
1942 (char __user *)arg + sizeof(*fiemap_s),
1943 sizeof(struct ll_fiemap_extent)))
1944 GOTO(error, rc = -EFAULT);
1947 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
/* copy back the header plus however many extents were mapped */
1951 ret_bytes = sizeof(struct ll_user_fiemap);
1953 if (extent_count != 0)
1954 ret_bytes += (fiemap_s->fm_mapped_extents *
1955 sizeof(struct ll_fiemap_extent));
1957 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1961 OBD_FREE_LARGE(fiemap_s, num_bytes);
/*
 * Return the file's data version, computed from stripe object versions
 * on the OSTs (with server-side locking).  A file without objects is
 * treated as version 0.  @flags selects the flush mode (see below).
 */
1966 * Read the data_version for inode.
1968 * This value is computed using stripe object version on OST.
1969 * Version is computed using server side locking.
1971 * @param sync if do sync on the OST side;
1973 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1974 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
1976 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
1978 struct lov_stripe_md *lsm = NULL;
1979 struct ll_sb_info *sbi = ll_i2sbi(inode);
1980 struct obdo *obdo = NULL;
1984 /* If no stripe, we consider version is 0. */
1985 lsm = ccc_inode_lsm_get(inode);
1986 if (!lsm_has_objects(lsm)) {
1988 CDEBUG(D_INODE, "No object for inode\n");
1992 OBD_ALLOC_PTR(obdo);
1994 GOTO(out, rc = -ENOMEM);
1996 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, flags);
/* the OSTs may not report a data version at all */
1998 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
2001 *data_version = obdo->o_data_version;
2007 ccc_inode_lsm_put(inode, lsm);
/*
 * Release a file's OST data after HSM archiving: take a write lease,
 * flush and record the data version, merge size/time attributes, then
 * close the handle with MDS_OPEN_RELEASE so the MDT drops the objects.
 */
2012 * Trigger a HSM release request for the provided inode.
2014 int ll_hsm_release(struct inode *inode)
2016 struct cl_env_nest nest;
2018 struct obd_client_handle *och = NULL;
2019 __u64 data_version = 0;
2023 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2024 ll_get_fsname(inode->i_sb, NULL, 0),
2025 PFID(&ll_i2info(inode)->lli_fid));
/* an exclusive lease guards against concurrent users during release */
2027 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2029 GOTO(out, rc = PTR_ERR(och));
2031 /* Grab latest data_version and [am]time values */
2032 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2036 env = cl_env_nested_get(&nest);
2038 GOTO(out, rc = PTR_ERR(env));
2040 ll_merge_lvb(env, inode);
2041 cl_env_nested_put(&nest, env);
2043 /* Release the file.
2044 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2045 * we still need it to pack l_remote_handle to MDT. */
2046 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
2052 if (och != NULL && !IS_ERR(och)) /* close the file */
2053 ll_lease_close(och, inode, NULL);
/*
 * Scratch state for ll_swap_layouts(): the two inodes being swapped,
 * saved timestamps (ia1/ia2) for optional restore, and per-side
 * data-version check requests.  Heap-allocated to keep the stack small.
 */
2058 struct ll_swap_stack {
2059 struct iattr ia1, ia2;
2061 struct inode *inode1, *inode2;
2062 bool check_dv1, check_dv2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS core: atomically exchange the layouts of two
 * files via the MDT.  Validates permissions and same-filesystem,
 * orders the pair by FID to avoid deadlock, optionally takes group
 * locks (to flush dirty cache) and verifies data versions, then sends
 * the swap through obd_iocontrol().  mtime/atime may be preserved on
 * request.
 */
2065 static int ll_swap_layouts(struct file *file1, struct file *file2,
2066 struct lustre_swap_layouts *lsl)
2068 struct mdc_swap_layouts msl;
2069 struct md_op_data *op_data;
2072 struct ll_swap_stack *llss = NULL;
2075 OBD_ALLOC_PTR(llss);
2079 llss->inode1 = file1->f_dentry->d_inode;
2080 llss->inode2 = file2->f_dentry->d_inode;
/* only regular files can have their layouts swapped */
2082 if (!S_ISREG(llss->inode2->i_mode))
2083 GOTO(free, rc = -EINVAL);
2085 if (inode_permission(llss->inode1, MAY_WRITE) ||
2086 inode_permission(llss->inode2, MAY_WRITE))
2087 GOTO(free, rc = -EPERM);
2089 if (llss->inode2->i_sb != llss->inode1->i_sb)
2090 GOTO(free, rc = -EXDEV);
2092 /* we use 2 bool because it is easier to swap than 2 bits */
2093 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2094 llss->check_dv1 = true;
2096 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2097 llss->check_dv2 = true;
2099 /* we cannot use lsl->sl_dvX directly because we may swap them */
2100 llss->dv1 = lsl->sl_dv1;
2101 llss->dv2 = lsl->sl_dv2;
2103 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2104 if (rc == 0) /* same file, done! */
/* canonical FID ordering prevents lock-ordering deadlocks */
2107 if (rc < 0) { /* sequentialize it */
2108 swap(llss->inode1, llss->inode2);
2110 swap(llss->dv1, llss->dv2);
2111 swap(llss->check_dv1, llss->check_dv2);
2115 if (gid != 0) { /* application asks to flush dirty cache */
2116 rc = ll_get_grouplock(llss->inode1, file1, gid);
2120 rc = ll_get_grouplock(llss->inode2, file2, gid);
/* undo the first grouplock if the second one failed */
2122 ll_put_grouplock(llss->inode1, file1, gid);
2127 /* to be able to restore mtime and atime after swap
2128 * we need to first save them */
2130 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2131 llss->ia1.ia_mtime = llss->inode1->i_mtime;
2132 llss->ia1.ia_atime = llss->inode1->i_atime;
2133 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2134 llss->ia2.ia_mtime = llss->inode2->i_mtime;
2135 llss->ia2.ia_atime = llss->inode2->i_atime;
2136 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2139 /* ultimate check, before swapping the layouts we check if
2140 * dataversion has changed (if requested) */
2141 if (llss->check_dv1) {
2142 rc = ll_data_version(llss->inode1, &dv, 0);
2145 if (dv != llss->dv1)
2146 GOTO(putgl, rc = -EAGAIN);
2149 if (llss->check_dv2) {
2150 rc = ll_data_version(llss->inode2, &dv, 0);
2153 if (dv != llss->dv2)
2154 GOTO(putgl, rc = -EAGAIN);
2157 /* struct md_op_data is used to send the swap args to the mdt
2158 * only flags is missing, so we use struct mdc_swap_layouts
2159 * through the md_op_data->op_data */
2160 /* flags from user space have to be converted before they are send to
2161 * server, no flag is sent today, they are only used on the client */
2164 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2165 0, LUSTRE_OPC_ANY, &msl);
2166 if (IS_ERR(op_data))
2167 GOTO(free, rc = PTR_ERR(op_data));
2169 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2170 sizeof(*op_data), op_data, NULL);
2171 ll_finish_md_op_data(op_data);
2175 ll_put_grouplock(llss->inode2, file2, gid);
2176 ll_put_grouplock(llss->inode1, file1, gid);
2179 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2183 /* clear useless flags */
2184 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2185 llss->ia1.ia_valid &= ~ATTR_MTIME;
2186 llss->ia2.ia_valid &= ~ATTR_MTIME;
2189 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2190 llss->ia1.ia_valid &= ~ATTR_ATIME;
2191 llss->ia2.ia_valid &= ~ATTR_ATIME;
2194 /* update time if requested */
/* note ia2 -> file1 and ia1 -> file2: timestamps follow the layouts */
2196 if (llss->ia2.ia_valid != 0) {
2197 mutex_lock(&llss->inode1->i_mutex);
2198 rc = ll_setattr(file1->f_dentry, &llss->ia2);
2199 mutex_unlock(&llss->inode1->i_mutex);
2202 if (llss->ia1.ia_valid != 0) {
2205 mutex_lock(&llss->inode2->i_mutex);
2206 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2207 mutex_unlock(&llss->inode2->i_mutex);
/*
 * Set/clear HSM state flags on @inode by forwarding the request to the
 * MD export.  Flags outside HSM_USER_MASK require CAP_SYS_ADMIN.
 */
2219 static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2221 struct md_op_data *op_data;
2224 /* Non-root users are forbidden to set or clear flags which are
2225 * NOT defined in HSM_USER_MASK. */
2226 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2227 !cfs_capable(CFS_CAP_SYS_ADMIN))
/* hss travels to the MDT via the op_data private payload */
2230 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2231 LUSTRE_OPC_ANY, hss);
2232 if (IS_ERR(op_data))
2233 RETURN(PTR_ERR(op_data));
2235 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2236 sizeof(*op_data), op_data, NULL);
2238 ll_finish_md_op_data(op_data);
/*
 * Import an HSM-archived file: mark it ARCHIVED|EXISTS|RELEASED for the
 * given archive id, then force-set mode/uid/gid/size/times from the
 * user-supplied hsm_user_import so the client inode reflects the
 * archived copy.  Regular files only.
 */
2243 static int ll_hsm_import(struct inode *inode, struct file *file,
2244 struct hsm_user_import *hui)
2246 struct hsm_state_set *hss = NULL;
2247 struct iattr *attr = NULL;
2251 if (!S_ISREG(inode->i_mode))
2257 GOTO(out, rc = -ENOMEM);
/* stamp HSM state first: archived + exists + released */
2259 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2260 hss->hss_archive_id = hui->hui_archive_id;
2261 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2262 rc = ll_hsm_state_set(inode, hss);
2266 OBD_ALLOC_PTR(attr);
2268 GOTO(out, rc = -ENOMEM);
/* only permission bits are taken from the import; force S_IFREG */
2270 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2271 attr->ia_mode |= S_IFREG;
2272 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2273 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2274 attr->ia_size = hui->hui_size;
2275 attr->ia_mtime.tv_sec = hui->hui_mtime;
2276 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2277 attr->ia_atime.tv_sec = hui->hui_atime;
2278 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2280 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2281 ATTR_UID | ATTR_GID |
2282 ATTR_MTIME | ATTR_MTIME_SET |
2283 ATTR_ATIME | ATTR_ATIME_SET;
2285 rc = ll_setattr_raw(file->f_dentry, attr, true);
/*
 * The llite file ioctl dispatcher.  Tallies the op, rejects tty ioctls,
 * then switches on @cmd: striping (SETSTRIPE/GETSTRIPE/SETEA), layout
 * swap, object recreation, fiemap, group locks, FID/path translation,
 * data version, HSM state/action/import, and file leases.  Unknown
 * commands fall through to the registered ioctl handlers and finally
 * to the data export's obd_iocontrol().
 */
2299 long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2301 struct inode *inode = file->f_dentry->d_inode;
2302 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2306 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2307 PFID(ll_inode2fid(inode)), inode, cmd);
2308 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2310 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2311 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2315 case LL_IOC_GETFLAGS:
2316 /* Get the current value of the file flags */
2317 return put_user(fd->fd_flags, (int *)arg);
2318 case LL_IOC_SETFLAGS:
2319 case LL_IOC_CLRFLAGS:
2320 /* Set or clear specific file flags */
2321 /* XXX This probably needs checks to ensure the flags are
2322 * not abused, and to handle any flag side effects.
2324 if (get_user(flags, (int *) arg))
2327 if (cmd == LL_IOC_SETFLAGS) {
/* lock bypass is only sane when the page cache is not in play */
2328 if ((flags & LL_FILE_IGNORE_LOCK) &&
2329 !(file->f_flags & O_DIRECT)) {
2330 CERROR("%s: unable to disable locking on "
2331 "non-O_DIRECT file\n", current->comm);
2335 fd->fd_flags |= flags;
2337 fd->fd_flags &= ~flags;
2340 case LL_IOC_LOV_SETSTRIPE:
2341 RETURN(ll_lov_setstripe(inode, file, arg));
2342 case LL_IOC_LOV_SETEA:
2343 RETURN(ll_lov_setea(inode, file, arg));
2344 case LL_IOC_LOV_SWAP_LAYOUTS: {
2346 struct lustre_swap_layouts lsl;
2348 if (copy_from_user(&lsl, (char *)arg,
2349 sizeof(struct lustre_swap_layouts)))
/* both files must be writable for a layout swap */
2352 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2355 file2 = fget(lsl.sl_fd);
2360 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2361 rc = ll_swap_layouts(file, file2, &lsl);
2365 case LL_IOC_LOV_GETSTRIPE:
2366 RETURN(ll_lov_getstripe(inode, arg));
2367 case LL_IOC_RECREATE_OBJ:
2368 RETURN(ll_lov_recreate_obj(inode, arg));
2369 case LL_IOC_RECREATE_FID:
2370 RETURN(ll_lov_recreate_fid(inode, arg));
2371 case FSFILT_IOC_FIEMAP:
2372 RETURN(ll_ioctl_fiemap(inode, arg));
2373 case FSFILT_IOC_GETFLAGS:
2374 case FSFILT_IOC_SETFLAGS:
2375 RETURN(ll_iocontrol(inode, file, cmd, arg));
2376 case FSFILT_IOC_GETVERSION_OLD:
2377 case FSFILT_IOC_GETVERSION:
2378 RETURN(put_user(inode->i_generation, (int *)arg));
2379 case LL_IOC_GROUP_LOCK:
2380 RETURN(ll_get_grouplock(inode, file, arg));
2381 case LL_IOC_GROUP_UNLOCK:
2382 RETURN(ll_put_grouplock(inode, file, arg));
2383 case IOC_OBD_STATFS:
2384 RETURN(ll_obd_statfs(inode, (void *)arg));
2386 /* We need to special case any other ioctls we want to handle,
2387 * to send them to the MDS/OST as appropriate and to properly
2388 * network encode the arg field.
2389 case FSFILT_IOC_SETVERSION_OLD:
2390 case FSFILT_IOC_SETVERSION:
2392 case LL_IOC_FLUSHCTX:
2393 RETURN(ll_flush_ctx(inode));
2394 case LL_IOC_PATH2FID: {
2395 if (copy_to_user((void *)arg, ll_inode2fid(inode),
2396 sizeof(struct lu_fid)))
2401 case OBD_IOC_FID2PATH:
2402 RETURN(ll_fid2path(inode, (void *)arg));
2403 case LL_IOC_DATA_VERSION: {
2404 struct ioc_data_version idv;
2407 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
/* only the flush-mode bits are honoured from userspace */
2410 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2411 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2413 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2419 case LL_IOC_GET_MDTIDX: {
2422 mdtidx = ll_get_mdt_idx(inode);
2426 if (put_user((int)mdtidx, (int*)arg))
2431 case OBD_IOC_GETDTNAME:
2432 case OBD_IOC_GETMDNAME:
2433 RETURN(ll_get_obd_name(inode, cmd, arg));
2434 case LL_IOC_HSM_STATE_GET: {
2435 struct md_op_data *op_data;
2436 struct hsm_user_state *hus;
2443 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2444 LUSTRE_OPC_ANY, hus);
2445 if (IS_ERR(op_data)) {
2447 RETURN(PTR_ERR(op_data));
2450 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2453 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2456 ll_finish_md_op_data(op_data);
2460 case LL_IOC_HSM_STATE_SET: {
2461 struct hsm_state_set *hss;
2468 if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2473 rc = ll_hsm_state_set(inode, hss);
2478 case LL_IOC_HSM_ACTION: {
2479 struct md_op_data *op_data;
2480 struct hsm_current_action *hca;
2487 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2488 LUSTRE_OPC_ANY, hca);
2489 if (IS_ERR(op_data)) {
2491 RETURN(PTR_ERR(op_data));
2494 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2497 if (copy_to_user((char *)arg, hca, sizeof(*hca)))
2500 ll_finish_md_op_data(op_data);
2504 case LL_IOC_SET_LEASE: {
2505 struct ll_inode_info *lli = ll_i2info(inode);
2506 struct obd_client_handle *och = NULL;
/* requested lease mode must be compatible with the open mode */
2512 if (!(file->f_mode & FMODE_WRITE))
2517 if (!(file->f_mode & FMODE_READ))
/* mode == 0 (elided arm): release any lease held on this fd */
2522 mutex_lock(&lli->lli_och_mutex);
2523 if (fd->fd_lease_och != NULL) {
2524 och = fd->fd_lease_och;
2525 fd->fd_lease_och = NULL;
2527 mutex_unlock(&lli->lli_och_mutex);
2530 mode = och->och_flags &(FMODE_READ|FMODE_WRITE);
2531 rc = ll_lease_close(och, inode, &lease_broken);
2532 if (rc == 0 && lease_broken)
2538 /* return the type of lease or error */
2539 RETURN(rc < 0 ? rc : (int)mode);
2544 CDEBUG(D_INODE, "Set lease with mode %d\n", mode);
2546 /* apply for lease */
2547 och = ll_lease_open(inode, file, mode, 0);
2549 RETURN(PTR_ERR(och));
2552 mutex_lock(&lli->lli_och_mutex);
2553 if (fd->fd_lease_och == NULL) {
2554 fd->fd_lease_och = och;
2557 mutex_unlock(&lli->lli_och_mutex);
2559 /* impossible now that only excl is supported for now */
2560 ll_lease_close(och, inode, &lease_broken);
2565 case LL_IOC_GET_LEASE: {
2566 struct ll_inode_info *lli = ll_i2info(inode);
2567 struct ldlm_lock *lock = NULL;
2570 mutex_lock(&lli->lli_och_mutex);
2571 if (fd->fd_lease_och != NULL) {
2572 struct obd_client_handle *och = fd->fd_lease_och;
/* a cancelled lease lock means the lease is no longer valid */
2574 lock = ldlm_handle2lock(&och->och_lease_handle);
2576 lock_res_and_lock(lock);
2577 if (!ldlm_is_cancel(lock))
2578 rc = och->och_flags &
2579 (FMODE_READ | FMODE_WRITE);
2580 unlock_res_and_lock(lock);
2581 LDLM_LOCK_PUT(lock);
2584 mutex_unlock(&lli->lli_och_mutex);
2587 case LL_IOC_HSM_IMPORT: {
2588 struct hsm_user_import *hui;
2594 if (copy_from_user(hui, (void *)arg, sizeof(*hui))) {
2599 rc = ll_hsm_import(inode, file, hui);
/* unknown command: try registered handlers, then the data export */
2609 ll_iocontrol_call(inode, file, cmd, arg, &err))
2612 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
/*
 * Compat helper (only built when the kernel lacks generic
 * file_llseek_size): validate @offset against sign/maxsize rules and
 * commit it to file->f_pos, resetting f_version on a position change.
 */
2618 #ifndef HAVE_FILE_LLSEEK_SIZE
2619 static inline loff_t
2620 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
/* negative offsets are invalid unless the file allows unsigned offsets */
2622 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2624 if (offset > maxsize)
2627 if (offset != file->f_pos) {
2628 file->f_pos = offset;
/* position changed: invalidate the cached readdir/version cookie */
2629 file->f_version = 0;
/*
 * Compat implementation of generic_file_llseek_size(): compute the new
 * file position for SEEK_SET/CUR/END (and SEEK_DATA/HOLE against the
 * provided @eof), serializing SEEK_CUR updates under the inode mutex.
 */
2635 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2636 loff_t maxsize, loff_t eof)
2638 struct inode *inode = file->f_dentry->d_inode;
2646 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2647 * position-querying operation. Avoid rewriting the "same"
2648 * f_pos value back to the file because a concurrent read(),
2649 * write() or lseek() might have altered it
2654 * f_lock protects against read/modify/write race with other
2655 * SEEK_CURs. Note that parallel writes and reads behave
/* SEEK_CUR must read-modify-write f_pos atomically w.r.t. other seeks */
2658 mutex_lock(&inode->i_mutex);
2659 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2660 mutex_unlock(&inode->i_mutex);
2664 * In the generic case the entire file is data, so as long as
2665 * offset isn't at the end of the file then the offset is data.
2672 * There is a virtual hole at the end of the file, so as long as
2673 * offset isn't i_size or larger, return i_size.
2681 return llseek_execute(file, offset, maxsize);
/*
 * llseek entry point for Lustre regular files.
 * For SEEK_END/SEEK_HOLE/SEEK_DATA a glimpse RPC refreshes the cluster-wide
 * file size before delegating to ll_generic_file_llseek_size() with the
 * filesystem's maximum byte offset as the bound.
 */
2685 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2687 struct inode *inode = file->f_dentry->d_inode;
2688 loff_t retval, eof = 0;
/* compute target offset for the debug trace below (SEEK_SET base is 0) */
2691 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2692 (origin == SEEK_CUR) ? file->f_pos : 0);
2693 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2694 PFID(ll_inode2fid(inode)), inode, retval, retval,
2696 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
/* size-relative seeks need an up-to-date i_size from the OSTs */
2698 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2699 retval = ll_glimpse_size(inode);
2702 eof = i_size_read(inode);
2705 retval = ll_generic_file_llseek_size(file, offset, origin,
2706 ll_file_maxbytes(inode), eof);
/*
 * flush() file operation: report (and clear) async write errors recorded
 * for this inode so close() returns them to the application exactly once.
 * Returns -EIO if an unreported async error is pending, 0 otherwise.
 */
2710 int ll_flush(struct file *file, fl_owner_t id)
2712 struct inode *inode = file->f_dentry->d_inode;
2713 struct ll_inode_info *lli = ll_i2info(inode);
2714 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2717 LASSERT(!S_ISDIR(inode->i_mode));
2719 /* catch async errors that were recorded back when async writeback
2720 * failed for pages in this mapping. */
/* consume the sticky per-inode error so it is reported only once */
2721 rc = lli->lli_async_rc;
2722 lli->lli_async_rc = 0;
2723 if (lli->lli_clob != NULL) {
2724 err = lov_read_and_clear_async_rc(lli->lli_clob);
2729 /* The application has been told write failure already.
2730 * Do not report failure again. */
2731 if (fd->fd_write_failed)
2733 return rc ? -EIO : 0;
2737 * Called to make sure a portion of file has been written out.
2738 * if @local_only is not true, it will send OST_SYNC RPCs to ost.
2740 * Return how many pages have been written.
/*
 * Build and run a CIT_FSYNC cl_io over [start, end] in @mode; on success
 * the result is the number of pages written (fio->fi_nr_written).
 */
2742 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2743 enum cl_fsync_mode mode, int ignore_layout)
2745 struct cl_env_nest nest;
2748 struct obd_capa *capa = NULL;
2749 struct cl_fsync_io *fio;
/* reject any mode outside the known fsync modes */
2753 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2754 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2757 env = cl_env_nested_get(&nest);
2759 RETURN(PTR_ERR(env));
/* capability for OSS write access, passed down in the fsync io */
2761 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2763 io = ccc_env_thread_io(env);
2764 io->ci_obj = cl_i2info(inode)->lli_clob;
2765 io->ci_ignore_layout = ignore_layout;
2767 /* initialize parameters for sync */
2768 fio = &io->u.ci_fsync;
2769 fio->fi_capa = capa;
2770 fio->fi_start = start;
2772 fio->fi_fid = ll_inode2fid(inode);
2773 fio->fi_mode = mode;
2774 fio->fi_nr_written = 0;
2776 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2777 result = cl_io_loop(env, io);
2779 result = io->ci_result;
2781 result = fio->fi_nr_written;
2782 cl_io_fini(env, io);
2783 cl_env_nested_put(&nest, env);
2791 * When dentry is provided (the 'else' case), *file->f_dentry may be
2792 * null and dentry must be used directly rather than pulled from
2793 * *file->f_dentry as is done otherwise.
/*
 * fsync() file operation, with three kernel-ABI variants selected by
 * configure checks. Flushes dirty pages, reports cleared async errors,
 * syncs metadata via md_fsync(), and for regular files syncs OST data
 * via cl_sync_file_range(), updating fd_write_failed accordingly.
 */
2796 #ifdef HAVE_FILE_FSYNC_4ARGS
2797 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2799 struct dentry *dentry = file->f_dentry;
2800 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2801 int ll_fsync(struct file *file, int datasync)
2803 struct dentry *dentry = file->f_dentry;
2805 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2808 struct inode *inode = dentry->d_inode;
2809 struct ll_inode_info *lli = ll_i2info(inode);
2810 struct ptlrpc_request *req;
2811 struct obd_capa *oc;
2815 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2816 PFID(ll_inode2fid(inode)), inode);
2817 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2819 #ifdef HAVE_FILE_FSYNC_4ARGS
/* 4-arg variant: kernel no longer holds i_mutex for us, take it here */
2820 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2821 mutex_lock(&inode->i_mutex);
2823 /* fsync's caller has already called _fdata{sync,write}, we want
2824 * that IO to finish before calling the osc and mdc sync methods */
2825 rc = filemap_fdatawait(inode->i_mapping);
2828 /* catch async errors that were recorded back when async writeback
2829 * failed for pages in this mapping. */
2830 if (!S_ISDIR(inode->i_mode)) {
2831 err = lli->lli_async_rc;
2832 lli->lli_async_rc = 0;
2835 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* sync metadata on the MDT */
2840 oc = ll_mdscapa_get(inode);
2841 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2847 ptlrpc_req_finished(req);
2849 if (S_ISREG(inode->i_mode)) {
2850 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
/* sync file data on the OSTs over the whole object */
2852 err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
2854 if (rc == 0 && err < 0)
2857 fd->fd_write_failed = true;
2859 fd->fd_write_failed = false;
2862 #ifdef HAVE_FILE_FSYNC_4ARGS
2863 mutex_unlock(&inode->i_mutex);
/*
 * flock()/fcntl() lock operation: translate a VFS file_lock (FL_FLOCK or
 * FL_POSIX) into an LDLM_FLOCK enqueue on the MDT, then mirror the result
 * into the local lock tables (flock_lock_file_wait/posix_lock_file_wait).
 * If the local step fails, the server lock is rolled back with an LCK_NL
 * (unlock) enqueue so client and server stay consistent.
 */
2868 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2870 struct inode *inode = file->f_dentry->d_inode;
2871 struct ll_sb_info *sbi = ll_i2sbi(inode);
2872 struct ldlm_enqueue_info einfo = {
2873 .ei_type = LDLM_FLOCK,
2874 .ei_cb_cp = ldlm_flock_completion_ast,
2875 .ei_cbdata = file_lock,
2877 struct md_op_data *op_data;
2878 struct lustre_handle lockh = {0};
2879 ldlm_policy_data_t flock = {{0}};
2885 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
2886 PFID(ll_inode2fid(inode)), file_lock);
2888 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2890 if (file_lock->fl_flags & FL_FLOCK) {
2891 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2892 /* flocks are whole-file locks */
2893 flock.l_flock.end = OFFSET_MAX;
2894 /* For flocks owner is determined by the local file desctiptor*/
2895 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2896 } else if (file_lock->fl_flags & FL_POSIX) {
2897 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2898 flock.l_flock.start = file_lock->fl_start;
2899 flock.l_flock.end = file_lock->fl_end;
2903 flock.l_flock.pid = file_lock->fl_pid;
2905 /* Somewhat ugly workaround for svc lockd.
2906 * lockd installs custom fl_lmops->lm_compare_owner that checks
2907 * for the fl_owner to be the same (which it always is on local node
2908 * I guess between lockd processes) and then compares pid.
2909 * As such we assign pid to the owner field to make it all work,
2910 * conflict with normal locks is unlikely since pid space and
2911 * pointer space for current->files are not intersecting */
2912 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2913 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* map VFS lock type to LDLM mode: read -> PR, unlock -> NL, write -> PW */
2915 switch (file_lock->fl_type) {
2917 einfo.ei_mode = LCK_PR;
2920 /* An unlock request may or may not have any relation to
2921 * existing locks so we may not be able to pass a lock handle
2922 * via a normal ldlm_lock_cancel() request. The request may even
2923 * unlock a byte range in the middle of an existing lock. In
2924 * order to process an unlock request we need all of the same
2925 * information that is given with a normal read or write record
2926 * lock request. To avoid creating another ldlm unlock (cancel)
2927 * message we'll treat a LCK_NL flock request as an unlock. */
2928 einfo.ei_mode = LCK_NL;
2931 einfo.ei_mode = LCK_PW;
2934 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2935 file_lock->fl_type);
/* NOTE(review): the excerpt elides the cmd switch labels; non-blocking
 * requests use LDLM_FL_BLOCK_NOWAIT, F_GETLK-style tests use
 * LDLM_FL_TEST_LOCK — confirm labels against the full source. */
2950 flags = LDLM_FL_BLOCK_NOWAIT;
2956 flags = LDLM_FL_TEST_LOCK;
2957 /* Save the old mode so that if the mode in the lock changes we
2958 * can decrement the appropriate reader or writer refcount. */
2959 file_lock->fl_type = einfo.ei_mode;
2962 CERROR("unknown fcntl lock command: %d\n", cmd);
2966 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2967 LUSTRE_OPC_ANY, NULL);
2968 if (IS_ERR(op_data))
2969 RETURN(PTR_ERR(op_data));
2971 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
2972 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
2973 flock.l_flock.pid, flags, einfo.ei_mode,
2974 flock.l_flock.start, flock.l_flock.end);
/* enqueue the flock on the MDT */
2976 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2977 op_data, &lockh, &flock, 0, NULL /* req */, flags);
/* mirror the server's decision into the kernel's local lock lists */
2979 if ((file_lock->fl_flags & FL_FLOCK) &&
2980 (rc == 0 || file_lock->fl_type == F_UNLCK))
2981 rc2 = flock_lock_file_wait(file, file_lock);
2982 if ((file_lock->fl_flags & FL_POSIX) &&
2983 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2984 !(flags & LDLM_FL_TEST_LOCK))
2985 rc2 = posix_lock_file_wait(file, file_lock);
2987 if (rc2 && file_lock->fl_type != F_UNLCK) {
/* local bookkeeping failed: undo the server-side lock via LCK_NL */
2988 einfo.ei_mode = LCK_NL;
2989 md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2990 op_data, &lockh, &flock, 0, NULL /* req */, flags);
2994 ll_finish_md_op_data(op_data);
/*
 * Look up the FID of @name (length @namelen) under directory @parent via
 * an MDS getattr-by-name RPC. On success the FID is taken from the reply
 * mdt_body. Returns 0 or a negative errno.
 */
2999 int ll_get_fid_by_name(struct inode *parent, const char *name,
3000 int namelen, struct lu_fid *fid)
3002 struct md_op_data *op_data = NULL;
3003 struct mdt_body *body;
3004 struct ptlrpc_request *req;
3008 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3009 LUSTRE_OPC_ANY, NULL);
3010 if (IS_ERR(op_data))
3011 RETURN(PTR_ERR(op_data));
/* only the FID is needed from the getattr reply */
3013 op_data->op_valid = OBD_MD_FLID;
3014 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3015 ll_finish_md_op_data(op_data);
3019 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3021 GOTO(out_req, rc = -EFAULT);
3025 ptlrpc_req_finished(req);
/*
 * Migrate the entry @name under @parent to MDT @mdtidx using a rename RPC
 * with CLI_MIGRATE set. The child's FID comes from the dcache when the
 * dentry is cached (aliases are invalidated), otherwise from a
 * getattr-by-name lookup. A no-op if the child already lives on @mdtidx.
 */
3029 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3030 const char *name, int namelen)
3032 struct dentry *dchild = NULL;
3033 struct inode *child_inode = NULL;
3034 struct md_op_data *op_data;
3035 struct ptlrpc_request *request = NULL;
3040 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3041 name, PFID(ll_inode2fid(parent)), mdtidx);
3043 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3044 0, LUSTRE_OPC_ANY, NULL);
3045 if (IS_ERR(op_data))
3046 RETURN(PTR_ERR(op_data));
3048 /* Get child FID first */
3049 qstr.hash = full_name_hash(name, namelen);
3052 dchild = d_lookup(file->f_dentry, &qstr);
3053 if (dchild != NULL && dchild->d_inode != NULL) {
3054 op_data->op_fid3 = *ll_inode2fid(dchild->d_inode);
3055 if (dchild->d_inode != NULL) {
/* hold the inode across the migration; drop stale dentries */
3056 child_inode = igrab(dchild->d_inode);
3057 ll_invalidate_aliases(child_inode);
3061 rc = ll_get_fid_by_name(parent, name, namelen,
3067 if (!fid_is_sane(&op_data->op_fid3)) {
3068 CERROR("%s: migrate %s , but fid "DFID" is insane\n",
3069 ll_get_fsname(parent->i_sb, NULL, 0), name,
3070 PFID(&op_data->op_fid3));
3071 GOTO(out_free, rc = -EINVAL);
/* already on the target MDT? then nothing to do */
3074 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3079 CDEBUG(D_INFO, "%s:"DFID" is already on MDT%d.\n", name,
3080 PFID(&op_data->op_fid3), mdtidx);
3081 GOTO(out_free, rc = 0);
/* migration is implemented as a same-name rename with CLI_MIGRATE */
3084 op_data->op_mds = mdtidx;
3085 op_data->op_cli_flags = CLI_MIGRATE;
3086 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3087 namelen, name, namelen, &request);
3089 ll_update_times(request, parent);
3091 ptlrpc_req_finished(request);
3096 if (child_inode != NULL) {
/* the old inode is obsolete after migration */
3097 clear_nlink(child_inode);
3101 ll_finish_md_op_data(op_data);
/* -o noflock variant: flock/lock entry that refuses locking (body elided
 * in this excerpt). */
3105 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3113 * test if some locks matching bits and l_req_mode are acquired
3114 * - bits can be in different locks
3115 * - if found clear the common lock bits in *bits
3116 * - the bits not found, are kept in *bits
3118 * \param bits [IN] searched lock bits [IN]
3119 * \param l_req_mode [IN] searched lock mode
3120 * \retval boolean, true iff all bits are found
3122 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
3124 struct lustre_handle lockh;
3125 ldlm_policy_data_t policy;
/* LCK_MINMODE means "any mode": match against all four read/write modes */
3126 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
3127 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
3136 fid = &ll_i2info(inode)->lli_fid;
3137 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3138 ldlm_lockname[mode]);
/* TEST_LOCK: probe only, do not take a reference on the matched lock */
3140 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* probe each requested inodebit individually until all are resolved */
3141 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3142 policy.l_inodebits.bits = *bits & (1 << i);
3143 if (policy.l_inodebits.bits == 0)
3146 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3147 &policy, mode, &lockh)) {
3148 struct ldlm_lock *lock;
3150 lock = ldlm_handle2lock(&lockh);
/* clear every bit the matched lock covers, not just the probed one */
3153 ~(lock->l_policy_data.l_inodebits.bits);
3154 LDLM_LOCK_PUT(lock);
3156 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match (and reference) a granted MD lock on @inode covering the
 * inodebits in @bits; the handle is returned through @lockh. Returns the
 * matched ldlm mode, or 0 when no suitable lock is cached.
 */
3163 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
3164 struct lustre_handle *lockh, __u64 flags,
3167 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
3172 fid = &ll_i2info(inode)->lli_fid;
3173 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3175 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3176 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process a revalidation RPC result: -ENOENT on a non-regular,
 * non-directory inode is tolerated (object already unlinked); any other
 * error is logged (access/identity errors at D_INFO, the rest at D_ERROR).
 */
3181 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3183 /* Already unlinked. Just update nlink and return success */
3184 if (rc == -ENOENT) {
3186 /* This path cannot be hit for regular files unless in
3187 * case of obscure races, so no need to to validate
3189 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3191 } else if (rc != 0) {
3192 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3193 "%s: revalidate FID "DFID" error: rc = %d\n",
3194 ll_get_fsname(inode->i_sb, NULL, 0),
3195 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate @dentry's inode attributes against the MDS for the lock bits
 * in @ibits. Two strategies: with OBD_CONNECT_ATTRFID, an intent getattr
 * (IT_GETATTR/IT_LOOKUP) by FID; otherwise, only if no covering MD lock is
 * cached, a plain md_getattr followed by ll_prep_inode().
 */
3201 int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
3204 struct inode *inode = dentry->d_inode;
3205 struct ptlrpc_request *req = NULL;
3206 struct obd_export *exp;
3210 LASSERT(inode != NULL);
3212 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3213 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3215 exp = ll_i2mdexp(inode);
3217 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3218 * But under CMD case, it caused some lock issues, should be fixed
3219 * with new CMD ibits lock. See bug 12718 */
3220 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3221 struct lookup_intent oit = { .it_op = IT_GETATTR };
3222 struct md_op_data *op_data;
/* LOOKUP-only revalidation can use the lighter IT_LOOKUP intent */
3224 if (ibits == MDS_INODELOCK_LOOKUP)
3225 oit.it_op = IT_LOOKUP;
3227 /* Call getattr by fid, so do not provide name at all. */
3228 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3229 dentry->d_inode, NULL, 0, 0,
3230 LUSTRE_OPC_ANY, NULL);
3231 if (IS_ERR(op_data))
3232 RETURN(PTR_ERR(op_data));
3234 oit.it_create_mode |= M_CHECK_STALE;
3235 rc = md_intent_lock(exp, op_data, NULL, 0,
3236 /* we are not interested in name
3239 ll_md_blocking_ast, 0);
3240 ll_finish_md_op_data(op_data);
3241 oit.it_create_mode &= ~M_CHECK_STALE;
3243 rc = ll_inode_revalidate_fini(inode, rc);
3247 rc = ll_revalidate_it_finish(req, &oit, dentry);
3249 ll_intent_release(&oit);
3253 /* Unlinked? Unhash dentry, so it is not picked up later by
3254 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3255 here to preserve get_cwd functionality on 2.6.
3257 if (!dentry->d_inode->i_nlink)
3258 d_lustre_invalidate(dentry, 0);
3260 ll_lookup_finish_locks(&oit, dentry);
3261 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
/* no cached MD lock covers @ibits: fetch attributes explicitly */
3262 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3263 obd_valid valid = OBD_MD_FLGETATTR;
3264 struct md_op_data *op_data;
/* regular files also need striping EA to interpret size/blocks */
3267 if (S_ISREG(inode->i_mode)) {
3268 rc = ll_get_default_mdsize(sbi, &ealen);
3271 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3274 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3275 0, ealen, LUSTRE_OPC_ANY,
3277 if (IS_ERR(op_data))
3278 RETURN(PTR_ERR(op_data));
3280 op_data->op_valid = valid;
3281 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
3282 * capa for this inode. Because we only keep capas of dirs
3284 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3285 ll_finish_md_op_data(op_data);
3287 rc = ll_inode_revalidate_fini(inode, rc);
3291 rc = ll_prep_inode(&inode, req, NULL, NULL);
3294 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge per-stripe attributes (size, nlink,
 * a/m/ctime) from all MDT stripes into the llite inode info/LVB.
 */
3298 static int ll_merge_md_attr(struct inode *inode)
3300 struct cl_attr attr = { 0 };
3303 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3304 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3309 ll_i2info(inode)->lli_stripe_dir_size = attr.cat_size;
3310 ll_i2info(inode)->lli_stripe_dir_nlink = attr.cat_nlink;
3312 ll_i2info(inode)->lli_lvb.lvb_atime = attr.cat_atime;
3313 ll_i2info(inode)->lli_lvb.lvb_mtime = attr.cat_mtime;
3314 ll_i2info(inode)->lli_lvb.lvb_ctime = attr.cat_ctime;
/*
 * Full revalidation: refresh MD attributes via __ll_inode_revalidate_it(),
 * then refresh size. Non-regular objects take times from the cached LVB
 * (striped dirs first merge per-stripe attrs); regular files glimpse the
 * OSTs unless an HSM restore is running (MDT already supplied the size).
 */
3319 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
3322 struct inode *inode = dentry->d_inode;
3326 rc = __ll_inode_revalidate_it(dentry, it, ibits);
3330 /* if object isn't regular file, don't validate size */
3331 if (!S_ISREG(inode->i_mode)) {
3332 if (S_ISDIR(inode->i_mode) &&
3333 ll_i2info(inode)->lli_lsm_md != NULL) {
3334 rc = ll_merge_md_attr(inode);
3339 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
3340 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
3341 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
3343 /* In case of restore, the MDT has the right size and has
3344 * already send it back without granting the layout lock,
3345 * inode is up-to-date so glimpse is useless.
3346 * Also to glimpse we need the layout, in case of a running
3347 * restore the MDT holds the layout lock so the glimpse will
3348 * block up to the end of restore (getattr will block)
3350 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3351 rc = ll_glimpse_size(inode);
/*
 * getattr with an explicit lookup intent: revalidate UPDATE|LOOKUP bits,
 * then populate *stat from the refreshed inode. Striped directories take
 * nlink/size from the merged per-stripe values; ino may be remapped for
 * 32-bit clients via cl_fid_build_ino().
 */
3356 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
3357 struct lookup_intent *it, struct kstat *stat)
3359 struct inode *inode = de->d_inode;
3360 struct ll_sb_info *sbi = ll_i2sbi(inode);
3361 struct ll_inode_info *lli = ll_i2info(inode);
3364 res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
3365 MDS_INODELOCK_LOOKUP);
3366 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3371 stat->dev = inode->i_sb->s_dev;
/* 32-bit userland needs an ino that fits in 32 bits */
3372 if (ll_need_32bit_api(sbi))
3373 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3375 stat->ino = inode->i_ino;
3376 stat->mode = inode->i_mode;
3377 stat->uid = inode->i_uid;
3378 stat->gid = inode->i_gid;
3379 stat->rdev = inode->i_rdev;
3380 stat->atime = inode->i_atime;
3381 stat->mtime = inode->i_mtime;
3382 stat->ctime = inode->i_ctime;
3383 stat->blksize = 1 << inode->i_blkbits;
3384 stat->blocks = inode->i_blocks;
3386 if (S_ISDIR(inode->i_mode) &&
3387 ll_i2info(inode)->lli_lsm_md != NULL) {
/* striped dir: report the merged values, not a single stripe's */
3388 stat->nlink = lli->lli_stripe_dir_nlink;
3389 stat->size = lli->lli_stripe_dir_size;
3391 stat->nlink = inode->i_nlink;
3392 stat->size = i_size_read(inode);
/* VFS ->getattr entry point: wraps ll_getattr_it() with IT_GETATTR */
3397 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3399 struct lookup_intent it = { .it_op = IT_GETATTR };
3401 return ll_getattr_it(mnt, de, &it, stat);
/*
 * VFS ->fiemap entry: marshal fieinfo into an ll_user_fiemap request
 * sized for fi_extents_max extents, run ll_do_fiemap(), and copy the
 * mapped extents back into the caller's extent array.
 */
3404 int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3405 __u64 start, __u64 len)
3409 struct ll_user_fiemap *fiemap;
3410 unsigned int extent_count = fieinfo->fi_extents_max;
3412 num_bytes = sizeof(*fiemap) + (extent_count *
3413 sizeof(struct ll_fiemap_extent));
3414 OBD_ALLOC_LARGE(fiemap, num_bytes);
3419 fiemap->fm_flags = fieinfo->fi_flags;
3420 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3421 fiemap->fm_start = start;
3422 fiemap->fm_length = len;
/* copy only the first extent: FIEMAP may continue from a prior extent */
3423 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3424 sizeof(struct ll_fiemap_extent));
3426 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3428 fieinfo->fi_flags = fiemap->fm_flags;
3429 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3430 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3431 fiemap->fm_mapped_extents * sizeof(struct ll_fiemap_extent));
3433 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * Return a referenced copy of the cached POSIX ACL of @type; the VFS
 * permission path releases the reference.
 * NOTE(review): @type is not consulted in the visible lines — presumably
 * only ACL_TYPE_ACCESS is cached; confirm against the full source.
 */
3437 struct posix_acl * ll_get_acl(struct inode *inode, int type)
3439 struct ll_inode_info *lli = ll_i2info(inode);
3440 struct posix_acl *acl = NULL;
3443 spin_lock(&lli->lli_lock);
3444 /* VFS' acl_permission_check->check_acl will release the refcount */
3445 acl = posix_acl_dup(lli->lli_posix_acl);
3446 spin_unlock(&lli->lli_lock);
/* ACL check callback for kernels whose generic_permission() takes one;
 * signature varies with the kernel ABI. RCU walks bail out early since
 * taking lli_lock is not RCU-safe. */
3451 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3453 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3454 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3456 ll_check_acl(struct inode *inode, int mask)
3459 # ifdef CONFIG_FS_POSIX_ACL
3460 struct posix_acl *acl;
3464 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3465 if (flags & IPERM_FLAG_RCU)
3468 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3473 rc = posix_acl_permission(inode, acl, mask);
3474 posix_acl_release(acl);
3477 # else /* !CONFIG_FS_POSIX_ACL */
3479 # endif /* CONFIG_FS_POSIX_ACL */
3481 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * VFS ->permission entry (three kernel-ABI variants). Revalidates the
 * root inode before checking, delegates to the remote-permission path on
 * RMT_CLIENT mounts, otherwise to generic permission + ll_check_acl.
 * RCU-mode walks (MAY_NOT_BLOCK / IPERM_FLAG_RCU) return early since the
 * revalidation below may block.
 */
3483 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3484 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3486 # ifdef HAVE_INODE_PERMISION_2ARGS
3487 int ll_inode_permission(struct inode *inode, int mask)
3489 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3496 #ifdef MAY_NOT_BLOCK
3497 if (mask & MAY_NOT_BLOCK)
3499 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3500 if (flags & IPERM_FLAG_RCU)
3504 /* as root inode are NOT getting validated in lookup operation,
3505 * need to do it before permission check. */
3507 if (inode == inode->i_sb->s_root->d_inode) {
3508 struct lookup_intent it = { .it_op = IT_LOOKUP };
3510 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
3511 MDS_INODELOCK_LOOKUP);
3516 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3517 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3519 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
3520 return lustre_check_remote_perm(inode, mask);
3522 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
3523 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3528 /* -o localflock - only provides locally consistent flock locks */
/* Default file ops: no .flock/.lock, so the kernel's local flock code
 * applies (locally consistent only). */
3529 struct file_operations ll_file_operations = {
3530 .read = ll_file_read,
3531 .aio_read = ll_file_aio_read,
3532 .write = ll_file_write,
3533 .aio_write = ll_file_aio_write,
3534 .unlocked_ioctl = ll_file_ioctl,
3535 .open = ll_file_open,
3536 .release = ll_file_release,
3537 .mmap = ll_file_mmap,
3538 .llseek = ll_file_seek,
3539 .splice_read = ll_file_splice_read,
/* -o flock: cluster-coherent locks, routed through ll_file_flock */
3544 struct file_operations ll_file_operations_flock = {
3545 .read = ll_file_read,
3546 .aio_read = ll_file_aio_read,
3547 .write = ll_file_write,
3548 .aio_write = ll_file_aio_write,
3549 .unlocked_ioctl = ll_file_ioctl,
3550 .open = ll_file_open,
3551 .release = ll_file_release,
3552 .mmap = ll_file_mmap,
3553 .llseek = ll_file_seek,
3554 .splice_read = ll_file_splice_read,
3557 .flock = ll_file_flock,
3558 .lock = ll_file_flock
3561 /* These are for -o noflock - to return ENOSYS on flock calls */
3562 struct file_operations ll_file_operations_noflock = {
3563 .read = ll_file_read,
3564 .aio_read = ll_file_aio_read,
3565 .write = ll_file_write,
3566 .aio_write = ll_file_aio_write,
3567 .unlocked_ioctl = ll_file_ioctl,
3568 .open = ll_file_open,
3569 .release = ll_file_release,
3570 .mmap = ll_file_mmap,
3571 .llseek = ll_file_seek,
3572 .splice_read = ll_file_splice_read,
3575 .flock = ll_file_noflock,
3576 .lock = ll_file_noflock
/* inode operations shared by all regular-file mounts */
3579 struct inode_operations ll_file_inode_operations = {
3580 .setattr = ll_setattr,
3581 .getattr = ll_getattr,
3582 .permission = ll_inode_permission,
3583 .setxattr = ll_setxattr,
3584 .getxattr = ll_getxattr,
3585 .listxattr = ll_listxattr,
3586 .removexattr = ll_removexattr,
3587 .fiemap = ll_fiemap,
3588 #ifdef HAVE_IOP_GET_ACL
3589 .get_acl = ll_get_acl,
3593 /* dynamic ioctl number support routins */
/* Global registry of dynamically registered ioctl handlers, protected by
 * an rwsem; readers iterate in ll_iocontrol_call(). */
3594 static struct llioc_ctl_data {
3595 struct rw_semaphore ioc_sem;
3596 cfs_list_t ioc_head;
3598 __RWSEM_INITIALIZER(llioc.ioc_sem),
3599 CFS_LIST_HEAD_INIT(llioc.ioc_head)
/* One registration: callback plus the flexible array of cmd numbers it
 * claims (iocd_count entries in iocd_cmd[]). */
3604 cfs_list_t iocd_list;
3605 unsigned int iocd_size;
3606 llioc_callback_t iocd_cb;
3607 unsigned int iocd_count;
3608 unsigned int iocd_cmd[0];
/*
 * Register @cb for @count ioctl numbers in @cmd. Returns an opaque magic
 * (the allocation itself) used by ll_iocontrol_unregister(), or NULL on
 * bad arguments / allocation failure.
 */
3611 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3614 struct llioc_data *in_data = NULL;
3617 if (cb == NULL || cmd == NULL ||
3618 count > LLIOC_MAX_CMD || count < 0)
3621 size = sizeof(*in_data) + count * sizeof(unsigned int);
3622 OBD_ALLOC(in_data, size);
3623 if (in_data == NULL)
3626 memset(in_data, 0, sizeof(*in_data));
3627 in_data->iocd_size = size;
3628 in_data->iocd_cb = cb;
3629 in_data->iocd_count = count;
3630 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3632 down_write(&llioc.ioc_sem);
3633 cfs_list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3634 up_write(&llioc.ioc_sem);
/*
 * Remove and free the registration identified by @magic (the pointer
 * returned by ll_iocontrol_register); warns if it is not found.
 */
3639 void ll_iocontrol_unregister(void *magic)
3641 struct llioc_data *tmp;
3646 down_write(&llioc.ioc_sem);
3647 cfs_list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3649 unsigned int size = tmp->iocd_size;
3651 cfs_list_del(&tmp->iocd_list);
/* drop the sem before freeing; the entry is already unlinked */
3652 up_write(&llioc.ioc_sem);
3654 OBD_FREE(tmp, size);
3658 up_write(&llioc.ioc_sem);
3660 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3663 EXPORT_SYMBOL(ll_iocontrol_register);
3664 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch an unrecognized ioctl to registered handlers. Walks every
 * registration claiming @cmd, invoking its callback until one returns
 * LLIOC_STOP; the callback's rc is passed back through *rcp.
 */
3666 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
3667 unsigned int cmd, unsigned long arg, int *rcp)
3669 enum llioc_iter ret = LLIOC_CONT;
3670 struct llioc_data *data;
3671 int rc = -EINVAL, i;
3673 down_read(&llioc.ioc_sem);
3674 cfs_list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3675 for (i = 0; i < data->iocd_count; i++) {
3676 if (cmd != data->iocd_cmd[i])
3679 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3683 if (ret == LLIOC_STOP)
3686 up_read(&llioc.ioc_sem);
/*
 * Push a layout configuration into the cl_object stack via cl_conf_set().
 * For OBJECT_CONF_SET the layout lock is only allowed to match after the
 * layout is applied, so other threads never see a stale layout.
 */
3693 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3695 struct ll_inode_info *lli = ll_i2info(inode);
3696 struct cl_env_nest nest;
3701 if (lli->lli_clob == NULL)
3704 env = cl_env_nested_get(&nest);
3706 RETURN(PTR_ERR(env));
3708 result = cl_conf_set(env, lli->lli_clob, conf);
3709 cl_env_nested_put(&nest, env);
3711 if (conf->coc_opc == OBJECT_CONF_SET) {
3712 struct ldlm_lock *lock = conf->coc_lock;
3714 LASSERT(lock != NULL);
3715 LASSERT(ldlm_has_layout(lock));
3717 /* it can only be allowed to match after layout is
3718 * applied to inode otherwise false layout would be
3719 * seen. Applying layout shoud happen before dropping
3720 * the intent lock. */
3721 ldlm_lock_allow_match(lock);
3727 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/*
 * Fetch the LOV layout xattr from the MDT and install it as the layout
 * lock's LVB. Needed when the lock was granted via completion AST, since
 * the completion-AST LVB buffer is too small to carry the layout.
 */
3728 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3731 struct ll_sb_info *sbi = ll_i2sbi(inode);
3732 struct obd_capa *oc;
3733 struct ptlrpc_request *req;
3734 struct mdt_body *body;
3741 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3742 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
3743 lock->l_lvb_data, lock->l_lvb_len);
/* LVB already populated and ready: nothing to fetch */
3745 if ((lock->l_lvb_data != NULL) && ldlm_is_lvb_ready(lock))
3748 /* if layout lock was granted right away, the layout is returned
3749 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3750 * blocked and then granted via completion ast, we have to fetch
3751 * layout here. Please note that we can't use the LVB buffer in
3752 * completion AST because it doesn't have a large enough buffer */
3753 oc = ll_mdscapa_get(inode);
3754 rc = ll_get_default_mdsize(sbi, &lmmsize);
3756 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3757 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3763 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3765 GOTO(out, rc = -EPROTO);
3767 lmmsize = body->eadatasize;
3768 if (lmmsize == 0) /* empty layout */
3771 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3773 GOTO(out, rc = -EFAULT);
/* copy out of the RPC reply buffer before it is freed below */
3775 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3776 if (lvbdata == NULL)
3777 GOTO(out, rc = -ENOMEM);
3779 memcpy(lvbdata, lmm, lmmsize);
3780 lock_res_and_lock(lock);
3781 if (lock->l_lvb_data != NULL)
3782 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3784 lock->l_lvb_data = lvbdata;
3785 lock->l_lvb_len = lmmsize;
3786 unlock_res_and_lock(lock);
3791 ptlrpc_req_finished(req);
3796 * Apply the layout to the inode. Layout lock is held and will be released
/*
 * Holding the layout lock in @lockh/@mode: fetch the layout if needed,
 * unpack it from the lock's LVB, configure the cl_object stack with it
 * (OBJECT_CONF_SET), and return the layout generation through @gen.
 * If reconfiguration hits -EBUSY, waits for in-flight IO to drain
 * (OBJECT_CONF_WAIT) after releasing the lock. Drops the lock reference
 * in all paths.
 */
3799 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3800 struct inode *inode, __u32 *gen, bool reconf)
3802 struct ll_inode_info *lli = ll_i2info(inode);
3803 struct ll_sb_info *sbi = ll_i2sbi(inode);
3804 struct ldlm_lock *lock;
3805 struct lustre_md md = { NULL };
3806 struct cl_object_conf conf;
3809 bool wait_layout = false;
3812 LASSERT(lustre_handle_is_used(lockh));
3814 lock = ldlm_handle2lock(lockh);
3815 LASSERT(lock != NULL);
3816 LASSERT(ldlm_has_layout(lock));
3818 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured: %d\n",
3819 PFID(&lli->lli_fid), inode, reconf);
3821 /* in case this is a caching lock and reinstate with new inode */
3822 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3824 lock_res_and_lock(lock);
3825 lvb_ready = ldlm_is_lvb_ready(lock);
3826 unlock_res_and_lock(lock);
3827 /* checking lvb_ready is racy but this is okay. The worst case is
3828 * that multi processes may configure the file on the same time. */
3830 if (lvb_ready || !reconf) {
3833 /* layout_gen must be valid if layout lock is not
3834 * cancelled and stripe has already set */
3835 *gen = ll_layout_version_get(lli);
3841 rc = ll_layout_fetch(inode, lock);
3845 /* for layout lock, lmm is returned in lock's lvb.
3846 * lvb_data is immutable if the lock is held so it's safe to access it
3847 * without res lock. See the description in ldlm_lock_decref_internal()
3848 * for the condition to free lvb_data of layout lock */
3849 if (lock->l_lvb_data != NULL) {
3850 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3851 lock->l_lvb_data, lock->l_lvb_len);
/* NULL lsm after unpack means a file without striping */
3853 *gen = LL_LAYOUT_GEN_EMPTY;
3855 *gen = md.lsm->lsm_layout_gen;
3858 CERROR("%s: file "DFID" unpackmd error: %d\n",
3859 ll_get_fsname(inode->i_sb, NULL, 0),
3860 PFID(&lli->lli_fid), rc);
3866 /* set layout to file. Unlikely this will fail as old layout was
3867 * surely eliminated */
3868 memset(&conf, 0, sizeof conf);
3869 conf.coc_opc = OBJECT_CONF_SET;
3870 conf.coc_inode = inode;
3871 conf.coc_lock = lock;
3872 conf.u.coc_md = &md;
3873 rc = ll_layout_conf(inode, &conf);
3876 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3878 /* refresh layout failed, need to wait */
3879 wait_layout = rc == -EBUSY;
3883 LDLM_LOCK_PUT(lock);
3884 ldlm_lock_decref(lockh, mode);
3886 /* wait for IO to complete if it's still being used. */
3888 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
3889 ll_get_fsname(inode->i_sb, NULL, 0),
3890 PFID(&lli->lli_fid), inode);
3892 memset(&conf, 0, sizeof conf);
3893 conf.coc_opc = OBJECT_CONF_WAIT;
3894 conf.coc_inode = inode;
3895 rc = ll_layout_conf(inode, &conf);
3899 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
3900 ll_get_fsname(inode->i_sb, NULL, 0),
3901 PFID(&lli->lli_fid), rc);
3907 * This function checks if there exists a LAYOUT lock on the client side,
3908 * or enqueues it if it doesn't have one in cache.
3910 * This function will not hold layout lock so it may be revoked any time after
3911 * this function returns. Any operations depend on layout should be redone
3914 * This function should be called before lov_io_init() to get an uptodate
3915 * layout version, the caller should save the version number and after IO
3916 * is finished, this function should be called again to verify that layout
3917 * is not changed during IO time.
3919 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3921 struct ll_inode_info *lli = ll_i2info(inode);
3922 struct ll_sb_info *sbi = ll_i2sbi(inode);
3923 struct md_op_data *op_data;
3924 struct lookup_intent it;
3925 struct lustre_handle lockh;
3927 struct ldlm_enqueue_info einfo = {
3928 .ei_type = LDLM_IBITS,
3930 .ei_cb_bl = ll_md_blocking_ast,
3931 .ei_cb_cp = ldlm_completion_ast,
/* fast path: layout already known, or server lacks layout-lock support */
3936 *gen = ll_layout_version_get(lli);
3937 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
3941 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3942 LASSERT(S_ISREG(inode->i_mode));
3944 /* take layout lock mutex to enqueue layout lock exclusively. */
3945 mutex_lock(&lli->lli_layout_mutex);
3948 /* mostly layout lock is caching on the local side, so try to match
3949 * it before grabbing layout lock mutex. */
3950 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3951 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3952 if (mode != 0) { /* hit cached lock */
3953 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3957 mutex_unlock(&lli->lli_layout_mutex);
3961 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3962 0, 0, LUSTRE_OPC_ANY, NULL);
3963 if (IS_ERR(op_data)) {
3964 mutex_unlock(&lli->lli_layout_mutex);
3965 RETURN(PTR_ERR(op_data));
3968 /* have to enqueue one */
3969 memset(&it, 0, sizeof(it));
3970 it.it_op = IT_LAYOUT;
3971 lockh.cookie = 0ULL;
3973 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)\n",
3974 ll_get_fsname(inode->i_sb, NULL, 0),
3975 PFID(&lli->lli_fid), inode);
3977 rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
/* the enqueue reply request is not needed past this point */
3979 if (it.d.lustre.it_data != NULL)
3980 ptlrpc_req_finished(it.d.lustre.it_data);
3981 it.d.lustre.it_data = NULL;
3983 ll_finish_md_op_data(op_data);
/* take ownership of the granted lock mode out of the intent */
3985 mode = it.d.lustre.it_lock_mode;
3986 it.d.lustre.it_lock_mode = 0;
3987 ll_intent_drop_lock(&it);
3990 /* set lock data in case this is a new lock */
3991 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3992 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3996 mutex_unlock(&lli->lli_layout_mutex);
4002 * This function send a restore request to the MDT
4004 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4006 struct hsm_user_request *hur;
4010 len = sizeof(struct hsm_user_request) +
4011 sizeof(struct hsm_user_item);
4012 OBD_ALLOC(hur, len);
4016 hur->hur_request.hr_action = HUA_RESTORE;
4017 hur->hur_request.hr_archive_id = 0;
4018 hur->hur_request.hr_flags = 0;
4019 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4020 sizeof(hur->hur_user_item[0].hui_fid));
4021 hur->hur_user_item[0].hui_extent.offset = offset;
4022 hur->hur_user_item[0].hui_extent.length = length;
4023 hur->hur_request.hr_itemcount = 1;
4024 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,