4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2013, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
51 #include "cl_object.h"
/* Allocate a per-open-file ll_file_data from the dedicated slab cache
 * (GFP_NOFS to avoid recursing into the filesystem under memory pressure)
 * and clear its write-failure flag.
 * NOTE(review): excerpt elides lines here (allocation-failure check and
 * the return statement) — confirm against the full source. */
53 struct ll_file_data *ll_file_data_get(void)
55 struct ll_file_data *fd;
57 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
61 fd->fd_write_failed = false;
/* Return a ll_file_data to its slab cache. Counterpart of
 * ll_file_data_get(); caller must hold the only remaining reference. */
66 static void ll_file_data_put(struct ll_file_data *fd)
69 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/* Pack the inode's current attributes into @op_data for an MDS RPC:
 * fid, mode, a/m/ctime, size, blocks, external flags, IO epoch, the
 * open handle @fh, and the MDS capability. If the inode is flagged
 * LLIF_DATA_MODIFIED, also set the MDS_DATA_MODIFIED bias so the MDS
 * learns the data changed under this epoch. */
72 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
73 struct lustre_handle *fh)
75 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
76 op_data->op_attr.ia_mode = inode->i_mode;
77 op_data->op_attr.ia_atime = inode->i_atime;
78 op_data->op_attr.ia_mtime = inode->i_mtime;
79 op_data->op_attr.ia_ctime = inode->i_ctime;
80 op_data->op_attr.ia_size = i_size_read(inode);
81 op_data->op_attr_blocks = inode->i_blocks;
/* ia_attr_flags lives in the ll_iattr wrapper, hence the cast. */
82 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
83 ll_inode_to_ext_flags(inode->i_flags);
84 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
86 op_data->op_handle = *fh;
87 op_data->op_capa1 = ll_mdscapa_get(inode);
89 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
90 op_data->op_bias |= MDS_DATA_MODIFIED;
94 * Closes the IO epoch and packs all the attributes into @op_data for
/* Prepare @op_data for the close RPC of open handle @och on @inode:
 * always send the timestamps; additionally send size/blocks when the
 * handle was writable and either SOM is unsupported by the MDT or the
 * inode is not a regular file. Then close the epoch and pack the inode
 * state. NOTE(review): excerpt elides lines (e.g. the write-mode early
 * path between the FMODE_WRITE check and the SOM check) — confirm
 * control flow against the full source. */
97 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
98 struct obd_client_handle *och)
102 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
103 ATTR_MTIME | ATTR_MTIME_SET |
104 ATTR_CTIME | ATTR_CTIME_SET;
106 if (!(och->och_flags & FMODE_WRITE))
109 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
110 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
112 ll_ioepoch_close(inode, op_data, &och, 0);
115 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
116 ll_prep_md_op_data(op_data, inode, NULL, NULL,
117 0, 0, LUSTRE_OPC_ANY, NULL);
/* Send an MDS_CLOSE for open handle @och on @inode.
 *
 * If @data_version is non-NULL this is an HSM release: the data version
 * and lease handle are packed and size/blocks are forced valid. After
 * md_close(), a SOM-enabled close may require a follow-up Size-on-MDS
 * setattr (ll_som_update). On success the LLIF_DATA_MODIFIED flag is
 * cleared if it was sent, destroyed OST objects are cleaned up, and the
 * handle cookie is poisoned with DEAD_HANDLE_MAGIC.
 * NOTE(review): several error/branch lines are elided in this excerpt
 * (e.g. the rc dispatch after md_close) — verify paths in full source. */
121 static int ll_close_inode_openhandle(struct obd_export *md_exp,
123 struct obd_client_handle *och,
124 const __u64 *data_version)
126 struct obd_export *exp = ll_i2mdexp(inode);
127 struct md_op_data *op_data;
128 struct ptlrpc_request *req = NULL;
129 struct obd_device *obd = class_exp2obd(exp);
136 * XXX: in case of LMV, is this correct to access
139 CERROR("Invalid MDC connection handle "LPX64"\n",
140 ll_i2mdexp(inode)->exp_handle.h_cookie);
144 OBD_ALLOC_PTR(op_data);
146 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
148 ll_prepare_close(inode, op_data, och);
149 if (data_version != NULL) {
150 /* Pass in data_version implies release. */
151 op_data->op_bias |= MDS_HSM_RELEASE;
152 op_data->op_data_version = *data_version;
153 op_data->op_lease_handle = och->och_lease_handle;
154 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
156 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
157 rc = md_close(md_exp, op_data, och->och_mod, &req);
159 /* This close must have the epoch closed. */
160 LASSERT(epoch_close);
161 /* MDS has instructed us to obtain Size-on-MDS attribute from
162 * OSTs and send setattr to back to MDS. */
163 rc = ll_som_update(inode, op_data);
165 CERROR("%s: inode "DFID" mdc Size-on-MDS update"
166 " failed: rc = %d\n",
167 ll_i2mdexp(inode)->exp_obd->obd_name,
168 PFID(ll_inode2fid(inode)), rc);
172 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
173 ll_i2mdexp(inode)->exp_obd->obd_name,
174 PFID(ll_inode2fid(inode)), rc);
177 /* DATA_MODIFIED flag was successfully sent on close, cancel data
178 * modification flag. */
179 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
180 struct ll_inode_info *lli = ll_i2info(inode);
182 spin_lock(&lli->lli_lock);
183 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
184 spin_unlock(&lli->lli_lock);
188 rc = ll_objects_destroy(req, inode);
190 CERROR("%s: inode "DFID
191 " ll_objects destroy: rc = %d\n",
192 ll_i2mdexp(inode)->exp_obd->obd_name,
193 PFID(ll_inode2fid(inode)), rc);
/* For HSM release: confirm the MDT actually released the file. */
196 if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
197 struct mdt_body *body;
198 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
199 if (!(body->valid & OBD_MD_FLRELEASED))
203 ll_finish_md_op_data(op_data);
/* If the epoch stayed open on a SOM write handle, DONE_WRITING
 * must still be sent; queue it instead of freeing @och now. */
207 if (exp_connect_som(exp) && !epoch_close &&
208 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
209 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
211 md_clear_open_replay_data(md_exp, och);
212 /* Free @och if it is not waiting for DONE_WRITING. */
213 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
216 if (req) /* This is close request */
217 ptlrpc_req_finished(req);
/* Really close the per-mode MDS open handle on @inode for @fmode
 * (write/exec/read each have their own cached handle and use count).
 * If other users of the handle remain, skip the close; otherwise take
 * the handle under lli_och_mutex and send the close RPC.
 * NOTE(review): excerpt elides the handle-swap lines between the mutex
 * unlock (L141) and the close call (L143) — confirm in full source. */
221 int ll_md_real_close(struct inode *inode, fmode_t fmode)
223 struct ll_inode_info *lli = ll_i2info(inode);
224 struct obd_client_handle **och_p;
225 struct obd_client_handle *och;
/* Select the cached handle slot and use counter for this open mode. */
230 if (fmode & FMODE_WRITE) {
231 och_p = &lli->lli_mds_write_och;
232 och_usecount = &lli->lli_open_fd_write_count;
233 } else if (fmode & FMODE_EXEC) {
234 och_p = &lli->lli_mds_exec_och;
235 och_usecount = &lli->lli_open_fd_exec_count;
237 LASSERT(fmode & FMODE_READ);
238 och_p = &lli->lli_mds_read_och;
239 och_usecount = &lli->lli_open_fd_read_count;
242 mutex_lock(&lli->lli_och_mutex);
243 if (*och_usecount > 0) {
244 /* There are still users of this handle, so skip
246 mutex_unlock(&lli->lli_och_mutex);
252 mutex_unlock(&lli->lli_och_mutex);
255 /* There might be a race and this handle may already
257 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/* Per-file-descriptor close: drop group lock, clean up a leaked lease,
 * close a private och if present, decrement the per-mode open count,
 * and call ll_md_real_close() unless a matching OPEN DLM lock lets us
 * skip talking to the MDS. Finally release the ll_file_data and the
 * OSS capability. */
264 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
267 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
268 struct ll_inode_info *lli = ll_i2info(inode);
272 /* clear group lock, if present */
273 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
274 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
276 if (fd->fd_lease_och != NULL) {
279 /* Usually the lease is not released when the
280 * application crashed, we need to release here. */
281 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
282 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
283 PFID(&lli->lli_fid), rc, lease_broken);
285 fd->fd_lease_och = NULL;
288 if (fd->fd_och != NULL) {
289 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
294 /* Let's see if we have good enough OPEN lock on the file and if
295 we can skip talking to MDS */
296 if (file->f_dentry->d_inode) { /* Can this ever be false? */
298 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
299 struct lustre_handle lockh;
300 struct inode *inode = file->f_dentry->d_inode;
301 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
/* Drop this fd's contribution to the per-mode open count. */
303 mutex_lock(&lli->lli_och_mutex);
304 if (fd->fd_omode & FMODE_WRITE) {
306 LASSERT(lli->lli_open_fd_write_count);
307 lli->lli_open_fd_write_count--;
308 } else if (fd->fd_omode & FMODE_EXEC) {
310 LASSERT(lli->lli_open_fd_exec_count);
311 lli->lli_open_fd_exec_count--;
314 LASSERT(lli->lli_open_fd_read_count);
315 lli->lli_open_fd_read_count--;
317 mutex_unlock(&lli->lli_och_mutex);
/* No cached OPEN lock matched: must really close on the MDS. */
319 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
320 LDLM_IBITS, &policy, lockmode,
322 rc = ll_md_real_close(file->f_dentry->d_inode,
326 CERROR("Releasing a file %p with negative dentry %p. Name %s",
327 file, file->f_dentry, file->f_dentry->d_name.name);
331 LUSTRE_FPRIVATE(file) = NULL;
332 ll_file_data_put(fd);
333 ll_capa_close(inode);
338 /* While this returns an error code, fput() the caller does not, so we need
339 * to make every effort to clean up all of our state here. Also, applications
340 * rarely check close errors and even if an error is returned they will not
341 * re-try the close call.
/* VFS ->release() for Lustre files. Tears down remote-client ACL state
 * on the root inode, stops a statahead thread this fd started, clears
 * pending async write errors on regular files, and closes the MDS
 * handle via ll_md_close(). The root directory short-circuits without
 * an MDS close. */
343 int ll_file_release(struct inode *inode, struct file *file)
345 struct ll_file_data *fd;
346 struct ll_sb_info *sbi = ll_i2sbi(inode);
347 struct ll_inode_info *lli = ll_i2info(inode);
351 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
352 PFID(ll_inode2fid(inode)), inode);
354 #ifdef CONFIG_FS_POSIX_ACL
/* Remote-client ACL cleanup applies only to the filesystem root. */
355 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
356 inode == inode->i_sb->s_root->d_inode) {
357 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
360 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
361 fd->fd_flags &= ~LL_FILE_RMTACL;
362 rct_del(&sbi->ll_rct, current_pid());
363 et_search_free(&sbi->ll_et, current_pid());
368 if (inode->i_sb->s_root != file->f_dentry)
369 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
370 fd = LUSTRE_FPRIVATE(file);
373 /* The last ref on @file, maybe not the the owner pid of statahead.
374 * Different processes can open the same dir, "ll_opendir_key" means:
375 * it is me that should stop the statahead thread. */
376 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
377 lli->lli_opendir_pid != 0)
378 ll_stop_statahead(inode, lli->lli_opendir_key);
380 if (inode->i_sb->s_root == file->f_dentry) {
381 LUSTRE_FPRIVATE(file) = NULL;
382 ll_file_data_put(fd);
386 if (!S_ISDIR(inode->i_mode)) {
387 if (lli->lli_clob != NULL)
388 lov_read_and_clear_async_rc(lli->lli_clob);
389 lli->lli_async_rc = 0;
392 rc = ll_md_close(sbi->ll_md_exp, inode, file);
394 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
395 libcfs_debug_dumplog();
/* Issue an IT_OPEN intent RPC to the MDS for @file, requesting an OPEN
 * lock (MDS_OPEN_LOCK) unless stripe parameters @lmm/@lmmsize are being
 * set. On success, updates the inode from the reply and records the
 * granted lock. The ESTALE path keeps its own exit to avoid flooding
 * the log. */
400 static int ll_intent_file_open(struct file *file, void *lmm,
401 int lmmsize, struct lookup_intent *itp)
403 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
404 struct dentry *parent = file->f_dentry->d_parent;
405 const char *name = file->f_dentry->d_name.name;
406 const int len = file->f_dentry->d_name.len;
407 struct md_op_data *op_data;
408 struct ptlrpc_request *req;
409 __u32 opc = LUSTRE_OPC_ANY;
416 /* Usually we come here only for NFSD, and we want open lock.
417 But we can also get here with pre 2.6.15 patchless kernels, and in
418 that case that lock is also ok */
419 /* We can also get here if there was cached open handle in revalidate_it
420 * but it disappeared while we were getting from there to ll_file_open.
421 * But this means this file was closed and immediatelly opened which
422 * makes a good candidate for using OPEN lock */
423 /* If lmmsize & lmm are not 0, we are just setting stripe info
424 * parameters. No need for the open lock */
425 if (lmm == NULL && lmmsize == 0) {
426 itp->it_flags |= MDS_OPEN_LOCK;
427 if (itp->it_flags & FMODE_WRITE)
428 opc = LUSTRE_OPC_CREATE;
431 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
432 file->f_dentry->d_inode, name, len,
435 RETURN(PTR_ERR(op_data));
437 itp->it_flags |= MDS_OPEN_BY_FID;
438 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
439 0 /*unused */, &req, ll_md_blocking_ast, 0);
440 ll_finish_md_op_data(op_data);
442 /* reason for keep own exit path - don`t flood log
443 * with messages with -ESTALE errors.
/* Server-side open succeeded but the client raced/erred: drop the
 * server's open handle before bailing out. */
445 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
446 it_open_error(DISP_OPEN_OPEN, itp))
448 ll_release_openhandle(file->f_dentry, itp);
452 if (it_disposition(itp, DISP_LOOKUP_NEG))
453 GOTO(out, rc = -ENOENT);
455 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
456 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
457 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
461 rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
462 if (!rc && itp->d.lustre.it_lock_mode)
463 ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
467 ptlrpc_req_finished(req);
468 ll_intent_drop_lock(itp);
474 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
475 * not believe attributes if a few ioepoch holders exist. Attributes for
476 * previous ioepoch if new one is opened are also skipped by MDS.
/* Record a newly-granted IO epoch on the inode, if it differs from the
 * one currently cached. Zero @ioepoch is ignored. */
478 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
480 if (ioepoch && lli->lli_ioepoch != ioepoch) {
481 lli->lli_ioepoch = ioepoch;
482 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
483 ioepoch, PFID(&lli->lli_fid));
/* Populate an obd_client_handle from the open reply carried in @it:
 * server file handle, fid, (potential) lease lock handle, magic and
 * open flags; then register the open for replay on recovery.
 * Returns the result of md_set_open_replay_data(). */
487 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
488 struct obd_client_handle *och)
490 struct ptlrpc_request *req = it->d.lustre.it_data;
491 struct mdt_body *body;
493 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
494 och->och_fh = body->handle;
495 och->och_fid = body->fid1;
496 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
497 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
498 och->och_flags = it->it_flags;
500 return md_set_open_replay_data(md_exp, och, it);
/* Finish an open locally: if @och is given, fill it from the intent
 * reply and open the returned IO epoch on the inode; then attach @fd
 * as the file's private data, initialize readahead state, and record
 * the fd's effective open mode. */
503 int ll_local_open(struct file *file, struct lookup_intent *it,
504 struct ll_file_data *fd, struct obd_client_handle *och)
506 struct inode *inode = file->f_dentry->d_inode;
507 struct ll_inode_info *lli = ll_i2info(inode);
510 LASSERT(!LUSTRE_FPRIVATE(file));
515 struct ptlrpc_request *req = it->d.lustre.it_data;
516 struct mdt_body *body;
519 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
523 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
524 ll_ioepoch_open(lli, body->ioepoch);
527 LUSTRE_FPRIVATE(file) = fd;
528 ll_readahead_init(inode, &fd->fd_ras);
529 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
534 /* Open a file, and (for the very first open) create objects on the OSTs at
535 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
536 * creation or open until ll_lov_setstripe() ioctl is called.
538 * If we already have the stripe MD locally then we don't request it in
539 * md_open(), by passing a lmm_size = 0.
541 * It is up to the application to ensure no other processes open this file
542 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
543 * used. We might be able to avoid races of that sort by getting lli_open_sem
544 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
545 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/* VFS ->open() entry point. Reuses a cached per-mode MDS open handle
 * when one exists; otherwise performs an intent open via
 * ll_intent_file_open() and caches the new handle. Directory opens may
 * start the statahead machinery; the root dentry short-circuits. */
547 int ll_file_open(struct inode *inode, struct file *file)
549 struct ll_inode_info *lli = ll_i2info(inode);
550 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
551 .it_flags = file->f_flags };
552 struct obd_client_handle **och_p = NULL;
553 __u64 *och_usecount = NULL;
554 struct ll_file_data *fd;
555 int rc = 0, opendir_set = 0;
558 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
559 PFID(ll_inode2fid(inode)), inode, file->f_flags);
561 it = file->private_data; /* XXX: compat macro */
562 file->private_data = NULL; /* prevent ll_local_open assertion */
564 fd = ll_file_data_get();
566 GOTO(out_openerr, rc = -ENOMEM);
/* First opener of a directory claims the statahead key. */
569 if (S_ISDIR(inode->i_mode)) {
570 spin_lock(&lli->lli_sa_lock);
571 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
572 lli->lli_opendir_pid == 0) {
573 lli->lli_opendir_key = fd;
574 lli->lli_opendir_pid = current_pid();
577 spin_unlock(&lli->lli_sa_lock);
580 if (inode->i_sb->s_root == file->f_dentry) {
581 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup: build one from f_flags. */
585 if (!it || !it->d.lustre.it_disposition) {
586 /* Convert f_flags into access mode. We cannot use file->f_mode,
587 * because everything but O_ACCMODE mask was stripped from
589 if ((oit.it_flags + 1) & O_ACCMODE)
591 if (file->f_flags & O_TRUNC)
592 oit.it_flags |= FMODE_WRITE;
594 /* kernel only call f_op->open in dentry_open. filp_open calls
595 * dentry_open after call to open_namei that checks permissions.
596 * Only nfsd_open call dentry_open directly without checking
597 * permissions and because of that this code below is safe. */
598 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
599 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
601 /* We do not want O_EXCL here, presumably we opened the file
602 * already? XXX - NFS implications? */
603 oit.it_flags &= ~O_EXCL;
605 /* bug20584, if "it_flags" contains O_CREAT, the file will be
606 * created if necessary, then "IT_CREAT" should be set to keep
607 * consistent with it */
608 if (oit.it_flags & O_CREAT)
609 oit.it_op |= IT_CREAT;
615 /* Let's see if we have file open on MDS already. */
616 if (it->it_flags & FMODE_WRITE) {
617 och_p = &lli->lli_mds_write_och;
618 och_usecount = &lli->lli_open_fd_write_count;
619 } else if (it->it_flags & FMODE_EXEC) {
620 och_p = &lli->lli_mds_exec_och;
621 och_usecount = &lli->lli_open_fd_exec_count;
623 och_p = &lli->lli_mds_read_och;
624 och_usecount = &lli->lli_open_fd_read_count;
627 mutex_lock(&lli->lli_och_mutex);
628 if (*och_p) { /* Open handle is present */
629 if (it_disposition(it, DISP_OPEN_OPEN)) {
630 /* Well, there's extra open request that we do not need,
631 let's close it somehow. This will decref request. */
632 rc = it_open_error(DISP_OPEN_OPEN, it);
634 mutex_unlock(&lli->lli_och_mutex);
635 GOTO(out_openerr, rc);
638 ll_release_openhandle(file->f_dentry, it);
642 rc = ll_local_open(file, it, fd, NULL);
645 mutex_unlock(&lli->lli_och_mutex);
646 GOTO(out_openerr, rc);
649 LASSERT(*och_usecount == 0);
650 if (!it->d.lustre.it_disposition) {
651 /* We cannot just request lock handle now, new ELC code
652 means that one of other OPEN locks for this file
653 could be cancelled, and since blocking ast handler
654 would attempt to grab och_mutex as well, that would
655 result in a deadlock */
656 mutex_unlock(&lli->lli_och_mutex);
657 it->it_create_mode |= M_CHECK_STALE;
658 rc = ll_intent_file_open(file, NULL, 0, it);
659 it->it_create_mode &= ~M_CHECK_STALE;
661 GOTO(out_openerr, rc);
665 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
667 GOTO(out_och_free, rc = -ENOMEM);
671 /* md_intent_lock() didn't get a request ref if there was an
672 * open error, so don't do cleanup on the request here
674 /* XXX (green): Should not we bail out on any error here, not
675 * just open error? */
676 rc = it_open_error(DISP_OPEN_OPEN, it);
678 GOTO(out_och_free, rc);
680 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
681 "inode %p: disposition %x, status %d\n", inode,
682 it_disposition(it, ~0), it->d.lustre.it_status);
684 rc = ll_local_open(file, it, fd, *och_p);
686 GOTO(out_och_free, rc);
688 mutex_unlock(&lli->lli_och_mutex);
691 /* Must do this outside lli_och_mutex lock to prevent deadlock where
692 different kind of OPEN lock for this same inode gets cancelled
693 by ldlm_cancel_lru */
694 if (!S_ISREG(inode->i_mode))
695 GOTO(out_och_free, rc);
/* Delayed-create (O_LOV_DELAY_CREATE) or read-only open of a file
 * without striping: postpone OST object creation. */
699 if (!lli->lli_has_smd &&
700 (cl_is_lov_delay_create(file->f_flags) ||
701 (file->f_mode & FMODE_WRITE) == 0)) {
702 CDEBUG(D_INODE, "object creation was delayed\n");
703 GOTO(out_och_free, rc);
705 cl_lov_delay_create_clear(&file->f_flags);
706 GOTO(out_och_free, rc);
/* Error unwind: free a half-initialized cached handle, stop any
 * statahead this open started, and release the ll_file_data. */
710 if (och_p && *och_p) {
711 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
712 *och_p = NULL; /* OBD_FREE writes some magic there */
715 mutex_unlock(&lli->lli_och_mutex);
718 if (opendir_set != 0)
719 ll_stop_statahead(inode, lli->lli_opendir_key);
721 ll_file_data_put(fd);
723 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
726 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
727 ptlrpc_req_finished(it->d.lustre.it_data);
728 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* Blocking AST for a lease lock: on a blocking callback, cancel the
 * lease lock asynchronously (breaking the lease). Unlike
 * ll_md_blocking_ast, this handler does not manage the openhandle --
 * see the LDLM_FL_EXCL note in ll_lease_open(). */
734 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
735 struct ldlm_lock_desc *desc, void *data, int flag)
738 struct lustre_handle lockh;
742 case LDLM_CB_BLOCKING:
743 ldlm_lock2handle(lock, &lockh);
744 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
746 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
750 case LDLM_CB_CANCELING:
758 * Acquire a lease and open the file.
/* Acquire an MDS_OPEN_LEASE open on @inode for @fmode (FMODE_READ or
 * FMODE_WRITE only). If @file is given, its existing openhandle is
 * passed as op_handle so the MDT treats this as the same owner. The
 * lease lock is taken with LDLM_FL_NO_LRU | LDLM_FL_EXCL so it is
 * neither LRU-cancelled nor matched by normal opens. Returns the new
 * obd_client_handle or an ERR_PTR. */
760 struct obd_client_handle *ll_lease_open(struct inode *inode, struct file *file,
761 fmode_t fmode, __u64 open_flags)
763 struct lookup_intent it = { .it_op = IT_OPEN };
764 struct ll_sb_info *sbi = ll_i2sbi(inode);
765 struct md_op_data *op_data;
766 struct ptlrpc_request *req;
767 struct lustre_handle old_handle = { 0 };
768 struct obd_client_handle *och = NULL;
773 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
774 RETURN(ERR_PTR(-EINVAL));
777 struct ll_inode_info *lli = ll_i2info(inode);
778 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
779 struct obd_client_handle **och_p;
/* Requested mode must be a subset of the file's open mode. */
782 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
783 RETURN(ERR_PTR(-EPERM));
785 /* Get the openhandle of the file */
787 mutex_lock(&lli->lli_och_mutex);
788 if (fd->fd_lease_och != NULL) {
789 mutex_unlock(&lli->lli_och_mutex);
/* Steal the cached per-mode handle when this fd is its only user.
 * NOTE(review): the transfer lines between L456 and L457 are elided
 * in this excerpt -- confirm in full source. */
793 if (fd->fd_och == NULL) {
794 if (file->f_mode & FMODE_WRITE) {
795 LASSERT(lli->lli_mds_write_och != NULL);
796 och_p = &lli->lli_mds_write_och;
797 och_usecount = &lli->lli_open_fd_write_count;
799 LASSERT(lli->lli_mds_read_och != NULL);
800 och_p = &lli->lli_mds_read_och;
801 och_usecount = &lli->lli_open_fd_read_count;
803 if (*och_usecount == 1) {
810 mutex_unlock(&lli->lli_och_mutex);
811 if (rc < 0) /* more than 1 opener */
814 LASSERT(fd->fd_och != NULL);
815 old_handle = fd->fd_och->och_fh;
820 RETURN(ERR_PTR(-ENOMEM));
822 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
823 LUSTRE_OPC_ANY, NULL);
825 GOTO(out, rc = PTR_ERR(op_data));
827 /* To tell the MDT this openhandle is from the same owner */
828 op_data->op_handle = old_handle;
830 it.it_flags = fmode | open_flags;
831 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
832 rc = md_intent_lock(sbi->ll_md_exp, op_data, NULL, 0, &it, 0, &req,
833 ll_md_blocking_lease_ast,
834 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
835 * it can be cancelled which may mislead applications that the lease is
837 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
838 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
839 * doesn't deal with openhandle, so normal openhandle will be leaked. */
840 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
841 ll_finish_md_op_data(op_data);
842 ptlrpc_req_finished(req);
844 GOTO(out_release_it, rc);
846 if (it_disposition(&it, DISP_LOOKUP_NEG))
847 GOTO(out_release_it, rc = -ENOENT);
849 rc = it_open_error(DISP_OPEN_OPEN, &it);
851 GOTO(out_release_it, rc);
853 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
854 ll_och_fill(sbi->ll_md_exp, &it, och);
856 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
857 GOTO(out_close, rc = -EOPNOTSUPP);
859 /* already get lease, handle lease lock */
860 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
861 if (it.d.lustre.it_lock_mode == 0 ||
862 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
863 /* open lock must return for lease */
864 CERROR(DFID "lease granted but no open lock, %d/%Lu.\n",
865 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
866 it.d.lustre.it_lock_bits);
867 GOTO(out_close, rc = -EPROTO);
870 ll_intent_release(&it);
/* Error unwind: drop the open lock, close the openhandle, release
 * the intent, and hand back ERR_PTR(rc). */
874 /* Cancel open lock */
875 if (it.d.lustre.it_lock_mode != 0) {
876 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
877 it.d.lustre.it_lock_mode);
878 it.d.lustre.it_lock_mode = 0;
879 och->och_lease_handle.cookie = 0ULL;
881 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
883 CERROR("%s: error closing file "DFID": %d\n",
884 ll_get_fsname(inode->i_sb, NULL, 0),
885 PFID(&ll_i2info(inode)->lli_fid), rc2);
886 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
888 ll_intent_release(&it);
894 EXPORT_SYMBOL(ll_lease_open);
897 * Release lease and close the file.
898 * It will check if the lease has ever broken.
/* Release a lease obtained via ll_lease_open(): determine whether the
 * lease lock was already cancelled (lease broken), cancel it otherwise,
 * report the broken state through @lease_broken, and close the
 * openhandle on the MDS. */
900 int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
903 struct ldlm_lock *lock;
904 bool cancelled = true;
908 lock = ldlm_handle2lock(&och->och_lease_handle);
910 lock_res_and_lock(lock);
911 cancelled = ldlm_is_cancel(lock);
912 unlock_res_and_lock(lock);
916 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
917 PFID(&ll_i2info(inode)->lli_fid), cancelled);
920 ldlm_cli_cancel(&och->och_lease_handle, 0);
921 if (lease_broken != NULL)
922 *lease_broken = cancelled;
924 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
928 EXPORT_SYMBOL(ll_lease_close);
930 /* Fills the obdo with the attributes for the lsm */
/* Perform an async getattr against the OSTs for striped file @lsm and
 * wait for completion, filling @obdo. @dv_flags LL_DV_RD_FLUSH /
 * LL_DV_WR_FLUSH request a server-side lock (and flush for writes) so
 * the returned data version is stable. On success, o_valid is masked
 * down to the attributes actually obtained from the OSTs. */
931 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
932 struct obd_capa *capa, struct obdo *obdo,
933 __u64 ioepoch, int dv_flags)
935 struct ptlrpc_request_set *set;
936 struct obd_info oinfo = { { { 0 } } };
941 LASSERT(lsm != NULL);
945 oinfo.oi_oa->o_oi = lsm->lsm_oi;
946 oinfo.oi_oa->o_mode = S_IFREG;
947 oinfo.oi_oa->o_ioepoch = ioepoch;
948 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
949 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
950 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
951 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
952 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
953 OBD_MD_FLDATAVERSION;
954 oinfo.oi_capa = capa;
955 if (dv_flags & (LL_DV_WR_FLUSH | LL_DV_RD_FLUSH)) {
956 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
957 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
958 if (dv_flags & LL_DV_WR_FLUSH)
959 oinfo.oi_oa->o_flags |= OBD_FL_FLUSH;
962 set = ptlrpc_prep_set();
964 CERROR("can't allocate ptlrpc set\n");
967 rc = obd_getattr_async(exp, &oinfo, set);
969 rc = ptlrpc_set_wait(set);
970 ptlrpc_set_destroy(set);
973 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
974 OBD_MD_FLATIME | OBD_MD_FLMTIME |
975 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
976 OBD_MD_FLDATAVERSION | OBD_MD_FLFLAGS);
/* A WR_FLUSH request that the server did not acknowledge with
 * OBD_FL_FLUSH means the flush did not happen -- error path
 * (elided in this excerpt) follows. */
977 if (dv_flags & LL_DV_WR_FLUSH &&
978 !(oinfo.oi_oa->o_valid & OBD_MD_FLFLAGS &&
979 oinfo.oi_oa->o_flags & OBD_FL_FLUSH))
986 * Performs the getattr on the inode and updates its fields.
987 * If @sync != 0, perform the getattr under the server-side lock.
/* Fetch OST attributes for @inode via ll_lsm_getattr() and refresh the
 * inode fields from the returned obdo. Takes (and must release) the
 * MDS capability and the inode's lsm reference. */
989 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
990 __u64 ioepoch, int sync)
992 struct obd_capa *capa = ll_mdscapa_get(inode);
993 struct lov_stripe_md *lsm;
997 lsm = ccc_inode_lsm_get(inode);
998 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
999 capa, obdo, ioepoch, sync ? LL_DV_RD_FLUSH : 0);
1002 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
1004 obdo_refresh_inode(inode, obdo, obdo->o_valid);
1005 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
1006 " blksize %lu\n", POSTID(oi), i_size_read(inode),
1007 (unsigned long long)inode->i_blocks,
1008 (unsigned long)ll_inode_blksize(inode));
1010 ccc_inode_lsm_put(inode, lsm);
/* Merge the MDS-provided timestamps cached in lli_lvb with the
 * attributes currently held by the cl_object (from the OSTs), keeping
 * the most recent of each timestamp, and update the inode's size and
 * block count under the inode size lock. */
1014 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
1016 struct ll_inode_info *lli = ll_i2info(inode);
1017 struct cl_object *obj = lli->lli_clob;
1018 struct cl_attr *attr = ccc_env_thread_attr(env);
1024 ll_inode_size_lock(inode);
1025 /* merge timestamps the most recently obtained from mds with
1026 timestamps obtained from osts */
1027 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
1028 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
1029 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
1030 inode_init_lvb(inode, &lvb);
1032 cl_object_attr_lock(obj);
1033 rc = cl_object_attr_get(env, obj, attr);
1034 cl_object_attr_unlock(obj);
/* Keep the newer of MDS vs OST timestamps. */
1037 if (lvb.lvb_atime < attr->cat_atime)
1038 lvb.lvb_atime = attr->cat_atime;
1039 if (lvb.lvb_ctime < attr->cat_ctime)
1040 lvb.lvb_ctime = attr->cat_ctime;
1041 if (lvb.lvb_mtime < attr->cat_mtime)
1042 lvb.lvb_mtime = attr->cat_mtime;
1044 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
1045 PFID(&lli->lli_fid), attr->cat_size);
1046 cl_isize_write_nolock(inode, attr->cat_size);
1048 inode->i_blocks = attr->cat_blocks;
1050 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1051 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1052 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1054 ll_inode_size_unlock(inode);
/* Glimpse helper for ioctl callers: getattr the striped object via the
 * data export and copy size/blocks/timestamps into the caller's stat
 * structure. */
1059 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1062 struct obdo obdo = { 0 };
1065 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1067 st->st_size = obdo.o_size;
1068 st->st_blocks = obdo.o_blocks;
1069 st->st_mtime = obdo.o_mtime;
1070 st->st_atime = obdo.o_atime;
1071 st->st_ctime = obdo.o_ctime;
/* Decide whether reads through @file should skip atime updates,
 * checking the same conditions as the kernel's file_accessed() /
 * touch_atime(): O_NOATIME, S_NOATIME, IS_NOATIME(), mount flags
 * (noatime/read-only, nodiratime for dirs) and MS_NODIRATIME. */
1076 static bool file_is_noatime(const struct file *file)
1078 const struct vfsmount *mnt = file->f_path.mnt;
1079 const struct inode *inode = file->f_path.dentry->d_inode;
1081 /* Adapted from file_accessed() and touch_atime().*/
1082 if (file->f_flags & O_NOATIME)
1085 if (inode->i_flags & S_NOATIME)
1088 if (IS_NOATIME(inode))
1091 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1094 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1097 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/* Initialize a cl_io for a read (@write == 0) or write (@write != 0)
 * on @file: propagate O_NONBLOCK/O_APPEND/O_SYNC/O_DIRECT, choose the
 * DLM locking policy (never for nolock files, mandatory for append,
 * maybe otherwise) and the noatime decision. */
1103 void ll_io_init(struct cl_io *io, const struct file *file, int write)
1105 struct inode *inode = file->f_dentry->d_inode;
1107 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1109 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1110 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1111 file->f_flags & O_DIRECT ||
1114 io->ci_obj = ll_i2info(inode)->lli_clob;
1115 io->ci_lockreq = CILR_MAYBE;
1116 if (ll_file_nolock(file)) {
1117 io->ci_lockreq = CILR_NEVER;
1118 io->ci_no_srvlock = 1;
1119 } else if (file->f_flags & O_APPEND) {
1120 io->ci_lockreq = CILR_MANDATORY;
1123 io->ci_noatime = file_is_noatime(file);
/* Common read/write engine for all IO entry points (normal iov,
 * sendfile, splice). Sets up the cl_io, takes lli_write_mutex for
 * non-grouplock writes and lli_trunc_sem for normal IO, runs
 * cl_io_loop(), and converts the result into a byte count. Restarts
 * the IO when ci_need_restart is set and nothing was transferred, and
 * tallies read/write statistics and fd_write_failed. */
1127 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1128 struct file *file, enum cl_io_type iot,
1129 loff_t *ppos, size_t count)
1131 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
1132 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1137 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zd\n",
1138 file->f_dentry->d_name.name, iot, *ppos, count);
1141 io = ccc_env_thread_io(env);
1142 ll_io_init(io, file, iot == CIT_WRITE);
1144 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1145 struct vvp_io *vio = vvp_env_io(env);
1146 struct ccc_io *cio = ccc_env_io(env);
1147 int write_mutex_locked = 0;
1149 cio->cui_fd = LUSTRE_FPRIVATE(file);
1150 vio->cui_io_subtype = args->via_io_subtype;
1152 switch (vio->cui_io_subtype) {
1154 cio->cui_iov = args->u.normal.via_iov;
1155 cio->cui_nrsegs = args->u.normal.via_nrsegs;
1156 cio->cui_tot_nrsegs = cio->cui_nrsegs;
1157 cio->cui_iocb = args->u.normal.via_iocb;
/* Serialize writes unless the caller holds a group lock. */
1158 if ((iot == CIT_WRITE) &&
1159 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1160 if (mutex_lock_interruptible(&lli->
1162 GOTO(out, result = -ERESTARTSYS);
1163 write_mutex_locked = 1;
1165 down_read(&lli->lli_trunc_sem);
1168 vio->u.sendfile.cui_actor = args->u.sendfile.via_actor;
1169 vio->u.sendfile.cui_target = args->u.sendfile.via_target;
1172 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1173 vio->u.splice.cui_flags = args->u.splice.via_flags;
1176 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
1179 result = cl_io_loop(env, io);
1180 if (args->via_io_subtype == IO_NORMAL)
1181 up_read(&lli->lli_trunc_sem);
1182 if (write_mutex_locked)
1183 mutex_unlock(&lli->lli_write_mutex);
1185 /* cl_io_rw_init() handled IO */
1186 result = io->ci_result;
1189 if (io->ci_nob > 0) {
1190 result = io->ci_nob;
1191 *ppos = io->u.ci_wr.wr.crw_pos;
1195 cl_io_fini(env, io);
1196 /* If any bit been read/written (result != 0), we just return
1197 * short read/write instead of restart io. */
1198 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1199 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
1200 iot == CIT_READ ? "read" : "write",
1201 file->f_dentry->d_name.name, *ppos, count);
1202 LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
1206 if (iot == CIT_READ) {
1208 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1209 LPROC_LL_READ_BYTES, result);
1210 } else if (iot == CIT_WRITE) {
1212 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1213 LPROC_LL_WRITE_BYTES, result);
1214 fd->fd_write_failed = false;
1215 } else if (result != -ERESTARTSYS) {
1216 fd->fd_write_failed = true;
1219 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1226 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate a user-supplied iovec array before issuing IO.
 * Mirrors the kernel's __generic_file_aio_write_nolock() checks:
 * reject a negative segment length or a cumulative length that wraps
 * negative; verify each segment is readable user memory.  On the first
 * bad segment the total count is truncated to the bytes preceding it
 * (elided lines presumably update *nr_segs/*count accordingly — the
 * sampled view hides them).
 */
1228 static int ll_file_get_iov_count(const struct iovec *iov,
1229 unsigned long *nr_segs, size_t *count)
1234 for (seg = 0; seg < *nr_segs; seg++) {
1235 const struct iovec *iv = &iov[seg];
1238 * If any segment has a negative length, or the cumulative
1239 * length ever wraps negative then return -EINVAL.
1242 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1244 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1249 cnt -= iv->iov_len; /* This segment is no good */
/*
 * AIO read entry point (file_operations.aio_read).
 * Validates the iovec via ll_file_get_iov_count(), obtains a cl_env,
 * stages iov/nr_segs/iocb in the per-env vvp_io_args, and runs the
 * generic client IO path with CIT_READ; position comes from
 * iocb->ki_pos.  The env is always released before return.
 */
1256 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1257 unsigned long nr_segs, loff_t pos)
1260 struct vvp_io_args *args;
1266 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1270 env = cl_env_get(&refcheck);
1272 RETURN(PTR_ERR(env));
1274 args = vvp_env_args(env, IO_NORMAL);
1275 args->u.normal.via_iov = (struct iovec *)iov;
1276 args->u.normal.via_nrsegs = nr_segs;
1277 args->u.normal.via_iocb = iocb;
1279 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1280 &iocb->ki_pos, count);
1281 cl_env_put(env, &refcheck);
/*
 * Synchronous read(2) entry point.  Wraps the user buffer in a
 * single-segment iovec plus a sync kiocb (both kept in the per-env
 * vvp_env_info scratch area to avoid stack usage) and forwards to
 * ll_file_aio_read().  *ppos is updated from kiocb->ki_pos afterwards.
 */
1285 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1289 struct iovec *local_iov;
1290 struct kiocb *kiocb;
1295 env = cl_env_get(&refcheck);
1297 RETURN(PTR_ERR(env));
1299 local_iov = &vvp_env_info(env)->vti_local_iov;
1300 kiocb = &vvp_env_info(env)->vti_kiocb;
1301 local_iov->iov_base = (void __user *)buf;
1302 local_iov->iov_len = count;
1303 init_sync_kiocb(kiocb, file);
1304 kiocb->ki_pos = *ppos;
/* ki_left was renamed ki_nbytes upstream; pick the field this kernel has */
1305 #ifdef HAVE_KIOCB_KI_LEFT
1306 kiocb->ki_left = count;
1308 kiocb->ki_nbytes = count;
1311 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1312 *ppos = kiocb->ki_pos;
1314 cl_env_put(env, &refcheck);
1319 * Write to a file (through the page cache).
/*
 * AIO write entry point (file_operations.aio_write); mirror image of
 * ll_file_aio_read() but drives the generic IO path with CIT_WRITE.
 */
1322 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1323 unsigned long nr_segs, loff_t pos)
1326 struct vvp_io_args *args;
1332 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1336 env = cl_env_get(&refcheck);
1338 RETURN(PTR_ERR(env));
1340 args = vvp_env_args(env, IO_NORMAL);
1341 args->u.normal.via_iov = (struct iovec *)iov;
1342 args->u.normal.via_nrsegs = nr_segs;
1343 args->u.normal.via_iocb = iocb;
1345 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1346 &iocb->ki_pos, count);
1347 cl_env_put(env, &refcheck);
/*
 * Synchronous write(2) entry point; same single-iovec/sync-kiocb
 * wrapping as ll_file_read() but forwarding to ll_file_aio_write().
 */
1351 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1355 struct iovec *local_iov;
1356 struct kiocb *kiocb;
1361 env = cl_env_get(&refcheck);
1363 RETURN(PTR_ERR(env));
1365 local_iov = &vvp_env_info(env)->vti_local_iov;
1366 kiocb = &vvp_env_info(env)->vti_kiocb;
1367 local_iov->iov_base = (void __user *)buf;
1368 local_iov->iov_len = count;
1369 init_sync_kiocb(kiocb, file);
1370 kiocb->ki_pos = *ppos;
/* ki_left was renamed ki_nbytes upstream; pick the field this kernel has */
1371 #ifdef HAVE_KIOCB_KI_LEFT
1372 kiocb->ki_left = count;
1374 kiocb->ki_nbytes = count;
1377 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1378 *ppos = kiocb->ki_pos;
1380 cl_env_put(env, &refcheck);
1385 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read entry point: route pagecache data into a pipe by
 * running the generic client IO path with an IO_SPLICE args subtype
 * (the pipe and splice flags travel in vvp_io_args.u.splice).
 */
1387 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1388 struct pipe_inode_info *pipe, size_t count,
1392 struct vvp_io_args *args;
1397 env = cl_env_get(&refcheck);
1399 RETURN(PTR_ERR(env));
1401 args = vvp_env_args(env, IO_SPLICE);
1402 args->u.splice.via_pipe = pipe;
1403 args->u.splice.via_flags = flags;
1405 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1406 cl_env_put(env, &refcheck);
/*
 * Recreate a lost/damaged OST object for this inode on OST index
 * @ost_idx.  Works on a private copy (lsm2) of the inode's stripe
 * metadata and asks the data export to re-run obd_create() with
 * OBD_FL_RECREATE_OBJS set, under the inode size lock.  Requires an
 * inode that actually has objects; -ENOENT otherwise.
 */
1410 static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
1413 struct obd_export *exp = ll_i2dtexp(inode);
1414 struct obd_trans_info oti = { 0 };
1415 struct obdo *oa = NULL;
1418 struct lov_stripe_md *lsm = NULL, *lsm2;
1425 lsm = ccc_inode_lsm_get(inode);
1426 if (!lsm_has_objects(lsm))
1427 GOTO(out, rc = -ENOENT);
1429 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1430 (lsm->lsm_stripe_count));
1432 OBD_ALLOC_LARGE(lsm2, lsm_size);
1434 GOTO(out, rc = -ENOMEM);
/* o_nlink carries the target OST index for the recreate request */
1437 oa->o_nlink = ost_idx;
1438 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1439 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1440 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1441 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1442 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1443 memcpy(lsm2, lsm, lsm_size);
1444 ll_inode_size_lock(inode);
1445 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1446 ll_inode_size_unlock(inode);
1448 OBD_FREE_LARGE(lsm2, lsm_size);
1451 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_RECREATE_OBJ ioctl handler: copy the ll_recreate_obj request
 * from user space, build an MDT0-sequence ost_id from lrc_id and
 * delegate to ll_lov_recreate().  Admin capability required.
 */
1456 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1458 struct ll_recreate_obj ucreat;
1462 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1465 if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1469 ostid_set_seq_mdt0(&oi);
1470 ostid_set_id(&oi, ucreat.lrc_id);
1471 RETURN(ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx));
/*
 * LL_IOC_RECREATE_FID ioctl handler: copy a lu_fid from user space,
 * convert it to an ost_id, and derive the OST index from bits 16..31
 * of the FID sequence before delegating to ll_lov_recreate().
 * Admin capability required.
 */
1474 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1481 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1484 if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1487 fid_to_ostid(&fid, &oi);
1488 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1489 RETURN(ll_lov_recreate(inode, &oi, ost_idx));
/*
 * Apply a user-provided striping EA (lov_user_md) to @inode by
 * replaying an IT_OPEN intent with the layout attached.  Fails with
 * -EEXIST if the file already has a layout.  On success the transient
 * open handle created by the intent is released again; the intent and
 * any request are cleaned up on all paths, and the delayed-create flag
 * is cleared from the file.
 */
1492 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1493 __u64 flags, struct lov_user_md *lum,
1496 struct lov_stripe_md *lsm = NULL;
1497 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1501 lsm = ccc_inode_lsm_get(inode);
1503 ccc_inode_lsm_put(inode, lsm);
1504 CDEBUG(D_IOCTL, "stripe already exists for inode "DFID"\n",
1505 PFID(ll_inode2fid(inode)));
1506 GOTO(out, rc = -EEXIST);
1509 ll_inode_size_lock(inode);
1510 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1512 GOTO(out_unlock, rc);
1513 rc = oit.d.lustre.it_status;
1515 GOTO(out_req_free, rc);
/* drop the open handle the setstripe intent just created on the MDS */
1517 ll_release_openhandle(file->f_dentry, &oit);
1520 ll_inode_size_unlock(inode);
1521 ll_intent_release(&oit);
1522 ccc_inode_lsm_put(inode, lsm);
1524 cl_lov_delay_create_clear(&file->f_flags);
1527 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA for @filename (child of @inode) from the MDS via
 * md_getattr_name().  On success *lmmp points into the reply buffer
 * (caller keeps *request alive and finishes it when done) and
 * *lmm_size is set.  Only LOV_MAGIC_V1/V3 layouts are accepted; the
 * wire data is little-endian and is swabbed to host order on
 * big-endian machines.
 */
1531 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1532 struct lov_mds_md **lmmp, int *lmm_size,
1533 struct ptlrpc_request **request)
1535 struct ll_sb_info *sbi = ll_i2sbi(inode);
1536 struct mdt_body *body;
1537 struct lov_mds_md *lmm = NULL;
1538 struct ptlrpc_request *req = NULL;
1539 struct md_op_data *op_data;
1542 rc = ll_get_max_mdsize(sbi, &lmmsize);
1546 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1547 strlen(filename), lmmsize,
1548 LUSTRE_OPC_ANY, NULL);
1549 if (IS_ERR(op_data))
1550 RETURN(PTR_ERR(op_data));
1552 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1553 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1554 ll_finish_md_op_data(op_data);
1556 CDEBUG(D_INFO, "md_getattr_name failed "
1557 "on %s: rc %d\n", filename, rc);
1561 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1562 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1564 lmmsize = body->eadatasize;
1566 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1568 GOTO(out, rc = -ENODATA);
1571 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1572 LASSERT(lmm != NULL);
1574 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1575 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1576 GOTO(out, rc = -EPROTO);
1580 * This is coming from the MDS, so is probably in
1581 * little endian. We convert it to host endian before
1582 * passing it to userspace.
/* true only on big-endian hosts: wire (LE) differs from host order */
1584 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1587 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1588 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
/* When called on a directory the default-striping EA carries no
 * object array, so only swab lmm_objects for regular files. */
1593 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1594 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1595 if (S_ISREG(body->mode))
1596 lustre_swab_lov_user_md_objects(
1597 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1599 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1600 lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1601 if (S_ISREG(body->mode))
1602 lustre_swab_lov_user_md_objects(
1603 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1610 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA ioctl handler: copy a lov_user_md (with one
 * lov_user_ost_data entry) from user space and apply it via
 * ll_lov_setstripe_ea_info() with MDS_OPEN_HAS_OBJS, i.e. the caller
 * supplies already-existing objects.  Admin capability required.
 */
1615 static int ll_lov_setea(struct inode *inode, struct file *file,
1618 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1619 struct lov_user_md *lump;
1620 int lum_size = sizeof(struct lov_user_md) +
1621 sizeof(struct lov_user_ost_data);
1625 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1628 OBD_ALLOC_LARGE(lump, lum_size);
1632 if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1633 OBD_FREE_LARGE(lump, lum_size);
1637 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1639 OBD_FREE_LARGE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE ioctl handler.  Copies the user layout first as
 * the smaller v1 struct, then re-copies as v3 if the magic says so,
 * and applies it via ll_lov_setstripe_ea_info().  On success the
 * user's lmm_stripe_count is zeroed (it is informational only here),
 * the layout generation is refreshed, and the resulting stripe info is
 * copied back through the GETSTRIPE iocontrol path.
 */
1643 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1646 struct lov_user_md_v3 lumv3;
1647 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1648 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1649 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1651 __u64 flags = FMODE_WRITE;
1654 /* first try with v1 which is smaller than v3 */
1655 lum_size = sizeof(struct lov_user_md_v1);
1656 if (copy_from_user(lumv1, lumv1p, lum_size))
1659 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1660 lum_size = sizeof(struct lov_user_md_v3);
1661 if (copy_from_user(&lumv3, lumv3p, lum_size))
1665 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1667 struct lov_stripe_md *lsm;
1670 put_user(0, &lumv1p->lmm_stripe_count);
1672 ll_layout_refresh(inode, &gen);
1673 lsm = ccc_inode_lsm_get(inode);
1674 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1675 0, lsm, (void *)arg);
1676 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_LOV_GETSTRIPE ioctl handler: hand the inode's stripe
 * metadata to the data export's iocontrol, which copies the layout
 * out to the user buffer at @arg.
 */
1681 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1683 struct lov_stripe_md *lsm;
1687 lsm = ccc_inode_lsm_get(inode);
1689 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1691 ccc_inode_lsm_put(inode, lsm);
/*
 * Take a group lock (gid = @arg) on the whole file.  Only one group
 * lock per file descriptor: the fd's flags/grouplock fields are
 * checked and later published under lli_lock.  The actual cl-level
 * lock is acquired outside the spinlock (it may block), so a second
 * racing thread is detected afterwards and its lock dropped.
 * Not supported on no-lock files (-EOPNOTSUPP).
 */
1695 int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1697 struct ll_inode_info *lli = ll_i2info(inode);
1698 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1699 struct ccc_grouplock grouplock;
1703 if (ll_file_nolock(file))
1704 RETURN(-EOPNOTSUPP);
1706 spin_lock(&lli->lli_lock);
1707 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1708 CWARN("group lock already existed with gid %lu\n",
1709 fd->fd_grouplock.cg_gid);
1710 spin_unlock(&lli->lli_lock);
1713 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1714 spin_unlock(&lli->lli_lock);
/* may block; must not hold lli_lock across this call */
1716 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1717 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1721 spin_lock(&lli->lli_lock);
1722 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1723 spin_unlock(&lli->lli_lock);
1724 CERROR("another thread just won the race\n");
1725 cl_put_grouplock(&grouplock);
1729 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1730 fd->fd_grouplock = grouplock;
1731 spin_unlock(&lli->lli_lock);
1733 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * Release the group lock held on this file descriptor.  Verifies under
 * lli_lock that a group lock is held and that its gid matches @arg,
 * clears the fd state while still locked, then drops the cl-level
 * lock outside the spinlock.
 */
1737 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1739 struct ll_inode_info *lli = ll_i2info(inode);
1740 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1741 struct ccc_grouplock grouplock;
1744 spin_lock(&lli->lli_lock);
1745 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1746 spin_unlock(&lli->lli_lock);
1747 CWARN("no group lock held\n");
1750 LASSERT(fd->fd_grouplock.cg_lock != NULL);
1752 if (fd->fd_grouplock.cg_gid != arg) {
1753 CWARN("group lock %lu doesn't match current id %lu\n",
1754 arg, fd->fd_grouplock.cg_gid);
1755 spin_unlock(&lli->lli_lock);
/* take a local copy so the cl lock can be put after unlocking */
1759 grouplock = fd->fd_grouplock;
1760 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1761 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1762 spin_unlock(&lli->lli_lock);
1764 cl_put_grouplock(&grouplock);
1765 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1770 * Close inode open handle
1772 * \param dentry [in] dentry which contains the inode
1773 * \param it [in,out] intent which contains open info and result
1776 * \retval <0 failure
/*
 * Closes the MDS open handle left over from an intent-based open
 * (e.g. after setstripe).  No-op for the filesystem root or when the
 * intent carries no DISP_OPEN_OPEN disposition.  Builds a transient
 * obd_client_handle from the intent and closes it on the MDS, then
 * releases the extra request reference held by DISP_ENQ_OPEN_REF.
 */
1778 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1780 struct inode *inode = dentry->d_inode;
1781 struct obd_client_handle *och;
1787 /* Root ? Do nothing. */
1788 if (dentry->d_inode->i_sb->s_root == dentry)
1791 /* No open handle to close? Move away */
1792 if (!it_disposition(it, DISP_OPEN_OPEN))
1795 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1797 OBD_ALLOC(och, sizeof(*och));
1799 GOTO(out, rc = -ENOMEM);
1801 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1803 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1806 /* this one is in place of ll_file_open */
1807 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1808 ptlrpc_req_finished(it->d.lustre.it_data);
1809 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1815 * Get size for inode for which FIEMAP mapping is requested.
1816 * Make the FIEMAP get_info call and returns the result.
/*
 * Validate the user's fiemap flags (unsupported flags are echoed back
 * in fm_flags with an error), honour FIEMAP_FLAG_SYNC by flushing
 * dirty pages, and forward the request to the data export via
 * obd_get_info(KEY_FIEMAP).  Striped files (stripe_count > 1) require
 * the caller to understand FIEMAP_FLAG_DEVICE_ORDER; a zero-size file
 * short-circuits with zero mapped extents.
 */
1818 int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1821 struct obd_export *exp = ll_i2dtexp(inode);
1822 struct lov_stripe_md *lsm = NULL;
1823 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1824 int vallen = num_bytes;
1828 /* Checks for fiemap flags */
1829 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1830 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1834 /* Check for FIEMAP_FLAG_SYNC */
1835 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1836 rc = filemap_fdatawrite(inode->i_mapping);
1841 lsm = ccc_inode_lsm_get(inode);
1845 /* If the stripe_count > 1 and the application does not understand
1846 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1848 if (lsm->lsm_stripe_count > 1 &&
1849 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1850 GOTO(out, rc = -EOPNOTSUPP);
1852 fm_key.oa.o_oi = lsm->lsm_oi;
1853 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1855 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1856 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1857 /* If filesize is 0, then there would be no objects for mapping */
1858 if (fm_key.oa.o_size == 0) {
1859 fiemap->fm_mapped_extents = 0;
1863 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1865 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1868 CERROR("obd_get_info failed: rc = %d\n", rc);
1871 ccc_inode_lsm_put(inode, lsm);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a path on the MDS.
 * Copies a getinfo_fid2path header from user space to learn the
 * path buffer length, allocates an output buffer of that size, runs
 * the mdc iocontrol, and copies the whole result back.  Permitted for
 * root (DAC_READ_SEARCH) or when the mount allows user fid2path.
 */
1875 int ll_fid2path(struct inode *inode, void *arg)
1877 struct obd_export *exp = ll_i2mdexp(inode);
1878 struct getinfo_fid2path *gfout, *gfin;
1882 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1883 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1886 /* Need to get the buflen */
1887 OBD_ALLOC_PTR(gfin);
1890 if (copy_from_user(gfin, arg, sizeof(*gfin))) {
1895 outsize = sizeof(*gfout) + gfin->gf_pathlen;
1896 OBD_ALLOC(gfout, outsize);
1897 if (gfout == NULL) {
1901 memcpy(gfout, gfin, sizeof(*gfout));
1904 /* Call mdc_iocontrol */
1905 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1909 if (copy_to_user(arg, gfout, outsize))
1913 OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP handler.  Reads fm_extent_count first to size the
 * kernel-side fiemap buffer, copies in the request (plus the first
 * extent, which seeds end_offset/device for continued mappings), runs
 * ll_do_fiemap(), and copies back the header plus however many extents
 * were mapped.
 */
1917 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1919 struct ll_user_fiemap *fiemap_s;
1920 size_t num_bytes, ret_bytes;
1921 unsigned int extent_count;
1924 /* Get the extent count so we can calculate the size of
1925 * required fiemap buffer */
1926 if (get_user(extent_count,
1927 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1929 num_bytes = sizeof(*fiemap_s) + (extent_count *
1930 sizeof(struct ll_fiemap_extent));
1932 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1933 if (fiemap_s == NULL)
1936 /* get the fiemap value */
1937 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1939 GOTO(error, rc = -EFAULT);
1941 /* If fm_extent_count is non-zero, read the first extent since
1942 * it is used to calculate end_offset and device from previous
1945 if (copy_from_user(&fiemap_s->fm_extents[0],
1946 (char __user *)arg + sizeof(*fiemap_s),
1947 sizeof(struct ll_fiemap_extent)))
1948 GOTO(error, rc = -EFAULT);
1951 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1955 ret_bytes = sizeof(struct ll_user_fiemap);
1957 if (extent_count != 0)
1958 ret_bytes += (fiemap_s->fm_mapped_extents *
1959 sizeof(struct ll_fiemap_extent));
1961 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1965 OBD_FREE_LARGE(fiemap_s, num_bytes);
1970 * Read the data_version for inode.
1972 * This value is computed using stripe object version on OST.
1973 * Version is computed using server side locking.
1975 * @param sync if do sync on the OST side;
1977 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1978 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/*
 * A stripeless file reports version 0.  Otherwise the version is
 * obtained via ll_lsm_getattr() on the data export; -ENODATA-style
 * handling applies if the OST reply lacks OBD_MD_FLDATAVERSION
 * (exact fallback is in elided lines).
 */
1980 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
1982 struct lov_stripe_md *lsm = NULL;
1983 struct ll_sb_info *sbi = ll_i2sbi(inode);
1984 struct obdo *obdo = NULL;
1988 /* If no stripe, we consider version is 0. */
1989 lsm = ccc_inode_lsm_get(inode);
1990 if (!lsm_has_objects(lsm)) {
1992 CDEBUG(D_INODE, "No object for inode\n");
1996 OBD_ALLOC_PTR(obdo);
1998 GOTO(out, rc = -ENOMEM);
2000 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, flags);
2002 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
2005 *data_version = obdo->o_data_version;
2011 ccc_inode_lsm_put(inode, lsm);
2016 * Trigger a HSM release request for the provided inode.
/*
 * Sequence: take a write lease with MDS_OPEN_RELEASE, flush and grab
 * the latest data_version (LL_DV_WR_FLUSH drops cached pages under
 * LCK_PW), merge OST attributes into the inode, then close the open
 * handle — the lease lock handle itself is released inside
 * mdc_hsm_release_pack() since the MDT still needs it.  On error the
 * lease is closed explicitly.
 */
2018 int ll_hsm_release(struct inode *inode)
2020 struct cl_env_nest nest;
2022 struct obd_client_handle *och = NULL;
2023 __u64 data_version = 0;
2027 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2028 ll_get_fsname(inode->i_sb, NULL, 0),
2029 PFID(&ll_i2info(inode)->lli_fid));
2031 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2033 GOTO(out, rc = PTR_ERR(och));
2035 /* Grab latest data_version and [am]time values */
2036 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2040 env = cl_env_nested_get(&nest);
2042 GOTO(out, rc = PTR_ERR(env));
2044 ll_merge_lvb(env, inode);
2045 cl_env_nested_put(&nest, env);
2047 /* Release the file.
2048 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2049 * we still need it to pack l_remote_handle to MDT. */
2050 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
2056 if (och != NULL && !IS_ERR(och)) /* close the file */
2057 ll_lease_close(och, inode, NULL);
/*
 * Scratch state for ll_swap_layouts(): saved [am]times (ia1/ia2) for
 * restoration after the swap, the two inodes (kept in FID order), and
 * per-inode data-version check requests.  Elided lines presumably also
 * hold the dv1/dv2 values — TODO confirm against the full source.
 */
2062 struct ll_swap_stack {
2063 struct iattr ia1, ia2;
2065 struct inode *inode1, *inode2;
2066 bool check_dv1, check_dv2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS implementation: atomically exchange the
 * layouts of file1 and file2 on the MDT.
 *
 * Steps: validate (regular files, writable, same superblock), record
 * which data-versions to check, order the pair by FID to avoid
 * deadlock (swapping all per-file state alongside), optionally take
 * group locks on both files to flush dirty cache, verify the recorded
 * data versions have not changed (-EAGAIN if they have), then send
 * MDS_SWAP_LAYOUTS via obd_iocontrol with a mdc_swap_layouts payload
 * in op_data.  Finally restore mtime/atime if the caller asked for
 * them to be preserved.
 */
2069 static int ll_swap_layouts(struct file *file1, struct file *file2,
2070 struct lustre_swap_layouts *lsl)
2072 struct mdc_swap_layouts msl;
2073 struct md_op_data *op_data;
2076 struct ll_swap_stack *llss = NULL;
2079 OBD_ALLOC_PTR(llss);
2083 llss->inode1 = file1->f_dentry->d_inode;
2084 llss->inode2 = file2->f_dentry->d_inode;
2086 if (!S_ISREG(llss->inode2->i_mode))
2087 GOTO(free, rc = -EINVAL);
2089 if (inode_permission(llss->inode1, MAY_WRITE) ||
2090 inode_permission(llss->inode2, MAY_WRITE))
2091 GOTO(free, rc = -EPERM);
2093 if (llss->inode2->i_sb != llss->inode1->i_sb)
2094 GOTO(free, rc = -EXDEV);
2096 /* we use 2 bool because it is easier to swap than 2 bits */
2097 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2098 llss->check_dv1 = true;
2100 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2101 llss->check_dv2 = true;
2103 /* we cannot use lsl->sl_dvX directly because we may swap them */
2104 llss->dv1 = lsl->sl_dv1;
2105 llss->dv2 = lsl->sl_dv2;
2107 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2108 if (rc == 0) /* same file, done! */
/* lock in ascending FID order to avoid ABBA deadlock between racers */
2111 if (rc < 0) { /* sequentialize it */
2112 swap(llss->inode1, llss->inode2);
2114 swap(llss->dv1, llss->dv2);
2115 swap(llss->check_dv1, llss->check_dv2);
2119 if (gid != 0) { /* application asks to flush dirty cache */
2120 rc = ll_get_grouplock(llss->inode1, file1, gid);
2124 rc = ll_get_grouplock(llss->inode2, file2, gid);
2126 ll_put_grouplock(llss->inode1, file1, gid);
2131 /* to be able to restore mtime and atime after swap
2132 * we need to first save them */
2134 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2135 llss->ia1.ia_mtime = llss->inode1->i_mtime;
2136 llss->ia1.ia_atime = llss->inode1->i_atime;
2137 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2138 llss->ia2.ia_mtime = llss->inode2->i_mtime;
2139 llss->ia2.ia_atime = llss->inode2->i_atime;
2140 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2143 /* ultimate check, before swaping the layouts we check if
2144 * dataversion has changed (if requested) */
2145 if (llss->check_dv1) {
2146 rc = ll_data_version(llss->inode1, &dv, 0);
2149 if (dv != llss->dv1)
2150 GOTO(putgl, rc = -EAGAIN);
2153 if (llss->check_dv2) {
2154 rc = ll_data_version(llss->inode2, &dv, 0);
2157 if (dv != llss->dv2)
2158 GOTO(putgl, rc = -EAGAIN);
2161 /* struct md_op_data is used to send the swap args to the mdt
2162 * only flags is missing, so we use struct mdc_swap_layouts
2163 * through the md_op_data->op_data */
2164 /* flags from user space have to be converted before they are send to
2165 * server, no flag is sent today, they are only used on the client */
2168 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2169 0, LUSTRE_OPC_ANY, &msl);
2170 if (IS_ERR(op_data))
2171 GOTO(free, rc = PTR_ERR(op_data));
2173 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2174 sizeof(*op_data), op_data, NULL);
2175 ll_finish_md_op_data(op_data);
2179 ll_put_grouplock(llss->inode2, file2, gid);
2180 ll_put_grouplock(llss->inode1, file1, gid);
2183 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2187 /* clear useless flags */
2188 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2189 llss->ia1.ia_valid &= ~ATTR_MTIME;
2190 llss->ia2.ia_valid &= ~ATTR_MTIME;
2193 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2194 llss->ia1.ia_valid &= ~ATTR_ATIME;
2195 llss->ia2.ia_valid &= ~ATTR_ATIME;
2198 /* update time if requested */
/* note the cross-application: ia2 (inode2's saved times) goes to file1
 * and ia1 to file2, since the layouts (and data) were exchanged */
2200 if (llss->ia2.ia_valid != 0) {
2201 mutex_lock(&llss->inode1->i_mutex);
2202 rc = ll_setattr(file1->f_dentry, &llss->ia2);
2203 mutex_unlock(&llss->inode1->i_mutex);
2206 if (llss->ia1.ia_valid != 0) {
2209 mutex_lock(&llss->inode2->i_mutex);
2210 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2211 mutex_unlock(&llss->inode2->i_mutex);
/*
 * Set/clear HSM state flags on @inode via LL_IOC_HSM_STATE_SET sent to
 * the MDT.  Flags outside HSM_USER_MASK require admin capability; the
 * hsm_state_set payload rides in md_op_data.
 */
2223 static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2225 struct md_op_data *op_data;
2228 /* Non-root users are forbidden to set or clear flags which are
2229 * NOT defined in HSM_USER_MASK. */
2230 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2231 !cfs_capable(CFS_CAP_SYS_ADMIN))
2234 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2235 LUSTRE_OPC_ANY, hss);
2236 if (IS_ERR(op_data))
2237 RETURN(PTR_ERR(op_data));
2239 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2240 sizeof(*op_data), op_data, NULL);
2242 ll_finish_md_op_data(op_data);
/*
 * Import a file that exists only in the HSM archive: mark it
 * ARCHIVED|EXISTS|RELEASED via ll_hsm_state_set(), then force the
 * inode's mode/uid/gid/size/times to the archived values supplied in
 * @hui using ll_setattr_raw() with ATTR_FORCE.  Regular files only.
 */
2247 static int ll_hsm_import(struct inode *inode, struct file *file,
2248 struct hsm_user_import *hui)
2250 struct hsm_state_set *hss = NULL;
2251 struct iattr *attr = NULL;
2255 if (!S_ISREG(inode->i_mode))
2261 GOTO(out, rc = -ENOMEM);
2263 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2264 hss->hss_archive_id = hui->hui_archive_id;
2265 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2266 rc = ll_hsm_state_set(inode, hss);
2270 OBD_ALLOC_PTR(attr);
2272 GOTO(out, rc = -ENOMEM);
2274 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2275 attr->ia_mode |= S_IFREG;
2276 attr->ia_uid = hui->hui_uid;
2277 attr->ia_gid = hui->hui_gid;
2278 attr->ia_size = hui->hui_size;
2279 attr->ia_mtime.tv_sec = hui->hui_mtime;
2280 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2281 attr->ia_atime.tv_sec = hui->hui_atime;
2282 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2284 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2285 ATTR_UID | ATTR_GID |
2286 ATTR_MTIME | ATTR_MTIME_SET |
2287 ATTR_ATIME | ATTR_ATIME_SET;
2289 rc = ll_setattr_raw(file->f_dentry, attr, true);
/*
 * unlocked_ioctl entry point for regular files.  Dispatches Lustre
 * ioctls: per-fd flag manipulation, LOV stripe get/set/swap, object
 * recreation, fiemap, group locks, FID/path resolution, data-version,
 * MDT index query, HSM state/action/import, and file leases.
 * Unrecognized commands fall through to the registered ll_iocontrol
 * handlers and finally to the data export's obd_iocontrol.
 */
2303 long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2305 struct inode *inode = file->f_dentry->d_inode;
2306 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2310 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2311 PFID(ll_inode2fid(inode)), inode, cmd);
2312 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2314 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2315 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2319 case LL_IOC_GETFLAGS:
2320 /* Get the current value of the file flags */
2321 return put_user(fd->fd_flags, (int *)arg);
2322 case LL_IOC_SETFLAGS:
2323 case LL_IOC_CLRFLAGS:
2324 /* Set or clear specific file flags */
2325 /* XXX This probably needs checks to ensure the flags are
2326 * not abused, and to handle any flag side effects.
2328 if (get_user(flags, (int *) arg))
2331 if (cmd == LL_IOC_SETFLAGS) {
2332 if ((flags & LL_FILE_IGNORE_LOCK) &&
2333 !(file->f_flags & O_DIRECT)) {
2334 CERROR("%s: unable to disable locking on "
2335 "non-O_DIRECT file\n", current->comm);
2339 fd->fd_flags |= flags;
2341 fd->fd_flags &= ~flags;
2344 case LL_IOC_LOV_SETSTRIPE:
2345 RETURN(ll_lov_setstripe(inode, file, arg));
2346 case LL_IOC_LOV_SETEA:
2347 RETURN(ll_lov_setea(inode, file, arg));
2348 case LL_IOC_LOV_SWAP_LAYOUTS: {
2350 struct lustre_swap_layouts lsl;
2352 if (copy_from_user(&lsl, (char *)arg,
2353 sizeof(struct lustre_swap_layouts)))
2356 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2359 file2 = fget(lsl.sl_fd);
2364 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2365 rc = ll_swap_layouts(file, file2, &lsl);
2369 case LL_IOC_LOV_GETSTRIPE:
2370 RETURN(ll_lov_getstripe(inode, arg));
2371 case LL_IOC_RECREATE_OBJ:
2372 RETURN(ll_lov_recreate_obj(inode, arg));
2373 case LL_IOC_RECREATE_FID:
2374 RETURN(ll_lov_recreate_fid(inode, arg));
2375 case FSFILT_IOC_FIEMAP:
2376 RETURN(ll_ioctl_fiemap(inode, arg));
2377 case FSFILT_IOC_GETFLAGS:
2378 case FSFILT_IOC_SETFLAGS:
2379 RETURN(ll_iocontrol(inode, file, cmd, arg));
2380 case FSFILT_IOC_GETVERSION_OLD:
2381 case FSFILT_IOC_GETVERSION:
2382 RETURN(put_user(inode->i_generation, (int *)arg));
2383 case LL_IOC_GROUP_LOCK:
2384 RETURN(ll_get_grouplock(inode, file, arg));
2385 case LL_IOC_GROUP_UNLOCK:
2386 RETURN(ll_put_grouplock(inode, file, arg));
2387 case IOC_OBD_STATFS:
2388 RETURN(ll_obd_statfs(inode, (void *)arg));
2390 /* We need to special case any other ioctls we want to handle,
2391 * to send them to the MDS/OST as appropriate and to properly
2392 * network encode the arg field.
2393 case FSFILT_IOC_SETVERSION_OLD:
2394 case FSFILT_IOC_SETVERSION:
2396 case LL_IOC_FLUSHCTX:
2397 RETURN(ll_flush_ctx(inode));
2398 case LL_IOC_PATH2FID: {
2399 if (copy_to_user((void *)arg, ll_inode2fid(inode),
2400 sizeof(struct lu_fid)))
2405 case OBD_IOC_FID2PATH:
2406 RETURN(ll_fid2path(inode, (void *)arg));
2407 case LL_IOC_DATA_VERSION: {
2408 struct ioc_data_version idv;
2411 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
/* mask off anything but the two supported flush flags */
2414 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2415 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2417 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2423 case LL_IOC_GET_MDTIDX: {
2426 mdtidx = ll_get_mdt_idx(inode);
2430 if (put_user((int)mdtidx, (int*)arg))
2435 case OBD_IOC_GETDTNAME:
2436 case OBD_IOC_GETMDNAME:
2437 RETURN(ll_get_obd_name(inode, cmd, arg));
2438 case LL_IOC_HSM_STATE_GET: {
2439 struct md_op_data *op_data;
2440 struct hsm_user_state *hus;
2447 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2448 LUSTRE_OPC_ANY, hus);
2449 if (IS_ERR(op_data)) {
2451 RETURN(PTR_ERR(op_data));
2454 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2457 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2460 ll_finish_md_op_data(op_data);
2464 case LL_IOC_HSM_STATE_SET: {
2465 struct hsm_state_set *hss;
2472 if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2477 rc = ll_hsm_state_set(inode, hss);
2482 case LL_IOC_HSM_ACTION: {
2483 struct md_op_data *op_data;
2484 struct hsm_current_action *hca;
2491 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2492 LUSTRE_OPC_ANY, hca);
2493 if (IS_ERR(op_data)) {
2495 RETURN(PTR_ERR(op_data));
2498 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2501 if (copy_to_user((char *)arg, hca, sizeof(*hca)))
2504 ll_finish_md_op_data(op_data);
2508 case LL_IOC_SET_LEASE: {
2509 struct ll_inode_info *lli = ll_i2info(inode);
2510 struct obd_client_handle *och = NULL;
/* lease mode must be compatible with how the file was opened */
2516 if (!(file->f_mode & FMODE_WRITE))
2521 if (!(file->f_mode & FMODE_READ))
/* unlock request: detach any held lease under lli_och_mutex, then
 * close it and report whether it had already been broken */
2526 mutex_lock(&lli->lli_och_mutex);
2527 if (fd->fd_lease_och != NULL) {
2528 och = fd->fd_lease_och;
2529 fd->fd_lease_och = NULL;
2531 mutex_unlock(&lli->lli_och_mutex);
2534 mode = och->och_flags &(FMODE_READ|FMODE_WRITE);
2535 rc = ll_lease_close(och, inode, &lease_broken);
2536 if (rc == 0 && lease_broken)
2542 /* return the type of lease or error */
2543 RETURN(rc < 0 ? rc : (int)mode);
2548 CDEBUG(D_INODE, "Set lease with mode %d\n", mode);
2550 /* apply for lease */
2551 och = ll_lease_open(inode, file, mode, 0);
2553 RETURN(PTR_ERR(och));
2556 mutex_lock(&lli->lli_och_mutex);
2557 if (fd->fd_lease_och == NULL) {
2558 fd->fd_lease_och = och;
2561 mutex_unlock(&lli->lli_och_mutex);
2563 /* impossible now that only excl is supported for now */
2564 ll_lease_close(och, inode, &lease_broken);
2569 case LL_IOC_GET_LEASE: {
2570 struct ll_inode_info *lli = ll_i2info(inode);
2571 struct ldlm_lock *lock = NULL;
2574 mutex_lock(&lli->lli_och_mutex);
2575 if (fd->fd_lease_och != NULL) {
2576 struct obd_client_handle *och = fd->fd_lease_och;
2578 lock = ldlm_handle2lock(&och->och_lease_handle);
2580 lock_res_and_lock(lock);
2581 if (!ldlm_is_cancel(lock))
2582 rc = och->och_flags &
2583 (FMODE_READ | FMODE_WRITE);
2584 unlock_res_and_lock(lock);
2585 LDLM_LOCK_PUT(lock);
2588 mutex_unlock(&lli->lli_och_mutex);
2591 case LL_IOC_HSM_IMPORT: {
2592 struct hsm_user_import *hui;
2598 if (copy_from_user(hui, (void *)arg, sizeof(*hui))) {
2603 rc = ll_hsm_import(inode, file, hui);
/* default: try dynamically registered handlers, then the OSC */
2613 ll_iocontrol_call(inode, file, cmd, arg, &err))
2616 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
/* Compat for kernels without generic_file_llseek_size(): commit a seek
 * result, rejecting negative offsets (unless FMODE_UNSIGNED_OFFSET) and
 * offsets beyond maxsize, and resetting f_version on a position change. */
2622 #ifndef HAVE_FILE_LLSEEK_SIZE
2623 static inline loff_t
2624 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2626 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2628 if (offset > maxsize)
2631 if (offset != file->f_pos) {
2632 file->f_pos = offset;
2633 file->f_version = 0;
/*
 * Backport of the kernel's generic_file_llseek_size(): handles
 * SEEK_CUR position-querying without rewriting f_pos, serializes
 * SEEK_CUR updates under i_mutex, and implements SEEK_DATA/SEEK_HOLE
 * for the trivial case where the whole file is data with a virtual
 * hole at EOF.  NOTE(review): ll_file_seek() calls
 * ll_generic_file_llseek_size() — presumably a macro alias defined in
 * elided lines; confirm against the full source.
 */
2639 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2640 loff_t maxsize, loff_t eof)
2642 struct inode *inode = file->f_dentry->d_inode;
2650 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2651 * position-querying operation. Avoid rewriting the "same"
2652 * f_pos value back to the file because a concurrent read(),
2653 * write() or lseek() might have altered it
2658 * f_lock protects against read/modify/write race with other
2659 * SEEK_CURs. Note that parallel writes and reads behave
2662 mutex_lock(&inode->i_mutex);
2663 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2664 mutex_unlock(&inode->i_mutex);
2668 * In the generic case the entire file is data, so as long as
2669 * offset isn't at the end of the file then the offset is data.
2676 * There is a virtual hole at the end of the file, so as long as
2677 * offset isn't i_size or larger, return i_size.
2685 return llseek_execute(file, offset, maxsize);
/*
 * llseek entry point: for SEEK_END/SEEK_HOLE/SEEK_DATA first glimpse
 * the cluster-wide size from the OSTs so i_size_read() is current,
 * then defer to ll_generic_file_llseek_size().
 */
2689 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2691 struct inode *inode = file->f_dentry->d_inode;
2692 loff_t retval, eof = 0;
2695 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2696 (origin == SEEK_CUR) ? file->f_pos : 0);
2697 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2698 PFID(ll_inode2fid(inode)), inode, retval, retval,
2700 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2702 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2703 retval = ll_glimpse_size(inode);
2706 eof = i_size_read(inode);
2709 retval = ll_generic_file_llseek_size(file, offset, origin,
2710 ll_file_maxbytes(inode), eof);
/*
 * flush() file operation: report (and clear) async writeback errors
 * recorded on the inode and its cl_object. Returns -EIO once per
 * failure; fd_write_failed suppresses duplicate reports to the same
 * file descriptor.
 */
2714 int ll_flush(struct file *file, fl_owner_t id)
2716 struct inode *inode = file->f_dentry->d_inode;
2717 struct ll_inode_info *lli = ll_i2info(inode);
2718 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2721 LASSERT(!S_ISDIR(inode->i_mode));
2723 /* catch async errors that were recorded back when async writeback
2724 * failed for pages in this mapping. */
2725 rc = lli->lli_async_rc;
2726 lli->lli_async_rc = 0;
2727 if (lli->lli_clob != NULL) {
2728 err = lov_read_and_clear_async_rc(lli->lli_clob);
2733 /* The application has been told write failure already.
2734 * Do not report failure again. */
2735 if (fd->fd_write_failed)
2737 return rc ? -EIO : 0;
2741 * Called to make sure a portion of file has been written out.
2742 * If @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to the OST.
2744 * Return how many pages have been written.
/*
 * Run a CIT_FSYNC cl_io over [start, end] of @inode with the given
 * fsync @mode; on success returns the number of pages written
 * (fio->fi_nr_written), otherwise the cl_io error.
 */
2746 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2747 enum cl_fsync_mode mode, int ignore_layout)
2749 struct cl_env_nest nest;
2752 struct obd_capa *capa = NULL;
2753 struct cl_fsync_io *fio;
/* reject any mode outside the four defined cl_fsync_mode values */
2757 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2758 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2761 env = cl_env_nested_get(&nest);
2763 RETURN(PTR_ERR(env));
2765 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2767 io = ccc_env_thread_io(env);
2768 io->ci_obj = cl_i2info(inode)->lli_clob;
2769 io->ci_ignore_layout = ignore_layout;
2771 /* initialize parameters for sync */
2772 fio = &io->u.ci_fsync;
2773 fio->fi_capa = capa;
2774 fio->fi_start = start;
2776 fio->fi_fid = ll_inode2fid(inode);
2777 fio->fi_mode = mode;
2778 fio->fi_nr_written = 0;
2780 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2781 result = cl_io_loop(env, io);
2783 result = io->ci_result;
2785 result = fio->fi_nr_written;
2786 cl_io_fini(env, io);
2787 cl_env_nested_put(&nest, env);
2795 * When dentry is provided (the 'else' case), *file->f_dentry may be
2796 * null and dentry must be used directly rather than pulled from
2797 * *file->f_dentry as is done otherwise.
/*
 * fsync() file operation, compiled in one of three kernel-API shapes.
 * Flushes dirty pages, reports recorded async errors, sends an MDS
 * fsync RPC, and for datasync on regular files syncs the whole OST
 * range via cl_sync_file_range(CL_FSYNC_ALL per elided args —
 * NOTE(review): the mode argument line is not visible in this listing).
 */
2800 #ifdef HAVE_FILE_FSYNC_4ARGS
2801 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2803 struct dentry *dentry = file->f_dentry;
2804 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2805 int ll_fsync(struct file *file, int datasync)
2807 struct dentry *dentry = file->f_dentry;
2809 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2812 struct inode *inode = dentry->d_inode;
2813 struct ll_inode_info *lli = ll_i2info(inode);
2814 struct ptlrpc_request *req;
2815 struct obd_capa *oc;
2819 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2820 PFID(ll_inode2fid(inode)), inode);
2821 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2823 #ifdef HAVE_FILE_FSYNC_4ARGS
2824 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2825 mutex_lock(&inode->i_mutex);
2827 /* fsync's caller has already called _fdata{sync,write}, we want
2828 * that IO to finish before calling the osc and mdc sync methods */
2829 rc = filemap_fdatawait(inode->i_mapping);
2832 /* catch async errors that were recorded back when async writeback
2833 * failed for pages in this mapping. */
2834 if (!S_ISDIR(inode->i_mode)) {
2835 err = lli->lli_async_rc;
2836 lli->lli_async_rc = 0;
2839 err = lov_read_and_clear_async_rc(lli->lli_clob);
2844 oc = ll_mdscapa_get(inode);
2845 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2851 ptlrpc_req_finished(req);
2853 if (datasync && S_ISREG(inode->i_mode)) {
2854 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2856 err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
/* remember OST sync failure so ll_flush() reports it only once */
2858 if (rc == 0 && err < 0)
2861 fd->fd_write_failed = true;
2863 fd->fd_write_failed = false;
2866 #ifdef HAVE_FILE_FSYNC_4ARGS
2867 mutex_unlock(&inode->i_mutex);
/*
 * flock()/fcntl() lock operation: translate the VFS file_lock into an
 * LDLM_FLOCK enqueue to the MDS, then mirror the server's decision
 * into the local lock tables (flock_lock_file_wait /
 * posix_lock_file_wait). A local bookkeeping failure is undone by
 * re-enqueueing the lock as LCK_NL (unlock) on the server.
 */
2872 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2874 struct inode *inode = file->f_dentry->d_inode;
2875 struct ll_sb_info *sbi = ll_i2sbi(inode);
2876 struct ldlm_enqueue_info einfo = {
2877 .ei_type = LDLM_FLOCK,
2878 .ei_cb_cp = ldlm_flock_completion_ast,
2879 .ei_cbdata = file_lock,
2881 struct md_op_data *op_data;
2882 struct lustre_handle lockh = {0};
2883 ldlm_policy_data_t flock = {{0}};
2889 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
2890 PFID(ll_inode2fid(inode)), file_lock);
2892 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2894 if (file_lock->fl_flags & FL_FLOCK) {
2895 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2896 /* flocks are whole-file locks */
2897 flock.l_flock.end = OFFSET_MAX;
2898 /* For flocks owner is determined by the local file descriptor */
2899 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2900 } else if (file_lock->fl_flags & FL_POSIX) {
2901 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2902 flock.l_flock.start = file_lock->fl_start;
2903 flock.l_flock.end = file_lock->fl_end;
2907 flock.l_flock.pid = file_lock->fl_pid;
2909 /* Somewhat ugly workaround for svc lockd.
2910 * lockd installs custom fl_lmops->lm_compare_owner that checks
2911 * for the fl_owner to be the same (which it always is on local node
2912 * I guess between lockd processes) and then compares pid.
2913 * As such we assign pid to the owner field to make it all work,
2914 * conflict with normal locks is unlikely since pid space and
2915 * pointer space for current->files are not intersecting */
2916 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2917 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* map the fcntl lock type to an LDLM lock mode */
2919 switch (file_lock->fl_type) {
2921 einfo.ei_mode = LCK_PR;
2924 /* An unlock request may or may not have any relation to
2925 * existing locks so we may not be able to pass a lock handle
2926 * via a normal ldlm_lock_cancel() request. The request may even
2927 * unlock a byte range in the middle of an existing lock. In
2928 * order to process an unlock request we need all of the same
2929 * information that is given with a normal read or write record
2930 * lock request. To avoid creating another ldlm unlock (cancel)
2931 * message we'll treat a LCK_NL flock request as an unlock. */
2932 einfo.ei_mode = LCK_NL;
2935 einfo.ei_mode = LCK_PW;
2938 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2939 file_lock->fl_type);
2954 flags = LDLM_FL_BLOCK_NOWAIT;
2960 flags = LDLM_FL_TEST_LOCK;
2961 /* Save the old mode so that if the mode in the lock changes we
2962 * can decrement the appropriate reader or writer refcount. */
2963 file_lock->fl_type = einfo.ei_mode;
2966 CERROR("unknown fcntl lock command: %d\n", cmd);
2970 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2971 LUSTRE_OPC_ANY, NULL);
2972 if (IS_ERR(op_data))
2973 RETURN(PTR_ERR(op_data));
2975 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
2976 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
2977 flock.l_flock.pid, flags, einfo.ei_mode,
2978 flock.l_flock.start, flock.l_flock.end);
2980 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2981 op_data, &lockh, &flock, 0, NULL /* req */, flags);
/* keep the kernel's local lock state in sync with the server */
2983 if ((file_lock->fl_flags & FL_FLOCK) &&
2984 (rc == 0 || file_lock->fl_type == F_UNLCK))
2985 rc2 = flock_lock_file_wait(file, file_lock);
2986 if ((file_lock->fl_flags & FL_POSIX) &&
2987 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2988 !(flags & LDLM_FL_TEST_LOCK))
2989 rc2 = posix_lock_file_wait(file, file_lock);
/* local bookkeeping failed: release the server-side lock again */
2991 if (rc2 && file_lock->fl_type != F_UNLCK) {
2992 einfo.ei_mode = LCK_NL;
2993 md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2994 op_data, &lockh, &flock, 0, NULL /* req */, flags);
2998 ll_finish_md_op_data(op_data);
/*
 * Look up the FID of @name under @parent via an MDS getattr-by-name
 * RPC; on success the FID is copied into *fid (copy happens on a line
 * elided from this listing).
 */
3003 static int ll_get_fid_by_name(struct inode *parent, const char *name,
3004 int namelen, struct lu_fid *fid)
3006 struct md_op_data *op_data = NULL;
3007 struct mdt_body *body;
3008 struct ptlrpc_request *req;
3011 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3012 LUSTRE_OPC_ANY, NULL);
3013 if (IS_ERR(op_data))
3014 return PTR_ERR(op_data);
3016 op_data->op_valid = OBD_MD_FLID;
3017 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3021 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3023 GOTO(out_req, rc = -EFAULT);
3027 ptlrpc_req_finished(req);
3029 if (op_data != NULL)
3030 ll_finish_md_op_data(op_data);
/*
 * Migrate directory entry @name under @parent to MDT @mdtidx using a
 * CLI_MIGRATE rename RPC. The child FID is taken from the dcache when
 * possible, otherwise fetched via ll_get_fid_by_name(); a no-op if the
 * object already lives on the target MDT.
 */
3034 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3035 const char *name, int namelen)
3037 struct dentry *dchild = NULL;
3038 struct md_op_data *op_data;
3039 struct ptlrpc_request *request = NULL;
3044 CDEBUG(D_VFSTRACE, "migrate %s under"DFID" to MDT%d\n",
3045 name, PFID(ll_inode2fid(parent)), mdtidx);
3047 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3048 0, LUSTRE_OPC_ANY, NULL);
3049 if (IS_ERR(op_data))
3050 RETURN(PTR_ERR(op_data));
3052 /* Get child FID first */
3053 qstr.hash = full_name_hash(name, namelen);
3056 dchild = d_lookup(file->f_dentry, &qstr);
3057 if (dchild != NULL && dchild->d_inode != NULL) {
3058 op_data->op_fid3 = *ll_inode2fid(dchild->d_inode);
3060 rc = ll_get_fid_by_name(parent, name, strnlen(name, namelen),
3066 if (!fid_is_sane(&op_data->op_fid3)) {
3067 CERROR("%s: migrate %s , but fid "DFID" is insane\n",
3068 ll_get_fsname(parent->i_sb, NULL, 0), name,
3069 PFID(&op_data->op_fid3));
3073 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
/* already on the target MDT: nothing to migrate */
3078 CDEBUG(D_INFO, "%s:"DFID" is already on MDT%d.\n", name,
3079 PFID(&op_data->op_fid3), mdtidx);
3080 GOTO(out_free, rc = 0);
3083 op_data->op_mds = mdtidx;
3084 op_data->op_cli_flags = CLI_MIGRATE;
3085 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3086 strnlen(name, namelen), name, strnlen(name, namelen),
3089 ll_update_times(request, parent);
3091 ptlrpc_req_finished(request);
/* drop the stale local inode for the migrated child, if cached */
3096 if (dchild != NULL) {
3097 if (dchild->d_inode != NULL)
3098 ll_delete_inode(dchild->d_inode);
3102 ll_finish_md_op_data(op_data);
/*
 * .flock/.lock handler for "-o noflock" mounts; body is elided from
 * this listing (presumably returns an error such as -ENOSYS — see the
 * "noflock" comment at the file_operations table below).
 */
3106 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3114 * test if some locks matching bits and l_req_mode are acquired
3115 * - bits can be in different locks
3116 * - if found clear the common lock bits in *bits
3117 * - the bits not found, are kept in *bits
3119 * \param bits [IN] searched lock bits [IN]
3120 * \param l_req_mode [IN] searched lock mode
3121 * \retval boolean, true iff all bits are found
3123 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
3125 struct lustre_handle lockh;
3126 ldlm_policy_data_t policy;
/* LCK_MINMODE means "any mode": match against all four modes */
3127 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
3128 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
3137 fid = &ll_i2info(inode)->lli_fid;
3138 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3139 ldlm_lockname[mode]);
/* TEST_LOCK: only probe for a match, do not take a reference */
3141 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
3142 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3143 policy.l_inodebits.bits = *bits & (1 << i);
3144 if (policy.l_inodebits.bits == 0)
3147 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3148 &policy, mode, &lockh)) {
3149 struct ldlm_lock *lock;
3151 lock = ldlm_handle2lock(&lockh);
3154 ~(lock->l_policy_data.l_inodebits.bits);
3155 LDLM_LOCK_PUT(lock);
3157 *bits &= ~policy.l_inodebits.bits;
/*
 * Match (and reference) a granted MD inodebits lock covering @bits on
 * @inode; returns the matched mode (handle in *lockh) or 0 if none.
 */
3164 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
3165 struct lustre_handle *lockh, __u64 flags,
3168 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
3173 fid = &ll_i2info(inode)->lli_fid;
3174 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3176 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3177 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the result of a revalidation RPC: -ENOENT on a
 * non-regular/non-directory inode is converted to success (object was
 * unlinked); other errors are logged.
 */
3182 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3184 /* Already unlinked. Just update nlink and return success */
3185 if (rc == -ENOENT) {
3187 /* This path cannot be hit for regular files unless in
3188 * case of obscure races, so no need to validate
3190 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3192 } else if (rc != 0) {
3193 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3194 "%s: revalidate FID "DFID" error: rc = %d\n",
3195 ll_get_fsname(inode->i_sb, NULL, 0),
3196 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate @dentry's MD attributes against the MDS. Two paths:
 * with OBD_CONNECT_ATTRFID, an intent getattr/lookup by FID (no name);
 * otherwise a plain md_getattr() — skipped if a matching MD lock for
 * @ibits is already cached locally (ll_have_md_lock).
 */
3202 int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
3205 struct inode *inode = dentry->d_inode;
3206 struct ptlrpc_request *req = NULL;
3207 struct obd_export *exp;
3211 LASSERT(inode != NULL);
3213 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3214 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3216 exp = ll_i2mdexp(inode);
3218 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3219 * But under CMD case, it caused some lock issues, should be fixed
3220 * with new CMD ibits lock. See bug 12718 */
3221 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3222 struct lookup_intent oit = { .it_op = IT_GETATTR };
3223 struct md_op_data *op_data;
3225 if (ibits == MDS_INODELOCK_LOOKUP)
3226 oit.it_op = IT_LOOKUP;
3228 /* Call getattr by fid, so do not provide name at all. */
3229 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3230 dentry->d_inode, NULL, 0, 0,
3231 LUSTRE_OPC_ANY, NULL);
3232 if (IS_ERR(op_data))
3233 RETURN(PTR_ERR(op_data));
3235 oit.it_create_mode |= M_CHECK_STALE;
3236 rc = md_intent_lock(exp, op_data, NULL, 0,
3237 /* we are not interested in name
3240 ll_md_blocking_ast, 0);
3241 ll_finish_md_op_data(op_data);
3242 oit.it_create_mode &= ~M_CHECK_STALE;
3244 rc = ll_inode_revalidate_fini(inode, rc);
3248 rc = ll_revalidate_it_finish(req, &oit, dentry);
3250 ll_intent_release(&oit);
3254 /* Unlinked? Unhash dentry, so it is not picked up later by
3255 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3256 here to preserve get_cwd functionality on 2.6.
3258 if (!dentry->d_inode->i_nlink)
3259 d_lustre_invalidate(dentry, 0);
3261 ll_lookup_finish_locks(&oit, dentry);
3262 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3263 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3264 obd_valid valid = OBD_MD_FLGETATTR;
3265 struct md_op_data *op_data;
/* regular files: also fetch striping EA, sized to the max EA size */
3268 if (S_ISREG(inode->i_mode)) {
3269 rc = ll_get_max_mdsize(sbi, &ealen);
3272 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3275 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3276 0, ealen, LUSTRE_OPC_ANY,
3278 if (IS_ERR(op_data))
3279 RETURN(PTR_ERR(op_data));
3281 op_data->op_valid = valid;
3282 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
3283 * capa for this inode. Because we only keep capas of dirs
3285 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3286 ll_finish_md_op_data(op_data);
3288 rc = ll_inode_revalidate_fini(inode, rc);
3292 rc = ll_prep_inode(&inode, req, NULL, NULL);
3295 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge attributes from all stripes via
 * md_merge_attr() and cache size/nlink/times in the ll_inode_info.
 */
3299 static int ll_merge_md_attr(struct inode *inode)
3301 struct cl_attr attr = { 0 };
3304 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3305 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3310 ll_i2info(inode)->lli_stripe_dir_size = attr.cat_size;
3311 ll_i2info(inode)->lli_stripe_dir_nlink = attr.cat_nlink;
3313 ll_i2info(inode)->lli_lvb.lvb_atime = attr.cat_atime;
3314 ll_i2info(inode)->lli_lvb.lvb_mtime = attr.cat_mtime;
3315 ll_i2info(inode)->lli_lvb.lvb_ctime = attr.cat_ctime;
/*
 * Revalidate MD attributes and, for regular files, glimpse the size
 * from the OSTs; striped directories get merged stripe attributes
 * instead.
 */
3320 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
3323 struct inode *inode = dentry->d_inode;
3327 rc = __ll_inode_revalidate_it(dentry, it, ibits);
3331 /* if object isn't regular file, don't validate size */
3332 if (!S_ISREG(inode->i_mode)) {
3333 if (S_ISDIR(inode->i_mode) &&
3334 ll_i2info(inode)->lli_lsm_md != NULL) {
3335 rc = ll_merge_md_attr(inode);
3340 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
3341 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
3342 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
3344 /* In case of restore, the MDT has the right size and has
3345 * already send it back without granting the layout lock,
3346 * inode is up-to-date so glimpse is useless.
3347 * Also to glimpse we need the layout, in case of a running
3348 * restore the MDT holds the layout lock so the glimpse will
3349 * block up to the end of restore (getattr will block)
3351 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3352 rc = ll_glimpse_size(inode);
/*
 * getattr with a caller-supplied lookup intent: revalidate
 * UPDATE|LOOKUP bits, then fill *stat from the (now fresh) inode.
 */
3357 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
3358 struct lookup_intent *it, struct kstat *stat)
3360 struct inode *inode = de->d_inode;
3361 struct ll_sb_info *sbi = ll_i2sbi(inode);
3362 struct ll_inode_info *lli = ll_i2info(inode);
3365 res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
3366 MDS_INODELOCK_LOOKUP);
3367 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3372 stat->dev = inode->i_sb->s_dev;
/* 32-bit userland needs a squashed inode number built from the FID */
3373 if (ll_need_32bit_api(sbi))
3374 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3376 stat->ino = inode->i_ino;
3377 stat->mode = inode->i_mode;
3378 stat->uid = inode->i_uid;
3379 stat->gid = inode->i_gid;
3380 stat->rdev = inode->i_rdev;
3381 stat->atime = inode->i_atime;
3382 stat->mtime = inode->i_mtime;
3383 stat->ctime = inode->i_ctime;
3384 stat->blksize = 1 << inode->i_blkbits;
3385 stat->blocks = inode->i_blocks;
/* striped dirs report the merged per-stripe size/nlink */
3387 if (S_ISDIR(inode->i_mode) &&
3388 ll_i2info(inode)->lli_lsm_md != NULL) {
3389 stat->nlink = lli->lli_stripe_dir_nlink;
3390 stat->size = lli->lli_stripe_dir_size;
3392 stat->nlink = inode->i_nlink;
3393 stat->size = i_size_read(inode);
/* VFS .getattr entry point: delegate with a fresh IT_GETATTR intent. */
3398 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3400 struct lookup_intent it = { .it_op = IT_GETATTR };
3402 return ll_getattr_it(mnt, de, &it, stat);
/*
 * VFS .fiemap entry point: marshal the kernel fiemap_extent_info into
 * a Lustre ll_user_fiemap, run ll_do_fiemap(), and copy the mapped
 * extents back.
 */
3405 int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3406 __u64 start, __u64 len)
3410 struct ll_user_fiemap *fiemap;
3411 unsigned int extent_count = fieinfo->fi_extents_max;
3413 num_bytes = sizeof(*fiemap) + (extent_count *
3414 sizeof(struct ll_fiemap_extent));
3415 OBD_ALLOC_LARGE(fiemap, num_bytes);
3420 fiemap->fm_flags = fieinfo->fi_flags;
3421 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3422 fiemap->fm_start = start;
3423 fiemap->fm_length = len;
/* NOTE(review): this copies one extent from fi_extents_start even
 * when fi_extents_max == 0 — verify fi_extents_start cannot be NULL
 * in that case. */
3424 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3425 sizeof(struct ll_fiemap_extent));
3427 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3429 fieinfo->fi_flags = fiemap->fm_flags;
3430 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3431 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3432 fiemap->fm_mapped_extents * sizeof(struct ll_fiemap_extent));
3434 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * Return a referenced copy of the cached POSIX ACL for @inode.
 * The caller (VFS permission code) drops the reference.
 */
3438 struct posix_acl * ll_get_acl(struct inode *inode, int type)
3440 struct ll_inode_info *lli = ll_i2info(inode);
3441 struct posix_acl *acl = NULL;
3444 spin_lock(&lli->lli_lock);
3445 /* VFS' acl_permission_check->check_acl will release the refcount */
3446 acl = posix_acl_dup(lli->lli_posix_acl);
3447 spin_unlock(&lli->lli_lock);
3452 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
/*
 * ACL check callback for generic_permission() on older kernel APIs;
 * compiled only when CONFIG_FS_POSIX_ACL is set. RCU walks cannot
 * block, so IPERM_FLAG_RCU bails out early on 4-arg kernels.
 */
3454 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3455 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3457 ll_check_acl(struct inode *inode, int mask)
3460 # ifdef CONFIG_FS_POSIX_ACL
3461 struct posix_acl *acl;
3465 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3466 if (flags & IPERM_FLAG_RCU)
3469 acl = ll_get_acl(inode, ACL_TYPE_ACCESS)...
3474 rc = posix_acl_permission(inode, acl, mask);
3475 posix_acl_release(acl);
3478 # else /* !CONFIG_FS_POSIX_ACL */
3480 # endif /* CONFIG_FS_POSIX_ACL */
3482 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
3484 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
/*
 * VFS .permission entry point (three kernel-API variants): revalidate
 * the root inode if needed, delegate remote-client checks to
 * lustre_check_remote_perm(), otherwise run generic permission with
 * ll_check_acl. RCU-walk lookups are rejected since we may block.
 */
3485 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3487 # ifdef HAVE_INODE_PERMISION_2ARGS
3488 int ll_inode_permission(struct inode *inode, int mask)
3490 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3497 #ifdef MAY_NOT_BLOCK
3498 if (mask & MAY_NOT_BLOCK)
3500 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3501 if (flags & IPERM_FLAG_RCU)
3505 /* as root inode are NOT getting validated in lookup operation,
3506 * need to do it before permission check. */
3508 if (inode == inode->i_sb->s_root->d_inode) {
3509 struct lookup_intent it = { .it_op = IT_LOOKUP };
3511 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
3512 MDS_INODELOCK_LOOKUP);
3517 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3518 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3520 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
3521 return lustre_check_remote_perm(inode, mask);
3523 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
3524 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3529 /* -o localflock - only provides locally consistent flock locks */
/* Default file_operations: no .flock/.lock, so the kernel's local
 * (single-node) flock semantics apply. */
3530 struct file_operations ll_file_operations = {
3531 .read = ll_file_read,
3532 .aio_read = ll_file_aio_read,
3533 .write = ll_file_write,
3534 .aio_write = ll_file_aio_write,
3535 .unlocked_ioctl = ll_file_ioctl,
3536 .open = ll_file_open,
3537 .release = ll_file_release,
3538 .mmap = ll_file_mmap,
3539 .llseek = ll_file_seek,
3540 .splice_read = ll_file_splice_read,
/* "-o flock" mounts: cluster-wide flock/posix locks via ll_file_flock. */
3545 struct file_operations ll_file_operations_flock = {
3546 .read = ll_file_read,
3547 .aio_read = ll_file_aio_read,
3548 .write = ll_file_write,
3549 .aio_write = ll_file_aio_write,
3550 .unlocked_ioctl = ll_file_ioctl,
3551 .open = ll_file_open,
3552 .release = ll_file_release,
3553 .mmap = ll_file_mmap,
3554 .llseek = ll_file_seek,
3555 .splice_read = ll_file_splice_read,
3558 .flock = ll_file_flock,
3559 .lock = ll_file_flock
3562 /* These are for -o noflock - to return ENOSYS on flock calls */
3563 struct file_operations ll_file_operations_noflock = {
3564 .read = ll_file_read,
3565 .aio_read = ll_file_aio_read,
3566 .write = ll_file_write,
3567 .aio_write = ll_file_aio_write,
3568 .unlocked_ioctl = ll_file_ioctl,
3569 .open = ll_file_open,
3570 .release = ll_file_release,
3571 .mmap = ll_file_mmap,
3572 .llseek = ll_file_seek,
3573 .splice_read = ll_file_splice_read,
3576 .flock = ll_file_noflock,
3577 .lock = ll_file_noflock
/* inode_operations for regular files. */
3580 struct inode_operations ll_file_inode_operations = {
3581 .setattr = ll_setattr,
3582 .getattr = ll_getattr,
3583 .permission = ll_inode_permission,
3584 .setxattr = ll_setxattr,
3585 .getxattr = ll_getxattr,
3586 .listxattr = ll_listxattr,
3587 .removexattr = ll_removexattr,
3588 .fiemap = ll_fiemap,
3589 #ifdef HAVE_IOP_GET_ACL
3590 .get_acl = ll_get_acl,
3594 /* dynamic ioctl number support routins */
/* Registry of dynamically-registered ioctl handlers: a list of
 * llioc_data entries guarded by a rw_semaphore. */
3595 static struct llioc_ctl_data {
3596 struct rw_semaphore ioc_sem;
3597 cfs_list_t ioc_head;
3599 __RWSEM_INITIALIZER(llioc.ioc_sem),
3600 CFS_LIST_HEAD_INIT(llioc.ioc_head)
/* One registration: callback plus its array of handled ioctl cmds
 * (flexible trailing array iocd_cmd, iocd_count entries). */
3605 cfs_list_t iocd_list;
3606 unsigned int iocd_size;
3607 llioc_callback_t iocd_cb;
3608 unsigned int iocd_count;
3609 unsigned int iocd_cmd[0];
/*
 * Register a dynamic ioctl handler @cb for @count command numbers in
 * @cmd. Returns an opaque cookie (the llioc_data) for later
 * ll_iocontrol_unregister(), or NULL on bad arguments / allocation
 * failure (failure returns are on lines elided from this listing).
 */
3612 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3615 struct llioc_data *in_data = NULL;
3618 if (cb == NULL || cmd == NULL ||
3619 count > LLIOC_MAX_CMD || count < 0)
3622 size = sizeof(*in_data) + count * sizeof(unsigned int);
3623 OBD_ALLOC(in_data, size);
3624 if (in_data == NULL)
3627 memset(in_data, 0, sizeof(*in_data));
3628 in_data->iocd_size = size;
3629 in_data->iocd_cb = cb;
3630 in_data->iocd_count = count;
3631 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3633 down_write(&llioc.ioc_sem);
3634 cfs_list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3635 up_write(&llioc.ioc_sem);
/*
 * Remove and free the registration identified by @magic (the cookie
 * returned by ll_iocontrol_register); warns if it is not found.
 */
3640 void ll_iocontrol_unregister(void *magic)
3642 struct llioc_data *tmp;
3647 down_write(&llioc.ioc_sem);
3648 cfs_list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3650 unsigned int size = tmp->iocd_size;
3652 cfs_list_del(&tmp->iocd_list);
3653 up_write(&llioc.ioc_sem);
3655 OBD_FREE(tmp, size);
3659 up_write(&llioc.ioc_sem);
3661 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3664 EXPORT_SYMBOL(ll_iocontrol_register);
3665 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch ioctl @cmd to registered dynamic handlers; the first
 * handler returning LLIOC_STOP wins and its rc is stored in *rcp.
 */
3667 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
3668 unsigned int cmd, unsigned long arg, int *rcp)
3670 enum llioc_iter ret = LLIOC_CONT;
3671 struct llioc_data *data;
3672 int rc = -EINVAL, i;
3674 down_read(&llioc.ioc_sem);
3675 cfs_list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3676 for (i = 0; i < data->iocd_count; i++) {
3677 if (cmd != data->iocd_cmd[i])
3680 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3684 if (ret == LLIOC_STOP)
3687 up_read(&llioc.ioc_sem);
/*
 * Push a layout configuration into the inode's cl_object via
 * cl_conf_set(); for OBJECT_CONF_SET, additionally allow the layout
 * lock to match only after the layout has been applied.
 */
3694 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3696 struct ll_inode_info *lli = ll_i2info(inode);
3697 struct cl_env_nest nest;
3702 if (lli->lli_clob == NULL)
3705 env = cl_env_nested_get(&nest);
3707 RETURN(PTR_ERR(env));
3709 result = cl_conf_set(env, lli->lli_clob, conf);
3710 cl_env_nested_put(&nest, env);
3712 if (conf->coc_opc == OBJECT_CONF_SET) {
3713 struct ldlm_lock *lock = conf->coc_lock;
3715 LASSERT(lock != NULL);
3716 LASSERT(ldlm_has_layout(lock));
3718 /* it can only be allowed to match after layout is
3719 * applied to inode otherwise false layout would be
3720 * seen. Applying layout shoud happen before dropping
3721 * the intent lock. */
3722 ldlm_lock_allow_match(lock);
3728 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
3729 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3732 struct ll_sb_info *sbi = ll_i2sbi(inode);
3733 struct obd_capa *oc;
3734 struct ptlrpc_request *req;
3735 struct mdt_body *body;
3742 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3743 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
3744 lock->l_lvb_data, lock->l_lvb_len);
/* layout already attached to the lock: nothing to fetch */
3746 if ((lock->l_lvb_data != NULL) && ldlm_is_lvb_ready(lock))
3749 /* if layout lock was granted right away, the layout is returned
3750 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3751 * blocked and then granted via completion ast, we have to fetch
3752 * layout here. Please note that we can't use the LVB buffer in
3753 * completion AST because it doesn't have a large enough buffer */
3754 oc = ll_mdscapa_get(inode);
3755 rc = ll_get_max_mdsize(sbi, &lmmsize);
3757 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3758 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3764 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3765 if (body == NULL || body->eadatasize > lmmsize)
3766 GOTO(out, rc = -EPROTO);
3768 lmmsize = body->eadatasize;
3769 if (lmmsize == 0) /* empty layout */
3772 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3774 GOTO(out, rc = -EFAULT);
/* attach a private copy of the LOV EA to the lock as its LVB */
3776 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3777 if (lvbdata == NULL)
3778 GOTO(out, rc = -ENOMEM);
3780 memcpy(lvbdata, lmm, lmmsize);
3781 lock_res_and_lock(lock);
3782 if (lock->l_lvb_data != NULL)
3783 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3785 lock->l_lvb_data = lvbdata;
3786 lock->l_lvb_len = lmmsize;
3787 unlock_res_and_lock(lock);
3792 ptlrpc_req_finished(req);
3797 * Apply the layout to the inode. Layout lock is held and will be released
/*
 * Given a held layout lock (@lockh/@mode): fetch the layout if needed,
 * unpack the LOV EA from the lock's LVB, configure the inode's
 * cl_object with it, and report the layout generation in *gen. The
 * lock reference is dropped before returning; if the cl_object is
 * still busy (-EBUSY) an OBJECT_CONF_WAIT pass blocks until IO using
 * the old layout drains.
 */
3800 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3801 struct inode *inode, __u32 *gen, bool reconf)
3803 struct ll_inode_info *lli = ll_i2info(inode);
3804 struct ll_sb_info *sbi = ll_i2sbi(inode);
3805 struct ldlm_lock *lock;
3806 struct lustre_md md = { NULL };
3807 struct cl_object_conf conf;
3810 bool wait_layout = false;
3813 LASSERT(lustre_handle_is_used(lockh));
3815 lock = ldlm_handle2lock(lockh);
3816 LASSERT(lock != NULL);
3817 LASSERT(ldlm_has_layout(lock));
3819 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured: %d\n",
3820 PFID(&lli->lli_fid), inode, reconf);
3822 /* in case this is a caching lock and reinstate with new inode */
3823 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3825 lock_res_and_lock(lock);
3826 lvb_ready = ldlm_is_lvb_ready(lock);
3827 unlock_res_and_lock(lock);
3828 /* checking lvb_ready is racy but this is okay. The worst case is
3829 * that multi processes may configure the file on the same time. */
3831 if (lvb_ready || !reconf) {
3834 /* layout_gen must be valid if layout lock is not
3835 * cancelled and stripe has already set */
3836 *gen = lli->lli_layout_gen;
3842 rc = ll_layout_fetch(inode, lock);
3846 /* for layout lock, lmm is returned in lock's lvb.
3847 * lvb_data is immutable if the lock is held so it's safe to access it
3848 * without res lock. See the description in ldlm_lock_decref_internal()
3849 * for the condition to free lvb_data of layout lock */
3850 if (lock->l_lvb_data != NULL) {
3851 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3852 lock->l_lvb_data, lock->l_lvb_len);
3854 *gen = LL_LAYOUT_GEN_EMPTY;
3856 *gen = md.lsm->lsm_layout_gen;
3859 CERROR("%s: file "DFID" unpackmd error: %d\n",
3860 ll_get_fsname(inode->i_sb, NULL, 0),
3861 PFID(&lli->lli_fid), rc);
3867 /* set layout to file. Unlikely this will fail as old layout was
3868 * surely eliminated */
3869 memset(&conf, 0, sizeof conf);
3870 conf.coc_opc = OBJECT_CONF_SET;
3871 conf.coc_inode = inode;
3872 conf.coc_lock = lock;
3873 conf.u.coc_md = &md;
3874 rc = ll_layout_conf(inode, &conf);
3877 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3879 /* refresh layout failed, need to wait */
3880 wait_layout = rc == -EBUSY;
3884 LDLM_LOCK_PUT(lock);
3885 ldlm_lock_decref(lockh, mode);
3887 /* wait for IO to complete if it's still being used. */
3889 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
3890 ll_get_fsname(inode->i_sb, NULL, 0),
3891 PFID(&lli->lli_fid), inode);
3893 memset(&conf, 0, sizeof conf);
3894 conf.coc_opc = OBJECT_CONF_WAIT;
3895 conf.coc_inode = inode;
3896 rc = ll_layout_conf(inode, &conf);
3900 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
3901 ll_get_fsname(inode->i_sb, NULL, 0),
3902 PFID(&lli->lli_fid), rc);
3908 * This function checks if there exists a LAYOUT lock on the client side,
3909 * or enqueues it if it doesn't have one in cache.
3911 * This function will not hold layout lock so it may be revoked any time after
3912 * this function returns. Any operations depend on layout should be redone
3915 * This function should be called before lov_io_init() to get an uptodate
3916 * layout version, the caller should save the version number and after IO
3917 * is finished, this function should be called again to verify that layout
3918 * is not changed during IO time.
3920 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3922 struct ll_inode_info *lli = ll_i2info(inode);
3923 struct ll_sb_info *sbi = ll_i2sbi(inode);
3924 struct md_op_data *op_data;
3925 struct lookup_intent it;
3926 struct lustre_handle lockh;
3928 struct ldlm_enqueue_info einfo = {
3929 .ei_type = LDLM_IBITS,
3931 .ei_cb_bl = ll_md_blocking_ast,
3932 .ei_cb_cp = ldlm_completion_ast,
3937 *gen = lli->lli_layout_gen;
/* layout locks disabled on this mount: current gen is all we have */
3938 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
3942 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3943 LASSERT(S_ISREG(inode->i_mode));
3945 /* mostly layout lock is caching on the local side, so try to match
3946 * it before grabbing layout lock mutex. */
3947 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3948 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3949 if (mode != 0) { /* hit cached lock */
3950 rc = ll_layout_lock_set(&lockh, mode, inode, gen, false);
3954 /* better hold lli_layout_mutex to try again otherwise
3955 * it will have starvation problem. */
3958 /* take layout lock mutex to enqueue layout lock exclusively. */
3959 mutex_lock(&lli->lli_layout_mutex);
3962 /* try again. Maybe somebody else has done this. */
3963 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3964 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3965 if (mode != 0) { /* hit cached lock */
3966 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3970 mutex_unlock(&lli->lli_layout_mutex);
3974 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3975 0, 0, LUSTRE_OPC_ANY, NULL);
3976 if (IS_ERR(op_data)) {
3977 mutex_unlock(&lli->lli_layout_mutex);
3978 RETURN(PTR_ERR(op_data));
3981 /* have to enqueue one */
3982 memset(&it, 0, sizeof(it));
3983 it.it_op = IT_LAYOUT;
3984 lockh.cookie = 0ULL;
3986 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)\n",
3987 ll_get_fsname(inode->i_sb, NULL, 0),
3988 PFID(&lli->lli_fid), inode);
3990 rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
/* the intent request body is no longer needed once enqueue returns */
3992 if (it.d.lustre.it_data != NULL)
3993 ptlrpc_req_finished(it.d.lustre.it_data);
3994 it.d.lustre.it_data = NULL;
3996 ll_finish_md_op_data(op_data);
3998 mode = it.d.lustre.it_lock_mode;
3999 it.d.lustre.it_lock_mode = 0;
4000 ll_intent_drop_lock(&it);
4003 /* set lock data in case this is a new lock */
4004 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4005 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
4009 mutex_unlock(&lli->lli_layout_mutex);
 * This function sends a restore request to the MDT
4017 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4019 struct hsm_user_request *hur;
4023 len = sizeof(struct hsm_user_request) +
4024 sizeof(struct hsm_user_item);
4025 OBD_ALLOC(hur, len);
4029 hur->hur_request.hr_action = HUA_RESTORE;
4030 hur->hur_request.hr_archive_id = 0;
4031 hur->hur_request.hr_flags = 0;
4032 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4033 sizeof(hur->hur_user_item[0].hui_fid));
4034 hur->hur_user_item[0].hui_extent.offset = offset;
4035 hur->hur_user_item[0].hui_extent.length = length;
4036 hur->hur_request.hr_itemcount = 1;
4037 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,