4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2013, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include <linux/sched.h>
49 #include "llite_internal.h"
50 #include <lustre/ll_fiemap.h>
51 #include <lustre_ioctl.h>
53 #include "cl_object.h"
/* Forward declarations for static helpers defined later in this file. */
56 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
58 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
61 static enum llioc_iter
62 ll_iocontrol_call(struct inode *inode, struct file *file,
63 unsigned int cmd, unsigned long arg, int *rcp);
/*
 * Allocate a per-open-file ll_file_data from its slab cache (GFP_NOFS to
 * avoid filesystem recursion during reclaim) and reset fd_write_failed.
 * NOTE(review): lines are elided in this view (no visible NULL check or
 * return) -- confirm against the full source before relying on details.
 */
65 static struct ll_file_data *ll_file_data_get(void)
67 struct ll_file_data *fd;
69 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
73 fd->fd_write_failed = false;
/* Return a ll_file_data to the slab cache it was allocated from. */
78 static void ll_file_data_put(struct ll_file_data *fd)
81 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Pack the inode's current attributes (fid, mode, times, size, blocks,
 * flags), the IO epoch, the open handle @fh and the MDS capability into
 * @op_data for an MDS request.
 * NOTE(review): some lines are elided in this view.
 */
84 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
85 struct lustre_handle *fh)
87 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
88 op_data->op_attr.ia_mode = inode->i_mode;
89 op_data->op_attr.ia_atime = inode->i_atime;
90 op_data->op_attr.ia_mtime = inode->i_mtime;
91 op_data->op_attr.ia_ctime = inode->i_ctime;
92 op_data->op_attr.ia_size = i_size_read(inode);
93 op_data->op_attr_blocks = inode->i_blocks;
/* Translate in-kernel inode flags to on-wire ext-style flags. */
94 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
95 ll_inode_to_ext_flags(inode->i_flags);
96 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
98 op_data->op_handle = *fh;
99 op_data->op_capa1 = ll_mdscapa_get(inode);
/* Propagate the dirty-data hint so the MDS refreshes attributes. */
101 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
102 op_data->op_bias |= MDS_DATA_MODIFIED;
106 * Closes the IO epoch and packs all the attributes into @op_data for
/*
 * ll_prepare_close(): prepare @op_data for the MDS close RPC.
 * Mode/atime/mtime/ctime are always sent; size/blocks are added only
 * when the MDS export lacks SOM support or the file is not regular.
 * NOTE(review): several lines are elided in this view (e.g. the early
 * exit for non-write opens is implied by the FMODE_WRITE test below).
 */
109 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
110 struct obd_client_handle *och)
114 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
115 ATTR_MTIME | ATTR_MTIME_SET |
116 ATTR_CTIME | ATTR_CTIME_SET;
/* Size-on-MDS considerations apply to write opens only. */
118 if (!(och->och_flags & FMODE_WRITE))
121 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
122 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
124 ll_ioepoch_close(inode, op_data, &och, 0);
127 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
128 ll_prep_md_op_data(op_data, inode, NULL, NULL,
129 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Close the MDS open handle @och for @inode via an md_close() RPC.
 * A non-NULL @data_version marks this as an HSM release-style close
 * (MDS_HSM_RELEASE bias).  The handle cookie is poisoned with
 * DEAD_HANDLE_MAGIC before return.
 * NOTE(review): many lines are elided in this view; the exact control
 * flow between the visible statements cannot be verified from here.
 */
133 static int ll_close_inode_openhandle(struct obd_export *md_exp,
135 struct obd_client_handle *och,
136 const __u64 *data_version)
138 struct obd_export *exp = ll_i2mdexp(inode);
139 struct md_op_data *op_data;
140 struct ptlrpc_request *req = NULL;
141 struct obd_device *obd = class_exp2obd(exp);
148 * XXX: in case of LMV, is this correct to access
151 CERROR("Invalid MDC connection handle "LPX64"\n",
152 ll_i2mdexp(inode)->exp_handle.h_cookie);
156 OBD_ALLOC_PTR(op_data);
158 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
160 ll_prepare_close(inode, op_data, och);
161 if (data_version != NULL) {
162 /* Pass in data_version implies release. */
163 op_data->op_bias |= MDS_HSM_RELEASE;
164 op_data->op_data_version = *data_version;
165 op_data->op_lease_handle = och->och_lease_handle;
166 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
168 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
169 rc = md_close(md_exp, op_data, och->och_mod, &req);
171 /* This close must have the epoch closed. */
172 LASSERT(epoch_close);
173 /* MDS has instructed us to obtain Size-on-MDS attribute from
174 * OSTs and send setattr to back to MDS. */
175 rc = ll_som_update(inode, op_data);
177 CERROR("%s: inode "DFID" mdc Size-on-MDS update"
178 " failed: rc = %d\n",
179 ll_i2mdexp(inode)->exp_obd->obd_name,
180 PFID(ll_inode2fid(inode)), rc);
184 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
185 ll_i2mdexp(inode)->exp_obd->obd_name,
186 PFID(ll_inode2fid(inode)), rc);
189 /* DATA_MODIFIED flag was successfully sent on close, cancel data
190 * modification flag. */
191 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
192 struct ll_inode_info *lli = ll_i2info(inode);
194 spin_lock(&lli->lli_lock);
195 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
196 spin_unlock(&lli->lli_lock);
200 rc = ll_objects_destroy(req, inode);
202 CERROR("%s: inode "DFID
203 " ll_objects destroy: rc = %d\n",
204 ll_i2mdexp(inode)->exp_obd->obd_name,
205 PFID(ll_inode2fid(inode)), rc);
/* For HSM release, the server must confirm it actually released. */
208 if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
209 struct mdt_body *body;
210 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
211 if (!(body->valid & OBD_MD_FLRELEASED))
215 ll_finish_md_op_data(op_data);
/* SOM enabled but epoch still open: defer the Size-on-MDS update to
 * the DONE_WRITING path. */
219 if (exp_connect_som(exp) && !epoch_close &&
220 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
221 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
223 md_clear_open_replay_data(md_exp, och);
224 /* Free @och if it is not waiting for DONE_WRITING. */
225 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
228 if (req) /* This is close request */
229 ptlrpc_req_finished(req);
/*
 * Really close the MDS open handle of the given open mode (write, exec
 * or read) for @inode -- but only when no other openers of that mode
 * remain (checked under lli_och_mutex).
 * NOTE(review): lines are elided in this view; the handle-grab between
 * the mutex sections is not visible.
 */
233 int ll_md_real_close(struct inode *inode, fmode_t fmode)
235 struct ll_inode_info *lli = ll_i2info(inode);
236 struct obd_client_handle **och_p;
237 struct obd_client_handle *och;
/* Select the per-mode handle slot and its use count. */
242 if (fmode & FMODE_WRITE) {
243 och_p = &lli->lli_mds_write_och;
244 och_usecount = &lli->lli_open_fd_write_count;
245 } else if (fmode & FMODE_EXEC) {
246 och_p = &lli->lli_mds_exec_och;
247 och_usecount = &lli->lli_open_fd_exec_count;
249 LASSERT(fmode & FMODE_READ);
250 och_p = &lli->lli_mds_read_och;
251 och_usecount = &lli->lli_open_fd_read_count;
254 mutex_lock(&lli->lli_och_mutex);
255 if (*och_usecount > 0) {
256 /* There are still users of this handle, so skip
258 mutex_unlock(&lli->lli_och_mutex);
264 mutex_unlock(&lli->lli_och_mutex);
267 /* There might be a race and this handle may already
269 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-struct-file close bookkeeping: drop group lock and lease if held,
 * decrement the per-mode open counts, and call ll_md_real_close() only
 * when no matching cached OPEN lock lets us skip talking to the MDS.
 * Frees the file's ll_file_data and closes the capability.
 * NOTE(review): lines are elided in this view (e.g. lockmode setup and
 * several closing braces are not visible).
 */
276 static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
279 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
280 struct ll_inode_info *lli = ll_i2info(inode);
284 /* clear group lock, if present */
285 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
286 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
288 if (fd->fd_lease_och != NULL) {
291 /* Usually the lease is not released when the
292 * application crashed, we need to release here. */
293 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
294 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
295 PFID(&lli->lli_fid), rc, lease_broken);
297 fd->fd_lease_och = NULL;
300 if (fd->fd_och != NULL) {
301 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
306 /* Let's see if we have good enough OPEN lock on the file and if
307 we can skip talking to MDS */
308 if (file->f_dentry->d_inode) { /* Can this ever be false? */
310 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
311 struct lustre_handle lockh;
312 struct inode *inode = file->f_dentry->d_inode;
313 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
315 mutex_lock(&lli->lli_och_mutex);
316 if (fd->fd_omode & FMODE_WRITE) {
318 LASSERT(lli->lli_open_fd_write_count);
319 lli->lli_open_fd_write_count--;
320 } else if (fd->fd_omode & FMODE_EXEC) {
322 LASSERT(lli->lli_open_fd_exec_count);
323 lli->lli_open_fd_exec_count--;
326 LASSERT(lli->lli_open_fd_read_count);
327 lli->lli_open_fd_read_count--;
329 mutex_unlock(&lli->lli_och_mutex);
/* No matching OPEN lock cached: must send the real close to the MDS. */
331 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
332 LDLM_IBITS, &policy, lockmode,
334 rc = ll_md_real_close(file->f_dentry->d_inode,
338 CERROR("Releasing a file %p with negative dentry %p. Name %s",
339 file, file->f_dentry, file->f_dentry->d_name.name);
343 LUSTRE_FPRIVATE(file) = NULL;
344 ll_file_data_put(fd);
345 ll_capa_close(inode);
350 /* While this returns an error code, fput() the caller does not, so we need
351 * to make every effort to clean up all of our state here. Also, applications
352 * rarely check close errors and even if an error is returned they will not
353 * re-try the close call.
/*
 * VFS ->release() entry point: stop statahead if this fd started it,
 * clear async write errors for regular files, then do the MD close.
 * The filesystem root gets a short-circuit path (no MD close).
 * NOTE(review): lines are elided in this view.
 */
355 int ll_file_release(struct inode *inode, struct file *file)
357 struct ll_file_data *fd;
358 struct ll_sb_info *sbi = ll_i2sbi(inode);
359 struct ll_inode_info *lli = ll_i2info(inode);
363 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
364 PFID(ll_inode2fid(inode)), inode);
366 #ifdef CONFIG_FS_POSIX_ACL
/* Remote-client ACL state is tracked per-pid on the root inode only. */
367 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
368 inode == inode->i_sb->s_root->d_inode) {
369 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
372 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
373 fd->fd_flags &= ~LL_FILE_RMTACL;
374 rct_del(&sbi->ll_rct, current_pid());
375 et_search_free(&sbi->ll_et, current_pid());
380 if (inode->i_sb->s_root != file->f_dentry)
381 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
382 fd = LUSTRE_FPRIVATE(file);
385 /* The last ref on @file, maybe not the the owner pid of statahead.
386 * Different processes can open the same dir, "ll_opendir_key" means:
387 * it is me that should stop the statahead thread. */
388 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
389 lli->lli_opendir_pid != 0)
390 ll_stop_statahead(inode, lli->lli_opendir_key);
392 if (inode->i_sb->s_root == file->f_dentry) {
393 LUSTRE_FPRIVATE(file) = NULL;
394 ll_file_data_put(fd);
398 if (!S_ISDIR(inode->i_mode)) {
399 if (lli->lli_clob != NULL)
400 lov_read_and_clear_async_rc(lli->lli_clob);
401 lli->lli_async_rc = 0;
404 rc = ll_md_close(sbi->ll_md_exp, inode, file);
406 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
407 libcfs_debug_dumplog();
/*
 * Send an IT_OPEN intent to the MDS for @file (used e.g. by NFS export
 * paths and when a cached open handle disappeared).  @lmm/@lmmsize
 * non-zero means we are only setting stripe info, so no OPEN lock is
 * requested.  On success the inode is refreshed from the reply.
 * NOTE(review): lines are elided in this view.
 */
412 static int ll_intent_file_open(struct file *file, void *lmm,
413 int lmmsize, struct lookup_intent *itp)
415 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
416 struct dentry *parent = file->f_dentry->d_parent;
417 const char *name = file->f_dentry->d_name.name;
418 const int len = file->f_dentry->d_name.len;
419 struct md_op_data *op_data;
420 struct ptlrpc_request *req;
421 __u32 opc = LUSTRE_OPC_ANY;
428 /* Usually we come here only for NFSD, and we want open lock.
429 But we can also get here with pre 2.6.15 patchless kernels, and in
430 that case that lock is also ok */
431 /* We can also get here if there was cached open handle in revalidate_it
432 * but it disappeared while we were getting from there to ll_file_open.
433 * But this means this file was closed and immediatelly opened which
434 * makes a good candidate for using OPEN lock */
435 /* If lmmsize & lmm are not 0, we are just setting stripe info
436 * parameters. No need for the open lock */
437 if (lmm == NULL && lmmsize == 0) {
438 itp->it_flags |= MDS_OPEN_LOCK;
439 if (itp->it_flags & FMODE_WRITE)
440 opc = LUSTRE_OPC_CREATE;
443 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
444 file->f_dentry->d_inode, name, len,
447 RETURN(PTR_ERR(op_data));
449 itp->it_flags |= MDS_OPEN_BY_FID;
450 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
451 0 /*unused */, &req, ll_md_blocking_ast, 0);
452 ll_finish_md_op_data(op_data);
454 /* reason for keep own exit path - don`t flood log
455 * with messages with -ESTALE errors.
457 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
458 it_open_error(DISP_OPEN_OPEN, itp))
460 ll_release_openhandle(file->f_dentry, itp);
464 if (it_disposition(itp, DISP_LOOKUP_NEG))
465 GOTO(out, rc = -ENOENT);
467 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
468 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
469 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
473 rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
474 if (!rc && itp->d.lustre.it_lock_mode)
475 ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
479 ptlrpc_req_finished(req);
480 ll_intent_drop_lock(itp);
486 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
487 * not believe attributes if a few ioepoch holders exist. Attributes for
488 * previous ioepoch if new one is opened are also skipped by MDS.
/* Record a newly granted IO epoch on the inode (0 means "no epoch"). */
490 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
492 if (ioepoch && lli->lli_ioepoch != ioepoch) {
493 lli->lli_ioepoch = ioepoch;
494 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
495 ioepoch, PFID(&lli->lli_fid));
/*
 * Fill @och from the server reply carried in the intent: open handle,
 * fid, lease lock cookie and open flags; then register the handle for
 * open replay.  Returns md_set_open_replay_data()'s result.
 */
499 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
500 struct obd_client_handle *och)
502 struct ptlrpc_request *req = it->d.lustre.it_data;
503 struct mdt_body *body;
505 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
506 och->och_fh = body->handle;
507 och->och_fid = body->fid1;
508 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
509 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
510 och->och_flags = it->it_flags;
512 return md_set_open_replay_data(md_exp, och, it);
/*
 * Finish a local open: optionally fill @och from the intent reply and
 * record the reply's IO epoch, then attach @fd as the file's private
 * data and initialise readahead state and the effective open mode.
 * NOTE(review): lines are elided in this view (the condition guarding
 * the och-fill section is not visible).
 */
515 static int ll_local_open(struct file *file, struct lookup_intent *it,
516 struct ll_file_data *fd, struct obd_client_handle *och)
518 struct inode *inode = file->f_dentry->d_inode;
519 struct ll_inode_info *lli = ll_i2info(inode);
522 LASSERT(!LUSTRE_FPRIVATE(file));
527 struct ptlrpc_request *req = it->d.lustre.it_data;
528 struct mdt_body *body;
531 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
535 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
536 ll_ioepoch_open(lli, body->ioepoch);
539 LUSTRE_FPRIVATE(file) = fd;
540 ll_readahead_init(inode, &fd->fd_ras);
541 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
546 /* Open a file, and (for the very first open) create objects on the OSTs at
547 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
548 * creation or open until ll_lov_setstripe() ioctl is called.
550 * If we already have the stripe MD locally then we don't request it in
551 * md_open(), by passing a lmm_size = 0.
553 * It is up to the application to ensure no other processes open this file
554 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
555 * used. We might be able to avoid races of that sort by getting lli_open_sem
556 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
557 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/*
 * VFS ->open() entry point.  Reuses an existing per-mode MDS open
 * handle when present, otherwise sends an IT_OPEN intent; directories
 * additionally arm the statahead key.
 * NOTE(review): many lines are elided in this view; labels, RETURNs
 * and several braces are not visible.
 */
559 int ll_file_open(struct inode *inode, struct file *file)
561 struct ll_inode_info *lli = ll_i2info(inode);
562 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
563 .it_flags = file->f_flags };
564 struct obd_client_handle **och_p = NULL;
565 __u64 *och_usecount = NULL;
566 struct ll_file_data *fd;
567 int rc = 0, opendir_set = 0;
570 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
571 PFID(ll_inode2fid(inode)), inode, file->f_flags);
573 it = file->private_data; /* XXX: compat macro */
574 file->private_data = NULL; /* prevent ll_local_open assertion */
576 fd = ll_file_data_get();
578 GOTO(out_openerr, rc = -ENOMEM);
/* First opener of a directory becomes the statahead owner. */
581 if (S_ISDIR(inode->i_mode)) {
582 spin_lock(&lli->lli_sa_lock);
583 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
584 lli->lli_opendir_pid == 0) {
585 lli->lli_opendir_key = fd;
586 lli->lli_opendir_pid = current_pid();
589 spin_unlock(&lli->lli_sa_lock);
/* Opening the filesystem root needs no MDS open. */
592 if (inode->i_sb->s_root == file->f_dentry) {
593 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup: build our own open intent. */
597 if (!it || !it->d.lustre.it_disposition) {
598 /* Convert f_flags into access mode. We cannot use file->f_mode,
599 * because everything but O_ACCMODE mask was stripped from
601 if ((oit.it_flags + 1) & O_ACCMODE)
603 if (file->f_flags & O_TRUNC)
604 oit.it_flags |= FMODE_WRITE;
606 /* kernel only call f_op->open in dentry_open. filp_open calls
607 * dentry_open after call to open_namei that checks permissions.
608 * Only nfsd_open call dentry_open directly without checking
609 * permissions and because of that this code below is safe. */
610 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
611 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
613 /* We do not want O_EXCL here, presumably we opened the file
614 * already? XXX - NFS implications? */
615 oit.it_flags &= ~O_EXCL;
617 /* bug20584, if "it_flags" contains O_CREAT, the file will be
618 * created if necessary, then "IT_CREAT" should be set to keep
619 * consistent with it */
620 if (oit.it_flags & O_CREAT)
621 oit.it_op |= IT_CREAT;
627 /* Let's see if we have file open on MDS already. */
628 if (it->it_flags & FMODE_WRITE) {
629 och_p = &lli->lli_mds_write_och;
630 och_usecount = &lli->lli_open_fd_write_count;
631 } else if (it->it_flags & FMODE_EXEC) {
632 och_p = &lli->lli_mds_exec_och;
633 och_usecount = &lli->lli_open_fd_exec_count;
635 och_p = &lli->lli_mds_read_och;
636 och_usecount = &lli->lli_open_fd_read_count;
639 mutex_lock(&lli->lli_och_mutex);
640 if (*och_p) { /* Open handle is present */
641 if (it_disposition(it, DISP_OPEN_OPEN)) {
642 /* Well, there's extra open request that we do not need,
643 let's close it somehow. This will decref request. */
644 rc = it_open_error(DISP_OPEN_OPEN, it);
646 mutex_unlock(&lli->lli_och_mutex);
647 GOTO(out_openerr, rc);
650 ll_release_openhandle(file->f_dentry, it);
654 rc = ll_local_open(file, it, fd, NULL);
657 mutex_unlock(&lli->lli_och_mutex);
658 GOTO(out_openerr, rc);
661 LASSERT(*och_usecount == 0);
662 if (!it->d.lustre.it_disposition) {
663 /* We cannot just request lock handle now, new ELC code
664 means that one of other OPEN locks for this file
665 could be cancelled, and since blocking ast handler
666 would attempt to grab och_mutex as well, that would
667 result in a deadlock */
668 mutex_unlock(&lli->lli_och_mutex);
669 it->it_create_mode |= M_CHECK_STALE;
670 rc = ll_intent_file_open(file, NULL, 0, it);
671 it->it_create_mode &= ~M_CHECK_STALE;
673 GOTO(out_openerr, rc);
677 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
679 GOTO(out_och_free, rc = -ENOMEM);
683 /* md_intent_lock() didn't get a request ref if there was an
684 * open error, so don't do cleanup on the request here
686 /* XXX (green): Should not we bail out on any error here, not
687 * just open error? */
688 rc = it_open_error(DISP_OPEN_OPEN, it);
690 GOTO(out_och_free, rc);
692 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
693 "inode %p: disposition %x, status %d\n", inode,
694 it_disposition(it, ~0), it->d.lustre.it_status);
696 rc = ll_local_open(file, it, fd, *och_p);
698 GOTO(out_och_free, rc);
700 mutex_unlock(&lli->lli_och_mutex);
703 /* Must do this outside lli_och_mutex lock to prevent deadlock where
704 different kind of OPEN lock for this same inode gets cancelled
705 by ldlm_cancel_lru */
706 if (!S_ISREG(inode->i_mode))
707 GOTO(out_och_free, rc);
/* Delay OST object creation for O_LOV_DELAY_CREATE / read-only opens. */
711 if (!lli->lli_has_smd &&
712 (cl_is_lov_delay_create(file->f_flags) ||
713 (file->f_mode & FMODE_WRITE) == 0)) {
714 CDEBUG(D_INODE, "object creation was delayed\n");
715 GOTO(out_och_free, rc);
717 cl_lov_delay_create_clear(&file->f_flags);
718 GOTO(out_och_free, rc);
722 if (och_p && *och_p) {
723 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
724 *och_p = NULL; /* OBD_FREE writes some magic there */
727 mutex_unlock(&lli->lli_och_mutex);
730 if (opendir_set != 0)
731 ll_stop_statahead(inode, lli->lli_opendir_key);
733 ll_file_data_put(fd);
735 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
738 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
739 ptlrpc_req_finished(it->d.lustre.it_data);
740 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * LDLM blocking AST for lease locks: on LDLM_CB_BLOCKING cancel the
 * lock asynchronously; the LDLM_CB_CANCELING arm is not visible here.
 * NOTE(review): the switch body is heavily elided in this view.
 */
746 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
747 struct ldlm_lock_desc *desc, void *data, int flag)
750 struct lustre_handle lockh;
754 case LDLM_CB_BLOCKING:
755 ldlm_lock2handle(lock, &lockh);
756 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
758 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
762 case LDLM_CB_CANCELING:
770 * Acquire a lease and open the file.
/*
 * ll_lease_open(): take a FMODE_READ or FMODE_WRITE lease on @inode,
 * optionally reusing the existing MDS open handle of @file when this
 * process is its sole opener.  Returns the new obd_client_handle or an
 * ERR_PTR; on failure after open, the handle is closed and the open
 * lock cancelled.
 * NOTE(review): many lines are elided in this view.
 */
772 static struct obd_client_handle *
773 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
776 struct lookup_intent it = { .it_op = IT_OPEN };
777 struct ll_sb_info *sbi = ll_i2sbi(inode);
778 struct md_op_data *op_data;
779 struct ptlrpc_request *req;
780 struct lustre_handle old_handle = { 0 };
781 struct obd_client_handle *och = NULL;
786 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
787 RETURN(ERR_PTR(-EINVAL));
790 struct ll_inode_info *lli = ll_i2info(inode);
791 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
792 struct obd_client_handle **och_p;
/* Lease mode must be a subset of the file's open mode; exec is out. */
795 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
796 RETURN(ERR_PTR(-EPERM));
798 /* Get the openhandle of the file */
800 mutex_lock(&lli->lli_och_mutex);
801 if (fd->fd_lease_och != NULL) {
802 mutex_unlock(&lli->lli_och_mutex);
806 if (fd->fd_och == NULL) {
807 if (file->f_mode & FMODE_WRITE) {
808 LASSERT(lli->lli_mds_write_och != NULL);
809 och_p = &lli->lli_mds_write_och;
810 och_usecount = &lli->lli_open_fd_write_count;
812 LASSERT(lli->lli_mds_read_och != NULL);
813 och_p = &lli->lli_mds_read_och;
814 och_usecount = &lli->lli_open_fd_read_count;
816 if (*och_usecount == 1) {
823 mutex_unlock(&lli->lli_och_mutex);
824 if (rc < 0) /* more than 1 opener */
827 LASSERT(fd->fd_och != NULL);
828 old_handle = fd->fd_och->och_fh;
833 RETURN(ERR_PTR(-ENOMEM));
835 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
836 LUSTRE_OPC_ANY, NULL);
838 GOTO(out, rc = PTR_ERR(op_data));
840 /* To tell the MDT this openhandle is from the same owner */
841 op_data->op_handle = old_handle;
843 it.it_flags = fmode | open_flags;
844 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
845 rc = md_intent_lock(sbi->ll_md_exp, op_data, NULL, 0, &it, 0, &req,
846 ll_md_blocking_lease_ast,
847 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
848 * it can be cancelled which may mislead applications that the lease is
850 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
851 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
852 * doesn't deal with openhandle, so normal openhandle will be leaked. */
853 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
854 ll_finish_md_op_data(op_data);
855 ptlrpc_req_finished(req);
857 GOTO(out_release_it, rc);
859 if (it_disposition(&it, DISP_LOOKUP_NEG))
860 GOTO(out_release_it, rc = -ENOENT);
862 rc = it_open_error(DISP_OPEN_OPEN, &it);
864 GOTO(out_release_it, rc);
866 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
867 ll_och_fill(sbi->ll_md_exp, &it, och);
869 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
870 GOTO(out_close, rc = -EOPNOTSUPP);
872 /* already get lease, handle lease lock */
873 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
874 if (it.d.lustre.it_lock_mode == 0 ||
875 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
876 /* open lock must return for lease */
877 CERROR(DFID "lease granted but no open lock, %d/"LPU64".\n",
878 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
879 it.d.lustre.it_lock_bits);
880 GOTO(out_close, rc = -EPROTO);
883 ll_intent_release(&it);
887 /* Cancel open lock */
888 if (it.d.lustre.it_lock_mode != 0) {
889 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
890 it.d.lustre.it_lock_mode);
891 it.d.lustre.it_lock_mode = 0;
892 och->och_lease_handle.cookie = 0ULL;
894 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
896 CERROR("%s: error closing file "DFID": %d\n",
897 ll_get_fsname(inode->i_sb, NULL, 0),
898 PFID(&ll_i2info(inode)->lli_fid), rc2);
899 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
901 ll_intent_release(&it);
909 * Release lease and close the file.
910 * It will check if the lease has ever broken.
/*
 * ll_lease_close(): cancel the lease lock (unless already cancelled),
 * report via *lease_broken whether it was broken, then close the MDS
 * open handle.  NOTE(review): some lines are elided in this view.
 */
912 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
915 struct ldlm_lock *lock;
916 bool cancelled = true;
920 lock = ldlm_handle2lock(&och->och_lease_handle);
922 lock_res_and_lock(lock);
923 cancelled = ldlm_is_cancel(lock);
924 unlock_res_and_lock(lock);
928 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
929 PFID(&ll_i2info(inode)->lli_fid), cancelled);
932 ldlm_cli_cancel(&och->och_lease_handle, 0);
933 if (lease_broken != NULL)
934 *lease_broken = cancelled;
936 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
941 /* Fills the obdo with the attributes for the lsm */
/*
 * Issue an async OST getattr for @lsm into @obdo and wait for it.
 * @dv_flags may request server-side read/write flush (OBD_FL_SRVLOCK)
 * for data-version purposes.  NOTE(review): lines are elided.
 */
942 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
943 struct obd_capa *capa, struct obdo *obdo,
944 __u64 ioepoch, int dv_flags)
946 struct ptlrpc_request_set *set;
947 struct obd_info oinfo = { { { 0 } } };
952 LASSERT(lsm != NULL);
956 oinfo.oi_oa->o_oi = lsm->lsm_oi;
957 oinfo.oi_oa->o_mode = S_IFREG;
958 oinfo.oi_oa->o_ioepoch = ioepoch;
959 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
960 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
961 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
962 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
963 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
964 OBD_MD_FLDATAVERSION;
965 oinfo.oi_capa = capa;
966 if (dv_flags & (LL_DV_WR_FLUSH | LL_DV_RD_FLUSH)) {
967 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
968 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
969 if (dv_flags & LL_DV_WR_FLUSH)
970 oinfo.oi_oa->o_flags |= OBD_FL_FLUSH;
973 set = ptlrpc_prep_set();
975 CERROR("can't allocate ptlrpc set\n");
978 rc = obd_getattr_async(exp, &oinfo, set);
980 rc = ptlrpc_set_wait(set);
981 ptlrpc_set_destroy(set);
/* Keep only the attribute bits the caller can trust from the OSTs. */
984 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
985 OBD_MD_FLATIME | OBD_MD_FLMTIME |
986 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
987 OBD_MD_FLDATAVERSION | OBD_MD_FLFLAGS);
988 if (dv_flags & LL_DV_WR_FLUSH &&
989 !(oinfo.oi_oa->o_valid & OBD_MD_FLFLAGS &&
990 oinfo.oi_oa->o_flags & OBD_FL_FLUSH))
997 * Performs the getattr on the inode and updates its fields.
998 * If @sync != 0, perform the getattr under the server-side lock.
1000 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
1001 __u64 ioepoch, int sync)
1003 struct obd_capa *capa = ll_mdscapa_get(inode);
1004 struct lov_stripe_md *lsm;
1008 lsm = ccc_inode_lsm_get(inode);
1009 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
1010 capa, obdo, ioepoch, sync ? LL_DV_RD_FLUSH : 0);
1013 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
/* Fold the fetched OST attributes back into the VFS inode. */
1015 obdo_refresh_inode(inode, obdo, obdo->o_valid);
1016 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
1017 " blksize %lu\n", POSTID(oi), i_size_read(inode),
1018 (unsigned long long)inode->i_blocks,
1019 (unsigned long)ll_inode_blksize(inode));
1021 ccc_inode_lsm_put(inode, lsm);
/*
 * Merge MDS-provided timestamps with OST-side attributes (cl_object
 * attr) into the VFS inode, taking the larger of each timestamp pair,
 * and update i_size/i_blocks -- all under the inode size lock.
 * NOTE(review): some lines are elided in this view.
 */
1025 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
1027 struct ll_inode_info *lli = ll_i2info(inode);
1028 struct cl_object *obj = lli->lli_clob;
1029 struct cl_attr *attr = ccc_env_thread_attr(env);
1035 ll_inode_size_lock(inode);
1036 /* merge timestamps the most recently obtained from mds with
1037 timestamps obtained from osts */
1038 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
1039 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
1040 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
1041 inode_init_lvb(inode, &lvb);
1043 cl_object_attr_lock(obj);
1044 rc = cl_object_attr_get(env, obj, attr);
1045 cl_object_attr_unlock(obj);
/* Each timestamp takes whichever side (MDS or OST) is newer. */
1048 if (lvb.lvb_atime < attr->cat_atime)
1049 lvb.lvb_atime = attr->cat_atime;
1050 if (lvb.lvb_ctime < attr->cat_ctime)
1051 lvb.lvb_ctime = attr->cat_ctime;
1052 if (lvb.lvb_mtime < attr->cat_mtime)
1053 lvb.lvb_mtime = attr->cat_mtime;
1055 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
1056 PFID(&lli->lli_fid), attr->cat_size);
1057 cl_isize_write_nolock(inode, attr->cat_size);
1059 inode->i_blocks = attr->cat_blocks;
1061 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1062 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1063 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1065 ll_inode_size_unlock(inode);
/*
 * Glimpse helper for ioctls: fetch OST attributes for @lsm and copy
 * size/blocks/times into the caller-supplied stat structure @st.
 * NOTE(review): lines are elided (rc check/return not visible).
 */
1070 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1073 struct obdo obdo = { 0 };
1076 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1078 st->st_size = obdo.o_size;
1079 st->st_blocks = obdo.o_blocks;
1080 st->st_mtime = obdo.o_mtime;
1081 st->st_atime = obdo.o_atime;
1082 st->st_ctime = obdo.o_ctime;
/*
 * Decide whether atime updates should be suppressed for @file, checking
 * the same conditions as the kernel's file_accessed()/touch_atime():
 * O_NOATIME, S_NOATIME, per-mount and per-sb noatime/nodiratime flags.
 * NOTE(review): return statements are elided in this view.
 */
1087 static bool file_is_noatime(const struct file *file)
1089 const struct vfsmount *mnt = file->f_path.mnt;
1090 const struct inode *inode = file->f_path.dentry->d_inode;
1092 /* Adapted from file_accessed() and touch_atime().*/
1093 if (file->f_flags & O_NOATIME)
1096 if (inode->i_flags & S_NOATIME)
1099 if (IS_NOATIME(inode))
1102 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1105 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1108 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/*
 * Initialise a cl_io for @file: nonblock/append/sync flags from f_flags,
 * lock policy (never for nolock files, mandatory for O_APPEND writes,
 * otherwise maybe), and the noatime decision.
 */
1114 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
1116 struct inode *inode = file->f_dentry->d_inode;
1118 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1120 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1121 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1122 file->f_flags & O_DIRECT ||
1125 io->ci_obj = ll_i2info(inode)->lli_clob;
1126 io->ci_lockreq = CILR_MAYBE;
1127 if (ll_file_nolock(file)) {
1128 io->ci_lockreq = CILR_NEVER;
1129 io->ci_no_srvlock = 1;
1130 } else if (file->f_flags & O_APPEND) {
1131 io->ci_lockreq = CILR_MANDATORY;
1134 io->ci_noatime = file_is_noatime(file);
/*
 * Common read/write engine: set up a cl_io for @iot (CIT_READ or
 * CIT_WRITE) over [*ppos, *ppos+count), run cl_io_loop(), and account
 * bytes via lprocfs.  Writes outside group locks serialise on
 * lli_write_mutex; normal IO also holds lli_trunc_sem for reading.
 * NOTE(review): many lines are elided (restart path, out label, final
 * RETURN are not visible); do not infer exact flow from this view.
 */
1138 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1139 struct file *file, enum cl_io_type iot,
1140 loff_t *ppos, size_t count)
1142 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
1143 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1148 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zd\n",
1149 file->f_dentry->d_name.name, iot, *ppos, count);
1152 io = ccc_env_thread_io(env);
1153 ll_io_init(io, file, iot == CIT_WRITE);
1155 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1156 struct vvp_io *vio = vvp_env_io(env);
1157 struct ccc_io *cio = ccc_env_io(env);
1158 int write_mutex_locked = 0;
1160 cio->cui_fd = LUSTRE_FPRIVATE(file);
1161 vio->cui_io_subtype = args->via_io_subtype;
1163 switch (vio->cui_io_subtype) {
1165 cio->cui_iov = args->u.normal.via_iov;
1166 cio->cui_nrsegs = args->u.normal.via_nrsegs;
1167 cio->cui_tot_nrsegs = cio->cui_nrsegs;
1168 cio->cui_iocb = args->u.normal.via_iocb;
1169 if ((iot == CIT_WRITE) &&
1170 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1171 if (mutex_lock_interruptible(&lli->
1173 GOTO(out, result = -ERESTARTSYS);
1174 write_mutex_locked = 1;
1176 down_read(&lli->lli_trunc_sem);
1179 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1180 vio->u.splice.cui_flags = args->u.splice.via_flags;
1183 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
1186 result = cl_io_loop(env, io);
1187 if (args->via_io_subtype == IO_NORMAL)
1188 up_read(&lli->lli_trunc_sem);
1189 if (write_mutex_locked)
1190 mutex_unlock(&lli->lli_write_mutex);
1192 /* cl_io_rw_init() handled IO */
1193 result = io->ci_result;
1196 if (io->ci_nob > 0) {
1197 result = io->ci_nob;
1198 *ppos = io->u.ci_wr.wr.crw_pos;
1202 cl_io_fini(env, io);
1203 /* If any bit been read/written (result != 0), we just return
1204 * short read/write instead of restart io. */
1205 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1206 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
1207 iot == CIT_READ ? "read" : "write",
1208 file->f_dentry->d_name.name, *ppos, count);
1209 LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
1213 if (iot == CIT_READ) {
1215 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1216 LPROC_LL_READ_BYTES, result);
1217 } else if (iot == CIT_WRITE) {
1219 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1220 LPROC_LL_WRITE_BYTES, result);
1221 fd->fd_write_failed = false;
1222 } else if (result != -ERESTARTSYS) {
1223 fd->fd_write_failed = true;
1226 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1233 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate a user-supplied iovec array and compute the total byte count.
 * Rejects negative segment lengths and cumulative-length wraparound, and
 * truncates the segment list at the first inaccessible user buffer.
 */
1235 static int ll_file_get_iov_count(const struct iovec *iov,
1236 unsigned long *nr_segs, size_t *count)
1241 for (seg = 0; seg < *nr_segs; seg++) {
1242 const struct iovec *iv = &iov[seg];
1245 * If any segment has a negative length, or the cumulative
1246 * length ever wraps negative then return -EINVAL.
/* OR-ing cnt with iov_len catches a negative value in either. */
1249 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1251 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1256 cnt -= iv->iov_len; /* This segment is no good */
/*
 * AIO read entry point: validate the iovec, package it into vvp_io_args
 * and run the generic IO engine at iocb->ki_pos (CIT_READ).
 */
1263 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1264 unsigned long nr_segs, loff_t pos)
1267 struct vvp_io_args *args;
1273 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1277 env = cl_env_get(&refcheck);
1279 RETURN(PTR_ERR(env));
1281 args = vvp_env_args(env, IO_NORMAL);
/* cast drops const: the iovec is not modified on the read path */
1282 args->u.normal.via_iov = (struct iovec *)iov;
1283 args->u.normal.via_nrsegs = nr_segs;
1284 args->u.normal.via_iocb = iocb;
1286 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1287 &iocb->ki_pos, count);
1288 cl_env_put(env, &refcheck);
/*
 * Synchronous read(): wrap the user buffer in a single-segment iovec and
 * a sync kiocb from the per-thread environment, then reuse the AIO path.
 */
1292 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1296 struct iovec *local_iov;
1297 struct kiocb *kiocb;
1302 env = cl_env_get(&refcheck);
1304 RETURN(PTR_ERR(env));
1306 local_iov = &vvp_env_info(env)->vti_local_iov;
1307 kiocb = &vvp_env_info(env)->vti_kiocb;
1308 local_iov->iov_base = (void __user *)buf;
1309 local_iov->iov_len = count;
1310 init_sync_kiocb(kiocb, file);
1311 kiocb->ki_pos = *ppos;
/* field name for the remaining-byte count differs across kernels */
1312 #ifdef HAVE_KIOCB_KI_LEFT
1313 kiocb->ki_left = count;
1315 kiocb->ki_nbytes = count;
1318 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
/* propagate the advanced position back to the caller */
1319 *ppos = kiocb->ki_pos;
1321 cl_env_put(env, &refcheck);
1326 * Write to a file (through the page cache).
/*
 * AIO write entry point: mirror of ll_file_aio_read() with CIT_WRITE.
 */
1329 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1330 unsigned long nr_segs, loff_t pos)
1333 struct vvp_io_args *args;
1339 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1343 env = cl_env_get(&refcheck);
1345 RETURN(PTR_ERR(env));
1347 args = vvp_env_args(env, IO_NORMAL);
1348 args->u.normal.via_iov = (struct iovec *)iov;
1349 args->u.normal.via_nrsegs = nr_segs;
1350 args->u.normal.via_iocb = iocb;
1352 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1353 &iocb->ki_pos, count);
1354 cl_env_put(env, &refcheck);
/*
 * Synchronous write(): single-segment iovec + sync kiocb wrapper around
 * ll_file_aio_write(), mirroring ll_file_read().
 */
1358 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1362 struct iovec *local_iov;
1363 struct kiocb *kiocb;
1368 env = cl_env_get(&refcheck);
1370 RETURN(PTR_ERR(env));
1372 local_iov = &vvp_env_info(env)->vti_local_iov;
1373 kiocb = &vvp_env_info(env)->vti_kiocb;
1374 local_iov->iov_base = (void __user *)buf;
1375 local_iov->iov_len = count;
1376 init_sync_kiocb(kiocb, file);
1377 kiocb->ki_pos = *ppos;
/* field name for the remaining-byte count differs across kernels */
1378 #ifdef HAVE_KIOCB_KI_LEFT
1379 kiocb->ki_left = count;
1381 kiocb->ki_nbytes = count;
1384 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
/* propagate the advanced position back to the caller */
1385 *ppos = kiocb->ki_pos;
1387 cl_env_put(env, &refcheck);
1392 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read handler: run the generic engine with IO_SPLICE args so the
 * data is fed into @pipe instead of a user iovec.
 */
1394 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1395 struct pipe_inode_info *pipe, size_t count,
1399 struct vvp_io_args *args;
1404 env = cl_env_get(&refcheck);
1406 RETURN(PTR_ERR(env));
1408 args = vvp_env_args(env, IO_SPLICE);
1409 args->u.splice.via_pipe = pipe;
1410 args->u.splice.via_flags = flags;
1412 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1413 cl_env_put(env, &refcheck);
/*
 * Ask the data export to recreate an OST object for this inode: clone the
 * inode's striping (lsm), fill an obdo with the target object id/index and
 * the parent FID, then issue obd_create() under the inode size lock.
 */
1417 static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
1420 struct obd_export *exp = ll_i2dtexp(inode);
1421 struct obd_trans_info oti = { 0 };
1422 struct obdo *oa = NULL;
1425 struct lov_stripe_md *lsm = NULL, *lsm2;
1432 lsm = ccc_inode_lsm_get(inode);
/* nothing to recreate if the file has no stripe objects */
1433 if (!lsm_has_objects(lsm))
1434 GOTO(out, rc = -ENOENT);
1436 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1437 (lsm->lsm_stripe_count));
1439 OBD_ALLOC_LARGE(lsm2, lsm_size);
1441 GOTO(out, rc = -ENOMEM);
/* o_nlink carries the target OST index for OBD_FL_RECREATE_OBJS */
1444 oa->o_nlink = ost_idx;
1445 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1446 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1447 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1448 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1449 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1450 memcpy(lsm2, lsm, lsm_size);
1451 ll_inode_size_lock(inode);
1452 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1453 ll_inode_size_unlock(inode);
1455 OBD_FREE_LARGE(lsm2, lsm_size);
1458 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_RECREATE_OBJ: recreate an object given a user-supplied
 * ll_recreate_obj (object id + OST index).  Root-only.
 */
1463 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1465 struct ll_recreate_obj ucreat;
1469 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1472 if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
/* build an ost_id in the MDT0 sequence from the raw object id */
1476 ostid_set_seq_mdt0(&oi);
1477 ostid_set_id(&oi, ucreat.lrc_id);
1478 RETURN(ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx));
/*
 * LL_IOC_RECREATE_FID: recreate an object addressed by FID; the OST index
 * is recovered from bits 16..31 of the FID sequence.  Root-only.
 */
1481 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1488 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1491 if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1494 fid_to_ostid(&fid, &oi);
1495 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1496 RETURN(ll_lov_recreate(inode, &oi, ost_idx));
/*
 * Apply a user lov_user_md striping layout to a file by opening it with an
 * IT_OPEN intent carrying the layout EA.  Fails with -EEXIST if the file is
 * already striped.  The temporary open handle is released before returning.
 */
1499 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1500 __u64 flags, struct lov_user_md *lum,
1503 struct lov_stripe_md *lsm = NULL;
1504 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1508 lsm = ccc_inode_lsm_get(inode);
/* a stripe can only be set once; an existing lsm means EEXIST */
1510 ccc_inode_lsm_put(inode, lsm);
1511 CDEBUG(D_IOCTL, "stripe already exists for inode "DFID"\n",
1512 PFID(ll_inode2fid(inode)));
1513 GOTO(out, rc = -EEXIST);
1516 ll_inode_size_lock(inode);
1517 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1519 GOTO(out_unlock, rc);
1520 rc = oit.d.lustre.it_status;
1522 GOTO(out_req_free, rc);
/* the open was only needed to install the EA; close it again */
1524 ll_release_openhandle(file->f_dentry, &oit);
1527 ll_inode_size_unlock(inode);
1528 ll_intent_release(&oit);
1529 ccc_inode_lsm_put(inode, lsm);
1531 cl_lov_delay_create_clear(&file->f_flags);
1534 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA (striping layout) of @filename from the MDS via
 * md_getattr_name(), validate its magic, and byte-swap it to host endian
 * for userspace on big-endian clients.  On success *lmmp/*lmm_size point
 * into the reply buffer, which stays pinned through *request.
 */
1538 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1539 struct lov_mds_md **lmmp, int *lmm_size,
1540 struct ptlrpc_request **request)
1542 struct ll_sb_info *sbi = ll_i2sbi(inode);
1543 struct mdt_body *body;
1544 struct lov_mds_md *lmm = NULL;
1545 struct ptlrpc_request *req = NULL;
1546 struct md_op_data *op_data;
1549 rc = ll_get_default_mdsize(sbi, &lmmsize);
1553 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1554 strlen(filename), lmmsize,
1555 LUSTRE_OPC_ANY, NULL);
1556 if (IS_ERR(op_data))
1557 RETURN(PTR_ERR(op_data));
1559 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1560 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1561 ll_finish_md_op_data(op_data);
1563 CDEBUG(D_INFO, "md_getattr_name failed "
1564 "on %s: rc %d\n", filename, rc);
1568 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1569 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1571 lmmsize = body->eadatasize;
/* no EA bits valid, or zero-length EA, means no striping present */
1573 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1575 GOTO(out, rc = -ENODATA);
1578 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1579 LASSERT(lmm != NULL);
1581 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1582 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1583 GOTO(out, rc = -EPROTO);
1587 * This is coming from the MDS, so is probably in
1588 * little endian. We convert it to host endian before
1589 * passing it to userspace.
/* only swab on big-endian hosts, where LOV_MAGIC differs from its LE form */
1591 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1594 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1595 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1598 /* if this is called for a directory, skip swabbing the
1599 * per-object entries, which only exist for regular files */
1600 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1601 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1602 if (S_ISREG(body->mode))
1603 lustre_swab_lov_user_md_objects(
1604 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1606 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1607 lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1608 if (S_ISREG(body->mode))
1609 lustre_swab_lov_user_md_objects(
1610 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1617 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA: root-only variant of setstripe that takes a raw
 * lov_user_md (with one ost_data entry) copied from userspace and installs
 * it via ll_lov_setstripe_ea_info() with MDS_OPEN_HAS_OBJS.
 */
1622 static int ll_lov_setea(struct inode *inode, struct file *file,
1625 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1626 struct lov_user_md *lump;
1627 int lum_size = sizeof(struct lov_user_md) +
1628 sizeof(struct lov_user_ost_data);
1632 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1635 OBD_ALLOC_LARGE(lump, lum_size);
1639 if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1640 OBD_FREE_LARGE(lump, lum_size);
1644 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1646 OBD_FREE_LARGE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE: copy the user layout (v1, upgraded to v3 if its
 * magic says so), apply it, then refresh the layout and echo the resulting
 * striping back to userspace via LL_IOC_LOV_GETSTRIPE.
 */
1650 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1653 struct lov_user_md_v3 lumv3;
1654 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1655 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1656 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1658 __u64 flags = FMODE_WRITE;
1661 /* first try with v1 which is smaller than v3 */
1662 lum_size = sizeof(struct lov_user_md_v1);
1663 if (copy_from_user(lumv1, lumv1p, lum_size))
/* v3 magic: re-copy the full v3 structure over the same buffer */
1666 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1667 lum_size = sizeof(struct lov_user_md_v3);
1668 if (copy_from_user(&lumv3, lumv3p, lum_size))
1672 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1674 struct lov_stripe_md *lsm;
/* NOTE(review): put_user() return value is not checked here */
1677 put_user(0, &lumv1p->lmm_stripe_count);
1679 ll_layout_refresh(inode, &gen);
1680 lsm = ccc_inode_lsm_get(inode);
1681 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1682 0, lsm, (void *)arg);
1683 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_LOV_GETSTRIPE: hand the inode's current lsm to the LOV layer,
 * which formats it as a lov_user_md into the user buffer at @arg.
 */
1688 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1690 struct lov_stripe_md *lsm;
1694 lsm = ccc_inode_lsm_get(inode);
1696 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1698 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_GROUP_LOCK: take a cluster-wide group lock with gid @arg on the
 * file and record it in the per-open ll_file_data.  lli_lock guards the
 * fd_flags/fd_grouplock pair against a racing second taker.
 */
1703 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1705 struct ll_inode_info *lli = ll_i2info(inode);
1706 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1707 struct ccc_grouplock grouplock;
1711 if (ll_file_nolock(file))
1712 RETURN(-EOPNOTSUPP);
1714 spin_lock(&lli->lli_lock);
/* only one group lock per file descriptor is allowed */
1715 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1716 CWARN("group lock already existed with gid %lu\n",
1717 fd->fd_grouplock.cg_gid);
1718 spin_unlock(&lli->lli_lock);
1721 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1722 spin_unlock(&lli->lli_lock);
/* enqueue outside the spinlock; may block unless O_NONBLOCK */
1724 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1725 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1729 spin_lock(&lli->lli_lock);
/* re-check under the lock: another thread may have won meanwhile */
1730 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1731 spin_unlock(&lli->lli_lock);
1732 CERROR("another thread just won the race\n");
1733 cl_put_grouplock(&grouplock);
1737 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1738 fd->fd_grouplock = grouplock;
1739 spin_unlock(&lli->lli_lock);
1741 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK: release the group lock with gid @arg previously
 * taken by ll_get_grouplock().  The fd state is cleared under lli_lock and
 * the DLM lock is dropped outside it.
 */
1745 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1747 struct ll_inode_info *lli = ll_i2info(inode);
1748 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1749 struct ccc_grouplock grouplock;
1752 spin_lock(&lli->lli_lock);
1753 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1754 spin_unlock(&lli->lli_lock);
1755 CWARN("no group lock held\n");
1758 LASSERT(fd->fd_grouplock.cg_lock != NULL);
/* the caller must release with the same gid it locked with */
1760 if (fd->fd_grouplock.cg_gid != arg) {
1761 CWARN("group lock %lu doesn't match current id %lu\n",
1762 arg, fd->fd_grouplock.cg_gid);
1763 spin_unlock(&lli->lli_lock);
/* detach the grouplock from the fd before dropping the spinlock */
1767 grouplock = fd->fd_grouplock;
1768 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1769 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1770 spin_unlock(&lli->lli_lock);
1772 cl_put_grouplock(&grouplock);
1773 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1778 * Close inode open handle
1780 * \param dentry [in] dentry which contains the inode
1781 * \param it [in,out] intent which contains open info and result
1784 * \retval <0 failure
1786 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1788 struct inode *inode = dentry->d_inode;
1789 struct obd_client_handle *och;
1795 /* Root ? Do nothing. */
1796 if (dentry->d_inode->i_sb->s_root == dentry)
1799 /* No open handle to close? Move away */
1800 if (!it_disposition(it, DISP_OPEN_OPEN))
1803 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1805 OBD_ALLOC(och, sizeof(*och));
1807 GOTO(out, rc = -ENOMEM);
/* fill the handle from the intent's open reply, then close it on the MDS */
1809 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1811 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1814 /* this one is in place of ll_file_open */
1815 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1816 ptlrpc_req_finished(it->d.lustre.it_data);
1817 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1823 * Get size for inode for which FIEMAP mapping is requested.
1824 * Make the FIEMAP get_info call and returns the result.
1826 static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1829 struct obd_export *exp = ll_i2dtexp(inode);
1830 struct lov_stripe_md *lsm = NULL;
1831 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1832 __u32 vallen = num_bytes;
1836 /* Checks for fiemap flags */
1837 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* report the unsupported flags back to the caller */
1838 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1842 /* Check for FIEMAP_FLAG_SYNC */
1843 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1844 rc = filemap_fdatawrite(inode->i_mapping);
1849 lsm = ccc_inode_lsm_get(inode);
1853 /* If the stripe_count > 1 and the application does not understand
1854 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1856 if (lsm->lsm_stripe_count > 1 &&
1857 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1858 GOTO(out, rc = -EOPNOTSUPP);
/* identify the object set and current size for the OSC/LOV layer */
1860 fm_key.oa.o_oi = lsm->lsm_oi;
1861 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1863 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1864 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1865 /* If filesize is 0, then there would be no objects for mapping */
1866 if (fm_key.oa.o_size == 0) {
1867 fiemap->fm_mapped_extents = 0;
1871 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1873 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1876 CERROR("obd_get_info failed: rc = %d\n", rc);
1879 ccc_inode_lsm_put(inode, lsm);
/*
 * OBD_IOC_FID2PATH: resolve a FID to a pathname via the MDC.  The user
 * passes a getinfo_fid2path header; a reply buffer of gf_pathlen extra
 * bytes is allocated, filled by obd_iocontrol(), and copied back.
 */
1883 int ll_fid2path(struct inode *inode, void *arg)
1885 struct obd_export *exp = ll_i2mdexp(inode);
1886 struct getinfo_fid2path *gfout, *gfin;
/* permitted for CAP_DAC_READ_SEARCH or when the mount allows user fid2path */
1890 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1891 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1894 /* Need to get the buflen */
1895 OBD_ALLOC_PTR(gfin);
1898 if (copy_from_user(gfin, arg, sizeof(*gfin))) {
1903 outsize = sizeof(*gfout) + gfin->gf_pathlen;
1904 OBD_ALLOC(gfout, outsize);
1905 if (gfout == NULL) {
1909 memcpy(gfout, gfin, sizeof(*gfout));
1912 /* Call mdc_iocontrol */
1913 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1917 if (copy_to_user(arg, gfout, outsize))
1921 OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP handler: size the kernel fiemap buffer from the user's
 * fm_extent_count, copy the request (and, if continuing a previous call,
 * its first extent) in, run ll_do_fiemap(), and copy the mapped extents out.
 */
1925 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1927 struct ll_user_fiemap *fiemap_s;
1928 size_t num_bytes, ret_bytes;
1929 unsigned int extent_count;
1932 /* Get the extent count so we can calculate the size of
1933 * required fiemap buffer */
1934 if (get_user(extent_count,
1935 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
/* NOTE(review): extent_count * sizeof(extent) is unchecked here —
 * a huge user value could overflow num_bytes; confirm upstream fix */
1937 num_bytes = sizeof(*fiemap_s) + (extent_count *
1938 sizeof(struct ll_fiemap_extent));
1940 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1941 if (fiemap_s == NULL)
1944 /* get the fiemap value */
1945 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1947 GOTO(error, rc = -EFAULT);
1949 /* If fm_extent_count is non-zero, read the first extent since
1950 * it is used to calculate end_offset and device from previous
1953 if (copy_from_user(&fiemap_s->fm_extents[0],
1954 (char __user *)arg + sizeof(*fiemap_s),
1955 sizeof(struct ll_fiemap_extent)))
1956 GOTO(error, rc = -EFAULT);
1959 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
/* copy back the header plus however many extents were mapped */
1963 ret_bytes = sizeof(struct ll_user_fiemap);
1965 if (extent_count != 0)
1966 ret_bytes += (fiemap_s->fm_mapped_extents *
1967 sizeof(struct ll_fiemap_extent));
1969 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1973 OBD_FREE_LARGE(fiemap_s, num_bytes);
1978 * Read the data_version for inode.
1980 * This value is computed using stripe object version on OST.
1981 * Version is computed using server side locking.
1983 * @param sync if do sync on the OST side;
1985 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1986 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
1988 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
1990 struct lov_stripe_md *lsm = NULL;
1991 struct ll_sb_info *sbi = ll_i2sbi(inode);
1992 struct obdo *obdo = NULL;
1996 /* If no stripe, we consider version is 0. */
1997 lsm = ccc_inode_lsm_get(inode);
1998 if (!lsm_has_objects(lsm)) {
2000 CDEBUG(D_INODE, "No object for inode\n");
2004 OBD_ALLOC_PTR(obdo);
2006 GOTO(out, rc = -ENOMEM);
/* ask the OSTs for the attributes, including the data version */
2008 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, flags);
2010 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
2013 *data_version = obdo->o_data_version;
2019 ccc_inode_lsm_put(inode, lsm);
2024 * Trigger a HSM release request for the provided inode.
2026 int ll_hsm_release(struct inode *inode)
2028 struct cl_env_nest nest;
2030 struct obd_client_handle *och = NULL;
2031 __u64 data_version = 0;
2035 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2036 ll_get_fsname(inode->i_sb, NULL, 0),
2037 PFID(&ll_i2info(inode)->lli_fid));
/* take a write lease so no other client can modify the file while
 * its data objects are being released */
2039 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2041 GOTO(out, rc = PTR_ERR(och));
2043 /* Grab latest data_version and [am]time values */
2044 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2048 env = cl_env_nested_get(&nest);
2050 GOTO(out, rc = PTR_ERR(env));
/* merge OST attributes into the inode before sending the close */
2052 ll_merge_lvb(env, inode);
2053 cl_env_nested_put(&nest, env);
2055 /* Release the file.
2056 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2057 * we still need it to pack l_remote_handle to MDT. */
2058 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
2064 if (och != NULL && !IS_ERR(och)) /* close the file */
2065 ll_lease_close(och, inode, NULL);
/*
 * Scratch state for ll_swap_layouts(): both inodes, their saved [am]times
 * (for optional restore after the swap), and the data-version checks the
 * caller requested for each side.
 */
2070 struct ll_swap_stack {
2071 struct iattr ia1, ia2;
2073 struct inode *inode1, *inode2;
2074 bool check_dv1, check_dv2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS: atomically exchange the striping layouts of two
 * files on the MDT.  Optionally flushes both files under a group lock,
 * verifies data versions have not changed, and restores mtime/atime after
 * the swap if the caller asked to keep them.
 */
2077 static int ll_swap_layouts(struct file *file1, struct file *file2,
2078 struct lustre_swap_layouts *lsl)
2080 struct mdc_swap_layouts msl;
2081 struct md_op_data *op_data;
2084 struct ll_swap_stack *llss = NULL;
2087 OBD_ALLOC_PTR(llss);
2091 llss->inode1 = file1->f_dentry->d_inode;
2092 llss->inode2 = file2->f_dentry->d_inode;
2094 if (!S_ISREG(llss->inode2->i_mode))
2095 GOTO(free, rc = -EINVAL);
/* both files must be writable by the caller and on the same mount */
2097 if (inode_permission(llss->inode1, MAY_WRITE) ||
2098 inode_permission(llss->inode2, MAY_WRITE))
2099 GOTO(free, rc = -EPERM);
2101 if (llss->inode2->i_sb != llss->inode1->i_sb)
2102 GOTO(free, rc = -EXDEV);
2104 /* we use 2 bool because it is easier to swap than 2 bits */
2105 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2106 llss->check_dv1 = true;
2108 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2109 llss->check_dv2 = true;
2111 /* we cannot use lsl->sl_dvX directly because we may swap them */
2112 llss->dv1 = lsl->sl_dv1;
2113 llss->dv2 = lsl->sl_dv2;
2115 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2116 if (rc == 0) /* same file, done! */
/* order the pair by FID so concurrent swaps lock in the same order */
2119 if (rc < 0) { /* sequentialize it */
2120 swap(llss->inode1, llss->inode2);
2122 swap(llss->dv1, llss->dv2);
2123 swap(llss->check_dv1, llss->check_dv2);
2127 if (gid != 0) { /* application asks to flush dirty cache */
2128 rc = ll_get_grouplock(llss->inode1, file1, gid);
2132 rc = ll_get_grouplock(llss->inode2, file2, gid);
2134 ll_put_grouplock(llss->inode1, file1, gid);
2139 /* to be able to restore mtime and atime after swap
2140 * we need to first save them */
2142 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2143 llss->ia1.ia_mtime = llss->inode1->i_mtime;
2144 llss->ia1.ia_atime = llss->inode1->i_atime;
2145 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2146 llss->ia2.ia_mtime = llss->inode2->i_mtime;
2147 llss->ia2.ia_atime = llss->inode2->i_atime;
2148 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2151 /* ultimate check, before swapping the layouts we check if
2152 * dataversion has changed (if requested) */
2153 if (llss->check_dv1) {
2154 rc = ll_data_version(llss->inode1, &dv, 0);
2157 if (dv != llss->dv1)
2158 GOTO(putgl, rc = -EAGAIN);
2161 if (llss->check_dv2) {
2162 rc = ll_data_version(llss->inode2, &dv, 0);
2165 if (dv != llss->dv2)
2166 GOTO(putgl, rc = -EAGAIN);
2169 /* struct md_op_data is used to send the swap args to the mdt
2170 * only flags is missing, so we use struct mdc_swap_layouts
2171 * through the md_op_data->op_data */
2172 /* flags from user space have to be converted before they are send to
2173 * server, no flag is sent today, they are only used on the client */
2176 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2177 0, LUSTRE_OPC_ANY, &msl);
2178 if (IS_ERR(op_data))
2179 GOTO(free, rc = PTR_ERR(op_data));
2181 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2182 sizeof(*op_data), op_data, NULL);
2183 ll_finish_md_op_data(op_data);
2187 ll_put_grouplock(llss->inode2, file2, gid);
2188 ll_put_grouplock(llss->inode1, file1, gid);
2191 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2195 /* clear useless flags */
2196 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2197 llss->ia1.ia_valid &= ~ATTR_MTIME;
2198 llss->ia2.ia_valid &= ~ATTR_MTIME;
2201 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2202 llss->ia1.ia_valid &= ~ATTR_ATIME;
2203 llss->ia2.ia_valid &= ~ATTR_ATIME;
2206 /* update time if requested */
/* note the cross-apply: ia2 (saved from inode2) is restored onto
 * file1 and vice versa, since the layouts (and data) were exchanged */
2208 if (llss->ia2.ia_valid != 0) {
2209 mutex_lock(&llss->inode1->i_mutex);
2210 rc = ll_setattr(file1->f_dentry, &llss->ia2);
2211 mutex_unlock(&llss->inode1->i_mutex);
2214 if (llss->ia1.ia_valid != 0) {
2217 mutex_lock(&llss->inode2->i_mutex);
2218 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2219 mutex_unlock(&llss->inode2->i_mutex);
/*
 * Send an LL_IOC_HSM_STATE_SET request to the MDT with the mask changes in
 * @hss.  Non-root callers may only touch HSM_USER_MASK bits.
 */
2231 static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2233 struct md_op_data *op_data;
2236 /* Non-root users are forbidden to set or clear flags which are
2237 * NOT defined in HSM_USER_MASK. */
2238 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2239 !cfs_capable(CFS_CAP_SYS_ADMIN))
2242 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2243 LUSTRE_OPC_ANY, hss);
2244 if (IS_ERR(op_data))
2245 RETURN(PTR_ERR(op_data))
2247 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2248 sizeof(*op_data), op_data, NULL);
2250 ll_finish_md_op_data(op_data);
/*
 * LL_IOC_HSM_IMPORT: register an existing archived copy as the content of
 * this (regular) file.  Marks it ARCHIVED|EXISTS|RELEASED on the MDT, then
 * forces mode/uid/gid/size/times from the user's hsm_user_import.
 */
2255 static int ll_hsm_import(struct inode *inode, struct file *file,
2256 struct hsm_user_import *hui)
2258 struct hsm_state_set *hss = NULL;
2259 struct iattr *attr = NULL;
2263 if (!S_ISREG(inode->i_mode))
2269 GOTO(out, rc = -ENOMEM);
/* set HSM flags so the file is seen as released-with-archive */
2271 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2272 hss->hss_archive_id = hui->hui_archive_id;
2273 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2274 rc = ll_hsm_state_set(inode, hss);
2278 OBD_ALLOC_PTR(attr);
2280 GOTO(out, rc = -ENOMEM);
/* rebuild the inode attributes from the import descriptor; force a
 * regular-file mode and clamp to permission bits only */
2282 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2283 attr->ia_mode |= S_IFREG;
2284 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2285 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2286 attr->ia_size = hui->hui_size;
2287 attr->ia_mtime.tv_sec = hui->hui_mtime;
2288 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2289 attr->ia_atime.tv_sec = hui->hui_atime;
2290 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2292 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2293 ATTR_UID | ATTR_GID |
2294 ATTR_MTIME | ATTR_MTIME_SET |
2295 ATTR_ATIME | ATTR_ATIME_SET;
2297 rc = ll_setattr_raw(file->f_dentry, attr, true);
/*
 * Main ioctl dispatcher for regular files: routes each LL_IOC_*/FSFILT_*/
 * OBD_IOC_* command to its handler, falling through to the per-module
 * ll_iocontrol_call() chain and then the data export for unknown commands.
 */
2312 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2314 struct inode *inode = file->f_dentry->d_inode;
2315 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2319 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2320 PFID(ll_inode2fid(inode)), inode, cmd);
2321 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2323 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2324 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2328 case LL_IOC_GETFLAGS:
2329 /* Get the current value of the file flags */
2330 return put_user(fd->fd_flags, (int *)arg);
2331 case LL_IOC_SETFLAGS:
2332 case LL_IOC_CLRFLAGS:
2333 /* Set or clear specific file flags */
2334 /* XXX This probably needs checks to ensure the flags are
2335 * not abused, and to handle any flag side effects.
2337 if (get_user(flags, (int *) arg))
2340 if (cmd == LL_IOC_SETFLAGS) {
/* lockless IO is only allowed on O_DIRECT files */
2341 if ((flags & LL_FILE_IGNORE_LOCK) &&
2342 !(file->f_flags & O_DIRECT)) {
2343 CERROR("%s: unable to disable locking on "
2344 "non-O_DIRECT file\n", current->comm);
2348 fd->fd_flags |= flags;
2350 fd->fd_flags &= ~flags;
2353 case LL_IOC_LOV_SETSTRIPE:
2354 RETURN(ll_lov_setstripe(inode, file, arg));
2355 case LL_IOC_LOV_SETEA:
2356 RETURN(ll_lov_setea(inode, file, arg));
2357 case LL_IOC_LOV_SWAP_LAYOUTS: {
2359 struct lustre_swap_layouts lsl;
2361 if (copy_from_user(&lsl, (char *)arg,
2362 sizeof(struct lustre_swap_layouts)))
/* both files must be open for writing to swap their layouts */
2365 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2368 file2 = fget(lsl.sl_fd);
2373 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2374 rc = ll_swap_layouts(file, file2, &lsl);
2378 case LL_IOC_LOV_GETSTRIPE:
2379 RETURN(ll_lov_getstripe(inode, arg));
2380 case LL_IOC_RECREATE_OBJ:
2381 RETURN(ll_lov_recreate_obj(inode, arg));
2382 case LL_IOC_RECREATE_FID:
2383 RETURN(ll_lov_recreate_fid(inode, arg));
2384 case FSFILT_IOC_FIEMAP:
2385 RETURN(ll_ioctl_fiemap(inode, arg));
2386 case FSFILT_IOC_GETFLAGS:
2387 case FSFILT_IOC_SETFLAGS:
2388 RETURN(ll_iocontrol(inode, file, cmd, arg));
2389 case FSFILT_IOC_GETVERSION_OLD:
2390 case FSFILT_IOC_GETVERSION:
2391 RETURN(put_user(inode->i_generation, (int *)arg));
2392 case LL_IOC_GROUP_LOCK:
2393 RETURN(ll_get_grouplock(inode, file, arg));
2394 case LL_IOC_GROUP_UNLOCK:
2395 RETURN(ll_put_grouplock(inode, file, arg));
2396 case IOC_OBD_STATFS:
2397 RETURN(ll_obd_statfs(inode, (void *)arg));
2399 /* We need to special case any other ioctls we want to handle,
2400 * to send them to the MDS/OST as appropriate and to properly
2401 * network encode the arg field.
2402 case FSFILT_IOC_SETVERSION_OLD:
2403 case FSFILT_IOC_SETVERSION:
2405 case LL_IOC_FLUSHCTX:
2406 RETURN(ll_flush_ctx(inode));
2407 case LL_IOC_PATH2FID: {
2408 if (copy_to_user((void *)arg, ll_inode2fid(inode),
2409 sizeof(struct lu_fid)))
2414 case OBD_IOC_FID2PATH:
2415 RETURN(ll_fid2path(inode, (void *)arg));
2416 case LL_IOC_DATA_VERSION: {
2417 struct ioc_data_version idv;
2420 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
/* mask off any flags beyond the supported flush modes */
2423 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2424 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2426 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2432 case LL_IOC_GET_MDTIDX: {
2435 mdtidx = ll_get_mdt_idx(inode);
2439 if (put_user((int)mdtidx, (int*)arg))
2444 case OBD_IOC_GETDTNAME:
2445 case OBD_IOC_GETMDNAME:
2446 RETURN(ll_get_obd_name(inode, cmd, arg));
2447 case LL_IOC_HSM_STATE_GET: {
2448 struct md_op_data *op_data;
2449 struct hsm_user_state *hus;
2456 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2457 LUSTRE_OPC_ANY, hus);
2458 if (IS_ERR(op_data)) {
2460 RETURN(PTR_ERR(op_data));
2463 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2466 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2469 ll_finish_md_op_data(op_data);
2473 case LL_IOC_HSM_STATE_SET: {
2474 struct hsm_state_set *hss;
2481 if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2486 rc = ll_hsm_state_set(inode, hss);
2491 case LL_IOC_HSM_ACTION: {
2492 struct md_op_data *op_data;
2493 struct hsm_current_action *hca;
2500 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2501 LUSTRE_OPC_ANY, hca);
2502 if (IS_ERR(op_data)) {
2504 RETURN(PTR_ERR(op_data));
2507 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2510 if (copy_to_user((char *)arg, hca, sizeof(*hca)))
2513 ll_finish_md_op_data(op_data);
2517 case LL_IOC_SET_LEASE: {
2518 struct ll_inode_info *lli = ll_i2info(inode);
2519 struct obd_client_handle *och = NULL;
/* lease mode must match how the file was opened */
2525 if (!(file->f_mode & FMODE_WRITE))
2530 if (!(file->f_mode & FMODE_READ))
/* mode 0 (unlock): detach and close any lease held on this fd */
2535 mutex_lock(&lli->lli_och_mutex);
2536 if (fd->fd_lease_och != NULL) {
2537 och = fd->fd_lease_och;
2538 fd->fd_lease_och = NULL;
2540 mutex_unlock(&lli->lli_och_mutex);
2543 mode = och->och_flags &(FMODE_READ|FMODE_WRITE);
2544 rc = ll_lease_close(och, inode, &lease_broken);
2545 if (rc == 0 && lease_broken)
2551 /* return the type of lease or error */
2552 RETURN(rc < 0 ? rc : (int)mode);
2557 CDEBUG(D_INODE, "Set lease with mode %d\n", mode);
2559 /* apply for lease */
2560 och = ll_lease_open(inode, file, mode, 0);
2562 RETURN(PTR_ERR(och));
/* install the new lease unless this fd already holds one */
2565 mutex_lock(&lli->lli_och_mutex);
2566 if (fd->fd_lease_och == NULL) {
2567 fd->fd_lease_och = och;
2570 mutex_unlock(&lli->lli_och_mutex);
2572 /* impossible now that only excl is supported for now */
2573 ll_lease_close(och, inode, &lease_broken);
2578 case LL_IOC_GET_LEASE: {
2579 struct ll_inode_info *lli = ll_i2info(inode);
2580 struct ldlm_lock *lock = NULL;
2583 mutex_lock(&lli->lli_och_mutex);
2584 if (fd->fd_lease_och != NULL) {
2585 struct obd_client_handle *och = fd->fd_lease_och;
/* report the lease mode only if the DLM lock is still valid */
2587 lock = ldlm_handle2lock(&och->och_lease_handle);
2589 lock_res_and_lock(lock);
2590 if (!ldlm_is_cancel(lock))
2591 rc = och->och_flags &
2592 (FMODE_READ | FMODE_WRITE);
2593 unlock_res_and_lock(lock);
2594 LDLM_LOCK_PUT(lock);
2597 mutex_unlock(&lli->lli_och_mutex);
2600 case LL_IOC_HSM_IMPORT: {
2601 struct hsm_user_import *hui;
2607 if (copy_from_user(hui, (void *)arg, sizeof(*hui))) {
2612 rc = ll_hsm_import(inode, file, hui);
/* unknown command: try registered ioctl handlers, then the OSC/LOV */
2622 ll_iocontrol_call(inode, file, cmd, arg, &err))
2625 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
#ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Validate @offset against [0, maxsize] and commit it to file->f_pos,
 * resetting f_version on change (local copy for kernels lacking
 * generic_file_llseek_size()).
 */
2632 static inline loff_t
2633 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2635 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2637 if (offset > maxsize)
2640 if (offset != file->f_pos) {
2641 file->f_pos = offset;
2642 file->f_version = 0;
/*
 * Backported generic_file_llseek_size(): compute the new position for the
 * various SEEK_* origins against @maxsize, treating @eof as the file end
 * (also used for SEEK_DATA/SEEK_HOLE semantics).
 */
2648 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2649 loff_t maxsize, loff_t eof)
2651 struct inode *inode = file->f_dentry->d_inode;
2659 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2660 * position-querying operation. Avoid rewriting the "same"
2661 * f_pos value back to the file because a concurrent read(),
2662 * write() or lseek() might have altered it
2667 * f_lock protects against read/modify/write race with other
2668 * SEEK_CURs. Note that parallel writes and reads behave
/* serialize SEEK_CUR updates on i_mutex */
2671 mutex_lock(&inode->i_mutex);
2672 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2673 mutex_unlock(&inode->i_mutex);
2677 * In the generic case the entire file is data, so as long as
2678 * offset isn't at the end of the file then the offset is data.
2685 * There is a virtual hole at the end of the file, so as long as
2686 * offset isn't i_size or larger, return i_size.
2694 return llseek_execute(file, offset, maxsize);
/* llseek entry point: for SEEK_END/SEEK_HOLE/SEEK_DATA first glimpse the
 * file size from the OSTs so @eof is current, then delegate to the
 * ll_generic_file_llseek_size() helper bounded by ll_file_maxbytes(). */
2698 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2700 struct inode *inode = file->f_dentry->d_inode;
2701 loff_t retval, eof = 0;
2704 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2705 (origin == SEEK_CUR) ? file->f_pos : 0);
2706 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2707 PFID(ll_inode2fid(inode)), inode, retval, retval,
2709 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2711 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
/* size may be stale on the client; fetch it before seeking vs. EOF */
2712 retval = ll_glimpse_size(inode);
2715 eof = i_size_read(inode);
2718 retval = ll_generic_file_llseek_size(file, offset, origin,
2719 ll_file_maxbytes(inode), eof);
/* flush() (close(2) path): report any async writeback error recorded on
 * the inode exactly once. Errors already reported via fd_write_failed are
 * not reported again. Returns -EIO if an unreported error is pending. */
2723 static int ll_flush(struct file *file, fl_owner_t id)
2725 struct inode *inode = file->f_dentry->d_inode;
2726 struct ll_inode_info *lli = ll_i2info(inode);
2727 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2730 LASSERT(!S_ISDIR(inode->i_mode));
2732 /* catch async errors that were recorded back when async writeback
2733 * failed for pages in this mapping. */
2734 rc = lli->lli_async_rc;
/* consume the recorded error so it is reported only once */
2735 lli->lli_async_rc = 0;
2736 if (lli->lli_clob != NULL) {
2737 err = lov_read_and_clear_async_rc(lli->lli_clob);
2742 /* The application has been told write failure already.
2743 * Do not report failure again. */
2744 if (fd->fd_write_failed)
2746 return rc ? -EIO : 0;
2750 * Called to make sure a portion of file has been written out.
2751 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2753 * Return how many pages have been written.
/* Builds and runs a CIT_FSYNC cl_io over [start, end] for the inode's
 * cl_object; on success returns fio->fi_nr_written. @ignore_layout lets
 * the sync proceed while a layout change is in flight. */
2755 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2756 enum cl_fsync_mode mode, int ignore_layout)
2758 struct cl_env_nest nest;
2761 struct obd_capa *capa = NULL;
2762 struct cl_fsync_io *fio;
/* reject any mode outside the four defined cl_fsync modes */
2766 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2767 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2770 env = cl_env_nested_get(&nest);
2772 RETURN(PTR_ERR(env));
/* OSS write capability accompanies the sync RPCs */
2774 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2776 io = ccc_env_thread_io(env);
2777 io->ci_obj = cl_i2info(inode)->lli_clob;
2778 io->ci_ignore_layout = ignore_layout;
2780 /* initialize parameters for sync */
2781 fio = &io->u.ci_fsync;
2782 fio->fi_capa = capa;
2783 fio->fi_start = start;
2785 fio->fi_fid = ll_inode2fid(inode);
2786 fio->fi_mode = mode;
2787 fio->fi_nr_written = 0;
2789 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2790 result = cl_io_loop(env, io);
2792 result = io->ci_result;
2794 result = fio->fi_nr_written;
2795 cl_io_fini(env, io);
2796 cl_env_nested_put(&nest, env);
2804 * When dentry is provided (the 'else' case), *file->f_dentry may be
2805 * null and dentry must be used directly rather than pulled from
2806 * *file->f_dentry as is done otherwise.
/* fsync entry point; three kernel-API variants selected by configure
 * checks. Flow: flush dirty pages, clear/report recorded async errors,
 * MDC fsync of the metadata, then OST-side CL_FSYNC_ALL for regular
 * files, updating fd_write_failed for ll_flush() accounting. */
2809 #ifdef HAVE_FILE_FSYNC_4ARGS
2810 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2812 struct dentry *dentry = file->f_dentry;
2813 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2814 int ll_fsync(struct file *file, int datasync)
2816 struct dentry *dentry = file->f_dentry;
2818 loff_t end = LLONG_MAX;
2820 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2823 loff_t end = LLONG_MAX;
2825 struct inode *inode = dentry->d_inode;
2826 struct ll_inode_info *lli = ll_i2info(inode);
2827 struct ptlrpc_request *req;
2828 struct obd_capa *oc;
2832 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2833 PFID(ll_inode2fid(inode)), inode);
2834 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2836 #ifdef HAVE_FILE_FSYNC_4ARGS
/* 4-arg variant: kernel no longer flushes for us, do it here and take
 * i_mutex for the duration (released at the bottom) */
2837 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2838 mutex_lock(&inode->i_mutex);
2840 /* fsync's caller has already called _fdata{sync,write}, we want
2841 * that IO to finish before calling the osc and mdc sync methods */
2842 rc = filemap_fdatawait(inode->i_mapping);
2845 /* catch async errors that were recorded back when async writeback
2846 * failed for pages in this mapping. */
2847 if (!S_ISDIR(inode->i_mode)) {
2848 err = lli->lli_async_rc;
2849 lli->lli_async_rc = 0;
2852 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* sync metadata on the MDT (capa accompanies the RPC) */
2857 oc = ll_mdscapa_get(inode);
2858 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2864 ptlrpc_req_finished(req);
2866 if (S_ISREG(inode->i_mode)) {
2867 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
/* force data to OSTs; remember the outcome for later ll_flush() */
2869 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2870 if (rc == 0 && err < 0)
2873 fd->fd_write_failed = true;
2875 fd->fd_write_failed = false;
2878 #ifdef HAVE_FILE_FSYNC_4ARGS
2879 mutex_unlock(&inode->i_mutex);
/* Handle flock(2) (FL_FLOCK) and fcntl(2) POSIX (FL_POSIX) locks by
 * enqueueing an LDLM_FLOCK lock on the MDT, then mirroring the result
 * into the local VFS lock tables. F_UNLCK maps to LCK_NL; TEST requests
 * use LDLM_FL_TEST_LOCK and do not change local state. If the local
 * mirror fails the server lock is rolled back with an LCK_NL enqueue. */
2885 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2887 struct inode *inode = file->f_dentry->d_inode;
2888 struct ll_sb_info *sbi = ll_i2sbi(inode);
2889 struct ldlm_enqueue_info einfo = {
2890 .ei_type = LDLM_FLOCK,
2891 .ei_cb_cp = ldlm_flock_completion_ast,
2892 .ei_cbdata = file_lock,
2894 struct md_op_data *op_data;
2895 struct lustre_handle lockh = {0};
2896 ldlm_policy_data_t flock = {{0}};
/* saved so the server-visible F_UNLCK conversion can be undone below */
2897 int fl_type = file_lock->fl_type;
2903 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
2904 PFID(ll_inode2fid(inode)), file_lock);
2906 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2908 if (file_lock->fl_flags & FL_FLOCK) {
2909 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2910 /* flocks are whole-file locks */
2911 flock.l_flock.end = OFFSET_MAX;
2912 /* For flocks owner is determined by the local file desctiptor*/
2913 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2914 } else if (file_lock->fl_flags & FL_POSIX) {
2915 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2916 flock.l_flock.start = file_lock->fl_start;
2917 flock.l_flock.end = file_lock->fl_end;
2921 flock.l_flock.pid = file_lock->fl_pid;
2923 /* Somewhat ugly workaround for svc lockd.
2924 * lockd installs custom fl_lmops->lm_compare_owner that checks
2925 * for the fl_owner to be the same (which it always is on local node
2926 * I guess between lockd processes) and then compares pid.
2927 * As such we assign pid to the owner field to make it all work,
2928 * conflict with normal locks is unlikely since pid space and
2929 * pointer space for current->files are not intersecting */
2930 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2931 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* map POSIX lock type to an LDLM mode (read -> PR, write -> PW) */
2935 einfo.ei_mode = LCK_PR;
2938 /* An unlock request may or may not have any relation to
2939 * existing locks so we may not be able to pass a lock handle
2940 * via a normal ldlm_lock_cancel() request. The request may even
2941 * unlock a byte range in the middle of an existing lock. In
2942 * order to process an unlock request we need all of the same
2943 * information that is given with a normal read or write record
2944 * lock request. To avoid creating another ldlm unlock (cancel)
2945 * message we'll treat a LCK_NL flock request as an unlock. */
2946 einfo.ei_mode = LCK_NL;
2949 einfo.ei_mode = LCK_PW;
2952 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* non-blocking set vs. test; elided arms handle the blocking cmds */
2967 flags = LDLM_FL_BLOCK_NOWAIT;
2973 flags = LDLM_FL_TEST_LOCK;
2976 CERROR("unknown fcntl lock command: %d\n", cmd);
2980 /* Save the old mode so that if the mode in the lock changes we
2981 * can decrement the appropriate reader or writer refcount. */
2982 file_lock->fl_type = einfo.ei_mode;
2984 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2985 LUSTRE_OPC_ANY, NULL);
2986 if (IS_ERR(op_data))
2987 RETURN(PTR_ERR(op_data));
2989 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
2990 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
2991 flock.l_flock.pid, flags, einfo.ei_mode,
2992 flock.l_flock.start, flock.l_flock.end);
2994 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2995 op_data, &lockh, &flock, 0, NULL /* req */, flags);
2997 /* Restore the file lock type if not TEST lock. */
2998 if (!(flags & LDLM_FL_TEST_LOCK))
2999 file_lock->fl_type = fl_type;
/* mirror server result into local VFS lock bookkeeping */
3001 if ((file_lock->fl_flags & FL_FLOCK) &&
3002 (rc == 0 || file_lock->fl_type == F_UNLCK))
3003 rc2 = flock_lock_file_wait(file, file_lock);
3004 if ((file_lock->fl_flags & FL_POSIX) &&
3005 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3006 !(flags & LDLM_FL_TEST_LOCK))
3007 rc2 = posix_lock_file_wait(file, file_lock);
/* local bookkeeping failed: release the server-side lock again */
3009 if (rc2 && file_lock->fl_type != F_UNLCK) {
3010 einfo.ei_mode = LCK_NL;
3011 md_enqueue(sbi->ll_md_exp, &einfo, NULL,
3012 op_data, &lockh, &flock, 0, NULL /* req */, flags);
3016 ll_finish_md_op_data(op_data);
/* Look up the FID of @name under @parent via a getattr-by-name RPC to
 * the MDT (op_valid = OBD_MD_FLID only). On success *fid is taken from
 * the reply MDT body (elided in this view); -EFAULT if body missing. */
3021 int ll_get_fid_by_name(struct inode *parent, const char *name,
3022 int namelen, struct lu_fid *fid)
3024 struct md_op_data *op_data = NULL;
3025 struct mdt_body *body;
3026 struct ptlrpc_request *req;
3030 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3031 LUSTRE_OPC_ANY, NULL);
3032 if (IS_ERR(op_data))
3033 RETURN(PTR_ERR(op_data));
/* we only need the FID, not the full attribute set */
3035 op_data->op_valid = OBD_MD_FLID;
3036 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3037 ll_finish_md_op_data(op_data);
3041 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3043 GOTO(out_req, rc = -EFAULT);
3047 ptlrpc_req_finished(req);
/* Migrate directory entry @name under @parent to MDT @mdtidx using a
 * CLI_MIGRATE-flagged rename-to-self. The child FID comes from the
 * dcache when the child dentry is cached (aliases invalidated first),
 * otherwise from ll_get_fid_by_name(). A no-op if the object is
 * already on the target MDT. */
3051 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3052 const char *name, int namelen)
3054 struct dentry *dchild = NULL;
3055 struct inode *child_inode = NULL;
3056 struct md_op_data *op_data;
3057 struct ptlrpc_request *request = NULL;
3062 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3063 name, PFID(ll_inode2fid(parent)), mdtidx);
3065 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3066 0, LUSTRE_OPC_ANY, NULL);
3067 if (IS_ERR(op_data))
3068 RETURN(PTR_ERR(op_data));
3070 /* Get child FID first */
3071 qstr.hash = full_name_hash(name, namelen);
3074 dchild = d_lookup(file->f_dentry, &qstr);
3075 if (dchild != NULL && dchild->d_inode != NULL) {
3076 op_data->op_fid3 = *ll_inode2fid(dchild->d_inode);
3077 if (dchild->d_inode != NULL) {
/* hold the child inode so we can fix its nlink after migration */
3078 child_inode = igrab(dchild->d_inode);
3079 ll_invalidate_aliases(child_inode);
/* not in dcache: ask the MDT for the FID */
3083 rc = ll_get_fid_by_name(parent, name, namelen,
3089 if (!fid_is_sane(&op_data->op_fid3)) {
3090 CERROR("%s: migrate %s , but fid "DFID" is insane\n",
3091 ll_get_fsname(parent->i_sb, NULL, 0), name,
3092 PFID(&op_data->op_fid3));
3093 GOTO(out_free, rc = -EINVAL);
3096 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3101 CDEBUG(D_INFO, "%s:"DFID" is already on MDT%d.\n", name,
3102 PFID(&op_data->op_fid3), mdtidx);
3103 GOTO(out_free, rc = 0);
/* migration is implemented as a rename to the same name with
 * CLI_MIGRATE set and op_mds pointing at the target MDT */
3106 op_data->op_mds = mdtidx;
3107 op_data->op_cli_flags = CLI_MIGRATE;
3108 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3109 namelen, name, namelen, &request);
3111 ll_update_times(request, parent);
3113 ptlrpc_req_finished(request);
3118 if (child_inode != NULL) {
/* stale local inode: drop its link count before releasing */
3119 clear_nlink(child_inode);
3123 ll_finish_md_op_data(op_data);
/* -o noflock variant: lock ops stub wired into ll_file_operations_noflock;
 * body elided in this view (per the table comment it returns ENOSYS). */
3128 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3136 * test if some locks matching bits and l_req_mode are acquired
3137 * - bits can be in different locks
3138 * - if found clear the common lock bits in *bits
3139 * - the bits not found, are kept in *bits
3141 * \param bits [IN] searched lock bits [IN]
3142 * \param l_req_mode [IN] searched lock mode
3143 * \retval boolean, true iff all bits are found
/* Probes cached MDT ibits locks one bit at a time with
 * LDLM_FL_TEST_LOCK matches (no references taken). */
3145 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
3147 struct lustre_handle lockh;
3148 ldlm_policy_data_t policy;
/* LCK_MINMODE means "any mode": search all four data modes at once */
3149 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
3150 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
3159 fid = &ll_i2info(inode)->lli_fid;
3160 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3161 ldlm_lockname[mode]);
3163 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
3164 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3165 policy.l_inodebits.bits = *bits & (1 << i);
3166 if (policy.l_inodebits.bits == 0)
3169 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3170 &policy, mode, &lockh)) {
3171 struct ldlm_lock *lock;
3173 lock = ldlm_handle2lock(&lockh);
/* a matched lock may carry several bits; clear them all */
3176 ~(lock->l_policy_data.l_inodebits.bits);
3177 LDLM_LOCK_PUT(lock);
3179 *bits &= ~policy.l_inodebits.bits;
/* Match (and reference, via @lockh) a cached MDT ibits lock covering
 * @bits; returns the matched ldlm mode or 0. Unlike ll_have_md_lock()
 * this takes an actual lock reference that the caller must release. */
3186 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
3187 struct lustre_handle *lockh, __u64 flags,
3190 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
3195 fid = &ll_i2info(inode)->lli_fid;
3196 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3198 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3199 fid, LDLM_IBITS, &policy, mode, lockh);
/* Normalize the result of a revalidation RPC: -ENOENT on an already-
 * unlinked object is tolerated (nlink updated, elided here); other
 * errors are logged (EACCES/EIDRM at D_INFO only, they are expected
 * under identity changes) and passed through. */
3204 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3206 /* Already unlinked. Just update nlink and return success */
3207 if (rc == -ENOENT) {
3209 /* This path cannot be hit for regular files unless in
3210 * case of obscure races, so no need to to validate
3212 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3214 } else if (rc != 0) {
3215 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3216 "%s: revalidate FID "DFID" error: rc = %d\n",
3217 ll_get_fsname(inode->i_sb, NULL, 0),
3218 PFID(ll_inode2fid(inode)), rc);
/* Revalidate @dentry's inode attributes against the MDT for the lock
 * bits in @ibits. Two paths: with OBD_CONNECT_ATTRFID an intent
 * getattr/lookup by FID (md_intent_lock); otherwise, if no matching
 * ibits lock is cached, a plain md_getattr followed by ll_prep_inode()
 * to refresh the in-core inode. */
3224 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3226 struct inode *inode = dentry->d_inode;
3227 struct ptlrpc_request *req = NULL;
3228 struct obd_export *exp;
3232 LASSERT(inode != NULL);
3234 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3235 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3237 exp = ll_i2mdexp(inode);
3239 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3240 * But under CMD case, it caused some lock issues, should be fixed
3241 * with new CMD ibits lock. See bug 12718 */
3242 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3243 struct lookup_intent oit = { .it_op = IT_GETATTR };
3244 struct md_op_data *op_data;
/* only LOOKUP bit requested: a lighter IT_LOOKUP intent suffices */
3246 if (ibits == MDS_INODELOCK_LOOKUP)
3247 oit.it_op = IT_LOOKUP;
3249 /* Call getattr by fid, so do not provide name at all. */
3250 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3251 dentry->d_inode, NULL, 0, 0,
3252 LUSTRE_OPC_ANY, NULL);
3253 if (IS_ERR(op_data))
3254 RETURN(PTR_ERR(op_data));
3256 oit.it_create_mode |= M_CHECK_STALE;
3257 rc = md_intent_lock(exp, op_data, NULL, 0,
3258 /* we are not interested in name
3261 ll_md_blocking_ast, 0);
3262 ll_finish_md_op_data(op_data);
3263 oit.it_create_mode &= ~M_CHECK_STALE;
3265 rc = ll_inode_revalidate_fini(inode, rc);
3269 rc = ll_revalidate_it_finish(req, &oit, dentry);
3271 ll_intent_release(&oit);
3275 /* Unlinked? Unhash dentry, so it is not picked up later by
3276 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3277 here to preserve get_cwd functionality on 2.6.
3279 if (!dentry->d_inode->i_nlink)
3280 d_lustre_invalidate(dentry, 0);
3282 ll_lookup_finish_locks(&oit, dentry);
3283 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3284 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3285 obd_valid valid = OBD_MD_FLGETATTR;
3286 struct md_op_data *op_data;
/* regular files additionally need the striping EA sized properly */
3289 if (S_ISREG(inode->i_mode)) {
3290 rc = ll_get_default_mdsize(sbi, &ealen);
3293 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3296 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3297 0, ealen, LUSTRE_OPC_ANY,
3299 if (IS_ERR(op_data))
3300 RETURN(PTR_ERR(op_data));
3302 op_data->op_valid = valid;
3303 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
3304 * capa for this inode. Because we only keep capas of dirs
3306 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3307 ll_finish_md_op_data(op_data);
3309 rc = ll_inode_revalidate_fini(inode, rc);
3313 rc = ll_prep_inode(&inode, req, NULL, NULL);
3316 ptlrpc_req_finished(req);
/* Striped directory: merge per-stripe attributes from all MDTs via
 * md_merge_attr() and cache size/nlink/times in ll_inode_info. */
3320 static int ll_merge_md_attr(struct inode *inode)
3322 struct cl_attr attr = { 0 };
3325 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3326 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3331 ll_i2info(inode)->lli_stripe_dir_size = attr.cat_size;
3332 ll_i2info(inode)->lli_stripe_dir_nlink = attr.cat_nlink;
3334 ll_i2info(inode)->lli_lvb.lvb_atime = attr.cat_atime;
3335 ll_i2info(inode)->lli_lvb.lvb_mtime = attr.cat_mtime;
3336 ll_i2info(inode)->lli_lvb.lvb_ctime = attr.cat_ctime;
/* Revalidate MD attributes, then bring non-MD state current: striped
 * directories merge cross-MDT attrs, non-regular inodes copy cached
 * LVB times, and regular files glimpse the size from OSTs unless a
 * HSM restore is in progress (size already authoritative from MDT). */
3342 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3344 struct inode *inode = dentry->d_inode;
3348 rc = __ll_inode_revalidate(dentry, ibits);
3352 /* if object isn't regular file, don't validate size */
3353 if (!S_ISREG(inode->i_mode)) {
3354 if (S_ISDIR(inode->i_mode) &&
3355 ll_i2info(inode)->lli_lsm_md != NULL) {
3356 rc = ll_merge_md_attr(inode);
3361 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
3362 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
3363 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
3365 /* In case of restore, the MDT has the right size and has
3366 * already send it back without granting the layout lock,
3367 * inode is up-to-date so glimpse is useless.
3368 * Also to glimpse we need the layout, in case of a running
3369 * restore the MDT holds the layout lock so the glimpse will
3370 * block up to the end of restore (getattr will block)
3372 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3373 rc = ll_glimpse_size(inode);
/* getattr(2): revalidate UPDATE|LOOKUP bits then fill *stat from the
 * in-core inode. 32-bit API consumers get a squashed FID-based ino;
 * striped directories report the merged cross-MDT nlink/size. */
3378 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3380 struct inode *inode = de->d_inode;
3381 struct ll_sb_info *sbi = ll_i2sbi(inode);
3382 struct ll_inode_info *lli = ll_i2info(inode);
3385 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3386 MDS_INODELOCK_LOOKUP);
3387 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3392 stat->dev = inode->i_sb->s_dev;
3393 if (ll_need_32bit_api(sbi))
3394 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3396 stat->ino = inode->i_ino;
3397 stat->mode = inode->i_mode;
3398 stat->uid = inode->i_uid;
3399 stat->gid = inode->i_gid;
3400 stat->rdev = inode->i_rdev;
3401 stat->atime = inode->i_atime;
3402 stat->mtime = inode->i_mtime;
3403 stat->ctime = inode->i_ctime;
3404 stat->blksize = 1 << inode->i_blkbits;
3405 stat->blocks = inode->i_blocks;
3407 if (S_ISDIR(inode->i_mode) &&
3408 ll_i2info(inode)->lli_lsm_md != NULL) {
/* striped dir: use values merged across MDTs by ll_merge_md_attr() */
3409 stat->nlink = lli->lli_stripe_dir_nlink;
3410 stat->size = lli->lli_stripe_dir_size;
3412 stat->nlink = inode->i_nlink;
3413 stat->size = i_size_read(inode);
/* FIEMAP ioctl handler: marshal the kernel fiemap_extent_info into a
 * ll_user_fiemap sized for fi_extents_max extents, run ll_do_fiemap(),
 * then copy flags/extent results back to the caller's buffer. */
3419 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3420 __u64 start, __u64 len)
3424 struct ll_user_fiemap *fiemap;
3425 unsigned int extent_count = fieinfo->fi_extents_max;
3427 num_bytes = sizeof(*fiemap) + (extent_count *
3428 sizeof(struct ll_fiemap_extent));
3429 OBD_ALLOC_LARGE(fiemap, num_bytes);
3434 fiemap->fm_flags = fieinfo->fi_flags;
3435 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3436 fiemap->fm_start = start;
3437 fiemap->fm_length = len;
/* seed only the first extent from userspace (continuation support) */
3438 if (extent_count > 0)
3439 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3440 sizeof(struct ll_fiemap_extent));
3442 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3444 fieinfo->fi_flags = fiemap->fm_flags;
3445 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3446 if (extent_count > 0)
3447 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3448 fiemap->fm_mapped_extents *
3449 sizeof(struct ll_fiemap_extent));
3451 OBD_FREE_LARGE(fiemap, num_bytes);
/* Return a referenced copy of the cached POSIX ACL for @inode
 * (lli_posix_acl under lli_lock); @type is not consulted in the
 * visible code. Caller/VFS releases the reference. */
3455 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3457 struct ll_inode_info *lli = ll_i2info(inode);
3458 struct posix_acl *acl = NULL;
3461 spin_lock(&lli->lli_lock);
3462 /* VFS' acl_permission_check->check_acl will release the refcount */
3463 acl = posix_acl_dup(lli->lli_posix_acl);
3464 spin_unlock(&lli->lli_lock);
3469 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
/* ACL check callback handed to ll_generic_permission() on kernels
 * whose generic_permission() takes a check_acl hook; bails out in
 * RCU-walk mode (IPERM_FLAG_RCU) since ll_get_acl() may sleep. */
3471 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3472 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3474 ll_check_acl(struct inode *inode, int mask)
3477 # ifdef CONFIG_FS_POSIX_ACL
3478 struct posix_acl *acl;
3482 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3483 if (flags & IPERM_FLAG_RCU)
3486 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3491 rc = posix_acl_permission(inode, acl, mask);
3492 posix_acl_release(acl);
3495 # else /* !CONFIG_FS_POSIX_ACL */
3497 # endif /* CONFIG_FS_POSIX_ACL */
3499 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
3501 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
/* permission() entry point (three kernel-API variants). Revalidates
 * the root inode on first touch, optionally squashes root's
 * fsuid/fsgid and FS capabilities per the ll_squash config (restored
 * via revert_creds), then defers to remote-perm check or
 * ll_generic_permission() with the ACL callback. */
3502 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3504 # ifdef HAVE_INODE_PERMISION_2ARGS
3505 int ll_inode_permission(struct inode *inode, int mask)
3507 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3512 struct ll_sb_info *sbi;
3513 struct root_squash_info *squash;
3514 struct cred *cred = NULL;
3515 const struct cred *old_cred = NULL;
3517 bool squash_id = false;
/* cannot block in RCU-walk mode: revalidation and cred ops sleep */
3520 #ifdef MAY_NOT_BLOCK
3521 if (mask & MAY_NOT_BLOCK)
3523 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3524 if (flags & IPERM_FLAG_RCU)
3528 /* as root inode are NOT getting validated in lookup operation,
3529 * need to do it before permission check. */
3531 if (inode == inode->i_sb->s_root->d_inode) {
3532 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3533 MDS_INODELOCK_LOOKUP);
3538 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3539 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3541 /* squash fsuid/fsgid if needed */
3542 sbi = ll_i2sbi(inode);
3543 squash = &sbi->ll_squash;
3544 if (unlikely(squash->rsi_uid != 0 &&
3545 current_fsuid() == 0 &&
3546 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3550 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3551 current_fsuid(), current_fsgid(),
3552 squash->rsi_uid, squash->rsi_gid);
3554 /* update current process's credentials
3555 * and FS capability */
3556 cred = prepare_creds();
3560 cred->fsuid = squash->rsi_uid;
3561 cred->fsgid = squash->rsi_gid;
/* drop all filesystem-related capabilities while squashed */
3562 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3563 if ((1 << cap) & CFS_CAP_FS_MASK)
3564 cap_lower(cred->cap_effective, cap);
3566 old_cred = override_creds(cred);
3569 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3571 if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
3572 rc = lustre_check_remote_perm(inode, mask);
3574 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3576 /* restore current process's credentials and FS capability */
3578 revert_creds(old_cred);
3585 /* -o localflock - only provides locally consistent flock locks */
/* Default file_operations table: no .flock/.lock entries, so flock
 * falls back to the kernel's local-only implementation. */
3586 struct file_operations ll_file_operations = {
3587 .read = ll_file_read,
3588 .aio_read = ll_file_aio_read,
3589 .write = ll_file_write,
3590 .aio_write = ll_file_aio_write,
3591 .unlocked_ioctl = ll_file_ioctl,
3592 .open = ll_file_open,
3593 .release = ll_file_release,
3594 .mmap = ll_file_mmap,
3595 .llseek = ll_file_seek,
3596 .splice_read = ll_file_splice_read,
/* -o flock: cluster-coherent locking — .flock and .lock both route
 * through ll_file_flock(), which enqueues LDLM_FLOCK locks on the MDT. */
3601 struct file_operations ll_file_operations_flock = {
3602 .read = ll_file_read,
3603 .aio_read = ll_file_aio_read,
3604 .write = ll_file_write,
3605 .aio_write = ll_file_aio_write,
3606 .unlocked_ioctl = ll_file_ioctl,
3607 .open = ll_file_open,
3608 .release = ll_file_release,
3609 .mmap = ll_file_mmap,
3610 .llseek = ll_file_seek,
3611 .splice_read = ll_file_splice_read,
3614 .flock = ll_file_flock,
3615 .lock = ll_file_flock
3618 /* These are for -o noflock - to return ENOSYS on flock calls */
3619 struct file_operations ll_file_operations_noflock = {
3620 .read = ll_file_read,
3621 .aio_read = ll_file_aio_read,
3622 .write = ll_file_write,
3623 .aio_write = ll_file_aio_write,
3624 .unlocked_ioctl = ll_file_ioctl,
3625 .open = ll_file_open,
3626 .release = ll_file_release,
3627 .mmap = ll_file_mmap,
3628 .llseek = ll_file_seek,
3629 .splice_read = ll_file_splice_read,
/* both lock entry points rejected via ll_file_noflock() */
3632 .flock = ll_file_noflock,
3633 .lock = ll_file_noflock
/* inode_operations for regular files; .get_acl present only on kernels
 * providing that hook (older ones use the ll_check_acl callback path). */
3636 struct inode_operations ll_file_inode_operations = {
3637 .setattr = ll_setattr,
3638 .getattr = ll_getattr,
3639 .permission = ll_inode_permission,
3640 .setxattr = ll_setxattr,
3641 .getxattr = ll_getxattr,
3642 .listxattr = ll_listxattr,
3643 .removexattr = ll_removexattr,
3644 .fiemap = ll_fiemap,
3645 #ifdef HAVE_IOP_GET_ACL
3646 .get_acl = ll_get_acl,
3650 /* dynamic ioctl number support routins */
/* Global registry of dynamically registered ioctl ranges: a list of
 * llioc_data entries protected by an rwsem; readers (ioctl dispatch)
 * take it shared, register/unregister take it exclusive. */
3651 static struct llioc_ctl_data {
3652 struct rw_semaphore ioc_sem;
3653 struct list_head ioc_head;
3655 __RWSEM_INITIALIZER(llioc.ioc_sem),
3656 LIST_HEAD_INIT(llioc.ioc_head)
/* One registration: callback plus its flexible array of ioctl cmds. */
3661 struct list_head iocd_list;
3662 unsigned int iocd_size;
3663 llioc_callback_t iocd_cb;
3664 unsigned int iocd_count;
3665 unsigned int iocd_cmd[0];
/* Register callback @cb for @count ioctl commands in @cmd. Returns an
 * opaque handle (the allocation itself, per the elided return) used by
 * ll_iocontrol_unregister(); NULL-equivalent on bad args or OOM. */
3668 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3671 struct llioc_data *in_data = NULL;
3674 if (cb == NULL || cmd == NULL ||
3675 count > LLIOC_MAX_CMD || count < 0)
3678 size = sizeof(*in_data) + count * sizeof(unsigned int);
3679 OBD_ALLOC(in_data, size);
3680 if (in_data == NULL)
3683 memset(in_data, 0, sizeof(*in_data));
3684 in_data->iocd_size = size;
3685 in_data->iocd_cb = cb;
3686 in_data->iocd_count = count;
3687 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
/* publish under the writer lock */
3689 down_write(&llioc.ioc_sem);
3690 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3691 up_write(&llioc.ioc_sem);
/* Remove and free the registration identified by @magic (the pointer
 * returned by ll_iocontrol_register); warns if it is not found. */
3696 void ll_iocontrol_unregister(void *magic)
3698 struct llioc_data *tmp;
3703 down_write(&llioc.ioc_sem);
3704 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3706 unsigned int size = tmp->iocd_size;
3708 list_del(&tmp->iocd_list);
/* drop the lock before freeing; list membership already severed */
3709 up_write(&llioc.ioc_sem);
3711 OBD_FREE(tmp, size);
3715 up_write(&llioc.ioc_sem);
3717 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3720 EXPORT_SYMBOL(ll_iocontrol_register);
3721 EXPORT_SYMBOL(ll_iocontrol_unregister);
/* Dispatch @cmd to registered dynamic-ioctl callbacks under the shared
 * rwsem; stops at the first callback returning LLIOC_STOP, whose rc is
 * delivered through *rcp. Returns LLIOC_CONT if nobody claimed it. */
3723 static enum llioc_iter
3724 ll_iocontrol_call(struct inode *inode, struct file *file,
3725 unsigned int cmd, unsigned long arg, int *rcp)
3727 enum llioc_iter ret = LLIOC_CONT;
3728 struct llioc_data *data;
3729 int rc = -EINVAL, i;
3731 down_read(&llioc.ioc_sem);
3732 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3733 for (i = 0; i < data->iocd_count; i++) {
3734 if (cmd != data->iocd_cmd[i])
3737 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3741 if (ret == LLIOC_STOP)
3744 up_read(&llioc.ioc_sem);
/* Push a layout configuration @conf into the cl_object stack via
 * cl_conf_set(). For OBJECT_CONF_SET, additionally allow lock matching
 * on the layout lock and record the new layout generation. */
3751 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3753 struct ll_inode_info *lli = ll_i2info(inode);
3754 struct cl_env_nest nest;
3759 if (lli->lli_clob == NULL)
3762 env = cl_env_nested_get(&nest);
3764 RETURN(PTR_ERR(env));
3766 result = cl_conf_set(env, lli->lli_clob, conf);
3767 cl_env_nested_put(&nest, env);
3769 if (conf->coc_opc == OBJECT_CONF_SET) {
3770 struct ldlm_lock *lock = conf->coc_lock;
3772 LASSERT(lock != NULL);
3773 LASSERT(ldlm_has_layout(lock));
3775 struct lustre_md *md = conf->u.coc_md;
/* no stripes means "empty" generation */
3776 __u32 gen = LL_LAYOUT_GEN_EMPTY;
3778 /* it can only be allowed to match after layout is
3779 * applied to inode otherwise false layout would be
3780 * seen. Applying layout shoud happen before dropping
3781 * the intent lock. */
3782 ldlm_lock_allow_match(lock);
3784 lli->lli_has_smd = lsm_has_objects(md->lsm);
3785 if (md->lsm != NULL)
3786 gen = md->lsm->lsm_layout_gen;
3789 DFID ": layout version change: %u -> %u\n",
3790 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3792 ll_layout_version_set(lli, gen);
3798 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/* If @lock's LVB already holds a ready layout this is a no-op;
 * otherwise fetch the LOV EA via md_getxattr() and install it as the
 * lock's l_lvb_data (freeing any stale buffer under the res lock). */
3799 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3802 struct ll_sb_info *sbi = ll_i2sbi(inode);
3803 struct obd_capa *oc;
3804 struct ptlrpc_request *req;
3805 struct mdt_body *body;
3812 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3813 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
3814 lock->l_lvb_data, lock->l_lvb_len);
3816 if ((lock->l_lvb_data != NULL) && ldlm_is_lvb_ready(lock))
3819 /* if layout lock was granted right away, the layout is returned
3820 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3821 * blocked and then granted via completion ast, we have to fetch
3822 * layout here. Please note that we can't use the LVB buffer in
3823 * completion AST because it doesn't have a large enough buffer */
3824 oc = ll_mdscapa_get(inode);
3825 rc = ll_get_default_mdsize(sbi, &lmmsize);
3827 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3828 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3834 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3836 GOTO(out, rc = -EPROTO);
3838 lmmsize = body->eadatasize;
3839 if (lmmsize == 0) /* empty layout */
3842 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3844 GOTO(out, rc = -EFAULT);
/* copy out of the RPC reply; the request is freed below */
3846 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3847 if (lvbdata == NULL)
3848 GOTO(out, rc = -ENOMEM);
3850 memcpy(lvbdata, lmm, lmmsize);
3851 lock_res_and_lock(lock);
3852 if (lock->l_lvb_data != NULL)
3853 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3855 lock->l_lvb_data = lvbdata;
3856 lock->l_lvb_len = lmmsize;
3857 unlock_res_and_lock(lock);
3862 ptlrpc_req_finished(req);
3867 * Apply the layout to the inode. Layout lock is held and will be released
/* Steps: bind lock to inode, fetch layout into the LVB if needed,
 * unpack the LSM, push it via ll_layout_conf(OBJECT_CONF_SET), and
 * report the resulting generation in *gen. On -EBUSY from conf-set it
 * drops the lock and waits (OBJECT_CONF_WAIT) for in-flight IO. The
 * lock reference/decref is always released before returning. */
3870 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3871 struct inode *inode, __u32 *gen, bool reconf)
3873 struct ll_inode_info *lli = ll_i2info(inode);
3874 struct ll_sb_info *sbi = ll_i2sbi(inode);
3875 struct ldlm_lock *lock;
3876 struct lustre_md md = { NULL };
3877 struct cl_object_conf conf;
3880 bool wait_layout = false;
3883 LASSERT(lustre_handle_is_used(lockh));
3885 lock = ldlm_handle2lock(lockh);
3886 LASSERT(lock != NULL);
3887 LASSERT(ldlm_has_layout(lock));
3889 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured: %d\n",
3890 PFID(&lli->lli_fid), inode, reconf);
3892 /* in case this is a caching lock and reinstate with new inode */
3893 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3895 lock_res_and_lock(lock);
3896 lvb_ready = ldlm_is_lvb_ready(lock);
3897 unlock_res_and_lock(lock);
3898 /* checking lvb_ready is racy but this is okay. The worst case is
3899 * that multi processes may configure the file on the same time. */
3901 if (lvb_ready || !reconf) {
3904 /* layout_gen must be valid if layout lock is not
3905 * cancelled and stripe has already set */
3906 *gen = ll_layout_version_get(lli);
3912 rc = ll_layout_fetch(inode, lock);
3916 /* for layout lock, lmm is returned in lock's lvb.
3917 * lvb_data is immutable if the lock is held so it's safe to access it
3918 * without res lock. See the description in ldlm_lock_decref_internal()
3919 * for the condition to free lvb_data of layout lock */
3920 if (lock->l_lvb_data != NULL) {
3921 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3922 lock->l_lvb_data, lock->l_lvb_len);
3924 *gen = LL_LAYOUT_GEN_EMPTY;
3926 *gen = md.lsm->lsm_layout_gen;
3929 CERROR("%s: file "DFID" unpackmd error: %d\n",
3930 ll_get_fsname(inode->i_sb, NULL, 0),
3931 PFID(&lli->lli_fid), rc);
3937 /* set layout to file. Unlikely this will fail as old layout was
3938 * surely eliminated */
3939 memset(&conf, 0, sizeof conf);
3940 conf.coc_opc = OBJECT_CONF_SET;
3941 conf.coc_inode = inode;
3942 conf.coc_lock = lock;
3943 conf.u.coc_md = &md;
3944 rc = ll_layout_conf(inode, &conf);
/* the unpacked LSM was only needed to apply the configuration */
3947 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3949 /* refresh layout failed, need to wait */
3950 wait_layout = rc == -EBUSY;
3954 LDLM_LOCK_PUT(lock);
3955 ldlm_lock_decref(lockh, mode);
3957 /* wait for IO to complete if it's still being used. */
3959 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
3960 ll_get_fsname(inode->i_sb, NULL, 0),
3961 PFID(&lli->lli_fid), inode);
3963 memset(&conf, 0, sizeof conf);
3964 conf.coc_opc = OBJECT_CONF_WAIT;
3965 conf.coc_inode = inode;
3966 rc = ll_layout_conf(inode, &conf);
3970 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
3971 ll_get_fsname(inode->i_sb, NULL, 0),
3972 PFID(&lli->lli_fid), rc);
/**
 * This function checks if there exists a LAYOUT lock on the client side,
 * or enqueues it if it doesn't have one in cache.
 *
 * This function will not hold the layout lock, so it may be revoked any time
 * after this function returns. Any operations that depend on the layout
 * should be redone in that case.
 *
 * This function should be called before lov_io_init() to get an up-to-date
 * layout version; the caller should save the version number, and after IO
 * is finished, this function should be called again to verify that the
 * layout was not changed during the IO.
 */
3990 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3992 struct ll_inode_info *lli = ll_i2info(inode);
3993 struct ll_sb_info *sbi = ll_i2sbi(inode);
3994 struct md_op_data *op_data;
3995 struct lookup_intent it;
3996 struct lustre_handle lockh;
3998 struct ldlm_enqueue_info einfo = {
3999 .ei_type = LDLM_IBITS,
4001 .ei_cb_bl = ll_md_blocking_ast,
4002 .ei_cb_cp = ldlm_completion_ast,
4007 *gen = ll_layout_version_get(lli);
4008 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
4012 LASSERT(fid_is_sane(ll_inode2fid(inode)));
4013 LASSERT(S_ISREG(inode->i_mode));
4015 /* take layout lock mutex to enqueue layout lock exclusively. */
4016 mutex_lock(&lli->lli_layout_mutex);
4019 /* mostly layout lock is caching on the local side, so try to match
4020 * it before grabbing layout lock mutex. */
4021 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4022 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4023 if (mode != 0) { /* hit cached lock */
4024 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
4028 mutex_unlock(&lli->lli_layout_mutex);
4032 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4033 0, 0, LUSTRE_OPC_ANY, NULL);
4034 if (IS_ERR(op_data)) {
4035 mutex_unlock(&lli->lli_layout_mutex);
4036 RETURN(PTR_ERR(op_data));
4039 /* have to enqueue one */
4040 memset(&it, 0, sizeof(it));
4041 it.it_op = IT_LAYOUT;
4042 lockh.cookie = 0ULL;
4044 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)\n",
4045 ll_get_fsname(inode->i_sb, NULL, 0),
4046 PFID(&lli->lli_fid), inode);
4048 rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
4050 if (it.d.lustre.it_data != NULL)
4051 ptlrpc_req_finished(it.d.lustre.it_data);
4052 it.d.lustre.it_data = NULL;
4054 ll_finish_md_op_data(op_data);
4056 mode = it.d.lustre.it_lock_mode;
4057 it.d.lustre.it_lock_mode = 0;
4058 ll_intent_drop_lock(&it);
4061 /* set lock data in case this is a new lock */
4062 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4063 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
4067 mutex_unlock(&lli->lli_layout_mutex);
/**
 * This function sends a restore request to the MDT.
 */
4075 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4077 struct hsm_user_request *hur;
4081 len = sizeof(struct hsm_user_request) +
4082 sizeof(struct hsm_user_item);
4083 OBD_ALLOC(hur, len);
4087 hur->hur_request.hr_action = HUA_RESTORE;
4088 hur->hur_request.hr_archive_id = 0;
4089 hur->hur_request.hr_flags = 0;
4090 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4091 sizeof(hur->hur_user_item[0].hui_fid));
4092 hur->hur_user_item[0].hui_extent.offset = offset;
4093 hur->hur_user_item[0].hui_extent.length = length;
4094 hur->hur_request.hr_itemcount = 1;
4095 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,