4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2013, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include <linux/sched.h>
49 #include "llite_internal.h"
50 #include <lustre/ll_fiemap.h>
51 #include <lustre_ioctl.h>
53 #include "cl_object.h"
/*
 * Forward declarations of helpers that are referenced before their
 * definitions; ll_put_grouplock() and ll_lease_close() are both called from
 * ll_md_close() further down in this file.
 */
56 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
58 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
61 static enum llioc_iter
62 ll_iocontrol_call(struct inode *inode, struct file *file,
63 unsigned int cmd, unsigned long arg, int *rcp);
/*
 * Allocate a struct ll_file_data from ll_file_data_slab with GFP_NOFS and
 * reset its fd_write_failed flag.  Descriptors obtained here are released
 * with ll_file_data_put().
 *
 * NOTE(review): the allocation-failure check and the return statement are
 * not visible in this view; confirm that a failed allocation is handled
 * before the descriptor is dereferenced.
 */
65 static struct ll_file_data *ll_file_data_get(void)
67 struct ll_file_data *fd;
/* Slab allocation; the result is left in fd. */
69 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
/* A fresh descriptor starts with no recorded write failure. */
73 fd->fd_write_failed = false;
/*
 * Return @fd to ll_file_data_slab; counterpart of ll_file_data_get().
 *
 * NOTE(review): whether a NULL @fd is tolerated cannot be told from the
 * visible lines.
 */
78 static void ll_file_data_put(struct ll_file_data *fd)
81 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Copy the current attributes of @inode into @op_data for a metadata
 * request: fid, mode, atime, mtime, ctime, size (via i_size_read()), block
 * count, extended attribute flags and the IO epoch.
 *
 * @fh supplies the open handle stored in op_data->op_handle, and an MDS
 * capability reference is taken with ll_mdscapa_get().  When the inode
 * carries LLIF_DATA_MODIFIED, MDS_DATA_MODIFIED is added to op_bias.
 *
 * NOTE(review): any guard around the @fh dereference is not visible here;
 * confirm whether callers may pass a NULL @fh.
 */
84 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
85 struct lustre_handle *fh)
87 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
88 op_data->op_attr.ia_mode = inode->i_mode;
89 op_data->op_attr.ia_atime = inode->i_atime;
90 op_data->op_attr.ia_mtime = inode->i_mtime;
91 op_data->op_attr.ia_ctime = inode->i_ctime;
92 op_data->op_attr.ia_size = i_size_read(inode);
93 op_data->op_attr_blocks = inode->i_blocks;
/* The flags field lives in the ll_iattr view of op_attr, hence the cast. */
94 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
95 ll_inode_to_ext_flags(inode->i_flags);
96 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
98 op_data->op_handle = *fh;
99 op_data->op_capa1 = ll_mdscapa_get(inode);
/* Propagate the locally recorded data-modified state in the request bias. */
101 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
102 op_data->op_bias |= MDS_DATA_MODIFIED;
106 * Closes the IO epoch and packs all the attributes into @op_data for
/*
 * Prepare @op_data for closing open handle @och on @inode.
 *
 * The attribute mask always covers mode, atime, mtime and ctime (with the
 * matching *_SET bits).  The code then tests whether the handle was opened
 * with FMODE_WRITE, adds ATTR_SIZE | ATTR_BLOCKS when the MD export does not
 * support SOM or the inode is not a regular file, and calls
 * ll_ioepoch_close().  The inode attributes and the file handle of @och are
 * packed with ll_pack_inode2opdata(), followed by ll_prep_md_op_data().
 *
 * NOTE(review): the branch structure linking these steps (what happens for
 * a handle without FMODE_WRITE, and which arm ll_ioepoch_close() belongs to)
 * is not visible in this view; confirm against the full function.
 */
109 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
110 struct obd_client_handle *och)
114 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
115 ATTR_MTIME | ATTR_MTIME_SET |
116 ATTR_CTIME | ATTR_CTIME_SET;
118 if (!(och->och_flags & FMODE_WRITE))
121 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
122 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
124 ll_ioepoch_close(inode, op_data, &och, 0);
127 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
128 ll_prep_md_op_data(op_data, inode, NULL, NULL,
129 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Send the MDS close for open handle @och of the inode through @md_exp.
 *
 * A non-NULL @data_version turns the close into an HSM release: the
 * MDS_HSM_RELEASE bias, the data version and the lease handle of @och are
 * added to the request, ATTR_SIZE | ATTR_BLOCKS are forced into the
 * attribute mask, and after a successful close the reply body is checked for
 * OBD_MD_FLRELEASED.
 *
 * Around md_close() the visible code also:
 *  - may refresh Size-on-MDS through ll_som_update() (that path asserts the
 *    epoch was closed) and logs a failure of either call with CERROR;
 *  - clears LLIF_DATA_MODIFIED under lli_lock once a close that carried
 *    MDS_DATA_MODIFIED succeeded;
 *  - calls ll_objects_destroy() on the reply and logs its failure;
 *  - queues DONE_WRITING for a regular file opened for write on a
 *    SOM-capable export whose epoch was not closed; the
 *    md_clear_open_replay_data() call and the DEAD_HANDLE_MAGIC cookie
 *    follow, and the inline comment says @och is freed when it is not
 *    waiting for DONE_WRITING;
 *  - finishes the close request, if one exists, before returning.
 *
 * NOTE(review): an op_data allocation failure jumps to the exit label with
 * -ENOMEM; the existing XXX remark states that the open handle and request
 * are leaked on that path.  The exact conditions guarding several of the
 * steps above are not visible in this view.
 */
133 static int ll_close_inode_openhandle(struct obd_export *md_exp,
135 struct obd_client_handle *och,
136 const __u64 *data_version)
138 struct obd_export *exp = ll_i2mdexp(inode);
139 struct md_op_data *op_data;
140 struct ptlrpc_request *req = NULL;
141 struct obd_device *obd = class_exp2obd(exp);
148 * XXX: in case of LMV, is this correct to access
151 CERROR("Invalid MDC connection handle "LPX64"\n",
152 ll_i2mdexp(inode)->exp_handle.h_cookie);
156 OBD_ALLOC_PTR(op_data);
158 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
160 ll_prepare_close(inode, op_data, och);
161 if (data_version != NULL) {
162 /* Pass in data_version implies release. */
163 op_data->op_bias |= MDS_HSM_RELEASE;
164 op_data->op_data_version = *data_version;
165 op_data->op_lease_handle = och->och_lease_handle;
166 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
168 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
169 rc = md_close(md_exp, op_data, och->och_mod, &req);
171 /* This close must have the epoch closed. */
172 LASSERT(epoch_close);
173 /* MDS has instructed us to obtain Size-on-MDS attribute from
174 * OSTs and send setattr to back to MDS. */
175 rc = ll_som_update(inode, op_data);
177 CERROR("%s: inode "DFID" mdc Size-on-MDS update"
178 " failed: rc = %d\n",
179 ll_i2mdexp(inode)->exp_obd->obd_name,
180 PFID(ll_inode2fid(inode)), rc);
184 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
185 ll_i2mdexp(inode)->exp_obd->obd_name,
186 PFID(ll_inode2fid(inode)), rc);
189 /* DATA_MODIFIED flag was successfully sent on close, cancel data
190 * modification flag. */
191 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
192 struct ll_inode_info *lli = ll_i2info(inode);
194 spin_lock(&lli->lli_lock);
195 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
196 spin_unlock(&lli->lli_lock);
200 rc = ll_objects_destroy(req, inode);
202 CERROR("%s: inode "DFID
203 " ll_objects destroy: rc = %d\n",
204 ll_i2mdexp(inode)->exp_obd->obd_name,
205 PFID(ll_inode2fid(inode)), rc);
208 if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
209 struct mdt_body *body;
210 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
211 if (!(body->mbo_valid & OBD_MD_FLRELEASED))
215 ll_finish_md_op_data(op_data);
219 if (exp_connect_som(exp) && !epoch_close &&
220 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
221 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
223 md_clear_open_replay_data(md_exp, och);
224 /* Free @och if it is not waiting for DONE_WRITING. */
225 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
228 if (req) /* This is close request */
229 ptlrpc_req_finished(req);
/*
 * Close the MDS open handle of @inode that corresponds to @fmode.
 *
 * FMODE_WRITE selects the write handle and its use counter, FMODE_EXEC the
 * exec pair, and any other mode must contain FMODE_READ (asserted) and
 * selects the read pair.  The use counter is examined under lli_och_mutex;
 * a positive count means the handle still has users, and the mutex is
 * dropped on that path.  ll_close_inode_openhandle() is then called on the
 * MD export of the superblock.
 *
 * NOTE(review): the statements that detach the handle from the inode and
 * the return paths are not visible in this view.
 */
233 int ll_md_real_close(struct inode *inode, fmode_t fmode)
235 struct ll_inode_info *lli = ll_i2info(inode);
236 struct obd_client_handle **och_p;
237 struct obd_client_handle *och;
242 if (fmode & FMODE_WRITE) {
243 och_p = &lli->lli_mds_write_och;
244 och_usecount = &lli->lli_open_fd_write_count;
245 } else if (fmode & FMODE_EXEC) {
246 och_p = &lli->lli_mds_exec_och;
247 och_usecount = &lli->lli_open_fd_exec_count;
249 LASSERT(fmode & FMODE_READ);
250 och_p = &lli->lli_mds_read_och;
251 och_usecount = &lli->lli_open_fd_read_count;
254 mutex_lock(&lli->lli_och_mutex);
255 if (*och_usecount > 0) {
256 /* There are still users of this handle, so skip
258 mutex_unlock(&lli->lli_och_mutex);
264 mutex_unlock(&lli->lli_och_mutex);
267 /* There might be a race and this handle may already
269 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-descriptor part of closing a file on @inode through @md_exp.
 *
 * Steps visible here, in order:
 *  - drop a group lock the descriptor still holds (LL_FILE_GROUP_LOCKED);
 *  - close a lease handle left in fd_lease_och via ll_lease_close() and
 *    clear the pointer;
 *  - close a private fd_och handle via ll_close_inode_openhandle();
 *  - decrement the per-mode open count (write, exec or read, chosen from
 *    fd_omode) under lli_och_mutex;
 *  - call ll_md_real_close() when md_lock_match() finds no matching
 *    MDS_INODELOCK_OPEN lock;
 *  - report a negative dentry with CERROR;
 *  - clear the file private data, free the descriptor and call
 *    ll_capa_close().
 *
 * NOTE(review): the control flow joining the fd_och branch to the open-count
 * branch is not visible in this view.
 */
276 static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
279 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
280 struct ll_inode_info *lli = ll_i2info(inode);
284 /* clear group lock, if present */
285 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
286 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
288 if (fd->fd_lease_och != NULL) {
291 /* Usually the lease is not released when the
292 * application crashed, we need to release here. */
293 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
294 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
295 PFID(&lli->lli_fid), rc, lease_broken);
297 fd->fd_lease_och = NULL;
300 if (fd->fd_och != NULL) {
301 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
306 /* Let's see if we have good enough OPEN lock on the file and if
307 we can skip talking to MDS */
308 if (file->f_dentry->d_inode) { /* Can this ever be false? */
310 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
311 struct lustre_handle lockh;
312 struct inode *inode = file->f_dentry->d_inode;
313 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
315 mutex_lock(&lli->lli_och_mutex);
316 if (fd->fd_omode & FMODE_WRITE) {
318 LASSERT(lli->lli_open_fd_write_count);
319 lli->lli_open_fd_write_count--;
320 } else if (fd->fd_omode & FMODE_EXEC) {
322 LASSERT(lli->lli_open_fd_exec_count);
323 lli->lli_open_fd_exec_count--;
326 LASSERT(lli->lli_open_fd_read_count);
327 lli->lli_open_fd_read_count--;
329 mutex_unlock(&lli->lli_och_mutex);
331 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
332 LDLM_IBITS, &policy, lockmode,
334 rc = ll_md_real_close(file->f_dentry->d_inode,
338 CERROR("released file has negative dentry: file = %p, "
339 "dentry = %p, name = %s\n",
340 file, file->f_dentry, file->f_dentry->d_name.name);
344 LUSTRE_FPRIVATE(file) = NULL;
345 ll_file_data_put(fd);
346 ll_capa_close(inode);
351 /* While this returns an error code, fput() the caller does not, so we need
352 * to make every effort to clean up all of our state here. Also, applications
353 * rarely check close errors and even if an error is returned they will not
354 * re-try the close call.
/*
 * Release @file on @inode and tear down its per-descriptor state.
 *
 * Steps visible here:
 *  - with CONFIG_FS_POSIX_ACL, on a remote client (LL_SBI_RMT_CLIENT) and
 *    for the filesystem root inode, clear LL_FILE_RMTACL and drop the rct
 *    and et entries of the current pid;
 *  - tally LPROC_LL_RELEASE unless @file is the root dentry;
 *  - stop statahead when @inode is a directory and this descriptor is the
 *    recorded opendir key with a non-zero opendir pid;
 *  - for the root dentry, only detach and free the descriptor;
 *  - for non-directories, read and clear the async rc of the cl object (if
 *    any) and reset lli_async_rc;
 *  - call ll_md_close() on the MD export;
 *  - dump the debug log when the OBD_FAIL_PTLRPC_DUMP_LOG fail point fires.
 */
356 int ll_file_release(struct inode *inode, struct file *file)
358 struct ll_file_data *fd;
359 struct ll_sb_info *sbi = ll_i2sbi(inode);
360 struct ll_inode_info *lli = ll_i2info(inode);
364 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
365 PFID(ll_inode2fid(inode)), inode);
367 #ifdef CONFIG_FS_POSIX_ACL
368 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
369 inode == inode->i_sb->s_root->d_inode) {
370 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
373 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
374 fd->fd_flags &= ~LL_FILE_RMTACL;
375 rct_del(&sbi->ll_rct, current_pid());
376 et_search_free(&sbi->ll_et, current_pid());
381 if (inode->i_sb->s_root != file->f_dentry)
382 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
383 fd = LUSTRE_FPRIVATE(file);
386 /* The last ref on @file, maybe not the owner pid of statahead.
387 * Different processes can open the same dir, "ll_opendir_key" means:
388 * it is me that should stop the statahead thread. */
389 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
390 lli->lli_opendir_pid != 0)
391 ll_stop_statahead(inode, lli->lli_opendir_key);
393 if (inode->i_sb->s_root == file->f_dentry) {
394 LUSTRE_FPRIVATE(file) = NULL;
395 ll_file_data_put(fd);
399 if (!S_ISDIR(inode->i_mode)) {
400 if (lli->lli_clob != NULL)
401 lov_read_and_clear_async_rc(lli->lli_clob);
402 lli->lli_async_rc = 0;
405 rc = ll_md_close(sbi->ll_md_exp, inode, file);
407 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
408 libcfs_debug_dumplog();
/*
 * Send the open intent @itp to the MDS for the dentry behind @file.
 *
 * The caller must already have set MDS_OPEN_BY_FID in itp->it_flags
 * (asserted).  The dentry name is packed into the request only when the
 * server lacks OBD_CONNECT_OPEN_BY_FID and the name passes
 * lu_name_is_valid_2().  @lmm and @lmmsize are handed to the request as
 * op_data and op_data_size.
 *
 * After md_intent_lock() (blocking AST ll_md_blocking_ast) the code handles
 * a missing or failed open disposition, maps DISP_LOOKUP_NEG to -ENOENT,
 * logs enqueue or open errors at D_VFSTRACE, refreshes the inode with
 * ll_prep_inode() and, when a lock mode was granted, attaches the lock to
 * the inode with ll_set_lock_data().  The request is finished and the intent
 * lock dropped at the end.
 *
 * NOTE(review): a failure of ll_prep_md_op_data() is returned via PTR_ERR();
 * the test guarding that return and the exit labels are not visible here.
 */
413 static int ll_intent_file_open(struct file *file, void *lmm, int lmmsize,
414 struct lookup_intent *itp)
416 struct dentry *de = file->f_dentry;
417 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
418 struct dentry *parent = de->d_parent;
419 const char *name = NULL;
421 struct md_op_data *op_data;
422 struct ptlrpc_request *req = NULL;
426 LASSERT(parent != NULL);
427 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
429 /* if server supports open-by-fid, or file name is invalid, don't pack
430 * name in open request */
431 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
432 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
433 name = de->d_name.name;
434 len = de->d_name.len;
437 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
438 name, len, 0, LUSTRE_OPC_ANY, NULL);
440 RETURN(PTR_ERR(op_data));
441 op_data->op_data = lmm;
442 op_data->op_data_size = lmmsize;
444 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
445 &ll_md_blocking_ast, 0);
446 ll_finish_md_op_data(op_data);
448 /* Keep a separate exit path here - don't flood the log
449 * with messages about -ESTALE errors.
451 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
452 it_open_error(DISP_OPEN_OPEN, itp))
454 ll_release_openhandle(de, itp);
458 if (it_disposition(itp, DISP_LOOKUP_NEG))
459 GOTO(out, rc = -ENOENT);
461 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
462 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
463 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
467 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
468 if (!rc && itp->d.lustre.it_lock_mode)
469 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
472 ptlrpc_req_finished(req);
473 ll_intent_drop_lock(itp);
479 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
480 * not believe attributes if a few ioepoch holders exist. Attributes for
481 * previous ioepoch if new one is opened are also skipped by MDS.
/*
 * Store @ioepoch in lli->lli_ioepoch when it is non-zero and differs from
 * the value already recorded, and log the newly opened epoch at D_INODE.
 * A zero or unchanged @ioepoch leaves the inode info untouched.
 */
483 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
485 if (ioepoch && lli->lli_ioepoch != ioepoch) {
486 lli->lli_ioepoch = ioepoch;
487 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
488 ioepoch, PFID(&lli->lli_fid));
/*
 * Initialise open handle @och from the reply to the open request held in
 * it->d.lustre.it_data.
 *
 * The file handle and fid come from the MDT body of the reply, the lease
 * handle cookie from it_lock_handle, and the open flags from it->it_flags;
 * och_magic is set to OBD_CLIENT_HANDLE_MAGIC.
 *
 * Returns the result of md_set_open_replay_data() on @md_exp.
 *
 * NOTE(review): no check of the reply body pointer is visible before it is
 * dereferenced; confirm that the body is guaranteed to be present here.
 */
492 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
493 struct obd_client_handle *och)
495 struct ptlrpc_request *req = it->d.lustre.it_data;
496 struct mdt_body *body;
498 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
499 och->och_fh = body->mbo_handle;
500 och->och_fid = body->mbo_fid1;
501 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
502 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
503 och->och_flags = it->it_flags;
505 return md_set_open_replay_data(md_exp, och, it);
/*
 * Client-side half of an open: bind descriptor @fd to @file.
 *
 * @file must not have private data yet (asserted).  The open handle is
 * filled from the intent reply with ll_och_fill() and the IO epoch of the
 * reply body is recorded with ll_ioepoch_open().  @fd then becomes the
 * private data of @file, its readahead state is initialised, fd_omode keeps
 * the READ/WRITE/EXEC bits of it->it_flags, and the lock and list used for
 * the ll_cl_context are initialised.
 *
 * NOTE(review): ll_file_open() passes a NULL @och when it reuses an
 * existing handle, so the och-related steps are presumably guarded; that
 * guard and the return paths are not visible in this view.
 */
508 static int ll_local_open(struct file *file, struct lookup_intent *it,
509 struct ll_file_data *fd, struct obd_client_handle *och)
511 struct inode *inode = file->f_dentry->d_inode;
512 struct ll_inode_info *lli = ll_i2info(inode);
515 LASSERT(!LUSTRE_FPRIVATE(file));
520 struct ptlrpc_request *req = it->d.lustre.it_data;
521 struct mdt_body *body;
524 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
528 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
529 ll_ioepoch_open(lli, body->mbo_ioepoch);
532 LUSTRE_FPRIVATE(file) = fd;
533 ll_readahead_init(inode, &fd->fd_ras);
534 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
536 /* ll_cl_context initialize */
537 rwlock_init(&fd->fd_lock);
538 INIT_LIST_HEAD(&fd->fd_lccs);
543 /* Open a file, and (for the very first open) create objects on the OSTs at
544 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
545 * creation or open until ll_lov_setstripe() ioctl is called.
547 * If we already have the stripe MD locally then we don't request it in
548 * md_open(), by passing a lmm_size = 0.
550 * It is up to the application to ensure no other processes open this file
551 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
552 * used. We might be able to avoid races of that sort by getting lli_open_sem
553 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
554 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/*
 * Open @file on @inode.
 *
 * Outline of the visible code:
 *  - take the lookup intent left in file->private_data, if any, and clear
 *    that pointer so that ll_local_open() does not trip its assertion;
 *  - allocate the descriptor with ll_file_data_get() (-ENOMEM on failure);
 *  - for a directory, claim the statahead opendir key and pid under
 *    lli_sa_lock when nobody owns them yet;
 *  - for the filesystem root dentry, just attach the descriptor;
 *  - without a usable intent, build the local one (oit) from f_flags:
 *    derive the access mode, add FMODE_WRITE for O_TRUNC, add
 *    MDS_OPEN_OWNEROVERRIDE for read or write opens, drop O_EXCL and add
 *    IT_CREAT for O_CREAT;
 *  - pick the per-mode MDS open handle and counter (write, exec, read) and,
 *    under lli_och_mutex, either reuse an existing handle (releasing a
 *    redundant open from the intent) or obtain one, calling
 *    ll_intent_file_open() with MDS_OPEN_LOCK | MDS_OPEN_BY_FID when the
 *    intent carries no disposition, then allocate the handle and finish
 *    with ll_local_open();
 *  - for regular files, handle delayed object creation outside the mutex;
 *  - on the error path free a newly allocated handle, stop statahead that
 *    this open set up, free the descriptor, and drop the request reference
 *    when DISP_ENQ_OPEN_REF is set.
 *
 * NOTE(review): labels, several conditions and the use-count updates are
 * not visible in this view; treat the outline as a reading aid only.
 */
556 int ll_file_open(struct inode *inode, struct file *file)
558 struct ll_inode_info *lli = ll_i2info(inode);
559 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
560 .it_flags = file->f_flags };
561 struct obd_client_handle **och_p = NULL;
562 __u64 *och_usecount = NULL;
563 struct ll_file_data *fd;
564 int rc = 0, opendir_set = 0;
567 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
568 PFID(ll_inode2fid(inode)), inode, file->f_flags);
570 it = file->private_data; /* XXX: compat macro */
571 file->private_data = NULL; /* prevent ll_local_open assertion */
573 fd = ll_file_data_get();
575 GOTO(out_openerr, rc = -ENOMEM);
578 if (S_ISDIR(inode->i_mode)) {
579 spin_lock(&lli->lli_sa_lock);
580 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
581 lli->lli_opendir_pid == 0) {
582 lli->lli_opendir_key = fd;
583 lli->lli_opendir_pid = current_pid();
586 spin_unlock(&lli->lli_sa_lock);
589 if (inode->i_sb->s_root == file->f_dentry) {
590 LUSTRE_FPRIVATE(file) = fd;
594 if (!it || !it->d.lustre.it_disposition) {
595 /* Convert f_flags into access mode. We cannot use file->f_mode,
596 * because everything but O_ACCMODE mask was stripped from
598 if ((oit.it_flags + 1) & O_ACCMODE)
600 if (file->f_flags & O_TRUNC)
601 oit.it_flags |= FMODE_WRITE;
603 /* kernel only call f_op->open in dentry_open. filp_open calls
604 * dentry_open after call to open_namei that checks permissions.
605 * Only nfsd_open call dentry_open directly without checking
606 * permissions and because of that this code below is safe. */
607 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
608 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
610 /* We do not want O_EXCL here, presumably we opened the file
611 * already? XXX - NFS implications? */
612 oit.it_flags &= ~O_EXCL;
614 /* bug20584, if "it_flags" contains O_CREAT, the file will be
615 * created if necessary, then "IT_CREAT" should be set to keep
616 * consistent with it */
617 if (oit.it_flags & O_CREAT)
618 oit.it_op |= IT_CREAT;
624 /* Let's see if we have file open on MDS already. */
625 if (it->it_flags & FMODE_WRITE) {
626 och_p = &lli->lli_mds_write_och;
627 och_usecount = &lli->lli_open_fd_write_count;
628 } else if (it->it_flags & FMODE_EXEC) {
629 och_p = &lli->lli_mds_exec_och;
630 och_usecount = &lli->lli_open_fd_exec_count;
632 och_p = &lli->lli_mds_read_och;
633 och_usecount = &lli->lli_open_fd_read_count;
636 mutex_lock(&lli->lli_och_mutex);
637 if (*och_p) { /* Open handle is present */
638 if (it_disposition(it, DISP_OPEN_OPEN)) {
639 /* Well, there's extra open request that we do not need,
640 let's close it somehow. This will decref request. */
641 rc = it_open_error(DISP_OPEN_OPEN, it);
643 mutex_unlock(&lli->lli_och_mutex);
644 GOTO(out_openerr, rc);
647 ll_release_openhandle(file->f_dentry, it);
651 rc = ll_local_open(file, it, fd, NULL);
654 mutex_unlock(&lli->lli_och_mutex);
655 GOTO(out_openerr, rc);
658 LASSERT(*och_usecount == 0);
659 if (!it->d.lustre.it_disposition) {
660 /* We cannot just request lock handle now, new ELC code
661 means that one of other OPEN locks for this file
662 could be cancelled, and since blocking ast handler
663 would attempt to grab och_mutex as well, that would
664 result in a deadlock */
665 mutex_unlock(&lli->lli_och_mutex);
667 * Normally called under two situations:
669 * 2. A race/condition on MDS resulting in no open
670 * handle to be returned from LOOKUP|OPEN request,
671 * for example if the target entry was a symlink.
673 * Always fetch MDS_OPEN_LOCK if this is not setstripe.
675 * Always specify MDS_OPEN_BY_FID because we don't want
676 * to get file with different fid.
678 it->it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID;
679 rc = ll_intent_file_open(file, NULL, 0, it);
681 GOTO(out_openerr, rc);
685 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
687 GOTO(out_och_free, rc = -ENOMEM);
691 /* md_intent_lock() didn't get a request ref if there was an
692 * open error, so don't do cleanup on the request here
694 /* XXX (green): Should not we bail out on any error here, not
695 * just open error? */
696 rc = it_open_error(DISP_OPEN_OPEN, it);
698 GOTO(out_och_free, rc);
700 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
701 "inode %p: disposition %x, status %d\n", inode,
702 it_disposition(it, ~0), it->d.lustre.it_status);
704 rc = ll_local_open(file, it, fd, *och_p);
706 GOTO(out_och_free, rc);
708 mutex_unlock(&lli->lli_och_mutex);
711 /* Must do this outside lli_och_mutex lock to prevent deadlock where
712 different kind of OPEN lock for this same inode gets cancelled
713 by ldlm_cancel_lru */
714 if (!S_ISREG(inode->i_mode))
715 GOTO(out_och_free, rc);
719 if (!lli->lli_has_smd &&
720 (cl_is_lov_delay_create(file->f_flags) ||
721 (file->f_mode & FMODE_WRITE) == 0)) {
722 CDEBUG(D_INODE, "object creation was delayed\n");
723 GOTO(out_och_free, rc);
725 cl_lov_delay_create_clear(&file->f_flags);
726 GOTO(out_och_free, rc);
730 if (och_p && *och_p) {
731 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
732 *och_p = NULL; /* OBD_FREE writes some magic there */
735 mutex_unlock(&lli->lli_och_mutex);
738 if (opendir_set != 0)
739 ll_stop_statahead(inode, lli->lli_opendir_key);
741 ll_file_data_put(fd);
743 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
746 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
747 ptlrpc_req_finished(it->d.lustre.it_data);
748 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST installed on lease locks by ll_lease_open().
 *
 * LDLM_CB_BLOCKING: the lock is converted to a handle and cancelled
 * asynchronously with ldlm_cli_cancel(LCF_ASYNC); a D_INODE message reports
 * the cancel result.
 * LDLM_CB_CANCELING: the case label is visible but its body is not.
 *
 * NOTE(review): the switch header, the condition around the debug message
 * and the return value are not visible in this view.
 */
754 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
755 struct ldlm_lock_desc *desc, void *data, int flag)
758 struct lustre_handle lockh;
762 case LDLM_CB_BLOCKING:
763 ldlm_lock2handle(lock, &lockh);
764 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
766 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
770 case LDLM_CB_CANCELING:
778 * Acquire a lease and open the file.
/*
 * Open @inode with a lease of mode @fmode and return the new open handle,
 * or an ERR_PTR() value on failure.
 *
 * Visible checks and steps:
 *  - @fmode must be exactly FMODE_WRITE or FMODE_READ (-EINVAL otherwise);
 *  - when a file is involved, @fmode must be contained in file->f_mode and
 *    the file must not be opened with FMODE_EXEC (-EPERM otherwise);
 *  - under lli_och_mutex, an already existing fd_lease_och ends the attempt,
 *    and when the descriptor has no private fd_och the shared write or read
 *    handle is examined together with its use count; a negative rc there
 *    means more than one opener;
 *  - the file handle of the existing open (old_handle) is sent in
 *    op_data->op_handle so the MDT can tell the open comes from the same
 *    owner;
 *  - the intent is enqueued with MDS_OPEN_LOCK | MDS_OPEN_BY_FID |
 *    MDS_OPEN_LEASE and LDLM flags LDLM_FL_NO_LRU | LDLM_FL_EXCL, using
 *    ll_md_blocking_lease_ast as the blocking AST;
 *  - DISP_LOOKUP_NEG maps to -ENOENT, a missing DISP_OPEN_LEASE to
 *    -EOPNOTSUPP (old server), and a lease without an OPEN lock to -EPROTO;
 *  - on the close path the open lock is cancelled, the handle is closed
 *    with ll_close_inode_openhandle() (which frees it) and och is reset.
 *
 * NOTE(review): the return value of ll_och_fill() is not checked in the
 * visible code.  The allocation of och, the open_flags parameter line, the
 * labels and the final return are not visible in this view.
 */
780 static struct obd_client_handle *
781 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
784 struct lookup_intent it = { .it_op = IT_OPEN };
785 struct ll_sb_info *sbi = ll_i2sbi(inode);
786 struct md_op_data *op_data;
787 struct ptlrpc_request *req = NULL;
788 struct lustre_handle old_handle = { 0 };
789 struct obd_client_handle *och = NULL;
794 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
795 RETURN(ERR_PTR(-EINVAL));
798 struct ll_inode_info *lli = ll_i2info(inode);
799 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
800 struct obd_client_handle **och_p;
803 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
804 RETURN(ERR_PTR(-EPERM));
806 /* Get the openhandle of the file */
808 mutex_lock(&lli->lli_och_mutex);
809 if (fd->fd_lease_och != NULL) {
810 mutex_unlock(&lli->lli_och_mutex);
814 if (fd->fd_och == NULL) {
815 if (file->f_mode & FMODE_WRITE) {
816 LASSERT(lli->lli_mds_write_och != NULL);
817 och_p = &lli->lli_mds_write_och;
818 och_usecount = &lli->lli_open_fd_write_count;
820 LASSERT(lli->lli_mds_read_och != NULL);
821 och_p = &lli->lli_mds_read_och;
822 och_usecount = &lli->lli_open_fd_read_count;
824 if (*och_usecount == 1) {
831 mutex_unlock(&lli->lli_och_mutex);
832 if (rc < 0) /* more than 1 opener */
835 LASSERT(fd->fd_och != NULL);
836 old_handle = fd->fd_och->och_fh;
841 RETURN(ERR_PTR(-ENOMEM));
843 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
844 LUSTRE_OPC_ANY, NULL);
846 GOTO(out, rc = PTR_ERR(op_data));
848 /* To tell the MDT this openhandle is from the same owner */
849 op_data->op_handle = old_handle;
851 it.it_flags = fmode | open_flags;
852 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
853 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
854 &ll_md_blocking_lease_ast,
855 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
856 * it can be cancelled which may mislead applications that the lease is
858 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
859 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
860 * doesn't deal with openhandle, so normal openhandle will be leaked. */
861 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
862 ll_finish_md_op_data(op_data);
863 ptlrpc_req_finished(req);
865 GOTO(out_release_it, rc);
867 if (it_disposition(&it, DISP_LOOKUP_NEG))
868 GOTO(out_release_it, rc = -ENOENT);
870 rc = it_open_error(DISP_OPEN_OPEN, &it);
872 GOTO(out_release_it, rc);
874 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
875 ll_och_fill(sbi->ll_md_exp, &it, och);
877 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
878 GOTO(out_close, rc = -EOPNOTSUPP);
880 /* already get lease, handle lease lock */
881 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
882 if (it.d.lustre.it_lock_mode == 0 ||
883 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
884 /* open lock must return for lease */
885 CERROR(DFID "lease granted but no open lock, %d/"LPU64".\n",
886 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
887 it.d.lustre.it_lock_bits);
888 GOTO(out_close, rc = -EPROTO);
891 ll_intent_release(&it);
895 /* Cancel open lock */
896 if (it.d.lustre.it_lock_mode != 0) {
897 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
898 it.d.lustre.it_lock_mode);
899 it.d.lustre.it_lock_mode = 0;
900 och->och_lease_handle.cookie = 0ULL;
902 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
904 CERROR("%s: error closing file "DFID": %d\n",
905 ll_get_fsname(inode->i_sb, NULL, 0),
906 PFID(&ll_i2info(inode)->lli_fid), rc2);
907 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
909 ll_intent_release(&it);
917 * Release lease and close the file.
918 * It will check if the lease has ever broken.
/*
 * Release the lease held through @och on @inode and close the open handle.
 *
 * The lease lock is looked up from och->och_lease_handle and its cancel
 * state is sampled with ldlm_is_cancel() while the resource and lock are
 * locked.  The local flag starts out as true, so the lease is reported as
 * broken unless that sampling says otherwise.  The lease lock is cancelled
 * with ldlm_cli_cancel(), the flag is stored through @lease_broken when
 * that pointer is non-NULL, and the handle is finally closed with
 * ll_close_inode_openhandle() on the MD export of the superblock.
 *
 * NOTE(review): the conditions guarding the lock inspection and the
 * ldlm_cli_cancel() call, and the release of the lock reference, are not
 * visible in this view.
 */
920 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
923 struct ldlm_lock *lock;
924 bool cancelled = true;
928 lock = ldlm_handle2lock(&och->och_lease_handle);
930 lock_res_and_lock(lock);
931 cancelled = ldlm_is_cancel(lock);
932 unlock_res_and_lock(lock);
936 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
937 PFID(&ll_i2info(inode)->lli_fid), cancelled);
940 ldlm_cli_cancel(&och->och_lease_handle, 0);
941 if (lease_broken != NULL)
942 *lease_broken = cancelled;
944 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
949 /* Fills the obdo with the attributes for the lsm */
/*
 * Fetch object attributes for stripe metadata @lsm through data export @exp.
 *
 * The request obdo is tagged with the object id of @lsm, mode S_IFREG and
 * @ioepoch, and asks for id, type, size, blocks, block size, the three
 * timestamps, group, epoch and data version.  @capa is passed along in the
 * request info.  When @dv_flags contains LL_DV_WR_FLUSH or LL_DV_RD_FLUSH
 * the request also carries OBD_MD_FLFLAGS with OBD_FL_SRVLOCK, plus
 * OBD_FL_FLUSH for LL_DV_WR_FLUSH.
 *
 * The request is issued with obd_getattr_async() on a freshly prepared
 * ptlrpc set, waited for with ptlrpc_set_wait(), and the set is destroyed.
 * Afterwards o_valid is reduced to blocks, block size, timestamps, size,
 * data version and flags, and a write flush that the reply does not confirm
 * (OBD_MD_FLFLAGS with OBD_FL_FLUSH) is detected.
 *
 * NOTE(review): the assignment tying @obdo to the request info, the error
 * returns and the action taken on an unconfirmed flush are not visible in
 * this view.  @lsm must not be NULL (asserted).
 */
950 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
951 struct obd_capa *capa, struct obdo *obdo,
952 __u64 ioepoch, int dv_flags)
954 struct ptlrpc_request_set *set;
955 struct obd_info oinfo = { { { 0 } } };
960 LASSERT(lsm != NULL);
964 oinfo.oi_oa->o_oi = lsm->lsm_oi;
965 oinfo.oi_oa->o_mode = S_IFREG;
966 oinfo.oi_oa->o_ioepoch = ioepoch;
967 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
968 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
969 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
970 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
971 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
972 OBD_MD_FLDATAVERSION;
973 oinfo.oi_capa = capa;
974 if (dv_flags & (LL_DV_WR_FLUSH | LL_DV_RD_FLUSH)) {
975 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
976 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
977 if (dv_flags & LL_DV_WR_FLUSH)
978 oinfo.oi_oa->o_flags |= OBD_FL_FLUSH;
981 set = ptlrpc_prep_set();
983 CERROR("cannot allocate ptlrpc set: rc = %d\n", -ENOMEM);
986 rc = obd_getattr_async(exp, &oinfo, set);
988 rc = ptlrpc_set_wait(set);
989 ptlrpc_set_destroy(set);
992 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
993 OBD_MD_FLATIME | OBD_MD_FLMTIME |
994 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
995 OBD_MD_FLDATAVERSION | OBD_MD_FLFLAGS);
996 if (dv_flags & LL_DV_WR_FLUSH &&
997 !(oinfo.oi_oa->o_valid & OBD_MD_FLFLAGS &&
998 oinfo.oi_oa->o_flags & OBD_FL_FLUSH))
1005 * Performs the getattr on the inode and updates its fields.
1006 * If @sync != 0, perform the getattr under the server-side lock.
/*
 * Fetch object attributes for @inode into @obdo and refresh the inode.
 *
 * An MDS capability and the stripe metadata of the inode are obtained, then
 * ll_lsm_getattr() is called on the data export with @ioepoch; a non-zero
 * @sync requests LL_DV_RD_FLUSH.  The inode is refreshed from @obdo with
 * obdo_refresh_inode() using the valid bits of the reply, the resulting
 * size, blocks and block size are logged at D_INODE, and the stripe metadata
 * reference is dropped with ccc_inode_lsm_put().
 *
 * NOTE(review): ll_lsm_getattr() asserts a non-NULL lsm, while this function
 * later tests lsm for NULL; the check on rc, and the release of the
 * capability reference, are not visible in this view.
 */
1008 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
1009 __u64 ioepoch, int sync)
1011 struct obd_capa *capa = ll_mdscapa_get(inode);
1012 struct lov_stripe_md *lsm;
1016 lsm = ccc_inode_lsm_get(inode);
1017 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
1018 capa, obdo, ioepoch, sync ? LL_DV_RD_FLUSH : 0);
1021 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
1023 obdo_refresh_inode(inode, obdo, obdo->o_valid);
1024 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
1025 " blksize %lu\n", POSTID(oi), i_size_read(inode),
1026 (unsigned long long)inode->i_blocks,
1027 (unsigned long)ll_inode_blksize(inode));
1029 ccc_inode_lsm_put(inode, lsm);
/*
 * Merge the attributes cached for @inode with those of its cl object.
 *
 * Under the inode size lock, the inode timestamps are first reset to the
 * values stored in lli_lvb and a local lvb is initialised from the inode.
 * The cl object attributes are read under the object attribute lock; each of
 * atime, ctime and mtime in the local lvb is raised to the cl value when the
 * cl value is larger.  The inode size is then written from cat_size with
 * cl_isize_write_nolock(), i_blocks from cat_blocks, and the three
 * timestamps from the merged lvb.
 *
 * NOTE(review): the check on the result of cl_object_attr_get() that guards
 * the merge is not visible in this view.
 */
1033 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
1035 struct ll_inode_info *lli = ll_i2info(inode);
1036 struct cl_object *obj = lli->lli_clob;
1037 struct cl_attr *attr = ccc_env_thread_attr(env);
1043 ll_inode_size_lock(inode);
1044 /* merge timestamps the most recently obtained from mds with
1045 timestamps obtained from osts */
1046 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
1047 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
1048 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
1049 inode_init_lvb(inode, &lvb);
1051 cl_object_attr_lock(obj);
1052 rc = cl_object_attr_get(env, obj, attr);
1053 cl_object_attr_unlock(obj);
1056 if (lvb.lvb_atime < attr->cat_atime)
1057 lvb.lvb_atime = attr->cat_atime;
1058 if (lvb.lvb_ctime < attr->cat_ctime)
1059 lvb.lvb_ctime = attr->cat_ctime;
1060 if (lvb.lvb_mtime < attr->cat_mtime)
1061 lvb.lvb_mtime = attr->cat_mtime;
1063 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
1064 PFID(&lli->lli_fid), attr->cat_size);
1065 cl_isize_write_nolock(inode, attr->cat_size);
1067 inode->i_blocks = attr->cat_blocks;
1069 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1070 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1071 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1073 ll_inode_size_unlock(inode);
/*
 * Fill the stat buffer st with size, blocks and timestamps fetched for @lsm.
 *
 * The attributes are obtained with ll_lsm_getattr() on the data export of
 * @sbi, without a capability, with ioepoch 0 and no data-version flags, into
 * a zero-initialised local obdo; size, blocks, mtime, atime and ctime are
 * then copied into st.
 *
 * NOTE(review): the declaration of the st parameter and the rc check that
 * guards the copy are not visible in this view.
 */
1078 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1081 struct obdo obdo = { 0 };
1084 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1086 st->st_size = obdo.o_size;
1087 st->st_blocks = obdo.o_blocks;
1088 st->st_mtime = obdo.o_mtime;
1089 st->st_atime = obdo.o_atime;
1090 st->st_ctime = obdo.o_ctime;
/*
 * Decide whether access-time updates are disabled for @file.
 *
 * The conditions tested, in order: O_NOATIME on the file, S_NOATIME on the
 * inode, IS_NOATIME(inode), MNT_NOATIME or MNT_READONLY on the mount, and
 * for directories MNT_NODIRATIME on the mount or MS_NODIRATIME on the
 * superblock.  The existing comment notes the logic is adapted from
 * file_accessed() and touch_atime().
 *
 * NOTE(review): the return statements are not visible in this view;
 * presumably each matching condition yields true and the fall-through
 * yields false - confirm.
 */
1095 static bool file_is_noatime(const struct file *file)
1097 const struct vfsmount *mnt = file->f_path.mnt;
1098 const struct inode *inode = file->f_path.dentry->d_inode;
1100 /* Adapted from file_accessed() and touch_atime().*/
1101 if (file->f_flags & O_NOATIME)
1104 if (inode->i_flags & S_NOATIME)
1107 if (IS_NOATIME(inode))
1110 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1113 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1116 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/*
 * Initialise the generic part of @io from the open flags of @file.
 *
 * crw_nonblock mirrors O_NONBLOCK.  The write-specific fields take
 * wr_append from O_APPEND and wr_sync from O_SYNC or O_DIRECT (a further
 * operand of that expression is not visible here).  The io targets the cl
 * object of the inode.  The lock requirement defaults to CILR_MAYBE,
 * becomes CILR_NEVER with ci_no_srvlock set when ll_file_nolock() is true,
 * and CILR_MANDATORY for O_APPEND files.  ci_noatime is taken from
 * file_is_noatime().
 *
 * NOTE(review): the condition on @write that presumably guards the
 * write-specific fields is not visible in this view.
 */
1122 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
1124 struct inode *inode = file->f_dentry->d_inode;
1126 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1128 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1129 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1130 file->f_flags & O_DIRECT ||
1133 io->ci_obj = ll_i2info(inode)->lli_clob;
1134 io->ci_lockreq = CILR_MAYBE;
1135 if (ll_file_nolock(file)) {
1136 io->ci_lockreq = CILR_NEVER;
1137 io->ci_no_srvlock = 1;
1138 } else if (file->f_flags & O_APPEND) {
1139 io->ci_lockreq = CILR_MANDATORY;
1142 io->ci_noatime = file_is_noatime(file);
/*
 * Common read/write engine: run a cl_io of type @iot on @file for @count
 * bytes starting at *@ppos, using the arguments prepared in @args.
 *
 * Outline of the visible code:
 *  - the per-thread cl_io is set up with ll_io_init() and cl_io_rw_init();
 *  - when cl_io_rw_init() returns 0 the io is driven here: for the normal
 *    (iovec) subtype the iov, segment counts and iocb are copied from
 *    @args, writes without a group lock take lli_write_mutex interruptibly
 *    (-ERESTARTSYS when interrupted), and lli_trunc_sem is held for reading
 *    around the loop; the splice subtype copies the pipe and flags; an
 *    unknown subtype is reported with CERROR;
 *  - cl_io_loop() runs between ll_cl_add() and ll_cl_remove(), after which
 *    the semaphore and mutex are released;
 *  - otherwise cl_io_rw_init() already handled the io and ci_result is used;
 *  - when bytes were transferred (ci_nob > 0) the result becomes the byte
 *    count and *@ppos is updated from the io position;
 *  - after cl_io_fini(), an io flagged ci_need_restart with a result of 0
 *    or -ENODATA is restarted; partial progress is returned as a short
 *    read or write instead;
 *  - byte counters are tallied per direction, and fd_write_failed records
 *    whether the last write failed (-ERESTARTSYS does not count as failure).
 *
 * NOTE(review): the return type line, the labels and several conditions
 * are not visible in this view.
 */
1146 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1147 struct file *file, enum cl_io_type iot,
1148 loff_t *ppos, size_t count)
1150 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
1151 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1156 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zd\n",
1157 file->f_dentry->d_name.name, iot, *ppos, count);
1160 io = ccc_env_thread_io(env);
1161 ll_io_init(io, file, iot == CIT_WRITE);
1163 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1164 struct vvp_io *vio = vvp_env_io(env);
1165 struct ccc_io *cio = ccc_env_io(env);
1166 int write_mutex_locked = 0;
1168 cio->cui_fd = LUSTRE_FPRIVATE(file);
1169 vio->cui_io_subtype = args->via_io_subtype;
1171 switch (vio->cui_io_subtype) {
1173 cio->cui_iov = args->u.normal.via_iov;
1174 cio->cui_nrsegs = args->u.normal.via_nrsegs;
1175 cio->cui_tot_nrsegs = cio->cui_nrsegs;
1176 cio->cui_iocb = args->u.normal.via_iocb;
1177 if ((iot == CIT_WRITE) &&
1178 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1179 if (mutex_lock_interruptible(&lli->
1181 GOTO(out, result = -ERESTARTSYS);
1182 write_mutex_locked = 1;
1184 down_read(&lli->lli_trunc_sem);
1187 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1188 vio->u.splice.cui_flags = args->u.splice.via_flags;
1191 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
1195 ll_cl_add(file, env, io);
1196 result = cl_io_loop(env, io);
1197 ll_cl_remove(file, env);
1199 if (args->via_io_subtype == IO_NORMAL)
1200 up_read(&lli->lli_trunc_sem);
1201 if (write_mutex_locked)
1202 mutex_unlock(&lli->lli_write_mutex);
1204 /* cl_io_rw_init() handled IO */
1205 result = io->ci_result;
1208 if (io->ci_nob > 0) {
1209 result = io->ci_nob;
1210 *ppos = io->u.ci_wr.wr.crw_pos;
1214 cl_io_fini(env, io);
1215 /* If any bit been read/written (result != 0), we just return
1216 * short read/write instead of restart io. */
1217 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1218 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
1219 iot == CIT_READ ? "read" : "write",
1220 file->f_dentry->d_name.name, *ppos, count);
1221 LASSERTF(io->ci_nob == 0, "%zd\n", io->ci_nob);
1225 if (iot == CIT_READ) {
1227 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1228 LPROC_LL_READ_BYTES, result);
1229 } else if (iot == CIT_WRITE) {
1231 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1232 LPROC_LL_WRITE_BYTES, result);
1233 fd->fd_write_failed = false;
1234 } else if (result != -ERESTARTSYS) {
1235 fd->fd_write_failed = true;
1238 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1245 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Sanity-check a user iovec array before it is used for I/O.
 *
 * Walks the *nr_segs entries of \a iov.  A segment whose length, OR-ed
 * with the running total, is negative when viewed as ssize_t is rejected
 * (the comment in the loop names -EINVAL), and each segment is probed
 * with access_ok(VERIFY_READ, ...).  The length of a segment marked
 * "no good" is subtracted back out of the running total.
 *
 * NOTE(review): the statements that accumulate the total and that store
 * into *nr_segs and *count are not visible in this excerpt; the callers
 * in this file read the byte total back through \a count.
 */
1247 static int ll_file_get_iov_count(const struct iovec *iov,
1248 unsigned long *nr_segs, size_t *count)
1253 for (seg = 0; seg < *nr_segs; seg++) {
1254 const struct iovec *iv = &iov[seg];
1257 * If any segment has a negative length, or the cumulative
1258 * length ever wraps negative then return -EINVAL.
1261 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1263 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1268 cnt -= iv->iov_len; /* This segment is no good */
/*
 * Vectored read entry point.
 *
 * Runs the iovec through ll_file_get_iov_count(), borrows a cl
 * environment, packs iov/nr_segs/iocb into IO_NORMAL arguments and hands
 * the request to ll_file_io_generic() as a CIT_READ on iocb->ki_filp.
 * The file position is passed as &iocb->ki_pos.  The environment is put
 * back after the io; a failed cl_env_get() is reported as PTR_ERR(env).
 *
 * NOTE(review): \a pos is not referenced in the lines visible here; the
 * offset used is iocb->ki_pos.
 */
1275 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1276 unsigned long nr_segs, loff_t pos)
1279 struct vvp_io_args *args;
1285 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1289 env = cl_env_get(&refcheck);
1291 RETURN(PTR_ERR(env));
1293 args = vvp_env_args(env, IO_NORMAL);
/* the const qualifier of the caller's iovec is cast away for the args */
1294 args->u.normal.via_iov = (struct iovec *)iov;
1295 args->u.normal.via_nrsegs = nr_segs;
1296 args->u.normal.via_iocb = iocb;
1298 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1299 &iocb->ki_pos, count);
1300 cl_env_put(env, &refcheck);
/*
 * Single-buffer read entry point, implemented on top of
 * ll_file_aio_read().
 *
 * Wraps \a buf / \a count in the iovec kept in the environment's vvp
 * info, initializes the synchronous kiocb kept there with
 * ki_pos = *ppos, issues a one-segment ll_file_aio_read() and copies the
 * resulting kiocb->ki_pos back to *ppos before releasing the
 * environment.
 */
1304 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1308 struct iovec *local_iov;
1309 struct kiocb *kiocb;
1314 env = cl_env_get(&refcheck);
1316 RETURN(PTR_ERR(env));
1318 local_iov = &vvp_env_info(env)->vti_local_iov;
1319 kiocb = &vvp_env_info(env)->vti_kiocb;
1320 local_iov->iov_base = (void __user *)buf;
1321 local_iov->iov_len = count;
1322 init_sync_kiocb(kiocb, file);
1323 kiocb->ki_pos = *ppos;
/* which kiocb field receives the request length depends on
 * HAVE_KIOCB_KI_LEFT */
1324 #ifdef HAVE_KIOCB_KI_LEFT
1325 kiocb->ki_left = count;
1327 kiocb->ki_nbytes = count;
1330 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1331 *ppos = kiocb->ki_pos;
1333 cl_env_put(env, &refcheck);
1338 * Write to a file (through the page cache).
/*
 * Vectored write entry point.
 *
 * Runs the iovec through ll_file_get_iov_count(), borrows a cl
 * environment, packs iov/nr_segs/iocb into IO_NORMAL arguments and hands
 * the request to ll_file_io_generic() as a CIT_WRITE on iocb->ki_filp.
 * The file position is passed as &iocb->ki_pos.  The environment is put
 * back after the io; a failed cl_env_get() is reported as PTR_ERR(env).
 *
 * NOTE(review): \a pos is not referenced in the lines visible here; the
 * offset used is iocb->ki_pos.
 */
1341 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1342 unsigned long nr_segs, loff_t pos)
1345 struct vvp_io_args *args;
1351 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1355 env = cl_env_get(&refcheck);
1357 RETURN(PTR_ERR(env));
1359 args = vvp_env_args(env, IO_NORMAL);
/* the const qualifier of the caller's iovec is cast away for the args */
1360 args->u.normal.via_iov = (struct iovec *)iov;
1361 args->u.normal.via_nrsegs = nr_segs;
1362 args->u.normal.via_iocb = iocb;
1364 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1365 &iocb->ki_pos, count);
1366 cl_env_put(env, &refcheck);
/*
 * Single-buffer write entry point, implemented on top of
 * ll_file_aio_write().
 *
 * Wraps \a buf / \a count in the iovec kept in the environment's vvp
 * info, initializes the synchronous kiocb kept there with
 * ki_pos = *ppos, issues a one-segment ll_file_aio_write() and copies
 * the resulting kiocb->ki_pos back to *ppos before releasing the
 * environment.
 */
1370 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1374 struct iovec *local_iov;
1375 struct kiocb *kiocb;
1380 env = cl_env_get(&refcheck);
1382 RETURN(PTR_ERR(env));
1384 local_iov = &vvp_env_info(env)->vti_local_iov;
1385 kiocb = &vvp_env_info(env)->vti_kiocb;
1386 local_iov->iov_base = (void __user *)buf;
1387 local_iov->iov_len = count;
1388 init_sync_kiocb(kiocb, file);
1389 kiocb->ki_pos = *ppos;
/* which kiocb field receives the request length depends on
 * HAVE_KIOCB_KI_LEFT */
1390 #ifdef HAVE_KIOCB_KI_LEFT
1391 kiocb->ki_left = count;
1393 kiocb->ki_nbytes = count;
1396 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1397 *ppos = kiocb->ki_pos;
1399 cl_env_put(env, &refcheck);
1404 * Send file content (through pagecache) somewhere with helper
/*
 * Splice-read entry point.
 *
 * Borrows a cl environment, packs \a pipe and the splice flags into
 * IO_SPLICE arguments and runs ll_file_io_generic() as a CIT_READ on
 * \a in_file at *ppos for \a count bytes.  A failed cl_env_get() is
 * reported as PTR_ERR(env).
 */
1406 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1407 struct pipe_inode_info *pipe, size_t count,
1411 struct vvp_io_args *args;
1416 env = cl_env_get(&refcheck);
1418 RETURN(PTR_ERR(env));
1420 args = vvp_env_args(env, IO_SPLICE);
1421 args->u.splice.via_pipe = pipe;
1422 args->u.splice.via_flags = flags;
1424 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1425 cl_env_put(env, &refcheck);
/*
 * Ask the data export to recreate an object of \a inode.
 *
 * Takes a reference on the inode's lsm (-ENOENT when it has no
 * objects), duplicates it into a temporary buffer sized for
 * lsm_stripe_count lov_oinfo entries (-ENOMEM on failure), fills an obdo
 * flagged OBD_FL_RECREATE_OBJS with the OST index stored in o_nlink, and
 * calls obd_create() on the copy while holding the inode size lock.
 * The temporary lsm and the lsm reference are released afterwards.
 *
 * NOTE(review): how \a oi is stored into the obdo, and the allocation
 * and release of the obdo itself, are not visible in this excerpt.
 */
1429 static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
1432 struct obd_export *exp = ll_i2dtexp(inode);
1433 struct obd_trans_info oti = { 0 };
1434 struct obdo *oa = NULL;
1437 struct lov_stripe_md *lsm = NULL, *lsm2;
1444 lsm = ccc_inode_lsm_get(inode);
1445 if (!lsm_has_objects(lsm))
1446 GOTO(out, rc = -ENOENT);
1448 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1449 (lsm->lsm_stripe_count));
1451 OBD_ALLOC_LARGE(lsm2, lsm_size);
1453 GOTO(out, rc = -ENOMEM);
/* the OST index is passed in the o_nlink field of the obdo */
1456 oa->o_nlink = ost_idx;
1457 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1458 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1459 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1460 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1461 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
/* obd_create() is given the private copy, not the inode's own lsm */
1462 memcpy(lsm2, lsm, lsm_size);
1463 ll_inode_size_lock(inode);
1464 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1465 ll_inode_size_unlock(inode);
1467 OBD_FREE_LARGE(lsm2, lsm_size);
1470 ccc_inode_lsm_put(inode, lsm);
/*
 * Recreate an object described by a struct ll_recreate_obj in user
 * memory at \a arg (reached from LL_IOC_RECREATE_OBJ).
 *
 * Requires CFS_CAP_SYS_ADMIN.  The object id is built with the MDT0
 * sequence and ucreat.lrc_id, and the work is delegated to
 * ll_lov_recreate() with ucreat.lrc_ost_idx as the OST index.
 */
1475 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1477 struct ll_recreate_obj ucreat;
1481 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1484 if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1488 ostid_set_seq_mdt0(&oi);
1489 ostid_set_id(&oi, ucreat.lrc_id);
1490 RETURN(ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx));
/*
 * Recreate an object identified by a struct lu_fid in user memory at
 * \a arg (reached from LL_IOC_RECREATE_FID).
 *
 * Requires CFS_CAP_SYS_ADMIN.  The FID is converted to an ost_id with
 * fid_to_ostid() and the work is delegated to ll_lov_recreate().
 */
1493 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1500 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1503 if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1506 fid_to_ostid(&fid, &oi);
/* the OST index is taken from bits 16..31 of the FID sequence */
1507 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1508 RETURN(ll_lov_recreate(inode, &oi, ost_idx));
/*
 * Set the striping of \a inode by opening \a file with an IT_OPEN intent
 * that carries the user layout \a lum.
 *
 * The -EEXIST path logs "stripe already exists" (its guarding test is
 * not visible in this excerpt).  Otherwise, under the inode size lock,
 * the file is opened by FID through ll_intent_file_open(), the intent
 * status is propagated into rc, and the open handle obtained by the
 * intent is closed again with ll_release_openhandle().  The intent and
 * the lsm reference are released on the way out and the delayed-create
 * marker is cleared from file->f_flags.
 *
 * \param flags	open flags placed in the intent (MDS_OPEN_BY_FID is
 *		added here)
 */
1511 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1512 __u64 flags, struct lov_user_md *lum,
1515 struct lov_stripe_md *lsm = NULL;
1516 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1520 lsm = ccc_inode_lsm_get(inode);
1522 ccc_inode_lsm_put(inode, lsm);
1523 CDEBUG(D_IOCTL, "stripe already exists for inode "DFID"\n",
1524 PFID(ll_inode2fid(inode)));
1525 GOTO(out, rc = -EEXIST);
1528 ll_inode_size_lock(inode);
1529 oit.it_flags |= MDS_OPEN_BY_FID;
1530 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1532 GOTO(out_unlock, rc);
1533 rc = oit.d.lustre.it_status;
1535 GOTO(out_req_free, rc);
1537 ll_release_openhandle(file->f_dentry, &oit);
1540 ll_inode_size_unlock(inode);
1541 ll_intent_release(&oit);
1542 ccc_inode_lsm_put(inode, lsm);
1544 cl_lov_delay_create_clear(&file->f_flags);
1547 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the striping EA of the entry named \a filename under \a inode.
 *
 * Issues md_getattr_name() asking for OBD_MD_FLEASIZE | OBD_MD_FLDIREA
 * with a reply buffer sized by ll_get_default_mdsize().  The reply must
 * advertise an EA (-ENODATA otherwise) whose magic is LOV_MAGIC_V1 or
 * LOV_MAGIC_V3 (-EPROTO otherwise).  On hosts where LOV_MAGIC differs
 * from its little-endian encoding the layout is byte-swapped in place;
 * the per-object swab is done only when the reply describes a regular
 * file.
 *
 * \param lmm_size	receives the EA size reported by the server
 *
 * NOTE(review): the stores into *lmmp and *request are not visible in
 * this excerpt.
 */
1551 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1552 struct lov_mds_md **lmmp, int *lmm_size,
1553 struct ptlrpc_request **request)
1555 struct ll_sb_info *sbi = ll_i2sbi(inode);
1556 struct mdt_body *body;
1557 struct lov_mds_md *lmm = NULL;
1558 struct ptlrpc_request *req = NULL;
1559 struct md_op_data *op_data;
1562 rc = ll_get_default_mdsize(sbi, &lmmsize);
1566 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1567 strlen(filename), lmmsize,
1568 LUSTRE_OPC_ANY, NULL);
1569 if (IS_ERR(op_data))
1570 RETURN(PTR_ERR(op_data));
1572 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1573 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1574 ll_finish_md_op_data(op_data);
1576 CDEBUG(D_INFO, "md_getattr_name failed "
1577 "on %s: rc %d\n", filename, rc);
1581 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1582 LASSERT(body != NULL); /* checked by mdc_getattr_name */
/* from here on lmmsize is the EA size reported in the reply */
1584 lmmsize = body->mbo_eadatasize;
1586 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1588 GOTO(out, rc = -ENODATA);
1591 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1592 LASSERT(lmm != NULL);
1594 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1595 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1596 GOTO(out, rc = -EPROTO);
1600 * This is coming from the MDS, so is probably in
1601 * little endian. We convert it to host endian before
1602 * passing it to userspace.
1604 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1607 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1608 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1611 /* when called for a directory there are no lsm objects,
1612 * so the per-object swab is limited to regular files */
1613 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1614 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1615 if (S_ISREG(body->mbo_mode))
1616 lustre_swab_lov_user_md_objects(
1617 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1619 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1620 lustre_swab_lov_user_md_v3(
1621 (struct lov_user_md_v3 *)lmm);
1622 if (S_ISREG(body->mbo_mode))
1623 lustre_swab_lov_user_md_objects(
1624 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1631 *lmm_size = lmmsize;
/*
 * Set striping from a user layout that carries its own object
 * (reached from LL_IOC_LOV_SETEA).
 *
 * Requires CFS_CAP_SYS_ADMIN.  Copies one struct lov_user_md followed by
 * one struct lov_user_ost_data from \a arg into a temporary buffer and
 * passes it to ll_lov_setstripe_ea_info() with
 * MDS_OPEN_HAS_OBJS | FMODE_WRITE.  The buffer is freed on both the
 * copy-failure path and the normal path.
 */
1636 static int ll_lov_setea(struct inode *inode, struct file *file,
1639 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1640 struct lov_user_md *lump;
1641 int lum_size = sizeof(struct lov_user_md) +
1642 sizeof(struct lov_user_ost_data);
1646 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1649 OBD_ALLOC_LARGE(lump, lum_size);
1653 if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1654 OBD_FREE_LARGE(lump, lum_size);
1658 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1660 OBD_FREE_LARGE(lump, lum_size);
/*
 * Set striping from a v1 or v3 user layout at \a arg (reached from
 * LL_IOC_LOV_SETSTRIPE) and report the resulting layout back.
 *
 * The v1-sized prefix is copied first; when its magic is
 * LOV_USER_MAGIC_V3 the full v3 structure is copied over the same
 * storage (lumv1 aliases lumv3).  The layout is applied with
 * ll_lov_setstripe_ea_info() using FMODE_WRITE.  Afterwards the user's
 * lmm_stripe_count is zeroed, the layout is refreshed and the
 * LL_IOC_LOV_GETSTRIPE handler of the data export fills \a arg.
 *
 * NOTE(review): the second copy_from_user() re-reads the whole structure
 * from user space, so lmm_magic in lumv3 may differ from the value
 * tested just before it; nothing visible here re-validates it.  The
 * return value of put_user() is also ignored.  Confirm both.
 */
1664 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1667 struct lov_user_md_v3 lumv3;
1668 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1669 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1670 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1672 __u64 flags = FMODE_WRITE;
1675 /* first try with v1 which is smaller than v3 */
1676 lum_size = sizeof(struct lov_user_md_v1);
1677 if (copy_from_user(lumv1, lumv1p, lum_size))
1680 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1681 lum_size = sizeof(struct lov_user_md_v3);
1682 if (copy_from_user(&lumv3, lumv3p, lum_size))
1686 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1688 struct lov_stripe_md *lsm;
1691 put_user(0, &lumv1p->lmm_stripe_count);
1693 ll_layout_refresh(inode, &gen);
1694 lsm = ccc_inode_lsm_get(inode);
1695 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1696 0, lsm, (void *)arg);
1697 ccc_inode_lsm_put(inode, lsm);
/*
 * Report the striping of \a inode (reached from LL_IOC_LOV_GETSTRIPE).
 *
 * Takes a reference on the inode's lsm, forwards LL_IOC_LOV_GETSTRIPE to
 * the data export with obd_iocontrol() and drops the reference again.
 *
 * NOTE(review): the trailing arguments of the obd_iocontrol() call are
 * not visible in this excerpt.
 */
1702 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1704 struct lov_stripe_md *lsm;
1708 lsm = ccc_inode_lsm_get(inode);
1710 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1712 ccc_inode_lsm_put(inode, lsm);
/*
 * Take a group lock with group id \a arg on behalf of \a file.
 *
 * Not supported (-EOPNOTSUPP) on files opened without locking.  The
 * LL_FILE_GROUP_LOCKED flag is checked under lli_lock first, a warning
 * is logged when the descriptor already holds a group lock, and the
 * lock is then requested with cl_get_grouplock() outside the spinlock
 * (the O_NONBLOCK bit of file->f_flags is passed as its third
 * argument).  The flag is checked again under lli_lock afterwards: when
 * another thread installed a group lock in the meantime, the lock just
 * obtained is put back; otherwise it is recorded in fd->fd_grouplock
 * and the flag is set.
 *
 * NOTE(review): the values returned on the "already locked" and
 * "lost the race" paths are not visible in this excerpt.
 */
1717 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1719 struct ll_inode_info *lli = ll_i2info(inode);
1720 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1721 struct ccc_grouplock grouplock;
1725 if (ll_file_nolock(file))
1726 RETURN(-EOPNOTSUPP);
1728 spin_lock(&lli->lli_lock);
1729 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1730 CWARN("group lock already existed with gid %lu\n",
1731 fd->fd_grouplock.cg_gid);
1732 spin_unlock(&lli->lli_lock);
1735 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1736 spin_unlock(&lli->lli_lock);
1738 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1739 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* lli_lock was dropped around cl_get_grouplock(), so re-check the flag */
1743 spin_lock(&lli->lli_lock);
1744 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1745 spin_unlock(&lli->lli_lock);
1746 CERROR("another thread just won the race\n");
1747 cl_put_grouplock(&grouplock);
1751 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1752 fd->fd_grouplock = grouplock;
1753 spin_unlock(&lli->lli_lock);
1755 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * Release the group lock held through \a file.
 *
 * Under lli_lock: a warning is logged when the descriptor holds no
 * group lock or when \a arg does not match the recorded group id.
 * Otherwise the recorded lock is copied to a local, the descriptor's
 * copy is zeroed and LL_FILE_GROUP_LOCKED is cleared.  The lock itself
 * is put with cl_put_grouplock() after the spinlock has been dropped.
 *
 * NOTE(review): the values returned on the two warning paths are not
 * visible in this excerpt.
 */
1759 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1761 struct ll_inode_info *lli = ll_i2info(inode);
1762 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1763 struct ccc_grouplock grouplock;
1766 spin_lock(&lli->lli_lock);
1767 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1768 spin_unlock(&lli->lli_lock);
1769 CWARN("no group lock held\n");
1772 LASSERT(fd->fd_grouplock.cg_lock != NULL);
1774 if (fd->fd_grouplock.cg_gid != arg) {
1775 CWARN("group lock %lu doesn't match current id %lu\n",
1776 arg, fd->fd_grouplock.cg_gid);
1777 spin_unlock(&lli->lli_lock);
/* detach the lock from the descriptor first; it is put below, after
 * lli_lock has been released */
1781 grouplock = fd->fd_grouplock;
1782 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1783 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1784 spin_unlock(&lli->lli_lock);
1786 cl_put_grouplock(&grouplock);
1787 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1792 * Close inode open handle
1794 * \param dentry [in] dentry which contains the inode
1795 * \param it [in,out] intent which contains open info and result
1798 * \retval <0 failure
/*
 * Implementation notes: nothing is closed for the root dentry of the
 * super block or when the intent lacks DISP_OPEN_OPEN.  Otherwise a
 * temporary obd_client_handle is allocated (-ENOMEM on failure), filled
 * from the intent with ll_och_fill() and closed through
 * ll_close_inode_openhandle() on the metadata export.  When the intent
 * carries DISP_ENQ_OPEN_REF, the request stored in it is finished and
 * that disposition bit is cleared.
 */
1800 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1802 struct inode *inode = dentry->d_inode;
1803 struct obd_client_handle *och;
1809 /* Root ? Do nothing. */
1810 if (dentry->d_inode->i_sb->s_root == dentry)
1813 /* No open handle to close? Move away */
1814 if (!it_disposition(it, DISP_OPEN_OPEN))
1817 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1819 OBD_ALLOC(och, sizeof(*och));
1821 GOTO(out, rc = -ENOMEM);
1823 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1825 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1828 /* this one is in place of ll_file_open */
1829 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1830 ptlrpc_req_finished(it->d.lustre.it_data);
1831 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1837 * Get size for inode for which FIEMAP mapping is requested.
1838 * Make the FIEMAP get_info call and return the result.
/*
 * Implementation notes:
 *  - when fm_flags contains bits outside LUSTRE_FIEMAP_FLAGS_COMPAT,
 *    fm_flags is reduced to just those unsupported bits;
 *  - FIEMAP_FLAG_SYNC starts writeback of the inode's mapping with
 *    filemap_fdatawrite() first;
 *  - a file with more than one stripe is refused with -EOPNOTSUPP unless
 *    the caller passed FIEMAP_FLAG_DEVICE_ORDER;
 *  - a zero o_size (filled in by obdo_from_inode()) yields zero mapped
 *    extents;
 *  - otherwise the request is copied into the KEY_FIEMAP key and sent
 *    to the data export with obd_get_info(); vallen starts as
 *    \a num_bytes.
 * The lsm reference taken here is dropped before leaving.
 *
 * NOTE(review): the return statements of the early-exit paths are not
 * visible in this excerpt.
 */
1840 static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1843 struct obd_export *exp = ll_i2dtexp(inode);
1844 struct lov_stripe_md *lsm = NULL;
1845 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1846 __u32 vallen = num_bytes;
1850 /* Checks for fiemap flags */
1851 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1852 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1856 /* Check for FIEMAP_FLAG_SYNC */
1857 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1858 rc = filemap_fdatawrite(inode->i_mapping);
1863 lsm = ccc_inode_lsm_get(inode);
1867 /* If the stripe_count > 1 and the application does not understand
1868 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1870 if (lsm->lsm_stripe_count > 1 &&
1871 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1872 GOTO(out, rc = -EOPNOTSUPP);
1874 fm_key.oa.o_oi = lsm->lsm_oi;
1875 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1877 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1878 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1879 /* If filesize is 0, then there would be no objects for mapping */
1880 if (fm_key.oa.o_size == 0) {
1881 fiemap->fm_mapped_extents = 0;
1885 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1887 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1890 CERROR("obd_get_info failed: rc = %d\n", rc);
1893 ccc_inode_lsm_put(inode, lsm);
/*
 * Resolve a FID to a path for user space (reached from
 * OBD_IOC_FID2PATH).
 *
 * The caller needs CFS_CAP_DAC_READ_SEARCH unless the super block has
 * LL_SBI_USER_FID2PATH set.  The user's gf_pathlen is read first and
 * must not exceed PATH_MAX; a kernel buffer of sizeof(*gfout) + pathlen
 * bytes is allocated, the fixed-size header is copied in from \a arg,
 * OBD_IOC_FID2PATH is forwarded to the metadata export, and the whole
 * buffer is copied back to \a arg.  The buffer is freed on the way out.
 *
 * NOTE(review): gf_pathlen is fetched from user space twice (get_user()
 * and then as part of the header copy), so the value inside gfout may
 * differ from the pathlen used to size the buffer; nothing visible here
 * reconciles the two.  Confirm how the callee bounds its writes.
 */
1897 int ll_fid2path(struct inode *inode, void __user *arg)
1899 struct obd_export *exp = ll_i2mdexp(inode);
1900 const struct getinfo_fid2path __user *gfin = arg;
1902 struct getinfo_fid2path *gfout;
1908 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1909 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1912 /* Only need to get the buflen */
1913 if (get_user(pathlen, &gfin->gf_pathlen))
1916 if (pathlen > PATH_MAX)
1919 outsize = sizeof(*gfout) + pathlen;
1920 OBD_ALLOC(gfout, outsize);
1924 if (copy_from_user(gfout, arg, sizeof(*gfout)))
1925 GOTO(gf_free, rc = -EFAULT);
1927 /* Call mdc_iocontrol */
1928 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1932 if (copy_to_user(arg, gfout, outsize))
1936 OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP handler: marshal a struct ll_user_fiemap between
 * user space and ll_do_fiemap().
 *
 * The user's fm_extent_count sizes a kernel buffer of
 * sizeof(*fiemap_s) + extent_count * sizeof(struct ll_fiemap_extent)
 * bytes; the count is compared against a SIZE_MAX-derived bound before
 * that size is computed.  The header, and the first extent, are copied
 * in (-EFAULT on failure), ll_do_fiemap() is run, and the header plus
 * fm_mapped_extents extents (when extent_count is non-zero) are copied
 * back to \a arg.  The buffer is freed on the way out.
 *
 * NOTE(review): fm_extent_count is fetched from user space twice
 * (get_user() and then as part of the header copy), so the value inside
 * fiemap_s may differ from the extent_count used to size the buffer,
 * and ret_bytes is derived from fm_mapped_extents without a visible
 * bound against num_bytes.  Confirm.
 */
1940 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1942 struct ll_user_fiemap *fiemap_s;
1943 size_t num_bytes, ret_bytes;
1944 unsigned int extent_count;
1947 /* Get the extent count so we can calculate the size of
1948 * required fiemap buffer */
1949 if (get_user(extent_count,
1950 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1954 (SIZE_MAX - sizeof(*fiemap_s)) / sizeof(struct ll_fiemap_extent))
1956 num_bytes = sizeof(*fiemap_s) + (extent_count *
1957 sizeof(struct ll_fiemap_extent));
1959 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1960 if (fiemap_s == NULL)
1963 /* get the fiemap value */
1964 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1966 GOTO(error, rc = -EFAULT);
1968 /* If fm_extent_count is non-zero, read the first extent since
1969 * it is used to calculate end_offset and device from previous
1972 if (copy_from_user(&fiemap_s->fm_extents[0],
1973 (char __user *)arg + sizeof(*fiemap_s),
1974 sizeof(struct ll_fiemap_extent)))
1975 GOTO(error, rc = -EFAULT);
1978 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1982 ret_bytes = sizeof(struct ll_user_fiemap);
1984 if (extent_count != 0)
1985 ret_bytes += (fiemap_s->fm_mapped_extents *
1986 sizeof(struct ll_fiemap_extent));
1988 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1992 OBD_FREE_LARGE(fiemap_s, num_bytes);
1997 * Read the data_version for inode.
1999 * This value is computed using stripe object version on OST.
2000 * Version is computed using server side locking.
2002 * @param flags whether to sync on the OST side;
2004 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2005 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/*
 * Implementation notes: takes a reference on the inode's lsm; the
 * "no objects" case only logs here (the existing comment says the
 * version is then considered 0).  Otherwise an obdo is allocated
 * (-ENOMEM on failure), filled by ll_lsm_getattr() on the data export
 * with \a flags passed through, and its o_data_version is stored in
 * *data_version; OBD_MD_FLDATAVERSION in o_valid is tested before that
 * store.  The lsm reference is dropped before leaving.
 *
 * NOTE(review): the statements executed when the lsm has no objects or
 * when OBD_MD_FLDATAVERSION is missing, and the release of the obdo,
 * are not visible in this excerpt.
 */
2007 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2009 struct lov_stripe_md *lsm = NULL;
2010 struct ll_sb_info *sbi = ll_i2sbi(inode);
2011 struct obdo *obdo = NULL;
2015 /* If no stripe, we consider version is 0. */
2016 lsm = ccc_inode_lsm_get(inode);
2017 if (!lsm_has_objects(lsm)) {
2019 CDEBUG(D_INODE, "No object for inode\n");
2023 OBD_ALLOC_PTR(obdo);
2025 GOTO(out, rc = -ENOMEM);
2027 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, flags);
2029 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
2032 *data_version = obdo->o_data_version;
2038 ccc_inode_lsm_put(inode, lsm);
2043 * Trigger a HSM release request for the provided inode.
/*
 * Implementation notes: opens a write lease on \a inode with
 * MDS_OPEN_RELEASE, reads the data version with LL_DV_WR_FLUSH, merges
 * the lvb into the inode inside a nested cl environment, and then
 * closes the open handle through ll_close_inode_openhandle() on the
 * metadata export.  On the exit path a handle that is still set (and is
 * not an error pointer) is closed with ll_lease_close().
 *
 * NOTE(review): the trailing arguments of the close call and the
 * statements that decide whether och is still set on exit are not
 * visible in this excerpt.
 */
2045 int ll_hsm_release(struct inode *inode)
2047 struct cl_env_nest nest;
2049 struct obd_client_handle *och = NULL;
2050 __u64 data_version = 0;
2054 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2055 ll_get_fsname(inode->i_sb, NULL, 0),
2056 PFID(&ll_i2info(inode)->lli_fid));
2058 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2060 GOTO(out, rc = PTR_ERR(och));
2062 /* Grab latest data_version and [am]time values */
2063 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2067 env = cl_env_nested_get(&nest);
2069 GOTO(out, rc = PTR_ERR(env));
2071 ll_merge_lvb(env, inode);
2072 cl_env_nested_put(&nest, env);
2074 /* Release the file.
2075 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2076 * we still need it to pack l_remote_handle to MDT. */
2077 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
2083 if (och != NULL && !IS_ERR(och)) /* close the file */
2084 ll_lease_close(och, inode, NULL);
/*
 * Scratch state of ll_swap_layouts(); that function allocates it with
 * OBD_ALLOC_PTR() rather than declaring it as a local.  Members come in
 * pairs, one per file taking part in the swap: ia1/ia2 hold the saved
 * times, inode1/inode2 the inodes, check_dv1/check_dv2 whether the
 * matching data version has to be verified.
 */
2089 struct ll_swap_stack {
2090 struct iattr ia1, ia2;
2092 struct inode *inode1, *inode2;
2093 bool check_dv1, check_dv2;
/*
 * Swap the layouts of the files behind \a file1 and \a file2 (reached
 * from LL_IOC_LOV_SWAP_LAYOUTS).
 *
 * Checks visible here: the second inode must be a regular file
 * (-EINVAL), both inodes must pass inode_permission(MAY_WRITE) (-EPERM)
 * and both must belong to the same super block (-EXDEV).  Two identical
 * FIDs mean there is nothing to do.
 *
 * Sequence: note which data versions the caller wants verified, order
 * the pair by FID, take group locks on both files when gid is
 * non-zero, save mtime/atime when either KEEP flag is set, verify the
 * requested data versions (-EAGAIN on mismatch), send
 * LL_IOC_LOV_SWAP_LAYOUTS to the metadata export, drop the group locks,
 * and finally restore the saved times with ll_setattr() under each
 * inode's i_mutex.
 *
 * \param lsl	user request; sl_flags, sl_dv1 and sl_dv2 are read here
 *
 * NOTE(review): labels, several declarations and the assignment of gid
 * are not visible in this excerpt.
 */
2096 static int ll_swap_layouts(struct file *file1, struct file *file2,
2097 struct lustre_swap_layouts *lsl)
2099 struct mdc_swap_layouts msl;
2100 struct md_op_data *op_data;
2103 struct ll_swap_stack *llss = NULL;
2106 OBD_ALLOC_PTR(llss);
2110 llss->inode1 = file1->f_dentry->d_inode;
2111 llss->inode2 = file2->f_dentry->d_inode;
2113 if (!S_ISREG(llss->inode2->i_mode))
2114 GOTO(free, rc = -EINVAL);
2116 if (inode_permission(llss->inode1, MAY_WRITE) ||
2117 inode_permission(llss->inode2, MAY_WRITE))
2118 GOTO(free, rc = -EPERM);
2120 if (llss->inode2->i_sb != llss->inode1->i_sb)
2121 GOTO(free, rc = -EXDEV);
2123 /* we use 2 bool because it is easier to swap than 2 bits */
2124 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2125 llss->check_dv1 = true;
2127 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2128 llss->check_dv2 = true;
2130 /* we cannot use lsl->sl_dvX directly because we may swap them */
2131 llss->dv1 = lsl->sl_dv1;
2132 llss->dv2 = lsl->sl_dv2;
/* Order the pair by FID; presumably so that concurrent swaps of the
 * same two files take their group locks in the same order (TODO
 * confirm).  Everything kept per file in llss is swapped along. */
2134 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2135 if (rc == 0) /* same file, done! */
2138 if (rc < 0) { /* sequentialize it */
2139 swap(llss->inode1, llss->inode2);
2141 swap(llss->dv1, llss->dv2);
2142 swap(llss->check_dv1, llss->check_dv2);
2146 if (gid != 0) { /* application asks to flush dirty cache */
2147 rc = ll_get_grouplock(llss->inode1, file1, gid);
2151 rc = ll_get_grouplock(llss->inode2, file2, gid);
2153 ll_put_grouplock(llss->inode1, file1, gid);
2158 /* to be able to restore mtime and atime after swap
2159 * we need to first save them */
2161 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2162 llss->ia1.ia_mtime = llss->inode1->i_mtime;
2163 llss->ia1.ia_atime = llss->inode1->i_atime;
2164 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2165 llss->ia2.ia_mtime = llss->inode2->i_mtime;
2166 llss->ia2.ia_atime = llss->inode2->i_atime;
2167 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2170 /* ultimate check, before swapping the layouts we check if
2171 * dataversion has changed (if requested) */
2172 if (llss->check_dv1) {
2173 rc = ll_data_version(llss->inode1, &dv, 0);
2176 if (dv != llss->dv1)
2177 GOTO(putgl, rc = -EAGAIN);
2180 if (llss->check_dv2) {
2181 rc = ll_data_version(llss->inode2, &dv, 0);
2184 if (dv != llss->dv2)
2185 GOTO(putgl, rc = -EAGAIN);
2188 /* struct md_op_data is used to send the swap args to the mdt
2189 * only flags is missing, so we use struct mdc_swap_layouts
2190 * through the md_op_data->op_data */
2191 /* flags from user space have to be converted before they are send to
2192 * server, no flag is sent today, they are only used on the client */
2195 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2196 0, LUSTRE_OPC_ANY, &msl);
/* NOTE(review): this failure path jumps to "free", whereas the failure
 * paths just above use "putgl".  When gid != 0 the group locks taken
 * earlier appear not to be released on this path; confirm against the
 * label layout, which is not visible in this excerpt. */
2197 if (IS_ERR(op_data))
2198 GOTO(free, rc = PTR_ERR(op_data));
2200 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2201 sizeof(*op_data), op_data, NULL);
2202 ll_finish_md_op_data(op_data);
2206 ll_put_grouplock(llss->inode2, file2, gid);
2207 ll_put_grouplock(llss->inode1, file1, gid);
2210 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2214 /* clear useless flags */
2215 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2216 llss->ia1.ia_valid &= ~ATTR_MTIME;
2217 llss->ia2.ia_valid &= ~ATTR_MTIME;
2220 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2221 llss->ia1.ia_valid &= ~ATTR_ATIME;
2222 llss->ia2.ia_valid &= ~ATTR_ATIME;
2225 /* update time if requested */
/* the times saved from inode2 (ia2) are applied through file1 and
 * those saved from inode1 (ia1) through file2 */
2227 if (llss->ia2.ia_valid != 0) {
2228 mutex_lock(&llss->inode1->i_mutex);
2229 rc = ll_setattr(file1->f_dentry, &llss->ia2);
2230 mutex_unlock(&llss->inode1->i_mutex);
2233 if (llss->ia1.ia_valid != 0) {
2236 mutex_lock(&llss->inode2->i_mutex);
2237 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2238 mutex_unlock(&llss->inode2->i_mutex);
/*
 * Apply an HSM state change to \a inode.
 *
 * Setting or clearing any flag outside HSM_USER_MASK requires
 * CFS_CAP_SYS_ADMIN.  The request \a hss is attached to an md_op_data
 * and sent to the metadata export as LL_IOC_HSM_STATE_SET; a failure to
 * prepare the op data is returned as PTR_ERR(op_data).
 */
2250 static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2252 struct md_op_data *op_data;
2255 /* Non-root users are forbidden to set or clear flags which are
2256 * NOT defined in HSM_USER_MASK. */
2257 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2258 !cfs_capable(CFS_CAP_SYS_ADMIN))
2261 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2262 LUSTRE_OPC_ANY, hss);
2263 if (IS_ERR(op_data))
2264 RETURN(PTR_ERR(op_data));
2266 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2267 sizeof(*op_data), op_data, NULL);
2269 ll_finish_md_op_data(op_data);
/*
 * Import a file described by \a hui (reached from LL_IOC_HSM_IMPORT).
 *
 * Two steps are visible: first the HSM state of \a inode is set to
 * HS_ARCHIVED | HS_EXISTS | HS_RELEASED with the archive id from
 * \a hui; then mode (permission bits only, forced to S_IFREG), owner,
 * group, size, mtime and atime from \a hui are applied with
 * ll_setattr_raw() under inode->i_mutex, using ATTR_FORCE.
 *
 * NOTE(review): the value returned for a non-regular inode and the
 * allocation/release of hss and attr are not fully visible in this
 * excerpt.
 */
2274 static int ll_hsm_import(struct inode *inode, struct file *file,
2275 struct hsm_user_import *hui)
2277 struct hsm_state_set *hss = NULL;
2278 struct iattr *attr = NULL;
2282 if (!S_ISREG(inode->i_mode))
2288 GOTO(out, rc = -ENOMEM);
2290 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2291 hss->hss_archive_id = hui->hui_archive_id;
2292 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2293 rc = ll_hsm_state_set(inode, hss);
2297 OBD_ALLOC_PTR(attr);
2299 GOTO(out, rc = -ENOMEM);
/* only the rwx bits for user, group and other are taken from the
 * request */
2301 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2302 attr->ia_mode |= S_IFREG;
2303 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2304 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2305 attr->ia_size = hui->hui_size;
2306 attr->ia_mtime.tv_sec = hui->hui_mtime;
2307 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2308 attr->ia_atime.tv_sec = hui->hui_atime;
2309 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2311 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2312 ATTR_UID | ATTR_GID |
2313 ATTR_MTIME | ATTR_MTIME_SET |
2314 ATTR_ATIME | ATTR_ATIME_SET;
2316 mutex_lock(&inode->i_mutex);
2318 rc = ll_setattr_raw(file->f_dentry, attr, true);
2322 mutex_unlock(&inode->i_mutex);
/*
 * Translate open-mode bits into lease type bits: FMODE_READ maps to
 * LL_LEASE_RDLCK, FMODE_WRITE to LL_LEASE_WRLCK, and the two are OR-ed
 * together.  A mode with neither bit yields 0.
 */
2334 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2336 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2337 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * ioctl entry point for regular files.
 *
 * Logs and tallies the call, singles out tty ioctls (type 'T' or 't')
 * and then dispatches on \a cmd:
 *  - LL_IOC_GETFLAGS / SETFLAGS / CLRFLAGS: read or update
 *    fd->fd_flags; setting LL_FILE_IGNORE_LOCK is refused with an error
 *    message on a file not opened O_DIRECT;
 *  - striping: LL_IOC_LOV_SETSTRIPE, LL_IOC_LOV_SETEA,
 *    LL_IOC_LOV_GETSTRIPE, LL_IOC_LOV_SWAP_LAYOUTS,
 *    LL_IOC_RECREATE_OBJ, LL_IOC_RECREATE_FID;
 *  - FSFILT_IOC_FIEMAP, FSFILT_IOC_GETFLAGS/SETFLAGS and
 *    FSFILT_IOC_GETVERSION{,_OLD} (returns inode->i_generation);
 *  - LL_IOC_GROUP_LOCK / LL_IOC_GROUP_UNLOCK;
 *  - IOC_OBD_STATFS, LL_IOC_FLUSHCTX, LL_IOC_PATH2FID,
 *    OBD_IOC_FID2PATH, LL_IOC_DATA_VERSION, LL_IOC_GET_MDTIDX,
 *    OBD_IOC_GETDTNAME / OBD_IOC_GETMDNAME;
 *  - HSM: LL_IOC_HSM_STATE_GET, LL_IOC_HSM_STATE_SET,
 *    LL_IOC_HSM_ACTION, LL_IOC_HSM_IMPORT;
 *  - leases: LL_IOC_SET_LEASE, LL_IOC_GET_LEASE;
 *  - anything else is offered to ll_iocontrol_call() and otherwise
 *    forwarded to the data export with obd_iocontrol().
 *
 * NOTE(review): many return statements, allocations and the switch
 * header itself are not visible in this excerpt, so the exact error
 * codes of most paths cannot be stated from here.
 */
2341 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2343 struct inode *inode = file->f_dentry->d_inode;
2344 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2348 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2349 PFID(ll_inode2fid(inode)), inode, cmd);
2350 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2352 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2353 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2357 case LL_IOC_GETFLAGS:
2358 /* Get the current value of the file flags */
2359 return put_user(fd->fd_flags, (int *)arg);
2360 case LL_IOC_SETFLAGS:
2361 case LL_IOC_CLRFLAGS:
2362 /* Set or clear specific file flags */
2363 /* XXX This probably needs checks to ensure the flags are
2364 * not abused, and to handle any flag side effects.
2366 if (get_user(flags, (int *) arg))
2369 if (cmd == LL_IOC_SETFLAGS) {
2370 if ((flags & LL_FILE_IGNORE_LOCK) &&
2371 !(file->f_flags & O_DIRECT)) {
2372 CERROR("%s: unable to disable locking on "
2373 "non-O_DIRECT file\n", current->comm);
2377 fd->fd_flags |= flags;
2379 fd->fd_flags &= ~flags;
2382 case LL_IOC_LOV_SETSTRIPE:
2383 RETURN(ll_lov_setstripe(inode, file, arg));
2384 case LL_IOC_LOV_SETEA:
2385 RETURN(ll_lov_setea(inode, file, arg));
2386 case LL_IOC_LOV_SWAP_LAYOUTS: {
2388 struct lustre_swap_layouts lsl;
2390 if (copy_from_user(&lsl, (char *)arg,
2391 sizeof(struct lustre_swap_layouts)))
2394 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2397 file2 = fget(lsl.sl_fd);
2402 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2403 rc = ll_swap_layouts(file, file2, &lsl);
2407 case LL_IOC_LOV_GETSTRIPE:
2408 RETURN(ll_lov_getstripe(inode, arg));
2409 case LL_IOC_RECREATE_OBJ:
2410 RETURN(ll_lov_recreate_obj(inode, arg));
2411 case LL_IOC_RECREATE_FID:
2412 RETURN(ll_lov_recreate_fid(inode, arg));
2413 case FSFILT_IOC_FIEMAP:
2414 RETURN(ll_ioctl_fiemap(inode, arg));
2415 case FSFILT_IOC_GETFLAGS:
2416 case FSFILT_IOC_SETFLAGS:
2417 RETURN(ll_iocontrol(inode, file, cmd, arg));
2418 case FSFILT_IOC_GETVERSION_OLD:
2419 case FSFILT_IOC_GETVERSION:
2420 RETURN(put_user(inode->i_generation, (int *)arg));
2421 case LL_IOC_GROUP_LOCK:
2422 RETURN(ll_get_grouplock(inode, file, arg));
2423 case LL_IOC_GROUP_UNLOCK:
2424 RETURN(ll_put_grouplock(inode, file, arg));
2425 case IOC_OBD_STATFS:
2426 RETURN(ll_obd_statfs(inode, (void *)arg));
2428 /* We need to special case any other ioctls we want to handle,
2429 * to send them to the MDS/OST as appropriate and to properly
2430 * network encode the arg field.
2431 case FSFILT_IOC_SETVERSION_OLD:
2432 case FSFILT_IOC_SETVERSION:
2434 case LL_IOC_FLUSHCTX:
2435 RETURN(ll_flush_ctx(inode));
2436 case LL_IOC_PATH2FID: {
2437 if (copy_to_user((void *)arg, ll_inode2fid(inode),
2438 sizeof(struct lu_fid)))
2443 case OBD_IOC_FID2PATH:
2444 RETURN(ll_fid2path(inode, (void *)arg));
2445 case LL_IOC_DATA_VERSION: {
2446 struct ioc_data_version idv;
2449 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
2452 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2453 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2455 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2461 case LL_IOC_GET_MDTIDX: {
2464 mdtidx = ll_get_mdt_idx(inode);
2468 if (put_user((int)mdtidx, (int*)arg))
2473 case OBD_IOC_GETDTNAME:
2474 case OBD_IOC_GETMDNAME:
2475 RETURN(ll_get_obd_name(inode, cmd, arg));
2476 case LL_IOC_HSM_STATE_GET: {
2477 struct md_op_data *op_data;
2478 struct hsm_user_state *hus;
2485 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2486 LUSTRE_OPC_ANY, hus);
2487 if (IS_ERR(op_data)) {
2489 RETURN(PTR_ERR(op_data));
2492 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2495 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2498 ll_finish_md_op_data(op_data);
2502 case LL_IOC_HSM_STATE_SET: {
2503 struct hsm_state_set *hss;
2510 if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2515 rc = ll_hsm_state_set(inode, hss);
2520 case LL_IOC_HSM_ACTION: {
2521 struct md_op_data *op_data;
2522 struct hsm_current_action *hca;
2529 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2530 LUSTRE_OPC_ANY, hca);
2531 if (IS_ERR(op_data)) {
2533 RETURN(PTR_ERR(op_data));
2536 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2539 if (copy_to_user((char *)arg, hca, sizeof(*hca)))
2542 ll_finish_md_op_data(op_data);
2546 case LL_IOC_SET_LEASE: {
2547 struct ll_inode_info *lli = ll_i2info(inode);
2548 struct obd_client_handle *och = NULL;
2553 case LL_LEASE_WRLCK:
2554 if (!(file->f_mode & FMODE_WRITE))
2556 fmode = FMODE_WRITE;
2558 case LL_LEASE_RDLCK:
2559 if (!(file->f_mode & FMODE_READ))
2563 case LL_LEASE_UNLCK:
2564 mutex_lock(&lli->lli_och_mutex);
2565 if (fd->fd_lease_och != NULL) {
2566 och = fd->fd_lease_och;
2567 fd->fd_lease_och = NULL;
2569 mutex_unlock(&lli->lli_och_mutex);
2574 fmode = och->och_flags;
2575 rc = ll_lease_close(och, inode, &lease_broken);
2582 RETURN(ll_lease_type_from_fmode(fmode));
2587 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2589 /* apply for lease */
2590 och = ll_lease_open(inode, file, fmode, 0);
2592 RETURN(PTR_ERR(och));
/* install the new lease on the descriptor under lli_och_mutex, but
 * only when the descriptor has none yet */
2595 mutex_lock(&lli->lli_och_mutex);
2596 if (fd->fd_lease_och == NULL) {
2597 fd->fd_lease_och = och;
2600 mutex_unlock(&lli->lli_och_mutex);
2602 /* impossible now that only excl is supported for now */
2603 ll_lease_close(och, inode, &lease_broken);
2608 case LL_IOC_GET_LEASE: {
2609 struct ll_inode_info *lli = ll_i2info(inode);
2610 struct ldlm_lock *lock = NULL;
/* the lease mode is taken from och_flags only while the lease lock is
 * not flagged as being cancelled */
2613 mutex_lock(&lli->lli_och_mutex);
2614 if (fd->fd_lease_och != NULL) {
2615 struct obd_client_handle *och = fd->fd_lease_och;
2617 lock = ldlm_handle2lock(&och->och_lease_handle);
2619 lock_res_and_lock(lock);
2620 if (!ldlm_is_cancel(lock))
2621 fmode = och->och_flags;
2623 unlock_res_and_lock(lock);
2624 LDLM_LOCK_PUT(lock);
2627 mutex_unlock(&lli->lli_och_mutex);
2629 RETURN(ll_lease_type_from_fmode(fmode));
2631 case LL_IOC_HSM_IMPORT: {
2632 struct hsm_user_import *hui;
2638 if (copy_from_user(hui, (void *)arg, sizeof(*hui))) {
2643 rc = ll_hsm_import(inode, file, hui);
2653 ll_iocontrol_call(inode, file, cmd, arg, &err))
2656 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2662 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Validate \a offset and, when it differs from the current position,
 * store it in file->f_pos and reset file->f_version to 0.
 *
 * Two checks precede the store: a negative offset on a file without
 * FMODE_UNSIGNED_OFFSET, and an offset above \a maxsize.
 *
 * NOTE(review): the values returned by the two checks and by the
 * function itself are not visible in this excerpt.
 */
2663 static inline loff_t
2664 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2666 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2668 if (offset > maxsize)
2671 if (offset != file->f_pos) {
2672 file->f_pos = offset;
2673 file->f_version = 0;
/*
 * generic_file_llseek_size() - local llseek implementation that takes the
 * size limit (maxsize) and the end-of-file position (eof) from the caller.
 *
 * The relative-seek path (presumably SEEK_CUR, per the comments below)
 * computes file->f_pos + offset and applies it through llseek_execute()
 * while holding inode->i_mutex; the final statement applies the computed
 * offset with an unlocked llseek_execute() call.
 *
 * NOTE(review): the comment below speaks of f_lock, but the lock taken in
 * the visible code is inode->i_mutex - confirm which one is intended.
 */
2679 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2680 loff_t maxsize, loff_t eof)
2682 struct inode *inode = file->f_dentry->d_inode;
2690 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2691 * position-querying operation. Avoid rewriting the "same"
2692 * f_pos value back to the file because a concurrent read(),
2693 * write() or lseek() might have altered it
2698 * f_lock protects against read/modify/write race with other
2699 * SEEK_CURs. Note that parallel writes and reads behave
2702 mutex_lock(&inode->i_mutex);
2703 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2704 mutex_unlock(&inode->i_mutex);
2708 * In the generic case the entire file is data, so as long as
2709 * offset isn't at the end of the file then the offset is data.
2716 * There is a virtual hole at the end of the file, so as long as
2717 * offset isn't i_size or larger, return i_size.
2725 return llseek_execute(file, offset, maxsize);
/*
 * ll_file_seek() - llseek handler for Lustre files.
 *
 * Computes a tentative target position that is used in the trace message,
 * bumps the LPROC_LL_LLSEEK counter, and for SEEK_END, SEEK_HOLE and
 * SEEK_DATA calls ll_glimpse_size() before sampling i_size into eof. The
 * seek itself is delegated to ll_generic_file_llseek_size() with
 * ll_file_maxbytes(inode) as the upper bound.
 */
2729 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2731 struct inode *inode = file->f_dentry->d_inode;
2732 loff_t retval, eof = 0;
2735 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2736 (origin == SEEK_CUR) ? file->f_pos : 0);
2737 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2738 PFID(ll_inode2fid(inode)), inode, retval, retval,
2740 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2742 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2743 retval = ll_glimpse_size(inode);
2746 eof = i_size_read(inode);
2749 retval = ll_generic_file_llseek_size(file, offset, origin,
2750 ll_file_maxbytes(inode), eof);
/*
 * ll_flush() - flush handler; must not be called on directories (asserted).
 *
 * Collects and clears the asynchronous writeback error kept in
 * lli->lli_async_rc and, when a cl_object is attached, also reads the one
 * returned by lov_read_and_clear_async_rc(). A non-zero rc at the end is
 * reported to the caller as -EIO, otherwise 0 is returned.
 */
2754 static int ll_flush(struct file *file, fl_owner_t id)
2756 struct inode *inode = file->f_dentry->d_inode;
2757 struct ll_inode_info *lli = ll_i2info(inode);
2758 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2761 LASSERT(!S_ISDIR(inode->i_mode));
2763 /* catch async errors that were recorded back when async writeback
2764 * failed for pages in this mapping. */
2765 rc = lli->lli_async_rc;
2766 lli->lli_async_rc = 0;
2767 if (lli->lli_clob != NULL) {
2768 err = lov_read_and_clear_async_rc(lli->lli_clob);
2773 /* The application has been told write failure already.
2774 * Do not report failure again. */
2775 if (fd->fd_write_failed)
2777 return rc ? -EIO : 0;
/*
 * cl_sync_file_range() arguments as used below:
 *   inode          file whose cl_object (lli_clob) is the target of the io
 *   start, end     range for the fsync io (fi_start is set from start)
 *   mode           checked against CL_FSYNC_NONE, CL_FSYNC_LOCAL,
 *                  CL_FSYNC_DISCARD and CL_FSYNC_ALL
 *   ignore_layout  copied into io->ci_ignore_layout
 * Runs a CIT_FSYNC io in a nested cl environment. fi_nr_written starts at 0
 * and is the source of the page count mentioned in the description below.
 */
2781 * Called to make sure a portion of file has been written out.
2782 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2784 * Return how many pages have been written.
2786 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2787 enum cl_fsync_mode mode, int ignore_layout)
2789 struct cl_env_nest nest;
2792 struct obd_capa *capa = NULL;
2793 struct cl_fsync_io *fio;
2797 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2798 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2801 env = cl_env_nested_get(&nest);
2803 RETURN(PTR_ERR(env));
2805 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2807 io = ccc_env_thread_io(env);
2808 io->ci_obj = cl_i2info(inode)->lli_clob;
2809 io->ci_ignore_layout = ignore_layout;
2811 /* initialize parameters for sync */
2812 fio = &io->u.ci_fsync;
2813 fio->fi_capa = capa;
2814 fio->fi_start = start;
2816 fio->fi_fid = ll_inode2fid(inode);
2817 fio->fi_mode = mode;
2818 fio->fi_nr_written = 0;
2820 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2821 result = cl_io_loop(env, io);
2823 result = io->ci_result;
2825 result = fio->fi_nr_written;
2826 cl_io_fini(env, io);
2827 cl_env_nested_put(&nest, env);
/*
 * ll_fsync() - fsync handler; the prototype is selected by
 * HAVE_FILE_FSYNC_4ARGS / HAVE_FILE_FSYNC_2ARGS.
 *
 * Sequence visible below: wait for page-cache writeback, pick up and clear
 * recorded async writeback errors for non-directories, issue md_fsync() to
 * the metadata export, and for regular files sync the data range with
 * cl_sync_file_range(CL_FSYNC_ALL), recording the outcome in
 * fd->fd_write_failed. The 4-argument variant takes inode->i_mutex right
 * after filemap_write_and_wait_range() and drops it at the end.
 */
2835 * When dentry is provided (the 'else' case), *file->f_dentry may be
2836 * null and dentry must be used directly rather than pulled from
2837 * *file->f_dentry as is done otherwise.
2840 #ifdef HAVE_FILE_FSYNC_4ARGS
2841 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2843 struct dentry *dentry = file->f_dentry;
2844 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2845 int ll_fsync(struct file *file, int datasync)
2847 struct dentry *dentry = file->f_dentry;
2849 loff_t end = LLONG_MAX;
2851 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2854 loff_t end = LLONG_MAX;
2856 struct inode *inode = dentry->d_inode;
2857 struct ll_inode_info *lli = ll_i2info(inode);
2858 struct ptlrpc_request *req;
2859 struct obd_capa *oc;
2863 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2864 PFID(ll_inode2fid(inode)), inode);
2865 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2867 #ifdef HAVE_FILE_FSYNC_4ARGS
2868 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2869 mutex_lock(&inode->i_mutex);
2871 /* fsync's caller has already called _fdata{sync,write}, we want
2872 * that IO to finish before calling the osc and mdc sync methods */
2873 rc = filemap_fdatawait(inode->i_mapping);
2876 /* catch async errors that were recorded back when async writeback
2877 * failed for pages in this mapping. */
2878 if (!S_ISDIR(inode->i_mode)) {
2879 err = lli->lli_async_rc;
2880 lli->lli_async_rc = 0;
2883 err = lov_read_and_clear_async_rc(lli->lli_clob);
2888 oc = ll_mdscapa_get(inode);
2889 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2895 ptlrpc_req_finished(req);
2897 if (S_ISREG(inode->i_mode)) {
2898 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2900 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2901 if (rc == 0 && err < 0)
2904 fd->fd_write_failed = true;
2906 fd->fd_write_failed = false;
2909 #ifdef HAVE_FILE_FSYNC_4ARGS
2910 mutex_unlock(&inode->i_mutex);
/*
 * ll_file_flock() - handler for both flock() and POSIX (fcntl) locks, backed
 * by an LDLM_FLOCK enqueue on the metadata export.
 *
 * Builds the lock policy from file_lock (whole file with fl_file as owner
 * for FL_FLOCK; fl_owner plus fl_start/fl_end for FL_POSIX), selects an LDLM
 * mode (LCK_PR, LCK_PW, or LCK_NL which stands for unlock), enqueues it with
 * md_enqueue(), then mirrors the result into the local VFS lock state via
 * flock_lock_file_wait() / posix_lock_file_wait(). If that local step fails
 * for a request that is not an unlock, a second enqueue with LCK_NL is sent,
 * presumably to drop the lock that was just granted.
 *
 * file_lock->fl_type is temporarily overwritten with the LDLM mode for the
 * enqueue and restored afterwards unless LDLM_FL_TEST_LOCK is set.
 */
2916 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2918 struct inode *inode = file->f_dentry->d_inode;
2919 struct ll_sb_info *sbi = ll_i2sbi(inode);
2920 struct ldlm_enqueue_info einfo = {
2921 .ei_type = LDLM_FLOCK,
2922 .ei_cb_cp = ldlm_flock_completion_ast,
2923 .ei_cbdata = file_lock,
2925 struct md_op_data *op_data;
2926 struct lustre_handle lockh = {0};
2927 ldlm_policy_data_t flock = {{0}};
2928 int fl_type = file_lock->fl_type;
2934 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
2935 PFID(ll_inode2fid(inode)), file_lock);
2937 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2939 if (file_lock->fl_flags & FL_FLOCK) {
2940 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2941 /* flocks are whole-file locks */
2942 flock.l_flock.end = OFFSET_MAX;
2943 /* For flocks owner is determined by the local file descriptor */
2944 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2945 } else if (file_lock->fl_flags & FL_POSIX) {
2946 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2947 flock.l_flock.start = file_lock->fl_start;
2948 flock.l_flock.end = file_lock->fl_end;
2952 flock.l_flock.pid = file_lock->fl_pid;
2954 /* Somewhat ugly workaround for svc lockd.
2955 * lockd installs custom fl_lmops->lm_compare_owner that checks
2956 * for the fl_owner to be the same (which it always is on local node
2957 * I guess between lockd processes) and then compares pid.
2958 * As such we assign pid to the owner field to make it all work,
2959 * conflict with normal locks is unlikely since pid space and
2960 * pointer space for current->files are not intersecting */
2961 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2962 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
2966 einfo.ei_mode = LCK_PR;
2969 /* An unlock request may or may not have any relation to
2970 * existing locks so we may not be able to pass a lock handle
2971 * via a normal ldlm_lock_cancel() request. The request may even
2972 * unlock a byte range in the middle of an existing lock. In
2973 * order to process an unlock request we need all of the same
2974 * information that is given with a normal read or write record
2975 * lock request. To avoid creating another ldlm unlock (cancel)
2976 * message we'll treat a LCK_NL flock request as an unlock. */
2977 einfo.ei_mode = LCK_NL;
2980 einfo.ei_mode = LCK_PW;
2983 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
2998 flags = LDLM_FL_BLOCK_NOWAIT;
3004 flags = LDLM_FL_TEST_LOCK;
3007 CERROR("unknown fcntl lock command: %d\n", cmd);
3011 /* Save the old mode so that if the mode in the lock changes we
3012 * can decrement the appropriate reader or writer refcount. */
3013 file_lock->fl_type = einfo.ei_mode;
3015 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3016 LUSTRE_OPC_ANY, NULL);
3017 if (IS_ERR(op_data))
3018 RETURN(PTR_ERR(op_data));
3020 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
3021 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
3022 flock.l_flock.pid, flags, einfo.ei_mode,
3023 flock.l_flock.start, flock.l_flock.end);
3025 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data, &lockh,
3028 /* Restore the file lock type if not TEST lock. */
3029 if (!(flags & LDLM_FL_TEST_LOCK))
3030 file_lock->fl_type = fl_type;
3032 if ((file_lock->fl_flags & FL_FLOCK) &&
3033 (rc == 0 || file_lock->fl_type == F_UNLCK))
3034 rc2 = flock_lock_file_wait(file, file_lock);
3035 if ((file_lock->fl_flags & FL_POSIX) &&
3036 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3037 !(flags & LDLM_FL_TEST_LOCK))
3038 rc2 = posix_lock_file_wait(file, file_lock);
3040 if (rc2 && file_lock->fl_type != F_UNLCK) {
3041 einfo.ei_mode = LCK_NL;
3042 md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data,
3047 ll_finish_md_op_data(op_data);
/*
 * ll_get_fid_by_name() - look up the FID of the entry name (namelen bytes)
 * in directory parent.
 *
 * Sends md_getattr_name() with op_valid = OBD_MD_FLID and copies mbo_fid1
 * from the MDT body of the reply into *fid. The -EFAULT exit follows the
 * reply body lookup (presumably taken when the body is missing). The request
 * is released with ptlrpc_req_finished().
 */
3052 int ll_get_fid_by_name(struct inode *parent, const char *name,
3053 int namelen, struct lu_fid *fid)
3055 struct md_op_data *op_data = NULL;
3056 struct mdt_body *body;
3057 struct ptlrpc_request *req;
3061 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3062 LUSTRE_OPC_ANY, NULL);
3063 if (IS_ERR(op_data))
3064 RETURN(PTR_ERR(op_data));
3066 op_data->op_valid = OBD_MD_FLID;
3067 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3068 ll_finish_md_op_data(op_data);
3072 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3074 GOTO(out_req, rc = -EFAULT);
3076 *fid = body->mbo_fid1;
3078 ptlrpc_req_finished(req);
/*
 * ll_migrate() - migrate the entry name under directory parent to the MDT
 * with index mdtidx.
 *
 * The child FID is taken from a cached dentry when d_lookup() finds one with
 * an inode (that inode is grabbed and its aliases invalidated); the
 * ll_get_fid_by_name() call presumably covers the case where no such dentry
 * exists. An insane FID ends the call with -EINVAL, and a child already on
 * the target MDT ends it with 0. The migration itself is issued as an
 * md_rename() of name onto itself with op_cli_flags = CLI_MIGRATE and
 * op_mds = mdtidx. A grabbed child inode has its link count cleared near the
 * end of the function.
 *
 * NOTE(review): the inner dchild->d_inode != NULL test repeats the condition
 * of the enclosing if and looks redundant.
 */
3082 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3083 const char *name, int namelen)
3085 struct dentry *dchild = NULL;
3086 struct inode *child_inode = NULL;
3087 struct md_op_data *op_data;
3088 struct ptlrpc_request *request = NULL;
3093 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3094 name, PFID(ll_inode2fid(parent)), mdtidx);
3096 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3097 0, LUSTRE_OPC_ANY, NULL);
3098 if (IS_ERR(op_data))
3099 RETURN(PTR_ERR(op_data));
3101 /* Get child FID first */
3102 qstr.hash = full_name_hash(name, namelen);
3105 dchild = d_lookup(file->f_dentry, &qstr);
3106 if (dchild != NULL && dchild->d_inode != NULL) {
3107 op_data->op_fid3 = *ll_inode2fid(dchild->d_inode);
3108 if (dchild->d_inode != NULL) {
3109 child_inode = igrab(dchild->d_inode);
3110 ll_invalidate_aliases(child_inode);
3114 rc = ll_get_fid_by_name(parent, name, namelen,
3120 if (!fid_is_sane(&op_data->op_fid3)) {
3121 CERROR("%s: migrate %s , but fid "DFID" is insane\n",
3122 ll_get_fsname(parent->i_sb, NULL, 0), name,
3123 PFID(&op_data->op_fid3));
3124 GOTO(out_free, rc = -EINVAL);
3127 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3132 CDEBUG(D_INFO, "%s:"DFID" is already on MDT%d.\n", name,
3133 PFID(&op_data->op_fid3), mdtidx);
3134 GOTO(out_free, rc = 0);
3137 op_data->op_mds = mdtidx;
3138 op_data->op_cli_flags = CLI_MIGRATE;
3139 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3140 namelen, name, namelen, &request);
3142 ll_update_times(request, parent);
3144 ptlrpc_req_finished(request);
3149 if (child_inode != NULL) {
3150 clear_nlink(child_inode);
3154 ll_finish_md_op_data(op_data);
/*
 * ll_file_noflock() - lock handler wired to both .flock and .lock in
 * ll_file_operations_noflock, the table used for -o noflock mounts.
 */
3159 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
/*
 * ll_have_md_lock() - implementation notes.
 *
 * LCK_MINMODE as l_req_mode is widened to LCK_CR|LCK_CW|LCK_PR|LCK_PW.
 * Each set bit up to MDS_INODELOCK_MAXSHIFT is matched separately with
 * md_lock_match() using LDLM_FL_BLOCK_GRANTED, LDLM_FL_CBPENDING and
 * LDLM_FL_TEST_LOCK; the loop stops early once *bits becomes 0.
 *
 * NOTE(review): 1 << i is evaluated as int while *bits is __u64; this is
 * only safe while MDS_INODELOCK_MAXSHIFT stays below 31 - confirm.
 */
3167 * test if some locks matching bits and l_req_mode are acquired
3168 * - bits can be in different locks
3169 * - if found clear the common lock bits in *bits
3170 * - the bits not found, are kept in *bits
3172 * \param bits [IN] searched lock bits [IN]
3173 * \param l_req_mode [IN] searched lock mode
3174 * \retval boolean, true iff all bits are found
3176 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
3178 struct lustre_handle lockh;
3179 ldlm_policy_data_t policy;
3180 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
3181 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
3190 fid = &ll_i2info(inode)->lli_fid;
3191 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3192 ldlm_lockname[mode]);
3194 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
3195 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3196 policy.l_inodebits.bits = *bits & (1 << i);
3197 if (policy.l_inodebits.bits == 0)
3200 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3201 &policy, mode, &lockh)) {
3202 struct ldlm_lock *lock;
3204 lock = ldlm_handle2lock(&lockh);
3207 ~(lock->l_policy_data.l_inodebits.bits);
3208 LDLM_LOCK_PUT(lock);
3210 *bits &= ~policy.l_inodebits.bits;
/*
 * ll_take_md_lock() - try to match an inodebits lock covering bits on the
 * resource of this inode. LDLM_FL_BLOCK_GRANTED is always added to the
 * caller supplied flags, and lockh is passed through to md_lock_match().
 */
3217 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
3218 struct lustre_handle *lockh, __u64 flags,
3221 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
3226 fid = &ll_i2info(inode)->lli_fid;
3227 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3229 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3230 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * ll_inode_revalidate_fini() - post-process the result code of a
 * revalidation request.
 *
 * -ENOENT is treated as an already unlinked inode. Any other non-zero rc is
 * logged, at D_INFO level for -EACCES and -EIDRM and at D_ERROR otherwise.
 */
3235 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3237 /* Already unlinked. Just update nlink and return success */
3238 if (rc == -ENOENT) {
3240 /* This path cannot be hit for regular files unless in
3241 * case of obscure races, so no need to to validate
3243 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3245 } else if (rc != 0) {
3246 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3247 "%s: revalidate FID "DFID" error: rc = %d\n",
3248 ll_get_fsname(inode->i_sb, NULL, 0),
3249 PFID(ll_inode2fid(inode)), rc);
/*
 * __ll_inode_revalidate() - refresh the attributes of dentry->d_inode from
 * the metadata server for the lock bits in ibits.
 *
 * With OBD_CONNECT_ATTRFID the refresh is done by an intent lock request
 * addressed by FID (IT_LOOKUP when ibits is exactly MDS_INODELOCK_LOOKUP,
 * IT_GETATTR otherwise), followed by ll_revalidate_it_finish(); a dentry
 * whose inode has no links left is invalidated. Without that connect flag,
 * md_getattr() is sent only when ll_have_md_lock() reports that the bits
 * are not already covered, and the reply is applied with ll_prep_inode().
 * RPC failures are passed through ll_inode_revalidate_fini().
 */
3255 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3257 struct inode *inode = dentry->d_inode;
3258 struct ptlrpc_request *req = NULL;
3259 struct obd_export *exp;
3263 LASSERT(inode != NULL);
3265 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3266 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3268 exp = ll_i2mdexp(inode);
3270 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3271 * But under CMD case, it caused some lock issues, should be fixed
3272 * with new CMD ibits lock. See bug 12718 */
3273 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3274 struct lookup_intent oit = { .it_op = IT_GETATTR };
3275 struct md_op_data *op_data;
3277 if (ibits == MDS_INODELOCK_LOOKUP)
3278 oit.it_op = IT_LOOKUP;
3280 /* Call getattr by fid, so do not provide name at all. */
3281 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3282 dentry->d_inode, NULL, 0, 0,
3283 LUSTRE_OPC_ANY, NULL);
3284 if (IS_ERR(op_data))
3285 RETURN(PTR_ERR(op_data));
3287 rc = md_intent_lock(exp, op_data, &oit, &req,
3288 &ll_md_blocking_ast, 0);
3289 ll_finish_md_op_data(op_data);
3291 rc = ll_inode_revalidate_fini(inode, rc);
3295 rc = ll_revalidate_it_finish(req, &oit, dentry);
3297 ll_intent_release(&oit);
3301 /* Unlinked? Unhash dentry, so it is not picked up later by
3302 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3303 here to preserve get_cwd functionality on 2.6.
3305 if (!dentry->d_inode->i_nlink)
3306 d_lustre_invalidate(dentry, 0);
3308 ll_lookup_finish_locks(&oit, dentry);
3309 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3310 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3311 obd_valid valid = OBD_MD_FLGETATTR;
3312 struct md_op_data *op_data;
3315 if (S_ISREG(inode->i_mode)) {
3316 rc = ll_get_default_mdsize(sbi, &ealen);
3319 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3322 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3323 0, ealen, LUSTRE_OPC_ANY,
3325 if (IS_ERR(op_data))
3326 RETURN(PTR_ERR(op_data));
3328 op_data->op_valid = valid;
3329 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
3330 * capa for this inode. Because we only keep capas of dirs
3332 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3333 ll_finish_md_op_data(op_data);
3335 rc = ll_inode_revalidate_fini(inode, rc);
3339 rc = ll_prep_inode(&inode, req, NULL, NULL);
3342 ptlrpc_req_finished(req);
/*
 * ll_merge_md_attr() - merge the attributes of a striped directory.
 *
 * Requires lli_lsm_md to be set (asserted). The size and nlink obtained from
 * md_merge_attr() are stored in lli_stripe_dir_size / lli_stripe_dir_nlink
 * and the three timestamps in lli_lvb.
 */
3346 static int ll_merge_md_attr(struct inode *inode)
3348 struct cl_attr attr = { 0 };
3351 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3352 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3357 ll_i2info(inode)->lli_stripe_dir_size = attr.cat_size;
3358 ll_i2info(inode)->lli_stripe_dir_nlink = attr.cat_nlink;
3360 ll_i2info(inode)->lli_lvb.lvb_atime = attr.cat_atime;
3361 ll_i2info(inode)->lli_lvb.lvb_mtime = attr.cat_mtime;
3362 ll_i2info(inode)->lli_lvb.lvb_ctime = attr.cat_ctime;
/*
 * ll_inode_revalidate() - revalidate the attributes of dentry->d_inode and,
 * for regular files, its size.
 *
 * Runs __ll_inode_revalidate() first. For objects that are not regular
 * files, striped directories (lli_lsm_md set) merge their stripe attributes
 * and the inode timestamps are loaded from lli_lvb. The size glimpse via
 * ll_glimpse_size() is skipped while LLIF_FILE_RESTORING is set.
 */
3368 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3370 struct inode *inode = dentry->d_inode;
3374 rc = __ll_inode_revalidate(dentry, ibits);
3378 /* if object isn't regular file, don't validate size */
3379 if (!S_ISREG(inode->i_mode)) {
3380 if (S_ISDIR(inode->i_mode) &&
3381 ll_i2info(inode)->lli_lsm_md != NULL) {
3382 rc = ll_merge_md_attr(inode);
3387 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
3388 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
3389 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
3391 /* In case of restore, the MDT has the right size and has
3392 * already send it back without granting the layout lock,
3393 * inode is up-to-date so glimpse is useless.
3394 * Also to glimpse we need the layout, in case of a running
3395 * restore the MDT holds the layout lock so the glimpse will
3396 * block up to the end of restore (getattr will block)
3398 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3399 rc = ll_glimpse_size(inode);
/*
 * ll_getattr() - getattr inode operation.
 *
 * Revalidates the UPDATE and LOOKUP lock bits, then fills *stat from the
 * inode. The inode number is rebuilt with cl_fid_build_ino() when
 * ll_need_32bit_api() is true. Striped directories (lli_lsm_md set) report
 * nlink and size from lli_stripe_dir_nlink / lli_stripe_dir_size instead of
 * the inode fields.
 */
3404 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3406 struct inode *inode = de->d_inode;
3407 struct ll_sb_info *sbi = ll_i2sbi(inode);
3408 struct ll_inode_info *lli = ll_i2info(inode);
3411 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3412 MDS_INODELOCK_LOOKUP);
3413 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3418 stat->dev = inode->i_sb->s_dev;
3419 if (ll_need_32bit_api(sbi))
3420 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3422 stat->ino = inode->i_ino;
3423 stat->mode = inode->i_mode;
3424 stat->uid = inode->i_uid;
3425 stat->gid = inode->i_gid;
3426 stat->rdev = inode->i_rdev;
3427 stat->atime = inode->i_atime;
3428 stat->mtime = inode->i_mtime;
3429 stat->ctime = inode->i_ctime;
3430 stat->blksize = 1 << inode->i_blkbits;
3431 stat->blocks = inode->i_blocks;
3433 if (S_ISDIR(inode->i_mode) &&
3434 ll_i2info(inode)->lli_lsm_md != NULL) {
3435 stat->nlink = lli->lli_stripe_dir_nlink;
3436 stat->size = lli->lli_stripe_dir_size;
3438 stat->nlink = inode->i_nlink;
3439 stat->size = i_size_read(inode);
/*
 * ll_fiemap() - fiemap inode operation.
 *
 * Allocates an ll_user_fiemap sized for fi_extents_max extents, seeds it
 * from fieinfo (flags, extent count, start, length, and the first extent
 * when any are requested), runs ll_do_fiemap(), and copies the flags, the
 * mapped extent count and the mapped extents back to fieinfo before freeing
 * the buffer.
 *
 * NOTE(review): fi_extents_start is accessed with plain memcpy() in both
 * directions; if this kernel declares it as a __user pointer these copies
 * should be copy_from_user() / copy_to_user() - confirm.
 */
3445 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3446 __u64 start, __u64 len)
3450 struct ll_user_fiemap *fiemap;
3451 unsigned int extent_count = fieinfo->fi_extents_max;
3453 num_bytes = sizeof(*fiemap) + (extent_count *
3454 sizeof(struct ll_fiemap_extent));
3455 OBD_ALLOC_LARGE(fiemap, num_bytes);
3460 fiemap->fm_flags = fieinfo->fi_flags;
3461 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3462 fiemap->fm_start = start;
3463 fiemap->fm_length = len;
3464 if (extent_count > 0)
3465 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3466 sizeof(struct ll_fiemap_extent));
3468 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3470 fieinfo->fi_flags = fiemap->fm_flags;
3471 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3472 if (extent_count > 0)
3473 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3474 fiemap->fm_mapped_extents *
3475 sizeof(struct ll_fiemap_extent));
3477 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * ll_get_acl() - return a duplicated reference to the cached ACL in
 * lli->lli_posix_acl; the duplication is done under lli->lli_lock. The type
 * argument is not used in the lines shown here.
 */
3481 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3483 struct ll_inode_info *lli = ll_i2info(inode);
3484 struct posix_acl *acl = NULL;
3487 spin_lock(&lli->lli_lock);
3488 /* VFS' acl_permission_check->check_acl will release the refcount */
3489 acl = posix_acl_dup(lli->lli_posix_acl);
3490 spin_unlock(&lli->lli_lock);
/*
 * ll_check_acl() - ACL check callback, built only when
 * HAVE_GENERIC_PERMISSION_2ARGS is not defined; ll_inode_permission() hands
 * it to ll_generic_permission(). With CONFIG_FS_POSIX_ACL it evaluates mask
 * against the ACL_TYPE_ACCESS ACL obtained from ll_get_acl() and releases
 * that reference afterwards. The 4-argument variant additionally tests
 * IPERM_FLAG_RCU before touching the ACL.
 */
3495 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3497 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3498 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3500 ll_check_acl(struct inode *inode, int mask)
3503 # ifdef CONFIG_FS_POSIX_ACL
3504 struct posix_acl *acl;
3508 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3509 if (flags & IPERM_FLAG_RCU)
3512 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3517 rc = posix_acl_permission(inode, acl, mask);
3518 posix_acl_release(acl);
3521 # else /* !CONFIG_FS_POSIX_ACL */
3523 # endif /* CONFIG_FS_POSIX_ACL */
3525 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * ll_inode_permission() - permission inode operation; the prototype is
 * selected by HAVE_GENERIC_PERMISSION_4ARGS / HAVE_INODE_PERMISION_2ARGS.
 *
 * The root inode is revalidated first because lookup does not do it. When
 * root squashing applies (rsi_uid != 0, the caller fsuid is root, and
 * LL_SBI_NOROOTSQUASH is not set) the check runs under temporary credentials
 * with fsuid/fsgid replaced by the squash ids and with the capabilities in
 * CFS_CAP_FS_MASK lowered; revert_creds() restores the saved credentials
 * afterwards. Remote clients (LL_SBI_RMT_CLIENT) are checked with
 * lustre_check_remote_perm(), the ll_generic_permission() call handles the
 * other case.
 *
 * NOTE(review): (1 << cap) is an int shift while cap ranges over every bit
 * of cfs_cap_t - confirm that cap cannot reach bit 31 or beyond.
 */
3527 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3528 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3530 # ifdef HAVE_INODE_PERMISION_2ARGS
3531 int ll_inode_permission(struct inode *inode, int mask)
3533 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3538 struct ll_sb_info *sbi;
3539 struct root_squash_info *squash;
3540 struct cred *cred = NULL;
3541 const struct cred *old_cred = NULL;
3543 bool squash_id = false;
3546 #ifdef MAY_NOT_BLOCK
3547 if (mask & MAY_NOT_BLOCK)
3549 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3550 if (flags & IPERM_FLAG_RCU)
3554 /* as root inode are NOT getting validated in lookup operation,
3555 * need to do it before permission check. */
3557 if (inode == inode->i_sb->s_root->d_inode) {
3558 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3559 MDS_INODELOCK_LOOKUP);
3564 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3565 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3567 /* squash fsuid/fsgid if needed */
3568 sbi = ll_i2sbi(inode);
3569 squash = &sbi->ll_squash;
3570 if (unlikely(squash->rsi_uid != 0 &&
3571 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
3572 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3576 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3577 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3578 squash->rsi_uid, squash->rsi_gid);
3580 /* update current process's credentials
3581 * and FS capability */
3582 cred = prepare_creds();
3586 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
3587 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
3588 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3589 if ((1 << cap) & CFS_CAP_FS_MASK)
3590 cap_lower(cred->cap_effective, cap);
3592 old_cred = override_creds(cred);
3595 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3597 if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
3598 rc = lustre_check_remote_perm(inode, mask);
3600 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3602 /* restore current process's credentials and FS capability */
3604 revert_creds(old_cred);
/*
 * File operations table for regular Lustre files. No .flock or .lock
 * handler appears among the members shown here, in contrast to the _flock
 * and _noflock tables that follow.
 */
3611 /* -o localflock - only provides locally consistent flock locks */
3612 struct file_operations ll_file_operations = {
3613 .read = ll_file_read,
3614 .aio_read = ll_file_aio_read,
3615 .write = ll_file_write,
3616 .aio_write = ll_file_aio_write,
3617 .unlocked_ioctl = ll_file_ioctl,
3618 .open = ll_file_open,
3619 .release = ll_file_release,
3620 .mmap = ll_file_mmap,
3621 .llseek = ll_file_seek,
3622 .splice_read = ll_file_splice_read,
/*
 * File operations table that routes both .flock and .lock to
 * ll_file_flock(); the other visible members match ll_file_operations.
 */
3627 struct file_operations ll_file_operations_flock = {
3628 .read = ll_file_read,
3629 .aio_read = ll_file_aio_read,
3630 .write = ll_file_write,
3631 .aio_write = ll_file_aio_write,
3632 .unlocked_ioctl = ll_file_ioctl,
3633 .open = ll_file_open,
3634 .release = ll_file_release,
3635 .mmap = ll_file_mmap,
3636 .llseek = ll_file_seek,
3637 .splice_read = ll_file_splice_read,
3640 .flock = ll_file_flock,
3641 .lock = ll_file_flock
/*
 * File operations table that routes both .flock and .lock to
 * ll_file_noflock(); the other visible members match ll_file_operations.
 */
3644 /* These are for -o noflock - to return ENOSYS on flock calls */
3645 struct file_operations ll_file_operations_noflock = {
3646 .read = ll_file_read,
3647 .aio_read = ll_file_aio_read,
3648 .write = ll_file_write,
3649 .aio_write = ll_file_aio_write,
3650 .unlocked_ioctl = ll_file_ioctl,
3651 .open = ll_file_open,
3652 .release = ll_file_release,
3653 .mmap = ll_file_mmap,
3654 .llseek = ll_file_seek,
3655 .splice_read = ll_file_splice_read,
3658 .flock = ll_file_noflock,
3659 .lock = ll_file_noflock
/*
 * Inode operations for Lustre files: attribute get/set, permission check,
 * extended attribute handlers and fiemap. The .get_acl hook is only set when
 * HAVE_IOP_GET_ACL is defined.
 */
3662 struct inode_operations ll_file_inode_operations = {
3663 .setattr = ll_setattr,
3664 .getattr = ll_getattr,
3665 .permission = ll_inode_permission,
3666 .setxattr = ll_setxattr,
3667 .getxattr = ll_getxattr,
3668 .listxattr = ll_listxattr,
3669 .removexattr = ll_removexattr,
3670 .fiemap = ll_fiemap,
3671 #ifdef HAVE_IOP_GET_ACL
3672 .get_acl = ll_get_acl,
/*
 * Registry of dynamically registered ioctl handlers: ioc_head lists the
 * registered llioc_data blocks and ioc_sem guards that list (taken for
 * write by ll_iocontrol_register() / ll_iocontrol_unregister() and for read
 * by ll_iocontrol_call()).
 */
3676 /* dynamic ioctl number support routines */
3677 static struct llioc_ctl_data {
3678 struct rw_semaphore ioc_sem;
3679 struct list_head ioc_head;
3681 __RWSEM_INITIALIZER(llioc.ioc_sem),
3682 LIST_HEAD_INIT(llioc.ioc_head)
/*
 * Fields of one registered ioctl handler block (struct llioc_data, as used
 * by ll_iocontrol_register()): list linkage, total allocation size, the
 * callback, and iocd_count command numbers stored in the trailing
 * iocd_cmd[] array.
 */
3687 struct list_head iocd_list;
3688 unsigned int iocd_size;
3689 llioc_callback_t iocd_cb;
3690 unsigned int iocd_count;
3691 unsigned int iocd_cmd[0];
/*
 * ll_iocontrol_register() - register callback cb for the count ioctl command
 * numbers in cmd.
 *
 * A NULL cb or cmd, or a count outside 0..LLIOC_MAX_CMD, fails the argument
 * check. Otherwise one llioc_data with room for count commands is allocated,
 * filled in (size, callback, count, copy of cmd) and appended to
 * llioc.ioc_head under the write side of llioc.ioc_sem.
 */
3694 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3697 struct llioc_data *in_data = NULL;
3700 if (cb == NULL || cmd == NULL ||
3701 count > LLIOC_MAX_CMD || count < 0)
3704 size = sizeof(*in_data) + count * sizeof(unsigned int);
3705 OBD_ALLOC(in_data, size);
3706 if (in_data == NULL)
3709 memset(in_data, 0, sizeof(*in_data));
3710 in_data->iocd_size = size;
3711 in_data->iocd_cb = cb;
3712 in_data->iocd_count = count;
3713 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3715 down_write(&llioc.ioc_sem);
3716 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3717 up_write(&llioc.ioc_sem);
/*
 * ll_iocontrol_unregister() - remove and free a previously registered
 * handler block identified by magic.
 *
 * Walks llioc.ioc_head under the write lock; the selected entry (presumably
 * the one equal to magic) is unlinked, the lock is dropped, and the entry is
 * freed with its recorded size. A warning is logged when the walk completes
 * without finding one. Both registration entry points are exported.
 */
3722 void ll_iocontrol_unregister(void *magic)
3724 struct llioc_data *tmp;
3729 down_write(&llioc.ioc_sem);
3730 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3732 unsigned int size = tmp->iocd_size;
3734 list_del(&tmp->iocd_list);
3735 up_write(&llioc.ioc_sem);
3737 OBD_FREE(tmp, size);
3741 up_write(&llioc.ioc_sem);
3743 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3746 EXPORT_SYMBOL(ll_iocontrol_register);
3747 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * ll_iocontrol_call() - dispatch cmd to the registered dynamic ioctl
 * handlers.
 *
 * Under the read side of llioc.ioc_sem, each registered block is scanned for
 * cmd in its command list and the callback of a matching block is invoked
 * with a pointer to rc (initially -EINVAL). LLIOC_STOP from a callback is
 * tested after the scan of that block. The default result is LLIOC_CONT.
 */
3749 static enum llioc_iter
3750 ll_iocontrol_call(struct inode *inode, struct file *file,
3751 unsigned int cmd, unsigned long arg, int *rcp)
3753 enum llioc_iter ret = LLIOC_CONT;
3754 struct llioc_data *data;
3755 int rc = -EINVAL, i;
3757 down_read(&llioc.ioc_sem);
3758 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3759 for (i = 0; i < data->iocd_count; i++) {
3760 if (cmd != data->iocd_cmd[i])
3763 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3767 if (ret == LLIOC_STOP)
3770 up_read(&llioc.ioc_sem);
/*
 * ll_layout_conf() - apply the configuration request conf to the cl_object
 * of inode through cl_conf_set(), inside a nested cl environment.
 *
 * For OBJECT_CONF_SET the request must carry a layout lock (asserted). The
 * lock is allowed to match only after the layout has been applied, and the
 * cached layout generation is set from md->lsm (LL_LAYOUT_GEN_EMPTY when
 * there is no lsm).
 */
3777 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3779 struct ll_inode_info *lli = ll_i2info(inode);
3780 struct cl_env_nest nest;
3785 if (lli->lli_clob == NULL)
3788 env = cl_env_nested_get(&nest);
3790 RETURN(PTR_ERR(env));
3792 result = cl_conf_set(env, lli->lli_clob, conf);
3793 cl_env_nested_put(&nest, env);
3795 if (conf->coc_opc == OBJECT_CONF_SET) {
3796 struct ldlm_lock *lock = conf->coc_lock;
3798 LASSERT(lock != NULL);
3799 LASSERT(ldlm_has_layout(lock));
3801 struct lustre_md *md = conf->u.coc_md;
3802 __u32 gen = LL_LAYOUT_GEN_EMPTY;
3804 /* it can only be allowed to match after layout is
3805 * applied to inode otherwise false layout would be
3806 * seen. Applying layout should happen before dropping
3807 * the intent lock. */
3808 ldlm_lock_allow_match(lock);
3810 lli->lli_has_smd = lsm_has_objects(md->lsm);
3811 if (md->lsm != NULL)
3812 gen = md->lsm->lsm_layout_gen;
3815 DFID ": layout version change: %u -> %u\n",
3816 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3818 ll_layout_version_set(lli, gen);
/*
 * ll_layout_fetch() - make sure the layout lock carries the layout in its
 * LVB buffer.
 *
 * The test on l_lvb_data plus the LVB-ready flag guards the fetch. The LOV
 * xattr is requested from the MDT with md_getxattr(); a reply with
 * mbo_eadatasize == 0 means an empty layout. A non-empty layout is copied
 * into a freshly allocated buffer which replaces lock->l_lvb_data (the old
 * buffer, if any, is freed) while holding the resource lock. Failures exit
 * with -EPROTO, -EFAULT or -ENOMEM as shown below.
 */
3824 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
3825 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3828 struct ll_sb_info *sbi = ll_i2sbi(inode);
3829 struct obd_capa *oc;
3830 struct ptlrpc_request *req;
3831 struct mdt_body *body;
3838 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3839 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
3840 lock->l_lvb_data, lock->l_lvb_len);
3842 if ((lock->l_lvb_data != NULL) && ldlm_is_lvb_ready(lock))
3845 /* if layout lock was granted right away, the layout is returned
3846 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3847 * blocked and then granted via completion ast, we have to fetch
3848 * layout here. Please note that we can't use the LVB buffer in
3849 * completion AST because it doesn't have a large enough buffer */
3850 oc = ll_mdscapa_get(inode);
3851 rc = ll_get_default_mdsize(sbi, &lmmsize);
3853 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3854 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3860 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3862 GOTO(out, rc = -EPROTO);
3864 lmmsize = body->mbo_eadatasize;
3865 if (lmmsize == 0) /* empty layout */
3868 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3870 GOTO(out, rc = -EFAULT);
3872 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3873 if (lvbdata == NULL)
3874 GOTO(out, rc = -ENOMEM);
3876 memcpy(lvbdata, lmm, lmmsize);
3877 lock_res_and_lock(lock);
3878 if (lock->l_lvb_data != NULL)
3879 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3881 lock->l_lvb_data = lvbdata;
3882 lock->l_lvb_len = lmmsize;
3883 unlock_res_and_lock(lock);
3888 ptlrpc_req_finished(req);
/*
 * ll_layout_lock_set() - arguments and flow as visible below.
 *   lockh, mode  handle and mode of the held layout lock; the reference is
 *                dropped with ldlm_lock_decref() in this function
 *   inode        file being configured
 *   gen          out: layout generation
 *   reconf       whether the layout may be (re)applied when the LVB is not
 *                marked ready
 * When the LVB is ready or reconf is false, *gen is read from the cached
 * layout version. Otherwise the layout is fetched with ll_layout_fetch(),
 * unpacked with obd_unpackmd() and applied through ll_layout_conf() as
 * OBJECT_CONF_SET. An -EBUSY result sets wait_layout, and an
 * OBJECT_CONF_WAIT request is issued after the lock has been released.
 */
3893 * Apply the layout to the inode. Layout lock is held and will be released
3896 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3897 struct inode *inode, __u32 *gen, bool reconf)
3899 struct ll_inode_info *lli = ll_i2info(inode);
3900 struct ll_sb_info *sbi = ll_i2sbi(inode);
3901 struct ldlm_lock *lock;
3902 struct lustre_md md = { NULL };
3903 struct cl_object_conf conf;
3906 bool wait_layout = false;
3909 LASSERT(lustre_handle_is_used(lockh));
3911 lock = ldlm_handle2lock(lockh);
3912 LASSERT(lock != NULL);
3913 LASSERT(ldlm_has_layout(lock));
3915 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured: %d",
3916 PFID(&lli->lli_fid), inode, reconf);
3918 /* in case this is a caching lock and reinstate with new inode */
3919 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3921 lock_res_and_lock(lock);
3922 lvb_ready = ldlm_is_lvb_ready(lock);
3923 unlock_res_and_lock(lock);
3924 /* checking lvb_ready is racy but this is okay. The worst case is
3925 * that multiple processes may configure the file at the same time. */
3927 if (lvb_ready || !reconf) {
3930 /* layout_gen must be valid if layout lock is not
3931 * cancelled and stripe has already set */
3932 *gen = ll_layout_version_get(lli);
3938 rc = ll_layout_fetch(inode, lock);
3942 /* for layout lock, lmm is returned in lock's lvb.
3943 * lvb_data is immutable if the lock is held so it's safe to access it
3944 * without res lock. See the description in ldlm_lock_decref_internal()
3945 * for the condition to free lvb_data of layout lock */
3946 if (lock->l_lvb_data != NULL) {
3947 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3948 lock->l_lvb_data, lock->l_lvb_len);
3950 *gen = LL_LAYOUT_GEN_EMPTY;
3952 *gen = md.lsm->lsm_layout_gen;
3955 CERROR("%s: file "DFID" unpackmd error: %d\n",
3956 ll_get_fsname(inode->i_sb, NULL, 0),
3957 PFID(&lli->lli_fid), rc);
3963 /* set layout to file. Unlikely this will fail as old layout was
3964 * surely eliminated */
3965 memset(&conf, 0, sizeof conf);
3966 conf.coc_opc = OBJECT_CONF_SET;
3967 conf.coc_inode = inode;
3968 conf.coc_lock = lock;
3969 conf.u.coc_md = &md;
3970 rc = ll_layout_conf(inode, &conf);
3973 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3975 /* refresh layout failed, need to wait */
3976 wait_layout = rc == -EBUSY;
3980 LDLM_LOCK_PUT(lock);
3981 ldlm_lock_decref(lockh, mode);
3983 /* wait for IO to complete if it's still being used. */
3985 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
3986 ll_get_fsname(inode->i_sb, NULL, 0),
3987 PFID(&lli->lli_fid), inode);
3989 memset(&conf, 0, sizeof conf);
3990 conf.coc_opc = OBJECT_CONF_WAIT;
3991 conf.coc_inode = inode;
3992 rc = ll_layout_conf(inode, &conf);
3996 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
3997 ll_get_fsname(inode->i_sb, NULL, 0),
3998 PFID(&lli->lli_fid), rc);
4004 * This function checks if there exists a LAYOUT lock on the client side,
4005 * or enqueues it if it doesn't have one in cache.
4007 * This function will not hold layout lock so it may be revoked any time after
4008 * this function returns. Any operation that depends on the layout should be redone
4011 * This function should be called before lov_io_init() to get an up-to-date
4012 * layout version. The caller should save the version number and, after IO
4013 * is finished, call this function again to verify that the layout
4014 * has not changed during the IO.
/*
 * ll_layout_refresh() - make sure a layout lock is available for @inode and
 * report the layout generation through @gen.
 *
 * @inode: file whose layout is refreshed; asserted below to be a regular
 *         file with a sane FID
 * @gen:   out parameter; loaded from ll_layout_version_get() first and then
 *         handed to ll_layout_lock_set() on the lock paths
 *
 * Flow as written below: read the cached generation; with lli_layout_mutex
 * held, try to match an existing MDS_INODELOCK_LAYOUT lock; if none matches,
 * enqueue an IT_LAYOUT intent through md_enqueue() and pass the resulting
 * lock to ll_layout_lock_set().
 *
 * Returns PTR_ERR(op_data) when ll_prep_md_op_data() fails.
 * NOTE(review): the remaining paths presumably return rc from
 * ll_layout_lock_set()/md_enqueue() - confirm.
 */
4016 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4018 struct ll_inode_info *lli = ll_i2info(inode);
4019 struct ll_sb_info *sbi = ll_i2sbi(inode);
4020 struct md_op_data *op_data;
4021 struct lookup_intent it;
4022 struct lustre_handle lockh;
/* enqueue parameters: an LDLM_IBITS lock using ll_md_blocking_ast as
 * ei_cb_bl and ldlm_completion_ast as ei_cb_cp */
4024 struct ldlm_enqueue_info einfo = {
4025 .ei_type = LDLM_IBITS,
4027 .ei_cb_bl = &ll_md_blocking_ast,
4028 .ei_cb_cp = &ldlm_completion_ast,
/* start from the cached layout generation. The test below is true when
 * LL_SBI_LAYOUT_LOCK is not set in the superblock flags or when the
 * generation is already something other than LL_LAYOUT_GEN_NONE.
 * NOTE(review): presumably no lock needs to be taken in that case -
 * confirm. */
4033 *gen = ll_layout_version_get(lli);
4034 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
/* the inode must carry a sane FID and be a regular file */
4038 LASSERT(fid_is_sane(ll_inode2fid(inode)));
4039 LASSERT(S_ISREG(inode->i_mode));
4041 /* take layout lock mutex to enqueue layout lock exclusively. */
4042 mutex_lock(&lli->lli_layout_mutex);
4045 /* mostly layout lock is cached on the local side, so try to match
4046 * it first; the layout lock mutex is already held at this point. */
4047 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4048 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4049 if (mode != 0) { /* hit cached lock */
4050 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
4054 mutex_unlock(&lli->lli_layout_mutex);
/* prepare the MD operation data used by the enqueue below; on failure
 * release the mutex and return the error encoded in the pointer */
4058 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4059 0, 0, LUSTRE_OPC_ANY, NULL);
4060 if (IS_ERR(op_data)) {
4061 mutex_unlock(&lli->lli_layout_mutex);
4062 RETURN(PTR_ERR(op_data));
4065 /* have to enqueue one */
/* build a zeroed IT_LAYOUT intent and clear the lock handle cookie
 * before enqueueing */
4066 memset(&it, 0, sizeof(it));
4067 it.it_op = IT_LAYOUT;
4068 lockh.cookie = 0ULL;
4070 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4071 ll_get_fsname(inode->i_sb, NULL, 0),
4072 PFID(&lli->lli_fid), inode);
4074 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, &it, op_data, &lockh, 0);
/* pass any it_data attached to the intent to ptlrpc_req_finished() and
 * clear the pointer so it is not used again */
4075 if (it.d.lustre.it_data != NULL)
4076 ptlrpc_req_finished(it.d.lustre.it_data);
4077 it.d.lustre.it_data = NULL;
4079 ll_finish_md_op_data(op_data);
/* save the lock mode recorded in the intent and zero it there before
 * calling ll_intent_drop_lock().
 * NOTE(review): presumably this keeps the lock reference alive for
 * ll_layout_lock_set() below - confirm. */
4081 mode = it.d.lustre.it_lock_mode;
4082 it.d.lustre.it_lock_mode = 0;
4083 ll_intent_drop_lock(&it);
4086 /* set lock data in case this is a new lock */
4087 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4088 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
/* release the layout mutex taken before the lock match above */
4092 mutex_unlock(&lli->lli_layout_mutex);
4098 * This function sends a restore request to the MDT
4100 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4102 struct hsm_user_request *hur;
4106 len = sizeof(struct hsm_user_request) +
4107 sizeof(struct hsm_user_item);
4108 OBD_ALLOC(hur, len);
4112 hur->hur_request.hr_action = HUA_RESTORE;
4113 hur->hur_request.hr_archive_id = 0;
4114 hur->hur_request.hr_flags = 0;
4115 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4116 sizeof(hur->hur_user_item[0].hui_fid));
4117 hur->hur_user_item[0].hui_extent.offset = offset;
4118 hur->hur_user_item[0].hui_extent.length = length;
4119 hur->hur_request.hr_itemcount = 1;
4120 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,