1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (c) 2002, 2003 Cluster File Systems, Inc.
5 * Author: Peter Braam <braam@clusterfs.com>
6 * Author: Phil Schwan <phil@clusterfs.com>
7 * Author: Andreas Dilger <adilger@clusterfs.com>
9 * This file is part of Lustre, http://www.lustre.org.
11 * Lustre is free software; you can redistribute it and/or
12 * modify it under the terms of version 2 of the GNU General Public
13 * License as published by the Free Software Foundation.
15 * Lustre is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with Lustre; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 #define DEBUG_SUBSYSTEM S_LLITE
26 #include <lustre_dlm.h>
27 #include <lustre_lite.h>
28 #include <lustre_mdc.h>
29 #include <linux/pagemap.h>
30 #include <linux/file.h>
31 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
32 #include <linux/lustre_compat25.h>
34 #include "llite_internal.h"
36 /* also used by llite/special.c:ll_special_open() */
/* Allocate a per-open-file ll_file_data from the ll_file_data_slab cache.
 * Also used by llite/special.c:ll_special_open().
 * NOTE(review): this listing is elided — the opening brace, error check and
 * return of @fd are not visible here; do not assume the text is complete. */
37 struct ll_file_data *ll_file_data_get(void)
39 struct ll_file_data *fd;
41 OBD_SLAB_ALLOC_PTR(fd, ll_file_data_slab);
/* Release a ll_file_data previously obtained from ll_file_data_get()
 * back to the slab cache. (Listing elided: braces/NULL-guard not visible.) */
45 static void ll_file_data_put(struct ll_file_data *fd)
48 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/* Copy the inode's current attributes (mode, times, size, blocks, flags),
 * I/O epoch and the given open file handle @fh into @op_data so they can be
 * sent to the MDS (used on close / DONE_WRITING paths).
 * NOTE(review): ll_mdscapa_get() takes a capability reference — presumably
 * released by the caller via capa_put(); confirm against callers. */
51 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
52 struct lustre_handle *fh)
54 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
55 op_data->op_attr.ia_mode = inode->i_mode;
56 op_data->op_attr.ia_atime = inode->i_atime;
57 op_data->op_attr.ia_mtime = inode->i_mtime;
58 op_data->op_attr.ia_ctime = inode->i_ctime;
59 op_data->op_attr.ia_size = inode->i_size;
60 op_data->op_attr_blocks = inode->i_blocks;
/* cast needed because ia_attr_flags lives in the ll_iattr wrapper */
61 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags = inode->i_flags;
62 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
63 memcpy(&op_data->op_handle, fh, sizeof(op_data->op_handle));
64 op_data->op_capa1 = ll_mdscapa_get(inode);
/* Prepare @op_data for an MDS close RPC on @och: select which attributes are
 * valid, decide whether size/blocks are sent (only when the server does not
 * do Size-on-MDS, or for non-regular files), close the I/O epoch, and pack
 * the inode attributes plus the open handle.
 * (Listing elided: braces and intermediate statements are missing.) */
67 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
68 struct obd_client_handle *och)
72 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
73 ATTR_MTIME_SET | ATTR_CTIME_SET;
75 if (!(och->och_flags & FMODE_WRITE))
/* no SOM support (or not a regular file): client sends size/blocks itself */
78 if (!(ll_i2mdexp(inode)->exp_connect_flags & OBD_CONNECT_SOM) ||
79 !S_ISREG(inode->i_mode))
80 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* &och: ll_epoch_close() may need to clear the handle pointer */
82 ll_epoch_close(inode, op_data, &och, 0);
85 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
/* Send the MDS close RPC for open handle @och, handle the Size-on-MDS
 * update the MDS may request, destroy OST objects if the close reply says
 * the file was unlinked, and finally clear open replay data and poison the
 * handle cookie. Returns 0 or a negative errno from the close path.
 * (Listing elided: several declarations, branches and RETURNs missing.) */
89 static int ll_close_inode_openhandle(struct obd_export *md_exp,
91 struct obd_client_handle *och)
93 struct obd_export *exp = ll_i2mdexp(inode);
94 struct md_op_data *op_data;
95 struct ptlrpc_request *req = NULL;
96 struct obd_device *obd = class_exp2obd(exp);
103 * XXX: in case of LMV, is this correct to access
106 CERROR("Invalid MDC connection handle "LPX64"\n",
107 ll_i2mdexp(inode)->exp_handle.h_cookie);
112 * here we check if this is forced umount. If so this is called on
113 * canceling "open lock" and we do not call md_close() in this case, as
114 * it will not be successful, as import is already deactivated.
119 OBD_ALLOC_PTR(op_data);
121 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
123 ll_prepare_close(inode, op_data, och);
124 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
125 rc = md_close(md_exp, op_data, och, &req);
128 /* This close must have the epoch closed. */
129 LASSERT(exp->exp_connect_flags & OBD_CONNECT_SOM);
130 LASSERT(epoch_close);
131 /* MDS has instructed us to obtain Size-on-MDS attribute from
132 * OSTs and send setattr to back to MDS. */
133 rc = ll_sizeonmds_update(inode, &och->och_fh,
134 op_data->op_ioepoch);
136 CERROR("inode %lu mdc Size-on-MDS update failed: "
137 "rc = %d\n", inode->i_ino, rc);
141 CERROR("inode %lu mdc close failed: rc = %d\n",
144 ll_finish_md_op_data(op_data);
/* close reply may carry a list of OST objects to destroy (unlink case) */
147 rc = ll_objects_destroy(req, inode);
149 CERROR("inode %lu ll_objects destroy: rc = %d\n",
153 ptlrpc_req_finished(req); /* This is close request */
/* SOM epoch still open on a written regular file: defer to DONE_WRITING */
157 if ((exp->exp_connect_flags & OBD_CONNECT_SOM) && !epoch_close &&
158 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
159 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
161 md_clear_open_replay_data(md_exp, och);
162 /* Free @och if it is not waiting for DONE_WRITING. */
163 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
/* Drop the MDS open handle of the given kind (write/exec/read, chosen from
 * @flags) for @inode if no other local opens of that kind remain; otherwise
 * do nothing. The handle pointer and its use count are selected under
 * lli_och_sem. (Listing elided: the handle-grab between the two semaphore
 * sections and the RETURNs are not visible.) */
170 int ll_md_real_close(struct inode *inode, int flags)
172 struct ll_inode_info *lli = ll_i2info(inode);
173 struct obd_client_handle **och_p;
174 struct obd_client_handle *och;
179 if (flags & FMODE_WRITE) {
180 och_p = &lli->lli_mds_write_och;
181 och_usecount = &lli->lli_open_fd_write_count;
182 } else if (flags & FMODE_EXEC) {
183 och_p = &lli->lli_mds_exec_och;
184 och_usecount = &lli->lli_open_fd_exec_count;
186 LASSERT(flags & FMODE_READ);
187 och_p = &lli->lli_mds_read_och;
188 och_usecount = &lli->lli_open_fd_read_count;
191 down(&lli->lli_och_sem);
192 if (*och_usecount) { /* There are still users of this handle, so
194 up(&lli->lli_och_sem);
199 up(&lli->lli_och_sem);
201 if (och) { /* There might be a race and somebody have freed this och
203 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/* Per-file-descriptor close: drop any group lock, decrement the open count
 * for the fd's open mode under lli_och_sem, and — unless a cached OPEN DLM
 * lock lets us skip it — call ll_md_real_close() to close the MDS handle.
 * Finally detach and free the ll_file_data and close capabilities.
 * (Listing elided: lockmode selection and several braces are missing.) */
210 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
213 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
214 struct ll_inode_info *lli = ll_i2info(inode);
218 /* clear group lock, if present */
219 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
220 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
221 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
222 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP,
226 /* Let's see if we have good enough OPEN lock on the file and if
227 we can skip talking to MDS */
228 if (file->f_dentry->d_inode) { /* Can this ever be false? */
230 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
231 struct lustre_handle lockh;
232 struct inode *inode = file->f_dentry->d_inode;
233 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
235 down(&lli->lli_och_sem);
236 if (fd->fd_omode & FMODE_WRITE) {
238 LASSERT(lli->lli_open_fd_write_count);
239 lli->lli_open_fd_write_count--;
240 } else if (fd->fd_omode & FMODE_EXEC) {
242 LASSERT(lli->lli_open_fd_exec_count);
243 lli->lli_open_fd_exec_count--;
246 LASSERT(lli->lli_open_fd_read_count);
247 lli->lli_open_fd_read_count--;
249 up(&lli->lli_och_sem);
/* no matching cached OPEN lock: must really close the MDS handle */
251 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
252 LDLM_IBITS, &policy, lockmode,
254 rc = ll_md_real_close(file->f_dentry->d_inode,
258 CERROR("Releasing a file %p with negative dentry %p. Name %s",
259 file, file->f_dentry, file->f_dentry->d_name.name);
262 LUSTRE_FPRIVATE(file) = NULL;
263 ll_file_data_put(fd);
264 ll_capa_close(inode);
269 int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm);
271 /* While this returns an error code, fput() the caller does not, so we need
272 * to make every effort to clean up all of our state here. Also, applications
273 * rarely check close errors and even if an error is returned they will not
274 * re-try the close call.
/* VFS ->release() handler: bump the release stat, clear any pending async
 * write error recorded on the stripe MD, and close via ll_md_close().
 * The root dentry ("/") is special-cased and skipped.
 * (Listing elided: NULL-check of @fd and RETURNs are not visible.) */
276 int ll_file_release(struct inode *inode, struct file *file)
278 struct ll_file_data *fd;
279 struct ll_sb_info *sbi = ll_i2sbi(inode);
280 struct ll_inode_info *lli = ll_i2info(inode);
281 struct lov_stripe_md *lsm = lli->lli_smd;
285 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
286 inode->i_generation, inode);
288 /* don't do anything for / */
289 if (inode->i_sb->s_root == file->f_dentry)
292 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
293 fd = LUSTRE_FPRIVATE(file);
296 /* don't do anything for / */
297 if (inode->i_sb->s_root == file->f_dentry) {
298 LUSTRE_FPRIVATE(file) = NULL;
299 ll_file_data_put(fd);
/* discard any stale async write error before this close reports status */
304 lov_test_and_clear_async_rc(lsm);
305 lli->lli_async_rc = 0;
307 rc = ll_md_close(sbi->ll_md_exp, inode, file);
/* Perform an MDS intent-open for @file (NFSD / patchless-client path, or
 * when setting stripe info with non-NULL @lmm). Builds op_data for the
 * parent+child, runs md_intent_lock(), attaches the returned DLM lock to the
 * inode, and refreshes the inode from the reply.
 * (Listing elided: error-path labels and RETURNs are not visible.) */
311 static int ll_intent_file_open(struct file *file, void *lmm,
312 int lmmsize, struct lookup_intent *itp)
314 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
315 struct dentry *parent = file->f_dentry->d_parent;
316 const char *name = file->f_dentry->d_name.name;
317 const int len = file->f_dentry->d_name.len;
318 struct md_op_data *op_data;
319 struct ptlrpc_request *req;
325 /* Usually we come here only for NFSD, and we want open lock.
326 But we can also get here with pre 2.6.15 patchless kernels, and in
327 that case that lock is also ok */
328 /* We can also get here if there was cached open handle in revalidate_it
329 * but it disappeared while we were getting from there to ll_file_open.
330 * But this means this file was closed and immediately opened which
331 * makes a good candidate for using OPEN lock */
332 /* If lmmsize & lmm are not 0, we are just setting stripe info
333 * parameters. No need for the open lock */
334 if (!lmm && !lmmsize)
335 itp->it_flags |= MDS_OPEN_LOCK;
337 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
338 file->f_dentry->d_inode, name, len,
339 O_RDWR, LUSTRE_OPC_ANY, NULL);
341 RETURN(PTR_ERR(op_data));
343 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
344 0 /*unused */, &req, ll_md_blocking_ast, 0);
345 ll_finish_md_op_data(op_data);
347 /* reason for keep own exit path - don`t flood log
348 * with messages with -ESTALE errors.
350 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
351 it_open_error(DISP_OPEN_OPEN, itp))
353 ll_release_openhandle(file->f_dentry, itp);
357 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
358 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
359 CERROR("lock enqueue: err: %d\n", rc);
/* bind the granted DLM lock to the inode so later matches find it */
363 if (itp->d.lustre.it_lock_mode)
364 md_set_lock_data(sbi->ll_md_exp,
365 &itp->d.lustre.it_lock_handle,
366 file->f_dentry->d_inode);
368 rc = ll_prep_inode(&file->f_dentry->d_inode, req, DLM_REPLY_REC_OFF,
371 ptlrpc_req_finished(itp->d.lustre.it_data);
374 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
375 ll_intent_drop_lock(itp);
/* Fill @och from the intent-open reply carried in @it: copy the MDS file
 * handle, mark the handle valid, record fid/flags, pick up the I/O epoch,
 * and register the open for replay. Returns md_set_open_replay_data() rc. */
380 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
381 struct lookup_intent *it, struct obd_client_handle *och)
383 struct ptlrpc_request *req = it->d.lustre.it_data;
384 struct mdt_body *body;
388 body = lustre_msg_buf(req->rq_repmsg, DLM_REPLY_REC_OFF, sizeof(*body));
389 LASSERT(body != NULL); /* reply already checked out */
390 LASSERT_REPSWABBED(req, DLM_REPLY_REC_OFF); /* and swabbed in md_enqueue */
392 memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
393 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
394 och->och_fid = lli->lli_fid;
395 och->och_flags = it->it_flags;
396 lli->lli_ioepoch = body->ioepoch;
398 return md_set_open_replay_data(md_exp, och, req);
/* Complete the client-side part of an open: when @och is non-NULL, fill it
 * from the intent reply via ll_och_fill(); then attach @fd to the file,
 * initialize readahead state and record the open mode.
 * (Listing elided: some branches/RETURNs are not visible.) */
401 int ll_local_open(struct file *file, struct lookup_intent *it,
402 struct ll_file_data *fd, struct obd_client_handle *och)
404 struct inode *inode = file->f_dentry->d_inode;
405 struct ll_inode_info *lli = ll_i2info(inode);
408 LASSERT(!LUSTRE_FPRIVATE(file));
413 struct ptlrpc_request *req = it->d.lustre.it_data;
414 struct mdt_body *body;
417 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
421 body = lustre_msg_buf(req->rq_repmsg,
422 DLM_REPLY_REC_OFF, sizeof(*body));
424 if ((it->it_flags & FMODE_WRITE) &&
425 (body->valid & OBD_MD_FLSIZE))
427 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
428 lli->lli_ioepoch, PFID(&lli->lli_fid));
432 LUSTRE_FPRIVATE(file) = fd;
433 ll_readahead_init(inode, &fd->fd_ras);
434 fd->fd_omode = it->it_flags;
438 /* Open a file, and (for the very first open) create objects on the OSTs at
439 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
440 * creation or open until ll_lov_setstripe() ioctl is called. We grab
441 * lli_open_sem to ensure no other process will create objects, send the
442 * stripe MD to the MDS, or try to destroy the objects if that fails.
444 * If we already have the stripe MD locally then we don't request it in
445 * md_open(), by passing a lmm_size = 0.
447 * It is up to the application to ensure no other processes open this file
448 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
449 * used. We might be able to avoid races of that sort by getting lli_open_sem
450 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
451 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/* VFS ->open() handler. Reuses an existing MDS open handle of the matching
 * mode (write/exec/read) when one is cached on the inode, otherwise performs
 * an intent open via ll_intent_file_open() and records the new handle; all
 * handle bookkeeping is done under lli_och_sem. The root dentry is
 * special-cased. O_LOV_DELAY_CREATE postpones OST object creation until
 * ll_lov_setstripe(). (Listing elided: numerous braces, GOTO targets and
 * RETURNs are not visible — treat control flow shown here as partial.) */
453 int ll_file_open(struct inode *inode, struct file *file)
455 struct ll_inode_info *lli = ll_i2info(inode);
456 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
457 .it_flags = file->f_flags };
458 struct lov_stripe_md *lsm;
459 struct ptlrpc_request *req = NULL;
460 struct obd_client_handle **och_p;
462 struct ll_file_data *fd;
466 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
467 inode->i_generation, inode, file->f_flags);
469 /* don't do anything for / */
470 if (inode->i_sb->s_root == file->f_dentry)
473 #ifdef LUSTRE_KERNEL_VERSION
476 it = file->private_data; /* XXX: compat macro */
477 file->private_data = NULL; /* prevent ll_local_open assertion */
480 fd = ll_file_data_get();
484 /* don't do anything for / */
485 if (inode->i_sb->s_root == file->f_dentry) {
486 LUSTRE_FPRIVATE(file) = fd;
/* no intent from lookup: build our own from f_flags */
490 if (!it || !it->d.lustre.it_disposition) {
491 /* Convert f_flags into access mode. We cannot use file->f_mode,
492 * because everything but O_ACCMODE mask was stripped from
494 if ((oit.it_flags + 1) & O_ACCMODE)
496 if (file->f_flags & O_TRUNC)
497 oit.it_flags |= FMODE_WRITE;
499 /* kernel only call f_op->open in dentry_open. filp_open calls
500 * dentry_open after call to open_namei that checks permissions.
501 * Only nfsd_open call dentry_open directly without checking
502 * permissions and because of that this code below is safe. */
503 if (oit.it_flags & FMODE_WRITE)
504 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
506 /* We do not want O_EXCL here, presumably we opened the file
507 * already? XXX - NFS implications? */
508 oit.it_flags &= ~O_EXCL;
513 /* Let's see if we have file open on MDS already. */
514 if (it->it_flags & FMODE_WRITE) {
515 och_p = &lli->lli_mds_write_och;
516 och_usecount = &lli->lli_open_fd_write_count;
517 } else if (it->it_flags & FMODE_EXEC) {
518 och_p = &lli->lli_mds_exec_och;
519 och_usecount = &lli->lli_open_fd_exec_count;
521 och_p = &lli->lli_mds_read_och;
522 och_usecount = &lli->lli_open_fd_read_count;
525 down(&lli->lli_och_sem);
526 if (*och_p) { /* Open handle is present */
527 if (it_disposition(it, DISP_OPEN_OPEN)) {
528 /* Well, there's extra open request that we do not need,
529 let's close it somehow. This will decref request. */
530 rc = it_open_error(DISP_OPEN_OPEN, it);
532 ll_file_data_put(fd);
533 GOTO(out_och_free, rc);
535 ll_release_openhandle(file->f_dentry, it);
536 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats,
/* reuse cached handle: och == NULL tells ll_local_open not to fill one */
541 rc = ll_local_open(file, it, fd, NULL);
543 up(&lli->lli_och_sem);
544 ll_file_data_put(fd);
548 LASSERT(*och_usecount == 0);
549 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
551 ll_file_data_put(fd);
552 GOTO(out_och_free, rc = -ENOMEM);
555 if (!it->d.lustre.it_disposition) {
556 it->it_flags |= O_CHECK_STALE;
557 rc = ll_intent_file_open(file, NULL, 0, it);
558 it->it_flags &= ~O_CHECK_STALE;
560 ll_file_data_put(fd);
561 GOTO(out_och_free, rc);
564 /* Got some error? Release the request */
565 if (it->d.lustre.it_status < 0) {
566 req = it->d.lustre.it_data;
567 ptlrpc_req_finished(req);
569 md_set_lock_data(ll_i2sbi(inode)->ll_md_exp,
570 &it->d.lustre.it_lock_handle,
571 file->f_dentry->d_inode);
573 req = it->d.lustre.it_data;
575 /* md_intent_lock() didn't get a request ref if there was an
576 * open error, so don't do cleanup on the request here
578 /* XXX (green): Should not we bail out on any error here, not
579 * just open error? */
580 rc = it_open_error(DISP_OPEN_OPEN, it);
582 ll_file_data_put(fd);
583 GOTO(out_och_free, rc);
586 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
587 rc = ll_local_open(file, it, fd, *och_p);
589 up(&lli->lli_och_sem);
590 ll_file_data_put(fd);
591 GOTO(out_och_free, rc);
594 up(&lli->lli_och_sem);
596 /* Must do this outside lli_och_sem lock to prevent deadlock where
597 different kind of OPEN lock for this same inode gets cancelled
598 by ldlm_cancel_lru */
599 if (!S_ISREG(inode->i_mode))
606 if (file->f_flags & O_LOV_DELAY_CREATE ||
607 !(file->f_mode & FMODE_WRITE)) {
608 CDEBUG(D_INODE, "object creation was delayed\n");
612 file->f_flags &= ~O_LOV_DELAY_CREATE;
615 ptlrpc_req_finished(req);
617 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
621 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
622 *och_p = NULL; /* OBD_FREE writes some magic there */
625 up(&lli->lli_och_sem);
631 /* Fills the obdo with the attributes for the inode defined by lsm */
/* Fetch OST attributes (size, blocks, times, ...) for the objects backing
 * @inode: build an obd_info keyed by the stripe MD's object id/group, issue
 * an async getattr through a ptlrpc request set, wait for it, then refresh
 * the inode from the merged obdo. (Listing elided: obdo setup of @obdo vs
 * oinfo.oi_oa and the error/return paths are not fully visible.) */
632 int ll_inode_getattr(struct inode *inode, struct obdo *obdo)
634 struct ptlrpc_request_set *set;
635 struct ll_inode_info *lli = ll_i2info(inode);
636 struct lov_stripe_md *lsm = lli->lli_smd;
638 struct obd_info oinfo = { { { 0 } } };
642 LASSERT(lsm != NULL);
646 oinfo.oi_oa->o_id = lsm->lsm_object_id;
647 oinfo.oi_oa->o_gr = lsm->lsm_object_gr;
648 oinfo.oi_oa->o_mode = S_IFREG;
649 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
650 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
651 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
652 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
654 oinfo.oi_capa = ll_mdscapa_get(inode);
656 set = ptlrpc_prep_set();
658 CERROR("can't allocate ptlrpc set\n");
661 rc = obd_getattr_async(ll_i2dtexp(inode), &oinfo, set);
663 rc = ptlrpc_set_wait(set);
664 ptlrpc_set_destroy(set);
666 capa_put(oinfo.oi_capa);
/* only trust the fields the OSTs actually returned */
670 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
671 OBD_MD_FLATIME | OBD_MD_FLMTIME |
672 OBD_MD_FLCTIME | OBD_MD_FLSIZE);
674 obdo_refresh_inode(inode, oinfo.oi_oa, oinfo.oi_oa->o_valid);
675 CDEBUG(D_INODE, "objid "LPX64" size %Lu, blocks %lu, blksize %lu\n",
676 lli->lli_smd->lsm_object_id, inode->i_size, inode->i_blocks,
/* Clear setuid (and setgid when group-execute is set) from the inode mode
 * after a write by a non-CAP_FSETID process, mirroring the kernel's
 * remove_suid() semantics. (Listing elided: mode declaration and the
 * attribute-update tail are not visible.) */
681 static inline void ll_remove_suid(struct inode *inode)
685 /* set S_ISGID if S_IXGRP is set, and always set S_ISUID */
686 mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
688 /* was any of the uid bits set? */
689 mode &= inode->i_mode;
690 if (mode && !capable(CAP_FSETID)) {
691 inode->i_mode &= ~mode;
692 // XXX careful here - we cannot change the size
/* Map an extent DLM @lock back to the stripe index it covers within
 * @inode's LOV stripe MD, via obd_get_info("lock_to_stripe"). Sanity-checks
 * that the lock's resource name matches the stripe object id/group.
 * Returns the stripe index, or -ELDLM_NO_LOCK_DATA on mismatch.
 * (Listing elided: the anonymous key-struct opening and RETURN(stripe).) */
696 static int ll_lock_to_stripe_offset(struct inode *inode, struct ldlm_lock *lock)
698 struct ll_inode_info *lli = ll_i2info(inode);
699 struct lov_stripe_md *lsm = lli->lli_smd;
700 struct obd_export *exp = ll_i2dtexp(inode);
703 struct ldlm_lock *lock;
704 struct lov_stripe_md *lsm;
705 } key = { .name = "lock_to_stripe", .lock = lock, .lsm = lsm };
706 __u32 stripe, vallen = sizeof(stripe);
/* single-striped file: the answer is trivially stripe 0 */
710 if (lsm->lsm_stripe_count == 1)
711 GOTO(check, stripe = 0);
713 /* get our offset in the lov */
714 rc = obd_get_info(exp, sizeof(key), &key, &vallen, &stripe);
716 CERROR("obd_get_info: rc = %d\n", rc);
719 LASSERT(stripe < lsm->lsm_stripe_count);
722 if (lsm->lsm_oinfo[stripe]->loi_id != lock->l_resource->lr_name.name[0]||
723 lsm->lsm_oinfo[stripe]->loi_gr != lock->l_resource->lr_name.name[2]){
724 LDLM_ERROR(lock, "resource doesn't match object "LPU64"/"LPU64,
725 lsm->lsm_oinfo[stripe]->loi_id,
726 lsm->lsm_oinfo[stripe]->loi_gr);
727 RETURN(-ELDLM_NO_LOCK_DATA);
733 /* Flush the page cache for an extent as its canceled. When we're on an LOV,
734 * we get a lock cancellation for each stripe, so we have to map the obd's
735 * region back onto the stripes in the file that it held.
737 * No one can dirty the extent until we've finished our work and they can
738 * enqueue another lock. The DLM protects us from ll_file_read/write here,
739 * but other kernel actors could have pages locked.
741 * Called with the DLM lock held. */
/* Evict (write back or discard) the page-cache pages of @inode covered by a
 * cancelled extent @lock on stripe @stripe: map the lock's [start,end]
 * extent onto file page indices (accounting for striping), tear down any
 * mmap()ed pages in range, then walk each page — writing dirty pages unless
 * LDLM_FL_DISCARD_DATA is set, and truncating pages no other DLM lock still
 * covers. Called with the DLM lock held.
 * (Listing elided: several loop/branch lines are not visible.) */
742 void ll_pgcache_remove_extent(struct inode *inode, struct lov_stripe_md *lsm,
743 struct ldlm_lock *lock, __u32 stripe)
745 ldlm_policy_data_t tmpex;
746 unsigned long start, end, count, skip, i, j;
748 int rc, rc2, discard = lock->l_flags & LDLM_FL_DISCARD_DATA;
749 struct lustre_handle lockh;
752 memcpy(&tmpex, &lock->l_policy_data, sizeof(tmpex));
753 CDEBUG(D_INODE|D_PAGE, "inode %lu(%p) ["LPU64"->"LPU64"] size: %llu\n",
754 inode->i_ino, inode, tmpex.l_extent.start, tmpex.l_extent.end,
757 /* our locks are page granular thanks to osc_enqueue, we invalidate the
759 if ((tmpex.l_extent.start & ~CFS_PAGE_MASK) != 0 ||
760 ((tmpex.l_extent.end + 1) & ~CFS_PAGE_MASK) != 0)
761 LDLM_ERROR(lock, "lock not aligned on PAGE_SIZE %lu",
763 LASSERT((tmpex.l_extent.start & ~CFS_PAGE_MASK) == 0);
764 LASSERT(((tmpex.l_extent.end + 1) & ~CFS_PAGE_MASK) == 0);
/* translate the stripe-object extent into file page indices */
768 start = tmpex.l_extent.start >> CFS_PAGE_SHIFT;
769 end = tmpex.l_extent.end >> CFS_PAGE_SHIFT;
770 if (lsm->lsm_stripe_count > 1) {
771 count = lsm->lsm_stripe_size >> CFS_PAGE_SHIFT;
772 skip = (lsm->lsm_stripe_count - 1) * count;
773 start += start/count * skip + stripe * count;
775 end += end/count * skip + stripe * count;
777 if (end < tmpex.l_extent.end >> CFS_PAGE_SHIFT)
780 i = inode->i_size ? (__u64)(inode->i_size - 1) >> CFS_PAGE_SHIFT : 0;
784 CDEBUG(D_INODE|D_PAGE, "walking page indices start: %lu j: %lu "
785 "count: %lu skip: %lu end: %lu%s\n", start, start % count,
786 count, skip, end, discard ? " (DISCARDING)" : "");
788 /* walk through the vmas on the inode and tear down mmaped pages that
789 * intersect with the lock. this stops immediately if there are no
790 * mmap()ed regions of the file. This is not efficient at all and
791 * should be short lived. We'll associate mmap()ed pages with the lock
792 * and will be able to find them directly */
793 for (i = start; i <= end; i += (j + skip)) {
794 j = min(count - (i % count), end - i + 1);
796 LASSERT(inode->i_mapping);
797 if (ll_teardown_mmaps(inode->i_mapping,
798 (__u64)i << CFS_PAGE_SHIFT,
799 ((__u64)(i+j) << CFS_PAGE_SHIFT) - 1) )
803 /* this is the simplistic implementation of page eviction at
804 * cancelation. It is careful to get races with other page
805 * lockers handled correctly. fixes from bug 20 will make it
806 * more efficient by associating locks with pages and with
807 * batching writeback under the lock explicitly. */
808 for (i = start, j = start % count; i <= end;
809 j++, i++, tmpex.l_extent.start += CFS_PAGE_SIZE) {
811 CDEBUG(D_PAGE, "skip index %lu to %lu\n", i, i + skip);
817 LASSERTF(tmpex.l_extent.start< lock->l_policy_data.l_extent.end,
818 LPU64" >= "LPU64" start %lu i %lu end %lu\n",
819 tmpex.l_extent.start, lock->l_policy_data.l_extent.end,
822 if (!mapping_has_pages(inode->i_mapping)) {
823 CDEBUG(D_INODE|D_PAGE, "nothing left\n");
829 page = find_get_page(inode->i_mapping, i);
832 LL_CDEBUG_PAGE(D_PAGE, page, "lock page idx %lu ext "LPU64"\n",
833 i, tmpex.l_extent.start);
836 /* page->mapping to check with racing against teardown */
837 if (!discard && clear_page_dirty_for_io(page)) {
838 rc = ll_call_writepage(inode, page);
840 CERROR("writepage inode %lu(%p) of page %p "
841 "failed: %d\n", inode->i_ino, inode,
843 /* either waiting for io to complete or reacquiring
844 * the lock that the failed writepage released */
848 tmpex.l_extent.end = tmpex.l_extent.start + CFS_PAGE_SIZE - 1;
849 /* check to see if another DLM lock covers this page b=2765 */
850 rc2 = ldlm_lock_match(lock->l_resource->lr_namespace,
851 LDLM_FL_BLOCK_GRANTED|LDLM_FL_CBPENDING |
853 &lock->l_resource->lr_name, LDLM_EXTENT,
854 &tmpex, LCK_PR | LCK_PW, &lockh);
856 if (rc2 <= 0 && page->mapping != NULL) {
857 struct ll_async_page *llap = llap_cast_private(page);
858 /* checking again to account for writeback's
860 LL_CDEBUG_PAGE(D_PAGE, page, "truncating\n");
862 ll_ra_accounting(llap, inode->i_mapping);
863 ll_truncate_complete_page(page);
866 page_cache_release(page);
868 LASSERTF(tmpex.l_extent.start <=
869 (lock->l_policy_data.l_extent.end == ~0ULL ? ~0ULL :
870 lock->l_policy_data.l_extent.end + 1),
871 "loop too long "LPU64" > "LPU64" start %lu i %lu end %lu\n",
872 tmpex.l_extent.start, lock->l_policy_data.l_extent.end,
/* DLM blocking/cancel callback for extent locks on file data.
 * LDLM_CB_BLOCKING: cancel the lock. LDLM_CB_CANCELING: flush/discard the
 * covered page cache via ll_pgcache_remove_extent() and shrink the stripe's
 * known-minimum-size (kms) under both the lov stripe lock and the lock's
 * resource lock. (Listing elided: switch statement opening, iput and
 * RETURNs are not visible.) */
877 static int ll_extent_lock_callback(struct ldlm_lock *lock,
878 struct ldlm_lock_desc *new, void *data,
881 struct lustre_handle lockh = { 0 };
/* small non-NULL cbdata values indicate a corrupted pointer */
885 if ((unsigned long)data > 0 && (unsigned long)data < 0x1000) {
886 LDLM_ERROR(lock, "cancelling lock with bad data %p", data);
891 case LDLM_CB_BLOCKING:
892 ldlm_lock2handle(lock, &lockh);
893 rc = ldlm_cli_cancel(&lockh);
895 CERROR("ldlm_cli_cancel failed: %d\n", rc);
897 case LDLM_CB_CANCELING: {
899 struct ll_inode_info *lli;
900 struct lov_stripe_md *lsm;
904 /* This lock wasn't granted, don't try to evict pages */
905 if (lock->l_req_mode != lock->l_granted_mode)
908 inode = ll_inode_from_lock(lock);
911 lli = ll_i2info(inode);
914 if (lli->lli_smd == NULL)
918 stripe = ll_lock_to_stripe_offset(inode, lock);
922 ll_pgcache_remove_extent(inode, lsm, lock, stripe);
924 lov_stripe_lock(lsm);
925 lock_res_and_lock(lock);
926 kms = ldlm_extent_shift_kms(lock,
927 lsm->lsm_oinfo[stripe]->loi_kms);
929 if (lsm->lsm_oinfo[stripe]->loi_kms != kms)
930 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
931 lsm->lsm_oinfo[stripe]->loi_kms, kms);
932 lsm->lsm_oinfo[stripe]->loi_kms = kms;
933 unlock_res_and_lock(lock);
934 lov_stripe_unlock(lsm);
/* Completion AST for client-side async extent lock enqueues. Blocked
 * states are not expected yet (LBUG). On grant/glimpse, map the lock to its
 * stripe, and if the server returned an LVB, update the stripe's rss and
 * known-minimum-size (kms) under the resource and inode-size locks; then
 * wake waiters and drop the PR reference taken at enqueue time.
 * FIX(review): lsm_oinfo is an array of pointers everywhere else in this
 * file (see ll_lock_to_stripe_offset, ll_extent_lock_callback,
 * ll_glimpse_callback, all using lsm_oinfo[stripe]->...); the '.' accesses
 * below were inconsistent and are changed to '->'.
 * (Listing elided: declarations of lvb/kms/stripe and RETURNs missing.) */
947 int ll_async_completion_ast(struct ldlm_lock *lock, int flags, void *data)
949 /* XXX ALLOCATE - 160 bytes */
950 struct inode *inode = ll_inode_from_lock(lock);
951 struct ll_inode_info *lli = ll_i2info(inode);
952 struct lustre_handle lockh = { 0 };
957 if (flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED |
958 LDLM_FL_BLOCK_CONV)) {
959 LBUG(); /* not expecting any blocked async locks yet */
960 LDLM_DEBUG(lock, "client-side async enqueue returned a blocked "
962 ldlm_lock_dump(D_OTHER, lock, 0);
963 ldlm_reprocess_all(lock->l_resource);
967 LDLM_DEBUG(lock, "client-side async enqueue: granted/glimpsed");
969 stripe = ll_lock_to_stripe_offset(inode, lock);
973 if (lock->l_lvb_len) {
974 struct lov_stripe_md *lsm = lli->lli_smd;
976 lvb = lock->l_lvb_data;
977 lsm->lsm_oinfo[stripe]->loi_rss = lvb->lvb_size;
979 lock_res_and_lock(lock);
980 ll_inode_size_lock(inode, 1);
981 kms = MAX(lsm->lsm_oinfo[stripe]->loi_kms, lvb->lvb_size);
982 kms = ldlm_extent_shift_kms(NULL, kms);
983 if (lsm->lsm_oinfo[stripe]->loi_kms != kms)
984 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
985 lsm->lsm_oinfo[stripe]->loi_kms, kms);
986 lsm->lsm_oinfo[stripe]->loi_kms = kms;
987 ll_inode_size_unlock(inode, 1);
988 unlock_res_and_lock(lock);
993 wake_up(&lock->l_waitq);
995 ldlm_lock2handle(lock, &lockh);
996 ldlm_lock_decref(&lockh, LCK_PR);
/* Glimpse AST: answer a server glimpse request for this client's view of
 * the file on the lock's stripe. Packs an ost_lvb reply holding the
 * stripe's kms and the inode's m/a/ctime. On the normal-race error
 * (-ELDLM_NO_LOCK_DATA) an empty reply is packed instead of logging via
 * ptlrpc_error(). (Listing elided: NULL checks feeding the GOTOs, the
 * stripe declaration, iput and RETURNs are not visible.) */
1001 static int ll_glimpse_callback(struct ldlm_lock *lock, void *reqp)
1003 struct ptlrpc_request *req = reqp;
1004 struct inode *inode = ll_inode_from_lock(lock);
1005 struct ll_inode_info *lli;
1006 struct lov_stripe_md *lsm;
1007 struct ost_lvb *lvb;
1009 int size[2] = { sizeof(struct ptlrpc_body), sizeof(*lvb) };
1013 GOTO(out, rc = -ELDLM_NO_LOCK_DATA);
1014 lli = ll_i2info(inode);
1016 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
1019 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
1021 /* First, find out which stripe index this lock corresponds to. */
1022 stripe = ll_lock_to_stripe_offset(inode, lock);
1024 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
1026 rc = lustre_pack_reply(req, 2, size, NULL);
1028 CERROR("lustre_pack_reply: %d\n", rc);
1032 lvb = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof(*lvb));
1033 lvb->lvb_size = lli->lli_smd->lsm_oinfo[stripe]->loi_kms;
1034 lvb->lvb_mtime = LTIME_S(inode->i_mtime);
1035 lvb->lvb_atime = LTIME_S(inode->i_atime);
1036 lvb->lvb_ctime = LTIME_S(inode->i_ctime);
1038 LDLM_DEBUG(lock, "i_size: %llu -> stripe number %u -> kms "LPU64
1039 " atime "LPU64", mtime "LPU64", ctime "LPU64,
1040 inode->i_size, stripe, lvb->lvb_size, lvb->lvb_mtime,
1041 lvb->lvb_atime, lvb->lvb_ctime);
1046 /* These errors are normal races, so we don't want to fill the console
1047 * with messages by calling ptlrpc_error() */
1048 if (rc == -ELDLM_NO_LOCK_DATA)
1049 lustre_pack_reply(req, 1, NULL, NULL);
1051 req->rq_status = rc;
/* Merge the per-stripe lock value blocks into the VFS inode: under the
 * inode size lock, seed an lvb from the inode, let obd_merge_lvb() combine
 * the stripes' size/blocks/times, and copy the result back into i_size,
 * i_blocks and the timestamps. */
1055 static void ll_merge_lvb(struct inode *inode)
1057 struct ll_inode_info *lli = ll_i2info(inode);
1058 struct ll_sb_info *sbi = ll_i2sbi(inode);
1062 ll_inode_size_lock(inode, 1);
1063 inode_init_lvb(inode, &lvb);
1064 obd_merge_lvb(sbi->ll_dt_exp, lli->lli_smd, &lvb, 0);
1065 inode->i_size = lvb.lvb_size;
1066 inode->i_blocks = lvb.lvb_blocks;
1067 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1068 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1069 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1070 ll_inode_size_unlock(inode, 1);
/* Try to satisfy a size request purely from locally cached DLM locks: match
 * an existing PR|PW extent lock over [0, EOF]; if found, merge the cached
 * lvbs into the inode and drop the match reference via obd_cancel().
 * Zero-striped files short-circuit. (Listing elided: flags declaration and
 * the no-match RETURN are not visible.) */
1074 int ll_local_size(struct inode *inode)
1076 ldlm_policy_data_t policy = { .l_extent = { 0, OBD_OBJECT_EOF } };
1077 struct ll_inode_info *lli = ll_i2info(inode);
1078 struct ll_sb_info *sbi = ll_i2sbi(inode);
1079 struct lustre_handle lockh = { 0 };
1084 if (lli->lli_smd->lsm_stripe_count == 0)
1087 rc = obd_match(sbi->ll_dt_exp, lli->lli_smd, LDLM_EXTENT,
1088 &policy, LCK_PR | LCK_PW, &flags, inode, &lockh);
1094 ll_merge_lvb(inode);
/* drop the reference obd_match() took on the matched lock */
1095 obd_cancel(sbi->ll_dt_exp, lli->lli_smd, LCK_PR | LCK_PW, &lockh);
/* Glimpse helper for ioctl paths: enqueue an intent (LDLM_FL_HAS_INTENT)
 * PR extent request over [0, EOF] against @lsm, then merge the resulting
 * lvbs under the stripe lock and fill the caller's stat fields
 * (size/blocks/times). Positive enqueue rc is mapped to -EIO.
 * (Listing elided: the @st parameter line and RETURNs are not visible.) */
1099 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1102 struct lustre_handle lockh = { 0 };
1103 struct obd_enqueue_info einfo = { 0 };
1104 struct obd_info oinfo = { { { 0 } } };
1110 einfo.ei_type = LDLM_EXTENT;
1111 einfo.ei_mode = LCK_PR;
1112 einfo.ei_flags = LDLM_FL_HAS_INTENT;
1113 einfo.ei_cb_bl = ll_extent_lock_callback;
1114 einfo.ei_cb_cp = ldlm_completion_ast;
1115 einfo.ei_cb_gl = ll_glimpse_callback;
1116 einfo.ei_cbdata = NULL;
1118 oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
1119 oinfo.oi_lockh = &lockh;
1122 rc = obd_enqueue_rqset(sbi->ll_dt_exp, &oinfo, &einfo);
1126 CERROR("obd_enqueue returned rc %d, "
1127 "returning -EIO\n", rc);
1128 RETURN(rc > 0 ? -EIO : rc);
1131 lov_stripe_lock(lsm);
1132 memset(&lvb, 0, sizeof(lvb));
1133 obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 0);
1134 st->st_size = lvb.lvb_size;
1135 st->st_blocks = lvb.lvb_blocks;
1136 st->st_mtime = lvb.lvb_mtime;
1137 st->st_atime = lvb.lvb_atime;
1138 st->st_ctime = lvb.lvb_ctime;
1139 lov_stripe_unlock(lsm);
1144 /* NB: obd_merge_lvb will prefer locally cached writes if they extend the
1145 * file (because it prefers KMS over RSS when larger) */
/* Refresh @inode's size from the OSTs via a glimpse: skipped when the MDS
 * size lock already covers it or the file has no objects. Enqueues a PR
 * extent intent over [0, EOF] — due to LDLM_FL_HAS_INTENT this does not
 * revoke conflicting locks; instead ll_glimpse_callback() runs on each
 * holder and the merged lvb is applied with ll_merge_lvb(). Positive
 * enqueue rc maps to -EIO. (Listing elided: some RETURNs missing.) */
1146 int ll_glimpse_size(struct inode *inode, int ast_flags)
1148 struct ll_inode_info *lli = ll_i2info(inode);
1149 struct ll_sb_info *sbi = ll_i2sbi(inode);
1150 struct lustre_handle lockh = { 0 };
1151 struct obd_enqueue_info einfo = { 0 };
1152 struct obd_info oinfo = { { { 0 } } };
1156 if (lli->lli_flags & LLIF_MDS_SIZE_LOCK)
1159 CDEBUG(D_DLMTRACE, "Glimpsing inode %lu\n", inode->i_ino);
1161 if (!lli->lli_smd) {
1162 CDEBUG(D_DLMTRACE, "No objects for inode %lu\n", inode->i_ino);
1166 /* NOTE: this looks like DLM lock request, but it may not be one. Due
1167 * to LDLM_FL_HAS_INTENT flag, this is glimpse request, that
1168 * won't revoke any conflicting DLM locks held. Instead,
1169 * ll_glimpse_callback() will be called on each client
1170 * holding a DLM lock against this file, and resulting size
1171 * will be returned for each stripe. DLM lock on [0, EOF] is
1172 * acquired only if there were no conflicting locks. */
1173 einfo.ei_type = LDLM_EXTENT;
1174 einfo.ei_mode = LCK_PR;
1175 einfo.ei_flags = ast_flags | LDLM_FL_HAS_INTENT;
1176 einfo.ei_cb_bl = ll_extent_lock_callback;
1177 einfo.ei_cb_cp = ldlm_completion_ast;
1178 einfo.ei_cb_gl = ll_glimpse_callback;
1179 einfo.ei_cbdata = inode;
1181 oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
1182 oinfo.oi_lockh = &lockh;
1183 oinfo.oi_md = lli->lli_smd;
1185 rc = obd_enqueue_rqset(sbi->ll_dt_exp, &oinfo, &einfo);
1189 CERROR("obd_enqueue returned rc %d, returning -EIO\n", rc);
1190 RETURN(rc > 0 ? -EIO : rc);
1193 ll_merge_lvb(inode);
1195 CDEBUG(D_DLMTRACE, "glimpse: size: %llu, blocks: %lu\n",
1196 inode->i_size, inode->i_blocks);
/*
 * Acquire a DLM extent lock of @mode over the range in @policy for the
 * file's stripe objects, returning the handle in @lockh.  Locking is
 * skipped entirely when the file descriptor carries LL_FILE_IGNORE_LOCK
 * or the superblock has LL_SBI_NOLCK.  After a successful enqueue the
 * inode's size and times are refreshed from the merged LVB under
 * ll_inode_size_lock(); i_size itself is only reset for a full-file
 * [0, EOF] lock (see the comment below for why).
 */
1201 int ll_extent_lock(struct ll_file_data *fd, struct inode *inode,
1202                    struct lov_stripe_md *lsm, int mode,
1203                    ldlm_policy_data_t *policy, struct lustre_handle *lockh,
1206 struct ll_sb_info *sbi = ll_i2sbi(inode);
1208 struct obd_enqueue_info einfo = { 0 };
1209 struct obd_info oinfo = { { { 0 } } };
1213 LASSERT(!lustre_handle_is_used(lockh));
1214 LASSERT(lsm != NULL);
1216 /* don't drop the mmapped file to LRU */
1217 if (mapping_mapped(inode->i_mapping))
1218 ast_flags |= LDLM_FL_NO_LRU;
1220 /* XXX phil: can we do this? won't it screw the file size up? */
1221 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1222     (sbi->ll_flags & LL_SBI_NOLCK))
1225 CDEBUG(D_DLMTRACE, "Locking inode %lu, start "LPU64" end "LPU64"\n",
1226        inode->i_ino, policy->l_extent.start, policy->l_extent.end);
1228 einfo.ei_type = LDLM_EXTENT;
1229 einfo.ei_mode = mode;
1230 einfo.ei_flags = ast_flags;
1231 einfo.ei_cb_bl = ll_extent_lock_callback;
1232 einfo.ei_cb_cp = ldlm_completion_ast;
1233 einfo.ei_cb_gl = ll_glimpse_callback;
1234 einfo.ei_cbdata = inode;
1236 oinfo.oi_policy = *policy;
1237 oinfo.oi_lockh = lockh;
1240 rc = obd_enqueue(sbi->ll_dt_exp, &oinfo, &einfo);
/* The enqueue may have widened the granted extent; report it back. */
1241 *policy = oinfo.oi_policy;
1245 ll_inode_size_lock(inode, 1);
1246 inode_init_lvb(inode, &lvb);
1247 obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 1);
1249 if (policy->l_extent.start == 0 &&
1250     policy->l_extent.end == OBD_OBJECT_EOF) {
1251 /* vmtruncate()->ll_truncate() first sets the i_size and then
1252  * the kms under both a DLM lock and the
1253  * ll_inode_size_lock(). If we don't get the
1254  * ll_inode_size_lock() here we can match the DLM lock and
1255  * reset i_size from the kms before the truncating path has
1256  * updated the kms. generic_file_write can then trust the
1257  * stale i_size when doing appending writes and effectively
1258  * cancel the result of the truncate. Getting the
1259  * ll_inode_size_lock() after the enqueue maintains the DLM
1260  * -> ll_inode_size_lock() acquiring order. */
1261 inode->i_size = lvb.lvb_size;
1262 CDEBUG(D_INODE, "inode=%lu, updating i_size %llu\n",
1263        inode->i_ino, inode->i_size);
/* Times can always be refreshed from the merged LVB. */
1267 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1268 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1269 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1271 ll_inode_size_unlock(inode, 1);
/*
 * Release an extent lock previously taken by ll_extent_lock().  Mirrors
 * the lock path: if locking was skipped (LL_FILE_IGNORE_LOCK on the fd
 * or LL_SBI_NOLCK on the superblock) the cancel is skipped as well.
 */
1276 int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode,
1277                      struct lov_stripe_md *lsm, int mode,
1278                      struct lustre_handle *lockh)
1280 struct ll_sb_info *sbi = ll_i2sbi(inode);
1284 /* XXX phil: can we do this? won't it screw the file size up? */
1285 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1286     (sbi->ll_flags & LL_SBI_NOLCK))
1289 rc = obd_cancel(sbi->ll_dt_exp, lsm, mode, lockh);
/*
 * read(2) entry point.  Takes a PR extent lock covering the request
 * (optionally chunked by sbi->ll_max_rw_chunk so very wide stripes do
 * not produce one huge lock), validates the request against the
 * known-minimum-size (kms), then hands the page-cache work to
 * generic_file_read().  Files with no stripe objects are served as
 * zero-filled data up to i_size.  Returns bytes read or negative errno.
 */
1294 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1297 struct inode *inode = file->f_dentry->d_inode;
1298 struct ll_inode_info *lli = ll_i2info(inode);
1299 struct lov_stripe_md *lsm = lli->lli_smd;
1300 struct ll_sb_info *sbi = ll_i2sbi(inode);
1301 struct ll_lock_tree tree;
1302 struct ll_lock_tree_node *node;
1304 struct ll_ra_read bead;
1307 ssize_t retval, chunk, sum = 0;
1311 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1312        inode->i_ino, inode->i_generation, inode, count, *ppos);
1313 /* "If nbyte is 0, read() will return 0 and have no other results."
1314  * -- Single Unix Spec */
1318 ll_stats_ops_tally(sbi, LPROC_LL_READ_BYTES, count);
1321 /* Read on file with no objects should return zero-filled
1322  * buffers up to file size (we can get non-zero sizes with
1323  * mknod + truncate, then opening file for read. This is a
1324  * common pattern in NFS case, it seems). Bug 6243 */
1326 /* Since there are no objects on OSTs, we have nothing to get
1327  * lock on and so we are forced to access inode->i_size
1330 /* Read beyond end of file */
1331 if (*ppos >= inode->i_size)
1334 if (count > inode->i_size - *ppos)
1335 count = inode->i_size - *ppos;
1336 /* Make sure to correctly adjust the file pos pointer for
1338 notzeroed = clear_user(buf, count);
/* Bound the locked region: end of request, end of current stripe, or
 * the configured per-syscall chunk limit, whichever is smallest. */
1347 if (sbi->ll_max_rw_chunk != 0) {
1348 /* first, let's know the end of the current stripe */
1350 obd_extent_calc(sbi->ll_dt_exp, lsm, OBD_CALC_STRIPE_END,
1353 /* correct, the end is beyond the request */
1354 if (end > *ppos + count - 1)
1355 end = *ppos + count - 1;
1357 /* and chunk shouldn't be too large even if striping is wide */
1358 if (end - *ppos > sbi->ll_max_rw_chunk)
1359 end = *ppos + sbi->ll_max_rw_chunk - 1;
1361 end = *ppos + count - 1;
1364 node = ll_node_from_inode(inode, *ppos, end, LCK_PR);
1366 GOTO(out, retval = PTR_ERR(node));
1369 tree.lt_fd = LUSTRE_FPRIVATE(file);
1370 rc = ll_tree_lock(&tree, node, buf, count,
1371                   file->f_flags & O_NONBLOCK ? LDLM_FL_BLOCK_NOWAIT :0);
1373 GOTO(out, retval = rc);
1375 ll_inode_size_lock(inode, 1);
1377  * Consistency guarantees: following possibilities exist for the
1378  * relation between region being read and real file size at this
1381  *     (A): the region is completely inside of the file;
1383  *     (B-x): x bytes of region are inside of the file, the rest is
1386  *     (C): the region is completely outside of the file.
1388  * This classification is stable under DLM lock acquired by
1389  * ll_tree_lock() above, because to change class, other client has to
1390  * take DLM lock conflicting with our lock. Also, any updates to
1391  * ->i_size by other threads on this client are serialized by
1392  * ll_inode_size_lock(). This guarantees that short reads are handled
1393  * correctly in the face of concurrent writes and truncates.
1395 inode_init_lvb(inode, &lvb);
1396 obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 1);
1398 if (*ppos + count - 1 > kms) {
1399 /* A glimpse is necessary to determine whether we return a
1400  * short read (B) or some zeroes at the end of the buffer (C) */
1401 ll_inode_size_unlock(inode, 1);
1402 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1404 ll_tree_unlock(&tree);
1408 /* region is within kms and, hence, within real file size (A).
1409  * We need to increase i_size to cover the read region so that
1410  * generic_file_read() will do its job, but that doesn't mean
1411  * the kms size is _correct_, it is only the _minimum_ size.
1412  * If someone does a stat they will get the correct size which
1413  * will always be >= the kms value here. b=11081 */
1414 if (inode->i_size < kms)
1415 inode->i_size = kms;
1416 ll_inode_size_unlock(inode, 1);
1419 chunk = end - *ppos + 1;
1420 CDEBUG(D_INODE, "Read ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1421        inode->i_ino, chunk, *ppos, inode->i_size);
1423 /* turn off the kernel's read-ahead */
1424 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
1427 file->f_ra.ra_pages = 0;
1429 /* initialize read-ahead window once per syscall */
1432 bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
1433 bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
1434 ll_ra_read_in(file, &bead);
1438 file_accessed(file);
1439 retval = generic_file_read(file, buf, chunk, ppos);
1440 ll_rw_stats_tally(sbi, current->pid, file, count, 0);
1442 ll_tree_unlock(&tree);
/* A full chunk with bytes remaining means the request was split by
 * ll_max_rw_chunk: loop back for the next chunk. */
1448 if (retval == chunk && count > 0)
1454 ll_ra_read_ex(file, &bead);
1455 retval = (sum > 0) ? sum : retval;
1460  * Write to a file (through the page cache).
/*
 * write(2) entry point.  Serializes writers on lli_write_sem, takes a
 * PW extent lock over the target range (O_APPEND locks [pos, EOF];
 * otherwise the range may be chunked by sbi->ll_max_rw_chunk), enforces
 * the per-file maxbytes limit (raising SIGXFSZ per POSIX), and defers
 * the page-cache work to generic_file_write().  Returns bytes written
 * or negative errno.
 */
1462 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1465 struct inode *inode = file->f_dentry->d_inode;
1466 struct ll_sb_info *sbi = ll_i2sbi(inode);
1467 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1468 struct ll_lock_tree tree;
1469 struct ll_lock_tree_node *node;
1470 loff_t maxbytes = ll_file_maxbytes(inode);
1471 loff_t lock_start, lock_end, end;
1472 ssize_t retval, chunk, sum = 0;
1476 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1477        inode->i_ino, inode->i_generation, inode, count, *ppos);
1479 SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */
1481 /* POSIX, but surprised the VFS doesn't check this already */
1485 /* If file was opened for LL_IOC_LOV_SETSTRIPE but the ioctl wasn't
1486  * called on the file, don't fail the below assertion (bug 2388). */
1487 if (file->f_flags & O_LOV_DELAY_CREATE &&
1488     ll_i2info(inode)->lli_smd == NULL)
1491 LASSERT(ll_i2info(inode)->lli_smd != NULL);
/* One writer at a time per inode on this client. */
1493 down(&ll_i2info(inode)->lli_write_sem);
1496 chunk = 0; /* just to fix gcc's warning */
1497 end = *ppos + count - 1;
1499 if (file->f_flags & O_APPEND) {
/* Appending: the final offset is unknown until the [pos, EOF]
 * lock is held, so lock all the way to EOF. */
1501 lock_end = OBD_OBJECT_EOF;
1502 } else if (sbi->ll_max_rw_chunk != 0) {
1503 /* first, let's know the end of the current stripe */
1505 obd_extent_calc(sbi->ll_dt_exp, lsm, OBD_CALC_STRIPE_END,
1508 /* correct, the end is beyond the request */
1509 if (end > *ppos + count - 1)
1510 end = *ppos + count - 1;
1512 /* and chunk shouldn't be too large even if striping is wide */
1513 if (end - *ppos > sbi->ll_max_rw_chunk)
1514 end = *ppos + sbi->ll_max_rw_chunk - 1;
1519 lock_end = *ppos + count - 1;
1521 node = ll_node_from_inode(inode, lock_start, lock_end, LCK_PW);
1524 GOTO(out, retval = PTR_ERR(node));
1526 tree.lt_fd = LUSTRE_FPRIVATE(file);
1527 rc = ll_tree_lock(&tree, node, buf, count,
1528                   file->f_flags & O_NONBLOCK ? LDLM_FL_BLOCK_NOWAIT :0);
1530 GOTO(out, retval = rc);
1532 /* This is ok, g_f_w will overwrite this under i_sem if it races
1533  * with a local truncate, it just makes our maxbyte checking easier.
1534  * The i_size value gets updated in ll_extent_lock() as a consequence
1535  * of the [0,EOF] extent lock we requested above. */
1536 if (file->f_flags & O_APPEND) {
1537 *ppos = inode->i_size;
1538 end = *ppos + count - 1;
1541 if (*ppos >= maxbytes) {
1542 send_sig(SIGXFSZ, current, 0);
1543 GOTO(out_unlock, retval = -EFBIG);
1545 if (*ppos + count > maxbytes)
1546 count = maxbytes - *ppos;
1548 /* generic_file_write handles O_APPEND after getting i_mutex */
1549 chunk = end - *ppos + 1;
1550 CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %Lu\n",
1551        inode->i_ino, chunk, *ppos);
1552 retval = generic_file_write(file, buf, chunk, ppos);
1553 ll_rw_stats_tally(ll_i2sbi(inode), current->pid, file, count, 1);
1556 ll_tree_unlock(&tree);
/* Full chunk written with bytes left: continue with the next chunk. */
1563 if (retval == chunk && count > 0)
1567 up(&ll_i2info(inode)->lli_write_sem);
1569 retval = (sum > 0) ? sum : retval;
1570 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1571                    retval > 0 ? retval : 0);
1576  * Send file content (through pagecache) somewhere with helper
/*
 * sendfile(2) path (2.6 kernels only).  Same locking and kms/size
 * consistency logic as ll_file_read(), but without ll_max_rw_chunk
 * chunking, then delegates to generic_file_sendfile() with the caller's
 * actor.  Object-less files bypass locking entirely.
 */
1578 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
1579 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1580                                 read_actor_t actor, void *target)
1582 struct inode *inode = in_file->f_dentry->d_inode;
1583 struct ll_inode_info *lli = ll_i2info(inode);
1584 struct lov_stripe_md *lsm = lli->lli_smd;
1585 struct ll_lock_tree tree;
1586 struct ll_lock_tree_node *node;
1588 struct ll_ra_read bead;
1593 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1594        inode->i_ino, inode->i_generation, inode, count, *ppos);
1596 /* "If nbyte is 0, read() will return 0 and have no other results."
1597  * -- Single Unix Spec */
1601 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_READ_BYTES, count);
1602 /* turn off the kernel's read-ahead */
1603 in_file->f_ra.ra_pages = 0;
1605 /* File with no objects, nothing to lock */
1607 RETURN(generic_file_sendfile(in_file, ppos, count, actor, target));
1609 node = ll_node_from_inode(inode, *ppos, *ppos + count - 1, LCK_PR);
1611 RETURN(PTR_ERR(node));
1613 tree.lt_fd = LUSTRE_FPRIVATE(in_file);
1614 rc = ll_tree_lock(&tree, node, NULL, count,
1615                   in_file->f_flags & O_NONBLOCK?LDLM_FL_BLOCK_NOWAIT:0);
1619 ll_inode_size_lock(inode, 1);
1621  * Consistency guarantees: following possibilities exist for the
1622  * relation between region being read and real file size at this
1625  *     (A): the region is completely inside of the file;
1627  *     (B-x): x bytes of region are inside of the file, the rest is
1630  *     (C): the region is completely outside of the file.
1632  * This classification is stable under DLM lock acquired by
1633  * ll_tree_lock() above, because to change class, other client has to
1634  * take DLM lock conflicting with our lock. Also, any updates to
1635  * ->i_size by other threads on this client are serialized by
1636  * ll_inode_size_lock(). This guarantees that short reads are handled
1637  * correctly in the face of concurrent writes and truncates.
1639 inode_init_lvb(inode, &lvb);
1640 obd_merge_lvb(ll_i2sbi(inode)->ll_dt_exp, lsm, &lvb, 1);
1642 if (*ppos + count - 1 > kms) {
1643 /* A glimpse is necessary to determine whether we return a
1644  * short read (B) or some zeroes at the end of the buffer (C) */
1645 ll_inode_size_unlock(inode, 1);
1646 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1650 /* region is within kms and, hence, within real file size (A) */
1651 inode->i_size = kms;
1652 ll_inode_size_unlock(inode, 1);
1655 CDEBUG(D_INFO, "Send ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1656        inode->i_ino, count, *ppos, inode->i_size);
/* Set up the Lustre read-ahead window for this request. */
1658 bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
1659 bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
1660 ll_ra_read_in(in_file, &bead);
1662 file_accessed(in_file);
1663 retval = generic_file_sendfile(in_file, ppos, count, actor, target);
1664 ll_ra_read_ex(in_file, &bead);
1667 ll_tree_unlock(&tree);
/*
 * LL_IOC_RECREATE_OBJ ioctl handler: re-create a (lost) OST object for
 * this file using parameters copied in from userspace (object id,
 * group, target OST index).  Requires CAP_SYS_ADMIN.  A copy of the
 * stripe metadata is made and passed to obd_create() with
 * OBD_FL_RECREATE_OBJS set; lli_size_sem guards the lsm while in use.
 */
1672 static int ll_lov_recreate_obj(struct inode *inode, struct file *file,
1675 struct ll_inode_info *lli = ll_i2info(inode);
1676 struct obd_export *exp = ll_i2dtexp(inode);
1677 struct ll_recreate_obj ucreatp;
1678 struct obd_trans_info oti = { 0 };
1679 struct obdo *oa = NULL;
1682 struct lov_stripe_md *lsm, *lsm2;
1685 if (!capable (CAP_SYS_ADMIN))
1688 rc = copy_from_user(&ucreatp, (struct ll_recreate_obj *)arg,
1689                     sizeof(struct ll_recreate_obj));
1697 down(&lli->lli_size_sem);
1700 GOTO(out, rc = -ENOENT);
/* Size of the lsm copy, including its per-stripe lov_oinfo array. */
1701 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1702                            (lsm->lsm_stripe_count));
1704 OBD_ALLOC(lsm2, lsm_size);
1706 GOTO(out, rc = -ENOMEM);
/* Fill the obdo with the user-requested identity of the object. */
1708 oa->o_id = ucreatp.lrc_id;
1709 oa->o_gr = ucreatp.lrc_group;
1710 oa->o_nlink = ucreatp.lrc_ost_idx;
1711 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1712 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1713 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1714                 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1716 oti.oti_objid = NULL;
1717 memcpy(lsm2, lsm, lsm_size);
1718 rc = obd_create(exp, oa, &lsm2, &oti);
1720 OBD_FREE(lsm2, lsm_size);
1723 up(&lli->lli_size_sem);
/*
 * Apply user-supplied striping (@lum) to @inode by replaying an open
 * with an IT_OPEN intent that carries the layout EA.  Fails if a stripe
 * layout already exists.  lli_size_sem serializes against concurrent
 * layout access; the open handle obtained as a side effect is released
 * via ll_release_openhandle().
 */
1728 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1729                              int flags, struct lov_user_md *lum, int lum_size)
1731 struct ll_inode_info *lli = ll_i2info(inode);
1732 struct lov_stripe_md *lsm;
1733 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1737 down(&lli->lli_size_sem);
/* A layout already exists: striping can only be set once. */
1740 up(&lli->lli_size_sem);
1741 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1746 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1749 if (it_disposition(&oit, DISP_LOOKUP_NEG))
1750 GOTO(out_req_free, rc = -ENOENT);
1751 rc = oit.d.lustre.it_status;
1753 GOTO(out_req_free, rc);
1755 ll_release_openhandle(file->f_dentry, &oit);
1758 up(&lli->lli_size_sem);
1759 ll_intent_release(&oit);
1762 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA (striping information) for @filename from the MDS
 * via md_getattr_name().  The raw lov_mds_md is byte-swapped to host
 * endianness if needed, and JOIN-file layouts are expanded into a
 * lov_user_md_join with explicit per-stripe extents.  On success *lmmp
 * points into (or replaces data from) the reply held in *request, and
 * *lmm_size is its length; the caller owns the request reference.
 */
1766 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1767                              struct lov_mds_md **lmmp, int *lmm_size,
1768                              struct ptlrpc_request **request)
1770 struct ll_sb_info *sbi = ll_i2sbi(inode);
1771 struct mdt_body *body;
1772 struct lov_mds_md *lmm = NULL;
1773 struct ptlrpc_request *req = NULL;
1774 struct obd_capa *oc;
1777 rc = ll_get_max_mdsize(sbi, &lmmsize);
1781 oc = ll_mdscapa_get(inode);
1782 rc = md_getattr_name(sbi->ll_md_exp, ll_inode2fid(inode),
1783                      oc, filename, strlen(filename) + 1,
1784                      OBD_MD_FLEASIZE | OBD_MD_FLDIREA, lmmsize, &req);
1787 CDEBUG(D_INFO, "md_getattr_name failed "
1788        "on %s: rc %d\n", filename, rc);
1792 body = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof(*body));
1793 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1794 /* swabbed by mdc_getattr_name */
1795 LASSERT_REPSWABBED(req, REPLY_REC_OFF);
1797 lmmsize = body->eadatasize;
1799 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1801 GOTO(out, rc = -ENODATA);
1804 lmm = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF + 1, lmmsize);
1805 LASSERT(lmm != NULL);
1806 LASSERT_REPSWABBED(req, REPLY_REC_OFF + 1);
1809  * This is coming from the MDS, so is probably in
1810  * little endian. We convert it to host endian before
1811  * passing it to userspace.
/* A swabbed magic value means the EA is in the other byte order. */
1813 if (lmm->lmm_magic == __swab32(LOV_MAGIC)) {
1814 lustre_swab_lov_user_md((struct lov_user_md *)lmm);
1815 lustre_swab_lov_user_md_objects((struct lov_user_md *)lmm);
1816 } else if (lmm->lmm_magic == __swab32(LOV_MAGIC_JOIN)) {
1817 lustre_swab_lov_user_md_join((struct lov_user_md_join *)lmm);
/* JOIN files: unpack the md and rebuild an explicit per-stripe
 * extent table for userspace. */
1820 if (lmm->lmm_magic == LOV_MAGIC_JOIN) {
1821 struct lov_stripe_md *lsm;
1822 struct lov_user_md_join *lmj;
1823 int lmj_size, i, aindex = 0;
1825 rc = obd_unpackmd(sbi->ll_dt_exp, &lsm, lmm, lmmsize);
1827 GOTO(out, rc = -ENOMEM);
1828 rc = obd_checkmd(sbi->ll_dt_exp, sbi->ll_md_exp, lsm);
1830 GOTO(out_free_memmd, rc);
1832 lmj_size = sizeof(struct lov_user_md_join) +
1833            lsm->lsm_stripe_count *
1834            sizeof(struct lov_user_ost_data_join);
1835 OBD_ALLOC(lmj, lmj_size);
1837 GOTO(out_free_memmd, rc = -ENOMEM);
1839 memcpy(lmj, lmm, sizeof(struct lov_user_md_join));
1840 for (i = 0; i < lsm->lsm_stripe_count; i++) {
1841 struct lov_extent *lex =
1842         &lsm->lsm_array->lai_ext_array[aindex];
/* Advance to the extent covering stripe i. */
1844 if (lex->le_loi_idx + lex->le_stripe_count <= i)
1846 CDEBUG(D_INFO, "aindex %d i %d l_extent_start "
1847        LPU64" len %d\n", aindex, i,
1848        lex->le_start, (int)lex->le_len);
1849 lmj->lmm_objects[i].l_extent_start =
/* le_len == -1 denotes an extent that runs to EOF. */
1852 if ((int)lex->le_len == -1)
1853 lmj->lmm_objects[i].l_extent_end = -1;
1855 lmj->lmm_objects[i].l_extent_end =
1856         lex->le_start + lex->le_len;
1857 lmj->lmm_objects[i].l_object_id =
1858         lsm->lsm_oinfo[i]->loi_id;
1859 lmj->lmm_objects[i].l_object_gr =
1860         lsm->lsm_oinfo[i]->loi_gr;
1861 lmj->lmm_objects[i].l_ost_gen =
1862         lsm->lsm_oinfo[i]->loi_ost_gen;
1863 lmj->lmm_objects[i].l_ost_idx =
1864         lsm->lsm_oinfo[i]->loi_ost_idx;
1866 lmm = (struct lov_mds_md *)lmj;
1869 obd_free_memmd(sbi->ll_dt_exp, &lsm);
1873 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA ioctl: set a raw striping EA supplied by a
 * privileged user (CAP_SYS_ADMIN required).  Copies the lov_user_md
 * (with one ost_data entry) from userspace into a kernel buffer and
 * hands it to ll_lov_setstripe_ea_info() with MDS_OPEN_HAS_OBJS set,
 * i.e. the objects are declared to already exist.
 */
1878 static int ll_lov_setea(struct inode *inode, struct file *file,
1881 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1882 struct lov_user_md  *lump;
1883 int lum_size = sizeof(struct lov_user_md) +
1884                sizeof(struct lov_user_ost_data);
1888 if (!capable (CAP_SYS_ADMIN))
1891 OBD_ALLOC(lump, lum_size);
1895 rc = copy_from_user(lump, (struct lov_user_md *)arg, lum_size);
1897 OBD_FREE(lump, lum_size);
1901 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1903 OBD_FREE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE ioctl: copy the user's lov_user_md onto the
 * stack, apply it via ll_lov_setstripe_ea_info(), then write the
 * resulting layout back to the user buffer through the
 * LL_IOC_LOV_GETSTRIPE obd_iocontrol path (after zeroing the user's
 * stripe count so the full layout is returned).
 */
1907 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1910 struct lov_user_md lum, *lump = (struct lov_user_md *)arg;
1912 int flags = FMODE_WRITE;
1915 /* Bug 1152: copy properly when this is no longer true */
1916 LASSERT(sizeof(lum) == sizeof(*lump));
1917 LASSERT(sizeof(lum.lmm_objects[0]) == sizeof(lump->lmm_objects[0]));
1918 rc = copy_from_user(&lum, lump, sizeof(lum));
1922 rc = ll_lov_setstripe_ea_info(inode, file, flags, &lum, sizeof(lum));
1924 put_user(0, &lump->lmm_stripe_count);
1925 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1926                    0, ll_i2info(inode)->lli_smd, lump);
/*
 * LL_IOC_LOV_GETSTRIPE ioctl: return the file's striping information
 * to userspace via the LOV's obd_iocontrol handler.
 */
1931 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1933 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1938 return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0, lsm,
/*
 * LL_IOC_GROUP_LOCK ioctl: take a whole-file ([0, EOF]) LCK_GROUP
 * extent lock with group id @arg.  On success the fd is marked
 * LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK (subsequent extent locking
 * is bypassed) and the lock handle is stashed in fd_cwlockh for
 * ll_put_grouplock().  Fails if the fd already holds a group lock.
 */
1942 static int ll_get_grouplock(struct inode *inode, struct file *file,
1945 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1946 ldlm_policy_data_t policy = { .l_extent = { .start = 0,
1947                                             .end = OBD_OBJECT_EOF}};
1948 struct lustre_handle lockh = { 0 };
1949 struct ll_inode_info *lli = ll_i2info(inode);
1950 struct lov_stripe_md *lsm = lli->lli_smd;
/* Only one group lock per file descriptor. */
1954 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1958 policy.l_extent.gid = arg;
1959 if (file->f_flags & O_NONBLOCK)
1960 flags = LDLM_FL_BLOCK_NOWAIT;
1962 rc = ll_extent_lock(fd, inode, lsm, LCK_GROUP, &policy, &lockh, flags);
1966 fd->fd_flags |= LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK;
1968 memcpy(&fd->fd_cwlockh, &lockh, sizeof(lockh));
/*
 * LL_IOC_GROUP_UNLOCK ioctl: drop the group lock taken by
 * ll_get_grouplock().  Rejects the request if no group lock is held or
 * if the caller's group id does not match the one used to lock.
 * Clears the group-lock flags and the cached handle on success.
 */
1973 static int ll_put_grouplock(struct inode *inode, struct file *file,
1976 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1977 struct ll_inode_info *lli = ll_i2info(inode);
1978 struct lov_stripe_md *lsm = lli->lli_smd;
1982 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1983 /* Ugh, it's already unlocked. */
1987 if (fd->fd_gid != arg) /* Ugh? Unlocking with different gid? */
1990 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
1992 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP, &fd->fd_cwlockh);
1997 memset(&fd->fd_cwlockh, 0, sizeof(fd->fd_cwlockh));
/*
 * Validate a file-join request before attempting it: the server must
 * advertise LL_SBI_JOIN, both inodes must be regular files, a file may
 * not be joined to itself, and the head's size must be a multiple of
 * JOIN_FILE_ALIGN (64K).
 */
2002 static int join_sanity_check(struct inode *head, struct inode *tail)
2005 if ((ll_i2sbi(head)->ll_flags & LL_SBI_JOIN) == 0) {
2006 CERROR("server do not support join \n");
2009 if (!S_ISREG(tail->i_mode) || !S_ISREG(head->i_mode)) {
2010 CERROR("tail ino %lu and ino head %lu must be regular\n",
2011        head->i_ino, tail->i_ino);
2014 if (head->i_ino == tail->i_ino) {
2015 CERROR("file %lu can not be joined to itself \n", head->i_ino);
2018 if (head->i_size % JOIN_FILE_ALIGN) {
2019 CERROR("hsize %llu must be times of 64K\n", head->i_size);
/*
 * Perform the MDS side of joining @tail_filp's file onto @head_inode:
 * enqueue an IT_OPEN intent with O_JOIN_FILE under LCK_CW, naming the
 * tail via its parent directory and dentry name.  Any lock granted with
 * the intent is dropped immediately; the open handle created as a side
 * effect is released through ll_release_openhandle().
 */
2025 static int join_file(struct inode *head_inode, struct file *head_filp,
2026                      struct file *tail_filp)
2028 struct inode *tail_inode, *tail_parent;
2029 struct dentry *tail_dentry = tail_filp->f_dentry;
2030 struct lookup_intent oit = {.it_op = IT_OPEN,
2031                             .it_flags = head_filp->f_flags|O_JOIN_FILE};
2032 struct lustre_handle lockh;
2033 struct md_op_data *op_data;
2037 tail_dentry = tail_filp->f_dentry;
2038 tail_inode = tail_dentry->d_inode;
2039 tail_parent = tail_dentry->d_parent->d_inode;
2041 op_data = ll_prep_md_op_data(NULL, head_inode, tail_parent,
2042                              tail_dentry->d_name.name,
2043                              tail_dentry->d_name.len, 0,
2044                              LUSTRE_OPC_ANY, &head_inode->i_size);
2045 if (IS_ERR(op_data))
2046 RETURN(PTR_ERR(op_data));
2048 rc = md_enqueue(ll_i2mdexp(head_inode), LDLM_IBITS, &oit, LCK_CW,
2049                 op_data, &lockh, NULL, 0, ldlm_completion_ast,
2050                 ll_md_blocking_ast, NULL, 0);
2052 ll_finish_md_op_data(op_data);
2056 rc = oit.d.lustre.it_status;
2058 if (rc < 0 || it_open_error(DISP_OPEN_OPEN, &oit)) {
2059 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, &oit);
2060 ptlrpc_req_finished((struct ptlrpc_request *)
2061                     oit.d.lustre.it_data);
2065 if (oit.d.lustre.it_lock_mode) { /* If we got lock - release it right
2067 ldlm_lock_decref(&lockh, oit.d.lustre.it_lock_mode);
2068 oit.d.lustre.it_lock_mode = 0;
2070 ll_release_openhandle(head_filp->f_dentry, &oit);
2072 ll_intent_release(&oit);
/*
 * LL_IOC_JOIN implementation: append the file named @filename_tail to
 * @head.  Opens the tail, orders the two inodes by inode number to take
 * the two whole-file LCK_EX tree locks in a deadlock-free order, runs
 * join_sanity_check(), then join_file().  Cleanup is phase-driven:
 * each acquired resource bumps cleanup_phase and the switch below
 * unwinds in reverse (unused locks on both files are also cancelled).
 * On success the head's cached stripe md is discarded so the joined
 * layout is re-fetched.
 */
2076 static int ll_file_join(struct inode *head, struct file *filp,
2077                         char *filename_tail)
2079 struct inode *tail = NULL, *first = NULL, *second = NULL;
2080 struct dentry *tail_dentry;
2081 struct file *tail_filp, *first_filp, *second_filp;
2082 struct ll_lock_tree first_tree, second_tree;
2083 struct ll_lock_tree_node *first_node, *second_node;
2084 struct ll_inode_info *hlli = ll_i2info(head), *tlli;
2085 int rc = 0, cleanup_phase = 0;
2088 CDEBUG(D_VFSTRACE, "VFS Op:head=%lu/%u(%p) tail %s\n",
2089        head->i_ino, head->i_generation, head, filename_tail);
2091 tail_filp = filp_open(filename_tail, O_WRONLY, 0644);
2092 if (IS_ERR(tail_filp)) {
2093 CERROR("Can not open tail file %s", filename_tail);
2094 rc = PTR_ERR(tail_filp);
2097 tail = igrab(tail_filp->f_dentry->d_inode);
2099 tlli = ll_i2info(tail);
2100 tail_dentry = tail_filp->f_dentry;
2101 LASSERT(tail_dentry);
2104 /*reorder the inode for lock sequence*/
2105 first = head->i_ino > tail->i_ino ? head : tail;
2106 second = head->i_ino > tail->i_ino ? tail : head;
2107 first_filp = head->i_ino > tail->i_ino ? filp : tail_filp;
2108 second_filp = head->i_ino > tail->i_ino ? tail_filp : filp;
2110 CDEBUG(D_INFO, "reorder object from %lu:%lu to %lu:%lu \n",
2111        head->i_ino, tail->i_ino, first->i_ino, second->i_ino);
2112 first_node = ll_node_from_inode(first, 0, OBD_OBJECT_EOF, LCK_EX);
2113 if (IS_ERR(first_node)){
2114 rc = PTR_ERR(first_node);
2117 first_tree.lt_fd = first_filp->private_data;
2118 rc = ll_tree_lock(&first_tree, first_node, NULL, 0, 0);
2123 second_node = ll_node_from_inode(second, 0, OBD_OBJECT_EOF, LCK_EX);
2124 if (IS_ERR(second_node)){
2125 rc = PTR_ERR(second_node);
2128 second_tree.lt_fd = second_filp->private_data;
2129 rc = ll_tree_lock(&second_tree, second_node, NULL, 0, 0);
2134 rc = join_sanity_check(head, tail);
2138 rc = join_file(head, filp, tail_filp);
/* Unwind resources in reverse acquisition order; fall-through by
 * design between phases. */
2142 switch (cleanup_phase) {
2144 ll_tree_unlock(&second_tree);
2145 obd_cancel_unused(ll_i2dtexp(second),
2146                   ll_i2info(second)->lli_smd, 0, NULL);
2148 ll_tree_unlock(&first_tree);
2149 obd_cancel_unused(ll_i2dtexp(first),
2150                   ll_i2info(first)->lli_smd, 0, NULL);
2152 filp_close(tail_filp, 0);
/* On success drop the head's cached layout so it is re-read. */
2155 if (head && rc == 0) {
2156 obd_free_memmd(ll_i2sbi(head)->ll_dt_exp,
2158 hlli->lli_smd = NULL;
2163 CERROR("invalid cleanup_phase %d\n", cleanup_phase);
/*
 * Close the MDS open handle created as a side effect of an intent
 * (e.g. setstripe or join opens).  No-op for the filesystem root or
 * when the intent carries no DISP_OPEN_OPEN disposition.  Fills a
 * temporary obd_client_handle from the intent and closes it via
 * ll_close_inode_openhandle(); also drops the request reference that
 * stood in place of a real ll_file_open().
 */
2169 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2171 struct inode *inode = dentry->d_inode;
2172 struct obd_client_handle *och;
2178 /* Root ? Do nothing. */
2179 if (dentry->d_inode->i_sb->s_root == dentry)
2182 /* No open handle to close? Move away */
2183 if (!it_disposition(it, DISP_OPEN_OPEN))
2186 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2188 OBD_ALLOC(och, sizeof(*och));
2190 GOTO(out, rc = -ENOMEM);
2192 ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
2193             ll_i2info(inode), it, och);
2195 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
2198 /* this one is in place of ll_file_open */
2199 ptlrpc_req_finished(it->d.lustre.it_data);
2200 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * ioctl(2) dispatcher for regular Lustre files.  Handles the LL_IOC_*
 * family (flags, striping, object recreation, join, group locks,
 * statfs, security context flush, remote ACLs), forwards a few EXT3
 * ioctls, and falls through to obd_iocontrol() for anything not
 * recognized here.  tty ioctls are explicitly rejected up front.
 */
2204 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
2207 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2211 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2212        inode->i_generation, inode, cmd);
2213 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2215 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2216 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2220 case LL_IOC_GETFLAGS:
2221 /* Get the current value of the file flags */
2222 return put_user(fd->fd_flags, (int *)arg);
2223 case LL_IOC_SETFLAGS:
2224 case LL_IOC_CLRFLAGS:
2225 /* Set or clear specific file flags */
2226 /* XXX This probably needs checks to ensure the flags are
2227  * not abused, and to handle any flag side effects.
2229 if (get_user(flags, (int *) arg))
2232 if (cmd == LL_IOC_SETFLAGS) {
/* Disabling locking is only safe for O_DIRECT I/O. */
2233 if ((flags & LL_FILE_IGNORE_LOCK) &&
2234     !(file->f_flags & O_DIRECT)) {
2235 CERROR("%s: unable to disable locking on "
2236        "non-O_DIRECT file\n", current->comm);
2240 fd->fd_flags |= flags;
2242 fd->fd_flags &= ~flags;
2245 case LL_IOC_LOV_SETSTRIPE:
2246 RETURN(ll_lov_setstripe(inode, file, arg));
2247 case LL_IOC_LOV_SETEA:
2248 RETURN(ll_lov_setea(inode, file, arg));
2249 case LL_IOC_LOV_GETSTRIPE:
2250 RETURN(ll_lov_getstripe(inode, arg));
2251 case LL_IOC_RECREATE_OBJ:
2252 RETURN(ll_lov_recreate_obj(inode, file, arg));
2253 case EXT3_IOC_GETFLAGS:
2254 case EXT3_IOC_SETFLAGS:
2255 RETURN(ll_iocontrol(inode, file, cmd, arg));
2256 case EXT3_IOC_GETVERSION_OLD:
2257 case EXT3_IOC_GETVERSION:
2258 RETURN(put_user(inode->i_generation, (int *)arg));
/* Join: copy the tail pathname in from userspace. */
2263 ftail = getname((const char *)arg);
2265 RETURN(PTR_ERR(ftail));
2266 rc = ll_file_join(inode, file, ftail);
2270 case LL_IOC_GROUP_LOCK:
2271 RETURN(ll_get_grouplock(inode, file, arg));
2272 case LL_IOC_GROUP_UNLOCK:
2273 RETURN(ll_put_grouplock(inode, file, arg));
2274 case IOC_OBD_STATFS:
2275 RETURN(ll_obd_statfs(inode, (void *)arg));
2277 /* We need to special case any other ioctls we want to handle,
2278  * to send them to the MDS/OST as appropriate and to properly
2279  * network encode the arg field.
2280 case EXT3_IOC_SETVERSION_OLD:
2281 case EXT3_IOC_SETVERSION:
2283 case LL_IOC_FLUSHCTX:
2284 RETURN(ll_flush_ctx(inode));
2285 case LL_IOC_GETFACL: {
2286 struct rmtacl_ioctl_data ioc;
2288 if (copy_from_user(&ioc, (void *)arg, sizeof(ioc)))
2291 RETURN(ll_ioctl_getfacl(inode, &ioc));
2293 case LL_IOC_SETFACL: {
2294 struct rmtacl_ioctl_data ioc;
2296 if (copy_from_user(&ioc, (void *)arg, sizeof(ioc)))
2299 RETURN(ll_ioctl_setfacl(inode, &ioc));
/* Default: let the data export's driver try to handle it. */
2302 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
/*
 * llseek(2) entry point.  SEEK_END (origin == 2) must first glimpse the
 * OSTs so i_size is current before it is added to the offset; the size
 * is then read under ll_inode_size_lock().  The result is validated
 * against [0, ll_file_maxbytes] before f_pos is updated.
 */
2307 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2309 struct inode *inode = file->f_dentry->d_inode;
2310 struct ll_inode_info *lli = ll_i2info(inode);
2311 struct lov_stripe_md *lsm = lli->lli_smd;
2314 retval = offset + ((origin == 2) ? inode->i_size :
2315                    (origin == 1) ? file->f_pos : 0);
2316 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%Lu=%#Lx(%s)\n",
2317        inode->i_ino, inode->i_generation, inode, retval, retval,
2318        origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
2319 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2321 if (origin == 2) { /* SEEK_END */
2322 int nonblock = 0, rc;
2324 if (file->f_flags & O_NONBLOCK)
2325 nonblock = LDLM_FL_BLOCK_NOWAIT;
/* Refresh i_size from the OSTs before seeking relative to EOF. */
2328 rc = ll_glimpse_size(inode, nonblock);
2333 ll_inode_size_lock(inode, 0);
2334 offset += inode->i_size;
2335 ll_inode_size_unlock(inode, 0);
2336 } else if (origin == 1) { /* SEEK_CUR */
2337 offset += file->f_pos;
2341 if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
2342 if (offset != file->f_pos) {
2343 file->f_pos = offset;
2344 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
2346 file->f_version = ++event;
/*
 * fsync(2) entry point.  Waits for in-flight dirty pages
 * (filemap_fdatawait), surfaces any asynchronous writeback errors
 * recorded in lli_async_rc and the per-stripe async rc, syncs the MDS
 * inode via md_sync(), and finally syncs the full data range
 * [0, OBD_OBJECT_EOF] on the OSTs via obd_sync() with a write capa.
 */
2355 int ll_fsync(struct file *file, struct dentry *dentry, int data)
2357 struct inode *inode = dentry->d_inode;
2358 struct ll_inode_info *lli = ll_i2info(inode);
2359 struct lov_stripe_md *lsm = lli->lli_smd;
2360 struct ptlrpc_request *req;
2361 struct obd_capa *oc;
2364 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2365        inode->i_generation, inode);
2366 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2368 /* fsync's caller has already called _fdata{sync,write}, we want
2369  * that IO to finish before calling the osc and mdc sync methods */
2370 rc = filemap_fdatawait(inode->i_mapping);
2372 /* catch async errors that were recorded back when async writeback
2373  * failed for pages in this mapping. */
2374 err = lli->lli_async_rc;
2375 lli->lli_async_rc = 0;
2379 err = lov_test_and_clear_async_rc(lsm);
2384 oc = ll_mdscapa_get(inode);
2385 err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2391 ptlrpc_req_finished(req);
2398 RETURN(rc ? rc : -ENOMEM);
/* Identify the data objects to sync on the OSTs. */
2400 oa->o_id = lsm->lsm_object_id;
2401 oa->o_gr = lsm->lsm_object_gr;
2402 oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2403 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
2404                 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
2407 oc = ll_osscapa_get(inode, 0, CAPA_OPC_OSS_WRITE);
2408 err = obd_sync(ll_i2sbi(inode)->ll_dt_exp, oa, lsm,
2409                0, OBD_OBJECT_EOF, oc);
/*
 * fcntl/flock lock handler: translate a kernel file_lock into an
 * LDLM_FLOCK enqueue against the MDS, using the inode FID as resource.
 * flock() requests (FL_FLOCK) get the missing whole-file/pid fields
 * filled in; F_GETLK-style commands are sent with LDLM_FL_TEST_LOCK.
 * On success the lock is mirrored into the local lock lists so the VFS
 * bookkeeping stays consistent.
 */
2419 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2421 struct inode *inode = file->f_dentry->d_inode;
2422 struct ll_sb_info *sbi = ll_i2sbi(inode);
2423 struct ldlm_res_id res_id =
2424     { .name = { fid_seq(ll_inode2fid(inode)),
2425                 fid_oid(ll_inode2fid(inode)),
2426                 fid_ver(ll_inode2fid(inode)),
2428 struct lustre_handle lockh = {0};
2429 ldlm_policy_data_t flock;
2430 ldlm_mode_t mode = 0;
2435 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2436        inode->i_ino, file_lock);
2438 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2440 if (file_lock->fl_flags & FL_FLOCK) {
2441 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2442 /* set missing params for flock() calls */
2443 file_lock->fl_end = OFFSET_MAX;
2444 file_lock->fl_pid = current->tgid;
2446 flock.l_flock.pid = file_lock->fl_pid;
2447 flock.l_flock.start = file_lock->fl_start;
2448 flock.l_flock.end = file_lock->fl_end;
/* Map the POSIX lock type onto an LDLM lock mode. */
2450 switch (file_lock->fl_type) {
2455 /* An unlock request may or may not have any relation to
2456  * existing locks so we may not be able to pass a lock handle
2457  * via a normal ldlm_lock_cancel() request. The request may even
2458  * unlock a byte range in the middle of an existing lock. In
2459  * order to process an unlock request we need all of the same
2460  * information that is given with a normal read or write record
2461  * lock request. To avoid creating another ldlm unlock (cancel)
2462  * message we'll treat a LCK_NL flock request as an unlock. */
2469 CERROR("unknown fcntl lock type: %d\n", file_lock->fl_type);
/* Non-blocking set: fail rather than wait for a conflict. */
2484 flags = LDLM_FL_BLOCK_NOWAIT;
/* Test-only (F_GETLK family): probe without granting. */
2490 flags = LDLM_FL_TEST_LOCK;
2491 /* Save the old mode so that if the mode in the lock changes we
2492  * can decrement the appropriate reader or writer refcount. */
2493 file_lock->fl_type = mode;
2496 CERROR("unknown fcntl lock command: %d\n", cmd);
2500 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2501        "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2502        flags, mode, flock.l_flock.start, flock.l_flock.end);
2504 rc = ldlm_cli_enqueue(sbi->ll_md_exp, NULL, &res_id,
2505                       LDLM_FLOCK, &flock, mode, &flags, NULL,
2506                       ldlm_flock_completion_ast, NULL, file_lock,
2507                       NULL, 0, NULL, &lockh, 0);
2508 if ((file_lock->fl_flags & FL_FLOCK) && (rc == 0))
2509 ll_flock_lock_file_wait(file, file_lock, (cmd == F_SETLKW));
2510 #ifdef HAVE_F_OP_FLOCK
2511 if ((file_lock->fl_flags & FL_POSIX) && (rc == 0) &&
2512     !(flags & LDLM_FL_TEST_LOCK))
2513 posix_lock_file_wait(file, file_lock);
/*
 * ll_file_noflock() - locking entry point for the "-o noflock" mount option.
 * Body not visible in this chunk; per the fops-table comment below it is
 * expected to return -ENOSYS for all lock requests.
 */
2519 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
/*
 * ll_have_md_lock() - check whether this client already holds a cached
 * MDS inodebits DLM lock covering @bits for @inode.
 *
 * Uses LDLM_FL_TEST_LOCK so the match only tests for the lock and does
 * not take a new reference on it.  Returns non-zero when a matching
 * CR/CW/PR lock is found (return statements not visible in this chunk).
 */
2526 int ll_have_md_lock(struct inode *inode, __u64 bits)
2528 struct lustre_handle lockh;
2529 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2537 fid = &ll_i2info(inode)->lli_fid;
2538 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
/* Only consider already-granted locks; CBPENDING allows matching locks
 * that are being called back, TEST_LOCK avoids pinning the match. */
2540 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2541 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2542 LCK_CR|LCK_CW|LCK_PR, &lockh)) {
/*
 * ll_inode_revalidate_fini() - common epilogue for inode revalidation:
 * translate the MDS getattr return code.
 *
 * -ENOENT means the file was unlinked while we still hold the inode;
 * that is treated as success (nlink updated) rather than an error.
 * Any other non-zero rc is logged.  Return statements are not visible
 * in this extraction.
 */
2549 static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
2550 if (rc == -ENOENT) { /* Already unlinked. Just update nlink
2551 * and return success */
2553 /* This path cannot be hit for regular files unless in
2554 * case of obscure races, so no need to to validate
/* Only non-regular, non-directory inodes are expected here; the
 * consequent of this test is missing from the extraction. */
2556 if (!S_ISREG(inode->i_mode) &&
2557 !S_ISDIR(inode->i_mode))
2562 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
/*
 * ll_inode_revalidate_it() - refresh a dentry's inode attributes from the
 * MDS before a getattr/revalidate.
 *
 * Two paths, chosen by server capability:
 *  - OBD_CONNECT_ATTRFID: getattr-by-FID via an IT_GETATTR intent lock,
 *    which also revalidates the dentry (and unhashes it if unlinked);
 *  - otherwise: a plain md_getattr, but only when no cached UPDATE
 *    inodebits lock already guarantees our attributes are current.
 * Finally the OST file size is refreshed via a glimpse.
 *
 * NOTE(review): interior lines are missing from this extraction (error
 * gotos, some arguments); comments describe only what is visible.
 */
2570 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
2572 struct inode *inode = dentry->d_inode;
2573 struct ptlrpc_request *req = NULL;
2574 struct ll_sb_info *sbi;
2575 struct obd_export *exp;
/* A NULL inode here is unexpected; leftover debugging aid. */
2580 CERROR("REPORT THIS LINE TO PETER\n");
2583 sbi = ll_i2sbi(inode);
2585 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2586 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2587 #if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,5,0))
2588 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_REVALIDATE, 1);
2591 exp = ll_i2mdexp(inode);
/* Path 1: server supports getattr by FID via an intent lock. */
2593 if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
2594 struct lookup_intent oit = { .it_op = IT_GETATTR };
2595 struct md_op_data *op_data;
2597 /* Call getattr by fid, so do not provide name at all. */
2598 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2599 dentry->d_inode, NULL, 0, 0,
2600 LUSTRE_OPC_ANY, NULL);
2601 if (IS_ERR(op_data))
2602 RETURN(PTR_ERR(op_data));
/* O_CHECK_STALE makes the MDS verify the FID still names this
 * object rather than a recycled one. */
2604 oit.it_flags |= O_CHECK_STALE;
2605 rc = md_intent_lock(exp, op_data, NULL, 0,
2606 /* we are not interested in name
2609 ll_md_blocking_ast, 0);
2610 ll_finish_md_op_data(op_data);
2611 oit.it_flags &= ~O_CHECK_STALE;
2613 rc = ll_inode_revalidate_fini(inode, rc);
2617 rc = ll_revalidate_it_finish(req, DLM_REPLY_REC_OFF, &oit, dentry);
2619 ll_intent_release(&oit);
2623 /* Unlinked? Unhash dentry, so it is not picked up later by
2624 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2625 here to preserve get_cwd functionality on 2.6.
2627 if (!dentry->d_inode->i_nlink) {
2628 spin_lock(&dcache_lock);
2629 ll_drop_dentry(dentry);
2630 spin_unlock(&dcache_lock);
2633 ll_lookup_finish_locks(&oit, dentry);
/* Path 2: no ATTRFID support — do a plain getattr RPC, but only if
 * we do not already hold an UPDATE lock keeping attrs valid. */
2634 } else if (!ll_have_md_lock(dentry->d_inode,
2635 MDS_INODELOCK_UPDATE)) {
2636 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2637 obd_valid valid = OBD_MD_FLGETATTR;
2638 struct obd_capa *oc;
/* Regular files also need striping EA; size the reply buffer for
 * the largest EA the MDS may return. */
2641 if (S_ISREG(inode->i_mode)) {
2642 rc = ll_get_max_mdsize(sbi, &ealen);
2645 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2647 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2648 * capa for this inode. Because we only keep capas of dirs
2650 oc = ll_mdscapa_get(inode);
2651 rc = md_getattr(sbi->ll_md_exp, ll_inode2fid(inode), oc, valid,
2655 rc = ll_inode_revalidate_fini(inode, rc);
/* Refresh the in-core inode from the getattr reply. */
2659 rc = ll_prep_inode(&inode, req, REPLY_REC_OFF,
2665 /* if object not yet allocated, don't validate size */
2666 if (ll_i2info(inode)->lli_smd == NULL)
2669 /* ll_glimpse_size will prefer locally cached writes if they extend
/* Ask the OSTs for the current file size (glimpse lock). */
2671 rc = ll_glimpse_size(inode, 0);
2674 ptlrpc_req_finished(req);
2678 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
/*
 * ll_getattr_it() - ->getattr implementation (2.6 kernels): revalidate the
 * inode from the MDS with the given intent, then copy the now-current
 * in-core inode attributes into the kstat the VFS handed us.
 *
 * NOTE(review): the early-return on revalidation failure is among the
 * lines missing from this extraction.
 */
2679 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2680 struct lookup_intent *it, struct kstat *stat)
2682 struct inode *inode = de->d_inode;
/* Refresh attributes (and size, via glimpse) before reporting them. */
2685 res = ll_inode_revalidate_it(de, it);
2686 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETATTR, 1);
2691 stat->dev = inode->i_sb->s_dev;
2692 stat->ino = inode->i_ino;
2693 stat->mode = inode->i_mode;
2694 stat->nlink = inode->i_nlink;
2695 stat->uid = inode->i_uid;
2696 stat->gid = inode->i_gid;
2697 stat->rdev = kdev_t_to_nr(inode->i_rdev);
2698 stat->atime = inode->i_atime;
2699 stat->mtime = inode->i_mtime;
2700 stat->ctime = inode->i_ctime;
2701 #ifdef HAVE_INODE_BLKSIZE
2702 stat->blksize = inode->i_blksize;
2704 stat->blksize = 1 << inode->i_blkbits;
/* size/blocks are read under the inode size lock so we never report a
 * torn size while concurrent i/o is updating it. */
2707 ll_inode_size_lock(inode, 0);
2708 stat->size = inode->i_size;
2709 stat->blocks = inode->i_blocks;
2710 ll_inode_size_unlock(inode, 0);
/*
 * ll_getattr() - thin VFS ->getattr wrapper: delegates to ll_getattr_it()
 * with a default IT_GETATTR intent.
 */
2714 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2716 struct lookup_intent it = { .it_op = IT_GETATTR };
2718 return ll_getattr_it(mnt, de, &it, stat);
/*
 * lustre_check_acl() - POSIX ACL permission check callback (passed to
 * generic_permission by ll_inode_permission).
 *
 * Takes a reference on the cached ACL under lli_lock so the check can run
 * without holding the spinlock, then releases it.  When the kernel lacks
 * CONFIG_FS_POSIX_ACL the whole body compiles out (return value for that
 * case is not visible in this extraction).
 */
2723 int lustre_check_acl(struct inode *inode, int mask)
2725 #ifdef CONFIG_FS_POSIX_ACL
2726 struct ll_inode_info *lli = ll_i2info(inode);
2727 struct posix_acl *acl;
/* dup under the lock: the cached ACL may be replaced concurrently. */
2731 spin_lock(&lli->lli_lock);
2732 acl = posix_acl_dup(lli->lli_posix_acl);
2733 spin_unlock(&lli->lli_lock);
2738 rc = posix_acl_permission(inode, acl, mask);
2739 posix_acl_release(acl);
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10))
/*
 * ll_inode_permission() - ->permission for kernels >= 2.6.10, where
 * generic_permission() accepts an ACL-check callback.  Remote-client
 * mounts bypass local checks and ask the MDS instead.
 */
2748 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2750 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2751 inode->i_ino, inode->i_generation, inode, mask);
/* Remote clients cannot trust local uid/gid mapping; defer to server. */
2752 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2753 return lustre_check_remote_perm(inode, mask);
2755 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2756 return generic_permission(inode, mask, lustre_check_acl);
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
/*
 * ll_inode_permission() - ->permission fallback for kernels older than
 * 2.6.10: an open-coded version of generic_permission() (owner/group/
 * other mode bits, ACL hook, then capability overrides), since those
 * kernels' generic_permission() has no ACL-callback parameter.
 *
 * NOTE(review): several lines (returns, gotos, some else-branches) are
 * missing from this extraction; comments describe only what is visible.
 */
2760 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2762 int ll_inode_permission(struct inode *inode, int mask)
2765 int mode = inode->i_mode;
2768 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2769 inode->i_ino, inode->i_generation, inode, mask);
/* Remote clients defer all permission decisions to the MDS. */
2771 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2772 return lustre_check_remote_perm(inode, mask);
2774 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
/* No writes to regular files/dirs/symlinks on a read-only fs ... */
2776 if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
2777 (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
/* ... nor to immutable inodes. */
2779 if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
/* Owner check: use the "user" mode-bit triplet. */
2781 if (current->fsuid == inode->i_uid) {
/* Hedge: this shifted comparison presumably belongs to the group
 * triplet ((mode >> 3) & S_IRWXO) — TODO confirm, surrounding
 * branch lines are missing from the extraction. */
2784 if (((mode >> 3) & mask & S_IRWXO) != mask)
/* Consult POSIX ACLs before falling back to group/other bits. */
2786 rc = lustre_check_acl(inode, mask);
2790 goto check_capabilities;
2794 if (in_group_p(inode->i_gid))
/* Other: the low mode-bit triplet must cover the full mask. */
2797 if ((mode & mask & S_IRWXO) == mask)
/* CAP_DAC_OVERRIDE bypasses mode bits, except exec of a file with no
 * exec bit at all (matches generic_permission semantics). */
2801 if (!(mask & MAY_EXEC) ||
2802 (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
2803 if (capable(CAP_DAC_OVERRIDE))
/* CAP_DAC_READ_SEARCH allows reads, and dir lookup short of write. */
2806 if (capable(CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
2807 (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
2814 /* -o localflock - only provides locally consistent flock locks */
/*
 * Default file operations: no .flock/.lock entries, so the kernel's own
 * locally-consistent locking applies (see -o localflock comment above).
 * Closing brace of the initializer is not visible in this extraction.
 */
2815 struct file_operations ll_file_operations = {
2816 .read = ll_file_read,
2817 .write = ll_file_write,
2818 .ioctl = ll_file_ioctl,
2819 .open = ll_file_open,
2820 .release = ll_file_release,
2821 .mmap = ll_file_mmap,
2822 .llseek = ll_file_seek,
2823 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
2824 .sendfile = ll_file_sendfile,
/*
 * File operations for -o flock mounts: identical to the default table but
 * wires .flock/.lock to ll_file_flock for cluster-wide advisory locking.
 * Closing brace of the initializer is not visible in this extraction.
 */
2829 struct file_operations ll_file_operations_flock = {
2830 .read = ll_file_read,
2831 .write = ll_file_write,
2832 .ioctl = ll_file_ioctl,
2833 .open = ll_file_open,
2834 .release = ll_file_release,
2835 .mmap = ll_file_mmap,
2836 .llseek = ll_file_seek,
2837 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
2838 .sendfile = ll_file_sendfile,
2841 #ifdef HAVE_F_OP_FLOCK
2842 .flock = ll_file_flock,
2844 .lock = ll_file_flock
2847 /* These are for -o noflock - to return ENOSYS on flock calls */
/*
 * File operations for -o noflock mounts: lock entry points route to
 * ll_file_noflock so flock/fcntl locking reports ENOSYS.
 * Closing brace of the initializer is not visible in this extraction.
 */
2848 struct file_operations ll_file_operations_noflock = {
2849 .read = ll_file_read,
2850 .write = ll_file_write,
2851 .ioctl = ll_file_ioctl,
2852 .open = ll_file_open,
2853 .release = ll_file_release,
2854 .mmap = ll_file_mmap,
2855 .llseek = ll_file_seek,
2856 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
2857 .sendfile = ll_file_sendfile,
2860 #ifdef HAVE_F_OP_FLOCK
2861 .flock = ll_file_noflock,
2863 .lock = ll_file_noflock
2866 struct inode_operations ll_file_inode_operations = {
2867 #ifdef LUSTRE_KERNEL_VERSION
2868 .setattr_raw = ll_setattr_raw,
2870 .setattr = ll_setattr,
2871 .truncate = ll_truncate,
2872 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
2873 .getattr = ll_getattr,
2875 .revalidate_it = ll_inode_revalidate_it,
2877 .permission = ll_inode_permission,
2878 .setxattr = ll_setxattr,
2879 .getxattr = ll_getxattr,
2880 .listxattr = ll_listxattr,
2881 .removexattr = ll_removexattr,