1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (c) 2002, 2003 Cluster File Systems, Inc.
5 * Author: Peter Braam <braam@clusterfs.com>
6 * Author: Phil Schwan <phil@clusterfs.com>
7 * Author: Andreas Dilger <adilger@clusterfs.com>
9 * This file is part of Lustre, http://www.lustre.org.
11 * Lustre is free software; you can redistribute it and/or
12 * modify it under the terms of version 2 of the GNU General Public
13 * License as published by the Free Software Foundation.
15 * Lustre is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with Lustre; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 #define DEBUG_SUBSYSTEM S_LLITE
26 #include <lustre_dlm.h>
27 #include <lustre_lite.h>
28 #include <lustre_mdc.h>
29 #include <linux/pagemap.h>
30 #include <linux/file.h>
31 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
32 #include <linux/lustre_compat25.h>
34 #include "llite_internal.h"
36 /* also used by llite/special.c:ll_special_open() */
/* Allocate per-open-file private data (struct ll_file_data) from the
 * ll_file_data_slab cache.  Returned pointer is stored in the file's
 * private data by the open paths and freed via ll_file_data_put().
 * NOTE(review): this numbered listing omits some lines (NULL check,
 * RETURN, braces); verify against the full source. */
37 struct ll_file_data *ll_file_data_get(void)
39 struct ll_file_data *fd;
41 OBD_SLAB_ALLOC_PTR(fd, ll_file_data_slab);
/* Release per-open-file private data back to the slab cache.
 * Counterpart of ll_file_data_get(). */
45 static void ll_file_data_put(struct ll_file_data *fd)
48 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/* Pack the inode's current attributes, fid, IO epoch, the open file
 * handle @fh, and the MDS capability into @op_data for an MDS request
 * (used when preparing a close). */
51 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
52 struct lustre_handle *fh)
54 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
55 op_data->op_attr.ia_mode = inode->i_mode;
56 op_data->op_attr.ia_atime = inode->i_atime;
57 op_data->op_attr.ia_mtime = inode->i_mtime;
58 op_data->op_attr.ia_ctime = inode->i_ctime;
59 op_data->op_attr.ia_size = inode->i_size;
60 op_data->op_attr_blocks = inode->i_blocks;
/* ia_attr_flags lives in the lustre-extended iattr; cast is intentional. */
61 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags = inode->i_flags;
62 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
63 memcpy(&op_data->op_handle, fh, sizeof(op_data->op_handle));
/* Takes a reference on the MDS capability; caller owns op_capa1. */
64 op_data->op_capa1 = ll_mdscapa_get(inode);
/* Fill @op_data for an MDS close of open handle @och.  Size/blocks are
 * only sent to the MDS when Size-on-MDS (SOM) is not in effect for this
 * file (no OBD_CONNECT_SOM on the export, or not a regular file).
 * NOTE(review): listing has gaps; ll_epoch_close() is passed &och and the
 * surrounding (unshown) lines appear to guard the final pack — confirm
 * against the full source. */
67 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
68 struct obd_client_handle *och)
72 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
73 ATTR_MTIME_SET | ATTR_CTIME_SET;
75 if (!(och->och_flags & FMODE_WRITE))
/* Non-SOM or non-regular file: client-known size/blocks are authoritative. */
78 if (!(ll_i2mdexp(inode)->exp_connect_flags & OBD_CONNECT_SOM) ||
79 !S_ISREG(inode->i_mode))
80 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
82 ll_epoch_close(inode, op_data, &och, 0);
85 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
/* Send an MDS close for open handle @och on @inode and clean up the
 * handle's replay data.  On a SOM-epoch close the MDS may instruct us to
 * fetch Size-on-MDS attributes from the OSTs and push them back via
 * setattr (ll_sizeonmds_update).  If the epoch could not be closed yet,
 * the inode is queued for a later DONE_WRITING.
 * NOTE(review): listing omits several lines (entry checks, rc handling,
 * labels); confirm exact control flow against the full source. */
89 static int ll_close_inode_openhandle(struct obd_export *md_exp,
91 struct obd_client_handle *och)
93 struct obd_export *exp = ll_i2mdexp(inode);
94 struct md_op_data *op_data;
95 struct ptlrpc_request *req = NULL;
96 struct obd_device *obd = class_exp2obd(exp);
103 * XXX: in case of LMV, is this correct to access
106 CERROR("Invalid MDC connection handle "LPX64"\n",
107 ll_i2mdexp(inode)->exp_handle.h_cookie);
112 * here we check if this is forced umount. If so this is called on
113 * canceling "open lock" and we do not call md_close() in this case, as
114 * it will not be successful, as import is already deactivated.
119 OBD_ALLOC_PTR(op_data);
121 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
123 ll_prepare_close(inode, op_data, och);
124 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
125 rc = md_close(md_exp, op_data, och, &req);
128 /* This close must have the epoch closed. */
129 LASSERT(exp->exp_connect_flags & OBD_CONNECT_SOM);
130 LASSERT(epoch_close);
131 /* MDS has instructed us to obtain Size-on-MDS attribute from
132 * OSTs and send setattr to back to MDS. */
133 rc = ll_sizeonmds_update(inode, &och->och_fh,
134 op_data->op_ioepoch);
136 CERROR("inode %lu mdc Size-on-MDS update failed: "
137 "rc = %d\n", inode->i_ino, rc);
141 CERROR("inode %lu mdc close failed: rc = %d\n",
144 ll_finish_md_op_data(op_data);
/* Destroy OST objects if the close reply says the file was unlinked. */
147 rc = ll_objects_destroy(req, inode);
149 CERROR("inode %lu ll_objects destroy: rc = %d\n",
153 ptlrpc_req_finished(req); /* This is close request */
/* Epoch still open on a SOM file opened for write: defer DONE_WRITING. */
157 if ((exp->exp_connect_flags & OBD_CONNECT_SOM) && !epoch_close &&
158 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
159 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
161 md_clear_open_replay_data(md_exp, och);
162 /* Free @och if it is not waiting for DONE_WRITING. */
163 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
/* Close the shared per-inode MDS open handle for the given open mode
 * (write/exec/read) once its use count allows it.  The handle pointer and
 * use counter are selected from the ll_inode_info by @flags; the actual
 * MDS close goes through ll_close_inode_openhandle().
 * NOTE(review): listing omits lines around the usecount/och swap under
 * lli_och_sem; confirm the exact race handling against the full source. */
170 int ll_md_real_close(struct inode *inode, int flags)
172 struct ll_inode_info *lli = ll_i2info(inode);
173 struct obd_client_handle **och_p;
174 struct obd_client_handle *och;
/* Select which of the three cached MDS open handles this close targets. */
179 if (flags & FMODE_WRITE) {
180 och_p = &lli->lli_mds_write_och;
181 och_usecount = &lli->lli_open_fd_write_count;
182 } else if (flags & FMODE_EXEC) {
183 och_p = &lli->lli_mds_exec_och;
184 och_usecount = &lli->lli_open_fd_exec_count;
186 LASSERT(flags & FMODE_READ);
187 och_p = &lli->lli_mds_read_och;
188 och_usecount = &lli->lli_open_fd_read_count;
191 down(&lli->lli_och_sem);
192 if (*och_usecount) { /* There are still users of this handle, so
194 up(&lli->lli_och_sem);
199 up(&lli->lli_och_sem);
201 if (och) { /* There might be a race and somebody have freed this och
203 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/* Per-file-descriptor close: drop any group lock, decrement the per-mode
 * open count under lli_och_sem, and only talk to the MDS (via
 * ll_md_real_close) when no cached OPEN DLM lock still covers the file.
 * Also frees the file's ll_file_data and closes its capability.
 * NOTE(review): listing has gaps (rc handling, else branches, lockmode
 * setup); confirm against the full source. */
210 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
213 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
214 struct ll_inode_info *lli = ll_i2info(inode);
218 /* clear group lock, if present */
219 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
220 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
221 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
222 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP,
226 /* Let's see if we have good enough OPEN lock on the file and if
227 we can skip talking to MDS */
228 if (file->f_dentry->d_inode) { /* Can this ever be false? */
230 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
231 struct lustre_handle lockh;
/* Shadows the function parameter 'inode' — same object via the dentry. */
232 struct inode *inode = file->f_dentry->d_inode;
233 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
235 down(&lli->lli_och_sem);
236 if (fd->fd_omode & FMODE_WRITE) {
238 LASSERT(lli->lli_open_fd_write_count);
239 lli->lli_open_fd_write_count--;
240 } else if (fd->fd_omode & FMODE_EXEC) {
242 LASSERT(lli->lli_open_fd_exec_count);
243 lli->lli_open_fd_exec_count--;
246 LASSERT(lli->lli_open_fd_read_count);
247 lli->lli_open_fd_read_count--;
249 up(&lli->lli_och_sem);
/* No matching cached OPEN ibits lock => must close on the MDS now. */
251 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
252 LDLM_IBITS, &policy, lockmode,
254 rc = ll_md_real_close(file->f_dentry->d_inode,
258 CERROR("Releasing a file %p with negative dentry %p. Name %s",
259 file, file->f_dentry, file->f_dentry->d_name.name);
262 LUSTRE_FPRIVATE(file) = NULL;
263 ll_file_data_put(fd);
264 ll_capa_close(inode);
269 int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm);
271 /* While this returns an error code, fput() the caller does not, so we need
272 * to make every effort to clean up all of our state here. Also, applications
273 * rarely check close errors and even if an error is returned they will not
274 * re-try the close call.
/* VFS f_op->release entry point: tallies stats, clears pending async
 * write errors recorded against the stripes, and delegates the MDS close
 * to ll_md_close().  The root dentry ("/") is special-cased to skip all
 * of this.
 * NOTE(review): two "don't do anything for /" checks appear; the listing
 * omits lines between them (likely the fd == NULL handling) — confirm. */
276 int ll_file_release(struct inode *inode, struct file *file)
278 struct ll_file_data *fd;
279 struct ll_sb_info *sbi = ll_i2sbi(inode);
280 struct ll_inode_info *lli = ll_i2info(inode);
281 struct lov_stripe_md *lsm = lli->lli_smd;
285 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
286 inode->i_generation, inode);
288 /* don't do anything for / */
289 if (inode->i_sb->s_root == file->f_dentry)
292 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
293 fd = LUSTRE_FPRIVATE(file);
296 /* don't do anything for / */
297 if (inode->i_sb->s_root == file->f_dentry) {
298 LUSTRE_FPRIVATE(file) = NULL;
299 ll_file_data_put(fd);
/* Propagate any stripe-level async write error, then reset it. */
304 lov_test_and_clear_async_rc(lsm);
305 lli->lli_async_rc = 0;
307 rc = ll_md_close(sbi->ll_md_exp, inode, file);
/* Perform an intent-based open against the MDS for @file (used by NFSD
 * and pre-2.6.15 patchless-kernel paths, and by ll_lov_setstripe when
 * @lmm/@lmmsize carry stripe parameters).  Requests an OPEN DLM lock
 * unless only setting stripe info.  On success, binds the returned lock
 * to the inode and refreshes the inode from the reply.
 * NOTE(review): listing omits error labels and some rc checks; confirm
 * cleanup ordering against the full source. */
311 static int ll_intent_file_open(struct file *file, void *lmm,
312 int lmmsize, struct lookup_intent *itp)
314 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
315 struct dentry *parent = file->f_dentry->d_parent;
316 const char *name = file->f_dentry->d_name.name;
317 const int len = file->f_dentry->d_name.len;
318 struct md_op_data *op_data;
319 struct ptlrpc_request *req;
325 /* Usually we come here only for NFSD, and we want open lock.
326 But we can also get here with pre 2.6.15 patchless kernels, and in
327 that case that lock is also ok */
328 /* We can also get here if there was cached open handle in revalidate_it
329 * but it disappeared while we were getting from there to ll_file_open.
330 * But this means this file was closed and immediatelly opened which
331 * makes a good candidate for using OPEN lock */
332 /* If lmmsize & lmm are not 0, we are just setting stripe info
333 * parameters. No need for the open lock */
334 if (!lmm && !lmmsize)
335 itp->it_flags |= MDS_OPEN_LOCK;
337 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
338 file->f_dentry->d_inode, name, len,
339 O_RDWR, LUSTRE_OPC_ANY, NULL);
341 RETURN(PTR_ERR(op_data));
343 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
344 0 /*unused */, &req, ll_md_blocking_ast, 0);
345 ll_finish_md_op_data(op_data);
347 /* reason for keep own exit path - don`t flood log
348 * with messages with -ESTALE errors.
/* Open succeeded on the MDS but we are bailing out: release the handle. */
350 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
351 it_open_error(DISP_OPEN_OPEN, itp))
353 ll_release_openhandle(file->f_dentry, itp);
357 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
358 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
359 CERROR("lock enqueue: err: %d\n", rc);
/* Attach the granted intent lock (if any) to the inode for matching. */
363 if (itp->d.lustre.it_lock_mode)
364 md_set_lock_data(sbi->ll_md_exp,
365 &itp->d.lustre.it_lock_handle,
366 file->f_dentry->d_inode);
368 rc = ll_prep_inode(&file->f_dentry->d_inode, req, DLM_REPLY_REC_OFF,
371 ptlrpc_req_finished(itp->d.lustre.it_data);
374 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
375 ll_intent_drop_lock(itp);
/* Populate an obd_client_handle from the open reply carried by @it:
 * copies the MDS file handle, fid, open flags and IO epoch, then
 * registers the handle for open replay.  Returns md_set_open_replay_data
 * result. */
380 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
381 struct lookup_intent *it, struct obd_client_handle *och)
383 struct ptlrpc_request *req = it->d.lustre.it_data;
384 struct mdt_body *body;
388 body = lustre_msg_buf(req->rq_repmsg, DLM_REPLY_REC_OFF, sizeof(*body));
389 LASSERT(body != NULL); /* reply already checked out */
390 LASSERT_REPSWABBED(req, DLM_REPLY_REC_OFF); /* and swabbed in md_enqueue */
392 memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
393 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
394 och->och_fid = lli->lli_fid;
395 och->och_flags = it->it_flags;
396 lli->lli_ioepoch = body->ioepoch;
398 return md_set_open_replay_data(md_exp, och, req);
/* Finish opening @file locally: optionally fill @och from the intent's
 * open reply (when a new MDS handle was created), record @fd as the
 * file's private data and initialize its readahead state.
 * NOTE(review): listing omits the och != NULL guard and rc checks around
 * ll_och_fill(); confirm against the full source. */
401 int ll_local_open(struct file *file, struct lookup_intent *it,
402 struct ll_file_data *fd, struct obd_client_handle *och)
404 struct inode *inode = file->f_dentry->d_inode;
405 struct ll_inode_info *lli = ll_i2info(inode);
408 LASSERT(!LUSTRE_FPRIVATE(file));
413 struct ptlrpc_request *req = it->d.lustre.it_data;
414 struct mdt_body *body;
417 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
421 body = lustre_msg_buf(req->rq_repmsg,
422 DLM_REPLY_REC_OFF, sizeof(*body));
/* MDS returned a valid size for a write open — SOM size is usable. */
424 if ((it->it_flags & FMODE_WRITE) &&
425 (body->valid & OBD_MD_FLSIZE))
427 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
428 lli->lli_ioepoch, PFID(&lli->lli_fid));
432 LUSTRE_FPRIVATE(file) = fd;
433 ll_readahead_init(inode, &fd->fd_ras);
434 fd->fd_omode = it->it_flags;
438 /* Open a file, and (for the very first open) create objects on the OSTs at
439 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
440 * creation or open until ll_lov_setstripe() ioctl is called. We grab
441 * lli_open_sem to ensure no other process will create objects, send the
442 * stripe MD to the MDS, or try to destroy the objects if that fails.
444 * If we already have the stripe MD locally then we don't request it in
445 * md_open(), by passing a lmm_size = 0.
447 * It is up to the application to ensure no other processes open this file
448 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
449 * used. We might be able to avoid races of that sort by getting lli_open_sem
450 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
451 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/* VFS f_op->open entry point.  Either reuses an intent prepared during
 * lookup (LUSTRE_KERNEL_VERSION path) or builds a fresh IT_OPEN intent
 * from f_flags, then reuses or creates the per-inode, per-mode MDS open
 * handle under lli_och_sem before finishing with ll_local_open().
 * NOTE(review): the numbered listing omits many lines (rc declarations,
 * braces, gotos, out labels); comments below describe only what is
 * visible — confirm control flow against the full source. */
453 int ll_file_open(struct inode *inode, struct file *file)
455 struct ll_inode_info *lli = ll_i2info(inode);
456 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
457 .it_flags = file->f_flags };
458 struct lov_stripe_md *lsm;
459 struct ptlrpc_request *req = NULL;
460 struct obd_client_handle **och_p;
462 struct ll_file_data *fd;
466 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
467 inode->i_generation, inode, file->f_flags);
469 /* don't do anything for / */
470 if (inode->i_sb->s_root == file->f_dentry)
473 #ifdef LUSTRE_KERNEL_VERSION
476 it = file->private_data; /* XXX: compat macro */
477 file->private_data = NULL; /* prevent ll_local_open assertion */
480 fd = ll_file_data_get();
484 /* don't do anything for / */
485 if (inode->i_sb->s_root == file->f_dentry) {
486 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup: build our own from f_flags. */
490 if (!it || !it->d.lustre.it_disposition) {
491 /* Convert f_flags into access mode. We cannot use file->f_mode,
492 * because everything but O_ACCMODE mask was stripped from
/* O_ACCMODE trick: +1 maps O_RDONLY/O_WRONLY/O_RDWR onto FMODE bits. */
494 if ((oit.it_flags + 1) & O_ACCMODE)
496 if (file->f_flags & O_TRUNC)
497 oit.it_flags |= FMODE_WRITE;
499 /* kernel only call f_op->open in dentry_open. filp_open calls
500 * dentry_open after call to open_namei that checks permissions.
501 * Only nfsd_open call dentry_open directly without checking
502 * permissions and because of that this code below is safe. */
503 if (oit.it_flags & FMODE_WRITE)
504 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
506 /* We do not want O_EXCL here, presumably we opened the file
507 * already? XXX - NFS implications? */
508 oit.it_flags &= ~O_EXCL;
513 /* Let's see if we have file open on MDS already. */
514 if (it->it_flags & FMODE_WRITE) {
515 och_p = &lli->lli_mds_write_och;
516 och_usecount = &lli->lli_open_fd_write_count;
517 } else if (it->it_flags & FMODE_EXEC) {
518 och_p = &lli->lli_mds_exec_och;
519 och_usecount = &lli->lli_open_fd_exec_count;
521 och_p = &lli->lli_mds_read_och;
522 och_usecount = &lli->lli_open_fd_read_count;
525 down(&lli->lli_och_sem);
526 if (*och_p) { /* Open handle is present */
527 if (it_disposition(it, DISP_OPEN_OPEN)) {
528 /* Well, there's extra open request that we do not need,
529 let's close it somehow. This will decref request. */
530 rc = it_open_error(DISP_OPEN_OPEN, it);
532 ll_file_data_put(fd);
533 GOTO(out_och_free, rc);
535 ll_release_openhandle(file->f_dentry, it);
536 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats,
/* Reuse the cached handle: och == NULL tells ll_local_open not to fill. */
541 rc = ll_local_open(file, it, fd, NULL);
543 up(&lli->lli_och_sem);
544 ll_file_data_put(fd);
548 LASSERT(*och_usecount == 0);
549 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
551 ll_file_data_put(fd);
552 GOTO(out_och_free, rc = -ENOMEM);
/* No open reply yet: do the MDS open ourselves (marking staleness ok). */
555 if (!it->d.lustre.it_disposition) {
556 it->it_flags |= O_CHECK_STALE;
557 rc = ll_intent_file_open(file, NULL, 0, it);
558 it->it_flags &= ~O_CHECK_STALE;
560 ll_file_data_put(fd);
561 GOTO(out_och_free, rc);
564 /* Got some error? Release the request */
565 if (it->d.lustre.it_status < 0) {
566 req = it->d.lustre.it_data;
567 ptlrpc_req_finished(req);
569 md_set_lock_data(ll_i2sbi(inode)->ll_md_exp,
570 &it->d.lustre.it_lock_handle,
571 file->f_dentry->d_inode);
573 req = it->d.lustre.it_data;
575 /* md_intent_lock() didn't get a request ref if there was an
576 * open error, so don't do cleanup on the request here
578 /* XXX (green): Should not we bail out on any error here, not
579 * just open error? */
580 rc = it_open_error(DISP_OPEN_OPEN, it);
582 ll_file_data_put(fd);
583 GOTO(out_och_free, rc);
586 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
587 rc = ll_local_open(file, it, fd, *och_p);
589 up(&lli->lli_och_sem);
590 ll_file_data_put(fd);
591 GOTO(out_och_free, rc);
594 up(&lli->lli_och_sem);
596 /* Must do this outside lli_och_sem lock to prevent deadlock where
597 different kind of OPEN lock for this same inode gets cancelled
598 by ldlm_cancel_lru */
599 if (!S_ISREG(inode->i_mode))
606 if (file->f_flags & O_LOV_DELAY_CREATE ||
607 !(file->f_mode & FMODE_WRITE)) {
608 CDEBUG(D_INODE, "object creation was delayed\n");
612 file->f_flags &= ~O_LOV_DELAY_CREATE;
615 ptlrpc_req_finished(req);
617 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
621 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
622 *och_p = NULL; /* OBD_FREE writes some magic there */
625 up(&lli->lli_och_sem);
631 /* Fills the obdo with the attributes for the inode defined by lsm */
/* Issue an async OST getattr (via a ptlrpc set) for the inode's stripe
 * objects and refresh the inode's size/blocks/times from the merged
 * reply.  Requires a stripe MD (lsm != NULL).
 * NOTE(review): listing omits oinfo.oi_oa assignment, set == NULL and
 * rc-check lines; confirm against the full source. */
632 int ll_inode_getattr(struct inode *inode, struct obdo *obdo)
634 struct ptlrpc_request_set *set;
635 struct ll_inode_info *lli = ll_i2info(inode);
636 struct lov_stripe_md *lsm = lli->lli_smd;
638 struct obd_info oinfo = { { { 0 } } };
642 LASSERT(lsm != NULL);
646 oinfo.oi_oa->o_id = lsm->lsm_object_id;
647 oinfo.oi_oa->o_gr = lsm->lsm_object_gr;
648 oinfo.oi_oa->o_mode = S_IFREG;
649 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
650 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
651 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
652 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
654 oinfo.oi_capa = ll_mdscapa_get(inode);
656 set = ptlrpc_prep_set();
658 CERROR("can't allocate ptlrpc set\n");
661 rc = obd_getattr_async(ll_i2dtexp(inode), &oinfo, set);
663 rc = ptlrpc_set_wait(set);
664 ptlrpc_set_destroy(set);
666 capa_put(oinfo.oi_capa);
/* Only trust the fields the OSTs actually returned as valid. */
670 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
671 OBD_MD_FLATIME | OBD_MD_FLMTIME |
672 OBD_MD_FLCTIME | OBD_MD_FLSIZE);
674 obdo_refresh_inode(inode, oinfo.oi_oa, oinfo.oi_oa->o_valid);
675 CDEBUG(D_INODE, "objid "LPX64" size %Lu, blocks %lu, blksize %lu\n",
676 lli->lli_smd->lsm_object_id, inode->i_size, inode->i_blocks,
/* Clear setuid (and, for group-executable files, setgid) bits on write
 * by an unprivileged process, mirroring the kernel's remove_suid()
 * behaviour. */
681 static inline void ll_remove_suid(struct inode *inode)
685 /* set S_ISGID if S_IXGRP is set, and always set S_ISUID */
686 mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
688 /* was any of the uid bits set? */
689 mode &= inode->i_mode;
690 if (mode && !capable(CAP_FSETID)) {
691 inode->i_mode &= ~mode;
692 // XXX careful here - we cannot change the size
/* Map an extent DLM lock back to the stripe index it covers within the
 * inode's LOV stripe MD, via obd_get_info("lock_to_stripe").  Sanity-
 * checks that the lock's resource matches the stripe's object id/group.
 * Returns the stripe index, or -ELDLM_NO_LOCK_DATA on mismatch.
 * NOTE(review): listing omits the struct-key opening lines and RETURN
 * statements; confirm against the full source. */
696 static int ll_lock_to_stripe_offset(struct inode *inode, struct ldlm_lock *lock)
698 struct ll_inode_info *lli = ll_i2info(inode);
699 struct lov_stripe_md *lsm = lli->lli_smd;
700 struct obd_export *exp = ll_i2dtexp(inode);
703 struct ldlm_lock *lock;
704 struct lov_stripe_md *lsm;
705 } key = { .name = "lock_to_stripe", .lock = lock, .lsm = lsm };
706 __u32 stripe, vallen = sizeof(stripe);
/* Single-stripe file: the only possible answer is stripe 0. */
710 if (lsm->lsm_stripe_count == 1)
711 GOTO(check, stripe = 0);
713 /* get our offset in the lov */
714 rc = obd_get_info(exp, sizeof(key), &key, &vallen, &stripe);
716 CERROR("obd_get_info: rc = %d\n", rc);
719 LASSERT(stripe < lsm->lsm_stripe_count);
/* Resource name[0]/name[2] carry object id/group — must match stripe. */
722 if (lsm->lsm_oinfo[stripe]->loi_id != lock->l_resource->lr_name.name[0]||
723 lsm->lsm_oinfo[stripe]->loi_gr != lock->l_resource->lr_name.name[2]){
724 LDLM_ERROR(lock, "resource doesn't match object "LPU64"/"LPU64,
725 lsm->lsm_oinfo[stripe]->loi_id,
726 lsm->lsm_oinfo[stripe]->loi_gr);
727 RETURN(-ELDLM_NO_LOCK_DATA);
733 /* Flush the page cache for an extent as its canceled. When we're on an LOV,
734 * we get a lock cancellation for each stripe, so we have to map the obd's
735 * region back onto the stripes in the file that it held.
737 * No one can dirty the extent until we've finished our work and they can
738 * enqueue another lock. The DLM protects us from ll_file_read/write here,
739 * but other kernel actors could have pages locked.
741 * Called with the DLM lock held. */
/* NOTE(review): the numbered listing omits many lines inside this
 * function (loop braces, continue/break, lvb/page locals); inline notes
 * below describe only what is visible. */
742 void ll_pgcache_remove_extent(struct inode *inode, struct lov_stripe_md *lsm,
743 struct ldlm_lock *lock, __u32 stripe)
745 ldlm_policy_data_t tmpex;
746 unsigned long start, end, count, skip, i, j;
748 int rc, rc2, discard = lock->l_flags & LDLM_FL_DISCARD_DATA;
749 struct lustre_handle lockh;
752 memcpy(&tmpex, &lock->l_policy_data, sizeof(tmpex));
753 CDEBUG(D_INODE|D_PAGE, "inode %lu(%p) ["LPU64"->"LPU64"] size: %llu\n",
754 inode->i_ino, inode, tmpex.l_extent.start, tmpex.l_extent.end,
757 /* our locks are page granular thanks to osc_enqueue, we invalidate the
759 if ((tmpex.l_extent.start & ~CFS_PAGE_MASK) != 0 ||
760 ((tmpex.l_extent.end + 1) & ~CFS_PAGE_MASK) != 0)
761 LDLM_ERROR(lock, "lock not aligned on PAGE_SIZE %lu",
763 LASSERT((tmpex.l_extent.start & ~CFS_PAGE_MASK) == 0);
764 LASSERT(((tmpex.l_extent.end + 1) & ~CFS_PAGE_MASK) == 0);
/* Translate stripe-object offsets into file page indices. */
768 start = tmpex.l_extent.start >> CFS_PAGE_SHIFT;
769 end = tmpex.l_extent.end >> CFS_PAGE_SHIFT;
770 if (lsm->lsm_stripe_count > 1) {
771 count = lsm->lsm_stripe_size >> CFS_PAGE_SHIFT;
772 skip = (lsm->lsm_stripe_count - 1) * count;
773 start += start/count * skip + stripe * count;
775 end += end/count * skip + stripe * count;
777 if (end < tmpex.l_extent.end >> CFS_PAGE_SHIFT)
780 i = inode->i_size ? (__u64)(inode->i_size - 1) >> CFS_PAGE_SHIFT : 0;
784 CDEBUG(D_INODE|D_PAGE, "walking page indices start: %lu j: %lu "
785 "count: %lu skip: %lu end: %lu%s\n", start, start % count,
786 count, skip, end, discard ? " (DISCARDING)" : "");
788 /* walk through the vmas on the inode and tear down mmaped pages that
789 * intersect with the lock. this stops immediately if there are no
790 * mmap()ed regions of the file. This is not efficient at all and
791 * should be short lived. We'll associate mmap()ed pages with the lock
792 * and will be able to find them directly */
793 for (i = start; i <= end; i += (j + skip)) {
794 j = min(count - (i % count), end - i + 1);
796 LASSERT(inode->i_mapping);
797 if (ll_teardown_mmaps(inode->i_mapping,
798 (__u64)i << CFS_PAGE_SHIFT,
799 ((__u64)(i+j) << CFS_PAGE_SHIFT) - 1) )
803 /* this is the simplistic implementation of page eviction at
804 * cancelation. It is careful to get races with other page
805 * lockers handled correctly. fixes from bug 20 will make it
806 * more efficient by associating locks with pages and with
807 * batching writeback under the lock explicitly. */
808 for (i = start, j = start % count; i <= end;
809 j++, i++, tmpex.l_extent.start += CFS_PAGE_SIZE) {
811 CDEBUG(D_PAGE, "skip index %lu to %lu\n", i, i + skip);
817 LASSERTF(tmpex.l_extent.start< lock->l_policy_data.l_extent.end,
818 LPU64" >= "LPU64" start %lu i %lu end %lu\n",
819 tmpex.l_extent.start, lock->l_policy_data.l_extent.end,
822 if (!mapping_has_pages(inode->i_mapping)) {
823 CDEBUG(D_INODE|D_PAGE, "nothing left\n");
829 page = find_get_page(inode->i_mapping, i);
832 LL_CDEBUG_PAGE(D_PAGE, page, "lock page idx %lu ext "LPU64"\n",
833 i, tmpex.l_extent.start);
836 /* page->mapping to check with racing against teardown */
837 if (!discard && clear_page_dirty_for_io(page)) {
838 rc = ll_call_writepage(inode, page);
840 CERROR("writepage inode %lu(%p) of page %p "
841 "failed: %d\n", inode->i_ino, inode,
843 /* either waiting for io to complete or reacquiring
844 * the lock that the failed writepage released */
848 tmpex.l_extent.end = tmpex.l_extent.start + CFS_PAGE_SIZE - 1;
849 /* check to see if another DLM lock covers this page b=2765 */
850 rc2 = ldlm_lock_match(lock->l_resource->lr_namespace,
851 LDLM_FL_BLOCK_GRANTED|LDLM_FL_CBPENDING |
853 &lock->l_resource->lr_name, LDLM_EXTENT,
854 &tmpex, LCK_PR | LCK_PW, &lockh);
/* No other lock protects the page: drop it from the cache. */
856 if (rc2 <= 0 && page->mapping != NULL) {
857 struct ll_async_page *llap = llap_cast_private(page);
858 /* checking again to account for writeback's
860 LL_CDEBUG_PAGE(D_PAGE, page, "truncating\n");
862 ll_ra_accounting(llap, inode->i_mapping);
863 ll_truncate_complete_page(page);
866 page_cache_release(page);
868 LASSERTF(tmpex.l_extent.start <=
869 (lock->l_policy_data.l_extent.end == ~0ULL ? ~0ULL :
870 lock->l_policy_data.l_extent.end + 1),
871 "loop too long "LPU64" > "LPU64" start %lu i %lu end %lu\n",
872 tmpex.l_extent.start, lock->l_policy_data.l_extent.end,
/* DLM blocking/cancel AST for client extent locks.  On LDLM_CB_BLOCKING
 * it cancels the lock; on LDLM_CB_CANCELING it flushes the covered page
 * cache for the owning stripe and shrinks the known-minimum-size (kms).
 * NOTE(review): listing omits switch braces, default case and iput()
 * cleanup; confirm against the full source. */
877 static int ll_extent_lock_callback(struct ldlm_lock *lock,
878 struct ldlm_lock_desc *new, void *data,
881 struct lustre_handle lockh = { 0 };
/* Guard against a small-integer value smuggled in as a pointer. */
885 if ((unsigned long)data > 0 && (unsigned long)data < 0x1000) {
886 LDLM_ERROR(lock, "cancelling lock with bad data %p", data);
891 case LDLM_CB_BLOCKING:
892 ldlm_lock2handle(lock, &lockh);
893 rc = ldlm_cli_cancel(&lockh);
895 CERROR("ldlm_cli_cancel failed: %d\n", rc);
897 case LDLM_CB_CANCELING: {
899 struct ll_inode_info *lli;
900 struct lov_stripe_md *lsm;
904 /* This lock wasn't granted, don't try to evict pages */
905 if (lock->l_req_mode != lock->l_granted_mode)
908 inode = ll_inode_from_lock(lock);
911 lli = ll_i2info(inode);
914 if (lli->lli_smd == NULL)
918 stripe = ll_lock_to_stripe_offset(inode, lock);
922 ll_pgcache_remove_extent(inode, lsm, lock, stripe);
/* Recompute kms under both the LOV stripe lock and the resource lock. */
924 lov_stripe_lock(lsm);
925 lock_res_and_lock(lock);
926 kms = ldlm_extent_shift_kms(lock,
927 lsm->lsm_oinfo[stripe]->loi_kms);
929 if (lsm->lsm_oinfo[stripe]->loi_kms != kms)
930 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
931 lsm->lsm_oinfo[stripe]->loi_kms, kms);
932 lsm->lsm_oinfo[stripe]->loi_kms = kms;
933 unlock_res_and_lock(lock);
934 lov_stripe_unlock(lsm);
/* Completion AST for client-side async extent lock enqueues: when the
 * lock is granted (blocked states are unexpected here — LBUG), update
 * the stripe's rss/kms from the lock value block (LVB), wake waiters and
 * drop the PR reference taken at enqueue time.
 * NOTE(review): this function accesses lsm_oinfo[stripe] with '.' while
 * the rest of the file uses '->' — likely from a different tree version;
 * the listing also omits several lines.  Confirm against full source. */
947 int ll_async_completion_ast(struct ldlm_lock *lock, int flags, void *data)
949 /* XXX ALLOCATE - 160 bytes */
950 struct inode *inode = ll_inode_from_lock(lock);
951 struct ll_inode_info *lli = ll_i2info(inode);
952 struct lustre_handle lockh = { 0 };
957 if (flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED |
958 LDLM_FL_BLOCK_CONV)) {
959 LBUG(); /* not expecting any blocked async locks yet */
960 LDLM_DEBUG(lock, "client-side async enqueue returned a blocked "
962 ldlm_lock_dump(D_OTHER, lock, 0);
963 ldlm_reprocess_all(lock->l_resource);
967 LDLM_DEBUG(lock, "client-side async enqueue: granted/glimpsed");
969 stripe = ll_lock_to_stripe_offset(inode, lock);
973 if (lock->l_lvb_len) {
974 struct lov_stripe_md *lsm = lli->lli_smd;
976 lvb = lock->l_lvb_data;
977 lsm->lsm_oinfo[stripe].loi_rss = lvb->lvb_size;
979 lock_res_and_lock(lock);
980 ll_inode_size_lock(inode, 1);
/* kms may only grow to the LVB-reported size, page-aligned by shift_kms. */
981 kms = MAX(lsm->lsm_oinfo[stripe].loi_kms, lvb->lvb_size);
982 kms = ldlm_extent_shift_kms(NULL, kms);
983 if (lsm->lsm_oinfo[stripe].loi_kms != kms)
984 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
985 lsm->lsm_oinfo[stripe].loi_kms, kms);
986 lsm->lsm_oinfo[stripe].loi_kms = kms;
987 ll_inode_size_unlock(inode, 1);
988 unlock_res_and_lock(lock);
993 wake_up(&lock->l_waitq);
995 ldlm_lock2handle(lock, &lockh);
996 ldlm_lock_decref(&lockh, LCK_PR);
/* Glimpse AST: another client wants our view of this stripe's size.
 * Packs an ost_lvb (kms-derived size plus inode times) into the reply.
 * -ELDLM_NO_LOCK_DATA answers are normal races and are replied quietly.
 * NOTE(review): listing omits the iput/out label code and several guard
 * lines; confirm cleanup ordering against the full source. */
1001 static int ll_glimpse_callback(struct ldlm_lock *lock, void *reqp)
1003 struct ptlrpc_request *req = reqp;
1004 struct inode *inode = ll_inode_from_lock(lock);
1005 struct ll_inode_info *lli;
1006 struct lov_stripe_md *lsm;
1007 struct ost_lvb *lvb;
1009 int size[2] = { sizeof(struct ptlrpc_body), sizeof(*lvb) };
1013 GOTO(out, rc = -ELDLM_NO_LOCK_DATA);
1014 lli = ll_i2info(inode);
1016 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
1019 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
1021 /* First, find out which stripe index this lock corresponds to. */
1022 stripe = ll_lock_to_stripe_offset(inode, lock);
1024 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
1026 rc = lustre_pack_reply(req, 2, size, NULL);
1028 CERROR("lustre_pack_reply: %d\n", rc);
/* Report the stripe's known-minimum-size as the authoritative size. */
1032 lvb = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof(*lvb));
1033 lvb->lvb_size = lli->lli_smd->lsm_oinfo[stripe]->loi_kms;
1034 lvb->lvb_mtime = LTIME_S(inode->i_mtime);
1035 lvb->lvb_atime = LTIME_S(inode->i_atime);
1036 lvb->lvb_ctime = LTIME_S(inode->i_ctime);
1038 LDLM_DEBUG(lock, "i_size: %llu -> stripe number %u -> kms "LPU64
1039 " atime "LPU64", mtime "LPU64", ctime "LPU64,
1040 inode->i_size, stripe, lvb->lvb_size, lvb->lvb_mtime,
1041 lvb->lvb_atime, lvb->lvb_ctime);
1046 /* These errors are normal races, so we don't want to fill the console
1047 * with messages by calling ptlrpc_error() */
1048 if (rc == -ELDLM_NO_LOCK_DATA)
1049 lustre_pack_reply(req, 1, NULL, NULL);
1051 req->rq_status = rc;
/* Merge the per-stripe lock value blocks into a single LVB via
 * obd_merge_lvb() and apply the result (size, blocks, times) to the
 * inode under the inode size lock. */
1055 static void ll_merge_lvb(struct inode *inode)
1057 struct ll_inode_info *lli = ll_i2info(inode);
1058 struct ll_sb_info *sbi = ll_i2sbi(inode);
1062 ll_inode_size_lock(inode, 1);
1063 inode_init_lvb(inode, &lvb);
1064 obd_merge_lvb(sbi->ll_dt_exp, lli->lli_smd, &lvb, 0);
1065 inode->i_size = lvb.lvb_size;
1066 inode->i_blocks = lvb.lvb_blocks;
1067 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1068 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1069 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1070 ll_inode_size_unlock(inode, 1);
/* Try to satisfy a size query from locally cached extent locks: match a
 * [0, EOF] PR|PW lock without taking a new one, merge the LVBs into the
 * inode, then release the matched reference via obd_cancel().
 * NOTE(review): listing omits the flags declaration and rc checks;
 * confirm against the full source. */
1074 int ll_local_size(struct inode *inode)
1076 ldlm_policy_data_t policy = { .l_extent = { 0, OBD_OBJECT_EOF } };
1077 struct ll_inode_info *lli = ll_i2info(inode);
1078 struct ll_sb_info *sbi = ll_i2sbi(inode);
1079 struct lustre_handle lockh = { 0 };
/* Zero stripes => nothing to match on the OSTs. */
1084 if (lli->lli_smd->lsm_stripe_count == 0)
1087 rc = obd_match(sbi->ll_dt_exp, lli->lli_smd, LDLM_EXTENT,
1088 &policy, LCK_PR | LCK_PW, &flags, inode, &lockh);
1094 ll_merge_lvb(inode);
1095 obd_cancel(sbi->ll_dt_exp, lli->lli_smd, LCK_PR | LCK_PW, &lockh);
/* Glimpse a file's size for an ioctl caller: enqueue an intent-only
 * (LDLM_FL_HAS_INTENT) PR extent lock over [0, EOF] against @lsm, merge
 * the returned LVBs and fill the caller's stat buffer (st).
 * NOTE(review): listing omits oinfo.oi_md setup, the rc check after
 * enqueue, and the out label; confirm against the full source. */
1099 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1102 struct lustre_handle lockh = { 0 };
1103 struct obd_enqueue_info einfo = { 0 };
1104 struct obd_info oinfo = { { { 0 } } };
1110 einfo.ei_type = LDLM_EXTENT;
1111 einfo.ei_mode = LCK_PR;
/* HAS_INTENT makes this a glimpse: no conflicting locks are revoked. */
1112 einfo.ei_flags = LDLM_FL_HAS_INTENT;
1113 einfo.ei_cb_bl = ll_extent_lock_callback;
1114 einfo.ei_cb_cp = ldlm_completion_ast;
1115 einfo.ei_cb_gl = ll_glimpse_callback;
1116 einfo.ei_cbdata = NULL;
1118 oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
1119 oinfo.oi_lockh = &lockh;
1122 rc = obd_enqueue_rqset(sbi->ll_dt_exp, &oinfo, &einfo);
1126 CERROR("obd_enqueue returned rc %d, "
1127 "returning -EIO\n", rc);
1128 RETURN(rc > 0 ? -EIO : rc);
1131 lov_stripe_lock(lsm);
1132 memset(&lvb, 0, sizeof(lvb));
1133 obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 0);
1134 st->st_size = lvb.lvb_size;
1135 st->st_blocks = lvb.lvb_blocks;
1136 st->st_mtime = lvb.lvb_mtime;
1137 st->st_atime = lvb.lvb_atime;
1138 st->st_ctime = lvb.lvb_ctime;
1139 lov_stripe_unlock(lsm);
1144 /* NB: obd_merge_lvb will prefer locally cached writes if they extend the
1145 * file (because it prefers KMS over RSS when larger) */
/* Refresh the inode's size via a glimpse enqueue over [0, EOF].
 * Skipped entirely when the MDS size lock (LLIF_MDS_SIZE_LOCK) already
 * guarantees the size, or when the inode has no stripe objects.
 * NOTE(review): listing omits RETURN statements and some braces; confirm
 * against the full source. */
1146 int ll_glimpse_size(struct inode *inode, int ast_flags)
1148 struct ll_inode_info *lli = ll_i2info(inode);
1149 struct ll_sb_info *sbi = ll_i2sbi(inode);
1150 struct lustre_handle lockh = { 0 };
1151 struct obd_enqueue_info einfo = { 0 };
1152 struct obd_info oinfo = { { { 0 } } };
1156 if (lli->lli_flags & LLIF_MDS_SIZE_LOCK)
1159 CDEBUG(D_DLMTRACE, "Glimpsing inode %lu\n", inode->i_ino);
1161 if (!lli->lli_smd) {
1162 CDEBUG(D_DLMTRACE, "No objects for inode %lu\n", inode->i_ino);
1166 /* NOTE: this looks like DLM lock request, but it may not be one. Due
1167 * to LDLM_FL_HAS_INTENT flag, this is glimpse request, that
1168 * won't revoke any conflicting DLM locks held. Instead,
1169 * ll_glimpse_callback() will be called on each client
1170 * holding a DLM lock against this file, and resulting size
1171 * will be returned for each stripe. DLM lock on [0, EOF] is
1172 * acquired only if there were no conflicting locks. */
1173 einfo.ei_type = LDLM_EXTENT;
1174 einfo.ei_mode = LCK_PR;
1175 einfo.ei_flags = ast_flags | LDLM_FL_HAS_INTENT;
1176 einfo.ei_cb_bl = ll_extent_lock_callback;
1177 einfo.ei_cb_cp = ldlm_completion_ast;
1178 einfo.ei_cb_gl = ll_glimpse_callback;
1179 einfo.ei_cbdata = inode;
1181 oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
1182 oinfo.oi_lockh = &lockh;
1183 oinfo.oi_md = lli->lli_smd;
1185 rc = obd_enqueue_rqset(sbi->ll_dt_exp, &oinfo, &einfo);
1189 CERROR("obd_enqueue returned rc %d, returning -EIO\n", rc);
1190 RETURN(rc > 0 ? -EIO : rc);
1193 ll_merge_lvb(inode);
1195 CDEBUG(D_DLMTRACE, "glimpse: size: %llu, blocks: %lu\n",
1196 inode->i_size, inode->i_blocks);
/*
 * Acquire an extent DLM lock covering *policy in the given mode on the
 * file's stripe objects, then refresh inode size/times from the merged
 * LVB under ll_inode_size_lock().  Lockless mounts (LL_FILE_IGNORE_LOCK
 * on the fd, or LL_SBI_NOLCK on the superblock) bypass the enqueue —
 * the bypass return itself is elided from this listing.
 * i_size is only trusted/updated here for a full-file [0, EOF] lock;
 * see the long vmtruncate race comment below.
 */
1201 int ll_extent_lock(struct ll_file_data *fd, struct inode *inode,
1202 struct lov_stripe_md *lsm, int mode,
1203 ldlm_policy_data_t *policy, struct lustre_handle *lockh,
1206 struct ll_sb_info *sbi = ll_i2sbi(inode);
1208 struct obd_enqueue_info einfo = { 0 };
1209 struct obd_info oinfo = { { { 0 } } };
/* Caller must pass a fresh (unused) lock handle and a real stripe MD. */
1213 LASSERT(!lustre_handle_is_used(lockh));
1214 LASSERT(lsm != NULL);
1216 /* don't drop the mmapped file to LRU */
1217 if (mapping_mapped(inode->i_mapping))
1218 ast_flags |= LDLM_FL_NO_LRU;
1220 /* XXX phil: can we do this? won't it screw the file size up? */
1221 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1222 (sbi->ll_flags & LL_SBI_NOLCK))
1225 CDEBUG(D_DLMTRACE, "Locking inode %lu, start "LPU64" end "LPU64"\n",
1226 inode->i_ino, policy->l_extent.start, policy->l_extent.end);
1228 einfo.ei_type = LDLM_EXTENT;
1229 einfo.ei_mode = mode;
1230 einfo.ei_flags = ast_flags;
1231 einfo.ei_cb_bl = ll_extent_lock_callback;
1232 einfo.ei_cb_cp = ldlm_completion_ast;
1233 einfo.ei_cb_gl = ll_glimpse_callback;
1234 einfo.ei_cbdata = inode;
1236 oinfo.oi_policy = *policy;
1237 oinfo.oi_lockh = lockh;
1240 rc = obd_enqueue(sbi->ll_dt_exp, &oinfo, &einfo);
/* The server may have granted a different (larger) extent than asked. */
1241 *policy = oinfo.oi_policy;
1245 ll_inode_size_lock(inode, 1);
1246 inode_init_lvb(inode, &lvb);
1247 obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 1);
1249 if (policy->l_extent.start == 0 &&
1250 policy->l_extent.end == OBD_OBJECT_EOF) {
1251 /* vmtruncate()->ll_truncate() first sets the i_size and then
1252 * the kms under both a DLM lock and the
1253 * ll_inode_size_lock(). If we don't get the
1254 * ll_inode_size_lock() here we can match the DLM lock and
1255 * reset i_size from the kms before the truncating path has
1256 * updated the kms. generic_file_write can then trust the
1257 * stale i_size when doing appending writes and effectively
1258 * cancel the result of the truncate. Getting the
1259 * ll_inode_size_lock() after the enqueue maintains the DLM
1260 * -> ll_inode_size_lock() acquiring order. */
1261 inode->i_size = lvb.lvb_size;
1262 CDEBUG(D_INODE, "inode=%lu, updating i_size %llu\n",
1263 inode->i_ino, inode->i_size);
/* Times are always safe to refresh from the merged LVB. */
1267 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1268 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1269 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1271 ll_inode_size_unlock(inode, 1);
/*
 * Release an extent lock taken by ll_extent_lock() via obd_cancel().
 * Mirrors the lock path's bypass for LL_FILE_IGNORE_LOCK fds and
 * LL_SBI_NOLCK mounts (the bypass return is elided from this listing).
 */
1276 int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode,
1277 struct lov_stripe_md *lsm, int mode,
1278 struct lustre_handle *lockh)
1280 struct ll_sb_info *sbi = ll_i2sbi(inode);
1284 /* XXX phil: can we do this? won't it screw the file size up? */
1285 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1286 (sbi->ll_flags & LL_SBI_NOLCK))
1289 rc = obd_cancel(sbi->ll_dt_exp, lsm, mode, lockh);
/*
 * read() entry point for llite file_operations.
 * Object-less files (mknod + truncate) are served by zero-filling the
 * user buffer up to i_size.  Otherwise the read is covered by a PR
 * extent lock (ll_tree_lock), possibly split into chunks bounded by
 * sbi->ll_max_rw_chunk and the current stripe end, and each chunk is
 * delegated to generic_file_read().  i_size is validated against the
 * known-minimum size (kms) under ll_inode_size_lock(), glimpsing when
 * the read may extend beyond kms.
 * NOTE(review): this listing is elided (embedded line numbers jump);
 * the chunk loop head, some declarations and returns are not visible.
 */
1294 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1297 struct inode *inode = file->f_dentry->d_inode;
1298 struct ll_inode_info *lli = ll_i2info(inode);
1299 struct lov_stripe_md *lsm = lli->lli_smd;
1300 struct ll_sb_info *sbi = ll_i2sbi(inode);
1301 struct ll_lock_tree tree;
1302 struct ll_lock_tree_node *node;
1304 struct ll_ra_read bead;
1307 ssize_t retval, chunk, sum = 0;
1311 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1312 inode->i_ino, inode->i_generation, inode, count, *ppos);
1313 /* "If nbyte is 0, read() will return 0 and have no other results."
1314 * -- Single Unix Spec */
1318 ll_stats_ops_tally(sbi, LPROC_LL_READ_BYTES, count);
1321 /* Read on file with no objects should return zero-filled
1322 * buffers up to file size (we can get non-zero sizes with
1323 * mknod + truncate, then opening file for read. This is a
1324 * common pattern in NFS case, it seems). Bug 6243 */
1326 /* Since there are no objects on OSTs, we have nothing to get
1327 * lock on and so we are forced to access inode->i_size
1330 /* Read beyond end of file */
1331 if (*ppos >= inode->i_size)
1334 if (count > inode->i_size - *ppos)
1335 count = inode->i_size - *ppos;
1336 /* Make sure to correctly adjust the file pos pointer for
/* clear_user() returns the number of bytes it could NOT zero. */
1338 notzeroed = clear_user(buf, count);
1347 if (sbi->ll_max_rw_chunk != 0) {
1348 /* first, let's know the end of the current stripe */
1350 obd_extent_calc(sbi->ll_dt_exp, lsm, OBD_CALC_STRIPE_END,
1353 /* correct, the end is beyond the request */
1354 if (end > *ppos + count - 1)
1355 end = *ppos + count - 1;
1357 /* and chunk shouldn't be too large even if striping is wide */
1358 if (end - *ppos > sbi->ll_max_rw_chunk)
1359 end = *ppos + sbi->ll_max_rw_chunk - 1;
/* else branch: no chunking, lock the whole requested range */
1361 end = *ppos + count - 1;
1364 node = ll_node_from_inode(inode, *ppos, end, LCK_PR);
1365 tree.lt_fd = LUSTRE_FPRIVATE(file);
1366 rc = ll_tree_lock(&tree, node, buf, count,
1367 file->f_flags & O_NONBLOCK ? LDLM_FL_BLOCK_NOWAIT :0);
1369 GOTO(out, retval = rc);
1371 ll_inode_size_lock(inode, 1);
1373 * Consistency guarantees: following possibilities exist for the
1374 * relation between region being read and real file size at this
1377 * (A): the region is completely inside of the file;
1379 * (B-x): x bytes of region are inside of the file, the rest is
1382 * (C): the region is completely outside of the file.
1384 * This classification is stable under DLM lock acquired by
1385 * ll_tree_lock() above, because to change class, other client has to
1386 * take DLM lock conflicting with our lock. Also, any updates to
1387 * ->i_size by other threads on this client are serialized by
1388 * ll_inode_size_lock(). This guarantees that short reads are handled
1389 * correctly in the face of concurrent writes and truncates.
1391 inode_init_lvb(inode, &lvb);
1392 obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 1);
1394 if (*ppos + count - 1 > kms) {
1395 /* A glimpse is necessary to determine whether we return a
1396 * short read (B) or some zeroes at the end of the buffer (C) */
1397 ll_inode_size_unlock(inode, 1);
1398 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1400 ll_tree_unlock(&tree);
1404 /* region is within kms and, hence, within real file size (A).
1405 * We need to increase i_size to cover the read region so that
1406 * generic_file_read() will do its job, but that doesn't mean
1407 * the kms size is _correct_, it is only the _minimum_ size.
1408 * If someone does a stat they will get the correct size which
1409 * will always be >= the kms value here. b=11081 */
1410 if (inode->i_size < kms)
1411 inode->i_size = kms;
1412 ll_inode_size_unlock(inode, 1);
1415 chunk = end - *ppos + 1;
1416 CDEBUG(D_INODE, "Read ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1417 inode->i_ino, chunk, *ppos, inode->i_size);
1419 /* turn off the kernel's read-ahead */
1420 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
1423 file->f_ra.ra_pages = 0;
1425 /* initialize read-ahead window once per syscall */
/* Lustre drives its own readahead; register the window for this call. */
1428 bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
1429 bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
1430 ll_ra_read_in(file, &bead);
1434 file_accessed(file);
1435 retval = generic_file_read(file, buf, chunk, ppos);
1436 ll_rw_stats_tally(sbi, current->pid, file, count, 0);
1438 ll_tree_unlock(&tree);
/* A full chunk with bytes remaining means loop for the next chunk
 * (loop construct itself is elided from this listing). */
1444 if (retval == chunk && count > 0)
1450 ll_ra_read_ex(file, &bead);
/* Report accumulated bytes if any chunk succeeded, else the error. */
1451 retval = (sum > 0) ? sum : retval;
1456 * Write to a file (through the page cache).
/*
 * write() entry point.  Serialized per-inode by lli_write_sem.  Takes a
 * PW extent lock over the write range ([0, EOF] for O_APPEND, otherwise
 * possibly chunked by sbi->ll_max_rw_chunk / stripe end), enforces the
 * per-file maxbytes limit (SIGXFSZ + -EFBIG past it), then delegates
 * each chunk to generic_file_write().
 * NOTE(review): elided listing — the chunk loop head, several returns
 * and some declarations are not visible here.
 */
1458 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1461 struct inode *inode = file->f_dentry->d_inode;
1462 struct ll_sb_info *sbi = ll_i2sbi(inode);
1463 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1464 struct ll_lock_tree tree;
1465 struct ll_lock_tree_node *node;
1466 loff_t maxbytes = ll_file_maxbytes(inode);
1467 loff_t lock_start, lock_end, end;
1468 ssize_t retval, chunk, sum = 0;
1472 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1473 inode->i_ino, inode->i_generation, inode, count, *ppos);
1475 SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */
1477 /* POSIX, but surprised the VFS doesn't check this already */
1481 /* If file was opened for LL_IOC_LOV_SETSTRIPE but the ioctl wasn't
1482 * called on the file, don't fail the below assertion (bug 2388). */
1483 if (file->f_flags & O_LOV_DELAY_CREATE &&
1484 ll_i2info(inode)->lli_smd == NULL)
1487 LASSERT(ll_i2info(inode)->lli_smd != NULL);
/* One writer at a time per inode on this client. */
1489 down(&ll_i2info(inode)->lli_write_sem);
1492 chunk = 0; /* just to fix gcc's warning */
1493 end = *ppos + count - 1;
1495 if (file->f_flags & O_APPEND) {
/* Append: must lock to EOF since the start offset is only known
 * after the lock is granted and i_size is refreshed. */
1497 lock_end = OBD_OBJECT_EOF;
1498 } else if (sbi->ll_max_rw_chunk != 0) {
1499 /* first, let's know the end of the current stripe */
1501 obd_extent_calc(sbi->ll_dt_exp, lsm, OBD_CALC_STRIPE_END,
1504 /* correct, the end is beyond the request */
1505 if (end > *ppos + count - 1)
1506 end = *ppos + count - 1;
1508 /* and chunk shouldn't be too large even if striping is wide */
1509 if (end - *ppos > sbi->ll_max_rw_chunk)
1510 end = *ppos + sbi->ll_max_rw_chunk - 1;
1515 lock_end = *ppos + count - 1;
1517 node = ll_node_from_inode(inode, lock_start, lock_end, LCK_PW);
1520 GOTO(out, retval = PTR_ERR(node));
1522 tree.lt_fd = LUSTRE_FPRIVATE(file);
1523 rc = ll_tree_lock(&tree, node, buf, count,
1524 file->f_flags & O_NONBLOCK ? LDLM_FL_BLOCK_NOWAIT :0);
1526 GOTO(out, retval = rc);
1528 /* This is ok, g_f_w will overwrite this under i_sem if it races
1529 * with a local truncate, it just makes our maxbyte checking easier.
1530 * The i_size value gets updated in ll_extent_lock() as a consequence
1531 * of the [0,EOF] extent lock we requested above. */
1532 if (file->f_flags & O_APPEND) {
1533 *ppos = inode->i_size;
1534 end = *ppos + count - 1;
1537 if (*ppos >= maxbytes) {
1538 send_sig(SIGXFSZ, current, 0);
1539 GOTO(out, retval = -EFBIG);
1541 if (*ppos + count > maxbytes)
1542 count = maxbytes - *ppos;
1544 /* generic_file_write handles O_APPEND after getting i_mutex */
1545 chunk = end - *ppos + 1;
1546 CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %Lu\n",
1547 inode->i_ino, chunk, *ppos);
1548 retval = generic_file_write(file, buf, chunk, ppos);
1549 ll_rw_stats_tally(ll_i2sbi(inode), current->pid, file, count, 1);
1552 ll_tree_unlock(&tree);
/* Full chunk written and more remains: loop for the next chunk
 * (loop construct itself is elided from this listing). */
1558 if (retval == chunk && count > 0)
1562 up(&ll_i2info(inode)->lli_write_sem);
1564 retval = (sum > 0) ? sum : retval;
1565 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1566 retval > 0 ? retval : 0);
1571 * Send file content (through pagecache) somewhere with helper
/*
 * sendfile() entry point (2.6-only, see the version guard).  Same
 * locking/kms strategy as ll_file_read(): take a PR extent lock over
 * the range, validate against kms (glimpsing when the request extends
 * past it), then delegate to generic_file_sendfile().  Object-less
 * files skip locking entirely.  Kernel readahead is disabled and
 * Lustre's own readahead window (bead) is used instead.
 * NOTE(review): elided listing — some declarations, returns and brace
 * lines are not visible here.
 */
1573 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
1574 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1575 read_actor_t actor, void *target)
1577 struct inode *inode = in_file->f_dentry->d_inode;
1578 struct ll_inode_info *lli = ll_i2info(inode);
1579 struct lov_stripe_md *lsm = lli->lli_smd;
1580 struct ll_lock_tree tree;
1581 struct ll_lock_tree_node *node;
1583 struct ll_ra_read bead;
1588 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1589 inode->i_ino, inode->i_generation, inode, count, *ppos);
1591 /* "If nbyte is 0, read() will return 0 and have no other results."
1592 * -- Single Unix Spec */
1596 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_READ_BYTES, count);
1597 /* turn off the kernel's read-ahead */
1598 in_file->f_ra.ra_pages = 0;
1600 /* File with no objects, nothing to lock */
1602 RETURN(generic_file_sendfile(in_file, ppos, count, actor, target));
1604 node = ll_node_from_inode(inode, *ppos, *ppos + count - 1, LCK_PR);
1605 tree.lt_fd = LUSTRE_FPRIVATE(in_file);
1606 rc = ll_tree_lock(&tree, node, NULL, count,
1607 in_file->f_flags & O_NONBLOCK?LDLM_FL_BLOCK_NOWAIT:0);
1611 ll_inode_size_lock(inode, 1);
1613 * Consistency guarantees: following possibilities exist for the
1614 * relation between region being read and real file size at this
1617 * (A): the region is completely inside of the file;
1619 * (B-x): x bytes of region are inside of the file, the rest is
1622 * (C): the region is completely outside of the file.
1624 * This classification is stable under DLM lock acquired by
1625 * ll_tree_lock() above, because to change class, other client has to
1626 * take DLM lock conflicting with our lock. Also, any updates to
1627 * ->i_size by other threads on this client are serialized by
1628 * ll_inode_size_lock(). This guarantees that short reads are handled
1629 * correctly in the face of concurrent writes and truncates.
1631 inode_init_lvb(inode, &lvb);
1632 obd_merge_lvb(ll_i2sbi(inode)->ll_dt_exp, lsm, &lvb, 1);
1634 if (*ppos + count - 1 > kms) {
1635 /* A glimpse is necessary to determine whether we return a
1636 * short read (B) or some zeroes at the end of the buffer (C) */
1637 ll_inode_size_unlock(inode, 1);
1638 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1642 /* region is within kms and, hence, within real file size (A) */
1643 inode->i_size = kms;
1644 ll_inode_size_unlock(inode, 1);
1647 CDEBUG(D_INFO, "Send ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1648 inode->i_ino, count, *ppos, inode->i_size);
/* Register Lustre's readahead window for this call. */
1650 bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
1651 bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
1652 ll_ra_read_in(in_file, &bead);
1654 file_accessed(in_file);
1655 retval = generic_file_sendfile(in_file, ppos, count, actor, target);
1656 ll_ra_read_ex(in_file, &bead);
1659 ll_tree_unlock(&tree);
/*
 * LL_IOC_RECREATE_OBJ ioctl handler: re-create a lost OST object for
 * this file.  Root-only (CAP_SYS_ADMIN).  Copies a ll_recreate_obj
 * request from userspace, clones the file's stripe MD, marks the obdo
 * with OBD_FL_RECREATE_OBJS and calls obd_create() under lli_size_sem.
 * NOTE(review): elided listing — obdo allocation and several returns
 * are not visible here.
 */
1664 static int ll_lov_recreate_obj(struct inode *inode, struct file *file,
1667 struct ll_inode_info *lli = ll_i2info(inode);
1668 struct obd_export *exp = ll_i2dtexp(inode);
1669 struct ll_recreate_obj ucreatp;
1670 struct obd_trans_info oti = { 0 };
1671 struct obdo *oa = NULL;
1674 struct lov_stripe_md *lsm, *lsm2;
1677 if (!capable (CAP_SYS_ADMIN))
1680 rc = copy_from_user(&ucreatp, (struct ll_recreate_obj *)arg,
1681 sizeof(struct ll_recreate_obj));
1689 down(&lli->lli_size_sem);
1692 GOTO(out, rc = -ENOENT);
1693 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1694 (lsm->lsm_stripe_count));
1696 OBD_ALLOC(lsm2, lsm_size);
1698 GOTO(out, rc = -ENOMEM);
1700 oa->o_id = ucreatp.lrc_id;
1701 oa->o_gr = ucreatp.lrc_group;
/* o_nlink is reused here to carry the target OST index. */
1702 oa->o_nlink = ucreatp.lrc_ost_idx;
1703 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1704 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1705 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1706 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1708 oti.oti_objid = NULL;
/* Work on a copy of the stripe MD so the live one is untouched. */
1709 memcpy(lsm2, lsm, lsm_size);
1710 rc = obd_create(exp, oa, &lsm2, &oti);
1712 OBD_FREE(lsm2, lsm_size);
1715 up(&lli->lli_size_sem);
/*
 * Apply a user-supplied LOV striping EA (lum) to this file via an
 * IT_OPEN intent open against the MDS.  Fails if a stripe already
 * exists (checked under lli_size_sem).  On success the open handle
 * from the intent is released immediately (ll_release_openhandle).
 * NOTE(review): elided listing — some returns and the error path
 * structure are not fully visible here.
 */
1720 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1721 int flags, struct lov_user_md *lum, int lum_size)
1723 struct ll_inode_info *lli = ll_i2info(inode);
1724 struct lov_stripe_md *lsm;
1725 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1729 down(&lli->lli_size_sem);
1732 up(&lli->lli_size_sem);
1733 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1738 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1741 if (it_disposition(&oit, DISP_LOOKUP_NEG))
1742 GOTO(out_req_free, rc = -ENOENT);
1743 rc = oit.d.lustre.it_status;
1745 GOTO(out_req_free, rc);
1747 ll_release_openhandle(file->f_dentry, &oit);
1750 up(&lli->lli_size_sem);
1751 ll_intent_release(&oit);
1754 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA (striping descriptor) for `filename` from the MDS
 * via md_getattr_name(), byte-swap it to host endian if needed, and
 * return it through *lmmp / *lmm_size.  LOV_MAGIC_JOIN EAs are
 * unpacked and converted into a lov_user_md_join with explicit
 * per-stripe extent boundaries before being returned.  The request
 * is handed back via *request so the caller controls its lifetime.
 * NOTE(review): elided listing — several returns, the lmmsize==0
 * handling and some brace lines are not visible here.
 */
1758 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1759 struct lov_mds_md **lmmp, int *lmm_size,
1760 struct ptlrpc_request **request)
1762 struct ll_sb_info *sbi = ll_i2sbi(inode);
1763 struct mdt_body *body;
1764 struct lov_mds_md *lmm = NULL;
1765 struct ptlrpc_request *req = NULL;
1766 struct obd_capa *oc;
1769 rc = ll_get_max_mdsize(sbi, &lmmsize);
1773 oc = ll_mdscapa_get(inode);
1774 rc = md_getattr_name(sbi->ll_md_exp, ll_inode2fid(inode),
1775 oc, filename, strlen(filename) + 1,
1776 OBD_MD_FLEASIZE | OBD_MD_FLDIREA, lmmsize, &req);
1779 CDEBUG(D_INFO, "md_getattr_name failed "
1780 "on %s: rc %d\n", filename, rc);
1784 body = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof(*body));
1785 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1786 /* swabbed by mdc_getattr_name */
1787 LASSERT_REPSWABBED(req, REPLY_REC_OFF);
1789 lmmsize = body->eadatasize;
1791 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1793 GOTO(out, rc = -ENODATA);
1796 lmm = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF + 1, lmmsize);
1797 LASSERT(lmm != NULL);
1798 LASSERT_REPSWABBED(req, REPLY_REC_OFF + 1);
1801 * This is coming from the MDS, so is probably in
1802 * little endian. We convert it to host endian before
1803 * passing it to userspace.
/* Swabbed magic means the EA arrived in the other byte order. */
1805 if (lmm->lmm_magic == __swab32(LOV_MAGIC)) {
1806 lustre_swab_lov_user_md((struct lov_user_md *)lmm);
1807 lustre_swab_lov_user_md_objects((struct lov_user_md *)lmm);
1808 } else if (lmm->lmm_magic == __swab32(LOV_MAGIC_JOIN)) {
1809 lustre_swab_lov_user_md_join((struct lov_user_md_join *)lmm);
/* JOIN files: rebuild a user-visible EA with explicit extents. */
1812 if (lmm->lmm_magic == LOV_MAGIC_JOIN) {
1813 struct lov_stripe_md *lsm;
1814 struct lov_user_md_join *lmj;
1815 int lmj_size, i, aindex = 0;
1817 rc = obd_unpackmd(sbi->ll_dt_exp, &lsm, lmm, lmmsize);
1819 GOTO(out, rc = -ENOMEM);
1820 rc = obd_checkmd(sbi->ll_dt_exp, sbi->ll_md_exp, lsm);
1822 GOTO(out_free_memmd, rc);
1824 lmj_size = sizeof(struct lov_user_md_join) +
1825 lsm->lsm_stripe_count *
1826 sizeof(struct lov_user_ost_data_join);
1827 OBD_ALLOC(lmj, lmj_size);
1829 GOTO(out_free_memmd, rc = -ENOMEM);
1831 memcpy(lmj, lmm, sizeof(struct lov_user_md_join));
1832 for (i = 0; i < lsm->lsm_stripe_count; i++) {
1833 struct lov_extent *lex =
1834 &lsm->lsm_array->lai_ext_array[aindex];
/* Advance to the extent that covers stripe i. */
1836 if (lex->le_loi_idx + lex->le_stripe_count <= i)
1838 CDEBUG(D_INFO, "aindex %d i %d l_extent_start "
1839 LPU64" len %d\n", aindex, i,
1840 lex->le_start, (int)lex->le_len);
1841 lmj->lmm_objects[i].l_extent_start =
/* le_len == -1 encodes an unbounded (to-EOF) extent. */
1844 if ((int)lex->le_len == -1)
1845 lmj->lmm_objects[i].l_extent_end = -1;
1847 lmj->lmm_objects[i].l_extent_end =
1848 lex->le_start + lex->le_len;
1849 lmj->lmm_objects[i].l_object_id =
1850 lsm->lsm_oinfo[i]->loi_id;
1851 lmj->lmm_objects[i].l_object_gr =
1852 lsm->lsm_oinfo[i]->loi_gr;
1853 lmj->lmm_objects[i].l_ost_gen =
1854 lsm->lsm_oinfo[i]->loi_ost_gen;
1855 lmj->lmm_objects[i].l_ost_idx =
1856 lsm->lsm_oinfo[i]->loi_ost_idx;
1858 lmm = (struct lov_mds_md *)lmj;
1861 obd_free_memmd(sbi->ll_dt_exp, &lsm);
1865 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: root-only path that copies a full
 * lov_user_md (with one ost_data entry) from userspace and applies it
 * with MDS_OPEN_HAS_OBJS via ll_lov_setstripe_ea_info().
 */
1870 static int ll_lov_setea(struct inode *inode, struct file *file,
1873 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1874 struct lov_user_md *lump;
1875 int lum_size = sizeof(struct lov_user_md) +
1876 sizeof(struct lov_user_ost_data);
1880 if (!capable (CAP_SYS_ADMIN))
1883 OBD_ALLOC(lump, lum_size);
1887 rc = copy_from_user(lump, (struct lov_user_md *)arg, lum_size);
1889 OBD_FREE(lump, lum_size);
1893 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1895 OBD_FREE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy a lov_user_md from userspace and
 * apply it; on success, echo the resulting striping back to the user's
 * buffer via the GETSTRIPE obd_iocontrol (with lmm_stripe_count first
 * zeroed in userspace as an output marker).
 */
1899 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1902 struct lov_user_md lum, *lump = (struct lov_user_md *)arg;
1904 int flags = FMODE_WRITE;
1907 /* Bug 1152: copy properly when this is no longer true */
1908 LASSERT(sizeof(lum) == sizeof(*lump));
1909 LASSERT(sizeof(lum.lmm_objects[0]) == sizeof(lump->lmm_objects[0]));
1910 rc = copy_from_user(&lum, lump, sizeof(lum));
1914 rc = ll_lov_setstripe_ea_info(inode, file, flags, &lum, sizeof(lum));
1916 put_user(0, &lump->lmm_stripe_count);
1917 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1918 0, ll_i2info(inode)->lli_smd, lump);
/*
 * LL_IOC_LOV_GETSTRIPE handler: thin wrapper forwarding the file's
 * stripe MD to the data export's GETSTRIPE obd_iocontrol.
 */
1923 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1925 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1930 return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0, lsm,
/*
 * LL_IOC_GROUP_LOCK handler: take a GROUP-mode extent lock over the
 * whole file [0, EOF] with gid = arg, remember the handle in the fd
 * (fd_cwlockh) and mark the fd GROUP_LOCKED + IGNORE_LOCK so regular
 * extent locking is bypassed while the group lock is held.
 */
1934 static int ll_get_grouplock(struct inode *inode, struct file *file,
1937 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1938 ldlm_policy_data_t policy = { .l_extent = { .start = 0,
1939 .end = OBD_OBJECT_EOF}};
1940 struct lustre_handle lockh = { 0 };
1941 struct ll_inode_info *lli = ll_i2info(inode);
1942 struct lov_stripe_md *lsm = lli->lli_smd;
/* Already group-locked: error path (return elided in this listing). */
1946 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1950 policy.l_extent.gid = arg;
1951 if (file->f_flags & O_NONBLOCK)
1952 flags = LDLM_FL_BLOCK_NOWAIT;
1954 rc = ll_extent_lock(fd, inode, lsm, LCK_GROUP, &policy, &lockh, flags);
1958 fd->fd_flags |= LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK;
1960 memcpy(&fd->fd_cwlockh, &lockh, sizeof(lockh));
/*
 * LL_IOC_GROUP_UNLOCK handler: validate that this fd holds a group
 * lock with the given gid, clear the GROUP_LOCKED/IGNORE_LOCK flags,
 * cancel the lock via ll_extent_unlock() and wipe the saved handle.
 */
1965 static int ll_put_grouplock(struct inode *inode, struct file *file,
1968 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1969 struct ll_inode_info *lli = ll_i2info(inode);
1970 struct lov_stripe_md *lsm = lli->lli_smd;
1974 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1975 /* Ugh, it's already unlocked. */
1979 if (fd->fd_gid != arg) /* Ugh? Unlocking with different gid? */
1982 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
1984 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP, &fd->fd_cwlockh);
1989 memset(&fd->fd_cwlockh, 0, sizeof(fd->fd_cwlockh));
/*
 * Validate a file-join request: server must advertise LL_SBI_JOIN,
 * both inodes must be regular files, head != tail, and the head's
 * size must be a multiple of JOIN_FILE_ALIGN (64K).  Error returns
 * themselves are elided from this listing.
 */
1994 static int join_sanity_check(struct inode *head, struct inode *tail)
1997 if ((ll_i2sbi(head)->ll_flags & LL_SBI_JOIN) == 0) {
1998 CERROR("server do not support join \n");
2001 if (!S_ISREG(tail->i_mode) || !S_ISREG(head->i_mode)) {
2002 CERROR("tail ino %lu and ino head %lu must be regular\n",
2003 head->i_ino, tail->i_ino);
2006 if (head->i_ino == tail->i_ino) {
2007 CERROR("file %lu can not be joined to itself \n", head->i_ino);
2010 if (head->i_size % JOIN_FILE_ALIGN) {
2011 CERROR("hsize %llu must be times of 64K\n", head->i_size);
/*
 * Perform the MDS side of a file join: enqueue an IT_OPEN intent with
 * O_JOIN_FILE (LCK_CW, IBITS) naming the tail via its parent + name,
 * carrying the head's size as intent data.  Any lock granted by the
 * intent is dropped immediately, and the resulting open handle is
 * closed via ll_release_openhandle().
 */
2017 static int join_file(struct inode *head_inode, struct file *head_filp,
2018 struct file *tail_filp)
2020 struct inode *tail_inode, *tail_parent;
2021 struct dentry *tail_dentry = tail_filp->f_dentry;
2022 struct lookup_intent oit = {.it_op = IT_OPEN,
2023 .it_flags = head_filp->f_flags|O_JOIN_FILE};
2024 struct lustre_handle lockh;
2025 struct md_op_data *op_data;
2029 tail_dentry = tail_filp->f_dentry;
2030 tail_inode = tail_dentry->d_inode;
2031 tail_parent = tail_dentry->d_parent->d_inode;
2033 op_data = ll_prep_md_op_data(NULL, head_inode, tail_parent,
2034 tail_dentry->d_name.name,
2035 tail_dentry->d_name.len, 0,
2036 LUSTRE_OPC_ANY, &head_inode->i_size);
2037 if (IS_ERR(op_data))
2038 RETURN(PTR_ERR(op_data));
2040 rc = md_enqueue(ll_i2mdexp(head_inode), LDLM_IBITS, &oit, LCK_CW,
2041 op_data, &lockh, NULL, 0, ldlm_completion_ast,
2042 ll_md_blocking_ast, NULL, 0);
2044 ll_finish_md_op_data(op_data);
2048 rc = oit.d.lustre.it_status;
2050 if (rc < 0 || it_open_error(DISP_OPEN_OPEN, &oit)) {
2051 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, &oit);
2052 ptlrpc_req_finished((struct ptlrpc_request *)
2053 oit.d.lustre.it_data);
2057 if (oit.d.lustre.it_lock_mode) { /* If we got lock - release it right
2059 ldlm_lock_decref(&lockh, oit.d.lustre.it_lock_mode);
2060 oit.d.lustre.it_lock_mode = 0;
2062 ll_release_openhandle(head_filp->f_dentry, &oit);
2064 ll_intent_release(&oit);
/*
 * LL_IOC_JOIN handler: join the file named by filename_tail onto
 * `head`.  Opens the tail, EX-locks both files whole-range in inode-
 * number order (deadlock avoidance), runs join_sanity_check(), then
 * join_file().  Teardown is phased: the switch below falls through
 * from the highest cleanup_phase reached, cancelling unused locks,
 * closing the tail, and on success discarding the head's now-stale
 * stripe MD so it is re-fetched.  Several case labels, iput and
 * returns are elided from this listing.
 */
2068 static int ll_file_join(struct inode *head, struct file *filp,
2069 char *filename_tail)
2071 struct inode *tail = NULL, *first = NULL, *second = NULL;
2072 struct dentry *tail_dentry;
2073 struct file *tail_filp, *first_filp, *second_filp;
2074 struct ll_lock_tree first_tree, second_tree;
2075 struct ll_lock_tree_node *first_node, *second_node;
2076 struct ll_inode_info *hlli = ll_i2info(head), *tlli;
2077 int rc = 0, cleanup_phase = 0;
2080 CDEBUG(D_VFSTRACE, "VFS Op:head=%lu/%u(%p) tail %s\n",
2081 head->i_ino, head->i_generation, head, filename_tail);
2083 tail_filp = filp_open(filename_tail, O_WRONLY, 0644);
2084 if (IS_ERR(tail_filp)) {
2085 CERROR("Can not open tail file %s", filename_tail);
2086 rc = PTR_ERR(tail_filp);
2089 tail = igrab(tail_filp->f_dentry->d_inode);
2091 tlli = ll_i2info(tail);
2092 tail_dentry = tail_filp->f_dentry;
2093 LASSERT(tail_dentry);
2096 /*reorder the inode for lock sequence*/
2097 first = head->i_ino > tail->i_ino ? head : tail;
2098 second = head->i_ino > tail->i_ino ? tail : head;
2099 first_filp = head->i_ino > tail->i_ino ? filp : tail_filp;
2100 second_filp = head->i_ino > tail->i_ino ? tail_filp : filp;
2102 CDEBUG(D_INFO, "reorder object from %lu:%lu to %lu:%lu \n",
2103 head->i_ino, tail->i_ino, first->i_ino, second->i_ino);
2104 first_node = ll_node_from_inode(first, 0, OBD_OBJECT_EOF, LCK_EX);
2105 if (IS_ERR(first_node)){
2106 rc = PTR_ERR(first_node);
2109 first_tree.lt_fd = first_filp->private_data;
2110 rc = ll_tree_lock(&first_tree, first_node, NULL, 0, 0);
2115 second_node = ll_node_from_inode(second, 0, OBD_OBJECT_EOF, LCK_EX);
2116 if (IS_ERR(second_node)){
2117 rc = PTR_ERR(second_node);
2120 second_tree.lt_fd = second_filp->private_data;
2121 rc = ll_tree_lock(&second_tree, second_node, NULL, 0, 0);
2126 rc = join_sanity_check(head, tail);
2130 rc = join_file(head, filp, tail_filp);
/* Phased cleanup: each case falls through to undo earlier phases. */
2134 switch (cleanup_phase) {
2136 ll_tree_unlock(&second_tree);
2137 obd_cancel_unused(ll_i2dtexp(second),
2138 ll_i2info(second)->lli_smd, 0, NULL);
2140 ll_tree_unlock(&first_tree);
2141 obd_cancel_unused(ll_i2dtexp(first),
2142 ll_i2info(first)->lli_smd, 0, NULL);
2144 filp_close(tail_filp, 0);
2147 if (head && rc == 0) {
/* Join succeeded: the head's cached striping is now stale. */
2148 obd_free_memmd(ll_i2sbi(head)->ll_dt_exp,
2150 hlli->lli_smd = NULL;
2155 CERROR("invalid cleanup_phase %d\n", cleanup_phase);
/*
 * Close the MDS open handle acquired as a side effect of an intent
 * (e.g. setstripe or join opens).  No-ops for the root dentry and for
 * intents that carry no DISP_OPEN_OPEN disposition.  Fills a transient
 * obd_client_handle from the intent and closes it, then releases the
 * intent's request reference.
 */
2161 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2163 struct inode *inode = dentry->d_inode;
2164 struct obd_client_handle *och;
2170 /* Root ? Do nothing. */
2171 if (dentry->d_inode->i_sb->s_root == dentry)
2174 /* No open handle to close? Move away */
2175 if (!it_disposition(it, DISP_OPEN_OPEN))
2178 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2180 OBD_ALLOC(och, sizeof(*och));
2182 GOTO(out, rc = -ENOMEM);
2184 ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
2185 ll_i2info(inode), it, och);
2187 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
2190 /* this one is in place of ll_file_open */
2191 ptlrpc_req_finished(it->d.lustre.it_data);
2192 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * ioctl dispatcher for regular files: per-fd flag get/set, LOV stripe
 * get/set ioctls, object recreation, ext3-compatible flag/version
 * ioctls, file join, group locks, statfs, security-context flush and
 * remote ACL get/set.  Unhandled commands fall through to the data
 * export's obd_iocontrol().  tty ioctls are rejected up front (return
 * elided, like several other returns/labels in this listing).
 */
2196 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
2199 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2203 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2204 inode->i_generation, inode, cmd);
2205 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2207 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2208 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2212 case LL_IOC_GETFLAGS:
2213 /* Get the current value of the file flags */
2214 return put_user(fd->fd_flags, (int *)arg);
2215 case LL_IOC_SETFLAGS:
2216 case LL_IOC_CLRFLAGS:
2217 /* Set or clear specific file flags */
2218 /* XXX This probably needs checks to ensure the flags are
2219 * not abused, and to handle any flag side effects.
2221 if (get_user(flags, (int *) arg))
2224 if (cmd == LL_IOC_SETFLAGS) {
/* IGNORE_LOCK is only safe with O_DIRECT I/O. */
2225 if ((flags & LL_FILE_IGNORE_LOCK) &&
2226 !(file->f_flags & O_DIRECT)) {
2227 CERROR("%s: unable to disable locking on "
2228 "non-O_DIRECT file\n", current->comm);
2232 fd->fd_flags |= flags;
2234 fd->fd_flags &= ~flags;
2237 case LL_IOC_LOV_SETSTRIPE:
2238 RETURN(ll_lov_setstripe(inode, file, arg));
2239 case LL_IOC_LOV_SETEA:
2240 RETURN(ll_lov_setea(inode, file, arg));
2241 case LL_IOC_LOV_GETSTRIPE:
2242 RETURN(ll_lov_getstripe(inode, arg));
2243 case LL_IOC_RECREATE_OBJ:
2244 RETURN(ll_lov_recreate_obj(inode, file, arg));
2245 case EXT3_IOC_GETFLAGS:
2246 case EXT3_IOC_SETFLAGS:
2247 RETURN(ll_iocontrol(inode, file, cmd, arg));
2248 case EXT3_IOC_GETVERSION_OLD:
2249 case EXT3_IOC_GETVERSION:
2250 RETURN(put_user(inode->i_generation, (int *)arg));
/* LL_IOC_JOIN (case label elided): arg is a userspace path string. */
2255 ftail = getname((const char *)arg);
2257 RETURN(PTR_ERR(ftail));
2258 rc = ll_file_join(inode, file, ftail);
2262 case LL_IOC_GROUP_LOCK:
2263 RETURN(ll_get_grouplock(inode, file, arg));
2264 case LL_IOC_GROUP_UNLOCK:
2265 RETURN(ll_put_grouplock(inode, file, arg));
2266 case IOC_OBD_STATFS:
2267 RETURN(ll_obd_statfs(inode, (void *)arg));
2269 /* We need to special case any other ioctls we want to handle,
2270 * to send them to the MDS/OST as appropriate and to properly
2271 * network encode the arg field.
2272 case EXT3_IOC_SETVERSION_OLD:
2273 case EXT3_IOC_SETVERSION:
2275 case LL_IOC_FLUSHCTX:
2276 RETURN(ll_flush_ctx(inode));
2277 case LL_IOC_GETFACL: {
2278 struct rmtacl_ioctl_data ioc;
2280 if (copy_from_user(&ioc, (void *)arg, sizeof(ioc)))
2283 RETURN(ll_ioctl_getfacl(inode, &ioc));
2285 case LL_IOC_SETFACL: {
2286 struct rmtacl_ioctl_data ioc;
2288 if (copy_from_user(&ioc, (void *)arg, sizeof(ioc)))
2291 RETURN(ll_ioctl_setfacl(inode, &ioc));
/* default: forward anything unrecognized to the data stack. */
2294 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
/*
 * llseek entry point.  SEEK_END must produce an accurate size, so it
 * glimpses the OSTs first (non-blocking when O_NONBLOCK) and reads
 * i_size under ll_inode_size_lock().  The new offset is validated
 * against ll_file_maxbytes() before being stored in f_pos.
 * NOTE(review): elided listing — some returns and the 2.4-only
 * f_reada/f_version handling are only partially visible.
 */
2299 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2301 struct inode *inode = file->f_dentry->d_inode;
2302 struct ll_inode_info *lli = ll_i2info(inode);
2303 struct lov_stripe_md *lsm = lli->lli_smd;
2306 retval = offset + ((origin == 2) ? inode->i_size :
2307 (origin == 1) ? file->f_pos : 0);
2308 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%Lu=%#Lx(%s)\n",
2309 inode->i_ino, inode->i_generation, inode, retval, retval,
2310 origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
2311 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2313 if (origin == 2) { /* SEEK_END */
2314 int nonblock = 0, rc;
2316 if (file->f_flags & O_NONBLOCK)
2317 nonblock = LDLM_FL_BLOCK_NOWAIT;
/* Refresh i_size from the OSTs before seeking relative to EOF. */
2320 rc = ll_glimpse_size(inode, nonblock);
2325 ll_inode_size_lock(inode, 0);
2326 offset += inode->i_size;
2327 ll_inode_size_unlock(inode, 0);
2328 } else if (origin == 1) { /* SEEK_CUR */
2329 offset += file->f_pos;
2333 if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
2334 if (offset != file->f_pos) {
2335 file->f_pos = offset;
2336 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
2338 file->f_version = ++event;
/*
 * fsync entry point.  Waits for in-flight page writeback, folds in any
 * recorded async write errors (inode-level and per-stripe), syncs the
 * metadata via md_sync(), and — when the file has data objects — syncs
 * [0, EOF] on the OSTs via obd_sync() with an allocated obdo.
 * NOTE(review): elided listing — obdo allocation, capa puts and the
 * final return are not visible here.
 */
2347 int ll_fsync(struct file *file, struct dentry *dentry, int data)
2349 struct inode *inode = dentry->d_inode;
2350 struct ll_inode_info *lli = ll_i2info(inode);
2351 struct lov_stripe_md *lsm = lli->lli_smd;
2352 struct ptlrpc_request *req;
2353 struct obd_capa *oc;
2356 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2357 inode->i_generation, inode);
2358 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2360 /* fsync's caller has already called _fdata{sync,write}, we want
2361 * that IO to finish before calling the osc and mdc sync methods */
2362 rc = filemap_fdatawait(inode->i_mapping);
2364 /* catch async errors that were recorded back when async writeback
2365 * failed for pages in this mapping. */
2366 err = lli->lli_async_rc;
2367 lli->lli_async_rc = 0;
2371 err = lov_test_and_clear_async_rc(lsm);
2376 oc = ll_mdscapa_get(inode);
2377 err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2383 ptlrpc_req_finished(req);
/* Below: data-object sync path (obdo alloc failure handling). */
2390 RETURN(rc ? rc : -ENOMEM);
2392 oa->o_id = lsm->lsm_object_id;
2393 oa->o_gr = lsm->lsm_object_gr;
2394 oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2395 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
2396 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
2399 oc = ll_osscapa_get(inode, 0, CAPA_OPC_OSS_WRITE);
2400 err = obd_sync(ll_i2sbi(inode)->ll_dt_exp, oa, lsm,
2401 0, OBD_OBJECT_EOF, oc);
/*
 * flock()/fcntl() lock entry point: translate the kernel file_lock
 * and command into an LDLM_FLOCK enqueue against the MDS.  flock()
 * requests get whole-file range and the caller's tgid; fcntl types
 * map to LDLM modes (unlock is expressed as an LCK_NL enqueue, see
 * comment below); F_GETLK* becomes a TEST_LOCK enqueue.  On success
 * the lock is mirrored into the kernel's local lock lists.
 * NOTE(review): elided listing — the fl_type/cmd switch arms and
 * several returns are only partially visible here.
 */
2411 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2413 struct inode *inode = file->f_dentry->d_inode;
2414 struct ll_sb_info *sbi = ll_i2sbi(inode);
2415 struct ldlm_res_id res_id =
2416 { .name = { fid_seq(ll_inode2fid(inode)),
2417 fid_oid(ll_inode2fid(inode)),
2418 fid_ver(ll_inode2fid(inode)),
2420 struct lustre_handle lockh = {0};
2421 ldlm_policy_data_t flock;
2422 ldlm_mode_t mode = 0;
2427 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2428 inode->i_ino, file_lock);
2430 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2432 if (file_lock->fl_flags & FL_FLOCK) {
2433 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2434 /* set missing params for flock() calls */
2435 file_lock->fl_end = OFFSET_MAX;
2436 file_lock->fl_pid = current->tgid;
2438 flock.l_flock.pid = file_lock->fl_pid;
2439 flock.l_flock.start = file_lock->fl_start;
2440 flock.l_flock.end = file_lock->fl_end;
2442 switch (file_lock->fl_type) {
2447 /* An unlock request may or may not have any relation to
2448 * existing locks so we may not be able to pass a lock handle
2449 * via a normal ldlm_lock_cancel() request. The request may even
2450 * unlock a byte range in the middle of an existing lock. In
2451 * order to process an unlock request we need all of the same
2452 * information that is given with a normal read or write record
2453 * lock request. To avoid creating another ldlm unlock (cancel)
2454 * message we'll treat a LCK_NL flock request as an unlock. */
2461 CERROR("unknown fcntl lock type: %d\n", file_lock->fl_type);
/* Non-blocking set requests. */
2476 flags = LDLM_FL_BLOCK_NOWAIT;
/* F_GETLK*: probe only, never grant. */
2482 flags = LDLM_FL_TEST_LOCK;
2483 /* Save the old mode so that if the mode in the lock changes we
2484 * can decrement the appropriate reader or writer refcount. */
2485 file_lock->fl_type = mode;
2488 CERROR("unknown fcntl lock command: %d\n", cmd);
2492 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2493 "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2494 flags, mode, flock.l_flock.start, flock.l_flock.end);
2496 rc = ldlm_cli_enqueue(sbi->ll_md_exp, NULL, &res_id,
2497 LDLM_FLOCK, &flock, mode, &flags, NULL,
2498 ldlm_flock_completion_ast, NULL, file_lock,
2499 NULL, 0, NULL, &lockh, 0);
/* Mirror the granted lock into the kernel's local bookkeeping. */
2500 if ((file_lock->fl_flags & FL_FLOCK) && (rc == 0))
2501 ll_flock_lock_file_wait(file, file_lock, (cmd == F_SETLKW));
2502 #ifdef HAVE_F_OP_FLOCK
2503 if ((file_lock->fl_flags & FL_POSIX) && (rc == 0) &&
2504 !(flags & LDLM_FL_TEST_LOCK))
2505 posix_lock_file_wait(file, file_lock);
/* Lock entry point used when the filesystem is mounted with -o noflock.
 * Only the signature is visible in this view; per the comment near
 * ll_file_operations_noflock below it presumably returns -ENOSYS for
 * all lock requests — TODO(review): confirm against the elided body. */
2511 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
/* Test (without blocking and without taking a reference — note
 * LDLM_FL_TEST_LOCK) whether this client already holds a granted MDS
 * inodebits lock covering 'bits' on 'inode', in any of the CR/CW/PR
 * modes.  The return statements are elided in this view; presumably
 * returns non-zero iff md_lock_match() succeeds — verify. */
2518 int ll_have_md_lock(struct inode *inode, __u64 bits)
2520         struct lustre_handle lockh;
2521         ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2529         fid = &ll_i2info(inode)->lli_fid;
2530         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
/* LDLM_FL_CBPENDING: also match locks already scheduled for
 * cancellation; LDLM_FL_BLOCK_GRANTED: match granted locks only. */
2532         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2533         if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2534                           LCK_CR|LCK_CW|LCK_PR, &lockh)) {
/* Common epilogue for revalidation RPCs: turn an -ENOENT from the MDS
 * (inode already unlinked on the server) into a benign local update,
 * and log any other failure.  Interior lines (the nlink update and the
 * returns) are elided in this view. */
2541 static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
2542         if (rc == -ENOENT) { /* Already unlinked. Just update nlink
2543                               * and return success */
2545                 /* This path cannot be hit for regular files unless in
2546                  * case of obscure races, so no need to to validate
2548                 if (!S_ISREG(inode->i_mode) &&
2549                     !S_ISDIR(inode->i_mode))
/* Any failure other than -ENOENT is unexpected here; make it loud. */
2554                 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
/* Revalidate a dentry/inode against the MDS before answering a lookup
 * or getattr intent.  Two strategies, chosen by server capability:
 *  1) OBD_CONNECT_ATTRFID: issue a getattr-by-FID intent lock (no name
 *     needed), then finish via ll_revalidate_it_finish();
 *  2) otherwise: if we do not already hold an UPDATE inodebits lock,
 *     fall back to a plain md_getattr() RPC and refresh the inode.
 * Finally, glimpse the file size from the OSTs when an object exists.
 * NOTE(review): many interior lines (returns, GOTO/out labels, error
 * branches) are elided in this view. */
2562 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
2564         struct inode *inode = dentry->d_inode;
2565         struct ptlrpc_request *req = NULL;
2566         struct ll_sb_info *sbi;
2567         struct obd_export *exp;
/* Guard for a "can't happen" condition (the condition itself is on an
 * elided line); kept as a loud diagnostic. */
2572                 CERROR("REPORT THIS LINE TO PETER\n");
2575         sbi = ll_i2sbi(inode);
2577         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2578                inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2579 #if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,5,0))
2580         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_REVALIDATE, 1);
2583         exp = ll_i2mdexp(inode);
/* Strategy 1: server supports getattr by FID. */
2585         if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
2586                 struct lookup_intent oit = { .it_op = IT_GETATTR };
2587                 struct md_op_data *op_data;
2589                 /* Call getattr by fid, so do not provide name at all. */
2590                 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2591                                              dentry->d_inode, NULL, 0, 0,
2592                                              LUSTRE_OPC_ANY, NULL);
2593                 if (IS_ERR(op_data))
2594                         RETURN(PTR_ERR(op_data));
/* O_CHECK_STALE makes the MDS verify the FID still refers to a live
 * object; cleared again after the RPC below. */
2596                 oit.it_flags |= O_CHECK_STALE;
2597                 rc = md_intent_lock(exp, op_data, NULL, 0,
2598                                     /* we are not interested in name
2601                                     ll_md_blocking_ast, 0);
2602                 ll_finish_md_op_data(op_data);
2603                 oit.it_flags &= ~O_CHECK_STALE;
2605                         rc = ll_inode_revalidate_fini(inode, rc);
2609                 rc = ll_revalidate_it_finish(req, DLM_REPLY_REC_OFF, &oit, dentry);
2611                         ll_intent_release(&oit);
2615                 /* Unlinked? Unhash dentry, so it is not picked up later by
2616                    do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2617                    here to preserve get_cwd functionality on 2.6.
2619                 if (!dentry->d_inode->i_nlink) {
2620                         spin_lock(&dcache_lock);
2621                         ll_drop_dentry(dentry);
2622                         spin_unlock(&dcache_lock);
2625                 ll_lookup_finish_locks(&oit, dentry);
/* Strategy 2: no ATTRFID support — only refresh when we don't already
 * hold a cached UPDATE lock on the inode's metadata. */
2626         } else if (!ll_have_md_lock(dentry->d_inode,
2627                                     MDS_INODELOCK_UPDATE)) {
2628                 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2629                 obd_valid valid = OBD_MD_FLGETATTR;
2630                 struct obd_capa *oc;
/* Regular files: also fetch striping EA, sized to the MDS max. */
2633                 if (S_ISREG(inode->i_mode)) {
2634                         rc = ll_get_max_mdsize(sbi, &ealen);
2637                         valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2639                 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2640                  * capa for this inode. Because we only keep capas of dirs
2642                 oc = ll_mdscapa_get(inode);
2643                 rc = md_getattr(sbi->ll_md_exp, ll_inode2fid(inode), oc, valid,
2647                         rc = ll_inode_revalidate_fini(inode, rc);
/* Refresh the in-core inode from the getattr reply. */
2651                 rc = ll_prep_inode(&inode, req, REPLY_REC_OFF,
2657         /* if object not yet allocated, don't validate size */
2658         if (ll_i2info(inode)->lli_smd == NULL)
2661         /* ll_glimpse_size will prefer locally cached writes if they extend
2663         rc = ll_glimpse_size(inode, 0);
2666         ptlrpc_req_finished(req);
2670 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
/* ->getattr() worker: revalidate the inode against the MDS with the
 * given intent, then copy the (now fresh) in-core inode attributes into
 * the kstat the VFS hands us.  The early-return on revalidation failure
 * is on an elided line. */
2671 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2672                   struct lookup_intent *it, struct kstat *stat)
2674         struct inode *inode = de->d_inode;
2677         res = ll_inode_revalidate_it(de, it);
2678         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETATTR, 1);
2683         stat->dev = inode->i_sb->s_dev;
2684         stat->ino = inode->i_ino;
2685         stat->mode = inode->i_mode;
2686         stat->nlink = inode->i_nlink;
2687         stat->uid = inode->i_uid;
2688         stat->gid = inode->i_gid;
2689         stat->rdev = kdev_t_to_nr(inode->i_rdev);
2690         stat->atime = inode->i_atime;
2691         stat->mtime = inode->i_mtime;
2692         stat->ctime = inode->i_ctime;
2693 #ifdef HAVE_INODE_BLKSIZE
2694         stat->blksize = inode->i_blksize;
/* Older/newer kernels without i_blksize: derive it from i_blkbits. */
2696         stat->blksize = 1 << inode->i_blkbits;
/* size/blocks are updated by concurrent IO; read them under the llite
 * inode size lock so the pair is consistent. */
2699         ll_inode_size_lock(inode, 0);
2700         stat->size = inode->i_size;
2701         stat->blocks = inode->i_blocks;
2702         ll_inode_size_unlock(inode, 0);
/* Plain ->getattr() entry point: delegate to ll_getattr_it() with a
 * default IT_GETATTR intent. */
2706 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2708         struct lookup_intent it = { .it_op = IT_GETATTR };
2710         return ll_getattr_it(mnt, de, &it, stat);
/* POSIX ACL permission check used as the acl callback for
 * generic_permission().  Takes a private reference to the cached ACL
 * under lli_lock so it can be evaluated after the spinlock is dropped.
 * The non-ACL fallback path (when CONFIG_FS_POSIX_ACL is off, or the
 * inode has no ACL) is on elided lines. */
2715 int lustre_check_acl(struct inode *inode, int mask)
2717 #ifdef CONFIG_FS_POSIX_ACL
2718         struct ll_inode_info *lli = ll_i2info(inode);
2719         struct posix_acl *acl;
/* posix_acl_dup() bumps the refcount; safe to use after unlock. */
2723         spin_lock(&lli->lli_lock);
2724         acl = posix_acl_dup(lli->lli_posix_acl);
2725         spin_unlock(&lli->lli_lock);
2730         rc = posix_acl_permission(inode, acl, mask);
/* Drop the reference taken by posix_acl_dup() above. */
2731         posix_acl_release(acl);
2739 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10))
/* ->permission() for kernels >= 2.6.10: remote-client mounts check
 * permission on the MDS; everyone else goes through the kernel's
 * generic_permission() with lustre_check_acl() as the ACL callback. */
2740 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2742         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2743                inode->i_ino, inode->i_generation, inode, mask);
/* Remote clients can't trust local uid/gid; ask the server instead. */
2744         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2745                 return lustre_check_remote_perm(inode, mask);
2747         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2748         return generic_permission(inode, mask, lustre_check_acl);
2751 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
2752 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
/* Pre-2.6.10 kernels lack the acl-callback form of generic_permission,
 * so this variant open-codes the classic owner/group/other + ACL +
 * capability permission algorithm.  NOTE(review): several lines (mode
 * shifts, returns, the check_capabilities label) are elided in this
 * view — in particular the owner-branch shift of 'mode' presumably
 * happens on an elided line before the '(mode >> 3)' test below. */
2754 int ll_inode_permission(struct inode *inode, int mask)
2757         int mode = inode->i_mode;
2760         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2761                inode->i_ino, inode->i_generation, inode, mask);
/* Remote clients defer the decision to the server. */
2763         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2764                 return lustre_check_remote_perm(inode, mask);
2766         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
/* Writes to regular files/dirs/symlinks on a read-only fs are denied. */
2768         if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
2769             (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
/* Immutable inodes can never be written. */
2771         if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
2773         if (current->fsuid == inode->i_uid) {
2776                 if (((mode >> 3) & mask & S_IRWXO) != mask)
/* Mode bits alone don't grant access — consult the POSIX ACL. */
2778                         rc = lustre_check_acl(inode, mask);
2782                 goto check_capabilities;
2786                 if (in_group_p(inode->i_gid))
/* "Other" permission bits grant access directly. */
2789         if ((mode & mask & S_IRWXO) == mask)
/* Capability overrides: CAP_DAC_OVERRIDE for everything except exec of
 * a file with no x bit; CAP_DAC_READ_SEARCH for reads and dir search. */
2793         if (!(mask & MAY_EXEC) ||
2794             (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
2795                 if (capable(CAP_DAC_OVERRIDE))
2798         if (capable(CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
2799             (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
2806 /* -o localflock - only provides locally consistent flock locks */
/* Default file operations table: note it has no .flock/.lock entries
 * visible, so locking falls back to the kernel's local implementation
 * (matching the -o localflock comment above).  The closing "};" and the
 * pre-2.5 branch of the #if are on elided lines. */
2807 struct file_operations ll_file_operations = {
2808         .read           = ll_file_read,
2809         .write          = ll_file_write,
2810         .ioctl          = ll_file_ioctl,
2811         .open           = ll_file_open,
2812         .release        = ll_file_release,
2813         .mmap           = ll_file_mmap,
2814         .llseek         = ll_file_seek,
2815 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
2816         .sendfile       = ll_file_sendfile,
/* File operations for mounts with cluster-wide locking enabled: same
 * as ll_file_operations plus .flock (when the kernel has the f_op
 * member, HAVE_F_OP_FLOCK) or .lock, both routed to ll_file_flock().
 * Closing "};" is on an elided line. */
2821 struct file_operations ll_file_operations_flock = {
2822         .read           = ll_file_read,
2823         .write          = ll_file_write,
2824         .ioctl          = ll_file_ioctl,
2825         .open           = ll_file_open,
2826         .release        = ll_file_release,
2827         .mmap           = ll_file_mmap,
2828         .llseek         = ll_file_seek,
2829 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
2830         .sendfile       = ll_file_sendfile,
2833 #ifdef HAVE_F_OP_FLOCK
2834         .flock          = ll_file_flock,
2836         .lock           = ll_file_flock
2839 /* These are for -o noflock - to return ENOSYS on flock calls */
/* Identical to ll_file_operations_flock except the lock entry points
 * route to ll_file_noflock() so lock requests fail outright rather
 * than being silently local-only.  Closing "};" is on an elided line. */
2840 struct file_operations ll_file_operations_noflock = {
2841         .read           = ll_file_read,
2842         .write          = ll_file_write,
2843         .ioctl          = ll_file_ioctl,
2844         .open           = ll_file_open,
2845         .release        = ll_file_release,
2846         .mmap           = ll_file_mmap,
2847         .llseek         = ll_file_seek,
2848 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
2849         .sendfile       = ll_file_sendfile,
2852 #ifdef HAVE_F_OP_FLOCK
2853         .flock          = ll_file_noflock,
2855         .lock           = ll_file_noflock
2858 struct inode_operations ll_file_inode_operations = {
2859 #ifdef LUSTRE_KERNEL_VERSION
2860 .setattr_raw = ll_setattr_raw,
2862 .setattr = ll_setattr,
2863 .truncate = ll_truncate,
2864 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
2865 .getattr = ll_getattr,
2867 .revalidate_it = ll_inode_revalidate_it,
2869 .permission = ll_inode_permission,
2870 .setxattr = ll_setxattr,
2871 .getxattr = ll_getxattr,
2872 .listxattr = ll_listxattr,
2873 .removexattr = ll_removexattr,