1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (c) 2002, 2003 Cluster File Systems, Inc.
5 * Author: Peter Braam <braam@clusterfs.com>
6 * Author: Phil Schwan <phil@clusterfs.com>
7 * Author: Andreas Dilger <adilger@clusterfs.com>
9 * This file is part of Lustre, http://www.lustre.org.
11 * Lustre is free software; you can redistribute it and/or
12 * modify it under the terms of version 2 of the GNU General Public
13 * License as published by the Free Software Foundation.
15 * Lustre is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with Lustre; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 #define DEBUG_SUBSYSTEM S_LLITE
26 #include <lustre_dlm.h>
27 #include <lustre_lite.h>
28 #include <lustre_mdc.h>
29 #include <linux/pagemap.h>
30 #include <linux/file.h>
31 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
32 #include <linux/lustre_compat25.h>
34 #include "llite_internal.h"
36 /* also used by llite/special.c:ll_special_open() */
/* Allocate a struct ll_file_data (per-open-file private state) from the
 * ll_file_data_slab cache.
 * NOTE(review): listing is truncated — the NULL check / return statement
 * are not visible here; presumably returns fd (NULL on allocation failure). */
37 struct ll_file_data *ll_file_data_get(void)
39 struct ll_file_data *fd;
41 OBD_SLAB_ALLOC(fd, ll_file_data_slab, SLAB_KERNEL, sizeof *fd);
/* Release a struct ll_file_data back to its slab cache.
 * Counterpart of ll_file_data_get() above. */
45 static void ll_file_data_put(struct ll_file_data *fd)
48 OBD_SLAB_FREE(fd, ll_file_data_slab, sizeof *fd);
/* Copy the inode's current attributes (fid, mode, times, size, blocks,
 * flags, ioepoch) plus the given open file handle @fh into @op_data, so
 * they can be shipped to the MDS (e.g. on close).  Also takes an MDS
 * capability reference via ll_mdscapa_get() — NOTE(review): the matching
 * capa_put() must happen elsewhere; not visible in this truncated listing. */
51 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
52 struct lustre_handle *fh)
54 op_data->fid1 = ll_i2info(inode)->lli_fid;
55 op_data->attr.ia_mode = inode->i_mode;
56 op_data->attr.ia_atime = inode->i_atime;
57 op_data->attr.ia_mtime = inode->i_mtime;
58 op_data->attr.ia_ctime = inode->i_ctime;
59 op_data->attr.ia_size = inode->i_size;
60 op_data->attr_blocks = inode->i_blocks;
/* ll_iattr cast exposes the extra ia_attr_flags field not present in the
 * stock struct iattr of older kernels. */
61 ((struct ll_iattr *)&op_data->attr)->ia_attr_flags = inode->i_flags;
62 op_data->ioepoch = ll_i2info(inode)->lli_ioepoch;
63 memcpy(&op_data->handle, fh, sizeof(op_data->handle));
64 op_data->mod_capa1 = ll_mdscapa_get(inode);
/* Prepare @op_data for an MDS close of open handle @och: select which
 * attributes are valid, handle epoch close, and pack the inode attributes.
 * NOTE(review): truncated listing — the body of the !FMODE_WRITE branch
 * (presumably a goto/EXIT skipping the size/epoch work) is missing here. */
67 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
68 struct obd_client_handle *och)
72 op_data->attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
73 ATTR_MTIME_SET | ATTR_CTIME_SET;
75 if (!(och->och_flags & FMODE_WRITE))
/* Non-regular files have no OST objects, so size/blocks come from the
 * inode itself and can be sent with the close. */
78 if (!S_ISREG(inode->i_mode))
79 op_data->attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* ll_epoch_close takes &och (handle pointer by reference) — presumably so
 * it can detach/replace the handle; confirm against its definition. */
81 ll_epoch_close(inode, op_data, &och, 0);
84 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
/* Send an MDS close RPC for open handle @och and clean up its replay data.
 * Also destroys OST objects if the close reply instructs us to, and queues
 * DONE_WRITING when the epoch was not closed by this close.
 * Returns 0 or a negative errno (rc declaration not visible — truncated).
 * NOTE(review): many control-flow lines (braces, gotos, RETURNs) are
 * missing from this listing; comments below describe the visible intent. */
88 static int ll_close_inode_openhandle(struct obd_export *md_exp,
90 struct obd_client_handle *och)
92 struct md_op_data *op_data;
93 struct ptlrpc_request *req = NULL;
94 struct obd_device *obd;
99 obd = class_exp2obd(ll_i2mdexp(inode));
102 * XXX: in case of LMV, is this correct to access
105 CERROR("Invalid MDC connection handle "LPX64"\n",
106 ll_i2mdexp(inode)->exp_handle.h_cookie);
111 * here we check if this is forced umount. If so this is called on
112 * canceling "open lock" and we do not call md_close() in this case, as
113 * it will not be successful, as import is already deactivated.
115 if (obd->obd_no_recov)
118 OBD_ALLOC_PTR(op_data);
120 GOTO(out, rc = -ENOMEM);
122 ll_prepare_close(inode, op_data, och);
/* The epoch is considered closed when a write handle closes with
 * MF_EPOCH_CLOSE set, or for non-regular files (no OST epoch at all). */
123 epoch_close = (och->och_flags & FMODE_WRITE) &&
124 ((op_data->flags & MF_EPOCH_CLOSE) ||
125 !S_ISREG(inode->i_mode));
126 rc = md_close(md_exp, op_data, och, &req);
128 ll_finish_md_op_data(op_data);
130 /* This close must have closed the epoch. */
131 LASSERT(epoch_close);
132 /* MDS has instructed us to obtain Size-on-MDS attribute from
133 * OSTs and send setattr to back to MDS. */
134 rc = ll_sizeonmds_update(inode, &och->och_fh);
136 CERROR("inode %lu mdc Size-on-MDS update failed: "
137 "rc = %d\n", inode->i_ino, rc);
141 CERROR("inode %lu mdc close failed: rc = %d\n",
/* Epoch still open on a write handle: defer DONE_WRITING to a later pass. */
145 if (!epoch_close && (och->och_flags & FMODE_WRITE))
146 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
149 rc = ll_objects_destroy(req, inode);
151 CERROR("inode %lu ll_objects destroy: rc = %d\n",
155 ptlrpc_req_finished(req); /* This is close request */
158 md_clear_open_replay_data(md_exp, och);
/* Poison the file handle cookie so later code can tell it is dead. */
159 if (epoch_close || !(och->och_flags & FMODE_WRITE))
160 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
/* Close the cached MDS open handle of the given open mode (write/exec/read)
 * for @inode, if no other users of that handle remain.
 * NOTE(review): truncated listing — the swap of *och_p to the local och
 * under lli_och_sem, and the early-return when och_usecount != 0, are
 * implied by the visible lines but their statements are missing here. */
164 int ll_md_real_close(struct inode *inode, int flags)
166 struct ll_inode_info *lli = ll_i2info(inode);
167 struct obd_client_handle **och_p;
168 struct obd_client_handle *och;
/* Select which cached handle/usecount pair matches the open mode. */
173 if (flags & FMODE_WRITE) {
174 och_p = &lli->lli_mds_write_och;
175 och_usecount = &lli->lli_open_fd_write_count;
176 } else if (flags & FMODE_EXEC) {
177 och_p = &lli->lli_mds_exec_och;
178 och_usecount = &lli->lli_open_fd_exec_count;
180 LASSERT(flags & FMODE_READ);
181 och_p = &lli->lli_mds_read_och;
182 och_usecount = &lli->lli_open_fd_read_count;
185 down(&lli->lli_och_sem);
186 if (*och_usecount) { /* There are still users of this handle, so
188 up(&lli->lli_och_sem);
193 up(&lli->lli_och_sem);
195 if (och) { /* There might be a race and somebody have freed this och
197 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
199 /* Do not free @och is it is waiting for DONE_WRITING. */
200 if (och->och_fh.cookie == DEAD_HANDLE_MAGIC)
201 OBD_FREE(och, sizeof *och);
/* Per-file-descriptor close: drop any group lock, decrement the open-mode
 * usecount, and — unless a matching cached OPEN DLM lock lets us skip the
 * MDS round trip — do the real MDS close via ll_md_real_close().
 * Finally detach and free the ll_file_data and release the OSS capability.
 * NOTE(review): several lines (lockmode setup, braces, RETURN) are missing
 * from this truncated listing. */
207 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
210 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
211 struct ll_inode_info *lli = ll_i2info(inode);
215 /* clear group lock, if present */
216 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
217 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
218 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
219 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP,
223 /* Let's see if we have good enough OPEN lock on the file and if
224 we can skip talking to MDS */
225 if (file->f_dentry->d_inode) { /* Can this ever be false? */
227 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
228 struct lustre_handle lockh;
/* Shadows the outer @inode parameter with the dentry's inode. */
229 struct inode *inode = file->f_dentry->d_inode;
230 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
232 down(&lli->lli_och_sem);
/* Drop this fd's contribution to the per-mode open count. */
233 if (fd->fd_omode & FMODE_WRITE) {
235 LASSERT(lli->lli_open_fd_write_count);
236 lli->lli_open_fd_write_count--;
237 } else if (fd->fd_omode & FMODE_EXEC) {
239 LASSERT(lli->lli_open_fd_exec_count);
240 lli->lli_open_fd_exec_count--;
243 LASSERT(lli->lli_open_fd_read_count);
244 lli->lli_open_fd_read_count--;
246 up(&lli->lli_och_sem);
/* No cached OPEN ibits lock: must do the real close to the MDS. */
248 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
249 LDLM_IBITS, &policy, lockmode,
251 rc = ll_md_real_close(file->f_dentry->d_inode,
255 CERROR("Releasing a file %p with negative dentry %p. Name %s",
256 file, file->f_dentry, file->f_dentry->d_name.name);
259 LUSTRE_FPRIVATE(file) = NULL;
260 ll_file_data_put(fd);
261 ll_capa_close(inode);
266 int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm);
268 /* While this returns an error code, fput() the caller does not, so we need
269 * to make every effort to clean up all of our state here. Also, applications
270 * rarely check close errors and even if an error is returned they will not
271 * re-try the close call.
/* VFS ->release() handler: called on last fput of the struct file.
 * Collects any pending async write errors from the LOV layer, then closes
 * the MDS handle via ll_md_close().  The root dentry gets a short-circuit
 * path that only frees the file data.
 * NOTE(review): truncated — rc declaration, the lsm NULL guard around
 * lov_test_and_clear_async_rc(), and the RETURN are not visible here. */
273 int ll_file_release(struct inode *inode, struct file *file)
275 struct ll_file_data *fd;
276 struct ll_sb_info *sbi = ll_i2sbi(inode);
277 struct ll_inode_info *lli = ll_i2info(inode);
278 struct lov_stripe_md *lsm = lli->lli_smd;
282 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
283 inode->i_generation, inode);
285 lprocfs_counter_incr(sbi->ll_stats, LPROC_LL_RELEASE);
286 fd = LUSTRE_FPRIVATE(file);
289 /* don't do anything for / */
290 if (inode->i_sb->s_root == file->f_dentry) {
291 LUSTRE_FPRIVATE(file) = NULL;
292 ll_file_data_put(fd);
/* Surface (and clear) any deferred async write error for this inode. */
297 lov_test_and_clear_async_rc(lsm);
298 lli->lli_async_rc = 0;
300 rc = ll_md_close(sbi->ll_md_exp, inode, file);
/* Perform an IT_OPEN intent enqueue to the MDS for @file (used e.g. by NFSD
 * and when a cached open handle vanished between revalidate and open).
 * On success, releases any granted DLM lock immediately and instantiates
 * the inode from the intent reply via ll_prep_inode().
 * NOTE(review): truncated — error paths after md_enqueue and the op_data
 * NULL check are only partially visible. */
304 static int ll_intent_file_open(struct file *file, void *lmm,
305 int lmmsize, struct lookup_intent *itp)
307 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
308 struct dentry *parent = file->f_dentry->d_parent;
309 const char *name = file->f_dentry->d_name.name;
310 const int len = file->f_dentry->d_name.len;
311 struct lustre_handle lockh;
312 struct md_op_data *op_data;
318 /* Usually we come here only for NFSD, and we want open lock.
319 But we can also get here with pre 2.6.15 patchless kernels, and in
320 that case that lock is also ok */
321 /* We can also get here if there was cached open handle in revalidate_it
322 * but it disappeared while we were getting from there to ll_file_open.
323 * But this means this file was closed and immediatelly opened which
324 * makes a good candidate for using OPEN lock */
325 /* If lmmsize & lmm are not 0, we are just setting stripe info
326 * parameters. No need for the open lock */
327 if (!lmm && !lmmsize)
328 itp->it_flags |= MDS_OPEN_LOCK;
330 op_data = ll_prep_md_op_data(NULL, parent->d_inode, NULL, name, len,
335 rc = md_enqueue(sbi->ll_md_exp, LDLM_IBITS, itp, LCK_CW, op_data,
336 &lockh, lmm, lmmsize, ldlm_completion_ast,
337 ll_md_blocking_ast, NULL, 0);
339 ll_finish_md_op_data(op_data);
341 CERROR("lock enqueue: err: %d\n", rc);
345 if (itp->d.lustre.it_lock_mode) { /* If we got lock - release it right
347 ldlm_lock_decref(&lockh, itp->d.lustre.it_lock_mode);
348 itp->d.lustre.it_lock_mode = 0;
350 rc = ll_prep_inode(&file->f_dentry->d_inode,
351 (struct ptlrpc_request *)itp->d.lustre.it_data,
352 DLM_REPLY_REC_OFF, NULL);
/* Fill an obd_client_handle from the open intent's MDS reply: copy the
 * MDS file handle, record fid/flags/ioepoch, and register the request for
 * open replay.  Returns md_set_open_replay_data()'s result. */
356 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
357 struct lookup_intent *it, struct obd_client_handle *och)
359 struct ptlrpc_request *req = it->d.lustre.it_data;
360 struct mdt_body *body;
364 body = lustre_msg_buf(req->rq_repmsg, DLM_REPLY_REC_OFF, sizeof(*body));
365 LASSERT(body != NULL); /* reply already checked out */
366 LASSERT_REPSWABBED(req, DLM_REPLY_REC_OFF); /* and swabbed in md_enqueue */
368 memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
369 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
370 och->och_fid = lli->lli_fid;
371 och->och_flags = it->it_flags;
/* ioepoch from the reply drives Size-on-MDS epoch accounting on close. */
372 lli->lli_ioepoch = body->ioepoch;
374 return md_set_open_replay_data(md_exp, och, req);
/* Complete the client-side part of an open: optionally fill @och from the
 * intent reply (when och != NULL, per the truncated guard), then attach
 * @fd as the file's private data and record the open mode.
 * NOTE(review): truncated — the `if (och)` guard, rc error handling, and
 * the LLIF_MDS_SIZE_LOCK handling implied by the OBD_MD_FLSIZE check are
 * not fully visible here. */
377 int ll_local_open(struct file *file, struct lookup_intent *it,
378 struct ll_file_data *fd, struct obd_client_handle *och)
380 struct inode *inode = file->f_dentry->d_inode;
381 struct ll_inode_info *lli = ll_i2info(inode);
384 LASSERT(!LUSTRE_FPRIVATE(file));
389 struct ptlrpc_request *req = it->d.lustre.it_data;
390 struct mdt_body *body;
393 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
397 body = lustre_msg_buf(req->rq_repmsg,
398 DLM_REPLY_REC_OFF, sizeof(*body));
400 if ((it->it_flags & FMODE_WRITE) &&
401 (body->valid & OBD_MD_FLSIZE))
403 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
404 lli->lli_ioepoch, PFID(&lli->lli_fid));
408 LUSTRE_FPRIVATE(file) = fd;
409 ll_readahead_init(inode, &fd->fd_ras);
410 fd->fd_omode = it->it_flags;
414 /* Open a file, and (for the very first open) create objects on the OSTs at
415 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
416 * creation or open until ll_lov_setstripe() ioctl is called. We grab
417 * lli_open_sem to ensure no other process will create objects, send the
418 * stripe MD to the MDS, or try to destroy the objects if that fails.
420 * If we already have the stripe MD locally then we don't request it in
421 * md_open(), by passing a lmm_size = 0.
423 * It is up to the application to ensure no other processes open this file
424 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
425 * used. We might be able to avoid races of that sort by getting lli_open_sem
426 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
427 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/* VFS ->open() handler.  Either reuses an existing cached MDS open handle
 * for this open mode, or performs an intent open (possibly re-driving it
 * via ll_intent_file_open() when there is no prior intent disposition),
 * then completes with ll_local_open().  See the block comment above for
 * the O_LOV_DELAY_CREATE object-creation protocol.
 * NOTE(review): heavily truncated listing — rc/och_usecount declarations,
 * many braces, the restart label, and several error paths are missing;
 * inline comments below are scoped to what is visible. */
431 int ll_file_open(struct inode *inode, struct file *file)
432 struct ll_inode_info *lli = ll_i2info(inode);
433 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
434 .it_flags = file->f_flags };
435 struct lov_stripe_md *lsm;
436 struct ptlrpc_request *req = NULL;
437 struct obd_client_handle **och_p;
438 struct ll_file_data *fd;
442 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
443 inode->i_generation, inode, file->f_flags);
445 #ifdef LUSTRE_KERNEL_VERSION
/* Patched-kernel path: the lookup intent was stashed in private_data. */
448 it = file->private_data; /* XXX: compat macro */
449 file->private_data = NULL; /* prevent ll_local_open assertion */
452 fd = ll_file_data_get();
456 /* don't do anything for / */
457 if (inode->i_sb->s_root == file->f_dentry) {
458 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent: build our own open intent from f_flags. */
462 if (!it || !it->d.lustre.it_disposition) {
463 /* Convert f_flags into access mode. We cannot use file->f_mode,
464 * because everything but O_ACCMODE mask was stripped from
466 if ((oit.it_flags + 1) & O_ACCMODE)
468 if (oit.it_flags & O_TRUNC)
469 oit.it_flags |= FMODE_WRITE;
471 if (oit.it_flags & O_CREAT)
472 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
474 /* NFS hack - some strange NFS clients create files with zero
475 * permission bits, and then expect to be able to open such
476 * files. We are relying on real VFS client to do ll_permission
477 * first before coming here, so if we got here, we either came
478 * from NFS or all access checks ar eok, so it is safe to set
479 * this flag in any case (XXX - race with chmod?)
481 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
483 /* We do not want O_EXCL here, presumably we opened the file
484 * already? XXX - NFS implications? */
485 oit.it_flags &= ~O_EXCL;
490 /* Let's see if we have file open on MDS already. */
491 if (it->it_flags & FMODE_WRITE) {
492 och_p = &lli->lli_mds_write_och;
493 och_usecount = &lli->lli_open_fd_write_count;
494 } else if (it->it_flags & FMODE_EXEC) {
495 och_p = &lli->lli_mds_exec_och;
496 och_usecount = &lli->lli_open_fd_exec_count;
498 och_p = &lli->lli_mds_read_och;
499 och_usecount = &lli->lli_open_fd_read_count;
501 down(&lli->lli_och_sem);
502 if (*och_p) { /* Open handle is present */
503 if (it_disposition(it, DISP_OPEN_OPEN)) {
504 /* Well, there's extra open request that we do not need,
505 let's close it somehow. This will decref request. */
506 ll_release_openhandle(file->f_dentry, it);
/* Reuse the cached handle: och == NULL tells ll_local_open to skip
 * filling a new one. */
510 rc = ll_local_open(file, it, fd, NULL);
512 up(&lli->lli_och_sem);
513 ll_file_data_put(fd);
517 LASSERT(*och_usecount == 0);
518 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
520 ll_file_data_put(fd);
521 GOTO(out_och_free, rc = -ENOMEM);
524 if (!it->d.lustre.it_disposition) {
/* No intent yet (e.g. NFSD path): do the enqueue ourselves. */
525 rc = ll_intent_file_open(file, NULL, 0, it);
527 ll_file_data_put(fd);
528 GOTO(out_och_free, rc);
531 /* Got some error? Release the request */
532 if (it->d.lustre.it_status < 0) {
533 req = it->d.lustre.it_data;
534 ptlrpc_req_finished(req);
536 md_set_lock_data(ll_i2sbi(inode)->ll_md_exp,
537 &it->d.lustre.it_lock_handle,
538 file->f_dentry->d_inode);
540 req = it->d.lustre.it_data;
542 /* md_intent_lock() didn't get a request ref if there was an
543 * open error, so don't do cleanup on the request here
545 /* XXX (green): Should not we bail out on any error here, not
546 * just open error? */
547 rc = it_open_error(DISP_OPEN_OPEN, it);
549 ll_file_data_put(fd);
550 GOTO(out_och_free, rc);
553 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_OPEN);
554 rc = ll_local_open(file, it, fd, *och_p);
556 up(&lli->lli_och_sem);
557 ll_file_data_put(fd);
558 GOTO(out_och_free, rc);
561 up(&lli->lli_och_sem);
563 /* Must do this outside lli_och_sem lock to prevent deadlock where
564 different kind of OPEN lock for this same inode gets cancelled
565 by ldlm_cancel_lru */
566 if (!S_ISREG(inode->i_mode))
573 if (file->f_flags & O_LOV_DELAY_CREATE ||
574 !(file->f_mode & FMODE_WRITE)) {
575 CDEBUG(D_INODE, "object creation was delayed\n");
579 file->f_flags &= ~O_LOV_DELAY_CREATE;
582 ptlrpc_req_finished(req);
584 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* out_och_free error path: release the handle slot we allocated. */
588 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
589 *och_p = NULL; /* OBD_FREE writes some magic there */
592 up(&lli->lli_och_sem);
598 /* Fills the obdo with the attributes for the inode defined by lsm */
599 int ll_inode_getattr(struct inode *inode, struct obdo *obdo)
601 struct ptlrpc_request_set *set;
602 struct ll_inode_info *lli = ll_i2info(inode);
603 struct lov_stripe_md *lsm = lli->lli_smd;
605 struct obd_info oinfo = { { { 0 } } };
609 LASSERT(lsm != NULL);
/* Request id/type plus size, blocks, blksize and m/ctime from the OSTs.
 * NOTE(review): truncated — oinfo.oi_md / oi_oa assignment lines are not
 * visible in this listing. */
613 oinfo.oi_oa->o_id = lsm->lsm_object_id;
614 oinfo.oi_oa->o_gr = lsm->lsm_object_gr;
615 oinfo.oi_oa->o_mode = S_IFREG;
616 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
617 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
618 OBD_MD_FLBLKSZ | OBD_MD_FLMTIME |
619 OBD_MD_FLCTIME | OBD_MD_FLGROUP;
620 oinfo.oi_capa = ll_mdscapa_get(inode);
622 set = ptlrpc_prep_set();
624 CERROR("can't allocate ptlrpc set\n");
/* Fire the getattr to the data export and wait for all replies. */
627 rc = obd_getattr_async(ll_i2dtexp(inode), &oinfo, set);
629 rc = ptlrpc_set_wait(set);
630 ptlrpc_set_destroy(set);
632 capa_put(oinfo.oi_capa);
/* Keep only the attributes the OSTs are authoritative for. */
636 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
637 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
640 obdo_refresh_inode(inode, oinfo.oi_oa, oinfo.oi_oa->o_valid);
641 CDEBUG(D_INODE, "objid "LPX64" size %Lu, blocks %lu, blksize %lu\n",
642 lli->lli_smd->lsm_object_id, inode->i_size, inode->i_blocks,
/* Strip the setuid (and conditionally setgid) bits from the in-core inode
 * mode on write by an unprivileged caller, mirroring the kernel's
 * remove_suid() semantics.  Does NOT push the change to the MDS here. */
647 static inline void ll_remove_suid(struct inode *inode)
651 /* set S_IGID if S_IXGRP is set, and always set S_ISUID */
652 mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
654 /* was any of the uid bits set? */
655 mode &= inode->i_mode;
656 if (mode && !capable(CAP_FSETID)) {
657 inode->i_mode &= ~mode;
658 // XXX careful here - we cannot change the size
/* Map an extent DLM @lock (granted against one OST object) back to the
 * stripe index it covers within @inode's lov_stripe_md, via obd_get_info
 * with the "lock_to_stripe" key.  Sanity-checks that the lock's resource
 * name matches the stripe's object id/group.
 * Returns the stripe index, or a negative error (per the visible
 * -ELDLM_NO_LOCK_DATA path); the success RETURN is truncated from view. */
662 static int ll_lock_to_stripe_offset(struct inode *inode, struct ldlm_lock *lock)
664 struct ll_inode_info *lli = ll_i2info(inode);
665 struct lov_stripe_md *lsm = lli->lli_smd;
666 struct obd_export *exp = ll_i2dtexp(inode);
669 struct ldlm_lock *lock;
670 struct lov_stripe_md *lsm;
671 } key = { .name = "lock_to_stripe", .lock = lock, .lsm = lsm };
672 __u32 stripe, vallen = sizeof(stripe);
/* Single-striped file: the only possible answer is stripe 0. */
676 if (lsm->lsm_stripe_count == 1)
677 GOTO(check, stripe = 0);
679 /* get our offset in the lov */
680 rc = obd_get_info(exp, sizeof(key), &key, &vallen, &stripe);
682 CERROR("obd_get_info: rc = %d\n", rc);
685 LASSERT(stripe < lsm->lsm_stripe_count);
/* Resource name words: [0] = object id, [2] = object group. */
688 if (lsm->lsm_oinfo[stripe].loi_id != lock->l_resource->lr_name.name[0]||
689 lsm->lsm_oinfo[stripe].loi_gr != lock->l_resource->lr_name.name[2]){
690 LDLM_ERROR(lock, "resource doesn't match object "LPU64"/"LPU64,
691 lsm->lsm_oinfo[stripe].loi_id,
692 lsm->lsm_oinfo[stripe].loi_gr);
693 RETURN(-ELDLM_NO_LOCK_DATA);
699 /* Flush the page cache for an extent as its canceled. When we're on an LOV,
700 * we get a lock cancellation for each stripe, so we have to map the obd's
701 * region back onto the stripes in the file that it held.
703 * No one can dirty the extent until we've finished our work and they can
704 * enqueue another lock. The DLM protects us from ll_file_read/write here,
705 * but other kernel actors could have pages locked.
707 * Called with the DLM lock held. */
/* See the block comment above: on cancellation of an extent lock, map the
 * OST-object extent back onto this file's page indices for @stripe, tear
 * down intersecting mmaps, write back dirty pages not covered by another
 * lock, and truncate/discard them from the page cache.
 * NOTE(review): heavily truncated — loop braces, `continue`s, the page
 * NULL check after find_get_page(), and lock_page/unlock_page calls are
 * missing from this listing; comments describe only visible lines. */
708 void ll_pgcache_remove_extent(struct inode *inode, struct lov_stripe_md *lsm,
709 struct ldlm_lock *lock, __u32 stripe)
711 ldlm_policy_data_t tmpex;
712 unsigned long start, end, count, skip, i, j;
714 int rc, rc2, l_flags, discard = lock->l_flags & LDLM_FL_DISCARD_DATA;
715 struct lustre_handle lockh;
718 memcpy(&tmpex, &lock->l_policy_data, sizeof(tmpex));
719 CDEBUG(D_INODE|D_PAGE, "inode %lu(%p) ["LPU64"->"LPU64"] size: %llu\n",
720 inode->i_ino, inode, tmpex.l_extent.start, tmpex.l_extent.end,
723 /* our locks are page granular thanks to osc_enqueue, we invalidate the
725 if ((tmpex.l_extent.start & ~PAGE_CACHE_MASK) != 0 ||
726 ((tmpex.l_extent.end + 1) & ~PAGE_CACHE_MASK) != 0)
727 LDLM_ERROR(lock, "lock not aligned on PAGE_SIZE %lu",PAGE_SIZE);
728 LASSERT((tmpex.l_extent.start & ~PAGE_CACHE_MASK) == 0);
729 LASSERT(((tmpex.l_extent.end + 1) & ~PAGE_CACHE_MASK) == 0);
/* Convert the OST-object byte extent into file page indices, accounting
 * for striping: each stripe contributes `count` pages per stripe-width
 * and `skip` pages belong to the other stripes in between. */
733 start = tmpex.l_extent.start >> PAGE_CACHE_SHIFT;
734 end = tmpex.l_extent.end >> PAGE_CACHE_SHIFT;
735 if (lsm->lsm_stripe_count > 1) {
736 count = lsm->lsm_stripe_size >> PAGE_CACHE_SHIFT;
737 skip = (lsm->lsm_stripe_count - 1) * count;
738 start += start/count * skip + stripe * count;
740 end += end/count * skip + stripe * count;
/* Overflow guard on the stripe-mapped end index (truncated branch). */
742 if (end < tmpex.l_extent.end >> PAGE_CACHE_SHIFT)
745 i = inode->i_size ? (inode->i_size - 1) >> PAGE_CACHE_SHIFT : 0;
749 CDEBUG(D_INODE|D_PAGE, "walking page indices start: %lu j: %lu "
750 "count: %lu skip: %lu end: %lu%s\n", start, start % count,
751 count, skip, end, discard ? " (DISCARDING)" : "");
753 /* walk through the vmas on the inode and tear down mmaped pages that
754 * intersect with the lock. this stops immediately if there are no
755 * mmap()ed regions of the file. This is not efficient at all and
756 * should be short lived. We'll associate mmap()ed pages with the lock
757 * and will be able to find them directly */
758 for (i = start; i <= end; i += (j + skip)) {
759 j = min(count - (i % count), end - i + 1);
761 LASSERT(inode->i_mapping);
762 if (ll_teardown_mmaps(inode->i_mapping,
763 (__u64)i << PAGE_CACHE_SHIFT,
764 ((__u64)(i+j) << PAGE_CACHE_SHIFT) - 1) )
768 /* this is the simplistic implementation of page eviction at
769 * cancelation. It is careful to get races with other page
770 * lockers handled correctly. fixes from bug 20 will make it
771 * more efficient by associating locks with pages and with
772 * batching writeback under the lock explicitly. */
773 for (i = start, j = start % count; i <= end;
774 j++, i++, tmpex.l_extent.start += PAGE_CACHE_SIZE) {
776 CDEBUG(D_PAGE, "skip index %lu to %lu\n", i, i + skip);
782 LASSERTF(tmpex.l_extent.start< lock->l_policy_data.l_extent.end,
783 LPU64" >= "LPU64" start %lu i %lu end %lu\n",
784 tmpex.l_extent.start, lock->l_policy_data.l_extent.end,
787 if (!mapping_has_pages(inode->i_mapping)) {
788 CDEBUG(D_INODE|D_PAGE, "nothing left\n");
794 page = find_get_page(inode->i_mapping, i);
797 LL_CDEBUG_PAGE(D_PAGE, page, "lock page idx %lu ext "LPU64"\n",
798 i, tmpex.l_extent.start);
801 /* page->mapping to check with racing against teardown */
802 if (!discard && clear_page_dirty_for_io(page)) {
803 rc = ll_call_writepage(inode, page);
805 CERROR("writepage of page %p failed: %d\n",
807 /* either waiting for io to complete or reacquiring
808 * the lock that the failed writepage released */
/* Only truncate the page if no OTHER granted lock still covers it. */
812 tmpex.l_extent.end = tmpex.l_extent.start + PAGE_CACHE_SIZE - 1;
813 l_flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
814 /* check to see if another DLM lock covers this page b=2765 */
815 rc2 = ldlm_lock_match(lock->l_resource->lr_namespace,
816 l_flags, &lock->l_resource->lr_name,
817 LDLM_EXTENT, &tmpex, LCK_PR | LCK_PW, &lockh);
819 if (rc2 <= 0 && page->mapping != NULL) {
820 struct ll_async_page *llap = llap_cast_private(page);
821 // checking again to account for writeback's lock_page()
822 LL_CDEBUG_PAGE(D_PAGE, page, "truncating\n");
824 ll_ra_accounting(llap, inode->i_mapping);
825 ll_truncate_complete_page(page);
828 page_cache_release(page);
830 LASSERTF(tmpex.l_extent.start <=
831 (lock->l_policy_data.l_extent.end == ~0ULL ? ~0ULL :
832 lock->l_policy_data.l_extent.end + 1),
833 "loop too long "LPU64" > "LPU64" start %lu i %lu end %lu\n",
834 tmpex.l_extent.start, lock->l_policy_data.l_extent.end,
/* DLM blocking/cancel callback for llite extent locks.  On BLOCKING,
 * cancel our reference; on CANCELING, evict covered pages from the page
 * cache (via ll_pgcache_remove_extent) and shrink the stripe's known
 * minimum size (kms) accordingly.
 * NOTE(review): truncated — the flag switch header, iput/`break`s, lsm
 * assignment from lli, and the final RETURN are missing from this view. */
839 static int ll_extent_lock_callback(struct ldlm_lock *lock,
840 struct ldlm_lock_desc *new, void *data,
843 struct lustre_handle lockh = { 0 };
/* Catch obviously-bogus small-integer cbdata values (stale pointers). */
847 if ((unsigned long)data > 0 && (unsigned long)data < 0x1000) {
848 LDLM_ERROR(lock, "cancelling lock with bad data %p", data);
853 case LDLM_CB_BLOCKING:
854 ldlm_lock2handle(lock, &lockh);
855 rc = ldlm_cli_cancel(&lockh);
857 CERROR("ldlm_cli_cancel failed: %d\n", rc);
859 case LDLM_CB_CANCELING: {
861 struct ll_inode_info *lli;
862 struct lov_stripe_md *lsm;
866 /* This lock wasn't granted, don't try to evict pages */
867 if (lock->l_req_mode != lock->l_granted_mode)
870 inode = ll_inode_from_lock(lock);
873 lli = ll_i2info(inode);
876 if (lli->lli_smd == NULL)
880 stripe = ll_lock_to_stripe_offset(inode, lock);
884 ll_pgcache_remove_extent(inode, lsm, lock, stripe);
/* Recompute kms with this lock excluded; ordering: stripe lock outside,
 * resource lock inside. */
886 lov_stripe_lock(lsm);
887 lock_res_and_lock(lock);
888 kms = ldlm_extent_shift_kms(lock,
889 lsm->lsm_oinfo[stripe].loi_kms);
891 if (lsm->lsm_oinfo[stripe].loi_kms != kms)
892 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
893 lsm->lsm_oinfo[stripe].loi_kms, kms);
894 lsm->lsm_oinfo[stripe].loi_kms = kms;
895 unlock_res_and_lock(lock);
896 lov_stripe_unlock(lsm);
/* Completion AST for client-side async extent enqueues (glimpses): when
 * the lock is granted, fold the returned LVB size into the stripe's rss
 * and kms, wake waiters, and drop our PR reference.
 * NOTE(review): truncated — lvb declaration, braces and RETURN are not
 * visible; blocked-lock handling below is explicitly unexpected (LBUG). */
909 int ll_async_completion_ast(struct ldlm_lock *lock, int flags, void *data)
911 /* XXX ALLOCATE - 160 bytes */
912 struct inode *inode = ll_inode_from_lock(lock);
913 struct ll_inode_info *lli = ll_i2info(inode);
914 struct lustre_handle lockh = { 0 };
919 if (flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED |
920 LDLM_FL_BLOCK_CONV)) {
921 LBUG(); /* not expecting any blocked async locks yet */
922 LDLM_DEBUG(lock, "client-side async enqueue returned a blocked "
924 ldlm_lock_dump(D_OTHER, lock, 0);
925 ldlm_reprocess_all(lock->l_resource);
929 LDLM_DEBUG(lock, "client-side async enqueue: granted/glimpsed");
931 stripe = ll_lock_to_stripe_offset(inode, lock);
935 if (lock->l_lvb_len) {
936 struct lov_stripe_md *lsm = lli->lli_smd;
938 lvb = lock->l_lvb_data;
939 lsm->lsm_oinfo[stripe].loi_rss = lvb->lvb_size;
/* kms never shrinks here: take the max of current kms and LVB size,
 * under inode mutex + resource lock. */
941 LOCK_INODE_MUTEX(inode);
942 lock_res_and_lock(lock);
943 kms = MAX(lsm->lsm_oinfo[stripe].loi_kms, lvb->lvb_size);
944 kms = ldlm_extent_shift_kms(NULL, kms);
945 if (lsm->lsm_oinfo[stripe].loi_kms != kms)
946 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
947 lsm->lsm_oinfo[stripe].loi_kms, kms);
948 lsm->lsm_oinfo[stripe].loi_kms = kms;
949 unlock_res_and_lock(lock);
950 UNLOCK_INODE_MUTEX(inode);
955 wake_up(&lock->l_waitq);
957 ldlm_lock2handle(lock, &lockh);
958 ldlm_lock_decref(&lockh, LCK_PR);
/* Glimpse AST: a peer wants this client's view of the file size.  Pack an
 * ost_lvb reply with the stripe's kms and the inode timestamps.
 * -ELDLM_NO_LOCK_DATA races (no inode/lsm/stripe) are answered with an
 * empty reply rather than ptlrpc_error() to keep the console quiet.
 * NOTE(review): truncated — lvb declaration, GOTO labels (out/iput) and
 * the iput()/RETURN tail are not visible in this listing. */
963 static int ll_glimpse_callback(struct ldlm_lock *lock, void *reqp)
965 struct ptlrpc_request *req = reqp;
966 struct inode *inode = ll_inode_from_lock(lock);
967 struct ll_inode_info *lli;
968 struct lov_stripe_md *lsm;
971 int size[2] = { sizeof(struct ptlrpc_body), sizeof(*lvb) };
975 GOTO(out, rc = -ELDLM_NO_LOCK_DATA);
976 lli = ll_i2info(inode);
978 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
981 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
983 /* First, find out which stripe index this lock corresponds to. */
984 stripe = ll_lock_to_stripe_offset(inode, lock);
986 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
988 rc = lustre_pack_reply(req, 2, size, NULL);
990 CERROR("lustre_pack_reply: %d\n", rc);
/* Reply LVB: known-minimum-size for this stripe plus inode timestamps. */
994 lvb = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof(*lvb));
995 lvb->lvb_size = lli->lli_smd->lsm_oinfo[stripe].loi_kms;
996 lvb->lvb_mtime = LTIME_S(inode->i_mtime);
997 lvb->lvb_atime = LTIME_S(inode->i_atime);
998 lvb->lvb_ctime = LTIME_S(inode->i_ctime);
1000 LDLM_DEBUG(lock, "i_size: %llu -> stripe number %u -> kms "LPU64
1001 " atime "LPU64", mtime "LPU64", ctime "LPU64,
1002 inode->i_size, stripe, lvb->lvb_size, lvb->lvb_mtime,
1003 lvb->lvb_atime, lvb->lvb_ctime);
1008 /* These errors are normal races, so we don't want to fill the console
1009 * with messages by calling ptlrpc_error() */
1010 if (rc == -ELDLM_NO_LOCK_DATA)
1011 lustre_pack_reply(req, 1, NULL, NULL);
1013 req->rq_status = rc;
/* Merge the per-stripe LVB attributes (size, blocks, times) from the LOV
 * layer into the in-core inode, under the llite inode size lock. */
1017 static void ll_merge_lvb(struct inode *inode)
1019 struct ll_inode_info *lli = ll_i2info(inode);
1020 struct ll_sb_info *sbi = ll_i2sbi(inode);
1024 ll_inode_size_lock(inode, 1);
1025 inode_init_lvb(inode, &lvb);
1026 obd_merge_lvb(sbi->ll_dt_exp, lli->lli_smd, &lvb, 0);
1027 inode->i_size = lvb.lvb_size;
1028 inode->i_blocks = lvb.lvb_blocks;
1029 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1030 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1031 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1032 ll_inode_size_unlock(inode, 1);
/* Try to satisfy a size request from locally cached DLM locks: match an
 * existing [0, EOF] PR/PW extent lock, and if found merge the LVB into
 * the inode and drop the match reference via obd_cancel().
 * NOTE(review): truncated — flags declaration, the rc<0/rc==0 handling
 * between match and merge, and the RETURN are not visible here. */
1036 int ll_local_size(struct inode *inode)
1038 ldlm_policy_data_t policy = { .l_extent = { 0, OBD_OBJECT_EOF } };
1039 struct ll_inode_info *lli = ll_i2info(inode);
1040 struct ll_sb_info *sbi = ll_i2sbi(inode);
1041 struct lustre_handle lockh = { 0 };
1046 if (lli->lli_smd->lsm_stripe_count == 0)
1049 rc = obd_match(sbi->ll_dt_exp, lli->lli_smd, LDLM_EXTENT,
1050 &policy, LCK_PR | LCK_PW, &flags, inode, &lockh);
1056 ll_merge_lvb(inode);
/* obd_cancel here releases the reference obd_match took, not the lock. */
1057 obd_cancel(sbi->ll_dt_exp, lli->lli_smd, LCK_PR | LCK_PW, &lockh);
/* Glimpse a file described only by @lsm (no inode — ioctl path) and fill
 * the caller's stat structure with the merged LVB attributes.
 * Uses an intent (LDLM_FL_HAS_INTENT) PR enqueue so conflicting locks are
 * glimpsed rather than revoked.
 * NOTE(review): truncated — the `st` parameter and lvb declarations, the
 * oinfo.oi_md assignment, and the RETURN are not visible in this view. */
1061 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1064 struct lustre_handle lockh = { 0 };
1065 struct obd_enqueue_info einfo = { 0 };
1066 struct obd_info oinfo = { { { 0 } } };
1072 einfo.ei_type = LDLM_EXTENT;
1073 einfo.ei_mode = LCK_PR;
1074 einfo.ei_flags = LDLM_FL_HAS_INTENT;
1075 einfo.ei_cb_bl = ll_extent_lock_callback;
1076 einfo.ei_cb_cp = ldlm_completion_ast;
1077 einfo.ei_cb_gl = ll_glimpse_callback;
/* No inode available on this path, hence NULL cbdata (cf. ll_glimpse_size). */
1078 einfo.ei_cbdata = NULL;
1080 oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
1081 oinfo.oi_lockh = &lockh;
1084 rc = obd_enqueue_rqset(sbi->ll_dt_exp, &oinfo, &einfo);
1088 CERROR("obd_enqueue returned rc %d, "
1089 "returning -EIO\n", rc);
1090 RETURN(rc > 0 ? -EIO : rc);
1093 lov_stripe_lock(lsm);
1094 memset(&lvb, 0, sizeof(lvb));
1095 obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 0);
1096 st->st_size = lvb.lvb_size;
1097 st->st_blocks = lvb.lvb_blocks;
1098 st->st_mtime = lvb.lvb_mtime;
1099 st->st_atime = lvb.lvb_atime;
1100 st->st_ctime = lvb.lvb_ctime;
1101 lov_stripe_unlock(lsm);
1106 /* NB: obd_merge_lvb will prefer locally cached writes if they extend the
1107 * file (because it prefers KMS over RSS when larger) */
/* Refresh @inode's size/blocks via a glimpse: an intent PR extent enqueue
 * over [0, EOF] that makes every lock holder report its size (see the
 * NOTE below), then merge the resulting LVB into the inode.
 * Short-circuits when the MDS holds the authoritative size
 * (LLIF_MDS_SIZE_LOCK) or the file has no objects.
 * NOTE(review): truncated — RETURNs and some braces are not visible. */
1110 struct ll_inode_info *lli = ll_i2info(inode);
1111 struct ll_sb_info *sbi = ll_i2sbi(inode);
1112 struct lustre_handle lockh = { 0 };
1113 struct obd_enqueue_info einfo = { 0 };
1114 struct obd_info oinfo = { { { 0 } } };
1118 if (lli->lli_flags & LLIF_MDS_SIZE_LOCK)
1121 CDEBUG(D_DLMTRACE, "Glimpsing inode %lu\n", inode->i_ino);
1123 if (!lli->lli_smd) {
1124 CDEBUG(D_DLMTRACE, "No objects for inode %lu\n", inode->i_ino);
1128 /* NOTE: this looks like DLM lock request, but it may not be one. Due
1129 * to LDLM_FL_HAS_INTENT flag, this is glimpse request, that
1130 * won't revoke any conflicting DLM locks held. Instead,
1131 * ll_glimpse_callback() will be called on each client
1132 * holding a DLM lock against this file, and resulting size
1133 * will be returned for each stripe. DLM lock on [0, EOF] is
1134 * acquired only if there were no conflicting locks. */
1135 einfo.ei_type = LDLM_EXTENT;
1136 einfo.ei_mode = LCK_PR;
1137 einfo.ei_flags = ast_flags | LDLM_FL_HAS_INTENT;
1138 einfo.ei_cb_bl = ll_extent_lock_callback;
1139 einfo.ei_cb_cp = ldlm_completion_ast;
1140 einfo.ei_cb_gl = ll_glimpse_callback;
1141 einfo.ei_cbdata = inode;
1143 oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
1144 oinfo.oi_lockh = &lockh;
1145 oinfo.oi_md = lli->lli_smd;
1147 rc = obd_enqueue_rqset(sbi->ll_dt_exp, &oinfo, &einfo);
1151 CERROR("obd_enqueue returned rc %d, returning -EIO\n", rc);
1152 RETURN(rc > 0 ? -EIO : rc);
1155 ll_merge_lvb(inode);
1157 CDEBUG(D_DLMTRACE, "glimpse: size: %llu, blocks: %lu\n",
1158 inode->i_size, inode->i_blocks);
/* Take an extent DLM lock of @mode on @policy's range for @inode, then
 * refresh inode attributes from the merged LVB under the size lock (see
 * the long ordering comment below).  Locking is skipped entirely for
 * LL_FILE_IGNORE_LOCK fds and NOLCK mounts.
 * NOTE(review): truncated — the ast_flags parameter, lvb declaration,
 * rc checks after obd_enqueue, and RETURNs are not visible here. */
1163 int ll_extent_lock(struct ll_file_data *fd, struct inode *inode,
1164 struct lov_stripe_md *lsm, int mode,
1165 ldlm_policy_data_t *policy, struct lustre_handle *lockh,
1168 struct ll_sb_info *sbi = ll_i2sbi(inode);
1170 struct obd_enqueue_info einfo = { 0 };
1171 struct obd_info oinfo = { { { 0 } } };
1175 LASSERT(!lustre_handle_is_used(lockh));
1176 LASSERT(lsm != NULL);
1178 /* don't drop the mmapped file to LRU */
1179 if (mapping_mapped(inode->i_mapping))
1180 ast_flags |= LDLM_FL_NO_LRU;
1182 /* XXX phil: can we do this? won't it screw the file size up? */
1183 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1184 (sbi->ll_flags & LL_SBI_NOLCK))
1187 CDEBUG(D_DLMTRACE, "Locking inode %lu, start "LPU64" end "LPU64"\n",
1188 inode->i_ino, policy->l_extent.start, policy->l_extent.end);
1190 einfo.ei_type = LDLM_EXTENT;
1191 einfo.ei_mode = mode;
1192 einfo.ei_flags = ast_flags;
1193 einfo.ei_cb_bl = ll_extent_lock_callback;
1194 einfo.ei_cb_cp = ldlm_completion_ast;
1195 einfo.ei_cb_gl = ll_glimpse_callback;
1196 einfo.ei_cbdata = inode;
1198 oinfo.oi_policy = *policy;
1199 oinfo.oi_lockh = lockh;
1202 rc = obd_enqueue(sbi->ll_dt_exp, &oinfo, &einfo);
/* The server may have shrunk/grown the extent; report it back. */
1203 *policy = oinfo.oi_policy;
1207 ll_inode_size_lock(inode, 1);
1208 inode_init_lvb(inode, &lvb);
1209 obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 0);
1211 if (policy->l_extent.start == 0 &&
1212 policy->l_extent.end == OBD_OBJECT_EOF) {
1213 /* vmtruncate()->ll_truncate() first sets the i_size and then
1214 * the kms under both a DLM lock and the
1215 * ll_inode_size_lock(). If we don't get the
1216 * ll_inode_size_lock() here we can match the DLM lock and
1217 * reset i_size from the kms before the truncating path has
1218 * updated the kms. generic_file_write can then trust the
1219 * stale i_size when doing appending writes and effectively
1220 * cancel the result of the truncate. Getting the
1221 * ll_inode_size_lock() after the enqueue maintains the DLM
1222 * -> ll_inode_size_lock() acquiring order. */
1223 inode->i_size = lvb.lvb_size;
1227 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1228 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1229 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1231 ll_inode_size_unlock(inode, 1);
/*
 * Release an extent lock previously taken with ll_extent_lock().
 * Mirrors the lock path: if DLM locking is being ignored for this file
 * descriptor or mount, there is no lock to cancel.
 *
 * NOTE(review): listing elides lines; body shown is partial.
 */
1236 int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode,
1237 struct lov_stripe_md *lsm, int mode,
1238 struct lustre_handle *lockh)
1240 struct ll_sb_info *sbi = ll_i2sbi(inode);
1244 /* XXX phil: can we do this? won't it screw the file size up? */
1245 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1246 (sbi->ll_flags & LL_SBI_NOLCK))
/* cancel the lock on the data (OST) export */
1249 rc = obd_cancel(sbi->ll_dt_exp, lsm, mode, lockh);
/*
 * read(2) entry point: take a PR extent lock over the requested range
 * (possibly chunked by ll_max_rw_chunk), validate the range against the
 * known-minimum-size (kms), then hand off to generic_file_read().
 *
 * Files with no OST objects are serviced directly from i_size by
 * zero-filling the user buffer (mknod+truncate / NFS pattern, bug 6243).
 *
 * NOTE(review): listing elides lines (e.g. the loop structure implied by
 * "sum" and the retry at 1399 is not fully visible); comments cover only
 * the visible code.
 */
1254 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1257 struct inode *inode = file->f_dentry->d_inode;
1258 struct ll_inode_info *lli = ll_i2info(inode);
1259 struct lov_stripe_md *lsm = lli->lli_smd;
1260 struct ll_sb_info *sbi = ll_i2sbi(inode);
1261 struct ll_lock_tree tree;
1262 struct ll_lock_tree_node *node;
1264 struct ll_ra_read bead;
1267 ssize_t retval, chunk, sum = 0;
1271 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1272 inode->i_ino, inode->i_generation, inode, count, *ppos);
1274 /* "If nbyte is 0, read() will return 0 and have no other results."
1275 * -- Single Unix Spec */
1279 lprocfs_counter_add(ll_i2sbi(inode)->ll_stats, LPROC_LL_READ_BYTES,
1283 /* Read on file with no objects should return zero-filled
1284 * buffers up to file size (we can get non-zero sizes with
1285 * mknod + truncate, then opening file for read. This is a
1286 * common pattern in NFS case, it seems). Bug 6243 */
1288 /* Since there are no objects on OSTs, we have nothing to get
1289 * lock on and so we are forced to access inode->i_size
1292 /* Read beyond end of file */
1293 if (*ppos >= inode->i_size)
1296 if (count > inode->i_size - *ppos)
1297 count = inode->i_size - *ppos;
1298 /* Make sure to correctly adjust the file pos pointer for
1300 notzeroed = clear_user(buf, count);
/* when chunking is enabled, clamp the locked range to the current
 * stripe end and to ll_max_rw_chunk */
1309 if (sbi->ll_max_rw_chunk != 0) {
1310 /* first, let's know the end of the current stripe */
1312 obd_extent_calc(sbi->ll_dt_exp, lsm, OBD_CALC_STRIPE_END, &end);
1314 /* correct, the end is beyond the request */
1315 if (end > *ppos + count - 1)
1316 end = *ppos + count - 1;
1318 /* and chunk shouldn't be too large even if striping is wide */
1319 if (end - *ppos > sbi->ll_max_rw_chunk)
1320 end = *ppos + sbi->ll_max_rw_chunk - 1;
1322 end = *ppos + count - 1;
/* take the PR extent lock for [*ppos, end] via the lock tree */
1325 node = ll_node_from_inode(inode, *ppos, end, LCK_PR);
1326 tree.lt_fd = LUSTRE_FPRIVATE(file);
1327 rc = ll_tree_lock(&tree, node, buf, count,
1328 file->f_flags & O_NONBLOCK ? LDLM_FL_BLOCK_NOWAIT :0);
1330 GOTO(out, retval = rc);
1332 ll_inode_size_lock(inode, 1);
1334 * Consistency guarantees: following possibilities exist for the
1335 * relation between region being read and real file size at this
1338 * (A): the region is completely inside of the file;
1340 * (B-x): x bytes of region are inside of the file, the rest is
1343 * (C): the region is completely outside of the file.
1345 * This classification is stable under DLM lock acquired by
1346 * ll_tree_lock() above, because to change class, other client has to
1347 * take DLM lock conflicting with our lock. Also, any updates to
1348 * ->i_size by other threads on this client are serialized by
1349 * ll_inode_size_lock(). This guarantees that short reads are handled
1350 * correctly in the face of concurrent writes and truncates.
1352 inode_init_lvb(inode, &lvb);
1353 obd_merge_lvb(ll_i2sbi(inode)->ll_dt_exp, lsm, &lvb, 1);
1355 if (*ppos + count - 1 > kms) {
1356 /* A glimpse is necessary to determine whether we return a
1357 * short read (B) or some zeroes at the end of the buffer (C) */
1358 ll_inode_size_unlock(inode, 1);
1359 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1361 ll_tree_unlock(&tree);
1365 /* region is within kms and, hence, within real file size (A) */
1366 inode->i_size = kms;
1367 ll_inode_size_unlock(inode, 1);
1370 chunk = end - *ppos + 1;
1371 CDEBUG(D_VFSTRACE, "Read ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1372 inode->i_ino, chunk, *ppos, inode->i_size);
1374 /* turn off the kernel's read-ahead */
1375 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
1378 file->f_ra.ra_pages = 0;
1380 /* initialize read-ahead window once per syscall */
1383 bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
1384 bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
1385 ll_ra_read_in(file, &bead);
/* delegate the actual page-cache copy to the generic VFS read path */
1389 file_accessed(file);
1390 retval = generic_file_read(file, buf, chunk, ppos);
1391 ll_rw_stats_tally(ll_i2sbi(inode), current->pid, file, count, 0);
1393 ll_tree_unlock(&tree);
/* if the whole chunk was read and bytes remain, another pass follows
 * (loop structure elided in this listing) */
1399 if (retval == chunk && count > 0)
1405 ll_ra_read_ex(file, &bead);
1406 retval = (sum > 0) ? sum : retval;
/*
 * write(2) entry point: serialize against other local writers via
 * lli_write_sem, take a PW extent lock over the affected range (EOF lock
 * for O_APPEND, otherwise possibly chunked by ll_max_rw_chunk), enforce
 * maxbytes, then hand off to generic_file_write().
 *
 * NOTE(review): listing elides lines (chunking loop at 1509 is not fully
 * visible); comments cover only the visible code.
 */
1411 * Write to a file (through the page cache).
1413 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1416 struct inode *inode = file->f_dentry->d_inode;
1417 struct ll_sb_info *sbi = ll_i2sbi(inode);
1418 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1419 struct ll_lock_tree tree;
1420 struct ll_lock_tree_node *node;
1421 loff_t maxbytes = ll_file_maxbytes(inode);
1422 loff_t lock_start, lock_end, end;
1423 ssize_t retval, chunk, sum = 0;
1426 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1427 inode->i_ino, inode->i_generation, inode, count, *ppos);
1429 SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */
1431 /* POSIX, but surprised the VFS doesn't check this already */
1435 /* If file was opened for LL_IOC_LOV_SETSTRIPE but the ioctl wasn't
1436 * called on the file, don't fail the below assertion (bug 2388). */
1437 if (file->f_flags & O_LOV_DELAY_CREATE &&
1438 ll_i2info(inode)->lli_smd == NULL)
1441 LASSERT(ll_i2info(inode)->lli_smd != NULL);
/* one local writer at a time per inode */
1443 down(&ll_i2info(inode)->lli_write_sem);
1446 chunk = 0; /* just to fix gcc's warning */
1447 end = *ppos + count - 1;
1449 if (file->f_flags & O_APPEND) {
/* append: must lock to EOF since the final offset is unknown */
1451 lock_end = OBD_OBJECT_EOF;
1452 } else if (sbi->ll_max_rw_chunk != 0) {
1453 /* first, let's know the end of the current stripe */
1455 obd_extent_calc(sbi->ll_dt_exp, lsm, OBD_CALC_STRIPE_END, &end);
1457 /* correct, the end is beyond the request */
1458 if (end > *ppos + count - 1)
1459 end = *ppos + count - 1;
1461 /* and chunk shouldn't be too large even if striping is wide */
1462 if (end - *ppos > sbi->ll_max_rw_chunk)
1463 end = *ppos + sbi->ll_max_rw_chunk - 1;
1468 lock_end = *ppos + count - 1;
/* take the PW extent lock for [lock_start, lock_end] */
1470 node = ll_node_from_inode(inode, lock_start, lock_end, LCK_PW);
1473 GOTO(out, retval = PTR_ERR(node));
1475 tree.lt_fd = LUSTRE_FPRIVATE(file);
1476 rc = ll_tree_lock(&tree, node, buf, count,
1477 file->f_flags & O_NONBLOCK ? LDLM_FL_BLOCK_NOWAIT :0);
1479 GOTO(out, retval = rc);
1481 /* this is ok, g_f_w will overwrite this under i_mutex if it races
1482 * with a local truncate, it just makes our maxbyte checking easier */
1483 if (file->f_flags & O_APPEND) {
1484 *ppos = inode->i_size;
1485 end = *ppos + count - 1;
/* enforce the per-file size limit, POSIX-style (SIGXFSZ + EFBIG) */
1488 if (*ppos >= maxbytes) {
1489 send_sig(SIGXFSZ, current, 0);
1490 GOTO(out, retval = -EFBIG);
1492 if (*ppos + count > maxbytes)
1493 count = maxbytes - *ppos;
1495 /* generic_file_write handles O_APPEND after getting i_mutex */
1496 chunk = end - *ppos + 1;
1497 CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %Lu\n",
1498 inode->i_ino, chunk, *ppos);
1499 retval = generic_file_write(file, buf, chunk, ppos);
1500 ll_rw_stats_tally(ll_i2sbi(inode), current->pid, file, count, 1);
1503 ll_tree_unlock(&tree);
/* if the whole chunk was written and bytes remain, another pass
 * follows (loop structure elided in this listing) */
1509 if (retval == chunk && count > 0)
1513 up(&ll_i2info(inode)->lli_write_sem);
1515 retval = (sum > 0) ? sum : retval;
1516 lprocfs_counter_add(ll_i2sbi(inode)->ll_stats, LPROC_LL_WRITE_BYTES,
1517 retval > 0 ? retval : 0);
/*
 * sendfile(2) support (2.6 kernels only): take a PR extent lock over the
 * requested range, validate it against kms exactly like ll_file_read(),
 * then delegate to generic_file_sendfile().  Files without OST objects
 * bypass locking entirely.
 *
 * NOTE(review): listing elides lines; comments cover only the visible code.
 */
1522 * Send file content (through pagecache) somewhere with helper
1524 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
1525 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1526 read_actor_t actor, void *target)
1528 struct inode *inode = in_file->f_dentry->d_inode;
1529 struct ll_inode_info *lli = ll_i2info(inode);
1530 struct lov_stripe_md *lsm = lli->lli_smd;
1531 struct ll_lock_tree tree;
1532 struct ll_lock_tree_node *node;
1534 struct ll_ra_read bead;
1539 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1540 inode->i_ino, inode->i_generation, inode, count, *ppos);
1542 /* "If nbyte is 0, read() will return 0 and have no other results."
1543 * -- Single Unix Spec */
1547 lprocfs_counter_add(ll_i2sbi(inode)->ll_stats, LPROC_LL_READ_BYTES,
1550 /* File with no objects, nothing to lock */
1552 RETURN(generic_file_sendfile(in_file, ppos, count, actor, target));
/* PR extent lock over the exact requested range (no chunking here) */
1554 node = ll_node_from_inode(inode, *ppos, *ppos + count - 1, LCK_PR);
1555 tree.lt_fd = LUSTRE_FPRIVATE(in_file);
1556 rc = ll_tree_lock(&tree, node, NULL, count,
1557 in_file->f_flags & O_NONBLOCK?LDLM_FL_BLOCK_NOWAIT:0);
1561 ll_inode_size_lock(inode, 1);
1563 * Consistency guarantees: following possibilities exist for the
1564 * relation between region being read and real file size at this
1567 * (A): the region is completely inside of the file;
1569 * (B-x): x bytes of region are inside of the file, the rest is
1572 * (C): the region is completely outside of the file.
1574 * This classification is stable under DLM lock acquired by
1575 * ll_tree_lock() above, because to change class, other client has to
1576 * take DLM lock conflicting with our lock. Also, any updates to
1577 * ->i_size by other threads on this client are serialized by
1578 * ll_inode_size_lock(). This guarantees that short reads are handled
1579 * correctly in the face of concurrent writes and truncates.
1581 inode_init_lvb(inode, &lvb);
1582 obd_merge_lvb(ll_i2sbi(inode)->ll_dt_exp, lsm, &lvb, 1);
1584 if (*ppos + count - 1 > kms) {
1585 /* A glimpse is necessary to determine whether we return a
1586 * short read (B) or some zeroes at the end of the buffer (C) */
1587 ll_inode_size_unlock(inode, 1);
1588 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1592 /* region is within kms and, hence, within real file size (A) */
1593 inode->i_size = kms;
1594 ll_inode_size_unlock(inode, 1);
1597 CDEBUG(D_INFO, "Send ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1598 inode->i_ino, count, *ppos, inode->i_size);
1600 /* turn off the kernel's read-ahead */
1601 in_file->f_ra.ra_pages = 0;
/* set up Lustre's own read-ahead window for this request */
1603 bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
1604 bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
1605 ll_ra_read_in(in_file, &bead);
1607 file_accessed(in_file);
1608 retval = generic_file_sendfile(in_file, ppos, count, actor, target);
1609 ll_ra_read_ex(in_file, &bead);
1612 ll_tree_unlock(&tree);
/*
 * LL_IOC_RECREATE_OBJ ioctl handler: recreate a lost/damaged OST object
 * for this file.  Admin-only (CAP_SYS_ADMIN).  Copies the request from
 * userspace, clones the stripe MD, marks the obdo with
 * OBD_FL_RECREATE_OBJS and calls obd_create() under lli_open_sem.
 *
 * NOTE(review): listing elides lines (oa allocation, RETURNs); comments
 * cover only the visible code.
 */
1617 static int ll_lov_recreate_obj(struct inode *inode, struct file *file,
1620 struct ll_inode_info *lli = ll_i2info(inode);
1621 struct obd_export *exp = ll_i2dtexp(inode);
1622 struct ll_recreate_obj ucreatp;
1623 struct obd_trans_info oti = { 0 };
1624 struct obdo *oa = NULL;
1627 struct lov_stripe_md *lsm, *lsm2;
1630 if (!capable (CAP_SYS_ADMIN))
1633 rc = copy_from_user(&ucreatp, (struct ll_recreate_obj *)arg,
1634 sizeof(struct ll_recreate_obj));
1642 down(&lli->lli_open_sem);
1645 GOTO(out, rc = -ENOENT);
/* size of stripe MD plus one lov_oinfo per stripe */
1646 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1647 (lsm->lsm_stripe_count));
1649 OBD_ALLOC(lsm2, lsm_size);
1651 GOTO(out, rc = -ENOMEM);
/* target object identity comes from the userspace request */
1653 oa->o_id = ucreatp.lrc_id;
1654 oa->o_gr = ucreatp.lrc_group;
1655 oa->o_nlink = ucreatp.lrc_ost_idx;
1656 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1657 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1658 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1659 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1661 oti.oti_objid = NULL;
1662 memcpy(lsm2, lsm, lsm_size);
1663 rc = obd_create(exp, oa, &lsm2, &oti);
1665 OBD_FREE(lsm2, lsm_size);
1668 up(&lli->lli_open_sem);
/*
 * Apply user-supplied striping (@lum) to a file by re-opening it with an
 * IT_OPEN intent carrying the LOV EA.  Fails if the file already has a
 * stripe MD.  Serialized by lli_open_sem.
 *
 * NOTE(review): listing elides lines; comments cover only the visible code.
 */
1673 static int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1674 int flags, struct lov_user_md *lum,
1677 struct ll_inode_info *lli = ll_i2info(inode);
1678 struct lov_stripe_md *lsm;
1679 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1683 down(&lli->lli_open_sem);
/* striping can only be set once */
1686 up(&lli->lli_open_sem);
1687 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1692 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1695 if (it_disposition(&oit, DISP_LOOKUP_NEG))
1696 GOTO(out_req_free, rc = -ENOENT);
1697 rc = oit.d.lustre.it_status;
1699 GOTO(out_req_free, rc);
/* we only needed the open to install the EA; close the handle again */
1701 ll_release_openhandle(file->f_dentry, &oit);
1704 up(&lli->lli_open_sem);
1705 ll_intent_release(&oit);
1708 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * LL_IOC_LOV_SETEA ioctl handler: admin-only variant of setstripe that
 * accepts a lov_user_md with one trailing ost_data entry, copies it from
 * userspace and applies it via ll_lov_setstripe_ea_info().
 *
 * NOTE(review): listing elides lines; comments cover only the visible code.
 */
1712 static int ll_lov_setea(struct inode *inode, struct file *file,
1715 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1716 struct lov_user_md *lump;
1717 int lum_size = sizeof(struct lov_user_md) +
1718 sizeof(struct lov_user_ost_data);
1722 if (!capable (CAP_SYS_ADMIN))
1725 OBD_ALLOC(lump, lum_size);
1729 rc = copy_from_user(lump, (struct lov_user_md *)arg, lum_size);
1731 OBD_FREE(lump, lum_size);
1735 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1737 OBD_FREE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE ioctl handler: copy a lov_user_md from userspace,
 * apply it, then echo the resulting striping back to the user buffer via
 * a GETSTRIPE iocontrol.
 *
 * NOTE(review): listing elides lines; comments cover only the visible code.
 */
1741 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1744 struct lov_user_md lum, *lump = (struct lov_user_md *)arg;
1746 int flags = FMODE_WRITE;
1749 /* Bug 1152: copy properly when this is no longer true */
1750 LASSERT(sizeof(lum) == sizeof(*lump));
1751 LASSERT(sizeof(lum.lmm_objects[0]) == sizeof(lump->lmm_objects[0]));
1752 rc = copy_from_user(&lum, lump, sizeof(lum));
1756 rc = ll_lov_setstripe_ea_info(inode, file, flags, &lum, sizeof(lum));
/* report back the striping actually applied */
1758 put_user(0, &lump->lmm_stripe_count);
1759 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1760 0, ll_i2info(inode)->lli_smd, lump);
/*
 * LL_IOC_LOV_GETSTRIPE ioctl handler: hand the file's stripe MD to the
 * LOV layer, which marshals it into the user buffer.
 *
 * NOTE(review): listing elides lines; comments cover only the visible code.
 */
1765 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1767 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1772 return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0, lsm,
/*
 * LL_IOC_GROUP_LOCK ioctl handler: take an LCK_GROUP extent lock over the
 * whole file ([0, EOF]) for group id @arg and remember it in the file
 * descriptor.  While held, normal DLM locking is bypassed for this fd
 * (LL_FILE_IGNORE_LOCK).
 *
 * NOTE(review): listing elides lines; comments cover only the visible code.
 */
1776 static int ll_get_grouplock(struct inode *inode, struct file *file,
1779 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1780 ldlm_policy_data_t policy = { .l_extent = { .start = 0,
1781 .end = OBD_OBJECT_EOF}};
1782 struct lustre_handle lockh = { 0 };
1783 struct ll_inode_info *lli = ll_i2info(inode);
1784 struct lov_stripe_md *lsm = lli->lli_smd;
/* only one group lock per file descriptor */
1788 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1792 policy.l_extent.gid = arg;
1793 if (file->f_flags & O_NONBLOCK)
1794 flags = LDLM_FL_BLOCK_NOWAIT;
1796 rc = ll_extent_lock(fd, inode, lsm, LCK_GROUP, &policy, &lockh, flags);
1800 fd->fd_flags |= LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK;
/* stash the granted handle so ll_put_grouplock() can cancel it */
1802 memcpy(&fd->fd_cwlockh, &lockh, sizeof(lockh));
/*
 * LL_IOC_GROUP_UNLOCK ioctl handler: drop the group lock taken by
 * ll_get_grouplock().  The supplied gid must match the one the lock was
 * taken with.
 *
 * NOTE(review): listing elides lines; comments cover only the visible code.
 */
1810 static int ll_put_grouplock(struct inode *inode, struct file *file,
1811 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1812 struct ll_inode_info *lli = ll_i2info(inode);
1813 struct lov_stripe_md *lsm = lli->lli_smd;
1816 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1817 /* Ugh, it's already unlocked. */
1821 if (fd->fd_gid != arg) /* Ugh? Unlocking with different gid? */
1824 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
1826 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP, &fd->fd_cwlockh);
/* clear the stored handle now that the lock is gone */
1831 memset(&fd->fd_cwlockh, 0, sizeof(fd->fd_cwlockh));
/*
 * Validate a file-join request: server must support join, both inodes
 * must be regular files, a file cannot be joined to itself, and the head
 * file's size must be a multiple of JOIN_FILE_ALIGN (64K per the message).
 *
 * NOTE(review): listing elides lines (error RETURNs); comments cover only
 * the visible code.
 */
1836 static int join_sanity_check(struct inode *head, struct inode *tail)
1839 if ((ll_i2sbi(head)->ll_flags & LL_SBI_JOIN) == 0) {
1840 CERROR("server do not support join \n");
1843 if (!S_ISREG(tail->i_mode) || !S_ISREG(head->i_mode)) {
1844 CERROR("tail ino %lu and ino head %lu must be regular\n",
1845 head->i_ino, tail->i_ino);
1848 if (head->i_ino == tail->i_ino) {
1849 CERROR("file %lu can not be joined to itself \n", head->i_ino);
1852 if (head->i_size % JOIN_FILE_ALIGN) {
1853 CERROR("hsize %llu must be times of 64K\n", head->i_size);
/*
 * Perform the MDS side of a file join: enqueue an IT_OPEN|O_JOIN_FILE
 * intent (LCK_CW, inodebits) against the head inode, passing the head
 * file size split into high/low 32-bit halves as cookies.  Any granted
 * lock is dropped immediately and the open handle is released.
 *
 * NOTE(review): listing elides lines; comments cover only the visible code.
 */
1859 static int join_file(struct inode *head_inode, struct file *head_filp,
1860 struct file *tail_filp)
1862 struct inode *tail_inode, *tail_parent;
1863 struct dentry *tail_dentry = tail_filp->f_dentry;
1864 struct lookup_intent oit = {.it_op = IT_OPEN,
1865 .it_flags = head_filp->f_flags|O_JOIN_FILE};
1866 struct lustre_handle lockh;
1867 struct md_op_data *op_data;
/* head file size split into 32-bit halves for the RPC */
1868 __u32 hsize = head_inode->i_size >> 32;
1869 __u32 tsize = head_inode->i_size;
1873 tail_dentry = tail_filp->f_dentry;
1874 tail_inode = tail_dentry->d_inode;
1875 tail_parent = tail_dentry->d_parent->d_inode;
1877 op_data = ll_prep_md_op_data(NULL, head_inode, tail_parent,
1878 tail_dentry->d_name.name,
1879 tail_dentry->d_name.len, 0);
1880 if (op_data == NULL)
1883 rc = md_enqueue(ll_i2mdexp(head_inode), LDLM_IBITS, &oit, LCK_CW,
1884 op_data, &lockh, &tsize, 0, ldlm_completion_ast,
1885 ll_md_blocking_ast, &hsize, 0);
1887 ll_finish_md_op_data(op_data);
1891 rc = oit.d.lustre.it_status;
1894 ptlrpc_req_finished((struct ptlrpc_request *)
1895 oit.d.lustre.it_data);
1899 if (oit.d.lustre.it_lock_mode) { /* If we got lock - release it right
1901 ldlm_lock_decref(&lockh, oit.d.lustre.it_lock_mode);
1902 oit.d.lustre.it_lock_mode = 0;
1904 ll_release_openhandle(head_filp->f_dentry, &oit);
1906 ll_intent_release(&oit);
/*
 * LL_IOC_JOIN ioctl implementation: append the file named by
 * @filename_tail to @head.  Opens the tail, EX-locks both files in
 * ascending-ino order to avoid deadlock, runs join_sanity_check(), then
 * join_file().  Cleanup is staged via cleanup_phase fallthroughs; on
 * success the head's cached stripe MD is discarded so it is refetched.
 *
 * NOTE(review): listing elides lines (phase labels, RETURNs); comments
 * cover only the visible code.
 */
1910 static int ll_file_join(struct inode *head, struct file *filp,
1911 char *filename_tail)
1913 struct inode *tail = NULL, *first = NULL, *second = NULL;
1914 struct dentry *tail_dentry;
1915 struct file *tail_filp, *first_filp, *second_filp;
1916 struct ll_lock_tree first_tree, second_tree;
1917 struct ll_lock_tree_node *first_node, *second_node;
1918 struct ll_inode_info *hlli = ll_i2info(head), *tlli;
1919 int rc = 0, cleanup_phase = 0;
1922 CDEBUG(D_VFSTRACE, "VFS Op:head=%lu/%u(%p) tail %s\n",
1923 head->i_ino, head->i_generation, head, filename_tail);
1925 tail_filp = filp_open(filename_tail, O_WRONLY, 0644);
1926 if (IS_ERR(tail_filp)) {
1927 CERROR("Can not open tail file %s", filename_tail);
1928 rc = PTR_ERR(tail_filp);
1931 tail = igrab(tail_filp->f_dentry->d_inode);
1933 tlli = ll_i2info(tail);
1934 tail_dentry = tail_filp->f_dentry;
1935 LASSERT(tail_dentry);
1938 /*reorder the inode for lock sequence*/
1939 first = head->i_ino > tail->i_ino ? head : tail;
1940 second = head->i_ino > tail->i_ino ? tail : head;
1941 first_filp = head->i_ino > tail->i_ino ? filp : tail_filp;
1942 second_filp = head->i_ino > tail->i_ino ? tail_filp : filp;
1944 CDEBUG(D_INFO, "reorder object from %lu:%lu to %lu:%lu \n",
1945 head->i_ino, tail->i_ino, first->i_ino, second->i_ino);
/* EX-lock both files over [0, EOF], "first" then "second" */
1946 first_node = ll_node_from_inode(first, 0, OBD_OBJECT_EOF, LCK_EX);
1947 if (IS_ERR(first_node)){
1948 rc = PTR_ERR(first_node);
1951 first_tree.lt_fd = first_filp->private_data;
1952 rc = ll_tree_lock(&first_tree, first_node, NULL, 0, 0);
1957 second_node = ll_node_from_inode(second, 0, OBD_OBJECT_EOF, LCK_EX);
1958 if (IS_ERR(second_node)){
1959 rc = PTR_ERR(second_node);
1962 second_tree.lt_fd = second_filp->private_data;
1963 rc = ll_tree_lock(&second_tree, second_node, NULL, 0, 0);
1968 rc = join_sanity_check(head, tail);
1972 rc = join_file(head, filp, tail_filp);
/* staged cleanup: higher phases fall through to lower ones */
1976 switch (cleanup_phase) {
1978 ll_tree_unlock(&second_tree);
1979 obd_cancel_unused(ll_i2dtexp(second),
1980 ll_i2info(second)->lli_smd, 0, NULL);
1982 ll_tree_unlock(&first_tree);
1983 obd_cancel_unused(ll_i2dtexp(first),
1984 ll_i2info(first)->lli_smd, 0, NULL);
1986 filp_close(tail_filp, 0);
/* on success, drop the head's cached stripe MD so it is refetched */
1989 if (head && rc == 0) {
1990 obd_free_memmd(ll_i2sbi(head)->ll_dt_exp,
1992 hlli->lli_smd = NULL;
1997 CERROR("invalid cleanup_phase %d\n", cleanup_phase);
/*
 * Close the MDS open handle carried by intent @it for @dentry.  Used by
 * paths that opened a file only as a side effect (e.g. setstripe, join)
 * and do not keep the handle.  Root dentry and intents without an
 * OPEN_OPEN disposition are no-ops.
 *
 * NOTE(review): listing elides lines; comments cover only the visible code.
 */
2003 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2005 struct inode *inode = dentry->d_inode;
2006 struct obd_client_handle *och;
2012 /* Root ? Do nothing. */
2013 if (dentry->d_inode->i_sb->s_root == dentry)
2016 /* No open handle to close? Move away */
2017 if (!it_disposition(it, DISP_OPEN_OPEN))
2020 OBD_ALLOC(och, sizeof(*och));
2022 GOTO(out, rc = -ENOMEM);
2024 ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
2025 ll_i2info(inode), it, och);
2027 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
2030 /* Do not free @och is it is waiting for DONE_WRITING. */
2031 if (och->och_fh.cookie == DEAD_HANDLE_MAGIC)
2032 OBD_FREE(och, sizeof(*och));
2034 /* this one is in place of ll_file_open */
2035 ptlrpc_req_finished(it->d.lustre.it_data);
2036 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * ioctl(2) dispatcher for regular files: handles Lustre-specific commands
 * (striping, group locks, join, flags, ACLs, statfs) and forwards ext3
 * flag/version ioctls; anything unrecognized falls through to
 * obd_iocontrol() against the data export.
 *
 * NOTE(review): listing elides lines (switch header, several RETURNs);
 * comments cover only the visible code.
 */
2040 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
2043 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2047 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2048 inode->i_generation, inode, cmd);
2050 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2051 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2054 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_IOCTL);
2056 case LL_IOC_GETFLAGS:
2057 /* Get the current value of the file flags */
2058 return put_user(fd->fd_flags, (int *)arg);
2059 case LL_IOC_SETFLAGS:
2060 case LL_IOC_CLRFLAGS:
2061 /* Set or clear specific file flags */
2062 /* XXX This probably needs checks to ensure the flags are
2063 * not abused, and to handle any flag side effects.
2065 if (get_user(flags, (int *) arg))
2068 if (cmd == LL_IOC_SETFLAGS) {
/* IGNORE_LOCK is only safe with O_DIRECT I/O */
2069 if ((flags & LL_FILE_IGNORE_LOCK) &&
2070 !(file->f_flags & O_DIRECT)) {
2071 CERROR("%s: unable to disable locking on "
2072 "non-O_DIRECT file\n", current->comm);
2076 fd->fd_flags |= flags;
2078 fd->fd_flags &= ~flags;
2081 case LL_IOC_LOV_SETSTRIPE:
2082 RETURN(ll_lov_setstripe(inode, file, arg));
2083 case LL_IOC_LOV_SETEA:
2084 RETURN(ll_lov_setea(inode, file, arg));
2085 case LL_IOC_LOV_GETSTRIPE:
2086 RETURN(ll_lov_getstripe(inode, arg));
2087 case LL_IOC_RECREATE_OBJ:
2088 RETURN(ll_lov_recreate_obj(inode, file, arg));
2089 case EXT3_IOC_GETFLAGS:
2090 case EXT3_IOC_SETFLAGS:
2091 RETURN(ll_iocontrol(inode, file, cmd, arg));
2092 case EXT3_IOC_GETVERSION_OLD:
2093 case EXT3_IOC_GETVERSION:
2094 RETURN(put_user(inode->i_generation, (int *)arg));
/* join: tail filename is copied in via getname() */
2099 ftail = getname((const char *)arg);
2101 RETURN(PTR_ERR(ftail));
2102 rc = ll_file_join(inode, file, ftail);
2106 case LL_IOC_GROUP_LOCK:
2107 RETURN(ll_get_grouplock(inode, file, arg));
2108 case LL_IOC_GROUP_UNLOCK:
2109 RETURN(ll_put_grouplock(inode, file, arg));
2110 case IOC_OBD_STATFS:
2111 RETURN(ll_obd_statfs(inode, (void *)arg));
2113 /* We need to special case any other ioctls we want to handle,
2114 * to send them to the MDS/OST as appropriate and to properly
2115 * network encode the arg field.
2116 case EXT3_IOC_SETVERSION_OLD:
2117 case EXT3_IOC_SETVERSION:
2119 case LL_IOC_FLUSHCTX:
2120 RETURN(ll_flush_ctx(inode));
2121 case LL_IOC_GETFACL: {
2122 struct rmtacl_ioctl_data ioc;
2124 if (copy_from_user(&ioc, (void *)arg, sizeof(ioc)))
2127 RETURN(ll_ioctl_getfacl(inode, &ioc));
2129 case LL_IOC_SETFACL: {
2130 struct rmtacl_ioctl_data ioc;
2132 if (copy_from_user(&ioc, (void *)arg, sizeof(ioc)))
2135 RETURN(ll_ioctl_setfacl(inode, &ioc));
/* default: pass anything else through to the data export */
2138 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
/*
 * llseek(2) entry point.  SEEK_END must first glimpse the file size from
 * the OSTs (honouring O_NONBLOCK) before offsetting from i_size; SEEK_CUR
 * offsets from f_pos.  The result is bounds-checked against the per-file
 * maximum before f_pos is updated.
 *
 * NOTE(review): listing elides lines (RETURNs, error path); comments cover
 * only the visible code.
 */
2143 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2145 struct inode *inode = file->f_dentry->d_inode;
2146 struct ll_inode_info *lli = ll_i2info(inode);
2147 struct lov_stripe_md *lsm = lli->lli_smd;
2150 retval = offset + ((origin == 2) ? inode->i_size :
2151 (origin == 1) ? file->f_pos : 0);
2152 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%Lu=%#Lx(%s)\n",
2153 inode->i_ino, inode->i_generation, inode, retval, retval,
2154 origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
2156 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_LLSEEK);
2157 if (origin == 2) { /* SEEK_END */
2158 int nonblock = 0, rc;
2160 if (file->f_flags & O_NONBLOCK)
2161 nonblock = LDLM_FL_BLOCK_NOWAIT;
/* fetch an up-to-date size from the OSTs before using i_size */
2164 rc = ll_glimpse_size(inode, nonblock);
2169 ll_inode_size_lock(inode, 0);
2170 offset += inode->i_size;
2171 ll_inode_size_unlock(inode, 0);
2172 } else if (origin == 1) { /* SEEK_CUR */
2173 offset += file->f_pos;
2177 if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
2178 if (offset != file->f_pos) {
2179 file->f_pos = offset;
2180 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
2182 file->f_version = ++event;
2187 CERROR("invalid offset offset "LPX64" inode=%lu/%u(%p)"
2188 "seek (%s) isize "LPU64", f_ops "LPU64"\n",
2189 offset, inode->i_ino, inode->i_generation, inode,
2190 origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR"
2191 : "SEEK_SET", inode->i_size, file->f_pos);
/*
 * fsync(2) entry point: wait for outstanding page writes, collect any
 * recorded async write errors (inode-level and per-stripe), sync metadata
 * via md_sync() on the MDS export, then sync data over the whole object
 * range via obd_sync() on the data export, using capabilities for both.
 *
 * NOTE(review): listing elides lines (rc/err merging, RETURN); comments
 * cover only the visible code.
 */
2197 int ll_fsync(struct file *file, struct dentry *dentry, int data)
2199 struct inode *inode = dentry->d_inode;
2200 struct ll_inode_info *lli = ll_i2info(inode);
2201 struct lov_stripe_md *lsm = lli->lli_smd;
2202 struct ptlrpc_request *req;
2203 struct obd_capa *oc;
2206 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2207 inode->i_generation, inode);
2209 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_FSYNC);
2211 /* fsync's caller has already called _fdata{sync,write}, we want
2212 * that IO to finish before calling the osc and mdc sync methods */
2213 rc = filemap_fdatawait(inode->i_mapping);
2215 /* catch async errors that were recorded back when async writeback
2216 * failed for pages in this mapping. */
2217 err = lli->lli_async_rc;
2218 lli->lli_async_rc = 0;
2222 err = lov_test_and_clear_async_rc(lsm);
/* metadata sync on the MDS, under an MDS capability */
2227 oc = ll_mdscapa_get(inode);
2228 err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2234 ptlrpc_req_finished(req);
2237 struct obdo *oa = obdo_alloc();
2238 struct obd_capa *ocapa;
2241 RETURN(rc ? rc : -ENOMEM);
2243 oa->o_id = lsm->lsm_object_id;
2244 oa->o_gr = lsm->lsm_object_gr;
2245 oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2246 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
2247 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
/* data sync over the whole object, under an OSS write capability */
2250 ocapa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2251 err = obd_sync(ll_i2sbi(inode)->ll_dt_exp, oa, lsm,
2252 0, OBD_OBJECT_EOF, ocapa);
/*
 * fcntl(2)/flock byte-range locking: translate a kernel file_lock into a
 * ldlm flock policy on an MDS resource named by the file's FID, map the
 * lock type and command to a DLM mode and flags, and enqueue it with the
 * flock completion AST.
 *
 * NOTE(review): listing elides lines (type/cmd case labels, mode
 * assignments); comments cover only the visible code.
 */
2262 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2264 struct inode *inode = file->f_dentry->d_inode;
2265 struct ll_sb_info *sbi = ll_i2sbi(inode);
/* flock resources on the MDS are named by the file FID */
2266 struct ldlm_res_id res_id =
2267 { .name = { fid_seq(ll_inode2fid(inode)),
2268 fid_oid(ll_inode2fid(inode)),
2269 fid_ver(ll_inode2fid(inode)),
2271 struct lustre_handle lockh = {0};
2272 ldlm_policy_data_t flock;
2273 ldlm_mode_t mode = 0;
2278 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2279 inode->i_ino, file_lock);
2281 flock.l_flock.pid = file_lock->fl_pid;
2282 flock.l_flock.start = file_lock->fl_start;
2283 flock.l_flock.end = file_lock->fl_end;
2285 switch (file_lock->fl_type) {
2290 /* An unlock request may or may not have any relation to
2291 * existing locks so we may not be able to pass a lock handle
2292 * via a normal ldlm_lock_cancel() request. The request may even
2293 * unlock a byte range in the middle of an existing lock. In
2294 * order to process an unlock request we need all of the same
2295 * information that is given with a normal read or write record
2296 * lock request. To avoid creating another ldlm unlock (cancel)
2297 * message we'll treat a LCK_NL flock request as an unlock. */
2304 CERROR("unknown fcntl lock type: %d\n", file_lock->fl_type);
2319 flags = LDLM_FL_BLOCK_NOWAIT;
2325 flags = LDLM_FL_TEST_LOCK;
2326 /* Save the old mode so that if the mode in the lock changes we
2327 * can decrement the appropriate reader or writer refcount. */
2328 file_lock->fl_type = mode;
2331 CERROR("unknown fcntl lock command: %d\n", cmd);
2335 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2336 "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2337 flags, mode, flock.l_flock.start, flock.l_flock.end);
2339 rc = ldlm_cli_enqueue(sbi->ll_md_exp, NULL, &res_id,
2340 LDLM_FLOCK, &flock, mode, &flags, NULL,
2341 ldlm_flock_completion_ast, NULL, file_lock,
2342 NULL, 0, NULL, &lockh, 0);
/*
 * Test (without taking) whether this client already holds a granted MDS
 * inodebits lock covering @bits on @inode, in any of CR/CW/PR modes.
 * Uses LDLM_FL_TEST_LOCK so no reference is acquired.
 *
 * NOTE(review): listing elides lines (RETURNs); comments cover only the
 * visible code.
 */
2346 int ll_have_md_lock(struct inode *inode, __u64 bits)
2348 struct lustre_handle lockh;
2349 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2357 fid = &ll_i2info(inode)->lli_fid;
2358 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2360 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2361 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2362 LCK_CR|LCK_CW|LCK_PR, &lockh)) {
/*
 * Post-process a revalidation result: -ENOENT means the file was already
 * unlinked, which is tolerated (nlink updated, success returned); other
 * errors are logged.  Non-regular, non-directory inodes skip the extra
 * validation noted below.
 *
 * NOTE(review): listing elides lines (RETURNs); comments cover only the
 * visible code.
 */
2369 static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
2370 if (rc == -ENOENT) { /* Already unlinked. Just update nlink
2371 * and return success */
2373 /* This path cannot be hit for regular files unless in
2374 * case of obscure races, so no need to to validate
2376 if (!S_ISREG(inode->i_mode) &&
2377 !S_ISDIR(inode->i_mode))
2382 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
/*
 * Revalidate a dentry's inode attributes against the MDS.  If the server
 * supports getattr-by-FID (OBD_CONNECT_ATTRFID), an IT_GETATTR intent
 * lock is used (also detecting unlink, in which case the dentry is
 * dropped).  Otherwise, a plain md_getattr() is issued unless a covering
 * UPDATE inodebits lock is already held.  Finally the file size is
 * refreshed via a glimpse when the file has OST objects.
 *
 * NOTE(review): listing elides lines (RETURNs, some error paths); comments
 * cover only the visible code.
 */
2390 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
2392 struct inode *inode = dentry->d_inode;
2393 struct ptlrpc_request *req = NULL;
2394 struct ll_sb_info *sbi;
2395 struct obd_export *exp;
2400 CERROR("REPORT THIS LINE TO PETER\n");
2403 sbi = ll_i2sbi(inode);
2405 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2406 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2407 #if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,5,0))
2408 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_REVALIDATE);
2411 exp = ll_i2mdexp(inode);
2413 if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
2414 struct lookup_intent oit = { .it_op = IT_GETATTR };
2415 struct md_op_data *op_data;
2417 /* Call getattr by fid, so do not provide name at all. */
2418 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2419 dentry->d_inode, NULL, 0, 0);
2420 if (op_data == NULL)
2422 it->it_flags |= O_CHECK_STALE;
2423 rc = md_intent_lock(exp, op_data, NULL, 0,
2424 /* we are not interested in name
2427 ll_md_blocking_ast, 0);
2428 ll_finish_md_op_data(op_data);
2429 it->it_flags &= ~ O_CHECK_STALE;
2431 rc = ll_inode_revalidate_fini(inode, rc);
2435 rc = ll_revalidate_it_finish(req, DLM_REPLY_REC_OFF, &oit, dentry);
2437 ll_intent_release(&oit);
2441 /* Unlinked? Unhash dentry, so it is not picked up later by
2442 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2443 here to preserve get_cwd functionality on 2.6.
2445 if (!dentry->d_inode->i_nlink) {
2446 spin_lock(&dcache_lock);
2447 ll_drop_dentry(dentry);
2448 spin_unlock(&dcache_lock);
2451 ll_lookup_finish_locks(&oit, dentry);
2452 } else if (!ll_have_md_lock(dentry->d_inode,
2453 MDS_INODELOCK_UPDATE)) {
/* no covering UPDATE lock: fetch fresh attributes from the MDS */
2454 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2455 obd_valid valid = OBD_MD_FLGETATTR;
2457 struct obd_capa *oc;
2459 if (S_ISREG(inode->i_mode)) {
2460 rc = ll_get_max_mdsize(sbi, &ealen);
2463 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2465 oc = ll_mdscapa_get(inode);
2466 rc = md_getattr(sbi->ll_md_exp, ll_inode2fid(inode), oc, valid,
2470 rc = ll_inode_revalidate_fini(inode, rc);
2474 rc = ll_prep_inode(&inode, req, REPLY_REC_OFF,
2480 /* if object not yet allocated, don't validate size */
2481 if (ll_i2info(inode)->lli_smd == NULL)
2484 /* ll_glimpse_size will prefer locally cached writes if they extend
2486 rc = ll_glimpse_size(inode, 0);
2489 ptlrpc_req_finished(req);
2493 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
/* VFS getattr worker for 2.6 kernels: revalidate the inode (using the
 * caller-supplied lookup intent) and then copy the inode's attributes
 * into *stat.
 * NOTE(review): the check of 'res' after ll_inode_revalidate_it() and
 * the trailing return are elided from this view -- confirm against the
 * complete source. */
2494 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2495 struct lookup_intent *it, struct kstat *stat)
2497 struct inode *inode = de->d_inode;
/* refresh attributes from the MDS before reporting them */
2500 res = ll_inode_revalidate_it(de, it);
2501 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_GETATTR);
2506 stat->dev = inode->i_sb->s_dev;
2507 stat->ino = inode->i_ino;
2508 stat->mode = inode->i_mode;
2509 stat->nlink = inode->i_nlink;
2510 stat->uid = inode->i_uid;
2511 stat->gid = inode->i_gid;
2512 stat->rdev = kdev_t_to_nr(inode->i_rdev);
2513 stat->atime = inode->i_atime;
2514 stat->mtime = inode->i_mtime;
2515 stat->ctime = inode->i_ctime;
2516 stat->blksize = inode->i_blksize;
/* hold the llite size lock so i_size and i_blocks are read as a
 * consistent pair */
2518 ll_inode_size_lock(inode, 0);
2519 stat->size = inode->i_size;
2520 stat->blocks = inode->i_blocks;
2521 ll_inode_size_unlock(inode, 0);
/* Standard ->getattr() entry point: delegate to ll_getattr_it() with a
 * default IT_GETATTR intent. */
2525 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2527 struct lookup_intent it = { .it_op = IT_GETATTR };
2529 return ll_getattr_it(mnt, de, &it, stat);
/* Evaluate the POSIX ACL cached in the llite inode info against @mask.
 * Only meaningful when CONFIG_FS_POSIX_ACL is set; the fallback branch
 * and the NULL-acl / return path are elided from this view. */
2534 int lustre_check_acl(struct inode *inode, int mask)
2536 #ifdef CONFIG_FS_POSIX_ACL
2537 struct ll_inode_info *lli = ll_i2info(inode);
2538 struct posix_acl *acl;
/* take a reference under lli_lock so the cached ACL cannot be replaced
 * or freed while we evaluate it */
2542 spin_lock(&lli->lli_lock);
2543 acl = posix_acl_dup(lli->lli_posix_acl);
2544 spin_unlock(&lli->lli_lock);
2549 rc = posix_acl_permission(inode, acl, mask);
/* drop the reference taken by posix_acl_dup() above */
2550 posix_acl_release(acl);
2558 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10))
/* ->permission() for kernels >= 2.6.10: remote-client mounts ask the
 * server for the permission decision; everything else defers to
 * generic_permission() with lustre_check_acl as the ACL callback. */
2559 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2561 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2562 inode->i_ino, inode->i_generation, inode, mask);
/* remote client: permission is enforced server-side */
2563 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2564 return lustre_check_remote_perm(inode, mask);
2565 return generic_permission(inode, mask, lustre_check_acl);
2568 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
/* ->permission() for older kernels.  Two prototypes (with and without
 * struct nameidata) are selected by the version #if above; the #else
 * between them is elided from this view.  The body open-codes the
 * classic owner/group/other mode-bit check plus capability overrides.
 * NOTE(review): the return statements for the RDONLY/IMMUTABLE/mode
 * checks and several labels are elided here -- compare against the
 * complete source before reasoning about control flow. */
2569 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2571 int ll_inode_permission(struct inode *inode, int mask)
2574 int mode = inode->i_mode;
2577 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2578 inode->i_ino, inode->i_generation, inode, mask);
/* remote client: permission is enforced server-side */
2580 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2581 return lustre_check_remote_perm(inode, mask);
/* refuse writes on a read-only fs for regular files, dirs and
 * symlinks (device nodes etc. may still be writable) */
2583 if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
2584 (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
2586 if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
/* owner check: fall back to the ACL when the shifted user bits do not
 * already grant the request */
2588 if (current->fsuid == inode->i_uid) {
2591 if (((mode >> 3) & mask & S_IRWXO) != mask)
2593 rc = lustre_check_acl(inode, mask);
2597 goto check_capabilities;
2601 if (in_group_p(inode->i_gid))
/* "other" bits */
2604 if ((mode & mask & S_IRWXO) == mask)
/* CAP_DAC_OVERRIDE wins except for exec of a non-executable regular
 * file */
2608 if (!(mask & MAY_EXEC) ||
2609 (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
2610 if (capable(CAP_DAC_OVERRIDE))
/* CAP_DAC_READ_SEARCH: plain reads, or any non-write access to a
 * directory */
2613 if (capable(CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
2614 (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
/* Default file operations for Lustre regular files.  This variant has
 * no POSIX flock support -- the .lock hook is deliberately left
 * commented out; see ll_file_operations_flock below for the mount
 * option that enables it. */
2620 struct file_operations ll_file_operations = {
2621 .read = ll_file_read,
2622 .write = ll_file_write,
2623 .ioctl = ll_file_ioctl,
2624 .open = ll_file_open,
2625 .release = ll_file_release,
2626 .mmap = ll_file_mmap,
2627 .llseek = ll_file_seek,
/* sendfile exists only on 2.6-series kernels */
2628 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
2629 .sendfile = ll_file_sendfile,
2632 /* .lock = ll_file_flock */
/* Same operations as ll_file_operations, but with the .lock hook wired
 * to ll_file_flock -- used when file locking is enabled for the mount. */
2635 struct file_operations ll_file_operations_flock = {
2636 .read = ll_file_read,
2637 .write = ll_file_write,
2638 .ioctl = ll_file_ioctl,
2639 .open = ll_file_open,
2640 .release = ll_file_release,
2641 .mmap = ll_file_mmap,
2642 .llseek = ll_file_seek,
/* sendfile exists only on 2.6-series kernels */
2643 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
2644 .sendfile = ll_file_sendfile,
2647 .lock = ll_file_flock
2651 struct inode_operations ll_file_inode_operations = {
2652 #ifdef LUSTRE_KERNEL_VERSION
2653 .setattr_raw = ll_setattr_raw,
2655 .setattr = ll_setattr,
2656 .truncate = ll_truncate,
2657 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
2658 .getattr = ll_getattr,
2660 .revalidate_it = ll_inode_revalidate_it,
2662 .permission = ll_inode_permission,
2663 .setxattr = ll_setxattr,
2664 .getxattr = ll_getxattr,
2665 .listxattr = ll_listxattr,
2666 .removexattr = ll_removexattr,