1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (c) 2002, 2003 Cluster File Systems, Inc.
5 * Author: Peter Braam <braam@clusterfs.com>
6 * Author: Phil Schwan <phil@clusterfs.com>
7 * Author: Andreas Dilger <adilger@clusterfs.com>
9 * This file is part of Lustre, http://www.lustre.org.
11 * Lustre is free software; you can redistribute it and/or
12 * modify it under the terms of version 2 of the GNU General Public
13 * License as published by the Free Software Foundation.
15 * Lustre is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with Lustre; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 #define DEBUG_SUBSYSTEM S_LLITE
26 #include <lustre_dlm.h>
27 #include <lustre_lite.h>
28 #include <lustre_mdc.h>
29 #include <linux/pagemap.h>
30 #include <linux/file.h>
31 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
32 #include <linux/lustre_compat25.h>
34 #include "llite_internal.h"
36 /* also used by llite/special.c:ll_special_open() */
/* Allocate a per-open-file ll_file_data object from the dedicated slab
 * cache.  NOTE(review): the listing is truncated here (brace/RETURN lines
 * missing); presumably returns fd, NULL on allocation failure -- confirm
 * against the full source. */
37 struct ll_file_data *ll_file_data_get(void)
39 struct ll_file_data *fd;
41 OBD_SLAB_ALLOC(fd, ll_file_data_slab, SLAB_KERNEL, sizeof *fd);
/* Return a ll_file_data to its slab cache; pairs with ll_file_data_get(). */
45 static void ll_file_data_put(struct ll_file_data *fd)
48 OBD_SLAB_FREE(fd, ll_file_data_slab, sizeof *fd);
/* Snapshot the inode's current metadata (fid, mode, times, size, blocks,
 * flags), I/O epoch, the open file handle @fh, and an MDS capability into
 * @op_data, ready to be sent to the MDS in a close/setattr RPC. */
51 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
52 struct lustre_handle *fh)
54 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
55 op_data->op_attr.ia_mode = inode->i_mode;
56 op_data->op_attr.ia_atime = inode->i_atime;
57 op_data->op_attr.ia_mtime = inode->i_mtime;
58 op_data->op_attr.ia_ctime = inode->i_ctime;
59 op_data->op_attr.ia_size = inode->i_size;
60 op_data->op_attr_blocks = inode->i_blocks;
/* ia_attr_flags lives in the Lustre-extended iattr wrapper, hence the cast. */
61 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags = inode->i_flags;
62 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
63 memcpy(&op_data->op_handle, fh, sizeof(op_data->op_handle));
/* ll_mdscapa_get() takes a capability reference; the caller's RPC path is
 * presumably responsible for releasing it -- confirm in full source. */
64 op_data->op_capa1 = ll_mdscapa_get(inode);
/* Prepare @op_data for an MDS close RPC on @och: select which attributes
 * are valid, handle epoch-close bookkeeping, then pack the inode state.
 * NOTE(review): the bodies of the two if-statements below are partially
 * missing from this truncated listing -- do not infer their full effect. */
67 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
68 struct obd_client_handle *och)
72 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
73 ATTR_MTIME_SET | ATTR_CTIME_SET;
/* Read-only opens need no size/epoch handling (body truncated here). */
75 if (!(och->och_flags & FMODE_WRITE))
/* Non-regular files have no OST objects, so size/blocks come from the MDS. */
78 if (!S_ISREG(inode->i_mode))
79 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
81 ll_epoch_close(inode, op_data, &och, 0);
84 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
/* Close an MDS open handle: build the close op_data, send md_close(),
 * optionally perform a Size-on-MDS update, destroy OST objects named in the
 * close reply, and clear open-replay state.  Marks the handle cookie
 * DEAD_HANDLE_MAGIC once the epoch is closed so callers know it may be
 * freed.  NOTE(review): several lines (parameter list, error branches,
 * RETURN) are missing from this truncated listing. */
88 static int ll_close_inode_openhandle(struct obd_export *md_exp,
90 struct obd_client_handle *och)
92 struct md_op_data *op_data;
93 struct ptlrpc_request *req = NULL;
94 struct obd_device *obd;
95 int rc, clear_ord = 0;
99 obd = class_exp2obd(ll_i2mdexp(inode));
102 * XXX: in case of LMV, is this correct to access
105 CERROR("Invalid MDC connection handle "LPX64"\n",
106 ll_i2mdexp(inode)->exp_handle.h_cookie);
111 * here we check if this is forced umount. If so this is called on
112 * canceling "open lock" and we do not call md_close() in this case, as
113 * it will not be successful, as import is already deactivated.
115 if (obd->obd_no_recov)
118 OBD_ALLOC_PTR(op_data);
120 GOTO(out, rc = -ENOMEM);
122 ll_prepare_close(inode, op_data, och);
/* The close ends the I/O epoch when we held a write handle and either the
 * MDS flagged MF_EPOCH_CLOSE or the file has no OST objects (non-regular). */
123 epoch_close = (och->och_flags & FMODE_WRITE) &&
124 ((op_data->op_flags & MF_EPOCH_CLOSE) ||
125 !S_ISREG(inode->i_mode));
126 rc = md_close(md_exp, op_data, och, &req);
128 ll_finish_md_op_data(op_data);
130 /* This close must have closed the epoch. */
131 LASSERT(epoch_close);
132 /* MDS has instructed us to obtain Size-on-MDS attribute from
133 * OSTs and send setattr to back to MDS. */
134 rc = ll_sizeonmds_update(inode, &och->och_fh);
136 CERROR("inode %lu mdc Size-on-MDS update failed: "
137 "rc = %d\n", inode->i_ino, rc);
141 CERROR("inode %lu mdc close failed: rc = %d\n",
/* Epoch not yet closed on a write handle: defer to the DONE_WRITING path. */
145 if (!epoch_close && (och->och_flags & FMODE_WRITE)) {
146 md_clear_open_replay_data(md_exp, och);
149 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
153 rc = ll_objects_destroy(req, inode);
155 CERROR("inode %lu ll_objects destroy: rc = %d\n",
159 ptlrpc_req_finished(req); /* This is close request */
163 md_clear_open_replay_data(md_exp, och);
/* Poison the handle so ll_md_real_close() knows it is safe to free. */
165 if (epoch_close || !(och->och_flags & FMODE_WRITE))
166 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
/* Drop one open mode's MDS open handle for @inode if no other local file
 * descriptors of that mode remain.  @flags selects which of the three
 * per-mode handle slots (write/exec/read) is being closed; lli_och_sem
 * serializes against concurrent open/close of the same slot.
 * NOTE(review): truncated listing -- the lines taking *och_p into the local
 * 'och' and resetting the slot are not visible here. */
170 int ll_md_real_close(struct inode *inode, int flags)
172 struct ll_inode_info *lli = ll_i2info(inode);
173 struct obd_client_handle **och_p;
174 struct obd_client_handle *och;
179 if (flags & FMODE_WRITE) {
180 och_p = &lli->lli_mds_write_och;
181 och_usecount = &lli->lli_open_fd_write_count;
182 } else if (flags & FMODE_EXEC) {
183 och_p = &lli->lli_mds_exec_och;
184 och_usecount = &lli->lli_open_fd_exec_count;
186 LASSERT(flags & FMODE_READ);
187 och_p = &lli->lli_mds_read_och;
188 och_usecount = &lli->lli_open_fd_read_count;
191 down(&lli->lli_och_sem);
192 if (*och_usecount) { /* There are still users of this handle, so
194 up(&lli->lli_och_sem);
199 up(&lli->lli_och_sem);
201 if (och) { /* There might be a race and somebody have freed this och
203 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
205 /* Do not free @och is it is waiting for DONE_WRITING. */
206 if (och->och_fh.cookie == DEAD_HANDLE_MAGIC)
207 OBD_FREE(och, sizeof *och);
/* Per-file-descriptor close: release any group lock, decrement the
 * per-mode open count under lli_och_sem, and -- unless a cached OPEN DLM
 * lock lets us skip the MDS round trip -- call ll_md_real_close().
 * Finally detach and free the ll_file_data and drop the OSS capability. */
213 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
216 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
217 struct ll_inode_info *lli = ll_i2info(inode);
221 /* clear group lock, if present */
222 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
223 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
224 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
225 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP,
229 /* Let's see if we have good enough OPEN lock on the file and if
230 we can skip talking to MDS */
231 if (file->f_dentry->d_inode) { /* Can this ever be false? */
/* TEST_LOCK: only probe for a matching granted lock, take no reference. */
233 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
234 struct lustre_handle lockh;
235 struct inode *inode = file->f_dentry->d_inode;
236 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
238 down(&lli->lli_och_sem);
239 if (fd->fd_omode & FMODE_WRITE) {
241 LASSERT(lli->lli_open_fd_write_count);
242 lli->lli_open_fd_write_count--;
243 } else if (fd->fd_omode & FMODE_EXEC) {
245 LASSERT(lli->lli_open_fd_exec_count);
246 lli->lli_open_fd_exec_count--;
249 LASSERT(lli->lli_open_fd_read_count);
250 lli->lli_open_fd_read_count--;
252 up(&lli->lli_och_sem);
/* No cached OPEN ibits lock => must do the real close RPC to the MDS. */
254 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
255 LDLM_IBITS, &policy, lockmode,
257 rc = ll_md_real_close(file->f_dentry->d_inode,
261 CERROR("Releasing a file %p with negative dentry %p. Name %s",
262 file, file->f_dentry, file->f_dentry->d_name.name);
265 LUSTRE_FPRIVATE(file) = NULL;
266 ll_file_data_put(fd);
267 ll_capa_close(inode);
/* Forward declaration; defined in the LOV layer. */
272 int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm);
274 /* While this returns an error code, fput() the caller does not, so we need
275 * to make every effort to clean up all of our state here. Also, applications
276 * rarely check close errors and even if an error is returned they will not
277 * re-try the close call.
/* VFS ->release() entry point: bumps the RELEASE stat, short-circuits for
 * the filesystem root, harvests any pending async write error from the
 * stripe MD, and closes the MDS open handle via ll_md_close(). */
279 int ll_file_release(struct inode *inode, struct file *file)
281 struct ll_file_data *fd;
282 struct ll_sb_info *sbi = ll_i2sbi(inode);
283 struct ll_inode_info *lli = ll_i2info(inode);
284 struct lov_stripe_md *lsm = lli->lli_smd;
288 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
289 inode->i_generation, inode);
291 lprocfs_counter_incr(sbi->ll_stats, LPROC_LL_RELEASE);
292 fd = LUSTRE_FPRIVATE(file);
295 /* don't do anything for / */
296 if (inode->i_sb->s_root == file->f_dentry) {
297 LUSTRE_FPRIVATE(file) = NULL;
298 ll_file_data_put(fd);
303 lov_test_and_clear_async_rc(lsm);
304 lli->lli_async_rc = 0;
306 rc = ll_md_close(sbi->ll_md_exp, inode, file);
/* Perform an intent-based open against the MDS for a file whose dentry is
 * already instantiated (typically the NFSD / patchless-kernel path, or when
 * a cached open handle vanished between revalidate and ll_file_open()).
 * Enqueues an IBITS lock with the open intent, drops any lock actually
 * granted (only the open result is wanted), then updates the inode from
 * the reply.  @lmm/@lmmsize non-zero means we are only setting stripe
 * info, so no OPEN lock is requested. */
310 static int ll_intent_file_open(struct file *file, void *lmm,
311 int lmmsize, struct lookup_intent *itp)
313 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
314 struct dentry *parent = file->f_dentry->d_parent;
315 const char *name = file->f_dentry->d_name.name;
316 const int len = file->f_dentry->d_name.len;
317 struct lustre_handle lockh;
318 struct md_op_data *op_data;
324 /* Usually we come here only for NFSD, and we want open lock.
325 But we can also get here with pre 2.6.15 patchless kernels, and in
326 that case that lock is also ok */
327 /* We can also get here if there was cached open handle in revalidate_it
328 * but it disappeared while we were getting from there to ll_file_open.
329 * But this means this file was closed and immediatelly opened which
330 * makes a good candidate for using OPEN lock */
331 /* If lmmsize & lmm are not 0, we are just setting stripe info
332 * parameters. No need for the open lock */
333 if (!lmm && !lmmsize)
334 itp->it_flags |= MDS_OPEN_LOCK;
336 op_data = ll_prep_md_op_data(NULL, parent->d_inode, NULL, name, len,
341 rc = md_enqueue(sbi->ll_md_exp, LDLM_IBITS, itp, LCK_CW, op_data,
342 &lockh, lmm, lmmsize, ldlm_completion_ast,
343 ll_md_blocking_ast, NULL, 0);
345 ll_finish_md_op_data(op_data);
347 CERROR("lock enqueue: err: %d\n", rc);
351 if (itp->d.lustre.it_lock_mode) { /* If we got lock - release it right
353 ldlm_lock_decref(&lockh, itp->d.lustre.it_lock_mode);
354 itp->d.lustre.it_lock_mode = 0;
356 rc = ll_prep_inode(&file->f_dentry->d_inode,
357 (struct ptlrpc_request *)itp->d.lustre.it_data,
358 DLM_REPLY_REC_OFF, NULL);
/* Populate an obd_client_handle from the mdt_body in the intent's reply:
 * copy the MDS file handle, record fid/flags, pick up the I/O epoch, and
 * register the open for replay on recovery. */
362 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
363 struct lookup_intent *it, struct obd_client_handle *och)
365 struct ptlrpc_request *req = it->d.lustre.it_data;
366 struct mdt_body *body;
370 body = lustre_msg_buf(req->rq_repmsg, DLM_REPLY_REC_OFF, sizeof(*body));
371 LASSERT(body != NULL); /* reply already checked out */
372 LASSERT_REPSWABBED(req, DLM_REPLY_REC_OFF); /* and swabbed in md_enqueue */
374 memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
375 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
376 och->och_fid = lli->lli_fid;
377 och->och_flags = it->it_flags;
378 lli->lli_ioepoch = body->ioepoch;
380 return md_set_open_replay_data(md_exp, och, req);
/* Complete the client-local part of an open: optionally fill @och from the
 * intent reply, then attach @fd to the struct file, initialize readahead
 * state, and remember the open mode.  NOTE(review): truncated listing --
 * the condition guarding the och-fill branch and the use of 'body' after
 * the FMODE_WRITE/OBD_MD_FLSIZE test are not fully visible. */
383 int ll_local_open(struct file *file, struct lookup_intent *it,
384 struct ll_file_data *fd, struct obd_client_handle *och)
386 struct inode *inode = file->f_dentry->d_inode;
387 struct ll_inode_info *lli = ll_i2info(inode);
390 LASSERT(!LUSTRE_FPRIVATE(file));
395 struct ptlrpc_request *req = it->d.lustre.it_data;
396 struct mdt_body *body;
399 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
403 body = lustre_msg_buf(req->rq_repmsg,
404 DLM_REPLY_REC_OFF, sizeof(*body));
406 if ((it->it_flags & FMODE_WRITE) &&
407 (body->valid & OBD_MD_FLSIZE))
409 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
410 lli->lli_ioepoch, PFID(&lli->lli_fid));
414 LUSTRE_FPRIVATE(file) = fd;
415 ll_readahead_init(inode, &fd->fd_ras);
416 fd->fd_omode = it->it_flags;
420 /* Open a file, and (for the very first open) create objects on the OSTs at
421 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
422 * creation or open until ll_lov_setstripe() ioctl is called. We grab
423 * lli_open_sem to ensure no other process will create objects, send the
424 * stripe MD to the MDS, or try to destroy the objects if that fails.
426 * If we already have the stripe MD locally then we don't request it in
427 * md_open(), by passing a lmm_size = 0.
429 * It is up to the application to ensure no other processes open this file
430 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
431 * used. We might be able to avoid races of that sort by getting lli_open_sem
432 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
433 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/* VFS ->open() entry point.  Reuses a cached per-mode MDS open handle if
 * one exists; otherwise performs (or consumes) an intent open and records
 * the new handle.  NOTE(review): this listing is heavily truncated --
 * several closing braces, GOTO labels and the tail of the function are
 * missing; read the full source before reasoning about control flow. */
435 int ll_file_open(struct inode *inode, struct file *file)
437 struct ll_inode_info *lli = ll_i2info(inode);
438 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
439 .it_flags = file->f_flags };
440 struct lov_stripe_md *lsm;
441 struct ptlrpc_request *req = NULL;
442 struct obd_client_handle **och_p;
444 struct ll_file_data *fd;
448 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
449 inode->i_generation, inode, file->f_flags);
451 #ifdef LUSTRE_KERNEL_VERSION
454 it = file->private_data; /* XXX: compat macro */
455 file->private_data = NULL; /* prevent ll_local_open assertion */
458 fd = ll_file_data_get();
462 /* don't do anything for / */
463 if (inode->i_sb->s_root == file->f_dentry) {
464 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup: build our own open intent in 'oit'. */
468 if (!it || !it->d.lustre.it_disposition) {
469 /* Convert f_flags into access mode. We cannot use file->f_mode,
470 * because everything but O_ACCMODE mask was stripped from
/* O_ACCMODE trick: O_RDONLY/O_WRONLY/O_RDWR + 1 maps onto FMODE_READ/WRITE bits. */
472 if ((oit.it_flags + 1) & O_ACCMODE)
474 if (oit.it_flags & O_TRUNC)
475 oit.it_flags |= FMODE_WRITE;
477 if (oit.it_flags & O_CREAT)
478 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
480 /* NFS hack - some strange NFS clients create files with zero
481 * permission bits, and then expect to be able to open such
482 * files. We are relying on real VFS client to do ll_permission
483 * first before coming here, so if we got here, we either came
484 * from NFS or all access checks ar eok, so it is safe to set
485 * this flag in any case (XXX - race with chmod?)
487 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
489 /* We do not want O_EXCL here, presumably we opened the file
490 * already? XXX - NFS implications? */
491 oit.it_flags &= ~O_EXCL;
496 /* Let's see if we have file open on MDS already. */
497 if (it->it_flags & FMODE_WRITE) {
498 och_p = &lli->lli_mds_write_och;
499 och_usecount = &lli->lli_open_fd_write_count;
500 } else if (it->it_flags & FMODE_EXEC) {
501 och_p = &lli->lli_mds_exec_och;
502 och_usecount = &lli->lli_open_fd_exec_count;
504 och_p = &lli->lli_mds_read_och;
505 och_usecount = &lli->lli_open_fd_read_count;
507 down(&lli->lli_och_sem);
508 if (*och_p) { /* Open handle is present */
509 if (it_disposition(it, DISP_OPEN_OPEN)) {
510 /* Well, there's extra open request that we do not need,
511 let's close it somehow. This will decref request. */
512 ll_release_openhandle(file->f_dentry, it);
516 rc = ll_local_open(file, it, fd, NULL);
518 up(&lli->lli_och_sem);
519 ll_file_data_put(fd);
/* No cached handle for this mode: allocate a fresh och slot. */
523 LASSERT(*och_usecount == 0);
524 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
526 ll_file_data_put(fd);
527 GOTO(out_och_free, rc = -ENOMEM);
530 if (!it->d.lustre.it_disposition) {
531 rc = ll_intent_file_open(file, NULL, 0, it);
533 ll_file_data_put(fd);
534 GOTO(out_och_free, rc);
537 /* Got some error? Release the request */
538 if (it->d.lustre.it_status < 0) {
539 req = it->d.lustre.it_data;
540 ptlrpc_req_finished(req);
542 md_set_lock_data(ll_i2sbi(inode)->ll_md_exp,
543 &it->d.lustre.it_lock_handle,
544 file->f_dentry->d_inode);
546 req = it->d.lustre.it_data;
548 /* md_intent_lock() didn't get a request ref if there was an
549 * open error, so don't do cleanup on the request here
551 /* XXX (green): Should not we bail out on any error here, not
552 * just open error? */
553 rc = it_open_error(DISP_OPEN_OPEN, it);
555 ll_file_data_put(fd);
556 GOTO(out_och_free, rc);
559 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_OPEN);
560 rc = ll_local_open(file, it, fd, *och_p);
562 up(&lli->lli_och_sem);
563 ll_file_data_put(fd);
564 GOTO(out_och_free, rc);
567 up(&lli->lli_och_sem);
569 /* Must do this outside lli_och_sem lock to prevent deadlock where
570 different kind of OPEN lock for this same inode gets cancelled
571 by ldlm_cancel_lru */
572 if (!S_ISREG(inode->i_mode))
579 if (file->f_flags & O_LOV_DELAY_CREATE ||
580 !(file->f_mode & FMODE_WRITE)) {
581 CDEBUG(D_INODE, "object creation was delayed\n");
585 file->f_flags &= ~O_LOV_DELAY_CREATE;
588 ptlrpc_req_finished(req);
590 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* out_och_free error path: release the handle slot allocated above. */
594 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
595 *och_p = NULL; /* OBD_FREE writes some magic there */
598 up(&lli->lli_och_sem);
604 /* Fills the obdo with the attributes for the inode defined by lsm */
/* Asynchronously fetch object attributes (size/blocks/times) from the OSTs
 * for every stripe of the file, wait for completion, and refresh the inode
 * from the merged result.  NOTE(review): truncated listing -- oinfo.oi_oa
 * assignment, error RETURNs and the closing of the function are missing. */
605 int ll_inode_getattr(struct inode *inode, struct obdo *obdo)
607 struct ptlrpc_request_set *set;
608 struct ll_inode_info *lli = ll_i2info(inode);
609 struct lov_stripe_md *lsm = lli->lli_smd;
611 struct obd_info oinfo = { { { 0 } } };
615 LASSERT(lsm != NULL);
619 oinfo.oi_oa->o_id = lsm->lsm_object_id;
620 oinfo.oi_oa->o_gr = lsm->lsm_object_gr;
621 oinfo.oi_oa->o_mode = S_IFREG;
622 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
623 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
624 OBD_MD_FLBLKSZ | OBD_MD_FLMTIME |
625 OBD_MD_FLCTIME | OBD_MD_FLGROUP;
626 oinfo.oi_capa = ll_mdscapa_get(inode);
628 set = ptlrpc_prep_set();
630 CERROR("can't allocate ptlrpc set\n");
633 rc = obd_getattr_async(ll_i2dtexp(inode), &oinfo, set);
635 rc = ptlrpc_set_wait(set);
636 ptlrpc_set_destroy(set);
638 capa_put(oinfo.oi_capa);
/* Only trust the attribute bits the OSTs actually returned. */
642 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
643 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
646 obdo_refresh_inode(inode, oinfo.oi_oa, oinfo.oi_oa->o_valid);
647 CDEBUG(D_INODE, "objid "LPX64" size %Lu, blocks %lu, blksize %lu\n",
648 lli->lli_smd->lsm_object_id, inode->i_size, inode->i_blocks,
/* Strip setuid (and setgid when group-execute is set) from the inode's
 * mode on write by an unprivileged process, mirroring the kernel's own
 * remove_suid() semantics.  NOTE(review): truncated -- the tail of the
 * capability branch is not visible. */
653 static inline void ll_remove_suid(struct inode *inode)
657 /* set S_IGID if S_IXGRP is set, and always set S_ISUID */
/* (S_ISGID/S_IXGRP) scales the S_IXGRP bit up to the S_ISGID position. */
658 mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
660 /* was any of the uid bits set? */
661 mode &= inode->i_mode;
662 if (mode && !capable(CAP_FSETID)) {
663 inode->i_mode &= ~mode;
664 // XXX careful here - we cannot change the size
/* Map an extent DLM lock back to the stripe index it covers within the
 * file's LOV stripe MD, via obd_get_info("lock_to_stripe").  Sanity-checks
 * that the lock's resource really names the object of that stripe; returns
 * -ELDLM_NO_LOCK_DATA on mismatch.  NOTE(review): truncated -- the RETURN
 * of 'stripe' and the 'check:' label body are not fully visible. */
668 static int ll_lock_to_stripe_offset(struct inode *inode, struct ldlm_lock *lock)
670 struct ll_inode_info *lli = ll_i2info(inode);
671 struct lov_stripe_md *lsm = lli->lli_smd;
672 struct obd_export *exp = ll_i2dtexp(inode);
675 struct ldlm_lock *lock;
676 struct lov_stripe_md *lsm;
677 } key = { .name = "lock_to_stripe", .lock = lock, .lsm = lsm };
678 __u32 stripe, vallen = sizeof(stripe);
/* Single-stripe file: the answer is trivially stripe 0. */
682 if (lsm->lsm_stripe_count == 1)
683 GOTO(check, stripe = 0);
685 /* get our offset in the lov */
686 rc = obd_get_info(exp, sizeof(key), &key, &vallen, &stripe);
688 CERROR("obd_get_info: rc = %d\n", rc);
691 LASSERT(stripe < lsm->lsm_stripe_count);
/* Resource name[0] carries the object id, name[2] the object group. */
694 if (lsm->lsm_oinfo[stripe].loi_id != lock->l_resource->lr_name.name[0]||
695 lsm->lsm_oinfo[stripe].loi_gr != lock->l_resource->lr_name.name[2]){
696 LDLM_ERROR(lock, "resource doesn't match object "LPU64"/"LPU64,
697 lsm->lsm_oinfo[stripe].loi_id,
698 lsm->lsm_oinfo[stripe].loi_gr);
699 RETURN(-ELDLM_NO_LOCK_DATA);
705 /* Flush the page cache for an extent as its canceled. When we're on an LOV,
706 * we get a lock cancellation for each stripe, so we have to map the obd's
707 * region back onto the stripes in the file that it held.
709 * No one can dirty the extent until we've finished our work and they can
710 * enqueue another lock. The DLM protects us from ll_file_read/write here,
711 * but other kernel actors could have pages locked.
713 * Called with the DLM lock held. */
/* NOTE(review): truncated listing -- loop bodies, 'continue'/'break'
 * statements and closing braces are partially missing; the per-page logic
 * below should be read alongside the full source. */
714 void ll_pgcache_remove_extent(struct inode *inode, struct lov_stripe_md *lsm,
715 struct ldlm_lock *lock, __u32 stripe)
717 ldlm_policy_data_t tmpex;
718 unsigned long start, end, count, skip, i, j;
720 int rc, rc2, l_flags, discard = lock->l_flags & LDLM_FL_DISCARD_DATA;
721 struct lustre_handle lockh;
724 memcpy(&tmpex, &lock->l_policy_data, sizeof(tmpex));
725 CDEBUG(D_INODE|D_PAGE, "inode %lu(%p) ["LPU64"->"LPU64"] size: %llu\n",
726 inode->i_ino, inode, tmpex.l_extent.start, tmpex.l_extent.end,
729 /* our locks are page granular thanks to osc_enqueue, we invalidate the
731 if ((tmpex.l_extent.start & ~PAGE_CACHE_MASK) != 0 ||
732 ((tmpex.l_extent.end + 1) & ~PAGE_CACHE_MASK) != 0)
733 LDLM_ERROR(lock, "lock not aligned on PAGE_SIZE %lu",PAGE_SIZE);
734 LASSERT((tmpex.l_extent.start & ~PAGE_CACHE_MASK) == 0);
735 LASSERT(((tmpex.l_extent.end + 1) & ~PAGE_CACHE_MASK) == 0);
/* Translate the per-object (stripe) extent into file page indices,
 * accounting for RAID0-style striping across lsm_stripe_count objects. */
739 start = tmpex.l_extent.start >> PAGE_CACHE_SHIFT;
740 end = tmpex.l_extent.end >> PAGE_CACHE_SHIFT;
741 if (lsm->lsm_stripe_count > 1) {
742 count = lsm->lsm_stripe_size >> PAGE_CACHE_SHIFT;
743 skip = (lsm->lsm_stripe_count - 1) * count;
744 start += start/count * skip + stripe * count;
746 end += end/count * skip + stripe * count;
748 if (end < tmpex.l_extent.end >> PAGE_CACHE_SHIFT)
/* Clamp the walk to the current i_size; beyond it there are no pages. */
751 i = inode->i_size ? (inode->i_size - 1) >> PAGE_CACHE_SHIFT : 0;
755 CDEBUG(D_INODE|D_PAGE, "walking page indices start: %lu j: %lu "
756 "count: %lu skip: %lu end: %lu%s\n", start, start % count,
757 count, skip, end, discard ? " (DISCARDING)" : "");
759 /* walk through the vmas on the inode and tear down mmaped pages that
760 * intersect with the lock. this stops immediately if there are no
761 * mmap()ed regions of the file. This is not efficient at all and
762 * should be short lived. We'll associate mmap()ed pages with the lock
763 * and will be able to find them directly */
764 for (i = start; i <= end; i += (j + skip)) {
765 j = min(count - (i % count), end - i + 1);
767 LASSERT(inode->i_mapping);
768 if (ll_teardown_mmaps(inode->i_mapping,
769 (__u64)i << PAGE_CACHE_SHIFT,
770 ((__u64)(i+j) << PAGE_CACHE_SHIFT) - 1) )
774 /* this is the simplistic implementation of page eviction at
775 * cancelation. It is careful to get races with other page
776 * lockers handled correctly. fixes from bug 20 will make it
777 * more efficient by associating locks with pages and with
778 * batching writeback under the lock explicitly. */
779 for (i = start, j = start % count; i <= end;
780 j++, i++, tmpex.l_extent.start += PAGE_CACHE_SIZE) {
782 CDEBUG(D_PAGE, "skip index %lu to %lu\n", i, i + skip);
788 LASSERTF(tmpex.l_extent.start< lock->l_policy_data.l_extent.end,
789 LPU64" >= "LPU64" start %lu i %lu end %lu\n",
790 tmpex.l_extent.start, lock->l_policy_data.l_extent.end,
793 if (!mapping_has_pages(inode->i_mapping)) {
794 CDEBUG(D_INODE|D_PAGE, "nothing left\n");
800 page = find_get_page(inode->i_mapping, i);
803 LL_CDEBUG_PAGE(D_PAGE, page, "lock page idx %lu ext "LPU64"\n",
804 i, tmpex.l_extent.start);
807 /* page->mapping to check with racing against teardown */
808 if (!discard && clear_page_dirty_for_io(page)) {
809 rc = ll_call_writepage(inode, page);
811 CERROR("writepage of page %p failed: %d\n",
813 /* either waiting for io to complete or reacquiring
814 * the lock that the failed writepage released */
818 tmpex.l_extent.end = tmpex.l_extent.start + PAGE_CACHE_SIZE - 1;
819 l_flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
820 /* check to see if another DLM lock covers this page b=2765 */
821 rc2 = ldlm_lock_match(lock->l_resource->lr_namespace,
822 l_flags, &lock->l_resource->lr_name,
823 LDLM_EXTENT, &tmpex, LCK_PR | LCK_PW, &lockh);
/* No other lock protects this page: safe to drop it from the cache. */
825 if (rc2 <= 0 && page->mapping != NULL) {
826 struct ll_async_page *llap = llap_cast_private(page);
827 // checking again to account for writeback's lock_page()
828 LL_CDEBUG_PAGE(D_PAGE, page, "truncating\n");
830 ll_ra_accounting(llap, inode->i_mapping);
831 ll_truncate_complete_page(page);
834 page_cache_release(page);
836 LASSERTF(tmpex.l_extent.start <=
837 (lock->l_policy_data.l_extent.end == ~0ULL ? ~0ULL :
838 lock->l_policy_data.l_extent.end + 1),
839 "loop too long "LPU64" > "LPU64" start %lu i %lu end %lu\n",
840 tmpex.l_extent.start, lock->l_policy_data.l_extent.end,
/* DLM blocking/cancel callback for extent locks.  On BLOCKING, cancel the
 * lock; on CANCELING, flush the covered page-cache pages of the matching
 * stripe and shrink the stripe's known-minimum-size (kms) accordingly.
 * NOTE(review): truncated -- switch framing, iput/RETURN paths and the
 * default case are not fully visible. */
845 static int ll_extent_lock_callback(struct ldlm_lock *lock,
846 struct ldlm_lock_desc *new, void *data,
849 struct lustre_handle lockh = { 0 };
/* Small non-NULL 'data' values indicate a corrupted cbdata pointer. */
853 if ((unsigned long)data > 0 && (unsigned long)data < 0x1000) {
854 LDLM_ERROR(lock, "cancelling lock with bad data %p", data);
859 case LDLM_CB_BLOCKING:
860 ldlm_lock2handle(lock, &lockh);
861 rc = ldlm_cli_cancel(&lockh);
863 CERROR("ldlm_cli_cancel failed: %d\n", rc);
865 case LDLM_CB_CANCELING: {
867 struct ll_inode_info *lli;
868 struct lov_stripe_md *lsm;
872 /* This lock wasn't granted, don't try to evict pages */
873 if (lock->l_req_mode != lock->l_granted_mode)
876 inode = ll_inode_from_lock(lock);
879 lli = ll_i2info(inode);
882 if (lli->lli_smd == NULL)
886 stripe = ll_lock_to_stripe_offset(inode, lock);
890 ll_pgcache_remove_extent(inode, lsm, lock, stripe);
/* Recompute kms for this stripe with the canceled lock excluded;
 * lov_stripe_lock + lock_res_and_lock order matters here. */
892 lov_stripe_lock(lsm);
893 lock_res_and_lock(lock);
894 kms = ldlm_extent_shift_kms(lock,
895 lsm->lsm_oinfo[stripe].loi_kms);
897 if (lsm->lsm_oinfo[stripe].loi_kms != kms)
898 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
899 lsm->lsm_oinfo[stripe].loi_kms, kms);
900 lsm->lsm_oinfo[stripe].loi_kms = kms;
901 unlock_res_and_lock(lock);
902 lov_stripe_unlock(lsm);
/* Completion AST for client-side async extent enqueues (glimpse path):
 * when the lock is granted, pull the size from the lock value block (LVB),
 * raise the stripe's kms under the inode mutex + resource lock, then wake
 * waiters and drop the enqueue reference.  Blocked async locks are not
 * expected (LBUG).  NOTE(review): truncated listing. */
915 int ll_async_completion_ast(struct ldlm_lock *lock, int flags, void *data)
917 /* XXX ALLOCATE - 160 bytes */
918 struct inode *inode = ll_inode_from_lock(lock);
919 struct ll_inode_info *lli = ll_i2info(inode);
920 struct lustre_handle lockh = { 0 };
925 if (flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED |
926 LDLM_FL_BLOCK_CONV)) {
927 LBUG(); /* not expecting any blocked async locks yet */
928 LDLM_DEBUG(lock, "client-side async enqueue returned a blocked "
930 ldlm_lock_dump(D_OTHER, lock, 0);
931 ldlm_reprocess_all(lock->l_resource);
935 LDLM_DEBUG(lock, "client-side async enqueue: granted/glimpsed");
937 stripe = ll_lock_to_stripe_offset(inode, lock);
941 if (lock->l_lvb_len) {
942 struct lov_stripe_md *lsm = lli->lli_smd;
944 lvb = lock->l_lvb_data;
945 lsm->lsm_oinfo[stripe].loi_rss = lvb->lvb_size;
947 LOCK_INODE_MUTEX(inode);
948 lock_res_and_lock(lock);
/* kms can only grow here: take the max of current kms and the LVB size. */
949 kms = MAX(lsm->lsm_oinfo[stripe].loi_kms, lvb->lvb_size);
950 kms = ldlm_extent_shift_kms(NULL, kms);
951 if (lsm->lsm_oinfo[stripe].loi_kms != kms)
952 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
953 lsm->lsm_oinfo[stripe].loi_kms, kms);
954 lsm->lsm_oinfo[stripe].loi_kms = kms;
955 unlock_res_and_lock(lock);
956 UNLOCK_INODE_MUTEX(inode);
961 wake_up(&lock->l_waitq);
963 ldlm_lock2handle(lock, &lockh);
964 ldlm_lock_decref(&lockh, LCK_PR);
/* Glimpse AST: another client asked for this file's size.  Reply with an
 * LVB holding this client's kms for the lock's stripe plus the inode
 * times.  -ELDLM_NO_LOCK_DATA races are normal and answered with an empty
 * reply rather than ptlrpc_error().  NOTE(review): truncated -- the NULL
 * checks guarding the GOTOs and the out/iput labels are not visible. */
969 static int ll_glimpse_callback(struct ldlm_lock *lock, void *reqp)
971 struct ptlrpc_request *req = reqp;
972 struct inode *inode = ll_inode_from_lock(lock);
973 struct ll_inode_info *lli;
974 struct lov_stripe_md *lsm;
977 int size[2] = { sizeof(struct ptlrpc_body), sizeof(*lvb) };
981 GOTO(out, rc = -ELDLM_NO_LOCK_DATA);
982 lli = ll_i2info(inode);
984 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
987 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
989 /* First, find out which stripe index this lock corresponds to. */
990 stripe = ll_lock_to_stripe_offset(inode, lock);
992 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
994 rc = lustre_pack_reply(req, 2, size, NULL);
996 CERROR("lustre_pack_reply: %d\n", rc);
1000 lvb = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof(*lvb));
1001 lvb->lvb_size = lli->lli_smd->lsm_oinfo[stripe].loi_kms;
1002 lvb->lvb_mtime = LTIME_S(inode->i_mtime);
1003 lvb->lvb_atime = LTIME_S(inode->i_atime);
1004 lvb->lvb_ctime = LTIME_S(inode->i_ctime);
1006 LDLM_DEBUG(lock, "i_size: %llu -> stripe number %u -> kms "LPU64
1007 " atime "LPU64", mtime "LPU64", ctime "LPU64,
1008 inode->i_size, stripe, lvb->lvb_size, lvb->lvb_mtime,
1009 lvb->lvb_atime, lvb->lvb_ctime);
1014 /* These errors are normal races, so we don't want to fill the console
1015 * with messages by calling ptlrpc_error() */
1016 if (rc == -ELDLM_NO_LOCK_DATA)
1017 lustre_pack_reply(req, 1, NULL, NULL);
1019 req->rq_status = rc;
/* Merge the per-stripe lock value blocks into a single LVB and copy the
 * result (size, blocks, m/a/ctime) into the inode, under the llite inode
 * size lock. */
1023 static void ll_merge_lvb(struct inode *inode)
1025 struct ll_inode_info *lli = ll_i2info(inode);
1026 struct ll_sb_info *sbi = ll_i2sbi(inode);
1030 ll_inode_size_lock(inode, 1);
1031 inode_init_lvb(inode, &lvb);
1032 obd_merge_lvb(sbi->ll_dt_exp, lli->lli_smd, &lvb, 0);
1033 inode->i_size = lvb.lvb_size;
1034 inode->i_blocks = lvb.lvb_blocks;
1035 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1036 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1037 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1038 ll_inode_size_unlock(inode, 1);
/* Try to compute file size purely from locally cached [0, EOF] extent
 * locks: match an existing PR|PW lock, merge the LVBs into the inode, then
 * drop the match reference via obd_cancel().  NOTE(review): truncated --
 * 'flags' initialization and the rc handling between match and merge are
 * not visible. */
1042 int ll_local_size(struct inode *inode)
1044 ldlm_policy_data_t policy = { .l_extent = { 0, OBD_OBJECT_EOF } };
1045 struct ll_inode_info *lli = ll_i2info(inode);
1046 struct ll_sb_info *sbi = ll_i2sbi(inode);
1047 struct lustre_handle lockh = { 0 };
1052 if (lli->lli_smd->lsm_stripe_count == 0)
1055 rc = obd_match(sbi->ll_dt_exp, lli->lli_smd, LDLM_EXTENT,
1056 &policy, LCK_PR | LCK_PW, &flags, inode, &lockh);
1062 ll_merge_lvb(inode);
1063 obd_cancel(sbi->ll_dt_exp, lli->lli_smd, LCK_PR | LCK_PW, &lockh);
/* Glimpse-size helper for ioctls operating on an arbitrary @lsm (not the
 * inode's own): issue an intent (HAS_INTENT) extent enqueue, merge the
 * returned per-stripe LVBs, and fill a stat-like structure 'st' with the
 * result.  NOTE(review): truncated -- parameter list tail ('st'), oinfo
 * extent start/md assignments and RETURN lines are not visible. */
1067 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1070 struct lustre_handle lockh = { 0 };
1071 struct obd_enqueue_info einfo = { 0 };
1072 struct obd_info oinfo = { { { 0 } } };
1078 einfo.ei_type = LDLM_EXTENT;
1079 einfo.ei_mode = LCK_PR;
/* HAS_INTENT: glimpse request, does not revoke conflicting locks. */
1080 einfo.ei_flags = LDLM_FL_HAS_INTENT;
1081 einfo.ei_cb_bl = ll_extent_lock_callback;
1082 einfo.ei_cb_cp = ldlm_completion_ast;
1083 einfo.ei_cb_gl = ll_glimpse_callback;
1084 einfo.ei_cbdata = NULL;
1086 oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
1087 oinfo.oi_lockh = &lockh;
1090 rc = obd_enqueue_rqset(sbi->ll_dt_exp, &oinfo, &einfo);
1094 CERROR("obd_enqueue returned rc %d, "
1095 "returning -EIO\n", rc);
1096 RETURN(rc > 0 ? -EIO : rc);
1099 lov_stripe_lock(lsm);
1100 memset(&lvb, 0, sizeof(lvb));
1101 obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 0);
1102 st->st_size = lvb.lvb_size;
1103 st->st_blocks = lvb.lvb_blocks;
1104 st->st_mtime = lvb.lvb_mtime;
1105 st->st_atime = lvb.lvb_atime;
1106 st->st_ctime = lvb.lvb_ctime;
1107 lov_stripe_unlock(lsm);
1112 /* NB: obd_merge_lvb will prefer locally cached writes if they extend the
1113 * file (because it prefers KMS over RSS when larger) */
/* Refresh i_size/i_blocks for @inode by glimpsing all stripes.  Skips the
 * RPC when the MDS already holds the authoritative size
 * (LLIF_MDS_SIZE_LOCK) or when the file has no objects. */
1114 int ll_glimpse_size(struct inode *inode, int ast_flags)
1116 struct ll_inode_info *lli = ll_i2info(inode);
1117 struct ll_sb_info *sbi = ll_i2sbi(inode);
1118 struct lustre_handle lockh = { 0 };
1119 struct obd_enqueue_info einfo = { 0 };
1120 struct obd_info oinfo = { { { 0 } } };
1124 if (lli->lli_flags & LLIF_MDS_SIZE_LOCK)
1127 CDEBUG(D_DLMTRACE, "Glimpsing inode %lu\n", inode->i_ino);
1129 if (!lli->lli_smd) {
1130 CDEBUG(D_DLMTRACE, "No objects for inode %lu\n", inode->i_ino);
1134 /* NOTE: this looks like DLM lock request, but it may not be one. Due
1135 * to LDLM_FL_HAS_INTENT flag, this is glimpse request, that
1136 * won't revoke any conflicting DLM locks held. Instead,
1137 * ll_glimpse_callback() will be called on each client
1138 * holding a DLM lock against this file, and resulting size
1139 * will be returned for each stripe. DLM lock on [0, EOF] is
1140 * acquired only if there were no conflicting locks. */
1141 einfo.ei_type = LDLM_EXTENT;
1142 einfo.ei_mode = LCK_PR;
1143 einfo.ei_flags = ast_flags | LDLM_FL_HAS_INTENT;
1144 einfo.ei_cb_bl = ll_extent_lock_callback;
1145 einfo.ei_cb_cp = ldlm_completion_ast;
1146 einfo.ei_cb_gl = ll_glimpse_callback;
1147 einfo.ei_cbdata = inode;
1149 oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
1150 oinfo.oi_lockh = &lockh;
1151 oinfo.oi_md = lli->lli_smd;
1153 rc = obd_enqueue_rqset(sbi->ll_dt_exp, &oinfo, &einfo);
1157 CERROR("obd_enqueue returned rc %d, returning -EIO\n", rc);
1158 RETURN(rc > 0 ? -EIO : rc);
1161 ll_merge_lvb(inode);
1163 CDEBUG(D_DLMTRACE, "glimpse: size: %llu, blocks: %lu\n",
1164 inode->i_size, inode->i_blocks);
/* Take an extent DLM lock of @mode over @policy's range on @lsm's objects,
 * then refresh inode attributes from the merged LVBs.  Whole-file locks
 * additionally update i_size under ll_inode_size_lock() (see the long
 * comment below for the truncate-race ordering).  No-op for IGNORE_LOCK
 * fds or NOLCK mounts.  NOTE(review): truncated -- lvb declaration, the
 * enqueue error path and the RETURN are not visible. */
1169 int ll_extent_lock(struct ll_file_data *fd, struct inode *inode,
1170 struct lov_stripe_md *lsm, int mode,
1171 ldlm_policy_data_t *policy, struct lustre_handle *lockh,
1174 struct ll_sb_info *sbi = ll_i2sbi(inode);
1176 struct obd_enqueue_info einfo = { 0 };
1177 struct obd_info oinfo = { { { 0 } } };
1181 LASSERT(!lustre_handle_is_used(lockh));
1182 LASSERT(lsm != NULL);
1184 /* don't drop the mmapped file to LRU */
1185 if (mapping_mapped(inode->i_mapping))
1186 ast_flags |= LDLM_FL_NO_LRU;
1188 /* XXX phil: can we do this? won't it screw the file size up? */
1189 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1190 (sbi->ll_flags & LL_SBI_NOLCK))
1193 CDEBUG(D_DLMTRACE, "Locking inode %lu, start "LPU64" end "LPU64"\n",
1194 inode->i_ino, policy->l_extent.start, policy->l_extent.end);
1196 einfo.ei_type = LDLM_EXTENT;
1197 einfo.ei_mode = mode;
1198 einfo.ei_flags = ast_flags;
1199 einfo.ei_cb_bl = ll_extent_lock_callback;
1200 einfo.ei_cb_cp = ldlm_completion_ast;
1201 einfo.ei_cb_gl = ll_glimpse_callback;
1202 einfo.ei_cbdata = inode;
1204 oinfo.oi_policy = *policy;
1205 oinfo.oi_lockh = lockh;
1208 rc = obd_enqueue(sbi->ll_dt_exp, &oinfo, &einfo);
/* The enqueue may have rewritten the policy (e.g. lock extent growth). */
1209 *policy = oinfo.oi_policy;
1213 ll_inode_size_lock(inode, 1);
1214 inode_init_lvb(inode, &lvb);
1215 obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 0);
1217 if (policy->l_extent.start == 0 &&
1218 policy->l_extent.end == OBD_OBJECT_EOF) {
1219 /* vmtruncate()->ll_truncate() first sets the i_size and then
1220 * the kms under both a DLM lock and the
1221 * ll_inode_size_lock(). If we don't get the
1222 * ll_inode_size_lock() here we can match the DLM lock and
1223 * reset i_size from the kms before the truncating path has
1224 * updated the kms. generic_file_write can then trust the
1225 * stale i_size when doing appending writes and effectively
1226 * cancel the result of the truncate. Getting the
1227 * ll_inode_size_lock() after the enqueue maintains the DLM
1228 * -> ll_inode_size_lock() acquiring order. */
1229 inode->i_size = lvb.lvb_size;
1233 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1234 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1235 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1237 ll_inode_size_unlock(inode, 1);
/* Release an extent lock previously acquired by ll_extent_lock().
 * Mirrors the IGNORE_LOCK / NOLCK short-circuit of the lock path so
 * that lock and unlock stay symmetric, then cancels via obd_cancel(). */
1242 int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode,
1243 struct lov_stripe_md *lsm, int mode,
1244 struct lustre_handle *lockh)
1246 struct ll_sb_info *sbi = ll_i2sbi(inode);
1250 /* XXX phil: can we do this? won't it screw the file size up? */
1251 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1252 (sbi->ll_flags & LL_SBI_NOLCK))
1255 rc = obd_cancel(sbi->ll_dt_exp, lsm, mode, lockh);
/* read(2) entry point: take a PR extent lock covering the request (or
 * one chunk of it when ll_max_rw_chunk limits the lock to the current
 * stripe), settle i_size against the known minimum size (kms), then let
 * generic_file_read() do the page-cache copy.  Files with no objects
 * are satisfied with zero-filled buffers up to i_size (bug 6243).
 * NOTE(review): elided listing — the enclosing loop over chunks and the
 * final RETURN are not visible here. */
1260 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1263 struct inode *inode = file->f_dentry->d_inode;
1264 struct ll_inode_info *lli = ll_i2info(inode);
1265 struct lov_stripe_md *lsm = lli->lli_smd;
1266 struct ll_sb_info *sbi = ll_i2sbi(inode);
1267 struct ll_lock_tree tree;
1268 struct ll_lock_tree_node *node;
1270 struct ll_ra_read bead;
1273 ssize_t retval, chunk, sum = 0;
1277 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1278 inode->i_ino, inode->i_generation, inode, count, *ppos);
1280 /* "If nbyte is 0, read() will return 0 and have no other results."
1281 * -- Single Unix Spec */
1285 lprocfs_counter_add(ll_i2sbi(inode)->ll_stats, LPROC_LL_READ_BYTES,
1289 /* Read on file with no objects should return zero-filled
1290 * buffers up to file size (we can get non-zero sizes with
1291 * mknod + truncate, then opening file for read. This is a
1292 * common pattern in NFS case, it seems). Bug 6243 */
1294 /* Since there are no objects on OSTs, we have nothing to get
1295 * lock on and so we are forced to access inode->i_size
1298 /* Read beyond end of file */
1299 if (*ppos >= inode->i_size)
/* clamp the request so it does not run past the (MDS-known) size */
1302 if (count > inode->i_size - *ppos)
1303 count = inode->i_size - *ppos;
1304 /* Make sure to correctly adjust the file pos pointer for
1306 notzeroed = clear_user(buf, count);
1315 if (sbi->ll_max_rw_chunk != 0) {
1316 /* first, let's know the end of the current stripe */
1318 obd_extent_calc(sbi->ll_dt_exp, lsm, OBD_CALC_STRIPE_END, &end);
1320 /* correct, the end is beyond the request */
1321 if (end > *ppos + count - 1)
1322 end = *ppos + count - 1;
1324 /* and chunk shouldn't be too large even if striping is wide */
1325 if (end - *ppos > sbi->ll_max_rw_chunk)
1326 end = *ppos + sbi->ll_max_rw_chunk - 1;
1328 end = *ppos + count - 1;
/* take the PR lock over [*ppos, end]; non-blocking for O_NONBLOCK */
1331 node = ll_node_from_inode(inode, *ppos, end, LCK_PR);
1332 tree.lt_fd = LUSTRE_FPRIVATE(file);
1333 rc = ll_tree_lock(&tree, node, buf, count,
1334 file->f_flags & O_NONBLOCK ? LDLM_FL_BLOCK_NOWAIT :0);
1336 GOTO(out, retval = rc);
1338 ll_inode_size_lock(inode, 1);
1340 * Consistency guarantees: following possibilities exist for the
1341 * relation between region being read and real file size at this
1344 * (A): the region is completely inside of the file;
1346 * (B-x): x bytes of region are inside of the file, the rest is
1349 * (C): the region is completely outside of the file.
1351 * This classification is stable under DLM lock acquired by
1352 * ll_tree_lock() above, because to change class, other client has to
1353 * take DLM lock conflicting with our lock. Also, any updates to
1354 * ->i_size by other threads on this client are serialized by
1355 * ll_inode_size_lock(). This guarantees that short reads are handled
1356 * correctly in the face of concurrent writes and truncates.
1358 inode_init_lvb(inode, &lvb);
1359 obd_merge_lvb(ll_i2sbi(inode)->ll_dt_exp, lsm, &lvb, 1);
1361 if (*ppos + count - 1 > kms) {
1362 /* A glimpse is necessary to determine whether we return a
1363 * short read (B) or some zeroes at the end of the buffer (C) */
1364 ll_inode_size_unlock(inode, 1);
1365 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1367 ll_tree_unlock(&tree);
1371 /* region is within kms and, hence, within real file size (A) */
1372 inode->i_size = kms;
1373 ll_inode_size_unlock(inode, 1);
1376 chunk = end - *ppos + 1;
1377 CDEBUG(D_VFSTRACE, "Read ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1378 inode->i_ino, chunk, *ppos, inode->i_size);
1380 /* turn off the kernel's read-ahead */
1381 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
1384 file->f_ra.ra_pages = 0;
1386 /* initialize read-ahead window once per syscall */
1389 bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
1390 bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
1391 ll_ra_read_in(file, &bead);
1395 file_accessed(file);
1396 retval = generic_file_read(file, buf, chunk, ppos);
1397 ll_rw_stats_tally(ll_i2sbi(inode), current->pid, file, count, 0);
1399 ll_tree_unlock(&tree);
/* a full chunk with bytes remaining means another chunked iteration */
1405 if (retval == chunk && count > 0)
1411 ll_ra_read_ex(file, &bead);
/* report total bytes read across all chunks, else the last error */
1412 retval = (sum > 0) ? sum : retval;
1414 CERROR("Read error inode=%lu/%u(%p),size="LPSZ",off=%Ld rc %d\n",
1415 inode->i_ino, inode->i_generation, inode, count, *ppos,
1422 * Write to a file (through the page cache).
/* write(2) entry point: serialize against other writers on this inode
 * via lli_write_sem, take a PW extent lock for the chunk (O_APPEND
 * locks [pos, EOF]), enforce the maxbytes limit, then hand the copy to
 * generic_file_write().  Chunking mirrors ll_file_read().
 * NOTE(review): elided listing — the chunking loop and final RETURN are
 * not visible here. */
1424 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1427 struct inode *inode = file->f_dentry->d_inode;
1428 struct ll_sb_info *sbi = ll_i2sbi(inode);
1429 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1430 struct ll_lock_tree tree;
1431 struct ll_lock_tree_node *node;
1432 loff_t maxbytes = ll_file_maxbytes(inode);
1433 loff_t lock_start, lock_end, end;
1434 ssize_t retval, chunk, sum = 0;
1437 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1438 inode->i_ino, inode->i_generation, inode, count, *ppos);
1440 SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */
1442 /* POSIX, but surprised the VFS doesn't check this already */
1446 /* If file was opened for LL_IOC_LOV_SETSTRIPE but the ioctl wasn't
1447 * called on the file, don't fail the below assertion (bug 2388). */
1448 if (file->f_flags & O_LOV_DELAY_CREATE &&
1449 ll_i2info(inode)->lli_smd == NULL)
1452 LASSERT(ll_i2info(inode)->lli_smd != NULL);
/* one writer at a time per inode on this client */
1454 down(&ll_i2info(inode)->lli_write_sem);
1457 chunk = 0; /* just to fix gcc's warning */
1458 end = *ppos + count - 1;
1460 if (file->f_flags & O_APPEND) {
/* append: actual offset unknown until i_mutex, so lock to EOF */
1462 lock_end = OBD_OBJECT_EOF;
1463 } else if (sbi->ll_max_rw_chunk != 0) {
1464 /* first, let's know the end of the current stripe */
1466 obd_extent_calc(sbi->ll_dt_exp, lsm, OBD_CALC_STRIPE_END, &end);
1468 /* correct, the end is beyond the request */
1469 if (end > *ppos + count - 1)
1470 end = *ppos + count - 1;
1472 /* and chunk shouldn't be too large even if striping is wide */
1473 if (end - *ppos > sbi->ll_max_rw_chunk)
1474 end = *ppos + sbi->ll_max_rw_chunk - 1;
1479 lock_end = *ppos + count - 1;
1481 node = ll_node_from_inode(inode, lock_start, lock_end, LCK_PW);
1484 GOTO(out, retval = PTR_ERR(node));
1486 tree.lt_fd = LUSTRE_FPRIVATE(file);
1487 rc = ll_tree_lock(&tree, node, buf, count,
1488 file->f_flags & O_NONBLOCK ? LDLM_FL_BLOCK_NOWAIT :0);
1490 GOTO(out, retval = rc);
1492 /* this is ok, g_f_w will overwrite this under i_mutex if it races
1493 * with a local truncate, it just makes our maxbyte checking easier */
1494 if (file->f_flags & O_APPEND) {
1495 *ppos = inode->i_size;
1496 end = *ppos + count - 1;
/* POSIX: writing at/past the limit raises SIGXFSZ and fails EFBIG */
1499 if (*ppos >= maxbytes) {
1500 send_sig(SIGXFSZ, current, 0);
1501 GOTO(out, retval = -EFBIG);
1503 if (*ppos + count > maxbytes)
1504 count = maxbytes - *ppos;
1506 /* generic_file_write handles O_APPEND after getting i_mutex */
1507 chunk = end - *ppos + 1;
1508 CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %Lu\n",
1509 inode->i_ino, chunk, *ppos);
1510 retval = generic_file_write(file, buf, chunk, ppos);
1511 ll_rw_stats_tally(ll_i2sbi(inode), current->pid, file, count, 1);
1514 ll_tree_unlock(&tree);
/* a full chunk with bytes remaining means another chunked iteration */
1520 if (retval == chunk && count > 0)
1524 up(&ll_i2info(inode)->lli_write_sem);
1526 retval = (sum > 0) ? sum : retval;
1527 lprocfs_counter_add(ll_i2sbi(inode)->ll_stats, LPROC_LL_WRITE_BYTES,
1528 retval > 0 ? retval : 0);
1533 * Send file content (through pagecache) somewhere with helper
/* sendfile(2) entry point (2.6 kernels only): same PR-lock + kms/size
 * settlement dance as ll_file_read(), then delegate the page-cache walk
 * to generic_file_sendfile() with the caller's actor.  Files with no
 * objects bypass locking entirely. */
1535 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
1536 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1537 read_actor_t actor, void *target)
1539 struct inode *inode = in_file->f_dentry->d_inode;
1540 struct ll_inode_info *lli = ll_i2info(inode);
1541 struct lov_stripe_md *lsm = lli->lli_smd;
1542 struct ll_lock_tree tree;
1543 struct ll_lock_tree_node *node;
1545 struct ll_ra_read bead;
1550 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1551 inode->i_ino, inode->i_generation, inode, count, *ppos);
1553 /* "If nbyte is 0, read() will return 0 and have no other results."
1554 * -- Single Unix Spec */
1558 lprocfs_counter_add(ll_i2sbi(inode)->ll_stats, LPROC_LL_READ_BYTES,
1561 /* File with no objects, nothing to lock */
1563 RETURN(generic_file_sendfile(in_file, ppos, count, actor, target));
1565 node = ll_node_from_inode(inode, *ppos, *ppos + count - 1, LCK_PR);
1566 tree.lt_fd = LUSTRE_FPRIVATE(in_file);
1567 rc = ll_tree_lock(&tree, node, NULL, count,
1568 in_file->f_flags & O_NONBLOCK?LDLM_FL_BLOCK_NOWAIT:0);
1572 ll_inode_size_lock(inode, 1);
1574 * Consistency guarantees: following possibilities exist for the
1575 * relation between region being read and real file size at this
1578 * (A): the region is completely inside of the file;
1580 * (B-x): x bytes of region are inside of the file, the rest is
1583 * (C): the region is completely outside of the file.
1585 * This classification is stable under DLM lock acquired by
1586 * ll_tree_lock() above, because to change class, other client has to
1587 * take DLM lock conflicting with our lock. Also, any updates to
1588 * ->i_size by other threads on this client are serialized by
1589 * ll_inode_size_lock(). This guarantees that short reads are handled
1590 * correctly in the face of concurrent writes and truncates.
1592 inode_init_lvb(inode, &lvb);
1593 obd_merge_lvb(ll_i2sbi(inode)->ll_dt_exp, lsm, &lvb, 1);
1595 if (*ppos + count - 1 > kms) {
1596 /* A glimpse is necessary to determine whether we return a
1597 * short read (B) or some zeroes at the end of the buffer (C) */
1598 ll_inode_size_unlock(inode, 1);
1599 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1603 /* region is within kms and, hence, within real file size (A) */
1604 inode->i_size = kms;
1605 ll_inode_size_unlock(inode, 1);
1608 CDEBUG(D_INFO, "Send ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1609 inode->i_ino, count, *ppos, inode->i_size);
1611 /* turn off the kernel's read-ahead */
1612 in_file->f_ra.ra_pages = 0;
/* initialize llite's own read-ahead window for this call */
1614 bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
1615 bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
1616 ll_ra_read_in(in_file, &bead);
1618 file_accessed(in_file);
1619 retval = generic_file_sendfile(in_file, ppos, count, actor, target);
1620 ll_ra_read_ex(in_file, &bead);
1623 ll_tree_unlock(&tree);
/* LL_IOC_RECREATE_OBJ handler: recreate a lost OST object for this file
 * on the OST index given by the user (root only).  Copies the request
 * from userspace, clones the stripe md, and asks the OSC/LOV layer to
 * recreate via obd_create() with OBD_FL_RECREATE_OBJS set. */
1628 static int ll_lov_recreate_obj(struct inode *inode, struct file *file,
1631 struct ll_inode_info *lli = ll_i2info(inode);
1632 struct obd_export *exp = ll_i2dtexp(inode);
1633 struct ll_recreate_obj ucreatp;
1634 struct obd_trans_info oti = { 0 };
1635 struct obdo *oa = NULL;
1638 struct lov_stripe_md *lsm, *lsm2;
/* object recreation is an administrative operation */
1641 if (!capable (CAP_SYS_ADMIN))
1644 rc = copy_from_user(&ucreatp, (struct ll_recreate_obj *)arg,
1645 sizeof(struct ll_recreate_obj));
/* lli_open_sem serializes against racing open/setstripe on the inode */
1653 down(&lli->lli_open_sem);
1656 GOTO(out, rc = -ENOENT);
1657 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1658 (lsm->lsm_stripe_count));
1660 OBD_ALLOC(lsm2, lsm_size);
1662 GOTO(out, rc = -ENOMEM);
/* describe the object to recreate: id/group from user, ost idx via
 * o_nlink (reused field), and RECREATE flag so the OST reuses the id */
1664 oa->o_id = ucreatp.lrc_id;
1665 oa->o_gr = ucreatp.lrc_group;
1666 oa->o_nlink = ucreatp.lrc_ost_idx;
1667 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1668 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1669 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1670 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1672 oti.oti_objid = NULL;
1673 memcpy(lsm2, lsm, lsm_size);
1674 rc = obd_create(exp, oa, &lsm2, &oti);
1676 OBD_FREE(lsm2, lsm_size);
1679 up(&lli->lli_open_sem);
/* Apply user-supplied striping (lov_user_md) to a file by re-opening it
 * with an IT_OPEN intent carrying the EA, under lli_open_sem.  Fails
 * (politely, with a debug message) if the file is already striped.
 * The intent's open handle is released again via ll_release_openhandle()
 * since the caller only wanted the setstripe side effect. */
1684 static int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1685 int flags, struct lov_user_md *lum,
1688 struct ll_inode_info *lli = ll_i2info(inode);
1689 struct lov_stripe_md *lsm;
1690 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1694 down(&lli->lli_open_sem);
1697 up(&lli->lli_open_sem);
1698 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1703 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1706 if (it_disposition(&oit, DISP_LOOKUP_NEG))
1707 GOTO(out_req_free, rc = -ENOENT);
1708 rc = oit.d.lustre.it_status;
1710 GOTO(out_req_free, rc);
1712 ll_release_openhandle(file->f_dentry, &oit);
1715 up(&lli->lli_open_sem);
1716 ll_intent_release(&oit);
/* error path: drop the MD request pinned by the intent */
1719 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/* LL_IOC_LOV_SETEA handler (root only): copy a lov_user_md with one
 * ost_data entry from userspace and apply it via
 * ll_lov_setstripe_ea_info() with MDS_OPEN_HAS_OBJS|FMODE_WRITE. */
1723 static int ll_lov_setea(struct inode *inode, struct file *file,
1726 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1727 struct lov_user_md *lump;
1728 int lum_size = sizeof(struct lov_user_md) +
1729 sizeof(struct lov_user_ost_data);
1733 if (!capable (CAP_SYS_ADMIN))
1736 OBD_ALLOC(lump, lum_size);
1740 rc = copy_from_user(lump, (struct lov_user_md *)arg, lum_size);
1742 OBD_FREE(lump, lum_size);
1746 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1748 OBD_FREE(lump, lum_size);
1752 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1755 struct lov_user_md lum, *lump = (struct lov_user_md *)arg;
1757 int flags = FMODE_WRITE;
1760 /* Bug 1152: copy properly when this is no longer true */
1761 LASSERT(sizeof(lum) == sizeof(*lump));
1762 LASSERT(sizeof(lum.lmm_objects[0]) == sizeof(lump->lmm_objects[0]));
1763 rc = copy_from_user(&lum, lump, sizeof(lum));
1767 rc = ll_lov_setstripe_ea_info(inode, file, flags, &lum, sizeof(lum));
1769 put_user(0, &lump->lmm_stripe_count);
1770 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1771 0, ll_i2info(inode)->lli_smd, lump);
/* LL_IOC_LOV_GETSTRIPE handler: hand the inode's stripe md to the LOV
 * iocontrol, which serializes it into the user's buffer at @arg. */
1776 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1778 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1783 return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0, lsm,
/* LL_IOC_GROUP_LOCK handler: take a whole-file (0..EOF) LCK_GROUP
 * extent lock with group id @arg, remember its handle in the fd, and
 * mark the fd GROUP_LOCKED|IGNORE_LOCK so normal extent locking is
 * bypassed while the group lock is held. */
1787 static int ll_get_grouplock(struct inode *inode, struct file *file,
1790 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1791 ldlm_policy_data_t policy = { .l_extent = { .start = 0,
1792 .end = OBD_OBJECT_EOF}};
1793 struct lustre_handle lockh = { 0 };
1794 struct ll_inode_info *lli = ll_i2info(inode);
1795 struct lov_stripe_md *lsm = lli->lli_smd;
/* only one group lock per file descriptor */
1799 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1803 policy.l_extent.gid = arg;
1804 if (file->f_flags & O_NONBLOCK)
1805 flags = LDLM_FL_BLOCK_NOWAIT;
1807 rc = ll_extent_lock(fd, inode, lsm, LCK_GROUP, &policy, &lockh, flags);
1811 fd->fd_flags |= LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK;
1813 memcpy(&fd->fd_cwlockh, &lockh, sizeof(lockh));
/* LL_IOC_GROUP_UNLOCK handler: verify the fd actually holds a group
 * lock with gid @arg, clear the group/ignore flags, release the lock
 * via ll_extent_unlock(), and wipe the saved handle. */
1818 static int ll_put_grouplock(struct inode *inode, struct file *file,
1821 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1822 struct ll_inode_info *lli = ll_i2info(inode);
1823 struct lov_stripe_md *lsm = lli->lli_smd;
1827 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1828 /* Ugh, it's already unlocked. */
1832 if (fd->fd_gid != arg) /* Ugh? Unlocking with different gid? */
1835 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
1837 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP, &fd->fd_cwlockh);
1842 memset(&fd->fd_cwlockh, 0, sizeof(fd->fd_cwlockh));
/* Validate a file-join request: the server must advertise JOIN support,
 * both inodes must be regular files, distinct, and the head's size must
 * be a multiple of JOIN_FILE_ALIGN (64K). */
1847 static int join_sanity_check(struct inode *head, struct inode *tail)
1850 if ((ll_i2sbi(head)->ll_flags & LL_SBI_JOIN) == 0) {
1851 CERROR("server do not support join \n");
1854 if (!S_ISREG(tail->i_mode) || !S_ISREG(head->i_mode)) {
1855 CERROR("tail ino %lu and ino head %lu must be regular\n",
1856 head->i_ino, tail->i_ino);
1859 if (head->i_ino == tail->i_ino) {
1860 CERROR("file %lu can not be joined to itself \n", head->i_ino);
1863 if (head->i_size % JOIN_FILE_ALIGN) {
1864 CERROR("hsize %llu must be times of 64K\n", head->i_size);
1870 static int join_file(struct inode *head_inode, struct file *head_filp,
1871 struct file *tail_filp)
1873 struct inode *tail_inode, *tail_parent;
1874 struct dentry *tail_dentry = tail_filp->f_dentry;
1875 struct lookup_intent oit = {.it_op = IT_OPEN,
1876 .it_flags = head_filp->f_flags|O_JOIN_FILE};
1877 struct lustre_handle lockh;
1878 struct md_op_data *op_data;
1879 __u32 hsize = head_inode->i_size >> 32;
1880 __u32 tsize = head_inode->i_size;
1884 tail_dentry = tail_filp->f_dentry;
1885 tail_inode = tail_dentry->d_inode;
1886 tail_parent = tail_dentry->d_parent->d_inode;
1888 op_data = ll_prep_md_op_data(NULL, head_inode, tail_parent,
1889 tail_dentry->d_name.name,
1890 tail_dentry->d_name.len, 0);
1891 if (op_data == NULL)
1894 rc = md_enqueue(ll_i2mdexp(head_inode), LDLM_IBITS, &oit, LCK_CW,
1895 op_data, &lockh, &tsize, 0, ldlm_completion_ast,
1896 ll_md_blocking_ast, &hsize, 0);
1898 ll_finish_md_op_data(op_data);
1902 rc = oit.d.lustre.it_status;
1905 ptlrpc_req_finished((struct ptlrpc_request *)
1906 oit.d.lustre.it_data);
1910 if (oit.d.lustre.it_lock_mode) { /* If we got lock - release it right
1912 ldlm_lock_decref(&lockh, oit.d.lustre.it_lock_mode);
1913 oit.d.lustre.it_lock_mode = 0;
1915 ll_release_openhandle(head_filp->f_dentry, &oit);
1917 ll_intent_release(&oit);
/* LL_IOC_JOIN front end: open the tail file by name, EX-lock both files
 * in ascending-inode order to avoid deadlock with a concurrent join in
 * the opposite direction, sanity-check, then call join_file().  Cleanup
 * is staged through cleanup_phase; on success the head's cached stripe
 * md is freed since the join changed the file's layout on the MDS. */
1921 static int ll_file_join(struct inode *head, struct file *filp,
1922 char *filename_tail)
1924 struct inode *tail = NULL, *first = NULL, *second = NULL;
1925 struct dentry *tail_dentry;
1926 struct file *tail_filp, *first_filp, *second_filp;
1927 struct ll_lock_tree first_tree, second_tree;
1928 struct ll_lock_tree_node *first_node, *second_node;
1929 struct ll_inode_info *hlli = ll_i2info(head), *tlli;
1930 int rc = 0, cleanup_phase = 0;
1933 CDEBUG(D_VFSTRACE, "VFS Op:head=%lu/%u(%p) tail %s\n",
1934 head->i_ino, head->i_generation, head, filename_tail);
1936 tail_filp = filp_open(filename_tail, O_WRONLY, 0644);
1937 if (IS_ERR(tail_filp)) {
1938 CERROR("Can not open tail file %s", filename_tail);
1939 rc = PTR_ERR(tail_filp);
/* hold our own reference on the tail inode across the join */
1942 tail = igrab(tail_filp->f_dentry->d_inode);
1944 tlli = ll_i2info(tail);
1945 tail_dentry = tail_filp->f_dentry;
1946 LASSERT(tail_dentry);
1949 /*reorder the inode for lock sequence*/
1950 first = head->i_ino > tail->i_ino ? head : tail;
1951 second = head->i_ino > tail->i_ino ? tail : head;
1952 first_filp = head->i_ino > tail->i_ino ? filp : tail_filp;
1953 second_filp = head->i_ino > tail->i_ino ? tail_filp : filp;
1955 CDEBUG(D_INFO, "reorder object from %lu:%lu to %lu:%lu \n",
1956 head->i_ino, tail->i_ino, first->i_ino, second->i_ino);
/* EX-lock the whole [0, EOF] range of both files, in order */
1957 first_node = ll_node_from_inode(first, 0, OBD_OBJECT_EOF, LCK_EX);
1958 if (IS_ERR(first_node)){
1959 rc = PTR_ERR(first_node);
1962 first_tree.lt_fd = first_filp->private_data;
1963 rc = ll_tree_lock(&first_tree, first_node, NULL, 0, 0);
1968 second_node = ll_node_from_inode(second, 0, OBD_OBJECT_EOF, LCK_EX);
1969 if (IS_ERR(second_node)){
1970 rc = PTR_ERR(second_node);
1973 second_tree.lt_fd = second_filp->private_data;
1974 rc = ll_tree_lock(&second_tree, second_node, NULL, 0, 0);
1979 rc = join_sanity_check(head, tail);
1983 rc = join_file(head, filp, tail_filp);
/* staged teardown: each phase falls through to undo earlier phases */
1987 switch (cleanup_phase) {
1989 ll_tree_unlock(&second_tree);
1990 obd_cancel_unused(ll_i2dtexp(second),
1991 ll_i2info(second)->lli_smd, 0, NULL);
1993 ll_tree_unlock(&first_tree);
1994 obd_cancel_unused(ll_i2dtexp(first),
1995 ll_i2info(first)->lli_smd, 0, NULL);
1997 filp_close(tail_filp, 0);
2000 if (head && rc == 0) {
/* layout changed on the MDS: drop the stale cached stripe md */
2001 obd_free_memmd(ll_i2sbi(head)->ll_dt_exp,
2003 hlli->lli_smd = NULL;
2008 CERROR("invalid cleanup_phase %d\n", cleanup_phase);
/* Close the MDS open handle carried by an open intent when the caller
 * (e.g. setstripe/join) only wanted the intent's side effect, not a
 * usable file.  Root dentries and intents without DISP_OPEN_OPEN are
 * no-ops.  The och is kept alive if it is still waiting for
 * DONE_WRITING (DEAD_HANDLE_MAGIC check — see the comment at 2041). */
2014 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2016 struct inode *inode = dentry->d_inode;
2017 struct obd_client_handle *och;
2023 /* Root ? Do nothing. */
2024 if (dentry->d_inode->i_sb->s_root == dentry)
2027 /* No open handle to close? Move away */
2028 if (!it_disposition(it, DISP_OPEN_OPEN))
2031 OBD_ALLOC(och, sizeof(*och));
2033 GOTO(out, rc = -ENOMEM);
2035 ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
2036 ll_i2info(inode), it, och);
2038 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
2041 /* Do not free @och is it is waiting for DONE_WRITING. */
2042 if (och->och_fh.cookie == DEAD_HANDLE_MAGIC)
2043 OBD_FREE(och, sizeof(*och));
2045 /* this one is in place of ll_file_open */
2046 ptlrpc_req_finished(it->d.lustre.it_data);
2047 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* ioctl(2) dispatcher for regular Lustre files.  Llite-specific cmds
 * (flags, striping, join, group locks, ACLs, statfs) are handled here;
 * tty ioctls are rejected up front, and anything unrecognized falls
 * through to obd_iocontrol() on the data export. */
2051 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
2054 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2058 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2059 inode->i_generation, inode, cmd);
2061 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2062 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2065 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_IOCTL);
2067 case LL_IOC_GETFLAGS:
2068 /* Get the current value of the file flags */
2069 return put_user(fd->fd_flags, (int *)arg);
2070 case LL_IOC_SETFLAGS:
2071 case LL_IOC_CLRFLAGS:
2072 /* Set or clear specific file flags */
2073 /* XXX This probably needs checks to ensure the flags are
2074 * not abused, and to handle any flag side effects.
2076 if (get_user(flags, (int *) arg))
2079 if (cmd == LL_IOC_SETFLAGS) {
/* IGNORE_LOCK only makes sense for O_DIRECT I/O */
2080 if ((flags & LL_FILE_IGNORE_LOCK) &&
2081 !(file->f_flags & O_DIRECT)) {
2082 CERROR("%s: unable to disable locking on "
2083 "non-O_DIRECT file\n", current->comm);
2087 fd->fd_flags |= flags;
2089 fd->fd_flags &= ~flags;
2092 case LL_IOC_LOV_SETSTRIPE:
2093 RETURN(ll_lov_setstripe(inode, file, arg));
2094 case LL_IOC_LOV_SETEA:
2095 RETURN(ll_lov_setea(inode, file, arg));
2096 case LL_IOC_LOV_GETSTRIPE:
2097 RETURN(ll_lov_getstripe(inode, arg));
2098 case LL_IOC_RECREATE_OBJ:
2099 RETURN(ll_lov_recreate_obj(inode, file, arg));
2100 case EXT3_IOC_GETFLAGS:
2101 case EXT3_IOC_SETFLAGS:
2102 RETURN(ll_iocontrol(inode, file, cmd, arg));
2103 case EXT3_IOC_GETVERSION_OLD:
2104 case EXT3_IOC_GETVERSION:
2105 RETURN(put_user(inode->i_generation, (int *)arg));
/* file join: tail pathname comes from userspace via getname() */
2110 ftail = getname((const char *)arg);
2112 RETURN(PTR_ERR(ftail));
2113 rc = ll_file_join(inode, file, ftail);
2117 case LL_IOC_GROUP_LOCK:
2118 RETURN(ll_get_grouplock(inode, file, arg));
2119 case LL_IOC_GROUP_UNLOCK:
2120 RETURN(ll_put_grouplock(inode, file, arg));
2121 case IOC_OBD_STATFS:
2122 RETURN(ll_obd_statfs(inode, (void *)arg));
2124 /* We need to special case any other ioctls we want to handle,
2125 * to send them to the MDS/OST as appropriate and to properly
2126 * network encode the arg field.
2127 case EXT3_IOC_SETVERSION_OLD:
2128 case EXT3_IOC_SETVERSION:
2130 case LL_IOC_FLUSHCTX:
2131 RETURN(ll_flush_ctx(inode));
2132 case LL_IOC_GETFACL: {
2133 struct rmtacl_ioctl_data ioc;
2135 if (copy_from_user(&ioc, (void *)arg, sizeof(ioc)))
2138 RETURN(ll_ioctl_getfacl(inode, &ioc));
2140 case LL_IOC_SETFACL: {
2141 struct rmtacl_ioctl_data ioc;
2143 if (copy_from_user(&ioc, (void *)arg, sizeof(ioc)))
2146 RETURN(ll_ioctl_setfacl(inode, &ioc));
/* default: pass unknown commands down to the OBD layer */
2149 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
/* llseek(2) entry point.  SEEK_END must glimpse the cluster-wide size
 * first (other clients may have extended the file), then reads i_size
 * under ll_inode_size_lock().  Offsets beyond maxbytes are rejected. */
2154 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2156 struct inode *inode = file->f_dentry->d_inode;
2157 struct ll_inode_info *lli = ll_i2info(inode);
2158 struct lov_stripe_md *lsm = lli->lli_smd;
2161 retval = offset + ((origin == 2) ? inode->i_size :
2162 (origin == 1) ? file->f_pos : 0);
2163 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%Lu=%#Lx(%s)\n",
2164 inode->i_ino, inode->i_generation, inode, retval, retval,
2165 origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
2167 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_LLSEEK);
2168 if (origin == 2) { /* SEEK_END */
2169 int nonblock = 0, rc;
2171 if (file->f_flags & O_NONBLOCK)
2172 nonblock = LDLM_FL_BLOCK_NOWAIT;
/* refresh i_size from the OSTs before seeking relative to EOF */
2175 rc = ll_glimpse_size(inode, nonblock);
2180 ll_inode_size_lock(inode, 0);
2181 offset += inode->i_size;
2182 ll_inode_size_unlock(inode, 0);
2183 } else if (origin == 1) { /* SEEK_CUR */
2184 offset += file->f_pos;
2188 if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
2189 if (offset != file->f_pos) {
2190 file->f_pos = offset;
2191 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
2193 file->f_version = ++event;
2198 CERROR("invalid offset offset "LPX64" inode=%lu/%u(%p)"
2199 "seek (%s) isize "LPU64", f_ops "LPU64"\n",
2200 offset, inode->i_ino, inode->i_generation, inode,
2201 origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR"
2202 : "SEEK_SET", inode->i_size, file->f_pos);
/* fsync(2) entry point: wait for in-flight pagecache writeback, pick up
 * any async write errors recorded on the inode/lsm, sync metadata via
 * md_sync() on the MDS, then sync data [0, EOF] via obd_sync() on the
 * OSTs.  Capabilities (oc) guard both RPCs when enabled. */
2208 int ll_fsync(struct file *file, struct dentry *dentry, int data)
2210 struct inode *inode = dentry->d_inode;
2211 struct ll_inode_info *lli = ll_i2info(inode);
2212 struct lov_stripe_md *lsm = lli->lli_smd;
2213 struct ptlrpc_request *req;
2214 struct obd_capa *oc;
2217 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2218 inode->i_generation, inode);
2220 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_FSYNC);
2222 /* fsync's caller has already called _fdata{sync,write}, we want
2223 * that IO to finish before calling the osc and mdc sync methods */
2224 rc = filemap_fdatawait(inode->i_mapping);
2226 /* catch async errors that were recorded back when async writeback
2227 * failed for pages in this mapping. */
2228 err = lli->lli_async_rc;
2229 lli->lli_async_rc = 0;
2233 err = lov_test_and_clear_async_rc(lsm);
/* metadata sync to the MDS, under an MDS capability if present */
2238 oc = ll_mdscapa_get(inode);
2239 err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2245 ptlrpc_req_finished(req);
2248 struct obdo *oa = obdo_alloc();
2251 RETURN(rc ? rc : -ENOMEM);
/* data sync: describe the object(s) and flush the full extent */
2253 oa->o_id = lsm->lsm_object_id;
2254 oa->o_gr = lsm->lsm_object_gr;
2255 oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2256 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
2257 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
2260 oc = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2261 err = obd_sync(ll_i2sbi(inode)->ll_dt_exp, oa, lsm,
2262 0, OBD_OBJECT_EOF, oc);
/* fcntl/flock entry point: translate a kernel file_lock into an
 * LDLM_FLOCK enqueue on the MDS, keyed by the file's fid.  F_UNLCK is
 * modelled as an LCK_NL request (see comment at 2300), F_GETLK as a
 * TEST_LOCK enqueue, and non-blocking requests use BLOCK_NOWAIT. */
2272 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2274 struct inode *inode = file->f_dentry->d_inode;
2275 struct ll_sb_info *sbi = ll_i2sbi(inode);
2276 struct ldlm_res_id res_id =
2277 { .name = { fid_seq(ll_inode2fid(inode)),
2278 fid_oid(ll_inode2fid(inode)),
2279 fid_ver(ll_inode2fid(inode)),
2281 struct lustre_handle lockh = {0};
2282 ldlm_policy_data_t flock;
2283 ldlm_mode_t mode = 0;
2288 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2289 inode->i_ino, file_lock);
/* copy the POSIX lock description into the LDLM flock policy */
2291 flock.l_flock.pid = file_lock->fl_pid;
2292 flock.l_flock.start = file_lock->fl_start;
2293 flock.l_flock.end = file_lock->fl_end;
2295 switch (file_lock->fl_type) {
2300 /* An unlock request may or may not have any relation to
2301 * existing locks so we may not be able to pass a lock handle
2302 * via a normal ldlm_lock_cancel() request. The request may even
2303 * unlock a byte range in the middle of an existing lock. In
2304 * order to process an unlock request we need all of the same
2305 * information that is given with a normal read or write record
2306 * lock request. To avoid creating another ldlm unlock (cancel)
2307 * message we'll treat a LCK_NL flock request as an unlock. */
2314 CERROR("unknown fcntl lock type: %d\n", file_lock->fl_type);
2329 flags = LDLM_FL_BLOCK_NOWAIT;
2335 flags = LDLM_FL_TEST_LOCK;
2336 /* Save the old mode so that if the mode in the lock changes we
2337 * can decrement the appropriate reader or writer refcount. */
2338 file_lock->fl_type = mode;
2341 CERROR("unknown fcntl lock command: %d\n", cmd);
2345 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2346 "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2347 flags, mode, flock.l_flock.start, flock.l_flock.end);
2349 rc = ldlm_cli_enqueue(sbi->ll_md_exp, NULL, &res_id,
2350 LDLM_FLOCK, &flock, mode, &flags, NULL,
2351 ldlm_flock_completion_ast, NULL, file_lock,
2352 NULL, 0, NULL, &lockh, 0);
/* Test (without acquiring) whether this client already holds a granted
 * MDS inodebits lock covering @bits on @inode, in any of CR/CW/PR
 * modes.  Uses TEST_LOCK so no reference is taken on a match. */
2356 int ll_have_md_lock(struct inode *inode, __u64 bits)
2358 struct lustre_handle lockh;
2359 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2367 fid = &ll_i2info(inode)->lli_fid;
2368 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2370 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2371 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2372 LCK_CR|LCK_CW|LCK_PR, &lockh)) {
/* Post-process a revalidation RPC result: -ENOENT on a non-regular,
 * non-directory inode means it was simply unlinked and is mapped to
 * success (after updating nlink); other errors are logged. */
2379 static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
2380 if (rc == -ENOENT) { /* Already unlinked. Just update nlink
2381 * and return success */
2383 /* This path cannot be hit for regular files unless in
2384 * case of obscure races, so no need to to validate
2386 if (!S_ISREG(inode->i_mode) &&
2387 !S_ISDIR(inode->i_mode))
2392 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
/* Revalidate a dentry's inode against the MDS.  If the server supports
 * ATTRFID, a getattr-by-fid intent lock is used (no name needed) and
 * an unlinked inode gets its dentry dropped; otherwise, when no UPDATE
 * inodebits lock is cached, a plain md_getattr() refreshes attributes.
 * Regular files finish with a glimpse to refresh i_size. */
2400 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
2402 struct inode *inode = dentry->d_inode;
2403 struct ptlrpc_request *req = NULL;
2404 struct ll_sb_info *sbi;
2405 struct obd_export *exp;
2410 CERROR("REPORT THIS LINE TO PETER\n");
2413 sbi = ll_i2sbi(inode);
2415 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2416 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2417 #if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,5,0))
2418 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_REVALIDATE);
2421 exp = ll_i2mdexp(inode);
2423 if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
2424 struct lookup_intent oit = { .it_op = IT_GETATTR };
2425 struct md_op_data *op_data;
2427 /* Call getattr by fid, so do not provide name at all. */
2428 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2429 dentry->d_inode, NULL, 0, 0);
2430 if (op_data == NULL)
/* O_CHECK_STALE asks the MDS to verify the fid is still valid */
2432 it->it_flags |= O_CHECK_STALE;
2433 rc = md_intent_lock(exp, op_data, NULL, 0,
2434 /* we are not interested in name
2437 ll_md_blocking_ast, 0);
2438 ll_finish_md_op_data(op_data);
2439 it->it_flags &= ~ O_CHECK_STALE;
2441 rc = ll_inode_revalidate_fini(inode, rc);
2445 rc = ll_revalidate_it_finish(req, DLM_REPLY_REC_OFF, &oit, dentry);
2447 ll_intent_release(&oit);
2451 /* Unlinked? Unhash dentry, so it is not picked up later by
2452 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2453 here to preserve get_cwd functionality on 2.6.
2455 if (!dentry->d_inode->i_nlink) {
2456 spin_lock(&dcache_lock);
2457 ll_drop_dentry(dentry);
2458 spin_unlock(&dcache_lock);
2461 ll_lookup_finish_locks(&oit, dentry);
2462 } else if (!ll_have_md_lock(dentry->d_inode,
2463 MDS_INODELOCK_UPDATE)) {
/* no cached UPDATE lock: fetch fresh attributes with a getattr RPC */
2464 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2465 obd_valid valid = OBD_MD_FLGETATTR;
2467 struct obd_capa *oc;
2469 if (S_ISREG(inode->i_mode)) {
2470 rc = ll_get_max_mdsize(sbi, &ealen);
2473 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2475 oc = ll_mdscapa_get(inode);
2476 rc = md_getattr(sbi->ll_md_exp, ll_inode2fid(inode), oc, valid,
2480 rc = ll_inode_revalidate_fini(inode, rc);
2484 rc = ll_prep_inode(&inode, req, REPLY_REC_OFF,
2490 /* if object not yet allocated, don't validate size */
2491 if (ll_i2info(inode)->lli_smd == NULL)
2494 /* ll_glimpse_size will prefer locally cached writes if they extend
2496 rc = ll_glimpse_size(inode, 0);
2499 ptlrpc_req_finished(req);
2503 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
/* Fill in @stat for the ->getattr() path, first revalidating the
 * inode with the MDS via ll_inode_revalidate_it() using intent @it.
 * size/blocks are copied under the llite inode size lock so the two
 * values form a mutually consistent snapshot. */
2504 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2505                   struct lookup_intent *it, struct kstat *stat)
2507         struct inode *inode = de->d_inode;
2510         res = ll_inode_revalidate_it(de, it);
2511         lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_GETATTR);
         /* straight attribute copy out of the (now revalidated) inode */
2516         stat->dev = inode->i_sb->s_dev;
2517         stat->ino = inode->i_ino;
2518         stat->mode = inode->i_mode;
2519         stat->nlink = inode->i_nlink;
2520         stat->uid = inode->i_uid;
2521         stat->gid = inode->i_gid;
2522         stat->rdev = kdev_t_to_nr(inode->i_rdev);
2523         stat->atime = inode->i_atime;
2524         stat->mtime = inode->i_mtime;
2525         stat->ctime = inode->i_ctime;
2526         stat->blksize = inode->i_blksize;
         /* lock so i_size and i_blocks cannot change between the reads */
2528         ll_inode_size_lock(inode, 0);
2529         stat->size = inode->i_size;
2530         stat->blocks = inode->i_blocks;
2531         ll_inode_size_unlock(inode, 0);
2535 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2537 struct lookup_intent it = { .it_op = IT_GETATTR };
2539 return ll_getattr_it(mnt, de, &it, stat);
/* Check POSIX ACL permissions for @inode against @mask.  Takes a
 * reference on the cached ACL under lli_lock, evaluates it with
 * posix_acl_permission(), then drops the reference.  The
 * !CONFIG_FS_POSIX_ACL fallback (elided here) presumably returns a
 * "not handled" code so generic_permission() falls through to the
 * mode bits -- confirm against the full source. */
2544 int lustre_check_acl(struct inode *inode, int mask)
2546 #ifdef CONFIG_FS_POSIX_ACL
2547         struct ll_inode_info *lli = ll_i2info(inode);
2548         struct posix_acl *acl;
         /* duplicate under lli_lock so the cached ACL cannot be replaced
          * (and freed) while we are evaluating it */
2552         spin_lock(&lli->lli_lock);
2553         acl = posix_acl_dup(lli->lli_posix_acl);
2554         spin_unlock(&lli->lli_lock);
2559         rc = posix_acl_permission(inode, acl, mask);
2560         posix_acl_release(acl);
2568 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10))
/* ->permission() for kernels >= 2.6.10: defer to generic_permission()
 * with lustre_check_acl as the ACL callback.  On remote-client mounts
 * (LL_SBI_RMT_CLIENT) the check is instead performed against the MDS
 * via lustre_check_remote_perm(). */
2569 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2571         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2572                inode->i_ino, inode->i_generation, inode, mask);
2573         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2574                 return lustre_check_remote_perm(inode, mask);
2575         return generic_permission(inode, mask, lustre_check_acl);
/* Open-coded ->permission() for kernels older than 2.6.10 (where
 * generic_permission() takes no ACL callback).  Mirrors the classic
 * UNIX owner/group/other check, consulting the POSIX ACL for the
 * group class, then falls back to CAP_DAC_OVERRIDE /
 * CAP_DAC_READ_SEARCH.  Remote-client mounts are checked on the MDS.
 * The 2.6 signature carries the (unused here) nameidata argument. */
2578 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
2579 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2581 int ll_inode_permission(struct inode *inode, int mask)
2584         int mode = inode->i_mode;
2587         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2588                inode->i_ino, inode->i_generation, inode, mask);
2590         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2591                 return lustre_check_remote_perm(inode, mask);
         /* deny writes on read-only mounts for regular files, dirs and
          * symlinks (device nodes etc. stay writable through the mount) */
2593         if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
2594             (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
2596         if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
         /* owner class: compare against the "user" permission bits */
2598         if (current->fsuid == inode->i_uid) {
         /* NOTE(review): (mode >> 3) selects the GROUP bits; presumably
          * this sits on the non-owner path with an ACL-present test on
          * the elided lines -- confirm against the full source */
2601                 if (((mode >> 3) & mask & S_IRWXO) != mask)
2603                         rc = lustre_check_acl(inode, mask);
2607                                 goto check_capabilities;
2611         if (in_group_p(inode->i_gid))
         /* other class: "other" permission bits */
2614         if ((mode & mask & S_IRWXO) == mask)
         /* CAP_DAC_OVERRIDE grants everything except executing a file
          * with no exec bit set anywhere */
2618         if (!(mask & MAY_EXEC) ||
2619             (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
2620                 if (capable(CAP_DAC_OVERRIDE))
         /* CAP_DAC_READ_SEARCH grants plain reads, and any non-write
          * access to directories */
2623         if (capable(CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
2624             (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
/* File operations for Lustre regular files on mounts without flock
 * support: ->lock is deliberately left out (commented below). */
2630 struct file_operations ll_file_operations = {
2631         .read           = ll_file_read,
2632         .write          = ll_file_write,
2633         .ioctl          = ll_file_ioctl,
2634         .open           = ll_file_open,
2635         .release        = ll_file_release,
2636         .mmap           = ll_file_mmap,
2637         .llseek         = ll_file_seek,
2638 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
2639         .sendfile       = ll_file_sendfile,
2642         /* .lock = ll_file_flock */
/* Same operations as ll_file_operations but with ->lock wired to
 * ll_file_flock; presumably selected when the mount enables flock --
 * confirm where the two tables are chosen in llite setup. */
2645 struct file_operations ll_file_operations_flock = {
2646         .read           = ll_file_read,
2647         .write          = ll_file_write,
2648         .ioctl          = ll_file_ioctl,
2649         .open           = ll_file_open,
2650         .release        = ll_file_release,
2651         .mmap           = ll_file_mmap,
2652         .llseek         = ll_file_seek,
2653 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
2654         .sendfile       = ll_file_sendfile,
2657         .lock           = ll_file_flock
2661 struct inode_operations ll_file_inode_operations = {
2662 #ifdef LUSTRE_KERNEL_VERSION
2663 .setattr_raw = ll_setattr_raw,
2665 .setattr = ll_setattr,
2666 .truncate = ll_truncate,
2667 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
2668 .getattr = ll_getattr,
2670 .revalidate_it = ll_inode_revalidate_it,
2672 .permission = ll_inode_permission,
2673 .setxattr = ll_setxattr,
2674 .getxattr = ll_getxattr,
2675 .listxattr = ll_listxattr,
2676 .removexattr = ll_removexattr,