1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Lustre Light block IO
6 * Copyright (c) 2002-2004 Cluster File Systems, Inc.
8 * This file is part of Lustre, http://www.lustre.org.
10 * Lustre is free software; you can redistribute it and/or
11 * modify it under the terms of version 2 of the GNU General Public
12 * License as published by the Free Software Foundation.
14 * Lustre is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with Lustre; if not, write to the Free Software
21 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 #define DEBUG_SUBSYSTEM S_LLITE
30 #include <sys/types.h>
32 #include <sys/queue.h>
49 #include "llite_lib.h"
/*
 * Fields of struct llu_io_group (the type sized by LLU_IO_GROUP_SIZE below):
 * the OBD I/O group handle, the inode being accessed and the read/write
 * parameters the group was created with.
 */
53 struct obd_io_group *lig_oig;
54 struct inode *lig_inode;
55 struct lustre_rw_params *lig_params;
/*
 * Per-page arrays.  get_io_group() points these at storage placed directly
 * after the structure, inside the same allocation.
 */
59 struct ll_async_page *lig_llaps;
60 struct page *lig_pages;
61 void *lig_llap_cookies;
/*
 * Bytes needed for one group header plus, for each of x pages, one
 * ll_async_page, one struct page and one cookie of llap_cookie_size bytes.
 */
64 #define LLU_IO_GROUP_SIZE(x) \
65 (sizeof(struct llu_io_group) + \
66 (sizeof(struct ll_async_page) + \
67 sizeof(struct page) + \
68 llap_cookie_size) * (x))
/*
 * Fields of struct llu_io_session: the inode the session operates on and a
 * trailing zero-length array in which llu_file_prwv() records each I/O
 * group it has queued.
 */
72 struct inode *lis_inode;
76 struct llu_io_group *lis_groups[0];
/* Bytes for a session header plus 2 * x pointer-sized slots after it. */
78 #define LLU_IO_SESSION_SIZE(x) \
79 (sizeof(struct llu_io_session) + (x) * 2 * sizeof(void *))
/*
 * Function type for a per-extent I/O routine that takes an iovec array, its
 * length, a file position and a byte count and returns ssize_t.
 * llu_file_prwv() has the same visible leading parameters.
 */
82 typedef ssize_t llu_file_piov_t(const struct iovec *iovec, int iovlen,
83 _SYSIO_OFF_T pos, ssize_t len,
/*
 * Size in bytes of one async-page cookie.  get_io_group() fills it in from
 * obd_prep_async_page() the first time it finds the value still zero.
 */
86 size_t llap_cookie_size;
/*
 * Map a DLM lock to the stripe it covers within the inode's striping
 * metadata.
 *
 * The stripe is obtained from the inode's OBD export through obd_get_info()
 * using a "lock_to_stripe" key that carries the lock and the lsm.  Files
 * with a single stripe are special-cased before the query is issued.  A
 * failed query is reported with CERROR, and the stripe that comes back is
 * asserted to be smaller than lsm_stripe_count.  Callers in this file use
 * the result as an index into lsm_oinfo[].
 */
88 static int llu_lock_to_stripe_offset(struct inode *inode, struct ldlm_lock *lock)
90 struct llu_inode_info *lli = llu_i2info(inode);
91 struct lov_stripe_md *lsm = lli->lli_smd;
92 struct obd_export *exp = llu_i2obdexp(inode);
95 struct ldlm_lock *lock;
96 struct lov_stripe_md *lsm;
97 } key = { .name = "lock_to_stripe", .lock = lock, .lsm = lsm };
98 __u32 stripe, vallen = sizeof(stripe);
102 if (lsm->lsm_stripe_count == 1)
105 /* get our offset in the lov */
106 rc = obd_get_info(exp, sizeof(key), &key, &vallen, &stripe);
108 CERROR("obd_get_info: rc = %d\n", rc);
111 LASSERT(stripe < lsm->lsm_stripe_count);
/*
 * Callback handed to obd_enqueue() for the extent locks taken in this file.
 *
 * A small non-zero data value (below 0x1000) is treated as bogus and logged
 * with LDLM_ERROR.
 *
 * LDLM_CB_BLOCKING: the lock is converted to a handle and cancelled with
 *   ldlm_cli_cancel(); a failure is logged.
 * LDLM_CB_CANCELING: locks whose granted mode differs from the requested
 *   mode are filtered out first.  For the others, the kms of the stripe the
 *   lock belongs to is recomputed with ldlm_extent_shift_kms() while the
 *   namespace lock is held, a debug message is emitted when the value
 *   changes, and the new kms is stored in lsm_oinfo[stripe].loi_kms.
 */
115 static int llu_extent_lock_callback(struct ldlm_lock *lock,
116 struct ldlm_lock_desc *new, void *data,
119 struct lustre_handle lockh = { 0 };
123 if ((unsigned long)data > 0 && (unsigned long)data < 0x1000) {
124 LDLM_ERROR(lock, "cancelling lock with bad data %p", data);
129 case LDLM_CB_BLOCKING:
130 ldlm_lock2handle(lock, &lockh);
131 rc = ldlm_cli_cancel(&lockh);
133 CERROR("ldlm_cli_cancel failed: %d\n", rc);
135 case LDLM_CB_CANCELING: {
137 struct llu_inode_info *lli;
138 struct lov_stripe_md *lsm;
142 /* This lock wasn't granted, don't try to evict pages */
143 if (lock->l_req_mode != lock->l_granted_mode)
146 inode = llu_inode_from_lock(lock);
149 lli= llu_i2info(inode);
156 stripe = llu_lock_to_stripe_offset(inode, lock);
157 l_lock(&lock->l_resource->lr_namespace->ns_lock);
158 kms = ldlm_extent_shift_kms(lock,
159 lsm->lsm_oinfo[stripe].loi_kms);
160 l_unlock(&lock->l_resource->lr_namespace->ns_lock);
161 if (lsm->lsm_oinfo[stripe].loi_kms != kms)
162 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
163 lsm->lsm_oinfo[stripe].loi_kms, kms);
164 lsm->lsm_oinfo[stripe].loi_kms = kms;
/*
 * Glimpse callback handed to obd_enqueue(): replies to a glimpse request
 * with this client's kms for the stripe covered by @lock.
 *
 * @lock: lock being glimpsed; the inode is looked up from it.
 * @reqp: the struct ptlrpc_request on which the reply is packed.
 *
 * Several early exits use -ELDLM_NO_LOCK_DATA, among them the case where
 * the inode has no striping metadata (lli_smd == NULL).  Otherwise the
 * stripe index is resolved (only needed when there is more than one
 * stripe), a one-buffer reply of sizeof(*lvb) bytes is packed, and lvb_size
 * is set to that stripe's loi_kms.  When the result is -ELDLM_NO_LOCK_DATA
 * an empty reply is packed, for the reason given in the comment near the
 * end of the function.
 */
176 static int llu_glimpse_callback(struct ldlm_lock *lock, void *reqp)
178 struct ptlrpc_request *req = reqp;
179 struct inode *inode = llu_inode_from_lock(lock);
180 struct llu_inode_info *lli;
182 int rc, size = sizeof(*lvb), stripe = 0;
186 GOTO(out, rc = -ELDLM_NO_LOCK_DATA);
187 lli = llu_i2info(inode);
189 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
190 if (lli->lli_smd == NULL)
191 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
193 /* First, find out which stripe index this lock corresponds to. */
194 if (lli->lli_smd->lsm_stripe_count > 1)
195 stripe = llu_lock_to_stripe_offset(inode, lock);
197 rc = lustre_pack_reply(req, 1, &size, NULL);
199 CERROR("lustre_pack_reply: %d\n", rc);
203 lvb = lustre_msg_buf(req->rq_repmsg, 0, sizeof(*lvb));
204 lvb->lvb_size = lli->lli_smd->lsm_oinfo[stripe].loi_kms;
206 LDLM_DEBUG(lock, "i_size: %llu -> stripe number %u -> kms "LPU64,
207 (long long)llu_i2stat(inode)->st_size, stripe,lvb->lvb_size);
211 /* These errors are normal races, so we don't want to fill the console
212 * with messages by calling ptlrpc_error() */
213 if (rc == -ELDLM_NO_LOCK_DATA)
214 lustre_pack_reply(req, 0, NULL, NULL);
/*
 * Glimpse @inode: refresh st_size, st_blocks and the three timestamps in
 * its stat structure.
 *
 * A PR extent lock over [0, OBD_OBJECT_EOF] is enqueued with
 * LDLM_FL_HAS_INTENT.  An enqueue failure is logged and returned, with
 * positive codes converted to -EIO.  The lvb is then initialised from the
 * inode, merged through obd_merge_lvb() and copied into the stat structure,
 * after which the PR lock is cancelled with obd_cancel().
 */
220 /* NB: lov_merge_size will prefer locally cached writes if they extend the
221 * file (because it prefers KMS over RSS when larger) */
222 int llu_glimpse_size(struct inode *inode)
224 struct llu_inode_info *lli = llu_i2info(inode);
225 struct intnl_stat *st = llu_i2stat(inode);
226 struct llu_sb_info *sbi = llu_i2sbi(inode);
227 ldlm_policy_data_t policy = { .l_extent = { 0, OBD_OBJECT_EOF } };
228 struct lustre_handle lockh = { 0 };
230 int rc, flags = LDLM_FL_HAS_INTENT;
233 CDEBUG(D_DLMTRACE, "Glimpsing inode %llu\n", (long long)st->st_ino);
235 rc = obd_enqueue(sbi->ll_dt_exp, lli->lli_smd, LDLM_EXTENT, &policy,
236 LCK_PR, &flags, llu_extent_lock_callback,
237 ldlm_completion_ast, llu_glimpse_callback, inode,
238 sizeof(struct ost_lvb), lustre_swab_ost_lvb, &lockh);
240 CERROR("obd_enqueue returned rc %d, returning -EIO\n", rc);
241 RETURN(rc > 0 ? -EIO : rc);
244 inode_init_lvb(inode, &lvb);
245 obd_merge_lvb(sbi->ll_dt_exp, lli->lli_smd, &lvb, 0);
246 st->st_size = lvb.lvb_size;
247 st->st_blocks = lvb.lvb_blocks;
248 st->st_mtime = lvb.lvb_mtime;
249 st->st_atime = lvb.lvb_atime;
250 st->st_ctime = lvb.lvb_ctime;
252 CDEBUG(D_DLMTRACE, "glimpse: size: %llu, blocks: %llu\n",
253 (long long)st->st_size, (long long)st->st_blocks);
255 obd_cancel(sbi->ll_dt_exp, lli->lli_smd, LCK_PR, &lockh);
/*
 * Take an extent lock of the given mode on @inode for the range described
 * by @policy, returning the handle through @lockh.
 *
 * @lockh must not already be in use (asserted).  A check ahead of the
 * enqueue singles out three cases: the file data carries
 * LL_FILE_IGNORE_LOCK, the superblock flags contain LL_SBI_NOLCK, or mode
 * is LCK_NL.  Otherwise the lock is enqueued with obd_enqueue(), an lvb is
 * initialised from the inode and merged, and the cached mtime/atime/ctime
 * are updated from it.  st_size is taken from the lvb when the lock covers
 * the whole file, i.e. [0, OBD_OBJECT_EOF].
 */
260 int llu_extent_lock(struct ll_file_data *fd, struct inode *inode,
261 struct lov_stripe_md *lsm, int mode,
262 ldlm_policy_data_t *policy, struct lustre_handle *lockh,
265 struct llu_sb_info *sbi = llu_i2sbi(inode);
266 struct intnl_stat *st = llu_i2stat(inode);
271 LASSERT(!lustre_handle_is_used(lockh));
272 CLASSERT(ELDLM_OK == 0);
274 /* XXX phil: can we do this? won't it screw the file size up? */
275 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
276 (sbi->ll_flags & LL_SBI_NOLCK) || mode == LCK_NL)
279 CDEBUG(D_DLMTRACE, "Locking inode %llu, start "LPU64" end "LPU64"\n",
280 (long long)st->st_ino, policy->l_extent.start,
281 policy->l_extent.end);
283 rc = obd_enqueue(sbi->ll_dt_exp, lsm, LDLM_EXTENT, policy, mode,
284 &ast_flags, llu_extent_lock_callback,
285 ldlm_completion_ast, llu_glimpse_callback, inode,
286 sizeof(struct ost_lvb), lustre_swab_ost_lvb, lockh);
290 inode_init_lvb(inode, &lvb);
291 obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 1);
292 if (policy->l_extent.start == 0 &&
293 policy->l_extent.end == OBD_OBJECT_EOF)
294 st->st_size = lvb.lvb_size;
297 st->st_mtime = lvb.lvb_mtime;
298 st->st_atime = lvb.lvb_atime;
299 st->st_ctime = lvb.lvb_ctime;
/*
 * Counterpart of llu_extent_lock(): cancels the lock behind @lockh with
 * obd_cancel() on the superblock's ll_dt_exp export.  The same three
 * conditions as in llu_extent_lock() (LL_FILE_IGNORE_LOCK on the file data,
 * LL_SBI_NOLCK on the superblock, mode == LCK_NL) are tested ahead of the
 * cancel.
 */
305 int llu_extent_unlock(struct ll_file_data *fd, struct inode *inode,
306 struct lov_stripe_md *lsm, int mode,
307 struct lustre_handle *lockh)
309 struct llu_sb_info *sbi = llu_i2sbi(inode);
313 CLASSERT(ELDLM_OK == 0);
315 /* XXX phil: can we do this? won't it screw the file size up? */
316 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
317 (sbi->ll_flags & LL_SBI_NOLCK) || mode == LCK_NL)
320 rc = obd_cancel(sbi->ll_dt_exp, lsm, mode, lockh);
/* Value llu_queue_pio() stores in llap_magic when it sets up an async page. */
325 #define LLAP_MAGIC 12346789
/*
 * Per-page state for asynchronous I/O.  llu_queue_pio() fills in the page
 * and inode back-pointers declared below.
 */
327 struct ll_async_page {
331 struct page *llap_page;
332 struct inode *llap_inode;
/*
 * ap_fill_obdo hook of llu_async_page_ops: fill @oa for the async page
 * identified by the cookie @data.
 *
 * o_id is taken from the inode's striping metadata and o_valid starts out
 * as OBD_MD_FLID.  obdo_from_inode() is then asked for type and atime, plus
 * mtime and ctime when @cmd has OBD_BRW_WRITE set.
 */
335 static void llu_ap_fill_obdo(void *data, int cmd, struct obdo *oa)
337 struct ll_async_page *llap;
339 struct lov_stripe_md *lsm;
340 obd_flag valid_flags;
343 llap = LLAP_FROM_COOKIE(data);
344 inode = llap->llap_inode;
345 lsm = llu_i2info(inode)->lli_smd;
347 oa->o_id = lsm->lsm_object_id;
348 oa->o_valid = OBD_MD_FLID;
349 valid_flags = OBD_MD_FLTYPE | OBD_MD_FLATIME;
350 if (cmd & OBD_BRW_WRITE)
351 valid_flags |= OBD_MD_FLMTIME | OBD_MD_FLCTIME;
353 obdo_from_inode(oa, inode, valid_flags);
/*
 * ap_completion hook of llu_async_page_ops.  Clears llap_queued on the
 * async page identified by the cookie @data; when @cmd has OBD_BRW_WRITE
 * set it can log a writeback error carrying the page pointer, its index
 * and @rc.
 */
357 /* called for each page in a completed rpc.*/
358 static void llu_ap_completion(void *data, int cmd, struct obdo *oa, int rc)
360 struct ll_async_page *llap;
364 llap = LLAP_FROM_COOKIE(data);
365 llap->llap_queued = 0;
366 page = llap->llap_page;
369 if (cmd & OBD_BRW_WRITE)
370 CERROR("writeback error on page %p index %ld: %d\n",
371 page, page->index, rc);
/*
 * Async-page operations table: the make_ready and refresh_count hooks are
 * left NULL; only fill_obdo and completion are provided.
 */
376 static struct obd_async_page_ops llu_async_page_ops = {
377 .ap_make_ready = NULL,
378 .ap_refresh_count = NULL,
379 .ap_fill_obdo = llu_ap_fill_obdo,
380 .ap_completion = llu_ap_completion,
/*
 * Queue one contiguous user buffer on an I/O group.
 *
 * @cmd:   compared against OBD_BRW_READ; callers pass OBD_BRW_READ or
 *         OBD_BRW_WRITE.
 * @group: group receiving the pages; new entries start at index
 *         group->lig_npages in its llap, page and cookie arrays.
 * @buf:   start of the buffer.
 * @count: buffer length in bytes.
 * @pos:   file offset of the first byte.
 *
 * Page preparation: for each page touched, the in-page offset, page index
 * and byte count are derived from the file position.  A read reaching
 * st_size is trimmed to end there.  The struct page records buf - offset as
 * its address together with the offset and count, and lig_rwcount grows by
 * the byte count.  lig_npages is advanced once all pages are prepared.
 *
 * Queueing: each prepared page gets its ll_async_page initialised
 * (LLAP_MAGIC, cookie obtained through obd_prep_async_page(), page and
 * inode back-pointers; the cookie is reset to NULL on the failure path) and
 * is queued with obd_queue_group_io() using ASYNC_READY | ASYNC_URGENT |
 * ASYNC_COUNT_STABLE | ASYNC_GROUP_SYNC, after which llap_queued is set.
 *
 * llu_file_prwv() compares the return value against the byte count it
 * requested.
 */
383 static int llu_queue_pio(int cmd, struct llu_io_group *group,
384 char *buf, size_t count, loff_t pos)
386 struct llu_inode_info *lli = llu_i2info(group->lig_inode);
387 struct intnl_stat *st = llu_i2stat(group->lig_inode);
388 struct lov_stripe_md *lsm = lli->lli_smd;
389 struct obd_export *exp = llu_i2obdexp(group->lig_inode);
390 struct page *pages = &group->lig_pages[group->lig_npages],*page = pages;
391 struct ll_async_page *llap = &group->lig_llaps[group->lig_npages];
392 void *llap_cookie = group->lig_llap_cookies +
393 llap_cookie_size * group->lig_npages;
394 int i, rc, npages = 0, ret_bytes = 0;
401 local_lock = group->lig_params->lrp_lock_mode != LCK_NL;
402 /* prepare the pages array */
404 unsigned long index, offset, bytes;
406 offset = (pos & ~PAGE_CACHE_MASK);
407 index = pos >> PAGE_CACHE_SHIFT;
408 bytes = PAGE_CACHE_SIZE - offset;
412 /* prevent read beyond file range */
413 if (/* local_lock && */
414 cmd == OBD_BRW_READ && pos + bytes >= st->st_size) {
415 if (pos >= st->st_size)
417 bytes = st->st_size - pos;
420 /* prepare page for this index */
422 page->addr = buf - offset;
424 page->_offset = offset;
425 page->_count = bytes;
433 group->lig_rwcount += bytes;
437 group->lig_npages += npages;
439 for (i = 0, page = pages; i < npages;
440 i++, page++, llap++, llap_cookie += llap_cookie_size){
441 llap->llap_magic = LLAP_MAGIC;
442 llap->llap_cookie = llap_cookie;
443 rc = obd_prep_async_page(exp, lsm, NULL, page,
444 (obd_off)page->index << PAGE_SHIFT,
446 llap, &llap->llap_cookie);
449 llap->llap_cookie = NULL;
452 CDEBUG(D_CACHE, "llap %p page %p group %p obj off "LPU64"\n",
453 llap, page, llap->llap_cookie,
454 (obd_off)pages->index << PAGE_SHIFT);
455 page->private = (unsigned long)llap;
456 llap->llap_page = page;
457 llap->llap_inode = group->lig_inode;
459 rc = obd_queue_group_io(exp, lsm, NULL, group->lig_oig,
460 llap->llap_cookie, cmd,
461 page->_offset, page->_count,
462 group->lig_params->lrp_brw_flags,
463 ASYNC_READY | ASYNC_URGENT |
464 ASYNC_COUNT_STABLE | ASYNC_GROUP_SYNC);
465 if (!local_lock && cmd == OBD_BRW_READ) {
467 * In OST-side locking case short reads cannot be
470 * The root of the problem is that
472 * kms = lov_merge_size(lsm, 1);
474 * glimpse_size(inode);
478 * logic in the read code (both llite and liblustre)
479 * only works correctly when client holds DLM lock on
480 * [start, end]. Without DLM lock KMS can be
481 * completely out of date, and client can either make
482 * spurious short-read (missing concurrent write), or
483 * return stale data (missing concurrent
484 * truncate). For llite client this is fatal, because
485 * incorrect data are cached and can be later sent
486 * back to the server (vide bug 5047). This is hard to
487 * fix by handling short-reads on the server, as there
488 * is no easy way to communicate file size (or amount
489 * of bytes read/written) back to the client,
490 * _especially_ because OSC pages can be sliced and
491 * dices into multiple RPCs arbitrary. Fortunately,
492 * liblustre doesn't cache data and the worst case is
493 * that we get race with concurrent write or truncate.
501 llap->llap_queued = 1;
/*
 * Allocate and initialise an I/O group for @inode sized for @maxpages pages.
 *
 * llap_cookie_size is filled in on first use.  The group header and its
 * three per-page arrays come from a single LLU_IO_GROUP_SIZE(maxpages)
 * allocation, laid out as: header, ll_async_page array, struct page array,
 * cookie area.  ERR_PTR(-ENOMEM) is returned on an allocation-failure path,
 * and the allocation is freed again on the oig_init() error path.
 */
508 struct llu_io_group * get_io_group(struct inode *inode, int maxpages,
509 struct lustre_rw_params *params)
511 struct llu_io_group *group;
514 if (!llap_cookie_size)
515 llap_cookie_size = obd_prep_async_page(llu_i2obdexp(inode),
519 OBD_ALLOC(group, LLU_IO_GROUP_SIZE(maxpages));
521 return ERR_PTR(-ENOMEM);
524 group->lig_inode = inode;
525 group->lig_maxpages = maxpages;
526 group->lig_params = params;
527 group->lig_llaps = (struct ll_async_page *)(group + 1);
528 group->lig_pages = (struct page *)(&group->lig_llaps[maxpages]);
529 group->lig_llap_cookies = (void *)(&group->lig_pages[maxpages]);
531 rc = oig_init(&group->lig_oig);
533 OBD_FREE(group, LLU_IO_GROUP_SIZE(maxpages));
/*
 * Page budget used when sizing an I/O group for @len bytes described by
 * @iovlen iovec entries: @len rounded up to whole pages, plus 2, plus
 * (@iovlen - 1).
 */
540 static int max_io_pages(ssize_t len, int iovlen)
542 return (((len + PAGE_SIZE -1) / PAGE_SIZE) + 2 + iovlen - 1);
/*
 * Dispose of an I/O group: tear down every async page among the first
 * lig_npages entries that still has a cookie, call I_RELE() on the group's
 * inode, release the OBD I/O group and free the memory using the same
 * LLU_IO_GROUP_SIZE(lig_maxpages) it was allocated with.
 */
546 void put_io_group(struct llu_io_group *group)
548 struct lov_stripe_md *lsm = llu_i2info(group->lig_inode)->lli_smd;
549 struct obd_export *exp = llu_i2obdexp(group->lig_inode);
550 struct ll_async_page *llap = group->lig_llaps;
553 for (i = 0; i < group->lig_npages; i++, llap++) {
554 if (llap->llap_cookie)
555 obd_teardown_async_page(exp, lsm, NULL,
559 I_RELE(group->lig_inode);
561 oig_release(group->lig_oig);
562 OBD_FREE(group, LLU_IO_GROUP_SIZE(group->lig_maxpages));
/*
 * Perform the I/O for one extent of a session.  llu_file_rwx() passes this
 * function to _sysio_enumerate_extents() together with the session, which
 * arrives here as the private pointer.
 *
 * Steps visible in the body:
 *  - liblustre_wait_event(0) is called first so pending events can be
 *    served between extents;
 *  - len == 0 / iovlen == 0 and pos + len > lli_maxbytes are checked;
 *  - lock parameters are built from the session command and open flags, an
 *    I/O group sized by max_io_pages(len, iovlen) is obtained, and the
 *    extent lock is taken with llu_extent_lock();
 *  - for reads, an lvb is merged and, when the locked extent reaches kms,
 *    llu_glimpse_size() is called; O_APPEND is handled in a separate branch;
 *  - each iovec entry is validated (-EFAULT for bad pointers, -EFBIG at or
 *    beyond lli_maxbytes, count trimmed to lli_maxbytes) and queued with
 *    llu_queue_pio();
 *  - the group I/O is triggered and waited for, the extent lock is dropped
 *    and the group is appended to session->lis_groups[].
 *
 * The err_unlock path drops the extent lock, releases the group with
 * put_io_group() and returns the error.
 */
566 ssize_t llu_file_prwv(const struct iovec *iovec, int iovlen,
567 _SYSIO_OFF_T pos, ssize_t len,
570 struct llu_io_session *session = (struct llu_io_session *) private;
571 struct inode *inode = session->lis_inode;
572 struct llu_inode_info *lli = llu_i2info(inode);
573 struct intnl_stat *st = llu_i2stat(inode);
574 struct ll_file_data *fd = lli->lli_file_data;
575 struct lustre_handle lockh = {0};
576 struct lov_stripe_md *lsm = lli->lli_smd;
577 struct obd_export *exp = NULL;
578 struct llu_io_group *iogroup;
579 struct lustre_rw_params p;
582 int err, is_read, iovidx, ret;
586 /* in a large iov read/write we'll be repeatedly called.
587 * so give a chance to answer cancel ast here
589 liblustre_wait_event(0);
591 exp = llu_i2obdexp(inode);
595 if (len == 0 || iovlen == 0)
598 if (pos + len > lli->lli_maxbytes)
601 lustre_build_lock_params(session->lis_cmd, lli->lli_open_flags,
602 lli->lli_sbi->ll_lco.lco_flags,
605 iogroup = get_io_group(inode, max_io_pages(len, iovlen), &p);
607 RETURN(PTR_ERR(iogroup));
609 local_lock = p.lrp_lock_mode != LCK_NL;
611 err = llu_extent_lock(fd, inode, lsm, p.lrp_lock_mode, &p.lrp_policy,
612 &lockh, p.lrp_ast_flags);
616 is_read = (session->lis_cmd == OBD_BRW_READ);
619 * If OST-side locking is used, KMS can be completely out of
620 * date, and, hence, cannot be used for short-read
621 * detection. Rely in OST to handle short reads in that case.
623 inode_init_lvb(inode, &lvb);
624 obd_merge_lvb(exp, lsm, &lvb, 1);
626 /* extent.end is last byte of the range */
627 if (p.lrp_policy.l_extent.end >= kms) {
628 /* A glimpse is necessary to determine whether
629 * we return a short read or some zeroes at
630 * the end of the buffer
632 * In the case of OST-side locking KMS can be
633 * completely out of date and short-reads maybe
634 * mishandled. See llu_queue_pio() for more detailed
637 if ((err = llu_glimpse_size(inode))) {
638 GOTO(err_unlock, err);
643 } else if (lli->lli_open_flags & O_APPEND) {
647 for (iovidx = 0; iovidx < iovlen; iovidx++) {
648 char *buf = (char *) iovec[iovidx].iov_base;
649 size_t count = iovec[iovidx].iov_len;
655 if (IS_BAD_PTR(buf) || IS_BAD_PTR(buf + count)) {
656 GOTO(err_unlock, err = -EFAULT);
660 if (/* local_lock && */ pos >= st->st_size)
663 if (pos >= lli->lli_maxbytes) {
664 GOTO(err_unlock, err = -EFBIG);
666 if (pos + count >= lli->lli_maxbytes)
667 count = lli->lli_maxbytes - pos;
670 ret = llu_queue_pio(session->lis_cmd, iogroup, buf, count, pos);
672 GOTO(err_unlock, err = ret);
676 LASSERT(ret == count);
677 obd_adjust_kms(exp, lsm, pos, 0);
678 /* file size grow immediately */
679 if (pos > st->st_size)
687 LASSERT(len == 0 || is_read); /* libsysio should guarantee this */
689 err = obd_trigger_group_io(exp, lsm, NULL, iogroup->lig_oig);
691 GOTO(err_unlock, err);
693 err = oig_wait(iogroup->lig_oig);
695 CERROR("sync error %d, data corruption possible\n", err);
696 GOTO(err_unlock, err);
699 ret = llu_extent_unlock(fd, inode, lsm, p.lrp_lock_mode, &lockh);
701 CERROR("extent unlock error %d\n", ret);
703 session->lis_groups[session->lis_ngroups++] = iogroup;
707 llu_extent_unlock(fd, inode, lsm, p.lrp_lock_mode, &lockh);
709 put_io_group(iogroup);
710 RETURN((ssize_t)err);
/*
 * Allocate an I/O session of LLU_IO_SESSION_SIZE(ngroups) bytes and record
 * in it the inode, the group limit (lis_max_groups) and the command.
 */
714 struct llu_io_session *get_io_session(struct inode *ino, int ngroups, int cmd)
716 struct llu_io_session *session;
718 OBD_ALLOC(session, LLU_IO_SESSION_SIZE(ngroups));
723 session->lis_inode = ino;
724 session->lis_max_groups = ngroups;
725 session->lis_cmd = cmd;
/*
 * Release a session: every group still recorded in lis_groups[] is handed
 * to put_io_group() and its slot cleared, I_RELE() is called on the session
 * inode, and the session memory is freed using lis_max_groups for the size.
 */
729 static void put_io_session(struct llu_io_session *session)
733 for (i = 0; i < session->lis_ngroups; i++) {
734 if (session->lis_groups[i]) {
735 put_io_group(session->lis_groups[i]);
736 session->lis_groups[i] = NULL;
740 I_RELE(session->lis_inode);
741 OBD_FREE(session, LLU_IO_SESSION_SIZE(session->lis_max_groups));
/*
 * Common body of llu_iop_read() and llu_iop_write(), which call it with a
 * read flag of 1 and 0 respectively.
 *
 * The command is OBD_BRW_READ or OBD_BRW_WRITE depending on that flag.
 * ioctx_xtvlen and ioctx_iovlen are asserted to be non-negative, an empty
 * extent vector is special-cased, and directories and non-regular files
 * are checked for before any session is created.  A session sized for
 * ioctx_xtvlen * 2 groups is then created and _sysio_enumerate_extents()
 * drives llu_file_prwv() over the extents.  Depending on the outcome the
 * session is either stored in ioctx_private (ioctx_cc is asserted to be
 * zero there) or released with put_io_session().
 */
744 static int llu_file_rwx(struct inode *ino,
748 struct llu_io_session *session;
750 int cmd = read ? OBD_BRW_READ : OBD_BRW_WRITE;
753 LASSERT(ioctx->ioctx_xtvlen >= 0);
754 LASSERT(ioctx->ioctx_iovlen >= 0);
756 liblustre_wait_event(0);
758 if (!ioctx->ioctx_xtvlen)
761 /* XXX consider other types later */
762 if (S_ISDIR(llu_i2stat(ino)->st_mode))
764 if (!S_ISREG(llu_i2stat(ino)->st_mode))
767 session = get_io_session(ino, ioctx->ioctx_xtvlen * 2, cmd);
771 cc = _sysio_enumerate_extents(ioctx->ioctx_xtv, ioctx->ioctx_xtvlen,
772 ioctx->ioctx_iov, ioctx->ioctx_iovlen,
773 llu_file_prwv, session);
776 LASSERT(!ioctx->ioctx_cc);
777 ioctx->ioctx_private = session;
780 put_io_session(session);
783 liblustre_wait_event(0);
/*
 * Read entry point: stamps st_atime with CURRENT_TIME, then runs
 * llu_file_rwx() with the read flag set to 1.
 */
787 int llu_iop_read(struct inode *ino,
791 struct intnl_stat *st = llu_i2stat(ino);
792 st->st_atime = CURRENT_TIME;
794 return llu_file_rwx(ino, ioctx, 1);
/*
 * Write entry point: stamps st_mtime and st_ctime with CURRENT_TIME, then
 * runs llu_file_rwx() with the read flag set to 0.
 */
797 int llu_iop_write(struct inode *ino,
800 struct intnl_stat *st = llu_i2stat(ino);
801 st->st_mtime = st->st_ctime = CURRENT_TIME;
803 return llu_file_rwx(ino, ioctx, 0);
/*
 * Completion step for an I/O context whose session was stored in
 * ioctx_private.
 *
 * For each group recorded in the session, oig_wait() is called on its OBD
 * I/O group, the group's lig_rwcount is added to ioctx_cc and the slot in
 * lis_groups[] is cleared.  An error path sets ioctx_cc to -1 and
 * ioctx_errno to -rc.  The session is released with put_io_session() and
 * ioctx_private is reset to NULL; liblustre_wait_event(0) is called on
 * entry and again at the end.
 */
806 int llu_iop_iodone(struct ioctx *ioctx)
808 struct llu_io_session *session;
809 struct llu_io_group *group;
810 int i, err = 0, rc = 0;
813 liblustre_wait_event(0);
815 session = (struct llu_io_session *) ioctx->ioctx_private;
817 LASSERT(!IS_ERR(session));
819 for (i = 0; i < session->lis_ngroups; i++) {
820 group = session->lis_groups[i];
823 err = oig_wait(group->lig_oig);
828 ioctx->ioctx_cc += group->lig_rwcount;
830 session->lis_groups[i] = NULL;
836 ioctx->ioctx_cc = -1;
837 ioctx->ioctx_errno = -rc;
840 put_io_session(session);
841 ioctx->ioctx_private = NULL;
842 liblustre_wait_event(0);