1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Lustre Light block IO
6 * Copyright (c) 2002-2004 Cluster File Systems, Inc.
8 * This file is part of Lustre, http://www.lustre.org.
10 * Lustre is free software; you can redistribute it and/or
11 * modify it under the terms of version 2 of the GNU General Public
12 * License as published by the Free Software Foundation.
14 * Lustre is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with Lustre; if not, write to the Free Software
21 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 #define DEBUG_SUBSYSTEM S_LLITE
30 #include <sys/types.h>
32 #include <sys/queue.h>
49 #include "llite_lib.h"
/*
 * Members of struct llu_io_group.  The struct's opening line and several
 * members that are used further down in this file (lig_npages,
 * lig_maxpages, lig_rwcount) are not visible in this excerpt.
 *
 * get_io_group() allocates the struct and three trailing arrays in one
 * allocation and points lig_llaps, lig_pages and lig_llap_cookies at them.
 */
53 struct obd_io_group *lig_oig; /* set up by oig_init(), dropped by oig_release() */
54 struct inode *lig_inode; /* inode the group does I/O on */
55 struct lustre_rw_params *lig_params; /* parameters handed to get_io_group() */
59 struct ll_async_page *lig_llaps; /* array of lig_maxpages async-page records */
60 struct page *lig_pages; /* array of lig_maxpages page descriptors */
61 void *lig_llap_cookies; /* lig_maxpages cookies of llap_cookie_size bytes each */
/*
 * Number of bytes needed for one struct llu_io_group followed by, for each
 * of its x pages, one struct ll_async_page, one struct page and one cookie
 * of llap_cookie_size bytes.  The same expression is used for OBD_ALLOC in
 * get_io_group() and for OBD_FREE in get_io_group()/put_io_group(), so
 * llap_cookie_size must not change between the two.
 */
64 #define LLU_IO_GROUP_SIZE(x) \
65 (sizeof(struct llu_io_group) + \
66 (sizeof(struct ll_async_page) + \
67 sizeof(struct page) + \
68 llap_cookie_size) * (x))
/*
 * Members of struct llu_io_session.  Only part of the struct is visible in
 * this excerpt; lis_cmd, lis_ngroups and lis_max_groups are used elsewhere
 * in the file.
 */
72 struct inode *lis_inode; /* inode of the session; released in put_io_session() */
76 struct llu_io_group *lis_groups[0]; /* trailing array; llu_file_prwv() stores one group per completed call */
/*
 * Allocation size of a struct llu_io_session: the struct itself plus
 * 2 * x pointer-sized slots for the trailing lis_groups[] array.
 *
 * NOTE(review): llu_file_rwx() already passes ioctx_xtvlen * 2 as the group
 * count to get_io_session(), so the "* 2" here doubles the space a second
 * time - confirm whether that head-room is intentional.
 */
78 #define LLU_IO_SESSION_SIZE(x) \
79 (sizeof(struct llu_io_session) + (x) * 2 * sizeof(void *))
/*
 * Function type of a per-extent I/O worker: it receives an iovec array and
 * its length, a file position and a byte length, and returns ssize_t.  The
 * remaining parameter(s) are not visible in this excerpt.  llu_file_prwv()
 * starts with the same four parameters.
 */
82 typedef ssize_t llu_file_piov_t(const struct iovec *iovec, int iovlen,
83 _SYSIO_OFF_T pos, ssize_t len,
/*
 * Size in bytes of one async-page cookie.  get_io_group() fills it in, while
 * it is still zero, from the return value of obd_prep_async_page(); it is
 * then used to size and index the lig_llap_cookies area of every group.
 */
86 size_t llap_cookie_size;
/*
 * Work out which stripe of the inode's layout a DLM lock belongs to.
 *
 * A "lock_to_stripe" key carrying the lock and the inode's lov_stripe_md is
 * handed to obd_get_info() on the inode's export; the stripe index that
 * comes back is asserted to be below lsm_stripe_count.  A layout with exactly
 * one stripe is special-cased before the query, and a failing query is
 * logged with CERROR.  The statements taken on those two paths and the
 * return statements are not visible in this excerpt.
 */
88 static int llu_lock_to_stripe_offset(struct inode *inode, struct ldlm_lock *lock)
90 struct llu_inode_info *lli = llu_i2info(inode);
91 struct lov_stripe_md *lsm = lli->lli_smd;
92 struct obd_export *exp = llu_i2obdexp(inode);
95 struct ldlm_lock *lock;
96 struct lov_stripe_md *lsm;
97 } key = { .name = "lock_to_stripe", .lock = lock, .lsm = lsm };
98 __u32 stripe, vallen = sizeof(stripe);
102 if (lsm->lsm_stripe_count == 1)
105 /* get our offset in the lov */
106 rc = obd_get_info(exp, sizeof(key), &key, &vallen, &stripe);
108 CERROR("obd_get_info: rc = %d\n", rc);
111 LASSERT(stripe < lsm->lsm_stripe_count);
/*
 * Lock callback installed on the extent locks this file enqueues; it is
 * passed to obd_enqueue() by both llu_glimpse_size() and llu_extent_lock().
 *
 * A data pointer that is non-zero but below 0x1000 is reported through
 * LDLM_ERROR as bad lock data.  Two cases are visible in this excerpt (the
 * switch selector and the trailing parameter(s) are not):
 *
 *   LDLM_CB_BLOCKING  - convert the lock to a handle and cancel it with
 *                       ldlm_cli_cancel(), logging a failure via CERROR.
 *   LDLM_CB_CANCELING - a lock whose requested and granted modes differ is
 *                       treated as never granted (the statement taken is
 *                       not visible).  Otherwise the inode and the stripe
 *                       the lock maps to are looked up, that stripe's
 *                       loi_kms is recomputed with ldlm_extent_shift_kms()
 *                       while the namespace lock is held, and the result
 *                       is stored back (with a debug message when the
 *                       value changes).
 */
115 static int llu_extent_lock_callback(struct ldlm_lock *lock,
116 struct ldlm_lock_desc *new, void *data,
119 struct lustre_handle lockh = { 0 };
123 if ((unsigned long)data > 0 && (unsigned long)data < 0x1000) {
124 LDLM_ERROR(lock, "cancelling lock with bad data %p", data);
129 case LDLM_CB_BLOCKING:
130 ldlm_lock2handle(lock, &lockh);
131 rc = ldlm_cli_cancel(&lockh);
133 CERROR("ldlm_cli_cancel failed: %d\n", rc);
135 case LDLM_CB_CANCELING: {
137 struct llu_inode_info *lli;
138 struct lov_stripe_md *lsm;
142 /* This lock wasn't granted, don't try to evict pages */
143 if (lock->l_req_mode != lock->l_granted_mode)
146 inode = llu_inode_from_lock(lock);
149 lli= llu_i2info(inode);
156 stripe = llu_lock_to_stripe_offset(inode, lock);
157 l_lock(&lock->l_resource->lr_namespace->ns_lock);
158 kms = ldlm_extent_shift_kms(lock,
159 lsm->lsm_oinfo[stripe].loi_kms);
160 l_unlock(&lock->l_resource->lr_namespace->ns_lock);
161 if (lsm->lsm_oinfo[stripe].loi_kms != kms)
162 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
163 lsm->lsm_oinfo[stripe].loi_kms, kms);
164 lsm->lsm_oinfo[stripe].loi_kms = kms;
/*
 * Glimpse callback passed to obd_enqueue() by llu_glimpse_size() and
 * llu_extent_lock(): answers a glimpse request (reqp, used as a
 * struct ptlrpc_request) with this client's loi_kms for the stripe the
 * lock covers.
 *
 * The inode is found from the lock; when the stripe metadata has more than
 * one stripe the index is resolved with llu_lock_to_stripe_offset(),
 * otherwise stripe 0 is used.  A reply with one buffer of sizeof(*lvb) is
 * packed and its lvb_size is set from lsm_oinfo[stripe].loi_kms.
 *
 * When the inode or its stripe metadata is unavailable rc becomes
 * -ELDLM_NO_LOCK_DATA; for that value an empty reply is packed instead of
 * reporting an error.  Some of the tests guarding the GOTO statements, the
 * labels themselves and the return statement are not visible in this
 * excerpt.
 */
176 static int llu_glimpse_callback(struct ldlm_lock *lock, void *reqp)
178 struct ptlrpc_request *req = reqp;
179 struct inode *inode = llu_inode_from_lock(lock);
180 struct llu_inode_info *lli;
182 int rc, size = sizeof(*lvb), stripe = 0;
186 GOTO(out, rc = -ELDLM_NO_LOCK_DATA);
187 lli = llu_i2info(inode);
189 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
190 if (lli->lli_smd == NULL)
191 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
193 /* First, find out which stripe index this lock corresponds to. */
194 if (lli->lli_smd->lsm_stripe_count > 1)
195 stripe = llu_lock_to_stripe_offset(inode, lock);
197 rc = lustre_pack_reply(req, 1, &size, NULL);
199 CERROR("lustre_pack_reply: %d\n", rc);
203 lvb = lustre_msg_buf(req->rq_repmsg, 0, sizeof(*lvb));
204 lvb->lvb_size = lli->lli_smd->lsm_oinfo[stripe].loi_kms;
206 LDLM_DEBUG(lock, "i_size: %llu -> stripe number %u -> kms "LPU64,
207 (long long)llu_i2stat(inode)->st_size, stripe,lvb->lvb_size);
211 /* These errors are normal races, so we don't want to fill the console
212 * with messages by calling ptlrpc_error() */
213 if (rc == -ELDLM_NO_LOCK_DATA)
214 lustre_pack_reply(req, 0, NULL, NULL);
220 /* NB: lov_merge_size will prefer locally cached writes if they extend the
221 * file (because it prefers KMS over RSS when larger) */
/*
 * Refresh the cached st_size, st_blocks and st_mtime of an inode.
 *
 * An LCK_PR extent lock covering [0, OBD_OBJECT_EOF] is enqueued on the
 * ll_osc_exp export with LDLM_FL_HAS_INTENT set.  After the enqueue the
 * three stat fields are recomputed from the stripe metadata with
 * lov_merge_size(), lov_merge_blocks() and lov_merge_mtime(), and the lock
 * is cancelled again with obd_cancel().
 *
 * On enqueue failure the error is logged and a positive rc is turned into
 * -EIO, while a negative rc is returned as is.  The test guarding that path
 * and the final return statement are not visible in this excerpt.
 */
222 int llu_glimpse_size(struct inode *inode)
224 struct llu_inode_info *lli = llu_i2info(inode);
225 struct intnl_stat *st = llu_i2stat(inode);
226 struct llu_sb_info *sbi = llu_i2sbi(inode);
227 ldlm_policy_data_t policy = { .l_extent = { 0, OBD_OBJECT_EOF } };
228 struct lustre_handle lockh = { 0 };
229 int rc, flags = LDLM_FL_HAS_INTENT;
232 CDEBUG(D_DLMTRACE, "Glimpsing inode %llu\n", (long long)st->st_ino);
234 rc = obd_enqueue(sbi->ll_osc_exp, lli->lli_smd, LDLM_EXTENT, &policy,
235 LCK_PR, &flags, llu_extent_lock_callback,
236 ldlm_completion_ast, llu_glimpse_callback, inode,
237 sizeof(struct ost_lvb), lustre_swab_ost_lvb, &lockh);
239 CERROR("obd_enqueue returned rc %d, returning -EIO\n", rc);
240 RETURN(rc > 0 ? -EIO : rc);
243 st->st_size = lov_merge_size(lli->lli_smd, 0);
244 st->st_blocks = lov_merge_blocks(lli->lli_smd);
245 st->st_mtime = lov_merge_mtime(lli->lli_smd, st->st_mtime);
247 CDEBUG(D_DLMTRACE, "glimpse: size: %llu, blocks: %llu\n",
248 (long long)st->st_size, (long long)st->st_blocks);
250 obd_cancel(sbi->ll_osc_exp, lli->lli_smd, LCK_PR, &lockh);
/*
 * Take an extent lock of the given mode over policy->l_extent.
 *
 * lockh must not already be in use (asserted).  Locking is bypassed when
 * the file data has LL_FILE_IGNORE_LOCK set, the superblock info has
 * LL_SBI_NOLCK set, or mode is LCK_NL; the statement taken on that path is
 * not visible in this excerpt.  Otherwise the lock is enqueued on the
 * ll_osc_exp export with llu_extent_lock_callback, ldlm_completion_ast and
 * llu_glimpse_callback as callbacks.
 *
 * After the enqueue st_mtime is refreshed with lov_merge_mtime(), and
 * st_size is refreshed with lov_merge_size(lsm, 1) only when the lock covers
 * the whole range [0, OBD_OBJECT_EOF].  The last parameter of the
 * signature, the declaration of rc/ast_flags and the return statement are
 * not visible here.
 */
255 int llu_extent_lock(struct ll_file_data *fd, struct inode *inode,
256 struct lov_stripe_md *lsm, int mode,
257 ldlm_policy_data_t *policy, struct lustre_handle *lockh,
260 struct llu_sb_info *sbi = llu_i2sbi(inode);
261 struct intnl_stat *st = llu_i2stat(inode);
265 LASSERT(!lustre_handle_is_used(lockh));
266 CLASSERT(ELDLM_OK == 0);
268 /* XXX phil: can we do this? won't it screw the file size up? */
269 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
270 (sbi->ll_flags & LL_SBI_NOLCK) || mode == LCK_NL)
273 CDEBUG(D_DLMTRACE, "Locking inode %llu, start "LPU64" end "LPU64"\n",
274 (long long)st->st_ino, policy->l_extent.start,
275 policy->l_extent.end);
277 rc = obd_enqueue(sbi->ll_osc_exp, lsm, LDLM_EXTENT, policy, mode,
278 &ast_flags, llu_extent_lock_callback,
279 ldlm_completion_ast, llu_glimpse_callback, inode,
280 sizeof(struct ost_lvb), lustre_swab_ost_lvb, lockh);
284 if (policy->l_extent.start == 0 &&
285 policy->l_extent.end == OBD_OBJECT_EOF)
286 st->st_size = lov_merge_size(lsm, 1);
289 st->st_mtime = lov_merge_mtime(lsm, st->st_mtime);
/*
 * Release an extent lock taken with llu_extent_lock().
 *
 * The same bypass conditions as in llu_extent_lock() apply
 * (LL_FILE_IGNORE_LOCK on the file data, LL_SBI_NOLCK on the superblock
 * info, or mode == LCK_NL); the statement taken on that path is not visible
 * in this excerpt.  Otherwise the lock handle is cancelled with obd_cancel()
 * on the ll_osc_exp export.  The return statement is not visible here.
 */
294 int llu_extent_unlock(struct ll_file_data *fd, struct inode *inode,
295 struct lov_stripe_md *lsm, int mode,
296 struct lustre_handle *lockh)
298 struct llu_sb_info *sbi = llu_i2sbi(inode);
302 CLASSERT(ELDLM_OK == 0);
304 /* XXX phil: can we do this? won't it screw the file size up? */
305 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
306 (sbi->ll_flags & LL_SBI_NOLCK) || mode == LCK_NL)
309 rc = obd_cancel(sbi->ll_osc_exp, lsm, mode, lockh);
/* Value llu_queue_pio() stores in llap_magic of every async page it sets up. */
314 #define LLAP_MAGIC 12346789
/*
 * Per-page bookkeeping record used for asynchronous group I/O.  Only two
 * members are visible in this excerpt; llap_magic, llap_cookie and
 * llap_queued are used elsewhere in the file.
 */
316 struct ll_async_page {
320 struct page *llap_page; /* page descriptor this record belongs to */
321 struct inode *llap_inode; /* inode the page is read from / written to */
/*
 * ap_fill_obdo hook of llu_async_page_ops: fill in the obdo for a page.
 *
 * data is converted to the owning ll_async_page with LLAP_FROM_COOKIE.
 * o_id is taken from the inode's lsm_object_id and o_valid starts out as
 * OBD_MD_FLID.  obdo_from_inode() is then asked for the inode type and
 * atime, and additionally for mtime and ctime when cmd has OBD_BRW_WRITE
 * set.
 */
324 static void llu_ap_fill_obdo(void *data, int cmd, struct obdo *oa)
326 struct ll_async_page *llap;
328 struct lov_stripe_md *lsm;
329 obd_flag valid_flags;
332 llap = LLAP_FROM_COOKIE(data);
333 inode = llap->llap_inode;
334 lsm = llu_i2info(inode)->lli_smd;
336 oa->o_id = lsm->lsm_object_id;
337 oa->o_valid = OBD_MD_FLID;
338 valid_flags = OBD_MD_FLTYPE | OBD_MD_FLATIME;
339 if (cmd & OBD_BRW_WRITE)
340 valid_flags |= OBD_MD_FLMTIME | OBD_MD_FLCTIME;
342 obdo_from_inode(oa, inode, valid_flags);
346 /* called for each page in a completed rpc.*/
/*
 * ap_completion hook of llu_async_page_ops.
 *
 * data is converted to the owning ll_async_page with LLAP_FROM_COOKIE and
 * its llap_queued flag is cleared.  For commands with OBD_BRW_WRITE set a
 * "writeback error" message with the page, its index and rc is logged; the
 * outer condition guarding that message (presumably a non-zero rc - confirm)
 * is not visible in this excerpt.
 */
347 static void llu_ap_completion(void *data, int cmd, struct obdo *oa, int rc)
349 struct ll_async_page *llap;
353 llap = LLAP_FROM_COOKIE(data);
354 llap->llap_queued = 0;
355 page = llap->llap_page;
358 if (cmd & OBD_BRW_WRITE)
359 CERROR("writeback error on page %p index %ld: %d\n",
360 page, page->index, rc);
/*
 * Async-page operations handed to obd_prep_async_page().  Only the
 * fill-obdo and completion hooks are provided; ap_make_ready and
 * ap_refresh_count are explicitly left NULL.
 */
365 static struct obd_async_page_ops llu_async_page_ops = {
366 .ap_make_ready = NULL,
367 .ap_refresh_count = NULL,
368 .ap_fill_obdo = llu_ap_fill_obdo,
369 .ap_completion = llu_ap_completion,
/*
 * Queue one contiguous user buffer (buf, count bytes at file offset pos)
 * on an I/O group.
 *
 * New entries are appended after the lig_npages entries the group already
 * holds: page descriptors in lig_pages, async-page records in lig_llaps and
 * cookies in lig_llap_cookies (llap_cookie_size bytes apart).
 *
 * First pass (its loop header is not visible in this excerpt): the range is
 * cut at PAGE_CACHE_SIZE boundaries.  For each piece the in-page offset,
 * page index and byte count are computed; page->addr is set to buf - offset
 * so that it addresses the start of the page-sized window containing buf.
 * For OBD_BRW_READ a piece reaching st_size is clipped to end at st_size,
 * and a position at or past st_size is special-cased (statement not
 * visible).  lig_rwcount grows by the bytes of each piece and lig_npages by
 * the number of pieces.
 *
 * Second pass: every new page gets LLAP_MAGIC and its cookie, is prepared
 * with obd_prep_async_page() (llap_cookie is reset to NULL on a path whose
 * condition is not visible, presumably failure - confirm), is linked to its
 * llap and the group's inode, and is queued with obd_queue_group_io() using
 * the group's lrp_brw_flags and
 * ASYNC_READY | ASYNC_URGENT | ASYNC_COUNT_STABLE | ASYNC_GROUP_SYNC.
 * llap_queued is then set to 1.
 *
 * local_lock is true when the group's lock mode is not LCK_NL; the long
 * comment in the body explains why short reads cannot be detected reliably
 * without a client-side lock.
 *
 * The return statements are not visible here; ret_bytes is declared for the
 * result and llu_file_prwv() asserts that the value equals count in one of
 * its branches.
 *
 * NOTE(review): the CDEBUG in the second pass prints pages->index (the first
 * page queued by this call) rather than page->index, and labels
 * llap->llap_cookie as "group" - looks like a slip in the debug message.
 * NOTE(review): the first pass uses PAGE_CACHE_SHIFT while the second pass
 * uses PAGE_SHIFT for the same index - confirm the two are identical here.
 */
372 static int llu_queue_pio(int cmd, struct llu_io_group *group,
373 char *buf, size_t count, loff_t pos)
375 struct llu_inode_info *lli = llu_i2info(group->lig_inode);
376 struct intnl_stat *st = llu_i2stat(group->lig_inode);
377 struct lov_stripe_md *lsm = lli->lli_smd;
378 struct obd_export *exp = llu_i2obdexp(group->lig_inode);
379 struct page *pages = &group->lig_pages[group->lig_npages],*page = pages;
380 struct ll_async_page *llap = &group->lig_llaps[group->lig_npages];
381 void *llap_cookie = group->lig_llap_cookies +
382 llap_cookie_size * group->lig_npages;
383 int i, rc, npages = 0, ret_bytes = 0;
390 local_lock = group->lig_params->lrp_lock_mode != LCK_NL;
391 /* prepare the pages array */
393 unsigned long index, offset, bytes;
395 offset = (pos & ~PAGE_CACHE_MASK);
396 index = pos >> PAGE_CACHE_SHIFT;
397 bytes = PAGE_CACHE_SIZE - offset;
401 /* prevent read beyond file range */
402 if (/* local_lock && */
403 cmd == OBD_BRW_READ && pos + bytes >= st->st_size) {
404 if (pos >= st->st_size)
406 bytes = st->st_size - pos;
409 /* prepare page for this index */
411 page->addr = buf - offset;
413 page->_offset = offset;
414 page->_count = bytes;
422 group->lig_rwcount += bytes;
426 group->lig_npages += npages;
428 for (i = 0, page = pages; i < npages;
429 i++, page++, llap++, llap_cookie += llap_cookie_size){
430 llap->llap_magic = LLAP_MAGIC;
431 llap->llap_cookie = llap_cookie;
432 rc = obd_prep_async_page(exp, lsm, NULL, page,
433 (obd_off)page->index << PAGE_SHIFT,
435 llap, &llap->llap_cookie);
438 llap->llap_cookie = NULL;
441 CDEBUG(D_CACHE, "llap %p page %p group %p obj off "LPU64"\n",
442 llap, page, llap->llap_cookie,
443 (obd_off)pages->index << PAGE_SHIFT);
444 page->private = (unsigned long)llap;
445 llap->llap_page = page;
446 llap->llap_inode = group->lig_inode;
448 rc = obd_queue_group_io(exp, lsm, NULL, group->lig_oig,
449 llap->llap_cookie, cmd,
450 page->_offset, page->_count,
451 group->lig_params->lrp_brw_flags,
452 ASYNC_READY | ASYNC_URGENT |
453 ASYNC_COUNT_STABLE | ASYNC_GROUP_SYNC);
454 if (!local_lock && cmd == OBD_BRW_READ) {
456 * In OST-side locking case short reads cannot be
459 * The root of the problem is that
461 * kms = lov_merge_size(lsm, 1);
463 * glimpse_size(inode);
467 * logic in the read code (both llite and liblustre)
468 * only works correctly when client holds DLM lock on
469 * [start, end]. Without DLM lock KMS can be
470 * completely out of date, and client can either make
471 * spurious short-read (missing concurrent write), or
472 * return stale data (missing concurrent
473 * truncate). For llite client this is fatal, because
474 * incorrect data are cached and can be later sent
475 * back to the server (vide bug 5047). This is hard to
476 * fix by handling short-reads on the server, as there
477 * is no easy way to communicate file size (or amount
478 * of bytes read/written) back to the client,
479 * _especially_ because OSC pages can be sliced and
480 * dices into multiple RPCs arbitrary. Fortunately,
481 * liblustre doesn't cache data and the worst case is
482 * that we get race with concurrent write or truncate.
490 llap->llap_queued = 1;
/*
 * Allocate and initialise an I/O group with room for maxpages pages.
 *
 * While llap_cookie_size is still zero it is obtained from
 * obd_prep_async_page() on the inode's export.  The group, its async-page
 * records, its page descriptors and its cookies are carved out of a single
 * OBD_ALLOC of LLU_IO_GROUP_SIZE(maxpages) bytes, laid out in that order.
 * lig_oig is then set up with oig_init(); the allocation is freed again on
 * a path whose condition is not visible here (presumably oig_init()
 * failure - confirm).
 *
 * Returns ERR_PTR(-ENOMEM) when the allocation fails.  The other return
 * statements are not visible in this excerpt; llu_file_prwv() treats the
 * result as an ERR_PTR-style pointer.
 */
497 struct llu_io_group * get_io_group(struct inode *inode, int maxpages,
498 struct lustre_rw_params *params)
500 struct llu_io_group *group;
503 if (!llap_cookie_size)
504 llap_cookie_size = obd_prep_async_page(llu_i2obdexp(inode),
508 OBD_ALLOC(group, LLU_IO_GROUP_SIZE(maxpages));
510 return ERR_PTR(-ENOMEM);
513 group->lig_inode = inode;
514 group->lig_maxpages = maxpages;
515 group->lig_params = params;
516 group->lig_llaps = (struct ll_async_page *)(group + 1);
517 group->lig_pages = (struct page *)(&group->lig_llaps[maxpages]);
518 group->lig_llap_cookies = (void *)(&group->lig_pages[maxpages]);
520 rc = oig_init(&group->lig_oig);
522 OBD_FREE(group, LLU_IO_GROUP_SIZE(maxpages));
/*
 * Page budget for one extent: len rounded up to whole PAGE_SIZE pages, plus
 * 2, plus one more for every iovec segment beyond the first.
 * llu_file_prwv() uses the result as the maxpages argument of
 * get_io_group().
 * NOTE(review): presumably the extra pages cover an unaligned start/end and
 * segment boundaries that fall inside a page - confirm.
 */
529 static int max_io_pages(ssize_t len, int iovlen)
531 return (((len + PAGE_SIZE -1) / PAGE_SIZE) + 2 + iovlen - 1);
/*
 * Tear down an I/O group created by get_io_group().
 *
 * Every one of the lig_npages async-page records that still has a cookie is
 * handed to obd_teardown_async_page().  The group's inode is then released
 * with I_RELE, the obd_io_group is dropped with oig_release(), and the
 * single allocation is freed using the size derived from lig_maxpages.
 */
535 void put_io_group(struct llu_io_group *group)
537 struct lov_stripe_md *lsm = llu_i2info(group->lig_inode)->lli_smd;
538 struct obd_export *exp = llu_i2obdexp(group->lig_inode);
539 struct ll_async_page *llap = group->lig_llaps;
542 for (i = 0; i < group->lig_npages; i++, llap++) {
543 if (llap->llap_cookie)
544 obd_teardown_async_page(exp, lsm, NULL,
548 I_RELE(group->lig_inode);
550 oig_release(group->lig_oig);
551 OBD_FREE(group, LLU_IO_GROUP_SIZE(group->lig_maxpages));
/*
 * Read or write one extent of an I/O session.  llu_file_rwx() passes this
 * function to _sysio_enumerate_extents() together with the session, which
 * arrives here as private (that parameter's declaration is not visible in
 * this excerpt).
 *
 * Visible steps:
 *  - liblustre_wait_event(0) is called first so that pending events can be
 *    serviced between the repeated calls of a large request.
 *  - len == 0 || iovlen == 0 and pos + len > lli_maxbytes are checked up
 *    front (the statements taken are not visible).
 *  - lustre_build_lock_params() fills the lock parameters p, and an I/O
 *    group sized by max_io_pages(len, iovlen) is obtained.
 *  - The extent is locked with llu_extent_lock() using p.
 *  - When the lock extent ends at or beyond lov_merge_size(lsm, 1),
 *    llu_glimpse_size() is called to refresh the size; a separate branch
 *    exists for O_APPEND opens (its body is not visible).
 *  - Each iovec segment is checked with IS_BAD_PTR (-EFAULT), compared
 *    against st_size or clamped to lli_maxbytes (-EFBIG when pos is already
 *    at or past that limit), and queued with llu_queue_pio().  One branch
 *    asserts that the queued amount equals count, calls obd_adjust_kms()
 *    and compares pos with st_size.  The is_read tests selecting between
 *    these branches are not visible.
 *  - The group I/O is started with obd_trigger_group_io() and waited for
 *    with oig_wait(); the extent is unlocked and the group is appended to
 *    session->lis_groups.
 *  - On the error path the extent is unlocked, the group is released with
 *    put_io_group() and err is returned.
 *
 * The success return value is not visible in this excerpt.
 */
555 ssize_t llu_file_prwv(const struct iovec *iovec, int iovlen,
556 _SYSIO_OFF_T pos, ssize_t len,
559 struct llu_io_session *session = (struct llu_io_session *) private;
560 struct inode *inode = session->lis_inode;
561 struct llu_inode_info *lli = llu_i2info(inode);
562 struct intnl_stat *st = llu_i2stat(inode);
563 struct ll_file_data *fd = lli->lli_file_data;
564 struct lustre_handle lockh = {0};
565 struct lov_stripe_md *lsm = lli->lli_smd;
566 struct obd_export *exp = NULL;
567 struct llu_io_group *iogroup;
568 struct lustre_rw_params p;
570 int err, is_read, iovidx, ret;
574 /* in a large iov read/write we'll be repeatedly called.
575 * so give a chance to answer cancel ast here
577 liblustre_wait_event(0);
579 exp = llu_i2obdexp(inode);
583 if (len == 0 || iovlen == 0)
586 if (pos + len > lli->lli_maxbytes)
589 lustre_build_lock_params(session->lis_cmd, lli->lli_open_flags,
590 lli->lli_sbi->ll_lco.lco_flags,
593 iogroup = get_io_group(inode, max_io_pages(len, iovlen), &p);
595 RETURN(PTR_ERR(iogroup));
597 local_lock = p.lrp_lock_mode != LCK_NL;
599 err = llu_extent_lock(fd, inode, lsm, p.lrp_lock_mode, &p.lrp_policy,
600 &lockh, p.lrp_ast_flags);
604 is_read = (session->lis_cmd == OBD_BRW_READ);
607 * If OST-side locking is used, KMS can be completely out of
608 * date, and, hence, cannot be used for short-read
609 * detection. Rely in OST to handle short reads in that case.
611 kms = lov_merge_size(lsm, 1);
612 /* extent.end is last byte of the range */
613 if (p.lrp_policy.l_extent.end >= kms) {
614 /* A glimpse is necessary to determine whether
615 * we return a short read or some zeroes at
616 * the end of the buffer
618 * In the case of OST-side locking KMS can be
619 * completely out of date and short-reads maybe
620 * mishandled. See llu_queue_pio() for more detailed
623 if ((err = llu_glimpse_size(inode))) {
624 GOTO(err_unlock, err);
629 } else if (lli->lli_open_flags & O_APPEND) {
633 for (iovidx = 0; iovidx < iovlen; iovidx++) {
634 char *buf = (char *) iovec[iovidx].iov_base;
635 size_t count = iovec[iovidx].iov_len;
641 if (IS_BAD_PTR(buf) || IS_BAD_PTR(buf + count)) {
642 GOTO(err_unlock, err = -EFAULT);
646 if (/* local_lock && */ pos >= st->st_size)
649 if (pos >= lli->lli_maxbytes) {
650 GOTO(err_unlock, err = -EFBIG);
652 if (pos + count >= lli->lli_maxbytes)
653 count = lli->lli_maxbytes - pos;
656 ret = llu_queue_pio(session->lis_cmd, iogroup, buf, count, pos);
658 GOTO(err_unlock, err = ret);
662 LASSERT(ret == count);
663 obd_adjust_kms(exp, lsm, pos, 0);
664 /* file size grow immediately */
665 if (pos > st->st_size)
673 LASSERT(len == 0 || is_read); /* libsysio should guarantee this */
675 err = obd_trigger_group_io(exp, lsm, NULL, iogroup->lig_oig);
677 GOTO(err_unlock, err);
679 err = oig_wait(iogroup->lig_oig);
681 CERROR("sync error %d, data corruption possible\n", err);
682 GOTO(err_unlock, err);
685 ret = llu_extent_unlock(fd, inode, lsm, p.lrp_lock_mode, &lockh);
687 CERROR("extent unlock error %d\n", ret);
689 session->lis_groups[session->lis_ngroups++] = iogroup;
693 llu_extent_unlock(fd, inode, lsm, p.lrp_lock_mode, &lockh);
695 put_io_group(iogroup);
696 RETURN((ssize_t)err);
/*
 * Allocate an I/O session for inode ino with room for ngroups groups and
 * remember the command cmd.  The allocation uses
 * LLU_IO_SESSION_SIZE(ngroups).  The allocation-failure handling and the
 * return statement are not visible in this excerpt.
 */
700 struct llu_io_session *get_io_session(struct inode *ino, int ngroups, int cmd)
702 struct llu_io_session *session;
704 OBD_ALLOC(session, LLU_IO_SESSION_SIZE(ngroups));
709 session->lis_inode = ino;
710 session->lis_max_groups = ngroups;
711 session->lis_cmd = cmd;
/*
 * Release an I/O session: every group still recorded in lis_groups[] is
 * released with put_io_group() and its slot cleared, the session's inode is
 * released with I_RELE, and the session memory is freed using the size
 * derived from lis_max_groups.
 */
715 static void put_io_session(struct llu_io_session *session)
719 for (i = 0; i < session->lis_ngroups; i++) {
720 if (session->lis_groups[i]) {
721 put_io_group(session->lis_groups[i]);
722 session->lis_groups[i] = NULL;
726 I_RELE(session->lis_inode);
727 OBD_FREE(session, LLU_IO_SESSION_SIZE(session->lis_max_groups));
/*
 * Common body of llu_iop_read() (read != 0, OBD_BRW_READ) and
 * llu_iop_write() (read == 0, OBD_BRW_WRITE).
 *
 * ioctx_xtvlen and ioctx_iovlen are asserted to be non-negative, and a
 * request with no extents is special-cased.  Directories and other
 * non-regular files are filtered out (the values returned on these paths
 * are not visible in this excerpt).  A session with room for
 * ioctx_xtvlen * 2 groups is created and llu_file_prwv() is run over the
 * request's extents through _sysio_enumerate_extents().
 *
 * Depending on a condition that is not visible here (presumably the value
 * of cc - confirm), either the session is stored in ioctx->ioctx_private,
 * after asserting that ioctx_cc is zero, for llu_iop_iodone() to pick up,
 * or it is released with put_io_session().  liblustre_wait_event(0) is
 * called both before and after the work.
 */
730 static int llu_file_rwx(struct inode *ino,
734 struct llu_io_session *session;
736 int cmd = read ? OBD_BRW_READ : OBD_BRW_WRITE;
739 LASSERT(ioctx->ioctx_xtvlen >= 0);
740 LASSERT(ioctx->ioctx_iovlen >= 0);
742 liblustre_wait_event(0);
744 if (!ioctx->ioctx_xtvlen)
747 /* XXX consider other types later */
748 if (S_ISDIR(llu_i2stat(ino)->st_mode))
750 if (!S_ISREG(llu_i2stat(ino)->st_mode))
753 session = get_io_session(ino, ioctx->ioctx_xtvlen * 2, cmd);
757 cc = _sysio_enumerate_extents(ioctx->ioctx_xtv, ioctx->ioctx_xtvlen,
758 ioctx->ioctx_iov, ioctx->ioctx_iovlen,
759 llu_file_prwv, session);
762 LASSERT(!ioctx->ioctx_cc);
763 ioctx->ioctx_private = session;
766 put_io_session(session);
769 liblustre_wait_event(0);
/* Read entry point: forwards to llu_file_rwx() with the read flag set to 1. */
773 int llu_iop_read(struct inode *ino,
776 return llu_file_rwx(ino, ioctx, 1);
/*
 * Write entry point.  Before forwarding to llu_file_rwx() with the read
 * flag set to 0, the inode's mtime and atime are set to CURRENT_TIME through
 * llu_setattr_raw() (ATTR_MTIME | ATTR_ATIME | ATTR_RAW).  A failure of that
 * update is logged; the visible code gives no sign that it stops the write
 * (see the XXX note in the body).
 *
 * NOTE(review): the CERROR format string below has no trailing newline,
 * unlike the other CERROR calls in this file.
 */
779 int llu_iop_write(struct inode *ino,
785 memset(&iattr, 0, sizeof(iattr));
786 iattr.ia_mtime = iattr.ia_atime = CURRENT_TIME;
787 iattr.ia_valid = ATTR_MTIME | ATTR_ATIME | ATTR_RAW;
789 liblustre_wait_event(0);
790 rc = llu_setattr_raw(ino, &iattr);
792 CERROR("failed to set mtime/atime during write: %d", rc);
793 /* XXX should continue or return error? */
796 return llu_file_rwx(ino, ioctx, 0);
/*
 * Completion step for an I/O context whose session was stored in
 * ioctx_private by llu_file_rwx().
 *
 * The session is asserted not to be an error pointer.  For each recorded
 * group the visible code waits with oig_wait(), adds the group's lig_rwcount
 * to ioctx_cc and clears the slot in lis_groups[].  On the error path
 * ioctx_cc is set to -1 and ioctx_errno to -rc.  The session is then
 * released with put_io_session() and ioctx_private is cleared.
 *
 * The conditions that connect these statements and the return statement are
 * not visible in this excerpt.
 */
799 int llu_iop_iodone(struct ioctx *ioctx)
801 struct llu_io_session *session;
802 struct llu_io_group *group;
803 int i, err = 0, rc = 0;
806 liblustre_wait_event(0);
808 session = (struct llu_io_session *) ioctx->ioctx_private;
810 LASSERT(!IS_ERR(session));
812 for (i = 0; i < session->lis_ngroups; i++) {
813 group = session->lis_groups[i];
816 err = oig_wait(group->lig_oig);
821 ioctx->ioctx_cc += group->lig_rwcount;
823 session->lis_groups[i] = NULL;
829 ioctx->ioctx_cc = -1;
830 ioctx->ioctx_errno = -rc;
833 put_io_session(session);
834 ioctx->ioctx_private = NULL;
835 liblustre_wait_event(0);