4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2013, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
37 #define DEBUG_SUBSYSTEM S_LOV
40 #include <libcfs/libcfs.h>
42 #include <liblustre.h>
45 #include <obd_class.h>
47 #include <lustre/lustre_idl.h>
49 #include "lov_internal.h"
/*
 * Put a freshly allocated request set into its initial state: the
 * completes/success/finish_checked counters are zeroed, the request list
 * head and the wait queue are initialized, and the reference count
 * starts at 1.
 * NOTE(review): the embedded line numbers skip 52-53 and 57, so further
 * field initializations may exist that are not visible in this excerpt.
 */
51 static void lov_init_set(struct lov_request_set *set)
54 cfs_atomic_set(&set->set_completes, 0);
55 cfs_atomic_set(&set->set_success, 0);
56 cfs_atomic_set(&set->set_finish_checked, 0);
58 CFS_INIT_LIST_HEAD(&set->set_list);
59 cfs_atomic_set(&set->set_refcount, 1);
60 init_waitqueue_head(&set->set_waitq);
/*
 * Tear down the requests queued on a set.  Every lov_request on
 * set->set_list is unlinked (the _safe iterator is needed because the
 * current entry is removed while walking) and the obdo and statfs
 * buffers hanging off its rq_oi are released when non-NULL.
 * NOTE(review): the lines after 78 are not visible here; presumably they
 * free the request itself and then the set -- confirm in the full file.
 */
63 void lov_finish_set(struct lov_request_set *set)
65 struct list_head *pos, *n;
66 struct lov_request *req;
70 list_for_each_safe(pos, n, &set->set_list) {
71 req = list_entry(pos, struct lov_request, rq_link);
72 list_del_init(&req->rq_link);
74 if (req->rq_oi.oi_oa != NULL)
75 OBDO_FREE(req->rq_oi.oi_oa);
77 if (req->rq_oi.oi_osfs != NULL)
78 OBD_FREE_PTR(req->rq_oi.oi_osfs);
/*
 * Check whether every request of the set has completed, i.e. whether
 * set_completes has reached set_count.  When it has, set_finish_checked
 * is atomically incremented and compared with 1, which singles out the
 * first caller that observes the finished state.
 * NOTE(review): the return statements and the way 'idempotent' is used
 * are on lines not visible in this excerpt -- confirm the exact return
 * contract against the full file.
 */
87 int lov_set_finished(struct lov_request_set *set, int idempotent)
89 int completes = cfs_atomic_read(&set->set_completes);
91 CDEBUG(D_INFO, "check set %d/%d\n", completes, set->set_count);
93 if (completes == set->set_count) {
96 if (cfs_atomic_inc_return(&set->set_finish_checked) == 1)
/*
 * Account for the completion of one request: the request is flagged as
 * complete, set_completes and set_success are incremented, and anyone
 * sleeping on set_waitq is woken.
 * NOTE(review): lines 106-107 and 109 are missing from this view;
 * presumably rc is stored in the request and the set_success increment
 * is guarded by rc == 0 -- confirm.
 */
102 void lov_update_set(struct lov_request_set *set,
103 struct lov_request *req, int rc)
105 req->rq_complete = 1;
108 cfs_atomic_inc(&set->set_completes);
110 cfs_atomic_inc(&set->set_success);
112 wake_up(&set->set_waitq);
/*
 * Generic per-request completion handler.  It resolves the lov_obd via
 * set->set_exp, records the completion through lov_update_set(), and
 * then special-cases a non-zero rc coming from a target whose slot is
 * empty or whose ltd_active flag is clear.
 * NOTE(review): the statement executed in that special case and the
 * final return are not visible here; the existing comments suggest the
 * error is forgiven (rc reset) for inactive OSTs -- confirm.
 */
115 int lov_update_common_set(struct lov_request_set *set,
116 struct lov_request *req, int rc)
118 struct lov_obd *lov = &set->set_exp->exp_obd->u.lov;
121 lov_update_set(set, req, rc);
123 /* grace error on inactive ost */
124 if (rc && !(lov->lov_tgts[req->rq_idx] &&
125 lov->lov_tgts[req->rq_idx]->ltd_active))
128 /* FIXME in raid1 regime, should return 0 */
/*
 * Queue a request on a set by appending it to the tail of set->set_list.
 * NOTE(review): the lines following 134 are not visible; presumably the
 * set's request count and the request's back-pointer to the set are
 * updated there -- confirm.
 */
132 void lov_set_add_req(struct lov_request *req, struct lov_request_set *set)
134 cfs_list_add_tail(&req->rq_link, &set->set_list);
/*
 * Wait condition used by lov_check_and_wait_active().  With lov_lock
 * held, the condition is satisfied when the target slot 'idx' is empty,
 * when the target is active, or when the target has an export whose
 * client import has imp_connect_tried set.
 * NOTE(review): the assignment of the result and the return statement
 * are on lines not visible in this excerpt.
 */
139 static int lov_check_set(struct lov_obd *lov, int idx)
142 mutex_lock(&lov->lov_lock);
144 if (lov->lov_tgts[idx] == NULL ||
145 lov->lov_tgts[idx]->ltd_active ||
146 (lov->lov_tgts[idx]->ltd_exp != NULL &&
147 class_exp2cliimp(lov->lov_tgts[idx]->ltd_exp)->imp_connect_tried))
150 mutex_unlock(&lov->lov_lock);
/*
 * Reviewer summary of the visible flow: under lov_lock the target
 * descriptor for ost_idx is examined in three steps (empty slot, already
 * active, connection already attempted).  If none of those settles the
 * answer, the lock is dropped and the caller sleeps in l_wait_event()
 * on a private wait queue, with lov_check_set() as the condition and a
 * timeout built from obd_timeout and a one-second interval.
 * NOTE(review): the outcome of each early check (label/return value),
 * the handling of the l_wait_event() result, and the return statement
 * are on lines missing from this view; the exact semantics of
 * LWI_TIMEOUT_INTERVAL are defined elsewhere -- confirm.
 */
154 /* Check if the OSC connection exists and is active.
155 * If the OSC has not yet had a chance to connect to the OST the first time,
156 * wait once for it to connect instead of returning an error.
158 int lov_check_and_wait_active(struct lov_obd *lov, int ost_idx)
160 wait_queue_head_t waitq;
161 struct l_wait_info lwi;
162 struct lov_tgt_desc *tgt;
165 mutex_lock(&lov->lov_lock);
167 tgt = lov->lov_tgts[ost_idx];
169 if (unlikely(tgt == NULL))
172 if (likely(tgt->ltd_active))
175 if (tgt->ltd_exp && class_exp2cliimp(tgt->ltd_exp)->imp_connect_tried)
178 mutex_unlock(&lov->lov_lock);
180 init_waitqueue_head(&waitq);
181 lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(obd_timeout),
182 cfs_time_seconds(1), NULL, NULL);
184 rc = l_wait_event(waitq, lov_check_set(lov, ost_idx), &lwi);
191 mutex_unlock(&lov->lov_lock);
/*
 * Fold the attributes returned by the individual stripe requests into
 * the caller's obdo (set->set_oi->oi_oa).
 *
 * Each request on set_list is tested for completion/error and for an
 * all-zero o_valid (marked "inactive stripe"); lov_merge_attrs()
 * accumulates into the temporary obdo tmp_oa and counts contributing
 * stripes in 'attrset'.  When OBD_MD_FLEPOCH is requested, every stripe
 * must have contributed, otherwise the result is -EIO.  On success the
 * original o_oi is carried over into tmp_oa before tmp_oa is copied
 * onto the caller's obdo.
 *
 * Visible error codes: -ENOMEM and -EIO.
 * NOTE(review): the allocation and release of tmp_oa, the statements
 * taken when the per-request tests fail (presumably 'continue'), the
 * early returns, and the 'out' label are on lines not visible here.
 */
195 static int common_attr_done(struct lov_request_set *set)
198 struct lov_request *req;
200 int rc = 0, attrset = 0;
203 LASSERT(set->set_oi != NULL);
205 if (set->set_oi->oi_oa == NULL)
208 if (!cfs_atomic_read(&set->set_success))
213 GOTO(out, rc = -ENOMEM);
215 cfs_list_for_each (pos, &set->set_list) {
216 req = cfs_list_entry(pos, struct lov_request, rq_link);
218 if (!req->rq_complete || req->rq_rc)
220 if (req->rq_oi.oi_oa->o_valid == 0) /* inactive stripe */
222 lov_merge_attrs(tmp_oa, req->rq_oi.oi_oa,
223 req->rq_oi.oi_oa->o_valid,
224 set->set_oi->oi_md, req->rq_stripe, &attrset);
227 CERROR("No stripes had valid attrs\n");
230 if ((set->set_oi->oi_oa->o_valid & OBD_MD_FLEPOCH) &&
231 (set->set_oi->oi_md->lsm_stripe_count != attrset)) {
232 /* When we take attributes of some epoch, we require all the
233 * ost to be active. */
234 CERROR("Not all the stripes had valid attrs\n");
235 GOTO(out, rc = -EIO);
238 tmp_oa->o_oi = set->set_oi->oi_oa->o_oi;
239 memcpy(set->set_oi->oi_oa, tmp_oa, sizeof(*set->set_oi->oi_oa));
/*
 * Finish a getattr request set.  The set must have an export attached
 * (asserted); if at least one request completed, the per-stripe results
 * are merged via common_attr_done() and its result becomes rc.
 * NOTE(review): the NULL-set check, the release of the set, and the
 * return statement are on lines not visible in this excerpt.
 */
247 int lov_fini_getattr_set(struct lov_request_set *set)
254 LASSERT(set->set_exp);
255 if (cfs_atomic_read(&set->set_completes))
256 rc = common_attr_done(set);
/*
 * 'cookie' is the obd_info embedded in a lov_request as rq_oi; the
 * enclosing request is recovered with container_of() and the reply code
 * is forwarded to lov_update_common_set(), whose result is returned.
 */
263 /* The callback for osc_getattr_async that finalizes a request info when a
264 * response is received. */
265 static int cb_getattr_update(void *cookie, int rc)
267 struct obd_info *oinfo = cookie;
268 struct lov_request *lovreq;
269 lovreq = container_of(oinfo, struct lov_request, rq_oi);
270 return lov_update_common_set(lovreq->rq_rqset, lovreq, rc);
/*
 * Build the request set for a striped getattr.
 *
 * For every stripe of oinfo->oi_md the target is checked with
 * lov_check_and_wait_active().  An inactive target is logged; when the
 * caller asked for OBD_MD_FLEPOCH this is fatal (-EIO).  For usable
 * stripes a lov_request is allocated together with a private copy of
 * the caller's obdo, whose o_oi is replaced by the stripe object's id.
 * The request gets cb_getattr_update as its completion callback,
 * inherits the caller's capability, and is appended to the set.
 *
 * Visible error codes: -ENOMEM for allocation failures (a request whose
 * obdo cannot be allocated is freed before bailing out) and -EIO.
 * All error paths go through out_set, where lov_fini_getattr_set() is
 * called on the partially built set.
 * NOTE(review): set initialization, the 'continue' for inactive
 * stripes, the rq_stripe assignment, the condition guarding the -EIO at
 * line 323 (presumably an empty set), the store to *reqset and the
 * return are on lines not visible here.
 */
273 int lov_prep_getattr_set(struct obd_export *exp, struct obd_info *oinfo,
274 struct lov_request_set **reqset)
276 struct lov_request_set *set;
277 struct lov_obd *lov = &exp->exp_obd->u.lov;
281 OBD_ALLOC(set, sizeof(*set));
289 for (i = 0; i < oinfo->oi_md->lsm_stripe_count; i++) {
290 struct lov_oinfo *loi;
291 struct lov_request *req;
293 loi = oinfo->oi_md->lsm_oinfo[i];
294 if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) {
295 CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
296 if (oinfo->oi_oa->o_valid & OBD_MD_FLEPOCH)
297 /* SOM requires all the OSTs to be active. */
298 GOTO(out_set, rc = -EIO);
302 OBD_ALLOC(req, sizeof(*req));
304 GOTO(out_set, rc = -ENOMEM);
307 req->rq_idx = loi->loi_ost_idx;
309 OBDO_ALLOC(req->rq_oi.oi_oa);
310 if (req->rq_oi.oi_oa == NULL) {
311 OBD_FREE(req, sizeof(*req));
312 GOTO(out_set, rc = -ENOMEM);
314 memcpy(req->rq_oi.oi_oa, oinfo->oi_oa,
315 sizeof(*req->rq_oi.oi_oa));
316 req->rq_oi.oi_oa->o_oi = loi->loi_oi;
317 req->rq_oi.oi_cb_up = cb_getattr_update;
318 req->rq_oi.oi_capa = oinfo->oi_capa;
320 lov_set_add_req(req, set);
323 GOTO(out_set, rc = -EIO);
327 lov_fini_getattr_set(set);
/*
 * Finish a destroy request set.  The set must have an export attached
 * (asserted).  The branch taken when at least one request completed
 * currently holds only a FIXME placeholder in the visible lines.
 * NOTE(review): the NULL-set check, the release of the set, and the
 * return statement are on lines not visible in this excerpt.
 */
331 int lov_fini_destroy_set(struct lov_request_set *set)
337 LASSERT(set->set_exp);
338 if (cfs_atomic_read(&set->set_completes)) {
339 /* FIXME update qos data here */
/*
 * Build the request set for destroying a striped object.
 *
 * The set remembers the stripe metadata (lsm) and the source obdo, and
 * picks up the llog cookies from 'oti' when one is supplied and src_oa
 * carries OBD_MD_FLCOOKIE.  For each stripe whose target passes
 * lov_check_and_wait_active(), a lov_request is allocated with a
 * private copy of src_oa (o_oi replaced by the stripe object's id) and
 * appended to the set; inactive targets are only logged.
 *
 * Visible error codes: -ENOMEM for allocation failures (a request whose
 * obdo cannot be allocated is freed first) and -EIO.  Error paths go
 * through out_set, where lov_fini_destroy_set() is called.
 * NOTE(review): set initialization, the 'continue' for inactive
 * stripes, the rq_stripe assignment, per-request cookie handling, the
 * condition guarding the -EIO at line 396 (presumably an empty set),
 * the store to *reqset and the return are on lines not visible here.
 */
347 int lov_prep_destroy_set(struct obd_export *exp, struct obd_info *oinfo,
348 struct obdo *src_oa, struct lov_stripe_md *lsm,
349 struct obd_trans_info *oti,
350 struct lov_request_set **reqset)
352 struct lov_request_set *set;
353 struct lov_obd *lov = &exp->exp_obd->u.lov;
357 OBD_ALLOC(set, sizeof(*set));
364 set->set_oi->oi_md = lsm;
365 set->set_oi->oi_oa = src_oa;
366 if (oti != NULL && src_oa->o_valid & OBD_MD_FLCOOKIE)
367 set->set_cookies = oti->oti_logcookies;
369 for (i = 0; i < lsm->lsm_stripe_count; i++) {
370 struct lov_oinfo *loi;
371 struct lov_request *req;
373 loi = lsm->lsm_oinfo[i];
374 if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) {
375 CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
379 OBD_ALLOC(req, sizeof(*req));
381 GOTO(out_set, rc = -ENOMEM);
384 req->rq_idx = loi->loi_ost_idx;
386 OBDO_ALLOC(req->rq_oi.oi_oa);
387 if (req->rq_oi.oi_oa == NULL) {
388 OBD_FREE(req, sizeof(*req));
389 GOTO(out_set, rc = -ENOMEM);
391 memcpy(req->rq_oi.oi_oa, src_oa, sizeof(*req->rq_oi.oi_oa));
392 req->rq_oi.oi_oa->o_oi = loi->loi_oi;
393 lov_set_add_req(req, set);
396 GOTO(out_set, rc = -EIO);
400 lov_fini_destroy_set(set);
/*
 * Finish a setattr request set.  The set must have an export attached
 * (asserted); if at least one request completed, the per-stripe results
 * are merged via common_attr_done() and its result becomes rc.
 * NOTE(review): the NULL-set check, the release of the set, and the
 * return statement are on lines not visible in this excerpt.
 */
404 int lov_fini_setattr_set(struct lov_request_set *set)
411 LASSERT(set->set_exp);
412 if (cfs_atomic_read(&set->set_completes)) {
413 rc = common_attr_done(set);
414 /* FIXME update qos data here */
/*
 * Completion handler for one setattr request.  After recording the
 * completion with lov_update_set(), a non-zero rc from an empty or
 * inactive target slot is special-cased, and the ctime/mtime/atime
 * returned in the request's obdo are copied into the cached lvb of the
 * corresponding stripe (lsm_oinfo[rq_stripe]) for each timestamp whose
 * OBD_MD_FL*TIME bit is set in o_valid.
 * Note that lov and lsm are reached through req->rq_rqset rather than
 * through the 'set' argument.
 * NOTE(review): the statement executed for the inactive-target case,
 * any rc guard around the timestamp updates, and the return are on
 * lines not visible here.
 */
421 int lov_update_setattr_set(struct lov_request_set *set,
422 struct lov_request *req, int rc)
424 struct lov_obd *lov = &req->rq_rqset->set_exp->exp_obd->u.lov;
425 struct lov_stripe_md *lsm = req->rq_rqset->set_oi->oi_md;
428 lov_update_set(set, req, rc);
430 /* grace error on inactive ost */
431 if (rc && !(lov->lov_tgts[req->rq_idx] &&
432 lov->lov_tgts[req->rq_idx]->ltd_active))
436 if (req->rq_oi.oi_oa->o_valid & OBD_MD_FLCTIME)
437 lsm->lsm_oinfo[req->rq_stripe]->loi_lvb.lvb_ctime =
438 req->rq_oi.oi_oa->o_ctime;
439 if (req->rq_oi.oi_oa->o_valid & OBD_MD_FLMTIME)
440 lsm->lsm_oinfo[req->rq_stripe]->loi_lvb.lvb_mtime =
441 req->rq_oi.oi_oa->o_mtime;
442 if (req->rq_oi.oi_oa->o_valid & OBD_MD_FLATIME)
443 lsm->lsm_oinfo[req->rq_stripe]->loi_lvb.lvb_atime =
444 req->rq_oi.oi_oa->o_atime;
/*
 * 'cookie' is the obd_info embedded in a lov_request as rq_oi; the
 * enclosing request is recovered with container_of() and the reply code
 * is forwarded to lov_update_setattr_set(), whose result is returned.
 */
450 /* The callback for osc_setattr_async that finalizes a request info when a
451 * response is received. */
452 static int cb_setattr_update(void *cookie, int rc)
454 struct obd_info *oinfo = cookie;
455 struct lov_request *lovreq;
456 lovreq = container_of(oinfo, struct lov_request, rq_oi);
457 return lov_update_setattr_set(lovreq->rq_rqset, lovreq, rc);
/*
 * Build the request set for a striped setattr.
 *
 * Llog cookies are taken from 'oti' when one is supplied and the
 * caller's obdo carries OBD_MD_FLCOOKIE.  For each stripe whose target
 * passes lov_check_and_wait_active(), a lov_request is allocated with a
 * private copy of the caller's obdo; the copy gets the stripe object's
 * id, the stripe index, cb_setattr_update as completion callback and
 * the caller's capability.  When OBD_MD_FLSIZE is set, the file size is
 * translated into a per-stripe object size with lov_stripe_offset().
 *
 * Visible error codes: -ENOMEM for allocation failures (a request whose
 * obdo cannot be allocated is freed first) and -EIO.  Error paths go
 * through out_set, where lov_fini_setattr_set() is called.
 * NOTE(review): set initialization, the 'continue' for inactive
 * stripes, the rq_stripe assignment, the condition guarding the -EIO at
 * line 521 (presumably an empty set), the store to *reqset and the
 * return are on lines not visible here.
 */
460 int lov_prep_setattr_set(struct obd_export *exp, struct obd_info *oinfo,
461 struct obd_trans_info *oti,
462 struct lov_request_set **reqset)
464 struct lov_request_set *set;
465 struct lov_obd *lov = &exp->exp_obd->u.lov;
469 OBD_ALLOC(set, sizeof(*set));
476 if (oti != NULL && oinfo->oi_oa->o_valid & OBD_MD_FLCOOKIE)
477 set->set_cookies = oti->oti_logcookies;
479 for (i = 0; i < oinfo->oi_md->lsm_stripe_count; i++) {
480 struct lov_oinfo *loi = oinfo->oi_md->lsm_oinfo[i];
481 struct lov_request *req;
483 if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) {
484 CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
488 OBD_ALLOC(req, sizeof(*req));
490 GOTO(out_set, rc = -ENOMEM);
492 req->rq_idx = loi->loi_ost_idx;
494 OBDO_ALLOC(req->rq_oi.oi_oa);
495 if (req->rq_oi.oi_oa == NULL) {
496 OBD_FREE(req, sizeof(*req));
497 GOTO(out_set, rc = -ENOMEM);
499 memcpy(req->rq_oi.oi_oa, oinfo->oi_oa,
500 sizeof(*req->rq_oi.oi_oa));
501 req->rq_oi.oi_oa->o_oi = loi->loi_oi;
502 req->rq_oi.oi_oa->o_stripe_idx = i;
503 req->rq_oi.oi_cb_up = cb_setattr_update;
504 req->rq_oi.oi_capa = oinfo->oi_capa;
506 if (oinfo->oi_oa->o_valid & OBD_MD_FLSIZE) {
507 int off = lov_stripe_offset(oinfo->oi_md,
508 oinfo->oi_oa->o_size, i,
509 &req->rq_oi.oi_oa->o_size);
/* NOTE(review): a negative result from lov_stripe_offset() shrinks a
 * non-zero per-stripe size by one; the reason depends on that helper's
 * contract, which is defined elsewhere -- confirm. */
511 if (off < 0 && req->rq_oi.oi_oa->o_size)
512 req->rq_oi.oi_oa->o_size--;
514 CDEBUG(D_INODE, "stripe %d has size "LPU64"/"LPU64"\n",
515 i, req->rq_oi.oi_oa->o_size,
516 oinfo->oi_oa->o_size);
518 lov_set_add_req(req, set);
521 GOTO(out_set, rc = -EIO);
525 lov_fini_setattr_set(set);
/*
 * LOV_U64_MAX is the all-ones __u64 value and serves as the saturation
 * marker for the statfs object counters.
 * LOV_SUM_MAX(tot, add) detects unsigned wrap-around of tot + add (the
 * sum comparing lower than tot) and clamps tot to LOV_U64_MAX in that
 * case.  Both arguments are evaluated more than once, so they must be
 * free of side effects.
 * NOTE(review): lines 531 and 534-536 of the macro are not visible;
 * presumably they hold the do/while wrapper and the non-overflow
 * branch that performs the addition -- confirm.
 */
529 #define LOV_U64_MAX ((__u64)~0ULL)
530 #define LOV_SUM_MAX(tot, add) \
532 if ((tot) + (add) < (tot)) \
533 (tot) = LOV_U64_MAX; \
/*
 * Post-process an aggregated statfs result and cache it on the device.
 * os_files and os_ffree are divided by the expected stripe count unless
 * they are saturated at LOV_U64_MAX; the result is then copied into
 * obd->obd_osfs and obd_osfs_age is refreshed, both under
 * obd_osfs_lock.
 * NOTE(review): the remaining arguments of lov_get_stripecnt(), the way
 * 'success' gates this work, and the return value are on lines not
 * visible in this excerpt.
 */
538 int lov_fini_statfs(struct obd_device *obd, struct obd_statfs *osfs,int success)
543 __u32 expected_stripes = lov_get_stripecnt(&obd->u.lov,
545 if (osfs->os_files != LOV_U64_MAX)
546 lov_do_div64(osfs->os_files, expected_stripes);
547 if (osfs->os_ffree != LOV_U64_MAX)
548 lov_do_div64(osfs->os_ffree, expected_stripes);
550 spin_lock(&obd->obd_osfs_lock);
551 memcpy(&obd->obd_osfs, osfs, sizeof(*osfs));
552 obd->obd_osfs_age = cfs_time_current_64();
553 spin_unlock(&obd->obd_osfs_lock);
/*
 * Finish a statfs request set.  If at least one request completed, the
 * aggregated result in set->set_oi->oi_osfs is finalized with
 * lov_fini_statfs(), passing the current success count.
 * NOTE(review): the NULL-set check, the release of the set, and the
 * return statement are on lines not visible in this excerpt.
 */
560 int lov_fini_statfs_set(struct lov_request_set *set)
568 if (cfs_atomic_read(&set->set_completes)) {
569 rc = lov_fini_statfs(set->set_obd, set->set_oi->oi_osfs,
570 cfs_atomic_read(&set->set_success));
/*
 * Merge one target's statfs reply (lov_sfs) into the running aggregate
 * (osfs).
 *
 * Visible steps:
 *  - a plain memcpy of lov_sfs into osfs (presumably for the first
 *    successful reply only -- the guarding condition is not visible);
 *  - when the block sizes differ, a bit shift between them is derived
 *    from the OR of both sizes, and the block counters of whichever
 *    side has the smaller block size are shifted right so both sides
 *    use the larger block size;
 *  - os_bfree/os_bavail are combined, os_blocks is summed;
 *  - os_files/os_ffree are summed with LOV_SUM_MAX so they saturate
 *    instead of wrapping.
 *
 * Note that lov_sfs may be modified (its counters are rescaled).
 * NOTE(review): the second line of the signature, the body of the
 * shift-finding loop, and the lines selecting between the
 * "minimum bavail" form and the summing form of bfree/bavail (likely
 * preprocessor conditionals) are not visible here -- confirm.
 */
576 void lov_update_statfs(struct obd_statfs *osfs, struct obd_statfs *lov_sfs,
579 int shift = 0, quit = 0;
583 memcpy(osfs, lov_sfs, sizeof(*lov_sfs));
585 if (osfs->os_bsize != lov_sfs->os_bsize) {
586 /* assume all block sizes are always powers of 2 */
587 /* get the bits difference */
588 tmp = osfs->os_bsize | lov_sfs->os_bsize;
589 for (shift = 0; shift <= 64; ++shift) {
601 if (osfs->os_bsize < lov_sfs->os_bsize) {
602 osfs->os_bsize = lov_sfs->os_bsize;
604 osfs->os_bfree >>= shift;
605 osfs->os_bavail >>= shift;
606 osfs->os_blocks >>= shift;
607 } else if (shift != 0) {
608 lov_sfs->os_bfree >>= shift;
609 lov_sfs->os_bavail >>= shift;
610 lov_sfs->os_blocks >>= shift;
613 /* Sandia requested that df (and so, statfs) only
614 returned minimal available space on
615 a single OST, so people would be able to
616 write this much data guaranteed. */
617 if (osfs->os_bavail > lov_sfs->os_bavail) {
618 /* Presumably if new bavail is smaller,
619 new bfree is bigger as well */
620 osfs->os_bfree = lov_sfs->os_bfree;
621 osfs->os_bavail = lov_sfs->os_bavail;
624 osfs->os_bfree += lov_sfs->os_bfree;
625 osfs->os_bavail += lov_sfs->os_bavail;
627 osfs->os_blocks += lov_sfs->os_blocks;
628 /* XXX not sure about this one - depends on policy.
629 * - could be minimum if we always stripe on all OBDs
630 * (but that would be wrong for any other policy,
631 * if one of the OBDs has no more objects left)
632 * - could be sum if we stripe whole objects
633 * - could be average, just to give a nice number
635 * To give a "reasonable" (if not wholly accurate)
636 * number, we divide the total number of free objects
637 * by expected stripe count (watch out for overflow).
639 LOV_SUM_MAX(osfs->os_files, lov_sfs->os_files);
640 LOV_SUM_MAX(osfs->os_ffree, lov_sfs->os_ffree);
/*
 * Visible flow: the enclosing lov_request is recovered from the
 * obd_info cookie, the completion is recorded with lov_update_set(),
 * and -- for a target that exists and is active -- the reply is cached
 * in the target device's obd_osfs under its obd_osfs_lock (the age is
 * refreshed only when the reply did not come from the cache) and then
 * merged into the set-wide aggregate with lov_update_statfs().  When
 * the set was issued with OBD_STATFS_PTLRPCD and this was the last
 * outstanding request, lov_statfs_interpret() is invoked with a
 * non-zero status if not every request succeeded.
 * NOTE(review): the handling of a non-zero rc, the out_update/out
 * labels and the return statement are on lines not visible here.
 */
644 /* The callback for osc_statfs_async that finalizes a request info when a
645 * response is received. */
646 static int cb_statfs_update(void *cookie, int rc)
648 struct obd_info *oinfo = cookie;
649 struct lov_request *lovreq;
650 struct lov_request_set *set;
651 struct obd_statfs *osfs, *lov_sfs;
653 struct lov_tgt_desc *tgt;
654 struct obd_device *lovobd, *tgtobd;
658 lovreq = container_of(oinfo, struct lov_request, rq_oi);
659 set = lovreq->rq_rqset;
660 lovobd = set->set_obd;
661 lov = &lovobd->u.lov;
662 osfs = set->set_oi->oi_osfs;
663 lov_sfs = oinfo->oi_osfs;
/* Sampled before lov_update_set() below, so this is the success count
 * prior to accounting for the current reply. */
664 success = cfs_atomic_read(&set->set_success);
665 /* XXX: the same is done in lov_update_common_set, however
666 lovset->set_exp is not initialized. */
667 lov_update_set(set, lovreq, rc);
672 tgt = lov->lov_tgts[lovreq->rq_idx];
673 if (!tgt || !tgt->ltd_active)
674 GOTO(out_update, rc);
676 tgtobd = class_exp2obd(tgt->ltd_exp);
677 spin_lock(&tgtobd->obd_osfs_lock);
678 memcpy(&tgtobd->obd_osfs, lov_sfs, sizeof(*lov_sfs));
679 if ((oinfo->oi_flags & OBD_STATFS_FROM_CACHE) == 0)
680 tgtobd->obd_osfs_age = cfs_time_current_64();
681 spin_unlock(&tgtobd->obd_osfs_lock);
684 lov_update_statfs(osfs, lov_sfs, success);
688 if (set->set_oi->oi_flags & OBD_STATFS_PTLRPCD &&
689 lov_set_finished(set, 0)) {
690 lov_statfs_interpret(NULL, set, set->set_count !=
691 cfs_atomic_read(&set->set_success));
/*
 * Build the request set for a statfs across all configured targets.
 *
 * Targets are visited by index up to lov->desc.ld_tgt_count.  An empty
 * slot is logged as inactive, as is an inactive target when the caller
 * passed OBD_STATFS_NODELAY.  Otherwise an inactive target is given a
 * chance to connect via lov_check_and_wait_active() (its result is not
 * checked here).  Targets without an export are logged as
 * administratively disabled.  For the remaining targets a lov_request
 * with its own obd_statfs buffer is allocated, given cb_statfs_update
 * as completion callback and the caller's flags, and appended to the
 * set.
 *
 * Visible error codes: -ENOMEM for allocation failures (a request whose
 * statfs buffer cannot be allocated is freed first) and -EIO.  Error
 * paths go through out_set, where lov_fini_statfs_set() is called.
 * NOTE(review): set initialization, the 'continue' statements for
 * skipped targets, the rq_idx assignment, the condition guarding the
 * -EIO at line 751 (presumably an empty set), the store to *reqset and
 * the return are on lines not visible here.
 */
697 int lov_prep_statfs_set(struct obd_device *obd, struct obd_info *oinfo,
698 struct lov_request_set **reqset)
700 struct lov_request_set *set;
701 struct lov_obd *lov = &obd->u.lov;
705 OBD_ALLOC(set, sizeof(*set));
713 /* We only get block data from the OBD */
714 for (i = 0; i < lov->desc.ld_tgt_count; i++) {
715 struct lov_request *req;
717 if (lov->lov_tgts[i] == NULL ||
718 (oinfo->oi_flags & OBD_STATFS_NODELAY &&
719 !lov->lov_tgts[i]->ltd_active)) {
720 CDEBUG(D_HA, "lov idx %d inactive\n", i);
724 if (!lov->lov_tgts[i]->ltd_active)
725 lov_check_and_wait_active(lov, i);
727 /* skip targets that have been explicitely disabled by the
729 if (!lov->lov_tgts[i]->ltd_exp) {
730 CDEBUG(D_HA, "lov idx %d administratively disabled\n", i);
734 OBD_ALLOC(req, sizeof(*req));
736 GOTO(out_set, rc = -ENOMEM);
738 OBD_ALLOC(req->rq_oi.oi_osfs, sizeof(*req->rq_oi.oi_osfs));
739 if (req->rq_oi.oi_osfs == NULL) {
740 OBD_FREE(req, sizeof(*req));
741 GOTO(out_set, rc = -ENOMEM);
745 req->rq_oi.oi_cb_up = cb_statfs_update;
746 req->rq_oi.oi_flags = oinfo->oi_flags;
748 lov_set_add_req(req, set);
751 GOTO(out_set, rc = -EIO);
755 lov_fini_statfs_set(set);