1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (C) 2002, 2003 Cluster File Systems, Inc.
6 * This file is part of the Lustre file system, http://www.lustre.org
7 * Lustre is a trademark of Cluster File Systems, Inc.
9 * You may have signed or agreed to another license before downloading
10 * this software. If so, you are bound by the terms and conditions
11 * of that agreement, and the following does not apply to you. See the
12 * LICENSE file included with this distribution for more information.
14 * If you did not agree to a different license, then this copy of Lustre
15 * is open source software; you can redistribute it and/or modify it
16 * under the terms of version 2 of the GNU General Public License as
17 * published by the Free Software Foundation.
19 * In either case, Lustre is distributed in the hope that it will be
20 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
21 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 * license text for more details.
26 # define EXPORT_SYMTAB
28 #define DEBUG_SUBSYSTEM S_LOV
31 #include <libcfs/libcfs.h>
33 #include <liblustre.h>
36 #include <obd_class.h>
39 #include "lov_internal.h"
/*
 * qos_shrink_lsm - shrink the striping metadata of @set to set->set_count
 * stripes.
 *
 * Cookie part: only when set_oti, set_cookies and set_cookie_sent are all
 * set.  oti_alloc_cookies() is asked for set_count cookies; if
 * oti_logcookies is non-NULL afterwards, the first set_count cookies are
 * copied into it, the old cookie array is freed and set_cookies is pointed
 * at the new array.
 *
 * Stripe part: a lov_stripe_md sized by lov_stripe_md_size(set_count) is
 * allocated, filled from the start of the old descriptor, given the new
 * stripe count and installed as set->set_md; the old descriptor is freed.
 *
 * NOTE(review): this listing has lost lines (the embedded numbering skips,
 * e.g. 41 to 43, 57 to 59, 73 to 75).  The braces and the branch keywords in
 * front of the two leaking warnings are not visible; presumably they are the
 * failure branches of the two allocations -- confirm against the full file.
 * NOTE(review): both warnings print an unsigned difference with %d.
 */
41 void qos_shrink_lsm(struct lov_request_set *set)
43 struct lov_stripe_md *lsm = set->set_md, *lsm_new;
44 /* XXX LOV STACKING call into osc for sizes */
45 unsigned oldsize, newsize;
47 if (set->set_oti && set->set_cookies && set->set_cookie_sent) {
48 struct llog_cookie *cookies;
/* byte sizes of the cookie array before and after shrinking */
49 oldsize = lsm->lsm_stripe_count * sizeof(*cookies);
50 newsize = set->set_count * sizeof(*cookies);
/* remember the old array; it is copied from and then freed below */
52 cookies = set->set_cookies;
53 oti_alloc_cookies(set->set_oti, set->set_count);
54 if (set->set_oti->oti_logcookies) {
/* only the first set_count cookies are carried over */
55 memcpy(set->set_oti->oti_logcookies, cookies, newsize);
56 OBD_FREE(cookies, oldsize);
57 set->set_cookies = set->set_oti->oti_logcookies;
59 CWARN("'leaking' %d bytes\n", oldsize - newsize);
63 CWARN("using fewer stripes for object "LPX64": old %u new %u\n",
64 lsm->lsm_object_id, lsm->lsm_stripe_count, set->set_count);
/* from here on oldsize and newsize are descriptor sizes, not cookie sizes */
66 oldsize = lov_stripe_md_size(lsm->lsm_stripe_count);
67 newsize = lov_stripe_md_size(set->set_count);
68 OBD_ALLOC(lsm_new, newsize);
69 if (lsm_new != NULL) {
/* assumes the first newsize bytes hold the header plus the stripes that
 * are kept -- TODO confirm against lov_stripe_md_size() */
70 memcpy(lsm_new, lsm, newsize);
71 lsm_new->lsm_stripe_count = set->set_count;
72 OBD_FREE(lsm, oldsize);
73 set->set_md = lsm_new;
75 CWARN("'leaking' %d bytes\n", oldsize - newsize);
/*
 * qos_remedy_create - retry the object create of @req on a different OST.
 *
 * Starting lsm_stripe_count slots past req->rq_idx, every target index is
 * visited once (modulo ld_tgt_count).  Inactive targets are logged.  For an
 * active target the stripes of set->set_md are scanned, apart from
 * req->rq_stripe, for one that already uses this OST index.  When the scan
 * runs off the end (stripe >= lsm_stripe_count) the index is stored in
 * req->rq_idx and obd_create() is called on that target.
 *
 * rc starts as -EIO and is overwritten by the obd_create() result.
 *
 * NOTE(review): the continue/break statements, the handling of the
 * obd_create() result and the return statement are missing from this
 * listing (the embedded numbering skips 91-92, 96, 98-100, 105-111).
 * NOTE(review): ost_count is used as a modulus; confirm callers guarantee
 * ld_tgt_count is non-zero.
 */
79 int qos_remedy_create(struct lov_request_set *set, struct lov_request *req)
81 struct lov_stripe_md *lsm = set->set_md;
82 struct lov_obd *lov = &set->set_exp->exp_obd->u.lov;
83 unsigned ost_idx, ost_count = lov->desc.ld_tgt_count;
84 int stripe, i, rc = -EIO;
/* first candidate: lsm_stripe_count slots after the current index */
87 ost_idx = (req->rq_idx + lsm->lsm_stripe_count) % ost_count;
88 for (i = 0; i < ost_count; i++, ost_idx = (ost_idx + 1) % ost_count) {
89 if (lov->tgts[ost_idx].active == 0) {
90 CDEBUG(D_HA, "lov idx %d inactive\n", ost_idx);
93 /* check if objects has been created on this ost */
94 for (stripe = 0; stripe < lsm->lsm_stripe_count; stripe++) {
95 if (stripe == req->rq_stripe)
97 if (ost_idx == lsm->lsm_oinfo[stripe].loi_ost_idx)
/* scan finished without a hit: no other stripe references this OST */
101 if (stripe >= lsm->lsm_stripe_count) {
102 req->rq_idx = ost_idx;
103 rc = obd_create(lov->tgts[ost_idx].ltd_exp, req->rq_oa,
104 &req->rq_md, set->set_oti);
/*
 * Constants of the reseed expression in alloc_rr():
 *   (LOV_CREATE_RESEED_MIN / active + LOV_CREATE_RESEED_MULT) * active
 * where active is max(ld_active_tgt_count, 1U).
 */
112 #define LOV_CREATE_RESEED_MULT 4
113 #define LOV_CREATE_RESEED_MIN 1000
/*
 * alloc_rr - choose OST indices for up to *stripe_cnt stripes by walking the
 * target table in order from a start index.
 *
 * @lov:        LOV device; desc.ld_tgt_count, desc.ld_active_tgt_count and
 *              tgts[].active are read
 * @idx_arr:    output array; idx_pos walks over it
 * @stripe_cnt: in: wanted stripe count; out: idx_pos - idx_arr, i.e. the
 *              number of entries idx_pos has advanced over
 *
 * The start index and a countdown are function-local statics, so they carry
 * over between calls.  When the countdown reaches zero or below, the start
 * index is replaced by ll_rand().
 *
 * NOTE(review): the left-hand side of the reseed expression (embedded line
 * 125), the statement of the else-if branch, the lines that store into
 * idx_pos, the loop exits and the return statement are missing from this
 * listing.  The expression is presumably assigned to ost_start_count.
 * NOTE(review): no lock is taken in the visible lines around the statics;
 * confirm that callers serialize access.
 */
114 /* alloc objects on osts with round-robin algorithm */
115 static int alloc_rr(struct lov_obd *lov, int *idx_arr, int *stripe_cnt)
117 static int ost_start_count, ost_start_idx;
118 unsigned ost_idx, ost_count = lov->desc.ld_tgt_count;
119 unsigned ost_active_count = lov->desc.ld_active_tgt_count;
120 int i, *idx_pos = idx_arr;
123 if (--ost_start_count <= 0) {
124 ost_start_idx = ll_rand();
126 (LOV_CREATE_RESEED_MIN / max(ost_active_count, 1U) +
127 LOV_CREATE_RESEED_MULT) * max(ost_active_count, 1U);
128 } else if (*stripe_cnt >= lov->desc.ld_active_tgt_count) {
129 /* If we allocate from all of the stripes, make the
130 * next file start on the next OST. */
/* ost_start_idx is a signed int; the modulo is done in unsigned arithmetic
 * because ost_count is unsigned */
133 ost_idx = ost_start_idx % ost_count;
135 for (i = 0; i < ost_count; i++, ost_idx = (ost_idx + 1) % ost_count) {
138 if (lov->tgts[ost_idx].active == 0) {
139 CDEBUG(D_HA, "lov idx %d inactive\n", ost_idx);
/* enough entries collected */
146 if (idx_pos - idx_arr == *stripe_cnt)
/* report how many entries were actually filled in */
149 *stripe_cnt = idx_pos - idx_arr;
/*
 * alloc_specific - choose OST indices starting at the OST index already
 * stored in lsm->lsm_oinfo[0].loi_ost_idx.
 *
 * Every target index is visited once, in order, modulo ld_tgt_count.
 * Inactive targets are logged.  The walk tests whether idx_pos has advanced
 * lsm_stripe_count entries past idx_arr.  The CERROR at the end reports how
 * many entries were found against how many were wanted.
 *
 * NOTE(review): the second signature line (presumably declaring idx_arr),
 * the lines that store into idx_pos, the loop exits, the end of the long
 * comment near the bottom and the return statements are missing from this
 * listing (the embedded numbering skips 155-156, 165-169, 171-172, 176, 178
 * and 181-183).
 * NOTE(review): the CERROR prints a pointer difference with %u.
 */
153 /* alloc objects on osts with specific stripe offset */
154 static int alloc_specific(struct lov_obd *lov, struct lov_stripe_md *lsm,
157 unsigned ost_idx, ost_count = lov->desc.ld_tgt_count;
158 int i, *idx_pos = idx_arr;
/* start from the index already recorded for the first stripe */
161 ost_idx = lsm->lsm_oinfo[0].loi_ost_idx;
162 for (i = 0; i < ost_count; i++, ost_idx = (ost_idx + 1) % ost_count) {
163 if (lov->tgts[ost_idx].active == 0) {
164 CDEBUG(D_HA, "lov idx %d inactive\n", ost_idx);
/* all requested stripes have an index */
170 if (idx_pos - idx_arr == lsm->lsm_stripe_count)
173 /* If we were passed specific striping params, then a failure to
174 * meet those requirements is an error, since we can't reallocate
175 * that memory (it might be part of a larger array or something).
177 * We can only get here if lsm_stripe_count was originally > 1.
179 CERROR("can't lstripe objid "LPX64": have %u want %u\n",
180 lsm->lsm_object_id, idx_pos - idx_arr, lsm->lsm_stripe_count);
/*
 * Minimum free space an OST must have to be used for object allocation:
 * lov->desc.ld_qos_threshold shifted left by 20 bits.  It is compared against
 * TGT_BAVAIL() values.
 *
 * The expansion refers to a variable named lov, which must be in scope at
 * the point of use.
 * NOTE(review): the shift is done in the type of ld_qos_threshold, which is
 * declared outside this file; confirm large thresholds cannot overflow it.
 */
#define QOS_MIN (lov->desc.ld_qos_threshold << 20)

/*
 * Available space of a target: os_bavail * os_bsize taken from the obd_osfs
 * record of the device behind the target's export.
 *
 * The argument is parenthesized so that any pointer expression, for example
 * tgts + i or &lov->tgts[i], can be passed safely.
 */
#define TGT_BAVAIL(tgt) ((tgt)->ltd_exp->exp_obd->obd_osfs.os_bavail * \
                         (tgt)->ltd_exp->exp_obd->obd_osfs.os_bsize)

/* Free inode count (os_ffree) from the same obd_osfs record. */
#define TGT_FFREE(tgt)  ((tgt)->ltd_exp->exp_obd->obd_osfs.os_ffree)
/*
 * alloc_qos - choose OST indices for up to *stripe_cnt stripes, weighting the
 * choice by the free space of each target.
 *
 * @exp:        export of the LOV device; exp->exp_obd->u.lov is used
 * @idx_arr:    output array, filled at idx_arr[nfound]
 * @stripe_cnt: in: wanted stripes; lowered to good_osts when fewer usable
 *              targets exist
 *
 * Visible flow:
 *  1. Allocate two scratch arrays (free space and index per candidate).
 *  2. Under lov_lock: leave with -EAGAIN when fewer than two targets exist,
 *     when the space reported for the LOV device itself is at or below
 *     QOS_MIN, or in the two near-equal-space/empty-list branches below.
 *  3. Still under lov_lock: walk qos_bavail_list and record each usable
 *     target, summing the free space into total_bavail.
 *  4. Pick stripes: draw a random number bounded by total_bavail, walk the
 *     candidates accumulating their space until the sum reaches the draw,
 *     and take that candidate.
 *  5. Free the scratch arrays; alloc_rr() is called at the very end.
 *
 * NOTE(review): many lines are missing from this listing (the embedded
 * numbering skips, e.g. 202-205, 225, 232, 235-236, 252-253, 281-282, 295,
 * 297-298, 321, 323-326, 338-341).  Declarations of max, min and val, the
 * conditions in front of several GOTOs, the increments of good_osts and
 * nfound, the out_free label and the return are not visible.
 * NOTE(review): indexes is sized for require_stripes ints but is written at
 * indexes[good_osts]; the only visible stop test for the walk needs a target
 * at or below QOS_MIN, so verify good_osts cannot exceed require_stripes.
 * NOTE(review): last_warn is a function-local static shared by all callers.
 */
191 /* alloc objects on osts with free space weighted algorithm */
192 static int alloc_qos(struct obd_export *exp, int *idx_arr, int *stripe_cnt)
194 struct lov_obd *lov = &exp->exp_obd->u.lov;
195 unsigned ost_count = lov->desc.ld_tgt_count;
196 __u64 cur_bavail, rand, *availspace, total_bavail = 0;
197 int *indexes, nfound, good_osts, i, warn = 0, rc = 0;
198 struct lov_tgt_desc *tgt;
199 int shift, require_stripes = *stripe_cnt;
200 static time_t last_warn = 0;
201 time_t now = cfs_time_current_sec();
/* scratch arrays: free space and target index of every candidate */
206 OBD_ALLOC(availspace, sizeof(__u64) * ost_count);
207 OBD_ALLOC(indexes, sizeof(int) * require_stripes);
208 if (!availspace || !indexes)
209 GOTO(out_free, rc = -EAGAIN);
211 mutex_down(&lov->lov_lock);
213 /* if free space is below some threshold, just go
214 * to do round-robin allocation */
215 total_bavail = (exp->exp_obd->obd_osfs.os_bavail * \
216 exp->exp_obd->obd_osfs.os_bsize);
217 if (ost_count < 2 || total_bavail <= QOS_MIN) {
218 mutex_up(&lov->lov_lock);
219 GOTO(out_free, rc = -EAGAIN);
222 /* if each ost has almost same free space, go to
223 * do rr allocation for better creation performance */
224 if (!list_empty(&lov->qos_bavail_list)) {
/* max comes from the first list entry, min from the last one */
226 tgt = list_entry(lov->qos_bavail_list.next,
227 struct lov_tgt_desc, qos_bavail_list);
228 max = TGT_BAVAIL(tgt);
229 tgt = list_entry(lov->qos_bavail_list.prev,
230 struct lov_tgt_desc, qos_bavail_list);
231 min = TGT_BAVAIL(tgt);
/* val is the absolute difference; min becomes 13/256 of itself, which is
 * roughly five percent.  The comparison of the two is not visible here. */
233 val = (max >= min) ? (max - min) : (min - max);
234 min = (min * 13) >> 8; /* less than 5% of gap */
237 mutex_up(&lov->lov_lock);
238 GOTO(out_free, rc = -EAGAIN);
/* second -EAGAIN exit; presumably the branch for an empty list -- confirm */
241 mutex_up(&lov->lov_lock);
242 GOTO(out_free, rc = -EAGAIN);
247 /* warn zero available space/inode every 30 min */
248 if (cfs_time_sub(now, last_warn) > 60 * 30)
250 /* Find all the OSTs big enough to be stripe candidates */
251 list_for_each_entry(tgt, &lov->qos_bavail_list, qos_bavail_list) {
254 if (!TGT_BAVAIL(tgt)) {
256 CWARN("no free space on %s\n",
262 if (!TGT_FFREE(tgt)) {
264 CWARN("no free inodes on %s\n",
270 /* We can stop if we have enough good osts and our osts
271 are getting too small */
272 if ((TGT_BAVAIL(tgt) <= QOS_MIN) && (good_osts >= *stripe_cnt))
/* record this candidate and add its space to the running total */
274 availspace[good_osts] = TGT_BAVAIL(tgt);
275 indexes[good_osts] = tgt->index;
276 total_bavail += availspace[good_osts];
280 mutex_up(&lov->lov_lock);
/* the condition for this exit is not visible in this listing */
283 GOTO(out_free, rc = -ENOSPC);
285 /* if we don't have enough good OSTs, we reduce the stripe count. */
286 if (good_osts < *stripe_cnt)
287 *stripe_cnt = good_osts;
/* the condition for this exit is not visible in this listing */
290 GOTO(out_free, rc = -EAGAIN);
292 /* The point of all this shift and rand is to choose a 64-bit
293 random number between 0 and total_bavail. Apparently '%' doesn't
294 work for 64bit numbers. */
/* the loop body is missing here; presumably it counts the significant bits
 * of total_bavail into shift -- confirm */
296 while ((total_bavail >> shift) > 0)
299 /* Find enough OSTs with free space weighted random allocation */
300 while (nfound < *stripe_cnt) {
303 /* If the total storage left is < 4GB, don't use random order,
304 store in biggest OST first. (Low storage situation.)
305 Otherwise, choose a 64bit random number... */
306 rand = (shift < 32 ? 0ULL : (__u64)ll_rand() << 32) | ll_rand();
307 /* ... mask everything above shift... */
309 rand &= ((1ULL << shift) - 1);
310 /* ... and this while should execute at most once... */
311 while (rand > total_bavail)
312 rand -= total_bavail;
313 /* ... leaving us a 64bit number between 0 and total_bavail. */
315 /* Try to fit in bigger OSTs first. On average, this will
316 fill more toward the front of the OST array */
317 for (i = 0; i < good_osts; i++) {
318 cur_bavail += availspace[i];
319 if (cur_bavail >= rand) {
/* chosen: its space leaves the pool for the remaining draws */
320 total_bavail -= availspace[i];
322 idx_arr[nfound] = indexes[i];
327 /* should never satisfy below condition */
331 LASSERT(nfound == *stripe_cnt);
/* sizes passed here match the two OBD_ALLOC calls at the top */
335 OBD_FREE(availspace, sizeof(__u64) * ost_count);
337 OBD_FREE(indexes, sizeof(int) * require_stripes);
/* NOTE(review): the condition guarding this round-robin fallback is not
 * visible; presumably it is taken when rc is -EAGAIN -- confirm */
342 rc = alloc_rr(lov, idx_arr, stripe_cnt);
/*
 * alloc_idx_array - allocate an int array with one slot per stripe of @lsm
 * and have it filled with OST indices.
 *
 * @exp:     export of the LOV device
 * @lsm:     striping metadata; lsm_stripe_count sizes the array
 * @newea:   passed in by the caller; its use is on lines not visible here
 * @idx_arr: out parameter, presumably receives the array on success
 * @arr_cnt: out: number of ints allocated (set to lsm_stripe_count)
 *
 * alloc_qos() is called when the condition ending in
 * loi_ost_idx >= ld_tgt_count holds, alloc_specific() is called in the
 * other branch.  The OBD_FREE at the end releases the temporary array,
 * presumably on the error path only.
 *
 * NOTE(review): declarations of tmp_arr, i and rc, the body of the
 * initialising loop, the first half of the branch condition and the return
 * statements are missing from this listing (the embedded numbering skips
 * 352-355, 358-359, 361-363, 366, 368-374, 376-379).
 */
346 /* return new alloced stripe count on success */
347 static int alloc_idx_array(struct obd_export *exp, struct lov_stripe_md *lsm,
348 int newea, int **idx_arr, int *arr_cnt)
350 struct lov_obd *lov = &exp->exp_obd->u.lov;
351 int stripe_cnt = lsm->lsm_stripe_count;
/* the caller needs the element count again to free the array */
356 *arr_cnt = stripe_cnt;
357 OBD_ALLOC(tmp_arr, *arr_cnt * sizeof(int));
360 for (i = 0; i < *arr_cnt; i++)
364 lsm->lsm_oinfo[0].loi_ost_idx >= lov->desc.ld_tgt_count)
/* stripe_cnt may be lowered by alloc_qos(); *arr_cnt keeps the allocated size */
365 rc = alloc_qos(exp, tmp_arr, &stripe_cnt);
367 rc = alloc_specific(lov, lsm, tmp_arr);
375 OBD_FREE(tmp_arr, *arr_cnt * sizeof(int));
/*
 * free_idx_array - release an index array of arr_cnt ints, using the same
 * size expression as the OBD_ALLOC in alloc_idx_array().
 *
 * NOTE(review): embedded lines 381-382 and 384 are missing from this
 * listing; a guard in front of the OBD_FREE may exist there -- confirm.
 */
380 static void free_idx_array(int *idx_arr, int arr_cnt)
383 OBD_FREE(idx_arr, arr_cnt * sizeof(int));
/*
 * qos_prep_create - prepare one create request per stripe in @set.
 *
 * Visible steps:
 *  - When set->set_md is NULL a stripe count is chosen: the default from
 *    lov_get_stripecnt(), or, when OBD_MD_FLSIZE is valid in set->set_oa, a
 *    count derived under lov_lock from the targets on qos_bavail_list and
 *    never lower than the default.  The striping metadata is then allocated
 *    with lov_alloc_memmd(), using ld_pattern or LOV_PATTERN_RAID0.
 *  - lsm_object_id is taken from the source obdo; a zero lsm_stripe_size or
 *    lsm_pattern is replaced by the LOV defaults.
 *  - alloc_idx_array() selects the OST index of every stripe; the result
 *    must not exceed lsm_stripe_count.
 *  - For each selected index a lov_request is allocated and added to @set,
 *    with its own rq_md buffer and an rq_oa that starts as a copy of
 *    set->set_oa.  When OBD_MD_FLSIZE is valid, lov_size_to_stripe() is
 *    used for the per-stripe size.
 *  - When set->set_oti is set and OBD_MD_FLCOOKIE is valid, log cookies for
 *    set_count entries are allocated and stored in set->set_cookies.
 * Allocation failures jump to out_err with -ENOMEM.  obd_free_memmd() and
 * free_idx_array() are called near the end of the function.
 *
 * NOTE(review): many lines are missing from this listing (the embedded
 * numbering skips, e.g. 412-414, 417, 419-421, 433-440, 451, 481, 483,
 * 494-495, 501-503, 506-509).  Among them are the assignment of lsm, the
 * statement under the stripes < lsm_stripe_count test, the out_err label
 * and the return.  Whether the two cleanup calls at the end run on every
 * path or only on errors cannot be told from here.
 * NOTE(review): the comments at embedded lines 415-416 and 478-480 have lost
 * their closing lines in this listing.
 */
386 int qos_prep_create(struct obd_export *exp, struct lov_request_set *set)
388 struct lov_obd *lov = &exp->exp_obd->u.lov;
389 struct lov_stripe_md *lsm;
390 struct obdo *src_oa = set->set_oa;
391 struct obd_trans_info *oti = set->set_oti;
392 int i, stripes, rc = 0, newea = 0;
393 int *idx_arr, idx_cnt = 0;
/* the source obdo must carry a valid object id */
396 LASSERT(src_oa->o_valid & OBD_MD_FLID);
398 if (set->set_md == NULL) {
/* default stripe count of this LOV */
399 int stripe_cnt = lov_get_stripecnt(lov, 0);
401 /* If the MDS file was truncated up to some size, stripe over
402 * enough OSTs to allow the file to be created at that size.
403 * This may mean we use more than the default # of stripes. */
404 if (src_oa->o_valid & OBD_MD_FLSIZE) {
405 struct lov_tgt_desc *tgt;
407 /* Find the smallest number of stripes we can use
408 (up to # of active osts). */
410 mutex_down(&lov->lov_lock);
411 list_for_each_entry(tgt, &lov->qos_bavail_list,
415 /* All earlier tgts have at least this many
416 bytes available also, since our list is
418 if (TGT_BAVAIL(tgt) * stripes > src_oa->o_size)
422 mutex_up(&lov->lov_lock);
424 if (stripes < stripe_cnt)
425 stripes = stripe_cnt;
427 stripes = stripe_cnt;
430 rc = lov_alloc_memmd(&set->set_md, stripes,
431 lov->desc.ld_pattern ?
432 lov->desc.ld_pattern : LOV_PATTERN_RAID0,
441 lsm->lsm_object_id = src_oa->o_id;
442 if (!lsm->lsm_stripe_size)
443 lsm->lsm_stripe_size = lov->desc.ld_default_stripe_size;
444 if (!lsm->lsm_pattern) {
445 LASSERT(lov->desc.ld_pattern);
446 lsm->lsm_pattern = lov->desc.ld_pattern;
449 stripes = alloc_idx_array(exp, lsm, newea, &idx_arr, &idx_cnt);
450 LASSERT(stripes <= lsm->lsm_stripe_count);
452 GOTO(out_err, rc = stripes ? stripes : -EIO);
454 for (i = 0; i < stripes; i++) {
455 struct lov_request *req;
456 int ost_idx = idx_arr[i];
457 LASSERT(ost_idx >= 0);
459 OBD_ALLOC(req, sizeof(*req));
461 GOTO(out_err, rc = -ENOMEM);
462 lov_set_add_req(req, set);
464 req->rq_buflen = sizeof(*req->rq_md);
465 OBD_ALLOC(req->rq_md, req->rq_buflen);
466 if (req->rq_md == NULL)
467 GOTO(out_err, rc = -ENOMEM);
469 req->rq_oa = obdo_alloc();
470 if (req->rq_oa == NULL)
471 GOTO(out_err, rc = -ENOMEM);
473 req->rq_idx = ost_idx;
475 /* create data objects with "parent" OA */
476 memcpy(req->rq_oa, src_oa, sizeof(*req->rq_oa));
478 /* XXX When we start creating objects on demand, we need to
479 * make sure that we always create the object on the
480 * stripe which holds the existing file size.
482 if (src_oa->o_valid & OBD_MD_FLSIZE) {
484 lov_size_to_stripe(lsm, src_oa->o_size, i);
486 CDEBUG(D_INODE, "stripe %d has size "LPU64"/"LPU64"\n",
487 i, req->rq_oa->o_size, src_oa->o_size);
491 LASSERT(set->set_count == stripes);
493 if (stripes < lsm->lsm_stripe_count)
496 if (oti && (src_oa->o_valid & OBD_MD_FLCOOKIE)) {
497 oti_alloc_cookies(oti, set->set_count);
498 if (!oti->oti_logcookies)
499 GOTO(out_err, rc = -ENOMEM);
500 set->set_cookies = oti->oti_logcookies;
504 obd_free_memmd(exp, &set->set_md);
505 free_idx_array(idx_arr, idx_cnt);
/*
 * list_adjust - move @tgt to its sorted position after its key has changed.
 *
 * @tgt:       pointer to the struct lov_tgt_desc to reposition
 * @lov:       pointer to the object that embeds the list head
 * @list_name: name of the struct list_head member; the same member name is
 *             used for the head in @lov and for the link in @tgt
 * @value:     macro or function mapping a struct lov_tgt_desc pointer to the
 *             sort key
 *
 * The list is ordered from the largest key at the head to the smallest at
 * the tail.  An entry whose link is still empty is first added at the head.
 * The entry is then moved toward the tail past every successor with a
 * larger key, and toward the head past every predecessor with a smaller key.
 *
 * @tgt, @lov and @value are expanded several times, so the arguments must be
 * free of side effects.  The locals element and tmp are declared inside the
 * expansion.  The body is wrapped in do/while (0) so the macro behaves as a
 * single statement.
 *
 * Caveat: do not use list_move() to reposition an entry within the same
 * list.
 */
#define list_adjust(tgt, lov, list_name, value) \
do { \
        struct list_head *element; \
        struct lov_tgt_desc *tmp; \
 \
        /* not linked yet: start out at the head */ \
        if (list_empty(&(tgt)->list_name)) \
                list_add(&(tgt)->list_name, &(lov)->list_name); \
 \
        /* toward the tail: skip every successor with a larger key */ \
        element = (tgt)->list_name.next; \
        while ((element != &(lov)->list_name) && \
               (tmp = list_entry(element, struct lov_tgt_desc, list_name)) && \
               (value(tgt) < value(tmp))) \
                element = element->next; \
        if (element != (tgt)->list_name.next) { \
                list_del_init(&(tgt)->list_name); \
                /* re-link directly in front of element */ \
                list_add(&(tgt)->list_name, element->prev); \
        } \
 \
        /* toward the head: skip every predecessor with a smaller key */ \
        element = (tgt)->list_name.prev; \
        while ((element != &(lov)->list_name) && \
               (tmp = list_entry(element, struct lov_tgt_desc, list_name)) && \
               (value(tgt) > value(tmp))) \
                element = element->prev; \
        if (element != (tgt)->list_name.prev) { \
                list_del_init(&(tgt)->list_name); \
                /* element is the nearest predecessor whose key is not \
                 * smaller (or the list head), so the entry belongs \
                 * directly behind it.  Linking in front of element->prev \
                 * would place it too close to the head and break the \
                 * ordering. */ \
                list_add(&(tgt)->list_name, element); \
        } \
} while (0)
/*
 * qos_update - take new statfs data for target @idx and re-sort that target
 * on lov->qos_bavail_list.
 *
 * @lov:  LOV device owning tgts[] and the list
 * @idx:  index into lov->tgts
 * @osfs: statfs data; os_bavail * os_bsize is computed and logged
 *
 * The list is re-sorted under lov_lock with list_adjust(), keyed by
 * TGT_BAVAIL.
 *
 * NOTE(review): the declaration of bavail, the condition in front of the
 * zero-space warning and any lines that store @osfs are missing from this
 * listing (the embedded numbering skips 538, 540-542, 544, 546, 548).
 * TGT_BAVAIL reads the obd_osfs record reached through tgt->ltd_exp rather
 * than @osfs itself; confirm that record is current when the list is
 * re-sorted.
 */
537 void qos_update(struct lov_obd *lov, int idx, struct obd_statfs *osfs)
539 struct lov_tgt_desc *tgt = &lov->tgts[idx];
543 bavail = osfs->os_bavail * osfs->os_bsize;
545 CWARN("ost %d has zero avail space!\n", idx);
547 CDEBUG(D_OTHER, "QOS: bfree now "LPU64"\n", bavail);
/* the list is only modified while lov_lock is held */
549 mutex_down(&lov->lov_lock);
550 list_adjust(tgt, lov, qos_bavail_list, TGT_BAVAIL);
551 mutex_up(&lov->lov_lock);