4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2012, 2017, Intel Corporation.
29 * lustre/target/tgt_grant.c
31 * This file provides code related to grant space management on Lustre Targets
32 * (OSTs and MDTs). Grant is a mechanism used by client nodes to reserve disk
33 * space on a target for the data writeback cache. The Lustre client is thus
34 * assured that enough space will be available when flushing dirty pages
35 * asynchronously. Each client node is granted an initial amount of reserved
36 * space at connect time and gets additional space back from target in bulk
39 * We actually support three different cases:
40 * - The client supports the new grant parameters (i.e. OBD_CONNECT_GRANT_PARAM)
41 * which means that all grant overhead calculation happens on the client side.
42 * The server reports at connect time the backend filesystem block size, the
43 * maximum extent size as well as the extent insertion cost and it is then up
44 * to the osc layer to track dirty extents and consume grant accordingly
45 * (see osc_cache.c). In each bulk write request, the client provides how much
46 * grant space was consumed for this RPC.
47 * - The client does not support OBD_CONNECT_GRANT_PARAM and always assumes a
48 * backend file system block size of 4KB. We then have two cases:
49 * - If the block size is really 4KB, then the client can deal with grant
50 * allocation for partial block writes, but won't take extent insertion cost
51 * into account. For such clients, we inflate grant by 100% on the server
52 * side. It means that when 32MB of grant is held by the client, 64MB of
53 * grant space is actually reserved on the server. All grant counters
54 * provided by such a client are inflated by 100%.
55 * - The backend filesystem block size is bigger than 4KB, which isn't
56 * supported by the client. In this case, we emulate a 4KB block size and
57 * consume one block size on the server for each 4KB of grant returned to
58 * client. With a 128KB blocksize, it means that 32MB dirty pages of 4KB
59 * on the client will actually consume 1GB of grant on the server.
60 * All grant counters provided by such a client are inflated by the block
63 * This file handles the core logic for:
64 * - grant allocation strategy
65 * - maintaining per-client as well as global grant space accounting
66 * - processing grant information packed in incoming requests
67 * - allocating server-side grant space for synchronous write RPCs which did not
68 * consume grant on the client side (OBD_BRW_FROM_GRANT flag not set). If not
69 * enough space is available, such RPCs fail with ENOSPC
71 * Author: Johann Lombardi <johann.lombardi@intel.com>
74 #define DEBUG_SUBSYSTEM S_CLASS
77 #include <obd_class.h>
79 #include "tgt_internal.h"
/* Module parameter, registered with mode 0644 and described as LBUG on
 * grant miscount. It is tested in tgt_grant_alloc() when the grant counter
 * of an export is found outside its expected range. */
81 int lbug_on_grant_miscount;
82 module_param(lbug_on_grant_miscount, int, 0644);
83 MODULE_PARM_DESC(lbug_on_grant_miscount, "LBUG on grant miscount");
85 /* Clients typically hold 2x their max_rpcs_in_flight of grant space */
/* Expands to 16 times exp_max_brw_size(exp), computed in unsigned long long
 * arithmetic because of the 2ULL factor. tgt_grant_shrink() multiplies it
 * by tgd_tot_granted_clients to decide whether a shrink request is honoured.
 * NOTE(review): the factor 8 presumably stands for a default
 * max_rpcs_in_flight value -- confirm. */
86 #define TGT_GRANT_SHRINK_LIMIT(exp) (2ULL * 8 * exp_max_brw_size(exp))
88 /* Helpers to inflate/deflate grants for clients that do not support the grant
90 static inline u64 tgt_grant_inflate(struct tg_grants_data *tgd, u64 val)
92 if (tgd->tgd_blockbits > COMPAT_BSIZE_SHIFT)
93 /* Client does not support such large block size, grant
94 * is thus inflated. We already significantly overestimate
95 * overhead, no need to add the extent tax in this case */
/* val is multiplied by the ratio between the backend block size and the
 * block size assumed for such clients (1 << COMPAT_BSIZE_SHIFT).
 * NOTE(review): the path taken when tgd_blockbits does not exceed
 * COMPAT_BSIZE_SHIFT is not visible in this extract. */
96 return val << (tgd->tgd_blockbits - COMPAT_BSIZE_SHIFT);
100 /* Companion of tgt_grant_inflate() */
/* When tgd_blockbits exceeds COMPAT_BSIZE_SHIFT, val is shifted right by
 * the same amount that tgt_grant_inflate() shifts left. The right shift
 * drops the low-order bits, so deflating a value that is not a multiple of
 * the inflation factor rounds it down.
 * NOTE(review): the path taken for smaller block sizes is not visible in
 * this extract. */
101 static inline u64 tgt_grant_deflate(struct tg_grants_data *tgd, u64 val)
103 if (tgd->tgd_blockbits > COMPAT_BSIZE_SHIFT)
104 return val >> (tgd->tgd_blockbits - COMPAT_BSIZE_SHIFT);
108 /* Grant chunk is used as a unit for grant allocation. It should be inflated
109 * if the client does not support the grant parameters.
110 * Check connection flag against \a data if not NULL. This is used during
111 * connection creation where exp->exp_connect_data isn't populated yet */
/* Three cases are visible below:
 *  - the self export of the device gets
 *    OST_MAX_PRECREATE * ddp_inodespace / 2;
 *  - an export without GRANT_PARAM support (checked on \a data when it is
 *    not NULL, on the export otherwise) gets twice the inflated maximum
 *    BRW size;
 *  - any other export gets twice (maximum BRW size + extent tax for one
 *    RPC).
 * NOTE(review): the declaration of tax and the final return statement are
 * not visible in this extract; chunk is presumably the value returned. */
112 static inline u64 tgt_grant_chunk(struct obd_export *exp,
113 struct lu_target *lut,
114 struct obd_connect_data *data)
116 struct tg_grants_data *tgd = &lut->lut_tgd;
117 u64 chunk = exp_max_brw_size(exp);
120 if (exp->exp_obd->obd_self_export == exp)
121 /* Grant enough space to handle a big precreate request */
122 return OST_MAX_PRECREATE * lut->lut_dt_conf.ddp_inodespace / 2;
124 if ((data == NULL && !(exp_grant_param_supp(exp))) ||
125 (data != NULL && !OCD_HAS_FLAG(data, GRANT_PARAM)))
126 /* Try to grant enough space to send 2 full-size RPCs */
127 return tgt_grant_inflate(tgd, chunk) << 1;
129 /* Try to return enough to send two full-size RPCs
130 * = 2 * (BRW_size + #extents_in_BRW * grant_tax) */
/* tax is reused as a scratch value: it holds in turn the block size, the
 * maximum extent size in bytes, the number of extents needed for one RPC
 * (rounded up) and finally the total extent tax for one RPC. */
131 tax = 1ULL << tgd->tgd_blockbits; /* block size */
132 tax *= lut->lut_dt_conf.ddp_max_extent_blks; /* max extent size */
133 tax = (chunk + tax - 1) / tax; /* #extents in a RPC */
134 tax *= lut->lut_dt_conf.ddp_extent_tax; /* extent tax for a RPC */
135 chunk = (chunk + tax) * 2; /* we said two full RPCs */
/* Sanity-check the grant counters of one export and add them to running
 * totals.
 *
 * A message is emitted through CDEBUG_LIMIT when ted_grant, ted_pending or
 * ted_dirty is negative, and through CERROR when ted_grant + ted_pending or
 * ted_dirty exceeds \a maxsize. The visible tail of the function adds
 * ted_grant + ted_pending to *granted, ted_pending to *pending and
 * ted_dirty to *dirty.
 *
 * \param[in]     exp      export whose counters are examined
 * \param[in,out] dirty    running total of dirty bytes
 * \param[in,out] pending  running total of pending bytes
 * \param[in,out] granted  running total of granted plus pending bytes
 * \param[in]     maxsize  upper bound that no counter may exceed
 *
 * NOTE(review): the lines that select the debug level and that produce the
 * return value are not visible in this extract; tgt_grant_sanity_check()
 * stores the result in a variable named error. */
139 static int tgt_check_export_grants(struct obd_export *exp, u64 *dirty,
140 u64 *pending, u64 *granted, u64 maxsize)
142 struct tg_export_data *ted = &exp->exp_target_data;
145 if (ted->ted_grant < 0 || ted->ted_pending < 0 || ted->ted_dirty < 0)
147 CDEBUG_LIMIT(level, "%s: cli %s/%p dirty %ld pend %ld grant %ld\n",
148 exp->exp_obd->obd_name, exp->exp_client_uuid.uuid, exp,
149 ted->ted_dirty, ted->ted_pending, ted->ted_grant);
151 if (ted->ted_grant + ted->ted_pending > maxsize) {
152 CERROR("%s: cli %s/%p ted_grant(%ld) + ted_pending(%ld)"
153 " > maxsize(%llu)\n", exp->exp_obd->obd_name,
154 exp->exp_client_uuid.uuid, exp, ted->ted_grant,
155 ted->ted_pending, maxsize);
158 if (ted->ted_dirty > maxsize) {
159 CERROR("%s: cli %s/%p ted_dirty(%ld) > maxsize(%llu)\n",
160 exp->exp_obd->obd_name, exp->exp_client_uuid.uuid,
161 exp, ted->ted_dirty, maxsize);
164 *granted += ted->ted_grant + ted->ted_pending;
165 *pending += ted->ted_pending;
166 *dirty += ted->ted_dirty;
171 * Perform extra sanity checks for grant accounting.
173 * This function scans the export list, sanity checks per-export grant counters
174 * and verifies accuracy of global grant accounting. If an inconsistency is
175 * found, a CERROR is printed with the function name \func that was passed as
176 * argument. LBUG is only called in case of serious counter corruption (i.e.
177 * value larger than the device size).
178 * Those sanity checks can be pretty expensive and are disabled if the OBD
179 * device has more than 100 connected exports by default.
181 * \param[in] obd OBD device for which grant accounting should be
183 * \param[in] func caller's function name
/* NOTE(review): this listing omits some original lines (braces, local
 * declarations, early returns). The comments added below describe only the
 * statements that are visible. */
185 void tgt_grant_sanity_check(struct obd_device *obd, const char *func)
187 struct lu_target *lut = obd->u.obt.obt_lut;
188 struct tg_grants_data *tgd = &lut->lut_tgd;
189 struct obd_export *exp;
190 struct tg_export_data *ted;
/* Test for a device that has no export linked to it. */
200 if (list_empty(&obd->obd_exports))
204 * We don't want to do this for large machines that do lots of
205 * mounts or unmounts. It burns...
206 * Use set_param to change obd_grant_check_threshold, which
207 * is 100 by default, 0 to always check grants
209 if (obd->obd_num_exports > obd->obd_grant_check_threshold &&
210 obd->obd_grant_check_threshold)
/* Upper bound for every counter: the device size in bytes, derived from the
 * cached statfs block count and the block-size shift. */
213 maxsize = tgd->tgd_osfs.os_blocks << tgd->tgd_blockbits;
/* Both locks are held while the export lists are walked: obd_dev_lock is
 * taken first, then tgd_grant_lock. */
215 spin_lock(&obd->obd_dev_lock);
216 spin_lock(&tgd->tgd_grant_lock);
/* The self export is added to the totals directly, without going through
 * tgt_check_export_grants(). */
217 exp = obd->obd_self_export;
218 ted = &exp->exp_target_data;
219 CDEBUG(D_CACHE, "%s: processing self export: %ld %ld "
220 "%ld\n", obd->obd_name, ted->ted_grant,
221 ted->ted_pending, ted->ted_dirty);
222 tot_granted += ted->ted_grant + ted->ted_pending;
223 tot_pending += ted->ted_pending;
224 tot_dirty += ted->ted_dirty;
226 list_for_each_entry(exp, &obd->obd_exports, exp_obd_chain) {
227 error = tgt_check_export_grants(exp, &tot_dirty, &tot_pending,
228 &tot_granted, maxsize);
/* NOTE(review): this unlock pair presumably belongs to an error path taken
 * when the check above fails; the enclosing condition is not visible. */
230 spin_unlock(&obd->obd_dev_lock);
231 spin_unlock(&tgd->tgd_grant_lock);
236 /* exports about to be unlinked should also be taken into account since
237 * they might still hold pending grant space to be released at
239 list_for_each_entry(exp, &obd->obd_unlinked_exports, exp_obd_chain) {
240 error = tgt_check_export_grants(exp, &tot_dirty, &tot_pending,
241 &tot_granted, maxsize);
243 spin_unlock(&obd->obd_dev_lock);
244 spin_unlock(&tgd->tgd_grant_lock);
249 fo_tot_granted = tgd->tgd_tot_granted;
250 fo_tot_pending = tgd->tgd_tot_pending;
251 fo_tot_dirty = tgd->tgd_tot_dirty;
252 spin_unlock(&obd->obd_dev_lock);
253 spin_unlock(&tgd->tgd_grant_lock);
255 if (tot_granted != fo_tot_granted)
256 CERROR("%s: tot_granted %llu != fo_tot_granted %llu\n",
257 func, tot_granted, fo_tot_granted);
258 if (tot_pending != fo_tot_pending)
259 CERROR("%s: tot_pending %llu != fo_tot_pending %llu\n",
260 func, tot_pending, fo_tot_pending);
261 if (tot_dirty != fo_tot_dirty)
262 CERROR("%s: tot_dirty %llu != fo_tot_dirty %llu\n",
263 func, tot_dirty, fo_tot_dirty);
264 if (tot_pending > tot_granted)
265 CERROR("%s: tot_pending %llu > tot_granted %llu\n",
266 func, tot_pending, tot_granted);
267 if (tot_granted > maxsize)
268 CERROR("%s: tot_granted %llu > maxsize %llu\n",
269 func, tot_granted, maxsize);
270 if (tot_dirty > maxsize)
271 CERROR("%s: tot_dirty %llu > maxsize %llu\n",
272 func, tot_dirty, maxsize);
274 EXPORT_SYMBOL(tgt_grant_sanity_check);
277 * Get file system statistics of target.
279 * Helper function for statfs(), also used by grant code.
280 * Implements caching for statistics to avoid calling OSD device each time.
282 * \param[in] env execution environment
283 * \param[in] lut LU target
284 * \param[out] osfs statistic data to return
285 * \param[in] max_age maximum age for cached data
286 * \param[out] from_cache set to show whether the data was taken from the cache or not
288 * \retval 0 if successful
289 * \retval negative value on error
291 int tgt_statfs_internal(const struct lu_env *env, struct lu_target *lut,
292 struct obd_statfs *osfs, time64_t max_age, int *from_cache)
294 struct tg_grants_data *tgd = &lut->lut_tgd;
298 spin_lock(&tgd->tgd_osfs_lock);
299 if (tgd->tgd_osfs_age < max_age || max_age == 0) {
302 /* statfs data are too old, get up-to-date one.
303 * we must be cautious here since multiple threads might be
304 * willing to update statfs data concurrently and we must
305 * grant that cached statfs data are always consistent */
307 if (tgd->tgd_statfs_inflight == 0)
308 /* clear inflight counter if no users, although it would
309 * take a while to overflow this 64-bit counter ... */
310 tgd->tgd_osfs_inflight = 0;
311 /* notify tgt_grant_commit() that we want to track writes
312 * completed as of now */
313 tgd->tgd_statfs_inflight++;
314 /* record value of inflight counter before running statfs to
315 * compute the diff once statfs is completed */
316 unstable = tgd->tgd_osfs_inflight;
317 spin_unlock(&tgd->tgd_osfs_lock);
319 /* statfs can sleep ... hopefully not for too long since we can
320 * call it fairly often as space fills up */
321 rc = dt_statfs(env, lut->lut_bottom, osfs);
/* Cap the reported maximum name length at NAME_MAX. */
325 osfs->os_namelen = min_t(__u32, osfs->os_namelen, NAME_MAX);
/* tgd_grant_lock is taken before tgd_osfs_lock. tgt_grant_space_left()
 * nests the two locks in the same order. */
327 spin_lock(&tgd->tgd_grant_lock);
328 spin_lock(&tgd->tgd_osfs_lock);
329 /* calculate how much space was written while we released the
331 unstable = tgd->tgd_osfs_inflight - unstable;
332 tgd->tgd_osfs_unstable = 0;
334 /* some writes committed while we were running statfs
335 * w/o the tgd_osfs_lock. Those ones got added to
336 * the cached statfs data that we are about to crunch.
337 * Take them into account in the new statfs data */
/* min_t() bounds the amount subtracted by os_bavail itself, so the
 * unsigned block count cannot wrap below zero. */
338 osfs->os_bavail -= min_t(u64, osfs->os_bavail,
339 unstable >> tgd->tgd_blockbits);
340 /* However, we don't really know if those writes got
341 * accounted in the statfs call, so tell
342 * tgt_grant_space_left() there is some uncertainty
343 * on the accounting of those writes.
344 * The purpose is to prevent spurious error messages in
345 * tgt_grant_space_left() since those writes might be
346 * accounted twice. */
347 tgd->tgd_osfs_unstable += unstable;
349 /* similarly, there is some uncertainty on write requests
350 * between prepare & commit */
351 tgd->tgd_osfs_unstable += tgd->tgd_tot_pending;
352 spin_unlock(&tgd->tgd_grant_lock);
354 /* finally update cached statfs data */
355 tgd->tgd_osfs = *osfs;
356 tgd->tgd_osfs_age = ktime_get_seconds();
358 tgd->tgd_statfs_inflight--; /* stop tracking */
359 if (tgd->tgd_statfs_inflight == 0)
360 tgd->tgd_osfs_inflight = 0;
361 spin_unlock(&tgd->tgd_osfs_lock);
366 /* use cached statfs data */
/* The cached structure is copied out while tgd_osfs_lock is still held. */
367 *osfs = tgd->tgd_osfs;
368 spin_unlock(&tgd->tgd_osfs_lock);
377 EXPORT_SYMBOL(tgt_statfs_internal);
380 * Update cached statfs information from the OSD layer
382 * Refresh statfs information cached in tgd::tgd_osfs if the cache is older
383 * than 1s or if force is set. The OSD layer is in charge of estimating data &
385 * This function can sleep so it should not be called with any spinlock held.
387 * \param[in] env LU environment passed by the caller
388 * \param[in] exp export used to print client info in debug
390 * \param[in] force force a refresh of statfs information
391 * \param[out] from_cache returns whether the statfs information are
/* max_age selects the cache policy handed to tgt_statfs_internal(): 0 asks
 * for fresh data, otherwise the cut-off is the current time minus
 * OBD_STATFS_CACHE_SECONDS. The obd_statfs buffer used for the call is the
 * tti_u.osfs member of the tgt_thread_info returned by tgt_th_info(env).
 * NOTE(review): the condition choosing between the two max_age assignments
 * and the handling of rc are not visible in this extract; the condition is
 * presumably the force argument. */
394 static void tgt_grant_statfs(const struct lu_env *env, struct obd_export *exp,
395 int force, int *from_cache)
397 struct obd_device *obd = exp->exp_obd;
398 struct lu_target *lut = obd->u.obt.obt_lut;
399 struct tg_grants_data *tgd = &lut->lut_tgd;
400 struct tgt_thread_info *tti;
401 struct obd_statfs *osfs;
406 max_age = 0; /* get fresh statfs data */
408 max_age = ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS;
410 tti = tgt_th_info(env);
411 osfs = &tti->tti_u.osfs;
412 rc = tgt_statfs_internal(env, lut, osfs, max_age, from_cache);
/* Log free and available space converted from blocks to bytes. */
419 CDEBUG(D_CACHE, "%s: cli %s/%p free: %llu avail: %llu\n",
420 obd->obd_name, exp->exp_client_uuid.uuid, exp,
421 osfs->os_bfree << tgd->tgd_blockbits,
422 osfs->os_bavail << tgd->tgd_blockbits);
426 * Figure out how much space is available on the backend filesystem after
427 * removing grant space already booked by clients.
429 * This is done by accessing cached statfs data previously populated by
430 * tgt_grant_statfs(), from which we withdraw the space already granted to
431 * clients and the reserved space.
432 * Caller must hold tgd_grant_lock spinlock.
434 * \param[in] exp export associated with the device for which the amount
435 * of available space is requested
436 * \retval amount of non-allocated space, in bytes
/* NOTE(review): this listing omits some original lines (local declarations,
 * the debug-mask values, the subtraction of granted space and the return
 * statement). The comments added below describe only what is visible. */
438 static u64 tgt_grant_space_left(struct obd_export *exp)
440 struct obd_device *obd = exp->exp_obd;
441 struct lu_target *lut = obd->u.obt.obt_lut;
442 struct tg_grants_data *tgd = &lut->lut_tgd;
450 assert_spin_locked(&tgd->tgd_grant_lock);
/* tgd_osfs_lock is nested inside tgd_grant_lock and is held only long
 * enough to snapshot the two cached values. */
452 spin_lock(&tgd->tgd_osfs_lock);
453 /* get available space from cached statfs data */
454 left = tgd->tgd_osfs.os_bavail << tgd->tgd_blockbits;
455 unstable = tgd->tgd_osfs_unstable; /* those might be accounted twice */
456 spin_unlock(&tgd->tgd_osfs_lock);
/* tgd_reserved_pcnt percent of the available space is set aside and
 * counted together with the space already granted. */
458 reserved = left * tgd->tgd_reserved_pcnt / 100;
459 tot_granted = tgd->tgd_tot_granted + reserved;
461 if (left < tot_granted) {
/* The message level depends on whether the shortfall can be explained by
 * the unstable and pending amounts. */
462 int mask = (left + unstable <
463 tot_granted - tgd->tgd_tot_pending) ?
466 /* the below message is checked in sanityn.sh test_15 */
468 "%s: cli %s/%p left=%llu < tot_grant=%llu unstable=%llu pending=%llu dirty=%llu\n",
469 obd->obd_name, exp->exp_client_uuid.uuid, exp,
470 left, tot_granted, unstable,
471 tgd->tgd_tot_pending,
477 /* Withdraw space already granted to clients */
480 /* Align left on block size */
481 left &= ~((1ULL << tgd->tgd_blockbits) - 1);
484 "%s: cli %s/%p avail=%llu left=%llu unstable=%llu tot_grant=%llu pending=%llu\n",
485 obd->obd_name, exp->exp_client_uuid.uuid, exp, avail, left,
486 unstable, tot_granted, tgd->tgd_tot_pending);
492 * Process grant information from obdo structure packed in incoming BRW
493 * and inflate grant counters if required.
495 * Grab the dirty and seen grant announcements from the incoming obdo and
496 * inflate all grant counters passed in the request if the client does not
497 * support the grant parameters.
498 * We will later calculate the client's new grant and return it.
499 * Caller must hold tgd_grant_lock spinlock.
501 * \param[in] env LU environment supplying osfs storage
502 * \param[in] exp export for which we received the request
503 * \param[in,out] oa incoming obdo sent by the client
/* \a chunk is the grant allocation unit; it is used below only to bound the
 * dirty value accepted from the client (at most ted_grant + 4 * chunk).
 * NOTE(review): this listing omits some original lines (braces, the
 * declaration of tmp, several assignments and early returns). The comments
 * added below describe only the statements that are visible. */
505 static void tgt_grant_incoming(const struct lu_env *env, struct obd_export *exp,
506 struct obdo *oa, long chunk)
508 struct tg_export_data *ted = &exp->exp_target_data;
509 struct obd_device *obd = exp->exp_obd;
510 struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd;
511 long long dirty, dropped;
514 assert_spin_locked(&tgd->tgd_grant_lock);
/* Grant information is only used when both OBD_MD_FLBLOCKS and
 * OBD_MD_FLGRANT are set; otherwise OBD_MD_FLGRANT is cleared. */
516 if ((oa->o_valid & (OBD_MD_FLBLOCKS|OBD_MD_FLGRANT)) !=
517 (OBD_MD_FLBLOCKS|OBD_MD_FLGRANT)) {
518 oa->o_valid &= ~OBD_MD_FLGRANT;
522 /* Add some margin, since there is a small race if other RPCs arrive
523 * out-of-order and have already consumed some grant. We want to
524 * leave this here in case there is a large error in accounting. */
526 "%s: cli %s/%p reports grant %llu dropped %u, local %lu\n",
527 obd->obd_name, exp->exp_client_uuid.uuid, exp, oa->o_grant,
528 oa->o_dropped, ted->ted_grant);
530 if ((long long)oa->o_dirty < 0)
533 /* inflate grant counters if required */
534 if (!exp_grant_param_supp(exp)) {
536 oa->o_grant = tgt_grant_inflate(tgd, oa->o_grant);
537 oa->o_dirty = tgt_grant_inflate(tgd, oa->o_dirty);
538 /* inflation can bump client's wish to >4GB which doesn't fit
539 * 32bit o_undirty, limit that .. */
540 tmp = tgt_grant_inflate(tgd, oa->o_undirty);
541 if (tmp >= OBD_MAX_GRANT)
/* NOTE(review): the mask below clears only the single bit at position
 * tgd_blockbits. If the intent is to round OBD_MAX_GRANT down to a block
 * boundary, the mask would have to clear all lower bits as well, as
 * tgt_grant_space_left() does -- confirm. The same expression is used for
 * o_dropped further down. */
542 tmp = OBD_MAX_GRANT & ~(1ULL << tgd->tgd_blockbits);
544 tmp = tgt_grant_inflate(tgd, oa->o_dropped);
545 if (tmp >= OBD_MAX_GRANT)
546 tmp = OBD_MAX_GRANT & ~(1ULL << tgd->tgd_blockbits);
551 dropped = oa->o_dropped;
553 /* Update our accounting now so that statfs takes it into account.
554 * Note that ted_dirty is only approximate and can become incorrect
555 * if RPCs arrive out-of-order. No important calculations depend
556 * on ted_dirty however, but we must check sanity to not assert. */
557 if (dirty > ted->ted_grant + 4 * chunk)
558 dirty = ted->ted_grant + 4 * chunk;
/* The global dirty total moves by the difference between the new and the
 * previously recorded per-export value. */
559 tgd->tgd_tot_dirty += dirty - ted->ted_dirty;
560 if (ted->ted_grant < dropped) {
562 "%s: cli %s/%p reports %llu dropped > grant %lu\n",
563 obd->obd_name, exp->exp_client_uuid.uuid, exp, dropped,
567 if (tgd->tgd_tot_granted < dropped) {
568 CERROR("%s: cli %s/%p reports %llu dropped > tot_grant %llu\n",
569 obd->obd_name, exp->exp_client_uuid.uuid, exp,
570 dropped, tgd->tgd_tot_granted);
/* Dropped grant is removed from both the global and the per-export
 * counters. */
573 tgd->tgd_tot_granted -= dropped;
574 ted->ted_grant -= dropped;
575 ted->ted_dirty = dirty;
577 if (ted->ted_dirty < 0 || ted->ted_grant < 0 || ted->ted_pending < 0) {
578 CERROR("%s: cli %s/%p dirty %ld pend %ld grant %ld\n",
579 obd->obd_name, exp->exp_client_uuid.uuid, exp,
580 ted->ted_dirty, ted->ted_pending, ted->ted_grant);
/* NOTE(review): the lock is released here although the caller is expected
 * to hold it; what follows is not visible, presumably a fatal assertion. */
581 spin_unlock(&tgd->tgd_grant_lock);
588 * Grant shrink request handler.
590 * Client nodes can explicitly release grant space (i.e. process called grant
591 * shrinking). This function proceeds with the shrink request when there is
592 * less ungranted space remaining than the amount all of the connected clients
593 * would consume if they used their full grant.
594 * Caller must hold tgd_grant_lock spinlock.
596 * \param[in] exp export releasing grant space
597 * \param[in,out] oa incoming obdo sent by the client
598 * \param[in] left_space remaining free space with space already granted
/* NOTE(review): this listing omits some original lines (the last line of
 * the parameter list, the declaration of grant_shrink, an early return and
 * the statement that follows the final comment). The comments added below
 * describe only the statements that are visible. */
601 static void tgt_grant_shrink(struct obd_export *exp, struct obdo *oa,
604 struct tg_export_data *ted = &exp->exp_target_data;
605 struct obd_device *obd = exp->exp_obd;
606 struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd;
609 assert_spin_locked(&tgd->tgd_grant_lock);
/* Threshold test: remaining space is compared with
 * TGT_GRANT_SHRINK_LIMIT(exp) for each of tgd_tot_granted_clients. */
611 if (left_space >= tgd->tgd_tot_granted_clients *
612 TGT_GRANT_SHRINK_LIMIT(exp))
615 grant_shrink = oa->o_grant;
/* Never release more than the export currently holds. */
617 if (ted->ted_grant < grant_shrink) {
619 "%s: cli %s/%p wants %lu shrinked > grant %lu\n",
620 obd->obd_name, exp->exp_client_uuid.uuid, exp,
621 grant_shrink, ted->ted_grant);
622 grant_shrink = ted->ted_grant;
/* The released amount leaves both the per-export and the global counter. */
625 ted->ted_grant -= grant_shrink;
626 tgd->tgd_tot_granted -= grant_shrink;
628 CDEBUG(D_CACHE, "%s: cli %s/%p shrink %ld ted_grant %ld total %llu\n",
629 obd->obd_name, exp->exp_client_uuid.uuid, exp, grant_shrink,
630 ted->ted_grant, tgd->tgd_tot_granted);
632 /* client has just released some grant, don't grant any space back */
637 * Calculate how much space is required to write a given network buffer
639 * This function takes block alignment into account to estimate how much on-disk
640 * space will be required to successfully write the whole niobuf.
641 * Estimated space is inflated if the export does not support
642 * OBD_CONNECT_GRANT_PARAM and if the backend filesystem has a block size
643 * larger than the minimal supported page size (i.e. 4KB).
645 * \param[in] exp export associated with the write request
646 * if NULL, then size estimate is done for server-side
648 * \param[in] lut LU target handling the request
649 * \param[in] rnb network buffer to estimate size of
651 * \retval space (in bytes) that will be consumed to write the
/* NOTE(review): this listing omits some original lines (local declarations,
 * the else keywords, one condition and the return statement). The comments
 * added below describe only the statements that are visible. */
654 static inline u64 tgt_grant_rnb_size(struct obd_export *exp,
655 struct lu_target *lut,
656 struct niobuf_remote *rnb)
658 struct tg_grants_data *tgd = &lut->lut_tgd;
/* Accounting block size: 1 << COMPAT_BSIZE_SHIFT for a client without
 * grant-parameter support on a backend with larger blocks; the second
 * visible assignment uses the backend block size. */
663 if (exp && !exp_grant_param_supp(exp) &&
664 tgd->tgd_blockbits > COMPAT_BSIZE_SHIFT)
665 blksize = 1ULL << COMPAT_BSIZE_SHIFT;
667 blksize = 1ULL << tgd->tgd_blockbits;
669 /* The network buffer might span several blocks, align it on block
671 bytes = rnb->rnb_offset & (blksize - 1);
672 bytes += rnb->rnb_len;
673 end = bytes & (blksize - 1);
675 bytes += blksize - end;
677 if (exp == NULL || exp_grant_param_supp(exp)) {
678 /* add per-extent insertion cost */
/* One extent tax is charged for every started extent of max_ext bytes. */
682 max_ext = blksize * lut->lut_dt_conf.ddp_max_extent_blks;
683 nr_ext = (bytes + max_ext - 1) / max_ext;
684 bytes += nr_ext * lut->lut_dt_conf.ddp_extent_tax;
686 /* Inflate grant space if client does not support extent-based
687 * grant allocation */
688 bytes = tgt_grant_inflate(tgd, (u64)bytes);
695 * Validate grant accounting for each incoming remote network buffer.
697 * When clients have dirtied as much space as they've been granted they
698 * fall through to sync writes. These sync writes haven't been expressed
699 * in grants and need to error with ENOSPC when there isn't room in the
700 * filesystem for them after grants are taken into account. However,
701 * writeback of the dirty data that was already granted space can write
703 * The OBD_BRW_GRANTED flag will be set in the rnb_flags of each network
704 * buffer which has been granted enough space to proceed. Buffers without
705 * this flag will fail to be written with -ENOSPC (see tgt_preprw_write()).
706 * Caller must hold tgd_grant_lock spinlock.
708 * \param[in] env LU environment passed by the caller
709 * \param[in] exp export identifying the client which sent the RPC
710 * \param[in] oa incoming obdo in which we should pack the
712 * \param[in,out] rnb the list of network buffers
713 * \param[in] niocount the number of network buffers in the list
714 * \param[in] left the remaining free space with space already granted
/* Two running sums are maintained below: granted, the space charged to the
 * grant held by the export, and ungranted, the space taken from the
 * unallocated pool. Their sum is stored in oa->o_grant_used and added to
 * the pending counters near the end of the function.
 * NOTE(review): this listing omits some original lines (braces, local
 * declarations, else and continue statements, several assignments). The
 * comments added below describe only the statements that are visible. */
717 static void tgt_grant_check(const struct lu_env *env, struct obd_export *exp,
718 struct obdo *oa, struct niobuf_remote *rnb,
719 int niocount, u64 *left)
721 struct tg_export_data *ted = &exp->exp_target_data;
722 struct obd_device *obd = exp->exp_obd;
723 struct lu_target *lut = obd->u.obt.obt_lut;
724 struct tg_grants_data *tgd = &lut->lut_tgd;
725 unsigned long ungranted = 0;
726 unsigned long granted = 0;
732 assert_spin_locked(&tgd->tgd_grant_lock);
734 if (obd->obd_recovering) {
735 /* Replaying write. Grant info have been processed already so no
736 * need to do any enforcement here. It is worth noting that only
737 * bulk writes with all rnbs having OBD_BRW_FROM_GRANT can be
738 * replayed. If one page hasn't OBD_BRW_FROM_GRANT set, then
739 * the whole bulk is written synchronously */
741 CDEBUG(D_CACHE, "Replaying write, skipping accounting\n");
742 } else if ((oa->o_valid & OBD_MD_FLFLAGS) &&
743 (oa->o_flags & OBD_FL_RECOV_RESEND)) {
744 /* Recoverable resend, grant info have already been processed as
747 CDEBUG(D_CACHE, "Recoverable resend arrived, skipping "
749 } else if (exp_grant_param_supp(exp) && oa->o_grant_used > 0) {
750 /* Client supports the new grant parameters and is telling us
751 * how much grant space it consumed for this bulk write.
752 * Although all rnbs are supposed to have the OBD_BRW_FROM_GRANT
753 * flag set, we will scan the rnb list and looks for non-cache
754 * I/O in case it changes in the future */
755 if (ted->ted_grant >= oa->o_grant_used) {
756 /* skip grant accounting for rnbs with
757 * OBD_BRW_FROM_GRANT and just used grant consumption
758 * claimed in the request */
759 granted = oa->o_grant_used;
762 /* client has used more grants for this request that
764 CERROR("%s: cli %s claims %lu GRANT, real grant %lu\n",
765 exp->exp_obd->obd_name,
766 exp->exp_client_uuid.uuid,
767 (unsigned long)oa->o_grant_used, ted->ted_grant);
769 /* check whether we can fill the gap with unallocated
771 if (*left > (oa->o_grant_used - ted->ted_grant)) {
772 /* ouf .. we are safe for now */
/* Everything the export holds is charged to its grant; the remainder of
 * the claimed consumption is taken from unallocated space. */
773 granted = ted->ted_grant;
774 ungranted = oa->o_grant_used - granted;
778 /* too bad, but we cannot afford to blow up our grant
779 * accounting. The loop below will handle each rnb in
784 for (i = 0; i < niocount; i++) {
787 if ((rnb[i].rnb_flags & OBD_BRW_FROM_GRANT)) {
789 rnb[i].rnb_flags |= OBD_BRW_GRANTED;
793 /* compute how much grant space is actually needed for
794 * this rnb, inflate grant if required */
795 bytes = tgt_grant_rnb_size(exp, lut, &rnb[i]);
/* The buffer is accepted against the grant of the export when that grant
 * still covers what was accepted so far plus this buffer. */
796 if (ted->ted_grant >= granted + bytes) {
798 rnb[i].rnb_flags |= OBD_BRW_GRANTED;
802 CDEBUG(D_CACHE, "%s: cli %s/%p claims %ld+%d GRANT, "
803 "real grant %lu idx %d\n", obd->obd_name,
804 exp->exp_client_uuid.uuid, exp, granted, bytes,
808 if (obd->obd_recovering)
809 CERROR("%s: cli %s is replaying OST_WRITE while one rnb"
810 " hasn't OBD_BRW_FROM_GRANT set (0x%x)\n",
811 obd->obd_name, exp->exp_client_uuid.uuid,
814 /* Consume grant space on the server.
815 * Unlike above, tgt_grant_rnb_size() is called with exp = NULL
816 * so that the required grant space isn't inflated. This is
817 * done on purpose since the server can deal with large block
818 * size, unlike some clients */
819 bytes = tgt_grant_rnb_size(NULL, lut, &rnb[i]);
821 /* if enough space, pretend it was granted */
824 rnb[i].rnb_flags |= OBD_BRW_GRANTED;
828 /* We can't check for already-mapped blocks here (make sense
829 * when backend filesystem does not use COW) as it requires
830 * dropping the grant lock.
831 * Instead, we clear OBD_BRW_GRANTED and in that case we need
832 * to go through and verify if all of the blocks not marked
833 * BRW_GRANTED are already mapped and we can ignore this error.
835 rnb[i].rnb_flags &= ~OBD_BRW_GRANTED;
836 CDEBUG(D_CACHE, "%s: cli %s/%p idx %d no space for %d\n",
837 obd->obd_name, exp->exp_client_uuid.uuid, exp, i, bytes);
840 /* record in o_grant_used the actual space reserved for the I/O, will be
841 * used later in tgt_grant_commit() */
842 oa->o_grant_used = granted + ungranted;
844 /* record space used for the I/O, will be used in tgt_grant_commit() */
845 /* Now subtract what the client has used already. We don't subtract
846 * this from the tot_granted yet, so that other clients can't grab
847 * that space before we have actually allocated our blocks. That
848 * happens in tgt_grant_commit() after the writes are done. */
/* granted leaves the grant of the export; granted + ungranted becomes
 * pending for the export and globally; only ungranted is new in the global
 * granted total. */
849 ted->ted_grant -= granted;
850 ted->ted_pending += oa->o_grant_used;
851 tgd->tgd_tot_granted += ungranted;
852 tgd->tgd_tot_pending += oa->o_grant_used;
855 "%s: cli %s/%p granted: %lu ungranted: %lu grant: %lu dirty: %lu"
856 "\n", obd->obd_name, exp->exp_client_uuid.uuid, exp,
857 granted, ungranted, ted->ted_grant, ted->ted_dirty);
859 if (obd->obd_recovering || (oa->o_valid & OBD_MD_FLGRANT) == 0)
860 /* don't update dirty accounting during recovery or
861 * if grant information got discarded (e.g. during resend) */
/* No more dirty space can be cleared than is recorded for the export. */
864 if (ted->ted_dirty < granted) {
865 CWARN("%s: cli %s/%p claims granted %lu > ted_dirty %lu\n",
866 obd->obd_name, exp->exp_client_uuid.uuid, exp,
867 granted, ted->ted_dirty);
868 granted = ted->ted_dirty;
870 tgd->tgd_tot_dirty -= granted;
871 ted->ted_dirty -= granted;
873 if (ted->ted_dirty < 0 || ted->ted_grant < 0 || ted->ted_pending < 0) {
874 CERROR("%s: cli %s/%p dirty %ld pend %ld grant %ld\n",
875 obd->obd_name, exp->exp_client_uuid.uuid, exp,
876 ted->ted_dirty, ted->ted_pending, ted->ted_grant);
/* NOTE(review): the lock is released here although the caller is expected
 * to hold it; what follows is not visible, presumably a fatal assertion. */
877 spin_unlock(&tgd->tgd_grant_lock);
884 * Allocate additional grant space to a client
886 * Calculate how much grant space to return to client, based on how much space
887 * is currently free and how much of that is already granted.
888 * Caller must hold tgd_grant_lock spinlock.
890 * \param[in] exp export of the client which sent the request
891 * \param[in] curgrant current grant claimed by the client
892 * \param[in] want how much grant space the client would like to
894 * \param[in] left remaining free space with granted space taken
896 * \param[in] chunk grant allocation unit
897 * \param[in] conservative if set to true, the server should be cautious
898 * and limit how much space is granted back to the
899 * client. Otherwise, the server should try hard to
900 * satisfy the client request.
902 * \retval amount of grant space allocated
/* NOTE(review): this listing omits some original lines (the last line of
 * the parameter list, which presumably declares the conservative flag used
 * below, the declaration of grant, the early returns and the final
 * return). The comments added below describe only the statements that are
 * visible. */
904 static long tgt_grant_alloc(struct obd_export *exp, u64 curgrant,
905 u64 want, u64 left, long chunk,
908 struct obd_device *obd = exp->exp_obd;
909 struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd;
910 struct tg_export_data *ted = &exp->exp_target_data;
915 if (OBD_FAIL_CHECK(OBD_FAIL_TGT_NO_GRANT))
918 /* When tgd_grant_compat_disable is set, we don't grant any space to
919 * clients not supporting OBD_CONNECT_GRANT_PARAM.
920 * Otherwise, space granted to such a client is inflated since it
921 * consumes PAGE_SIZE of grant space per block */
922 if ((obd->obd_self_export != exp && !exp_grant_param_supp(exp) &&
923 tgd->tgd_grant_compat_disable) || left == 0 || exp->exp_failed)
/* A request above OBD_MAX_GRANT is reported with CERROR. */
926 if (want > OBD_MAX_GRANT) {
927 CERROR("%s: client %s/%p requesting > max (%lu), %llu\n",
928 obd->obd_name, exp->exp_client_uuid.uuid, exp,
929 OBD_MAX_GRANT, want);
933 /* Grant some fraction of the client's requested grant space so that
934 * they are not always waiting for write credits (not all of it to
935 * avoid overgranting in face of multiple RPCs in flight). This
936 * essentially will be able to control the OSC_MAX_RIF for a client.
938 * If we do have a large disparity between what the client thinks it
939 * has and what we think it has, don't grant very much and let the
940 * client consume its grant first. Either it just has lots of RPCs
941 * in flight, or it was evicted and its grants will soon be used up. */
942 if (curgrant >= want || curgrant >= ted->ted_grant + chunk)
/* During recovery the conservative limit is switched off. */
945 if (obd->obd_recovering)
946 conservative = false;
949 /* don't grant more than 1/8th of the remaining free space in
952 grant = min(want - curgrant, left);
953 /* round grant up to the next block size */
/* NOTE(review): the addend is built with an int shift (1 << ...) while the
 * mask uses 1ULL. This gives the same result for shift counts below 31 but
 * is inconsistent -- consider 1ULL in both places. */
954 grant = (grant + (1 << tgd->tgd_blockbits) - 1) &
955 ~((1ULL << tgd->tgd_blockbits) - 1);
960 /* Limit to grant_chunk if not reconnect/recovery */
961 if ((grant > chunk) && conservative)
965 * Limit grant so that export' grant does not exceed what the
966 * client would like to have by more than grants for 2 full
969 if (want + chunk <= ted->ted_grant)
971 if (ted->ted_grant + grant > want + chunk)
972 grant = want + chunk - ted->ted_grant;
/* The new grant is added to both the global and the per-export counter. */
974 tgd->tgd_tot_granted += grant;
975 ted->ted_grant += grant;
/* Post-condition check: the grant of the export must be non-negative and
 * must not exceed want + chunk. */
977 if (unlikely(ted->ted_grant < 0 || ted->ted_grant > want + chunk)) {
978 CERROR("%s: cli %s/%p grant %ld want %llu current %llu\n",
979 obd->obd_name, exp->exp_client_uuid.uuid, exp,
980 ted->ted_grant, want, curgrant);
981 if (lbug_on_grant_miscount) {
982 spin_unlock(&tgd->tgd_grant_lock);
988 "%s: cli %s/%p wants: %llu current grant %llu"
989 " granting: %llu\n", obd->obd_name, exp->exp_client_uuid.uuid,
990 exp, want, curgrant, grant);
992 "%s: cli %s/%p tot cached:%llu granted:%llu"
993 " num_exports: %d\n", obd->obd_name, exp->exp_client_uuid.uuid,
994 exp, tgd->tgd_tot_dirty, tgd->tgd_tot_granted,
995 obd->obd_num_exports);
1001 * Handle grant space allocation on client connection & reconnection.
1003 * A new non-readonly connection gets an initial grant allocation equal to
1004 * tgt_grant_chunk() (i.e. twice the max BRW size in most of the cases).
1005 * On reconnection, grant counters between client & target are resynchronized
1006 * and additional space might be granted back if possible.
1008 * \param[in] env LU environment provided by the caller
1009 * \param[in] exp client's export which is (re)connecting
1010 * \param[in,out] data obd_connect_data structure sent by the client in the
1012 * \param[in] new_conn must be set to true if this is a new connection and false
1013 * for a reconnection
/*
 * tgt_grant_connect(): resynchronise grant state with a (re)connecting
 * client and report the export's grant back through \a data.
 *
 * Read-only clients, and clients without GRANT_PARAM while
 * tgd_grant_compat_disable is set, get ocd_grant = 0 and have
 * OBD_CONNECT_GRANT and OBD_CONNECT_GRANT_PARAM cleared from
 * ocd_connect_flags.
 *
 * In the remaining visible code the requested amount is taken from ocd_grant
 * (passed through tgt_grant_inflate() when GRANT_PARAM is absent), the free
 * space is computed under tgd_grant_lock, tgt_grant_alloc() is called, and
 * the export's ted_grant is written back to ocd_grant (passed through
 * tgt_grant_deflate() when GRANT_PARAM is absent).  ted_dirty is subtracted
 * from tgd_tot_dirty, and tgd_tot_granted_clients is incremented for a new
 * connection carrying the GRANT flag.
 *
 * NOTE(review): what happens after the "statfs too old" message (presumably
 * a forced statfs refresh and retry) is not visible in this listing.
 */
1015 void tgt_grant_connect(const struct lu_env *env, struct obd_export *exp,
1016 struct obd_connect_data *data, bool new_conn)
1018 struct lu_target *lut = exp->exp_obd->u.obt.obt_lut;
1019 struct tg_grants_data *tgd = &lut->lut_tgd;
1020 struct tg_export_data *ted = &exp->exp_target_data;
1025 int force = 0; /* can use cached data */
1027 /* don't grant space to client with read-only access */
1028 if (OCD_HAS_FLAG(data, RDONLY) ||
1029 (!OCD_HAS_FLAG(data, GRANT_PARAM) &&
1030 tgd->tgd_grant_compat_disable)) {
1031 data->ocd_grant = 0;
1032 data->ocd_connect_flags &= ~(OBD_CONNECT_GRANT |
1033 OBD_CONNECT_GRANT_PARAM);
1037 if (OCD_HAS_FLAG(data, GRANT_PARAM))
1038 want = data->ocd_grant;
1040 want = tgt_grant_inflate(tgd, data->ocd_grant);
1041 chunk = tgt_grant_chunk(exp, lut, data);
1043 tgt_grant_statfs(env, exp, force, &from_cache);
1045 spin_lock(&tgd->tgd_grant_lock);
1047 /* Grab free space from cached info and take out space already granted
1048 * to clients as well as reserved space */
1049 left = tgt_grant_space_left(exp);
1051 /* get fresh statfs data if we are short in ungranted space */
1052 if (from_cache && left < 32 * chunk) {
1053 spin_unlock(&tgd->tgd_grant_lock);
1054 CDEBUG(D_CACHE, "fs has no space left and statfs too old\n");
/* The value returned by tgt_grant_alloc() is not used here; the export's
 * ted_grant is read back below instead. */
1059 tgt_grant_alloc(exp, (u64)ted->ted_grant, want, left, chunk, new_conn);
1061 /* return to client its current grant */
1062 if (OCD_HAS_FLAG(data, GRANT_PARAM))
1063 data->ocd_grant = ted->ted_grant;
1066 data->ocd_grant = tgt_grant_deflate(tgd, (u64)ted->ted_grant);
1068 /* reset dirty accounting */
1069 tgd->tgd_tot_dirty -= ted->ted_dirty;
1072 if (new_conn && OCD_HAS_FLAG(data, GRANT))
1073 tgd->tgd_tot_granted_clients++;
1075 spin_unlock(&tgd->tgd_grant_lock);
1077 CDEBUG(D_CACHE, "%s: cli %s/%p ocd_grant: %d want: %llu left: %llu\n",
1078 exp->exp_obd->obd_name, exp->exp_client_uuid.uuid,
1079 exp, data->ocd_grant, want, left);
1083 EXPORT_SYMBOL(tgt_grant_connect);
1086 * Release all grant space attached to a given export.
1088 * Remove a client from the grant accounting totals. We also remove
1089 * the export from the obd device under the osfs and dev locks to ensure
1090 * that the tgt_grant_sanity_check() calculations are always valid.
1091 * The client should do something similar when it invalidates its import.
1093 * \param[in] exp client's export to remove from grant accounting
/*
 * tgt_grant_discard(): take the export's ted_grant and ted_dirty out of the
 * target-wide totals, under tgd_grant_lock.
 *
 * If tgd_tot_granted or tgd_tot_dirty is smaller than the export's own
 * counter, the totals are rebuilt by walking obd->obd_exports (ttg sums
 * ted_grant + ted_pending, ttd sums ted_dirty), each mismatch is reported
 * with CERROR, and the rebuilt values are stored into the totals.  The
 * visible code then subtracts ted_grant and ted_dirty from tgd_tot_granted
 * and tgd_tot_dirty.  A tgd_tot_pending smaller than ted_pending is only
 * reported; tgd_tot_pending is not modified in the visible code.
 *
 * NOTE(review): the two "corrected to" CERROR format strings have no
 * trailing newline, unlike the tot_pending message below -- confirm whether
 * that is intended.
 * NOTE(review): several lines are missing from this listing (e.g. between
 * the declarations and the tgd assignment, and after the subtraction), so
 * any early exit or resetting of the export counters cannot be confirmed
 * from here.
 */
1095 void tgt_grant_discard(struct obd_export *exp)
1097 struct obd_device *obd = exp->exp_obd;
1098 struct lu_target *lut = class_exp2tgt(exp);
1099 struct tg_export_data *ted = &exp->exp_target_data;
1100 struct tg_grants_data *tgd;
1105 tgd = &lut->lut_tgd;
1106 spin_lock(&tgd->tgd_grant_lock);
1107 if (unlikely(tgd->tgd_tot_granted < ted->ted_grant ||
1108 tgd->tgd_tot_dirty < ted->ted_dirty)) {
1109 struct obd_export *e;
1113 list_for_each_entry(e, &obd->obd_exports, exp_obd_chain) {
1115 ttg += e->exp_target_data.ted_grant;
1116 ttg += e->exp_target_data.ted_pending;
1117 ttd += e->exp_target_data.ted_dirty;
1119 if (tgd->tgd_tot_granted < ted->ted_grant)
1120 CERROR("%s: cli %s/%p: tot_granted %llu < ted_grant %ld, corrected to %llu",
1121 obd->obd_name, exp->exp_client_uuid.uuid, exp,
1122 tgd->tgd_tot_granted, ted->ted_grant, ttg);
1123 if (tgd->tgd_tot_dirty < ted->ted_dirty)
1124 CERROR("%s: cli %s/%p: tot_dirty %llu < ted_dirty %ld, corrected to %llu",
1125 obd->obd_name, exp->exp_client_uuid.uuid, exp,
1126 tgd->tgd_tot_dirty, ted->ted_dirty, ttd);
1127 tgd->tgd_tot_granted = ttg;
1128 tgd->tgd_tot_dirty = ttd;
1130 tgd->tgd_tot_granted -= ted->ted_grant;
1131 tgd->tgd_tot_dirty -= ted->ted_dirty;
1136 if (tgd->tgd_tot_pending < ted->ted_pending) {
1137 CERROR("%s: tot_pending %llu < cli %s/%p ted_pending %ld\n",
1138 obd->obd_name, tgd->tgd_tot_pending,
1139 exp->exp_client_uuid.uuid, exp, ted->ted_pending);
1141 /* tgd_tot_pending is handled in tgt_grant_commit as bulk
1143 spin_unlock(&tgd->tgd_grant_lock);
1145 EXPORT_SYMBOL(tgt_grant_discard);
1148 * Process grant information from incoming bulk read request.
1150 * Extract grant information packed in obdo structure (OBD_MD_FLGRANT set in
1151 * o_valid). Bulk reads usually come with grant announcements (number of dirty
1152 * blocks, remaining amount of grant space, ...) and could also include a grant
1153 * shrink request. Unlike bulk write, no additional grant space is returned on
1154 * bulk read request.
1156 * \param[in] env is the lu environment provided by the caller
1157 * \param[in] exp is the export of the client which sent the request
1158 * \param[in,out] oa is the incoming obdo sent by the client
/*
 * tgt_grant_prepare_read(): process the grant fields carried by a bulk read
 * request.
 *
 * o_valid is tested for OBD_MD_FLGRANT first.  When the obdo carries
 * OBD_FL_SHRINK_GRANT (with OBD_MD_FLFLAGS valid), statfs data is refreshed
 * with force = 1 and the remaining space is computed under tgd_grant_lock;
 * otherwise only the lock is taken.  tgt_grant_incoming() then processes the
 * client-supplied values using tgt_grant_chunk(exp, lut, NULL),
 * tgt_grant_shrink() is called with the computed free space, and o_grant is
 * passed through tgt_grant_deflate() for exports without grant-param support
 * before the lock is released.
 *
 * NOTE(review): the action taken when OBD_MD_FLGRANT is clear and the
 * statement guarding the tgt_grant_shrink() call are not visible in this
 * listing.
 */
1160 void tgt_grant_prepare_read(const struct lu_env *env,
1161 struct obd_export *exp, struct obdo *oa)
1163 struct lu_target *lut = exp->exp_obd->u.obt.obt_lut;
1164 struct tg_grants_data *tgd = &lut->lut_tgd;
1173 if ((oa->o_valid & OBD_MD_FLGRANT) == 0)
1174 /* The read request does not contain any grant
1178 if ((oa->o_valid & OBD_MD_FLFLAGS) &&
1179 (oa->o_flags & OBD_FL_SHRINK_GRANT)) {
1180 /* To process grant shrink request, we need to know how much
1181 * available space remains on the backend filesystem.
1182 * Shrink requests are not so common, we always get fresh
1183 * statfs information. */
1184 tgt_grant_statfs(env, exp, 1, NULL);
1186 /* protect all grant counters */
1187 spin_lock(&tgd->tgd_grant_lock);
1189 /* Grab free space from cached statfs data and take out space
1190 * already granted to clients as well as reserved space */
1191 left = tgt_grant_space_left(exp);
1193 /* all set now to proceed with shrinking */
1196 /* no grant shrinking request packed in the obdo and
1197 * since we don't grant space back on reads, no point
1198 * in running statfs, so just skip it and process
1199 * incoming grant data directly. */
1200 spin_lock(&tgd->tgd_grant_lock);
1204 /* extract incoming grant information provided by the client and
1205 * inflate grant counters if required */
1206 tgt_grant_incoming(env, exp, oa, tgt_grant_chunk(exp, lut, NULL));
1208 /* unlike writes, we don't return grants back on reads unless a grant
1209 * shrink request was packed and we decided to turn it down. */
1211 tgt_grant_shrink(exp, oa, left);
1215 if (!exp_grant_param_supp(exp))
1216 oa->o_grant = tgt_grant_deflate(tgd, oa->o_grant);
1217 spin_unlock(&tgd->tgd_grant_lock);
1220 EXPORT_SYMBOL(tgt_grant_prepare_read);
1223 * Process grant information from incoming bulk write request.
1225 * This function extracts client's grant announcements from incoming bulk write
1226 * request and attempts to allocate grant space for network buffers that need it
1227 * (i.e. OBD_BRW_FROM_GRANT not set in rnb_flags).
1228 * Network buffers which aren't granted the OBD_BRW_GRANTED flag should not
1229 * proceed further and should fail with -ENOSPC.
1230 * Whenever possible, additional grant space will be returned to the client
1231 * in the bulk write reply.
1232 * tgt_grant_prepare_write() must be called before writing any buffers to
1233 * the backend storage. This function works in pair with tgt_grant_commit()
1234 * which must be invoked once all buffers have been written to disk in order
1235 * to release space from the pending grant counter.
1237 * \param[in] env LU environment provided by the caller
1238 * \param[in] exp export of the client which sent the request
1239 * \param[in] oa incoming obdo sent by the client
1240 * \param[in] rnb list of network buffers
1241 * \param[in] niocount number of network buffers in the list
/*
 * tgt_grant_prepare_write(): process the grant fields of a bulk write request
 * and hand additional grant back to the client when possible.
 *
 * Visible sequence:
 *  - statfs data is obtained (cached data allowed at first) and the
 *    remaining space is computed under tgd_grant_lock; when the data came
 *    from cache and left < 32 * chunk, the lock is dropped and a debug
 *    message is emitted;
 *  - outside recovery, with force != 2 and left < chunk, the buffers are
 *    scanned for one without OBD_BRW_FROM_GRANT, the lock is dropped and
 *    dt_sync() is called on lut_bottom with its result ignored;
 *  - tgt_grant_incoming() and tgt_grant_check() process the request;
 *  - if OBD_MD_FLGRANT is not set in o_valid, the lock is released;
 *  - tgt_grant_shrink() is called when OBD_FL_SHRINK_GRANT is set;
 *  - o_grant is replaced with the tgt_grant_alloc() result, passed through
 *    tgt_grant_deflate() for exports without grant-param support, and the
 *    lock is released.
 *
 * NOTE(review): the retry logic after each early unlock (how force is
 * advanced and where execution resumes) is not visible in this listing.
 */
1243 void tgt_grant_prepare_write(const struct lu_env *env,
1244 struct obd_export *exp, struct obdo *oa,
1245 struct niobuf_remote *rnb, int niocount)
1247 struct obd_device *obd = exp->exp_obd;
1248 struct lu_target *lut = obd->u.obt.obt_lut;
1249 struct tg_grants_data *tgd = &lut->lut_tgd;
1252 int force = 0; /* can use cached data initially */
1253 long chunk = tgt_grant_chunk(exp, lut, NULL);
1258 /* get statfs information from OSD layer */
1259 tgt_grant_statfs(env, exp, force, &from_cache);
1261 spin_lock(&tgd->tgd_grant_lock); /* protect all grant counters */
1263 /* Grab free space from cached statfs data and take out space already
1264 * granted to clients as well as reserved space */
1265 left = tgt_grant_space_left(exp);
1267 /* Get fresh statfs data if we are short in ungranted space */
1268 if (from_cache && left < 32 * chunk) {
1269 spin_unlock(&tgd->tgd_grant_lock);
1270 CDEBUG(D_CACHE, "%s: fs has no space left and statfs too old\n",
1276 /* When close to free space exhaustion, trigger a sync to force
1277 * writeback cache to consume required space immediately and release as
1278 * much space as possible. */
1279 if (!obd->obd_recovering && force != 2 && left < chunk) {
1280 bool from_grant = true;
1283 /* That said, it is worth running a sync only if some pages did
1284 * not consume grant space on the client and could thus fail
1285 * with ENOSPC later in tgt_grant_check() */
1286 for (i = 0; i < niocount; i++)
1287 if (!(rnb[i].rnb_flags & OBD_BRW_FROM_GRANT))
1291 /* at least one network buffer requires acquiring grant
1292 * space on the server */
1293 spin_unlock(&tgd->tgd_grant_lock);
1294 /* discard errors, at least we tried ... */
1295 dt_sync(env, lut->lut_bottom);
1301 /* extract incoming grant information provided by the client,
1302 * and inflate grant counters if required */
1303 tgt_grant_incoming(env, exp, oa, chunk);
1306 tgt_grant_check(env, exp, oa, rnb, niocount, &left);
1308 if (!(oa->o_valid & OBD_MD_FLGRANT)) {
1309 spin_unlock(&tgd->tgd_grant_lock);
1313 /* if OBD_FL_SHRINK_GRANT is set, the client is willing to release some
1315 if ((oa->o_valid & OBD_MD_FLFLAGS) &&
1316 (oa->o_flags & OBD_FL_SHRINK_GRANT))
1317 tgt_grant_shrink(exp, oa, left);
1319 /* grant more space back to the client if possible */
1320 oa->o_grant = tgt_grant_alloc(exp, oa->o_grant, oa->o_undirty,
1323 if (!exp_grant_param_supp(exp))
1324 oa->o_grant = tgt_grant_deflate(tgd, oa->o_grant);
1325 spin_unlock(&tgd->tgd_grant_lock);
1328 EXPORT_SYMBOL(tgt_grant_prepare_write);
1331 * Consume grant space reserved for object creation.
1333 * Grant space is allocated to the local self export for object precreation.
1334 * This is required to prevent object precreation from consuming grant space
1335 * allocated to client nodes for the data writeback cache.
1336 * This function consumes enough space to create \a nr objects and allocates
1337 * more grant space to the self export for future precreation requests, if
1340 * \param[in] env LU environment provided by the caller
1341 * \param[in] exp export holding the grant space for precreation (= self
1343 * \param[in] nr number of objects to be created
1345 * \retval >= 0 amount of grant space allocated to the precreate request
1346 * \retval -ENOSPC on failure
/*
 * tgt_grant_create(): reserve grant space for precreating \a *nr objects and
 * top up the export's grant for later precreate requests.
 *
 * Visible sequence:
 *  - the obd_recovering / ddp_inodespace == 0 test comes first (its action is
 *    not visible in this listing);
 *  - statfs data is refreshed with force = 1 and tgd_grant_lock is taken;
 *  - when os_bavail minus the export's grant (in blocks) is below
 *    os_blocks >> 10, the lock is dropped and a debug message is emitted;
 *  - wanted = *nr * ddp_inodespace; if that exceeds ted_grant + left, it is
 *    clamped to ted_grant + left, \a *nr is recomputed from it, and wanted is
 *    recomputed from the new \a *nr;
 *  - the space is taken from ted_grant when sufficient, otherwise the
 *    shortfall is added to tgd_tot_granted and removed from left;
 *  - ted_pending and tgd_tot_pending grow by granted;
 *  - if ted_grant is below OST_MAX_PRECREATE * ddp_inodespace / 2,
 *    tgt_grant_alloc() is asked for the difference.
 *
 * NOTE(review): the "not enough space" debug message logs
 * os_bavail * os_blocks -- confirm that this product is the value intended
 * for display.
 * NOTE(review): the assignment of granted, the return statements and the
 * else-branch bookkeeping for ted_grant are not visible in this listing.
 */
1348 long tgt_grant_create(const struct lu_env *env, struct obd_export *exp, s64 *nr)
1350 struct lu_target *lut = exp->exp_obd->u.obt.obt_lut;
1351 struct tg_grants_data *tgd = &lut->lut_tgd;
1352 struct tg_export_data *ted = &exp->exp_target_data;
1354 unsigned long wanted;
1355 unsigned long granted;
1358 if (exp->exp_obd->obd_recovering ||
1359 lut->lut_dt_conf.ddp_inodespace == 0)
1360 /* don't enforce grant during recovery */
1363 /* Update statfs data if required */
1364 tgt_grant_statfs(env, exp, 1, NULL);
1366 /* protect all grant counters */
1367 spin_lock(&tgd->tgd_grant_lock);
1369 /* fail precreate request if there are not enough blocks available for
1371 if (tgd->tgd_osfs.os_bavail - (ted->ted_grant >> tgd->tgd_blockbits) <
1372 (tgd->tgd_osfs.os_blocks >> 10)) {
1373 spin_unlock(&tgd->tgd_grant_lock);
1374 CDEBUG(D_RPCTRACE, "%s: not enough space for create %llu\n",
1375 exp->exp_obd->obd_name,
1376 tgd->tgd_osfs.os_bavail * tgd->tgd_osfs.os_blocks);
1380 /* Grab free space from cached statfs data and take out space
1381 * already granted to clients as well as reserved space */
1382 left = tgt_grant_space_left(exp);
1384 /* compute how much space is required to handle the precreation
1386 wanted = *nr * lut->lut_dt_conf.ddp_inodespace;
1387 if (wanted > ted->ted_grant + left) {
1388 /* that's beyond what remains, adjust the number of objects that
1389 * can be safely precreated */
1390 wanted = ted->ted_grant + left;
1391 *nr = wanted / lut->lut_dt_conf.ddp_inodespace;
1393 /* we really have no space any more for precreation,
1394 * fail the precreate request with ENOSPC */
1395 spin_unlock(&tgd->tgd_grant_lock);
1398 /* compute space needed for the new number of creations */
1399 wanted = *nr * lut->lut_dt_conf.ddp_inodespace;
1401 LASSERT(wanted <= ted->ted_grant + left);
1403 if (wanted <= ted->ted_grant) {
1404 /* we've enough grant space to handle this precreate request */
1405 ted->ted_grant -= wanted;
1407 /* we need to take some space from the ungranted pool */
1408 tgd->tgd_tot_granted += wanted - ted->ted_grant;
1409 left -= wanted - ted->ted_grant;
1413 ted->ted_pending += granted;
1414 tgd->tgd_tot_pending += granted;
1416 /* grant more space for precreate purpose if possible. */
1417 wanted = OST_MAX_PRECREATE * lut->lut_dt_conf.ddp_inodespace / 2;
1418 if (wanted > ted->ted_grant) {
1421 /* always try to book enough space to handle a large precreate
1423 chunk = tgt_grant_chunk(exp, lut, NULL);
1424 wanted -= ted->ted_grant;
1425 tgt_grant_alloc(exp, ted->ted_grant, wanted, left, chunk,
1428 spin_unlock(&tgd->tgd_grant_lock);
1431 EXPORT_SYMBOL(tgt_grant_create);
1434 * Release grant space added to the pending counter by tgt_grant_prepare_write()
1436 * Update pending grant counter once buffers have been written to the disk.
1438 * \param[in] exp export of the client which sent the request
1439 * \param[in] pending amount of reserved space to be released
1440 * \param[in] rc return code of pre-commit operations
/*
 * tgt_grant_commit(): drop \a pending bytes from the grant accounting once
 * the corresponding I/O has been handled.
 *
 * Under tgd_grant_lock (and tgd_osfs_lock for the statfs part) the visible
 * code:
 *  - lowers the cached tgd_osfs.os_bavail by pending >> tgd_blockbits,
 *    clamped with min_t() so the subtraction cannot go below zero, and adds
 *    \a pending to tgd_osfs_inflight when tgd_statfs_inflight is set;
 *  - subtracts \a pending from the export's ted_pending and from
 *    tgd_tot_granted and tgd_tot_pending; before each subtraction the
 *    counter is compared with \a pending and, when it is smaller, the
 *    condition is reported with CERROR and tgd_grant_lock is released.
 *
 * NOTE(review): the condition selecting the statfs update (see the comment
 * about errors raised before commit) and the statement following each
 * CERROR/unlock pair are not visible in this listing.
 */
1442 void tgt_grant_commit(struct obd_export *exp, unsigned long pending,
1445 struct tg_grants_data *tgd = &exp->exp_obd->u.obt.obt_lut->lut_tgd;
1449 /* get space accounted in tot_pending for the I/O, set in
1450 * tgt_grant_check() */
1454 spin_lock(&tgd->tgd_grant_lock);
1455 /* Don't update statfs data for errors raised before commit (e.g.
1456 * bulk transfer failed, ...) since we know those writes have not been
1457 * processed. For other errors hit during commit, we cannot really tell
1458 * whether or not something was written, so we update statfs data.
1459 * In any case, this should not be fatal since we always get fresh
1460 * statfs data before failing a request with ENOSPC */
1462 spin_lock(&tgd->tgd_osfs_lock);
1463 /* Take pending out of cached statfs data */
1464 tgd->tgd_osfs.os_bavail -= min_t(u64,
1465 tgd->tgd_osfs.os_bavail,
1466 pending >> tgd->tgd_blockbits);
1467 if (tgd->tgd_statfs_inflight)
1468 /* someone is running statfs and want to be notified of
1469 * writes happening meanwhile */
1470 tgd->tgd_osfs_inflight += pending;
1471 spin_unlock(&tgd->tgd_osfs_lock);
1474 if (exp->exp_target_data.ted_pending < pending) {
1475 CERROR("%s: cli %s/%p ted_pending(%lu) < grant_used(%lu)\n",
1476 exp->exp_obd->obd_name, exp->exp_client_uuid.uuid, exp,
1477 exp->exp_target_data.ted_pending, pending);
1478 spin_unlock(&tgd->tgd_grant_lock);
1481 exp->exp_target_data.ted_pending -= pending;
1483 if (tgd->tgd_tot_granted < pending) {
1484 CERROR("%s: cli %s/%p tot_granted(%llu) < grant_used(%lu)\n",
1485 exp->exp_obd->obd_name, exp->exp_client_uuid.uuid, exp,
1486 tgd->tgd_tot_granted, pending);
1487 spin_unlock(&tgd->tgd_grant_lock);
1490 tgd->tgd_tot_granted -= pending;
1492 if (tgd->tgd_tot_pending < pending) {
1493 CERROR("%s: cli %s/%p tot_pending(%llu) < grant_used(%lu)\n",
1494 exp->exp_obd->obd_name, exp->exp_client_uuid.uuid, exp,
1495 tgd->tgd_tot_pending, pending);
1496 spin_unlock(&tgd->tgd_grant_lock);
1499 tgd->tgd_tot_pending -= pending;
1500 spin_unlock(&tgd->tgd_grant_lock);
1503 EXPORT_SYMBOL(tgt_grant_commit);
/*
 * Context handed to tgt_grant_commit_cb(): the callback recovers this
 * structure from its tgc_cb member with container_of() and passes tgc_exp
 * and tgc_granted on to tgt_grant_commit().
 */
1505 struct tgt_grant_cb {
1506 /* commit callback structure */
1507 struct dt_txn_commit_cb tgc_cb;
1508 /* export associated with the bulk write */
1509 struct obd_export *tgc_exp;
1510 /* pending grant to be released */
1511 unsigned long tgc_granted;
1515 * Callback function for grant releasing
1517 * Release grant space reserved by the client node.
1519 * \param[in] env execution environment
1520 * \param[in] th transaction handle
1521 * \param[in] cb callback data
1522 * \param[in] err error code
/*
 * Transaction commit callback: recovers the enclosing struct tgt_grant_cb
 * from \a cb, calls tgt_grant_commit() with the stored export, the stored
 * amount and \a err, then drops the export reference with
 * class_export_cb_put().  \a env and \a th are not used in the visible code.
 *
 * NOTE(review): how the tgt_grant_cb itself is released is not visible in
 * this listing.
 */
1524 static void tgt_grant_commit_cb(struct lu_env *env, struct thandle *th,
1525 struct dt_txn_commit_cb *cb, int err)
1527 struct tgt_grant_cb *tgc;
1529 tgc = container_of(cb, struct tgt_grant_cb, tgc_cb);
1531 tgt_grant_commit(tgc->tgc_exp, tgc->tgc_granted, err);
1532 class_export_cb_put(tgc->tgc_exp);
1537 * Add callback for grant releasing
1539 * Register a commit callback to release grant space.
1541 * \param[in] th transaction handle
1542 * \param[in] exp OBD export of client
1543 * \param[in] granted amount of grant space to be released upon commit
1545 * \retval 0 on successful callback adding
1546 * \retval negative value on error
/*
 * tgt_grant_commit_cb_add(): register tgt_grant_commit_cb() on transaction
 * \a th.
 *
 * The callback context takes an export reference through
 * class_export_cb_get() and records \a granted; the dt_txn_commit_cb (dcb)
 * gets its function, list head and name set before being passed to
 * dt_trans_cb_add().  class_export_cb_put() is the visible cleanup for the
 * reference (presumably on dt_trans_cb_add() failure -- the guarding test is
 * not visible in this listing).
 *
 * NOTE(review): the allocation of tgc, the assignment of dcb and the return
 * statement are not visible in this listing.
 * NOTE(review): strlcpy() is listed as deprecated in the Linux kernel in
 * favour of strscpy(); consider switching if the supported kernels allow.
 */
1548 int tgt_grant_commit_cb_add(struct thandle *th, struct obd_export *exp,
1549 unsigned long granted)
1551 struct tgt_grant_cb *tgc;
1552 struct dt_txn_commit_cb *dcb;
1560 tgc->tgc_exp = class_export_cb_get(exp);
1561 tgc->tgc_granted = granted;
1564 dcb->dcb_func = tgt_grant_commit_cb;
1565 INIT_LIST_HEAD(&dcb->dcb_linkage);
1566 strlcpy(dcb->dcb_name, "tgt_grant_commit_cb", sizeof(dcb->dcb_name));
1568 rc = dt_trans_cb_add(th, dcb);
1570 class_export_cb_put(tgc->tgc_exp);
1576 EXPORT_SYMBOL(tgt_grant_commit_cb_add);
1579 * Show estimate of total amount of dirty data on clients.
1581 * @kobj kobject embedded in obd_device
1583 * @buf buf used by sysfs to print out data
1585 * Return: string length of @buf output on success
1586 * negative value on error
/*
 * sysfs show handler: resolves the obd_device containing @kobj, formats its
 * tgd_tot_dirty counter as "%llu" plus a newline into @buf (bounded by
 * PAGE_SIZE) and returns the scnprintf() result.  No lock is taken in the
 * visible code.
 */
1588 ssize_t tot_dirty_show(struct kobject *kobj, struct attribute *attr,
1591 struct obd_device *obd = container_of(kobj, struct obd_device,
1593 struct tg_grants_data *tgd;
1595 tgd = &obd->u.obt.obt_lut->lut_tgd;
1596 return scnprintf(buf, PAGE_SIZE, "%llu\n", tgd->tgd_tot_dirty);
1598 EXPORT_SYMBOL(tot_dirty_show);
1601 * Show total amount of space granted to clients.
1603 * @kobj kobject embedded in obd_device
1605 * @buf buf used by sysfs to print out data
1607 * Return: string length of @buf output on success
1608 * negative value on error
/*
 * sysfs show handler: resolves the obd_device containing @kobj, formats its
 * tgd_tot_granted counter as "%llu" plus a newline into @buf (bounded by
 * PAGE_SIZE) and returns the scnprintf() result.  No lock is taken in the
 * visible code.
 */
1610 ssize_t tot_granted_show(struct kobject *kobj, struct attribute *attr,
1613 struct obd_device *obd = container_of(kobj, struct obd_device,
1615 struct tg_grants_data *tgd;
1617 tgd = &obd->u.obt.obt_lut->lut_tgd;
1618 return scnprintf(buf, PAGE_SIZE, "%llu\n", tgd->tgd_tot_granted);
1620 EXPORT_SYMBOL(tot_granted_show);
1623 * Show total amount of space used by IO in progress.
1625 * @kobj kobject embedded in obd_device
1627 * @buf buf used by sysfs to print out data
1629 * Return: string length of @buf output on success
1630 * negative value on error
/*
 * sysfs show handler: resolves the obd_device containing @kobj, formats its
 * tgd_tot_pending counter as "%llu" plus a newline into @buf (bounded by
 * PAGE_SIZE) and returns the scnprintf() result.  No lock is taken in the
 * visible code.
 */
1632 ssize_t tot_pending_show(struct kobject *kobj, struct attribute *attr,
1635 struct obd_device *obd = container_of(kobj, struct obd_device,
1637 struct tg_grants_data *tgd;
1639 tgd = &obd->u.obt.obt_lut->lut_tgd;
1640 return scnprintf(buf, PAGE_SIZE, "%llu\n", tgd->tgd_tot_pending);
1642 EXPORT_SYMBOL(tot_pending_show);
1645 * Show if grants compatibility mode is disabled.
1647 * When tgd_grant_compat_disable is set, we don't grant any space to clients
1648 * not supporting OBD_CONNECT_GRANT_PARAM. Otherwise, space granted to such
1649 * a client is inflated since it consumes PAGE_SIZE of grant space per
1650 * block, (i.e. typically 4kB units), but underlying file system might have
1651 * block size bigger than page size, e.g. ZFS. See LU-2049 for details.
1653 * @kobj kobject embedded in obd_device
1655 * @buf buf used by sysfs to print out data
1657 * Return: string length of @buf output on success
/*
 * sysfs show handler: resolves the obd_device containing @kobj and prints
 * tgd_grant_compat_disable with "%u" plus a newline into @buf (bounded by
 * PAGE_SIZE), returning the scnprintf() result.
 */
1659 ssize_t grant_compat_disable_show(struct kobject *kobj, struct attribute *attr,
1662 struct obd_device *obd = container_of(kobj, struct obd_device,
1664 struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd;
1666 return scnprintf(buf, PAGE_SIZE, "%u\n", tgd->tgd_grant_compat_disable);
1668 EXPORT_SYMBOL(grant_compat_disable_show);
1671 * Change grant compatibility mode.
1673 * Setting tgd_grant_compat_disable prohibits any space granting to clients
1674 * not supporting OBD_CONNECT_GRANT_PARAM. See details above.
1676 * @kobj kobject embedded in obd_device
1678 * @buffer string which represents mode
1679 * 1: disable compatibility mode
1680 * 0: enable compatibility mode
1681 * @count @buffer length
1683 * Return: @count on success
1684 * negative number on error
/*
 * sysfs store handler: parses @buffer with kstrtobool() and stores the
 * resulting value in tgd_grant_compat_disable of the obd_device containing
 * @kobj.
 *
 * NOTE(review): the check of rc after kstrtobool() and the return statement
 * are not visible in this listing; per the header they presumably return the
 * error code or @count -- confirm against the full source.
 */
1686 ssize_t grant_compat_disable_store(struct kobject *kobj,
1687 struct attribute *attr,
1688 const char *buffer, size_t count)
1690 struct obd_device *obd = container_of(kobj, struct obd_device,
1692 struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd;
1696 rc = kstrtobool(buffer, &val);
1700 tgd->tgd_grant_compat_disable = val;
1704 EXPORT_SYMBOL(grant_compat_disable_store);