/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
 * vim:expandtab:shiftwidth=8:tabstop=8:
 *
 * Copyright (C) 2006 Cluster File Systems, Inc.
 * Author: Nikita Danilov <nikita@clusterfs.com>
 *
 * This file is part of the Lustre file system, http://www.lustre.org
 * Lustre is a trademark of Cluster File Systems, Inc.
 *
 * You may have signed or agreed to another license before downloading
 * this software. If so, you are bound by the terms and conditions
 * of that agreement, and the following does not apply to you. See the
 * LICENSE file included with this distribution for more information.
 *
 * If you did not agree to a different license, then this copy of Lustre
 * is open source software; you can redistribute it and/or modify it
 * under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * In either case, Lustre is distributed in the hope that it will be
 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * license text for more details.
 *
 * These are the only exported functions; they provide the generic
 * infrastructure for managing object devices.
 */
#define DEBUG_SUBSYSTEM S_CLASS
#ifndef EXPORT_SYMTAB
# define EXPORT_SYMTAB
#endif

#include <linux/seq_file.h>
#include <linux/module.h>
#include <obd_support.h>
#include <lustre_disk.h>
#include <lustre_fid.h>
#include <lu_object.h>
#include <libcfs/list.h>

static void lu_object_free(const struct lu_env *env, struct lu_object *o);

/*
 * Decrease the reference counter on an object. When the last reference is
 * released, return the object to the cache, unless lu_object_is_dying(o)
 * holds. In the latter case, free the object immediately.
 */
void lu_object_put(const struct lu_env *env, struct lu_object *o)
{
        struct lu_object_header *top;
        struct lu_site          *site;
        struct lu_object        *orig;
        int                      kill_it;

        top = o->lo_header;
        site = o->lo_dev->ld_site;
        orig = o;
        kill_it = 0;
        spin_lock(&site->ls_guard);
        if (-- top->loh_ref == 0) {
                /*
                 * When last reference is released, iterate over object
                 * layers, and notify them that object is no longer busy.
                 */
                list_for_each_entry_reverse(o, &top->loh_layers, lo_linkage) {
                        if (o->lo_ops->loo_object_release != NULL)
                                o->lo_ops->loo_object_release(env, o);
                }
                -- site->ls_busy;
                if (lu_object_is_dying(top)) {
                        /*
                         * If object is dying (will not be cached), remove it
                         * from the hash table and LRU.
                         *
                         * This is done with the hash table and LRU lists
                         * locked. As the only way to acquire the first
                         * reference to a previously unreferenced object is
                         * through hash-table lookup (lu_object_find()) or
                         * LRU scanning (lu_site_purge()), both done under
                         * the hash-table and LRU lock, no race with a
                         * concurrent object lookup is possible and we can
                         * safely destroy the object below.
                         */
                        hlist_del_init(&top->loh_hash);
                        list_del_init(&top->loh_lru);
                        kill_it = 1;
                }
        }
        spin_unlock(&site->ls_guard);
        if (kill_it)
                /*
                 * Object was already removed from hash and LRU above, so
                 * it can be freed outside of the site lock.
                 */
                lu_object_free(env, orig);
}
EXPORT_SYMBOL(lu_object_put);
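
/*
 * Editorial usage sketch (not part of the original source): a reference
 * obtained from lu_object_find() is balanced by lu_object_put(). The
 * env/site/fid variables are assumed to be set up by the caller.
 *
 *      struct lu_object *o;
 *
 *      o = lu_object_find(env, site, fid, NULL);
 *      if (!IS_ERR(o)) {
 *              (use the object: all slices share o->lo_header)
 *              lu_object_put(env, o);
 *      }
 */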

/*
 * Allocate a new object.
 *
 * This follows the object creation protocol, described in the comment
 * within the struct lu_device_operations definition.
 */
static struct lu_object *lu_object_alloc(const struct lu_env *env,
                                         struct lu_site *s,
                                         const struct lu_fid *f,
                                         const struct lustre_capa *capa)
{
        struct lu_object *scan;
        struct lu_object *top;
        struct list_head *layers;
        int clean;
        int result;
        ENTRY;

        /*
         * Create top-level object slice. This will also create
         * lu_object_header.
         */
        top = s->ls_top_dev->ld_ops->ldo_object_alloc(env,
                                                      NULL, s->ls_top_dev);
        if (IS_ERR(top))
                RETURN(top);
        /*
         * This is the only place where object fid is assigned. It's constant
         * after this point.
         */
        top->lo_header->loh_fid = *f;
        if (capa == BYPASS_CAPA)
                lu_object_bypass_capa(top);
        else if (capa != NULL)
                top->lo_header->loh_capa = *capa;
        layers = &top->lo_header->loh_layers;
        do {
                /*
                 * Call ->loo_object_init() repeatedly, until no more new
                 * object slices are created.
                 */
                clean = 1;
                list_for_each_entry(scan, layers, lo_linkage) {
                        if (scan->lo_flags & LU_OBJECT_ALLOCATED)
                                continue;
                        clean = 0;
                        scan->lo_header = top->lo_header;
                        result = scan->lo_ops->loo_object_init(env, scan);
                        if (result != 0) {
                                lu_object_free(env, top);
                                RETURN(ERR_PTR(result));
                        }
                        scan->lo_flags |= LU_OBJECT_ALLOCATED;
                }
        } while (!clean);

        list_for_each_entry_reverse(scan, layers, lo_linkage) {
                if (scan->lo_ops->loo_object_start != NULL) {
                        result = scan->lo_ops->loo_object_start(env, scan);
                        if (result != 0) {
                                lu_object_free(env, top);
                                RETURN(ERR_PTR(result));
                        }
                }
        }

        s->ls_stats.s_created ++;
        RETURN(top);
}

/*
 * Free an object.
 */
static void lu_object_free(const struct lu_env *env, struct lu_object *o)
{
        struct list_head  splice;
        struct lu_object *scan;

        /*
         * First call ->loo_object_delete() method to release all resources.
         */
        list_for_each_entry_reverse(scan,
                                    &o->lo_header->loh_layers, lo_linkage) {
                if (scan->lo_ops->loo_object_delete != NULL)
                        scan->lo_ops->loo_object_delete(env, scan);
        }
        -- o->lo_dev->ld_site->ls_total;
        /*
         * Then, splice object layers into stand-alone list, and call
         * ->loo_object_free() on all layers to free memory. Splice is
         * necessary, because lu_object_header is freed together with the
         * top-level slice.
         */
        INIT_LIST_HEAD(&splice);
        list_splice_init(&o->lo_header->loh_layers, &splice);
        while (!list_empty(&splice)) {
                o = container_of0(splice.next, struct lu_object, lo_linkage);
                list_del_init(&o->lo_linkage);
                LASSERT(o->lo_ops->loo_object_free != NULL);
                o->lo_ops->loo_object_free(env, o);
        }
}

/*
 * Free at most @nr objects from the cold end of the site LRU list.
 */
void lu_site_purge(const struct lu_env *env, struct lu_site *s, int nr)
{
        struct list_head         dispose;
        struct lu_object_header *h;
        struct lu_object_header *temp;

        INIT_LIST_HEAD(&dispose);
        /*
         * Under LRU list lock, scan LRU list and move unreferenced objects to
         * the dispose list, removing them from LRU and hash table.
         */
        spin_lock(&s->ls_guard);
        list_for_each_entry_safe(h, temp, &s->ls_lru, loh_lru) {
                if (nr-- == 0)
                        break;
                if (h->loh_ref > 0)
                        continue;
                hlist_del_init(&h->loh_hash);
                list_move(&h->loh_lru, &dispose);
        }
        spin_unlock(&s->ls_guard);
        /*
         * Free everything on the dispose list. This is safe against races due
         * to the reasons described in lu_object_put().
         */
        while (!list_empty(&dispose)) {
                h = container_of0(dispose.next,
                                  struct lu_object_header, loh_lru);
                list_del_init(&h->loh_lru);
                lu_object_free(env, lu_object_top(h));
                s->ls_stats.s_lru_purged ++;
        }
}
EXPORT_SYMBOL(lu_site_purge);
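
/*
 * Editorial usage sketch (assumption, not from the original source): a
 * cache shrinker or shutdown path can drain a site by purging from the
 * cold end of the LRU; passing a very large @nr effectively frees every
 * unreferenced object.
 *
 *      lu_site_purge(env, site, ~0);
 */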

/*
 * The code below has to jump through hoops to output an object description
 * into a libcfs_debug_msg-based log. The problem is that lu_object_print()
 * composes the object description from strings that are parts of _lines_ of
 * output (i.e., strings that are not terminated by a newline). This doesn't
 * fit well into the libcfs_debug_msg() interface, which assumes that each
 * message supplied to it is a self-contained output line.
 *
 * To work around this, strings are collected in a temporary buffer
 * (implemented as a value of the lu_cdebug_key key), until a terminating
 * newline character is detected.
 */

enum {
        /*
         * Maximal line size.
         *
         * XXX overflow is not handled correctly.
         */
        LU_CDEBUG_LINE = 256
};

struct lu_cdebug_data {
        /*
         * Temporary buffer.
         */
        char lck_area[LU_CDEBUG_LINE];
};

static void *lu_cdebug_key_init(const struct lu_context *ctx,
                                struct lu_context_key *key)
{
        struct lu_cdebug_data *value;

        OBD_ALLOC_PTR(value);
        if (value == NULL)
                value = ERR_PTR(-ENOMEM);
        return value;
}

static void lu_cdebug_key_fini(const struct lu_context *ctx,
                               struct lu_context_key *key, void *data)
{
        struct lu_cdebug_data *value = data;
        OBD_FREE_PTR(value);
}

/*
 * Key holding the temporary buffer. This key is registered very early by
 * lu_global_init().
 */
static struct lu_context_key lu_cdebug_key = {
        .lct_tags = LCT_MD_THREAD|LCT_DT_THREAD|LCT_CL_THREAD,
        .lct_init = lu_cdebug_key_init,
        .lct_fini = lu_cdebug_key_fini
};

/*
 * Printer function emitting messages through libcfs_debug_msg().
 */
int lu_cdebug_printer(const struct lu_env *env,
                      void *cookie, const char *format, ...)
{
        struct lu_cdebug_print_info *info = cookie;
        struct lu_cdebug_data       *key;
        int      used;
        int      complete;
        va_list  args;

        va_start(args, format);

        key = lu_context_key_get(&env->le_ctx, &lu_cdebug_key);
        LASSERT(key != NULL);

        used = strlen(key->lck_area);
        complete = format[strlen(format) - 1] == '\n';
        /*
         * Append new chunk to the buffer.
         */
        vsnprintf(key->lck_area + used,
                  ARRAY_SIZE(key->lck_area) - used, format, args);
        if (complete) {
                libcfs_debug_msg(NULL, info->lpi_subsys, info->lpi_mask,
                                 (char *)info->lpi_file, info->lpi_fn,
                                 info->lpi_line, "%s", key->lck_area);
                key->lck_area[0] = 0;
        }
        va_end(args);
        return 0;
}
EXPORT_SYMBOL(lu_cdebug_printer);
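
/*
 * Editorial note (illustrative, not from the original source): because the
 * printer buffers until it sees '\n', a multi-part line is emitted as one
 * debug message. For example, two calls such as
 *
 *      lu_cdebug_printer(env, &info, "header@%p", hdr);
 *      lu_cdebug_printer(env, &info, " flags: %#lx\n", flags);
 *
 * produce a single log line; only the second call, whose format ends in a
 * newline, flushes the accumulated buffer. The info/hdr/flags names here
 * are hypothetical.
 */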

/*
 * Print object header.
 */
static void lu_object_header_print(const struct lu_env *env,
                                   void *cookie, lu_printer_t printer,
                                   const struct lu_object_header *hdr)
{
        (*printer)(env, cookie, "header@%p[%#lx, %d, "DFID"%s%s]",
                   hdr, hdr->loh_flags, hdr->loh_ref, PFID(&hdr->loh_fid),
                   hlist_unhashed(&hdr->loh_hash) ? "" : " hash",
                   list_empty(&hdr->loh_lru) ? "" : " lru");
}

/*
 * Print a human-readable representation of @o through @printer.
 */
void lu_object_print(const struct lu_env *env, void *cookie,
                     lu_printer_t printer, const struct lu_object *o)
{
        static const char ruler[] = "........................................";
        struct lu_object_header *top;
        int depth;

        top = o->lo_header;
        lu_object_header_print(env, cookie, printer, top);
        (*printer)(env, cookie, "\n");
        list_for_each_entry(o, &top->loh_layers, lo_linkage) {
                depth = o->lo_depth + 4;
                LASSERT(o->lo_ops->loo_object_print != NULL);
                /*
                 * print `.' @depth times.
                 */
                (*printer)(env, cookie, "%*.*s", depth, depth, ruler);
                o->lo_ops->loo_object_print(env, cookie, printer, o);
                (*printer)(env, cookie, "\n");
        }
}
EXPORT_SYMBOL(lu_object_print);
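
/*
 * Editorial sketch of the resulting output (hypothetical values; the exact
 * fid rendering depends on DFID): the header line is followed by one
 * indented line per slice, e.g.
 *
 *      header@...[0x0, 1, [0x100:0x2:0x0] hash lru]
 *      ....slice description printed by ->loo_object_print()
 *
 * The per-slice text is produced by each layer's own method.
 */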

/*
 * Check object consistency.
 */
int lu_object_invariant(const struct lu_object *o)
{
        struct lu_object_header *top;

        top = o->lo_header;
        list_for_each_entry(o, &top->loh_layers, lo_linkage) {
                if (o->lo_ops->loo_object_invariant != NULL &&
                    !o->lo_ops->loo_object_invariant(o))
                        return 0;
        }
        return 1;
}
EXPORT_SYMBOL(lu_object_invariant);

static struct lu_object *htable_lookup(struct lu_site *s,
                                       const struct hlist_head *bucket,
                                       const struct lu_fid *f)
{
        struct lu_object_header *h;
        struct hlist_node *scan;

        hlist_for_each_entry(h, scan, bucket, loh_hash) {
                s->ls_stats.s_cache_check ++;
                if (lu_fid_eq(&h->loh_fid, f) && !lu_object_is_dying(h)) {
                        /* bump reference count... */
                        if (h->loh_ref ++ == 0)
                                ++ s->ls_busy;
                        /* ...and move to the hot end (tail) of the LRU */
                        list_move_tail(&h->loh_lru, &s->ls_lru);
                        s->ls_stats.s_cache_hit ++;
                        return lu_object_top(h);
                }
        }
        s->ls_stats.s_cache_miss ++;
        return NULL;
}

static __u32 fid_hash(const struct lu_fid *f)
{
        /* all objects with the same id and different versions will belong
         * to the same collision list. */
        return (fid_seq(f) - 1) * LUSTRE_SEQ_MAX_WIDTH + fid_oid(f);
}
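
/*
 * Editorial sketch (mirroring lu_object_find() below): the hash is folded
 * into a bucket index with the site's mask, so all versions of one object
 * land in one collision chain:
 *
 *      bucket = s->ls_hash + (fid_hash(f) & s->ls_hash_mask);
 */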

/*
 * Search the cache for an object with the fid @f. If such an object is
 * found, return it. Otherwise, create a new object, insert it into the
 * cache and return it. In any case, an additional reference is acquired on
 * the returned object.
 */
struct lu_object *lu_object_find(const struct lu_env *env,
                                 struct lu_site *s, const struct lu_fid *f,
                                 struct lustre_capa *capa)
{
        struct lu_object  *o;
        struct lu_object  *shadow;
        struct hlist_head *bucket;
        int rc;
        ENTRY;

        /*
         * This uses standard index maintenance protocol:
         *
         *     - search index under lock, and return object if found;
         *     - otherwise, unlock index, allocate new object;
         *     - lock index and search again;
         *     - if nothing is found (usual case), insert newly created
         *       object into index;
         *     - otherwise (race: other thread inserted object), free
         *       object just allocated;
         *     - unlock index;
         *     - return object.
         */
        bucket = s->ls_hash + (fid_hash(f) & s->ls_hash_mask);
        spin_lock(&s->ls_guard);
        o = htable_lookup(s, bucket, f);
        spin_unlock(&s->ls_guard);
        if (o != NULL) {
                if (capa == BYPASS_CAPA) {
                        o->lo_header->loh_capa_bypass = 1;
                } else {
                        rc = lu_object_auth(env, o, capa,
                                            CAPA_OPC_INDEX_LOOKUP);
                        if (rc) {
                                /* drop reference acquired by
                                 * htable_lookup() */
                                lu_object_put(env, o);
                                RETURN(ERR_PTR(rc));
                        }
                        if (capa != NULL)
                                o->lo_header->loh_capa = *capa;
                }
                RETURN(o);
        }

        /*
         * Allocate new object. This may result in rather complicated
         * operations, including fld queries, inode loading, etc.
         */
        o = lu_object_alloc(env, s, f, capa);
        if (IS_ERR(o))
                RETURN(o);

        LASSERT(lu_fid_eq(lu_object_fid(o), f));

        spin_lock(&s->ls_guard);
        shadow = htable_lookup(s, bucket, f);
        if (shadow == NULL) {
                hlist_add_head(&o->lo_header->loh_hash, bucket);
                list_add_tail(&o->lo_header->loh_lru, &s->ls_lru);
                ++ s->ls_busy;
                shadow = o;
                o = NULL;
        } else
                s->ls_stats.s_cache_race ++;
        spin_unlock(&s->ls_guard);
        if (o != NULL)
                lu_object_free(env, o);
        RETURN(shadow);
}
EXPORT_SYMBOL(lu_object_find);
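
/*
 * Editorial usage sketch (assumption, not from the original source): a
 * typical caller looks an object up by fid, bypassing capability checks
 * with BYPASS_CAPA where appropriate, and eventually drops the reference:
 *
 *      struct lu_object *o;
 *
 *      o = lu_object_find(env, site, fid, BYPASS_CAPA);
 *      if (IS_ERR(o))
 *              return PTR_ERR(o);
 *      (operate on the slice of interest, see lu_object_locate())
 *      lu_object_put(env, o);
 */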

int lu_object_auth(const struct lu_env *env, const struct lu_object *o,
                   struct lustre_capa *capa, __u64 opc)
{
        struct lu_object_header *top = o->lo_header;
        int rc;

        list_for_each_entry(o, &top->loh_layers, lo_linkage) {
                if (o->lo_ops->loo_object_auth) {
                        rc = o->lo_ops->loo_object_auth(env, o, capa, opc);
                        if (rc)
                                return rc;
                }
        }
        return 0;
}
EXPORT_SYMBOL(lu_object_auth);

enum {
        LU_SITE_HTABLE_BITS = 8,
        LU_SITE_HTABLE_SIZE = (1 << LU_SITE_HTABLE_BITS),
        LU_SITE_HTABLE_MASK = LU_SITE_HTABLE_SIZE - 1
};

/*
 * Initialize site @s, with @top as the top-level device.
 */
int lu_site_init(struct lu_site *s, struct lu_device *top)
{
        int result;

        memset(s, 0, sizeof *s);
        spin_lock_init(&s->ls_guard);
        CFS_INIT_LIST_HEAD(&s->ls_lru);
        s->ls_top_dev = top;
        top->ld_site = s;
        lu_device_get(top);
        /*
         * XXX nikita: fixed size hash-table.
         */
        s->ls_hash_mask = LU_SITE_HTABLE_MASK;
        OBD_ALLOC(s->ls_hash, LU_SITE_HTABLE_SIZE * sizeof s->ls_hash[0]);
        if (s->ls_hash != NULL) {
                int i;
                for (i = 0; i < LU_SITE_HTABLE_SIZE; i++)
                        INIT_HLIST_HEAD(&s->ls_hash[i]);
                result = 0;
        } else
                result = -ENOMEM;

        return result;
}
EXPORT_SYMBOL(lu_site_init);
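
/*
 * Editorial usage sketch (assumption; top_dev is hypothetical): a site is
 * set up around a stacked top-level device and torn down only after the
 * cache has been drained:
 *
 *      struct lu_site site;
 *
 *      rc = lu_site_init(&site, top_dev);
 *      ...
 *      lu_site_purge(env, &site, ~0);
 *      lu_site_fini(&site);
 */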

/*
 * Finalize @s and release its resources.
 */
void lu_site_fini(struct lu_site *s)
{
        LASSERT(list_empty(&s->ls_lru));
        LASSERT(s->ls_total == 0);
        LASSERT(s->ls_busy == 0);

        if (s->ls_hash != NULL) {
                int i;
                for (i = 0; i < LU_SITE_HTABLE_SIZE; i++)
                        LASSERT(hlist_empty(&s->ls_hash[i]));
                OBD_FREE(s->ls_hash,
                         LU_SITE_HTABLE_SIZE * sizeof s->ls_hash[0]);
                s->ls_hash = NULL;
        }
        if (s->ls_top_dev != NULL) {
                s->ls_top_dev->ld_site = NULL;
                lu_device_put(s->ls_top_dev);
                s->ls_top_dev = NULL;
        }
}
EXPORT_SYMBOL(lu_site_fini);

/*
 * Acquire additional reference on device @d.
 */
void lu_device_get(struct lu_device *d)
{
        atomic_inc(&d->ld_ref);
}
EXPORT_SYMBOL(lu_device_get);

/*
 * Release reference on device @d.
 */
void lu_device_put(struct lu_device *d)
{
        atomic_dec(&d->ld_ref);
}
EXPORT_SYMBOL(lu_device_put);

/*
 * Initialize device @d of type @t.
 */
int lu_device_init(struct lu_device *d, struct lu_device_type *t)
{
        memset(d, 0, sizeof *d);
        atomic_set(&d->ld_ref, 0);
        d->ld_type = t;
        return 0;
}
EXPORT_SYMBOL(lu_device_init);

/*
 * Finalize device @d.
 */
void lu_device_fini(struct lu_device *d)
{
        if (d->ld_obd != NULL)
                /* finish lprocfs */
                lprocfs_obd_cleanup(d->ld_obd);

        LASSERTF(atomic_read(&d->ld_ref) == 0,
                 "Refcount is %u\n", atomic_read(&d->ld_ref));
}
EXPORT_SYMBOL(lu_device_fini);

/*
 * Initialize object @o that is part of compound object @h and was created
 * by device @d.
 */
int lu_object_init(struct lu_object *o,
                   struct lu_object_header *h, struct lu_device *d)
{
        memset(o, 0, sizeof *o);
        o->lo_header = h;
        o->lo_dev    = d;
        lu_device_get(d);
        CFS_INIT_LIST_HEAD(&o->lo_linkage);
        return 0;
}
EXPORT_SYMBOL(lu_object_init);

/*
 * Finalize object and release its resources.
 */
void lu_object_fini(struct lu_object *o)
{
        LASSERT(list_empty(&o->lo_linkage));

        if (o->lo_dev != NULL) {
                lu_device_put(o->lo_dev);
                o->lo_dev = NULL;
        }
}
EXPORT_SYMBOL(lu_object_fini);

/*
 * Add object @o as the first layer of compound object @h.
 *
 * This is typically called by the ->ldo_object_alloc() method of the
 * top-level device.
 */
void lu_object_add_top(struct lu_object_header *h, struct lu_object *o)
{
        list_move(&o->lo_linkage, &h->loh_layers);
}
EXPORT_SYMBOL(lu_object_add_top);

/*
 * Add object @o as a layer of a compound object, going after @before.
 *
 * This is typically called by the ->ldo_object_alloc() method of the
 * device below @before.
 */
void lu_object_add(struct lu_object *before, struct lu_object *o)
{
        list_move(&o->lo_linkage, &before->lo_linkage);
}
EXPORT_SYMBOL(lu_object_add);

/*
 * Initialize compound object.
 */
int lu_object_header_init(struct lu_object_header *h)
{
        memset(h, 0, sizeof *h);
        h->loh_ref = 1;
        INIT_HLIST_NODE(&h->loh_hash);
        CFS_INIT_LIST_HEAD(&h->loh_lru);
        CFS_INIT_LIST_HEAD(&h->loh_layers);
        return 0;
}
EXPORT_SYMBOL(lu_object_header_init);

/*
 * Finalize compound object.
 */
void lu_object_header_fini(struct lu_object_header *h)
{
        LASSERT(list_empty(&h->loh_layers));
        LASSERT(list_empty(&h->loh_lru));
        LASSERT(hlist_unhashed(&h->loh_hash));
}
EXPORT_SYMBOL(lu_object_header_fini);

/*
 * Given a compound object, find its slice corresponding to the device type
 * @dtype.
 */
struct lu_object *lu_object_locate(struct lu_object_header *h,
                                   struct lu_device_type *dtype)
{
        struct lu_object *o;

        list_for_each_entry(o, &h->loh_layers, lo_linkage) {
                if (o->lo_dev->ld_type == dtype)
                        return o;
        }
        return NULL;
}
EXPORT_SYMBOL(lu_object_locate);
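
/*
 * Editorial usage sketch (assumption; the device-type symbol is
 * hypothetical): after lu_object_find() returns the top slice, a layer
 * retrieves its own slice by device type:
 *
 *      struct lu_object *slice;
 *
 *      slice = lu_object_locate(o->lo_header, &mdd_device_type);
 *      if (slice == NULL)
 *              return -ENOENT;
 */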

enum {
        /*
         * Maximal number of tld slots.
         */
        LU_CONTEXT_KEY_NR = 16
};

static struct lu_context_key *lu_keys[LU_CONTEXT_KEY_NR] = { NULL, };

static spinlock_t lu_keys_guard = SPIN_LOCK_UNLOCKED;

/*
 * Register new key.
 */
int lu_context_key_register(struct lu_context_key *key)
{
        int result;
        int i;

        LASSERT(key->lct_init != NULL);
        LASSERT(key->lct_fini != NULL);
        LASSERT(key->lct_tags != 0);

        result = -ENFILE;
        spin_lock(&lu_keys_guard);
        for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) {
                if (lu_keys[i] == NULL) {
                        key->lct_index = i;
                        atomic_set(&key->lct_used, 1);
                        lu_keys[i] = key;
                        result = 0;
                        break;
                }
        }
        spin_unlock(&lu_keys_guard);
        return result;
}
EXPORT_SYMBOL(lu_context_key_register);

/*
 * Deregister key.
 */
void lu_context_key_degister(struct lu_context_key *key)
{
        LASSERT(atomic_read(&key->lct_used) >= 1);
        LASSERT(0 <= key->lct_index && key->lct_index < ARRAY_SIZE(lu_keys));

        if (atomic_read(&key->lct_used) > 1)
                CERROR("key has instances.\n");
        spin_lock(&lu_keys_guard);
        lu_keys[key->lct_index] = NULL;
        spin_unlock(&lu_keys_guard);
}
EXPORT_SYMBOL(lu_context_key_degister);

/*
 * Return the value associated with key @key in context @ctx.
 */
void *lu_context_key_get(const struct lu_context *ctx,
                         struct lu_context_key *key)
{
        LASSERT(0 <= key->lct_index && key->lct_index < ARRAY_SIZE(lu_keys));
        return ctx->lc_value[key->lct_index];
}
EXPORT_SYMBOL(lu_context_key_get);
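
/*
 * Editorial sketch of the key life-cycle (assumption, not from the original
 * source; all foo_* names are hypothetical). A module defines per-context
 * data, registers a key for it, and later fetches the value from a context,
 * following the same pattern as lu_cdebug_key above:
 *
 *      static void *foo_key_init(const struct lu_context *ctx,
 *                                struct lu_context_key *key)
 *      {
 *              struct foo_thread_info *info;
 *
 *              OBD_ALLOC_PTR(info);
 *              if (info == NULL)
 *                      info = ERR_PTR(-ENOMEM);
 *              return info;
 *      }
 *
 *      static void foo_key_fini(const struct lu_context *ctx,
 *                               struct lu_context_key *key, void *data)
 *      {
 *              struct foo_thread_info *info = data;
 *              OBD_FREE_PTR(info);
 *      }
 *
 *      static struct lu_context_key foo_key = {
 *              .lct_tags = LCT_MD_THREAD,
 *              .lct_init = foo_key_init,
 *              .lct_fini = foo_key_fini
 *      };
 *
 *      rc   = lu_context_key_register(&foo_key);
 *      info = lu_context_key_get(&env->le_ctx, &foo_key);
 */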

static void keys_fini(struct lu_context *ctx)
{
        int i;

        if (ctx->lc_value != NULL) {
                for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) {
                        if (ctx->lc_value[i] != NULL) {
                                struct lu_context_key *key;

                                key = lu_keys[i];
                                LASSERT(key != NULL);
                                LASSERT(key->lct_fini != NULL);
                                LASSERT(atomic_read(&key->lct_used) > 1);

                                key->lct_fini(ctx, key, ctx->lc_value[i]);
                                atomic_dec(&key->lct_used);
                                ctx->lc_value[i] = NULL;
                        }
                }
                OBD_FREE(ctx->lc_value,
                         ARRAY_SIZE(lu_keys) * sizeof ctx->lc_value[0]);
                ctx->lc_value = NULL;
        }
}

static int keys_fill(const struct lu_context *ctx)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) {
                struct lu_context_key *key;

                key = lu_keys[i];
                if (ctx->lc_value[i] == NULL &&
                    key != NULL && key->lct_tags & ctx->lc_tags) {
                        void *value;

                        LASSERT(key->lct_init != NULL);
                        LASSERT(key->lct_index == i);

                        value = key->lct_init(ctx, key);
                        if (IS_ERR(value))
                                return PTR_ERR(value);
                        atomic_inc(&key->lct_used);
                        ctx->lc_value[i] = value;
                }
        }
        return 0;
}

static int keys_init(struct lu_context *ctx)
{
        int result;

        OBD_ALLOC(ctx->lc_value,
                  ARRAY_SIZE(lu_keys) * sizeof ctx->lc_value[0]);
        if (ctx->lc_value != NULL)
                result = keys_fill(ctx);
        else
                result = -ENOMEM;

        if (result != 0)
                keys_fini(ctx);
        return result;
}

/*
 * Initialize context data-structure. Create values for all keys.
 */
int lu_context_init(struct lu_context *ctx, __u32 tags)
{
        memset(ctx, 0, sizeof *ctx);
        ctx->lc_tags = tags;
        return keys_init(ctx);
}
EXPORT_SYMBOL(lu_context_init);

/*
 * Finalize context data-structure. Destroy key values.
 */
void lu_context_fini(struct lu_context *ctx)
{
        keys_fini(ctx);
}
EXPORT_SYMBOL(lu_context_fini);

/*
 * Called before entering context.
 */
void lu_context_enter(struct lu_context *ctx)
{
}
EXPORT_SYMBOL(lu_context_enter);

/*
 * Called after exiting from @ctx.
 */
void lu_context_exit(struct lu_context *ctx)
{
        int i;

        if (ctx->lc_value != NULL) {
                for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) {
                        if (ctx->lc_value[i] != NULL) {
                                struct lu_context_key *key;

                                key = lu_keys[i];
                                LASSERT(key != NULL);
                                if (key->lct_exit != NULL)
                                        key->lct_exit(ctx,
                                                      key, ctx->lc_value[i]);
                        }
                }
        }
}
EXPORT_SYMBOL(lu_context_exit);

/*
 * Allocate for context all missing keys that were registered after the
 * context was created.
 */
int lu_context_refill(const struct lu_context *ctx)
{
        LASSERT(ctx->lc_value != NULL);
        return keys_fill(ctx);
}
EXPORT_SYMBOL(lu_context_refill);

int lu_env_init(struct lu_env *env, struct lu_context *ses, __u32 tags)
{
        int result;

        env->le_ses = ses;
        result = lu_context_init(&env->le_ctx, tags);
        if (result == 0)
                lu_context_enter(&env->le_ctx);
        return result;
}
EXPORT_SYMBOL(lu_env_init);
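
/*
 * Editorial usage sketch (assumption, not from the original source): an
 * environment brackets a stretch of request processing; lu_env_fini()
 * undoes lu_env_init():
 *
 *      struct lu_env env;
 *
 *      rc = lu_env_init(&env, NULL, LCT_MD_THREAD);
 *      if (rc == 0) {
 *              (process request using &env)
 *              lu_env_fini(&env);
 *      }
 */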

void lu_env_fini(struct lu_env *env)
{
        lu_context_exit(&env->le_ctx);
        lu_context_fini(&env->le_ctx);
        env->le_ses = NULL;
}
EXPORT_SYMBOL(lu_env_fini);

/*
 * Initialization of global lu_* data.
 */
int lu_global_init(void)
{
        int result;

        result = lu_context_key_register(&lu_cdebug_key);
        return result;
}

/*
 * Dual to lu_global_init().
 */
void lu_global_fini(void)
{
        lu_context_key_degister(&lu_cdebug_key);
}

struct lu_buf LU_BUF_NULL = {
        .lb_buf = NULL,
        .lb_len = 0
};
EXPORT_SYMBOL(LU_BUF_NULL);