/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
 * vim:expandtab:shiftwidth=8:tabstop=8:
 *
 * Lustre Object.
 *
 *  Copyright (C) 2006 Cluster File Systems, Inc.
 *   Author: Nikita Danilov <nikita@clusterfs.com>
 *
 *   This file is part of the Lustre file system, http://www.lustre.org
 *   Lustre is a trademark of Cluster File Systems, Inc.
 *
 *   You may have signed or agreed to another license before downloading
 *   this software.  If so, you are bound by the terms and conditions
 *   of that agreement, and the following does not apply to you.  See the
 *   LICENSE file included with this distribution for more information.
 *
 *   If you did not agree to a different license, then this copy of Lustre
 *   is open source software; you can redistribute it and/or modify it
 *   under the terms of version 2 of the GNU General Public License as
 *   published by the Free Software Foundation.
 *
 *   In either case, Lustre is distributed in the hope that it will be
 *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
 *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   license text for more details.
 *
 * These are the only exported functions; they provide some generic
 * infrastructure for managing object devices.
 */

#define DEBUG_SUBSYSTEM S_CLASS
#ifndef EXPORT_SYMTAB
# define EXPORT_SYMTAB
#endif

#include <linux/seq_file.h>
#include <linux/module.h>
#include <obd_support.h>
#include <lustre_disk.h>
#include <lustre_fid.h>
#include <lu_object.h>
#include <libcfs/list.h>

static void lu_object_free(const struct lu_context *ctx, struct lu_object *o);

/*
 * Decrease reference counter on object. If last reference is freed, return
 * object to the cache, unless lu_object_is_dying(o) holds. In the latter
 * case, free object immediately.
 */
void lu_object_put(const struct lu_context *ctxt, struct lu_object *o)
{
        struct lu_object_header *top;
        struct lu_site          *site;
        struct lu_object        *orig;
        int                      kill_it;

        top = o->lo_header;
        site = o->lo_dev->ld_site;
        orig = o;
        kill_it = 0;
        spin_lock(&site->ls_guard);
        if (-- top->loh_ref == 0) {
                /*
                 * When last reference is released, iterate over object
                 * layers, and notify them that object is no longer busy.
                 */
                list_for_each_entry(o, &top->loh_layers, lo_linkage) {
                        if (o->lo_ops->loo_object_release != NULL)
                                o->lo_ops->loo_object_release(ctxt, o);
                }
                -- site->ls_busy;
                if (lu_object_is_dying(top)) {
                        /*
                         * If object is dying (will not be cached), remove it
                         * from hash table and LRU.
                         *
                         * This is done with hash table and LRU lists
                         * locked. As the only way to acquire first reference
                         * to previously unreferenced object is through
                         * hash-table lookup (lu_object_find()), or LRU
                         * scanning (lu_site_purge()), that are done under
                         * hash-table and LRU lock, no race with concurrent
                         * object lookup is possible and we can safely destroy
                         * object below.
                         */
                        hlist_del_init(&top->loh_hash);
                        list_del_init(&top->loh_lru);
                        kill_it = 1;
                }
        }
        spin_unlock(&site->ls_guard);
        if (kill_it)
                /*
                 * Object was already removed from hash and lru above, can
                 * kill it.
                 */
                lu_object_free(ctxt, orig);
}
EXPORT_SYMBOL(lu_object_put);

/*
 * Allocate new object.
 *
 * This follows object creation protocol, described in the comment within
 * struct lu_device_operations definition.
 */
static struct lu_object *lu_object_alloc(const struct lu_context *ctxt,
                                         struct lu_site *s,
                                         const struct lu_fid *f)
{
        struct lu_object *scan;
        struct lu_object *top;
        int clean;
        int result;

        /*
         * Create top-level object slice. This will also create
         * lu_object_header.
         */
        top = s->ls_top_dev->ld_ops->ldo_object_alloc(ctxt,
                                                      NULL, s->ls_top_dev);
        if (IS_ERR(top))
                RETURN(top);
        s->ls_total ++;
        /*
         * This is the only place where object fid is assigned. It's constant
         * after this point.
         */
        top->lo_header->loh_fid = *f;
        do {
                /*
                 * Call ->loo_object_init() repeatedly, until no more new
                 * object slices are created.
                 */
                clean = 1;
                list_for_each_entry(scan,
                                    &top->lo_header->loh_layers, lo_linkage) {
                        if (scan->lo_flags & LU_OBJECT_ALLOCATED)
                                continue;
                        clean = 0;
                        scan->lo_header = top->lo_header;
                        result = scan->lo_ops->loo_object_init(ctxt, scan);
                        if (result != 0) {
                                lu_object_free(ctxt, top);
                                RETURN(ERR_PTR(result));
                        }
                        scan->lo_flags |= LU_OBJECT_ALLOCATED;
                }
        } while (!clean);
        s->ls_stats.s_created ++;
        RETURN(top);
}

/*
 * Free object.
 */
static void lu_object_free(const struct lu_context *ctx, struct lu_object *o)
{
        struct list_head splice;
        struct lu_object *scan;

        /*
         * First call ->loo_object_delete() method to release all resources.
         */
        list_for_each_entry_reverse(scan,
                                    &o->lo_header->loh_layers, lo_linkage) {
                if (scan->lo_ops->loo_object_delete != NULL)
                        scan->lo_ops->loo_object_delete(ctx, scan);
        }
        -- o->lo_dev->ld_site->ls_total;
        /*
         * Then, splice object layers into stand-alone list, and call
         * ->loo_object_free() on all layers to free memory. Splice is
         * necessary, because lu_object_header is freed together with the
         * top-level slice.
         */
        INIT_LIST_HEAD(&splice);
        list_splice_init(&o->lo_header->loh_layers, &splice);
        while (!list_empty(&splice)) {
                o = container_of0(splice.next, struct lu_object, lo_linkage);
                list_del_init(&o->lo_linkage);
                LASSERT(o->lo_ops->loo_object_free != NULL);
                o->lo_ops->loo_object_free(ctx, o);
        }
}

/*
 * Free @nr objects from the cold end of the site LRU list.
 */
void lu_site_purge(const struct lu_context *ctx, struct lu_site *s, int nr)
{
        struct list_head         dispose;
        struct lu_object_header *h;
        struct lu_object_header *temp;

        INIT_LIST_HEAD(&dispose);
        /*
         * Under LRU list lock, scan LRU list and move unreferenced objects to
         * the dispose list, removing them from LRU and hash table.
         */
        spin_lock(&s->ls_guard);
        list_for_each_entry_safe(h, temp, &s->ls_lru, loh_lru) {
                if (nr-- == 0)
                        break;
                if (h->loh_ref > 0)
                        continue;
                hlist_del_init(&h->loh_hash);
                list_move(&h->loh_lru, &dispose);
        }
        spin_unlock(&s->ls_guard);
        /*
         * Free everything on the dispose list. This is safe against races due
         * to the reasons described in lu_object_put().
         */
        while (!list_empty(&dispose)) {
                h = container_of0(dispose.next,
                                  struct lu_object_header, loh_lru);
                list_del_init(&h->loh_lru);
                lu_object_free(ctx, lu_object_top(h));
                s->ls_stats.s_lru_purged ++;
        }
}
EXPORT_SYMBOL(lu_site_purge);

/*
 * Print human readable representation of the @o to the @f.
 */
int lu_object_print(const struct lu_context *ctx,
                    struct seq_file *f, const struct lu_object *o)
{
        static char ruler[] = "........................................";
        struct lu_object_header *top;
        int nob;
        int depth;

        nob = 0;
        top = o->lo_header;
        list_for_each_entry(o, &top->loh_layers, lo_linkage) {
                depth = o->lo_depth;
                LASSERT(o->lo_ops->loo_object_print != NULL);
                /*
                 * print `.' @depth times.
                 */
                nob += seq_printf(f, "%*.*s", depth, depth, ruler);
                nob += o->lo_ops->loo_object_print(ctx, f, o);
                nob += seq_printf(f, "\n");
        }
        return nob;
}
EXPORT_SYMBOL(lu_object_print);
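
/*
 * The loop above emits one line per object slice, each prefixed by lo_depth
 * dots. A purely illustrative sketch of the resulting output (the layer
 * names are hypothetical, not produced by this file):
 *
 *      mdt-object@{fid}
 *      .mdd-object@{fid}
 *      ..osd-object@{fid}
 */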

/*
 * Check object consistency.
 */
int lu_object_invariant(const struct lu_object *o)
{
        struct lu_object_header *top;

        top = o->lo_header;
        list_for_each_entry(o, &top->loh_layers, lo_linkage) {
                if (o->lo_ops->loo_object_invariant != NULL &&
                    !o->lo_ops->loo_object_invariant(o))
                        return 0;
        }
        return 1;
}
EXPORT_SYMBOL(lu_object_invariant);

static struct lu_object *htable_lookup(struct lu_site *s,
                                       const struct hlist_head *bucket,
                                       const struct lu_fid *f)
{
        struct lu_object_header *h;
        struct hlist_node *scan;

        hlist_for_each_entry(h, scan, bucket, loh_hash) {
                s->ls_stats.s_cache_check ++;
                if (lu_fid_eq(&h->loh_fid, f) && !lu_object_is_dying(h)) {
                        /* bump reference count... */
                        if (h->loh_ref ++ == 0)
                                ++ s->ls_busy;
                        /* and move to the hot end of the LRU */
                        list_move_tail(&h->loh_lru, &s->ls_lru);
                        s->ls_stats.s_cache_hit ++;
                        return lu_object_top(h);
                }
        }
        s->ls_stats.s_cache_miss ++;
        return NULL;
}

static __u32 fid_hash(const struct lu_fid *f)
{
        /* all objects with the same id and different versions will belong to
         * the same collision list. */
        return (fid_seq(f) - 1) * LUSTRE_SEQ_MAX_WIDTH + fid_oid(f);
}
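
/*
 * For example (numbers are arbitrary and purely illustrative): a fid with
 * seq == 5 and oid == 7 hashes to (5 - 1) * LUSTRE_SEQ_MAX_WIDTH + 7.
 * lu_object_find() below then masks this value with ->ls_hash_mask, so with
 * the default table only the low LU_SITE_HTABLE_BITS bits of the sum select
 * the hash bucket.
 */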

/*
 * Search cache for an object with the fid @f. If such object is found, return
 * it. Otherwise, create new object, insert it into cache and return it. In
 * any case, additional reference is acquired on the returned object.
 */
struct lu_object *lu_object_find(const struct lu_context *ctxt,
                                 struct lu_site *s, const struct lu_fid *f)
{
        struct lu_object  *o;
        struct lu_object  *shadow;
        struct hlist_head *bucket;

        /*
         * This uses standard index maintenance protocol:
         *
         *     - search index under lock, and return object if found;
         *     - otherwise, unlock index, allocate new object;
         *     - lock index and search again;
         *     - if nothing is found (usual case), insert newly created
         *       object into index;
         *     - otherwise (race: other thread inserted object), free
         *       object just allocated.
         *     - unlock index;
         *     - return object.
         */

        bucket = s->ls_hash + (fid_hash(f) & s->ls_hash_mask);
        spin_lock(&s->ls_guard);
        o = htable_lookup(s, bucket, f);

        spin_unlock(&s->ls_guard);
        if (o != NULL)
                return o;
        /*
         * Allocate new object. This may result in rather complicated
         * operations, including fld queries, inode loading, etc.
         */
        o = lu_object_alloc(ctxt, s, f);
        if (IS_ERR(o))
                return o;

        LASSERT(lu_fid_eq(lu_object_fid(o), f));

        spin_lock(&s->ls_guard);
        shadow = htable_lookup(s, bucket, f);
        if (shadow == NULL) {
                hlist_add_head(&o->lo_header->loh_hash, bucket);
                list_add_tail(&o->lo_header->loh_lru, &s->ls_lru);
                ++ s->ls_busy;
                shadow = o;
                o = NULL;
        } else
                s->ls_stats.s_cache_race ++;
        spin_unlock(&s->ls_guard);
        if (o != NULL)
                lu_object_free(ctxt, o);
        return shadow;
}
EXPORT_SYMBOL(lu_object_find);
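
/*
 * Typical use of the cache interface, as a sketch only (the error handling
 * policy and the work done with the object are illustrative, not mandated
 * by this file):
 *
 *      struct lu_object *o;
 *
 *      o = lu_object_find(ctxt, s, f);
 *      if (IS_ERR(o))
 *              return PTR_ERR(o);
 *      ... use the object, e.g. pick a slice with lu_object_locate() ...
 *      lu_object_put(ctxt, o);
 */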

enum {
        LU_SITE_HTABLE_BITS = 8,
        LU_SITE_HTABLE_SIZE = (1 << LU_SITE_HTABLE_BITS),
        LU_SITE_HTABLE_MASK = LU_SITE_HTABLE_SIZE - 1
};

/*
 * Initialize site @s, with @top as the top level device.
 */
int lu_site_init(struct lu_site *s, struct lu_device *top)
{
        int result;
        ENTRY;

        memset(s, 0, sizeof *s);
        spin_lock_init(&s->ls_guard);
        CFS_INIT_LIST_HEAD(&s->ls_lru);
        s->ls_top_dev = top;
        top->ld_site = s;
        lu_device_get(top);
        /*
         * XXX nikita: fixed size hash-table.
         */
        s->ls_hash_mask = LU_SITE_HTABLE_MASK;
        OBD_ALLOC(s->ls_hash, LU_SITE_HTABLE_SIZE * sizeof s->ls_hash[0]);
        if (s->ls_hash != NULL) {
                int i;
                for (i = 0; i < LU_SITE_HTABLE_SIZE; i++)
                        INIT_HLIST_HEAD(&s->ls_hash[i]);
                result = 0;
        } else {
                result = -ENOMEM;
        }

        RETURN(result);
}
EXPORT_SYMBOL(lu_site_init);

/*
 * Finalize @s and release its resources.
 */
void lu_site_fini(struct lu_site *s)
{
        LASSERT(list_empty(&s->ls_lru));
        LASSERT(s->ls_total == 0);
        LASSERT(s->ls_busy == 0);

        if (s->ls_hash != NULL) {
                int i;
                for (i = 0; i < LU_SITE_HTABLE_SIZE; i++)
                        LASSERT(hlist_empty(&s->ls_hash[i]));
                OBD_FREE(s->ls_hash,
                         LU_SITE_HTABLE_SIZE * sizeof s->ls_hash[0]);
                s->ls_hash = NULL;
        }
        if (s->ls_top_dev != NULL) {
                s->ls_top_dev->ld_site = NULL;
                lu_device_put(s->ls_top_dev);
                s->ls_top_dev = NULL;
        }
}
EXPORT_SYMBOL(lu_site_fini);
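
/*
 * Sketch of a site life-cycle (assumptions: "top_dev" stands for whatever
 * device the caller stacks at the top and is not defined in this file; the
 * purge of the whole cache before lu_site_fini() mirrors the LASSERTs
 * above):
 *
 *      struct lu_site site;
 *      int rc;
 *
 *      rc = lu_site_init(&site, top_dev);
 *      if (rc == 0) {
 *              ... lu_object_find()/lu_object_put() against &site ...
 *              lu_site_purge(ctx, &site, ~0);
 *              lu_site_fini(&site);
 *      }
 */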

/*
 * Acquire additional reference on device @d
 */
void lu_device_get(struct lu_device *d)
{
        atomic_inc(&d->ld_ref);
}
EXPORT_SYMBOL(lu_device_get);

/*
 * Release reference on device @d.
 */
void lu_device_put(struct lu_device *d)
{
        atomic_dec(&d->ld_ref);
}
EXPORT_SYMBOL(lu_device_put);

/*
 * Initialize device @d of type @t.
 */
int lu_device_init(struct lu_device *d, struct lu_device_type *t)
{
        memset(d, 0, sizeof *d);
        atomic_set(&d->ld_ref, 0);
        d->ld_type = t;
        return 0;
}
EXPORT_SYMBOL(lu_device_init);

/*
 * Finalize device @d.
 */
void lu_device_fini(struct lu_device *d)
{
        LASSERT(atomic_read(&d->ld_ref) == 0);
}
EXPORT_SYMBOL(lu_device_fini);

/*
 * Initialize object @o that is part of compound object @h and was created by
 * device @d.
 */
int lu_object_init(struct lu_object *o,
                   struct lu_object_header *h, struct lu_device *d)
{
        memset(o, 0, sizeof *o);
        o->lo_header = h;
        o->lo_dev    = d;
        lu_device_get(d);
        CFS_INIT_LIST_HEAD(&o->lo_linkage);
        return 0;
}
EXPORT_SYMBOL(lu_object_init);

/*
 * Finalize object and release its resources.
 */
void lu_object_fini(struct lu_object *o)
{
        LASSERT(list_empty(&o->lo_linkage));

        if (o->lo_dev != NULL) {
                lu_device_put(o->lo_dev);
                o->lo_dev = NULL;
        }
}
EXPORT_SYMBOL(lu_object_fini);

/*
 * Add object @o as first layer of compound object @h
 *
 * This is typically called by the ->ldo_object_alloc() method of top-level
 * device.
 */
void lu_object_add_top(struct lu_object_header *h, struct lu_object *o)
{
        list_move(&o->lo_linkage, &h->loh_layers);
}
EXPORT_SYMBOL(lu_object_add_top);

/*
 * Add object @o as a layer of compound object, going after @before.
 *
 * This is typically called by the ->ldo_object_alloc() method of
 * @before->lo_dev.
 */
void lu_object_add(struct lu_object *before, struct lu_object *o)
{
        list_move(&o->lo_linkage, &before->lo_linkage);
}
EXPORT_SYMBOL(lu_object_add);

/*
 * Initialize compound object.
 */
int lu_object_header_init(struct lu_object_header *h)
{
        memset(h, 0, sizeof *h);
        h->loh_ref = 1;
        INIT_HLIST_NODE(&h->loh_hash);
        CFS_INIT_LIST_HEAD(&h->loh_lru);
        CFS_INIT_LIST_HEAD(&h->loh_layers);
        return 0;
}
EXPORT_SYMBOL(lu_object_header_init);

/*
 * Finalize compound object.
 */
void lu_object_header_fini(struct lu_object_header *h)
{
        LASSERT(list_empty(&h->loh_layers));
        LASSERT(list_empty(&h->loh_lru));
        LASSERT(hlist_unhashed(&h->loh_hash));
}
EXPORT_SYMBOL(lu_object_header_fini);

/*
 * Given a compound object, find its slice, corresponding to the device type
 * @dtype.
 */
struct lu_object *lu_object_locate(struct lu_object_header *h,
                                   struct lu_device_type *dtype)
{
        struct lu_object *o;

        list_for_each_entry(o, &h->loh_layers, lo_linkage) {
                if (o->lo_dev->ld_type == dtype)
                        return o;
        }
        return NULL;
}
EXPORT_SYMBOL(lu_object_locate);
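
/*
 * A layer typically uses lu_object_locate() to find its own slice in a
 * compound object. Illustrative sketch ("mdd_device_type" is a hypothetical
 * name, not defined here):
 *
 *      struct lu_object *slice;
 *
 *      slice = lu_object_locate(o->lo_header, &mdd_device_type);
 *      if (slice != NULL)
 *              ... container_of0(slice, ...) to reach layer-private data ...
 */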

enum {
        /*
         * Maximal number of tld slots.
         */
        LU_CONTEXT_KEY_NR = 16
};

static struct lu_context_key *lu_keys[LU_CONTEXT_KEY_NR] = { NULL, };

static spinlock_t lu_keys_guard = SPIN_LOCK_UNLOCKED;

/*
 * Register new key.
 */
int lu_context_key_register(struct lu_context_key *key)
{
        int result;
        int i;

        LASSERT(key->lct_init != NULL);
        LASSERT(key->lct_fini != NULL);
        LASSERT(key->lct_tags != 0);

        result = -ENFILE;
        spin_lock(&lu_keys_guard);
        for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) {
                if (lu_keys[i] == NULL) {
                        key->lct_index = i;
                        key->lct_used = 1;
                        lu_keys[i] = key;
                        result = 0;
                        break;
                }
        }
        spin_unlock(&lu_keys_guard);
        return result;
}
EXPORT_SYMBOL(lu_context_key_register);

/*
 * Deregister key.
 */
void lu_context_key_degister(struct lu_context_key *key)
{
        LASSERT(key->lct_used >= 1);
        LASSERT(0 <= key->lct_index && key->lct_index < ARRAY_SIZE(lu_keys));

        if (key->lct_used > 1)
                CERROR("key has instances.\n");
        spin_lock(&lu_keys_guard);
        lu_keys[key->lct_index] = NULL;
        spin_unlock(&lu_keys_guard);
}
EXPORT_SYMBOL(lu_context_key_degister);

/*
 * Return value associated with key @key in context @ctx.
 */
void *lu_context_key_get(const struct lu_context *ctx,
                         struct lu_context_key *key)
{
        LASSERT(0 <= key->lct_index && key->lct_index < ARRAY_SIZE(lu_keys));
        return ctx->lc_value[key->lct_index];
}
EXPORT_SYMBOL(lu_context_key_get);
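
/*
 * Sketch of how a layer would define and use a key (all of "foo_thread_info",
 * "foo_key_init", "foo_key_fini" and the tag value FOO_CONTEXT_TAG are
 * hypothetical; only the hook signatures follow how keys_init()/keys_fini()
 * below invoke them):
 *
 *      static void *foo_key_init(const struct lu_context *ctx,
 *                                struct lu_context_key *key)
 *      {
 *              struct foo_thread_info *info;
 *
 *              OBD_ALLOC(info, sizeof *info);
 *              if (info == NULL)
 *                      info = ERR_PTR(-ENOMEM);
 *              return info;
 *      }
 *
 *      static void foo_key_fini(const struct lu_context *ctx,
 *                               struct lu_context_key *key, void *data)
 *      {
 *              OBD_FREE(data, sizeof(struct foo_thread_info));
 *      }
 *
 *      static struct lu_context_key foo_key = {
 *              .lct_tags = FOO_CONTEXT_TAG,
 *              .lct_init = foo_key_init,
 *              .lct_fini = foo_key_fini
 *      };
 *
 *      ... lu_context_key_register(&foo_key) at module load ...
 *      info = lu_context_key_get(ctx, &foo_key);
 */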

static void keys_fini(struct lu_context *ctx)
{
        int i;

        if (ctx->lc_value != NULL) {
                for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) {
                        if (ctx->lc_value[i] != NULL) {
                                struct lu_context_key *key;

                                key = lu_keys[i];
                                LASSERT(key != NULL);
                                LASSERT(key->lct_fini != NULL);
                                LASSERT(key->lct_used > 1);

                                key->lct_fini(ctx, key, ctx->lc_value[i]);
                                key->lct_used--;
                                ctx->lc_value[i] = NULL;
                        }
                }
                OBD_FREE(ctx->lc_value,
                         ARRAY_SIZE(lu_keys) * sizeof ctx->lc_value[0]);
                ctx->lc_value = NULL;
        }
}

static int keys_init(struct lu_context *ctx)
{
        int i;
        int result;

        OBD_ALLOC(ctx->lc_value, ARRAY_SIZE(lu_keys) * sizeof ctx->lc_value[0]);
        if (ctx->lc_value != NULL) {
                for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) {
                        struct lu_context_key *key;

                        key = lu_keys[i];
                        if (key != NULL && key->lct_tags & ctx->lc_tags) {
                                void *value;

                                LASSERT(key->lct_init != NULL);
                                LASSERT(key->lct_index == i);

                                value = key->lct_init(ctx, key);
                                if (IS_ERR(value)) {
                                        keys_fini(ctx);
                                        return PTR_ERR(value);
                                }
                                key->lct_used++;
                                ctx->lc_value[i] = value;
                        }
                }
                result = 0;
        } else
                result = -ENOMEM;
        return result;
}

/*
 * Initialize context data-structure. Create values for all keys.
 */
int lu_context_init(struct lu_context *ctx, __u32 tags)
{
        memset(ctx, 0, sizeof *ctx);
        ctx->lc_tags = tags;
        return keys_init(ctx);
}
EXPORT_SYMBOL(lu_context_init);

/*
 * Finalize context data-structure. Destroy key values.
 */
void lu_context_fini(struct lu_context *ctx)
{
        keys_fini(ctx);
}
EXPORT_SYMBOL(lu_context_fini);

/*
 * Called before entering context.
 */
void lu_context_enter(struct lu_context *ctx)
{
}
EXPORT_SYMBOL(lu_context_enter);

/*
 * Called after exiting from @ctx
 */
void lu_context_exit(struct lu_context *ctx)
{
}
EXPORT_SYMBOL(lu_context_exit);
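
/*
 * Sketch of a context life-cycle for a service thread (the tag value and the
 * work done inside the context are illustrative only):
 *
 *      struct lu_context ctx;
 *      int rc;
 *
 *      rc = lu_context_init(&ctx, tags);
 *      if (rc == 0) {
 *              lu_context_enter(&ctx);
 *              ... handle request, using lu_context_key_get(&ctx, ...) ...
 *              lu_context_exit(&ctx);
 *              lu_context_fini(&ctx);
 *      }
 */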