lustre/kernel_patches/patches/lockmeter.patch
1
2 From: Ray Bryant <raybry@sgi.com>
3
4 This version of the lockmeter patch has been updated to work with
5 the out-of-line spinlocks that were recently merged into the mainline.
6 Basically what is done here is that if CONFIG_LOCKMETER is set, then the
7 lock routines in kernel/spinlock.c are not used and the corresponding
8 versions in kernel/lockmeter.c are used instead.  The former set of
9 lock routines call _raw_ spin lock code defined by the architecture;
10 the latter call _metered_ versions of the spinlock routines, which in
11 turn call the _raw_ routines.  The versions in the two files will have
12 to be kept in sync manually, but given that the lock APIs don't change
13 very often, that should be ok.  The lockmeter.c versions are written
14 so that the address of the original caller is passed to the _metered_
15 lock routines; otherwise all lock requests would look like they were
16 coming from the lockmeter.c versions of the lock routine.
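
As a rough illustration of that layering (this is not code from the patch; the
body of _metered_spin_lock() and the lstat_update() argument order are only
assumptions based on the declarations that appear further down), the
CONFIG_LOCKMETER entry points look something like this:

    /* sketch only: the real routines live in kernel/lockmeter.c */
    static void _metered_spin_lock(spinlock_t *lock, void *caller_pc)
    {
            if (_raw_spin_trylock(lock)) {
                    /* uncontended acquire */
                    lstat_update(lock, caller_pc, LSTAT_ACT_NO_WAIT);
            } else {
                    /* record that this caller had to spin, then take the lock */
                    lstat_update(lock, caller_pc, LSTAT_ACT_SPIN);
                    _raw_spin_lock(lock);
            }
    }

    /* out-of-line replacement for the kernel/spinlock.c entry point when   */
    /* CONFIG_LOCKMETER=y: the event is charged to whoever called           */
    /* _spin_lock(), not to the lockmeter.c wrapper itself                  */
    void _spin_lock(spinlock_t *lock)
    {
            preempt_disable();
            _metered_spin_lock(lock, __builtin_return_address(0));
    }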
17
18 I've had trouble testing this on an Altix, so I have really only tested this
19 on i386.  But the changes are almost exclusively in machine-independent
20 code, so that should be OK.  I did do some fixup in the sparc64 arch files,
21 so someone with such a box should test this stuff.
22
23 The existing lockstat program will work with this patch, although we have
24 had to add some "#include <errno.h>" lines to that program to get it to
25 compile properly.  An updated version of lockstat will be posted to
26 oss.sgi.com in the near future.
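
For reference, the interface lockstat talks to is the /proc/lockmeter entry
added below: reads are served by get_lockmeter_info(), and a write goes to
put_lockmeter_info(), which resets the counters.  A minimal reader (assuming
nothing beyond the existence of that file; the real lockstat tool also decodes
the lstat_* structures and issues control writes whose format is defined by
kernel/lockmeter.c and not reproduced here) could look like:

    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
            char buf[8192];
            ssize_t n;
            long total = 0;
            int fd = open("/proc/lockmeter", O_RDONLY);

            if (fd < 0) {
                    fprintf(stderr, "open(/proc/lockmeter): %s\n", strerror(errno));
                    return 1;
            }
            /* a real tool would decode lstat_directory_entry_t and       */
            /* lstat_lock_counts_t records here instead of counting bytes */
            while ((n = read(fd, buf, sizeof(buf))) > 0)
                    total += n;
            if (n < 0)
                    fprintf(stderr, "read: %s\n", strerror(errno));
            printf("read %ld bytes of lockmeter data\n", total);
            close(fd);
            return 0;
    }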
27
28 Signed-off-by: Ray Bryant <raybry@sgi.com>
29
30 =========================================================================
31
32 DESC
33 ia64 CONFIG_LOCKMETER fix
34 EDESC
35 From: John Hawkes <hawkes@babylon.engr.sgi.com>
36
37 The 2.6.3-mm4 patch seems to have the CONFIG_LOCKMETER entry in the wrong spot
38 for ia64.  When I make this change, I can build and run an ia64
39 Lockmeter'ed kernel.
40 DESC
41 lockmeter-build-fix
42 EDESC
43 DESC
44 lockmeter for x86_64
45 EDESC
46 From: Alexander Nyberg <alexn@telia.com>
47
48 This is basically a cut-and-paste from the i386 code.  In some places, however,
49 unresolved addresses like [0x1000211eb38] show up, which is
50 a bit weird.  I'm hoping for a comment from any of the SGI guys, as the
51 code is so similar to i386 that I don't know if the problem lies below or in the
52 generic code.
53 Signed-off-by: Andrew Morton <akpm@osdl.org>
54 Index: linux/arch/i386/Kconfig.debug
55 ===================================================================
56 --- linux.orig/arch/i386/Kconfig.debug
57 +++ linux/arch/i386/Kconfig.debug
58 @@ -67,6 +67,13 @@ config SCHEDSTATS
59           application, you can say N to avoid the very slight overhead
60           this adds.
61  
62 +config LOCKMETER
63 +       bool "Kernel lock metering"
64 +       depends on SMP
65 +       help
66 +         Say Y to enable kernel lock metering, which adds overhead to SMP locks,
67 +         but allows you to see various statistics using the lockstat command.
68 +
69  config X86_FIND_SMP_CONFIG
70         bool
71         depends on X86_LOCAL_APIC || X86_VOYAGER
72 Index: linux/arch/i386/lib/dec_and_lock.c
73 ===================================================================
74 --- linux.orig/arch/i386/lib/dec_and_lock.c
75 +++ linux/arch/i386/lib/dec_and_lock.c
76 @@ -10,6 +10,7 @@
77  #include <linux/spinlock.h>
78  #include <asm/atomic.h>
79  
80 +#ifndef ATOMIC_DEC_AND_LOCK
81  int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock)
82  {
83         int counter;
84 @@ -38,3 +39,5 @@ slow_path:
85         spin_unlock(lock);
86         return 0;
87  }
88 +#endif
89 +
90 Index: linux/arch/ia64/Kconfig.debug
91 ===================================================================
92 --- linux.orig/arch/ia64/Kconfig.debug
93 +++ linux/arch/ia64/Kconfig.debug
94 @@ -72,4 +72,11 @@ config SYSVIPC_COMPAT
95         depends on COMPAT && SYSVIPC
96         default y
97  
98 +config LOCKMETER
99 +       bool "Kernel lock metering"
100 +       depends on SMP
101 +       help
102 +         Say Y to enable kernel lock metering, which adds overhead to SMP locks,
103 +         but allows you to see various statistics using the lockstat command.
104 +
105  endmenu
106 Index: linux/arch/ia64/lib/dec_and_lock.c
107 ===================================================================
108 --- linux.orig/arch/ia64/lib/dec_and_lock.c
109 +++ linux/arch/ia64/lib/dec_and_lock.c
110 @@ -13,6 +13,7 @@
111  #include <linux/spinlock.h>
112  #include <asm/atomic.h>
113  
114 +#ifndef CONFIG_LOCKMETER
115  /*
116   * Decrement REFCOUNT and if the count reaches zero, acquire the spinlock.  Both of these
117   * operations have to be done atomically, so that the count doesn't drop to zero without
118 @@ -40,3 +41,4 @@ atomic_dec_and_lock (atomic_t *refcount,
119  }
120  
121  EXPORT_SYMBOL(atomic_dec_and_lock);
122 +#endif
123 Index: linux/arch/sparc64/Kconfig.debug
124 ===================================================================
125 --- linux.orig/arch/sparc64/Kconfig.debug
126 +++ linux/arch/sparc64/Kconfig.debug
127 @@ -33,12 +33,19 @@ config DEBUG_BOOTMEM
128         depends on DEBUG_KERNEL
129         bool "Debug BOOTMEM initialization"
130  
131 +config LOCKMETER
132 +       bool "Kernel lock metering"
133 +       depends on SMP && !PREEMPT
134 +       help
135 +         Say Y to enable kernel lock metering, which adds overhead to SMP locks,
136 +         but allows you to see various statistics using the lockstat command.
137 +
138  # We have a custom atomic_dec_and_lock() implementation but it's not
139  # compatible with spinlock debugging so we need to fall back on
140  # the generic version in that case.
141  config HAVE_DEC_LOCK
142         bool
143 -       depends on SMP && !DEBUG_SPINLOCK
144 +       depends on SMP && !DEBUG_SPINLOCK && !LOCKMETER
145         default y
146  
147  config MCOUNT
148 Index: linux/arch/x86_64/Kconfig.debug
149 ===================================================================
150 --- linux.orig/arch/x86_64/Kconfig.debug
151 +++ linux/arch/x86_64/Kconfig.debug
152 @@ -66,4 +66,11 @@ config IOMMU_LEAK
153  #config X86_REMOTE_DEBUG
154  #       bool "kgdb debugging stub"
155  
156 +config LOCKMETER
157 +       bool "Kernel lock metering"
158 +       depends on SMP
159 +       help
160 +         Say Y to enable kernel lock metering, which adds overhead to SMP locks,
161 +         but allows you to see various statistics using the lockstat command.
162 +
163  endmenu
164 Index: linux/arch/x86_64/lib/dec_and_lock.c
165 ===================================================================
166 --- linux.orig/arch/x86_64/lib/dec_and_lock.c
167 +++ linux/arch/x86_64/lib/dec_and_lock.c
168 @@ -10,6 +10,7 @@
169  #include <linux/spinlock.h>
170  #include <asm/atomic.h>
171  
172 +#ifndef ATOMIC_DEC_AND_LOCK
173  int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock)
174  {
175         int counter;
176 @@ -38,3 +39,4 @@ slow_path:
177         spin_unlock(lock);
178         return 0;
179  }
180 +#endif
181 Index: linux/fs/proc/proc_misc.c
182 ===================================================================
183 --- linux.orig/fs/proc/proc_misc.c
184 +++ linux/fs/proc/proc_misc.c
185 @@ -578,6 +578,36 @@ static void create_seq_entry(char *name,
186                 entry->proc_fops = f;
187  }
188  
189 +#ifdef CONFIG_LOCKMETER
190 +extern ssize_t get_lockmeter_info(char *, size_t, loff_t *);
191 +extern ssize_t put_lockmeter_info(const char *, size_t);
192 +extern int get_lockmeter_info_size(void);
193 +
194 +/*
195 + * This function accesses lock metering information.
196 + */
197 +static ssize_t read_lockmeter(struct file *file, char *buf,
198 +                             size_t count, loff_t *ppos)
199 +{
200 +       return get_lockmeter_info(buf, count, ppos);
201 +}
202 +
203 +/*
204 + * Writing to /proc/lockmeter resets the counters
205 + */
206 +static ssize_t write_lockmeter(struct file * file, const char * buf,
207 +                              size_t count, loff_t *ppos)
208 +{
209 +       return put_lockmeter_info(buf, count);
210 +}
211 +
212 +static struct file_operations proc_lockmeter_operations = {
213 +       NULL,           /* lseek */
214 +       read:           read_lockmeter,
215 +       write:          write_lockmeter,
216 +};
217 +#endif  /* CONFIG_LOCKMETER */
218 +
219  void __init proc_misc_init(void)
220  {
221         struct proc_dir_entry *entry;
222 @@ -638,6 +668,13 @@ void __init proc_misc_init(void)
223         if (entry)
224                 entry->proc_fops = &proc_sysrq_trigger_operations;
225  #endif
226 +#ifdef CONFIG_LOCKMETER
227 +       entry = create_proc_entry("lockmeter", S_IWUSR | S_IRUGO, NULL);
228 +       if (entry) {
229 +               entry->proc_fops = &proc_lockmeter_operations;
230 +               entry->size = get_lockmeter_info_size();
231 +       }
232 +#endif
233  #ifdef CONFIG_PPC32
234         {
235                 extern struct file_operations ppc_htab_operations;
236 Index: linux/include/asm-alpha/lockmeter.h
237 ===================================================================
238 --- linux.orig/include/asm-alpha/lockmeter.h
239 +++ linux/include/asm-alpha/lockmeter.h
240 @@ -0,0 +1,84 @@
241 +/*
242 + *  Written by John Hawkes (hawkes@sgi.com)
243 + *  Based on klstat.h by Jack Steiner (steiner@sgi.com)
244 + *
245 + *  Modified by Peter Rival (frival@zk3.dec.com)
246 + */
247 +
248 +#ifndef _ALPHA_LOCKMETER_H
249 +#define _ALPHA_LOCKMETER_H
250 +
251 +#include <asm/hwrpb.h>
252 +#define CPU_CYCLE_FREQUENCY    hwrpb->cycle_freq
253 +
254 +#define get_cycles64()         get_cycles()
255 +
256 +#define THIS_CPU_NUMBER                smp_processor_id()
257 +
258 +#include <linux/version.h>
259 +
260 +#define SPINLOCK_MAGIC_INIT /**/
261 +
262 +/*
263 + * Macros to cache and retrieve an index value inside of a lock
264 + * these macros assume that there are less than 65536 simultaneous
265 + * (read mode) holders of a rwlock.
266 + * We also assume that the hash table has less than 32767 entries.
267 + * the high order bit is used for write locking a rw_lock
268 + * Note: although these defines and macros are the same as what is being used
269 + *       in include/asm-i386/lockmeter.h, they are present here to easily
270 + *      allow an alternate Alpha implementation.
271 + */
272 +/*
273 + * instrumented spinlock structure -- never used to allocate storage
274 + * only used in macros below to overlay a spinlock_t
275 + */
276 +typedef struct inst_spinlock_s {
277 +       /* remember, Alpha is little endian */
278 +       unsigned short lock;
279 +       unsigned short index;
280 +} inst_spinlock_t;
281 +#define PUT_INDEX(lock_ptr,indexv)     ((inst_spinlock_t *)(lock_ptr))->index = indexv
282 +#define GET_INDEX(lock_ptr)            ((inst_spinlock_t *)(lock_ptr))->index
283 +
284 +/*
285 + * macros to cache and retrieve an index value in a read/write lock
286 + * as well as the cpu where a reader busy period started
287 + * we use the 2nd word (the debug word) for this, so require the
288 + * debug word to be present
289 + */
290 +/*
291 + * instrumented rwlock structure -- never used to allocate storage
292 + * only used in macros below to overlay a rwlock_t
293 + */
294 +typedef struct inst_rwlock_s {
295 +       volatile int lock;
296 +       unsigned short index;
297 +       unsigned short cpu;
298 +} inst_rwlock_t;
299 +#define PUT_RWINDEX(rwlock_ptr,indexv) ((inst_rwlock_t *)(rwlock_ptr))->index = indexv
300 +#define GET_RWINDEX(rwlock_ptr)                ((inst_rwlock_t *)(rwlock_ptr))->index
301 +#define PUT_RW_CPU(rwlock_ptr,cpuv)    ((inst_rwlock_t *)(rwlock_ptr))->cpu = cpuv
302 +#define GET_RW_CPU(rwlock_ptr)         ((inst_rwlock_t *)(rwlock_ptr))->cpu
303 +
304 +/*
305 + * return true if rwlock is write locked
306 + * (note that other lock attempts can cause the lock value to be negative)
307 + */
308 +#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) (((inst_rwlock_t *)rwlock_ptr)->lock & 1)
309 +#define IABS(x) ((x) > 0 ? (x) : -(x))
310 +
311 +#define RWLOCK_READERS(rwlock_ptr)     rwlock_readers(rwlock_ptr)
312 +extern inline int rwlock_readers(rwlock_t *rwlock_ptr)
313 +{
314 +       int tmp = (int) ((inst_rwlock_t *)rwlock_ptr)->lock;
315 +       /* readers subtract 2, so we have to:           */
316 +       /*      - andnot off a possible writer (bit 0)  */
317 +       /*      - get the absolute value                */
318 +       /*      - divide by 2 (right shift by one)      */
319 +       /* to find the number of readers                */
320 +       if (tmp == 0) return(0);
321 +       else return(IABS(tmp & ~1)>>1);
322 +}
323 +
324 +#endif /* _ALPHA_LOCKMETER_H */
325 Index: linux/include/asm-alpha/spinlock.h
326 ===================================================================
327 --- linux.orig/include/asm-alpha/spinlock.h
328 +++ linux/include/asm-alpha/spinlock.h
329 @@ -6,6 +6,10 @@
330  #include <linux/kernel.h>
331  #include <asm/current.h>
332  
333 +#ifdef CONFIG_LOCKMETER
334 +#undef DEBUG_SPINLOCK
335 +#undef DEBUG_RWLOCK
336 +#endif
337  
338  /*
339   * Simple spin lock operations.  There are two variants, one clears IRQ's
340 @@ -96,9 +100,18 @@ static inline int _raw_spin_trylock(spin
341  
342  typedef struct {
343         volatile unsigned int write_lock:1, read_counter:31;
344 +#ifdef CONFIG_LOCKMETER
345 +       /* required for LOCKMETER since all bits in lock are used */
346 +       /* need this storage for CPU and lock INDEX ............. */
347 +       unsigned magic;
348 +#endif
349  } /*__attribute__((aligned(32)))*/ rwlock_t;
350  
351 +#ifdef CONFIG_LOCKMETER
352 +#define RW_LOCK_UNLOCKED (rwlock_t) { 0, 0, 0 }
353 +#else
354  #define RW_LOCK_UNLOCKED (rwlock_t) { 0, 0 }
355 +#endif
356  
357  #define rwlock_init(x) do { *(x) = RW_LOCK_UNLOCKED; } while(0)
358  #define rwlock_is_locked(x)    (*(volatile int *)(x) != 0)
359 @@ -193,4 +206,41 @@ static inline void _raw_read_unlock(rwlo
360         : "m" (*lock) : "memory");
361  }
362  
363 +#ifdef CONFIG_LOCKMETER
364 +static inline int _raw_write_trylock(rwlock_t *lock)
365 +{
366 +       long temp,result;
367 +
368 +       __asm__ __volatile__(
369 +       "       ldl_l %1,%0\n"
370 +       "       mov $31,%2\n"
371 +       "       bne %1,1f\n"
372 +       "       or $31,1,%2\n"
373 +       "       stl_c %2,%0\n"
374 +       "1:     mb\n"
375 +       : "=m" (*(volatile int *)lock), "=&r" (temp), "=&r" (result)
376 +       : "m" (*(volatile int *)lock)
377 +       );
378 +
379 +       return (result);
380 +}
381 +
382 +static inline int _raw_read_trylock(rwlock_t *lock)
383 +{
384 +       unsigned long temp,result;
385 +
386 +       __asm__ __volatile__(
387 +       "       ldl_l %1,%0\n"
388 +       "       mov $31,%2\n"
389 +       "       blbs %1,1f\n"
390 +       "       subl %1,2,%2\n"
391 +       "       stl_c %2,%0\n"
392 +       "1:     mb\n"
393 +       : "=m" (*(volatile int *)lock), "=&r" (temp), "=&r" (result)
394 +       : "m" (*(volatile int *)lock)
395 +       );
396 +       return (result);
397 +}
398 +#endif /* CONFIG_LOCKMETER */
399 +
400  #endif /* _ALPHA_SPINLOCK_H */
401 Index: linux/include/asm-i386/lockmeter.h
402 ===================================================================
403 --- linux.orig/include/asm-i386/lockmeter.h
404 +++ linux/include/asm-i386/lockmeter.h
405 @@ -0,0 +1,115 @@
406 +/*
407 + *  Copyright (C) 1999,2000 Silicon Graphics, Inc.
408 + *
409 + *  Written by John Hawkes (hawkes@sgi.com)
410 + *  Based on klstat.h by Jack Steiner (steiner@sgi.com)
411 + *
412 + *  Modified by Ray Bryant (raybry@us.ibm.com)
413 + *  Changes Copyright (C) 2000 IBM, Inc.
414 + *  Added save of index in spinlock_t to improve efficiency
415 + *  of "hold" time reporting for spinlocks.
416 + *  Added support for hold time statistics for read and write
417 + *  locks.
418 + *  Moved machine dependent code here from include/lockmeter.h.
419 + *
420 + */
421 +
422 +#ifndef _I386_LOCKMETER_H
423 +#define _I386_LOCKMETER_H
424 +
425 +#include <asm/spinlock.h>
426 +#include <asm/rwlock.h>
427 +
428 +#include <linux/version.h>
429 +
430 +#ifdef __KERNEL__
431 +extern unsigned long cpu_khz;
432 +#define CPU_CYCLE_FREQUENCY    (cpu_khz * 1000)
433 +#else
434 +#define CPU_CYCLE_FREQUENCY    450000000
435 +#endif
436 +
437 +#define THIS_CPU_NUMBER                smp_processor_id()
438 +
439 +/*
440 + * macros to cache and retrieve an index value inside of a spin lock
441 + * these macros assume that there are less than 65536 simultaneous
442 + * (read mode) holders of a rwlock.  Not normally a problem!!
443 + * we also assume that the hash table has less than 65535 entries.
444 + */
445 +/*
446 + * instrumented spinlock structure -- never used to allocate storage
447 + * only used in macros below to overlay a spinlock_t
448 + */
449 +typedef struct inst_spinlock_s {
450 +       /* remember, Intel is little endian */
451 +       unsigned short lock;
452 +       unsigned short index;
453 +} inst_spinlock_t;
454 +#define PUT_INDEX(lock_ptr,indexv) ((inst_spinlock_t *)(lock_ptr))->index = indexv
455 +#define GET_INDEX(lock_ptr)        ((inst_spinlock_t *)(lock_ptr))->index
456 +
457 +/*
458 + * macros to cache and retrieve an index value in a read/write lock
459 + * as well as the cpu where a reader busy period started
460 + * we use the 2nd word (the debug word) for this, so require the
461 + * debug word to be present
462 + */
463 +/*
464 + * instrumented rwlock structure -- never used to allocate storage
465 + * only used in macros below to overlay a rwlock_t
466 + */
467 +typedef struct inst_rwlock_s {
468 +       volatile int lock;
469 +       unsigned short index;
470 +       unsigned short cpu;
471 +} inst_rwlock_t;
472 +#define PUT_RWINDEX(rwlock_ptr,indexv) ((inst_rwlock_t *)(rwlock_ptr))->index = indexv
473 +#define GET_RWINDEX(rwlock_ptr)        ((inst_rwlock_t *)(rwlock_ptr))->index
474 +#define PUT_RW_CPU(rwlock_ptr,cpuv)    ((inst_rwlock_t *)(rwlock_ptr))->cpu = cpuv
475 +#define GET_RW_CPU(rwlock_ptr)         ((inst_rwlock_t *)(rwlock_ptr))->cpu
476 +
477 +/*
478 + * return the number of readers for a rwlock_t
479 + */
480 +#define RWLOCK_READERS(rwlock_ptr)   rwlock_readers(rwlock_ptr)
481 +
482 +extern inline int rwlock_readers(rwlock_t *rwlock_ptr)
483 +{
484 +       int tmp = (int) rwlock_ptr->lock;
485 +       /* read and write lock attempts may cause the lock value to temporarily */
486 +       /* be negative.  Until it is >= 0 we know nothing (i. e. can't tell if  */
487 +       /* is -1 because it was write locked and somebody tried to read lock it */
488 +       /* or if it is -1 because it was read locked and somebody tried to write*/
489 +       /* lock it. ........................................................... */
490 +       do {
491 +               tmp = (int) rwlock_ptr->lock;
492 +       } while (tmp < 0);
493 +       if (tmp == 0) return(0);
494 +       else return(RW_LOCK_BIAS-tmp);
495 +}
496 +
497 +/*
498 + * return true if rwlock is write locked
499 + * (note that other lock attempts can cause the lock value to be negative)
500 + */
501 +#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) ((rwlock_ptr)->lock <= 0)
502 +#define IABS(x) ((x) > 0 ? (x) : -(x))
503 +#define RWLOCK_IS_READ_LOCKED(rwlock_ptr)  ((IABS((rwlock_ptr)->lock) % RW_LOCK_BIAS) != 0)
504 +
505 +/* this is a lot of typing just to get gcc to emit "rdtsc" */
506 +static inline long long get_cycles64 (void)
507 +{
508 +       union longlong_u {
509 +               long long intlong;
510 +               struct intint_s {
511 +                       uint32_t eax;
512 +                       uint32_t edx;
513 +               } intint;
514 +       } longlong;
515 +
516 +       rdtsc(longlong.intint.eax,longlong.intint.edx);
517 +       return longlong.intlong;
518 +}
519 +
520 +#endif /* _I386_LOCKMETER_H */
521 Index: linux/include/asm-i386/spinlock.h
522 ===================================================================
523 --- linux.orig/include/asm-i386/spinlock.h
524 +++ linux/include/asm-i386/spinlock.h
525 @@ -163,6 +163,11 @@ static inline void _raw_spin_lock_flags 
526   */
527  typedef struct {
528         volatile unsigned int lock;
529 +#ifdef CONFIG_LOCKMETER
530 +       /* required for LOCKMETER since all bits in lock are used */
531 +       /* and we need this storage for CPU and lock INDEX        */
532 +       unsigned lockmeter_magic;
533 +#endif
534  #ifdef CONFIG_DEBUG_SPINLOCK
535         unsigned magic;
536  #endif
537 @@ -170,11 +175,19 @@ typedef struct {
538  
539  #define RWLOCK_MAGIC   0xdeaf1eed
540  
541 +#ifdef CONFIG_LOCKMETER
542 +#ifdef CONFIG_DEBUG_SPINLOCK
543 +#define RWLOCK_MAGIC_INIT      , 0, RWLOCK_MAGIC
544 +#else
545 +#define RWLOCK_MAGIC_INIT      , 0
546 +#endif
547 +#else /* !CONFIG_LOCKMETER */
548  #ifdef CONFIG_DEBUG_SPINLOCK
549  #define RWLOCK_MAGIC_INIT      , RWLOCK_MAGIC
550  #else
551  #define RWLOCK_MAGIC_INIT      /* */
552  #endif
553 +#endif /* !CONFIG_LOCKMETER */
554  
555  #define RW_LOCK_UNLOCKED (rwlock_t) { RW_LOCK_BIAS RWLOCK_MAGIC_INIT }
556  
557 @@ -212,6 +225,16 @@ static inline void _raw_write_lock(rwloc
558  #define _raw_read_unlock(rw)           asm volatile("lock ; incl %0" :"=m" ((rw)->lock) : : "memory")
559  #define _raw_write_unlock(rw)  asm volatile("lock ; addl $" RW_LOCK_BIAS_STR ",%0":"=m" ((rw)->lock) : : "memory")
560  
561 +static inline int _raw_read_trylock(rwlock_t *lock)
562 +{
563 +       atomic_t *count = (atomic_t *)lock;
564 +       atomic_dec(count);
565 +       if (atomic_read(count) >= 0)
566 +               return 1;
567 +       atomic_inc(count);
568 +       return 0;
569 +}
570 +
571  static inline int _raw_write_trylock(rwlock_t *lock)
572  {
573         atomic_t *count = (atomic_t *)lock;
574 @@ -221,4 +244,47 @@ static inline int _raw_write_trylock(rwl
575         return 0;
576  }
577  
578 +#if defined(CONFIG_LOCKMETER) && defined(CONFIG_HAVE_DEC_LOCK)
579 +extern void _metered_spin_lock  (spinlock_t *lock, void *caller_pc);
580 +extern void _metered_spin_unlock(spinlock_t *lock);
581 +
582 +/*
583 + *  Matches what is in arch/i386/lib/dec_and_lock.c, except this one is
584 + *  "static inline" so that the spin_lock(), if actually invoked, is charged
585 + *  against the real caller, not against the catch-all atomic_dec_and_lock
586 + */
587 +static inline int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock)
588 +{
589 +       int counter;
590 +       int newcount;
591 +
592 +repeat:
593 +       counter = atomic_read(atomic);
594 +       newcount = counter-1;
595 +
596 +       if (!newcount)
597 +               goto slow_path;
598 +
599 +       asm volatile("lock; cmpxchgl %1,%2"
600 +               :"=a" (newcount)
601 +               :"r" (newcount), "m" (atomic->counter), "0" (counter));
602 +
603 +       /* If the above failed, "eax" will have changed */
604 +       if (newcount != counter)
605 +               goto repeat;
606 +       return 0;
607 +
608 +slow_path:
609 +       preempt_disable();
610 +       _metered_spin_lock(lock, __builtin_return_address(0));
611 +       if (atomic_dec_and_test(atomic))
612 +               return 1;
613 +       _metered_spin_unlock(lock);
614 +       preempt_enable();
615 +       return 0;
616 +}
617 +
618 +#define ATOMIC_DEC_AND_LOCK
619 +#endif
620 +
621  #endif /* __ASM_SPINLOCK_H */
622 Index: linux/include/asm-ia64/lockmeter.h
623 ===================================================================
624 --- linux.orig/include/asm-ia64/lockmeter.h
625 +++ linux/include/asm-ia64/lockmeter.h
626 @@ -0,0 +1,72 @@
627 +/*
628 + *  Copyright (C) 1999,2000 Silicon Graphics, Inc.
629 + *
630 + *  Written by John Hawkes (hawkes@sgi.com)
631 + *  Based on klstat.h by Jack Steiner (steiner@sgi.com)
632 + */
633 +
634 +#ifndef _IA64_LOCKMETER_H
635 +#define _IA64_LOCKMETER_H
636 +
637 +#ifdef local_cpu_data
638 +#define CPU_CYCLE_FREQUENCY    local_cpu_data->itc_freq
639 +#else
640 +#define CPU_CYCLE_FREQUENCY    my_cpu_data.itc_freq
641 +#endif
642 +#define get_cycles64()         get_cycles()
643 +
644 +#define THIS_CPU_NUMBER                smp_processor_id()
645 +
646 +/*
647 + * macros to cache and retrieve an index value inside of a lock
648 + * these macros assume that there are less than 65536 simultaneous
649 + * (read mode) holders of a rwlock.
650 + * we also assume that the hash table has less than 32767 entries.
651 + */
652 +/*
653 + * instrumented spinlock structure -- never used to allocate storage
654 + * only used in macros below to overlay a spinlock_t
655 + */
656 +typedef struct inst_spinlock_s {
657 +       /* remember, Intel is little endian */
658 +       volatile unsigned short lock;
659 +       volatile unsigned short index;
660 +} inst_spinlock_t;
661 +#define PUT_INDEX(lock_ptr,indexv) ((inst_spinlock_t *)(lock_ptr))->index = indexv
662 +#define GET_INDEX(lock_ptr)        ((inst_spinlock_t *)(lock_ptr))->index
663 +
664 +/*
665 + * macros to cache and retrieve an index value in a read/write lock
666 + * as well as the cpu where a reader busy period started
667 + * we use the 2nd word (the debug word) for this, so require the
668 + * debug word to be present
669 + */
670 +/*
671 + * instrumented rwlock structure -- never used to allocate storage
672 + * only used in macros below to overlay a rwlock_t
673 + */
674 +typedef struct inst_rwlock_s {
675 +       volatile int read_counter:31;
676 +       volatile int write_lock:1;
677 +       volatile unsigned short index;
678 +       volatile unsigned short cpu;
679 +} inst_rwlock_t;
680 +#define PUT_RWINDEX(rwlock_ptr,indexv) ((inst_rwlock_t *)(rwlock_ptr))->index = indexv
681 +#define GET_RWINDEX(rwlock_ptr)        ((inst_rwlock_t *)(rwlock_ptr))->index
682 +#define PUT_RW_CPU(rwlock_ptr,cpuv)    ((inst_rwlock_t *)(rwlock_ptr))->cpu = cpuv
683 +#define GET_RW_CPU(rwlock_ptr)         ((inst_rwlock_t *)(rwlock_ptr))->cpu
684 +
685 +/*
686 + * return the number of readers for a rwlock_t
687 + */
688 +#define RWLOCK_READERS(rwlock_ptr)     ((rwlock_ptr)->read_counter)
689 +
690 +/*
691 + * return true if rwlock is write locked
692 + * (note that other lock attempts can cause the lock value to be negative)
693 + */
694 +#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) ((rwlock_ptr)->write_lock)
695 +#define RWLOCK_IS_READ_LOCKED(rwlock_ptr)  ((rwlock_ptr)->read_counter)
696 +
697 +#endif /* _IA64_LOCKMETER_H */
698 +
699 Index: linux/include/asm-ia64/spinlock.h
700 ===================================================================
701 --- linux.orig/include/asm-ia64/spinlock.h
702 +++ linux/include/asm-ia64/spinlock.h
703 @@ -116,8 +116,18 @@ do {                                                                                       \
704  typedef struct {
705         volatile unsigned int read_counter      : 31;
706         volatile unsigned int write_lock        :  1;
707 +#ifdef CONFIG_LOCKMETER
708 +       /* required for LOCKMETER since all bits in lock are used */
709 +       /* and we need this storage for CPU and lock INDEX        */
710 +       unsigned lockmeter_magic;
711 +#endif
712  } rwlock_t;
713 +
714 +#ifdef CONFIG_LOCKMETER
715 +#define RW_LOCK_UNLOCKED (rwlock_t) { 0, 0, 0 }
716 +#else
717  #define RW_LOCK_UNLOCKED (rwlock_t) { 0, 0 }
718 +#endif
719  
720  #define rwlock_init(x)         do { *(x) = RW_LOCK_UNLOCKED; } while(0)
721  #define rwlock_is_locked(x)    (*(volatile int *) (x) != 0)
722 @@ -133,6 +143,48 @@ do {                                                                                       \
723         }                                                                               \
724  } while (0)
725  
726 +#ifdef CONFIG_LOCKMETER
727 +/*
728 + * HACK: This works, but still have a timing window that affects performance:
729 + * we see that no one owns the Write lock, then someone else grabs the Write
730 + * lock before we do a read_lock().
731 + * This means that on rare occasions our read_lock() will stall and spin-wait
732 + * until we acquire for Read, instead of simply returning a trylock failure.
733 + */
734 +static inline int _raw_read_trylock(rwlock_t *rw)
735 +{
736 +       if (rw->write_lock) {
737 +               return 0;
738 +       } else {
739 +               _raw_read_lock(rw);
740 +               return 1;
741 +       }
742 +}
743 +
744 +static inline int _raw_write_trylock(rwlock_t *rw)
745 +{
746 +       if (!(rw->write_lock)) {
747 +           /* isn't currently write-locked... that looks promising... */
748 +           if (test_and_set_bit(31, rw) == 0) {
749 +               /* now it is write-locked by me... */
750 +               if (rw->read_counter) {
751 +                   /* really read-locked, so release write-lock and fail */
752 +                   clear_bit(31, rw);
753 +               } else {
754 +                   /* we've got the write-lock, no read-lockers... success! */
755 +                   barrier();
756 +                   return 1;
757 +               }
758 +
759 +           }
760 +       }
761 +
762 +       /* falls through ... fails to write-lock */
763 +       barrier();
764 +       return 0;
765 +}
766 +#endif
767 +
768  #define _raw_read_unlock(rw)                                   \
769  do {                                                           \
770         rwlock_t *__read_lock_ptr = (rw);                       \
771 @@ -196,4 +248,25 @@ do {                                                                               \
772         clear_bit(31, (x));                                                             \
773  })
774  
775 +#ifdef CONFIG_LOCKMETER
776 +extern void _metered_spin_lock  (spinlock_t *lock, void *caller_pc);
777 +extern void _metered_spin_unlock(spinlock_t *lock);
778 +
779 +/*
780 + *  Use a less efficient, and inline, atomic_dec_and_lock() if lockmetering
781 + *  so we can see the callerPC of who is actually doing the spin_lock().
782 + *  Otherwise, all we see is the generic rollup of all locks done by
783 + *  atomic_dec_and_lock().
784 + */
785 +static inline int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock)
786 +{
787 +       _metered_spin_lock(lock, __builtin_return_address(0));
788 +       if (atomic_dec_and_test(atomic))
789 +               return 1;
790 +       _metered_spin_unlock(lock);
791 +       return 0;
792 +}
793 +#define ATOMIC_DEC_AND_LOCK
794 +#endif
795 +
796  #endif /*  _ASM_IA64_SPINLOCK_H */
797 Index: linux/include/asm-mips/lockmeter.h
798 ===================================================================
799 --- linux.orig/include/asm-mips/lockmeter.h
800 +++ linux/include/asm-mips/lockmeter.h
801 @@ -0,0 +1,126 @@
802 +/*
803 + *  Copyright (C) 1999,2000 Silicon Graphics, Inc.
804 + *
805 + *  Written by John Hawkes (hawkes@sgi.com)
806 + *  Based on klstat.h by Jack Steiner (steiner@sgi.com)
807 + *  Ported to mips32 for Asita Technologies
808 + *   by D.J. Barrow ( dj.barrow@asitatechnologies.com )
809 + */
810 +#ifndef _ASM_LOCKMETER_H
811 +#define _ASM_LOCKMETER_H
812 +
813 +/* do_gettimeoffset is a function pointer on mips */
814 +/* & it is not included by <linux/time.h> */
815 +#include <asm/time.h>
816 +#include <linux/time.h>
817 +#include <asm/div64.h>
818 +
819 +#define SPINLOCK_MAGIC_INIT    /* */
820 +
821 +#define CPU_CYCLE_FREQUENCY    get_cpu_cycle_frequency()
822 +
823 +#define THIS_CPU_NUMBER                smp_processor_id()
824 +
825 +static uint32_t cpu_cycle_frequency = 0;
826 +
827 +static uint32_t get_cpu_cycle_frequency(void)
828 +{
829 +    /* a total hack, slow and invasive, but ... it works */
830 +    int sec;
831 +    uint32_t start_cycles;
832 +    struct timeval tv;
833 +
834 +    if (cpu_cycle_frequency == 0) {    /* uninitialized */
835 +       do_gettimeofday(&tv);
836 +       sec = tv.tv_sec;        /* set up to catch the tv_sec rollover */
837 +       while (sec == tv.tv_sec) { do_gettimeofday(&tv); }
838 +       sec = tv.tv_sec;        /* rolled over to a new sec value */
839 +       start_cycles = get_cycles();
840 +       while (sec == tv.tv_sec) { do_gettimeofday(&tv); }
841 +       cpu_cycle_frequency = get_cycles() - start_cycles;
842 +    }
843 +
844 +    return cpu_cycle_frequency;
845 +}
846 +
847 +extern struct timeval xtime;
848 +
849 +static uint64_t get_cycles64(void)
850 +{
851 +    static uint64_t last_get_cycles64 = 0;
852 +    uint64_t ret;
853 +    unsigned long sec;
854 +    unsigned long usec, usec_offset;
855 +
856 +again:
857 +    sec  = xtime.tv_sec;
858 +    usec = xtime.tv_usec;
859 +    usec_offset = do_gettimeoffset();
860 +    if ((xtime.tv_sec != sec)  ||
861 +       (xtime.tv_usec != usec)||
862 +       (usec_offset >= 20000))
863 +       goto again;
864 +
865 +    ret = ((uint64_t)(usec + usec_offset) * cpu_cycle_frequency);
866 +    /* We can't do a normal 64 bit division on mips without libgcc.a */
867 +    do_div(ret,1000000);
868 +    ret +=  ((uint64_t)sec * cpu_cycle_frequency);
869 +
870 +    /* XXX why does time go backwards?  do_gettimeoffset?  general time adj? */
871 +    if (ret <= last_get_cycles64)
872 +       ret  = last_get_cycles64+1;
873 +    last_get_cycles64 = ret;
874 +
875 +    return ret;
876 +}
877 +
878 +/*
879 + * macros to cache and retrieve an index value inside of a lock
880 + * these macros assume that there are less than 65536 simultaneous
881 + * (read mode) holders of a rwlock.
882 + * we also assume that the hash table has less than 32767 entries.
883 + * the high order bit is used for write locking a rw_lock
884 + */
885 +#define INDEX_MASK   0x7FFF0000
886 +#define READERS_MASK 0x0000FFFF
887 +#define INDEX_SHIFT 16
888 +#define PUT_INDEX(lockp,index)   \
889 +        lockp->lock = (((lockp->lock) & ~INDEX_MASK) | (index) << INDEX_SHIFT)
890 +#define GET_INDEX(lockp) \
891 +        (((lockp->lock) & INDEX_MASK) >> INDEX_SHIFT)
892 +
893 +/*
894 + * macros to cache and retrieve an index value in a read/write lock
895 + * as well as the cpu where a reader busy period started
896 + * we use the 2nd word (the debug word) for this, so require the
897 + * debug word to be present
898 + */
899 +/*
900 + * instrumented rwlock structure -- never used to allocate storage
901 + * only used in macros below to overlay a rwlock_t
902 + */
903 +typedef struct inst_rwlock_s {
904 +       volatile int lock;
905 +       unsigned short index;
906 +       unsigned short cpu;
907 +} inst_rwlock_t;
908 +#define PUT_RWINDEX(rwlock_ptr,indexv) ((inst_rwlock_t *)(rwlock_ptr))->index = indexv
909 +#define GET_RWINDEX(rwlock_ptr)        ((inst_rwlock_t *)(rwlock_ptr))->index
910 +#define PUT_RW_CPU(rwlock_ptr,cpuv)    ((inst_rwlock_t *)(rwlock_ptr))->cpu = cpuv
911 +#define GET_RW_CPU(rwlock_ptr)         ((inst_rwlock_t *)(rwlock_ptr))->cpu
912 +
913 +/*
914 + * return the number of readers for a rwlock_t
915 + */
916 +#define RWLOCK_READERS(rwlock_ptr)   rwlock_readers(rwlock_ptr)
917 +
918 +extern inline int rwlock_readers(rwlock_t *rwlock_ptr)
919 +{
920 +       int tmp = (int) rwlock_ptr->lock;
921 +       return (tmp >= 0) ? tmp : 0;
922 +}
923 +
924 +#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) ((rwlock_ptr)->lock < 0)
925 +#define RWLOCK_IS_READ_LOCKED(rwlock_ptr)  ((rwlock_ptr)->lock > 0)
926 +
927 +#endif /* _ASM_LOCKMETER_H */
928 Index: linux/include/asm-mips/spinlock.h
929 ===================================================================
930 --- linux.orig/include/asm-mips/spinlock.h
931 +++ linux/include/asm-mips/spinlock.h
932 @@ -92,9 +92,18 @@ static inline unsigned int _raw_spin_try
933  
934  typedef struct {
935         volatile unsigned int lock;
936 +#ifdef CONFIG_LOCKMETER
937 +       /* required for LOCKMETER since all bits in lock are used */
938 +       /* and we need this storage for CPU and lock INDEX        */
939 +       unsigned lockmeter_magic;
940 +#endif
941  } rwlock_t;
942  
943 +#ifdef CONFIG_LOCKMETER
944 +#define RW_LOCK_UNLOCKED (rwlock_t) { 0, 0 }
945 +#else
946  #define RW_LOCK_UNLOCKED (rwlock_t) { 0 }
947 +#endif
948  
949  #define rwlock_init(x)  do { *(x) = RW_LOCK_UNLOCKED; } while(0)
950  
951 Index: linux/include/asm-sparc64/lockmeter.h
952 ===================================================================
953 --- linux.orig/include/asm-sparc64/lockmeter.h
954 +++ linux/include/asm-sparc64/lockmeter.h
955 @@ -0,0 +1,45 @@
956 +/*
957 + * Copyright (C) 2000 Anton Blanchard (anton@linuxcare.com)
958 + * Copyright (C) 2003 David S. Miller (davem@redhat.com)
959 + */
960 +
961 +#ifndef _SPARC64_LOCKMETER_H
962 +#define _SPARC64_LOCKMETER_H
963 +
964 +#include <linux/smp.h>
965 +#include <asm/spinlock.h>
966 +#include <asm/timer.h>
967 +#include <asm/timex.h>
968 +
969 +/* Actually, this is not the CPU frequency but the system tick
970 + * frequency, which is good enough for lock metering.
971 + */
972 +#define CPU_CYCLE_FREQUENCY    (timer_tick_offset * HZ)
973 +#define THIS_CPU_NUMBER                smp_processor_id()
974 +
975 +#define PUT_INDEX(lock_ptr,indexv)     (lock_ptr)->index = (indexv)
976 +#define GET_INDEX(lock_ptr)            (lock_ptr)->index
977 +
978 +#define PUT_RWINDEX(rwlock_ptr,indexv) (rwlock_ptr)->index = (indexv)
979 +#define GET_RWINDEX(rwlock_ptr)        (rwlock_ptr)->index
980 +#define PUT_RW_CPU(rwlock_ptr,cpuv)    (rwlock_ptr)->cpu = (cpuv)
981 +#define GET_RW_CPU(rwlock_ptr)         (rwlock_ptr)->cpu
982 +
983 +#define RWLOCK_READERS(rwlock_ptr)     rwlock_readers(rwlock_ptr)
984 +
985 +extern inline int rwlock_readers(rwlock_t *rwlock_ptr)
986 +{
987 +       signed int tmp = rwlock_ptr->lock;
988 +
989 +       if (tmp > 0)
990 +               return tmp;
991 +       else
992 +               return 0;
993 +}
994 +
995 +#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr)     ((signed int)((rwlock_ptr)->lock) < 0)
996 +#define RWLOCK_IS_READ_LOCKED(rwlock_ptr)      ((signed int)((rwlock_ptr)->lock) > 0)
997 +
998 +#define get_cycles64() get_cycles()
999 +
1000 +#endif /* _SPARC64_LOCKMETER_H */
1001 Index: linux/include/asm-x86_64/lockmeter.h
1002 ===================================================================
1003 --- linux.orig/include/asm-x86_64/lockmeter.h
1004 +++ linux/include/asm-x86_64/lockmeter.h
1005 @@ -0,0 +1,102 @@
1006 +/*
1007 + *  Copyright (C) 1999,2000 Silicon Graphics, Inc.
1008 + *
1009 + *  Written by John Hawkes (hawkes@sgi.com)
1010 + *  Based on klstat.h by Jack Steiner (steiner@sgi.com)
1011 + *
1012 + *  Modified by Ray Bryant (raybry@us.ibm.com)
1013 + *  Changes Copyright (C) 2000 IBM, Inc.
1014 + *  Added save of index in spinlock_t to improve efficiency
1015 + *  of "hold" time reporting for spinlocks.
1016 + *  Added support for hold time statistics for read and write
1017 + *  locks.
1018 + *  Moved machine dependent code here from include/lockmeter.h.
1019 + *
1020 + */
1021 +
1022 +#ifndef _X8664_LOCKMETER_H
1023 +#define _X8664_LOCKMETER_H
1024 +
1025 +#include <asm/spinlock.h>
1026 +#include <asm/rwlock.h>
1027 +
1028 +#include <linux/version.h>
1029 +
1030 +#ifdef __KERNEL__
1031 +extern unsigned int cpu_khz;
1032 +#define CPU_CYCLE_FREQUENCY    (cpu_khz * 1000)
1033 +#else
1034 +#define CPU_CYCLE_FREQUENCY    450000000
1035 +#endif
1036 +
1037 +#define THIS_CPU_NUMBER                smp_processor_id()
1038 +
1039 +/*
1040 + * macros to cache and retrieve an index value inside of a spin lock
1041 + * these macros assume that there are less than 65536 simultaneous
1042 + * (read mode) holders of a rwlock.  Not normally a problem!!
1043 + * we also assume that the hash table has less than 65535 entries.
1044 + */
1045 +/*
1046 + * instrumented spinlock structure -- never used to allocate storage
1047 + * only used in macros below to overlay a spinlock_t
1048 + */
1049 +typedef struct inst_spinlock_s {
1050 +       /* remember, Intel is little endian */
1051 +       unsigned short lock;
1052 +       unsigned short index;
1053 +} inst_spinlock_t;
1054 +#define PUT_INDEX(lock_ptr,indexv) ((inst_spinlock_t *)(lock_ptr))->index = indexv
1055 +#define GET_INDEX(lock_ptr)        ((inst_spinlock_t *)(lock_ptr))->index
1056 +
1057 +/*
1058 + * macros to cache and retrieve an index value in a read/write lock
1059 + * as well as the cpu where a reader busy period started
1060 + * we use the 2nd word (the debug word) for this, so require the
1061 + * debug word to be present
1062 + */
1063 +/*
1064 + * instrumented rwlock structure -- never used to allocate storage
1065 + * only used in macros below to overlay a rwlock_t
1066 + */
1067 +typedef struct inst_rwlock_s {
1068 +       volatile int lock;
1069 +       unsigned short index;
1070 +       unsigned short cpu;
1071 +} inst_rwlock_t;
1072 +#define PUT_RWINDEX(rwlock_ptr,indexv) ((inst_rwlock_t *)(rwlock_ptr))->index = indexv
1073 +#define GET_RWINDEX(rwlock_ptr)        ((inst_rwlock_t *)(rwlock_ptr))->index
1074 +#define PUT_RW_CPU(rwlock_ptr,cpuv)    ((inst_rwlock_t *)(rwlock_ptr))->cpu = cpuv
1075 +#define GET_RW_CPU(rwlock_ptr)         ((inst_rwlock_t *)(rwlock_ptr))->cpu
1076 +
1077 +/*
1078 + * return the number of readers for a rwlock_t
1079 + */
1080 +#define RWLOCK_READERS(rwlock_ptr)   rwlock_readers(rwlock_ptr)
1081 +
1082 +extern inline int rwlock_readers(rwlock_t *rwlock_ptr)
1083 +{
1084 +       int tmp = (int) rwlock_ptr->lock;
1085 +       /* read and write lock attempts may cause the lock value to temporarily */
1086 +       /* be negative.  Until it is >= 0 we know nothing (i. e. can't tell if  */
1087 +       /* is -1 because it was write locked and somebody tried to read lock it */
1088 +       /* or if it is -1 because it was read locked and somebody tried to write*/
1089 +       /* lock it. ........................................................... */
1090 +       do {
1091 +               tmp = (int) rwlock_ptr->lock;
1092 +       } while (tmp < 0);
1093 +       if (tmp == 0) return(0);
1094 +       else return(RW_LOCK_BIAS-tmp);
1095 +}
1096 +
1097 +/*
1098 + * return true if rwlock is write locked
1099 + * (note that other lock attempts can cause the lock value to be negative)
1100 + */
1101 +#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) ((rwlock_ptr)->lock <= 0)
1102 +#define IABS(x) ((x) > 0 ? (x) : -(x))
1103 +#define RWLOCK_IS_READ_LOCKED(rwlock_ptr)  ((IABS((rwlock_ptr)->lock) % RW_LOCK_BIAS) != 0)
1104 +
1105 +#define get_cycles64() get_cycles()
1106 +
1107 +#endif /* _X8664_LOCKMETER_H */
1108 Index: linux/include/asm-x86_64/spinlock.h
1109 ===================================================================
1110 --- linux.orig/include/asm-x86_64/spinlock.h
1111 +++ linux/include/asm-x86_64/spinlock.h
1112 @@ -136,6 +136,11 @@ static inline void _raw_spin_lock(spinlo
1113   */
1114  typedef struct {
1115         volatile unsigned int lock;
1116 +#ifdef CONFIG_LOCKMETER
1117 +       /* required for LOCKMETER since all bits in lock are used */
1118 +       /* and we need this storage for CPU and lock INDEX        */
1119 +       unsigned lockmeter_magic;
1120 +#endif
1121  #ifdef CONFIG_DEBUG_SPINLOCK
1122         unsigned magic;
1123  #endif
1124 @@ -143,11 +148,19 @@ typedef struct {
1125  
1126  #define RWLOCK_MAGIC   0xdeaf1eed
1127  
1128 +#ifdef CONFIG_LOCKMETER
1129 +#ifdef CONFIG_DEBUG_SPINLOCK
1130 +#define RWLOCK_MAGIC_INIT      , 0, RWLOCK_MAGIC
1131 +#else
1132 +#define RWLOCK_MAGIC_INIT      , 0
1133 +#endif
1134 +#else /* !CONFIG_LOCKMETER */
1135  #ifdef CONFIG_DEBUG_SPINLOCK
1136  #define RWLOCK_MAGIC_INIT      , RWLOCK_MAGIC
1137  #else
1138  #define RWLOCK_MAGIC_INIT      /* */
1139  #endif
1140 +#endif /* !CONFIG_LOCKMETER */
1141  
1142  #define RW_LOCK_UNLOCKED (rwlock_t) { RW_LOCK_BIAS RWLOCK_MAGIC_INIT }
1143  
1144 @@ -194,4 +207,47 @@ static inline int _raw_write_trylock(rwl
1145         return 0;
1146  }
1147  
1148 +#if defined(CONFIG_LOCKMETER) && defined(CONFIG_HAVE_DEC_LOCK)
1149 +extern void _metered_spin_lock  (spinlock_t *lock, void *caller_pc);
1150 +extern void _metered_spin_unlock(spinlock_t *lock);
1151 +
1152 +/*
1153 + *  Matches what is in arch/x86_64/lib/dec_and_lock.c, except this one is
1154 + *  "static inline" so that the spin_lock(), if actually invoked, is charged
1155 + *  against the real caller, not against the catch-all atomic_dec_and_lock
1156 + */
1157 +static inline int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock)
1158 +{
1159 +       int counter;
1160 +       int newcount;
1161 +
1162 +repeat:
1163 +       counter = atomic_read(atomic);
1164 +       newcount = counter-1;
1165 +
1166 +       if (!newcount)
1167 +               goto slow_path;
1168 +
1169 +       asm volatile("lock; cmpxchgl %1,%2"
1170 +               :"=a" (newcount)
1171 +               :"r" (newcount), "m" (atomic->counter), "0" (counter));
1172 +
1173 +       /* If the above failed, "eax" will have changed */
1174 +       if (newcount != counter)
1175 +               goto repeat;
1176 +       return 0;
1177 +
1178 +slow_path:
1179 +       preempt_disable();
1180 +       _metered_spin_lock(lock, __builtin_return_address(0));
1181 +       if (atomic_dec_and_test(atomic))
1182 +               return 1;
1183 +       _metered_spin_unlock(lock);
1184 +       preempt_enable();
1185 +       return 0;
1186 +}
1187 +
1188 +#define ATOMIC_DEC_AND_LOCK
1189 +#endif
1190 +
1191  #endif /* __ASM_SPINLOCK_H */
1192 Index: linux/include/linux/lockmeter.h
1193 ===================================================================
1194 --- linux.orig/include/linux/lockmeter.h
1195 +++ linux/include/linux/lockmeter.h
1196 @@ -0,0 +1,320 @@
1197 +/*
1198 + *  Copyright (C) 1999-2002 Silicon Graphics, Inc.
1199 + *
1200 + *  Written by John Hawkes (hawkes@sgi.com)
1201 + *  Based on klstat.h by Jack Steiner (steiner@sgi.com)
1202 + *
1203 + *  Modified by Ray Bryant (raybry@us.ibm.com) Feb-Apr 2000
1204 + *  Changes Copyright (C) 2000 IBM, Inc.
1205 + *  Added save of index in spinlock_t to improve efficiency
1206 + *  of "hold" time reporting for spinlocks
1207 + *  Added support for hold time statistics for read and write
1208 + *  locks.
1209 + *  Moved machine dependent code to include/asm/lockmeter.h.
1210 + *
1211 + */
1212 +
1213 +#ifndef _LINUX_LOCKMETER_H
1214 +#define _LINUX_LOCKMETER_H
1215 +
1216 +
1217 +/*---------------------------------------------------
1218 + *     architecture-independent lockmeter.h
1219 + *-------------------------------------------------*/
1220 +
1221 +/*
1222 + * raybry -- version 2: added efficient hold time statistics
1223 + *           requires lstat recompile, so flagged as new version
1224 + * raybry -- version 3: added global reader lock data
1225 + * hawkes -- version 4: removed some unnecessary fields to simplify mips64 port
1226 + */
1227 +#define LSTAT_VERSION  5
1228 +
1229 +int    lstat_update(void*, void*, int);
1230 +int    lstat_update_time(void*, void*, int, uint32_t);
1231 +
1232 +/*
1233 + * Currently, the mips64 and sparc64 kernels talk to a 32-bit lockstat, so we
1234 + * need to force compatibility in the inter-communication data structure.
1235 + */
1236 +
1237 +#if defined(CONFIG_MIPS32_COMPAT)
1238 +#define TIME_T         uint32_t
1239 +#elif defined(CONFIG_SPARC) || defined(CONFIG_SPARC64)
1240 +#define TIME_T         uint64_t
1241 +#else
1242 +#define TIME_T         time_t
1243 +#endif
1244 +
1245 +#if defined(__KERNEL__) || (!defined(CONFIG_MIPS32_COMPAT) && !defined(CONFIG_SPARC) && !defined(CONFIG_SPARC64)) || (_MIPS_SZLONG==32)
1246 +#define POINTER                void *
1247 +#else
1248 +#define        POINTER         int64_t
1249 +#endif
1250 +
1251 +/*
1252 + * Values for the "action" parameter passed to lstat_update.
1253 + *     ZZZ - do we want a try-success status here???
1254 + */
1255 +#define LSTAT_ACT_NO_WAIT      0
1256 +#define LSTAT_ACT_SPIN         1
1257 +#define LSTAT_ACT_REJECT       2
1258 +#define LSTAT_ACT_WW_SPIN       3
1259 +#define LSTAT_ACT_SLEPT                4 /* UNUSED */
1260 +
1261 +#define LSTAT_ACT_MAX_VALUES   4 /* NOTE: Increase to 5 if use ACT_SLEPT */
1262 +
1263 +/*
1264 + * Special values for the low 2 bits of an RA passed to
1265 + * lstat_update.
1266 + */
1267 +/* we use these values to figure out what kind of lock data */
1268 +/* is stored in the statistics table entry at index ....... */
1269 +#define LSTAT_RA_SPIN           0  /* spin lock data */
1270 +#define LSTAT_RA_READ           1  /* read lock statistics */
1271 +#define LSTAT_RA_SEMA          2  /* RESERVED */
1272 +#define LSTAT_RA_WRITE          3  /* write lock statistics*/
1273 +
1274 +#define LSTAT_RA(n)    \
1275 +       ((void*)( ((unsigned long) caller_pc & ~3) | n) )
1276 +
1277 +/*
1278 + * Constants used for lock addresses in the lstat_directory
1279 + * to indicate special values of the lock address.
1280 + */
1281 +#define        LSTAT_MULTI_LOCK_ADDRESS        NULL
1282 +
1283 +/*
1284 + * Maximum size of the lockstats tables. Increase this value
1285 + * if its not big enough. (Nothing bad happens if its not
1286 + * big enough although some locks will not be monitored.)
1287 + * We record overflows of this quantity in lstat_control.dir_overflows
1288 + *
1289 + * Note:  The max value here must fit into the field set
1290 + * and obtained by the macro's PUT_INDEX() and GET_INDEX().
1291 + * This value depends on how many bits are available in the
1292 + * lock word in the particular machine implementation we are on.
1293 + */
1294 +#define LSTAT_MAX_STAT_INDEX           2000
1295 +
1296 +/*
1297 + * Size and mask for the hash table into the directory.
1298 + */
1299 +#define LSTAT_HASH_TABLE_SIZE          4096            /* must be 2**N */
1300 +#define LSTAT_HASH_TABLE_MASK          (LSTAT_HASH_TABLE_SIZE-1)
1301 +
1302 +#define DIRHASH(ra)      ((unsigned long)(ra)>>2 & LSTAT_HASH_TABLE_MASK)
1303 +
1304 +/*
1305 + *     This defines an entry in the lockstat directory. It contains
1306 + *     information about a lock being monitored.
1307 + *     A directory entry only contains the lock identification -
1308 + *     counts on usage of the lock are kept elsewhere in a per-cpu
1309 + *     data structure to minimize cache line pinging.
1310 + */
1311 +typedef struct {
1312 +       POINTER caller_ra;                /* RA of code that set lock */
1313 +       POINTER lock_ptr;                 /* lock address */
1314 +       ushort  next_stat_index;  /* Used to link multiple locks that have the same hash table value */
1315 +} lstat_directory_entry_t;
1316 +
1317 +/*
1318 + *     A multi-dimensioned array used to contain counts for lock accesses.
1319 + *     The array is 3-dimensional:
1320 + *             - CPU number. Keep from thrashing cache lines between CPUs
1321 + *             - Directory entry index. Identifies the lock
1322 + *             - Action. Indicates what kind of contention occurred on an
1323 + *               access to the lock.
1324 + *
1325 + *     The index of an entry in the directory is the same as the 2nd index
1326 + *     of the entry in the counts array.
1327 + */
1328 +/*
1329 + *  This table contains data for spin_locks, write locks, and read locks
1330 + *  Not all data is used for all cases.  In particular, the hold time
1331 + *  information is not stored here for read locks since that is a global
1332 + *  (e. g. cannot be separated out by return address) quantity.
1333 + *  See the lstat_read_lock_counts_t structure for the global read lock
1334 + *  hold time.
1335 + */
1336 +typedef struct {
1337 +       uint64_t    cum_wait_ticks;     /* sum of wait times               */
1338 +                                       /* for write locks, sum of time a  */
1339 +                                       /* writer is waiting for a reader  */
1340 +       int64_t     cum_hold_ticks;     /* cumulative sum of holds         */
1341 +                                       /* not used for read mode locks    */
1342 +                                       /* must be signed. ............... */
1343 +       uint32_t    max_wait_ticks;     /* max waiting time                */
1344 +       uint32_t    max_hold_ticks;     /* max holding time                */
1345 +       uint64_t    cum_wait_ww_ticks;  /* sum times writer waits on writer*/
1346 +       uint32_t    max_wait_ww_ticks;  /* max wait time writer vs writer  */
1347 +                                       /* prev 2 only used for write locks*/
1348 +       uint32_t    acquire_time;       /* time lock acquired this CPU     */
1349 +       uint32_t    count[LSTAT_ACT_MAX_VALUES];
1350 +} lstat_lock_counts_t;
1351 +
1352 +typedef lstat_lock_counts_t    lstat_cpu_counts_t[LSTAT_MAX_STAT_INDEX];
1353 +
1354 +/*
1355 + * User request to:
1356 + *     - turn statistic collection on/off, or to reset
1357 + */
1358 +#define LSTAT_OFF       0
1359 +#define LSTAT_ON        1
1360 +#define LSTAT_RESET      2
1361 +#define LSTAT_RELEASE    3
1362 +
1363 +#define LSTAT_MAX_READ_LOCK_INDEX 1000
1364 +typedef struct {
1365 +       POINTER     lock_ptr;            /* address of lock for output stats */
1366 +       uint32_t    read_lock_count;
1367 +       int64_t     cum_hold_ticks;       /* sum of read lock hold times over */
1368 +                                         /* all callers. ....................*/
1369 +       uint32_t    write_index;          /* last write lock hash table index */
1370 +       uint32_t    busy_periods;         /* count of busy periods ended this */
1371 +       uint64_t    start_busy;           /* time this busy period started. ..*/
1372 +       uint64_t    busy_ticks;           /* sum of busy periods this lock. ..*/
1373 +       uint64_t    max_busy;             /* longest busy period for this lock*/
1374 +       uint32_t    max_readers;          /* maximum number of readers ...... */
1375 +#ifdef USER_MODE_TESTING
1376 +       rwlock_t    entry_lock;           /* lock for this read lock entry... */
1377 +                                         /* avoid having more than one rdr at*/
1378 +                                         /* needed for user space testing... */
1379 +                                         /* not needed for kernel 'cause it  */
1380 +                                         /* is non-preemptive. ............. */
1381 +#endif
1382 +} lstat_read_lock_counts_t;
1383 +typedef lstat_read_lock_counts_t       lstat_read_lock_cpu_counts_t[LSTAT_MAX_READ_LOCK_INDEX];
1384 +
1385 +#if defined(__KERNEL__) || defined(USER_MODE_TESTING)
1386 +
1387 +#ifndef USER_MODE_TESTING
1388 +#include <asm/lockmeter.h>
1389 +#else
1390 +#include "asm_newlockmeter.h"
1391 +#endif
1392 +
1393 +/*
1394 + * Size and mask for the hash table into the directory.
1395 + */
1396 +#define LSTAT_HASH_TABLE_SIZE          4096            /* must be 2**N */
1397 +#define LSTAT_HASH_TABLE_MASK          (LSTAT_HASH_TABLE_SIZE-1)
1398 +
1399 +#define DIRHASH(ra)      ((unsigned long)(ra)>>2 & LSTAT_HASH_TABLE_MASK)
1400 +
1401 +/*
1402 + * This version eliminates the per processor lock stack.  What we do is to
1403 + * store the index of the lock hash structure in unused bits in the lock
1404 + * itself.  Then on unlock we can find the statistics record without doing
1405 + * any additional hash or lock stack lookup.  This works for spin_locks.
1406 + * Hold time reporting is now basically as cheap as wait time reporting
1407 + * so we ignore the difference between LSTAT_ON_HOLD and LSTAT_ON_WAIT
1408 + * as in version 1.1.* of lockmeter.
1409 + *
1410 + * For rw_locks, we store the index of a global reader stats structure in
1411 + * the lock and the writer index is stored in the latter structure.
1412 + * For read mode locks we hash at the time of the lock to find an entry
1413 + * in the directory for reader wait time and the like.
1414 + * At unlock time for read mode locks, we update just the global structure
1415 + * so we don't need to know the reader directory index value at unlock time.
1416 + *
1417 + */
1418 +
1419 +/*
1420 + * Protocol to change lstat_control.state
1421 + *   This is complicated because we don't want the cum_hold_time for
1422 + * a rw_lock to be decremented in _read_lock_ without making sure it
1423 + * is incremented in _read_lock_ and vice versa.  So here is the
1424 + * way we change the state of lstat_control.state:
1425 + * I.  To Turn Statistics On
1426 + *     After allocating storage, set lstat_control.state non-zero.
1427 + * This works because we don't start updating statistics for in use
1428 + * locks until the reader lock count goes to zero.
1429 + * II. To Turn Statistics Off:
1430 + * (0)  Disable interrupts on this CPU
1431 + * (1)  Seize the lstat_control.directory_lock
1432 + * (2)  Obtain the current value of lstat_control.next_free_read_lock_index
1433 + * (3)  Store a zero in lstat_control.state.
1434 + * (4)  Release the lstat_control.directory_lock
1435 + * (5)  For each lock in the read lock list up to the saved value
1436 + *      (well, -1) of the next_free_read_lock_index, do the following:
1437 + *      (a)  Check validity of the stored lock address
1438 + *           by making sure that the word at the saved addr
1439 + *           has an index that matches this entry.  If not
1440 + *           valid, then skip this entry.
1441 + *      (b)  If there is a write lock already set on this lock,
1442 + *           skip to (d) below.
1443 + *      (c)  Set a non-metered write lock on the lock.
1444 + *      (d)  Set the cached INDEX in the lock to zero.
1445 + *      (e)  Release the non-metered write lock.
1446 + * (6)  Re-enable interrupts
1447 + *
1448 + * These rules ensure that a read lock will not have its statistics
1449 + * partially updated even though the global lock recording state has
1450 + * changed.  See put_lockmeter_info() for implementation.
1451 + *
1452 + * The reason for (b) is that there may be write locks set on the
1453 + * syscall path to put_lockmeter_info() from user space.  If we do
1454 + * not do this check, then we can deadlock.  A similar problem would
1455 + * occur if the lock was read locked by the current CPU.  At the
1456 + * moment this does not appear to happen.
1457 + */
1458 +
1459 +/*
1460 + * Main control structure for lockstat. Used to turn statistics on/off
1461 + * and to maintain directory info.
1462 + */
1463 +typedef struct {
1464 +       int                             state;
1465 +       spinlock_t              control_lock;           /* used to serialize turning statistics on/off   */
1466 +       spinlock_t              directory_lock;         /* serializes adding entries to the directory    */
1467 +       volatile int    next_free_dir_index;    /* next free entry in the directory */
1468 +       /* FIXME not all of these fields are used / needed .............. */
1469 +                /* the following fields represent data since     */
1470 +               /* first "lstat on" or most recent "lstat reset" */
1471 +       TIME_T      first_started_time;     /* time when measurement first enabled */
1472 +       TIME_T      started_time;           /* time when measurement last started  */
1473 +       TIME_T      ending_time;            /* time when measurement last disabled */
1474 +       uint64_t    started_cycles64;       /* cycles when measurement last started          */
1475 +       uint64_t    ending_cycles64;        /* cycles when measurement last disabled         */
1476 +       uint64_t    enabled_cycles64;       /* total cycles with measurement enabled         */
1477 +       int         intervals;              /* number of measurement intervals recorded      */
1478 +                                           /* i. e. number of times did lstat on;lstat off  */
1479 +       lstat_directory_entry_t *dir;           /* directory */
1480 +       int         dir_overflow;           /* count of times ran out of space in directory  */
1481 +       int         rwlock_overflow;        /* count of times we couldn't allocate a rw block*/
1482 +       ushort          *hashtab;                           /* hash table for quick dir scans */
1483 +       lstat_cpu_counts_t      *counts[NR_CPUS];        /* Array of pointers to per-cpu stats */
1484 +       int         next_free_read_lock_index;      /* next rwlock reader (global) stats block  */
1485 +       lstat_read_lock_cpu_counts_t *read_lock_counts[NR_CPUS]; /* per cpu read lock stats  */
1486 +} lstat_control_t;
1487 +
1488 +#endif /* defined(__KERNEL__) || defined(USER_MODE_TESTING) */
1489 +
1490 +typedef struct {
1491 +       short           lstat_version;          /* version of the data */
1492 +       short           state;                  /* the current state is returned */
1493 +       int             maxcpus;                /* Number of cpus present */
1494 +       int             next_free_dir_index;    /* index of the next free directory entry */
1495 +       TIME_T          first_started_time;     /* when measurement enabled for first time */
1496 +       TIME_T          started_time;           /* time in secs since 1969 when stats last turned on  */
1497 +       TIME_T          ending_time;            /* time in secs since 1969 when stats last turned off */
1498 +       uint32_t        cycleval;               /* cycles per second */
1499 +#ifdef notyet
1500 +       void            *kernel_magic_addr;     /* address of kernel_magic */
1501 +       void            *kernel_end_addr;       /* contents of kernel magic (points to "end") */
1502 +#endif
1503 +       int              next_free_read_lock_index; /* index of next (global) read lock stats struct */
1504 +       uint64_t         started_cycles64;      /* cycles when measurement last started        */
1505 +       uint64_t         ending_cycles64;       /* cycles when stats last turned off           */
1506 +       uint64_t         enabled_cycles64;      /* total cycles with measurement enabled       */
1507 +       int              intervals;             /* number of measurement intervals recorded      */
1508 +                                               /* i.e. number of times we did lstat on;lstat off*/
1509 +       int              dir_overflow;          /* number of times we wanted more space in directory */
1510 +       int              rwlock_overflow;       /* # of times we wanted more space in read_locks_count */
1511 +       struct new_utsname   uts;               /* info about machine where stats are measured */
1512 +                                               /* -T option of lockstat allows data to be     */
1513 +                                               /* moved to another machine. ................. */
1514 +} lstat_user_request_t;
1515 +
1516 +#endif /* _LINUX_LOCKMETER_H */
1517 Index: linux/include/linux/spinlock.h
1518 ===================================================================
1519 --- linux.orig/include/linux/spinlock.h
1520 +++ linux/include/linux/spinlock.h
1521 @@ -74,7 +74,16 @@ void __lockfunc _write_unlock_irqrestore
1522  void __lockfunc _write_unlock_irq(rwlock_t *lock);
1523  void __lockfunc _write_unlock_bh(rwlock_t *lock);
1524  int __lockfunc _spin_trylock_bh(spinlock_t *lock);
1525 -int in_lock_functions(unsigned long addr);
1526 +
1527 +static inline int in_lock_functions(unsigned long addr)
1528 +{
1529 +       /* Linker adds these: start and end of __lockfunc functions */
1530 +       extern char __lock_text_start[], __lock_text_end[];
1531 +
1532 +       return addr >= (unsigned long)__lock_text_start
1533 +       && addr < (unsigned long)__lock_text_end;
1534 +}
1535 +
1536  #else
1537  
1538  #define in_lock_functions(ADDR) 0
1539 @@ -472,17 +481,6 @@ do { \
1540         1 : ({local_irq_restore(flags); 0;}); \
1541  })
1542  
1543 -#ifdef CONFIG_LOCKMETER
1544 -extern void _metered_spin_lock   (spinlock_t *lock);
1545 -extern void _metered_spin_unlock (spinlock_t *lock);
1546 -extern int  _metered_spin_trylock(spinlock_t *lock);
1547 -extern void _metered_read_lock    (rwlock_t *lock);
1548 -extern void _metered_read_unlock  (rwlock_t *lock);
1549 -extern void _metered_write_lock   (rwlock_t *lock);
1550 -extern void _metered_write_unlock (rwlock_t *lock);
1551 -extern int  _metered_write_trylock(rwlock_t *lock);
1552 -#endif
1553 -
1554  /* "lock on reference count zero" */
1555  #ifndef ATOMIC_DEC_AND_LOCK
1556  #include <asm/atomic.h>
1557 @@ -558,5 +556,4 @@ static inline int bit_spin_is_locked(int
1558         return 1;
1559  #endif
1560  }
1561 -
1562  #endif /* __LINUX_SPINLOCK_H */
1563 Index: linux/kernel/Makefile
1564 ===================================================================
1565 --- linux.orig/kernel/Makefile
1566 +++ linux/kernel/Makefile
1567 @@ -11,7 +11,12 @@ obj-y     = sched.o fork.o exec_domain.o
1568  
1569  obj-$(CONFIG_FUTEX) += futex.o
1570  obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
1571 +ifneq ($(CONFIG_LOCKMETER),y)
1572  obj-$(CONFIG_SMP) += cpu.o spinlock.o
1573 +else
1574 +obj-$(CONFIG_SMP) += cpu.o
1575 +obj-$(CONFIG_LOCKMETER) += lockmeter.o
1576 +endif
1577  obj-$(CONFIG_UID16) += uid16.o
1578  obj-$(CONFIG_MODULES) += module.o module-verify.o
1579  obj-$(CONFIG_MODULE_SIG) += module-verify-sig.o
1580 Index: linux/kernel/lockmeter.c
1581 ===================================================================
1582 --- linux.orig/kernel/lockmeter.c
1583 +++ linux/kernel/lockmeter.c
1584 @@ -0,0 +1,1512 @@
1585 +/*
1586 + *  Copyright (C) 1999,2000 Silicon Graphics, Inc.
1587 + *
1588 + *  Written by John Hawkes (hawkes@sgi.com)
1589 + *  Based on klstat.c by Jack Steiner (steiner@sgi.com)
1590 + *
1591 + *  Modified by Ray Bryant (raybry@us.ibm.com)
1592 + *  Changes Copyright (C) 2000 IBM, Inc.
1593 + *  Added save of index in spinlock_t to improve efficiency
1594 + *  of "hold" time reporting for spinlocks
1595 + *  Added support for hold time statistics for read and write
1596 + *  locks.
1597 + *
1598 + *  Modified by Ray Bryant (raybry@sgi.com)
1599 + *  Changes Copyright (C) 2004, Silicon Graphics, Inc.
1600 + *  Fix to work with out-of-line spinlocks.
1601 + */
1602 +
1603 +#include <linux/config.h>
1604 +#include <linux/linkage.h>
1605 +#include <linux/preempt.h>
1606 +#include <linux/interrupt.h>
1607 +#include <linux/module.h>
1608 +#include <linux/types.h>
1609 +#include <linux/errno.h>
1610 +#include <linux/slab.h>
1611 +#include <linux/sched.h>
1612 +#include <linux/smp.h>
1613 +#include <linux/threads.h>
1614 +#include <linux/version.h>
1615 +#include <linux/vmalloc.h>
1616 +#include <linux/spinlock.h>
1617 +#include <linux/utsname.h>
1618 +#include <linux/module.h>
1619 +#include <asm/system.h>
1620 +#include <asm/uaccess.h>
1621 +
1622 +#include <linux/lockmeter.h>
1623 +
1624 +#define ASSERT(cond)
1625 +#define bzero(loc,size)                memset(loc,0,size)
1626 +
1627 +/*<---------------------------------------------------*/
1628 +/*              lockmeter.c                           */
1629 +/*>---------------------------------------------------*/
1630 +
1631 +static lstat_control_t lstat_control __cacheline_aligned =
1632 +       { LSTAT_OFF, SPIN_LOCK_UNLOCKED, SPIN_LOCK_UNLOCKED,
1633 +         19 * 0, NR_CPUS * 0, 0, NR_CPUS * 0 };
1634 +
1635 +static ushort lstat_make_dir_entry(void *, void *);
1636 +
1637 +/*
1638 + * lstat_lookup
1639 + *
1640 + * Given a caller return address (RA), locate the directory entry for the lock.
1641 + */
1642 +static ushort
1643 +lstat_lookup(void *lock_ptr, void *caller_ra)
1644 +{
1645 +       ushort index;
1646 +       lstat_directory_entry_t *dirp;
1647 +
1648 +       dirp = lstat_control.dir;
1649 +
1650 +       index = lstat_control.hashtab[DIRHASH(caller_ra)];
1651 +       while (dirp[index].caller_ra != caller_ra) {
1652 +               if (index == 0) {
1653 +                       return lstat_make_dir_entry(lock_ptr, caller_ra);
1654 +               }
1655 +               index = dirp[index].next_stat_index;
1656 +       }
1657 +
1658 +       if (dirp[index].lock_ptr != NULL && dirp[index].lock_ptr != lock_ptr) {
1659 +               dirp[index].lock_ptr = NULL;
1660 +       }
1661 +
1662 +       return index;
1663 +}
1664 +
1665 +/*
1666 + * lstat_make_dir_entry
1667 + * Called to add a new lock to the lock directory.
1668 + */
1669 +static ushort
1670 +lstat_make_dir_entry(void *lock_ptr, void *caller_ra)
1671 +{
1672 +       lstat_directory_entry_t *dirp;
1673 +       ushort index, hindex;
1674 +       unsigned long flags;
1675 +
1676 +       /* lock the table without recursively reentering this metering code */
1677 +       local_irq_save(flags);
1678 +       _raw_spin_lock(&lstat_control.directory_lock);
1679 +
1680 +       hindex = DIRHASH(caller_ra);
1681 +       index = lstat_control.hashtab[hindex];
1682 +       dirp = lstat_control.dir;
1683 +       while (index && dirp[index].caller_ra != caller_ra)
1684 +               index = dirp[index].next_stat_index;
1685 +
1686 +       if (index == 0) {
1687 +               if (lstat_control.next_free_dir_index < LSTAT_MAX_STAT_INDEX) {
1688 +                       index = lstat_control.next_free_dir_index++;
1689 +                       lstat_control.dir[index].caller_ra = caller_ra;
1690 +                       lstat_control.dir[index].lock_ptr = lock_ptr;
1691 +                       lstat_control.dir[index].next_stat_index =
1692 +                               lstat_control.hashtab[hindex];
1693 +                       lstat_control.hashtab[hindex] = index;
1694 +               } else {
1695 +                       lstat_control.dir_overflow++;
1696 +               }
1697 +       }
1698 +       _raw_spin_unlock(&lstat_control.directory_lock);
1699 +       local_irq_restore(flags);
1700 +       return index;
1701 +}
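
Taken together, lstat_lookup() and lstat_make_dir_entry() implement an open-hash directory: hashtab[] holds the head index of each bucket, entries chain through next_stat_index, new entries are linked in at the head of their bucket, and index 0 serves both as the end-of-chain marker and as the overflow entry. The same pattern, shrunk to a user-space sketch with shortened field names and no locking:

#include <stdio.h>

#define NBUCKETS 8                      /* tiny table, for illustration only */
#define NENTRIES 16

struct entry { void *ra; unsigned short next; };

static unsigned short head[NBUCKETS];
static struct entry dir[NENTRIES];
static unsigned short next_free = 1;    /* index 0 is reserved */

static unsigned short lookup_or_insert(void *ra)
{
	unsigned short h = ((unsigned long)ra >> 2) & (NBUCKETS - 1);
	unsigned short i = head[h];

	while (i && dir[i].ra != ra)    /* walk the bucket chain */
		i = dir[i].next;
	if (i == 0 && next_free < NENTRIES) {
		i = next_free++;        /* allocate a new entry ...        */
		dir[i].ra = ra;
		dir[i].next = head[h];  /* ... and link it in at the head  */
		head[h] = i;
	}
	return i;                       /* 0 means end of chain / overflow */
}

int main(void)
{
	printf("%u %u %u\n",
	       lookup_or_insert((void *)0x1000),
	       lookup_or_insert((void *)0x2000),
	       lookup_or_insert((void *)0x1000));  /* prints "1 2 1" */
	return 0;
}
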
1702 +
1703 +int
1704 +lstat_update(void *lock_ptr, void *caller_ra, int action)
1705 +{
1706 +       int index;
1707 +       int cpu;
1708 +
1709 +       ASSERT(action < LSTAT_ACT_MAX_VALUES);
1710 +
1711 +       if (lstat_control.state == LSTAT_OFF)
1712 +               return 0;
1713 +
1714 +       index = lstat_lookup(lock_ptr, caller_ra);
1715 +       cpu = THIS_CPU_NUMBER;
1716 +       (*lstat_control.counts[cpu])[index].count[action]++;
1717 +       (*lstat_control.counts[cpu])[index].acquire_time = get_cycles();
1718 +
1719 +       return index;
1720 +}
1721 +
1722 +int
1723 +lstat_update_time(void *lock_ptr, void *caller_ra, int action, uint32_t ticks)
1724 +{
1725 +       ushort index;
1726 +       int cpu;
1727 +
1728 +       ASSERT(action < LSTAT_ACT_MAX_VALUES);
1729 +
1730 +       if (lstat_control.state == LSTAT_OFF)
1731 +               return 0;
1732 +
1733 +       index = lstat_lookup(lock_ptr, caller_ra);
1734 +       cpu = THIS_CPU_NUMBER;
1735 +       (*lstat_control.counts[cpu])[index].count[action]++;
1736 +       (*lstat_control.counts[cpu])[index].cum_wait_ticks += (uint64_t) ticks;
1737 +       if ((*lstat_control.counts[cpu])[index].max_wait_ticks < ticks)
1738 +               (*lstat_control.counts[cpu])[index].max_wait_ticks = ticks;
1739 +
1740 +       (*lstat_control.counts[cpu])[index].acquire_time = get_cycles();
1741 +
1742 +       return index;
1743 +}
1744 +
1745 +void
1746 +_metered_spin_lock(spinlock_t * lock_ptr, void *caller_pc)
1747 +{
1748 +       if (lstat_control.state == LSTAT_OFF) {
1749 +               _raw_spin_lock(lock_ptr);       /* do the real lock */
1750 +               PUT_INDEX(lock_ptr, 0); /* clean index in case lockmetering  */
1751 +               /* gets turned on before unlock */
1752 +       } else {
1753 +               void *this_pc = LSTAT_RA(LSTAT_RA_SPIN);
1754 +               int index;
1755 +
1756 +               if (_raw_spin_trylock(lock_ptr)) {
1757 +                       index = lstat_update(lock_ptr, this_pc,
1758 +                                               LSTAT_ACT_NO_WAIT);
1759 +               } else {
1760 +                       uint32_t start_cycles = get_cycles();
1761 +                       _raw_spin_lock(lock_ptr);       /* do the real lock */
1762 +                       index = lstat_update_time(lock_ptr, this_pc,
1763 +                               LSTAT_ACT_SPIN, get_cycles() - start_cycles);
1764 +               }
1765 +               /* save the index in the lock itself for use in spin unlock */
1766 +               PUT_INDEX(lock_ptr, index);
1767 +       }
1768 +}
1769 +/* some archs require this for atomic_dec_and_lock in modules */
1770 +EXPORT_SYMBOL(_metered_spin_lock);
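
Stripped of the index caching, the metering pattern used here (and in the flags and rwlock variants below) is simply: try the raw lock first, and only when that fails start the cycle counter, take the lock for real, and charge the elapsed cycles as spin time. A condensed sketch, with hypothetical record_no_wait()/record_spin() helpers standing in for lstat_update()/lstat_update_time():

static void metered_lock_pattern(spinlock_t *lock_ptr, void *caller_pc)
{
	if (_raw_spin_trylock(lock_ptr)) {
		/* uncontended: count the acquisition, no wait time */
		record_no_wait(lock_ptr, caller_pc);
	} else {
		uint32_t start_cycles = get_cycles();

		_raw_spin_lock(lock_ptr);               /* contended: spin */
		record_spin(lock_ptr, caller_pc,
			    get_cycles() - start_cycles);
	}
}
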
1771 +
1772 +void
1773 +_metered_spin_lock_flags(spinlock_t * lock_ptr, unsigned long flags,
1774 +                        void *caller_pc)
1775 +{
1776 +       if (lstat_control.state == LSTAT_OFF) {
1777 +               _raw_spin_lock(lock_ptr);       /* do the real lock */
1778 +               PUT_INDEX(lock_ptr, 0); /* clean index in case lockmetering  */
1779 +               /* gets turned on before unlock */
1780 +       } else {
1781 +               void *this_pc = LSTAT_RA(LSTAT_RA_SPIN);
1782 +               int index;
1783 +
1784 +               if (_raw_spin_trylock(lock_ptr)) {
1785 +                       index = lstat_update(lock_ptr, this_pc,
1786 +                                               LSTAT_ACT_NO_WAIT);
1787 +               } else {
1788 +                       uint32_t start_cycles = get_cycles();
1789 +                       /* do the real lock */
1790 +                       _raw_spin_lock_flags(lock_ptr, flags);
1791 +                       index = lstat_update_time(lock_ptr, this_pc,
1792 +                               LSTAT_ACT_SPIN, get_cycles() - start_cycles);
1793 +               }
1794 +               /* save the index in the lock itself for use in spin unlock */
1795 +               PUT_INDEX(lock_ptr, index);
1796 +       }
1797 +}
1798 +
1799 +int
1800 +_metered_spin_trylock(spinlock_t * lock_ptr, void *caller_pc)
1801 +{
1802 +       if (lstat_control.state == LSTAT_OFF) {
1803 +               return _raw_spin_trylock(lock_ptr);
1804 +       } else {
1805 +               int retval;
1806 +               void *this_pc = LSTAT_RA(LSTAT_RA_SPIN);
1807 +
1808 +               if ((retval = _raw_spin_trylock(lock_ptr))) {
1809 +                       int index = lstat_update(lock_ptr, this_pc,
1810 +                                               LSTAT_ACT_NO_WAIT);
1811 +                       /*
1812 +                        * save the index in the lock itself for use in spin
1813 +                        * unlock
1814 +                        */
1815 +                       PUT_INDEX(lock_ptr, index);
1816 +               } else {
1817 +                       lstat_update(lock_ptr, this_pc, LSTAT_ACT_REJECT);
1818 +               }
1819 +
1820 +               return retval;
1821 +       }
1822 +}
1823 +
1824 +void
1825 +_metered_spin_unlock(spinlock_t * lock_ptr)
1826 +{
1827 +       int index = -1;
1828 +
1829 +       if (lstat_control.state != LSTAT_OFF) {
1830 +               index = GET_INDEX(lock_ptr);
1831 +               /*
1832 +                * If statistics were turned off when we set the lock,
1833 +                * then the index can be zero.  If that is the case,
1834 +                * then collect no stats on this call.
1835 +                */
1836 +               if (index > 0) {
1837 +                       uint32_t hold_time;
1838 +                       int cpu = THIS_CPU_NUMBER;
1839 +                       hold_time = get_cycles() -
1840 +                        (*lstat_control.counts[cpu])[index].acquire_time;
1841 +                       (*lstat_control.counts[cpu])[index].cum_hold_ticks +=
1842 +                               (uint64_t) hold_time;
1843 +                       if ((*lstat_control.counts[cpu])[index].max_hold_ticks <
1844 +                           hold_time)
1845 +                               (*lstat_control.counts[cpu])[index].
1846 +                                   max_hold_ticks = hold_time;
1847 +               }
1848 +       }
1849 +
1850 +       /* make sure we don't have a stale index value saved */
1851 +       PUT_INDEX(lock_ptr, 0);
1852 +       _raw_spin_unlock(lock_ptr);     /* do the real unlock */
1853 +}
1854 +/* some archs require this for atomic_dec_and_lock in modules */
1855 +EXPORT_SYMBOL(_metered_spin_unlock);
1856 +
1857 +/*
1858 + * allocate the next global read lock structure and store its index
1859 + * in the rwlock at "lock_ptr".
1860 + */
1861 +uint32_t
1862 +alloc_rwlock_struct(rwlock_t * rwlock_ptr)
1863 +{
1864 +       int index;
1865 +       unsigned long flags;
1866 +       int cpu = THIS_CPU_NUMBER;
1867 +
1868 +       /* If we've already overflowed, then do a quick exit */
1869 +       if (lstat_control.next_free_read_lock_index >
1870 +                       LSTAT_MAX_READ_LOCK_INDEX) {
1871 +               lstat_control.rwlock_overflow++;
1872 +               return 0;
1873 +       }
1874 +
1875 +       local_irq_save(flags);
1876 +       _raw_spin_lock(&lstat_control.directory_lock);
1877 +
1878 +       /* It is possible this changed while we were waiting for the directory_lock */
1879 +       if (lstat_control.state == LSTAT_OFF) {
1880 +               index = 0;
1881 +               goto unlock;
1882 +       }
1883 +
1884 +       /* It is possible someone else got here first and set the index */
1885 +       if ((index = GET_RWINDEX(rwlock_ptr)) == 0) {
1886 +               /*
1887 +                * we can't turn on read stats for this lock while there are
1888 +                * readers (this would mess up the running hold time sum at
1889 +                * unlock time)
1890 +                */
1891 +               if (RWLOCK_READERS(rwlock_ptr) != 0) {
1892 +                       index = 0;
1893 +                       goto unlock;
1894 +               }
1895 +
1896 +               /*
1897 +                * if stats are turned on after being off, we may need to
1898 +                * return an old index from when the statistics were on last
1899 +                * time.
1900 +                */
1901 +               for (index = 1; index < lstat_control.next_free_read_lock_index;
1902 +                               index++)
1903 +                       if ((*lstat_control.read_lock_counts[cpu])[index].
1904 +                                       lock_ptr == rwlock_ptr)
1905 +                               goto put_index_and_unlock;
1906 +
1907 +               /* allocate the next global read lock structure */
1908 +               if (lstat_control.next_free_read_lock_index >=
1909 +                   LSTAT_MAX_READ_LOCK_INDEX) {
1910 +                       lstat_control.rwlock_overflow++;
1911 +                       index = 0;
1912 +                       goto unlock;
1913 +               }
1914 +               index = lstat_control.next_free_read_lock_index++;
1915 +
1916 +               /*
1917 +                * initialize the global read stats data structure for each
1918 +                * cpu
1919 +                */
1920 +               for (cpu = 0; cpu < num_online_cpus(); cpu++) {
1921 +                       (*lstat_control.read_lock_counts[cpu])[index].lock_ptr =
1922 +                               rwlock_ptr;
1923 +               }
1924 +put_index_and_unlock:
1925 +               /* store the index for the read lock structure into the lock */
1926 +               PUT_RWINDEX(rwlock_ptr, index);
1927 +       }
1928 +
1929 +unlock:
1930 +       _raw_spin_unlock(&lstat_control.directory_lock);
1931 +       local_irq_restore(flags);
1932 +       return index;
1933 +}
1934 +
1935 +void
1936 +_metered_read_lock(rwlock_t * rwlock_ptr, void *caller_pc)
1937 +{
1938 +       void *this_pc;
1939 +       uint32_t start_cycles;
1940 +       int index;
1941 +       int cpu;
1942 +       unsigned long flags;
1943 +       int readers_before, readers_after;
1944 +       uint64_t cycles64;
1945 +
1946 +       if (lstat_control.state == LSTAT_OFF) {
1947 +               _raw_read_lock(rwlock_ptr);
1948 +               /* clean index in case lockmetering turns on before an unlock */
1949 +               PUT_RWINDEX(rwlock_ptr, 0);
1950 +               return;
1951 +       }
1952 +
1953 +       this_pc = LSTAT_RA(LSTAT_RA_READ);
1954 +       cpu = THIS_CPU_NUMBER;
1955 +       index = GET_RWINDEX(rwlock_ptr);
1956 +
1957 +       /* allocate the global stats entry for this lock, if needed */
1958 +       if (index == 0)
1959 +               index = alloc_rwlock_struct(rwlock_ptr);
1960 +
1961 +       readers_before = RWLOCK_READERS(rwlock_ptr);
1962 +       if (_raw_read_trylock(rwlock_ptr)) {
1963 +               /*
1964 +                * We have decremented the lock to count a new reader,
1965 +                * and have confirmed that no writer has it locked.
1966 +                */
1967 +               /* update statistics if enabled */
1968 +               if (index > 0) {
1969 +                       local_irq_save(flags);
1970 +                       lstat_update((void *) rwlock_ptr, this_pc,
1971 +                                       LSTAT_ACT_NO_WAIT);
1972 +                       /* preserve value of TSC so cum_hold_ticks and start_busy use same value */
1973 +                       cycles64 = get_cycles64();
1974 +                       (*lstat_control.read_lock_counts[cpu])[index].
1975 +                               cum_hold_ticks -= cycles64;
1976 +
1977 +                       /* record time and cpu of start of busy period */
1978 +                       /* this is not perfect (some race conditions are possible) */
1979 +                       if (readers_before == 0) {
1980 +                               (*lstat_control.read_lock_counts[cpu])[index].
1981 +                                       start_busy = cycles64;
1982 +                               PUT_RW_CPU(rwlock_ptr, cpu);
1983 +                       }
1984 +                       readers_after = RWLOCK_READERS(rwlock_ptr);
1985 +                       if (readers_after >
1986 +                               (*lstat_control.read_lock_counts[cpu])[index].
1987 +                                       max_readers)
1988 +                               (*lstat_control.read_lock_counts[cpu])[index].
1989 +                                       max_readers = readers_after;
1990 +                       local_irq_restore(flags);
1991 +               }
1992 +
1993 +               return;
1994 +       }
1995 +       /* If we get here, then we could not quickly grab the read lock */
1996 +
1997 +       start_cycles = get_cycles();    /* start counting the wait time */
1998 +
1999 +       /* Now spin until read_lock is successful */
2000 +       _raw_read_lock(rwlock_ptr);
2001 +
2002 +       lstat_update_time((void *) rwlock_ptr, this_pc, LSTAT_ACT_SPIN,
2003 +                         get_cycles() - start_cycles);
2004 +
2005 +       /* update statistics if they are enabled for this lock */
2006 +       if (index > 0) {
2007 +               local_irq_save(flags);
2008 +               cycles64 = get_cycles64();
2009 +               (*lstat_control.read_lock_counts[cpu])[index].cum_hold_ticks -=
2010 +                               cycles64;
2011 +
2012 +               /* this is not perfect (some race conditions are possible) */
2013 +               if (readers_before == 0) {
2014 +                       (*lstat_control.read_lock_counts[cpu])[index].
2015 +                               start_busy = cycles64;
2016 +                       PUT_RW_CPU(rwlock_ptr, cpu);
2017 +               }
2018 +               readers_after = RWLOCK_READERS(rwlock_ptr);
2019 +               if (readers_after >
2020 +                   (*lstat_control.read_lock_counts[cpu])[index].max_readers)
2021 +                       (*lstat_control.read_lock_counts[cpu])[index].
2022 +                               max_readers = readers_after;
2023 +               local_irq_restore(flags);
2024 +       }
2025 +}
2026 +
2027 +void
2028 +_metered_read_unlock(rwlock_t * rwlock_ptr)
2029 +{
2030 +       int index;
2031 +       int cpu;
2032 +       unsigned long flags;
2033 +       uint64_t busy_length;
2034 +       uint64_t cycles64;
2035 +
2036 +       if (lstat_control.state == LSTAT_OFF) {
2037 +               _raw_read_unlock(rwlock_ptr);
2038 +               return;
2039 +       }
2040 +
2041 +       index = GET_RWINDEX(rwlock_ptr);
2042 +       cpu = THIS_CPU_NUMBER;
2043 +
2044 +       if (index > 0) {
2045 +               local_irq_save(flags);
2046 +               /*
2047 +                * preserve value of TSC so cum_hold_ticks and busy_ticks are
2048 +                * consistent.
2049 +                */
2050 +               cycles64 = get_cycles64();
2051 +               (*lstat_control.read_lock_counts[cpu])[index].cum_hold_ticks +=
2052 +                       cycles64;
2053 +               (*lstat_control.read_lock_counts[cpu])[index].read_lock_count++;
2054 +
2055 +               /*
2056 +                * once again, this is not perfect (some race conditions are
2057 +                * possible)
2058 +                */
2059 +               if (RWLOCK_READERS(rwlock_ptr) == 1) {
2060 +                       int cpu1 = GET_RW_CPU(rwlock_ptr);
2061 +                       uint64_t last_start_busy =
2062 +                               (*lstat_control.read_lock_counts[cpu1])[index].
2063 +                                       start_busy;
2064 +                       (*lstat_control.read_lock_counts[cpu])[index].
2065 +                               busy_periods++;
2066 +                       if (cycles64 > last_start_busy) {
2067 +                               busy_length = cycles64 - last_start_busy;
2068 +                               (*lstat_control.read_lock_counts[cpu])[index].
2069 +                                       busy_ticks += busy_length;
2070 +                               if (busy_length >
2071 +                                       (*lstat_control.
2072 +                                               read_lock_counts[cpu])[index].
2073 +                                                       max_busy)
2074 +                                       (*lstat_control.
2075 +                                        read_lock_counts[cpu])[index].
2076 +                                               max_busy = busy_length;
2077 +                       }
2078 +               }
2079 +               local_irq_restore(flags);
2080 +       }
2081 +       _raw_read_unlock(rwlock_ptr);
2082 +}
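
Because read-lock time is subtracted from cum_hold_ticks and read-unlock time is added to it, the field ends up holding the sum of the individual reader hold times even when readers overlap (the per-cpu copies are presumably summed later by the data-reduction program). A worked example with made-up cycle values, all on one CPU:

	reader A locks   at t=1000:  cum_hold_ticks -= 1000  ->  -1000
	reader B locks   at t=1200:  cum_hold_ticks -= 1200  ->  -2200
	reader A unlocks at t=1500:  cum_hold_ticks += 1500  ->   -700
	reader B unlocks at t=1900:  cum_hold_ticks += 1900  ->   1200

	total hold time = (1500 - 1000) + (1900 - 1200) = 500 + 700 = 1200 cycles
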
2083 +
2084 +void
2085 +_metered_write_lock(rwlock_t * rwlock_ptr, void *caller_pc)
2086 +{
2087 +       uint32_t start_cycles;
2088 +       void *this_pc;
2089 +       uint32_t spin_ticks = 0; /* in anticipation of a potential wait */
2090 +       int index;
2091 +       int write_index = 0;
2092 +       int cpu;
2093 +       enum {
2094 +               writer_writer_conflict,
2095 +               writer_reader_conflict
2096 +       } why_wait = writer_writer_conflict;
2097 +
2098 +       if (lstat_control.state == LSTAT_OFF) {
2099 +               _raw_write_lock(rwlock_ptr);
2100 +               /* clean index in case lockmetering turns on before an unlock */
2101 +               PUT_RWINDEX(rwlock_ptr, 0);
2102 +               return;
2103 +       }
2104 +
2105 +       this_pc = LSTAT_RA(LSTAT_RA_WRITE);
2106 +       cpu = THIS_CPU_NUMBER;
2107 +       index = GET_RWINDEX(rwlock_ptr);
2108 +
2109 +       /* allocate the global stats entry for this lock, if needed */
2110 +       if (index == 0) {
2111 +               index = alloc_rwlock_struct(rwlock_ptr);
2112 +       }
2113 +
2114 +       if (_raw_write_trylock(rwlock_ptr)) {
2115 +               /* We acquired the lock on the first try */
2116 +               write_index = lstat_update((void *) rwlock_ptr, this_pc,
2117 +                                       LSTAT_ACT_NO_WAIT);
2118 +               /* save the write_index for use in unlock if stats enabled */
2119 +               if (index > 0)
2120 +                       (*lstat_control.read_lock_counts[cpu])[index].
2121 +                               write_index = write_index;
2122 +               return;
2123 +       }
2124 +
2125 +       /* If we get here, then we could not quickly grab the write lock */
2126 +       start_cycles = get_cycles();    /* start counting the wait time */
2127 +
2128 +       why_wait = RWLOCK_READERS(rwlock_ptr) ?
2129 +                       writer_reader_conflict : writer_writer_conflict;
2130 +
2131 +       /* Now set the lock and wait for conflicts to disappear */
2132 +       _raw_write_lock(rwlock_ptr);
2133 +
2134 +       spin_ticks = get_cycles() - start_cycles;
2135 +
2136 +       /* update stats -- if enabled */
2137 +       if (index > 0 && spin_ticks) {
2138 +               if (why_wait == writer_reader_conflict) {
2139 +                       /* waited due to a reader holding the lock */
2140 +                       write_index = lstat_update_time((void *)rwlock_ptr,
2141 +                                       this_pc, LSTAT_ACT_SPIN, spin_ticks);
2142 +               } else {
2143 +                       /*
2144 +                        * waited due to another writer holding the lock
2145 +                        */
2146 +                       write_index = lstat_update_time((void *)rwlock_ptr,
2147 +                               this_pc, LSTAT_ACT_WW_SPIN, spin_ticks);
2148 +                       (*lstat_control.counts[cpu])[write_index].
2149 +                               cum_wait_ww_ticks += spin_ticks;
2150 +                       if (spin_ticks >
2151 +                               (*lstat_control.counts[cpu])[write_index].
2152 +                                       max_wait_ww_ticks) {
2153 +                               (*lstat_control.counts[cpu])[write_index].
2154 +                                       max_wait_ww_ticks = spin_ticks;
2155 +                       }
2156 +               }
2157 +
2158 +               /* save the directory index for use on write_unlock */
2159 +               (*lstat_control.read_lock_counts[cpu])[index].
2160 +                       write_index = write_index;
2161 +       }
2162 +}
2163 +
2164 +void
2165 +_metered_write_unlock(rwlock_t * rwlock_ptr)
2166 +{
2167 +       int index;
2168 +       int cpu;
2169 +       int write_index;
2170 +       uint32_t hold_time;
2171 +
2172 +       if (lstat_control.state == LSTAT_OFF) {
2173 +               _raw_write_unlock(rwlock_ptr);
2174 +               return;
2175 +       }
2176 +
2177 +       cpu = THIS_CPU_NUMBER;
2178 +       index = GET_RWINDEX(rwlock_ptr);
2179 +
2180 +       /* update statistics if stats enabled for this lock */
2181 +       if (index > 0) {
2182 +               write_index =
2183 +                   (*lstat_control.read_lock_counts[cpu])[index].write_index;
2184 +
2185 +               hold_time = get_cycles() -
2186 +                       (*lstat_control.counts[cpu])[write_index].acquire_time;
2187 +               (*lstat_control.counts[cpu])[write_index].cum_hold_ticks +=
2188 +                       (uint64_t) hold_time;
2189 +               if ((*lstat_control.counts[cpu])[write_index].max_hold_ticks <
2190 +                               hold_time)
2191 +                       (*lstat_control.counts[cpu])[write_index].
2192 +                               max_hold_ticks = hold_time;
2193 +       }
2194 +       _raw_write_unlock(rwlock_ptr);
2195 +}
2196 +
2197 +int
2198 +_metered_write_trylock(rwlock_t * rwlock_ptr, void *caller_pc)
2199 +{
2200 +       int retval;
2201 +       void *this_pc = LSTAT_RA(LSTAT_RA_WRITE);
2202 +
2203 +       if ((retval = _raw_write_trylock(rwlock_ptr))) {
2204 +               lstat_update(rwlock_ptr, this_pc, LSTAT_ACT_NO_WAIT);
2205 +       } else {
2206 +               lstat_update(rwlock_ptr, this_pc, LSTAT_ACT_REJECT);
2207 +       }
2208 +
2209 +       return retval;
2210 +}
2211 +
2212 +static void
2213 +init_control_space(void)
2214 +{
2215 +       /* Set all control space pointers to null and indices to "empty" */
2216 +       int cpu;
2217 +
2218 +       /*
2219 +        * Access CPU_CYCLE_FREQUENCY at the outset, which in some
2220 +        * architectures may trigger a runtime calculation that uses a
2221 +        * spinlock.  Let's do this before lockmetering is turned on.
2222 +        */
2223 +       if (CPU_CYCLE_FREQUENCY == 0)
2224 +               BUG();
2225 +
2226 +       lstat_control.hashtab = NULL;
2227 +       lstat_control.dir = NULL;
2228 +       for (cpu = 0; cpu < NR_CPUS; cpu++) {
2229 +               lstat_control.counts[cpu] = NULL;
2230 +               lstat_control.read_lock_counts[cpu] = NULL;
2231 +       }
2232 +}
2233 +
2234 +static int
2235 +reset_lstat_data(void)
2236 +{
2237 +       int cpu, flags;
2238 +
2239 +       flags = 0;
2240 +       lstat_control.next_free_dir_index = 1;  /* 0 is for overflows */
2241 +       lstat_control.next_free_read_lock_index = 1;
2242 +       lstat_control.dir_overflow = 0;
2243 +       lstat_control.rwlock_overflow = 0;
2244 +
2245 +       lstat_control.started_cycles64 = 0;
2246 +       lstat_control.ending_cycles64 = 0;
2247 +       lstat_control.enabled_cycles64 = 0;
2248 +       lstat_control.first_started_time = 0;
2249 +       lstat_control.started_time = 0;
2250 +       lstat_control.ending_time = 0;
2251 +       lstat_control.intervals = 0;
2252 +
2253 +       /*
2254 +        * paranoia -- in case someone does a "lockstat reset" before
2255 +        * "lockstat on"
2256 +        */
2257 +       if (lstat_control.hashtab) {
2258 +               bzero(lstat_control.hashtab,
2259 +                       LSTAT_HASH_TABLE_SIZE * sizeof (short));
2260 +               bzero(lstat_control.dir, LSTAT_MAX_STAT_INDEX *
2261 +                               sizeof (lstat_directory_entry_t));
2262 +
2263 +               for (cpu = 0; cpu < num_online_cpus(); cpu++) {
2264 +                       bzero(lstat_control.counts[cpu],
2265 +                               sizeof (lstat_cpu_counts_t));
2266 +                       bzero(lstat_control.read_lock_counts[cpu],
2267 +                               sizeof (lstat_read_lock_cpu_counts_t));
2268 +               }
2269 +       }
2270 +#ifdef NOTDEF
2271 +       _raw_spin_unlock(&lstat_control.directory_lock);
2272 +       local_irq_restore(flags);
2273 +#endif
2274 +       return 1;
2275 +}
2276 +
2277 +static void
2278 +release_control_space(void)
2279 +{
2280 +       /*
2281 +        * Called either (1) when an allocation of kmem fails,
2282 +        * or (2) when the user writes LSTAT_RELEASE to /proc/lockmeter.
2283 +        * Assume that all pointers have been initialized to zero,
2284 +        * i.e., nonzero pointers are valid addresses.
2285 +        */
2286 +       int cpu;
2287 +
2288 +       if (lstat_control.hashtab) {
2289 +               kfree(lstat_control.hashtab);
2290 +               lstat_control.hashtab = NULL;
2291 +       }
2292 +
2293 +       if (lstat_control.dir) {
2294 +               vfree(lstat_control.dir);
2295 +               lstat_control.dir = NULL;
2296 +       }
2297 +
2298 +       for (cpu = 0; cpu < NR_CPUS; cpu++) {
2299 +               if (lstat_control.counts[cpu]) {
2300 +                       vfree(lstat_control.counts[cpu]);
2301 +                       lstat_control.counts[cpu] = NULL;
2302 +               }
2303 +               if (lstat_control.read_lock_counts[cpu]) {
2304 +                       kfree(lstat_control.read_lock_counts[cpu]);
2305 +                       lstat_control.read_lock_counts[cpu] = NULL;
2306 +               }
2307 +       }
2308 +}
2309 +
2310 +int
2311 +get_lockmeter_info_size(void)
2312 +{
2313 +       return sizeof (lstat_user_request_t)
2314 +               + num_online_cpus() * sizeof (lstat_cpu_counts_t)
2315 +               + num_online_cpus() * sizeof (lstat_read_lock_cpu_counts_t)
2316 +               + (LSTAT_MAX_STAT_INDEX * sizeof (lstat_directory_entry_t));
2317 +}
2318 +
2319 +ssize_t
2320 +get_lockmeter_info(char *buffer, size_t max_len, loff_t * last_index)
2321 +{
2322 +       lstat_user_request_t req;
2323 +       struct timeval tv;
2324 +       ssize_t next_ret_bcount;
2325 +       ssize_t actual_ret_bcount = 0;
2326 +       int cpu;
2327 +
2328 +       *last_index = 0;        /* a one-shot read */
2329 +
2330 +       req.lstat_version = LSTAT_VERSION;
2331 +       req.state = lstat_control.state;
2332 +       req.maxcpus = num_online_cpus();
2333 +       req.cycleval = CPU_CYCLE_FREQUENCY;
2334 +#ifdef notyet
2335 +       req.kernel_magic_addr = (void *) &_etext;
2336 +       req.kernel_end_addr = (void *) &_etext;
2337 +#endif
2338 +       req.uts = system_utsname;
2339 +       req.intervals = lstat_control.intervals;
2340 +
2341 +       req.first_started_time = lstat_control.first_started_time;
2342 +       req.started_time = lstat_control.started_time;
2343 +       req.started_cycles64 = lstat_control.started_cycles64;
2344 +
2345 +       req.next_free_dir_index = lstat_control.next_free_dir_index;
2346 +       req.next_free_read_lock_index = lstat_control.next_free_read_lock_index;
2347 +       req.dir_overflow = lstat_control.dir_overflow;
2348 +       req.rwlock_overflow = lstat_control.rwlock_overflow;
2349 +
2350 +       if (lstat_control.state == LSTAT_OFF) {
2351 +               if (req.intervals == 0) {
2352 +                       /* measurement is off and no valid data present */
2353 +                       next_ret_bcount = sizeof (lstat_user_request_t);
2354 +                       req.enabled_cycles64 = 0;
2355 +
2356 +                       if ((actual_ret_bcount + next_ret_bcount) > max_len)
2357 +                               return actual_ret_bcount;
2358 +
2359 +                       copy_to_user(buffer, (void *) &req, next_ret_bcount);
2360 +                       actual_ret_bcount += next_ret_bcount;
2361 +                       return actual_ret_bcount;
2362 +               } else {
2363 +                       /*
2364 +                        * measurement is off but valid data present
2365 +                        * fetch time info from lstat_control
2366 +                        */
2367 +                       req.ending_time = lstat_control.ending_time;
2368 +                       req.ending_cycles64 = lstat_control.ending_cycles64;
2369 +                       req.enabled_cycles64 = lstat_control.enabled_cycles64;
2370 +               }
2371 +       } else {
2372 +               /*
2373 +                * this must be a read while data active--use current time,
2374 +                * etc
2375 +                */
2376 +               do_gettimeofday(&tv);
2377 +               req.ending_time = tv.tv_sec;
2378 +               req.ending_cycles64 = get_cycles64();
2379 +               req.enabled_cycles64 = req.ending_cycles64 -
2380 +                       req.started_cycles64 + lstat_control.enabled_cycles64;
2381 +       }
2382 +
2383 +       next_ret_bcount = sizeof (lstat_user_request_t);
2384 +       if ((actual_ret_bcount + next_ret_bcount) > max_len)
2385 +               return actual_ret_bcount;
2386 +
2387 +       copy_to_user(buffer, (void *) &req, next_ret_bcount);
2388 +       actual_ret_bcount += next_ret_bcount;
2389 +
2390 +       if (!lstat_control.counts[0])   /* not initialized? */
2391 +               return actual_ret_bcount;
2392 +
2393 +       next_ret_bcount = sizeof (lstat_cpu_counts_t);
2394 +       for (cpu = 0; cpu < num_online_cpus(); cpu++) {
2395 +               if ((actual_ret_bcount + next_ret_bcount) > max_len)
2396 +                       return actual_ret_bcount;       /* leave early */
2397 +               copy_to_user(buffer + actual_ret_bcount,
2398 +                               lstat_control.counts[cpu], next_ret_bcount);
2399 +               actual_ret_bcount += next_ret_bcount;
2400 +       }
2401 +
2402 +       next_ret_bcount = LSTAT_MAX_STAT_INDEX *
2403 +                       sizeof (lstat_directory_entry_t);
2404 +       if (((actual_ret_bcount + next_ret_bcount) > max_len)
2405 +                       || !lstat_control.dir)
2406 +               return actual_ret_bcount;       /* leave early */
2407 +
2408 +       copy_to_user(buffer + actual_ret_bcount, lstat_control.dir,
2409 +                       next_ret_bcount);
2410 +       actual_ret_bcount += next_ret_bcount;
2411 +
2412 +       next_ret_bcount = sizeof (lstat_read_lock_cpu_counts_t);
2413 +       for (cpu = 0; cpu < num_online_cpus(); cpu++) {
2414 +               if (actual_ret_bcount + next_ret_bcount > max_len)
2415 +                       return actual_ret_bcount;
2416 +               copy_to_user(buffer + actual_ret_bcount,
2417 +                               lstat_control.read_lock_counts[cpu],
2418 +                               next_ret_bcount);
2419 +               actual_ret_bcount += next_ret_bcount;
2420 +       }
2421 +
2422 +       return actual_ret_bcount;
2423 +}
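
When the read buffer is large enough and the counters have been allocated, the blob returned above has a fixed layout: one lstat_user_request_t header, then maxcpus copies of lstat_cpu_counts_t, then LSTAT_MAX_STAT_INDEX directory entries, then maxcpus copies of lstat_read_lock_cpu_counts_t. A hedged user-space sketch of walking that layout (it assumes the lockmeter header is available for compilation and that "buf" holds a complete read):

/* sketch only: walk a complete get_lockmeter_info() buffer */
static void parse_lockmeter_blob(const char *buf)
{
	const lstat_user_request_t *req = (const lstat_user_request_t *) buf;
	const char *p = buf + sizeof(*req);

	const lstat_cpu_counts_t *counts = (const lstat_cpu_counts_t *) p;
	p += req->maxcpus * sizeof(lstat_cpu_counts_t);

	const lstat_directory_entry_t *dir = (const lstat_directory_entry_t *) p;
	p += LSTAT_MAX_STAT_INDEX * sizeof(lstat_directory_entry_t);

	const lstat_read_lock_cpu_counts_t *read_counts =
			(const lstat_read_lock_cpu_counts_t *) p;

	/* e.g. (*counts)[i] are cpu 0's counters for directory entry i */
	(void) dir;
	(void) read_counts;
}
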
2424 +
2425 +/*
2426 + *  Writing to the /proc lockmeter node enables or disables metering,
2427 + *  based upon the first byte of the "written" data.
2428 + *  The following values are defined:
2429 + *  LSTAT_ON: 1st call: allocates storage, initializes and turns on measurement
2430 + *            subsequent calls just turn on measurement
2431 + *  LSTAT_OFF: turns off measurement
2432 + *  LSTAT_RESET: resets statistics
2433 + *  LSTAT_RELEASE: releases statistics storage
2434 + *
2435 + *  This allows one to accumulate statistics over several lockstat runs:
2436 + *
2437 + *  lockstat on
2438 + *  lockstat off
2439 + *  ...repeat above as desired...
2440 + *  lockstat get
2441 + *  ...now start a new set of measurements...
2442 + *  lockstat reset
2443 + *  lockstat on
2444 + *  ...
2445 + *
2446 + */
2447 +ssize_t
2448 +put_lockmeter_info(const char *buffer, size_t len)
2449 +{
2450 +       int error = 0;
2451 +       int dirsize, countsize, read_lock_countsize, hashsize;
2452 +       int cpu;
2453 +       char put_char;
2454 +       int i, read_lock_blocks;
2455 +       unsigned long flags;
2456 +       rwlock_t *lock_ptr;
2457 +       struct timeval tv;
2458 +
2459 +       if (len <= 0)
2460 +               return -EINVAL;
2461 +
2462 +       _raw_spin_lock(&lstat_control.control_lock);
2463 +
2464 +       get_user(put_char, buffer);
2465 +       switch (put_char) {
2466 +
2467 +       case LSTAT_OFF:
2468 +               if (lstat_control.state != LSTAT_OFF) {
2469 +                       /*
2470 +                        * To avoid seeing read lock hold times in an
2471 +                        * inconsistent state, we have to follow this protocol
2472 +                        * to turn off statistics
2473 +                        */
2474 +                       local_irq_save(flags);
2475 +                       /*
2476 +                        * getting this lock will stop any read lock block
2477 +                        * allocations
2478 +                        */
2479 +                       _raw_spin_lock(&lstat_control.directory_lock);
2480 +                       /*
2481 +                        * keep any more read lock blocks from being
2482 +                        * allocated
2483 +                        */
2484 +                       lstat_control.state = LSTAT_OFF;
2485 +                       /* record how many read lock blocks there are */
2486 +                       read_lock_blocks =
2487 +                               lstat_control.next_free_read_lock_index;
2488 +                       _raw_spin_unlock(&lstat_control.directory_lock);
2489 +                       /* now go through the list of read locks */
2490 +                       cpu = THIS_CPU_NUMBER;
2491 +                       for (i = 1; i < read_lock_blocks; i++) {
2492 +                               lock_ptr =
2493 +                                   (*lstat_control.read_lock_counts[cpu])[i].
2494 +                                   lock_ptr;
2495 +                               /* is this saved lock address still valid? */
2496 +                               if (GET_RWINDEX(lock_ptr) == i) {
2497 +                                       /*
2498 +                                        * valid.  Because we only hold one
2499 +                                        * lock at a time, this can't cause a
2500 +                                        * at a time, this can't cause a
2501 +                                        * deadlock unless this is a lock held
2502 +                                        * as part of the current system call
2503 +                                        * path. At the moment there
2504 +                                        * are no READ mode locks held to get
2505 +                                        * here from user space, so we solve
2506 +                                        * this by skipping locks held in
2507 +                                        * write mode.
2508 +                                        */
2509 +                                       if (RWLOCK_IS_WRITE_LOCKED(lock_ptr)) {
2510 +                                               PUT_RWINDEX(lock_ptr, 0);
2511 +                                               continue;
2512 +                                       }
2513 +                                       /*
2514 +                                        * now we know there are no read
2515 +                                        * holders of this lock! stop
2516 +                                        * statistics collection for this
2517 +                                        * lock
2518 +                                        */
2519 +                                       _raw_write_lock(lock_ptr);
2520 +                                       PUT_RWINDEX(lock_ptr, 0);
2521 +                                       _raw_write_unlock(lock_ptr);
2522 +                               }
2523 +                               /*
2524 +                                * it may still be possible for the hold time
2525 +                                * sum to be negative, e.g. if a lock is
2526 +                                * reallocated while "busy"; we will have to fix
2527 +                                * this up in the data reduction program.
2528 +                                */
2529 +                       }
2530 +                       local_irq_restore(flags);
2531 +                       lstat_control.intervals++;
2532 +                       lstat_control.ending_cycles64 = get_cycles64();
2533 +                       lstat_control.enabled_cycles64 +=
2534 +                               lstat_control.ending_cycles64 -
2535 +                               lstat_control.started_cycles64;
2536 +                       do_gettimeofday(&tv);
2537 +                       lstat_control.ending_time = tv.tv_sec;
2538 +                       /*
2539 +                        * don't deallocate the structures -- we may do a
2540 +                        * lockstat on to add to the data that is already
2541 +                        * there. Use LSTAT_RELEASE to release storage
2542 +                        */
2543 +               } else {
2544 +                       error = -EBUSY; /* already OFF */
2545 +               }
2546 +               break;
2547 +
2548 +       case LSTAT_ON:
2549 +               if (lstat_control.state == LSTAT_OFF) {
2550 +#ifdef DEBUG_LOCKMETER
2551 +                       printk("put_lockmeter_info(cpu=%d): LSTAT_ON\n",
2552 +                               THIS_CPU_NUMBER);
2553 +#endif
2554 +                       lstat_control.next_free_dir_index = 1;  /* 0 is for overflows */
2555 +
2556 +                       dirsize = LSTAT_MAX_STAT_INDEX *
2557 +                                       sizeof (lstat_directory_entry_t);
2558 +                       hashsize =
2559 +                               (1 + LSTAT_HASH_TABLE_SIZE) * sizeof (ushort);
2560 +                       countsize = sizeof (lstat_cpu_counts_t);
2561 +                       read_lock_countsize =
2562 +                               sizeof (lstat_read_lock_cpu_counts_t);
2563 +#ifdef DEBUG_LOCKMETER
2564 +                       printk(" dirsize:%d", dirsize);
2565 +                       printk(" hashsize:%d", hashsize);
2566 +                       printk(" countsize:%d", countsize);
2567 +                       printk(" read_lock_countsize:%d\n",
2568 +                               read_lock_countsize);
2569 +#endif
2570 +#ifdef DEBUG_LOCKMETER
2571 +                       {
2572 +                               int secs;
2573 +                               unsigned long cycles;
2574 +                               uint64_t cycles64;
2575 +
2576 +                               do_gettimeofday(&tv);
2577 +                               secs = tv.tv_sec;
2578 +                               do {
2579 +                                       do_gettimeofday(&tv);
2580 +                               } while (secs == tv.tv_sec);
2581 +                               cycles = get_cycles();
2582 +                               cycles64 = get_cycles64();
2583 +                               secs = tv.tv_sec;
2584 +                               do {
2585 +                                       do_gettimeofday(&tv);
2586 +                               } while (secs == tv.tv_sec);
2587 +                               cycles = get_cycles() - cycles;
2588 +                               cycles64 = get_cycles64() - cycles64;
2589 +                               printk("lockmeter: cycleFrequency:%d "
2590 +                                       "cycles:%lu cycles64:%llu\n", CPU_CYCLE_FREQUENCY,
2591 +                                       cycles, (unsigned long long)cycles64);
2592 +                       }
2593 +#endif
2594 +
2595 +                       /*
2596 +                        * if this is the first call, allocate storage and
2597 +                        * initialize
2598 +                        */
2599 +                       if (!lstat_control.hashtab) {
2600 +
2601 +                               spin_lock_init(&lstat_control.directory_lock);
2602 +
2603 +                               /* guarantee all pointers at zero */
2604 +                               init_control_space();
2605 +
2606 +                               lstat_control.hashtab =
2607 +                                   kmalloc(hashsize, GFP_KERNEL);
2608 +                               if (!lstat_control.hashtab) {
2609 +                                       error = -ENOSPC;
2610 +#ifdef DEBUG_LOCKMETER
2611 +                                       printk("!!error kmalloc of hashtab\n");
2612 +#endif
2613 +                               }
2614 +                               lstat_control.dir = vmalloc(dirsize);
2615 +                               if (!lstat_control.dir) {
2616 +                                       error = -ENOSPC;
2617 +#ifdef DEBUG_LOCKMETER
2618 +                                       printk("!!error vmalloc of dir\n");
2619 +#endif
2620 +                               }
2621 +
2622 +                               for (cpu = 0; cpu < num_online_cpus(); cpu++) {
2623 +                                       lstat_control.counts[cpu] =
2624 +                                               vmalloc(countsize);
2625 +                                       if (!lstat_control.counts[cpu]) {
2626 +                                               error = -ENOSPC;
2627 +#ifdef DEBUG_LOCKMETER
2628 +                                               printk("!!error vmalloc of "
2629 +                                                       "counts[%d]\n", cpu);
2630 +#endif
2631 +                                       }
2632 +                                       lstat_control.read_lock_counts[cpu] =
2633 +                                               (lstat_read_lock_cpu_counts_t *)
2634 +                                               kmalloc(read_lock_countsize,
2635 +                                                       GFP_KERNEL);
2636 +                                       if (!lstat_control.
2637 +                                                       read_lock_counts[cpu]) {
2638 +                                               error = -ENOSPC;
2639 +#ifdef DEBUG_LOCKMETER
2640 +                                               printk("!!error kmalloc of "
2641 +                                                 "read_lock_counts[%d]\n",
2642 +                                                       cpu);
2643 +#endif
2644 +                                       }
2645 +                               }
2646 +                       }
2647 +
2648 +                       if (error) {
2649 +                               /*
2650 +                                * One or more kmalloc failures -- free
2651 +                                * everything
2652 +                                */
2653 +                               release_control_space();
2654 +                       } else {
2655 +
2656 +                               if (!reset_lstat_data()) {
2657 +                                       error = -EINVAL;
2658 +                                       break;
2659 +                               };
2660 +
2661 +                               /*
2662 +                                * record starting and ending times and the
2663 +                                * like
2664 +                                */
2665 +                               if (lstat_control.intervals == 0) {
2666 +                                       do_gettimeofday(&tv);
2667 +                                       lstat_control.first_started_time =
2668 +                                               tv.tv_sec;
2669 +                               }
2670 +                               lstat_control.started_cycles64 = get_cycles64();
2671 +                               do_gettimeofday(&tv);
2672 +                               lstat_control.started_time = tv.tv_sec;
2673 +
2674 +                               lstat_control.state = LSTAT_ON;
2675 +                       }
2676 +               } else {
2677 +                       error = -EBUSY; /* already ON */
2678 +               }
2679 +               break;
2680 +
2681 +       case LSTAT_RESET:
2682 +               if (lstat_control.state == LSTAT_OFF) {
2683 +                       if (!reset_lstat_data())
2684 +                               error = -EINVAL;
2685 +               } else {
2686 +                       error = -EBUSY; /* still on; can't reset */
2687 +               }
2688 +               break;
2689 +
2690 +       case LSTAT_RELEASE:
2691 +               if (lstat_control.state == LSTAT_OFF) {
2692 +                       release_control_space();
2693 +                       lstat_control.intervals = 0;
2694 +                       lstat_control.enabled_cycles64 = 0;
2695 +               } else {
2696 +                       error = -EBUSY;
2697 +               }
2698 +               break;
2699 +
2700 +       default:
2701 +               error = -EINVAL;
2702 +       }                       /* switch */
2703 +
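+       /*
+        * the control_lock is handled with the _raw_ primitives directly,
+        * presumably so the metering code does not show up in its own
+        * statistics
+        */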
2704 +       _raw_spin_unlock(&lstat_control.control_lock);
2705 +       return error ? error : len;
2706 +}
2707 +
2708 +#ifdef USER_MODE_TESTING
2709 +/* following used for user mode testing */
2710 +void
2711 +lockmeter_init()
2712 +{
2713 +       int dirsize, hashsize, countsize, read_lock_countsize, cpu;
2714 +
2715 +       printf("lstat_control is at %p size=%d\n", (void *) &lstat_control,
2716 +               (int) sizeof (lstat_control));
2717 +       printf("sizeof(spinlock_t)=%d\n", (int) sizeof (spinlock_t));
2718 +       lstat_control.state = LSTAT_ON;
2719 +
2720 +       lstat_control.directory_lock = SPIN_LOCK_UNLOCKED;
2721 +       lstat_control.next_free_dir_index = 1;  /* 0 is for overflows */
2722 +       lstat_control.next_free_read_lock_index = 1;
2723 +
2724 +       dirsize = LSTAT_MAX_STAT_INDEX * sizeof (lstat_directory_entry_t);
2725 +       hashsize = (1 + LSTAT_HASH_TABLE_SIZE) * sizeof (ushort);
2726 +       countsize = sizeof (lstat_cpu_counts_t);
2727 +       read_lock_countsize = sizeof (lstat_read_lock_cpu_counts_t);
2728 +
2729 +       lstat_control.hashtab = (ushort *) malloc(hashsize);
2730 +
2731 +       if (lstat_control.hashtab == 0) {
2732 +               printf("malloc failure at line %d in lockmeter.c\n",
2733 +                       __LINE__);
2734 +               exit(0);
2735 +       }
2736 +
2737 +       lstat_control.dir = (lstat_directory_entry_t *) malloc(dirsize);
2738 +
2739 +       if (lstat_control.dir == 0) {
2740 +               printf("malloc failure at line %d in lockmeter.c\n",
2741 +                       __LINE__);
2742 +               exit(0);
2743 +       }
2744 +
2745 +       for (cpu = 0; cpu < num_online_cpus(); cpu++) {
2746 +               lstat_control.counts[cpu] =
2747 +                       (lstat_cpu_counts_t *) malloc(countsize);
2748 +               lstat_control.read_lock_counts[cpu] =
2749 +                       (lstat_read_lock_cpu_counts_t *)
2750 +                       malloc(read_lock_countsize);
2751 +               if (!lstat_control.counts[cpu] ||
2752 +                   !lstat_control.read_lock_counts[cpu]) {
2753 +                       printf("malloc failure for cpu=%d at line %d in "
2754 +                               "lockmeter.c\n", cpu, __LINE__);
2755 +                       exit(0);
2756 +               }
2757 +       }
2758 +
2759 +       memset(lstat_control.hashtab, 0, hashsize);
2760 +       memset(lstat_control.dir, 0, dirsize);
2761 +
2762 +       for (cpu = 0; cpu < num_online_cpus(); cpu++) {
2763 +               memset(lstat_control.counts[cpu], 0, countsize);
2764 +               memset(lstat_control.read_lock_counts[cpu], 0,
2765 +                       read_lock_countsize);
2766 +       }
2767 +}
2768 +
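+/*
+ * user-mode copies of the i386 rwlock slow-path helpers that the
+ * inline lock fastpaths branch to; only built for user-mode testing
+ */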
2769 +asm(" \n\
2770 +.align 4 \n\
2771 +.globl __write_lock_failed \n\
2772 +__write_lock_failed: \n\
2773 +       " LOCK "addl    $" RW_LOCK_BIAS_STR ",(%eax) \n\
2774 +1:     cmpl    $" RW_LOCK_BIAS_STR ",(%eax) \n\
2775 +       jne     1b \n\
2776 +\n\
2777 +       " LOCK "subl    $" RW_LOCK_BIAS_STR ",(%eax) \n\
2778 +       jnz     __write_lock_failed \n\
2779 +       ret \n\
2780 +\n\
2781 +\n\
2782 +.align 4 \n\
2783 +.globl __read_lock_failed \n\
2784 +__read_lock_failed: \n\
2785 +       lock ;  incl    (%eax) \n\
2786 +1:     cmpl    $1,(%eax) \n\
2787 +       js      1b \n\
2788 +\n\
2789 +       lock ;  decl    (%eax) \n\
2790 +       js      __read_lock_failed \n\
2791 +       ret \n\
2792 +");
2793 +#endif
2794 +
2795 +/*
2796 + * these definitions need to match what is in kernel/spinlock.c
2797 + * except that calls to the _raw_ routines are replaced by
2798 + * corresponding calls to the _metered_ routines
2799 + */
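+/*
+ * For illustration (assuming the stock kernel/spinlock.c of this kernel
+ * series), where the unmetered code reads
+ *
+ *     void __lockfunc _spin_lock(spinlock_t *lock)
+ *     {
+ *             preempt_disable();
+ *             _raw_spin_lock(lock);
+ *     }
+ *
+ * the metered version below calls
+ *     _metered_spin_lock(lock, __builtin_return_address(0));
+ * in place of the _raw_spin_lock() call.
+ */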
2800 +
2801 +/*
2802 + * Generic declaration of the raw read_trylock() function,
2803 + * architectures are supposed to optimize this:
2804 + */
2805 +int __lockfunc generic_raw_read_trylock(rwlock_t *lock)
2806 +{
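+       /* generic fallback: take the read lock unconditionally and
+        * report success
+        */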
2807 +       _metered_read_lock(lock, __builtin_return_address(0));
2808 +       return 1;
2809 +}
2810 +EXPORT_SYMBOL(generic_raw_read_trylock);
2811 +
2812 +int __lockfunc _spin_trylock(spinlock_t *lock)
2813 +{
2814 +       preempt_disable();
2815 +       if (_metered_spin_trylock(lock, __builtin_return_address(0)))
2816 +               return 1;
2817 +
2818 +       preempt_enable();
2819 +       return 0;
2820 +}
2821 +EXPORT_SYMBOL(_spin_trylock);
2822 +
2823 +int __lockfunc _write_trylock(rwlock_t *lock)
2824 +{
2825 +       preempt_disable();
2826 +       if (_metered_write_trylock(lock, __builtin_return_address(0)))
2827 +               return 1;
2828 +
2829 +       preempt_enable();
2830 +       return 0;
2831 +}
2832 +EXPORT_SYMBOL(_write_trylock);
2833 +
2834 +#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)
2835 +/*
2836 + * This could be a long-held lock.  If another CPU holds it for a long time,
2837 + * and that CPU is not asked to reschedule then *this* CPU will spin on the
2838 + * lock for a long time, even if *this* CPU is asked to reschedule.
2839 + *
2840 + * So what we do here, in the slow (contended) path is to spin on the lock by
2841 + * hand while permitting preemption.
2842 + *
2843 + * Called inside preempt_disable().
2844 + */
2845 +static inline void __preempt_spin_lock(spinlock_t *lock, void *caller_pc)
2846 +{
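+       /* already inside a nested preempt-off region: the preemptible
+        * spin below would not help, so just take the lock
+        */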
2847 +       if (preempt_count() > 1) {
2848 +               _metered_spin_lock(lock, caller_pc);
2849 +               return;
2850 +       }
2851 +
2852 +       do {
2853 +               preempt_enable();
2854 +               while (spin_is_locked(lock))
2855 +                       cpu_relax();
2856 +               preempt_disable();
2857 +       } while (!_metered_spin_trylock(lock, caller_pc));
2858 +}
2859 +
2860 +void __lockfunc _spin_lock(spinlock_t *lock)
2861 +{
2862 +       preempt_disable();
2863 +       if (unlikely(!_metered_spin_trylock(lock, __builtin_return_address(0))))
2864 +               __preempt_spin_lock(lock, __builtin_return_address(0));
2865 +}
2866 +
2867 +static inline void __preempt_write_lock(rwlock_t *lock, void *caller_pc)
2868 +{
2869 +       if (preempt_count() > 1) {
2870 +               _metered_write_lock(lock, caller_pc);
2871 +               return;
2872 +       }
2873 +
2874 +       do {
2875 +               preempt_enable();
2876 +               while (rwlock_is_locked(lock))
2877 +                       cpu_relax();
2878 +               preempt_disable();
2879 +       } while (!_metered_write_trylock(lock, caller_pc));
2880 +}
2881 +
2882 +void __lockfunc _write_lock(rwlock_t *lock)
2883 +{
2884 +       preempt_disable();
2885 +       if (unlikely(!_metered_write_trylock(lock, __builtin_return_address(0))))
2886 +               __preempt_write_lock(lock, __builtin_return_address(0));
2887 +}
2888 +#else
2889 +void __lockfunc _spin_lock(spinlock_t *lock)
2890 +{
2891 +       preempt_disable();
2892 +       _metered_spin_lock(lock, __builtin_return_address(0));
2893 +}
2894 +
2895 +void __lockfunc _write_lock(rwlock_t *lock)
2896 +{
2897 +       preempt_disable();
2898 +       _metered_write_lock(lock, __builtin_return_address(0));
2899 +}
2900 +#endif
2901 +EXPORT_SYMBOL(_spin_lock);
2902 +EXPORT_SYMBOL(_write_lock);
2903 +
2904 +void __lockfunc _read_lock(rwlock_t *lock)
2905 +{
2906 +       preempt_disable();
2907 +       _metered_read_lock(lock, __builtin_return_address(0));
2908 +}
2909 +EXPORT_SYMBOL(_read_lock);
2910 +
2911 +void __lockfunc _spin_unlock(spinlock_t *lock)
2912 +{
2913 +       _metered_spin_unlock(lock);
2914 +       preempt_enable();
2915 +}
2916 +EXPORT_SYMBOL(_spin_unlock);
2917 +
2918 +void __lockfunc _write_unlock(rwlock_t *lock)
2919 +{
2920 +       _metered_write_unlock(lock);
2921 +       preempt_enable();
2922 +}
2923 +EXPORT_SYMBOL(_write_unlock);
2924 +
2925 +void __lockfunc _read_unlock(rwlock_t *lock)
2926 +{
2927 +       _metered_read_unlock(lock);
2928 +       preempt_enable();
2929 +}
2930 +EXPORT_SYMBOL(_read_unlock);
2931 +
2932 +unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
2933 +{
2934 +       unsigned long flags;
2935 +
2936 +       local_irq_save(flags);
2937 +       preempt_disable();
2938 +       _metered_spin_lock_flags(lock, flags, __builtin_return_address(0));
2939 +       return flags;
2940 +}
2941 +EXPORT_SYMBOL(_spin_lock_irqsave);
2942 +
2943 +void __lockfunc _spin_lock_irq(spinlock_t *lock)
2944 +{
2945 +       local_irq_disable();
2946 +       preempt_disable();
2947 +       _metered_spin_lock(lock, __builtin_return_address(0));
2948 +}
2949 +EXPORT_SYMBOL(_spin_lock_irq);
2950 +
2951 +void __lockfunc _spin_lock_bh(spinlock_t *lock)
2952 +{
2953 +       local_bh_disable();
2954 +       preempt_disable();
2955 +       _metered_spin_lock(lock, __builtin_return_address(0));
2956 +}
2957 +EXPORT_SYMBOL(_spin_lock_bh);
2958 +
2959 +unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
2960 +{
2961 +       unsigned long flags;
2962 +
2963 +       local_irq_save(flags);
2964 +       preempt_disable();
2965 +       _metered_read_lock(lock, __builtin_return_address(0));
2966 +       return flags;
2967 +}
2968 +EXPORT_SYMBOL(_read_lock_irqsave);
2969 +
2970 +void __lockfunc _read_lock_irq(rwlock_t *lock)
2971 +{
2972 +       local_irq_disable();
2973 +       preempt_disable();
2974 +       _metered_read_lock(lock, __builtin_return_address(0));
2975 +}
2976 +EXPORT_SYMBOL(_read_lock_irq);
2977 +
2978 +void __lockfunc _read_lock_bh(rwlock_t *lock)
2979 +{
2980 +       local_bh_disable();
2981 +       preempt_disable();
2982 +       _metered_read_lock(lock, __builtin_return_address(0));
2983 +}
2984 +EXPORT_SYMBOL(_read_lock_bh);
2985 +
2986 +unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
2987 +{
2988 +       unsigned long flags;
2989 +
2990 +       local_irq_save(flags);
2991 +       preempt_disable();
2992 +       _metered_write_lock(lock, __builtin_return_address(0));
2993 +       return flags;
2994 +}
2995 +EXPORT_SYMBOL(_write_lock_irqsave);
2996 +
2997 +void __lockfunc _write_lock_irq(rwlock_t *lock)
2998 +{
2999 +       local_irq_disable();
3000 +       preempt_disable();
3001 +       _metered_write_lock(lock, __builtin_return_address(0));
3002 +}
3003 +EXPORT_SYMBOL(_write_lock_irq);
3004 +
3005 +void __lockfunc _write_lock_bh(rwlock_t *lock)
3006 +{
3007 +       local_bh_disable();
3008 +       preempt_disable();
3009 +       _metered_write_lock(lock, __builtin_return_address(0));
3010 +}
3011 +EXPORT_SYMBOL(_write_lock_bh);
3012 +
3013 +void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
3014 +{
3015 +       _metered_spin_unlock(lock);
3016 +       local_irq_restore(flags);
3017 +       preempt_enable();
3018 +}
3019 +EXPORT_SYMBOL(_spin_unlock_irqrestore);
3020 +
3021 +void __lockfunc _spin_unlock_irq(spinlock_t *lock)
3022 +{
3023 +       _metered_spin_unlock(lock);
3024 +       local_irq_enable();
3025 +       preempt_enable();
3026 +}
3027 +EXPORT_SYMBOL(_spin_unlock_irq);
3028 +
3029 +void __lockfunc _spin_unlock_bh(spinlock_t *lock)
3030 +{
3031 +       _metered_spin_unlock(lock);
3032 +       preempt_enable();
3033 +       local_bh_enable();
3034 +}
3035 +EXPORT_SYMBOL(_spin_unlock_bh);
3036 +
3037 +void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
3038 +{
3039 +       _metered_read_unlock(lock);
3040 +       local_irq_restore(flags);
3041 +       preempt_enable();
3042 +}
3043 +EXPORT_SYMBOL(_read_unlock_irqrestore);
3044 +
3045 +void __lockfunc _read_unlock_irq(rwlock_t *lock)
3046 +{
3047 +       _metered_read_unlock(lock);
3048 +       local_irq_enable();
3049 +       preempt_enable();
3050 +}
3051 +EXPORT_SYMBOL(_read_unlock_irq);
3052 +
3053 +void __lockfunc _read_unlock_bh(rwlock_t *lock)
3054 +{
3055 +       _metered_read_unlock(lock);
3056 +       preempt_enable();
3057 +       local_bh_enable();
3058 +}
3059 +EXPORT_SYMBOL(_read_unlock_bh);
3060 +
3061 +void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
3062 +{
3063 +       _metered_write_unlock(lock);
3064 +       local_irq_restore(flags);
3065 +       preempt_enable();
3066 +}
3067 +EXPORT_SYMBOL(_write_unlock_irqrestore);
3068 +
3069 +void __lockfunc _write_unlock_irq(rwlock_t *lock)
3070 +{
3071 +       _metered_write_unlock(lock);
3072 +       local_irq_enable();
3073 +       preempt_enable();
3074 +}
3075 +EXPORT_SYMBOL(_write_unlock_irq);
3076 +
3077 +void __lockfunc _write_unlock_bh(rwlock_t *lock)
3078 +{
3079 +       _metered_write_unlock(lock);
3080 +       preempt_enable();
3081 +       local_bh_enable();
3082 +}
3083 +EXPORT_SYMBOL(_write_unlock_bh);
3084 +
3085 +int __lockfunc _spin_trylock_bh(spinlock_t *lock)
3086 +{
3087 +       local_bh_disable();
3088 +       preempt_disable();
3089 +       if (_metered_spin_trylock(lock, __builtin_return_address(0)))
3090 +               return 1;
3091 +
3092 +       preempt_enable();
3093 +       local_bh_enable();
3094 +       return 0;
3095 +}
3096 +EXPORT_SYMBOL(_spin_trylock_bh);