From 3a4d78df7eb9a45b4ed4b6873cff6ed3a3a4e52c Mon Sep 17 00:00:00 2001 From: ericm Date: Fri, 27 Oct 2006 18:12:58 +0000 Subject: [PATCH] branch: b1_8 merge from b1_5 (20061027_1139) --- .../patches/new-tcp-zero-copy-2.4.29-vanilla.patch | 307 +++++ .../new-tcp-zero-copy-2.6.9-41.2chaos.patch | 318 +++++ .../patches/quota-deadlock-on-pagelock-core.patch | 1264 ++++++++++++++++++++ .../patches/quota-deadlock-on-pagelock-ext3.patch | 273 +++++ .../patches/quota-umount-race-fix.patch | 139 +++ lustre/tests/flocks_test.c | 62 + 6 files changed, 2363 insertions(+) create mode 100644 lustre/kernel_patches/patches/new-tcp-zero-copy-2.4.29-vanilla.patch create mode 100644 lustre/kernel_patches/patches/new-tcp-zero-copy-2.6.9-41.2chaos.patch create mode 100644 lustre/kernel_patches/patches/quota-deadlock-on-pagelock-core.patch create mode 100644 lustre/kernel_patches/patches/quota-deadlock-on-pagelock-ext3.patch create mode 100644 lustre/kernel_patches/patches/quota-umount-race-fix.patch create mode 100644 lustre/tests/flocks_test.c diff --git a/lustre/kernel_patches/patches/new-tcp-zero-copy-2.4.29-vanilla.patch b/lustre/kernel_patches/patches/new-tcp-zero-copy-2.4.29-vanilla.patch new file mode 100644 index 0000000..62e3087 --- /dev/null +++ b/lustre/kernel_patches/patches/new-tcp-zero-copy-2.4.29-vanilla.patch @@ -0,0 +1,307 @@ +--- linux-2.4.29-orig/include/linux/skbuff.h 2006-10-10 01:25:07.000000000 +0100 ++++ linux-2.4.29/include/linux/skbuff.h 2006-10-10 00:42:59.000000000 +0100 +@@ -116,6 +116,36 @@ struct skb_frag_struct + __u16 size; + }; + ++/* Zero Copy Callback Descriptor ++ * This struct supports receiving notification when zero-copy network I/O has ++ * completed. The ZCCD can be embedded in a struct containing the state of a ++ * zero-copy network send. Every skbuff that references that send's pages also ++ * keeps a reference on the ZCCD. 
When they have all been disposed of, the ++ * reference count on the ZCCD drops to zero and the callback is made, telling ++ * the original caller that the pages may now be overwritten. */ ++struct zccd ++{ ++ atomic_t zccd_refcount; ++ void (*zccd_callback)(struct zccd *); ++}; ++ ++static inline void zccd_init (struct zccd *d, void (*callback)(struct zccd *)) ++{ ++ atomic_set (&d->zccd_refcount, 1); ++ d->zccd_callback = callback; ++} ++ ++static inline void zccd_incref (struct zccd *d) /* take a reference */ ++{ ++ atomic_inc (&d->zccd_refcount); ++} ++ ++static inline void zccd_decref (struct zccd *d) /* release a reference */ ++{ ++ if (atomic_dec_and_test (&d->zccd_refcount)) ++ (d->zccd_callback)(d); ++} ++ + /* This data is invariant across clones and lives at + * the end of the header data, ie. at skb->end. + */ +@@ -123,6 +153,11 @@ struct skb_shared_info { + atomic_t dataref; + unsigned int nr_frags; + struct sk_buff *frag_list; ++ struct zccd *zccd1; ++ struct zccd *zccd2; ++ /* NB zero-copy data is normally whole pages. We have 2 zccds in an ++ * skbuff so we don't unneccessarily split the packet where pages fall ++ * into the same packet. */ + skb_frag_t frags[MAX_SKB_FRAGS]; + }; + +@@ -1131,6 +1166,23 @@ static inline void kunmap_skb_frag(void + #endif + } + ++/* This skbuf has dropped its pages: drop refs on any zero-copy callback ++ * descriptors it has. 
*/ ++static inline void skb_complete_zccd (struct sk_buff *skb) ++{ ++ struct skb_shared_info *info = skb_shinfo(skb); ++ ++ if (info->zccd1 != NULL) { ++ zccd_decref(info->zccd1); ++ info->zccd1 = NULL; ++ } ++ ++ if (info->zccd2 != NULL) { ++ zccd_decref(info->zccd2); ++ info->zccd2 = NULL; ++ } ++} ++ + #define skb_queue_walk(queue, skb) \ + for (skb = (queue)->next; \ + (skb != (struct sk_buff *)(queue)); \ +--- linux-2.4.29-orig/include/net/tcp.h 2006-10-10 01:25:07.000000000 +0100 ++++ linux-2.4.29/include/net/tcp.h 2006-10-10 00:43:26.000000000 +0100 +@@ -674,6 +674,8 @@ extern int tcp_v4_tw_remember_stam + + extern int tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size); + extern ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags); ++extern ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size, ++ int flags, struct zccd *zccd); + + extern int tcp_ioctl(struct sock *sk, + int cmd, +--- linux-2.4.29-orig/net/core/skbuff.c 2006-10-10 01:25:08.000000000 +0100 ++++ linux-2.4.29/net/core/skbuff.c 2006-10-10 02:03:49.000000000 +0100 +@@ -208,6 +208,9 @@ struct sk_buff *alloc_skb(unsigned int s + atomic_set(&(skb_shinfo(skb)->dataref), 1); + skb_shinfo(skb)->nr_frags = 0; + skb_shinfo(skb)->frag_list = NULL; ++ skb_shinfo(skb)->zccd1 = NULL; /* zero-copy completion callback */ ++ skb_shinfo(skb)->zccd2 = NULL; /* not required (yet) */ ++ + return skb; + + nodata: +@@ -277,6 +280,9 @@ static void skb_release_data(struct sk_b + { + if (!skb->cloned || + atomic_dec_and_test(&(skb_shinfo(skb)->dataref))) { ++ /* complete zero-copy callbacks (if any) */ ++ skb_complete_zccd(skb); ++ + if (skb_shinfo(skb)->nr_frags) { + int i; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) +@@ -535,6 +541,8 @@ int skb_linearize(struct sk_buff *skb, i + atomic_set(&(skb_shinfo(skb)->dataref), 1); + skb_shinfo(skb)->nr_frags = 0; + skb_shinfo(skb)->frag_list = NULL; ++ skb_shinfo(skb)->zccd1 = NULL; 
/* zero-copy completion callback */ ++ skb_shinfo(skb)->zccd2 = NULL; /* not required */ + + /* We are no longer a clone, even if we were. */ + skb->cloned = 0; +@@ -589,6 +597,18 @@ struct sk_buff *pskb_copy(struct sk_buff + get_page(skb_shinfo(n)->frags[i].page); + } + skb_shinfo(n)->nr_frags = i; ++ ++ if (skb_shinfo(skb)->zccd1 != NULL) { ++ BUG_TRAP(skb_shinfo(n)->zccd1 = NULL); ++ skb_shinfo(n)->zccd1 = skb_shinfo(skb)->zccd1; ++ zccd_incref(skb_shinfo(n)->zccd1); ++ } ++ ++ if (skb_shinfo(skb)->zccd2 != NULL) { ++ BUG_TRAP(skb_shinfo(n)->zccd2 = NULL); ++ skb_shinfo(n)->zccd2 = skb_shinfo(skb)->zccd2; ++ zccd_incref(skb_shinfo(n)->zccd2); ++ } + } + + if (skb_shinfo(skb)->frag_list) { +@@ -638,6 +658,13 @@ int pskb_expand_head(struct sk_buff *skb + memcpy(data+nhead, skb->head, skb->tail-skb->head); + memcpy(data+size, skb->end, sizeof(struct skb_shared_info)); + ++ /* zero-copy descriptors have been copied into the new shinfo - ++ * account the new references */ ++ if (skb_shinfo(skb)->zccd1 != NULL) ++ zccd_incref(skb_shinfo(skb)->zccd1); ++ if (skb_shinfo(skb)->zccd2 != NULL) ++ zccd_incref(skb_shinfo(skb)->zccd2); ++ + for (i=0; inr_frags; i++) + get_page(skb_shinfo(skb)->frags[i].page); + +@@ -794,6 +821,9 @@ int ___pskb_trim(struct sk_buff *skb, un + offset = end; + } + ++ if (skb_shinfo(skb)->nr_frags == 0) /* dropped all the pages */ ++ skb_complete_zccd(skb); /* drop zccd refs */ ++ + if (offset < len) { + skb->data_len -= skb->len - len; + skb->len = len; +@@ -947,6 +977,9 @@ pull_pages: + } + skb_shinfo(skb)->nr_frags = k; + ++ if (k == 0) /* dropped all the pages */ ++ skb_complete_zccd(skb); /* drop zccd refs */ ++ + skb->tail += delta; + skb->data_len -= delta; + +--- linux-2.4.29-orig/net/ipv4/tcp_output.c 2004-11-17 11:54:22.000000000 +0000 ++++ linux-2.4.29/net/ipv4/tcp_output.c 2006-10-10 01:55:29.000000000 +0100 +@@ -379,6 +379,15 @@ static void skb_split(struct sk_buff *sk + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + 
skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i]; + ++ /* Transfer zero-copy callback descriptors */ ++ BUG_TRAP(skb_shinfo(skb1)->zccd1 == NULL); ++ skb_shinfo(skb1)->zccd1 = skb_shinfo(skb)->zccd1; ++ skb_shinfo(skb)->zccd1 = NULL; ++ ++ BUG_TRAP(skb_shinfo(skb1)->zccd2 == NULL); ++ skb_shinfo(skb1)->zccd2 = skb_shinfo(skb)->zccd2; ++ skb_shinfo(skb)->zccd2 = NULL; ++ + skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags; + skb_shinfo(skb)->nr_frags = 0; + +@@ -425,6 +434,30 @@ static void skb_split(struct sk_buff *sk + pos += size; + } + skb_shinfo(skb1)->nr_frags = k; ++ ++ if (k != 0) { ++ /* skb1 has pages. Transfer or clone the zccds */ ++ ++ if (skb_shinfo(skb)->zccd1 != NULL) { ++ BUG_TRAP(skb_shinfo(skb1)->zccd1 == NULL); ++ skb_shinfo(skb1)->zccd1 = skb_shinfo(skb)->zccd1; ++ ++ if (skb_shinfo(skb)->nr_frags == 0) ++ skb_shinfo(skb)->zccd1 = NULL; ++ else ++ zccd_incref(skb_shinfo(skb)->zccd1); ++ } ++ ++ if (skb_shinfo(skb)->zccd2 != NULL) { ++ BUG_TRAP(skb_shinfo(skb1)->zccd2 == NULL); ++ skb_shinfo(skb1)->zccd2 = skb_shinfo(skb)->zccd2; ++ ++ if (skb_shinfo(skb)->nr_frags == 0) ++ skb_shinfo(skb)->zccd2 = NULL; ++ else ++ zccd_incref(skb_shinfo(skb)->zccd2); ++ } ++ } + } + } + +--- linux-2.4.29-orig/net/ipv4/tcp.c 2006-10-10 01:25:08.000000000 +0100 ++++ linux-2.4.29/net/ipv4/tcp.c 2006-10-09 20:53:28.000000000 +0100 +@@ -749,7 +749,8 @@ do_interrupted: + goto out; + } + +-ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags); ++ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags, ++ struct zccd *zccd); + + static inline int + can_coalesce(struct sk_buff *skb, int i, struct page *page, int off) +@@ -828,7 +829,8 @@ static int tcp_error(struct sock *sk, in + return err; + } + +-ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags) ++ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int 
poffset, size_t psize, int flags, ++ struct zccd *zccd) + { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int mss_now; +@@ -876,6 +878,17 @@ new_segment: + copy = size; + + i = skb_shinfo(skb)->nr_frags; ++ ++ if (zccd != NULL && /* this is a zcc I/O */ ++ skb_shinfo(skb)->zccd1 != NULL && /* skb is part of a zcc I/O */ ++ skb_shinfo(skb)->zccd2 != NULL && ++ skb_shinfo(skb)->zccd1 != zccd && /* not the same one */ ++ skb_shinfo(skb)->zccd2 != zccd) ++ { ++ tcp_mark_push (tp, skb); ++ goto new_segment; ++ } ++ + if (can_coalesce(skb, i, page, offset)) { + skb_shinfo(skb)->frags[i-1].size += copy; + } else if (i < MAX_SKB_FRAGS) { +@@ -886,6 +899,18 @@ new_segment: + goto new_segment; + } + ++ if (zccd != NULL && /* completion callback wanted */ ++ skb_shinfo(skb)->zccd1 != zccd && /* new to this skbuf */ ++ skb_shinfo(skb)->zccd2 != zccd) { ++ if (skb_shinfo(skb)->zccd1 == NULL) { ++ skb_shinfo(skb)->zccd1 = zccd; ++ } else { ++ BUG_TRAP (skb_shinfo(skb)->zccd2 == NULL); ++ skb_shinfo(skb)->zccd2 = zccd; ++ } ++ zccd_incref(zccd); /* new reference */ ++ } ++ + skb->len += copy; + skb->data_len += copy; + skb->ip_summed = CHECKSUM_HW; +@@ -934,7 +959,8 @@ out_err: + return tcp_error(sk, flags, err); + } + +-ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) ++ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, ++ size_t size, int flags, struct zccd *zccd) + { + ssize_t res; + struct sock *sk = sock->sk; +@@ -949,12 +975,17 @@ ssize_t tcp_sendpage(struct socket *sock + + lock_sock(sk); + TCP_CHECK_TIMER(sk); +- res = do_tcp_sendpages(sk, &page, offset, size, flags); ++ res = do_tcp_sendpages(sk, &page, offset, size, flags, zccd); + TCP_CHECK_TIMER(sk); + release_sock(sk); + return res; + } + ++ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) ++{ ++ return tcp_sendpage_zccd(sock, page, offset, size, flags, NULL); ++} ++ + #define TCP_PAGE(sk) 
(sk->tp_pinfo.af_tcp.sndmsg_page) + #define TCP_OFF(sk) (sk->tp_pinfo.af_tcp.sndmsg_off) + diff --git a/lustre/kernel_patches/patches/new-tcp-zero-copy-2.6.9-41.2chaos.patch b/lustre/kernel_patches/patches/new-tcp-zero-copy-2.6.9-41.2chaos.patch new file mode 100644 index 0000000..8782730 --- /dev/null +++ b/lustre/kernel_patches/patches/new-tcp-zero-copy-2.6.9-41.2chaos.patch @@ -0,0 +1,318 @@ +--- linux/./include/net/tcp.h 2006-10-10 01:49:23.000000000 +0100 ++++ ../2.6.9-41.2chaos/linux/./include/net/tcp.h 2006-09-21 17:15:21.000000000 +0100 +@@ -787,6 +787,8 @@ extern int tcp_v4_tw_remember_stam + extern int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t size); + extern ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags); ++extern ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size, ++ int flags, struct zccd *zccd); + + extern int tcp_ioctl(struct sock *sk, + int cmd, +--- linux/./include/linux/skbuff.h 2006-10-10 01:49:23.000000000 +0100 ++++ ../2.6.9-41.2chaos/linux/./include/linux/skbuff.h 2006-10-06 18:09:35.000000000 +0100 +@@ -134,6 +134,36 @@ struct skb_frag_struct { + __u16 size; + }; + ++/* Zero Copy Callback Descriptor ++ * This struct supports receiving notification when zero-copy network I/O has ++ * completed. The ZCCD can be embedded in a struct containing the state of a ++ * zero-copy network send. Every skbuff that references that send's pages also ++ * keeps a reference on the ZCCD. When they have all been disposed of, the ++ * reference count on the ZCCD drops to zero and the callback is made, telling ++ * the original caller that the pages may now be overwritten. 
*/ ++struct zccd ++{ ++ atomic_t zccd_refcount; ++ void (*zccd_callback)(struct zccd *); ++}; ++ ++static inline void zccd_init (struct zccd *d, void (*callback)(struct zccd *)) ++{ ++ atomic_set (&d->zccd_refcount, 1); ++ d->zccd_callback = callback; ++} ++ ++static inline void zccd_incref (struct zccd *d) /* take a reference */ ++{ ++ atomic_inc (&d->zccd_refcount); ++} ++ ++static inline void zccd_decref (struct zccd *d) /* release a reference */ ++{ ++ if (atomic_dec_and_test (&d->zccd_refcount)) ++ (d->zccd_callback)(d); ++} ++ + /* This data is invariant across clones and lives at + * the end of the header data, ie. at skb->end. + */ +@@ -143,6 +173,11 @@ struct skb_shared_info { + unsigned short tso_size; + unsigned short tso_segs; + struct sk_buff *frag_list; ++ struct zccd *zccd1; ++ struct zccd *zccd2; ++ /* NB zero-copy data is normally whole pages. We have 2 zccds in an ++ * skbuff so we don't unneccessarily split the packet where pages fall ++ * into the same packet. */ + skb_frag_t frags[MAX_SKB_FRAGS]; + }; + +@@ -1070,6 +1105,23 @@ static inline void kunmap_skb_frag(void + #endif + } + ++/* This skbuf has dropped its pages: drop refs on any zero-copy callback ++ * descriptors it has. 
*/ ++static inline void skb_complete_zccd (struct sk_buff *skb) ++{ ++ struct skb_shared_info *info = skb_shinfo(skb); ++ ++ if (info->zccd1 != NULL) { ++ zccd_decref(info->zccd1); ++ info->zccd1 = NULL; ++ } ++ ++ if (info->zccd2 != NULL) { ++ zccd_decref(info->zccd2); ++ info->zccd2 = NULL; ++ } ++} ++ + #define skb_queue_walk(queue, skb) \ + for (skb = (queue)->next, prefetch(skb->next); \ + (skb != (struct sk_buff *)(queue)); \ +--- linux/./net/core/dev.c 2006-10-10 01:49:23.000000000 +0100 ++++ ../2.6.9-41.2chaos/linux/./net/core/dev.c 2006-09-21 16:53:45.000000000 +0100 +@@ -1140,6 +1140,8 @@ int __skb_linearize(struct sk_buff *skb, + ninfo->tso_segs = skb_shinfo(skb)->tso_segs; + ninfo->nr_frags = 0; + ninfo->frag_list = NULL; ++ ninfo->zccd1 = NULL; /* zero copy completion callback */ ++ ninfo->zccd2 = NULL; /* not required */ + + /* Offset between the two in bytes */ + offset = data - skb->head; +--- linux/./net/core/skbuff.c 2006-10-10 01:49:23.000000000 +0100 ++++ ../2.6.9-41.2chaos/linux/./net/core/skbuff.c 2006-10-10 01:46:16.000000000 +0100 +@@ -155,6 +155,8 @@ struct sk_buff *alloc_skb(unsigned int s + skb_shinfo(skb)->tso_size = 0; + skb_shinfo(skb)->tso_segs = 0; + skb_shinfo(skb)->frag_list = NULL; ++ skb_shinfo(skb)->zccd1 = NULL; /* zero-copy completion callback */ ++ skb_shinfo(skb)->zccd2 = NULL; /* not required (yet) */ + out: + return skb; + nodata: +@@ -189,6 +191,9 @@ void skb_release_data(struct sk_buff *sk + { + if (!skb->cloned || + atomic_dec_and_test(&(skb_shinfo(skb)->dataref))) { ++ /* complete zero-copy callbacks (if any) */ ++ skb_complete_zccd(skb); ++ + if (skb_shinfo(skb)->nr_frags) { + int i; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) +@@ -484,6 +489,18 @@ struct sk_buff *pskb_copy(struct sk_buff + get_page(skb_shinfo(n)->frags[i].page); + } + skb_shinfo(n)->nr_frags = i; ++ ++ if (skb_shinfo(skb)->zccd1 != NULL) { ++ BUG_TRAP(skb_shinfo(n)->zccd1 == NULL); ++ skb_shinfo(n)->zccd1 = skb_shinfo(skb)->zccd1; ++ 
zccd_incref(skb_shinfo(n)->zccd1); ++ } ++ ++ if (skb_shinfo(skb)->zccd2 != NULL) { ++ BUG_TRAP(skb_shinfo(n)->zccd2 == NULL); ++ skb_shinfo(n)->zccd2 = skb_shinfo(skb)->zccd2; ++ zccd_incref(skb_shinfo(n)->zccd2); ++ } + } + + if (skb_shinfo(skb)->frag_list) { +@@ -533,6 +550,13 @@ int pskb_expand_head(struct sk_buff *skb + memcpy(data + nhead, skb->head, skb->tail - skb->head); + memcpy(data + size, skb->end, sizeof(struct skb_shared_info)); + ++ /* zero-copy descriptors have been copied into the new shinfo - ++ * account the new references */ ++ if (skb_shinfo(skb)->zccd1 != NULL) ++ zccd_incref(skb_shinfo(skb)->zccd1); ++ if (skb_shinfo(skb)->zccd2 != NULL) ++ zccd_incref(skb_shinfo(skb)->zccd2); ++ + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + get_page(skb_shinfo(skb)->frags[i].page); + +@@ -694,6 +718,9 @@ int ___pskb_trim(struct sk_buff *skb, un + offset = end; + } + ++ if (skb_shinfo(skb)->nr_frags == 0) /* dropped all the pages */ ++ skb_complete_zccd(skb); /* drop zccd refs */ ++ + if (offset < len) { + skb->data_len -= skb->len - len; + skb->len = len; +@@ -846,6 +873,9 @@ pull_pages: + } + skb_shinfo(skb)->nr_frags = k; + ++ if (k == 0) /* dropped all the pages */ ++ skb_complete_zccd(skb); /* drop zccd refs */ ++ + skb->tail += delta; + skb->data_len -= delta; + +@@ -1362,6 +1392,15 @@ static void inline skb_split_inside_head + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i]; + ++ /* Transfer zero-copy callback descriptors */ ++ BUG_TRAP(skb_shinfo(skb1)->zccd1 == NULL); ++ skb_shinfo(skb1)->zccd1 = skb_shinfo(skb)->zccd1; ++ skb_shinfo(skb)->zccd1 = NULL; ++ ++ BUG_TRAP(skb_shinfo(skb1)->zccd2 == NULL); ++ skb_shinfo(skb1)->zccd2 = skb_shinfo(skb)->zccd2; ++ skb_shinfo(skb)->zccd2 = NULL; ++ + skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags; + skb_shinfo(skb)->nr_frags = 0; + skb1->data_len = skb->data_len; +@@ -1410,6 +1449,30 @@ static void inline skb_split_no_header(s + pos += 
size; + } + skb_shinfo(skb1)->nr_frags = k; ++ ++ if (k != 0) { ++ /* skb1 has pages. Transfer or clone the zccds */ ++ ++ if (skb_shinfo(skb)->zccd1 != NULL) { ++ BUG_TRAP(skb_shinfo(skb1)->zccd1 == NULL); ++ skb_shinfo(skb1)->zccd1 = skb_shinfo(skb)->zccd1; ++ ++ if (skb_shinfo(skb)->nr_frags == 0) ++ skb_shinfo(skb)->zccd1 = NULL; ++ else ++ zccd_incref(skb_shinfo(skb)->zccd1); ++ } ++ ++ if (skb_shinfo(skb)->zccd2 != NULL) { ++ BUG_TRAP(skb_shinfo(skb1)->zccd2 == NULL); ++ skb_shinfo(skb1)->zccd2 = skb_shinfo(skb)->zccd2; ++ ++ if (skb_shinfo(skb)->nr_frags == 0) ++ skb_shinfo(skb)->zccd2 = NULL; ++ else ++ zccd_incref(skb_shinfo(skb)->zccd2); ++ } ++ } + } + + /** +--- linux/./net/ipv4/tcp_output.c 2006-09-21 00:13:11.000000000 +0100 ++++ ../2.6.9-41.2chaos/linux/./net/ipv4/tcp_output.c 2006-09-21 18:24:26.000000000 +0100 +@@ -562,6 +562,9 @@ static unsigned char *__pskb_trim_head(s + } + skb_shinfo(skb)->nr_frags = k; + ++ if (k == 0) /* dropped all pages */ ++ skb_complete_zccd(skb); ++ + skb->tail = skb->data; + skb->data_len -= len; + skb->len = skb->data_len; +--- linux/./net/ipv4/tcp.c 2006-10-10 01:49:23.000000000 +0100 ++++ ../2.6.9-41.2chaos/linux/./net/ipv4/tcp.c 2006-10-09 19:03:15.000000000 +0100 +@@ -628,8 +628,9 @@ static inline void tcp_push(struct sock + } + } + ++/* Extra parameter: user zero copy descriptor (or NULL if not doing that) */ + static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, +- size_t psize, int flags) ++ size_t psize, int flags, struct zccd *zccd) + { + struct tcp_opt *tp = tcp_sk(sk); + int mss_now; +@@ -676,6 +677,16 @@ new_segment: + copy = size; + + i = skb_shinfo(skb)->nr_frags; ++ ++ if (zccd != NULL && /* completion callback wanted */ ++ skb_shinfo(skb)->zccd1 != NULL && /* no room for zccd */ ++ skb_shinfo(skb)->zccd2 != NULL && ++ skb_shinfo(skb)->zccd1 != zccd && /* room needed */ ++ skb_shinfo(skb)->zccd2 != zccd) { ++ tcp_mark_push (tp, skb); ++ goto new_segment; ++ } ++ + 
can_coalesce = skb_can_coalesce(skb, i, page, offset); + if (!can_coalesce && i >= MAX_SKB_FRAGS) { + tcp_mark_push(tp, skb); +@@ -692,6 +703,18 @@ new_segment: + skb_fill_page_desc(skb, i, page, offset, copy); + } + ++ if (zccd != NULL && /* completion callback wanted */ ++ skb_shinfo(skb)->zccd1 != zccd && /* new to this skbuf */ ++ skb_shinfo(skb)->zccd2 != zccd) { ++ if (skb_shinfo(skb)->zccd1 == NULL) { ++ skb_shinfo(skb)->zccd1 = zccd; ++ } else { ++ BUG_TRAP (skb_shinfo(skb)->zccd2 == NULL); ++ skb_shinfo(skb)->zccd2 = zccd; ++ } ++ zccd_incref(zccd); /* new reference */ ++ } ++ + skb->len += copy; + skb->data_len += copy; + skb->truesize += copy; +@@ -744,8 +767,8 @@ out_err: + return sk_stream_error(sk, flags, err); + } + +-ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, +- size_t size, int flags) ++ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, ++ size_t size, int flags, struct zccd *zccd) + { + ssize_t res; + struct sock *sk = sock->sk; +@@ -760,12 +783,18 @@ ssize_t tcp_sendpage(struct socket *sock + + lock_sock(sk); + TCP_CHECK_TIMER(sk); +- res = do_tcp_sendpages(sk, &page, offset, size, flags); ++ res = do_tcp_sendpages(sk, &page, offset, size, flags, zccd); + TCP_CHECK_TIMER(sk); + release_sock(sk); + return res; + } + ++ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, ++ size_t size, int flags) ++{ ++ return tcp_sendpage_zccd(sock, page, offset, size, flags, NULL); ++} ++ + #define TCP_PAGE(sk) (sk->sk_sndmsg_page) + #define TCP_OFF(sk) (sk->sk_sndmsg_off) + +@@ -2343,6 +2372,7 @@ EXPORT_SYMBOL(tcp_read_sock); + EXPORT_SYMBOL(tcp_recvmsg); + EXPORT_SYMBOL(tcp_sendmsg); + EXPORT_SYMBOL(tcp_sendpage); ++EXPORT_SYMBOL(tcp_sendpage_zccd); + EXPORT_SYMBOL(tcp_setsockopt); + EXPORT_SYMBOL(tcp_shutdown); + EXPORT_SYMBOL(tcp_statistics); diff --git a/lustre/kernel_patches/patches/quota-deadlock-on-pagelock-core.patch 
b/lustre/kernel_patches/patches/quota-deadlock-on-pagelock-core.patch new file mode 100644 index 0000000..892a61f --- /dev/null +++ b/lustre/kernel_patches/patches/quota-deadlock-on-pagelock-core.patch @@ -0,0 +1,1264 @@ + +From: Jan Kara + +The four patches in this series fix deadlocks with quotas of pagelock (the +problem was lock inversion on PageLock and transaction start - quota code +needed to first start a transaction and then write the data which subsequently +needed acquisition of PageLock while the standard ordering - PageLock first +and transaction start later - was used e.g. by pdflush). They implement a +new way of quota access to disk: Every filesystem that would like to implement +quotas now has to provide quota_read() and quota_write() functions. These +functions must obey quota lock ordering (in particular they should not take +PageLock inside a transaction). + +The first patch implements the changes in the quota core, the other three +patches implement needed functions in ext2, ext3 and reiserfs. The patch for +reiserfs also fixes several other lock inversion problems (similar as ext3 +had) and implements the journaled quota functionality (which comes almost for +free after the locking fixes...). + +The quota core patch makes quota support in other filesystems (except XFS +which implements everything on its own ;)) unfunctional (quotaon() will refuse +to turn on quotas on them). When the patches get reasonable wide testing and +it will seem that no major changes will be needed I can make fixes also for +the other filesystems (JFS, UDF, UFS). + +This patch: + +The patch implements the new way of quota io in the quota core. Every +filesystem wanting to support quotas has to provide functions quota_read() +and quota_write() obeying quota locking rules. As the writes and reads +bypass the pagecache there is some ugly stuff ensuring that userspace can +see all the data after quotaoff() (or Q_SYNC quotactl). 
In future I plan +to make quota files inaccessible from userspace (with the exception of +quotacheck(8) which will take care about the cache flushing and such stuff +itself) so that this synchronization stuff can be removed... + +The rewrite of the quota core. Quota uses the filesystem read() and write() +functions no more to avoid possible deadlocks on PageLock. From now on every +filesystem supporting quotas must provide functions quota_read() and +quota_write() which obey the quota locking rules (e.g. they cannot acquire the +PageLock). + +Signed-off-by: Jan Kara +Signed-off-by: Andrew Morton +--- + + 25-akpm/fs/dquot.c | 162 +++++++++++++-------------- + 25-akpm/fs/quota.c | 45 +++++++ + 25-akpm/fs/quota_v1.c | 62 ++-------- + 25-akpm/fs/quota_v2.c | 227 +++++++++++++++++---------------------- + 25-akpm/include/linux/fs.h | 3 + 25-akpm/include/linux/quota.h | 2 + 25-akpm/include/linux/security.h | 8 - + 25-akpm/security/dummy.c | 2 + 25-akpm/security/selinux/hooks.c | 4 + 9 files changed, 247 insertions(+), 268 deletions(-) + +diff -puN fs/dquot.c~fix-of-quota-deadlock-on-pagelock-quota-core fs/dquot.c +--- 25/fs/dquot.c~fix-of-quota-deadlock-on-pagelock-quota-core 2004-12-03 20:56:04.293107536 -0800 ++++ 25-akpm/fs/dquot.c 2004-12-03 20:56:04.312104648 -0800 +@@ -49,7 +49,7 @@ + * New SMP locking. + * Jan Kara, , 10/2002 + * +- * Added journalled quota support ++ * Added journalled quota support, fix lock inversion problems + * Jan Kara, , 2003,2004 + * + * (C) Copyright 1994 - 1997 Marco van Wieringen +@@ -75,7 +75,8 @@ + #include + #include + #include +-#include ++#include ++#include + + #include + +@@ -114,7 +115,7 @@ + * operations on dquots don't hold dq_lock as they copy data under dq_data_lock + * spinlock to internal buffers before writing. 
+ * +- * Lock ordering (including related VFS locks) is following: ++ * Lock ordering (including related VFS locks) is the following: + * i_sem > dqonoff_sem > iprune_sem > journal_lock > dqptr_sem > + * > dquot->dq_lock > dqio_sem + * i_sem on quota files is special (it's below dqio_sem) +@@ -183,8 +184,7 @@ static void put_quota_format(struct quot + * on all three lists, depending on its current state. + * + * All dquots are placed to the end of inuse_list when first created, and this +- * list is used for the sync and invalidate operations, which must look +- * at every dquot. ++ * list is used for invalidate operation, which must look at every dquot. + * + * Unused dquots (dq_count == 0) are added to the free_dquots list when freed, + * and this list is searched whenever we need an available dquot. Dquots are +@@ -1314,10 +1314,12 @@ int vfs_quota_off(struct super_block *sb + { + int cnt; + struct quota_info *dqopt = sb_dqopt(sb); ++ struct inode *toput[MAXQUOTAS]; + + /* We need to serialize quota_off() for device */ + down(&dqopt->dqonoff_sem); + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { ++ toput[cnt] = NULL; + if (type != -1 && cnt != type) + continue; + if (!sb_has_quota_enabled(sb, cnt)) +@@ -1337,7 +1339,7 @@ int vfs_quota_off(struct super_block *sb + dqopt->ops[cnt]->free_file_info(sb, cnt); + put_quota_format(dqopt->info[cnt].dqi_format); + +- fput(dqopt->files[cnt]); ++ toput[cnt] = dqopt->files[cnt]; + dqopt->files[cnt] = NULL; + dqopt->info[cnt].dqi_flags = 0; + dqopt->info[cnt].dqi_igrace = 0; +@@ -1345,6 +1347,26 @@ int vfs_quota_off(struct super_block *sb + dqopt->ops[cnt] = NULL; + } + up(&dqopt->dqonoff_sem); ++ /* Sync the superblock so that buffers with quota data are written to ++ * disk (and so userspace sees correct data afterwards) */ ++ if (sb->s_op->sync_fs) ++ sb->s_op->sync_fs(sb, 1); ++ sync_blockdev(sb->s_bdev); ++ /* Now the quota files are just ordinary files and we can set the ++ * inode flags back. 
Moreover we discard the pagecache so that ++ * userspace sees the writes we did bypassing the pagecache. We ++ * must also discard the blockdev buffers so that we see the ++ * changes done by userspace on the next quotaon() */ ++ for (cnt = 0; cnt < MAXQUOTAS; cnt++) ++ if (toput[cnt]) { ++ down(&toput[cnt]->i_sem); ++ toput[cnt]->i_flags &= ~(S_IMMUTABLE | S_NOATIME | S_NOQUOTA); ++ truncate_inode_pages(&toput[cnt]->i_data, 0); ++ up(&toput[cnt]->i_sem); ++ mark_inode_dirty(toput[cnt]); ++ iput(toput[cnt]); ++ } ++ invalidate_bdev(sb->s_bdev, 0); + return 0; + } + +@@ -1352,68 +1374,56 @@ int vfs_quota_off(struct super_block *sb + * Turn quotas on on a device + */ + +-/* Helper function when we already have file open */ +-static int vfs_quota_on_file(struct file *f, int type, int format_id) ++/* Helper function when we already have the inode */ ++static int vfs_quota_on_inode(struct inode *inode, int type, int format_id) + { + struct quota_format_type *fmt = find_quota_format(format_id); +- struct inode *inode; +- struct super_block *sb = f->f_dentry->d_sb; ++ struct super_block *sb = inode->i_sb; + struct quota_info *dqopt = sb_dqopt(sb); +- struct dquot *to_drop[MAXQUOTAS]; +- int error, cnt; +- unsigned int oldflags = -1; ++ int error; ++ int oldflags = -1; + + if (!fmt) + return -ESRCH; +- error = -EIO; +- if (!f->f_op || !f->f_op->read || !f->f_op->write) ++ if (!S_ISREG(inode->i_mode)) { ++ error = -EACCES; + goto out_fmt; +- inode = f->f_dentry->d_inode; +- error = -EACCES; +- if (!S_ISREG(inode->i_mode)) ++ } ++ if (IS_RDONLY(inode)) { ++ error = -EROFS; ++ goto out_fmt; ++ } ++ if (!sb->s_op->quota_write || !sb->s_op->quota_read) { ++ error = -EINVAL; + goto out_fmt; ++ } + ++ /* As we bypass the pagecache we must now flush the inode so that ++ * we see all the changes from userspace... 
*/ ++ write_inode_now(inode, 1); ++ /* And now flush the block cache so that kernel sees the changes */ ++ invalidate_bdev(sb->s_bdev, 0); + down(&inode->i_sem); + down(&dqopt->dqonoff_sem); + if (sb_has_quota_enabled(sb, type)) { +- up(&inode->i_sem); + error = -EBUSY; + goto out_lock; + } + /* We don't want quota and atime on quota files (deadlocks possible) +- * We also need to set GFP mask differently because we cannot recurse +- * into filesystem when allocating page for quota inode */ ++ * Also nobody should write to the file - we use special IO operations ++ * which ignore the immutable bit. */ + down_write(&dqopt->dqptr_sem); +- oldflags = inode->i_flags & (S_NOATIME | S_NOQUOTA); +- inode->i_flags |= S_NOQUOTA | S_NOATIME; ++ oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA); ++ inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE; + up_write(&dqopt->dqptr_sem); +- up(&inode->i_sem); + +- dqopt->files[type] = f; ++ error = -EIO; ++ dqopt->files[type] = igrab(inode); ++ if (!dqopt->files[type]) ++ goto out_lock; + error = -EINVAL; + if (!fmt->qf_ops->check_quota_file(sb, type)) + goto out_file_init; +- /* +- * We write to quota files deep within filesystem code. We don't want +- * the VFS to reenter filesystem code when it tries to allocate a +- * pagecache page for the quota file write. So clear __GFP_FS in +- * the quota file's allocation flags. 
+- */ +- mapping_set_gfp_mask(inode->i_mapping, +- mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); +- +- down_write(&dqopt->dqptr_sem); +- for (cnt = 0; cnt < MAXQUOTAS; cnt++) { +- to_drop[cnt] = inode->i_dquot[cnt]; +- inode->i_dquot[cnt] = NODQUOT; +- } +- up_write(&dqopt->dqptr_sem); +- /* We must put dquots outside of dqptr_sem because we may need to +- * start transaction for dquot_release() */ +- for (cnt = 0; cnt < MAXQUOTAS; cnt++) { +- if (to_drop[cnt]) +- dqput(to_drop[cnt]); +- } + + dqopt->ops[type] = fmt->qf_ops; + dqopt->info[type].dqi_format = fmt; +@@ -1424,6 +1434,7 @@ static int vfs_quota_on_file(struct file + goto out_file_init; + } + up(&dqopt->dqio_sem); ++ up(&inode->i_sem); + set_enable_flags(dqopt, type); + + add_dquot_ref(sb, type); +@@ -1433,19 +1444,18 @@ static int vfs_quota_on_file(struct file + + out_file_init: + dqopt->files[type] = NULL; ++ iput(inode); + out_lock: + up(&dqopt->dqonoff_sem); + if (oldflags != -1) { +- down(&inode->i_sem); + down_write(&dqopt->dqptr_sem); +- /* Reset the NOATIME flag back. 
I know it could change in the +- * mean time but playing with NOATIME flags on a quota file is +- * never a good idea */ +- inode->i_flags &= ~(S_NOATIME | S_NOQUOTA); ++ /* Set the flags back (in the case of accidental quotaon() ++ * on a wrong file we don't want to mess up the flags) */ ++ inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE); + inode->i_flags |= oldflags; + up_write(&dqopt->dqptr_sem); +- up(&inode->i_sem); + } ++ up(&inode->i_sem); + out_fmt: + put_quota_format(fmt); + +@@ -1455,47 +1465,37 @@ out_fmt: + /* Actual function called from quotactl() */ + int vfs_quota_on(struct super_block *sb, int type, int format_id, char *path) + { +- struct file *f; ++ struct nameidata nd; + int error; + +- f = filp_open(path, O_RDWR, 0600); +- if (IS_ERR(f)) +- return PTR_ERR(f); +- error = security_quota_on(f); ++ error = path_lookup(path, LOOKUP_FOLLOW, &nd); ++ if (error < 0) ++ return error; ++ error = security_quota_on(nd.dentry); + if (error) +- goto out_f; +- error = vfs_quota_on_file(f, type, format_id); +- if (!error) +- return 0; +-out_f: +- filp_close(f, NULL); ++ goto out_path; ++ /* Quota file not on the same filesystem? */ ++ if (nd.mnt->mnt_sb != sb) ++ error = -EXDEV; ++ else ++ error = vfs_quota_on_inode(nd.dentry->d_inode, type, format_id); ++out_path: ++ path_release(&nd); + return error; + } + + /* +- * Function used by filesystems when filp_open() would fail (filesystem is +- * being mounted now). We will use a private file structure. Caller is +- * responsible that it's IO functions won't need vfsmnt structure or +- * some dentry tricks... ++ * This function is used when filesystem needs to initialize quotas ++ * during mount time. 
+ */ + int vfs_quota_on_mount(int type, int format_id, struct dentry *dentry) + { +- struct file *f; + int error; + +- dget(dentry); /* Get a reference for struct file */ +- f = dentry_open(dentry, NULL, O_RDWR); +- if (IS_ERR(f)) { +- error = PTR_ERR(f); +- goto out_dentry; +- } +- error = vfs_quota_on_file(f, type, format_id); +- if (!error) +- return 0; +- fput(f); +-out_dentry: +- dput(dentry); +- return error; ++ error = security_quota_on(dentry); ++ if (error) ++ return error; ++ return vfs_quota_on_inode(dentry->d_inode, type, format_id); + } + + /* Generic routine for getting common part of quota structure */ +diff -puN fs/quota.c~fix-of-quota-deadlock-on-pagelock-quota-core fs/quota.c +--- 25/fs/quota.c~fix-of-quota-deadlock-on-pagelock-quota-core 2004-12-03 20:56:04.295107232 -0800 ++++ 25-akpm/fs/quota.c 2004-12-03 20:56:04.313104496 -0800 +@@ -13,6 +13,8 @@ + #include + #include + #include ++#include ++#include + + /* Check validity of quotactl */ + static int check_quotactl_valid(struct super_block *sb, int type, int cmd, qid_t id) +@@ -135,16 +136,54 @@ restart: + return NULL; + } + ++void quota_sync_sb(struct super_block *sb, int type) ++{ ++ int cnt; ++ struct inode *discard[MAXQUOTAS]; ++ ++ sb->s_qcop->quota_sync(sb, type); ++ /* This is not very clever (and fast) but currently I don't know about ++ * any other simple way of getting quota data to disk and we must get ++ * them there for userspace to be visible... */ ++ if (sb->s_op->sync_fs) ++ sb->s_op->sync_fs(sb, 1); ++ sync_blockdev(sb->s_bdev); ++ ++ /* Now when everything is written we can discard the pagecache so ++ * that userspace sees the changes. We need i_sem and so we could ++ * not do it inside dqonoff_sem. Moreover we need to be carefull ++ * about races with quotaoff() (that is the reason why we have own ++ * reference to inode). 
*/ ++ down(&sb_dqopt(sb)->dqonoff_sem); ++ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { ++ discard[cnt] = NULL; ++ if (type != -1 && cnt != type) ++ continue; ++ if (!sb_has_quota_enabled(sb, cnt)) ++ continue; ++ discard[cnt] = igrab(sb_dqopt(sb)->files[cnt]); ++ } ++ up(&sb_dqopt(sb)->dqonoff_sem); ++ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { ++ if (discard[cnt]) { ++ down(&discard[cnt]->i_sem); ++ truncate_inode_pages(&discard[cnt]->i_data, 0); ++ up(&discard[cnt]->i_sem); ++ iput(discard[cnt]); ++ } ++ } ++} ++ + void sync_dquots(struct super_block *sb, int type) + { + if (sb) { + if (sb->s_qcop->quota_sync) +- sb->s_qcop->quota_sync(sb, type); ++ quota_sync_sb(sb, type); + } + else { +- while ((sb = get_super_to_sync(type)) != 0) { ++ while ((sb = get_super_to_sync(type)) != NULL) { + if (sb->s_qcop->quota_sync) +- sb->s_qcop->quota_sync(sb, type); ++ quota_sync_sb(sb, type); + drop_super(sb); + } + } +diff -puN fs/quota_v1.c~fix-of-quota-deadlock-on-pagelock-quota-core fs/quota_v1.c +--- 25/fs/quota_v1.c~fix-of-quota-deadlock-on-pagelock-quota-core 2004-12-03 20:56:04.296107080 -0800 ++++ 25-akpm/fs/quota_v1.c 2004-12-03 20:56:04.314104344 -0800 +@@ -7,7 +7,6 @@ + #include + #include + +-#include + #include + + MODULE_AUTHOR("Jan Kara"); +@@ -41,23 +40,14 @@ static void v1_mem2disk_dqblk(struct v1_ + static int v1_read_dqblk(struct dquot *dquot) + { + int type = dquot->dq_type; +- struct file *filp; +- mm_segment_t fs; +- loff_t offset; + struct v1_disk_dqblk dqblk; + +- filp = sb_dqopt(dquot->dq_sb)->files[type]; +- if (filp == (struct file *)NULL) ++ if (!sb_dqopt(dquot->dq_sb)->files[type]) + return -EINVAL; + +- /* Now we are sure filp is valid */ +- offset = v1_dqoff(dquot->dq_id); + /* Set structure to 0s in case read fails/is after end of file */ + memset(&dqblk, 0, sizeof(struct v1_disk_dqblk)); +- fs = get_fs(); +- set_fs(KERNEL_DS); +- filp->f_op->read(filp, (char *)&dqblk, sizeof(struct v1_disk_dqblk), &offset); +- set_fs(fs); ++ 
dquot->dq_sb->s_op->quota_read(dquot->dq_sb, type, (char *)&dqblk, sizeof(struct v1_disk_dqblk), v1_dqoff(dquot->dq_id)); + + v1_disk2mem_dqblk(&dquot->dq_dqb, &dqblk); + if (dquot->dq_dqb.dqb_bhardlimit == 0 && dquot->dq_dqb.dqb_bsoftlimit == 0 && +@@ -71,26 +61,18 @@ static int v1_read_dqblk(struct dquot *d + static int v1_commit_dqblk(struct dquot *dquot) + { + short type = dquot->dq_type; +- struct file *filp; +- mm_segment_t fs; +- loff_t offset; + ssize_t ret; + struct v1_disk_dqblk dqblk; + +- filp = sb_dqopt(dquot->dq_sb)->files[type]; +- offset = v1_dqoff(dquot->dq_id); +- fs = get_fs(); +- set_fs(KERNEL_DS); +- + v1_mem2disk_dqblk(&dqblk, &dquot->dq_dqb); + if (dquot->dq_id == 0) { + dqblk.dqb_btime = sb_dqopt(dquot->dq_sb)->info[type].dqi_bgrace; + dqblk.dqb_itime = sb_dqopt(dquot->dq_sb)->info[type].dqi_igrace; + } + ret = 0; +- if (filp) +- ret = filp->f_op->write(filp, (char *)&dqblk, +- sizeof(struct v1_disk_dqblk), &offset); ++ if (sb_dqopt(dquot->dq_sb)->files[type]) ++ ret = dquot->dq_sb->s_op->quota_write(dquot->dq_sb, type, (char *)&dqblk, ++ sizeof(struct v1_disk_dqblk), v1_dqoff(dquot->dq_id)); + if (ret != sizeof(struct v1_disk_dqblk)) { + printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", + dquot->dq_sb->s_id); +@@ -101,7 +83,6 @@ static int v1_commit_dqblk(struct dquot + ret = 0; + + out: +- set_fs(fs); + dqstats.writes++; + + return ret; +@@ -121,14 +102,11 @@ struct v2_disk_dqheader { + + static int v1_check_quota_file(struct super_block *sb, int type) + { +- struct file *f = sb_dqopt(sb)->files[type]; +- struct inode *inode = f->f_dentry->d_inode; ++ struct inode *inode = sb_dqopt(sb)->files[type]; + ulong blocks; + size_t off; + struct v2_disk_dqheader dqhead; +- mm_segment_t fs; + ssize_t size; +- loff_t offset = 0; + loff_t isize; + static const uint quota_magics[] = V2_INITQMAGICS; + +@@ -140,10 +118,7 @@ static int v1_check_quota_file(struct su + if ((blocks % sizeof(struct v1_disk_dqblk) * BLOCK_SIZE + off) % sizeof(struct 
v1_disk_dqblk)) + return 0; + /* Doublecheck whether we didn't get file with new format - with old quotactl() this could happen */ +- fs = get_fs(); +- set_fs(KERNEL_DS); +- size = f->f_op->read(f, (char *)&dqhead, sizeof(struct v2_disk_dqheader), &offset); +- set_fs(fs); ++ size = sb->s_op->quota_read(sb, type, (char *)&dqhead, sizeof(struct v2_disk_dqheader), 0); + if (size != sizeof(struct v2_disk_dqheader)) + return 1; /* Probably not new format */ + if (le32_to_cpu(dqhead.dqh_magic) != quota_magics[type]) +@@ -155,16 +130,10 @@ static int v1_check_quota_file(struct su + static int v1_read_file_info(struct super_block *sb, int type) + { + struct quota_info *dqopt = sb_dqopt(sb); +- mm_segment_t fs; +- loff_t offset; +- struct file *filp = dqopt->files[type]; + struct v1_disk_dqblk dqblk; + int ret; + +- offset = v1_dqoff(0); +- fs = get_fs(); +- set_fs(KERNEL_DS); +- if ((ret = filp->f_op->read(filp, (char *)&dqblk, sizeof(struct v1_disk_dqblk), &offset)) != sizeof(struct v1_disk_dqblk)) { ++ if ((ret = sb->s_op->quota_read(sb, type, (char *)&dqblk, sizeof(struct v1_disk_dqblk), v1_dqoff(0))) != sizeof(struct v1_disk_dqblk)) { + if (ret >= 0) + ret = -EIO; + goto out; +@@ -173,38 +142,31 @@ static int v1_read_file_info(struct supe + dqopt->info[type].dqi_igrace = dqblk.dqb_itime ? dqblk.dqb_itime : MAX_IQ_TIME; + dqopt->info[type].dqi_bgrace = dqblk.dqb_btime ? 
dqblk.dqb_btime : MAX_DQ_TIME; + out: +- set_fs(fs); + return ret; + } + + static int v1_write_file_info(struct super_block *sb, int type) + { + struct quota_info *dqopt = sb_dqopt(sb); +- mm_segment_t fs; +- struct file *filp = dqopt->files[type]; + struct v1_disk_dqblk dqblk; +- loff_t offset; + int ret; + + dqopt->info[type].dqi_flags &= ~DQF_INFO_DIRTY; +- offset = v1_dqoff(0); +- fs = get_fs(); +- set_fs(KERNEL_DS); +- if ((ret = filp->f_op->read(filp, (char *)&dqblk, sizeof(struct v1_disk_dqblk), &offset)) != sizeof(struct v1_disk_dqblk)) { ++ if ((ret = sb->s_op->quota_read(sb, type, (char *)&dqblk, ++ sizeof(struct v1_disk_dqblk), v1_dqoff(0))) != sizeof(struct v1_disk_dqblk)) { + if (ret >= 0) + ret = -EIO; + goto out; + } + dqblk.dqb_itime = dqopt->info[type].dqi_igrace; + dqblk.dqb_btime = dqopt->info[type].dqi_bgrace; +- offset = v1_dqoff(0); +- ret = filp->f_op->write(filp, (char *)&dqblk, sizeof(struct v1_disk_dqblk), &offset); ++ ret = sb->s_op->quota_write(sb, type, (char *)&dqblk, ++ sizeof(struct v1_disk_dqblk), v1_dqoff(0)); + if (ret == sizeof(struct v1_disk_dqblk)) + ret = 0; + else if (ret > 0) + ret = -EIO; + out: +- set_fs(fs); + return ret; + } + +diff -puN fs/quota_v2.c~fix-of-quota-deadlock-on-pagelock-quota-core fs/quota_v2.c +--- 25/fs/quota_v2.c~fix-of-quota-deadlock-on-pagelock-quota-core 2004-12-03 20:56:04.297106928 -0800 ++++ 25-akpm/fs/quota_v2.c 2004-12-03 20:56:04.318103736 -0800 +@@ -13,7 +13,6 @@ + #include + + #include +-#include + + MODULE_AUTHOR("Jan Kara"); + MODULE_DESCRIPTION("Quota format v2 support"); +@@ -30,19 +29,15 @@ typedef char *dqbuf_t; + static int v2_check_quota_file(struct super_block *sb, int type) + { + struct v2_disk_dqheader dqhead; +- struct file *f = sb_dqopt(sb)->files[type]; +- mm_segment_t fs; + ssize_t size; +- loff_t offset = 0; + static const uint quota_magics[] = V2_INITQMAGICS; + static const uint quota_versions[] = V2_INITQVERSIONS; + +- fs = get_fs(); +- set_fs(KERNEL_DS); +- size = 
f->f_op->read(f, (char *)&dqhead, sizeof(struct v2_disk_dqheader), &offset); +- set_fs(fs); +- if (size != sizeof(struct v2_disk_dqheader)) ++ size = sb->s_op->quota_read(sb, type, (char *)&dqhead, sizeof(struct v2_disk_dqheader), 0); ++ if (size != sizeof(struct v2_disk_dqheader)) { ++ printk("failed read\n"); + return 0; ++ } + if (le32_to_cpu(dqhead.dqh_magic) != quota_magics[type] || + le32_to_cpu(dqhead.dqh_version) != quota_versions[type]) + return 0; +@@ -52,20 +47,15 @@ static int v2_check_quota_file(struct su + /* Read information header from quota file */ + static int v2_read_file_info(struct super_block *sb, int type) + { +- mm_segment_t fs; + struct v2_disk_dqinfo dinfo; + struct mem_dqinfo *info = sb_dqopt(sb)->info+type; +- struct file *f = sb_dqopt(sb)->files[type]; + ssize_t size; +- loff_t offset = V2_DQINFOOFF; + +- fs = get_fs(); +- set_fs(KERNEL_DS); +- size = f->f_op->read(f, (char *)&dinfo, sizeof(struct v2_disk_dqinfo), &offset); +- set_fs(fs); ++ size = sb->s_op->quota_read(sb, type, (char *)&dinfo, ++ sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); + if (size != sizeof(struct v2_disk_dqinfo)) { + printk(KERN_WARNING "Can't read info structure on device %s.\n", +- f->f_dentry->d_sb->s_id); ++ sb->s_id); + return -1; + } + info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace); +@@ -80,12 +70,9 @@ static int v2_read_file_info(struct supe + /* Write information header to quota file */ + static int v2_write_file_info(struct super_block *sb, int type) + { +- mm_segment_t fs; + struct v2_disk_dqinfo dinfo; + struct mem_dqinfo *info = sb_dqopt(sb)->info+type; +- struct file *f = sb_dqopt(sb)->files[type]; + ssize_t size; +- loff_t offset = V2_DQINFOOFF; + + spin_lock(&dq_data_lock); + info->dqi_flags &= ~DQF_INFO_DIRTY; +@@ -96,13 +83,11 @@ static int v2_write_file_info(struct sup + dinfo.dqi_blocks = cpu_to_le32(info->u.v2_i.dqi_blocks); + dinfo.dqi_free_blk = cpu_to_le32(info->u.v2_i.dqi_free_blk); + dinfo.dqi_free_entry = 
cpu_to_le32(info->u.v2_i.dqi_free_entry); +- fs = get_fs(); +- set_fs(KERNEL_DS); +- size = f->f_op->write(f, (char *)&dinfo, sizeof(struct v2_disk_dqinfo), &offset); +- set_fs(fs); ++ size = sb->s_op->quota_write(sb, type, (char *)&dinfo, ++ sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); + if (size != sizeof(struct v2_disk_dqinfo)) { + printk(KERN_WARNING "Can't write info structure on device %s.\n", +- f->f_dentry->d_sb->s_id); ++ sb->s_id); + return -1; + } + return 0; +@@ -146,39 +131,24 @@ static inline void freedqbuf(dqbuf_t buf + kfree(buf); + } + +-static ssize_t read_blk(struct file *filp, uint blk, dqbuf_t buf) ++static inline ssize_t read_blk(struct super_block *sb, int type, uint blk, dqbuf_t buf) + { +- mm_segment_t fs; +- ssize_t ret; +- loff_t offset = blk<f_op->read(filp, (char *)buf, V2_DQBLKSIZE, &offset); +- set_fs(fs); +- return ret; ++ return sb->s_op->quota_read(sb, type, (char *)buf, ++ V2_DQBLKSIZE, blk << V2_DQBLKSIZE_BITS); + } + +-static ssize_t write_blk(struct file *filp, uint blk, dqbuf_t buf) ++static inline ssize_t write_blk(struct super_block *sb, int type, uint blk, dqbuf_t buf) + { +- mm_segment_t fs; +- ssize_t ret; +- loff_t offset = blk<f_op->write(filp, (char *)buf, V2_DQBLKSIZE, &offset); +- set_fs(fs); +- return ret; +- ++ return sb->s_op->quota_write(sb, type, (char *)buf, ++ V2_DQBLKSIZE, blk << V2_DQBLKSIZE_BITS); + } + + /* Remove empty block from list and return it */ +-static int get_free_dqblk(struct file *filp, int type) ++static int get_free_dqblk(struct super_block *sb, int type) + { + dqbuf_t buf = getdqbuf(); +- struct mem_dqinfo *info = sb_dqinfo(filp->f_dentry->d_sb, type); ++ struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf; + int ret, blk; + +@@ -186,17 +156,18 @@ static int get_free_dqblk(struct file *f + return -ENOMEM; + if (info->u.v2_i.dqi_free_blk) { + blk = info->u.v2_i.dqi_free_blk; +- if ((ret = read_blk(filp, blk, buf)) < 0) ++ if ((ret 
= read_blk(sb, type, blk, buf)) < 0) + goto out_buf; + info->u.v2_i.dqi_free_blk = le32_to_cpu(dh->dqdh_next_free); + } + else { + memset(buf, 0, V2_DQBLKSIZE); +- if ((ret = write_blk(filp, info->u.v2_i.dqi_blocks, buf)) < 0) /* Assure block allocation... */ ++ /* Assure block allocation... */ ++ if ((ret = write_blk(sb, type, info->u.v2_i.dqi_blocks, buf)) < 0) + goto out_buf; + blk = info->u.v2_i.dqi_blocks++; + } +- mark_info_dirty(filp->f_dentry->d_sb, type); ++ mark_info_dirty(sb, type); + ret = blk; + out_buf: + freedqbuf(buf); +@@ -204,9 +175,9 @@ out_buf: + } + + /* Insert empty block to the list */ +-static int put_free_dqblk(struct file *filp, int type, dqbuf_t buf, uint blk) ++static int put_free_dqblk(struct super_block *sb, int type, dqbuf_t buf, uint blk) + { +- struct mem_dqinfo *info = sb_dqinfo(filp->f_dentry->d_sb, type); ++ struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf; + int err; + +@@ -214,17 +185,18 @@ static int put_free_dqblk(struct file *f + dh->dqdh_prev_free = cpu_to_le32(0); + dh->dqdh_entries = cpu_to_le16(0); + info->u.v2_i.dqi_free_blk = blk; +- mark_info_dirty(filp->f_dentry->d_sb, type); +- if ((err = write_blk(filp, blk, buf)) < 0) /* Some strange block. We had better leave it... */ ++ mark_info_dirty(sb, type); ++ /* Some strange block. We had better leave it... 
*/ ++ if ((err = write_blk(sb, type, blk, buf)) < 0) + return err; + return 0; + } + + /* Remove given block from the list of blocks with free entries */ +-static int remove_free_dqentry(struct file *filp, int type, dqbuf_t buf, uint blk) ++static int remove_free_dqentry(struct super_block *sb, int type, dqbuf_t buf, uint blk) + { + dqbuf_t tmpbuf = getdqbuf(); +- struct mem_dqinfo *info = sb_dqinfo(filp->f_dentry->d_sb, type); ++ struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf; + uint nextblk = le32_to_cpu(dh->dqdh_next_free), prevblk = le32_to_cpu(dh->dqdh_prev_free); + int err; +@@ -232,26 +204,27 @@ static int remove_free_dqentry(struct fi + if (!tmpbuf) + return -ENOMEM; + if (nextblk) { +- if ((err = read_blk(filp, nextblk, tmpbuf)) < 0) ++ if ((err = read_blk(sb, type, nextblk, tmpbuf)) < 0) + goto out_buf; + ((struct v2_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = dh->dqdh_prev_free; +- if ((err = write_blk(filp, nextblk, tmpbuf)) < 0) ++ if ((err = write_blk(sb, type, nextblk, tmpbuf)) < 0) + goto out_buf; + } + if (prevblk) { +- if ((err = read_blk(filp, prevblk, tmpbuf)) < 0) ++ if ((err = read_blk(sb, type, prevblk, tmpbuf)) < 0) + goto out_buf; + ((struct v2_disk_dqdbheader *)tmpbuf)->dqdh_next_free = dh->dqdh_next_free; +- if ((err = write_blk(filp, prevblk, tmpbuf)) < 0) ++ if ((err = write_blk(sb, type, prevblk, tmpbuf)) < 0) + goto out_buf; + } + else { + info->u.v2_i.dqi_free_entry = nextblk; +- mark_info_dirty(filp->f_dentry->d_sb, type); ++ mark_info_dirty(sb, type); + } + freedqbuf(tmpbuf); + dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0); +- if (write_blk(filp, blk, buf) < 0) /* No matter whether write succeeds block is out of list */ ++ /* No matter whether write succeeds block is out of list */ ++ if (write_blk(sb, type, blk, buf) < 0) + printk(KERN_ERR "VFS: Can't write block (%u) with free entries.\n", blk); + return 0; + out_buf: +@@ -260,10 +233,10 @@ out_buf: + } 
+ + /* Insert given block to the beginning of list with free entries */ +-static int insert_free_dqentry(struct file *filp, int type, dqbuf_t buf, uint blk) ++static int insert_free_dqentry(struct super_block *sb, int type, dqbuf_t buf, uint blk) + { + dqbuf_t tmpbuf = getdqbuf(); +- struct mem_dqinfo *info = sb_dqinfo(filp->f_dentry->d_sb, type); ++ struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf; + int err; + +@@ -271,18 +244,18 @@ static int insert_free_dqentry(struct fi + return -ENOMEM; + dh->dqdh_next_free = cpu_to_le32(info->u.v2_i.dqi_free_entry); + dh->dqdh_prev_free = cpu_to_le32(0); +- if ((err = write_blk(filp, blk, buf)) < 0) ++ if ((err = write_blk(sb, type, blk, buf)) < 0) + goto out_buf; + if (info->u.v2_i.dqi_free_entry) { +- if ((err = read_blk(filp, info->u.v2_i.dqi_free_entry, tmpbuf)) < 0) ++ if ((err = read_blk(sb, type, info->u.v2_i.dqi_free_entry, tmpbuf)) < 0) + goto out_buf; + ((struct v2_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = cpu_to_le32(blk); +- if ((err = write_blk(filp, info->u.v2_i.dqi_free_entry, tmpbuf)) < 0) ++ if ((err = write_blk(sb, type, info->u.v2_i.dqi_free_entry, tmpbuf)) < 0) + goto out_buf; + } + freedqbuf(tmpbuf); + info->u.v2_i.dqi_free_entry = blk; +- mark_info_dirty(filp->f_dentry->d_sb, type); ++ mark_info_dirty(sb, type); + return 0; + out_buf: + freedqbuf(tmpbuf); +@@ -292,8 +265,8 @@ out_buf: + /* Find space for dquot */ + static uint find_free_dqentry(struct dquot *dquot, int *err) + { +- struct file *filp = sb_dqopt(dquot->dq_sb)->files[dquot->dq_type]; +- struct mem_dqinfo *info = sb_dqopt(dquot->dq_sb)->info+dquot->dq_type; ++ struct super_block *sb = dquot->dq_sb; ++ struct mem_dqinfo *info = sb_dqopt(sb)->info+dquot->dq_type; + uint blk, i; + struct v2_disk_dqdbheader *dh; + struct v2_disk_dqblk *ddquot; +@@ -309,22 +282,23 @@ static uint find_free_dqentry(struct dqu + ddquot = GETENTRIES(buf); + if (info->u.v2_i.dqi_free_entry) { + blk = 
info->u.v2_i.dqi_free_entry; +- if ((*err = read_blk(filp, blk, buf)) < 0) ++ if ((*err = read_blk(sb, dquot->dq_type, blk, buf)) < 0) + goto out_buf; + } + else { +- blk = get_free_dqblk(filp, dquot->dq_type); ++ blk = get_free_dqblk(sb, dquot->dq_type); + if ((int)blk < 0) { + *err = blk; + freedqbuf(buf); + return 0; + } + memset(buf, 0, V2_DQBLKSIZE); +- info->u.v2_i.dqi_free_entry = blk; /* This is enough as block is already zeroed and entry list is empty... */ +- mark_info_dirty(dquot->dq_sb, dquot->dq_type); ++ /* This is enough as block is already zeroed and entry list is empty... */ ++ info->u.v2_i.dqi_free_entry = blk; ++ mark_info_dirty(sb, dquot->dq_type); + } + if (le16_to_cpu(dh->dqdh_entries)+1 >= V2_DQSTRINBLK) /* Block will be full? */ +- if ((*err = remove_free_dqentry(filp, dquot->dq_type, buf, blk)) < 0) { ++ if ((*err = remove_free_dqentry(sb, dquot->dq_type, buf, blk)) < 0) { + printk(KERN_ERR "VFS: find_free_dqentry(): Can't remove block (%u) from entry free list.\n", blk); + goto out_buf; + } +@@ -339,7 +313,7 @@ static uint find_free_dqentry(struct dqu + goto out_buf; + } + #endif +- if ((*err = write_blk(filp, blk, buf)) < 0) { ++ if ((*err = write_blk(sb, dquot->dq_type, blk, buf)) < 0) { + printk(KERN_ERR "VFS: find_free_dqentry(): Can't write quota data block %u.\n", blk); + goto out_buf; + } +@@ -354,7 +328,7 @@ out_buf: + /* Insert reference to structure into the trie */ + static int do_insert_tree(struct dquot *dquot, uint *treeblk, int depth) + { +- struct file *filp = sb_dqopt(dquot->dq_sb)->files[dquot->dq_type]; ++ struct super_block *sb = dquot->dq_sb; + dqbuf_t buf; + int ret = 0, newson = 0, newact = 0; + __le32 *ref; +@@ -363,7 +337,7 @@ static int do_insert_tree(struct dquot * + if (!(buf = getdqbuf())) + return -ENOMEM; + if (!*treeblk) { +- ret = get_free_dqblk(filp, dquot->dq_type); ++ ret = get_free_dqblk(sb, dquot->dq_type); + if (ret < 0) + goto out_buf; + *treeblk = ret; +@@ -371,7 +345,7 @@ static int 
do_insert_tree(struct dquot * + newact = 1; + } + else { +- if ((ret = read_blk(filp, *treeblk, buf)) < 0) { ++ if ((ret = read_blk(sb, dquot->dq_type, *treeblk, buf)) < 0) { + printk(KERN_ERR "VFS: Can't read tree quota block %u.\n", *treeblk); + goto out_buf; + } +@@ -394,10 +368,10 @@ static int do_insert_tree(struct dquot * + ret = do_insert_tree(dquot, &newblk, depth+1); + if (newson && ret >= 0) { + ref[GETIDINDEX(dquot->dq_id, depth)] = cpu_to_le32(newblk); +- ret = write_blk(filp, *treeblk, buf); ++ ret = write_blk(sb, dquot->dq_type, *treeblk, buf); + } + else if (newact && ret < 0) +- put_free_dqblk(filp, dquot->dq_type, buf, *treeblk); ++ put_free_dqblk(sb, dquot->dq_type, buf, *treeblk); + out_buf: + freedqbuf(buf); + return ret; +@@ -416,20 +390,15 @@ static inline int dq_insert_tree(struct + static int v2_write_dquot(struct dquot *dquot) + { + int type = dquot->dq_type; +- struct file *filp; +- mm_segment_t fs; +- loff_t offset; + ssize_t ret; + struct v2_disk_dqblk ddquot, empty; + + /* dq_off is guarded by dqio_sem */ + if (!dquot->dq_off) + if ((ret = dq_insert_tree(dquot)) < 0) { +- printk(KERN_ERR "VFS: Error %Zd occurred while creating quota.\n", ret); ++ printk(KERN_ERR "VFS: Error %d occurred while creating quota.\n", ret); + return ret; + } +- filp = sb_dqopt(dquot->dq_sb)->files[type]; +- offset = dquot->dq_off; + spin_lock(&dq_data_lock); + mem2diskdqb(&ddquot, &dquot->dq_dqb, dquot->dq_id); + /* Argh... 
We may need to write structure full of zeroes but that would be +@@ -439,10 +408,8 @@ static int v2_write_dquot(struct dquot * + if (!memcmp(&empty, &ddquot, sizeof(struct v2_disk_dqblk))) + ddquot.dqb_itime = cpu_to_le64(1); + spin_unlock(&dq_data_lock); +- fs = get_fs(); +- set_fs(KERNEL_DS); +- ret = filp->f_op->write(filp, (char *)&ddquot, sizeof(struct v2_disk_dqblk), &offset); +- set_fs(fs); ++ ret = dquot->dq_sb->s_op->quota_write(dquot->dq_sb, type, ++ (char *)&ddquot, sizeof(struct v2_disk_dqblk), dquot->dq_off); + if (ret != sizeof(struct v2_disk_dqblk)) { + printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", dquot->dq_sb->s_id); + if (ret >= 0) +@@ -458,7 +425,8 @@ static int v2_write_dquot(struct dquot * + /* Free dquot entry in data block */ + static int free_dqentry(struct dquot *dquot, uint blk) + { +- struct file *filp = sb_dqopt(dquot->dq_sb)->files[dquot->dq_type]; ++ struct super_block *sb = dquot->dq_sb; ++ int type = dquot->dq_type; + struct v2_disk_dqdbheader *dh; + dqbuf_t buf = getdqbuf(); + int ret = 0; +@@ -466,34 +434,39 @@ static int free_dqentry(struct dquot *dq + if (!buf) + return -ENOMEM; + if (dquot->dq_off >> V2_DQBLKSIZE_BITS != blk) { +- printk(KERN_ERR "VFS: Quota structure has offset to other block (%u) than it should (%u).\n", blk, (uint)(dquot->dq_off >> V2_DQBLKSIZE_BITS)); ++ printk(KERN_ERR "VFS: Quota structure has offset to other " ++ "block (%u) than it should (%u).\n", blk, ++ (uint)(dquot->dq_off >> V2_DQBLKSIZE_BITS)); + goto out_buf; + } +- if ((ret = read_blk(filp, blk, buf)) < 0) { ++ if ((ret = read_blk(sb, type, blk, buf)) < 0) { + printk(KERN_ERR "VFS: Can't read quota data block %u\n", blk); + goto out_buf; + } + dh = (struct v2_disk_dqdbheader *)buf; + dh->dqdh_entries = cpu_to_le16(le16_to_cpu(dh->dqdh_entries)-1); + if (!le16_to_cpu(dh->dqdh_entries)) { /* Block got free? 
*/ +- if ((ret = remove_free_dqentry(filp, dquot->dq_type, buf, blk)) < 0 || +- (ret = put_free_dqblk(filp, dquot->dq_type, buf, blk)) < 0) { +- printk(KERN_ERR "VFS: Can't move quota data block (%u) to free list.\n", blk); ++ if ((ret = remove_free_dqentry(sb, type, buf, blk)) < 0 || ++ (ret = put_free_dqblk(sb, type, buf, blk)) < 0) { ++ printk(KERN_ERR "VFS: Can't move quota data block (%u) " ++ "to free list.\n", blk); + goto out_buf; + } + } + else { +- memset(buf+(dquot->dq_off & ((1 << V2_DQBLKSIZE_BITS)-1)), 0, sizeof(struct v2_disk_dqblk)); ++ memset(buf+(dquot->dq_off & ((1 << V2_DQBLKSIZE_BITS)-1)), 0, ++ sizeof(struct v2_disk_dqblk)); + if (le16_to_cpu(dh->dqdh_entries) == V2_DQSTRINBLK-1) { + /* Insert will write block itself */ +- if ((ret = insert_free_dqentry(filp, dquot->dq_type, buf, blk)) < 0) { ++ if ((ret = insert_free_dqentry(sb, type, buf, blk)) < 0) { + printk(KERN_ERR "VFS: Can't insert quota data block (%u) to free entry list.\n", blk); + goto out_buf; + } + } + else +- if ((ret = write_blk(filp, blk, buf)) < 0) { +- printk(KERN_ERR "VFS: Can't write quota data block %u\n", blk); ++ if ((ret = write_blk(sb, type, blk, buf)) < 0) { ++ printk(KERN_ERR "VFS: Can't write quota data " ++ "block %u\n", blk); + goto out_buf; + } + } +@@ -506,7 +479,8 @@ out_buf: + /* Remove reference to dquot from tree */ + static int remove_tree(struct dquot *dquot, uint *blk, int depth) + { +- struct file *filp = sb_dqopt(dquot->dq_sb)->files[dquot->dq_type]; ++ struct super_block *sb = dquot->dq_sb; ++ int type = dquot->dq_type; + dqbuf_t buf = getdqbuf(); + int ret = 0; + uint newblk; +@@ -514,7 +488,7 @@ static int remove_tree(struct dquot *dqu + + if (!buf) + return -ENOMEM; +- if ((ret = read_blk(filp, *blk, buf)) < 0) { ++ if ((ret = read_blk(sb, type, *blk, buf)) < 0) { + printk(KERN_ERR "VFS: Can't read quota data block %u\n", *blk); + goto out_buf; + } +@@ -530,12 +504,13 @@ static int remove_tree(struct dquot *dqu + ref[GETIDINDEX(dquot->dq_id, 
depth)] = cpu_to_le32(0); + for (i = 0; i < V2_DQBLKSIZE && !buf[i]; i++); /* Block got empty? */ + if (i == V2_DQBLKSIZE) { +- put_free_dqblk(filp, dquot->dq_type, buf, *blk); ++ put_free_dqblk(sb, type, buf, *blk); + *blk = 0; + } + else +- if ((ret = write_blk(filp, *blk, buf)) < 0) +- printk(KERN_ERR "VFS: Can't write quota tree block %u.\n", *blk); ++ if ((ret = write_blk(sb, type, *blk, buf)) < 0) ++ printk(KERN_ERR "VFS: Can't write quota tree " ++ "block %u.\n", *blk); + } + out_buf: + freedqbuf(buf); +@@ -555,7 +530,6 @@ static int v2_delete_dquot(struct dquot + /* Find entry in block */ + static loff_t find_block_dqentry(struct dquot *dquot, uint blk) + { +- struct file *filp = sb_dqopt(dquot->dq_sb)->files[dquot->dq_type]; + dqbuf_t buf = getdqbuf(); + loff_t ret = 0; + int i; +@@ -563,27 +537,31 @@ static loff_t find_block_dqentry(struct + + if (!buf) + return -ENOMEM; +- if ((ret = read_blk(filp, blk, buf)) < 0) { ++ if ((ret = read_blk(dquot->dq_sb, dquot->dq_type, blk, buf)) < 0) { + printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); + goto out_buf; + } + if (dquot->dq_id) +- for (i = 0; i < V2_DQSTRINBLK && le32_to_cpu(ddquot[i].dqb_id) != dquot->dq_id; i++); ++ for (i = 0; i < V2_DQSTRINBLK && ++ le32_to_cpu(ddquot[i].dqb_id) != dquot->dq_id; i++); + else { /* ID 0 as a bit more complicated searching... 
*/ + struct v2_disk_dqblk fakedquot; + + memset(&fakedquot, 0, sizeof(struct v2_disk_dqblk)); + for (i = 0; i < V2_DQSTRINBLK; i++) +- if (!le32_to_cpu(ddquot[i].dqb_id) && memcmp(&fakedquot, ddquot+i, sizeof(struct v2_disk_dqblk))) ++ if (!le32_to_cpu(ddquot[i].dqb_id) && ++ memcmp(&fakedquot, ddquot+i, sizeof(struct v2_disk_dqblk))) + break; + } + if (i == V2_DQSTRINBLK) { +- printk(KERN_ERR "VFS: Quota for id %u referenced but not present.\n", dquot->dq_id); ++ printk(KERN_ERR "VFS: Quota for id %u referenced " ++ "but not present.\n", dquot->dq_id); + ret = -EIO; + goto out_buf; + } + else +- ret = (blk << V2_DQBLKSIZE_BITS) + sizeof(struct v2_disk_dqdbheader) + i * sizeof(struct v2_disk_dqblk); ++ ret = (blk << V2_DQBLKSIZE_BITS) + sizeof(struct ++ v2_disk_dqdbheader) + i * sizeof(struct v2_disk_dqblk); + out_buf: + freedqbuf(buf); + return ret; +@@ -592,14 +570,13 @@ out_buf: + /* Find entry for given id in the tree */ + static loff_t find_tree_dqentry(struct dquot *dquot, uint blk, int depth) + { +- struct file *filp = sb_dqopt(dquot->dq_sb)->files[dquot->dq_type]; + dqbuf_t buf = getdqbuf(); + loff_t ret = 0; + __le32 *ref = (__le32 *)buf; + + if (!buf) + return -ENOMEM; +- if ((ret = read_blk(filp, blk, buf)) < 0) { ++ if ((ret = read_blk(dquot->dq_sb, dquot->dq_type, blk, buf)) < 0) { + printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); + goto out_buf; + } +@@ -625,16 +602,13 @@ static inline loff_t find_dqentry(struct + static int v2_read_dquot(struct dquot *dquot) + { + int type = dquot->dq_type; +- struct file *filp; +- mm_segment_t fs; + loff_t offset; + struct v2_disk_dqblk ddquot, empty; + int ret = 0; + +- filp = sb_dqopt(dquot->dq_sb)->files[type]; +- + #ifdef __QUOTA_V2_PARANOIA +- if (!filp || !dquot->dq_sb) { /* Invalidated quota? */ ++ /* Invalidated quota? 
*/ ++ if (!dquot->dq_sb || !sb_dqopt(dquot->dq_sb)->files[type]) { + printk(KERN_ERR "VFS: Quota invalidated while reading!\n"); + return -EIO; + } +@@ -642,7 +616,8 @@ static int v2_read_dquot(struct dquot *d + offset = find_dqentry(dquot); + if (offset <= 0) { /* Entry not present? */ + if (offset < 0) +- printk(KERN_ERR "VFS: Can't read quota structure for id %u.\n", dquot->dq_id); ++ printk(KERN_ERR "VFS: Can't read quota " ++ "structure for id %u.\n", dquot->dq_id); + dquot->dq_off = 0; + set_bit(DQ_FAKE_B, &dquot->dq_flags); + memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk)); +@@ -650,12 +625,13 @@ static int v2_read_dquot(struct dquot *d + } + else { + dquot->dq_off = offset; +- fs = get_fs(); +- set_fs(KERNEL_DS); +- if ((ret = filp->f_op->read(filp, (char *)&ddquot, sizeof(struct v2_disk_dqblk), &offset)) != sizeof(struct v2_disk_dqblk)) { ++ if ((ret = dquot->dq_sb->s_op->quota_read(dquot->dq_sb, type, ++ (char *)&ddquot, sizeof(struct v2_disk_dqblk), offset)) ++ != sizeof(struct v2_disk_dqblk)) { + if (ret >= 0) + ret = -EIO; +- printk(KERN_ERR "VFS: Error while reading quota structure for id %u.\n", dquot->dq_id); ++ printk(KERN_ERR "VFS: Error while reading quota " ++ "structure for id %u.\n", dquot->dq_id); + memset(&ddquot, 0, sizeof(struct v2_disk_dqblk)); + } + else { +@@ -666,7 +642,6 @@ static int v2_read_dquot(struct dquot *d + if (!memcmp(&empty, &ddquot, sizeof(struct v2_disk_dqblk))) + ddquot.dqb_itime = 0; + } +- set_fs(fs); + disk2memdqb(&dquot->dq_dqb, &ddquot); + if (!dquot->dq_dqb.dqb_bhardlimit && + !dquot->dq_dqb.dqb_bsoftlimit && +diff -puN include/linux/fs.h~fix-of-quota-deadlock-on-pagelock-quota-core include/linux/fs.h +--- 25/include/linux/fs.h~fix-of-quota-deadlock-on-pagelock-quota-core 2004-12-03 20:56:04.300106472 -0800 ++++ 25-akpm/include/linux/fs.h 2004-12-03 20:56:04.319103584 -0800 +@@ -1004,6 +1004,9 @@ struct super_operations { + void (*umount_begin) (struct super_block *); + + int (*show_options)(struct seq_file *, 
struct vfsmount *); ++ ++ ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); ++ ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); + }; + + /* Inode state bits. Protected by inode_lock. */ + +diff -puN include/linux/quota.h~fix-of-quota-deadlock-on-pagelock-quota-core include/linux/quota.h +--- 25/include/linux/quota.h~fix-of-quota-deadlock-on-pagelock-quota-core 2004-12-03 20:56:04.301106320 -0800 ++++ 25-akpm/include/linux/quota.h 2004-12-03 20:56:04.320103432 -0800 +@@ -285,7 +285,7 @@ struct quota_info { + struct semaphore dqio_sem; /* lock device while I/O in progress */ + struct semaphore dqonoff_sem; /* Serialize quotaon & quotaoff */ + struct rw_semaphore dqptr_sem; /* serialize ops using quota_info struct, pointers from inode to dquots */ +- struct file *files[MAXQUOTAS]; /* fp's to quotafiles */ ++ struct inode *files[MAXQUOTAS]; /* inodes of quotafiles */ + struct mem_dqinfo info[MAXQUOTAS]; /* Information for each quota type */ + struct quota_format_ops *ops[MAXQUOTAS]; /* Operations for each type */ + }; +diff -puN include/linux/security.h~fix-of-quota-deadlock-on-pagelock-quota-core include/linux/security.h +--- 25/include/linux/security.h~fix-of-quota-deadlock-on-pagelock-quota-core 2004-12-03 20:56:04.303106016 -0800 ++++ 25-akpm/include/linux/security.h 2004-12-03 20:56:04.322103128 -0800 +@@ -1033,7 +1033,7 @@ struct security_operations { + int (*sysctl) (ctl_table * table, int op); + int (*capable) (struct task_struct * tsk, int cap); + int (*quotactl) (int cmds, int type, int id, struct super_block * sb); +- int (*quota_on) (struct file * f); ++ int (*quota_on) (struct dentry * dentry); + int (*syslog) (int type); + int (*vm_enough_memory) (long pages); + +@@ -1281,9 +1281,9 @@ static inline int security_quotactl (int + return security_ops->quotactl (cmds, type, id, sb); + } + +-static inline int security_quota_on (struct file * file) ++static inline int security_quota_on (struct dentry * 
dentry) + { +- return security_ops->quota_on (file); ++ return security_ops->quota_on (dentry); + } + + static inline int security_syslog(int type) +@@ -1959,7 +1959,7 @@ static inline int security_quotactl (int + return 0; + } + +-static inline int security_quota_on (struct file * file) ++static inline int security_quota_on (struct dentry * dentry) + { + return 0; + } +diff -puN security/dummy.c~fix-of-quota-deadlock-on-pagelock-quota-core security/dummy.c +--- 25/security/dummy.c~fix-of-quota-deadlock-on-pagelock-quota-core 2004-12-03 20:56:04.304105864 -0800 ++++ 25-akpm/security/dummy.c 2004-12-03 20:56:04.323102976 -0800 +@@ -92,7 +92,7 @@ static int dummy_quotactl (int cmds, int + return 0; + } + +-static int dummy_quota_on (struct file *f) ++static int dummy_quota_on (struct dentry *dentry) + { + return 0; + } +diff -puN security/selinux/hooks.c~fix-of-quota-deadlock-on-pagelock-quota-core security/selinux/hooks.c +--- 25/security/selinux/hooks.c~fix-of-quota-deadlock-on-pagelock-quota-core 2004-12-03 20:56:04.306105560 -0800 ++++ 25-akpm/security/selinux/hooks.c 2004-12-03 20:56:04.326102520 -0800 +@@ -1494,9 +1494,9 @@ static int selinux_quotactl(int cmds, in + return rc; + } + +-static int selinux_quota_on(struct file *f) ++static int selinux_quota_on(struct dentry *dentry) + { +- return file_has_perm(current, f, FILE__QUOTAON); ++ return dentry_has_perm(current, NULL, dentry, FILE__QUOTAON); + } + + static int selinux_syslog(int type) +_ diff --git a/lustre/kernel_patches/patches/quota-deadlock-on-pagelock-ext3.patch b/lustre/kernel_patches/patches/quota-deadlock-on-pagelock-ext3.patch new file mode 100644 index 0000000..bcfa38a --- /dev/null +++ b/lustre/kernel_patches/patches/quota-deadlock-on-pagelock-ext3.patch @@ -0,0 +1,273 @@ +Index: linux-2.6.9/fs/ext3/inode.c +=================================================================== +--- linux-2.6.9.orig/fs/ext3/inode.c 2006-08-25 16:39:10.000000000 +0800 ++++ linux-2.6.9/fs/ext3/inode.c 2006-09-14 
11:44:29.000000000 +0800 +@@ -1028,7 +1028,7 @@ + return ret; + } + +-static int ++int + ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh) + { + int err = journal_dirty_data(handle, bh); +Index: linux-2.6.9/fs/ext3/super.c +=================================================================== +--- linux-2.6.9.orig/fs/ext3/super.c 2006-08-25 16:39:48.000000000 +0800 ++++ linux-2.6.9/fs/ext3/super.c 2006-09-14 11:51:48.000000000 +0800 +@@ -529,7 +529,10 @@ + static int ext3_write_info(struct super_block *sb, int type); + static int ext3_quota_on(struct super_block *sb, int type, int format_id, char *path); + static int ext3_quota_on_mount(struct super_block *sb, int type); +-static int ext3_quota_off_mount(struct super_block *sb, int type); ++static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data, ++ size_t len, loff_t off); ++static ssize_t ext3_quota_write(struct super_block *sb, int type, ++ const char *data, size_t len, loff_t off); + + static struct dquot_operations ext3_quota_operations = { + .initialize = ext3_dquot_initialize, +@@ -572,6 +575,10 @@ + .statfs = ext3_statfs, + .remount_fs = ext3_remount, + .clear_inode = ext3_clear_inode, ++#ifdef CONFIG_QUOTA ++ .quota_read = ext3_quota_read, ++ .quota_write = ext3_quota_write, ++#endif + }; + + static struct dentry *ext3_get_dentry(struct super_block *sb, void *vobjp) +@@ -713,6 +720,7 @@ + int option; + #ifdef CONFIG_QUOTA + int qtype; ++ char *qname; + #endif + + if (!options) +@@ -891,19 +899,22 @@ + "quota options when quota turned on.\n"); + return 0; + } +- if (sbi->s_qf_names[qtype]) { ++ qname = match_strdup(&args[0]); ++ if (!qname) { + printk(KERN_ERR +- "EXT3-fs: %s quota file already " +- "specified.\n", QTYPE2NAME(qtype)); ++ "EXT3-fs: not enough memory for " ++ "storing quotafile name.\n"); + return 0; + } +- sbi->s_qf_names[qtype] = match_strdup(&args[0]); +- if (!sbi->s_qf_names[qtype]) { ++ if (sbi->s_qf_names[qtype] && ++ strcmp(sbi->s_qf_names[qtype], 
qname)) { + printk(KERN_ERR +- "EXT3-fs: not enough memory for " +- "storing quotafile name.\n"); ++ "EXT3-fs: %s quota file already " ++ "specified.\n", QTYPE2NAME(qtype)); ++ kfree(qname); + return 0; + } ++ sbi->s_qf_names[qtype] = qname; + if (strchr(sbi->s_qf_names[qtype], '/')) { + printk(KERN_ERR + "EXT3-fs: quotafile must be on " +@@ -1223,7 +1234,7 @@ + /* Turn quotas off */ + for (i = 0; i < MAXQUOTAS; i++) { + if (sb_dqopt(sb)->files[i]) +- ext3_quota_off_mount(sb, i); ++ vfs_quota_off(sb, i); + } + #endif + sb->s_flags = s_flags; /* Restore MS_RDONLY status */ +@@ -2240,7 +2251,7 @@ + + static inline struct inode *dquot_to_inode(struct dquot *dquot) + { +- return sb_dqopt(dquot->dq_sb)->files[dquot->dq_type]->f_dentry->d_inode; ++ return sb_dqopt(dquot->dq_sb)->files[dquot->dq_type]; + } + + static int ext3_dquot_initialize(struct inode *inode, int type) +@@ -2279,8 +2290,10 @@ + { + int ret, err; + handle_t *handle; ++ struct inode *inode; + +- handle = ext3_journal_start(dquot_to_inode(dquot), ++ inode = dquot_to_inode(dquot); ++ handle = ext3_journal_start(inode, + EXT3_QUOTA_TRANS_BLOCKS); + if (IS_ERR(handle)) + return PTR_ERR(handle); +@@ -2367,22 +2380,9 @@ + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + err = vfs_quota_on_mount(type, EXT3_SB(sb)->s_jquota_fmt, dentry); +- if (err) +- dput(dentry); +- /* We keep the dentry reference if everything went ok - we drop it +- * on quota_off time */ +- return err; +-} +- +-/* Turn quotas off during mount time */ +-static int ext3_quota_off_mount(struct super_block *sb, int type) +-{ +- int err; +- struct dentry *dentry; +- +- dentry = sb_dqopt(sb)->files[type]->f_dentry; +- err = vfs_quota_off_mount(sb, type); +- /* We invalidate dentry - it has at least wrong hash... 
*/ ++ /* Now invalidate and put the dentry - quota got its own reference ++ * to inode and dentry has at least wrong hash so we had better ++ * throw it away */ + d_invalidate(dentry); + dput(dentry); + return err; +@@ -2405,20 +2405,121 @@ + if (err) + return err; + /* Quotafile not on the same filesystem? */ +- if (nd.mnt->mnt_sb != sb) ++ if (nd.mnt->mnt_sb != sb) { ++ path_release(&nd); + return -EXDEV; ++ } + /* Quotafile not of fs root? */ + if (nd.dentry->d_parent->d_inode != sb->s_root->d_inode) + printk(KERN_WARNING + "EXT3-fs: Quota file not on filesystem root. " + "Journalled quota will not work.\n"); +- if (!ext3_should_journal_data(nd.dentry->d_inode)) +- printk(KERN_WARNING "EXT3-fs: Quota file does not have " +- "data-journalling. Journalled quota will not work.\n"); + path_release(&nd); + return vfs_quota_on(sb, type, format_id, path); + } + ++/* Read data from quotafile - avoid pagecache and such because we cannot afford ++ * acquiring the locks... As quota files are never truncated and quota code ++ * itself serializes the operations (and noone else should touch the files) ++ * we don't have to be afraid of races */ ++static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data, ++ size_t len, loff_t off) ++{ ++ struct inode *inode = sb_dqopt(sb)->files[type]; ++ sector_t blk = off >> EXT3_BLOCK_SIZE_BITS(sb); ++ int err = 0; ++ int offset = off & (sb->s_blocksize - 1); ++ int tocopy; ++ size_t toread; ++ struct buffer_head *bh; ++ loff_t i_size = i_size_read(inode); ++ ++ if (off > i_size) ++ return 0; ++ if (off+len > i_size) ++ len = i_size-off; ++ toread = len; ++ while (toread > 0) { ++ tocopy = sb->s_blocksize - offset < toread ? ++ sb->s_blocksize - offset : toread; ++ bh = ext3_bread(NULL, inode, blk, 0, &err); ++ if (err) ++ return err; ++ if (!bh) /* A hole? 
*/ ++ memset(data, 0, tocopy); ++ else ++ memcpy(data, bh->b_data+offset, tocopy); ++ brelse(bh); ++ offset = 0; ++ toread -= tocopy; ++ data += tocopy; ++ blk++; ++ } ++ return len; ++} ++ ++/* Write to quotafile (we know the transaction is already started and has ++ * enough credits) */ ++static ssize_t ext3_quota_write(struct super_block *sb, int type, ++ const char *data, size_t len, loff_t off) ++{ ++ struct inode *inode = sb_dqopt(sb)->files[type]; ++ sector_t blk = off >> EXT3_BLOCK_SIZE_BITS(sb); ++ int err = 0; ++ int offset = off & (sb->s_blocksize - 1); ++ int tocopy; ++ int journal_quota = EXT3_SB(sb)->s_qf_names[type] != NULL; ++ size_t towrite = len; ++ struct buffer_head *bh; ++ handle_t *handle = journal_current_handle(); ++ ++ down(&inode->i_sem); ++ while (towrite > 0) { ++ tocopy = sb->s_blocksize - offset < towrite ? ++ sb->s_blocksize - offset : towrite; ++ bh = ext3_bread(handle, inode, blk, 1, &err); ++ if (!bh) ++ goto out; ++ if (journal_quota) { ++ err = ext3_journal_get_write_access(handle, bh); ++ if (err) { ++ brelse(bh); ++ goto out; ++ } ++ } ++ lock_buffer(bh); ++ memcpy(bh->b_data+offset, data, tocopy); ++ flush_dcache_page(bh->b_page); ++ unlock_buffer(bh); ++ if (journal_quota) ++ err = ext3_journal_dirty_metadata(handle, bh); ++ else { ++ /* Always do at least ordered writes for quotas */ ++ err = ext3_journal_dirty_data(handle, bh); ++ mark_buffer_dirty(bh); ++ } ++ brelse(bh); ++ if (err) ++ goto out; ++ offset = 0; ++ towrite -= tocopy; ++ data += tocopy; ++ blk++; ++ } ++out: ++ if (len == towrite) ++ return err; ++ if (inode->i_size < off+len-towrite) { ++ i_size_write(inode, off+len-towrite); ++ EXT3_I(inode)->i_disksize = inode->i_size; ++ } ++ inode->i_version++; ++ inode->i_mtime = inode->i_ctime = CURRENT_TIME; ++ ext3_mark_inode_dirty(handle, inode); ++ up(&inode->i_sem); ++ return len - towrite; ++} ++ + #endif + + static struct super_block *ext3_get_sb(struct file_system_type *fs_type, +Index: 
linux-2.6.9/include/linux/ext3_jbd.h +=================================================================== +--- linux-2.6.9.orig/include/linux/ext3_jbd.h 2006-08-25 16:39:09.000000000 +0800 ++++ linux-2.6.9/include/linux/ext3_jbd.h 2006-09-14 11:44:29.000000000 +0800 +@@ -193,6 +193,8 @@ + #define ext3_journal_forget(handle, bh) \ + __ext3_journal_forget(__FUNCTION__, (handle), (bh)) + ++int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh); ++ + handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks); + int __ext3_journal_stop(const char *where, handle_t *handle); + diff --git a/lustre/kernel_patches/patches/quota-umount-race-fix.patch b/lustre/kernel_patches/patches/quota-umount-race-fix.patch new file mode 100644 index 0000000..42428c3 --- /dev/null +++ b/lustre/kernel_patches/patches/quota-umount-race-fix.patch @@ -0,0 +1,139 @@ + +From: Jan Kara + +Fix possible races between umount and quota on/off. + +Finally I decided to take a reference to vfsmount during vfs_quota_on() and +to drop it after the final cleanup in the vfs_quota_off(). This way we +should be all the time guarded against umount. This way was protected also +the old code which used filp_open() for opening quota files. I was also +thinking about other ways of protection but there would be always a window +(provided I don't want to play much with namespace locks) where +vfs_quota_on() could be called while umount() is in progress resulting in +the "Busy inodes after unmount" messages... + +Get a reference to vfsmount during quotaon() so that we are guarded against +umount (as was the old code using filp_open()). 
+ +Signed-off-by: Jan Kara +Signed-off-by: Andrew Morton +--- + + 25-akpm/fs/dquot.c | 45 ++++++++++++++++++++++++++++----------- + 25-akpm/include/linux/quota.h | 1 + 25-akpm/include/linux/quotaops.h | 2 - + 3 files changed, 35 insertions(+), 13 deletions(-) + +diff -puN fs/dquot.c~quota-umount-race-fix fs/dquot.c +--- 25/fs/dquot.c~quota-umount-race-fix Tue Nov 23 17:11:34 2004 ++++ 25-akpm/fs/dquot.c Tue Nov 23 17:11:34 2004 +@@ -1314,12 +1314,14 @@ int vfs_quota_off(struct super_block *sb + { + int cnt; + struct quota_info *dqopt = sb_dqopt(sb); +- struct inode *toput[MAXQUOTAS]; ++ struct inode *toputinode[MAXQUOTAS]; ++ struct vfsmount *toputmnt[MAXQUOTAS]; + + /* We need to serialize quota_off() for device */ + down(&dqopt->dqonoff_sem); + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { +- toput[cnt] = NULL; ++ toputinode[cnt] = NULL; ++ toputmnt[cnt] = NULL; + if (type != -1 && cnt != type) + continue; + if (!sb_has_quota_enabled(sb, cnt)) +@@ -1339,8 +1341,10 @@ int vfs_quota_off(struct super_block *sb + dqopt->ops[cnt]->free_file_info(sb, cnt); + put_quota_format(dqopt->info[cnt].dqi_format); + +- toput[cnt] = dqopt->files[cnt]; ++ toputinode[cnt] = dqopt->files[cnt]; ++ toputmnt[cnt] = dqopt->mnt[cnt]; + dqopt->files[cnt] = NULL; ++ dqopt->mnt[cnt] = NULL; + dqopt->info[cnt].dqi_flags = 0; + dqopt->info[cnt].dqi_igrace = 0; + dqopt->info[cnt].dqi_bgrace = 0; +@@ -1348,7 +1352,10 @@ int vfs_quota_off(struct super_block *sb + } + up(&dqopt->dqonoff_sem); + /* Sync the superblock so that buffers with quota data are written to +- * disk (and so userspace sees correct data afterwards) */ ++ * disk (and so userspace sees correct data afterwards). ++ * The reference to vfsmnt we are still holding protects us from ++ * umount (we don't have it only when quotas are turned on/off for ++ * journal replay but in that case we are guarded by the fs anyway). 
*/ + if (sb->s_op->sync_fs) + sb->s_op->sync_fs(sb, 1); + sync_blockdev(sb->s_bdev); +@@ -1358,13 +1365,24 @@ int vfs_quota_off(struct super_block *sb + * must also discard the blockdev buffers so that we see the + * changes done by userspace on the next quotaon() */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) +- if (toput[cnt]) { +- down(&toput[cnt]->i_sem); +- toput[cnt]->i_flags &= ~(S_IMMUTABLE | S_NOATIME | S_NOQUOTA); +- truncate_inode_pages(&toput[cnt]->i_data, 0); +- up(&toput[cnt]->i_sem); +- mark_inode_dirty(toput[cnt]); +- iput(toput[cnt]); ++ if (toputinode[cnt]) { ++ down(&dqopt->dqonoff_sem); ++ /* If quota was reenabled in the meantime, we have ++ * nothing to do */ ++ if (!sb_has_quota_enabled(sb, cnt)) { ++ down(&toputinode[cnt]->i_sem); ++ toputinode[cnt]->i_flags &= ~(S_IMMUTABLE | ++ S_NOATIME | S_NOQUOTA); ++ truncate_inode_pages(&toputinode[cnt]->i_data, 0); ++ up(&toputinode[cnt]->i_sem); ++ mark_inode_dirty(toputinode[cnt]); ++ iput(toputinode[cnt]); ++ } ++ up(&dqopt->dqonoff_sem); ++ /* We don't hold the reference when we turned on quotas ++ * just for the journal replay... */ ++ if (toputmnt[cnt]) ++ mntput(toputmnt[cnt]); + } + invalidate_bdev(sb->s_bdev, 0); + return 0; +@@ -1478,8 +1496,11 @@ int vfs_quota_on(struct super_block *sb, + /* Quota file not on the same filesystem? 
*/ + if (nd.mnt->mnt_sb != sb) + error = -EXDEV; +- else ++ else { + error = vfs_quota_on_inode(nd.dentry->d_inode, type, format_id); ++ if (!error) ++ sb_dqopt(sb)->mnt[type] = mntget(nd.mnt); ++ } + out_path: + path_release(&nd); + return error; +diff -puN include/linux/quota.h~quota-umount-race-fix include/linux/quota.h +--- 25/include/linux/quota.h~quota-umount-race-fix Tue Nov 23 17:11:34 2004 ++++ 25-akpm/include/linux/quota.h Tue Nov 23 17:11:34 2004 +@@ -286,6 +286,7 @@ struct quota_info { + struct semaphore dqonoff_sem; /* Serialize quotaon & quotaoff */ + struct rw_semaphore dqptr_sem; /* serialize ops using quota_info struct, pointers from inode to dquots */ + struct inode *files[MAXQUOTAS]; /* inodes of quotafiles */ ++ struct vfsmount *mnt[MAXQUOTAS]; /* mountpoint entries of filesystems with quota files */ + struct mem_dqinfo info[MAXQUOTAS]; /* Information for each quota type */ + struct quota_format_ops *ops[MAXQUOTAS]; /* Operations for each type */ + }; +diff -puN include/linux/quotaops.h~quota-umount-race-fix include/linux/quotaops.h +--- 25/include/linux/quotaops.h~quota-umount-race-fix Tue Nov 23 17:11:34 2004 ++++ 25-akpm/include/linux/quotaops.h Tue Nov 23 17:11:34 2004 +@@ -177,7 +177,7 @@ static __inline__ int DQUOT_OFF(struct s + { + int ret = -ENOSYS; + +- if (sb->s_qcop && sb->s_qcop->quota_off) ++ if (sb_any_quota_enabled(sb) && sb->s_qcop && sb->s_qcop->quota_off) + ret = sb->s_qcop->quota_off(sb, -1); + return ret; + } +_ diff --git a/lustre/tests/flocks_test.c b/lustre/tests/flocks_test.c new file mode 100644 index 0000000..ff54e06 --- /dev/null +++ b/lustre/tests/flocks_test.c @@ -0,0 +1,62 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + */ +#include +#include +#include +#include +#include +#include + +#include + +void usage(void) +{ + fprintf(stderr, "usage: ./flocks_test on|off -c|-f|-l /path/to/file\n"); + exit(EXIT_FAILURE); +} + +int main(int argc, char *argv[]) +{ + 
int fd;
+ int mount_with_flock = 0;
+ int error = 0;
+
+ if (argc != 4)
+ usage();
+
+ if (!strncmp(argv[1], "on", 3)) {
+ mount_with_flock = 1;
+ } else if (!strncmp(argv[1], "off", 4)) {
+ mount_with_flock = 0;
+ } else {
+ usage();
+ }
+
+ if ((fd = open(argv[3], O_RDWR)) < 0) {
+ fprintf(stderr, "Couldn't open file: %s\n", argv[3]);
+ exit(EXIT_FAILURE);
+ }
+
+ if (!strncmp(argv[2], "-c", 3)) {
+ struct flock fl;
+
+ fl.l_type = F_RDLCK;
+ fl.l_whence = SEEK_SET;
+ fl.l_start = 0;
+ fl.l_len = 1;
+
+ error = fcntl(fd, F_SETLK, &fl);
+ } else if (!strncmp(argv[2], "-l", 3)) {
+ error = lockf(fd, F_LOCK, 1);
+ } else if (!strncmp(argv[2], "-f", 3)) {
+ error = flock(fd, LOCK_EX);
+ } else {
+ usage();
+ }
+
+ if (mount_with_flock)
+ return((error == 0) ? EXIT_SUCCESS : EXIT_FAILURE);
+ else
+ return((error == 0) ? EXIT_FAILURE : EXIT_SUCCESS);
+}
-- 
1.8.3.1