From 3a4d78df7eb9a45b4ed4b6873cff6ed3a3a4e52c Mon Sep 17 00:00:00 2001 From: ericm Date: Fri, 27 Oct 2006 18:12:58 +0000 Subject: [PATCH] branch: b1_8 merge from b1_5 (20061027_1139) --- .../patches/new-tcp-zero-copy-2.4.29-vanilla.patch | 307 +++++ .../new-tcp-zero-copy-2.6.9-41.2chaos.patch | 318 +++++ .../patches/quota-deadlock-on-pagelock-core.patch | 1264 ++++++++++++++++++++ .../patches/quota-deadlock-on-pagelock-ext3.patch | 273 +++++ .../patches/quota-umount-race-fix.patch | 139 +++ lustre/tests/flocks_test.c | 62 + 6 files changed, 2363 insertions(+) create mode 100644 lustre/kernel_patches/patches/new-tcp-zero-copy-2.4.29-vanilla.patch create mode 100644 lustre/kernel_patches/patches/new-tcp-zero-copy-2.6.9-41.2chaos.patch create mode 100644 lustre/kernel_patches/patches/quota-deadlock-on-pagelock-core.patch create mode 100644 lustre/kernel_patches/patches/quota-deadlock-on-pagelock-ext3.patch create mode 100644 lustre/kernel_patches/patches/quota-umount-race-fix.patch create mode 100644 lustre/tests/flocks_test.c diff --git a/lustre/kernel_patches/patches/new-tcp-zero-copy-2.4.29-vanilla.patch b/lustre/kernel_patches/patches/new-tcp-zero-copy-2.4.29-vanilla.patch new file mode 100644 index 0000000..62e3087 --- /dev/null +++ b/lustre/kernel_patches/patches/new-tcp-zero-copy-2.4.29-vanilla.patch @@ -0,0 +1,307 @@ +--- linux-2.4.29-orig/include/linux/skbuff.h 2006-10-10 01:25:07.000000000 +0100 ++++ linux-2.4.29/include/linux/skbuff.h 2006-10-10 00:42:59.000000000 +0100 +@@ -116,6 +116,36 @@ struct skb_frag_struct + __u16 size; + }; + ++/* Zero Copy Callback Descriptor ++ * This struct supports receiving notification when zero-copy network I/O has ++ * completed. The ZCCD can be embedded in a struct containing the state of a ++ * zero-copy network send. Every skbuff that references that send's pages also ++ * keeps a reference on the ZCCD. 
When they have all been disposed of, the ++ * reference count on the ZCCD drops to zero and the callback is made, telling ++ * the original caller that the pages may now be overwritten. */ ++struct zccd ++{ ++ atomic_t zccd_refcount; ++ void (*zccd_callback)(struct zccd *); ++}; ++ ++static inline void zccd_init (struct zccd *d, void (*callback)(struct zccd *)) ++{ ++ atomic_set (&d->zccd_refcount, 1); ++ d->zccd_callback = callback; ++} ++ ++static inline void zccd_incref (struct zccd *d) /* take a reference */ ++{ ++ atomic_inc (&d->zccd_refcount); ++} ++ ++static inline void zccd_decref (struct zccd *d) /* release a reference */ ++{ ++ if (atomic_dec_and_test (&d->zccd_refcount)) ++ (d->zccd_callback)(d); ++} ++ + /* This data is invariant across clones and lives at + * the end of the header data, ie. at skb->end. + */ +@@ -123,6 +153,11 @@ struct skb_shared_info { + atomic_t dataref; + unsigned int nr_frags; + struct sk_buff *frag_list; ++ struct zccd *zccd1; ++ struct zccd *zccd2; ++ /* NB zero-copy data is normally whole pages. We have 2 zccds in an ++ * skbuff so we don't unneccessarily split the packet where pages fall ++ * into the same packet. */ + skb_frag_t frags[MAX_SKB_FRAGS]; + }; + +@@ -1131,6 +1166,23 @@ static inline void kunmap_skb_frag(void + #endif + } + ++/* This skbuf has dropped its pages: drop refs on any zero-copy callback ++ * descriptors it has. 
*/ ++static inline void skb_complete_zccd (struct sk_buff *skb) ++{ ++ struct skb_shared_info *info = skb_shinfo(skb); ++ ++ if (info->zccd1 != NULL) { ++ zccd_decref(info->zccd1); ++ info->zccd1 = NULL; ++ } ++ ++ if (info->zccd2 != NULL) { ++ zccd_decref(info->zccd2); ++ info->zccd2 = NULL; ++ } ++} ++ + #define skb_queue_walk(queue, skb) \ + for (skb = (queue)->next; \ + (skb != (struct sk_buff *)(queue)); \ +--- linux-2.4.29-orig/include/net/tcp.h 2006-10-10 01:25:07.000000000 +0100 ++++ linux-2.4.29/include/net/tcp.h 2006-10-10 00:43:26.000000000 +0100 +@@ -674,6 +674,8 @@ extern int tcp_v4_tw_remember_stam + + extern int tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size); + extern ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags); ++extern ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size, ++ int flags, struct zccd *zccd); + + extern int tcp_ioctl(struct sock *sk, + int cmd, +--- linux-2.4.29-orig/net/core/skbuff.c 2006-10-10 01:25:08.000000000 +0100 ++++ linux-2.4.29/net/core/skbuff.c 2006-10-10 02:03:49.000000000 +0100 +@@ -208,6 +208,9 @@ struct sk_buff *alloc_skb(unsigned int s + atomic_set(&(skb_shinfo(skb)->dataref), 1); + skb_shinfo(skb)->nr_frags = 0; + skb_shinfo(skb)->frag_list = NULL; ++ skb_shinfo(skb)->zccd1 = NULL; /* zero-copy completion callback */ ++ skb_shinfo(skb)->zccd2 = NULL; /* not required (yet) */ ++ + return skb; + + nodata: +@@ -277,6 +280,9 @@ static void skb_release_data(struct sk_b + { + if (!skb->cloned || + atomic_dec_and_test(&(skb_shinfo(skb)->dataref))) { ++ /* complete zero-copy callbacks (if any) */ ++ skb_complete_zccd(skb); ++ + if (skb_shinfo(skb)->nr_frags) { + int i; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) +@@ -535,6 +541,8 @@ int skb_linearize(struct sk_buff *skb, i + atomic_set(&(skb_shinfo(skb)->dataref), 1); + skb_shinfo(skb)->nr_frags = 0; + skb_shinfo(skb)->frag_list = NULL; ++ skb_shinfo(skb)->zccd1 = NULL; 
/* zero-copy completion callback */ ++ skb_shinfo(skb)->zccd2 = NULL; /* not required */ + + /* We are no longer a clone, even if we were. */ + skb->cloned = 0; +@@ -589,6 +597,18 @@ struct sk_buff *pskb_copy(struct sk_buff + get_page(skb_shinfo(n)->frags[i].page); + } + skb_shinfo(n)->nr_frags = i; ++ ++ if (skb_shinfo(skb)->zccd1 != NULL) { ++ BUG_TRAP(skb_shinfo(n)->zccd1 = NULL); ++ skb_shinfo(n)->zccd1 = skb_shinfo(skb)->zccd1; ++ zccd_incref(skb_shinfo(n)->zccd1); ++ } ++ ++ if (skb_shinfo(skb)->zccd2 != NULL) { ++ BUG_TRAP(skb_shinfo(n)->zccd2 = NULL); ++ skb_shinfo(n)->zccd2 = skb_shinfo(skb)->zccd2; ++ zccd_incref(skb_shinfo(n)->zccd2); ++ } + } + + if (skb_shinfo(skb)->frag_list) { +@@ -638,6 +658,13 @@ int pskb_expand_head(struct sk_buff *skb + memcpy(data+nhead, skb->head, skb->tail-skb->head); + memcpy(data+size, skb->end, sizeof(struct skb_shared_info)); + ++ /* zero-copy descriptors have been copied into the new shinfo - ++ * account the new references */ ++ if (skb_shinfo(skb)->zccd1 != NULL) ++ zccd_incref(skb_shinfo(skb)->zccd1); ++ if (skb_shinfo(skb)->zccd2 != NULL) ++ zccd_incref(skb_shinfo(skb)->zccd2); ++ + for (i=0; inr_frags; i++) + get_page(skb_shinfo(skb)->frags[i].page); + +@@ -794,6 +821,9 @@ int ___pskb_trim(struct sk_buff *skb, un + offset = end; + } + ++ if (skb_shinfo(skb)->nr_frags == 0) /* dropped all the pages */ ++ skb_complete_zccd(skb); /* drop zccd refs */ ++ + if (offset < len) { + skb->data_len -= skb->len - len; + skb->len = len; +@@ -947,6 +977,9 @@ pull_pages: + } + skb_shinfo(skb)->nr_frags = k; + ++ if (k == 0) /* dropped all the pages */ ++ skb_complete_zccd(skb); /* drop zccd refs */ ++ + skb->tail += delta; + skb->data_len -= delta; + +--- linux-2.4.29-orig/net/ipv4/tcp_output.c 2004-11-17 11:54:22.000000000 +0000 ++++ linux-2.4.29/net/ipv4/tcp_output.c 2006-10-10 01:55:29.000000000 +0100 +@@ -379,6 +379,15 @@ static void skb_split(struct sk_buff *sk + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + 
skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i]; + ++ /* Transfer zero-copy callback descriptors */ ++ BUG_TRAP(skb_shinfo(skb1)->zccd1 == NULL); ++ skb_shinfo(skb1)->zccd1 = skb_shinfo(skb)->zccd1; ++ skb_shinfo(skb)->zccd1 = NULL; ++ ++ BUG_TRAP(skb_shinfo(skb1)->zccd2 == NULL); ++ skb_shinfo(skb1)->zccd2 = skb_shinfo(skb)->zccd2; ++ skb_shinfo(skb)->zccd2 = NULL; ++ + skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags; + skb_shinfo(skb)->nr_frags = 0; + +@@ -425,6 +434,30 @@ static void skb_split(struct sk_buff *sk + pos += size; + } + skb_shinfo(skb1)->nr_frags = k; ++ ++ if (k != 0) { ++ /* skb1 has pages. Transfer or clone the zccds */ ++ ++ if (skb_shinfo(skb)->zccd1 != NULL) { ++ BUG_TRAP(skb_shinfo(skb1)->zccd1 == NULL); ++ skb_shinfo(skb1)->zccd1 = skb_shinfo(skb)->zccd1; ++ ++ if (skb_shinfo(skb)->nr_frags == 0) ++ skb_shinfo(skb)->zccd1 = NULL; ++ else ++ zccd_incref(skb_shinfo(skb)->zccd1); ++ } ++ ++ if (skb_shinfo(skb)->zccd2 != NULL) { ++ BUG_TRAP(skb_shinfo(skb1)->zccd2 == NULL); ++ skb_shinfo(skb1)->zccd2 = skb_shinfo(skb)->zccd2; ++ ++ if (skb_shinfo(skb)->nr_frags == 0) ++ skb_shinfo(skb)->zccd2 = NULL; ++ else ++ zccd_incref(skb_shinfo(skb)->zccd2); ++ } ++ } + } + } + +--- linux-2.4.29-orig/net/ipv4/tcp.c 2006-10-10 01:25:08.000000000 +0100 ++++ linux-2.4.29/net/ipv4/tcp.c 2006-10-09 20:53:28.000000000 +0100 +@@ -749,7 +749,8 @@ do_interrupted: + goto out; + } + +-ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags); ++ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags, ++ struct zccd *zccd); + + static inline int + can_coalesce(struct sk_buff *skb, int i, struct page *page, int off) +@@ -828,7 +829,8 @@ static int tcp_error(struct sock *sk, in + return err; + } + +-ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags) ++ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int 
poffset, size_t psize, int flags, ++ struct zccd *zccd) + { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int mss_now; +@@ -876,6 +878,17 @@ new_segment: + copy = size; + + i = skb_shinfo(skb)->nr_frags; ++ ++ if (zccd != NULL && /* this is a zcc I/O */ ++ skb_shinfo(skb)->zccd1 != NULL && /* skb is part of a zcc I/O */ ++ skb_shinfo(skb)->zccd2 != NULL && ++ skb_shinfo(skb)->zccd1 != zccd && /* not the same one */ ++ skb_shinfo(skb)->zccd2 != zccd) ++ { ++ tcp_mark_push (tp, skb); ++ goto new_segment; ++ } ++ + if (can_coalesce(skb, i, page, offset)) { + skb_shinfo(skb)->frags[i-1].size += copy; + } else if (i < MAX_SKB_FRAGS) { +@@ -886,6 +899,18 @@ new_segment: + goto new_segment; + } + ++ if (zccd != NULL && /* completion callback wanted */ ++ skb_shinfo(skb)->zccd1 != zccd && /* new to this skbuf */ ++ skb_shinfo(skb)->zccd2 != zccd) { ++ if (skb_shinfo(skb)->zccd1 == NULL) { ++ skb_shinfo(skb)->zccd1 = zccd; ++ } else { ++ BUG_TRAP (skb_shinfo(skb)->zccd2 == NULL); ++ skb_shinfo(skb)->zccd2 = zccd; ++ } ++ zccd_incref(zccd); /* new reference */ ++ } ++ + skb->len += copy; + skb->data_len += copy; + skb->ip_summed = CHECKSUM_HW; +@@ -934,7 +959,8 @@ out_err: + return tcp_error(sk, flags, err); + } + +-ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) ++ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, ++ size_t size, int flags, struct zccd *zccd) + { + ssize_t res; + struct sock *sk = sock->sk; +@@ -949,12 +975,17 @@ ssize_t tcp_sendpage(struct socket *sock + + lock_sock(sk); + TCP_CHECK_TIMER(sk); +- res = do_tcp_sendpages(sk, &page, offset, size, flags); ++ res = do_tcp_sendpages(sk, &page, offset, size, flags, zccd); + TCP_CHECK_TIMER(sk); + release_sock(sk); + return res; + } + ++ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) ++{ ++ return tcp_sendpage_zccd(sock, page, offset, size, flags, NULL); ++} ++ + #define TCP_PAGE(sk) 
(sk->tp_pinfo.af_tcp.sndmsg_page) + #define TCP_OFF(sk) (sk->tp_pinfo.af_tcp.sndmsg_off) + diff --git a/lustre/kernel_patches/patches/new-tcp-zero-copy-2.6.9-41.2chaos.patch b/lustre/kernel_patches/patches/new-tcp-zero-copy-2.6.9-41.2chaos.patch new file mode 100644 index 0000000..8782730 --- /dev/null +++ b/lustre/kernel_patches/patches/new-tcp-zero-copy-2.6.9-41.2chaos.patch @@ -0,0 +1,318 @@ +--- linux/./include/net/tcp.h 2006-10-10 01:49:23.000000000 +0100 ++++ ../2.6.9-41.2chaos/linux/./include/net/tcp.h 2006-09-21 17:15:21.000000000 +0100 +@@ -787,6 +787,8 @@ extern int tcp_v4_tw_remember_stam + extern int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t size); + extern ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags); ++extern ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size, ++ int flags, struct zccd *zccd); + + extern int tcp_ioctl(struct sock *sk, + int cmd, +--- linux/./include/linux/skbuff.h 2006-10-10 01:49:23.000000000 +0100 ++++ ../2.6.9-41.2chaos/linux/./include/linux/skbuff.h 2006-10-06 18:09:35.000000000 +0100 +@@ -134,6 +134,36 @@ struct skb_frag_struct { + __u16 size; + }; + ++/* Zero Copy Callback Descriptor ++ * This struct supports receiving notification when zero-copy network I/O has ++ * completed. The ZCCD can be embedded in a struct containing the state of a ++ * zero-copy network send. Every skbuff that references that send's pages also ++ * keeps a reference on the ZCCD. When they have all been disposed of, the ++ * reference count on the ZCCD drops to zero and the callback is made, telling ++ * the original caller that the pages may now be overwritten. 
*/ ++struct zccd ++{ ++ atomic_t zccd_refcount; ++ void (*zccd_callback)(struct zccd *); ++}; ++ ++static inline void zccd_init (struct zccd *d, void (*callback)(struct zccd *)) ++{ ++ atomic_set (&d->zccd_refcount, 1); ++ d->zccd_callback = callback; ++} ++ ++static inline void zccd_incref (struct zccd *d) /* take a reference */ ++{ ++ atomic_inc (&d->zccd_refcount); ++} ++ ++static inline void zccd_decref (struct zccd *d) /* release a reference */ ++{ ++ if (atomic_dec_and_test (&d->zccd_refcount)) ++ (d->zccd_callback)(d); ++} ++ + /* This data is invariant across clones and lives at + * the end of the header data, ie. at skb->end. + */ +@@ -143,6 +173,11 @@ struct skb_shared_info { + unsigned short tso_size; + unsigned short tso_segs; + struct sk_buff *frag_list; ++ struct zccd *zccd1; ++ struct zccd *zccd2; ++ /* NB zero-copy data is normally whole pages. We have 2 zccds in an ++ * skbuff so we don't unneccessarily split the packet where pages fall ++ * into the same packet. */ + skb_frag_t frags[MAX_SKB_FRAGS]; + }; + +@@ -1070,6 +1105,23 @@ static inline void kunmap_skb_frag(void + #endif + } + ++/* This skbuf has dropped its pages: drop refs on any zero-copy callback ++ * descriptors it has. 
*/ ++static inline void skb_complete_zccd (struct sk_buff *skb) ++{ ++ struct skb_shared_info *info = skb_shinfo(skb); ++ ++ if (info->zccd1 != NULL) { ++ zccd_decref(info->zccd1); ++ info->zccd1 = NULL; ++ } ++ ++ if (info->zccd2 != NULL) { ++ zccd_decref(info->zccd2); ++ info->zccd2 = NULL; ++ } ++} ++ + #define skb_queue_walk(queue, skb) \ + for (skb = (queue)->next, prefetch(skb->next); \ + (skb != (struct sk_buff *)(queue)); \ +--- linux/./net/core/dev.c 2006-10-10 01:49:23.000000000 +0100 ++++ ../2.6.9-41.2chaos/linux/./net/core/dev.c 2006-09-21 16:53:45.000000000 +0100 +@@ -1140,6 +1140,8 @@ int __skb_linearize(struct sk_buff *skb, + ninfo->tso_segs = skb_shinfo(skb)->tso_segs; + ninfo->nr_frags = 0; + ninfo->frag_list = NULL; ++ ninfo->zccd1 = NULL; /* zero copy completion callback */ ++ ninfo->zccd2 = NULL; /* not required */ + + /* Offset between the two in bytes */ + offset = data - skb->head; +--- linux/./net/core/skbuff.c 2006-10-10 01:49:23.000000000 +0100 ++++ ../2.6.9-41.2chaos/linux/./net/core/skbuff.c 2006-10-10 01:46:16.000000000 +0100 +@@ -155,6 +155,8 @@ struct sk_buff *alloc_skb(unsigned int s + skb_shinfo(skb)->tso_size = 0; + skb_shinfo(skb)->tso_segs = 0; + skb_shinfo(skb)->frag_list = NULL; ++ skb_shinfo(skb)->zccd1 = NULL; /* zero-copy completion callback */ ++ skb_shinfo(skb)->zccd2 = NULL; /* not required (yet) */ + out: + return skb; + nodata: +@@ -189,6 +191,9 @@ void skb_release_data(struct sk_buff *sk + { + if (!skb->cloned || + atomic_dec_and_test(&(skb_shinfo(skb)->dataref))) { ++ /* complete zero-copy callbacks (if any) */ ++ skb_complete_zccd(skb); ++ + if (skb_shinfo(skb)->nr_frags) { + int i; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) +@@ -484,6 +489,18 @@ struct sk_buff *pskb_copy(struct sk_buff + get_page(skb_shinfo(n)->frags[i].page); + } + skb_shinfo(n)->nr_frags = i; ++ ++ if (skb_shinfo(skb)->zccd1 != NULL) { ++ BUG_TRAP(skb_shinfo(n)->zccd1 == NULL); ++ skb_shinfo(n)->zccd1 = skb_shinfo(skb)->zccd1; ++ 
zccd_incref(skb_shinfo(n)->zccd1); ++ } ++ ++ if (skb_shinfo(skb)->zccd2 != NULL) { ++ BUG_TRAP(skb_shinfo(n)->zccd2 == NULL); ++ skb_shinfo(n)->zccd2 = skb_shinfo(skb)->zccd2; ++ zccd_incref(skb_shinfo(n)->zccd2); ++ } + } + + if (skb_shinfo(skb)->frag_list) { +@@ -533,6 +550,13 @@ int pskb_expand_head(struct sk_buff *skb + memcpy(data + nhead, skb->head, skb->tail - skb->head); + memcpy(data + size, skb->end, sizeof(struct skb_shared_info)); + ++ /* zero-copy descriptors have been copied into the new shinfo - ++ * account the new references */ ++ if (skb_shinfo(skb)->zccd1 != NULL) ++ zccd_incref(skb_shinfo(skb)->zccd1); ++ if (skb_shinfo(skb)->zccd2 != NULL) ++ zccd_incref(skb_shinfo(skb)->zccd2); ++ + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + get_page(skb_shinfo(skb)->frags[i].page); + +@@ -694,6 +718,9 @@ int ___pskb_trim(struct sk_buff *skb, un + offset = end; + } + ++ if (skb_shinfo(skb)->nr_frags == 0) /* dropped all the pages */ ++ skb_complete_zccd(skb); /* drop zccd refs */ ++ + if (offset < len) { + skb->data_len -= skb->len - len; + skb->len = len; +@@ -846,6 +873,9 @@ pull_pages: + } + skb_shinfo(skb)->nr_frags = k; + ++ if (k == 0) /* dropped all the pages */ ++ skb_complete_zccd(skb); /* drop zccd refs */ ++ + skb->tail += delta; + skb->data_len -= delta; + +@@ -1362,6 +1392,15 @@ static void inline skb_split_inside_head + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i]; + ++ /* Transfer zero-copy callback descriptors */ ++ BUG_TRAP(skb_shinfo(skb1)->zccd1 == NULL); ++ skb_shinfo(skb1)->zccd1 = skb_shinfo(skb)->zccd1; ++ skb_shinfo(skb)->zccd1 = NULL; ++ ++ BUG_TRAP(skb_shinfo(skb1)->zccd2 == NULL); ++ skb_shinfo(skb1)->zccd2 = skb_shinfo(skb)->zccd2; ++ skb_shinfo(skb)->zccd2 = NULL; ++ + skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags; + skb_shinfo(skb)->nr_frags = 0; + skb1->data_len = skb->data_len; +@@ -1410,6 +1449,30 @@ static void inline skb_split_no_header(s + pos += 
size; + } + skb_shinfo(skb1)->nr_frags = k; ++ ++ if (k != 0) { ++ /* skb1 has pages. Transfer or clone the zccds */ ++ ++ if (skb_shinfo(skb)->zccd1 != NULL) { ++ BUG_TRAP(skb_shinfo(skb1)->zccd1 == NULL); ++ skb_shinfo(skb1)->zccd1 = skb_shinfo(skb)->zccd1; ++ ++ if (skb_shinfo(skb)->nr_frags == 0) ++ skb_shinfo(skb)->zccd1 = NULL; ++ else ++ zccd_incref(skb_shinfo(skb)->zccd1); ++ } ++ ++ if (skb_shinfo(skb)->zccd2 != NULL) { ++ BUG_TRAP(skb_shinfo(skb1)->zccd2 == NULL); ++ skb_shinfo(skb1)->zccd2 = skb_shinfo(skb)->zccd2; ++ ++ if (skb_shinfo(skb)->nr_frags == 0) ++ skb_shinfo(skb)->zccd2 = NULL; ++ else ++ zccd_incref(skb_shinfo(skb)->zccd2); ++ } ++ } + } + + /** +--- linux/./net/ipv4/tcp_output.c 2006-09-21 00:13:11.000000000 +0100 ++++ ../2.6.9-41.2chaos/linux/./net/ipv4/tcp_output.c 2006-09-21 18:24:26.000000000 +0100 +@@ -562,6 +562,9 @@ static unsigned char *__pskb_trim_head(s + } + skb_shinfo(skb)->nr_frags = k; + ++ if (k == 0) /* dropped all pages */ ++ skb_complete_zccd(skb); ++ + skb->tail = skb->data; + skb->data_len -= len; + skb->len = skb->data_len; +--- linux/./net/ipv4/tcp.c 2006-10-10 01:49:23.000000000 +0100 ++++ ../2.6.9-41.2chaos/linux/./net/ipv4/tcp.c 2006-10-09 19:03:15.000000000 +0100 +@@ -628,8 +628,9 @@ static inline void tcp_push(struct sock + } + } + ++/* Extra parameter: user zero copy descriptor (or NULL if not doing that) */ + static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, +- size_t psize, int flags) ++ size_t psize, int flags, struct zccd *zccd) + { + struct tcp_opt *tp = tcp_sk(sk); + int mss_now; +@@ -676,6 +677,16 @@ new_segment: + copy = size; + + i = skb_shinfo(skb)->nr_frags; ++ ++ if (zccd != NULL && /* completion callback wanted */ ++ skb_shinfo(skb)->zccd1 != NULL && /* no room for zccd */ ++ skb_shinfo(skb)->zccd2 != NULL && ++ skb_shinfo(skb)->zccd1 != zccd && /* room needed */ ++ skb_shinfo(skb)->zccd2 != zccd) { ++ tcp_mark_push (tp, skb); ++ goto new_segment; ++ } ++ + 
can_coalesce = skb_can_coalesce(skb, i, page, offset); + if (!can_coalesce && i >= MAX_SKB_FRAGS) { + tcp_mark_push(tp, skb); +@@ -692,6 +703,18 @@ new_segment: + skb_fill_page_desc(skb, i, page, offset, copy); + } + ++ if (zccd != NULL && /* completion callback wanted */ ++ skb_shinfo(skb)->zccd1 != zccd && /* new to this skbuf */ ++ skb_shinfo(skb)->zccd2 != zccd) { ++ if (skb_shinfo(skb)->zccd1 == NULL) { ++ skb_shinfo(skb)->zccd1 = zccd; ++ } else { ++ BUG_TRAP (skb_shinfo(skb)->zccd2 == NULL); ++ skb_shinfo(skb)->zccd2 = zccd; ++ } ++ zccd_incref(zccd); /* new reference */ ++ } ++ + skb->len += copy; + skb->data_len += copy; + skb->truesize += copy; +@@ -744,8 +767,8 @@ out_err: + return sk_stream_error(sk, flags, err); + } + +-ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, +- size_t size, int flags) ++ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, ++ size_t size, int flags, struct zccd *zccd) + { + ssize_t res; + struct sock *sk = sock->sk; +@@ -760,12 +783,18 @@ ssize_t tcp_sendpage(struct socket *sock + + lock_sock(sk); + TCP_CHECK_TIMER(sk); +- res = do_tcp_sendpages(sk, &page, offset, size, flags); ++ res = do_tcp_sendpages(sk, &page, offset, size, flags, zccd); + TCP_CHECK_TIMER(sk); + release_sock(sk); + return res; + } + ++ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, ++ size_t size, int flags) ++{ ++ return tcp_sendpage_zccd(sock, page, offset, size, flags, NULL); ++} ++ + #define TCP_PAGE(sk) (sk->sk_sndmsg_page) + #define TCP_OFF(sk) (sk->sk_sndmsg_off) + +@@ -2343,6 +2372,7 @@ EXPORT_SYMBOL(tcp_read_sock); + EXPORT_SYMBOL(tcp_recvmsg); + EXPORT_SYMBOL(tcp_sendmsg); + EXPORT_SYMBOL(tcp_sendpage); ++EXPORT_SYMBOL(tcp_sendpage_zccd); + EXPORT_SYMBOL(tcp_setsockopt); + EXPORT_SYMBOL(tcp_shutdown); + EXPORT_SYMBOL(tcp_statistics); diff --git a/lustre/kernel_patches/patches/quota-deadlock-on-pagelock-core.patch 
b/lustre/kernel_patches/patches/quota-deadlock-on-pagelock-core.patch new file mode 100644 index 0000000..892a61f --- /dev/null +++ b/lustre/kernel_patches/patches/quota-deadlock-on-pagelock-core.patch @@ -0,0 +1,1264 @@ + +From: Jan Kara + +The four patches in this series fix deadlocks with quotas of pagelock (the +problem was lock inversion on PageLock and transaction start - quota code +needed to first start a transaction and then write the data which subsequently +needed acquisition of PageLock while the standard ordering - PageLock first +and transaction start later - was used e.g. by pdflush). They implement a +new way of quota access to disk: Every filesystem that would like to implement +quotas now has to provide quota_read() and quota_write() functions. These +functions must obey quota lock ordering (in particular they should not take +PageLock inside a transaction). + +The first patch implements the changes in the quota core, the other three +patches implement needed functions in ext2, ext3 and reiserfs. The patch for +reiserfs also fixes several other lock inversion problems (similar as ext3 +had) and implements the journaled quota functionality (which comes almost for +free after the locking fixes...). + +The quota core patch makes quota support in other filesystems (except XFS +which implements everything on its own ;)) unfunctional (quotaon() will refuse +to turn on quotas on them). When the patches get reasonable wide testing and +it will seem that no major changes will be needed I can make fixes also for +the other filesystems (JFS, UDF, UFS). + +This patch: + +The patch implements the new way of quota io in the quota core. Every +filesystem wanting to support quotas has to provide functions quota_read() +and quota_write() obeying quota locking rules. As the writes and reads +bypass the pagecache there is some ugly stuff ensuring that userspace can +see all the data after quotaoff() (or Q_SYNC quotactl). 
In future I plan +to make quota files inaccessible from userspace (with the exception of +quotacheck(8) which will take care about the cache flushing and such stuff +itself) so that this synchronization stuff can be removed... + +The rewrite of the quota core. Quota uses the filesystem read() and write() +functions no more to avoid possible deadlocks on PageLock. From now on every +filesystem supporting quotas must provide functions quota_read() and +quota_write() which obey the quota locking rules (e.g. they cannot acquire the +PageLock). + +Signed-off-by: Jan Kara +Signed-off-by: Andrew Morton +--- + + 25-akpm/fs/dquot.c | 162 +++++++++++++-------------- + 25-akpm/fs/quota.c | 45 +++++++ + 25-akpm/fs/quota_v1.c | 62 ++-------- + 25-akpm/fs/quota_v2.c | 227 +++++++++++++++++---------------------- + 25-akpm/include/linux/fs.h | 3 + 25-akpm/include/linux/quota.h | 2 + 25-akpm/include/linux/security.h | 8 - + 25-akpm/security/dummy.c | 2 + 25-akpm/security/selinux/hooks.c | 4 + 9 files changed, 247 insertions(+), 268 deletions(-) + +diff -puN fs/dquot.c~fix-of-quota-deadlock-on-pagelock-quota-core fs/dquot.c +--- 25/fs/dquot.c~fix-of-quota-deadlock-on-pagelock-quota-core 2004-12-03 20:56:04.293107536 -0800 ++++ 25-akpm/fs/dquot.c 2004-12-03 20:56:04.312104648 -0800 +@@ -49,7 +49,7 @@ + * New SMP locking. + * Jan Kara, , 10/2002 + * +- * Added journalled quota support ++ * Added journalled quota support, fix lock inversion problems + * Jan Kara, , 2003,2004 + * + * (C) Copyright 1994 - 1997 Marco van Wieringen +@@ -75,7 +75,8 @@ + #include + #include + #include +-#include ++#include ++#include + + #include + +@@ -114,7 +115,7 @@ + * operations on dquots don't hold dq_lock as they copy data under dq_data_lock + * spinlock to internal buffers before writing. 
+ * +- * Lock ordering (including related VFS locks) is following: ++ * Lock ordering (including related VFS locks) is the following: + * i_sem > dqonoff_sem > iprune_sem > journal_lock > dqptr_sem > + * > dquot->dq_lock > dqio_sem + * i_sem on quota files is special (it's below dqio_sem) +@@ -183,8 +184,7 @@ static void put_quota_format(struct quot + * on all three lists, depending on its current state. + * + * All dquots are placed to the end of inuse_list when first created, and this +- * list is used for the sync and invalidate operations, which must look +- * at every dquot. ++ * list is used for invalidate operation, which must look at every dquot. + * + * Unused dquots (dq_count == 0) are added to the free_dquots list when freed, + * and this list is searched whenever we need an available dquot. Dquots are +@@ -1314,10 +1314,12 @@ int vfs_quota_off(struct super_block *sb + { + int cnt; + struct quota_info *dqopt = sb_dqopt(sb); ++ struct inode *toput[MAXQUOTAS]; + + /* We need to serialize quota_off() for device */ + down(&dqopt->dqonoff_sem); + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { ++ toput[cnt] = NULL; + if (type != -1 && cnt != type) + continue; + if (!sb_has_quota_enabled(sb, cnt)) +@@ -1337,7 +1339,7 @@ int vfs_quota_off(struct super_block *sb + dqopt->ops[cnt]->free_file_info(sb, cnt); + put_quota_format(dqopt->info[cnt].dqi_format); + +- fput(dqopt->files[cnt]); ++ toput[cnt] = dqopt->files[cnt]; + dqopt->files[cnt] = NULL; + dqopt->info[cnt].dqi_flags = 0; + dqopt->info[cnt].dqi_igrace = 0; +@@ -1345,6 +1347,26 @@ int vfs_quota_off(struct super_block *sb + dqopt->ops[cnt] = NULL; + } + up(&dqopt->dqonoff_sem); ++ /* Sync the superblock so that buffers with quota data are written to ++ * disk (and so userspace sees correct data afterwards) */ ++ if (sb->s_op->sync_fs) ++ sb->s_op->sync_fs(sb, 1); ++ sync_blockdev(sb->s_bdev); ++ /* Now the quota files are just ordinary files and we can set the ++ * inode flags back. 
Moreover we discard the pagecache so that ++ * userspace sees the writes we did bypassing the pagecache. We ++ * must also discard the blockdev buffers so that we see the ++ * changes done by userspace on the next quotaon() */ ++ for (cnt = 0; cnt < MAXQUOTAS; cnt++) ++ if (toput[cnt]) { ++ down(&toput[cnt]->i_sem); ++ toput[cnt]->i_flags &= ~(S_IMMUTABLE | S_NOATIME | S_NOQUOTA); ++ truncate_inode_pages(&toput[cnt]->i_data, 0); ++ up(&toput[cnt]->i_sem); ++ mark_inode_dirty(toput[cnt]); ++ iput(toput[cnt]); ++ } ++ invalidate_bdev(sb->s_bdev, 0); + return 0; + } + +@@ -1352,68 +1374,56 @@ int vfs_quota_off(struct super_block *sb + * Turn quotas on on a device + */ + +-/* Helper function when we already have file open */ +-static int vfs_quota_on_file(struct file *f, int type, int format_id) ++/* Helper function when we already have the inode */ ++static int vfs_quota_on_inode(struct inode *inode, int type, int format_id) + { + struct quota_format_type *fmt = find_quota_format(format_id); +- struct inode *inode; +- struct super_block *sb = f->f_dentry->d_sb; ++ struct super_block *sb = inode->i_sb; + struct quota_info *dqopt = sb_dqopt(sb); +- struct dquot *to_drop[MAXQUOTAS]; +- int error, cnt; +- unsigned int oldflags = -1; ++ int error; ++ int oldflags = -1; + + if (!fmt) + return -ESRCH; +- error = -EIO; +- if (!f->f_op || !f->f_op->read || !f->f_op->write) ++ if (!S_ISREG(inode->i_mode)) { ++ error = -EACCES; + goto out_fmt; +- inode = f->f_dentry->d_inode; +- error = -EACCES; +- if (!S_ISREG(inode->i_mode)) ++ } ++ if (IS_RDONLY(inode)) { ++ error = -EROFS; ++ goto out_fmt; ++ } ++ if (!sb->s_op->quota_write || !sb->s_op->quota_read) { ++ error = -EINVAL; + goto out_fmt; ++ } + ++ /* As we bypass the pagecache we must now flush the inode so that ++ * we see all the changes from userspace... 
*/ ++ write_inode_now(inode, 1); ++ /* And now flush the block cache so that kernel sees the changes */ ++ invalidate_bdev(sb->s_bdev, 0); + down(&inode->i_sem); + down(&dqopt->dqonoff_sem); + if (sb_has_quota_enabled(sb, type)) { +- up(&inode->i_sem); + error = -EBUSY; + goto out_lock; + } + /* We don't want quota and atime on quota files (deadlocks possible) +- * We also need to set GFP mask differently because we cannot recurse +- * into filesystem when allocating page for quota inode */ ++ * Also nobody should write to the file - we use special IO operations ++ * which ignore the immutable bit. */ + down_write(&dqopt->dqptr_sem); +- oldflags = inode->i_flags & (S_NOATIME | S_NOQUOTA); +- inode->i_flags |= S_NOQUOTA | S_NOATIME; ++ oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA); ++ inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE; + up_write(&dqopt->dqptr_sem); +- up(&inode->i_sem); + +- dqopt->files[type] = f; ++ error = -EIO; ++ dqopt->files[type] = igrab(inode); ++ if (!dqopt->files[type]) ++ goto out_lock; + error = -EINVAL; + if (!fmt->qf_ops->check_quota_file(sb, type)) + goto out_file_init; +- /* +- * We write to quota files deep within filesystem code. We don't want +- * the VFS to reenter filesystem code when it tries to allocate a +- * pagecache page for the quota file write. So clear __GFP_FS in +- * the quota file's allocation flags. 
+- */ +- mapping_set_gfp_mask(inode->i_mapping, +- mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); +- +- down_write(&dqopt->dqptr_sem); +- for (cnt = 0; cnt < MAXQUOTAS; cnt++) { +- to_drop[cnt] = inode->i_dquot[cnt]; +- inode->i_dquot[cnt] = NODQUOT; +- } +- up_write(&dqopt->dqptr_sem); +- /* We must put dquots outside of dqptr_sem because we may need to +- * start transaction for dquot_release() */ +- for (cnt = 0; cnt < MAXQUOTAS; cnt++) { +- if (to_drop[cnt]) +- dqput(to_drop[cnt]); +- } + + dqopt->ops[type] = fmt->qf_ops; + dqopt->info[type].dqi_format = fmt; +@@ -1424,6 +1434,7 @@ static int vfs_quota_on_file(struct file + goto out_file_init; + } + up(&dqopt->dqio_sem); ++ up(&inode->i_sem); + set_enable_flags(dqopt, type); + + add_dquot_ref(sb, type); +@@ -1433,19 +1444,18 @@ static int vfs_quota_on_file(struct file + + out_file_init: + dqopt->files[type] = NULL; ++ iput(inode); + out_lock: + up(&dqopt->dqonoff_sem); + if (oldflags != -1) { +- down(&inode->i_sem); + down_write(&dqopt->dqptr_sem); +- /* Reset the NOATIME flag back. 
I know it could change in the +- * mean time but playing with NOATIME flags on a quota file is +- * never a good idea */ +- inode->i_flags &= ~(S_NOATIME | S_NOQUOTA); ++ /* Set the flags back (in the case of accidental quotaon() ++ * on a wrong file we don't want to mess up the flags) */ ++ inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE); + inode->i_flags |= oldflags; + up_write(&dqopt->dqptr_sem); +- up(&inode->i_sem); + } ++ up(&inode->i_sem); + out_fmt: + put_quota_format(fmt); + +@@ -1455,47 +1465,37 @@ out_fmt: + /* Actual function called from quotactl() */ + int vfs_quota_on(struct super_block *sb, int type, int format_id, char *path) + { +- struct file *f; ++ struct nameidata nd; + int error; + +- f = filp_open(path, O_RDWR, 0600); +- if (IS_ERR(f)) +- return PTR_ERR(f); +- error = security_quota_on(f); ++ error = path_lookup(path, LOOKUP_FOLLOW, &nd); ++ if (error < 0) ++ return error; ++ error = security_quota_on(nd.dentry); + if (error) +- goto out_f; +- error = vfs_quota_on_file(f, type, format_id); +- if (!error) +- return 0; +-out_f: +- filp_close(f, NULL); ++ goto out_path; ++ /* Quota file not on the same filesystem? */ ++ if (nd.mnt->mnt_sb != sb) ++ error = -EXDEV; ++ else ++ error = vfs_quota_on_inode(nd.dentry->d_inode, type, format_id); ++out_path: ++ path_release(&nd); + return error; + } + + /* +- * Function used by filesystems when filp_open() would fail (filesystem is +- * being mounted now). We will use a private file structure. Caller is +- * responsible that it's IO functions won't need vfsmnt structure or +- * some dentry tricks... ++ * This function is used when filesystem needs to initialize quotas ++ * during mount time. 
+ */ + int vfs_quota_on_mount(int type, int format_id, struct dentry *dentry) + { +- struct file *f; + int error; + +- dget(dentry); /* Get a reference for struct file */ +- f = dentry_open(dentry, NULL, O_RDWR); +- if (IS_ERR(f)) { +- error = PTR_ERR(f); +- goto out_dentry; +- } +- error = vfs_quota_on_file(f, type, format_id); +- if (!error) +- return 0; +- fput(f); +-out_dentry: +- dput(dentry); +- return error; ++ error = security_quota_on(dentry); ++ if (error) ++ return error; ++ return vfs_quota_on_inode(dentry->d_inode, type, format_id); + } + + /* Generic routine for getting common part of quota structure */ +diff -puN fs/quota.c~fix-of-quota-deadlock-on-pagelock-quota-core fs/quota.c +--- 25/fs/quota.c~fix-of-quota-deadlock-on-pagelock-quota-core 2004-12-03 20:56:04.295107232 -0800 ++++ 25-akpm/fs/quota.c 2004-12-03 20:56:04.313104496 -0800 +@@ -13,6 +13,8 @@ + #include + #include + #include ++#include ++#include + + /* Check validity of quotactl */ + static int check_quotactl_valid(struct super_block *sb, int type, int cmd, qid_t id) +@@ -135,16 +136,54 @@ restart: + return NULL; + } + ++void quota_sync_sb(struct super_block *sb, int type) ++{ ++ int cnt; ++ struct inode *discard[MAXQUOTAS]; ++ ++ sb->s_qcop->quota_sync(sb, type); ++ /* This is not very clever (and fast) but currently I don't know about ++ * any other simple way of getting quota data to disk and we must get ++ * them there for userspace to be visible... */ ++ if (sb->s_op->sync_fs) ++ sb->s_op->sync_fs(sb, 1); ++ sync_blockdev(sb->s_bdev); ++ ++ /* Now when everything is written we can discard the pagecache so ++ * that userspace sees the changes. We need i_sem and so we could ++ * not do it inside dqonoff_sem. Moreover we need to be carefull ++ * about races with quotaoff() (that is the reason why we have own ++ * reference to inode). 
*/ ++ down(&sb_dqopt(sb)->dqonoff_sem); ++ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { ++ discard[cnt] = NULL; ++ if (type != -1 && cnt != type) ++ continue; ++ if (!sb_has_quota_enabled(sb, cnt)) ++ continue; ++ discard[cnt] = igrab(sb_dqopt(sb)->files[cnt]); ++ } ++ up(&sb_dqopt(sb)->dqonoff_sem); ++ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { ++ if (discard[cnt]) { ++ down(&discard[cnt]->i_sem); ++ truncate_inode_pages(&discard[cnt]->i_data, 0); ++ up(&discard[cnt]->i_sem); ++ iput(discard[cnt]); ++ } ++ } ++} ++ + void sync_dquots(struct super_block *sb, int type) + { + if (sb) { + if (sb->s_qcop->quota_sync) +- sb->s_qcop->quota_sync(sb, type); ++ quota_sync_sb(sb, type); + } + else { +- while ((sb = get_super_to_sync(type)) != 0) { ++ while ((sb = get_super_to_sync(type)) != NULL) { + if (sb->s_qcop->quota_sync) +- sb->s_qcop->quota_sync(sb, type); ++ quota_sync_sb(sb, type); + drop_super(sb); + } + } +diff -puN fs/quota_v1.c~fix-of-quota-deadlock-on-pagelock-quota-core fs/quota_v1.c +--- 25/fs/quota_v1.c~fix-of-quota-deadlock-on-pagelock-quota-core 2004-12-03 20:56:04.296107080 -0800 ++++ 25-akpm/fs/quota_v1.c 2004-12-03 20:56:04.314104344 -0800 +@@ -7,7 +7,6 @@ + #include + #include + +-#include + #include + + MODULE_AUTHOR("Jan Kara"); +@@ -41,23 +40,14 @@ static void v1_mem2disk_dqblk(struct v1_ + static int v1_read_dqblk(struct dquot *dquot) + { + int type = dquot->dq_type; +- struct file *filp; +- mm_segment_t fs; +- loff_t offset; + struct v1_disk_dqblk dqblk; + +- filp = sb_dqopt(dquot->dq_sb)->files[type]; +- if (filp == (struct file *)NULL) ++ if (!sb_dqopt(dquot->dq_sb)->files[type]) + return -EINVAL; + +- /* Now we are sure filp is valid */ +- offset = v1_dqoff(dquot->dq_id); + /* Set structure to 0s in case read fails/is after end of file */ + memset(&dqblk, 0, sizeof(struct v1_disk_dqblk)); +- fs = get_fs(); +- set_fs(KERNEL_DS); +- filp->f_op->read(filp, (char *)&dqblk, sizeof(struct v1_disk_dqblk), &offset); +- set_fs(fs); ++ 
dquot->dq_sb->s_op->quota_read(dquot->dq_sb, type, (char *)&dqblk, sizeof(struct v1_disk_dqblk), v1_dqoff(dquot->dq_id)); + + v1_disk2mem_dqblk(&dquot->dq_dqb, &dqblk); + if (dquot->dq_dqb.dqb_bhardlimit == 0 && dquot->dq_dqb.dqb_bsoftlimit == 0 && +@@ -71,26 +61,18 @@ static int v1_read_dqblk(struct dquot *d + static int v1_commit_dqblk(struct dquot *dquot) + { + short type = dquot->dq_type; +- struct file *filp; +- mm_segment_t fs; +- loff_t offset; + ssize_t ret; + struct v1_disk_dqblk dqblk; + +- filp = sb_dqopt(dquot->dq_sb)->files[type]; +- offset = v1_dqoff(dquot->dq_id); +- fs = get_fs(); +- set_fs(KERNEL_DS); +- + v1_mem2disk_dqblk(&dqblk, &dquot->dq_dqb); + if (dquot->dq_id == 0) { + dqblk.dqb_btime = sb_dqopt(dquot->dq_sb)->info[type].dqi_bgrace; + dqblk.dqb_itime = sb_dqopt(dquot->dq_sb)->info[type].dqi_igrace; + } + ret = 0; +- if (filp) +- ret = filp->f_op->write(filp, (char *)&dqblk, +- sizeof(struct v1_disk_dqblk), &offset); ++ if (sb_dqopt(dquot->dq_sb)->files[type]) ++ ret = dquot->dq_sb->s_op->quota_write(dquot->dq_sb, type, (char *)&dqblk, ++ sizeof(struct v1_disk_dqblk), v1_dqoff(dquot->dq_id)); + if (ret != sizeof(struct v1_disk_dqblk)) { + printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", + dquot->dq_sb->s_id); +@@ -101,7 +83,6 @@ static int v1_commit_dqblk(struct dquot + ret = 0; + + out: +- set_fs(fs); + dqstats.writes++; + + return ret; +@@ -121,14 +102,11 @@ struct v2_disk_dqheader { + + static int v1_check_quota_file(struct super_block *sb, int type) + { +- struct file *f = sb_dqopt(sb)->files[type]; +- struct inode *inode = f->f_dentry->d_inode; ++ struct inode *inode = sb_dqopt(sb)->files[type]; + ulong blocks; + size_t off; + struct v2_disk_dqheader dqhead; +- mm_segment_t fs; + ssize_t size; +- loff_t offset = 0; + loff_t isize; + static const uint quota_magics[] = V2_INITQMAGICS; + +@@ -140,10 +118,7 @@ static int v1_check_quota_file(struct su + if ((blocks % sizeof(struct v1_disk_dqblk) * BLOCK_SIZE + off) % sizeof(struct 
v1_disk_dqblk)) + return 0; + /* Doublecheck whether we didn't get file with new format - with old quotactl() this could happen */ +- fs = get_fs(); +- set_fs(KERNEL_DS); +- size = f->f_op->read(f, (char *)&dqhead, sizeof(struct v2_disk_dqheader), &offset); +- set_fs(fs); ++ size = sb->s_op->quota_read(sb, type, (char *)&dqhead, sizeof(struct v2_disk_dqheader), 0); + if (size != sizeof(struct v2_disk_dqheader)) + return 1; /* Probably not new format */ + if (le32_to_cpu(dqhead.dqh_magic) != quota_magics[type]) +@@ -155,16 +130,10 @@ static int v1_check_quota_file(struct su + static int v1_read_file_info(struct super_block *sb, int type) + { + struct quota_info *dqopt = sb_dqopt(sb); +- mm_segment_t fs; +- loff_t offset; +- struct file *filp = dqopt->files[type]; + struct v1_disk_dqblk dqblk; + int ret; + +- offset = v1_dqoff(0); +- fs = get_fs(); +- set_fs(KERNEL_DS); +- if ((ret = filp->f_op->read(filp, (char *)&dqblk, sizeof(struct v1_disk_dqblk), &offset)) != sizeof(struct v1_disk_dqblk)) { ++ if ((ret = sb->s_op->quota_read(sb, type, (char *)&dqblk, sizeof(struct v1_disk_dqblk), v1_dqoff(0))) != sizeof(struct v1_disk_dqblk)) { + if (ret >= 0) + ret = -EIO; + goto out; +@@ -173,38 +142,31 @@ static int v1_read_file_info(struct supe + dqopt->info[type].dqi_igrace = dqblk.dqb_itime ? dqblk.dqb_itime : MAX_IQ_TIME; + dqopt->info[type].dqi_bgrace = dqblk.dqb_btime ? 
dqblk.dqb_btime : MAX_DQ_TIME; + out: +- set_fs(fs); + return ret; + } + + static int v1_write_file_info(struct super_block *sb, int type) + { + struct quota_info *dqopt = sb_dqopt(sb); +- mm_segment_t fs; +- struct file *filp = dqopt->files[type]; + struct v1_disk_dqblk dqblk; +- loff_t offset; + int ret; + + dqopt->info[type].dqi_flags &= ~DQF_INFO_DIRTY; +- offset = v1_dqoff(0); +- fs = get_fs(); +- set_fs(KERNEL_DS); +- if ((ret = filp->f_op->read(filp, (char *)&dqblk, sizeof(struct v1_disk_dqblk), &offset)) != sizeof(struct v1_disk_dqblk)) { ++ if ((ret = sb->s_op->quota_read(sb, type, (char *)&dqblk, ++ sizeof(struct v1_disk_dqblk), v1_dqoff(0))) != sizeof(struct v1_disk_dqblk)) { + if (ret >= 0) + ret = -EIO; + goto out; + } + dqblk.dqb_itime = dqopt->info[type].dqi_igrace; + dqblk.dqb_btime = dqopt->info[type].dqi_bgrace; +- offset = v1_dqoff(0); +- ret = filp->f_op->write(filp, (char *)&dqblk, sizeof(struct v1_disk_dqblk), &offset); ++ ret = sb->s_op->quota_write(sb, type, (char *)&dqblk, ++ sizeof(struct v1_disk_dqblk), v1_dqoff(0)); + if (ret == sizeof(struct v1_disk_dqblk)) + ret = 0; + else if (ret > 0) + ret = -EIO; + out: +- set_fs(fs); + return ret; + } + +diff -puN fs/quota_v2.c~fix-of-quota-deadlock-on-pagelock-quota-core fs/quota_v2.c +--- 25/fs/quota_v2.c~fix-of-quota-deadlock-on-pagelock-quota-core 2004-12-03 20:56:04.297106928 -0800 ++++ 25-akpm/fs/quota_v2.c 2004-12-03 20:56:04.318103736 -0800 +@@ -13,7 +13,6 @@ + #include + + #include +-#include + + MODULE_AUTHOR("Jan Kara"); + MODULE_DESCRIPTION("Quota format v2 support"); +@@ -30,19 +29,15 @@ typedef char *dqbuf_t; + static int v2_check_quota_file(struct super_block *sb, int type) + { + struct v2_disk_dqheader dqhead; +- struct file *f = sb_dqopt(sb)->files[type]; +- mm_segment_t fs; + ssize_t size; +- loff_t offset = 0; + static const uint quota_magics[] = V2_INITQMAGICS; + static const uint quota_versions[] = V2_INITQVERSIONS; + +- fs = get_fs(); +- set_fs(KERNEL_DS); +- size = 
f->f_op->read(f, (char *)&dqhead, sizeof(struct v2_disk_dqheader), &offset); +- set_fs(fs); +- if (size != sizeof(struct v2_disk_dqheader)) ++ size = sb->s_op->quota_read(sb, type, (char *)&dqhead, sizeof(struct v2_disk_dqheader), 0); ++ if (size != sizeof(struct v2_disk_dqheader)) { ++ printk("failed read\n"); + return 0; ++ } + if (le32_to_cpu(dqhead.dqh_magic) != quota_magics[type] || + le32_to_cpu(dqhead.dqh_version) != quota_versions[type]) + return 0; +@@ -52,20 +47,15 @@ static int v2_check_quota_file(struct su + /* Read information header from quota file */ + static int v2_read_file_info(struct super_block *sb, int type) + { +- mm_segment_t fs; + struct v2_disk_dqinfo dinfo; + struct mem_dqinfo *info = sb_dqopt(sb)->info+type; +- struct file *f = sb_dqopt(sb)->files[type]; + ssize_t size; +- loff_t offset = V2_DQINFOOFF; + +- fs = get_fs(); +- set_fs(KERNEL_DS); +- size = f->f_op->read(f, (char *)&dinfo, sizeof(struct v2_disk_dqinfo), &offset); +- set_fs(fs); ++ size = sb->s_op->quota_read(sb, type, (char *)&dinfo, ++ sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); + if (size != sizeof(struct v2_disk_dqinfo)) { + printk(KERN_WARNING "Can't read info structure on device %s.\n", +- f->f_dentry->d_sb->s_id); ++ sb->s_id); + return -1; + } + info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace); +@@ -80,12 +70,9 @@ static int v2_read_file_info(struct supe + /* Write information header to quota file */ + static int v2_write_file_info(struct super_block *sb, int type) + { +- mm_segment_t fs; + struct v2_disk_dqinfo dinfo; + struct mem_dqinfo *info = sb_dqopt(sb)->info+type; +- struct file *f = sb_dqopt(sb)->files[type]; + ssize_t size; +- loff_t offset = V2_DQINFOOFF; + + spin_lock(&dq_data_lock); + info->dqi_flags &= ~DQF_INFO_DIRTY; +@@ -96,13 +83,11 @@ static int v2_write_file_info(struct sup + dinfo.dqi_blocks = cpu_to_le32(info->u.v2_i.dqi_blocks); + dinfo.dqi_free_blk = cpu_to_le32(info->u.v2_i.dqi_free_blk); + dinfo.dqi_free_entry = 
cpu_to_le32(info->u.v2_i.dqi_free_entry); +- fs = get_fs(); +- set_fs(KERNEL_DS); +- size = f->f_op->write(f, (char *)&dinfo, sizeof(struct v2_disk_dqinfo), &offset); +- set_fs(fs); ++ size = sb->s_op->quota_write(sb, type, (char *)&dinfo, ++ sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); + if (size != sizeof(struct v2_disk_dqinfo)) { + printk(KERN_WARNING "Can't write info structure on device %s.\n", +- f->f_dentry->d_sb->s_id); ++ sb->s_id); + return -1; + } + return 0; +@@ -146,39 +131,24 @@ static inline void freedqbuf(dqbuf_t buf + kfree(buf); + } + +-static ssize_t read_blk(struct file *filp, uint blk, dqbuf_t buf) ++static inline ssize_t read_blk(struct super_block *sb, int type, uint blk, dqbuf_t buf) + { +- mm_segment_t fs; +- ssize_t ret; +- loff_t offset = blk<f_op->read(filp, (char *)buf, V2_DQBLKSIZE, &offset); +- set_fs(fs); +- return ret; ++ return sb->s_op->quota_read(sb, type, (char *)buf, ++ V2_DQBLKSIZE, blk << V2_DQBLKSIZE_BITS); + } + +-static ssize_t write_blk(struct file *filp, uint blk, dqbuf_t buf) ++static inline ssize_t write_blk(struct super_block *sb, int type, uint blk, dqbuf_t buf) + { +- mm_segment_t fs; +- ssize_t ret; +- loff_t offset = blk<f_op->write(filp, (char *)buf, V2_DQBLKSIZE, &offset); +- set_fs(fs); +- return ret; +- ++ return sb->s_op->quota_write(sb, type, (char *)buf, ++ V2_DQBLKSIZE, blk << V2_DQBLKSIZE_BITS); + } + + /* Remove empty block from list and return it */ +-static int get_free_dqblk(struct file *filp, int type) ++static int get_free_dqblk(struct super_block *sb, int type) + { + dqbuf_t buf = getdqbuf(); +- struct mem_dqinfo *info = sb_dqinfo(filp->f_dentry->d_sb, type); ++ struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf; + int ret, blk; + +@@ -186,17 +156,18 @@ static int get_free_dqblk(struct file *f + return -ENOMEM; + if (info->u.v2_i.dqi_free_blk) { + blk = info->u.v2_i.dqi_free_blk; +- if ((ret = read_blk(filp, blk, buf)) < 0) ++ if ((ret 
= read_blk(sb, type, blk, buf)) < 0) + goto out_buf; + info->u.v2_i.dqi_free_blk = le32_to_cpu(dh->dqdh_next_free); + } + else { + memset(buf, 0, V2_DQBLKSIZE); +- if ((ret = write_blk(filp, info->u.v2_i.dqi_blocks, buf)) < 0) /* Assure block allocation... */ ++ /* Assure block allocation... */ ++ if ((ret = write_blk(sb, type, info->u.v2_i.dqi_blocks, buf)) < 0) + goto out_buf; + blk = info->u.v2_i.dqi_blocks++; + } +- mark_info_dirty(filp->f_dentry->d_sb, type); ++ mark_info_dirty(sb, type); + ret = blk; + out_buf: + freedqbuf(buf); +@@ -204,9 +175,9 @@ out_buf: + } + + /* Insert empty block to the list */ +-static int put_free_dqblk(struct file *filp, int type, dqbuf_t buf, uint blk) ++static int put_free_dqblk(struct super_block *sb, int type, dqbuf_t buf, uint blk) + { +- struct mem_dqinfo *info = sb_dqinfo(filp->f_dentry->d_sb, type); ++ struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf; + int err; + +@@ -214,17 +185,18 @@ static int put_free_dqblk(struct file *f + dh->dqdh_prev_free = cpu_to_le32(0); + dh->dqdh_entries = cpu_to_le16(0); + info->u.v2_i.dqi_free_blk = blk; +- mark_info_dirty(filp->f_dentry->d_sb, type); +- if ((err = write_blk(filp, blk, buf)) < 0) /* Some strange block. We had better leave it... */ ++ mark_info_dirty(sb, type); ++ /* Some strange block. We had better leave it... 
*/ ++ if ((err = write_blk(sb, type, blk, buf)) < 0) + return err; + return 0; + } + + /* Remove given block from the list of blocks with free entries */ +-static int remove_free_dqentry(struct file *filp, int type, dqbuf_t buf, uint blk) ++static int remove_free_dqentry(struct super_block *sb, int type, dqbuf_t buf, uint blk) + { + dqbuf_t tmpbuf = getdqbuf(); +- struct mem_dqinfo *info = sb_dqinfo(filp->f_dentry->d_sb, type); ++ struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf; + uint nextblk = le32_to_cpu(dh->dqdh_next_free), prevblk = le32_to_cpu(dh->dqdh_prev_free); + int err; +@@ -232,26 +204,27 @@ static int remove_free_dqentry(struct fi + if (!tmpbuf) + return -ENOMEM; + if (nextblk) { +- if ((err = read_blk(filp, nextblk, tmpbuf)) < 0) ++ if ((err = read_blk(sb, type, nextblk, tmpbuf)) < 0) + goto out_buf; + ((struct v2_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = dh->dqdh_prev_free; +- if ((err = write_blk(filp, nextblk, tmpbuf)) < 0) ++ if ((err = write_blk(sb, type, nextblk, tmpbuf)) < 0) + goto out_buf; + } + if (prevblk) { +- if ((err = read_blk(filp, prevblk, tmpbuf)) < 0) ++ if ((err = read_blk(sb, type, prevblk, tmpbuf)) < 0) + goto out_buf; + ((struct v2_disk_dqdbheader *)tmpbuf)->dqdh_next_free = dh->dqdh_next_free; +- if ((err = write_blk(filp, prevblk, tmpbuf)) < 0) ++ if ((err = write_blk(sb, type, prevblk, tmpbuf)) < 0) + goto out_buf; + } + else { + info->u.v2_i.dqi_free_entry = nextblk; +- mark_info_dirty(filp->f_dentry->d_sb, type); ++ mark_info_dirty(sb, type); + } + freedqbuf(tmpbuf); + dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0); +- if (write_blk(filp, blk, buf) < 0) /* No matter whether write succeeds block is out of list */ ++ /* No matter whether write succeeds block is out of list */ ++ if (write_blk(sb, type, blk, buf) < 0) + printk(KERN_ERR "VFS: Can't write block (%u) with free entries.\n", blk); + return 0; + out_buf: +@@ -260,10 +233,10 @@ out_buf: + } 
+ + /* Insert given block to the beginning of list with free entries */ +-static int insert_free_dqentry(struct file *filp, int type, dqbuf_t buf, uint blk) ++static int insert_free_dqentry(struct super_block *sb, int type, dqbuf_t buf, uint blk) + { + dqbuf_t tmpbuf = getdqbuf(); +- struct mem_dqinfo *info = sb_dqinfo(filp->f_dentry->d_sb, type); ++ struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf; + int err; + +@@ -271,18 +244,18 @@ static int insert_free_dqentry(struct fi + return -ENOMEM; + dh->dqdh_next_free = cpu_to_le32(info->u.v2_i.dqi_free_entry); + dh->dqdh_prev_free = cpu_to_le32(0); +- if ((err = write_blk(filp, blk, buf)) < 0) ++ if ((err = write_blk(sb, type, blk, buf)) < 0) + goto out_buf; + if (info->u.v2_i.dqi_free_entry) { +- if ((err = read_blk(filp, info->u.v2_i.dqi_free_entry, tmpbuf)) < 0) ++ if ((err = read_blk(sb, type, info->u.v2_i.dqi_free_entry, tmpbuf)) < 0) + goto out_buf; + ((struct v2_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = cpu_to_le32(blk); +- if ((err = write_blk(filp, info->u.v2_i.dqi_free_entry, tmpbuf)) < 0) ++ if ((err = write_blk(sb, type, info->u.v2_i.dqi_free_entry, tmpbuf)) < 0) + goto out_buf; + } + freedqbuf(tmpbuf); + info->u.v2_i.dqi_free_entry = blk; +- mark_info_dirty(filp->f_dentry->d_sb, type); ++ mark_info_dirty(sb, type); + return 0; + out_buf: + freedqbuf(tmpbuf); +@@ -292,8 +265,8 @@ out_buf: + /* Find space for dquot */ + static uint find_free_dqentry(struct dquot *dquot, int *err) + { +- struct file *filp = sb_dqopt(dquot->dq_sb)->files[dquot->dq_type]; +- struct mem_dqinfo *info = sb_dqopt(dquot->dq_sb)->info+dquot->dq_type; ++ struct super_block *sb = dquot->dq_sb; ++ struct mem_dqinfo *info = sb_dqopt(sb)->info+dquot->dq_type; + uint blk, i; + struct v2_disk_dqdbheader *dh; + struct v2_disk_dqblk *ddquot; +@@ -309,22 +282,23 @@ static uint find_free_dqentry(struct dqu + ddquot = GETENTRIES(buf); + if (info->u.v2_i.dqi_free_entry) { + blk = 
info->u.v2_i.dqi_free_entry; +- if ((*err = read_blk(filp, blk, buf)) < 0) ++ if ((*err = read_blk(sb, dquot->dq_type, blk, buf)) < 0) + goto out_buf; + } + else { +- blk = get_free_dqblk(filp, dquot->dq_type); ++ blk = get_free_dqblk(sb, dquot->dq_type); + if ((int)blk < 0) { + *err = blk; + freedqbuf(buf); + return 0; + } + memset(buf, 0, V2_DQBLKSIZE); +- info->u.v2_i.dqi_free_entry = blk; /* This is enough as block is already zeroed and entry list is empty... */ +- mark_info_dirty(dquot->dq_sb, dquot->dq_type); ++ /* This is enough as block is already zeroed and entry list is empty... */ ++ info->u.v2_i.dqi_free_entry = blk; ++ mark_info_dirty(sb, dquot->dq_type); + } + if (le16_to_cpu(dh->dqdh_entries)+1 >= V2_DQSTRINBLK) /* Block will be full? */ +- if ((*err = remove_free_dqentry(filp, dquot->dq_type, buf, blk)) < 0) { ++ if ((*err = remove_free_dqentry(sb, dquot->dq_type, buf, blk)) < 0) { + printk(KERN_ERR "VFS: find_free_dqentry(): Can't remove block (%u) from entry free list.\n", blk); + goto out_buf; + } +@@ -339,7 +313,7 @@ static uint find_free_dqentry(struct dqu + goto out_buf; + } + #endif +- if ((*err = write_blk(filp, blk, buf)) < 0) { ++ if ((*err = write_blk(sb, dquot->dq_type, blk, buf)) < 0) { + printk(KERN_ERR "VFS: find_free_dqentry(): Can't write quota data block %u.\n", blk); + goto out_buf; + } +@@ -354,7 +328,7 @@ out_buf: + /* Insert reference to structure into the trie */ + static int do_insert_tree(struct dquot *dquot, uint *treeblk, int depth) + { +- struct file *filp = sb_dqopt(dquot->dq_sb)->files[dquot->dq_type]; ++ struct super_block *sb = dquot->dq_sb; + dqbuf_t buf; + int ret = 0, newson = 0, newact = 0; + __le32 *ref; +@@ -363,7 +337,7 @@ static int do_insert_tree(struct dquot * + if (!(buf = getdqbuf())) + return -ENOMEM; + if (!*treeblk) { +- ret = get_free_dqblk(filp, dquot->dq_type); ++ ret = get_free_dqblk(sb, dquot->dq_type); + if (ret < 0) + goto out_buf; + *treeblk = ret; +@@ -371,7 +345,7 @@ static int 
do_insert_tree(struct dquot * + newact = 1; + } + else { +- if ((ret = read_blk(filp, *treeblk, buf)) < 0) { ++ if ((ret = read_blk(sb, dquot->dq_type, *treeblk, buf)) < 0) { + printk(KERN_ERR "VFS: Can't read tree quota block %u.\n", *treeblk); + goto out_buf; + } +@@ -394,10 +368,10 @@ static int do_insert_tree(struct dquot * + ret = do_insert_tree(dquot, &newblk, depth+1); + if (newson && ret >= 0) { + ref[GETIDINDEX(dquot->dq_id, depth)] = cpu_to_le32(newblk); +- ret = write_blk(filp, *treeblk, buf); ++ ret = write_blk(sb, dquot->dq_type, *treeblk, buf); + } + else if (newact && ret < 0) +- put_free_dqblk(filp, dquot->dq_type, buf, *treeblk); ++ put_free_dqblk(sb, dquot->dq_type, buf, *treeblk); + out_buf: + freedqbuf(buf); + return ret; +@@ -416,20 +390,15 @@ static inline int dq_insert_tree(struct + static int v2_write_dquot(struct dquot *dquot) + { + int type = dquot->dq_type; +- struct file *filp; +- mm_segment_t fs; +- loff_t offset; + ssize_t ret; + struct v2_disk_dqblk ddquot, empty; + + /* dq_off is guarded by dqio_sem */ + if (!dquot->dq_off) + if ((ret = dq_insert_tree(dquot)) < 0) { +- printk(KERN_ERR "VFS: Error %Zd occurred while creating quota.\n", ret); ++ printk(KERN_ERR "VFS: Error %d occurred while creating quota.\n", ret); + return ret; + } +- filp = sb_dqopt(dquot->dq_sb)->files[type]; +- offset = dquot->dq_off; + spin_lock(&dq_data_lock); + mem2diskdqb(&ddquot, &dquot->dq_dqb, dquot->dq_id); + /* Argh... 
We may need to write structure full of zeroes but that would be +@@ -439,10 +408,8 @@ static int v2_write_dquot(struct dquot * + if (!memcmp(&empty, &ddquot, sizeof(struct v2_disk_dqblk))) + ddquot.dqb_itime = cpu_to_le64(1); + spin_unlock(&dq_data_lock); +- fs = get_fs(); +- set_fs(KERNEL_DS); +- ret = filp->f_op->write(filp, (char *)&ddquot, sizeof(struct v2_disk_dqblk), &offset); +- set_fs(fs); ++ ret = dquot->dq_sb->s_op->quota_write(dquot->dq_sb, type, ++ (char *)&ddquot, sizeof(struct v2_disk_dqblk), dquot->dq_off); + if (ret != sizeof(struct v2_disk_dqblk)) { + printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", dquot->dq_sb->s_id); + if (ret >= 0) +@@ -458,7 +425,8 @@ static int v2_write_dquot(struct dquot * + /* Free dquot entry in data block */ + static int free_dqentry(struct dquot *dquot, uint blk) + { +- struct file *filp = sb_dqopt(dquot->dq_sb)->files[dquot->dq_type]; ++ struct super_block *sb = dquot->dq_sb; ++ int type = dquot->dq_type; + struct v2_disk_dqdbheader *dh; + dqbuf_t buf = getdqbuf(); + int ret = 0; +@@ -466,34 +434,39 @@ static int free_dqentry(struct dquot *dq + if (!buf) + return -ENOMEM; + if (dquot->dq_off >> V2_DQBLKSIZE_BITS != blk) { +- printk(KERN_ERR "VFS: Quota structure has offset to other block (%u) than it should (%u).\n", blk, (uint)(dquot->dq_off >> V2_DQBLKSIZE_BITS)); ++ printk(KERN_ERR "VFS: Quota structure has offset to other " ++ "block (%u) than it should (%u).\n", blk, ++ (uint)(dquot->dq_off >> V2_DQBLKSIZE_BITS)); + goto out_buf; + } +- if ((ret = read_blk(filp, blk, buf)) < 0) { ++ if ((ret = read_blk(sb, type, blk, buf)) < 0) { + printk(KERN_ERR "VFS: Can't read quota data block %u\n", blk); + goto out_buf; + } + dh = (struct v2_disk_dqdbheader *)buf; + dh->dqdh_entries = cpu_to_le16(le16_to_cpu(dh->dqdh_entries)-1); + if (!le16_to_cpu(dh->dqdh_entries)) { /* Block got free? 
*/ +- if ((ret = remove_free_dqentry(filp, dquot->dq_type, buf, blk)) < 0 || +- (ret = put_free_dqblk(filp, dquot->dq_type, buf, blk)) < 0) { +- printk(KERN_ERR "VFS: Can't move quota data block (%u) to free list.\n", blk); ++ if ((ret = remove_free_dqentry(sb, type, buf, blk)) < 0 || ++ (ret = put_free_dqblk(sb, type, buf, blk)) < 0) { ++ printk(KERN_ERR "VFS: Can't move quota data block (%u) " ++ "to free list.\n", blk); + goto out_buf; + } + } + else { +- memset(buf+(dquot->dq_off & ((1 << V2_DQBLKSIZE_BITS)-1)), 0, sizeof(struct v2_disk_dqblk)); ++ memset(buf+(dquot->dq_off & ((1 << V2_DQBLKSIZE_BITS)-1)), 0, ++ sizeof(struct v2_disk_dqblk)); + if (le16_to_cpu(dh->dqdh_entries) == V2_DQSTRINBLK-1) { + /* Insert will write block itself */ +- if ((ret = insert_free_dqentry(filp, dquot->dq_type, buf, blk)) < 0) { ++ if ((ret = insert_free_dqentry(sb, type, buf, blk)) < 0) { + printk(KERN_ERR "VFS: Can't insert quota data block (%u) to free entry list.\n", blk); + goto out_buf; + } + } + else +- if ((ret = write_blk(filp, blk, buf)) < 0) { +- printk(KERN_ERR "VFS: Can't write quota data block %u\n", blk); ++ if ((ret = write_blk(sb, type, blk, buf)) < 0) { ++ printk(KERN_ERR "VFS: Can't write quota data " ++ "block %u\n", blk); + goto out_buf; + } + } +@@ -506,7 +479,8 @@ out_buf: + /* Remove reference to dquot from tree */ + static int remove_tree(struct dquot *dquot, uint *blk, int depth) + { +- struct file *filp = sb_dqopt(dquot->dq_sb)->files[dquot->dq_type]; ++ struct super_block *sb = dquot->dq_sb; ++ int type = dquot->dq_type; + dqbuf_t buf = getdqbuf(); + int ret = 0; + uint newblk; +@@ -514,7 +488,7 @@ static int remove_tree(struct dquot *dqu + + if (!buf) + return -ENOMEM; +- if ((ret = read_blk(filp, *blk, buf)) < 0) { ++ if ((ret = read_blk(sb, type, *blk, buf)) < 0) { + printk(KERN_ERR "VFS: Can't read quota data block %u\n", *blk); + goto out_buf; + } +@@ -530,12 +504,13 @@ static int remove_tree(struct dquot *dqu + ref[GETIDINDEX(dquot->dq_id, 
depth)] = cpu_to_le32(0); + for (i = 0; i < V2_DQBLKSIZE && !buf[i]; i++); /* Block got empty? */ + if (i == V2_DQBLKSIZE) { +- put_free_dqblk(filp, dquot->dq_type, buf, *blk); ++ put_free_dqblk(sb, type, buf, *blk); + *blk = 0; + } + else +- if ((ret = write_blk(filp, *blk, buf)) < 0) +- printk(KERN_ERR "VFS: Can't write quota tree block %u.\n", *blk); ++ if ((ret = write_blk(sb, type, *blk, buf)) < 0) ++ printk(KERN_ERR "VFS: Can't write quota tree " ++ "block %u.\n", *blk); + } + out_buf: + freedqbuf(buf); +@@ -555,7 +530,6 @@ static int v2_delete_dquot(struct dquot + /* Find entry in block */ + static loff_t find_block_dqentry(struct dquot *dquot, uint blk) + { +- struct file *filp = sb_dqopt(dquot->dq_sb)->files[dquot->dq_type]; + dqbuf_t buf = getdqbuf(); + loff_t ret = 0; + int i; +@@ -563,27 +537,31 @@ static loff_t find_block_dqentry(struct + + if (!buf) + return -ENOMEM; +- if ((ret = read_blk(filp, blk, buf)) < 0) { ++ if ((ret = read_blk(dquot->dq_sb, dquot->dq_type, blk, buf)) < 0) { + printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); + goto out_buf; + } + if (dquot->dq_id) +- for (i = 0; i < V2_DQSTRINBLK && le32_to_cpu(ddquot[i].dqb_id) != dquot->dq_id; i++); ++ for (i = 0; i < V2_DQSTRINBLK && ++ le32_to_cpu(ddquot[i].dqb_id) != dquot->dq_id; i++); + else { /* ID 0 as a bit more complicated searching... 
*/ + struct v2_disk_dqblk fakedquot; + + memset(&fakedquot, 0, sizeof(struct v2_disk_dqblk)); + for (i = 0; i < V2_DQSTRINBLK; i++) +- if (!le32_to_cpu(ddquot[i].dqb_id) && memcmp(&fakedquot, ddquot+i, sizeof(struct v2_disk_dqblk))) ++ if (!le32_to_cpu(ddquot[i].dqb_id) && ++ memcmp(&fakedquot, ddquot+i, sizeof(struct v2_disk_dqblk))) + break; + } + if (i == V2_DQSTRINBLK) { +- printk(KERN_ERR "VFS: Quota for id %u referenced but not present.\n", dquot->dq_id); ++ printk(KERN_ERR "VFS: Quota for id %u referenced " ++ "but not present.\n", dquot->dq_id); + ret = -EIO; + goto out_buf; + } + else +- ret = (blk << V2_DQBLKSIZE_BITS) + sizeof(struct v2_disk_dqdbheader) + i * sizeof(struct v2_disk_dqblk); ++ ret = (blk << V2_DQBLKSIZE_BITS) + sizeof(struct ++ v2_disk_dqdbheader) + i * sizeof(struct v2_disk_dqblk); + out_buf: + freedqbuf(buf); + return ret; +@@ -592,14 +570,13 @@ out_buf: + /* Find entry for given id in the tree */ + static loff_t find_tree_dqentry(struct dquot *dquot, uint blk, int depth) + { +- struct file *filp = sb_dqopt(dquot->dq_sb)->files[dquot->dq_type]; + dqbuf_t buf = getdqbuf(); + loff_t ret = 0; + __le32 *ref = (__le32 *)buf; + + if (!buf) + return -ENOMEM; +- if ((ret = read_blk(filp, blk, buf)) < 0) { ++ if ((ret = read_blk(dquot->dq_sb, dquot->dq_type, blk, buf)) < 0) { + printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); + goto out_buf; + } +@@ -625,16 +602,13 @@ static inline loff_t find_dqentry(struct + static int v2_read_dquot(struct dquot *dquot) + { + int type = dquot->dq_type; +- struct file *filp; +- mm_segment_t fs; + loff_t offset; + struct v2_disk_dqblk ddquot, empty; + int ret = 0; + +- filp = sb_dqopt(dquot->dq_sb)->files[type]; +- + #ifdef __QUOTA_V2_PARANOIA +- if (!filp || !dquot->dq_sb) { /* Invalidated quota? */ ++ /* Invalidated quota? 
*/ ++ if (!dquot->dq_sb || !sb_dqopt(dquot->dq_sb)->files[type]) { + printk(KERN_ERR "VFS: Quota invalidated while reading!\n"); + return -EIO; + } +@@ -642,7 +616,8 @@ static int v2_read_dquot(struct dquot *d + offset = find_dqentry(dquot); + if (offset <= 0) { /* Entry not present? */ + if (offset < 0) +- printk(KERN_ERR "VFS: Can't read quota structure for id %u.\n", dquot->dq_id); ++ printk(KERN_ERR "VFS: Can't read quota " ++ "structure for id %u.\n", dquot->dq_id); + dquot->dq_off = 0; + set_bit(DQ_FAKE_B, &dquot->dq_flags); + memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk)); +@@ -650,12 +625,13 @@ static int v2_read_dquot(struct dquot *d + } + else { + dquot->dq_off = offset; +- fs = get_fs(); +- set_fs(KERNEL_DS); +- if ((ret = filp->f_op->read(filp, (char *)&ddquot, sizeof(struct v2_disk_dqblk), &offset)) != sizeof(struct v2_disk_dqblk)) { ++ if ((ret = dquot->dq_sb->s_op->quota_read(dquot->dq_sb, type, ++ (char *)&ddquot, sizeof(struct v2_disk_dqblk), offset)) ++ != sizeof(struct v2_disk_dqblk)) { + if (ret >= 0) + ret = -EIO; +- printk(KERN_ERR "VFS: Error while reading quota structure for id %u.\n", dquot->dq_id); ++ printk(KERN_ERR "VFS: Error while reading quota " ++ "structure for id %u.\n", dquot->dq_id); + memset(&ddquot, 0, sizeof(struct v2_disk_dqblk)); + } + else { +@@ -666,7 +642,6 @@ static int v2_read_dquot(struct dquot *d + if (!memcmp(&empty, &ddquot, sizeof(struct v2_disk_dqblk))) + ddquot.dqb_itime = 0; + } +- set_fs(fs); + disk2memdqb(&dquot->dq_dqb, &ddquot); + if (!dquot->dq_dqb.dqb_bhardlimit && + !dquot->dq_dqb.dqb_bsoftlimit && +diff -puN include/linux/fs.h~fix-of-quota-deadlock-on-pagelock-quota-core include/linux/fs.h +--- 25/include/linux/fs.h~fix-of-quota-deadlock-on-pagelock-quota-core 2004-12-03 20:56:04.300106472 -0800 ++++ 25-akpm/include/linux/fs.h 2004-12-03 20:56:04.319103584 -0800 +@@ -1004,6 +1004,9 @@ struct super_operations { + void (*umount_begin) (struct super_block *); + + int (*show_options)(struct seq_file *, 
struct vfsmount *); ++ ++ ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); ++ ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); + }; + + /* Inode state bits. Protected by inode_lock. */ + +diff -puN include/linux/quota.h~fix-of-quota-deadlock-on-pagelock-quota-core include/linux/quota.h +--- 25/include/linux/quota.h~fix-of-quota-deadlock-on-pagelock-quota-core 2004-12-03 20:56:04.301106320 -0800 ++++ 25-akpm/include/linux/quota.h 2004-12-03 20:56:04.320103432 -0800 +@@ -285,7 +285,7 @@ struct quota_info { + struct semaphore dqio_sem; /* lock device while I/O in progress */ + struct semaphore dqonoff_sem; /* Serialize quotaon & quotaoff */ + struct rw_semaphore dqptr_sem; /* serialize ops using quota_info struct, pointers from inode to dquots */ +- struct file *files[MAXQUOTAS]; /* fp's to quotafiles */ ++ struct inode *files[MAXQUOTAS]; /* inodes of quotafiles */ + struct mem_dqinfo info[MAXQUOTAS]; /* Information for each quota type */ + struct quota_format_ops *ops[MAXQUOTAS]; /* Operations for each type */ + }; +diff -puN include/linux/security.h~fix-of-quota-deadlock-on-pagelock-quota-core include/linux/security.h +--- 25/include/linux/security.h~fix-of-quota-deadlock-on-pagelock-quota-core 2004-12-03 20:56:04.303106016 -0800 ++++ 25-akpm/include/linux/security.h 2004-12-03 20:56:04.322103128 -0800 +@@ -1033,7 +1033,7 @@ struct security_operations { + int (*sysctl) (ctl_table * table, int op); + int (*capable) (struct task_struct * tsk, int cap); + int (*quotactl) (int cmds, int type, int id, struct super_block * sb); +- int (*quota_on) (struct file * f); ++ int (*quota_on) (struct dentry * dentry); + int (*syslog) (int type); + int (*vm_enough_memory) (long pages); + +@@ -1281,9 +1281,9 @@ static inline int security_quotactl (int + return security_ops->quotactl (cmds, type, id, sb); + } + +-static inline int security_quota_on (struct file * file) ++static inline int security_quota_on (struct dentry * 
dentry) + { +- return security_ops->quota_on (file); ++ return security_ops->quota_on (dentry); + } + + static inline int security_syslog(int type) +@@ -1959,7 +1959,7 @@ static inline int security_quotactl (int + return 0; + } + +-static inline int security_quota_on (struct file * file) ++static inline int security_quota_on (struct dentry * dentry) + { + return 0; + } +diff -puN security/dummy.c~fix-of-quota-deadlock-on-pagelock-quota-core security/dummy.c +--- 25/security/dummy.c~fix-of-quota-deadlock-on-pagelock-quota-core 2004-12-03 20:56:04.304105864 -0800 ++++ 25-akpm/security/dummy.c 2004-12-03 20:56:04.323102976 -0800 +@@ -92,7 +92,7 @@ static int dummy_quotactl (int cmds, int + return 0; + } + +-static int dummy_quota_on (struct file *f) ++static int dummy_quota_on (struct dentry *dentry) + { + return 0; + } +diff -puN security/selinux/hooks.c~fix-of-quota-deadlock-on-pagelock-quota-core security/selinux/hooks.c +--- 25/security/selinux/hooks.c~fix-of-quota-deadlock-on-pagelock-quota-core 2004-12-03 20:56:04.306105560 -0800 ++++ 25-akpm/security/selinux/hooks.c 2004-12-03 20:56:04.326102520 -0800 +@@ -1494,9 +1494,9 @@ static int selinux_quotactl(int cmds, in + return rc; + } + +-static int selinux_quota_on(struct file *f) ++static int selinux_quota_on(struct dentry *dentry) + { +- return file_has_perm(current, f, FILE__QUOTAON); ++ return dentry_has_perm(current, NULL, dentry, FILE__QUOTAON); + } + + static int selinux_syslog(int type) +_ diff --git a/lustre/kernel_patches/patches/quota-deadlock-on-pagelock-ext3.patch b/lustre/kernel_patches/patches/quota-deadlock-on-pagelock-ext3.patch new file mode 100644 index 0000000..bcfa38a --- /dev/null +++ b/lustre/kernel_patches/patches/quota-deadlock-on-pagelock-ext3.patch @@ -0,0 +1,273 @@ +Index: linux-2.6.9/fs/ext3/inode.c +=================================================================== +--- linux-2.6.9.orig/fs/ext3/inode.c 2006-08-25 16:39:10.000000000 +0800 ++++ linux-2.6.9/fs/ext3/inode.c 2006-09-14 
11:44:29.000000000 +0800 +@@ -1028,7 +1028,7 @@ + return ret; + } + +-static int ++int + ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh) + { + int err = journal_dirty_data(handle, bh); +Index: linux-2.6.9/fs/ext3/super.c +=================================================================== +--- linux-2.6.9.orig/fs/ext3/super.c 2006-08-25 16:39:48.000000000 +0800 ++++ linux-2.6.9/fs/ext3/super.c 2006-09-14 11:51:48.000000000 +0800 +@@ -529,7 +529,10 @@ + static int ext3_write_info(struct super_block *sb, int type); + static int ext3_quota_on(struct super_block *sb, int type, int format_id, char *path); + static int ext3_quota_on_mount(struct super_block *sb, int type); +-static int ext3_quota_off_mount(struct super_block *sb, int type); ++static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data, ++ size_t len, loff_t off); ++static ssize_t ext3_quota_write(struct super_block *sb, int type, ++ const char *data, size_t len, loff_t off); + + static struct dquot_operations ext3_quota_operations = { + .initialize = ext3_dquot_initialize, +@@ -572,6 +575,10 @@ + .statfs = ext3_statfs, + .remount_fs = ext3_remount, + .clear_inode = ext3_clear_inode, ++#ifdef CONFIG_QUOTA ++ .quota_read = ext3_quota_read, ++ .quota_write = ext3_quota_write, ++#endif + }; + + static struct dentry *ext3_get_dentry(struct super_block *sb, void *vobjp) +@@ -713,6 +720,7 @@ + int option; + #ifdef CONFIG_QUOTA + int qtype; ++ char *qname; + #endif + + if (!options) +@@ -891,19 +899,22 @@ + "quota options when quota turned on.\n"); + return 0; + } +- if (sbi->s_qf_names[qtype]) { ++ qname = match_strdup(&args[0]); ++ if (!qname) { + printk(KERN_ERR +- "EXT3-fs: %s quota file already " +- "specified.\n", QTYPE2NAME(qtype)); ++ "EXT3-fs: not enough memory for " ++ "storing quotafile name.\n"); + return 0; + } +- sbi->s_qf_names[qtype] = match_strdup(&args[0]); +- if (!sbi->s_qf_names[qtype]) { ++ if (sbi->s_qf_names[qtype] && ++ strcmp(sbi->s_qf_names[qtype], 
qname)) { + printk(KERN_ERR +- "EXT3-fs: not enough memory for " +- "storing quotafile name.\n"); ++ "EXT3-fs: %s quota file already " ++ "specified.\n", QTYPE2NAME(qtype)); ++ kfree(qname); + return 0; + } ++ sbi->s_qf_names[qtype] = qname; + if (strchr(sbi->s_qf_names[qtype], '/')) { + printk(KERN_ERR + "EXT3-fs: quotafile must be on " +@@ -1223,7 +1234,7 @@ + /* Turn quotas off */ + for (i = 0; i < MAXQUOTAS; i++) { + if (sb_dqopt(sb)->files[i]) +- ext3_quota_off_mount(sb, i); ++ vfs_quota_off(sb, i); + } + #endif + sb->s_flags = s_flags; /* Restore MS_RDONLY status */ +@@ -2240,7 +2251,7 @@ + + static inline struct inode *dquot_to_inode(struct dquot *dquot) + { +- return sb_dqopt(dquot->dq_sb)->files[dquot->dq_type]->f_dentry->d_inode; ++ return sb_dqopt(dquot->dq_sb)->files[dquot->dq_type]; + } + + static int ext3_dquot_initialize(struct inode *inode, int type) +@@ -2279,8 +2290,10 @@ + { + int ret, err; + handle_t *handle; ++ struct inode *inode; + +- handle = ext3_journal_start(dquot_to_inode(dquot), ++ inode = dquot_to_inode(dquot); ++ handle = ext3_journal_start(inode, + EXT3_QUOTA_TRANS_BLOCKS); + if (IS_ERR(handle)) + return PTR_ERR(handle); +@@ -2367,22 +2380,9 @@ + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + err = vfs_quota_on_mount(type, EXT3_SB(sb)->s_jquota_fmt, dentry); +- if (err) +- dput(dentry); +- /* We keep the dentry reference if everything went ok - we drop it +- * on quota_off time */ +- return err; +-} +- +-/* Turn quotas off during mount time */ +-static int ext3_quota_off_mount(struct super_block *sb, int type) +-{ +- int err; +- struct dentry *dentry; +- +- dentry = sb_dqopt(sb)->files[type]->f_dentry; +- err = vfs_quota_off_mount(sb, type); +- /* We invalidate dentry - it has at least wrong hash... 
*/ ++ /* Now invalidate and put the dentry - quota got its own reference ++ * to inode and dentry has at least wrong hash so we had better ++ * throw it away */ + d_invalidate(dentry); + dput(dentry); + return err; +@@ -2405,20 +2405,121 @@ + if (err) + return err; + /* Quotafile not on the same filesystem? */ +- if (nd.mnt->mnt_sb != sb) ++ if (nd.mnt->mnt_sb != sb) { ++ path_release(&nd); + return -EXDEV; ++ } + /* Quotafile not of fs root? */ + if (nd.dentry->d_parent->d_inode != sb->s_root->d_inode) + printk(KERN_WARNING + "EXT3-fs: Quota file not on filesystem root. " + "Journalled quota will not work.\n"); +- if (!ext3_should_journal_data(nd.dentry->d_inode)) +- printk(KERN_WARNING "EXT3-fs: Quota file does not have " +- "data-journalling. Journalled quota will not work.\n"); + path_release(&nd); + return vfs_quota_on(sb, type, format_id, path); + } + ++/* Read data from quotafile - avoid pagecache and such because we cannot afford ++ * acquiring the locks... As quota files are never truncated and quota code ++ * itself serializes the operations (and noone else should touch the files) ++ * we don't have to be afraid of races */ ++static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data, ++ size_t len, loff_t off) ++{ ++ struct inode *inode = sb_dqopt(sb)->files[type]; ++ sector_t blk = off >> EXT3_BLOCK_SIZE_BITS(sb); ++ int err = 0; ++ int offset = off & (sb->s_blocksize - 1); ++ int tocopy; ++ size_t toread; ++ struct buffer_head *bh; ++ loff_t i_size = i_size_read(inode); ++ ++ if (off > i_size) ++ return 0; ++ if (off+len > i_size) ++ len = i_size-off; ++ toread = len; ++ while (toread > 0) { ++ tocopy = sb->s_blocksize - offset < toread ? ++ sb->s_blocksize - offset : toread; ++ bh = ext3_bread(NULL, inode, blk, 0, &err); ++ if (err) ++ return err; ++ if (!bh) /* A hole? 
*/ ++ memset(data, 0, tocopy); ++ else ++ memcpy(data, bh->b_data+offset, tocopy); ++ brelse(bh); ++ offset = 0; ++ toread -= tocopy; ++ data += tocopy; ++ blk++; ++ } ++ return len; ++} ++ ++/* Write to quotafile (we know the transaction is already started and has ++ * enough credits) */ ++static ssize_t ext3_quota_write(struct super_block *sb, int type, ++ const char *data, size_t len, loff_t off) ++{ ++ struct inode *inode = sb_dqopt(sb)->files[type]; ++ sector_t blk = off >> EXT3_BLOCK_SIZE_BITS(sb); ++ int err = 0; ++ int offset = off & (sb->s_blocksize - 1); ++ int tocopy; ++ int journal_quota = EXT3_SB(sb)->s_qf_names[type] != NULL; ++ size_t towrite = len; ++ struct buffer_head *bh; ++ handle_t *handle = journal_current_handle(); ++ ++ down(&inode->i_sem); ++ while (towrite > 0) { ++ tocopy = sb->s_blocksize - offset < towrite ? ++ sb->s_blocksize - offset : towrite; ++ bh = ext3_bread(handle, inode, blk, 1, &err); ++ if (!bh) ++ goto out; ++ if (journal_quota) { ++ err = ext3_journal_get_write_access(handle, bh); ++ if (err) { ++ brelse(bh); ++ goto out; ++ } ++ } ++ lock_buffer(bh); ++ memcpy(bh->b_data+offset, data, tocopy); ++ flush_dcache_page(bh->b_page); ++ unlock_buffer(bh); ++ if (journal_quota) ++ err = ext3_journal_dirty_metadata(handle, bh); ++ else { ++ /* Always do at least ordered writes for quotas */ ++ err = ext3_journal_dirty_data(handle, bh); ++ mark_buffer_dirty(bh); ++ } ++ brelse(bh); ++ if (err) ++ goto out; ++ offset = 0; ++ towrite -= tocopy; ++ data += tocopy; ++ blk++; ++ } ++out: ++ if (len == towrite) ++ return err; ++ if (inode->i_size < off+len-towrite) { ++ i_size_write(inode, off+len-towrite); ++ EXT3_I(inode)->i_disksize = inode->i_size; ++ } ++ inode->i_version++; ++ inode->i_mtime = inode->i_ctime = CURRENT_TIME; ++ ext3_mark_inode_dirty(handle, inode); ++ up(&inode->i_sem); ++ return len - towrite; ++} ++ + #endif + + static struct super_block *ext3_get_sb(struct file_system_type *fs_type, +Index: 
linux-2.6.9/include/linux/ext3_jbd.h +=================================================================== +--- linux-2.6.9.orig/include/linux/ext3_jbd.h 2006-08-25 16:39:09.000000000 +0800 ++++ linux-2.6.9/include/linux/ext3_jbd.h 2006-09-14 11:44:29.000000000 +0800 +@@ -193,6 +193,8 @@ + #define ext3_journal_forget(handle, bh) \ + __ext3_journal_forget(__FUNCTION__, (handle), (bh)) + ++int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh); ++ + handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks); + int __ext3_journal_stop(const char *where, handle_t *handle); + diff --git a/lustre/kernel_patches/patches/quota-umount-race-fix.patch b/lustre/kernel_patches/patches/quota-umount-race-fix.patch new file mode 100644 index 0000000..42428c3 --- /dev/null +++ b/lustre/kernel_patches/patches/quota-umount-race-fix.patch @@ -0,0 +1,139 @@ + +From: Jan Kara + +Fix possible races between umount and quota on/off. + +Finally I decided to take a reference to vfsmount during vfs_quota_on() and +to drop it after the final cleanup in the vfs_quota_off(). This way we +should be all the time guarded against umount. This way was protected also +the old code which used filp_open() for opening quota files. I was also +thinking about other ways of protection but there would be always a window +(provided I don't want to play much with namespace locks) where +vfs_quota_on() could be called while umount() is in progress resulting in +the "Busy inodes after unmount" messages... + +Get a reference to vfsmount during quotaon() so that we are guarded against +umount (as was the old code using filp_open()). 
+ +Signed-off-by: Jan Kara +Signed-off-by: Andrew Morton +--- + + 25-akpm/fs/dquot.c | 45 ++++++++++++++++++++++++++++----------- + 25-akpm/include/linux/quota.h | 1 + 25-akpm/include/linux/quotaops.h | 2 - + 3 files changed, 35 insertions(+), 13 deletions(-) + +diff -puN fs/dquot.c~quota-umount-race-fix fs/dquot.c +--- 25/fs/dquot.c~quota-umount-race-fix Tue Nov 23 17:11:34 2004 ++++ 25-akpm/fs/dquot.c Tue Nov 23 17:11:34 2004 +@@ -1314,12 +1314,14 @@ int vfs_quota_off(struct super_block *sb + { + int cnt; + struct quota_info *dqopt = sb_dqopt(sb); +- struct inode *toput[MAXQUOTAS]; ++ struct inode *toputinode[MAXQUOTAS]; ++ struct vfsmount *toputmnt[MAXQUOTAS]; + + /* We need to serialize quota_off() for device */ + down(&dqopt->dqonoff_sem); + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { +- toput[cnt] = NULL; ++ toputinode[cnt] = NULL; ++ toputmnt[cnt] = NULL; + if (type != -1 && cnt != type) + continue; + if (!sb_has_quota_enabled(sb, cnt)) +@@ -1339,8 +1341,10 @@ int vfs_quota_off(struct super_block *sb + dqopt->ops[cnt]->free_file_info(sb, cnt); + put_quota_format(dqopt->info[cnt].dqi_format); + +- toput[cnt] = dqopt->files[cnt]; ++ toputinode[cnt] = dqopt->files[cnt]; ++ toputmnt[cnt] = dqopt->mnt[cnt]; + dqopt->files[cnt] = NULL; ++ dqopt->mnt[cnt] = NULL; + dqopt->info[cnt].dqi_flags = 0; + dqopt->info[cnt].dqi_igrace = 0; + dqopt->info[cnt].dqi_bgrace = 0; +@@ -1348,7 +1352,10 @@ int vfs_quota_off(struct super_block *sb + } + up(&dqopt->dqonoff_sem); + /* Sync the superblock so that buffers with quota data are written to +- * disk (and so userspace sees correct data afterwards) */ ++ * disk (and so userspace sees correct data afterwards). ++ * The reference to vfsmnt we are still holding protects us from ++ * umount (we don't have it only when quotas are turned on/off for ++ * journal replay but in that case we are guarded by the fs anyway). 
*/ + if (sb->s_op->sync_fs) + sb->s_op->sync_fs(sb, 1); + sync_blockdev(sb->s_bdev); +@@ -1358,13 +1365,24 @@ int vfs_quota_off(struct super_block *sb + * must also discard the blockdev buffers so that we see the + * changes done by userspace on the next quotaon() */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) +- if (toput[cnt]) { +- down(&toput[cnt]->i_sem); +- toput[cnt]->i_flags &= ~(S_IMMUTABLE | S_NOATIME | S_NOQUOTA); +- truncate_inode_pages(&toput[cnt]->i_data, 0); +- up(&toput[cnt]->i_sem); +- mark_inode_dirty(toput[cnt]); +- iput(toput[cnt]); ++ if (toputinode[cnt]) { ++ down(&dqopt->dqonoff_sem); ++ /* If quota was reenabled in the meantime, we have ++ * nothing to do */ ++ if (!sb_has_quota_enabled(sb, cnt)) { ++ down(&toputinode[cnt]->i_sem); ++ toputinode[cnt]->i_flags &= ~(S_IMMUTABLE | ++ S_NOATIME | S_NOQUOTA); ++ truncate_inode_pages(&toputinode[cnt]->i_data, 0); ++ up(&toputinode[cnt]->i_sem); ++ mark_inode_dirty(toputinode[cnt]); ++ iput(toputinode[cnt]); ++ } ++ up(&dqopt->dqonoff_sem); ++ /* We don't hold the reference when we turned on quotas ++ * just for the journal replay... */ ++ if (toputmnt[cnt]) ++ mntput(toputmnt[cnt]); + } + invalidate_bdev(sb->s_bdev, 0); + return 0; +@@ -1478,8 +1496,11 @@ int vfs_quota_on(struct super_block *sb, + /* Quota file not on the same filesystem? 
*/ + if (nd.mnt->mnt_sb != sb) + error = -EXDEV; +- else ++ else { + error = vfs_quota_on_inode(nd.dentry->d_inode, type, format_id); ++ if (!error) ++ sb_dqopt(sb)->mnt[type] = mntget(nd.mnt); ++ } + out_path: + path_release(&nd); + return error; +diff -puN include/linux/quota.h~quota-umount-race-fix include/linux/quota.h +--- 25/include/linux/quota.h~quota-umount-race-fix Tue Nov 23 17:11:34 2004 ++++ 25-akpm/include/linux/quota.h Tue Nov 23 17:11:34 2004 +@@ -286,6 +286,7 @@ struct quota_info { + struct semaphore dqonoff_sem; /* Serialize quotaon & quotaoff */ + struct rw_semaphore dqptr_sem; /* serialize ops using quota_info struct, pointers from inode to dquots */ + struct inode *files[MAXQUOTAS]; /* inodes of quotafiles */ ++ struct vfsmount *mnt[MAXQUOTAS]; /* mountpoint entries of filesystems with quota files */ + struct mem_dqinfo info[MAXQUOTAS]; /* Information for each quota type */ + struct quota_format_ops *ops[MAXQUOTAS]; /* Operations for each type */ + }; +diff -puN include/linux/quotaops.h~quota-umount-race-fix include/linux/quotaops.h +--- 25/include/linux/quotaops.h~quota-umount-race-fix Tue Nov 23 17:11:34 2004 ++++ 25-akpm/include/linux/quotaops.h Tue Nov 23 17:11:34 2004 +@@ -177,7 +177,7 @@ static __inline__ int DQUOT_OFF(struct s + { + int ret = -ENOSYS; + +- if (sb->s_qcop && sb->s_qcop->quota_off) ++ if (sb_any_quota_enabled(sb) && sb->s_qcop && sb->s_qcop->quota_off) + ret = sb->s_qcop->quota_off(sb, -1); + return ret; + } +_ diff --git a/lustre/tests/flocks_test.c b/lustre/tests/flocks_test.c new file mode 100644 index 0000000..ff54e06 --- /dev/null +++ b/lustre/tests/flocks_test.c @@ -0,0 +1,62 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + */ +#include +#include +#include +#include +#include +#include + +#include + +void usage(void) +{ + fprintf(stderr, "usage: ./flocks_test on|off -c|-f|-l /path/to/file\n"); + exit(EXIT_FAILURE); +} + +int main(int argc, char *argv[]) +{ + 
int fd;
+ int mount_with_flock = 0;
+ int error = 0;
+
+ if (argc != 4)
+ usage();
+
+ if (!strncmp(argv[1], "on", 3)) {
+ mount_with_flock = 1;
+ } else if (!strncmp(argv[1], "off", 4)) {
+ mount_with_flock = 0;
+ } else {
+ usage();
+ }
+
+ if ((fd = open(argv[3], O_RDWR)) < 0) {
+ fprintf(stderr, "Couldn't open file: %s\n", argv[3]);
+ exit(EXIT_FAILURE);
+ }
+
+ if (!strncmp(argv[2], "-c", 3)) {
+ struct flock fl;
+
+ fl.l_type = F_RDLCK;
+ fl.l_whence = SEEK_SET;
+ fl.l_start = 0;
+ fl.l_len = 1;
+
+ error = fcntl(fd, F_SETLK, &fl);
+ } else if (!strncmp(argv[2], "-l", 3)) {
+ error = lockf(fd, F_LOCK, 1);
+ } else if (!strncmp(argv[2], "-f", 3)) {
+ error = flock(fd, LOCK_EX);
+ } else {
+ usage();
+ }
+
+ if (mount_with_flock)
+ return((error == 0) ? EXIT_SUCCESS : EXIT_FAILURE);
+ else
+ return((error == 0) ? EXIT_FAILURE : EXIT_SUCCESS);
+}
-- 
1.8.3.1