* bug fixes
- fix deadlock in obdfilter statistics vs. object create (5811)
- fix for HPUX NFS client breakage when NFS exporting Lustre (5781)
+ - mdc_enqueue does not need max_mds_easize request buffer on send (5707)
+ - swab llog records of type '0' so we get proper header size/idx (5861)
+ - send llog cancel req to DLM cancel portal instead of cb portal (5515)
        * miscellanea
- by default create 1 inode per 4kB space on MDS, per 16kB on OSTs
+ - allow --write-conf on an MDS with different nettype than client (5619)
+ - don't write config llogs to MDS for mounts not from that MDS (5617)
+ - lconf should create multiple TCP connections from a client (5201)
        - init scripts are now turned off by default; run "chkconfig lustre on"
          and "chkconfig lustrefs on" to use them
-m4_define([LUSTRE_VERSION],[1.4.1])
+m4_define([LUSTRE_VERSION],[1.4.1.1])
#ifdef HAVE_SYS_USER_H
# include <sys/user.h>
#endif
-
-#include "ioctl.h"
+#ifdef HAVE_SYS_IOCTL_H
+# include <sys/ioctl.h>
+#else
+# include "ioctl.h"
+#endif /* HAVE_SYS_IOCTL_H */
#include <stdio.h>
#include <sys/ioctl.h>
#ifndef _LUSTRE_CFG_H
#define _LUSTRE_CFG_H
-#define LUSTRE_CFG_VERSION 0x00010001
+/* 0x1cf6 reads as "lcfG" in hexspeak: a recognizable lustre_cfg tag */
+#define LUSTRE_CFG_VERSION 0x1cf60001
+#define LUSTRE_CFG_MAX_BUFCOUNT 8
+
+#define LCFG_HDR_SIZE(count) \
+ size_round(offsetof (struct lustre_cfg, lcfg_buflens[(count)]))
enum lcfg_command_type {
LCFG_ATTACH = 0x00cf001,
LCFG_SET_UPCALL = 0x00cf010,
};
+struct lustre_cfg_bufs {
+ void *lcfg_buf[LUSTRE_CFG_MAX_BUFCOUNT];
+ uint32_t lcfg_buflen[LUSTRE_CFG_MAX_BUFCOUNT];
+ uint32_t lcfg_bufcount;
+};
+
struct lustre_cfg {
uint32_t lcfg_version;
uint32_t lcfg_command;
uint64_t lcfg_nid;
uint32_t lcfg_nal;
- /* inline buffers for various arguments */
- uint32_t lcfg_dev_namelen;
- char *lcfg_dev_name;
- uint32_t lcfg_inllen1;
- char *lcfg_inlbuf1;
- uint32_t lcfg_inllen2;
- char *lcfg_inlbuf2;
- uint32_t lcfg_inllen3;
- char *lcfg_inlbuf3;
- uint32_t lcfg_inllen4;
- char *lcfg_inlbuf4;
-
- char lcfg_bulk[0];
-
+ uint32_t lcfg_bufcount;
+ uint32_t lcfg_buflens[0];
};
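/*
 * Editorial sketch (not part of this patch): the new lustre_cfg is one
 * contiguous, variable-length buffer.  Derived from the definitions
 * above, its layout is:
 *
 *   [ lcfg_version | lcfg_command | lcfg_nid | lcfg_nal | lcfg_bufcount ]
 *   [ lcfg_buflens[0] .. lcfg_buflens[lcfg_bufcount - 1] ] <- LCFG_HDR_SIZE()
 *   [ buffer 0, size_round()ed ][ buffer 1, size_round()ed ] ...
 *
 * lustre_cfg_buf() walks this layout to locate buffer N, and
 * lustre_cfg_len() sums it to get the total allocation size.
 */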
-#define LCFG_INIT(l, cmd, name) \
-do { \
- memset(&(l), 0, sizeof(l)); \
- (l).lcfg_version = LUSTRE_CFG_VERSION; \
- (l).lcfg_command = (cmd); \
- if (name) { \
- (l).lcfg_dev_namelen = strlen(name) + 1; \
- (l).lcfg_dev_name = name; \
- } \
- \
-} while (0)
-
-#ifndef __KERNEL__
-static inline int lustre_cfg_packlen(struct lustre_cfg *lcfg)
+#define LUSTRE_CFG_BUFLEN(lcfg, idx) \
+ ((lcfg)->lcfg_bufcount <= (idx) \
+ ? 0 \
+ : (lcfg)->lcfg_buflens[(idx)])
+
+static inline void lustre_cfg_bufs_set(struct lustre_cfg_bufs *bufs,
+ uint32_t index,
+ void *buf,
+ uint32_t buflen)
{
- int len = size_round(sizeof(struct lustre_cfg));
- len += size_round(lcfg->lcfg_dev_namelen);
- len += size_round(lcfg->lcfg_inllen1);
- len += size_round(lcfg->lcfg_inllen2);
- len += size_round(lcfg->lcfg_inllen3);
- len += size_round(lcfg->lcfg_inllen4);
- return size_round(len);
+ if (index >= LUSTRE_CFG_MAX_BUFCOUNT)
+ return;
+ if (bufs == NULL)
+ return;
+
+ if (bufs->lcfg_bufcount <= index)
+ bufs->lcfg_bufcount = index + 1;
+
+ bufs->lcfg_buf[index] = buf;
+ bufs->lcfg_buflen[index] = buflen;
}
-static inline int lustre_cfg_pack(struct lustre_cfg *data, char **pbuf,
- int max, int *plen)
+static inline void lustre_cfg_bufs_set_string(struct lustre_cfg_bufs *bufs,
+ uint32_t index,
+ char *str)
{
- char *ptr;
- struct lustre_cfg *overlay;
- int len;
+ lustre_cfg_bufs_set(bufs, index, str, str ? strlen(str) + 1 : 0);
+}
- len = lustre_cfg_packlen(data);
+static inline void lustre_cfg_bufs_reset(struct lustre_cfg_bufs *bufs, char *name)
+{
+ memset((bufs), 0, sizeof(*bufs));
+ if (name)
+ lustre_cfg_bufs_set_string(bufs, 0, name);
+}
- data->lcfg_version = LUSTRE_CFG_VERSION;
+static inline void *lustre_cfg_buf(struct lustre_cfg *lcfg, int index)
+{
+ int i;
+ int offset;
+ int bufcount;
+ LASSERT (lcfg != NULL);
+ LASSERT (index >= 0);
+
+ bufcount = lcfg->lcfg_bufcount;
+ if (index >= bufcount)
+ return NULL;
+
+ offset = LCFG_HDR_SIZE(lcfg->lcfg_bufcount);
+ for (i = 0; i < index; i++)
+ offset += size_round(lcfg->lcfg_buflens[i]);
+ return (char *)lcfg + offset;
+}
- if (*pbuf && len > max)
- return 1;
- if (*pbuf == NULL) {
- *pbuf = malloc(len);
+static inline void lustre_cfg_bufs_init(struct lustre_cfg_bufs *bufs,
+ struct lustre_cfg *lcfg)
+{
+ int i;
+ bufs->lcfg_bufcount = lcfg->lcfg_bufcount;
+ for (i = 0; i < bufs->lcfg_bufcount; i++) {
+ bufs->lcfg_buflen[i] = lcfg->lcfg_buflens[i];
+ bufs->lcfg_buf[i] = lustre_cfg_buf(lcfg, i);
}
- if (!*pbuf)
- return 1;
- overlay = (struct lustre_cfg *)*pbuf;
- memcpy(*pbuf, data, sizeof(*data));
-
- ptr = overlay->lcfg_bulk;
- if (data->lcfg_dev_name)
- LOGL(data->lcfg_dev_name, data->lcfg_dev_namelen, ptr);
- if (data->lcfg_inlbuf1)
- LOGL(data->lcfg_inlbuf1, data->lcfg_inllen1, ptr);
- if (data->lcfg_inlbuf2)
- LOGL(data->lcfg_inlbuf2, data->lcfg_inllen2, ptr);
- if (data->lcfg_inlbuf3)
- LOGL(data->lcfg_inlbuf3, data->lcfg_inllen3, ptr);
- if (data->lcfg_inlbuf4)
- LOGL(data->lcfg_inlbuf4, data->lcfg_inllen4, ptr);
-
- *plen = len;
-
- return 0;
}
-static inline int lustre_cfg_unpack(struct lustre_cfg *data, char *pbuf,
- int max)
+static inline char *lustre_cfg_string(struct lustre_cfg *lcfg, int index)
{
- char *ptr;
- struct lustre_cfg *overlay;
-
- if (!pbuf)
- return 1;
- overlay = (struct lustre_cfg *)pbuf;
-
- /* Preserve the caller's buffer pointers */
- overlay->lcfg_dev_name = data->lcfg_dev_name;
- overlay->lcfg_inlbuf1 = data->lcfg_inlbuf1;
- overlay->lcfg_inlbuf2 = data->lcfg_inlbuf2;
- overlay->lcfg_inlbuf3 = data->lcfg_inlbuf3;
- overlay->lcfg_inlbuf4 = data->lcfg_inlbuf4;
-
- memcpy(data, pbuf, sizeof(*data));
-
- ptr = overlay->lcfg_bulk;
- if (data->lcfg_dev_name)
- LOGU(data->lcfg_dev_name, data->lcfg_dev_namelen, ptr);
- if (data->lcfg_inlbuf1)
- LOGU(data->lcfg_inlbuf1, data->lcfg_inllen1, ptr);
- if (data->lcfg_inlbuf2)
- LOGU(data->lcfg_inlbuf2, data->lcfg_inllen2, ptr);
- if (data->lcfg_inlbuf3)
- LOGU(data->lcfg_inlbuf3, data->lcfg_inllen3, ptr);
- if (data->lcfg_inlbuf4)
- LOGU(data->lcfg_inlbuf4, data->lcfg_inllen4, ptr);
-
- return 0;
-}
-#endif
+ char *s;
-#include <linux/obd_support.h>
+        if (LUSTRE_CFG_BUFLEN(lcfg, index) == 0)
+                return NULL;
+
+ s = lustre_cfg_buf(lcfg, index);
+ if (!s)
+ return NULL;
-static inline int lustre_cfg_getdata(char **buf, int len, void *arg, int kernel)
+ /* make sure it's NULL terminated, even if this kills a char
+ * of data
+ */
+ s[lcfg->lcfg_buflens[index] - 1] = '\0';
+ return s;
+}
+
+static inline int lustre_cfg_len(uint32_t bufcount, uint32_t *buflens)
{
- struct lustre_cfg *lcfg;
- int err;
- int offset = 0;
+ int i;
+ int len;
ENTRY;
- if (len > OBD_MAX_IOCTL_BUFFER) {
- CERROR("User buffer len %d exceeds %d max buffer\n",
- len, OBD_MAX_IOCTL_BUFFER);
- return -EINVAL;
- }
- if (len < sizeof(struct lustre_cfg)) {
- CERROR("OBD: user buffer too small for lustre_cfg\n");
- return -EINVAL;
- }
+ len = LCFG_HDR_SIZE(bufcount);
+ for (i = 0; i < bufcount; i++)
+ len += size_round(buflens[i]);
- /* XXX allocate this more intelligently, using kmalloc when
- * appropriate */
- OBD_ALLOC(*buf, len);
- if (*buf == NULL) {
- CERROR("Cannot allocate control buffer of len %d\n", len);
- RETURN(-EINVAL);
- }
+ RETURN(size_round(len));
+}
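/*
 * Worked example (editorial, assuming size_round() rounds up to a
 * multiple of 8 and a typical ABI where lcfg_buflens starts at byte 24):
 *
 *   bufcount = 2, buflens = { 6, 10 }
 *   header   = size_round(24 + 2 * 4)         = 32
 *   payload  = size_round(6) + size_round(10) = 8 + 16
 *   total    = size_round(32 + 8 + 16)        = 56 bytes
 */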
- if (kernel) {
- memcpy(*buf, (void *)arg, len);
- } else {
- err = copy_from_user(*buf, (void *)arg, len);
- if (err)
- RETURN(err);
- }
- lcfg = (struct lustre_cfg *)*buf;
+#include <linux/obd_support.h>
- if (lcfg->lcfg_version != LUSTRE_CFG_VERSION) {
- CERROR("Version mismatch kernel: %#x application: %#x\n",
- LUSTRE_CFG_VERSION, lcfg->lcfg_version);
- return -EINVAL;
- }
+static inline struct lustre_cfg *lustre_cfg_new(int cmd,
+ struct lustre_cfg_bufs *bufs)
+{
+ struct lustre_cfg *lcfg;
+ char *ptr;
+ int i;
+ ENTRY;
- if (lcfg->lcfg_dev_name) {
- lcfg->lcfg_dev_name = &lcfg->lcfg_bulk[0];
- offset += size_round(lcfg->lcfg_dev_namelen);
- }
+ OBD_ALLOC(lcfg, lustre_cfg_len(bufs->lcfg_bufcount,
+ bufs->lcfg_buflen));
+ if (!lcfg)
+ RETURN(lcfg);
- if (lcfg->lcfg_inllen1) {
- lcfg->lcfg_inlbuf1 = &lcfg->lcfg_bulk[0] + offset;
- offset += size_round(lcfg->lcfg_inllen1);
- }
+ lcfg->lcfg_version = LUSTRE_CFG_VERSION;
+ lcfg->lcfg_command = cmd;
+ lcfg->lcfg_bufcount = bufs->lcfg_bufcount;
- if (lcfg->lcfg_inllen2) {
- lcfg->lcfg_inlbuf2 = &lcfg->lcfg_bulk[0] + offset;
- offset += size_round(lcfg->lcfg_inllen2);
+ ptr = (char *)lcfg + LCFG_HDR_SIZE(lcfg->lcfg_bufcount);
+ for (i = 0; i < lcfg->lcfg_bufcount; i++) {
+ lcfg->lcfg_buflens[i] = bufs->lcfg_buflen[i];
+ LOGL((char *)bufs->lcfg_buf[i], bufs->lcfg_buflen[i], ptr);
}
+ RETURN(lcfg);
+}
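/*
 * Editorial usage sketch (hypothetical caller, not part of this patch):
 * stage the argument strings in an on-stack lustre_cfg_bufs, then pack
 * them into a single contiguous lustre_cfg allocation.
 */
static inline struct lustre_cfg *example_attach_cfg(char *devname,
                                                    char *type,
                                                    char *uuid)
{
        struct lustre_cfg_bufs bufs;

        lustre_cfg_bufs_reset(&bufs, devname);      /* buffer 0: device name */
        lustre_cfg_bufs_set_string(&bufs, 1, type); /* buffer 1: obd type */
        lustre_cfg_bufs_set_string(&bufs, 2, uuid); /* buffer 2: obd uuid */

        return lustre_cfg_new(LCFG_ATTACH, &bufs);  /* NULL on alloc failure */
}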
- if (lcfg->lcfg_inllen3) {
- lcfg->lcfg_inlbuf3 = &lcfg->lcfg_bulk[0] + offset;
- offset += size_round(lcfg->lcfg_inllen3);
- }
+static inline void lustre_cfg_free(struct lustre_cfg *lcfg)
+{
+ int len;
- if (lcfg->lcfg_inllen4) {
- lcfg->lcfg_inlbuf4 = &lcfg->lcfg_bulk[0] + offset;
- }
+ len = lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens);
+ OBD_FREE(lcfg, len);
EXIT;
- return 0;
+ return;
}
-static inline void lustre_cfg_freedata(char *buf, int len)
+static inline int lustre_cfg_sanity_check(void *buf, int len)
{
+ struct lustre_cfg *lcfg = (struct lustre_cfg *)buf;
ENTRY;
+ if (!lcfg)
+ RETURN(-EINVAL);
- OBD_FREE(buf, len);
- EXIT;
- return;
+ /* check that the first bits of the struct are valid */
+ if (len < LCFG_HDR_SIZE(0))
+ RETURN(-EINVAL);
+
+ if (lcfg->lcfg_version != LUSTRE_CFG_VERSION)
+ RETURN(-EINVAL);
+ if (lcfg->lcfg_bufcount >= LUSTRE_CFG_MAX_BUFCOUNT)
+ RETURN(-EINVAL);
+
+ /* check that the buflens are valid */
+ if (len < LCFG_HDR_SIZE(lcfg->lcfg_bufcount))
+ RETURN(-EINVAL);
+
+ /* make sure all the pointers point inside the data */
+ if (len < lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens))
+ RETURN(-EINVAL);
+
+ RETURN(0);
}
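/*
 * Editorial sketch (not part of this patch): a receive path would run
 * the sanity check before trusting lcfg_bufcount or lcfg_buflens, so
 * that every later lustre_cfg_buf()/lustre_cfg_string() call stays
 * inside the ioctl buffer.
 */
static inline int example_handle_cfg(void *buf, int len)
{
        struct lustre_cfg *lcfg = (struct lustre_cfg *)buf;
        int rc;

        rc = lustre_cfg_sanity_check(buf, len);
        if (rc != 0)
                return rc;                      /* malformed or truncated */

        /* safe from here on: lcfg_bufcount and every buflen fit in len */
        (void)lustre_cfg_string(lcfg, 0);       /* e.g. the device name */
        return 0;
}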
/* Passed by mount */
struct obdo {
obd_id o_id;
obd_gr o_gr;
- obd_time o_atime;
+ obd_size o_size;
obd_time o_mtime;
+ obd_time o_atime;
obd_time o_ctime;
- obd_size o_size;
obd_blocks o_blocks; /* brw: cli sent cached bytes */
obd_size o_grant;
obd_blksize o_blksize; /* optimal IO blocksize */
* array of UUIDs returned by the MDS. With the current
* protocol, this will limit the max number of OSTs per LOV */
+#define LOV_DESC_MAGIC 0xB0CCDE5C
+
struct lov_desc {
__u32 ld_tgt_count; /* how many OBD's */
__u32 ld_active_tgt_count; /* how many active */
struct obd_uuid ld_uuid;
};
+#define ld_magic ld_active_tgt_count /* for swabbing from llogs */
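/*
 * Editorial sketch (assumption, not part of this patch): when a lov_desc
 * is read back from a config llog, the field overlaid by ld_magic can be
 * compared against a byte-swapped LOV_DESC_MAGIC to decide whether the
 * record was written by the other endianness and needs swabbing.
 */
static inline int example_lov_desc_needs_swab(struct lov_desc *ld)
{
        return ld->ld_magic == __swab32(LOV_DESC_MAGIC);
}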
+
extern void lustre_swab_lov_desc (struct lov_desc *ld);
/*
/* Log data record types - there is no specific reason that these need to
* be related to the RPC opcodes, but no reason not to (may be handy later?)
*/
+#define LLOG_OP_MAGIC 0x10600000
+#define LLOG_OP_MASK 0xfff00000
+
typedef enum {
- OST_SZ_REC = 0x10600000 | (OST_SAN_WRITE << 8),
- OST_RAID1_REC = 0x10600000 | ((OST_SAN_WRITE + 1) << 8),
- MDS_UNLINK_REC = 0x10610000 | (MDS_REINT << 8) | REINT_UNLINK,
- OBD_CFG_REC = 0x10620000,
- PTL_CFG_REC = 0x10630000,
- LLOG_GEN_REC = 0x10640000,
- LLOG_HDR_MAGIC = 0x10645539,
- LLOG_LOGID_MAGIC = 0x1064553b,
+ LLOG_PAD_MAGIC = LLOG_OP_MAGIC | 0,
+ OST_SZ_REC = LLOG_OP_MAGIC | (OST_SAN_WRITE << 8),
+ OST_RAID1_REC = LLOG_OP_MAGIC | ((OST_SAN_WRITE + 1) << 8),
+ MDS_UNLINK_REC = LLOG_OP_MAGIC | 0x10000 | (MDS_REINT << 8) | REINT_UNLINK,
+ OBD_CFG_REC = LLOG_OP_MAGIC | 0x20000,
+ PTL_CFG_REC = LLOG_OP_MAGIC | 0x30000,
+ LLOG_GEN_REC = LLOG_OP_MAGIC | 0x40000,
+ LLOG_HDR_MAGIC = LLOG_OP_MAGIC | 0x45539,
+ LLOG_LOGID_MAGIC = LLOG_OP_MAGIC | 0x4553b,
} llog_op_type;
+/*
+ * for now, continue to support old pad records which have 0 for their
+ * type but still need to be swabbed for their length
+ */
+#define LLOG_REC_HDR_NEEDS_SWABBING(r) \
+ (((r)->lrh_type & __swab32(LLOG_OP_MASK)) == \
+ __swab32(LLOG_OP_MAGIC) || \
+ (((r)->lrh_type == 0) && ((r)->lrh_len > LLOG_CHUNK_SIZE)))
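/*
 * Editorial sketch (assumption): per-record swab decision.  A legacy pad
 * record has lrh_type == 0, which looks identical in either byte order,
 * so an implausibly large lrh_len (> LLOG_CHUNK_SIZE) is used as the
 * swab signal for those records instead.
 */
static inline void example_swab_rec_hdr(struct llog_rec_hdr *rec)
{
        if (LLOG_REC_HDR_NEEDS_SWABBING(rec)) {
                __swab32s(&rec->lrh_len);
                __swab32s(&rec->lrh_index);
                __swab32s(&rec->lrh_type);
        }
}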
+
/* Log record header - stored in little endian order.
* Each record must start with this struct, end with a llog_rec_tail,
* and be a multiple of 256 bits in size.
__u32 lgdc_ctxt_idx;
} __attribute__((packed));
+extern void lustre_swab_lov_user_md(struct lov_user_md *lum);
+extern void lustre_swab_lov_user_md_objects(struct lov_user_md *lum);
+
+/* llog_swab.c */
extern void lustre_swab_llogd_body (struct llogd_body *d);
extern void lustre_swab_llog_hdr (struct llog_log_hdr *h);
extern void lustre_swab_llogd_conn_body (struct llogd_conn_body *d);
+extern void lustre_swab_llog_rec(struct llog_rec_hdr *rec,
+ struct llog_rec_tail *tail);
+
+struct portals_cfg;
+extern void lustre_swab_portals_cfg(struct portals_cfg *pcfg);
+
+struct lustre_cfg;
+extern void lustre_swab_lustre_cfg(struct lustre_cfg *lcfg);
static inline struct ll_fid *obdo_fid(struct obdo *oa)
{
struct semaphore cl_sem;
int cl_conn_count;
/* max_mds_easize is purely a performance thing so we don't have to
- * call obd_size_wiremd() all the time. */
+ * call obd_size_diskmd() all the time. */
+ int cl_default_mds_easize;
int cl_max_mds_easize;
int cl_max_mds_cookiesize;
kdev_t cl_sandev;
# define OBD_GFP_MASK GFP_NOFS
#endif
+#ifdef __KERNEL__
#define OBD_ALLOC(ptr, size) OBD_ALLOC_GFP(ptr, size, OBD_GFP_MASK)
+#else
+/* zero the allocation to match the kernel OBD_ALLOC semantics */
+#define OBD_ALLOC(ptr, size) ((ptr) = calloc(1, (size)))
+#endif
+
#define OBD_ALLOC_WAIT(ptr, size) OBD_ALLOC_GFP(ptr, size, GFP_KERNEL)
#ifdef __arch_um__
#define POISON_PAGE(page, val) do { } while (0)
#endif
+#ifdef __KERNEL__
#define OBD_FREE(ptr, size) \
do { \
LASSERT(ptr); \
kfree(ptr); \
(ptr) = (void *)0xdeadbeef; \
} while (0)
+#else
+#define OBD_FREE(ptr, size) ((void)(size), free((ptr)))
+#endif
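/*
 * Editorial sketch (not part of this patch): with the userspace
 * fallbacks above, tools such as lctl can reuse the lustre_cfg packing
 * helpers unchanged.
 */
static inline void example_userspace_roundtrip(void)
{
        struct lustre_cfg_bufs bufs;
        struct lustre_cfg *lcfg;

        lustre_cfg_bufs_reset(&bufs, "lov1");
        lcfg = lustre_cfg_new(LCFG_ATTACH, &bufs);  /* userspace: calloc() */
        if (lcfg != NULL)
                lustre_cfg_free(lcfg);              /* userspace: free() */
}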
#ifdef __arch_um__
# define OBD_VFREE(ptr, size) OBD_FREE(ptr, size)
--- /dev/null
+#
+# Automatically generated make config: don't edit
+#
+CONFIG_X86=y
+CONFIG_MMU=y
+CONFIG_UID16=y
+CONFIG_GENERIC_ISA_DMA=y
+
+#
+# Code maturity level options
+#
+CONFIG_EXPERIMENTAL=y
+CONFIG_CLEAN_COMPILE=y
+# CONFIG_STANDALONE is not set
+
+#
+# General setup
+#
+CONFIG_SWAP=y
+CONFIG_SYSVIPC=y
+CONFIG_BSD_PROCESS_ACCT=y
+CONFIG_SYSCTL=y
+CONFIG_LOG_BUF_SHIFT=17
+CONFIG_HOTPLUG=y
+CONFIG_EVLOG=y
+# CONFIG_EVLOG_FWPRINTK is not set
+CONFIG_IKCONFIG=y
+CONFIG_IKCONFIG_PROC=y
+# CONFIG_EMBEDDED is not set
+
+#
+# Class Based Kernel Resource Management
+#
+CONFIG_CKRM=y
+CONFIG_RCFS_FS=m
+CONFIG_CKRM_TYPE_TASKCLASS=y
+CONFIG_CKRM_RES_NUMTASKS=m
+CONFIG_CKRM_TYPE_SOCKETCLASS=y
+CONFIG_CKRM_RBCE=m
+CONFIG_CKRM_CRBCE=m
+CONFIG_DELAY_ACCT=y
+CONFIG_KALLSYMS=y
+CONFIG_FUTEX=y
+CONFIG_EPOLL=y
+CONFIG_IOSCHED_NOOP=y
+CONFIG_IOSCHED_AS=y
+CONFIG_IOSCHED_DEADLINE=y
+CONFIG_IOSCHED_CFQ=y
+# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
+
+#
+# Loadable module support
+#
+CONFIG_MODULES=y
+CONFIG_MODULE_UNLOAD=y
+CONFIG_MODULE_FORCE_UNLOAD=y
+CONFIG_OBSOLETE_MODPARM=y
+CONFIG_MODVERSIONS=y
+CONFIG_KMOD=y
+CONFIG_STOP_MACHINE=y
+
+#
+# Processor type and features
+#
+# CONFIG_X86_PC is not set
+# CONFIG_X86_ELAN is not set
+# CONFIG_X86_VOYAGER is not set
+# CONFIG_X86_NUMAQ is not set
+# CONFIG_X86_SUMMIT is not set
+# CONFIG_X86_BIGSMP is not set
+# CONFIG_X86_VISWS is not set
+CONFIG_X86_GENERICARCH=y
+# CONFIG_X86_ES7000 is not set
+CONFIG_X86_CYCLONE_TIMER=y
+# CONFIG_M386 is not set
+# CONFIG_M486 is not set
+# CONFIG_M586 is not set
+# CONFIG_M586TSC is not set
+# CONFIG_M586MMX is not set
+# CONFIG_M686 is not set
+CONFIG_MPENTIUMII=y
+# CONFIG_MPENTIUMIII is not set
+# CONFIG_MPENTIUMM is not set
+# CONFIG_MPENTIUM4 is not set
+# CONFIG_MK6 is not set
+# CONFIG_MK7 is not set
+# CONFIG_MK8 is not set
+# CONFIG_MCRUSOE is not set
+# CONFIG_MWINCHIPC6 is not set
+# CONFIG_MWINCHIP2 is not set
+# CONFIG_MWINCHIP3D is not set
+# CONFIG_MCYRIXIII is not set
+# CONFIG_MVIAC3_2 is not set
+CONFIG_X86_GENERIC=y
+CONFIG_X86_CMPXCHG=y
+CONFIG_X86_XADD=y
+CONFIG_X86_L1_CACHE_SHIFT=7
+CONFIG_RWSEM_XCHGADD_ALGORITHM=y
+CONFIG_X86_WP_WORKS_OK=y
+CONFIG_X86_INVLPG=y
+CONFIG_X86_BSWAP=y
+CONFIG_X86_POPAD_OK=y
+CONFIG_X86_GOOD_APIC=y
+CONFIG_X86_INTEL_USERCOPY=y
+CONFIG_X86_USE_PPRO_CHECKSUM=y
+# CONFIG_HPET_TIMER is not set
+# CONFIG_HPET_EMULATE_RTC is not set
+CONFIG_SMP=y
+CONFIG_NR_CPUS=128
+CONFIG_SCHED_SMT=y
+# CONFIG_PREEMPT is not set
+CONFIG_X86_LOCAL_APIC=y
+CONFIG_X86_IO_APIC=y
+CONFIG_X86_TSC=y
+CONFIG_X86_MCE=y
+# CONFIG_X86_MCE_NONFATAL is not set
+CONFIG_X86_MCE_P4THERMAL=y
+CONFIG_TOSHIBA=m
+CONFIG_I8K=m
+CONFIG_MICROCODE=m
+CONFIG_X86_MSR=m
+CONFIG_X86_CPUID=m
+
+#
+# Firmware Drivers
+#
+CONFIG_EDD=m
+# CONFIG_NOHIGHMEM is not set
+# CONFIG_HIGHMEM4G is not set
+CONFIG_HIGHMEM64G=y
+CONFIG_HIGHMEM=y
+CONFIG_X86_PAE=y
+# CONFIG_NUMA is not set
+CONFIG_HIGHPTE=y
+# CONFIG_MATH_EMULATION is not set
+CONFIG_MTRR=y
+CONFIG_EFI=y
+CONFIG_IRQBALANCE=y
+CONFIG_HAVE_DEC_LOCK=y
+CONFIG_BOOT_IOREMAP=y
+CONFIG_REGPARM=y
+
+#
+# Special options
+#
+CONFIG_PROC_MM=y
+
+#
+# Power management options (ACPI, APM)
+#
+CONFIG_PM=y
+# CONFIG_SOFTWARE_SUSPEND is not set
+# CONFIG_PM_DISK is not set
+
+#
+# ACPI (Advanced Configuration and Power Interface) Support
+#
+CONFIG_ACPI=y
+CONFIG_ACPI_BOOT=y
+CONFIG_ACPI_INTERPRETER=y
+CONFIG_ACPI_SLEEP=y
+CONFIG_ACPI_SLEEP_PROC_FS=y
+CONFIG_ACPI_AC=m
+CONFIG_ACPI_BATTERY=m
+CONFIG_ACPI_BUTTON=m
+CONFIG_ACPI_FAN=m
+CONFIG_ACPI_PROCESSOR=m
+CONFIG_ACPI_THERMAL=m
+# CONFIG_ACPI_ASUS is not set
+CONFIG_ACPI_TOSHIBA=m
+# CONFIG_ACPI_DEBUG is not set
+CONFIG_ACPI_BUS=y
+CONFIG_ACPI_EC=y
+CONFIG_ACPI_POWER=y
+CONFIG_ACPI_PCI=y
+CONFIG_ACPI_SYSTEM=y
+CONFIG_X86_PM_TIMER=y
+CONFIG_ACPI_INITRD=y
+
+#
+# APM (Advanced Power Management) BIOS Support
+#
+CONFIG_APM=y
+# CONFIG_APM_IGNORE_USER_SUSPEND is not set
+CONFIG_APM_DO_ENABLE=y
+# CONFIG_APM_CPU_IDLE is not set
+CONFIG_APM_DISPLAY_BLANK=y
+# CONFIG_APM_RTC_IS_GMT is not set
+CONFIG_APM_ALLOW_INTS=y
+# CONFIG_APM_REAL_MODE_POWER_OFF is not set
+
+#
+# CPU Frequency scaling
+#
+CONFIG_CPU_FREQ=y
+CONFIG_CPU_FREQ_PROC_INTF=y
+CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y
+# CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set
+CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
+CONFIG_CPU_FREQ_GOV_POWERSAVE=m
+CONFIG_CPU_FREQ_GOV_USERSPACE=m
+CONFIG_CPU_FREQ_GOV_ONDEMAND=m
+# CONFIG_CPU_FREQ_24_API is not set
+CONFIG_CPU_FREQ_TABLE=m
+
+#
+# CPUFreq processor drivers
+#
+CONFIG_X86_ACPI_CPUFREQ=m
+# CONFIG_X86_ACPI_CPUFREQ_PROC_INTF is not set
+CONFIG_X86_POWERNOW_K6=m
+CONFIG_X86_POWERNOW_K7=m
+CONFIG_X86_POWERNOW_K8=m
+CONFIG_X86_POWERNOW_K8_ACPI=y
+CONFIG_X86_GX_SUSPMOD=m
+CONFIG_X86_SPEEDSTEP_CENTRINO=m
+CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE=y
+# CONFIG_X86_SPEEDSTEP_CENTRINO_ACPI is not set
+CONFIG_X86_SPEEDSTEP_ICH=m
+CONFIG_X86_SPEEDSTEP_SMI=m
+CONFIG_X86_P4_CLOCKMOD=m
+CONFIG_X86_SPEEDSTEP_LIB=m
+CONFIG_X86_LONGRUN=m
+CONFIG_X86_LONGHAUL=m
+
+#
+# Bus options (PCI, PCMCIA, EISA, MCA, ISA)
+#
+CONFIG_PCI=y
+# CONFIG_PCI_GOBIOS is not set
+# CONFIG_PCI_GOMMCONFIG is not set
+# CONFIG_PCI_GODIRECT is not set
+CONFIG_PCI_GOANY=y
+CONFIG_PCI_BIOS=y
+CONFIG_PCI_DIRECT=y
+CONFIG_PCI_MMCONFIG=y
+# CONFIG_PCI_USE_VECTOR is not set
+# CONFIG_PCI_LEGACY_PROC is not set
+# CONFIG_PCI_NAMES is not set
+CONFIG_ISA=y
+# CONFIG_EISA is not set
+# CONFIG_MCA is not set
+CONFIG_SCx200=m
+
+#
+# PCMCIA/CardBus support
+#
+CONFIG_PCMCIA=m
+# CONFIG_PCMCIA_DEBUG is not set
+CONFIG_YENTA=m
+CONFIG_CARDBUS=y
+CONFIG_I82092=m
+CONFIG_I82365=m
+CONFIG_TCIC=m
+CONFIG_PCMCIA_PROBE=y
+
+#
+# PCI Hotplug Support
+#
+CONFIG_HOTPLUG_PCI=m
+CONFIG_HOTPLUG_PCI_FAKE=m
+CONFIG_HOTPLUG_PCI_COMPAQ=m
+CONFIG_HOTPLUG_PCI_COMPAQ_NVRAM=y
+CONFIG_HOTPLUG_PCI_IBM=m
+CONFIG_HOTPLUG_PCI_AMD=m
+CONFIG_HOTPLUG_PCI_ACPI=m
+CONFIG_HOTPLUG_PCI_CPCI=y
+CONFIG_HOTPLUG_PCI_CPCI_ZT5550=m
+CONFIG_HOTPLUG_PCI_CPCI_GENERIC=m
+CONFIG_HOTPLUG_PCI_PCIE=m
+# CONFIG_HOTPLUG_PCI_PCIE_POLL_EVENT_MODE is not set
+# CONFIG_HOTPLUG_PCI_SHPC is not set
+
+#
+# Executable file formats
+#
+CONFIG_BINFMT_ELF=y
+CONFIG_BINFMT_AOUT=m
+CONFIG_BINFMT_MISC=m
+
+#
+# Device Drivers
+#
+
+#
+# Generic Driver Options
+#
+CONFIG_FW_LOADER=m
+# CONFIG_DEBUG_DRIVER is not set
+
+#
+# Memory Technology Devices (MTD)
+#
+CONFIG_MTD=m
+# CONFIG_MTD_DEBUG is not set
+CONFIG_MTD_PARTITIONS=m
+CONFIG_MTD_CONCAT=m
+CONFIG_MTD_REDBOOT_PARTS=m
+CONFIG_MTD_CMDLINE_PARTS=m
+
+#
+# User Modules And Translation Layers
+#
+CONFIG_MTD_CHAR=m
+CONFIG_MTD_BLOCK=m
+# CONFIG_MTD_BLOCK_RO is not set
+# CONFIG_FTL is not set
+# CONFIG_NFTL is not set
+# CONFIG_INFTL is not set
+
+#
+# RAM/ROM/Flash chip drivers
+#
+CONFIG_MTD_CFI=m
+CONFIG_MTD_JEDECPROBE=m
+CONFIG_MTD_GEN_PROBE=m
+CONFIG_MTD_CFI_ADV_OPTIONS=y
+CONFIG_MTD_CFI_NOSWAP=y
+# CONFIG_MTD_CFI_BE_BYTE_SWAP is not set
+# CONFIG_MTD_CFI_LE_BYTE_SWAP is not set
+# CONFIG_MTD_CFI_GEOMETRY is not set
+CONFIG_MTD_CFI_INTELEXT=m
+CONFIG_MTD_CFI_AMDSTD=m
+CONFIG_MTD_CFI_STAA=m
+# CONFIG_MTD_RAM is not set
+# CONFIG_MTD_ROM is not set
+CONFIG_MTD_ABSENT=m
+CONFIG_MTD_OBSOLETE_CHIPS=y
+CONFIG_MTD_AMDSTD=m
+CONFIG_MTD_SHARP=m
+CONFIG_MTD_JEDEC=m
+
+#
+# Mapping drivers for chip access
+#
+CONFIG_MTD_COMPLEX_MAPPINGS=y
+CONFIG_MTD_PHYSMAP=m
+CONFIG_MTD_PHYSMAP_START=0x8000000
+CONFIG_MTD_PHYSMAP_LEN=0x4000000
+CONFIG_MTD_PHYSMAP_BUSWIDTH=2
+CONFIG_MTD_PNC2000=m
+CONFIG_MTD_SC520CDP=m
+CONFIG_MTD_NETSC520=m
+CONFIG_MTD_SBC_GXX=m
+CONFIG_MTD_ELAN_104NC=m
+CONFIG_MTD_OCTAGON=m
+CONFIG_MTD_VMAX=m
+CONFIG_MTD_SCx200_DOCFLASH=m
+CONFIG_MTD_AMD76XROM=m
+CONFIG_MTD_ICH2ROM=m
+CONFIG_MTD_SCB2_FLASH=m
+CONFIG_MTD_NETtel=m
+CONFIG_MTD_DILNETPC=m
+CONFIG_MTD_DILNETPC_BOOTSIZE=0x80000
+CONFIG_MTD_L440GX=m
+CONFIG_MTD_PCI=m
+
+#
+# Self-contained MTD device drivers
+#
+CONFIG_MTD_PMC551=m
+CONFIG_MTD_PMC551_BUGFIX=y
+# CONFIG_MTD_PMC551_DEBUG is not set
+CONFIG_MTD_SLRAM=m
+CONFIG_MTD_MTDRAM=m
+CONFIG_MTDRAM_TOTAL_SIZE=4096
+CONFIG_MTDRAM_ERASE_SIZE=128
+CONFIG_MTD_BLKMTD=m
+
+#
+# Disk-On-Chip Device Drivers
+#
+CONFIG_MTD_DOC2000=m
+CONFIG_MTD_DOC2001=m
+CONFIG_MTD_DOC2001PLUS=m
+CONFIG_MTD_DOCPROBE=m
+CONFIG_MTD_DOCPROBE_ADVANCED=y
+CONFIG_MTD_DOCPROBE_ADDRESS=0x0000
+CONFIG_MTD_DOCPROBE_HIGH=y
+CONFIG_MTD_DOCPROBE_55AA=y
+
+#
+# NAND Flash Device Drivers
+#
+CONFIG_MTD_NAND=m
+# CONFIG_MTD_NAND_VERIFY_WRITE is not set
+CONFIG_MTD_NAND_IDS=m
+
+#
+# Parallel port support
+#
+CONFIG_PARPORT=m
+CONFIG_PARPORT_PC=m
+CONFIG_PARPORT_PC_CML1=m
+CONFIG_PARPORT_SERIAL=m
+CONFIG_PARPORT_PC_FIFO=y
+CONFIG_PARPORT_PC_SUPERIO=y
+CONFIG_PARPORT_PC_PCMCIA=m
+CONFIG_PARPORT_OTHER=y
+CONFIG_PARPORT_1284=y
+
+#
+# Plug and Play support
+#
+CONFIG_PNP=y
+# CONFIG_PNP_DEBUG is not set
+
+#
+# Protocols
+#
+CONFIG_ISAPNP=y
+CONFIG_PNPBIOS=y
+CONFIG_PNPBIOS_PROC_FS=y
+
+#
+# Block devices
+#
+CONFIG_BLK_DEV_FD=y
+CONFIG_BLK_DEV_XD=m
+CONFIG_PARIDE=m
+CONFIG_PARIDE_PARPORT=m
+
+#
+# Parallel IDE high-level drivers
+#
+CONFIG_PARIDE_PD=m
+CONFIG_PARIDE_PCD=m
+CONFIG_PARIDE_PF=m
+CONFIG_PARIDE_PT=m
+CONFIG_PARIDE_PG=m
+
+#
+# Parallel IDE protocol modules
+#
+CONFIG_PARIDE_ATEN=m
+CONFIG_PARIDE_BPCK=m
+CONFIG_PARIDE_BPCK6=m
+CONFIG_PARIDE_COMM=m
+CONFIG_PARIDE_DSTR=m
+CONFIG_PARIDE_FIT2=m
+CONFIG_PARIDE_FIT3=m
+CONFIG_PARIDE_EPAT=m
+CONFIG_PARIDE_EPATC8=y
+CONFIG_PARIDE_EPIA=m
+CONFIG_PARIDE_FRIQ=m
+CONFIG_PARIDE_FRPW=m
+CONFIG_PARIDE_KBIC=m
+CONFIG_PARIDE_KTTI=m
+CONFIG_PARIDE_ON20=m
+CONFIG_PARIDE_ON26=m
+CONFIG_BLK_CPQ_DA=m
+CONFIG_BLK_CPQ_CISS_DA=m
+CONFIG_CISS_SCSI_TAPE=y
+CONFIG_BLK_CPQ_CISS_DA_NEW=m
+CONFIG_BLK_DEV_DAC960=m
+CONFIG_BLK_DEV_UMEM=m
+CONFIG_BLK_DEV_LOOP=y
+CONFIG_BLK_DEV_CRYPTOLOOP=m
+CONFIG_BLK_DEV_NBD=m
+CONFIG_BLK_DEV_CARMEL=m
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_SIZE=64000
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_LBD=y
+CONFIG_CIPHER_TWOFISH=m
+
+#
+# ATA/ATAPI/MFM/RLL support
+#
+CONFIG_IDE=y
+CONFIG_BLK_DEV_IDE=y
+
+#
+# Please see Documentation/ide.txt for help/info on IDE drives
+#
+# CONFIG_BLK_DEV_HD_IDE is not set
+CONFIG_BLK_DEV_IDEDISK=y
+CONFIG_IDEDISK_MULTI_MODE=y
+CONFIG_IDEDISK_STROKE=y
+CONFIG_BLK_DEV_IDECS=m
+CONFIG_BLK_DEV_IDECD=m
+CONFIG_BLK_DEV_IDETAPE=m
+CONFIG_BLK_DEV_IDEFLOPPY=y
+CONFIG_BLK_DEV_IDESCSI=m
+# CONFIG_IDE_TASK_IOCTL is not set
+# CONFIG_IDE_TASKFILE_IO is not set
+
+#
+# IDE chipset support/bugfixes
+#
+CONFIG_IDE_GENERIC=y
+CONFIG_BLK_DEV_CMD640=y
+CONFIG_BLK_DEV_CMD640_ENHANCED=y
+CONFIG_BLK_DEV_IDEPNP=y
+CONFIG_BLK_DEV_IDEPCI=y
+CONFIG_IDEPCI_SHARE_IRQ=y
+CONFIG_BLK_DEV_OFFBOARD=y
+CONFIG_BLK_DEV_GENERIC=y
+CONFIG_BLK_DEV_OPTI621=y
+CONFIG_BLK_DEV_RZ1000=y
+CONFIG_BLK_DEV_IDEDMA_PCI=y
+# CONFIG_BLK_DEV_IDEDMA_FORCED is not set
+CONFIG_IDEDMA_PCI_AUTO=y
+CONFIG_IDEDMA_ONLYDISK=y
+CONFIG_BLK_DEV_ADMA=y
+CONFIG_BLK_DEV_AEC62XX=y
+CONFIG_BLK_DEV_ALI15X3=y
+# CONFIG_WDC_ALI15X3 is not set
+CONFIG_BLK_DEV_AMD74XX=y
+CONFIG_BLK_DEV_ATIIXP=y
+CONFIG_BLK_DEV_CMD64X=y
+CONFIG_BLK_DEV_TRIFLEX=y
+CONFIG_BLK_DEV_CY82C693=y
+CONFIG_BLK_DEV_CS5520=m
+CONFIG_BLK_DEV_CS5530=m
+CONFIG_BLK_DEV_HPT34X=y
+CONFIG_HPT34X_AUTODMA=y
+CONFIG_BLK_DEV_HPT366=y
+CONFIG_BLK_DEV_SC1200=y
+CONFIG_BLK_DEV_PIIX=y
+CONFIG_BLK_DEV_NS87415=y
+CONFIG_BLK_DEV_PDC202XX_OLD=y
+CONFIG_PDC202XX_BURST=y
+CONFIG_BLK_DEV_PDC202XX_NEW=y
+CONFIG_PDC202XX_FORCE=y
+CONFIG_BLK_DEV_SVWKS=y
+CONFIG_BLK_DEV_SIIMAGE=y
+CONFIG_BLK_DEV_SIS5513=y
+CONFIG_BLK_DEV_SLC90E66=y
+CONFIG_BLK_DEV_TRM290=y
+CONFIG_BLK_DEV_VIA82CXXX=y
+CONFIG_IDE_CHIPSETS=y
+
+#
+# Note: most of these also require special kernel boot parameters
+#
+CONFIG_BLK_DEV_4DRIVES=y
+CONFIG_BLK_DEV_ALI14XX=y
+CONFIG_BLK_DEV_DTC2278=y
+CONFIG_BLK_DEV_HT6560B=y
+# CONFIG_BLK_DEV_PDC4030 is not set
+CONFIG_BLK_DEV_QD65XX=y
+CONFIG_BLK_DEV_UMC8672=y
+CONFIG_BLK_DEV_IDEDMA=y
+# CONFIG_IDEDMA_IVB is not set
+CONFIG_IDEDMA_AUTO=y
+# CONFIG_BLK_DEV_HD is not set
+
+#
+# SCSI device support
+#
+CONFIG_SCSI=m
+CONFIG_SCSI_PROC_FS=y
+
+#
+# SCSI support type (disk, tape, CD-ROM)
+#
+CONFIG_BLK_DEV_SD=m
+CONFIG_CHR_DEV_ST=m
+CONFIG_CHR_DEV_OSST=m
+CONFIG_BLK_DEV_SR=m
+# CONFIG_BLK_DEV_SR_VENDOR is not set
+CONFIG_CHR_DEV_SG=m
+CONFIG_CHR_DEV_SCH=m
+
+#
+# Some SCSI devices (e.g. CD jukebox) support multiple LUNs
+#
+CONFIG_SCSI_MULTI_LUN=y
+CONFIG_SCSI_CONSTANTS=y
+CONFIG_SCSI_LOGGING=y
+
+#
+# SCSI Transport Attributes
+#
+CONFIG_SCSI_SPI_ATTRS=m
+CONFIG_SCSI_FC_ATTRS=m
+
+#
+# SCSI low-level drivers
+#
+CONFIG_BLK_DEV_3W_XXXX_RAID=m
+CONFIG_SCSI_7000FASST=m
+CONFIG_SCSI_ACARD=m
+CONFIG_SCSI_AHA152X=m
+CONFIG_SCSI_AHA1542=m
+CONFIG_SCSI_AACRAID=m
+CONFIG_SCSI_AIC7XXX=m
+CONFIG_AIC7XXX_CMDS_PER_DEVICE=32
+CONFIG_AIC7XXX_RESET_DELAY_MS=5000
+# CONFIG_AIC7XXX_BUILD_FIRMWARE is not set
+# CONFIG_AIC7XXX_DEBUG_ENABLE is not set
+CONFIG_AIC7XXX_DEBUG_MASK=0
+CONFIG_AIC7XXX_REG_PRETTY_PRINT=y
+CONFIG_SCSI_AIC7XXX_OLD=m
+CONFIG_SCSI_AIC79XX=m
+CONFIG_AIC79XX_CMDS_PER_DEVICE=32
+CONFIG_AIC79XX_RESET_DELAY_MS=15000
+# CONFIG_AIC79XX_BUILD_FIRMWARE is not set
+# CONFIG_AIC79XX_ENABLE_RD_STRM is not set
+# CONFIG_AIC79XX_DEBUG_ENABLE is not set
+CONFIG_AIC79XX_DEBUG_MASK=0
+CONFIG_AIC79XX_REG_PRETTY_PRINT=y
+# CONFIG_SCSI_AIC79XX_NEW is not set
+CONFIG_SCSI_ADVANSYS=m
+CONFIG_SCSI_IN2000=m
+CONFIG_MEGARAID_NEWGEN=y
+CONFIG_MEGARAID_MM=m
+CONFIG_MEGARAID_MAILBOX=m
+CONFIG_MEGARAID_LEGACY=m
+CONFIG_SCSI_SATA=y
+CONFIG_SCSI_SATA_SVW=m
+CONFIG_SCSI_ATA_PIIX=m
+CONFIG_SCSI_SATA_PROMISE=m
+CONFIG_SCSI_SATA_SIL=m
+CONFIG_SCSI_SATA_SIS=m
+CONFIG_SCSI_SATA_VIA=m
+CONFIG_SCSI_SATA_VITESSE=m
+CONFIG_SCSI_BUSLOGIC=m
+# CONFIG_SCSI_OMIT_FLASHPOINT is not set
+# CONFIG_SCSI_CPQFCTS is not set
+CONFIG_SCSI_DMX3191D=m
+CONFIG_SCSI_DTC3280=m
+CONFIG_SCSI_EATA=m
+CONFIG_SCSI_EATA_TAGGED_QUEUE=y
+CONFIG_SCSI_EATA_LINKED_COMMANDS=y
+CONFIG_SCSI_EATA_MAX_TAGS=16
+CONFIG_SCSI_EATA_PIO=m
+CONFIG_SCSI_FUTURE_DOMAIN=m
+CONFIG_SCSI_GDTH=m
+CONFIG_SCSI_GENERIC_NCR5380=m
+CONFIG_SCSI_GENERIC_NCR5380_MMIO=m
+CONFIG_SCSI_GENERIC_NCR53C400=y
+CONFIG_SCSI_IPS=m
+CONFIG_SCSI_INIA100=m
+CONFIG_SCSI_PPA=m
+CONFIG_SCSI_IMM=m
+# CONFIG_SCSI_IZIP_EPP16 is not set
+# CONFIG_SCSI_IZIP_SLOW_CTR is not set
+CONFIG_SCSI_NCR53C406A=m
+CONFIG_SCSI_SYM53C8XX_2=m
+CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=1
+CONFIG_SCSI_SYM53C8XX_DEFAULT_TAGS=16
+CONFIG_SCSI_SYM53C8XX_MAX_TAGS=64
+# CONFIG_SCSI_SYM53C8XX_IOMAPPED is not set
+CONFIG_SCSI_LPFC=m
+CONFIG_SCSI_IPR=m
+CONFIG_SCSI_IPR_TRACE=y
+CONFIG_SCSI_IPR_DUMP=y
+CONFIG_SCSI_PAS16=m
+CONFIG_SCSI_PSI240I=m
+CONFIG_SCSI_QLOGIC_FAS=m
+CONFIG_SCSI_QLOGIC_ISP=m
+CONFIG_SCSI_QLOGIC_FC=m
+CONFIG_SCSI_QLOGIC_FC_FIRMWARE=y
+CONFIG_SCSI_QLOGIC_1280=m
+CONFIG_SCSI_QLA2XXX=m
+CONFIG_SCSI_QLA21XX=m
+CONFIG_SCSI_QLA22XX=m
+CONFIG_SCSI_QLA2300=m
+CONFIG_SCSI_QLA2322=m
+CONFIG_SCSI_QLA6312=m
+CONFIG_SCSI_QLA6322=m
+CONFIG_SCSI_QLA2XXX_FAILOVER=y
+CONFIG_SCSI_QLA4XXX=m
+CONFIG_SCSI_QLA4XXX_FAILOVER=y
+CONFIG_SCSI_SYM53C416=m
+CONFIG_SCSI_DC395x=m
+CONFIG_SCSI_DC390T=m
+CONFIG_SCSI_T128=m
+CONFIG_SCSI_U14_34F=m
+CONFIG_SCSI_U14_34F_TAGGED_QUEUE=y
+CONFIG_SCSI_U14_34F_LINKED_COMMANDS=y
+CONFIG_SCSI_U14_34F_MAX_TAGS=8
+CONFIG_SCSI_ULTRASTOR=m
+CONFIG_SCSI_NSP32=m
+CONFIG_SCSI_DEBUG=m
+
+#
+# PCMCIA SCSI adapter support
+#
+CONFIG_PCMCIA_AHA152X=m
+CONFIG_PCMCIA_FDOMAIN=m
+CONFIG_PCMCIA_NINJA_SCSI=m
+CONFIG_PCMCIA_QLOGIC=m
+
+#
+# Old CD-ROM drivers (not SCSI, not IDE)
+#
+CONFIG_CD_NO_IDESCSI=y
+CONFIG_AZTCD=m
+CONFIG_GSCD=m
+CONFIG_MCD=m
+CONFIG_MCD_IRQ=11
+CONFIG_MCD_BASE=0x300
+CONFIG_OPTCD=m
+CONFIG_SJCD=m
+CONFIG_ISP16_CDI=m
+CONFIG_CDU535=m
+
+#
+# Multi-device support (RAID and LVM)
+#
+CONFIG_MD=y
+CONFIG_BLK_DEV_MD=y
+CONFIG_MD_LINEAR=m
+CONFIG_MD_RAID0=m
+CONFIG_MD_RAID1=m
+CONFIG_MD_RAID5=m
+CONFIG_MD_RAID6=m
+CONFIG_MD_MULTIPATH=m
+CONFIG_BLK_DEV_DM=m
+CONFIG_DM_CRYPT=m
+CONFIG_DM_MULTIPATH=m
+CONFIG_DM_SNAPSHOT=m
+CONFIG_DM_MIRROR=m
+CONFIG_DM_ZERO=m
+CONFIG_DM_FLAKEY=m
+CONFIG_BLK_DEV_DM_BBR=m
+
+#
+# Fusion MPT device support
+#
+CONFIG_FUSION=m
+CONFIG_FUSION_MAX_SGE=40
+CONFIG_FUSION_CTL=m
+CONFIG_FUSION_LAN=m
+
+#
+# IEEE 1394 (FireWire) support
+#
+CONFIG_IEEE1394=m
+
+#
+# Subsystem Options
+#
+# CONFIG_IEEE1394_VERBOSEDEBUG is not set
+# CONFIG_IEEE1394_OUI_DB is not set
+CONFIG_IEEE1394_EXTRA_CONFIG_ROMS=y
+CONFIG_IEEE1394_CONFIG_ROM_IP1394=y
+
+#
+# Device Drivers
+#
+CONFIG_IEEE1394_PCILYNX=m
+CONFIG_IEEE1394_OHCI1394=m
+
+#
+# Protocol Drivers
+#
+CONFIG_IEEE1394_VIDEO1394=m
+CONFIG_IEEE1394_SBP2=m
+# CONFIG_IEEE1394_SBP2_PHYS_DMA is not set
+CONFIG_IEEE1394_ETH1394=m
+CONFIG_IEEE1394_DV1394=m
+CONFIG_IEEE1394_RAWIO=m
+CONFIG_IEEE1394_CMP=m
+CONFIG_IEEE1394_AMDTP=m
+
+#
+# I2O device support
+#
+CONFIG_I2O=m
+CONFIG_I2O_CONFIG=m
+CONFIG_I2O_BLOCK=m
+CONFIG_I2O_SCSI=m
+CONFIG_I2O_PROC=m
+
+#
+# Networking support
+#
+CONFIG_NET=y
+
+#
+# Networking options
+#
+CONFIG_PACKET=m
+CONFIG_PACKET_MMAP=y
+CONFIG_NETLINK_DEV=m
+CONFIG_UNIX=y
+CONFIG_NET_KEY=m
+CONFIG_INET=y
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_IP_MULTIPLE_TABLES=y
+CONFIG_IP_ROUTE_FWMARK=y
+CONFIG_IP_ROUTE_NAT=y
+CONFIG_IP_ROUTE_MULTIPATH=y
+CONFIG_IP_ROUTE_TOS=y
+CONFIG_IP_ROUTE_VERBOSE=y
+# CONFIG_IP_PNP is not set
+CONFIG_NET_IPIP=m
+CONFIG_NET_IPGRE=m
+CONFIG_NET_IPGRE_BROADCAST=y
+CONFIG_IP_MROUTE=y
+CONFIG_IP_PIMSM_V1=y
+CONFIG_IP_PIMSM_V2=y
+# CONFIG_ARPD is not set
+CONFIG_SYN_COOKIES=y
+CONFIG_INET_AH=m
+CONFIG_INET_ESP=m
+CONFIG_INET_IPCOMP=m
+# CONFIG_ACCEPT_QUEUES is not set
+
+#
+# IP: Virtual Server Configuration
+#
+CONFIG_IP_VS=m
+# CONFIG_IP_VS_DEBUG is not set
+CONFIG_IP_VS_TAB_BITS=12
+
+#
+# IPVS transport protocol load balancing support
+#
+CONFIG_IP_VS_PROTO_TCP=y
+CONFIG_IP_VS_PROTO_UDP=y
+CONFIG_IP_VS_PROTO_ESP=y
+CONFIG_IP_VS_PROTO_AH=y
+
+#
+# IPVS scheduler
+#
+CONFIG_IP_VS_RR=m
+CONFIG_IP_VS_WRR=m
+CONFIG_IP_VS_LC=m
+CONFIG_IP_VS_WLC=m
+CONFIG_IP_VS_LBLC=m
+CONFIG_IP_VS_LBLCR=m
+CONFIG_IP_VS_DH=m
+CONFIG_IP_VS_SH=m
+CONFIG_IP_VS_SED=m
+CONFIG_IP_VS_NQ=m
+
+#
+# IPVS application helper
+#
+CONFIG_IP_VS_FTP=m
+CONFIG_IPV6=m
+CONFIG_IPV6_SUBTREES=y
+CONFIG_IPV6_PRIVACY=y
+CONFIG_IPV6_NDISC_NEW=y
+CONFIG_INET6_AH=m
+CONFIG_INET6_ESP=m
+CONFIG_INET6_IPCOMP=m
+CONFIG_IPV6_TUNNEL=m
+
+#
+# MOBILE IPv6 (EXPERIMENTAL)
+#
+CONFIG_IPV6_MOBILITY=m
+CONFIG_IPV6_MOBILITY_MN=m
+CONFIG_IPV6_MOBILITY_HA=m
+# CONFIG_IPV6_MOBILITY_DEBUG is not set
+CONFIG_DECNET=m
+CONFIG_DECNET_SIOCGIFCONF=y
+# CONFIG_DECNET_ROUTER is not set
+CONFIG_BRIDGE=m
+CONFIG_NETFILTER=y
+# CONFIG_NETFILTER_DEBUG is not set
+CONFIG_BRIDGE_NETFILTER=y
+
+#
+# IP: Netfilter Configuration
+#
+CONFIG_IP_NF_CONNTRACK=m
+CONFIG_IP_NF_FTP=m
+CONFIG_IP_NF_IRC=m
+CONFIG_IP_NF_TFTP=m
+CONFIG_IP_NF_AMANDA=m
+CONFIG_IP_NF_QUEUE=m
+CONFIG_IP_NF_IPTABLES=m
+CONFIG_IP_NF_MATCH_LIMIT=m
+CONFIG_IP_NF_MATCH_IPRANGE=m
+CONFIG_IP_NF_MATCH_MAC=m
+CONFIG_IP_NF_MATCH_PKTTYPE=m
+CONFIG_IP_NF_MATCH_POLICY=m
+CONFIG_IP_NF_MATCH_MARK=m
+CONFIG_IP_NF_MATCH_MULTIPORT=m
+CONFIG_IP_NF_MATCH_TOS=m
+CONFIG_IP_NF_MATCH_RECENT=m
+CONFIG_IP_NF_MATCH_ECN=m
+CONFIG_IP_NF_MATCH_DSCP=m
+CONFIG_IP_NF_MATCH_AH_ESP=m
+CONFIG_IP_NF_MATCH_LENGTH=m
+CONFIG_IP_NF_MATCH_TTL=m
+CONFIG_IP_NF_MATCH_TCPMSS=m
+CONFIG_IP_NF_MATCH_HELPER=m
+CONFIG_IP_NF_MATCH_STATE=m
+CONFIG_IP_NF_MATCH_CONNTRACK=m
+CONFIG_IP_NF_MATCH_OWNER=m
+CONFIG_IP_NF_MATCH_PHYSDEV=m
+CONFIG_IP_NF_FILTER=m
+CONFIG_IP_NF_TARGET_REJECT=m
+CONFIG_IP_NF_NAT=m
+CONFIG_IP_NF_NAT_NEEDED=y
+CONFIG_IP_NF_TARGET_MASQUERADE=m
+CONFIG_IP_NF_TARGET_REDIRECT=m
+CONFIG_IP_NF_TARGET_NETMAP=m
+CONFIG_IP_NF_TARGET_SAME=m
+# CONFIG_IP_NF_NAT_LOCAL is not set
+CONFIG_IP_NF_NAT_SNMP_BASIC=m
+CONFIG_IP_NF_NAT_IRC=m
+CONFIG_IP_NF_NAT_FTP=m
+CONFIG_IP_NF_NAT_TFTP=m
+CONFIG_IP_NF_NAT_AMANDA=m
+CONFIG_IP_NF_MANGLE=m
+CONFIG_IP_NF_TARGET_TOS=m
+CONFIG_IP_NF_TARGET_ECN=m
+CONFIG_IP_NF_TARGET_DSCP=m
+CONFIG_IP_NF_TARGET_MARK=m
+CONFIG_IP_NF_TARGET_CLASSIFY=m
+CONFIG_IP_NF_TARGET_LOG=m
+CONFIG_IP_NF_TARGET_ULOG=m
+CONFIG_IP_NF_TARGET_TCPMSS=m
+CONFIG_IP_NF_ARPTABLES=m
+CONFIG_IP_NF_ARPFILTER=m
+CONFIG_IP_NF_ARP_MANGLE=m
+CONFIG_IP_NF_COMPAT_IPCHAINS=m
+CONFIG_IP_NF_COMPAT_IPFWADM=m
+CONFIG_IP_NF_CONNTRACK_MARK=y
+CONFIG_IP_NF_TARGET_CONNMARK=m
+CONFIG_IP_NF_MATCH_CONNMARK=m
+CONFIG_IP_NF_TARGET_CLUSTERIP=m
+
+#
+# IPv6: Netfilter Configuration
+#
+CONFIG_IP6_NF_FTP=m
+CONFIG_IP6_NF_QUEUE=m
+CONFIG_IP6_NF_IPTABLES=m
+CONFIG_IP6_NF_MATCH_LIMIT=m
+CONFIG_IP6_NF_MATCH_MAC=m
+CONFIG_IP6_NF_MATCH_RT=m
+CONFIG_IP6_NF_MATCH_OPTS=m
+CONFIG_IP6_NF_MATCH_FRAG=m
+CONFIG_IP6_NF_MATCH_HL=m
+CONFIG_IP6_NF_MATCH_MULTIPORT=m
+CONFIG_IP6_NF_MATCH_OWNER=m
+CONFIG_IP6_NF_MATCH_MARK=m
+CONFIG_IP6_NF_MATCH_IPV6HEADER=m
+CONFIG_IP6_NF_MATCH_AHESP=m
+CONFIG_IP6_NF_MATCH_LENGTH=m
+CONFIG_IP6_NF_MATCH_EUI64=m
+CONFIG_IP6_NF_CONNTRACK=m
+CONFIG_IP6_NF_MATCH_STATE=m
+CONFIG_IP6_NF_FILTER=m
+CONFIG_IP6_NF_TARGET_LOG=m
+CONFIG_IP6_NF_TARGET_REJECT=m
+CONFIG_IP6_NF_MANGLE=m
+CONFIG_IP6_NF_TARGET_MARK=m
+
+#
+# DECnet: Netfilter Configuration
+#
+CONFIG_DECNET_NF_GRABULATOR=m
+
+#
+# Bridge: Netfilter Configuration
+#
+CONFIG_BRIDGE_NF_EBTABLES=m
+CONFIG_BRIDGE_EBT_BROUTE=m
+CONFIG_BRIDGE_EBT_T_FILTER=m
+CONFIG_BRIDGE_EBT_T_NAT=m
+CONFIG_BRIDGE_EBT_802_3=m
+CONFIG_BRIDGE_EBT_AMONG=m
+CONFIG_BRIDGE_EBT_ARP=m
+CONFIG_BRIDGE_EBT_IP=m
+CONFIG_BRIDGE_EBT_LIMIT=m
+CONFIG_BRIDGE_EBT_MARK=m
+CONFIG_BRIDGE_EBT_PKTTYPE=m
+CONFIG_BRIDGE_EBT_STP=m
+CONFIG_BRIDGE_EBT_VLAN=m
+CONFIG_BRIDGE_EBT_ARPREPLY=m
+CONFIG_BRIDGE_EBT_DNAT=m
+CONFIG_BRIDGE_EBT_MARK_T=m
+CONFIG_BRIDGE_EBT_REDIRECT=m
+CONFIG_BRIDGE_EBT_SNAT=m
+CONFIG_BRIDGE_EBT_LOG=m
+CONFIG_XFRM=y
+CONFIG_XFRM_USER=m
+
+#
+# SCTP Configuration (EXPERIMENTAL)
+#
+CONFIG_IP_SCTP=m
+# CONFIG_SCTP_DBG_MSG is not set
+# CONFIG_SCTP_DBG_OBJCNT is not set
+# CONFIG_SCTP_HMAC_NONE is not set
+# CONFIG_SCTP_HMAC_SHA1 is not set
+CONFIG_SCTP_HMAC_MD5=y
+CONFIG_ATM=y
+CONFIG_ATM_CLIP=y
+CONFIG_ATM_CLIP_NO_ICMP=y
+CONFIG_ATM_LANE=m
+CONFIG_ATM_MPOA=m
+CONFIG_ATM_BR2684=m
+# CONFIG_ATM_BR2684_IPFILTER is not set
+CONFIG_VLAN_8021Q=m
+CONFIG_LLC=y
+CONFIG_LLC2=m
+CONFIG_IPX=m
+# CONFIG_IPX_INTERN is not set
+CONFIG_ATALK=m
+CONFIG_DEV_APPLETALK=y
+CONFIG_LTPC=m
+CONFIG_COPS=m
+CONFIG_COPS_DAYNA=y
+CONFIG_COPS_TANGENT=y
+CONFIG_IPDDP=m
+CONFIG_IPDDP_ENCAP=y
+CONFIG_IPDDP_DECAP=y
+CONFIG_X25=m
+CONFIG_LAPB=m
+# CONFIG_NET_DIVERT is not set
+CONFIG_ECONET=m
+# CONFIG_ECONET_AUNUDP is not set
+# CONFIG_ECONET_NATIVE is not set
+CONFIG_WAN_ROUTER=m
+# CONFIG_NET_FASTROUTE is not set
+# CONFIG_NET_HW_FLOWCONTROL is not set
+
+#
+# QoS and/or fair queueing
+#
+CONFIG_NET_SCHED=y
+CONFIG_NET_SCH_CBQ=m
+CONFIG_NET_SCH_HTB=m
+CONFIG_NET_SCH_HFSC=m
+CONFIG_NET_SCH_CSZ=m
+CONFIG_NET_SCH_ATM=y
+CONFIG_NET_SCH_PRIO=m
+CONFIG_NET_SCH_RED=m
+CONFIG_NET_SCH_SFQ=m
+CONFIG_NET_SCH_TEQL=m
+CONFIG_NET_SCH_TBF=m
+CONFIG_NET_SCH_GRED=m
+CONFIG_NET_SCH_DSMARK=m
+CONFIG_NET_SCH_DELAY=m
+CONFIG_NET_SCH_INGRESS=m
+CONFIG_NET_QOS=y
+CONFIG_NET_ESTIMATOR=y
+CONFIG_NET_CLS=y
+CONFIG_NET_CLS_TCINDEX=m
+CONFIG_NET_CLS_ROUTE4=m
+CONFIG_NET_CLS_ROUTE=y
+CONFIG_NET_CLS_FW=m
+CONFIG_NET_CLS_U32=m
+CONFIG_NET_CLS_RSVP=m
+CONFIG_NET_CLS_RSVP6=m
+CONFIG_NET_CLS_POLICE=y
+
+#
+# Network testing
+#
+CONFIG_NET_PKTGEN=m
+CONFIG_NETDEVICES=y
+
+#
+# ARCnet devices
+#
+CONFIG_ARCNET=m
+CONFIG_ARCNET_1201=m
+CONFIG_ARCNET_1051=m
+CONFIG_ARCNET_RAW=m
+CONFIG_ARCNET_COM90xx=m
+CONFIG_ARCNET_COM90xxIO=m
+CONFIG_ARCNET_RIM_I=m
+CONFIG_ARCNET_COM20020=m
+CONFIG_ARCNET_COM20020_ISA=m
+CONFIG_ARCNET_COM20020_PCI=m
+CONFIG_DUMMY=m
+CONFIG_BONDING=m
+CONFIG_EQUALIZER=m
+CONFIG_TUN=m
+CONFIG_ETHERTAP=m
+CONFIG_NET_SB1000=m
+
+#
+# Ethernet (10 or 100Mbit)
+#
+CONFIG_NET_ETHERNET=y
+CONFIG_MII=m
+CONFIG_HAPPYMEAL=m
+CONFIG_SUNGEM=m
+CONFIG_NET_VENDOR_3COM=y
+CONFIG_EL1=m
+CONFIG_EL2=m
+CONFIG_ELPLUS=m
+CONFIG_EL16=m
+CONFIG_EL3=m
+CONFIG_3C515=m
+CONFIG_VORTEX=m
+CONFIG_TYPHOON=m
+CONFIG_LANCE=m
+CONFIG_NET_VENDOR_SMC=y
+CONFIG_WD80x3=m
+CONFIG_ULTRA=m
+CONFIG_SMC9194=m
+CONFIG_NET_VENDOR_RACAL=y
+CONFIG_NI52=m
+CONFIG_NI65=m
+
+#
+# Tulip family network device support
+#
+CONFIG_NET_TULIP=y
+CONFIG_DE2104X=m
+CONFIG_TULIP=m
+# CONFIG_TULIP_MWI is not set
+# CONFIG_TULIP_MMIO is not set
+CONFIG_TULIP_NAPI=y
+CONFIG_TULIP_NAPI_HW_MITIGATION=y
+CONFIG_DE4X5=m
+CONFIG_WINBOND_840=m
+CONFIG_DM9102=m
+CONFIG_PCMCIA_XIRCOM=m
+CONFIG_AT1700=m
+CONFIG_DEPCA=m
+CONFIG_HP100=m
+CONFIG_NET_ISA=y
+CONFIG_E2100=m
+CONFIG_EWRK3=m
+CONFIG_EEXPRESS=m
+CONFIG_EEXPRESS_PRO=m
+CONFIG_HPLAN_PLUS=m
+CONFIG_HPLAN=m
+CONFIG_LP486E=m
+CONFIG_ETH16I=m
+CONFIG_NE2000=m
+CONFIG_ZNET=m
+CONFIG_SEEQ8005=m
+CONFIG_NET_PCI=y
+CONFIG_PCNET32=m
+CONFIG_AMD8111_ETH=m
+CONFIG_ADAPTEC_STARFIRE=m
+CONFIG_ADAPTEC_STARFIRE_NAPI=y
+CONFIG_AC3200=m
+CONFIG_APRICOT=m
+CONFIG_B44=m
+CONFIG_FORCEDETH=m
+CONFIG_CS89x0=m
+CONFIG_DGRS=m
+CONFIG_EEPRO100=m
+# CONFIG_EEPRO100_PIO is not set
+CONFIG_E100=m
+CONFIG_E100_NAPI=y
+CONFIG_FEALNX=m
+CONFIG_NATSEMI=m
+CONFIG_NE2K_PCI=m
+CONFIG_8139CP=m
+CONFIG_8139TOO=m
+# CONFIG_8139TOO_PIO is not set
+# CONFIG_8139TOO_TUNE_TWISTER is not set
+CONFIG_8139TOO_8129=y
+# CONFIG_8139_OLD_RX_RESET is not set
+CONFIG_8139_RXBUF_IDX=2
+CONFIG_SIS900=m
+CONFIG_EPIC100=m
+CONFIG_SUNDANCE=m
+# CONFIG_SUNDANCE_MMIO is not set
+CONFIG_TLAN=m
+CONFIG_VIA_RHINE=m
+# CONFIG_VIA_RHINE_MMIO is not set
+CONFIG_NET_POCKET=y
+CONFIG_ATP=m
+CONFIG_DE600=m
+CONFIG_DE620=m
+
+#
+# Ethernet (1000 Mbit)
+#
+CONFIG_ACENIC=m
+# CONFIG_ACENIC_OMIT_TIGON_I is not set
+CONFIG_DL2K=m
+CONFIG_E1000=m
+CONFIG_E1000_NAPI=y
+CONFIG_E1000_NEW=m
+CONFIG_E1000_NEW_NAPI=y
+CONFIG_NS83820=m
+CONFIG_HAMACHI=m
+CONFIG_YELLOWFIN=m
+CONFIG_R8169=m
+CONFIG_SIS190=m
+CONFIG_SK98LIN=m
+CONFIG_TIGON3=m
+CONFIG_NET_BROADCOM=m
+CONFIG_NET_BROADCOM_NEW=m
+CONFIG_NET_BCM44=m
+CONFIG_TIGON3_NEW=m
+
+#
+# Ethernet (10000 Mbit)
+#
+CONFIG_IXGB=m
+CONFIG_IXGB_NAPI=y
+CONFIG_S2IO=m
+CONFIG_S2IO_NAPI=y
+CONFIG_FDDI=y
+# CONFIG_DEFXX is not set
+CONFIG_SKFP=m
+CONFIG_HIPPI=y
+CONFIG_ROADRUNNER=m
+CONFIG_ROADRUNNER_LARGE_RINGS=y
+CONFIG_PLIP=m
+CONFIG_PPP=m
+CONFIG_PPP_MULTILINK=y
+CONFIG_PPP_FILTER=y
+CONFIG_PPP_ASYNC=m
+CONFIG_PPP_SYNC_TTY=m
+CONFIG_PPP_DEFLATE=m
+CONFIG_PPP_BSDCOMP=m
+CONFIG_PPP_MPPE=m
+CONFIG_PPPOE=m
+CONFIG_PPPOATM=m
+CONFIG_SLIP=m
+CONFIG_SLIP_COMPRESSED=y
+CONFIG_SLIP_SMART=y
+CONFIG_SLIP_MODE_SLIP6=y
+
+#
+# Wireless LAN (non-hamradio)
+#
+CONFIG_NET_RADIO=y
+
+#
+# Obsolete Wireless cards support (pre-802.11)
+#
+CONFIG_STRIP=m
+# CONFIG_ARLAN is not set
+CONFIG_WAVELAN=m
+CONFIG_PCMCIA_WAVELAN=m
+CONFIG_PCMCIA_NETWAVE=m
+
+#
+# Wireless 802.11 Frequency Hopping cards support
+#
+CONFIG_PCMCIA_RAYCS=m
+
+#
+# Wireless 802.11b ISA/PCI cards support
+#
+CONFIG_AIRO=m
+CONFIG_HERMES=m
+CONFIG_PLX_HERMES=m
+CONFIG_TMD_HERMES=m
+CONFIG_PCI_HERMES=m
+CONFIG_ATMEL=m
+CONFIG_PCI_ATMEL=m
+
+#
+# Wireless 802.11b Pcmcia/Cardbus cards support
+#
+CONFIG_PCMCIA_HERMES=m
+CONFIG_AIRO_CS=m
+CONFIG_PCMCIA_ATMEL=m
+CONFIG_PCMCIA_WL3501=m
+
+#
+# Prism GT/Duette 802.11(a/b/g) PCI/Cardbus support
+#
+CONFIG_PRISM54=m
+CONFIG_NET_WIRELESS=y
+
+#
+# Token Ring devices
+#
+CONFIG_TR=y
+CONFIG_IBMTR=m
+CONFIG_IBMOL=m
+CONFIG_IBMLS=m
+CONFIG_3C359=m
+CONFIG_TMS380TR=m
+CONFIG_TMSPCI=m
+CONFIG_SKISA=m
+CONFIG_PROTEON=m
+CONFIG_ABYSS=m
+CONFIG_SMCTR=m
+CONFIG_NET_FC=y
+CONFIG_NET_LPFC=m
+CONFIG_RCPCI=m
+CONFIG_SHAPER=m
+CONFIG_NETCONSOLE=m
+
+#
+# Wan interfaces
+#
+CONFIG_WAN=y
+CONFIG_HOSTESS_SV11=m
+# CONFIG_COSA is not set
+CONFIG_DSCC4=m
+CONFIG_DSCC4_PCISYNC=y
+CONFIG_DSCC4_PCI_RST=y
+CONFIG_LANMEDIA=m
+CONFIG_SEALEVEL_4021=m
+CONFIG_SYNCLINK_SYNCPPP=m
+CONFIG_HDLC=m
+CONFIG_HDLC_RAW=y
+CONFIG_HDLC_RAW_ETH=y
+CONFIG_HDLC_CISCO=y
+CONFIG_HDLC_FR=y
+CONFIG_HDLC_PPP=y
+CONFIG_HDLC_X25=y
+CONFIG_PCI200SYN=m
+CONFIG_WANXL=m
+# CONFIG_WANXL_BUILD_FIRMWARE is not set
+CONFIG_PC300=m
+CONFIG_PC300_MLPPP=y
+CONFIG_N2=m
+CONFIG_C101=m
+CONFIG_FARSYNC=m
+CONFIG_DLCI=m
+CONFIG_DLCI_COUNT=24
+CONFIG_DLCI_MAX=8
+CONFIG_SDLA=m
+# CONFIG_WAN_ROUTER_DRIVERS is not set
+CONFIG_LAPBETHER=m
+CONFIG_X25_ASY=m
+# CONFIG_SBNI is not set
+
+#
+# PCMCIA network device support
+#
+CONFIG_NET_PCMCIA=y
+CONFIG_PCMCIA_3C589=m
+CONFIG_PCMCIA_3C574=m
+CONFIG_PCMCIA_FMVJ18X=m
+CONFIG_PCMCIA_PCNET=m
+CONFIG_PCMCIA_NMCLAN=m
+CONFIG_PCMCIA_SMC91C92=m
+CONFIG_PCMCIA_XIRC2PS=m
+CONFIG_PCMCIA_AXNET=m
+CONFIG_ARCNET_COM20020_CS=m
+CONFIG_PCMCIA_IBMTR=m
+
+#
+# ATM drivers
+#
+CONFIG_ATM_TCP=m
+CONFIG_ATM_LANAI=m
+CONFIG_ATM_ENI=m
+# CONFIG_ATM_ENI_DEBUG is not set
+# CONFIG_ATM_ENI_TUNE_BURST is not set
+CONFIG_ATM_FIRESTREAM=m
+CONFIG_ATM_ZATM=m
+# CONFIG_ATM_ZATM_DEBUG is not set
+CONFIG_ATM_NICSTAR=m
+CONFIG_ATM_NICSTAR_USE_SUNI=y
+CONFIG_ATM_NICSTAR_USE_IDT77105=y
+CONFIG_ATM_IDT77252=m
+# CONFIG_ATM_IDT77252_DEBUG is not set
+CONFIG_ATM_IDT77252_RCV_ALL=y
+CONFIG_ATM_IDT77252_USE_SUNI=y
+CONFIG_ATM_AMBASSADOR=m
+# CONFIG_ATM_AMBASSADOR_DEBUG is not set
+CONFIG_ATM_HORIZON=m
+# CONFIG_ATM_HORIZON_DEBUG is not set
+CONFIG_ATM_IA=m
+# CONFIG_ATM_IA_DEBUG is not set
+CONFIG_ATM_FORE200E_MAYBE=m
+CONFIG_ATM_FORE200E_PCA=y
+CONFIG_ATM_FORE200E_PCA_DEFAULT_FW=y
+CONFIG_ATM_FORE200E_TX_RETRY=16
+CONFIG_ATM_FORE200E_DEBUG=0
+CONFIG_ATM_FORE200E=m
+CONFIG_ATM_HE=m
+CONFIG_ATM_HE_USE_SUNI=y
+
+#
+# Amateur Radio support
+#
+CONFIG_HAMRADIO=y
+
+#
+# Packet Radio protocols
+#
+CONFIG_AX25=m
+CONFIG_AX25_DAMA_SLAVE=y
+CONFIG_NETROM=m
+CONFIG_ROSE=m
+
+#
+# AX.25 network device drivers
+#
+CONFIG_BPQETHER=m
+CONFIG_SCC=m
+CONFIG_SCC_DELAY=y
+CONFIG_SCC_TRXECHO=y
+CONFIG_BAYCOM_SER_FDX=m
+CONFIG_BAYCOM_SER_HDX=m
+CONFIG_BAYCOM_PAR=m
+CONFIG_BAYCOM_EPP=m
+CONFIG_YAM=m
+
+#
+# IrDA (infrared) support
+#
+CONFIG_IRDA=m
+
+#
+# IrDA protocols
+#
+CONFIG_IRLAN=m
+CONFIG_IRNET=m
+CONFIG_IRCOMM=m
+CONFIG_IRDA_ULTRA=y
+
+#
+# IrDA options
+#
+CONFIG_IRDA_CACHE_LAST_LSAP=y
+# CONFIG_IRDA_FAST_RR is not set
+# CONFIG_IRDA_DEBUG is not set
+
+#
+# Infrared-port device drivers
+#
+
+#
+# SIR device drivers
+#
+CONFIG_IRTTY_SIR=m
+
+#
+# Dongle support
+#
+CONFIG_DONGLE=y
+CONFIG_ESI_DONGLE=m
+CONFIG_ACTISYS_DONGLE=m
+CONFIG_TEKRAM_DONGLE=m
+CONFIG_LITELINK_DONGLE=m
+CONFIG_MA600_DONGLE=m
+CONFIG_GIRBIL_DONGLE=m
+CONFIG_MCP2120_DONGLE=m
+CONFIG_OLD_BELKIN_DONGLE=m
+CONFIG_ACT200L_DONGLE=m
+
+#
+# Old SIR device drivers
+#
+
+#
+# Old Serial dongle support
+#
+
+#
+# FIR device drivers
+#
+CONFIG_USB_IRDA=m
+CONFIG_SIGMATEL_FIR=m
+CONFIG_NSC_FIR=m
+CONFIG_WINBOND_FIR=m
+CONFIG_TOSHIBA_FIR=m
+CONFIG_SMC_IRCC_FIR=m
+CONFIG_ALI_FIR=m
+CONFIG_VLSI_FIR=m
+CONFIG_VIA_FIR=m
+
+#
+# Bluetooth support
+#
+CONFIG_BT=m
+CONFIG_BT_L2CAP=m
+CONFIG_BT_SCO=m
+CONFIG_BT_RFCOMM=m
+CONFIG_BT_RFCOMM_TTY=y
+CONFIG_BT_BNEP=m
+CONFIG_BT_BNEP_MC_FILTER=y
+CONFIG_BT_BNEP_PROTO_FILTER=y
+CONFIG_BT_CMTP=m
+
+#
+# Bluetooth device drivers
+#
+CONFIG_BT_HCIUSB=m
+CONFIG_BT_HCIUSB_SCO=y
+CONFIG_BT_HCIUART=m
+CONFIG_BT_HCIUART_H4=y
+CONFIG_BT_HCIUART_BCSP=y
+CONFIG_BT_HCIUART_BCSP_TXCRC=y
+CONFIG_BT_HCIBCM203X=m
+CONFIG_BT_HCIBFUSB=m
+CONFIG_BT_HCIDTL1=m
+CONFIG_BT_HCIBT3C=m
+CONFIG_BT_HCIBLUECARD=m
+CONFIG_BT_HCIBTUART=m
+CONFIG_BT_HCIVHCI=m
+CONFIG_NETPOLL=y
+CONFIG_NETPOLL_RX=y
+CONFIG_NETPOLL_TRAP=y
+CONFIG_NET_POLL_CONTROLLER=y
+
+#
+# ISDN subsystem
+#
+CONFIG_ISDN=m
+
+#
+# Old ISDN4Linux
+#
+CONFIG_ISDN_I4L=m
+CONFIG_ISDN_PPP=y
+CONFIG_ISDN_PPP_VJ=y
+CONFIG_ISDN_MPP=y
+CONFIG_IPPP_FILTER=y
+CONFIG_ISDN_PPP_BSDCOMP=m
+CONFIG_ISDN_AUDIO=y
+CONFIG_ISDN_TTY_FAX=y
+CONFIG_ISDN_X25=y
+
+#
+# ISDN feature submodules
+#
+
+#
+# ISDN4Linux hardware drivers
+#
+
+#
+# Passive cards
+#
+CONFIG_ISDN_DRV_HISAX=m
+
+#
+# D-channel protocol features
+#
+CONFIG_HISAX_EURO=y
+CONFIG_DE_AOC=y
+# CONFIG_HISAX_NO_SENDCOMPLETE is not set
+# CONFIG_HISAX_NO_LLC is not set
+# CONFIG_HISAX_NO_KEYPAD is not set
+CONFIG_HISAX_1TR6=y
+CONFIG_HISAX_NI1=y
+CONFIG_HISAX_MAX_CARDS=8
+
+#
+# HiSax supported cards
+#
+CONFIG_HISAX_16_0=y
+CONFIG_HISAX_16_3=y
+CONFIG_HISAX_TELESPCI=y
+CONFIG_HISAX_S0BOX=y
+CONFIG_HISAX_AVM_A1=y
+CONFIG_HISAX_FRITZPCI=y
+CONFIG_HISAX_AVM_A1_PCMCIA=y
+CONFIG_HISAX_ELSA=y
+CONFIG_HISAX_IX1MICROR2=y
+CONFIG_HISAX_DIEHLDIVA=y
+CONFIG_HISAX_ASUSCOM=y
+CONFIG_HISAX_TELEINT=y
+CONFIG_HISAX_HFCS=y
+CONFIG_HISAX_SEDLBAUER=y
+CONFIG_HISAX_SPORTSTER=y
+CONFIG_HISAX_MIC=y
+CONFIG_HISAX_NETJET=y
+CONFIG_HISAX_NETJET_U=y
+CONFIG_HISAX_NICCY=y
+CONFIG_HISAX_ISURF=y
+CONFIG_HISAX_HSTSAPHIR=y
+CONFIG_HISAX_BKM_A4T=y
+CONFIG_HISAX_SCT_QUADRO=y
+CONFIG_HISAX_GAZEL=y
+CONFIG_HISAX_HFC_PCI=y
+CONFIG_HISAX_W6692=y
+CONFIG_HISAX_HFC_SX=y
+CONFIG_HISAX_ENTERNOW_PCI=y
+CONFIG_HISAX_DEBUG=y
+
+#
+# HiSax PCMCIA card service modules
+#
+CONFIG_HISAX_SEDLBAUER_CS=m
+CONFIG_HISAX_ELSA_CS=m
+CONFIG_HISAX_AVM_A1_CS=m
+CONFIG_HISAX_TELES_CS=m
+
+#
+# HiSax sub driver modules
+#
+CONFIG_HISAX_ST5481=m
+CONFIG_HISAX_HFCUSB=m
+CONFIG_HISAX_FRITZ_PCIPNP=m
+CONFIG_HISAX_HDLC=y
+
+#
+# Active cards
+#
+CONFIG_ISDN_DRV_ICN=m
+CONFIG_ISDN_DRV_PCBIT=m
+CONFIG_ISDN_DRV_SC=m
+CONFIG_ISDN_DRV_ACT2000=m
+CONFIG_ISDN_DRV_TPAM=m
+
+#
+# CAPI subsystem
+#
+CONFIG_ISDN_CAPI=m
+CONFIG_ISDN_DRV_AVMB1_VERBOSE_REASON=y
+CONFIG_ISDN_CAPI_MIDDLEWARE=y
+CONFIG_ISDN_CAPI_CAPI20=m
+CONFIG_ISDN_CAPI_CAPIFS_BOOL=y
+CONFIG_ISDN_CAPI_CAPIFS=m
+CONFIG_ISDN_CAPI_CAPIDRV=m
+
+#
+# CAPI hardware drivers
+#
+
+#
+# Active AVM cards
+#
+CONFIG_CAPI_AVM=y
+CONFIG_ISDN_DRV_AVMB1_B1ISA=m
+CONFIG_ISDN_DRV_AVMB1_B1PCI=m
+CONFIG_ISDN_DRV_AVMB1_B1PCIV4=y
+CONFIG_ISDN_DRV_AVMB1_T1ISA=m
+CONFIG_ISDN_DRV_AVMB1_B1PCMCIA=m
+CONFIG_ISDN_DRV_AVMB1_AVM_CS=m
+CONFIG_ISDN_DRV_AVMB1_T1PCI=m
+CONFIG_ISDN_DRV_AVMB1_C4=m
+
+#
+# Active Eicon DIVA Server cards
+#
+CONFIG_CAPI_EICON=y
+CONFIG_ISDN_DIVAS=m
+CONFIG_ISDN_DIVAS_BRIPCI=y
+CONFIG_ISDN_DIVAS_PRIPCI=y
+CONFIG_ISDN_DIVAS_DIVACAPI=m
+CONFIG_ISDN_DIVAS_USERIDI=m
+CONFIG_ISDN_DIVAS_MAINT=m
+
+#
+# Telephony Support
+#
+CONFIG_PHONE=m
+CONFIG_PHONE_IXJ=m
+CONFIG_PHONE_IXJ_PCMCIA=m
+
+#
+# Input device support
+#
+CONFIG_INPUT=y
+
+#
+# Userland interfaces
+#
+CONFIG_INPUT_MOUSEDEV=y
+CONFIG_INPUT_MOUSEDEV_PSAUX=y
+CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024
+CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768
+CONFIG_INPUT_JOYDEV=m
+CONFIG_INPUT_TSDEV=m
+CONFIG_INPUT_TSDEV_SCREEN_X=240
+CONFIG_INPUT_TSDEV_SCREEN_Y=320
+CONFIG_INPUT_EVDEV=m
+# CONFIG_INPUT_EVBUG is not set
+
+#
+# Input I/O drivers
+#
+CONFIG_GAMEPORT=m
+CONFIG_SOUND_GAMEPORT=m
+CONFIG_GAMEPORT_NS558=m
+CONFIG_GAMEPORT_L4=m
+CONFIG_GAMEPORT_EMU10K1=m
+CONFIG_GAMEPORT_VORTEX=m
+CONFIG_GAMEPORT_FM801=m
+CONFIG_GAMEPORT_CS461x=m
+CONFIG_SERIO=y
+CONFIG_SERIO_I8042=y
+CONFIG_SERIO_SERPORT=m
+CONFIG_SERIO_CT82C710=m
+CONFIG_SERIO_PARKBD=m
+CONFIG_SERIO_PCIPS2=m
+
+#
+# Input Device Drivers
+#
+CONFIG_INPUT_KEYBOARD=y
+CONFIG_KEYBOARD_ATKBD=y
+CONFIG_KEYBOARD_SUNKBD=m
+# CONFIG_KEYBOARD_LKKBD is not set
+CONFIG_KEYBOARD_XTKBD=m
+CONFIG_KEYBOARD_NEWTON=m
+CONFIG_INPUT_MOUSE=y
+CONFIG_MOUSE_PS2=y
+CONFIG_MOUSE_SERIAL=m
+CONFIG_MOUSE_INPORT=m
+CONFIG_MOUSE_ATIXL=y
+CONFIG_MOUSE_LOGIBM=m
+CONFIG_MOUSE_PC110PAD=m
+# CONFIG_MOUSE_VSXXXAA is not set
+CONFIG_INPUT_JOYSTICK=y
+CONFIG_JOYSTICK_ANALOG=m
+CONFIG_JOYSTICK_A3D=m
+CONFIG_JOYSTICK_ADI=m
+CONFIG_JOYSTICK_COBRA=m
+CONFIG_JOYSTICK_GF2K=m
+CONFIG_JOYSTICK_GRIP=m
+CONFIG_JOYSTICK_GRIP_MP=m
+CONFIG_JOYSTICK_GUILLEMOT=m
+CONFIG_JOYSTICK_INTERACT=m
+CONFIG_JOYSTICK_SIDEWINDER=m
+CONFIG_JOYSTICK_TMDC=m
+CONFIG_JOYSTICK_IFORCE=m
+CONFIG_JOYSTICK_IFORCE_USB=y
+CONFIG_JOYSTICK_IFORCE_232=y
+CONFIG_JOYSTICK_WARRIOR=m
+CONFIG_JOYSTICK_MAGELLAN=m
+CONFIG_JOYSTICK_SPACEORB=m
+CONFIG_JOYSTICK_SPACEBALL=m
+CONFIG_JOYSTICK_STINGER=m
+CONFIG_JOYSTICK_TWIDDLER=m
+CONFIG_JOYSTICK_DB9=m
+CONFIG_JOYSTICK_GAMECON=m
+CONFIG_JOYSTICK_TURBOGRAFX=m
+# CONFIG_INPUT_JOYDUMP is not set
+CONFIG_INPUT_TOUCHSCREEN=y
+CONFIG_TOUCHSCREEN_GUNZE=m
+CONFIG_INPUT_MISC=y
+CONFIG_INPUT_PCSPKR=y
+CONFIG_INPUT_UINPUT=m
+
+#
+# Character devices
+#
+CONFIG_VT=y
+CONFIG_VT_CONSOLE=y
+CONFIG_HW_CONSOLE=y
+CONFIG_ECC=m
+CONFIG_SERIAL_NONSTANDARD=y
+CONFIG_ROCKETPORT=m
+CONFIG_SYNCLINK=m
+CONFIG_SYNCLINKMP=m
+CONFIG_N_HDLC=m
+CONFIG_STALDRV=y
+
+#
+# Serial drivers
+#
+CONFIG_SERIAL_8250=y
+CONFIG_SERIAL_8250_CONSOLE=y
+CONFIG_SERIAL_8250_CS=m
+# CONFIG_SERIAL_8250_ACPI is not set
+CONFIG_SERIAL_8250_NR_UARTS=4
+CONFIG_SERIAL_8250_EXTENDED=y
+CONFIG_SERIAL_8250_MANY_PORTS=y
+CONFIG_SERIAL_8250_SHARE_IRQ=y
+# CONFIG_SERIAL_8250_DETECT_IRQ is not set
+CONFIG_SERIAL_8250_MULTIPORT=y
+CONFIG_SERIAL_8250_RSA=y
+
+#
+# Non-8250 serial port support
+#
+CONFIG_SERIAL_CORE=y
+CONFIG_SERIAL_CORE_CONSOLE=y
+# CONFIG_SERIAL_ICOM is not set
+CONFIG_SERIAL_JSM=m
+CONFIG_UNIX98_PTYS=y
+CONFIG_LEGACY_PTYS=y
+CONFIG_LEGACY_PTY_COUNT=256
+CONFIG_PRINTER=m
+# CONFIG_LP_CONSOLE is not set
+CONFIG_PPDEV=m
+CONFIG_TIPAR=m
+CONFIG_QIC02_TAPE=m
+CONFIG_QIC02_DYNCONF=y
+
+#
+# Setting runtime QIC-02 configuration is done with qic02conf
+#
+
+#
+# from the tpqic02-support package. It is available at
+#
+
+#
+# metalab.unc.edu or ftp://titus.cfw.com/pub/Linux/util/
+#
+
+#
+# IPMI
+#
+CONFIG_IPMI_HANDLER=m
+CONFIG_IPMI_PANIC_EVENT=y
+CONFIG_IPMI_PANIC_STRING=y
+CONFIG_IPMI_DEVICE_INTERFACE=m
+CONFIG_IPMI_KCS=m
+CONFIG_IPMI_WATCHDOG=m
+
+#
+# Watchdog Cards
+#
+CONFIG_WATCHDOG=y
+# CONFIG_WATCHDOG_NOWAYOUT is not set
+
+#
+# Watchdog Device Drivers
+#
+CONFIG_SOFT_WATCHDOG=m
+CONFIG_ACQUIRE_WDT=m
+CONFIG_ADVANTECH_WDT=m
+CONFIG_ALIM1535_WDT=m
+CONFIG_ALIM7101_WDT=m
+CONFIG_AMD7XX_TCO=m
+CONFIG_SC520_WDT=m
+CONFIG_EUROTECH_WDT=m
+CONFIG_IB700_WDT=m
+CONFIG_WAFER_WDT=m
+CONFIG_I8XX_TCO=m
+CONFIG_SC1200_WDT=m
+CONFIG_SCx200_WDT=m
+CONFIG_60XX_WDT=m
+CONFIG_CPU5_WDT=m
+CONFIG_W83627HF_WDT=m
+CONFIG_W83877F_WDT=m
+CONFIG_MACHZ_WDT=m
+
+#
+# ISA-based Watchdog Cards
+#
+CONFIG_PCWATCHDOG=m
+CONFIG_MIXCOMWD=m
+CONFIG_WDT=m
+CONFIG_WDT_501=y
+
+#
+# PCI-based Watchdog Cards
+#
+CONFIG_PCIPCWATCHDOG=m
+CONFIG_WDTPCI=m
+CONFIG_WDT_501_PCI=y
+
+#
+# USB-based Watchdog Cards
+#
+CONFIG_USBPCWATCHDOG=m
+CONFIG_HW_RANDOM=m
+CONFIG_NVRAM=m
+CONFIG_RTC=y
+CONFIG_DTLK=m
+CONFIG_R3964=m
+CONFIG_APPLICOM=m
+CONFIG_SONYPI=m
+
+#
+# Ftape, the floppy tape device driver
+#
+CONFIG_AGP=m
+CONFIG_AGP_ALI=m
+CONFIG_AGP_ATI=m
+CONFIG_AGP_AMD=m
+CONFIG_AGP_AMD64=m
+CONFIG_AGP_INTEL=m
+CONFIG_AGP_INTEL_MCH=m
+CONFIG_AGP_NVIDIA=m
+CONFIG_AGP_SIS=m
+CONFIG_AGP_SWORKS=m
+CONFIG_AGP_VIA=m
+CONFIG_AGP_EFFICEON=m
+# CONFIG_DRM is not set
+
+#
+# PCMCIA character devices
+#
+CONFIG_SYNCLINK_CS=m
+# CONFIG_MWAVE is not set
+CONFIG_SCx200_GPIO=m
+CONFIG_RAW_DRIVER=m
+CONFIG_MAX_RAW_DEVS=4096
+CONFIG_HANGCHECK_TIMER=m
+CONFIG_VTUNE=m
+
+#
+# Linux InfraRed Controller
+#
+CONFIG_LIRC_SUPPORT=m
+CONFIG_LIRC_MAX_DEV=2
+CONFIG_LIRC_BT829=m
+CONFIG_LIRC_IT87=m
+CONFIG_LIRC_ATIUSB=m
+CONFIG_LIRC_SERIAL=m
+# CONFIG_LIRC_HOMEBREW is not set
+CONFIG_LIRC_PORT_SERIAL=0x3f8
+CONFIG_LIRC_IRQ_SERIAL=4
+CONFIG_LIRC_SIR=m
+CONFIG_LIRC_PORT_SIR=0x3f8
+CONFIG_LIRC_IRQ_SIR=4
+
+#
+# I2C support
+#
+CONFIG_I2C=m
+CONFIG_I2C_CHARDEV=m
+
+#
+# I2C Algorithms
+#
+CONFIG_I2C_ALGOBIT=m
+CONFIG_I2C_ALGOPCF=m
+
+#
+# I2C Hardware Bus support
+#
+CONFIG_I2C_ALI1535=m
+CONFIG_I2C_ALI15X3=m
+CONFIG_I2C_AMD756=m
+CONFIG_I2C_AMD8111=m
+CONFIG_I2C_I801=m
+CONFIG_I2C_I810=m
+CONFIG_I2C_ISA=m
+CONFIG_I2C_NFORCE2=m
+CONFIG_I2C_PARPORT=m
+CONFIG_I2C_PARPORT_LIGHT=m
+CONFIG_I2C_PIIX4=m
+CONFIG_I2C_PROSAVAGE=m
+CONFIG_I2C_SAVAGE4=m
+CONFIG_SCx200_I2C=m
+CONFIG_SCx200_I2C_SCL=12
+CONFIG_SCx200_I2C_SDA=13
+CONFIG_SCx200_ACB=m
+CONFIG_I2C_SIS5595=m
+CONFIG_I2C_SIS630=m
+CONFIG_I2C_SIS96X=m
+CONFIG_I2C_VIA=m
+CONFIG_I2C_VIAPRO=m
+CONFIG_I2C_VOODOO3=m
+
+#
+# Hardware Sensors Chip support
+#
+CONFIG_I2C_SENSOR=m
+CONFIG_SENSORS_ADM1021=m
+CONFIG_SENSORS_ASB100=m
+CONFIG_SENSORS_DS1621=m
+CONFIG_SENSORS_FSCHER=m
+CONFIG_SENSORS_GL518SM=m
+CONFIG_SENSORS_IT87=m
+CONFIG_SENSORS_LM75=m
+CONFIG_SENSORS_LM78=m
+CONFIG_SENSORS_LM80=m
+CONFIG_SENSORS_LM83=m
+CONFIG_SENSORS_LM85=m
+CONFIG_SENSORS_LM90=m
+CONFIG_SENSORS_VIA686A=m
+CONFIG_SENSORS_W83781D=m
+CONFIG_SENSORS_W83L785TS=m
+CONFIG_SENSORS_W83627HF=m
+
+#
+# Other I2C Chip support
+#
+CONFIG_SENSORS_EEPROM=m
+# CONFIG_I2C_DEBUG_CORE is not set
+# CONFIG_I2C_DEBUG_ALGO is not set
+# CONFIG_I2C_DEBUG_BUS is not set
+# CONFIG_I2C_DEBUG_CHIP is not set
+
+#
+# Misc devices
+#
+CONFIG_IBM_ASM=m
+
+#
+# Multimedia devices
+#
+CONFIG_VIDEO_DEV=m
+
+#
+# Video For Linux
+#
+
+#
+# Video Adapters
+#
+CONFIG_VIDEO_BT848=m
+CONFIG_VIDEO_PMS=m
+CONFIG_VIDEO_BWQCAM=m
+CONFIG_VIDEO_CQCAM=m
+CONFIG_VIDEO_W9966=m
+CONFIG_VIDEO_CPIA=m
+CONFIG_VIDEO_CPIA_PP=m
+CONFIG_VIDEO_CPIA_USB=m
+CONFIG_VIDEO_SAA5246A=m
+CONFIG_VIDEO_SAA5249=m
+CONFIG_TUNER_3036=m
+CONFIG_VIDEO_STRADIS=m
+CONFIG_VIDEO_ZORAN=m
+CONFIG_VIDEO_ZORAN_BUZ=m
+CONFIG_VIDEO_ZORAN_DC10=m
+CONFIG_VIDEO_ZORAN_DC30=m
+CONFIG_VIDEO_ZORAN_LML33=m
+CONFIG_VIDEO_ZORAN_LML33R10=m
+CONFIG_VIDEO_SAA7134=m
+CONFIG_VIDEO_MXB=m
+CONFIG_VIDEO_DPC=m
+CONFIG_VIDEO_HEXIUM_ORION=m
+CONFIG_VIDEO_HEXIUM_GEMINI=m
+CONFIG_VIDEO_CX88=m
+
+#
+# Radio Adapters
+#
+CONFIG_RADIO_CADET=m
+CONFIG_RADIO_RTRACK=m
+CONFIG_RADIO_RTRACK2=m
+CONFIG_RADIO_AZTECH=m
+CONFIG_RADIO_GEMTEK=m
+CONFIG_RADIO_GEMTEK_PCI=m
+CONFIG_RADIO_MAXIRADIO=m
+CONFIG_RADIO_MAESTRO=m
+CONFIG_RADIO_MIROPCM20=m
+# CONFIG_RADIO_MIROPCM20_RDS is not set
+CONFIG_RADIO_SF16FMI=m
+CONFIG_RADIO_SF16FMR2=m
+CONFIG_RADIO_TERRATEC=m
+CONFIG_RADIO_TRUST=m
+CONFIG_RADIO_TYPHOON=m
+CONFIG_RADIO_TYPHOON_PROC_FS=y
+CONFIG_RADIO_ZOLTRIX=m
+
+#
+# Digital Video Broadcasting Devices
+#
+CONFIG_DVB=y
+CONFIG_DVB_CORE=m
+
+#
+# Supported Frontend Modules
+#
+CONFIG_DVB_TWINHAN_DST=m
+CONFIG_DVB_STV0299=m
+CONFIG_DVB_SP887X=m
+CONFIG_DVB_SP887X_FIRMWARE_FILE="/etc/dvb/sc_main.mc"
+CONFIG_DVB_ALPS_TDLB7=m
+CONFIG_DVB_ALPS_TDMB7=m
+CONFIG_DVB_ATMEL_AT76C651=m
+CONFIG_DVB_CX24110=m
+CONFIG_DVB_GRUNDIG_29504_491=m
+CONFIG_DVB_GRUNDIG_29504_401=m
+CONFIG_DVB_MT312=m
+CONFIG_DVB_VES1820=m
+CONFIG_DVB_VES1X93=m
+CONFIG_DVB_TDA1004X=m
+CONFIG_DVB_TDA1004X_FIRMWARE_FILE="/usr/lib/hotplug/firmware/tda1004x.bin"
+CONFIG_DVB_NXT6000=m
+
+#
+# Supported SAA7146 based PCI Adapters
+#
+CONFIG_DVB_AV7110=m
+# CONFIG_DVB_AV7110_FIRMWARE is not set
+CONFIG_DVB_AV7110_OSD=y
+CONFIG_DVB_BUDGET=m
+CONFIG_DVB_BUDGET_CI=m
+CONFIG_DVB_BUDGET_AV=m
+CONFIG_DVB_BUDGET_PATCH=m
+
+#
+# Supported USB Adapters
+#
+CONFIG_DVB_TTUSB_BUDGET=m
+CONFIG_DVB_TTUSB_DEC=m
+
+#
+# Supported FlexCopII (B2C2) Adapters
+#
+CONFIG_DVB_B2C2_SKYSTAR=m
+
+#
+# Supported BT878 Adapters
+#
+CONFIG_DVB_BT8XX=m
+CONFIG_VIDEO_SAA7146=m
+CONFIG_VIDEO_SAA7146_VV=m
+CONFIG_VIDEO_VIDEOBUF=m
+CONFIG_VIDEO_TUNER=m
+CONFIG_VIDEO_BUF=m
+CONFIG_VIDEO_BTCX=m
+CONFIG_VIDEO_IR=m
+
+#
+# Graphics support
+#
+CONFIG_FB=y
+CONFIG_FB_PM2=m
+CONFIG_FB_PM2_FIFO_DISCONNECT=y
+CONFIG_FB_CYBER2000=m
+CONFIG_FB_IMSTT=y
+CONFIG_FB_VGA16=m
+CONFIG_FB_VESA=y
+CONFIG_VIDEO_SELECT=y
+CONFIG_FB_HGA=m
+CONFIG_FB_RIVA=m
+CONFIG_FB_I810=m
+CONFIG_FB_I810_GTF=y
+# CONFIG_FB_MATROX is not set
+# CONFIG_FB_RADEON_OLD is not set
+CONFIG_FB_RADEON=m
+CONFIG_FB_RADEON_I2C=y
+# CONFIG_FB_RADEON_DEBUG is not set
+# CONFIG_FB_ATY128 is not set
+CONFIG_FB_ATY=m
+CONFIG_FB_ATY_CT=y
+CONFIG_FB_ATY_GX=y
+CONFIG_FB_ATY_XL_INIT=y
+CONFIG_FB_SIS=m
+CONFIG_FB_SIS_300=y
+CONFIG_FB_SIS_315=y
+CONFIG_FB_NEOMAGIC=m
+CONFIG_FB_KYRO=m
+CONFIG_FB_3DFX=m
+CONFIG_FB_VOODOO1=m
+CONFIG_FB_TRIDENT=m
+# CONFIG_FB_VIRTUAL is not set
+
+#
+# Console display driver support
+#
+CONFIG_VGA_CONSOLE=y
+CONFIG_MDA_CONSOLE=m
+CONFIG_DUMMY_CONSOLE=y
+CONFIG_FRAMEBUFFER_CONSOLE=y
+CONFIG_PCI_CONSOLE=y
+# CONFIG_FONTS is not set
+CONFIG_FONT_8x8=y
+CONFIG_FONT_8x16=y
+
+#
+# Logo configuration
+#
+# CONFIG_LOGO is not set
+
+#
+# Bootsplash configuration
+#
+CONFIG_BOOTSPLASH=y
+
+#
+# Sound
+#
+CONFIG_SOUND=m
+
+#
+# Advanced Linux Sound Architecture
+#
+CONFIG_SND=m
+CONFIG_SND_TIMER=m
+CONFIG_SND_PCM=m
+CONFIG_SND_HWDEP=m
+CONFIG_SND_RAWMIDI=m
+CONFIG_SND_SEQUENCER=m
+CONFIG_SND_SEQ_DUMMY=m
+CONFIG_SND_OSSEMUL=y
+CONFIG_SND_MIXER_OSS=m
+CONFIG_SND_PCM_OSS=m
+CONFIG_SND_SEQUENCER_OSS=y
+CONFIG_SND_RTCTIMER=m
+CONFIG_SND_VERBOSE_PRINTK=y
+CONFIG_SND_DEBUG=y
+CONFIG_SND_DEBUG_MEMORY=y
+# CONFIG_SND_DEBUG_DETECT is not set
+
+#
+# Generic devices
+#
+CONFIG_SND_MPU401_UART=m
+CONFIG_SND_OPL3_LIB=m
+CONFIG_SND_OPL4_LIB=m
+CONFIG_SND_VX_LIB=m
+CONFIG_SND_DUMMY=m
+CONFIG_SND_VIRMIDI=m
+CONFIG_SND_MTPAV=m
+CONFIG_SND_SERIAL_U16550=m
+CONFIG_SND_MPU401=m
+
+#
+# ISA devices
+#
+CONFIG_SND_AD1816A=m
+CONFIG_SND_AD1848=m
+CONFIG_SND_CS4231=m
+CONFIG_SND_CS4232=m
+CONFIG_SND_CS4236=m
+CONFIG_SND_ES968=m
+CONFIG_SND_ES1688=m
+CONFIG_SND_ES18XX=m
+CONFIG_SND_GUSCLASSIC=m
+CONFIG_SND_GUSEXTREME=m
+CONFIG_SND_GUSMAX=m
+CONFIG_SND_INTERWAVE=m
+CONFIG_SND_INTERWAVE_STB=m
+CONFIG_SND_OPTI92X_AD1848=m
+CONFIG_SND_OPTI92X_CS4231=m
+CONFIG_SND_OPTI93X=m
+CONFIG_SND_SB8=m
+CONFIG_SND_SB16=m
+CONFIG_SND_SBAWE=m
+CONFIG_SND_SB16_CSP=y
+CONFIG_SND_WAVEFRONT=m
+CONFIG_SND_ALS100=m
+CONFIG_SND_AZT2320=m
+CONFIG_SND_CMI8330=m
+CONFIG_SND_DT019X=m
+CONFIG_SND_OPL3SA2=m
+CONFIG_SND_SGALAXY=m
+CONFIG_SND_SSCAPE=m
+
+#
+# PCI devices
+#
+CONFIG_SND_AC97_CODEC=m
+CONFIG_SND_ALI5451=m
+CONFIG_SND_ATIIXP=m
+CONFIG_SND_AU8810=m
+CONFIG_SND_AU8820=m
+CONFIG_SND_AU8830=m
+CONFIG_SND_AZT3328=m
+CONFIG_SND_BT87X=m
+CONFIG_SND_CS46XX=m
+CONFIG_SND_CS46XX_NEW_DSP=y
+CONFIG_SND_CS4281=m
+CONFIG_SND_EMU10K1=m
+CONFIG_SND_KORG1212=m
+CONFIG_SND_MIXART=m
+CONFIG_SND_NM256=m
+CONFIG_SND_RME32=m
+CONFIG_SND_RME96=m
+CONFIG_SND_RME9652=m
+CONFIG_SND_HDSP=m
+CONFIG_SND_TRIDENT=m
+CONFIG_SND_YMFPCI=m
+CONFIG_SND_ALS4000=m
+CONFIG_SND_CMIPCI=m
+CONFIG_SND_ENS1370=m
+CONFIG_SND_ENS1371=m
+CONFIG_SND_ES1938=m
+CONFIG_SND_ES1968=m
+CONFIG_SND_MAESTRO3=m
+CONFIG_SND_FM801=m
+CONFIG_SND_FM801_TEA575X=m
+CONFIG_SND_ICE1712=m
+CONFIG_SND_ICE1724=m
+CONFIG_SND_INTEL8X0=m
+CONFIG_SND_INTEL8X0M=m
+CONFIG_SND_SONICVIBES=m
+CONFIG_SND_VIA82XX=m
+CONFIG_SND_VX222=m
+
+#
+# ALSA USB devices
+#
+CONFIG_SND_USB_AUDIO=m
+
+#
+# PCMCIA devices
+#
+# CONFIG_SND_VXPOCKET is not set
+# CONFIG_SND_VXP440 is not set
+# CONFIG_SND_PDAUDIOCF is not set
+
+#
+# Open Sound System
+#
+CONFIG_SOUND_PRIME=m
+CONFIG_SOUND_BT878=m
+CONFIG_SOUND_CMPCI=m
+CONFIG_SOUND_CMPCI_FM=y
+CONFIG_SOUND_CMPCI_FMIO=0x388
+CONFIG_SOUND_CMPCI_MIDI=y
+CONFIG_SOUND_CMPCI_MPUIO=0x330
+CONFIG_SOUND_CMPCI_JOYSTICK=y
+CONFIG_SOUND_CMPCI_CM8738=y
+# CONFIG_SOUND_CMPCI_SPDIFINVERSE is not set
+CONFIG_SOUND_CMPCI_SPDIFLOOP=y
+CONFIG_SOUND_CMPCI_SPEAKERS=2
+CONFIG_SOUND_EMU10K1=m
+CONFIG_MIDI_EMU10K1=y
+# CONFIG_SOUND_FUSION is not set
+CONFIG_SOUND_CS4281=m
+CONFIG_SOUND_ES1370=m
+CONFIG_SOUND_ES1371=m
+CONFIG_SOUND_ESSSOLO1=m
+CONFIG_SOUND_MAESTRO=m
+CONFIG_SOUND_MAESTRO3=m
+CONFIG_SOUND_ICH=m
+CONFIG_SOUND_SONICVIBES=m
+CONFIG_SOUND_TRIDENT=m
+# CONFIG_SOUND_MSNDCLAS is not set
+# CONFIG_SOUND_MSNDPIN is not set
+CONFIG_SOUND_VIA82CXXX=m
+CONFIG_MIDI_VIA82CXXX=y
+CONFIG_SOUND_OSS=m
+CONFIG_SOUND_TRACEINIT=y
+CONFIG_SOUND_DMAP=y
+# CONFIG_SOUND_AD1816 is not set
+CONFIG_SOUND_AD1889=m
+CONFIG_SOUND_SGALAXY=m
+CONFIG_SOUND_ADLIB=m
+CONFIG_SOUND_ACI_MIXER=m
+CONFIG_SOUND_CS4232=m
+CONFIG_SOUND_SSCAPE=m
+CONFIG_SOUND_GUS=m
+# CONFIG_SOUND_GUS16 is not set
+CONFIG_SOUND_GUSMAX=y
+CONFIG_SOUND_VMIDI=m
+CONFIG_SOUND_TRIX=m
+CONFIG_SOUND_MSS=m
+CONFIG_SOUND_MPU401=m
+CONFIG_SOUND_NM256=m
+CONFIG_SOUND_MAD16=m
+CONFIG_MAD16_OLDCARD=y
+CONFIG_SOUND_PAS=m
+CONFIG_SOUND_PSS=m
+CONFIG_PSS_MIXER=y
+# CONFIG_PSS_HAVE_BOOT is not set
+CONFIG_SOUND_SB=m
+# CONFIG_SOUND_AWE32_SYNTH is not set
+CONFIG_SOUND_WAVEFRONT=m
+CONFIG_SOUND_MAUI=m
+CONFIG_SOUND_YM3812=m
+CONFIG_SOUND_OPL3SA1=m
+CONFIG_SOUND_OPL3SA2=m
+CONFIG_SOUND_YMFPCI=m
+CONFIG_SOUND_YMFPCI_LEGACY=y
+CONFIG_SOUND_UART6850=m
+CONFIG_SOUND_AEDSP16=m
+CONFIG_SC6600=y
+CONFIG_SC6600_JOY=y
+CONFIG_SC6600_CDROM=4
+CONFIG_SC6600_CDROMBASE=0x0
+# CONFIG_AEDSP16_MSS is not set
+# CONFIG_AEDSP16_SBPRO is not set
+CONFIG_AEDSP16_MPU401=y
+CONFIG_SOUND_TVMIXER=m
+CONFIG_SOUND_KAHLUA=m
+CONFIG_SOUND_ALI5455=m
+CONFIG_SOUND_FORTE=m
+CONFIG_SOUND_RME96XX=m
+CONFIG_SOUND_AD1980=m
+
+#
+# USB support
+#
+CONFIG_USB=m
+# CONFIG_USB_DEBUG is not set
+
+#
+# Miscellaneous USB options
+#
+CONFIG_USB_DEVICEFS=y
+# CONFIG_USB_BANDWIDTH is not set
+# CONFIG_USB_DYNAMIC_MINORS is not set
+
+#
+# USB Host Controller Drivers
+#
+CONFIG_USB_EHCI_HCD=m
+CONFIG_USB_EHCI_SPLIT_ISO=y
+CONFIG_USB_EHCI_ROOT_HUB_TT=y
+CONFIG_USB_OHCI_HCD=m
+CONFIG_USB_UHCI_HCD=m
+
+#
+# USB Device Class drivers
+#
+CONFIG_USB_AUDIO=m
+
+#
+# USB Bluetooth TTY can only be used with disabled Bluetooth subsystem
+#
+CONFIG_USB_MIDI=m
+CONFIG_USB_ACM=m
+CONFIG_USB_PRINTER=m
+CONFIG_USB_STORAGE=m
+# CONFIG_USB_STORAGE_DEBUG is not set
+CONFIG_USB_STORAGE_DATAFAB=y
+CONFIG_USB_STORAGE_FREECOM=y
+CONFIG_USB_STORAGE_ISD200=y
+CONFIG_USB_STORAGE_DPCM=y
+CONFIG_USB_STORAGE_HP8200e=y
+CONFIG_USB_STORAGE_SDDR09=y
+CONFIG_USB_STORAGE_SDDR55=y
+CONFIG_USB_STORAGE_JUMPSHOT=y
+
+#
+# USB Human Interface Devices (HID)
+#
+CONFIG_USB_HID=m
+CONFIG_USB_HIDINPUT=y
+CONFIG_HID_FF=y
+CONFIG_HID_PID=y
+CONFIG_LOGITECH_FF=y
+CONFIG_THRUSTMASTER_FF=y
+CONFIG_USB_HIDDEV=y
+
+#
+# USB HID Boot Protocol drivers
+#
+# CONFIG_USB_KBD is not set
+# CONFIG_USB_MOUSE is not set
+CONFIG_USB_AIPTEK=m
+CONFIG_USB_WACOM=m
+CONFIG_USB_KBTAB=m
+CONFIG_USB_POWERMATE=m
+CONFIG_USB_MTOUCH=m
+CONFIG_USB_XPAD=m
+CONFIG_USB_ATI_REMOTE=m
+
+#
+# USB Imaging devices
+#
+CONFIG_USB_MDC800=m
+CONFIG_USB_MICROTEK=m
+CONFIG_USB_HPUSBSCSI=m
+
+#
+# USB Multimedia devices
+#
+CONFIG_USB_DABUSB=m
+CONFIG_USB_VICAM=m
+CONFIG_USB_DSBR=m
+CONFIG_USB_IBMCAM=m
+CONFIG_USB_KONICAWC=m
+CONFIG_USB_OV511=m
+CONFIG_USB_SE401=m
+CONFIG_USB_STV680=m
+CONFIG_USB_W9968CF=m
+
+#
+# USB Network adaptors
+#
+CONFIG_USB_CATC=m
+CONFIG_USB_KAWETH=m
+CONFIG_USB_PEGASUS=m
+CONFIG_USB_RTL8150=m
+CONFIG_USB_USBNET=m
+
+#
+# USB Host-to-Host Cables
+#
+CONFIG_USB_ALI_M5632=y
+CONFIG_USB_AN2720=y
+CONFIG_USB_BELKIN=y
+CONFIG_USB_GENESYS=y
+CONFIG_USB_NET1080=y
+CONFIG_USB_PL2301=y
+
+#
+# Intelligent USB Devices/Gadgets
+#
+CONFIG_USB_ARMLINUX=y
+CONFIG_USB_EPSON2888=y
+CONFIG_USB_ZAURUS=y
+CONFIG_USB_CDCETHER=y
+
+#
+# USB Network Adapters
+#
+CONFIG_USB_AX8817X=y
+
+#
+# USB port drivers
+#
+CONFIG_USB_USS720=m
+
+#
+# USB Serial Converter support
+#
+CONFIG_USB_SERIAL=m
+CONFIG_USB_SERIAL_GENERIC=y
+CONFIG_USB_SERIAL_BELKIN=m
+CONFIG_USB_SERIAL_DIGI_ACCELEPORT=m
+CONFIG_USB_SERIAL_EMPEG=m
+CONFIG_USB_SERIAL_FTDI_SIO=m
+CONFIG_USB_SERIAL_VISOR=m
+CONFIG_USB_SERIAL_IPAQ=m
+CONFIG_USB_SERIAL_IR=m
+CONFIG_USB_SERIAL_EDGEPORT=m
+CONFIG_USB_SERIAL_EDGEPORT_TI=m
+CONFIG_USB_SERIAL_KEYSPAN_PDA=m
+CONFIG_USB_SERIAL_KEYSPAN=m
+CONFIG_USB_SERIAL_KEYSPAN_MPR=y
+CONFIG_USB_SERIAL_KEYSPAN_USA28=y
+CONFIG_USB_SERIAL_KEYSPAN_USA28X=y
+CONFIG_USB_SERIAL_KEYSPAN_USA28XA=y
+CONFIG_USB_SERIAL_KEYSPAN_USA28XB=y
+CONFIG_USB_SERIAL_KEYSPAN_USA19=y
+CONFIG_USB_SERIAL_KEYSPAN_USA18X=y
+CONFIG_USB_SERIAL_KEYSPAN_USA19W=y
+CONFIG_USB_SERIAL_KEYSPAN_USA19QW=y
+CONFIG_USB_SERIAL_KEYSPAN_USA19QI=y
+CONFIG_USB_SERIAL_KEYSPAN_USA49W=y
+CONFIG_USB_SERIAL_KEYSPAN_USA49WLC=y
+CONFIG_USB_SERIAL_KLSI=m
+CONFIG_USB_SERIAL_KOBIL_SCT=m
+CONFIG_USB_SERIAL_MCT_U232=m
+CONFIG_USB_SERIAL_PL2303=m
+CONFIG_USB_SERIAL_SAFE=m
+CONFIG_USB_SERIAL_SAFE_PADDED=y
+CONFIG_USB_SERIAL_CYBERJACK=m
+CONFIG_USB_SERIAL_XIRCOM=m
+CONFIG_USB_SERIAL_OMNINET=m
+CONFIG_USB_EZUSB=y
+
+#
+# USB Miscellaneous drivers
+#
+CONFIG_USB_EMI62=m
+CONFIG_USB_EMI26=m
+CONFIG_USB_TIGL=m
+CONFIG_USB_AUERSWALD=m
+CONFIG_USB_RIO500=m
+CONFIG_USB_LEGOTOWER=m
+CONFIG_USB_LCD=m
+CONFIG_USB_LED=m
+CONFIG_USB_CYTHERM=m
+CONFIG_USB_SPEEDTOUCH=m
+# CONFIG_USB_TEST is not set
+
+#
+# USB Gadget Support
+#
+# CONFIG_USB_GADGET is not set
+
+#
+# InfiniBand support
+#
+CONFIG_INFINIBAND=m
+CONFIG_INFINIBAND_IPOIB=m
+# CONFIG_INFINIBAND_SDP is not set
+CONFIG_INFINIBAND_SRP=m
+CONFIG_INFINIBAND_UDAPL_HELPER=m
+CONFIG_INFINIBAND_MELLANOX_HCA=m
+CONFIG_AUDIT=m
+
+#
+# File systems
+#
+CONFIG_EXT2_FS=y
+CONFIG_EXT2_FS_XATTR=y
+CONFIG_EXT2_FS_POSIX_ACL=y
+CONFIG_EXT2_FS_SECURITY=y
+CONFIG_EXT3_FS=m
+CONFIG_EXT3_FS_XATTR=y
+CONFIG_EXT3_FS_POSIX_ACL=y
+CONFIG_EXT3_FS_SECURITY=y
+CONFIG_JBD=m
+CONFIG_JBD_DEBUG=y
+CONFIG_FS_MBCACHE=y
+CONFIG_REISERFS_FS=m
+# CONFIG_REISERFS_CHECK is not set
+# CONFIG_REISERFS_PROC_INFO is not set
+CONFIG_REISERFS_FS_XATTR=y
+CONFIG_REISERFS_FS_POSIX_ACL=y
+CONFIG_REISERFS_FS_SECURITY=y
+CONFIG_JFS_FS=m
+CONFIG_JFS_POSIX_ACL=y
+CONFIG_JFS_DMAPI=y
+# CONFIG_JFS_DEBUG is not set
+CONFIG_JFS_STATISTICS=y
+CONFIG_FS_POSIX_ACL=y
+CONFIG_XFS_FS=m
+CONFIG_XFS_RT=y
+CONFIG_XFS_QUOTA=m
+CONFIG_XFS_DMAPI=y
+CONFIG_XFS_SECURITY=y
+CONFIG_XFS_POSIX_ACL=y
+CONFIG_MINIX_FS=y
+CONFIG_ROMFS_FS=m
+CONFIG_DMAPI=m
+# CONFIG_DMAPI_DEBUG is not set
+CONFIG_QUOTA=y
+CONFIG_QFMT_V1=m
+CONFIG_QFMT_V2=m
+CONFIG_QUOTACTL=y
+CONFIG_AUTOFS_FS=m
+CONFIG_AUTOFS4_FS=m
+
+#
+# CD-ROM/DVD Filesystems
+#
+CONFIG_ISO9660_FS=y
+CONFIG_JOLIET=y
+CONFIG_ZISOFS=y
+CONFIG_ZISOFS_FS=y
+CONFIG_UDF_FS=m
+
+#
+# DOS/FAT/NT Filesystems
+#
+CONFIG_FAT_FS=m
+CONFIG_MSDOS_FS=m
+CONFIG_VFAT_FS=m
+CONFIG_NTFS_FS=m
+# CONFIG_NTFS_DEBUG is not set
+# CONFIG_NTFS_RW is not set
+
+#
+# Pseudo filesystems
+#
+CONFIG_PROC_FS=y
+CONFIG_PROC_KCORE=y
+# CONFIG_DEVFS_FS is not set
+CONFIG_DEVPTS_FS_XATTR=y
+CONFIG_DEVPTS_FS_SECURITY=y
+CONFIG_TMPFS=y
+CONFIG_HUGETLBFS=y
+CONFIG_HUGETLB_PAGE=y
+CONFIG_RAMFS=y
+CONFIG_RELAYFS_FS=m
+# CONFIG_KLOG_CHANNEL is not set
+
+#
+# Miscellaneous filesystems
+#
+CONFIG_ADFS_FS=m
+# CONFIG_ADFS_FS_RW is not set
+CONFIG_AFFS_FS=m
+CONFIG_HFS_FS=m
+CONFIG_HFSPLUS_FS=m
+CONFIG_BEFS_FS=m
+# CONFIG_BEFS_DEBUG is not set
+CONFIG_BFS_FS=m
+CONFIG_EFS_FS=m
+CONFIG_JFFS_FS=m
+CONFIG_JFFS_FS_VERBOSE=0
+CONFIG_JFFS2_FS=m
+CONFIG_JFFS2_FS_DEBUG=0
+# CONFIG_JFFS2_FS_NAND is not set
+CONFIG_CRAMFS=m
+CONFIG_VXFS_FS=m
+CONFIG_HPFS_FS=m
+CONFIG_QNX4FS_FS=m
+# CONFIG_QNX4FS_RW is not set
+CONFIG_SYSV_FS=m
+CONFIG_UFS_FS=m
+# CONFIG_UFS_FS_WRITE is not set
+
+#
+# Network File Systems
+#
+CONFIG_NFS_FS=y
+CONFIG_NFS_V3=y
+CONFIG_NFS_ACL=y
+CONFIG_NFS_V4=y
+CONFIG_NFS_DIRECTIO=y
+CONFIG_NFSD=m
+CONFIG_NFSD_V3=y
+CONFIG_NFSD_ACL=y
+CONFIG_NFS_ACL_SUPPORT=y
+# CONFIG_NFSD_V4 is not set
+CONFIG_NFSD_TCP=y
+CONFIG_LOCKD=y
+CONFIG_STATD=y
+CONFIG_LOCKD_V4=y
+CONFIG_EXPORTFS=m
+CONFIG_SUNRPC=y
+CONFIG_SUNRPC_GSS=y
+CONFIG_RPCSEC_GSS_KRB5=y
+CONFIG_SMB_FS=m
+CONFIG_SMB_NLS_DEFAULT=y
+CONFIG_SMB_NLS_REMOTE="cp850"
+CONFIG_CIFS=m
+CONFIG_CIFS_STATS=y
+CONFIG_CIFS_XATTR=y
+CONFIG_CIFS_POSIX=y
+CONFIG_NCP_FS=m
+CONFIG_NCPFS_PACKET_SIGNING=y
+CONFIG_NCPFS_IOCTL_LOCKING=y
+CONFIG_NCPFS_STRONG=y
+CONFIG_NCPFS_NFS_NS=y
+CONFIG_NCPFS_OS2_NS=y
+CONFIG_NCPFS_SMALLDOS=y
+CONFIG_NCPFS_NLS=y
+CONFIG_NCPFS_EXTRAS=y
+CONFIG_CODA_FS=m
+# CONFIG_CODA_FS_OLD_API is not set
+# CONFIG_INTERMEZZO_FS is not set
+CONFIG_AFS_FS=m
+CONFIG_RXRPC=m
+
+#
+# Partition Types
+#
+CONFIG_PARTITION_ADVANCED=y
+# CONFIG_ACORN_PARTITION is not set
+CONFIG_OSF_PARTITION=y
+# CONFIG_AMIGA_PARTITION is not set
+CONFIG_ATARI_PARTITION=y
+CONFIG_MAC_PARTITION=y
+CONFIG_MSDOS_PARTITION=y
+CONFIG_BSD_DISKLABEL=y
+# CONFIG_MINIX_SUBPARTITION is not set
+CONFIG_SOLARIS_X86_PARTITION=y
+CONFIG_UNIXWARE_DISKLABEL=y
+CONFIG_LDM_PARTITION=y
+# CONFIG_LDM_DEBUG is not set
+CONFIG_NEC98_PARTITION=y
+CONFIG_SGI_PARTITION=y
+CONFIG_ULTRIX_PARTITION=y
+CONFIG_SUN_PARTITION=y
+CONFIG_EFI_PARTITION=y
+
+#
+# Native Language Support
+#
+CONFIG_NLS=y
+CONFIG_NLS_DEFAULT="utf8"
+CONFIG_NLS_CODEPAGE_437=m
+CONFIG_NLS_CODEPAGE_737=m
+CONFIG_NLS_CODEPAGE_775=m
+CONFIG_NLS_CODEPAGE_850=m
+CONFIG_NLS_CODEPAGE_852=m
+CONFIG_NLS_CODEPAGE_855=m
+CONFIG_NLS_CODEPAGE_857=m
+CONFIG_NLS_CODEPAGE_860=m
+CONFIG_NLS_CODEPAGE_861=m
+CONFIG_NLS_CODEPAGE_862=m
+CONFIG_NLS_CODEPAGE_863=m
+CONFIG_NLS_CODEPAGE_864=m
+CONFIG_NLS_CODEPAGE_865=m
+CONFIG_NLS_CODEPAGE_866=m
+CONFIG_NLS_CODEPAGE_869=m
+CONFIG_NLS_CODEPAGE_936=m
+CONFIG_NLS_CODEPAGE_950=m
+CONFIG_NLS_CODEPAGE_932=m
+CONFIG_NLS_CODEPAGE_949=m
+CONFIG_NLS_CODEPAGE_874=m
+CONFIG_NLS_ISO8859_8=m
+CONFIG_NLS_CODEPAGE_1250=m
+CONFIG_NLS_CODEPAGE_1251=m
+CONFIG_NLS_ISO8859_1=m
+CONFIG_NLS_ISO8859_2=m
+CONFIG_NLS_ISO8859_3=m
+CONFIG_NLS_ISO8859_4=m
+CONFIG_NLS_ISO8859_5=m
+CONFIG_NLS_ISO8859_6=m
+CONFIG_NLS_ISO8859_7=m
+CONFIG_NLS_ISO8859_9=m
+CONFIG_NLS_ISO8859_13=m
+CONFIG_NLS_ISO8859_14=m
+CONFIG_NLS_ISO8859_15=m
+CONFIG_NLS_KOI8_R=m
+CONFIG_NLS_KOI8_U=m
+CONFIG_NLS_UTF8=m
+CONFIG_FSHOOKS=y
+
+#
+# Profiling support
+#
+CONFIG_PROFILING=y
+CONFIG_OPROFILE=m
+
+#
+# Kernel hacking
+#
+CONFIG_CRASH_DUMP=m
+CONFIG_KERNTYPES=y
+CONFIG_CRASH_DUMP_BLOCKDEV=m
+CONFIG_CRASH_DUMP_NETDEV=m
+# CONFIG_CRASH_DUMP_MEMDEV is not set
+CONFIG_CRASH_DUMP_COMPRESS_RLE=m
+CONFIG_CRASH_DUMP_COMPRESS_GZIP=m
+CONFIG_DEBUG_KERNEL=y
+CONFIG_EARLY_PRINTK=y
+# CONFIG_KPROBES is not set
+# CONFIG_DEBUGREG is not set
+CONFIG_DEBUG_STACKOVERFLOW=y
+# CONFIG_DEBUG_STACK_USAGE is not set
+# CONFIG_DEBUG_SLAB is not set
+CONFIG_MAGIC_SYSRQ=y
+# CONFIG_DEBUG_SPINLOCK is not set
+# CONFIG_DEBUG_PAGEALLOC is not set
+# CONFIG_DEBUG_HIGHMEM is not set
+# CONFIG_DEBUG_INFO is not set
+# CONFIG_DEBUG_SPINLOCK_SLEEP is not set
+# CONFIG_FRAME_POINTER is not set
+# CONFIG_KDB is not set
+CONFIG_X86_FIND_SMP_CONFIG=y
+CONFIG_X86_MPPARSE=y
+# CONFIG_HOOK is not set
+
+#
+# Security options
+#
+CONFIG_SECURITY=y
+CONFIG_SECURITY_NETWORK=y
+CONFIG_SECURITY_CAPABILITIES=m
+CONFIG_SECURITY_ROOTPLUG=m
+CONFIG_SECURITY_SELINUX=y
+CONFIG_SECURITY_SELINUX_BOOTPARAM=y
+CONFIG_SECURITY_SELINUX_DEVELOP=y
+# CONFIG_SECURITY_SELINUX_MLS is not set
+
+#
+# IBM Crypto Hardware support
+#
+CONFIG_IBM_CRYPTO=m
+CONFIG_ICA_LEEDSLITE=m
+
+#
+# Cryptographic options
+#
+CONFIG_CRYPTO=y
+CONFIG_CRYPTO_HMAC=y
+CONFIG_CRYPTO_NULL=m
+CONFIG_CRYPTO_MD4=m
+CONFIG_CRYPTO_MD5=y
+CONFIG_CRYPTO_SHA1=m
+CONFIG_CRYPTO_SHA256=m
+CONFIG_CRYPTO_SHA512=m
+CONFIG_CRYPTO_DES=y
+CONFIG_CRYPTO_BLOWFISH=m
+CONFIG_CRYPTO_TWOFISH=m
+CONFIG_CRYPTO_SERPENT=m
+CONFIG_CRYPTO_AES=m
+CONFIG_CRYPTO_CAST5=m
+CONFIG_CRYPTO_CAST6=m
+CONFIG_CRYPTO_ARC4=m
+CONFIG_CRYPTO_DEFLATE=m
+CONFIG_CRYPTO_MICHAEL_MIC=m
+CONFIG_CRYPTO_TEST=m
+
+#
+# Library routines
+#
+CONFIG_CRC32=y
+CONFIG_QSORT=y
+CONFIG_ZLIB_INFLATE=y
+CONFIG_ZLIB_DEFLATE=m
+
+#
+# Build options
+#
+CONFIG_SUSE_KERNEL=y
+CONFIG_CFGNAME="bigsmp"
+CONFIG_RELEASE="SLES9_SP1_BRANCH_2004110217390391"
+CONFIG_X86_SMP=y
+CONFIG_X86_HT=y
+CONFIG_X86_BIOS_REBOOT=y
+CONFIG_X86_TRAMPOLINE=y
+CONFIG_PC=y
--- /dev/null
+#
+# Automatically generated make config: don't edit
+#
+CONFIG_64BIT=y
+CONFIG_MMU=y
+CONFIG_RWSEM_XCHGADD_ALGORITHM=y
+CONFIG_GENERIC_ISA_DMA=y
+CONFIG_HAVE_DEC_LOCK=y
+CONFIG_EARLY_PRINTK=y
+CONFIG_COMPAT=y
+CONFIG_FRAME_POINTER=y
+CONFIG_FORCE_MAX_ZONEORDER=13
+
+#
+# Code maturity level options
+#
+CONFIG_EXPERIMENTAL=y
+CONFIG_CLEAN_COMPILE=y
+CONFIG_STANDALONE=y
+
+#
+# General setup
+#
+CONFIG_SWAP=y
+CONFIG_SYSVIPC=y
+CONFIG_POSIX_MQUEUE=y
+CONFIG_BSD_PROCESS_ACCT=y
+CONFIG_SYSCTL=y
+CONFIG_LOG_BUF_SHIFT=19
+CONFIG_HOTPLUG=y
+CONFIG_EVLOG=y
+# CONFIG_EVLOG_FWPRINTK is not set
+CONFIG_IKCONFIG=y
+CONFIG_IKCONFIG_PROC=y
+# CONFIG_EMBEDDED is not set
+
+#
+# Class Based Kernel Resource Management
+#
+CONFIG_CKRM=y
+CONFIG_RCFS_FS=m
+CONFIG_CKRM_TYPE_TASKCLASS=y
+CONFIG_CKRM_RES_NUMTASKS=m
+CONFIG_CKRM_CPU_SCHEDULE=y
+# CONFIG_CKRM_CPU_SCHEDULE_AT_BOOT is not set
+CONFIG_CKRM_TYPE_SOCKETCLASS=y
+CONFIG_CKRM_RBCE=m
+CONFIG_CKRM_CRBCE=m
+CONFIG_DELAY_ACCT=y
+CONFIG_KALLSYMS=y
+CONFIG_FUTEX=y
+CONFIG_EPOLL=y
+CONFIG_IOSCHED_NOOP=y
+CONFIG_IOSCHED_AS=y
+CONFIG_IOSCHED_DEADLINE=y
+CONFIG_IOSCHED_CFQ=y
+# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
+
+#
+# Loadable module support
+#
+CONFIG_MODULES=y
+CONFIG_MODULE_UNLOAD=y
+# CONFIG_MODULE_FORCE_UNLOAD is not set
+CONFIG_OBSOLETE_MODPARM=y
+CONFIG_MODVERSIONS=y
+CONFIG_KMOD=y
+CONFIG_STOP_MACHINE=y
+
+#
+# Platform support
+#
+# CONFIG_PPC_ISERIES is not set
+CONFIG_PPC_PSERIES=y
+CONFIG_PPC=y
+CONFIG_PPC64=y
+CONFIG_PPC_OF=y
+CONFIG_ALTIVEC=y
+# CONFIG_PPC_PMAC is not set
+CONFIG_PPC_SPLPAR=y
+# CONFIG_BOOTX_TEXT is not set
+# CONFIG_POWER4_ONLY is not set
+# CONFIG_IOMMU_VMERGE is not set
+CONFIG_SMP=y
+CONFIG_IRQ_ALL_CPUS=y
+CONFIG_NR_CPUS=128
+# CONFIG_HMT is not set
+CONFIG_DISCONTIGMEM=y
+CONFIG_NUMA=y
+CONFIG_SCHED_SMT=y
+CONFIG_PPC_RTAS=y
+CONFIG_RTAS_FLASH=m
+CONFIG_SCANLOG=m
+CONFIG_LPARCFG=y
+CONFIG_PPC_VPURR=y
+
+#
+# General setup
+#
+CONFIG_PCI=y
+CONFIG_PCI_DOMAINS=y
+CONFIG_BINFMT_ELF=y
+CONFIG_BINFMT_MISC=m
+# CONFIG_PCI_LEGACY_PROC is not set
+# CONFIG_PCI_NAMES is not set
+CONFIG_HOTPLUG_CPU=y
+
+#
+# PCMCIA/CardBus support
+#
+# CONFIG_PCMCIA is not set
+
+#
+# PCI Hotplug Support
+#
+CONFIG_HOTPLUG_PCI=y
+# CONFIG_HOTPLUG_PCI_FAKE is not set
+# CONFIG_HOTPLUG_PCI_CPCI is not set
+# CONFIG_HOTPLUG_PCI_PCIE is not set
+# CONFIG_HOTPLUG_PCI_SHPC is not set
+CONFIG_HOTPLUG_PCI_RPA=y
+CONFIG_HOTPLUG_PCI_RPA_DLPAR=y
+CONFIG_PROC_DEVICETREE=y
+# CONFIG_CMDLINE_BOOL is not set
+
+#
+# Device Drivers
+#
+
+#
+# Generic Driver Options
+#
+CONFIG_FW_LOADER=m
+# CONFIG_DEBUG_DRIVER is not set
+
+#
+# Memory Technology Devices (MTD)
+#
+# CONFIG_MTD is not set
+
+#
+# Bluesmoke - error detection and reporting (RAS)
+#
+# CONFIG_BLUESMOKE is not set
+
+#
+# Parallel port support
+#
+CONFIG_PARPORT=m
+CONFIG_PARPORT_PC=m
+CONFIG_PARPORT_PC_CML1=m
+CONFIG_PARPORT_SERIAL=m
+CONFIG_PARPORT_PC_FIFO=y
+CONFIG_PARPORT_PC_SUPERIO=y
+CONFIG_PARPORT_OTHER=y
+CONFIG_PARPORT_1284=y
+
+#
+# Plug and Play support
+#
+
+#
+# Block devices
+#
+CONFIG_BLK_DEV_FD=y
+# CONFIG_PARIDE is not set
+# CONFIG_BLK_CPQ_DA is not set
+# CONFIG_BLK_CPQ_CISS_DA is not set
+# CONFIG_BLK_DEV_DAC960 is not set
+# CONFIG_BLK_DEV_UMEM is not set
+CONFIG_BLK_DEV_LOOP=y
+CONFIG_BLK_DEV_CRYPTOLOOP=m
+CONFIG_BLK_DEV_NBD=m
+# CONFIG_BLK_DEV_CARMEL is not set
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_SIZE=123456
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_CIPHER_TWOFISH=m
+
+#
+# ATA/ATAPI/MFM/RLL support
+#
+CONFIG_IDE=y
+CONFIG_BLK_DEV_IDE=y
+
+#
+# Please see Documentation/ide.txt for help/info on IDE drives
+#
+CONFIG_BLK_DEV_IDEDISK=y
+# CONFIG_IDEDISK_MULTI_MODE is not set
+# CONFIG_IDEDISK_STROKE is not set
+CONFIG_BLK_DEV_IDECD=y
+# CONFIG_BLK_DEV_IDETAPE is not set
+# CONFIG_BLK_DEV_IDEFLOPPY is not set
+# CONFIG_BLK_DEV_IDESCSI is not set
+CONFIG_IDE_TASK_IOCTL=y
+# CONFIG_IDE_TASKFILE_IO is not set
+
+#
+# IDE chipset support/bugfixes
+#
+CONFIG_IDE_GENERIC=y
+CONFIG_BLK_DEV_IDEPCI=y
+CONFIG_IDEPCI_SHARE_IRQ=y
+# CONFIG_BLK_DEV_OFFBOARD is not set
+CONFIG_BLK_DEV_GENERIC=y
+# CONFIG_BLK_DEV_OPTI621 is not set
+CONFIG_BLK_DEV_SL82C105=y
+CONFIG_BLK_DEV_IDEDMA_PCI=y
+CONFIG_BLK_DEV_IDEDMA_FORCED=y
+CONFIG_IDEDMA_PCI_AUTO=y
+# CONFIG_IDEDMA_ONLYDISK is not set
+CONFIG_BLK_DEV_ADMA=y
+# CONFIG_BLK_DEV_AEC62XX is not set
+# CONFIG_BLK_DEV_ALI15X3 is not set
+CONFIG_BLK_DEV_AMD74XX=y
+# CONFIG_BLK_DEV_CMD64X is not set
+# CONFIG_BLK_DEV_TRIFLEX is not set
+# CONFIG_BLK_DEV_CY82C693 is not set
+# CONFIG_BLK_DEV_CS5520 is not set
+# CONFIG_BLK_DEV_CS5530 is not set
+# CONFIG_BLK_DEV_HPT34X is not set
+# CONFIG_BLK_DEV_HPT366 is not set
+# CONFIG_BLK_DEV_SC1200 is not set
+# CONFIG_BLK_DEV_PIIX is not set
+# CONFIG_BLK_DEV_NS87415 is not set
+CONFIG_BLK_DEV_PDC202XX_OLD=y
+CONFIG_PDC202XX_BURST=y
+CONFIG_BLK_DEV_PDC202XX_NEW=y
+# CONFIG_PDC202XX_FORCE is not set
+# CONFIG_BLK_DEV_SVWKS is not set
+CONFIG_BLK_DEV_SIIMAGE=y
+# CONFIG_BLK_DEV_SLC90E66 is not set
+# CONFIG_BLK_DEV_TRM290 is not set
+# CONFIG_BLK_DEV_VIA82CXXX is not set
+CONFIG_BLK_DEV_IDEDMA=y
+# CONFIG_IDEDMA_IVB is not set
+CONFIG_IDEDMA_AUTO=y
+# CONFIG_BLK_DEV_HD is not set
+
+#
+# SCSI device support
+#
+CONFIG_SCSI=m
+CONFIG_SCSI_PROC_FS=y
+
+#
+# SCSI support type (disk, tape, CD-ROM)
+#
+CONFIG_BLK_DEV_SD=m
+CONFIG_SD_IOSTATS=y
+CONFIG_CHR_DEV_ST=m
+# CONFIG_CHR_DEV_OSST is not set
+CONFIG_BLK_DEV_SR=m
+CONFIG_BLK_DEV_SR_VENDOR=y
+CONFIG_CHR_DEV_SG=m
+CONFIG_CHR_DEV_SCH=m
+
+#
+# Some SCSI devices (e.g. CD jukebox) support multiple LUNs
+#
+CONFIG_SCSI_MULTI_LUN=y
+CONFIG_SCSI_CONSTANTS=y
+CONFIG_SCSI_LOGGING=y
+
+#
+# SCSI Transport Attributes
+#
+CONFIG_SCSI_SPI_ATTRS=m
+CONFIG_SCSI_FC_ATTRS=m
+
+#
+# SCSI low-level drivers
+#
+# CONFIG_BLK_DEV_3W_XXXX_RAID is not set
+# CONFIG_SCSI_ACARD is not set
+# CONFIG_SCSI_AACRAID is not set
+# CONFIG_SCSI_AIC7XXX is not set
+# CONFIG_SCSI_AIC7XXX_OLD is not set
+# CONFIG_SCSI_AIC79XX is not set
+# CONFIG_SCSI_AIC79XX_NEW is not set
+# CONFIG_SCSI_ADVANSYS is not set
+# CONFIG_MEGARAID_NEWGEN is not set
+# CONFIG_MEGARAID_LEGACY is not set
+# CONFIG_SCSI_SATA is not set
+# CONFIG_SCSI_BUSLOGIC is not set
+# CONFIG_SCSI_CPQFCTS is not set
+# CONFIG_SCSI_DMX3191D is not set
+# CONFIG_SCSI_EATA is not set
+# CONFIG_SCSI_EATA_PIO is not set
+# CONFIG_SCSI_FUTURE_DOMAIN is not set
+# CONFIG_SCSI_GDTH is not set
+# CONFIG_SCSI_IPS is not set
+CONFIG_SCSI_IBMVSCSI=m
+CONFIG_SCSI_IBMVSCSIS=m
+# CONFIG_SCSI_INIA100 is not set
+# CONFIG_SCSI_PPA is not set
+# CONFIG_SCSI_IMM is not set
+CONFIG_SCSI_SYM53C8XX_2=m
+CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=0
+CONFIG_SCSI_SYM53C8XX_DEFAULT_TAGS=16
+CONFIG_SCSI_SYM53C8XX_MAX_TAGS=64
+# CONFIG_SCSI_SYM53C8XX_IOMAPPED is not set
+CONFIG_SCSI_LPFC=m
+CONFIG_SCSI_IPR=m
+CONFIG_SCSI_IPR_TRACE=y
+CONFIG_SCSI_IPR_DUMP=y
+# CONFIG_SCSI_QLOGIC_ISP is not set
+# CONFIG_SCSI_QLOGIC_FC is not set
+# CONFIG_SCSI_QLOGIC_1280 is not set
+CONFIG_SCSI_QLA2XXX=m
+# CONFIG_SCSI_QLA21XX is not set
+# CONFIG_SCSI_QLA22XX is not set
+CONFIG_SCSI_QLA2300=m
+# CONFIG_SCSI_QLA2322 is not set
+# CONFIG_SCSI_QLA6312 is not set
+# CONFIG_SCSI_QLA6322 is not set
+CONFIG_SCSI_QLA2XXX_FAILOVER=y
+CONFIG_SCSI_QLA4XXX=m
+CONFIG_SCSI_QLA4XXX_FAILOVER=y
+# CONFIG_SCSI_DC395x is not set
+# CONFIG_SCSI_DC390T is not set
+CONFIG_SCSI_DEBUG=m
+
+#
+# Multi-device support (RAID and LVM)
+#
+CONFIG_MD=y
+CONFIG_BLK_DEV_MD=y
+CONFIG_MD_LINEAR=m
+CONFIG_MD_RAID0=m
+CONFIG_MD_RAID1=m
+CONFIG_MD_RAID5=m
+CONFIG_MD_RAID6=m
+CONFIG_MD_MULTIPATH=m
+CONFIG_BLK_DEV_DM=m
+CONFIG_DM_CRYPT=m
+CONFIG_DM_MULTIPATH=m
+CONFIG_DM_SNAPSHOT=m
+CONFIG_DM_MIRROR=m
+CONFIG_DM_ZERO=m
+CONFIG_DM_FLAKEY=m
+CONFIG_BLK_DEV_DM_BBR=m
+
+#
+# Fusion MPT device support
+#
+# CONFIG_FUSION is not set
+
+#
+# IEEE 1394 (FireWire) support
+#
+CONFIG_IEEE1394=m
+
+#
+# Subsystem Options
+#
+# CONFIG_IEEE1394_VERBOSEDEBUG is not set
+# CONFIG_IEEE1394_OUI_DB is not set
+CONFIG_IEEE1394_EXTRA_CONFIG_ROMS=y
+CONFIG_IEEE1394_CONFIG_ROM_IP1394=y
+
+#
+# Device Drivers
+#
+
+#
+# Texas Instruments PCILynx requires I2C
+#
+CONFIG_IEEE1394_OHCI1394=m
+
+#
+# Protocol Drivers
+#
+CONFIG_IEEE1394_VIDEO1394=m
+CONFIG_IEEE1394_SBP2=m
+# CONFIG_IEEE1394_SBP2_PHYS_DMA is not set
+CONFIG_IEEE1394_ETH1394=m
+CONFIG_IEEE1394_DV1394=m
+CONFIG_IEEE1394_RAWIO=m
+CONFIG_IEEE1394_CMP=m
+CONFIG_IEEE1394_AMDTP=m
+
+#
+# I2O device support
+#
+# CONFIG_I2O is not set
+
+#
+# Macintosh device drivers
+#
+
+#
+# Networking support
+#
+CONFIG_NET=y
+
+#
+# Networking options
+#
+CONFIG_PACKET=y
+CONFIG_PACKET_MMAP=y
+CONFIG_NETLINK_DEV=y
+CONFIG_UNIX=y
+CONFIG_NET_KEY=y
+CONFIG_INET=y
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_IP_MULTIPLE_TABLES=y
+# CONFIG_IP_ROUTE_FWMARK is not set
+CONFIG_IP_ROUTE_NAT=y
+CONFIG_IP_ROUTE_MULTIPATH=y
+CONFIG_IP_ROUTE_TOS=y
+# CONFIG_IP_ROUTE_VERBOSE is not set
+# CONFIG_IP_PNP is not set
+CONFIG_NET_IPIP=m
+CONFIG_NET_IPGRE=m
+CONFIG_NET_IPGRE_BROADCAST=y
+CONFIG_IP_MROUTE=y
+CONFIG_IP_PIMSM_V1=y
+CONFIG_IP_PIMSM_V2=y
+# CONFIG_ARPD is not set
+CONFIG_SYN_COOKIES=y
+CONFIG_INET_AH=m
+CONFIG_INET_ESP=m
+CONFIG_INET_IPCOMP=m
+# CONFIG_ACCEPT_QUEUES is not set
+
+#
+# IP: Virtual Server Configuration
+#
+# CONFIG_IP_VS is not set
+CONFIG_IPV6=m
+CONFIG_IPV6_SUBTREES=y
+CONFIG_IPV6_PRIVACY=y
+CONFIG_IPV6_NDISC_NEW=y
+CONFIG_INET6_AH=m
+CONFIG_INET6_ESP=m
+CONFIG_INET6_IPCOMP=m
+CONFIG_IPV6_TUNNEL=m
+
+#
+# MOBILE IPv6 (EXPERIMENTAL)
+#
+CONFIG_IPV6_MOBILITY=m
+CONFIG_IPV6_MOBILITY_MN=m
+CONFIG_IPV6_MOBILITY_HA=m
+# CONFIG_IPV6_MOBILITY_DEBUG is not set
+# CONFIG_DECNET is not set
+CONFIG_BRIDGE=m
+CONFIG_NETFILTER=y
+# CONFIG_NETFILTER_DEBUG is not set
+CONFIG_BRIDGE_NETFILTER=y
+
+#
+# IP: Netfilter Configuration
+#
+CONFIG_IP_NF_CONNTRACK=m
+CONFIG_IP_NF_FTP=m
+CONFIG_IP_NF_IRC=m
+CONFIG_IP_NF_TFTP=m
+CONFIG_IP_NF_AMANDA=m
+CONFIG_IP_NF_QUEUE=m
+CONFIG_IP_NF_IPTABLES=m
+CONFIG_IP_NF_MATCH_LIMIT=m
+CONFIG_IP_NF_MATCH_IPRANGE=m
+CONFIG_IP_NF_MATCH_MAC=m
+CONFIG_IP_NF_MATCH_PKTTYPE=m
+CONFIG_IP_NF_MATCH_POLICY=m
+CONFIG_IP_NF_MATCH_MARK=m
+CONFIG_IP_NF_MATCH_MULTIPORT=m
+CONFIG_IP_NF_MATCH_TOS=m
+CONFIG_IP_NF_MATCH_RECENT=m
+CONFIG_IP_NF_MATCH_ECN=m
+CONFIG_IP_NF_MATCH_DSCP=m
+CONFIG_IP_NF_MATCH_AH_ESP=m
+CONFIG_IP_NF_MATCH_LENGTH=m
+CONFIG_IP_NF_MATCH_TTL=m
+CONFIG_IP_NF_MATCH_TCPMSS=m
+CONFIG_IP_NF_MATCH_HELPER=m
+CONFIG_IP_NF_MATCH_STATE=m
+CONFIG_IP_NF_MATCH_CONNTRACK=m
+CONFIG_IP_NF_MATCH_OWNER=m
+CONFIG_IP_NF_MATCH_PHYSDEV=m
+CONFIG_IP_NF_FILTER=m
+CONFIG_IP_NF_TARGET_REJECT=m
+CONFIG_IP_NF_NAT=m
+CONFIG_IP_NF_NAT_NEEDED=y
+CONFIG_IP_NF_TARGET_MASQUERADE=m
+CONFIG_IP_NF_TARGET_REDIRECT=m
+CONFIG_IP_NF_TARGET_NETMAP=m
+CONFIG_IP_NF_TARGET_SAME=m
+CONFIG_IP_NF_NAT_LOCAL=y
+CONFIG_IP_NF_NAT_SNMP_BASIC=m
+CONFIG_IP_NF_NAT_IRC=m
+CONFIG_IP_NF_NAT_FTP=m
+CONFIG_IP_NF_NAT_TFTP=m
+CONFIG_IP_NF_NAT_AMANDA=m
+CONFIG_IP_NF_MANGLE=m
+CONFIG_IP_NF_TARGET_TOS=m
+CONFIG_IP_NF_TARGET_ECN=m
+CONFIG_IP_NF_TARGET_DSCP=m
+CONFIG_IP_NF_TARGET_MARK=m
+CONFIG_IP_NF_TARGET_CLASSIFY=m
+CONFIG_IP_NF_TARGET_LOG=m
+CONFIG_IP_NF_TARGET_ULOG=m
+CONFIG_IP_NF_TARGET_TCPMSS=m
+CONFIG_IP_NF_ARPTABLES=m
+CONFIG_IP_NF_ARPFILTER=m
+CONFIG_IP_NF_ARP_MANGLE=m
+# CONFIG_IP_NF_COMPAT_IPCHAINS is not set
+# CONFIG_IP_NF_COMPAT_IPFWADM is not set
+CONFIG_IP_NF_CONNTRACK_MARK=y
+CONFIG_IP_NF_TARGET_CONNMARK=m
+CONFIG_IP_NF_MATCH_CONNMARK=m
+CONFIG_IP_NF_TARGET_CLUSTERIP=m
+
+#
+# IPv6: Netfilter Configuration
+#
+CONFIG_IP6_NF_FTP=m
+CONFIG_IP6_NF_QUEUE=m
+CONFIG_IP6_NF_IPTABLES=m
+CONFIG_IP6_NF_MATCH_LIMIT=m
+CONFIG_IP6_NF_MATCH_MAC=m
+CONFIG_IP6_NF_MATCH_RT=m
+CONFIG_IP6_NF_MATCH_OPTS=m
+CONFIG_IP6_NF_MATCH_FRAG=m
+CONFIG_IP6_NF_MATCH_HL=m
+CONFIG_IP6_NF_MATCH_MULTIPORT=m
+CONFIG_IP6_NF_MATCH_OWNER=m
+CONFIG_IP6_NF_MATCH_MARK=m
+CONFIG_IP6_NF_MATCH_IPV6HEADER=m
+CONFIG_IP6_NF_MATCH_AHESP=m
+CONFIG_IP6_NF_MATCH_LENGTH=m
+CONFIG_IP6_NF_MATCH_EUI64=m
+CONFIG_IP6_NF_CONNTRACK=m
+CONFIG_IP6_NF_MATCH_STATE=m
+CONFIG_IP6_NF_FILTER=m
+CONFIG_IP6_NF_TARGET_LOG=m
+CONFIG_IP6_NF_TARGET_REJECT=m
+CONFIG_IP6_NF_MANGLE=m
+CONFIG_IP6_NF_TARGET_MARK=m
+
+#
+# Bridge: Netfilter Configuration
+#
+CONFIG_BRIDGE_NF_EBTABLES=m
+CONFIG_BRIDGE_EBT_BROUTE=m
+CONFIG_BRIDGE_EBT_T_FILTER=m
+CONFIG_BRIDGE_EBT_T_NAT=m
+CONFIG_BRIDGE_EBT_802_3=m
+CONFIG_BRIDGE_EBT_AMONG=m
+CONFIG_BRIDGE_EBT_ARP=m
+CONFIG_BRIDGE_EBT_IP=m
+CONFIG_BRIDGE_EBT_LIMIT=m
+CONFIG_BRIDGE_EBT_MARK=m
+CONFIG_BRIDGE_EBT_PKTTYPE=m
+CONFIG_BRIDGE_EBT_STP=m
+CONFIG_BRIDGE_EBT_VLAN=m
+CONFIG_BRIDGE_EBT_ARPREPLY=m
+CONFIG_BRIDGE_EBT_DNAT=m
+CONFIG_BRIDGE_EBT_MARK_T=m
+CONFIG_BRIDGE_EBT_REDIRECT=m
+# CONFIG_BRIDGE_EBT_SNAT is not set
+CONFIG_BRIDGE_EBT_LOG=m
+CONFIG_XFRM=y
+CONFIG_XFRM_USER=m
+
+#
+# SCTP Configuration (EXPERIMENTAL)
+#
+CONFIG_IP_SCTP=m
+# CONFIG_SCTP_DBG_MSG is not set
+# CONFIG_SCTP_DBG_OBJCNT is not set
+CONFIG_SCTP_HMAC_NONE=y
+# CONFIG_SCTP_HMAC_SHA1 is not set
+# CONFIG_SCTP_HMAC_MD5 is not set
+# CONFIG_ATM is not set
+CONFIG_VLAN_8021Q=m
+CONFIG_LLC=y
+CONFIG_LLC2=m
+CONFIG_IPX=m
+CONFIG_IPX_INTERN=y
+CONFIG_ATALK=m
+CONFIG_DEV_APPLETALK=y
+CONFIG_IPDDP=m
+CONFIG_IPDDP_ENCAP=y
+CONFIG_IPDDP_DECAP=y
+# CONFIG_X25 is not set
+# CONFIG_LAPB is not set
+# CONFIG_NET_DIVERT is not set
+# CONFIG_ECONET is not set
+# CONFIG_WAN_ROUTER is not set
+# CONFIG_NET_FASTROUTE is not set
+# CONFIG_NET_HW_FLOWCONTROL is not set
+
+#
+# QoS and/or fair queueing
+#
+CONFIG_NET_SCHED=y
+CONFIG_NET_SCH_CBQ=m
+CONFIG_NET_SCH_HTB=m
+CONFIG_NET_SCH_HFSC=m
+CONFIG_NET_SCH_CSZ=m
+CONFIG_NET_SCH_PRIO=m
+CONFIG_NET_SCH_RED=m
+CONFIG_NET_SCH_SFQ=m
+CONFIG_NET_SCH_TEQL=m
+CONFIG_NET_SCH_TBF=m
+CONFIG_NET_SCH_GRED=m
+CONFIG_NET_SCH_DSMARK=m
+CONFIG_NET_SCH_DELAY=m
+CONFIG_NET_SCH_INGRESS=m
+CONFIG_NET_QOS=y
+CONFIG_NET_ESTIMATOR=y
+CONFIG_NET_CLS=y
+CONFIG_NET_CLS_TCINDEX=m
+CONFIG_NET_CLS_ROUTE4=m
+CONFIG_NET_CLS_ROUTE=y
+CONFIG_NET_CLS_FW=m
+CONFIG_NET_CLS_U32=m
+CONFIG_NET_CLS_RSVP=m
+CONFIG_NET_CLS_RSVP6=m
+CONFIG_NET_CLS_POLICE=y
+
+#
+# Network testing
+#
+CONFIG_NET_PKTGEN=m
+CONFIG_NETDEVICES=y
+
+#
+# ARCnet devices
+#
+# CONFIG_ARCNET is not set
+CONFIG_DUMMY=m
+CONFIG_BONDING=m
+CONFIG_EQUALIZER=m
+CONFIG_TUN=m
+# CONFIG_ETHERTAP is not set
+
+#
+# Ethernet (10 or 100Mbit)
+#
+CONFIG_NET_ETHERNET=y
+CONFIG_MII=y
+# CONFIG_OAKNET is not set
+# CONFIG_HAPPYMEAL is not set
+# CONFIG_SUNGEM is not set
+CONFIG_NET_VENDOR_3COM=y
+CONFIG_VORTEX=m
+CONFIG_TYPHOON=m
+
+#
+# Tulip family network device support
+#
+# CONFIG_NET_TULIP is not set
+# CONFIG_HP100 is not set
+CONFIG_NET_PCI=y
+CONFIG_PCNET32=m
+# CONFIG_AMD8111_ETH is not set
+# CONFIG_ADAPTEC_STARFIRE is not set
+# CONFIG_B44 is not set
+# CONFIG_FORCEDETH is not set
+# CONFIG_DGRS is not set
+# CONFIG_EEPRO100 is not set
+CONFIG_E100=m
+# CONFIG_FEALNX is not set
+# CONFIG_NATSEMI is not set
+# CONFIG_NE2K_PCI is not set
+# CONFIG_8139CP is not set
+# CONFIG_8139TOO is not set
+# CONFIG_SIS900 is not set
+# CONFIG_EPIC100 is not set
+# CONFIG_SUNDANCE is not set
+# CONFIG_VIA_RHINE is not set
+
+#
+# Ethernet (1000 Mbit)
+#
+CONFIG_ACENIC=m
+CONFIG_ACENIC_OMIT_TIGON_I=y
+# CONFIG_DL2K is not set
+CONFIG_E1000=m
+CONFIG_E1000_NAPI=y
+CONFIG_E1000_NEW=m
+CONFIG_E1000_NEW_NAPI=y
+# CONFIG_NS83820 is not set
+# CONFIG_HAMACHI is not set
+# CONFIG_YELLOWFIN is not set
+# CONFIG_R8169 is not set
+# CONFIG_SIS190 is not set
+# CONFIG_SK98LIN is not set
+CONFIG_TIGON3=m
+CONFIG_NET_BROADCOM=m
+CONFIG_NET_BROADCOM_NEW=m
+# CONFIG_NET_BCM44 is not set
+CONFIG_TIGON3_NEW=m
+
+#
+# Ethernet (10000 Mbit)
+#
+CONFIG_IXGB=m
+CONFIG_IXGB_NAPI=y
+CONFIG_S2IO=m
+CONFIG_S2IO_NAPI=y
+CONFIG_IBMVETH=m
+# CONFIG_FDDI is not set
+# CONFIG_HIPPI is not set
+# CONFIG_PLIP is not set
+CONFIG_PPP=m
+CONFIG_PPP_MULTILINK=y
+CONFIG_PPP_FILTER=y
+CONFIG_PPP_ASYNC=m
+CONFIG_PPP_SYNC_TTY=m
+CONFIG_PPP_DEFLATE=m
+CONFIG_PPP_BSDCOMP=m
+CONFIG_PPP_MPPE=m
+CONFIG_PPPOE=m
+CONFIG_SLIP=m
+CONFIG_SLIP_COMPRESSED=y
+CONFIG_SLIP_SMART=y
+# CONFIG_SLIP_MODE_SLIP6 is not set
+
+#
+# Wireless LAN (non-hamradio)
+#
+# CONFIG_NET_RADIO is not set
+
+#
+# Token Ring devices
+#
+CONFIG_TR=y
+CONFIG_IBMOL=m
+# CONFIG_IBMLS is not set
+# CONFIG_3C359 is not set
+# CONFIG_TMS380TR is not set
+CONFIG_NET_FC=y
+CONFIG_NET_LPFC=m
+CONFIG_SHAPER=m
+CONFIG_NETCONSOLE=m
+
+#
+# Wan interfaces
+#
+# CONFIG_WAN is not set
+
+#
+# Amateur Radio support
+#
+# CONFIG_HAMRADIO is not set
+
+#
+# IrDA (infrared) support
+#
+# CONFIG_IRDA is not set
+
+#
+# Bluetooth support
+#
+# CONFIG_BT is not set
+CONFIG_NETPOLL=y
+CONFIG_NETPOLL_RX=y
+CONFIG_NETPOLL_TRAP=y
+CONFIG_NET_POLL_CONTROLLER=y
+
+#
+# ISDN subsystem
+#
+# CONFIG_ISDN is not set
+
+#
+# Telephony Support
+#
+# CONFIG_PHONE is not set
+
+#
+# Input device support
+#
+CONFIG_INPUT=y
+
+#
+# Userland interfaces
+#
+CONFIG_INPUT_MOUSEDEV=y
+CONFIG_INPUT_MOUSEDEV_PSAUX=y
+CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024
+CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768
+CONFIG_INPUT_JOYDEV=m
+CONFIG_INPUT_TSDEV=m
+CONFIG_INPUT_TSDEV_SCREEN_X=240
+CONFIG_INPUT_TSDEV_SCREEN_Y=320
+CONFIG_INPUT_EVDEV=m
+# CONFIG_INPUT_EVBUG is not set
+
+#
+# Input I/O drivers
+#
+# CONFIG_GAMEPORT is not set
+CONFIG_SOUND_GAMEPORT=y
+CONFIG_SERIO=y
+CONFIG_SERIO_I8042=y
+# CONFIG_SERIO_SERPORT is not set
+# CONFIG_SERIO_CT82C710 is not set
+# CONFIG_SERIO_PARKBD is not set
+# CONFIG_SERIO_PCIPS2 is not set
+
+#
+# Input Device Drivers
+#
+CONFIG_INPUT_KEYBOARD=y
+CONFIG_KEYBOARD_ATKBD=y
+# CONFIG_KEYBOARD_SUNKBD is not set
+# CONFIG_KEYBOARD_LKKBD is not set
+# CONFIG_KEYBOARD_XTKBD is not set
+# CONFIG_KEYBOARD_NEWTON is not set
+# CONFIG_KEYBOARD_POSFILTER is not set
+CONFIG_INPUT_MOUSE=y
+CONFIG_MOUSE_PS2=y
+# CONFIG_MOUSE_SERIAL is not set
+# CONFIG_MOUSE_VSXXXAA is not set
+# CONFIG_INPUT_JOYSTICK is not set
+# CONFIG_INPUT_TOUCHSCREEN is not set
+CONFIG_INPUT_MISC=y
+CONFIG_INPUT_PCSPKR=m
+CONFIG_INPUT_UINPUT=m
+
+#
+# Character devices
+#
+CONFIG_VT=y
+CONFIG_VT_CONSOLE=y
+CONFIG_HW_CONSOLE=y
+CONFIG_ECC=m
+# CONFIG_SERIAL_NONSTANDARD is not set
+
+#
+# Serial drivers
+#
+CONFIG_SERIAL_8250=y
+CONFIG_SERIAL_8250_CONSOLE=y
+CONFIG_SERIAL_8250_NR_UARTS=4
+# CONFIG_SERIAL_8250_EXTENDED is not set
+
+#
+# Non-8250 serial port support
+#
+CONFIG_SERIAL_CORE=y
+CONFIG_SERIAL_CORE_CONSOLE=y
+# CONFIG_SERIAL_PMACZILOG is not set
+CONFIG_SERIAL_ICOM=m
+CONFIG_SERIAL_JSM=m
+CONFIG_UNIX98_PTYS=y
+CONFIG_LEGACY_PTYS=y
+CONFIG_LEGACY_PTY_COUNT=256
+# CONFIG_PRINTER is not set
+# CONFIG_PPDEV is not set
+# CONFIG_TIPAR is not set
+CONFIG_HVC_CONSOLE=y
+CONFIG_HVCS=m
+# CONFIG_QIC02_TAPE is not set
+
+#
+# IPMI
+#
+# CONFIG_IPMI_HANDLER is not set
+
+#
+# Watchdog Cards
+#
+CONFIG_WATCHDOG=y
+# CONFIG_WATCHDOG_NOWAYOUT is not set
+
+#
+# Watchdog Device Drivers
+#
+CONFIG_SOFT_WATCHDOG=m
+
+#
+# PCI-based Watchdog Cards
+#
+# CONFIG_PCIPCWATCHDOG is not set
+# CONFIG_WDTPCI is not set
+
+#
+# USB-based Watchdog Cards
+#
+# CONFIG_USBPCWATCHDOG is not set
+# CONFIG_RTC is not set
+# CONFIG_GEN_RTC is not set
+# CONFIG_DTLK is not set
+# CONFIG_R3964 is not set
+# CONFIG_APPLICOM is not set
+
+#
+# Ftape, the floppy tape device driver
+#
+# CONFIG_AGP is not set
+# CONFIG_DRM is not set
+CONFIG_RAW_DRIVER=m
+CONFIG_MAX_RAW_DEVS=4096
+
+#
+# Linux InfraRed Controller
+#
+# CONFIG_LIRC_SUPPORT is not set
+# CONFIG_LIRC_HOMEBREW is not set
+
+#
+# I2C support
+#
+# CONFIG_I2C is not set
+
+#
+# Misc devices
+#
+
+#
+# Multimedia devices
+#
+# CONFIG_VIDEO_DEV is not set
+
+#
+# Digital Video Broadcasting Devices
+#
+# CONFIG_DVB is not set
+
+#
+# Graphics support
+#
+CONFIG_FB=y
+# CONFIG_FB_PM2 is not set
+# CONFIG_FB_CYBER2000 is not set
+CONFIG_FB_OF=y
+# CONFIG_FB_CT65550 is not set
+# CONFIG_FB_IMSTT is not set
+# CONFIG_FB_S3TRIO is not set
+# CONFIG_FB_VGA16 is not set
+# CONFIG_FB_RIVA is not set
+CONFIG_FB_MATROX=y
+CONFIG_FB_MATROX_MILLENIUM=y
+CONFIG_FB_MATROX_MYSTIQUE=y
+CONFIG_FB_MATROX_G450=y
+CONFIG_FB_MATROX_G100=y
+CONFIG_FB_MATROX_MULTIHEAD=y
+# CONFIG_FB_RADEON_OLD is not set
+# CONFIG_FB_RADEON is not set
+# CONFIG_FB_ATY128 is not set
+# CONFIG_FB_ATY is not set
+# CONFIG_FB_SIS is not set
+# CONFIG_FB_NEOMAGIC is not set
+# CONFIG_FB_KYRO is not set
+# CONFIG_FB_3DFX is not set
+# CONFIG_FB_VOODOO1 is not set
+# CONFIG_FB_TRIDENT is not set
+# CONFIG_FB_VIRTUAL is not set
+
+#
+# Console display driver support
+#
+# CONFIG_VGA_CONSOLE is not set
+# CONFIG_MDA_CONSOLE is not set
+CONFIG_DUMMY_CONSOLE=y
+CONFIG_FRAMEBUFFER_CONSOLE=y
+CONFIG_PCI_CONSOLE=y
+# CONFIG_FONTS is not set
+CONFIG_FONT_8x8=y
+CONFIG_FONT_8x16=y
+
+#
+# Logo configuration
+#
+# CONFIG_LOGO is not set
+
+#
+# Bootsplash configuration
+#
+
+#
+# Sound
+#
+# CONFIG_SOUND is not set
+
+#
+# USB support
+#
+CONFIG_USB=m
+# CONFIG_USB_DEBUG is not set
+
+#
+# Miscellaneous USB options
+#
+CONFIG_USB_DEVICEFS=y
+# CONFIG_USB_BANDWIDTH is not set
+# CONFIG_USB_DYNAMIC_MINORS is not set
+
+#
+# USB Host Controller Drivers
+#
+CONFIG_USB_EHCI_HCD=m
+CONFIG_USB_EHCI_SPLIT_ISO=y
+CONFIG_USB_EHCI_ROOT_HUB_TT=y
+CONFIG_USB_OHCI_HCD=m
+# CONFIG_USB_UHCI_HCD is not set
+
+#
+# USB Device Class drivers
+#
+# CONFIG_USB_BLUETOOTH_TTY is not set
+CONFIG_USB_ACM=m
+CONFIG_USB_PRINTER=m
+CONFIG_USB_STORAGE=m
+# CONFIG_USB_STORAGE_DEBUG is not set
+CONFIG_USB_STORAGE_DATAFAB=y
+CONFIG_USB_STORAGE_FREECOM=y
+CONFIG_USB_STORAGE_ISD200=y
+CONFIG_USB_STORAGE_DPCM=y
+CONFIG_USB_STORAGE_HP8200e=y
+CONFIG_USB_STORAGE_SDDR09=y
+CONFIG_USB_STORAGE_SDDR55=y
+CONFIG_USB_STORAGE_JUMPSHOT=y
+
+#
+# USB Human Interface Devices (HID)
+#
+CONFIG_USB_HID=m
+CONFIG_USB_HIDINPUT=y
+# CONFIG_HID_FF is not set
+CONFIG_USB_HIDDEV=y
+
+#
+# USB HID Boot Protocol drivers
+#
+# CONFIG_USB_KBD is not set
+# CONFIG_USB_MOUSE is not set
+CONFIG_USB_AIPTEK=m
+CONFIG_USB_WACOM=m
+CONFIG_USB_KBTAB=m
+CONFIG_USB_POWERMATE=m
+CONFIG_USB_MTOUCH=m
+CONFIG_USB_XPAD=m
+CONFIG_USB_ATI_REMOTE=m
+
+#
+# USB Imaging devices
+#
+CONFIG_USB_MDC800=m
+CONFIG_USB_MICROTEK=m
+CONFIG_USB_HPUSBSCSI=m
+
+#
+# USB Multimedia devices
+#
+# CONFIG_USB_DABUSB is not set
+
+#
+# Video4Linux support is needed for USB Multimedia device support
+#
+
+#
+# USB Network adaptors
+#
+CONFIG_USB_CATC=m
+CONFIG_USB_KAWETH=m
+CONFIG_USB_PEGASUS=m
+CONFIG_USB_RTL8150=m
+CONFIG_USB_USBNET=m
+
+#
+# USB Host-to-Host Cables
+#
+CONFIG_USB_ALI_M5632=y
+CONFIG_USB_AN2720=y
+CONFIG_USB_BELKIN=y
+CONFIG_USB_GENESYS=y
+CONFIG_USB_NET1080=y
+CONFIG_USB_PL2301=y
+
+#
+# Intelligent USB Devices/Gadgets
+#
+CONFIG_USB_ARMLINUX=y
+CONFIG_USB_EPSON2888=y
+CONFIG_USB_ZAURUS=y
+CONFIG_USB_CDCETHER=y
+
+#
+# USB Network Adapters
+#
+CONFIG_USB_AX8817X=y
+
+#
+# USB port drivers
+#
+# CONFIG_USB_USS720 is not set
+
+#
+# USB Serial Converter support
+#
+CONFIG_USB_SERIAL=m
+CONFIG_USB_SERIAL_GENERIC=y
+CONFIG_USB_SERIAL_BELKIN=m
+CONFIG_USB_SERIAL_DIGI_ACCELEPORT=m
+CONFIG_USB_SERIAL_EMPEG=m
+CONFIG_USB_SERIAL_FTDI_SIO=m
+CONFIG_USB_SERIAL_VISOR=m
+CONFIG_USB_SERIAL_IPAQ=m
+CONFIG_USB_SERIAL_IR=m
+CONFIG_USB_SERIAL_EDGEPORT=m
+CONFIG_USB_SERIAL_EDGEPORT_TI=m
+CONFIG_USB_SERIAL_KEYSPAN_PDA=m
+CONFIG_USB_SERIAL_KEYSPAN=m
+CONFIG_USB_SERIAL_KEYSPAN_MPR=y
+CONFIG_USB_SERIAL_KEYSPAN_USA28=y
+CONFIG_USB_SERIAL_KEYSPAN_USA28X=y
+CONFIG_USB_SERIAL_KEYSPAN_USA28XA=y
+CONFIG_USB_SERIAL_KEYSPAN_USA28XB=y
+CONFIG_USB_SERIAL_KEYSPAN_USA19=y
+CONFIG_USB_SERIAL_KEYSPAN_USA18X=y
+CONFIG_USB_SERIAL_KEYSPAN_USA19W=y
+CONFIG_USB_SERIAL_KEYSPAN_USA19QW=y
+CONFIG_USB_SERIAL_KEYSPAN_USA19QI=y
+CONFIG_USB_SERIAL_KEYSPAN_USA49W=y
+CONFIG_USB_SERIAL_KEYSPAN_USA49WLC=y
+CONFIG_USB_SERIAL_KLSI=m
+CONFIG_USB_SERIAL_KOBIL_SCT=m
+CONFIG_USB_SERIAL_MCT_U232=m
+CONFIG_USB_SERIAL_PL2303=m
+CONFIG_USB_SERIAL_SAFE=m
+CONFIG_USB_SERIAL_SAFE_PADDED=y
+CONFIG_USB_SERIAL_CYBERJACK=m
+CONFIG_USB_SERIAL_XIRCOM=m
+CONFIG_USB_SERIAL_OMNINET=m
+CONFIG_USB_EZUSB=y
+
+#
+# USB Miscellaneous drivers
+#
+CONFIG_USB_EMI62=m
+CONFIG_USB_EMI26=m
+# CONFIG_USB_TIGL is not set
+# CONFIG_USB_AUERSWALD is not set
+# CONFIG_USB_RIO500 is not set
+CONFIG_USB_LEGOTOWER=m
+# CONFIG_USB_LCD is not set
+CONFIG_USB_LED=m
+CONFIG_USB_CYTHERM=m
+# CONFIG_USB_TEST is not set
+
+#
+# USB Gadget Support
+#
+# CONFIG_USB_GADGET is not set
+
+#
+# InfiniBand support
+#
+CONFIG_INFINIBAND=m
+CONFIG_INFINIBAND_IPOIB=m
+# CONFIG_INFINIBAND_SDP is not set
+# CONFIG_INFINIBAND_SRP is not set
+# CONFIG_INFINIBAND_UDAPL_HELPER is not set
+CONFIG_INFINIBAND_MELLANOX_HCA=m
+CONFIG_AUDIT=m
+
+#
+# File systems
+#
+CONFIG_EXT2_FS=y
+CONFIG_EXT2_FS_XATTR=y
+CONFIG_EXT2_FS_POSIX_ACL=y
+CONFIG_EXT2_FS_SECURITY=y
+CONFIG_EXT3_FS=y
+CONFIG_EXT3_FS_XATTR=y
+CONFIG_EXT3_FS_POSIX_ACL=y
+CONFIG_EXT3_FS_SECURITY=y
+CONFIG_JBD=y
+# CONFIG_JBD_DEBUG is not set
+CONFIG_FS_MBCACHE=y
+CONFIG_REISERFS_FS=y
+# CONFIG_REISERFS_CHECK is not set
+CONFIG_REISERFS_PROC_INFO=y
+CONFIG_REISERFS_FS_XATTR=y
+CONFIG_REISERFS_FS_POSIX_ACL=y
+CONFIG_REISERFS_FS_SECURITY=y
+CONFIG_JFS_FS=m
+CONFIG_JFS_POSIX_ACL=y
+CONFIG_JFS_DMAPI=y
+# CONFIG_JFS_DEBUG is not set
+CONFIG_JFS_STATISTICS=y
+CONFIG_FS_POSIX_ACL=y
+CONFIG_XFS_FS=m
+CONFIG_XFS_RT=y
+CONFIG_XFS_QUOTA=m
+CONFIG_XFS_DMAPI=y
+CONFIG_XFS_SECURITY=y
+CONFIG_XFS_POSIX_ACL=y
+CONFIG_MINIX_FS=m
+# CONFIG_ROMFS_FS is not set
+CONFIG_DMAPI=m
+# CONFIG_DMAPI_DEBUG is not set
+CONFIG_QUOTA=y
+CONFIG_QFMT_V1=m
+CONFIG_QFMT_V2=m
+CONFIG_QUOTACTL=y
+CONFIG_AUTOFS_FS=y
+# CONFIG_AUTOFS4_FS is not set
+
+#
+# CD-ROM/DVD Filesystems
+#
+CONFIG_ISO9660_FS=y
+CONFIG_JOLIET=y
+CONFIG_ZISOFS=y
+CONFIG_ZISOFS_FS=y
+CONFIG_UDF_FS=m
+
+#
+# DOS/FAT/NT Filesystems
+#
+CONFIG_FAT_FS=y
+CONFIG_MSDOS_FS=y
+CONFIG_VFAT_FS=y
+# CONFIG_NTFS_FS is not set
+
+#
+# Pseudo filesystems
+#
+CONFIG_PROC_FS=y
+CONFIG_PROC_KCORE=y
+# CONFIG_DEVFS_FS is not set
+CONFIG_DEVPTS_FS_XATTR=y
+CONFIG_DEVPTS_FS_SECURITY=y
+CONFIG_TMPFS=y
+CONFIG_HUGETLBFS=y
+CONFIG_HUGETLB_PAGE=y
+CONFIG_RAMFS=y
+CONFIG_RELAYFS_FS=m
+# CONFIG_KLOG_CHANNEL is not set
+
+#
+# Miscellaneous filesystems
+#
+# CONFIG_ADFS_FS is not set
+# CONFIG_AFFS_FS is not set
+CONFIG_HFS_FS=m
+CONFIG_HFSPLUS_FS=m
+# CONFIG_BEFS_FS is not set
+# CONFIG_BFS_FS is not set
+# CONFIG_EFS_FS is not set
+CONFIG_CRAMFS=y
+# CONFIG_VXFS_FS is not set
+# CONFIG_HPFS_FS is not set
+# CONFIG_QNX4FS_FS is not set
+# CONFIG_SYSV_FS is not set
+CONFIG_UFS_FS=m
+# CONFIG_UFS_FS_WRITE is not set
+
+#
+# Network File Systems
+#
+CONFIG_NFS_FS=y
+CONFIG_NFS_V3=y
+CONFIG_NFS_ACL=y
+CONFIG_NFS_V4=y
+CONFIG_NFS_DIRECTIO=y
+CONFIG_NFSD=m
+CONFIG_NFSD_V3=y
+CONFIG_NFSD_ACL=y
+CONFIG_NFS_ACL_SUPPORT=y
+# CONFIG_NFSD_V4 is not set
+CONFIG_NFSD_TCP=y
+CONFIG_LOCKD=y
+CONFIG_STATD=y
+CONFIG_LOCKD_V4=y
+CONFIG_EXPORTFS=m
+CONFIG_SUNRPC=y
+CONFIG_SUNRPC_GSS=y
+CONFIG_RPCSEC_GSS_KRB5=y
+CONFIG_SMB_FS=m
+# CONFIG_SMB_NLS_DEFAULT is not set
+CONFIG_CIFS=m
+CONFIG_CIFS_STATS=y
+CONFIG_CIFS_XATTR=y
+CONFIG_CIFS_POSIX=y
+CONFIG_NCP_FS=m
+CONFIG_NCPFS_PACKET_SIGNING=y
+CONFIG_NCPFS_IOCTL_LOCKING=y
+CONFIG_NCPFS_STRONG=y
+CONFIG_NCPFS_NFS_NS=y
+CONFIG_NCPFS_OS2_NS=y
+CONFIG_NCPFS_SMALLDOS=y
+CONFIG_NCPFS_NLS=y
+CONFIG_NCPFS_EXTRAS=y
+# CONFIG_CODA_FS is not set
+# CONFIG_INTERMEZZO_FS is not set
+# CONFIG_AFS_FS is not set
+
+#
+# Partition Types
+#
+CONFIG_PARTITION_ADVANCED=y
+# CONFIG_ACORN_PARTITION is not set
+CONFIG_OSF_PARTITION=y
+CONFIG_AMIGA_PARTITION=y
+CONFIG_ATARI_PARTITION=y
+CONFIG_MAC_PARTITION=y
+CONFIG_MSDOS_PARTITION=y
+CONFIG_BSD_DISKLABEL=y
+CONFIG_MINIX_SUBPARTITION=y
+CONFIG_SOLARIS_X86_PARTITION=y
+CONFIG_UNIXWARE_DISKLABEL=y
+CONFIG_LDM_PARTITION=y
+# CONFIG_LDM_DEBUG is not set
+CONFIG_NEC98_PARTITION=y
+CONFIG_SGI_PARTITION=y
+CONFIG_ULTRIX_PARTITION=y
+CONFIG_SUN_PARTITION=y
+CONFIG_EFI_PARTITION=y
+
+#
+# Native Language Support
+#
+CONFIG_NLS=y
+CONFIG_NLS_DEFAULT="iso8859-1"
+CONFIG_NLS_CODEPAGE_437=m
+CONFIG_NLS_CODEPAGE_737=m
+CONFIG_NLS_CODEPAGE_775=m
+CONFIG_NLS_CODEPAGE_850=m
+CONFIG_NLS_CODEPAGE_852=m
+CONFIG_NLS_CODEPAGE_855=m
+CONFIG_NLS_CODEPAGE_857=m
+CONFIG_NLS_CODEPAGE_860=m
+CONFIG_NLS_CODEPAGE_861=m
+CONFIG_NLS_CODEPAGE_862=m
+CONFIG_NLS_CODEPAGE_863=m
+CONFIG_NLS_CODEPAGE_864=m
+CONFIG_NLS_CODEPAGE_865=m
+CONFIG_NLS_CODEPAGE_866=m
+CONFIG_NLS_CODEPAGE_869=m
+CONFIG_NLS_CODEPAGE_936=m
+CONFIG_NLS_CODEPAGE_950=m
+CONFIG_NLS_CODEPAGE_932=m
+CONFIG_NLS_CODEPAGE_949=m
+CONFIG_NLS_CODEPAGE_874=m
+CONFIG_NLS_ISO8859_8=m
+CONFIG_NLS_CODEPAGE_1250=m
+CONFIG_NLS_CODEPAGE_1251=m
+CONFIG_NLS_ISO8859_1=m
+CONFIG_NLS_ISO8859_2=m
+CONFIG_NLS_ISO8859_3=m
+CONFIG_NLS_ISO8859_4=m
+CONFIG_NLS_ISO8859_5=m
+CONFIG_NLS_ISO8859_6=m
+CONFIG_NLS_ISO8859_7=m
+CONFIG_NLS_ISO8859_9=m
+CONFIG_NLS_ISO8859_13=m
+CONFIG_NLS_ISO8859_14=m
+CONFIG_NLS_ISO8859_15=m
+CONFIG_NLS_KOI8_R=m
+CONFIG_NLS_KOI8_U=m
+CONFIG_NLS_UTF8=m
+CONFIG_FSHOOKS=y
+
+#
+# Profiling support
+#
+CONFIG_PROFILING=y
+CONFIG_OPROFILE=y
+
+#
+# Kernel hacking
+#
+CONFIG_KERNTYPES=y
+CONFIG_CRASH_DUMP=m
+CONFIG_CRASH_DUMP_BLOCKDEV=m
+CONFIG_CRASH_DUMP_NETDEV=m
+# CONFIG_CRASH_DUMP_MEMDEV is not set
+# CONFIG_CRASH_DUMP_SOFTBOOT is not set
+CONFIG_CRASH_DUMP_COMPRESS_RLE=m
+CONFIG_CRASH_DUMP_COMPRESS_GZIP=m
+CONFIG_DEBUG_KERNEL=y
+CONFIG_DEBUG_STACKOVERFLOW=y
+CONFIG_DEBUG_STACK_USAGE=y
+# CONFIG_DEBUG_SLAB is not set
+CONFIG_MAGIC_SYSRQ=y
+CONFIG_DEBUGGER=y
+CONFIG_XMON=y
+# CONFIG_XMON_DEFAULT is not set
+CONFIG_KDB=y
+CONFIG_KDB_MODULES=y
+CONFIG_KDB_OFF=y
+# CONFIG_PPCDBG is not set
+# CONFIG_DEBUG_INFO is not set
+CONFIG_IRQSTACKS=y
+
+#
+# Security options
+#
+CONFIG_SECURITY=y
+CONFIG_SECURITY_NETWORK=y
+CONFIG_SECURITY_CAPABILITIES=m
+CONFIG_SECURITY_ROOTPLUG=m
+CONFIG_SECURITY_SELINUX=y
+CONFIG_SECURITY_SELINUX_BOOTPARAM=y
+CONFIG_SECURITY_SELINUX_DEVELOP=y
+CONFIG_SECURITY_SELINUX_MLS=y
+
+#
+# IBM Crypto Hardware support
+#
+CONFIG_IBM_CRYPTO=m
+CONFIG_ICA_LEEDSLITE=m
+
+#
+# Cryptographic options
+#
+CONFIG_CRYPTO=y
+CONFIG_CRYPTO_HMAC=y
+CONFIG_CRYPTO_NULL=m
+CONFIG_CRYPTO_MD4=m
+CONFIG_CRYPTO_MD5=y
+CONFIG_CRYPTO_SHA1=m
+CONFIG_CRYPTO_SHA256=m
+CONFIG_CRYPTO_SHA512=m
+CONFIG_CRYPTO_DES=y
+CONFIG_CRYPTO_BLOWFISH=m
+CONFIG_CRYPTO_TWOFISH=m
+CONFIG_CRYPTO_SERPENT=m
+CONFIG_CRYPTO_AES=m
+CONFIG_CRYPTO_CAST5=m
+CONFIG_CRYPTO_CAST6=m
+CONFIG_CRYPTO_ARC4=m
+CONFIG_CRYPTO_DEFLATE=m
+CONFIG_CRYPTO_MICHAEL_MIC=m
+CONFIG_CRYPTO_TEST=m
+
+#
+# Library routines
+#
+CONFIG_CRC32=y
+CONFIG_QSORT=y
+CONFIG_ZLIB_INFLATE=y
+CONFIG_ZLIB_DEFLATE=m
+
+#
+# Build options
+#
+CONFIG_SUSE_KERNEL=y
+CONFIG_CFGNAME="pseries64"
+CONFIG_RELEASE="7.141"
--- /dev/null
+#
+# Automatically generated make config: don't edit
+#
+CONFIG_64BIT=y
+CONFIG_MMU=y
+CONFIG_RWSEM_XCHGADD_ALGORITHM=y
+CONFIG_GENERIC_ISA_DMA=y
+CONFIG_HAVE_DEC_LOCK=y
+CONFIG_EARLY_PRINTK=y
+CONFIG_COMPAT=y
+CONFIG_FRAME_POINTER=y
+CONFIG_FORCE_MAX_ZONEORDER=13
+
+#
+# Code maturity level options
+#
+CONFIG_EXPERIMENTAL=y
+CONFIG_CLEAN_COMPILE=y
+CONFIG_STANDALONE=y
+
+#
+# General setup
+#
+CONFIG_SWAP=y
+CONFIG_SYSVIPC=y
+CONFIG_POSIX_MQUEUE=y
+CONFIG_BSD_PROCESS_ACCT=y
+CONFIG_SYSCTL=y
+CONFIG_LOG_BUF_SHIFT=19
+CONFIG_HOTPLUG=y
+CONFIG_EVLOG=y
+# CONFIG_EVLOG_FWPRINTK is not set
+CONFIG_IKCONFIG=y
+CONFIG_IKCONFIG_PROC=y
+# CONFIG_EMBEDDED is not set
+
+#
+# Class Based Kernel Resource Management
+#
+CONFIG_CKRM=y
+CONFIG_RCFS_FS=m
+CONFIG_CKRM_TYPE_TASKCLASS=y
+CONFIG_CKRM_RES_NUMTASKS=m
+CONFIG_CKRM_CPU_SCHEDULE=y
+# CONFIG_CKRM_CPU_SCHEDULE_AT_BOOT is not set
+CONFIG_CKRM_TYPE_SOCKETCLASS=y
+CONFIG_CKRM_RBCE=m
+CONFIG_CKRM_CRBCE=m
+CONFIG_DELAY_ACCT=y
+CONFIG_KALLSYMS=y
+CONFIG_FUTEX=y
+CONFIG_EPOLL=y
+CONFIG_IOSCHED_NOOP=y
+CONFIG_IOSCHED_AS=y
+CONFIG_IOSCHED_DEADLINE=y
+CONFIG_IOSCHED_CFQ=y
+# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
+
+#
+# Loadable module support
+#
+CONFIG_MODULES=y
+CONFIG_MODULE_UNLOAD=y
+# CONFIG_MODULE_FORCE_UNLOAD is not set
+CONFIG_OBSOLETE_MODPARM=y
+CONFIG_MODVERSIONS=y
+CONFIG_KMOD=y
+CONFIG_STOP_MACHINE=y
+
+#
+# Platform support
+#
+# CONFIG_PPC_ISERIES is not set
+CONFIG_PPC_PSERIES=y
+CONFIG_PPC=y
+CONFIG_PPC64=y
+CONFIG_PPC_OF=y
+CONFIG_ALTIVEC=y
+# CONFIG_PPC_PMAC is not set
+CONFIG_PPC_SPLPAR=y
+# CONFIG_BOOTX_TEXT is not set
+# CONFIG_POWER4_ONLY is not set
+# CONFIG_IOMMU_VMERGE is not set
+CONFIG_SMP=y
+CONFIG_IRQ_ALL_CPUS=y
+CONFIG_NR_CPUS=128
+# CONFIG_HMT is not set
+CONFIG_DISCONTIGMEM=y
+CONFIG_NUMA=y
+CONFIG_SCHED_SMT=y
+CONFIG_PPC_RTAS=y
+CONFIG_RTAS_FLASH=m
+CONFIG_SCANLOG=m
+CONFIG_LPARCFG=y
+CONFIG_PPC_VPURR=y
+
+#
+# General setup
+#
+CONFIG_PCI=y
+CONFIG_PCI_DOMAINS=y
+CONFIG_BINFMT_ELF=y
+CONFIG_BINFMT_MISC=m
+# CONFIG_PCI_LEGACY_PROC is not set
+# CONFIG_PCI_NAMES is not set
+CONFIG_HOTPLUG_CPU=y
+
+#
+# PCMCIA/CardBus support
+#
+# CONFIG_PCMCIA is not set
+
+#
+# PCI Hotplug Support
+#
+CONFIG_HOTPLUG_PCI=y
+# CONFIG_HOTPLUG_PCI_FAKE is not set
+# CONFIG_HOTPLUG_PCI_CPCI is not set
+# CONFIG_HOTPLUG_PCI_PCIE is not set
+# CONFIG_HOTPLUG_PCI_SHPC is not set
+CONFIG_HOTPLUG_PCI_RPA=y
+CONFIG_HOTPLUG_PCI_RPA_DLPAR=y
+CONFIG_PROC_DEVICETREE=y
+# CONFIG_CMDLINE_BOOL is not set
+
+#
+# Device Drivers
+#
+
+#
+# Generic Driver Options
+#
+CONFIG_FW_LOADER=m
+# CONFIG_DEBUG_DRIVER is not set
+
+#
+# Memory Technology Devices (MTD)
+#
+# CONFIG_MTD is not set
+
+#
+# Bluesmoke - error detection and reporting (RAS)
+#
+# CONFIG_BLUESMOKE is not set
+
+#
+# Parallel port support
+#
+CONFIG_PARPORT=m
+CONFIG_PARPORT_PC=m
+CONFIG_PARPORT_PC_CML1=m
+CONFIG_PARPORT_SERIAL=m
+CONFIG_PARPORT_PC_FIFO=y
+CONFIG_PARPORT_PC_SUPERIO=y
+CONFIG_PARPORT_OTHER=y
+CONFIG_PARPORT_1284=y
+
+#
+# Plug and Play support
+#
+
+#
+# Block devices
+#
+CONFIG_BLK_DEV_FD=y
+# CONFIG_PARIDE is not set
+# CONFIG_BLK_CPQ_DA is not set
+# CONFIG_BLK_CPQ_CISS_DA is not set
+# CONFIG_BLK_DEV_DAC960 is not set
+# CONFIG_BLK_DEV_UMEM is not set
+CONFIG_BLK_DEV_LOOP=y
+CONFIG_BLK_DEV_CRYPTOLOOP=m
+CONFIG_BLK_DEV_NBD=m
+# CONFIG_BLK_DEV_CARMEL is not set
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_SIZE=123456
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_CIPHER_TWOFISH=m
+
+#
+# ATA/ATAPI/MFM/RLL support
+#
+CONFIG_IDE=y
+CONFIG_BLK_DEV_IDE=y
+
+#
+# Please see Documentation/ide.txt for help/info on IDE drives
+#
+CONFIG_BLK_DEV_IDEDISK=y
+# CONFIG_IDEDISK_MULTI_MODE is not set
+# CONFIG_IDEDISK_STROKE is not set
+CONFIG_BLK_DEV_IDECD=y
+# CONFIG_BLK_DEV_IDETAPE is not set
+# CONFIG_BLK_DEV_IDEFLOPPY is not set
+# CONFIG_BLK_DEV_IDESCSI is not set
+CONFIG_IDE_TASK_IOCTL=y
+# CONFIG_IDE_TASKFILE_IO is not set
+
+#
+# IDE chipset support/bugfixes
+#
+CONFIG_IDE_GENERIC=y
+CONFIG_BLK_DEV_IDEPCI=y
+CONFIG_IDEPCI_SHARE_IRQ=y
+# CONFIG_BLK_DEV_OFFBOARD is not set
+CONFIG_BLK_DEV_GENERIC=y
+# CONFIG_BLK_DEV_OPTI621 is not set
+CONFIG_BLK_DEV_SL82C105=y
+CONFIG_BLK_DEV_IDEDMA_PCI=y
+CONFIG_BLK_DEV_IDEDMA_FORCED=y
+CONFIG_IDEDMA_PCI_AUTO=y
+# CONFIG_IDEDMA_ONLYDISK is not set
+CONFIG_BLK_DEV_ADMA=y
+# CONFIG_BLK_DEV_AEC62XX is not set
+# CONFIG_BLK_DEV_ALI15X3 is not set
+CONFIG_BLK_DEV_AMD74XX=y
+# CONFIG_BLK_DEV_CMD64X is not set
+# CONFIG_BLK_DEV_TRIFLEX is not set
+# CONFIG_BLK_DEV_CY82C693 is not set
+# CONFIG_BLK_DEV_CS5520 is not set
+# CONFIG_BLK_DEV_CS5530 is not set
+# CONFIG_BLK_DEV_HPT34X is not set
+# CONFIG_BLK_DEV_HPT366 is not set
+# CONFIG_BLK_DEV_SC1200 is not set
+# CONFIG_BLK_DEV_PIIX is not set
+# CONFIG_BLK_DEV_NS87415 is not set
+CONFIG_BLK_DEV_PDC202XX_OLD=y
+CONFIG_PDC202XX_BURST=y
+CONFIG_BLK_DEV_PDC202XX_NEW=y
+# CONFIG_PDC202XX_FORCE is not set
+# CONFIG_BLK_DEV_SVWKS is not set
+CONFIG_BLK_DEV_SIIMAGE=y
+# CONFIG_BLK_DEV_SLC90E66 is not set
+# CONFIG_BLK_DEV_TRM290 is not set
+# CONFIG_BLK_DEV_VIA82CXXX is not set
+CONFIG_BLK_DEV_IDEDMA=y
+# CONFIG_IDEDMA_IVB is not set
+CONFIG_IDEDMA_AUTO=y
+# CONFIG_BLK_DEV_HD is not set
+
+#
+# SCSI device support
+#
+CONFIG_SCSI=m
+CONFIG_SCSI_PROC_FS=y
+
+#
+# SCSI support type (disk, tape, CD-ROM)
+#
+CONFIG_BLK_DEV_SD=m
+CONFIG_CHR_DEV_ST=m
+# CONFIG_CHR_DEV_OSST is not set
+CONFIG_BLK_DEV_SR=m
+CONFIG_BLK_DEV_SR_VENDOR=y
+CONFIG_CHR_DEV_SG=m
+CONFIG_CHR_DEV_SCH=m
+
+#
+# Some SCSI devices (e.g. CD jukebox) support multiple LUNs
+#
+CONFIG_SCSI_MULTI_LUN=y
+CONFIG_SCSI_CONSTANTS=y
+CONFIG_SCSI_LOGGING=y
+
+#
+# SCSI Transport Attributes
+#
+CONFIG_SCSI_SPI_ATTRS=m
+CONFIG_SCSI_FC_ATTRS=m
+
+#
+# SCSI low-level drivers
+#
+# CONFIG_BLK_DEV_3W_XXXX_RAID is not set
+# CONFIG_SCSI_ACARD is not set
+# CONFIG_SCSI_AACRAID is not set
+# CONFIG_SCSI_AIC7XXX is not set
+# CONFIG_SCSI_AIC7XXX_OLD is not set
+# CONFIG_SCSI_AIC79XX is not set
+# CONFIG_SCSI_AIC79XX_NEW is not set
+# CONFIG_SCSI_ADVANSYS is not set
+# CONFIG_MEGARAID_NEWGEN is not set
+# CONFIG_MEGARAID_LEGACY is not set
+# CONFIG_SCSI_SATA is not set
+# CONFIG_SCSI_BUSLOGIC is not set
+# CONFIG_SCSI_CPQFCTS is not set
+# CONFIG_SCSI_DMX3191D is not set
+# CONFIG_SCSI_EATA is not set
+# CONFIG_SCSI_EATA_PIO is not set
+# CONFIG_SCSI_FUTURE_DOMAIN is not set
+# CONFIG_SCSI_GDTH is not set
+# CONFIG_SCSI_IPS is not set
+CONFIG_SCSI_IBMVSCSI=m
+CONFIG_SCSI_IBMVSCSIS=m
+# CONFIG_SCSI_INIA100 is not set
+# CONFIG_SCSI_PPA is not set
+# CONFIG_SCSI_IMM is not set
+CONFIG_SCSI_SYM53C8XX_2=m
+CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=0
+CONFIG_SCSI_SYM53C8XX_DEFAULT_TAGS=16
+CONFIG_SCSI_SYM53C8XX_MAX_TAGS=64
+# CONFIG_SCSI_SYM53C8XX_IOMAPPED is not set
+CONFIG_SCSI_LPFC=m
+CONFIG_SCSI_IPR=m
+CONFIG_SCSI_IPR_TRACE=y
+CONFIG_SCSI_IPR_DUMP=y
+# CONFIG_SCSI_QLOGIC_ISP is not set
+# CONFIG_SCSI_QLOGIC_FC is not set
+# CONFIG_SCSI_QLOGIC_1280 is not set
+CONFIG_SCSI_QLA2XXX=m
+# CONFIG_SCSI_QLA21XX is not set
+# CONFIG_SCSI_QLA22XX is not set
+CONFIG_SCSI_QLA2300=m
+# CONFIG_SCSI_QLA2322 is not set
+# CONFIG_SCSI_QLA6312 is not set
+# CONFIG_SCSI_QLA6322 is not set
+CONFIG_SCSI_QLA2XXX_FAILOVER=y
+CONFIG_SCSI_QLA4XXX=m
+CONFIG_SCSI_QLA4XXX_FAILOVER=y
+# CONFIG_SCSI_DC395x is not set
+# CONFIG_SCSI_DC390T is not set
+CONFIG_SCSI_DEBUG=m
+
+#
+# Multi-device support (RAID and LVM)
+#
+CONFIG_MD=y
+CONFIG_BLK_DEV_MD=y
+CONFIG_MD_LINEAR=m
+CONFIG_MD_RAID0=m
+CONFIG_MD_RAID1=m
+CONFIG_MD_RAID5=m
+CONFIG_MD_RAID6=m
+CONFIG_MD_MULTIPATH=m
+CONFIG_BLK_DEV_DM=m
+CONFIG_DM_CRYPT=m
+CONFIG_DM_MULTIPATH=m
+CONFIG_DM_SNAPSHOT=m
+CONFIG_DM_MIRROR=m
+CONFIG_DM_ZERO=m
+CONFIG_DM_FLAKEY=m
+CONFIG_BLK_DEV_DM_BBR=m
+
+#
+# Fusion MPT device support
+#
+# CONFIG_FUSION is not set
+
+#
+# IEEE 1394 (FireWire) support
+#
+CONFIG_IEEE1394=m
+
+#
+# Subsystem Options
+#
+# CONFIG_IEEE1394_VERBOSEDEBUG is not set
+# CONFIG_IEEE1394_OUI_DB is not set
+CONFIG_IEEE1394_EXTRA_CONFIG_ROMS=y
+CONFIG_IEEE1394_CONFIG_ROM_IP1394=y
+
+#
+# Device Drivers
+#
+
+#
+# Texas Instruments PCILynx requires I2C
+#
+CONFIG_IEEE1394_OHCI1394=m
+
+#
+# Protocol Drivers
+#
+CONFIG_IEEE1394_VIDEO1394=m
+CONFIG_IEEE1394_SBP2=m
+# CONFIG_IEEE1394_SBP2_PHYS_DMA is not set
+CONFIG_IEEE1394_ETH1394=m
+CONFIG_IEEE1394_DV1394=m
+CONFIG_IEEE1394_RAWIO=m
+CONFIG_IEEE1394_CMP=m
+CONFIG_IEEE1394_AMDTP=m
+
+#
+# I2O device support
+#
+# CONFIG_I2O is not set
+
+#
+# Macintosh device drivers
+#
+
+#
+# Networking support
+#
+CONFIG_NET=y
+
+#
+# Networking options
+#
+CONFIG_PACKET=y
+CONFIG_PACKET_MMAP=y
+CONFIG_NETLINK_DEV=y
+CONFIG_UNIX=y
+CONFIG_NET_KEY=y
+CONFIG_INET=y
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_IP_MULTIPLE_TABLES=y
+# CONFIG_IP_ROUTE_FWMARK is not set
+CONFIG_IP_ROUTE_NAT=y
+CONFIG_IP_ROUTE_MULTIPATH=y
+CONFIG_IP_ROUTE_TOS=y
+# CONFIG_IP_ROUTE_VERBOSE is not set
+# CONFIG_IP_PNP is not set
+CONFIG_NET_IPIP=m
+CONFIG_NET_IPGRE=m
+CONFIG_NET_IPGRE_BROADCAST=y
+CONFIG_IP_MROUTE=y
+CONFIG_IP_PIMSM_V1=y
+CONFIG_IP_PIMSM_V2=y
+# CONFIG_ARPD is not set
+CONFIG_SYN_COOKIES=y
+CONFIG_INET_AH=m
+CONFIG_INET_ESP=m
+CONFIG_INET_IPCOMP=m
+# CONFIG_ACCEPT_QUEUES is not set
+
+#
+# IP: Virtual Server Configuration
+#
+# CONFIG_IP_VS is not set
+CONFIG_IPV6=m
+CONFIG_IPV6_SUBTREES=y
+CONFIG_IPV6_PRIVACY=y
+CONFIG_IPV6_NDISC_NEW=y
+CONFIG_INET6_AH=m
+CONFIG_INET6_ESP=m
+CONFIG_INET6_IPCOMP=m
+CONFIG_IPV6_TUNNEL=m
+
+#
+# MOBILE IPv6 (EXPERIMENTAL)
+#
+CONFIG_IPV6_MOBILITY=m
+CONFIG_IPV6_MOBILITY_MN=m
+CONFIG_IPV6_MOBILITY_HA=m
+# CONFIG_IPV6_MOBILITY_DEBUG is not set
+# CONFIG_DECNET is not set
+CONFIG_BRIDGE=m
+CONFIG_NETFILTER=y
+# CONFIG_NETFILTER_DEBUG is not set
+CONFIG_BRIDGE_NETFILTER=y
+
+#
+# IP: Netfilter Configuration
+#
+CONFIG_IP_NF_CONNTRACK=m
+CONFIG_IP_NF_FTP=m
+CONFIG_IP_NF_IRC=m
+CONFIG_IP_NF_TFTP=m
+CONFIG_IP_NF_AMANDA=m
+CONFIG_IP_NF_QUEUE=m
+CONFIG_IP_NF_IPTABLES=m
+CONFIG_IP_NF_MATCH_LIMIT=m
+CONFIG_IP_NF_MATCH_IPRANGE=m
+CONFIG_IP_NF_MATCH_MAC=m
+CONFIG_IP_NF_MATCH_PKTTYPE=m
+CONFIG_IP_NF_MATCH_POLICY=m
+CONFIG_IP_NF_MATCH_MARK=m
+CONFIG_IP_NF_MATCH_MULTIPORT=m
+CONFIG_IP_NF_MATCH_TOS=m
+CONFIG_IP_NF_MATCH_RECENT=m
+CONFIG_IP_NF_MATCH_ECN=m
+CONFIG_IP_NF_MATCH_DSCP=m
+CONFIG_IP_NF_MATCH_AH_ESP=m
+CONFIG_IP_NF_MATCH_LENGTH=m
+CONFIG_IP_NF_MATCH_TTL=m
+CONFIG_IP_NF_MATCH_TCPMSS=m
+CONFIG_IP_NF_MATCH_HELPER=m
+CONFIG_IP_NF_MATCH_STATE=m
+CONFIG_IP_NF_MATCH_CONNTRACK=m
+CONFIG_IP_NF_MATCH_OWNER=m
+CONFIG_IP_NF_MATCH_PHYSDEV=m
+CONFIG_IP_NF_FILTER=m
+CONFIG_IP_NF_TARGET_REJECT=m
+CONFIG_IP_NF_NAT=m
+CONFIG_IP_NF_NAT_NEEDED=y
+CONFIG_IP_NF_TARGET_MASQUERADE=m
+CONFIG_IP_NF_TARGET_REDIRECT=m
+CONFIG_IP_NF_TARGET_NETMAP=m
+CONFIG_IP_NF_TARGET_SAME=m
+CONFIG_IP_NF_NAT_LOCAL=y
+CONFIG_IP_NF_NAT_SNMP_BASIC=m
+CONFIG_IP_NF_NAT_IRC=m
+CONFIG_IP_NF_NAT_FTP=m
+CONFIG_IP_NF_NAT_TFTP=m
+CONFIG_IP_NF_NAT_AMANDA=m
+CONFIG_IP_NF_MANGLE=m
+CONFIG_IP_NF_TARGET_TOS=m
+CONFIG_IP_NF_TARGET_ECN=m
+CONFIG_IP_NF_TARGET_DSCP=m
+CONFIG_IP_NF_TARGET_MARK=m
+CONFIG_IP_NF_TARGET_CLASSIFY=m
+CONFIG_IP_NF_TARGET_LOG=m
+CONFIG_IP_NF_TARGET_ULOG=m
+CONFIG_IP_NF_TARGET_TCPMSS=m
+CONFIG_IP_NF_ARPTABLES=m
+CONFIG_IP_NF_ARPFILTER=m
+CONFIG_IP_NF_ARP_MANGLE=m
+# CONFIG_IP_NF_COMPAT_IPCHAINS is not set
+# CONFIG_IP_NF_COMPAT_IPFWADM is not set
+CONFIG_IP_NF_CONNTRACK_MARK=y
+CONFIG_IP_NF_TARGET_CONNMARK=m
+CONFIG_IP_NF_MATCH_CONNMARK=m
+CONFIG_IP_NF_TARGET_CLUSTERIP=m
+
+#
+# IPv6: Netfilter Configuration
+#
+CONFIG_IP6_NF_FTP=m
+CONFIG_IP6_NF_QUEUE=m
+CONFIG_IP6_NF_IPTABLES=m
+CONFIG_IP6_NF_MATCH_LIMIT=m
+CONFIG_IP6_NF_MATCH_MAC=m
+CONFIG_IP6_NF_MATCH_RT=m
+CONFIG_IP6_NF_MATCH_OPTS=m
+CONFIG_IP6_NF_MATCH_FRAG=m
+CONFIG_IP6_NF_MATCH_HL=m
+CONFIG_IP6_NF_MATCH_MULTIPORT=m
+CONFIG_IP6_NF_MATCH_OWNER=m
+CONFIG_IP6_NF_MATCH_MARK=m
+CONFIG_IP6_NF_MATCH_IPV6HEADER=m
+CONFIG_IP6_NF_MATCH_AHESP=m
+CONFIG_IP6_NF_MATCH_LENGTH=m
+CONFIG_IP6_NF_MATCH_EUI64=m
+CONFIG_IP6_NF_CONNTRACK=m
+CONFIG_IP6_NF_MATCH_STATE=m
+CONFIG_IP6_NF_FILTER=m
+CONFIG_IP6_NF_TARGET_LOG=m
+CONFIG_IP6_NF_TARGET_REJECT=m
+CONFIG_IP6_NF_MANGLE=m
+CONFIG_IP6_NF_TARGET_MARK=m
+
+#
+# Bridge: Netfilter Configuration
+#
+CONFIG_BRIDGE_NF_EBTABLES=m
+CONFIG_BRIDGE_EBT_BROUTE=m
+CONFIG_BRIDGE_EBT_T_FILTER=m
+CONFIG_BRIDGE_EBT_T_NAT=m
+CONFIG_BRIDGE_EBT_802_3=m
+CONFIG_BRIDGE_EBT_AMONG=m
+CONFIG_BRIDGE_EBT_ARP=m
+CONFIG_BRIDGE_EBT_IP=m
+CONFIG_BRIDGE_EBT_LIMIT=m
+CONFIG_BRIDGE_EBT_MARK=m
+CONFIG_BRIDGE_EBT_PKTTYPE=m
+CONFIG_BRIDGE_EBT_STP=m
+CONFIG_BRIDGE_EBT_VLAN=m
+CONFIG_BRIDGE_EBT_ARPREPLY=m
+CONFIG_BRIDGE_EBT_DNAT=m
+CONFIG_BRIDGE_EBT_MARK_T=m
+CONFIG_BRIDGE_EBT_REDIRECT=m
+# CONFIG_BRIDGE_EBT_SNAT is not set
+CONFIG_BRIDGE_EBT_LOG=m
+CONFIG_XFRM=y
+CONFIG_XFRM_USER=m
+
+#
+# SCTP Configuration (EXPERIMENTAL)
+#
+CONFIG_IP_SCTP=m
+# CONFIG_SCTP_DBG_MSG is not set
+# CONFIG_SCTP_DBG_OBJCNT is not set
+CONFIG_SCTP_HMAC_NONE=y
+# CONFIG_SCTP_HMAC_SHA1 is not set
+# CONFIG_SCTP_HMAC_MD5 is not set
+# CONFIG_ATM is not set
+CONFIG_VLAN_8021Q=m
+CONFIG_LLC=y
+CONFIG_LLC2=m
+CONFIG_IPX=m
+CONFIG_IPX_INTERN=y
+CONFIG_ATALK=m
+CONFIG_DEV_APPLETALK=y
+CONFIG_IPDDP=m
+CONFIG_IPDDP_ENCAP=y
+CONFIG_IPDDP_DECAP=y
+# CONFIG_X25 is not set
+# CONFIG_LAPB is not set
+# CONFIG_NET_DIVERT is not set
+# CONFIG_ECONET is not set
+# CONFIG_WAN_ROUTER is not set
+# CONFIG_NET_FASTROUTE is not set
+# CONFIG_NET_HW_FLOWCONTROL is not set
+
+#
+# QoS and/or fair queueing
+#
+CONFIG_NET_SCHED=y
+CONFIG_NET_SCH_CBQ=m
+CONFIG_NET_SCH_HTB=m
+CONFIG_NET_SCH_HFSC=m
+CONFIG_NET_SCH_CSZ=m
+CONFIG_NET_SCH_PRIO=m
+CONFIG_NET_SCH_RED=m
+CONFIG_NET_SCH_SFQ=m
+CONFIG_NET_SCH_TEQL=m
+CONFIG_NET_SCH_TBF=m
+CONFIG_NET_SCH_GRED=m
+CONFIG_NET_SCH_DSMARK=m
+CONFIG_NET_SCH_DELAY=m
+CONFIG_NET_SCH_INGRESS=m
+CONFIG_NET_QOS=y
+CONFIG_NET_ESTIMATOR=y
+CONFIG_NET_CLS=y
+CONFIG_NET_CLS_TCINDEX=m
+CONFIG_NET_CLS_ROUTE4=m
+CONFIG_NET_CLS_ROUTE=y
+CONFIG_NET_CLS_FW=m
+CONFIG_NET_CLS_U32=m
+CONFIG_NET_CLS_RSVP=m
+CONFIG_NET_CLS_RSVP6=m
+CONFIG_NET_CLS_POLICE=y
+
+#
+# Network testing
+#
+CONFIG_NET_PKTGEN=m
+CONFIG_NETDEVICES=y
+
+#
+# ARCnet devices
+#
+# CONFIG_ARCNET is not set
+CONFIG_DUMMY=m
+CONFIG_BONDING=m
+CONFIG_EQUALIZER=m
+CONFIG_TUN=m
+# CONFIG_ETHERTAP is not set
+
+#
+# Ethernet (10 or 100Mbit)
+#
+CONFIG_NET_ETHERNET=y
+CONFIG_MII=y
+# CONFIG_OAKNET is not set
+# CONFIG_HAPPYMEAL is not set
+# CONFIG_SUNGEM is not set
+CONFIG_NET_VENDOR_3COM=y
+CONFIG_VORTEX=m
+CONFIG_TYPHOON=m
+
+#
+# Tulip family network device support
+#
+# CONFIG_NET_TULIP is not set
+# CONFIG_HP100 is not set
+CONFIG_NET_PCI=y
+CONFIG_PCNET32=m
+# CONFIG_AMD8111_ETH is not set
+# CONFIG_ADAPTEC_STARFIRE is not set
+# CONFIG_B44 is not set
+# CONFIG_FORCEDETH is not set
+# CONFIG_DGRS is not set
+# CONFIG_EEPRO100 is not set
+CONFIG_E100=m
+# CONFIG_FEALNX is not set
+# CONFIG_NATSEMI is not set
+# CONFIG_NE2K_PCI is not set
+# CONFIG_8139CP is not set
+# CONFIG_8139TOO is not set
+# CONFIG_SIS900 is not set
+# CONFIG_EPIC100 is not set
+# CONFIG_SUNDANCE is not set
+# CONFIG_VIA_RHINE is not set
+
+#
+# Ethernet (1000 Mbit)
+#
+CONFIG_ACENIC=m
+CONFIG_ACENIC_OMIT_TIGON_I=y
+# CONFIG_DL2K is not set
+CONFIG_E1000=m
+CONFIG_E1000_NAPI=y
+CONFIG_E1000_NEW=m
+CONFIG_E1000_NEW_NAPI=y
+# CONFIG_NS83820 is not set
+# CONFIG_HAMACHI is not set
+# CONFIG_YELLOWFIN is not set
+# CONFIG_R8169 is not set
+# CONFIG_SIS190 is not set
+# CONFIG_SK98LIN is not set
+CONFIG_TIGON3=m
+CONFIG_NET_BROADCOM=m
+CONFIG_NET_BROADCOM_NEW=m
+# CONFIG_NET_BCM44 is not set
+CONFIG_TIGON3_NEW=m
+
+#
+# Ethernet (10000 Mbit)
+#
+CONFIG_IXGB=m
+CONFIG_IXGB_NAPI=y
+CONFIG_S2IO=m
+CONFIG_S2IO_NAPI=y
+CONFIG_IBMVETH=m
+# CONFIG_FDDI is not set
+# CONFIG_HIPPI is not set
+# CONFIG_PLIP is not set
+CONFIG_PPP=m
+CONFIG_PPP_MULTILINK=y
+CONFIG_PPP_FILTER=y
+CONFIG_PPP_ASYNC=m
+CONFIG_PPP_SYNC_TTY=m
+CONFIG_PPP_DEFLATE=m
+CONFIG_PPP_BSDCOMP=m
+CONFIG_PPP_MPPE=m
+CONFIG_PPPOE=m
+CONFIG_SLIP=m
+CONFIG_SLIP_COMPRESSED=y
+CONFIG_SLIP_SMART=y
+# CONFIG_SLIP_MODE_SLIP6 is not set
+
+#
+# Wireless LAN (non-hamradio)
+#
+# CONFIG_NET_RADIO is not set
+
+#
+# Token Ring devices
+#
+CONFIG_TR=y
+CONFIG_IBMOL=m
+# CONFIG_IBMLS is not set
+# CONFIG_3C359 is not set
+# CONFIG_TMS380TR is not set
+CONFIG_NET_FC=y
+CONFIG_NET_LPFC=m
+CONFIG_SHAPER=m
+CONFIG_NETCONSOLE=m
+
+#
+# Wan interfaces
+#
+# CONFIG_WAN is not set
+
+#
+# Amateur Radio support
+#
+# CONFIG_HAMRADIO is not set
+
+#
+# IrDA (infrared) support
+#
+# CONFIG_IRDA is not set
+
+#
+# Bluetooth support
+#
+# CONFIG_BT is not set
+CONFIG_NETPOLL=y
+CONFIG_NETPOLL_RX=y
+CONFIG_NETPOLL_TRAP=y
+CONFIG_NET_POLL_CONTROLLER=y
+
+#
+# ISDN subsystem
+#
+# CONFIG_ISDN is not set
+
+#
+# Telephony Support
+#
+# CONFIG_PHONE is not set
+
+#
+# Input device support
+#
+CONFIG_INPUT=y
+
+#
+# Userland interfaces
+#
+CONFIG_INPUT_MOUSEDEV=y
+CONFIG_INPUT_MOUSEDEV_PSAUX=y
+CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024
+CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768
+CONFIG_INPUT_JOYDEV=m
+CONFIG_INPUT_TSDEV=m
+CONFIG_INPUT_TSDEV_SCREEN_X=240
+CONFIG_INPUT_TSDEV_SCREEN_Y=320
+CONFIG_INPUT_EVDEV=m
+# CONFIG_INPUT_EVBUG is not set
+
+#
+# Input I/O drivers
+#
+# CONFIG_GAMEPORT is not set
+CONFIG_SOUND_GAMEPORT=y
+CONFIG_SERIO=y
+CONFIG_SERIO_I8042=y
+# CONFIG_SERIO_SERPORT is not set
+# CONFIG_SERIO_CT82C710 is not set
+# CONFIG_SERIO_PARKBD is not set
+# CONFIG_SERIO_PCIPS2 is not set
+
+#
+# Input Device Drivers
+#
+CONFIG_INPUT_KEYBOARD=y
+CONFIG_KEYBOARD_ATKBD=y
+# CONFIG_KEYBOARD_SUNKBD is not set
+# CONFIG_KEYBOARD_LKKBD is not set
+# CONFIG_KEYBOARD_XTKBD is not set
+# CONFIG_KEYBOARD_NEWTON is not set
+# CONFIG_KEYBOARD_POSFILTER is not set
+CONFIG_INPUT_MOUSE=y
+CONFIG_MOUSE_PS2=y
+# CONFIG_MOUSE_SERIAL is not set
+# CONFIG_MOUSE_VSXXXAA is not set
+# CONFIG_INPUT_JOYSTICK is not set
+# CONFIG_INPUT_TOUCHSCREEN is not set
+CONFIG_INPUT_MISC=y
+CONFIG_INPUT_PCSPKR=m
+CONFIG_INPUT_UINPUT=m
+
+#
+# Character devices
+#
+CONFIG_VT=y
+CONFIG_VT_CONSOLE=y
+CONFIG_HW_CONSOLE=y
+CONFIG_ECC=m
+# CONFIG_SERIAL_NONSTANDARD is not set
+
+#
+# Serial drivers
+#
+CONFIG_SERIAL_8250=y
+CONFIG_SERIAL_8250_CONSOLE=y
+CONFIG_SERIAL_8250_NR_UARTS=4
+# CONFIG_SERIAL_8250_EXTENDED is not set
+
+#
+# Non-8250 serial port support
+#
+CONFIG_SERIAL_CORE=y
+CONFIG_SERIAL_CORE_CONSOLE=y
+# CONFIG_SERIAL_PMACZILOG is not set
+CONFIG_SERIAL_ICOM=m
+CONFIG_SERIAL_JSM=m
+CONFIG_UNIX98_PTYS=y
+CONFIG_LEGACY_PTYS=y
+CONFIG_LEGACY_PTY_COUNT=256
+# CONFIG_PRINTER is not set
+# CONFIG_PPDEV is not set
+# CONFIG_TIPAR is not set
+CONFIG_HVC_CONSOLE=y
+CONFIG_HVCS=m
+# CONFIG_QIC02_TAPE is not set
+
+#
+# IPMI
+#
+# CONFIG_IPMI_HANDLER is not set
+
+#
+# Watchdog Cards
+#
+CONFIG_WATCHDOG=y
+# CONFIG_WATCHDOG_NOWAYOUT is not set
+
+#
+# Watchdog Device Drivers
+#
+CONFIG_SOFT_WATCHDOG=m
+
+#
+# PCI-based Watchdog Cards
+#
+# CONFIG_PCIPCWATCHDOG is not set
+# CONFIG_WDTPCI is not set
+
+#
+# USB-based Watchdog Cards
+#
+# CONFIG_USBPCWATCHDOG is not set
+# CONFIG_RTC is not set
+# CONFIG_GEN_RTC is not set
+# CONFIG_DTLK is not set
+# CONFIG_R3964 is not set
+# CONFIG_APPLICOM is not set
+
+#
+# Ftape, the floppy tape device driver
+#
+# CONFIG_AGP is not set
+# CONFIG_DRM is not set
+CONFIG_RAW_DRIVER=m
+CONFIG_MAX_RAW_DEVS=4096
+
+#
+# Linux InfraRed Controller
+#
+# CONFIG_LIRC_SUPPORT is not set
+# CONFIG_LIRC_HOMEBREW is not set
+
+#
+# I2C support
+#
+# CONFIG_I2C is not set
+
+#
+# Misc devices
+#
+
+#
+# Multimedia devices
+#
+# CONFIG_VIDEO_DEV is not set
+
+#
+# Digital Video Broadcasting Devices
+#
+# CONFIG_DVB is not set
+
+#
+# Graphics support
+#
+CONFIG_FB=y
+# CONFIG_FB_PM2 is not set
+# CONFIG_FB_CYBER2000 is not set
+CONFIG_FB_OF=y
+# CONFIG_FB_CT65550 is not set
+# CONFIG_FB_IMSTT is not set
+# CONFIG_FB_S3TRIO is not set
+# CONFIG_FB_VGA16 is not set
+# CONFIG_FB_RIVA is not set
+CONFIG_FB_MATROX=y
+CONFIG_FB_MATROX_MILLENIUM=y
+CONFIG_FB_MATROX_MYSTIQUE=y
+CONFIG_FB_MATROX_G450=y
+CONFIG_FB_MATROX_G100=y
+CONFIG_FB_MATROX_MULTIHEAD=y
+# CONFIG_FB_RADEON_OLD is not set
+# CONFIG_FB_RADEON is not set
+# CONFIG_FB_ATY128 is not set
+# CONFIG_FB_ATY is not set
+# CONFIG_FB_SIS is not set
+# CONFIG_FB_NEOMAGIC is not set
+# CONFIG_FB_KYRO is not set
+# CONFIG_FB_3DFX is not set
+# CONFIG_FB_VOODOO1 is not set
+# CONFIG_FB_TRIDENT is not set
+# CONFIG_FB_VIRTUAL is not set
+
+#
+# Console display driver support
+#
+# CONFIG_VGA_CONSOLE is not set
+# CONFIG_MDA_CONSOLE is not set
+CONFIG_DUMMY_CONSOLE=y
+CONFIG_FRAMEBUFFER_CONSOLE=y
+CONFIG_PCI_CONSOLE=y
+# CONFIG_FONTS is not set
+CONFIG_FONT_8x8=y
+CONFIG_FONT_8x16=y
+
+#
+# Logo configuration
+#
+# CONFIG_LOGO is not set
+
+#
+# Bootsplash configuration
+#
+
+#
+# Sound
+#
+# CONFIG_SOUND is not set
+
+#
+# USB support
+#
+CONFIG_USB=m
+# CONFIG_USB_DEBUG is not set
+
+#
+# Miscellaneous USB options
+#
+CONFIG_USB_DEVICEFS=y
+# CONFIG_USB_BANDWIDTH is not set
+# CONFIG_USB_DYNAMIC_MINORS is not set
+
+#
+# USB Host Controller Drivers
+#
+CONFIG_USB_EHCI_HCD=m
+CONFIG_USB_EHCI_SPLIT_ISO=y
+CONFIG_USB_EHCI_ROOT_HUB_TT=y
+CONFIG_USB_OHCI_HCD=m
+# CONFIG_USB_UHCI_HCD is not set
+
+#
+# USB Device Class drivers
+#
+# CONFIG_USB_BLUETOOTH_TTY is not set
+CONFIG_USB_ACM=m
+CONFIG_USB_PRINTER=m
+CONFIG_USB_STORAGE=m
+# CONFIG_USB_STORAGE_DEBUG is not set
+CONFIG_USB_STORAGE_DATAFAB=y
+CONFIG_USB_STORAGE_FREECOM=y
+CONFIG_USB_STORAGE_ISD200=y
+CONFIG_USB_STORAGE_DPCM=y
+CONFIG_USB_STORAGE_HP8200e=y
+CONFIG_USB_STORAGE_SDDR09=y
+CONFIG_USB_STORAGE_SDDR55=y
+CONFIG_USB_STORAGE_JUMPSHOT=y
+
+#
+# USB Human Interface Devices (HID)
+#
+CONFIG_USB_HID=m
+CONFIG_USB_HIDINPUT=y
+# CONFIG_HID_FF is not set
+CONFIG_USB_HIDDEV=y
+
+#
+# USB HID Boot Protocol drivers
+#
+# CONFIG_USB_KBD is not set
+# CONFIG_USB_MOUSE is not set
+CONFIG_USB_AIPTEK=m
+CONFIG_USB_WACOM=m
+CONFIG_USB_KBTAB=m
+CONFIG_USB_POWERMATE=m
+CONFIG_USB_MTOUCH=m
+CONFIG_USB_XPAD=m
+CONFIG_USB_ATI_REMOTE=m
+
+#
+# USB Imaging devices
+#
+CONFIG_USB_MDC800=m
+CONFIG_USB_MICROTEK=m
+CONFIG_USB_HPUSBSCSI=m
+
+#
+# USB Multimedia devices
+#
+# CONFIG_USB_DABUSB is not set
+
+#
+# Video4Linux support is needed for USB Multimedia device support
+#
+
+#
+# USB Network adaptors
+#
+CONFIG_USB_CATC=m
+CONFIG_USB_KAWETH=m
+CONFIG_USB_PEGASUS=m
+CONFIG_USB_RTL8150=m
+CONFIG_USB_USBNET=m
+
+#
+# USB Host-to-Host Cables
+#
+CONFIG_USB_ALI_M5632=y
+CONFIG_USB_AN2720=y
+CONFIG_USB_BELKIN=y
+CONFIG_USB_GENESYS=y
+CONFIG_USB_NET1080=y
+CONFIG_USB_PL2301=y
+
+#
+# Intelligent USB Devices/Gadgets
+#
+CONFIG_USB_ARMLINUX=y
+CONFIG_USB_EPSON2888=y
+CONFIG_USB_ZAURUS=y
+CONFIG_USB_CDCETHER=y
+
+#
+# USB Network Adapters
+#
+CONFIG_USB_AX8817X=y
+
+#
+# USB port drivers
+#
+# CONFIG_USB_USS720 is not set
+
+#
+# USB Serial Converter support
+#
+CONFIG_USB_SERIAL=m
+CONFIG_USB_SERIAL_GENERIC=y
+CONFIG_USB_SERIAL_BELKIN=m
+CONFIG_USB_SERIAL_DIGI_ACCELEPORT=m
+CONFIG_USB_SERIAL_EMPEG=m
+CONFIG_USB_SERIAL_FTDI_SIO=m
+CONFIG_USB_SERIAL_VISOR=m
+CONFIG_USB_SERIAL_IPAQ=m
+CONFIG_USB_SERIAL_IR=m
+CONFIG_USB_SERIAL_EDGEPORT=m
+CONFIG_USB_SERIAL_EDGEPORT_TI=m
+CONFIG_USB_SERIAL_KEYSPAN_PDA=m
+CONFIG_USB_SERIAL_KEYSPAN=m
+CONFIG_USB_SERIAL_KEYSPAN_MPR=y
+CONFIG_USB_SERIAL_KEYSPAN_USA28=y
+CONFIG_USB_SERIAL_KEYSPAN_USA28X=y
+CONFIG_USB_SERIAL_KEYSPAN_USA28XA=y
+CONFIG_USB_SERIAL_KEYSPAN_USA28XB=y
+CONFIG_USB_SERIAL_KEYSPAN_USA19=y
+CONFIG_USB_SERIAL_KEYSPAN_USA18X=y
+CONFIG_USB_SERIAL_KEYSPAN_USA19W=y
+CONFIG_USB_SERIAL_KEYSPAN_USA19QW=y
+CONFIG_USB_SERIAL_KEYSPAN_USA19QI=y
+CONFIG_USB_SERIAL_KEYSPAN_USA49W=y
+CONFIG_USB_SERIAL_KEYSPAN_USA49WLC=y
+CONFIG_USB_SERIAL_KLSI=m
+CONFIG_USB_SERIAL_KOBIL_SCT=m
+CONFIG_USB_SERIAL_MCT_U232=m
+CONFIG_USB_SERIAL_PL2303=m
+CONFIG_USB_SERIAL_SAFE=m
+CONFIG_USB_SERIAL_SAFE_PADDED=y
+CONFIG_USB_SERIAL_CYBERJACK=m
+CONFIG_USB_SERIAL_XIRCOM=m
+CONFIG_USB_SERIAL_OMNINET=m
+CONFIG_USB_EZUSB=y
+
+#
+# USB Miscellaneous drivers
+#
+CONFIG_USB_EMI62=m
+CONFIG_USB_EMI26=m
+# CONFIG_USB_TIGL is not set
+# CONFIG_USB_AUERSWALD is not set
+# CONFIG_USB_RIO500 is not set
+CONFIG_USB_LEGOTOWER=m
+# CONFIG_USB_LCD is not set
+CONFIG_USB_LED=m
+CONFIG_USB_CYTHERM=m
+# CONFIG_USB_TEST is not set
+
+#
+# USB Gadget Support
+#
+# CONFIG_USB_GADGET is not set
+
+#
+# InfiniBand support
+#
+CONFIG_INFINIBAND=m
+CONFIG_INFINIBAND_IPOIB=m
+# CONFIG_INFINIBAND_SDP is not set
+# CONFIG_INFINIBAND_SRP is not set
+# CONFIG_INFINIBAND_UDAPL_HELPER is not set
+CONFIG_INFINIBAND_MELLANOX_HCA=m
+CONFIG_AUDIT=m
+
+#
+# File systems
+#
+CONFIG_EXT2_FS=y
+CONFIG_EXT2_FS_XATTR=y
+CONFIG_EXT2_FS_POSIX_ACL=y
+CONFIG_EXT2_FS_SECURITY=y
+CONFIG_EXT3_FS=y
+CONFIG_EXT3_FS_XATTR=y
+CONFIG_EXT3_FS_POSIX_ACL=y
+CONFIG_EXT3_FS_SECURITY=y
+CONFIG_JBD=y
+# CONFIG_JBD_DEBUG is not set
+CONFIG_FS_MBCACHE=y
+CONFIG_REISERFS_FS=y
+# CONFIG_REISERFS_CHECK is not set
+CONFIG_REISERFS_PROC_INFO=y
+CONFIG_REISERFS_FS_XATTR=y
+CONFIG_REISERFS_FS_POSIX_ACL=y
+CONFIG_REISERFS_FS_SECURITY=y
+CONFIG_JFS_FS=m
+CONFIG_JFS_POSIX_ACL=y
+CONFIG_JFS_DMAPI=y
+# CONFIG_JFS_DEBUG is not set
+CONFIG_JFS_STATISTICS=y
+CONFIG_FS_POSIX_ACL=y
+CONFIG_XFS_FS=m
+CONFIG_XFS_RT=y
+CONFIG_XFS_QUOTA=m
+CONFIG_XFS_DMAPI=y
+CONFIG_XFS_SECURITY=y
+CONFIG_XFS_POSIX_ACL=y
+CONFIG_MINIX_FS=m
+# CONFIG_ROMFS_FS is not set
+CONFIG_DMAPI=m
+# CONFIG_DMAPI_DEBUG is not set
+CONFIG_QUOTA=y
+CONFIG_QFMT_V1=m
+CONFIG_QFMT_V2=m
+CONFIG_QUOTACTL=y
+CONFIG_AUTOFS_FS=y
+# CONFIG_AUTOFS4_FS is not set
+
+#
+# CD-ROM/DVD Filesystems
+#
+CONFIG_ISO9660_FS=y
+CONFIG_JOLIET=y
+CONFIG_ZISOFS=y
+CONFIG_ZISOFS_FS=y
+CONFIG_UDF_FS=m
+
+#
+# DOS/FAT/NT Filesystems
+#
+CONFIG_FAT_FS=y
+CONFIG_MSDOS_FS=y
+CONFIG_VFAT_FS=y
+# CONFIG_NTFS_FS is not set
+
+#
+# Pseudo filesystems
+#
+CONFIG_PROC_FS=y
+CONFIG_PROC_KCORE=y
+# CONFIG_DEVFS_FS is not set
+CONFIG_DEVPTS_FS_XATTR=y
+CONFIG_DEVPTS_FS_SECURITY=y
+CONFIG_TMPFS=y
+CONFIG_HUGETLBFS=y
+CONFIG_HUGETLB_PAGE=y
+CONFIG_RAMFS=y
+CONFIG_RELAYFS_FS=m
+# CONFIG_KLOG_CHANNEL is not set
+
+#
+# Miscellaneous filesystems
+#
+# CONFIG_ADFS_FS is not set
+# CONFIG_AFFS_FS is not set
+CONFIG_HFS_FS=m
+CONFIG_HFSPLUS_FS=m
+# CONFIG_BEFS_FS is not set
+# CONFIG_BFS_FS is not set
+# CONFIG_EFS_FS is not set
+CONFIG_CRAMFS=y
+# CONFIG_VXFS_FS is not set
+# CONFIG_HPFS_FS is not set
+# CONFIG_QNX4FS_FS is not set
+# CONFIG_SYSV_FS is not set
+CONFIG_UFS_FS=m
+# CONFIG_UFS_FS_WRITE is not set
+
+#
+# Network File Systems
+#
+CONFIG_NFS_FS=y
+CONFIG_NFS_V3=y
+CONFIG_NFS_ACL=y
+CONFIG_NFS_V4=y
+CONFIG_NFS_DIRECTIO=y
+CONFIG_NFSD=m
+CONFIG_NFSD_V3=y
+CONFIG_NFSD_ACL=y
+CONFIG_NFS_ACL_SUPPORT=y
+# CONFIG_NFSD_V4 is not set
+CONFIG_NFSD_TCP=y
+CONFIG_LOCKD=y
+CONFIG_STATD=y
+CONFIG_LOCKD_V4=y
+CONFIG_EXPORTFS=m
+CONFIG_SUNRPC=y
+CONFIG_SUNRPC_GSS=y
+CONFIG_RPCSEC_GSS_KRB5=y
+CONFIG_SMB_FS=m
+# CONFIG_SMB_NLS_DEFAULT is not set
+CONFIG_CIFS=m
+CONFIG_CIFS_STATS=y
+CONFIG_CIFS_XATTR=y
+CONFIG_CIFS_POSIX=y
+CONFIG_NCP_FS=m
+CONFIG_NCPFS_PACKET_SIGNING=y
+CONFIG_NCPFS_IOCTL_LOCKING=y
+CONFIG_NCPFS_STRONG=y
+CONFIG_NCPFS_NFS_NS=y
+CONFIG_NCPFS_OS2_NS=y
+CONFIG_NCPFS_SMALLDOS=y
+CONFIG_NCPFS_NLS=y
+CONFIG_NCPFS_EXTRAS=y
+# CONFIG_CODA_FS is not set
+# CONFIG_INTERMEZZO_FS is not set
+# CONFIG_AFS_FS is not set
+
+#
+# Partition Types
+#
+CONFIG_PARTITION_ADVANCED=y
+# CONFIG_ACORN_PARTITION is not set
+CONFIG_OSF_PARTITION=y
+CONFIG_AMIGA_PARTITION=y
+CONFIG_ATARI_PARTITION=y
+CONFIG_MAC_PARTITION=y
+CONFIG_MSDOS_PARTITION=y
+CONFIG_BSD_DISKLABEL=y
+CONFIG_MINIX_SUBPARTITION=y
+CONFIG_SOLARIS_X86_PARTITION=y
+CONFIG_UNIXWARE_DISKLABEL=y
+CONFIG_LDM_PARTITION=y
+# CONFIG_LDM_DEBUG is not set
+CONFIG_NEC98_PARTITION=y
+CONFIG_SGI_PARTITION=y
+CONFIG_ULTRIX_PARTITION=y
+CONFIG_SUN_PARTITION=y
+CONFIG_EFI_PARTITION=y
+
+#
+# Native Language Support
+#
+CONFIG_NLS=y
+CONFIG_NLS_DEFAULT="iso8859-1"
+CONFIG_NLS_CODEPAGE_437=m
+CONFIG_NLS_CODEPAGE_737=m
+CONFIG_NLS_CODEPAGE_775=m
+CONFIG_NLS_CODEPAGE_850=m
+CONFIG_NLS_CODEPAGE_852=m
+CONFIG_NLS_CODEPAGE_855=m
+CONFIG_NLS_CODEPAGE_857=m
+CONFIG_NLS_CODEPAGE_860=m
+CONFIG_NLS_CODEPAGE_861=m
+CONFIG_NLS_CODEPAGE_862=m
+CONFIG_NLS_CODEPAGE_863=m
+CONFIG_NLS_CODEPAGE_864=m
+CONFIG_NLS_CODEPAGE_865=m
+CONFIG_NLS_CODEPAGE_866=m
+CONFIG_NLS_CODEPAGE_869=m
+CONFIG_NLS_CODEPAGE_936=m
+CONFIG_NLS_CODEPAGE_950=m
+CONFIG_NLS_CODEPAGE_932=m
+CONFIG_NLS_CODEPAGE_949=m
+CONFIG_NLS_CODEPAGE_874=m
+CONFIG_NLS_ISO8859_8=m
+CONFIG_NLS_CODEPAGE_1250=m
+CONFIG_NLS_CODEPAGE_1251=m
+CONFIG_NLS_ISO8859_1=m
+CONFIG_NLS_ISO8859_2=m
+CONFIG_NLS_ISO8859_3=m
+CONFIG_NLS_ISO8859_4=m
+CONFIG_NLS_ISO8859_5=m
+CONFIG_NLS_ISO8859_6=m
+CONFIG_NLS_ISO8859_7=m
+CONFIG_NLS_ISO8859_9=m
+CONFIG_NLS_ISO8859_13=m
+CONFIG_NLS_ISO8859_14=m
+CONFIG_NLS_ISO8859_15=m
+CONFIG_NLS_KOI8_R=m
+CONFIG_NLS_KOI8_U=m
+CONFIG_NLS_UTF8=m
+CONFIG_FSHOOKS=y
+
+#
+# Profiling support
+#
+CONFIG_PROFILING=y
+CONFIG_OPROFILE=y
+
+#
+# Kernel hacking
+#
+CONFIG_KERNTYPES=y
+CONFIG_CRASH_DUMP=m
+CONFIG_CRASH_DUMP_BLOCKDEV=m
+CONFIG_CRASH_DUMP_NETDEV=m
+# CONFIG_CRASH_DUMP_MEMDEV is not set
+# CONFIG_CRASH_DUMP_SOFTBOOT is not set
+CONFIG_CRASH_DUMP_COMPRESS_RLE=m
+CONFIG_CRASH_DUMP_COMPRESS_GZIP=m
+CONFIG_DEBUG_KERNEL=y
+CONFIG_DEBUG_STACKOVERFLOW=y
+CONFIG_DEBUG_STACK_USAGE=y
+# CONFIG_DEBUG_SLAB is not set
+CONFIG_MAGIC_SYSRQ=y
+CONFIG_DEBUGGER=y
+CONFIG_XMON=y
+# CONFIG_XMON_DEFAULT is not set
+CONFIG_KDB=y
+CONFIG_KDB_MODULES=y
+CONFIG_KDB_OFF=y
+# CONFIG_PPCDBG is not set
+# CONFIG_DEBUG_INFO is not set
+CONFIG_IRQSTACKS=y
+
+#
+# Security options
+#
+CONFIG_SECURITY=y
+CONFIG_SECURITY_NETWORK=y
+CONFIG_SECURITY_CAPABILITIES=m
+CONFIG_SECURITY_ROOTPLUG=m
+CONFIG_SECURITY_SELINUX=y
+CONFIG_SECURITY_SELINUX_BOOTPARAM=y
+CONFIG_SECURITY_SELINUX_DEVELOP=y
+CONFIG_SECURITY_SELINUX_MLS=y
+
+#
+# IBM Crypto Hardware support
+#
+CONFIG_IBM_CRYPTO=m
+CONFIG_ICA_LEEDSLITE=m
+
+#
+# Cryptographic options
+#
+CONFIG_CRYPTO=y
+CONFIG_CRYPTO_HMAC=y
+CONFIG_CRYPTO_NULL=m
+CONFIG_CRYPTO_MD4=m
+CONFIG_CRYPTO_MD5=y
+CONFIG_CRYPTO_SHA1=m
+CONFIG_CRYPTO_SHA256=m
+CONFIG_CRYPTO_SHA512=m
+CONFIG_CRYPTO_DES=y
+CONFIG_CRYPTO_BLOWFISH=m
+CONFIG_CRYPTO_TWOFISH=m
+CONFIG_CRYPTO_SERPENT=m
+CONFIG_CRYPTO_AES=m
+CONFIG_CRYPTO_CAST5=m
+CONFIG_CRYPTO_CAST6=m
+CONFIG_CRYPTO_ARC4=m
+CONFIG_CRYPTO_DEFLATE=m
+CONFIG_CRYPTO_MICHAEL_MIC=m
+CONFIG_CRYPTO_TEST=m
+
+#
+# Library routines
+#
+CONFIG_CRC32=y
+CONFIG_QSORT=y
+CONFIG_ZLIB_INFLATE=y
+CONFIG_ZLIB_DEFLATE=m
+
+#
+# Build options
+#
+CONFIG_SUSE_KERNEL=y
+CONFIG_CFGNAME="pseries64"
+CONFIG_RELEASE="SLES9_SP1_BRANCH_91"
--- /dev/null
+diff -rup --new-file linux.mcp2/fs/ext3/Makefile linux_tmp/fs/ext3/Makefile
+--- linux.mcp2/fs/ext3/Makefile 1969-12-31 16:00:00.000000000 -0800
++++ linux_tmp/fs/ext3/Makefile 2001-12-21 09:41:55.000000000 -0800
+@@ -0,0 +1,16 @@
++#
++# Makefile for the linux ext2-filesystem routines.
++#
++# Note! Dependencies are done automagically by 'make dep', which also
++# removes any old dependencies. DON'T put your own dependencies here
++# unless it's something special (ie not a .c file).
++#
++# Note 2! The CFLAGS definitions are now in the main makefile...
++
++O_TARGET := ext3.o
++
++obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
++ ioctl.o namei.o super.o symlink.o
++obj-m := $(O_TARGET)
++
++include $(TOPDIR)/Rules.make
+diff -rup --new-file linux.mcp2/fs/ext3/balloc.c linux_tmp/fs/ext3/balloc.c
+--- linux.mcp2/fs/ext3/balloc.c 1969-12-31 16:00:00.000000000 -0800
++++ linux_tmp/fs/ext3/balloc.c 2002-08-02 17:39:45.000000000 -0700
+@@ -0,0 +1,999 @@
++/*
++ * linux/fs/ext3/balloc.c
++ *
++ * Copyright (C) 1992, 1993, 1994, 1995
++ * Remy Card (card@masi.ibp.fr)
++ * Laboratoire MASI - Institut Blaise Pascal
++ * Universite Pierre et Marie Curie (Paris VI)
++ *
++ * Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993
++ * Big-endian to little-endian byte-swapping/bitmaps by
++ * David S. Miller (davem@caip.rutgers.edu), 1995
++ */
++
++#include <linux/config.h>
++#include <linux/sched.h>
++#include <linux/fs.h>
++#include <linux/jbd.h>
++#include <linux/ext3_fs.h>
++#include <linux/ext3_jbd.h>
++#include <linux/locks.h>
++#include <linux/quotaops.h>
++
++/*
++ * balloc.c contains the blocks allocation and deallocation routines
++ */
++
++/*
++ * The free blocks are managed by bitmaps. A file system contains several
++ * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap
++ * block for inodes, N blocks for the inode table and data blocks.
++ *
++ * The file system contains group descriptors which are located after the
++ * super block. Each descriptor contains the number of the bitmap block and
++ * the free blocks count in the block. The descriptors are loaded in memory
++ * when a file system is mounted (see ext3_read_super).
++ */
++
++
++#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
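++
++/*
++ * The mapping used throughout this file, restated in isolation as a
++ * hypothetical helper (illustrative sketch, not part of the patch):
++ * a block number splits into the group that owns it and the bit
++ * offset inside that group's bitmap.
++ */
++static void sketch_block_to_group(unsigned long block,
++                                  unsigned long first_data_block,
++                                  unsigned long blocks_per_group,
++                                  unsigned long *group, unsigned long *bit)
++{
++        *group = (block - first_data_block) / blocks_per_group;
++        *bit   = (block - first_data_block) % blocks_per_group;
++}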
++
++struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
++ unsigned int block_group,
++ struct buffer_head ** bh)
++{
++ unsigned long group_desc;
++ unsigned long desc;
++ struct ext3_group_desc * gdp;
++
++ if (block_group >= sb->u.ext3_sb.s_groups_count) {
++ ext3_error (sb, "ext3_get_group_desc",
++ "block_group >= groups_count - "
++ "block_group = %d, groups_count = %lu",
++ block_group, sb->u.ext3_sb.s_groups_count);
++
++ return NULL;
++ }
++
++ group_desc = block_group / EXT3_DESC_PER_BLOCK(sb);
++ desc = block_group % EXT3_DESC_PER_BLOCK(sb);
++ if (!sb->u.ext3_sb.s_group_desc[group_desc]) {
++ ext3_error (sb, "ext3_get_group_desc",
++ "Group descriptor not loaded - "
++ "block_group = %d, group_desc = %lu, desc = %lu",
++ block_group, group_desc, desc);
++ return NULL;
++ }
++
++ gdp = (struct ext3_group_desc *)
++ sb->u.ext3_sb.s_group_desc[group_desc]->b_data;
++ if (bh)
++ *bh = sb->u.ext3_sb.s_group_desc[group_desc];
++ return gdp + desc;
++}
++
++/*
++ * Read the bitmap for a given block_group, reading into the specified
++ * slot in the superblock's bitmap cache.
++ *
++ * Return >=0 on success or a -ve error code.
++ */
++
++static int read_block_bitmap (struct super_block * sb,
++ unsigned int block_group,
++ unsigned long bitmap_nr)
++{
++ struct ext3_group_desc * gdp;
++ struct buffer_head * bh = NULL;
++ int retval = -EIO;
++
++ gdp = ext3_get_group_desc (sb, block_group, NULL);
++ if (!gdp)
++ goto error_out;
++ retval = 0;
++ bh = sb_bread(sb, le32_to_cpu(gdp->bg_block_bitmap));
++ if (!bh) {
++ ext3_error (sb, "read_block_bitmap",
++ "Cannot read block bitmap - "
++ "block_group = %d, block_bitmap = %lu",
++ block_group, (unsigned long) gdp->bg_block_bitmap);
++ retval = -EIO;
++ }
++ /*
++ * On IO error, just leave a zero in the superblock's block pointer for
++ * this group. The IO will be retried next time.
++ */
++error_out:
++ sb->u.ext3_sb.s_block_bitmap_number[bitmap_nr] = block_group;
++ sb->u.ext3_sb.s_block_bitmap[bitmap_nr] = bh;
++ return retval;
++}
++
++/*
++ * load_block_bitmap loads the block bitmap for a block group
++ *
++ * It maintains a cache for the last bitmaps loaded. This cache is managed
++ * with an LRU algorithm.
++ *
++ * Notes:
++ * 1/ There is one cache per mounted file system.
++ * 2/ If the file system contains less than EXT3_MAX_GROUP_LOADED groups,
++ * this function reads the bitmap without maintaining an LRU cache.
++ *
++ * Return the slot used to store the bitmap, or a -ve error code.
++ */
++static int __load_block_bitmap (struct super_block * sb,
++ unsigned int block_group)
++{
++ int i, j, retval = 0;
++ unsigned long block_bitmap_number;
++ struct buffer_head * block_bitmap;
++
++ if (block_group >= sb->u.ext3_sb.s_groups_count)
++ ext3_panic (sb, "load_block_bitmap",
++ "block_group >= groups_count - "
++ "block_group = %d, groups_count = %lu",
++ block_group, sb->u.ext3_sb.s_groups_count);
++
++ if (sb->u.ext3_sb.s_groups_count <= EXT3_MAX_GROUP_LOADED) {
++ if (sb->u.ext3_sb.s_block_bitmap[block_group]) {
++ if (sb->u.ext3_sb.s_block_bitmap_number[block_group] ==
++ block_group)
++ return block_group;
++ ext3_error (sb, "__load_block_bitmap",
++ "block_group != block_bitmap_number");
++ }
++ retval = read_block_bitmap (sb, block_group, block_group);
++ if (retval < 0)
++ return retval;
++ return block_group;
++ }
++
++ for (i = 0; i < sb->u.ext3_sb.s_loaded_block_bitmaps &&
++ sb->u.ext3_sb.s_block_bitmap_number[i] != block_group; i++)
++ ;
++ if (i < sb->u.ext3_sb.s_loaded_block_bitmaps &&
++ sb->u.ext3_sb.s_block_bitmap_number[i] == block_group) {
++ block_bitmap_number = sb->u.ext3_sb.s_block_bitmap_number[i];
++ block_bitmap = sb->u.ext3_sb.s_block_bitmap[i];
++ for (j = i; j > 0; j--) {
++ sb->u.ext3_sb.s_block_bitmap_number[j] =
++ sb->u.ext3_sb.s_block_bitmap_number[j - 1];
++ sb->u.ext3_sb.s_block_bitmap[j] =
++ sb->u.ext3_sb.s_block_bitmap[j - 1];
++ }
++ sb->u.ext3_sb.s_block_bitmap_number[0] = block_bitmap_number;
++ sb->u.ext3_sb.s_block_bitmap[0] = block_bitmap;
++
++ /*
++ * There's still one special case here --- if block_bitmap == 0
++ * then our last attempt to read the bitmap failed and we have
++ * just ended up caching that failure. Try again to read it.
++ */
++ if (!block_bitmap)
++ retval = read_block_bitmap (sb, block_group, 0);
++ } else {
++ if (sb->u.ext3_sb.s_loaded_block_bitmaps<EXT3_MAX_GROUP_LOADED)
++ sb->u.ext3_sb.s_loaded_block_bitmaps++;
++ else
++ brelse (sb->u.ext3_sb.s_block_bitmap
++ [EXT3_MAX_GROUP_LOADED - 1]);
++ for (j = sb->u.ext3_sb.s_loaded_block_bitmaps - 1;
++ j > 0; j--) {
++ sb->u.ext3_sb.s_block_bitmap_number[j] =
++ sb->u.ext3_sb.s_block_bitmap_number[j - 1];
++ sb->u.ext3_sb.s_block_bitmap[j] =
++ sb->u.ext3_sb.s_block_bitmap[j - 1];
++ }
++ retval = read_block_bitmap (sb, block_group, 0);
++ }
++ return retval;
++}
++
++/*
++ * Load the block bitmap for a given block group. First of all do a couple
++ * of fast lookups for common cases and then pass the request onto the guts
++ * of the bitmap loader.
++ *
++ * Return the slot number of the group in the superblock's bitmap cache on
++ * success, or a -ve error code.
++ *
++ * There is still one inconsistency here --- if the number of groups in this
++ * filesystem is <= EXT3_MAX_GROUP_LOADED, then we have no way of
++ * differentiating between a group for which we have never performed a bitmap
++ * IO request, and a group for which the last bitmap read request failed.
++ */
++static inline int load_block_bitmap (struct super_block * sb,
++ unsigned int block_group)
++{
++ int slot;
++
++ /*
++ * Do the lookup for the slot. First of all, check if we're asking
++ * for the same slot as last time, and did we succeed that last time?
++ */
++ if (sb->u.ext3_sb.s_loaded_block_bitmaps > 0 &&
++ sb->u.ext3_sb.s_block_bitmap_number[0] == block_group &&
++ sb->u.ext3_sb.s_block_bitmap[0]) {
++ return 0;
++ }
++ /*
++ * Or can we do a fast lookup based on a loaded group on a filesystem
++ * small enough to be mapped directly into the superblock?
++ */
++ else if (sb->u.ext3_sb.s_groups_count <= EXT3_MAX_GROUP_LOADED &&
++ sb->u.ext3_sb.s_block_bitmap_number[block_group]==block_group
++ && sb->u.ext3_sb.s_block_bitmap[block_group]) {
++ slot = block_group;
++ }
++ /*
++ * If not, then do a full lookup for this block group.
++ */
++ else {
++ slot = __load_block_bitmap (sb, block_group);
++ }
++
++ /*
++ * <0 means we just got an error
++ */
++ if (slot < 0)
++ return slot;
++
++ /*
++ * If it's a valid slot, we may still have cached a previous IO error,
++ * in which case the bh in the superblock cache will be zero.
++ */
++ if (!sb->u.ext3_sb.s_block_bitmap[slot])
++ return -EIO;
++
++ /*
++ * Must have been read in OK to get this far.
++ */
++ return slot;
++}
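++
++/*
++ * The cache above keeps its arrays in most-recently-used order by
++ * shifting entries down and reinstalling the hit at slot 0.  The
++ * same move-to-front scheme in isolation, with hypothetical names
++ * (slot_id/slot_buf stand in for s_block_bitmap_number and
++ * s_block_bitmap); an illustrative sketch only:
++ */
++#define MTF_MAX_LOADED 8
++
++struct mtf_cache {
++        unsigned long slot_id[MTF_MAX_LOADED];  /* group held per slot */
++        void *slot_buf[MTF_MAX_LOADED];         /* cached bitmap buffer */
++        int loaded;                             /* slots in use */
++};
++
++/* Return 0 and promote the hit to slot 0, or -1 on a miss. */
++static int mtf_lookup(struct mtf_cache *c, unsigned long group)
++{
++        unsigned long id;
++        void *buf;
++        int i, j;
++
++        for (i = 0; i < c->loaded; i++)
++                if (c->slot_id[i] == group)
++                        break;
++        if (i == c->loaded)
++                return -1;              /* miss: caller reads the bitmap */
++
++        id  = c->slot_id[i];
++        buf = c->slot_buf[i];
++        for (j = i; j > 0; j--) {       /* shift more-recent entries down */
++                c->slot_id[j]  = c->slot_id[j - 1];
++                c->slot_buf[j] = c->slot_buf[j - 1];
++        }
++        c->slot_id[0]  = id;            /* most recently used sits at 0 */
++        c->slot_buf[0] = buf;
++        return 0;
++}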
++
++/* Free given blocks, update quota and i_blocks field */
++void ext3_free_blocks (handle_t *handle, struct inode * inode,
++ unsigned long block, unsigned long count)
++{
++ struct buffer_head *bitmap_bh;
++ struct buffer_head *gd_bh;
++ unsigned long block_group;
++ unsigned long bit;
++ unsigned long i;
++ int bitmap_nr;
++ unsigned long overflow;
++ struct super_block * sb;
++ struct ext3_group_desc * gdp;
++ struct ext3_super_block * es;
++ int err = 0, ret;
++ int dquot_freed_blocks = 0;
++
++ sb = inode->i_sb;
++ if (!sb) {
++ printk ("ext3_free_blocks: nonexistent device");
++ return;
++ }
++ lock_super (sb);
++ es = sb->u.ext3_sb.s_es;
++ if (block < le32_to_cpu(es->s_first_data_block) ||
++ (block + count) > le32_to_cpu(es->s_blocks_count)) {
++ ext3_error (sb, "ext3_free_blocks",
++ "Freeing blocks not in datazone - "
++ "block = %lu, count = %lu", block, count);
++ goto error_return;
++ }
++
++ ext3_debug ("freeing block %lu\n", block);
++
++do_more:
++ overflow = 0;
++ block_group = (block - le32_to_cpu(es->s_first_data_block)) /
++ EXT3_BLOCKS_PER_GROUP(sb);
++ bit = (block - le32_to_cpu(es->s_first_data_block)) %
++ EXT3_BLOCKS_PER_GROUP(sb);
++ /*
++ * Check to see if we are freeing blocks across a group
++ * boundary.
++ */
++ if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) {
++ overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb);
++ count -= overflow;
++ }
++ bitmap_nr = load_block_bitmap (sb, block_group);
++ if (bitmap_nr < 0)
++ goto error_return;
++
++ bitmap_bh = sb->u.ext3_sb.s_block_bitmap[bitmap_nr];
++ gdp = ext3_get_group_desc (sb, block_group, &gd_bh);
++ if (!gdp)
++ goto error_return;
++
++ if (in_range (le32_to_cpu(gdp->bg_block_bitmap), block, count) ||
++ in_range (le32_to_cpu(gdp->bg_inode_bitmap), block, count) ||
++ in_range (block, le32_to_cpu(gdp->bg_inode_table),
++ sb->u.ext3_sb.s_itb_per_group) ||
++ in_range (block + count - 1, le32_to_cpu(gdp->bg_inode_table),
++ sb->u.ext3_sb.s_itb_per_group))
++ ext3_error (sb, "ext3_free_blocks",
++ "Freeing blocks in system zones - "
++ "Block = %lu, count = %lu",
++ block, count);
++
++ /*
++ * We are about to start releasing blocks in the bitmap,
++ * so we need undo access.
++ */
++ /* @@@ check errors */
++ BUFFER_TRACE(bitmap_bh, "getting undo access");
++ err = ext3_journal_get_undo_access(handle, bitmap_bh);
++ if (err)
++ goto error_return;
++
++ /*
++ * We are about to modify some metadata. Call the journal APIs
++ * to unshare ->b_data if a currently-committing transaction is
++ * using it
++ */
++ BUFFER_TRACE(gd_bh, "get_write_access");
++ err = ext3_journal_get_write_access(handle, gd_bh);
++ if (err)
++ goto error_return;
++
++ BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access");
++ err = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh);
++ if (err)
++ goto error_return;
++
++ for (i = 0; i < count; i++) {
++ /*
++ * An HJ special. This is expensive...
++ */
++#ifdef CONFIG_JBD_DEBUG
++ {
++ struct buffer_head *debug_bh;
++ debug_bh = sb_get_hash_table(sb, block + i);
++ if (debug_bh) {
++ BUFFER_TRACE(debug_bh, "Deleted!");
++ if (!bh2jh(bitmap_bh)->b_committed_data)
++ BUFFER_TRACE(debug_bh,
++ "No commited data in bitmap");
++ BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap");
++ __brelse(debug_bh);
++ }
++ }
++#endif
++ BUFFER_TRACE(bitmap_bh, "clear bit");
++ if (!ext3_clear_bit (bit + i, bitmap_bh->b_data)) {
++ ext3_error (sb, __FUNCTION__,
++ "bit already cleared for block %lu",
++ block + i);
++ BUFFER_TRACE(bitmap_bh, "bit already cleared");
++ } else {
++ dquot_freed_blocks++;
++ gdp->bg_free_blocks_count =
++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)+1);
++ es->s_free_blocks_count =
++ cpu_to_le32(le32_to_cpu(es->s_free_blocks_count)+1);
++ }
++ /* @@@ This prevents newly-allocated data from being
++ * freed and then reallocated within the same
++ * transaction.
++ *
++ * Ideally we would want to allow that to happen, but to
++ * do so requires making journal_forget() capable of
++ * revoking the queued write of a data block, which
++ * implies blocking on the journal lock. *forget()
++ * cannot block due to truncate races.
++ *
++ * Eventually we can fix this by making journal_forget()
++ * return a status indicating whether or not it was able
++ * to revoke the buffer. On successful revoke, it is
++ * safe not to set the allocation bit in the committed
++ * bitmap, because we know that there is no outstanding
++ * activity on the buffer any more and so it is safe to
++ * reallocate it.
++ */
++ BUFFER_TRACE(bitmap_bh, "clear in b_committed_data");
++ J_ASSERT_BH(bitmap_bh,
++ bh2jh(bitmap_bh)->b_committed_data != NULL);
++ ext3_set_bit(bit + i, bh2jh(bitmap_bh)->b_committed_data);
++ }
++
++ /* We dirtied the bitmap block */
++ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
++ err = ext3_journal_dirty_metadata(handle, bitmap_bh);
++
++ /* And the group descriptor block */
++ BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
++ ret = ext3_journal_dirty_metadata(handle, gd_bh);
++ if (!err) err = ret;
++
++ /* And the superblock */
++ BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "dirtied superblock");
++ ret = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh);
++ if (!err) err = ret;
++
++ if (overflow && !err) {
++ block += count;
++ count = overflow;
++ goto do_more;
++ }
++ sb->s_dirt = 1;
++error_return:
++ ext3_std_error(sb, err);
++ unlock_super(sb);
++ if (dquot_freed_blocks)
++ DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
++ return;
++}
++
++/* For ext3 allocations, we must not reuse any blocks which are
++ * allocated in the bitmap buffer's "last committed data" copy. This
++ * prevents deletes from freeing up the page for reuse until we have
++ * committed the delete transaction.
++ *
++ * If we didn't do this, then deleting something and reallocating it as
++ * data would allow the old block to be overwritten before the
++ * transaction committed (because we force data to disk before commit).
++ * This would lead to corruption if we crashed between overwriting the
++ * data and committing the delete.
++ *
++ * @@@ We may want to make this allocation behaviour conditional on
++ * data-writes at some point, and disable it for metadata allocations or
++ * sync-data inodes.
++ */
++static int ext3_test_allocatable(int nr, struct buffer_head *bh)
++{
++ if (ext3_test_bit(nr, bh->b_data))
++ return 0;
++ if (!buffer_jbd(bh) || !bh2jh(bh)->b_committed_data)
++ return 1;
++ return !ext3_test_bit(nr, bh2jh(bh)->b_committed_data);
++}
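++
++/*
++ * A standalone restatement of the rule above (hypothetical helper,
++ * assuming plain byte-array bitmaps where a set bit means "in use"):
++ * a block is reusable only when it is free in the live bitmap AND in
++ * the last-committed copy, if one exists.
++ */
++static int sketch_allocatable(int nr, const unsigned char *live,
++                              const unsigned char *committed)
++{
++        if (live[nr / 8] & (1 << (nr % 8)))
++                return 0;               /* busy in the current bitmap */
++        if (!committed)
++                return 1;               /* no committing transaction */
++        return !(committed[nr / 8] & (1 << (nr % 8)));
++}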
++
++/*
++ * Find an allocatable block in a bitmap. We honour both the bitmap and
++ * its last-committed copy (if that exists), and perform the "most
++ * appropriate allocation" algorithm of looking for a free block near
++ * the initial goal; then for a free byte somewhere in the bitmap; then
++ * for any free bit in the bitmap.
++ */
++static int find_next_usable_block(int start,
++ struct buffer_head *bh, int maxblocks)
++{
++ int here, next;
++ char *p, *r;
++
++ if (start > 0) {
++ /*
++ * The goal was occupied; search forward for a free
++ * block within the next XX blocks.
++ *
++ * end_goal is more or less random, but it has to be
++ * less than EXT3_BLOCKS_PER_GROUP. Aligning up to the
++ * next 64-bit boundary is simple.
++ */
++ int end_goal = (start + 63) & ~63;
++ here = ext3_find_next_zero_bit(bh->b_data, end_goal, start);
++ if (here < end_goal && ext3_test_allocatable(here, bh))
++ return here;
++
++ ext3_debug ("Bit not found near goal\n");
++
++ }
++
++ here = start;
++ if (here < 0)
++ here = 0;
++
++ /*
++ * There has been no free block found in the near vicinity of
++ * the goal: do a search forward through the block groups,
++ * searching in each group first for an entire free byte in the
++ * bitmap and then for any free bit.
++ *
++ * Search first in the remainder of the current group
++ */
++ p = ((char *) bh->b_data) + (here >> 3);
++ r = memscan(p, 0, (maxblocks - here + 7) >> 3);
++ next = (r - ((char *) bh->b_data)) << 3;
++
++ if (next < maxblocks && ext3_test_allocatable(next, bh))
++ return next;
++
++ /* The bitmap search --- search forward alternately
++ * through the actual bitmap and the last-committed copy
++ * until we find a bit free in both. */
++
++ while (here < maxblocks) {
++ next = ext3_find_next_zero_bit ((unsigned long *) bh->b_data,
++ maxblocks, here);
++ if (next >= maxblocks)
++ return -1;
++ if (ext3_test_allocatable(next, bh))
++ return next;
++
++ J_ASSERT_BH(bh, bh2jh(bh)->b_committed_data);
++ here = ext3_find_next_zero_bit
++ ((unsigned long *) bh2jh(bh)->b_committed_data,
++ maxblocks, next);
++ }
++ return -1;
++}
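++
++/*
++ * The near-goal window above is bounded by rounding the start up to
++ * the next 64-bit boundary.  The arithmetic in isolation
++ * (illustrative sketch):
++ */
++static int sketch_end_goal(int start)
++{
++        /* start=1 -> 64, start=63 -> 64, start=64 -> 64, start=65 -> 128 */
++        return (start + 63) & ~63;
++}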
++
++/*
++ * ext3_new_block uses a goal block to assist allocation. If the goal is
++ * free, or there is a free block within 32 blocks of the goal, that block
++ * is allocated. Otherwise a forward search is made for a free block; within
++ * each block group the search first looks for an entire free byte in the block
++ * bitmap, and then for any free bit if that fails.
++ * This function also updates quota and i_blocks field.
++ */
++int ext3_new_block (handle_t *handle, struct inode * inode,
++ unsigned long goal, u32 * prealloc_count,
++ u32 * prealloc_block, int * errp)
++{
++ struct buffer_head * bh, *bhtmp;
++ struct buffer_head * bh2;
++#if 0
++ char * p, * r;
++#endif
++ int i, j, k, tmp, alloctmp;
++ int bitmap_nr;
++ int fatal = 0, err;
++ int performed_allocation = 0;
++ struct super_block * sb;
++ struct ext3_group_desc * gdp;
++ struct ext3_super_block * es;
++#ifdef EXT3FS_DEBUG
++ static int goal_hits = 0, goal_attempts = 0;
++#endif
++ *errp = -ENOSPC;
++ sb = inode->i_sb;
++ if (!sb) {
++ printk ("ext3_new_block: nonexistent device");
++ return 0;
++ }
++
++ /*
++ * Check quota for allocation of this block.
++ */
++ if (DQUOT_ALLOC_BLOCK(inode, 1)) {
++ *errp = -EDQUOT;
++ return 0;
++ }
++
++ lock_super (sb);
++ es = sb->u.ext3_sb.s_es;
++ if (le32_to_cpu(es->s_free_blocks_count) <=
++ le32_to_cpu(es->s_r_blocks_count) &&
++ ((sb->u.ext3_sb.s_resuid != current->fsuid) &&
++ (sb->u.ext3_sb.s_resgid == 0 ||
++ !in_group_p (sb->u.ext3_sb.s_resgid)) &&
++ !capable(CAP_SYS_RESOURCE)))
++ goto out;
++
++ ext3_debug ("goal=%lu.\n", goal);
++
++ /*
++ * First, test whether the goal block is free.
++ */
++ if (goal < le32_to_cpu(es->s_first_data_block) ||
++ goal >= le32_to_cpu(es->s_blocks_count))
++ goal = le32_to_cpu(es->s_first_data_block);
++ i = (goal - le32_to_cpu(es->s_first_data_block)) /
++ EXT3_BLOCKS_PER_GROUP(sb);
++ gdp = ext3_get_group_desc (sb, i, &bh2);
++ if (!gdp)
++ goto io_error;
++
++ if (le16_to_cpu(gdp->bg_free_blocks_count) > 0) {
++ j = ((goal - le32_to_cpu(es->s_first_data_block)) %
++ EXT3_BLOCKS_PER_GROUP(sb));
++#ifdef EXT3FS_DEBUG
++ if (j)
++ goal_attempts++;
++#endif
++ bitmap_nr = load_block_bitmap (sb, i);
++ if (bitmap_nr < 0)
++ goto io_error;
++
++ bh = sb->u.ext3_sb.s_block_bitmap[bitmap_nr];
++
++ ext3_debug ("goal is at %d:%d.\n", i, j);
++
++ if (ext3_test_allocatable(j, bh)) {
++#ifdef EXT3FS_DEBUG
++ goal_hits++;
++ ext3_debug ("goal bit allocated.\n");
++#endif
++ goto got_block;
++ }
++
++ j = find_next_usable_block(j, bh, EXT3_BLOCKS_PER_GROUP(sb));
++ if (j >= 0)
++ goto search_back;
++ }
++
++ ext3_debug ("Bit not found in block group %d.\n", i);
++
++ /*
++ * Now search the rest of the groups. We assume that
++ * i and gdp correctly point to the last group visited.
++ */
++ for (k = 0; k < sb->u.ext3_sb.s_groups_count; k++) {
++ i++;
++ if (i >= sb->u.ext3_sb.s_groups_count)
++ i = 0;
++ gdp = ext3_get_group_desc (sb, i, &bh2);
++ if (!gdp) {
++ *errp = -EIO;
++ goto out;
++ }
++ if (le16_to_cpu(gdp->bg_free_blocks_count) > 0) {
++ bitmap_nr = load_block_bitmap (sb, i);
++ if (bitmap_nr < 0)
++ goto io_error;
++
++ bh = sb->u.ext3_sb.s_block_bitmap[bitmap_nr];
++ j = find_next_usable_block(-1, bh,
++ EXT3_BLOCKS_PER_GROUP(sb));
++ if (j >= 0)
++ goto search_back;
++ }
++ }
++
++ /* No space left on the device */
++ goto out;
++
++search_back:
++ /*
++ * We have succeeded in finding a free byte in the block
++ * bitmap. Now search backwards up to 7 bits to find the
++ * start of this group of free blocks.
++ */
++ for ( k = 0;
++ k < 7 && j > 0 && ext3_test_allocatable(j - 1, bh);
++ k++, j--)
++ ;
++
++got_block:
++
++ ext3_debug ("using block group %d(%d)\n", i, gdp->bg_free_blocks_count);
++
++ /* Make sure we use undo access for the bitmap, because it is
++ critical that we do the frozen_data COW on bitmap buffers in
++ all cases even if the buffer is in BJ_Forget state in the
++ committing transaction. */
++ BUFFER_TRACE(bh, "get undo access for marking new block");
++ fatal = ext3_journal_get_undo_access(handle, bh);
++ if (fatal) goto out;
++
++ BUFFER_TRACE(bh2, "get_write_access");
++ fatal = ext3_journal_get_write_access(handle, bh2);
++ if (fatal) goto out;
++
++ BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access");
++ fatal = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh);
++ if (fatal) goto out;
++
++ tmp = j + i * EXT3_BLOCKS_PER_GROUP(sb)
++ + le32_to_cpu(es->s_first_data_block);
++
++ if (tmp == le32_to_cpu(gdp->bg_block_bitmap) ||
++ tmp == le32_to_cpu(gdp->bg_inode_bitmap) ||
++ in_range (tmp, le32_to_cpu(gdp->bg_inode_table),
++ sb->u.ext3_sb.s_itb_per_group))
++ ext3_error (sb, "ext3_new_block",
++ "Allocating block in system zone - "
++ "block = %u", tmp);
++
++ /* The superblock lock should guard against anybody else beating
++ * us to this point! */
++ J_ASSERT_BH(bh, !ext3_test_bit(j, bh->b_data));
++ BUFFER_TRACE(bh, "setting bitmap bit");
++ ext3_set_bit(j, bh->b_data);
++ performed_allocation = 1;
++
++#ifdef CONFIG_JBD_DEBUG
++ {
++ struct buffer_head *debug_bh;
++
++ /* Record bitmap buffer state in the newly allocated block */
++ debug_bh = sb_get_hash_table(sb, tmp);
++ if (debug_bh) {
++ BUFFER_TRACE(debug_bh, "state when allocated");
++ BUFFER_TRACE2(debug_bh, bh, "bitmap state");
++ brelse(debug_bh);
++ }
++ }
++#endif
++ if (buffer_jbd(bh) && bh2jh(bh)->b_committed_data)
++ J_ASSERT_BH(bh, !ext3_test_bit(j, bh2jh(bh)->b_committed_data));
++ bhtmp = bh;
++ alloctmp = j;
++
++ ext3_debug ("found bit %d\n", j);
++
++ /*
++ * Do block preallocation now if required.
++ */
++#ifdef EXT3_PREALLOCATE
++ /*
++ * akpm: this is not enabled for ext3. Need to use
++ * ext3_test_allocatable()
++ */
++ /* Writer: ->i_prealloc* */
++ if (prealloc_count && !*prealloc_count) {
++ int prealloc_goal;
++ unsigned long next_block = tmp + 1;
++
++ prealloc_goal = es->s_prealloc_blocks ?
++ es->s_prealloc_blocks : EXT3_DEFAULT_PREALLOC_BLOCKS;
++
++ *prealloc_block = next_block;
++ /* Writer: end */
++ for (k = 1;
++ k < prealloc_goal && (j + k) < EXT3_BLOCKS_PER_GROUP(sb);
++ k++, next_block++) {
++ if (DQUOT_PREALLOC_BLOCK(inode, 1))
++ break;
++ /* Writer: ->i_prealloc* */
++ if (*prealloc_block + *prealloc_count != next_block ||
++ ext3_set_bit (j + k, bh->b_data)) {
++ /* Writer: end */
++ DQUOT_FREE_BLOCK(inode, 1);
++ break;
++ }
++ (*prealloc_count)++;
++ /* Writer: end */
++ }
++ /*
++ * As soon as we go for per-group spinlocks we'll need these
++ * done inside the loop above.
++ */
++ gdp->bg_free_blocks_count =
++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) -
++ (k - 1));
++ es->s_free_blocks_count =
++ cpu_to_le32(le32_to_cpu(es->s_free_blocks_count) -
++ (k - 1));
++ ext3_debug ("Preallocated a further %lu bits.\n",
++ (k - 1));
++ }
++#endif
++
++ j = tmp;
++
++ BUFFER_TRACE(bh, "journal_dirty_metadata for bitmap block");
++ err = ext3_journal_dirty_metadata(handle, bh);
++ if (!fatal) fatal = err;
++
++ if (j >= le32_to_cpu(es->s_blocks_count)) {
++ ext3_error (sb, "ext3_new_block",
++ "block(%d) >= blocks count(%d) - "
++ "block_group = %d, es == %p ",j,
++ le32_to_cpu(es->s_blocks_count), i, es);
++ goto out;
++ }
++
++ /*
++ * It is up to the caller to add the new buffer to a journal
++ * list of some description. We don't know in advance whether
++ * the caller wants to use it as metadata or data.
++ */
++
++ ext3_debug ("allocating block %d. "
++ "Goal hits %d of %d.\n", j, goal_hits, goal_attempts);
++
++ gdp->bg_free_blocks_count =
++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) - 1);
++ es->s_free_blocks_count =
++ cpu_to_le32(le32_to_cpu(es->s_free_blocks_count) - 1);
++
++ BUFFER_TRACE(bh2, "journal_dirty_metadata for group descriptor");
++ err = ext3_journal_dirty_metadata(handle, bh2);
++ if (!fatal) fatal = err;
++
++ BUFFER_TRACE(bh, "journal_dirty_metadata for superblock");
++ err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh);
++ if (!fatal) fatal = err;
++
++ sb->s_dirt = 1;
++ if (fatal)
++ goto out;
++
++ unlock_super (sb);
++ *errp = 0;
++ return j;
++
++io_error:
++ *errp = -EIO;
++out:
++ if (fatal) {
++ *errp = fatal;
++ ext3_std_error(sb, fatal);
++ }
++ unlock_super (sb);
++ /*
++ * Undo the block allocation
++ */
++ if (!performed_allocation)
++ DQUOT_FREE_BLOCK(inode, 1);
++ return 0;
++
++}
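++
++/*
++ * The search_back walk above aligns a hit to the start of its free
++ * run by stepping back over at most 7 free bits (one byte's worth).
++ * The same walk in isolation, assuming a plain byte-array bitmap
++ * where a clear bit means "free" (hypothetical helper):
++ */
++static int sketch_align_to_run_start(const unsigned char *bitmap, int j)
++{
++        int k;
++
++        for (k = 0; k < 7 && j > 0 &&
++             !(bitmap[(j - 1) / 8] & (1 << ((j - 1) % 8))); k++)
++                j--;
++        return j;
++}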
++
++unsigned long ext3_count_free_blocks (struct super_block * sb)
++{
++#ifdef EXT3FS_DEBUG
++ struct ext3_super_block * es;
++ unsigned long desc_count, bitmap_count, x;
++ int bitmap_nr;
++ struct ext3_group_desc * gdp;
++ int i;
++
++ lock_super (sb);
++ es = sb->u.ext3_sb.s_es;
++ desc_count = 0;
++ bitmap_count = 0;
++ gdp = NULL;
++ for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) {
++ gdp = ext3_get_group_desc (sb, i, NULL);
++ if (!gdp)
++ continue;
++ desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
++ bitmap_nr = load_block_bitmap (sb, i);
++ if (bitmap_nr < 0)
++ continue;
++
++ x = ext3_count_free (sb->u.ext3_sb.s_block_bitmap[bitmap_nr],
++ sb->s_blocksize);
++ printk ("group %d: stored = %d, counted = %lu\n",
++ i, le16_to_cpu(gdp->bg_free_blocks_count), x);
++ bitmap_count += x;
++ }
++ printk("ext3_count_free_blocks: stored = %lu, computed = %lu, %lu\n",
++ le32_to_cpu(es->s_free_blocks_count), desc_count, bitmap_count);
++ unlock_super (sb);
++ return bitmap_count;
++#else
++ return le32_to_cpu(sb->u.ext3_sb.s_es->s_free_blocks_count);
++#endif
++}
++
++static inline int block_in_use (unsigned long block,
++ struct super_block * sb,
++ unsigned char * map)
++{
++ return ext3_test_bit ((block -
++ le32_to_cpu(sb->u.ext3_sb.s_es->s_first_data_block)) %
++ EXT3_BLOCKS_PER_GROUP(sb), map);
++}
++
++static inline int test_root(int a, int b)
++{
++ if (a == 0)
++ return 1;
++ while (1) {
++ if (a == 1)
++ return 1;
++ if (a % b)
++ return 0;
++ a = a / b;
++ }
++}
++
++int ext3_group_sparse(int group)
++{
++ return (test_root(group, 3) || test_root(group, 5) ||
++ test_root(group, 7));
++}
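++
++/*
++ * Standalone cross-check of the sparse_super rule above: backups
++ * land only in groups 0, 1 and the powers of 3, 5 and 7, i.e. groups
++ * 0 1 3 5 7 9 25 27 49 81 125 among the first 200 (hypothetical
++ * helper, for illustration only):
++ */
++static int sketch_count_sparse_groups(int ngroups)
++{
++        int g, n = 0;
++
++        for (g = 0; g < ngroups; g++)
++                if (ext3_group_sparse(g))
++                        n++;
++        return n;       /* 11 when ngroups == 200 */
++}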
++
++/**
++ * ext3_bg_has_super - number of blocks used by the superblock in group
++ * @sb: superblock for filesystem
++ * @group: group number to check
++ *
++ * Return the number of blocks used by the superblock (primary or backup)
++ * in this group. Currently this will be only 0 or 1.
++ */
++int ext3_bg_has_super(struct super_block *sb, int group)
++{
++ if (EXT3_HAS_RO_COMPAT_FEATURE(sb,EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)&&
++ !ext3_group_sparse(group))
++ return 0;
++ return 1;
++}
++
++/**
++ * ext3_bg_num_gdb - number of blocks used by the group table in group
++ * @sb: superblock for filesystem
++ * @group: group number to check
++ *
++ * Return the number of blocks used by the group descriptor table
++ * (primary or backup) in this group. In the future there may be a
++ * different number of descriptor blocks in each group.
++ */
++unsigned long ext3_bg_num_gdb(struct super_block *sb, int group)
++{
++ if (EXT3_HAS_RO_COMPAT_FEATURE(sb,EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)&&
++ !ext3_group_sparse(group))
++ return 0;
++ return EXT3_SB(sb)->s_gdb_count;
++}
++
++#ifdef CONFIG_EXT3_CHECK
++/* Called at mount-time, super-block is locked */
++void ext3_check_blocks_bitmap (struct super_block * sb)
++{
++ struct buffer_head * bh;
++ struct ext3_super_block * es;
++ unsigned long desc_count, bitmap_count, x, j;
++ unsigned long desc_blocks;
++ int bitmap_nr;
++ struct ext3_group_desc * gdp;
++ int i;
++
++ es = sb->u.ext3_sb.s_es;
++ desc_count = 0;
++ bitmap_count = 0;
++ gdp = NULL;
++ for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) {
++ gdp = ext3_get_group_desc (sb, i, NULL);
++ if (!gdp)
++ continue;
++ desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
++ bitmap_nr = load_block_bitmap (sb, i);
++ if (bitmap_nr < 0)
++ continue;
++
++ bh = EXT3_SB(sb)->s_block_bitmap[bitmap_nr];
++
++ if (ext3_bg_has_super(sb, i) && !ext3_test_bit(0, bh->b_data))
++ ext3_error(sb, __FUNCTION__,
++ "Superblock in group %d is marked free", i);
++
++ desc_blocks = ext3_bg_num_gdb(sb, i);
++ for (j = 0; j < desc_blocks; j++)
++ if (!ext3_test_bit(j + 1, bh->b_data))
++ ext3_error(sb, __FUNCTION__,
++ "Descriptor block #%ld in group "
++ "%d is marked free", j, i);
++
++ if (!block_in_use (le32_to_cpu(gdp->bg_block_bitmap),
++ sb, bh->b_data))
++ ext3_error (sb, "ext3_check_blocks_bitmap",
++ "Block bitmap for group %d is marked free",
++ i);
++
++ if (!block_in_use (le32_to_cpu(gdp->bg_inode_bitmap),
++ sb, bh->b_data))
++ ext3_error (sb, "ext3_check_blocks_bitmap",
++ "Inode bitmap for group %d is marked free",
++ i);
++
++ for (j = 0; j < sb->u.ext3_sb.s_itb_per_group; j++)
++ if (!block_in_use (le32_to_cpu(gdp->bg_inode_table) + j,
++ sb, bh->b_data))
++ ext3_error (sb, "ext3_check_blocks_bitmap",
++ "Block #%d of the inode table in "
++ "group %d is marked free", j, i);
++
++ x = ext3_count_free (bh, sb->s_blocksize);
++ if (le16_to_cpu(gdp->bg_free_blocks_count) != x)
++ ext3_error (sb, "ext3_check_blocks_bitmap",
++ "Wrong free blocks count for group %d, "
++ "stored = %d, counted = %lu", i,
++ le16_to_cpu(gdp->bg_free_blocks_count), x);
++ bitmap_count += x;
++ }
++ if (le32_to_cpu(es->s_free_blocks_count) != bitmap_count)
++ ext3_error (sb, "ext3_check_blocks_bitmap",
++ "Wrong free blocks count in super block, "
++ "stored = %lu, counted = %lu",
++ (unsigned long)le32_to_cpu(es->s_free_blocks_count),
++ bitmap_count);
++}
++#endif
+diff -rup --new-file linux.mcp2/fs/ext3/bitmap.c linux_tmp/fs/ext3/bitmap.c
+--- linux.mcp2/fs/ext3/bitmap.c 1969-12-31 16:00:00.000000000 -0800
++++ linux_tmp/fs/ext3/bitmap.c 2001-11-09 14:25:04.000000000 -0800
+@@ -0,0 +1,26 @@
++/*
++ * linux/fs/ext3/bitmap.c
++ *
++ * Copyright (C) 1992, 1993, 1994, 1995
++ * Remy Card (card@masi.ibp.fr)
++ * Laboratoire MASI - Institut Blaise Pascal
++ * Universite Pierre et Marie Curie (Paris VI)
++ */
++
++#include <linux/fs.h>
++
++
++static int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
++
++unsigned long ext3_count_free (struct buffer_head * map, unsigned int numchars)
++{
++ unsigned int i;
++ unsigned long sum = 0;
++
++ if (!map)
++ return (0);
++ for (i = 0; i < numchars; i++)
++ sum += nibblemap[map->b_data[i] & 0xf] +
++ nibblemap[(map->b_data[i] >> 4) & 0xf];
++ return (sum);
++}
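++
++/*
++ * Sanity sketch for the table above: nibblemap[n] holds the number
++ * of zero bits in the 4-bit value n, so counting free bits in a byte
++ * costs two lookups instead of eight bit tests.  A standalone
++ * cross-check (hypothetical helper, not part of the patch):
++ */
++static int sketch_nibblemap_ok(void)
++{
++        int n, bit, zeros;
++
++        for (n = 0; n < 16; n++) {
++                zeros = 0;
++                for (bit = 0; bit < 4; bit++)
++                        if (!(n & (1 << bit)))
++                                zeros++;
++                if (zeros != nibblemap[n])
++                        return 0;       /* table entry is wrong */
++        }
++        return 1;                       /* all 16 entries check out */
++}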
+diff -rup --new-file linux.mcp2/fs/ext3/dir.c linux_tmp/fs/ext3/dir.c
+--- linux.mcp2/fs/ext3/dir.c 1969-12-31 16:00:00.000000000 -0800
++++ linux_tmp/fs/ext3/dir.c 2001-11-09 14:25:04.000000000 -0800
+@@ -0,0 +1,190 @@
++/*
++ * linux/fs/ext3/dir.c
++ *
++ * Copyright (C) 1992, 1993, 1994, 1995
++ * Remy Card (card@masi.ibp.fr)
++ * Laboratoire MASI - Institut Blaise Pascal
++ * Universite Pierre et Marie Curie (Paris VI)
++ *
++ * from
++ *
++ * linux/fs/minix/dir.c
++ *
++ * Copyright (C) 1991, 1992 Linus Torvalds
++ *
++ * ext3 directory handling functions
++ *
++ * Big-endian to little-endian byte-swapping/bitmaps by
++ * David S. Miller (davem@caip.rutgers.edu), 1995
++ */
++
++#include <linux/fs.h>
++#include <linux/jbd.h>
++#include <linux/ext3_fs.h>
++
++static unsigned char ext3_filetype_table[] = {
++ DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
++};
++
++static int ext3_readdir(struct file *, void *, filldir_t);
++
++struct file_operations ext3_dir_operations = {
++ read: generic_read_dir,
++ readdir: ext3_readdir, /* BKL held */
++ ioctl: ext3_ioctl, /* BKL held */
++ fsync: ext3_sync_file, /* BKL held */
++};
++
++int ext3_check_dir_entry (const char * function, struct inode * dir,
++ struct ext3_dir_entry_2 * de,
++ struct buffer_head * bh,
++ unsigned long offset)
++{
++ const char * error_msg = NULL;
++ const int rlen = le16_to_cpu(de->rec_len);
++
++ if (rlen < EXT3_DIR_REC_LEN(1))
++ error_msg = "rec_len is smaller than minimal";
++ else if (rlen % 4 != 0)
++ error_msg = "rec_len % 4 != 0";
++ else if (rlen < EXT3_DIR_REC_LEN(de->name_len))
++ error_msg = "rec_len is too small for name_len";
++ else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
++ error_msg = "directory entry across blocks";
++ else if (le32_to_cpu(de->inode) >
++ le32_to_cpu(dir->i_sb->u.ext3_sb.s_es->s_inodes_count))
++ error_msg = "inode out of bounds";
++
++ if (error_msg != NULL)
++ ext3_error (dir->i_sb, function,
++ "bad entry in directory #%lu: %s - "
++ "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
++ dir->i_ino, error_msg, offset,
++ (unsigned long) le32_to_cpu(de->inode),
++ rlen, de->name_len);
++ return error_msg == NULL ? 1 : 0;
++}
++
++static int ext3_readdir(struct file * filp,
++ void * dirent, filldir_t filldir)
++{
++ int error = 0;
++ unsigned long offset, blk;
++ int i, num, stored;
++ struct buffer_head * bh, * tmp, * bha[16];
++ struct ext3_dir_entry_2 * de;
++ struct super_block * sb;
++ int err;
++ struct inode *inode = filp->f_dentry->d_inode;
++
++ sb = inode->i_sb;
++
++ stored = 0;
++ bh = NULL;
++ offset = filp->f_pos & (sb->s_blocksize - 1);
++
++ while (!error && !stored && filp->f_pos < inode->i_size) {
++ blk = (filp->f_pos) >> EXT3_BLOCK_SIZE_BITS(sb);
++ bh = ext3_bread (0, inode, blk, 0, &err);
++ if (!bh) {
++ ext3_error (sb, "ext3_readdir",
++ "directory #%lu contains a hole at offset %lu",
++ inode->i_ino, (unsigned long)filp->f_pos);
++ filp->f_pos += sb->s_blocksize - offset;
++ continue;
++ }
++
++ /*
++ * Do the readahead
++ */
++ if (!offset) {
++ for (i = 16 >> (EXT3_BLOCK_SIZE_BITS(sb) - 9), num = 0;
++ i > 0; i--) {
++ tmp = ext3_getblk (NULL, inode, ++blk, 0, &err);
++ if (tmp && !buffer_uptodate(tmp) &&
++ !buffer_locked(tmp))
++ bha[num++] = tmp;
++ else
++ brelse (tmp);
++ }
++ if (num) {
++ ll_rw_block (READA, num, bha);
++ for (i = 0; i < num; i++)
++ brelse (bha[i]);
++ }
++ }
++
++revalidate:
++ /* If the dir block has changed since the last call to
++ * readdir(2), then we might be pointing to an invalid
++ * dirent right now. Scan from the start of the block
++ * to make sure. */
++ if (filp->f_version != inode->i_version) {
++ for (i = 0; i < sb->s_blocksize && i < offset; ) {
++ de = (struct ext3_dir_entry_2 *)
++ (bh->b_data + i);
++ /* It's too expensive to do a full
++ * dirent test each time round this
++ * loop, but we do have to test at
++ * least that it is non-zero. A
++ * failure will be detected in the
++ * dirent test below. */
++ if (le16_to_cpu(de->rec_len) <
++ EXT3_DIR_REC_LEN(1))
++ break;
++ i += le16_to_cpu(de->rec_len);
++ }
++ offset = i;
++ filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
++ | offset;
++ filp->f_version = inode->i_version;
++ }
++
++ while (!error && filp->f_pos < inode->i_size
++ && offset < sb->s_blocksize) {
++ de = (struct ext3_dir_entry_2 *) (bh->b_data + offset);
++ if (!ext3_check_dir_entry ("ext3_readdir", inode, de,
++ bh, offset)) {
++ /* On error, skip the f_pos to the
++ next block. */
++ filp->f_pos = (filp->f_pos |
++ (sb->s_blocksize - 1)) + 1;
++ brelse (bh);
++ return stored;
++ }
++ offset += le16_to_cpu(de->rec_len);
++ if (le32_to_cpu(de->inode)) {
++ /* We might block in the next section
++ * if the data destination is
++ * currently swapped out. So, use a
++ * version stamp to detect whether or
++ * not the directory has been modified
++ * during the copy operation.
++ */
++ unsigned long version = filp->f_version;
++ unsigned char d_type = DT_UNKNOWN;
++
++ if (EXT3_HAS_INCOMPAT_FEATURE(sb,
++ EXT3_FEATURE_INCOMPAT_FILETYPE)
++ && de->file_type < EXT3_FT_MAX)
++ d_type =
++ ext3_filetype_table[de->file_type];
++ error = filldir(dirent, de->name,
++ de->name_len,
++ filp->f_pos,
++ le32_to_cpu(de->inode),
++ d_type);
++ if (error)
++ break;
++ if (version != filp->f_version)
++ goto revalidate;
++ stored ++;
++ }
++ filp->f_pos += le16_to_cpu(de->rec_len);
++ }
++ offset = 0;
++ brelse (bh);
++ }
++ UPDATE_ATIME(inode);
++ return 0;
++}
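++
++/*
++ * The f_version/i_version check above is a generic stale-cursor
++ * guard: snapshot the version before a copy that may sleep, and
++ * rescan the block if the directory's version moved meanwhile.  The
++ * bare pattern with hypothetical names (illustrative sketch):
++ */
++struct sketch_dir {
++        unsigned long version;  /* bumped on every directory update */
++};
++
++static int sketch_copy_entry(struct sketch_dir *dir)
++{
++        unsigned long snap = dir->version;
++
++        /* ... copy one entry to the caller; the copy may block ... */
++
++        if (snap != dir->version)
++                return -1;      /* raced with an update: revalidate */
++        return 0;
++}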
+diff -rup --new-file linux.mcp2/fs/ext3/file.c linux_tmp/fs/ext3/file.c
+--- linux.mcp2/fs/ext3/file.c 1969-12-31 16:00:00.000000000 -0800
++++ linux_tmp/fs/ext3/file.c 2001-11-15 13:37:55.000000000 -0800
+@@ -0,0 +1,94 @@
++/*
++ * linux/fs/ext3/file.c
++ *
++ * Copyright (C) 1992, 1993, 1994, 1995
++ * Remy Card (card@masi.ibp.fr)
++ * Laboratoire MASI - Institut Blaise Pascal
++ * Universite Pierre et Marie Curie (Paris VI)
++ *
++ * from
++ *
++ * linux/fs/minix/file.c
++ *
++ * Copyright (C) 1991, 1992 Linus Torvalds
++ *
++ * ext3 fs regular file handling primitives
++ *
++ * 64-bit file support on 64-bit platforms by Jakub Jelinek
++ * (jj@sunsite.ms.mff.cuni.cz)
++ */
++
++#include <linux/sched.h>
++#include <linux/fs.h>
++#include <linux/locks.h>
++#include <linux/jbd.h>
++#include <linux/ext3_fs.h>
++#include <linux/ext3_jbd.h>
++#include <linux/smp_lock.h>
++
++/*
++ * Called when an inode is released. Note that this is different
++ * from ext3_file_open: open gets called at every open, but release
++ * gets called only when /all/ the files are closed.
++ */
++static int ext3_release_file (struct inode * inode, struct file * filp)
++{
++ if (filp->f_mode & FMODE_WRITE)
++ ext3_discard_prealloc (inode);
++ return 0;
++}
++
++/*
++ * Called when an inode is about to be opened.
++ * We use this to disallow opening RW large files on 32bit systems if
++ * the caller didn't specify O_LARGEFILE. On 64bit systems we force
++ * on this flag in sys_open.
++ */
++static int ext3_open_file (struct inode * inode, struct file * filp)
++{
++ if (!(filp->f_flags & O_LARGEFILE) &&
++ inode->i_size > 0x7FFFFFFFLL)
++ return -EFBIG;
++ return 0;
++}
++
++/*
++ * ext3_file_write().
++ *
++ * Most things are done in ext3_prepare_write() and ext3_commit_write().
++ */
++
++static ssize_t
++ext3_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos)
++{
++ struct inode *inode = file->f_dentry->d_inode;
++
++ /*
++ * Nasty: if the file is subject to synchronous writes then we need
++ * to force generic_osync_inode() to call ext3_write_inode().
++ * We do that by marking the inode dirty. This adds much more
++ * computational expense than we need, but we're going to sync
++ * anyway.
++ */
++ if (IS_SYNC(inode) || (file->f_flags & O_SYNC))
++ mark_inode_dirty(inode);
++
++ return generic_file_write(file, buf, count, ppos);
++}
++
++struct file_operations ext3_file_operations = {
++ llseek: generic_file_llseek, /* BKL held */
++ read: generic_file_read, /* BKL not held. Don't need */
++ write: ext3_file_write, /* BKL not held. Don't need */
++ ioctl: ext3_ioctl, /* BKL held */
++ mmap: generic_file_mmap,
++ open: ext3_open_file, /* BKL not held. Don't need */
++ release: ext3_release_file, /* BKL not held. Don't need */
++ fsync: ext3_sync_file, /* BKL held */
++};
++
++struct inode_operations ext3_file_inode_operations = {
++ truncate: ext3_truncate, /* BKL held */
++ setattr: ext3_setattr, /* BKL held */
++};
++
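++/*
++ * Illustrative sketch, not part of the original patch (the helper name
++ * is ours): the O_SYNC trick from ext3_file_write() above, in
++ * isolation.  generic_file_write() finishes synchronous writes with
++ * generic_osync_inode(), which only writes the inode back if it is
++ * dirty; dirtying it up front therefore guarantees the inode itself is
++ * synced too, at the cost of a redundant mark when nothing in the
++ * inode actually changed.
++ */
++static inline void ext3_force_inode_osync(struct file *file,
++					  struct inode *inode)
++{
++	if (IS_SYNC(inode) || (file->f_flags & O_SYNC))
++		mark_inode_dirty(inode);
++}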
+diff -rup --new-file linux.mcp2/fs/ext3/fsync.c linux_tmp/fs/ext3/fsync.c
+--- linux.mcp2/fs/ext3/fsync.c 1969-12-31 16:00:00.000000000 -0800
++++ linux_tmp/fs/ext3/fsync.c 2001-11-20 21:34:13.000000000 -0800
+@@ -0,0 +1,70 @@
++/*
++ * linux/fs/ext3/fsync.c
++ *
++ * Copyright (C) 1993 Stephen Tweedie (sct@redhat.com)
++ * from
++ * Copyright (C) 1992 Remy Card (card@masi.ibp.fr)
++ * Laboratoire MASI - Institut Blaise Pascal
++ * Universite Pierre et Marie Curie (Paris VI)
++ * from
++ * linux/fs/minix/truncate.c Copyright (C) 1991, 1992 Linus Torvalds
++ *
++ * ext3fs fsync primitive
++ *
++ * Big-endian to little-endian byte-swapping/bitmaps by
++ * David S. Miller (davem@caip.rutgers.edu), 1995
++ *
++ * Removed unnecessary code duplication for little endian machines
++ * and excessive __inline__s.
++ * Andi Kleen, 1997
++ *
++ * Major simplifications and cleanup - we only need to do the metadata, because
++ * we can depend on generic_block_fdatasync() to sync the data blocks.
++ */
++
++#include <linux/sched.h>
++#include <linux/fs.h>
++#include <linux/jbd.h>
++#include <linux/ext3_fs.h>
++#include <linux/ext3_jbd.h>
++#include <linux/jbd.h>
++#include <linux/smp_lock.h>
++
++/*
++ * akpm: A new design for ext3_sync_file().
++ *
++ * This is only called from sys_fsync(), sys_fdatasync() and sys_msync().
++ * There cannot be a transaction open by this task. (AKPM: quotas?)
++ * Another task could have dirtied this inode. Its data can be in any
++ * state in the journalling system.
++ *
++ * What we do is just kick off a commit and wait on it. This will snapshot the
++ * inode to disk.
++ *
++ * Note that there is a serious optimisation we can make here: if the current
++ * inode is not part of j_running_transaction or j_committing_transaction
++ * then we have nothing to do. That would require implementation of t_ilist,
++ * which isn't too hard.
++ */
++
++int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
++{
++ struct inode *inode = dentry->d_inode;
++ int ret;
++
++ J_ASSERT(ext3_journal_current_handle() == 0);
++
++ /*
++ * fsync_inode_buffers() just walks i_dirty_buffers and waits
++ * on them. It's a no-op for full data journalling because
++	 * i_dirty_buffers will be empty.
++ * Really, we only need to start I/O on the dirty buffers -
++ * we'll end up waiting on them in commit.
++ */
++ ret = fsync_inode_buffers(inode);
++ ret |= fsync_inode_data_buffers(inode);
++
++ ext3_force_commit(inode->i_sb);
++
++ return ret;
++}
+diff -rup --new-file linux.mcp2/fs/ext3/ialloc.c linux_tmp/fs/ext3/ialloc.c
+--- linux.mcp2/fs/ext3/ialloc.c 1969-12-31 16:00:00.000000000 -0800
++++ linux_tmp/fs/ext3/ialloc.c 2002-02-25 11:38:08.000000000 -0800
+@@ -0,0 +1,663 @@
++/*
++ * linux/fs/ext3/ialloc.c
++ *
++ * Copyright (C) 1992, 1993, 1994, 1995
++ * Remy Card (card@masi.ibp.fr)
++ * Laboratoire MASI - Institut Blaise Pascal
++ * Universite Pierre et Marie Curie (Paris VI)
++ *
++ * BSD ufs-inspired inode and directory allocation by
++ * Stephen Tweedie (sct@redhat.com), 1993
++ * Big-endian to little-endian byte-swapping/bitmaps by
++ * David S. Miller (davem@caip.rutgers.edu), 1995
++ */
++
++#include <linux/sched.h>
++#include <linux/fs.h>
++#include <linux/jbd.h>
++#include <linux/ext3_fs.h>
++#include <linux/ext3_jbd.h>
++#include <linux/stat.h>
++#include <linux/string.h>
++#include <linux/locks.h>
++#include <linux/quotaops.h>
++
++#include <asm/bitops.h>
++#include <asm/byteorder.h>
++
++/*
++ * ialloc.c contains the inodes allocation and deallocation routines
++ */
++
++/*
++ * The free inodes are managed by bitmaps. A file system contains several
++ * block groups. Each group contains 1 bitmap block for blocks, 1 bitmap
++ * block for inodes, N blocks for the inode table and data blocks.
++ *
++ * The file system contains group descriptors which are located after the
++ * super block. Each descriptor contains the number of the bitmap block and
++ * the free blocks count in the block. The descriptors are loaded in memory
++ * when a file system is mounted (see ext3_read_super).
++ */
++
++
++/*
++ * Read the inode allocation bitmap for a given block_group, reading
++ * into the specified slot in the superblock's bitmap cache.
++ *
++ * Return >=0 on success or a -ve error code.
++ */
++static int read_inode_bitmap (struct super_block * sb,
++ unsigned long block_group,
++ unsigned int bitmap_nr)
++{
++ struct ext3_group_desc * gdp;
++ struct buffer_head * bh = NULL;
++ int retval = 0;
++
++ gdp = ext3_get_group_desc (sb, block_group, NULL);
++ if (!gdp) {
++ retval = -EIO;
++ goto error_out;
++ }
++ bh = sb_bread(sb, le32_to_cpu(gdp->bg_inode_bitmap));
++ if (!bh) {
++ ext3_error (sb, "read_inode_bitmap",
++ "Cannot read inode bitmap - "
++ "block_group = %lu, inode_bitmap = %lu",
++ block_group, (unsigned long) gdp->bg_inode_bitmap);
++ retval = -EIO;
++ }
++ /*
++ * On IO error, just leave a zero in the superblock's block pointer for
++ * this group. The IO will be retried next time.
++ */
++error_out:
++ sb->u.ext3_sb.s_inode_bitmap_number[bitmap_nr] = block_group;
++ sb->u.ext3_sb.s_inode_bitmap[bitmap_nr] = bh;
++ return retval;
++}
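++
++/*
++ * Worked sketch, not part of the original patch (the helper name is
++ * ours): how an inode number maps to a bitmap position, as computed
++ * inline by ext3_free_inode() and ext3_orphan_get() below.  Inode
++ * numbers start at 1, so inode 1 is bit 0 of group 0.
++ */
++static inline void ext3_ino_to_bitmap_pos(struct super_block *sb,
++					  unsigned long ino,
++					  unsigned long *group,
++					  unsigned long *bit)
++{
++	*group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
++	*bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb);
++}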
++
++/*
++ * load_inode_bitmap loads the inode bitmap for a block group
++ *
++ * It maintains a cache for the last bitmaps loaded. This cache is managed
++ * with an LRU algorithm.
++ *
++ * Notes:
++ * 1/ There is one cache per mounted file system.
++ * 2/ If the file system contains fewer than EXT3_MAX_GROUP_LOADED groups,
++ *    this function reads the bitmap without maintaining an LRU cache.
++ *
++ * Return the slot used to store the bitmap, or a -ve error code.
++ */
++static int load_inode_bitmap (struct super_block * sb,
++ unsigned int block_group)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ unsigned long inode_bitmap_number;
++ struct buffer_head * inode_bitmap;
++ int i, j, retval = 0;
++
++ if (block_group >= sbi->s_groups_count)
++ ext3_panic (sb, "load_inode_bitmap",
++ "block_group >= groups_count - "
++ "block_group = %d, groups_count = %lu",
++ block_group, sbi->s_groups_count);
++ if (sbi->s_loaded_inode_bitmaps > 0 &&
++ sbi->s_inode_bitmap_number[0] == block_group &&
++ sbi->s_inode_bitmap[0] != NULL)
++ return 0;
++ if (sbi->s_groups_count <= EXT3_MAX_GROUP_LOADED) {
++ if (sbi->s_inode_bitmap[block_group]) {
++ if (sbi->s_inode_bitmap_number[block_group] !=
++ block_group)
++ ext3_panic(sb, "load_inode_bitmap",
++ "block_group != inode_bitmap_number");
++ return block_group;
++ }
++ retval = read_inode_bitmap(sb, block_group, block_group);
++ if (retval < 0)
++ return retval;
++ return block_group;
++ }
++
++ for (i = 0; i < sbi->s_loaded_inode_bitmaps &&
++ sbi->s_inode_bitmap_number[i] != block_group; i++)
++ /* do nothing */;
++ if (i < sbi->s_loaded_inode_bitmaps &&
++ sbi->s_inode_bitmap_number[i] == block_group) {
++ inode_bitmap_number = sbi->s_inode_bitmap_number[i];
++ inode_bitmap = sbi->s_inode_bitmap[i];
++ for (j = i; j > 0; j--) {
++ sbi->s_inode_bitmap_number[j] =
++ sbi->s_inode_bitmap_number[j - 1];
++ sbi->s_inode_bitmap[j] = sbi->s_inode_bitmap[j - 1];
++ }
++ sbi->s_inode_bitmap_number[0] = inode_bitmap_number;
++ sbi->s_inode_bitmap[0] = inode_bitmap;
++
++ /*
++ * There's still one special case here --- if inode_bitmap == 0
++ * then our last attempt to read the bitmap failed and we have
++ * just ended up caching that failure. Try again to read it.
++ */
++ if (!inode_bitmap)
++ retval = read_inode_bitmap (sb, block_group, 0);
++ } else {
++ if (sbi->s_loaded_inode_bitmaps < EXT3_MAX_GROUP_LOADED)
++ sbi->s_loaded_inode_bitmaps++;
++ else
++ brelse(sbi->s_inode_bitmap[EXT3_MAX_GROUP_LOADED - 1]);
++ for (j = sbi->s_loaded_inode_bitmaps - 1; j > 0; j--) {
++ sbi->s_inode_bitmap_number[j] =
++ sbi->s_inode_bitmap_number[j - 1];
++ sbi->s_inode_bitmap[j] = sbi->s_inode_bitmap[j - 1];
++ }
++ retval = read_inode_bitmap (sb, block_group, 0);
++ }
++ return retval;
++}
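++
++/*
++ * Illustrative sketch, not part of the original patch (the helper name
++ * is ours): the move-to-front step of the LRU cache managed by
++ * load_inode_bitmap() above.  The cache is a pair of parallel arrays
++ * ordered most-recently-used first; promoting slot i shifts slots
++ * 0..i-1 down one place and installs the promoted entry at slot 0.
++ */
++static void ext3_bitmap_cache_promote(unsigned long *number,
++				      struct buffer_head **bh, int i)
++{
++	unsigned long n = number[i];
++	struct buffer_head *b = bh[i];
++	int j;
++
++	for (j = i; j > 0; j--) {
++		number[j] = number[j - 1];
++		bh[j] = bh[j - 1];
++	}
++	number[0] = n;
++	bh[0] = b;
++}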
++
++/*
++ * NOTE! When we get the inode, we're the only people
++ * that have access to it, and as such there are no
++ * race conditions we have to worry about. The inode
++ * is not on the hash-lists, and it cannot be reached
++ * through the filesystem because the directory entry
++ * has been deleted earlier.
++ *
++ * HOWEVER: we must make sure that we get no aliases,
++ * which means that we have to call "clear_inode()"
++ * _before_ we mark the inode not in use in the inode
++ * bitmaps. Otherwise a newly created file might use
++ * the same inode number (not actually the same pointer
++ * though), and then we'd have two inodes sharing the
++ * same inode number and space on the harddisk.
++ */
++void ext3_free_inode (handle_t *handle, struct inode * inode)
++{
++ struct super_block * sb = inode->i_sb;
++ int is_directory;
++ unsigned long ino;
++ struct buffer_head * bh;
++ struct buffer_head * bh2;
++ unsigned long block_group;
++ unsigned long bit;
++ int bitmap_nr;
++ struct ext3_group_desc * gdp;
++ struct ext3_super_block * es;
++ int fatal = 0, err;
++
++ if (!inode->i_dev) {
++ printk ("ext3_free_inode: inode has no device\n");
++ return;
++ }
++ if (atomic_read(&inode->i_count) > 1) {
++ printk ("ext3_free_inode: inode has count=%d\n",
++ atomic_read(&inode->i_count));
++ return;
++ }
++ if (inode->i_nlink) {
++ printk ("ext3_free_inode: inode has nlink=%d\n",
++ inode->i_nlink);
++ return;
++ }
++ if (!sb) {
++ printk("ext3_free_inode: inode on nonexistent device\n");
++ return;
++ }
++
++ ino = inode->i_ino;
++ ext3_debug ("freeing inode %lu\n", ino);
++
++ /*
++ * Note: we must free any quota before locking the superblock,
++ * as writing the quota to disk may need the lock as well.
++ */
++ DQUOT_INIT(inode);
++ DQUOT_FREE_INODE(inode);
++ DQUOT_DROP(inode);
++
++ is_directory = S_ISDIR(inode->i_mode);
++
++ /* Do this BEFORE marking the inode not in use or returning an error */
++ clear_inode (inode);
++
++ lock_super (sb);
++ es = sb->u.ext3_sb.s_es;
++ if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
++ ext3_error (sb, "ext3_free_inode",
++ "reserved or nonexistent inode %lu", ino);
++ goto error_return;
++ }
++ block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
++ bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb);
++ bitmap_nr = load_inode_bitmap (sb, block_group);
++ if (bitmap_nr < 0)
++ goto error_return;
++
++ bh = sb->u.ext3_sb.s_inode_bitmap[bitmap_nr];
++
++ BUFFER_TRACE(bh, "get_write_access");
++ fatal = ext3_journal_get_write_access(handle, bh);
++ if (fatal)
++ goto error_return;
++
++ /* Ok, now we can actually update the inode bitmaps.. */
++ if (!ext3_clear_bit (bit, bh->b_data))
++ ext3_error (sb, "ext3_free_inode",
++ "bit already cleared for inode %lu", ino);
++ else {
++ gdp = ext3_get_group_desc (sb, block_group, &bh2);
++
++ BUFFER_TRACE(bh2, "get_write_access");
++ fatal = ext3_journal_get_write_access(handle, bh2);
++ if (fatal) goto error_return;
++
++ BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get write access");
++ fatal = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh);
++ if (fatal) goto error_return;
++
++ if (gdp) {
++ gdp->bg_free_inodes_count = cpu_to_le16(
++ le16_to_cpu(gdp->bg_free_inodes_count) + 1);
++ if (is_directory)
++ gdp->bg_used_dirs_count = cpu_to_le16(
++ le16_to_cpu(gdp->bg_used_dirs_count) - 1);
++ }
++ BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata");
++ err = ext3_journal_dirty_metadata(handle, bh2);
++ if (!fatal) fatal = err;
++ es->s_free_inodes_count =
++ cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) + 1);
++ BUFFER_TRACE(sb->u.ext3_sb.s_sbh,
++ "call ext3_journal_dirty_metadata");
++ err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh);
++ if (!fatal) fatal = err;
++ }
++ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
++ err = ext3_journal_dirty_metadata(handle, bh);
++ if (!fatal)
++ fatal = err;
++ sb->s_dirt = 1;
++error_return:
++ ext3_std_error(sb, fatal);
++ unlock_super(sb);
++}
++
++/*
++ * There are two policies for allocating an inode. If the new inode is
++ * a directory, then a forward search is made for a block group with both
++ * free space and a low directory-to-inode ratio; if that fails, then of
++ * the groups with above-average free space, the one with the fewest
++ * directories is chosen.
++ *
++ * For other inodes, search forward from the parent directory's block
++ * group to find a free inode.
++ */
++struct inode * ext3_new_inode (handle_t *handle,
++ const struct inode * dir, int mode)
++{
++ struct super_block * sb;
++ struct buffer_head * bh;
++ struct buffer_head * bh2;
++ int i, j, avefreei;
++ struct inode * inode;
++ int bitmap_nr;
++ struct ext3_group_desc * gdp;
++ struct ext3_group_desc * tmp;
++ struct ext3_super_block * es;
++ int err = 0;
++
++ /* Cannot create files in a deleted directory */
++ if (!dir || !dir->i_nlink)
++ return ERR_PTR(-EPERM);
++
++ sb = dir->i_sb;
++ inode = new_inode(sb);
++ if (!inode)
++ return ERR_PTR(-ENOMEM);
++ init_rwsem(&inode->u.ext3_i.truncate_sem);
++
++ lock_super (sb);
++ es = sb->u.ext3_sb.s_es;
++repeat:
++ gdp = NULL;
++ i = 0;
++
++ if (S_ISDIR(mode)) {
++ avefreei = le32_to_cpu(es->s_free_inodes_count) /
++ sb->u.ext3_sb.s_groups_count;
++ if (!gdp) {
++ for (j = 0; j < sb->u.ext3_sb.s_groups_count; j++) {
++ struct buffer_head *temp_buffer;
++ tmp = ext3_get_group_desc (sb, j, &temp_buffer);
++ if (tmp &&
++ le16_to_cpu(tmp->bg_free_inodes_count) &&
++ le16_to_cpu(tmp->bg_free_inodes_count) >=
++ avefreei) {
++ if (!gdp || (le16_to_cpu(tmp->bg_free_blocks_count) >
++ le16_to_cpu(gdp->bg_free_blocks_count))) {
++ i = j;
++ gdp = tmp;
++ bh2 = temp_buffer;
++ }
++ }
++ }
++ }
++ } else {
++ /*
++ * Try to place the inode in its parent directory
++ */
++ i = dir->u.ext3_i.i_block_group;
++ tmp = ext3_get_group_desc (sb, i, &bh2);
++ if (tmp && le16_to_cpu(tmp->bg_free_inodes_count))
++ gdp = tmp;
++ else
++ {
++ /*
++ * Use a quadratic hash to find a group with a
++ * free inode
++ */
++ for (j = 1; j < sb->u.ext3_sb.s_groups_count; j <<= 1) {
++ i += j;
++ if (i >= sb->u.ext3_sb.s_groups_count)
++ i -= sb->u.ext3_sb.s_groups_count;
++ tmp = ext3_get_group_desc (sb, i, &bh2);
++ if (tmp &&
++ le16_to_cpu(tmp->bg_free_inodes_count)) {
++ gdp = tmp;
++ break;
++ }
++ }
++ }
++ if (!gdp) {
++ /*
++ * That failed: try linear search for a free inode
++ */
++ i = dir->u.ext3_i.i_block_group + 1;
++ for (j = 2; j < sb->u.ext3_sb.s_groups_count; j++) {
++ if (++i >= sb->u.ext3_sb.s_groups_count)
++ i = 0;
++ tmp = ext3_get_group_desc (sb, i, &bh2);
++ if (tmp &&
++ le16_to_cpu(tmp->bg_free_inodes_count)) {
++ gdp = tmp;
++ break;
++ }
++ }
++ }
++ }
++
++ err = -ENOSPC;
++ if (!gdp)
++ goto fail;
++
++ err = -EIO;
++ bitmap_nr = load_inode_bitmap (sb, i);
++ if (bitmap_nr < 0)
++ goto fail;
++
++ bh = sb->u.ext3_sb.s_inode_bitmap[bitmap_nr];
++
++ if ((j = ext3_find_first_zero_bit ((unsigned long *) bh->b_data,
++ EXT3_INODES_PER_GROUP(sb))) <
++ EXT3_INODES_PER_GROUP(sb)) {
++ BUFFER_TRACE(bh, "get_write_access");
++ err = ext3_journal_get_write_access(handle, bh);
++ if (err) goto fail;
++
++ if (ext3_set_bit (j, bh->b_data)) {
++ ext3_error (sb, "ext3_new_inode",
++ "bit already set for inode %d", j);
++ goto repeat;
++ }
++ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
++ err = ext3_journal_dirty_metadata(handle, bh);
++ if (err) goto fail;
++ } else {
++ if (le16_to_cpu(gdp->bg_free_inodes_count) != 0) {
++ ext3_error (sb, "ext3_new_inode",
++ "Free inodes count corrupted in group %d",
++ i);
++ /* Is it really ENOSPC? */
++ err = -ENOSPC;
++ if (sb->s_flags & MS_RDONLY)
++ goto fail;
++
++ BUFFER_TRACE(bh2, "get_write_access");
++ err = ext3_journal_get_write_access(handle, bh2);
++ if (err) goto fail;
++ gdp->bg_free_inodes_count = 0;
++ BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata");
++ err = ext3_journal_dirty_metadata(handle, bh2);
++ if (err) goto fail;
++ }
++ goto repeat;
++ }
++ j += i * EXT3_INODES_PER_GROUP(sb) + 1;
++ if (j < EXT3_FIRST_INO(sb) || j > le32_to_cpu(es->s_inodes_count)) {
++ ext3_error (sb, "ext3_new_inode",
++ "reserved inode or inode > inodes count - "
++			    "block_group = %d, inode = %d", i, j);
++ err = -EIO;
++ goto fail;
++ }
++
++ BUFFER_TRACE(bh2, "get_write_access");
++ err = ext3_journal_get_write_access(handle, bh2);
++ if (err) goto fail;
++ gdp->bg_free_inodes_count =
++ cpu_to_le16(le16_to_cpu(gdp->bg_free_inodes_count) - 1);
++ if (S_ISDIR(mode))
++ gdp->bg_used_dirs_count =
++ cpu_to_le16(le16_to_cpu(gdp->bg_used_dirs_count) + 1);
++ BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata");
++ err = ext3_journal_dirty_metadata(handle, bh2);
++ if (err) goto fail;
++
++ BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access");
++ err = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh);
++ if (err) goto fail;
++ es->s_free_inodes_count =
++ cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) - 1);
++ BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "call ext3_journal_dirty_metadata");
++ err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh);
++ sb->s_dirt = 1;
++ if (err) goto fail;
++
++ inode->i_uid = current->fsuid;
++ if (test_opt (sb, GRPID))
++ inode->i_gid = dir->i_gid;
++ else if (dir->i_mode & S_ISGID) {
++ inode->i_gid = dir->i_gid;
++ if (S_ISDIR(mode))
++ mode |= S_ISGID;
++ } else
++ inode->i_gid = current->fsgid;
++ inode->i_mode = mode;
++
++ inode->i_ino = j;
++ /* This is the optimal IO size (for stat), not the fs block size */
++ inode->i_blksize = PAGE_SIZE;
++ inode->i_blocks = 0;
++ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
++ inode->u.ext3_i.i_flags = dir->u.ext3_i.i_flags & ~EXT3_INDEX_FL;
++ if (S_ISLNK(mode))
++ inode->u.ext3_i.i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL);
++#ifdef EXT3_FRAGMENTS
++ inode->u.ext3_i.i_faddr = 0;
++ inode->u.ext3_i.i_frag_no = 0;
++ inode->u.ext3_i.i_frag_size = 0;
++#endif
++ inode->u.ext3_i.i_file_acl = 0;
++ inode->u.ext3_i.i_dir_acl = 0;
++ inode->u.ext3_i.i_dtime = 0;
++ INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan);
++#ifdef EXT3_PREALLOCATE
++ inode->u.ext3_i.i_prealloc_count = 0;
++#endif
++ inode->u.ext3_i.i_block_group = i;
++
++ if (inode->u.ext3_i.i_flags & EXT3_SYNC_FL)
++ inode->i_flags |= S_SYNC;
++ if (IS_SYNC(inode))
++ handle->h_sync = 1;
++ insert_inode_hash(inode);
++ inode->i_generation = sb->u.ext3_sb.s_next_generation++;
++
++ inode->u.ext3_i.i_state = EXT3_STATE_NEW;
++ err = ext3_mark_inode_dirty(handle, inode);
++ if (err) goto fail;
++
++ unlock_super (sb);
++	if (DQUOT_ALLOC_INODE(inode)) {
++ DQUOT_DROP(inode);
++ inode->i_flags |= S_NOQUOTA;
++ inode->i_nlink = 0;
++ iput(inode);
++ return ERR_PTR(-EDQUOT);
++ }
++ ext3_debug ("allocating inode %lu\n", inode->i_ino);
++ return inode;
++
++fail:
++ unlock_super(sb);
++ iput(inode);
++ ext3_std_error(sb, err);
++ return ERR_PTR(err);
++}
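++
++/*
++ * Illustrative sketch, not part of the original patch (the helper and
++ * its callback are ours): the quadratic hash ext3_new_inode() uses
++ * above for non-directories.  Starting from the parent directory's
++ * group it probes offsets 1, 2, 4, 8, ... modulo the group count,
++ * spreading inodes cheaply before the caller falls back to a linear
++ * scan.
++ */
++static int ext3_quadratic_probe(struct super_block *sb, int start,
++				int (*has_free_inodes)(struct super_block *,
++						       int))
++{
++	int ngroups = sb->u.ext3_sb.s_groups_count;
++	int i = start, j;
++
++	for (j = 1; j < ngroups; j <<= 1) {
++		i += j;
++		if (i >= ngroups)
++			i -= ngroups;
++		if (has_free_inodes(sb, i))
++			return i;
++	}
++	return -1;	/* no luck: fall back to the linear search */
++}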
++
++/* Verify that we are loading a valid orphan from disk */
++struct inode *ext3_orphan_get (struct super_block * sb, ino_t ino)
++{
++ ino_t max_ino = le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count);
++ unsigned long block_group;
++ int bit;
++ int bitmap_nr;
++ struct buffer_head *bh;
++ struct inode *inode = NULL;
++
++ /* Error cases - e2fsck has already cleaned up for us */
++ if (ino > max_ino) {
++ ext3_warning(sb, __FUNCTION__,
++ "bad orphan ino %ld! e2fsck was run?\n", ino);
++ return NULL;
++ }
++
++ block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
++ bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb);
++ if ((bitmap_nr = load_inode_bitmap(sb, block_group)) < 0 ||
++ !(bh = EXT3_SB(sb)->s_inode_bitmap[bitmap_nr])) {
++ ext3_warning(sb, __FUNCTION__,
++ "inode bitmap error for orphan %ld\n", ino);
++ return NULL;
++ }
++
++ /* Having the inode bit set should be a 100% indicator that this
++ * is a valid orphan (no e2fsck run on fs). Orphans also include
++ * inodes that were being truncated, so we can't check i_nlink==0.
++ */
++ if (!ext3_test_bit(bit, bh->b_data) || !(inode = iget(sb, ino)) ||
++ is_bad_inode(inode) || NEXT_ORPHAN(inode) > max_ino) {
++ ext3_warning(sb, __FUNCTION__,
++ "bad orphan inode %ld! e2fsck was run?\n", ino);
++ printk(KERN_NOTICE "ext3_test_bit(bit=%d, block=%ld) = %d\n",
++ bit, bh->b_blocknr, ext3_test_bit(bit, bh->b_data));
++ printk(KERN_NOTICE "inode=%p\n", inode);
++ if (inode) {
++ printk(KERN_NOTICE "is_bad_inode(inode)=%d\n",
++ is_bad_inode(inode));
++ printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%d\n",
++ NEXT_ORPHAN(inode));
++ printk(KERN_NOTICE "max_ino=%ld\n", max_ino);
++ }
++ /* Avoid freeing blocks if we got a bad deleted inode */
++ if (inode && inode->i_nlink == 0)
++ inode->i_blocks = 0;
++ iput(inode);
++ return NULL;
++ }
++
++ return inode;
++}
++
++unsigned long ext3_count_free_inodes (struct super_block * sb)
++{
++#ifdef EXT3FS_DEBUG
++ struct ext3_super_block * es;
++ unsigned long desc_count, bitmap_count, x;
++ int bitmap_nr;
++ struct ext3_group_desc * gdp;
++ int i;
++
++ lock_super (sb);
++ es = sb->u.ext3_sb.s_es;
++ desc_count = 0;
++ bitmap_count = 0;
++ gdp = NULL;
++ for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) {
++ gdp = ext3_get_group_desc (sb, i, NULL);
++ if (!gdp)
++ continue;
++ desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
++ bitmap_nr = load_inode_bitmap (sb, i);
++ if (bitmap_nr < 0)
++ continue;
++
++ x = ext3_count_free (sb->u.ext3_sb.s_inode_bitmap[bitmap_nr],
++ EXT3_INODES_PER_GROUP(sb) / 8);
++ printk ("group %d: stored = %d, counted = %lu\n",
++ i, le16_to_cpu(gdp->bg_free_inodes_count), x);
++ bitmap_count += x;
++ }
++ printk("ext3_count_free_inodes: stored = %lu, computed = %lu, %lu\n",
++ le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count);
++ unlock_super (sb);
++ return desc_count;
++#else
++ return le32_to_cpu(sb->u.ext3_sb.s_es->s_free_inodes_count);
++#endif
++}
++
++#ifdef CONFIG_EXT3_CHECK
++/* Called at mount-time, super-block is locked */
++void ext3_check_inodes_bitmap (struct super_block * sb)
++{
++ struct ext3_super_block * es;
++ unsigned long desc_count, bitmap_count, x;
++ int bitmap_nr;
++ struct ext3_group_desc * gdp;
++ int i;
++
++ es = sb->u.ext3_sb.s_es;
++ desc_count = 0;
++ bitmap_count = 0;
++ gdp = NULL;
++ for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) {
++ gdp = ext3_get_group_desc (sb, i, NULL);
++ if (!gdp)
++ continue;
++ desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
++ bitmap_nr = load_inode_bitmap (sb, i);
++ if (bitmap_nr < 0)
++ continue;
++
++ x = ext3_count_free (sb->u.ext3_sb.s_inode_bitmap[bitmap_nr],
++ EXT3_INODES_PER_GROUP(sb) / 8);
++ if (le16_to_cpu(gdp->bg_free_inodes_count) != x)
++ ext3_error (sb, "ext3_check_inodes_bitmap",
++ "Wrong free inodes count in group %d, "
++ "stored = %d, counted = %lu", i,
++ le16_to_cpu(gdp->bg_free_inodes_count), x);
++ bitmap_count += x;
++ }
++ if (le32_to_cpu(es->s_free_inodes_count) != bitmap_count)
++ ext3_error (sb, "ext3_check_inodes_bitmap",
++ "Wrong free inodes count in super block, "
++ "stored = %lu, counted = %lu",
++ (unsigned long)le32_to_cpu(es->s_free_inodes_count),
++ bitmap_count);
++}
++#endif
+diff -rup --new-file linux.mcp2/fs/ext3/inode.c linux_tmp/fs/ext3/inode.c
+--- linux.mcp2/fs/ext3/inode.c 1969-12-31 16:00:00.000000000 -0800
++++ linux_tmp/fs/ext3/inode.c 2002-08-02 17:39:45.000000000 -0700
+@@ -0,0 +1,2699 @@
++/*
++ * linux/fs/ext3/inode.c
++ *
++ * Copyright (C) 1992, 1993, 1994, 1995
++ * Remy Card (card@masi.ibp.fr)
++ * Laboratoire MASI - Institut Blaise Pascal
++ * Universite Pierre et Marie Curie (Paris VI)
++ *
++ * from
++ *
++ * linux/fs/minix/inode.c
++ *
++ * Copyright (C) 1991, 1992 Linus Torvalds
++ *
++ * Goal-directed block allocation by Stephen Tweedie
++ * (sct@redhat.com), 1993, 1998
++ * Big-endian to little-endian byte-swapping/bitmaps by
++ * David S. Miller (davem@caip.rutgers.edu), 1995
++ * 64-bit file support on 64-bit platforms by Jakub Jelinek
++ * (jj@sunsite.ms.mff.cuni.cz)
++ *
++ * Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000
++ */
++
++#include <linux/fs.h>
++#include <linux/sched.h>
++#include <linux/ext3_jbd.h>
++#include <linux/jbd.h>
++#include <linux/locks.h>
++#include <linux/smp_lock.h>
++#include <linux/highuid.h>
++#include <linux/quotaops.h>
++#include <linux/module.h>
++
++/*
++ * SEARCH_FROM_ZERO forces each block allocation to search from the start
++ * of the filesystem. This is to force rapid reallocation of recently-freed
++ * blocks. The file fragmentation is horrendous.
++ */
++#undef SEARCH_FROM_ZERO
++
++/* The ext3 forget function must perform a revoke if we are freeing data
++ * which has been journaled. Metadata (eg. indirect blocks) must be
++ * revoked in all cases.
++ *
++ * "bh" may be NULL: a metadata block may have been freed from memory
++ * but there may still be a record of it in the journal, and that record
++ * still needs to be revoked.
++ */
++
++static int ext3_forget(handle_t *handle, int is_metadata,
++ struct inode *inode, struct buffer_head *bh,
++ int blocknr)
++{
++ int err;
++
++ BUFFER_TRACE(bh, "enter");
++
++ jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
++ "data mode %lx\n",
++ bh, is_metadata, inode->i_mode,
++ test_opt(inode->i_sb, DATA_FLAGS));
++
++ /* Never use the revoke function if we are doing full data
++ * journaling: there is no need to, and a V1 superblock won't
++ * support it. Otherwise, only skip the revoke on un-journaled
++ * data blocks. */
++
++ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ||
++ (!is_metadata && !ext3_should_journal_data(inode))) {
++ if (bh) {
++ BUFFER_TRACE(bh, "call journal_forget");
++ ext3_journal_forget(handle, bh);
++ }
++ return 0;
++ }
++
++ /*
++ * data!=journal && (is_metadata || should_journal_data(inode))
++ */
++ BUFFER_TRACE(bh, "call ext3_journal_revoke");
++ err = ext3_journal_revoke(handle, blocknr, bh);
++ if (err)
++ ext3_abort(inode->i_sb, __FUNCTION__,
++ "error %d when attempting revoke", err);
++ BUFFER_TRACE(bh, "exit");
++ return err;
++}
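++
++/*
++ * Illustrative sketch, not part of the original patch (the helper name
++ * is ours): ext3_forget()'s forget-vs-revoke decision reduced to a
++ * predicate.  A revoke is needed only when the block's old contents
++ * may still be live in the journal: never on data=journal mounts (a
++ * V1 superblock won't support revoke anyway), otherwise for metadata
++ * and for journaled data blocks.
++ */
++static inline int ext3_needs_revoke(struct inode *inode, int is_metadata)
++{
++	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA)
++		return 0;
++	return is_metadata || ext3_should_journal_data(inode);
++}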
++
++/*
++ * Truncate transactions can be complex and absolutely huge. So we need to
++ * be able to restart the transaction at a convenient checkpoint to make
++ * sure we don't overflow the journal.
++ *
++ * start_transaction gets us a new handle for a truncate transaction,
++ * and extend_transaction tries to extend the existing one a bit. If
++ * extend fails, we need to propagate the failure up and restart the
++ * transaction in the top-level truncate loop. --sct
++ */
++
++static handle_t *start_transaction(struct inode *inode)
++{
++ long needed;
++ handle_t *result;
++
++ needed = inode->i_blocks;
++ if (needed > EXT3_MAX_TRANS_DATA)
++ needed = EXT3_MAX_TRANS_DATA;
++
++ result = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS + needed);
++ if (!IS_ERR(result))
++ return result;
++
++ ext3_std_error(inode->i_sb, PTR_ERR(result));
++ return result;
++}
++
++/*
++ * Try to extend this transaction for the purposes of truncation.
++ *
++ * Returns 0 if we managed to create more room. If we can't create more
++ * room, and the transaction must be restarted we return 1.
++ */
++static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
++{
++ long needed;
++
++ if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS)
++ return 0;
++ needed = inode->i_blocks;
++ if (needed > EXT3_MAX_TRANS_DATA)
++ needed = EXT3_MAX_TRANS_DATA;
++ if (!ext3_journal_extend(handle, EXT3_RESERVE_TRANS_BLOCKS + needed))
++ return 0;
++ return 1;
++}
++
++/*
++ * Restart the transaction associated with *handle. This does a commit,
++ * so before we call here everything must be consistently dirtied against
++ * this transaction.
++ */
++static int ext3_journal_test_restart(handle_t *handle, struct inode *inode)
++{
++ long needed = inode->i_blocks;
++ if (needed > EXT3_MAX_TRANS_DATA)
++ needed = EXT3_MAX_TRANS_DATA;
++ jbd_debug(2, "restarting handle %p\n", handle);
++ return ext3_journal_restart(handle, EXT3_DATA_TRANS_BLOCKS + needed);
++}
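++
++/*
++ * Illustrative sketch, not part of the original patch (the helper name
++ * is ours): the extend-or-restart pattern the truncate path builds
++ * from the two helpers above.  Everything must be consistently dirtied
++ * before the restart, because the restart commits the transaction.
++ */
++static void ext3_extend_or_restart(handle_t *handle, struct inode *inode)
++{
++	if (try_to_extend_transaction(handle, inode)) {
++		ext3_mark_inode_dirty(handle, inode);
++		ext3_journal_test_restart(handle, inode);
++	}
++}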
++
++/*
++ * Called at each iput()
++ */
++void ext3_put_inode (struct inode * inode)
++{
++ ext3_discard_prealloc (inode);
++}
++
++/*
++ * Called at the last iput() if i_nlink is zero.
++ */
++void ext3_delete_inode (struct inode * inode)
++{
++ handle_t *handle;
++
++ if (is_bad_inode(inode) ||
++ inode->i_ino == EXT3_ACL_IDX_INO ||
++ inode->i_ino == EXT3_ACL_DATA_INO)
++ goto no_delete;
++
++ lock_kernel();
++ handle = start_transaction(inode);
++ if (IS_ERR(handle)) {
++ /* If we're going to skip the normal cleanup, we still
++ * need to make sure that the in-core orphan linked list
++ * is properly cleaned up. */
++ ext3_orphan_del(NULL, inode);
++
++ ext3_std_error(inode->i_sb, PTR_ERR(handle));
++ unlock_kernel();
++ goto no_delete;
++ }
++
++ if (IS_SYNC(inode))
++ handle->h_sync = 1;
++ inode->i_size = 0;
++ if (inode->i_blocks)
++ ext3_truncate(inode);
++ /*
++ * Kill off the orphan record which ext3_truncate created.
++ * AKPM: I think this can be inside the above `if'.
++ * Note that ext3_orphan_del() has to be able to cope with the
++ * deletion of a non-existent orphan - this is because we don't
++ * know if ext3_truncate() actually created an orphan record.
++ * (Well, we could do this if we need to, but heck - it works)
++ */
++ ext3_orphan_del(handle, inode);
++ inode->u.ext3_i.i_dtime = CURRENT_TIME;
++
++ /*
++ * One subtle ordering requirement: if anything has gone wrong
++ * (transaction abort, IO errors, whatever), then we can still
++ * do these next steps (the fs will already have been marked as
++ * having errors), but we can't free the inode if the mark_dirty
++ * fails.
++ */
++ if (ext3_mark_inode_dirty(handle, inode))
++ /* If that failed, just do the required in-core inode clear. */
++ clear_inode(inode);
++ else
++ ext3_free_inode(handle, inode);
++ ext3_journal_stop(handle, inode);
++ unlock_kernel();
++ return;
++no_delete:
++ clear_inode(inode); /* We must guarantee clearing of inode... */
++}
++
++void ext3_discard_prealloc (struct inode * inode)
++{
++#ifdef EXT3_PREALLOCATE
++ lock_kernel();
++ /* Writer: ->i_prealloc* */
++ if (inode->u.ext3_i.i_prealloc_count) {
++ unsigned short total = inode->u.ext3_i.i_prealloc_count;
++ unsigned long block = inode->u.ext3_i.i_prealloc_block;
++ inode->u.ext3_i.i_prealloc_count = 0;
++ inode->u.ext3_i.i_prealloc_block = 0;
++ /* Writer: end */
++ ext3_free_blocks (inode, block, total);
++ }
++ unlock_kernel();
++#endif
++}
++
++static int ext3_alloc_block (handle_t *handle,
++ struct inode * inode, unsigned long goal, int *err)
++{
++#ifdef EXT3FS_DEBUG
++ static unsigned long alloc_hits = 0, alloc_attempts = 0;
++#endif
++ unsigned long result;
++
++#ifdef EXT3_PREALLOCATE
++ /* Writer: ->i_prealloc* */
++ if (inode->u.ext3_i.i_prealloc_count &&
++ (goal == inode->u.ext3_i.i_prealloc_block ||
++ goal + 1 == inode->u.ext3_i.i_prealloc_block))
++ {
++ result = inode->u.ext3_i.i_prealloc_block++;
++ inode->u.ext3_i.i_prealloc_count--;
++ /* Writer: end */
++ ext3_debug ("preallocation hit (%lu/%lu).\n",
++ ++alloc_hits, ++alloc_attempts);
++ } else {
++ ext3_discard_prealloc (inode);
++ ext3_debug ("preallocation miss (%lu/%lu).\n",
++ alloc_hits, ++alloc_attempts);
++ if (S_ISREG(inode->i_mode))
++ result = ext3_new_block (inode, goal,
++ &inode->u.ext3_i.i_prealloc_count,
++ &inode->u.ext3_i.i_prealloc_block, err);
++ else
++ result = ext3_new_block (inode, goal, 0, 0, err);
++ /*
++ * AKPM: this is somewhat sticky. I'm not surprised it was
++ * disabled in 2.2's ext3. Need to integrate b_committed_data
++ * guarding with preallocation, if indeed preallocation is
++ * effective.
++ */
++ }
++#else
++ result = ext3_new_block (handle, inode, goal, 0, 0, err);
++#endif
++ return result;
++}
++
++
++typedef struct {
++ u32 *p;
++ u32 key;
++ struct buffer_head *bh;
++} Indirect;
++
++static inline void add_chain(Indirect *p, struct buffer_head *bh, u32 *v)
++{
++ p->key = *(p->p = v);
++ p->bh = bh;
++}
++
++static inline int verify_chain(Indirect *from, Indirect *to)
++{
++ while (from <= to && from->key == *from->p)
++ from++;
++ return (from > to);
++}
++
++/**
++ * ext3_block_to_path - parse the block number into array of offsets
++ * @inode: inode in question (we are only interested in its superblock)
++ * @i_block: block number to be parsed
++ * @offsets: array to store the offsets in
++ *
++ * To store the locations of a file's data ext3 uses a data structure
++ * common to UNIX filesystems - a tree of pointers anchored in the
++ * inode, with data blocks at leaves and indirect blocks in intermediate
++ * nodes. This function translates the block number into a path in that
++ * tree - the return value is the path length and @offsets[n] is the
++ * offset of the pointer to the (n+1)th node in the nth one. If @block
++ * is out of range (negative or too large) a warning is printed and zero
++ * is returned.
++ *
++ * Note: function doesn't find node addresses, so no IO is needed. All
++ * we need to know is the capacity of indirect blocks (taken from the
++ * inode->i_sb).
++ */
++
++/*
++ * Portability note: the last comparison (check that we fit into triple
++ * indirect block) is spelled differently, because otherwise on an
++ * architecture with 32-bit longs and 8Kb pages we might get into trouble
++ * if our filesystem had 8Kb blocks. We might use long long, but that would
++ * kill us on x86. Oh, well, at least the sign propagation does not matter -
++ * i_block would have to be negative in the very beginning, so we would not
++ * get there at all.
++ */
++
++static int ext3_block_to_path(struct inode *inode, long i_block, int offsets[4])
++{
++ int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb);
++ int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb);
++ const long direct_blocks = EXT3_NDIR_BLOCKS,
++ indirect_blocks = ptrs,
++ double_blocks = (1 << (ptrs_bits * 2));
++ int n = 0;
++
++ if (i_block < 0) {
++ ext3_warning (inode->i_sb, "ext3_block_to_path", "block < 0");
++ } else if (i_block < direct_blocks) {
++ offsets[n++] = i_block;
++ } else if ( (i_block -= direct_blocks) < indirect_blocks) {
++ offsets[n++] = EXT3_IND_BLOCK;
++ offsets[n++] = i_block;
++ } else if ((i_block -= indirect_blocks) < double_blocks) {
++ offsets[n++] = EXT3_DIND_BLOCK;
++ offsets[n++] = i_block >> ptrs_bits;
++ offsets[n++] = i_block & (ptrs - 1);
++ } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
++ offsets[n++] = EXT3_TIND_BLOCK;
++ offsets[n++] = i_block >> (ptrs_bits * 2);
++ offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
++ offsets[n++] = i_block & (ptrs - 1);
++ } else {
++ ext3_warning (inode->i_sb, "ext3_block_to_path", "block > big");
++ }
++ return n;
++}
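++
++/*
++ * Worked example, not part of the original patch: with 4KB blocks,
++ * EXT3_ADDR_PER_BLOCK is 1024 (so ptrs_bits == 10) and
++ * EXT3_NDIR_BLOCKS is 12, so ext3_block_to_path() decomposes i_block
++ * as follows:
++ *
++ *   i_block 11      -> { 11 }                         depth 1 (direct)
++ *   i_block 12      -> { EXT3_IND_BLOCK, 0 }          depth 2
++ *   i_block 1036    -> { EXT3_DIND_BLOCK, 0, 0 }      depth 3
++ *   i_block 1049612 -> { EXT3_TIND_BLOCK, 0, 0, 0 }   depth 4
++ */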
++
++/**
++ * ext3_get_branch - read the chain of indirect blocks leading to data
++ * @inode: inode in question
++ * @depth: depth of the chain (1 - direct pointer, etc.)
++ * @offsets: offsets of pointers in inode/indirect blocks
++ * @chain: place to store the result
++ * @err: here we store the error value
++ *
++ * Function fills the array of triples <key, p, bh> and returns %NULL
++ * if everything went OK or the pointer to the last filled triple
++ * (incomplete one) otherwise. Upon the return chain[i].key contains
++ * the number of (i+1)-th block in the chain (as it is stored in memory,
++ * i.e. little-endian 32-bit), chain[i].p contains the address of that
++ * number (it points into struct inode for i==0 and into the bh->b_data
++ * for i>0) and chain[i].bh points to the buffer_head of i-th indirect
++ * block for i>0 and NULL for i==0. In other words, it holds the block
++ * numbers of the chain, addresses they were taken from (and where we can
++ * verify that chain did not change) and buffer_heads hosting these
++ * numbers.
++ *
++ * Function stops when it stumbles upon zero pointer (absent block)
++ * (pointer to last triple returned, *@err == 0)
++ * or when it gets an IO error reading an indirect block
++ * (ditto, *@err == -EIO)
++ * or when it notices that chain had been changed while it was reading
++ * (ditto, *@err == -EAGAIN)
++ * or when it reads all @depth-1 indirect blocks successfully and finds
++ * the whole chain, all way to the data (returns %NULL, *err == 0).
++ */
++static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets,
++ Indirect chain[4], int *err)
++{
++ struct super_block *sb = inode->i_sb;
++ Indirect *p = chain;
++ struct buffer_head *bh;
++
++ *err = 0;
++ /* i_data is not going away, no lock needed */
++ add_chain (chain, NULL, inode->u.ext3_i.i_data + *offsets);
++ if (!p->key)
++ goto no_block;
++ while (--depth) {
++ bh = sb_bread(sb, le32_to_cpu(p->key));
++ if (!bh)
++ goto failure;
++ /* Reader: pointers */
++ if (!verify_chain(chain, p))
++ goto changed;
++ add_chain(++p, bh, (u32*)bh->b_data + *++offsets);
++ /* Reader: end */
++ if (!p->key)
++ goto no_block;
++ }
++ return NULL;
++
++changed:
++ *err = -EAGAIN;
++ goto no_block;
++failure:
++ *err = -EIO;
++no_block:
++ return p;
++}
++
++/**
++ * ext3_find_near - find a place for allocation with sufficient locality
++ * @inode: owner
++ * @ind: descriptor of indirect block.
++ *
++ * This function returns the preferred place for block allocation.
++ * It is used when the heuristic for sequential allocation fails.
++ * Rules are:
++ * + if there is a block to the left of our position - allocate near it.
++ * + if pointer will live in indirect block - allocate near that block.
++ * + if pointer will live in inode - allocate in the same
++ * cylinder group.
++ * Caller must make sure that @ind is valid and will stay that way.
++ */
++
++static inline unsigned long ext3_find_near(struct inode *inode, Indirect *ind)
++{
++ u32 *start = ind->bh ? (u32*) ind->bh->b_data : inode->u.ext3_i.i_data;
++ u32 *p;
++
++ /* Try to find previous block */
++ for (p = ind->p - 1; p >= start; p--)
++ if (*p)
++ return le32_to_cpu(*p);
++
++ /* No such thing, so let's try location of indirect block */
++ if (ind->bh)
++ return ind->bh->b_blocknr;
++
++ /*
++	 * Is it going to be referred to from the inode itself? OK, just
++	 * put it into the same cylinder group then.
++ */
++ return (inode->u.ext3_i.i_block_group *
++ EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
++ le32_to_cpu(inode->i_sb->u.ext3_sb.s_es->s_first_data_block);
++}
++
++/**
++ * ext3_find_goal - find a preferred place for allocation.
++ * @inode: owner
++ * @block: block we want
++ * @chain: chain of indirect blocks
++ * @partial: pointer to the last triple within a chain
++ * @goal: place to store the result.
++ *
++ * Normally this function finds the preferred place for block allocation,
++ * stores it in *@goal and returns zero. If the branch has been changed
++ * under us we return -EAGAIN.
++ */
++
++static int ext3_find_goal(struct inode *inode, long block, Indirect chain[4],
++ Indirect *partial, unsigned long *goal)
++{
++ /* Writer: ->i_next_alloc* */
++ if (block == inode->u.ext3_i.i_next_alloc_block + 1) {
++ inode->u.ext3_i.i_next_alloc_block++;
++ inode->u.ext3_i.i_next_alloc_goal++;
++ }
++#ifdef SEARCH_FROM_ZERO
++ inode->u.ext3_i.i_next_alloc_block = 0;
++ inode->u.ext3_i.i_next_alloc_goal = 0;
++#endif
++ /* Writer: end */
++ /* Reader: pointers, ->i_next_alloc* */
++ if (verify_chain(chain, partial)) {
++ /*
++ * try the heuristic for sequential allocation,
++ * failing that at least try to get decent locality.
++ */
++ if (block == inode->u.ext3_i.i_next_alloc_block)
++ *goal = inode->u.ext3_i.i_next_alloc_goal;
++ if (!*goal)
++ *goal = ext3_find_near(inode, partial);
++#ifdef SEARCH_FROM_ZERO
++ *goal = 0;
++#endif
++ return 0;
++ }
++ /* Reader: end */
++ return -EAGAIN;
++}
++
++/**
++ * ext3_alloc_branch - allocate and set up a chain of blocks.
++ * @inode: owner
++ * @num: depth of the chain (number of blocks to allocate)
++ * @offsets: offsets (in the blocks) to store the pointers to next.
++ * @branch: place to store the chain in.
++ *
++ * This function allocates @num blocks, zeroes out all but the last one,
++ * links them into chain and (if we are synchronous) writes them to disk.
++ * In other words, it prepares a branch that can be spliced onto the
++ * inode. It stores the information about that chain in the branch[], in
++ * the same format as ext3_get_branch() would do. We are calling it after
++ * we had read the existing part of the chain and partial points to the last
++ * triple of that (one with zero ->key). Upon the exit we have the same
++ * picture as after the successful ext3_get_block(), except that in one
++ * place chain is disconnected - *branch->p is still zero (we did not
++ * set the last link), but branch->key contains the number that should
++ * be placed into *branch->p to fill that gap.
++ *
++ * If allocation fails we free all blocks we've allocated (and forget
++ * their buffer_heads) and return the error value from the failed
++ * ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain
++ * as described above and return 0.
++ */
++
++static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
++ int num,
++ unsigned long goal,
++ int *offsets,
++ Indirect *branch)
++{
++ int blocksize = inode->i_sb->s_blocksize;
++ int n = 0, keys = 0;
++ int err = 0;
++ int i;
++ int parent = ext3_alloc_block(handle, inode, goal, &err);
++
++ branch[0].key = cpu_to_le32(parent);
++ if (parent) {
++ for (n = 1; n < num; n++) {
++ struct buffer_head *bh;
++ /* Allocate the next block */
++ int nr = ext3_alloc_block(handle, inode, parent, &err);
++ if (!nr)
++ break;
++ branch[n].key = cpu_to_le32(nr);
++ keys = n+1;
++
++ /*
++ * Get buffer_head for parent block, zero it out
++ * and set the pointer to new one, then send
++ * parent to disk.
++ */
++ bh = sb_getblk(inode->i_sb, parent);
++ branch[n].bh = bh;
++ lock_buffer(bh);
++ BUFFER_TRACE(bh, "call get_create_access");
++ err = ext3_journal_get_create_access(handle, bh);
++ if (err) {
++ unlock_buffer(bh);
++ brelse(bh);
++ break;
++ }
++
++ memset(bh->b_data, 0, blocksize);
++ branch[n].p = (u32*) bh->b_data + offsets[n];
++ *branch[n].p = branch[n].key;
++ BUFFER_TRACE(bh, "marking uptodate");
++ mark_buffer_uptodate(bh, 1);
++ unlock_buffer(bh);
++
++ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
++ err = ext3_journal_dirty_metadata(handle, bh);
++ if (err)
++ break;
++
++ parent = nr;
++ }
++ }
++ if (n == num)
++ return 0;
++
++ /* Allocation failed, free what we already allocated */
++ for (i = 1; i < keys; i++) {
++ BUFFER_TRACE(branch[i].bh, "call journal_forget");
++ ext3_journal_forget(handle, branch[i].bh);
++ }
++ for (i = 0; i < keys; i++)
++ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1);
++ return err;
++}
++
++/**
++ * ext3_splice_branch - splice the allocated branch onto inode.
++ * @inode: owner
++ * @block: (logical) number of block we are adding
++ * @chain: chain of indirect blocks (with a missing link - see
++ * ext3_alloc_branch)
++ * @where: location of missing link
++ * @num: number of blocks we are adding
++ *
++ * This function verifies that chain (up to the missing link) had not
++ * changed, fills the missing link and does all housekeeping needed in
++ * inode (->i_blocks, etc.). In case of success we end up with the full
++ * chain to new block and return 0. Otherwise (== chain had been changed)
++ * we free the new blocks (forgetting their buffer_heads, indeed) and
++ * return -EAGAIN.
++ */
++
++static int ext3_splice_branch(handle_t *handle, struct inode *inode, long block,
++ Indirect chain[4], Indirect *where, int num)
++{
++ int i;
++ int err = 0;
++
++ /*
++ * If we're splicing into a [td]indirect block (as opposed to the
++ * inode) then we need to get write access to the [td]indirect block
++ * before the splice.
++ */
++ if (where->bh) {
++ BUFFER_TRACE(where->bh, "get_write_access");
++ err = ext3_journal_get_write_access(handle, where->bh);
++ if (err)
++ goto err_out;
++ }
++ /* Verify that place we are splicing to is still there and vacant */
++
++ /* Writer: pointers, ->i_next_alloc* */
++ if (!verify_chain(chain, where-1) || *where->p)
++ /* Writer: end */
++ goto changed;
++
++ /* That's it */
++
++ *where->p = where->key;
++ inode->u.ext3_i.i_next_alloc_block = block;
++ inode->u.ext3_i.i_next_alloc_goal = le32_to_cpu(where[num-1].key);
++#ifdef SEARCH_FROM_ZERO
++ inode->u.ext3_i.i_next_alloc_block = 0;
++ inode->u.ext3_i.i_next_alloc_goal = 0;
++#endif
++ /* Writer: end */
++
++ /* We are done with atomic stuff, now do the rest of housekeeping */
++
++ inode->i_ctime = CURRENT_TIME;
++ ext3_mark_inode_dirty(handle, inode);
++
++ /* had we spliced it onto indirect block? */
++ if (where->bh) {
++ /*
++ * akpm: If we spliced it onto an indirect block, we haven't
++ * altered the inode. Note however that if it is being spliced
++ * onto an indirect block at the very end of the file (the
++ * file is growing) then we *will* alter the inode to reflect
++ * the new i_size. But that is not done here - it is done in
++ * generic_commit_write->__mark_inode_dirty->ext3_dirty_inode.
++ */
++ jbd_debug(5, "splicing indirect only\n");
++ BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata");
++ err = ext3_journal_dirty_metadata(handle, where->bh);
++ if (err)
++ goto err_out;
++ } else {
++ /*
++ * OK, we spliced it into the inode itself on a direct block.
++ * Inode was dirtied above.
++ */
++ jbd_debug(5, "splicing direct\n");
++ }
++ return err;
++
++changed:
++ /*
++ * AKPM: if where[i].bh isn't part of the current updating
++ * transaction then we explode nastily. Test this code path.
++ */
++ jbd_debug(1, "the chain changed: try again\n");
++ err = -EAGAIN;
++
++err_out:
++ for (i = 1; i < num; i++) {
++ BUFFER_TRACE(where[i].bh, "call journal_forget");
++ ext3_journal_forget(handle, where[i].bh);
++ }
++ /* For the normal collision cleanup case, we free up the blocks.
++ * On genuine filesystem errors we don't even think about doing
++ * that. */
++ if (err == -EAGAIN)
++ for (i = 0; i < num; i++)
++ ext3_free_blocks(handle, inode,
++ le32_to_cpu(where[i].key), 1);
++ return err;
++}
++
++/*
++ * Allocation strategy is simple: if we have to allocate something, we will
++ * have to go the whole way to leaf. So let's do it before attaching anything
++ * to tree, set linkage between the newborn blocks, write them if sync is
++ * required, recheck the path, free and repeat if check fails, otherwise
++ * set the last missing link (that will protect us from any truncate-generated
++ * removals - all blocks on the path are immune now) and possibly force the
++ * write on the parent block.
++ * That has a nice additional property: no special recovery from the failed
++ * allocations is needed - we simply release blocks and do not touch anything
++ * reachable from inode.
++ *
++ * akpm: `handle' can be NULL if create == 0.
++ *
++ * The BKL may not be held on entry here. Be sure to take it early.
++ */
++
++static int ext3_get_block_handle(handle_t *handle, struct inode *inode,
++ long iblock,
++ struct buffer_head *bh_result, int create)
++{
++ int err = -EIO;
++ int offsets[4];
++ Indirect chain[4];
++ Indirect *partial;
++ unsigned long goal;
++ int left;
++ int depth = ext3_block_to_path(inode, iblock, offsets);
++ loff_t new_size;
++
++ J_ASSERT(handle != NULL || create == 0);
++
++ if (depth == 0)
++ goto out;
++
++ lock_kernel();
++reread:
++ partial = ext3_get_branch(inode, depth, offsets, chain, &err);
++
++ /* Simplest case - block found, no allocation needed */
++ if (!partial) {
++ bh_result->b_state &= ~(1UL << BH_New);
++got_it:
++ bh_result->b_dev = inode->i_dev;
++ bh_result->b_blocknr = le32_to_cpu(chain[depth-1].key);
++ bh_result->b_state |= (1UL << BH_Mapped);
++ /* Clean up and exit */
++ partial = chain+depth-1; /* the whole chain */
++ goto cleanup;
++ }
++
++ /* Next simple case - plain lookup or failed read of indirect block */
++ if (!create || err == -EIO) {
++cleanup:
++ while (partial > chain) {
++ BUFFER_TRACE(partial->bh, "call brelse");
++ brelse(partial->bh);
++ partial--;
++ }
++ BUFFER_TRACE(bh_result, "returned");
++ unlock_kernel();
++out:
++ return err;
++ }
++
++ /*
++ * Indirect block might be removed by truncate while we were
++ * reading it. Handling of that case (forget what we've got and
++ * reread) is taken out of the main path.
++ */
++ if (err == -EAGAIN)
++ goto changed;
++
++ if (ext3_find_goal(inode, iblock, chain, partial, &goal) < 0)
++ goto changed;
++
++ left = (chain + depth) - partial;
++
++ /*
++ * Block out ext3_truncate while we alter the tree
++ */
++ down_read(&inode->u.ext3_i.truncate_sem);
++ err = ext3_alloc_branch(handle, inode, left, goal,
++ offsets+(partial-chain), partial);
++
++ /* The ext3_splice_branch call will free and forget any buffers
++ * on the new chain if there is a failure, but that risks using
++ * up transaction credits, especially for bitmaps where the
++ * credits cannot be returned. Can we handle this somehow? We
++ * may need to return -EAGAIN upwards in the worst case. --sct */
++ if (!err)
++ err = ext3_splice_branch(handle, inode, iblock, chain,
++ partial, left);
++ up_read(&inode->u.ext3_i.truncate_sem);
++ if (err == -EAGAIN)
++ goto changed;
++ if (err)
++ goto cleanup;
++
++ new_size = inode->i_size;
++ /*
++ * This is not racy against ext3_truncate's modification of i_disksize
++ * because VM/VFS ensures that the file cannot be extended while
++ * truncate is in progress. It is racy between multiple parallel
++ * instances of get_block, but we have the BKL.
++ */
++ if (new_size > inode->u.ext3_i.i_disksize)
++ inode->u.ext3_i.i_disksize = new_size;
++
++ bh_result->b_state |= (1UL << BH_New);
++ goto got_it;
++
++changed:
++ while (partial > chain) {
++ jbd_debug(1, "buffer chain changed, retrying\n");
++ BUFFER_TRACE(partial->bh, "brelsing");
++ brelse(partial->bh);
++ partial--;
++ }
++ goto reread;
++}
++
++/*
++ * The BKL is not held on entry here.
++ */
++static int ext3_get_block(struct inode *inode, long iblock,
++ struct buffer_head *bh_result, int create)
++{
++ handle_t *handle = 0;
++ int ret;
++
++ if (create) {
++ handle = ext3_journal_current_handle();
++ J_ASSERT(handle != 0);
++ }
++ ret = ext3_get_block_handle(handle, inode, iblock, bh_result, create);
++ return ret;
++}
++
++/*
++ * `handle' can be NULL if create is zero
++ */
++struct buffer_head *ext3_getblk(handle_t *handle, struct inode * inode,
++ long block, int create, int * errp)
++{
++ struct buffer_head dummy;
++ int fatal = 0, err;
++
++ J_ASSERT(handle != NULL || create == 0);
++
++ dummy.b_state = 0;
++ dummy.b_blocknr = -1000;
++ buffer_trace_init(&dummy.b_history);
++ *errp = ext3_get_block_handle(handle, inode, block, &dummy, create);
++ if (!*errp && buffer_mapped(&dummy)) {
++ struct buffer_head *bh;
++ bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
++ if (buffer_new(&dummy)) {
++ J_ASSERT(create != 0);
++ J_ASSERT(handle != 0);
++
++ /* Now that we do not always journal data, we
++ should keep in mind whether this should
++ always journal the new buffer as metadata.
++ For now, regular file writes use
++ ext3_get_block instead, so it's not a
++ problem. */
++ lock_kernel();
++ lock_buffer(bh);
++ BUFFER_TRACE(bh, "call get_create_access");
++ fatal = ext3_journal_get_create_access(handle, bh);
++ if (!fatal) {
++ memset(bh->b_data, 0,
++ inode->i_sb->s_blocksize);
++ mark_buffer_uptodate(bh, 1);
++ }
++ unlock_buffer(bh);
++ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
++ err = ext3_journal_dirty_metadata(handle, bh);
++ if (!fatal) fatal = err;
++ unlock_kernel();
++ } else {
++ BUFFER_TRACE(bh, "not a new buffer");
++ }
++ if (fatal) {
++ *errp = fatal;
++ brelse(bh);
++ bh = NULL;
++ }
++ return bh;
++ }
++ return NULL;
++}
++
++struct buffer_head *ext3_bread(handle_t *handle, struct inode * inode,
++ int block, int create, int *err)
++{
++ struct buffer_head * bh;
++ int prev_blocks;
++
++ prev_blocks = inode->i_blocks;
++
++ bh = ext3_getblk (handle, inode, block, create, err);
++ if (!bh)
++ return bh;
++#ifdef EXT3_PREALLOCATE
++ /*
++ * If the inode has grown, and this is a directory, then use a few
++ * more of the preallocated blocks to keep directory fragmentation
++ * down. The preallocated blocks are guaranteed to be contiguous.
++ */
++ if (create &&
++ S_ISDIR(inode->i_mode) &&
++ inode->i_blocks > prev_blocks &&
++ EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
++ EXT3_FEATURE_COMPAT_DIR_PREALLOC)) {
++ int i;
++ struct buffer_head *tmp_bh;
++
++ for (i = 1;
++ inode->u.ext3_i.i_prealloc_count &&
++ i < EXT3_SB(inode->i_sb)->s_es->s_prealloc_dir_blocks;
++ i++) {
++ /*
++ * ext3_getblk will zero out the contents of the
++ * directory for us
++ */
++ tmp_bh = ext3_getblk(handle, inode,
++ block+i, create, err);
++ if (!tmp_bh) {
++ brelse (bh);
++ return 0;
++ }
++ brelse (tmp_bh);
++ }
++ }
++#endif
++ if (buffer_uptodate(bh))
++ return bh;
++ ll_rw_block (READ, 1, &bh);
++ wait_on_buffer (bh);
++ if (buffer_uptodate(bh))
++ return bh;
++ brelse (bh);
++ *err = -EIO;
++ return NULL;
++}
++
++static int walk_page_buffers( handle_t *handle,
++ struct buffer_head *head,
++ unsigned from,
++ unsigned to,
++ int *partial,
++ int (*fn)( handle_t *handle,
++ struct buffer_head *bh))
++{
++ struct buffer_head *bh;
++ unsigned block_start, block_end;
++ unsigned blocksize = head->b_size;
++ int err, ret = 0;
++
++ for ( bh = head, block_start = 0;
++ ret == 0 && (bh != head || !block_start);
++ block_start = block_end, bh = bh->b_this_page)
++ {
++ block_end = block_start + blocksize;
++ if (block_end <= from || block_start >= to) {
++ if (partial && !buffer_uptodate(bh))
++ *partial = 1;
++ continue;
++ }
++ err = (*fn)(handle, bh);
++ if (!ret)
++ ret = err;
++ }
++ return ret;
++}
++
++/*
++ * To preserve ordering, it is essential that the hole instantiation and
++ * the data write be encapsulated in a single transaction. We cannot
++ * close off a transaction and start a new one between the ext3_get_block()
++ * and the commit_write(). So doing the journal_start at the start of
++ * prepare_write() is the right place.
++ *
++ * Also, this function can nest inside ext3_writepage() ->
++ * block_write_full_page(). In that case, we *know* that ext3_writepage()
++ * has generated enough buffer credits to do the whole page. So we won't
++ * block on the journal in that case, which is good, because the caller may
++ * be PF_MEMALLOC.
++ *
++ * By accident, ext3 can be reentered when a transaction is open via
++ * quota file writes. If we were to commit the transaction while thus
++ * reentered, there can be a deadlock - we would be holding a quota
++ * lock, and the commit would never complete if another thread had a
++ * transaction open and was blocking on the quota lock - a ranking
++ * violation.
++ *
++ * So what we do is to rely on the fact that journal_stop/journal_start
++ * will _not_ run commit under these circumstances because handle->h_ref
++ * is elevated. We'll still have enough credits for the tiny quotafile
++ * write.
++ */
++
++static int do_journal_get_write_access(handle_t *handle,
++ struct buffer_head *bh)
++{
++ return ext3_journal_get_write_access(handle, bh);
++}
++
++static int ext3_prepare_write(struct file *file, struct page *page,
++ unsigned from, unsigned to)
++{
++ struct inode *inode = page->mapping->host;
++ int ret, needed_blocks = ext3_writepage_trans_blocks(inode);
++ handle_t *handle;
++
++ lock_kernel();
++ handle = ext3_journal_start(inode, needed_blocks);
++ if (IS_ERR(handle)) {
++ ret = PTR_ERR(handle);
++ goto out;
++ }
++ unlock_kernel();
++ ret = block_prepare_write(page, from, to, ext3_get_block);
++ lock_kernel();
++ if (ret != 0)
++ goto prepare_write_failed;
++
++ if (ext3_should_journal_data(inode)) {
++ ret = walk_page_buffers(handle, page->buffers,
++ from, to, NULL, do_journal_get_write_access);
++ if (ret) {
++ /*
++ * We're going to fail this prepare_write(),
++ * so commit_write() will not be called.
++ * We need to undo block_prepare_write()'s kmap().
++ * AKPM: Do we need to clear PageUptodate? I don't
++ * think so.
++ */
++ kunmap(page);
++ }
++ }
++prepare_write_failed:
++ if (ret)
++ ext3_journal_stop(handle, inode);
++out:
++ unlock_kernel();
++ return ret;
++}
++
++static int journal_dirty_sync_data(handle_t *handle, struct buffer_head *bh)
++{
++ return ext3_journal_dirty_data(handle, bh, 0);
++}
++
++/*
++ * For ext3_writepage(). We also brelse() the buffer to account for
++ * the bget() which ext3_writepage() performs.
++ */
++static int journal_dirty_async_data(handle_t *handle, struct buffer_head *bh)
++{
++ int ret = ext3_journal_dirty_data(handle, bh, 1);
++ __brelse(bh);
++ return ret;
++}
++
++/* For commit_write() in data=journal mode */
++static int commit_write_fn(handle_t *handle, struct buffer_head *bh)
++{
++ set_bit(BH_Uptodate, &bh->b_state);
++ return ext3_journal_dirty_metadata(handle, bh);
++}
++
++/*
++ * We need to pick up the new inode size which generic_commit_write gave us.
++ * `file' can be NULL - eg, when called from block_symlink().
++ *
++ * ext3 inode->i_dirty_buffers policy: If we're journalling data we
++ * definitely don't want them to appear on the inode at all - instead
++ * we need to manage them at the JBD layer and we need to intercept
++ * the relevant sync operations and translate them into journal operations.
++ *
++ * If we're not journalling data then we can just leave the buffers
++ * on ->i_dirty_buffers. If someone writes them out for us then thanks.
++ * Otherwise we'll do it in commit, if we're using ordered data.
++ */
++
++static int ext3_commit_write(struct file *file, struct page *page,
++ unsigned from, unsigned to)
++{
++ handle_t *handle = ext3_journal_current_handle();
++ struct inode *inode = page->mapping->host;
++ int ret = 0, ret2;
++
++ lock_kernel();
++ if (ext3_should_journal_data(inode)) {
++ /*
++ * Here we duplicate the generic_commit_write() functionality
++ */
++ int partial = 0;
++ loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
++
++ ret = walk_page_buffers(handle, page->buffers,
++ from, to, &partial, commit_write_fn);
++ if (!partial)
++ SetPageUptodate(page);
++ kunmap(page);
++ if (pos > inode->i_size)
++ inode->i_size = pos;
++ EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
++ } else {
++ if (ext3_should_order_data(inode)) {
++ ret = walk_page_buffers(handle, page->buffers,
++ from, to, NULL, journal_dirty_sync_data);
++ }
++ /* Be careful here if generic_commit_write becomes a
++ * required invocation after block_prepare_write. */
++ if (ret == 0) {
++ ret = generic_commit_write(file, page, from, to);
++ } else {
++ /*
++ * block_prepare_write() was called, but we're not
++ * going to call generic_commit_write(). So we
++ * need to perform generic_commit_write()'s kunmap
++ * by hand.
++ */
++ kunmap(page);
++ }
++ }
++ if (inode->i_size > inode->u.ext3_i.i_disksize) {
++ inode->u.ext3_i.i_disksize = inode->i_size;
++ ret2 = ext3_mark_inode_dirty(handle, inode);
++ if (!ret)
++ ret = ret2;
++ }
++ ret2 = ext3_journal_stop(handle, inode);
++ unlock_kernel();
++ if (!ret)
++ ret = ret2;
++ return ret;
++}
++
++/*
++ * bmap() is special. It gets used by applications such as lilo and by
++ * the swapper to find the on-disk block of a specific piece of data.
++ *
++ * Naturally, this is dangerous if the block concerned is still in the
++ * journal. If somebody makes a swapfile on an ext3 data-journaling
++ * filesystem and enables swap, then they may get a nasty shock when the
++ * data getting swapped to that swapfile suddenly gets overwritten by
++ * the original zeros written out previously to the journal and
++ * awaiting writeback in the kernel's buffer cache.
++ *
++ * So, if we see any bmap calls here on a modified, data-journaled file,
++ * take extra steps to flush any blocks which might be in the cache.
++ */
++static int ext3_bmap(struct address_space *mapping, long block)
++{
++ struct inode *inode = mapping->host;
++ journal_t *journal;
++ int err;
++
++ if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) {
++ /*
++ * This is a REALLY heavyweight approach, but the use of
++ * bmap on dirty files is expected to be extremely rare:
++ * only if we run lilo or swapon on a freshly made file
++ * do we expect this to happen.
++ *
++ * (bmap requires CAP_SYS_RAWIO so this does not
++ * represent an unprivileged user DOS attack --- we'd be
++ * in trouble if mortal users could trigger this path at
++ * will.)
++ *
++ * NB. EXT3_STATE_JDATA is not set on files other than
++ * regular files. If somebody wants to bmap a directory
++ * or symlink and gets confused because the buffer
++ * hasn't yet been flushed to disk, they deserve
++ * everything they get.
++ */
++
++ EXT3_I(inode)->i_state &= ~EXT3_STATE_JDATA;
++ journal = EXT3_JOURNAL(inode);
++ journal_lock_updates(journal);
++ err = journal_flush(journal);
++ journal_unlock_updates(journal);
++
++ if (err)
++ return 0;
++ }
++
++ return generic_block_bmap(mapping,block,ext3_get_block);
++}
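++
++#if 0
++/*
++ * Illustrative sketch only, not part of the original ext3 code: a
++ * minimal user-space example of how ->bmap is reached in practice.
++ * Tools such as lilo issue the FIBMAP ioctl (defined in <linux/fs.h>,
++ * requires CAP_SYS_RAWIO as noted above), and the VFS routes it to
++ * the mapping's bmap method -- ext3_bmap() here. The helper name is
++ * hypothetical, and a real build would be a user-space program with
++ * <sys/ioctl.h> and <linux/fs.h> included.
++ */
++static int example_fibmap(int fd, int logical_block)
++{
++	int blk = logical_block;	/* in: logical block, out: physical */
++
++	if (ioctl(fd, FIBMAP, &blk) < 0)
++		return -1;
++	return blk;
++}
++#endif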
++
++static int bget_one(handle_t *handle, struct buffer_head *bh)
++{
++ atomic_inc(&bh->b_count);
++ return 0;
++}
++
++/*
++ * Note that we always start a transaction even if we're not journalling
++ * data. This is to preserve ordering: any hole instantiation within
++ * __block_write_full_page -> ext3_get_block() should be journalled
++ * along with the data so we don't crash and then get metadata which
++ * refers to old data.
++ *
++ * In all journalling modes block_write_full_page() will start the I/O.
++ *
++ * Problem:
++ *
++ * ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
++ * ext3_writepage()
++ *
++ * Similar for:
++ *
++ * ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ...
++ *
++ * Same applies to ext3_get_block(). We will deadlock on various things like
++ * lock_journal and i_truncate_sem.
++ *
++ * Setting PF_MEMALLOC here doesn't work - too many internal memory
++ * allocations fail.
++ *
++ * 16May01: If we're reentered then journal_current_handle() will be
++ * non-zero. We simply *return*.
++ *
++ * 1 July 2001: @@@ FIXME:
++ * In journalled data mode, a data buffer may be metadata against the
++ * current transaction. But the same file is part of a shared mapping
++ * and someone does a writepage() on it.
++ *
++ * We will move the buffer onto the async_data list, but *after* it has
++ * been dirtied. So there's a small window where we have dirty data on
++ * BJ_Metadata.
++ *
++ * Note that this only applies to the last partial page in the file. The
++ * bit which block_write_full_page() uses prepare/commit for. (That's
++ * broken code anyway: it's wrong for msync()).
++ *
++ * It's a rare case: affects the final partial page, for journalled data
++ * where the file is subject to both write() and writepage() in the same
++ * transaction. To fix it we'll need a custom block_write_full_page().
++ * We'll probably need that anyway for journalling writepage() output.
++ *
++ * We don't honour synchronous mounts for writepage(). That would be
++ * disastrous. Any write() or metadata operation will sync the fs for
++ * us.
++ */
++static int ext3_writepage(struct page *page)
++{
++ struct inode *inode = page->mapping->host;
++ struct buffer_head *page_buffers;
++ handle_t *handle = NULL;
++ int ret = 0, err;
++ int needed;
++ int order_data;
++
++ J_ASSERT(PageLocked(page));
++
++ /*
++ * We give up here if we're reentered, because it might be
++ * for a different filesystem. One *could* look for a
++ * nested transaction opportunity.
++ */
++ lock_kernel();
++ if (ext3_journal_current_handle())
++ goto out_fail;
++
++ needed = ext3_writepage_trans_blocks(inode);
++ if (current->flags & PF_MEMALLOC)
++ handle = ext3_journal_try_start(inode, needed);
++ else
++ handle = ext3_journal_start(inode, needed);
++
++ if (IS_ERR(handle)) {
++ ret = PTR_ERR(handle);
++ goto out_fail;
++ }
++
++ order_data = ext3_should_order_data(inode) ||
++ ext3_should_journal_data(inode);
++
++ unlock_kernel();
++
++ page_buffers = NULL; /* Purely to prevent compiler warning */
++
++ /* bget() all the buffers */
++ if (order_data) {
++ if (!page->buffers)
++ create_empty_buffers(page,
++ inode->i_dev, inode->i_sb->s_blocksize);
++ page_buffers = page->buffers;
++ walk_page_buffers(handle, page_buffers, 0,
++ PAGE_CACHE_SIZE, NULL, bget_one);
++ }
++
++ ret = block_write_full_page(page, ext3_get_block);
++
++ /*
++ * The page can become unlocked at any point now, and
++ * truncate can then come in and change things. So we
++ * can't touch *page from now on. But *page_buffers is
++ * safe due to elevated refcount.
++ */
++
++ handle = ext3_journal_current_handle();
++ lock_kernel();
++
++ /* And attach them to the current transaction */
++ if (order_data) {
++ err = walk_page_buffers(handle, page_buffers,
++ 0, PAGE_CACHE_SIZE, NULL, journal_dirty_async_data);
++ if (!ret)
++ ret = err;
++ }
++
++ err = ext3_journal_stop(handle, inode);
++ if (!ret)
++ ret = err;
++ unlock_kernel();
++ return ret;
++
++out_fail:
++
++ unlock_kernel();
++ SetPageDirty(page);
++ UnlockPage(page);
++ return ret;
++}
++
++static int ext3_readpage(struct file *file, struct page *page)
++{
++ return block_read_full_page(page,ext3_get_block);
++}
++
++
++static int ext3_flushpage(struct page *page, unsigned long offset)
++{
++ journal_t *journal = EXT3_JOURNAL(page->mapping->host);
++ return journal_flushpage(journal, page, offset);
++}
++
++static int ext3_releasepage(struct page *page, int wait)
++{
++ journal_t *journal = EXT3_JOURNAL(page->mapping->host);
++ return journal_try_to_free_buffers(journal, page, wait);
++}
++
++
++struct address_space_operations ext3_aops = {
++ readpage: ext3_readpage, /* BKL not held. Don't need */
++ writepage: ext3_writepage, /* BKL not held. We take it */
++ sync_page: block_sync_page,
++ prepare_write: ext3_prepare_write, /* BKL not held. We take it */
++ commit_write: ext3_commit_write, /* BKL not held. We take it */
++ bmap: ext3_bmap, /* BKL held */
++ flushpage: ext3_flushpage, /* BKL not held. Don't need */
++ releasepage: ext3_releasepage, /* BKL not held. Don't need */
++};
++
++/*
++ * ext3_block_truncate_page() zeroes out a mapping from file offset `from'
++ * up to the end of the block which corresponds to `from'.
++ * This is required during truncate. We need to physically zero the tail end
++ * of that block so it doesn't yield old data if the file is later grown.
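++ *
++ * Example (illustration, not from the original comment): with 4096-byte
++ * blocks, truncating to size 5000 zeroes file bytes 5000..8191, i.e. the
++ * tail of the block containing offset 5000.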
++ */
++static int ext3_block_truncate_page(handle_t *handle,
++ struct address_space *mapping, loff_t from)
++{
++ unsigned long index = from >> PAGE_CACHE_SHIFT;
++ unsigned offset = from & (PAGE_CACHE_SIZE-1);
++ unsigned blocksize, iblock, length, pos;
++ struct inode *inode = mapping->host;
++ struct page *page;
++ struct buffer_head *bh;
++ int err;
++
++ blocksize = inode->i_sb->s_blocksize;
++ length = offset & (blocksize - 1);
++
++ /* Block boundary? Nothing to do */
++ if (!length)
++ return 0;
++
++ length = blocksize - length;
++ iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
++
++ page = grab_cache_page(mapping, index);
++ err = -ENOMEM;
++ if (!page)
++ goto out;
++
++ if (!page->buffers)
++ create_empty_buffers(page, inode->i_dev, blocksize);
++
++ /* Find the buffer that contains "offset" */
++ bh = page->buffers;
++ pos = blocksize;
++ while (offset >= pos) {
++ bh = bh->b_this_page;
++ iblock++;
++ pos += blocksize;
++ }
++
++ err = 0;
++ if (!buffer_mapped(bh)) {
++ /* Hole? Nothing to do */
++ if (buffer_uptodate(bh))
++ goto unlock;
++ ext3_get_block(inode, iblock, bh, 0);
++ /* Still unmapped? Nothing to do */
++ if (!buffer_mapped(bh))
++ goto unlock;
++ }
++
++ /* Ok, it's mapped. Make sure it's up-to-date */
++ if (Page_Uptodate(page))
++ set_bit(BH_Uptodate, &bh->b_state);
++
++ if (!buffer_uptodate(bh)) {
++ err = -EIO;
++ ll_rw_block(READ, 1, &bh);
++ wait_on_buffer(bh);
++ /* Uhhuh. Read error. Complain and punt. */
++ if (!buffer_uptodate(bh))
++ goto unlock;
++ }
++
++ if (ext3_should_journal_data(inode)) {
++ BUFFER_TRACE(bh, "get write access");
++ err = ext3_journal_get_write_access(handle, bh);
++ if (err)
++ goto unlock;
++ }
++
++ memset(kmap(page) + offset, 0, length);
++ flush_dcache_page(page);
++ kunmap(page);
++
++ BUFFER_TRACE(bh, "zeroed end of block");
++
++ err = 0;
++ if (ext3_should_journal_data(inode)) {
++ err = ext3_journal_dirty_metadata(handle, bh);
++ } else {
++ if (ext3_should_order_data(inode))
++ err = ext3_journal_dirty_data(handle, bh, 0);
++ __mark_buffer_dirty(bh);
++ }
++
++unlock:
++ UnlockPage(page);
++ page_cache_release(page);
++out:
++ return err;
++}
++
++/*
++ * Probably it should be a library function... search for first non-zero word
++ * or memcmp with zero_page, whatever is better for particular architecture.
++ * Linus?
++ */
++static inline int all_zeroes(u32 *p, u32 *q)
++{
++ while (p < q)
++ if (*p++)
++ return 0;
++ return 1;
++}
++
++/**
++ * ext3_find_shared - find the indirect blocks for partial truncation.
++ * @inode: inode in question
++ * @depth: depth of the affected branch
++ * @offsets: offsets of pointers in that branch (see ext3_block_to_path)
++ * @chain: place to store the pointers to partial indirect blocks
++ * @top: place to the (detached) top of branch
++ *
++ * This is a helper function used by ext3_truncate().
++ *
++ * When we do truncate() we may have to clean the ends of several
++ * indirect blocks but leave the blocks themselves alive. Block is
++ * partially truncated if some data below the new i_size is referenced
++ * from it (and it is on the path to the first completely truncated
++ * data block, indeed). We have to free the top of that path along
++ * with everything to the right of the path. Since no allocation
++ * past the truncation point is possible until ext3_truncate()
++ * finishes, we may safely do the latter, but top of branch may
++ * require special attention - pageout below the truncation point
++ * might try to populate it.
++ *
++ * We atomically detach the top of branch from the tree, store the
++ * block number of its root in *@top, pointers to buffer_heads of
++ * partially truncated blocks - in @chain[].bh and pointers to
++ * their last elements that should not be removed - in
++ * @chain[].p. Return value is the pointer to last filled element
++ * of @chain.
++ *
++ * It is left to the caller to do the actual freeing of subtrees:
++ * a) free the subtree starting from *@top
++ * b) free the subtrees whose roots are stored in
++ * (@chain[i].p+1 .. end of @chain[i].bh->b_data)
++ * c) free the subtrees growing from the inode past the @chain[0].
++ * (no partially truncated stuff there). */
++
++static Indirect *ext3_find_shared(struct inode *inode,
++ int depth,
++ int offsets[4],
++ Indirect chain[4],
++ u32 *top)
++{
++ Indirect *partial, *p;
++ int k, err;
++
++ *top = 0;
++	/* Make k index the deepest non-null offset + 1 */
++ for (k = depth; k > 1 && !offsets[k-1]; k--)
++ ;
++ partial = ext3_get_branch(inode, k, offsets, chain, &err);
++ /* Writer: pointers */
++ if (!partial)
++ partial = chain + k-1;
++ /*
++ * If the branch acquired continuation since we've looked at it -
++ * fine, it should all survive and (new) top doesn't belong to us.
++ */
++ if (!partial->key && *partial->p)
++ /* Writer: end */
++ goto no_top;
++ for (p=partial; p>chain && all_zeroes((u32*)p->bh->b_data,p->p); p--)
++ ;
++ /*
++ * OK, we've found the last block that must survive. The rest of our
++ * branch should be detached before unlocking. However, if that rest
++ * of branch is all ours and does not grow immediately from the inode
++ * it's easier to cheat and just decrement partial->p.
++ */
++ if (p == chain + k - 1 && p > chain) {
++ p->p--;
++ } else {
++ *top = *p->p;
++ /* Nope, don't do this in ext3. Must leave the tree intact */
++#if 0
++ *p->p = 0;
++#endif
++ }
++ /* Writer: end */
++
++ while(partial > p)
++ {
++ brelse(partial->bh);
++ partial--;
++ }
++no_top:
++ return partial;
++}
++
++/*
++ * Zero a number of block pointers in either an inode or an indirect block.
++ * If we restart the transaction we must again get write access to the
++ * indirect block for further modification.
++ *
++ * We release `count' blocks on disk, but (last - first) may be greater
++ * than `count' because there can be holes in there.
++ */
++static void
++ext3_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh,
++ unsigned long block_to_free, unsigned long count,
++ u32 *first, u32 *last)
++{
++ u32 *p;
++ if (try_to_extend_transaction(handle, inode)) {
++ if (bh) {
++ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
++ ext3_journal_dirty_metadata(handle, bh);
++ }
++ ext3_mark_inode_dirty(handle, inode);
++ ext3_journal_test_restart(handle, inode);
++ BUFFER_TRACE(bh, "get_write_access");
++ ext3_journal_get_write_access(handle, bh);
++ }
++
++ /*
++ * Any buffers which are on the journal will be in memory. We find
++ * them on the hash table so journal_revoke() will run journal_forget()
++ * on them. We've already detached each block from the file, so
++ * bforget() in journal_forget() should be safe.
++ *
++ * AKPM: turn on bforget in journal_forget()!!!
++ */
++ for (p = first; p < last; p++) {
++ u32 nr = le32_to_cpu(*p);
++ if (nr) {
++ struct buffer_head *bh;
++
++ *p = 0;
++ bh = sb_get_hash_table(inode->i_sb, nr);
++ ext3_forget(handle, 0, inode, bh, nr);
++ }
++ }
++
++ ext3_free_blocks(handle, inode, block_to_free, count);
++}
++
++/**
++ * ext3_free_data - free a list of data blocks
++ * @handle: handle for this transaction
++ * @inode: inode we are dealing with
++ * @this_bh: indirect buffer_head which contains *@first and *@last
++ * @first: array of block numbers
++ * @last: points immediately past the end of array
++ *
++ * We are freeing all blocks referenced from that array (numbers are stored as
++ * little-endian 32-bit) and updating @inode->i_blocks appropriately.
++ *
++ * We accumulate contiguous runs of blocks to free. Conveniently, if these
++ * blocks are contiguous then releasing them at one time will only affect one
++ * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
++ * actually use a lot of journal space.
++ *
++ * @this_bh will be %NULL if @first and @last point into the inode's direct
++ * block pointers.
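++ *
++ * Example (illustration, not from the original comment): for pointers
++ * {100, 101, 102, 200}, blocks 100..102 are accumulated and freed as one
++ * 3-block run, then 200 starts a run of its own.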
++ */
++static void ext3_free_data(handle_t *handle, struct inode *inode,
++ struct buffer_head *this_bh, u32 *first, u32 *last)
++{
++ unsigned long block_to_free = 0; /* Starting block # of a run */
++ unsigned long count = 0; /* Number of blocks in the run */
++ u32 *block_to_free_p = NULL; /* Pointer into inode/ind
++ corresponding to
++ block_to_free */
++ unsigned long nr; /* Current block # */
++ u32 *p; /* Pointer into inode/ind
++ for current block */
++ int err;
++
++ if (this_bh) { /* For indirect block */
++ BUFFER_TRACE(this_bh, "get_write_access");
++ err = ext3_journal_get_write_access(handle, this_bh);
++ /* Important: if we can't update the indirect pointers
++ * to the blocks, we can't free them. */
++ if (err)
++ return;
++ }
++
++ for (p = first; p < last; p++) {
++ nr = le32_to_cpu(*p);
++ if (nr) {
++ /* accumulate blocks to free if they're contiguous */
++ if (count == 0) {
++ block_to_free = nr;
++ block_to_free_p = p;
++ count = 1;
++ } else if (nr == block_to_free + count) {
++ count++;
++ } else {
++ ext3_clear_blocks(handle, inode, this_bh,
++ block_to_free,
++ count, block_to_free_p, p);
++ block_to_free = nr;
++ block_to_free_p = p;
++ count = 1;
++ }
++ }
++ }
++
++ if (count > 0)
++ ext3_clear_blocks(handle, inode, this_bh, block_to_free,
++ count, block_to_free_p, p);
++
++ if (this_bh) {
++ BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata");
++ ext3_journal_dirty_metadata(handle, this_bh);
++ }
++}
++
++/**
++ * ext3_free_branches - free an array of branches
++ * @handle: JBD handle for this transaction
++ * @inode: inode we are dealing with
++ * @parent_bh: the buffer_head which contains *@first and *@last
++ * @first: array of block numbers
++ * @last: pointer immediately past the end of array
++ * @depth: depth of the branches to free
++ *
++ * We are freeing all blocks referenced from these branches (numbers are
++ * stored as little-endian 32-bit) and updating @inode->i_blocks
++ * appropriately.
++ */
++static void ext3_free_branches(handle_t *handle, struct inode *inode,
++ struct buffer_head *parent_bh,
++ u32 *first, u32 *last, int depth)
++{
++ unsigned long nr;
++ u32 *p;
++
++ if (is_handle_aborted(handle))
++ return;
++
++ if (depth--) {
++ struct buffer_head *bh;
++ int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
++ p = last;
++ while (--p >= first) {
++ nr = le32_to_cpu(*p);
++ if (!nr)
++ continue; /* A hole */
++
++ /* Go read the buffer for the next level down */
++ bh = sb_bread(inode->i_sb, nr);
++
++ /*
++ * A read failure? Report error and clear slot
++ * (should be rare).
++ */
++ if (!bh) {
++ ext3_error(inode->i_sb, "ext3_free_branches",
++ "Read failure, inode=%ld, block=%ld",
++ inode->i_ino, nr);
++ continue;
++ }
++
++ /* This zaps the entire block. Bottom up. */
++ BUFFER_TRACE(bh, "free child branches");
++ ext3_free_branches(handle, inode, bh, (u32*)bh->b_data,
++ (u32*)bh->b_data + addr_per_block,
++ depth);
++
++ /*
++ * We've probably journalled the indirect block several
++ * times during the truncate. But it's no longer
++ * needed and we now drop it from the transaction via
++ * journal_revoke().
++ *
++ * That's easy if it's exclusively part of this
++ * transaction. But if it's part of the committing
++ * transaction then journal_forget() will simply
++ * brelse() it. That means that if the underlying
++ * block is reallocated in ext3_get_block(),
++ * unmap_underlying_metadata() will find this block
++ * and will try to get rid of it. damn, damn.
++ *
++ * If this block has already been committed to the
++ * journal, a revoke record will be written. And
++ * revoke records must be emitted *before* clearing
++ * this block's bit in the bitmaps.
++ */
++ ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
++
++ /*
++ * Everything below this pointer has been
++ * released. Now let this top-of-subtree go.
++ *
++ * We want the freeing of this indirect block to be
++ * atomic in the journal with the updating of the
++ * bitmap block which owns it. So make some room in
++ * the journal.
++ *
++ * We zero the parent pointer *after* freeing its
++ * pointee in the bitmaps, so if extend_transaction()
++ * for some reason fails to put the bitmap changes and
++ * the release into the same transaction, recovery
++ * will merely complain about releasing a free block,
++ * rather than leaking blocks.
++ */
++ if (is_handle_aborted(handle))
++ return;
++ if (try_to_extend_transaction(handle, inode)) {
++ ext3_mark_inode_dirty(handle, inode);
++ ext3_journal_test_restart(handle, inode);
++ }
++
++ ext3_free_blocks(handle, inode, nr, 1);
++
++ if (parent_bh) {
++ /*
++ * The block which we have just freed is
++ * pointed to by an indirect block: journal it
++ */
++ BUFFER_TRACE(parent_bh, "get_write_access");
++ if (!ext3_journal_get_write_access(handle,
++ parent_bh)){
++ *p = 0;
++ BUFFER_TRACE(parent_bh,
++ "call ext3_journal_dirty_metadata");
++ ext3_journal_dirty_metadata(handle,
++ parent_bh);
++ }
++ }
++ }
++ } else {
++ /* We have reached the bottom of the tree. */
++ BUFFER_TRACE(parent_bh, "free data blocks");
++ ext3_free_data(handle, inode, parent_bh, first, last);
++ }
++}
++
++/*
++ * ext3_truncate()
++ *
++ * We block out ext3_get_block() block instantiations across the entire
++ * transaction, and VFS/VM ensures that ext3_truncate() cannot run
++ * simultaneously on behalf of the same inode.
++ *
++ * As we work through the truncate and commit bits of it to the journal there
++ * is one core, guiding principle: the file's tree must always be consistent on
++ * disk. We must be able to restart the truncate after a crash.
++ *
++ * The file's tree may be transiently inconsistent in memory (although it
++ * probably isn't), but whenever we close off and commit a journal transaction,
++ * the contents of (the filesystem + the journal) must be consistent and
++ * restartable. It's pretty simple, really: bottom up, right to left (although
++ * left-to-right works OK too).
++ *
++ * Note that at recovery time, journal replay occurs *before* the restart of
++ * truncate against the orphan inode list.
++ *
++ * The committed inode has the new, desired i_size (which is the same as
++ * i_disksize in this case). After a crash, ext3_orphan_cleanup() will see
++ * that this inode's truncate did not complete and it will again call
++ * ext3_truncate() to have another go. So there will be instantiated blocks
++ * to the right of the truncation point in a crashed ext3 filesystem. But
++ * that's fine - as long as they are linked from the inode, the post-crash
++ * ext3_truncate() run will find them and release them.
++ */
++
++void ext3_truncate(struct inode * inode)
++{
++ handle_t *handle;
++ u32 *i_data = inode->u.ext3_i.i_data;
++ int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
++ int offsets[4];
++ Indirect chain[4];
++ Indirect *partial;
++ int nr = 0;
++ int n;
++ long last_block;
++ unsigned blocksize;
++
++ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
++ S_ISLNK(inode->i_mode)))
++ return;
++ if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
++ return;
++
++ ext3_discard_prealloc(inode);
++
++ handle = start_transaction(inode);
++ if (IS_ERR(handle))
++ return; /* AKPM: return what? */
++
++ blocksize = inode->i_sb->s_blocksize;
++ last_block = (inode->i_size + blocksize-1)
++ >> EXT3_BLOCK_SIZE_BITS(inode->i_sb);
++
++ ext3_block_truncate_page(handle, inode->i_mapping, inode->i_size);
++
++
++ n = ext3_block_to_path(inode, last_block, offsets);
++ if (n == 0)
++ goto out_stop; /* error */
++
++ /*
++ * OK. This truncate is going to happen. We add the inode to the
++ * orphan list, so that if this truncate spans multiple transactions,
++ * and we crash, we will resume the truncate when the filesystem
++ * recovers. It also marks the inode dirty, to catch the new size.
++ *
++ * Implication: the file must always be in a sane, consistent
++ * truncatable state while each transaction commits.
++ */
++ if (ext3_orphan_add(handle, inode))
++ goto out_stop;
++
++ /*
++ * The orphan list entry will now protect us from any crash which
++ * occurs before the truncate completes, so it is now safe to propagate
++ * the new, shorter inode size (held for now in i_size) into the
++ * on-disk inode. We do this via i_disksize, which is the value which
++ * ext3 *really* writes onto the disk inode.
++ */
++ inode->u.ext3_i.i_disksize = inode->i_size;
++
++ /*
++ * From here we block out all ext3_get_block() callers who want to
++ * modify the block allocation tree.
++ */
++ down_write(&inode->u.ext3_i.truncate_sem);
++
++ if (n == 1) { /* direct blocks */
++ ext3_free_data(handle, inode, NULL, i_data+offsets[0],
++ i_data + EXT3_NDIR_BLOCKS);
++ goto do_indirects;
++ }
++
++ partial = ext3_find_shared(inode, n, offsets, chain, &nr);
++ /* Kill the top of shared branch (not detached) */
++ if (nr) {
++ if (partial == chain) {
++ /* Shared branch grows from the inode */
++ ext3_free_branches(handle, inode, NULL,
++ &nr, &nr+1, (chain+n-1) - partial);
++ *partial->p = 0;
++ /*
++ * We mark the inode dirty prior to restart,
++ * and prior to stop. No need for it here.
++ */
++ } else {
++ /* Shared branch grows from an indirect block */
++ BUFFER_TRACE(partial->bh, "get_write_access");
++ ext3_free_branches(handle, inode, partial->bh,
++ partial->p,
++ partial->p+1, (chain+n-1) - partial);
++ }
++ }
++ /* Clear the ends of indirect blocks on the shared branch */
++ while (partial > chain) {
++ ext3_free_branches(handle, inode, partial->bh, partial->p + 1,
++ (u32*)partial->bh->b_data + addr_per_block,
++ (chain+n-1) - partial);
++ BUFFER_TRACE(partial->bh, "call brelse");
++ brelse (partial->bh);
++ partial--;
++ }
++do_indirects:
++ /* Kill the remaining (whole) subtrees */
++ switch (offsets[0]) {
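++	/* Note: no breaks -- each case deliberately falls through, so
++	 * freeing starts at the truncation point's subtree and proceeds
++	 * through the deeper (double, triple) indirect trees. */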
++ default:
++ nr = i_data[EXT3_IND_BLOCK];
++ if (nr) {
++ ext3_free_branches(handle, inode, NULL,
++ &nr, &nr+1, 1);
++ i_data[EXT3_IND_BLOCK] = 0;
++ }
++ case EXT3_IND_BLOCK:
++ nr = i_data[EXT3_DIND_BLOCK];
++ if (nr) {
++ ext3_free_branches(handle, inode, NULL,
++ &nr, &nr+1, 2);
++ i_data[EXT3_DIND_BLOCK] = 0;
++ }
++ case EXT3_DIND_BLOCK:
++ nr = i_data[EXT3_TIND_BLOCK];
++ if (nr) {
++ ext3_free_branches(handle, inode, NULL,
++ &nr, &nr+1, 3);
++ i_data[EXT3_TIND_BLOCK] = 0;
++ }
++ case EXT3_TIND_BLOCK:
++ ;
++ }
++ up_write(&inode->u.ext3_i.truncate_sem);
++ inode->i_mtime = inode->i_ctime = CURRENT_TIME;
++ ext3_mark_inode_dirty(handle, inode);
++
++ /* In a multi-transaction truncate, we only make the final
++ * transaction synchronous */
++ if (IS_SYNC(inode))
++ handle->h_sync = 1;
++out_stop:
++ /*
++ * If this was a simple ftruncate(), and the file will remain alive
++ * then we need to clear up the orphan record which we created above.
++ * However, if this was a real unlink then we were called by
++ * ext3_delete_inode(), and we allow that function to clean up the
++ * orphan info for us.
++ */
++ if (inode->i_nlink)
++ ext3_orphan_del(handle, inode);
++
++ ext3_journal_stop(handle, inode);
++}
++
++/*
++ * ext3_get_inode_loc returns with an extra refcount against the
++ * inode's underlying buffer_head on success.
++ */
++
++int ext3_get_inode_loc (struct inode *inode, struct ext3_iloc *iloc)
++{
++ struct buffer_head *bh = 0;
++ unsigned long block;
++ unsigned long block_group;
++ unsigned long group_desc;
++ unsigned long desc;
++ unsigned long offset;
++ struct ext3_group_desc * gdp;
++
++ if ((inode->i_ino != EXT3_ROOT_INO &&
++ inode->i_ino != EXT3_ACL_IDX_INO &&
++ inode->i_ino != EXT3_ACL_DATA_INO &&
++ inode->i_ino != EXT3_JOURNAL_INO &&
++ inode->i_ino < EXT3_FIRST_INO(inode->i_sb)) ||
++ inode->i_ino > le32_to_cpu(
++ inode->i_sb->u.ext3_sb.s_es->s_inodes_count)) {
++ ext3_error (inode->i_sb, "ext3_get_inode_loc",
++ "bad inode number: %lu", inode->i_ino);
++ goto bad_inode;
++ }
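++	/* ext3 inode numbers count from 1, hence the (i_ino - 1) below */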
++ block_group = (inode->i_ino - 1) / EXT3_INODES_PER_GROUP(inode->i_sb);
++ if (block_group >= inode->i_sb->u.ext3_sb.s_groups_count) {
++ ext3_error (inode->i_sb, "ext3_get_inode_loc",
++ "group >= groups count");
++ goto bad_inode;
++ }
++ group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(inode->i_sb);
++ desc = block_group & (EXT3_DESC_PER_BLOCK(inode->i_sb) - 1);
++ bh = inode->i_sb->u.ext3_sb.s_group_desc[group_desc];
++ if (!bh) {
++ ext3_error (inode->i_sb, "ext3_get_inode_loc",
++ "Descriptor not loaded");
++ goto bad_inode;
++ }
++
++ gdp = (struct ext3_group_desc *) bh->b_data;
++ /*
++ * Figure out the offset within the block group inode table
++ */
++ offset = ((inode->i_ino - 1) % EXT3_INODES_PER_GROUP(inode->i_sb)) *
++ EXT3_INODE_SIZE(inode->i_sb);
++ block = le32_to_cpu(gdp[desc].bg_inode_table) +
++ (offset >> EXT3_BLOCK_SIZE_BITS(inode->i_sb));
++ if (!(bh = sb_bread(inode->i_sb, block))) {
++ ext3_error (inode->i_sb, "ext3_get_inode_loc",
++ "unable to read inode block - "
++ "inode=%lu, block=%lu", inode->i_ino, block);
++ goto bad_inode;
++ }
++ offset &= (EXT3_BLOCK_SIZE(inode->i_sb) - 1);
++
++ iloc->bh = bh;
++ iloc->raw_inode = (struct ext3_inode *) (bh->b_data + offset);
++ iloc->block_group = block_group;
++
++ return 0;
++
++ bad_inode:
++ return -EIO;
++}
++
++void ext3_read_inode(struct inode * inode)
++{
++ struct ext3_iloc iloc;
++ struct ext3_inode *raw_inode;
++ struct buffer_head *bh;
++ int block;
++
++ if(ext3_get_inode_loc(inode, &iloc))
++ goto bad_inode;
++ bh = iloc.bh;
++ raw_inode = iloc.raw_inode;
++ init_rwsem(&inode->u.ext3_i.truncate_sem);
++ inode->i_mode = le16_to_cpu(raw_inode->i_mode);
++ inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
++ inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
++ if(!(test_opt (inode->i_sb, NO_UID32))) {
++ inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
++ inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
++ }
++ inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
++ inode->i_size = le32_to_cpu(raw_inode->i_size);
++ inode->i_atime = le32_to_cpu(raw_inode->i_atime);
++ inode->i_ctime = le32_to_cpu(raw_inode->i_ctime);
++ inode->i_mtime = le32_to_cpu(raw_inode->i_mtime);
++ inode->u.ext3_i.i_dtime = le32_to_cpu(raw_inode->i_dtime);
++ /* We now have enough fields to check if the inode was active or not.
++ * This is needed because nfsd might try to access dead inodes
++ * the test is the same one that e2fsck uses
++ * NeilBrown 1999oct15
++ */
++ if (inode->i_nlink == 0) {
++ if (inode->i_mode == 0 ||
++ !(inode->i_sb->u.ext3_sb.s_mount_state & EXT3_ORPHAN_FS)) {
++ /* this inode is deleted */
++ brelse (bh);
++ goto bad_inode;
++ }
++ /* The only unlinked inodes we let through here have
++ * valid i_mode and are being read by the orphan
++ * recovery code: that's fine, we're about to complete
++ * the process of deleting those. */
++ }
++ inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size
++ * (for stat), not the fs block
++ * size */
++ inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
++ inode->i_version = ++event;
++ inode->u.ext3_i.i_flags = le32_to_cpu(raw_inode->i_flags);
++#ifdef EXT3_FRAGMENTS
++ inode->u.ext3_i.i_faddr = le32_to_cpu(raw_inode->i_faddr);
++ inode->u.ext3_i.i_frag_no = raw_inode->i_frag;
++ inode->u.ext3_i.i_frag_size = raw_inode->i_fsize;
++#endif
++ inode->u.ext3_i.i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
++ if (!S_ISREG(inode->i_mode)) {
++ inode->u.ext3_i.i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
++ } else {
++ inode->i_size |=
++ ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
++ }
++ inode->u.ext3_i.i_disksize = inode->i_size;
++ inode->i_generation = le32_to_cpu(raw_inode->i_generation);
++#ifdef EXT3_PREALLOCATE
++ inode->u.ext3_i.i_prealloc_count = 0;
++#endif
++ inode->u.ext3_i.i_block_group = iloc.block_group;
++
++ /*
++ * NOTE! The in-memory inode i_data array is in little-endian order
++ * even on big-endian machines: we do NOT byteswap the block numbers!
++ */
++ for (block = 0; block < EXT3_N_BLOCKS; block++)
++ inode->u.ext3_i.i_data[block] = iloc.raw_inode->i_block[block];
++ INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan);
++
++ brelse (iloc.bh);
++
++ if (inode->i_ino == EXT3_ACL_IDX_INO ||
++ inode->i_ino == EXT3_ACL_DATA_INO)
++ /* Nothing to do */ ;
++ else if (S_ISREG(inode->i_mode)) {
++ inode->i_op = &ext3_file_inode_operations;
++ inode->i_fop = &ext3_file_operations;
++ inode->i_mapping->a_ops = &ext3_aops;
++ } else if (S_ISDIR(inode->i_mode)) {
++ inode->i_op = &ext3_dir_inode_operations;
++ inode->i_fop = &ext3_dir_operations;
++ } else if (S_ISLNK(inode->i_mode)) {
++ if (!inode->i_blocks)
++ inode->i_op = &ext3_fast_symlink_inode_operations;
++ else {
++ inode->i_op = &page_symlink_inode_operations;
++ inode->i_mapping->a_ops = &ext3_aops;
++ }
++ } else
++ init_special_inode(inode, inode->i_mode,
++ le32_to_cpu(iloc.raw_inode->i_block[0]));
++ /* inode->i_attr_flags = 0; unused */
++ if (inode->u.ext3_i.i_flags & EXT3_SYNC_FL) {
++ /* inode->i_attr_flags |= ATTR_FLAG_SYNCRONOUS; unused */
++ inode->i_flags |= S_SYNC;
++ }
++ if (inode->u.ext3_i.i_flags & EXT3_APPEND_FL) {
++ /* inode->i_attr_flags |= ATTR_FLAG_APPEND; unused */
++ inode->i_flags |= S_APPEND;
++ }
++ if (inode->u.ext3_i.i_flags & EXT3_IMMUTABLE_FL) {
++ /* inode->i_attr_flags |= ATTR_FLAG_IMMUTABLE; unused */
++ inode->i_flags |= S_IMMUTABLE;
++ }
++ if (inode->u.ext3_i.i_flags & EXT3_NOATIME_FL) {
++ /* inode->i_attr_flags |= ATTR_FLAG_NOATIME; unused */
++ inode->i_flags |= S_NOATIME;
++ }
++ return;
++
++bad_inode:
++ make_bad_inode(inode);
++ return;
++}
++
++/*
++ * Post the struct inode info into an on-disk inode location in the
++ * buffer-cache. This gobbles the caller's reference to the
++ * buffer_head in the inode location struct.
++ */
++
++static int ext3_do_update_inode(handle_t *handle,
++ struct inode *inode,
++ struct ext3_iloc *iloc)
++{
++ struct ext3_inode *raw_inode = iloc->raw_inode;
++ struct buffer_head *bh = iloc->bh;
++ int err = 0, rc, block;
++
++ if (handle) {
++ BUFFER_TRACE(bh, "get_write_access");
++ err = ext3_journal_get_write_access(handle, bh);
++ if (err)
++ goto out_brelse;
++ }
++ raw_inode->i_mode = cpu_to_le16(inode->i_mode);
++ if(!(test_opt(inode->i_sb, NO_UID32))) {
++ raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
++ raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
++/*
++ * Fix up interoperability with old kernels. Otherwise, old inodes get
++ * re-used with the upper 16 bits of the uid/gid intact
++ */
++ if(!inode->u.ext3_i.i_dtime) {
++ raw_inode->i_uid_high =
++ cpu_to_le16(high_16_bits(inode->i_uid));
++ raw_inode->i_gid_high =
++ cpu_to_le16(high_16_bits(inode->i_gid));
++ } else {
++ raw_inode->i_uid_high = 0;
++ raw_inode->i_gid_high = 0;
++ }
++ } else {
++ raw_inode->i_uid_low =
++ cpu_to_le16(fs_high2lowuid(inode->i_uid));
++ raw_inode->i_gid_low =
++ cpu_to_le16(fs_high2lowgid(inode->i_gid));
++ raw_inode->i_uid_high = 0;
++ raw_inode->i_gid_high = 0;
++ }
++ raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
++ raw_inode->i_size = cpu_to_le32(inode->u.ext3_i.i_disksize);
++ raw_inode->i_atime = cpu_to_le32(inode->i_atime);
++ raw_inode->i_ctime = cpu_to_le32(inode->i_ctime);
++ raw_inode->i_mtime = cpu_to_le32(inode->i_mtime);
++ raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
++ raw_inode->i_dtime = cpu_to_le32(inode->u.ext3_i.i_dtime);
++ raw_inode->i_flags = cpu_to_le32(inode->u.ext3_i.i_flags);
++#ifdef EXT3_FRAGMENTS
++ raw_inode->i_faddr = cpu_to_le32(inode->u.ext3_i.i_faddr);
++ raw_inode->i_frag = inode->u.ext3_i.i_frag_no;
++ raw_inode->i_fsize = inode->u.ext3_i.i_frag_size;
++#else
++ /* If we are not tracking these fields in the in-memory inode,
++ * then preserve them on disk, but still initialise them to zero
++ * for new inodes. */
++ if (EXT3_I(inode)->i_state & EXT3_STATE_NEW) {
++ raw_inode->i_faddr = 0;
++ raw_inode->i_frag = 0;
++ raw_inode->i_fsize = 0;
++ }
++#endif
++ raw_inode->i_file_acl = cpu_to_le32(inode->u.ext3_i.i_file_acl);
++ if (!S_ISREG(inode->i_mode)) {
++ raw_inode->i_dir_acl = cpu_to_le32(inode->u.ext3_i.i_dir_acl);
++ } else {
++ raw_inode->i_size_high =
++ cpu_to_le32(inode->u.ext3_i.i_disksize >> 32);
++ if (inode->u.ext3_i.i_disksize > 0x7fffffffULL) {
++ struct super_block *sb = inode->i_sb;
++ if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
++ EXT3_FEATURE_RO_COMPAT_LARGE_FILE) ||
++ EXT3_SB(sb)->s_es->s_rev_level ==
++ cpu_to_le32(EXT3_GOOD_OLD_REV)) {
++ /* If this is the first large file
++ * created, add a flag to the superblock.
++ */
++ err = ext3_journal_get_write_access(handle,
++ sb->u.ext3_sb.s_sbh);
++ if (err)
++ goto out_brelse;
++ ext3_update_dynamic_rev(sb);
++ EXT3_SET_RO_COMPAT_FEATURE(sb,
++ EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
++ sb->s_dirt = 1;
++ handle->h_sync = 1;
++ err = ext3_journal_dirty_metadata(handle,
++ sb->u.ext3_sb.s_sbh);
++ }
++ }
++ }
++	raw_inode->i_generation = cpu_to_le32(inode->i_generation);
++ if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
++ raw_inode->i_block[0] =
++ cpu_to_le32(kdev_t_to_nr(inode->i_rdev));
++ else for (block = 0; block < EXT3_N_BLOCKS; block++)
++ raw_inode->i_block[block] = inode->u.ext3_i.i_data[block];
++
++ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
++ rc = ext3_journal_dirty_metadata(handle, bh);
++ if (!err)
++ err = rc;
++ EXT3_I(inode)->i_state &= ~EXT3_STATE_NEW;
++
++out_brelse:
++ brelse (bh);
++ ext3_std_error(inode->i_sb, err);
++ return err;
++}
++
++/*
++ * ext3_write_inode()
++ *
++ * We are called from a few places:
++ *
++ * - Within generic_file_write() for O_SYNC files.
++ * Here, there will be no transaction running. We wait for any running
++ *   transaction to commit.
++ *
++ * - Within sys_sync(), kupdate and such.
++ *   We wait on commit, if told to.
++ *
++ * - Within prune_icache() (PF_MEMALLOC == true)
++ * Here we simply return. We can't afford to block kswapd on the
++ * journal commit.
++ *
++ * In all cases it is actually safe for us to return without doing anything,
++ * because the inode has been copied into a raw inode buffer in
++ * ext3_mark_inode_dirty(). This is a correctness thing for O_SYNC and for
++ * knfsd.
++ *
++ * Note that we are absolutely dependent upon all inode dirtiers doing the
++ * right thing: they *must* call mark_inode_dirty() after dirtying info in
++ * which we are interested.
++ *
++ * It would be a bug for them to not do this. The code:
++ *
++ * mark_inode_dirty(inode)
++ * stuff();
++ * inode->i_size = expr;
++ *
++ * is in error because a kswapd-driven write_inode() could occur while
++ * `stuff()' is running, and the new i_size will be lost. Plus the inode
++ * will no longer be on the superblock's dirty inode list.
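++ *
++ * The correct ordering (illustration, following the rule above) is:
++ *
++ *	stuff();
++ *	inode->i_size = expr;
++ *	mark_inode_dirty(inode);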
++ */
++void ext3_write_inode(struct inode *inode, int wait)
++{
++ if (current->flags & PF_MEMALLOC)
++ return;
++
++ if (ext3_journal_current_handle()) {
++ jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n");
++ return;
++ }
++
++ if (!wait)
++ return;
++
++ ext3_force_commit(inode->i_sb);
++}
++
++/*
++ * ext3_setattr()
++ *
++ * Called from notify_change.
++ *
++ * We want to trap VFS attempts to truncate the file as soon as
++ * possible. In particular, we want to make sure that when the VFS
++ * shrinks i_size, we put the inode on the orphan list and modify
++ * i_disksize immediately, so that during the subsequent flushing of
++ * dirty pages and freeing of disk blocks, we can guarantee that any
++ * commit will leave the blocks being flushed in an unused state on
++ * disk. (On recovery, the inode will get truncated and the blocks will
++ * be freed, so we have a strong guarantee that no future commit will
++ * leave these blocks visible to the user.)
++ *
++ * This is only needed for regular files. rmdir() has its own path, and
++ * we can never truncate a directory except on final unlink (at which
++ * point i_nlink is zero so recovery is easy.)
++ *
++ * Called with the BKL.
++ */
++
++int ext3_setattr(struct dentry *dentry, struct iattr *attr)
++{
++ struct inode *inode = dentry->d_inode;
++ int error, rc = 0;
++ const unsigned int ia_valid = attr->ia_valid;
++
++ error = inode_change_ok(inode, attr);
++ if (error)
++ return error;
++
++ if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
++ (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
++ error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
++ if (error)
++ return error;
++ }
++
++ if (attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
++ handle_t *handle;
++
++ handle = ext3_journal_start(inode, 3);
++ if (IS_ERR(handle)) {
++ error = PTR_ERR(handle);
++ goto err_out;
++ }
++
++ error = ext3_orphan_add(handle, inode);
++ inode->u.ext3_i.i_disksize = attr->ia_size;
++ rc = ext3_mark_inode_dirty(handle, inode);
++ if (!error)
++ error = rc;
++ ext3_journal_stop(handle, inode);
++ }
++
++ rc = inode_setattr(inode, attr);
++
++ /* If inode_setattr's call to ext3_truncate failed to get a
++ * transaction handle at all, we need to clean up the in-core
++ * orphan list manually. */
++ if (inode->i_nlink)
++ ext3_orphan_del(NULL, inode);
++
++err_out:
++ ext3_std_error(inode->i_sb, error);
++ if (!error)
++ error = rc;
++ return error;
++}
++
++
++/*
++ * akpm: how many blocks doth make a writepage()?
++ *
++ * With N blocks per page, it may be:
++ * N data blocks
++ *	2 indirect blocks
++ *	2 dindirect blocks
++ *	1 tindirect block
++ * N+5 bitmap blocks (from the above)
++ * N+5 group descriptor summary blocks
++ * 1 inode block
++ * 1 superblock.
++ *	2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quota files
++ *
++ * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
++ *
++ * With ordered or writeback data it's the same, less the N data blocks.
++ *
++ * If the inode's direct blocks can hold an integral number of pages then a
++ * page cannot straddle two indirect blocks, and we can only touch one indirect
++ * and dindirect block, and the "5" above becomes "3".
++ *
++ * This still overestimates under most circumstances. If we were to pass the
++ * start and end offsets in here as well we could do block_to_path() on each
++ * block and work out the exact number of indirects which are touched. Pah.
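++ *
++ * Worked example (illustration, derived from the code below): with
++ * 4096-byte blocks and pages, bpp == 1 and EXT3_NDIR_BLOCKS (12) holds
++ * an integral number of pages, so indirects == 3. data=journal then
++ * reserves 3 * (1 + 3) + 2 = 14 credits, ordered/writeback reserves
++ * 2 * (1 + 3) + 2 = 10, plus the quota blocks when CONFIG_QUOTA is set.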
++ */
++
++int ext3_writepage_trans_blocks(struct inode *inode)
++{
++ int bpp = ext3_journal_blocks_per_page(inode);
++ int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3;
++ int ret;
++
++ if (ext3_should_journal_data(inode))
++ ret = 3 * (bpp + indirects) + 2;
++ else
++ ret = 2 * (bpp + indirects) + 2;
++
++#ifdef CONFIG_QUOTA
++ ret += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS;
++#endif
++
++ return ret;
++}
++
++int
++ext3_mark_iloc_dirty(handle_t *handle,
++ struct inode *inode,
++ struct ext3_iloc *iloc)
++{
++ int err = 0;
++
++ if (handle) {
++ /* the do_update_inode consumes one bh->b_count */
++ atomic_inc(&iloc->bh->b_count);
++ err = ext3_do_update_inode(handle, inode, iloc);
++ /* ext3_do_update_inode() does journal_dirty_metadata */
++ brelse(iloc->bh);
++ } else {
++ printk(KERN_EMERG __FUNCTION__ ": called with no handle!\n");
++ }
++ return err;
++}
++
++/*
++ * On success, We end up with an outstanding reference count against
++ * iloc->bh. This _must_ be cleaned up later.
++ */
++
++int
++ext3_reserve_inode_write(handle_t *handle, struct inode *inode,
++ struct ext3_iloc *iloc)
++{
++ int err = 0;
++ if (handle) {
++ err = ext3_get_inode_loc(inode, iloc);
++ if (!err) {
++ BUFFER_TRACE(iloc->bh, "get_write_access");
++ err = ext3_journal_get_write_access(handle, iloc->bh);
++ if (err) {
++ brelse(iloc->bh);
++ iloc->bh = NULL;
++ }
++ }
++ }
++ ext3_std_error(inode->i_sb, err);
++ return err;
++}
++
++/*
++ * akpm: What we do here is to mark the in-core inode as clean
++ * with respect to inode dirtiness (it may still be data-dirty).
++ * This means that the in-core inode may be reaped by prune_icache
++ * without having to perform any I/O. This is a very good thing,
++ * because *any* task may call prune_icache - even ones which
++ * have a transaction open against a different journal.
++ *
++ * Is this cheating? Not really. Sure, we haven't written the
++ * inode out, but prune_icache isn't a user-visible syncing function.
++ * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
++ * we start and wait on commits.
++ *
++ * Is this efficient/effective? Well, we're being nice to the system
++ * by cleaning up our inodes proactively so they can be reaped
++ * without I/O. But we are potentially leaving up to five seconds'
++ * worth of inodes floating about which prune_icache wants us to
++ * write out. One way to fix that would be to get prune_icache()
++ * to do a write_super() to free up some memory. It has the desired
++ * effect.
++ */
++int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
++{
++ struct ext3_iloc iloc;
++ int err;
++
++ err = ext3_reserve_inode_write(handle, inode, &iloc);
++ if (!err)
++ err = ext3_mark_iloc_dirty(handle, inode, &iloc);
++ return err;
++}
++
++/*
++ * akpm: ext3_dirty_inode() is called from __mark_inode_dirty()
++ *
++ * We're really interested in the case where a file is being extended.
++ * i_size has been changed by generic_commit_write() and we thus need
++ * to include the updated inode in the current transaction.
++ *
++ * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
++ * are allocated to the file.
++ *
++ * If the inode is marked synchronous, we don't honour that here - doing
++ * so would cause a commit on atime updates, which we don't bother doing.
++ * We handle synchronous inodes at the highest possible level.
++ */
++void ext3_dirty_inode(struct inode *inode)
++{
++ handle_t *current_handle = ext3_journal_current_handle();
++ handle_t *handle;
++
++ lock_kernel();
++ handle = ext3_journal_start(inode, 1);
++ if (IS_ERR(handle))
++ goto out;
++ if (current_handle &&
++ current_handle->h_transaction != handle->h_transaction) {
++ /* This task has a transaction open against a different fs */
++ printk(KERN_EMERG __FUNCTION__": transactions do not match!\n");
++ } else {
++ jbd_debug(5, "marking dirty. outer handle=%p\n",
++ current_handle);
++ ext3_mark_inode_dirty(handle, inode);
++ }
++ ext3_journal_stop(handle, inode);
++out:
++ unlock_kernel();
++}
++
++#ifdef AKPM
++/*
++ * Bind an inode's backing buffer_head into this transaction, to prevent
++ * it from being flushed to disk early. Unlike
++ * ext3_reserve_inode_write, this leaves behind no bh reference and
++ * returns no iloc structure, so the caller needs to repeat the iloc
++ * lookup to mark the inode dirty later.
++ */
++static inline int
++ext3_pin_inode(handle_t *handle, struct inode *inode)
++{
++ struct ext3_iloc iloc;
++
++ int err = 0;
++ if (handle) {
++ err = ext3_get_inode_loc(inode, &iloc);
++ if (!err) {
++ BUFFER_TRACE(iloc.bh, "get_write_access");
++ err = journal_get_write_access(handle, iloc.bh);
++ if (!err)
++ err = ext3_journal_dirty_metadata(handle,
++ iloc.bh);
++ brelse(iloc.bh);
++ }
++ }
++ ext3_std_error(inode->i_sb, err);
++ return err;
++}
++#endif
++
++int ext3_change_inode_journal_flag(struct inode *inode, int val)
++{
++ journal_t *journal;
++ handle_t *handle;
++ int err;
++
++ /*
++ * We have to be very careful here: changing a data block's
++ * journaling status dynamically is dangerous. If we write a
++ * data block to the journal, change the status and then delete
++ * that block, we risk forgetting to revoke the old log record
++ * from the journal and so a subsequent replay can corrupt data.
++ * So, first we make sure that the journal is empty and that
++ * nobody is changing anything.
++ */
++
++ journal = EXT3_JOURNAL(inode);
++ if (is_journal_aborted(journal) || IS_RDONLY(inode))
++ return -EROFS;
++
++ journal_lock_updates(journal);
++ journal_flush(journal);
++
++ /*
++ * OK, there are no updates running now, and all cached data is
++ * synced to disk. We are now in a completely consistent state
++ * which doesn't have anything in the journal, and we know that
++ * no filesystem updates are running, so it is safe to modify
++ * the inode's in-core data-journaling state flag now.
++ */
++
++ if (val)
++ inode->u.ext3_i.i_flags |= EXT3_JOURNAL_DATA_FL;
++ else
++ inode->u.ext3_i.i_flags &= ~EXT3_JOURNAL_DATA_FL;
++
++ journal_unlock_updates(journal);
++
++ /* Finally we can mark the inode as dirty. */
++
++ handle = ext3_journal_start(inode, 1);
++ if (IS_ERR(handle))
++ return PTR_ERR(handle);
++
++ err = ext3_mark_inode_dirty(handle, inode);
++ handle->h_sync = 1;
++ ext3_journal_stop(handle, inode);
++ ext3_std_error(inode->i_sb, err);
++
++ return err;
++}
++
++
++/*
++ * ext3_aops_journal_start().
++ *
++ * <This function died, but the comment lives on>
++ *
++ * We need to take the inode semaphore *outside* the
++ * journal_start/journal_stop. Otherwise, a different task could do a
++ * wait_for_commit() while holding ->i_sem, which deadlocks. The rule
++ * is: transaction open/closes are considered to be a locking operation
++ * and they nest *inside* ->i_sem.
++ * ----------------------------------------------------------------------------
++ * Possible problem:
++ * ext3_file_write()
++ * -> generic_file_write()
++ * -> __alloc_pages()
++ * -> page_launder()
++ * -> ext3_writepage()
++ *
++ * And the writepage can be on a different fs while we have a
++ * transaction open against this one! Bad.
++ *
++ * I tried making the task PF_MEMALLOC here, but that simply results in
++ * 0-order allocation failures passed back to generic_file_write().
++ * Instead, we rely on the reentrancy protection in ext3_writepage().
++ * ----------------------------------------------------------------------------
++ * When we do the journal_start() here we don't really need to reserve
++ * any blocks - we won't need any until we hit ext3_prepare_write(),
++ * which does all the needed journal extending. However! There is a
++ * problem with quotas:
++ *
++ * Thread 1:
++ * sys_sync
++ * ->sync_dquots
++ * ->commit_dquot
++ * ->lock_dquot
++ * ->write_dquot
++ * ->ext3_file_write
++ * ->journal_start
++ * ->ext3_prepare_write
++ * ->journal_extend
++ * ->journal_start
++ * Thread 2:
++ * ext3_create (for example)
++ * ->ext3_new_inode
++ * ->dquot_initialize
++ * ->lock_dquot
++ *
++ * Deadlock. Thread 1's journal_start blocks because thread 2 has a
++ * transaction open. Thread 2's transaction will never close because
++ * thread 2 is stuck waiting for the dquot lock.
++ *
++ * So. We must ensure that thread 1 *never* needs to extend the journal
++ * for quota writes. We do that by reserving enough journal blocks
++ * here, in ext3_aops_journal_start() to ensure that the forthcoming "see if we
++ * need to extend" test in ext3_prepare_write() succeeds.
++ */
+diff -rup --new-file linux.mcp2/fs/ext3/ioctl.c linux_tmp/fs/ext3/ioctl.c
+--- linux.mcp2/fs/ext3/ioctl.c 1969-12-31 16:00:00.000000000 -0800
++++ linux_tmp/fs/ext3/ioctl.c 2001-11-09 14:25:04.000000000 -0800
+@@ -0,0 +1,170 @@
++/*
++ * linux/fs/ext3/ioctl.c
++ *
++ * Copyright (C) 1993, 1994, 1995
++ * Remy Card (card@masi.ibp.fr)
++ * Laboratoire MASI - Institut Blaise Pascal
++ * Universite Pierre et Marie Curie (Paris VI)
++ */
++
++#include <linux/fs.h>
++#include <linux/jbd.h>
++#include <linux/ext3_fs.h>
++#include <linux/ext3_jbd.h>
++#include <linux/sched.h>
++#include <asm/uaccess.h>
++
++
++int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
++ unsigned long arg)
++{
++ unsigned int flags;
++
++ ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg);
++
++ switch (cmd) {
++ case EXT3_IOC_GETFLAGS:
++ flags = inode->u.ext3_i.i_flags & EXT3_FL_USER_VISIBLE;
++ return put_user(flags, (int *) arg);
++ case EXT3_IOC_SETFLAGS: {
++ handle_t *handle = NULL;
++ int err;
++ struct ext3_iloc iloc;
++ unsigned int oldflags;
++ unsigned int jflag;
++
++ if (IS_RDONLY(inode))
++ return -EROFS;
++
++ if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
++ return -EPERM;
++
++ if (get_user(flags, (int *) arg))
++ return -EFAULT;
++
++ oldflags = inode->u.ext3_i.i_flags;
++
++ /* The JOURNAL_DATA flag is modifiable only by root */
++ jflag = flags & EXT3_JOURNAL_DATA_FL;
++
++ /*
++ * The IMMUTABLE and APPEND_ONLY flags can only be changed by
++ * the relevant capability.
++ *
++ * This test looks nicer. Thanks to Pauline Middelink
++ */
++ if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) {
++ if (!capable(CAP_LINUX_IMMUTABLE))
++ return -EPERM;
++ }
++
++ /*
++ * The JOURNAL_DATA flag can only be changed by
++ * the relevant capability.
++ */
++ if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) {
++ if (!capable(CAP_SYS_RESOURCE))
++ return -EPERM;
++ }
++
++
++ handle = ext3_journal_start(inode, 1);
++ if (IS_ERR(handle))
++ return PTR_ERR(handle);
++ if (IS_SYNC(inode))
++ handle->h_sync = 1;
++ err = ext3_reserve_inode_write(handle, inode, &iloc);
++ if (err)
++ goto flags_err;
++
++ flags = flags & EXT3_FL_USER_MODIFIABLE;
++ flags |= oldflags & ~EXT3_FL_USER_MODIFIABLE;
++ inode->u.ext3_i.i_flags = flags;
++
++ if (flags & EXT3_SYNC_FL)
++ inode->i_flags |= S_SYNC;
++ else
++ inode->i_flags &= ~S_SYNC;
++ if (flags & EXT3_APPEND_FL)
++ inode->i_flags |= S_APPEND;
++ else
++ inode->i_flags &= ~S_APPEND;
++ if (flags & EXT3_IMMUTABLE_FL)
++ inode->i_flags |= S_IMMUTABLE;
++ else
++ inode->i_flags &= ~S_IMMUTABLE;
++ if (flags & EXT3_NOATIME_FL)
++ inode->i_flags |= S_NOATIME;
++ else
++ inode->i_flags &= ~S_NOATIME;
++ inode->i_ctime = CURRENT_TIME;
++
++ err = ext3_mark_iloc_dirty(handle, inode, &iloc);
++flags_err:
++ ext3_journal_stop(handle, inode);
++ if (err)
++ return err;
++
++ if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL))
++ err = ext3_change_inode_journal_flag(inode, jflag);
++ return err;
++ }
++ case EXT3_IOC_GETVERSION:
++ case EXT3_IOC_GETVERSION_OLD:
++ return put_user(inode->i_generation, (int *) arg);
++ case EXT3_IOC_SETVERSION:
++ case EXT3_IOC_SETVERSION_OLD: {
++ handle_t *handle;
++ struct ext3_iloc iloc;
++ __u32 generation;
++ int err;
++
++ if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
++ return -EPERM;
++ if (IS_RDONLY(inode))
++ return -EROFS;
++ if (get_user(generation, (int *) arg))
++ return -EFAULT;
++
++ handle = ext3_journal_start(inode, 1);
++ if (IS_ERR(handle))
++ return PTR_ERR(handle);
++ err = ext3_reserve_inode_write(handle, inode, &iloc);
++ if (err)
++ return err;
++
++ inode->i_ctime = CURRENT_TIME;
++ inode->i_generation = generation;
++
++ err = ext3_mark_iloc_dirty(handle, inode, &iloc);
++ ext3_journal_stop(handle, inode);
++ return err;
++ }
++#ifdef CONFIG_JBD_DEBUG
++ case EXT3_IOC_WAIT_FOR_READONLY:
++ /*
++ * This is racy - by the time we're woken up and running,
++ * the superblock could be released. And the module could
++ * have been unloaded. So sue me.
++ *
++ * Returns 1 if it slept, else zero.
++ */
++ {
++ struct super_block *sb = inode->i_sb;
++ DECLARE_WAITQUEUE(wait, current);
++ int ret = 0;
++
++ set_current_state(TASK_INTERRUPTIBLE);
++ add_wait_queue(&sb->u.ext3_sb.ro_wait_queue, &wait);
++ if (timer_pending(&sb->u.ext3_sb.turn_ro_timer)) {
++ schedule();
++ ret = 1;
++ }
++ remove_wait_queue(&sb->u.ext3_sb.ro_wait_queue, &wait);
++ return ret;
++ }
++#endif
++ default:
++ return -ENOTTY;
++ }
++}
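
/*
 * A minimal userspace sketch of the flag-merging rule the SETFLAGS case
 * above relies on: only user-modifiable bits are taken from userspace,
 * every other bit is carried over from the old flags unchanged.
 * FL_USER_MODIFIABLE is a made-up mask standing in for the real
 * EXT3_FL_USER_MODIFIABLE value.
 */
#include <assert.h>
#include <stdio.h>

#define FL_USER_MODIFIABLE 0x000000ffu	/* illustrative mask only */

static unsigned int merge_flags(unsigned int oldflags, unsigned int userflags)
{
	return (userflags & FL_USER_MODIFIABLE) |
	       (oldflags & ~FL_USER_MODIFIABLE);
}

int main(void)
{
	unsigned int oldflags = 0x00010204u;	/* some kernel-private bits */
	unsigned int newflags = merge_flags(oldflags, 0xffffffffu);

	/* private bits survive no matter what userspace asked for */
	assert((newflags & ~FL_USER_MODIFIABLE) ==
	       (oldflags & ~FL_USER_MODIFIABLE));
	printf("old=%#x new=%#x\n", oldflags, newflags);
	return 0;
}
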
+diff -rup --new-file linux.mcp2/fs/ext3/namei.c linux_tmp/fs/ext3/namei.c
+--- linux.mcp2/fs/ext3/namei.c 1969-12-31 16:00:00.000000000 -0800
++++ linux_tmp/fs/ext3/namei.c 2001-11-09 14:25:04.000000000 -0800
+@@ -0,0 +1,1125 @@
++/*
++ * linux/fs/ext3/namei.c
++ *
++ * Copyright (C) 1992, 1993, 1994, 1995
++ * Remy Card (card@masi.ibp.fr)
++ * Laboratoire MASI - Institut Blaise Pascal
++ * Universite Pierre et Marie Curie (Paris VI)
++ *
++ * from
++ *
++ * linux/fs/minix/namei.c
++ *
++ * Copyright (C) 1991, 1992 Linus Torvalds
++ *
++ * Big-endian to little-endian byte-swapping/bitmaps by
++ * David S. Miller (davem@caip.rutgers.edu), 1995
++ * Directory entry file type support and forward compatibility hooks
++ * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998
++ */
++
++#include <linux/fs.h>
++#include <linux/jbd.h>
++#include <linux/sched.h>
++#include <linux/ext3_fs.h>
++#include <linux/ext3_jbd.h>
++#include <linux/fcntl.h>
++#include <linux/stat.h>
++#include <linux/string.h>
++#include <linux/locks.h>
++#include <linux/quotaops.h>
++
++
++/*
++ * define how far ahead to read directories while searching them.
++ */
++#define NAMEI_RA_CHUNKS 2
++#define NAMEI_RA_BLOCKS 4
++#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
++#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
++
++/*
++ * NOTE! unlike strncmp, ext3_match returns 1 for success, 0 for failure.
++ *
++ * `len <= EXT3_NAME_LEN' is guaranteed by caller.
++ * `de != NULL' is guaranteed by caller.
++ */
++static inline int ext3_match (int len, const char * const name,
++ struct ext3_dir_entry_2 * de)
++{
++ if (len != de->name_len)
++ return 0;
++ if (!de->inode)
++ return 0;
++ return !memcmp(name, de->name, len);
++}
++
++/*
++ * Returns 0 if not found, -1 on failure, and 1 on success
++ */
++static int inline search_dirblock(struct buffer_head * bh,
++ struct inode *dir,
++ struct dentry *dentry,
++ unsigned long offset,
++ struct ext3_dir_entry_2 ** res_dir)
++{
++ struct ext3_dir_entry_2 * de;
++ char * dlimit;
++ int de_len;
++ const char *name = dentry->d_name.name;
++ int namelen = dentry->d_name.len;
++
++ de = (struct ext3_dir_entry_2 *) bh->b_data;
++ dlimit = bh->b_data + dir->i_sb->s_blocksize;
++ while ((char *) de < dlimit) {
++ /* this code is executed quadratically often */
++ /* do minimal checking `by hand' */
++
++ if ((char *) de + namelen <= dlimit &&
++ ext3_match (namelen, name, de)) {
++ /* found a match - just to be sure, do a full check */
++ if (!ext3_check_dir_entry("ext3_find_entry",
++ dir, de, bh, offset))
++ return -1;
++ *res_dir = de;
++ return 1;
++ }
++ /* prevent looping on a bad block */
++ de_len = le16_to_cpu(de->rec_len);
++ if (de_len <= 0)
++ return -1;
++ offset += de_len;
++ de = (struct ext3_dir_entry_2 *) ((char *) de + de_len);
++ }
++ return 0;
++}
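
/*
 * A small standalone sketch of the rec_len walk search_dirblock() does:
 * entries are variable-length and chained by rec_len, and a zero rec_len
 * means the block is corrupt, so the loop must bail out rather than spin.
 * struct dirent2 is a simplified stand-in for ext3_dir_entry_2, not the
 * kernel definition.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct dirent2 {
	uint32_t inode;
	uint16_t rec_len;	/* bytes from here to the next entry */
	uint8_t  name_len;
	uint8_t  file_type;
	char     name[];
};

static const struct dirent2 *find_in_block(const char *block, size_t size,
					   const char *name, size_t namelen)
{
	const char *p = block, *limit = block + size;

	while (p < limit) {
		const struct dirent2 *de = (const struct dirent2 *)p;

		if (de->rec_len == 0)		/* bad block: stop looping */
			return NULL;
		if (de->inode && de->name_len == namelen &&
		    memcmp(de->name, name, namelen) == 0)
			return de;
		p += de->rec_len;
	}
	return NULL;
}

int main(void)
{
	char block[64] = { 0 };
	struct dirent2 *de = (struct dirent2 *)block;

	de->inode = 12;
	de->name_len = 3;
	de->rec_len = sizeof(block);	/* one entry spans the whole block */
	memcpy(de->name, "foo", 3);
	printf("found: %s\n",
	       find_in_block(block, sizeof(block), "foo", 3) ? "yes" : "no");
	return 0;
}
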
++
++/*
++ * ext3_find_entry()
++ *
++ * finds an entry in the specified directory with the wanted name. It
++ * returns the cache buffer in which the entry was found, and the entry
++ * itself (as a parameter - res_dir). It does NOT read the inode of the
++ * entry - you'll have to do that yourself if you want to.
++ *
++ * The returned buffer_head has ->b_count elevated. The caller is expected
++ * to brelse() it when appropriate.
++ */
++static struct buffer_head * ext3_find_entry (struct dentry *dentry,
++ struct ext3_dir_entry_2 ** res_dir)
++{
++ struct super_block * sb;
++ struct buffer_head * bh_use[NAMEI_RA_SIZE];
++ struct buffer_head * bh, *ret = NULL;
++ unsigned long start, block, b;
++ int ra_max = 0; /* Number of bh's in the readahead
++ buffer, bh_use[] */
++ int ra_ptr = 0; /* Current index into readahead
++ buffer */
++ int num = 0;
++ int nblocks, i, err;
++ struct inode *dir = dentry->d_parent->d_inode;
++
++ *res_dir = NULL;
++ sb = dir->i_sb;
++
++ nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb);
++ start = dir->u.ext3_i.i_dir_start_lookup;
++ if (start >= nblocks)
++ start = 0;
++ block = start;
++restart:
++ do {
++ /*
++ * We deal with the read-ahead logic here.
++ */
++ if (ra_ptr >= ra_max) {
++ /* Refill the readahead buffer */
++ ra_ptr = 0;
++ b = block;
++ for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
++ /*
++ * Terminate if we reach the end of the
++ * directory and must wrap, or if our
++ * search has finished at this block.
++ */
++ if (b >= nblocks || (num && block == start)) {
++ bh_use[ra_max] = NULL;
++ break;
++ }
++ num++;
++ bh = ext3_getblk(NULL, dir, b++, 0, &err);
++ bh_use[ra_max] = bh;
++ if (bh)
++ ll_rw_block(READ, 1, &bh);
++ }
++ }
++ if ((bh = bh_use[ra_ptr++]) == NULL)
++ goto next;
++ wait_on_buffer(bh);
++ if (!buffer_uptodate(bh)) {
++ /* read error, skip block & hope for the best */
++ brelse(bh);
++ goto next;
++ }
++ i = search_dirblock(bh, dir, dentry,
++ block << EXT3_BLOCK_SIZE_BITS(sb), res_dir);
++ if (i == 1) {
++ dir->u.ext3_i.i_dir_start_lookup = block;
++ ret = bh;
++ goto cleanup_and_exit;
++ } else {
++ brelse(bh);
++ if (i < 0)
++ goto cleanup_and_exit;
++ }
++ next:
++ if (++block >= nblocks)
++ block = 0;
++ } while (block != start);
++
++ /*
++ * If the directory has grown while we were searching, then
++ * search the last part of the directory before giving up.
++ */
++ block = nblocks;
++ nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb);
++ if (block < nblocks) {
++ start = 0;
++ goto restart;
++ }
++
++cleanup_and_exit:
++ /* Clean up the read-ahead blocks */
++ for (; ra_ptr < ra_max; ra_ptr++)
++ brelse (bh_use[ra_ptr]);
++ return ret;
++}
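
/*
 * The scan above starts at the cached i_dir_start_lookup block and wraps,
 * so every block is visited exactly once no matter where the search began.
 * A tiny standalone sketch of just that iteration order:
 */
#include <stdio.h>

int main(void)
{
	int nblocks = 5, start = 3, block = start;

	do {
		printf("visit block %d\n", block);	/* 3 4 0 1 2 */
		if (++block >= nblocks)
			block = 0;
	} while (block != start);
	return 0;
}
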
++
++static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry)
++{
++ struct inode * inode;
++ struct ext3_dir_entry_2 * de;
++ struct buffer_head * bh;
++
++ if (dentry->d_name.len > EXT3_NAME_LEN)
++ return ERR_PTR(-ENAMETOOLONG);
++
++ bh = ext3_find_entry(dentry, &de);
++ inode = NULL;
++ if (bh) {
++ unsigned long ino = le32_to_cpu(de->inode);
++ brelse (bh);
++ inode = iget(dir->i_sb, ino);
++
++ if (!inode)
++ return ERR_PTR(-EACCES);
++ }
++ d_add(dentry, inode);
++ return NULL;
++}
++
++#define S_SHIFT 12
++static unsigned char ext3_type_by_mode[S_IFMT >> S_SHIFT] = {
++ [S_IFREG >> S_SHIFT] EXT3_FT_REG_FILE,
++ [S_IFDIR >> S_SHIFT] EXT3_FT_DIR,
++ [S_IFCHR >> S_SHIFT] EXT3_FT_CHRDEV,
++ [S_IFBLK >> S_SHIFT] EXT3_FT_BLKDEV,
++ [S_IFIFO >> S_SHIFT] EXT3_FT_FIFO,
++ [S_IFSOCK >> S_SHIFT] EXT3_FT_SOCK,
++ [S_IFLNK >> S_SHIFT] EXT3_FT_SYMLINK,
++};
++
++static inline void ext3_set_de_type(struct super_block *sb,
++ struct ext3_dir_entry_2 *de,
++ umode_t mode) {
++ if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE))
++ de->file_type = ext3_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
++}
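
/*
 * The table above uses the old GCC "[index] value" initializer extension;
 * here is the same mode-to-file-type mapping in standard C99 designated
 * initializer syntax, runnable in userspace. The numeric values mirror
 * EXT3_FT_REG_FILE (1) through EXT3_FT_SYMLINK (7) but are written out
 * literally.
 */
#include <stdio.h>
#include <sys/stat.h>

#define S_SHIFT 12

static const unsigned char type_by_mode[S_IFMT >> S_SHIFT] = {
	[S_IFREG  >> S_SHIFT] = 1,	/* EXT3_FT_REG_FILE */
	[S_IFDIR  >> S_SHIFT] = 2,	/* EXT3_FT_DIR */
	[S_IFCHR  >> S_SHIFT] = 3,	/* EXT3_FT_CHRDEV */
	[S_IFBLK  >> S_SHIFT] = 4,	/* EXT3_FT_BLKDEV */
	[S_IFIFO  >> S_SHIFT] = 5,	/* EXT3_FT_FIFO */
	[S_IFSOCK >> S_SHIFT] = 6,	/* EXT3_FT_SOCK */
	[S_IFLNK  >> S_SHIFT] = 7,	/* EXT3_FT_SYMLINK */
};

int main(void)
{
	printf("dir -> %u\n", type_by_mode[(S_IFDIR & S_IFMT) >> S_SHIFT]);
	return 0;
}
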
++
++/*
++ * ext3_add_entry()
++ *
++ * adds a file entry to the specified directory, using the same
++ * semantics as ext3_find_entry(). It returns NULL if it failed.
++ *
++ * NOTE!! The inode part of 'de' is left at 0 - which means you
++ * may not sleep between calling this and putting something into
++ * the entry, as someone else might have used it while you slept.
++ */
++
++/*
++ * AKPM: the journalling code here looks wrong on the error paths
++ */
++static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
++ struct inode *inode)
++{
++ struct inode *dir = dentry->d_parent->d_inode;
++ const char *name = dentry->d_name.name;
++ int namelen = dentry->d_name.len;
++ unsigned long offset;
++ unsigned short rec_len;
++ struct buffer_head * bh;
++ struct ext3_dir_entry_2 * de, * de1;
++ struct super_block * sb;
++ int retval;
++
++ sb = dir->i_sb;
++
++ if (!namelen)
++ return -EINVAL;
++ bh = ext3_bread (handle, dir, 0, 0, &retval);
++ if (!bh)
++ return retval;
++ rec_len = EXT3_DIR_REC_LEN(namelen);
++ offset = 0;
++ de = (struct ext3_dir_entry_2 *) bh->b_data;
++ while (1) {
++ if ((char *)de >= sb->s_blocksize + bh->b_data) {
++ brelse (bh);
++ bh = NULL;
++ bh = ext3_bread (handle, dir,
++ offset >> EXT3_BLOCK_SIZE_BITS(sb), 1, &retval);
++ if (!bh)
++ return retval;
++ if (dir->i_size <= offset) {
++ if (dir->i_size == 0) {
++ brelse(bh);
++ return -ENOENT;
++ }
++
++ ext3_debug ("creating next block\n");
++
++ BUFFER_TRACE(bh, "get_write_access");
++ ext3_journal_get_write_access(handle, bh);
++ de = (struct ext3_dir_entry_2 *) bh->b_data;
++ de->inode = 0;
++				de->rec_len = cpu_to_le16(sb->s_blocksize);
++ dir->u.ext3_i.i_disksize =
++ dir->i_size = offset + sb->s_blocksize;
++ dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
++ ext3_mark_inode_dirty(handle, dir);
++ } else {
++
++ ext3_debug ("skipping to next block\n");
++
++ de = (struct ext3_dir_entry_2 *) bh->b_data;
++ }
++ }
++ if (!ext3_check_dir_entry ("ext3_add_entry", dir, de, bh,
++ offset)) {
++ brelse (bh);
++ return -ENOENT;
++ }
++ if (ext3_match (namelen, name, de)) {
++ brelse (bh);
++ return -EEXIST;
++ }
++ if ((le32_to_cpu(de->inode) == 0 &&
++ le16_to_cpu(de->rec_len) >= rec_len) ||
++ (le16_to_cpu(de->rec_len) >=
++ EXT3_DIR_REC_LEN(de->name_len) + rec_len)) {
++ BUFFER_TRACE(bh, "get_write_access");
++ ext3_journal_get_write_access(handle, bh);
++ /* By now the buffer is marked for journaling */
++ offset += le16_to_cpu(de->rec_len);
++ if (le32_to_cpu(de->inode)) {
++ de1 = (struct ext3_dir_entry_2 *) ((char *) de +
++ EXT3_DIR_REC_LEN(de->name_len));
++ de1->rec_len =
++ cpu_to_le16(le16_to_cpu(de->rec_len) -
++ EXT3_DIR_REC_LEN(de->name_len));
++ de->rec_len = cpu_to_le16(
++ EXT3_DIR_REC_LEN(de->name_len));
++ de = de1;
++ }
++ de->file_type = EXT3_FT_UNKNOWN;
++ if (inode) {
++ de->inode = cpu_to_le32(inode->i_ino);
++ ext3_set_de_type(dir->i_sb, de, inode->i_mode);
++ } else
++ de->inode = 0;
++ de->name_len = namelen;
++ memcpy (de->name, name, namelen);
++ /*
++ * XXX shouldn't update any times until successful
++ * completion of syscall, but too many callers depend
++ * on this.
++ *
++ * XXX similarly, too many callers depend on
++ * ext3_new_inode() setting the times, but error
++ * recovery deletes the inode, so the worst that can
++ * happen is that the times are slightly out of date
++ * and/or different from the directory change time.
++ */
++ dir->i_mtime = dir->i_ctime = CURRENT_TIME;
++ dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
++ ext3_mark_inode_dirty(handle, dir);
++ dir->i_version = ++event;
++ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
++ ext3_journal_dirty_metadata(handle, bh);
++ brelse(bh);
++ return 0;
++ }
++ offset += le16_to_cpu(de->rec_len);
++ de = (struct ext3_dir_entry_2 *)
++ ((char *) de + le16_to_cpu(de->rec_len));
++ }
++ brelse (bh);
++ return -ENOSPC;
++}
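
/*
 * A userspace sketch of the slot-splitting arithmetic above: an in-use
 * entry owns rec_len bytes but only needs DIR_REC_LEN(name_len) of them
 * (an 8-byte header plus the name, rounded up to 4 bytes, mirroring
 * EXT3_DIR_REC_LEN); the slack can host a new entry. struct dirent2 is a
 * simplified stand-in for ext3_dir_entry_2.
 */
#include <stdint.h>
#include <stdio.h>

#define DIR_REC_LEN(nl)	(((nl) + 8 + 3) & ~3)

struct dirent2 {
	uint32_t inode;
	uint16_t rec_len;
	uint8_t  name_len;
	uint8_t  file_type;
	char     name[];
};

/* Carve a new entry out of de's slack space, if there is enough. */
static struct dirent2 *split_entry(struct dirent2 *de, int new_namelen)
{
	int used = DIR_REC_LEN(de->name_len);
	struct dirent2 *nde;

	if (de->rec_len < used + DIR_REC_LEN(new_namelen))
		return NULL;				/* not enough slack */
	nde = (struct dirent2 *)((char *)de + used);
	nde->rec_len = de->rec_len - used;	/* new entry takes the rest */
	de->rec_len  = used;			/* old entry shrinks to fit */
	return nde;
}

int main(void)
{
	char block[64] = { 0 };
	struct dirent2 *de = (struct dirent2 *)block, *nde;

	de->inode = 11;
	de->name_len = 1;
	de->rec_len = sizeof(block);
	nde = split_entry(de, 3);
	printf("old rec_len=%u, new rec_len=%u\n",	/* 12 and 52 */
	       (unsigned)de->rec_len, nde ? (unsigned)nde->rec_len : 0);
	return 0;
}
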
++
++/*
++ * ext3_delete_entry deletes a directory entry by merging it with the
++ * previous entry
++ */
++static int ext3_delete_entry (handle_t *handle,
++ struct inode * dir,
++ struct ext3_dir_entry_2 * de_del,
++ struct buffer_head * bh)
++{
++ struct ext3_dir_entry_2 * de, * pde;
++ int i;
++
++ i = 0;
++ pde = NULL;
++ de = (struct ext3_dir_entry_2 *) bh->b_data;
++ while (i < bh->b_size) {
++ if (!ext3_check_dir_entry("ext3_delete_entry", dir, de, bh, i))
++ return -EIO;
++ if (de == de_del) {
++ BUFFER_TRACE(bh, "get_write_access");
++ ext3_journal_get_write_access(handle, bh);
++ if (pde)
++ pde->rec_len =
++ cpu_to_le16(le16_to_cpu(pde->rec_len) +
++ le16_to_cpu(de->rec_len));
++ else
++ de->inode = 0;
++ dir->i_version = ++event;
++ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
++ ext3_journal_dirty_metadata(handle, bh);
++ return 0;
++ }
++ i += le16_to_cpu(de->rec_len);
++ pde = de;
++ de = (struct ext3_dir_entry_2 *)
++ ((char *) de + le16_to_cpu(de->rec_len));
++ }
++ return -ENOENT;
++}
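
/*
 * ext3_delete_entry's merge trick in isolation: no bytes move, the
 * predecessor's rec_len simply grows to swallow the victim, making it
 * unreachable by the rec_len walk. The first entry in a block has no
 * predecessor, so it is "deleted" by zeroing its inode instead. Again,
 * struct dirent2 is a simplified stand-in for ext3_dir_entry_2.
 */
#include <stdint.h>
#include <stdio.h>

struct dirent2 {
	uint32_t inode;
	uint16_t rec_len;
	uint8_t  name_len;
	uint8_t  file_type;
	char     name[];
};

static void delete_entry(struct dirent2 *prev, struct dirent2 *victim)
{
	if (prev)
		prev->rec_len += victim->rec_len;
	else
		victim->inode = 0;
}

int main(void)
{
	char block[64] = { 0 };
	struct dirent2 *a = (struct dirent2 *)block;
	struct dirent2 *b = (struct dirent2 *)(block + 16);

	a->rec_len = 16;
	b->rec_len = 48;
	delete_entry(a, b);
	printf("a->rec_len=%u\n", (unsigned)a->rec_len); /* 64: b is gone */
	return 0;
}
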
++
++/*
++ * ext3_mark_inode_dirty is somewhat expensive, so unlike ext2 we
++ * do not perform it in these functions. We perform it at the call site,
++ * if it is needed.
++ */
++static inline void ext3_inc_count(handle_t *handle, struct inode *inode)
++{
++ inode->i_nlink++;
++}
++
++static inline void ext3_dec_count(handle_t *handle, struct inode *inode)
++{
++ inode->i_nlink--;
++}
++
++static int ext3_add_nondir(handle_t *handle,
++ struct dentry *dentry, struct inode *inode)
++{
++ int err = ext3_add_entry(handle, dentry, inode);
++ if (!err) {
++ d_instantiate(dentry, inode);
++ return 0;
++ }
++ ext3_dec_count(handle, inode);
++ iput(inode);
++ return err;
++}
++
++/*
++ * By the time this is called, we already have created
++ * the directory cache entry for the new file, but it
++ * is so far negative - it has no inode.
++ *
++ * If the create succeeds, we fill in the inode information
++ * with d_instantiate().
++ */
++static int ext3_create (struct inode * dir, struct dentry * dentry, int mode)
++{
++ handle_t *handle;
++ struct inode * inode;
++ int err;
++
++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3);
++ if (IS_ERR(handle))
++ return PTR_ERR(handle);
++
++ if (IS_SYNC(dir))
++ handle->h_sync = 1;
++
++ inode = ext3_new_inode (handle, dir, mode);
++ err = PTR_ERR(inode);
++ if (!IS_ERR(inode)) {
++ inode->i_op = &ext3_file_inode_operations;
++ inode->i_fop = &ext3_file_operations;
++ inode->i_mapping->a_ops = &ext3_aops;
++ ext3_mark_inode_dirty(handle, inode);
++ err = ext3_add_nondir(handle, dentry, inode);
++ }
++ ext3_journal_stop(handle, dir);
++ return err;
++}
++
++static int ext3_mknod (struct inode * dir, struct dentry *dentry,
++ int mode, int rdev)
++{
++ handle_t *handle;
++ struct inode *inode;
++ int err;
++
++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3);
++ if (IS_ERR(handle))
++ return PTR_ERR(handle);
++
++ if (IS_SYNC(dir))
++ handle->h_sync = 1;
++
++ inode = ext3_new_inode (handle, dir, mode);
++ err = PTR_ERR(inode);
++ if (!IS_ERR(inode)) {
++ init_special_inode(inode, mode, rdev);
++ ext3_mark_inode_dirty(handle, inode);
++ err = ext3_add_nondir(handle, dentry, inode);
++ }
++ ext3_journal_stop(handle, dir);
++ return err;
++}
++
++static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode)
++{
++ handle_t *handle;
++ struct inode * inode;
++ struct buffer_head * dir_block;
++ struct ext3_dir_entry_2 * de;
++ int err;
++
++ if (dir->i_nlink >= EXT3_LINK_MAX)
++ return -EMLINK;
++
++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3);
++ if (IS_ERR(handle))
++ return PTR_ERR(handle);
++
++ if (IS_SYNC(dir))
++ handle->h_sync = 1;
++
++ inode = ext3_new_inode (handle, dir, S_IFDIR);
++ err = PTR_ERR(inode);
++ if (IS_ERR(inode))
++ goto out_stop;
++
++ inode->i_op = &ext3_dir_inode_operations;
++ inode->i_fop = &ext3_dir_operations;
++ inode->i_size = inode->u.ext3_i.i_disksize = inode->i_sb->s_blocksize;
++ inode->i_blocks = 0;
++ dir_block = ext3_bread (handle, inode, 0, 1, &err);
++ if (!dir_block) {
++ inode->i_nlink--; /* is this nlink == 0? */
++ ext3_mark_inode_dirty(handle, inode);
++ iput (inode);
++ goto out_stop;
++ }
++ BUFFER_TRACE(dir_block, "get_write_access");
++ ext3_journal_get_write_access(handle, dir_block);
++ de = (struct ext3_dir_entry_2 *) dir_block->b_data;
++ de->inode = cpu_to_le32(inode->i_ino);
++ de->name_len = 1;
++ de->rec_len = cpu_to_le16(EXT3_DIR_REC_LEN(de->name_len));
++ strcpy (de->name, ".");
++ ext3_set_de_type(dir->i_sb, de, S_IFDIR);
++ de = (struct ext3_dir_entry_2 *)
++ ((char *) de + le16_to_cpu(de->rec_len));
++ de->inode = cpu_to_le32(dir->i_ino);
++ de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize-EXT3_DIR_REC_LEN(1));
++ de->name_len = 2;
++ strcpy (de->name, "..");
++ ext3_set_de_type(dir->i_sb, de, S_IFDIR);
++ inode->i_nlink = 2;
++ BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata");
++ ext3_journal_dirty_metadata(handle, dir_block);
++ brelse (dir_block);
++ inode->i_mode = S_IFDIR | mode;
++ if (dir->i_mode & S_ISGID)
++ inode->i_mode |= S_ISGID;
++ ext3_mark_inode_dirty(handle, inode);
++ err = ext3_add_entry (handle, dentry, inode);
++ if (err)
++ goto out_no_entry;
++ dir->i_nlink++;
++ dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
++ ext3_mark_inode_dirty(handle, dir);
++ d_instantiate(dentry, inode);
++out_stop:
++ ext3_journal_stop(handle, dir);
++ return err;
++
++out_no_entry:
++ inode->i_nlink = 0;
++ ext3_mark_inode_dirty(handle, inode);
++ iput (inode);
++ goto out_stop;
++}
++
++/*
++ * routine to check that the specified directory is empty (for rmdir)
++ */
++static int empty_dir (struct inode * inode)
++{
++ unsigned long offset;
++ struct buffer_head * bh;
++ struct ext3_dir_entry_2 * de, * de1;
++ struct super_block * sb;
++ int err;
++
++ sb = inode->i_sb;
++ if (inode->i_size < EXT3_DIR_REC_LEN(1) + EXT3_DIR_REC_LEN(2) ||
++ !(bh = ext3_bread (NULL, inode, 0, 0, &err))) {
++ ext3_warning (inode->i_sb, "empty_dir",
++ "bad directory (dir #%lu) - no data block",
++ inode->i_ino);
++ return 1;
++ }
++ de = (struct ext3_dir_entry_2 *) bh->b_data;
++ de1 = (struct ext3_dir_entry_2 *)
++ ((char *) de + le16_to_cpu(de->rec_len));
++ if (le32_to_cpu(de->inode) != inode->i_ino ||
++ !le32_to_cpu(de1->inode) ||
++ strcmp (".", de->name) ||
++ strcmp ("..", de1->name)) {
++ ext3_warning (inode->i_sb, "empty_dir",
++ "bad directory (dir #%lu) - no `.' or `..'",
++ inode->i_ino);
++ brelse (bh);
++ return 1;
++ }
++ offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len);
++ de = (struct ext3_dir_entry_2 *)
++ ((char *) de1 + le16_to_cpu(de1->rec_len));
++ while (offset < inode->i_size ) {
++ if (!bh ||
++ (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
++ brelse (bh);
++ bh = ext3_bread (NULL, inode,
++ offset >> EXT3_BLOCK_SIZE_BITS(sb), 0, &err);
++ if (!bh) {
++#if 0
++ ext3_error (sb, "empty_dir",
++ "directory #%lu contains a hole at offset %lu",
++ inode->i_ino, offset);
++#endif
++ offset += sb->s_blocksize;
++ continue;
++ }
++ de = (struct ext3_dir_entry_2 *) bh->b_data;
++ }
++ if (!ext3_check_dir_entry ("empty_dir", inode, de, bh,
++ offset)) {
++ brelse (bh);
++ return 1;
++ }
++ if (le32_to_cpu(de->inode)) {
++ brelse (bh);
++ return 0;
++ }
++ offset += le16_to_cpu(de->rec_len);
++ de = (struct ext3_dir_entry_2 *)
++ ((char *) de + le16_to_cpu(de->rec_len));
++ }
++ brelse (bh);
++ return 1;
++}
++
++/* ext3_orphan_add() links an unlinked or truncated inode into a list of
++ * such inodes, starting at the superblock, in case we crash before the
++ * file is closed/deleted, or in case the inode truncate spans multiple
++ * transactions and the last transaction is not recovered after a crash.
++ *
++ * At filesystem recovery time, we walk this list deleting unlinked
++ * inodes and truncating linked inodes in ext3_orphan_cleanup().
++ */
++int ext3_orphan_add(handle_t *handle, struct inode *inode)
++{
++ struct super_block *sb = inode->i_sb;
++ struct ext3_iloc iloc;
++ int err = 0, rc;
++
++ lock_super(sb);
++ if (!list_empty(&inode->u.ext3_i.i_orphan))
++ goto out_unlock;
++
++ /* Orphan handling is only valid for files with data blocks
++ * being truncated, or files being unlinked. */
++
++ /* @@@ FIXME: Observation from aviro:
++ * I think I can trigger J_ASSERT in ext3_orphan_add(). We block
++ * here (on lock_super()), so race with ext3_link() which might bump
++ * ->i_nlink. For, say it, character device. Not a regular file,
++ * not a directory, not a symlink and ->i_nlink > 0.
++ */
++ J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
++ S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
++
++ BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access");
++ err = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh);
++ if (err)
++ goto out_unlock;
++
++ err = ext3_reserve_inode_write(handle, inode, &iloc);
++ if (err)
++ goto out_unlock;
++
++ /* Insert this inode at the head of the on-disk orphan list... */
++ NEXT_ORPHAN(inode) = le32_to_cpu(EXT3_SB(sb)->s_es->s_last_orphan);
++ EXT3_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
++ err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh);
++ rc = ext3_mark_iloc_dirty(handle, inode, &iloc);
++ if (!err)
++ err = rc;
++
++ /* Only add to the head of the in-memory list if all the
++ * previous operations succeeded. If the orphan_add is going to
++ * fail (possibly taking the journal offline), we can't risk
++ * leaving the inode on the orphan list: stray orphan-list
++ * entries can cause panics at unmount time.
++ *
++ * This is safe: on error we're going to ignore the orphan list
++ * anyway on the next recovery. */
++ if (!err)
++ list_add(&inode->u.ext3_i.i_orphan, &EXT3_SB(sb)->s_orphan);
++
++ jbd_debug(4, "superblock will point to %ld\n", inode->i_ino);
++ jbd_debug(4, "orphan inode %ld will point to %d\n",
++ inode->i_ino, NEXT_ORPHAN(inode));
++out_unlock:
++ unlock_super(sb);
++ ext3_std_error(inode->i_sb, err);
++ return err;
++}
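
/*
 * A toy model of the on-disk orphan chain built above: the superblock
 * holds the head inode number, each orphan inode stores the next one
 * (NEXT_ORPHAN() in the real code), and additions push at the head, with
 * inode number 0 terminating the list. Plain arrays stand in for the
 * superblock and inode fields.
 */
#include <stdint.h>
#include <stdio.h>

#define NINODES 16
static uint32_t sb_last_orphan;		/* plays es->s_last_orphan */
static uint32_t next_orphan[NINODES];	/* plays NEXT_ORPHAN(inode) */

static void orphan_add(uint32_t ino)
{
	next_orphan[ino] = sb_last_orphan;	/* new head points at old */
	sb_last_orphan = ino;
}

int main(void)
{
	orphan_add(7);
	orphan_add(3);
	orphan_add(9);
	for (uint32_t ino = sb_last_orphan; ino != 0; ino = next_orphan[ino])
		printf("orphan %u\n", (unsigned)ino);	/* 9, 3, 7 */
	return 0;
}
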
++
++/*
++ * ext3_orphan_del() removes an unlinked or truncated inode from the list
++ * of such inodes stored on disk, because it is finally being cleaned up.
++ */
++int ext3_orphan_del(handle_t *handle, struct inode *inode)
++{
++ struct list_head *prev;
++ struct ext3_sb_info *sbi;
++ ino_t ino_next;
++ struct ext3_iloc iloc;
++ int err = 0;
++
++ lock_super(inode->i_sb);
++ if (list_empty(&inode->u.ext3_i.i_orphan)) {
++ unlock_super(inode->i_sb);
++ return 0;
++ }
++
++ ino_next = NEXT_ORPHAN(inode);
++ prev = inode->u.ext3_i.i_orphan.prev;
++ sbi = EXT3_SB(inode->i_sb);
++
++ jbd_debug(4, "remove inode %ld from orphan list\n", inode->i_ino);
++
++ list_del(&inode->u.ext3_i.i_orphan);
++ INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan);
++
++ /* If we're on an error path, we may not have a valid
++ * transaction handle with which to update the orphan list on
++ * disk, but we still need to remove the inode from the linked
++ * list in memory. */
++ if (!handle)
++ goto out;
++
++ err = ext3_reserve_inode_write(handle, inode, &iloc);
++ if (err)
++ goto out_err;
++
++ if (prev == &sbi->s_orphan) {
++ jbd_debug(4, "superblock will point to %ld\n", ino_next);
++ BUFFER_TRACE(sbi->s_sbh, "get_write_access");
++ err = ext3_journal_get_write_access(handle, sbi->s_sbh);
++ if (err)
++ goto out_brelse;
++ sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
++ err = ext3_journal_dirty_metadata(handle, sbi->s_sbh);
++ } else {
++ struct ext3_iloc iloc2;
++ struct inode *i_prev =
++ list_entry(prev, struct inode, u.ext3_i.i_orphan);
++
++ jbd_debug(4, "orphan inode %ld will point to %ld\n",
++ i_prev->i_ino, ino_next);
++ err = ext3_reserve_inode_write(handle, i_prev, &iloc2);
++ if (err)
++ goto out_brelse;
++ NEXT_ORPHAN(i_prev) = ino_next;
++ err = ext3_mark_iloc_dirty(handle, i_prev, &iloc2);
++ }
++ if (err)
++ goto out_brelse;
++ NEXT_ORPHAN(inode) = 0;
++ err = ext3_mark_iloc_dirty(handle, inode, &iloc);
++ if (err)
++ goto out_brelse;
++
++out_err:
++ ext3_std_error(inode->i_sb, err);
++out:
++ unlock_super(inode->i_sb);
++ return err;
++
++out_brelse:
++ brelse(iloc.bh);
++ goto out_err;
++}
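
/*
 * The matching removal, in the same toy model as the orphan_add sketch:
 * unlinking needs the predecessor, and the two branches below correspond
 * to ext3_orphan_del's "prev == &sbi->s_orphan" (we are the head, update
 * the superblock) and "some other inode points at us" cases.
 */
#include <stdint.h>
#include <stdio.h>

#define NINODES 16
static uint32_t sb_last_orphan;		/* plays es->s_last_orphan */
static uint32_t next_orphan[NINODES];	/* plays NEXT_ORPHAN(inode) */

static void orphan_del(uint32_t ino, uint32_t prev) /* prev 0 == head */
{
	if (prev == 0)
		sb_last_orphan = next_orphan[ino];
	else
		next_orphan[prev] = next_orphan[ino];
	next_orphan[ino] = 0;
}

int main(void)
{
	/* hand-build the chain 9 -> 3 -> 7 */
	sb_last_orphan = 9;
	next_orphan[9] = 3;
	next_orphan[3] = 7;
	orphan_del(3, 9);			/* remove from the middle */
	printf("head=%u, next=%u\n",		/* 9, 7 */
	       (unsigned)sb_last_orphan, (unsigned)next_orphan[9]);
	return 0;
}
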
++
++static int ext3_rmdir (struct inode * dir, struct dentry *dentry)
++{
++ int retval;
++ struct inode * inode;
++ struct buffer_head * bh;
++ struct ext3_dir_entry_2 * de;
++ handle_t *handle;
++
++ handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS);
++ if (IS_ERR(handle))
++ return PTR_ERR(handle);
++
++ retval = -ENOENT;
++ bh = ext3_find_entry (dentry, &de);
++ if (!bh)
++ goto end_rmdir;
++
++ if (IS_SYNC(dir))
++ handle->h_sync = 1;
++
++ inode = dentry->d_inode;
++ DQUOT_INIT(inode);
++
++ retval = -EIO;
++ if (le32_to_cpu(de->inode) != inode->i_ino)
++ goto end_rmdir;
++
++ retval = -ENOTEMPTY;
++ if (!empty_dir (inode))
++ goto end_rmdir;
++
++ retval = ext3_delete_entry(handle, dir, de, bh);
++ if (retval)
++ goto end_rmdir;
++ if (inode->i_nlink != 2)
++ ext3_warning (inode->i_sb, "ext3_rmdir",
++ "empty directory has nlink!=2 (%d)",
++ inode->i_nlink);
++ inode->i_version = ++event;
++ inode->i_nlink = 0;
++ /* There's no need to set i_disksize: the fact that i_nlink is
++ * zero will ensure that the right thing happens during any
++ * recovery. */
++ inode->i_size = 0;
++ ext3_orphan_add(handle, inode);
++ ext3_mark_inode_dirty(handle, inode);
++ dir->i_nlink--;
++ inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
++ dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
++ ext3_mark_inode_dirty(handle, dir);
++
++end_rmdir:
++ ext3_journal_stop(handle, dir);
++ brelse (bh);
++ return retval;
++}
++
++static int ext3_unlink(struct inode * dir, struct dentry *dentry)
++{
++ int retval;
++ struct inode * inode;
++ struct buffer_head * bh;
++ struct ext3_dir_entry_2 * de;
++ handle_t *handle;
++
++ handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS);
++ if (IS_ERR(handle))
++ return PTR_ERR(handle);
++
++ if (IS_SYNC(dir))
++ handle->h_sync = 1;
++
++ retval = -ENOENT;
++ bh = ext3_find_entry (dentry, &de);
++ if (!bh)
++ goto end_unlink;
++
++ inode = dentry->d_inode;
++ DQUOT_INIT(inode);
++
++ retval = -EIO;
++ if (le32_to_cpu(de->inode) != inode->i_ino)
++ goto end_unlink;
++
++ if (!inode->i_nlink) {
++ ext3_warning (inode->i_sb, "ext3_unlink",
++ "Deleting nonexistent file (%lu), %d",
++ inode->i_ino, inode->i_nlink);
++ inode->i_nlink = 1;
++ }
++ retval = ext3_delete_entry(handle, dir, de, bh);
++ if (retval)
++ goto end_unlink;
++ dir->i_ctime = dir->i_mtime = CURRENT_TIME;
++ dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
++ ext3_mark_inode_dirty(handle, dir);
++ inode->i_nlink--;
++ if (!inode->i_nlink)
++ ext3_orphan_add(handle, inode);
++ ext3_mark_inode_dirty(handle, inode);
++ inode->i_ctime = dir->i_ctime;
++ retval = 0;
++
++end_unlink:
++ ext3_journal_stop(handle, dir);
++ brelse (bh);
++ return retval;
++}
++
++static int ext3_symlink (struct inode * dir,
++ struct dentry *dentry, const char * symname)
++{
++ handle_t *handle;
++ struct inode * inode;
++ int l, err;
++
++ l = strlen(symname)+1;
++ if (l > dir->i_sb->s_blocksize)
++ return -ENAMETOOLONG;
++
++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 5);
++ if (IS_ERR(handle))
++ return PTR_ERR(handle);
++
++ if (IS_SYNC(dir))
++ handle->h_sync = 1;
++
++ inode = ext3_new_inode (handle, dir, S_IFLNK|S_IRWXUGO);
++ err = PTR_ERR(inode);
++ if (IS_ERR(inode))
++ goto out_stop;
++
++ if (l > sizeof (inode->u.ext3_i.i_data)) {
++ inode->i_op = &page_symlink_inode_operations;
++ inode->i_mapping->a_ops = &ext3_aops;
++ /*
++ * block_symlink() calls back into ext3_prepare/commit_write.
++ * We have a transaction open. All is sweetness. It also sets
++ * i_size in generic_commit_write().
++ */
++ err = block_symlink(inode, symname, l);
++ if (err)
++ goto out_no_entry;
++ } else {
++ inode->i_op = &ext3_fast_symlink_inode_operations;
++ memcpy((char*)&inode->u.ext3_i.i_data,symname,l);
++ inode->i_size = l-1;
++ }
++ inode->u.ext3_i.i_disksize = inode->i_size;
++ ext3_mark_inode_dirty(handle, inode);
++ err = ext3_add_nondir(handle, dentry, inode);
++out_stop:
++ ext3_journal_stop(handle, dir);
++ return err;
++
++out_no_entry:
++ ext3_dec_count(handle, inode);
++ ext3_mark_inode_dirty(handle, inode);
++ iput (inode);
++ goto out_stop;
++}
++
++static int ext3_link (struct dentry * old_dentry,
++ struct inode * dir, struct dentry *dentry)
++{
++ handle_t *handle;
++ struct inode *inode = old_dentry->d_inode;
++ int err;
++
++ if (S_ISDIR(inode->i_mode))
++ return -EPERM;
++
++ if (inode->i_nlink >= EXT3_LINK_MAX)
++ return -EMLINK;
++
++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS);
++ if (IS_ERR(handle))
++ return PTR_ERR(handle);
++
++ if (IS_SYNC(dir))
++ handle->h_sync = 1;
++
++ inode->i_ctime = CURRENT_TIME;
++ ext3_inc_count(handle, inode);
++ atomic_inc(&inode->i_count);
++
++ ext3_mark_inode_dirty(handle, inode);
++ err = ext3_add_nondir(handle, dentry, inode);
++ ext3_journal_stop(handle, dir);
++ return err;
++}
++
++#define PARENT_INO(buffer) \
++ ((struct ext3_dir_entry_2 *) ((char *) buffer + \
++ le16_to_cpu(((struct ext3_dir_entry_2 *) buffer)->rec_len)))->inode
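
/*
 * What PARENT_INO computes, unrolled: hop over the first entry (".") by
 * its rec_len, landing on the second entry (".."), whose inode is the
 * parent directory. This relies on ".." immediately following "." in the
 * first block, exactly the layout ext3_mkdir() above writes. struct
 * dirent2 is a simplified stand-in for ext3_dir_entry_2.
 */
#include <stdint.h>
#include <stdio.h>

struct dirent2 {
	uint32_t inode;
	uint16_t rec_len;
	uint8_t  name_len;
	uint8_t  file_type;
	char     name[];
};

static uint32_t parent_ino(const char *block)
{
	const struct dirent2 *dot = (const struct dirent2 *)block;
	const struct dirent2 *dotdot =
		(const struct dirent2 *)(block + dot->rec_len);

	return dotdot->inode;
}

int main(void)
{
	char block[64] = { 0 };
	struct dirent2 *de = (struct dirent2 *)block;

	de->inode = 42;			/* "."  : this directory */
	de->rec_len = 12;
	de = (struct dirent2 *)(block + 12);
	de->inode = 2;			/* ".." : the parent (root here) */
	de->rec_len = 52;
	printf("parent=%u\n", (unsigned)parent_ino(block));	/* 2 */
	return 0;
}
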
++
++/*
++ * Anybody can rename anything with this: the permission checks are left to the
++ * higher-level routines.
++ */
++static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
++ struct inode * new_dir,struct dentry *new_dentry)
++{
++ handle_t *handle;
++ struct inode * old_inode, * new_inode;
++ struct buffer_head * old_bh, * new_bh, * dir_bh;
++ struct ext3_dir_entry_2 * old_de, * new_de;
++ int retval;
++
++ old_bh = new_bh = dir_bh = NULL;
++
++ handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS + 2);
++ if (IS_ERR(handle))
++ return PTR_ERR(handle);
++
++ if (IS_SYNC(old_dir) || IS_SYNC(new_dir))
++ handle->h_sync = 1;
++
++ old_bh = ext3_find_entry (old_dentry, &old_de);
++ /*
++ * Check for inode number is _not_ due to possible IO errors.
++ * We might rmdir the source, keep it as pwd of some process
++ * and merrily kill the link to whatever was created under the
++ * same name. Goodbye sticky bit ;-<
++ */
++ old_inode = old_dentry->d_inode;
++ retval = -ENOENT;
++ if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino)
++ goto end_rename;
++
++ new_inode = new_dentry->d_inode;
++ new_bh = ext3_find_entry (new_dentry, &new_de);
++ if (new_bh) {
++ if (!new_inode) {
++ brelse (new_bh);
++ new_bh = NULL;
++ } else {
++ DQUOT_INIT(new_inode);
++ }
++ }
++ if (S_ISDIR(old_inode->i_mode)) {
++ if (new_inode) {
++ retval = -ENOTEMPTY;
++ if (!empty_dir (new_inode))
++ goto end_rename;
++ }
++ retval = -EIO;
++ dir_bh = ext3_bread (handle, old_inode, 0, 0, &retval);
++ if (!dir_bh)
++ goto end_rename;
++ if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
++ goto end_rename;
++ retval = -EMLINK;
++ if (!new_inode && new_dir!=old_dir &&
++ new_dir->i_nlink >= EXT3_LINK_MAX)
++ goto end_rename;
++ }
++ if (!new_bh) {
++ retval = ext3_add_entry (handle, new_dentry, old_inode);
++ if (retval)
++ goto end_rename;
++ } else {
++		BUFFER_TRACE(new_bh, "get_write_access");
++ ext3_journal_get_write_access(handle, new_bh);
++		new_de->inode = cpu_to_le32(old_inode->i_ino);
++ if (EXT3_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
++ EXT3_FEATURE_INCOMPAT_FILETYPE))
++ new_de->file_type = old_de->file_type;
++ new_dir->i_version = ++event;
++ BUFFER_TRACE(new_bh, "call ext3_journal_dirty_metadata");
++ ext3_journal_dirty_metadata(handle, new_bh);
++ brelse(new_bh);
++ new_bh = NULL;
++ }
++
++ /*
++ * Like most other Unix systems, set the ctime for inodes on a
++ * rename.
++ */
++ old_inode->i_ctime = CURRENT_TIME;
++ ext3_mark_inode_dirty(handle, old_inode);
++
++ /*
++ * ok, that's it
++ */
++ ext3_delete_entry(handle, old_dir, old_de, old_bh);
++
++ if (new_inode) {
++ new_inode->i_nlink--;
++ new_inode->i_ctime = CURRENT_TIME;
++ }
++ old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
++ old_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
++ if (dir_bh) {
++ BUFFER_TRACE(dir_bh, "get_write_access");
++ ext3_journal_get_write_access(handle, dir_bh);
++		PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
++ BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata");
++ ext3_journal_dirty_metadata(handle, dir_bh);
++ old_dir->i_nlink--;
++ if (new_inode) {
++ new_inode->i_nlink--;
++ } else {
++ new_dir->i_nlink++;
++ new_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
++ ext3_mark_inode_dirty(handle, new_dir);
++ }
++ }
++ ext3_mark_inode_dirty(handle, old_dir);
++ if (new_inode) {
++ ext3_mark_inode_dirty(handle, new_inode);
++ if (!new_inode->i_nlink)
++ ext3_orphan_add(handle, new_inode);
++ }
++ retval = 0;
++
++end_rename:
++ brelse (dir_bh);
++ brelse (old_bh);
++ brelse (new_bh);
++ ext3_journal_stop(handle, old_dir);
++ return retval;
++}
++
++/*
++ * directories can handle most operations...
++ */
++struct inode_operations ext3_dir_inode_operations = {
++ create: ext3_create, /* BKL held */
++ lookup: ext3_lookup, /* BKL held */
++ link: ext3_link, /* BKL held */
++ unlink: ext3_unlink, /* BKL held */
++ symlink: ext3_symlink, /* BKL held */
++ mkdir: ext3_mkdir, /* BKL held */
++ rmdir: ext3_rmdir, /* BKL held */
++ mknod: ext3_mknod, /* BKL held */
++ rename: ext3_rename, /* BKL held */
++};
+diff -rup --new-file linux.mcp2/fs/ext3/super.c linux_tmp/fs/ext3/super.c
+--- linux.mcp2/fs/ext3/super.c 1969-12-31 16:00:00.000000000 -0800
++++ linux_tmp/fs/ext3/super.c 2002-02-25 11:38:08.000000000 -0800
+@@ -0,0 +1,1753 @@
++/*
++ * linux/fs/ext3/super.c
++ *
++ * Copyright (C) 1992, 1993, 1994, 1995
++ * Remy Card (card@masi.ibp.fr)
++ * Laboratoire MASI - Institut Blaise Pascal
++ * Universite Pierre et Marie Curie (Paris VI)
++ *
++ * from
++ *
++ * linux/fs/minix/inode.c
++ *
++ * Copyright (C) 1991, 1992 Linus Torvalds
++ *
++ * Big-endian to little-endian byte-swapping/bitmaps by
++ * David S. Miller (davem@caip.rutgers.edu), 1995
++ */
++
++#include <linux/config.h>
++#include <linux/module.h>
++#include <linux/string.h>
++#include <linux/fs.h>
++#include <linux/sched.h>
++#include <linux/jbd.h>
++#include <linux/ext3_fs.h>
++#include <linux/ext3_jbd.h>
++#include <linux/slab.h>
++#include <linux/init.h>
++#include <linux/locks.h>
++#include <linux/blkdev.h>
++#include <linux/smp_lock.h>
++#include <linux/random.h>
++#include <asm/uaccess.h>
++
++#ifdef CONFIG_JBD_DEBUG
++static int ext3_ro_after; /* Make fs read-only after this many jiffies */
++#endif
++
++static int ext3_load_journal(struct super_block *, struct ext3_super_block *);
++static int ext3_create_journal(struct super_block *, struct ext3_super_block *,
++ int);
++static void ext3_commit_super (struct super_block * sb,
++ struct ext3_super_block * es,
++ int sync);
++static void ext3_mark_recovery_complete(struct super_block * sb,
++ struct ext3_super_block * es);
++static void ext3_clear_journal_err(struct super_block * sb,
++ struct ext3_super_block * es);
++
++#ifdef CONFIG_JBD_DEBUG
++int journal_no_write[2];
++
++/*
++ * Debug code for turning filesystems "read-only" after a specified
++ * amount of time. This is for crash/recovery testing.
++ */
++
++static void make_rdonly(kdev_t dev, int *no_write)
++{
++ if (dev) {
++ printk(KERN_WARNING "Turning device %s read-only\n",
++ bdevname(dev));
++ *no_write = 0xdead0000 + dev;
++ }
++}
++
++static void turn_fs_readonly(unsigned long arg)
++{
++ struct super_block *sb = (struct super_block *)arg;
++
++ make_rdonly(sb->s_dev, &journal_no_write[0]);
++ make_rdonly(EXT3_SB(sb)->s_journal->j_dev, &journal_no_write[1]);
++ wake_up(&EXT3_SB(sb)->ro_wait_queue);
++}
++
++static void setup_ro_after(struct super_block *sb)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ init_timer(&sbi->turn_ro_timer);
++ if (ext3_ro_after) {
++ printk(KERN_DEBUG "fs will go read-only in %d jiffies\n",
++ ext3_ro_after);
++ init_waitqueue_head(&sbi->ro_wait_queue);
++ journal_no_write[0] = 0;
++ journal_no_write[1] = 0;
++ sbi->turn_ro_timer.function = turn_fs_readonly;
++ sbi->turn_ro_timer.data = (unsigned long)sb;
++ sbi->turn_ro_timer.expires = jiffies + ext3_ro_after;
++ ext3_ro_after = 0;
++ add_timer(&sbi->turn_ro_timer);
++ }
++}
++
++static void clear_ro_after(struct super_block *sb)
++{
++ del_timer_sync(&EXT3_SB(sb)->turn_ro_timer);
++ journal_no_write[0] = 0;
++ journal_no_write[1] = 0;
++ ext3_ro_after = 0;
++}
++#else
++#define setup_ro_after(sb) do {} while (0)
++#define clear_ro_after(sb) do {} while (0)
++#endif
++
++
++static char error_buf[1024];
++
++/* Determine the appropriate response to ext3_error on a given filesystem */
++
++static int ext3_error_behaviour(struct super_block *sb)
++{
++ /* First check for mount-time options */
++ if (test_opt (sb, ERRORS_PANIC))
++ return EXT3_ERRORS_PANIC;
++ if (test_opt (sb, ERRORS_RO))
++ return EXT3_ERRORS_RO;
++ if (test_opt (sb, ERRORS_CONT))
++ return EXT3_ERRORS_CONTINUE;
++
++ /* If no overrides were specified on the mount, then fall back
++ * to the default behaviour set in the filesystem's superblock
++ * on disk. */
++ switch (le16_to_cpu(sb->u.ext3_sb.s_es->s_errors)) {
++ case EXT3_ERRORS_PANIC:
++ return EXT3_ERRORS_PANIC;
++ case EXT3_ERRORS_RO:
++ return EXT3_ERRORS_RO;
++ default:
++ break;
++ }
++ return EXT3_ERRORS_CONTINUE;
++}
++
++/* Deal with the reporting of failure conditions on a filesystem such as
++ * inconsistencies detected or read IO failures.
++ *
++ * On ext2, we can store the error state of the filesystem in the
++ * superblock. That is not possible on ext3, because we may have other
++ * write ordering constraints on the superblock which prevent us from
++ * writing it out straight away; and given that the journal is about to
++ * be aborted, we can't rely on the current, or future, transactions to
++ * write out the superblock safely.
++ *
++ * We'll just use the journal_abort() error code to record an error in
++ * the journal instead. On recovery, the journal will complain about
++ * that error until we've noted it down and cleared it.
++ */
++
++static void ext3_handle_error(struct super_block *sb)
++{
++ struct ext3_super_block *es = EXT3_SB(sb)->s_es;
++
++ EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
++ es->s_state |= cpu_to_le32(EXT3_ERROR_FS);
++
++ if (sb->s_flags & MS_RDONLY)
++ return;
++
++ if (ext3_error_behaviour(sb) != EXT3_ERRORS_CONTINUE) {
++ EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT;
++ journal_abort(EXT3_SB(sb)->s_journal, -EIO);
++ }
++
++ if (ext3_error_behaviour(sb) == EXT3_ERRORS_PANIC)
++ panic ("EXT3-fs (device %s): panic forced after error\n",
++ bdevname(sb->s_dev));
++
++ if (ext3_error_behaviour(sb) == EXT3_ERRORS_RO) {
++ printk (KERN_CRIT "Remounting filesystem read-only\n");
++ sb->s_flags |= MS_RDONLY;
++ }
++
++ ext3_commit_super(sb, es, 1);
++}
++
++void ext3_error (struct super_block * sb, const char * function,
++ const char * fmt, ...)
++{
++ va_list args;
++
++ va_start (args, fmt);
++ vsprintf (error_buf, fmt, args);
++ va_end (args);
++
++ printk (KERN_CRIT "EXT3-fs error (device %s): %s: %s\n",
++ bdevname(sb->s_dev), function, error_buf);
++
++ ext3_handle_error(sb);
++}
++
++const char *ext3_decode_error(struct super_block * sb, int errno, char nbuf[16])
++{
++ char *errstr = NULL;
++
++ switch (errno) {
++ case -EIO:
++ errstr = "IO failure";
++ break;
++ case -ENOMEM:
++ errstr = "Out of memory";
++ break;
++ case -EROFS:
++ if (!sb || EXT3_SB(sb)->s_journal->j_flags & JFS_ABORT)
++ errstr = "Journal has aborted";
++ else
++ errstr = "Readonly filesystem";
++ break;
++ default:
++ /* If the caller passed in an extra buffer for unknown
++ * errors, textualise them now. Else we just return
++ * NULL. */
++ if (nbuf) {
++ /* Check for truncated error codes... */
++ if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
++ errstr = nbuf;
++ }
++
++ break;
++ }
++
++ return errstr;
++}
++
++/* __ext3_std_error decodes expected errors from journaling functions
++ * automatically and invokes the appropriate error response. */
++
++void __ext3_std_error (struct super_block * sb, const char * function,
++ int errno)
++{
++ char nbuf[16];
++ const char *errstr = ext3_decode_error(sb, errno, nbuf);
++
++ printk (KERN_CRIT "EXT3-fs error (device %s) in %s: %s\n",
++ bdevname(sb->s_dev), function, errstr);
++
++ ext3_handle_error(sb);
++}
++
++/*
++ * ext3_abort is a much stronger failure handler than ext3_error. The
++ * abort function may be used to deal with unrecoverable failures such
++ * as journal IO errors or ENOMEM at a critical moment in log management.
++ *
++ * We unconditionally force the filesystem into an ABORT|READONLY state,
++ * unless the error response on the fs has been set to panic in which
++ * case we take the easy way out and panic immediately.
++ */
++
++void ext3_abort (struct super_block * sb, const char * function,
++ const char * fmt, ...)
++{
++ va_list args;
++
++ printk (KERN_CRIT "ext3_abort called.\n");
++
++ va_start (args, fmt);
++ vsprintf (error_buf, fmt, args);
++ va_end (args);
++
++ if (ext3_error_behaviour(sb) == EXT3_ERRORS_PANIC)
++ panic ("EXT3-fs panic (device %s): %s: %s\n",
++ bdevname(sb->s_dev), function, error_buf);
++
++ printk (KERN_CRIT "EXT3-fs abort (device %s): %s: %s\n",
++ bdevname(sb->s_dev), function, error_buf);
++
++ if (sb->s_flags & MS_RDONLY)
++ return;
++
++ printk (KERN_CRIT "Remounting filesystem read-only\n");
++ sb->u.ext3_sb.s_mount_state |= EXT3_ERROR_FS;
++ sb->s_flags |= MS_RDONLY;
++ sb->u.ext3_sb.s_mount_opt |= EXT3_MOUNT_ABORT;
++ journal_abort(EXT3_SB(sb)->s_journal, -EIO);
++}
++
++/* Deal with the reporting of failure conditions while running, such as
++ * inconsistencies in operation or invalid system states.
++ *
++ * Use ext3_error() for cases of invalid filesystem states, as that will
++ * record an error on disk and force a filesystem check on the next boot.
++ */
++NORET_TYPE void ext3_panic (struct super_block * sb, const char * function,
++ const char * fmt, ...)
++{
++ va_list args;
++
++ va_start (args, fmt);
++ vsprintf (error_buf, fmt, args);
++ va_end (args);
++
++ /* this is to prevent panic from syncing this filesystem */
++ /* AKPM: is this sufficient? */
++ sb->s_flags |= MS_RDONLY;
++ panic ("EXT3-fs panic (device %s): %s: %s\n",
++ bdevname(sb->s_dev), function, error_buf);
++}
++
++void ext3_warning (struct super_block * sb, const char * function,
++ const char * fmt, ...)
++{
++ va_list args;
++
++ va_start (args, fmt);
++ vsprintf (error_buf, fmt, args);
++ va_end (args);
++ printk (KERN_WARNING "EXT3-fs warning (device %s): %s: %s\n",
++ bdevname(sb->s_dev), function, error_buf);
++}
++
++void ext3_update_dynamic_rev(struct super_block *sb)
++{
++ struct ext3_super_block *es = EXT3_SB(sb)->s_es;
++
++ if (le32_to_cpu(es->s_rev_level) > EXT3_GOOD_OLD_REV)
++ return;
++
++ ext3_warning(sb, __FUNCTION__,
++ "updating to rev %d because of new feature flag, "
++ "running e2fsck is recommended",
++ EXT3_DYNAMIC_REV);
++
++ es->s_first_ino = cpu_to_le32(EXT3_GOOD_OLD_FIRST_INO);
++ es->s_inode_size = cpu_to_le16(EXT3_GOOD_OLD_INODE_SIZE);
++ es->s_rev_level = cpu_to_le32(EXT3_DYNAMIC_REV);
++ /* leave es->s_feature_*compat flags alone */
++ /* es->s_uuid will be set by e2fsck if empty */
++
++ /*
++ * The rest of the superblock fields should be zero, and if not it
++ * means they are likely already in use, so leave them alone. We
++ * can leave it up to e2fsck to clean up any inconsistencies there.
++ */
++}
++
++/*
++ * Open the external journal device
++ */
++static struct block_device *ext3_blkdev_get(kdev_t dev)
++{
++ struct block_device *bdev;
++ int err = -ENODEV;
++
++ bdev = bdget(kdev_t_to_nr(dev));
++ if (bdev == NULL)
++ goto fail;
++ err = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_FS);
++ if (err < 0)
++ goto fail;
++ return bdev;
++
++fail:
++ printk(KERN_ERR "EXT3: failed to open journal device %s: %d\n",
++ bdevname(dev), err);
++ return NULL;
++}
++
++/*
++ * Release the journal device
++ */
++static int ext3_blkdev_put(struct block_device *bdev)
++{
++ return blkdev_put(bdev, BDEV_FS);
++}
++
++static int ext3_blkdev_remove(struct ext3_sb_info *sbi)
++{
++ struct block_device *bdev;
++ int ret = -ENODEV;
++
++ bdev = sbi->journal_bdev;
++ if (bdev) {
++ ret = ext3_blkdev_put(bdev);
++ sbi->journal_bdev = 0;
++ }
++ return ret;
++}
++
++#define orphan_list_entry(l) list_entry((l), struct inode, u.ext3_i.i_orphan)
++
++static void dump_orphan_list(struct super_block *sb, struct ext3_sb_info *sbi)
++{
++ struct list_head *l;
++
++ printk(KERN_ERR "sb orphan head is %d\n",
++ le32_to_cpu(sbi->s_es->s_last_orphan));
++
++ printk(KERN_ERR "sb_info orphan list:\n");
++ list_for_each(l, &sbi->s_orphan) {
++ struct inode *inode = orphan_list_entry(l);
++ printk(KERN_ERR " "
++ "inode 0x%04x:%ld at %p: mode %o, nlink %d, next %d\n",
++ inode->i_dev, inode->i_ino, inode,
++ inode->i_mode, inode->i_nlink,
++ le32_to_cpu(NEXT_ORPHAN(inode)));
++ }
++}
++
++void ext3_put_super (struct super_block * sb)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ struct ext3_super_block *es = sbi->s_es;
++ kdev_t j_dev = sbi->s_journal->j_dev;
++ int i;
++
++ journal_destroy(sbi->s_journal);
++ if (!(sb->s_flags & MS_RDONLY)) {
++ EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
++		es->s_state = cpu_to_le16(sbi->s_mount_state);
++ BUFFER_TRACE(sbi->s_sbh, "marking dirty");
++ mark_buffer_dirty(sbi->s_sbh);
++ ext3_commit_super(sb, es, 1);
++ }
++
++ for (i = 0; i < sbi->s_gdb_count; i++)
++ brelse(sbi->s_group_desc[i]);
++ kfree(sbi->s_group_desc);
++ for (i = 0; i < EXT3_MAX_GROUP_LOADED; i++)
++ brelse(sbi->s_inode_bitmap[i]);
++ for (i = 0; i < EXT3_MAX_GROUP_LOADED; i++)
++ brelse(sbi->s_block_bitmap[i]);
++ brelse(sbi->s_sbh);
++
++ /* Debugging code just in case the in-memory inode orphan list
++ * isn't empty. The on-disk one can be non-empty if we've
++ * detected an error and taken the fs readonly, but the
++ * in-memory list had better be clean by this point. */
++ if (!list_empty(&sbi->s_orphan))
++ dump_orphan_list(sb, sbi);
++ J_ASSERT(list_empty(&sbi->s_orphan));
++
++ invalidate_buffers(sb->s_dev);
++ if (j_dev != sb->s_dev) {
++ /*
++ * Invalidate the journal device's buffers. We don't want them
++	 * floating about in memory - the physical journal device may be
++	 * hotswapped, and it breaks the `ro-after' testing code.
++ */
++ fsync_no_super(j_dev);
++ invalidate_buffers(j_dev);
++ ext3_blkdev_remove(sbi);
++ }
++ clear_ro_after(sb);
++
++ return;
++}
++
++static struct super_operations ext3_sops = {
++ read_inode: ext3_read_inode, /* BKL held */
++ write_inode: ext3_write_inode, /* BKL not held. Don't need */
++ dirty_inode: ext3_dirty_inode, /* BKL not held. We take it */
++ put_inode: ext3_put_inode, /* BKL not held. Don't need */
++ delete_inode: ext3_delete_inode, /* BKL not held. We take it */
++ put_super: ext3_put_super, /* BKL held */
++ write_super: ext3_write_super, /* BKL held */
++ write_super_lockfs: ext3_write_super_lockfs, /* BKL not held. Take it */
++ unlockfs: ext3_unlockfs, /* BKL not held. We take it */
++ statfs: ext3_statfs, /* BKL held */
++ remount_fs: ext3_remount, /* BKL held */
++};
++
++static int want_value(char *value, char *option)
++{
++ if (!value || !*value) {
++ printk(KERN_NOTICE "EXT3-fs: the %s option needs an argument\n",
++ option);
++ return -1;
++ }
++ return 0;
++}
++
++static int want_null_value(char *value, char *option)
++{
++ if (*value) {
++ printk(KERN_NOTICE "EXT3-fs: Invalid %s argument: %s\n",
++ option, value);
++ return -1;
++ }
++ return 0;
++}
++
++static int want_numeric(char *value, char *option, unsigned long *number)
++{
++ if (want_value(value, option))
++ return -1;
++ *number = simple_strtoul(value, &value, 0);
++ if (want_null_value(value, option))
++ return -1;
++ return 0;
++}
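
/*
 * A userspace analogue of the want_value()/want_numeric() pair above:
 * parse a number and insist the entire string was consumed, rejecting
 * both a missing argument and trailing junk. strtoul() with base 0
 * accepts the same decimal/octal/hex forms as simple_strtoul().
 */
#include <stdio.h>
#include <stdlib.h>

static int parse_ulong(const char *value, unsigned long *out)
{
	char *end;

	if (!value || !*value)
		return -1;		/* option needs an argument */
	*out = strtoul(value, &end, 0);
	if (*end)
		return -1;		/* trailing junk after the number */
	return 0;
}

int main(void)
{
	unsigned long v = 0;

	printf("%d\n", parse_ulong("0x20", &v));	/* 0, v == 32 */
	printf("%d\n", parse_ulong("20abc", &v));	/* -1 */
	return 0;
}
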
++
++/*
++ * This function has been shamelessly adapted from the msdos fs
++ */
++static int parse_options (char * options, unsigned long * sb_block,
++ struct ext3_sb_info *sbi,
++ unsigned long * inum,
++ int is_remount)
++{
++ unsigned long *mount_options = &sbi->s_mount_opt;
++ uid_t *resuid = &sbi->s_resuid;
++ gid_t *resgid = &sbi->s_resgid;
++ char * this_char;
++ char * value;
++
++ if (!options)
++ return 1;
++ for (this_char = strtok (options, ",");
++ this_char != NULL;
++ this_char = strtok (NULL, ",")) {
++ if ((value = strchr (this_char, '=')) != NULL)
++ *value++ = 0;
++ if (!strcmp (this_char, "bsddf"))
++ clear_opt (*mount_options, MINIX_DF);
++ else if (!strcmp (this_char, "nouid32")) {
++ set_opt (*mount_options, NO_UID32);
++ }
++ else if (!strcmp (this_char, "abort"))
++ set_opt (*mount_options, ABORT);
++ else if (!strcmp (this_char, "check")) {
++ if (!value || !*value || !strcmp (value, "none"))
++ clear_opt (*mount_options, CHECK);
++ else
++#ifdef CONFIG_EXT3_CHECK
++ set_opt (*mount_options, CHECK);
++#else
++ printk(KERN_ERR
++ "EXT3 Check option not supported\n");
++#endif
++ }
++ else if (!strcmp (this_char, "debug"))
++ set_opt (*mount_options, DEBUG);
++ else if (!strcmp (this_char, "errors")) {
++ if (want_value(value, "errors"))
++ return 0;
++ if (!strcmp (value, "continue")) {
++ clear_opt (*mount_options, ERRORS_RO);
++ clear_opt (*mount_options, ERRORS_PANIC);
++ set_opt (*mount_options, ERRORS_CONT);
++ }
++ else if (!strcmp (value, "remount-ro")) {
++ clear_opt (*mount_options, ERRORS_CONT);
++ clear_opt (*mount_options, ERRORS_PANIC);
++ set_opt (*mount_options, ERRORS_RO);
++ }
++ else if (!strcmp (value, "panic")) {
++ clear_opt (*mount_options, ERRORS_CONT);
++ clear_opt (*mount_options, ERRORS_RO);
++ set_opt (*mount_options, ERRORS_PANIC);
++ }
++ else {
++ printk (KERN_ERR
++ "EXT3-fs: Invalid errors option: %s\n",
++ value);
++ return 0;
++ }
++ }
++ else if (!strcmp (this_char, "grpid") ||
++ !strcmp (this_char, "bsdgroups"))
++ set_opt (*mount_options, GRPID);
++ else if (!strcmp (this_char, "minixdf"))
++ set_opt (*mount_options, MINIX_DF);
++ else if (!strcmp (this_char, "nocheck"))
++ clear_opt (*mount_options, CHECK);
++ else if (!strcmp (this_char, "nogrpid") ||
++ !strcmp (this_char, "sysvgroups"))
++ clear_opt (*mount_options, GRPID);
++ else if (!strcmp (this_char, "resgid")) {
++ unsigned long v;
++ if (want_numeric(value, "resgid", &v))
++ return 0;
++ *resgid = v;
++ }
++ else if (!strcmp (this_char, "resuid")) {
++ unsigned long v;
++ if (want_numeric(value, "resuid", &v))
++ return 0;
++ *resuid = v;
++ }
++ else if (!strcmp (this_char, "sb")) {
++ if (want_numeric(value, "sb", sb_block))
++ return 0;
++ }
++#ifdef CONFIG_JBD_DEBUG
++ else if (!strcmp (this_char, "ro-after")) {
++ unsigned long v;
++ if (want_numeric(value, "ro-after", &v))
++ return 0;
++ ext3_ro_after = v;
++ }
++#endif
++ /* Silently ignore the quota options */
++ else if (!strcmp (this_char, "grpquota")
++ || !strcmp (this_char, "noquota")
++ || !strcmp (this_char, "quota")
++ || !strcmp (this_char, "usrquota"))
++ /* Don't do anything ;-) */ ;
++ else if (!strcmp (this_char, "journal")) {
++ /* @@@ FIXME */
++ /* Eventually we will want to be able to create
++ a journal file here. For now, only allow the
++ user to specify an existing inode to be the
++ journal file. */
++ if (is_remount) {
++ printk(KERN_ERR "EXT3-fs: cannot specify "
++ "journal on remount\n");
++ return 0;
++ }
++
++ if (want_value(value, "journal"))
++ return 0;
++ if (!strcmp (value, "update"))
++ set_opt (*mount_options, UPDATE_JOURNAL);
++ else if (want_numeric(value, "journal", inum))
++ return 0;
++ }
++ else if (!strcmp (this_char, "noload"))
++ set_opt (*mount_options, NOLOAD);
++ else if (!strcmp (this_char, "data")) {
++ int data_opt = 0;
++
++ if (want_value(value, "data"))
++ return 0;
++ if (!strcmp (value, "journal"))
++ data_opt = EXT3_MOUNT_JOURNAL_DATA;
++ else if (!strcmp (value, "ordered"))
++ data_opt = EXT3_MOUNT_ORDERED_DATA;
++ else if (!strcmp (value, "writeback"))
++ data_opt = EXT3_MOUNT_WRITEBACK_DATA;
++ else {
++ printk (KERN_ERR
++ "EXT3-fs: Invalid data option: %s\n",
++ value);
++ return 0;
++ }
++ if (is_remount) {
++ if ((*mount_options & EXT3_MOUNT_DATA_FLAGS) !=
++ data_opt) {
++ printk(KERN_ERR
++ "EXT3-fs: cannot change data "
++ "mode on remount\n");
++ return 0;
++ }
++ } else {
++ *mount_options &= ~EXT3_MOUNT_DATA_FLAGS;
++ *mount_options |= data_opt;
++ }
++ } else {
++ printk (KERN_ERR
++ "EXT3-fs: Unrecognized mount option %s\n",
++ this_char);
++ return 0;
++ }
++ }
++ return 1;
++}
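
/*
 * The option walk above is strtok()-based: split the mount string on
 * commas, then split each token at '=' in place. A standalone sketch of
 * just that parsing skeleton, with the ext3-specific option matching
 * left out:
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char opts[] = "errors=remount-ro,noload,sb=8193";
	char *tok, *val;

	for (tok = strtok(opts, ","); tok != NULL; tok = strtok(NULL, ",")) {
		if ((val = strchr(tok, '=')) != NULL)
			*val++ = '\0';		/* key and value now split */
		printf("option '%s' value '%s'\n", tok, val ? val : "");
	}
	return 0;
}
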
++
++static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
++ int read_only)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ int res = 0;
++
++ if (le32_to_cpu(es->s_rev_level) > EXT3_MAX_SUPP_REV) {
++ printk (KERN_ERR "EXT3-fs warning: revision level too high, "
++ "forcing read-only mode\n");
++ res = MS_RDONLY;
++ }
++ if (read_only)
++ return res;
++ if (!(sbi->s_mount_state & EXT3_VALID_FS))
++ printk (KERN_WARNING "EXT3-fs warning: mounting unchecked fs, "
++ "running e2fsck is recommended\n");
++ else if ((sbi->s_mount_state & EXT3_ERROR_FS))
++ printk (KERN_WARNING
++ "EXT3-fs warning: mounting fs with errors, "
++ "running e2fsck is recommended\n");
++ else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
++ le16_to_cpu(es->s_mnt_count) >=
++ (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
++ printk (KERN_WARNING
++ "EXT3-fs warning: maximal mount count reached, "
++ "running e2fsck is recommended\n");
++ else if (le32_to_cpu(es->s_checkinterval) &&
++ (le32_to_cpu(es->s_lastcheck) +
++ le32_to_cpu(es->s_checkinterval) <= CURRENT_TIME))
++ printk (KERN_WARNING
++ "EXT3-fs warning: checktime reached, "
++ "running e2fsck is recommended\n");
++#if 0
++ /* @@@ We _will_ want to clear the valid bit if we find
++ inconsistencies, to force a fsck at reboot. But for
++ a plain journaled filesystem we can keep it set as
++ valid forever! :) */
++ es->s_state = cpu_to_le16(le16_to_cpu(es->s_state) & ~EXT3_VALID_FS);
++#endif
++ if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
++ es->s_max_mnt_count =
++ (__s16) cpu_to_le16(EXT3_DFL_MAX_MNT_COUNT);
++ es->s_mnt_count=cpu_to_le16(le16_to_cpu(es->s_mnt_count) + 1);
++ es->s_mtime = cpu_to_le32(CURRENT_TIME);
++ ext3_update_dynamic_rev(sb);
++ EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
++ ext3_commit_super (sb, es, 1);
++ if (test_opt (sb, DEBUG))
++ printk (KERN_INFO
++ "[EXT3 FS %s, %s, bs=%lu, gc=%lu, "
++ "bpg=%lu, ipg=%lu, mo=%04lx]\n",
++ EXT3FS_VERSION, EXT3FS_DATE, sb->s_blocksize,
++ sbi->s_groups_count,
++ EXT3_BLOCKS_PER_GROUP(sb),
++ EXT3_INODES_PER_GROUP(sb),
++ sbi->s_mount_opt);
++ printk(KERN_INFO "EXT3 FS " EXT3FS_VERSION ", " EXT3FS_DATE " on %s, ",
++ bdevname(sb->s_dev));
++ if (EXT3_SB(sb)->s_journal->j_inode == NULL) {
++ printk("external journal on %s\n",
++ bdevname(EXT3_SB(sb)->s_journal->j_dev));
++ } else {
++ printk("internal journal\n");
++ }
++#ifdef CONFIG_EXT3_CHECK
++ if (test_opt (sb, CHECK)) {
++ ext3_check_blocks_bitmap (sb);
++ ext3_check_inodes_bitmap (sb);
++ }
++#endif
++ setup_ro_after(sb);
++ return res;
++}
++
++static int ext3_check_descriptors (struct super_block * sb)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ unsigned long block = le32_to_cpu(sbi->s_es->s_first_data_block);
++ struct ext3_group_desc * gdp = NULL;
++ int desc_block = 0;
++ int i;
++
++ ext3_debug ("Checking group descriptors");
++
++ for (i = 0; i < sbi->s_groups_count; i++)
++ {
++ if ((i % EXT3_DESC_PER_BLOCK(sb)) == 0)
++ gdp = (struct ext3_group_desc *)
++ sbi->s_group_desc[desc_block++]->b_data;
++ if (le32_to_cpu(gdp->bg_block_bitmap) < block ||
++ le32_to_cpu(gdp->bg_block_bitmap) >=
++ block + EXT3_BLOCKS_PER_GROUP(sb))
++ {
++ ext3_error (sb, "ext3_check_descriptors",
++ "Block bitmap for group %d"
++ " not in group (block %lu)!",
++ i, (unsigned long)
++ le32_to_cpu(gdp->bg_block_bitmap));
++ return 0;
++ }
++ if (le32_to_cpu(gdp->bg_inode_bitmap) < block ||
++ le32_to_cpu(gdp->bg_inode_bitmap) >=
++ block + EXT3_BLOCKS_PER_GROUP(sb))
++ {
++ ext3_error (sb, "ext3_check_descriptors",
++ "Inode bitmap for group %d"
++ " not in group (block %lu)!",
++ i, (unsigned long)
++ le32_to_cpu(gdp->bg_inode_bitmap));
++ return 0;
++ }
++ if (le32_to_cpu(gdp->bg_inode_table) < block ||
++ le32_to_cpu(gdp->bg_inode_table) + sbi->s_itb_per_group >=
++ block + EXT3_BLOCKS_PER_GROUP(sb))
++ {
++ ext3_error (sb, "ext3_check_descriptors",
++ "Inode table for group %d"
++ " not in group (block %lu)!",
++ i, (unsigned long)
++ le32_to_cpu(gdp->bg_inode_table));
++ return 0;
++ }
++ block += EXT3_BLOCKS_PER_GROUP(sb);
++ gdp++;
++ }
++ return 1;
++}
++
++
++/* ext3_orphan_cleanup() walks a singly-linked list of inodes (starting at
++ * the superblock) which were deleted from all directories, but held open by
++ * a process at the time of a crash. We walk the list and try to delete these
++ * inodes at recovery time (only with a read-write filesystem).
++ *
++ * In order to keep the orphan inode chain consistent during traversal (in
++ * case of crash during recovery), we link each inode into the superblock
++ * orphan list_head and handle it the same way as an inode deletion during
++ * normal operation (which journals the operations for us).
++ *
++ * We only do an iget() and an iput() on each inode, which is very safe if we
++ * accidentally point at an in-use or already deleted inode. The worst that
++ * can happen in this case is that we get a "bit already cleared" message from
++ * ext3_free_inode(). The only reason we would point at a wrong inode is if
++ * e2fsck was run on this filesystem, and it must have already done the orphan
++ * inode cleanup for us, so we can safely abort without any further action.
++ */
++static void ext3_orphan_cleanup (struct super_block * sb,
++ struct ext3_super_block * es)
++{
++ unsigned int s_flags = sb->s_flags;
++ int nr_orphans = 0, nr_truncates = 0;
++ if (!es->s_last_orphan) {
++ jbd_debug(4, "no orphan inodes to clean up\n");
++ return;
++ }
++
++ if (s_flags & MS_RDONLY) {
++ printk(KERN_INFO "EXT3-fs: %s: orphan cleanup on readonly fs\n",
++ bdevname(sb->s_dev));
++ sb->s_flags &= ~MS_RDONLY;
++ }
++
++ if (sb->u.ext3_sb.s_mount_state & EXT3_ERROR_FS) {
++ if (es->s_last_orphan)
++ jbd_debug(1, "Errors on filesystem, "
++ "clearing orphan list.\n");
++ es->s_last_orphan = 0;
++ jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
++ return;
++ }
++
++ while (es->s_last_orphan) {
++ struct inode *inode;
++
++ if (!(inode =
++ ext3_orphan_get(sb, le32_to_cpu(es->s_last_orphan)))) {
++ es->s_last_orphan = 0;
++ break;
++ }
++
++ list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan);
++ if (inode->i_nlink) {
++ printk(KERN_DEBUG __FUNCTION__
++ ": truncating inode %ld to %Ld bytes\n",
++ inode->i_ino, inode->i_size);
++ jbd_debug(2, "truncating inode %ld to %Ld bytes\n",
++ inode->i_ino, inode->i_size);
++ ext3_truncate(inode);
++ nr_truncates++;
++ } else {
++ printk(KERN_DEBUG __FUNCTION__
++ ": deleting unreferenced inode %ld\n",
++ inode->i_ino);
++ jbd_debug(2, "deleting unreferenced inode %ld\n",
++ inode->i_ino);
++ nr_orphans++;
++ }
++ iput(inode); /* The delete magic happens here! */
++ }
++
++#define PLURAL(x) (x), ((x)==1) ? "" : "s"
++
++ if (nr_orphans)
++ printk(KERN_INFO "EXT3-fs: %s: %d orphan inode%s deleted\n",
++ bdevname(sb->s_dev), PLURAL(nr_orphans));
++ if (nr_truncates)
++ printk(KERN_INFO "EXT3-fs: %s: %d truncate%s cleaned up\n",
++ bdevname(sb->s_dev), PLURAL(nr_truncates));
++ sb->s_flags = s_flags; /* Restore MS_RDONLY status */
++}
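
On disk the orphan list is singly linked: the superblock's s_last_orphan holds the most recently orphaned inode number, and each orphan stores its successor's inode number inline (ext3 reuses the on-disk i_dtime field for this link). A toy walk of such a chain, with invented inode numbers and a plain array standing in for iget():

    #include <stdio.h>

    struct fake_inode { unsigned long ino; unsigned long next_orphan; };

    int main(void)
    {
            struct fake_inode table[] = { { 12, 0 }, { 57, 12 }, { 99, 57 } };
            unsigned long last_orphan = 99;   /* plays es->s_last_orphan */

            while (last_orphan) {
                    int i, found = -1;
                    for (i = 0; i < 3; i++)
                            if (table[i].ino == last_orphan)
                                    found = i;
                    if (found < 0)
                            break;            /* broken chain: stop */
                    printf("cleaning orphan inode %lu\n", table[found].ino);
                    last_orphan = table[found].next_orphan;
            }
            return 0;
    }
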
++
++#define log2(n) ffz(~(n))
++
++/*
++ * Maximal file size. There is a direct, and {,double-,triple-}indirect
++ * block limit, and also a limit of (2^32 - 1) 512-byte sectors in i_blocks.
++ * We need to be 1 filesystem block less than the 2^32 sector limit.
++ */
++static loff_t ext3_max_size(int bits)
++{
++ loff_t res = EXT3_NDIR_BLOCKS;
++ res += 1LL << (bits-2);
++ res += 1LL << (2*(bits-2));
++ res += 1LL << (3*(bits-2));
++ res <<= bits;
++ if (res > (512LL << 32) - (1 << bits))
++ res = (512LL << 32) - (1 << bits);
++ return res;
++}
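
For 4096-byte blocks (bits = 12) the direct/indirect sum alone would allow roughly 4 TiB, so the i_blocks sector limit wins and the result is 2^41 - 4096 bytes, i.e. 2 TiB minus one block. A standalone reproduction of the arithmetic (assuming the usual 12 direct blocks for EXT3_NDIR_BLOCKS):

    #include <stdio.h>

    #define EXT3_NDIR_BLOCKS 12

    static long long max_size(int bits)
    {
            long long res = EXT3_NDIR_BLOCKS;
            res += 1LL << (bits - 2);            /* indirect */
            res += 1LL << (2 * (bits - 2));      /* double indirect */
            res += 1LL << (3 * (bits - 2));      /* triple indirect */
            res <<= bits;                        /* blocks -> bytes */
            if (res > (512LL << 32) - (1 << bits))
                    res = (512LL << 32) - (1 << bits);
            return res;
    }

    int main(void)
    {
            printf("%lld\n", max_size(12));      /* 2199023251456 */
            return 0;
    }
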
++
++struct super_block * ext3_read_super (struct super_block * sb, void * data,
++ int silent)
++{
++ struct buffer_head * bh;
++ struct ext3_super_block *es = 0;
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ unsigned long sb_block = 1;
++ unsigned long logic_sb_block = 1;
++ unsigned long offset = 0;
++ unsigned long journal_inum = 0;
++ kdev_t dev = sb->s_dev;
++ int blocksize;
++ int hblock;
++ int db_count;
++ int i;
++ int needs_recovery;
++
++#ifdef CONFIG_JBD_DEBUG
++ ext3_ro_after = 0;
++#endif
++ /*
++ * See what the current blocksize for the device is, and
++ * use that as the blocksize. Otherwise (or if the blocksize
++ * is smaller than the default) use the default.
++ * This is important for devices that have a hardware
++ * sectorsize that is larger than the default.
++ */
++ blocksize = EXT3_MIN_BLOCK_SIZE;
++ hblock = get_hardsect_size(dev);
++ if (blocksize < hblock)
++ blocksize = hblock;
++
++ sbi->s_mount_opt = 0;
++ sbi->s_resuid = EXT3_DEF_RESUID;
++ sbi->s_resgid = EXT3_DEF_RESGID;
++ if (!parse_options ((char *) data, &sb_block, sbi, &journal_inum, 0)) {
++ sb->s_dev = 0;
++ goto out_fail;
++ }
++
++ sb->s_blocksize = blocksize;
++ set_blocksize (dev, blocksize);
++
++ /*
++ * The ext3 superblock will not be buffer aligned for other than 1kB
++ * block sizes. We need to calculate the offset from buffer start.
++ */
++ if (blocksize != EXT3_MIN_BLOCK_SIZE) {
++ logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize;
++ offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize;
++ }
++
++ if (!(bh = sb_bread(sb, logic_sb_block))) {
++ printk (KERN_ERR "EXT3-fs: unable to read superblock\n");
++ goto out_fail;
++ }
++ /*
++ * Note: s_es must be initialized as soon as possible because
++ * some ext3 macro-instructions depend on its value
++ */
++ es = (struct ext3_super_block *) (((char *)bh->b_data) + offset);
++ sbi->s_es = es;
++ sb->s_magic = le16_to_cpu(es->s_magic);
++ if (sb->s_magic != EXT3_SUPER_MAGIC) {
++ if (!silent)
++ printk(KERN_ERR
++ "VFS: Can't find ext3 filesystem on dev %s.\n",
++ bdevname(dev));
++ goto failed_mount;
++ }
++ if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV &&
++ (EXT3_HAS_COMPAT_FEATURE(sb, ~0U) ||
++ EXT3_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
++ EXT3_HAS_INCOMPAT_FEATURE(sb, ~0U)))
++ printk(KERN_WARNING
++ "EXT3-fs warning: feature flags set on rev 0 fs, "
++ "running e2fsck is recommended\n");
++ /*
++ * Check feature flags regardless of the revision level, since we
++ * previously didn't change the revision level when setting the flags,
++ * so there is a chance incompat flags are set on a rev 0 filesystem.
++ */
++ if ((i = EXT3_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP))) {
++ printk(KERN_ERR "EXT3-fs: %s: couldn't mount because of "
++ "unsupported optional features (%x).\n",
++ bdevname(dev), i);
++ goto failed_mount;
++ }
++ if (!(sb->s_flags & MS_RDONLY) &&
++ (i = EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP))){
++ printk(KERN_ERR "EXT3-fs: %s: couldn't mount RDWR because of "
++ "unsupported optional features (%x).\n",
++ bdevname(dev), i);
++ goto failed_mount;
++ }
++ sb->s_blocksize_bits = le32_to_cpu(es->s_log_block_size) + 10;
++ sb->s_blocksize = 1 << sb->s_blocksize_bits;
++
++ if (sb->s_blocksize < EXT3_MIN_BLOCK_SIZE ||
++ sb->s_blocksize > EXT3_MAX_BLOCK_SIZE) {
++ printk(KERN_ERR
++ "EXT3-fs: Unsupported filesystem blocksize %d on %s.\n",
++ blocksize, bdevname(dev));
++ goto failed_mount;
++ }
++
++ sb->s_maxbytes = ext3_max_size(sb->s_blocksize_bits);
++
++ if (sb->s_blocksize != blocksize) {
++ blocksize = sb->s_blocksize;
++
++ /*
++ * Make sure the blocksize for the filesystem is larger
++ * than the hardware sectorsize for the machine.
++ */
++ if (sb->s_blocksize < hblock) {
++ printk(KERN_ERR "EXT3-fs: blocksize %d too small for "
++ "device blocksize %d.\n", blocksize, hblock);
++ goto failed_mount;
++ }
++
++ brelse (bh);
++ set_blocksize (dev, sb->s_blocksize);
++ logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize;
++ offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize;
++ bh = sb_bread(sb, logic_sb_block);
++ if (!bh) {
++ printk(KERN_ERR
++ "EXT3-fs: Can't read superblock on 2nd try.\n");
++ return NULL;
++ }
++ es = (struct ext3_super_block *)(((char *)bh->b_data) + offset);
++ sbi->s_es = es;
++ if (es->s_magic != le16_to_cpu(EXT3_SUPER_MAGIC)) {
++ printk (KERN_ERR
++ "EXT3-fs: Magic mismatch, very weird !\n");
++ goto failed_mount;
++ }
++ }
++
++ if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV) {
++ sbi->s_inode_size = EXT3_GOOD_OLD_INODE_SIZE;
++ sbi->s_first_ino = EXT3_GOOD_OLD_FIRST_INO;
++ } else {
++ sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
++ sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
++ if (sbi->s_inode_size != EXT3_GOOD_OLD_INODE_SIZE) {
++ printk (KERN_ERR
++ "EXT3-fs: unsupported inode size: %d\n",
++ sbi->s_inode_size);
++ goto failed_mount;
++ }
++ }
++ sbi->s_frag_size = EXT3_MIN_FRAG_SIZE <<
++ le32_to_cpu(es->s_log_frag_size);
++ if (blocksize != sbi->s_frag_size) {
++ printk(KERN_ERR
++ "EXT3-fs: fragsize %lu != blocksize %u (unsupported)\n",
++ sbi->s_frag_size, blocksize);
++ goto failed_mount;
++ }
++ sbi->s_frags_per_block = 1;
++ sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
++ sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group);
++ sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
++ sbi->s_inodes_per_block = blocksize / EXT3_INODE_SIZE(sb);
++	sbi->s_itb_per_group = sbi->s_inodes_per_group / sbi->s_inodes_per_block;
++ sbi->s_desc_per_block = blocksize / sizeof(struct ext3_group_desc);
++ sbi->s_sbh = bh;
++ if (sbi->s_resuid == EXT3_DEF_RESUID)
++ sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
++ if (sbi->s_resgid == EXT3_DEF_RESGID)
++ sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
++ sbi->s_mount_state = le16_to_cpu(es->s_state);
++ sbi->s_addr_per_block_bits = log2(EXT3_ADDR_PER_BLOCK(sb));
++ sbi->s_desc_per_block_bits = log2(EXT3_DESC_PER_BLOCK(sb));
++
++ if (sbi->s_blocks_per_group > blocksize * 8) {
++ printk (KERN_ERR
++ "EXT3-fs: #blocks per group too big: %lu\n",
++ sbi->s_blocks_per_group);
++ goto failed_mount;
++ }
++ if (sbi->s_frags_per_group > blocksize * 8) {
++ printk (KERN_ERR
++ "EXT3-fs: #fragments per group too big: %lu\n",
++ sbi->s_frags_per_group);
++ goto failed_mount;
++ }
++ if (sbi->s_inodes_per_group > blocksize * 8) {
++ printk (KERN_ERR
++ "EXT3-fs: #inodes per group too big: %lu\n",
++ sbi->s_inodes_per_group);
++ goto failed_mount;
++ }
++
++ sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) -
++ le32_to_cpu(es->s_first_data_block) +
++ EXT3_BLOCKS_PER_GROUP(sb) - 1) /
++ EXT3_BLOCKS_PER_GROUP(sb);
++ db_count = (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) /
++ EXT3_DESC_PER_BLOCK(sb);
++ sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *),
++ GFP_KERNEL);
++ if (sbi->s_group_desc == NULL) {
++ printk (KERN_ERR "EXT3-fs: not enough memory\n");
++ goto failed_mount;
++ }
++ for (i = 0; i < db_count; i++) {
++ sbi->s_group_desc[i] = sb_bread(sb, logic_sb_block + i + 1);
++ if (!sbi->s_group_desc[i]) {
++ printk (KERN_ERR "EXT3-fs: "
++ "can't read group descriptor %d\n", i);
++ db_count = i;
++ goto failed_mount2;
++ }
++ }
++ if (!ext3_check_descriptors (sb)) {
++ printk (KERN_ERR "EXT3-fs: group descriptors corrupted !\n");
++ goto failed_mount2;
++ }
++ for (i = 0; i < EXT3_MAX_GROUP_LOADED; i++) {
++ sbi->s_inode_bitmap_number[i] = 0;
++ sbi->s_inode_bitmap[i] = NULL;
++ sbi->s_block_bitmap_number[i] = 0;
++ sbi->s_block_bitmap[i] = NULL;
++ }
++ sbi->s_loaded_inode_bitmaps = 0;
++ sbi->s_loaded_block_bitmaps = 0;
++ sbi->s_gdb_count = db_count;
++ get_random_bytes(&sbi->s_next_generation, sizeof(u32));
++ /*
++ * set up enough so that it can read an inode
++ */
++ sb->s_op = &ext3_sops;
++ INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
++
++ sb->s_root = 0;
++
++ needs_recovery = (es->s_last_orphan != 0 ||
++ EXT3_HAS_INCOMPAT_FEATURE(sb,
++ EXT3_FEATURE_INCOMPAT_RECOVER));
++
++ /*
++ * The first inode we look at is the journal inode. Don't try
++ * root first: it may be modified in the journal!
++ */
++ if (!test_opt(sb, NOLOAD) &&
++ EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) {
++ if (ext3_load_journal(sb, es))
++ goto failed_mount2;
++ } else if (journal_inum) {
++ if (ext3_create_journal(sb, es, journal_inum))
++ goto failed_mount2;
++ } else {
++ if (!silent)
++ printk (KERN_ERR
++ "ext3: No journal on filesystem on %s\n",
++ bdevname(dev));
++ goto failed_mount2;
++ }
++
++ /* We have now updated the journal if required, so we can
++ * validate the data journaling mode. */
++ switch (test_opt(sb, DATA_FLAGS)) {
++ case 0:
++ /* No mode set, assume a default based on the journal
++ capabilities: ORDERED_DATA if the journal can
++ cope, else JOURNAL_DATA */
++ if (journal_check_available_features
++ (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE))
++ set_opt(sbi->s_mount_opt, ORDERED_DATA);
++ else
++ set_opt(sbi->s_mount_opt, JOURNAL_DATA);
++ break;
++
++ case EXT3_MOUNT_ORDERED_DATA:
++ case EXT3_MOUNT_WRITEBACK_DATA:
++ if (!journal_check_available_features
++ (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) {
++ printk(KERN_ERR "EXT3-fs: Journal does not support "
++ "requested data journaling mode\n");
++ goto failed_mount3;
++ }
++ default:
++ break;
++ }
++
++ /*
++ * The journal_load will have done any necessary log recovery,
++ * so we can safely mount the rest of the filesystem now.
++ */
++
++ sb->s_root = d_alloc_root(iget(sb, EXT3_ROOT_INO));
++ if (!sb->s_root || !S_ISDIR(sb->s_root->d_inode->i_mode) ||
++ !sb->s_root->d_inode->i_blocks || !sb->s_root->d_inode->i_size) {
++ if (sb->s_root) {
++ dput(sb->s_root);
++ sb->s_root = NULL;
++ printk(KERN_ERR
++ "EXT3-fs: corrupt root inode, run e2fsck\n");
++ } else
++ printk(KERN_ERR "EXT3-fs: get root inode failed\n");
++ goto failed_mount3;
++ }
++
++ ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);
++ /*
++ * akpm: core read_super() calls in here with the superblock locked.
++ * That deadlocks, because orphan cleanup needs to lock the superblock
++ * in numerous places. Here we just pop the lock - it's relatively
++ * harmless, because we are now ready to accept write_super() requests,
++ * and aviro says that's the only reason for hanging onto the
++ * superblock lock.
++ */
++ EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS;
++ unlock_super(sb); /* akpm: sigh */
++ ext3_orphan_cleanup(sb, es);
++ lock_super(sb);
++ EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS;
++ if (needs_recovery)
++ printk (KERN_INFO "EXT3-fs: recovery complete.\n");
++ ext3_mark_recovery_complete(sb, es);
++ printk (KERN_INFO "EXT3-fs: mounted filesystem with %s data mode.\n",
++ test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal":
++ test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
++ "writeback");
++
++ return sb;
++
++failed_mount3:
++ journal_destroy(sbi->s_journal);
++failed_mount2:
++ for (i = 0; i < db_count; i++)
++ brelse(sbi->s_group_desc[i]);
++ kfree(sbi->s_group_desc);
++failed_mount:
++ ext3_blkdev_remove(sbi);
++ brelse(bh);
++out_fail:
++ return NULL;
++}
++
++static journal_t *ext3_get_journal(struct super_block *sb, int journal_inum)
++{
++ struct inode *journal_inode;
++ journal_t *journal;
++
++ /* First, test for the existence of a valid inode on disk. Bad
++ * things happen if we iget() an unused inode, as the subsequent
++ * iput() will try to delete it. */
++
++ journal_inode = iget(sb, journal_inum);
++ if (!journal_inode) {
++ printk(KERN_ERR "EXT3-fs: no journal found.\n");
++ return NULL;
++ }
++ if (!journal_inode->i_nlink) {
++ make_bad_inode(journal_inode);
++ iput(journal_inode);
++ printk(KERN_ERR "EXT3-fs: journal inode is deleted.\n");
++ return NULL;
++ }
++
++ jbd_debug(2, "Journal inode found at %p: %Ld bytes\n",
++ journal_inode, journal_inode->i_size);
++ if (is_bad_inode(journal_inode) || !S_ISREG(journal_inode->i_mode)) {
++ printk(KERN_ERR "EXT3-fs: invalid journal inode.\n");
++ iput(journal_inode);
++ return NULL;
++ }
++
++ journal = journal_init_inode(journal_inode);
++ if (!journal) {
++ printk(KERN_ERR "EXT3-fs: Could not load journal inode\n");
++ iput(journal_inode);
++ }
++
++ return journal;
++}
++
++static journal_t *ext3_get_dev_journal(struct super_block *sb,
++ int dev)
++{
++ struct buffer_head * bh;
++ journal_t *journal;
++ int start;
++ int len;
++ int hblock, blocksize;
++ unsigned long sb_block;
++ unsigned long offset;
++ kdev_t journal_dev = to_kdev_t(dev);
++ struct ext3_super_block * es;
++ struct block_device *bdev;
++
++ bdev = ext3_blkdev_get(journal_dev);
++ if (bdev == NULL)
++ return NULL;
++
++ blocksize = sb->s_blocksize;
++ hblock = get_hardsect_size(journal_dev);
++ if (blocksize < hblock) {
++ printk(KERN_ERR
++ "EXT3-fs: blocksize too small for journal device.\n");
++ goto out_bdev;
++ }
++
++ sb_block = EXT3_MIN_BLOCK_SIZE / blocksize;
++ offset = EXT3_MIN_BLOCK_SIZE % blocksize;
++ set_blocksize(dev, blocksize);
++ if (!(bh = bread(dev, sb_block, blocksize))) {
++ printk(KERN_ERR "EXT3-fs: couldn't read superblock of "
++ "external journal\n");
++ goto out_bdev;
++ }
++
++ es = (struct ext3_super_block *) (((char *)bh->b_data) + offset);
++ if ((le16_to_cpu(es->s_magic) != EXT3_SUPER_MAGIC) ||
++ !(le32_to_cpu(es->s_feature_incompat) &
++ EXT3_FEATURE_INCOMPAT_JOURNAL_DEV)) {
++ printk(KERN_ERR "EXT3-fs: external journal has "
++ "bad superblock\n");
++ brelse(bh);
++ goto out_bdev;
++ }
++
++ if (memcmp(EXT3_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
++ printk(KERN_ERR "EXT3-fs: journal UUID does not match\n");
++ brelse(bh);
++ goto out_bdev;
++ }
++
++ len = le32_to_cpu(es->s_blocks_count);
++ start = sb_block + 1;
++ brelse(bh); /* we're done with the superblock */
++
++ journal = journal_init_dev(journal_dev, sb->s_dev,
++ start, len, blocksize);
++ if (!journal) {
++ printk(KERN_ERR "EXT3-fs: failed to create device journal\n");
++ goto out_bdev;
++ }
++ ll_rw_block(READ, 1, &journal->j_sb_buffer);
++ wait_on_buffer(journal->j_sb_buffer);
++ if (!buffer_uptodate(journal->j_sb_buffer)) {
++ printk(KERN_ERR "EXT3-fs: I/O error on journal device\n");
++ goto out_journal;
++ }
++ if (ntohl(journal->j_superblock->s_nr_users) != 1) {
++ printk(KERN_ERR "EXT3-fs: External journal has more than one "
++ "user (unsupported) - %d\n",
++ ntohl(journal->j_superblock->s_nr_users));
++ goto out_journal;
++ }
++ EXT3_SB(sb)->journal_bdev = bdev;
++ return journal;
++out_journal:
++ journal_destroy(journal);
++out_bdev:
++ ext3_blkdev_put(bdev);
++ return NULL;
++}
++
++static int ext3_load_journal(struct super_block * sb,
++ struct ext3_super_block * es)
++{
++ journal_t *journal;
++ int journal_inum = le32_to_cpu(es->s_journal_inum);
++ int journal_dev = le32_to_cpu(es->s_journal_dev);
++ int err = 0;
++ int really_read_only;
++
++ really_read_only = is_read_only(sb->s_dev);
++
++ /*
++ * Are we loading a blank journal or performing recovery after a
++ * crash? For recovery, we need to check in advance whether we
++ * can get read-write access to the device.
++ */
++
++ if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER)) {
++ if (sb->s_flags & MS_RDONLY) {
++ printk(KERN_INFO "EXT3-fs: INFO: recovery "
++ "required on readonly filesystem.\n");
++ if (really_read_only) {
++ printk(KERN_ERR "EXT3-fs: write access "
++ "unavailable, cannot proceed.\n");
++ return -EROFS;
++ }
++ printk (KERN_INFO "EXT3-fs: write access will "
++ "be enabled during recovery.\n");
++ }
++ }
++
++ if (journal_inum && journal_dev) {
++ printk(KERN_ERR "EXT3-fs: filesystem has both journal "
++ "and inode journals!\n");
++ return -EINVAL;
++ }
++
++ if (journal_inum) {
++ if (!(journal = ext3_get_journal(sb, journal_inum)))
++ return -EINVAL;
++ } else {
++ if (!(journal = ext3_get_dev_journal(sb, journal_dev)))
++ return -EINVAL;
++ }
++
++
++ if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
++ err = journal_update_format(journal);
++ if (err) {
++ printk(KERN_ERR "EXT3-fs: error updating journal.\n");
++ journal_destroy(journal);
++ return err;
++ }
++ }
++
++ if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER))
++ err = journal_wipe(journal, !really_read_only);
++ if (!err)
++ err = journal_load(journal);
++
++ if (err) {
++ printk(KERN_ERR "EXT3-fs: error loading journal.\n");
++ journal_destroy(journal);
++ return err;
++ }
++
++ EXT3_SB(sb)->s_journal = journal;
++ ext3_clear_journal_err(sb, es);
++ return 0;
++}
++
++static int ext3_create_journal(struct super_block * sb,
++ struct ext3_super_block * es,
++ int journal_inum)
++{
++ journal_t *journal;
++
++ if (sb->s_flags & MS_RDONLY) {
++ printk(KERN_ERR "EXT3-fs: readonly filesystem when trying to "
++ "create journal.\n");
++ return -EROFS;
++ }
++
++ if (!(journal = ext3_get_journal(sb, journal_inum)))
++ return -EINVAL;
++
++ printk(KERN_INFO "EXT3-fs: creating new journal on inode %d\n",
++ journal_inum);
++
++ if (journal_create(journal)) {
++ printk(KERN_ERR "EXT3-fs: error creating journal.\n");
++ journal_destroy(journal);
++ return -EIO;
++ }
++
++ EXT3_SB(sb)->s_journal = journal;
++
++ ext3_update_dynamic_rev(sb);
++ EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
++ EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL);
++
++ es->s_journal_inum = cpu_to_le32(journal_inum);
++ sb->s_dirt = 1;
++
++ /* Make sure we flush the recovery flag to disk. */
++ ext3_commit_super(sb, es, 1);
++
++ return 0;
++}
++
++static void ext3_commit_super (struct super_block * sb,
++ struct ext3_super_block * es,
++ int sync)
++{
++ es->s_wtime = cpu_to_le32(CURRENT_TIME);
++ BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "marking dirty");
++ mark_buffer_dirty(sb->u.ext3_sb.s_sbh);
++ if (sync) {
++ ll_rw_block(WRITE, 1, &sb->u.ext3_sb.s_sbh);
++ wait_on_buffer(sb->u.ext3_sb.s_sbh);
++ }
++}
++
++
++/*
++ * Have we just finished recovery? If so, and if we are mounting (or
++ * remounting) the filesystem readonly, then we will end up with a
++ * consistent fs on disk. Record that fact.
++ */
++static void ext3_mark_recovery_complete(struct super_block * sb,
++ struct ext3_super_block * es)
++{
++ journal_flush(EXT3_SB(sb)->s_journal);
++ if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) &&
++ sb->s_flags & MS_RDONLY) {
++ EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
++ sb->s_dirt = 0;
++ ext3_commit_super(sb, es, 1);
++ }
++}
++
++/*
++ * If we are mounting (or read-write remounting) a filesystem whose journal
++ * has recorded an error from a previous lifetime, move that error to the
++ * main filesystem now.
++ */
++static void ext3_clear_journal_err(struct super_block * sb,
++ struct ext3_super_block * es)
++{
++ journal_t *journal;
++ int j_errno;
++ const char *errstr;
++
++ journal = EXT3_SB(sb)->s_journal;
++
++ /*
++ * Now check for any error status which may have been recorded in the
++ * journal by a prior ext3_error() or ext3_abort()
++ */
++
++ j_errno = journal_errno(journal);
++ if (j_errno) {
++ char nbuf[16];
++
++ errstr = ext3_decode_error(sb, j_errno, nbuf);
++ ext3_warning(sb, __FUNCTION__, "Filesystem error recorded "
++ "from previous mount: %s", errstr);
++ ext3_warning(sb, __FUNCTION__, "Marking fs in need of "
++ "filesystem check.");
++
++ sb->u.ext3_sb.s_mount_state |= EXT3_ERROR_FS;
++ es->s_state |= cpu_to_le16(EXT3_ERROR_FS);
++ ext3_commit_super (sb, es, 1);
++
++ journal_clear_err(journal);
++ }
++}
++
++/*
++ * Force the running and committing transactions to commit,
++ * and wait on the commit.
++ */
++int ext3_force_commit(struct super_block *sb)
++{
++ journal_t *journal;
++ int ret;
++
++ if (sb->s_flags & MS_RDONLY)
++ return 0;
++
++ journal = EXT3_SB(sb)->s_journal;
++ sb->s_dirt = 0;
++ lock_kernel(); /* important: lock down j_running_transaction */
++ ret = ext3_journal_force_commit(journal);
++ unlock_kernel();
++ return ret;
++}
++
++/*
++ * Ext3 always journals updates to the superblock itself, so we don't
++ * have to propagate any other updates to the superblock on disk at this
++ * point. Just start an async writeback to get the buffers on their way
++ * to the disk.
++ *
++ * This implicitly triggers the writebehind on sync().
++ */
++
++static int do_sync_supers = 0;
++MODULE_PARM(do_sync_supers, "i");
++MODULE_PARM_DESC(do_sync_supers, "Write superblocks synchronously");
++
++void ext3_write_super (struct super_block * sb)
++{
++ tid_t target;
++
++ if (down_trylock(&sb->s_lock) == 0)
++ BUG(); /* aviro detector */
++ sb->s_dirt = 0;
++ target = log_start_commit(EXT3_SB(sb)->s_journal, NULL);
++
++ if (do_sync_supers) {
++ unlock_super(sb);
++ log_wait_commit(EXT3_SB(sb)->s_journal, target);
++ lock_super(sb);
++ }
++}
++
++/*
++ * LVM calls this function before a (read-only) snapshot is created. This
++ * gives us a chance to flush the journal completely and mark the fs clean.
++ */
++void ext3_write_super_lockfs(struct super_block *sb)
++{
++ sb->s_dirt = 0;
++
++ lock_kernel(); /* 2.4.5 forgot to do this for us */
++ if (!(sb->s_flags & MS_RDONLY)) {
++ journal_t *journal = EXT3_SB(sb)->s_journal;
++
++ /* Now we set up the journal barrier. */
++ journal_lock_updates(journal);
++ journal_flush(journal);
++
++ /* Journal blocked and flushed, clear needs_recovery flag. */
++ EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
++ ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1);
++ }
++ unlock_kernel();
++}
++
++/*
++ * Called by LVM after the snapshot is done. We need to reset the RECOVER
++ * flag here, even though the filesystem is not technically dirty yet.
++ */
++void ext3_unlockfs(struct super_block *sb)
++{
++ if (!(sb->s_flags & MS_RDONLY)) {
++ lock_kernel();
++ lock_super(sb);
++	/* Reset the needs_recovery flag before the fs is unlocked. */
++ EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
++ ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1);
++ unlock_super(sb);
++ journal_unlock_updates(EXT3_SB(sb)->s_journal);
++ unlock_kernel();
++ }
++}
++
++int ext3_remount (struct super_block * sb, int * flags, char * data)
++{
++ struct ext3_super_block * es;
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ unsigned long tmp;
++
++ clear_ro_after(sb);
++
++ /*
++ * Allow the "check" option to be passed as a remount option.
++ */
++ if (!parse_options(data, &tmp, sbi, &tmp, 1))
++ return -EINVAL;
++
++ if (sbi->s_mount_opt & EXT3_MOUNT_ABORT)
++ ext3_abort(sb, __FUNCTION__, "Abort forced by user");
++
++ es = sbi->s_es;
++
++ if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
++ if (sbi->s_mount_opt & EXT3_MOUNT_ABORT)
++ return -EROFS;
++
++ if (*flags & MS_RDONLY) {
++ /*
++ * First of all, the unconditional stuff we have to do
++ * to disable replay of the journal when we next remount
++ */
++ sb->s_flags |= MS_RDONLY;
++
++ /*
++ * OK, test if we are remounting a valid rw partition
++ * readonly, and if so set the rdonly flag and then
++ * mark the partition as valid again.
++ */
++ if (!(es->s_state & cpu_to_le16(EXT3_VALID_FS)) &&
++ (sbi->s_mount_state & EXT3_VALID_FS))
++ es->s_state = cpu_to_le16(sbi->s_mount_state);
++
++ ext3_mark_recovery_complete(sb, es);
++ } else {
++ int ret;
++ if ((ret = EXT3_HAS_RO_COMPAT_FEATURE(sb,
++ ~EXT3_FEATURE_RO_COMPAT_SUPP))) {
++ printk(KERN_WARNING "EXT3-fs: %s: couldn't "
++ "remount RDWR because of unsupported "
++ "optional features (%x).\n",
++ bdevname(sb->s_dev), ret);
++ return -EROFS;
++ }
++ /*
++ * Mounting a RDONLY partition read-write, so reread
++ * and store the current valid flag. (It may have
++ * been changed by e2fsck since we originally mounted
++ * the partition.)
++ */
++ ext3_clear_journal_err(sb, es);
++ sbi->s_mount_state = le16_to_cpu(es->s_state);
++ if (!ext3_setup_super (sb, es, 0))
++ sb->s_flags &= ~MS_RDONLY;
++ }
++ }
++ setup_ro_after(sb);
++ return 0;
++}
++
++int ext3_statfs (struct super_block * sb, struct statfs * buf)
++{
++ struct ext3_super_block *es = EXT3_SB(sb)->s_es;
++ unsigned long overhead;
++ int i;
++
++ if (test_opt (sb, MINIX_DF))
++ overhead = 0;
++ else {
++ /*
++ * Compute the overhead (FS structures)
++ */
++
++ /*
++ * All of the blocks before first_data_block are
++ * overhead
++ */
++ overhead = le32_to_cpu(es->s_first_data_block);
++
++ /*
++ * Add the overhead attributed to the superblock and
++ * block group descriptors. If the sparse superblocks
++ * feature is turned on, then not all groups have this.
++ */
++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++)
++ overhead += ext3_bg_has_super(sb, i) +
++ ext3_bg_num_gdb(sb, i);
++
++ /*
++ * Every block group has an inode bitmap, a block
++ * bitmap, and an inode table.
++ */
++ overhead += (EXT3_SB(sb)->s_groups_count *
++ (2 + EXT3_SB(sb)->s_itb_per_group));
++ }
++
++ buf->f_type = EXT3_SUPER_MAGIC;
++ buf->f_bsize = sb->s_blocksize;
++ buf->f_blocks = le32_to_cpu(es->s_blocks_count) - overhead;
++ buf->f_bfree = ext3_count_free_blocks (sb);
++ buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count);
++ if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count))
++ buf->f_bavail = 0;
++ buf->f_files = le32_to_cpu(es->s_inodes_count);
++ buf->f_ffree = ext3_count_free_inodes (sb);
++ buf->f_namelen = EXT3_NAME_LEN;
++ return 0;
++}
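
Plugging invented numbers into the non-MINIX_DF branch above: a filesystem with 4 groups, one block before s_first_data_block, sparse-super copies (superblock plus one descriptor block each) in groups 0, 1 and 3, and 50 inode-table blocks per group would report

    overhead = 1                   blocks before first_data_block
             + 3 * (1 + 1)         superblock + descriptor copies
             + 4 * (2 + 50)        two bitmaps + inode table per group
             = 215 blocks

so f_blocks is s_blocks_count minus those 215 metadata blocks.
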
++
++static DECLARE_FSTYPE_DEV(ext3_fs_type, "ext3", ext3_read_super);
++
++static int __init init_ext3_fs(void)
++{
++ return register_filesystem(&ext3_fs_type);
++}
++
++static void __exit exit_ext3_fs(void)
++{
++ unregister_filesystem(&ext3_fs_type);
++}
++
++EXPORT_NO_SYMBOLS;
++
++MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
++MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions");
++MODULE_LICENSE("GPL");
++module_init(init_ext3_fs)
++module_exit(exit_ext3_fs)
+diff -rup --new-file linux.mcp2/fs/ext3/symlink.c linux_tmp/fs/ext3/symlink.c
+--- linux.mcp2/fs/ext3/symlink.c 1969-12-31 16:00:00.000000000 -0800
++++ linux_tmp/fs/ext3/symlink.c 2001-11-09 14:25:04.000000000 -0800
+@@ -0,0 +1,39 @@
++/*
++ * linux/fs/ext3/symlink.c
++ *
++ * Only fast symlinks left here - the rest is done by generic code. AV, 1999
++ *
++ * Copyright (C) 1992, 1993, 1994, 1995
++ * Remy Card (card@masi.ibp.fr)
++ * Laboratoire MASI - Institut Blaise Pascal
++ * Universite Pierre et Marie Curie (Paris VI)
++ *
++ * from
++ *
++ * linux/fs/minix/symlink.c
++ *
++ * Copyright (C) 1991, 1992 Linus Torvalds
++ *
++ * ext3 symlink handling code
++ */
++
++#include <linux/fs.h>
++#include <linux/jbd.h>
++#include <linux/ext3_fs.h>
++
++static int ext3_readlink(struct dentry *dentry, char *buffer, int buflen)
++{
++ char *s = (char *)dentry->d_inode->u.ext3_i.i_data;
++ return vfs_readlink(dentry, buffer, buflen, s);
++}
++
++static int ext3_follow_link(struct dentry *dentry, struct nameidata *nd)
++{
++ char *s = (char *)dentry->d_inode->u.ext3_i.i_data;
++ return vfs_follow_link(nd, s);
++}
++
++struct inode_operations ext3_fast_symlink_inode_operations = {
++ readlink: ext3_readlink, /* BKL not held. Don't need */
++ follow_link: ext3_follow_link, /* BKL not held. Don't need */
++};
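
Both operations above work because a fast symlink keeps its target inside the inode itself: targets short enough to fit in the 60 bytes normally occupied by the 15 block pointers are stored directly in i_data, so readlink costs no data-block I/O. A toy illustration (struct layout simplified, not the real inode):

    #include <stdio.h>
    #include <string.h>

    struct toy_inode {
            unsigned int i_data[15];   /* block pointers OR inline target */
    };

    int main(void)
    {
            struct toy_inode ino;
            const char *target = "/etc/passwd";   /* fits in 60 bytes */

            memcpy(ino.i_data, target, strlen(target) + 1);
            printf("fast symlink -> %s\n", (char *)ino.i_data);
            return 0;
    }
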
--- /dev/null
+diff -ruP linux.mcp2/fs/jbd/Makefile linuxppc_2.4.19_final/fs/jbd/Makefile
+--- linux.mcp2/fs/jbd/Makefile 1969-12-31 16:00:00.000000000 -0800
++++ linuxppc_2.4.19_final/fs/jbd/Makefile 2004-05-17 13:56:17.000000000 -0700
+@@ -0,0 +1,15 @@
++#
++# fs/jbd/Makefile
++#
++# Makefile for the linux journaling routines.
++#
++
++export-objs := journal.o
++O_TARGET := jbd.o
++
++obj-y := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o
++
++obj-m := $(O_TARGET)
++
++include $(TOPDIR)/Rules.make
++
+diff -ruP linux.mcp2/fs/jbd/checkpoint.c linuxppc_2.4.19_final/fs/jbd/checkpoint.c
+--- linux.mcp2/fs/jbd/checkpoint.c 1969-12-31 16:00:00.000000000 -0800
++++ linuxppc_2.4.19_final/fs/jbd/checkpoint.c 2004-05-17 13:56:17.000000000 -0700
+@@ -0,0 +1,605 @@
++/*
++ * linux/fs/checkpoint.c
++ *
++ * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
++ *
++ * Copyright 1999 Red Hat Software --- All Rights Reserved
++ *
++ * This file is part of the Linux kernel and is made available under
++ * the terms of the GNU General Public License, version 2, or at your
++ * option, any later version, incorporated herein by reference.
++ *
++ * Checkpoint routines for the generic filesystem journaling code.
++ * Part of the ext2fs journaling system.
++ *
++ * Checkpointing is the process of ensuring that a section of the log is
++ * committed fully to disk, so that that portion of the log can be
++ * reused.
++ */
++
++#include <linux/sched.h>
++#include <linux/fs.h>
++#include <linux/jbd.h>
++#include <linux/errno.h>
++#include <linux/slab.h>
++#include <linux/locks.h>
++
++extern spinlock_t journal_datalist_lock;
++
++/*
++ * Unlink a buffer from a transaction.
++ *
++ * Called with journal_datalist_lock held.
++ */
++
++static inline void __buffer_unlink(struct journal_head *jh)
++{
++ transaction_t *transaction;
++
++ transaction = jh->b_cp_transaction;
++ jh->b_cp_transaction = NULL;
++
++ jh->b_cpnext->b_cpprev = jh->b_cpprev;
++ jh->b_cpprev->b_cpnext = jh->b_cpnext;
++ if (transaction->t_checkpoint_list == jh)
++ transaction->t_checkpoint_list = jh->b_cpnext;
++ if (transaction->t_checkpoint_list == jh)
++ transaction->t_checkpoint_list = NULL;
++}
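
The head test in __buffer_unlink() appears twice for a reason: in a one-element circular list jh->b_cpnext is jh itself, so after the first test the head still points at the departing node and must be cleared. The same shape on a generic circular list (all names invented):

    #include <stddef.h>

    struct node { struct node *next, *prev; };

    static struct node *unlink_node(struct node *head, struct node *n)
    {
            n->next->prev = n->prev;
            n->prev->next = n->next;
            if (head == n)
                    head = n->next;
            if (head == n)          /* n was the only element */
                    head = NULL;
            return head;
    }
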
++
++/*
++ * Try to release a checkpointed buffer from its transaction.
++ * Returns 1 if we released it.
++ * Requires journal_datalist_lock
++ */
++static int __try_to_free_cp_buf(struct journal_head *jh)
++{
++ int ret = 0;
++ struct buffer_head *bh = jh2bh(jh);
++
++ if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) {
++ JBUFFER_TRACE(jh, "remove from checkpoint list");
++ __journal_remove_checkpoint(jh);
++ __journal_remove_journal_head(bh);
++ BUFFER_TRACE(bh, "release");
++ /* BUF_LOCKED -> BUF_CLEAN (fwiw) */
++ refile_buffer(bh);
++ __brelse(bh);
++ ret = 1;
++ }
++ return ret;
++}
++
++/*
++ * log_wait_for_space: wait until there is space in the journal.
++ *
++ * Called with the journal already locked, but it will be unlocked if we have
++ * to wait for a checkpoint to free up some space in the log.
++ */
++
++void log_wait_for_space(journal_t *journal, int nblocks)
++{
++ while (log_space_left(journal) < nblocks) {
++ if (journal->j_flags & JFS_ABORT)
++ return;
++ unlock_journal(journal);
++ down(&journal->j_checkpoint_sem);
++ lock_journal(journal);
++
++ /* Test again, another process may have checkpointed
++ * while we were waiting for the checkpoint lock */
++ if (log_space_left(journal) < nblocks) {
++ log_do_checkpoint(journal, nblocks);
++ }
++ up(&journal->j_checkpoint_sem);
++ }
++}
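
The second log_space_left() test is the load-bearing part: while this thread slept on j_checkpoint_sem, another process may already have checkpointed, making the expensive work unnecessary. A minimal pthread rendering of the same re-test-after-reacquire pattern (all names and the space accounting invented):

    #include <pthread.h>

    static pthread_mutex_t journal_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t checkpoint_sem = PTHREAD_MUTEX_INITIALIZER;
    static int space_left;

    static void do_checkpoint(int nblocks) { space_left += nblocks; }

    void wait_for_space(int nblocks)
    {
            pthread_mutex_lock(&journal_lock);
            while (space_left < nblocks) {
                    pthread_mutex_unlock(&journal_lock);
                    pthread_mutex_lock(&checkpoint_sem);
                    pthread_mutex_lock(&journal_lock);
                    if (space_left < nblocks)   /* re-test under the lock */
                            do_checkpoint(nblocks);
                    pthread_mutex_unlock(&checkpoint_sem);
            }
            pthread_mutex_unlock(&journal_lock);
    }
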
++
++/*
++ * Clean up a transaction's checkpoint list.
++ *
++ * We wait for any pending IO to complete and make sure any clean
++ * buffers are removed from the transaction.
++ *
++ * Return 1 if we performed any actions which might have destroyed the
++ * checkpoint. (journal_remove_checkpoint() deletes the transaction when
++ * the last checkpoint buffer is cleansed)
++ *
++ * Called with the journal locked.
++ * Called with journal_datalist_lock held.
++ */
++static int __cleanup_transaction(journal_t *journal, transaction_t *transaction)
++{
++ struct journal_head *jh, *next_jh, *last_jh;
++ struct buffer_head *bh;
++ int ret = 0;
++
++ assert_spin_locked(&journal_datalist_lock);
++ jh = transaction->t_checkpoint_list;
++ if (!jh)
++ return 0;
++
++ last_jh = jh->b_cpprev;
++ next_jh = jh;
++ do {
++ jh = next_jh;
++ bh = jh2bh(jh);
++ if (buffer_locked(bh)) {
++ atomic_inc(&bh->b_count);
++ spin_unlock(&journal_datalist_lock);
++ unlock_journal(journal);
++ wait_on_buffer(bh);
++ /* the journal_head may have gone by now */
++ BUFFER_TRACE(bh, "brelse");
++ __brelse(bh);
++ goto out_return_1;
++ }
++
++ if (jh->b_transaction != NULL) {
++ transaction_t *transaction = jh->b_transaction;
++ tid_t tid = transaction->t_tid;
++
++ spin_unlock(&journal_datalist_lock);
++ log_start_commit(journal, transaction);
++ unlock_journal(journal);
++ log_wait_commit(journal, tid);
++ goto out_return_1;
++ }
++
++ /*
++ * We used to test for (jh->b_list != BUF_CLEAN) here.
++ * But unmap_underlying_metadata() can place buffer onto
++ * BUF_CLEAN. Since refile_buffer() no longer takes buffers
++ * off checkpoint lists, we cope with it here
++ */
++ /*
++ * AKPM: I think the buffer_jdirty test is redundant - it
++ * shouldn't have NULL b_transaction?
++ */
++ next_jh = jh->b_cpnext;
++ if (!buffer_dirty(bh) && !buffer_jdirty(bh)) {
++ BUFFER_TRACE(bh, "remove from checkpoint");
++ __journal_remove_checkpoint(jh);
++ __journal_remove_journal_head(bh);
++ refile_buffer(bh);
++ __brelse(bh);
++ ret = 1;
++ }
++
++ jh = next_jh;
++ } while (jh != last_jh);
++
++ return ret;
++out_return_1:
++ lock_journal(journal);
++ spin_lock(&journal_datalist_lock);
++ return 1;
++}
++
++#define NR_BATCH 64
++
++static void __flush_batch(struct buffer_head **bhs, int *batch_count)
++{
++ int i;
++
++ spin_unlock(&journal_datalist_lock);
++ ll_rw_block(WRITE, *batch_count, bhs);
++ run_task_queue(&tq_disk);
++ spin_lock(&journal_datalist_lock);
++ for (i = 0; i < *batch_count; i++) {
++ struct buffer_head *bh = bhs[i];
++ clear_bit(BH_JWrite, &bh->b_state);
++ BUFFER_TRACE(bh, "brelse");
++ __brelse(bh);
++ }
++ *batch_count = 0;
++}
++
++/*
++ * Try to flush one buffer from the checkpoint list to disk.
++ *
++ * Return 1 if something happened which requires us to abort the current
++ * scan of the checkpoint list.
++ *
++ * Called with journal_datalist_lock held.
++ */
++static int __flush_buffer(journal_t *journal, struct journal_head *jh,
++ struct buffer_head **bhs, int *batch_count,
++ int *drop_count)
++{
++ struct buffer_head *bh = jh2bh(jh);
++ int ret = 0;
++
++ if (buffer_dirty(bh) && !buffer_locked(bh) && jh->b_jlist == BJ_None) {
++ J_ASSERT_JH(jh, jh->b_transaction == NULL);
++
++ /*
++ * Important: we are about to write the buffer, and
++ * possibly block, while still holding the journal lock.
++ * We cannot afford to let the transaction logic start
++ * messing around with this buffer before we write it to
++ * disk, as that would break recoverability.
++ */
++ BUFFER_TRACE(bh, "queue");
++ atomic_inc(&bh->b_count);
++ J_ASSERT_BH(bh, !test_bit(BH_JWrite, &bh->b_state));
++ set_bit(BH_JWrite, &bh->b_state);
++ bhs[*batch_count] = bh;
++ (*batch_count)++;
++ if (*batch_count == NR_BATCH) {
++ __flush_batch(bhs, batch_count);
++ ret = 1;
++ }
++ } else {
++ int last_buffer = 0;
++ if (jh->b_cpnext == jh) {
++ /* We may be about to drop the transaction. Tell the
++ * caller that the lists have changed.
++ */
++ last_buffer = 1;
++ }
++ if (__try_to_free_cp_buf(jh)) {
++ (*drop_count)++;
++ ret = last_buffer;
++ }
++ }
++ return ret;
++}
++
++
++/*
++ * Perform an actual checkpoint. We don't write out only enough to
++ * satisfy the current blocked requests: rather we submit a reasonably
++ * sized chunk of the outstanding data to disk at once for
++ * efficiency. log_wait_for_space() will retry if we didn't free enough.
++ *
++ * However, we _do_ take into account the amount requested so that once
++ * the IO has been queued, we can return as soon as enough of it has
++ * completed to disk.
++ *
++ * The journal should be locked before calling this function.
++ */
++
++/* @@@ `nblocks' is unused. Should it be used? */
++int log_do_checkpoint (journal_t *journal, int nblocks)
++{
++ transaction_t *transaction, *last_transaction, *next_transaction;
++ int result;
++ int target;
++ int batch_count = 0;
++ struct buffer_head *bhs[NR_BATCH];
++
++ jbd_debug(1, "Start checkpoint\n");
++
++ /*
++ * First thing: if there are any transactions in the log which
++ * don't need checkpointing, just eliminate them from the
++ * journal straight away.
++ */
++ result = cleanup_journal_tail(journal);
++ jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
++ if (result <= 0)
++ return result;
++
++ /*
++ * OK, we need to start writing disk blocks. Try to free up a
++ * quarter of the log in a single checkpoint if we can.
++ */
++ /*
++ * AKPM: check this code. I had a feeling a while back that it
++ * degenerates into a busy loop at unmount time.
++ */
++ target = (journal->j_last - journal->j_first) / 4;
++
++ spin_lock(&journal_datalist_lock);
++repeat:
++ transaction = journal->j_checkpoint_transactions;
++ if (transaction == NULL)
++ goto done;
++ last_transaction = transaction->t_cpprev;
++ next_transaction = transaction;
++
++ do {
++ struct journal_head *jh, *last_jh, *next_jh;
++ int drop_count = 0;
++ int cleanup_ret, retry = 0;
++
++ transaction = next_transaction;
++ next_transaction = transaction->t_cpnext;
++ jh = transaction->t_checkpoint_list;
++ last_jh = jh->b_cpprev;
++ next_jh = jh;
++ do {
++ jh = next_jh;
++ next_jh = jh->b_cpnext;
++ retry = __flush_buffer(journal, jh, bhs, &batch_count,
++ &drop_count);
++ } while (jh != last_jh && !retry);
++ if (batch_count) {
++ __flush_batch(bhs, &batch_count);
++ goto repeat;
++ }
++ if (retry)
++ goto repeat;
++ /*
++ * We have walked the whole transaction list without
++ * finding anything to write to disk. We had better be
++ * able to make some progress or we are in trouble.
++ */
++ cleanup_ret = __cleanup_transaction(journal, transaction);
++ J_ASSERT(drop_count != 0 || cleanup_ret != 0);
++ goto repeat; /* __cleanup may have dropped lock */
++ } while (transaction != last_transaction);
++
++done:
++ spin_unlock(&journal_datalist_lock);
++ result = cleanup_journal_tail(journal);
++ if (result < 0)
++ return result;
++
++ return 0;
++}
++
++/*
++ * Check the list of checkpoint transactions for the journal to see if
++ * we have already got rid of any since the last update of the log tail
++ * in the journal superblock. If so, we can instantly roll the
++ * superblock forward to remove those transactions from the log.
++ *
++ * Return <0 on error, 0 on success, 1 if there was nothing to clean up.
++ *
++ * Called with the journal lock held.
++ *
++ * This is the only part of the journaling code which really needs to be
++ * aware of transaction aborts. Checkpointing involves writing to the
++ * main filesystem area rather than to the journal, so it can proceed
++ * even in abort state, but we must not update the journal superblock if
++ * we have an abort error outstanding.
++ */
++
++int cleanup_journal_tail(journal_t *journal)
++{
++ transaction_t * transaction;
++ tid_t first_tid;
++ unsigned long blocknr, freed;
++
++ /* OK, work out the oldest transaction remaining in the log, and
++ * the log block it starts at.
++ *
++ * If the log is now empty, we need to work out which is the
++ * next transaction ID we will write, and where it will
++ * start. */
++
++ /* j_checkpoint_transactions needs locking */
++ spin_lock(&journal_datalist_lock);
++ transaction = journal->j_checkpoint_transactions;
++ if (transaction) {
++ first_tid = transaction->t_tid;
++ blocknr = transaction->t_log_start;
++ } else if ((transaction = journal->j_committing_transaction) != NULL) {
++ first_tid = transaction->t_tid;
++ blocknr = transaction->t_log_start;
++ } else if ((transaction = journal->j_running_transaction) != NULL) {
++ first_tid = transaction->t_tid;
++ blocknr = journal->j_head;
++ } else {
++ first_tid = journal->j_transaction_sequence;
++ blocknr = journal->j_head;
++ }
++ spin_unlock(&journal_datalist_lock);
++ J_ASSERT (blocknr != 0);
++
++ /* If the oldest pinned transaction is at the tail of the log
++ already then there's not much we can do right now. */
++ if (journal->j_tail_sequence == first_tid)
++ return 1;
++
++ /* OK, update the superblock to recover the freed space.
++ * Physical blocks come first: have we wrapped beyond the end of
++ * the log? */
++ freed = blocknr - journal->j_tail;
++ if (blocknr < journal->j_tail)
++ freed = freed + journal->j_last - journal->j_first;
++
++ jbd_debug(1,
++ "Cleaning journal tail from %d to %d (offset %lu), "
++ "freeing %lu\n",
++ journal->j_tail_sequence, first_tid, blocknr, freed);
++
++ journal->j_free += freed;
++ journal->j_tail_sequence = first_tid;
++ journal->j_tail = blocknr;
++ if (!(journal->j_flags & JFS_ABORT))
++ journal_update_superblock(journal, 1);
++ return 0;
++}
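
The freed computation relies on unsigned wrap-around when the new tail sits before the old one in the circular log. With an invented log spanning blocks [1, 1025), an old tail at 900 and a new tail at 100, 125 blocks are freed up to the end of the log and 99 more from its start; the wrapped subtraction plus j_last - j_first recovers the total of 224:

    #include <stdio.h>

    int main(void)
    {
            unsigned long j_first = 1, j_last = 1025;   /* log block range */
            unsigned long j_tail = 900, blocknr = 100;  /* old and new tail */
            unsigned long freed = blocknr - j_tail;     /* wraps as unsigned */

            if (blocknr < j_tail)
                    freed = freed + j_last - j_first;
            printf("freed %lu blocks\n", freed);        /* 224 */
            return 0;
    }
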
++
++
++/* Checkpoint list management */
++
++/*
++ * journal_clean_checkpoint_list
++ *
++ * Find all the written-back checkpoint buffers in the journal and release them.
++ *
++ * Called with the journal locked.
++ * Called with journal_datalist_lock held.
++ * Returns number of buffers reaped (for debug)
++ */
++
++int __journal_clean_checkpoint_list(journal_t *journal)
++{
++ transaction_t *transaction, *last_transaction, *next_transaction;
++ int ret = 0;
++
++ transaction = journal->j_checkpoint_transactions;
++ if (transaction == 0)
++ goto out;
++
++ last_transaction = transaction->t_cpprev;
++ next_transaction = transaction;
++ do {
++ struct journal_head *jh;
++
++ transaction = next_transaction;
++ next_transaction = transaction->t_cpnext;
++ jh = transaction->t_checkpoint_list;
++ if (jh) {
++ struct journal_head *last_jh = jh->b_cpprev;
++ struct journal_head *next_jh = jh;
++ do {
++ jh = next_jh;
++ next_jh = jh->b_cpnext;
++ ret += __try_to_free_cp_buf(jh);
++ } while (jh != last_jh);
++ }
++ } while (transaction != last_transaction);
++out:
++ return ret;
++}
++
++/*
++ * journal_remove_checkpoint: called after a buffer has been committed
++ * to disk (either by being write-back flushed to disk, or being
++ * committed to the log).
++ *
++ * We cannot safely clean a transaction out of the log until all of the
++ * buffer updates committed in that transaction have safely been stored
++ * elsewhere on disk. To achieve this, all of the buffers in a
++ * transaction need to be maintained on the transaction's checkpoint
++ * list until they have been rewritten, at which point this function is
++ * called to remove the buffer from the existing transaction's
++ * checkpoint list.
++ *
++ * This function is called with the journal locked.
++ * This function is called with journal_datalist_lock held.
++ */
++
++void __journal_remove_checkpoint(struct journal_head *jh)
++{
++ transaction_t *transaction;
++ journal_t *journal;
++
++ JBUFFER_TRACE(jh, "entry");
++
++ if ((transaction = jh->b_cp_transaction) == NULL) {
++ JBUFFER_TRACE(jh, "not on transaction");
++ goto out;
++ }
++
++ journal = transaction->t_journal;
++
++ __buffer_unlink(jh);
++
++ if (transaction->t_checkpoint_list != NULL)
++ goto out;
++ JBUFFER_TRACE(jh, "transaction has no more buffers");
++
++ /* There is one special case to worry about: if we have just
++ pulled the buffer off a committing transaction's forget list,
++ then even if the checkpoint list is empty, the transaction
++ obviously cannot be dropped! */
++
++ if (transaction == journal->j_committing_transaction) {
++ JBUFFER_TRACE(jh, "belongs to committing transaction");
++ goto out;
++ }
++
++ /* OK, that was the last buffer for the transaction: we can now
++ safely remove this transaction from the log */
++
++ __journal_drop_transaction(journal, transaction);
++
++ /* Just in case anybody was waiting for more transactions to be
++ checkpointed... */
++ wake_up(&journal->j_wait_logspace);
++out:
++ JBUFFER_TRACE(jh, "exit");
++}
++
++void journal_remove_checkpoint(struct journal_head *jh)
++{
++ spin_lock(&journal_datalist_lock);
++ __journal_remove_checkpoint(jh);
++ spin_unlock(&journal_datalist_lock);
++}
++
++/*
++ * journal_insert_checkpoint: put a committed buffer onto a checkpoint
++ * list so that we know when it is safe to clean the transaction out of
++ * the log.
++ *
++ * Called with the journal locked.
++ * Called with journal_datalist_lock held.
++ */
++void __journal_insert_checkpoint(struct journal_head *jh,
++ transaction_t *transaction)
++{
++ JBUFFER_TRACE(jh, "entry");
++ J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jdirty(jh2bh(jh)));
++ J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
++
++ assert_spin_locked(&journal_datalist_lock);
++ jh->b_cp_transaction = transaction;
++
++ if (!transaction->t_checkpoint_list) {
++ jh->b_cpnext = jh->b_cpprev = jh;
++ } else {
++ jh->b_cpnext = transaction->t_checkpoint_list;
++ jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev;
++ jh->b_cpprev->b_cpnext = jh;
++ jh->b_cpnext->b_cpprev = jh;
++ }
++ transaction->t_checkpoint_list = jh;
++}
++
++void journal_insert_checkpoint(struct journal_head *jh,
++ transaction_t *transaction)
++{
++ spin_lock(&journal_datalist_lock);
++ __journal_insert_checkpoint(jh, transaction);
++ spin_unlock(&journal_datalist_lock);
++}
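
The __journal_insert_checkpoint() / journal_insert_checkpoint() pair follows the usual kernel convention: the underscored variant assumes the caller already holds journal_datalist_lock, and the plain wrapper merely supplies it. The same split in miniature, using a pthread mutex in place of the spinlock (names invented):

    #include <pthread.h>

    static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
    static int count;

    static void __add_item(void)    /* caller must hold list_lock */
    {
            count++;
    }

    void add_item(void)             /* locking wrapper */
    {
            pthread_mutex_lock(&list_lock);
            __add_item();
            pthread_mutex_unlock(&list_lock);
    }
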
++
++/*
++ * We've finished with this transaction structure: adios...
++ *
++ * The transaction must have no links except for the checkpoint by this
++ * point.
++ *
++ * Called with the journal locked.
++ * Called with journal_datalist_lock held.
++ */
++
++void __journal_drop_transaction(journal_t *journal, transaction_t *transaction)
++{
++ assert_spin_locked(&journal_datalist_lock);
++ if (transaction->t_cpnext) {
++ transaction->t_cpnext->t_cpprev = transaction->t_cpprev;
++ transaction->t_cpprev->t_cpnext = transaction->t_cpnext;
++ if (journal->j_checkpoint_transactions == transaction)
++ journal->j_checkpoint_transactions =
++ transaction->t_cpnext;
++ if (journal->j_checkpoint_transactions == transaction)
++ journal->j_checkpoint_transactions = NULL;
++ }
++
++ J_ASSERT (transaction->t_ilist == NULL);
++ J_ASSERT (transaction->t_buffers == NULL);
++ J_ASSERT (transaction->t_sync_datalist == NULL);
++ J_ASSERT (transaction->t_async_datalist == NULL);
++ J_ASSERT (transaction->t_forget == NULL);
++ J_ASSERT (transaction->t_iobuf_list == NULL);
++ J_ASSERT (transaction->t_shadow_list == NULL);
++ J_ASSERT (transaction->t_log_list == NULL);
++ J_ASSERT (transaction->t_checkpoint_list == NULL);
++ J_ASSERT (transaction->t_updates == 0);
++
++ J_ASSERT (transaction->t_journal->j_committing_transaction !=
++ transaction);
++
++ jbd_debug (1, "Dropping transaction %d, all done\n",
++ transaction->t_tid);
++ kfree (transaction);
++}
++
+diff -ruP linux.mcp2/fs/jbd/commit.c linuxppc_2.4.19_final/fs/jbd/commit.c
+--- linux.mcp2/fs/jbd/commit.c 1969-12-31 16:00:00.000000000 -0800
++++ linuxppc_2.4.19_final/fs/jbd/commit.c 2004-05-17 13:56:17.000000000 -0700
+@@ -0,0 +1,719 @@
++/*
++ * linux/fs/commit.c
++ *
++ * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
++ *
++ * Copyright 1998 Red Hat corp --- All Rights Reserved
++ *
++ * This file is part of the Linux kernel and is made available under
++ * the terms of the GNU General Public License, version 2, or at your
++ * option, any later version, incorporated herein by reference.
++ *
++ * Journal commit routines for the generic filesystem journaling code;
++ * part of the ext2fs journaling system.
++ */
++
++#include <linux/sched.h>
++#include <linux/fs.h>
++#include <linux/jbd.h>
++#include <linux/errno.h>
++#include <linux/slab.h>
++#include <linux/locks.h>
++#include <linux/smp_lock.h>
++
++extern spinlock_t journal_datalist_lock;
++
++/*
++ * Default IO end handler for temporary BJ_IO buffer_heads.
++ */
++void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
++{
++ BUFFER_TRACE(bh, "");
++ mark_buffer_uptodate(bh, uptodate);
++ unlock_buffer(bh);
++}
++
++/*
++ * journal_commit_transaction
++ *
++ * The primary function for committing a transaction to the log. This
++ * function is called by the journal thread to begin a complete commit.
++ */
++void journal_commit_transaction(journal_t *journal)
++{
++ transaction_t *commit_transaction;
++ struct journal_head *jh, *new_jh, *descriptor;
++ struct journal_head *next_jh, *last_jh;
++ struct buffer_head *wbuf[64];
++ int bufs;
++ int flags;
++ int err;
++ unsigned long blocknr;
++ char *tagp = NULL;
++ journal_header_t *header;
++ journal_block_tag_t *tag = NULL;
++ int space_left = 0;
++ int first_tag = 0;
++ int tag_flag;
++ int i;
++
++ /*
++ * First job: lock down the current transaction and wait for
++ * all outstanding updates to complete.
++ */
++
++ lock_journal(journal); /* Protect journal->j_running_transaction */
++
++#ifdef COMMIT_STATS
++ spin_lock(&journal_datalist_lock);
++ summarise_journal_usage(journal);
++ spin_unlock(&journal_datalist_lock);
++#endif
++
++ lock_kernel();
++
++ J_ASSERT (journal->j_running_transaction != NULL);
++ J_ASSERT (journal->j_committing_transaction == NULL);
++
++ commit_transaction = journal->j_running_transaction;
++ J_ASSERT (commit_transaction->t_state == T_RUNNING);
++
++ jbd_debug (1, "JBD: starting commit of transaction %d\n",
++ commit_transaction->t_tid);
++
++ commit_transaction->t_state = T_LOCKED;
++ while (commit_transaction->t_updates != 0) {
++ unlock_journal(journal);
++ sleep_on(&journal->j_wait_updates);
++ lock_journal(journal);
++ }
++
++ J_ASSERT (commit_transaction->t_outstanding_credits <=
++ journal->j_max_transaction_buffers);
++
++ /* Do we need to erase the effects of a prior journal_flush? */
++ if (journal->j_flags & JFS_FLUSHED) {
++ jbd_debug(3, "super block updated\n");
++ journal_update_superblock(journal, 1);
++ } else {
++ jbd_debug(3, "superblock not updated\n");
++ }
++
++ /*
++ * First thing we are allowed to do is to discard any remaining
++ * BJ_Reserved buffers. Note, it is _not_ permissible to assume
++ * that there are no such buffers: if a large filesystem
++ * operation like a truncate needs to split itself over multiple
++ * transactions, then it may try to do a journal_restart() while
++ * there are still BJ_Reserved buffers outstanding. These must
++ * be released cleanly from the current transaction.
++ *
++ * In this case, the filesystem must still reserve write access
++ * again before modifying the buffer in the new transaction, but
++ * we do not require it to remember exactly which old buffers it
++ * has reserved. This is consistent with the existing behaviour
++ * that multiple journal_get_write_access() calls to the same
++ * buffer are perfectly permissible.
++ */
++
++ while (commit_transaction->t_reserved_list) {
++ jh = commit_transaction->t_reserved_list;
++ JBUFFER_TRACE(jh, "reserved, unused: refile");
++ journal_refile_buffer(jh);
++ }
++
++ /*
++ * Now try to drop any written-back buffers from the journal's
++ * checkpoint lists. We do this *before* commit because it potentially
++ * frees some memory
++ */
++ spin_lock(&journal_datalist_lock);
++ __journal_clean_checkpoint_list(journal);
++ spin_unlock(&journal_datalist_lock);
++
++ /* First part of the commit: force the revoke list out to disk.
++ * The revoke code generates its own metadata blocks on disk for this.
++ *
++ * It is important that we do this while the transaction is
++ * still locked. Generating the revoke records should not
++ * generate any IO stalls, so this should be quick; and doing
++ * the work while we have the transaction locked means that we
++ * only ever have to maintain the revoke list for one
++ * transaction at a time.
++ */
++
++ jbd_debug (3, "JBD: commit phase 1\n");
++
++ journal_write_revoke_records(journal, commit_transaction);
++
++ /*
++ * Now that we have built the revoke records, we can start
++ * reusing the revoke list for a new running transaction. We
++ * can now safely start committing the old transaction: time to
++ * get a new running transaction for incoming filesystem updates
++ */
++
++ commit_transaction->t_state = T_FLUSH;
++
++ wake_up(&journal->j_wait_transaction_locked);
++
++ journal->j_committing_transaction = commit_transaction;
++ journal->j_running_transaction = NULL;
++
++ commit_transaction->t_log_start = journal->j_head;
++
++ unlock_kernel();
++
++ jbd_debug (3, "JBD: commit phase 2\n");
++
++ /*
++ * Now start flushing things to disk, in the order they appear
++ * on the transaction lists. Data blocks go first.
++ */
++
++ /*
++ * Whenever we unlock the journal and sleep, things can get added
++ * onto ->t_datalist, so we have to keep looping back to write_out_data
++ * until we *know* that the list is empty.
++ */
++write_out_data:
++
++ /*
++ * Cleanup any flushed data buffers from the data list. Even in
++ * abort mode, we want to flush this out as soon as possible.
++ *
++ * We take journal_datalist_lock to protect the lists from
++ * journal_try_to_free_buffers().
++ */
++ spin_lock(&journal_datalist_lock);
++
++write_out_data_locked:
++ bufs = 0;
++ next_jh = commit_transaction->t_sync_datalist;
++ if (next_jh == NULL)
++ goto sync_datalist_empty;
++ last_jh = next_jh->b_tprev;
++
++ do {
++ struct buffer_head *bh;
++
++ jh = next_jh;
++ next_jh = jh->b_tnext;
++ bh = jh2bh(jh);
++ if (!buffer_locked(bh)) {
++ if (buffer_dirty(bh)) {
++ BUFFER_TRACE(bh, "start journal writeout");
++ atomic_inc(&bh->b_count);
++ wbuf[bufs++] = bh;
++ } else {
++ BUFFER_TRACE(bh, "writeout complete: unfile");
++ __journal_unfile_buffer(jh);
++ jh->b_transaction = NULL;
++ __journal_remove_journal_head(bh);
++ refile_buffer(bh);
++ __brelse(bh);
++ }
++ }
++ if (bufs == ARRAY_SIZE(wbuf)) {
++ /*
++ * Major speedup: start here on the next scan
++ */
++ J_ASSERT(commit_transaction->t_sync_datalist != 0);
++ commit_transaction->t_sync_datalist = jh;
++ break;
++ }
++ } while (jh != last_jh);
++
++ if (bufs || current->need_resched) {
++ jbd_debug(2, "submit %d writes\n", bufs);
++ spin_unlock(&journal_datalist_lock);
++ unlock_journal(journal);
++ if (bufs)
++ ll_rw_block(WRITE, bufs, wbuf);
++ if (current->need_resched)
++ schedule();
++ journal_brelse_array(wbuf, bufs);
++ lock_journal(journal);
++ spin_lock(&journal_datalist_lock);
++ if (bufs)
++ goto write_out_data_locked;
++ }
++
++ /*
++ * Wait for all previously submitted IO on the data list to complete.
++ */
++ jh = commit_transaction->t_sync_datalist;
++ if (jh == NULL)
++ goto sync_datalist_empty;
++
++ do {
++ struct buffer_head *bh;
++ jh = jh->b_tprev; /* Wait on the last written */
++ bh = jh2bh(jh);
++ if (buffer_locked(bh)) {
++ spin_unlock(&journal_datalist_lock);
++ unlock_journal(journal);
++ wait_on_buffer(bh);
++ /* the journal_head may have been removed now */
++ lock_journal(journal);
++ goto write_out_data;
++ } else if (buffer_dirty(bh)) {
++ goto write_out_data_locked;
++ }
++ } while (jh != commit_transaction->t_sync_datalist);
++ goto write_out_data_locked;
++
++sync_datalist_empty:
++ /*
++ * Wait for all the async writepage data. As they become unlocked
++ * in end_buffer_io_async(), the only place where they can be
++ * reaped is in try_to_free_buffers(), and we're locked against
++ * that.
++ */
++ while ((jh = commit_transaction->t_async_datalist)) {
++ struct buffer_head *bh = jh2bh(jh);
++ if (buffer_locked(bh)) {
++ spin_unlock(&journal_datalist_lock);
++ unlock_journal(journal);
++ wait_on_buffer(bh);
++ lock_journal(journal);
++ spin_lock(&journal_datalist_lock);
++ continue; /* List may have changed */
++ }
++ if (jh->b_next_transaction) {
++ /*
++ * For writepage() buffers in journalled data mode: a
++ * later transaction may want the buffer for "metadata"
++ */
++ __journal_refile_buffer(jh);
++ } else {
++ BUFFER_TRACE(bh, "finished async writeout: unfile");
++ __journal_unfile_buffer(jh);
++ jh->b_transaction = NULL;
++ __journal_remove_journal_head(bh);
++ BUFFER_TRACE(bh, "finished async writeout: refile");
++ /* It can sometimes be on BUF_LOCKED due to migration
++ * from syncdata to asyncdata */
++ if (bh->b_list != BUF_CLEAN)
++ refile_buffer(bh);
++ __brelse(bh);
++ }
++ }
++ spin_unlock(&journal_datalist_lock);
++
++ /*
++ * If we found any dirty or locked buffers, then we should have
++ * looped back up to the write_out_data label. If there weren't
++ * any then journal_clean_data_list should have wiped the list
++ * clean by now, so check that it is in fact empty.
++ */
++ J_ASSERT (commit_transaction->t_sync_datalist == NULL);
++ J_ASSERT (commit_transaction->t_async_datalist == NULL);
++
++ jbd_debug (3, "JBD: commit phase 3\n");
++
++ /*
++ * Way to go: we have now written out all of the data for a
++ * transaction! Now comes the tricky part: we need to write out
++ * metadata. Loop over the transaction's entire buffer list:
++ */
++ commit_transaction->t_state = T_COMMIT;
++
++ descriptor = 0;
++ bufs = 0;
++ while (commit_transaction->t_buffers) {
++
++ /* Find the next buffer to be journaled... */
++
++ jh = commit_transaction->t_buffers;
++
++ /* If we're in abort mode, we just un-journal the buffer and
++ release it for background writing. */
++
++ if (is_journal_aborted(journal)) {
++ JBUFFER_TRACE(jh, "journal is aborting: refile");
++ journal_refile_buffer(jh);
++ /* If that was the last one, we need to clean up
++ * any descriptor buffers which may have been
++ * already allocated, even if we are now
++ * aborting. */
++ if (!commit_transaction->t_buffers)
++ goto start_journal_io;
++ continue;
++ }
++
++ /* Make sure we have a descriptor block in which to
++ record the metadata buffer. */
++
++ if (!descriptor) {
++ struct buffer_head *bh;
++
++ J_ASSERT (bufs == 0);
++
++ jbd_debug(4, "JBD: get descriptor\n");
++
++ descriptor = journal_get_descriptor_buffer(journal);
++ if (!descriptor) {
++ __journal_abort_hard(journal);
++ continue;
++ }
++
++ bh = jh2bh(descriptor);
++ jbd_debug(4, "JBD: got buffer %ld (%p)\n",
++ bh->b_blocknr, bh->b_data);
++ header = (journal_header_t *)&bh->b_data[0];
++ header->h_magic = htonl(JFS_MAGIC_NUMBER);
++ header->h_blocktype = htonl(JFS_DESCRIPTOR_BLOCK);
++ header->h_sequence = htonl(commit_transaction->t_tid);
++
++ tagp = &bh->b_data[sizeof(journal_header_t)];
++ space_left = bh->b_size - sizeof(journal_header_t);
++ first_tag = 1;
++ set_bit(BH_JWrite, &bh->b_state);
++ wbuf[bufs++] = bh;
++
++ /* Record it so that we can wait for IO
++ completion later */
++ BUFFER_TRACE(bh, "ph3: file as descriptor");
++ journal_file_buffer(descriptor, commit_transaction,
++ BJ_LogCtl);
++ }
++
++ /* Where is the buffer to be written? */
++
++ err = journal_next_log_block(journal, &blocknr);
++ /* If the block mapping failed, just abandon the buffer
++ and repeat this loop: we'll fall into the
++ refile-on-abort condition above. */
++ if (err) {
++ __journal_abort_hard(journal);
++ continue;
++ }
++
++ /* Bump b_count to prevent truncate from stumbling over
++ the shadowed buffer! @@@ This can go if we ever get
++ rid of the BJ_IO/BJ_Shadow pairing of buffers. */
++ atomic_inc(&jh2bh(jh)->b_count);
++
++ /* Make a temporary IO buffer with which to write it out
++ (this will requeue both the metadata buffer and the
++ temporary IO buffer). new_bh goes on BJ_IO*/
++
++ set_bit(BH_JWrite, &jh2bh(jh)->b_state);
++ /*
++ * akpm: journal_write_metadata_buffer() sets
++ * new_bh->b_transaction to commit_transaction.
++ * We need to clean this up before we release new_bh
++ * (which is of type BJ_IO)
++ */
++ JBUFFER_TRACE(jh, "ph3: write metadata");
++ flags = journal_write_metadata_buffer(commit_transaction,
++ jh, &new_jh, blocknr);
++ set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
++ set_bit(BH_Lock, &jh2bh(new_jh)->b_state);
++ wbuf[bufs++] = jh2bh(new_jh);
++
++ /* Record the new block's tag in the current descriptor
++ buffer */
++
++ tag_flag = 0;
++ if (flags & 1)
++ tag_flag |= JFS_FLAG_ESCAPE;
++ if (!first_tag)
++ tag_flag |= JFS_FLAG_SAME_UUID;
++
++ tag = (journal_block_tag_t *) tagp;
++ tag->t_blocknr = htonl(jh2bh(jh)->b_blocknr);
++ tag->t_flags = htonl(tag_flag);
++ tagp += sizeof(journal_block_tag_t);
++ space_left -= sizeof(journal_block_tag_t);
++
++ if (first_tag) {
++ memcpy (tagp, journal->j_uuid, 16);
++ tagp += 16;
++ space_left -= 16;
++ first_tag = 0;
++ }
++
++ /* If there's no more to do, or if the descriptor is full,
++ let the IO rip! */
++
++ if (bufs == ARRAY_SIZE(wbuf) ||
++ commit_transaction->t_buffers == NULL ||
++ space_left < sizeof(journal_block_tag_t) + 16) {
++
++ jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
++
++ /* Write an end-of-descriptor marker before
++ submitting the IOs. "tag" still points to
++ the last tag we set up. */
++
++ tag->t_flags |= htonl(JFS_FLAG_LAST_TAG);
++
++start_journal_io:
++ unlock_journal(journal);
++ for (i=0; i<bufs; i++) {
++ struct buffer_head *bh = wbuf[i];
++ clear_bit(BH_Dirty, &bh->b_state);
++ bh->b_end_io = journal_end_buffer_io_sync;
++ submit_bh(WRITE, bh);
++ }
++ if (current->need_resched)
++ schedule();
++ lock_journal(journal);
++
++ /* Force a new descriptor to be generated next
++ time round the loop. */
++ descriptor = NULL;
++ bufs = 0;
++ }
++ }
++
++ /* Lo and behold: we have just managed to send a transaction to
++ the log. Before we can commit it, wait for the IO so far to
++ complete. Control buffers being written are on the
++ transaction's t_log_list queue, and metadata buffers are on
++ the t_iobuf_list queue.
++
++ Wait for the transactions in reverse order. That way we are
++ less likely to be woken up until all IOs have completed, and
++ so we incur less scheduling load.
++ */
++
++ jbd_debug(3, "JBD: commit phase 4\n");
++
++ /* akpm: these are BJ_IO, and journal_datalist_lock is not needed */
++ wait_for_iobuf:
++ while (commit_transaction->t_iobuf_list != NULL) {
++ struct buffer_head *bh;
++ jh = commit_transaction->t_iobuf_list->b_tprev;
++ bh = jh2bh(jh);
++ if (buffer_locked(bh)) {
++ unlock_journal(journal);
++ wait_on_buffer(bh);
++ lock_journal(journal);
++ goto wait_for_iobuf;
++ }
++
++ clear_bit(BH_JWrite, &jh2bh(jh)->b_state);
++
++ JBUFFER_TRACE(jh, "ph4: unfile after journal write");
++ journal_unfile_buffer(jh);
++
++ /*
++ * akpm: don't put back a buffer_head with stale pointers
++ * dangling around.
++ */
++ J_ASSERT_JH(jh, jh->b_transaction != NULL);
++ jh->b_transaction = NULL;
++
++ /*
++ * ->t_iobuf_list should contain only dummy buffer_heads
++ * which were created by journal_write_metadata_buffer().
++ */
++ bh = jh2bh(jh);
++ BUFFER_TRACE(bh, "dumping temporary bh");
++ journal_unlock_journal_head(jh);
++ __brelse(bh);
++ J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
++ put_unused_buffer_head(bh);
++
++ /* We also have to unlock and free the corresponding
++ shadowed buffer */
++ jh = commit_transaction->t_shadow_list->b_tprev;
++ bh = jh2bh(jh);
++ clear_bit(BH_JWrite, &bh->b_state);
++ J_ASSERT_BH(bh, buffer_jdirty(bh));
++
++ /* The metadata is now released for reuse, but we need
++ to remember it against this transaction so that when
++ we finally commit, we can do any checkpointing
++ required. */
++ JBUFFER_TRACE(jh, "file as BJ_Forget");
++ journal_file_buffer(jh, commit_transaction, BJ_Forget);
++ /* Wake up any transactions which were waiting for this
++ IO to complete */
++ wake_up(&bh->b_wait);
++ JBUFFER_TRACE(jh, "brelse shadowed buffer");
++ __brelse(bh);
++ }
++
++ J_ASSERT (commit_transaction->t_shadow_list == NULL);
++
++ jbd_debug(3, "JBD: commit phase 5\n");
++
++ /* Here we wait for the revoke record and descriptor record buffers */
++ wait_for_ctlbuf:
++ while (commit_transaction->t_log_list != NULL) {
++ struct buffer_head *bh;
++
++ jh = commit_transaction->t_log_list->b_tprev;
++ bh = jh2bh(jh);
++ if (buffer_locked(bh)) {
++ unlock_journal(journal);
++ wait_on_buffer(bh);
++ lock_journal(journal);
++ goto wait_for_ctlbuf;
++ }
++
++ BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
++ clear_bit(BH_JWrite, &bh->b_state);
++ journal_unfile_buffer(jh);
++ jh->b_transaction = NULL;
++ journal_unlock_journal_head(jh);
++ put_bh(bh); /* One for getblk */
++ }
++
++ jbd_debug(3, "JBD: commit phase 6\n");
++
++ if (is_journal_aborted(journal))
++ goto skip_commit;
++
++ /* Done it all: now write the commit record. We should have
++ * cleaned up our previous buffers by now, so if we are in abort
++ * mode we can now just skip the rest of the journal write
++ * entirely. */
++
++ descriptor = journal_get_descriptor_buffer(journal);
++ if (!descriptor) {
++ __journal_abort_hard(journal);
++ goto skip_commit;
++ }
++
++	/* Stamp a commit header into each 512-byte sector of the
++	   descriptor block (fixing the "add `i' to tmp" buglet AKPM
++	   noted here). */
++	for (i = 0; i < jh2bh(descriptor)->b_size; i += 512) {
++		journal_header_t *tmp = (journal_header_t *)
++			(jh2bh(descriptor)->b_data + i);
++ tmp->h_magic = htonl(JFS_MAGIC_NUMBER);
++ tmp->h_blocktype = htonl(JFS_COMMIT_BLOCK);
++ tmp->h_sequence = htonl(commit_transaction->t_tid);
++ }
++
++ unlock_journal(journal);
++ JBUFFER_TRACE(descriptor, "write commit block");
++ {
++ struct buffer_head *bh = jh2bh(descriptor);
++ clear_bit(BH_Dirty, &bh->b_state);
++ bh->b_end_io = journal_end_buffer_io_sync;
++ submit_bh(WRITE, bh);
++ wait_on_buffer(bh);
++ put_bh(bh); /* One for getblk() */
++ journal_unlock_journal_head(descriptor);
++ }
++ lock_journal(journal);
++
++ /* End of a transaction! Finally, we can do checkpoint
++ processing: any buffers committed as a result of this
++ transaction can be removed from any checkpoint list it was on
++ before. */
++
++skip_commit:
++
++ jbd_debug(3, "JBD: commit phase 7\n");
++
++ J_ASSERT(commit_transaction->t_sync_datalist == NULL);
++ J_ASSERT(commit_transaction->t_async_datalist == NULL);
++ J_ASSERT(commit_transaction->t_buffers == NULL);
++ J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
++ J_ASSERT(commit_transaction->t_iobuf_list == NULL);
++ J_ASSERT(commit_transaction->t_shadow_list == NULL);
++ J_ASSERT(commit_transaction->t_log_list == NULL);
++
++ while (commit_transaction->t_forget) {
++ transaction_t *cp_transaction;
++ struct buffer_head *bh;
++
++ jh = commit_transaction->t_forget;
++ J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
++ jh->b_transaction == journal->j_running_transaction);
++
++ /*
++ * If there is undo-protected committed data against
++ * this buffer, then we can remove it now. If it is a
++ * buffer needing such protection, the old frozen_data
++ * field now points to a committed version of the
++ * buffer, so rotate that field to the new committed
++ * data.
++ *
++ * Otherwise, we can just throw away the frozen data now.
++ */
++ if (jh->b_committed_data) {
++ kfree(jh->b_committed_data);
++ jh->b_committed_data = NULL;
++ if (jh->b_frozen_data) {
++ jh->b_committed_data = jh->b_frozen_data;
++ jh->b_frozen_data = NULL;
++ }
++ } else if (jh->b_frozen_data) {
++ kfree(jh->b_frozen_data);
++ jh->b_frozen_data = NULL;
++ }
++
++ spin_lock(&journal_datalist_lock);
++ cp_transaction = jh->b_cp_transaction;
++ if (cp_transaction) {
++ JBUFFER_TRACE(jh, "remove from old cp transaction");
++ J_ASSERT_JH(jh, commit_transaction != cp_transaction);
++ __journal_remove_checkpoint(jh);
++ }
++
++ /* Only re-checkpoint the buffer_head if it is marked
++ * dirty. If the buffer was added to the BJ_Forget list
++ * by journal_forget, it may no longer be dirty and
++ * there's no point in keeping a checkpoint record for
++ * it. */
++ bh = jh2bh(jh);
++ if (buffer_jdirty(bh)) {
++ JBUFFER_TRACE(jh, "add to new checkpointing trans");
++ __journal_insert_checkpoint(jh, commit_transaction);
++ JBUFFER_TRACE(jh, "refile for checkpoint writeback");
++ __journal_refile_buffer(jh);
++ } else {
++ J_ASSERT_BH(bh, !buffer_dirty(bh));
++ J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
++ __journal_unfile_buffer(jh);
++ jh->b_transaction = 0;
++ __journal_remove_journal_head(bh);
++ __brelse(bh);
++ }
++ spin_unlock(&journal_datalist_lock);
++ }
++
++ /* Done with this transaction! */
++
++ jbd_debug(3, "JBD: commit phase 8\n");
++
++ J_ASSERT (commit_transaction->t_state == T_COMMIT);
++ commit_transaction->t_state = T_FINISHED;
++
++ J_ASSERT (commit_transaction == journal->j_committing_transaction);
++ journal->j_commit_sequence = commit_transaction->t_tid;
++ journal->j_committing_transaction = NULL;
++
++ spin_lock(&journal_datalist_lock);
++ if (commit_transaction->t_checkpoint_list == NULL) {
++ __journal_drop_transaction(journal, commit_transaction);
++ } else {
++ if (journal->j_checkpoint_transactions == NULL) {
++ journal->j_checkpoint_transactions = commit_transaction;
++ commit_transaction->t_cpnext = commit_transaction;
++ commit_transaction->t_cpprev = commit_transaction;
++ } else {
++ commit_transaction->t_cpnext =
++ journal->j_checkpoint_transactions;
++ commit_transaction->t_cpprev =
++ commit_transaction->t_cpnext->t_cpprev;
++ commit_transaction->t_cpnext->t_cpprev =
++ commit_transaction;
++ commit_transaction->t_cpprev->t_cpnext =
++ commit_transaction;
++ }
++ }
++ spin_unlock(&journal_datalist_lock);
++
++ jbd_debug(1, "JBD: commit %d complete, head %d\n",
++ journal->j_commit_sequence, journal->j_tail_sequence);
++
++ unlock_journal(journal);
++ wake_up(&journal->j_wait_done_commit);
++}
+diff -ruP linux.mcp2/fs/jbd/journal.c linuxppc_2.4.19_final/fs/jbd/journal.c
+--- linux.mcp2/fs/jbd/journal.c 1969-12-31 16:00:00.000000000 -0800
++++ linuxppc_2.4.19_final/fs/jbd/journal.c 2004-05-17 13:56:17.000000000 -0700
+@@ -0,0 +1,1877 @@
++/*
++ * linux/fs/journal.c
++ *
++ * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
++ *
++ * Copyright 1998 Red Hat corp --- All Rights Reserved
++ *
++ * This file is part of the Linux kernel and is made available under
++ * the terms of the GNU General Public License, version 2, or at your
++ * option, any later version, incorporated herein by reference.
++ *
++ * Generic filesystem journal-writing code; part of the ext2fs
++ * journaling system.
++ *
++ * This file manages journals: areas of disk reserved for logging
++ * transactional updates. This includes the kernel journaling thread
++ * which is responsible for scheduling updates to the log.
++ *
++ * We do not actually manage the physical storage of the journal in this
++ * file: that is left to a per-journal policy function, which allows us
++ * to store the journal within a filesystem-specified area for ext2
++ * journaling (ext2 can use a reserved inode for storing the log).
++ */
++
++#include <linux/module.h>
++#include <linux/sched.h>
++#include <linux/fs.h>
++#include <linux/jbd.h>
++#include <linux/errno.h>
++#include <linux/slab.h>
++#include <linux/locks.h>
++#include <linux/smp_lock.h>
++#include <linux/init.h>
++#include <linux/mm.h>
++#include <asm/uaccess.h>
++#include <linux/proc_fs.h>
++
++EXPORT_SYMBOL(journal_start);
++EXPORT_SYMBOL(journal_try_start);
++EXPORT_SYMBOL(journal_restart);
++EXPORT_SYMBOL(journal_extend);
++EXPORT_SYMBOL(journal_stop);
++EXPORT_SYMBOL(journal_lock_updates);
++EXPORT_SYMBOL(journal_unlock_updates);
++EXPORT_SYMBOL(journal_get_write_access);
++EXPORT_SYMBOL(journal_get_create_access);
++EXPORT_SYMBOL(journal_get_undo_access);
++EXPORT_SYMBOL(journal_dirty_data);
++EXPORT_SYMBOL(journal_dirty_metadata);
++#if 0
++EXPORT_SYMBOL(journal_release_buffer);
++#endif
++EXPORT_SYMBOL(journal_forget);
++#if 0
++EXPORT_SYMBOL(journal_sync_buffer);
++#endif
++EXPORT_SYMBOL(journal_flush);
++EXPORT_SYMBOL(journal_revoke);
++
++EXPORT_SYMBOL(journal_init_dev);
++EXPORT_SYMBOL(journal_init_inode);
++EXPORT_SYMBOL(journal_update_format);
++EXPORT_SYMBOL(journal_check_used_features);
++EXPORT_SYMBOL(journal_check_available_features);
++EXPORT_SYMBOL(journal_set_features);
++EXPORT_SYMBOL(journal_create);
++EXPORT_SYMBOL(journal_load);
++EXPORT_SYMBOL(journal_destroy);
++EXPORT_SYMBOL(journal_recover);
++EXPORT_SYMBOL(journal_update_superblock);
++EXPORT_SYMBOL(journal_abort);
++EXPORT_SYMBOL(journal_errno);
++EXPORT_SYMBOL(journal_ack_err);
++EXPORT_SYMBOL(journal_clear_err);
++EXPORT_SYMBOL(log_wait_commit);
++EXPORT_SYMBOL(log_start_commit);
++EXPORT_SYMBOL(journal_wipe);
++EXPORT_SYMBOL(journal_blocks_per_page);
++EXPORT_SYMBOL(journal_flushpage);
++EXPORT_SYMBOL(journal_try_to_free_buffers);
++EXPORT_SYMBOL(journal_bmap);
++EXPORT_SYMBOL(journal_force_commit);
++
++static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
++
++/*
++ * journal_datalist_lock is used to protect data buffers:
++ *
++ * bh->b_transaction
++ * bh->b_tprev
++ * bh->b_tnext
++ *
++ * journal_free_buffer() is called from journal_try_to_free_buffer(), and is
++ * async wrt everything else.
++ *
++ * It is also used for checkpoint data, also to protect against
++ * journal_try_to_free_buffer():
++ *
++ * bh->b_cp_transaction
++ * bh->b_cpnext
++ * bh->b_cpprev
++ * transaction->t_checkpoint_list
++ * transaction->t_cpnext
++ * transaction->t_cpprev
++ * journal->j_checkpoint_transactions
++ *
++ * It is global at this time rather than per-journal because it's
++ * impossible for __journal_free_buffer to go from a buffer_head
++ * back to a journal_t unracily (well, not true. Fix later)
++ *
++ *
++ * The `datalist' and `checkpoint list' functions are quite
++ * separate and we could use two spinlocks here.
++ *
++ * lru_list_lock nests inside journal_datalist_lock.
++ */
++spinlock_t journal_datalist_lock = SPIN_LOCK_UNLOCKED;
++
++/*
++ * jh_splice_lock needs explanation.
++ *
++ * In a number of places we want to do things like:
++ *
++ * if (buffer_jbd(bh) && bh2jh(bh)->foo)
++ *
++ * This is racy on SMP, because another CPU could remove the journal_head
++ * in the middle of this expression. We need locking.
++ *
++ * But we can greatly optimise the locking cost by testing BH_JBD
++ * outside the lock. So, effectively:
++ *
++ * ret = 0;
++ * if (buffer_jbd(bh)) {
++ * spin_lock(&jh_splice_lock);
++ * if (buffer_jbd(bh)) { (* Still there? *)
++ * ret = bh2jh(bh)->foo;
++ * }
++ * spin_unlock(&jh_splice_lock);
++ * }
++ * return ret;
++ *
++ * Now, that protects us from races where another CPU can remove the
++ * journal_head. But it doesn't defend us from the situation where another
++ * CPU can *add* a journal_head. This is a correctness issue. But it's not
++ * a problem because a) the calling code was *already* racy and b) it often
++ * can't happen at the call site and c) the places where we add journal_heads
++ * tend to be under external locking.
++ */
++spinlock_t jh_splice_lock = SPIN_LOCK_UNLOCKED;
++
++/*
++ * List of all journals in the system. Protected by the BKL.
++ */
++static LIST_HEAD(all_journals);
++
++/*
++ * Helper function used to manage commit timeouts
++ */
++
++static void commit_timeout(unsigned long __data)
++{
++ struct task_struct * p = (struct task_struct *) __data;
++
++ wake_up_process(p);
++}
++
++/* Static check for data structure consistency. There's no code
++ * invoked --- we'll just get a linker failure if things aren't right.
++ */
++void __journal_internal_check(void)
++{
++ extern void journal_bad_superblock_size(void);
++ if (sizeof(struct journal_superblock_s) != 1024)
++ journal_bad_superblock_size();
++}
++
++/*
++ * kjournald: The main thread function used to manage a logging device
++ * journal.
++ *
++ * This kernel thread is responsible for two things:
++ *
++ * 1) COMMIT: Every so often we need to commit the current state of the
++ * filesystem to disk. The journal thread is responsible for writing
++ * all of the metadata buffers to disk.
++ *
++ * 2) CHECKPOINT: We cannot reuse a used section of the log file until all
++ * of the data in that part of the log has been rewritten elsewhere on
++ * the disk. Flushing these old buffers to reclaim space in the log is
++ * known as checkpointing, and this thread is responsible for that job.
++ */
++
++journal_t *current_journal; // AKPM: debug
++
++int kjournald(void *arg)
++{
++ journal_t *journal = (journal_t *) arg;
++ transaction_t *transaction;
++ struct timer_list timer;
++
++ current_journal = journal;
++
++ lock_kernel();
++ daemonize();
++ reparent_to_init();
++	spin_lock_irq(&current->sigmask_lock);
++	sigfillset(&current->blocked);
++	recalc_sigpending(current);
++	spin_unlock_irq(&current->sigmask_lock);
++
++ sprintf(current->comm, "kjournald");
++
++ /* Set up an interval timer which can be used to trigger a
++ commit wakeup after the commit interval expires */
++ init_timer(&timer);
++ timer.data = (unsigned long) current;
++ timer.function = commit_timeout;
++ journal->j_commit_timer = &timer;
++
++ /* Record that the journal thread is running */
++ journal->j_task = current;
++ wake_up(&journal->j_wait_done_commit);
++
++ printk(KERN_INFO "kjournald starting. Commit interval %ld seconds\n",
++ journal->j_commit_interval / HZ);
++ list_add(&journal->j_all_journals, &all_journals);
++
++ /* And now, wait forever for commit wakeup events. */
++ while (1) {
++ if (journal->j_flags & JFS_UNMOUNT)
++ break;
++
++ jbd_debug(1, "commit_sequence=%d, commit_request=%d\n",
++ journal->j_commit_sequence, journal->j_commit_request);
++
++ if (journal->j_commit_sequence != journal->j_commit_request) {
++ jbd_debug(1, "OK, requests differ\n");
++ if (journal->j_commit_timer_active) {
++ journal->j_commit_timer_active = 0;
++ del_timer(journal->j_commit_timer);
++ }
++
++ journal_commit_transaction(journal);
++ continue;
++ }
++
++ wake_up(&journal->j_wait_done_commit);
++ interruptible_sleep_on(&journal->j_wait_commit);
++
++ jbd_debug(1, "kjournald wakes\n");
++
++ /* Were we woken up by a commit wakeup event? */
++ if ((transaction = journal->j_running_transaction) != NULL &&
++ time_after_eq(jiffies, transaction->t_expires)) {
++ journal->j_commit_request = transaction->t_tid;
++ jbd_debug(1, "woke because of timeout\n");
++ }
++ }
++
++ if (journal->j_commit_timer_active) {
++ journal->j_commit_timer_active = 0;
++ del_timer_sync(journal->j_commit_timer);
++ }
++
++ list_del(&journal->j_all_journals);
++
++ journal->j_task = NULL;
++ wake_up(&journal->j_wait_done_commit);
++ unlock_kernel();
++ jbd_debug(1, "Journal thread exiting.\n");
++ return 0;
++}
++
++static void journal_start_thread(journal_t *journal)
++{
++ kernel_thread(kjournald, (void *) journal,
++ CLONE_VM | CLONE_FS | CLONE_FILES);
++ while (!journal->j_task)
++ sleep_on(&journal->j_wait_done_commit);
++}
++
++static void journal_kill_thread(journal_t *journal)
++{
++ journal->j_flags |= JFS_UNMOUNT;
++
++ while (journal->j_task) {
++ wake_up(&journal->j_wait_commit);
++ sleep_on(&journal->j_wait_done_commit);
++ }
++}
++
++#if 0
++
++This is no longer needed - we do it in commit quite efficiently.
++Note that if this function is resurrected, the loop needs to
++be reorganised into the next_jh/last_jh algorithm.
++
++/*
++ * journal_clean_data_list: cleanup after data IO.
++ *
++ * Once the IO system has finished writing the buffers on the transaction's
++ * data list, we can remove those buffers from the list. This function
++ * scans the list for such buffers and removes them cleanly.
++ *
++ * We assume that the journal is already locked.
++ * We are called with journal_datalist_lock held.
++ *
++ * AKPM: This function looks inefficient. Approximately O(n^2)
++ * for potentially thousands of buffers. It no longer shows on profiles
++ * because these buffers are mainly dropped in journal_commit_transaction().
++ */
++
++void __journal_clean_data_list(transaction_t *transaction)
++{
++ struct journal_head *jh, *next;
++
++ assert_spin_locked(&journal_datalist_lock);
++
++restart:
++ jh = transaction->t_sync_datalist;
++ if (!jh)
++ goto out;
++ do {
++ next = jh->b_tnext;
++ if (!buffer_locked(jh2bh(jh)) && !buffer_dirty(jh2bh(jh))) {
++ struct buffer_head *bh = jh2bh(jh);
++ BUFFER_TRACE(bh, "data writeout complete: unfile");
++ __journal_unfile_buffer(jh);
++ jh->b_transaction = NULL;
++ __journal_remove_journal_head(bh);
++ refile_buffer(bh);
++ __brelse(bh);
++ goto restart;
++ }
++ jh = next;
++ } while (transaction->t_sync_datalist &&
++ jh != transaction->t_sync_datalist);
++out:
++ return;
++}
++#endif
++
++/*
++ * journal_write_metadata_buffer: write a metadata buffer to the journal.
++ *
++ * Writes a metadata buffer to a given disk block. The actual IO is not
++ * performed but a new buffer_head is constructed which labels the data
++ * to be written with the correct destination disk block.
++ *
++ * Any magic-number escaping which needs to be done will cause a
++ * copy-out here. If the buffer happens to start with the
++ * JFS_MAGIC_NUMBER, then we can't write it to the log directly: the
++ * magic number is only written to the log for descriptor blocks. In
++ * this case, we copy the data and replace the first word with 0, and we
++ * return a result code which indicates that this buffer needs to be
++ * marked as an escaped buffer in the corresponding log descriptor
++ * block. The missing word can then be restored when the block is read
++ * during recovery.
++ *
++ * If the source buffer has already been modified by a new transaction
++ * since we took the last commit snapshot, we use the frozen copy of
++ * that data for IO. If we end up using the existing buffer_head's data
++ * for the write, then we *have* to lock the buffer to prevent anyone
++ * else from using and possibly modifying it while the IO is in
++ * progress.
++ *
++ * The function returns a pointer to the buffer_heads to be used for IO.
++ *
++ * We assume that the journal has already been locked in this function.
++ *
++ * Return value:
++ * <0: Error
++ * >=0: Finished OK
++ *
++ * On success:
++ * Bit 0 set == escape performed on the data
++ * Bit 1 set == buffer copy-out performed (kfree the data after IO)
++ */
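++
++/*
++ * Concretely: a data block whose first word happens to equal
++ * JFS_MAGIC_NUMBER is copied out, the copy's first word is zeroed,
++ * and the commit code later sets JFS_FLAG_ESCAPE in that block's
++ * descriptor tag, so recovery knows to restore the magic number when
++ * replaying the block.
++ */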
++
++static inline unsigned long virt_to_offset(void *p)
++{
++	return ((unsigned long) p) & ~PAGE_MASK;
++}
++
++int journal_write_metadata_buffer(transaction_t *transaction,
++ struct journal_head *jh_in,
++ struct journal_head **jh_out,
++ int blocknr)
++{
++ int need_copy_out = 0;
++ int done_copy_out = 0;
++ int do_escape = 0;
++ char *mapped_data;
++ struct buffer_head *new_bh;
++ struct journal_head * new_jh;
++ struct page *new_page;
++ unsigned int new_offset;
++
++ /*
++ * The buffer really shouldn't be locked: only the current committing
++ * transaction is allowed to write it, so nobody else is allowed
++ * to do any IO.
++ *
++ * akpm: except if we're journalling data, and write() output is
++ * also part of a shared mapping, and another thread has
++ * decided to launch a writepage() against this buffer.
++ */
++ J_ASSERT_JH(jh_in, buffer_jdirty(jh2bh(jh_in)));
++
++ /*
++ * If a new transaction has already done a buffer copy-out, then
++ * we use that version of the data for the commit.
++ */
++
++ if (jh_in->b_frozen_data) {
++ done_copy_out = 1;
++ new_page = virt_to_page(jh_in->b_frozen_data);
++ new_offset = virt_to_offset(jh_in->b_frozen_data);
++ } else {
++ new_page = jh2bh(jh_in)->b_page;
++ new_offset = virt_to_offset(jh2bh(jh_in)->b_data);
++ }
++
++ mapped_data = ((char *) kmap(new_page)) + new_offset;
++
++ /*
++ * Check for escaping
++ */
++ if (* ((unsigned int *) mapped_data) == htonl(JFS_MAGIC_NUMBER)) {
++ need_copy_out = 1;
++ do_escape = 1;
++ }
++
++ /*
++ * Do we need to do a data copy?
++ */
++
++ if (need_copy_out && !done_copy_out) {
++ char *tmp;
++ tmp = jbd_rep_kmalloc(jh2bh(jh_in)->b_size, GFP_NOFS);
++
++ jh_in->b_frozen_data = tmp;
++ memcpy (tmp, mapped_data, jh2bh(jh_in)->b_size);
++
++ /* If we get to this path, we'll always need the new
++ address kmapped so that we can clear the escaped
++ magic number below. */
++ kunmap(new_page);
++ new_page = virt_to_page(tmp);
++ new_offset = virt_to_offset(tmp);
++ mapped_data = ((char *) kmap(new_page)) + new_offset;
++
++ done_copy_out = 1;
++ }
++
++ /*
++ * Right, time to make up the new buffer_head.
++ */
++ do {
++ new_bh = get_unused_buffer_head(0);
++ if (!new_bh) {
++ printk (KERN_NOTICE __FUNCTION__
++ ": ENOMEM at get_unused_buffer_head, "
++ "trying again.\n");
++ current->policy |= SCHED_YIELD;
++ schedule();
++ }
++ } while (!new_bh);
++ /* keep subsequent assertions sane */
++ new_bh->b_prev_free = 0;
++ new_bh->b_next_free = 0;
++ new_bh->b_state = 0;
++ init_buffer(new_bh, NULL, NULL);
++ atomic_set(&new_bh->b_count, 1);
++ new_jh = journal_add_journal_head(new_bh);
++
++ set_bh_page(new_bh, new_page, new_offset);
++
++ new_jh->b_transaction = NULL;
++ new_bh->b_size = jh2bh(jh_in)->b_size;
++ new_bh->b_dev = transaction->t_journal->j_dev;
++ new_bh->b_blocknr = blocknr;
++ new_bh->b_state |= (1 << BH_Mapped) | (1 << BH_Dirty);
++
++ *jh_out = new_jh;
++
++ /*
++ * Did we need to do an escaping? Now we've done all the
++ * copying, we can finally do so.
++ */
++
++ if (do_escape)
++ * ((unsigned int *) mapped_data) = 0;
++ kunmap(new_page);
++
++ /*
++ * The to-be-written buffer needs to get moved to the io queue,
++ * and the original buffer whose contents we are shadowing or
++ * copying is moved to the transaction's shadow queue.
++ */
++ JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
++ journal_file_buffer(jh_in, transaction, BJ_Shadow);
++ JBUFFER_TRACE(new_jh, "file as BJ_IO");
++ journal_file_buffer(new_jh, transaction, BJ_IO);
++
++ return do_escape | (done_copy_out << 1);
++}
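++
++/*
++ * Sketch of the caller side (cf. the commit-phase code earlier in this
++ * patch), showing how the two return bits are consumed; error handling
++ * elided:
++ *
++ *	flags = journal_write_metadata_buffer(commit_transaction,
++ *					      jh, &new_jh, blocknr);
++ *	if (flags & 1)			(bit 0: data was escaped)
++ *		tag_flag |= JFS_FLAG_ESCAPE;
++ *
++ * If bit 1 is set, a private copy was made and jh->b_frozen_data is
++ * later freed, or rotated into b_committed_data, in commit phase 7.
++ */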
++
++/*
++ * Allocation code for the journal file. Manage the space left in the
++ * journal, so that we can begin checkpointing when appropriate.
++ */
++
++/*
++ * log_space_left: Return the number of free blocks left in the journal.
++ *
++ * Called with the journal already locked.
++ */
++
++int log_space_left (journal_t *journal)
++{
++ int left = journal->j_free;
++
++ /* Be pessimistic here about the number of those free blocks
++ * which might be required for log descriptor control blocks. */
++
++#define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */
++
++ left -= MIN_LOG_RESERVED_BLOCKS;
++
++ if (left <= 0)
++ return 0;
++ left -= (left >> 3);
++ return left;
++}
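++
++/*
++ * Worked example, assuming j_free == 1024: left = 1024 - 32 = 992
++ * after the descriptor-block reserve, then 992 - (992 >> 3) = 868,
++ * so callers are told 868 blocks remain rather than 1024.
++ */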
++
++/*
++ * This function must be non-allocating for PF_MEMALLOC tasks
++ */
++tid_t log_start_commit (journal_t *journal, transaction_t *transaction)
++{
++ tid_t target = journal->j_commit_request;
++
++ lock_kernel(); /* Protect journal->j_running_transaction */
++
++ /*
++ * A NULL transaction asks us to commit the currently running
++ * transaction, if there is one.
++ */
++ if (transaction)
++ target = transaction->t_tid;
++ else {
++ transaction = journal->j_running_transaction;
++ if (!transaction)
++ goto out;
++ target = transaction->t_tid;
++ }
++
++ /*
++ * Are we already doing a recent enough commit?
++ */
++ if (tid_geq(journal->j_commit_request, target))
++ goto out;
++
++ /*
++ * We want a new commit: OK, mark the request and wake up the
++ * commit thread. We do _not_ do the commit ourselves.
++ */
++
++ journal->j_commit_request = target;
++ jbd_debug(1, "JBD: requesting commit %d/%d\n",
++ journal->j_commit_request,
++ journal->j_commit_sequence);
++ wake_up(&journal->j_wait_commit);
++
++out:
++ unlock_kernel();
++ return target;
++}
++
++/*
++ * Wait for a specified commit to complete.
++ * The caller may not hold the journal lock.
++ */
++void log_wait_commit (journal_t *journal, tid_t tid)
++{
++ lock_kernel();
++#ifdef CONFIG_JBD_DEBUG
++ lock_journal(journal);
++ if (!tid_geq(journal->j_commit_request, tid)) {
++ printk(KERN_EMERG __FUNCTION__
++ ": error: j_commit_request=%d, tid=%d\n",
++ journal->j_commit_request, tid);
++ }
++ unlock_journal(journal);
++#endif
++ while (tid_gt(tid, journal->j_commit_sequence)) {
++ jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n",
++ tid, journal->j_commit_sequence);
++ wake_up(&journal->j_wait_commit);
++ sleep_on(&journal->j_wait_done_commit);
++ }
++ unlock_kernel();
++}
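++
++/*
++ * The two routines above combine into the usual "force a commit and
++ * wait for it" pattern (roughly what journal_flush() does when a
++ * transaction is running):
++ *
++ *	tid_t tid = log_start_commit(journal, NULL);
++ *	log_wait_commit(journal, tid);
++ */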
++
++/*
++ * Log buffer allocation routines:
++ */
++
++int journal_next_log_block(journal_t *journal, unsigned long *retp)
++{
++ unsigned long blocknr;
++
++ J_ASSERT(journal->j_free > 1);
++
++ blocknr = journal->j_head;
++ journal->j_head++;
++ journal->j_free--;
++ if (journal->j_head == journal->j_last)
++ journal->j_head = journal->j_first;
++ return journal_bmap(journal, blocknr, retp);
++}
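++
++/*
++ * For example, with j_first == 1 and j_last == 1024 this hands out
++ * logical blocks 1, 2, ..., 1023 and then wraps back to 1; logical
++ * block 0 is never allocated here because it holds the journal
++ * superblock.
++ */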
++
++/*
++ * Conversion of logical to physical block numbers for the journal
++ *
++ * On external journals the journal blocks are identity-mapped, so
++ * this is a no-op. If needed, we can use j_blk_offset - everything is
++ * ready.
++ */
++int journal_bmap(journal_t *journal, unsigned long blocknr,
++ unsigned long *retp)
++{
++ int err = 0;
++ unsigned long ret;
++
++ if (journal->j_inode) {
++ ret = bmap(journal->j_inode, blocknr);
++ if (ret)
++ *retp = ret;
++ else {
++ printk (KERN_ALERT __FUNCTION__
++ ": journal block not found "
++ "at offset %lu on %s\n",
++ blocknr, bdevname(journal->j_dev));
++ err = -EIO;
++ __journal_abort_soft(journal, err);
++ }
++ } else {
++ *retp = blocknr; /* +journal->j_blk_offset */
++ }
++ return err;
++}
++
++/*
++ * We play buffer_head aliasing tricks to write data/metadata blocks to
++ * the journal without copying their contents, but for journal
++ * descriptor blocks we do need to generate bona fide buffers.
++ *
++ * We return a jh whose bh is locked and ready to be populated.
++ */
++
++struct journal_head * journal_get_descriptor_buffer(journal_t *journal)
++{
++ struct buffer_head *bh;
++ unsigned long blocknr;
++ int err;
++
++ err = journal_next_log_block(journal, &blocknr);
++
++ if (err)
++ return NULL;
++
++ bh = getblk(journal->j_dev, blocknr, journal->j_blocksize);
++ lock_buffer(bh);
++ BUFFER_TRACE(bh, "return this buffer");
++ return journal_add_journal_head(bh);
++}
++
++/*
++ * Management for journal control blocks: functions to create and
++ * destroy journal_t structures, and to initialise and read existing
++ * journal blocks from disk. */
++
++/* First: create and setup a journal_t object in memory. We initialise
++ * very few fields yet: that has to wait until we have created the
++ * journal structures from scratch, or loaded them from disk. */
++
++static journal_t * journal_init_common (void)
++{
++ journal_t *journal;
++ int err;
++
++ MOD_INC_USE_COUNT;
++
++ journal = jbd_kmalloc(sizeof(*journal), GFP_KERNEL);
++ if (!journal)
++ goto fail;
++ memset(journal, 0, sizeof(*journal));
++
++ init_waitqueue_head(&journal->j_wait_transaction_locked);
++ init_waitqueue_head(&journal->j_wait_logspace);
++ init_waitqueue_head(&journal->j_wait_done_commit);
++ init_waitqueue_head(&journal->j_wait_checkpoint);
++ init_waitqueue_head(&journal->j_wait_commit);
++ init_waitqueue_head(&journal->j_wait_updates);
++ init_MUTEX(&journal->j_barrier);
++ init_MUTEX(&journal->j_checkpoint_sem);
++ init_MUTEX(&journal->j_sem);
++
++ journal->j_commit_interval = (HZ * 5);
++
++ /* The journal is marked for error until we succeed with recovery! */
++ journal->j_flags = JFS_ABORT;
++
++ /* Set up a default-sized revoke table for the new mount. */
++ err = journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
++ if (err) {
++ kfree(journal);
++ goto fail;
++ }
++ return journal;
++fail:
++ MOD_DEC_USE_COUNT;
++ return NULL;
++}
++
++/* journal_init_dev and journal_init_inode:
++ *
++ * Create a journal structure assigned some fixed set of disk blocks to
++ * the journal. We don't actually touch those disk blocks yet, but we
++ * need to set up all of the mapping information to tell the journaling
++ * system where the journal blocks are.
++ *
++ * journal_init_dev creates a journal which maps a fixed contiguous
++ * range of blocks on an arbitrary block device.
++ *
++ * journal_init_inode creates a journal which maps an on-disk inode as
++ * the journal. The inode must exist already, must support bmap() and
++ * must have all data blocks preallocated.
++ */
++
++journal_t * journal_init_dev(kdev_t dev, kdev_t fs_dev,
++ int start, int len, int blocksize)
++{
++ journal_t *journal = journal_init_common();
++ struct buffer_head *bh;
++
++ if (!journal)
++ return NULL;
++
++ journal->j_dev = dev;
++ journal->j_fs_dev = fs_dev;
++ journal->j_blk_offset = start;
++ journal->j_maxlen = len;
++ journal->j_blocksize = blocksize;
++
++ bh = getblk(journal->j_dev, start, journal->j_blocksize);
++ J_ASSERT(bh != NULL);
++ journal->j_sb_buffer = bh;
++ journal->j_superblock = (journal_superblock_t *)bh->b_data;
++
++ return journal;
++}
++
++journal_t * journal_init_inode (struct inode *inode)
++{
++ struct buffer_head *bh;
++ journal_t *journal = journal_init_common();
++ int err;
++ unsigned long blocknr;
++
++ if (!journal)
++ return NULL;
++
++ journal->j_dev = inode->i_dev;
++ journal->j_fs_dev = inode->i_dev;
++ journal->j_inode = inode;
++ jbd_debug(1,
++ "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
++ journal, bdevname(inode->i_dev), inode->i_ino,
++ (long long) inode->i_size,
++ inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize);
++
++ journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits;
++ journal->j_blocksize = inode->i_sb->s_blocksize;
++
++ err = journal_bmap(journal, 0, &blocknr);
++ /* If that failed, give up */
++ if (err) {
++		printk(KERN_ERR __FUNCTION__ ": Cannot locate journal "
++ "superblock\n");
++ kfree(journal);
++ return NULL;
++ }
++
++ bh = getblk(journal->j_dev, blocknr, journal->j_blocksize);
++ J_ASSERT(bh != NULL);
++ journal->j_sb_buffer = bh;
++ journal->j_superblock = (journal_superblock_t *)bh->b_data;
++
++ return journal;
++}
++
++/*
++ * If the journal init or create aborts, we need to mark the journal
++ * superblock as being NULL to prevent the journal destroy from writing
++ * back a bogus superblock.
++ */
++static void journal_fail_superblock (journal_t *journal)
++{
++ struct buffer_head *bh = journal->j_sb_buffer;
++ brelse(bh);
++ journal->j_sb_buffer = NULL;
++}
++
++/*
++ * Given a journal_t structure, initialise the various fields for
++ * startup of a new journaling session. We use this both when creating
++ * a journal, and after recovering an old journal to reset it for
++ * subsequent use.
++ */
++
++static int journal_reset (journal_t *journal)
++{
++ journal_superblock_t *sb = journal->j_superblock;
++ unsigned int first, last;
++
++ first = ntohl(sb->s_first);
++ last = ntohl(sb->s_maxlen);
++
++ journal->j_first = first;
++ journal->j_last = last;
++
++ journal->j_head = first;
++ journal->j_tail = first;
++ journal->j_free = last - first;
++
++ journal->j_tail_sequence = journal->j_transaction_sequence;
++ journal->j_commit_sequence = journal->j_transaction_sequence - 1;
++ journal->j_commit_request = journal->j_commit_sequence;
++
++ journal->j_max_transaction_buffers = journal->j_maxlen / 4;
++
++ /* Add the dynamic fields and write it to disk. */
++ journal_update_superblock(journal, 1);
++
++ lock_journal(journal);
++ journal_start_thread(journal);
++ unlock_journal(journal);
++
++ return 0;
++}
++
++/*
++ * Given a journal_t structure which tells us which disk blocks we can
++ * use, create a new journal superblock and initialise all of the
++ * journal fields from scratch. */
++
++int journal_create (journal_t *journal)
++{
++ unsigned long blocknr;
++ struct buffer_head *bh;
++ journal_superblock_t *sb;
++ int i, err;
++
++ if (journal->j_maxlen < JFS_MIN_JOURNAL_BLOCKS) {
++ printk (KERN_ERR "Journal length (%d blocks) too short.\n",
++ journal->j_maxlen);
++ journal_fail_superblock(journal);
++ return -EINVAL;
++ }
++
++ if (journal->j_inode == NULL) {
++ /*
++ * We don't know what block to start at!
++ */
++ printk(KERN_EMERG __FUNCTION__
++ ": creation of journal on external device!\n");
++ BUG();
++ }
++
++ /* Zero out the entire journal on disk. We cannot afford to
++ have any blocks on disk beginning with JFS_MAGIC_NUMBER. */
++ jbd_debug(1, "JBD: Zeroing out journal blocks...\n");
++ for (i = 0; i < journal->j_maxlen; i++) {
++ err = journal_bmap(journal, i, &blocknr);
++ if (err)
++ return err;
++ bh = getblk(journal->j_dev, blocknr, journal->j_blocksize);
++ wait_on_buffer(bh);
++ memset (bh->b_data, 0, journal->j_blocksize);
++ BUFFER_TRACE(bh, "marking dirty");
++ mark_buffer_dirty(bh);
++ BUFFER_TRACE(bh, "marking uptodate");
++ mark_buffer_uptodate(bh, 1);
++ __brelse(bh);
++ }
++
++ sync_dev(journal->j_dev);
++ jbd_debug(1, "JBD: journal cleared.\n");
++
++ /* OK, fill in the initial static fields in the new superblock */
++ sb = journal->j_superblock;
++
++ sb->s_header.h_magic = htonl(JFS_MAGIC_NUMBER);
++ sb->s_header.h_blocktype = htonl(JFS_SUPERBLOCK_V2);
++
++ sb->s_blocksize = htonl(journal->j_blocksize);
++ sb->s_maxlen = htonl(journal->j_maxlen);
++ sb->s_first = htonl(1);
++
++ journal->j_transaction_sequence = 1;
++
++ journal->j_flags &= ~JFS_ABORT;
++ journal->j_format_version = 2;
++
++ return journal_reset(journal);
++}
++
++/*
++ * Update a journal's dynamic superblock fields and write it to disk,
++ * optionally waiting for the IO to complete.
++*/
++
++void journal_update_superblock(journal_t *journal, int wait)
++{
++ journal_superblock_t *sb = journal->j_superblock;
++ struct buffer_head *bh = journal->j_sb_buffer;
++
++ jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n",
++ journal->j_tail, journal->j_tail_sequence, journal->j_errno);
++
++ sb->s_sequence = htonl(journal->j_tail_sequence);
++ sb->s_start = htonl(journal->j_tail);
++ sb->s_errno = htonl(journal->j_errno);
++
++ BUFFER_TRACE(bh, "marking dirty");
++ mark_buffer_dirty(bh);
++ ll_rw_block(WRITE, 1, &bh);
++ if (wait)
++ wait_on_buffer(bh);
++
++ /* If we have just flushed the log (by marking s_start==0), then
++ * any future commit will have to be careful to update the
++ * superblock again to re-record the true start of the log. */
++
++ if (sb->s_start)
++ journal->j_flags &= ~JFS_FLUSHED;
++ else
++ journal->j_flags |= JFS_FLUSHED;
++}
++
++
++/*
++ * Read the superblock for a given journal, performing initial
++ * validation of the format.
++ */
++
++static int journal_get_superblock(journal_t *journal)
++{
++ struct buffer_head *bh;
++ journal_superblock_t *sb;
++ int err = -EIO;
++
++ bh = journal->j_sb_buffer;
++
++ J_ASSERT(bh != NULL);
++ if (!buffer_uptodate(bh)) {
++ ll_rw_block(READ, 1, &bh);
++ wait_on_buffer(bh);
++ if (!buffer_uptodate(bh)) {
++ printk (KERN_ERR
++ "JBD: IO error reading journal superblock\n");
++ goto out;
++ }
++ }
++
++ sb = journal->j_superblock;
++
++ err = -EINVAL;
++
++ if (sb->s_header.h_magic != htonl(JFS_MAGIC_NUMBER) ||
++ sb->s_blocksize != htonl(journal->j_blocksize)) {
++ printk(KERN_WARNING "JBD: no valid journal superblock found\n");
++ goto out;
++ }
++
++ switch(ntohl(sb->s_header.h_blocktype)) {
++ case JFS_SUPERBLOCK_V1:
++ journal->j_format_version = 1;
++ break;
++ case JFS_SUPERBLOCK_V2:
++ journal->j_format_version = 2;
++ break;
++ default:
++ printk(KERN_WARNING "JBD: unrecognised superblock format ID\n");
++ goto out;
++ }
++
++ if (ntohl(sb->s_maxlen) < journal->j_maxlen)
++ journal->j_maxlen = ntohl(sb->s_maxlen);
++ else if (ntohl(sb->s_maxlen) > journal->j_maxlen) {
++ printk (KERN_WARNING "JBD: journal file too short\n");
++ goto out;
++ }
++
++ return 0;
++
++out:
++ journal_fail_superblock(journal);
++ return err;
++}
++
++/*
++ * Load the on-disk journal superblock and read the key fields into the
++ * journal_t.
++ */
++
++static int load_superblock(journal_t *journal)
++{
++ int err;
++ journal_superblock_t *sb;
++
++ err = journal_get_superblock(journal);
++ if (err)
++ return err;
++
++ sb = journal->j_superblock;
++
++ journal->j_tail_sequence = ntohl(sb->s_sequence);
++ journal->j_tail = ntohl(sb->s_start);
++ journal->j_first = ntohl(sb->s_first);
++ journal->j_last = ntohl(sb->s_maxlen);
++ journal->j_errno = ntohl(sb->s_errno);
++
++ return 0;
++}
++
++
++/*
++ * Given a journal_t structure which tells us which disk blocks contain
++ * a journal, read the journal from disk to initialise the in-memory
++ * structures.
++ */
++
++int journal_load(journal_t *journal)
++{
++ int err;
++
++ err = load_superblock(journal);
++ if (err)
++ return err;
++
++ /* If this is a V2 superblock, then we have to check the
++ * features flags on it. */
++
++ if (journal->j_format_version >= 2) {
++ journal_superblock_t *sb = journal->j_superblock;
++
++ if ((sb->s_feature_ro_compat &
++ ~cpu_to_be32(JFS_KNOWN_ROCOMPAT_FEATURES)) ||
++ (sb->s_feature_incompat &
++ ~cpu_to_be32(JFS_KNOWN_INCOMPAT_FEATURES))) {
++ printk (KERN_WARNING
++ "JBD: Unrecognised features on journal\n");
++ return -EINVAL;
++ }
++ }
++
++ /* Let the recovery code check whether it needs to recover any
++ * data from the journal. */
++ if (journal_recover(journal))
++ goto recovery_error;
++
++ /* OK, we've finished with the dynamic journal bits:
++ * reinitialise the dynamic contents of the superblock in memory
++ * and reset them on disk. */
++ if (journal_reset(journal))
++ goto recovery_error;
++
++ journal->j_flags &= ~JFS_ABORT;
++ journal->j_flags |= JFS_LOADED;
++ return 0;
++
++recovery_error:
++ printk (KERN_WARNING "JBD: recovery failed\n");
++ return -EIO;
++}
++
++/*
++ * Release a journal_t structure once it is no longer in use by the
++ * journaled object.
++ */
++
++void journal_destroy (journal_t *journal)
++{
++ /* Wait for the commit thread to wake up and die. */
++ journal_kill_thread(journal);
++
++ /* Force a final log commit */
++ if (journal->j_running_transaction)
++ journal_commit_transaction(journal);
++
++ /* Force any old transactions to disk */
++ lock_journal(journal);
++ while (journal->j_checkpoint_transactions != NULL)
++ log_do_checkpoint(journal, 1);
++
++ J_ASSERT(journal->j_running_transaction == NULL);
++ J_ASSERT(journal->j_committing_transaction == NULL);
++ J_ASSERT(journal->j_checkpoint_transactions == NULL);
++
++ /* We can now mark the journal as empty. */
++ journal->j_tail = 0;
++ journal->j_tail_sequence = ++journal->j_transaction_sequence;
++ if (journal->j_sb_buffer) {
++ journal_update_superblock(journal, 1);
++ brelse(journal->j_sb_buffer);
++ }
++
++ if (journal->j_inode)
++ iput(journal->j_inode);
++ if (journal->j_revoke)
++ journal_destroy_revoke(journal);
++
++ unlock_journal(journal);
++ kfree(journal);
++ MOD_DEC_USE_COUNT;
++}
++
++
++/* Published API: Check whether the journal uses all of a given set of
++ * features. Return true (non-zero) if it does. */
++
++int journal_check_used_features (journal_t *journal, unsigned long compat,
++ unsigned long ro, unsigned long incompat)
++{
++ journal_superblock_t *sb;
++
++ if (!compat && !ro && !incompat)
++ return 1;
++ if (journal->j_format_version == 1)
++ return 0;
++
++ sb = journal->j_superblock;
++
++ if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) &&
++ ((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) &&
++ ((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat))
++ return 1;
++
++ return 0;
++}
++
++/* Published API: Check whether the journaling code supports the use of
++ * all of a given set of features on this journal. Return true
++ * (non-zero) if it can. */
++
++int journal_check_available_features (journal_t *journal, unsigned long compat,
++ unsigned long ro, unsigned long incompat)
++{
++ journal_superblock_t *sb;
++
++ if (!compat && !ro && !incompat)
++ return 1;
++
++ sb = journal->j_superblock;
++
++ /* We can support any known requested features iff the
++ * superblock is in version 2. Otherwise we fail to support any
++ * extended sb features. */
++
++ if (journal->j_format_version != 2)
++ return 0;
++
++ if ((compat & JFS_KNOWN_COMPAT_FEATURES) == compat &&
++ (ro & JFS_KNOWN_ROCOMPAT_FEATURES) == ro &&
++ (incompat & JFS_KNOWN_INCOMPAT_FEATURES) == incompat)
++ return 1;
++
++ return 0;
++}
++
++/* Published API: Mark a given journal feature as present on the
++ * superblock. Returns true if the requested features could be set. */
++
++int journal_set_features (journal_t *journal, unsigned long compat,
++ unsigned long ro, unsigned long incompat)
++{
++ journal_superblock_t *sb;
++
++ if (journal_check_used_features(journal, compat, ro, incompat))
++ return 1;
++
++ if (!journal_check_available_features(journal, compat, ro, incompat))
++ return 0;
++
++ jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n",
++ compat, ro, incompat);
++
++ sb = journal->j_superblock;
++
++ sb->s_feature_compat |= cpu_to_be32(compat);
++ sb->s_feature_ro_compat |= cpu_to_be32(ro);
++ sb->s_feature_incompat |= cpu_to_be32(incompat);
++
++ return 1;
++}
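++
++/*
++ * Typical use (a sketch; assumes JFS_FEATURE_INCOMPAT_REVOKE from
++ * jbd.h). A caller that cannot set the bit (v1 superblock, or an
++ * unknown feature) gets 0 back and must cope:
++ *
++ *	if (!journal_set_features(journal, 0, 0,
++ *				  JFS_FEATURE_INCOMPAT_REVOKE))
++ *		return -EINVAL;
++ */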
++
++
++/*
++ * Published API:
++ * Given an initialised but unloaded journal struct, poke about in the
++ * on-disk structure to update it to the most recent supported version.
++ */
++
++int journal_update_format (journal_t *journal)
++{
++ journal_superblock_t *sb;
++ int err;
++
++ err = journal_get_superblock(journal);
++ if (err)
++ return err;
++
++ sb = journal->j_superblock;
++
++ switch (ntohl(sb->s_header.h_blocktype)) {
++ case JFS_SUPERBLOCK_V2:
++ return 0;
++ case JFS_SUPERBLOCK_V1:
++ return journal_convert_superblock_v1(journal, sb);
++ default:
++ break;
++ }
++ return -EINVAL;
++}
++
++static int journal_convert_superblock_v1(journal_t *journal,
++ journal_superblock_t *sb)
++{
++ int offset, blocksize;
++ struct buffer_head *bh;
++
++ printk(KERN_WARNING
++ "JBD: Converting superblock from version 1 to 2.\n");
++
++ /* Pre-initialise new fields to zero */
++ offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb);
++ blocksize = ntohl(sb->s_blocksize);
++ memset(&sb->s_feature_compat, 0, blocksize-offset);
++
++ sb->s_nr_users = cpu_to_be32(1);
++ sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2);
++ journal->j_format_version = 2;
++
++ bh = journal->j_sb_buffer;
++ BUFFER_TRACE(bh, "marking dirty");
++ mark_buffer_dirty(bh);
++ ll_rw_block(WRITE, 1, &bh);
++ wait_on_buffer(bh);
++ return 0;
++}
++
++
++/*
++ * Flush all data for a given journal to disk and empty the journal.
++ * Filesystems can use this when remounting readonly to ensure that
++ * recovery does not need to happen on remount.
++ */
++
++int journal_flush (journal_t *journal)
++{
++ int err = 0;
++ transaction_t *transaction = NULL;
++ unsigned long old_tail;
++
++ lock_kernel();
++
++ /* Force everything buffered to the log... */
++ if (journal->j_running_transaction) {
++ transaction = journal->j_running_transaction;
++ log_start_commit(journal, transaction);
++ } else if (journal->j_committing_transaction)
++ transaction = journal->j_committing_transaction;
++
++ /* Wait for the log commit to complete... */
++ if (transaction)
++ log_wait_commit(journal, transaction->t_tid);
++
++ /* ...and flush everything in the log out to disk. */
++ lock_journal(journal);
++ while (!err && journal->j_checkpoint_transactions != NULL)
++ err = log_do_checkpoint(journal, journal->j_maxlen);
++ cleanup_journal_tail(journal);
++
++ /* Finally, mark the journal as really needing no recovery.
++ * This sets s_start==0 in the underlying superblock, which is
++ * the magic code for a fully-recovered superblock. Any future
++ * commits of data to the journal will restore the current
++ * s_start value. */
++ old_tail = journal->j_tail;
++ journal->j_tail = 0;
++ journal_update_superblock(journal, 1);
++ journal->j_tail = old_tail;
++
++ unlock_journal(journal);
++
++ J_ASSERT(!journal->j_running_transaction);
++ J_ASSERT(!journal->j_committing_transaction);
++ J_ASSERT(!journal->j_checkpoint_transactions);
++ J_ASSERT(journal->j_head == journal->j_tail);
++ J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
++
++ unlock_kernel();
++
++ return err;
++}
++
++/*
++ * Wipe out all of the contents of a journal, safely. This will produce
++ * a warning if the journal contains any valid recovery information.
++ * Must be called between journal_init_*() and journal_load().
++ *
++ * If (write) is non-zero, then we wipe out the journal on disk; otherwise
++ * we merely suppress recovery.
++ */
++
++int journal_wipe (journal_t *journal, int write)
++{
++ journal_superblock_t *sb;
++ int err = 0;
++
++ J_ASSERT (!(journal->j_flags & JFS_LOADED));
++
++ err = load_superblock(journal);
++ if (err)
++ return err;
++
++ sb = journal->j_superblock;
++
++ if (!journal->j_tail)
++ goto no_recovery;
++
++ printk (KERN_WARNING "JBD: %s recovery information on journal\n",
++ write ? "Clearing" : "Ignoring");
++
++ err = journal_skip_recovery(journal);
++ if (write)
++ journal_update_superblock(journal, 1);
++
++ no_recovery:
++ return err;
++}
++
++/*
++ * journal_dev_name: format a character string describing the device
++ * on which this journal resides.
++ */
++
++const char * journal_dev_name(journal_t *journal)
++{
++ kdev_t dev;
++
++ if (journal->j_inode)
++ dev = journal->j_inode->i_dev;
++ else
++ dev = journal->j_dev;
++
++ return bdevname(dev);
++}
++
++/*
++ * journal_abort: perform a complete, immediate shutdown of the ENTIRE
++ * journal (not of a single transaction). This operation cannot be
++ * undone without closing and reopening the journal.
++ *
++ * The journal_abort function is intended to support higher level error
++ * recovery mechanisms such as the ext2/ext3 remount-readonly error
++ * mode.
++ *
++ * Journal abort has very specific semantics. Any existing dirty,
++ * unjournaled buffers in the main filesystem will still be written to
++ * disk by bdflush, but the journaling mechanism will be suspended
++ * immediately and no further transaction commits will be honoured.
++ *
++ * Any dirty, journaled buffers will be written back to disk without
++ * hitting the journal. Atomicity cannot be guaranteed on an aborted
++ * filesystem, but we _do_ attempt to leave as much data as possible
++ * behind for fsck to use for cleanup.
++ *
++ * Any attempt to get a new transaction handle on a journal which is in
++ * ABORT state will just result in an -EROFS error return. A
++ * journal_stop on an existing handle will return -EIO if we have
++ * entered abort state during the update.
++ *
++ * Recursive transactions are not disturbed by journal abort until the
++ * final journal_stop, which will receive the -EIO error.
++ *
++ * Finally, the journal_abort call allows the caller to supply an errno
++ * which will be recorded (if possible) in the journal superblock. This
++ * allows a client to record failure conditions in the middle of a
++ * transaction without having to complete the transaction to record the
++ * failure to disk. ext3_error, for example, now uses this
++ * functionality.
++ *
++ * Errors which originate from within the journaling layer will NOT
++ * supply an errno; a null errno implies that absolutely no further
++ * writes are done to the journal (unless there are any already in
++ * progress).
++ */
++
++/* Quick version for internal journal use (doesn't lock the journal).
++ * Aborts hard --- we mark the abort as having occurred, but do
++ * _nothing_ else, and don't attempt to make any other journal updates. */
++void __journal_abort_hard (journal_t *journal)
++{
++ transaction_t *transaction;
++
++ if (journal->j_flags & JFS_ABORT)
++ return;
++
++ printk (KERN_ERR "Aborting journal on device %s.\n",
++ journal_dev_name(journal));
++
++ journal->j_flags |= JFS_ABORT;
++ transaction = journal->j_running_transaction;
++ if (transaction)
++ log_start_commit(journal, transaction);
++}
++
++/* Soft abort: record the abort error status in the journal superblock,
++ * but don't do any other IO. */
++void __journal_abort_soft (journal_t *journal, int errno)
++{
++ if (journal->j_flags & JFS_ABORT)
++ return;
++
++ if (!journal->j_errno)
++ journal->j_errno = errno;
++
++ __journal_abort_hard(journal);
++
++ if (errno)
++ journal_update_superblock(journal, 1);
++}
++
++/* Full version for external use */
++void journal_abort (journal_t *journal, int errno)
++{
++ lock_journal(journal);
++ __journal_abort_soft(journal, errno);
++ unlock_journal(journal);
++}
++
++int journal_errno (journal_t *journal)
++{
++ int err;
++
++ lock_journal(journal);
++ if (journal->j_flags & JFS_ABORT)
++ err = -EROFS;
++ else
++ err = journal->j_errno;
++ unlock_journal(journal);
++ return err;
++}
++
++int journal_clear_err (journal_t *journal)
++{
++ int err = 0;
++
++ lock_journal(journal);
++ if (journal->j_flags & JFS_ABORT)
++ err = -EROFS;
++ else
++ journal->j_errno = 0;
++ unlock_journal(journal);
++ return err;
++}
++
++void journal_ack_err (journal_t *journal)
++{
++ lock_journal(journal);
++ if (journal->j_errno)
++ journal->j_flags |= JFS_ACK_ERR;
++ unlock_journal(journal);
++}
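++
++/*
++ * Editorial sketch (not part of the original patch): how a client
++ * filesystem typically drives the error interfaces above.  The calling
++ * sequence is hypothetical:
++ *
++ *	journal_abort(journal, -EIO);	  record the error, stop commits
++ *	err = journal_errno(journal);	  -EROFS once the journal aborts
++ *	journal_clear_err(journal);	  likewise -EROFS while aborted
++ *	journal_ack_err(journal);	  sets JFS_ACK_ERR, which the
++ *					  j_errno test in start_this_handle()
++ *					  (transaction.c) checks
++ */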
++
++int journal_blocks_per_page(struct inode *inode)
++{
++ return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
++}
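++
++/*
++ * Worked example (editorial note): with 4kB pages (PAGE_CACHE_SHIFT
++ * of 12) and a 1kB filesystem block size (s_blocksize_bits of 10),
++ * journal_blocks_per_page() returns 1 << (12 - 10) == 4.
++ */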
++
++/*
++ * shrink_journal_memory().
++ * Called when we're under memory pressure. Free up all the written-back
++ * checkpointed metadata buffers.
++ */
++void shrink_journal_memory(void)
++{
++ struct list_head *list;
++
++ lock_kernel();
++ list_for_each(list, &all_journals) {
++ journal_t *journal =
++ list_entry(list, journal_t, j_all_journals);
++ spin_lock(&journal_datalist_lock);
++ __journal_clean_checkpoint_list(journal);
++ spin_unlock(&journal_datalist_lock);
++ }
++ unlock_kernel();
++}
++
++/*
++ * Simple support for retrying memory allocations. Introduced to help to
++ * debug different VM deadlock avoidance strategies.
++ */
++void * __jbd_kmalloc (char *where, size_t size, int flags, int retry)
++{
++ void *p;
++ static unsigned long last_warning;
++
++ while (1) {
++ p = kmalloc(size, flags);
++ if (p)
++ return p;
++ if (!retry)
++ return NULL;
++ /* Log every retry for debugging. Also log them to the
++ * syslog, but do rate-limiting on the non-debugging
++ * messages. */
++ jbd_debug(1, "ENOMEM in %s, retrying.\n", where);
++
++ if (time_after(jiffies, last_warning + 5*HZ)) {
++ printk(KERN_NOTICE
++ "ENOMEM in %s, retrying.\n", where);
++ last_warning = jiffies;
++ }
++
++ current->policy |= SCHED_YIELD;
++ schedule();
++ }
++}
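++
++/*
++ * Editorial note: callers normally reach this through a jbd_kmalloc()
++ * wrapper macro (assumed to pass __FUNCTION__ and journal_oom_retry,
++ * as in contemporary jbd.h), e.g. the get_transaction() allocation in
++ * transaction.c below, so a GFP_NOFS allocation loops here under
++ * memory pressure instead of failing outright.
++ */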
++
++/*
++ * Journal_head storage management
++ */
++static kmem_cache_t *journal_head_cache;
++#ifdef CONFIG_JBD_DEBUG
++static atomic_t nr_journal_heads = ATOMIC_INIT(0);
++#endif
++
++static int journal_init_journal_head_cache(void)
++{
++ int retval;
++
++ J_ASSERT(journal_head_cache == 0);
++ journal_head_cache = kmem_cache_create("journal_head",
++ sizeof(struct journal_head),
++ 0, /* offset */
++ 0, /* flags */
++ NULL, /* ctor */
++ NULL); /* dtor */
++ retval = 0;
++ if (journal_head_cache == 0) {
++ retval = -ENOMEM;
++ printk(KERN_EMERG "JBD: no memory for journal_head cache\n");
++ }
++ return retval;
++}
++
++static void journal_destroy_journal_head_cache(void)
++{
++ J_ASSERT(journal_head_cache != NULL);
++ kmem_cache_destroy(journal_head_cache);
++ journal_head_cache = 0;
++}
++
++/*
++ * journal_head splicing and dicing
++ */
++static struct journal_head *journal_alloc_journal_head(void)
++{
++ struct journal_head *ret;
++ static unsigned long last_warning;
++
++#ifdef CONFIG_JBD_DEBUG
++ atomic_inc(&nr_journal_heads);
++#endif
++ ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
++ if (ret == 0) {
++ jbd_debug(1, "out of memory for journal_head\n");
++ if (time_after(jiffies, last_warning + 5*HZ)) {
++ printk(KERN_NOTICE "ENOMEM in " __FUNCTION__
++ ", retrying.\n");
++ last_warning = jiffies;
++ }
++ while (ret == 0) {
++ current->policy |= SCHED_YIELD;
++ schedule();
++ ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
++ }
++ }
++ return ret;
++}
++
++static void journal_free_journal_head(struct journal_head *jh)
++{
++#ifdef CONFIG_JBD_DEBUG
++ atomic_dec(&nr_journal_heads);
++ memset(jh, 0x5b, sizeof(*jh));
++#endif
++ kmem_cache_free(journal_head_cache, jh);
++}
++
++/*
++ * A journal_head is attached to a buffer_head whenever JBD has an
++ * interest in the buffer.
++ *
++ * Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit
++ * is set. This bit is tested in core kernel code where we need to take
++ * JBD-specific actions. Testing the zeroness of ->b_private is not reliable
++ * there.
++ *
++ * When a buffer has its BH_JBD bit set, its ->b_count is elevated by one.
++ *
++ * When a buffer has its BH_JBD bit set it is immune from being released by
++ * core kernel code, mainly via ->b_count.
++ *
++ * A journal_head may be detached from its buffer_head when the journal_head's
++ * b_transaction, b_cp_transaction and b_next_transaction pointers are NULL.
++ * Various places in JBD call journal_remove_journal_head() to indicate that the
++ * journal_head can be dropped if needed.
++ *
++ * Various places in the kernel want to attach a journal_head to a buffer_head
++ * _before_ attaching the journal_head to a transaction. To protect the
++ * journal_head in this situation, journal_add_journal_head elevates the
++ * journal_head's b_jcount refcount by one. The caller must call
++ * journal_unlock_journal_head() to undo this.
++ *
++ * So the typical usage would be:
++ *
++ * (Attach a journal_head if needed. Increments b_jcount)
++ * struct journal_head *jh = journal_add_journal_head(bh);
++ * ...
++ * jh->b_transaction = xxx;
++ * journal_unlock_journal_head(jh);
++ *
++ * Now, the journal_head's b_jcount is zero, but it is safe from being released
++ * because it has a non-zero b_transaction.
++ */
++
++/*
++ * Give a buffer_head a journal_head.
++ *
++ * Doesn't need the journal lock.
++ * May sleep.
++ * Cannot be called with journal_datalist_lock held.
++ */
++struct journal_head *journal_add_journal_head(struct buffer_head *bh)
++{
++ struct journal_head *jh;
++
++ spin_lock(&journal_datalist_lock);
++ if (buffer_jbd(bh)) {
++ jh = bh2jh(bh);
++ } else {
++ J_ASSERT_BH(bh,
++ (atomic_read(&bh->b_count) > 0) ||
++ (bh->b_page && bh->b_page->mapping));
++ spin_unlock(&journal_datalist_lock);
++ jh = journal_alloc_journal_head();
++ memset(jh, 0, sizeof(*jh));
++ spin_lock(&journal_datalist_lock);
++
++ if (buffer_jbd(bh)) {
++ /* Someone did it for us! */
++ J_ASSERT_BH(bh, bh->b_private != NULL);
++ journal_free_journal_head(jh);
++ jh = bh->b_private;
++ } else {
++ /*
++ * We actually don't need jh_splice_lock when
++ * adding a journal_head - only on removal.
++ */
++ spin_lock(&jh_splice_lock);
++ set_bit(BH_JBD, &bh->b_state);
++ bh->b_private = jh;
++ jh->b_bh = bh;
++ atomic_inc(&bh->b_count);
++ spin_unlock(&jh_splice_lock);
++ BUFFER_TRACE(bh, "added journal_head");
++ }
++ }
++ jh->b_jcount++;
++ spin_unlock(&journal_datalist_lock);
++ return bh->b_private;
++}
++
++/*
++ * journal_remove_journal_head(): if the buffer isn't attached to a transaction
++ * and has a zero b_jcount then remove and release its journal_head. If we did
++ * see that the buffer is not used by any transaction we also "logically"
++ * decrement ->b_count.
++ *
++ * We in fact take an additional increment on ->b_count as a convenience,
++ * because the caller usually wants to do additional things with the bh
++ * after calling here.
++ * The caller of journal_remove_journal_head() *must* run __brelse(bh) at some
++ * time. Once the caller has run __brelse(), the buffer is eligible for
++ * reaping by try_to_free_buffers().
++ *
++ * Requires journal_datalist_lock.
++ */
++void __journal_remove_journal_head(struct buffer_head *bh)
++{
++ struct journal_head *jh = bh2jh(bh);
++
++ assert_spin_locked(&journal_datalist_lock);
++ J_ASSERT_JH(jh, jh->b_jcount >= 0);
++ atomic_inc(&bh->b_count);
++ if (jh->b_jcount == 0) {
++ if (jh->b_transaction == NULL &&
++ jh->b_next_transaction == NULL &&
++ jh->b_cp_transaction == NULL) {
++ J_ASSERT_BH(bh, buffer_jbd(bh));
++ J_ASSERT_BH(bh, jh2bh(jh) == bh);
++ BUFFER_TRACE(bh, "remove journal_head");
++ spin_lock(&jh_splice_lock);
++ bh->b_private = NULL;
++ jh->b_bh = NULL; /* debug, really */
++ clear_bit(BH_JBD, &bh->b_state);
++ __brelse(bh);
++ spin_unlock(&jh_splice_lock);
++ journal_free_journal_head(jh);
++ } else {
++ BUFFER_TRACE(bh, "journal_head was locked");
++ }
++ }
++}
++
++void journal_unlock_journal_head(struct journal_head *jh)
++{
++ spin_lock(&journal_datalist_lock);
++ J_ASSERT_JH(jh, jh->b_jcount > 0);
++ --jh->b_jcount;
++ if (!jh->b_jcount && !jh->b_transaction) {
++ struct buffer_head *bh;
++ bh = jh2bh(jh);
++ __journal_remove_journal_head(bh);
++ __brelse(bh);
++ }
++
++ spin_unlock(&journal_datalist_lock);
++}
++
++void journal_remove_journal_head(struct buffer_head *bh)
++{
++ spin_lock(&journal_datalist_lock);
++ __journal_remove_journal_head(bh);
++ spin_unlock(&journal_datalist_lock);
++}
++
++/*
++ * /proc tunables
++ */
++#if defined(CONFIG_JBD_DEBUG)
++int journal_enable_debug;
++EXPORT_SYMBOL(journal_enable_debug);
++#endif
++
++#if defined(CONFIG_JBD_DEBUG) && defined(CONFIG_PROC_FS)
++
++static struct proc_dir_entry *proc_jbd_debug;
++
++int read_jbd_debug(char *page, char **start, off_t off,
++ int count, int *eof, void *data)
++{
++ int ret;
++
++ ret = sprintf(page + off, "%d\n", journal_enable_debug);
++ *eof = 1;
++ return ret;
++}
++
++int write_jbd_debug(struct file *file, const char *buffer,
++ unsigned long count, void *data)
++{
++ char buf[32];
++
++ if (count > ARRAY_SIZE(buf) - 1)
++ count = ARRAY_SIZE(buf) - 1;
++ if (copy_from_user(buf, buffer, count))
++ return -EFAULT;
++ buf[ARRAY_SIZE(buf) - 1] = '\0';
++ journal_enable_debug = simple_strtoul(buf, NULL, 10);
++ return count;
++}
++
++#define JBD_PROC_NAME "sys/fs/jbd-debug"
++
++static void __init create_jbd_proc_entry(void)
++{
++ proc_jbd_debug = create_proc_entry(JBD_PROC_NAME, 0644, NULL);
++ if (proc_jbd_debug) {
++ /* Why is this so hard? */
++ proc_jbd_debug->read_proc = read_jbd_debug;
++ proc_jbd_debug->write_proc = write_jbd_debug;
++ }
++}
++
++static void __exit remove_jbd_proc_entry(void)
++{
++ if (proc_jbd_debug)
++ remove_proc_entry(JBD_PROC_NAME, NULL);
++}
++
++#else
++
++#define create_jbd_proc_entry() do {} while (0)
++#define remove_jbd_proc_entry() do {} while (0)
++
++#endif
++
++/*
++ * Module startup and shutdown
++ */
++
++static int __init journal_init_caches(void)
++{
++ int ret;
++
++ ret = journal_init_revoke_caches();
++ if (ret == 0)
++ ret = journal_init_journal_head_cache();
++ return ret;
++}
++
++static void journal_destroy_caches(void)
++{
++ journal_destroy_revoke_caches();
++ journal_destroy_journal_head_cache();
++}
++
++static int __init journal_init(void)
++{
++ int ret;
++
++ printk(KERN_INFO "Journalled Block Device driver loaded\n");
++ ret = journal_init_caches();
++ if (ret != 0)
++ journal_destroy_caches();
++ create_jbd_proc_entry();
++ return ret;
++}
++
++static void __exit journal_exit(void)
++{
++#ifdef CONFIG_JBD_DEBUG
++ int n = atomic_read(&nr_journal_heads);
++ if (n)
++ printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n);
++#endif
++ remove_jbd_proc_entry();
++ journal_destroy_caches();
++}
++
++MODULE_LICENSE("GPL");
++module_init(journal_init);
++module_exit(journal_exit);
++
+diff -ruP linux.mcp2/fs/jbd/recovery.c linuxppc_2.4.19_final/fs/jbd/recovery.c
+--- linux.mcp2/fs/jbd/recovery.c 1969-12-31 16:00:00.000000000 -0800
++++ linuxppc_2.4.19_final/fs/jbd/recovery.c 2004-05-17 13:56:17.000000000 -0700
+@@ -0,0 +1,589 @@
++/*
++ * linux/fs/recovery.c
++ *
++ * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
++ *
++ * Copyright 1999-2000 Red Hat Software --- All Rights Reserved
++ *
++ * This file is part of the Linux kernel and is made available under
++ * the terms of the GNU General Public License, version 2, or at your
++ * option, any later version, incorporated herein by reference.
++ *
++ * Journal recovery routines for the generic filesystem journaling code;
++ * part of the ext2fs journaling system.
++ */
++
++#ifndef __KERNEL__
++#include "jfs_user.h"
++#else
++#include <linux/sched.h>
++#include <linux/fs.h>
++#include <linux/jbd.h>
++#include <linux/errno.h>
++#include <linux/slab.h>
++#include <linux/locks.h>
++#endif
++
++/*
++ * Maintain information about the progress of the recovery job, so that
++ * the different passes can carry information between them.
++ */
++struct recovery_info
++{
++ tid_t start_transaction;
++ tid_t end_transaction;
++
++ int nr_replays;
++ int nr_revokes;
++ int nr_revoke_hits;
++};
++
++enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY};
++static int do_one_pass(journal_t *journal,
++ struct recovery_info *info, enum passtype pass);
++static int scan_revoke_records(journal_t *, struct buffer_head *,
++ tid_t, struct recovery_info *);
++
++#ifdef __KERNEL__
++
++/* Release readahead buffers after use */
++void journal_brelse_array(struct buffer_head *b[], int n)
++{
++ while (--n >= 0)
++ brelse (b[n]);
++}
++
++
++/*
++ * When reading from the journal, we are going through the block device
++ * layer directly and so there is no readahead being done for us. We
++ * need to implement any readahead ourselves if we want it to happen at
++ * all. Recovery is basically one long sequential read, so make sure we
++ * do the IO in reasonably large chunks.
++ *
++ * This is not so critical that we need to be enormously clever about
++ * the readahead size, though. 128K is a purely arbitrary, good-enough
++ * fixed value.
++ */
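++
++/*
++ * Editorial note: at a 4kB journal block size the window above is
++ * 128 * 1024 / 4096 == 32 blocks, issued to ll_rw_block() in batches
++ * of MAXBUF (8) buffer_heads.
++ */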
++
++#define MAXBUF 8
++static int do_readahead(journal_t *journal, unsigned int start)
++{
++ int err;
++ unsigned int max, nbufs, next;
++ unsigned long blocknr;
++ struct buffer_head *bh;
++
++ struct buffer_head * bufs[MAXBUF];
++
++ /* Do up to 128K of readahead */
++ max = start + (128 * 1024 / journal->j_blocksize);
++ if (max > journal->j_maxlen)
++ max = journal->j_maxlen;
++
++ /* Do the readahead itself. We'll submit MAXBUF buffer_heads at
++ * a time to the block device IO layer. */
++
++ nbufs = 0;
++
++ for (next = start; next < max; next++) {
++ err = journal_bmap(journal, next, &blocknr);
++
++ if (err) {
++ printk (KERN_ERR "JBD: bad block at offset %u\n",
++ next);
++ goto failed;
++ }
++
++ bh = getblk(journal->j_dev, blocknr, journal->j_blocksize);
++ if (!bh) {
++ err = -ENOMEM;
++ goto failed;
++ }
++
++ if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
++ bufs[nbufs++] = bh;
++ if (nbufs == MAXBUF) {
++ ll_rw_block(READ, nbufs, bufs);
++ journal_brelse_array(bufs, nbufs);
++ nbufs = 0;
++ }
++ } else
++ brelse(bh);
++ }
++
++ if (nbufs)
++ ll_rw_block(READ, nbufs, bufs);
++ err = 0;
++
++failed:
++ if (nbufs)
++ journal_brelse_array(bufs, nbufs);
++ return err;
++}
++
++#endif /* __KERNEL__ */
++
++
++/*
++ * Read a block from the journal
++ */
++
++static int jread(struct buffer_head **bhp, journal_t *journal,
++ unsigned int offset)
++{
++ int err;
++ unsigned long blocknr;
++ struct buffer_head *bh;
++
++ *bhp = NULL;
++
++ J_ASSERT (offset < journal->j_maxlen);
++
++ err = journal_bmap(journal, offset, &blocknr);
++
++ if (err) {
++ printk (KERN_ERR "JBD: bad block at offset %u\n",
++ offset);
++ return err;
++ }
++
++ bh = getblk(journal->j_dev, blocknr, journal->j_blocksize);
++ if (!bh)
++ return -ENOMEM;
++
++ if (!buffer_uptodate(bh)) {
++ /* If this is a brand new buffer, start readahead.
++ Otherwise, we assume we are already reading it. */
++ if (!buffer_req(bh))
++ do_readahead(journal, offset);
++ wait_on_buffer(bh);
++ }
++
++ if (!buffer_uptodate(bh)) {
++ printk (KERN_ERR "JBD: Failed to read block at offset %u\n",
++ offset);
++ brelse(bh);
++ return -EIO;
++ }
++
++ *bhp = bh;
++ return 0;
++}
++
++
++/*
++ * Count the number of in-use tags in a journal descriptor block.
++ */
++
++static int count_tags(struct buffer_head *bh, int size)
++{
++ char * tagp;
++ journal_block_tag_t * tag;
++ int nr = 0;
++
++ tagp = &bh->b_data[sizeof(journal_header_t)];
++
++ while ((tagp - bh->b_data + sizeof(journal_block_tag_t)) <= size) {
++ tag = (journal_block_tag_t *) tagp;
++
++ nr++;
++ tagp += sizeof(journal_block_tag_t);
++ if (!(tag->t_flags & htonl(JFS_FLAG_SAME_UUID)))
++ tagp += 16;
++
++ if (tag->t_flags & htonl(JFS_FLAG_LAST_TAG))
++ break;
++ }
++
++ return nr;
++}
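++
++/*
++ * Editorial sketch of the descriptor block layout walked above,
++ * derived from the tag arithmetic in count_tags():
++ *
++ *	journal_header_t	magic, blocktype, sequence
++ *	journal_block_tag_t	t_blocknr, t_flags
++ *	[16-byte UUID]		omitted when JFS_FLAG_SAME_UUID is set
++ *	journal_block_tag_t	... repeated; the final tag carries
++ *				JFS_FLAG_LAST_TAG
++ */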
++
++
++/* Make sure we wrap around the log correctly! */
++#define wrap(journal, var) \
++do { \
++ if (var >= (journal)->j_last) \
++ var -= ((journal)->j_last - (journal)->j_first); \
++} while (0)
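++
++/*
++ * Worked example (editorial note): with j_first == 1 and
++ * j_last == 1024, a variable that reaches 1024 wraps to
++ * 1024 - (1024 - 1) == 1, continuing the scan at the start of the
++ * circular log.
++ */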
++
++/*
++ * journal_recover
++ *
++ * The primary function for recovering the log contents when mounting a
++ * journaled device.
++ *
++ * Recovery is done in three passes. In the first pass, we look for the
++ * end of the log. In the second, we assemble the list of revoke
++ * blocks. In the third and final pass, we replay any un-revoked blocks
++ * in the log.
++ */
++
++int journal_recover(journal_t *journal)
++{
++ int err;
++ journal_superblock_t * sb;
++
++ struct recovery_info info;
++
++ memset(&info, 0, sizeof(info));
++ sb = journal->j_superblock;
++
++ /*
++ * The journal superblock's s_start field (the current log head)
++ * is always zero if, and only if, the journal was cleanly
++ * unmounted.
++ */
++
++ if (!sb->s_start) {
++ jbd_debug(1, "No recovery required, last transaction %d\n",
++ ntohl(sb->s_sequence));
++ journal->j_transaction_sequence = ntohl(sb->s_sequence) + 1;
++ return 0;
++ }
++
++
++ err = do_one_pass(journal, &info, PASS_SCAN);
++ if (!err)
++ err = do_one_pass(journal, &info, PASS_REVOKE);
++ if (!err)
++ err = do_one_pass(journal, &info, PASS_REPLAY);
++
++ jbd_debug(0, "JBD: recovery, exit status %d, "
++ "recovered transactions %u to %u\n",
++ err, info.start_transaction, info.end_transaction);
++ jbd_debug(0, "JBD: Replayed %d and revoked %d/%d blocks\n",
++ info.nr_replays, info.nr_revoke_hits, info.nr_revokes);
++
++ /* Restart the log at the next transaction ID, thus invalidating
++ * any existing commit records in the log. */
++ journal->j_transaction_sequence = ++info.end_transaction;
++
++ journal_clear_revoke(journal);
++ fsync_no_super(journal->j_fs_dev);
++ return err;
++}
++
++/*
++ * journal_skip_recovery
++ *
++ * Locate any valid recovery information from the journal and set up the
++ * journal structures in memory to ignore it (presumably because the
++ * caller has evidence that it is out of date).
++ *
++ * We perform one pass over the journal to allow us to tell the user how
++ * much recovery information is being erased, and to let us initialise
++ * the journal transaction sequence numbers to the next unused ID.
++ */
++
++int journal_skip_recovery(journal_t *journal)
++{
++ int err;
++ journal_superblock_t * sb;
++
++ struct recovery_info info;
++
++ memset (&info, 0, sizeof(info));
++ sb = journal->j_superblock;
++
++ err = do_one_pass(journal, &info, PASS_SCAN);
++
++ if (err) {
++ printk(KERN_ERR "JBD: error %d scanning journal\n", err);
++ ++journal->j_transaction_sequence;
++ } else {
++#ifdef CONFIG_JBD_DEBUG
++ int dropped = info.end_transaction - ntohl(sb->s_sequence);
++#endif
++
++ jbd_debug(0,
++ "JBD: ignoring %d transaction%s from the journal.\n",
++ dropped, (dropped == 1) ? "" : "s");
++ journal->j_transaction_sequence = ++info.end_transaction;
++ }
++
++ journal->j_tail = 0;
++
++ return err;
++}
++
++static int do_one_pass(journal_t *journal,
++ struct recovery_info *info, enum passtype pass)
++{
++
++ unsigned int first_commit_ID, next_commit_ID;
++ unsigned long next_log_block;
++ int err, success = 0;
++ journal_superblock_t * sb;
++ journal_header_t * tmp;
++ struct buffer_head * bh;
++ unsigned int sequence;
++ int blocktype;
++
++	/* Precompute the maximum number of metadata tags per descriptor block */
++ int MAX_BLOCKS_PER_DESC;
++ MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t))
++ / sizeof(journal_block_tag_t));
++
++ /*
++ * First thing is to establish what we expect to find in the log
++ * (in terms of transaction IDs), and where (in terms of log
++ * block offsets): query the superblock.
++ */
++
++ sb = journal->j_superblock;
++ next_commit_ID = ntohl(sb->s_sequence);
++ next_log_block = ntohl(sb->s_start);
++
++ first_commit_ID = next_commit_ID;
++ if (pass == PASS_SCAN)
++ info->start_transaction = first_commit_ID;
++
++ jbd_debug(1, "Starting recovery pass %d\n", pass);
++
++ /*
++ * Now we walk through the log, transaction by transaction,
++ * making sure that each transaction has a commit block in the
++ * expected place. Each complete transaction gets replayed back
++ * into the main filesystem.
++ */
++
++ while (1) {
++ int flags;
++ char * tagp;
++ journal_block_tag_t * tag;
++ struct buffer_head * obh;
++ struct buffer_head * nbh;
++
++ /* If we already know where to stop the log traversal,
++ * check right now that we haven't gone past the end of
++ * the log. */
++
++ if (pass != PASS_SCAN)
++ if (tid_geq(next_commit_ID, info->end_transaction))
++ break;
++
++ jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
++ next_commit_ID, next_log_block, journal->j_last);
++
++		/* Skip over each chunk of the transaction looking for
++		 * either the next descriptor block or the final commit
++ * record. */
++
++ jbd_debug(3, "JBD: checking block %ld\n", next_log_block);
++ err = jread(&bh, journal, next_log_block);
++ if (err)
++ goto failed;
++
++ next_log_block++;
++ wrap(journal, next_log_block);
++
++ /* What kind of buffer is it?
++ *
++ * If it is a descriptor block, check that it has the
++ * expected sequence number. Otherwise, we're all done
++ * here. */
++
++ tmp = (journal_header_t *)bh->b_data;
++
++ if (tmp->h_magic != htonl(JFS_MAGIC_NUMBER)) {
++ brelse(bh);
++ break;
++ }
++
++ blocktype = ntohl(tmp->h_blocktype);
++ sequence = ntohl(tmp->h_sequence);
++ jbd_debug(3, "Found magic %d, sequence %d\n",
++ blocktype, sequence);
++
++ if (sequence != next_commit_ID) {
++ brelse(bh);
++ break;
++ }
++
++ /* OK, we have a valid descriptor block which matches
++ * all of the sequence number checks. What are we going
++ * to do with it? That depends on the pass... */
++
++ switch(blocktype) {
++ case JFS_DESCRIPTOR_BLOCK:
++ /* If it is a valid descriptor block, replay it
++ * in pass REPLAY; otherwise, just skip over the
++ * blocks it describes. */
++ if (pass != PASS_REPLAY) {
++ next_log_block +=
++ count_tags(bh, journal->j_blocksize);
++ wrap(journal, next_log_block);
++ brelse(bh);
++ continue;
++ }
++
++ /* A descriptor block: we can now write all of
++ * the data blocks. Yay, useful work is finally
++ * getting done here! */
++
++ tagp = &bh->b_data[sizeof(journal_header_t)];
++			while ((tagp - bh->b_data + sizeof(journal_block_tag_t))
++ <= journal->j_blocksize) {
++ unsigned long io_block;
++
++ tag = (journal_block_tag_t *) tagp;
++ flags = ntohl(tag->t_flags);
++
++ io_block = next_log_block++;
++ wrap(journal, next_log_block);
++ err = jread(&obh, journal, io_block);
++ if (err) {
++ /* Recover what we can, but
++ * report failure at the end. */
++ success = err;
++ printk (KERN_ERR
++ "JBD: IO error %d recovering "
++ "block %ld in log\n",
++ err, io_block);
++ } else {
++ unsigned long blocknr;
++
++ J_ASSERT(obh != NULL);
++ blocknr = ntohl(tag->t_blocknr);
++
++ /* If the block has been
++ * revoked, then we're all done
++ * here. */
++ if (journal_test_revoke
++ (journal, blocknr,
++ next_commit_ID)) {
++ brelse(obh);
++ ++info->nr_revoke_hits;
++ goto skip_write;
++ }
++
++ /* Find a buffer for the new
++ * data being restored */
++ nbh = getblk(journal->j_fs_dev, blocknr,
++ journal->j_blocksize);
++ if (nbh == NULL) {
++ printk(KERN_ERR
++ "JBD: Out of memory "
++ "during recovery.\n");
++ err = -ENOMEM;
++ brelse(bh);
++ brelse(obh);
++ goto failed;
++ }
++
++ lock_buffer(nbh);
++ memcpy(nbh->b_data, obh->b_data,
++ journal->j_blocksize);
++				if (flags & JFS_FLAG_ESCAPE) {
++					/* Restore the escaped magic in
++					 * the replayed copy (nbh), not
++					 * in the log block (bh). */
++					*((unsigned int *)nbh->b_data) =
++					htonl(JFS_MAGIC_NUMBER);
++				}
++
++ BUFFER_TRACE(nbh, "marking dirty");
++ mark_buffer_dirty(nbh);
++ BUFFER_TRACE(nbh, "marking uptodate");
++ mark_buffer_uptodate(nbh, 1);
++ unlock_buffer(nbh);
++ ++info->nr_replays;
++ /* ll_rw_block(WRITE, 1, &nbh); */
++ brelse(obh);
++ brelse(nbh);
++ }
++
++ skip_write:
++ tagp += sizeof(journal_block_tag_t);
++ if (!(flags & JFS_FLAG_SAME_UUID))
++ tagp += 16;
++
++ if (flags & JFS_FLAG_LAST_TAG)
++ break;
++ }
++
++ brelse(bh);
++ continue;
++
++ case JFS_COMMIT_BLOCK:
++ /* Found an expected commit block: not much to
++ * do other than move on to the next sequence
++ * number. */
++ brelse(bh);
++ next_commit_ID++;
++ continue;
++
++ case JFS_REVOKE_BLOCK:
++ /* If we aren't in the REVOKE pass, then we can
++ * just skip over this block. */
++ if (pass != PASS_REVOKE) {
++ brelse(bh);
++ continue;
++ }
++
++ err = scan_revoke_records(journal, bh,
++ next_commit_ID, info);
++ brelse(bh);
++ if (err)
++ goto failed;
++ continue;
++
++ default:
++ jbd_debug(3, "Unrecognised magic %d, end of scan.\n",
++ blocktype);
++ goto done;
++ }
++ }
++
++ done:
++ /*
++ * We broke out of the log scan loop: either we came to the
++ * known end of the log or we found an unexpected block in the
++ * log. If the latter happened, then we know that the "current"
++ * transaction marks the end of the valid log.
++ */
++
++ if (pass == PASS_SCAN)
++ info->end_transaction = next_commit_ID;
++ else {
++ /* It's really bad news if different passes end up at
++ * different places (but possible due to IO errors). */
++ if (info->end_transaction != next_commit_ID) {
++ printk (KERN_ERR "JBD: recovery pass %d ended at "
++ "transaction %u, expected %u\n",
++ pass, next_commit_ID, info->end_transaction);
++ if (!success)
++ success = -EIO;
++ }
++ }
++
++ return success;
++
++ failed:
++ return err;
++}
++
++
++/* Scan a revoke record, marking all blocks mentioned as revoked. */
++
++static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
++ tid_t sequence, struct recovery_info *info)
++{
++ journal_revoke_header_t *header;
++ int offset, max;
++
++ header = (journal_revoke_header_t *) bh->b_data;
++ offset = sizeof(journal_revoke_header_t);
++ max = ntohl(header->r_count);
++
++ while (offset < max) {
++ unsigned long blocknr;
++ int err;
++
++ blocknr = ntohl(* ((unsigned int *) (bh->b_data+offset)));
++ offset += 4;
++ err = journal_set_revoke(journal, blocknr, sequence);
++ if (err)
++ return err;
++ ++info->nr_revokes;
++ }
++ return 0;
++}
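++
++/*
++ * Editorial sketch of the revoke block parsed above: r_count holds the
++ * total number of valid bytes including the header itself (see
++ * flush_descriptor() in revoke.c), so the layout is
++ *
++ *	journal_revoke_header_t		header, r_count
++ *	__u32 blocknr, blocknr, ...	big-endian block numbers
++ */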
+diff -ruP linux.mcp2/fs/jbd/revoke.c linuxppc_2.4.19_final/fs/jbd/revoke.c
+--- linux.mcp2/fs/jbd/revoke.c 1969-12-31 16:00:00.000000000 -0800
++++ linuxppc_2.4.19_final/fs/jbd/revoke.c 2004-05-17 13:56:17.000000000 -0700
+@@ -0,0 +1,636 @@
++/*
++ * linux/fs/revoke.c
++ *
++ * Written by Stephen C. Tweedie <sct@redhat.com>, 2000
++ *
++ * Copyright 2000 Red Hat corp --- All Rights Reserved
++ *
++ * This file is part of the Linux kernel and is made available under
++ * the terms of the GNU General Public License, version 2, or at your
++ * option, any later version, incorporated herein by reference.
++ *
++ * Journal revoke routines for the generic filesystem journaling code;
++ * part of the ext2fs journaling system.
++ *
++ * Revoke is the mechanism used to prevent old log records for deleted
++ * metadata from being replayed on top of newer data using the same
++ * blocks. The revoke mechanism is used in two separate places:
++ *
++ * + Commit: during commit we write the entire list of the current
++ * transaction's revoked blocks to the journal
++ *
++ * + Recovery: during recovery we record the transaction ID of all
++ * revoked blocks. If there are multiple revoke records in the log
++ * for a single block, only the last one counts, and if there is a log
++ * entry for a block beyond the last revoke, then that log entry still
++ * gets replayed.
++ *
++ * We can get interactions between revokes and new log data within a
++ * single transaction:
++ *
++ * Block is revoked and then journaled:
++ * The desired end result is the journaling of the new block, so we
++ * cancel the revoke before the transaction commits.
++ *
++ * Block is journaled and then revoked:
++ * The revoke must take precedence over the write of the block, so we
++ * need either to cancel the journal entry or to write the revoke
++ * later in the log than the log block. In this case, we choose the
++ * latter: journaling a block cancels any revoke record for that block
++ * in the current transaction, so any revoke for that block in the
++ * transaction must have happened after the block was journaled and so
++ * the revoke must take precedence.
++ *
++ * Block is revoked and then written as data:
++ * The data write is allowed to succeed, but the revoke is _not_
++ * cancelled. We still need to prevent old log records from
++ * overwriting the new data. We don't even need to clear the revoke
++ * bit here.
++ *
++ * Revoke information on buffers is a tri-state value:
++ *
++ * RevokeValid clear: no cached revoke status, need to look it up
++ * RevokeValid set, Revoked clear:
++ * buffer has not been revoked, and cancel_revoke
++ * need do nothing.
++ * RevokeValid set, Revoked set:
++ * buffer has been revoked.
++ */
++
++#ifndef __KERNEL__
++#include "jfs_user.h"
++#else
++#include <linux/sched.h>
++#include <linux/fs.h>
++#include <linux/jbd.h>
++#include <linux/errno.h>
++#include <linux/slab.h>
++#include <linux/locks.h>
++#include <linux/list.h>
++#include <linux/smp_lock.h>
++#include <linux/init.h>
++#endif
++
++static kmem_cache_t *revoke_record_cache;
++static kmem_cache_t *revoke_table_cache;
++
++/* Each revoke record represents one single revoked block. During
++ journal replay, this involves recording the transaction ID of the
++ last transaction to revoke this block. */
++
++struct jbd_revoke_record_s
++{
++ struct list_head hash;
++ tid_t sequence; /* Used for recovery only */
++ unsigned long blocknr;
++};
++
++
++/* The revoke table is just a simple hash table of revoke records. */
++struct jbd_revoke_table_s
++{
++ /* It is conceivable that we might want a larger hash table
++ * for recovery. Must be a power of two. */
++ int hash_size;
++ int hash_shift;
++ struct list_head *hash_table;
++};
++
++
++#ifdef __KERNEL__
++static void write_one_revoke_record(journal_t *, transaction_t *,
++ struct journal_head **, int *,
++ struct jbd_revoke_record_s *);
++static void flush_descriptor(journal_t *, struct journal_head *, int);
++#endif
++
++/* Utility functions to maintain the revoke table */
++
++/* Borrowed from buffer.c: this is a tried and tested block hash function */
++static inline int hash(journal_t *journal, unsigned long block)
++{
++ struct jbd_revoke_table_s *table = journal->j_revoke;
++ int hash_shift = table->hash_shift;
++
++ return ((block << (hash_shift - 6)) ^
++ (block >> 13) ^
++ (block << (hash_shift - 12))) & (table->hash_size - 1);
++}
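++
++/*
++ * Worked example (editorial note): for a 16384-entry table
++ * (hash_shift == 14), block 1000 maps to
++ * ((1000 << 8) ^ (1000 >> 13) ^ (1000 << 2)) & 16383 == 10144.
++ * Note that tables smaller than 4096 entries make the third shift
++ * count (hash_shift - 12) negative, which C leaves undefined.
++ */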
++
++int insert_revoke_hash(journal_t *journal, unsigned long blocknr, tid_t seq)
++{
++ struct list_head *hash_list;
++ struct jbd_revoke_record_s *record;
++
++repeat:
++ record = kmem_cache_alloc(revoke_record_cache, GFP_NOFS);
++ if (!record)
++ goto oom;
++
++ record->sequence = seq;
++ record->blocknr = blocknr;
++ hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
++ list_add(&record->hash, hash_list);
++ return 0;
++
++oom:
++ if (!journal_oom_retry)
++ return -ENOMEM;
++ jbd_debug(1, "ENOMEM in " __FUNCTION__ ", retrying.\n");
++ current->policy |= SCHED_YIELD;
++ schedule();
++ goto repeat;
++}
++
++/* Find a revoke record in the journal's hash table. */
++
++static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal,
++ unsigned long blocknr)
++{
++ struct list_head *hash_list;
++ struct jbd_revoke_record_s *record;
++
++ hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
++
++ record = (struct jbd_revoke_record_s *) hash_list->next;
++ while (&(record->hash) != hash_list) {
++ if (record->blocknr == blocknr)
++ return record;
++ record = (struct jbd_revoke_record_s *) record->hash.next;
++ }
++ return NULL;
++}
++
++int __init journal_init_revoke_caches(void)
++{
++ revoke_record_cache = kmem_cache_create("revoke_record",
++ sizeof(struct jbd_revoke_record_s),
++ 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
++ if (revoke_record_cache == 0)
++ return -ENOMEM;
++
++ revoke_table_cache = kmem_cache_create("revoke_table",
++ sizeof(struct jbd_revoke_table_s),
++ 0, 0, NULL, NULL);
++ if (revoke_table_cache == 0) {
++ kmem_cache_destroy(revoke_record_cache);
++ revoke_record_cache = NULL;
++ return -ENOMEM;
++ }
++ return 0;
++}
++
++void journal_destroy_revoke_caches(void)
++{
++ kmem_cache_destroy(revoke_record_cache);
++ revoke_record_cache = 0;
++ kmem_cache_destroy(revoke_table_cache);
++ revoke_table_cache = 0;
++}
++
++/* Initialise the revoke table for a given journal to a given size. */
++
++int journal_init_revoke(journal_t *journal, int hash_size)
++{
++ int shift, tmp;
++
++ J_ASSERT (journal->j_revoke == NULL);
++
++ journal->j_revoke = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL);
++ if (!journal->j_revoke)
++ return -ENOMEM;
++
++ /* Check that the hash_size is a power of two */
++ J_ASSERT ((hash_size & (hash_size-1)) == 0);
++
++ journal->j_revoke->hash_size = hash_size;
++
++ shift = 0;
++ tmp = hash_size;
++ while((tmp >>= 1UL) != 0UL)
++ shift++;
++ journal->j_revoke->hash_shift = shift;
++
++ journal->j_revoke->hash_table =
++ kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
++ if (!journal->j_revoke->hash_table) {
++ kmem_cache_free(revoke_table_cache, journal->j_revoke);
++ journal->j_revoke = NULL;
++ return -ENOMEM;
++ }
++
++ for (tmp = 0; tmp < hash_size; tmp++)
++ INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]);
++
++ return 0;
++}
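++
++/*
++ * Worked example (editorial note): journal_init_revoke(journal, 256)
++ * passes the power-of-two assertion, and the shift loop above yields
++ * hash_shift == 8 (log2 of the table size) after eight halvings of
++ * tmp.
++ */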
++
++/* Destroy a journal's revoke table. The table must already be empty! */
++
++void journal_destroy_revoke(journal_t *journal)
++{
++ struct jbd_revoke_table_s *table;
++ struct list_head *hash_list;
++ int i;
++
++ table = journal->j_revoke;
++ if (!table)
++ return;
++
++ for (i=0; i<table->hash_size; i++) {
++ hash_list = &table->hash_table[i];
++ J_ASSERT (list_empty(hash_list));
++ }
++
++ kfree(table->hash_table);
++ kmem_cache_free(revoke_table_cache, table);
++ journal->j_revoke = NULL;
++}
++
++
++#ifdef __KERNEL__
++
++/*
++ * journal_revoke: revoke a given buffer_head from the journal. This
++ * prevents the block from being replayed during recovery if we take a
++ * crash after this current transaction commits. Any subsequent
++ * metadata writes of the buffer in this transaction cancel the
++ * revoke.
++ *
++ * Note that this call may block --- it is up to the caller to make
++ * sure that there are no further calls to journal_write_metadata
++ * before the revoke is complete. In ext3, this implies calling the
++ * revoke before clearing the block bitmap when we are deleting
++ * metadata.
++ *
++ * Revoke performs a journal_forget on any buffer_head passed in as a
++ * parameter, but does _not_ forget the buffer_head if the bh was only
++ * found implicitly.
++ *
++ * bh_in may not be a journalled buffer - it may have come off
++ * the hash tables without an attached journal_head.
++ *
++ * If bh_in is non-zero, journal_revoke() will decrement its b_count
++ * by one.
++ */
++
++int journal_revoke(handle_t *handle, unsigned long blocknr,
++ struct buffer_head *bh_in)
++{
++ struct buffer_head *bh = NULL;
++ journal_t *journal;
++ kdev_t dev;
++ int err;
++
++ if (bh_in)
++ BUFFER_TRACE(bh_in, "enter");
++
++ journal = handle->h_transaction->t_journal;
++ if (!journal_set_features(journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)){
++ J_ASSERT (!"Cannot set revoke feature!");
++ return -EINVAL;
++ }
++
++ dev = journal->j_fs_dev;
++ bh = bh_in;
++
++ if (!bh) {
++ bh = get_hash_table(dev, blocknr, journal->j_blocksize);
++ if (bh)
++ BUFFER_TRACE(bh, "found on hash");
++ }
++#ifdef JBD_EXPENSIVE_CHECKING
++ else {
++ struct buffer_head *bh2;
++
++ /* If there is a different buffer_head lying around in
++ * memory anywhere... */
++ bh2 = get_hash_table(dev, blocknr, journal->j_blocksize);
++ if (bh2) {
++ /* ... and it has RevokeValid status... */
++ if ((bh2 != bh) &&
++ test_bit(BH_RevokeValid, &bh2->b_state))
++ /* ...then it better be revoked too,
++ * since it's illegal to create a revoke
++ * record against a buffer_head which is
++ * not marked revoked --- that would
++ * risk missing a subsequent revoke
++ * cancel. */
++ J_ASSERT_BH(bh2, test_bit(BH_Revoked, &
++ bh2->b_state));
++ __brelse(bh2);
++ }
++ }
++#endif
++
++ /* We really ought not ever to revoke twice in a row without
++ first having the revoke cancelled: it's illegal to free a
++ block twice without allocating it in between! */
++ if (bh) {
++ J_ASSERT_BH(bh, !test_bit(BH_Revoked, &bh->b_state));
++ set_bit(BH_Revoked, &bh->b_state);
++ set_bit(BH_RevokeValid, &bh->b_state);
++ if (bh_in) {
++ BUFFER_TRACE(bh_in, "call journal_forget");
++ journal_forget(handle, bh_in);
++ } else {
++ BUFFER_TRACE(bh, "call brelse");
++ __brelse(bh);
++ }
++ }
++
++ lock_journal(journal);
++ jbd_debug(2, "insert revoke for block %lu, bh_in=%p\n", blocknr, bh_in);
++ err = insert_revoke_hash(journal, blocknr,
++ handle->h_transaction->t_tid);
++ unlock_journal(journal);
++ BUFFER_TRACE(bh_in, "exit");
++ return err;
++}
++
++/*
++ * Cancel an outstanding revoke. For use only internally by the
++ * journaling code (called from journal_get_write_access).
++ *
++ * We trust the BH_Revoked bit on the buffer if the buffer is already
++ * being journaled: if there is no revoke pending on the buffer, then we
++ * don't do anything here.
++ *
++ * This would break if it were possible for a buffer to be revoked and
++ * discarded, and then reallocated within the same transaction. In such
++ * a case we would have lost the revoked bit, but when we arrived here
++ * the second time we would still have a pending revoke to cancel. So,
++ * do not trust the Revoked bit on buffers unless RevokeValid is also
++ * set.
++ *
++ * The caller must have the journal locked.
++ */
++int journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
++{
++ struct jbd_revoke_record_s *record;
++ journal_t *journal = handle->h_transaction->t_journal;
++ int need_cancel;
++ int did_revoke = 0; /* akpm: debug */
++ struct buffer_head *bh = jh2bh(jh);
++
++ jbd_debug(4, "journal_head %p, cancelling revoke\n", jh);
++
++ /* Is the existing Revoke bit valid? If so, we trust it, and
++ * only perform the full cancel if the revoke bit is set. If
++ * not, we can't trust the revoke bit, and we need to do the
++ * full search for a revoke record. */
++ if (test_and_set_bit(BH_RevokeValid, &bh->b_state))
++ need_cancel = (test_and_clear_bit(BH_Revoked, &bh->b_state));
++ else {
++ need_cancel = 1;
++ clear_bit(BH_Revoked, &bh->b_state);
++ }
++
++ if (need_cancel) {
++ record = find_revoke_record(journal, bh->b_blocknr);
++ if (record) {
++ jbd_debug(4, "cancelled existing revoke on "
++ "blocknr %lu\n", bh->b_blocknr);
++ list_del(&record->hash);
++ kmem_cache_free(revoke_record_cache, record);
++ did_revoke = 1;
++ }
++ }
++
++#ifdef JBD_EXPENSIVE_CHECKING
++ /* There better not be one left behind by now! */
++ record = find_revoke_record(journal, bh->b_blocknr);
++ J_ASSERT_JH(jh, record == NULL);
++#endif
++
++ /* Finally, have we just cleared revoke on an unhashed
++ * buffer_head? If so, we'd better make sure we clear the
++ * revoked status on any hashed alias too, otherwise the revoke
++ * state machine will get very upset later on. */
++ if (need_cancel && !bh->b_pprev) {
++ struct buffer_head *bh2;
++ bh2 = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size);
++ if (bh2) {
++ clear_bit(BH_Revoked, &bh2->b_state);
++ __brelse(bh2);
++ }
++ }
++
++ return did_revoke;
++}
++
++
++/*
++ * Write revoke records to the journal for all entries in the current
++ * revoke hash, deleting the entries as we go.
++ *
++ * Called with the journal lock held.
++ */
++
++void journal_write_revoke_records(journal_t *journal,
++ transaction_t *transaction)
++{
++ struct journal_head *descriptor;
++ struct jbd_revoke_record_s *record;
++ struct jbd_revoke_table_s *revoke;
++ struct list_head *hash_list;
++ int i, offset, count;
++
++ descriptor = NULL;
++ offset = 0;
++ count = 0;
++ revoke = journal->j_revoke;
++
++ for (i = 0; i < revoke->hash_size; i++) {
++ hash_list = &revoke->hash_table[i];
++
++ while (!list_empty(hash_list)) {
++ record = (struct jbd_revoke_record_s *)
++ hash_list->next;
++ write_one_revoke_record(journal, transaction,
++ &descriptor, &offset,
++ record);
++ count++;
++ list_del(&record->hash);
++ kmem_cache_free(revoke_record_cache, record);
++ }
++ }
++ if (descriptor)
++ flush_descriptor(journal, descriptor, offset);
++ jbd_debug(1, "Wrote %d revoke records\n", count);
++}
++
++/*
++ * Write out one revoke record. We need to create a new descriptor
++ * block if the old one is full or if we have not already created one.
++ */
++
++static void write_one_revoke_record(journal_t *journal,
++ transaction_t *transaction,
++ struct journal_head **descriptorp,
++ int *offsetp,
++ struct jbd_revoke_record_s *record)
++{
++ struct journal_head *descriptor;
++ int offset;
++ journal_header_t *header;
++
++ /* If we are already aborting, this all becomes a noop. We
++ still need to go round the loop in
++ journal_write_revoke_records in order to free all of the
++ revoke records: only the IO to the journal is omitted. */
++ if (is_journal_aborted(journal))
++ return;
++
++ descriptor = *descriptorp;
++ offset = *offsetp;
++
++ /* Make sure we have a descriptor with space left for the record */
++ if (descriptor) {
++ if (offset == journal->j_blocksize) {
++ flush_descriptor(journal, descriptor, offset);
++ descriptor = NULL;
++ }
++ }
++
++ if (!descriptor) {
++ descriptor = journal_get_descriptor_buffer(journal);
++ if (!descriptor)
++ return;
++ header = (journal_header_t *) &jh2bh(descriptor)->b_data[0];
++ header->h_magic = htonl(JFS_MAGIC_NUMBER);
++ header->h_blocktype = htonl(JFS_REVOKE_BLOCK);
++ header->h_sequence = htonl(transaction->t_tid);
++
++ /* Record it so that we can wait for IO completion later */
++ JBUFFER_TRACE(descriptor, "file as BJ_LogCtl");
++ journal_file_buffer(descriptor, transaction, BJ_LogCtl);
++
++ offset = sizeof(journal_revoke_header_t);
++ *descriptorp = descriptor;
++ }
++
++ * ((unsigned int *)(&jh2bh(descriptor)->b_data[offset])) =
++ htonl(record->blocknr);
++ offset += 4;
++ *offsetp = offset;
++}
++
++/*
++ * Flush a revoke descriptor out to the journal. If we are aborting,
++ * this is a noop; otherwise we are generating a buffer which needs to
++ * be waited for during commit, so it has to go onto the appropriate
++ * journal buffer list.
++ */
++
++static void flush_descriptor(journal_t *journal,
++ struct journal_head *descriptor,
++ int offset)
++{
++ journal_revoke_header_t *header;
++
++ if (is_journal_aborted(journal)) {
++ JBUFFER_TRACE(descriptor, "brelse");
++ unlock_buffer(jh2bh(descriptor));
++ __brelse(jh2bh(descriptor));
++ return;
++ }
++
++ header = (journal_revoke_header_t *) jh2bh(descriptor)->b_data;
++ header->r_count = htonl(offset);
++ set_bit(BH_JWrite, &jh2bh(descriptor)->b_state);
++ {
++ struct buffer_head *bh = jh2bh(descriptor);
++ BUFFER_TRACE(bh, "write");
++ clear_bit(BH_Dirty, &bh->b_state);
++ bh->b_end_io = journal_end_buffer_io_sync;
++ submit_bh(WRITE, bh);
++ }
++}
++
++#endif
++
++/*
++ * Revoke support for recovery.
++ *
++ * Recovery needs to be able to:
++ *
++ * record all revoke records, including the tid of the latest instance
++ * of each revoke in the journal
++ *
++ * check whether a given block in a given transaction should be replayed
++ * (ie. has not been revoked by a revoke record in that or a subsequent
++ * transaction)
++ *
++ * empty the revoke table after recovery.
++ */
++
++/*
++ * First, setting revoke records. We create a new revoke record for
++ * every block ever revoked in the log as we scan it for recovery, and
++ * we update the existing records if we find multiple revokes for a
++ * single block.
++ */
++
++int journal_set_revoke(journal_t *journal,
++ unsigned long blocknr,
++ tid_t sequence)
++{
++ struct jbd_revoke_record_s *record;
++
++ record = find_revoke_record(journal, blocknr);
++ if (record) {
++		/* If we have multiple occurrences, only record the
++ * latest sequence number in the hashed record */
++ if (tid_gt(sequence, record->sequence))
++ record->sequence = sequence;
++ return 0;
++ }
++ return insert_revoke_hash(journal, blocknr, sequence);
++}
++
++/*
++ * Test revoke records. For a given block referenced in the log, has
++ * that block been revoked? A revoke record with a given transaction
++ * sequence number revokes all blocks in that transaction and earlier
++ * ones, but later transactions still need to be replayed.
++ */
++
++int journal_test_revoke(journal_t *journal,
++ unsigned long blocknr,
++ tid_t sequence)
++{
++ struct jbd_revoke_record_s *record;
++
++ record = find_revoke_record(journal, blocknr);
++ if (!record)
++ return 0;
++ if (tid_gt(sequence, record->sequence))
++ return 0;
++ return 1;
++}
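++
++/*
++ * Worked example (editorial note): if block 42 was last revoked in
++ * transaction 7, journal_test_revoke(journal, 42, 6) returns 1 (the
++ * log entry is covered by the revoke, do not replay it), while
++ * journal_test_revoke(journal, 42, 8) returns 0 (the entry belongs to
++ * a later transaction and must be replayed).
++ */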
++
++/*
++ * Finally, once recovery is over, we need to clear the revoke table so
++ * that it can be reused by the running filesystem.
++ */
++
++void journal_clear_revoke(journal_t *journal)
++{
++ int i;
++ struct list_head *hash_list;
++ struct jbd_revoke_record_s *record;
++ struct jbd_revoke_table_s *revoke;
++
++ revoke = journal->j_revoke;
++
++ for (i = 0; i < revoke->hash_size; i++) {
++ hash_list = &revoke->hash_table[i];
++ while (!list_empty(hash_list)) {
++ record = (struct jbd_revoke_record_s*) hash_list->next;
++ list_del(&record->hash);
++ kmem_cache_free(revoke_record_cache, record);
++ }
++ }
++}
++
+diff -ruP linux.mcp2/fs/jbd/transaction.c linuxppc_2.4.19_final/fs/jbd/transaction.c
+--- linux.mcp2/fs/jbd/transaction.c 1969-12-31 16:00:00.000000000 -0800
++++ linuxppc_2.4.19_final/fs/jbd/transaction.c 2004-05-17 13:56:17.000000000 -0700
+@@ -0,0 +1,2055 @@
++/*
++ * linux/fs/transaction.c
++ *
++ * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
++ *
++ * Copyright 1998 Red Hat corp --- All Rights Reserved
++ *
++ * This file is part of the Linux kernel and is made available under
++ * the terms of the GNU General Public License, version 2, or at your
++ * option, any later version, incorporated herein by reference.
++ *
++ * Generic filesystem transaction handling code; part of the ext2fs
++ * journaling system.
++ *
++ * This file manages transactions (compound commits managed by the
++ * journaling code) and handles (individual atomic operations by the
++ * filesystem).
++ */
++
++#include <linux/sched.h>
++#include <linux/fs.h>
++#include <linux/jbd.h>
++#include <linux/errno.h>
++#include <linux/slab.h>
++#include <linux/locks.h>
++#include <linux/timer.h>
++#include <linux/smp_lock.h>
++#include <linux/mm.h>
++
++extern spinlock_t journal_datalist_lock;
++
++/*
++ * get_transaction: obtain a new transaction_t object.
++ *
++ * Simply allocate and initialise a new transaction. Create it in
++ * RUNNING state and add it to the current journal (which should not
++ * have an existing running transaction: we only make a new transaction
++ * once we have started to commit the old one).
++ *
++ * Preconditions:
++ * The journal MUST be locked. We don't perform atomic mallocs on the
++ * new transaction and we can't block without protecting against other
++ * processes trying to touch the journal while it is in transition.
++ */
++
++static transaction_t * get_transaction (journal_t * journal, int is_try)
++{
++ transaction_t * transaction;
++
++ transaction = jbd_kmalloc (sizeof (transaction_t), GFP_NOFS);
++ if (!transaction)
++ return NULL;
++
++ memset (transaction, 0, sizeof (transaction_t));
++
++ transaction->t_journal = journal;
++ transaction->t_state = T_RUNNING;
++ transaction->t_tid = journal->j_transaction_sequence++;
++ transaction->t_expires = jiffies + journal->j_commit_interval;
++
++ /* Set up the commit timer for the new transaction. */
++ J_ASSERT (!journal->j_commit_timer_active);
++ journal->j_commit_timer_active = 1;
++ journal->j_commit_timer->expires = transaction->t_expires;
++ add_timer(journal->j_commit_timer);
++
++ J_ASSERT (journal->j_running_transaction == NULL);
++ journal->j_running_transaction = transaction;
++
++ return transaction;
++}
++
++/*
++ * Handle management.
++ *
++ * A handle_t is an object which represents a single atomic update to a
++ * filesystem, and which tracks all of the modifications which form part
++ * of that one update.
++ */
++
++/*
++ * start_this_handle: Given a handle, deal with any locking or stalling
++ * needed to make sure that there is enough journal space for the handle
++ * to begin. Attach the handle to a transaction and set up the
++ * transaction's buffer credits.
++ */
++
++static int start_this_handle(journal_t *journal, handle_t *handle)
++{
++ transaction_t *transaction;
++ int needed;
++ int nblocks = handle->h_buffer_credits;
++
++ jbd_debug(3, "New handle %p going live.\n", handle);
++
++repeat:
++
++ lock_journal(journal);
++
++repeat_locked:
++
++ if (is_journal_aborted(journal) ||
++ (journal->j_errno != 0 && !(journal->j_flags & JFS_ACK_ERR))) {
++ unlock_journal(journal);
++ return -EROFS;
++ }
++
++ /* Wait on the journal's transaction barrier if necessary */
++ if (journal->j_barrier_count) {
++ unlock_journal(journal);
++ sleep_on(&journal->j_wait_transaction_locked);
++ goto repeat;
++ }
++
++ if (!journal->j_running_transaction)
++ get_transaction(journal, 0);
++ /* @@@ Error? */
++ J_ASSERT(journal->j_running_transaction);
++
++ transaction = journal->j_running_transaction;
++
++ /* If the current transaction is locked down for commit, wait
++ * for the lock to be released. */
++
++ if (transaction->t_state == T_LOCKED) {
++ unlock_journal(journal);
++ jbd_debug(3, "Handle %p stalling...\n", handle);
++ sleep_on(&journal->j_wait_transaction_locked);
++ goto repeat;
++ }
++
++ /* If there is not enough space left in the log to write all
++ * potential buffers requested by this operation, we need to
++ * stall pending a log checkpoint to free some more log
++ * space. */
++
++ needed = transaction->t_outstanding_credits + nblocks;
++
++ if (needed > journal->j_max_transaction_buffers) {
++ /* If the current transaction is already too large, then
++ * start to commit it: we can then go back and attach
++ * this handle to a new transaction. */
++
++ jbd_debug(2, "Handle %p starting new commit...\n", handle);
++ log_start_commit(journal, transaction);
++ unlock_journal(journal);
++ sleep_on(&journal->j_wait_transaction_locked);
++ lock_journal(journal);
++ goto repeat_locked;
++ }
++
++ /*
++ * The commit code assumes that it can get enough log space
++ * without forcing a checkpoint. This is *critical* for
++ * correctness: a checkpoint of a buffer which is also
++ * associated with a committing transaction creates a deadlock,
++ * so commit simply cannot force through checkpoints.
++ *
++ * We must therefore ensure the necessary space in the journal
++ * *before* starting to dirty potentially checkpointed buffers
++ * in the new transaction.
++ *
++ * The worst part is, any transaction currently committing can
++ * reduce the free space arbitrarily. Be careful to account for
++ * those buffers when checkpointing.
++ */
++
++ /*
++ * @@@ AKPM: This seems rather over-defensive. We're giving commit
++ * a _lot_ of headroom: 1/4 of the journal plus the size of
++ * the committing transaction. Really, we only need to give it
++ * committing_transaction->t_outstanding_credits plus "enough" for
++ * the log control blocks.
++	 * Also, this test is inconsistent with the matching one in
++ * journal_extend().
++ */
++ needed = journal->j_max_transaction_buffers;
++ if (journal->j_committing_transaction)
++ needed += journal->j_committing_transaction->
++ t_outstanding_credits;
++
++ if (log_space_left(journal) < needed) {
++ jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
++ log_wait_for_space(journal, needed);
++ goto repeat_locked;
++ }
++
++ /* OK, account for the buffers that this operation expects to
++ * use and add the handle to the running transaction. */
++
++ handle->h_transaction = transaction;
++ transaction->t_outstanding_credits += nblocks;
++ transaction->t_updates++;
++ transaction->t_handle_count++;
++ jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
++ handle, nblocks, transaction->t_outstanding_credits,
++ log_space_left(journal));
++
++ unlock_journal(journal);
++
++ return 0;
++}
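++
++/*
++ * Worked example (editorial note): with j_max_transaction_buffers of
++ * 100, a running transaction holding 80 outstanding credits and a new
++ * handle asking for 30, needed == 110 exceeds the limit, so the code
++ * above commits the current transaction and retries against the next
++ * one rather than overfilling the log.
++ */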
++
++/*
++ * Obtain a new handle.
++ *
++ * We make sure that the transaction can guarantee at least nblocks of
++ * modified buffers in the log. We block until the log can guarantee
++ * that much space.
++ *
++ * This function is visible to journal users (like ext2fs), so is not
++ * called with the journal already locked.
++ *
++ * Return a pointer to a newly allocated handle, or an ERR_PTR() value on failure
++ */
++
++handle_t *journal_start(journal_t *journal, int nblocks)
++{
++ handle_t *handle = journal_current_handle();
++ int err;
++
++ if (!journal)
++ return ERR_PTR(-EROFS);
++
++ if (handle) {
++ J_ASSERT(handle->h_transaction->t_journal == journal);
++ handle->h_ref++;
++ return handle;
++ }
++
++ handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS);
++ if (!handle)
++ return ERR_PTR(-ENOMEM);
++ memset (handle, 0, sizeof (handle_t));
++
++ handle->h_buffer_credits = nblocks;
++ handle->h_ref = 1;
++ current->journal_info = handle;
++
++ err = start_this_handle(journal, handle);
++ if (err < 0) {
++ kfree(handle);
++ current->journal_info = NULL;
++ return ERR_PTR(err);
++ }
++
++ return handle;
++}
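++
++/*
++ * Editorial usage sketch (hypothetical caller, not part of this
++ * patch):
++ *
++ *	handle_t *handle = journal_start(journal, 3);
++ *	if (IS_ERR(handle))
++ *		return PTR_ERR(handle);
++ *	err = journal_get_write_access(handle, bh);
++ *	... modify bh ...
++ *	journal_dirty_metadata(handle, bh);
++ *	journal_stop(handle);
++ */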
++
++/*
++ * Return zero on success
++ */
++static int try_start_this_handle(journal_t *journal, handle_t *handle)
++{
++ transaction_t *transaction;
++ int needed;
++ int nblocks = handle->h_buffer_credits;
++ int ret = 0;
++
++ jbd_debug(3, "New handle %p maybe going live.\n", handle);
++
++ lock_journal(journal);
++
++ if (is_journal_aborted(journal) ||
++ (journal->j_errno != 0 && !(journal->j_flags & JFS_ACK_ERR))) {
++ ret = -EROFS;
++ goto fail_unlock;
++ }
++
++ if (journal->j_barrier_count)
++ goto fail_unlock;
++
++ if (!journal->j_running_transaction && get_transaction(journal, 1) == 0)
++ goto fail_unlock;
++
++ transaction = journal->j_running_transaction;
++ if (transaction->t_state == T_LOCKED)
++ goto fail_unlock;
++
++ needed = transaction->t_outstanding_credits + nblocks;
++ /* We could run log_start_commit here */
++ if (needed > journal->j_max_transaction_buffers)
++ goto fail_unlock;
++
++ needed = journal->j_max_transaction_buffers;
++ if (journal->j_committing_transaction)
++ needed += journal->j_committing_transaction->
++ t_outstanding_credits;
++
++ if (log_space_left(journal) < needed)
++ goto fail_unlock;
++
++ handle->h_transaction = transaction;
++ transaction->t_outstanding_credits += nblocks;
++ transaction->t_updates++;
++ jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
++ handle, nblocks, transaction->t_outstanding_credits,
++ log_space_left(journal));
++ unlock_journal(journal);
++ return 0;
++
++fail_unlock:
++ unlock_journal(journal);
++ if (ret >= 0)
++ ret = -1;
++ return ret;
++}
++
++/*
++ * Try to start a handle, but non-blockingly. If we weren't able
++ * to, return an ERR_PTR value.
++ */
++handle_t *journal_try_start(journal_t *journal, int nblocks)
++{
++ handle_t *handle = journal_current_handle();
++ int err;
++
++ if (!journal)
++ return ERR_PTR(-EROFS);
++
++ if (handle) {
++ jbd_debug(4, "h_ref %d -> %d\n",
++ handle->h_ref,
++ handle->h_ref + 1);
++ J_ASSERT(handle->h_transaction->t_journal == journal);
++ if (is_handle_aborted(handle))
++ return ERR_PTR(-EIO);
++ handle->h_ref++;
++ return handle;
++ } else {
++ jbd_debug(4, "no current transaction\n");
++ }
++
++ if (is_journal_aborted(journal))
++ return ERR_PTR(-EIO);
++
++ handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS);
++ if (!handle)
++ return ERR_PTR(-ENOMEM);
++ memset (handle, 0, sizeof (handle_t));
++
++ handle->h_buffer_credits = nblocks;
++ handle->h_ref = 1;
++ current->journal_info = handle;
++
++ err = try_start_this_handle(journal, handle);
++ if (err < 0) {
++ kfree(handle);
++ current->journal_info = NULL;
++ return ERR_PTR(err);
++ }
++
++ return handle;
++}
++
++/*
++ * journal_extend: extend buffer credits.
++ *
++ * Some transactions, such as large extends and truncates, can be done
++ * atomically all at once or in several stages. The operation requests
++ * a credit for a number of buffer modifications in advance, but can
++ * extend its credit if it needs more.
++ *
++ * journal_extend tries to give the running handle more buffer credits.
++ * It does not guarantee the allocation: this is best-effort only.
++ * The calling process MUST be able to deal cleanly with a failure to
++ * extend here.
++ *
++ * Return 0 on success, non-zero on failure.
++ *
++ * return code < 0 implies an error
++ * return code > 0 implies normal transaction-full status.
++ */
++
++int journal_extend (handle_t *handle, int nblocks)
++{
++ transaction_t *transaction = handle->h_transaction;
++ journal_t *journal = transaction->t_journal;
++ int result;
++ int wanted;
++
++ lock_journal (journal);
++
++ result = -EIO;
++ if (is_handle_aborted(handle))
++ goto error_out;
++
++ result = 1;
++
++ /* Don't extend a locked-down transaction! */
++ if (handle->h_transaction->t_state != T_RUNNING) {
++ jbd_debug(3, "denied handle %p %d blocks: "
++ "transaction not running\n", handle, nblocks);
++ goto error_out;
++ }
++
++ wanted = transaction->t_outstanding_credits + nblocks;
++
++ if (wanted > journal->j_max_transaction_buffers) {
++ jbd_debug(3, "denied handle %p %d blocks: "
++ "transaction too large\n", handle, nblocks);
++ goto error_out;
++ }
++
++ if (wanted > log_space_left(journal)) {
++ jbd_debug(3, "denied handle %p %d blocks: "
++ "insufficient log space\n", handle, nblocks);
++ goto error_out;
++ }
++
++ handle->h_buffer_credits += nblocks;
++ transaction->t_outstanding_credits += nblocks;
++ result = 0;
++
++ jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
++
++error_out:
++ unlock_journal (journal);
++ return result;
++}
++
++
++/*
++ * journal_restart: restart a handle for a multi-transaction filesystem
++ * operation.
++ *
++ * If the journal_extend() call above fails to grant new buffer credits
++ * to a running handle, a call to journal_restart will commit the
++ * handle's transaction so far and reattach the handle to a new
++ * transaction capable of guaranteeing the requested number of
++ * credits.
++ */
++
++int journal_restart(handle_t *handle, int nblocks)
++{
++ transaction_t *transaction = handle->h_transaction;
++ journal_t *journal = transaction->t_journal;
++ int ret;
++
++ /* If we've had an abort of any type, don't even think about
++ * actually doing the restart! */
++ if (is_handle_aborted(handle))
++ return 0;
++
++ /* First unlink the handle from its current transaction, and
++ * start the commit on that. */
++
++ J_ASSERT (transaction->t_updates > 0);
++ J_ASSERT (journal_current_handle() == handle);
++
++ transaction->t_outstanding_credits -= handle->h_buffer_credits;
++ transaction->t_updates--;
++
++ if (!transaction->t_updates)
++ wake_up(&journal->j_wait_updates);
++
++ jbd_debug(2, "restarting handle %p\n", handle);
++ log_start_commit(journal, transaction);
++
++ handle->h_buffer_credits = nblocks;
++ ret = start_this_handle(journal, handle);
++ return ret;
++}
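++
++/*
++ * Illustrative sketch (not part of the original patch): the canonical
++ * extend-or-restart pattern for a long operation such as truncate,
++ * where `needed' is the caller's own credit estimate:
++ *
++ *	if (journal_extend(handle, needed) != 0) {
++ *		... bring on-disk state to a restartable point ...
++ *		err = journal_restart(handle, needed);
++ *	}
++ */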
++
++
++/*
++ * Barrier operation: establish a transaction barrier.
++ *
++ * This locks out any further updates from being started, and blocks
++ * until all existing updates have completed, returning only once the
++ * journal is in a quiescent state with no updates running.
++ *
++ * The journal lock should not be held on entry.
++ */
++
++void journal_lock_updates (journal_t *journal)
++{
++ lock_journal(journal);
++ ++journal->j_barrier_count;
++
++ /* Wait until there are no running updates */
++ while (1) {
++ transaction_t *transaction = journal->j_running_transaction;
++ if (!transaction)
++ break;
++ if (!transaction->t_updates)
++ break;
++
++ unlock_journal(journal);
++ sleep_on(&journal->j_wait_updates);
++ lock_journal(journal);
++ }
++
++ unlock_journal(journal);
++
++ /* We have now established a barrier against other normal
++ * updates, but we also need to barrier against other
++ * journal_lock_updates() calls to make sure that we serialise
++ * special journal-locked operations too. */
++ down(&journal->j_barrier);
++}
++
++/*
++ * Release a transaction barrier obtained with journal_lock_updates().
++ *
++ * Should be called without the journal lock held.
++ */
++
++void journal_unlock_updates (journal_t *journal)
++{
++ lock_journal(journal);
++
++ J_ASSERT (journal->j_barrier_count != 0);
++
++ up(&journal->j_barrier);
++ --journal->j_barrier_count;
++ wake_up(&journal->j_wait_transaction_locked);
++ unlock_journal(journal);
++}
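++
++/*
++ * Illustrative sketch (not part of the original patch): the barrier pair
++ * quiesces the journal around an operation which must see no running
++ * updates, such as a full journal flush:
++ *
++ *	journal_lock_updates(journal);
++ *	... journal is quiescent: flush or checkpoint here ...
++ *	journal_unlock_updates(journal);
++ */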
++
++/*
++ * journal_get_write_access: notify intent to modify a buffer for metadata
++ * (not data) update.
++ *
++ * If the buffer is already part of the current transaction, then there
++ * is nothing we need to do. If it is already part of a prior
++ * transaction which we are still committing to disk, then we need to
++ * make sure that we do not overwrite the old copy: we do copy-out to
++ * preserve the copy going to disk. We also account the buffer against
++ * the handle's metadata buffer credits (unless the buffer is already
++ * part of the transaction, that is).
++ *
++ * Returns an error code or 0 on success.
++ *
++ * In full data journalling mode the buffer may be of type BJ_AsyncData,
++ * because we're write()ing a buffer which is also part of a shared mapping.
++ */
++
++static int
++do_get_write_access(handle_t *handle, struct journal_head *jh, int force_copy)
++{
++ transaction_t *transaction = handle->h_transaction;
++ journal_t *journal = transaction->t_journal;
++ int error;
++ char *frozen_buffer = NULL;
++ int need_copy = 0;
++
++ jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy);
++
++ JBUFFER_TRACE(jh, "entry");
++repeat:
++ /* @@@ Need to check for errors here at some point. */
++
++ /*
++ * AKPM: neither bdflush nor kupdate run with the BKL. There's
++ * nothing we can do to prevent them from starting writeout of a
++ * BUF_DIRTY buffer at any time. And checkpointing buffers are on
++ * BUF_DIRTY. So. We no longer assert that the buffer is unlocked.
++ *
++ * However. It is very wrong for us to allow ext3 to start directly
++ * altering the ->b_data of buffers which may at that very time be
++ * undergoing writeout to the client filesystem. This can leave
++ * the filesystem in an inconsistent, transient state if we crash.
++ * So what we do is to steal the buffer if it is in checkpoint
++ * mode and dirty. The journal lock will keep out checkpoint-mode
++ * state transitions within journal_remove_checkpoint() and the buffer
++ * is locked to keep bdflush/kupdate/whoever away from it as well.
++ *
++ * AKPM: we have replaced all the lock_journal_bh_wait() stuff with a
++ * simple lock_journal(). This code here will care for locked buffers.
++ */
++ /*
++ * The buffer_locked() || buffer_dirty() tests here are simply an
++ * optimisation tweak. If anyone else in the system decides to
++ * lock this buffer later on, we'll blow up. There doesn't seem
++ * to be a good reason why they should do this.
++ */
++ if (jh->b_cp_transaction &&
++ (buffer_locked(jh2bh(jh)) || buffer_dirty(jh2bh(jh)))) {
++ unlock_journal(journal);
++ lock_buffer(jh2bh(jh));
++ spin_lock(&journal_datalist_lock);
++ if (jh->b_cp_transaction && buffer_dirty(jh2bh(jh))) {
++ /* OK, we need to steal it */
++ JBUFFER_TRACE(jh, "stealing from checkpoint mode");
++ J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
++ J_ASSERT_JH(jh, jh->b_frozen_data == NULL);
++
++ J_ASSERT(handle->h_buffer_credits > 0);
++ handle->h_buffer_credits--;
++
++ /* This will clear BH_Dirty and set BH_JBDDirty. */
++ JBUFFER_TRACE(jh, "file as BJ_Reserved");
++ __journal_file_buffer(jh, transaction, BJ_Reserved);
++
++ /* And pull it off BUF_DIRTY, onto BUF_CLEAN */
++ refile_buffer(jh2bh(jh));
++
++ /*
++ * The buffer is now hidden from bdflush. It is
++ * metadata against the current transaction.
++ */
++ JBUFFER_TRACE(jh, "steal from cp mode is complete");
++ }
++ spin_unlock(&journal_datalist_lock);
++ unlock_buffer(jh2bh(jh));
++ lock_journal(journal);
++ goto repeat;
++ }
++
++ J_ASSERT_JH(jh, !buffer_locked(jh2bh(jh)));
++
++ error = -EROFS;
++ if (is_handle_aborted(handle))
++ goto out_unlocked;
++ error = 0;
++
++ spin_lock(&journal_datalist_lock);
++
++ /* The buffer is already part of this transaction if
++ * b_transaction or b_next_transaction points to it. */
++
++ if (jh->b_transaction == transaction ||
++ jh->b_next_transaction == transaction)
++ goto done_locked;
++
++ /* If there is already a copy-out version of this buffer, then
++ * we don't need to make another one. */
++
++ if (jh->b_frozen_data) {
++ JBUFFER_TRACE(jh, "has frozen data");
++ J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
++ jh->b_next_transaction = transaction;
++
++ J_ASSERT_JH(jh, handle->h_buffer_credits > 0);
++ handle->h_buffer_credits--;
++ goto done_locked;
++ }
++
++ /* Is there data here we need to preserve? */
++
++ if (jh->b_transaction && jh->b_transaction != transaction) {
++ JBUFFER_TRACE(jh, "owned by older transaction");
++ J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
++ J_ASSERT_JH(jh, jh->b_transaction ==
++ journal->j_committing_transaction);
++
++ /* There is one case we have to be very careful about.
++ * If the committing transaction is currently writing
++ * this buffer out to disk and has NOT made a copy-out,
++ * then we cannot modify the buffer contents at all
++ * right now. The essence of copy-out is that it is the
++ * extra copy, not the primary copy, which gets
++ * journaled. If the primary copy is already going to
++ * disk then we cannot do copy-out here. */
++
++ if (jh->b_jlist == BJ_Shadow) {
++ JBUFFER_TRACE(jh, "on shadow: sleep");
++ spin_unlock(&journal_datalist_lock);
++ unlock_journal(journal);
++ /* commit wakes up all shadow buffers after IO */
++ sleep_on(&jh2bh(jh)->b_wait);
++ lock_journal(journal);
++ goto repeat;
++ }
++
++ /* Only do the copy if the currently-owning transaction
++ * still needs it. If it is on the Forget list, the
++ * committing transaction is past that stage. The
++ * buffer had better remain locked during the kmalloc,
++ * but that should be true --- we hold the journal lock
++ * still and the buffer is already on the BUF_JOURNAL
++ * list so won't be flushed.
++ *
++ * Subtle point, though: if this is a get_undo_access,
++ * then we will be relying on the frozen_data to contain
++ * the new value of the committed_data record after the
++ * transaction, so we HAVE to force the frozen_data copy
++ * in that case. */
++
++ if (jh->b_jlist != BJ_Forget || force_copy) {
++ JBUFFER_TRACE(jh, "generate frozen data");
++ if (!frozen_buffer) {
++ JBUFFER_TRACE(jh, "allocate memory for buffer");
++ spin_unlock(&journal_datalist_lock);
++ unlock_journal(journal);
++ frozen_buffer = jbd_kmalloc(jh2bh(jh)->b_size,
++ GFP_NOFS);
++ lock_journal(journal);
++ if (!frozen_buffer) {
++					printk(KERN_EMERG __FUNCTION__
++					       ": OOM for frozen_buffer\n");
++ JBUFFER_TRACE(jh, "oom!");
++ error = -ENOMEM;
++ spin_lock(&journal_datalist_lock);
++ goto done_locked;
++ }
++ goto repeat;
++ }
++
++ jh->b_frozen_data = frozen_buffer;
++ frozen_buffer = NULL;
++ need_copy = 1;
++ }
++ jh->b_next_transaction = transaction;
++ }
++
++ J_ASSERT(handle->h_buffer_credits > 0);
++ handle->h_buffer_credits--;
++
++ /* Finally, if the buffer is not journaled right now, we need to
++ * make sure it doesn't get written to disk before the caller
++ * actually commits the new data. */
++
++ if (!jh->b_transaction) {
++ JBUFFER_TRACE(jh, "no transaction");
++ J_ASSERT_JH(jh, !jh->b_next_transaction);
++ jh->b_transaction = transaction;
++ JBUFFER_TRACE(jh, "file as BJ_Reserved");
++ __journal_file_buffer(jh, transaction, BJ_Reserved);
++ }
++
++done_locked:
++ spin_unlock(&journal_datalist_lock);
++ if (need_copy) {
++ struct page *page;
++ int offset;
++ char *source;
++
++ J_ASSERT_JH(jh, buffer_uptodate(jh2bh(jh)));
++ page = jh2bh(jh)->b_page;
++ offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK;
++ source = kmap(page);
++ memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
++ kunmap(page);
++ }
++
++
++ /* If we are about to journal a buffer, then any revoke pending
++ on it is no longer valid. */
++ journal_cancel_revoke(handle, jh);
++
++out_unlocked:
++ if (frozen_buffer)
++ kfree(frozen_buffer);
++
++ JBUFFER_TRACE(jh, "exit");
++ return error;
++}
++
++int journal_get_write_access (handle_t *handle, struct buffer_head *bh)
++{
++ transaction_t *transaction = handle->h_transaction;
++ journal_t *journal = transaction->t_journal;
++ struct journal_head *jh = journal_add_journal_head(bh);
++ int rc;
++
++ /* We do not want to get caught playing with fields which the
++ * log thread also manipulates. Make sure that the buffer
++ * completes any outstanding IO before proceeding. */
++ lock_journal(journal);
++ rc = do_get_write_access(handle, jh, 0);
++ journal_unlock_journal_head(jh);
++ unlock_journal(journal);
++ return rc;
++}
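++
++/*
++ * Illustrative sketch (not part of the original patch): the standard
++ * metadata update sequence a client filesystem follows, using the call
++ * above together with journal_dirty_metadata() below:
++ *
++ *	err = journal_get_write_access(handle, bh);
++ *	if (err)
++ *		goto out;
++ *	... modify bh->b_data ...
++ *	err = journal_dirty_metadata(handle, bh);
++ */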
++
++
++/*
++ * When the user wants to journal a newly created buffer_head
++ * (ie. getblk() returned a new buffer and we are going to populate it
++ * manually rather than reading off disk), then we need to keep the
++ * buffer_head locked until it has been completely filled with new
++ * data. In this case, we should be able to make the assertion that
++ * the bh is not already part of an existing transaction.
++ *
++ * The buffer should already be locked by the caller by this point.
++ * There is no lock ranking violation: it was a newly created,
++ * unlocked buffer beforehand. */
++
++int journal_get_create_access (handle_t *handle, struct buffer_head *bh)
++{
++ transaction_t *transaction = handle->h_transaction;
++ journal_t *journal = transaction->t_journal;
++ struct journal_head *jh = journal_add_journal_head(bh);
++ int err;
++
++ jbd_debug(5, "journal_head %p\n", jh);
++ lock_journal(journal);
++ err = -EROFS;
++ if (is_handle_aborted(handle))
++ goto out;
++ err = 0;
++
++ JBUFFER_TRACE(jh, "entry");
++ /* The buffer may already belong to this transaction due to
++ * pre-zeroing in the filesystem's new_block code. It may also
++ * be on the previous, committing transaction's lists, but it
++ * HAS to be in Forget state in that case: the transaction must
++ * have deleted the buffer for it to be reused here. */
++ J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
++ jh->b_transaction == NULL ||
++ (jh->b_transaction == journal->j_committing_transaction &&
++ jh->b_jlist == BJ_Forget)));
++
++ J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
++ J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
++
++ J_ASSERT_JH(jh, handle->h_buffer_credits > 0);
++ handle->h_buffer_credits--;
++
++ spin_lock(&journal_datalist_lock);
++ if (jh->b_transaction == NULL) {
++ jh->b_transaction = transaction;
++ JBUFFER_TRACE(jh, "file as BJ_Reserved");
++ __journal_file_buffer(jh, transaction, BJ_Reserved);
++ JBUFFER_TRACE(jh, "refile");
++ refile_buffer(jh2bh(jh));
++ } else if (jh->b_transaction == journal->j_committing_transaction) {
++ JBUFFER_TRACE(jh, "set next transaction");
++ jh->b_next_transaction = transaction;
++ }
++ spin_unlock(&journal_datalist_lock);
++
++ /*
++ * akpm: I added this. ext3_alloc_branch can pick up new indirect
++ * blocks which contain freed but then revoked metadata. We need
++ * to cancel the revoke in case we end up freeing it yet again
++	 * and then reallocating it as data - this would cause a second revoke,
++ * which hits an assertion error.
++ */
++ JBUFFER_TRACE(jh, "cancelling revoke");
++ journal_cancel_revoke(handle, jh);
++ journal_unlock_journal_head(jh);
++out:
++ unlock_journal(journal);
++ return err;
++}
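++
++/*
++ * Illustrative sketch (not part of the original patch): journaling a
++ * freshly allocated block, e.g. a new indirect block.  The caller holds
++ * the buffer lock across the fill, as required above:
++ *
++ *	bh = getblk(dev, blocknr, blocksize);
++ *	lock_buffer(bh);
++ *	err = journal_get_create_access(handle, bh);
++ *	memset(bh->b_data, 0, bh->b_size);
++ *	mark_buffer_uptodate(bh, 1);
++ *	unlock_buffer(bh);
++ *	err = journal_dirty_metadata(handle, bh);
++ */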
++
++
++
++/*
++ * journal_get_undo_access: Notify intent to modify metadata with non-
++ * rewindable consequences
++ *
++ * Sometimes there is a need to distinguish between metadata which has
++ * been committed to disk and that which has not. The ext3fs code uses
++ * this for freeing and allocating space: we have to make sure that we
++ * do not reuse freed space until the deallocation has been committed,
++ * since if we overwrote that space we would make the delete
++ * un-rewindable in case of a crash.
++ *
++ * To deal with that, journal_get_undo_access requests write access to a
++ * buffer for parts of non-rewindable operations such as delete
++ * operations on the bitmaps. The journaling code must keep a copy of
++ * the buffer's contents prior to the undo_access call until such time
++ * as we know that the buffer has definitely been committed to disk.
++ *
++ * We never need to know which transaction the committed data is part
++ * of: buffers touched here are guaranteed to be dirtied later and so
++ * will be committed to a new transaction in due course, at which point
++ * we can discard the old committed data pointer.
++ *
++ * Returns error number or 0 on success.
++ */
++
++int journal_get_undo_access (handle_t *handle, struct buffer_head *bh)
++{
++ journal_t *journal = handle->h_transaction->t_journal;
++ int err;
++ struct journal_head *jh = journal_add_journal_head(bh);
++
++ JBUFFER_TRACE(jh, "entry");
++ lock_journal(journal);
++
++ /* Do this first --- it can drop the journal lock, so we want to
++ * make sure that obtaining the committed_data is done
++ * atomically wrt. completion of any outstanding commits. */
++ err = do_get_write_access (handle, jh, 1);
++ if (err)
++ goto out;
++
++ if (!jh->b_committed_data) {
++ /* Copy out the current buffer contents into the
++ * preserved, committed copy. */
++ JBUFFER_TRACE(jh, "generate b_committed data");
++ jh->b_committed_data = jbd_kmalloc(jh2bh(jh)->b_size,
++ GFP_NOFS);
++ if (!jh->b_committed_data) {
++ printk(KERN_EMERG __FUNCTION__
++ ": No memory for committed data!\n");
++ err = -ENOMEM;
++ goto out;
++ }
++
++ memcpy (jh->b_committed_data, jh2bh(jh)->b_data,
++ jh2bh(jh)->b_size);
++ }
++
++out:
++ if (!err)
++ J_ASSERT_JH(jh, jh->b_committed_data);
++ journal_unlock_journal_head(jh);
++ unlock_journal(journal);
++ return err;
++}
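++
++/*
++ * Illustrative sketch (not part of the original patch): freeing bits in
++ * a block bitmap, where the pre-commit image must survive until the
++ * deallocation commits:
++ *
++ *	err = journal_get_undo_access(handle, bitmap_bh);
++ *	... clear bits in bitmap_bh->b_data; the allocator consults
++ *	    jh->b_committed_data so the blocks are not reused early ...
++ *	err = journal_dirty_metadata(handle, bitmap_bh);
++ */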
++
++/*
++ * journal_dirty_data: mark a buffer as containing dirty data which
++ * needs to be flushed before we can commit the current transaction.
++ *
++ * The buffer is placed on the transaction's data list and is marked as
++ * belonging to the transaction.
++ *
++ * If `async' is set then the writeback will be initiated by the caller
++ * using submit_bh -> end_buffer_io_async. We put the buffer onto
++ * t_async_datalist.
++ *
++ * Returns error number or 0 on success.
++ *
++ * journal_dirty_data() can be called via page_launder->ext3_writepage
++ * by kswapd. So it cannot block. Happily, there's nothing here
++ * which needs lock_journal if `async' is set.
++ *
++ * When the buffer is on the current transaction we freely move it
++ * between BJ_AsyncData and BJ_SyncData according to who tried to
++ * change its state last.
++ */
++
++int journal_dirty_data (handle_t *handle, struct buffer_head *bh, int async)
++{
++ journal_t *journal = handle->h_transaction->t_journal;
++ int need_brelse = 0;
++ int wanted_jlist = async ? BJ_AsyncData : BJ_SyncData;
++ struct journal_head *jh;
++
++ if (is_handle_aborted(handle))
++ return 0;
++
++ jh = journal_add_journal_head(bh);
++ JBUFFER_TRACE(jh, "entry");
++
++ /*
++ * The buffer could *already* be dirty. Writeout can start
++ * at any time.
++ */
++ jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid);
++
++ /*
++ * What if the buffer is already part of a running transaction?
++ *
++ * There are two cases:
++ * 1) It is part of the current running transaction. Refile it,
++ * just in case we have allocated it as metadata, deallocated
++ * it, then reallocated it as data.
++ * 2) It is part of the previous, still-committing transaction.
++ * If all we want to do is to guarantee that the buffer will be
++ * written to disk before this new transaction commits, then
++ * being sure that the *previous* transaction has this same
++ * property is sufficient for us! Just leave it on its old
++ * transaction.
++ *
++ * In case (2), the buffer must not already exist as metadata
++ * --- that would violate write ordering (a transaction is free
++ * to write its data at any point, even before the previous
++ * committing transaction has committed). The caller must
++ * never, ever allow this to happen: there's nothing we can do
++ * about it in this layer.
++ */
++ spin_lock(&journal_datalist_lock);
++ if (jh->b_transaction) {
++ JBUFFER_TRACE(jh, "has transaction");
++ if (jh->b_transaction != handle->h_transaction) {
++ JBUFFER_TRACE(jh, "belongs to older transaction");
++ J_ASSERT_JH(jh, jh->b_transaction ==
++ journal->j_committing_transaction);
++
++ /* @@@ IS THIS TRUE ? */
++ /*
++ * Not any more. Scenario: someone does a write()
++ * in data=journal mode. The buffer's transaction has
++ * moved into commit. Then someone does another
++ * write() to the file. We do the frozen data copyout
++ * and set b_next_transaction to point to j_running_t.
++ * And while we're in that state, someone does a
++ * writepage() in an attempt to pageout the same area
++ * of the file via a shared mapping. At present that
++ * calls journal_dirty_data(), and we get right here.
++ * It may be too late to journal the data. Simply
++ * falling through to the next test will suffice: the
++			 * data will be dirty and will be checkpointed. The
++ * ordering comments in the next comment block still
++ * apply.
++ */
++ //J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
++
++ /*
++ * If we're journalling data, and this buffer was
++ * subject to a write(), it could be metadata, forget
++ * or shadow against the committing transaction. Now,
++ * someone has dirtied the same darn page via a mapping
++ * and it is being writepage()'d.
++ * We *could* just steal the page from commit, with some
++ * fancy locking there. Instead, we just skip it -
++ * don't tie the page's buffers to the new transaction
++ * at all.
++ * Implication: if we crash before the writepage() data
++ * is written into the filesystem, recovery will replay
++ * the write() data.
++ */
++ if (jh->b_jlist != BJ_None &&
++ jh->b_jlist != BJ_SyncData &&
++ jh->b_jlist != BJ_AsyncData) {
++ JBUFFER_TRACE(jh, "Not stealing");
++ goto no_journal;
++ }
++
++ /*
++ * This buffer may be undergoing writeout in commit. We
++ * can't return from here and let the caller dirty it
++ * again because that can cause the write-out loop in
++ * commit to never terminate.
++ */
++ if (!async && buffer_dirty(bh)) {
++ atomic_inc(&bh->b_count);
++ spin_unlock(&journal_datalist_lock);
++ need_brelse = 1;
++ ll_rw_block(WRITE, 1, &bh);
++ wait_on_buffer(bh);
++ spin_lock(&journal_datalist_lock);
++ /* The buffer may become locked again at any
++ time if it is redirtied */
++ }
++
++ /* journal_clean_data_list() may have got there first */
++ if (jh->b_transaction != NULL) {
++ JBUFFER_TRACE(jh, "unfile from commit");
++ __journal_unfile_buffer(jh);
++ jh->b_transaction = NULL;
++ }
++ /* The buffer will be refiled below */
++
++ }
++ /*
++ * Special case --- the buffer might actually have been
++ * allocated and then immediately deallocated in the previous,
++ * committing transaction, so might still be left on that
++ * transaction's metadata lists.
++ */
++ if (jh->b_jlist != wanted_jlist) {
++ JBUFFER_TRACE(jh, "not on correct data list: unfile");
++ J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
++ __journal_unfile_buffer(jh);
++ jh->b_transaction = NULL;
++ JBUFFER_TRACE(jh, "file as data");
++ __journal_file_buffer(jh, handle->h_transaction,
++ wanted_jlist);
++ }
++ } else {
++ JBUFFER_TRACE(jh, "not on a transaction");
++ __journal_file_buffer(jh, handle->h_transaction, wanted_jlist);
++ }
++no_journal:
++ spin_unlock(&journal_datalist_lock);
++ if (need_brelse) {
++ BUFFER_TRACE(bh, "brelse");
++ __brelse(bh);
++ }
++ JBUFFER_TRACE(jh, "exit");
++ journal_unlock_journal_head(jh);
++ return 0;
++}
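++
++/*
++ * Illustrative sketch (not part of the original patch): ordered-mode
++ * data is filed synchronously from the write() path and asynchronously
++ * from writepage():
++ *
++ *	err = journal_dirty_data(handle, bh, 0);	(write path)
++ *	err = journal_dirty_data(handle, bh, 1);	(writepage path)
++ */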
++
++/*
++ * journal_dirty_metadata: mark a buffer as containing dirty metadata
++ * which needs to be journaled as part of the current transaction.
++ *
++ * The buffer is placed on the transaction's metadata list and is marked
++ * as belonging to the transaction.
++ *
++ * Special care needs to be taken if the buffer already belongs to the
++ * current committing transaction (in which case we should have frozen
++ * data present for that commit). In that case, we don't relink the
++ * buffer: that only gets done when the old transaction finally
++ * completes its commit.
++ *
++ * Returns error number or 0 on success.
++ */
++
++int journal_dirty_metadata (handle_t *handle, struct buffer_head *bh)
++{
++ transaction_t *transaction = handle->h_transaction;
++ journal_t *journal = transaction->t_journal;
++ struct journal_head *jh = bh2jh(bh);
++
++ jbd_debug(5, "journal_head %p\n", jh);
++ JBUFFER_TRACE(jh, "entry");
++ lock_journal(journal);
++ if (is_handle_aborted(handle))
++ goto out_unlock;
++
++ spin_lock(&journal_datalist_lock);
++ set_bit(BH_JBDDirty, &bh->b_state);
++ set_buffer_flushtime(bh);
++
++ J_ASSERT_JH(jh, jh->b_transaction != NULL);
++
++ /*
++ * Metadata already on the current transaction list doesn't
++ * need to be filed. Metadata on another transaction's list must
++ * be committing, and will be refiled once the commit completes:
++ * leave it alone for now.
++ */
++
++ if (jh->b_transaction != transaction) {
++ JBUFFER_TRACE(jh, "already on other transaction");
++ J_ASSERT_JH(jh, jh->b_transaction ==
++ journal->j_committing_transaction);
++ J_ASSERT_JH(jh, jh->b_next_transaction == transaction);
++ /* And this case is illegal: we can't reuse another
++ * transaction's data buffer, ever. */
++ /* FIXME: writepage() should be journalled */
++ J_ASSERT_JH(jh, jh->b_jlist != BJ_SyncData);
++ goto done_locked;
++ }
++
++ /* That test should have eliminated the following case: */
++ J_ASSERT_JH(jh, jh->b_frozen_data == 0);
++
++ JBUFFER_TRACE(jh, "file as BJ_Metadata");
++ __journal_file_buffer(jh, handle->h_transaction, BJ_Metadata);
++
++done_locked:
++ spin_unlock(&journal_datalist_lock);
++ JBUFFER_TRACE(jh, "exit");
++out_unlock:
++ unlock_journal(journal);
++ return 0;
++}
++
++#if 0
++/*
++ * journal_release_buffer: undo a get_write_access without any buffer
++ * updates, if the update decided in the end that it didn't need access.
++ *
++ * journal_get_write_access() can block, so it is quite possible for a
++ * journaling component to decide after the write access is returned
++ * that global state has changed and the update is no longer required. */
++
++void journal_release_buffer (handle_t *handle, struct buffer_head *bh)
++{
++ transaction_t *transaction = handle->h_transaction;
++ journal_t *journal = transaction->t_journal;
++ struct journal_head *jh = bh2jh(bh);
++
++ lock_journal(journal);
++ JBUFFER_TRACE(jh, "entry");
++
++ /* If the buffer is reserved but not modified by this
++ * transaction, then it is safe to release it. In all other
++ * cases, just leave the buffer as it is. */
++
++ spin_lock(&journal_datalist_lock);
++ if (jh->b_jlist == BJ_Reserved && jh->b_transaction == transaction &&
++ !buffer_jdirty(jh2bh(jh))) {
++ JBUFFER_TRACE(jh, "unused: refiling it");
++ handle->h_buffer_credits++;
++ __journal_refile_buffer(jh);
++ }
++ spin_unlock(&journal_datalist_lock);
++
++ JBUFFER_TRACE(jh, "exit");
++ unlock_journal(journal);
++}
++#endif
++
++/*
++ * journal_forget: bforget() for potentially-journaled buffers. We can
++ * only do the bforget if there are no commits pending against the
++ * buffer. If the buffer is dirty in the current running transaction we
++ * can safely unlink it.
++ *
++ * bh may not be a journalled buffer at all - it may be a non-JBD
++ * buffer which came off the hashtable. Check for this.
++ *
++ * Decrements bh->b_count by one.
++ *
++ * Allow this call even if the handle has aborted --- it may be part of
++ * the caller's cleanup after an abort.
++ */
++
++void journal_forget (handle_t *handle, struct buffer_head *bh)
++{
++ transaction_t *transaction = handle->h_transaction;
++ journal_t *journal = transaction->t_journal;
++ struct journal_head *jh;
++
++ BUFFER_TRACE(bh, "entry");
++
++ lock_journal(journal);
++ spin_lock(&journal_datalist_lock);
++
++ if (!buffer_jbd(bh))
++ goto not_jbd;
++ jh = bh2jh(bh);
++
++ if (jh->b_transaction == handle->h_transaction) {
++ J_ASSERT_JH(jh, !jh->b_frozen_data);
++
++ /* If we are forgetting a buffer which is already part
++ * of this transaction, then we can just drop it from
++ * the transaction immediately. */
++ clear_bit(BH_Dirty, &bh->b_state);
++ clear_bit(BH_JBDDirty, &bh->b_state);
++
++ JBUFFER_TRACE(jh, "belongs to current transaction: unfile");
++ J_ASSERT_JH(jh, !jh->b_committed_data);
++
++ __journal_unfile_buffer(jh);
++ jh->b_transaction = 0;
++
++ /*
++ * We are no longer going to journal this buffer.
++ * However, the commit of this transaction is still
++ * important to the buffer: the delete that we are now
++ * processing might obsolete an old log entry, so by
++ * committing, we can satisfy the buffer's checkpoint.
++ *
++ * So, if we have a checkpoint on the buffer, we should
++ * now refile the buffer on our BJ_Forget list so that
++ * we know to remove the checkpoint after we commit.
++ */
++
++ if (jh->b_cp_transaction) {
++ __journal_file_buffer(jh, transaction, BJ_Forget);
++ } else {
++ __journal_remove_journal_head(bh);
++ __brelse(bh);
++ if (!buffer_jbd(bh)) {
++ spin_unlock(&journal_datalist_lock);
++ unlock_journal(journal);
++ __bforget(bh);
++ return;
++ }
++ }
++
++ } else if (jh->b_transaction) {
++ J_ASSERT_JH(jh, (jh->b_transaction ==
++ journal->j_committing_transaction));
++ /* However, if the buffer is still owned by a prior
++ * (committing) transaction, we can't drop it yet... */
++ JBUFFER_TRACE(jh, "belongs to older transaction");
++ /* ... but we CAN drop it from the new transaction if we
++ * have also modified it since the original commit. */
++
++ if (jh->b_next_transaction) {
++ J_ASSERT(jh->b_next_transaction == transaction);
++ jh->b_next_transaction = NULL;
++ }
++ }
++
++not_jbd:
++ spin_unlock(&journal_datalist_lock);
++ unlock_journal(journal);
++ __brelse(bh);
++ return;
++}
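++
++/*
++ * Illustrative sketch (not part of the original patch): when deleting a
++ * metadata block, the client filesystem hands its buffer reference to
++ * the journal rather than calling bforget() itself:
++ *
++ *	... bh holds a reference obtained from getblk()/bread() ...
++ *	journal_forget(handle, bh);	(the reference is dropped here)
++ */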
++
++#if 0 /* Unused */
++/*
++ * journal_sync_buffer: flush a potentially-journaled buffer to disk.
++ *
++ * Used for O_SYNC filesystem operations. If the buffer is journaled,
++ * we need to complete the O_SYNC by waiting for the transaction to
++ * complete. It is an error to call journal_sync_buffer before
++ * journal_stop!
++ */
++
++void journal_sync_buffer(struct buffer_head *bh)
++{
++ transaction_t *transaction;
++ journal_t *journal;
++ long sequence;
++ struct journal_head *jh;
++
++ /* If the buffer isn't journaled, this is easy: just sync it to
++ * disk. */
++ BUFFER_TRACE(bh, "entry");
++
++ spin_lock(&journal_datalist_lock);
++ if (!buffer_jbd(bh)) {
++ spin_unlock(&journal_datalist_lock);
++ return;
++ }
++ jh = bh2jh(bh);
++ if (jh->b_transaction == NULL) {
++ /* If the buffer has already been journaled, then this
++ * is a noop. */
++ if (jh->b_cp_transaction == NULL) {
++ spin_unlock(&journal_datalist_lock);
++ return;
++ }
++ atomic_inc(&bh->b_count);
++ spin_unlock(&journal_datalist_lock);
++ ll_rw_block (WRITE, 1, &bh);
++ wait_on_buffer(bh);
++ __brelse(bh);
++ goto out;
++ }
++
++ /* Otherwise, just wait until the transaction is synced to disk. */
++ transaction = jh->b_transaction;
++ journal = transaction->t_journal;
++ sequence = transaction->t_tid;
++ spin_unlock(&journal_datalist_lock);
++
++ jbd_debug(2, "requesting commit for jh %p\n", jh);
++ log_start_commit (journal, transaction);
++
++ while (tid_gt(sequence, journal->j_commit_sequence)) {
++ wake_up(&journal->j_wait_done_commit);
++ sleep_on(&journal->j_wait_done_commit);
++ }
++ JBUFFER_TRACE(jh, "exit");
++out:
++ return;
++}
++#endif
++
++/*
++ * All done for a particular handle.
++ *
++ * There is not much action needed here. We just return any remaining
++ * buffer credits to the transaction and remove the handle. The only
++ * complication is that we need to start a commit operation if the
++ * filesystem is marked for synchronous update.
++ *
++ * journal_stop itself will not usually return an error, but it may
++ * do so in unusual circumstances. In particular, expect it to
++ * return -EIO if a journal_abort has been executed since the
++ * transaction began.
++ */
++
++int journal_stop(handle_t *handle)
++{
++ transaction_t *transaction = handle->h_transaction;
++ journal_t *journal = transaction->t_journal;
++ int old_handle_count, err;
++
++ if (!handle)
++ return 0;
++
++ J_ASSERT (transaction->t_updates > 0);
++ J_ASSERT (journal_current_handle() == handle);
++
++ if (is_handle_aborted(handle))
++ err = -EIO;
++ else
++ err = 0;
++
++ if (--handle->h_ref > 0) {
++ jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
++ handle->h_ref);
++ return err;
++ }
++
++ jbd_debug(4, "Handle %p going down\n", handle);
++
++ /*
++ * Implement synchronous transaction batching. If the handle
++ * was synchronous, don't force a commit immediately. Let's
++ * yield and let another thread piggyback onto this transaction.
++ * Keep doing that while new threads continue to arrive.
++ * It doesn't cost much - we're about to run a commit and sleep
++ * on IO anyway. Speeds up many-threaded, many-dir operations
++ * by 30x or more...
++ */
++ if (handle->h_sync) {
++ do {
++ old_handle_count = transaction->t_handle_count;
++ set_current_state(TASK_RUNNING);
++ current->policy |= SCHED_YIELD;
++ schedule();
++ } while (old_handle_count != transaction->t_handle_count);
++ }
++
++ current->journal_info = NULL;
++ transaction->t_outstanding_credits -= handle->h_buffer_credits;
++ transaction->t_updates--;
++ if (!transaction->t_updates) {
++ wake_up(&journal->j_wait_updates);
++ if (journal->j_barrier_count)
++ wake_up(&journal->j_wait_transaction_locked);
++ }
++
++ /*
++ * If the handle is marked SYNC, we need to set another commit
++ * going! We also want to force a commit if the current
++ * transaction is occupying too much of the log, or if the
++ * transaction is too old now.
++ */
++ if (handle->h_sync ||
++ transaction->t_outstanding_credits >
++ journal->j_max_transaction_buffers ||
++ time_after_eq(jiffies, transaction->t_expires)) {
++ /* Do this even for aborted journals: an abort still
++ * completes the commit thread, it just doesn't write
++ * anything to disk. */
++ tid_t tid = transaction->t_tid;
++
++ jbd_debug(2, "transaction too old, requesting commit for "
++ "handle %p\n", handle);
++ /* This is non-blocking */
++ log_start_commit(journal, transaction);
++
++ /*
++ * Special case: JFS_SYNC synchronous updates require us
++ * to wait for the commit to complete.
++ */
++ if (handle->h_sync && !(current->flags & PF_MEMALLOC))
++ log_wait_commit(journal, tid);
++ }
++ kfree(handle);
++ return err;
++}
++
++/*
++ * For synchronous operations: force any uncommitted transactions
++ * to disk. May seem kludgy, but it reuses all the handle batching
++ * code in a very simple manner.
++ */
++int journal_force_commit(journal_t *journal)
++{
++ handle_t *handle;
++ int ret = 0;
++
++ lock_kernel();
++ handle = journal_start(journal, 1);
++ if (IS_ERR(handle)) {
++ ret = PTR_ERR(handle);
++ goto out;
++ }
++ handle->h_sync = 1;
++ journal_stop(handle);
++out:
++ unlock_kernel();
++ return ret;
++}
++
++/*
++ *
++ * List management code snippets: various functions for manipulating the
++ * transaction buffer lists.
++ *
++ */
++
++/*
++ * Append a buffer to a transaction list, given the transaction's list head
++ * pointer.
++ * journal_datalist_lock is held.
++ */
++
++static inline void
++__blist_add_buffer(struct journal_head **list, struct journal_head *jh)
++{
++ if (!*list) {
++ jh->b_tnext = jh->b_tprev = jh;
++ *list = jh;
++ } else {
++ /* Insert at the tail of the list to preserve order */
++ struct journal_head *first = *list, *last = first->b_tprev;
++ jh->b_tprev = last;
++ jh->b_tnext = first;
++ last->b_tnext = first->b_tprev = jh;
++ }
++}
++
++/*
++ * Remove a buffer from a transaction list, given the transaction's list
++ * head pointer.
++ *
++ * Called with journal_datalist_lock held, and the journal may not
++ * be locked.
++ */
++
++static inline void
++__blist_del_buffer(struct journal_head **list, struct journal_head *jh)
++{
++ if (*list == jh) {
++ *list = jh->b_tnext;
++ if (*list == jh)
++ *list = 0;
++ }
++ jh->b_tprev->b_tnext = jh->b_tnext;
++ jh->b_tnext->b_tprev = jh->b_tprev;
++}
++
++/*
++ * Remove a buffer from the appropriate transaction list.
++ *
++ * Note that this function can *change* the value of
++ * bh->b_transaction->t_sync_datalist, t_async_datalist, t_buffers, t_forget,
++ * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller
++ * is holding onto a copy of one of these pointers, it could go bad.
++ * Generally the caller needs to re-read the pointer from the transaction_t.
++ *
++ * If bh->b_jlist is BJ_SyncData or BJ_AsyncData then we may have been called
++ * via journal_try_to_free_buffer() or journal_clean_data_list(). In that
++ * case, journal_datalist_lock will be held, and the journal may not be locked.
++ */
++void __journal_unfile_buffer(struct journal_head *jh)
++{
++ struct journal_head **list = 0;
++ transaction_t * transaction;
++
++ assert_spin_locked(&journal_datalist_lock);
++ transaction = jh->b_transaction;
++
++#ifdef __SMP__
++ J_ASSERT (current->lock_depth >= 0);
++#endif
++ J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
++
++ if (jh->b_jlist != BJ_None)
++ J_ASSERT_JH(jh, transaction != 0);
++
++ switch (jh->b_jlist) {
++ case BJ_None:
++ return;
++ case BJ_SyncData:
++ list = &transaction->t_sync_datalist;
++ break;
++ case BJ_AsyncData:
++ list = &transaction->t_async_datalist;
++ break;
++ case BJ_Metadata:
++ transaction->t_nr_buffers--;
++ J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
++ list = &transaction->t_buffers;
++ break;
++ case BJ_Forget:
++ list = &transaction->t_forget;
++ break;
++ case BJ_IO:
++ list = &transaction->t_iobuf_list;
++ break;
++ case BJ_Shadow:
++ list = &transaction->t_shadow_list;
++ break;
++ case BJ_LogCtl:
++ list = &transaction->t_log_list;
++ break;
++ case BJ_Reserved:
++ list = &transaction->t_reserved_list;
++ break;
++ }
++
++ __blist_del_buffer(list, jh);
++ jh->b_jlist = BJ_None;
++ if (test_and_clear_bit(BH_JBDDirty, &jh2bh(jh)->b_state)) {
++ set_bit(BH_Dirty, &jh2bh(jh)->b_state);
++ }
++}
++
++void journal_unfile_buffer(struct journal_head *jh)
++{
++ spin_lock(&journal_datalist_lock);
++ __journal_unfile_buffer(jh);
++ spin_unlock(&journal_datalist_lock);
++}
++
++/*
++ * Called from journal_try_to_free_buffers(). The journal is not
++ * locked. lru_list_lock is not held.
++ *
++ * Here we see why journal_datalist_lock is global and not per-journal.
++ * We cannot get back to this buffer's journal pointer without locking
++ * out journal_clean_data_list() in some manner.
++ *
++ * One could use journal_datalist_lock to get unracy access to a
++ * per-journal lock.
++ *
++ * Called with journal_datalist_lock held.
++ *
++ * Returns non-zero iff we were able to free the journal_head.
++ */
++static int __journal_try_to_free_buffer(struct buffer_head *bh,
++ int *locked_or_dirty)
++{
++ struct journal_head *jh;
++
++ assert_spin_locked(&journal_datalist_lock);
++
++ jh = bh2jh(bh);
++
++ if (buffer_locked(bh) || buffer_dirty(bh)) {
++ *locked_or_dirty = 1;
++ goto out;
++ }
++
++ if (!buffer_uptodate(bh))
++ goto out;
++
++ if (jh->b_next_transaction != 0)
++ goto out;
++
++ if (jh->b_transaction != 0 && jh->b_cp_transaction == 0) {
++ if (jh->b_jlist == BJ_SyncData || jh->b_jlist==BJ_AsyncData) {
++ /* A written-back ordered data buffer */
++ JBUFFER_TRACE(jh, "release data");
++ __journal_unfile_buffer(jh);
++ jh->b_transaction = 0;
++ __journal_remove_journal_head(bh);
++ __brelse(bh);
++ }
++ }
++ else if (jh->b_cp_transaction != 0 && jh->b_transaction == 0) {
++ /* written-back checkpointed metadata buffer */
++ if (jh->b_jlist == BJ_None) {
++ JBUFFER_TRACE(jh, "remove from checkpoint list");
++ __journal_remove_checkpoint(jh);
++ __journal_remove_journal_head(bh);
++ __brelse(bh);
++ }
++ }
++ return !buffer_jbd(bh);
++
++out:
++ return 0;
++}
++
++/*
++ * journal_try_to_free_buffers(). For all the buffers on this page,
++ * if they are fully written out ordered data, move them onto BUF_CLEAN
++ * so try_to_free_buffers() can reap them. Called with lru_list_lock
++ * not held. Does its own locking.
++ *
++ * This complicates JBD locking somewhat. We aren't protected by the
++ * BKL here. We wish to remove the buffer from its committing or
++ * running transaction's ->t_datalist via __journal_unfile_buffer.
++ *
++ * This may *change* the value of transaction_t->t_datalist, so anyone
++ * who looks at t_datalist needs to lock against this function.
++ *
++ * Even worse, someone may be doing a journal_dirty_data on this
++ * buffer. So we need to lock against that. journal_dirty_data()
++ * will come out of the lock with the buffer dirty, which makes it
++ * ineligible for release here.
++ *
++ * Who else is affected by this? hmm... Really the only contender
++ * is do_get_write_access() - it could be looking at the buffer while
++ * journal_try_to_free_buffer() is changing its state. But that
++ * cannot happen because we never reallocate freed data as metadata
++ * while the data is part of a transaction. Yes?
++ *
++ * This function returns non-zero if we wish try_to_free_buffers()
++ * to be called. We do this if the page is releasable by try_to_free_buffers().
++ * We also do it if the page has locked or dirty buffers and the caller wants
++ * us to perform sync or async writeout.
++ */
++int journal_try_to_free_buffers(journal_t *journal,
++ struct page *page, int gfp_mask)
++{
++ struct buffer_head *bh;
++ struct buffer_head *tmp;
++ int locked_or_dirty = 0;
++ int call_ttfb = 1;
++
++ J_ASSERT(PageLocked(page));
++
++ bh = page->buffers;
++ tmp = bh;
++ spin_lock(&journal_datalist_lock);
++ do {
++ struct buffer_head *p = tmp;
++
++ tmp = tmp->b_this_page;
++ if (buffer_jbd(p))
++ if (!__journal_try_to_free_buffer(p, &locked_or_dirty))
++ call_ttfb = 0;
++ } while (tmp != bh);
++ spin_unlock(&journal_datalist_lock);
++
++ if (!(gfp_mask & (__GFP_IO|__GFP_WAIT)))
++ goto out;
++ if (!locked_or_dirty)
++ goto out;
++ /*
++ * The VM wants us to do writeout, or to block on IO, or both.
++ * So we allow try_to_free_buffers to be called even if the page
++ * still has journalled buffers.
++ */
++ call_ttfb = 1;
++out:
++ return call_ttfb;
++}
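++
++/*
++ * Illustrative sketch (not part of the original patch): a memory-release
++ * hook is expected to gate try_to_free_buffers() on the result:
++ *
++ *	if (journal_try_to_free_buffers(journal, page, gfp_mask))
++ *		freed = try_to_free_buffers(page, gfp_mask);
++ */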
++
++/*
++ * This buffer is no longer needed. If it is on an older transaction's
++ * checkpoint list we need to record it on this transaction's forget list
++ * to pin this buffer (and hence its checkpointing transaction) down until
++ * this transaction commits. If the buffer isn't on a checkpoint list, we
++ * release it.
++ * Returns non-zero if JBD no longer has an interest in the buffer.
++ */
++static int dispose_buffer(struct journal_head *jh,
++ transaction_t *transaction)
++{
++ int may_free = 1;
++ struct buffer_head *bh = jh2bh(jh);
++
++ spin_lock(&journal_datalist_lock);
++ __journal_unfile_buffer(jh);
++ jh->b_transaction = 0;
++
++ if (jh->b_cp_transaction) {
++ JBUFFER_TRACE(jh, "on running+cp transaction");
++ __journal_file_buffer(jh, transaction, BJ_Forget);
++ clear_bit(BH_JBDDirty, &bh->b_state);
++ may_free = 0;
++ } else {
++ JBUFFER_TRACE(jh, "on running transaction");
++ __journal_remove_journal_head(bh);
++ __brelse(bh);
++ }
++ spin_unlock(&journal_datalist_lock);
++ return may_free;
++}
++
++/*
++ * journal_flushpage
++ *
++ * This code is tricky. It has a number of cases to deal with.
++ *
++ * There are two invariants which this code relies on:
++ *
++ * i_size must be updated on disk before we start calling flushpage on the
++ * data.
++ *
++ * This is done in ext3 by defining an ext3_setattr method which
++ * updates i_size before truncate gets going. By maintaining this
++ * invariant, we can be sure that it is safe to throw away any buffers
++ * attached to the current transaction: once the transaction commits,
++ * we know that the data will not be needed.
++ *
++ * Note however that we can *not* throw away data belonging to the
++ * previous, committing transaction!
++ *
++ * Any disk blocks which *are* part of the previous, committing
++ * transaction (and which therefore cannot be discarded immediately) are
++ * not going to be reused in the new running transaction.
++ *
++ * The bitmap committed_data images guarantee this: any block which is
++ * allocated in one transaction and removed in the next will be marked
++ * as in-use in the committed_data bitmap, so cannot be reused until
++ * the next transaction to delete the block commits. This means that
++ * leaving committing buffers dirty is quite safe: the disk blocks
++ * cannot be reallocated to a different file and so buffer aliasing is
++ * not possible.
++ *
++ *
++ * The above applies mainly to ordered data mode. In writeback mode we
++ * don't make guarantees about the order in which data hits disk --- in
++ * particular we don't guarantee that new dirty data is flushed before
++ * transaction commit --- so it is always safe just to discard data
++ * immediately in that mode. --sct
++ */
++
++/*
++ * The journal_unmap_buffer helper function returns zero if the buffer
++ * concerned remains pinned as an anonymous buffer belonging to an older
++ * transaction.
++ *
++ * We're outside-transaction here. Either or both of j_running_transaction
++ * and j_committing_transaction may be NULL.
++ */
++static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
++{
++ transaction_t *transaction;
++ struct journal_head *jh;
++ int may_free = 1;
++
++ BUFFER_TRACE(bh, "entry");
++
++ if (!buffer_mapped(bh))
++ return 1;
++
++ /* It is safe to proceed here without the
++ * journal_datalist_spinlock because the buffers cannot be
++ * stolen by try_to_free_buffers as long as we are holding the
++ * page lock. --sct */
++
++ if (!buffer_jbd(bh))
++ goto zap_buffer;
++
++ jh = bh2jh(bh);
++ transaction = jh->b_transaction;
++ if (transaction == NULL) {
++ /* First case: not on any transaction. If it
++ * has no checkpoint link, then we can zap it:
++ * it's a writeback-mode buffer so we don't care
++ * if it hits disk safely. */
++ if (!jh->b_cp_transaction) {
++ JBUFFER_TRACE(jh, "not on any transaction: zap");
++ goto zap_buffer;
++ }
++
++ if (!buffer_dirty(bh)) {
++ /* bdflush has written it. We can drop it now */
++ goto zap_buffer;
++ }
++
++ /* OK, it must be in the journal but still not
++ * written fully to disk: it's metadata or
++ * journaled data... */
++
++ if (journal->j_running_transaction) {
++ /* ... and once the current transaction has
++ * committed, the buffer won't be needed any
++ * longer. */
++ JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget");
++ return dispose_buffer(jh,
++ journal->j_running_transaction);
++ } else {
++ /* There is no currently-running transaction. So the
++ * orphan record which we wrote for this file must have
++ * passed into commit. We must attach this buffer to
++ * the committing transaction, if it exists. */
++ if (journal->j_committing_transaction) {
++ JBUFFER_TRACE(jh, "give to committing trans");
++ return dispose_buffer(jh,
++ journal->j_committing_transaction);
++ } else {
++ /* The orphan record's transaction has
++ * committed. We can cleanse this buffer */
++ clear_bit(BH_JBDDirty, &bh->b_state);
++ goto zap_buffer;
++ }
++ }
++ } else if (transaction == journal->j_committing_transaction) {
++ /* If it is committing, we simply cannot touch it. We
++		 * can remove its next_transaction pointer from the
++ * running transaction if that is set, but nothing
++ * else. */
++ JBUFFER_TRACE(jh, "on committing transaction");
++ if (jh->b_next_transaction) {
++ J_ASSERT(jh->b_next_transaction ==
++ journal->j_running_transaction);
++ jh->b_next_transaction = NULL;
++ }
++ return 0;
++ } else {
++ /* Good, the buffer belongs to the running transaction.
++ * We are writing our own transaction's data, not any
++ * previous one's, so it is safe to throw it away
++ * (remember that we expect the filesystem to have set
++ * i_size already for this truncate so recovery will not
++ * expose the disk blocks we are discarding here.) */
++ J_ASSERT_JH(jh, transaction == journal->j_running_transaction);
++ may_free = dispose_buffer(jh, transaction);
++ }
++
++zap_buffer:
++ if (buffer_dirty(bh))
++ mark_buffer_clean(bh);
++ J_ASSERT_BH(bh, !buffer_jdirty(bh));
++ clear_bit(BH_Uptodate, &bh->b_state);
++ clear_bit(BH_Mapped, &bh->b_state);
++ clear_bit(BH_Req, &bh->b_state);
++ clear_bit(BH_New, &bh->b_state);
++ return may_free;
++}
++
++/*
++ * Return non-zero if the page's buffers were successfully reaped
++ */
++int journal_flushpage(journal_t *journal,
++ struct page *page,
++ unsigned long offset)
++{
++ struct buffer_head *head, *bh, *next;
++ unsigned int curr_off = 0;
++ int may_free = 1;
++
++ if (!PageLocked(page))
++ BUG();
++ if (!page->buffers)
++ return 1;
++
++ /* We will potentially be playing with lists other than just the
++ * data lists (especially for journaled data mode), so be
++ * cautious in our locking. */
++ lock_journal(journal);
++
++ head = bh = page->buffers;
++ do {
++ unsigned int next_off = curr_off + bh->b_size;
++ next = bh->b_this_page;
++
++ /* AKPM: doing lock_buffer here may be overly paranoid */
++ if (offset <= curr_off) {
++ /* This block is wholly outside the truncation point */
++ lock_buffer(bh);
++ may_free &= journal_unmap_buffer(journal, bh);
++ unlock_buffer(bh);
++ }
++ curr_off = next_off;
++ bh = next;
++
++ } while (bh != head);
++
++ unlock_journal(journal);
++
++ if (!offset) {
++ if (!may_free || !try_to_free_buffers(page, 0))
++ return 0;
++ J_ASSERT(page->buffers == NULL);
++ }
++ return 1;
++}
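++
++/*
++ * Illustrative sketch (not part of the original patch): flushpage is the
++ * truncate-side entry point; a client filesystem wires it up roughly as
++ * follows (the journal accessor is the filesystem's own):
++ *
++ *	static int ext3_flushpage(struct page *page, unsigned long offset)
++ *	{
++ *		journal_t *journal = EXT3_JOURNAL(page->mapping->host);
++ *		return journal_flushpage(journal, page, offset);
++ *	}
++ */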
++
++/*
++ * File a buffer on the given transaction list.
++ */
++void __journal_file_buffer(struct journal_head *jh,
++ transaction_t *transaction, int jlist)
++{
++ struct journal_head **list = 0;
++
++ assert_spin_locked(&journal_datalist_lock);
++
++#ifdef __SMP__
++ J_ASSERT (current->lock_depth >= 0);
++#endif
++ J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
++ J_ASSERT_JH(jh, jh->b_transaction == transaction ||
++ jh->b_transaction == 0);
++
++ if (jh->b_transaction) {
++ if (jh->b_jlist == jlist)
++ return;
++ __journal_unfile_buffer(jh);
++ } else {
++ jh->b_transaction = transaction;
++ }
++
++ switch (jlist) {
++ case BJ_None:
++ J_ASSERT_JH(jh, !jh->b_committed_data);
++ J_ASSERT_JH(jh, !jh->b_frozen_data);
++ return;
++ case BJ_SyncData:
++ list = &transaction->t_sync_datalist;
++ break;
++ case BJ_AsyncData:
++ list = &transaction->t_async_datalist;
++ break;
++ case BJ_Metadata:
++ transaction->t_nr_buffers++;
++ list = &transaction->t_buffers;
++ break;
++ case BJ_Forget:
++ list = &transaction->t_forget;
++ break;
++ case BJ_IO:
++ list = &transaction->t_iobuf_list;
++ break;
++ case BJ_Shadow:
++ list = &transaction->t_shadow_list;
++ break;
++ case BJ_LogCtl:
++ list = &transaction->t_log_list;
++ break;
++ case BJ_Reserved:
++ list = &transaction->t_reserved_list;
++ break;
++ }
++
++ __blist_add_buffer(list, jh);
++ jh->b_jlist = jlist;
++
++ if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
++ jlist == BJ_Shadow || jlist == BJ_Forget) {
++ if (atomic_set_buffer_clean(jh2bh(jh))) {
++ set_bit(BH_JBDDirty, &jh2bh(jh)->b_state);
++ }
++ }
++}
++
++void journal_file_buffer(struct journal_head *jh,
++ transaction_t *transaction, int jlist)
++{
++ spin_lock(&journal_datalist_lock);
++ __journal_file_buffer(jh, transaction, jlist);
++ spin_unlock(&journal_datalist_lock);
++}
++
++/*
++ * Remove a buffer from its current buffer list in preparation for
++ * dropping it from its current transaction entirely. If the buffer has
++ * already started to be used by a subsequent transaction, refile the
++ * buffer on that transaction's metadata list.
++ */
++
++void __journal_refile_buffer(struct journal_head *jh)
++{
++ assert_spin_locked(&journal_datalist_lock);
++#ifdef __SMP__
++ J_ASSERT_JH(jh, current->lock_depth >= 0);
++#endif
++ __journal_unfile_buffer(jh);
++
++ /* If the buffer is now unused, just drop it. If it has been
++ modified by a later transaction, add it to the new
++ transaction's metadata list. */
++
++ jh->b_transaction = jh->b_next_transaction;
++ jh->b_next_transaction = NULL;
++
++ if (jh->b_transaction != NULL) {
++ __journal_file_buffer(jh, jh->b_transaction, BJ_Metadata);
++ J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
++ } else {
++ /* Onto BUF_DIRTY for writeback */
++ refile_buffer(jh2bh(jh));
++ }
++}
++
++/*
++ * For the unlocked version of this call, also make sure that any
++ * hanging journal_head is cleaned up if necessary.
++ *
++ * __journal_refile_buffer is usually called as part of a single locked
++ * operation on a buffer_head, in which the caller is probably going to
++ * be hooking the journal_head onto other lists. In that case it is up
++ * to the caller to remove the journal_head if necessary. For the
++ * unlocked journal_refile_buffer call, the caller isn't going to be
++ * doing anything else to the buffer so we need to do the cleanup
++ * ourselves to avoid a jh leak.
++ *
++ * *** The journal_head may be freed by this call! ***
++ */
++void journal_refile_buffer(struct journal_head *jh)
++{
++ struct buffer_head *bh;
++
++ spin_lock(&journal_datalist_lock);
++ bh = jh2bh(jh);
++
++ __journal_refile_buffer(jh);
++ __journal_remove_journal_head(bh);
++
++ spin_unlock(&journal_datalist_lock);
++ __brelse(bh);
++}
--- /dev/null
+ include/linux/mm.h | 1 +
+ 1 files changed, 1 insertion(+)
+
+Index: linux.mcp2/include/linux/mm.h
+===================================================================
+--- linux.mcp2.orig/include/linux/mm.h 2004-05-05 14:32:29.000000000 -0700
++++ linux.mcp2/include/linux/mm.h 2004-05-05 14:46:54.000000000 -0700
+@@ -162,6 +162,7 @@
+ protected by pagemap_lru_lock !! */
+ struct page **pprev_hash; /* Complement to *next_hash. */
+ struct buffer_head * buffers; /* Buffer maps us to a disk block. */
++ unsigned long private;
+
+ /*
+ * On machines where all RAM is mapped into kernel address space,
--- /dev/null
+ drivers/block/blkpg.c | 36 ++++++++++++++++++++++++++++++++++++
+ drivers/block/loop.c | 3 +++
+ drivers/ide/ide-disk.c | 4 ++++
+ 3 files changed, 43 insertions(+)
+
+Index: linux-2.4.19.SuSE/drivers/block/blkpg.c
+===================================================================
+--- linux-2.4.19.SuSE.orig/drivers/block/blkpg.c 2004-04-29 16:19:25.000000000 -0700
++++ linux-2.4.19.SuSE/drivers/block/blkpg.c 2004-04-29 16:35:09.000000000 -0700
+@@ -296,3 +296,38 @@
+ }
+
+ EXPORT_SYMBOL(blk_ioctl);
++
++
++#define NUM_DEV_NO_WRITE 16
++static int dev_no_write[NUM_DEV_NO_WRITE];
++/*
++ * Debug code for turning block devices "read-only" (will discard writes
++ * silently). This is for filesystem crash/recovery testing.
++ */
++void dev_set_rdonly(kdev_t dev, int no_write)
++{
++ if (dev) {
++ printk(KERN_WARNING "Turning device %s read-only\n",
++ bdevname(dev));
++ dev_no_write[no_write] = 0xdead0000 + dev;
++ }
++}
++
++int dev_check_rdonly(kdev_t dev) {
++ int i;
++
++ for (i = 0; i < NUM_DEV_NO_WRITE; i++) {
++ if ((dev_no_write[i] & 0xffff0000) == 0xdead0000 &&
++ dev == (dev_no_write[i] & 0xffff))
++ return 1;
++ }
++ return 0;
++}
++
++void dev_clear_rdonly(int no_write) {
++ dev_no_write[no_write] = 0;
++}
++
++EXPORT_SYMBOL(dev_set_rdonly);
++EXPORT_SYMBOL(dev_check_rdonly);
++EXPORT_SYMBOL(dev_clear_rdonly);
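++
++/*
++ * Illustrative usage (not part of the original patch): a recovery test
++ * harness silently drops writes, simulates a crash, then re-enables the
++ * device before running recovery.  The slot index (2) is arbitrary:
++ *
++ *	dev_set_rdonly(inode->i_rdev, 2);
++ *	... generate filesystem load, "crash" ...
++ *	dev_clear_rdonly(2);
++ */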
+Index: linux-2.4.19.SuSE/drivers/block/loop.c
+===================================================================
+--- linux-2.4.19.SuSE.orig/drivers/block/loop.c 2004-04-29 16:19:25.000000000 -0700
++++ linux-2.4.19.SuSE/drivers/block/loop.c 2004-04-29 16:32:56.000000000 -0700
+@@ -478,6 +478,9 @@
+ spin_unlock_irq(&lo->lo_lock);
+
+ if (rw == WRITE) {
++ if (dev_check_rdonly(rbh->b_rdev))
++ goto err;
++
+ if (lo->lo_flags & LO_FLAGS_READ_ONLY)
+ goto err;
+ } else if (rw == READA) {
+Index: linux-2.4.19.SuSE/drivers/ide/ide-disk.c
+===================================================================
+--- linux-2.4.19.SuSE.orig/drivers/ide/ide-disk.c 2004-04-29 16:18:55.000000000 -0700
++++ linux-2.4.19.SuSE/drivers/ide/ide-disk.c 2004-04-29 16:32:56.000000000 -0700
+@@ -558,6 +558,10 @@
+ */
+ static ide_startstop_t do_rw_disk (ide_drive_t *drive, struct request *rq, unsigned long block)
+ {
++ if (rq->cmd == WRITE && dev_check_rdonly(rq->rq_dev)) {
++ ide_end_request(1, HWGROUP(drive));
++ return ide_stopped;
++ }
+ if (IDE_CONTROL_REG)
+ OUT_BYTE(drive->ctl,IDE_CONTROL_REG);
+
--- /dev/null
+Index: linux-bgl/kernel/sched.c
+===================================================================
+--- linux-bgl.orig/kernel/sched.c 2003-07-02 08:43:33.000000000 -0700
++++ linux-bgl/kernel/sched.c 2004-10-26 23:37:44.314193755 -0700
+@@ -1124,7 +1124,7 @@
+ return retval;
+ }
+
+-static void show_task(struct task_struct * p)
++void show_task(struct task_struct * p)
+ {
+ unsigned long free = 0;
+ int state;
+Index: linux-bgl/kernel/ksyms.c
+===================================================================
+--- linux-bgl.orig/kernel/ksyms.c 2004-10-26 23:23:00.518654978 -0700
++++ linux-bgl/kernel/ksyms.c 2004-10-26 23:38:29.289071295 -0700
+@@ -76,6 +76,7 @@
+ };
+ #endif
+
++void show_task(struct task_struct *);
+
+ EXPORT_SYMBOL(inter_module_register);
+ EXPORT_SYMBOL(inter_module_unregister);
+@@ -595,3 +596,6 @@
+
+ EXPORT_SYMBOL(tasklist_lock);
+ EXPORT_SYMBOL(pidhash);
++
++/* debug */
++EXPORT_SYMBOL(show_task);
--- /dev/null
+ include/linux/mm.h | 1 +
+ mm/filemap.c | 3 ++-
+ 2 files changed, 3 insertions(+), 1 deletion(-)
+
+Index: linux-ion/include/linux/mm.h
+===================================================================
+--- linux-ion.orig/include/linux/mm.h 2004-07-28 14:34:57.000000000 -0700
++++ linux-ion/include/linux/mm.h 2004-09-27 15:07:50.000000000 -0700
+@@ -593,6 +593,7 @@
+ /* filemap.c */
+ extern void remove_inode_page(struct page *);
+ extern unsigned long page_unuse(struct page *);
++extern void truncate_complete_page(struct page *);
+ extern void truncate_inode_pages(struct address_space *, loff_t);
+
+ /* generic vm_area_ops exported for stackable file systems */
+Index: linux-ion/mm/filemap.c
+===================================================================
+--- linux-ion.orig/mm/filemap.c 2004-07-28 14:34:57.000000000 -0700
++++ linux-ion/mm/filemap.c 2004-09-27 15:08:13.000000000 -0700
+@@ -231,7 +231,7 @@
+ do_flushpage(page, partial);
+ }
+
+-static void truncate_complete_page(struct page *page)
++void truncate_complete_page(struct page *page)
+ {
+ /* Leave it on the LRU if it gets converted into anonymous buffers */
+ if (!page->buffers || do_flushpage(page, 0))
+@@ -249,6 +249,7 @@
+ remove_inode_page(page);
+ page_cache_release(page);
+ }
++EXPORT_SYMBOL(truncate_complete_page);
+
+ static int FASTCALL(truncate_list_pages(struct list_head *, unsigned long, unsigned *));
+ static int truncate_list_pages(struct list_head *head, unsigned long start, unsigned *partial)
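
truncate_complete_page expects a locked page and, as the hunk shows, finishes by calling remove_inode_page and page_cache_release itself: it drops the page cache's reference, but not the caller's. A caller that looked the page up therefore still owes its own unlock and release. A sketch of single-page eviction against the 2.4 API (drop_cached_page is a hypothetical helper name):

#include <linux/mm.h>
#include <linux/pagemap.h>

/* Evict one cached page of an inode, e.g. after a lock revocation. */
static void drop_cached_page(struct inode *inode, unsigned long index)
{
	struct page *page;

	page = find_lock_page(inode->i_mapping, index);
	if (page == NULL)
		return;

	truncate_complete_page(page);	/* unhashes page, drops cache ref */
	UnlockPage(page);		/* we still hold our own lock... */
	page_cache_release(page);	/* ...and our own reference */
}
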
--- /dev/null
+
+
+
+Index: linux-ion/kernel/ksyms.c
+===================================================================
+--- linux-ion.orig/kernel/ksyms.c 2004-07-28 14:34:57.000000000 -0700
++++ linux-ion/kernel/ksyms.c 2004-09-27 15:04:52.000000000 -0700
+@@ -286,6 +286,10 @@
+ EXPORT_SYMBOL(dcache_readdir);
+ EXPORT_SYMBOL(dcache_dir_ops);
+
++/* lustre */
++EXPORT_SYMBOL(panic_notifier_list);
++EXPORT_SYMBOL(do_kern_mount);
++
+ /* for stackable file systems (lofs, wrapfs, cryptfs, etc.) */
+ EXPORT_SYMBOL(default_llseek);
+ EXPORT_SYMBOL(dentry_open);
+Index: linux-ion/include/linux/fs.h
+===================================================================
+--- linux-ion.orig/include/linux/fs.h 2004-07-28 14:34:57.000000000 -0700
++++ linux-ion/include/linux/fs.h 2004-09-27 15:04:52.000000000 -0700
+@@ -1050,6 +1050,7 @@
+ extern struct vfsmount *kern_mount(struct file_system_type *);
+ extern int may_umount(struct vfsmount *);
+ extern long do_mount(char *, char *, char *, unsigned long, void *);
++struct vfsmount *do_kern_mount(const char *fstype, int flags, char *name, void *data);
+ extern void umount_tree(struct vfsmount *);
+
+ #define kern_umount mntput
+Index: linux-ion/mm/memory.c
+===================================================================
+--- linux-ion.orig/mm/memory.c 2004-07-28 14:34:57.000000000 -0700
++++ linux-ion/mm/memory.c 2004-09-27 15:05:56.000000000 -0700
+@@ -401,6 +401,7 @@
+ mm->rss = 0;
+ spin_unlock(&mm->page_table_lock);
+ }
++EXPORT_SYMBOL(zap_page_range);
+
+ /*
+ * Do a quick page-table lookup for a single page.
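
panic_notifier_list is the notifier chain run from panic(); exporting it lets a module flush its state (for Lustre, the in-memory debug log) while the machine is still limping. do_kern_mount and zap_page_range are straight exports of existing VFS/MM entry points. A minimal sketch of hooking the panic chain with the standard 2.4 notifier API (the my_ names are illustrative):

#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/kernel.h>

extern struct notifier_block *panic_notifier_list;	/* 2.4 list head */

static int my_panic_event(struct notifier_block *self,
			  unsigned long event, void *unused)
{
	/* flush debug buffers, mark on-disk state, etc. */
	return NOTIFY_DONE;
}

static struct notifier_block my_panic_block = {
	notifier_call:	my_panic_event,		/* 2.4-era field initializer */
};

static int __init my_init(void)
{
	notifier_chain_register(&panic_notifier_list, &my_panic_block);
	return 0;
}

static void __exit my_exit(void)
{
	notifier_chain_unregister(&panic_notifier_list, &my_panic_block);
}

module_init(my_init);
module_exit(my_exit);
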
--- /dev/null
+ fs/ext3/Makefile | 2 ++
+ fs/ext3/super.c | 2 +-
+ include/linux/fs.h | 1 +
+ kernel/ksyms.c | 4 ++++
+ 4 files changed, 8 insertions(+), 1 deletion(-)
+
+Index: linux-2.4.19.SuSE/fs/ext3/Makefile
+===================================================================
+--- linux-2.4.19.SuSE.orig/fs/ext3/Makefile 2004-04-29 16:18:08.000000000 -0700
++++ linux-2.4.19.SuSE/fs/ext3/Makefile 2004-04-29 16:36:09.000000000 -0700
+@@ -9,6 +9,8 @@
+
+ O_TARGET := ext3.o
+
++export-objs := super.o inode.o
++
+ obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
+ ioctl.o namei.o super.o symlink.o
+ obj-m := $(O_TARGET)
+Index: linux-2.4.19.SuSE/fs/ext3/super.c
+===================================================================
+--- linux-2.4.19.SuSE.orig/fs/ext3/super.c 2004-04-29 16:18:08.000000000 -0700
++++ linux-2.4.19.SuSE/fs/ext3/super.c 2004-04-29 16:36:09.000000000 -0700
+@@ -1821,7 +1821,7 @@
+ exit_ext3_xattr();
+ }
+
+-EXPORT_NO_SYMBOLS;
++EXPORT_SYMBOL(ext3_bread);
+
+ MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
+ MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions");
+Index: linux-2.4.19.SuSE/include/linux/fs.h
+===================================================================
+--- linux-2.4.19.SuSE.orig/include/linux/fs.h 2004-04-29 16:19:41.000000000 -0700
++++ linux-2.4.19.SuSE/include/linux/fs.h 2004-04-29 16:36:52.000000000 -0700
+@@ -1174,6 +1174,7 @@
+ extern struct vfsmount *kern_mount(struct file_system_type *);
+ extern int may_umount(struct vfsmount *);
+ extern long do_mount(char *, char *, char *, unsigned long, void *);
++struct vfsmount *do_kern_mount(const char *type, int flags, char *name, void *data);
+ extern void umount_tree(struct vfsmount *);
+
+ #define kern_umount mntput
+Index: linux-2.4.19.SuSE/kernel/ksyms.c
+===================================================================
+--- linux-2.4.19.SuSE.orig/kernel/ksyms.c 2004-04-29 16:19:35.000000000 -0700
++++ linux-2.4.19.SuSE/kernel/ksyms.c 2004-04-29 16:36:09.000000000 -0700
+@@ -330,6 +330,10 @@
+ EXPORT_SYMBOL(dcache_readdir);
+ EXPORT_SYMBOL(dcache_dir_ops);
+
++/* lustre */
++EXPORT_SYMBOL(pagecache_lock_cacheline);
++EXPORT_SYMBOL(do_kern_mount);
++
+ /* for stackable file systems (lofs, wrapfs, cryptfs, etc.) */
+ EXPORT_SYMBOL(default_llseek);
+ EXPORT_SYMBOL(dentry_open);
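
ext3_bread is ext3's mapping-aware block read: given an inode and a logical block number it returns the buffer_head for the underlying disk block, optionally allocating it first. The htree patch that follows calls it constantly, which is why the module must export it rather than remain EXPORT_NO_SYMBOLS. A sketch of a read-only lookup mirroring those call sites (read_dir_block0 is a hypothetical helper):

#include <linux/fs.h>
#include <linux/ext3_fs.h>
#include <linux/ext3_jbd.h>

/* Read logical block 0 of a directory without allocating anything. */
static struct buffer_head *read_dir_block0(struct inode *dir)
{
	struct buffer_head *bh;
	int err;

	bh = ext3_bread(NULL /* no transaction */, dir,
			0 /* logical block */, 0 /* don't create */, &err);
	if (bh == NULL)
		printk(KERN_WARNING "dir block 0 unreadable: %d\n", err);
	return bh;		/* caller must brelse(bh) when done */
}
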
--- /dev/null
+ fs/ext3/Makefile | 2
+ fs/ext3/dir.c | 299 +++++++++
+ fs/ext3/file.c | 3
+ fs/ext3/hash.c | 215 ++++++
+ fs/ext3/namei.c | 1388 ++++++++++++++++++++++++++++++++++++++++-----
+ fs/ext3/super.c | 7
+ include/linux/ext3_fs.h | 85 ++
+ include/linux/ext3_fs_sb.h | 2
+ include/linux/ext3_jbd.h | 2
+ include/linux/rbtree.h | 2
+ lib/rbtree.c | 42 +
+ 11 files changed, 1887 insertions(+), 160 deletions(-)
+
+Index: linux-2.4.19.SuSE/fs/ext3/Makefile
+===================================================================
+--- linux-2.4.19.SuSE.orig/fs/ext3/Makefile 2004-05-27 11:07:21.000000000 -0700
++++ linux-2.4.19.SuSE/fs/ext3/Makefile 2004-05-27 11:08:28.000000000 -0700
+@@ -12,7 +12,7 @@
+ export-objs := super.o inode.o
+
+ obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
+- ioctl.o namei.o super.o symlink.o
++ ioctl.o namei.o super.o symlink.o hash.o
+ obj-m := $(O_TARGET)
+
+ obj-$(CONFIG_EXT3_FS_XATTR) += xattr.o
+Index: linux-2.4.19.SuSE/fs/ext3/dir.c
+===================================================================
+--- linux-2.4.19.SuSE.orig/fs/ext3/dir.c 2001-11-09 14:25:04.000000000 -0800
++++ linux-2.4.19.SuSE/fs/ext3/dir.c 2004-05-27 11:08:28.000000000 -0700
+@@ -21,12 +21,16 @@
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
++#include <linux/slab.h>
++#include <linux/rbtree.h>
+
+ static unsigned char ext3_filetype_table[] = {
+ DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
+ };
+
+ static int ext3_readdir(struct file *, void *, filldir_t);
++static int ext3_dx_readdir(struct file * filp,
++ void * dirent, filldir_t filldir);
+
+ struct file_operations ext3_dir_operations = {
+ read: generic_read_dir,
+@@ -35,6 +39,17 @@
+ fsync: ext3_sync_file, /* BKL held */
+ };
+
++
++static unsigned char get_dtype(struct super_block *sb, int filetype)
++{
++ if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE) ||
++ (filetype >= EXT3_FT_MAX))
++ return DT_UNKNOWN;
++
++ return (ext3_filetype_table[filetype]);
++}
++
++
+ int ext3_check_dir_entry (const char * function, struct inode * dir,
+ struct ext3_dir_entry_2 * de,
+ struct buffer_head * bh,
+@@ -79,6 +94,16 @@
+
+ sb = inode->i_sb;
+
++ if (is_dx(inode)) {
++ err = ext3_dx_readdir(filp, dirent, filldir);
++ if (err != ERR_BAD_DX_DIR)
++ return err;
++ /*
++ * We don't set the inode dirty flag since it's not
++ * critical that it get flushed back to the disk.
++ */
++ EXT3_I(filp->f_dentry->d_inode)->i_flags &= ~EXT3_INDEX_FL;
++ }
+ stored = 0;
+ bh = NULL;
+ offset = filp->f_pos & (sb->s_blocksize - 1);
+@@ -162,18 +187,12 @@
+ * during the copy operation.
+ */
+ unsigned long version = filp->f_version;
+- unsigned char d_type = DT_UNKNOWN;
+
+- if (EXT3_HAS_INCOMPAT_FEATURE(sb,
+- EXT3_FEATURE_INCOMPAT_FILETYPE)
+- && de->file_type < EXT3_FT_MAX)
+- d_type =
+- ext3_filetype_table[de->file_type];
+ error = filldir(dirent, de->name,
+ de->name_len,
+ filp->f_pos,
+ le32_to_cpu(de->inode),
+- d_type);
++ get_dtype(sb, de->file_type));
+ if (error)
+ break;
+ if (version != filp->f_version)
+@@ -188,3 +207,271 @@
+ UPDATE_ATIME(inode);
+ return 0;
+ }
++
++#ifdef CONFIG_EXT3_INDEX
++/*
++ * These functions convert from the major/minor hash to an f_pos
++ * value.
++ *
++ * Currently we only use the major hash number. This is unfortunate, but
++ * on 32-bit machines, the same VFS interface is used for lseek and
++ * llseek, so if we use the 64 bit offset, then the 32-bit versions of
++ * lseek/telldir/seekdir will blow out spectacularly, and from within
++ * the ext2 low-level routine, we don't know if we're being called by
++ * a 64-bit version of the system call or the 32-bit version of the
++ * system call. Worse yet, NFSv2 only allows for a 32-bit readdir
++ * cookie. Sigh.
++ */
++#define hash2pos(major, minor) (major >> 1)
++#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff)
++#define pos2min_hash(pos) (0)
++
++/*
++ * This structure holds the nodes of the red-black tree used to store
++ * the directory entry in hash order.
++ */
++struct fname {
++ __u32 hash;
++ __u32 minor_hash;
++ rb_node_t rb_hash;
++ struct fname *next;
++ __u32 inode;
++ __u8 name_len;
++ __u8 file_type;
++ char name[0];
++};
++
++/*
++ * This function implements a non-recursive way of freeing all of the
++ * nodes in the red-black tree.
++ */
++static void free_rb_tree_fname(rb_root_t *root)
++{
++ rb_node_t *n = root->rb_node;
++ rb_node_t *parent;
++ struct fname *fname;
++
++ while (n) {
++ /* Do the node's children first */
++ if ((n)->rb_left) {
++ n = n->rb_left;
++ continue;
++ }
++ if (n->rb_right) {
++ n = n->rb_right;
++ continue;
++ }
++ /*
++ * The node has no children; free it, and then zero
++ * out parent's link to it. Finally go to the
++ * beginning of the loop and try to free the parent
++ * node.
++ */
++ parent = n->rb_parent;
++ fname = rb_entry(n, struct fname, rb_hash);
++ kfree(fname);
++ if (!parent)
++ root->rb_node = 0;
++ else if (parent->rb_left == n)
++ parent->rb_left = 0;
++ else if (parent->rb_right == n)
++ parent->rb_right = 0;
++ n = parent;
++ }
++ root->rb_node = 0;
++}
++
++
++struct dir_private_info *create_dir_info(loff_t pos)
++{
++ struct dir_private_info *p;
++
++ p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL);
++ if (!p)
++ return NULL;
++ p->root.rb_node = 0;
++ p->curr_node = 0;
++ p->extra_fname = 0;
++ p->last_pos = 0;
++ p->curr_hash = pos2maj_hash(pos);
++ p->curr_minor_hash = pos2min_hash(pos);
++ p->next_hash = 0;
++ return p;
++}
++
++void ext3_htree_free_dir_info(struct dir_private_info *p)
++{
++ free_rb_tree_fname(&p->root);
++ kfree(p);
++}
++
++/*
++ * Given a directory entry, enter it into the fname rb tree.
++ */
++void ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
++ __u32 minor_hash,
++ struct ext3_dir_entry_2 *dirent)
++{
++ rb_node_t **p, *parent = NULL;
++ struct fname * fname, *new_fn;
++ struct dir_private_info *info;
++ int len;
++
++ info = (struct dir_private_info *) dir_file->private_data;
++ p = &info->root.rb_node;
++
++ /* Create and allocate the fname structure */
++ len = sizeof(struct fname) + dirent->name_len + 1;
++	new_fn = kmalloc(len, GFP_KERNEL);
++	if (!new_fn)
++		return;		/* drop the entry if we are out of memory */
++	memset(new_fn, 0, len);
++ new_fn->hash = hash;
++ new_fn->minor_hash = minor_hash;
++ new_fn->inode = le32_to_cpu(dirent->inode);
++ new_fn->name_len = dirent->name_len;
++ new_fn->file_type = dirent->file_type;
++ memcpy(new_fn->name, dirent->name, dirent->name_len);
++ new_fn->name[dirent->name_len] = 0;
++
++ while (*p) {
++ parent = *p;
++ fname = rb_entry(parent, struct fname, rb_hash);
++
++ /*
++ * If the hash and minor hash match up, then we put
++ * them on a linked list. This rarely happens...
++ */
++ if ((new_fn->hash == fname->hash) &&
++ (new_fn->minor_hash == fname->minor_hash)) {
++ new_fn->next = fname->next;
++ fname->next = new_fn;
++ return;
++ }
++
++ if (new_fn->hash < fname->hash)
++ p = &(*p)->rb_left;
++ else if (new_fn->hash > fname->hash)
++ p = &(*p)->rb_right;
++ else if (new_fn->minor_hash < fname->minor_hash)
++ p = &(*p)->rb_left;
++ else /* if (new_fn->minor_hash > fname->minor_hash) */
++ p = &(*p)->rb_right;
++ }
++
++ rb_link_node(&new_fn->rb_hash, parent, p);
++ rb_insert_color(&new_fn->rb_hash, &info->root);
++}
++
++
++
++/*
++ * This is a helper function for ext3_dx_readdir. It calls filldir
++ * for all entries on the fname linked list. (Normally there is only
++ * one entry on the linked list, unless there are 62 bit hash collisions.)
++ */
++static int call_filldir(struct file * filp, void * dirent,
++ filldir_t filldir, struct fname *fname)
++{
++ struct dir_private_info *info = filp->private_data;
++ loff_t curr_pos;
++ struct inode *inode = filp->f_dentry->d_inode;
++ struct super_block * sb;
++ int error;
++
++ sb = inode->i_sb;
++
++ if (!fname) {
++ printk("call_filldir: called with null fname?!?\n");
++ return 0;
++ }
++ curr_pos = hash2pos(fname->hash, fname->minor_hash);
++ while (fname) {
++ error = filldir(dirent, fname->name,
++ fname->name_len, curr_pos,
++ fname->inode,
++ get_dtype(sb, fname->file_type));
++ if (error) {
++ filp->f_pos = curr_pos;
++ info->extra_fname = fname->next;
++ return error;
++ }
++ fname = fname->next;
++ }
++ return 0;
++}
++
++static int ext3_dx_readdir(struct file * filp,
++ void * dirent, filldir_t filldir)
++{
++ struct dir_private_info *info = filp->private_data;
++ struct inode *inode = filp->f_dentry->d_inode;
++ struct fname *fname;
++ int ret;
++
++ if (!info) {
++ info = create_dir_info(filp->f_pos);
++ if (!info)
++ return -ENOMEM;
++ filp->private_data = info;
++ }
++
++	/* Someone has messed with f_pos; reset the world */
++ if (info->last_pos != filp->f_pos) {
++ free_rb_tree_fname(&info->root);
++ info->curr_node = 0;
++ info->extra_fname = 0;
++ info->curr_hash = pos2maj_hash(filp->f_pos);
++ info->curr_minor_hash = pos2min_hash(filp->f_pos);
++ }
++
++ /*
++ * If there are any leftover names on the hash collision
++ * chain, return them first.
++ */
++ if (info->extra_fname &&
++ call_filldir(filp, dirent, filldir, info->extra_fname))
++ goto finished;
++
++ if (!info->curr_node)
++ info->curr_node = rb_get_first(&info->root);
++
++ while (1) {
++ /*
++ * Fill the rbtree if we have no more entries,
++ * or the inode has changed since we last read in the
++ * cached entries.
++ */
++ if ((!info->curr_node) ||
++ (filp->f_version != inode->i_version)) {
++ info->curr_node = 0;
++ free_rb_tree_fname(&info->root);
++ filp->f_version = inode->i_version;
++ ret = ext3_htree_fill_tree(filp, info->curr_hash,
++ info->curr_minor_hash,
++ &info->next_hash);
++ if (ret < 0)
++ return ret;
++ if (ret == 0)
++ break;
++ info->curr_node = rb_get_first(&info->root);
++ }
++
++ fname = rb_entry(info->curr_node, struct fname, rb_hash);
++ info->curr_hash = fname->hash;
++ info->curr_minor_hash = fname->minor_hash;
++ if (call_filldir(filp, dirent, filldir, fname))
++ break;
++
++ info->curr_node = rb_get_next(info->curr_node);
++ if (!info->curr_node) {
++ info->curr_hash = info->next_hash;
++ info->curr_minor_hash = 0;
++ }
++ }
++finished:
++ info->last_pos = filp->f_pos;
++ UPDATE_ATIME(inode);
++ return 0;
++}
++#endif
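
The hash2pos/pos2maj_hash macros near the top of this file are deliberately lossy: folding the 32-bit major hash into the telldir cookie discards the low bit, so a seekdir can only resume at an even hash value, leaving odd values free to act as the continuation flag that ext3_htree_next_block tests. A tiny user-space check of the round trip:

#include <assert.h>
#include <stdio.h>

typedef unsigned int u32;

#define hash2pos(major, minor)	((major) >> 1)
#define pos2maj_hash(pos)	(((pos) << 1) & 0xffffffff)
#define pos2min_hash(pos)	(0)

int main(void)
{
	u32 hash = 0xdeadbeef;
	u32 pos = hash2pos(hash, 0x1234);

	/* the minor hash and the low bit of the major hash are lost */
	assert(pos2maj_hash(pos) == (hash & ~1u));
	assert(pos2min_hash(pos) == 0);
	printf("hash %#x -> pos %#x -> hash %#x\n",
	       hash, pos, pos2maj_hash(pos));
	return 0;
}
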
+Index: linux-2.4.19.SuSE/fs/ext3/namei.c
+===================================================================
+--- linux-2.4.19.SuSE.orig/fs/ext3/namei.c 2002-12-04 09:46:03.000000000 -0800
++++ linux-2.4.19.SuSE/fs/ext3/namei.c 2004-05-27 11:08:52.000000000 -0700
+@@ -16,6 +16,12 @@
+ * David S. Miller (davem@caip.rutgers.edu), 1995
+ * Directory entry file type support and forward compatibility hooks
+ * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998
++ * Hash Tree Directory indexing (c)
++ * Daniel Phillips, 2001
++ * Hash Tree Directory indexing porting
++ * Christopher Li, 2002
++ * Hash Tree Directory indexing cleanup
++ * Theodore Ts'o, 2002
+ */
+
+ #include <linux/fs.h>
+@@ -40,6 +46,630 @@
+ #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
+ #define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
+
++static struct buffer_head *ext3_append(handle_t *handle,
++ struct inode *inode,
++ u32 *block, int *err)
++{
++ struct buffer_head *bh;
++
++ *block = inode->i_size >> inode->i_sb->s_blocksize_bits;
++
++ if ((bh = ext3_bread(handle, inode, *block, 1, err))) {
++ inode->i_size += inode->i_sb->s_blocksize;
++ EXT3_I(inode)->i_disksize = inode->i_size;
++ ext3_journal_get_write_access(handle,bh);
++ }
++ return bh;
++}
++
++#ifndef assert
++#define assert(test) J_ASSERT(test)
++#endif
++
++#ifndef swap
++#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0)
++#endif
++
++typedef struct { u32 v; } le_u32;
++typedef struct { u16 v; } le_u16;
++
++#ifdef DX_DEBUG
++#define dxtrace(command) command
++#else
++#define dxtrace(command)
++#endif
++
++struct fake_dirent
++{
++ /*le*/u32 inode;
++ /*le*/u16 rec_len;
++ u8 name_len;
++ u8 file_type;
++};
++
++struct dx_countlimit
++{
++ le_u16 limit;
++ le_u16 count;
++};
++
++struct dx_entry
++{
++ le_u32 hash;
++ le_u32 block;
++};
++
++/*
++ * dx_root_info is laid out so that if it should somehow get overlaid by a
++ * dirent the two low bits of the hash version will be zero. Therefore, the
++ * hash version mod 4 should never be 0. Sincerely, the paranoia department.
++ */
++
++struct dx_root
++{
++ struct fake_dirent dot;
++ char dot_name[4];
++ struct fake_dirent dotdot;
++ char dotdot_name[4];
++ struct dx_root_info
++ {
++ le_u32 reserved_zero;
++ u8 hash_version;
++ u8 info_length; /* 8 */
++ u8 indirect_levels;
++ u8 unused_flags;
++ }
++ info;
++ struct dx_entry entries[0];
++};
++
++struct dx_node
++{
++ struct fake_dirent fake;
++ struct dx_entry entries[0];
++};
++
++
++struct dx_frame
++{
++ struct buffer_head *bh;
++ struct dx_entry *entries;
++ struct dx_entry *at;
++};
++
++struct dx_map_entry
++{
++ u32 hash;
++ u32 offs;
++};
++
++#ifdef CONFIG_EXT3_INDEX
++static inline unsigned dx_get_block (struct dx_entry *entry);
++static void dx_set_block (struct dx_entry *entry, unsigned value);
++static inline unsigned dx_get_hash (struct dx_entry *entry);
++static void dx_set_hash (struct dx_entry *entry, unsigned value);
++static unsigned dx_get_count (struct dx_entry *entries);
++static unsigned dx_get_limit (struct dx_entry *entries);
++static void dx_set_count (struct dx_entry *entries, unsigned value);
++static void dx_set_limit (struct dx_entry *entries, unsigned value);
++static unsigned dx_root_limit (struct inode *dir, unsigned infosize);
++static unsigned dx_node_limit (struct inode *dir);
++static struct dx_frame *dx_probe(struct dentry *dentry,
++ struct inode *dir,
++ struct dx_hash_info *hinfo,
++ struct dx_frame *frame,
++ int *err);
++static void dx_release (struct dx_frame *frames);
++static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
++ struct dx_hash_info *hinfo, struct dx_map_entry map[]);
++static void dx_sort_map(struct dx_map_entry *map, unsigned count);
++static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to,
++ struct dx_map_entry *offsets, int count);
++static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size);
++static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
++static int ext3_htree_next_block(struct inode *dir, __u32 hash,
++ struct dx_frame *frame,
++ struct dx_frame *frames, int *err,
++ __u32 *start_hash);
++static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
++ struct ext3_dir_entry_2 **res_dir, int *err);
++static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
++ struct inode *inode);
++
++/*
++ * Future: use high four bits of block for coalesce-on-delete flags
++ * Mask them off for now.
++ */
++
++static inline unsigned dx_get_block (struct dx_entry *entry)
++{
++ return le32_to_cpu(entry->block.v) & 0x00ffffff;
++}
++
++static inline void dx_set_block (struct dx_entry *entry, unsigned value)
++{
++ entry->block.v = cpu_to_le32(value);
++}
++
++static inline unsigned dx_get_hash (struct dx_entry *entry)
++{
++ return le32_to_cpu(entry->hash.v);
++}
++
++static inline void dx_set_hash (struct dx_entry *entry, unsigned value)
++{
++ entry->hash.v = cpu_to_le32(value);
++}
++
++static inline unsigned dx_get_count (struct dx_entry *entries)
++{
++ return le16_to_cpu(((struct dx_countlimit *) entries)->count.v);
++}
++
++static inline unsigned dx_get_limit (struct dx_entry *entries)
++{
++ return le16_to_cpu(((struct dx_countlimit *) entries)->limit.v);
++}
++
++static inline void dx_set_count (struct dx_entry *entries, unsigned value)
++{
++ ((struct dx_countlimit *) entries)->count.v = cpu_to_le16(value);
++}
++
++static inline void dx_set_limit (struct dx_entry *entries, unsigned value)
++{
++ ((struct dx_countlimit *) entries)->limit.v = cpu_to_le16(value);
++}
++
++static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
++{
++ unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) -
++ EXT3_DIR_REC_LEN(2) - infosize;
++ return 0? 20: entry_space / sizeof(struct dx_entry);
++}
++
++static inline unsigned dx_node_limit (struct inode *dir)
++{
++ unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0);
++ return 0? 22: entry_space / sizeof(struct dx_entry);
++}
++
++/*
++ * Debug
++ */
++#ifdef DX_DEBUG
++struct stats
++{
++ unsigned names;
++ unsigned space;
++ unsigned bcount;
++};
++
++static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3_dir_entry_2 *de,
++ int size, int show_names)
++{
++ unsigned names = 0, space = 0;
++ char *base = (char *) de;
++ struct dx_hash_info h = *hinfo;
++
++ printk("names: ");
++ while ((char *) de < base + size)
++ {
++ if (de->inode)
++ {
++ if (show_names)
++ {
++ int len = de->name_len;
++ char *name = de->name;
++ while (len--) printk("%c", *name++);
++ ext3fs_dirhash(de->name, de->name_len, &h);
++ printk(":%x.%u ", h.hash,
++ ((char *) de - base));
++ }
++ space += EXT3_DIR_REC_LEN(de->name_len);
++ names++;
++ }
++ de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
++ }
++ printk("(%i)\n", names);
++ return (struct stats) { names, space, 1 };
++}
++
++struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
++ struct dx_entry *entries, int levels)
++{
++ unsigned blocksize = dir->i_sb->s_blocksize;
++ unsigned count = dx_get_count (entries), names = 0, space = 0, i;
++ unsigned bcount = 0;
++ struct buffer_head *bh;
++ int err;
++ printk("%i indexed blocks...\n", count);
++ for (i = 0; i < count; i++, entries++)
++ {
++ u32 block = dx_get_block(entries), hash = i? dx_get_hash(entries): 0;
++ u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash;
++ struct stats stats;
++ printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range);
++ if (!(bh = ext3_bread (NULL,dir, block, 0,&err))) continue;
++ stats = levels?
++ dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1):
++ dx_show_leaf(hinfo, (struct ext3_dir_entry_2 *) bh->b_data, blocksize, 0);
++ names += stats.names;
++ space += stats.space;
++ bcount += stats.bcount;
++ brelse (bh);
++ }
++ if (bcount)
++ printk("%snames %u, fullness %u (%u%%)\n", levels?"":" ",
++ names, space/bcount,(space/bcount)*100/blocksize);
++ return (struct stats) { names, space, bcount};
++}
++#endif /* DX_DEBUG */
++
++/*
++ * Probe for a directory leaf block to search.
++ *
++ * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
++ * error in the directory index, and the caller should fall back to
++ * searching the directory normally. The callers of dx_probe **MUST**
++ * check for this error code, and make sure it never gets reflected
++ * back to userspace.
++ */
++static struct dx_frame *
++dx_probe(struct dentry *dentry, struct inode *dir,
++ struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
++{
++ unsigned count, indirect;
++ struct dx_entry *at, *entries, *p, *q, *m;
++ struct dx_root *root;
++ struct buffer_head *bh;
++ struct dx_frame *frame = frame_in;
++ u32 hash;
++
++ frame->bh = NULL;
++ if (dentry)
++ dir = dentry->d_parent->d_inode;
++ if (!(bh = ext3_bread (NULL,dir, 0, 0, err)))
++ goto fail;
++ root = (struct dx_root *) bh->b_data;
++ if (root->info.hash_version != DX_HASH_TEA &&
++ root->info.hash_version != DX_HASH_HALF_MD4 &&
++ root->info.hash_version != DX_HASH_LEGACY) {
++ ext3_warning(dir->i_sb, __FUNCTION__,
++ "Unrecognised inode hash code %d",
++ root->info.hash_version);
++ brelse(bh);
++ *err = ERR_BAD_DX_DIR;
++ goto fail;
++ }
++ hinfo->hash_version = root->info.hash_version;
++ hinfo->seed = dir->i_sb->u.ext3_sb.s_hash_seed;
++ if (dentry)
++ ext3fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo);
++ hash = hinfo->hash;
++
++ if (root->info.unused_flags & 1) {
++ ext3_warning(dir->i_sb, __FUNCTION__,
++ "Unimplemented inode hash flags: %#06x",
++ root->info.unused_flags);
++ brelse(bh);
++ *err = ERR_BAD_DX_DIR;
++ goto fail;
++ }
++
++ if ((indirect = root->info.indirect_levels) > 1) {
++ ext3_warning(dir->i_sb, __FUNCTION__,
++ "Unimplemented inode hash depth: %#06x",
++ root->info.indirect_levels);
++ brelse(bh);
++ *err = ERR_BAD_DX_DIR;
++ goto fail;
++ }
++
++ entries = (struct dx_entry *) (((char *)&root->info) +
++ root->info.info_length);
++ assert(dx_get_limit(entries) == dx_root_limit(dir,
++ root->info.info_length));
++ dxtrace (printk("Look up %x", hash));
++ while (1)
++ {
++ count = dx_get_count(entries);
++ assert (count && count <= dx_get_limit(entries));
++ p = entries + 1;
++ q = entries + count - 1;
++ while (p <= q)
++ {
++ m = p + (q - p)/2;
++ dxtrace(printk("."));
++ if (dx_get_hash(m) > hash)
++ q = m - 1;
++ else
++ p = m + 1;
++ }
++
++ if (0) // linear search cross check
++ {
++ unsigned n = count - 1;
++ at = entries;
++ while (n--)
++ {
++ dxtrace(printk(","));
++ if (dx_get_hash(++at) > hash)
++ {
++ at--;
++ break;
++ }
++ }
++ assert (at == p - 1);
++ }
++
++ at = p - 1;
++ dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at)));
++ frame->bh = bh;
++ frame->entries = entries;
++ frame->at = at;
++ if (!indirect--) return frame;
++ if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err)))
++ goto fail2;
++ at = entries = ((struct dx_node *) bh->b_data)->entries;
++ assert (dx_get_limit(entries) == dx_node_limit (dir));
++ frame++;
++ }
++fail2:
++ while (frame >= frame_in) {
++ brelse(frame->bh);
++ frame--;
++ }
++fail:
++ return NULL;
++}
++
++static void dx_release (struct dx_frame *frames)
++{
++ if (frames[0].bh == NULL)
++ return;
++
++ if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels)
++ brelse(frames[1].bh);
++ brelse(frames[0].bh);
++}
++
++/*
++ * This function increments the frame pointer to point at the next leaf
++ * block, and reads in the intervening index nodes when the search is to
++ * continue. Whether the search continues is controlled by the hash
++ * parameter: if the hash value is even, the search only continues when
++ * the next block starts with that hash value. This is used when
++ * searching for a specific file.
++ *
++ * If the hash value is HASH_NB_ALWAYS, then always go to the next block.
++ *
++ * This function returns 1 if the caller should continue to search,
++ * or 0 if it should not. If there is an error reading one of the
++ * index blocks, it will return -1.
++ *
++ * If start_hash is non-null, it will be filled in with the starting
++ * hash of the next page.
++ */
++static int ext3_htree_next_block(struct inode *dir, __u32 hash,
++ struct dx_frame *frame,
++ struct dx_frame *frames, int *err,
++ __u32 *start_hash)
++{
++ struct dx_frame *p;
++ struct buffer_head *bh;
++ int num_frames = 0;
++ __u32 bhash;
++
++ *err = ENOENT;
++ p = frame;
++ /*
++ * Find the next leaf page by incrementing the frame pointer.
++ * If we run out of entries in the interior node, loop around and
++ * increment the pointer in the parent node. When we break out of
++ * this loop, num_frames indicates the number of interior
++ * nodes that need to be read.
++ */
++ while (1) {
++ if (++(p->at) < p->entries + dx_get_count(p->entries))
++ break;
++ if (p == frames)
++ return 0;
++ num_frames++;
++ p--;
++ }
++
++ /*
++ * If the hash is 1, then continue only if the next page has a
++ * continuation hash of any value. This is used for readdir
++ * handling. Otherwise, check to see if the hash matches the
++ * desired continuation hash. If it doesn't, return, since
++ * there's no point in reading the successive index pages.
++ */
++ bhash = dx_get_hash(p->at);
++ if (start_hash)
++ *start_hash = bhash;
++ if ((hash & 1) == 0) {
++ if ((bhash & ~1) != hash)
++ return 0;
++ }
++ /*
++ * If the hash is HASH_NB_ALWAYS, we always go to the next
++ * block so no check is necessary
++ */
++ while (num_frames--) {
++ if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at),
++ 0, err)))
++ return -1; /* Failure */
++ p++;
++ brelse (p->bh);
++ p->bh = bh;
++ p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
++ }
++ return 1;
++}
++
++
++/*
++ * p is at least 6 bytes before the end of page
++ */
++static inline struct ext3_dir_entry_2 *ext3_next_entry(struct ext3_dir_entry_2 *p)
++{
++ return (struct ext3_dir_entry_2 *)((char*)p + le16_to_cpu(p->rec_len));
++}
++
++/*
++ * This function fills a red-black tree with information from a
++ * directory. We start scanning the directory in hash order, starting
++ * at start_hash and start_minor_hash.
++ *
++ * This function returns the number of entries inserted into the tree,
++ * or a negative error code.
++ */
++int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
++ __u32 start_minor_hash, __u32 *next_hash)
++{
++ struct dx_hash_info hinfo;
++ struct buffer_head *bh;
++ struct ext3_dir_entry_2 *de, *top;
++	struct dx_frame frames[2], *frame;	/* not static: readdirs on other dirs may race */
++ struct inode *dir;
++ int block, err;
++ int count = 0;
++ int ret;
++ __u32 hashval;
++
++ dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash,
++ start_minor_hash));
++ dir = dir_file->f_dentry->d_inode;
++ hinfo.hash = start_hash;
++ hinfo.minor_hash = 0;
++ frame = dx_probe(0, dir_file->f_dentry->d_inode, &hinfo, frames, &err);
++ if (!frame)
++ return err;
++
++ while (1) {
++ block = dx_get_block(frame->at);
++ dxtrace(printk("Reading block %d\n", block));
++ if (!(bh = ext3_bread (NULL, dir, block, 0, &err)))
++ goto errout;
++
++ de = (struct ext3_dir_entry_2 *) bh->b_data;
++ top = (struct ext3_dir_entry_2 *) ((char *) de + dir->i_sb->s_blocksize -
++ EXT3_DIR_REC_LEN(0));
++ for (; de < top; de = ext3_next_entry(de)) {
++ ext3fs_dirhash(de->name, de->name_len, &hinfo);
++ if ((hinfo.hash < start_hash) ||
++ ((hinfo.hash == start_hash) &&
++ (hinfo.minor_hash < start_minor_hash)))
++ continue;
++ ext3_htree_store_dirent(dir_file, hinfo.hash,
++ hinfo.minor_hash, de);
++ count++;
++ }
++ brelse (bh);
++ hashval = ~1;
++ ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS,
++ frame, frames, &err, &hashval);
++ if (next_hash)
++ *next_hash = hashval;
++ if (ret == -1)
++ goto errout;
++ /*
++ * Stop if: (a) there are no more entries, or
++ * (b) we have inserted at least one entry and the
++ * next hash value is not a continuation
++ */
++ if ((ret == 0) ||
++ (count && ((hashval & 1) == 0)))
++ break;
++ }
++ dx_release(frames);
++ dxtrace(printk("Fill tree: returned %d entries\n", count));
++ return count;
++errout:
++ dx_release(frames);
++ return (err);
++}
++
++
++/*
++ * Directory block splitting, compacting
++ */
++
++static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
++ struct dx_hash_info *hinfo, struct dx_map_entry *map_tail)
++{
++ int count = 0;
++ char *base = (char *) de;
++ struct dx_hash_info h = *hinfo;
++
++ while ((char *) de < base + size)
++ {
++ if (de->name_len && de->inode) {
++ ext3fs_dirhash(de->name, de->name_len, &h);
++ map_tail--;
++ map_tail->hash = h.hash;
++ map_tail->offs = (u32) ((char *) de - base);
++ count++;
++ }
++ /* XXX: do we need to check rec_len == 0 case? -Chris */
++ de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
++ }
++ return count;
++}
++
++static void dx_sort_map (struct dx_map_entry *map, unsigned count)
++{
++ struct dx_map_entry *p, *q, *top = map + count - 1;
++ int more;
++ /* Combsort until bubble sort doesn't suck */
++ while (count > 2)
++ {
++ count = count*10/13;
++ if (count - 9 < 2) /* 9, 10 -> 11 */
++ count = 11;
++ for (p = top, q = p - count; q >= map; p--, q--)
++ if (p->hash < q->hash)
++ swap(*p, *q);
++ }
++ /* Garden variety bubble sort */
++ do {
++ more = 0;
++ q = top;
++ while (q-- > map)
++ {
++ if (q[1].hash >= q[0].hash)
++ continue;
++ swap(*(q+1), *q);
++ more = 1;
++ }
++ } while(more);
++}
++
++static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block)
++{
++ struct dx_entry *entries = frame->entries;
++ struct dx_entry *old = frame->at, *new = old + 1;
++ int count = dx_get_count(entries);
++
++ assert(count < dx_get_limit(entries));
++ assert(old < entries + count);
++ memmove(new + 1, new, (char *)(entries + count) - (char *)(new));
++ dx_set_hash(new, hash);
++ dx_set_block(new, block);
++ dx_set_count(entries, count + 1);
++}
++#endif
++
++
++static void ext3_update_dx_flag(struct inode *inode)
++{
++ if (!EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
++ EXT3_FEATURE_COMPAT_DIR_INDEX))
++ EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL;
++}
++
+ /*
+ * NOTE! unlike strncmp, ext3_match returns 1 for success, 0 for failure.
+ *
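
dx_sort_map above is a comb sort whose shrinking gap hands off to a final bubble-sort pass. Because count is unsigned, the "count - 9 < 2" test fires only when count is exactly 9 or 10, which is what the "/* 9, 10 -> 11 */" comment records. The routine has no kernel dependencies, so its logic can be lifted into user space and exercised directly:

#include <assert.h>
#include <stdlib.h>

typedef unsigned int u32;

struct dx_map_entry {
	u32 hash;
	u32 offs;
};

#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0)

/* same logic as the kernel routine, minus the kernel types */
static void dx_sort_map(struct dx_map_entry *map, unsigned count)
{
	struct dx_map_entry *p, *q, *top = map + count - 1;
	int more;

	while (count > 2) {			/* comb sort phase */
		count = count * 10 / 13;
		if (count - 9 < 2)		/* 9, 10 -> 11 */
			count = 11;
		for (p = top, q = p - count; q >= map; p--, q--)
			if (p->hash < q->hash)
				swap(*p, *q);
	}
	do {					/* bubble sort phase */
		more = 0;
		q = top;
		while (q-- > map) {
			if (q[1].hash >= q[0].hash)
				continue;
			swap(*(q + 1), *q);
			more = 1;
		}
	} while (more);
}

int main(void)
{
	struct dx_map_entry map[200];
	unsigned i, n = sizeof(map) / sizeof(map[0]);

	for (i = 0; i < n; i++)
		map[i].hash = (u32)rand();
	dx_sort_map(map, n);
	for (i = 1; i < n; i++)
		assert(map[i - 1].hash <= map[i].hash);
	return 0;
}
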
+@@ -96,6 +726,7 @@
+ return 0;
+ }
+
++
+ /*
+ * ext3_find_entry()
+ *
+@@ -107,6 +738,8 @@
+ * The returned buffer_head has ->b_count elevated. The caller is expected
+ * to brelse() it when appropriate.
+ */
++
++
+ static struct buffer_head * ext3_find_entry (struct dentry *dentry,
+ struct ext3_dir_entry_2 ** res_dir)
+ {
+@@ -121,12 +754,32 @@
+ int num = 0;
+ int nblocks, i, err;
+ struct inode *dir = dentry->d_parent->d_inode;
++ int namelen;
++ const u8 *name;
++ unsigned blocksize;
+
+ *res_dir = NULL;
+ sb = dir->i_sb;
+-
++ blocksize = sb->s_blocksize;
++ namelen = dentry->d_name.len;
++ name = dentry->d_name.name;
++ if (namelen > EXT3_NAME_LEN)
++ return NULL;
++#ifdef CONFIG_EXT3_INDEX
++ if (is_dx(dir)) {
++ bh = ext3_dx_find_entry(dentry, res_dir, &err);
++ /*
++ * On success, or if the error was file not found,
++ * return. Otherwise, fall back to doing a search the
++ * old fashioned way.
++ */
++ if (bh || (err != ERR_BAD_DX_DIR))
++ return bh;
++ dxtrace(printk("ext3_find_entry: dx failed, falling back\n"));
++ }
++#endif
+ nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb);
+- start = dir->u.ext3_i.i_dir_start_lookup;
++ start = EXT3_I(dir)->i_dir_start_lookup;
+ if (start >= nblocks)
+ start = 0;
+ block = start;
+@@ -167,7 +820,7 @@
+ i = search_dirblock(bh, dir, dentry,
+ block << EXT3_BLOCK_SIZE_BITS(sb), res_dir);
+ if (i == 1) {
+- dir->u.ext3_i.i_dir_start_lookup = block;
++ EXT3_I(dir)->i_dir_start_lookup = block;
+ ret = bh;
+ goto cleanup_and_exit;
+ } else {
+@@ -198,6 +851,66 @@
+ return ret;
+ }
+
++#ifdef CONFIG_EXT3_INDEX
++static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
++ struct ext3_dir_entry_2 **res_dir, int *err)
++{
++ struct super_block * sb;
++ struct dx_hash_info hinfo;
++ u32 hash;
++ struct dx_frame frames[2], *frame;
++ struct ext3_dir_entry_2 *de, *top;
++ struct buffer_head *bh;
++ unsigned long block;
++ int retval;
++ int namelen = dentry->d_name.len;
++ const u8 *name = dentry->d_name.name;
++ struct inode *dir = dentry->d_parent->d_inode;
++
++ sb = dir->i_sb;
++ if (!(frame = dx_probe (dentry, 0, &hinfo, frames, err)))
++ return NULL;
++ hash = hinfo.hash;
++ do {
++ block = dx_get_block(frame->at);
++ if (!(bh = ext3_bread (NULL,dir, block, 0, err)))
++ goto errout;
++ de = (struct ext3_dir_entry_2 *) bh->b_data;
++ top = (struct ext3_dir_entry_2 *) ((char *) de + sb->s_blocksize -
++ EXT3_DIR_REC_LEN(0));
++ for (; de < top; de = ext3_next_entry(de))
++ if (ext3_match (namelen, name, de)) {
++ if (!ext3_check_dir_entry("ext3_find_entry",
++ dir, de, bh,
++ (block<<EXT3_BLOCK_SIZE_BITS(sb))
++ +((char *)de - bh->b_data))) {
++ brelse (bh);
++ goto errout;
++ }
++ *res_dir = de;
++ dx_release (frames);
++ return bh;
++ }
++ brelse (bh);
++ /* Check to see if we should continue to search */
++ retval = ext3_htree_next_block(dir, hash, frame,
++ frames, err, 0);
++ if (retval == -1) {
++ ext3_warning(sb, __FUNCTION__,
++ "error reading index page in directory #%lu",
++ dir->i_ino);
++ goto errout;
++ }
++ } while (retval == 1);
++
++ *err = -ENOENT;
++errout:
++ dxtrace(printk("%s not found\n", name));
++ dx_release (frames);
++ return NULL;
++}
++#endif
++
+ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry)
+ {
+ struct inode * inode;
+@@ -214,8 +927,9 @@
+ brelse (bh);
+ inode = iget(dir->i_sb, ino);
+
+- if (!inode)
++ if (!inode) {
+ return ERR_PTR(-EACCES);
++ }
+ }
+ d_add(dentry, inode);
+ return NULL;
+@@ -239,6 +953,301 @@
+ de->file_type = ext3_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
+ }
+
++#ifdef CONFIG_EXT3_INDEX
++static struct ext3_dir_entry_2 *
++dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
++{
++ unsigned rec_len = 0;
++
++ while (count--) {
++ struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs);
++ rec_len = EXT3_DIR_REC_LEN(de->name_len);
++ memcpy (to, de, rec_len);
++ ((struct ext3_dir_entry_2 *)to)->rec_len = cpu_to_le16(rec_len);
++ de->inode = 0;
++ map++;
++ to += rec_len;
++ }
++ return (struct ext3_dir_entry_2 *) (to - rec_len);
++}
++
++static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size)
++{
++ struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base;
++ unsigned rec_len = 0;
++
++ prev = to = de;
++ while ((char*)de < base + size) {
++ next = (struct ext3_dir_entry_2 *) ((char *) de +
++ le16_to_cpu(de->rec_len));
++ if (de->inode && de->name_len) {
++ rec_len = EXT3_DIR_REC_LEN(de->name_len);
++ if (de > to)
++ memmove(to, de, rec_len);
++ to->rec_len = cpu_to_le16(rec_len);
++ prev = to;
++ to = (struct ext3_dir_entry_2 *) (((char *) to) + rec_len);
++ }
++ de = next;
++ }
++ return prev;
++}
++
++static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
++ struct buffer_head **bh,struct dx_frame *frame,
++ struct dx_hash_info *hinfo, int *error)
++{
++ unsigned blocksize = dir->i_sb->s_blocksize;
++ unsigned count, continued;
++ struct buffer_head *bh2;
++ u32 newblock;
++ u32 hash2;
++ struct dx_map_entry *map;
++ char *data1 = (*bh)->b_data, *data2;
++ unsigned split;
++ struct ext3_dir_entry_2 *de = NULL, *de2;
++ int err;
++
++ bh2 = ext3_append (handle, dir, &newblock, error);
++ if (!(bh2)) {
++ brelse(*bh);
++ *bh = NULL;
++ goto errout;
++ }
++
++ BUFFER_TRACE(*bh, "get_write_access");
++ err = ext3_journal_get_write_access(handle, *bh);
++ if (err) {
++ journal_error:
++ brelse(*bh);
++ brelse(bh2);
++ *bh = NULL;
++ ext3_std_error(dir->i_sb, err);
++ goto errout;
++ }
++ BUFFER_TRACE(frame->bh, "get_write_access");
++ err = ext3_journal_get_write_access(handle, frame->bh);
++ if (err)
++ goto journal_error;
++
++ data2 = bh2->b_data;
++
++ /* create map in the end of data2 block */
++ map = (struct dx_map_entry *) (data2 + blocksize);
++ count = dx_make_map ((struct ext3_dir_entry_2 *) data1,
++ blocksize, hinfo, map);
++ map -= count;
++ split = count/2; // need to adjust to actual middle
++ dx_sort_map (map, count);
++ hash2 = map[split].hash;
++ continued = hash2 == map[split - 1].hash;
++ dxtrace(printk("Split block %i at %x, %i/%i\n",
++ dx_get_block(frame->at), hash2, split, count-split));
++
++ /* Fancy dance to stay within two buffers */
++ de2 = dx_move_dirents(data1, data2, map + split, count - split);
++ de = dx_pack_dirents(data1,blocksize);
++ de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
++ de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2);
++ dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1));
++ dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1));
++
++ /* Which block gets the new entry? */
++ if (hinfo->hash >= hash2)
++ {
++ swap(*bh, bh2);
++ de = de2;
++ }
++ dx_insert_block (frame, hash2 + continued, newblock);
++ err = ext3_journal_dirty_metadata (handle, bh2);
++ if (err)
++ goto journal_error;
++ err = ext3_journal_dirty_metadata (handle, frame->bh);
++ if (err)
++ goto journal_error;
++ brelse (bh2);
++ dxtrace(dx_show_index ("frame", frame->entries));
++errout:
++ return de;
++}
++#endif
++
++
++/*
++ * Add a new entry into a directory (leaf) block. If de is non-NULL,
++ * it points to a directory entry which is guaranteed to be large
++ * enough for the new directory entry. If de is NULL, then
++ * add_dirent_to_buf will attempt to search the directory block for
++ * space. It returns -ENOSPC if no space is available, -EIO if the
++ * block is corrupt, and -EEXIST if the directory entry already exists.
++ *
++ * NOTE! bh is NOT released in the case where ENOSPC is returned. In
++ * all other cases bh is released.
++ */
++static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
++ struct inode *inode, struct ext3_dir_entry_2 *de,
++ struct buffer_head * bh)
++{
++ struct inode *dir = dentry->d_parent->d_inode;
++ const char *name = dentry->d_name.name;
++ int namelen = dentry->d_name.len;
++ unsigned long offset = 0;
++ unsigned short reclen;
++ int nlen, rlen, err;
++ char *top;
++
++ reclen = EXT3_DIR_REC_LEN(namelen);
++ if (!de) {
++ de = (struct ext3_dir_entry_2 *)bh->b_data;
++ top = bh->b_data + dir->i_sb->s_blocksize - reclen;
++ while ((char *) de <= top) {
++ if (!ext3_check_dir_entry("ext3_add_entry", dir, de,
++ bh, offset)) {
++ brelse (bh);
++ return -EIO;
++ }
++ if (ext3_match (namelen, name, de)) {
++ brelse (bh);
++ return -EEXIST;
++ }
++ nlen = EXT3_DIR_REC_LEN(de->name_len);
++ rlen = le16_to_cpu(de->rec_len);
++ if ((de->inode? rlen - nlen: rlen) >= reclen)
++ break;
++ de = (struct ext3_dir_entry_2 *)((char *)de + rlen);
++ offset += rlen;
++ }
++ if ((char *) de > top)
++ return -ENOSPC;
++ }
++ BUFFER_TRACE(bh, "get_write_access");
++ err = ext3_journal_get_write_access(handle, bh);
++ if (err) {
++ ext3_std_error(dir->i_sb, err);
++ brelse(bh);
++ return err;
++ }
++
++ /* By now the buffer is marked for journaling */
++ nlen = EXT3_DIR_REC_LEN(de->name_len);
++ rlen = le16_to_cpu(de->rec_len);
++ if (de->inode) {
++ struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen);
++ de1->rec_len = cpu_to_le16(rlen - nlen);
++ de->rec_len = cpu_to_le16(nlen);
++ de = de1;
++ }
++ de->file_type = EXT3_FT_UNKNOWN;
++ if (inode) {
++ de->inode = cpu_to_le32(inode->i_ino);
++ ext3_set_de_type(dir->i_sb, de, inode->i_mode);
++ } else
++ de->inode = 0;
++ de->name_len = namelen;
++ memcpy (de->name, name, namelen);
++ /*
++ * XXX shouldn't update any times until successful
++ * completion of syscall, but too many callers depend
++ * on this.
++ *
++ * XXX similarly, too many callers depend on
++ * ext3_new_inode() setting the times, but error
++ * recovery deletes the inode, so the worst that can
++ * happen is that the times are slightly out of date
++ * and/or different from the directory change time.
++ */
++ dir->i_mtime = dir->i_ctime = CURRENT_TIME;
++ ext3_update_dx_flag(dir);
++ dir->i_version = ++event;
++ ext3_mark_inode_dirty(handle, dir);
++ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
++ err = ext3_journal_dirty_metadata(handle, bh);
++ if (err)
++ ext3_std_error(dir->i_sb, err);
++ brelse(bh);
++ return 0;
++}
++
++#ifdef CONFIG_EXT3_INDEX
++/*
++ * This converts a one block unindexed directory to a 3 block indexed
++ * directory, and adds the dentry to the indexed directory.
++ */
++static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
++ struct inode *inode, struct buffer_head *bh)
++{
++ struct inode *dir = dentry->d_parent->d_inode;
++ const char *name = dentry->d_name.name;
++ int namelen = dentry->d_name.len;
++ struct buffer_head *bh2;
++ struct dx_root *root;
++ struct dx_frame frames[2], *frame;
++ struct dx_entry *entries;
++ struct ext3_dir_entry_2 *de, *de2;
++ char *data1, *top;
++ unsigned len;
++ int retval;
++ unsigned blocksize;
++ struct dx_hash_info hinfo;
++ u32 block;
++
++ blocksize = dir->i_sb->s_blocksize;
++ dxtrace(printk("Creating index\n"));
++ retval = ext3_journal_get_write_access(handle, bh);
++ if (retval) {
++ ext3_std_error(dir->i_sb, retval);
++ brelse(bh);
++ return retval;
++ }
++ root = (struct dx_root *) bh->b_data;
++
++ EXT3_I(dir)->i_flags |= EXT3_INDEX_FL;
++ bh2 = ext3_append (handle, dir, &block, &retval);
++ if (!(bh2)) {
++ brelse(bh);
++ return retval;
++ }
++ data1 = bh2->b_data;
++
++ /* The 0th block becomes the root, move the dirents out */
++ de = (struct ext3_dir_entry_2 *)&root->dotdot;
++ de = (struct ext3_dir_entry_2 *)((char *)de + le16_to_cpu(de->rec_len));
++ len = ((char *) root) + blocksize - (char *) de;
++ memcpy (data1, de, len);
++ de = (struct ext3_dir_entry_2 *) data1;
++ top = data1 + len;
++	while ((char *) (de2 = ext3_next_entry(de)) < top)
++ de = de2;
++ de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
++ /* Initialize the root; the dot dirents already exist */
++ de = (struct ext3_dir_entry_2 *) (&root->dotdot);
++ de->rec_len = cpu_to_le16(blocksize - EXT3_DIR_REC_LEN(2));
++ memset (&root->info, 0, sizeof(root->info));
++ root->info.info_length = sizeof(root->info);
++ root->info.hash_version = dir->i_sb->u.ext3_sb.s_def_hash_version;
++ entries = root->entries;
++ dx_set_block (entries, 1);
++ dx_set_count (entries, 1);
++ dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info)));
++
++ /* Initialize as for dx_probe */
++ hinfo.hash_version = root->info.hash_version;
++ hinfo.seed = dir->i_sb->u.ext3_sb.s_hash_seed;
++ ext3fs_dirhash(name, namelen, &hinfo);
++ frame = frames;
++ frame->entries = entries;
++ frame->at = entries;
++ frame->bh = bh;
++ bh = bh2;
++ de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
++ dx_release (frames);
++ if (!(de))
++ return retval;
++
++ return add_dirent_to_buf(handle, dentry, inode, de, bh);
++}
++#endif
++
+ /*
+ * ext3_add_entry()
+ *
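
The space test in add_dirent_to_buf is all record-length arithmetic: a live entry needs EXT3_DIR_REC_LEN(name_len) bytes (an 8-byte header plus the name, rounded up to a multiple of 4), any slack beyond that can be split off for the new name, and a deleted entry (inode == 0) can be reused whole. A user-space check of that arithmetic, with the macro reproduced from ext3_fs.h and fits() as an illustrative stand-in for the in-kernel test:

#include <assert.h>
#include <stdio.h>

#define EXT3_DIR_PAD		4
#define EXT3_DIR_ROUND		(EXT3_DIR_PAD - 1)
#define EXT3_DIR_REC_LEN(name_len)	(((name_len) + 8 + EXT3_DIR_ROUND) & \
					 ~EXT3_DIR_ROUND)

/* Mirror of the check in add_dirent_to_buf: can a record of rlen
 * bytes, currently holding a cur_namelen-char name (in_use when
 * inode != 0), absorb a new entry with a namelen-char name? */
static int fits(int in_use, int rlen, int cur_namelen, int namelen)
{
	int nlen = EXT3_DIR_REC_LEN(cur_namelen);
	int reclen = EXT3_DIR_REC_LEN(namelen);

	return (in_use ? rlen - nlen : rlen) >= reclen;
}

int main(void)
{
	assert(EXT3_DIR_REC_LEN(1) == 12);	/* "." costs 12 bytes */
	assert(EXT3_DIR_REC_LEN(2) == 12);	/* so does ".." */

	/* 40-byte record with a 12-char name: 20 bytes of slack... */
	assert(fits(1, 40, 12, 4));		/* 4-char name needs 12 */
	assert(!fits(1, 40, 12, 20));		/* 20-char name needs 28 */

	/* a deleted 16-byte record holds an 8-char name exactly */
	assert(fits(0, 16, 3, 8));
	printf("dirent space checks hold\n");
	return 0;
}
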
+@@ -249,127 +1258,198 @@
+ * may not sleep between calling this and putting something into
+ * the entry, as someone else might have used it while you slept.
+ */
+-
+-/*
+- * AKPM: the journalling code here looks wrong on the error paths
+- */
+ static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
+ struct inode *inode)
+ {
+ struct inode *dir = dentry->d_parent->d_inode;
+- const char *name = dentry->d_name.name;
+- int namelen = dentry->d_name.len;
+ unsigned long offset;
+- unsigned short rec_len;
+ struct buffer_head * bh;
+- struct ext3_dir_entry_2 * de, * de1;
++ struct ext3_dir_entry_2 *de;
+ struct super_block * sb;
+ int retval;
++#ifdef CONFIG_EXT3_INDEX
++ int dx_fallback=0;
++#endif
++ unsigned blocksize;
++ unsigned nlen, rlen;
++ u32 block, blocks;
+
+ sb = dir->i_sb;
+-
+- if (!namelen)
++ blocksize = sb->s_blocksize;
++ if (!dentry->d_name.len)
+ return -EINVAL;
+- bh = ext3_bread (handle, dir, 0, 0, &retval);
++#ifdef CONFIG_EXT3_INDEX
++ if (is_dx(dir)) {
++ retval = ext3_dx_add_entry(handle, dentry, inode);
++ if (!retval || (retval != ERR_BAD_DX_DIR))
++ return retval;
++ EXT3_I(dir)->i_flags &= ~EXT3_INDEX_FL;
++ dx_fallback++;
++ ext3_mark_inode_dirty(handle, dir);
++ }
++#endif
++ blocks = dir->i_size >> sb->s_blocksize_bits;
++ for (block = 0, offset = 0; block < blocks; block++) {
++ bh = ext3_bread(handle, dir, block, 0, &retval);
++ if(!bh)
++ return retval;
++ retval = add_dirent_to_buf(handle, dentry, inode, 0, bh);
++ if (retval != -ENOSPC)
++ return retval;
++
++#ifdef CONFIG_EXT3_INDEX
++ if (blocks == 1 && !dx_fallback &&
++ EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX))
++ return make_indexed_dir(handle, dentry, inode, bh);
++#endif
++ brelse(bh);
++ }
++ bh = ext3_append(handle, dir, &block, &retval);
+ if (!bh)
+ return retval;
+- rec_len = EXT3_DIR_REC_LEN(namelen);
+- offset = 0;
+ de = (struct ext3_dir_entry_2 *) bh->b_data;
+- while (1) {
+- if ((char *)de >= sb->s_blocksize + bh->b_data) {
+- brelse (bh);
+- bh = NULL;
+- bh = ext3_bread (handle, dir,
+- offset >> EXT3_BLOCK_SIZE_BITS(sb), 1, &retval);
+- if (!bh)
+- return retval;
+- if (dir->i_size <= offset) {
+- if (dir->i_size == 0) {
+- brelse(bh);
+- return -ENOENT;
+- }
+-
+- ext3_debug ("creating next block\n");
+-
+- BUFFER_TRACE(bh, "get_write_access");
+- ext3_journal_get_write_access(handle, bh);
+- de = (struct ext3_dir_entry_2 *) bh->b_data;
+- de->inode = 0;
+- de->rec_len = le16_to_cpu(sb->s_blocksize);
+- dir->u.ext3_i.i_disksize =
+- dir->i_size = offset + sb->s_blocksize;
+- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
+- ext3_mark_inode_dirty(handle, dir);
+- } else {
+-
+- ext3_debug ("skipping to next block\n");
++ de->inode = 0;
++ de->rec_len = cpu_to_le16(rlen = blocksize);
++ nlen = 0;
++ return add_dirent_to_buf(handle, dentry, inode, de, bh);
++}
+
+- de = (struct ext3_dir_entry_2 *) bh->b_data;
+- }
+- }
+- if (!ext3_check_dir_entry ("ext3_add_entry", dir, de, bh,
+- offset)) {
+- brelse (bh);
+- return -ENOENT;
+- }
+- if (ext3_match (namelen, name, de)) {
+- brelse (bh);
+- return -EEXIST;
++#ifdef CONFIG_EXT3_INDEX
++/*
++ * Returns 0 for success, or a negative error value
++ */
++static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
++ struct inode *inode)
++{
++ struct dx_frame frames[2], *frame;
++ struct dx_entry *entries, *at;
++ struct dx_hash_info hinfo;
++ struct buffer_head * bh;
++ struct inode *dir = dentry->d_parent->d_inode;
++ struct super_block * sb = dir->i_sb;
++ struct ext3_dir_entry_2 *de;
++ int err;
++
++ frame = dx_probe(dentry, 0, &hinfo, frames, &err);
++ if (!frame)
++ return err;
++ entries = frame->entries;
++ at = frame->at;
++
++ if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
++ goto cleanup;
++
++ BUFFER_TRACE(bh, "get_write_access");
++ err = ext3_journal_get_write_access(handle, bh);
++ if (err)
++ goto journal_error;
++
++ err = add_dirent_to_buf(handle, dentry, inode, 0, bh);
++ if (err != -ENOSPC) {
++ bh = 0;
++ goto cleanup;
++ }
++
++ /* Block full, should compress but for now just split */
++ dxtrace(printk("using %u of %u node entries\n",
++ dx_get_count(entries), dx_get_limit(entries)));
++ /* Need to split index? */
++ if (dx_get_count(entries) == dx_get_limit(entries)) {
++ u32 newblock;
++ unsigned icount = dx_get_count(entries);
++ int levels = frame - frames;
++ struct dx_entry *entries2;
++ struct dx_node *node2;
++ struct buffer_head *bh2;
++
++ if (levels && (dx_get_count(frames->entries) ==
++ dx_get_limit(frames->entries))) {
++ ext3_warning(sb, __FUNCTION__,
++ "Directory index full!\n");
++ err = -ENOSPC;
++ goto cleanup;
+ }
+- if ((le32_to_cpu(de->inode) == 0 &&
+- le16_to_cpu(de->rec_len) >= rec_len) ||
+- (le16_to_cpu(de->rec_len) >=
+- EXT3_DIR_REC_LEN(de->name_len) + rec_len)) {
+- BUFFER_TRACE(bh, "get_write_access");
+- ext3_journal_get_write_access(handle, bh);
+- /* By now the buffer is marked for journaling */
+- offset += le16_to_cpu(de->rec_len);
+- if (le32_to_cpu(de->inode)) {
+- de1 = (struct ext3_dir_entry_2 *) ((char *) de +
+- EXT3_DIR_REC_LEN(de->name_len));
+- de1->rec_len =
+- cpu_to_le16(le16_to_cpu(de->rec_len) -
+- EXT3_DIR_REC_LEN(de->name_len));
+- de->rec_len = cpu_to_le16(
+- EXT3_DIR_REC_LEN(de->name_len));
+- de = de1;
++
++ bh2 = ext3_append (handle, dir, &newblock, &err);
++ if (!(bh2))
++ goto cleanup;
++ node2 = (struct dx_node *)(bh2->b_data);
++ entries2 = node2->entries;
++ node2->fake.rec_len = cpu_to_le16(sb->s_blocksize);
++ node2->fake.inode = 0;
++ BUFFER_TRACE(frame->bh, "get_write_access");
++ err = ext3_journal_get_write_access(handle, frame->bh);
++ if (err)
++ goto journal_error;
++ if (levels) {
++ unsigned icount1 = icount/2, icount2 = icount - icount1;
++ unsigned hash2 = dx_get_hash(entries + icount1);
++ dxtrace(printk("Split index %i/%i\n", icount1, icount2));
++
++ BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
++ err = ext3_journal_get_write_access(handle,
++ frames[0].bh);
++ if (err)
++ goto journal_error;
++
++			memcpy ((char *) entries2, (char *) (entries + icount1),
++				icount2 * sizeof(struct dx_entry));
++ dx_set_count (entries, icount1);
++ dx_set_count (entries2, icount2);
++ dx_set_limit (entries2, dx_node_limit(dir));
++
++ /* Which index block gets the new entry? */
++ if (at - entries >= icount1) {
++ frame->at = at = at - entries - icount1 + entries2;
++ frame->entries = entries = entries2;
++ swap(frame->bh, bh2);
+ }
+- de->file_type = EXT3_FT_UNKNOWN;
+- if (inode) {
+- de->inode = cpu_to_le32(inode->i_ino);
+- ext3_set_de_type(dir->i_sb, de, inode->i_mode);
+- } else
+- de->inode = 0;
+- de->name_len = namelen;
+- memcpy (de->name, name, namelen);
+- /*
+- * XXX shouldn't update any times until successful
+- * completion of syscall, but too many callers depend
+- * on this.
+- *
+- * XXX similarly, too many callers depend on
+- * ext3_new_inode() setting the times, but error
+- * recovery deletes the inode, so the worst that can
+- * happen is that the times are slightly out of date
+- * and/or different from the directory change time.
+- */
+- dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
+- ext3_mark_inode_dirty(handle, dir);
+- dir->i_version = ++event;
+- BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+- ext3_journal_dirty_metadata(handle, bh);
+- brelse(bh);
+- return 0;
++ dx_insert_block (frames + 0, hash2, newblock);
++ dxtrace(dx_show_index ("node", frames[1].entries));
++ dxtrace(dx_show_index ("node",
++ ((struct dx_node *) bh2->b_data)->entries));
++ err = ext3_journal_dirty_metadata(handle, bh2);
++ if (err)
++ goto journal_error;
++ brelse (bh2);
++ } else {
++ dxtrace(printk("Creating second level index...\n"));
++ memcpy((char *) entries2, (char *) entries,
++ icount * sizeof(struct dx_entry));
++ dx_set_limit(entries2, dx_node_limit(dir));
++
++ /* Set up root */
++ dx_set_count(entries, 1);
++ dx_set_block(entries + 0, newblock);
++ ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
++
++ /* Add new access path frame */
++ frame = frames + 1;
++ frame->at = at = at - entries + entries2;
++ frame->entries = entries = entries2;
++ frame->bh = bh2;
++ err = ext3_journal_get_write_access(handle,
++ frame->bh);
++ if (err)
++ goto journal_error;
+ }
+- offset += le16_to_cpu(de->rec_len);
+- de = (struct ext3_dir_entry_2 *)
+- ((char *) de + le16_to_cpu(de->rec_len));
++ ext3_journal_dirty_metadata(handle, frames[0].bh);
+ }
+- brelse (bh);
+- return -ENOSPC;
++ de = do_split(handle, dir, &bh, frame, &hinfo, &err);
++ if (!de)
++ goto cleanup;
++ err = add_dirent_to_buf(handle, dentry, inode, de, bh);
++ bh = 0;
++ goto cleanup;
++
++journal_error:
++ ext3_std_error(dir->i_sb, err);
++cleanup:
++ if (bh)
++ brelse(bh);
++ dx_release(frames);
++ return err;
+ }
++#endif
+
+ /*
+ * ext3_delete_entry deletes a directory entry by merging it with the
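
ext3_dx_add_entry caps the tree at two levels, and the limit arithmetic explains why that is plenty: a dx_entry is 8 bytes, so with 4 KB blocks the root holds 508 entries once the fake dot/dotdot dirents and the 8-byte dx_root_info header are paid for, and each second-level node holds 511. A quick computation of the resulting fan-out (constants as in the patch):

#include <stdio.h>

#define EXT3_DIR_ROUND			3
#define EXT3_DIR_REC_LEN(name_len)	(((name_len) + 8 + EXT3_DIR_ROUND) & \
					 ~EXT3_DIR_ROUND)

struct dx_entry { unsigned int hash, block; };	/* 8 bytes on disk */

int main(void)
{
	unsigned bs = 4096, infosize = 8;	/* sizeof(struct dx_root_info) */
	unsigned root_limit = (bs - EXT3_DIR_REC_LEN(1) -
			       EXT3_DIR_REC_LEN(2) - infosize) /
			      sizeof(struct dx_entry);
	unsigned node_limit = (bs - EXT3_DIR_REC_LEN(0)) /
			      sizeof(struct dx_entry);

	printf("root: %u entries, node: %u entries\n", root_limit, node_limit);
	/* 256 4 KB blocks per megabyte */
	printf("two levels reach %u leaf blocks (~%u MB of dirents)\n",
	       root_limit * node_limit, root_limit * node_limit / 256);
	return 0;
}
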
+@@ -453,9 +1533,11 @@
+ struct inode * inode;
+ int err;
+
+- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3);
+- if (IS_ERR(handle))
++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3);
++ if (IS_ERR(handle)) {
+ return PTR_ERR(handle);
++ }
+
+ if (IS_SYNC(dir))
+ handle->h_sync = 1;
+@@ -480,9 +1562,11 @@
+ struct inode *inode;
+ int err;
+
+- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3);
+- if (IS_ERR(handle))
++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3);
++ if (IS_ERR(handle)) {
+ return PTR_ERR(handle);
++ }
+
+ if (IS_SYNC(dir))
+ handle->h_sync = 1;
+@@ -512,9 +1596,11 @@
+ if (dir->i_nlink >= EXT3_LINK_MAX)
+ return -EMLINK;
+
+- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3);
+- if (IS_ERR(handle))
++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3);
++ if (IS_ERR(handle)) {
+ return PTR_ERR(handle);
++ }
+
+ if (IS_SYNC(dir))
+ handle->h_sync = 1;
+@@ -526,7 +1612,8 @@
+
+ inode->i_op = &ext3_dir_inode_operations;
+ inode->i_fop = &ext3_dir_operations;
+- inode->i_size = inode->u.ext3_i.i_disksize = inode->i_sb->s_blocksize;
++ inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
++ inode->i_blocks = 0;
+ dir_block = ext3_bread (handle, inode, 0, 1, &err);
+ if (!dir_block) {
+ inode->i_nlink--; /* is this nlink == 0? */
+@@ -555,21 +1642,19 @@
+ brelse (dir_block);
+ ext3_mark_inode_dirty(handle, inode);
+ err = ext3_add_entry (handle, dentry, inode);
+- if (err)
+- goto out_no_entry;
++ if (err) {
++ inode->i_nlink = 0;
++ ext3_mark_inode_dirty(handle, inode);
++ iput (inode);
++ goto out_stop;
++ }
+ dir->i_nlink++;
+- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
++ ext3_update_dx_flag(dir);
+ ext3_mark_inode_dirty(handle, dir);
+ d_instantiate(dentry, inode);
+ out_stop:
+ ext3_journal_stop(handle, dir);
+ return err;
+-
+-out_no_entry:
+- inode->i_nlink = 0;
+- ext3_mark_inode_dirty(handle, inode);
+- iput (inode);
+- goto out_stop;
+ }
+
+ /*
+@@ -656,7 +1741,7 @@
+ int err = 0, rc;
+
+ lock_super(sb);
+- if (!list_empty(&inode->u.ext3_i.i_orphan))
++ if (!list_empty(&EXT3_I(inode)->i_orphan))
+ goto out_unlock;
+
+ /* Orphan handling is only valid for files with data blocks
+@@ -697,7 +1782,7 @@
+ * This is safe: on error we're going to ignore the orphan list
+ * anyway on the next recovery. */
+ if (!err)
+- list_add(&inode->u.ext3_i.i_orphan, &EXT3_SB(sb)->s_orphan);
++ list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan);
+
+ jbd_debug(4, "superblock will point to %ld\n", inode->i_ino);
+ jbd_debug(4, "orphan inode %ld will point to %d\n",
+@@ -715,25 +1800,26 @@
+ int ext3_orphan_del(handle_t *handle, struct inode *inode)
+ {
+ struct list_head *prev;
++ struct ext3_inode_info *ei = EXT3_I(inode);
+ struct ext3_sb_info *sbi;
+ ino_t ino_next;
+ struct ext3_iloc iloc;
+ int err = 0;
+
+ lock_super(inode->i_sb);
+- if (list_empty(&inode->u.ext3_i.i_orphan)) {
++ if (list_empty(&ei->i_orphan)) {
+ unlock_super(inode->i_sb);
+ return 0;
+ }
+
+ ino_next = NEXT_ORPHAN(inode);
+- prev = inode->u.ext3_i.i_orphan.prev;
++ prev = ei->i_orphan.prev;
+ sbi = EXT3_SB(inode->i_sb);
+
+ jbd_debug(4, "remove inode %ld from orphan list\n", inode->i_ino);
+
+- list_del(&inode->u.ext3_i.i_orphan);
+- INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan);
++ list_del(&ei->i_orphan);
++ INIT_LIST_HEAD(&ei->i_orphan);
+
+ /* If we're on an error path, we may not have a valid
+ * transaction handle with which to update the orphan list on
+@@ -794,8 +1880,9 @@
+ handle_t *handle;
+
+ handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS);
+- if (IS_ERR(handle))
++ if (IS_ERR(handle)) {
+ return PTR_ERR(handle);
++ }
+
+ retval = -ENOENT;
+ bh = ext3_find_entry (dentry, &de);
+@@ -833,7 +1920,7 @@
+ ext3_mark_inode_dirty(handle, inode);
+ dir->i_nlink--;
+ inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
++ ext3_update_dx_flag(dir);
+ ext3_mark_inode_dirty(handle, dir);
+
+ end_rmdir:
+@@ -851,8 +1938,9 @@
+ handle_t *handle;
+
+ handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS);
+- if (IS_ERR(handle))
++ if (IS_ERR(handle)) {
+ return PTR_ERR(handle);
++ }
+
+ if (IS_SYNC(dir))
+ handle->h_sync = 1;
+@@ -879,7 +1967,7 @@
+ if (retval)
+ goto end_unlink;
+ dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
++ ext3_update_dx_flag(dir);
+ ext3_mark_inode_dirty(handle, dir);
+ inode->i_nlink--;
+ if (!inode->i_nlink)
+@@ -905,9 +1993,11 @@
+ if (l > dir->i_sb->s_blocksize)
+ return -ENAMETOOLONG;
+
+- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 5);
+- if (IS_ERR(handle))
++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5);
++ if (IS_ERR(handle)) {
+ return PTR_ERR(handle);
++ }
+
+ if (IS_SYNC(dir))
+ handle->h_sync = 1;
+@@ -917,7 +2007,7 @@
+ if (IS_ERR(inode))
+ goto out_stop;
+
+- if (l > sizeof (inode->u.ext3_i.i_data)) {
++ if (l > sizeof (EXT3_I(inode)->i_data)) {
+ inode->i_op = &ext3_symlink_inode_operations;
+ inode->i_mapping->a_ops = &ext3_aops;
+ /*
+@@ -926,25 +2016,23 @@
+ * i_size in generic_commit_write().
+ */
+ err = block_symlink(inode, symname, l);
+- if (err)
+- goto out_no_entry;
++ if (err) {
++ ext3_dec_count(handle, inode);
++ ext3_mark_inode_dirty(handle, inode);
++ iput (inode);
++ goto out_stop;
++ }
+ } else {
+ inode->i_op = &ext3_fast_symlink_inode_operations;
+- memcpy((char*)&inode->u.ext3_i.i_data,symname,l);
++ memcpy((char*)&EXT3_I(inode)->i_data,symname,l);
+ inode->i_size = l-1;
+ }
+- inode->u.ext3_i.i_disksize = inode->i_size;
++ EXT3_I(inode)->i_disksize = inode->i_size;
+ ext3_mark_inode_dirty(handle, inode);
+ err = ext3_add_nondir(handle, dentry, inode);
+ out_stop:
+ ext3_journal_stop(handle, dir);
+ return err;
+-
+-out_no_entry:
+- ext3_dec_count(handle, inode);
+- ext3_mark_inode_dirty(handle, inode);
+- iput (inode);
+- goto out_stop;
+ }
+
+ static int ext3_link (struct dentry * old_dentry,
+@@ -957,12 +2045,15 @@
+ if (S_ISDIR(inode->i_mode))
+ return -EPERM;
+
+- if (inode->i_nlink >= EXT3_LINK_MAX)
++ if (inode->i_nlink >= EXT3_LINK_MAX) {
+ return -EMLINK;
++ }
+
+- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS);
+- if (IS_ERR(handle))
++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
++ EXT3_INDEX_EXTRA_TRANS_BLOCKS);
++ if (IS_ERR(handle)) {
+ return PTR_ERR(handle);
++ }
+
+ if (IS_SYNC(dir))
+ handle->h_sync = 1;
+@@ -996,9 +2087,11 @@
+
+ old_bh = new_bh = dir_bh = NULL;
+
+- handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS + 2);
+- if (IS_ERR(handle))
++ handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS +
++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2);
++ if (IS_ERR(handle)) {
+ return PTR_ERR(handle);
++ }
+
+ if (IS_SYNC(old_dir) || IS_SYNC(new_dir))
+ handle->h_sync = 1;
+@@ -1078,7 +2171,7 @@
+ new_inode->i_ctime = CURRENT_TIME;
+ }
+ old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
+- old_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
++ ext3_update_dx_flag(old_dir);
+ if (dir_bh) {
+ BUFFER_TRACE(dir_bh, "get_write_access");
+ ext3_journal_get_write_access(handle, dir_bh);
+@@ -1090,7 +2183,7 @@
+ new_inode->i_nlink--;
+ } else {
+ new_dir->i_nlink++;
+- new_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
++ ext3_update_dx_flag(new_dir);
+ ext3_mark_inode_dirty(handle, new_dir);
+ }
+ }
+Index: linux-2.4.19.SuSE/fs/ext3/super.c
+===================================================================
+--- linux-2.4.19.SuSE.orig/fs/ext3/super.c 2004-05-27 11:07:21.000000000 -0700
++++ linux-2.4.19.SuSE/fs/ext3/super.c 2004-05-27 11:08:28.000000000 -0700
+@@ -741,6 +741,7 @@
+ es->s_mtime = cpu_to_le32(CURRENT_TIME);
+ ext3_update_dynamic_rev(sb);
+ EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
++
+ ext3_commit_super (sb, es, 1);
+ if (test_opt (sb, DEBUG))
+ printk (KERN_INFO
+@@ -751,6 +752,7 @@
+ EXT3_BLOCKS_PER_GROUP(sb),
+ EXT3_INODES_PER_GROUP(sb),
+ sbi->s_mount_opt);
++
+ printk(KERN_INFO "EXT3 FS " EXT3FS_VERSION ", " EXT3FS_DATE " on %s, ",
+ bdevname(sb->s_dev));
+ if (EXT3_SB(sb)->s_journal->j_inode == NULL) {
+@@ -925,6 +927,7 @@
+ return res;
+ }
+
++
+ struct super_block * ext3_read_super (struct super_block * sb, void * data,
+ int silent)
+ {
+@@ -1113,6 +1116,9 @@
+ sbi->s_mount_state = le16_to_cpu(es->s_state);
+ sbi->s_addr_per_block_bits = log2(EXT3_ADDR_PER_BLOCK(sb));
+ sbi->s_desc_per_block_bits = log2(EXT3_DESC_PER_BLOCK(sb));
++ for (i=0; i < 4; i++)
++ sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
++ sbi->s_def_hash_version = es->s_def_hash_version;
+
+ if (sbi->s_blocks_per_group > blocksize * 8) {
+ printk (KERN_ERR
+@@ -1821,6 +1827,7 @@
+ exit_ext3_xattr();
+ }
+
++EXPORT_SYMBOL(ext3_force_commit);
+ EXPORT_SYMBOL(ext3_bread);
+
+ MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
+Index: linux-2.4.19.SuSE/fs/ext3/file.c
+===================================================================
+--- linux-2.4.19.SuSE.orig/fs/ext3/file.c 2002-12-04 09:46:18.000000000 -0800
++++ linux-2.4.19.SuSE/fs/ext3/file.c 2004-05-27 11:08:28.000000000 -0700
+@@ -38,6 +38,9 @@
+ {
+ if (filp->f_mode & FMODE_WRITE)
+ ext3_discard_prealloc (inode);
++ if (is_dx(inode) && filp->private_data)
++ ext3_htree_free_dir_info(filp->private_data);
++
+ return 0;
+ }
+
+Index: linux-2.4.19.SuSE/fs/ext3/hash.c
+===================================================================
+--- linux-2.4.19.SuSE.orig/fs/ext3/hash.c 1970-01-02 14:15:01.000000000 -0800
++++ linux-2.4.19.SuSE/fs/ext3/hash.c 2004-05-27 11:08:28.000000000 -0700
+@@ -0,0 +1,215 @@
++/*
++ * linux/fs/ext3/hash.c
++ *
++ * Copyright (C) 2002 by Theodore Ts'o
++ *
++ * This file is released under the GPL v2.
++ *
++ * This file may be redistributed under the terms of the GNU Public
++ * License.
++ */
++
++#include <linux/fs.h>
++#include <linux/jbd.h>
++#include <linux/sched.h>
++#include <linux/ext3_fs.h>
++
++#define DELTA 0x9E3779B9
++
++static void TEA_transform(__u32 buf[4], __u32 const in[])
++{
++ __u32 sum = 0;
++ __u32 b0 = buf[0], b1 = buf[1];
++ __u32 a = in[0], b = in[1], c = in[2], d = in[3];
++ int n = 16;
++
++ do {
++ sum += DELTA;
++ b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
++ b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
++ } while(--n);
++
++ buf[0] += b0;
++ buf[1] += b1;
++}
++
++/* F, G and H are basic MD4 functions: selection, majority, parity */
++#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
++#define G(x, y, z) (((x) & (y)) + (((x) ^ (y)) & (z)))
++#define H(x, y, z) ((x) ^ (y) ^ (z))
++
++/*
++ * The generic round function. The application is so specific that
++ * we don't bother protecting all the arguments with parens, as is generally
++ * good macro practice, in favor of extra legibility.
++ * Rotation is separate from addition to prevent recomputation
++ */
++#define ROUND(f, a, b, c, d, x, s) \
++ (a += f(b, c, d) + x, a = (a << s) | (a >> (32-s)))
++#define K1 0
++#define K2 013240474631UL
++#define K3 015666365641UL
++
++/*
++ * Basic cut-down MD4 transform. Returns only 32 bits of result.
++ */
++static void halfMD4Transform (__u32 buf[4], __u32 const in[])
++{
++ __u32 a = buf[0], b = buf[1], c = buf[2], d = buf[3];
++
++ /* Round 1 */
++ ROUND(F, a, b, c, d, in[0] + K1, 3);
++ ROUND(F, d, a, b, c, in[1] + K1, 7);
++ ROUND(F, c, d, a, b, in[2] + K1, 11);
++ ROUND(F, b, c, d, a, in[3] + K1, 19);
++ ROUND(F, a, b, c, d, in[4] + K1, 3);
++ ROUND(F, d, a, b, c, in[5] + K1, 7);
++ ROUND(F, c, d, a, b, in[6] + K1, 11);
++ ROUND(F, b, c, d, a, in[7] + K1, 19);
++
++ /* Round 2 */
++ ROUND(G, a, b, c, d, in[1] + K2, 3);
++ ROUND(G, d, a, b, c, in[3] + K2, 5);
++ ROUND(G, c, d, a, b, in[5] + K2, 9);
++ ROUND(G, b, c, d, a, in[7] + K2, 13);
++ ROUND(G, a, b, c, d, in[0] + K2, 3);
++ ROUND(G, d, a, b, c, in[2] + K2, 5);
++ ROUND(G, c, d, a, b, in[4] + K2, 9);
++ ROUND(G, b, c, d, a, in[6] + K2, 13);
++
++ /* Round 3 */
++ ROUND(H, a, b, c, d, in[3] + K3, 3);
++ ROUND(H, d, a, b, c, in[7] + K3, 9);
++ ROUND(H, c, d, a, b, in[2] + K3, 11);
++ ROUND(H, b, c, d, a, in[6] + K3, 15);
++ ROUND(H, a, b, c, d, in[1] + K3, 3);
++ ROUND(H, d, a, b, c, in[5] + K3, 9);
++ ROUND(H, c, d, a, b, in[0] + K3, 11);
++ ROUND(H, b, c, d, a, in[4] + K3, 15);
++
++ buf[0] += a;
++ buf[1] += b;
++ buf[2] += c;
++ buf[3] += d;
++}
++
++#undef ROUND
++#undef F
++#undef G
++#undef H
++#undef K1
++#undef K2
++#undef K3
++
++/* The old legacy hash */
++static __u32 dx_hack_hash (const char *name, int len)
++{
++ __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
++ while (len--) {
++ __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373));
++
++ if (hash & 0x80000000) hash -= 0x7fffffff;
++ hash1 = hash0;
++ hash0 = hash;
++ }
++ return (hash0 << 1);
++}
++
++static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
++{
++ __u32 pad, val;
++ int i;
++
++ pad = (__u32)len | ((__u32)len << 8);
++ pad |= pad << 16;
++
++ val = pad;
++ if (len > num*4)
++ len = num * 4;
++ for (i=0; i < len; i++) {
++ if ((i % 4) == 0)
++ val = pad;
++ val = msg[i] + (val << 8);
++ if ((i % 4) == 3) {
++ *buf++ = val;
++ val = pad;
++ num--;
++ }
++ }
++ if (--num >= 0)
++ *buf++ = val;
++ while (--num >= 0)
++ *buf++ = pad;
++}
++
++/*
++ * Returns the hash of a filename. If len is 0 and name is NULL, then
++ * this function can be used to test whether or not a hash version is
++ * supported.
++ *
++ * The seed is a 4-longword (32 bits each) "secret" which can be used to
++ * uniquify a hash. If the seed is all zeros, then some default seed
++ * may be used.
++ *
++ * A particular hash version specifies whether or not the seed is
++ * represented, and whether or not the returned hash is 32 bits or 64
++ * bits. 32 bit hashes will return 0 for the minor hash.
++ */
++int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
++{
++ __u32 hash;
++ __u32 minor_hash = 0;
++ const char *p;
++ int i;
++ __u32 in[8], buf[4];
++
++ /* Initialize the default seed for the hash checksum functions */
++ buf[0] = 0x67452301;
++ buf[1] = 0xefcdab89;
++ buf[2] = 0x98badcfe;
++ buf[3] = 0x10325476;
++
++ /* Check to see if the seed is all zeros */
++ if (hinfo->seed) {
++ for (i=0; i < 4; i++) {
++ if (hinfo->seed[i])
++ break;
++ }
++ if (i < 4)
++ memcpy(buf, hinfo->seed, sizeof(buf));
++ }
++
++ switch (hinfo->hash_version) {
++ case DX_HASH_LEGACY:
++ hash = dx_hack_hash(name, len);
++ break;
++ case DX_HASH_HALF_MD4:
++ p = name;
++ while (len > 0) {
++ str2hashbuf(p, len, in, 8);
++ halfMD4Transform(buf, in);
++ len -= 32;
++ p += 32;
++ }
++ minor_hash = buf[2];
++ hash = buf[1];
++ break;
++ case DX_HASH_TEA:
++ p = name;
++ while (len > 0) {
++ str2hashbuf(p, len, in, 4);
++ TEA_transform(buf, in);
++ len -= 16;
++ p += 16;
++ }
++ hash = buf[0];
++ minor_hash = buf[1];
++ break;
++ default:
++ hinfo->hash = 0;
++ return -1;
++ }
++ hinfo->hash = hash & ~1;
++ hinfo->minor_hash = minor_hash;
++ return 0;
++}
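+
+ The three hash variants above all reduce a filename to a 31-bit major
+ hash; ext3fs_dirhash() clears the low bit (hash & ~1), which htree
+ reserves as a collision-continuation marker. As a rough illustration,
+ here is a standalone userspace harness for the legacy hash, with
+ dx_hack_hash copied verbatim from the patch; the driver in main() is
+ purely hypothetical:
+
+ #include <stdio.h>
+ #include <string.h>
+
+ typedef unsigned int __u32;	/* stand-in for the kernel type */
+
+ /* the legacy hash, exactly as in hash.c above */
+ static __u32 dx_hack_hash (const char *name, int len)
+ {
+         __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
+         while (len--) {
+                 __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373));
+
+                 if (hash & 0x80000000) hash -= 0x7fffffff;
+                 hash1 = hash0;
+                 hash0 = hash;
+         }
+         return (hash0 << 1);
+ }
+
+ int main(void)
+ {
+         const char *name = "lost+found";
+
+         /* mask the low bit, as ext3fs_dirhash() does for DX_HASH_LEGACY */
+         printf("%s -> 0x%08x\n", name,
+                dx_hack_hash(name, strlen(name)) & ~1u);
+         return 0;
+ }
+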
+Index: linux-2.4.19.SuSE/lib/rbtree.c
+===================================================================
+--- linux-2.4.19.SuSE.orig/lib/rbtree.c 2002-08-02 17:39:46.000000000 -0700
++++ linux-2.4.19.SuSE/lib/rbtree.c 2004-05-27 11:08:28.000000000 -0700
+@@ -17,6 +17,8 @@
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+ linux/lib/rbtree.c
++
++ rb_get_first and rb_get_next written by Theodore Ts'o, 9/8/2002
+ */
+
+ #include <linux/rbtree.h>
+@@ -294,3 +296,43 @@
+ __rb_erase_color(child, parent, root);
+ }
+ EXPORT_SYMBOL(rb_erase);
++
++/*
++ * This function returns the first node (in sort order) of the tree.
++ */
++rb_node_t *rb_get_first(rb_root_t *root)
++{
++ rb_node_t *n;
++
++ n = root->rb_node;
++ if (!n)
++ return 0;
++ while (n->rb_left)
++ n = n->rb_left;
++ return n;
++}
++EXPORT_SYMBOL(rb_get_first);
++
++/*
++ * Given a node, this function will return the next node in the tree.
++ */
++rb_node_t *rb_get_next(rb_node_t *n)
++{
++ rb_node_t *parent;
++
++ if (n->rb_right) {
++ n = n->rb_right;
++ while (n->rb_left)
++ n = n->rb_left;
++ return n;
++ } else {
++ while ((parent = n->rb_parent)) {
++ if (n == parent->rb_left)
++ return parent;
++ n = parent;
++ }
++ return 0;
++ }
++}
++EXPORT_SYMBOL(rb_get_next);
++
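+
+ rb_get_first() and rb_get_next() above give an in-order walk of the
+ rbtree using only parent pointers, so the htree readdir path needs
+ neither recursion nor an auxiliary stack while streaming entries in
+ hash order. A minimal userspace sketch of the same successor logic on
+ a plain binary search tree (the node/tree_first/tree_next names are
+ hypothetical, not the kernel API):
+
+ #include <stdio.h>
+ #include <stdlib.h>
+
+ struct node {
+         int key;
+         struct node *left, *right, *parent;
+ };
+
+ /* leftmost node, i.e. the smallest key -- cf. rb_get_first() */
+ static struct node *tree_first(struct node *root)
+ {
+         if (!root)
+                 return NULL;
+         while (root->left)
+                 root = root->left;
+         return root;
+ }
+
+ /* in-order successor via parent pointers -- cf. rb_get_next() */
+ static struct node *tree_next(struct node *n)
+ {
+         struct node *parent;
+
+         if (n->right) {
+                 n = n->right;
+                 while (n->left)
+                         n = n->left;
+                 return n;
+         }
+         while ((parent = n->parent) && n == parent->right)
+                 n = parent;
+         return parent;
+ }
+
+ static void insert(struct node **rootp, int key)
+ {
+         struct node **p = rootp, *parent = NULL, *n;
+
+         while (*p) {
+                 parent = *p;
+                 p = key < parent->key ? &parent->left : &parent->right;
+         }
+         n = calloc(1, sizeof(*n));
+         n->key = key;
+         n->parent = parent;
+         *p = n;
+ }
+
+ int main(void)
+ {
+         struct node *root = NULL, *n;
+         int keys[] = { 42, 7, 99, 23 }, i;
+
+         for (i = 0; i < 4; i++)
+                 insert(&root, keys[i]);
+         for (n = tree_first(root); n; n = tree_next(n))
+                 printf("%d\n", n->key);        /* 7 23 42 99 */
+         return 0;
+ }
+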
+Index: linux-2.4.19.SuSE/include/linux/ext3_fs.h
+===================================================================
+--- linux-2.4.19.SuSE.orig/include/linux/ext3_fs.h 2003-10-05 09:30:34.000000000 -0700
++++ linux-2.4.19.SuSE/include/linux/ext3_fs.h 2004-05-27 11:08:28.000000000 -0700
+@@ -40,6 +40,11 @@
+ #define EXT3FS_VERSION "2.4-0.9.18"
+
+ /*
++ * Always enable hashed directories
++ */
++#define CONFIG_EXT3_INDEX
++
++/*
+ * Debug code
+ */
+ #ifdef EXT3FS_DEBUG
+@@ -414,8 +419,11 @@
+ /*E0*/ __u32 s_journal_inum; /* inode number of journal file */
+ __u32 s_journal_dev; /* device number of journal file */
+ __u32 s_last_orphan; /* start of list of inodes to delete */
+-
+-/*EC*/ __u32 s_reserved[197]; /* Padding to the end of the block */
++ __u32 s_hash_seed[4]; /* HTREE hash seed */
++ __u8 s_def_hash_version; /* Default hash version to use */
++ __u8 s_reserved_char_pad;
++ __u16 s_reserved_word_pad;
++ __u32 s_reserved[192]; /* Padding to the end of the block */
+ };
+
+ #ifdef __KERNEL__
+@@ -552,9 +560,46 @@
+ #define EXT3_DIR_ROUND (EXT3_DIR_PAD - 1)
+ #define EXT3_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT3_DIR_ROUND) & \
+ ~EXT3_DIR_ROUND)
++/*
++ * Hash Tree Directory indexing
++ * (c) Daniel Phillips, 2001
++ */
++
++#ifdef CONFIG_EXT3_INDEX
++ #define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
++ EXT3_FEATURE_COMPAT_DIR_INDEX) && \
++ (EXT3_I(dir)->i_flags & EXT3_INDEX_FL))
++#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX)
++#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
++#else
++ #define is_dx(dir) 0
++#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX)
++#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2)
++#endif
++
++/* Legal values for the dx_root hash_version field: */
++
++#define DX_HASH_LEGACY 0
++#define DX_HASH_HALF_MD4 1
++#define DX_HASH_TEA 2
++
++/* hash info structure used by the directory hash */
++struct dx_hash_info
++{
++ u32 hash;
++ u32 minor_hash;
++ int hash_version;
++ u32 *seed;
++};
+
+ #ifdef __KERNEL__
+ /*
++ * Control parameters used by ext3_htree_next_block
++ */
++#define HASH_NB_ALWAYS 1
++
++
++/*
+ * Describe an inode's exact location on disk and in memory
+ */
+ struct ext3_iloc
+@@ -564,6 +609,27 @@
+ unsigned long block_group;
+ };
+
++
++/*
++ * This structure is stuffed into the struct file's private_data field
++ * for directories. It is where we put information so that we can do
++ * readdir operations in hash tree order.
++ */
++struct dir_private_info {
++ rb_root_t root;
++ rb_node_t *curr_node;
++ struct fname *extra_fname;
++ loff_t last_pos;
++ __u32 curr_hash;
++ __u32 curr_minor_hash;
++ __u32 next_hash;
++};
++
++/*
++ * Special error return code only used by dx_probe() and its callers.
++ */
++#define ERR_BAD_DX_DIR -75000
++
+ /*
+ * Function prototypes
+ */
+@@ -591,11 +657,20 @@
+
+ /* dir.c */
+ extern int ext3_check_dir_entry(const char *, struct inode *,
+- struct ext3_dir_entry_2 *, struct buffer_head *,
+- unsigned long);
++ struct ext3_dir_entry_2 *,
++ struct buffer_head *, unsigned long);
++extern void ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
++ __u32 minor_hash,
++ struct ext3_dir_entry_2 *dirent);
++extern void ext3_htree_free_dir_info(struct dir_private_info *p);
++
+ /* fsync.c */
+ extern int ext3_sync_file (struct file *, struct dentry *, int);
+
++/* hash.c */
++extern int ext3fs_dirhash(const char *name, int len, struct
++ dx_hash_info *hinfo);
++
+ /* ialloc.c */
+ extern struct inode * ext3_new_inode (handle_t *, struct inode *, int);
+ extern void ext3_free_inode (handle_t *, struct inode *);
+@@ -628,6 +703,8 @@
+ /* namei.c */
+ extern int ext3_orphan_add(handle_t *, struct inode *);
+ extern int ext3_orphan_del(handle_t *, struct inode *);
++extern int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
++ __u32 start_minor_hash, __u32 *next_hash);
+
+ /* super.c */
+ extern void ext3_error (struct super_block *, const char *, const char *, ...)
+Index: linux-2.4.19.SuSE/include/linux/ext3_fs_sb.h
+===================================================================
+--- linux-2.4.19.SuSE.orig/include/linux/ext3_fs_sb.h 2003-10-05 09:16:36.000000000 -0700
++++ linux-2.4.19.SuSE/include/linux/ext3_fs_sb.h 2004-05-27 11:08:28.000000000 -0700
+@@ -62,6 +62,8 @@
+ int s_inode_size;
+ int s_first_ino;
+ u32 s_next_generation;
++ u32 s_hash_seed[4];
++ int s_def_hash_version;
+
+ /* Journaling */
+ struct inode * s_journal_inode;
+Index: linux-2.4.19.SuSE/include/linux/ext3_jbd.h
+===================================================================
+--- linux-2.4.19.SuSE.orig/include/linux/ext3_jbd.h 2003-10-05 09:30:34.000000000 -0700
++++ linux-2.4.19.SuSE/include/linux/ext3_jbd.h 2004-05-27 11:08:28.000000000 -0700
+@@ -69,6 +69,8 @@
+
+ #define EXT3_RESERVE_TRANS_BLOCKS 12
+
++#define EXT3_INDEX_EXTRA_TRANS_BLOCKS 8
++
+ int
+ ext3_mark_iloc_dirty(handle_t *handle,
+ struct inode *inode,
+Index: linux-2.4.19.SuSE/include/linux/rbtree.h
+===================================================================
+--- linux-2.4.19.SuSE.orig/include/linux/rbtree.h 2003-10-05 09:16:36.000000000 -0700
++++ linux-2.4.19.SuSE/include/linux/rbtree.h 2004-05-27 11:08:28.000000000 -0700
+@@ -120,6 +120,8 @@
+
+ extern void rb_insert_color(rb_node_t *, rb_root_t *);
+ extern void rb_erase(rb_node_t *, rb_root_t *);
++extern rb_node_t *rb_get_first(rb_root_t *root);
++extern rb_node_t *rb_get_next(rb_node_t *n);
+
+ static inline void rb_link_node(rb_node_t * node, rb_node_t * parent, rb_node_t ** rb_link)
+ {
--- /dev/null
+ fs/ext3/file.c | 4
+ fs/ext3/inode.c | 116 ++++++++++++++++++++++
+ fs/ext3/super.c | 230 +++++++++++++++++++++++++++++++++++++++++++++
+ include/linux/ext3_fs.h | 5
+ include/linux/ext3_fs_sb.h | 10 +
+ 5 files changed, 365 insertions(+)
+
+Index: linux-2.4.20/fs/ext3/super.c
+===================================================================
+--- linux-2.4.20.orig/fs/ext3/super.c 2004-01-12 20:13:37.000000000 +0300
++++ linux-2.4.20/fs/ext3/super.c 2004-01-13 16:59:54.000000000 +0300
+@@ -48,6 +48,8 @@
+ static void ext3_clear_journal_err(struct super_block * sb,
+ struct ext3_super_block * es);
+
++static int ext3_sync_fs(struct super_block * sb);
++
+ #ifdef CONFIG_JBD_DEBUG
+ int journal_no_write[2];
+
+@@ -398,6 +400,221 @@
+ }
+ }
+
++#ifdef EXT3_DELETE_THREAD
++/*
++ * Delete inodes in a loop until there are no more to be deleted.
++ * Normally, we run in the background doing the deletes and sleeping again,
++ * and clients just add new inodes to be deleted onto the end of the list.
++ * If someone is concerned about free space (e.g. block allocation or similar)
++ * then they can sleep on s_delete_waiter_queue and be woken up when space
++ * has been freed.
++ */
++int ext3_delete_thread(void *data)
++{
++ struct super_block *sb = data;
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ struct task_struct *tsk = current;
++
++ /* Almost like daemonize, but not quite */
++ exit_mm(current);
++ tsk->session = 1;
++ tsk->pgrp = 1;
++ tsk->tty = NULL;
++ exit_files(current);
++ reparent_to_init();
++
++ sprintf(tsk->comm, "kdelext3-%s", kdevname(sb->s_dev));
++ sigfillset(&tsk->blocked);
++
++ /*tsk->flags |= PF_KERNTHREAD;*/
++
++ INIT_LIST_HEAD(&sbi->s_delete_list);
++ wake_up(&sbi->s_delete_waiter_queue);
++ ext3_debug("delete thread on %s started\n", kdevname(sb->s_dev));
++
++ /* main loop */
++ for (;;) {
++ wait_event_interruptible(sbi->s_delete_thread_queue,
++ !list_empty(&sbi->s_delete_list) ||
++ !test_opt(sb, ASYNCDEL));
++ ext3_debug("%s woken up: %lu inodes, %lu blocks\n",
++ tsk->comm,sbi->s_delete_inodes,sbi->s_delete_blocks);
++
++ spin_lock(&sbi->s_delete_lock);
++ if (list_empty(&sbi->s_delete_list)) {
++ clear_opt(sbi->s_mount_opt, ASYNCDEL);
++ memset(&sbi->s_delete_list, 0,
++ sizeof(sbi->s_delete_list));
++ spin_unlock(&sbi->s_delete_lock);
++ ext3_debug("delete thread on %s exiting\n",
++ kdevname(sb->s_dev));
++ wake_up(&sbi->s_delete_waiter_queue);
++ break;
++ }
++
++ while (!list_empty(&sbi->s_delete_list)) {
++ struct inode *inode=list_entry(sbi->s_delete_list.next,
++ struct inode, i_dentry);
++ unsigned long blocks = inode->i_blocks >>
++ (inode->i_blkbits - 9);
++
++ list_del_init(&inode->i_dentry);
++ spin_unlock(&sbi->s_delete_lock);
++ ext3_debug("%s delete ino %lu blk %lu\n",
++ tsk->comm, inode->i_ino, blocks);
++
++ iput(inode);
++
++ spin_lock(&sbi->s_delete_lock);
++ sbi->s_delete_blocks -= blocks;
++ sbi->s_delete_inodes--;
++ }
++ if (sbi->s_delete_blocks != 0 || sbi->s_delete_inodes != 0) {
++ ext3_warning(sb, __FUNCTION__,
++ "%lu blocks, %lu inodes on list?\n",
++ sbi->s_delete_blocks,sbi->s_delete_inodes);
++ sbi->s_delete_blocks = 0;
++ sbi->s_delete_inodes = 0;
++ }
++ spin_unlock(&sbi->s_delete_lock);
++ wake_up(&sbi->s_delete_waiter_queue);
++ }
++
++ return 0;
++}
++
++static void ext3_start_delete_thread(struct super_block *sb)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ int rc;
++
++ spin_lock_init(&sbi->s_delete_lock);
++ init_waitqueue_head(&sbi->s_delete_thread_queue);
++ init_waitqueue_head(&sbi->s_delete_waiter_queue);
++
++ if (!test_opt(sb, ASYNCDEL))
++ return;
++
++ rc = kernel_thread(ext3_delete_thread, sb, CLONE_VM | CLONE_FILES);
++ if (rc < 0)
++ printk(KERN_ERR "EXT3-fs: cannot start delete thread: rc %d\n",
++ rc);
++ else
++ wait_event(sbi->s_delete_waiter_queue, sbi->s_delete_list.next);
++}
++
++static void ext3_stop_delete_thread(struct ext3_sb_info *sbi)
++{
++ if (sbi->s_delete_list.next == 0) /* thread never started */
++ return;
++
++ clear_opt(sbi->s_mount_opt, ASYNCDEL);
++ wake_up(&sbi->s_delete_thread_queue);
++ wait_event(sbi->s_delete_waiter_queue,
++ sbi->s_delete_list.next == 0 && sbi->s_delete_inodes == 0);
++}
++
++/* Instead of playing games with the inode flags, destruction, etc., we just
++ * create a new inode locally and put it on a list for the truncate thread.
++ * We need large parts of the inode struct in order to complete the
++ * truncate and unlink, so we may as well just have a real inode to do it.
++ *
++ * If we have any problem deferring the delete, just delete it right away.
++ * If we defer it, we also mark how many blocks it would free, so that we
++ * can keep the statfs data correct, and we know if we should sleep on the
++ * delete thread when we run out of space.
++ */
++static void ext3_delete_inode_thread(struct inode *old_inode)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb);
++ struct ext3_inode_info *nei, *oei = EXT3_I(old_inode);
++ struct inode *new_inode;
++ unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9);
++
++ if (is_bad_inode(old_inode)) {
++ clear_inode(old_inode);
++ return;
++ }
++
++ if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next)
++ goto out_delete;
++
++ /* We may want to delete the inode immediately and not defer it */
++ if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS)
++ goto out_delete;
++
++ /* We can't use the delete thread as-is during real orphan recovery,
++ * as we add to the orphan list here, causing ext3_orphan_cleanup()
++ * to loop endlessly. It would be nice to do so, but needs work.
++ */
++ if (oei->i_state & EXT3_STATE_DELETE ||
++ sbi->s_mount_state & EXT3_ORPHAN_FS) {
++ ext3_debug("doing deferred inode %lu delete (%lu blocks)\n",
++ old_inode->i_ino, blocks);
++ goto out_delete;
++ }
++
++ /* We can iget this inode again here, because our caller has unhashed
++ * old_inode, so new_inode will be in a different inode struct.
++ *
++ * We need to ensure that the i_orphan pointers in the other inodes
++ * point at the new inode copy instead of the old one so the orphan
++ * list doesn't get corrupted when the old orphan inode is freed.
++ */
++ down(&sbi->s_orphan_lock);
++
++ sbi->s_mount_state |= EXT3_ORPHAN_FS;
++ new_inode = iget(old_inode->i_sb, old_inode->i_ino);
++ sbi->s_mount_state &= ~EXT3_ORPHAN_FS;
++ if (is_bad_inode(new_inode)) {
++ printk(KERN_WARNING "read bad inode %lu\n", old_inode->i_ino);
++ iput(new_inode);
++ new_inode = NULL;
++ }
++ if (!new_inode) {
++ up(&sbi->s_orphan_lock);
++ ext3_debug("delete inode %lu directly (bad read)\n",
++ old_inode->i_ino);
++ goto out_delete;
++ }
++ J_ASSERT(new_inode != old_inode);
++
++ J_ASSERT(!list_empty(&oei->i_orphan));
++
++ nei = EXT3_I(new_inode);
++ /* Ugh. We need to insert new_inode into the same spot on the list
++ * as old_inode was, to ensure the in-memory orphan list is still
++ * in the same order as the on-disk orphan list (badness otherwise).
++ */
++ nei->i_orphan = oei->i_orphan;
++ nei->i_orphan.next->prev = &nei->i_orphan;
++ nei->i_orphan.prev->next = &nei->i_orphan;
++ nei->i_state |= EXT3_STATE_DELETE;
++ up(&sbi->s_orphan_lock);
++
++ clear_inode(old_inode);
++
++ spin_lock(&sbi->s_delete_lock);
++ J_ASSERT(list_empty(&new_inode->i_dentry));
++ list_add_tail(&new_inode->i_dentry, &sbi->s_delete_list);
++ sbi->s_delete_blocks += blocks;
++ sbi->s_delete_inodes++;
++ spin_unlock(&sbi->s_delete_lock);
++
++ ext3_debug("delete inode %lu (%lu blocks) by thread\n",
++ new_inode->i_ino, blocks);
++
++ wake_up(&sbi->s_delete_thread_queue);
++ return;
++
++out_delete:
++ ext3_delete_inode(old_inode);
++}
++#else
++#define ext3_start_delete_thread(sbi) do {} while(0)
++#define ext3_stop_delete_thread(sbi) do {} while(0)
++#endif /* EXT3_DELETE_THREAD */
++
+ void ext3_put_super (struct super_block * sb)
+ {
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
+@@ -405,6 +622,7 @@
+ kdev_t j_dev = sbi->s_journal->j_dev;
+ int i;
+
++ J_ASSERT(sbi->s_delete_inodes == 0);
+ ext3_xattr_put_super(sb);
+ journal_destroy(sbi->s_journal);
+ if (!(sb->s_flags & MS_RDONLY)) {
+@@ -453,9 +671,14 @@
+ write_inode: ext3_write_inode, /* BKL not held. Don't need */
+ dirty_inode: ext3_dirty_inode, /* BKL not held. We take it */
+ put_inode: ext3_put_inode, /* BKL not held. Don't need */
++#ifdef EXT3_DELETE_THREAD
++ delete_inode: ext3_delete_inode_thread,/* BKL not held. We take it */
++#else
+ delete_inode: ext3_delete_inode, /* BKL not held. We take it */
++#endif
+ put_super: ext3_put_super, /* BKL held */
+ write_super: ext3_write_super, /* BKL held */
++ sync_fs: ext3_sync_fs,
+ write_super_lockfs: ext3_write_super_lockfs, /* BKL not held. Take it */
+ unlockfs: ext3_unlockfs, /* BKL not held. We take it */
+ statfs: ext3_statfs, /* BKL held */
+@@ -521,6 +744,13 @@
+ clear_opt (*mount_options, XATTR_USER);
+ else
+ #endif
++#ifdef EXT3_DELETE_THREAD
++ if (!strcmp(this_char, "asyncdel"))
++ set_opt(*mount_options, ASYNCDEL);
++ else if (!strcmp(this_char, "noasyncdel"))
++ clear_opt(*mount_options, ASYNCDEL);
++ else
++#endif
+ if (!strcmp (this_char, "bsddf"))
+ clear_opt (*mount_options, MINIX_DF);
+ else if (!strcmp (this_char, "nouid32")) {
+@@ -1220,6 +1450,7 @@
+ }
+
+ ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);
++ ext3_start_delete_thread(sb);
+ /*
+ * akpm: core read_super() calls in here with the superblock locked.
+ * That deadlocks, because orphan cleanup needs to lock the superblock
+@@ -1625,6 +1856,21 @@
+ }
+ }
+
++static int ext3_sync_fs(struct super_block *sb)
++{
++ tid_t target;
++
++ if (atomic_read(&sb->s_active) == 0) {
++ /* fs is being umounted: time to stop delete thread */
++ ext3_stop_delete_thread(EXT3_SB(sb));
++ }
++
++ sb->s_dirt = 0;
++ target = log_start_commit(EXT3_SB(sb)->s_journal, NULL);
++ log_wait_commit(EXT3_SB(sb)->s_journal, target);
++ return 0;
++}
++
+ /*
+ * LVM calls this function before a (read-only) snapshot is created. This
+ * gives us a chance to flush the journal completely and mark the fs clean.
+@@ -1682,6 +1928,9 @@
+ if (!parse_options(data, &tmp, sbi, &tmp, 1))
+ return -EINVAL;
+
++ if (!test_opt(sb, ASYNCDEL) || (*flags & MS_RDONLY))
++ ext3_stop_delete_thread(sbi);
++
+ if (sbi->s_mount_opt & EXT3_MOUNT_ABORT)
+ ext3_abort(sb, __FUNCTION__, "Abort forced by user");
+
+Index: linux-2.4.20/fs/ext3/inode.c
+===================================================================
+--- linux-2.4.20.orig/fs/ext3/inode.c 2004-01-12 20:13:37.000000000 +0300
++++ linux-2.4.20/fs/ext3/inode.c 2004-01-13 16:55:45.000000000 +0300
+@@ -2552,6 +2552,118 @@
+ return err;
+ }
+
++#ifdef EXT3_DELETE_THREAD
++/* Move blocks from to-be-truncated inode over to a new inode, and delete
++ * that one from the delete thread instead. This avoids a lot of latency
++ * when truncating large files.
++ *
++ * If we have any problem deferring the truncate, just truncate it right away.
++ * If we defer it, we also mark how many blocks it would free, so that we
++ * can keep the statfs data correct, and we know if we should sleep on the
++ * delete thread when we run out of space.
++ */
++void ext3_truncate_thread(struct inode *old_inode)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb);
++ struct ext3_inode_info *nei, *oei = EXT3_I(old_inode);
++ struct inode *new_inode;
++ handle_t *handle;
++ unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9);
++
++ if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next)
++ goto out_truncate;
++
++ /* XXX This is a temporary limitation for code simplicity.
++ * We could truncate to arbitrary sizes at some later time.
++ */
++ if (old_inode->i_size != 0)
++ goto out_truncate;
++
++ /* We may want to truncate the inode immediately and not defer it */
++ if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS ||
++ old_inode->i_size > oei->i_disksize)
++ goto out_truncate;
++
++ /* We can't use the delete thread as-is during real orphan recovery,
++ * as we add to the orphan list here, causing ext3_orphan_cleanup()
++ * to loop endlessly. It would be nice to do so, but needs work.
++ */
++ if (oei->i_state & EXT3_STATE_DELETE ||
++ sbi->s_mount_state & EXT3_ORPHAN_FS) {
++ ext3_debug("doing deferred inode %lu delete (%lu blocks)\n",
++ old_inode->i_ino, blocks);
++ goto out_truncate;
++ }
++
++ ext3_discard_prealloc(old_inode);
++
++ /* old_inode = 1
++ * new_inode = sb + GDT + ibitmap
++ * orphan list = 1 inode/superblock for add, 2 inodes for del
++ * quota files = 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
++ */
++ handle = ext3_journal_start(old_inode, 7);
++ if (IS_ERR(handle))
++ goto out_truncate;
++
++ new_inode = ext3_new_inode(handle, old_inode, old_inode->i_mode);
++ if (IS_ERR(new_inode)) {
++ ext3_debug("truncate inode %lu directly (no new inodes)\n",
++ old_inode->i_ino);
++ goto out_journal;
++ }
++
++ nei = EXT3_I(new_inode);
++
++ down_write(&oei->truncate_sem);
++ new_inode->i_size = old_inode->i_size;
++ new_inode->i_blocks = old_inode->i_blocks;
++ new_inode->i_uid = old_inode->i_uid;
++ new_inode->i_gid = old_inode->i_gid;
++ new_inode->i_nlink = 0;
++
++ /* FIXME when we do arbitrary truncates */
++ old_inode->i_blocks = oei->i_file_acl ? old_inode->i_blksize / 512 : 0;
++ old_inode->i_mtime = old_inode->i_ctime = CURRENT_TIME;
++
++ memcpy(nei->i_data, oei->i_data, sizeof(nei->i_data));
++ memset(oei->i_data, 0, sizeof(oei->i_data));
++
++ nei->i_disksize = oei->i_disksize;
++ nei->i_state |= EXT3_STATE_DELETE;
++ up_write(&oei->truncate_sem);
++
++ if (ext3_orphan_add(handle, new_inode) < 0)
++ goto out_journal;
++
++ if (ext3_orphan_del(handle, old_inode) < 0) {
++ ext3_orphan_del(handle, new_inode);
++ iput(new_inode);
++ goto out_journal;
++ }
++
++ ext3_journal_stop(handle, old_inode);
++
++ spin_lock(&sbi->s_delete_lock);
++ J_ASSERT(list_empty(&new_inode->i_dentry));
++ list_add_tail(&new_inode->i_dentry, &sbi->s_delete_list);
++ sbi->s_delete_blocks += blocks;
++ sbi->s_delete_inodes++;
++ spin_unlock(&sbi->s_delete_lock);
++
++ ext3_debug("delete inode %lu (%lu blocks) by thread\n",
++ new_inode->i_ino, blocks);
++
++ wake_up(&sbi->s_delete_thread_queue);
++ return;
++
++out_journal:
++ ext3_journal_stop(handle, old_inode);
++out_truncate:
++ ext3_truncate(old_inode);
++}
++#endif /* EXT3_DELETE_THREAD */
++
+ /*
+ * On success, We end up with an outstanding reference count against
+ * iloc->bh. This _must_ be cleaned up later.
+Index: linux-2.4.20/fs/ext3/file.c
+===================================================================
+--- linux-2.4.20.orig/fs/ext3/file.c 2004-01-12 20:13:36.000000000 +0300
++++ linux-2.4.20/fs/ext3/file.c 2004-01-13 16:55:45.000000000 +0300
+@@ -125,7 +125,11 @@
+ };
+
+ struct inode_operations ext3_file_inode_operations = {
++#ifdef EXT3_DELETE_THREAD
++ truncate: ext3_truncate_thread, /* BKL held */
++#else
+ truncate: ext3_truncate, /* BKL held */
++#endif
+ setattr: ext3_setattr, /* BKL held */
+ setxattr: ext3_setxattr, /* BKL held */
+ getxattr: ext3_getxattr, /* BKL held */
+Index: linux-2.4.20/fs/buffer.c
+===================================================================
+--- linux-2.4.20.orig/fs/buffer.c 2003-05-16 05:29:12.000000000 +0400
++++ linux-2.4.20/fs/buffer.c 2004-01-13 16:55:45.000000000 +0300
+@@ -328,6 +328,8 @@
+ if (sb->s_dirt && sb->s_op && sb->s_op->write_super)
+ sb->s_op->write_super(sb);
+ unlock_super(sb);
++ if (sb->s_op && sb->s_op->sync_fs)
++ sb->s_op->sync_fs(sb);
+ unlock_kernel();
+
+ return sync_buffers(dev, 1);
+Index: linux-2.4.20/include/linux/ext3_fs.h
+===================================================================
+--- linux-2.4.20.orig/include/linux/ext3_fs.h 2004-01-12 20:13:37.000000000 +0300
++++ linux-2.4.20/include/linux/ext3_fs.h 2004-01-13 16:55:45.000000000 +0300
+@@ -193,6 +193,7 @@
+ */
+ #define EXT3_STATE_JDATA 0x00000001 /* journaled data exists */
+ #define EXT3_STATE_NEW 0x00000002 /* inode is newly created */
++#define EXT3_STATE_DELETE 0x00000010 /* deferred delete inode */
+
+ /*
+ * ioctl commands
+@@ -320,6 +321,7 @@
+ #define EXT3_MOUNT_UPDATE_JOURNAL 0x1000 /* Update the journal format */
+ #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */
+ #define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */
++#define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */
+
+ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
+ #ifndef _LINUX_EXT2_FS_H
+@@ -696,6 +698,9 @@
+ extern void ext3_dirty_inode(struct inode *);
+ extern int ext3_change_inode_journal_flag(struct inode *, int);
+ extern void ext3_truncate (struct inode *);
++#ifdef EXT3_DELETE_THREAD
++extern void ext3_truncate_thread(struct inode *inode);
++#endif
+
+ /* ioctl.c */
+ extern int ext3_ioctl (struct inode *, struct file *, unsigned int,
+Index: linux-2.4.20/include/linux/ext3_fs_sb.h
+===================================================================
+--- linux-2.4.20.orig/include/linux/ext3_fs_sb.h 2004-01-12 20:13:37.000000000 +0300
++++ linux-2.4.20/include/linux/ext3_fs_sb.h 2004-01-13 16:55:45.000000000 +0300
+@@ -29,6 +29,8 @@
+
+ #define EXT3_MAX_GROUP_LOADED 8
+
++#define EXT3_DELETE_THREAD
++
+ /*
+ * third extended-fs super-block data in memory
+ */
+@@ -76,6 +78,14 @@
+ struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */
+ wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */
+ #endif
++#ifdef EXT3_DELETE_THREAD
++ spinlock_t s_delete_lock;
++ struct list_head s_delete_list;
++ unsigned long s_delete_blocks;
++ unsigned long s_delete_inodes;
++ wait_queue_head_t s_delete_thread_queue;
++ wait_queue_head_t s_delete_waiter_queue;
++#endif
+ };
+
+ #endif /* _LINUX_EXT3_FS_SB */
+Index: linux-2.4.20/include/linux/fs.h
+===================================================================
+--- linux-2.4.20.orig/include/linux/fs.h 2004-01-12 20:13:36.000000000 +0300
++++ linux-2.4.20/include/linux/fs.h 2004-01-13 16:55:45.000000000 +0300
+@@ -917,6 +917,7 @@
+ void (*delete_inode) (struct inode *);
+ void (*put_super) (struct super_block *);
+ void (*write_super) (struct super_block *);
++ int (*sync_fs) (struct super_block *);
+ void (*write_super_lockfs) (struct super_block *);
+ void (*unlockfs) (struct super_block *);
+ int (*statfs) (struct super_block *, struct statfs *);
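+
+ The asyncdel machinery above is a deferred-work pattern: the unlink
+ path queues the doomed inode on s_delete_list under s_delete_lock and
+ signals s_delete_thread_queue, while ext3_delete_thread() drains the
+ list off the critical path and wakes s_delete_waiter_queue for anyone
+ blocked on freed space. A compressed userspace analogue of that
+ handoff using POSIX threads (all names hypothetical; this shows the
+ shape of the pattern, not the kernel code):
+
+ #include <pthread.h>
+ #include <stdio.h>
+ #include <stdlib.h>
+
+ struct work { struct work *next; int ino; };
+
+ static struct work *head;                       /* s_delete_list */
+ static int stopping;
+ static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
+ static pthread_cond_t kick = PTHREAD_COND_INITIALIZER; /* thread_queue */
+ static pthread_cond_t done = PTHREAD_COND_INITIALIZER; /* waiter_queue */
+
+ static void *delete_thread(void *arg)
+ {
+         (void)arg;
+         pthread_mutex_lock(&lock);
+         for (;;) {
+                 while (!head && !stopping)
+                         pthread_cond_wait(&kick, &lock);
+                 while (head) {
+                         struct work *w = head;
+                         head = w->next;
+                         /* drop the lock for the slow part, as the
+                          * kernel thread drops s_delete_lock for iput() */
+                         pthread_mutex_unlock(&lock);
+                         printf("deleting inode %d\n", w->ino);
+                         free(w);
+                         pthread_mutex_lock(&lock);
+                 }
+                 pthread_cond_broadcast(&done);  /* space has been freed */
+                 if (stopping)
+                         break;
+         }
+         pthread_mutex_unlock(&lock);
+         return NULL;
+ }
+
+ static void defer_delete(int ino)
+ {
+         struct work *w = malloc(sizeof(*w));
+
+         w->ino = ino;
+         pthread_mutex_lock(&lock);
+         w->next = head;
+         head = w;
+         pthread_cond_signal(&kick);
+         pthread_mutex_unlock(&lock);
+ }
+
+ int main(void)
+ {
+         pthread_t t;
+
+         pthread_create(&t, NULL, delete_thread, NULL);
+         defer_delete(1234);
+         defer_delete(5678);
+         pthread_mutex_lock(&lock);
+         stopping = 1;                   /* cf. ext3_stop_delete_thread() */
+         pthread_cond_signal(&kick);
+         pthread_mutex_unlock(&lock);
+         pthread_join(t, NULL);
+         return 0;
+ }
+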
--- /dev/null
+ fs/ext3/Makefile | 2
+ fs/ext3/dir.c | 302 +++++++++
+ fs/ext3/file.c | 3
+ fs/ext3/hash.c | 215 ++++++
+ fs/ext3/namei.c | 1420 ++++++++++++++++++++++++++++++++++++++++-----
+ fs/ext3/super.c | 7
+ include/linux/ext3_fs.h | 85 ++
+ include/linux/ext3_fs_sb.h | 2
+ include/linux/ext3_jbd.h | 2
+ include/linux/rbtree.h | 2
+ lib/rbtree.c | 42 +
+ 11 files changed, 1921 insertions(+), 161 deletions(-)
+
+Index: linux.mcp2/fs/ext3/dir.c
+===================================================================
+--- linux.mcp2.orig/fs/ext3/dir.c 2004-05-17 15:03:55.000000000 -0700
++++ linux.mcp2/fs/ext3/dir.c 2004-05-17 15:07:06.000000000 -0700
+@@ -21,12 +21,16 @@
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
++#include <linux/slab.h>
++#include <linux/rbtree.h>
+
+ static unsigned char ext3_filetype_table[] = {
+ DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
+ };
+
+ static int ext3_readdir(struct file *, void *, filldir_t);
++static int ext3_dx_readdir(struct file * filp,
++ void * dirent, filldir_t filldir);
+
+ struct file_operations ext3_dir_operations = {
+ read: generic_read_dir,
+@@ -35,6 +39,17 @@
+ fsync: ext3_sync_file, /* BKL held */
+ };
+
++
++static unsigned char get_dtype(struct super_block *sb, int filetype)
++{
++ if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE) ||
++ (filetype >= EXT3_FT_MAX))
++ return DT_UNKNOWN;
++
++ return (ext3_filetype_table[filetype]);
++}
++
++
+ int ext3_check_dir_entry (const char * function, struct inode * dir,
+ struct ext3_dir_entry_2 * de,
+ struct buffer_head * bh,
+@@ -79,6 +94,16 @@
+
+ sb = inode->i_sb;
+
++ if (is_dx(inode)) {
++ err = ext3_dx_readdir(filp, dirent, filldir);
++ if (err != ERR_BAD_DX_DIR)
++ return err;
++ /*
++ * We don't set the inode dirty flag since it's not
++ * critical that it get flushed back to the disk.
++ */
++ EXT3_I(filp->f_dentry->d_inode)->i_flags &= ~EXT3_INDEX_FL;
++ }
+ stored = 0;
+ bh = NULL;
+ offset = filp->f_pos & (sb->s_blocksize - 1);
+@@ -162,18 +187,12 @@
+ * during the copy operation.
+ */
+ unsigned long version = filp->f_version;
+- unsigned char d_type = DT_UNKNOWN;
+
+- if (EXT3_HAS_INCOMPAT_FEATURE(sb,
+- EXT3_FEATURE_INCOMPAT_FILETYPE)
+- && de->file_type < EXT3_FT_MAX)
+- d_type =
+- ext3_filetype_table[de->file_type];
+ error = filldir(dirent, de->name,
+ de->name_len,
+ filp->f_pos,
+ le32_to_cpu(de->inode),
+- d_type);
++ get_dtype(sb, de->file_type));
+ if (error)
+ break;
+ if (version != filp->f_version)
+@@ -188,3 +207,272 @@
+ UPDATE_ATIME(inode);
+ return 0;
+ }
++
++#ifdef CONFIG_EXT3_INDEX
++/*
++ * These functions convert from the major/minor hash to an f_pos
++ * value.
++ *
++ * Currently we only use the major hash number. This is unfortunate, but
++ * on 32-bit machines, the same VFS interface is used for lseek and
++ * llseek, so if we use the 64 bit offset, then the 32-bit versions of
++ * lseek/telldir/seekdir will blow out spectacularly, and from within
++ * the ext3 low-level routine, we don't know if we're being called by
++ * a 64-bit version of the system call or the 32-bit version of the
++ * system call. Worse yet, NFSv2 only allows for a 32-bit readdir
++ * cookie. Sigh.
++ */
++#define hash2pos(major, minor) (major >> 1)
++#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff)
++#define pos2min_hash(pos) (0)
++
++/*
++ * This structure holds the nodes of the red-black tree used to store
++ * the directory entry in hash order.
++ */
++struct fname {
++ __u32 hash;
++ __u32 minor_hash;
++ rb_node_t rb_hash;
++ struct fname *next;
++ __u32 inode;
++ __u8 name_len;
++ __u8 file_type;
++ char name[0];
++};
++
++/*
++ * This function implements a non-recursive way of freeing all of the
++ * nodes in the red-black tree.
++ */
++static void free_rb_tree_fname(rb_root_t *root)
++{
++ rb_node_t *n = root->rb_node;
++ rb_node_t *parent;
++ struct fname *fname;
++
++ while (n) {
++ /* Do the node's children first */
++ if ((n)->rb_left) {
++ n = n->rb_left;
++ continue;
++ }
++ if (n->rb_right) {
++ n = n->rb_right;
++ continue;
++ }
++ /*
++ * The node has no children; free it, and then zero
++ * out parent's link to it. Finally go to the
++ * beginning of the loop and try to free the parent
++ * node.
++ */
++ parent = n->rb_parent;
++ fname = rb_entry(n, struct fname, rb_hash);
++ kfree(fname);
++ if (!parent)
++ root->rb_node = 0;
++ else if (parent->rb_left == n)
++ parent->rb_left = 0;
++ else if (parent->rb_right == n)
++ parent->rb_right = 0;
++ n = parent;
++ }
++ root->rb_node = 0;
++}
++
++
++struct dir_private_info *create_dir_info(loff_t pos)
++{
++ struct dir_private_info *p;
++
++ p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL);
++ if (!p)
++ return NULL;
++ p->root.rb_node = 0;
++ p->curr_node = 0;
++ p->extra_fname = 0;
++ p->last_pos = 0;
++ p->curr_hash = pos2maj_hash(pos);
++ p->curr_minor_hash = pos2min_hash(pos);
++ p->next_hash = 0;
++ return p;
++}
++
++void ext3_htree_free_dir_info(struct dir_private_info *p)
++{
++ free_rb_tree_fname(&p->root);
++ kfree(p);
++}
++
++/*
++ * Given a directory entry, enter it into the fname rb tree.
++ */
++int ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
++ __u32 minor_hash,
++ struct ext3_dir_entry_2 *dirent)
++{
++ rb_node_t **p, *parent = NULL;
++ struct fname * fname, *new_fn;
++ struct dir_private_info *info;
++ int len;
++
++ info = (struct dir_private_info *) dir_file->private_data;
++ p = &info->root.rb_node;
++
++ /* Create and allocate the fname structure */
++ len = sizeof(struct fname) + dirent->name_len + 1;
++ new_fn = kmalloc(len, GFP_KERNEL);
++ if (!new_fn)
++ return -ENOMEM;
++ memset(new_fn, 0, len);
++ new_fn->hash = hash;
++ new_fn->minor_hash = minor_hash;
++ new_fn->inode = le32_to_cpu(dirent->inode);
++ new_fn->name_len = dirent->name_len;
++ new_fn->file_type = dirent->file_type;
++ memcpy(new_fn->name, dirent->name, dirent->name_len);
++ new_fn->name[dirent->name_len] = 0;
++
++ while (*p) {
++ parent = *p;
++ fname = rb_entry(parent, struct fname, rb_hash);
++
++ /*
++ * If the hash and minor hash match up, then we put
++ * them on a linked list. This rarely happens...
++ */
++ if ((new_fn->hash == fname->hash) &&
++ (new_fn->minor_hash == fname->minor_hash)) {
++ new_fn->next = fname->next;
++ fname->next = new_fn;
++ return 0;
++ }
++
++ if (new_fn->hash < fname->hash)
++ p = &(*p)->rb_left;
++ else if (new_fn->hash > fname->hash)
++ p = &(*p)->rb_right;
++ else if (new_fn->minor_hash < fname->minor_hash)
++ p = &(*p)->rb_left;
++ else /* if (new_fn->minor_hash > fname->minor_hash) */
++ p = &(*p)->rb_right;
++ }
++
++ rb_link_node(&new_fn->rb_hash, parent, p);
++ rb_insert_color(&new_fn->rb_hash, &info->root);
++ return 0;
++}
++
++
++
++/*
++ * This is a helper function for ext3_dx_readdir. It calls filldir
++ * for all entries on the fname linked list. (Normally there is only
++ * one entry on the linked list, unless there are 62 bit hash collisions.)
++ */
++static int call_filldir(struct file * filp, void * dirent,
++ filldir_t filldir, struct fname *fname)
++{
++ struct dir_private_info *info = filp->private_data;
++ loff_t curr_pos;
++ struct inode *inode = filp->f_dentry->d_inode;
++ struct super_block * sb;
++ int error;
++
++ sb = inode->i_sb;
++
++ if (!fname) {
++ printk("call_filldir: called with null fname?!?\n");
++ return 0;
++ }
++ curr_pos = hash2pos(fname->hash, fname->minor_hash);
++ while (fname) {
++ error = filldir(dirent, fname->name,
++ fname->name_len, curr_pos,
++ fname->inode,
++ get_dtype(sb, fname->file_type));
++ if (error) {
++ filp->f_pos = curr_pos;
++ info->extra_fname = fname->next;
++ return error;
++ }
++ fname = fname->next;
++ }
++ return 0;
++}
++
++static int ext3_dx_readdir(struct file * filp,
++ void * dirent, filldir_t filldir)
++{
++ struct dir_private_info *info = filp->private_data;
++ struct inode *inode = filp->f_dentry->d_inode;
++ struct fname *fname;
++ int ret;
++
++ if (!info) {
++ info = create_dir_info(filp->f_pos);
++ if (!info)
++ return -ENOMEM;
++ filp->private_data = info;
++ }
++
++ /* Someone has messed with f_pos; reset the world */
++ if (info->last_pos != filp->f_pos) {
++ free_rb_tree_fname(&info->root);
++ info->curr_node = 0;
++ info->extra_fname = 0;
++ info->curr_hash = pos2maj_hash(filp->f_pos);
++ info->curr_minor_hash = pos2min_hash(filp->f_pos);
++ }
++
++ /*
++ * If there are any leftover names on the hash collision
++ * chain, return them first.
++ */
++ if (info->extra_fname &&
++ call_filldir(filp, dirent, filldir, info->extra_fname))
++ goto finished;
++
++ if (!info->curr_node)
++ info->curr_node = rb_get_first(&info->root);
++
++ while (1) {
++ /*
++ * Fill the rbtree if we have no more entries,
++ * or the inode has changed since we last read in the
++ * cached entries.
++ */
++ if ((!info->curr_node) ||
++ (filp->f_version != inode->i_version)) {
++ info->curr_node = 0;
++ free_rb_tree_fname(&info->root);
++ filp->f_version = inode->i_version;
++ ret = ext3_htree_fill_tree(filp, info->curr_hash,
++ info->curr_minor_hash,
++ &info->next_hash);
++ if (ret < 0)
++ return ret;
++ if (ret == 0)
++ break;
++ info->curr_node = rb_get_first(&info->root);
++ }
++
++ fname = rb_entry(info->curr_node, struct fname, rb_hash);
++ info->curr_hash = fname->hash;
++ info->curr_minor_hash = fname->minor_hash;
++ if (call_filldir(filp, dirent, filldir, fname))
++ break;
++
++ info->curr_node = rb_get_next(info->curr_node);
++ if (!info->curr_node) {
++ info->curr_hash = info->next_hash;
++ info->curr_minor_hash = 0;
++ }
++ }
++finished:
++ info->last_pos = filp->f_pos;
++ UPDATE_ATIME(inode);
++ return 0;
++}
++#endif
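+
+ The hash2pos()/pos2maj_hash() macros above keep readdir cookies within
+ 32 bits for the sake of 32-bit lseek/telldir and NFSv2, at the price of
+ discarding the minor hash: only bits 31..1 of the major hash survive a
+ telldir/seekdir round trip, which works out because ext3fs_dirhash()
+ always leaves bit 0 clear. A small demonstration (macros copied from
+ the dir.c hunk above; main() is a hypothetical driver):
+
+ #include <stdio.h>
+
+ #define hash2pos(major, minor) (major >> 1)
+ #define pos2maj_hash(pos) ((pos << 1) & 0xffffffff)
+
+ int main(void)
+ {
+         unsigned int major = 0xbeefcafe & ~1u;  /* dirhash output */
+         unsigned long pos = hash2pos(major, 0);
+
+         /* telldir() hands userspace pos; seekdir() recovers the hash */
+         printf("hash 0x%08x -> pos 0x%08lx -> hash 0x%08lx\n",
+                major, pos, (unsigned long)pos2maj_hash(pos));
+         return 0;
+ }
+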
+Index: linux.mcp2/fs/ext3/file.c
+===================================================================
+--- linux.mcp2.orig/fs/ext3/file.c 2004-05-17 15:03:55.000000000 -0700
++++ linux.mcp2/fs/ext3/file.c 2004-05-17 15:07:06.000000000 -0700
+@@ -35,6 +35,9 @@
+ {
+ if (filp->f_mode & FMODE_WRITE)
+ ext3_discard_prealloc (inode);
++ if (is_dx(inode) && filp->private_data)
++ ext3_htree_free_dir_info(filp->private_data);
++
+ return 0;
+ }
+
+Index: linux.mcp2/fs/ext3/hash.c
+===================================================================
+--- linux.mcp2.orig/fs/ext3/hash.c 2002-04-11 07:25:15.000000000 -0700
++++ linux.mcp2/fs/ext3/hash.c 2004-05-17 15:07:06.000000000 -0700
+@@ -0,0 +1,215 @@
++/*
++ * linux/fs/ext3/hash.c
++ *
++ * Copyright (C) 2002 by Theodore Ts'o
++ *
++ * This file is released under the GPL v2.
++ *
++ * This file may be redistributed under the terms of the GNU Public
++ * License.
++ */
++
++#include <linux/fs.h>
++#include <linux/jbd.h>
++#include <linux/sched.h>
++#include <linux/ext3_fs.h>
++
++#define DELTA 0x9E3779B9
++
++static void TEA_transform(__u32 buf[4], __u32 const in[])
++{
++ __u32 sum = 0;
++ __u32 b0 = buf[0], b1 = buf[1];
++ __u32 a = in[0], b = in[1], c = in[2], d = in[3];
++ int n = 16;
++
++ do {
++ sum += DELTA;
++ b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
++ b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
++ } while(--n);
++
++ buf[0] += b0;
++ buf[1] += b1;
++}
++
++/* F, G and H are basic MD4 functions: selection, majority, parity */
++#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
++#define G(x, y, z) (((x) & (y)) + (((x) ^ (y)) & (z)))
++#define H(x, y, z) ((x) ^ (y) ^ (z))
++
++/*
++ * The generic round function. The application is so specific that
++ * we don't bother protecting all the arguments with parens, as is generally
++ * good macro practice, in favor of extra legibility.
++ * Rotation is separate from addition to prevent recomputation
++ */
++#define ROUND(f, a, b, c, d, x, s) \
++ (a += f(b, c, d) + x, a = (a << s) | (a >> (32-s)))
++#define K1 0
++#define K2 013240474631UL
++#define K3 015666365641UL
++
++/*
++ * Basic cut-down MD4 transform. Returns only 32 bits of result.
++ */
++static void halfMD4Transform (__u32 buf[4], __u32 const in[])
++{
++ __u32 a = buf[0], b = buf[1], c = buf[2], d = buf[3];
++
++ /* Round 1 */
++ ROUND(F, a, b, c, d, in[0] + K1, 3);
++ ROUND(F, d, a, b, c, in[1] + K1, 7);
++ ROUND(F, c, d, a, b, in[2] + K1, 11);
++ ROUND(F, b, c, d, a, in[3] + K1, 19);
++ ROUND(F, a, b, c, d, in[4] + K1, 3);
++ ROUND(F, d, a, b, c, in[5] + K1, 7);
++ ROUND(F, c, d, a, b, in[6] + K1, 11);
++ ROUND(F, b, c, d, a, in[7] + K1, 19);
++
++ /* Round 2 */
++ ROUND(G, a, b, c, d, in[1] + K2, 3);
++ ROUND(G, d, a, b, c, in[3] + K2, 5);
++ ROUND(G, c, d, a, b, in[5] + K2, 9);
++ ROUND(G, b, c, d, a, in[7] + K2, 13);
++ ROUND(G, a, b, c, d, in[0] + K2, 3);
++ ROUND(G, d, a, b, c, in[2] + K2, 5);
++ ROUND(G, c, d, a, b, in[4] + K2, 9);
++ ROUND(G, b, c, d, a, in[6] + K2, 13);
++
++ /* Round 3 */
++ ROUND(H, a, b, c, d, in[3] + K3, 3);
++ ROUND(H, d, a, b, c, in[7] + K3, 9);
++ ROUND(H, c, d, a, b, in[2] + K3, 11);
++ ROUND(H, b, c, d, a, in[6] + K3, 15);
++ ROUND(H, a, b, c, d, in[1] + K3, 3);
++ ROUND(H, d, a, b, c, in[5] + K3, 9);
++ ROUND(H, c, d, a, b, in[0] + K3, 11);
++ ROUND(H, b, c, d, a, in[4] + K3, 15);
++
++ buf[0] += a;
++ buf[1] += b;
++ buf[2] += c;
++ buf[3] += d;
++}
++
++#undef ROUND
++#undef F
++#undef G
++#undef H
++#undef K1
++#undef K2
++#undef K3
++
++/* The old legacy hash */
++static __u32 dx_hack_hash (const char *name, int len)
++{
++ __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
++ while (len--) {
++ __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373));
++
++ if (hash & 0x80000000) hash -= 0x7fffffff;
++ hash1 = hash0;
++ hash0 = hash;
++ }
++ return (hash0 << 1);
++}
++
++static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
++{
++ __u32 pad, val;
++ int i;
++
++ pad = (__u32)len | ((__u32)len << 8);
++ pad |= pad << 16;
++
++ val = pad;
++ if (len > num*4)
++ len = num * 4;
++ for (i=0; i < len; i++) {
++ if ((i % 4) == 0)
++ val = pad;
++ val = msg[i] + (val << 8);
++ if ((i % 4) == 3) {
++ *buf++ = val;
++ val = pad;
++ num--;
++ }
++ }
++ if (--num >= 0)
++ *buf++ = val;
++ while (--num >= 0)
++ *buf++ = pad;
++}
++
++/*
++ * Returns the hash of a filename. If len is 0 and name is NULL, then
++ * this function can be used to test whether or not a hash version is
++ * supported.
++ *
++ * The seed is a 4-longword (32 bits each) "secret" which can be used to
++ * uniquify a hash. If the seed is all zeros, then some default seed
++ * may be used.
++ *
++ * A particular hash version specifies whether or not the seed is
++ * represented, and whether or not the returned hash is 32 bits or 64
++ * bits. 32 bit hashes will return 0 for the minor hash.
++ */
++int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
++{
++ __u32 hash;
++ __u32 minor_hash = 0;
++ const char *p;
++ int i;
++ __u32 in[8], buf[4];
++
++ /* Initialize the default seed for the hash checksum functions */
++ buf[0] = 0x67452301;
++ buf[1] = 0xefcdab89;
++ buf[2] = 0x98badcfe;
++ buf[3] = 0x10325476;
++
++ /* Check to see if the seed is all zeros */
++ if (hinfo->seed) {
++ for (i=0; i < 4; i++) {
++ if (hinfo->seed[i])
++ break;
++ }
++ if (i < 4)
++ memcpy(buf, hinfo->seed, sizeof(buf));
++ }
++
++ switch (hinfo->hash_version) {
++ case DX_HASH_LEGACY:
++ hash = dx_hack_hash(name, len);
++ break;
++ case DX_HASH_HALF_MD4:
++ p = name;
++ while (len > 0) {
++ str2hashbuf(p, len, in, 8);
++ halfMD4Transform(buf, in);
++ len -= 32;
++ p += 32;
++ }
++ minor_hash = buf[2];
++ hash = buf[1];
++ break;
++ case DX_HASH_TEA:
++ p = name;
++ while (len > 0) {
++ str2hashbuf(p, len, in, 4);
++ TEA_transform(buf, in);
++ len -= 16;
++ p += 16;
++ }
++ hash = buf[0];
++ minor_hash = buf[1];
++ break;
++ default:
++ hinfo->hash = 0;
++ return -1;
++ }
++ hinfo->hash = hash & ~1;
++ hinfo->minor_hash = minor_hash;
++ return 0;
++}
+Index: linux.mcp2/fs/ext3/Makefile
+===================================================================
+--- linux.mcp2.orig/fs/ext3/Makefile 2004-05-17 15:03:55.000000000 -0700
++++ linux.mcp2/fs/ext3/Makefile 2004-05-17 15:07:06.000000000 -0700
+@@ -10,7 +10,7 @@
+ O_TARGET := ext3.o
+
+ obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
+- ioctl.o namei.o super.o symlink.o
++ ioctl.o namei.o super.o symlink.o hash.o
+ obj-m := $(O_TARGET)
+
+ include $(TOPDIR)/Rules.make
+Index: linux.mcp2/fs/ext3/namei.c
+===================================================================
+--- linux.mcp2.orig/fs/ext3/namei.c 2004-05-17 15:03:55.000000000 -0700
++++ linux.mcp2/fs/ext3/namei.c 2004-05-17 15:07:06.000000000 -0700
+@@ -16,6 +16,12 @@
+ * David S. Miller (davem@caip.rutgers.edu), 1995
+ * Directory entry file type support and forward compatibility hooks
+ * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998
++ * Hash Tree Directory indexing (c)
++ * Daniel Phillips, 2001
++ * Hash Tree Directory indexing porting
++ * Christopher Li, 2002
++ * Hash Tree Directory indexing cleanup
++ * Theodore Ts'o, 2002
+ */
+
+ #include <linux/fs.h>
+@@ -38,6 +44,642 @@
+ #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
+ #define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
+
++static struct buffer_head *ext3_append(handle_t *handle,
++ struct inode *inode,
++ u32 *block, int *err)
++{
++ struct buffer_head *bh;
++
++ *block = inode->i_size >> inode->i_sb->s_blocksize_bits;
++
++ if ((bh = ext3_bread(handle, inode, *block, 1, err))) {
++ inode->i_size += inode->i_sb->s_blocksize;
++ EXT3_I(inode)->i_disksize = inode->i_size;
++ ext3_journal_get_write_access(handle,bh);
++ }
++ return bh;
++}
++
++#ifndef assert
++#define assert(test) J_ASSERT(test)
++#endif
++
++#ifndef swap
++#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0)
++#endif
++
++typedef struct { u32 v; } le_u32;
++typedef struct { u16 v; } le_u16;
++
++#ifdef DX_DEBUG
++#define dxtrace(command) command
++#else
++#define dxtrace(command)
++#endif
++
++struct fake_dirent
++{
++ /*le*/u32 inode;
++ /*le*/u16 rec_len;
++ u8 name_len;
++ u8 file_type;
++};
++
++struct dx_countlimit
++{
++ le_u16 limit;
++ le_u16 count;
++};
++
++struct dx_entry
++{
++ le_u32 hash;
++ le_u32 block;
++};
++
++/*
++ * dx_root_info is laid out so that if it should somehow get overlaid by a
++ * dirent the two low bits of the hash version will be zero. Therefore, the
++ * hash version mod 4 should never be 0. Sincerely, the paranoia department.
++ */
++
++struct dx_root
++{
++ struct fake_dirent dot;
++ char dot_name[4];
++ struct fake_dirent dotdot;
++ char dotdot_name[4];
++ struct dx_root_info
++ {
++ le_u32 reserved_zero;
++ u8 hash_version;
++ u8 info_length; /* 8 */
++ u8 indirect_levels;
++ u8 unused_flags;
++ }
++ info;
++ struct dx_entry entries[0];
++};
++
++struct dx_node
++{
++ struct fake_dirent fake;
++ struct dx_entry entries[0];
++};
++
++
++struct dx_frame
++{
++ struct buffer_head *bh;
++ struct dx_entry *entries;
++ struct dx_entry *at;
++};
++
++struct dx_map_entry
++{
++ u32 hash;
++ u32 offs;
++};
++
++#ifdef CONFIG_EXT3_INDEX
++static inline unsigned dx_get_block (struct dx_entry *entry);
++static void dx_set_block (struct dx_entry *entry, unsigned value);
++static inline unsigned dx_get_hash (struct dx_entry *entry);
++static void dx_set_hash (struct dx_entry *entry, unsigned value);
++static unsigned dx_get_count (struct dx_entry *entries);
++static unsigned dx_get_limit (struct dx_entry *entries);
++static void dx_set_count (struct dx_entry *entries, unsigned value);
++static void dx_set_limit (struct dx_entry *entries, unsigned value);
++static unsigned dx_root_limit (struct inode *dir, unsigned infosize);
++static unsigned dx_node_limit (struct inode *dir);
++static struct dx_frame *dx_probe(struct dentry *dentry,
++ struct inode *dir,
++ struct dx_hash_info *hinfo,
++ struct dx_frame *frame,
++ int *err);
++static void dx_release (struct dx_frame *frames);
++static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
++ struct dx_hash_info *hinfo, struct dx_map_entry map[]);
++static void dx_sort_map(struct dx_map_entry *map, unsigned count);
++static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to,
++ struct dx_map_entry *offsets, int count);
++static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size);
++static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
++static int ext3_htree_next_block(struct inode *dir, __u32 hash,
++ struct dx_frame *frame,
++ struct dx_frame *frames, int *err,
++ __u32 *start_hash);
++static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
++ struct ext3_dir_entry_2 **res_dir, int *err);
++static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
++ struct inode *inode);
++
++/*
++ * Future: use high four bits of block for coalesce-on-delete flags
++ * Mask them off for now.
++ */
++
++static inline unsigned dx_get_block (struct dx_entry *entry)
++{
++ return le32_to_cpu(entry->block.v) & 0x00ffffff;
++}
++
++static inline void dx_set_block (struct dx_entry *entry, unsigned value)
++{
++ entry->block.v = cpu_to_le32(value);
++}
++
++static inline unsigned dx_get_hash (struct dx_entry *entry)
++{
++ return le32_to_cpu(entry->hash.v);
++}
++
++static inline void dx_set_hash (struct dx_entry *entry, unsigned value)
++{
++ entry->hash.v = cpu_to_le32(value);
++}
++
++static inline unsigned dx_get_count (struct dx_entry *entries)
++{
++ return le16_to_cpu(((struct dx_countlimit *) entries)->count.v);
++}
++
++static inline unsigned dx_get_limit (struct dx_entry *entries)
++{
++ return le16_to_cpu(((struct dx_countlimit *) entries)->limit.v);
++}
++
++static inline void dx_set_count (struct dx_entry *entries, unsigned value)
++{
++ ((struct dx_countlimit *) entries)->count.v = cpu_to_le16(value);
++}
++
++static inline void dx_set_limit (struct dx_entry *entries, unsigned value)
++{
++ ((struct dx_countlimit *) entries)->limit.v = cpu_to_le16(value);
++}
++
++static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
++{
++ unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) -
++ EXT3_DIR_REC_LEN(2) - infosize;
++ return 0? 20: entry_space / sizeof(struct dx_entry);
++}
++
++static inline unsigned dx_node_limit (struct inode *dir)
++{
++ unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0);
++ return 0? 22: entry_space / sizeof(struct dx_entry);
++}
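
/*
 * A worked example of the two limits above (a sketch, not from the
 * patch), with the macros inlined for a 4096-byte block:
 * EXT3_DIR_REC_LEN(n) rounds (n + 8) up to a multiple of 4, and a
 * dx_entry is two 32-bit words (8 bytes).
 */
#include <stdio.h>

#define REC_LEN(n)      (((n) + 8 + 3) & ~3)

int main(void)
{
        unsigned blocksize = 4096;
        unsigned infosize = 8;  /* sizeof(struct dx_root_info) */
        unsigned root = (blocksize - REC_LEN(1) - REC_LEN(2) - infosize) / 8;
        unsigned node = (blocksize - REC_LEN(0)) / 8;

        printf("root limit=%u node limit=%u\n", root, node);     /* 508, 511 */
        return 0;
}
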
++
++/*
++ * Debug
++ */
++#ifdef DX_DEBUG
++struct stats
++{
++ unsigned names;
++ unsigned space;
++ unsigned bcount;
++};
++
++static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3_dir_entry_2 *de,
++ int size, int show_names)
++{
++ unsigned names = 0, space = 0;
++ char *base = (char *) de;
++ struct dx_hash_info h = *hinfo;
++
++ printk("names: ");
++ while ((char *) de < base + size)
++ {
++ if (de->inode)
++ {
++ if (show_names)
++ {
++ int len = de->name_len;
++ char *name = de->name;
++ while (len--) printk("%c", *name++);
++ ext3fs_dirhash(de->name, de->name_len, &h);
++ printk(":%x.%u ", h.hash,
++ ((char *) de - base));
++ }
++ space += EXT3_DIR_REC_LEN(de->name_len);
++ names++;
++ }
++ de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
++ }
++ printk("(%i)\n", names);
++ return (struct stats) { names, space, 1 };
++}
++
++struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
++ struct dx_entry *entries, int levels)
++{
++ unsigned blocksize = dir->i_sb->s_blocksize;
++ unsigned count = dx_get_count (entries), names = 0, space = 0, i;
++ unsigned bcount = 0;
++ struct buffer_head *bh;
++ int err;
++ printk("%i indexed blocks...\n", count);
++ for (i = 0; i < count; i++, entries++)
++ {
++ u32 block = dx_get_block(entries), hash = i? dx_get_hash(entries): 0;
++ u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash;
++ struct stats stats;
++ printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range);
++ if (!(bh = ext3_bread (NULL,dir, block, 0,&err))) continue;
++ stats = levels?
++ dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1):
++ dx_show_leaf(hinfo, (struct ext3_dir_entry_2 *) bh->b_data, blocksize, 0);
++ names += stats.names;
++ space += stats.space;
++ bcount += stats.bcount;
++ brelse (bh);
++ }
++ if (bcount)
++ printk("%snames %u, fullness %u (%u%%)\n", levels?"":" ",
++ names, space/bcount,(space/bcount)*100/blocksize);
++ return (struct stats) { names, space, bcount};
++}
++#endif /* DX_DEBUG */
++
++/*
++ * Probe for a directory leaf block to search.
++ *
++ * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
++ * error in the directory index, and the caller should fall back to
++ * searching the directory normally. The callers of dx_probe **MUST**
++ * check for this error code, and make sure it never gets reflected
++ * back to userspace.
++ */
++static struct dx_frame *
++dx_probe(struct dentry *dentry, struct inode *dir,
++ struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
++{
++ unsigned count, indirect;
++ struct dx_entry *at, *entries, *p, *q, *m;
++ struct dx_root *root;
++ struct buffer_head *bh;
++ struct dx_frame *frame = frame_in;
++ u32 hash;
++
++ frame->bh = NULL;
++ if (dentry)
++ dir = dentry->d_parent->d_inode;
++ if (!(bh = ext3_bread (NULL,dir, 0, 0, err)))
++ goto fail;
++ root = (struct dx_root *) bh->b_data;
++ if (root->info.hash_version != DX_HASH_TEA &&
++ root->info.hash_version != DX_HASH_HALF_MD4 &&
++ root->info.hash_version != DX_HASH_LEGACY) {
++ ext3_warning(dir->i_sb, __FUNCTION__,
++ "Unrecognised inode hash code %d",
++ root->info.hash_version);
++ brelse(bh);
++ *err = ERR_BAD_DX_DIR;
++ goto fail;
++ }
++ hinfo->hash_version = root->info.hash_version;
++ hinfo->seed = dir->i_sb->u.ext3_sb.s_hash_seed;
++ if (dentry)
++ ext3fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo);
++ hash = hinfo->hash;
++
++ if (root->info.unused_flags & 1) {
++ ext3_warning(dir->i_sb, __FUNCTION__,
++ "Unimplemented inode hash flags: %#06x",
++ root->info.unused_flags);
++ brelse(bh);
++ *err = ERR_BAD_DX_DIR;
++ goto fail;
++ }
++
++ if ((indirect = root->info.indirect_levels) > 1) {
++ ext3_warning(dir->i_sb, __FUNCTION__,
++ "Unimplemented inode hash depth: %#06x",
++ root->info.indirect_levels);
++ brelse(bh);
++ *err = ERR_BAD_DX_DIR;
++ goto fail;
++ }
++
++ entries = (struct dx_entry *) (((char *)&root->info) +
++ root->info.info_length);
++ assert(dx_get_limit(entries) == dx_root_limit(dir,
++ root->info.info_length));
++ dxtrace (printk("Look up %x", hash));
++ while (1)
++ {
++ count = dx_get_count(entries);
++ assert (count && count <= dx_get_limit(entries));
++ p = entries + 1;
++ q = entries + count - 1;
++ while (p <= q)
++ {
++ m = p + (q - p)/2;
++ dxtrace(printk("."));
++ if (dx_get_hash(m) > hash)
++ q = m - 1;
++ else
++ p = m + 1;
++ }
++
++ if (0) // linear search cross check
++ {
++ unsigned n = count - 1;
++ at = entries;
++ while (n--)
++ {
++ dxtrace(printk(","));
++ if (dx_get_hash(++at) > hash)
++ {
++ at--;
++ break;
++ }
++ }
++ assert (at == p - 1);
++ }
++
++ at = p - 1;
++ dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at)));
++ frame->bh = bh;
++ frame->entries = entries;
++ frame->at = at;
++ if (!indirect--) return frame;
++ if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err)))
++ goto fail2;
++ at = entries = ((struct dx_node *) bh->b_data)->entries;
++ assert (dx_get_limit(entries) == dx_node_limit (dir));
++ frame++;
++ }
++fail2:
++ while (frame >= frame_in) {
++ brelse(frame->bh);
++ frame--;
++ }
++fail:
++ return NULL;
++}
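
/*
 * The binary search from dx_probe() in isolation (a sketch, assuming a
 * plain sorted array): return the index of the last element <= key,
 * searching slots 1..count-1.  Slot 0 doubles as the count/limit header
 * and implicitly covers hash value 0, which is why the search starts at
 * 1 and why "p - 1" is always a valid answer.
 */
static unsigned dx_find_slot(const unsigned *hash, unsigned count, unsigned key)
{
        unsigned p = 1, q = count - 1;

        while (p <= q) {
                unsigned m = p + (q - p) / 2;
                if (hash[m] > key)
                        q = m - 1;
                else
                        p = m + 1;
        }
        return p - 1;   /* dx_probe() stores this as frame->at */
}
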
++
++static void dx_release (struct dx_frame *frames)
++{
++ if (frames[0].bh == NULL)
++ return;
++
++ if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels)
++ brelse(frames[1].bh);
++ brelse(frames[0].bh);
++}
++
++/*
++ * This function increments the frame pointer to search the next leaf
++ * block, and reads in the necessary intervening nodes if the search
++ * needs to continue. Whether or not the search continues is
++ * controlled by the hash parameter. If the hash value is even, then
++ * the search is only continued if the next block starts with that
++ * hash value. This is used if we are searching for a specific file.
++ *
++ * If the hash value is HASH_NB_ALWAYS, then always go to the next block.
++ *
++ * This function returns 1 if the caller should continue to search,
++ * or 0 if it should not. If there is an error reading one of the
++ * index blocks, it will return -1.
++ *
++ * If start_hash is non-null, it will be filled in with the starting
++ * hash of the next page.
++ */
++static int ext3_htree_next_block(struct inode *dir, __u32 hash,
++ struct dx_frame *frame,
++ struct dx_frame *frames, int *err,
++ __u32 *start_hash)
++{
++ struct dx_frame *p;
++ struct buffer_head *bh;
++ int num_frames = 0;
++ __u32 bhash;
++
++ *err = ENOENT;
++ p = frame;
++ /*
++ * Find the next leaf page by incrementing the frame pointer.
++ * If we run out of entries in the interior node, loop around and
++ * increment the pointer in the parent node. When we break out of
++ * this loop, num_frames indicates the number of interior
++ * nodes that need to be read.
++ */
++ while (1) {
++ if (++(p->at) < p->entries + dx_get_count(p->entries))
++ break;
++ if (p == frames)
++ return 0;
++ num_frames++;
++ p--;
++ }
++
++ /*
++ * If the hash is 1, then continue only if the next page has a
++ * continuation hash of any value. This is used for readdir
++ * handling. Otherwise, check to see if the hash matches the
++ * desired continuation hash. If it doesn't, return since
++ * there's no point in reading the successive index pages.
++ */
++ bhash = dx_get_hash(p->at);
++ if (start_hash)
++ *start_hash = bhash;
++ if ((hash & 1) == 0) {
++ if ((bhash & ~1) != hash)
++ return 0;
++ }
++ /*
++ * If the hash is HASH_NB_ALWAYS, we always go to the next
++ * block so no check is necessary
++ */
++ while (num_frames--) {
++ if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at),
++ 0, err)))
++ return -1; /* Failure */
++ p++;
++ brelse (p->bh);
++ p->bh = bh;
++ p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
++ }
++ return 1;
++}
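
/*
 * The continue/stop test above, in isolation (a sketch): an even target
 * hash means "looking for one specific name", so the walk continues only
 * into a leaf whose starting hash is that value or its continuation
 * (the same hash with bit 0 set); an odd target such as HASH_NB_ALWAYS
 * means "always read the next leaf".
 */
static int should_read_next_leaf(unsigned target_hash, unsigned next_leaf_hash)
{
        if ((target_hash & 1) == 0 &&
            (next_leaf_hash & ~1u) != target_hash)
                return 0;
        return 1;
}
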
++
++
++/*
++ * p is at least 6 bytes before the end of page
++ */
++static inline struct ext3_dir_entry_2 *ext3_next_entry(struct ext3_dir_entry_2 *p)
++{
++ return (struct ext3_dir_entry_2 *)((char*)p + le16_to_cpu(p->rec_len));
++}
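
/*
 * How the rec_len chaining used by ext3_next_entry() walks a block
 * (a sketch with the endianness conversion omitted): entries are
 * variable-length records, each pointing rec_len bytes ahead.  A
 * rec_len of 0 would loop forever -- hence the guard here, and the XXX
 * question about exactly that case in dx_make_map() below.
 */
struct dirent_rec {
        unsigned int    inode;
        unsigned short  rec_len;
        unsigned char   name_len;
        unsigned char   file_type;
        char            name[];
};

static int count_live_entries(char *block, unsigned blocksize)
{
        struct dirent_rec *de = (struct dirent_rec *)block;
        char *end = block + blocksize;
        int n = 0;

        while ((char *)de < end && de->rec_len) {
                if (de->inode)
                        n++;            /* inode == 0 marks a deleted entry */
                de = (struct dirent_rec *)((char *)de + de->rec_len);
        }
        return n;
}
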
++
++/*
++ * This function fills a red-black tree with information from a
++ * directory. We start scanning the directory in hash order, beginning
++ * at start_hash and start_minor_hash.
++ *
++ * This function returns the number of entries inserted into the tree,
++ * or a negative error code.
++ */
++int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
++ __u32 start_minor_hash, __u32 *next_hash)
++{
++ struct dx_hash_info hinfo;
++ struct buffer_head *bh;
++ struct ext3_dir_entry_2 *de, *top;
++ static struct dx_frame frames[2], *frame;
++ struct inode *dir;
++ int block, err;
++ int count = 0;
++ int ret;
++ __u32 hashval;
++
++ dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash,
++ start_minor_hash));
++ dir = dir_file->f_dentry->d_inode;
++ hinfo.hash = start_hash;
++ hinfo.minor_hash = 0;
++ frame = dx_probe(0, dir_file->f_dentry->d_inode, &hinfo, frames, &err);
++ if (!frame)
++ return err;
++
++ /* Add '.' and '..' from the htree header */
++ if (!start_hash && !start_minor_hash) {
++ de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data;
++ if ((err = ext3_htree_store_dirent(dir_file, 0, 0, de)) != 0)
++ goto errout;
++ de = ext3_next_entry(de);
++ if ((err = ext3_htree_store_dirent(dir_file, 0, 0, de)) != 0)
++ goto errout;
++ count += 2;
++ }
++
++ while (1) {
++ block = dx_get_block(frame->at);
++ dxtrace(printk("Reading block %d\n", block));
++ if (!(bh = ext3_bread (NULL, dir, block, 0, &err)))
++ goto errout;
++
++ de = (struct ext3_dir_entry_2 *) bh->b_data;
++ top = (struct ext3_dir_entry_2 *) ((char *) de + dir->i_sb->s_blocksize -
++ EXT3_DIR_REC_LEN(0));
++ for (; de < top; de = ext3_next_entry(de)) {
++ ext3fs_dirhash(de->name, de->name_len, &hinfo);
++ if ((hinfo.hash < start_hash) ||
++ ((hinfo.hash == start_hash) &&
++ (hinfo.minor_hash < start_minor_hash)))
++ continue;
++ if ((err = ext3_htree_store_dirent(dir_file,
++ hinfo.hash, hinfo.minor_hash, de)) != 0)
++ goto errout;
++ count++;
++ }
++ brelse (bh);
++ hashval = ~1;
++ ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS,
++ frame, frames, &err, &hashval);
++ if (next_hash)
++ *next_hash = hashval;
++ if (ret == -1)
++ goto errout;
++ /*
++ * Stop if: (a) there are no more entries, or
++ * (b) we have inserted at least one entry and the
++ * next hash value is not a continuation
++ */
++ if ((ret == 0) ||
++ (count && ((hashval & 1) == 0)))
++ break;
++ }
++ dx_release(frames);
++ dxtrace(printk("Fill tree: returned %d entries\n", count));
++ return count;
++errout:
++ dx_release(frames);
++ return (err);
++}
++
++
++/*
++ * Directory block splitting, compacting
++ */
++
++static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
++ struct dx_hash_info *hinfo, struct dx_map_entry *map_tail)
++{
++ int count = 0;
++ char *base = (char *) de;
++ struct dx_hash_info h = *hinfo;
++
++ while ((char *) de < base + size)
++ {
++ if (de->name_len && de->inode) {
++ ext3fs_dirhash(de->name, de->name_len, &h);
++ map_tail--;
++ map_tail->hash = h.hash;
++ map_tail->offs = (u32) ((char *) de - base);
++ count++;
++ }
++ /* XXX: do we need to check rec_len == 0 case? -Chris */
++ de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
++ }
++ return count;
++}
++
++static void dx_sort_map (struct dx_map_entry *map, unsigned count)
++{
++ struct dx_map_entry *p, *q, *top = map + count - 1;
++ int more;
++ /* Combsort until bubble sort doesn't suck */
++ while (count > 2)
++ {
++ count = count*10/13;
++ if (count - 9 < 2) /* 9, 10 -> 11 */
++ count = 11;
++ for (p = top, q = p - count; q >= map; p--, q--)
++ if (p->hash < q->hash)
++ swap(*p, *q);
++ }
++ /* Garden variety bubble sort */
++ do {
++ more = 0;
++ q = top;
++ while (q-- > map)
++ {
++ if (q[1].hash >= q[0].hash)
++ continue;
++ swap(*(q+1), *q);
++ more = 1;
++ }
++ } while(more);
++}
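
/*
 * The same comb-sort-then-bubble-sort scheme as dx_sort_map(), on a
 * plain int array (a sketch).  The gap shrinks by the usual ~1.3 factor
 * (count * 10 / 13); the 9-or-10 -> 11 fixup skips a gap sequence known
 * to behave badly, and a final bubble pass finishes the job once the
 * array is nearly sorted.
 */
static void comb_sort(int *a, unsigned count)
{
        unsigned gap = count, i;
        int more, t;

        while (gap > 2) {
                gap = gap * 10 / 13;
                if (gap == 9 || gap == 10)      /* the "count - 9 < 2" case above */
                        gap = 11;
                for (i = 0; i + gap < count; i++)
                        if (a[i] > a[i + gap]) {
                                t = a[i]; a[i] = a[i + gap]; a[i + gap] = t;
                        }
        }
        do {                                    /* garden variety bubble sort */
                more = 0;
                for (i = 0; i + 1 < count; i++)
                        if (a[i] > a[i + 1]) {
                                t = a[i]; a[i] = a[i + 1]; a[i + 1] = t;
                                more = 1;
                        }
        } while (more);
}
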
++
++static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block)
++{
++ struct dx_entry *entries = frame->entries;
++ struct dx_entry *old = frame->at, *new = old + 1;
++ int count = dx_get_count(entries);
++
++ assert(count < dx_get_limit(entries));
++ assert(old < entries + count);
++ memmove(new + 1, new, (char *)(entries + count) - (char *)(new));
++ dx_set_hash(new, hash);
++ dx_set_block(new, block);
++ dx_set_count(entries, count + 1);
++}
++#endif
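
/*
 * The memmove-based insertion from dx_insert_block() on a bare array
 * (a sketch, assuming at < count and count below the limit, as the
 * asserts above guarantee): everything after the insertion point shifts
 * up one slot and the new (hash, block) pair lands immediately after
 * slot "at".  The caller then bumps the count, as
 * dx_set_count(entries, count + 1) does above.
 */
#include <string.h>

struct idx_entry {
        unsigned hash, block;
};

static void insert_after(struct idx_entry *entries, unsigned count,
                         unsigned at, unsigned hash, unsigned block)
{
        memmove(&entries[at + 2], &entries[at + 1],
                (count - at - 1) * sizeof(entries[0]));
        entries[at + 1].hash  = hash;
        entries[at + 1].block = block;
}
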
++
++
++static void ext3_update_dx_flag(struct inode *inode)
++{
++ if (!EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
++ EXT3_FEATURE_COMPAT_DIR_INDEX))
++ EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL;
++}
++
+ /*
+ * NOTE! unlike strncmp, ext3_match returns 1 for success, 0 for failure.
+ *
+@@ -94,6 +736,7 @@
+ return 0;
+ }
+
++
+ /*
+ * ext3_find_entry()
+ *
+@@ -105,6 +748,8 @@
+ * The returned buffer_head has ->b_count elevated. The caller is expected
+ * to brelse() it when appropriate.
+ */
++
++
+ static struct buffer_head * ext3_find_entry (struct dentry *dentry,
+ struct ext3_dir_entry_2 ** res_dir)
+ {
+@@ -119,12 +764,32 @@
+ int num = 0;
+ int nblocks, i, err;
+ struct inode *dir = dentry->d_parent->d_inode;
++ int namelen;
++ const u8 *name;
++ unsigned blocksize;
+
+ *res_dir = NULL;
+ sb = dir->i_sb;
+-
++ blocksize = sb->s_blocksize;
++ namelen = dentry->d_name.len;
++ name = dentry->d_name.name;
++ if (namelen > EXT3_NAME_LEN)
++ return NULL;
++#ifdef CONFIG_EXT3_INDEX
++ if (is_dx(dir)) {
++ bh = ext3_dx_find_entry(dentry, res_dir, &err);
++ /*
++ * On success, or if the error was file not found,
++ * return. Otherwise, fall back to doing a search the
++ * old fashioned way.
++ */
++ if (bh || (err != ERR_BAD_DX_DIR))
++ return bh;
++ dxtrace(printk("ext3_find_entry: dx failed, falling back\n"));
++ }
++#endif
+ nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb);
+- start = dir->u.ext3_i.i_dir_start_lookup;
++ start = EXT3_I(dir)->i_dir_start_lookup;
+ if (start >= nblocks)
+ start = 0;
+ block = start;
+@@ -165,7 +830,7 @@
+ i = search_dirblock(bh, dir, dentry,
+ block << EXT3_BLOCK_SIZE_BITS(sb), res_dir);
+ if (i == 1) {
+- dir->u.ext3_i.i_dir_start_lookup = block;
++ EXT3_I(dir)->i_dir_start_lookup = block;
+ ret = bh;
+ goto cleanup_and_exit;
+ } else {
+@@ -196,6 +861,66 @@
+ return ret;
+ }
+
++#ifdef CONFIG_EXT3_INDEX
++static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
++ struct ext3_dir_entry_2 **res_dir, int *err)
++{
++ struct super_block * sb;
++ struct dx_hash_info hinfo;
++ u32 hash;
++ struct dx_frame frames[2], *frame;
++ struct ext3_dir_entry_2 *de, *top;
++ struct buffer_head *bh;
++ unsigned long block;
++ int retval;
++ int namelen = dentry->d_name.len;
++ const u8 *name = dentry->d_name.name;
++ struct inode *dir = dentry->d_parent->d_inode;
++
++ sb = dir->i_sb;
++ if (!(frame = dx_probe (dentry, 0, &hinfo, frames, err)))
++ return NULL;
++ hash = hinfo.hash;
++ do {
++ block = dx_get_block(frame->at);
++ if (!(bh = ext3_bread (NULL,dir, block, 0, err)))
++ goto errout;
++ de = (struct ext3_dir_entry_2 *) bh->b_data;
++ top = (struct ext3_dir_entry_2 *) ((char *) de + sb->s_blocksize -
++ EXT3_DIR_REC_LEN(0));
++ for (; de < top; de = ext3_next_entry(de))
++ if (ext3_match (namelen, name, de)) {
++ if (!ext3_check_dir_entry("ext3_find_entry",
++ dir, de, bh,
++ (block<<EXT3_BLOCK_SIZE_BITS(sb))
++ +((char *)de - bh->b_data))) {
++ brelse (bh);
++ goto errout;
++ }
++ *res_dir = de;
++ dx_release (frames);
++ return bh;
++ }
++ brelse (bh);
++ /* Check to see if we should continue to search */
++ retval = ext3_htree_next_block(dir, hash, frame,
++ frames, err, 0);
++ if (retval == -1) {
++ ext3_warning(sb, __FUNCTION__,
++ "error reading index page in directory #%lu",
++ dir->i_ino);
++ goto errout;
++ }
++ } while (retval == 1);
++
++ *err = -ENOENT;
++errout:
++ dxtrace(printk("%s not found\n", name));
++ dx_release (frames);
++ return NULL;
++}
++#endif
++
+ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry)
+ {
+ struct inode * inode;
+@@ -212,8 +937,9 @@
+ brelse (bh);
+ inode = iget(dir->i_sb, ino);
+
+- if (!inode)
++ if (!inode) {
+ return ERR_PTR(-EACCES);
++ }
+ }
+ d_add(dentry, inode);
+ return NULL;
+@@ -237,6 +963,301 @@
+ de->file_type = ext3_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
+ }
+
++#ifdef CONFIG_EXT3_INDEX
++static struct ext3_dir_entry_2 *
++dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
++{
++ unsigned rec_len = 0;
++
++ while (count--) {
++ struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs);
++ rec_len = EXT3_DIR_REC_LEN(de->name_len);
++ memcpy (to, de, rec_len);
++ ((struct ext3_dir_entry_2 *)to)->rec_len = cpu_to_le16(rec_len);
++ de->inode = 0;
++ map++;
++ to += rec_len;
++ }
++ return (struct ext3_dir_entry_2 *) (to - rec_len);
++}
++
++static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size)
++{
++ struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base;
++ unsigned rec_len = 0;
++
++ prev = to = de;
++ while ((char*)de < base + size) {
++ next = (struct ext3_dir_entry_2 *) ((char *) de +
++ le16_to_cpu(de->rec_len));
++ if (de->inode && de->name_len) {
++ rec_len = EXT3_DIR_REC_LEN(de->name_len);
++ if (de > to)
++ memmove(to, de, rec_len);
++ to->rec_len = cpu_to_le16(rec_len);
++ prev = to;
++ to = (struct ext3_dir_entry_2 *)((char *)to + rec_len);
++ }
++ de = next;
++ }
++ return prev;
++}
++
++static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
++ struct buffer_head **bh,struct dx_frame *frame,
++ struct dx_hash_info *hinfo, int *error)
++{
++ unsigned blocksize = dir->i_sb->s_blocksize;
++ unsigned count, continued;
++ struct buffer_head *bh2;
++ u32 newblock;
++ u32 hash2;
++ struct dx_map_entry *map;
++ char *data1 = (*bh)->b_data, *data2;
++ unsigned split;
++ struct ext3_dir_entry_2 *de = NULL, *de2;
++ int err;
++
++ bh2 = ext3_append (handle, dir, &newblock, error);
++ if (!(bh2)) {
++ brelse(*bh);
++ *bh = NULL;
++ goto errout;
++ }
++
++ BUFFER_TRACE(*bh, "get_write_access");
++ err = ext3_journal_get_write_access(handle, *bh);
++ if (err) {
++ journal_error:
++ brelse(*bh);
++ brelse(bh2);
++ *bh = NULL;
++ ext3_std_error(dir->i_sb, err);
++ goto errout;
++ }
++ BUFFER_TRACE(frame->bh, "get_write_access");
++ err = ext3_journal_get_write_access(handle, frame->bh);
++ if (err)
++ goto journal_error;
++
++ data2 = bh2->b_data;
++
++	/* create map at the end of the data2 block */
++ map = (struct dx_map_entry *) (data2 + blocksize);
++ count = dx_make_map ((struct ext3_dir_entry_2 *) data1,
++ blocksize, hinfo, map);
++ map -= count;
++ split = count/2; // need to adjust to actual middle
++ dx_sort_map (map, count);
++ hash2 = map[split].hash;
++ continued = hash2 == map[split - 1].hash;
++ dxtrace(printk("Split block %i at %x, %i/%i\n",
++ dx_get_block(frame->at), hash2, split, count-split));
++
++ /* Fancy dance to stay within two buffers */
++ de2 = dx_move_dirents(data1, data2, map + split, count - split);
++ de = dx_pack_dirents(data1,blocksize);
++ de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
++ de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2);
++ dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1));
++ dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1));
++
++ /* Which block gets the new entry? */
++ if (hinfo->hash >= hash2)
++ {
++ swap(*bh, bh2);
++ de = de2;
++ }
++ dx_insert_block (frame, hash2 + continued, newblock);
++ err = ext3_journal_dirty_metadata (handle, bh2);
++ if (err)
++ goto journal_error;
++ err = ext3_journal_dirty_metadata (handle, frame->bh);
++ if (err)
++ goto journal_error;
++ brelse (bh2);
++ dxtrace(dx_show_index ("frame", frame->entries));
++errout:
++ return de;
++}
++#endif
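
/*
 * The split-point arithmetic from do_split() in isolation (a sketch):
 * the upper half of the sorted map moves to the new block, and if the
 * median hash also occurs just below the split, the low "continuation"
 * bit is set on the new index key so that lookups for that hash know to
 * search both blocks.
 */
static unsigned split_key(const unsigned *sorted_hash, unsigned count)
{
        unsigned split = count / 2;     /* the same crude middle as above */
        unsigned hash2 = sorted_hash[split];
        unsigned continued = (hash2 == sorted_hash[split - 1]);

        return hash2 + continued;       /* odd key == continuation marker */
}
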
++
++
++/*
++ * Add a new entry into a directory (leaf) block. If de is non-NULL,
++ * it points to a directory entry which is guaranteed to be large
++ * enough for the new directory entry. If de is NULL, then
++ * add_dirent_to_buf will attempt to search the directory block for
++ * space. It will return -ENOSPC if no space is available, -EIO if
++ * the block is corrupt, and -EEXIST if the directory entry already exists.
++ *
++ * NOTE! bh is NOT released in the case where ENOSPC is returned. In
++ * all other cases bh is released.
++ */
++static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
++ struct inode *inode, struct ext3_dir_entry_2 *de,
++ struct buffer_head * bh)
++{
++ struct inode *dir = dentry->d_parent->d_inode;
++ const char *name = dentry->d_name.name;
++ int namelen = dentry->d_name.len;
++ unsigned long offset = 0;
++ unsigned short reclen;
++ int nlen, rlen, err;
++ char *top;
++
++ reclen = EXT3_DIR_REC_LEN(namelen);
++ if (!de) {
++ de = (struct ext3_dir_entry_2 *)bh->b_data;
++ top = bh->b_data + dir->i_sb->s_blocksize - reclen;
++ while ((char *) de <= top) {
++ if (!ext3_check_dir_entry("ext3_add_entry", dir, de,
++ bh, offset)) {
++ brelse (bh);
++ return -EIO;
++ }
++ if (ext3_match (namelen, name, de)) {
++ brelse (bh);
++ return -EEXIST;
++ }
++ nlen = EXT3_DIR_REC_LEN(de->name_len);
++ rlen = le16_to_cpu(de->rec_len);
++ if ((de->inode? rlen - nlen: rlen) >= reclen)
++ break;
++ de = (struct ext3_dir_entry_2 *)((char *)de + rlen);
++ offset += rlen;
++ }
++ if ((char *) de > top)
++ return -ENOSPC;
++ }
++ BUFFER_TRACE(bh, "get_write_access");
++ err = ext3_journal_get_write_access(handle, bh);
++ if (err) {
++ ext3_std_error(dir->i_sb, err);
++ brelse(bh);
++ return err;
++ }
++
++ /* By now the buffer is marked for journaling */
++ nlen = EXT3_DIR_REC_LEN(de->name_len);
++ rlen = le16_to_cpu(de->rec_len);
++ if (de->inode) {
++ struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen);
++ de1->rec_len = cpu_to_le16(rlen - nlen);
++ de->rec_len = cpu_to_le16(nlen);
++ de = de1;
++ }
++ de->file_type = EXT3_FT_UNKNOWN;
++ if (inode) {
++ de->inode = cpu_to_le32(inode->i_ino);
++ ext3_set_de_type(dir->i_sb, de, inode->i_mode);
++ } else
++ de->inode = 0;
++ de->name_len = namelen;
++ memcpy (de->name, name, namelen);
++ /*
++ * XXX shouldn't update any times until successful
++ * completion of syscall, but too many callers depend
++ * on this.
++ *
++ * XXX similarly, too many callers depend on
++ * ext3_new_inode() setting the times, but error
++ * recovery deletes the inode, so the worst that can
++ * happen is that the times are slightly out of date
++ * and/or different from the directory change time.
++ */
++ dir->i_mtime = dir->i_ctime = CURRENT_TIME;
++ ext3_update_dx_flag(dir);
++ dir->i_version = ++event;
++ ext3_mark_inode_dirty(handle, dir);
++ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
++ err = ext3_journal_dirty_metadata(handle, bh);
++ if (err)
++ ext3_std_error(dir->i_sb, err);
++ brelse(bh);
++ return 0;
++}
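
/*
 * The fit test used in the search loop of add_dirent_to_buf() (a
 * sketch with the macro inlined and byte order ignored): a live entry
 * only needs EXT3_DIR_REC_LEN(name_len) bytes of its record, so the
 * tail slack rec_len - nlen can host the new entry; a deleted entry
 * (inode == 0) donates its whole record length.
 */
#define DIR_REC_LEN(n)  (((n) + 8 + 3) & ~3)

static int entry_has_room(unsigned inode, unsigned char name_len,
                          unsigned short rec_len, int new_name_len)
{
        unsigned nlen = DIR_REC_LEN(name_len);
        unsigned reclen = DIR_REC_LEN(new_name_len);

        return (inode ? rec_len - nlen : rec_len) >= reclen;
}
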
++
++#ifdef CONFIG_EXT3_INDEX
++/*
++ * This converts a one-block unindexed directory to a three-block
++ * indexed directory, and adds the dentry to the indexed directory.
++ */
++static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
++ struct inode *inode, struct buffer_head *bh)
++{
++ struct inode *dir = dentry->d_parent->d_inode;
++ const char *name = dentry->d_name.name;
++ int namelen = dentry->d_name.len;
++ struct buffer_head *bh2;
++ struct dx_root *root;
++ struct dx_frame frames[2], *frame;
++ struct dx_entry *entries;
++ struct ext3_dir_entry_2 *de, *de2;
++ char *data1, *top;
++ unsigned len;
++ int retval;
++ unsigned blocksize;
++ struct dx_hash_info hinfo;
++ u32 block;
++
++ blocksize = dir->i_sb->s_blocksize;
++ dxtrace(printk("Creating index\n"));
++ retval = ext3_journal_get_write_access(handle, bh);
++ if (retval) {
++ ext3_std_error(dir->i_sb, retval);
++ brelse(bh);
++ return retval;
++ }
++ root = (struct dx_root *) bh->b_data;
++
++ EXT3_I(dir)->i_flags |= EXT3_INDEX_FL;
++ bh2 = ext3_append (handle, dir, &block, &retval);
++ if (!(bh2)) {
++ brelse(bh);
++ return retval;
++ }
++ data1 = bh2->b_data;
++
++ /* The 0th block becomes the root, move the dirents out */
++ de = (struct ext3_dir_entry_2 *)&root->dotdot;
++ de = (struct ext3_dir_entry_2 *)((char *)de + le16_to_cpu(de->rec_len));
++ len = ((char *) root) + blocksize - (char *) de;
++ memcpy (data1, de, len);
++ de = (struct ext3_dir_entry_2 *) data1;
++ top = data1 + len;
++ while (((char *) de2=(char*)de+le16_to_cpu(de->rec_len)) < top)
++ de = de2;
++ de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
++ /* Initialize the root; the dot dirents already exist */
++ de = (struct ext3_dir_entry_2 *) (&root->dotdot);
++ de->rec_len = cpu_to_le16(blocksize - EXT3_DIR_REC_LEN(2));
++ memset (&root->info, 0, sizeof(root->info));
++ root->info.info_length = sizeof(root->info);
++ root->info.hash_version = dir->i_sb->u.ext3_sb.s_def_hash_version;
++ entries = root->entries;
++ dx_set_block (entries, 1);
++ dx_set_count (entries, 1);
++ dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info)));
++
++ /* Initialize as for dx_probe */
++ hinfo.hash_version = root->info.hash_version;
++ hinfo.seed = dir->i_sb->u.ext3_sb.s_hash_seed;
++ ext3fs_dirhash(name, namelen, &hinfo);
++ frame = frames;
++ frame->entries = entries;
++ frame->at = entries;
++ frame->bh = bh;
++ bh = bh2;
++ de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
++ dx_release (frames);
++ if (!(de))
++ return retval;
++
++ return add_dirent_to_buf(handle, dentry, inode, de, bh);
++}
++#endif
++
+ /*
+ * ext3_add_entry()
+ *
+@@ -247,127 +1268,198 @@
+ * may not sleep between calling this and putting something into
+ * the entry, as someone else might have used it while you slept.
+ */
+-
+-/*
+- * AKPM: the journalling code here looks wrong on the error paths
+- */
+ static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
+ struct inode *inode)
+ {
+ struct inode *dir = dentry->d_parent->d_inode;
+- const char *name = dentry->d_name.name;
+- int namelen = dentry->d_name.len;
+ unsigned long offset;
+- unsigned short rec_len;
+ struct buffer_head * bh;
+- struct ext3_dir_entry_2 * de, * de1;
++ struct ext3_dir_entry_2 *de;
+ struct super_block * sb;
+ int retval;
++#ifdef CONFIG_EXT3_INDEX
++ int dx_fallback=0;
++#endif
++ unsigned blocksize;
++ unsigned nlen, rlen;
++ u32 block, blocks;
+
+ sb = dir->i_sb;
+-
+- if (!namelen)
++ blocksize = sb->s_blocksize;
++ if (!dentry->d_name.len)
+ return -EINVAL;
+- bh = ext3_bread (handle, dir, 0, 0, &retval);
++#ifdef CONFIG_EXT3_INDEX
++ if (is_dx(dir)) {
++ retval = ext3_dx_add_entry(handle, dentry, inode);
++ if (!retval || (retval != ERR_BAD_DX_DIR))
++ return retval;
++ EXT3_I(dir)->i_flags &= ~EXT3_INDEX_FL;
++ dx_fallback++;
++ ext3_mark_inode_dirty(handle, dir);
++ }
++#endif
++ blocks = dir->i_size >> sb->s_blocksize_bits;
++ for (block = 0, offset = 0; block < blocks; block++) {
++ bh = ext3_bread(handle, dir, block, 0, &retval);
++ if(!bh)
++ return retval;
++ retval = add_dirent_to_buf(handle, dentry, inode, 0, bh);
++ if (retval != -ENOSPC)
++ return retval;
++
++#ifdef CONFIG_EXT3_INDEX
++ if (blocks == 1 && !dx_fallback &&
++ EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX))
++ return make_indexed_dir(handle, dentry, inode, bh);
++#endif
++ brelse(bh);
++ }
++ bh = ext3_append(handle, dir, &block, &retval);
+ if (!bh)
+ return retval;
+- rec_len = EXT3_DIR_REC_LEN(namelen);
+- offset = 0;
+ de = (struct ext3_dir_entry_2 *) bh->b_data;
+- while (1) {
+- if ((char *)de >= sb->s_blocksize + bh->b_data) {
+- brelse (bh);
+- bh = NULL;
+- bh = ext3_bread (handle, dir,
+- offset >> EXT3_BLOCK_SIZE_BITS(sb), 1, &retval);
+- if (!bh)
+- return retval;
+- if (dir->i_size <= offset) {
+- if (dir->i_size == 0) {
+- brelse(bh);
+- return -ENOENT;
+- }
++ de->inode = 0;
++ de->rec_len = cpu_to_le16(rlen = blocksize);
++ nlen = 0;
++ return add_dirent_to_buf(handle, dentry, inode, de, bh);
++}
+
+- ext3_debug ("creating next block\n");
++#ifdef CONFIG_EXT3_INDEX
++/*
++ * Returns 0 for success, or a negative error value
++ */
++static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
++ struct inode *inode)
++{
++ struct dx_frame frames[2], *frame;
++ struct dx_entry *entries, *at;
++ struct dx_hash_info hinfo;
++ struct buffer_head * bh;
++ struct inode *dir = dentry->d_parent->d_inode;
++ struct super_block * sb = dir->i_sb;
++ struct ext3_dir_entry_2 *de;
++ int err;
+
+- BUFFER_TRACE(bh, "get_write_access");
+- ext3_journal_get_write_access(handle, bh);
+- de = (struct ext3_dir_entry_2 *) bh->b_data;
+- de->inode = 0;
+- de->rec_len = le16_to_cpu(sb->s_blocksize);
+- dir->u.ext3_i.i_disksize =
+- dir->i_size = offset + sb->s_blocksize;
+- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
+- ext3_mark_inode_dirty(handle, dir);
+- } else {
++ frame = dx_probe(dentry, 0, &hinfo, frames, &err);
++ if (!frame)
++ return err;
++ entries = frame->entries;
++ at = frame->at;
+
+- ext3_debug ("skipping to next block\n");
++ if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
++ goto cleanup;
+
+- de = (struct ext3_dir_entry_2 *) bh->b_data;
+- }
+- }
+- if (!ext3_check_dir_entry ("ext3_add_entry", dir, de, bh,
+- offset)) {
+- brelse (bh);
+- return -ENOENT;
+- }
+- if (ext3_match (namelen, name, de)) {
+- brelse (bh);
+- return -EEXIST;
++ BUFFER_TRACE(bh, "get_write_access");
++ err = ext3_journal_get_write_access(handle, bh);
++ if (err)
++ goto journal_error;
++
++ err = add_dirent_to_buf(handle, dentry, inode, 0, bh);
++ if (err != -ENOSPC) {
++ bh = 0;
++ goto cleanup;
++ }
++
++ /* Block full, should compress but for now just split */
++ dxtrace(printk("using %u of %u node entries\n",
++ dx_get_count(entries), dx_get_limit(entries)));
++ /* Need to split index? */
++ if (dx_get_count(entries) == dx_get_limit(entries)) {
++ u32 newblock;
++ unsigned icount = dx_get_count(entries);
++ int levels = frame - frames;
++ struct dx_entry *entries2;
++ struct dx_node *node2;
++ struct buffer_head *bh2;
++
++ if (levels && (dx_get_count(frames->entries) ==
++ dx_get_limit(frames->entries))) {
++ ext3_warning(sb, __FUNCTION__,
++ "Directory index full!\n");
++ err = -ENOSPC;
++ goto cleanup;
+ }
+- if ((le32_to_cpu(de->inode) == 0 &&
+- le16_to_cpu(de->rec_len) >= rec_len) ||
+- (le16_to_cpu(de->rec_len) >=
+- EXT3_DIR_REC_LEN(de->name_len) + rec_len)) {
+- BUFFER_TRACE(bh, "get_write_access");
+- ext3_journal_get_write_access(handle, bh);
+- /* By now the buffer is marked for journaling */
+- offset += le16_to_cpu(de->rec_len);
+- if (le32_to_cpu(de->inode)) {
+- de1 = (struct ext3_dir_entry_2 *) ((char *) de +
+- EXT3_DIR_REC_LEN(de->name_len));
+- de1->rec_len =
+- cpu_to_le16(le16_to_cpu(de->rec_len) -
+- EXT3_DIR_REC_LEN(de->name_len));
+- de->rec_len = cpu_to_le16(
+- EXT3_DIR_REC_LEN(de->name_len));
+- de = de1;
++ bh2 = ext3_append (handle, dir, &newblock, &err);
++ if (!(bh2))
++ goto cleanup;
++ node2 = (struct dx_node *)(bh2->b_data);
++ entries2 = node2->entries;
++ node2->fake.rec_len = cpu_to_le16(sb->s_blocksize);
++ node2->fake.inode = 0;
++ BUFFER_TRACE(frame->bh, "get_write_access");
++ err = ext3_journal_get_write_access(handle, frame->bh);
++ if (err)
++ goto journal_error;
++ if (levels) {
++ unsigned icount1 = icount/2, icount2 = icount - icount1;
++ unsigned hash2 = dx_get_hash(entries + icount1);
++ dxtrace(printk("Split index %i/%i\n", icount1, icount2));
++
++ BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
++ err = ext3_journal_get_write_access(handle,
++ frames[0].bh);
++ if (err)
++ goto journal_error;
++
++ memcpy ((char *) entries2, (char *) (entries + icount1),
++ icount2 * sizeof(struct dx_entry));
++ dx_set_count (entries, icount1);
++ dx_set_count (entries2, icount2);
++ dx_set_limit (entries2, dx_node_limit(dir));
++
++ /* Which index block gets the new entry? */
++ if (at - entries >= icount1) {
++ frame->at = at = at - entries - icount1 + entries2;
++ frame->entries = entries = entries2;
++ swap(frame->bh, bh2);
+ }
+- de->file_type = EXT3_FT_UNKNOWN;
+- if (inode) {
+- de->inode = cpu_to_le32(inode->i_ino);
+- ext3_set_de_type(dir->i_sb, de, inode->i_mode);
+- } else
+- de->inode = 0;
+- de->name_len = namelen;
+- memcpy (de->name, name, namelen);
+- /*
+- * XXX shouldn't update any times until successful
+- * completion of syscall, but too many callers depend
+- * on this.
+- *
+- * XXX similarly, too many callers depend on
+- * ext3_new_inode() setting the times, but error
+- * recovery deletes the inode, so the worst that can
+- * happen is that the times are slightly out of date
+- * and/or different from the directory change time.
+- */
+- dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
+- ext3_mark_inode_dirty(handle, dir);
+- dir->i_version = ++event;
+- BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+- ext3_journal_dirty_metadata(handle, bh);
+- brelse(bh);
+- return 0;
++ dx_insert_block (frames + 0, hash2, newblock);
++ dxtrace(dx_show_index ("node", frames[1].entries));
++ dxtrace(dx_show_index ("node",
++ ((struct dx_node *) bh2->b_data)->entries));
++ err = ext3_journal_dirty_metadata(handle, bh2);
++ if (err)
++ goto journal_error;
++ brelse (bh2);
++ } else {
++ dxtrace(printk("Creating second level index...\n"));
++ memcpy((char *) entries2, (char *) entries,
++ icount * sizeof(struct dx_entry));
++ dx_set_limit(entries2, dx_node_limit(dir));
++
++ /* Set up root */
++ dx_set_count(entries, 1);
++ dx_set_block(entries + 0, newblock);
++ ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
++
++ /* Add new access path frame */
++ frame = frames + 1;
++ frame->at = at = at - entries + entries2;
++ frame->entries = entries = entries2;
++ frame->bh = bh2;
++ err = ext3_journal_get_write_access(handle,
++ frame->bh);
++ if (err)
++ goto journal_error;
+ }
+- offset += le16_to_cpu(de->rec_len);
+- de = (struct ext3_dir_entry_2 *)
+- ((char *) de + le16_to_cpu(de->rec_len));
++ ext3_journal_dirty_metadata(handle, frames[0].bh);
+ }
+- brelse (bh);
+- return -ENOSPC;
++ de = do_split(handle, dir, &bh, frame, &hinfo, &err);
++ if (!de)
++ goto cleanup;
++ err = add_dirent_to_buf(handle, dentry, inode, de, bh);
++ bh = 0;
++ goto cleanup;
++
++journal_error:
++ ext3_std_error(dir->i_sb, err);
++cleanup:
++ if (bh)
++ brelse(bh);
++ dx_release(frames);
++ return err;
+ }
++#endif
+
+ /*
+ * ext3_delete_entry deletes a directory entry by merging it with the
+@@ -451,9 +1543,11 @@
+ struct inode * inode;
+ int err;
+
+- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3);
+- if (IS_ERR(handle))
++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3);
++ if (IS_ERR(handle)) {
+ return PTR_ERR(handle);
++ }
+
+ if (IS_SYNC(dir))
+ handle->h_sync = 1;
+@@ -478,9 +1572,11 @@
+ struct inode *inode;
+ int err;
+
+- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3);
+- if (IS_ERR(handle))
++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3);
++ if (IS_ERR(handle)) {
+ return PTR_ERR(handle);
++ }
+
+ if (IS_SYNC(dir))
+ handle->h_sync = 1;
+@@ -507,9 +1603,11 @@
+ if (dir->i_nlink >= EXT3_LINK_MAX)
+ return -EMLINK;
+
+- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3);
+- if (IS_ERR(handle))
++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3);
++ if (IS_ERR(handle)) {
+ return PTR_ERR(handle);
++ }
+
+ if (IS_SYNC(dir))
+ handle->h_sync = 1;
+@@ -521,7 +1619,7 @@
+
+ inode->i_op = &ext3_dir_inode_operations;
+ inode->i_fop = &ext3_dir_operations;
+- inode->i_size = inode->u.ext3_i.i_disksize = inode->i_sb->s_blocksize;
++ inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
+ inode->i_blocks = 0;
+ dir_block = ext3_bread (handle, inode, 0, 1, &err);
+ if (!dir_block) {
+@@ -554,21 +1652,19 @@
+ inode->i_mode |= S_ISGID;
+ ext3_mark_inode_dirty(handle, inode);
+ err = ext3_add_entry (handle, dentry, inode);
+- if (err)
+- goto out_no_entry;
++ if (err) {
++ inode->i_nlink = 0;
++ ext3_mark_inode_dirty(handle, inode);
++ iput (inode);
++ goto out_stop;
++ }
+ dir->i_nlink++;
+- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
++ ext3_update_dx_flag(dir);
+ ext3_mark_inode_dirty(handle, dir);
+ d_instantiate(dentry, inode);
+ out_stop:
+ ext3_journal_stop(handle, dir);
+ return err;
+-
+-out_no_entry:
+- inode->i_nlink = 0;
+- ext3_mark_inode_dirty(handle, inode);
+- iput (inode);
+- goto out_stop;
+ }
+
+ /*
+@@ -655,7 +1751,7 @@
+ int err = 0, rc;
+
+ lock_super(sb);
+- if (!list_empty(&inode->u.ext3_i.i_orphan))
++ if (!list_empty(&EXT3_I(inode)->i_orphan))
+ goto out_unlock;
+
+ /* Orphan handling is only valid for files with data blocks
+@@ -696,7 +1792,7 @@
+ * This is safe: on error we're going to ignore the orphan list
+ * anyway on the next recovery. */
+ if (!err)
+- list_add(&inode->u.ext3_i.i_orphan, &EXT3_SB(sb)->s_orphan);
++ list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan);
+
+ jbd_debug(4, "superblock will point to %ld\n", inode->i_ino);
+ jbd_debug(4, "orphan inode %ld will point to %d\n",
+@@ -714,25 +1810,26 @@
+ int ext3_orphan_del(handle_t *handle, struct inode *inode)
+ {
+ struct list_head *prev;
++ struct ext3_inode_info *ei = EXT3_I(inode);
+ struct ext3_sb_info *sbi;
+ ino_t ino_next;
+ struct ext3_iloc iloc;
+ int err = 0;
+
+ lock_super(inode->i_sb);
+- if (list_empty(&inode->u.ext3_i.i_orphan)) {
++ if (list_empty(&ei->i_orphan)) {
+ unlock_super(inode->i_sb);
+ return 0;
+ }
+
+ ino_next = NEXT_ORPHAN(inode);
+- prev = inode->u.ext3_i.i_orphan.prev;
++ prev = ei->i_orphan.prev;
+ sbi = EXT3_SB(inode->i_sb);
+
+ jbd_debug(4, "remove inode %ld from orphan list\n", inode->i_ino);
+
+- list_del(&inode->u.ext3_i.i_orphan);
+- INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan);
++ list_del(&ei->i_orphan);
++ INIT_LIST_HEAD(&ei->i_orphan);
+
+ /* If we're on an error path, we may not have a valid
+ * transaction handle with which to update the orphan list on
+@@ -793,8 +1890,9 @@
+ handle_t *handle;
+
+ handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS);
+- if (IS_ERR(handle))
++ if (IS_ERR(handle)) {
+ return PTR_ERR(handle);
++ }
+
+ retval = -ENOENT;
+ bh = ext3_find_entry (dentry, &de);
+@@ -832,7 +1930,7 @@
+ ext3_mark_inode_dirty(handle, inode);
+ dir->i_nlink--;
+ inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
++ ext3_update_dx_flag(dir);
+ ext3_mark_inode_dirty(handle, dir);
+
+ end_rmdir:
+@@ -850,8 +1948,9 @@
+ handle_t *handle;
+
+ handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS);
+- if (IS_ERR(handle))
++ if (IS_ERR(handle)) {
+ return PTR_ERR(handle);
++ }
+
+ if (IS_SYNC(dir))
+ handle->h_sync = 1;
+@@ -878,7 +1977,7 @@
+ if (retval)
+ goto end_unlink;
+ dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
++ ext3_update_dx_flag(dir);
+ ext3_mark_inode_dirty(handle, dir);
+ inode->i_nlink--;
+ if (!inode->i_nlink)
+@@ -904,9 +2003,11 @@
+ if (l > dir->i_sb->s_blocksize)
+ return -ENAMETOOLONG;
+
+- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 5);
+- if (IS_ERR(handle))
++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5);
++ if (IS_ERR(handle)) {
+ return PTR_ERR(handle);
++ }
+
+ if (IS_SYNC(dir))
+ handle->h_sync = 1;
+@@ -916,7 +2017,7 @@
+ if (IS_ERR(inode))
+ goto out_stop;
+
+- if (l > sizeof (inode->u.ext3_i.i_data)) {
++ if (l > sizeof (EXT3_I(inode)->i_data)) {
+ inode->i_op = &page_symlink_inode_operations;
+ inode->i_mapping->a_ops = &ext3_aops;
+ /*
+@@ -925,8 +2026,12 @@
+ * i_size in generic_commit_write().
+ */
+ err = block_symlink(inode, symname, l);
+- if (err)
+- goto out_no_entry;
++ if (err) {
++ ext3_dec_count(handle, inode);
++ ext3_mark_inode_dirty(handle, inode);
++ iput (inode);
++ goto out_stop;
++ }
+ } else {
+ inode->i_op = &ext3_fast_symlink_inode_operations;
+ memcpy((char*)&inode->u.ext3_i.i_data,symname,l);
+@@ -938,12 +2043,6 @@
+ out_stop:
+ ext3_journal_stop(handle, dir);
+ return err;
+-
+-out_no_entry:
+- ext3_dec_count(handle, inode);
+- ext3_mark_inode_dirty(handle, inode);
+- iput (inode);
+- goto out_stop;
+ }
+
+ static int ext3_link (struct dentry * old_dentry,
+@@ -956,12 +2055,15 @@
+ if (S_ISDIR(inode->i_mode))
+ return -EPERM;
+
+- if (inode->i_nlink >= EXT3_LINK_MAX)
++ if (inode->i_nlink >= EXT3_LINK_MAX) {
+ return -EMLINK;
++ }
+
+- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS);
+- if (IS_ERR(handle))
++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
++ EXT3_INDEX_EXTRA_TRANS_BLOCKS);
++ if (IS_ERR(handle)) {
+ return PTR_ERR(handle);
++ }
+
+ if (IS_SYNC(dir))
+ handle->h_sync = 1;
+@@ -995,9 +2097,11 @@
+
+ old_bh = new_bh = dir_bh = NULL;
+
+- handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS + 2);
+- if (IS_ERR(handle))
++ handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS +
++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2);
++ if (IS_ERR(handle)) {
+ return PTR_ERR(handle);
++ }
+
+ if (IS_SYNC(old_dir) || IS_SYNC(new_dir))
+ handle->h_sync = 1;
+@@ -1070,14 +2174,33 @@
+ /*
+ * ok, that's it
+ */
+- ext3_delete_entry(handle, old_dir, old_de, old_bh);
++ retval = ext3_delete_entry(handle, old_dir, old_de, old_bh);
++ if (retval == -ENOENT) {
++ /*
++ * old_de could have moved out from under us.
++ */
++ struct buffer_head *old_bh2;
++ struct ext3_dir_entry_2 *old_de2;
++
++ old_bh2 = ext3_find_entry(old_dentry, &old_de2);
++ if (old_bh2) {
++ retval = ext3_delete_entry(handle, old_dir,
++ old_de2, old_bh2);
++ brelse(old_bh2);
++ }
++ }
++ if (retval) {
++ ext3_warning(old_dir->i_sb, "ext3_rename",
++ "Deleting old file (%lu), %d, error=%d",
++ old_dir->i_ino, old_dir->i_nlink, retval);
++ }
+
+ if (new_inode) {
+ new_inode->i_nlink--;
+ new_inode->i_ctime = CURRENT_TIME;
+ }
+ old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
+- old_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
++ ext3_update_dx_flag(old_dir);
+ if (dir_bh) {
+ BUFFER_TRACE(dir_bh, "get_write_access");
+ ext3_journal_get_write_access(handle, dir_bh);
+@@ -1089,7 +2212,7 @@
+ new_inode->i_nlink--;
+ } else {
+ new_dir->i_nlink++;
+- new_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
++ ext3_update_dx_flag(new_dir);
+ ext3_mark_inode_dirty(handle, new_dir);
+ }
+ }
+Index: linux.mcp2/fs/ext3/super.c
+===================================================================
+--- linux.mcp2.orig/fs/ext3/super.c 2004-05-17 15:03:55.000000000 -0700
++++ linux.mcp2/fs/ext3/super.c 2004-05-17 15:08:50.000000000 -0700
+@@ -702,6 +702,7 @@
+ es->s_mtime = cpu_to_le32(CURRENT_TIME);
+ ext3_update_dynamic_rev(sb);
+ EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
++
+ ext3_commit_super (sb, es, 1);
+ if (test_opt (sb, DEBUG))
+ printk (KERN_INFO
+@@ -712,6 +713,7 @@
+ EXT3_BLOCKS_PER_GROUP(sb),
+ EXT3_INODES_PER_GROUP(sb),
+ sbi->s_mount_opt);
++
+ printk(KERN_INFO "EXT3 FS " EXT3FS_VERSION ", " EXT3FS_DATE " on %s, ",
+ bdevname(sb->s_dev));
+ if (EXT3_SB(sb)->s_journal->j_inode == NULL) {
+@@ -886,6 +888,7 @@
+ return res;
+ }
+
++
+ struct super_block * ext3_read_super (struct super_block * sb, void * data,
+ int silent)
+ {
+@@ -1062,6 +1065,9 @@
+ sbi->s_mount_state = le16_to_cpu(es->s_state);
+ sbi->s_addr_per_block_bits = log2(EXT3_ADDR_PER_BLOCK(sb));
+ sbi->s_desc_per_block_bits = log2(EXT3_DESC_PER_BLOCK(sb));
++ for (i=0; i < 4; i++)
++ sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
++ sbi->s_def_hash_version = es->s_def_hash_version;
+
+ if (sbi->s_blocks_per_group > blocksize * 8) {
+ printk (KERN_ERR
+@@ -1744,7 +1750,7 @@
+ unregister_filesystem(&ext3_fs_type);
+ }
+
+-EXPORT_NO_SYMBOLS;
++EXPORT_SYMBOL(ext3_force_commit);
+
+ MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
+ MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions");
+Index: linux.mcp2/include/linux/ext3_fs.h
+===================================================================
+--- linux.mcp2.orig/include/linux/ext3_fs.h 2004-05-17 14:53:17.000000000 -0700
++++ linux.mcp2/include/linux/ext3_fs.h 2004-05-17 15:07:07.000000000 -0700
+@@ -40,6 +40,11 @@
+ #define EXT3FS_VERSION "2.4-0.9.17"
+
+ /*
++ * Always enable hashed directories
++ */
++#define CONFIG_EXT3_INDEX
++
++/*
+ * Debug code
+ */
+ #ifdef EXT3FS_DEBUG
+@@ -437,8 +442,11 @@
+ /*E0*/ __u32 s_journal_inum; /* inode number of journal file */
+ __u32 s_journal_dev; /* device number of journal file */
+ __u32 s_last_orphan; /* start of list of inodes to delete */
+-
+-/*EC*/ __u32 s_reserved[197]; /* Padding to the end of the block */
++ __u32 s_hash_seed[4]; /* HTREE hash seed */
++ __u8 s_def_hash_version; /* Default hash version to use */
++ __u8 s_reserved_char_pad;
++ __u16 s_reserved_word_pad;
++ __u32 s_reserved[192]; /* Padding to the end of the block */
+ };
+
+ #ifdef __KERNEL__
+@@ -575,9 +583,46 @@
+ #define EXT3_DIR_ROUND (EXT3_DIR_PAD - 1)
+ #define EXT3_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT3_DIR_ROUND) & \
+ ~EXT3_DIR_ROUND)
++/*
++ * Hash Tree Directory indexing
++ * (c) Daniel Phillips, 2001
++ */
++
++#ifdef CONFIG_EXT3_INDEX
++ #define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
++ EXT3_FEATURE_COMPAT_DIR_INDEX) && \
++ (EXT3_I(dir)->i_flags & EXT3_INDEX_FL))
++#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX)
++#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
++#else
++ #define is_dx(dir) 0
++#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX)
++#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2)
++#endif
++
++/* Legal values for the dx_root hash_version field: */
++
++#define DX_HASH_LEGACY 0
++#define DX_HASH_HALF_MD4 1
++#define DX_HASH_TEA 2
++
++/* hash info structure used by the directory hash */
++struct dx_hash_info
++{
++ u32 hash;
++ u32 minor_hash;
++ int hash_version;
++ u32 *seed;
++};
+
+ #ifdef __KERNEL__
+ /*
++ * Control parameters used by ext3_htree_next_block
++ */
++#define HASH_NB_ALWAYS 1
++
++
++/*
+ * Describe an inode's exact location on disk and in memory
+ */
+ struct ext3_iloc
+@@ -587,6 +632,27 @@
+ unsigned long block_group;
+ };
+
++
++/*
++ * This structure is stuffed into the struct file's private_data field
++ * for directories. It is where we put information so that we can do
++ * readdir operations in hash tree order.
++ */
++struct dir_private_info {
++ rb_root_t root;
++ rb_node_t *curr_node;
++ struct fname *extra_fname;
++ loff_t last_pos;
++ __u32 curr_hash;
++ __u32 curr_minor_hash;
++ __u32 next_hash;
++};
++
++/*
++ * Special error return code only used by dx_probe() and its callers.
++ */
++#define ERR_BAD_DX_DIR -75000
++
+ /*
+ * Function prototypes
+ */
+@@ -614,11 +680,20 @@
+
+ /* dir.c */
+ extern int ext3_check_dir_entry(const char *, struct inode *,
+- struct ext3_dir_entry_2 *, struct buffer_head *,
+- unsigned long);
++ struct ext3_dir_entry_2 *,
++ struct buffer_head *, unsigned long);
++extern int ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
++ __u32 minor_hash,
++ struct ext3_dir_entry_2 *dirent);
++extern void ext3_htree_free_dir_info(struct dir_private_info *p);
++
+ /* fsync.c */
+ extern int ext3_sync_file (struct file *, struct dentry *, int);
+
++/* hash.c */
++extern int ext3fs_dirhash(const char *name, int len, struct
++ dx_hash_info *hinfo);
++
+ /* ialloc.c */
+ extern struct inode * ext3_new_inode (handle_t *, const struct inode *, int);
+ extern void ext3_free_inode (handle_t *, struct inode *);
+@@ -650,6 +725,8 @@
+ /* namei.c */
+ extern int ext3_orphan_add(handle_t *, struct inode *);
+ extern int ext3_orphan_del(handle_t *, struct inode *);
++extern int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
++ __u32 start_minor_hash, __u32 *next_hash);
+
+ /* super.c */
+ extern void ext3_error (struct super_block *, const char *, const char *, ...)
+Index: linux.mcp2/include/linux/ext3_fs_sb.h
+===================================================================
+--- linux.mcp2.orig/include/linux/ext3_fs_sb.h 2004-05-17 14:41:25.000000000 -0700
++++ linux.mcp2/include/linux/ext3_fs_sb.h 2004-05-17 15:07:07.000000000 -0700
+@@ -62,6 +62,8 @@
+ int s_inode_size;
+ int s_first_ino;
+ u32 s_next_generation;
++ u32 s_hash_seed[4];
++ int s_def_hash_version;
+
+ /* Journaling */
+ struct inode * s_journal_inode;
+Index: linux.mcp2/include/linux/ext3_jbd.h
+===================================================================
+--- linux.mcp2.orig/include/linux/ext3_jbd.h 2004-05-17 14:53:17.000000000 -0700
++++ linux.mcp2/include/linux/ext3_jbd.h 2004-05-17 15:07:07.000000000 -0700
+@@ -63,6 +63,8 @@
+
+ #define EXT3_RESERVE_TRANS_BLOCKS 12
+
++#define EXT3_INDEX_EXTRA_TRANS_BLOCKS 8
++
+ int
+ ext3_mark_iloc_dirty(handle_t *handle,
+ struct inode *inode,
+Index: linux.mcp2/include/linux/rbtree.h
+===================================================================
+--- linux.mcp2.orig/include/linux/rbtree.h 2004-05-17 14:41:25.000000000 -0700
++++ linux.mcp2/include/linux/rbtree.h 2004-05-17 15:07:07.000000000 -0700
+@@ -120,6 +120,8 @@
+
+ extern void rb_insert_color(rb_node_t *, rb_root_t *);
+ extern void rb_erase(rb_node_t *, rb_root_t *);
++extern rb_node_t *rb_get_first(rb_root_t *root);
++extern rb_node_t *rb_get_next(rb_node_t *n);
+
+ static inline void rb_link_node(rb_node_t * node, rb_node_t * parent, rb_node_t ** rb_link)
+ {
+Index: linux.mcp2/lib/rbtree.c
+===================================================================
+--- linux.mcp2.orig/lib/rbtree.c 2004-01-19 07:49:44.000000000 -0800
++++ linux.mcp2/lib/rbtree.c 2004-05-17 15:10:39.000000000 -0700
+@@ -17,6 +17,8 @@
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+ linux/lib/rbtree.c
++
++ rb_get_first and rb_get_next written by Theodore Ts'o, 9/8/2002
+ */
+
+ #include <linux/rbtree.h>
+@@ -294,3 +296,42 @@
+ __rb_erase_color(child, parent, root);
+ }
+ EXPORT_SYMBOL(rb_erase);
++
++/*
++ * This function returns the first node (in sort order) of the tree.
++ */
++rb_node_t *rb_get_first(rb_root_t *root)
++{
++ rb_node_t *n;
++
++ n = root->rb_node;
++ if (!n)
++ return 0;
++ while (n->rb_left)
++ n = n->rb_left;
++ return n;
++}
++EXPORT_SYMBOL(rb_get_first);
++
++/*
++ * Given a node, this function will return the next node in the tree.
++ */
++rb_node_t *rb_get_next(rb_node_t *n)
++{
++ rb_node_t *parent;
++
++ if (n->rb_right) {
++ n = n->rb_right;
++ while (n->rb_left)
++ n = n->rb_left;
++ return n;
++ } else {
++ while ((parent = n->rb_parent)) {
++ if (n == parent->rb_left)
++ return parent;
++ n = parent;
++ }
++ return 0;
++ }
++}
++EXPORT_SYMBOL(rb_get_next);
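
/*
 * The same successor walk as rb_get_first()/rb_get_next() above, on a
 * minimal binary tree node with a parent pointer (a sketch; the
 * red/black colouring is irrelevant to traversal and omitted).  ext3's
 * htree readdir uses this pair to stream its cached dirents out in
 * hash order.
 */
#include <stddef.h>

struct node {
        struct node *left, *right, *parent;
};

static struct node *tree_first(struct node *root)
{
        if (!root)
                return NULL;
        while (root->left)
                root = root->left;
        return root;
}

static struct node *tree_next(struct node *n)
{
        struct node *parent;

        if (n->right) {                 /* leftmost node of the right subtree */
                n = n->right;
                while (n->left)
                        n = n->left;
                return n;
        }
        while ((parent = n->parent)) {  /* climb until we arrive from a left child */
                if (n == parent->left)
                        return parent;
                n = parent;
        }
        return NULL;
}
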
--- /dev/null
+ 0 files changed
+
+--- linux-2.4.20/fs/ext3/super.c~ext3-no-write-super 2003-08-11 13:20:17.000000000 +0400
++++ linux-2.4.20-alexey/fs/ext3/super.c 2003-08-11 13:31:35.000000000 +0400
+@@ -1849,7 +1849,6 @@ void ext3_write_super (struct super_bloc
+ if (down_trylock(&sb->s_lock) == 0)
+ BUG(); /* aviro detector */
+ sb->s_dirt = 0;
+- target = log_start_commit(EXT3_SB(sb)->s_journal, NULL);
+
+ /*
+ * Tricky --- if we are unmounting, the write really does need
+@@ -1857,6 +1856,7 @@ void ext3_write_super (struct super_bloc
+ * sb->s_root.
+ */
+ if (do_sync_supers || !sb->s_root) {
++ target = log_start_commit(EXT3_SB(sb)->s_journal, NULL);
+ unlock_super(sb);
+ log_wait_commit(EXT3_SB(sb)->s_journal, target);
+ lock_super(sb);
+
+_
--- /dev/null
+ fs/ext3/super.c | 7 ++++++-
+ 1 files changed, 6 insertions(+), 1 deletion(-)
+
+--- linux-2.4.20/fs/ext3/super.c~ext3-unmount_sync 2003-04-08 23:35:44.000000000 -0600
++++ linux-2.4.20-braam/fs/ext3/super.c 2003-04-08 23:35:44.000000000 -0600
+@@ -1612,7 +1612,12 @@ void ext3_write_super (struct super_bloc
+ sb->s_dirt = 0;
+ target = log_start_commit(EXT3_SB(sb)->s_journal, NULL);
+
+- if (do_sync_supers) {
++ /*
++ * Tricky --- if we are unmounting, the write really does need
++ * to be synchronous. We can detect that by looking for NULL in
++ * sb->s_root.
++ */
++ if (do_sync_supers || !sb->s_root) {
+ unlock_super(sb);
+ log_wait_commit(EXT3_SB(sb)->s_journal, target);
+ lock_super(sb);
+
+_
--- /dev/null
+ ./fs/ext3/namei.c | 11 +++++------
+ 1 files changed, 5 insertions(+), 6 deletions(-)
+
+Index: linux-2.4.19-pre1/./fs/ext3/namei.c
+===================================================================
+--- linux-2.4.19-pre1.orig/./fs/ext3/namei.c 2003-11-21 01:52:06.000000000 +0300
++++ linux-2.4.19-pre1/./fs/ext3/namei.c 2003-11-21 01:58:15.000000000 +0300
+@@ -1522,8 +1522,11 @@
+ {
+ int err = ext3_add_entry(handle, dentry, inode);
+ if (!err) {
+- d_instantiate(dentry, inode);
+- return 0;
++ err = ext3_mark_inode_dirty(handle, inode);
++ if (err == 0) {
++ d_instantiate(dentry, inode);
++ return 0;
++ }
+ }
+ ext3_dec_count(handle, inode);
+ iput(inode);
+@@ -1559,7 +1562,6 @@
+ inode->i_op = &ext3_file_inode_operations;
+ inode->i_fop = &ext3_file_operations;
+ inode->i_mapping->a_ops = &ext3_aops;
+- ext3_mark_inode_dirty(handle, inode);
+ err = ext3_add_nondir(handle, dentry, inode);
+ }
+ ext3_journal_stop(handle, dir);
+@@ -1586,7 +1588,6 @@
+ err = PTR_ERR(inode);
+ if (!IS_ERR(inode)) {
+ init_special_inode(inode, mode, rdev);
+- ext3_mark_inode_dirty(handle, inode);
+ err = ext3_add_nondir(handle, dentry, inode);
+ }
+ ext3_journal_stop(handle, dir);
+@@ -2035,7 +2036,6 @@
+ inode->i_size = l-1;
+ }
+ inode->u.ext3_i.i_disksize = inode->i_size;
+- ext3_mark_inode_dirty(handle, inode);
+ err = ext3_add_nondir(handle, dentry, inode);
+ out_stop:
+ ext3_journal_stop(handle, dir);
+@@ -2069,7 +2069,6 @@
+ ext3_inc_count(handle, inode);
+ atomic_inc(&inode->i_count);
+
+- ext3_mark_inode_dirty(handle, inode);
+ err = ext3_add_nondir(handle, dentry, inode);
+ ext3_journal_stop(handle, dir);
+ return err;
--- /dev/null
+
+
+
+ fs/inode.c | 21 ++++++++++++++-------
+ fs/smbfs/inode.c | 2 +-
+ fs/super.c | 4 ++--
+ include/linux/fs.h | 2 +-
+ 4 files changed, 18 insertions(+), 11 deletions(-)
+
+Index: linux.mcp2/fs/inode.c
+===================================================================
+--- linux.mcp2.orig/fs/inode.c 2004-01-19 07:49:43.000000000 -0800
++++ linux.mcp2/fs/inode.c 2004-05-05 14:31:31.000000000 -0700
+@@ -553,7 +553,8 @@
+ /*
+ * Invalidate all inodes for a device.
+ */
+-static int invalidate_list(struct list_head *head, struct super_block * sb, struct list_head * dispose)
++static int invalidate_list(struct list_head *head, struct super_block * sb,
++ struct list_head * dispose, int show)
+ {
+ struct list_head *next;
+ int busy = 0, count = 0;
+@@ -578,6 +579,11 @@
+ count++;
+ continue;
+ }
++ if (show)
++ printk(KERN_ERR
++ "inode busy: dev %s:%lu (%p) mode %o count %u\n",
++ kdevname(sb->s_dev), inode->i_ino, inode,
++ inode->i_mode, atomic_read(&inode->i_count));
+ busy = 1;
+ }
+ /* only unused inodes may be cached with i_count zero */
+@@ -596,22 +602,23 @@
+ /**
+ * invalidate_inodes - discard the inodes on a device
+ * @sb: superblock
++ * @show: whether we should display any busy inodes found
+ *
+ * Discard all of the inodes for a given superblock. If the discard
+ * fails because there are busy inodes then a non zero value is returned.
+ * If the discard is successful all the inodes have been discarded.
+ */
+
+-int invalidate_inodes(struct super_block * sb)
++int invalidate_inodes(struct super_block * sb, int show)
+ {
+ int busy;
+ LIST_HEAD(throw_away);
+
+ spin_lock(&inode_lock);
+- busy = invalidate_list(&inode_in_use, sb, &throw_away);
+- busy |= invalidate_list(&inode_unused, sb, &throw_away);
+- busy |= invalidate_list(&sb->s_dirty, sb, &throw_away);
+- busy |= invalidate_list(&sb->s_locked_inodes, sb, &throw_away);
++ busy = invalidate_list(&inode_in_use, sb, &throw_away, show);
++ busy |= invalidate_list(&inode_unused, sb, &throw_away, show);
++ busy |= invalidate_list(&sb->s_dirty, sb, &throw_away, show);
++ busy |= invalidate_list(&sb->s_locked_inodes, sb, &throw_away, show);
+ spin_unlock(&inode_lock);
+
+ dispose_list(&throw_away);
+@@ -637,7 +644,7 @@
+ * hold).
+ */
+ shrink_dcache_sb(sb);
+- res = invalidate_inodes(sb);
++ res = invalidate_inodes(sb, 0);
+ drop_super(sb);
+ }
+ invalidate_buffers(dev);
+Index: linux.mcp2/fs/super.c
+===================================================================
+--- linux.mcp2.orig/fs/super.c 2004-01-19 07:49:43.000000000 -0800
++++ linux.mcp2/fs/super.c 2004-05-05 14:32:06.000000000 -0700
+@@ -838,7 +838,7 @@
+ lock_super(sb);
+ lock_kernel();
+ sb->s_flags &= ~MS_ACTIVE;
+- invalidate_inodes(sb); /* bad name - it should be evict_inodes() */
++ invalidate_inodes(sb, 0); /* bad name - it should be evict_inodes() */
+ if (sop) {
+ if (sop->write_super && sb->s_dirt)
+ sop->write_super(sb);
+@@ -847,7 +847,7 @@
+ }
+
+ /* Forget any remaining inodes */
+- if (invalidate_inodes(sb)) {
++ if (invalidate_inodes(sb, 1)) {
+ printk(KERN_ERR "VFS: Busy inodes after unmount. "
+ "Self-destruct in 5 seconds. Have a nice day...\n");
+ }
+Index: linux.mcp2/fs/smbfs/inode.c
+===================================================================
+--- linux.mcp2.orig/fs/smbfs/inode.c 2004-01-19 07:49:43.000000000 -0800
++++ linux.mcp2/fs/smbfs/inode.c 2004-05-05 14:31:31.000000000 -0700
+@@ -166,7 +166,7 @@
+ {
+ VERBOSE("\n");
+ shrink_dcache_sb(SB_of(server));
+- invalidate_inodes(SB_of(server));
++ invalidate_inodes(SB_of(server), 0);
+ }
+
+ /*
+Index: linux.mcp2/include/linux/fs.h
+===================================================================
+--- linux.mcp2.orig/include/linux/fs.h 2004-05-05 14:31:06.000000000 -0700
++++ linux.mcp2/include/linux/fs.h 2004-05-05 14:31:31.000000000 -0700
+@@ -1283,7 +1283,7 @@
+ extern void set_buffer_flushtime(struct buffer_head *);
+ extern void balance_dirty(void);
+ extern int check_disk_change(kdev_t);
+-extern int invalidate_inodes(struct super_block *);
++extern int invalidate_inodes(struct super_block *, int);
+ extern int invalidate_device(kdev_t, int);
+ extern void invalidate_inode_pages(struct inode *);
+ extern void invalidate_inode_pages2(struct address_space *);
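
The new second argument only controls diagnostics; the semantics are
otherwise unchanged. A sketch of the calling pattern, mirroring the super.c
hunk above where only the final "forget remaining inodes" pass asks for the
busy-inode dump (the wrapper function name is hypothetical):

static void drop_sb_inodes(struct super_block *sb)
{
	/* quiet pass during normal teardown */
	invalidate_inodes(sb, 0);
	/* ... final pass logs whatever is still busy ... */
	if (invalidate_inodes(sb, 1))
		printk(KERN_ERR "VFS: busy inodes remain\n");
}
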
--- /dev/null
+ fs/Makefile | 2 +-
+ fs/inode.c | 4 +++-
+ mm/page_alloc.c | 1 +
+ 3 files changed, 5 insertions(+), 2 deletions(-)
+
+Index: linux-ion/fs/inode.c
+===================================================================
+--- linux-ion.orig/fs/inode.c 2004-09-27 14:58:03.000000000 -0700
++++ linux-ion/fs/inode.c 2004-09-27 14:58:34.000000000 -0700
+@@ -5,6 +5,7 @@
+ */
+
+ #include <linux/config.h>
++#include <linux/module.h>
+ #include <linux/fs.h>
+ #include <linux/string.h>
+ #include <linux/mm.h>
+@@ -66,7 +67,8 @@
+ * NOTE! You also have to own the lock if you change
+ * the i_state of an inode while it is in use..
+ */
+-static spinlock_t inode_lock = SPIN_LOCK_UNLOCKED;
++spinlock_t inode_lock = SPIN_LOCK_UNLOCKED;
++EXPORT_SYMBOL(inode_lock);
+
+ /*
+ * Statistics gathering..
+Index: linux-ion/fs/Makefile
+===================================================================
+--- linux-ion.orig/fs/Makefile 2004-07-28 14:34:57.000000000 -0700
++++ linux-ion/fs/Makefile 2004-09-27 14:59:37.000000000 -0700
+@@ -7,7 +7,7 @@
+
+ O_TARGET := fs.o
+
+-export-objs := filesystems.o open.o dcache.o buffer.o
++export-objs := filesystems.o open.o dcache.o buffer.o inode.o
+ mod-subdirs := nls
+
+ obj-y := open.o read_write.o devices.o file_table.o buffer.o \
+Index: linux-ion/mm/page_alloc.c
+===================================================================
+--- linux-ion.orig/mm/page_alloc.c 2004-07-28 14:34:57.000000000 -0700
++++ linux-ion/mm/page_alloc.c 2004-09-27 14:58:34.000000000 -0700
+@@ -28,6 +28,7 @@
+ LIST_HEAD(inactive_list);
+ LIST_HEAD(active_list);
+ pg_data_t *pgdat_list;
++EXPORT_SYMBOL(pgdat_list);
+
+ /* Used to look up the address of the struct zone encoded in page->zone */
+ zone_t *zone_table[MAX_NR_ZONES*MAX_NR_NODES];
--- /dev/null
+ Documentation/filesystems/ext2.txt | 16 ++
+ fs/ext3/Makefile | 2
+ fs/ext3/inode.c | 4
+ fs/ext3/iopen.c | 259 +++++++++++++++++++++++++++++++++++++
+ fs/ext3/iopen.h | 13 +
+ fs/ext3/namei.c | 13 +
+ fs/ext3/super.c | 11 +
+ include/linux/ext3_fs.h | 2
+ 8 files changed, 318 insertions(+), 2 deletions(-)
+
+Index: linux-2.4.19/Documentation/filesystems/ext2.txt
+===================================================================
+--- linux-2.4.19.orig/Documentation/filesystems/ext2.txt 2001-07-11 18:44:45.000000000 -0400
++++ linux-2.4.19/Documentation/filesystems/ext2.txt 2004-04-23 22:37:48.000000000 -0400
+@@ -35,6 +35,22 @@
+
+ sb=n Use alternate superblock at this location.
+
++iopen Makes an invisible pseudo-directory called
++ __iopen__ available in the root directory
++ of the filesystem. Allows open-by-inode-
++			number; e.g., inode 3145 can be accessed
++ via /mntpt/__iopen__/3145
++
++iopen_nopriv	This option makes the iopen directory
++		world-readable. This may be safer since it
++		allows daemons to run as an unprivileged user;
++		however, it significantly changes the security
++		model of a Unix filesystem, since previously
++		all files under a mode 700 directory were
++		unavailable even if the files themselves
++		had world-readable
++		permissions.
++
+ grpquota,noquota,quota,usrquota Quota options are silently ignored by ext2.
+
+
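
Given the documentation above, no new syscall is needed to open by inode
number; an ordinary open() on the pseudo-path does it. A minimal userspace
sketch (mount point and inode number are illustrative):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* inode 3145 on the filesystem mounted (with -o iopen) at /mntpt */
	int fd = open("/mntpt/__iopen__/3145", O_RDONLY);

	if (fd < 0) {
		perror("open by inode");
		return 1;
	}
	/* ... use fd like any other file descriptor ... */
	close(fd);
	return 0;
}
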
+Index: linux.mcp2/fs/ext3/Makefile
+===================================================================
+--- linux.mcp2.orig/fs/ext3/Makefile 2004-05-17 15:20:52.000000000 -0700
++++ linux.mcp2/fs/ext3/Makefile 2004-05-17 15:21:55.000000000 -0700
+@@ -11,7 +11,7 @@
+
+ export-objs := ext3-exports.o
+
+-obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
++obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \
+ ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o
+ obj-m := $(O_TARGET)
+
+Index: linux.mcp2/fs/ext3/inode.c
+===================================================================
+--- linux.mcp2.orig/fs/ext3/inode.c 2004-05-17 15:20:59.000000000 -0700
++++ linux.mcp2/fs/ext3/inode.c 2004-05-17 15:21:55.000000000 -0700
+@@ -31,6 +31,7 @@
+ #include <linux/highuid.h>
+ #include <linux/quotaops.h>
+ #include <linux/module.h>
++#include "iopen.h"
+
+ /*
+ * SEARCH_FROM_ZERO forces each block allocation to search from the start
+@@ -2125,6 +2126,9 @@
+ struct buffer_head *bh;
+ int block;
+
++ if (ext3_iopen_get_inode(inode))
++ return;
++
+ if(ext3_get_inode_loc(inode, &iloc))
+ goto bad_inode;
+ bh = iloc.bh;
+Index: linux.mcp2/fs/ext3/iopen.c
+===================================================================
+--- linux.mcp2.orig/fs/ext3/iopen.c 2002-04-11 07:25:15.000000000 -0700
++++ linux.mcp2/fs/ext3/iopen.c 2004-05-17 15:21:55.000000000 -0700
+@@ -0,0 +1,282 @@
++/*
++ * linux/fs/ext3/iopen.c
++ *
++ * Special support for open by inode number
++ *
++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu).
++ *
++ * This file may be redistributed under the terms of the GNU General
++ * Public License.
++ *
++ *
++ * Invariants:
++ * - there is only ever a single DCACHE_NFSD_DISCONNECTED dentry alias
++ * for an inode at one time.
++ * - there are never both connected and DCACHE_NFSD_DISCONNECTED dentry
++ * aliases on an inode at the same time.
++ *
++ * If we have any connected dentry aliases for an inode, use one of those
++ * in iopen_lookup(). Otherwise, we instantiate a single NFSD_DISCONNECTED
++ * dentry for this inode, which thereafter will be found by the dcache
++ * when looking up this inode number in __iopen__, so we don't return here
++ * until it is gone.
++ *
++ * If we get an inode via a regular name lookup, then we "rename" the
++ * NFSD_DISCONNECTED dentry to the proper name and parent. This ensures
++ * existing users of the disconnected dentry will continue to use the same
++ * dentry as the connected users, and there will never be both kinds of
++ * dentry aliases at one time.
++ */
++
++#include <linux/sched.h>
++#include <linux/fs.h>
++#include <linux/locks.h>
++#include <linux/ext3_jbd.h>
++#include <linux/jbd.h>
++#include <linux/ext3_fs.h>
++#include <linux/smp_lock.h>
++#include "iopen.h"
++
++#ifndef assert
++#define assert(test) J_ASSERT(test)
++#endif
++
++#define IOPEN_NAME_LEN 32
++
++/*
++ * This implements looking up an inode by number.
++ */
++static struct dentry *iopen_lookup(struct inode *dir, struct dentry *dentry)
++{
++ struct inode *inode;
++ unsigned long ino;
++ struct list_head *lp;
++ struct dentry *alternate;
++ char buf[IOPEN_NAME_LEN];
++
++ if (dentry->d_name.len >= IOPEN_NAME_LEN)
++ return ERR_PTR(-ENAMETOOLONG);
++
++ memcpy(buf, dentry->d_name.name, dentry->d_name.len);
++ buf[dentry->d_name.len] = 0;
++
++ if (strcmp(buf, ".") == 0)
++ ino = dir->i_ino;
++ else if (strcmp(buf, "..") == 0)
++ ino = EXT3_ROOT_INO;
++ else
++ ino = simple_strtoul(buf, 0, 0);
++
++ if ((ino != EXT3_ROOT_INO &&
++ //ino != EXT3_ACL_IDX_INO &&
++ //ino != EXT3_ACL_DATA_INO &&
++ ino < EXT3_FIRST_INO(dir->i_sb)) ||
++ ino > le32_to_cpu(dir->i_sb->u.ext3_sb.s_es->s_inodes_count))
++ return ERR_PTR(-ENOENT);
++
++ inode = iget(dir->i_sb, ino);
++ if (!inode)
++ return ERR_PTR(-EACCES);
++ if (is_bad_inode(inode)) {
++ iput(inode);
++ return ERR_PTR(-ENOENT);
++ }
++
++ assert(list_empty(&dentry->d_alias)); /* d_instantiate */
++ assert(list_empty(&dentry->d_hash)); /* d_rehash */
++
++	/* preferably return a connected dentry */
++ spin_lock(&dcache_lock);
++ list_for_each(lp, &inode->i_dentry) {
++ alternate = list_entry(lp, struct dentry, d_alias);
++ assert(!(alternate->d_flags & DCACHE_NFSD_DISCONNECTED));
++ }
++
++ if (!list_empty(&inode->i_dentry)) {
++ alternate = list_entry(inode->i_dentry.next,
++ struct dentry, d_alias);
++ dget_locked(alternate);
++ alternate->d_vfs_flags |= DCACHE_REFERENCED;
++ iput(inode);
++ spin_unlock(&dcache_lock);
++ return alternate;
++ }
++ dentry->d_flags |= DCACHE_NFSD_DISCONNECTED;
++
++ /* d_add(), but don't drop dcache_lock before adding dentry to inode */
++ list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */
++ dentry->d_inode = inode;
++
++ __d_rehash(dentry, 0); /* d_rehash */
++ spin_unlock(&dcache_lock);
++
++ return NULL;
++}
++
++#define do_switch(x,y) do { \
++ __typeof__ (x) __tmp = x; \
++ x = y; y = __tmp; } while (0)
++
++static inline void switch_names(struct dentry *dentry, struct dentry *target)
++{
++ const unsigned char *old_name, *new_name;
++
++ memcpy(dentry->d_iname, target->d_iname, DNAME_INLINE_LEN);
++ old_name = target->d_name.name;
++ new_name = dentry->d_name.name;
++ if (old_name == target->d_iname)
++ old_name = dentry->d_iname;
++ if (new_name == dentry->d_iname)
++ new_name = target->d_iname;
++ target->d_name.name = new_name;
++ dentry->d_name.name = old_name;
++}
++
++/* This function is spliced into ext3_lookup; if a disconnected dentry
++ * exists for this inode, it is moved into place as the connected dentry.
++ */
++struct dentry *iopen_connect_dentry(struct dentry *dentry, struct inode *inode,
++ int rehash)
++{
++ struct dentry *tmp, *goal = NULL;
++ struct list_head *lp;
++
++ /* verify this dentry is really new */
++ assert(dentry->d_inode == NULL);
++ assert(list_empty(&dentry->d_alias)); /* d_instantiate */
++ if (rehash)
++ assert(list_empty(&dentry->d_hash)); /* d_rehash */
++ assert(list_empty(&dentry->d_subdirs));
++
++ spin_lock(&dcache_lock);
++ if (!inode)
++ goto do_rehash;
++
++	/* preferably return a connected dentry */
++ list_for_each(lp, &inode->i_dentry) {
++ tmp = list_entry(lp, struct dentry, d_alias);
++ if (tmp->d_flags & DCACHE_NFSD_DISCONNECTED) {
++ assert(tmp->d_alias.next == &inode->i_dentry);
++ assert(tmp->d_alias.prev == &inode->i_dentry);
++ goal = tmp;
++ dget_locked(goal);
++ break;
++ }
++ }
++
++ if (!goal)
++ goto do_instantiate;
++
++	/* Move the goal to the dentry hash queue - like d_move() */
++ goal->d_flags &= ~DCACHE_NFSD_DISCONNECTED;
++ list_del_init(&goal->d_hash);
++
++ list_del(&goal->d_child);
++ list_del(&dentry->d_child);
++
++ /* Switch the parents and the names.. */
++ switch_names(goal, dentry);
++ do_switch(goal->d_parent, dentry->d_parent);
++ do_switch(goal->d_name.len, dentry->d_name.len);
++ do_switch(goal->d_name.hash, dentry->d_name.hash);
++
++ /* And add them back to the (new) parent lists */
++ list_add(&goal->d_child, &goal->d_parent->d_subdirs);
++ list_add(&dentry->d_child, &dentry->d_parent->d_subdirs);
++ __d_rehash(goal, 0);
++ spin_unlock(&dcache_lock);
++ iput(inode);
++
++ return goal;
++
++ /* d_add(), but don't drop dcache_lock before adding dentry to inode */
++do_instantiate:
++ list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */
++ dentry->d_inode = inode;
++do_rehash:
++ if (rehash)
++ __d_rehash(dentry, 0); /* d_rehash */
++ spin_unlock(&dcache_lock);
++
++ return NULL;
++}
++
++/*
++ * These are the special structures for the iopen pseudo directory.
++ */
++
++static struct inode_operations iopen_inode_operations = {
++ lookup: iopen_lookup, /* BKL held */
++};
++
++static struct file_operations iopen_file_operations = {
++ read: generic_read_dir,
++};
++
++static int match_dentry(struct dentry *dentry, const char *name)
++{
++ int len;
++
++ len = strlen(name);
++ if (dentry->d_name.len != len)
++ return 0;
++ if (strncmp(dentry->d_name.name, name, len))
++ return 0;
++ return 1;
++}
++
++/*
++ * This function is spliced into ext3_lookup; it returns 1 if the file
++ * name is __iopen__, in which case the dentry has been filled in.
++ */
++int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry)
++{
++ struct inode *inode;
++
++ if (dir->i_ino != EXT3_ROOT_INO ||
++ !test_opt(dir->i_sb, IOPEN) ||
++ !match_dentry(dentry, "__iopen__"))
++ return 0;
++
++ inode = iget(dir->i_sb, EXT3_BAD_INO);
++
++ if (!inode)
++ return 0;
++ d_add(dentry, inode);
++ return 1;
++}
++
++/*
++ * This function is spliced into read_inode; it returns 1 if the inode
++ * number is the one for /__iopen__, in which case the inode is filled
++ * in appropriately. Otherwise, this function returns 0.
++ */
++int ext3_iopen_get_inode(struct inode *inode)
++{
++ if (inode->i_ino != EXT3_BAD_INO)
++ return 0;
++
++ inode->i_mode = S_IFDIR | S_IRUSR | S_IXUSR;
++ if (test_opt(inode->i_sb, IOPEN_NOPRIV))
++ inode->i_mode |= 0777;
++ inode->i_uid = 0;
++ inode->i_gid = 0;
++ inode->i_nlink = 1;
++ inode->i_size = 4096;
++ inode->i_atime = CURRENT_TIME;
++ inode->i_ctime = CURRENT_TIME;
++ inode->i_mtime = CURRENT_TIME;
++ inode->u.ext3_i.i_dtime = 0;
++ inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size
++ * (for stat), not the fs block
++ * size */
++ inode->i_blocks = 0;
++ inode->i_version = 1;
++ inode->i_generation = 0;
++
++ inode->i_op = &iopen_inode_operations;
++ inode->i_fop = &iopen_file_operations;
++ inode->i_mapping->a_ops = 0;
++
++ return 1;
++}
+Index: linux.mcp2/fs/ext3/iopen.h
+===================================================================
+--- linux.mcp2.orig/fs/ext3/iopen.h 2002-04-11 07:25:15.000000000 -0700
++++ linux.mcp2/fs/ext3/iopen.h 2004-05-17 15:21:55.000000000 -0700
+@@ -0,0 +1,15 @@
++/*
++ * iopen.h
++ *
++ * Special support for opening files by inode number.
++ *
++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu).
++ *
++ * This file may be redistributed under the terms of the GNU General
++ * Public License.
++ */
++
++extern int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry);
++extern int ext3_iopen_get_inode(struct inode *inode);
++extern struct dentry *iopen_connect_dentry(struct dentry *dentry,
++ struct inode *inode, int rehash);
+Index: linux.mcp2/fs/ext3/namei.c
+===================================================================
+--- linux.mcp2.orig/fs/ext3/namei.c 2004-05-17 15:20:59.000000000 -0700
++++ linux.mcp2/fs/ext3/namei.c 2004-05-17 15:21:55.000000000 -0700
+@@ -35,7 +35,7 @@
+ #include <linux/string.h>
+ #include <linux/locks.h>
+ #include <linux/quotaops.h>
+-
++#include "iopen.h"
+
+ /*
+ * define how far ahead to read directories while searching them.
+@@ -931,6 +931,9 @@
+ if (dentry->d_name.len > EXT3_NAME_LEN)
+ return ERR_PTR(-ENAMETOOLONG);
+
++ if (ext3_check_for_iopen(dir, dentry))
++ return NULL;
++
+ bh = ext3_find_entry(dentry, &de);
+ inode = NULL;
+ if (bh) {
+@@ -942,8 +945,8 @@
+ return ERR_PTR(-EACCES);
+ }
+ }
+- d_add(dentry, inode);
+- return NULL;
++
++ return iopen_connect_dentry(dentry, inode, 1);
+ }
+
+ #define S_SHIFT 12
+@@ -1932,10 +1935,6 @@
+ inode->i_nlink);
+ inode->i_version = ++event;
+ inode->i_nlink = 0;
+- /* There's no need to set i_disksize: the fact that i_nlink is
+- * zero will ensure that the right thing happens during any
+- * recovery. */
+- inode->i_size = 0;
+ ext3_orphan_add(handle, inode);
+ ext3_mark_inode_dirty(handle, inode);
+ dir->i_nlink--;
+@@ -2054,6 +2053,23 @@
+ return err;
+ }
+
++/* Like ext3_add_nondir() except for call to iopen_connect_dentry */
++static int ext3_add_link(handle_t *handle, struct dentry *dentry,
++ struct inode *inode)
++{
++ int err = ext3_add_entry(handle, dentry, inode);
++ if (!err) {
++ err = ext3_mark_inode_dirty(handle, inode);
++ if (err == 0) {
++ dput(iopen_connect_dentry(dentry, inode, 0));
++ return 0;
++ }
++ }
++ ext3_dec_count(handle, inode);
++ iput(inode);
++ return err;
++}
++
+ static int ext3_link (struct dentry * old_dentry,
+ struct inode * dir, struct dentry *dentry)
+ {
+@@ -2081,7 +2097,8 @@
+ ext3_inc_count(handle, inode);
+ atomic_inc(&inode->i_count);
+
+- err = ext3_add_nondir(handle, dentry, inode);
++ err = ext3_add_link(handle, dentry, inode);
++ ext3_orphan_del(handle, inode);
+ ext3_journal_stop(handle, dir);
+ return err;
+ }
+Index: linux.mcp2/fs/ext3/super.c
+===================================================================
+--- linux.mcp2.orig/fs/ext3/super.c 2004-05-17 15:20:59.000000000 -0700
++++ linux.mcp2/fs/ext3/super.c 2004-05-17 15:21:55.000000000 -0700
+@@ -836,6 +836,18 @@
+ || !strcmp (this_char, "quota")
+ || !strcmp (this_char, "usrquota"))
+ /* Don't do anything ;-) */ ;
++ else if (!strcmp (this_char, "iopen")) {
++ set_opt (sbi->s_mount_opt, IOPEN);
++ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV);
++ }
++ else if (!strcmp (this_char, "noiopen")) {
++ clear_opt (sbi->s_mount_opt, IOPEN);
++ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV);
++ }
++ else if (!strcmp (this_char, "iopen_nopriv")) {
++ set_opt (sbi->s_mount_opt, IOPEN);
++ set_opt (sbi->s_mount_opt, IOPEN_NOPRIV);
++ }
+ else if (!strcmp (this_char, "journal")) {
+ /* @@@ FIXME */
+ /* Eventually we will want to be able to create
+Index: linux.mcp2/include/linux/ext3_fs.h
+===================================================================
+--- linux.mcp2.orig/include/linux/ext3_fs.h 2004-05-17 15:20:59.000000000 -0700
++++ linux.mcp2/include/linux/ext3_fs.h 2004-05-17 15:21:55.000000000 -0700
+@@ -323,6 +323,8 @@
+ #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */
+ #define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */
+ #define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */
++#define EXT3_MOUNT_IOPEN 0x40000 /* Allow access via iopen */
++#define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */
+
+ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
+ #ifndef _LINUX_EXT2_FS_H
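
The option parsing and the two EXT3_MOUNT_IOPEN* flags above are driven by
the ordinary mount data string; a sketch of selecting them from userspace
(device and mount point are illustrative):

#include <sys/mount.h>

int enable_iopen_nopriv(void)
{
	/* "iopen_nopriv" sets both IOPEN and IOPEN_NOPRIV;
	 * "noiopen" clears both again. */
	return mount("/dev/hda1", "/mnt", "ext3", MS_MGC_VAL, "iopen_nopriv");
}
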
--- /dev/null
+Index: linux-2.4.19-pre1/include/linux/jbd.h
+===================================================================
+--- linux-2.4.19-pre1.orig/include/linux/jbd.h 2003-11-21 03:00:11.000000000 +0300
++++ linux-2.4.19-pre1/include/linux/jbd.h 2003-11-21 03:04:47.000000000 +0300
+@@ -275,6 +275,13 @@
+ return bh->b_private;
+ }
+
++#define HAVE_JOURNAL_CALLBACK_STATUS
++struct journal_callback {
++ struct list_head jcb_list;
++ void (*jcb_func)(struct journal_callback *jcb, int error);
++ /* user data goes here */
++};
++
+ struct jbd_revoke_table_s;
+
+ /* The handle_t type represents a single atomic update being performed
+@@ -305,6 +312,12 @@
+ operations */
+ int h_err;
+
++ /* List of application registered callbacks for this handle.
++ * The function(s) will be called after the transaction that
++ * this handle is part of has been committed to disk.
++ */
++ struct list_head h_jcb;
++
+ /* Flags */
+ unsigned int h_sync: 1; /* sync-on-close */
+ unsigned int h_jdata: 1; /* force data journaling */
+@@ -424,6 +437,10 @@
+
+ /* How many handles used this transaction? */
+ int t_handle_count;
++
++ /* List of registered callback functions for this transaction.
++ * Called when the transaction is committed. */
++ struct list_head t_jcb;
+ };
+
+
+@@ -672,6 +689,9 @@
+ extern int journal_try_to_free_buffers(journal_t *, struct page *, int);
+ extern int journal_stop(handle_t *);
+ extern int journal_flush (journal_t *);
++extern void journal_callback_set(handle_t *handle,
++ void (*fn)(struct journal_callback *,int),
++ struct journal_callback *jcb);
+
+ extern void journal_lock_updates (journal_t *);
+ extern void journal_unlock_updates (journal_t *);
+Index: linux-2.4.19-pre1/fs/jbd/checkpoint.c
+===================================================================
+--- linux-2.4.19-pre1.orig/fs/jbd/checkpoint.c 2003-11-21 02:53:20.000000000 +0300
++++ linux-2.4.19-pre1/fs/jbd/checkpoint.c 2003-11-21 03:04:47.000000000 +0300
+@@ -601,7 +601,8 @@
+ J_ASSERT (transaction->t_log_list == NULL);
+ J_ASSERT (transaction->t_checkpoint_list == NULL);
+ J_ASSERT (transaction->t_updates == 0);
+-
++ J_ASSERT (list_empty(&transaction->t_jcb));
++
+ J_ASSERT (transaction->t_journal->j_committing_transaction !=
+ transaction);
+
+Index: linux-2.4.19-pre1/fs/jbd/commit.c
+===================================================================
+--- linux-2.4.19-pre1.orig/fs/jbd/commit.c 2003-11-21 02:53:20.000000000 +0300
++++ linux-2.4.19-pre1/fs/jbd/commit.c 2003-11-21 03:04:47.000000000 +0300
+@@ -480,7 +480,7 @@
+ transaction's t_log_list queue, and metadata buffers are on
+ the t_iobuf_list queue.
+
+- Wait for the transactions in reverse order. That way we are
++ Wait for the buffers in reverse order. That way we are
+ less likely to be woken up until all IOs have completed, and
+ so we incur less scheduling load.
+ */
+@@ -571,8 +571,10 @@
+
+ jbd_debug(3, "JBD: commit phase 6\n");
+
+- if (is_journal_aborted(journal))
++ if (is_journal_aborted(journal)) {
++ unlock_journal(journal);
+ goto skip_commit;
++ }
+
+ /* Done it all: now write the commit record. We should have
+ * cleaned up our previous buffers by now, so if we are in abort
+@@ -582,9 +584,10 @@
+ descriptor = journal_get_descriptor_buffer(journal);
+ if (!descriptor) {
+ __journal_abort_hard(journal);
++ unlock_journal(journal);
+ goto skip_commit;
+ }
+-
++
+ /* AKPM: buglet - add `i' to tmp! */
+ for (i = 0; i < jh2bh(descriptor)->b_size; i += 512) {
+ journal_header_t *tmp =
+@@ -605,14 +608,32 @@
+ put_bh(bh); /* One for getblk() */
+ journal_unlock_journal_head(descriptor);
+ }
+- lock_journal(journal);
+
+ /* End of a transaction! Finally, we can do checkpoint
+ processing: any buffers committed as a result of this
+ transaction can be removed from any checkpoint list it was on
+ before. */
+
+-skip_commit:
++skip_commit: /* The journal should be unlocked by now. */
++
++ /* Call any callbacks that had been registered for handles in this
++ * transaction. It is up to the callback to free any allocated
++ * memory.
++ */
++ if (!list_empty(&commit_transaction->t_jcb)) {
++ struct list_head *p, *n;
++ int error = is_journal_aborted(journal);
++
++ list_for_each_safe(p, n, &commit_transaction->t_jcb) {
++ struct journal_callback *jcb;
++
++ jcb = list_entry(p, struct journal_callback, jcb_list);
++ list_del(p);
++ jcb->jcb_func(jcb, error);
++ }
++ }
++
++ lock_journal(journal);
+
+ jbd_debug(3, "JBD: commit phase 7\n");
+
+Index: linux-2.4.19-pre1/fs/jbd/journal.c
+===================================================================
+--- linux-2.4.19-pre1.orig/fs/jbd/journal.c 2003-11-21 02:53:20.000000000 +0300
++++ linux-2.4.19-pre1/fs/jbd/journal.c 2003-11-21 03:04:47.000000000 +0300
+@@ -58,6 +58,7 @@
+ #endif
+ EXPORT_SYMBOL(journal_flush);
+ EXPORT_SYMBOL(journal_revoke);
++EXPORT_SYMBOL(journal_callback_set);
+
+ EXPORT_SYMBOL(journal_init_dev);
+ EXPORT_SYMBOL(journal_init_inode);
+Index: linux-2.4.19-pre1/fs/jbd/transaction.c
+===================================================================
+--- linux-2.4.19-pre1.orig/fs/jbd/transaction.c 2003-11-21 02:53:20.000000000 +0300
++++ linux-2.4.19-pre1/fs/jbd/transaction.c 2003-11-21 03:05:14.000000000 +0300
+@@ -57,6 +57,7 @@
+ transaction->t_state = T_RUNNING;
+ transaction->t_tid = journal->j_transaction_sequence++;
+ transaction->t_expires = jiffies + journal->j_commit_interval;
++ INIT_LIST_HEAD(&transaction->t_jcb);
+
+ /* Set up the commit timer for the new transaction. */
+ J_ASSERT (!journal->j_commit_timer_active);
+@@ -90,7 +91,14 @@
+ transaction_t *transaction;
+ int needed;
+ int nblocks = handle->h_buffer_credits;
+-
++
++ if (nblocks > journal->j_max_transaction_buffers) {
++ jbd_debug(1, "JBD: %s wants too many credits (%d > %d)\n",
++ current->comm, nblocks,
++ journal->j_max_transaction_buffers);
++ return -ENOSPC;
++ }
++
+ jbd_debug(3, "New handle %p going live.\n", handle);
+
+ repeat:
+@@ -196,6 +204,20 @@
+ return 0;
+ }
+
++/* Allocate a new handle. This should probably be in a slab... */
++static handle_t *new_handle(int nblocks)
++{
++ handle_t *handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS);
++ if (!handle)
++ return NULL;
++ memset(handle, 0, sizeof (handle_t));
++ handle->h_buffer_credits = nblocks;
++ handle->h_ref = 1;
++ INIT_LIST_HEAD(&handle->h_jcb);
++
++ return handle;
++}
++
+ /*
+ * Obtain a new handle.
+ *
+@@ -222,14 +244,11 @@
+ handle->h_ref++;
+ return handle;
+ }
+-
+- handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS);
++
++ handle = new_handle(nblocks);
+ if (!handle)
+ return ERR_PTR(-ENOMEM);
+- memset (handle, 0, sizeof (handle_t));
+
+- handle->h_buffer_credits = nblocks;
+- handle->h_ref = 1;
+ current->journal_info = handle;
+
+ err = start_this_handle(journal, handle);
+@@ -328,14 +347,11 @@
+
+ if (is_journal_aborted(journal))
+ return ERR_PTR(-EIO);
+-
+- handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS);
++
++ handle = new_handle(nblocks);
+ if (!handle)
+ return ERR_PTR(-ENOMEM);
+- memset (handle, 0, sizeof (handle_t));
+
+- handle->h_buffer_credits = nblocks;
+- handle->h_ref = 1;
+ current->journal_info = handle;
+
+ err = try_start_this_handle(journal, handle);
+@@ -1324,6 +1340,28 @@
+ #endif
+
+ /*
++ * Register a callback function for this handle. The function will be
++ * called when the transaction that this handle is part of has been
++ * committed to disk with the original callback data struct and the
++ * error status of the journal as parameters. There is no guarantee of
++ * ordering between handles within a single transaction, nor between
++ * callbacks registered on the same handle.
++ *
++ * The caller is responsible for allocating the journal_callback struct.
++ * This is to allow the caller to add as much extra data to the callback
++ * as needed while avoiding the overhead of multiple allocations. The
++ * caller-allocated struct must start with a struct journal_callback at
++ * offset 0, with the caller-specific data afterwards.
++ */
++void journal_callback_set(handle_t *handle,
++ void (*func)(struct journal_callback *jcb, int error),
++ struct journal_callback *jcb)
++{
++ list_add_tail(&jcb->jcb_list, &handle->h_jcb);
++ jcb->jcb_func = func;
++}
++
++/*
+ * All done for a particular handle.
+ *
+ * There is not much action needed here. We just return any remaining
+@@ -1389,7 +1427,10 @@
+ wake_up(&journal->j_wait_transaction_locked);
+ }
+
+- /*
++ /* Move callbacks from the handle to the transaction. */
++ list_splice(&handle->h_jcb, &transaction->t_jcb);
++
++ /*
+ * If the handle is marked SYNC, we need to set another commit
+ * going! We also want to force a commit if the current
+ * transaction is occupying too much of the log, or if the
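
The journal_callback_set() comment above prescribes the embedding pattern:
the caller's struct must begin with struct journal_callback so the callback
can cast back to its own type. A sketch with hypothetical names (struct
my_data, my_commit_cb):

#include <linux/jbd.h>
#include <linux/slab.h>

struct my_data {
	struct journal_callback md_jcb;	/* must stay at offset 0 */
	int md_cookie;			/* caller-specific data follows */
};

static void my_commit_cb(struct journal_callback *jcb, int error)
{
	struct my_data *md = (struct my_data *)jcb;

	/* Runs after the transaction commits; error is nonzero if the
	 * journal aborted.  The callback owns, and must free, the memory. */
	printk("commit callback: cookie %d error %d\n", md->md_cookie, error);
	kfree(md);
}

/* Called while the handle is still open, i.e. before journal_stop(). */
static void register_my_callback(handle_t *handle)
{
	struct my_data *md = kmalloc(sizeof(*md), GFP_NOFS);

	if (md == NULL)
		return;
	md->md_cookie = 42;
	journal_callback_set(handle, my_commit_cb, &md->md_jcb);
}
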
--- /dev/null
+Index: linux-bgl/arch/arm/vmlinux-armo.lds.in
+===================================================================
+--- linux-bgl.orig/arch/arm/vmlinux-armo.lds.in 2003-07-02 08:44:12.000000000 -0700
++++ linux-bgl/arch/arm/vmlinux-armo.lds.in 2004-10-26 22:52:50.037677957 -0700
+@@ -62,6 +62,10 @@
+ *(__ksymtab)
+ __stop___ksymtab = .;
+
++ __start___kallsyms = .; /* All kernel symbols */
++ *(__kallsyms)
++ __stop___kallsyms = .;
++
+ *(.got) /* Global offset table */
+
+ _etext = .; /* End of text section */
+Index: linux-bgl/arch/arm/vmlinux-armv.lds.in
+===================================================================
+--- linux-bgl.orig/arch/arm/vmlinux-armv.lds.in 2003-07-02 08:44:12.000000000 -0700
++++ linux-bgl/arch/arm/vmlinux-armv.lds.in 2004-10-26 22:52:50.038677801 -0700
+@@ -67,6 +67,12 @@
+ __stop___ksymtab = .;
+ }
+
++ __kallsyms : { /* Kernel debugging table */
++ __start___kallsyms = .; /* All kernel symbols */
++ *(__kallsyms)
++ __stop___kallsyms = .;
++ }
++
+ . = ALIGN(8192);
+
+ .data : {
+Index: linux-bgl/arch/ppc/config.in
+===================================================================
+--- linux-bgl.orig/arch/ppc/config.in 2004-10-04 09:55:49.000000000 -0700
++++ linux-bgl/arch/ppc/config.in 2004-10-26 23:11:56.416643929 -0700
+@@ -732,6 +732,7 @@
+ string 'Additional compile arguments' CONFIG_COMPILE_OPTIONS "-g -ggdb"
+ fi
+ fi
++bool 'Load all symbols for debugging/kksymoops' CONFIG_KALLSYMS
+
+ if [ "$CONFIG_ALL_PPC" = "y" ]; then
+ bool 'Support for early boot text console (BootX or OpenFirmware only)' CONFIG_BOOTX_TEXT
+Index: linux-bgl/arch/ppc/vmlinux.lds
+===================================================================
+--- linux-bgl.orig/arch/ppc/vmlinux.lds 2003-07-02 08:43:30.000000000 -0700
++++ linux-bgl/arch/ppc/vmlinux.lds 2004-10-26 22:52:50.043677020 -0700
+@@ -73,6 +73,10 @@
+ __ksymtab : { *(__ksymtab) }
+ __stop___ksymtab = .;
+
++ __start___kallsyms = .; /* All kernel symbols */
++ __kallsyms : { *(__kallsyms) }
++ __stop___kallsyms = .;
++
+ __start___ftr_fixup = .;
+ __ftr_fixup : { *(__ftr_fixup) }
+ __stop___ftr_fixup = .;
+Index: linux-bgl/arch/i386/config.in
+===================================================================
+--- linux-bgl.orig/arch/i386/config.in 2003-07-02 08:43:46.000000000 -0700
++++ linux-bgl/arch/i386/config.in 2004-10-26 22:52:50.040677488 -0700
+@@ -363,6 +363,7 @@
+ if [ "$CONFIG_ISDN" != "n" ]; then
+ source drivers/isdn/Config.in
+ fi
++ bool ' Load all symbols for debugging/kksymoops' CONFIG_KALLSYMS
+ fi
+ endmenu
+
+Index: linux-bgl/arch/i386/vmlinux.lds
+===================================================================
+--- linux-bgl.orig/arch/i386/vmlinux.lds 2003-07-02 08:44:32.000000000 -0700
++++ linux-bgl/arch/i386/vmlinux.lds 2004-10-26 22:52:50.040677488 -0700
+@@ -27,6 +27,9 @@
+ __start___ksymtab = .; /* Kernel symbol table */
+ __ksymtab : { *(__ksymtab) }
+ __stop___ksymtab = .;
++ __start___kallsyms = .; /* All kernel symbols */
++ __kallsyms : { *(__kallsyms) }
++ __stop___kallsyms = .;
+
+ .data : { /* Data */
+ *(.data)
+Index: linux-bgl/arch/ia64/config.in
+===================================================================
+--- linux-bgl.orig/arch/ia64/config.in 2003-07-02 08:44:12.000000000 -0700
++++ linux-bgl/arch/ia64/config.in 2004-10-26 22:52:50.055675147 -0700
+@@ -278,4 +278,6 @@
+ bool ' Turn on irq debug checks (slow!)' CONFIG_IA64_DEBUG_IRQ
+ fi
+
++bool ' Load all symbols for debugging/kksymoops' CONFIG_KALLSYMS
++
+ endmenu
+Index: linux-bgl/arch/alpha/vmlinux.lds.in
+===================================================================
+--- linux-bgl.orig/arch/alpha/vmlinux.lds.in 2003-07-02 08:43:45.000000000 -0700
++++ linux-bgl/arch/alpha/vmlinux.lds.in 2004-10-26 22:52:50.036678113 -0700
+@@ -28,6 +28,10 @@
+ __stop___ksymtab = .;
+ .kstrtab : { *(.kstrtab) }
+
++ __start___kallsyms = .; /* All kernel symbols */
++ __kallsyms : { *(__kallsyms) }
++ __stop___kallsyms = .;
++
+ /* Startup code */
+ . = ALIGN(8192);
+ __init_begin = .;
+Index: linux-bgl/Makefile
+===================================================================
+--- linux-bgl.orig/Makefile 2004-10-04 09:55:49.000000000 -0700
++++ linux-bgl/Makefile 2004-10-26 22:54:44.018588371 -0700
+@@ -38,10 +38,13 @@
+ MAKEFILES = $(TOPDIR)/.config
+ GENKSYMS = /sbin/genksyms
+ DEPMOD = /sbin/depmod
++KALLSYMS = /sbin/kallsyms
+ MODFLAGS = -DMODULE
+ CFLAGS_KERNEL =
+ PERL = perl
+
++TMPPREFIX =
++
+ export VERSION PATCHLEVEL SUBLEVEL EXTRAVERSION KERNELRELEASE ARCH \
+ CONFIG_SHELL TOPDIR HPATH HOSTCC HOSTCFLAGS CROSS_COMPILE AS LD CC \
+ CPP AR NM STRIP OBJCOPY OBJDUMP MAKE MAKEFILES GENKSYMS MODFLAGS PERL
+@@ -198,7 +201,7 @@
+ CLEAN_FILES = \
+ kernel/ksyms.lst include/linux/compile.h \
+ vmlinux System.map \
+- .tmp* \
++ $(TMPPREFIX).tmp* \
+ drivers/char/consolemap_deftbl.c drivers/video/promcon_tbl.c \
+ drivers/char/conmakehash \
+ drivers/char/drm/*-mod.c \
+@@ -278,16 +281,39 @@
+ boot: vmlinux
+ @$(MAKE) CFLAGS="$(CFLAGS) $(CFLAGS_KERNEL)" -C arch/$(ARCH)/boot
+
++LD_VMLINUX := $(LD) $(LINKFLAGS) $(HEAD) init/main.o init/version.o init/do_mounts.o \
++ --start-group \
++ $(CORE_FILES) \
++ $(DRIVERS) \
++ $(NETWORKS) \
++ $(LIBS) \
++ --end-group
++ifeq ($(CONFIG_KALLSYMS),y)
++LD_VMLINUX_KALLSYMS := $(TMPPREFIX).tmp_kallsyms3.o
++else
++LD_VMLINUX_KALLSYMS :=
++endif
++
+ vmlinux: include/linux/version.h $(CONFIGURATION) init/main.o init/version.o init/do_mounts.o linuxsubdirs
+- $(LD) $(LINKFLAGS) $(HEAD) init/main.o init/version.o init/do_mounts.o \
+- --start-group \
+- $(CORE_FILES) \
+- $(DRIVERS) \
+- $(NETWORKS) \
+- $(LIBS) \
+- --end-group \
+- -o vmlinux
++ @$(MAKE) CFLAGS="$(CFLAGS) $(CFLAGS_KERNEL)" kallsyms
++
++.PHONY: kallsyms
++
++kallsyms:
++ifeq ($(CONFIG_KALLSYMS),y)
++ @echo kallsyms pass 1
++ $(LD_VMLINUX) -o $(TMPPREFIX).tmp_vmlinux1
++ @$(KALLSYMS) $(TMPPREFIX).tmp_vmlinux1 > $(TMPPREFIX).tmp_kallsyms1.o
++ @echo kallsyms pass 2
++ @$(LD_VMLINUX) $(TMPPREFIX).tmp_kallsyms1.o -o $(TMPPREFIX).tmp_vmlinux2
++ @$(KALLSYMS) $(TMPPREFIX).tmp_vmlinux2 > $(TMPPREFIX).tmp_kallsyms2.o
++ @echo kallsyms pass 3
++ @$(LD_VMLINUX) $(TMPPREFIX).tmp_kallsyms2.o -o $(TMPPREFIX).tmp_vmlinux3
++ @$(KALLSYMS) $(TMPPREFIX).tmp_vmlinux3 > $(TMPPREFIX).tmp_kallsyms3.o
++endif
++ $(LD_VMLINUX) $(LD_VMLINUX_KALLSYMS) -o vmlinux
+ $(NM) vmlinux | grep -v '\(compiled\)\|\(\.o$$\)\|\( [aUw] \)\|\(\.\.ng$$\)\|\(LASH[RL]DI\)' | sort > System.map
++ @rm -f $(TMPPREFIX).tmp_vmlinux* $(TMPPREFIX).tmp_kallsyms*
+
+ symlinks:
+ rm -f include/asm
+Index: linux-bgl/kernel/Makefile
+===================================================================
+--- linux-bgl.orig/kernel/Makefile 2003-07-02 08:44:29.000000000 -0700
++++ linux-bgl/kernel/Makefile 2004-10-26 22:59:34.101037916 -0700
+@@ -19,6 +19,7 @@
+ obj-$(CONFIG_UID16) += uid16.o
+ obj-$(CONFIG_MODULES) += ksyms.o
+ obj-$(CONFIG_PM) += pm.o
++obj-$(CONFIG_KALLSYMS) += kallsyms.o
+
+ ifneq ($(CONFIG_IA64),y)
+ # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
+Index: linux-bgl/kernel/ksyms.c
+===================================================================
+--- linux-bgl.orig/kernel/ksyms.c 2004-10-26 21:49:59.922431839 -0700
++++ linux-bgl/kernel/ksyms.c 2004-10-26 22:52:50.050675927 -0700
+@@ -56,6 +56,9 @@
+ #ifdef CONFIG_KMOD
+ #include <linux/kmod.h>
+ #endif
++#ifdef CONFIG_KALLSYMS
++#include <linux/kallsyms.h>
++#endif
+
+ extern void set_device_ro(kdev_t dev,int flag);
+
+@@ -81,6 +84,15 @@
+ EXPORT_SYMBOL(inter_module_put);
+ EXPORT_SYMBOL(try_inc_mod_count);
+
++#ifdef CONFIG_KALLSYMS
++extern const char __start___kallsyms[];
++extern const char __stop___kallsyms[];
++EXPORT_SYMBOL(__start___kallsyms);
++EXPORT_SYMBOL(__stop___kallsyms);
++
++
++#endif
++
+ /* process memory management */
+ EXPORT_SYMBOL(do_mmap_pgoff);
+ EXPORT_SYMBOL(do_munmap);
+Index: linux-bgl/kernel/kallsyms.c
+===================================================================
+--- linux-bgl.orig/kernel/kallsyms.c 2004-10-26 17:10:51.404753448 -0700
++++ linux-bgl/kernel/kallsyms.c 2004-10-26 22:52:50.048676240 -0700
+@@ -0,0 +1,306 @@
++/* An example of using kallsyms data in a kernel debugger.
++
++ Copyright 2000 Keith Owens <kaos@ocs.com.au> April 2000
++
++ This file is part of the Linux modutils.
++
++ This program is free software; you can redistribute it and/or modify it
++ under the terms of the GNU General Public License as published by the
++ Free Software Foundation; either version 2 of the License, or (at your
++ option) any later version.
++
++ This program is distributed in the hope that it will be useful, but
++ WITHOUT ANY WARRANTY; without even the implied warranty of
++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ General Public License for more details.
++
++ You should have received a copy of the GNU General Public License
++ along with this program; if not, write to the Free Software Foundation,
++ Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
++ */
++
++#ident "$Id: kallsyms-2.4-bgl.patch,v 1.1.4.1 2004/10/29 00:51:21 jacob Exp $"
++
++/*
++ This code uses the list of all kernel and module symbols to :-
++
++ * Find any non-stack symbol in a kernel or module. Symbols do
++ not have to be exported for debugging.
++
++ * Convert an address to the module (or kernel) that owns it, the
++ section it is in and the nearest symbol. This finds all non-stack
++ symbols, not just exported ones.
++
++ You need modutils >= 2.3.11 and a kernel with the kallsyms patch
++ which was compiled with CONFIG_KALLSYMS.
++ */
++
++#include <linux/elf.h>
++#include <linux/kernel.h>
++#include <linux/module.h>
++#include <linux/string.h>
++#include <linux/kallsyms.h>
++
++/* These external symbols are only set on kernels compiled with
++ * CONFIG_KALLSYMS.
++ */
++
++extern const char __start___kallsyms[];
++extern const char __stop___kallsyms[];
++
++static struct module **kallsyms_module_list;
++
++static void kallsyms_get_module_list(void)
++{
++ const struct kallsyms_header *ka_hdr;
++ const struct kallsyms_section *ka_sec;
++ const struct kallsyms_symbol *ka_sym;
++ const char *ka_str;
++ int i;
++ const char *p;
++
++ if (__start___kallsyms >= __stop___kallsyms)
++ return;
++ ka_hdr = (struct kallsyms_header *)__start___kallsyms;
++ ka_sec = (struct kallsyms_section *)
++ ((char *)(ka_hdr) + ka_hdr->section_off);
++ ka_sym = (struct kallsyms_symbol *)
++ ((char *)(ka_hdr) + ka_hdr->symbol_off);
++ ka_str =
++ ((char *)(ka_hdr) + ka_hdr->string_off);
++
++ for (i = 0; i < ka_hdr->symbols; kallsyms_next_sym(ka_hdr, ka_sym), ++i) {
++ p = ka_str + ka_sym->name_off;
++ if (strcmp(p, "module_list") == 0) {
++ if (ka_sym->symbol_addr)
++ kallsyms_module_list = (struct module **)(ka_sym->symbol_addr);
++ break;
++ }
++ }
++}
++
++static inline void kallsyms_do_first_time(void)
++{
++ static int first_time = 1;
++ if (first_time)
++ kallsyms_get_module_list();
++ first_time = 0;
++}
++
++/* A symbol can appear in more than one module. A token is used to
++ * restart the scan at the next module, set the token to 0 for the
++ * first scan of each symbol.
++ */
++
++int kallsyms_symbol_to_address(
++ const char *name, /* Name to lookup */
++ unsigned long *token, /* Which module to start at */
++ const char **mod_name, /* Set to module name */
++ unsigned long *mod_start, /* Set to start address of module */
++ unsigned long *mod_end, /* Set to end address of module */
++ const char **sec_name, /* Set to section name */
++ unsigned long *sec_start, /* Set to start address of section */
++ unsigned long *sec_end, /* Set to end address of section */
++ const char **sym_name, /* Set to full symbol name */
++ unsigned long *sym_start, /* Set to start address of symbol */
++ unsigned long *sym_end /* Set to end address of symbol */
++ )
++{
++ const struct kallsyms_header *ka_hdr = NULL; /* stupid gcc */
++ const struct kallsyms_section *ka_sec;
++ const struct kallsyms_symbol *ka_sym = NULL;
++ const char *ka_str = NULL;
++ const struct module *m;
++ int i = 0, l;
++ const char *p, *pt_R;
++ char *p2;
++
++ kallsyms_do_first_time();
++ if (!kallsyms_module_list)
++ return(0);
++
++ /* Restart? */
++ m = *kallsyms_module_list;
++ if (token && *token) {
++ for (; m; m = m->next)
++ if ((unsigned long)m == *token)
++ break;
++ if (m)
++ m = m->next;
++ }
++
++ for (; m; m = m->next) {
++ if (!mod_member_present(m, kallsyms_start) ||
++ !mod_member_present(m, kallsyms_end) ||
++ m->kallsyms_start >= m->kallsyms_end)
++ continue;
++ ka_hdr = (struct kallsyms_header *)m->kallsyms_start;
++ ka_sym = (struct kallsyms_symbol *)
++ ((char *)(ka_hdr) + ka_hdr->symbol_off);
++ ka_str =
++ ((char *)(ka_hdr) + ka_hdr->string_off);
++ for (i = 0; i < ka_hdr->symbols; ++i, kallsyms_next_sym(ka_hdr, ka_sym)) {
++ p = ka_str + ka_sym->name_off;
++ if (strcmp(p, name) == 0)
++ break;
++ /* Unversioned requests match versioned names */
++ if (!(pt_R = strstr(p, "_R")))
++ continue;
++ l = strlen(pt_R);
++ if (l < 10)
++ continue; /* Not _R.*xxxxxxxx */
++ (void)simple_strtoul(pt_R+l-8, &p2, 16);
++ if (*p2)
++ continue; /* Not _R.*xxxxxxxx */
++ if (strncmp(p, name, pt_R-p) == 0)
++ break; /* Match with version */
++ }
++ if (i < ka_hdr->symbols)
++ break;
++ }
++
++ if (token)
++ *token = (unsigned long)m;
++ if (!m)
++ return(0); /* not found */
++
++ ka_sec = (const struct kallsyms_section *)
++ ((char *)ka_hdr + ka_hdr->section_off + ka_sym->section_off);
++ *mod_name = *(m->name) ? m->name : "kernel";
++ *mod_start = ka_hdr->start;
++ *mod_end = ka_hdr->end;
++ *sec_name = ka_sec->name_off + ka_str;
++ *sec_start = ka_sec->start;
++ *sec_end = ka_sec->start + ka_sec->size;
++ *sym_name = ka_sym->name_off + ka_str;
++ *sym_start = ka_sym->symbol_addr;
++ if (i < ka_hdr->symbols-1) {
++ const struct kallsyms_symbol *ka_symn = ka_sym;
++ kallsyms_next_sym(ka_hdr, ka_symn);
++ *sym_end = ka_symn->symbol_addr;
++ }
++ else
++ *sym_end = *sec_end;
++ return(1);
++}
++
++int kallsyms_address_to_symbol(
++ unsigned long address, /* Address to lookup */
++ const char **mod_name, /* Set to module name */
++ unsigned long *mod_start, /* Set to start address of module */
++ unsigned long *mod_end, /* Set to end address of module */
++ const char **sec_name, /* Set to section name */
++ unsigned long *sec_start, /* Set to start address of section */
++ unsigned long *sec_end, /* Set to end address of section */
++ const char **sym_name, /* Set to full symbol name */
++ unsigned long *sym_start, /* Set to start address of symbol */
++ unsigned long *sym_end /* Set to end address of symbol */
++ )
++{
++ const struct kallsyms_header *ka_hdr = NULL; /* stupid gcc */
++ const struct kallsyms_section *ka_sec = NULL;
++ const struct kallsyms_symbol *ka_sym;
++ const char *ka_str;
++ const struct module *m;
++ int i;
++ unsigned long end;
++
++ kallsyms_do_first_time();
++ if (!kallsyms_module_list)
++ return(0);
++
++ for (m = *kallsyms_module_list; m; m = m->next) {
++ if (!mod_member_present(m, kallsyms_start) ||
++ !mod_member_present(m, kallsyms_end) ||
++ m->kallsyms_start >= m->kallsyms_end)
++ continue;
++ ka_hdr = (struct kallsyms_header *)m->kallsyms_start;
++ ka_sec = (const struct kallsyms_section *)
++ ((char *)ka_hdr + ka_hdr->section_off);
++ /* Is the address in any section in this module? */
++ for (i = 0; i < ka_hdr->sections; ++i, kallsyms_next_sec(ka_hdr, ka_sec)) {
++ if (ka_sec->start <= address &&
++ (ka_sec->start + ka_sec->size) > address)
++ break;
++ }
++ if (i < ka_hdr->sections)
++ break; /* Found a matching section */
++ }
++
++ if (!m)
++ return(0); /* not found */
++
++ ka_sym = (struct kallsyms_symbol *)
++ ((char *)(ka_hdr) + ka_hdr->symbol_off);
++ ka_str =
++ ((char *)(ka_hdr) + ka_hdr->string_off);
++ *mod_name = *(m->name) ? m->name : "kernel";
++ *mod_start = ka_hdr->start;
++ *mod_end = ka_hdr->end;
++ *sec_name = ka_sec->name_off + ka_str;
++ *sec_start = ka_sec->start;
++ *sec_end = ka_sec->start + ka_sec->size;
++ *sym_name = *sec_name; /* In case we find no matching symbol */
++ *sym_start = *sec_start;
++ *sym_end = *sec_end;
++
++ for (i = 0; i < ka_hdr->symbols; ++i, kallsyms_next_sym(ka_hdr, ka_sym)) {
++ if (ka_sym->symbol_addr > address)
++ continue;
++ if (i < ka_hdr->symbols-1) {
++ const struct kallsyms_symbol *ka_symn = ka_sym;
++ kallsyms_next_sym(ka_hdr, ka_symn);
++ end = ka_symn->symbol_addr;
++ }
++ else
++ end = *sec_end;
++ if (end <= address)
++ continue;
++ if ((char *)ka_hdr + ka_hdr->section_off + ka_sym->section_off
++ != (char *)ka_sec)
++ continue; /* wrong section */
++ *sym_name = ka_str + ka_sym->name_off;
++ *sym_start = ka_sym->symbol_addr;
++ *sym_end = end;
++ break;
++ }
++ return(1);
++}
++
++/* List all sections in all modules. The callback routine is invoked with
++ * token, module name, section name, section start, section end, section flags.
++ */
++int kallsyms_sections(void *token,
++ int (*callback)(void *, const char *, const char *, ElfW(Addr), ElfW(Addr), ElfW(Word)))
++{
++ const struct kallsyms_header *ka_hdr = NULL; /* stupid gcc */
++ const struct kallsyms_section *ka_sec = NULL;
++ const char *ka_str;
++ const struct module *m;
++ int i;
++
++ kallsyms_do_first_time();
++ if (!kallsyms_module_list)
++ return(0);
++
++ for (m = *kallsyms_module_list; m; m = m->next) {
++ if (!mod_member_present(m, kallsyms_start) ||
++ !mod_member_present(m, kallsyms_end) ||
++ m->kallsyms_start >= m->kallsyms_end)
++ continue;
++ ka_hdr = (struct kallsyms_header *)m->kallsyms_start;
++ ka_sec = (const struct kallsyms_section *) ((char *)ka_hdr + ka_hdr->section_off);
++ ka_str = ((char *)(ka_hdr) + ka_hdr->string_off);
++ for (i = 0; i < ka_hdr->sections; ++i, kallsyms_next_sec(ka_hdr, ka_sec)) {
++ if (callback(
++ token,
++ *(m->name) ? m->name : "kernel",
++ ka_sec->name_off + ka_str,
++ ka_sec->start,
++ ka_sec->start + ka_sec->size,
++ ka_sec->flags))
++ return(0);
++ }
++ }
++ return(1);
++}
+Index: linux-bgl/include/linux/kallsyms.h
+===================================================================
+--- linux-bgl.orig/include/linux/kallsyms.h 2004-10-26 17:10:51.404753448 -0700
++++ linux-bgl/include/linux/kallsyms.h 2004-10-26 22:52:50.045676708 -0700
+@@ -0,0 +1,141 @@
++/* kallsyms headers
++ Copyright 2000 Keith Owens <kaos@ocs.com.au>
++
++ This file is part of the Linux modutils. It is exported to kernel
++ space so debuggers can access the kallsyms data.
++
++ The kallsyms data contains all the non-stack symbols from a kernel
++ or a module. The kernel symbols are held between __start___kallsyms
++ and __stop___kallsyms. The symbols for a module are accessed via
++ the struct module chain which is based at module_list.
++
++ This program is free software; you can redistribute it and/or modify it
++ under the terms of the GNU General Public License as published by the
++ Free Software Foundation; either version 2 of the License, or (at your
++ option) any later version.
++
++ This program is distributed in the hope that it will be useful, but
++ WITHOUT ANY WARRANTY; without even the implied warranty of
++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ General Public License for more details.
++
++ You should have received a copy of the GNU General Public License
++ along with this program; if not, write to the Free Software Foundation,
++ Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
++ */
++
++#ident "$Id: kallsyms-2.4-bgl.patch,v 1.1.4.1 2004/10/29 00:51:21 jacob Exp $"
++
++#ifndef MODUTILS_KALLSYMS_H
++#define MODUTILS_KALLSYMS_H 1
++
++/* Have to (re)define these ElfW entries here because external kallsyms
++ * code does not have access to modutils/include/obj.h. This code is
++ * included from user spaces tools (modutils) and kernel, they need
++ * different includes.
++ */
++
++#ifndef ELFCLASS32
++#ifdef __KERNEL__
++#include <linux/elf.h>
++#else /* __KERNEL__ */
++#include <elf.h>
++#endif /* __KERNEL__ */
++#endif /* ELFCLASS32 */
++
++#ifndef ELFCLASSM
++#define ELFCLASSM ELF_CLASS
++#endif
++
++#ifndef ElfW
++# if ELFCLASSM == ELFCLASS32
++# define ElfW(x) Elf32_ ## x
++# define ELFW(x) ELF32_ ## x
++# else
++# define ElfW(x) Elf64_ ## x
++# define ELFW(x) ELF64_ ## x
++# endif
++#endif
++
++/* Format of data in the kallsyms section.
++ * Most of the fields are small numbers but the total size and all
++ * offsets can be large so use the 32/64 bit types for these fields.
++ *
++ * Do not use sizeof() on these structures, modutils may be using extra
++ * fields. Instead use the size fields in the header to access the
++ * other bits of data.
++ */
++
++struct kallsyms_header {
++ int size; /* Size of this header */
++ ElfW(Word) total_size; /* Total size of kallsyms data */
++ int sections; /* Number of section entries */
++ ElfW(Off) section_off; /* Offset to first section entry */
++ int section_size; /* Size of one section entry */
++ int symbols; /* Number of symbol entries */
++ ElfW(Off) symbol_off; /* Offset to first symbol entry */
++ int symbol_size; /* Size of one symbol entry */
++ ElfW(Off) string_off; /* Offset to first string */
++ ElfW(Addr) start; /* Start address of first section */
++ ElfW(Addr) end; /* End address of last section */
++};
++
++struct kallsyms_section {
++ ElfW(Addr) start; /* Start address of section */
++ ElfW(Word) size; /* Size of this section */
++ ElfW(Off) name_off; /* Offset to section name */
++ ElfW(Word) flags; /* Flags from section */
++};
++
++struct kallsyms_symbol {
++ ElfW(Off) section_off; /* Offset to section that owns this symbol */
++ ElfW(Addr) symbol_addr; /* Address of symbol */
++ ElfW(Off) name_off; /* Offset to symbol name */
++};
++
++#define KALLSYMS_SEC_NAME "__kallsyms"
++#define KALLSYMS_IDX 2 /* obj_kallsyms creates kallsyms as section 2 */
++
++#define kallsyms_next_sec(h,s) \
++ ((s) = (struct kallsyms_section *)((char *)(s) + (h)->section_size))
++#define kallsyms_next_sym(h,s) \
++ ((s) = (struct kallsyms_symbol *)((char *)(s) + (h)->symbol_size))
++
++int kallsyms_symbol_to_address(
++ const char *name, /* Name to lookup */
++ unsigned long *token, /* Which module to start with */
++ const char **mod_name, /* Set to module name or "kernel" */
++ unsigned long *mod_start, /* Set to start address of module */
++ unsigned long *mod_end, /* Set to end address of module */
++ const char **sec_name, /* Set to section name */
++ unsigned long *sec_start, /* Set to start address of section */
++ unsigned long *sec_end, /* Set to end address of section */
++ const char **sym_name, /* Set to full symbol name */
++ unsigned long *sym_start, /* Set to start address of symbol */
++ unsigned long *sym_end /* Set to end address of symbol */
++ );
++
++int kallsyms_address_to_symbol(
++ unsigned long address, /* Address to lookup */
++ const char **mod_name, /* Set to module name */
++ unsigned long *mod_start, /* Set to start address of module */
++ unsigned long *mod_end, /* Set to end address of module */
++ const char **sec_name, /* Set to section name */
++ unsigned long *sec_start, /* Set to start address of section */
++ unsigned long *sec_end, /* Set to end address of section */
++ const char **sym_name, /* Set to full symbol name */
++ unsigned long *sym_start, /* Set to start address of symbol */
++ unsigned long *sym_end /* Set to end address of symbol */
++ );
++
++int kallsyms_sections(void *token,
++ int (*callback)(void *, /* token */
++ const char *, /* module name */
++ const char *, /* section name */
++ ElfW(Addr), /* Section start */
++ ElfW(Addr), /* Section end */
++ ElfW(Word) /* Section flags */
++ )
++ );
++
++#endif /* kallsyms.h */
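
Note that kallsyms_address_to_symbol() fills in every output pointer, so
even a caller that only wants the nearest symbol name must supply them all.
A sketch of resolving a single address, e.g. from an oops path (the wrapper
name is hypothetical):

#include <linux/kallsyms.h>
#include <linux/kernel.h>

static void print_symbol_for(unsigned long addr)
{
	const char *mod, *sec, *sym;
	unsigned long mod_start, mod_end, sec_start, sec_end;
	unsigned long sym_start, sym_end;

	if (kallsyms_address_to_symbol(addr, &mod, &mod_start, &mod_end,
				       &sec, &sec_start, &sec_end,
				       &sym, &sym_start, &sym_end))
		printk("0x%lx is %s+0x%lx [%s %s]\n",
		       addr, sym, addr - sym_start, mod, sec);
}
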
--- /dev/null
+Index: linux-bgl/arch/i386/kernel/traps.c
+===================================================================
+--- linux-bgl.orig/arch/i386/kernel/traps.c 2003-07-02 08:43:23.000000000 -0700
++++ linux-bgl/arch/i386/kernel/traps.c 2004-10-26 23:25:17.950442396 -0700
+@@ -24,6 +24,7 @@
+ #include <linux/spinlock.h>
+ #include <linux/interrupt.h>
+ #include <linux/highmem.h>
++#include <linux/version.h>
+
+ #ifdef CONFIG_MCA
+ #include <linux/mca.h>
+@@ -135,6 +136,8 @@
+ {
+ int i;
+ unsigned long addr;
++	/* static so it does not take up stack space; if we race here, too bad */
++ static char buffer[512];
+
+ if (!stack)
+ stack = (unsigned long*)&stack;
+@@ -144,9 +147,8 @@
+ while (((long) stack & (THREAD_SIZE-1)) != 0) {
+ addr = *stack++;
+ if (kernel_text_address(addr)) {
+- if (i && ((i % 6) == 0))
+- printk("\n ");
+- printk(" [<%08lx>]", addr);
++ lookup_symbol(addr, buffer, 512);
++ printk("[<%08lx>] %s (0x%p)\n", addr,buffer,stack-1);
+ i++;
+ }
+ }
+@@ -186,12 +188,19 @@
+ show_trace(esp);
+ }
+
++#ifdef CONFIG_MK7
++#define ARCHIT "/athlon"
++#else
++#define ARCHIT "/i686"
++#endif
++
+ void show_registers(struct pt_regs *regs)
+ {
+ int i;
+ int in_kernel = 1;
+ unsigned long esp;
+ unsigned short ss;
++ static char buffer[512];
+
+ 	esp = (unsigned long) (&regs->esp);
+ ss = __KERNEL_DS;
+@@ -200,8 +209,12 @@
+ esp = regs->esp;
+ ss = regs->xss & 0xffff;
+ }
++
++ print_modules();
++ lookup_symbol(regs->eip, buffer, 512);
+ printk("CPU: %d\nEIP: %04x:[<%08lx>] %s\nEFLAGS: %08lx\n",
+ smp_processor_id(), 0xffff & regs->xcs, regs->eip, print_tainted(), regs->eflags);
++ printk("\nEIP is at %s (" UTS_RELEASE ARCHIT ")\n",buffer);
+ printk("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n",
+ regs->eax, regs->ebx, regs->ecx, regs->edx);
+ printk("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n",
+@@ -261,7 +274,7 @@
+ if (__get_user(file, (char **)(eip + 4)) ||
+ (unsigned long)file < PAGE_OFFSET || __get_user(c, file))
+ file = "<bad filename>";
+-
++ printk("------------[ cut here ]------------\n");
+ printk("kernel BUG at %s:%d!\n", file, line);
+
+ no_bug:
+Index: linux-bgl/arch/i386/kernel/process.c
+===================================================================
+--- linux-bgl.orig/arch/i386/kernel/process.c 2003-07-02 08:44:07.000000000 -0700
++++ linux-bgl/arch/i386/kernel/process.c 2004-10-26 23:28:53.017015082 -0700
+@@ -33,6 +33,7 @@
+ #include <linux/reboot.h>
+ #include <linux/init.h>
+ #include <linux/mc146818rtc.h>
++#include <linux/version.h>
+
+ #include <asm/uaccess.h>
+ #include <asm/pgtable.h>
+@@ -437,10 +438,14 @@
+ void show_regs(struct pt_regs * regs)
+ {
+ unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
++ static char buffer[512];
++
++ lookup_symbol(regs->eip, buffer, 512);
+
+ printk("\n");
+ printk("Pid: %d, comm: %20s\n", current->pid, current->comm);
+ printk("EIP: %04x:[<%08lx>] CPU: %d",0xffff & regs->xcs,regs->eip, smp_processor_id());
++ printk("\nEIP is at %s (" UTS_RELEASE ")\n", buffer);
+ if (regs->xcs & 3)
+ printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp);
+ printk(" EFLAGS: %08lx %s\n",regs->eflags, print_tainted());
+Index: linux-bgl/arch/ia64/kernel/process.c
+===================================================================
+--- linux-bgl.orig/arch/ia64/kernel/process.c 2003-07-02 08:43:26.000000000 -0700
++++ linux-bgl/arch/ia64/kernel/process.c 2004-10-26 23:29:56.340005959 -0700
+@@ -18,6 +18,7 @@
+ #include <linux/smp_lock.h>
+ #include <linux/stddef.h>
+ #include <linux/unistd.h>
++#include <linux/version.h>
+
+ #include <asm/delay.h>
+ #include <asm/efi.h>
+@@ -33,9 +34,10 @@
+ #include <asm/sn/idle.h>
+ #endif
+
+-static void
+-do_show_stack (struct unw_frame_info *info, void *arg)
++void
++ia64_do_show_stack (struct unw_frame_info *info, void *arg)
+ {
++ static char buffer[512];
+ unsigned long ip, sp, bsp;
+
+ printk("\nCall Trace: ");
+@@ -46,7 +48,8 @@
+
+ unw_get_sp(info, &sp);
+ unw_get_bsp(info, &bsp);
+- printk("[<%016lx>] sp=0x%016lx bsp=0x%016lx\n", ip, sp, bsp);
++ lookup_symbol(ip, buffer, 512);
++ printk("[<%016lx>] sp=0x%016lx bsp=0x%016lx %s\n", ip, sp, bsp, buffer);
+ } while (unw_unwind(info) >= 0);
+ }
+
+@@ -56,19 +59,19 @@
+ struct unw_frame_info info;
+
+ unw_init_from_blocked_task(&info, task);
+- do_show_stack(&info, 0);
++ ia64_do_show_stack(&info, 0);
+ }
+
+ void
+ show_stack (struct task_struct *task)
+ {
+ if (!task)
+- unw_init_running(do_show_stack, 0);
++ unw_init_running(ia64_do_show_stack, 0);
+ else {
+ struct unw_frame_info info;
+
+ unw_init_from_blocked_task(&info, task);
+- do_show_stack(&info, 0);
++ ia64_do_show_stack(&info, 0);
+ }
+ }
+
+@@ -76,8 +79,11 @@
+ show_regs (struct pt_regs *regs)
+ {
+ unsigned long ip = regs->cr_iip + ia64_psr(regs)->ri;
++ static char buffer[512];
+
+ printk("\nPid: %d, comm: %20s\n", current->pid, current->comm);
++ lookup_symbol(ip, buffer, 512);
++ printk("EIP is at %s (" UTS_RELEASE ")\n", buffer);
+ printk("psr : %016lx ifs : %016lx ip : [<%016lx>] %s\n",
+ regs->cr_ipsr, regs->cr_ifs, ip, print_tainted());
+ printk("unat: %016lx pfs : %016lx rsc : %016lx\n",
+Index: linux-bgl/arch/s390/config.in
+===================================================================
+--- linux-bgl.orig/arch/s390/config.in 2003-07-02 08:43:27.000000000 -0700
++++ linux-bgl/arch/s390/config.in 2004-10-26 23:25:17.961440685 -0700
+@@ -73,5 +73,6 @@
+ # bool 'Remote GDB kernel debugging' CONFIG_REMOTE_DEBUG
+ #fi
+ bool 'Magic SysRq key' CONFIG_MAGIC_SYSRQ
++bool 'Load all symbols for debugging/kksymoops' CONFIG_KALLSYMS
+ endmenu
+
+Index: linux-bgl/arch/s390/kernel/traps.c
+===================================================================
+--- linux-bgl.orig/arch/s390/kernel/traps.c 2003-07-02 08:44:02.000000000 -0700
++++ linux-bgl/arch/s390/kernel/traps.c 2004-10-26 23:25:17.964440218 -0700
+@@ -27,6 +27,7 @@
+ #include <linux/init.h>
+ #include <linux/delay.h>
+ #include <linux/module.h>
++#include <linux/version.h>
+
+ #include <asm/system.h>
+ #include <asm/uaccess.h>
+@@ -108,27 +109,26 @@
+
+ void show_trace(unsigned long * stack)
+ {
++ static char buffer[512];
+ unsigned long backchain, low_addr, high_addr, ret_addr;
+ int i;
+
+ if (!stack)
+ stack = (unsigned long*)&stack;
+
+- printk("Call Trace: ");
+ low_addr = ((unsigned long) stack) & PSW_ADDR_MASK;
+ high_addr = (low_addr & (-THREAD_SIZE)) + THREAD_SIZE;
+ /* Skip the first frame (biased stack) */
+ backchain = *((unsigned long *) low_addr) & PSW_ADDR_MASK;
+- /* Print up to 8 lines */
+- for (i = 0; i < 8; i++) {
++ /* Print up to 20 lines */
++ for (i = 0; i < 20; i++) {
+ if (backchain < low_addr || backchain >= high_addr)
+ break;
+ ret_addr = *((unsigned long *) (backchain+56)) & PSW_ADDR_MASK;
+ if (!kernel_text_address(ret_addr))
+ break;
+- if (i && ((i % 6) == 0))
+- printk("\n ");
+- printk("[<%08lx>] ", ret_addr);
++ lookup_symbol(ret_addr, buffer, 512);
++ printk("[<%08lx>] %s (0x%lx)\n", ret_addr,buffer,backchain+56);
+ low_addr = backchain;
+ backchain = *((unsigned long *) backchain) & PSW_ADDR_MASK;
+ }
+@@ -171,6 +171,7 @@
+
+ void show_registers(struct pt_regs *regs)
+ {
++ static char buffer[512];
+ mm_segment_t old_fs;
+ char *mode;
+ int i;
+@@ -179,6 +180,10 @@
+ printk("%s PSW : %08lx %08lx\n",
+ mode, (unsigned long) regs->psw.mask,
+ (unsigned long) regs->psw.addr);
++ if (!(regs->psw.mask & PSW_PROBLEM_STATE)) {
++ lookup_symbol(regs->psw.addr & 0x7FFFFFFF, buffer, 512);
++ printk(" %s (" UTS_RELEASE ")\n", buffer);
++ }
+ printk("%s GPRS: %08x %08x %08x %08x\n", mode,
+ regs->gprs[0], regs->gprs[1], regs->gprs[2], regs->gprs[3]);
+ printk(" %08x %08x %08x %08x\n",
+Index: linux-bgl/arch/s390x/config.in
+===================================================================
+--- linux-bgl.orig/arch/s390x/config.in 2003-07-02 08:43:07.000000000 -0700
++++ linux-bgl/arch/s390x/config.in 2004-10-26 23:25:17.964440218 -0700
+@@ -75,5 +75,6 @@
+ # bool 'Remote GDB kernel debugging' CONFIG_REMOTE_DEBUG
+ #fi
+ bool 'Magic SysRq key' CONFIG_MAGIC_SYSRQ
++bool 'Load all symbols for debugging/kksymoops' CONFIG_KALLSYMS
+ endmenu
+
+Index: linux-bgl/arch/s390x/kernel/traps.c
+===================================================================
+--- linux-bgl.orig/arch/s390x/kernel/traps.c 2003-07-02 08:43:25.000000000 -0700
++++ linux-bgl/arch/s390x/kernel/traps.c 2004-10-26 23:25:17.966439907 -0700
+@@ -27,6 +27,7 @@
+ #include <linux/init.h>
+ #include <linux/delay.h>
+ #include <linux/module.h>
++#include <linux/version.h>
+
+ #include <asm/system.h>
+ #include <asm/uaccess.h>
+@@ -112,25 +113,25 @@
+ {
+ unsigned long backchain, low_addr, high_addr, ret_addr;
+ int i;
++	/* static to not take up stack space; if we race here, too bad */
++ static char buffer[512];
+
+ if (!stack)
+ stack = (unsigned long*)&stack;
+
+- printk("Call Trace: ");
+ low_addr = ((unsigned long) stack) & PSW_ADDR_MASK;
+ high_addr = (low_addr & (-THREAD_SIZE)) + THREAD_SIZE;
+ /* Skip the first frame (biased stack) */
+ backchain = *((unsigned long *) low_addr) & PSW_ADDR_MASK;
+- /* Print up to 8 lines */
+- for (i = 0; i < 8; i++) {
++ /* Print up to 20 lines */
++ for (i = 0; i < 20; i++) {
+ if (backchain < low_addr || backchain >= high_addr)
+ break;
+ ret_addr = *((unsigned long *) (backchain+112)) & PSW_ADDR_MASK;
+ if (!kernel_text_address(ret_addr))
+ break;
+- if (i && ((i % 3) == 0))
+- printk("\n ");
+- printk("[<%016lx>] ", ret_addr);
++ lookup_symbol(ret_addr, buffer, 512);
++ printk("[<%016lx>] %s (0x%lx)\n", ret_addr, buffer, backchain+112);
+ low_addr = backchain;
+ backchain = *((unsigned long *) backchain) & PSW_ADDR_MASK;
+ }
+@@ -173,6 +174,7 @@
+
+ void show_registers(struct pt_regs *regs)
+ {
++ static char buffer[512];
+ mm_segment_t old_fs;
+ char *mode;
+ int i;
+@@ -181,6 +183,10 @@
+ printk("%s PSW : %016lx %016lx\n",
+ mode, (unsigned long) regs->psw.mask,
+ (unsigned long) regs->psw.addr);
++ if (!(regs->psw.mask & PSW_PROBLEM_STATE)) {
++ lookup_symbol(regs->psw.addr, buffer, 512);
++ printk(" %s (" UTS_RELEASE ")\n", buffer);
++ }
+ printk("%s GPRS: %016lx %016lx %016lx %016lx\n", mode,
+ regs->gprs[0], regs->gprs[1], regs->gprs[2], regs->gprs[3]);
+ printk(" %016lx %016lx %016lx %016lx\n",
+Index: linux-bgl/arch/ppc64/mm/fault.c
+===================================================================
+--- linux-bgl.orig/arch/ppc64/mm/fault.c 2003-07-02 08:43:12.000000000 -0700
++++ linux-bgl/arch/ppc64/mm/fault.c 2004-10-26 23:30:24.467942247 -0700
+@@ -224,7 +224,6 @@
+ if (debugger_kernel_faults)
+ debugger(regs);
+ #endif
+- print_backtrace( (unsigned long *)regs->gpr[1] );
+ panic("kernel access of bad area pc %lx lr %lx address %lX tsk %s/%d",
+ regs->nip,regs->link,address,current->comm,current->pid);
+ }
+Index: linux-bgl/arch/ppc64/kernel/traps.c
+===================================================================
+--- linux-bgl.orig/arch/ppc64/kernel/traps.c 2003-07-02 08:44:03.000000000 -0700
++++ linux-bgl/arch/ppc64/kernel/traps.c 2004-10-26 23:33:45.297572484 -0700
+@@ -89,7 +89,6 @@
+ #if defined(CONFIG_KDB)
+ kdb(KDB_REASON_OOPS, 0, (kdb_eframe_t) regs);
+ #endif
+- print_backtrace((unsigned long *)regs->gpr[1]);
+ panic("Exception in kernel pc %lx signal %d",regs->nip,signr);
+ #if defined(CONFIG_PPCDBG) && (defined(CONFIG_XMON) || defined(CONFIG_KGDB))
+ /* Allow us to catch SIGILLs for 64-bit app/glibc debugging. -Peter */
+@@ -187,7 +186,6 @@
+ if (kdb(KDB_REASON_FAULT, 0, regs))
+ return ;
+ #endif
+- print_backtrace((unsigned long *)regs->gpr[1]);
+ panic("machine check");
+ }
+ _exception(SIGSEGV, regs);
+@@ -209,7 +207,6 @@
+ }
+ #endif
+ show_regs(regs);
+- print_backtrace((unsigned long *)regs->gpr[1]);
+ panic("System Management Interrupt");
+ }
+
+Index: linux-bgl/arch/ppc64/kernel/process.c
+===================================================================
+--- linux-bgl.orig/arch/ppc64/kernel/process.c 2003-07-02 08:44:31.000000000 -0700
++++ linux-bgl/arch/ppc64/kernel/process.c 2004-10-26 23:33:01.060713583 -0700
+@@ -30,6 +30,8 @@
+ #include <linux/user.h>
+ #include <linux/elf.h>
+ #include <linux/init.h>
++#include <linux/version.h>
++#include <linux/module.h>
+
+ #include <asm/pgtable.h>
+ #include <asm/uaccess.h>
+@@ -130,12 +132,61 @@
+ __restore_flags(s);
+ }
+
++/*
++ * If the address is either in the .text section of the
++ * kernel, or in the vmalloc'ed module regions, it *may*
++ * be the address of a calling routine
++ */
++
++extern char _stext[], _etext[];
++
++#ifdef CONFIG_MODULES
++extern struct module *module_list;
++extern struct module kernel_module;
++
++static inline int kernel_text_address(unsigned long addr)
++{
++ int retval = 0;
++ struct module *mod;
++
++ if (addr >= (unsigned long) &_stext &&
++ addr <= (unsigned long) &_etext)
++ return 1;
++
++ for (mod = module_list; mod != &kernel_module; mod = mod->next) {
++ /* mod_bound tests for addr being inside the vmalloc'ed
++ * module area. Of course it'd be better to test only
++ * for the .text subset... */
++ if (mod_bound(addr, 0, mod)) {
++ retval = 1;
++ break;
++ }
++ }
++
++ return retval;
++}
++
++#else
++
++static inline int kernel_text_address(unsigned long addr)
++{
++ return (addr >= (unsigned long) &_stext &&
++ addr <= (unsigned long) &_etext);
++}
++
++#endif
++
++
+ void show_regs(struct pt_regs * regs)
+ {
+ int i;
++ static char buffer[512];
+
+- printk("NIP: %016lX XER: %016lX LR: %016lX REGS: %p TRAP: %04lx %s\n",
++ print_modules();
++ printk("NIP: %016lx XER: %016lx LR: %016lx REGS: %p TRAP: %04lx %s\n",
+ regs->nip, regs->xer, regs->link, regs,regs->trap, print_tainted());
++ lookup_symbol(regs->nip, buffer, 512);
++ printk("NIP is at %s (" UTS_RELEASE ")\n", buffer);
+ printk("MSR: %016lx EE: %01x PR: %01x FP: %01x ME: %01x IR/DR: %01x%01x\n",
+ regs->msr, regs->msr&MSR_EE ? 1 : 0, regs->msr&MSR_PR ? 1 : 0,
+ regs->msr & MSR_FP ? 1 : 0,regs->msr&MSR_ME ? 1 : 0,
+@@ -147,27 +198,22 @@
+ printk("\nlast math %p ", last_task_used_math);
+
+ #ifdef CONFIG_SMP
+- /* printk(" CPU: %d last CPU: %d", current->processor,current->last_processor); */
++ printk("CPU: %d", smp_processor_id());
+ #endif /* CONFIG_SMP */
+
+- printk("\n");
+ for (i = 0; i < 32; i++)
+ {
+ long r;
+ if ((i % 4) == 0)
+- {
+- printk("GPR%02d: ", i);
+- }
++ printk("\nGPR%02d: ", i);
+
+ if ( __get_user(r, &(regs->gpr[i])) )
+ return;
+
+- printk("%016lX ", r);
+- if ((i % 4) == 3)
+- {
+- printk("\n");
+- }
++ printk("%016lx ", r);
+ }
++ printk("\n");
++ print_backtrace((unsigned long *)regs->gpr[1]);
+ }
+
+ void exit_thread(void)
+@@ -415,67 +461,24 @@
+ }
+ }
+
+-extern char _stext[], _etext[];
+-
+-char * ppc_find_proc_name( unsigned * p, char * buf, unsigned buflen )
+-{
+- unsigned long tb_flags;
+- unsigned short name_len;
+- unsigned long tb_start, code_start, code_ptr, code_offset;
+- unsigned code_len;
+- strcpy( buf, "Unknown" );
+- code_ptr = (unsigned long)p;
+- code_offset = 0;
+- if ( ( (unsigned long)p >= (unsigned long)_stext ) && ( (unsigned long)p <= (unsigned long)_etext ) ) {
+- while ( (unsigned long)p <= (unsigned long)_etext ) {
+- if ( *p == 0 ) {
+- tb_start = (unsigned long)p;
+- ++p; /* Point to traceback flags */
+- tb_flags = *((unsigned long *)p);
+- p += 2; /* Skip over traceback flags */
+- if ( tb_flags & TB_NAME_PRESENT ) {
+- if ( tb_flags & TB_PARMINFO )
+- ++p; /* skip over parminfo data */
+- if ( tb_flags & TB_HAS_TBOFF ) {
+- code_len = *p; /* get code length */
+- code_start = tb_start - code_len;
+- code_offset = code_ptr - code_start + 1;
+- if ( code_offset > 0x100000 )
+- break;
+- ++p; /* skip over code size */
+- }
+- name_len = *((unsigned short *)p);
+- if ( name_len > (buflen-20) )
+- name_len = buflen-20;
+- memcpy( buf, ((char *)p)+2, name_len );
+- buf[name_len] = 0;
+- if ( code_offset )
+- sprintf( buf+name_len, "+0x%lx", code_offset-1 );
+- }
+- break;
+- }
+- ++p;
+- }
+- }
+- return buf;
+-}
+-
+ void
+ print_backtrace(unsigned long *sp)
+ {
+ int cnt = 0;
+ unsigned long i;
+- char name_buf[256];
++ char buffer[512];
+
+- printk("Call backtrace: \n");
++ printk("Call Trace: \n");
+ while (sp) {
+ if (__get_user( i, &sp[2] ))
+ break;
+- printk("%016lX ", i);
+- printk("%s\n", ppc_find_proc_name( (unsigned *)i, name_buf, 256 ));
++		if (kernel_text_address(i)) {
++			lookup_symbol(i, buffer, 512);
++			printk("[<%016lx>] %s\n", i, buffer);
++		}
++		if (__get_user(sp, (unsigned long **)sp) || ++cnt > 32)
++			break;
+ if (cnt > 32) break;
+- if (__get_user(sp, (unsigned long **)sp))
+- break;
+ }
+ printk("\n");
+ }
+@@ -515,6 +518,7 @@
+ unsigned long ip, sp;
+ unsigned long stack_page = (unsigned long)p;
+ int count = 0;
++ static char buffer[512];
+
+ if (!p)
+ return;
+@@ -528,7 +532,8 @@
+ break;
+ if (count > 0) {
+ ip = *(unsigned long *)(sp + 16);
+- printk("[%016lx] ", ip);
++ lookup_symbol(ip, buffer, 512);
++ printk("[<%016lx>] %s\n", ip, buffer);
+ }
+ } while (count++ < 16);
+ printk("\n");
+Index: linux-bgl/kernel/Makefile
+===================================================================
+--- linux-bgl.orig/kernel/Makefile 2004-10-26 23:23:00.516655289 -0700
++++ linux-bgl/kernel/Makefile 2004-10-26 23:35:04.930451186 -0700
+@@ -14,7 +14,7 @@
+ obj-y = sched.o dma.o fork.o exec_domain.o panic.o printk.o \
+ module.o exit.o itimer.o info.o time.o softirq.o resource.o \
+ sysctl.o acct.o capability.o ptrace.o timer.o user.o \
+- signal.o sys.o kmod.o context.o
++ signal.o sys.o kmod.o context.o kksymoops.o
+
+ obj-$(CONFIG_UID16) += uid16.o
+ obj-$(CONFIG_MODULES) += ksyms.o
+Index: linux-bgl/kernel/kksymoops.c
+===================================================================
+--- linux-bgl.orig/kernel/kksymoops.c 2004-10-26 17:10:51.404753448 -0700
++++ linux-bgl/kernel/kksymoops.c 2004-10-26 23:25:17.971439129 -0700
+@@ -0,0 +1,82 @@
++#include <linux/module.h>
++#include <linux/string.h>
++#include <linux/errno.h>
++#include <linux/kernel.h>
++#include <linux/config.h>
++#ifdef CONFIG_KALLSYMS
++#include <linux/kallsyms.h>
++#endif
++
++
++
++int lookup_symbol(unsigned long address, char *buffer, int buflen)
++{
++ struct module *this_mod;
++ unsigned long bestsofar;
++
++ const char *mod_name = NULL, *sec_name = NULL, *sym_name = NULL;
++ unsigned long mod_start,mod_end,sec_start,sec_end,sym_start,sym_end;
++
++ if (!buffer)
++ return -EFAULT;
++
++ if (buflen<256)
++ return -ENOMEM;
++
++ memset(buffer,0,buflen);
++
++#ifdef CONFIG_KALLSYMS
++ if (!kallsyms_address_to_symbol(address,&mod_name,&mod_start,&mod_end,&sec_name,
++ &sec_start, &sec_end, &sym_name, &sym_start, &sym_end)) {
++		/* kallsyms doesn't have a clue; let's try harder */
++ bestsofar = 0;
++ snprintf(buffer,buflen-1,"[unresolved]");
++
++ this_mod = module_list;
++
++ while (this_mod != NULL) {
++ int i;
++		/* walk the symbol list of this module. Only symbols
++		   whose address is smaller than the searched-for address
++		   are relevant, and only if they improve on the best so far */
++ for (i=0; i< this_mod->nsyms; i++)
++ if ((this_mod->syms[i].value<=address) &&
++ (bestsofar<this_mod->syms[i].value)) {
++ snprintf(buffer,buflen-1,"%s [%s] 0x%x",
++ this_mod->syms[i].name,
++ this_mod->name,
++ (unsigned int)(address - this_mod->syms[i].value));
++ bestsofar = this_mod->syms[i].value;
++ }
++ this_mod = this_mod->next;
++ }
++
++ } else { /* kallsyms success */
++ snprintf(buffer,buflen-1,"%s [%s] 0x%x",sym_name,mod_name,(unsigned int)(address-sym_start));
++ }
++#endif
++ return strlen(buffer);
++}
++
++static char modlist[4096];
++/* This function isn't SMP-safe, but that's not really a problem: it's called
++ * from oops context only, and any locking could actually prevent the oops
++ * from going out. The line it generates is informational only and should
++ * NEVER prevent the real oops from going out.
++ */
++void print_modules(void)
++{
++ struct module *this_mod;
++ int pos = 0, i;
++ memset(modlist,0,4096);
++
++#ifdef CONFIG_KALLSYMS
++ this_mod = module_list;
++ while (this_mod != NULL) {
++ if (this_mod->name != NULL)
++			pos += snprintf(modlist+pos, sizeof(modlist)-pos-1, "%s ", this_mod->name);
++ this_mod = this_mod->next;
++ }
++ printk("%s\n",modlist);
++#endif
++}
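
As a usage illustration (a sketch, not part of the patch): once the prototypes from the include/linux/kernel.h hunk that follows are visible, any kernel code can resolve a code address with the helper above. dump_caller() is a hypothetical name.

    static void dump_caller(void)
    {
            /* static, like the oops paths above: keep the stack shallow */
            static char buf[512];
            unsigned long caller =
                    (unsigned long)__builtin_return_address(0);

            lookup_symbol(caller, buf, sizeof(buf));
            printk("called from [<%08lx>] %s\n", caller, buf);
    }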
+Index: linux-bgl/include/linux/kernel.h
+===================================================================
+--- linux-bgl.orig/include/linux/kernel.h 2003-07-02 08:44:16.000000000 -0700
++++ linux-bgl/include/linux/kernel.h 2004-10-26 23:25:17.968439596 -0700
+@@ -107,6 +107,9 @@
+ extern int tainted;
+ extern const char *print_tainted(void);
+
++extern int lookup_symbol(unsigned long address, char *buffer, int buflen);
++extern void print_modules(void);
++
+ #if DEBUG
+ #define pr_debug(fmt,arg...) \
+ printk(KERN_DEBUG fmt,##arg)
--- /dev/null
+Index: linux-2.4.24/arch/i386/kernel/i386_ksyms.c
+===================================================================
+--- linux-2.4.24.orig/arch/i386/kernel/i386_ksyms.c 2003-11-28 13:26:19.000000000 -0500
++++ linux-2.4.24/arch/i386/kernel/i386_ksyms.c 2004-05-07 16:58:39.000000000 -0400
+@@ -186,3 +186,8 @@
+ EXPORT_SYMBOL(edd);
+ EXPORT_SYMBOL(eddnr);
+ #endif
++
++EXPORT_SYMBOL_GPL(show_mem);
++EXPORT_SYMBOL_GPL(show_state);
++EXPORT_SYMBOL_GPL(show_regs);
++
+Index: linux-2.4.24/arch/i386/kernel/process.c
+===================================================================
+--- linux-2.4.24.orig/arch/i386/kernel/process.c 2003-11-28 13:26:19.000000000 -0500
++++ linux-2.4.24/arch/i386/kernel/process.c 2004-05-07 17:08:18.000000000 -0400
+@@ -400,7 +400,8 @@
+ * Stop all CPUs and turn off local APICs and the IO-APIC, so
+ * other OSs see a clean IRQ state.
+ */
+- smp_send_stop();
++ if (!netdump_func)
++ smp_send_stop();
+ #elif CONFIG_X86_LOCAL_APIC
+ if (cpu_has_apic) {
+ __cli();
+Index: linux-2.4.24/arch/i386/kernel/traps.c
+===================================================================
+--- linux-2.4.24.orig/arch/i386/kernel/traps.c 2004-05-07 16:57:00.000000000 -0400
++++ linux-2.4.24/arch/i386/kernel/traps.c 2004-05-07 17:09:17.000000000 -0400
+@@ -280,6 +280,9 @@
+ printk("Kernel BUG\n");
+ }
+
++void (*netdump_func) (struct pt_regs *regs) = NULL;
++int netdump_mode = 0;
++
+ spinlock_t die_lock = SPIN_LOCK_UNLOCKED;
+
+ void die(const char * str, struct pt_regs * regs, long err)
+@@ -290,6 +293,8 @@
+ handle_BUG(regs);
+ printk("%s: %04lx\n", str, err & 0xffff);
+ show_registers(regs);
++ if (netdump_func)
++ netdump_func(regs);
+ bust_spinlocks(0);
+ spin_unlock_irq(&die_lock);
+ do_exit(SIGSEGV);
+@@ -1041,3 +1046,9 @@
+
+ EXPORT_SYMBOL_GPL(is_kernel_text_address);
+ EXPORT_SYMBOL_GPL(lookup_symbol);
++
++EXPORT_SYMBOL_GPL(netdump_func);
++EXPORT_SYMBOL_GPL(netdump_mode);
++#if CONFIG_X86_LOCAL_APIC
++EXPORT_SYMBOL_GPL(nmi_watchdog);
++#endif
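
The netdump_func pointer introduced above is a crash hook: die() invokes it with the faulting registers when a handler has been registered. A registration sketch follows; the netconsole driver later in this series presumably attaches its handler this way in a portion of init_netconsole() beyond this excerpt, and the names here are illustrative.

    extern void (*netdump_func) (struct pt_regs *regs);

    static void my_crash_handler(struct pt_regs *regs)
    {
            /* runs from die() with the faulting register state */
    }

    static int __init my_crash_init(void)
    {
            netdump_func = my_crash_handler;
            return 0;
    }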
+Index: linux-2.4.24/arch/x86_64/kernel/x8664_ksyms.c
+===================================================================
+--- linux-2.4.24.orig/arch/x86_64/kernel/x8664_ksyms.c 2003-11-28 13:26:19.000000000 -0500
++++ linux-2.4.24/arch/x86_64/kernel/x8664_ksyms.c 2004-05-07 17:01:51.000000000 -0400
+@@ -41,6 +41,9 @@
+ EXPORT_SYMBOL(drive_info);
+ #endif
+
++void (*netdump_func) (struct pt_regs *regs) = NULL;
++int netdump_mode = 0;
++
+ /* platform dependent support */
+ EXPORT_SYMBOL(boot_cpu_data);
+ EXPORT_SYMBOL(dump_fpu);
+@@ -229,3 +232,6 @@
+ EXPORT_SYMBOL(touch_nmi_watchdog);
+
+ EXPORT_SYMBOL(do_fork);
++
++EXPORT_SYMBOL_GPL(netdump_func);
++EXPORT_SYMBOL_GPL(netdump_mode);
+Index: linux-2.4.24/drivers/net/3c59x.c
+===================================================================
+--- linux-2.4.24.orig/drivers/net/3c59x.c 2003-11-28 13:26:20.000000000 -0500
++++ linux-2.4.24/drivers/net/3c59x.c 2004-05-07 17:01:00.000000000 -0400
+@@ -874,6 +874,7 @@
+ static int vortex_ioctl(struct net_device *dev, struct ifreq *rq, int cmd);
+ static void vortex_tx_timeout(struct net_device *dev);
+ static void acpi_set_WOL(struct net_device *dev);
++static void vorboom_poll(struct net_device *dev);
+ static struct ethtool_ops vortex_ethtool_ops;
+ \f
+ /* This driver uses 'options' to pass the media type, full-duplex flag, etc. */
+@@ -1343,6 +1344,9 @@
+ dev->set_multicast_list = set_rx_mode;
+ dev->tx_timeout = vortex_tx_timeout;
+ dev->watchdog_timeo = (watchdog * HZ) / 1000;
++#ifdef HAVE_POLL_CONTROLLER
++ dev->poll_controller = &vorboom_poll;
++#endif
+ if (pdev && vp->enable_wol) {
+ vp->pm_state_valid = 1;
+ pci_save_state(vp->pdev, vp->power_state);
+@@ -2322,6 +2326,29 @@
+ spin_unlock(&vp->lock);
+ }
+
++#ifdef HAVE_POLL_CONTROLLER
++
++/*
++ * Polling 'interrupt' - used by things like netconsole to send skbs
++ * without having to re-enable interrupts. It's not called while
++ * the interrupt routine is executing.
++ */
++
++static void vorboom_poll (struct net_device *dev)
++{
++ struct vortex_private *vp = (struct vortex_private *)dev->priv;
++
++ if (!netdump_mode) disable_irq(dev->irq);
++ if (vp->full_bus_master_tx)
++ boomerang_interrupt(dev->irq, dev, 0);
++ else
++ vortex_interrupt(dev->irq, dev, 0);
++ if (!netdump_mode) enable_irq(dev->irq);
++}
++
++#endif
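
For context, the consumer of this hook appears in the netconsole driver further down in this series: when the TX queue is stopped it spins on poll_controller() with interrupts off until the driver reaps enough descriptors. Condensed from transmit_raw_skb() in that patch:

    while (netif_queue_stopped(dev)) {
            dev->poll_controller(dev);  /* reap TX completions, no IRQs */
            zap_completion_queue();     /* free the skbs just completed */
    }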
++
++
+ static int vortex_rx(struct net_device *dev)
+ {
+ struct vortex_private *vp = (struct vortex_private *)dev->priv;
+Index: linux-2.4.24/drivers/net/Config.in
+===================================================================
+--- linux-2.4.24.orig/drivers/net/Config.in 2003-11-28 13:26:20.000000000 -0500
++++ linux-2.4.24/drivers/net/Config.in 2004-05-07 16:58:39.000000000 -0400
+@@ -295,6 +295,8 @@
+ dep_tristate ' SysKonnect FDDI PCI support' CONFIG_SKFP $CONFIG_PCI
+ fi
+
++tristate 'Network logging support' CONFIG_NETCONSOLE
++
+ if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then
+ if [ "$CONFIG_INET" = "y" ]; then
+ bool 'HIPPI driver support (EXPERIMENTAL)' CONFIG_HIPPI
+Index: linux-2.4.24/drivers/net/eepro100.c
+===================================================================
+--- linux-2.4.24.orig/drivers/net/eepro100.c 2003-08-25 07:44:42.000000000 -0400
++++ linux-2.4.24/drivers/net/eepro100.c 2004-05-07 16:58:39.000000000 -0400
+@@ -543,6 +543,7 @@
+ static int speedo_rx(struct net_device *dev);
+ static void speedo_tx_buffer_gc(struct net_device *dev);
+ static void speedo_interrupt(int irq, void *dev_instance, struct pt_regs *regs);
++static void poll_speedo (struct net_device *dev);
+ static int speedo_close(struct net_device *dev);
+ static struct net_device_stats *speedo_get_stats(struct net_device *dev);
+ static int speedo_ioctl(struct net_device *dev, struct ifreq *rq, int cmd);
+@@ -879,6 +880,9 @@
+ dev->get_stats = &speedo_get_stats;
+ dev->set_multicast_list = &set_rx_mode;
+ dev->do_ioctl = &speedo_ioctl;
++#ifdef HAVE_POLL_CONTROLLER
++ dev->poll_controller = &poll_speedo;
++#endif
+
+ return 0;
+ }
+@@ -1176,10 +1180,8 @@
+
+
+ /* Media monitoring and control. */
+-static void speedo_timer(unsigned long data)
++static void speedo_timeout(struct net_device *dev, struct speedo_private *sp)
+ {
+- struct net_device *dev = (struct net_device *)data;
+- struct speedo_private *sp = (struct speedo_private *)dev->priv;
+ long ioaddr = dev->base_addr;
+ int phy_num = sp->phy[0] & 0x1f;
+
+@@ -1217,6 +1219,15 @@
+ dev->name, sp->rx_mode, jiffies, sp->last_rx_time);
+ set_rx_mode(dev);
+ }
++}
++
++static void speedo_timer(unsigned long data)
++{
++ struct net_device *dev = (struct net_device *)data;
++ struct speedo_private *sp = (struct speedo_private *)dev->priv;
++
++ speedo_timeout(dev, sp);
++
+ /* We must continue to monitor the media. */
+ sp->timer.expires = RUN_AT(2*HZ); /* 2.0 sec. */
+ add_timer(&sp->timer);
+@@ -1661,6 +1672,29 @@
+ return;
+ }
+
++#ifdef HAVE_POLL_CONTROLLER
++
++/*
++ * Polling 'interrupt' - used by things like netconsole to send skbs
++ * without having to re-enable interrupts. It's not called while
++ * the interrupt routine is executing.
++ */
++
++static void poll_speedo (struct net_device *dev)
++{
++ struct speedo_private *sp = (struct speedo_private *)dev->priv;
++
++ if (!netdump_mode) disable_irq(dev->irq);
++ if (sp->timer.expires == jiffies) {
++ sp->timer.expires = RUN_AT(2*HZ);
++ speedo_timeout(dev, sp);
++ }
++ speedo_interrupt (dev->irq, dev, NULL);
++ if (!netdump_mode) enable_irq(dev->irq);
++}
++
++#endif
++
+ static inline struct RxFD *speedo_rx_alloc(struct net_device *dev, int entry)
+ {
+ struct speedo_private *sp = (struct speedo_private *)dev->priv;
+Index: linux-2.4.24/drivers/net/Makefile
+===================================================================
+--- linux-2.4.24.orig/drivers/net/Makefile 2003-11-28 13:26:20.000000000 -0500
++++ linux-2.4.24/drivers/net/Makefile 2004-05-07 16:58:39.000000000 -0400
+@@ -250,6 +250,8 @@
+ obj-y += ../acorn/net/acorn-net.o
+ endif
+
++obj-$(CONFIG_NETCONSOLE) += netconsole.o
++
+ #
+ # HIPPI adapters
+ #
+Index: linux-2.4.24/drivers/net/netconsole.c
+===================================================================
+--- linux-2.4.24.orig/drivers/net/netconsole.c 1969-12-31 19:00:00.000000000 -0500
++++ linux-2.4.24/drivers/net/netconsole.c 2004-05-07 16:58:39.000000000 -0400
+@@ -0,0 +1,1246 @@
++/*
++ * linux/drivers/net/netconsole.c
++ *
++ * Copyright (C) 2001 Ingo Molnar <mingo@redhat.com>
++ * Copyright (C) 2002 Red Hat, Inc.
++ *
++ * This file contains the implementation of an IRQ-safe, crash-safe
++ * kernel console implementation that outputs kernel messages to the
++ * network.
++ *
++ * Modification history:
++ *
++ * 2001-09-17 started by Ingo Molnar.
++ * 2002-03-14 simultaneous syslog packet option by Michael K. Johnson
++ */
++
++/****************************************************************
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2, or (at your option)
++ * any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
++ *
++ ****************************************************************/
++
++#include <net/tcp.h>
++#include <net/udp.h>
++#include <linux/mm.h>
++#include <linux/tty.h>
++#include <linux/init.h>
++#include <linux/delay.h>
++#include <linux/random.h>
++#include <linux/reboot.h>
++#include <linux/module.h>
++#include <asm/unaligned.h>
++#include <asm/pgtable.h>
++#if CONFIG_X86_LOCAL_APIC
++#include <asm/apic.h>
++#endif
++#include <linux/console.h>
++#include <linux/smp_lock.h>
++#include <linux/netdevice.h>
++#include <linux/tty_driver.h>
++#include <linux/etherdevice.h>
++#include <linux/elf.h>
++
++static struct net_device *netconsole_dev;
++static u16 source_port, netdump_target_port, netlog_target_port, syslog_target_port;
++static u32 source_ip, netdump_target_ip, netlog_target_ip, syslog_target_ip;
++static unsigned char netdump_daddr[6] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff} ;
++static unsigned char netlog_daddr[6] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff} ;
++static unsigned char syslog_daddr[6] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff} ;
++
++static unsigned int mhz = 500, idle_timeout;
++static unsigned long long mhz_cycles, jiffy_cycles;
++
++#include "netconsole.h"
++
++#define MAX_UDP_CHUNK 1460
++#define MAX_PRINT_CHUNK (MAX_UDP_CHUNK-HEADER_LEN)
++
++#define DEBUG 0
++#if DEBUG
++# define Dprintk(x...) printk(KERN_INFO x)
++#else
++# define Dprintk(x...)
++#endif
++/*
++ * We maintain a small pool of fully-sized skbs,
++ * to make sure the message gets out even in
++ * extreme OOM situations.
++ */
++#define MAX_NETCONSOLE_SKBS 128
++
++static spinlock_t netconsole_lock = SPIN_LOCK_UNLOCKED;
++static int nr_netconsole_skbs;
++static struct sk_buff *netconsole_skbs;
++
++#define MAX_SKB_SIZE \
++ (MAX_UDP_CHUNK + sizeof(struct udphdr) + \
++ sizeof(struct iphdr) + sizeof(struct ethhdr))
++
++static int new_arp = 0;
++static unsigned char arp_sha[ETH_ALEN], arp_tha[ETH_ALEN];
++static u32 arp_sip, arp_tip;
++
++static void send_netconsole_arp(struct net_device *dev);
++
++static void __refill_netconsole_skbs(void)
++{
++ struct sk_buff *skb;
++ unsigned long flags;
++
++ spin_lock_irqsave(&netconsole_lock, flags);
++ while (nr_netconsole_skbs < MAX_NETCONSOLE_SKBS) {
++ skb = alloc_skb(MAX_SKB_SIZE, GFP_ATOMIC);
++ if (!skb)
++ break;
++ if (netconsole_skbs)
++ skb->next = netconsole_skbs;
++ else
++ skb->next = NULL;
++ netconsole_skbs = skb;
++ nr_netconsole_skbs++;
++ }
++ spin_unlock_irqrestore(&netconsole_lock, flags);
++}
++
++static struct sk_buff * get_netconsole_skb(void)
++{
++ struct sk_buff *skb;
++
++ unsigned long flags;
++
++ spin_lock_irqsave(&netconsole_lock, flags);
++ skb = netconsole_skbs;
++ if (skb) {
++ netconsole_skbs = skb->next;
++ skb->next = NULL;
++ nr_netconsole_skbs--;
++ }
++ spin_unlock_irqrestore(&netconsole_lock, flags);
++
++ return skb;
++}
++
++static unsigned long long t0;
++
++/*
++ * Do cleanups:
++ * - zap completed output skbs.
++ * - send ARPs if requested
++ * - reboot the box if inactive for more than N seconds.
++ */
++static void zap_completion_queue(void)
++{
++ unsigned long long t1;
++ int cpu = smp_processor_id();
++
++ if (softnet_data[cpu].completion_queue) {
++ struct sk_buff *clist;
++
++ local_irq_disable();
++ clist = softnet_data[cpu].completion_queue;
++ softnet_data[cpu].completion_queue = NULL;
++ local_irq_enable();
++
++ while (clist != NULL) {
++ struct sk_buff *skb = clist;
++ clist = clist->next;
++ __kfree_skb(skb);
++ }
++ }
++
++ if (new_arp) {
++ Dprintk("got ARP req - sending reply.\n");
++ new_arp = 0;
++ send_netconsole_arp(netconsole_dev);
++ }
++
++ rdtscll(t1);
++ if (idle_timeout) {
++ if (t0) {
++ if (((t1 - t0) >> 20) > mhz_cycles * (unsigned long long)idle_timeout) {
++ t0 = t1;
++ printk("netdump idle timeout - rebooting in 3 seconds.\n");
++ mdelay(3000);
++ machine_restart(NULL);
++ }
++ }
++ }
++ /* maintain jiffies in a polling fashion, based on rdtsc. */
++ {
++ static unsigned long long prev_tick;
++
++ if (t1 - prev_tick >= jiffy_cycles) {
++ prev_tick += jiffy_cycles;
++ jiffies++;
++ }
++ }
++}
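
The trailing block keeps jiffies ticking by polling the TSC; it assumes jiffy_cycles holds the number of CPU cycles per tick (its initialization lies outside this excerpt). With the default mhz=500 and HZ=100 that works out to jiffy_cycles = 500,000,000 / 100 = 5,000,000 cycles per tick. Likewise, (t1 - t0) >> 20 approximates elapsed megacycles (2^20 is within 5% of 10^6), which makes the idle_timeout comparison behave as "elapsed seconds > idle_timeout" if mhz_cycles stores megacycles per wall-clock second.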
++
++static struct sk_buff * alloc_netconsole_skb(struct net_device *dev, int len, int reserve)
++{
++ int once = 1;
++ int count = 0;
++ struct sk_buff *skb = NULL;
++
++repeat:
++ zap_completion_queue();
++ if (nr_netconsole_skbs < MAX_NETCONSOLE_SKBS)
++ __refill_netconsole_skbs();
++
++ skb = alloc_skb(len, GFP_ATOMIC);
++ if (!skb) {
++ skb = get_netconsole_skb();
++ if (!skb) {
++ count++;
++ if (once && (count == 1000000)) {
++ printk("possibly FATAL: out of netconsole skbs!!! will keep retrying.\n");
++ once = 0;
++ }
++ Dprintk("alloc skb: polling controller ...\n");
++ dev->poll_controller(dev);
++ goto repeat;
++ }
++ }
++
++ atomic_set(&skb->users, 1);
++ skb_reserve(skb, reserve);
++ return skb;
++}
++
++static void transmit_raw_skb(struct sk_buff *skb, struct net_device *dev)
++{
++
++repeat_poll:
++ spin_lock(&dev->xmit_lock);
++ dev->xmit_lock_owner = smp_processor_id();
++
++ if (netif_queue_stopped(dev)) {
++ dev->xmit_lock_owner = -1;
++ spin_unlock(&dev->xmit_lock);
++
++ Dprintk("xmit skb: polling controller ...\n");
++ dev->poll_controller(dev);
++ zap_completion_queue();
++ goto repeat_poll;
++ }
++
++ dev->hard_start_xmit(skb, dev);
++
++ dev->xmit_lock_owner = -1;
++ spin_unlock(&dev->xmit_lock);
++}
++
++static void transmit_netconsole_skb(struct sk_buff *skb, struct net_device *dev,
++ int ip_len, int udp_len,
++ u16 source_port, u16 target_port, u32 source_ip, u32 target_ip,
++ unsigned char * macdaddr)
++{
++ struct udphdr *udph;
++ struct iphdr *iph;
++ struct ethhdr *eth;
++
++ udph = (struct udphdr *) skb_push(skb, sizeof(*udph));
++ udph->source = source_port;
++ udph->dest = target_port;
++ udph->len = htons(udp_len);
++ udph->check = 0;
++
++ iph = (struct iphdr *)skb_push(skb, sizeof(*iph));
++
++ iph->version = 4;
++ iph->ihl = 5;
++ iph->tos = 0;
++ iph->tot_len = htons(ip_len);
++ iph->id = 0;
++ iph->frag_off = 0;
++ iph->ttl = 64;
++ iph->protocol = IPPROTO_UDP;
++ iph->check = 0;
++ iph->saddr = source_ip;
++ iph->daddr = target_ip;
++ iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
++
++ eth = (struct ethhdr *) skb_push(skb, ETH_HLEN);
++
++ eth->h_proto = htons(ETH_P_IP);
++ memcpy(eth->h_source, dev->dev_addr, dev->addr_len);
++ memcpy(eth->h_dest, macdaddr, dev->addr_len);
++
++ transmit_raw_skb(skb, dev);
++}
++
++static void send_netconsole_arp(struct net_device *dev)
++{
++ int total_len, arp_len, arp_data_len;
++ struct sk_buff *skb;
++ unsigned char *arp;
++ struct arphdr *arph;
++ struct ethhdr *eth;
++
++ arp_data_len = 2*4 + 2*ETH_ALEN;
++ arp_len = arp_data_len + sizeof(struct arphdr);
++ total_len = arp_len + ETH_HLEN;
++
++ skb = alloc_netconsole_skb(dev, total_len, total_len - arp_data_len);
++
++ arp = skb->data;
++
++ memcpy(arp, dev->dev_addr, ETH_ALEN);
++ arp += ETH_ALEN;
++
++ memcpy(arp, &source_ip, 4);
++ arp += 4;
++
++ memcpy(arp, arp_sha, ETH_ALEN);
++ arp += ETH_ALEN;
++
++ memcpy(arp, &arp_sip, 4);
++ arp += 4;
++
++ skb->len += 2*4 + 2*ETH_ALEN;
++
++ arph = (struct arphdr *)skb_push(skb, sizeof(*arph));
++
++ arph->ar_hrd = htons(dev->type);
++ arph->ar_pro = __constant_htons(ETH_P_IP);
++ arph->ar_hln = ETH_ALEN;
++ arph->ar_pln = 4;
++ arph->ar_op = __constant_htons(ARPOP_REPLY);
++
++ eth = (struct ethhdr *) skb_push(skb, ETH_HLEN);
++
++ eth->h_proto = htons(ETH_P_ARP);
++ memcpy(eth->h_source, dev->dev_addr, dev->addr_len);
++ memcpy(eth->h_dest, arp_sha, dev->addr_len);
++
++ transmit_raw_skb(skb, dev);
++}
++
++static void send_netdump_skb(struct net_device *dev, const char *msg, unsigned int msg_len, reply_t *reply)
++{
++ int total_len, ip_len, udp_len;
++ struct sk_buff *skb;
++
++ udp_len = msg_len + HEADER_LEN + sizeof(struct udphdr);
++ ip_len = udp_len + sizeof(struct iphdr);
++ total_len = ip_len + ETH_HLEN;
++
++ skb = alloc_netconsole_skb(dev, total_len, total_len - msg_len - HEADER_LEN);
++
++ skb->data[0] = NETCONSOLE_VERSION;
++ put_unaligned(htonl(reply->nr), (u32 *) (skb->data + 1));
++ put_unaligned(htonl(reply->code), (u32 *) (skb->data + 5));
++ put_unaligned(htonl(reply->info), (u32 *) (skb->data + 9));
++
++ memcpy(skb->data + HEADER_LEN, msg, msg_len);
++ skb->len += msg_len + HEADER_LEN;
++
++ transmit_netconsole_skb(skb, dev, ip_len, udp_len,
++ source_port, netdump_target_port, source_ip, netdump_target_ip, netdump_daddr);
++}
++
++#define SYSLOG_HEADER_LEN 4
++
++static void send_netlog_skb(struct net_device *dev, const char *msg, unsigned int msg_len, reply_t *reply)
++{
++ int total_len, ip_len, udp_len;
++ struct sk_buff *skb;
++
++ udp_len = msg_len + HEADER_LEN + sizeof(struct udphdr);
++ ip_len = udp_len + sizeof(struct iphdr);
++ total_len = ip_len + ETH_HLEN;
++
++ skb = alloc_netconsole_skb(dev, total_len, total_len - msg_len - HEADER_LEN);
++
++ skb->data[0] = NETCONSOLE_VERSION;
++ put_unaligned(htonl(reply->nr), (u32 *) (skb->data + 1));
++ put_unaligned(htonl(reply->code), (u32 *) (skb->data + 5));
++ put_unaligned(htonl(reply->info), (u32 *) (skb->data + 9));
++
++ memcpy(skb->data + HEADER_LEN, msg, msg_len);
++ skb->len += msg_len + HEADER_LEN;
++
++ transmit_netconsole_skb(skb, dev, ip_len, udp_len,
++ source_port, netlog_target_port, source_ip, netlog_target_ip, netlog_daddr);
++}
++
++#define SYSLOG_HEADER_LEN 4
++
++static void send_syslog_skb(struct net_device *dev, const char *msg, unsigned int msg_len, int pri)
++{
++ int total_len, ip_len, udp_len;
++ struct sk_buff *skb;
++
++ udp_len = msg_len + SYSLOG_HEADER_LEN + sizeof(struct udphdr);
++ ip_len = udp_len + sizeof(struct iphdr);
++ total_len = ip_len + ETH_HLEN;
++
++ skb = alloc_netconsole_skb(dev, total_len, total_len - msg_len - SYSLOG_HEADER_LEN);
++
++ skb->data[0] = '<';
++ skb->data[1] = pri + '0';
++ skb->data[2]= '>';
++ skb->data[3]= ' ';
++
++ memcpy(skb->data + SYSLOG_HEADER_LEN, msg, msg_len);
++ skb->len += msg_len + SYSLOG_HEADER_LEN;
++
++ transmit_netconsole_skb(skb, dev, ip_len, udp_len, source_port,
++ syslog_target_port, source_ip, syslog_target_ip, syslog_daddr);
++}
++
++#define MAX_SYSLOG_CHARS 1000
++
++static spinlock_t syslog_lock = SPIN_LOCK_UNLOCKED;
++static int syslog_chars;
++static unsigned char syslog_line [MAX_SYSLOG_CHARS + 10];
++
++/*
++ * We feed kernel messages char by char, and send one UDP packet
++ * per linefeed. All characters received are buffered until then.
++ */
++static inline void feed_syslog_char(struct net_device *dev, const unsigned char c)
++{
++ if (syslog_chars == MAX_SYSLOG_CHARS)
++ syslog_chars--;
++ syslog_line[syslog_chars] = c;
++ syslog_chars++;
++ if (c == '\n') {
++ send_syslog_skb(dev, syslog_line, syslog_chars, 5);
++ syslog_chars = 0;
++ }
++}
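
The wire format is the classic BSD syslog datagram: a priority in angle brackets followed by the buffered line. For example, a printk of "foo\n" reaching this console yields the 8-byte UDP payload "<5> foo\n" (the 4 header bytes plus the message), sent to syslog_target_port, which the init code below defaults to 514.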
++
++static spinlock_t sequence_lock = SPIN_LOCK_UNLOCKED;
++static unsigned int log_offset;
++
++static void write_netconsole_msg(struct console *con, const char *msg0, unsigned int msg_len)
++{
++ int len, left, i;
++ struct net_device *dev;
++ const char *msg = msg0;
++ reply_t reply;
++
++ dev = netconsole_dev;
++ if (!dev || netdump_mode)
++ return;
++
++ if (dev->poll_controller && netif_running(dev)) {
++ unsigned long flags;
++
++ __save_flags(flags);
++ __cli();
++ left = msg_len;
++ if (netlog_target_ip) {
++ while (left) {
++ if (left > MAX_PRINT_CHUNK)
++ len = MAX_PRINT_CHUNK;
++ else
++ len = left;
++ reply.code = REPLY_LOG;
++ reply.nr = 0;
++ spin_lock(&sequence_lock);
++ reply.info = log_offset;
++ log_offset += len;
++ spin_unlock(&sequence_lock);
++ send_netlog_skb(dev, msg, len, &reply);
++ msg += len;
++ left -= len;
++ }
++ }
++ if (syslog_target_ip) {
++ spin_lock(&syslog_lock);
++ for (i = 0; i < msg_len; i++)
++ feed_syslog_char(dev, msg0[i]);
++ spin_unlock(&syslog_lock);
++ }
++
++ __restore_flags(flags);
++ }
++}
++
++static unsigned short udp_check(struct udphdr *uh, int len, unsigned long saddr, unsigned long daddr, unsigned long base)
++{
++ return(csum_tcpudp_magic(saddr, daddr, len, IPPROTO_UDP, base));
++}
++
++static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh,
++ unsigned short ulen, u32 saddr, u32 daddr)
++{
++ if (uh->check == 0) {
++ skb->ip_summed = CHECKSUM_UNNECESSARY;
++ } else if (skb->ip_summed == CHECKSUM_HW) {
++ skb->ip_summed = CHECKSUM_UNNECESSARY;
++ if (!udp_check(uh, ulen, saddr, daddr, skb->csum))
++ return 0;
++ skb->ip_summed = CHECKSUM_NONE;
++ }
++ if (skb->ip_summed != CHECKSUM_UNNECESSARY)
++ skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP,
++				       0);
++ /* Probably, we should checksum udp header (it should be in cache
++ * in any case) and data in tiny packets (< rx copybreak).
++ */
++ return 0;
++}
++
++static __inline__ int __udp_checksum_complete(struct sk_buff *skb)
++{
++ return (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum));
++}
++
++static __inline__ int udp_checksum_complete(struct sk_buff *skb)
++{
++ return skb->ip_summed != CHECKSUM_UNNECESSARY &&
++ __udp_checksum_complete(skb);
++}
++
++/*
++ * NOTE: security depends on the trusted path between the netconsole
++ * server and netconsole client, since none of the packets are
++ * encrypted. The random magic number protects the protocol
++ * against spoofing.
++ */
++static u64 netconsole_magic;
++static u32 magic1, magic2;
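
The two 32-bit module parameters are folded into the 64-bit cookie in init_netconsole() below as netconsole_magic = magic1 + (((u64)magic2)<<32); for example, magic1=0xdeadbeef and magic2=0x12345678 yield the cookie 0x12345678deadbeef.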
++
++static spinlock_t req_lock = SPIN_LOCK_UNLOCKED;
++static int nr_req = 0;
++static LIST_HEAD(request_list);
++
++static void add_new_req(req_t *req)
++{
++ unsigned long flags;
++
++ spin_lock_irqsave(&req_lock, flags);
++ list_add_tail(&req->list, &request_list);
++ nr_req++;
++ Dprintk("pending requests: %d.\n", nr_req);
++ spin_unlock_irqrestore(&req_lock, flags);
++
++ rdtscll(t0);
++}
++
++static req_t *get_new_req(void)
++{
++ req_t *req = NULL;
++ unsigned long flags;
++
++ spin_lock_irqsave(&req_lock, flags);
++ if (nr_req) {
++ req = list_entry(request_list.next, req_t, list);
++ list_del(&req->list);
++ nr_req--;
++ }
++ spin_unlock_irqrestore(&req_lock, flags);
++
++ return req;
++}
++
++static req_t *alloc_req(void)
++{
++ req_t *req;
++
++ req = (req_t *) kmalloc(sizeof(*req), GFP_ATOMIC);
++ return req;
++}
++
++static int netconsole_rx_hook(struct sk_buff *skb)
++{
++ int proto;
++ struct iphdr *iph;
++ struct udphdr *uh;
++ __u32 len, saddr, daddr, ulen;
++ req_t *__req;
++ req_t *req;
++ struct net_device *dev;
++
++ if (!netdump_mode)
++ return NET_RX_SUCCESS;
++#if DEBUG
++ {
++ static int packet_count;
++ Dprintk(" %d\r", ++packet_count);
++ }
++#endif
++ dev = skb->dev;
++ if (dev->type != ARPHRD_ETHER)
++ goto out;
++ proto = ntohs(skb->mac.ethernet->h_proto);
++ Dprintk("rx got skb %p (len: %d, users: %d), dev %s, h_proto: %04x.\n", skb, skb->len, atomic_read(&skb->users), dev->name, proto);
++ #define D(x) skb->mac.ethernet->h_dest[x]
++ Dprintk("... h_dest: %02X:%02X:%02X:%02X:%02X:%02X.\n", D(0), D(1), D(2), D(3), D(4), D(5));
++ #undef D
++ #define D(x) skb->mac.ethernet->h_source[x]
++ Dprintk("... h_source: %02X:%02X:%02X:%02X:%02X:%02X.\n", D(0), D(1), D(2), D(3), D(4), D(5));
++ #undef D
++ if (skb->pkt_type == PACKET_OTHERHOST)
++ goto out;
++ if (skb_shared(skb))
++ goto out;
++ if (proto == ETH_P_ARP) {
++ struct arphdr *arp;
++ unsigned char *arp_ptr;
++
++ Dprintk("got arp skb.\n");
++ arp = (struct arphdr *)skb->data;
++ if (!pskb_may_pull(skb, sizeof(struct arphdr) + 2*4 + 2*ETH_ALEN))
++ goto out;
++ if (htons(dev->type) != arp->ar_hrd)
++ goto out;
++ if (arp->ar_pro != __constant_htons(ETH_P_IP))
++ goto out;
++ if (arp->ar_hln != ETH_ALEN)
++ goto out;
++ if (arp->ar_pln != 4)
++ goto out;
++ if (arp->ar_op != __constant_htons(ARPOP_REQUEST))
++ goto out;
++ /*
++ * ARP header looks ok so far, extract fields:
++ */
++ arp_ptr = (unsigned char *)(arp + 1);
++
++ memcpy(arp_sha, arp_ptr, ETH_ALEN);
++ arp_ptr += ETH_ALEN;
++
++ memcpy(&arp_sip, arp_ptr, 4);
++ arp_ptr += 4;
++
++ memcpy(arp_tha, arp_ptr, ETH_ALEN);
++ arp_ptr += ETH_ALEN;
++
++ memcpy(&arp_tip, arp_ptr, 4);
++
++ #define D(x) arp_sha[x]
++ Dprintk("... arp_sha: %02X:%02X:%02X:%02X:%02X:%02X.\n", D(0), D(1), D(2), D(3), D(4), D(5));
++ #undef D
++ #define D(x) ((unsigned char *)&arp_sip)[x]
++ Dprintk("... arp_sip: %d.%d.%d.%d.\n", D(0), D(1), D(2), D(3));
++ #undef D
++ #define D(x) arp_tha[x]
++ Dprintk("... arp_tha: %02X:%02X:%02X:%02X:%02X:%02X.\n", D(0), D(1), D(2), D(3), D(4), D(5));
++ #undef D
++ #define D(x) ((unsigned char *)&arp_tip)[x]
++ Dprintk("... arp_tip: %d.%d.%d.%d.\n", D(0), D(1), D(2), D(3));
++ #undef D
++ #define D(x) ((unsigned char *)&source_ip)[x]
++ Dprintk("... (source_ip): %d.%d.%d.%d.\n", D(0), D(1), D(2), D(3));
++ #undef D
++
++ if (LOOPBACK(arp_tip) || MULTICAST(arp_tip))
++ goto out;
++
++ if (arp_tip != source_ip)
++ goto out;
++ new_arp = 1;
++ goto out;
++ }
++ if (proto != ETH_P_IP)
++ goto out;
++ /*
++ * IP header correctness testing:
++ */
++ iph = (struct iphdr *)skb->data;
++ if (!pskb_may_pull(skb, sizeof(struct iphdr)))
++ goto out;
++ Dprintk("... IP ihl*4: %d, version: %d.\n", iph->ihl*4, iph->version);
++ if (iph->ihl < 5 || iph->version != 4)
++ goto out;
++ if (!pskb_may_pull(skb, iph->ihl*4))
++ goto out;
++ if (ip_fast_csum((u8 *)iph, iph->ihl) != 0)
++ goto out;
++ len = ntohs(iph->tot_len);
++ Dprintk("... IP len: %d.\n", len);
++ if (skb->len < len || len < iph->ihl*4)
++ goto out;
++ saddr = iph->saddr;
++ daddr = iph->daddr;
++ Dprintk("... IP src: %08x, dst: %08x.\n", saddr, daddr);
++ Dprintk("... IP protocol: %d.\n", iph->protocol);
++ if (iph->protocol != IPPROTO_UDP)
++ goto out;
++ Dprintk("... netdump src: %08x, dst: %08x.\n", source_ip, netlog_target_ip);
++ if (source_ip != daddr)
++ goto out;
++ if (netlog_target_ip != saddr)
++ goto out;
++ len -= iph->ihl*4;
++ uh = (struct udphdr *)(((char *)iph) + iph->ihl*4);
++ ulen = ntohs(uh->len);
++ Dprintk("... UDP len: %d (left %d).\n", ulen, len);
++
++#define MIN_COMM_SIZE (sizeof(*uh) + NETDUMP_REQ_SIZE)
++ if (ulen != len || ulen < MIN_COMM_SIZE) {
++ Dprintk("... UDP, hm, len not ok.\n");
++ goto out;
++ }
++ if (udp_checksum_init(skb, uh, ulen, saddr, daddr) < 0) {
++ Dprintk("... UDP, hm, checksum init not ok.\n");
++ goto out;
++ }
++ if (udp_checksum_complete(skb)) {
++ Dprintk("... UDP, hm, checksum complete not ok.\n");
++ goto out;
++ }
++ Dprintk("... UDP packet OK!\n");
++ Dprintk("... UDP src port: %d, dst port: %d.\n", uh->source, uh->dest);
++ if (source_port != uh->source)
++ goto out;
++ if (netlog_target_port != uh->dest)
++ goto out;
++ __req = (req_t *)(uh + 1);
++ Dprintk("... UDP netdump packet OK!\n");
++
++ req = alloc_req();
++ if (!req) {
++ printk("no more RAM to allocate request - dropping it.\n");
++ goto out;
++ }
++
++ req->magic = ntohl(__req->magic);
++ req->command = ntohl(__req->command);
++ req->from = ntohl(__req->from);
++ req->to = ntohl(__req->to);
++ req->nr = ntohl(__req->nr);
++
++ Dprintk("... netdump magic: %08Lx.\n", req->magic);
++ Dprintk("... netdump command: %08x.\n", req->command);
++ Dprintk("... netdump from: %08x.\n", req->from);
++ Dprintk("... netdump to: %08x.\n", req->to);
++
++ add_new_req(req);
++out:
++ return NET_RX_DROP;
++}
++
++#define INVALID_PAGE "page is not valid!\n"
++
++static void send_netdump_mem (struct net_device *dev, req_t *req)
++{
++ int i;
++ char *kaddr;
++ char str[1024];
++ struct page *page;
++ unsigned long nr = req->from;
++ int nr_chunks = PAGE_SIZE/1024;
++ reply_t reply;
++
++ reply.nr = req->nr;
++ reply.info = 0;
++ if (req->from >= max_mapnr) {
++ sprintf(str, "page %08lx is bigger than max page # %08lx!\n", nr, max_mapnr);
++ reply.code = REPLY_ERROR;
++ send_netdump_skb(dev, str, strlen(str), &reply);
++ return;
++ }
++ page = mem_map + nr;
++ if (PageReserved(page))
++ page = ZERO_PAGE(0);
++
++ kaddr = (char *)kmap_atomic(page, KM_NETDUMP);
++
++ for (i = 0; i < nr_chunks; i++) {
++ unsigned int offset = i*1024;
++ reply.code = REPLY_MEM;
++ reply.info = offset;
++ send_netdump_skb(dev, kaddr + offset, 1024, &reply);
++ }
++
++ kunmap_atomic(kaddr, KM_NETDUMP);
++}
++
++/*
++ * This function waits for the client to acknowledge the receipt
++ * of the netdump startup reply, with the possibility of packets
++ * getting lost. We resend the startup packet if no ACK is received,
++ * after a 1 second delay.
++ *
++ * (The client can test the success of the handshake via the HELLO
++ * command, and send ACKs until we enter netdump mode.)
++ */
++static void netdump_startup_handshake(struct net_device *dev)
++{
++ char tmp[200];
++ reply_t reply;
++ req_t *req = NULL;
++ int i;
++
++ netdump_mode = 1;
++
++repeat:
++ sprintf(tmp, "NETDUMP start, waiting for start-ACK.\n");
++ reply.code = REPLY_START_NETDUMP;
++ reply.nr = 0;
++ reply.info = 0;
++ send_netdump_skb(dev, tmp, strlen(tmp), &reply);
++
++ for (i = 0; i < 10000; i++) {
++ // wait 1 sec.
++ udelay(100);
++ Dprintk("handshake: polling controller ...\n");
++ dev->poll_controller(dev);
++ zap_completion_queue();
++ req = get_new_req();
++ if (req)
++ break;
++ }
++ if (!req)
++ goto repeat;
++ if (req->command != COMM_START_NETDUMP_ACK) {
++ kfree(req);
++ goto repeat;
++ }
++ kfree(req);
++
++ printk("NETDUMP START!\n");
++}
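
Schematically, the client's half of this handshake looks as follows; the client is a userspace tool outside this patch, and this is sketched only to make the retry logic above concrete.

    /* pseudo-code for the netdump client, not part of the kernel patch */
    for (;;) {
            recv(sock, &pkt, sizeof(pkt), 0);           /* wait for announce */
            if (pkt.code != REPLY_START_NETDUMP)
                    continue;
            req.command = htonl(COMM_START_NETDUMP_ACK);
            send(sock, &req, sizeof(req), 0);           /* ACK the server */
            break;                         /* then start requesting pages */
    }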
++
++#if 0
++
++static inline void print_status (req_t *req)
++{
++ static int count = 0;
++
++ switch (++count & 3) {
++ case 0: printk("/\r"); break;
++ case 1: printk("|\r"); break;
++ case 2: printk("\\\r"); break;
++ case 3: printk("-\r"); break;
++ }
++}
++
++#else
++
++static inline void print_status (req_t *req)
++{
++ static int count = 0;
++ static int prev_jiffies = 0;
++
++ if (jiffies/HZ != prev_jiffies/HZ) {
++ prev_jiffies = jiffies;
++ count++;
++ switch (count & 3) {
++ case 0: printk("%d(%ld)/\r", nr_req, jiffies); break;
++ case 1: printk("%d(%ld)|\r", nr_req, jiffies); break;
++ case 2: printk("%d(%ld)\\\r", nr_req, jiffies); break;
++ case 3: printk("%d(%ld)-\r", nr_req, jiffies); break;
++ }
++ }
++}
++
++#endif
++
++#define CLI 1
++
++#if CONFIG_SMP
++static void freeze_cpu (void * dummy)
++{
++ printk("CPU#%d is frozen.\n", smp_processor_id());
++#if CLI
++ for (;;) __cli();
++#else
++ for (;;) __sti();
++#endif
++}
++#endif
++
++static void netconsole_netdump (struct pt_regs *regs)
++{
++ reply_t reply;
++ char tmp[200];
++ unsigned long flags;
++ struct net_device *dev = netconsole_dev;
++ unsigned long esp;
++ unsigned short ss;
++ struct pt_regs myregs;
++ req_t *req;
++
++ __save_flags(flags);
++ __cli();
++#if CONFIG_X86_LOCAL_APIC
++ nmi_watchdog = 0;
++#endif
++#if CONFIG_SMP
++ smp_call_function(freeze_cpu, NULL, 1, 0);
++#endif
++ mdelay(1000);
++ /*
++ * Just in case we are crashing within the networking code
++ * ... attempt to fix up.
++ */
++ spin_lock_init(&dev->xmit_lock);
++
++ esp = (unsigned long) ((char *)regs + sizeof (struct pt_regs));
++ ss = __KERNEL_DS;
++ if (regs->xcs & 3) {
++ esp = regs->esp;
++ ss = regs->xss & 0xffff;
++ }
++ myregs = *regs;
++ myregs.esp = esp;
++ myregs.xss = (myregs.xss & 0xffff0000) | ss;
++
++ rdtscll(t0);
++
++ printk("< netdump activated - performing handshake with the client. >\n");
++ netdump_startup_handshake(dev);
++
++ printk("< handshake completed - listening for dump requests. >\n");
++
++ while (netdump_mode) {
++ __cli();
++ Dprintk("main netdump loop: polling controller ...\n");
++ dev->poll_controller(dev);
++ zap_completion_queue();
++#if !CLI
++ __sti();
++#endif
++ req = get_new_req();
++ if (!req)
++ continue;
++ Dprintk("got new req, command %d.\n", req->command);
++ print_status(req);
++ switch (req->command) {
++ case COMM_NONE:
++ Dprintk("got NO command.\n");
++ break;
++
++ case COMM_SEND_MEM:
++ Dprintk("got MEM command.\n");
++ // send ->from ->to.
++ send_netdump_mem(dev, req);
++ break;
++
++ case COMM_EXIT:
++ Dprintk("got EXIT command.\n");
++ netdump_mode = 0;
++ break;
++
++ case COMM_REBOOT:
++ Dprintk("got REBOOT command.\n");
++ printk("netdump: rebooting in 3 seconds.\n");
++ mdelay(3000);
++ machine_restart(NULL);
++ break;
++
++ case COMM_HELLO:
++ sprintf(tmp, "Hello, this is netdump version 0.%02d\n", NETCONSOLE_VERSION);
++ reply.code = REPLY_HELLO;
++ reply.nr = req->nr;
++ reply.info = NETCONSOLE_VERSION;
++ send_netdump_skb(dev, tmp, strlen(tmp), &reply);
++ break;
++
++ case COMM_GET_PAGE_SIZE:
++ sprintf(tmp, "PAGE_SIZE: %ld\n", PAGE_SIZE);
++ reply.code = REPLY_PAGE_SIZE;
++ reply.nr = req->nr;
++ reply.info = PAGE_SIZE;
++ send_netdump_skb(dev, tmp, strlen(tmp), &reply);
++ break;
++
++ case COMM_GET_REGS:
++ {
++ char *tmp2 = tmp;
++ elf_gregset_t elf_regs;
++
++ reply.code = REPLY_REGS;
++ reply.nr = req->nr;
++ reply.info = max_mapnr;
++ tmp2 = tmp + sprintf(tmp, "Sending register info.\n");
++ ELF_CORE_COPY_REGS(elf_regs, regs);
++ memcpy(tmp2, &elf_regs, sizeof(elf_regs));
++ send_netdump_skb(dev, tmp, strlen(tmp) + sizeof(elf_regs), &reply);
++ break;
++ }
++
++ case COMM_GET_NR_PAGES:
++ reply.code = REPLY_NR_PAGES;
++ reply.nr = req->nr;
++ reply.info = max_mapnr;
++ sprintf(tmp, "Number of pages: %ld\n", max_mapnr);
++ send_netdump_skb(dev, tmp, strlen(tmp), &reply);
++ break;
++
++ case COMM_SHOW_STATE:
++ netdump_mode = 0;
++ if (regs)
++ show_regs(regs);
++ show_state();
++ show_mem();
++ netdump_mode = 1;
++ reply.code = REPLY_SHOW_STATE;
++ reply.nr = req->nr;
++ reply.info = 0;
++ send_netdump_skb(dev, tmp, strlen(tmp), &reply);
++ break;
++
++ default:
++ reply.code = REPLY_ERROR;
++ reply.nr = req->nr;
++ reply.info = req->command;
++ Dprintk("got UNKNOWN command!\n");
++ sprintf(tmp, "Got unknown command code %d!\n", req->command);
++ send_netdump_skb(dev, tmp, strlen(tmp), &reply);
++ break;
++ }
++ kfree(req);
++ req = NULL;
++ }
++ sprintf(tmp, "NETDUMP end.\n");
++ reply.code = REPLY_END_NETDUMP;
++ reply.nr = 0;
++ reply.info = 0;
++ send_netdump_skb(dev, tmp, strlen(tmp), &reply);
++ printk("NETDUMP END!\n");
++ __restore_flags(flags);
++}
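
For quick reference, the dispatch loop above maps requests to replies as follows (anything unknown gets REPLY_ERROR):

    COMM_NONE          -> no reply
    COMM_SEND_MEM      -> REPLY_MEM, one page in 1024-byte chunks
    COMM_EXIT          -> leaves netdump mode, then REPLY_END_NETDUMP
    COMM_REBOOT        -> reboots the box after 3 seconds
    COMM_HELLO         -> REPLY_HELLO with the netconsole version
    COMM_GET_PAGE_SIZE -> REPLY_PAGE_SIZE
    COMM_GET_REGS      -> REPLY_REGS with an ELF register set
    COMM_GET_NR_PAGES  -> REPLY_NR_PAGES (max_mapnr)
    COMM_SHOW_STATE    -> REPLY_SHOW_STATE, after dumping regs/tasks/memory
                          info to the local console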
++
++static char *dev;
++static int netdump_target_eth_byte0 = 255;
++static int netdump_target_eth_byte1 = 255;
++static int netdump_target_eth_byte2 = 255;
++static int netdump_target_eth_byte3 = 255;
++static int netdump_target_eth_byte4 = 255;
++static int netdump_target_eth_byte5 = 255;
++
++static int netlog_target_eth_byte0 = 255;
++static int netlog_target_eth_byte1 = 255;
++static int netlog_target_eth_byte2 = 255;
++static int netlog_target_eth_byte3 = 255;
++static int netlog_target_eth_byte4 = 255;
++static int netlog_target_eth_byte5 = 255;
++
++static int syslog_target_eth_byte0 = 255;
++static int syslog_target_eth_byte1 = 255;
++static int syslog_target_eth_byte2 = 255;
++static int syslog_target_eth_byte3 = 255;
++static int syslog_target_eth_byte4 = 255;
++static int syslog_target_eth_byte5 = 255;
++
++MODULE_PARM(netdump_target_ip, "i");
++MODULE_PARM_DESC(netdump_target_ip,
++ "remote netdump IP address as a native (not network) endian integer");
++MODULE_PARM(netlog_target_ip, "i");
++MODULE_PARM_DESC(netlog_target_ip,
++ "remote netlog IP address as a native (not network) endian integer");
++MODULE_PARM(syslog_target_ip, "i");
++MODULE_PARM_DESC(syslog_target_ip,
++ "remote syslog IP address as a native (not network) endian integer");
++
++MODULE_PARM(source_port, "h");
++MODULE_PARM_DESC(source_port,
++ "local port from which to send netdump packets");
++
++MODULE_PARM(netdump_target_port, "h");
++MODULE_PARM_DESC(netdump_target_port,
++ "remote port to which to send netdump packets");
++MODULE_PARM(netlog_target_port, "h");
++MODULE_PARM_DESC(netlog_target_port,
++ "remote port to which to send netlog packets");
++MODULE_PARM(syslog_target_port, "h");
++MODULE_PARM_DESC(syslog_target_port,
++ "remote port to which to send syslog packets");
++
++#define ETH_BYTE(name,nr) \
++ MODULE_PARM(name##_target_eth_byte##nr, "i"); \
++ MODULE_PARM_DESC(name##_target_eth_byte##nr, \
++ "byte "#nr" of the netdump server MAC address")
++
++#define ETH_BYTES(name) \
++ ETH_BYTE(name, 0); ETH_BYTE(name, 1); ETH_BYTE(name, 2); \
++ ETH_BYTE(name, 3); ETH_BYTE(name, 4); ETH_BYTE(name, 5);
++
++ETH_BYTES(netdump);
++ETH_BYTES(netlog);
++ETH_BYTES(syslog);
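
For reference, each ETH_BYTES() line above expands to six MODULE_PARM/MODULE_PARM_DESC pairs, one per MAC byte; the first pair of ETH_BYTES(netdump) is:

    MODULE_PARM(netdump_target_eth_byte0, "i");
    MODULE_PARM_DESC(netdump_target_eth_byte0,
            "byte 0 of the netdump server MAC address");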
++
++MODULE_PARM(magic1, "i");
++MODULE_PARM_DESC(magic1,
++ "lower 32 bits of magic cookie shared between client and server");
++MODULE_PARM(magic2, "i");
++MODULE_PARM_DESC(magic2,
++ "upper 32 bits of magic cookie shared between client and server");
++MODULE_PARM(dev, "s");
++MODULE_PARM_DESC(dev,
++ "name of the device from which to send netdump and syslog packets");
++MODULE_PARM(mhz, "i");
++MODULE_PARM_DESC(mhz,
++ "one second wall clock time takes this many million CPU cycles");
++MODULE_PARM(idle_timeout, "i");
++MODULE_PARM_DESC(idle_timeout,
++ "reboot system after this many idle seconds");
++
++static struct console netconsole =
++ { flags: CON_ENABLED, write: write_netconsole_msg };
++
++static int init_netconsole(void)
++{
++ struct net_device *ndev = NULL;
++ struct in_device *in_dev;
++
++ printk(KERN_INFO "netlog: using network device <%s>\n", dev);
++ // this will be valid once the device goes up.
++ if (dev)
++ ndev = dev_get_by_name(dev);
++ if (!ndev) {
++ printk(KERN_ERR "netlog: network device %s does not exist, aborting.\n", dev);
++ return -1;
++ }
++ if (!ndev->poll_controller) {
++ printk(KERN_ERR "netlog: %s's network driver does not implement netlogging yet, aborting.\n", dev);
++ return -1;
++ }
++ in_dev = in_dev_get(ndev);
++ if (!in_dev) {
++ printk(KERN_ERR "netlog: network device %s is not an IP protocol device, aborting.\n", dev);
++ return -1;
++ }
++
++ if (!magic1 || !magic2) {
++ printk(KERN_ERR "netlog: magic cookie (magic1,magic2) not specified.\n");
++ return -1;
++ }
++ netconsole_magic = magic1 + (((u64)magic2)<<32);
++
++ source_ip = ntohl(in_dev->ifa_list->ifa_local);
++ if (!source_ip) {
++ printk(KERN_ERR "netlog: network device %s has no local address, aborting.\n", dev);
++ return -1;
++ }
++#define IP(x) ((unsigned char *)&source_ip)[x]
++ printk(KERN_INFO "netlog: using source IP %u.%u.%u.%u\n",
++ IP(3), IP(2), IP(1), IP(0));
++#undef IP
++ source_ip = htonl(source_ip);
++ if (!source_port) {
++ printk(KERN_ERR "netlog: source_port parameter not specified, aborting.\n");
++ return -1;
++ }
++ printk(KERN_INFO "netlog: using source UDP port: %u\n", source_port);
++ source_port = htons(source_port);
++
++ if (!netdump_target_ip && !netlog_target_ip && !syslog_target_ip) {
++ printk(KERN_ERR "netlog: target_ip parameter not specified, aborting.\n");
++ return -1;
++ }
++ if (netdump_target_ip) {
++#define IP(x) ((unsigned char *)&netdump_target_ip)[x]
++ printk(KERN_INFO "netlog: using netdump target IP %u.%u.%u.%u\n",
++ IP(3), IP(2), IP(1), IP(0));
++#undef IP
++ netdump_target_ip = htonl(netdump_target_ip);
++ }
++ if (netlog_target_ip) {
++#define IP(x) ((unsigned char *)&netlog_target_ip)[x]
++ printk(KERN_INFO "netlog: using netlog target IP %u.%u.%u.%u\n",
++ IP(3), IP(2), IP(1), IP(0));
++#undef IP
++ netlog_target_ip = htonl(netlog_target_ip);
++ }
++ if (syslog_target_ip) {
++ if (!syslog_target_port)
++ syslog_target_port = 514;
++#define IP(x) ((unsigned char *)&syslog_target_ip)[x]
++ printk("netlog: using syslog target IP %u.%u.%u.%u, port: %d\n", IP(3), IP(2), IP(1), IP(0), syslog_target_port);
++#undef IP
++ syslog_target_ip = htonl(syslog_target_ip);
++ syslog_target_port = htons(syslog_target_port);
++ }
++ if (!netdump_target_port && !netlog_target_port && !syslog_target_port) {
++ printk(KERN_ERR "netlog: target_port parameter not specified, aborting.\n");
++ return -1;
++ }
++ if (netdump_target_port) {
++ printk(KERN_INFO "netlog: using target UDP port: %u\n", netdump_target_port);
++ netdump_target_port = htons(netdump_target_port);
++ }
++ if (netlog_target_port) {
++ printk(KERN_INFO "netlog: using target UDP port: %u\n", netlog_target_port);
++ netlog_target_port = htons(netlog_target_port);
++ }
++
++ netdump_daddr[0] = netdump_target_eth_byte0;
++ netdump_daddr[1] = netdump_target_eth_byte1;
++ netdump_daddr[2] = netdump_target_eth_byte2;
++ netdump_daddr[3] = netdump_target_eth_byte3;
++ netdump_daddr[4] = netdump_target_eth_byte4;
++ netdump_daddr[5] = netdump_target_eth_byte5;
++
++ if ((netdump_daddr[0] & netdump_daddr[1] & netdump_daddr[2] & netdump_daddr[3] & netdump_daddr[4] & netdump_daddr[5]) == 255)
++ printk(KERN_INFO "netlog: using broadcast ethernet frames to send netdump packets.\n");
++ else
++ printk(KERN_INFO "netlog: using netdump target ethernet address %02x:%02x:%02x:%02x:%02x:%02x.\n",
++ netdump_daddr[0], netdump_daddr[1], netdump_daddr[2], netdump_daddr[3], netdump_daddr[4], netdump_daddr[5]);
++
++ netlog_daddr[0] = netlog_target_eth_byte0;
++ netlog_daddr[1] = netlog_target_eth_byte1;
++ netlog_daddr[2] = netlog_target_eth_byte2;
++ netlog_daddr[3] = netlog_target_eth_byte3;
++ netlog_daddr[4] = netlog_target_eth_byte4;
++ netlog_daddr[5] = netlog_target_eth_byte5;
++
++ if ((netlog_daddr[0] & netlog_daddr[1] & netlog_daddr[2] & netlog_daddr[3] & netlog_daddr[4] & netlog_daddr[5]) == 255)
++ printk(KERN_INFO "netlog: using broadcast ethernet frames to send netdump packets.\n");
++ else
++ printk(KERN_INFO "netlog: using netdump target ethernet address %02x:%02x:%02x:%02x:%02x:%02x.\n",
++ netlog_daddr[0], netlog_daddr[1], netlog_daddr[2], netlog_daddr[3], netlog_daddr[4], netlog_daddr[5]);
++ syslog_daddr[0] = syslog_target_eth_byte0;
++ syslog_daddr[1] = syslog_target_eth_byte1;
++ syslog_daddr[2] = syslog_target_eth_byte2;
++ syslog_daddr[3] = syslog_target_eth_byte3;
++ syslog_daddr[4] = syslog_target_eth_byte4;
++ syslog_daddr[5] = syslog_target_eth_byte5;
++
++ if ((syslog_daddr[0] & syslog_daddr[1] & syslog_daddr[2] & syslog_daddr[3] & syslog_daddr[4] & syslog_daddr[5]) == 255)
++ printk(KERN_INFO "netlog: using broadcast ethernet frames to send syslog packets.\n");
++ else
++ printk(KERN_INFO "netlog: using syslog target ethernet address %02x:%02x:%02x:%02x:%02x:%02x.\n",
++ syslog_daddr[0], syslog_daddr[1], syslog_daddr[2], syslog_daddr[3], syslog_daddr[4], syslog_daddr[5]);
++
++ mhz_cycles = (unsigned long long)mhz * 1000000ULL;
++ jiffy_cycles = (unsigned long long)mhz * (1000000/HZ);
++
++ INIT_LIST_HEAD(&request_list);
++
++ ndev->rx_hook = netconsole_rx_hook;
++ netdump_func = netconsole_netdump;
++ netconsole_dev = ndev;
++#define STARTUP_MSG "[...network console startup...]\n"
++ write_netconsole_msg(NULL, STARTUP_MSG, strlen(STARTUP_MSG));
++
++ register_console(&netconsole);
++ printk(KERN_INFO "netlog: network logging started up successfully!\n");
++ return 0;
++}
++
++static void cleanup_netconsole(void)
++{
++ printk(KERN_INFO "netlog: network logging shut down.\n");
++ unregister_console(&netconsole);
++
++#define SHUTDOWN_MSG "[...network console shutdown...]\n"
++ write_netconsole_msg(NULL, SHUTDOWN_MSG, strlen(SHUTDOWN_MSG));
++ netconsole_dev->rx_hook = NULL;
++ netconsole_dev = NULL;
++}
++
++module_init(init_netconsole);
++module_exit(cleanup_netconsole);
++
++MODULE_LICENSE("GPL");
++
+Index: linux-2.4.24/drivers/net/netconsole.h
+===================================================================
+--- linux-2.4.24.orig/drivers/net/netconsole.h 1969-12-31 19:00:00.000000000 -0500
++++ linux-2.4.24/drivers/net/netconsole.h 2004-05-07 16:58:39.000000000 -0400
+@@ -0,0 +1,81 @@
++/*
++ * linux/drivers/net/netconsole.h
++ *
++ * Copyright (C) 2001 Ingo Molnar <mingo@redhat.com>
++ *
++ * This file contains the implementation of an IRQ-safe, crash-safe
++ * kernel console implementation that outputs kernel messages to the
++ * network.
++ *
++ * Modification history:
++ *
++ * 2001-09-17 started by Ingo Molnar.
++ */
++
++/****************************************************************
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2, or (at your option)
++ * any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
++ *
++ ****************************************************************/
++
++#define NETCONSOLE_VERSION 0x04
++
++enum netdump_commands {
++ COMM_NONE = 0,
++ COMM_SEND_MEM = 1,
++ COMM_EXIT = 2,
++ COMM_REBOOT = 3,
++ COMM_HELLO = 4,
++ COMM_GET_NR_PAGES = 5,
++ COMM_GET_PAGE_SIZE = 6,
++ COMM_START_NETDUMP_ACK = 7,
++ COMM_GET_REGS = 8,
++ COMM_SHOW_STATE = 9,
++};
++
++#define NETDUMP_REQ_SIZE (8+4*4)
++
++typedef struct netdump_req_s {
++ u64 magic;
++ u32 nr;
++ u32 command;
++ u32 from;
++ u32 to;
++ struct list_head list;
++} req_t;
++
++enum netdump_replies {
++ REPLY_NONE = 0,
++ REPLY_ERROR = 1,
++ REPLY_LOG = 2,
++ REPLY_MEM = 3,
++ REPLY_RESERVED = 4,
++ REPLY_HELLO = 5,
++ REPLY_NR_PAGES = 6,
++ REPLY_PAGE_SIZE = 7,
++ REPLY_START_NETDUMP = 8,
++ REPLY_END_NETDUMP = 9,
++ REPLY_REGS = 10,
++ REPLY_MAGIC = 11,
++ REPLY_SHOW_STATE = 12,
++};
++
++typedef struct netdump_reply_s {
++ u32 nr;
++ u32 code;
++ u32 info;
++} reply_t;
++
++#define HEADER_LEN (1 + sizeof(reply_t))
++
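
NETDUMP_REQ_SIZE counts only the first 8+4*4 bytes of req_t: the trailing struct list_head is host-side bookkeeping and never travels on the wire. A sketch of how those bytes map onto the struct (example_unpack_request is hypothetical, for illustration only):

	static void example_unpack_request(const unsigned char *wire, req_t *req)
	{
		memcpy(&req->magic,   wire,      8); /* shared magic cookie */
		memcpy(&req->nr,      wire + 8,  4); /* request sequence number */
		memcpy(&req->command, wire + 12, 4); /* enum netdump_commands */
		memcpy(&req->from,    wire + 16, 4); /* e.g. first page wanted */
		memcpy(&req->to,      wire + 20, 4); /* e.g. last page wanted */
		INIT_LIST_HEAD(&req->list);          /* host-only member */
	}
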
+Index: linux-2.4.24/drivers/net/tlan.c
+===================================================================
+--- linux-2.4.24.orig/drivers/net/tlan.c 2003-11-28 13:26:20.000000000 -0500
++++ linux-2.4.24/drivers/net/tlan.c 2004-05-07 16:58:39.000000000 -0400
+@@ -345,6 +345,8 @@
+ static void TLan_EeReceiveByte( u16, u8 *, int );
+ static int TLan_EeReadByte( struct net_device *, u8, u8 * );
+
++static void TLan_Poll(struct net_device *);
++
+
+ static void
+ TLan_StoreSKB( struct tlan_list_tag *tag, struct sk_buff *skb)
+@@ -891,6 +893,9 @@
+ dev->get_stats = &TLan_GetStats;
+ dev->set_multicast_list = &TLan_SetMulticastList;
+ dev->do_ioctl = &TLan_ioctl;
++#ifdef HAVE_POLL_CONTROLLER
++ dev->poll_controller = &TLan_Poll;
++#endif
+ dev->tx_timeout = &TLan_tx_timeout;
+ dev->watchdog_timeo = TX_TIMEOUT;
+
+@@ -1176,7 +1181,14 @@
+
+ } /* TLan_HandleInterrupts */
+
+-
++#ifdef HAVE_POLL_CONTROLLER
++static void TLan_Poll(struct net_device *dev)
++{
++ if (!netdump_mode) disable_irq(dev->irq);
++ TLan_HandleInterrupt(dev->irq, dev, NULL);
++ if (!netdump_mode) enable_irq(dev->irq);
++}
++#endif
+
+
+ /***************************************************************
+Index: linux-2.4.24/drivers/net/tulip/tulip_core.c
+===================================================================
+--- linux-2.4.24.orig/drivers/net/tulip/tulip_core.c 2003-11-28 13:26:20.000000000 -0500
++++ linux-2.4.24/drivers/net/tulip/tulip_core.c 2004-05-07 16:58:39.000000000 -0400
+@@ -266,6 +266,7 @@
+ static struct net_device_stats *tulip_get_stats(struct net_device *dev);
+ static int private_ioctl(struct net_device *dev, struct ifreq *rq, int cmd);
+ static void set_rx_mode(struct net_device *dev);
++static void poll_tulip(struct net_device *dev);
+
+
+
+@@ -1728,6 +1729,9 @@
+ dev->get_stats = tulip_get_stats;
+ dev->do_ioctl = private_ioctl;
+ dev->set_multicast_list = set_rx_mode;
++#ifdef HAVE_POLL_CONTROLLER
++ dev->poll_controller = &poll_tulip;
++#endif
+
+ if (register_netdev(dev))
+ goto err_out_free_ring;
+@@ -1902,6 +1906,24 @@
+ }
+
+
++#ifdef HAVE_POLL_CONTROLLER
++
++/*
++ * Polling 'interrupt' - used by things like netconsole to send skbs
++ * without having to re-enable interrupts. It's not called while
++ * the interrupt routine is executing.
++ */
++
++static void poll_tulip (struct net_device *dev)
++{
++ if (!netdump_mode) disable_irq(dev->irq);
++ tulip_interrupt (dev->irq, dev, NULL);
++ if (!netdump_mode) enable_irq(dev->irq);
++}
++
++#endif
++
++
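
The same three-line hook recurs in each driver touched by this patch; its generic shape, with mydriver_* as a placeholder for the driver's own interrupt handler:

	static void poll_mydriver(struct net_device *dev)
	{
		/* In normal context the line is masked so the real handler
		 * cannot run concurrently; in netdump_mode interrupts are
		 * already off globally and disable_irq() could deadlock. */
		if (!netdump_mode)
			disable_irq(dev->irq);
		mydriver_interrupt(dev->irq, dev, NULL);
		if (!netdump_mode)
			enable_irq(dev->irq);
	}
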
+ static struct pci_driver tulip_driver = {
+ name: DRV_NAME,
+ id_table: tulip_pci_tbl,
+Index: linux-2.4.24/drivers/net/e100/e100_main.c
+===================================================================
+--- linux-2.4.24.orig/drivers/net/e100/e100_main.c 2004-05-07 16:58:39.000000000 -0400
++++ linux-2.4.24/drivers/net/e100/e100_main.c 2004-05-07 17:00:21.000000000 -0400
+@@ -664,6 +664,10 @@
+ goto err_unregister_netdev;
+ }
+
++#ifdef HAVE_POLL_CONTROLLER
++ dev->poll_controller = e100_netpoll;
++#endif
++
+ e100nics++;
+
+ e100_get_speed_duplex_caps(bdp);
+Index: linux-2.4.24/drivers/net/e1000/e1000_main.c
+===================================================================
+--- linux-2.4.24.orig/drivers/net/e1000/e1000_main.c 2003-11-28 13:26:20.000000000 -0500
++++ linux-2.4.24/drivers/net/e1000/e1000_main.c 2004-05-07 16:58:39.000000000 -0400
+@@ -182,6 +182,9 @@
+ static int e1000_resume(struct pci_dev *pdev);
+ #endif
+
++/* for netdump / net console */
++static void e1000_netpoll (struct net_device *dev);
++
+ struct notifier_block e1000_notifier_reboot = {
+ .notifier_call = e1000_notify_reboot,
+ .next = NULL,
+@@ -434,6 +437,10 @@
+ netdev->vlan_rx_add_vid = e1000_vlan_rx_add_vid;
+ netdev->vlan_rx_kill_vid = e1000_vlan_rx_kill_vid;
+
++#ifdef HAVE_POLL_CONTROLLER
++ netdev->poll_controller = e1000_netpoll;
++#endif
++
+ netdev->irq = pdev->irq;
+ netdev->mem_start = mmio_start;
+ netdev->mem_end = mmio_start + mmio_len;
+@@ -2899,4 +2906,20 @@
+ }
+ #endif
+
++#ifdef HAVE_POLL_CONTROLLER
++/*
++ * Polling 'interrupt' - used by things like netconsole to send skbs
++ * without having to re-enable interrupts. It's not called while
++ * the interrupt routine is executing.
++ */
++
++static void e1000_netpoll (struct net_device *dev)
++{
++ if (!netdump_mode) disable_irq(dev->irq);
++ e1000_intr (dev->irq, dev, NULL);
++ if (!netdump_mode) enable_irq(dev->irq);
++}
++
++#endif
++
+ /* e1000_main.c */
+Index: linux-2.4.24/drivers/net/tg3.c
+===================================================================
+--- linux-2.4.24.orig/drivers/net/tg3.c 2003-11-28 13:26:20.000000000 -0500
++++ linux-2.4.24/drivers/net/tg3.c 2004-05-07 16:58:39.000000000 -0400
+@@ -216,6 +216,9 @@
+ #define tr16(reg) readw(tp->regs + (reg))
+ #define tr8(reg) readb(tp->regs + (reg))
+
++/* Added by mark.fasheh@oracle.com to help enable netdump on these cards */
++static void poll_tg3 (struct net_device *dev);
++
+ static void tg3_write_mem(struct tg3 *tp, u32 off, u32 val)
+ {
+ unsigned long flags;
+@@ -7630,6 +7633,9 @@
+ dev->watchdog_timeo = TG3_TX_TIMEOUT;
+ dev->change_mtu = tg3_change_mtu;
+ dev->irq = pdev->irq;
++#ifdef HAVE_POLL_CONTROLLER
++ dev->poll_controller = &poll_tg3;
++#endif
+
+ err = tg3_get_invariants(tp);
+ if (err) {
+@@ -7862,5 +7868,23 @@
+ pci_unregister_driver(&tg3_driver);
+ }
+
++#ifdef HAVE_POLL_CONTROLLER
++
++/*
++ * Polling 'interrupt' - used by things like netconsole to send skbs
++ * without having to re-enable interrupts. It's not called while
++ * the interrupt routine is executing.
++ */
++
++static void poll_tg3 (struct net_device *dev)
++{
++ if (!netdump_mode) disable_irq(dev->irq);
++ tg3_interrupt (dev->irq, dev, NULL);
++ if (!netdump_mode) enable_irq(dev->irq);
++}
++
++#endif
++
++
+ module_init(tg3_init);
+ module_exit(tg3_cleanup);
+Index: linux-2.4.24/include/asm-i386/kmap_types.h
+===================================================================
+--- linux-2.4.24.orig/include/asm-i386/kmap_types.h 2003-08-25 07:44:43.000000000 -0400
++++ linux-2.4.24/include/asm-i386/kmap_types.h 2004-05-07 16:59:12.000000000 -0400
+@@ -10,6 +10,7 @@
+ KM_BH_IRQ,
+ KM_SOFTIRQ0,
+ KM_SOFTIRQ1,
++ KM_NETDUMP,
+ KM_TYPE_NR
+ };
+
+Index: linux-2.4.24/include/linux/kernel.h
+===================================================================
+--- linux-2.4.24.orig/include/linux/kernel.h 2004-05-07 16:56:55.000000000 -0400
++++ linux-2.4.24/include/linux/kernel.h 2004-05-07 16:58:39.000000000 -0400
+@@ -104,6 +104,9 @@
+
+ extern void bust_spinlocks(int yes);
+ extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */
++struct pt_regs;
++extern void (*netdump_func) (struct pt_regs *regs);
++extern int netdump_mode;
+
+ extern int tainted;
+ extern const char *print_tainted(void);
+Index: linux-2.4.24/include/linux/netdevice.h
+===================================================================
+--- linux-2.4.24.orig/include/linux/netdevice.h 2003-11-28 13:26:21.000000000 -0500
++++ linux-2.4.24/include/linux/netdevice.h 2004-05-07 16:58:39.000000000 -0400
+@@ -435,6 +435,9 @@
+ unsigned char *haddr);
+ int (*neigh_setup)(struct net_device *dev, struct neigh_parms *);
+ int (*accept_fastpath)(struct net_device *, struct dst_entry*);
++#define HAVE_POLL_CONTROLLER
++ void (*poll_controller)(struct net_device *dev);
++ int (*rx_hook)(struct sk_buff *skb);
+
+ /* open/release and usage marking */
+ struct module *owner;
+Index: linux-2.4.24/kernel/panic.c
+===================================================================
+--- linux-2.4.24.orig/kernel/panic.c 2004-05-07 16:56:56.000000000 -0400
++++ linux-2.4.24/kernel/panic.c 2004-05-07 16:58:39.000000000 -0400
+@@ -62,6 +62,8 @@
+ vsprintf(buf, fmt, args);
+ va_end(args);
+ printk(KERN_EMERG "Kernel panic: %s\n",buf);
++ if (netdump_func)
++ BUG();
+ if (in_interrupt())
+ printk(KERN_EMERG "In interrupt handler - not syncing\n");
+ else if (!current->pid)
+Index: linux-2.4.24/net/core/dev.c
+===================================================================
+--- linux-2.4.24.orig/net/core/dev.c 2003-11-28 13:26:21.000000000 -0500
++++ linux-2.4.24/net/core/dev.c 2004-05-07 16:58:39.000000000 -0400
+@@ -1288,6 +1288,13 @@
+
+ local_irq_save(flags);
+
++ if (unlikely(skb->dev->rx_hook != NULL)) {
++ int ret;
++
++ ret = skb->dev->rx_hook(skb);
++ if (ret == NET_RX_DROP)
++ goto drop;
++ }
+ netdev_rx_stat[this_cpu].total++;
+ if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
+ if (queue->input_pkt_queue.qlen) {
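
The hook contract implied by this hunk: netif_rx() consults rx_hook before queueing, and a return of NET_RX_DROP sends netif_rx() down its drop path, which frees the skb, so a hook must consume the packet contents synchronously and must not free the skb itself. A minimal conforming hook (helper names are hypothetical):

	static int my_rx_hook(struct sk_buff *skb)
	{
		if (!packet_is_mine(skb))       /* hypothetical filter */
			return NET_RX_SUCCESS;  /* hand back to the stack */
		consume_packet(skb);            /* hypothetical consumer;
						 * must not free the skb */
		return NET_RX_DROP;             /* netif_rx() frees the skb */
	}
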
--- /dev/null
+ Documentation/Configure.help | 66 ++
+ arch/alpha/defconfig | 7
+ arch/alpha/kernel/entry.S | 12
+ arch/arm/defconfig | 7
+ arch/arm/kernel/calls.S | 24
+ arch/i386/defconfig | 7
+ arch/ia64/defconfig | 7
+ arch/ia64/kernel/entry.S | 24
+ arch/m68k/defconfig | 7
+ arch/mips/defconfig | 7
+ arch/mips64/defconfig | 7
+ arch/ppc/defconfig | 14
+ arch/ppc64/kernel/misc.S | 2
+ arch/s390/defconfig | 7
+ arch/s390/kernel/entry.S | 24
+ arch/s390x/defconfig | 7
+ arch/s390x/kernel/entry.S | 24
+ arch/s390x/kernel/wrapper32.S | 92 +++
+ arch/sparc/defconfig | 7
+ arch/sparc/kernel/systbls.S | 10
+ arch/sparc64/defconfig | 7
+ arch/sparc64/kernel/systbls.S | 20
+ fs/Config.in | 14
+ fs/Makefile | 3
+ fs/ext2/Makefile | 4
+ fs/ext2/file.c | 5
+ fs/ext2/ialloc.c | 2
+ fs/ext2/inode.c | 34 -
+ fs/ext2/namei.c | 14
+ fs/ext2/super.c | 29
+ fs/ext2/symlink.c | 14
+ fs/ext2/xattr.c | 1212 +++++++++++++++++++++++++++++++++++++++++
+ fs/ext2/xattr_user.c | 103 +++
+ fs/ext3/Makefile | 10
+ fs/ext3/file.c | 5
+ fs/ext3/ialloc.c | 2
+ fs/ext3/inode.c | 35 -
+ fs/ext3/namei.c | 21
+ fs/ext3/super.c | 36 +
+ fs/ext3/symlink.c | 14
+ fs/ext3/xattr.c | 1225 ++++++++++++++++++++++++++++++++++++++++++
+ fs/ext3/xattr_user.c | 111 +++
+ fs/jfs/jfs_xattr.h | 6
+ fs/jfs/xattr.c | 6
+ fs/mbcache.c | 648 ++++++++++++++++++++++
+ include/asm-arm/unistd.h | 2
+ include/asm-ia64/unistd.h | 13
+ include/asm-ppc64/unistd.h | 2
+ include/asm-s390/unistd.h | 15
+ include/asm-s390x/unistd.h | 15
+ include/asm-sparc/unistd.h | 24
+ include/asm-sparc64/unistd.h | 24
+ include/linux/cache_def.h | 15
+ include/linux/errno.h | 4
+ include/linux/ext2_fs.h | 31 -
+ include/linux/ext2_xattr.h | 157 +++++
+ include/linux/ext3_fs.h | 31 -
+ include/linux/ext3_jbd.h | 8
+ include/linux/ext3_xattr.h | 157 +++++
+ include/linux/fs.h | 2
+ include/linux/mbcache.h | 69 ++
+ kernel/ksyms.c | 4
+ mm/vmscan.c | 35 +
+ fs/ext3/ext3-exports.c | 14 +
+ 64 files changed, 4355 insertions(+), 195 deletions(-)
+
+Index: linux-DRV401/arch/ppc/defconfig
+===================================================================
+--- linux-DRV401.orig/arch/ppc/defconfig 2004-10-15 10:24:32.000000000 -0700
++++ linux-DRV401/arch/ppc/defconfig 2004-10-15 11:03:51.000000000 -0700
+@@ -1,6 +1,13 @@
+ #
+ # Automatically generated by make menuconfig: don't edit
+ #
++CONFIG_EXT3_FS_XATTR=y
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+ # CONFIG_UID16 is not set
+ # CONFIG_RWSEM_GENERIC_SPINLOCK is not set
+ CONFIG_RWSEM_XCHGADD_ALGORITHM=y
+Index: linux-DRV401/fs/Config.in
+===================================================================
+--- linux-DRV401.orig/fs/Config.in 2004-10-15 10:24:06.000000000 -0700
++++ linux-DRV401/fs/Config.in 2004-10-15 11:03:51.000000000 -0700
+@@ -22,6 +22,11 @@
+ dep_tristate 'BFS file system support (EXPERIMENTAL)' CONFIG_BFS_FS $CONFIG_EXPERIMENTAL
+
+ tristate 'Ext3 journalling file system support' CONFIG_EXT3_FS
++dep_mbool ' Ext3 extended attributes' CONFIG_EXT3_FS_XATTR $CONFIG_EXT3_FS
++dep_bool ' Ext3 extended attribute block sharing' \
++ CONFIG_EXT3_FS_XATTR_SHARING $CONFIG_EXT3_FS_XATTR
++dep_bool ' Ext3 extended user attributes' \
++ CONFIG_EXT3_FS_XATTR_USER $CONFIG_EXT3_FS_XATTR
+ # CONFIG_JBD could be its own option (even modular), but until there are
+ # other users than ext3, we will simply make it be the same as CONFIG_EXT3_FS
+ # dep_tristate ' Journal Block Device support (JBD for ext3)' CONFIG_JBD $CONFIG_EXT3_FS
+@@ -77,6 +82,11 @@
+ tristate 'ROM file system support' CONFIG_ROMFS_FS
+
+ tristate 'Second extended fs support' CONFIG_EXT2_FS
++dep_mbool ' Ext2 extended attributes' CONFIG_EXT2_FS_XATTR $CONFIG_EXT2_FS
++dep_bool ' Ext2 extended attribute block sharing' \
++ CONFIG_EXT2_FS_XATTR_SHARING $CONFIG_EXT2_FS_XATTR
++dep_bool ' Ext2 extended user attributes' \
++ CONFIG_EXT2_FS_XATTR_USER $CONFIG_EXT2_FS_XATTR
+
+ tristate 'System V/Xenix/V7/Coherent file system support' CONFIG_SYSV_FS
+
+@@ -156,6 +166,10 @@
+ fi
+ fi
+
++# Meta block cache for Extended Attributes (ext2/ext3)
++#tristate 'Meta block cache' CONFIG_FS_MBCACHE
++define_tristate CONFIG_FS_MBCACHE y
++
+ mainmenu_option next_comment
+ comment 'Partition Types'
+ source fs/partitions/Config.in
+Index: linux-DRV401/fs/Makefile
+===================================================================
+--- linux-DRV401.orig/fs/Makefile 2004-10-15 10:39:15.000000000 -0700
++++ linux-DRV401/fs/Makefile 2004-10-15 11:03:51.000000000 -0700
+@@ -14,7 +14,7 @@
+ super.o block_dev.o char_dev.o stat.o exec.o pipe.o namei.o \
+ fcntl.o ioctl.o readdir.o select.o fifo.o locks.o \
+ dcache.o inode.o attr.o bad_inode.o file.o iobuf.o dnotify.o \
+- filesystems.o namespace.o seq_file.o quota.o
++ filesystems.o namespace.o seq_file.o quota.o xattr.o
+
+ ifeq ($(CONFIG_QUOTA),y)
+ obj-y += dquot.o
+@@ -76,6 +76,9 @@
+
+ obj-$(CONFIG_BINFMT_ELF) += binfmt_elf.o
+
++export-objs += mbcache.o
++obj-$(CONFIG_FS_MBCACHE) += mbcache.o
++
+ # persistent filesystems
+ obj-y += $(join $(subdir-y),$(subdir-y:%=/%.o))
+
+Index: linux-DRV401/fs/ext2/Makefile
+===================================================================
+--- linux-DRV401.orig/fs/ext2/Makefile 2004-10-15 10:23:59.000000000 -0700
++++ linux-DRV401/fs/ext2/Makefile 2004-10-15 11:03:51.000000000 -0700
+@@ -13,4 +13,8 @@
+ ioctl.o namei.o super.o symlink.o
+ obj-m := $(O_TARGET)
+
++export-objs += xattr.o
++obj-$(CONFIG_EXT2_FS_XATTR) += xattr.o
++obj-$(CONFIG_EXT2_FS_XATTR_USER) += xattr_user.o
++
+ include $(TOPDIR)/Rules.make
+Index: linux-DRV401/fs/ext2/file.c
+===================================================================
+--- linux-DRV401.orig/fs/ext2/file.c 2004-10-15 10:23:59.000000000 -0700
++++ linux-DRV401/fs/ext2/file.c 2004-10-15 11:03:51.000000000 -0700
+@@ -20,6 +20,7 @@
+
+ #include <linux/fs.h>
+ #include <linux/ext2_fs.h>
++#include <linux/ext2_xattr.h>
+ #include <linux/sched.h>
+
+ /*
+@@ -51,4 +52,8 @@
+
+ struct inode_operations ext2_file_inode_operations = {
+ truncate: ext2_truncate,
++ setxattr: ext2_setxattr,
++ getxattr: ext2_getxattr,
++ listxattr: ext2_listxattr,
++ removexattr: ext2_removexattr,
+ };
+Index: linux-DRV401/fs/ext2/ialloc.c
+===================================================================
+--- linux-DRV401.orig/fs/ext2/ialloc.c 2004-10-15 10:23:59.000000000 -0700
++++ linux-DRV401/fs/ext2/ialloc.c 2004-10-15 11:03:51.000000000 -0700
+@@ -15,6 +15,7 @@
+ #include <linux/config.h>
+ #include <linux/fs.h>
+ #include <linux/ext2_fs.h>
++#include <linux/ext2_xattr.h>
+ #include <linux/locks.h>
+ #include <linux/quotaops.h>
+
+@@ -167,6 +168,7 @@
+ */
+ if (!is_bad_inode(inode)) {
+ /* Quota is already initialized in iput() */
++ ext2_xattr_delete_inode(inode);
+ DQUOT_FREE_INODE(inode);
+ DQUOT_DROP(inode);
+ }
+Index: linux-DRV401/fs/ext2/inode.c
+===================================================================
+--- linux-DRV401.orig/fs/ext2/inode.c 2004-10-15 10:24:00.000000000 -0700
++++ linux-DRV401/fs/ext2/inode.c 2004-10-15 11:03:51.000000000 -0700
+@@ -39,6 +39,18 @@
+ static int ext2_update_inode(struct inode * inode, int do_sync);
+
+ /*
++ * Test whether an inode is a fast symlink.
++ */
++static inline int ext2_inode_is_fast_symlink(struct inode *inode)
++{
++ int ea_blocks = inode->u.ext2_i.i_file_acl ?
++ (inode->i_sb->s_blocksize >> 9) : 0;
++
++ return (S_ISLNK(inode->i_mode) &&
++ inode->i_blocks - ea_blocks == 0);
++}
++
++/*
+ * Called at each iput()
+ */
+ void ext2_put_inode (struct inode * inode)
+@@ -53,9 +65,7 @@
+ {
+ lock_kernel();
+
+- if (is_bad_inode(inode) ||
+- inode->i_ino == EXT2_ACL_IDX_INO ||
+- inode->i_ino == EXT2_ACL_DATA_INO)
++ if (is_bad_inode(inode))
+ goto no_delete;
+ inode->u.ext2_i.i_dtime = CURRENT_TIME;
+ mark_inode_dirty(inode);
+@@ -792,6 +802,8 @@
+ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ S_ISLNK(inode->i_mode)))
+ return;
++ if (ext2_inode_is_fast_symlink(inode))
++ return;
+ if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+ return;
+
+@@ -879,8 +891,7 @@
+ unsigned long offset;
+ struct ext2_group_desc * gdp;
+
+- if ((inode->i_ino != EXT2_ROOT_INO && inode->i_ino != EXT2_ACL_IDX_INO &&
+- inode->i_ino != EXT2_ACL_DATA_INO &&
++ if ((inode->i_ino != EXT2_ROOT_INO &&
+ inode->i_ino < EXT2_FIRST_INO(inode->i_sb)) ||
+ inode->i_ino > le32_to_cpu(inode->i_sb->u.ext2_sb.s_es->s_inodes_count)) {
+ ext2_error (inode->i_sb, "ext2_read_inode",
+@@ -965,10 +976,7 @@
+ for (block = 0; block < EXT2_N_BLOCKS; block++)
+ inode->u.ext2_i.i_data[block] = raw_inode->i_block[block];
+
+- if (inode->i_ino == EXT2_ACL_IDX_INO ||
+- inode->i_ino == EXT2_ACL_DATA_INO)
+- /* Nothing to do */ ;
+- else if (S_ISREG(inode->i_mode)) {
++ if (S_ISREG(inode->i_mode)) {
+ inode->i_op = &ext2_file_inode_operations;
+ inode->i_fop = &ext2_file_operations;
+ inode->i_mapping->a_ops = &ext2_aops;
+@@ -977,15 +985,17 @@
+ inode->i_fop = &ext2_dir_operations;
+ inode->i_mapping->a_ops = &ext2_aops;
+ } else if (S_ISLNK(inode->i_mode)) {
+- if (!inode->i_blocks)
++ if (ext2_inode_is_fast_symlink(inode))
+ inode->i_op = &ext2_fast_symlink_inode_operations;
+ else {
+- inode->i_op = &page_symlink_inode_operations;
++ inode->i_op = &ext2_symlink_inode_operations;
+ inode->i_mapping->a_ops = &ext2_aops;
+ }
+- } else
++ } else {
++ inode->i_op = &ext2_special_inode_operations;
+ init_special_inode(inode, inode->i_mode,
+ le32_to_cpu(raw_inode->i_block[0]));
++ }
+ brelse (bh);
+ inode->i_attr_flags = 0;
+ if (inode->u.ext2_i.i_flags & EXT2_SYNC_FL) {
+Index: linux-DRV401/fs/ext2/namei.c
+===================================================================
+--- linux-DRV401.orig/fs/ext2/namei.c 2004-10-15 10:23:59.000000000 -0700
++++ linux-DRV401/fs/ext2/namei.c 2004-10-15 11:03:51.000000000 -0700
+@@ -31,6 +31,7 @@
+
+ #include <linux/fs.h>
+ #include <linux/ext2_fs.h>
++#include <linux/ext2_xattr.h>
+ #include <linux/pagemap.h>
+
+ /*
+@@ -136,7 +137,7 @@
+
+ if (l > sizeof (inode->u.ext2_i.i_data)) {
+ /* slow symlink */
+- inode->i_op = &page_symlink_inode_operations;
++ inode->i_op = &ext2_symlink_inode_operations;
+ inode->i_mapping->a_ops = &ext2_aops;
+ err = block_symlink(inode, symname, l);
+ if (err)
+@@ -345,4 +346,15 @@
+ rmdir: ext2_rmdir,
+ mknod: ext2_mknod,
+ rename: ext2_rename,
++ setxattr: ext2_setxattr,
++ getxattr: ext2_getxattr,
++ listxattr: ext2_listxattr,
++ removexattr: ext2_removexattr,
++};
++
++struct inode_operations ext2_special_inode_operations = {
++ setxattr: ext2_setxattr,
++ getxattr: ext2_getxattr,
++ listxattr: ext2_listxattr,
++ removexattr: ext2_removexattr,
+ };
+Index: linux-DRV401/fs/ext2/super.c
+===================================================================
+--- linux-DRV401.orig/fs/ext2/super.c 2004-10-15 10:23:59.000000000 -0700
++++ linux-DRV401/fs/ext2/super.c 2004-10-15 11:03:51.000000000 -0700
+@@ -21,6 +21,7 @@
+ #include <linux/string.h>
+ #include <linux/fs.h>
+ #include <linux/ext2_fs.h>
++#include <linux/ext2_xattr.h>
+ #include <linux/slab.h>
+ #include <linux/init.h>
+ #include <linux/locks.h>
+@@ -125,6 +126,7 @@
+ int db_count;
+ int i;
+
++ ext2_xattr_put_super(sb);
+ if (!(sb->s_flags & MS_RDONLY)) {
+ struct ext2_super_block *es = EXT2_SB(sb)->s_es;
+
+@@ -175,6 +177,13 @@
+ this_char = strtok (NULL, ",")) {
+ if ((value = strchr (this_char, '=')) != NULL)
+ *value++ = 0;
++#ifdef CONFIG_EXT2_FS_XATTR_USER
++ if (!strcmp (this_char, "user_xattr"))
++ set_opt (*mount_options, XATTR_USER);
++ else if (!strcmp (this_char, "nouser_xattr"))
++ clear_opt (*mount_options, XATTR_USER);
++ else
++#endif
+ if (!strcmp (this_char, "bsddf"))
+ clear_opt (*mount_options, MINIX_DF);
+ else if (!strcmp (this_char, "nouid32")) {
+@@ -424,6 +433,9 @@
+ blocksize = BLOCK_SIZE;
+
+ sb->u.ext2_sb.s_mount_opt = 0;
++#ifdef CONFIG_EXT2_FS_XATTR_USER
++ /* set_opt (sb->u.ext2_sb.s_mount_opt, XATTR_USER); */
++#endif
+ if (!parse_options ((char *) data, &sb_block, &resuid, &resgid,
+ &sb->u.ext2_sb.s_mount_opt)) {
+ return NULL;
+@@ -810,12 +822,27 @@
+
+ static int __init init_ext2_fs(void)
+ {
+- return register_filesystem(&ext2_fs_type);
++ int error = init_ext2_xattr();
++ if (error)
++ return error;
++ error = init_ext2_xattr_user();
++ if (error)
++ goto fail;
++ error = register_filesystem(&ext2_fs_type);
++ if (!error)
++ return 0;
++
++ exit_ext2_xattr_user();
++fail:
++ exit_ext2_xattr();
++ return error;
+ }
+
+ static void __exit exit_ext2_fs(void)
+ {
+ unregister_filesystem(&ext2_fs_type);
++ exit_ext2_xattr_user();
++ exit_ext2_xattr();
+ }
+
+ EXPORT_NO_SYMBOLS;
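
Note that even with CONFIG_EXT2_FS_XATTR_USER built in, user-namespace attributes stay off by default (the set_opt call above is commented out); they are enabled per mount with the user_xattr option, e.g. "mount -o user_xattr", and disabled again with nouser_xattr.
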
+Index: linux-DRV401/fs/ext2/symlink.c
+===================================================================
+--- linux-DRV401.orig/fs/ext2/symlink.c 2004-10-15 10:23:59.000000000 -0700
++++ linux-DRV401/fs/ext2/symlink.c 2004-10-15 11:03:51.000000000 -0700
+@@ -19,6 +19,7 @@
+
+ #include <linux/fs.h>
+ #include <linux/ext2_fs.h>
++#include <linux/ext2_xattr.h>
+
+ static int ext2_readlink(struct dentry *dentry, char *buffer, int buflen)
+ {
+@@ -32,7 +33,20 @@
+ return vfs_follow_link(nd, s);
+ }
+
++struct inode_operations ext2_symlink_inode_operations = {
++ readlink: page_readlink,
++ follow_link: page_follow_link,
++ setxattr: ext2_setxattr,
++ getxattr: ext2_getxattr,
++ listxattr: ext2_listxattr,
++ removexattr: ext2_removexattr,
++};
++
+ struct inode_operations ext2_fast_symlink_inode_operations = {
+ readlink: ext2_readlink,
+ follow_link: ext2_follow_link,
++ setxattr: ext2_setxattr,
++ getxattr: ext2_getxattr,
++ listxattr: ext2_listxattr,
++ removexattr: ext2_removexattr,
+ };
+Index: linux-DRV401/fs/ext2/xattr.c
+===================================================================
+--- linux-DRV401.orig/fs/ext2/xattr.c 2004-10-12 08:56:38.404764448 -0700
++++ linux-DRV401/fs/ext2/xattr.c 2004-10-15 11:03:51.000000000 -0700
+@@ -0,0 +1,1212 @@
++/*
++ * linux/fs/ext2/xattr.c
++ *
++ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
++ *
++ * Fix by Harrison Xing <harrison@mountainviewdata.com>.
++ * Extended attributes for symlinks and special files added per
++ * suggestion of Luka Renko <luka.renko@hermes.si>.
++ */
++
++/*
++ * Extended attributes are stored on disk blocks allocated outside of
++ * any inode. The i_file_acl field is then made to point to this allocated
++ * block. If all extended attributes of an inode are identical, these
++ * inodes may share the same extended attribute block. Such situations
++ * are automatically detected by keeping a cache of recent attribute block
++ * numbers and hashes over the block's contents in memory.
++ *
++ *
++ * Extended attribute block layout:
++ *
++ * +------------------+
++ * | header |
++ * | entry 1 | |
++ * | entry 2 | | growing downwards
++ * | entry 3 | v
++ * | four null bytes |
++ * | . . . |
++ * | value 1 | ^
++ * | value 3 | | growing upwards
++ * | value 2 | |
++ * +------------------+
++ *
++ * The block header is followed by multiple entry descriptors. These entry
++ * descriptors are variable in size, and aligned to EXT2_XATTR_PAD
++ * byte boundaries. The entry descriptors are sorted by attribute name,
++ * so that two extended attribute blocks can be compared efficiently.
++ *
++ * Attribute values are aligned to the end of the block, stored in
++ * no specific order. They are also padded to EXT2_XATTR_PAD byte
++ * boundaries. No additional gaps are left between them.
++ *
++ * Locking strategy
++ * ----------------
++ * The VFS already holds the BKL and the inode->i_sem semaphore when any of
++ * the xattr inode operations are called, so we are guaranteed that only
++ * one process accesses extended attributes of an inode at any time.
++ *
++ * For writing we also grab the ext2_xattr_sem semaphore. This ensures that
++ * only a single process is modifying an extended attribute block, even
++ * if the block is shared among inodes.
++ *
++ * Note for porting to 2.5
++ * -----------------------
++ * The BKL will no longer be held in the xattr inode operations.
++ */
++
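
For reference, the two on-disk structures this layout is built from, as declared by this patch series in include/linux/ext2_xattr.h (reproduced here for illustration; see that header for the authoritative definitions):

	struct ext2_xattr_header {
		__u32	h_magic;	/* magic number for identification */
		__u32	h_refcount;	/* reference count */
		__u32	h_blocks;	/* number of disk blocks used */
		__u32	h_hash;		/* hash value of all attributes */
		__u32	h_reserved[4];	/* zero right now */
	};

	struct ext2_xattr_entry {
		__u8	e_name_len;	/* length of name */
		__u8	e_name_index;	/* attribute name index */
		__u16	e_value_offs;	/* offset in disk block of value */
		__u32	e_value_block;	/* disk block attribute is stored on (n/i) */
		__u32	e_value_size;	/* size of attribute value */
		__u32	e_hash;		/* hash value of name and value */
		char	e_name[0];	/* attribute name */
	};
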
++#include <linux/module.h>
++#include <linux/locks.h>
++#include <linux/slab.h>
++#include <linux/fs.h>
++#include <linux/ext2_fs.h>
++#include <linux/ext2_xattr.h>
++#include <linux/mbcache.h>
++#include <linux/quotaops.h>
++#include <asm/semaphore.h>
++#include <linux/compatmac.h>
++
++/* These symbols may be needed by a module. */
++EXPORT_SYMBOL(ext2_xattr_register);
++EXPORT_SYMBOL(ext2_xattr_unregister);
++EXPORT_SYMBOL(ext2_xattr_get);
++EXPORT_SYMBOL(ext2_xattr_list);
++EXPORT_SYMBOL(ext2_xattr_set);
++
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0)
++# define mark_buffer_dirty(bh) mark_buffer_dirty(bh, 1)
++#endif
++
++#define HDR(bh) ((struct ext2_xattr_header *)((bh)->b_data))
++#define ENTRY(ptr) ((struct ext2_xattr_entry *)(ptr))
++#define FIRST_ENTRY(bh) ENTRY(HDR(bh)+1)
++#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
++
++#ifdef EXT2_XATTR_DEBUG
++# define ea_idebug(inode, f...) do { \
++ printk(KERN_DEBUG "inode %s:%ld: ", \
++ kdevname(inode->i_dev), inode->i_ino); \
++ printk(f); \
++ printk("\n"); \
++ } while (0)
++# define ea_bdebug(bh, f...) do { \
++ printk(KERN_DEBUG "block %s:%ld: ", \
++ kdevname(bh->b_dev), bh->b_blocknr); \
++ printk(f); \
++ printk("\n"); \
++ } while (0)
++#else
++# define ea_idebug(f...)
++# define ea_bdebug(f...)
++#endif
++
++static int ext2_xattr_set2(struct inode *, struct buffer_head *,
++ struct ext2_xattr_header *);
++
++#ifdef CONFIG_EXT2_FS_XATTR_SHARING
++
++static int ext2_xattr_cache_insert(struct buffer_head *);
++static struct buffer_head *ext2_xattr_cache_find(struct inode *,
++ struct ext2_xattr_header *);
++static void ext2_xattr_cache_remove(struct buffer_head *);
++static void ext2_xattr_rehash(struct ext2_xattr_header *,
++ struct ext2_xattr_entry *);
++
++static struct mb_cache *ext2_xattr_cache;
++
++#else
++# define ext2_xattr_cache_insert(bh) 0
++# define ext2_xattr_cache_find(inode, header) NULL
++# define ext2_xattr_cache_remove(bh) while(0) {}
++# define ext2_xattr_rehash(header, entry) while(0) {}
++#endif
++
++/*
++ * If a file system does not share extended attributes among inodes,
++ * we should not need the ext2_xattr_sem semaphore. However, the
++ * filesystem may still contain shared blocks, so we always take
++ * the lock.
++ */
++
++DECLARE_MUTEX(ext2_xattr_sem);
++
++static inline int
++ext2_xattr_new_block(struct inode *inode, int * errp, int force)
++{
++ struct super_block *sb = inode->i_sb;
++ int goal = le32_to_cpu(EXT2_SB(sb)->s_es->s_first_data_block) +
++ EXT2_I(inode)->i_block_group * EXT2_BLOCKS_PER_GROUP(sb);
++
++ /* How can we enforce the allocation? */
++ int block = ext2_new_block(inode, goal, 0, 0, errp);
++#ifdef OLD_QUOTAS
++ if (!*errp)
++ inode->i_blocks += inode->i_sb->s_blocksize >> 9;
++#endif
++ return block;
++}
++
++static inline int
++ext2_xattr_quota_alloc(struct inode *inode, int force)
++{
++ /* How can we enforce the allocation? */
++#ifdef OLD_QUOTAS
++ int error = DQUOT_ALLOC_BLOCK(inode->i_sb, inode, 1);
++ if (!error)
++ inode->i_blocks += inode->i_sb->s_blocksize >> 9;
++#else
++ int error = DQUOT_ALLOC_BLOCK(inode, 1);
++#endif
++ return error;
++}
++
++#ifdef OLD_QUOTAS
++
++static inline void
++ext2_xattr_quota_free(struct inode *inode)
++{
++ DQUOT_FREE_BLOCK(inode->i_sb, inode, 1);
++ inode->i_blocks -= inode->i_sb->s_blocksize >> 9;
++}
++
++static inline void
++ext2_xattr_free_block(struct inode * inode, unsigned long block)
++{
++ ext2_free_blocks(inode, block, 1);
++ inode->i_blocks -= inode->i_sb->s_blocksize >> 9;
++}
++
++#else
++# define ext2_xattr_quota_free(inode) \
++ DQUOT_FREE_BLOCK(inode, 1)
++# define ext2_xattr_free_block(inode, block) \
++ ext2_free_blocks(inode, block, 1)
++#endif
++
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,18)
++
++static inline struct buffer_head *
++sb_bread(struct super_block *sb, int block)
++{
++ return bread(sb->s_dev, block, sb->s_blocksize);
++}
++
++static inline struct buffer_head *
++sb_getblk(struct super_block *sb, int block)
++{
++ return getblk(sb->s_dev, block, sb->s_blocksize);
++}
++
++#endif
++
++struct ext2_xattr_handler *ext2_xattr_handlers[EXT2_XATTR_INDEX_MAX];
++rwlock_t ext2_handler_lock = RW_LOCK_UNLOCKED;
++
++int
++ext2_xattr_register(int name_index, struct ext2_xattr_handler *handler)
++{
++ int error = -EINVAL;
++
++ if (name_index > 0 && name_index <= EXT2_XATTR_INDEX_MAX) {
++ write_lock(&ext2_handler_lock);
++ if (!ext2_xattr_handlers[name_index-1]) {
++ ext2_xattr_handlers[name_index-1] = handler;
++ error = 0;
++ }
++ write_unlock(&ext2_handler_lock);
++ }
++ return error;
++}
++
++void
++ext2_xattr_unregister(int name_index, struct ext2_xattr_handler *handler)
++{
++ if (name_index > 0 && name_index <= EXT2_XATTR_INDEX_MAX) {
++ write_lock(&ext2_handler_lock);
++ ext2_xattr_handlers[name_index-1] = NULL;
++ write_unlock(&ext2_handler_lock);
++ }
++}
++
++static inline const char *
++strcmp_prefix(const char *a, const char *a_prefix)
++{
++ while (*a_prefix && *a == *a_prefix) {
++ a++;
++ a_prefix++;
++ }
++ return *a_prefix ? NULL : a;
++}
++
++/*
++ * Decode the extended attribute name, and translate it into
++ * the name_index and name suffix.
++ */
++static struct ext2_xattr_handler *
++ext2_xattr_resolve_name(const char **name)
++{
++ struct ext2_xattr_handler *handler = NULL;
++ int i;
++
++ if (!*name)
++ return NULL;
++ read_lock(&ext2_handler_lock);
++ for (i=0; i<EXT2_XATTR_INDEX_MAX; i++) {
++ if (ext2_xattr_handlers[i]) {
++ const char *n = strcmp_prefix(*name,
++ ext2_xattr_handlers[i]->prefix);
++ if (n) {
++ handler = ext2_xattr_handlers[i];
++ *name = n;
++ break;
++ }
++ }
++ }
++ read_unlock(&ext2_handler_lock);
++ return handler;
++}
++
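
For example, with the user-namespace handler from xattr_user.c registered under the prefix "user.", resolution strips the prefix and leaves name pointing at the suffix (a sketch; EXT2_XATTR_INDEX_USER is the index that handler registers with):

	const char *name = "user.mime_type";
	struct ext2_xattr_handler *handler = ext2_xattr_resolve_name(&name);
	/* handler is now the entry in slot EXT2_XATTR_INDEX_USER - 1,
	 * and name points at "mime_type". */
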
++static inline struct ext2_xattr_handler *
++ext2_xattr_handler(int name_index)
++{
++ struct ext2_xattr_handler *handler = NULL;
++ if (name_index > 0 && name_index <= EXT2_XATTR_INDEX_MAX) {
++ read_lock(&ext2_handler_lock);
++ handler = ext2_xattr_handlers[name_index-1];
++ read_unlock(&ext2_handler_lock);
++ }
++ return handler;
++}
++
++/*
++ * Inode operation getxattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++ssize_t
++ext2_getxattr(struct dentry *dentry, const char *name,
++ void *buffer, size_t size)
++{
++ struct ext2_xattr_handler *handler;
++ struct inode *inode = dentry->d_inode;
++
++ handler = ext2_xattr_resolve_name(&name);
++ if (!handler)
++ return -ENOTSUP;
++ return handler->get(inode, name, buffer, size);
++}
++
++/*
++ * Inode operation listxattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++ssize_t
++ext2_listxattr(struct dentry *dentry, char *buffer, size_t size)
++{
++ return ext2_xattr_list(dentry->d_inode, buffer, size);
++}
++
++/*
++ * Inode operation setxattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++int
++ext2_setxattr(struct dentry *dentry, const char *name,
++ const void *value, size_t size, int flags)
++{
++ struct ext2_xattr_handler *handler;
++ struct inode *inode = dentry->d_inode;
++
++ if (size == 0)
++ value = ""; /* empty EA, do not remove */
++ handler = ext2_xattr_resolve_name(&name);
++ if (!handler)
++ return -ENOTSUP;
++ return handler->set(inode, name, value, size, flags);
++}
++
++/*
++ * Inode operation removexattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++int
++ext2_removexattr(struct dentry *dentry, const char *name)
++{
++ struct ext2_xattr_handler *handler;
++ struct inode *inode = dentry->d_inode;
++
++ handler = ext2_xattr_resolve_name(&name);
++ if (!handler)
++ return -ENOTSUP;
++ return handler->set(inode, name, NULL, 0, XATTR_REPLACE);
++}
++
++/*
++ * ext2_xattr_get()
++ *
++ * Copy an extended attribute into the buffer
++ * provided, or compute the buffer size required.
++ * Buffer is NULL to compute the size of the buffer required.
++ *
++ * Returns a negative error number on failure, or the number of bytes
++ * used / required on success.
++ */
++int
++ext2_xattr_get(struct inode *inode, int name_index, const char *name,
++ void *buffer, size_t buffer_size)
++{
++ struct buffer_head *bh = NULL;
++ struct ext2_xattr_entry *entry;
++ unsigned int block, size;
++ char *end;
++ int name_len, error;
++
++ ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
++ name_index, name, buffer, (long)buffer_size);
++
++ if (name == NULL)
++ return -EINVAL;
++ if (!EXT2_I(inode)->i_file_acl)
++ return -ENOATTR;
++ block = EXT2_I(inode)->i_file_acl;
++ ea_idebug(inode, "reading block %d", block);
++ bh = sb_bread(inode->i_sb, block);
++ if (!bh)
++ return -EIO;
++ ea_bdebug(bh, "b_count=%d, refcount=%d",
++ atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount));
++ end = bh->b_data + bh->b_size;
++ if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) ||
++ HDR(bh)->h_blocks != cpu_to_le32(1)) {
++bad_block: ext2_error(inode->i_sb, "ext2_xattr_get",
++ "inode %ld: bad block %d", inode->i_ino, block);
++ error = -EIO;
++ goto cleanup;
++ }
++ /* find named attribute */
++ name_len = strlen(name);
++
++ error = -ERANGE;
++ if (name_len > 255)
++ goto cleanup;
++ entry = FIRST_ENTRY(bh);
++ while (!IS_LAST_ENTRY(entry)) {
++ struct ext2_xattr_entry *next =
++ EXT2_XATTR_NEXT(entry);
++ if ((char *)next >= end)
++ goto bad_block;
++ if (name_index == entry->e_name_index &&
++ name_len == entry->e_name_len &&
++ memcmp(name, entry->e_name, name_len) == 0)
++ goto found;
++ entry = next;
++ }
++ /* Check the remaining name entries */
++ while (!IS_LAST_ENTRY(entry)) {
++ struct ext2_xattr_entry *next =
++ EXT2_XATTR_NEXT(entry);
++ if ((char *)next >= end)
++ goto bad_block;
++ entry = next;
++ }
++ if (ext2_xattr_cache_insert(bh))
++ ea_idebug(inode, "cache insert failed");
++ error = -ENOATTR;
++ goto cleanup;
++found:
++ /* check the buffer size */
++ if (entry->e_value_block != 0)
++ goto bad_block;
++ size = le32_to_cpu(entry->e_value_size);
++ if (size > inode->i_sb->s_blocksize ||
++ le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize)
++ goto bad_block;
++
++ if (ext2_xattr_cache_insert(bh))
++ ea_idebug(inode, "cache insert failed");
++ if (buffer) {
++ error = -ERANGE;
++ if (size > buffer_size)
++ goto cleanup;
++ /* return value of attribute */
++ memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs),
++ size);
++ }
++ error = size;
++
++cleanup:
++ brelse(bh);
++
++ return error;
++}
++
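
Callers typically use the NULL-buffer convention described above to size the result before fetching it; a sketch using a hypothetical user.mime_type attribute:

	int size = ext2_xattr_get(inode, EXT2_XATTR_INDEX_USER,
				  "mime_type", NULL, 0);
	if (size >= 0) {
		char *buf = kmalloc(size, GFP_KERNEL);
		if (buf != NULL)
			size = ext2_xattr_get(inode, EXT2_XATTR_INDEX_USER,
					      "mime_type", buf, size);
	}

ext2_xattr_list() below supports the same convention for sizing the name list.
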
++/*
++ * ext2_xattr_list()
++ *
++ * Copy a list of attribute names into the buffer
++ * provided, or compute the buffer size required.
++ * Buffer is NULL to compute the size of the buffer required.
++ *
++ * Returns a negative error number on failure, or the number of bytes
++ * used / required on success.
++ */
++int
++ext2_xattr_list(struct inode *inode, char *buffer, size_t buffer_size)
++{
++ struct buffer_head *bh = NULL;
++ struct ext2_xattr_entry *entry;
++ unsigned int block, size = 0;
++ char *buf, *end;
++ int error;
++
++ ea_idebug(inode, "buffer=%p, buffer_size=%ld",
++ buffer, (long)buffer_size);
++
++ if (!EXT2_I(inode)->i_file_acl)
++ return 0;
++ block = EXT2_I(inode)->i_file_acl;
++ ea_idebug(inode, "reading block %d", block);
++ bh = sb_bread(inode->i_sb, block);
++ if (!bh)
++ return -EIO;
++ ea_bdebug(bh, "b_count=%d, refcount=%d",
++ atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount));
++ end = bh->b_data + bh->b_size;
++ if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) ||
++ HDR(bh)->h_blocks != cpu_to_le32(1)) {
++bad_block: ext2_error(inode->i_sb, "ext2_xattr_list",
++ "inode %ld: bad block %d", inode->i_ino, block);
++ error = -EIO;
++ goto cleanup;
++ }
++ /* compute the size required for the list of attribute names */
++ for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry);
++ entry = EXT2_XATTR_NEXT(entry)) {
++ struct ext2_xattr_handler *handler;
++ struct ext2_xattr_entry *next =
++ EXT2_XATTR_NEXT(entry);
++ if ((char *)next >= end)
++ goto bad_block;
++
++ handler = ext2_xattr_handler(entry->e_name_index);
++ if (handler)
++ size += handler->list(NULL, inode, entry->e_name,
++ entry->e_name_len);
++ }
++
++ if (ext2_xattr_cache_insert(bh))
++ ea_idebug(inode, "cache insert failed");
++ if (!buffer) {
++ error = size;
++ goto cleanup;
++ } else {
++ error = -ERANGE;
++ if (size > buffer_size)
++ goto cleanup;
++ }
++
++ /* list the attribute names */
++ buf = buffer;
++ for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry);
++ entry = EXT2_XATTR_NEXT(entry)) {
++ struct ext2_xattr_handler *handler;
++
++ handler = ext2_xattr_handler(entry->e_name_index);
++ if (handler)
++ buf += handler->list(buf, inode, entry->e_name,
++ entry->e_name_len);
++ }
++ error = size;
++
++cleanup:
++ brelse(bh);
++
++ return error;
++}
++
++/*
++ * If the EXT2_FEATURE_COMPAT_EXT_ATTR feature of this file system is
++ * not set, set it.
++ */
++static void ext2_xattr_update_super_block(struct super_block *sb)
++{
++ if (EXT2_HAS_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR))
++ return;
++
++ lock_super(sb);
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0)
++ EXT2_SB(sb)->s_feature_compat |= EXT2_FEATURE_COMPAT_EXT_ATTR;
++#endif
++ EXT2_SB(sb)->s_es->s_feature_compat |=
++ cpu_to_le32(EXT2_FEATURE_COMPAT_EXT_ATTR);
++ sb->s_dirt = 1;
++ mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
++ unlock_super(sb);
++}
++
++/*
++ * ext2_xattr_set()
++ *
++ * Create, replace or remove an extended attribute for this inode. Buffer
++ * is NULL to remove an existing extended attribute, and non-NULL to
++ * either replace an existing extended attribute, or create a new extended
++ * attribute. The flags XATTR_REPLACE and XATTR_CREATE
++ * specify that an extended attribute must exist and must not exist
++ * prior to the call, respectively.
++ *
++ * Returns 0, or a negative error number on failure.
++ */
++int
++ext2_xattr_set(struct inode *inode, int name_index, const char *name,
++ const void *value, size_t value_len, int flags)
++{
++ struct super_block *sb = inode->i_sb;
++ struct buffer_head *bh = NULL;
++ struct ext2_xattr_header *header = NULL;
++ struct ext2_xattr_entry *here, *last;
++ unsigned int name_len;
++ int block = EXT2_I(inode)->i_file_acl;
++ int min_offs = sb->s_blocksize, not_found = 1, free, error;
++ char *end;
++
++ /*
++ * header -- Points either into bh, or to a temporarily
++ * allocated buffer.
++ * here -- The named entry found, or the place for inserting, within
++ * the block pointed to by header.
++ * last -- Points right after the last named entry within the block
++ * pointed to by header.
++ * min_offs -- The offset of the first value (values are aligned
++ * towards the end of the block).
++ * end -- Points right after the block pointed to by header.
++ */
++
++ ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld",
++ name_index, name, value, (long)value_len);
++
++ if (IS_RDONLY(inode))
++ return -EROFS;
++ if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
++ return -EPERM;
++ if (value == NULL)
++ value_len = 0;
++ if (name == NULL)
++ return -EINVAL;
++ name_len = strlen(name);
++ if (name_len > 255 || value_len > sb->s_blocksize)
++ return -ERANGE;
++ down(&ext2_xattr_sem);
++
++ if (block) {
++ /* The inode already has an extended attribute block. */
++
++ bh = sb_bread(sb, block);
++ error = -EIO;
++ if (!bh)
++ goto cleanup;
++ ea_bdebug(bh, "b_count=%d, refcount=%d",
++ atomic_read(&(bh->b_count)),
++ le32_to_cpu(HDR(bh)->h_refcount));
++ header = HDR(bh);
++ end = bh->b_data + bh->b_size;
++ if (header->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) ||
++ header->h_blocks != cpu_to_le32(1)) {
++bad_block: ext2_error(sb, "ext2_xattr_set",
++ "inode %ld: bad block %d", inode->i_ino, block);
++ error = -EIO;
++ goto cleanup;
++ }
++ /* Find the named attribute. */
++ here = FIRST_ENTRY(bh);
++ while (!IS_LAST_ENTRY(here)) {
++ struct ext2_xattr_entry *next = EXT2_XATTR_NEXT(here);
++ if ((char *)next >= end)
++ goto bad_block;
++ if (!here->e_value_block && here->e_value_size) {
++ int offs = le16_to_cpu(here->e_value_offs);
++ if (offs < min_offs)
++ min_offs = offs;
++ }
++ not_found = name_index - here->e_name_index;
++ if (!not_found)
++ not_found = name_len - here->e_name_len;
++ if (!not_found)
++ not_found = memcmp(name, here->e_name,name_len);
++ if (not_found <= 0)
++ break;
++ here = next;
++ }
++ last = here;
++ /* We still need to compute min_offs and last. */
++ while (!IS_LAST_ENTRY(last)) {
++ struct ext2_xattr_entry *next = EXT2_XATTR_NEXT(last);
++ if ((char *)next >= end)
++ goto bad_block;
++ if (!last->e_value_block && last->e_value_size) {
++ int offs = le16_to_cpu(last->e_value_offs);
++ if (offs < min_offs)
++ min_offs = offs;
++ }
++ last = next;
++ }
++
++ /* Check whether we have enough space left. */
++ free = min_offs - ((char*)last - (char*)header) - sizeof(__u32);
++ } else {
++ /* We will use a new extended attribute block. */
++ free = sb->s_blocksize -
++ sizeof(struct ext2_xattr_header) - sizeof(__u32);
++ here = last = NULL; /* avoid gcc uninitialized warning. */
++ }
++
++ if (not_found) {
++ /* Request to remove a nonexistent attribute? */
++ error = -ENOATTR;
++ if (flags & XATTR_REPLACE)
++ goto cleanup;
++ error = 0;
++ if (value == NULL)
++ goto cleanup;
++ else
++ free -= EXT2_XATTR_LEN(name_len);
++ } else {
++ /* Request to create an existing attribute? */
++ error = -EEXIST;
++ if (flags & XATTR_CREATE)
++ goto cleanup;
++ if (!here->e_value_block && here->e_value_size) {
++ unsigned int size = le32_to_cpu(here->e_value_size);
++
++ if (le16_to_cpu(here->e_value_offs) + size >
++ sb->s_blocksize || size > sb->s_blocksize)
++ goto bad_block;
++ free += EXT2_XATTR_SIZE(size);
++ }
++ }
++ free -= EXT2_XATTR_SIZE(value_len);
++ error = -ENOSPC;
++ if (free < 0)
++ goto cleanup;
++
++ /* Here we know that we can set the new attribute. */
++
++ if (header) {
++ if (header->h_refcount == cpu_to_le32(1)) {
++ ea_bdebug(bh, "modifying in-place");
++ ext2_xattr_cache_remove(bh);
++ } else {
++ int offset;
++
++ ea_bdebug(bh, "cloning");
++ header = kmalloc(bh->b_size, GFP_KERNEL);
++ error = -ENOMEM;
++ if (header == NULL)
++ goto cleanup;
++ memcpy(header, HDR(bh), bh->b_size);
++ header->h_refcount = cpu_to_le32(1);
++ offset = (char *)header - bh->b_data;
++ here = ENTRY((char *)here + offset);
++ last = ENTRY((char *)last + offset);
++ }
++ } else {
++ /* Allocate a buffer where we construct the new block. */
++ header = kmalloc(sb->s_blocksize, GFP_KERNEL);
++ error = -ENOMEM;
++ if (header == NULL)
++ goto cleanup;
++ memset(header, 0, sb->s_blocksize);
++ end = (char *)header + sb->s_blocksize;
++ header->h_magic = cpu_to_le32(EXT2_XATTR_MAGIC);
++ header->h_blocks = header->h_refcount = cpu_to_le32(1);
++ last = here = ENTRY(header+1);
++ }
++
++ if (not_found) {
++ /* Insert the new name. */
++ int size = EXT2_XATTR_LEN(name_len);
++ int rest = (char *)last - (char *)here;
++ memmove((char *)here + size, here, rest);
++ memset(here, 0, size);
++ here->e_name_index = name_index;
++ here->e_name_len = name_len;
++ memcpy(here->e_name, name, name_len);
++ } else {
++ /* Remove the old value. */
++ if (!here->e_value_block && here->e_value_size) {
++ char *first_val = (char *)header + min_offs;
++ int offs = le16_to_cpu(here->e_value_offs);
++ char *val = (char *)header + offs;
++ size_t size = EXT2_XATTR_SIZE(
++ le32_to_cpu(here->e_value_size));
++ memmove(first_val + size, first_val, val - first_val);
++ memset(first_val, 0, size);
++ here->e_value_offs = 0;
++ min_offs += size;
++
++ /* Adjust all value offsets. */
++ last = ENTRY(header+1);
++ while (!IS_LAST_ENTRY(last)) {
++ int o = le16_to_cpu(last->e_value_offs);
++ if (!last->e_value_block && o < offs)
++ last->e_value_offs =
++ cpu_to_le16(o + size);
++ last = EXT2_XATTR_NEXT(last);
++ }
++ }
++ if (value == NULL) {
++ /* Remove this attribute. */
++ if (EXT2_XATTR_NEXT(ENTRY(header+1)) == last) {
++ /* This block is now empty. */
++ error = ext2_xattr_set2(inode, bh, NULL);
++ goto cleanup;
++ } else {
++ /* Remove the old name. */
++ int size = EXT2_XATTR_LEN(name_len);
++ last = ENTRY((char *)last - size);
++ memmove(here, (char*)here + size,
++ (char*)last - (char*)here);
++ memset(last, 0, size);
++ }
++ }
++ }
++
++ if (value != NULL) {
++ /* Insert the new value. */
++ here->e_value_size = cpu_to_le32(value_len);
++ if (value_len) {
++ size_t size = EXT2_XATTR_SIZE(value_len);
++ char *val = (char *)header + min_offs - size;
++ here->e_value_offs =
++ cpu_to_le16((char *)val - (char *)header);
++ memset(val + size - EXT2_XATTR_PAD, 0,
++ EXT2_XATTR_PAD); /* Clear the pad bytes. */
++ memcpy(val, value, value_len);
++ }
++ }
++ ext2_xattr_rehash(header, here);
++
++ error = ext2_xattr_set2(inode, bh, header);
++
++cleanup:
++ brelse(bh);
++ if (!(bh && header == HDR(bh)))
++ kfree(header);
++ up(&ext2_xattr_sem);
++
++ return error;
++}
++
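
The flag semantics in practice, sketched against the user namespace index:

	/* fails with -EEXIST if "foo" already exists */
	ext2_xattr_set(inode, EXT2_XATTR_INDEX_USER, "foo", "bar", 3,
		       XATTR_CREATE);
	/* fails with -ENOATTR if "foo" does not exist yet */
	ext2_xattr_set(inode, EXT2_XATTR_INDEX_USER, "foo", "baz", 3,
		       XATTR_REPLACE);
	/* a NULL value removes the attribute */
	ext2_xattr_set(inode, EXT2_XATTR_INDEX_USER, "foo", NULL, 0, 0);
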
++/*
++ * Second half of ext2_xattr_set(): Update the file system.
++ */
++static int
++ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
++ struct ext2_xattr_header *header)
++{
++ struct super_block *sb = inode->i_sb;
++ struct buffer_head *new_bh = NULL;
++ int error;
++
++ if (header) {
++ new_bh = ext2_xattr_cache_find(inode, header);
++ if (new_bh) {
++ /*
++ * We found an identical block in the cache.
++ * The old block will be released after updating
++ * the inode.
++ */
++ ea_bdebug(old_bh, "reusing block %ld",
++ new_bh->b_blocknr);
++
++ error = -EDQUOT;
++ if (ext2_xattr_quota_alloc(inode, 1))
++ goto cleanup;
++
++ HDR(new_bh)->h_refcount = cpu_to_le32(
++ le32_to_cpu(HDR(new_bh)->h_refcount) + 1);
++ ea_bdebug(new_bh, "refcount now=%d",
++ le32_to_cpu(HDR(new_bh)->h_refcount));
++ } else if (old_bh && header == HDR(old_bh)) {
++ /* Keep this block. */
++ new_bh = old_bh;
++ ext2_xattr_cache_insert(new_bh);
++ } else {
++ /* We need to allocate a new block */
++ int force = EXT2_I(inode)->i_file_acl != 0;
++ int block = ext2_xattr_new_block(inode, &error, force);
++ if (error)
++ goto cleanup;
++ ea_idebug(inode, "creating block %d", block);
++
++ new_bh = sb_getblk(sb, block);
++ if (!new_bh) {
++ ext2_xattr_free_block(inode, block);
++ error = -EIO;
++ goto cleanup;
++ }
++ lock_buffer(new_bh);
++ memcpy(new_bh->b_data, header, new_bh->b_size);
++ mark_buffer_uptodate(new_bh, 1);
++ unlock_buffer(new_bh);
++ ext2_xattr_cache_insert(new_bh);
++
++ ext2_xattr_update_super_block(sb);
++ }
++ mark_buffer_dirty(new_bh);
++ if (IS_SYNC(inode)) {
++ ll_rw_block(WRITE, 1, &new_bh);
++ wait_on_buffer(new_bh);
++ error = -EIO;
++ if (buffer_req(new_bh) && !buffer_uptodate(new_bh))
++ goto cleanup;
++ }
++ }
++
++ /* Update the inode. */
++ EXT2_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
++ inode->i_ctime = CURRENT_TIME;
++ if (IS_SYNC(inode)) {
++ error = ext2_sync_inode (inode);
++ if (error)
++ goto cleanup;
++ } else
++ mark_inode_dirty(inode);
++
++ error = 0;
++ if (old_bh && old_bh != new_bh) {
++ /*
++ * If there was an old block, and we are not still using it,
++ * we now release the old block.
++ */
++ unsigned int refcount = le32_to_cpu(HDR(old_bh)->h_refcount);
++
++ if (refcount == 1) {
++ /* Free the old block. */
++ ea_bdebug(old_bh, "freeing");
++ ext2_xattr_free_block(inode, old_bh->b_blocknr);
++ mark_buffer_clean(old_bh);
++ } else {
++ /* Decrement the refcount only. */
++ refcount--;
++ HDR(old_bh)->h_refcount = cpu_to_le32(refcount);
++ ext2_xattr_quota_free(inode);
++ mark_buffer_dirty(old_bh);
++ ea_bdebug(old_bh, "refcount now=%d", refcount);
++ }
++ }
++
++cleanup:
++ if (old_bh != new_bh)
++ brelse(new_bh);
++
++ return error;
++}
++
++/*
++ * ext2_xattr_delete_inode()
++ *
++ * Free extended attribute resources associated with this inode. This
++ * is called immediately before an inode is freed.
++ */
++void
++ext2_xattr_delete_inode(struct inode *inode)
++{
++ struct buffer_head *bh;
++ unsigned int block = EXT2_I(inode)->i_file_acl;
++
++ if (!block)
++ return;
++ down(&ext2_xattr_sem);
++
++ bh = sb_bread(inode->i_sb, block);
++ if (!bh) {
++ ext2_error(inode->i_sb, "ext2_xattr_delete_inode",
++ "inode %ld: block %d read error", inode->i_ino, block);
++ goto cleanup;
++ }
++ ea_bdebug(bh, "b_count=%d", atomic_read(&(bh->b_count)));
++ if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) ||
++ HDR(bh)->h_blocks != cpu_to_le32(1)) {
++ ext2_error(inode->i_sb, "ext2_xattr_delete_inode",
++ "inode %ld: bad block %d", inode->i_ino, block);
++ goto cleanup;
++ }
++ ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount) - 1);
++ if (HDR(bh)->h_refcount == cpu_to_le32(1)) {
++ ext2_xattr_cache_remove(bh);
++ ext2_xattr_free_block(inode, block);
++ bforget(bh);
++ bh = NULL;
++ } else {
++ HDR(bh)->h_refcount = cpu_to_le32(
++ le32_to_cpu(HDR(bh)->h_refcount) - 1);
++ mark_buffer_dirty(bh);
++ if (IS_SYNC(inode)) {
++ ll_rw_block(WRITE, 1, &bh);
++ wait_on_buffer(bh);
++ }
++ ext2_xattr_quota_free(inode);
++ }
++ EXT2_I(inode)->i_file_acl = 0;
++
++cleanup:
++ brelse(bh);
++ up(&ext2_xattr_sem);
++}
++
++/*
++ * ext2_xattr_put_super()
++ *
++ * This is called when a file system is unmounted.
++ */
++void
++ext2_xattr_put_super(struct super_block *sb)
++{
++#ifdef CONFIG_EXT2_FS_XATTR_SHARING
++ mb_cache_shrink(ext2_xattr_cache, sb->s_dev);
++#endif
++}
++
++#ifdef CONFIG_EXT2_FS_XATTR_SHARING
++
++/*
++ * ext2_xattr_cache_insert()
++ *
++ * Create a new entry in the extended attribute cache, and insert
++ * it unless such an entry is already in the cache.
++ *
++ * Returns 0, or a negative error number on failure.
++ */
++static int
++ext2_xattr_cache_insert(struct buffer_head *bh)
++{
++ __u32 hash = le32_to_cpu(HDR(bh)->h_hash);
++ struct mb_cache_entry *ce;
++ int error;
++
++ ce = mb_cache_entry_alloc(ext2_xattr_cache);
++ if (!ce)
++ return -ENOMEM;
++ error = mb_cache_entry_insert(ce, bh->b_dev, bh->b_blocknr, &hash);
++ if (error) {
++ mb_cache_entry_free(ce);
++ if (error == -EBUSY) {
++ ea_bdebug(bh, "already in cache (%d cache entries)",
++ atomic_read(&ext2_xattr_cache->c_entry_count));
++ error = 0;
++ }
++ } else {
++ ea_bdebug(bh, "inserting [%x] (%d cache entries)", (int)hash,
++ atomic_read(&ext2_xattr_cache->c_entry_count));
++ mb_cache_entry_release(ce);
++ }
++ return error;
++}
++
++/*
++ * ext2_xattr_cmp()
++ *
++ * Compare two extended attribute blocks for equality.
++ *
++ * Returns 0 if the blocks are equal, 1 if they differ, and
++ * a negative error number on errors.
++ */
++static int
++ext2_xattr_cmp(struct ext2_xattr_header *header1,
++ struct ext2_xattr_header *header2)
++{
++ struct ext2_xattr_entry *entry1, *entry2;
++
++ entry1 = ENTRY(header1+1);
++ entry2 = ENTRY(header2+1);
++ while (!IS_LAST_ENTRY(entry1)) {
++ if (IS_LAST_ENTRY(entry2))
++ return 1;
++ if (entry1->e_hash != entry2->e_hash ||
++ entry1->e_name_len != entry2->e_name_len ||
++ entry1->e_value_size != entry2->e_value_size ||
++ memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len))
++ return 1;
++ if (entry1->e_value_block != 0 || entry2->e_value_block != 0)
++ return -EIO;
++ if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs),
++ (char *)header2 + le16_to_cpu(entry2->e_value_offs),
++ le32_to_cpu(entry1->e_value_size)))
++ return 1;
++
++ entry1 = EXT2_XATTR_NEXT(entry1);
++ entry2 = EXT2_XATTR_NEXT(entry2);
++ }
++ if (!IS_LAST_ENTRY(entry2))
++ return 1;
++ return 0;
++}
++
++/*
++ * ext2_xattr_cache_find()
++ *
++ * Find an identical extended attribute block.
++ *
++ * Returns a pointer to the block found, or NULL if such a block was
++ * not found or an error occurred.
++ */
++static struct buffer_head *
++ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header)
++{
++ __u32 hash = le32_to_cpu(header->h_hash);
++ struct mb_cache_entry *ce;
++
++ if (!header->h_hash)
++ return NULL; /* never share */
++ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
++ ce = mb_cache_entry_find_first(ext2_xattr_cache, 0, inode->i_dev, hash);
++ while (ce) {
++ struct buffer_head *bh = sb_bread(inode->i_sb, ce->e_block);
++
++ if (!bh) {
++ ext2_error(inode->i_sb, "ext2_xattr_cache_find",
++ "inode %ld: block %ld read error",
++ inode->i_ino, ce->e_block);
++ } else if (le32_to_cpu(HDR(bh)->h_refcount) >
++ EXT2_XATTR_REFCOUNT_MAX) {
++ ea_idebug(inode, "block %ld refcount %d>%d",ce->e_block,
++ le32_to_cpu(HDR(bh)->h_refcount),
++ EXT2_XATTR_REFCOUNT_MAX);
++ } else if (!ext2_xattr_cmp(header, HDR(bh))) {
++ ea_bdebug(bh, "b_count=%d",atomic_read(&(bh->b_count)));
++ mb_cache_entry_release(ce);
++ return bh;
++ }
++ brelse(bh);
++ ce = mb_cache_entry_find_next(ce, 0, inode->i_dev, hash);
++ }
++ return NULL;
++}
++
++/*
++ * ext2_xattr_cache_remove()
++ *
++ * Remove a block's entry from the cache. Called when a
++ * block becomes invalid.
++ */
++static void
++ext2_xattr_cache_remove(struct buffer_head *bh)
++{
++ struct mb_cache_entry *ce;
++
++ ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_dev, bh->b_blocknr);
++ if (ce) {
++ ea_bdebug(bh, "removing (%d cache entries remaining)",
++ atomic_read(&ext2_xattr_cache->c_entry_count)-1);
++ mb_cache_entry_free(ce);
++ } else
++ ea_bdebug(bh, "no cache entry");
++}
++
++#define NAME_HASH_SHIFT 5
++#define VALUE_HASH_SHIFT 16
++
++/*
++ * ext2_xattr_hash_entry()
++ *
++ * Compute the hash of an extended attribute.
++ */
++static inline void ext2_xattr_hash_entry(struct ext2_xattr_header *header,
++ struct ext2_xattr_entry *entry)
++{
++ __u32 hash = 0;
++ char *name = entry->e_name;
++ int n;
++
++ for (n=0; n < entry->e_name_len; n++) {
++ hash = (hash << NAME_HASH_SHIFT) ^
++ (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^
++ *name++;
++ }
++
++ if (entry->e_value_block == 0 && entry->e_value_size != 0) {
++ __u32 *value = (__u32 *)((char *)header +
++ le16_to_cpu(entry->e_value_offs));
++ for (n = (le32_to_cpu(entry->e_value_size) +
++ EXT2_XATTR_ROUND) >> EXT2_XATTR_PAD_BITS; n; n--) {
++ hash = (hash << VALUE_HASH_SHIFT) ^
++ (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^
++ le32_to_cpu(*value++);
++ }
++ }
++ entry->e_hash = cpu_to_le32(hash);
++}
++
++#undef NAME_HASH_SHIFT
++#undef VALUE_HASH_SHIFT
++
++#define BLOCK_HASH_SHIFT 16
++
++/*
++ * ext2_xattr_rehash()
++ *
++ * Re-compute the extended attribute hash value after an entry has changed.
++ */
++static void ext2_xattr_rehash(struct ext2_xattr_header *header,
++ struct ext2_xattr_entry *entry)
++{
++ struct ext2_xattr_entry *here;
++ __u32 hash = 0;
++
++ ext2_xattr_hash_entry(header, entry);
++ here = ENTRY(header+1);
++ while (!IS_LAST_ENTRY(here)) {
++ if (!here->e_hash) {
++ /* Block is not shared if an entry's hash value == 0 */
++ hash = 0;
++ break;
++ }
++ hash = (hash << BLOCK_HASH_SHIFT) ^
++ (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^
++ le32_to_cpu(here->e_hash);
++ here = EXT2_XATTR_NEXT(here);
++ }
++ header->h_hash = cpu_to_le32(hash);
++}
++
++#undef BLOCK_HASH_SHIFT
++
++int __init
++init_ext2_xattr(void)
++{
++ ext2_xattr_cache = mb_cache_create("ext2_xattr", NULL,
++ sizeof(struct mb_cache_entry) +
++ sizeof(struct mb_cache_entry_index), 1, 61);
++ if (!ext2_xattr_cache)
++ return -ENOMEM;
++
++ return 0;
++}
++
++void
++exit_ext2_xattr(void)
++{
++ mb_cache_destroy(ext2_xattr_cache);
++}
++
++#else /* CONFIG_EXT2_FS_XATTR_SHARING */
++
++int __init
++init_ext2_xattr(void)
++{
++ return 0;
++}
++
++void
++exit_ext2_xattr(void)
++{
++}
++
++#endif /* CONFIG_EXT2_FS_XATTR_SHARING */
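
The set and cleanup paths above all walk the same on-disk structure: a header followed by entry descriptors, each padded to a 4-byte boundary, with four null bytes terminating the list (see IS_LAST_ENTRY and EXT2_XATTR_NEXT). The standalone C sketch below models that walk in userspace; the struct layout and rounding follow the 2.4 ext2_xattr.h definitions, but the names here (xattr_entry, ENTRY_LEN, NEXT, IS_LAST) are illustrative stand-ins, not kernel symbols, and the real block begins with a struct ext2_xattr_header that is omitted for brevity.

/* Minimal userspace model of the on-disk xattr entry walk.
 * Layout and rounding mirror 2.4 ext2_xattr.h; treat as illustrative. */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define XATTR_PAD   4
#define XATTR_ROUND (XATTR_PAD - 1)

struct xattr_entry {
	uint8_t  e_name_len;
	uint8_t  e_name_index;
	uint16_t e_value_offs;
	uint32_t e_value_block;
	uint32_t e_value_size;
	uint32_t e_hash;
	char     e_name[];      /* name follows the fixed descriptor */
};

/* Descriptors are padded to XATTR_PAD bytes, like EXT2_XATTR_LEN(). */
#define ENTRY_LEN(name_len) \
	(((name_len) + sizeof(struct xattr_entry) + XATTR_ROUND) & ~XATTR_ROUND)
#define NEXT(e) \
	((struct xattr_entry *)((char *)(e) + ENTRY_LEN((e)->e_name_len)))
#define IS_LAST(e) (*(uint32_t *)(e) == 0)  /* four null bytes end the list */

int main(void)
{
	uint32_t storage[64] = { 0 };           /* zeroed, 4-byte aligned */
	char *block = (char *)storage;
	struct xattr_entry *e = (struct xattr_entry *)block;

	e->e_name_len = 3;
	memcpy(e->e_name, "foo", 3);
	e = NEXT(e);
	e->e_name_len = 6;
	memcpy(e->e_name, "foobar", 6);
	/* the four null bytes after the last entry are already in place */

	for (e = (struct xattr_entry *)block; !IS_LAST(e); e = NEXT(e))
		printf("entry: %.*s\n", (int)e->e_name_len, e->e_name);
	return 0;
}

Keeping this shape in mind makes the recurring bounds check in the patch, `if ((char *)next >= end) goto bad_block;`, easy to follow: the walk is trusted only as far as the block's own length.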
+Index: linux-DRV401/fs/ext2/xattr_user.c
+===================================================================
+--- linux-DRV401.orig/fs/ext2/xattr_user.c 2004-10-12 08:56:38.404764448 -0700
++++ linux-DRV401/fs/ext2/xattr_user.c 2004-10-15 11:03:51.000000000 -0700
+@@ -0,0 +1,103 @@
++/*
++ * linux/fs/ext2/xattr_user.c
++ * Handler for extended user attributes.
++ *
++ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
++ */
++
++#include <linux/module.h>
++#include <linux/string.h>
++#include <linux/fs.h>
++#include <linux/ext2_fs.h>
++#include <linux/ext2_xattr.h>
++
++#ifdef CONFIG_EXT2_FS_POSIX_ACL
++# include <linux/ext2_acl.h>
++#endif
++
++#define XATTR_USER_PREFIX "user."
++
++static size_t
++ext2_xattr_user_list(char *list, struct inode *inode,
++ const char *name, int name_len)
++{
++ const int prefix_len = sizeof(XATTR_USER_PREFIX)-1;
++
++ if (!test_opt(inode->i_sb, XATTR_USER))
++ return 0;
++
++ if (list) {
++ memcpy(list, XATTR_USER_PREFIX, prefix_len);
++ memcpy(list+prefix_len, name, name_len);
++ list[prefix_len + name_len] = '\0';
++ }
++ return prefix_len + name_len + 1;
++}
++
++static int
++ext2_xattr_user_get(struct inode *inode, const char *name,
++ void *buffer, size_t size)
++{
++ int error;
++
++ if (strcmp(name, "") == 0)
++ return -EINVAL;
++ if (!test_opt(inode->i_sb, XATTR_USER))
++ return -ENOTSUP;
++#ifdef CONFIG_EXT2_FS_POSIX_ACL
++ error = ext2_permission_locked(inode, MAY_READ);
++#else
++ error = permission(inode, MAY_READ);
++#endif
++ if (error)
++ return error;
++
++ return ext2_xattr_get(inode, EXT2_XATTR_INDEX_USER, name,
++ buffer, size);
++}
++
++static int
++ext2_xattr_user_set(struct inode *inode, const char *name,
++ const void *value, size_t size, int flags)
++{
++ int error;
++
++ if (strcmp(name, "") == 0)
++ return -EINVAL;
++ if (!test_opt(inode->i_sb, XATTR_USER))
++ return -ENOTSUP;
++ if ( !S_ISREG(inode->i_mode) &&
++ (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX))
++ return -EPERM;
++#ifdef CONFIG_EXT2_FS_POSIX_ACL
++ error = ext2_permission_locked(inode, MAY_WRITE);
++#else
++ error = permission(inode, MAY_WRITE);
++#endif
++ if (error)
++ return error;
++
++ return ext2_xattr_set(inode, EXT2_XATTR_INDEX_USER, name,
++ value, size, flags);
++}
++
++struct ext2_xattr_handler ext2_xattr_user_handler = {
++ prefix: XATTR_USER_PREFIX,
++ list: ext2_xattr_user_list,
++ get: ext2_xattr_user_get,
++ set: ext2_xattr_user_set,
++};
++
++int __init
++init_ext2_xattr_user(void)
++{
++ return ext2_xattr_register(EXT2_XATTR_INDEX_USER,
++ &ext2_xattr_user_handler);
++}
++
++void
++exit_ext2_xattr_user(void)
++{
++ ext2_xattr_unregister(EXT2_XATTR_INDEX_USER,
++ &ext2_xattr_user_handler);
++}
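
The handler registered above (prefix/list/get/set) is dispatched purely by name prefix: the resolver strips a matching prefix such as "user." and hands the remaining suffix to the handler, or fails with ENOTSUP when no prefix matches (see strcmp_prefix() and the resolve functions in the xattr.c hunks). A minimal userspace model of that dispatch, with invented names:

/* Userspace model of prefix-based handler dispatch. All names are
 * illustrative stand-ins for the kernel's handler table and resolver. */
#include <stdio.h>
#include <stddef.h>

struct handler {
	const char *prefix;
	int (*get)(const char *suffix);
};

static int user_get(const char *suffix)
{
	printf("user handler gets \"%s\"\n", suffix);
	return 0;
}

static struct handler handlers[] = {
	{ "user.", user_get },
};

/* Return the suffix after the prefix, or NULL if it does not match;
 * mirrors strcmp_prefix() in the xattr.c hunks. */
static const char *match_prefix(const char *name, const char *prefix)
{
	while (*prefix && *name == *prefix) {
		name++;
		prefix++;
	}
	return *prefix ? NULL : name;
}

static int dispatch(const char *name)
{
	size_t i;
	for (i = 0; i < sizeof(handlers) / sizeof(handlers[0]); i++) {
		const char *suffix = match_prefix(name, handlers[i].prefix);
		if (suffix)
			return handlers[i].get(suffix);
	}
	return -1;  /* no handler: the kernel returns -ENOTSUP here */
}

int main(void)
{
	dispatch("user.comment");  /* -> user handler gets "comment" */
	return dispatch("system.posix_acl_access") == -1 ? 0 : 1;
}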
+Index: linux-DRV401/fs/ext3/Makefile
+===================================================================
+--- linux-DRV401.orig/fs/ext3/Makefile 2004-10-15 10:39:16.000000000 -0700
++++ linux-DRV401/fs/ext3/Makefile 2004-10-15 11:03:51.000000000 -0700
+@@ -1,5 +1,5 @@
+ #
+-# Makefile for the linux ext2-filesystem routines.
++# Makefile for the linux ext3-filesystem routines.
+ #
+ # Note! Dependencies are done automagically by 'make dep', which also
+ # removes any old dependencies. DON'T put your own dependencies here
+@@ -9,8 +9,14 @@
+
+ O_TARGET := ext3.o
+
++export-objs := ext3-exports.o
++
+ obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
+- ioctl.o namei.o super.o symlink.o hash.o
++ ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o
+ obj-m := $(O_TARGET)
+
++export-objs += xattr.o
++obj-$(CONFIG_EXT3_FS_XATTR) += xattr.o
++obj-$(CONFIG_EXT3_FS_XATTR_USER) += xattr_user.o
++
+ include $(TOPDIR)/Rules.make
+Index: linux-DRV401/fs/ext3/file.c
+===================================================================
+--- linux-DRV401.orig/fs/ext3/file.c 2004-10-15 10:39:16.000000000 -0700
++++ linux-DRV401/fs/ext3/file.c 2004-10-15 11:03:51.000000000 -0700
+@@ -23,6 +23,7 @@
+ #include <linux/locks.h>
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
++#include <linux/ext3_xattr.h>
+ #include <linux/ext3_jbd.h>
+ #include <linux/smp_lock.h>
+
+@@ -93,5 +94,9 @@
+ struct inode_operations ext3_file_inode_operations = {
+ truncate: ext3_truncate, /* BKL held */
+ setattr: ext3_setattr, /* BKL held */
++ setxattr: ext3_setxattr, /* BKL held */
++ getxattr: ext3_getxattr, /* BKL held */
++ listxattr: ext3_listxattr, /* BKL held */
++ removexattr: ext3_removexattr, /* BKL held */
+ };
+
+Index: linux-DRV401/fs/ext3/ialloc.c
+===================================================================
+--- linux-DRV401.orig/fs/ext3/ialloc.c 2004-10-15 10:24:00.000000000 -0700
++++ linux-DRV401/fs/ext3/ialloc.c 2004-10-15 11:03:52.000000000 -0700
+@@ -17,6 +17,7 @@
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
+ #include <linux/ext3_jbd.h>
++#include <linux/ext3_xattr.h>
+ #include <linux/stat.h>
+ #include <linux/string.h>
+ #include <linux/locks.h>
+@@ -216,6 +217,7 @@
+ * as writing the quota to disk may need the lock as well.
+ */
+ DQUOT_INIT(inode);
++ ext3_xattr_delete_inode(handle, inode);
+ DQUOT_FREE_INODE(inode);
+ DQUOT_DROP(inode);
+
+Index: linux-DRV401/fs/ext3/inode.c
+===================================================================
+--- linux-DRV401.orig/fs/ext3/inode.c 2004-10-15 10:24:00.000000000 -0700
++++ linux-DRV401/fs/ext3/inode.c 2004-10-15 11:03:52.000000000 -0700
+@@ -39,6 +39,18 @@
+ */
+ #undef SEARCH_FROM_ZERO
+
++/*
++ * Test whether an inode is a fast symlink.
++ */
++static inline int ext3_inode_is_fast_symlink(struct inode *inode)
++{
++ int ea_blocks = inode->u.ext3_i.i_file_acl ?
++ (inode->i_sb->s_blocksize >> 9) : 0;
++
++ return (S_ISLNK(inode->i_mode) &&
++ inode->i_blocks - ea_blocks == 0);
++}
++
+ /* The ext3 forget function must perform a revoke if we are freeing data
+ * which has been journaled. Metadata (eg. indirect blocks) must be
+ * revoked in all cases.
+@@ -48,7 +60,7 @@
+ * still needs to be revoked.
+ */
+
+-static int ext3_forget(handle_t *handle, int is_metadata,
++int ext3_forget(handle_t *handle, int is_metadata,
+ struct inode *inode, struct buffer_head *bh,
+ int blocknr)
+ {
+@@ -164,9 +176,7 @@
+ {
+ handle_t *handle;
+
+- if (is_bad_inode(inode) ||
+- inode->i_ino == EXT3_ACL_IDX_INO ||
+- inode->i_ino == EXT3_ACL_DATA_INO)
++ if (is_bad_inode(inode))
+ goto no_delete;
+
+ lock_kernel();
+@@ -1843,6 +1853,8 @@
+ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ S_ISLNK(inode->i_mode)))
+ return;
++ if (ext3_inode_is_fast_symlink(inode))
++ return;
+ if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+ return;
+
+@@ -1990,8 +2002,6 @@
+ struct ext3_group_desc * gdp;
+
+ if ((inode->i_ino != EXT3_ROOT_INO &&
+- inode->i_ino != EXT3_ACL_IDX_INO &&
+- inode->i_ino != EXT3_ACL_DATA_INO &&
+ inode->i_ino != EXT3_JOURNAL_INO &&
+ inode->i_ino < EXT3_FIRST_INO(inode->i_sb)) ||
+ inode->i_ino > le32_to_cpu(
+@@ -2118,10 +2128,7 @@
+
+ brelse (iloc.bh);
+
+- if (inode->i_ino == EXT3_ACL_IDX_INO ||
+- inode->i_ino == EXT3_ACL_DATA_INO)
+- /* Nothing to do */ ;
+- else if (S_ISREG(inode->i_mode)) {
++ if (S_ISREG(inode->i_mode)) {
+ inode->i_op = &ext3_file_inode_operations;
+ inode->i_fop = &ext3_file_operations;
+ inode->i_mapping->a_ops = &ext3_aops;
+@@ -2129,15 +2136,17 @@
+ inode->i_op = &ext3_dir_inode_operations;
+ inode->i_fop = &ext3_dir_operations;
+ } else if (S_ISLNK(inode->i_mode)) {
+- if (!inode->i_blocks)
++ if (ext3_inode_is_fast_symlink(inode))
+ inode->i_op = &ext3_fast_symlink_inode_operations;
+ else {
+- inode->i_op = &page_symlink_inode_operations;
++ inode->i_op = &ext3_symlink_inode_operations;
+ inode->i_mapping->a_ops = &ext3_aops;
+ }
+- } else
++ } else {
++ inode->i_op = &ext3_special_inode_operations;
+ init_special_inode(inode, inode->i_mode,
+ le32_to_cpu(iloc.raw_inode->i_block[0]));
++ }
+ /* inode->i_attr_flags = 0; unused */
+ if (inode->u.ext3_i.i_flags & EXT3_SYNC_FL) {
+ /* inode->i_attr_flags |= ATTR_FLAG_SYNCRONOUS; unused */
+Index: linux-DRV401/fs/ext3/namei.c
+===================================================================
+--- linux-DRV401.orig/fs/ext3/namei.c 2004-10-15 10:39:16.000000000 -0700
++++ linux-DRV401/fs/ext3/namei.c 2004-10-15 11:03:52.000000000 -0700
+@@ -29,6 +29,7 @@
+ #include <linux/sched.h>
+ #include <linux/ext3_fs.h>
+ #include <linux/ext3_jbd.h>
++#include <linux/ext3_xattr.h>
+ #include <linux/fcntl.h>
+ #include <linux/stat.h>
+ #include <linux/string.h>
+@@ -1612,7 +1613,7 @@
+ if (IS_SYNC(dir))
+ handle->h_sync = 1;
+
+- inode = ext3_new_inode (handle, dir, S_IFDIR);
++ inode = ext3_new_inode (handle, dir, S_IFDIR | mode);
+ err = PTR_ERR(inode);
+ if (IS_ERR(inode))
+ goto out_stop;
+@@ -1620,7 +1621,6 @@
+ inode->i_op = &ext3_dir_inode_operations;
+ inode->i_fop = &ext3_dir_operations;
+ inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
+- inode->i_blocks = 0;
+ dir_block = ext3_bread (handle, inode, 0, 1, &err);
+ if (!dir_block) {
+ inode->i_nlink--; /* is this nlink == 0? */
+@@ -1647,9 +1647,6 @@
+ BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata");
+ ext3_journal_dirty_metadata(handle, dir_block);
+ brelse (dir_block);
+- inode->i_mode = S_IFDIR | mode;
+- if (dir->i_mode & S_ISGID)
+- inode->i_mode |= S_ISGID;
+ ext3_mark_inode_dirty(handle, inode);
+ err = ext3_add_entry (handle, dentry, inode);
+ if (err) {
+@@ -2018,7 +2015,7 @@
+ goto out_stop;
+
+ if (l > sizeof (EXT3_I(inode)->i_data)) {
+- inode->i_op = &page_symlink_inode_operations;
++ inode->i_op = &ext3_symlink_inode_operations;
+ inode->i_mapping->a_ops = &ext3_aops;
+ /*
+ * block_symlink() calls back into ext3_prepare/commit_write.
+@@ -2245,4 +2242,16 @@
+ rmdir: ext3_rmdir, /* BKL held */
+ mknod: ext3_mknod, /* BKL held */
+ rename: ext3_rename, /* BKL held */
++ setxattr: ext3_setxattr, /* BKL held */
++ getxattr: ext3_getxattr, /* BKL held */
++ listxattr: ext3_listxattr, /* BKL held */
++ removexattr: ext3_removexattr, /* BKL held */
+ };
++
++struct inode_operations ext3_special_inode_operations = {
++ setxattr: ext3_setxattr, /* BKL held */
++ getxattr: ext3_getxattr, /* BKL held */
++ listxattr: ext3_listxattr, /* BKL held */
++ removexattr: ext3_removexattr, /* BKL held */
++};
++
+Index: linux-DRV401/fs/ext3/super.c
+===================================================================
+--- linux-DRV401.orig/fs/ext3/super.c 2004-10-15 10:39:16.000000000 -0700
++++ linux-DRV401/fs/ext3/super.c 2004-10-15 11:03:52.000000000 -0700
+@@ -24,6 +24,7 @@
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
+ #include <linux/ext3_jbd.h>
++#include <linux/ext3_xattr.h>
+ #include <linux/slab.h>
+ #include <linux/init.h>
+ #include <linux/locks.h>
+@@ -404,6 +405,7 @@
+ kdev_t j_dev = sbi->s_journal->j_dev;
+ int i;
+
++ ext3_xattr_put_super(sb);
+ journal_destroy(sbi->s_journal);
+ if (!(sb->s_flags & MS_RDONLY)) {
+ EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
+@@ -499,6 +501,7 @@
+ int is_remount)
+ {
+ unsigned long *mount_options = &sbi->s_mount_opt;
++
+ uid_t *resuid = &sbi->s_resuid;
+ gid_t *resgid = &sbi->s_resgid;
+ char * this_char;
+@@ -511,6 +514,13 @@
+ this_char = strtok (NULL, ",")) {
+ if ((value = strchr (this_char, '=')) != NULL)
+ *value++ = 0;
++#ifdef CONFIG_EXT3_FS_XATTR_USER
++ if (!strcmp (this_char, "user_xattr"))
++ set_opt (*mount_options, XATTR_USER);
++ else if (!strcmp (this_char, "nouser_xattr"))
++ clear_opt (*mount_options, XATTR_USER);
++ else
++#endif
+ if (!strcmp (this_char, "bsddf"))
+ clear_opt (*mount_options, MINIX_DF);
+ else if (!strcmp (this_char, "nouid32")) {
+@@ -924,6 +934,12 @@
+ sbi->s_mount_opt = 0;
+ sbi->s_resuid = EXT3_DEF_RESUID;
+ sbi->s_resgid = EXT3_DEF_RESGID;
++
++ /* Default extended attribute flags */
++#ifdef CONFIG_EXT3_FS_XATTR_USER
++ /* set_opt(sbi->s_mount_opt, XATTR_USER); */
++#endif
++
+ if (!parse_options ((char *) data, &sb_block, sbi, &journal_inum, 0)) {
+ sb->s_dev = 0;
+ goto out_fail;
+@@ -1742,12 +1758,27 @@
+
+ static int __init init_ext3_fs(void)
+ {
+- return register_filesystem(&ext3_fs_type);
++ int error = init_ext3_xattr();
++ if (error)
++ return error;
++ error = init_ext3_xattr_user();
++ if (error)
++ goto fail;
++ error = register_filesystem(&ext3_fs_type);
++ if (!error)
++ return 0;
++
++ exit_ext3_xattr_user();
++fail:
++ exit_ext3_xattr();
++ return error;
+ }
+
+ static void __exit exit_ext3_fs(void)
+ {
+ unregister_filesystem(&ext3_fs_type);
++ exit_ext3_xattr_user();
++ exit_ext3_xattr();
+ }
+
+ EXPORT_SYMBOL(ext3_force_commit);
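
The init_ext3_fs() hunk above follows the usual register-and-unwind idiom: each initialization step is undone in reverse order if a later step fails, and module exit repeats the unwind unconditionally. A small standalone model of the same ordering, with illustrative names standing in for the xattr and filesystem registration calls:

/* Userspace model of the init/unwind ordering in init_ext3_fs().
 * register_c() deliberately fails so the unwind path is exercised. */
#include <stdio.h>

static int  init_a(void)     { puts("init a");     return 0; }
static void exit_a(void)     { puts("exit a"); }
static int  init_b(void)     { puts("init b");     return 0; }
static void exit_b(void)     { puts("exit b"); }
static int  register_c(void) { puts("register c"); return -1; }

static int init_all(void)
{
	int error = init_a();
	if (error)
		return error;
	error = init_b();
	if (error)
		goto fail_a;
	error = register_c();
	if (!error)
		return 0;

	exit_b();          /* undo step b, then fall through to undo a */
fail_a:
	exit_a();
	return error;
}

int main(void)
{
	return init_all() ? 1 : 0;
}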
+Index: linux-DRV401/fs/ext3/symlink.c
+===================================================================
+--- linux-DRV401.orig/fs/ext3/symlink.c 2004-10-15 10:24:00.000000000 -0700
++++ linux-DRV401/fs/ext3/symlink.c 2004-10-15 11:03:52.000000000 -0700
+@@ -20,6 +20,7 @@
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
++#include <linux/ext3_xattr.h>
+
+ static int ext3_readlink(struct dentry *dentry, char *buffer, int buflen)
+ {
+@@ -33,7 +34,20 @@
+ return vfs_follow_link(nd, s);
+ }
+
++struct inode_operations ext3_symlink_inode_operations = {
++ readlink: page_readlink, /* BKL not held. Don't need */
++ follow_link: page_follow_link, /* BKL not held. Don't need */
++ setxattr: ext3_setxattr, /* BKL held */
++ getxattr: ext3_getxattr, /* BKL held */
++ listxattr: ext3_listxattr, /* BKL held */
++ removexattr: ext3_removexattr, /* BKL held */
++};
++
+ struct inode_operations ext3_fast_symlink_inode_operations = {
+ readlink: ext3_readlink, /* BKL not held. Don't need */
+ follow_link: ext3_follow_link, /* BKL not held. Don't need */
++ setxattr: ext3_setxattr, /* BKL held */
++ getxattr: ext3_getxattr, /* BKL held */
++ listxattr: ext3_listxattr, /* BKL held */
++ removexattr: ext3_removexattr, /* BKL held */
+ };
+Index: linux-DRV401/fs/ext3/xattr.c
+===================================================================
+--- linux-DRV401.orig/fs/ext3/xattr.c 2004-10-12 08:56:38.404764448 -0700
++++ linux-DRV401/fs/ext3/xattr.c 2004-10-15 11:03:52.000000000 -0700
+@@ -0,0 +1,1225 @@
++/*
++ * linux/fs/ext3/xattr.c
++ *
++ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
++ *
++ * Fix by Harrison Xing <harrison@mountainviewdata.com>.
++ * Ext3 code with a lot of help from Eric Jarman <ejarman@acm.org>.
++ * Extended attributes for symlinks and special files added per
++ * suggestion of Luka Renko <luka.renko@hermes.si>.
++ */
++
++/*
++ * Extended attributes are stored on disk blocks allocated outside of
++ * any inode. The i_file_acl field is then made to point to this allocated
++ * block. If the extended attributes of two inodes are identical, the
++ * inodes may share the same extended attribute block. Such situations
++ * are automatically detected by keeping a cache of recent attribute block
++ * numbers and hashes over the block's contents in memory.
++ *
++ *
++ * Extended attribute block layout:
++ *
++ * +------------------+
++ * | header |
++ * | entry 1 | |
++ * | entry 2 | | growing downwards
++ * | entry 3 | v
++ * | four null bytes |
++ * | . . . |
++ * | value 1 | ^
++ * | value 3 | | growing upwards
++ * | value 2 | |
++ * +------------------+
++ *
++ * The block header is followed by multiple entry descriptors. These entry
++ * descriptors are variable in size, and aligned to EXT3_XATTR_PAD
++ * byte boundaries. The entry descriptors are sorted by attribute name,
++ * so that two extended attribute blocks can be compared efficiently.
++ *
++ * Attribute values are aligned to the end of the block, stored in
++ * no specific order. They are also padded to EXT3_XATTR_PAD byte
++ * boundaries. No additional gaps are left between them.
++ *
++ * Locking strategy
++ * ----------------
++ * The VFS already holds the BKL and the inode->i_sem semaphore when any of
++ * the xattr inode operations are called, so we are guaranteed that only one
++ * process accesses extended attributes of an inode at any time.
++ *
++ * For writing we also grab the ext3_xattr_sem semaphore. This ensures that
++ * only a single process is modifying an extended attribute block, even
++ * if the block is shared among inodes.
++ *
++ * Note for porting to 2.5
++ * -----------------------
++ * The BKL will no longer be held in the xattr inode operations.
++ */
++
++#include <linux/module.h>
++#include <linux/fs.h>
++#include <linux/locks.h>
++#include <linux/slab.h>
++#include <linux/ext3_jbd.h>
++#include <linux/ext3_fs.h>
++#include <linux/ext3_xattr.h>
++#include <linux/mbcache.h>
++#include <linux/quotaops.h>
++#include <asm/semaphore.h>
++#include <linux/compatmac.h>
++
++#define EXT3_EA_USER "user."
++
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0)
++# define mark_buffer_dirty(bh) mark_buffer_dirty(bh, 1)
++#endif
++
++#define HDR(bh) ((struct ext3_xattr_header *)((bh)->b_data))
++#define ENTRY(ptr) ((struct ext3_xattr_entry *)(ptr))
++#define FIRST_ENTRY(bh) ENTRY(HDR(bh)+1)
++#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
++
++#ifdef EXT3_XATTR_DEBUG
++# define ea_idebug(inode, f...) do { \
++ printk(KERN_DEBUG "inode %s:%ld: ", \
++ kdevname(inode->i_dev), inode->i_ino); \
++ printk(f); \
++ printk("\n"); \
++ } while (0)
++# define ea_bdebug(bh, f...) do { \
++ printk(KERN_DEBUG "block %s:%ld: ", \
++ kdevname(bh->b_dev), bh->b_blocknr); \
++ printk(f); \
++ printk("\n"); \
++ } while (0)
++#else
++# define ea_idebug(f...)
++# define ea_bdebug(f...)
++#endif
++
++static int ext3_xattr_set2(handle_t *, struct inode *, struct buffer_head *,
++ struct ext3_xattr_header *);
++
++#ifdef CONFIG_EXT3_FS_XATTR_SHARING
++
++static int ext3_xattr_cache_insert(struct buffer_head *);
++static struct buffer_head *ext3_xattr_cache_find(struct inode *,
++ struct ext3_xattr_header *);
++static void ext3_xattr_cache_remove(struct buffer_head *);
++static void ext3_xattr_rehash(struct ext3_xattr_header *,
++ struct ext3_xattr_entry *);
++
++static struct mb_cache *ext3_xattr_cache;
++
++#else
++# define ext3_xattr_cache_insert(bh) 0
++# define ext3_xattr_cache_find(inode, header) NULL
++# define ext3_xattr_cache_remove(bh) while(0) {}
++# define ext3_xattr_rehash(header, entry) while(0) {}
++#endif
++
++/*
++ * If a file system does not share extended attributes among inodes,
++ * we should not need the ext3_xattr_sem semaphore. However, the
++ * filesystem may still contain shared blocks, so we always take
++ * the lock.
++ */
++
++DECLARE_MUTEX(ext3_xattr_sem);
++
++static inline int
++ext3_xattr_new_block(handle_t *handle, struct inode *inode,
++ int * errp, int force)
++{
++ struct super_block *sb = inode->i_sb;
++ int goal = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
++ EXT3_I(inode)->i_block_group * EXT3_BLOCKS_PER_GROUP(sb);
++
++ /* How can we enforce the allocation? */
++ int block = ext3_new_block(handle, inode, goal, 0, 0, errp);
++#ifdef OLD_QUOTAS
++ if (!*errp)
++ inode->i_blocks += inode->i_sb->s_blocksize >> 9;
++#endif
++ return block;
++}
++
++static inline int
++ext3_xattr_quota_alloc(struct inode *inode, int force)
++{
++ /* How can we enforce the allocation? */
++#ifdef OLD_QUOTAS
++ int error = DQUOT_ALLOC_BLOCK(inode->i_sb, inode, 1);
++ if (!error)
++ inode->i_blocks += inode->i_sb->s_blocksize >> 9;
++#else
++ int error = DQUOT_ALLOC_BLOCK(inode, 1);
++#endif
++ return error;
++}
++
++#ifdef OLD_QUOTAS
++
++static inline void
++ext3_xattr_quota_free(struct inode *inode)
++{
++ DQUOT_FREE_BLOCK(inode->i_sb, inode, 1);
++ inode->i_blocks -= inode->i_sb->s_blocksize >> 9;
++}
++
++static inline void
++ext3_xattr_free_block(handle_t *handle, struct inode * inode,
++ unsigned long block)
++{
++ ext3_free_blocks(handle, inode, block, 1);
++ inode->i_blocks -= inode->i_sb->s_blocksize >> 9;
++}
++
++#else
++# define ext3_xattr_quota_free(inode) \
++ DQUOT_FREE_BLOCK(inode, 1)
++# define ext3_xattr_free_block(handle, inode, block) \
++ ext3_free_blocks(handle, inode, block, 1)
++#endif
++
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,18)
++
++static inline struct buffer_head *
++sb_bread(struct super_block *sb, int block)
++{
++ return bread(sb->s_dev, block, sb->s_blocksize);
++}
++
++static inline struct buffer_head *
++sb_getblk(struct super_block *sb, int block)
++{
++ return getblk(sb->s_dev, block, sb->s_blocksize);
++}
++
++#endif
++
++struct ext3_xattr_handler *ext3_xattr_handlers[EXT3_XATTR_INDEX_MAX];
++rwlock_t ext3_handler_lock = RW_LOCK_UNLOCKED;
++
++int
++ext3_xattr_register(int name_index, struct ext3_xattr_handler *handler)
++{
++ int error = -EINVAL;
++
++ if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) {
++ write_lock(&ext3_handler_lock);
++ if (!ext3_xattr_handlers[name_index-1]) {
++ ext3_xattr_handlers[name_index-1] = handler;
++ error = 0;
++ }
++ write_unlock(&ext3_handler_lock);
++ }
++ return error;
++}
++
++void
++ext3_xattr_unregister(int name_index, struct ext3_xattr_handler *handler)
++{
++	if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) {
++ write_lock(&ext3_handler_lock);
++ ext3_xattr_handlers[name_index-1] = NULL;
++ write_unlock(&ext3_handler_lock);
++ }
++}
++
++static inline const char *
++strcmp_prefix(const char *a, const char *a_prefix)
++{
++ while (*a_prefix && *a == *a_prefix) {
++ a++;
++ a_prefix++;
++ }
++ return *a_prefix ? NULL : a;
++}
++
++/*
++ * Decode the extended attribute name, and translate it into
++ * the name_index and name suffix.
++ */
++static inline struct ext3_xattr_handler *
++ext3_xattr_resolve_name(const char **name)
++{
++ struct ext3_xattr_handler *handler = NULL;
++ int i;
++
++ if (!*name)
++ return NULL;
++ read_lock(&ext3_handler_lock);
++ for (i=0; i<EXT3_XATTR_INDEX_MAX; i++) {
++ if (ext3_xattr_handlers[i]) {
++ const char *n = strcmp_prefix(*name,
++ ext3_xattr_handlers[i]->prefix);
++ if (n) {
++ handler = ext3_xattr_handlers[i];
++ *name = n;
++ break;
++ }
++ }
++ }
++ read_unlock(&ext3_handler_lock);
++ return handler;
++}
++
++static inline struct ext3_xattr_handler *
++ext3_xattr_handler(int name_index)
++{
++ struct ext3_xattr_handler *handler = NULL;
++ if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) {
++ read_lock(&ext3_handler_lock);
++ handler = ext3_xattr_handlers[name_index-1];
++ read_unlock(&ext3_handler_lock);
++ }
++ return handler;
++}
++
++/*
++ * Inode operation getxattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++ssize_t
++ext3_getxattr(struct dentry *dentry, const char *name,
++ void *buffer, size_t size)
++{
++ struct ext3_xattr_handler *handler;
++ struct inode *inode = dentry->d_inode;
++
++ handler = ext3_xattr_resolve_name(&name);
++ if (!handler)
++ return -ENOTSUP;
++ return handler->get(inode, name, buffer, size);
++}
++
++/*
++ * Inode operation listxattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++ssize_t
++ext3_listxattr(struct dentry *dentry, char *buffer, size_t size)
++{
++ return ext3_xattr_list(dentry->d_inode, buffer, size);
++}
++
++/*
++ * Inode operation setxattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++int
++ext3_setxattr(struct dentry *dentry, const char *name,
++ const void *value, size_t size, int flags)
++{
++ struct ext3_xattr_handler *handler;
++ struct inode *inode = dentry->d_inode;
++
++ if (size == 0)
++ value = ""; /* empty EA, do not remove */
++ handler = ext3_xattr_resolve_name(&name);
++ if (!handler)
++ return -ENOTSUP;
++ return handler->set(inode, name, value, size, flags);
++}
++
++/*
++ * Inode operation removexattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++int
++ext3_removexattr(struct dentry *dentry, const char *name)
++{
++ struct ext3_xattr_handler *handler;
++ struct inode *inode = dentry->d_inode;
++
++ handler = ext3_xattr_resolve_name(&name);
++ if (!handler)
++ return -ENOTSUP;
++ return handler->set(inode, name, NULL, 0, XATTR_REPLACE);
++}
++
++/*
++ * ext3_xattr_get()
++ *
++ * Copy an extended attribute into the buffer
++ * provided, or compute the buffer size required.
++ * If buffer is NULL, only the required buffer size is computed.
++ *
++ * Returns a negative error number on failure, or the number of bytes
++ * used / required on success.
++ */
++int
++ext3_xattr_get(struct inode *inode, int name_index, const char *name,
++ void *buffer, size_t buffer_size)
++{
++ struct buffer_head *bh = NULL;
++ struct ext3_xattr_entry *entry;
++ unsigned int block, size;
++ char *end;
++ int name_len, error;
++
++ ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
++ name_index, name, buffer, (long)buffer_size);
++
++ if (name == NULL)
++ return -EINVAL;
++ if (!EXT3_I(inode)->i_file_acl)
++ return -ENOATTR;
++ block = EXT3_I(inode)->i_file_acl;
++ ea_idebug(inode, "reading block %d", block);
++ bh = sb_bread(inode->i_sb, block);
++ if (!bh)
++ return -EIO;
++ ea_bdebug(bh, "b_count=%d, refcount=%d",
++ atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount));
++ end = bh->b_data + bh->b_size;
++ if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
++ HDR(bh)->h_blocks != cpu_to_le32(1)) {
++bad_block: ext3_error(inode->i_sb, "ext3_xattr_get",
++ "inode %ld: bad block %d", inode->i_ino, block);
++ error = -EIO;
++ goto cleanup;
++ }
++ /* find named attribute */
++ name_len = strlen(name);
++
++ error = -ERANGE;
++ if (name_len > 255)
++ goto cleanup;
++ entry = FIRST_ENTRY(bh);
++ while (!IS_LAST_ENTRY(entry)) {
++ struct ext3_xattr_entry *next =
++ EXT3_XATTR_NEXT(entry);
++ if ((char *)next >= end)
++ goto bad_block;
++ if (name_index == entry->e_name_index &&
++ name_len == entry->e_name_len &&
++ memcmp(name, entry->e_name, name_len) == 0)
++ goto found;
++ entry = next;
++ }
++ /* Check the remaining name entries */
++ while (!IS_LAST_ENTRY(entry)) {
++ struct ext3_xattr_entry *next =
++ EXT3_XATTR_NEXT(entry);
++ if ((char *)next >= end)
++ goto bad_block;
++ entry = next;
++ }
++ if (ext3_xattr_cache_insert(bh))
++ ea_idebug(inode, "cache insert failed");
++ error = -ENOATTR;
++ goto cleanup;
++found:
++ /* check the buffer size */
++ if (entry->e_value_block != 0)
++ goto bad_block;
++ size = le32_to_cpu(entry->e_value_size);
++ if (size > inode->i_sb->s_blocksize ||
++ le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize)
++ goto bad_block;
++
++ if (ext3_xattr_cache_insert(bh))
++ ea_idebug(inode, "cache insert failed");
++ if (buffer) {
++ error = -ERANGE;
++ if (size > buffer_size)
++ goto cleanup;
++ /* return value of attribute */
++ memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs),
++ size);
++ }
++ error = size;
++
++cleanup:
++ brelse(bh);
++
++ return error;
++}
++
++/*
++ * ext3_xattr_list()
++ *
++ * Copy a list of attribute names into the buffer
++ * provided, or compute the buffer size required.
++ * If buffer is NULL, only the required buffer size is computed.
++ *
++ * Returns a negative error number on failure, or the number of bytes
++ * used / required on success.
++ */
++int
++ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size)
++{
++ struct buffer_head *bh = NULL;
++ struct ext3_xattr_entry *entry;
++ unsigned int block, size = 0;
++ char *buf, *end;
++ int error;
++
++ ea_idebug(inode, "buffer=%p, buffer_size=%ld",
++ buffer, (long)buffer_size);
++
++ if (!EXT3_I(inode)->i_file_acl)
++ return 0;
++ block = EXT3_I(inode)->i_file_acl;
++ ea_idebug(inode, "reading block %d", block);
++ bh = sb_bread(inode->i_sb, block);
++ if (!bh)
++ return -EIO;
++ ea_bdebug(bh, "b_count=%d, refcount=%d",
++ atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount));
++ end = bh->b_data + bh->b_size;
++ if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
++ HDR(bh)->h_blocks != cpu_to_le32(1)) {
++bad_block: ext3_error(inode->i_sb, "ext3_xattr_list",
++ "inode %ld: bad block %d", inode->i_ino, block);
++ error = -EIO;
++ goto cleanup;
++ }
++ /* compute the size required for the list of attribute names */
++ for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry);
++ entry = EXT3_XATTR_NEXT(entry)) {
++ struct ext3_xattr_handler *handler;
++ struct ext3_xattr_entry *next =
++ EXT3_XATTR_NEXT(entry);
++ if ((char *)next >= end)
++ goto bad_block;
++
++ handler = ext3_xattr_handler(entry->e_name_index);
++ if (handler)
++ size += handler->list(NULL, inode, entry->e_name,
++ entry->e_name_len);
++ }
++
++ if (ext3_xattr_cache_insert(bh))
++ ea_idebug(inode, "cache insert failed");
++ if (!buffer) {
++ error = size;
++ goto cleanup;
++ } else {
++ error = -ERANGE;
++ if (size > buffer_size)
++ goto cleanup;
++ }
++
++ /* list the attribute names */
++ buf = buffer;
++ for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry);
++ entry = EXT3_XATTR_NEXT(entry)) {
++ struct ext3_xattr_handler *handler;
++
++ handler = ext3_xattr_handler(entry->e_name_index);
++ if (handler)
++ buf += handler->list(buf, inode, entry->e_name,
++ entry->e_name_len);
++ }
++ error = size;
++
++cleanup:
++ brelse(bh);
++
++ return error;
++}
++
++/*
++ * If the EXT3_FEATURE_COMPAT_EXT_ATTR feature of this file system is
++ * not set, set it.
++ */
++static void ext3_xattr_update_super_block(handle_t *handle,
++ struct super_block *sb)
++{
++ if (EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR))
++ return;
++
++ lock_super(sb);
++ ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0)
++ EXT3_SB(sb)->s_feature_compat |= EXT3_FEATURE_COMPAT_EXT_ATTR;
++#endif
++ EXT3_SB(sb)->s_es->s_feature_compat |=
++ cpu_to_le32(EXT3_FEATURE_COMPAT_EXT_ATTR);
++ sb->s_dirt = 1;
++ ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
++ unlock_super(sb);
++}
++
++/*
++ * ext3_xattr_set()
++ *
++ * Create, replace or remove an extended attribute for this inode. Buffer
++ * is NULL to remove an existing extended attribute, and non-NULL to
++ * either replace an existing extended attribute, or create a new extended
++ * attribute. The flags XATTR_REPLACE and XATTR_CREATE specify that
++ * the extended attribute must already exist, or must not yet exist,
++ * respectively, prior to the call.
++ *
++ * Returns 0, or a negative error number on failure.
++ */
++int
++ext3_xattr_set(handle_t *handle, struct inode *inode, int name_index,
++ const char *name, const void *value, size_t value_len, int flags)
++{
++ struct super_block *sb = inode->i_sb;
++ struct buffer_head *bh = NULL;
++ struct ext3_xattr_header *header = NULL;
++ struct ext3_xattr_entry *here, *last;
++ unsigned int name_len;
++ int block = EXT3_I(inode)->i_file_acl;
++ int min_offs = sb->s_blocksize, not_found = 1, free, error;
++ char *end;
++
++ /*
++ * header -- Points either into bh, or to a temporarily
++ * allocated buffer.
++ * here -- The named entry found, or the place for inserting, within
++ * the block pointed to by header.
++ * last -- Points right after the last named entry within the block
++ * pointed to by header.
++ * min_offs -- The offset of the first value (values are aligned
++ * towards the end of the block).
++ * end -- Points right after the block pointed to by header.
++ */
++
++ ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld",
++ name_index, name, value, (long)value_len);
++
++ if (IS_RDONLY(inode))
++ return -EROFS;
++ if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
++ return -EPERM;
++ if (value == NULL)
++ value_len = 0;
++ if (name == NULL)
++ return -EINVAL;
++ name_len = strlen(name);
++ if (name_len > 255 || value_len > sb->s_blocksize)
++ return -ERANGE;
++ down(&ext3_xattr_sem);
++
++ if (block) {
++ /* The inode already has an extended attribute block. */
++ bh = sb_bread(sb, block);
++ error = -EIO;
++ if (!bh)
++ goto cleanup;
++ ea_bdebug(bh, "b_count=%d, refcount=%d",
++ atomic_read(&(bh->b_count)),
++ le32_to_cpu(HDR(bh)->h_refcount));
++ header = HDR(bh);
++ end = bh->b_data + bh->b_size;
++ if (header->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
++ header->h_blocks != cpu_to_le32(1)) {
++bad_block: ext3_error(sb, "ext3_xattr_set",
++ "inode %ld: bad block %d", inode->i_ino, block);
++ error = -EIO;
++ goto cleanup;
++ }
++ /* Find the named attribute. */
++ here = FIRST_ENTRY(bh);
++ while (!IS_LAST_ENTRY(here)) {
++ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(here);
++ if ((char *)next >= end)
++ goto bad_block;
++ if (!here->e_value_block && here->e_value_size) {
++ int offs = le16_to_cpu(here->e_value_offs);
++ if (offs < min_offs)
++ min_offs = offs;
++ }
++ not_found = name_index - here->e_name_index;
++ if (!not_found)
++ not_found = name_len - here->e_name_len;
++ if (!not_found)
++ not_found = memcmp(name, here->e_name,name_len);
++ if (not_found <= 0)
++ break;
++ here = next;
++ }
++ last = here;
++ /* We still need to compute min_offs and last. */
++ while (!IS_LAST_ENTRY(last)) {
++ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(last);
++ if ((char *)next >= end)
++ goto bad_block;
++ if (!last->e_value_block && last->e_value_size) {
++ int offs = le16_to_cpu(last->e_value_offs);
++ if (offs < min_offs)
++ min_offs = offs;
++ }
++ last = next;
++ }
++
++ /* Check whether we have enough space left. */
++ free = min_offs - ((char*)last - (char*)header) - sizeof(__u32);
++ } else {
++ /* We will use a new extended attribute block. */
++ free = sb->s_blocksize -
++ sizeof(struct ext3_xattr_header) - sizeof(__u32);
++ here = last = NULL; /* avoid gcc uninitialized warning. */
++ }
++
++ if (not_found) {
++ /* Request to remove a nonexistent attribute? */
++ error = -ENOATTR;
++ if (flags & XATTR_REPLACE)
++ goto cleanup;
++ error = 0;
++ if (value == NULL)
++ goto cleanup;
++ else
++ free -= EXT3_XATTR_LEN(name_len);
++ } else {
++ /* Request to create an existing attribute? */
++ error = -EEXIST;
++ if (flags & XATTR_CREATE)
++ goto cleanup;
++ if (!here->e_value_block && here->e_value_size) {
++ unsigned int size = le32_to_cpu(here->e_value_size);
++
++ if (le16_to_cpu(here->e_value_offs) + size >
++ sb->s_blocksize || size > sb->s_blocksize)
++ goto bad_block;
++ free += EXT3_XATTR_SIZE(size);
++ }
++ }
++ free -= EXT3_XATTR_SIZE(value_len);
++ error = -ENOSPC;
++ if (free < 0)
++ goto cleanup;
++
++ /* Here we know that we can set the new attribute. */
++
++ if (header) {
++ if (header->h_refcount == cpu_to_le32(1)) {
++ ea_bdebug(bh, "modifying in-place");
++ ext3_xattr_cache_remove(bh);
++ error = ext3_journal_get_write_access(handle, bh);
++ if (error)
++ goto cleanup;
++ } else {
++ int offset;
++
++ ea_bdebug(bh, "cloning");
++ header = kmalloc(bh->b_size, GFP_KERNEL);
++ error = -ENOMEM;
++ if (header == NULL)
++ goto cleanup;
++ memcpy(header, HDR(bh), bh->b_size);
++ header->h_refcount = cpu_to_le32(1);
++ offset = (char *)header - bh->b_data;
++ here = ENTRY((char *)here + offset);
++ last = ENTRY((char *)last + offset);
++ }
++ } else {
++ /* Allocate a buffer where we construct the new block. */
++ header = kmalloc(sb->s_blocksize, GFP_KERNEL);
++ error = -ENOMEM;
++ if (header == NULL)
++ goto cleanup;
++ memset(header, 0, sb->s_blocksize);
++ end = (char *)header + sb->s_blocksize;
++ header->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC);
++ header->h_blocks = header->h_refcount = cpu_to_le32(1);
++ last = here = ENTRY(header+1);
++ }
++
++ if (not_found) {
++ /* Insert the new name. */
++ int size = EXT3_XATTR_LEN(name_len);
++ int rest = (char *)last - (char *)here;
++ memmove((char *)here + size, here, rest);
++ memset(here, 0, size);
++ here->e_name_index = name_index;
++ here->e_name_len = name_len;
++ memcpy(here->e_name, name, name_len);
++ } else {
++ /* Remove the old value. */
++ if (!here->e_value_block && here->e_value_size) {
++ char *first_val = (char *)header + min_offs;
++ int offs = le16_to_cpu(here->e_value_offs);
++ char *val = (char *)header + offs;
++ size_t size = EXT3_XATTR_SIZE(
++ le32_to_cpu(here->e_value_size));
++ memmove(first_val + size, first_val, val - first_val);
++ memset(first_val, 0, size);
++ here->e_value_offs = 0;
++ min_offs += size;
++
++ /* Adjust all value offsets. */
++ last = ENTRY(header+1);
++ while (!IS_LAST_ENTRY(last)) {
++ int o = le16_to_cpu(last->e_value_offs);
++ if (!last->e_value_block && o < offs)
++ last->e_value_offs =
++ cpu_to_le16(o + size);
++ last = EXT3_XATTR_NEXT(last);
++ }
++ }
++ if (value == NULL) {
++ /* Remove this attribute. */
++ if (EXT3_XATTR_NEXT(ENTRY(header+1)) == last) {
++ /* This block is now empty. */
++ error = ext3_xattr_set2(handle, inode, bh,NULL);
++ goto cleanup;
++ } else {
++ /* Remove the old name. */
++ int size = EXT3_XATTR_LEN(name_len);
++ last = ENTRY((char *)last - size);
++ memmove(here, (char*)here + size,
++ (char*)last - (char*)here);
++ memset(last, 0, size);
++ }
++ }
++ }
++
++ if (value != NULL) {
++ /* Insert the new value. */
++ here->e_value_size = cpu_to_le32(value_len);
++ if (value_len) {
++ size_t size = EXT3_XATTR_SIZE(value_len);
++ char *val = (char *)header + min_offs - size;
++ here->e_value_offs =
++ cpu_to_le16((char *)val - (char *)header);
++ memset(val + size - EXT3_XATTR_PAD, 0,
++ EXT3_XATTR_PAD); /* Clear the pad bytes. */
++ memcpy(val, value, value_len);
++ }
++ }
++ ext3_xattr_rehash(header, here);
++
++ error = ext3_xattr_set2(handle, inode, bh, header);
++
++cleanup:
++ brelse(bh);
++ if (!(bh && header == HDR(bh)))
++ kfree(header);
++ up(&ext3_xattr_sem);
++
++ return error;
++}
++
++/*
++ * Second half of ext3_xattr_set(): Update the file system.
++ */
++static int
++ext3_xattr_set2(handle_t *handle, struct inode *inode,
++ struct buffer_head *old_bh, struct ext3_xattr_header *header)
++{
++ struct super_block *sb = inode->i_sb;
++ struct buffer_head *new_bh = NULL;
++ int error;
++
++ if (header) {
++ new_bh = ext3_xattr_cache_find(inode, header);
++ if (new_bh) {
++ /*
++ * We found an identical block in the cache.
++ * The old block will be released after updating
++ * the inode.
++ */
++ ea_bdebug(old_bh, "reusing block %ld",
++ new_bh->b_blocknr);
++
++ error = -EDQUOT;
++ if (ext3_xattr_quota_alloc(inode, 1))
++ goto cleanup;
++
++ error = ext3_journal_get_write_access(handle, new_bh);
++ if (error)
++ goto cleanup;
++ HDR(new_bh)->h_refcount = cpu_to_le32(
++ le32_to_cpu(HDR(new_bh)->h_refcount) + 1);
++ ea_bdebug(new_bh, "refcount now=%d",
++ le32_to_cpu(HDR(new_bh)->h_refcount));
++ } else if (old_bh && header == HDR(old_bh)) {
++ /* Keep this block. */
++ new_bh = old_bh;
++ ext3_xattr_cache_insert(new_bh);
++ } else {
++ /* We need to allocate a new block */
++ int force = EXT3_I(inode)->i_file_acl != 0;
++ int block = ext3_xattr_new_block(handle, inode,
++ &error, force);
++ if (error)
++ goto cleanup;
++ ea_idebug(inode, "creating block %d", block);
++
++ new_bh = sb_getblk(sb, block);
++ if (!new_bh) {
++getblk_failed: ext3_xattr_free_block(handle, inode, block);
++ error = -EIO;
++ goto cleanup;
++ }
++ lock_buffer(new_bh);
++ error = ext3_journal_get_create_access(handle, new_bh);
++ if (error) {
++ unlock_buffer(new_bh);
++ goto getblk_failed;
++ }
++ memcpy(new_bh->b_data, header, new_bh->b_size);
++ mark_buffer_uptodate(new_bh, 1);
++ unlock_buffer(new_bh);
++ ext3_xattr_cache_insert(new_bh);
++
++ ext3_xattr_update_super_block(handle, sb);
++ }
++ error = ext3_journal_dirty_metadata(handle, new_bh);
++ if (error)
++ goto cleanup;
++ }
++
++ /* Update the inode. */
++ EXT3_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
++ inode->i_ctime = CURRENT_TIME;
++ ext3_mark_inode_dirty(handle, inode);
++ if (IS_SYNC(inode))
++ handle->h_sync = 1;
++
++ error = 0;
++ if (old_bh && old_bh != new_bh) {
++ /*
++ * If there was an old block, and we are not still using it,
++ * we now release the old block.
++ */
++ unsigned int refcount = le32_to_cpu(HDR(old_bh)->h_refcount);
++
++ error = ext3_journal_get_write_access(handle, old_bh);
++ if (error)
++ goto cleanup;
++ if (refcount == 1) {
++ /* Free the old block. */
++ ea_bdebug(old_bh, "freeing");
++ ext3_xattr_free_block(handle, inode, old_bh->b_blocknr);
++
++			/* ext3_forget() calls bforget() for us, but we
++			   let our caller release old_bh, so we take an
++			   extra buffer reference (get_bh) first. */
++ get_bh(old_bh);
++ ext3_forget(handle, 1, inode, old_bh,old_bh->b_blocknr);
++ } else {
++ /* Decrement the refcount only. */
++ refcount--;
++ HDR(old_bh)->h_refcount = cpu_to_le32(refcount);
++ ext3_xattr_quota_free(inode);
++ ext3_journal_dirty_metadata(handle, old_bh);
++ ea_bdebug(old_bh, "refcount now=%d", refcount);
++ }
++ }
++
++cleanup:
++ if (old_bh != new_bh)
++ brelse(new_bh);
++
++ return error;
++}
++
++/*
++ * ext3_xattr_delete_inode()
++ *
++ * Free extended attribute resources associated with this inode. This
++ * is called immediately before an inode is freed.
++ */
++void
++ext3_xattr_delete_inode(handle_t *handle, struct inode *inode)
++{
++ struct buffer_head *bh;
++ unsigned int block = EXT3_I(inode)->i_file_acl;
++
++ if (!block)
++ return;
++ down(&ext3_xattr_sem);
++
++ bh = sb_bread(inode->i_sb, block);
++ if (!bh) {
++ ext3_error(inode->i_sb, "ext3_xattr_delete_inode",
++ "inode %ld: block %d read error", inode->i_ino, block);
++ goto cleanup;
++ }
++ ea_bdebug(bh, "b_count=%d", atomic_read(&(bh->b_count)));
++ if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
++ HDR(bh)->h_blocks != cpu_to_le32(1)) {
++ ext3_error(inode->i_sb, "ext3_xattr_delete_inode",
++ "inode %ld: bad block %d", inode->i_ino, block);
++ goto cleanup;
++ }
++ ext3_journal_get_write_access(handle, bh);
++ ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount) - 1);
++ if (HDR(bh)->h_refcount == cpu_to_le32(1)) {
++ ext3_xattr_cache_remove(bh);
++ ext3_xattr_free_block(handle, inode, block);
++ ext3_forget(handle, 1, inode, bh, block);
++ bh = NULL;
++ } else {
++ HDR(bh)->h_refcount = cpu_to_le32(
++ le32_to_cpu(HDR(bh)->h_refcount) - 1);
++ ext3_journal_dirty_metadata(handle, bh);
++ if (IS_SYNC(inode))
++ handle->h_sync = 1;
++ ext3_xattr_quota_free(inode);
++ }
++ EXT3_I(inode)->i_file_acl = 0;
++
++cleanup:
++ brelse(bh);
++ up(&ext3_xattr_sem);
++}
++
++/*
++ * ext3_xattr_put_super()
++ *
++ * This is called when a file system is unmounted.
++ */
++void
++ext3_xattr_put_super(struct super_block *sb)
++{
++#ifdef CONFIG_EXT3_FS_XATTR_SHARING
++ mb_cache_shrink(ext3_xattr_cache, sb->s_dev);
++#endif
++}
++
++#ifdef CONFIG_EXT3_FS_XATTR_SHARING
++
++/*
++ * ext3_xattr_cache_insert()
++ *
++ * Create a new entry in the extended attribute cache, and insert
++ * it unless such an entry is already in the cache.
++ *
++ * Returns 0, or a negative error number on failure.
++ */
++static int
++ext3_xattr_cache_insert(struct buffer_head *bh)
++{
++ __u32 hash = le32_to_cpu(HDR(bh)->h_hash);
++ struct mb_cache_entry *ce;
++ int error;
++
++ ce = mb_cache_entry_alloc(ext3_xattr_cache);
++ if (!ce)
++ return -ENOMEM;
++ error = mb_cache_entry_insert(ce, bh->b_dev, bh->b_blocknr, &hash);
++ if (error) {
++ mb_cache_entry_free(ce);
++ if (error == -EBUSY) {
++ ea_bdebug(bh, "already in cache (%d cache entries)",
++ atomic_read(&ext3_xattr_cache->c_entry_count));
++ error = 0;
++ }
++ } else {
++ ea_bdebug(bh, "inserting [%x] (%d cache entries)", (int)hash,
++ atomic_read(&ext3_xattr_cache->c_entry_count));
++ mb_cache_entry_release(ce);
++ }
++ return error;
++}
++
++/*
++ * ext3_xattr_cmp()
++ *
++ * Compare two extended attribute blocks for equality.
++ *
++ * Returns 0 if the blocks are equal, 1 if they differ, and
++ * a negative error number on errors.
++ */
++static int
++ext3_xattr_cmp(struct ext3_xattr_header *header1,
++ struct ext3_xattr_header *header2)
++{
++ struct ext3_xattr_entry *entry1, *entry2;
++
++ entry1 = ENTRY(header1+1);
++ entry2 = ENTRY(header2+1);
++ while (!IS_LAST_ENTRY(entry1)) {
++ if (IS_LAST_ENTRY(entry2))
++ return 1;
++ if (entry1->e_hash != entry2->e_hash ||
++ entry1->e_name_len != entry2->e_name_len ||
++ entry1->e_value_size != entry2->e_value_size ||
++ memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len))
++ return 1;
++ if (entry1->e_value_block != 0 || entry2->e_value_block != 0)
++ return -EIO;
++ if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs),
++ (char *)header2 + le16_to_cpu(entry2->e_value_offs),
++ le32_to_cpu(entry1->e_value_size)))
++ return 1;
++
++ entry1 = EXT3_XATTR_NEXT(entry1);
++ entry2 = EXT3_XATTR_NEXT(entry2);
++ }
++ if (!IS_LAST_ENTRY(entry2))
++ return 1;
++ return 0;
++}
++
++/*
++ * ext3_xattr_cache_find()
++ *
++ * Find an identical extended attribute block.
++ *
++ * Returns a pointer to the block found, or NULL if such a block was
++ * not found or an error occurred.
++ */
++static struct buffer_head *
++ext3_xattr_cache_find(struct inode *inode, struct ext3_xattr_header *header)
++{
++ __u32 hash = le32_to_cpu(header->h_hash);
++ struct mb_cache_entry *ce;
++
++ if (!header->h_hash)
++ return NULL; /* never share */
++ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
++ ce = mb_cache_entry_find_first(ext3_xattr_cache, 0, inode->i_dev, hash);
++ while (ce) {
++ struct buffer_head *bh = sb_bread(inode->i_sb, ce->e_block);
++
++ if (!bh) {
++ ext3_error(inode->i_sb, "ext3_xattr_cache_find",
++ "inode %ld: block %ld read error",
++ inode->i_ino, ce->e_block);
++ } else if (le32_to_cpu(HDR(bh)->h_refcount) >
++ EXT3_XATTR_REFCOUNT_MAX) {
++ ea_idebug(inode, "block %ld refcount %d>%d",ce->e_block,
++ le32_to_cpu(HDR(bh)->h_refcount),
++ EXT3_XATTR_REFCOUNT_MAX);
++ } else if (!ext3_xattr_cmp(header, HDR(bh))) {
++ ea_bdebug(bh, "b_count=%d",atomic_read(&(bh->b_count)));
++ mb_cache_entry_release(ce);
++ return bh;
++ }
++ brelse(bh);
++ ce = mb_cache_entry_find_next(ce, 0, inode->i_dev, hash);
++ }
++ return NULL;
++}
++
++/*
++ * ext3_xattr_cache_remove()
++ *
++ * Remove a block's entry from the cache. Called when a
++ * block becomes invalid.
++ */
++static void
++ext3_xattr_cache_remove(struct buffer_head *bh)
++{
++ struct mb_cache_entry *ce;
++
++ ce = mb_cache_entry_get(ext3_xattr_cache, bh->b_dev, bh->b_blocknr);
++ if (ce) {
++ ea_bdebug(bh, "removing (%d cache entries remaining)",
++ atomic_read(&ext3_xattr_cache->c_entry_count)-1);
++ mb_cache_entry_free(ce);
++ } else
++ ea_bdebug(bh, "no cache entry");
++}
++
++#define NAME_HASH_SHIFT 5
++#define VALUE_HASH_SHIFT 16
++
++/*
++ * ext3_xattr_hash_entry()
++ *
++ * Compute the hash of an extended attribute.
++ */
++static inline void ext3_xattr_hash_entry(struct ext3_xattr_header *header,
++ struct ext3_xattr_entry *entry)
++{
++ __u32 hash = 0;
++ char *name = entry->e_name;
++ int n;
++
++ for (n=0; n < entry->e_name_len; n++) {
++ hash = (hash << NAME_HASH_SHIFT) ^
++ (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^
++ *name++;
++ }
++
++ if (entry->e_value_block == 0 && entry->e_value_size != 0) {
++ __u32 *value = (__u32 *)((char *)header +
++ le16_to_cpu(entry->e_value_offs));
++ for (n = (le32_to_cpu(entry->e_value_size) +
++ EXT3_XATTR_ROUND) >> EXT3_XATTR_PAD_BITS; n; n--) {
++ hash = (hash << VALUE_HASH_SHIFT) ^
++ (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^
++ le32_to_cpu(*value++);
++ }
++ }
++ entry->e_hash = cpu_to_le32(hash);
++}
++
++#undef NAME_HASH_SHIFT
++#undef VALUE_HASH_SHIFT
++
++#define BLOCK_HASH_SHIFT 16
++
++/*
++ * ext3_xattr_rehash()
++ *
++ * Re-compute the extended attribute hash value after an entry has changed.
++ */
++static void ext3_xattr_rehash(struct ext3_xattr_header *header,
++ struct ext3_xattr_entry *entry)
++{
++ struct ext3_xattr_entry *here;
++ __u32 hash = 0;
++
++ ext3_xattr_hash_entry(header, entry);
++ here = ENTRY(header+1);
++ while (!IS_LAST_ENTRY(here)) {
++ if (!here->e_hash) {
++ /* Block is not shared if an entry's hash value == 0 */
++ hash = 0;
++ break;
++ }
++ hash = (hash << BLOCK_HASH_SHIFT) ^
++ (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^
++ le32_to_cpu(here->e_hash);
++ here = EXT3_XATTR_NEXT(here);
++ }
++ header->h_hash = cpu_to_le32(hash);
++}
++
++#undef BLOCK_HASH_SHIFT
++
++int __init
++init_ext3_xattr(void)
++{
++ ext3_xattr_cache = mb_cache_create("ext3_xattr", NULL,
++ sizeof(struct mb_cache_entry) +
++ sizeof(struct mb_cache_entry_index), 1, 61);
++ if (!ext3_xattr_cache)
++ return -ENOMEM;
++
++ return 0;
++}
++
++void
++exit_ext3_xattr(void)
++{
++ if (ext3_xattr_cache)
++ mb_cache_destroy(ext3_xattr_cache);
++ ext3_xattr_cache = NULL;
++}
++
++#else /* CONFIG_EXT3_FS_XATTR_SHARING */
++
++int __init
++init_ext3_xattr(void)
++{
++ return 0;
++}
++
++void
++exit_ext3_xattr(void)
++{
++}
++
++#endif /* CONFIG_EXT3_FS_XATTR_SHARING */
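
Block sharing hinges on the three hash loops above: a per-entry hash over the name (rotate by 5) and value words (rotate by 16), folded into a block-level hash (rotate by 16) that mbcache indexes. The sketch below reimplements just the name-hash step in standalone C to show the rotate-and-xor pattern; it is a model, not the kernel function (for one thing, the kernel xors in plain char, whose signedness is architecture-dependent):

/* Standalone model of the rotate-and-xor hashing in
 * ext3_xattr_hash_entry(). Value words and the block-level fold use
 * the same scheme with a rotate width of 16. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define NAME_HASH_SHIFT 5

static uint32_t name_hash(const char *name, size_t len)
{
	uint32_t hash = 0;
	size_t n;

	for (n = 0; n < len; n++)
		hash = (hash << NAME_HASH_SHIFT) ^
		       (hash >> (8 * sizeof(hash) - NAME_HASH_SHIFT)) ^
		       (unsigned char)name[n];
	return hash;
}

int main(void)
{
	/* Identical names hash identically; different names almost
	 * certainly do not. */
	printf("%08x\n", name_hash("comment", strlen("comment")));
	printf("%08x\n", name_hash("comment", strlen("comment")));
	printf("%08x\n", name_hash("Comment", strlen("Comment")));
	return 0;
}

Identical attribute sets therefore produce identical block hashes, which is what lets ext3_xattr_cache_find() locate shareable blocks with a single mbcache lookup before falling back to a full ext3_xattr_cmp().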
+Index: linux-DRV401/fs/ext3/xattr_user.c
+===================================================================
+--- linux-DRV401.orig/fs/ext3/xattr_user.c 2004-10-12 08:56:38.404764448 -0700
++++ linux-DRV401/fs/ext3/xattr_user.c 2004-10-15 11:03:52.000000000 -0700
+@@ -0,0 +1,111 @@
++/*
++ * linux/fs/ext3/xattr_user.c
++ * Handler for extended user attributes.
++ *
++ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
++ */
++
++#include <linux/module.h>
++#include <linux/string.h>
++#include <linux/fs.h>
++#include <linux/ext3_jbd.h>
++#include <linux/ext3_fs.h>
++#include <linux/ext3_xattr.h>
++
++#ifdef CONFIG_EXT3_FS_POSIX_ACL
++# include <linux/ext3_acl.h>
++#endif
++
++#define XATTR_USER_PREFIX "user."
++
++static size_t
++ext3_xattr_user_list(char *list, struct inode *inode,
++ const char *name, int name_len)
++{
++ const int prefix_len = sizeof(XATTR_USER_PREFIX)-1;
++
++ if (!test_opt(inode->i_sb, XATTR_USER))
++ return 0;
++
++ if (list) {
++ memcpy(list, XATTR_USER_PREFIX, prefix_len);
++ memcpy(list+prefix_len, name, name_len);
++ list[prefix_len + name_len] = '\0';
++ }
++ return prefix_len + name_len + 1;
++}
++
++static int
++ext3_xattr_user_get(struct inode *inode, const char *name,
++ void *buffer, size_t size)
++{
++ int error;
++
++ if (strcmp(name, "") == 0)
++ return -EINVAL;
++ if (!test_opt(inode->i_sb, XATTR_USER))
++ return -ENOTSUP;
++#ifdef CONFIG_EXT3_FS_POSIX_ACL
++ error = ext3_permission_locked(inode, MAY_READ);
++#else
++ error = permission(inode, MAY_READ);
++#endif
++ if (error)
++ return error;
++
++ return ext3_xattr_get(inode, EXT3_XATTR_INDEX_USER, name,
++ buffer, size);
++}
++
++static int
++ext3_xattr_user_set(struct inode *inode, const char *name,
++ const void *value, size_t size, int flags)
++{
++ handle_t *handle;
++ int error;
++
++ if (strcmp(name, "") == 0)
++ return -EINVAL;
++ if (!test_opt(inode->i_sb, XATTR_USER))
++ return -ENOTSUP;
++ if ( !S_ISREG(inode->i_mode) &&
++ (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX))
++ return -EPERM;
++#ifdef CONFIG_EXT3_FS_POSIX_ACL
++ error = ext3_permission_locked(inode, MAY_WRITE);
++#else
++ error = permission(inode, MAY_WRITE);
++#endif
++ if (error)
++ return error;
++
++ handle = ext3_journal_start(inode, EXT3_XATTR_TRANS_BLOCKS);
++ if (IS_ERR(handle))
++ return PTR_ERR(handle);
++ error = ext3_xattr_set(handle, inode, EXT3_XATTR_INDEX_USER, name,
++ value, size, flags);
++ ext3_journal_stop(handle, inode);
++
++ return error;
++}
++
++struct ext3_xattr_handler ext3_xattr_user_handler = {
++ prefix: XATTR_USER_PREFIX,
++ list: ext3_xattr_user_list,
++ get: ext3_xattr_user_get,
++ set: ext3_xattr_user_set,
++};
++
++int __init
++init_ext3_xattr_user(void)
++{
++ return ext3_xattr_register(EXT3_XATTR_INDEX_USER,
++ &ext3_xattr_user_handler);
++}
++
++void
++exit_ext3_xattr_user(void)
++{
++ ext3_xattr_unregister(EXT3_XATTR_INDEX_USER,
++ &ext3_xattr_user_handler);
++}
+Index: linux-DRV401/fs/ext3/ext3-exports.c
+===================================================================
+--- linux-DRV401.orig/fs/ext3/ext3-exports.c 2004-10-12 08:56:38.404764448 -0700
++++ linux-DRV401/fs/ext3/ext3-exports.c 2004-10-15 11:03:52.000000000 -0700
+@@ -0,0 +1,13 @@
++#include <linux/config.h>
++#include <linux/module.h>
++#include <linux/ext3_fs.h>
++#include <linux/ext3_jbd.h>
++#include <linux/ext3_xattr.h>
++
++EXPORT_SYMBOL(ext3_force_commit);
++EXPORT_SYMBOL(ext3_bread);
++EXPORT_SYMBOL(ext3_xattr_register);
++EXPORT_SYMBOL(ext3_xattr_unregister);
++EXPORT_SYMBOL(ext3_xattr_get);
++EXPORT_SYMBOL(ext3_xattr_list);
++EXPORT_SYMBOL(ext3_xattr_set);
+Index: linux-DRV401/fs/mbcache.c
+===================================================================
+--- linux-DRV401.orig/fs/mbcache.c 2004-10-12 08:56:38.404764448 -0700
++++ linux-DRV401/fs/mbcache.c 2004-10-15 11:03:52.000000000 -0700
+@@ -0,0 +1,649 @@
++/*
++ * linux/fs/mbcache.c
++ * (C) 2001-2002 Andreas Gruenbacher, <a.gruenbacher@computer.org>
++ */
++
++/*
++ * Filesystem Meta Information Block Cache (mbcache)
++ *
++ * The mbcache caches blocks of block devices that need to be located
++ * by their device/block number, as well as by other criteria (such
++ * as the block's contents).
++ *
++ * There can only be one cache entry in a cache per device and block number.
++ * Additional indexes need not be unique in this sense. The number of
++ * additional indexes (=other criteria) can be hardwired at compile time
++ * or specified at cache create time.
++ *
++ * Each cache entry is of fixed size. An entry may be `valid' or `invalid'
++ * in the cache. A valid entry is in the main hash tables of the cache,
++ * and may also be in the lru list. An invalid entry is not in any hashes
++ * or lists.
++ *
++ * A valid cache entry is only in the lru list if no handles refer to it.
++ * Invalid cache entries will be freed when the last handle to the cache
++ * entry is released. Entries that cannot be freed immediately are put
++ * back on the lru list.
++ */
++
++#include <linux/kernel.h>
++#include <linux/module.h>
++
++#include <linux/fs.h>
++#include <linux/slab.h>
++#include <linux/sched.h>
++#include <linux/cache_def.h>
++#include <linux/version.h>
++#include <linux/init.h>
++#include <linux/mbcache.h>
++
++
++#ifdef MB_CACHE_DEBUG
++# define mb_debug(f...) do { \
++ printk(KERN_DEBUG f); \
++ printk("\n"); \
++ } while (0)
++#define mb_assert(c) do { if (!(c)) \
++ printk(KERN_ERR "assertion " #c " failed\n"); \
++ } while(0)
++#else
++# define mb_debug(f...) do { } while(0)
++# define mb_assert(c) do { } while(0)
++#endif
++#define mb_error(f...) do { \
++ printk(KERN_ERR f); \
++ printk("\n"); \
++ } while(0)
++
++MODULE_AUTHOR("Andreas Gruenbacher <a.gruenbacher@computer.org>");
++MODULE_DESCRIPTION("Meta block cache (for extended attributes)");
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,0)
++MODULE_LICENSE("GPL");
++#endif
++
++EXPORT_SYMBOL(mb_cache_create);
++EXPORT_SYMBOL(mb_cache_shrink);
++EXPORT_SYMBOL(mb_cache_destroy);
++EXPORT_SYMBOL(mb_cache_entry_alloc);
++EXPORT_SYMBOL(mb_cache_entry_insert);
++EXPORT_SYMBOL(mb_cache_entry_release);
++EXPORT_SYMBOL(mb_cache_entry_takeout);
++EXPORT_SYMBOL(mb_cache_entry_free);
++EXPORT_SYMBOL(mb_cache_entry_dup);
++EXPORT_SYMBOL(mb_cache_entry_get);
++#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
++EXPORT_SYMBOL(mb_cache_entry_find_first);
++EXPORT_SYMBOL(mb_cache_entry_find_next);
++#endif
++
++
++/*
++ * Global data: list of all mbcache's, lru list, and a spinlock for
++ * accessing cache data structures on SMP machines. The lru list is
++ * global across all mbcaches.
++ */
++
++static LIST_HEAD(mb_cache_list);
++static LIST_HEAD(mb_cache_lru_list);
++static spinlock_t mb_cache_spinlock = SPIN_LOCK_UNLOCKED;
++
++static inline int
++mb_cache_indexes(struct mb_cache *cache)
++{
++#ifdef MB_CACHE_INDEXES_COUNT
++ return MB_CACHE_INDEXES_COUNT;
++#else
++ return cache->c_indexes_count;
++#endif
++}
++
++/*
++ * What the mbcache registers as to get shrunk dynamically.
++ */
++
++static void
++mb_cache_memory_pressure(int priority, unsigned int gfp_mask);
++
++static struct cache_definition mb_cache_definition = {
++ "mb_cache",
++ mb_cache_memory_pressure
++};
++
++
++static inline int
++__mb_cache_entry_is_hashed(struct mb_cache_entry *ce)
++{
++ return !list_empty(&ce->e_block_list);
++}
++
++
++static inline void
++__mb_cache_entry_unhash(struct mb_cache_entry *ce)
++{
++ int n;
++
++ if (__mb_cache_entry_is_hashed(ce)) {
++ list_del_init(&ce->e_block_list);
++ for (n=0; n<mb_cache_indexes(ce->e_cache); n++)
++ list_del(&ce->e_indexes[n].o_list);
++ }
++}
++
++
++static inline void
++__mb_cache_entry_forget(struct mb_cache_entry *ce, int gfp_mask)
++{
++ struct mb_cache *cache = ce->e_cache;
++
++ mb_assert(atomic_read(&ce->e_used) == 0);
++ if (cache->c_op.free && cache->c_op.free(ce, gfp_mask)) {
++ /* free failed -- put back on the lru list
++ for freeing later. */
++ spin_lock(&mb_cache_spinlock);
++ list_add(&ce->e_lru_list, &mb_cache_lru_list);
++ spin_unlock(&mb_cache_spinlock);
++ } else {
++ kmem_cache_free(cache->c_entry_cache, ce);
++ atomic_dec(&cache->c_entry_count);
++ }
++}
++
++
++static inline void
++__mb_cache_entry_release_unlock(struct mb_cache_entry *ce)
++{
++ if (atomic_dec_and_test(&ce->e_used)) {
++ if (__mb_cache_entry_is_hashed(ce))
++ list_add_tail(&ce->e_lru_list, &mb_cache_lru_list);
++ else {
++ spin_unlock(&mb_cache_spinlock);
++ __mb_cache_entry_forget(ce, GFP_KERNEL);
++ return;
++ }
++ }
++ spin_unlock(&mb_cache_spinlock);
++}
++
++
++/*
++ * mb_cache_memory_pressure() memory pressure callback
++ *
++ * This function is called by the kernel memory management when memory
++ * gets low.
++ *
++ * @priority: Amount by which to shrink the cache (0 = highest priority)
++ * @gfp_mask: (ignored)
++ */
++static void
++mb_cache_memory_pressure(int priority, unsigned int gfp_mask)
++{
++ LIST_HEAD(free_list);
++ struct list_head *l, *ltmp;
++ int count = 0;
++
++ spin_lock(&mb_cache_spinlock);
++ list_for_each(l, &mb_cache_list) {
++ struct mb_cache *cache =
++ list_entry(l, struct mb_cache, c_cache_list);
++ mb_debug("cache %s (%d)", cache->c_name,
++ atomic_read(&cache->c_entry_count));
++ count += atomic_read(&cache->c_entry_count);
++ }
++ mb_debug("trying to free %d of %d entries",
++ count / (priority ? priority : 1), count);
++ if (priority)
++ count /= priority;
++ while (count-- && !list_empty(&mb_cache_lru_list)) {
++ struct mb_cache_entry *ce =
++ list_entry(mb_cache_lru_list.next,
++ struct mb_cache_entry, e_lru_list);
++ list_del(&ce->e_lru_list);
++ __mb_cache_entry_unhash(ce);
++ list_add_tail(&ce->e_lru_list, &free_list);
++ }
++ spin_unlock(&mb_cache_spinlock);
++ list_for_each_safe(l, ltmp, &free_list) {
++ __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry,
++ e_lru_list), gfp_mask);
++ }
++}
++
++
++/*
++ * mb_cache_create() create a new cache
++ *
++ * All entries in one cache are equal size. Cache entries may be from
++ * multiple devices. If this is the first mbcache created, registers
++ * the cache with kernel memory management. Returns NULL if no more
++ * memory was available.
++ *
++ * @name: name of the cache (informal)
++ * @cache_op: contains the callback called when freeing a cache entry
++ * @entry_size: The size of a cache entry, including
++ * struct mb_cache_entry
++ * @indexes_count: number of additional indexes in the cache. Must equal
++ * MB_CACHE_INDEXES_COUNT if the number of indexes is
++ * hardwired.
++ * @bucket_count: number of hash buckets
++ */
++struct mb_cache *
++mb_cache_create(const char *name, struct mb_cache_op *cache_op,
++ size_t entry_size, int indexes_count, int bucket_count)
++{
++ int m=0, n;
++ struct mb_cache *cache = NULL;
++
++ if(entry_size < sizeof(struct mb_cache_entry) +
++ indexes_count * sizeof(struct mb_cache_entry_index))
++ return NULL;
++
++ MOD_INC_USE_COUNT;
++ cache = kmalloc(sizeof(struct mb_cache) +
++ indexes_count * sizeof(struct list_head), GFP_KERNEL);
++ if (!cache)
++ goto fail;
++ cache->c_name = name;
++ cache->c_op.free = NULL;
++ if (cache_op)
++ cache->c_op.free = cache_op->free;
++ atomic_set(&cache->c_entry_count, 0);
++ cache->c_bucket_count = bucket_count;
++#ifdef MB_CACHE_INDEXES_COUNT
++ mb_assert(indexes_count == MB_CACHE_INDEXES_COUNT);
++#else
++ cache->c_indexes_count = indexes_count;
++#endif
++ cache->c_block_hash = kmalloc(bucket_count * sizeof(struct list_head),
++ GFP_KERNEL);
++ if (!cache->c_block_hash)
++ goto fail;
++ for (n=0; n<bucket_count; n++)
++ INIT_LIST_HEAD(&cache->c_block_hash[n]);
++ for (m=0; m<indexes_count; m++) {
++ cache->c_indexes_hash[m] = kmalloc(bucket_count *
++ sizeof(struct list_head),
++ GFP_KERNEL);
++ if (!cache->c_indexes_hash[m])
++ goto fail;
++ for (n=0; n<bucket_count; n++)
++ INIT_LIST_HEAD(&cache->c_indexes_hash[m][n]);
++ }
++ cache->c_entry_cache = kmem_cache_create(name, entry_size, 0,
++ 0 /*SLAB_POISON | SLAB_RED_ZONE*/, NULL, NULL);
++ if (!cache->c_entry_cache)
++ goto fail;
++
++ spin_lock(&mb_cache_spinlock);
++ list_add(&cache->c_cache_list, &mb_cache_list);
++ spin_unlock(&mb_cache_spinlock);
++ return cache;
++
++fail:
++ if (cache) {
++ while (--m >= 0)
++ kfree(cache->c_indexes_hash[m]);
++ if (cache->c_block_hash)
++ kfree(cache->c_block_hash);
++ kfree(cache);
++ }
++ MOD_DEC_USE_COUNT;
++ return NULL;
++}
++
++
++/*
++ * mb_cache_shrink()
++ *
++ * Removes all cache entries of a device from the cache. All cache entries
++ * currently in use cannot be freed, and thus remain in the cache.
++ *
++ * @cache: which cache to shrink
++ * @dev: which device's cache entries to shrink
++ */
++void
++mb_cache_shrink(struct mb_cache *cache, kdev_t dev)
++{
++ LIST_HEAD(free_list);
++ struct list_head *l, *ltmp;
++
++ spin_lock(&mb_cache_spinlock);
++ list_for_each_safe(l, ltmp, &mb_cache_lru_list) {
++ struct mb_cache_entry *ce =
++ list_entry(l, struct mb_cache_entry, e_lru_list);
++ if (ce->e_dev == dev) {
++ list_del(&ce->e_lru_list);
++ list_add_tail(&ce->e_lru_list, &free_list);
++ __mb_cache_entry_unhash(ce);
++ }
++ }
++ spin_unlock(&mb_cache_spinlock);
++ list_for_each_safe(l, ltmp, &free_list) {
++ __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry,
++ e_lru_list), GFP_KERNEL);
++ }
++}
++
++
++/*
++ * mb_cache_destroy()
++ *
++ * Shrinks the cache to its minimum possible size (hopefully 0 entries),
++ * and then destroys it. If this was the last mbcache, un-registers the
++ * mbcache from kernel memory management.
++ */
++void
++mb_cache_destroy(struct mb_cache *cache)
++{
++ LIST_HEAD(free_list);
++ struct list_head *l, *ltmp;
++ int n;
++
++ spin_lock(&mb_cache_spinlock);
++ list_for_each_safe(l, ltmp, &mb_cache_lru_list) {
++ struct mb_cache_entry *ce =
++ list_entry(l, struct mb_cache_entry, e_lru_list);
++ if (ce->e_cache == cache) {
++ list_del(&ce->e_lru_list);
++ list_add_tail(&ce->e_lru_list, &free_list);
++ __mb_cache_entry_unhash(ce);
++ }
++ }
++ list_del(&cache->c_cache_list);
++ spin_unlock(&mb_cache_spinlock);
++ list_for_each_safe(l, ltmp, &free_list) {
++ __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry,
++ e_lru_list), GFP_KERNEL);
++ }
++
++ if (atomic_read(&cache->c_entry_count) > 0) {
++ mb_error("cache %s: %d orphaned entries",
++ cache->c_name,
++ atomic_read(&cache->c_entry_count));
++ }
++
++#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,3,0))
++ /* We don't have kmem_cache_destroy() in 2.2.x */
++ kmem_cache_shrink(cache->c_entry_cache);
++#else
++ kmem_cache_destroy(cache->c_entry_cache);
++#endif
++ for (n=0; n < mb_cache_indexes(cache); n++)
++ kfree(cache->c_indexes_hash[n]);
++ kfree(cache->c_block_hash);
++ kfree(cache);
++
++ MOD_DEC_USE_COUNT;
++}
++
++
++/*
++ * mb_cache_entry_alloc()
++ *
++ * Allocates a new cache entry. The new entry will not be valid initially,
++ * and thus cannot be looked up yet. It should be filled with data, and
++ * then inserted into the cache using mb_cache_entry_insert(). Returns NULL
++ * if no more memory was available.
++ */
++struct mb_cache_entry *
++mb_cache_entry_alloc(struct mb_cache *cache)
++{
++ struct mb_cache_entry *ce;
++
++ atomic_inc(&cache->c_entry_count);
++ ce = kmem_cache_alloc(cache->c_entry_cache, GFP_KERNEL);
++ if (ce) {
++ INIT_LIST_HEAD(&ce->e_lru_list);
++ INIT_LIST_HEAD(&ce->e_block_list);
++ ce->e_cache = cache;
++ atomic_set(&ce->e_used, 1);
++ }
++ return ce;
++}
++
++
++/*
++ * mb_cache_entry_insert()
++ *
++ * Inserts an entry that was allocated using mb_cache_entry_alloc() into
++ * the cache. After this, the cache entry can be looked up, but is not yet
++ * in the lru list as the caller still holds a handle to it. Returns 0 on
++ * success, or -EBUSY if a cache entry for that device + block exists
++ * already (this may happen after a failed lookup, if another process has
++ * inserted the same cache entry in the meantime).
++ *
++ * @dev: device the cache entry belongs to
++ * @block: block number
++ * @keys: array of additional keys. There must be indexes_count entries
++ * in the array (as specified when creating the cache).
++ */
++int
++mb_cache_entry_insert(struct mb_cache_entry *ce, kdev_t dev,
++ unsigned long block, unsigned int keys[])
++{
++ struct mb_cache *cache = ce->e_cache;
++ unsigned int bucket = (HASHDEV(dev) + block) % cache->c_bucket_count;
++ struct list_head *l;
++ int error = -EBUSY, n;
++
++ spin_lock(&mb_cache_spinlock);
++ list_for_each(l, &cache->c_block_hash[bucket]) {
++ struct mb_cache_entry *ce =
++ list_entry(l, struct mb_cache_entry, e_block_list);
++ if (ce->e_dev == dev && ce->e_block == block)
++ goto out;
++ }
++ __mb_cache_entry_unhash(ce);
++ ce->e_dev = dev;
++ ce->e_block = block;
++ list_add(&ce->e_block_list, &cache->c_block_hash[bucket]);
++ for (n=0; n<mb_cache_indexes(cache); n++) {
++ ce->e_indexes[n].o_key = keys[n];
++ bucket = keys[n] % cache->c_bucket_count;
++ list_add(&ce->e_indexes[n].o_list,
++ &cache->c_indexes_hash[n][bucket]);
++	}
++	error = 0;
++out:
++ spin_unlock(&mb_cache_spinlock);
++ return error;
++}
++
++
++/*
++ * mb_cache_entry_release()
++ *
++ * Release a handle to a cache entry. When the last handle to a cache entry
++ * is released it is either freed (if it is invalid) or otherwise inserted
++ * in to the lru list.
++ */
++void
++mb_cache_entry_release(struct mb_cache_entry *ce)
++{
++ spin_lock(&mb_cache_spinlock);
++ __mb_cache_entry_release_unlock(ce);
++}
++
++
++/*
++ * mb_cache_entry_takeout()
++ *
++ * Take a cache entry out of the cache, making it invalid. The entry can later
++ * be re-inserted using mb_cache_entry_insert(), or released using
++ * mb_cache_entry_release().
++ */
++void
++mb_cache_entry_takeout(struct mb_cache_entry *ce)
++{
++ spin_lock(&mb_cache_spinlock);
++ mb_assert(list_empty(&ce->e_lru_list));
++ __mb_cache_entry_unhash(ce);
++ spin_unlock(&mb_cache_spinlock);
++}
++
++
++/*
++ * mb_cache_entry_free()
++ *
++ * This is equivalent to the sequence mb_cache_entry_takeout() --
++ * mb_cache_entry_release().
++ */
++void
++mb_cache_entry_free(struct mb_cache_entry *ce)
++{
++ spin_lock(&mb_cache_spinlock);
++ mb_assert(list_empty(&ce->e_lru_list));
++ __mb_cache_entry_unhash(ce);
++ __mb_cache_entry_release_unlock(ce);
++}
++
++
++/*
++ * mb_cache_entry_dup()
++ *
++ * Duplicate a handle to a cache entry (does not duplicate the cache entry
++ * itself). After the call, both the old and the new handle must be released.
++ */
++struct mb_cache_entry *
++mb_cache_entry_dup(struct mb_cache_entry *ce)
++{
++ atomic_inc(&ce->e_used);
++ return ce;
++}
++
++
++/*
++ * mb_cache_entry_get()
++ *
++ * Get a cache entry by device / block number. (There can only be one entry
++ * in the cache per device and block.) Returns NULL if no such cache entry
++ * exists.
++ */
++struct mb_cache_entry *
++mb_cache_entry_get(struct mb_cache *cache, kdev_t dev, unsigned long block)
++{
++ unsigned int bucket = (HASHDEV(dev) + block) % cache->c_bucket_count;
++ struct list_head *l;
++ struct mb_cache_entry *ce;
++
++ spin_lock(&mb_cache_spinlock);
++ list_for_each(l, &cache->c_block_hash[bucket]) {
++ ce = list_entry(l, struct mb_cache_entry, e_block_list);
++ if (ce->e_dev == dev && ce->e_block == block) {
++ if (!list_empty(&ce->e_lru_list))
++ list_del_init(&ce->e_lru_list);
++ atomic_inc(&ce->e_used);
++ goto cleanup;
++ }
++ }
++ ce = NULL;
++
++cleanup:
++ spin_unlock(&mb_cache_spinlock);
++ return ce;
++}
++
++#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
++
++static struct mb_cache_entry *
++__mb_cache_entry_find(struct list_head *l, struct list_head *head,
++ int index, kdev_t dev, unsigned int key)
++{
++ while (l != head) {
++ struct mb_cache_entry *ce =
++ list_entry(l, struct mb_cache_entry,
++ e_indexes[index].o_list);
++ if (ce->e_dev == dev && ce->e_indexes[index].o_key == key) {
++ if (!list_empty(&ce->e_lru_list))
++ list_del_init(&ce->e_lru_list);
++ atomic_inc(&ce->e_used);
++ return ce;
++ }
++ l = l->next;
++ }
++ return NULL;
++}
++
++
++/*
++ * mb_cache_entry_find_first()
++ *
++ * Find the first cache entry on a given device with a certain key in
++ * an additional index. Additional matches can be found with
++ * mb_cache_entry_find_next(). Returns NULL if no match was found.
++ *
++ * @cache: the cache to search
++ * @index: the number of the additional index to search (0<=index<indexes_count)
++ * @dev: the device the cache entry should belong to
++ * @key: the key in the index
++ */
++struct mb_cache_entry *
++mb_cache_entry_find_first(struct mb_cache *cache, int index, kdev_t dev,
++ unsigned int key)
++{
++ unsigned int bucket = key % cache->c_bucket_count;
++ struct list_head *l;
++ struct mb_cache_entry *ce;
++
++ mb_assert(index < mb_cache_indexes(cache));
++ spin_lock(&mb_cache_spinlock);
++ l = cache->c_indexes_hash[index][bucket].next;
++ ce = __mb_cache_entry_find(l, &cache->c_indexes_hash[index][bucket],
++ index, dev, key);
++ spin_unlock(&mb_cache_spinlock);
++ return ce;
++}
++
++
++/*
++ * mb_cache_entry_find_next()
++ *
++ * Find the next cache entry on a given device with a certain key in an
++ * additional index. Returns NULL if no match could be found. The previous
++ * entry is automatically released, so that mb_cache_entry_find_next() can
++ * be called like this:
++ *
++ * entry = mb_cache_entry_find_first();
++ * while (entry) {
++ * ...
++ * entry = mb_cache_entry_find_next(entry, ...);
++ * }
++ *
++ * @prev: The previous match
++ * @index: the number of the additional index to search (0<=index<indexes_count)
++ * @dev: the device the cache entry should belong to
++ * @key: the key in the index
++ */
++struct mb_cache_entry *
++mb_cache_entry_find_next(struct mb_cache_entry *prev, int index, kdev_t dev,
++ unsigned int key)
++{
++ struct mb_cache *cache = prev->e_cache;
++ unsigned int bucket = key % cache->c_bucket_count;
++ struct list_head *l;
++ struct mb_cache_entry *ce;
++
++ mb_assert(index < mb_cache_indexes(cache));
++ spin_lock(&mb_cache_spinlock);
++ l = prev->e_indexes[index].o_list.next;
++ ce = __mb_cache_entry_find(l, &cache->c_indexes_hash[index][bucket],
++ index, dev, key);
++ __mb_cache_entry_release_unlock(prev);
++ return ce;
++}
++
++#endif /* !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) */
++
++static int __init init_mbcache(void)
++{
++ register_cache(&mb_cache_definition);
++ return 0;
++}
++
++static void __exit exit_mbcache(void)
++{
++ unregister_cache(&mb_cache_definition);
++}
++
++module_init(init_mbcache)
++module_exit(exit_mbcache)
++
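To make the cache API above concrete, here is a minimal hedged sketch of a
filesystem-side client, modeled on the ext3 xattr usage earlier in this patch.
The names my_cache and my_cache_remember() are illustrative only, not symbols
this patch defines; one additional index is assumed.

	#include <linux/errno.h>
	#include <linux/fs.h>
	#include <linux/mbcache.h>

	static struct mb_cache *my_cache;

	static int my_cache_init(void)
	{
		/* entry size must cover the entry plus one index slot */
		my_cache = mb_cache_create("my_cache", NULL,
					   sizeof(struct mb_cache_entry) +
					   sizeof(struct mb_cache_entry_index),
					   1 /* indexes */, 61 /* buckets */);
		return my_cache ? 0 : -ENOMEM;
	}

	static int my_cache_remember(kdev_t dev, unsigned long block,
				     unsigned int key)
	{
		struct mb_cache_entry *ce;
		unsigned int keys[1] = { key };
		int error;

		ce = mb_cache_entry_alloc(my_cache);
		if (!ce)
			return -ENOMEM;
		error = mb_cache_entry_insert(ce, dev, block, keys);
		/* drop our handle: a hashed entry moves to the lru list,
		 * an unhashed (raced) one is freed */
		mb_cache_entry_release(ce);
		return error == -EBUSY ? 0 : error; /* lost race is harmless */
	}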
+Index: linux-DRV401/fs/xattr.c
+===================================================================
+--- linux-DRV401.orig/fs/xattr.c 2004-10-12 08:56:38.404764448 -0700
++++ linux-DRV401/fs/xattr.c 2004-10-15 11:03:52.000000000 -0700
+@@ -0,0 +1,355 @@
++/*
++ File: fs/xattr.c
++
++ Extended attribute handling.
++
++ Copyright (C) 2001 by Andreas Gruenbacher <a.gruenbacher@computer.org>
++ Copyright (C) 2001 SGI - Silicon Graphics, Inc <linux-xfs@oss.sgi.com>
++ */
++#include <linux/fs.h>
++#include <linux/slab.h>
++#include <linux/vmalloc.h>
++#include <linux/smp_lock.h>
++#include <linux/file.h>
++#include <linux/xattr.h>
++#include <asm/uaccess.h>
++
++/*
++ * Extended attribute memory allocation wrappers, originally
++ * based on the Intermezzo PRESTO_ALLOC/PRESTO_FREE macros.
++ * The vmalloc use here is very uncommon - extended attributes
++ * are supposed to be small chunks of metadata, and it is quite
++ * unusual to have very many extended attributes, so lists tend
++ * to be quite short as well. The 64K upper limit is derived
++ * from the extended attribute size limit used by XFS.
++ * Intentionally allow zero @size for value/list size requests.
++ */
++static void *
++xattr_alloc(size_t size, size_t limit)
++{
++ void *ptr;
++
++ if (size > limit)
++ return ERR_PTR(-E2BIG);
++
++ if (!size) /* size request, no buffer is needed */
++ return NULL;
++ else if (size <= PAGE_SIZE)
++ ptr = kmalloc((unsigned long) size, GFP_KERNEL);
++ else
++ ptr = vmalloc((unsigned long) size);
++ if (!ptr)
++ return ERR_PTR(-ENOMEM);
++ return ptr;
++}
++
++static void
++xattr_free(void *ptr, size_t size)
++{
++ if (!size) /* size request, no buffer was needed */
++ return;
++ else if (size <= PAGE_SIZE)
++ kfree(ptr);
++ else
++ vfree(ptr);
++}
++
++/*
++ * Extended attribute SET operations
++ */
++static long
++setxattr(struct dentry *d, char *name, void *value, size_t size, int flags)
++{
++ int error;
++ void *kvalue;
++ char kname[XATTR_NAME_MAX + 1];
++
++ if (flags & ~(XATTR_CREATE|XATTR_REPLACE))
++ return -EINVAL;
++
++ error = strncpy_from_user(kname, name, sizeof(kname));
++ if (error == 0 || error == sizeof(kname))
++ error = -ERANGE;
++ if (error < 0)
++ return error;
++
++ kvalue = xattr_alloc(size, XATTR_SIZE_MAX);
++ if (IS_ERR(kvalue))
++ return PTR_ERR(kvalue);
++
++ if (size > 0 && copy_from_user(kvalue, value, size)) {
++ xattr_free(kvalue, size);
++ return -EFAULT;
++ }
++
++ error = -EOPNOTSUPP;
++ if (d->d_inode->i_op && d->d_inode->i_op->setxattr) {
++ down(&d->d_inode->i_sem);
++ lock_kernel();
++ error = d->d_inode->i_op->setxattr(d, kname, kvalue, size, flags);
++ unlock_kernel();
++ up(&d->d_inode->i_sem);
++ }
++
++ xattr_free(kvalue, size);
++ return error;
++}
++
++asmlinkage long
++sys_setxattr(char *path, char *name, void *value, size_t size, int flags)
++{
++ struct nameidata nd;
++ int error;
++
++ error = user_path_walk(path, &nd);
++ if (error)
++ return error;
++ error = setxattr(nd.dentry, name, value, size, flags);
++ path_release(&nd);
++ return error;
++}
++
++asmlinkage long
++sys_lsetxattr(char *path, char *name, void *value, size_t size, int flags)
++{
++ struct nameidata nd;
++ int error;
++
++ error = user_path_walk_link(path, &nd);
++ if (error)
++ return error;
++ error = setxattr(nd.dentry, name, value, size, flags);
++ path_release(&nd);
++ return error;
++}
++
++asmlinkage long
++sys_fsetxattr(int fd, char *name, void *value, size_t size, int flags)
++{
++ struct file *f;
++ int error = -EBADF;
++
++ f = fget(fd);
++ if (!f)
++ return error;
++ error = setxattr(f->f_dentry, name, value, size, flags);
++ fput(f);
++ return error;
++}
++
++/*
++ * Extended attribute GET operations
++ */
++static ssize_t
++getxattr(struct dentry *d, char *name, void *value, size_t size)
++{
++ ssize_t error;
++ void *kvalue;
++ char kname[XATTR_NAME_MAX + 1];
++
++ error = strncpy_from_user(kname, name, sizeof(kname));
++ if (error == 0 || error == sizeof(kname))
++ error = -ERANGE;
++ if (error < 0)
++ return error;
++
++ kvalue = xattr_alloc(size, XATTR_SIZE_MAX);
++ if (IS_ERR(kvalue))
++ return PTR_ERR(kvalue);
++
++ error = -EOPNOTSUPP;
++ if (d->d_inode->i_op && d->d_inode->i_op->getxattr) {
++ down(&d->d_inode->i_sem);
++ lock_kernel();
++ error = d->d_inode->i_op->getxattr(d, kname, kvalue, size);
++ unlock_kernel();
++ up(&d->d_inode->i_sem);
++ }
++
++ if (kvalue && error > 0)
++ if (copy_to_user(value, kvalue, error))
++ error = -EFAULT;
++ xattr_free(kvalue, size);
++ return error;
++}
++
++asmlinkage ssize_t
++sys_getxattr(char *path, char *name, void *value, size_t size)
++{
++ struct nameidata nd;
++ ssize_t error;
++
++ error = user_path_walk(path, &nd);
++ if (error)
++ return error;
++ error = getxattr(nd.dentry, name, value, size);
++ path_release(&nd);
++ return error;
++}
++
++asmlinkage ssize_t
++sys_lgetxattr(char *path, char *name, void *value, size_t size)
++{
++ struct nameidata nd;
++ ssize_t error;
++
++ error = user_path_walk_link(path, &nd);
++ if (error)
++ return error;
++ error = getxattr(nd.dentry, name, value, size);
++ path_release(&nd);
++ return error;
++}
++
++asmlinkage ssize_t
++sys_fgetxattr(int fd, char *name, void *value, size_t size)
++{
++ struct file *f;
++ ssize_t error = -EBADF;
++
++ f = fget(fd);
++ if (!f)
++ return error;
++ error = getxattr(f->f_dentry, name, value, size);
++ fput(f);
++ return error;
++}
++
++/*
++ * Extended attribute LIST operations
++ */
++static ssize_t
++listxattr(struct dentry *d, char *list, size_t size)
++{
++ ssize_t error;
++ char *klist;
++
++ klist = (char *)xattr_alloc(size, XATTR_LIST_MAX);
++ if (IS_ERR(klist))
++ return PTR_ERR(klist);
++
++ error = -EOPNOTSUPP;
++ if (d->d_inode->i_op && d->d_inode->i_op->listxattr) {
++ down(&d->d_inode->i_sem);
++ lock_kernel();
++ error = d->d_inode->i_op->listxattr(d, klist, size);
++ unlock_kernel();
++ up(&d->d_inode->i_sem);
++ }
++
++ if (klist && error > 0)
++ if (copy_to_user(list, klist, error))
++ error = -EFAULT;
++ xattr_free(klist, size);
++ return error;
++}
++
++asmlinkage ssize_t
++sys_listxattr(char *path, char *list, size_t size)
++{
++ struct nameidata nd;
++ ssize_t error;
++
++ error = user_path_walk(path, &nd);
++ if (error)
++ return error;
++ error = listxattr(nd.dentry, list, size);
++ path_release(&nd);
++ return error;
++}
++
++asmlinkage ssize_t
++sys_llistxattr(char *path, char *list, size_t size)
++{
++ struct nameidata nd;
++ ssize_t error;
++
++ error = user_path_walk_link(path, &nd);
++ if (error)
++ return error;
++ error = listxattr(nd.dentry, list, size);
++ path_release(&nd);
++ return error;
++}
++
++asmlinkage ssize_t
++sys_flistxattr(int fd, char *list, size_t size)
++{
++ struct file *f;
++ ssize_t error = -EBADF;
++
++ f = fget(fd);
++ if (!f)
++ return error;
++ error = listxattr(f->f_dentry, list, size);
++ fput(f);
++ return error;
++}
++
++/*
++ * Extended attribute REMOVE operations
++ */
++static long
++removexattr(struct dentry *d, char *name)
++{
++ int error;
++ char kname[XATTR_NAME_MAX + 1];
++
++ error = strncpy_from_user(kname, name, sizeof(kname));
++ if (error == 0 || error == sizeof(kname))
++ error = -ERANGE;
++ if (error < 0)
++ return error;
++
++ error = -EOPNOTSUPP;
++ if (d->d_inode->i_op && d->d_inode->i_op->removexattr) {
++ down(&d->d_inode->i_sem);
++ lock_kernel();
++ error = d->d_inode->i_op->removexattr(d, kname);
++ unlock_kernel();
++ up(&d->d_inode->i_sem);
++ }
++ return error;
++}
++
++asmlinkage long
++sys_removexattr(char *path, char *name)
++{
++ struct nameidata nd;
++ int error;
++
++ error = user_path_walk(path, &nd);
++ if (error)
++ return error;
++ error = removexattr(nd.dentry, name);
++ path_release(&nd);
++ return error;
++}
++
++asmlinkage long
++sys_lremovexattr(char *path, char *name)
++{
++ struct nameidata nd;
++ int error;
++
++ error = user_path_walk_link(path, &nd);
++ if (error)
++ return error;
++ error = removexattr(nd.dentry, name);
++ path_release(&nd);
++ return error;
++}
++
++asmlinkage long
++sys_fremovexattr(int fd, char *name)
++{
++ struct file *f;
++ int error = -EBADF;
++
++ f = fget(fd);
++ if (!f)
++ return error;
++ error = removexattr(f->f_dentry, name);
++ fput(f);
++ return error;
++}
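The four syscall families above are what the conventional userspace wrappers
resolve to. A hedged userspace sketch, assuming the usual libattr-style
prototypes (tag_charset() is an illustrative name, and the wrappers themselves
are not part of this patch):

	#include <sys/types.h>
	#include <attr/xattr.h>	/* setxattr()/getxattr(), XATTR_CREATE */

	int tag_charset(const char *path)
	{
		static const char enc[] = "ISO-8859-1";
		char buf[64];
		ssize_t len;

		/* XATTR_CREATE: fail with EEXIST instead of replacing */
		if (setxattr(path, "user.charset", enc, sizeof(enc) - 1,
			     XATTR_CREATE) != 0)
			return -1;

		len = getxattr(path, "user.charset", buf, sizeof(buf));
		return len < 0 ? -1 : 0;
	}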
+Index: linux-DRV401/include/linux/cache_def.h
+===================================================================
+--- linux-DRV401.orig/include/linux/cache_def.h 2004-10-12 08:56:38.404764448 -0700
++++ linux-DRV401/include/linux/cache_def.h 2004-10-15 11:03:52.000000000 -0700
+@@ -0,0 +1,15 @@
++/*
++ * linux/cache_def.h
++ * Handling of caches defined in drivers, filesystems, ...
++ *
++ * Copyright (C) 2002 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
++ */
++
++struct cache_definition {
++ const char *name;
++ void (*shrink)(int, unsigned int);
++ struct list_head link;
++};
++
++extern void register_cache(struct cache_definition *);
++extern void unregister_cache(struct cache_definition *);
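For reference, a hedged sketch of a client of this interface: the shrink
callback receives the reclaim priority (0 means shrink hardest) and the gfp
mask, as passed by shrink_other_caches() in the mm/vmscan.c hunk near the end
of this patch. my_cache_def and my_shrink are illustrative names:

	#include <linux/cache_def.h>

	static void my_shrink(int priority, unsigned int gfp_mask)
	{
		/* free roughly total/priority objects; with priority 0,
		 * try to free everything that is unused */
	}

	static struct cache_definition my_cache_def = {
		name:	"my_cache",
		shrink:	my_shrink,
	};

	/* register_cache(&my_cache_def) at init,
	 * unregister_cache(&my_cache_def) at exit */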
+Index: linux-DRV401/include/linux/errno.h
+===================================================================
+--- linux-DRV401.orig/include/linux/errno.h 2004-10-15 10:26:15.000000000 -0700
++++ linux-DRV401/include/linux/errno.h 2004-10-15 11:03:52.000000000 -0700
+@@ -23,4 +23,8 @@
+
+ #endif
+
++/* Defined for extended attributes */
++#define ENOATTR ENODATA /* No such attribute */
++#define ENOTSUP EOPNOTSUPP /* Operation not supported */
++
+ #endif
+Index: linux-DRV401/include/linux/ext2_fs.h
+===================================================================
+--- linux-DRV401.orig/include/linux/ext2_fs.h 2004-10-15 10:26:11.000000000 -0700
++++ linux-DRV401/include/linux/ext2_fs.h 2004-10-15 11:03:52.000000000 -0700
+@@ -57,8 +57,6 @@
+ */
+ #define EXT2_BAD_INO 1 /* Bad blocks inode */
+ #define EXT2_ROOT_INO 2 /* Root inode */
+-#define EXT2_ACL_IDX_INO 3 /* ACL inode */
+-#define EXT2_ACL_DATA_INO 4 /* ACL inode */
+ #define EXT2_BOOT_LOADER_INO 5 /* Boot loader inode */
+ #define EXT2_UNDEL_DIR_INO 6 /* Undelete directory inode */
+
+@@ -86,7 +84,6 @@
+ #else
+ # define EXT2_BLOCK_SIZE(s) (EXT2_MIN_BLOCK_SIZE << (s)->s_log_block_size)
+ #endif
+-#define EXT2_ACLE_PER_BLOCK(s) (EXT2_BLOCK_SIZE(s) / sizeof (struct ext2_acl_entry))
+ #define EXT2_ADDR_PER_BLOCK(s) (EXT2_BLOCK_SIZE(s) / sizeof (__u32))
+ #ifdef __KERNEL__
+ # define EXT2_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits)
+@@ -121,28 +118,6 @@
+ #endif
+
+ /*
+- * ACL structures
+- */
+-struct ext2_acl_header /* Header of Access Control Lists */
+-{
+- __u32 aclh_size;
+- __u32 aclh_file_count;
+- __u32 aclh_acle_count;
+- __u32 aclh_first_acle;
+-};
+-
+-struct ext2_acl_entry /* Access Control List Entry */
+-{
+- __u32 acle_size;
+- __u16 acle_perms; /* Access permissions */
+- __u16 acle_type; /* Type of entry */
+- __u16 acle_tag; /* User or group identity */
+- __u16 acle_pad1;
+- __u32 acle_next; /* Pointer on next entry for the */
+- /* same inode or on next free entry */
+-};
+-
+-/*
+ * Structure of a blocks group descriptor
+ */
+ struct ext2_group_desc
+@@ -314,6 +289,7 @@
+ #define EXT2_MOUNT_ERRORS_PANIC 0x0040 /* Panic on errors */
+ #define EXT2_MOUNT_MINIX_DF 0x0080 /* Mimics the Minix statfs */
+ #define EXT2_MOUNT_NO_UID32 0x0200 /* Disable 32-bit UIDs */
++#define EXT2_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */
+
+ #define clear_opt(o, opt) o &= ~EXT2_MOUNT_##opt
+ #define set_opt(o, opt) o |= EXT2_MOUNT_##opt
+@@ -397,6 +373,7 @@
+
+ #ifdef __KERNEL__
+ #define EXT2_SB(sb) (&((sb)->u.ext2_sb))
++#define EXT2_I(inode) (&((inode)->u.ext2_i))
+ #else
+ /* Assume that user mode programs are passing in an ext2fs superblock, not
+ * a kernel struct super_block. This will allow us to call the feature-test
+@@ -466,7 +443,7 @@
+ #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008
+ #define EXT2_FEATURE_INCOMPAT_ANY 0xffffffff
+
+-#define EXT2_FEATURE_COMPAT_SUPP 0
++#define EXT2_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR
+ #define EXT2_FEATURE_INCOMPAT_SUPP EXT2_FEATURE_INCOMPAT_FILETYPE
+ #define EXT2_FEATURE_RO_COMPAT_SUPP (EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+ EXT2_FEATURE_RO_COMPAT_LARGE_FILE| \
+@@ -623,8 +600,10 @@
+
+ /* namei.c */
+ extern struct inode_operations ext2_dir_inode_operations;
++extern struct inode_operations ext2_special_inode_operations;
+
+ /* symlink.c */
++extern struct inode_operations ext2_symlink_inode_operations;
+ extern struct inode_operations ext2_fast_symlink_inode_operations;
+
+ #endif /* __KERNEL__ */
+Index: linux-DRV401/include/linux/ext2_xattr.h
+===================================================================
+--- linux-DRV401.orig/include/linux/ext2_xattr.h 2004-10-12 08:56:38.404764448 -0700
++++ linux-DRV401/include/linux/ext2_xattr.h 2004-10-15 11:03:52.000000000 -0700
+@@ -0,0 +1,157 @@
++/*
++ File: linux/ext2_xattr.h
++
++ On-disk format of extended attributes for the ext2 filesystem.
++
++ (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
++*/
++
++#include <linux/config.h>
++#include <linux/init.h>
++#include <linux/xattr.h>
++
++/* Magic value in attribute blocks */
++#define EXT2_XATTR_MAGIC 0xEA020000
++
++/* Maximum number of references to one attribute block */
++#define EXT2_XATTR_REFCOUNT_MAX 1024
++
++/* Name indexes */
++#define EXT2_XATTR_INDEX_MAX 10
++#define EXT2_XATTR_INDEX_USER 1
++#define EXT2_XATTR_INDEX_POSIX_ACL_ACCESS 2
++#define EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT 3
++
++struct ext2_xattr_header {
++ __u32 h_magic; /* magic number for identification */
++ __u32 h_refcount; /* reference count */
++ __u32 h_blocks; /* number of disk blocks used */
++ __u32 h_hash; /* hash value of all attributes */
++ __u32 h_reserved[4]; /* zero right now */
++};
++
++struct ext2_xattr_entry {
++ __u8 e_name_len; /* length of name */
++ __u8 e_name_index; /* attribute name index */
++ __u16 e_value_offs; /* offset in disk block of value */
++ __u32 e_value_block; /* disk block attribute is stored on (n/i) */
++ __u32 e_value_size; /* size of attribute value */
++ __u32 e_hash; /* hash value of name and value */
++ char e_name[0]; /* attribute name */
++};
++
++#define EXT2_XATTR_PAD_BITS 2
++#define EXT2_XATTR_PAD (1<<EXT2_XATTR_PAD_BITS)
++#define EXT2_XATTR_ROUND (EXT2_XATTR_PAD-1)
++#define EXT2_XATTR_LEN(name_len) \
++ (((name_len) + EXT2_XATTR_ROUND + \
++ sizeof(struct ext2_xattr_entry)) & ~EXT2_XATTR_ROUND)
++#define EXT2_XATTR_NEXT(entry) \
++ ( (struct ext2_xattr_entry *)( \
++ (char *)(entry) + EXT2_XATTR_LEN((entry)->e_name_len)) )
++#define EXT2_XATTR_SIZE(size) \
++ (((size) + EXT2_XATTR_ROUND) & ~EXT2_XATTR_ROUND)
++
++#ifdef __KERNEL__
++
++# ifdef CONFIG_EXT2_FS_XATTR
++
++struct ext2_xattr_handler {
++ char *prefix;
++ size_t (*list)(char *list, struct inode *inode, const char *name,
++ int name_len);
++ int (*get)(struct inode *inode, const char *name, void *buffer,
++ size_t size);
++ int (*set)(struct inode *inode, const char *name, const void *buffer,
++ size_t size, int flags);
++};
++
++extern int ext2_xattr_register(int, struct ext2_xattr_handler *);
++extern void ext2_xattr_unregister(int, struct ext2_xattr_handler *);
++
++extern int ext2_setxattr(struct dentry *, const char *, const void *, size_t, int);
++extern ssize_t ext2_getxattr(struct dentry *, const char *, void *, size_t);
++extern ssize_t ext2_listxattr(struct dentry *, char *, size_t);
++extern int ext2_removexattr(struct dentry *, const char *);
++
++extern int ext2_xattr_get(struct inode *, int, const char *, void *, size_t);
++extern int ext2_xattr_list(struct inode *, char *, size_t);
++extern int ext2_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
++
++extern void ext2_xattr_delete_inode(struct inode *);
++extern void ext2_xattr_put_super(struct super_block *);
++
++extern int init_ext2_xattr(void) __init;
++extern void exit_ext2_xattr(void);
++
++# else /* CONFIG_EXT2_FS_XATTR */
++# define ext2_setxattr NULL
++# define ext2_getxattr NULL
++# define ext2_listxattr NULL
++# define ext2_removexattr NULL
++
++static inline int
++ext2_xattr_get(struct inode *inode, int name_index,
++ const char *name, void *buffer, size_t size)
++{
++ return -ENOTSUP;
++}
++
++static inline int
++ext2_xattr_list(struct inode *inode, char *buffer, size_t size)
++{
++ return -ENOTSUP;
++}
++
++static inline int
++ext2_xattr_set(struct inode *inode, int name_index, const char *name,
++ const void *value, size_t size, int flags)
++{
++ return -ENOTSUP;
++}
++
++static inline void
++ext2_xattr_delete_inode(struct inode *inode)
++{
++}
++
++static inline void
++ext2_xattr_put_super(struct super_block *sb)
++{
++}
++
++static inline int
++init_ext2_xattr(void)
++{
++ return 0;
++}
++
++static inline void
++exit_ext2_xattr(void)
++{
++}
++
++# endif /* CONFIG_EXT2_FS_XATTR */
++
++# ifdef CONFIG_EXT2_FS_XATTR_USER
++
++extern int init_ext2_xattr_user(void) __init;
++extern void exit_ext2_xattr_user(void);
++
++# else /* CONFIG_EXT2_FS_XATTR_USER */
++
++static inline int
++init_ext2_xattr_user(void)
++{
++ return 0;
++}
++
++static inline void
++exit_ext2_xattr_user(void)
++{
++}
++
++# endif /* CONFIG_EXT2_FS_XATTR_USER */
++
++#endif /* __KERNEL__ */
++
+Index: linux-DRV401/include/linux/ext3_fs.h
+===================================================================
+--- linux-DRV401.orig/include/linux/ext3_fs.h 2004-10-15 10:39:16.000000000 -0700
++++ linux-DRV401/include/linux/ext3_fs.h 2004-10-15 11:03:52.000000000 -0700
+@@ -63,8 +63,6 @@
+ */
+ #define EXT3_BAD_INO 1 /* Bad blocks inode */
+ #define EXT3_ROOT_INO 2 /* Root inode */
+-#define EXT3_ACL_IDX_INO 3 /* ACL inode */
+-#define EXT3_ACL_DATA_INO 4 /* ACL inode */
+ #define EXT3_BOOT_LOADER_INO 5 /* Boot loader inode */
+ #define EXT3_UNDEL_DIR_INO 6 /* Undelete directory inode */
+ #define EXT3_RESIZE_INO 7 /* Reserved group descriptors inode */
+@@ -94,7 +92,6 @@
+ #else
+ # define EXT3_BLOCK_SIZE(s) (EXT3_MIN_BLOCK_SIZE << (s)->s_log_block_size)
+ #endif
+-#define EXT3_ACLE_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (struct ext3_acl_entry))
+ #define EXT3_ADDR_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (__u32))
+ #ifdef __KERNEL__
+ # define EXT3_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits)
+@@ -129,28 +126,6 @@
+ #endif
+
+ /*
+- * ACL structures
+- */
+-struct ext3_acl_header /* Header of Access Control Lists */
+-{
+- __u32 aclh_size;
+- __u32 aclh_file_count;
+- __u32 aclh_acle_count;
+- __u32 aclh_first_acle;
+-};
+-
+-struct ext3_acl_entry /* Access Control List Entry */
+-{
+- __u32 acle_size;
+- __u16 acle_perms; /* Access permissions */
+- __u16 acle_type; /* Type of entry */
+- __u16 acle_tag; /* User or group identity */
+- __u16 acle_pad1;
+- __u32 acle_next; /* Pointer on next entry for the */
+- /* same inode or on next free entry */
+-};
+-
+-/*
+ * Structure of a blocks group descriptor
+ */
+ struct ext3_group_desc
+@@ -344,6 +319,7 @@
+ #define EXT3_MOUNT_WRITEBACK_DATA 0x0C00 /* No data ordering */
+ #define EXT3_MOUNT_UPDATE_JOURNAL 0x1000 /* Update the journal format */
+ #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */
++#define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */
+
+ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
+ #ifndef _LINUX_EXT2_FS_H
+@@ -520,7 +496,7 @@
+ #define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */
+ #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */
+
+-#define EXT3_FEATURE_COMPAT_SUPP 0
++#define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR
+ #define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \
+ EXT3_FEATURE_INCOMPAT_RECOVER)
+ #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+@@ -703,6 +679,7 @@
+ extern unsigned long ext3_count_free (struct buffer_head *, unsigned);
+
+ /* inode.c */
++extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int);
+ extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *);
+ extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *);
+
+@@ -771,8 +748,10 @@
+
+ /* namei.c */
+ extern struct inode_operations ext3_dir_inode_operations;
++extern struct inode_operations ext3_special_inode_operations;
+
+ /* symlink.c */
++extern struct inode_operations ext3_symlink_inode_operations;
+ extern struct inode_operations ext3_fast_symlink_inode_operations;
+
+
+Index: linux-DRV401/include/linux/ext3_jbd.h
+===================================================================
+--- linux-DRV401.orig/include/linux/ext3_jbd.h 2004-10-15 10:39:16.000000000 -0700
++++ linux-DRV401/include/linux/ext3_jbd.h 2004-10-15 11:03:52.000000000 -0700
+@@ -30,13 +30,19 @@
+
+ #define EXT3_SINGLEDATA_TRANS_BLOCKS 8
+
++/* Extended attribute operations touch at most two data buffers,
++ * two bitmap buffers, and two group summaries. */
++
++#define EXT3_XATTR_TRANS_BLOCKS 8
++
+ /* Define the minimum size for a transaction which modifies data. This
+ * needs to take into account the fact that we may end up modifying two
+ * quota files too (one for the group, one for the user quota). The
+ * superblock only gets updated once, of course, so don't bother
+ * counting that again for the quota updates. */
+
+-#define EXT3_DATA_TRANS_BLOCKS (3 * EXT3_SINGLEDATA_TRANS_BLOCKS - 2)
++#define EXT3_DATA_TRANS_BLOCKS (3 * EXT3_SINGLEDATA_TRANS_BLOCKS + \
++ EXT3_XATTR_TRANS_BLOCKS - 2)
+
+ extern int ext3_writepage_trans_blocks(struct inode *inode);
+
+Index: linux-DRV401/include/linux/ext3_xattr.h
+===================================================================
+--- linux-DRV401.orig/include/linux/ext3_xattr.h 2004-10-12 08:56:38.404764448 -0700
++++ linux-DRV401/include/linux/ext3_xattr.h 2004-10-15 11:03:52.000000000 -0700
+@@ -0,0 +1,157 @@
++/*
++ File: linux/ext3_xattr.h
++
++ On-disk format of extended attributes for the ext3 filesystem.
++
++ (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
++*/
++
++#include <linux/config.h>
++#include <linux/init.h>
++#include <linux/xattr.h>
++
++/* Magic value in attribute blocks */
++#define EXT3_XATTR_MAGIC 0xEA020000
++
++/* Maximum number of references to one attribute block */
++#define EXT3_XATTR_REFCOUNT_MAX 1024
++
++/* Name indexes */
++#define EXT3_XATTR_INDEX_MAX 10
++#define EXT3_XATTR_INDEX_USER 1
++#define EXT3_XATTR_INDEX_POSIX_ACL_ACCESS 2
++#define EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT 3
++
++struct ext3_xattr_header {
++ __u32 h_magic; /* magic number for identification */
++ __u32 h_refcount; /* reference count */
++ __u32 h_blocks; /* number of disk blocks used */
++ __u32 h_hash; /* hash value of all attributes */
++ __u32 h_reserved[4]; /* zero right now */
++};
++
++struct ext3_xattr_entry {
++ __u8 e_name_len; /* length of name */
++ __u8 e_name_index; /* attribute name index */
++ __u16 e_value_offs; /* offset in disk block of value */
++ __u32 e_value_block; /* disk block attribute is stored on (n/i) */
++ __u32 e_value_size; /* size of attribute value */
++ __u32 e_hash; /* hash value of name and value */
++ char e_name[0]; /* attribute name */
++};
++
++#define EXT3_XATTR_PAD_BITS 2
++#define EXT3_XATTR_PAD (1<<EXT3_XATTR_PAD_BITS)
++#define EXT3_XATTR_ROUND (EXT3_XATTR_PAD-1)
++#define EXT3_XATTR_LEN(name_len) \
++ (((name_len) + EXT3_XATTR_ROUND + \
++ sizeof(struct ext3_xattr_entry)) & ~EXT3_XATTR_ROUND)
++#define EXT3_XATTR_NEXT(entry) \
++ ( (struct ext3_xattr_entry *)( \
++ (char *)(entry) + EXT3_XATTR_LEN((entry)->e_name_len)) )
++#define EXT3_XATTR_SIZE(size) \
++ (((size) + EXT3_XATTR_ROUND) & ~EXT3_XATTR_ROUND)
++
++#ifdef __KERNEL__
++
++# ifdef CONFIG_EXT3_FS_XATTR
++
++struct ext3_xattr_handler {
++ char *prefix;
++ size_t (*list)(char *list, struct inode *inode, const char *name,
++ int name_len);
++ int (*get)(struct inode *inode, const char *name, void *buffer,
++ size_t size);
++ int (*set)(struct inode *inode, const char *name, const void *buffer,
++ size_t size, int flags);
++};
++
++extern int ext3_xattr_register(int, struct ext3_xattr_handler *);
++extern void ext3_xattr_unregister(int, struct ext3_xattr_handler *);
++
++extern int ext3_setxattr(struct dentry *, const char *, const void *, size_t, int);
++extern ssize_t ext3_getxattr(struct dentry *, const char *, void *, size_t);
++extern ssize_t ext3_listxattr(struct dentry *, char *, size_t);
++extern int ext3_removexattr(struct dentry *, const char *);
++
++extern int ext3_xattr_get(struct inode *, int, const char *, void *, size_t);
++extern int ext3_xattr_list(struct inode *, char *, size_t);
++extern int ext3_xattr_set(handle_t *handle, struct inode *, int, const char *, const void *, size_t, int);
++
++extern void ext3_xattr_delete_inode(handle_t *, struct inode *);
++extern void ext3_xattr_put_super(struct super_block *);
++
++extern int init_ext3_xattr(void) __init;
++extern void exit_ext3_xattr(void);
++
++# else /* CONFIG_EXT3_FS_XATTR */
++# define ext3_setxattr NULL
++# define ext3_getxattr NULL
++# define ext3_listxattr NULL
++# define ext3_removexattr NULL
++
++static inline int
++ext3_xattr_get(struct inode *inode, int name_index, const char *name,
++ void *buffer, size_t size)
++{
++ return -ENOTSUP;
++}
++
++static inline int
++ext3_xattr_list(struct inode *inode, void *buffer, size_t size)
++{
++ return -ENOTSUP;
++}
++
++static inline int
++ext3_xattr_set(handle_t *handle, struct inode *inode, int name_index,
++ const char *name, const void *value, size_t size, int flags)
++{
++ return -ENOTSUP;
++}
++
++static inline void
++ext3_xattr_delete_inode(handle_t *handle, struct inode *inode)
++{
++}
++
++static inline void
++ext3_xattr_put_super(struct super_block *sb)
++{
++}
++
++static inline int
++init_ext3_xattr(void)
++{
++ return 0;
++}
++
++static inline void
++exit_ext3_xattr(void)
++{
++}
++
++# endif /* CONFIG_EXT3_FS_XATTR */
++
++# ifdef CONFIG_EXT3_FS_XATTR_USER
++
++extern int init_ext3_xattr_user(void) __init;
++extern void exit_ext3_xattr_user(void);
++
++# else /* CONFIG_EXT3_FS_XATTR_USER */
++
++static inline int
++init_ext3_xattr_user(void)
++{
++ return 0;
++}
++
++static inline void
++exit_ext3_xattr_user(void)
++{
++}
++
++#endif /* CONFIG_EXT3_FS_XATTR_USER */
++
++#endif /* __KERNEL__ */
++
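As a quick worked example of the layout macros above (assuming the 16-byte
struct ext3_xattr_entry shown; the name "charset" is illustrative): a
7-character name needs EXT3_XATTR_LEN(7) = (7 + 3 + 16) & ~3 = 24 bytes of
entry space, and a 10-byte value is padded to EXT3_XATTR_SIZE(10) =
(10 + 3) & ~3 = 12 bytes, both rounded to the 4-byte EXT3_XATTR_PAD boundary.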
+Index: linux-DRV401/include/linux/fs.h
+===================================================================
+--- linux-DRV401.orig/include/linux/fs.h 2004-10-15 10:39:15.000000000 -0700
++++ linux-DRV401/include/linux/fs.h 2004-10-15 11:03:52.000000000 -0700
+@@ -936,6 +936,10 @@
+ int (*setattr) (struct dentry *, struct iattr *);
+ int (*setattr_raw) (struct inode *, struct iattr *);
+ int (*getattr) (struct dentry *, struct iattr *);
++ int (*setxattr) (struct dentry *, const char *, const void *, size_t, int);
++ ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
++ ssize_t (*listxattr) (struct dentry *, char *, size_t);
++ int (*removexattr) (struct dentry *, const char *);
+ };
+
+ struct seq_file;
+Index: linux-DRV401/include/linux/mbcache.h
+===================================================================
+--- linux-DRV401.orig/include/linux/mbcache.h 2004-10-12 08:56:38.404764448 -0700
++++ linux-DRV401/include/linux/mbcache.h 2004-10-15 11:03:52.000000000 -0700
+@@ -0,0 +1,69 @@
++/*
++ File: linux/mbcache.h
++
++ (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
++*/
++
++/* Hardwire the number of additional indexes */
++#define MB_CACHE_INDEXES_COUNT 1
++
++struct mb_cache_entry;
++
++struct mb_cache_op {
++ int (*free)(struct mb_cache_entry *, int);
++};
++
++struct mb_cache {
++ struct list_head c_cache_list;
++ const char *c_name;
++ struct mb_cache_op c_op;
++ atomic_t c_entry_count;
++ int c_bucket_count;
++#ifndef MB_CACHE_INDEXES_COUNT
++ int c_indexes_count;
++#endif
++ kmem_cache_t *c_entry_cache;
++ struct list_head *c_block_hash;
++ struct list_head *c_indexes_hash[0];
++};
++
++struct mb_cache_entry_index {
++ struct list_head o_list;
++ unsigned int o_key;
++};
++
++struct mb_cache_entry {
++ struct list_head e_lru_list;
++ struct mb_cache *e_cache;
++ atomic_t e_used;
++ kdev_t e_dev;
++ unsigned long e_block;
++ struct list_head e_block_list;
++ struct mb_cache_entry_index e_indexes[0];
++};
++
++/* Functions on caches */
++
++struct mb_cache * mb_cache_create(const char *, struct mb_cache_op *, size_t,
++ int, int);
++void mb_cache_shrink(struct mb_cache *, kdev_t);
++void mb_cache_destroy(struct mb_cache *);
++
++/* Functions on cache entries */
++
++struct mb_cache_entry *mb_cache_entry_alloc(struct mb_cache *);
++int mb_cache_entry_insert(struct mb_cache_entry *, kdev_t, unsigned long,
++ unsigned int[]);
++void mb_cache_entry_rehash(struct mb_cache_entry *, unsigned int[]);
++void mb_cache_entry_release(struct mb_cache_entry *);
++void mb_cache_entry_takeout(struct mb_cache_entry *);
++void mb_cache_entry_free(struct mb_cache_entry *);
++struct mb_cache_entry *mb_cache_entry_dup(struct mb_cache_entry *);
++struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *, kdev_t,
++ unsigned long);
++#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
++struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache, int,
++ kdev_t, unsigned int);
++struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache_entry *, int,
++ kdev_t, unsigned int);
++#endif
+Index: linux-DRV401/include/linux/xattr.h
+===================================================================
+--- linux-DRV401.orig/include/linux/xattr.h 2004-10-12 08:56:38.404764448 -0700
++++ linux-DRV401/include/linux/xattr.h 2004-10-15 11:03:52.000000000 -0700
+@@ -0,0 +1,15 @@
++/*
++ File: linux/xattr.h
++
++ Extended attributes handling.
++
++ Copyright (C) 2001 by Andreas Gruenbacher <a.gruenbacher@computer.org>
++ Copyright (c) 2001-2002 Silicon Graphics, Inc. All Rights Reserved.
++*/
++#ifndef _LINUX_XATTR_H
++#define _LINUX_XATTR_H
++
++#define XATTR_CREATE 0x1 /* set the value, fail if attr already exists */
++#define XATTR_REPLACE 0x2 /* set the value, fail if attr does not exist */
++
++#endif /* _LINUX_XATTR_H */
+Index: linux-DRV401/include/linux/limits.h
+===================================================================
+--- linux-DRV401.orig/include/linux/limits.h 2004-10-15 10:26:20.000000000 -0700
++++ linux-DRV401/include/linux/limits.h 2004-10-15 11:03:52.000000000 -0700
+@@ -13,6 +13,9 @@
+ #define NAME_MAX 255 /* # chars in a file name */
+ #define PATH_MAX 4096 /* # chars in a path name including nul */
+ #define PIPE_BUF 4096 /* # bytes in atomic write to a pipe */
++#define XATTR_NAME_MAX 255 /* # chars in an extended attribute name */
++#define XATTR_SIZE_MAX 65536 /* size of an extended attribute value (64k) */
++#define XATTR_LIST_MAX 65536 /* size of extended attribute namelist (64k) */
+
+ #define RTSIG_MAX 32
+
+Index: linux-DRV401/kernel/ksyms.c
+===================================================================
+--- linux-DRV401.orig/kernel/ksyms.c 2004-10-15 10:39:15.000000000 -0700
++++ linux-DRV401/kernel/ksyms.c 2004-10-15 11:03:52.000000000 -0700
+@@ -11,6 +11,7 @@
+
+ #include <linux/config.h>
+ #include <linux/slab.h>
++#include <linux/cache_def.h>
+ #include <linux/module.h>
+ #include <linux/blkdev.h>
+ #include <linux/cdrom.h>
+@@ -88,6 +89,7 @@
+ EXPORT_SYMBOL(exit_files);
+ EXPORT_SYMBOL(exit_fs);
+ EXPORT_SYMBOL(exit_sighand);
++EXPORT_SYMBOL(copy_fs_struct);
+ EXPORT_SYMBOL(unshare_files);
+
+ /* internal kernel memory management */
+@@ -105,6 +107,8 @@
+ EXPORT_SYMBOL(kmem_cache_shrink);
+ EXPORT_SYMBOL(kmem_cache_alloc);
+ EXPORT_SYMBOL(kmem_cache_free);
++EXPORT_SYMBOL(register_cache);
++EXPORT_SYMBOL(unregister_cache);
+ EXPORT_SYMBOL(kmalloc);
+ EXPORT_SYMBOL(kfree);
+ EXPORT_SYMBOL(vfree);
+Index: linux-DRV401/mm/vmscan.c
+===================================================================
+--- linux-DRV401.orig/mm/vmscan.c 2004-10-15 10:24:07.000000000 -0700
++++ linux-DRV401/mm/vmscan.c 2004-10-15 11:08:53.000000000 -0700
+@@ -15,6 +15,7 @@
+ #include <linux/kernel_stat.h>
+ #include <linux/swap.h>
+ #include <linux/swapctl.h>
++#include <linux/cache_def.h>
+ #include <linux/smp_lock.h>
+ #include <linux/pagemap.h>
+ #include <linux/init.h>
+@@ -31,6 +32,39 @@
+ */
+ #define DEF_PRIORITY (6)
+
++static DECLARE_MUTEX(other_caches_sem);
++static LIST_HEAD(cache_definitions);
++
++void register_cache(struct cache_definition *cache)
++{
++ down(&other_caches_sem);
++ list_add(&cache->link, &cache_definitions);
++ up(&other_caches_sem);
++}
++
++void unregister_cache(struct cache_definition *cache)
++{
++ down(&other_caches_sem);
++ list_del(&cache->link);
++ up(&other_caches_sem);
++}
++
++static void shrink_other_caches(unsigned int priority, int gfp_mask)
++{
++ struct list_head *p;
++
++ if (down_trylock(&other_caches_sem))
++ return;
++
++ list_for_each_prev(p, &cache_definitions) {
++ struct cache_definition *cache =
++ list_entry(p, struct cache_definition, link);
++
++ cache->shrink(priority, gfp_mask);
++ }
++ up(&other_caches_sem);
++}
++
+ /*
+ * The swap-out function returns 1 if it successfully
+ * scanned all the pages it was asked to (`count').
+@@ -584,6 +618,7 @@
+
+ shrink_dcache_memory(priority, gfp_mask);
+ shrink_icache_memory(priority, gfp_mask);
++ shrink_other_caches(priority, gfp_mask);
+ #ifdef CONFIG_QUOTA
+ shrink_dqcache_memory(DEF_PRIORITY, gfp_mask);
+ #endif
--- /dev/null
+ Documentation/Configure.help | 66 ++
+ arch/ia64/defconfig | 7
+ fs/Config.in | 14
+ fs/Makefile | 3
+ fs/ext2/Makefile | 4
+ fs/ext2/file.c | 5
+ fs/ext2/ialloc.c | 2
+ fs/ext2/inode.c | 34 -
+ fs/ext2/namei.c | 14
+ fs/ext2/super.c | 29
+ fs/ext2/symlink.c | 14
+ fs/ext2/xattr.c | 1212 +++++++++++++++++++++++++++++++++++++++++
+ fs/ext2/xattr_user.c | 103 +++
+ fs/ext3/Makefile | 9
+ fs/ext3/ext3-exports.c | 13
+ fs/ext3/file.c | 5
+ fs/ext3/ialloc.c | 2
+ fs/ext3/inode.c | 35 -
+ fs/ext3/namei.c | 21
+ fs/ext3/super.c | 36 +
+ fs/ext3/symlink.c | 14
+ fs/ext3/xattr.c | 1225 ++++++++++++++++++++++++++++++++++++++++++
+ fs/ext3/xattr_user.c | 111 +++
+ fs/jfs/jfs_xattr.h | 6
+ fs/jfs/xattr.c | 6
+ fs/mbcache.c | 648 ++++++++++++++++++++++
+ include/linux/cache_def.h | 15
+ include/linux/errno.h | 4
+ include/linux/ext2_fs.h | 31 -
+ include/linux/ext2_xattr.h | 157 +++++
+ include/linux/ext3_fs.h | 31 -
+ include/linux/ext3_jbd.h | 8
+ include/linux/ext3_xattr.h | 157 +++++
+ include/linux/fs.h | 2
+ include/linux/mbcache.h | 69 ++
+ kernel/ksyms.c | 4
+ mm/vmscan.c | 35 +
+ 62 files changed, 4343 insertions(+), 182 deletions(-)
+
+Index: linux-2.4.19.SuSE/Documentation/Configure.help
+===================================================================
+--- linux-2.4.19.SuSE.orig/Documentation/Configure.help 2004-05-03 11:20:17.000000000 -0700
++++ linux-2.4.19.SuSE/Documentation/Configure.help 2004-05-03 11:50:22.000000000 -0700
+@@ -15296,6 +15296,39 @@
+
+ If unsure, say N.
+
++Ext2 extended attributes
++CONFIG_EXT2_FS_XATTR
++ Extended attributes are name:value pairs associated with inodes by
++ the kernel or by users (see the attr(5) manual page, or visit
++ <http://acl.bestbits.at/> for details).
++
++ If unsure, say N.
++
++Ext2 extended attribute block sharing
++CONFIG_EXT2_FS_XATTR_SHARING
++ This option enables code for sharing identical extended attribute
++ blocks among multiple inodes.
++
++ Usually, say Y.
++
++Ext2 extended user attributes
++CONFIG_EXT2_FS_XATTR_USER
++ This option enables extended user attributes on ext2. Processes can
++ associate extended user attributes with inodes to store additional
++ information such as the character encoding of files, etc. (see the
++ attr(5) manual page, or visit <http://acl.bestbits.at/> for details).
++
++ If unsure, say N.
++
++Ext2 trusted extended attributes
++CONFIG_EXT2_FS_XATTR_TRUSTED
++ This option enables extended attributes on ext2 that are accessible
++ (and visible) only to users capable of CAP_SYS_ADMIN. Usually this
++ is only the super user. Trusted extended attributes are meant for
++ implementing system/security services.
++
++ If unsure, say N.
++
+ Ext3 journalling file system support (EXPERIMENTAL)
+ CONFIG_EXT3_FS
+ This is the journalling version of the Second extended file system
+@@ -15354,6 +15387,39 @@
+
+ If unsure, say N.
+
++Ext3 extended attributes
++CONFIG_EXT3_FS_XATTR
++ Extended attributes are name:value pairs associated with inodes by
++ the kernel or by users (see the attr(5) manual page, or visit
++ <http://acl.bestbits.at/> for details).
++
++ If unsure, say N.
++
++Ext3 extended attribute block sharing
++CONFIG_EXT3_FS_XATTR_SHARING
++ This option enables code for sharing identical extended attribute
++ blocks among multiple inodes.
++
++ Usually, say Y.
++
++Ext3 extended user attributes
++CONFIG_EXT3_FS_XATTR_USER
++ This option enables extended user attributes on ext3. Processes can
++ associate extended user attributes with inodes to store additional
++ information such as the character encoding of files, etc. (see the
++ attr(5) manual page, or visit <http://acl.bestbits.at/> for details).
++
++ If unsure, say N.
++
++Ext3 trusted extended attributes
++CONFIG_EXT3_FS_XATTR_TRUSTED
++ This option enables extended attributes on ext3 that are accessible
++ (and visible) only to users capable of CAP_SYS_ADMIN. Usually this
++ is only the super user. Trusted extended attributes are meant for
++ implementing system/security services.
++
++ If unsure, say N.
++
+ Journal Block Device support (JBD for ext3) (EXPERIMENTAL)
+ CONFIG_JBD
+ This is a generic journalling layer for block devices. It is
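These help texts describe the user-visible feature: with the *_XATTR_USER options enabled,
ordinary processes can attach user.* attributes to files. A small userspace sketch using
the attr package's <attr/xattr.h> API mentioned in attr(5); the file name and attribute
here are made-up examples:

    #include <attr/xattr.h>
    #include <stdio.h>

    int main(void)
    {
            char value[64];
            ssize_t len;

            if (setxattr("somefile", "user.charset", "utf-8", 5, 0) != 0)
                    perror("setxattr");

            len = getxattr("somefile", "user.charset", value, sizeof(value) - 1);
            if (len < 0) {
                    perror("getxattr");     /* ENOATTR when the attribute is unset */
            } else {
                    value[len] = '\0';
                    printf("user.charset = %s\n", value);
            }
            return 0;
    }

ENOATTR and ENOTSUP are the errno values the include/linux/errno.h hunk further below maps
onto ENODATA and EOPNOTSUPP.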
+Index: linux-2.4.19.SuSE/arch/ia64/defconfig
+===================================================================
+--- linux-2.4.19.SuSE.orig/arch/ia64/defconfig 2004-05-03 11:19:10.000000000 -0700
++++ linux-2.4.19.SuSE/arch/ia64/defconfig 2004-05-03 11:50:22.000000000 -0700
+@@ -1,6 +1,13 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++CONFIG_EXT3_FS_XATTR=y
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+
+ #
+ # Code maturity level options
+Index: linux-2.4.19.SuSE/fs/Config.in
+===================================================================
+--- linux-2.4.19.SuSE.orig/fs/Config.in 2004-05-03 11:18:52.000000000 -0700
++++ linux-2.4.19.SuSE/fs/Config.in 2004-05-03 11:50:22.000000000 -0700
+@@ -203,6 +203,10 @@
+ #tristate 'Meta block cache' CONFIG_FS_MBCACHE
+ define_tristate CONFIG_FS_MBCACHE y
+
++# Meta block cache for Extended Attributes (ext2/ext3)
++#tristate 'Meta block cache' CONFIG_FS_MBCACHE
++define_tristate CONFIG_FS_MBCACHE y
++
+ mainmenu_option next_comment
+ comment 'Partition Types'
+ source fs/partitions/Config.in
+Index: linux-2.4.19.SuSE/fs/Makefile
+===================================================================
+--- linux-2.4.19.SuSE.orig/fs/Makefile 2004-05-03 11:22:49.000000000 -0700
++++ linux-2.4.19.SuSE/fs/Makefile 2004-05-03 11:50:22.000000000 -0700
+@@ -104,6 +104,8 @@
+ obj-$(CONFIG_FS_MBCACHE) += mbcache.o
+ obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o
+ 
++export-objs += mbcache.o
++
+ # persistent filesystems
+ obj-y += $(join $(subdir-y),$(subdir-y:%=/%.o))
+
+Index: linux-2.4.19.SuSE/fs/ext2/Makefile
+===================================================================
+--- linux-2.4.19.SuSE.orig/fs/ext2/Makefile 2004-05-03 11:18:46.000000000 -0700
++++ linux-2.4.19.SuSE/fs/ext2/Makefile 2004-05-03 11:50:22.000000000 -0700
+@@ -18,4 +18,7 @@
+ obj-$(CONFIG_EXT2_FS_XATTR_USER) += xattr_user.o
+ obj-$(CONFIG_EXT2_FS_POSIX_ACL) += acl.o
+ 
++export-objs += xattr.o
++obj-$(CONFIG_EXT2_FS_XATTR) += xattr.o
++
+ include $(TOPDIR)/Rules.make
+Index: linux-2.4.19.SuSE/fs/ext2/inode.c
+===================================================================
+--- linux-2.4.19.SuSE.orig/fs/ext2/inode.c 2004-05-03 11:18:47.000000000 -0700
++++ linux-2.4.19.SuSE/fs/ext2/inode.c 2004-05-03 11:50:22.000000000 -0700
+@@ -52,6 +52,18 @@
+ }
+
+ /*
++ * Test whether an inode is a fast symlink.
++ */
++static inline int ext2_inode_is_fast_symlink(struct inode *inode)
++{
++ int ea_blocks = inode->u.ext2_i.i_file_acl ?
++ (inode->i_sb->s_blocksize >> 9) : 0;
++
++ return (S_ISLNK(inode->i_mode) &&
++ inode->i_blocks - ea_blocks == 0);
++}
++
++/*
+ * Called at each iput()
+ */
+ void ext2_put_inode (struct inode * inode)
+Index: linux-2.4.19.SuSE/fs/ext2/super.c
+===================================================================
+--- linux-2.4.19.SuSE.orig/fs/ext2/super.c 2004-05-03 11:18:47.000000000 -0700
++++ linux-2.4.19.SuSE/fs/ext2/super.c 2004-05-03 11:50:22.000000000 -0700
+@@ -70,6 +70,7 @@
+ {
+ va_list args;
+
++ ext2_xattr_put_super(sb);
+ if (!(sb->s_flags & MS_RDONLY)) {
+ sb->u.ext2_sb.s_mount_state |= EXT2_ERROR_FS;
+ sb->u.ext2_sb.s_es->s_state =
+Index: linux-2.4.19.SuSE/fs/ext3/ext3-exports.c
+===================================================================
+--- linux-2.4.19.SuSE.orig/fs/ext3/ext3-exports.c 2004-02-18 07:26:44.000000000 -0800
++++ linux-2.4.19.SuSE/fs/ext3/ext3-exports.c 2004-05-03 11:50:22.000000000 -0700
+@@ -0,0 +1,13 @@
++#include <linux/config.h>
++#include <linux/module.h>
++#include <linux/ext3_fs.h>
++#include <linux/ext3_jbd.h>
++#include <linux/ext3_xattr.h>
++
++EXPORT_SYMBOL(ext3_force_commit);
++EXPORT_SYMBOL(ext3_bread);
++EXPORT_SYMBOL(ext3_xattr_register);
++EXPORT_SYMBOL(ext3_xattr_unregister);
++EXPORT_SYMBOL(ext3_xattr_get);
++EXPORT_SYMBOL(ext3_xattr_list);
++EXPORT_SYMBOL(ext3_xattr_set);
+Index: linux-2.4.19.SuSE/include/linux/errno.h
+===================================================================
+--- linux-2.4.19.SuSE.orig/include/linux/errno.h 2004-05-03 11:20:21.000000000 -0700
++++ linux-2.4.19.SuSE/include/linux/errno.h 2004-05-03 11:50:22.000000000 -0700
+@@ -30,4 +30,8 @@
+
+ #endif
+
++/* Defined for extended attributes */
++#define ENOATTR ENODATA /* No such attribute */
++#define ENOTSUP EOPNOTSUPP /* Operation not supported */
++
+ #endif
+Index: linux-2.4.19.SuSE/kernel/ksyms.c
+===================================================================
+--- linux-2.4.19.SuSE.orig/kernel/ksyms.c 2004-05-03 11:22:48.000000000 -0700
++++ linux-2.4.19.SuSE/kernel/ksyms.c 2004-05-03 11:50:22.000000000 -0700
+@@ -12,6 +12,7 @@
+ #define __KERNEL_SYSCALLS__
+ #include <linux/config.h>
+ #include <linux/slab.h>
++#include <linux/cache_def.h>
+ #include <linux/module.h>
+ #include <linux/blkdev.h>
+ #include <linux/cdrom.h>
+Index: linux-2.4.19.SuSE/mm/vmscan.c
+===================================================================
+--- linux-2.4.19.SuSE.orig/mm/vmscan.c 2004-05-03 11:18:53.000000000 -0700
++++ linux-2.4.19.SuSE/mm/vmscan.c 2004-05-03 11:50:22.000000000 -0700
+@@ -32,6 +32,39 @@
+ */
+ int vm_passes = 60;
+
++static DECLARE_MUTEX(other_caches_sem);
++static LIST_HEAD(cache_definitions);
++
++void register_cache(struct cache_definition *cache)
++{
++ down(&other_caches_sem);
++ list_add(&cache->link, &cache_definitions);
++ up(&other_caches_sem);
++}
++
++void unregister_cache(struct cache_definition *cache)
++{
++ down(&other_caches_sem);
++ list_del(&cache->link);
++ up(&other_caches_sem);
++}
++
++static void shrink_other_caches(unsigned int priority, int gfp_mask)
++{
++ struct list_head *p;
++
++ if (down_trylock(&other_caches_sem))
++ return;
++
++ list_for_each_prev(p, &cache_definitions) {
++ struct cache_definition *cache =
++ list_entry(p, struct cache_definition, link);
++
++ cache->shrink(priority, gfp_mask);
++ }
++ up(&other_caches_sem);
++}
++
+ /*
+ * "vm_cache_scan_ratio" is how much of the inactive LRU queue we will scan
+ * in one go. A value of 6 for vm_cache_scan_ratio implies that we'll
--- /dev/null
+Index: linux-2.4.18-chaos/include/linux/list.h
+===================================================================
+--- linux-2.4.18-chaos.orig/include/linux/list.h 2003-11-23 00:07:05.000000000 +0300
++++ linux-2.4.18-chaos/include/linux/list.h 2003-12-11 00:25:15.000000000 +0300
+@@ -173,6 +173,67 @@
+ for (pos = (head)->prev, prefetch(pos->prev); pos != (head); \
+ pos = pos->prev, prefetch(pos->prev))
+
++/**
++ * list_for_each_entry - iterate over list of given type
++ * @pos: the type * to use as a loop counter.
++ * @head: the head for your list.
++ * @member: the name of the list_struct within the struct.
++ */
++#define list_for_each_entry(pos, head, member) \
++ for (pos = list_entry((head)->next, typeof(*pos), member), \
++ prefetch(pos->member.next); \
++ &pos->member != (head); \
++ pos = list_entry(pos->member.next, typeof(*pos), member), \
++ prefetch(pos->member.next))
++
++#ifndef list_for_each_entry_safe
++/**
++ * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry
++ * @pos: the type * to use as a loop counter.
++ * @n: another type * to use as temporary storage
++ * @head: the head for your list.
++ * @member: the name of the list_struct within the struct.
++ */
++#define list_for_each_entry_safe(pos, n, head, member) \
++ for (pos = list_entry((head)->next, typeof(*pos), member), \
++ n = list_entry(pos->member.next, typeof(*pos), member); \
++ &pos->member != (head); \
++ pos = n, n = list_entry(n->member.next, typeof(*n), member))
++#endif
++
++/**
++ * list_move - delete from one list and add as another's head
++ * @list: the entry to move
++ * @head: the head that will precede our entry
++ */
++static inline void list_move(struct list_head *list, struct list_head *head)
++{
++ __list_del(list->prev, list->next);
++ list_add(list, head);
++}
++
++/**
++ * list_move_tail - delete from one list and add as another's tail
++ * @list: the entry to move
++ * @head: the head that will follow our entry
++ */
++static inline void list_move_tail(struct list_head *list,
++ struct list_head *head)
++{
++ __list_del(list->prev, list->next);
++ list_add_tail(list, head);
++}
++
++/* 2.5 uses hlists for some things, like the d_hash; treat this tree
++ * as 2.5 and let the macros fall back to plain list_heads. */
++#define hlist_entry list_entry
++#define hlist_head list_head
++#define hlist_node list_head
++#define HLIST_HEAD LIST_HEAD
++#define INIT_HLIST_HEAD INIT_LIST_HEAD
++#define hlist_del_init list_del_init
++#define hlist_add_head list_add
++#define hlist_for_each_safe list_for_each_safe
+
+ #endif /* __KERNEL__ || _LVM_H_INCLUDE */
+
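list_for_each_entry() folds the list_entry() call into the iterator, so loops no longer
manipulate struct list_head pointers directly. A minimal in-kernel sketch (the item type
is hypothetical):

    #include <linux/list.h>

    struct item {
            int              value;
            struct list_head list;          /* the "member" argument below */
    };

    static int sum_items(struct list_head *head)
    {
            struct item *pos;
            int sum = 0;

            list_for_each_entry(pos, head, list)
                    sum += pos->value;
            return sum;
    }

The hlist #defines at the end are pure name aliases: 2.4 has no real hash-list type, so
code written against the 2.5 hlist API quietly degrades to ordinary doubly linked lists.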
--- /dev/null
+Index: bglio/drivers/net/netconsole.c
+===================================================================
+--- bglio.orig/drivers/net/netconsole.c 2004-05-07 15:50:22.000000000 -0700
++++ bglio/drivers/net/netconsole.c 2004-05-07 17:15:28.000000000 -0700
+@@ -12,6 +12,8 @@
+ *
+ * 2001-09-17 started by Ingo Molnar.
+ * 2002-03-14 simultaneous syslog packet option by Michael K. Johnson
++ * 2003-10-30 Add sysrq command processing by Wangdi <wangdi@clusterfs.com>
++ *
+ */
+
+ /****************************************************************
+@@ -51,6 +53,7 @@
+ #include <linux/tty_driver.h>
+ #include <linux/etherdevice.h>
+ #include <linux/elf.h>
++#include "netconsole.h"
+
+ static struct net_device *netconsole_dev;
+ static u16 source_port, netdump_target_port, netlog_target_port, syslog_target_port;
+@@ -62,12 +65,11 @@
+ static unsigned int mhz = 500, idle_timeout;
+ static unsigned long long mhz_cycles, jiffy_cycles;
+
+-#include "netconsole.h"
+
+ #define MAX_UDP_CHUNK 1460
+ #define MAX_PRINT_CHUNK (MAX_UDP_CHUNK-HEADER_LEN)
+
+-#define DEBUG 0
++#define DEBUG 0
+ #if DEBUG
+ # define Dprintk(x...) printk(KERN_INFO x)
+ #else
+@@ -187,6 +189,22 @@
+ }
+ }
+ }
++void (*irqfunc)(int, void *, struct pt_regs *);
++
++static void netdump_poll(struct net_device *dev)
++{
++ int budget = 1;
++
++ disable_irq(dev->irq);
++
++ irqfunc(dev->irq, dev, 0);
++
++ if(dev->poll && test_bit(__LINK_STATE_RX_SCHED, &dev->state))
++ dev->poll(dev, &budget);
++
++ enable_irq(dev->irq);
++
++}
+
+ static struct sk_buff * alloc_netconsole_skb(struct net_device *dev, int len, int reserve)
+ {
+@@ -209,7 +227,7 @@
+ once = 0;
+ }
+ Dprintk("alloc skb: polling controller ...\n");
+- dev->poll_controller(dev);
++ netdump_poll(dev);
+ goto repeat;
+ }
+ }
+@@ -231,7 +249,7 @@
+ spin_unlock(&dev->xmit_lock);
+
+ Dprintk("xmit skb: polling controller ...\n");
+- dev->poll_controller(dev);
++ netdump_poll(dev);
+ zap_completion_queue();
+ goto repeat_poll;
+ }
+@@ -426,18 +444,79 @@
+ static spinlock_t sequence_lock = SPIN_LOCK_UNLOCKED;
+ static unsigned int log_offset;
+
++static int thread_stopped = 0;
++/* Interrupt function for netdump */
++static int sysrq_mode = 0;
++static int stop_sysrq_thread = 0;
++#define Set_Sysrq_mode() (sysrq_mode = 1)
++#define Clear_Sysrq_mode() (sysrq_mode = 0)
++static char send_cache[MAX_PRINT_CHUNK];
++static unsigned int send_cache_pos = 0;
++wait_queue_head_t sysrq_thread_queue;
++wait_queue_head_t sysrq_thread_waiter_queue;
++
++#define SEND_MSG_BUFFER(buf, len) \
++do \
++{ \
++ reply_t reply; \
++	unsigned long flags; \
++ __save_flags(flags); \
++ __cli(); \
++ reply.code = REPLY_LOG; \
++ reply.nr = 0; \
++ reply.info = 0; \
++ spin_lock(&sequence_lock); \
++ send_netlog_skb(dev, buf, len, &reply); \
++ spin_unlock(&sequence_lock); \
++ __restore_flags(flags); \
++} while (0)
++
++void netconsole_do_sysrq(req_t *req)
++{
++ struct pt_regs regs;
++ struct net_device *dev = netconsole_dev;
++
++ if (!dev)
++ return;
++ Set_Sysrq_mode();
++	get_current_regs(&regs);
++	handle_sysrq((int)req->from, &regs, NULL);
++
++ if (send_cache_pos != 0){
++ SEND_MSG_BUFFER(send_cache, send_cache_pos);
++ memset(send_cache, 0, MAX_PRINT_CHUNK);
++ send_cache_pos = 0;
++ }
++
++ Clear_Sysrq_mode();
++}
+ static void write_netconsole_msg(struct console *con, const char *msg0, unsigned int msg_len)
+ {
+ int len, left, i;
+ struct net_device *dev;
+ const char *msg = msg0;
+ reply_t reply;
+-
++
+ dev = netconsole_dev;
+ if (!dev || netdump_mode)
+ return;
+-
+- if (dev->poll_controller && netif_running(dev)) {
++ if (sysrq_mode){
++ unsigned long total_len = send_cache_pos + msg_len;
++ unsigned long left_len = msg_len;
++ while (total_len >= MAX_PRINT_CHUNK){
++ unsigned long send_len = MAX_PRINT_CHUNK - send_cache_pos;
++			memcpy(send_cache + send_cache_pos, msg + (msg_len - left_len), send_len);
++ SEND_MSG_BUFFER(send_cache, MAX_PRINT_CHUNK);
++ send_cache_pos = 0;
++ total_len -= MAX_PRINT_CHUNK;
++ left_len -= send_len;
++ }
++ if (left_len > 0){
++ memcpy(send_cache + send_cache_pos, msg + (msg_len -left_len), left_len);
++ send_cache_pos += left_len;
++ }
++ return;
++ }else if (netif_running(dev)) {
+ unsigned long flags;
+
+ __save_flags(flags);
+@@ -567,8 +646,6 @@
+ req_t *req;
+ struct net_device *dev;
+
+- if (!netdump_mode)
+- return NET_RX_SUCCESS;
+ #if DEBUG
+ {
+ static int packet_count;
+@@ -722,8 +799,16 @@
+ Dprintk("... netdump from: %08x.\n", req->from);
+ Dprintk("... netdump to: %08x.\n", req->to);
+
+- add_new_req(req);
++ if (netdump_mode)
++ add_new_req(req);
++ else if (req->command == COMM_SYSRQ){
++ add_new_req(req);
++ wake_up(&sysrq_thread_queue);
++ return NET_RX_DROP;
++ }
+ out:
++ if (!netdump_mode)
++ return NET_RX_SUCCESS;
+ return NET_RX_DROP;
+ }
+
+@@ -763,6 +848,7 @@
+ kunmap_atomic(kaddr, KM_NETDUMP);
+ }
+
++
+ /*
+ * This function waits for the client to acknowledge the receipt
+ * of the netdump startup reply, with the possibility of packets
+@@ -792,7 +878,7 @@
+ // wait 1 sec.
+ udelay(100);
+ Dprintk("handshake: polling controller ...\n");
+- dev->poll_controller(dev);
++ netdump_poll(dev);
+ zap_completion_queue();
+ req = get_new_req();
+ if (req)
+@@ -884,6 +970,7 @@
+ */
+ spin_lock_init(&dev->xmit_lock);
+
++#ifdef __i386__
+ esp = (unsigned long) ((char *)regs + sizeof (struct pt_regs));
+ ss = __KERNEL_DS;
+ if (regs->xcs & 3) {
+@@ -893,6 +980,7 @@
+ myregs = *regs;
+ myregs.esp = esp;
+ myregs.xss = (myregs.xss & 0xffff0000) | ss;
++#endif
+
+ rdtscll(t0);
+
+@@ -904,7 +992,7 @@
+ while (netdump_mode) {
+ __cli();
+ Dprintk("main netdump loop: polling controller ...\n");
+- dev->poll_controller(dev);
++ netdump_poll(dev);
+ zap_completion_queue();
+ #if !CLI
+ __sti();
+@@ -1009,6 +1097,32 @@
+ printk("NETDUMP END!\n");
+ __restore_flags(flags);
+ }
++static int netconsole_sysrq_schedule(void *arg)
++{
++ struct task_struct *tsk = current;
++
++ sprintf(tsk->comm, "sysrq_schedule");
++ sigfillset(&tsk->blocked);
++
++ /* main loop */
++ thread_stopped = 0;
++ for (;;) {
++ wait_event_interruptible(sysrq_thread_queue,
++ !list_empty(&request_list) || stop_sysrq_thread);
++ while (!list_empty(&request_list)) {
++ req_t *req = get_new_req();
++ if (req->command == COMM_SYSRQ)
++ netconsole_do_sysrq(req);
++ }
++ if (stop_sysrq_thread)
++ break;
++ wake_up(&sysrq_thread_waiter_queue);
++ }
++ thread_stopped = 1;
++ wake_up(&sysrq_thread_waiter_queue);
++ return 0;
++}
++
+
+ static char *dev;
+ static int netdump_target_eth_byte0 = 255;
+@@ -1087,11 +1201,12 @@
+
+ static struct console netconsole =
+ { flags: CON_ENABLED, write: write_netconsole_msg };
+-
+ static int init_netconsole(void)
+ {
+ struct net_device *ndev = NULL;
+ struct in_device *in_dev;
++ struct irqaction *action;
++ int rc = 0;
+
+ printk(KERN_INFO "netlog: using network device <%s>\n", dev);
+ // this will be valid once the device goes up.
+@@ -1101,10 +1216,6 @@
+ printk(KERN_ERR "netlog: network device %s does not exist, aborting.\n", dev);
+ return -1;
+ }
+- if (!ndev->poll_controller) {
+- printk(KERN_ERR "netlog: %s's network driver does not implement netlogging yet, aborting.\n", dev);
+- return -1;
+- }
+ in_dev = in_dev_get(ndev);
+ if (!in_dev) {
+ printk(KERN_ERR "netlog: network device %s is not an IP protocol device, aborting.\n", dev);
+@@ -1137,8 +1248,6 @@
+ if (!netdump_target_ip && !netlog_target_ip && !syslog_target_ip) {
+ printk(KERN_ERR "netlog: target_ip parameter not specified, aborting.\n");
+ return -1;
+- }
+- if (netdump_target_ip) {
+ #define IP(x) ((unsigned char *)&netdump_target_ip)[x]
+ printk(KERN_INFO "netlog: using netdump target IP %u.%u.%u.%u\n",
+ IP(3), IP(2), IP(1), IP(0));
+@@ -1214,12 +1323,27 @@
+
+ mhz_cycles = (unsigned long long)mhz * 1000000ULL;
+ jiffy_cycles = (unsigned long long)mhz * (1000000/HZ);
+-
+- INIT_LIST_HEAD(&request_list);
+-
++
+ ndev->rx_hook = netconsole_rx_hook;
+ netdump_func = netconsole_netdump;
+ netconsole_dev = ndev;
++ /* find irq function of the ndev*/
++ action=find_irq_action(ndev->irq, ndev);
++ if (!action) {
++		printk(KERN_ERR "couldn't find irq handler for <%s>\n", dev);
++ return -1;
++ }
++ irqfunc = action->handler;
++
++ stop_sysrq_thread = 0;
++ INIT_LIST_HEAD(&request_list);
++ init_waitqueue_head(&sysrq_thread_queue);
++ init_waitqueue_head(&sysrq_thread_waiter_queue);
++ if ((rc = kernel_thread(netconsole_sysrq_schedule, NULL, 0)) < 0 ){
++ printk(KERN_ERR "Can not start netconsole sysrq thread: rc %d\n", rc);
++ return -1;
++ }
++
+ #define STARTUP_MSG "[...network console startup...]\n"
+ write_netconsole_msg(NULL, STARTUP_MSG, strlen(STARTUP_MSG));
+
+@@ -1230,7 +1354,11 @@
+
+ static void cleanup_netconsole(void)
+ {
+- printk(KERN_INFO "netlog: network logging shut down.\n");
++ stop_sysrq_thread = 1;
++
++ wake_up(&sysrq_thread_queue);
++ wait_event(sysrq_thread_waiter_queue, thread_stopped);
++ printk(KERN_INFO"netlog: network logging shut down.\n");
+ unregister_console(&netconsole);
+
+ #define SHUTDOWN_MSG "[...network console shutdown...]\n"
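In sysrq mode the console write path batches output: bytes accumulate in send_cache and go
out one MAX_PRINT_CHUNK packet at a time, with netconsole_do_sysrq() flushing the final
partial chunk after handle_sysrq() returns. A stand-alone restatement of that buffering
rule (hypothetical helper, not part of the patch; memcpy from <linux/string.h>):

    static void buffer_and_flush(char *cache, unsigned int *pos,
                                 const char *msg, unsigned int msg_len,
                                 void (*flush)(const char *, unsigned int))
    {
            while (*pos + msg_len >= MAX_PRINT_CHUNK) {
                    unsigned int n = MAX_PRINT_CHUNK - *pos;

                    memcpy(cache + *pos, msg, n);   /* fill the chunk */
                    flush(cache, MAX_PRINT_CHUNK);
                    *pos = 0;
                    msg += n;                       /* advance past sent bytes */
                    msg_len -= n;
            }
            if (msg_len) {                          /* stash the remainder */
                    memcpy(cache + *pos, msg, msg_len);
                    *pos += msg_len;
            }
    }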
+Index: bglio/drivers/net/netconsole.h
+===================================================================
+--- bglio.orig/drivers/net/netconsole.h 2004-05-07 15:50:22.000000000 -0700
++++ bglio/drivers/net/netconsole.h 2004-05-07 17:11:01.000000000 -0700
+@@ -29,7 +29,7 @@
+ *
+ ****************************************************************/
+
+-#define NETCONSOLE_VERSION 0x04
++#define NETCONSOLE_VERSION 0x03
+
+ enum netdump_commands {
+ COMM_NONE = 0,
+@@ -42,6 +42,8 @@
+ COMM_START_NETDUMP_ACK = 7,
+ COMM_GET_REGS = 8,
+ COMM_SHOW_STATE = 9,
++ COMM_START_WRITE_NETDUMP_ACK = 10,
++ COMM_SYSRQ = 11,
+ };
+
+ #define NETDUMP_REQ_SIZE (8+4*4)
+@@ -69,6 +71,7 @@
+ REPLY_REGS = 10,
+ REPLY_MAGIC = 11,
+ REPLY_SHOW_STATE = 12,
++ REPLY_SYSRQ = 13,
+ };
+
+ typedef struct netdump_reply_s {
+@@ -78,4 +81,24 @@
+ } reply_t;
+
+ #define HEADER_LEN (1 + sizeof(reply_t))
+-
++/* for netconsole */
++static inline void get_current_regs(struct pt_regs *regs)
++{
++#ifdef __i386__
++ __asm__ __volatile__("movl %%ebx,%0" : "=m"(regs->ebx));
++ __asm__ __volatile__("movl %%ecx,%0" : "=m"(regs->ecx));
++ __asm__ __volatile__("movl %%edx,%0" : "=m"(regs->edx));
++ __asm__ __volatile__("movl %%esi,%0" : "=m"(regs->esi));
++ __asm__ __volatile__("movl %%edi,%0" : "=m"(regs->edi));
++ __asm__ __volatile__("movl %%ebp,%0" : "=m"(regs->ebp));
++ __asm__ __volatile__("movl %%eax,%0" : "=m"(regs->eax));
++ __asm__ __volatile__("movl %%esp,%0" : "=m"(regs->esp));
++ __asm__ __volatile__("movw %%ss, %%ax;" :"=a"(regs->xss));
++ __asm__ __volatile__("movw %%cs, %%ax;" :"=a"(regs->xcs));
++ __asm__ __volatile__("movw %%ds, %%ax;" :"=a"(regs->xds));
++ __asm__ __volatile__("movw %%es, %%ax;" :"=a"(regs->xes));
++ __asm__ __volatile__("pushfl; popl %0" :"=m"(regs->eflags));
++ regs->eip = (unsigned long)current_text_addr();
++#endif
++}
++
+Index: bglio/arch/i386/kernel/irq.c
+===================================================================
+--- bglio.orig/arch/i386/kernel/irq.c 2004-05-07 15:50:17.000000000 -0700
++++ bglio/arch/i386/kernel/irq.c 2004-05-07 17:11:01.000000000 -0700
+@@ -182,7 +182,20 @@
+
+ return 0;
+ }
++struct irqaction *find_irq_action(unsigned int irq, void *dev_id)
++{
++ struct irqaction *a, *r=0;
+
++ spin_lock_irq(&irq_desc[irq].lock);
++ for(a=irq_desc[irq].action; a; a=a->next) {
++ if(a->dev_id == dev_id) {
++ r=a;
++ break;
++ }
++ }
++ spin_unlock_irq(&irq_desc[irq].lock);
++ return r;
++}
+
+ /*
+ * Global interrupt locks for SMP. Allow interrupts to come in on any
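find_irq_action() walks the irqaction chain of an IRQ and returns the entry whose dev_id
matches, which is how netdump_poll() above recovers the NIC's interrupt handler so the
console can service the card synchronously instead of depending on a poll_controller()
method. A sketch of the pattern (assumes the driver registered its interrupt with
dev_id == dev, the usual convention for network drivers):

    #include <linux/netdevice.h>
    #include <linux/interrupt.h>

    static void poll_nic(struct net_device *dev)
    {
            struct irqaction *action = find_irq_action(dev->irq, dev);

            if (action == NULL)
                    return;

            disable_irq(dev->irq);
            action->handler(dev->irq, dev, NULL);   /* 2.4 handler signature */
            enable_irq(dev->irq);
    }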
+Index: bglio/arch/i386/kernel/i386_ksyms.c
+===================================================================
+--- bglio.orig/arch/i386/kernel/i386_ksyms.c 2004-05-07 15:50:22.000000000 -0700
++++ bglio/arch/i386/kernel/i386_ksyms.c 2004-05-07 17:11:01.000000000 -0700
+@@ -66,6 +66,7 @@
+ EXPORT_SYMBOL(iounmap);
+ EXPORT_SYMBOL(enable_irq);
+ EXPORT_SYMBOL(disable_irq);
++EXPORT_SYMBOL(find_irq_action);
+ EXPORT_SYMBOL(disable_irq_nosync);
+ EXPORT_SYMBOL(probe_irq_mask);
+ EXPORT_SYMBOL(kernel_thread);
+@@ -186,7 +187,6 @@
+ EXPORT_SYMBOL(edd);
+ EXPORT_SYMBOL(eddnr);
+ #endif
+-
+ EXPORT_SYMBOL_GPL(show_mem);
+ EXPORT_SYMBOL_GPL(show_state);
+ EXPORT_SYMBOL_GPL(show_regs);
+Index: bglio/net/core/dev.c
+===================================================================
+--- bglio.orig/net/core/dev.c 2004-05-07 15:50:22.000000000 -0700
++++ bglio/net/core/dev.c 2004-05-07 17:11:01.000000000 -0700
+@@ -1476,6 +1476,16 @@
+
+ skb_bond(skb);
+
++ if (unlikely(skb->dev->rx_hook != NULL)) {
++ int ret;
++
++ ret = skb->dev->rx_hook(skb);
++ if (ret == NET_RX_DROP){
++ kfree_skb(skb);
++ return ret;
++ }
++ }
++
+ netdev_rx_stat[smp_processor_id()].total++;
+
+ #ifdef CONFIG_NET_FASTROUTE
+Index: bglio/include/asm-i386/irq.h
+===================================================================
+--- bglio.orig/include/asm-i386/irq.h 2004-05-07 15:25:28.000000000 -0700
++++ bglio/include/asm-i386/irq.h 2004-05-07 17:11:01.000000000 -0700
+@@ -38,7 +38,7 @@
+ extern void disable_irq_nosync(unsigned int);
+ extern void enable_irq(unsigned int);
+ extern void release_x86_irqs(struct task_struct *);
+-
++extern struct irqaction *find_irq_action(unsigned int irq, void *dev_id);
+ #ifdef CONFIG_X86_LOCAL_APIC
+ #define ARCH_HAS_NMI_WATCHDOG /* See include/linux/nmi.h */
+ #endif
+Index: bglio/kernel/panic.c
+===================================================================
+--- bglio.orig/kernel/panic.c 2004-05-07 15:50:22.000000000 -0700
++++ bglio/kernel/panic.c 2004-05-07 17:11:01.000000000 -0700
+@@ -66,8 +66,6 @@
+ vsprintf(buf, fmt, args);
+ va_end(args);
+ printk(KERN_EMERG "Kernel panic: %s\n",buf);
+- if (netdump_func)
+- BUG();
+ if (in_interrupt())
+ printk(KERN_EMERG "In interrupt handler - not syncing\n");
+ else if (!current->pid)
--- /dev/null
+ fs/Makefile | 3
+ fs/file_table.c | 11 ++
+ fs/inode.c | 23 ++++-
+ fs/namei.c | 12 ++
+ fs/nfsd/export.c | 5 +
+ fs/nfsd/nfsfh.c | 65 +++++++++++++-
+ fs/nfsd/vfs.c | 240 ++++++++++++++++++++++++++++++++++++++++++++++++-----
+ include/linux/fs.h | 10 ++
+ kernel/ksyms.c | 2
+ 9 files changed, 337 insertions(+), 34 deletions(-)
+
+Index: linux-bgl/fs/nfsd/vfs.c
+===================================================================
+--- linux-bgl.orig/fs/nfsd/vfs.c 2003-07-02 08:44:33.000000000 -0700
++++ linux-bgl/fs/nfsd/vfs.c 2004-12-28 17:13:59.940919832 -0800
+@@ -77,6 +77,128 @@
+ static struct raparms * raparml;
+ static struct raparms * raparm_cache;
+
++static int link_raw(struct dentry *dold, struct dentry *ddir,
++ struct dentry *dnew)
++{
++ int err;
++
++ struct nameidata old_nd = { .dentry = dold };
++ struct nameidata nd = { .dentry = ddir, .last = dnew->d_name };
++ struct inode_operations *op = nd.dentry->d_inode->i_op;
++ err = op->link_raw(&old_nd, &nd);
++ d_instantiate(dnew, dold->d_inode);
++ if(dold->d_inode->i_op && dold->d_inode->i_op->revalidate_it)
++ dold->d_inode->i_op->revalidate_it(dnew, NULL);
++
++ return err;
++}
++
++static int unlink_raw(struct dentry *dentry, char *fname, int flen,
++ struct dentry *rdentry)
++{
++ int err;
++ struct qstr last = { .name = fname, .len = flen };
++ struct nameidata nd = { .dentry = dentry, .last = last };
++ struct inode_operations *op = nd.dentry->d_inode->i_op;
++ err = op->unlink_raw(&nd);
++ if (!err)
++ d_delete(rdentry);
++
++ return err;
++}
++
++static int rmdir_raw(struct dentry *dentry, char *fname, int flen,
++ struct dentry *rdentry)
++{
++ int err;
++ struct qstr last = { .name = fname, .len = flen };
++ struct nameidata nd = { .dentry = dentry, .last = last };
++ struct inode_operations *op = nd.dentry->d_inode->i_op;
++ err = op->rmdir_raw(&nd);
++ if(!err) {
++ rdentry->d_inode->i_flags |= S_DEAD;
++ d_delete(rdentry);
++ }
++
++ return err;
++}
++
++static int symlink_raw(struct dentry *dentry, char *fname, int flen,
++ char *path)
++{
++ int err;
++ struct qstr last = { .name = fname, .len = flen };
++ struct nameidata nd = { .dentry = dentry, .last = last };
++ struct inode_operations *op = nd.dentry->d_inode->i_op;
++ err = op->symlink_raw(&nd, path);
++
++ return err;
++}
++
++static int mkdir_raw(struct dentry *dentry, char *fname, int flen, int mode)
++{
++ int err;
++ struct qstr last = { .name = fname, .len = flen };
++ struct nameidata nd = { .dentry = dentry, .last = last };
++ struct inode_operations *op = nd.dentry->d_inode->i_op;
++ err = op->mkdir_raw(&nd, mode);
++
++ return err;
++}
++
++static int mknod_raw(struct dentry *dentry, char *fname, int flen, int mode,
++ dev_t dev)
++{
++ int err;
++ struct qstr last = { .name = fname, .len = flen };
++ struct nameidata nd = { .dentry = dentry, .last = last };
++ struct inode_operations *op = nd.dentry->d_inode->i_op;
++ err = op->mknod_raw(&nd, mode, dev);
++
++ return err;
++}
++
++static int rename_raw(struct dentry *fdentry, struct dentry *tdentry,
++ struct dentry *odentry, struct dentry *ndentry)
++{
++ int err;
++
++ struct nameidata old_nd = { .dentry = fdentry, .last = odentry->d_name};
++ struct nameidata new_nd = { .dentry = tdentry, .last = ndentry->d_name};
++ struct inode_operations *op = old_nd.dentry->d_inode->i_op;
++ err = op->rename_raw(&old_nd, &new_nd);
++ d_move(odentry, ndentry);
++
++ return err;
++}
++
++static int setattr_raw(struct inode *inode, struct iattr *iap)
++{
++ int err;
++
++ iap->ia_valid |= ATTR_RAW;
++ err = inode->i_op->setattr_raw(inode, iap);
++
++ return err;
++}
++
++int revalidate_it(struct dentry *dentry, struct lookup_intent *it)
++{
++ int err = 0;
++
++ if (dentry && dentry->d_op && dentry->d_op->d_revalidate_it) {
++ if (!dentry->d_op->d_revalidate_it(dentry, 0, it) &&
++ !d_invalidate(dentry)) {
++ dput(dentry);
++ err = -EINVAL;
++ dentry = NULL;
++ return err;
++ }
++ }
++
++ return err;
++}
++
+ /*
+ * Look up one component of a pathname.
+ * N.B. After this call _both_ fhp and resfh need an fh_put
+@@ -304,7 +426,10 @@
+ }
+ err = nfserr_notsync;
+ if (!check_guard || guardtime == inode->i_ctime) {
+- err = notify_change(dentry, iap);
++ if ( dentry->d_inode->i_op && dentry->d_inode->i_op->setattr_raw)
++ err = setattr_raw(dentry->d_inode, iap);
++ else
++ err = notify_change(dentry, iap);
+ err = nfserrno(err);
+ }
+ if (size_change) {
+@@ -431,6 +556,7 @@
+ {
+ struct dentry *dentry;
+ struct inode *inode;
++ struct lookup_intent it;
+ int err;
+
+ /* If we get here, then the client has already done an "open", and (hopefully)
+@@ -477,6 +603,14 @@
+ filp->f_mode = FMODE_READ;
+ }
+
++ intent_init(&it, IT_OPEN, (filp->f_flags & ~O_ACCMODE) | filp->f_mode);
++
++ err = revalidate_it(dentry, &it);
++ if (err)
++ goto out_nfserr;
++
++ filp->f_it = ⁢
++
+ err = 0;
+ if (filp->f_op && filp->f_op->open) {
+ err = filp->f_op->open(inode, filp);
+@@ -491,7 +625,11 @@
+ atomic_dec(&filp->f_count);
+ }
+ }
++
+ out_nfserr:
++ if (it.it_op_release)
++ intent_release(&it);
++
+ if (err)
+ err = nfserrno(err);
+ out:
+@@ -822,7 +960,7 @@
+ {
+ struct dentry *dentry, *dchild;
+ struct inode *dirp;
+- int err;
++ int err, error = -EOPNOTSUPP;
+
+ err = nfserr_perm;
+ if (!flen)
+@@ -838,20 +976,44 @@
+ dentry = fhp->fh_dentry;
+ dirp = dentry->d_inode;
+
++ switch (type) {
++ case S_IFDIR:
++ if (dirp->i_op->mkdir_raw)
++ error = mkdir_raw(dentry, fname, flen, iap->ia_mode);
++ break;
++ case S_IFCHR:
++ case S_IFBLK:
++ case S_IFIFO:
++ case S_IFSOCK:
++ case S_IFREG:
++ if (dirp->i_op->mknod_raw) {
++ if (type == S_IFREG)
++ rdev = 0;
++ error = mknod_raw(dentry, fname, flen, iap->ia_mode, rdev);
++ }
++ break;
++ default:
++ printk("nfsd: bad file type %o in nfsd_create\n", type);
++ }
++
+ err = nfserr_notdir;
+- if(!dirp->i_op || !dirp->i_op->lookup)
++ if(!dirp->i_op || !(dirp->i_op->lookup || dirp->i_op->lookup_it))
+ goto out;
+ /*
+ * Check whether the response file handle has been verified yet.
+ * If it has, the parent directory should already be locked.
+ */
+- if (!resfhp->fh_dentry) {
+- /* called from nfsd_proc_mkdir, or possibly nfsd3_proc_create */
+- fh_lock(fhp);
++ if (!resfhp->fh_dentry || dirp->i_op->lookup_it) {
++ /* called from nfsd_proc_mkdir, or possibly nfsd3_proc_create
++ and nfsd_proc_create in case of lustre
++ */
++ if (!resfhp->fh_dentry)
++ fh_lock(fhp);
+ dchild = lookup_one_len(fname, dentry, flen);
+ err = PTR_ERR(dchild);
+ if (IS_ERR(dchild))
+ goto out_nfserr;
++ resfhp->fh_dentry = NULL;
+ err = fh_compose(resfhp, fhp->fh_export, dchild, fhp);
+ if (err)
+ goto out;
+@@ -872,10 +1034,12 @@
+ * Make sure the child dentry is still negative ...
+ */
+ err = nfserr_exist;
+- if (dchild->d_inode) {
+- dprintk("nfsd_create: dentry %s/%s not negative!\n",
+- dentry->d_name.name, dchild->d_name.name);
+- goto out;
++ if ( error == -EOPNOTSUPP) {
++ if (dchild->d_inode) {
++ dprintk("nfsd_create: dentry %s/%s not negative!\n",
++ dentry->d_name.name, dchild->d_name.name);
++ goto out;
++ }
+ }
+
+ if (!(iap->ia_valid & ATTR_MODE))
+@@ -888,16 +1052,19 @@
+ err = nfserr_perm;
+ switch (type) {
+ case S_IFREG:
+- err = vfs_create(dirp, dchild, iap->ia_mode);
++ if (error == -EOPNOTSUPP)
++ err = vfs_create(dirp, dchild, iap->ia_mode);
+ break;
+ case S_IFDIR:
+- err = vfs_mkdir(dirp, dchild, iap->ia_mode);
++ if (error == -EOPNOTSUPP)
++ err = vfs_mkdir(dirp, dchild, iap->ia_mode);
+ break;
+ case S_IFCHR:
+ case S_IFBLK:
+ case S_IFIFO:
+ case S_IFSOCK:
+- err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev);
++ if (error == -EOPNOTSUPP)
++ err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev);
+ break;
+ default:
+ printk("nfsd: bad file type %o in nfsd_create\n", type);
+@@ -966,7 +1133,13 @@
+ /* Get all the sanity checks out of the way before
+ * we lock the parent. */
+ err = nfserr_notdir;
+- if(!dirp->i_op || !dirp->i_op->lookup)
++ if (dirp->i_op->mknod_raw) {
++ err = mknod_raw(dentry, fname, flen, iap->ia_mode, 0);
++ if (err && err != -EOPNOTSUPP)
++ goto out;
++ }
++
++ if(!dirp->i_op || !(dirp->i_op->lookup || dirp->i_op->lookup_it))
+ goto out;
+ fh_lock(fhp);
+
+@@ -1017,6 +1190,8 @@
+ case NFS3_CREATE_GUARDED:
+ err = nfserr_exist;
+ }
++ if(dirp->i_op->mknod_raw)
++ err = 0;
+ goto out;
+ }
+
+@@ -1123,7 +1298,7 @@
+ struct iattr *iap)
+ {
+ struct dentry *dentry, *dnew;
+- int err, cerr;
++ int err, cerr, error = -EOPNOTSUPP;
+
+ err = nfserr_noent;
+ if (!flen || !plen)
+@@ -1137,12 +1312,18 @@
+ goto out;
+ fh_lock(fhp);
+ dentry = fhp->fh_dentry;
++
++ if (dentry->d_inode->i_op->symlink_raw)
++ error = symlink_raw(dentry, fname, flen, path);
++
+ dnew = lookup_one_len(fname, dentry, flen);
+ err = PTR_ERR(dnew);
+ if (IS_ERR(dnew))
+ goto out_nfserr;
+
+- err = vfs_symlink(dentry->d_inode, dnew, path);
++ err = error;
++ if (err == -EOPNOTSUPP || !dentry->d_inode->i_op->symlink_raw)
++ err = vfs_symlink(dentry->d_inode, dnew, path);
+ if (!err) {
+ if (EX_ISSYNC(fhp->fh_export))
+ nfsd_sync_dir(dentry);
+@@ -1152,7 +1333,10 @@
+ iap->ia_valid |= ATTR_CTIME;
+ iap->ia_mode = (iap->ia_mode&S_IALLUGO)
+ | S_IFLNK;
+- err = notify_change(dnew, iap);
++ if (dnew->d_inode->i_op && dnew->d_inode->i_op->setattr_raw)
++ err = setattr_raw(dnew->d_inode, iap);
++ else
++ err = notify_change(dnew, iap);
+ if (!err && EX_ISSYNC(fhp->fh_export))
+ write_inode_now(dentry->d_inode, 1);
+ }
+@@ -1210,7 +1394,10 @@
+ dold = tfhp->fh_dentry;
+ dest = dold->d_inode;
+
+- err = vfs_link(dold, dirp, dnew);
++ if (dirp->i_op->link_raw)
++ err = link_raw(dold, ddir, dnew);
++ else
++ err = vfs_link(dold, dirp, dnew);
+ if (!err) {
+ if (EX_ISSYNC(ffhp->fh_export)) {
+ nfsd_sync_dir(ddir);
+@@ -1295,7 +1482,10 @@
+ err = nfserr_perm;
+ } else
+ #endif
+- err = vfs_rename(fdir, odentry, tdir, ndentry);
++ if(fdir->i_op->rename_raw)
++ err = rename_raw(fdentry, tdentry, odentry, ndentry);
++ else
++ err = vfs_rename(fdir, odentry, tdir, ndentry);
+ if (!err && EX_ISSYNC(tfhp->fh_export)) {
+ nfsd_sync_dir(tdentry);
+ nfsd_sync_dir(fdentry);
+@@ -1316,7 +1506,7 @@
+ fill_post_wcc(tfhp);
+ double_up(&tdir->i_sem, &fdir->i_sem);
+ ffhp->fh_locked = tfhp->fh_locked = 0;
+-
++
+ out:
+ return err;
+ }
+@@ -1362,9 +1552,15 @@
+ err = nfserr_perm;
+ } else
+ #endif
+- err = vfs_unlink(dirp, rdentry);
++ if (dirp->i_op->unlink_raw)
++ err = unlink_raw(dentry, fname, flen, rdentry);
++ else
++ err = vfs_unlink(dirp, rdentry);
+ } else { /* It's RMDIR */
+- err = vfs_rmdir(dirp, rdentry);
++ if (dirp->i_op->rmdir_raw)
++ err = rmdir_raw(dentry, fname, flen, rdentry);
++ else
++ err = vfs_rmdir(dirp, rdentry);
+ }
+
+ dput(rdentry);
+Index: linux-bgl/fs/nfsd/nfsfh.c
+===================================================================
+--- linux-bgl.orig/fs/nfsd/nfsfh.c 2003-07-02 08:44:08.000000000 -0700
++++ linux-bgl/fs/nfsd/nfsfh.c 2004-12-28 17:13:59.942919514 -0800
+@@ -36,6 +36,15 @@
+ int sequence; /* sequence counter */
+ };
+
++static struct dentry *lookup_it(struct inode *inode, struct dentry * dentry)
++{
++ if (inode->i_op->lookup_it)
++ return inode->i_op->lookup_it(inode, dentry, NULL, 0);
++ else
++ return inode->i_op->lookup(inode, dentry);
++
++}
++
+ /*
+ * A rather strange filldir function to capture
+ * the name matching the specified inode number.
+@@ -75,6 +84,8 @@
+ int error;
+ struct file file;
+ struct nfsd_getdents_callback buffer;
++ struct lookup_intent it;
++ struct file *filp = NULL;
+
+ error = -ENOTDIR;
+ if (!dir || !S_ISDIR(dir->i_mode))
+@@ -85,9 +96,37 @@
+ /*
+ * Open the directory ...
+ */
+- error = init_private_file(&file, dentry, FMODE_READ);
+- if (error)
++ if (dentry->d_op && dentry->d_op->d_revalidate_it) {
++ if ((dentry->d_flags & DCACHE_NFSD_DISCONNECTED) &&
++ (dentry->d_parent == dentry) ) {
++ it.it_op_release = NULL;
++ /*
++ * XXX Temporary Hack: Simulating init_private_file without
++ * f_op->open for disconnected dentry Since we don't have actual
++ * dentry->d_name to revalidate in revalidate_it()
++ */
++ filp = &file;
++ memset(filp, 0, sizeof(*filp));
++ filp->f_mode = FMODE_READ;
++ atomic_set(&filp->f_count, 1);
++ filp->f_dentry = dentry;
++ filp->f_uid = current->fsuid;
++ filp->f_gid = current->fsgid;
++ filp->f_op = dentry->d_inode->i_fop;
++ error = 0;
++ } else {
++ intent_init(&it, IT_OPEN, 0);
++ error = revalidate_it(dentry, &it);
++ if (error)
++ goto out;
++ error = init_private_file_it(&file, dentry, FMODE_READ, &it);
++ }
++ } else {
++ error = init_private_file_it(&file, dentry, FMODE_READ, NULL);
++ }
++ if (error)
+ goto out;
++
+ error = -EINVAL;
+ if (!file.f_op->readdir)
+ goto out_close;
+@@ -113,9 +152,13 @@
+ }
+
+ out_close:
+- if (file.f_op->release)
++ if (file.f_op->release && !filp)
+ file.f_op->release(dir, &file);
+ out:
++ if (dentry->d_op &&
++ dentry->d_op->d_revalidate_it &&
++ it.it_op_release && !filp)
++ intent_release(&it);
+ return error;
+ }
+
+@@ -273,7 +316,7 @@
+ /* I'm going to assume that if the returned dentry is different, then
+ * it is well connected. But nobody returns different dentrys do they?
+ */
+- pdentry = child->d_inode->i_op->lookup(child->d_inode, tdentry);
++ pdentry = lookup_it(child->d_inode, tdentry);
+ d_drop(tdentry); /* we never want ".." hashed */
+ if (!pdentry && tdentry->d_inode == NULL) {
+ /* File system cannot find ".." ... sad but possible */
+@@ -304,6 +347,8 @@
+ igrab(tdentry->d_inode);
+ pdentry->d_flags |= DCACHE_NFSD_DISCONNECTED;
+ }
++ if (child->d_op && child->d_op->d_revalidate_it)
++ pdentry->d_op = child->d_op;
+ }
+ if (pdentry == NULL)
+ pdentry = ERR_PTR(-ENOMEM);
+@@ -461,6 +506,8 @@
+ struct dentry *pdentry;
+ struct inode *parent;
+
++ if (result->d_op && result->d_op->d_revalidate_it)
++ dentry->d_op = result->d_op;
+ pdentry = nfsd_findparent(dentry);
+ err = PTR_ERR(pdentry);
+ if (IS_ERR(pdentry))
+@@ -648,6 +695,11 @@
+
+ inode = dentry->d_inode;
+
++ /* cache coherency for non-device filesystems */
++ if (inode->i_op && inode->i_op->revalidate_it) {
++ inode->i_op->revalidate_it(dentry, NULL);
++ }
++
+ /* Type check. The correct error return for type mismatches
+ * does not seem to be generally agreed upon. SunOS seems to
+ * use EISDIR if file isn't S_IFREG; a comment in the NFSv3
+@@ -878,8 +930,9 @@
+ dentry->d_parent->d_name.name, dentry->d_name.name);
+ goto out;
+ out_uptodate:
+- printk(KERN_ERR "fh_update: %s/%s already up-to-date!\n",
+- dentry->d_parent->d_name.name, dentry->d_name.name);
++ if(!dentry->d_parent->d_inode->i_op->mkdir_raw)
++ printk(KERN_ERR "fh_update: %s/%s already up-to-date!\n",
++ dentry->d_parent->d_name.name, dentry->d_name.name);
+ goto out;
+ }
+
+Index: linux-bgl/fs/Makefile
+===================================================================
+--- linux-bgl.orig/fs/Makefile 2004-12-28 17:13:56.898868625 -0800
++++ linux-bgl/fs/Makefile 2004-12-28 17:13:59.943919356 -0800
+@@ -7,7 +7,8 @@
+
+ O_TARGET := fs.o
+
+-export-objs := filesystems.o open.o dcache.o buffer.o inode.o
++export-objs := filesystems.o open.o dcache.o buffer.o inode.o namei.o \
++ file_table.o
+ mod-subdirs := nls
+
+ obj-y := open.o read_write.o devices.o file_table.o buffer.o \
+Index: linux-bgl/fs/namei.c
+===================================================================
+--- linux-bgl.orig/fs/namei.c 2004-12-28 17:13:56.265835195 -0800
++++ linux-bgl/fs/namei.c 2004-12-28 17:13:59.947918720 -0800
+@@ -22,6 +22,7 @@
+ #include <linux/dnotify.h>
+ #include <linux/smp_lock.h>
+ #include <linux/personality.h>
++#include <linux/module.h>
+
+ #include <asm/namei.h>
+ #include <asm/uaccess.h>
+@@ -100,6 +101,7 @@
+ it->it_op_release(it);
+
+ }
++EXPORT_SYMBOL(intent_release);
+
+ /* In order to reduce some races, while at the same time doing additional
+ * checking and hopefully speeding things up, we copy filenames to the
+@@ -889,7 +891,8 @@
+
+
+ /* SMP-safe */
+-struct dentry * lookup_one_len(const char * name, struct dentry * base, int len)
++struct dentry * lookup_one_len_it(const char * name, struct dentry * base,
++ int len, struct lookup_intent *it)
+ {
+ unsigned long hash;
+ struct qstr this;
+@@ -909,11 +912,16 @@
+ }
+ this.hash = end_name_hash(hash);
+
+- return lookup_hash_it(&this, base, NULL);
++ return lookup_hash_it(&this, base, it);
+ access:
+ return ERR_PTR(-EACCES);
+ }
+
++struct dentry * lookup_one_len(const char * name, struct dentry * base, int len)
++{
++ return lookup_one_len_it(name, base, len, NULL);
++}
++
+ /*
+ * namei()
+ *
+Index: linux-bgl/fs/file_table.c
+===================================================================
+--- linux-bgl.orig/fs/file_table.c 2003-07-02 08:44:42.000000000 -0700
++++ linux-bgl/fs/file_table.c 2004-12-28 17:13:59.948918562 -0800
+@@ -82,7 +82,8 @@
+ * and call the open function (if any). The caller must verify that
+ * inode->i_fop is not NULL.
+ */
+-int init_private_file(struct file *filp, struct dentry *dentry, int mode)
++int init_private_file_it(struct file *filp, struct dentry *dentry, int mode,
++ struct lookup_intent *it)
+ {
+ memset(filp, 0, sizeof(*filp));
+ filp->f_mode = mode;
+@@ -90,12 +91,20 @@
+ filp->f_dentry = dentry;
+ filp->f_uid = current->fsuid;
+ filp->f_gid = current->fsgid;
++ if (it)
++ filp->f_it = it;
+ filp->f_op = dentry->d_inode->i_fop;
+ if (filp->f_op->open)
+ return filp->f_op->open(dentry->d_inode, filp);
+ else
+ return 0;
+ }
++EXPORT_SYMBOL(init_private_file_it);
++
++int init_private_file(struct file *filp, struct dentry *dentry, int mode)
++{
++ return init_private_file_it(filp, dentry, mode, NULL);
++}
+
+ void fput(struct file * file)
+ {
+Index: linux-bgl/fs/inode.c
+===================================================================
+--- linux-bgl.orig/fs/inode.c 2004-12-28 17:13:56.635910389 -0800
++++ linux-bgl/fs/inode.c 2004-12-28 17:13:59.950918244 -0800
+@@ -971,9 +971,10 @@
+ }
+
+
+-struct inode *iget4(struct super_block *sb, unsigned long ino, find_inode_t find_actor, void *opaque)
++static inline struct inode *ifind(struct super_block *sb, unsigned long ino,
++ struct list_head *head,
++ find_inode_t find_actor, void *opaque)
+ {
+- struct list_head * head = inode_hashtable + hash(sb,ino);
+ struct inode * inode;
+
+ spin_lock(&inode_lock);
+@@ -986,6 +987,24 @@
+ }
+ spin_unlock(&inode_lock);
+
++ return NULL;
++}
++
++struct inode *ilookup4(struct super_block *sb, unsigned long ino,
++ find_inode_t find_actor, void *opaque)
++{
++ struct list_head * head = inode_hashtable + hash(sb,ino);
++ return ifind(sb, ino, head, find_actor, opaque);
++}
++
++struct inode *iget4(struct super_block *sb, unsigned long ino,
++ find_inode_t find_actor, void *opaque)
++{
++ struct list_head * head = inode_hashtable + hash(sb,ino);
++ struct inode *inode = ifind(sb, ino, head, find_actor, opaque);
++ if (inode)
++ return inode;
++
+ /*
+ * get_new_inode() will do the right thing, re-trying the search
+ * in case it had to block at any point.
+Index: linux-bgl/kernel/ksyms.c
+===================================================================
+--- linux-bgl.orig/kernel/ksyms.c 2004-12-28 17:13:56.978855920 -0800
++++ linux-bgl/kernel/ksyms.c 2004-12-28 17:13:59.951918085 -0800
+@@ -142,6 +142,7 @@
+ EXPORT_SYMBOL(igrab);
+ EXPORT_SYMBOL(iunique);
+ EXPORT_SYMBOL(iget4);
++EXPORT_SYMBOL(ilookup4);
+ EXPORT_SYMBOL(iput);
+ EXPORT_SYMBOL(force_delete);
+ EXPORT_SYMBOL(follow_up);
+@@ -152,6 +153,7 @@
+ EXPORT_SYMBOL(path_release);
+ EXPORT_SYMBOL(__user_walk);
+ EXPORT_SYMBOL(lookup_one_len);
++EXPORT_SYMBOL(lookup_one_len_it);
+ EXPORT_SYMBOL(lookup_hash);
+ EXPORT_SYMBOL(sys_close);
+ EXPORT_SYMBOL(dcache_lock);
+Index: linux-bgl/include/linux/fs.h
+===================================================================
+--- linux-bgl.orig/include/linux/fs.h 2004-12-28 17:13:59.471860200 -0800
++++ linux-bgl/include/linux/fs.h 2004-12-28 17:13:59.955917450 -0800
+@@ -93,6 +93,9 @@
+ #define FS_SINGLE 8 /* Filesystem that can have only one superblock */
+ #define FS_NOMOUNT 16 /* Never mount from userland */
+ #define FS_LITTER 32 /* Keeps the tree in dcache */
++#define FS_NFSEXP_FSID 64 /* Use file system specific fsid for
++ * exporting non device filesystems.
++ */
+ #define FS_ODD_RENAME 32768 /* Temporary stuff; will go away as soon
+ * as nfs_rename() will be cleaned up
+ */
+@@ -1149,6 +1152,9 @@
+ struct nameidata *nd, struct lookup_intent *it);
+ extern struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt,
+ int flags, struct lookup_intent *it);
++extern int revalidate_it(struct dentry *dentry, struct lookup_intent *it);
++extern int init_private_file_it(struct file *, struct dentry *dentry, int mode,
++ struct lookup_intent *it);
+ extern int filp_close(struct file *, fl_owner_t id);
+ extern char * getname(const char *);
+
+@@ -1418,6 +1424,8 @@
+ extern int follow_down(struct vfsmount **, struct dentry **);
+ extern int follow_up(struct vfsmount **, struct dentry **);
+ extern struct dentry * lookup_one_len(const char *, struct dentry *, int);
++extern struct dentry * lookup_one_len_it(const char *, struct dentry *, int,
++ struct lookup_intent *);
+ extern struct dentry * lookup_hash(struct qstr *, struct dentry *);
+ #define user_path_walk(name,nd) __user_walk(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd)
+ #define user_path_walk_link(name,nd) __user_walk(name, LOOKUP_POSITIVE, nd)
+@@ -1431,6 +1439,8 @@
+
+ typedef int (*find_inode_t)(struct inode *, unsigned long, void *);
+ extern struct inode * iget4(struct super_block *, unsigned long, find_inode_t, void *);
++extern struct inode * ilookup4(struct super_block *, unsigned long,
++ find_inode_t, void *);
+ static inline struct inode *iget(struct super_block *sb, unsigned long ino)
+ {
+ return iget4(sb, ino, NULL, NULL);
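The recurring shape of this patch is a dispatch between the intent-based raw methods (which
a filesystem such as Lustre supplies) and the stock VFS calls, for example the unlink case
distilled from the hunks above:

    if (dirp->i_op && dirp->i_op->unlink_raw)
            err = unlink_raw(dentry, fname, flen, rdentry);
    else
            err = vfs_unlink(dirp, rdentry);

Filesystems without the raw methods see no behaviour change, while an intent-aware
filesystem performs the whole operation in one call instead of a lookup/operation pair.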
--- /dev/null
+Index: linux-2.4.19-pre1/include/linux/sched.h
+===================================================================
+--- linux-2.4.19-pre1.orig/include/linux/sched.h 2003-11-21 04:05:05.000000000 +0300
++++ linux-2.4.19-pre1/include/linux/sched.h 2003-11-21 04:10:29.000000000 +0300
+@@ -927,6 +927,11 @@
+ return res;
+ }
+
++static inline int need_resched(void)
++{
++ return (unlikely(current->need_resched));
++}
++
+ #endif /* __KERNEL__ */
+
+ #endif
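need_resched() just tests current->need_resched under unlikely(), giving long-running
kernel loops a cheap way to yield politely (sketch; must run in process context for
schedule() to be legal):

    static void long_running_work(void)
    {
            int i;

            for (i = 0; i < 1000000; i++) {
                    /* ... one unit of work ... */
                    if (need_resched())
                            schedule();
            }
    }

The helper also matches the 2.5/2.6 spelling, so code shared across trees can use one name.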
--- /dev/null
+ include/linux/socket.h | 4 ++++
+ net/netsyms.c | 2 ++
+ net/socket.c | 2 +-
+ 3 files changed, 7 insertions(+), 1 deletion(-)
+
+Index: linux-DRV401/include/linux/socket.h
+===================================================================
+--- linux-DRV401.orig/include/linux/socket.h 2004-10-15 10:26:20.000000000 -0700
++++ linux-DRV401/include/linux/socket.h 2004-10-15 11:11:09.000000000 -0700
+@@ -260,6 +260,10 @@
+ extern int move_addr_to_user(void *kaddr, int klen, void *uaddr, int *ulen);
+ extern int move_addr_to_kernel(void *uaddr, int ulen, void *kaddr);
+ extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data);
++struct socket;
++extern int sock_map_fd(struct socket *sock);
++extern struct socket *sockfd_lookup(int fd, int *err);
++
+ #endif
+ #endif /* not kernel and not glibc */
+ #endif /* _LINUX_SOCKET_H */
+Index: linux-DRV401/net/netsyms.c
+===================================================================
+--- linux-DRV401.orig/net/netsyms.c 2004-10-15 11:10:52.000000000 -0700
++++ linux-DRV401/net/netsyms.c 2004-10-15 11:11:09.000000000 -0700
+@@ -159,6 +159,8 @@
+ EXPORT_SYMBOL(put_cmsg);
+ EXPORT_SYMBOL(sock_kmalloc);
+ EXPORT_SYMBOL(sock_kfree_s);
++EXPORT_SYMBOL(sockfd_lookup);
++EXPORT_SYMBOL(sock_map_fd);
+
+ #ifdef CONFIG_FILTER
+ EXPORT_SYMBOL(sk_run_filter);
+Index: linux-DRV401/net/socket.c
+===================================================================
+--- linux-DRV401.orig/net/socket.c 2004-10-15 10:24:16.000000000 -0700
++++ linux-DRV401/net/socket.c 2004-10-15 11:11:09.000000000 -0700
+@@ -326,7 +326,7 @@
+ * but we take care of internal coherence yet.
+ */
+
+-static int sock_map_fd(struct socket *sock)
++int sock_map_fd(struct socket *sock)
+ {
+ int fd;
+ struct qstr this;
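With sockfd_lookup() and sock_map_fd() exported, a module can translate a user-supplied
file descriptor into a struct socket and back, e.g. (sketch):

    struct socket *sock;
    int err;

    sock = sockfd_lookup(fd, &err);         /* NULL on failure, reason in err */
    if (sock == NULL)
            return err;
    /* ... use sock; on 2.4, release the reference with fput(sock->file) ... */

This is the hook Lustre's TCP network driver uses to take over connections that were set up
by userspace helpers and hand the live sockets to the kernel.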
--- /dev/null
+Index: linux-2.4.19-pre1/include/linux/skbuff.h
+===================================================================
+--- linux-2.4.19-pre1.orig/include/linux/skbuff.h 2001-11-22 22:46:26.000000000 +0300
++++ linux-2.4.19-pre1/include/linux/skbuff.h 2004-01-14 01:15:13.000000000 +0300
+@@ -116,6 +116,30 @@
+ __u16 size;
+ };
+
++/* Support for callback when skb data has been released */
++typedef struct zccd /* Zero Copy Callback Descriptor */
++{ /* (embed as first member of custom struct) */
++ atomic_t zccd_count; /* reference count */
++ void (*zccd_destructor)(struct zccd *); /* callback when refcount reaches zero */
++} zccd_t;
++
++static inline void zccd_init (zccd_t *d, void (*callback)(zccd_t *))
++{
++ atomic_set (&d->zccd_count, 1);
++ d->zccd_destructor = callback;
++}
++
++static inline void zccd_get (zccd_t *d) /* take a reference */
++{
++ atomic_inc (&d->zccd_count);
++}
++
++static inline void zccd_put (zccd_t *d) /* release a reference */
++{
++ if (atomic_dec_and_test (&d->zccd_count))
++ (d->zccd_destructor)(d);
++}
++
+ /* This data is invariant across clones and lives at
+ * the end of the header data, ie. at skb->end.
+ */
+@@ -123,6 +147,12 @@
+ atomic_t dataref;
+ unsigned int nr_frags;
+ struct sk_buff *frag_list;
++ zccd_t *zccd; /* zero copy descriptor */
++ zccd_t *zccd2; /* 2nd zero copy descriptor */
++ /* NB we expect zero-copy data to be at least 1 packet, so
++	 * having 2 zccds means we don't unnecessarily split the packet
++	 * where consecutive zero-copy sends abut.
++ */
+ skb_frag_t frags[MAX_SKB_FRAGS];
+ };
+
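The zccd is the completion handle for zero-copy sends: a caller embeds it at the start of a
private structure, the TCP stack takes a reference for every skb that carries the caller's
pages, and the destructor runs when the last such skb is freed, i.e. when the pages may be
reused. A caller-side sketch (names hypothetical; tcp_sendpage_zccd() is declared in the
net/tcp.h hunk below; note the callback can fire from softirq context via kfree_skb(), so
it must not sleep):

    struct my_zc_tx {
            zccd_t       zccd;              /* must be first so the cast works */
            struct page *page;
    };

    static void my_zc_complete(zccd_t *d)
    {
            struct my_zc_tx *tx = (struct my_zc_tx *)d;

            /* the network stack no longer references tx->page */
    }

    /* usage:
     *      zccd_init(&tx->zccd, my_zc_complete);
     *      rc = tcp_sendpage_zccd(sock, tx->page, off, len, flags, &tx->zccd);
     *      zccd_put(&tx->zccd);            -- drop the initial reference
     */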
+Index: linux-2.4.19-pre1/include/net/tcp.h
+===================================================================
+--- linux-2.4.19-pre1.orig/include/net/tcp.h 2001-11-22 22:47:22.000000000 +0300
++++ linux-2.4.19-pre1/include/net/tcp.h 2004-01-14 01:15:13.000000000 +0300
+@@ -640,6 +640,8 @@
+
+ extern int tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size);
+ extern ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags);
++extern ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
++ int flags, zccd_t *zccd);
+
+ extern int tcp_ioctl(struct sock *sk,
+ int cmd,
+@@ -733,6 +735,9 @@
+ struct msghdr *msg,
+ int len, int nonblock,
+ int flags, int *addr_len);
++extern int tcp_recvpackets(struct sock *sk,
++ struct sk_buff_head *packets,
++ int len, int nonblock);
+
+ extern int tcp_listen_start(struct sock *sk);
+
+Index: linux-2.4.19-pre1/net/netsyms.c
+===================================================================
+--- linux-2.4.19-pre1.orig/net/netsyms.c 2004-01-14 01:10:37.000000000 +0300
++++ linux-2.4.19-pre1/net/netsyms.c 2004-01-14 01:15:54.000000000 +0300
+@@ -409,6 +409,9 @@
+
+ #endif
+
++EXPORT_SYMBOL(tcp_sendpage_zccd);
++EXPORT_SYMBOL(tcp_recvpackets);
++
+ EXPORT_SYMBOL(netlink_set_err);
+ EXPORT_SYMBOL(netlink_broadcast);
+ EXPORT_SYMBOL(netlink_unicast);
+Index: linux-2.4.19-pre1/net/core/skbuff.c
+===================================================================
+--- linux-2.4.19-pre1.orig/net/core/skbuff.c 2001-12-21 20:42:05.000000000 +0300
++++ linux-2.4.19-pre1/net/core/skbuff.c 2004-01-14 01:15:13.000000000 +0300
+@@ -208,6 +208,8 @@
+ atomic_set(&(skb_shinfo(skb)->dataref), 1);
+ skb_shinfo(skb)->nr_frags = 0;
+ skb_shinfo(skb)->frag_list = NULL;
++ skb_shinfo(skb)->zccd = NULL; /* skbuffs kick off with NO user zero copy descriptors */
++ skb_shinfo(skb)->zccd2 = NULL;
+ return skb;
+
+ nodata:
+@@ -276,6 +278,10 @@
+ {
+ if (!skb->cloned ||
+ atomic_dec_and_test(&(skb_shinfo(skb)->dataref))) {
++ if (skb_shinfo(skb)->zccd != NULL) /* zero copy callback descriptor? */
++ zccd_put (skb_shinfo(skb)->zccd); /* release hold */
++ if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd zero copy callback descriptor? */
++ zccd_put (skb_shinfo(skb)->zccd2); /* release hold */
+ if (skb_shinfo(skb)->nr_frags) {
+ int i;
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+@@ -532,6 +538,8 @@
+ atomic_set(&(skb_shinfo(skb)->dataref), 1);
+ skb_shinfo(skb)->nr_frags = 0;
+ skb_shinfo(skb)->frag_list = NULL;
++ skb_shinfo(skb)->zccd = NULL; /* copied data => no user zero copy descriptor */
++ skb_shinfo(skb)->zccd2 = NULL;
+
+ /* We are no longer a clone, even if we were. */
+ skb->cloned = 0;
+@@ -578,6 +586,14 @@
+ n->data_len = skb->data_len;
+ n->len = skb->len;
+
++ if (skb_shinfo(skb)->zccd != NULL) /* user zero copy descriptor? */
++ zccd_get (skb_shinfo(skb)->zccd); /* 1 more ref (pages are shared) */
++ skb_shinfo(n)->zccd = skb_shinfo(skb)->zccd;
++
++ if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd user zero copy descriptor? */
++ zccd_get (skb_shinfo(skb)->zccd2); /* 1 more ref (pages are shared) */
++ skb_shinfo(n)->zccd2 = skb_shinfo(skb)->zccd2;
++
+ if (skb_shinfo(skb)->nr_frags) {
+ int i;
+
+@@ -620,6 +636,8 @@
+ u8 *data;
+ int size = nhead + (skb->end - skb->head) + ntail;
+ long off;
++ zccd_t *zccd = skb_shinfo(skb)->zccd; /* stash user zero copy descriptor */
++ zccd_t *zccd2 = skb_shinfo(skb)->zccd2; /* stash 2nd user zero copy descriptor */
+
+ if (skb_shared(skb))
+ BUG();
+@@ -641,6 +659,11 @@
+ if (skb_shinfo(skb)->frag_list)
+ skb_clone_fraglist(skb);
+
++ if (zccd != NULL) /* user zero copy descriptor? */
++ zccd_get (zccd); /* extra ref (pages are shared) */
++ if (zccd2 != NULL) /* 2nd user zero copy descriptor? */
++ zccd_get (zccd2); /* extra ref (pages are shared) */
++
+ skb_release_data(skb);
+
+ off = (data+nhead) - skb->head;
+@@ -655,6 +678,8 @@
+ skb->nh.raw += off;
+ skb->cloned = 0;
+ atomic_set(&skb_shinfo(skb)->dataref, 1);
++ skb_shinfo(skb)->zccd = zccd;
++ skb_shinfo(skb)->zccd2 = zccd2;
+ return 0;
+
+ nodata:
+Index: linux-2.4.19-pre1/net/ipv4/tcp.c
+===================================================================
+--- linux-2.4.19-pre1.orig/net/ipv4/tcp.c 2001-12-21 20:42:05.000000000 +0300
++++ linux-2.4.19-pre1/net/ipv4/tcp.c 2004-01-14 01:15:13.000000000 +0300
+@@ -744,7 +744,7 @@
+ goto out;
+ }
+
+-ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags);
++ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags, zccd_t *zccd);
+
+ static inline int
+ can_coalesce(struct sk_buff *skb, int i, struct page *page, int off)
+@@ -823,7 +823,8 @@
+ return err;
+ }
+
+-ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags)
++/* Extra parameter: user zero copy descriptor (or NULL if not doing that) */
++ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags, zccd_t *zccd)
+ {
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ int mss_now;
+@@ -871,6 +872,17 @@
+ copy = size;
+
+ i = skb_shinfo(skb)->nr_frags;
++
++ if (zccd != NULL && /* this is a zcc I/O */
++ skb_shinfo(skb)->zccd != NULL && /* skb is part of a zcc I/O */
++ skb_shinfo(skb)->zccd2 != NULL &&
++ skb_shinfo(skb)->zccd != zccd && /* not the same one */
++ skb_shinfo(skb)->zccd2 != zccd)
++ {
++ tcp_mark_push (tp, skb);
++ goto new_segment;
++ }
++
+ if (can_coalesce(skb, i, page, offset)) {
+ skb_shinfo(skb)->frags[i-1].size += copy;
+ } else if (i < MAX_SKB_FRAGS) {
+@@ -881,6 +893,20 @@
+ goto new_segment;
+ }
+
++ if (zccd != NULL && /* this is a zcc I/O */
++ skb_shinfo(skb)->zccd != zccd && /* not already referencing this zccd */
++ skb_shinfo(skb)->zccd2 != zccd)
++ {
++ zccd_get (zccd); /* bump ref count */
++
++ BUG_TRAP (skb_shinfo(skb)->zccd2 == NULL);
++
++ if (skb_shinfo(skb)->zccd == NULL) /* reference this zccd */
++ skb_shinfo(skb)->zccd = zccd;
++ else
++ skb_shinfo(skb)->zccd2 = zccd;
++ }
++
+ skb->len += copy;
+ skb->data_len += copy;
+ skb->ip_summed = CHECKSUM_HW;
+@@ -944,7 +970,31 @@
+
+ lock_sock(sk);
+ TCP_CHECK_TIMER(sk);
+- res = do_tcp_sendpages(sk, &page, offset, size, flags);
++ res = do_tcp_sendpages(sk, &page, offset, size, flags, NULL);
++ TCP_CHECK_TIMER(sk);
++ release_sock(sk);
++ return res;
++}
++
++ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
++ int flags, zccd_t *zccd)
++{
++ ssize_t res;
++ struct sock *sk = sock->sk;
++
++#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM)
++
++ if (!(sk->route_caps & NETIF_F_SG) || /* caller shouldn't waste her time */
++ !(sk->route_caps & TCP_ZC_CSUM_FLAGS)) /* on double mapping */
++ BUG ();
++
++#undef TCP_ZC_CSUM_FLAGS
++
++ lock_sock(sk);
++ TCP_CHECK_TIMER(sk);
++
++ res = do_tcp_sendpages(sk, &page, offset, size, flags, zccd);
++
+ TCP_CHECK_TIMER(sk);
+ release_sock(sk);
+ return res;
+@@ -1683,6 +1733,202 @@
+ goto out;
+ }
+
++int tcp_recvpackets (struct sock *sk, struct sk_buff_head *packets,
++ int len, int nonblock)
++{
++ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
++ int copied;
++ long timeo;
++
++ BUG_TRAP (len > 0);
++ /*BUG_TRAP ((flags & (MSG_OOB | MSG_PEEK | MSG_TRUNC)) == 0);*/
++
++ lock_sock(sk);
++
++ TCP_CHECK_TIMER(sk);
++
++ copied = -ENOTCONN;
++ if (sk->state == TCP_LISTEN)
++ goto out;
++
++ copied = 0;
++ timeo = sock_rcvtimeo(sk, nonblock);
++
++ do {
++ struct sk_buff * skb;
++ u32 offset;
++ unsigned long used;
++ int exhausted;
++ int eaten;
++
++ /* Are we at urgent data? Stop if we have read anything. */
++ if (copied && tp->urg_data && tp->urg_seq == tp->copied_seq)
++ break;
++
++ /* We need to check signals first, to get correct SIGURG
++ * handling. FIXME: Need to check this doesn't impact 1003.1g
++ * and move it down to the bottom of the loop
++ */
++ if (signal_pending(current)) {
++ if (copied)
++ break;
++ copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
++ break;
++ }
++
++ /* Next get a buffer. */
++
++ skb = skb_peek(&sk->receive_queue);
++
++ if (skb == NULL) /* nothing ready */
++ {
++ if (copied) {
++ if (sk->err ||
++ sk->state == TCP_CLOSE ||
++ (sk->shutdown & RCV_SHUTDOWN) ||
++ !timeo ||
++ (0))
++ break;
++ } else {
++ if (sk->done)
++ break;
++
++ if (sk->err) {
++ copied = sock_error(sk);
++ break;
++ }
++
++ if (sk->shutdown & RCV_SHUTDOWN)
++ break;
++
++ if (sk->state == TCP_CLOSE) {
++ if (!sk->done) {
++ /* This occurs when user tries to read
++ * from a never-connected socket.
++ */
++ copied = -ENOTCONN;
++ break;
++ }
++ break;
++ }
++
++ if (!timeo) {
++ copied = -EAGAIN;
++ break;
++ }
++ }
++
++ cleanup_rbuf(sk, copied);
++ timeo = tcp_data_wait(sk, timeo);
++ continue;
++ }
++
++ BUG_TRAP (atomic_read (&skb->users) == 1);
++
++ exhausted = eaten = 0;
++
++ offset = tp->copied_seq - TCP_SKB_CB(skb)->seq;
++ if (skb->h.th->syn)
++ offset--;
++
++ used = skb->len - offset;
++
++ if (tp->urg_data) {
++ u32 urg_offset = tp->urg_seq - tp->copied_seq;
++ if (urg_offset < used) {
++ if (!urg_offset) { /* at urgent data */
++ if (!sk->urginline) {
++ tp->copied_seq++; /* discard the single byte of urgent data */
++ offset++;
++ used--;
++ }
++ } else /* truncate read */
++ used = urg_offset;
++ }
++ }
++
++ BUG_TRAP (used >= 0);
++ if (len < used)
++ used = len;
++
++ if (used == 0)
++ exhausted = 1;
++ else
++ {
++ if (skb_is_nonlinear (skb))
++ {
++ int rc = skb_linearize (skb, GFP_KERNEL);
++
++ printk ("tcp_recvpackets(): linearising: %d\n", rc);
++
++ if (rc)
++ {
++ if (!copied)
++ copied = rc;
++ break;
++ }
++ }
++
++ if ((offset + used) == skb->len) /* consuming the whole packet */
++ {
++ __skb_unlink (skb, &sk->receive_queue);
++ dst_release (skb->dst);
++ skb_orphan (skb);
++ __skb_pull (skb, offset);
++ __skb_queue_tail (packets, skb);
++ exhausted = eaten = 1;
++ }
++ else /* consuming only part of the packet */
++ {
++ struct sk_buff *skb2 = skb_clone (skb, GFP_KERNEL);
++
++ if (skb2 == NULL)
++ {
++ if (!copied)
++ copied = -ENOMEM;
++ break;
++ }
++
++ dst_release (skb2->dst);
++ __skb_pull (skb2, offset);
++ __skb_trim (skb2, used);
++ __skb_queue_tail (packets, skb2);
++ }
++
++ tp->copied_seq += used;
++ copied += used;
++ len -= used;
++ }
++
++ if (tp->urg_data && after(tp->copied_seq,tp->urg_seq)) {
++ tp->urg_data = 0;
++ tcp_fast_path_check(sk, tp);
++ }
++
++ if (!exhausted)
++ continue;
++
++ if (skb->h.th->fin)
++ {
++ tp->copied_seq++;
++ if (!eaten)
++ tcp_eat_skb (sk, skb);
++ break;
++ }
++
++ if (!eaten)
++ tcp_eat_skb (sk, skb);
++
++ } while (len > 0);
++
++ out:
++ /* Clean up data we have read: This will do ACK frames. */
++ cleanup_rbuf(sk, copied);
++ TCP_CHECK_TIMER(sk);
++ release_sock(sk);
++ return copied;
++}
++
+ /*
+ * State processing on a close. This implements the state shift for
+ * sending our FIN frame. Note that we only send a FIN for some
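Usage sketch (editor's annotation, not part of the patch): tcp_sendpage_zccd() ties the caller's zero-copy descriptor to every skb fragment built from the passed page, and the descriptor's callback fires only once TCP — including retransmissions — has dropped its last reference; tcp_recvpackets() is the receive-side counterpart that hands whole skbs to the caller instead of copying. A minimal synchronous sender might look like the following; zccd_init() and zccd_put() are assumed to be the refcount helpers the skbuff half of this patch introduces, with the count starting at 1:

#include <linux/completion.h>
#include <linux/list.h>                 /* list_entry() as container_of() */

struct zcc_send {
        zccd_t            zs_zccd;      /* embedded descriptor */
        struct completion zs_done;
};

static void zcc_send_fini(zccd_t *zccd)
{
        struct zcc_send *zs = list_entry(zccd, struct zcc_send, zs_zccd);

        complete(&zs->zs_done);         /* pages no longer referenced by TCP */
}

static int send_page_sync(struct socket *sock, struct page *page,
                          int offset, size_t size)
{
        struct zcc_send zs;
        int rc;

        zccd_init(&zs.zs_zccd, zcc_send_fini);  /* assumed helper */
        init_completion(&zs.zs_done);

        rc = tcp_sendpage_zccd(sock, page, offset, size, 0, &zs.zs_zccd);

        zccd_put(&zs.zs_zccd);                  /* drop our initial ref */
        wait_for_completion(&zs.zs_done);       /* wait out TCP's refs */
        return rc;
}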
--- /dev/null
+ fs/dcache.c | 19 ++
+ fs/exec.c | 17 +-
+ fs/namei.c | 295 +++++++++++++++++++++++++++++++++++++++-------
+ fs/namespace.c | 28 +++-
+ fs/open.c | 172 +++++++++++++++++++-------
+ fs/stat.c | 52 +++++---
+ include/linux/dcache.h | 60 +++++++++
+ include/linux/fs.h | 32 ++++
+ include/linux/fs_struct.h | 4
+ kernel/exit.c | 3
+ kernel/fork.c | 3
+ kernel/ksyms.c | 1
+ 12 files changed, 558 insertions(+), 128 deletions(-)
+
+Index: linux.mcp2/fs/dcache.c
+===================================================================
+--- linux.mcp2.orig/fs/dcache.c 2004-01-19 07:49:43.000000000 -0800
++++ linux.mcp2/fs/dcache.c 2004-05-05 14:19:59.000000000 -0700
+@@ -181,6 +181,13 @@
+ spin_unlock(&dcache_lock);
+ return 0;
+ }
++
++ /* network invalidation by Lustre */
++ if (dentry->d_flags & DCACHE_LUSTRE_INVALID) {
++ spin_unlock(&dcache_lock);
++ return 0;
++ }
++
+ /*
+ * Check whether to do a partial shrink_dcache
+ * to get rid of unused child entries.
+@@ -830,13 +837,19 @@
+ * Adds a dentry to the hash according to its name.
+ */
+
+-void d_rehash(struct dentry * entry)
++void __d_rehash(struct dentry * entry, int lock)
+ {
+ struct list_head *list = d_hash(entry->d_parent, entry->d_name.hash);
+ if (!list_empty(&entry->d_hash)) BUG();
+- spin_lock(&dcache_lock);
++ if (lock) spin_lock(&dcache_lock);
+ list_add(&entry->d_hash, list);
+- spin_unlock(&dcache_lock);
++ if (lock) spin_unlock(&dcache_lock);
++}
++EXPORT_SYMBOL(__d_rehash);
++
++void d_rehash(struct dentry * entry)
++{
++ __d_rehash(entry, 1);
+ }
+
+ #define do_switch(x,y) do { \
+Index: linux.mcp2/fs/exec.c
+===================================================================
+--- linux.mcp2.orig/fs/exec.c 2004-01-19 07:49:43.000000000 -0800
++++ linux.mcp2/fs/exec.c 2004-05-05 14:19:59.000000000 -0700
+@@ -107,8 +107,10 @@
+ struct file * file;
+ struct nameidata nd;
+ int error;
++ struct lookup_intent it = { .it_op = IT_OPEN,
++ .it_flags = FMODE_READ|FMODE_EXEC };
+
+- error = user_path_walk(library, &nd);
++ error = user_path_walk_it(library, &nd, &it);
+ if (error)
+ goto out;
+
+@@ -120,7 +122,8 @@
+ if (error)
+ goto exit;
+
+- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
++ file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &it);
++ intent_release(&it);
+ error = PTR_ERR(file);
+ if (IS_ERR(file))
+ goto out;
+@@ -342,9 +345,11 @@
+ struct inode *inode;
+ struct file *file;
+ int err = 0;
++ struct lookup_intent it = { .it_op = IT_OPEN,
++ .it_flags = FMODE_READ|FMODE_EXEC };
+
+ if (path_init(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd))
+- err = path_walk(name, &nd);
++ err = path_walk_it(name, &nd, &it);
+ file = ERR_PTR(err);
+ if (!err) {
+ inode = nd.dentry->d_inode;
+@@ -356,7 +361,8 @@
+ err = -EACCES;
+ file = ERR_PTR(err);
+ if (!err) {
+- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
++ file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &it);
++ intent_release(&it);
+ if (!IS_ERR(file)) {
+ err = deny_write_access(file);
+ if (err) {
+@@ -368,6 +374,7 @@
+ return file;
+ }
+ }
++ intent_release(&it);
+ path_release(&nd);
+ }
+ goto out;
+@@ -969,7 +976,7 @@
+ goto close_fail;
+ if (!file->f_op->write)
+ goto close_fail;
+- if (do_truncate(file->f_dentry, 0) != 0)
++ if (do_truncate(file->f_dentry, 0, 0) != 0)
+ goto close_fail;
+
+ retval = binfmt->core_dump(signr, regs, file);
+Index: linux.mcp2/fs/namei.c
+===================================================================
+--- linux.mcp2.orig/fs/namei.c 2004-01-19 07:49:43.000000000 -0800
++++ linux.mcp2/fs/namei.c 2004-05-05 14:28:26.000000000 -0700
+@@ -94,6 +94,13 @@
+ * XEmacs seems to be relying on it...
+ */
+
++void intent_release(struct lookup_intent *it)
++{
++ if (it && it->it_op_release)
++ it->it_op_release(it);
++
++}
++
+ /* In order to reduce some races, while at the same time doing additional
+ * checking and hopefully speeding things up, we copy filenames to the
+ * kernel data space before using them..
+@@ -260,10 +267,19 @@
+ * Internal lookup() using the new generic dcache.
+ * SMP-safe
+ */
+-static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, int flags)
++static struct dentry *cached_lookup(struct dentry *parent, struct qstr *name,
++ int flags, struct lookup_intent *it)
+ {
+ struct dentry * dentry = d_lookup(parent, name);
+
++ if (dentry && dentry->d_op && dentry->d_op->d_revalidate_it) {
++ if (!dentry->d_op->d_revalidate_it(dentry, flags, it) &&
++ !d_invalidate(dentry)) {
++ dput(dentry);
++ dentry = NULL;
++ }
++ return dentry;
++ } else
+ if (dentry && dentry->d_op && dentry->d_op->d_revalidate) {
+ if (!dentry->d_op->d_revalidate(dentry, flags) && !d_invalidate(dentry)) {
+ dput(dentry);
+@@ -281,11 +297,15 @@
+ * make sure that nobody added the entry to the dcache in the meantime..
+ * SMP-safe
+ */
+-static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, int flags)
++static struct dentry *real_lookup(struct dentry *parent, struct qstr *name,
++ int flags, struct lookup_intent *it)
+ {
+ struct dentry * result;
+ struct inode *dir = parent->d_inode;
++ int counter = 0;
+
++again:
++ counter++;
+ down(&dir->i_sem);
+ /*
+ * First re-do the cached lookup just in case it was created
+@@ -300,6 +320,9 @@
+ result = ERR_PTR(-ENOMEM);
+ if (dentry) {
+ lock_kernel();
++ if (dir->i_op->lookup_it)
++ result = dir->i_op->lookup_it(dir, dentry, it, flags);
++ else
+ result = dir->i_op->lookup(dir, dentry);
+ unlock_kernel();
+ if (result)
+@@ -321,6 +344,15 @@
+ dput(result);
+ result = ERR_PTR(-ENOENT);
+ }
++ } else if (result->d_op && result->d_op->d_revalidate_it) {
++ if (!result->d_op->d_revalidate_it(result, flags, it) &&
++ !d_invalidate(result)) {
++ dput(result);
++ if (counter > 10)
++ result = ERR_PTR(-ESTALE);
++ if (!IS_ERR(result))
++ goto again;
++ }
+ }
+ return result;
+ }
+@@ -332,7 +364,8 @@
+ * Without that kind of total limit, nasty chains of consecutive
+ * symlinks can cause almost arbitrarily long lookups.
+ */
+-static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd)
++static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd,
++ struct lookup_intent *it)
+ {
+ int err;
+ if (current->link_count >= 5)
+@@ -346,10 +379,12 @@
+ current->link_count++;
+ current->total_link_count++;
+ UPDATE_ATIME(dentry->d_inode);
++ nd->intent = it;
+ err = dentry->d_inode->i_op->follow_link(dentry, nd);
+ current->link_count--;
+ return err;
+ loop:
++ intent_release(it);
+ path_release(nd);
+ return -ELOOP;
+ }
+@@ -447,7 +482,8 @@
+ *
+ * We expect 'base' to be positive and a directory.
+ */
+-int link_path_walk(const char * name, struct nameidata *nd)
++int link_path_walk_it(const char *name, struct nameidata *nd,
++ struct lookup_intent *it)
+ {
+ struct dentry *dentry;
+ struct inode *inode;
+@@ -520,9 +556,10 @@
+ break;
+ }
+ /* This does the actual lookups.. */
+- dentry = cached_lookup(nd->dentry, &this, LOOKUP_CONTINUE);
++ dentry = cached_lookup(nd->dentry, &this, LOOKUP_CONTINUE, NULL);
+ if (!dentry) {
+- dentry = real_lookup(nd->dentry, &this, LOOKUP_CONTINUE);
++ dentry = real_lookup(nd->dentry, &this, LOOKUP_CONTINUE,
++ NULL);
+ err = PTR_ERR(dentry);
+ if (IS_ERR(dentry))
+ break;
+@@ -540,7 +577,7 @@
+ goto out_dput;
+
+ if (inode->i_op->follow_link) {
+- err = do_follow_link(dentry, nd);
++ err = do_follow_link(dentry, nd, NULL);
+ dput(dentry);
+ if (err)
+ goto return_err;
+@@ -556,7 +593,7 @@
+ nd->dentry = dentry;
+ }
+ err = -ENOTDIR;
+- if (!inode->i_op->lookup)
++ if (!inode->i_op->lookup && !inode->i_op->lookup_it)
+ break;
+ continue;
+ /* here ends the main loop */
+@@ -583,9 +620,9 @@
+ if (err < 0)
+ break;
+ }
+- dentry = cached_lookup(nd->dentry, &this, 0);
++ dentry = cached_lookup(nd->dentry, &this, 0, it);
+ if (!dentry) {
+- dentry = real_lookup(nd->dentry, &this, 0);
++ dentry = real_lookup(nd->dentry, &this, 0, it);
+ err = PTR_ERR(dentry);
+ if (IS_ERR(dentry))
+ break;
+@@ -595,7 +632,7 @@
+ inode = dentry->d_inode;
+ if ((lookup_flags & LOOKUP_FOLLOW)
+ && inode && inode->i_op && inode->i_op->follow_link) {
+- err = do_follow_link(dentry, nd);
++ err = do_follow_link(dentry, nd, it);
+ dput(dentry);
+ if (err)
+ goto return_err;
+@@ -609,7 +646,8 @@
+ goto no_inode;
+ if (lookup_flags & LOOKUP_DIRECTORY) {
+ err = -ENOTDIR;
+- if (!inode->i_op || !inode->i_op->lookup)
++ if (!inode->i_op ||
++ (!inode->i_op->lookup && !inode->i_op->lookup_it))
+ break;
+ }
+ goto return_base;
+@@ -633,6 +671,34 @@
+ * Check the cached dentry for staleness.
+ */
+ dentry = nd->dentry;
++ if (dentry && dentry->d_op && dentry->d_op->d_revalidate_it) {
++ err = -ESTALE;
++ if (!dentry->d_op->d_revalidate_it(dentry, 0, it)) {
++ struct dentry *new;
++ err = permission(dentry->d_parent->d_inode,
++ MAY_EXEC);
++ if (err)
++ break;
++ new = real_lookup(dentry->d_parent,
++ &dentry->d_name, 0, NULL);
++ if (IS_ERR(new)) {
++ err = PTR_ERR(new);
++ break;
++ }
++ d_invalidate(dentry);
++ dput(dentry);
++ nd->dentry = new;
++ }
++ if (!nd->dentry->d_inode)
++ goto no_inode;
++ if (lookup_flags & LOOKUP_DIRECTORY) {
++ err = -ENOTDIR;
++ if (!nd->dentry->d_inode->i_op ||
++ (!nd->dentry->d_inode->i_op->lookup &&
++ !nd->dentry->d_inode->i_op->lookup_it))
++ break;
++ }
++ } else
+ if (dentry && dentry->d_op && dentry->d_op->d_revalidate) {
+ err = -ESTALE;
+ if (!dentry->d_op->d_revalidate(dentry, 0)) {
+@@ -646,15 +703,28 @@
+ dput(dentry);
+ break;
+ }
++ if (err)
++ intent_release(it);
+ path_release(nd);
+ return_err:
+ return err;
+ }
+
++int link_path_walk(const char * name, struct nameidata *nd)
++{
++ return link_path_walk_it(name, nd, NULL);
++}
++
++int path_walk_it(const char * name, struct nameidata *nd, struct lookup_intent *it)
++{
++ current->total_link_count = 0;
++ return link_path_walk_it(name, nd, it);
++}
++
+ int path_walk(const char * name, struct nameidata *nd)
+ {
+ current->total_link_count = 0;
+- return link_path_walk(name, nd);
++ return link_path_walk_it(name, nd, NULL);
+ }
+
+ /* SMP-safe */
+@@ -743,6 +813,7 @@
+ {
+ nd->last_type = LAST_ROOT; /* if there are only slashes... */
+ nd->flags = flags;
++ nd->intent = NULL;
+ if (*name=='/')
+ return walk_init_root(name,nd);
+ read_lock(¤t->fs->lock);
+@@ -757,7 +828,8 @@
+ * needs parent already locked. Doesn't follow mounts.
+ * SMP-safe.
+ */
+-struct dentry * lookup_hash(struct qstr *name, struct dentry * base)
++struct dentry * lookup_hash_it(struct qstr *name, struct dentry * base,
++ struct lookup_intent *it)
+ {
+ struct dentry * dentry;
+ struct inode *inode;
+@@ -780,13 +852,16 @@
+ goto out;
+ }
+
+- dentry = cached_lookup(base, name, 0);
++ dentry = cached_lookup(base, name, 0, it);
+ if (!dentry) {
+ struct dentry *new = d_alloc(base, name);
+ dentry = ERR_PTR(-ENOMEM);
+ if (!new)
+ goto out;
+ lock_kernel();
++ if (inode->i_op->lookup_it)
++ dentry = inode->i_op->lookup_it(inode, new, it, 0);
++ else
+ dentry = inode->i_op->lookup(inode, new);
+ unlock_kernel();
+ if (!dentry)
+@@ -798,6 +873,12 @@
+ return dentry;
+ }
+
++struct dentry * lookup_hash(struct qstr *name, struct dentry * base)
++{
++ return lookup_hash_it(name, base, NULL);
++}
++
++
+ /* SMP-safe */
+ struct dentry * lookup_one_len(const char * name, struct dentry * base, int len)
+ {
+@@ -819,7 +900,7 @@
+ }
+ this.hash = end_name_hash(hash);
+
+- return lookup_hash(&this, base);
++ return lookup_hash_it(&this, base, NULL);
+ access:
+ return ERR_PTR(-EACCES);
+ }
+@@ -851,6 +932,23 @@
+ return err;
+ }
+
++int __user_walk_it(const char *name, unsigned flags, struct nameidata *nd,
++ struct lookup_intent *it)
++{
++ char *tmp;
++ int err;
++
++ tmp = getname(name);
++ err = PTR_ERR(tmp);
++ if (!IS_ERR(tmp)) {
++ err = 0;
++ if (path_init(tmp, flags, nd))
++ err = path_walk_it(tmp, nd, it);
++ putname(tmp);
++ }
++ return err;
++}
++
+ /*
+ * It's inline, so penalty for filesystems that don't use sticky bit is
+ * minimal.
+@@ -946,7 +1044,8 @@
+ return retval;
+ }
+
+-int vfs_create(struct inode *dir, struct dentry *dentry, int mode)
++static int vfs_create_it(struct inode *dir, struct dentry *dentry, int mode,
++ struct lookup_intent *it)
+ {
+ int error;
+
+@@ -959,12 +1058,15 @@
+ goto exit_lock;
+
+ error = -EACCES; /* shouldn't it be ENOSYS? */
+- if (!dir->i_op || !dir->i_op->create)
++ if (!dir->i_op || (!dir->i_op->create && !dir->i_op->create_it))
+ goto exit_lock;
+
+ DQUOT_INIT(dir);
+ lock_kernel();
+- error = dir->i_op->create(dir, dentry, mode);
++ if (dir->i_op->create_it)
++ error = dir->i_op->create_it(dir, dentry, mode, it);
++ else
++ error = dir->i_op->create(dir, dentry, mode);
+ unlock_kernel();
+ exit_lock:
+ up(&dir->i_zombie);
+@@ -973,6 +1075,11 @@
+ return error;
+ }
+
++int vfs_create(struct inode *dir, struct dentry *dentry, int mode)
++{
++ return vfs_create_it(dir, dentry, mode, NULL);
++}
++
+ /*
+ * open_namei()
+ *
+@@ -987,7 +1094,8 @@
+ * for symlinks (where the permissions are checked later).
+ * SMP-safe
+ */
+-int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd)
++int open_namei_it(const char *pathname, int flag, int mode,
++ struct nameidata *nd, struct lookup_intent *it)
+ {
+ int acc_mode, error = 0;
+ struct inode *inode;
+@@ -997,12 +1105,14 @@
+
+ acc_mode = ACC_MODE(flag);
+
++ if (it)
++ it->it_flags = flag;
+ /*
+ * The simplest case - just a plain lookup.
+ */
+ if (!(flag & O_CREAT)) {
+ if (path_init(pathname, lookup_flags(flag), nd))
+- error = path_walk(pathname, nd);
++ error = path_walk_it(pathname, nd, it);
+ if (error)
+ return error;
+ dentry = nd->dentry;
+@@ -1012,6 +1122,10 @@
+ /*
+ * Create - we need to know the parent.
+ */
++ if (it) {
++ it->it_create_mode = mode;
++ it->it_op |= IT_CREAT;
++ }
+ if (path_init(pathname, LOOKUP_PARENT, nd))
+ error = path_walk(pathname, nd);
+ if (error)
+@@ -1028,7 +1142,7 @@
+
+ dir = nd->dentry;
+ down(&dir->d_inode->i_sem);
+- dentry = lookup_hash(&nd->last, nd->dentry);
++ dentry = lookup_hash_it(&nd->last, nd->dentry, it);
+
+ do_last:
+ error = PTR_ERR(dentry);
+@@ -1037,10 +1151,11 @@
+ goto exit;
+ }
+
++ if (it) it->it_create_mode = mode;
+ /* Negative dentry, just create the file */
+ if (!dentry->d_inode) {
+- error = vfs_create(dir->d_inode, dentry,
+- mode & ~current->fs->umask);
++ error = vfs_create_it(dir->d_inode, dentry,
++ mode & ~current->fs->umask, it);
+ up(&dir->d_inode->i_sem);
+ dput(nd->dentry);
+ nd->dentry = dentry;
+@@ -1144,7 +1259,7 @@
+ if (!error) {
+ DQUOT_INIT(inode);
+
+- error = do_truncate(dentry, 0);
++ error = do_truncate(dentry, 0, 1);
+ }
+ put_write_access(inode);
+ if (error)
+@@ -1156,8 +1271,10 @@
+ return 0;
+
+ exit_dput:
++ intent_release(it);
+ dput(dentry);
+ exit:
++ intent_release(it);
+ path_release(nd);
+ return error;
+
+@@ -1176,7 +1293,10 @@
+ * are done. Procfs-like symlinks just set LAST_BIND.
+ */
+ UPDATE_ATIME(dentry->d_inode);
++ nd->intent = it;
+ error = dentry->d_inode->i_op->follow_link(dentry, nd);
++ if (error)
++ intent_release(it);
+ dput(dentry);
+ if (error)
+ return error;
+@@ -1198,13 +1318,20 @@
+ }
+ dir = nd->dentry;
+ down(&dir->d_inode->i_sem);
+- dentry = lookup_hash(&nd->last, nd->dentry);
++ dentry = lookup_hash_it(&nd->last, nd->dentry, it);
+ putname(nd->last.name);
+ goto do_last;
+ }
+
++int open_namei(const char *pathname, int flag, int mode, struct nameidata *nd)
++{
++ return open_namei_it(pathname, flag, mode, nd, NULL);
++}
++
++
+ /* SMP-safe */
+-static struct dentry *lookup_create(struct nameidata *nd, int is_dir)
++static struct dentry *lookup_create(struct nameidata *nd, int is_dir,
++ struct lookup_intent *it)
+ {
+ struct dentry *dentry;
+
+@@ -1212,7 +1339,7 @@
+ dentry = ERR_PTR(-EEXIST);
+ if (nd->last_type != LAST_NORM)
+ goto fail;
+- dentry = lookup_hash(&nd->last, nd->dentry);
++ dentry = lookup_hash_it(&nd->last, nd->dentry, it);
+ if (IS_ERR(dentry))
+ goto fail;
+ if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode)
+@@ -1269,7 +1396,20 @@
+ error = path_walk(tmp, &nd);
+ if (error)
+ goto out;
+- dentry = lookup_create(&nd, 0);
++
++ if (nd.last_type != LAST_NORM) {
++ error = -EEXIST;
++ goto out2;
++ }
++ if (nd.dentry->d_inode->i_op->mknod_raw) {
++ struct inode_operations *op = nd.dentry->d_inode->i_op;
++ error = op->mknod_raw(&nd, mode, dev);
++ /* the file system wants to use normal vfs path now */
++ if (error != -EOPNOTSUPP)
++ goto out2;
++ }
++
++ dentry = lookup_create(&nd, 0, NULL);
+ error = PTR_ERR(dentry);
+
+ mode &= ~current->fs->umask;
+@@ -1290,6 +1426,7 @@
+ dput(dentry);
+ }
+ up(&nd.dentry->d_inode->i_sem);
++out2:
+ path_release(&nd);
+ out:
+ putname(tmp);
+@@ -1338,7 +1475,18 @@
+ error = path_walk(tmp, &nd);
+ if (error)
+ goto out;
+- dentry = lookup_create(&nd, 1);
++ if (nd.last_type != LAST_NORM) {
++ error = -EEXIST;
++ goto out2;
++ }
++ if (nd.dentry->d_inode->i_op->mkdir_raw) {
++ struct inode_operations *op = nd.dentry->d_inode->i_op;
++ error = op->mkdir_raw(&nd, mode);
++ /* the file system wants to use normal vfs path now */
++ if (error != -EOPNOTSUPP)
++ goto out2;
++ }
++ dentry = lookup_create(&nd, 1, NULL);
+ error = PTR_ERR(dentry);
+ if (!IS_ERR(dentry)) {
+ error = vfs_mkdir(nd.dentry->d_inode, dentry,
+@@ -1346,6 +1490,7 @@
+ dput(dentry);
+ }
+ up(&nd.dentry->d_inode->i_sem);
++out2:
+ path_release(&nd);
+ out:
+ putname(tmp);
+@@ -1447,8 +1592,16 @@
+ error = -EBUSY;
+ goto exit1;
+ }
++ if (nd.dentry->d_inode->i_op->rmdir_raw) {
++ struct inode_operations *op = nd.dentry->d_inode->i_op;
++
++ error = op->rmdir_raw(&nd);
++ /* the file system wants to use normal vfs path now */
++ if (error != -EOPNOTSUPP)
++ goto exit1;
++ }
+ down(&nd.dentry->d_inode->i_sem);
+- dentry = lookup_hash(&nd.last, nd.dentry);
++ dentry = lookup_hash_it(&nd.last, nd.dentry, NULL);
+ error = PTR_ERR(dentry);
+ if (!IS_ERR(dentry)) {
+ error = vfs_rmdir(nd.dentry->d_inode, dentry);
+@@ -1507,8 +1660,15 @@
+ error = -EISDIR;
+ if (nd.last_type != LAST_NORM)
+ goto exit1;
++ if (nd.dentry->d_inode->i_op->unlink_raw) {
++ struct inode_operations *op = nd.dentry->d_inode->i_op;
++ error = op->unlink_raw(&nd);
++ /* the file system wants to use normal vfs path now */
++ if (error != -EOPNOTSUPP)
++ goto exit1;
++ }
+ down(&nd.dentry->d_inode->i_sem);
+- dentry = lookup_hash(&nd.last, nd.dentry);
++ dentry = lookup_hash_it(&nd.last, nd.dentry, NULL);
+ error = PTR_ERR(dentry);
+ if (!IS_ERR(dentry)) {
+ /* Why not before? Because we want correct error value */
+@@ -1576,15 +1736,27 @@
+ error = path_walk(to, &nd);
+ if (error)
+ goto out;
+- dentry = lookup_create(&nd, 0);
++ if (nd.last_type != LAST_NORM) {
++ error = -EEXIST;
++ goto out2;
++ }
++ if (nd.dentry->d_inode->i_op->symlink_raw) {
++ struct inode_operations *op = nd.dentry->d_inode->i_op;
++ error = op->symlink_raw(&nd, from);
++ /* the file system wants to use normal vfs path now */
++ if (error != -EOPNOTSUPP)
++ goto out2;
++ }
++ dentry = lookup_create(&nd, 0, NULL);
+ error = PTR_ERR(dentry);
+ if (!IS_ERR(dentry)) {
+ error = vfs_symlink(nd.dentry->d_inode, dentry, from);
+ dput(dentry);
+ }
+ up(&nd.dentry->d_inode->i_sem);
++ out2:
+ path_release(&nd);
+-out:
++ out:
+ putname(to);
+ }
+ putname(from);
+@@ -1667,7 +1835,18 @@
+ error = -EXDEV;
+ if (old_nd.mnt != nd.mnt)
+ goto out_release;
+- new_dentry = lookup_create(&nd, 0);
++ if (nd.last_type != LAST_NORM) {
++ error = -EEXIST;
++ goto out_release;
++ }
++ if (nd.dentry->d_inode->i_op->link_raw) {
++ struct inode_operations *op = nd.dentry->d_inode->i_op;
++ error = op->link_raw(&old_nd, &nd);
++ /* the file system wants to use normal vfs path now */
++ if (error != -EOPNOTSUPP)
++ goto out_release;
++ }
++ new_dentry = lookup_create(&nd, 0, NULL);
+ error = PTR_ERR(new_dentry);
+ if (!IS_ERR(new_dentry)) {
+ error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry);
+@@ -1713,7 +1888,7 @@
+ * locking].
+ */
+ int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
+- struct inode *new_dir, struct dentry *new_dentry)
++ struct inode *new_dir, struct dentry *new_dentry)
+ {
+ int error;
+ struct inode *target;
+@@ -1792,7 +1967,7 @@
+ }
+
+ int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
+- struct inode *new_dir, struct dentry *new_dentry)
++ struct inode *new_dir, struct dentry *new_dentry)
+ {
+ int error;
+
+@@ -1883,9 +2058,18 @@
+ if (newnd.last_type != LAST_NORM)
+ goto exit2;
+
++ if (old_dir->d_inode->i_op->rename_raw) {
++ lock_kernel();
++ error = old_dir->d_inode->i_op->rename_raw(&oldnd, &newnd);
++ unlock_kernel();
++ /* the file system wants to use normal vfs path now */
++ if (error != -EOPNOTSUPP)
++ goto exit2;
++ }
++
+ double_lock(new_dir, old_dir);
+
+- old_dentry = lookup_hash(&oldnd.last, old_dir);
++ old_dentry = lookup_hash_it(&oldnd.last, old_dir, NULL);
+ error = PTR_ERR(old_dentry);
+ if (IS_ERR(old_dentry))
+ goto exit3;
+@@ -1901,16 +2085,16 @@
+ if (newnd.last.name[newnd.last.len])
+ goto exit4;
+ }
+- new_dentry = lookup_hash(&newnd.last, new_dir);
++ new_dentry = lookup_hash_it(&newnd.last, new_dir, NULL);
+ error = PTR_ERR(new_dentry);
+ if (IS_ERR(new_dentry))
+ goto exit4;
+
++
+ lock_kernel();
+ error = vfs_rename(old_dir->d_inode, old_dentry,
+ new_dir->d_inode, new_dentry);
+ unlock_kernel();
+-
+ dput(new_dentry);
+ exit4:
+ dput(old_dentry);
+@@ -1961,20 +2145,26 @@
+ }
+
+ static inline int
+-__vfs_follow_link(struct nameidata *nd, const char *link)
++__vfs_follow_link(struct nameidata *nd, const char *link,
++ struct lookup_intent *it)
+ {
+ int res = 0;
+ char *name;
+ if (IS_ERR(link))
+ goto fail;
+
++ if (it == NULL)
++ it = nd->intent;
++ else if (it != nd->intent)
++ printk("it != nd->intent: tell phil@clusterfs.com\n");
++
+ if (*link == '/') {
+ path_release(nd);
+ if (!walk_init_root(link, nd))
+ /* weird __emul_prefix() stuff did it */
+ goto out;
+ }
+- res = link_path_walk(link, nd);
++ res = link_path_walk_it(link, nd, it);
+ out:
+ if (current->link_count || res || nd->last_type!=LAST_NORM)
+ return res;
+@@ -1996,7 +2186,13 @@
+
+ int vfs_follow_link(struct nameidata *nd, const char *link)
+ {
+- return __vfs_follow_link(nd, link);
++ return __vfs_follow_link(nd, link, NULL);
++}
++
++int vfs_follow_link_it(struct nameidata *nd, const char *link,
++ struct lookup_intent *it)
++{
++ return __vfs_follow_link(nd, link, it);
+ }
+
+ /* get the link contents into pagecache */
+@@ -2038,7 +2234,7 @@
+ {
+ struct page *page = NULL;
+ char *s = page_getlink(dentry, &page);
+- int res = __vfs_follow_link(nd, s);
++ int res = __vfs_follow_link(nd, s, NULL);
+ if (page) {
+ kunmap(page);
+ page_cache_release(page);
+Index: linux.mcp2/fs/namespace.c
+===================================================================
+--- linux.mcp2.orig/fs/namespace.c 2004-01-19 07:49:43.000000000 -0800
++++ linux.mcp2/fs/namespace.c 2004-05-05 14:22:06.000000000 -0700
+@@ -97,6 +97,7 @@
+ {
+ old_nd->dentry = mnt->mnt_mountpoint;
+ old_nd->mnt = mnt->mnt_parent;
++ UNPIN(old_nd->dentry, old_nd->mnt, 1);
+ mnt->mnt_parent = mnt;
+ mnt->mnt_mountpoint = mnt->mnt_root;
+ list_del_init(&mnt->mnt_child);
+@@ -108,6 +109,7 @@
+ {
+ mnt->mnt_parent = mntget(nd->mnt);
+ mnt->mnt_mountpoint = dget(nd->dentry);
++ PIN(nd->dentry, nd->mnt, 1);
+ list_add(&mnt->mnt_hash, mount_hashtable+hash(nd->mnt, nd->dentry));
+ list_add(&mnt->mnt_child, &nd->mnt->mnt_mounts);
+ nd->dentry->d_mounted++;
+@@ -491,15 +493,18 @@
+ {
+ struct nameidata old_nd;
+ struct vfsmount *mnt = NULL;
++ struct lookup_intent it = { .it_op = IT_GETATTR };
+ int err = mount_is_safe(nd);
+ if (err)
+ return err;
+ if (!old_name || !*old_name)
+ return -EINVAL;
+ if (path_init(old_name, LOOKUP_POSITIVE|LOOKUP_FOLLOW, &old_nd))
+- err = path_walk(old_name, &old_nd);
+- if (err)
++ err = path_walk_it(old_name, &old_nd, &it);
++ if (err) {
++ intent_release(&it);
+ return err;
++ }
+
+ down_write(¤t->namespace->sem);
+ err = -EINVAL;
+@@ -522,6 +527,7 @@
+ }
+
+ up_write(¤t->namespace->sem);
++ intent_release(&it);
+ path_release(&old_nd);
+ return err;
+ }
+@@ -706,6 +712,7 @@
+ unsigned long flags, void *data_page)
+ {
+ struct nameidata nd;
++ struct lookup_intent it = { .it_op = IT_GETATTR };
+ int retval = 0;
+ int mnt_flags = 0;
+
+@@ -731,9 +738,11 @@
+
+ /* ... and get the mountpoint */
+ if (path_init(dir_name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd))
+- retval = path_walk(dir_name, &nd);
+- if (retval)
++ retval = path_walk_it(dir_name, &nd, &it);
++ if (retval) {
++ intent_release(&it);
+ return retval;
++ }
+
+ if (flags & MS_REMOUNT)
+ retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags,
+@@ -745,6 +754,8 @@
+ else
+ retval = do_add_mount(&nd, type_page, flags, mnt_flags,
+ dev_name, data_page);
++
++ intent_release(&it);
+ path_release(&nd);
+ return retval;
+ }
+@@ -910,6 +921,8 @@
+ {
+ struct vfsmount *tmp;
+ struct nameidata new_nd, old_nd, parent_nd, root_parent, user_nd;
++ struct lookup_intent new_it = { .it_op = IT_GETATTR };
++ struct lookup_intent old_it = { .it_op = IT_GETATTR };
+ char *name;
+ int error;
+
+@@ -924,7 +937,7 @@
+ goto out0;
+ error = 0;
+ if (path_init(name, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &new_nd))
+- error = path_walk(name, &new_nd);
++ error = path_walk_it(name, &new_nd, &new_it);
+ putname(name);
+ if (error)
+ goto out0;
+@@ -938,7 +951,7 @@
+ goto out1;
+ error = 0;
+ if (path_init(name, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &old_nd))
+- error = path_walk(name, &old_nd);
++ error = path_walk_it(name, &old_nd, &old_it);
+ putname(name);
+ if (error)
+ goto out1;
+@@ -994,8 +1007,10 @@
+ up(&old_nd.dentry->d_inode->i_zombie);
+ up_write(¤t->namespace->sem);
+ path_release(&user_nd);
++ intent_release(&old_it);
+ path_release(&old_nd);
+ out1:
++ intent_release(&new_it);
+ path_release(&new_nd);
+ out0:
+ unlock_kernel();
+Index: linux.mcp2/fs/open.c
+===================================================================
+--- linux.mcp2.orig/fs/open.c 2004-01-19 07:49:43.000000000 -0800
++++ linux.mcp2/fs/open.c 2004-05-05 14:30:34.000000000 -0700
+@@ -19,6 +19,8 @@
+ #include <asm/uaccess.h>
+
+ #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m))
++extern int path_walk_it(const char *name, struct nameidata *nd,
++ struct lookup_intent *it);
+
+ int vfs_statfs(struct super_block *sb, struct statfs *buf)
+ {
+@@ -95,9 +97,10 @@
+ write_unlock(&files->file_lock);
+ }
+
+-int do_truncate(struct dentry *dentry, loff_t length)
++int do_truncate(struct dentry *dentry, loff_t length, int called_from_open)
+ {
+ struct inode *inode = dentry->d_inode;
++ struct inode_operations *op = dentry->d_inode->i_op;
+ int error;
+ struct iattr newattrs;
+
+@@ -108,7 +111,13 @@
+ down(&inode->i_sem);
+ newattrs.ia_size = length;
+ newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
+- error = notify_change(dentry, &newattrs);
++ if (called_from_open)
++ newattrs.ia_valid |= ATTR_FROM_OPEN;
++ if (op->setattr_raw) {
++ newattrs.ia_valid |= ATTR_RAW;
++ error = op->setattr_raw(inode, &newattrs);
++ } else
++ error = notify_change(dentry, &newattrs);
+ up(&inode->i_sem);
+ return error;
+ }
+@@ -118,12 +127,13 @@
+ struct nameidata nd;
+ struct inode * inode;
+ int error;
++ struct lookup_intent it = { .it_op = IT_GETATTR };
+
+ error = -EINVAL;
+ if (length < 0) /* sorry, but loff_t says... */
+ goto out;
+
+- error = user_path_walk(path, &nd);
++ error = user_path_walk_it(path, &nd, &it);
+ if (error)
+ goto out;
+ inode = nd.dentry->d_inode;
+@@ -163,11 +173,13 @@
+ error = locks_verify_truncate(inode, NULL, length);
+ if (!error) {
+ DQUOT_INIT(inode);
+- error = do_truncate(nd.dentry, length);
++ intent_release(&it);
++ error = do_truncate(nd.dentry, length, 0);
+ }
+ put_write_access(inode);
+
+ dput_and_out:
++ intent_release(&it);
+ path_release(&nd);
+ out:
+ return error;
+@@ -215,7 +227,7 @@
+
+ error = locks_verify_truncate(inode, file, length);
+ if (!error)
+- error = do_truncate(dentry, length);
++ error = do_truncate(dentry, length, 0);
+ out_putf:
+ fput(file);
+ out:
+@@ -260,11 +272,13 @@
+ struct inode * inode;
+ struct iattr newattrs;
+
+- error = user_path_walk(filename, &nd);
++ error = user_path_walk_it(filename, &nd, NULL);
+ if (error)
+ goto out;
+ inode = nd.dentry->d_inode;
+
++ /* this is safe without a Lustre lock because it only depends
++ on the super block */
+ error = -EROFS;
+ if (IS_RDONLY(inode))
+ goto dput_and_out;
+@@ -279,11 +293,25 @@
+ goto dput_and_out;
+
+ newattrs.ia_valid |= ATTR_ATIME_SET | ATTR_MTIME_SET;
+- } else {
++ }
++
++ if (inode->i_op->setattr_raw) {
++ struct inode_operations *op = nd.dentry->d_inode->i_op;
++
++ newattrs.ia_valid |= ATTR_RAW;
++ error = op->setattr_raw(inode, &newattrs);
++ /* the file system wants to use normal vfs path now */
++ if (error != -EOPNOTSUPP)
++ goto dput_and_out;
++ }
++
++ error = -EPERM;
++ if (!times) {
+ if (current->fsuid != inode->i_uid &&
+ (error = permission(inode,MAY_WRITE)) != 0)
+ goto dput_and_out;
+ }
++
+ error = notify_change(nd.dentry, &newattrs);
+ dput_and_out:
+ path_release(&nd);
+@@ -304,12 +332,14 @@
+ struct inode * inode;
+ struct iattr newattrs;
+
+- error = user_path_walk(filename, &nd);
++ error = user_path_walk_it(filename, &nd, NULL);
+
+ if (error)
+ goto out;
+ inode = nd.dentry->d_inode;
+
++ /* this is safe without a Lustre lock because it only depends
++ on the super block */
+ error = -EROFS;
+ if (IS_RDONLY(inode))
+ goto dput_and_out;
+@@ -324,7 +354,20 @@
+ newattrs.ia_atime = times[0].tv_sec;
+ newattrs.ia_mtime = times[1].tv_sec;
+ newattrs.ia_valid |= ATTR_ATIME_SET | ATTR_MTIME_SET;
+- } else {
++ }
++
++ if (inode->i_op->setattr_raw) {
++ struct inode_operations *op = nd.dentry->d_inode->i_op;
++
++ newattrs.ia_valid |= ATTR_RAW;
++ error = op->setattr_raw(inode, &newattrs);
++ /* the file system wants to use normal vfs path now */
++ if (error != -EOPNOTSUPP)
++ goto dput_and_out;
++ }
++
++ error = -EPERM;
++ if (!utimes) {
+ if (current->fsuid != inode->i_uid &&
+ (error = permission(inode,MAY_WRITE)) != 0)
+ goto dput_and_out;
+@@ -347,6 +390,7 @@
+ int old_fsuid, old_fsgid;
+ kernel_cap_t old_cap;
+ int res;
++ struct lookup_intent it = { .it_op = IT_GETATTR };
+
+ if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */
+ return -EINVAL;
+@@ -364,13 +408,14 @@
+ else
+ current->cap_effective = current->cap_permitted;
+
+- res = user_path_walk(filename, &nd);
++ res = user_path_walk_it(filename, &nd, &it);
+ if (!res) {
+ res = permission(nd.dentry->d_inode, mode);
+ /* SuS v2 requires we report a read only fs too */
+ if(!res && (mode & S_IWOTH) && IS_RDONLY(nd.dentry->d_inode)
+ && !special_file(nd.dentry->d_inode->i_mode))
+ res = -EROFS;
++ intent_release(&it);
+ path_release(&nd);
+ }
+
+@@ -386,6 +431,7 @@
+ int error;
+ struct nameidata nd;
+ char *name;
++ struct lookup_intent it = { .it_op = IT_GETATTR };
+
+ name = getname(filename);
+ error = PTR_ERR(name);
+@@ -394,7 +440,7 @@
+
+ error = 0;
+ if (path_init(name,LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY,&nd))
+- error = path_walk(name, &nd);
++ error = path_walk_it(name, &nd, &it);
+ putname(name);
+ if (error)
+ goto out;
+@@ -406,6 +452,7 @@
+ set_fs_pwd(current->fs, nd.mnt, nd.dentry);
+
+ dput_and_out:
++ intent_release(&it);
+ path_release(&nd);
+ out:
+ return error;
+@@ -446,6 +493,7 @@
+ int error;
+ struct nameidata nd;
+ char *name;
++ struct lookup_intent it = { .it_op = IT_GETATTR };
+
+ name = getname(filename);
+ error = PTR_ERR(name);
+@@ -454,7 +502,7 @@
+
+ path_init(name, LOOKUP_POSITIVE | LOOKUP_FOLLOW |
+ LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd);
+- error = path_walk(name, &nd);
++ error = path_walk_it(name, &nd, &it);
+ putname(name);
+ if (error)
+ goto out;
+@@ -471,39 +519,56 @@
+ set_fs_altroot();
+ error = 0;
+ dput_and_out:
++ intent_release(&it);
+ path_release(&nd);
+ out:
+ return error;
+ }
+
+-asmlinkage long sys_fchmod(unsigned int fd, mode_t mode)
++int chmod_common(struct dentry *dentry, mode_t mode)
+ {
+- struct inode * inode;
+- struct dentry * dentry;
+- struct file * file;
+- int err = -EBADF;
++ struct inode *inode = dentry->d_inode;
+ struct iattr newattrs;
++ int err = -EROFS;
+
+- file = fget(fd);
+- if (!file)
++ if (IS_RDONLY(inode))
+ goto out;
+
+- dentry = file->f_dentry;
+- inode = dentry->d_inode;
++ if (inode->i_op->setattr_raw) {
++ newattrs.ia_mode = mode;
++ newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
++ newattrs.ia_valid |= ATTR_RAW;
++ err = inode->i_op->setattr_raw(inode, &newattrs);
++ /* the file system wants to use normal vfs path now */
++ if (err != -EOPNOTSUPP)
++ goto out;
++ }
+
+- err = -EROFS;
+- if (IS_RDONLY(inode))
+- goto out_putf;
+ err = -EPERM;
+ if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+- goto out_putf;
++ goto out;
++
+ if (mode == (mode_t) -1)
+ mode = inode->i_mode;
+ newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
+ newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
+ err = notify_change(dentry, &newattrs);
+
+-out_putf:
++out:
++ return err;
++}
++
++asmlinkage long sys_fchmod(unsigned int fd, mode_t mode)
++{
++ struct file * file;
++ int err = -EBADF;
++
++ file = fget(fd);
++ if (!file)
++ goto out;
++
++ err = chmod_common(file->f_dentry, mode);
++
+ fput(file);
+ out:
+ return err;
+@@ -512,30 +577,14 @@
+ asmlinkage long sys_chmod(const char * filename, mode_t mode)
+ {
+ struct nameidata nd;
+- struct inode * inode;
+ int error;
+- struct iattr newattrs;
+
+ error = user_path_walk(filename, &nd);
+ if (error)
+ goto out;
+- inode = nd.dentry->d_inode;
+-
+- error = -EROFS;
+- if (IS_RDONLY(inode))
+- goto dput_and_out;
+
+- error = -EPERM;
+- if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+- goto dput_and_out;
++ error = chmod_common(nd.dentry, mode);
+
+- if (mode == (mode_t) -1)
+- mode = inode->i_mode;
+- newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
+- newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
+- error = notify_change(nd.dentry, &newattrs);
+-
+-dput_and_out:
+ path_release(&nd);
+ out:
+ return error;
+@@ -555,6 +604,20 @@
+ error = -EROFS;
+ if (IS_RDONLY(inode))
+ goto out;
++
++ if (inode->i_op->setattr_raw) {
++ struct inode_operations *op = dentry->d_inode->i_op;
++
++ newattrs.ia_uid = user;
++ newattrs.ia_gid = group;
++ newattrs.ia_valid = ATTR_UID | ATTR_GID | ATTR_CTIME;
++ newattrs.ia_valid |= ATTR_RAW;
++ error = op->setattr_raw(inode, &newattrs);
++ /* the file system wants to use normal vfs path now */
++ if (error != -EOPNOTSUPP)
++ return error;
++ }
++
+ error = -EPERM;
+ if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+ goto out;
+@@ -659,6 +722,7 @@
+ {
+ int namei_flags, error;
+ struct nameidata nd;
++ struct lookup_intent it = { .it_op = IT_OPEN };
+
+ namei_flags = flags;
+ if ((namei_flags+1) & O_ACCMODE)
+@@ -666,14 +730,15 @@
+ if (namei_flags & O_TRUNC)
+ namei_flags |= 2;
+
+- error = open_namei(filename, namei_flags, mode, &nd);
+- if (!error)
+- return dentry_open(nd.dentry, nd.mnt, flags);
++ error = open_namei_it(filename, namei_flags, mode, &nd, &it);
++ if (error)
++ return ERR_PTR(error);
+
+- return ERR_PTR(error);
++ return dentry_open_it(nd.dentry, nd.mnt, flags, &it);
+ }
+
+-struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
++struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt,
++ int flags, struct lookup_intent *it)
+ {
+ struct file * f;
+ struct inode *inode;
+@@ -710,12 +775,15 @@
+ }
+
+ if (f->f_op && f->f_op->open) {
++ f->f_it = it;
+ error = f->f_op->open(inode,f);
++ f->f_it = NULL;
+ if (error)
+ goto cleanup_all;
+ }
+ f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
+
++ intent_release(it);
+ return f;
+
+ cleanup_all:
+@@ -730,11 +798,17 @@
+ cleanup_file:
+ put_filp(f);
+ cleanup_dentry:
++ intent_release(it);
+ dput(dentry);
+ mntput(mnt);
+ return ERR_PTR(error);
+ }
+
++struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
++{
++ return dentry_open_it(dentry, mnt, flags, NULL);
++}
++
+ /*
+ * Find an empty file descriptor entry, and mark it busy.
+ */
+Index: linux.mcp2/fs/stat.c
+===================================================================
+--- linux.mcp2.orig/fs/stat.c 2004-01-19 07:49:43.000000000 -0800
++++ linux.mcp2/fs/stat.c 2004-05-05 14:19:59.000000000 -0700
+@@ -17,10 +17,12 @@
+ * Revalidate the inode. This is required for proper NFS attribute caching.
+ */
+ static __inline__ int
+-do_revalidate(struct dentry *dentry)
++do_revalidate(struct dentry *dentry, struct lookup_intent *it)
+ {
+ struct inode * inode = dentry->d_inode;
+- if (inode->i_op && inode->i_op->revalidate)
++ if (inode->i_op && inode->i_op->revalidate_it)
++ return inode->i_op->revalidate_it(dentry, it);
++ else if (inode->i_op && inode->i_op->revalidate)
+ return inode->i_op->revalidate(dentry);
+ return 0;
+ }
+@@ -135,13 +139,15 @@
+ asmlinkage long sys_stat(char * filename, struct __old_kernel_stat * statbuf)
+ {
+ struct nameidata nd;
++ struct lookup_intent it = { .it_op = IT_GETATTR };
+ int error;
+
+- error = user_path_walk(filename, &nd);
++ error = user_path_walk_it(filename, &nd, &it);
+ if (!error) {
+- error = do_revalidate(nd.dentry);
++ error = do_revalidate(nd.dentry, &it);
+ if (!error)
+ error = cp_old_stat(nd.dentry->d_inode, statbuf);
++ intent_release(&it);
+ path_release(&nd);
+ }
+ return error;
+@@ -151,13 +157,15 @@
+ asmlinkage long sys_newstat(char * filename, struct stat * statbuf)
+ {
+ struct nameidata nd;
++ struct lookup_intent it = { .it_op = IT_GETATTR };
+ int error;
+
+- error = user_path_walk(filename, &nd);
++ error = user_path_walk_it(filename, &nd, &it);
+ if (!error) {
+- error = do_revalidate(nd.dentry);
++ error = do_revalidate(nd.dentry, &it);
+ if (!error)
+ error = cp_new_stat(nd.dentry->d_inode, statbuf);
++ intent_release(&it);
+ path_release(&nd);
+ }
+ return error;
+@@ -172,13 +180,15 @@
+ asmlinkage long sys_lstat(char * filename, struct __old_kernel_stat * statbuf)
+ {
+ struct nameidata nd;
++ struct lookup_intent it = { .it_op = IT_GETATTR };
+ int error;
+
+- error = user_path_walk_link(filename, &nd);
++ error = user_path_walk_link_it(filename, &nd, &it);
+ if (!error) {
+- error = do_revalidate(nd.dentry);
++ error = do_revalidate(nd.dentry, &it);
+ if (!error)
+ error = cp_old_stat(nd.dentry->d_inode, statbuf);
++ intent_release(&it);
+ path_release(&nd);
+ }
+ return error;
+@@ -189,13 +199,15 @@
+ asmlinkage long sys_newlstat(char * filename, struct stat * statbuf)
+ {
+ struct nameidata nd;
++ struct lookup_intent it = { .it_op = IT_GETATTR };
+ int error;
+
+- error = user_path_walk_link(filename, &nd);
++ error = user_path_walk_link_it(filename, &nd, &it);
+ if (!error) {
+- error = do_revalidate(nd.dentry);
++ error = do_revalidate(nd.dentry, &it);
+ if (!error)
+ error = cp_new_stat(nd.dentry->d_inode, statbuf);
++ intent_release(&it);
+ path_release(&nd);
+ }
+ return error;
+@@ -216,7 +228,7 @@
+ if (f) {
+ struct dentry * dentry = f->f_dentry;
+
+- err = do_revalidate(dentry);
++ err = do_revalidate(dentry, NULL);
+ if (!err)
+ err = cp_old_stat(dentry->d_inode, statbuf);
+ fput(f);
+@@ -235,7 +247,7 @@
+ if (f) {
+ struct dentry * dentry = f->f_dentry;
+
+- err = do_revalidate(dentry);
++ err = do_revalidate(dentry, NULL);
+ if (!err)
+ err = cp_new_stat(dentry->d_inode, statbuf);
+ fput(f);
+@@ -257,7 +269,7 @@
+
+ error = -EINVAL;
+ if (inode->i_op && inode->i_op->readlink &&
+- !(error = do_revalidate(nd.dentry))) {
++ !(error = do_revalidate(nd.dentry, NULL))) {
+ UPDATE_ATIME(inode);
+ error = inode->i_op->readlink(nd.dentry, buf, bufsiz);
+ }
+@@ -333,12 +345,14 @@
+ {
+ struct nameidata nd;
+ int error;
++ struct lookup_intent it = { .it_op = IT_GETATTR };
+
+- error = user_path_walk(filename, &nd);
++ error = user_path_walk_it(filename, &nd, &it);
+ if (!error) {
+- error = do_revalidate(nd.dentry);
++ error = do_revalidate(nd.dentry, &it);
+ if (!error)
+ error = cp_new_stat64(nd.dentry->d_inode, statbuf);
++ intent_release(&it);
+ path_release(&nd);
+ }
+ return error;
+@@ -348,12 +362,14 @@
+ {
+ struct nameidata nd;
+ int error;
++ struct lookup_intent it = { .it_op = IT_GETATTR };
+
+- error = user_path_walk_link(filename, &nd);
++ error = user_path_walk_link_it(filename, &nd, &it);
+ if (!error) {
+- error = do_revalidate(nd.dentry);
++ error = do_revalidate(nd.dentry, &it);
+ if (!error)
+ error = cp_new_stat64(nd.dentry->d_inode, statbuf);
++ intent_release(&it);
+ path_release(&nd);
+ }
+ return error;
+@@ -368,7 +384,7 @@
+ if (f) {
+ struct dentry * dentry = f->f_dentry;
+
+- err = do_revalidate(dentry);
++ err = do_revalidate(dentry, NULL);
+ if (!err)
+ err = cp_new_stat64(dentry->d_inode, statbuf);
+ fput(f);
+Index: linux.mcp2/include/linux/dcache.h
+===================================================================
+--- linux.mcp2.orig/include/linux/dcache.h 2004-04-23 16:52:28.000000000 -0700
++++ linux.mcp2/include/linux/dcache.h 2004-05-05 14:19:59.000000000 -0700
+@@ -5,6 +5,51 @@
+
+ #include <asm/atomic.h>
+ #include <linux/mount.h>
++#include <linux/string.h>
++
++#define IT_OPEN 0x0001
++#define IT_CREAT 0x0002
++#define IT_READDIR 0x0004
++#define IT_GETATTR 0x0008
++#define IT_LOOKUP 0x0010
++#define IT_UNLINK 0x0020
++#define IT_GETXATTR 0x0040
++#define IT_EXEC 0x0080
++#define IT_PIN 0x0100
++
++#define IT_FL_LOCKED 0x0001
++#define IT_FL_FOLLOWED 0x0002 /* set by vfs_follow_link */
++
++#define INTENT_MAGIC 0x19620323
++
++
++struct lustre_intent_data {
++ int it_disposition;
++ int it_status;
++ __u64 it_lock_handle;
++ void *it_data;
++ int it_lock_mode;
++ int it_int_flags;
++};
++struct lookup_intent {
++ int it_magic;
++ void (*it_op_release)(struct lookup_intent *);
++ int it_op;
++ int it_flags;
++ int it_create_mode;
++ union {
++ struct lustre_intent_data lustre;
++ } d;
++};
++
++static inline void intent_init(struct lookup_intent *it, int op, int flags)
++{
++ memset(it, 0, sizeof(*it));
++ it->it_magic = INTENT_MAGIC;
++ it->it_op = op;
++ it->it_flags = flags;
++}
++
+
+ /*
+ * linux/include/linux/dcache.h
+@@ -90,8 +135,22 @@
+ int (*d_delete)(struct dentry *);
+ void (*d_release)(struct dentry *);
+ void (*d_iput)(struct dentry *, struct inode *);
++ int (*d_revalidate_it)(struct dentry *, int, struct lookup_intent *);
++ void (*d_pin)(struct dentry *, struct vfsmount * , int);
++ void (*d_unpin)(struct dentry *, struct vfsmount *, int);
+ };
+
++#define PIN(de,mnt,flag) do { if (de && de->d_op && de->d_op->d_pin) \
++ de->d_op->d_pin(de, mnt, flag); } while (0)
++#define UNPIN(de,mnt,flag) do { if (de && de->d_op && de->d_op->d_unpin) \
++ de->d_op->d_unpin(de, mnt, flag); } while (0)
++
++
++/* defined in fs/namei.c */
++extern void intent_release(struct lookup_intent *it);
++/* defined in fs/dcache.c */
++extern void __d_rehash(struct dentry * entry, int lock);
++
+ /* the dentry parameter passed to d_hash and d_compare is the parent
+ * directory of the entries to be compared. It is used in case these
+ * functions need any directory specific information for determining
+@@ -123,6 +182,7 @@
+ * s_nfsd_free_path semaphore will be down
+ */
+ #define DCACHE_REFERENCED 0x0008 /* Recently used, don't discard. */
++#define DCACHE_LUSTRE_INVALID 0x0010 /* Lustre invalidated */
+
+ extern spinlock_t dcache_lock;
+
+Index: linux.mcp2/include/linux/fs.h
+===================================================================
+--- linux.mcp2.orig/include/linux/fs.h 2004-05-05 14:12:28.000000000 -0700
++++ linux.mcp2/include/linux/fs.h 2004-05-05 14:19:59.000000000 -0700
+@@ -73,6 +73,7 @@
+
+ #define FMODE_READ 1
+ #define FMODE_WRITE 2
++#define FMODE_EXEC 4
+
+ #define READ 0
+ #define WRITE 1
+@@ -335,6 +336,9 @@
+ #define ATTR_MTIME_SET 256
+ #define ATTR_FORCE 512 /* Not a change, but a change it */
+ #define ATTR_ATTR_FLAG 1024
++#define ATTR_RAW 0x0800 /* file system, not vfs will massage attrs */
++#define ATTR_FROM_OPEN 0x1000 /* called from open path, ie O_TRUNC */
++#define ATTR_CTIME_SET 0x2000
+
+ /*
+ * This is the Inode Attributes structure, used for notify_change(). It
+@@ -470,6 +474,7 @@
+ struct pipe_inode_info *i_pipe;
+ struct block_device *i_bdev;
+ struct char_device *i_cdev;
++ void *i_filterdata;
+
+ unsigned long i_dnotify_mask; /* Directory notify events */
+ struct dnotify_struct *i_dnotify; /* for directory notifications */
+@@ -574,6 +579,7 @@
+
+ /* needed for tty driver, and maybe others */
+ void *private_data;
++ struct lookup_intent *f_it;
+
+ /* preallocated helper kiobuf to speedup O_DIRECT */
+ struct kiobuf *f_iobuf;
+@@ -692,6 +698,7 @@
+ struct qstr last;
+ unsigned int flags;
+ int last_type;
++ struct lookup_intent *intent;
+ };
+
+ #define DQUOT_USR_ENABLED 0x01 /* User diskquotas enabled */
+@@ -840,7 +847,8 @@
+ extern int vfs_link(struct dentry *, struct inode *, struct dentry *);
+ extern int vfs_rmdir(struct inode *, struct dentry *);
+ extern int vfs_unlink(struct inode *, struct dentry *);
+-extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *);
++int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
++ struct inode *new_dir, struct dentry *new_dentry);
+
+ /*
+ * File types
+@@ -900,21 +908,32 @@
+
+ struct inode_operations {
+ int (*create) (struct inode *,struct dentry *,int);
++ int (*create_it) (struct inode *,struct dentry *,int, struct lookup_intent *);
+ struct dentry * (*lookup) (struct inode *,struct dentry *);
++ struct dentry * (*lookup_it) (struct inode *,struct dentry *, struct lookup_intent *, int flags);
+ int (*link) (struct dentry *,struct inode *,struct dentry *);
++ int (*link_raw) (struct nameidata *,struct nameidata *);
+ int (*unlink) (struct inode *,struct dentry *);
++ int (*unlink_raw) (struct nameidata *);
+ int (*symlink) (struct inode *,struct dentry *,const char *);
++ int (*symlink_raw) (struct nameidata *,const char *);
+ int (*mkdir) (struct inode *,struct dentry *,int);
++ int (*mkdir_raw) (struct nameidata *,int);
+ int (*rmdir) (struct inode *,struct dentry *);
++ int (*rmdir_raw) (struct nameidata *);
+ int (*mknod) (struct inode *,struct dentry *,int,int);
++ int (*mknod_raw) (struct nameidata *,int,dev_t);
+ int (*rename) (struct inode *, struct dentry *,
+ struct inode *, struct dentry *);
++ int (*rename_raw) (struct nameidata *, struct nameidata *);
+ int (*readlink) (struct dentry *, char *,int);
+ int (*follow_link) (struct dentry *, struct nameidata *);
+ void (*truncate) (struct inode *);
+ int (*permission) (struct inode *, int);
+ int (*revalidate) (struct dentry *);
++ int (*revalidate_it) (struct dentry *, struct lookup_intent *);
+ int (*setattr) (struct dentry *, struct iattr *);
++ int (*setattr_raw) (struct inode *, struct iattr *);
+ int (*getattr) (struct dentry *, struct iattr *);
+ };
+
+@@ -1115,10 +1134,14 @@
+
+ asmlinkage long sys_open(const char *, int, int);
+ asmlinkage long sys_close(unsigned int); /* yes, it's really unsigned */
+-extern int do_truncate(struct dentry *, loff_t start);
++extern int do_truncate(struct dentry *, loff_t start, int called_from_open);
+
+ extern struct file *filp_open(const char *, int, int);
+ extern struct file * dentry_open(struct dentry *, struct vfsmount *, int);
++extern int open_namei_it(const char *filename, int namei_flags, int mode,
++ struct nameidata *nd, struct lookup_intent *it);
++extern struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt,
++ int flags, struct lookup_intent *it);
+ extern int filp_close(struct file *, fl_owner_t id);
+ extern char * getname(const char *);
+
+@@ -1380,6 +1403,7 @@
+ extern loff_t default_llseek(struct file *file, loff_t offset, int origin);
+
+ extern int FASTCALL(__user_walk(const char *, unsigned, struct nameidata *));
++extern int FASTCALL(__user_walk_it(const char *, unsigned, struct nameidata *, struct lookup_intent *it));
+ extern int FASTCALL(path_init(const char *, unsigned, struct nameidata *));
+ extern int FASTCALL(path_walk(const char *, struct nameidata *));
+ extern int FASTCALL(link_path_walk(const char *, struct nameidata *));
+@@ -1390,6 +1414,8 @@
+ extern struct dentry * lookup_hash(struct qstr *, struct dentry *);
+ #define user_path_walk(name,nd) __user_walk(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd)
+ #define user_path_walk_link(name,nd) __user_walk(name, LOOKUP_POSITIVE, nd)
++#define user_path_walk_it(name,nd,it) __user_walk_it(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd, it)
++#define user_path_walk_link_it(name,nd,it) __user_walk_it(name, LOOKUP_POSITIVE, nd, it)
+
+ extern void iput(struct inode *);
+ extern void force_delete(struct inode *);
+@@ -1499,6 +1525,8 @@
+
+ extern int vfs_readlink(struct dentry *, char *, int, const char *);
+ extern int vfs_follow_link(struct nameidata *, const char *);
++extern int vfs_follow_link_it(struct nameidata *, const char *,
++ struct lookup_intent *it);
+ extern int page_readlink(struct dentry *, char *, int);
+ extern int page_follow_link(struct dentry *, struct nameidata *);
+ extern struct inode_operations page_symlink_inode_operations;
+Index: linux.mcp2/include/linux/fs_struct.h
+===================================================================
+--- linux.mcp2.orig/include/linux/fs_struct.h 2004-01-19 07:49:42.000000000 -0800
++++ linux.mcp2/include/linux/fs_struct.h 2004-05-05 14:19:59.000000000 -0700
+@@ -34,10 +34,12 @@
+ write_lock(&fs->lock);
+ old_root = fs->root;
+ old_rootmnt = fs->rootmnt;
++ PIN(dentry, mnt, 1);
+ fs->rootmnt = mntget(mnt);
+ fs->root = dget(dentry);
+ write_unlock(&fs->lock);
+ if (old_root) {
++ UNPIN(old_root, old_rootmnt, 1);
+ dput(old_root);
+ mntput(old_rootmnt);
+ }
+@@ -57,10 +59,12 @@
+ write_lock(&fs->lock);
+ old_pwd = fs->pwd;
+ old_pwdmnt = fs->pwdmnt;
++ PIN(dentry, mnt, 0);
+ fs->pwdmnt = mntget(mnt);
+ fs->pwd = dget(dentry);
+ write_unlock(&fs->lock);
+ if (old_pwd) {
++ UNPIN(old_pwd, old_pwdmnt, 0);
+ dput(old_pwd);
+ mntput(old_pwdmnt);
+ }
+Index: linux.mcp2/kernel/exit.c
+===================================================================
+--- linux.mcp2.orig/kernel/exit.c 2004-01-19 07:49:44.000000000 -0800
++++ linux.mcp2/kernel/exit.c 2004-05-05 14:19:59.000000000 -0700
+@@ -252,11 +252,14 @@
+ {
+ /* No need to hold fs->lock if we are killing it */
+ if (atomic_dec_and_test(&fs->count)) {
++ UNPIN(fs->pwd, fs->pwdmnt, 0);
++ UNPIN(fs->root, fs->rootmnt, 1);
+ dput(fs->root);
+ mntput(fs->rootmnt);
+ dput(fs->pwd);
+ mntput(fs->pwdmnt);
+ if (fs->altroot) {
++ UNPIN(fs->altroot, fs->altrootmnt, 1);
+ dput(fs->altroot);
+ mntput(fs->altrootmnt);
+ }
+Index: linux.mcp2/kernel/fork.c
+===================================================================
+--- linux.mcp2.orig/kernel/fork.c 2004-01-19 07:49:44.000000000 -0800
++++ linux.mcp2/kernel/fork.c 2004-05-05 14:19:59.000000000 -0700
+@@ -384,10 +384,13 @@
+ fs->umask = old->umask;
+ read_lock(&old->lock);
+ fs->rootmnt = mntget(old->rootmnt);
++ PIN(old->pwd, old->pwdmnt, 0);
++ PIN(old->root, old->rootmnt, 1);
+ fs->root = dget(old->root);
+ fs->pwdmnt = mntget(old->pwdmnt);
+ fs->pwd = dget(old->pwd);
+ if (old->altroot) {
++ PIN(old->altroot, old->altrootmnt, 1);
+ fs->altrootmnt = mntget(old->altrootmnt);
+ fs->altroot = dget(old->altroot);
+ } else {
+Index: linux.mcp2/kernel/ksyms.c
+===================================================================
+--- linux.mcp2.orig/kernel/ksyms.c 2004-05-05 14:12:28.000000000 -0700
++++ linux.mcp2/kernel/ksyms.c 2004-05-05 14:19:59.000000000 -0700
+@@ -264,6 +264,7 @@
+ EXPORT_SYMBOL(set_page_dirty);
+ EXPORT_SYMBOL(vfs_readlink);
+ EXPORT_SYMBOL(vfs_follow_link);
++EXPORT_SYMBOL(vfs_follow_link_it);
+ EXPORT_SYMBOL(page_readlink);
+ EXPORT_SYMBOL(page_follow_link);
+ EXPORT_SYMBOL(page_symlink_inode_operations);
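For orientation, the calling convention the intent changes above establish, as a minimal sketch: initialize a lookup_intent, walk the path with an *_it variant, and release the intent before (or together with) the nameidata so any locks the filesystem pinned during the intent lookup are dropped. fill_stat() below is hypothetical; everything else is declared by this patch:

static int example_getattr(const char *filename, struct stat *statbuf)
{
        struct nameidata nd;
        struct lookup_intent it = { .it_op = IT_GETATTR };
        int error;

        error = user_path_walk_it(filename, &nd, &it);
        if (error)
                return error;

        /* the filesystem may have stashed DLM lock state in 'it' */
        error = fill_stat(nd.dentry->d_inode, statbuf);  /* hypothetical */

        intent_release(&it);    /* drop intent state first ... */
        path_release(&nd);      /* ... then the path references */
        return error;
}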
--- /dev/null
+Index: linux.mcp2/kernel/ksyms.c
+===================================================================
+--- linux.mcp2.orig/kernel/ksyms.c 2004-05-05 14:57:48.000000000 -0700
++++ linux.mcp2/kernel/ksyms.c 2004-05-05 15:32:44.000000000 -0700
+@@ -108,6 +108,7 @@
+ EXPORT_SYMBOL(kfree);
+ EXPORT_SYMBOL(vfree);
+ EXPORT_SYMBOL(__vmalloc);
++extern struct page * vmalloc_to_page(void *addr);
+ EXPORT_SYMBOL(vmalloc_to_page);
+ EXPORT_SYMBOL(mem_map);
+ EXPORT_SYMBOL(remap_page_range);
--- /dev/null
+dev_read_only_2.4.20-rh.patch
+exports_2.4.19-bgl.patch
+lustre_version.patch
+vfs_intent-2.4.19-bgl.patch
+invalidate_show-2.4.19-bgl.patch
+export-truncate-bgl.patch
+iod-stock-24-exports-2.4.19-bgl.patch
+ext3-htree-2.4.19-bgl.patch
+linux-2.4.19-bgl-xattr-0.8.54.patch
+ext3-2.4.20-fixes.patch
+ext3-2.4-ino_t.patch
+ext3-largefile.patch
+ext3-truncate_blocks.patch
+ext3-unmount_sync.patch
+ext3-use-after-free-2.4.19-pre1.patch
+ext3-orphan_lock.patch
+ext3-noread-2.4.20.patch
+ext3-delete_thread-2.4.20.patch
+extN-wantedi.patch
+ext3-san-2.4.20.patch
+ext3-map_inode_page.patch
+ext3-error-export.patch
+iopen-2.4.19-bgl.patch
+tcp-zero-copy-2.4.19-pre1.patch
+jbd-dont-account-blocks-twice.patch
+jbd-commit-tricks.patch
+ext3-no-write-super.patch
+add_page_private-2.4.19-bgl.patch
+socket-exports-2.4.19-bgl.patch
+removepage-2.4.20.patch
+jbd-ctx_switch.patch
+jbd-flushtime-2.4.19-suse.patch
+jbd-get_write_access.patch
+nfs_export_kernel-2.4.19-bgl.patch
+ext3-raw-lookup.patch
+ext3-ea-in-inode-2.4.20.patch
+listman-2.4.19-bgl.patch
+ext3-trusted_ea-2.4.20.patch
+jbd-2.4.19-pre1-jcberr.patch
+resched-2.4.19-pre1.patch
+ext3-xattr-ptr-arith-fix.patch
+vmalloc_to_page-2.4.19-bgl.patch
+ext3-truncate-buffer-head.patch
+kallsyms-2.4-bgl.patch
+kksymoops-2.4-bgl.patch
+export-show_task-2.4-bgl.patch
BOOT_ARCHS=""
JENSEN_ARCHS=""
SMP_ARCHS="x86_64 ia64"
-BIGSMP_ARCHS="i686 ppc"
+BIGSMP_ARCHS="i686"
+PSERIES64_ARCHS="ppc"
UP_ARCHS=""
SRC_ARCHS=""
RETURN(-EINVAL);
}
- if (lcfg->lcfg_inllen1 < 1) {
+ if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) {
CERROR("requires a TARGET UUID\n");
RETURN(-EINVAL);
}
- if (lcfg->lcfg_inllen1 > 37) {
+ if (LUSTRE_CFG_BUFLEN(lcfg, 1) > 37) {
CERROR("client UUID must be less than 38 characters\n");
RETURN(-EINVAL);
}
- if (lcfg->lcfg_inllen2 < 1) {
+ if (LUSTRE_CFG_BUFLEN(lcfg, 2) < 1) {
CERROR("setup requires a SERVER UUID\n");
RETURN(-EINVAL);
}
- if (lcfg->lcfg_inllen2 > 37) {
+ if (LUSTRE_CFG_BUFLEN(lcfg, 2) > 37) {
CERROR("target UUID must be less than 38 characters\n");
RETURN(-EINVAL);
}
sema_init(&cli->cl_sem, 1);
cli->cl_conn_count = 0;
- memcpy(server_uuid.uuid, lcfg->lcfg_inlbuf2,
- min_t(unsigned int, lcfg->lcfg_inllen2, sizeof(server_uuid)));
+ memcpy(server_uuid.uuid, lustre_cfg_buf(lcfg, 2),
+ min_t(unsigned int, LUSTRE_CFG_BUFLEN(lcfg, 2),
+ sizeof(server_uuid)));
cli->cl_dirty = 0;
cli->cl_avail_grant = 0;
imp->imp_generation = 0;
imp->imp_initial_recov = 1;
INIT_LIST_HEAD(&imp->imp_pinger_chain);
- memcpy(imp->imp_target_uuid.uuid, lcfg->lcfg_inlbuf1,
- lcfg->lcfg_inllen1);
+ memcpy(imp->imp_target_uuid.uuid, lustre_cfg_buf(lcfg, 1),
+ LUSTRE_CFG_BUFLEN(lcfg, 1));
class_import_put(imp);
cli->cl_import = imp;
cli->cl_max_mds_cookiesize = sizeof(struct llog_cookie);
cli->cl_sandev = to_kdev_t(0);
- if (lcfg->lcfg_inllen3 != 0) {
- if (!strcmp(lcfg->lcfg_inlbuf3, "inactive")) {
+ if (LUSTRE_CFG_BUFLEN(lcfg, 3) > 0) {
+ if (!strcmp(lustre_cfg_string(lcfg, 3), "inactive")) {
CDEBUG(D_HA, "marking %s %s->%s as inactive\n",
name, obddev->obd_name,
imp->imp_target_uuid.uuid);
imp->imp_invalid = 1;
- if (lcfg->lcfg_inllen4 != 0)
- mgmt_name = lcfg->lcfg_inlbuf4;
+ if (LUSTRE_CFG_BUFLEN(lcfg, 4) > 0)
+ mgmt_name = lustre_cfg_string(lcfg, 4);
} else {
- mgmt_name = lcfg->lcfg_inlbuf3;
+ mgmt_name = lustre_cfg_string(lcfg, 3);
}
}
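Every hunk in this series applies the same conversion: the fixed
lcfg_inlbuf/lcfg_inllen pairs are replaced by the indexed buffer array. A
minimal caller-side sketch of the new convention, assuming only the helpers
shown in this patch (the device name, type, and UUID strings below are
placeholders, not values from the source):

        struct lustre_cfg_bufs bufs;
        struct lustre_cfg *lcfg;
        int err;

        /* buffer 0 is the device name; buffers 1..n are command-specific */
        lustre_cfg_bufs_reset(&bufs, "osc-dev");
        lustre_cfg_bufs_set_string(&bufs, 1, "osc");       /* type */
        lustre_cfg_bufs_set_string(&bufs, 2, "osc-uuid");  /* uuid */

        lcfg = lustre_cfg_new(LCFG_ATTACH, &bufs);  /* pack into one record */
        err = class_process_config(lcfg);
        lustre_cfg_free(lcfg);                      /* caller always frees */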
case LDLM_GL_CALLBACK:
OBD_FAIL_RETURN(OBD_FAIL_LDLM_GL_CALLBACK, 0);
break;
- case OBD_LOG_CANCEL:
+ case OBD_LOG_CANCEL: /* remove this eventually - for 1.4.0 compat */
OBD_FAIL_RETURN(OBD_FAIL_OBD_LOG_CANCEL_NET, 0);
rc = llog_origin_handle_cancel(req);
ldlm_callback_reply(req, rc);
if (req->rq_export == NULL) {
struct ldlm_request *dlm_req;
- CERROR("operation %d with bad export from %s\n",
- req->rq_reqmsg->opc,
- req->rq_peerstr);
- CERROR("--> export cookie: "LPX64"\n",
+
+ CERROR("operation %d from %s with bad export cookie "LPU64"\n",
+ req->rq_reqmsg->opc, req->rq_peerstr,
req->rq_reqmsg->handle.cookie);
+
dlm_req = lustre_swab_reqbuf(req, 0, sizeof (*dlm_req),
lustre_swab_ldlm_request);
if (dlm_req != NULL)
ldlm_lock_dump_handle(D_ERROR, &dlm_req->lock_handle1);
+
ldlm_callback_reply(req, -ENOTCONN);
RETURN(0);
}
if (rc)
break;
RETURN(0);
-
+ case OBD_LOG_CANCEL:
+ OBD_FAIL_RETURN(OBD_FAIL_OBD_LOG_CANCEL_NET, 0);
+ rc = llog_origin_handle_cancel(req);
+ ldlm_callback_reply(req, rc);
+ RETURN(0);
default:
CERROR("invalid opcode %d\n", req->rq_reqmsg->opc);
ldlm_callback_reply(req, -EINVAL);
libllite_a_SOURCES = llite_lib.c super.c namei.c rw.c file.c dir.c llite_lib.h
# for make rpms -- need cleanup
-liblustre_a_SOURCES = llite_lib.c super.c namei.c rw.c file.c dir.c
+liblustre_a_SOURCES = llite_lib.c super.c namei.c rw.c file.c dir.c \
+ llite_lib.h
liblustre.a : $(LUSTRE_LIBS) $(PTL_LIBS) $(SYSIO_LIBS)
$(srcdir)/genlib.sh $(SYSIO) $(AR) $(LINK) || ( rm -f $@; exit 1 )
int liblustre_process_log(struct config_llog_instance *cfg, int allow_recov)
{
- struct lustre_cfg lcfg;
+ struct lustre_cfg_bufs bufs;
+ struct lustre_cfg *lcfg;
char *peer = "MDS_PEER_UUID";
struct obd_device *obd;
struct lustre_handle mdc_conn = {0, };
CERROR("Can't parse NAL tcp\n");
RETURN(-EINVAL);
}
- LCFG_INIT(lcfg, LCFG_ADD_UUID, NULL);
- lcfg.lcfg_nid = nid;
- lcfg.lcfg_inllen1 = strlen(peer) + 1;
- lcfg.lcfg_inlbuf1 = peer;
- lcfg.lcfg_nal = nal;
- err = class_process_config(&lcfg);
+
+ lustre_cfg_bufs_reset(&bufs, NULL);
+ lustre_cfg_bufs_set_string(&bufs, 1, peer);
+ lcfg = lustre_cfg_new(LCFG_ADD_UUID, &bufs);
+ lcfg->lcfg_nid = nid;
+ lcfg->lcfg_nal = nal;
+ err = class_process_config(lcfg);
+ lustre_cfg_free(lcfg);
if (err < 0)
GOTO(out, err);
- LCFG_INIT(lcfg, LCFG_ATTACH, name);
- lcfg.lcfg_inlbuf1 = "mdc";
- lcfg.lcfg_inllen1 = strlen(lcfg.lcfg_inlbuf1) + 1;
- lcfg.lcfg_inlbuf2 = mdc_uuid.uuid;
- lcfg.lcfg_inllen2 = strlen(lcfg.lcfg_inlbuf2) + 1;
- err = class_process_config(&lcfg);
+ lustre_cfg_bufs_reset(&bufs, name);
+ lustre_cfg_bufs_set_string(&bufs, 1, LUSTRE_MDC_NAME);
+ lustre_cfg_bufs_set_string(&bufs, 2, mdc_uuid.uuid);
+ lcfg = lustre_cfg_new(LCFG_ATTACH, &bufs);
+ err = class_process_config(lcfg);
+ lustre_cfg_free(lcfg);
if (err < 0)
GOTO(out_del_uuid, err);
- LCFG_INIT(lcfg, LCFG_SETUP, name);
- lcfg.lcfg_inlbuf1 = g_zconf_mdsname;
- lcfg.lcfg_inllen1 = strlen(lcfg.lcfg_inlbuf1) + 1;
- lcfg.lcfg_inlbuf2 = peer;
- lcfg.lcfg_inllen2 = strlen(lcfg.lcfg_inlbuf2) + 1;
- err = class_process_config(&lcfg);
+ lustre_cfg_bufs_reset(&bufs, name);
+ lustre_cfg_bufs_set_string(&bufs, 1, g_zconf_mdsname);
+ lustre_cfg_bufs_set_string(&bufs, 2, peer);
+ lcfg = lustre_cfg_new(LCFG_SETUP, &bufs);
+ err = class_process_config(lcfg);
+ lustre_cfg_free(lcfg);
if (err < 0)
GOTO(out_detach, err);
err = obd_disconnect(exp);
out_cleanup:
- LCFG_INIT(lcfg, LCFG_CLEANUP, name);
- err = class_process_config(&lcfg);
+ lustre_cfg_bufs_reset(&bufs, name);
+ lcfg = lustre_cfg_new(LCFG_CLEANUP, &bufs);
+ err = class_process_config(lcfg);
+ lustre_cfg_free(lcfg);
if (err < 0)
GOTO(out, err);
out_detach:
- LCFG_INIT(lcfg, LCFG_DETACH, name);
- err = class_process_config(&lcfg);
+ lustre_cfg_bufs_reset(&bufs, name);
+ lcfg = lustre_cfg_new(LCFG_DETACH, &bufs);
+ err = class_process_config(lcfg);
+ lustre_cfg_free(lcfg);
if (err < 0)
GOTO(out, err);
out_del_uuid:
- LCFG_INIT(lcfg, LCFG_DEL_UUID, name);
- lcfg.lcfg_inllen1 = strlen(peer) + 1;
- lcfg.lcfg_inlbuf1 = peer;
- err = class_process_config(&lcfg);
-
+ lustre_cfg_bufs_reset(&bufs, name);
+ lustre_cfg_bufs_set_string(&bufs, 1, peer);
+ lcfg = lustre_cfg_new(LCFG_DEL_UUID, &bufs);
+ err = class_process_config(lcfg);
+ lustre_cfg_free(lcfg);
out:
if (rc == 0)
rc = err;
libtestcommon_a_SOURCES = test_common.c test_common.h
-echo_test_SOURCES = echo_test.c ../../utils/parser.c ../../utils/obd.c ../../utils/lustre_cfg.c
+echo_test_SOURCES = echo_test.c $(top_srcdir)/lustre/utils/parser.c $(top_srcdir)/lustre/utils/obd.c $(top_srcdir)/lustre/utils/lustre_cfg.c
echo_test_CFLAGS = $(LL_CFLAGS)
echo_test_LDADD = ../liblsupport.a $(LIBREADLINE) -lpthread
echo_test_DEPENDENCIES=$(top_builddir)/lustre/liblustre/liblsupport.a
#include <linux/obd.h>
#include <linux/obd_class.h>
#include <procbridge.h>
+#include <linux/obd_ost.h>
#define LIBLUSTRE_TEST 1
#include "../utils/lctl.c"
static int connect_echo_client(void)
{
- struct lustre_cfg lcfg;
+ struct lustre_cfg *lcfg;
+ struct lustre_cfg_bufs bufs;
ptl_nid_t nid;
char *peer = "ECHO_PEER_NID";
class_uuid_t osc_uuid, echo_uuid;
}
/* add uuid */
- LCFG_INIT(lcfg, LCFG_ADD_UUID, NULL);
- lcfg.lcfg_nid = nid;
- lcfg.lcfg_inllen1 = strlen(peer) + 1;
- lcfg.lcfg_inlbuf1 = peer;
- lcfg.lcfg_nal = nal;
- err = class_process_config(&lcfg);
+ lustre_cfg_bufs_reset(&bufs, NULL);
+ lustre_cfg_bufs_set_string(&bufs, 1, peer);
+ lcfg = lustre_cfg_new(LCFG_ADD_UUID, &bufs);
+ lcfg->lcfg_nid = nid;
+ lcfg->lcfg_nal = nal;
+ err = class_process_config(lcfg);
+ lustre_cfg_free(lcfg);
if (err < 0) {
CERROR("failed add_uuid\n");
RETURN(-EINVAL);
}
/* attach osc */
- LCFG_INIT(lcfg, LCFG_ATTACH, osc_dev_name);
- lcfg.lcfg_inlbuf1 = "osc";
- lcfg.lcfg_inllen1 = strlen(lcfg.lcfg_inlbuf1) + 1;
- lcfg.lcfg_inlbuf2 = osc_uuid_str.uuid;
- lcfg.lcfg_inllen2 = strlen(lcfg.lcfg_inlbuf2) + 1;
- err = class_process_config(&lcfg);
+ lustre_cfg_bufs_reset(&bufs, osc_dev_name);
+ lustre_cfg_bufs_set_string(&bufs, 1, LUSTRE_OSC_NAME);
+ lustre_cfg_bufs_set_string(&bufs, 2, osc_uuid_str.uuid);
+ lcfg = lustre_cfg_new(LCFG_ATTACH, &bufs);
+ err = class_process_config(lcfg);
+ lustre_cfg_free(lcfg);
if (err < 0) {
CERROR("failed attach osc\n");
RETURN(-EINVAL);
}
/* setup osc */
- LCFG_INIT(lcfg, LCFG_SETUP, osc_dev_name);
- lcfg.lcfg_inlbuf1 = echo_server_ostname;
- lcfg.lcfg_inllen1 = strlen(lcfg.lcfg_inlbuf1) + 1;
- lcfg.lcfg_inlbuf2 = peer;
- lcfg.lcfg_inllen2 = strlen(lcfg.lcfg_inlbuf2) + 1;
- err = class_process_config(&lcfg);
+ lustre_cfg_bufs_reset(&bufs, osc_dev_name);
+ lustre_cfg_bufs_set_string(&bufs, 1, echo_server_ostname);
+ lustre_cfg_bufs_set_string(&bufs, 2, peer);
+ lcfg = lustre_cfg_new(LCFG_SETUP, &bufs);
+ err = class_process_config(lcfg);
+ lustre_cfg_free(lcfg);
if (err < 0) {
CERROR("failed setup osc\n");
RETURN(-EINVAL);
}
/* attach echo_client */
- LCFG_INIT(lcfg, LCFG_ATTACH, echo_dev_name);
- lcfg.lcfg_inlbuf1 = "echo_client";
- lcfg.lcfg_inllen1 = strlen(lcfg.lcfg_inlbuf1) + 1;
- lcfg.lcfg_inlbuf2 = echo_uuid_str.uuid;
- lcfg.lcfg_inllen2 = strlen(lcfg.lcfg_inlbuf2) + 1;
- err = class_process_config(&lcfg);
+ lustre_cfg_bufs_reset(&bufs, echo_dev_name);
+ lustre_cfg_bufs_set_string(&bufs, 1, "echo_client");
+ lustre_cfg_bufs_set_string(&bufs, 2, echo_uuid_str.uuid);
+ lcfg = lustre_cfg_new(LCFG_ATTACH, &bufs);
+ err = class_process_config(lcfg);
+ lustre_cfg_free(lcfg);
if (err < 0) {
CERROR("failed attach echo_client\n");
RETURN(-EINVAL);
}
/* setup echo_client */
- LCFG_INIT(lcfg, LCFG_SETUP, echo_dev_name);
- lcfg.lcfg_inlbuf1 = osc_dev_name;
- lcfg.lcfg_inllen1 = strlen(lcfg.lcfg_inlbuf1) + 1;
- lcfg.lcfg_inlbuf2 = NULL;
- lcfg.lcfg_inllen2 = 0;
- err = class_process_config(&lcfg);
+ lustre_cfg_bufs_reset(&bufs, echo_dev_name);
+ lustre_cfg_bufs_set_string(&bufs, 1, osc_dev_name);
+ lustre_cfg_bufs_set_string(&bufs, 2, NULL);
+ lcfg = lustre_cfg_new(LCFG_SETUP, &bufs);
+ err = class_process_config(lcfg);
+ lustre_cfg_free(lcfg);
if (err < 0) {
CERROR("failed setup echo_client\n");
RETURN(-EINVAL);
static int disconnect_echo_client(void)
{
- struct lustre_cfg lcfg;
+ struct lustre_cfg_bufs bufs;
+ struct lustre_cfg *lcfg = NULL;
int err;
ENTRY;
/* cleanup echo_client */
- LCFG_INIT(lcfg, LCFG_CLEANUP, echo_dev_name);
- err = class_process_config(&lcfg);
+ lustre_cfg_bufs_reset(&bufs, echo_dev_name);
+ lcfg = lustre_cfg_new(LCFG_CLEANUP, &bufs);
+ err = class_process_config(lcfg);
if (err < 0) {
+ lustre_cfg_free(lcfg);
CERROR("failed cleanup echo_client\n");
RETURN(-EINVAL);
}
/* detach echo_client */
- LCFG_INIT(lcfg, LCFG_DETACH, echo_dev_name);
- err = class_process_config(&lcfg);
+ lcfg->lcfg_command = LCFG_DETACH;
+ err = class_process_config(lcfg);
+ lustre_cfg_free(lcfg);
if (err < 0) {
CERROR("failed detach echo_client\n");
RETURN(-EINVAL);
}
/* cleanup osc */
- LCFG_INIT(lcfg, LCFG_CLEANUP, osc_dev_name);
- err = class_process_config(&lcfg);
+ lustre_cfg_bufs_reset(&bufs, osc_dev_name);
+ lcfg = lustre_cfg_new(LCFG_CLEANUP, &bufs);
+ err = class_process_config(lcfg);
if (err < 0) {
+ lustre_cfg_free(lcfg);
CERROR("failed cleanup osc device\n");
RETURN(-EINVAL);
}
/* detach osc */
- LCFG_INIT(lcfg, LCFG_DETACH, osc_dev_name);
- err = class_process_config(&lcfg);
+ lcfg->lcfg_command = LCFG_DETACH;
+ err = class_process_config(lcfg);
+ lustre_cfg_free(lcfg);
if (err < 0) {
CERROR("failed detach osc device\n");
RETURN(-EINVAL);
if (rc)
return(-EFAULT);
+ /*
+ * This is coming from userspace, so should be in
+ * local endian. But the MDS would like it in little
+ * endian, so we swab it before we send it.
+ */
if (lum.lmm_magic != LOV_USER_MAGIC)
RETURN(-EINVAL);
+ if (lum.lmm_magic != cpu_to_le32(LOV_USER_MAGIC))
+ lustre_swab_lov_user_md(&lum);
+
+ /* swabbing is done in lov_setstripe() on server side */
rc = mdc_setattr(sbi->ll_mdc_exp, &op_data,
&attr, &lum, sizeof(lum), NULL, 0, &request);
if (rc) {
lmm = lustre_msg_buf(request->rq_repmsg, 1, lmmsize);
LASSERT(lmm != NULL);
LASSERT_REPSWABBED(request, 1);
+
+ /*
+ * This is coming from the MDS, so is probably in
+ * little endian. We convert it to host endian before
+ * passing it to userspace.
+ */
+ if (lmm->lmm_magic == __swab32(LOV_MAGIC)) {
+ lustre_swab_lov_user_md((struct lov_user_md *)lmm);
+ lustre_swab_lov_user_md_objects((struct lov_user_md *)lmm);
+ }
+
rc = copy_to_user(lump, lmm, lmmsize);
if (rc)
GOTO(out_get, rc = -EFAULT);
LASSERT(lmm != NULL);
LASSERT_REPSWABBED(request, 1);
+ /*
+ * This is coming from the MDS, so is probably in
+ * little endian. We convert it to host endian before
+ * passing it to userspace.
+ */
+ if (lmm->lmm_magic == __swab32(LOV_MAGIC)) {
+ lustre_swab_lov_user_md((struct lov_user_md *)lmm);
+ lustre_swab_lov_user_md_objects((struct lov_user_md *)lmm);
+ }
+
if (cmd == IOC_MDC_GETFILEINFO) {
struct lov_user_mds_data *lmdp;
lstat_t st = { 0 };
#define log2(n) ffz(~(n))
#endif
+/* We need to have some extra twiddling here because some systems have
+ * no random state when they start up. */
+static void
+lustre_generate_random_uuid(class_uuid_t uuid)
+{
+ struct timeval t;
+ int *i, j, k;
+
+ ENTRY;
+ LASSERT(sizeof(class_uuid_t) % sizeof(*i) == 0);
+
+ j = jiffies;
+ do_gettimeofday(&t);
+ k = t.tv_usec;
+
+ generate_random_uuid(uuid);
+
+ for (i = (int *)uuid; (char *)i < (char *)uuid + sizeof(class_uuid_t); i++) {
+ *i ^= j ^ k;
+ j = ((j << 8) & 0xffffff00) | ((j >> 24) & 0x000000ff);
+ k = ((k >> 8) & 0x00ffffff) | ((k << 24) & 0xff000000);
+ }
+
+ EXIT;
+}
+
struct ll_sb_info *lustre_init_sbi(struct super_block *sb)
{
struct ll_sb_info *sbi = NULL;
INIT_HLIST_HEAD(&sbi->ll_orphan_dentry_list);
ll_s2sbi_nocast(sb) = sbi;
- generate_random_uuid(uuid);
+ lustre_generate_random_uuid(uuid);
class_uuid_unparse(uuid, &sbi->ll_sb_uuid);
+ CDEBUG(D_HA, "generated uuid: %s\n", sbi->ll_sb_uuid.uuid);
spin_lock(&ll_sb_lock);
list_add_tail(&sbi->ll_list, &ll_super_blocks);
struct lustre_md md;
kdev_t devno;
int err;
+ ENTRY;
obd = class_name2obd(mdc);
if (!obd) {
EXIT;
}
-
char *ll_read_opt(const char *opt, char *data)
{
char *value;
#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
for (this_char = strtok (options, ",");
this_char != NULL;
- this_char = strtok (NULL, ",")) {
+ this_char = strtok (NULL, ","))
#else
- while ((this_char = strsep (&opt_ptr, ",")) != NULL) {
+ while ((this_char = strsep (&opt_ptr, ",")) != NULL)
#endif
+ {
CDEBUG(D_SUPER, "this_char %s\n", this_char);
if (!*ost && (*ost = ll_read_opt("osc", this_char)))
continue;
int lustre_process_log(struct lustre_mount_data *lmd, char * profile,
struct config_llog_instance *cfg, int allow_recov)
{
- struct lustre_cfg lcfg;
+ struct lustre_cfg *lcfg = NULL;
+ struct lustre_cfg_bufs bufs;
struct portals_cfg pcfg;
char * peer = "MDS_PEER_UUID";
struct obd_device *obd;
if (lmd_bad_magic(lmd))
RETURN(-EINVAL);
- generate_random_uuid(uuid);
+ lustre_generate_random_uuid(uuid);
class_uuid_unparse(uuid, &mdc_uuid);
+ CDEBUG(D_HA, "generated uuid: %s\n", mdc_uuid.uuid);
if (lmd->lmd_local_nid) {
PCFG_INIT(pcfg, NAL_CMD_REGISTER_MYNID);
GOTO(out, err);
}
- LCFG_INIT(lcfg, LCFG_ADD_UUID, name);
- lcfg.lcfg_nid = lmd->lmd_server_nid;
- lcfg.lcfg_inllen1 = strlen(peer) + 1;
- lcfg.lcfg_inlbuf1 = peer;
- lcfg.lcfg_nal = lmd->lmd_nal;
- err = class_process_config(&lcfg);
+ lustre_cfg_bufs_reset(&bufs, name);
+ lustre_cfg_bufs_set_string(&bufs, 1, peer);
+
+ lcfg = lustre_cfg_new(LCFG_ADD_UUID, &bufs);
+ lcfg->lcfg_nal = lmd->lmd_nal;
+ lcfg->lcfg_nid = lmd->lmd_server_nid;
+ LASSERT(lcfg->lcfg_nal);
+ LASSERT(lcfg->lcfg_nid);
+ err = class_process_config(lcfg);
+ lustre_cfg_free(lcfg);
if (err < 0)
GOTO(out_del_conn, err);
- LCFG_INIT(lcfg, LCFG_ATTACH, name);
- lcfg.lcfg_inlbuf1 = "mdc";
- lcfg.lcfg_inllen1 = strlen(lcfg.lcfg_inlbuf1) + 1;
- lcfg.lcfg_inlbuf2 = mdc_uuid.uuid;
- lcfg.lcfg_inllen2 = strlen(lcfg.lcfg_inlbuf2) + 1;
- err = class_process_config(&lcfg);
+ lustre_cfg_bufs_reset(&bufs, name);
+ lustre_cfg_bufs_set_string(&bufs, 1, LUSTRE_MDC_NAME);
+ lustre_cfg_bufs_set_string(&bufs, 2, mdc_uuid.uuid);
+
+ lcfg = lustre_cfg_new(LCFG_ATTACH, &bufs);
+ err = class_process_config(lcfg);
+ lustre_cfg_free(lcfg);
if (err < 0)
GOTO(out_del_uuid, err);
- LCFG_INIT(lcfg, LCFG_SETUP, name);
- lcfg.lcfg_inlbuf1 = lmd->lmd_mds;
- lcfg.lcfg_inllen1 = strlen(lcfg.lcfg_inlbuf1) + 1;
- lcfg.lcfg_inlbuf2 = peer;
- lcfg.lcfg_inllen2 = strlen(lcfg.lcfg_inlbuf2) + 1;
- err = class_process_config(&lcfg);
+ lustre_cfg_bufs_reset(&bufs, name);
+ lustre_cfg_bufs_set_string(&bufs, 1, lmd->lmd_mds);
+ lustre_cfg_bufs_set_string(&bufs, 2, peer);
+
+ lcfg = lustre_cfg_new(LCFG_SETUP, &bufs);
+ err = class_process_config(lcfg);
+ lustre_cfg_free(lcfg);
if (err < 0)
GOTO(out_detach, err);
err = obd_disconnect(exp);
out_cleanup:
- LCFG_INIT(lcfg, LCFG_CLEANUP, name);
- err = class_process_config(&lcfg);
+ lustre_cfg_bufs_reset(&bufs, name);
+ lcfg = lustre_cfg_new(LCFG_CLEANUP, &bufs);
+ err = class_process_config(lcfg);
+ lustre_cfg_free(lcfg);
if (err < 0)
GOTO(out, err);
out_detach:
- LCFG_INIT(lcfg, LCFG_DETACH, name);
- err = class_process_config(&lcfg);
+ lustre_cfg_bufs_reset(&bufs, name);
+ lcfg = lustre_cfg_new(LCFG_DETACH, &bufs);
+ err = class_process_config(lcfg);
+ lustre_cfg_free(lcfg);
if (err < 0)
GOTO(out, err);
out_del_uuid:
- LCFG_INIT(lcfg, LCFG_DEL_UUID, name);
- lcfg.lcfg_inllen1 = strlen(peer) + 1;
- lcfg.lcfg_inlbuf1 = peer;
- err = class_process_config(&lcfg);
+ lustre_cfg_bufs_reset(&bufs, name);
+ lustre_cfg_bufs_set_string(&bufs, 1, peer);
+ lcfg = lustre_cfg_new(LCFG_DEL_UUID, &bufs);
+ err = class_process_config(lcfg);
+ lustre_cfg_free(lcfg);
out_del_conn:
if (lmd->lmd_nal == SOCKNAL ||
static void lustre_manual_cleanup(struct ll_sb_info *sbi)
{
- struct lustre_cfg lcfg;
+ struct lustre_cfg *lcfg;
+ struct lustre_cfg_bufs bufs;
struct obd_device *obd;
int next = 0;
- while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, &next)) != NULL)
- {
+ while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, &next)) != NULL){
int err;
- LCFG_INIT(lcfg, LCFG_CLEANUP, obd->obd_name);
- err = class_process_config(&lcfg);
+ /* the lcfg is almost the same for both ops */
+ lustre_cfg_bufs_reset(&bufs, obd->obd_name);
+ lcfg = lustre_cfg_new(LCFG_CLEANUP, &bufs);
+
+ err = class_process_config(lcfg);
if (err) {
CERROR("cleanup failed: %s\n", obd->obd_name);
//continue;
}
- LCFG_INIT(lcfg, LCFG_DETACH, obd->obd_name);
- err = class_process_config(&lcfg);
+ lcfg->lcfg_command = LCFG_DETACH;
+ err = class_process_config(lcfg);
+ lustre_cfg_free(lcfg);
if (err) {
CERROR("detach failed: %s\n", obd->obd_name);
//continue;
OBD_ALLOC(cln_prof, len);
sprintf(cln_prof, "%s-clean", sbi->ll_lmd->lmd_profile);
- err = lustre_process_log(sbi->ll_lmd, cln_prof, &cfg,
- 0);
+ err = lustre_process_log(sbi->ll_lmd, cln_prof, &cfg, 0);
if (err < 0) {
CERROR("Unable to process log: %s\n", cln_prof);
lustre_manual_cleanup(sbi);
lustre_common_put_super(sb);
if (sbi->ll_lmd != NULL) {
+#if 0
char * cln_prof;
int len = strlen(sbi->ll_lmd->lmd_profile) + sizeof("-clean")+1;
int err;
OBD_FREE(cln_prof, len);
free_lmd:
+#else
+ lustre_manual_cleanup(sbi);
+#endif
OBD_FREE(sbi->ll_lmd, sizeof(*sbi->ll_lmd));
OBD_FREE(sbi->ll_instance, strlen(sbi->ll_instance) + 1);
}
int count;
ENTRY;
- if (lcfg->lcfg_inllen1 < 1) {
+ if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) {
CERROR("LOV setup requires a descriptor\n");
RETURN(-EINVAL);
}
- if (lcfg->lcfg_inllen2 < 1) {
+ if (LUSTRE_CFG_BUFLEN(lcfg, 2) < 1) {
CERROR("LOV setup requires an OST UUID list\n");
RETURN(-EINVAL);
}
- desc = (struct lov_desc *)lcfg->lcfg_inlbuf1;
- if (sizeof(*desc) > lcfg->lcfg_inllen1) {
+ desc = (struct lov_desc *)lustre_cfg_buf(lcfg, 1);
+ if (sizeof(*desc) > LUSTRE_CFG_BUFLEN(lcfg, 1)) {
CERROR("descriptor size wrong: %d > %d\n",
- (int)sizeof(*desc), lcfg->lcfg_inllen1);
+ (int)sizeof(*desc), LUSTRE_CFG_BUFLEN(lcfg, 1));
RETURN(-EINVAL);
}
+ if (desc->ld_magic != LOV_DESC_MAGIC) {
+ if (desc->ld_magic == __swab32(LOV_DESC_MAGIC)) {
+ CDEBUG(D_OTHER, "%s: Swabbing lov desc %p\n",
+ obd->obd_name, desc);
+ lustre_swab_lov_desc(desc);
+ } else {
+ CERROR("%s: Bad lov desc magic: %#x\n",
+ obd->obd_name, desc->ld_magic);
+ RETURN(-EINVAL);
+ }
+ }
+
+ desc->ld_active_tgt_count = 0;
count = desc->ld_tgt_count;
- uuids = (struct obd_uuid *)lcfg->lcfg_inlbuf2;
- if (sizeof(*uuids) * count != lcfg->lcfg_inllen2) {
+ uuids = (struct obd_uuid *)lustre_cfg_buf(lcfg, 2);
+ if (sizeof(*uuids) * count != LUSTRE_CFG_BUFLEN(lcfg, 2)) {
CERROR("UUID array size wrong: %u * %u != %u\n",
- (int)sizeof(*uuids), count, lcfg->lcfg_inllen2);
+ (int)sizeof(*uuids), count,
+ LUSTRE_CFG_BUFLEN(lcfg, 2));
RETURN(-EINVAL);
}
RETURN(-EFAULT);
if (lum.lmm_magic != LOV_USER_MAGIC) {
- CDEBUG(D_IOCTL, "bad userland LOV MAGIC: %#08x != %#08x\n",
- lum.lmm_magic, LOV_USER_MAGIC);
- RETURN(-EINVAL);
+ if (lum.lmm_magic == __swab32(LOV_USER_MAGIC)) {
+ lustre_swab_lov_user_md(&lum);
+ } else {
+ CDEBUG(D_IOCTL, "bad userland LOV MAGIC:"
+ " %#08x != %#08x\n",
+ lum.lmm_magic, LOV_USER_MAGIC);
+ RETURN(-EINVAL);
+ }
}
if (lum.lmm_pattern == 0) {
#warning "kernel code has old extents/mballoc patch, disabling"
#undef EXT3_MULTIBLOCK_ALLOCATOR
#endif
+#ifndef EXT3_EXTENTS_FL
+#define EXT3_EXTENTS_FL 0x00080000 /* Inode uses extents */
+#endif
#ifdef EXT3_MULTIBLOCK_ALLOCATOR
#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
ext3_up_truncate_sem(inode);
lock_24kernel();
- handle = journal_start(EXT3_JOURNAL(inode), count + EXT3_ALLOC_NEEDED + 1);
+ handle = journal_start(EXT3_JOURNAL(inode), count+EXT3_ALLOC_NEEDED+1);
unlock_24kernel();
if (IS_ERR(handle)) {
ext3_down_truncate_sem(inode);
}
}
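+/* Return the next power of two strictly greater than val. */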
+static int round_up(int val)
+{
+ int ret = 1;
+ while (val) {
+ val >>= 1;
+ ret <<= 1;
+ }
+ return ret;
+}
+
+/* Save a large LOV EA into the request buffer so that it is available
+ * for replay. We don't do this in the initial request because the
+ * original request doesn't need this buffer (at most it sends just the
+ * lov_mds_md); sending the empty buffer would waste RAM and bandwidth,
+ * and allocating and saving a very large request buffer for each open
+ * would also be difficult. (bug 5707)
+ *
+ * OOM here may cause recovery failure if the lmm is needed (only for
+ * the original open, and only if the MDS crashed just as this client
+ * OOM'd), but that is incredibly unlikely, and it is questionable
+ * whether the client could do MDS recovery under OOM anyway... */
+static void mdc_realloc_openmsg(struct ptlrpc_request *req,
+ struct mds_body *body, int size[5])
+{
+ int new_size, old_size;
+ struct lustre_msg *new_msg;
+
+ /* save old size */
+ old_size = lustre_msg_size(5, size);
+
+ size[4] = body->eadatasize;
+ new_size = lustre_msg_size(5, size);
+ OBD_ALLOC(new_msg, new_size);
+ if (new_msg != NULL) {
+ struct lustre_msg *old_msg = req->rq_reqmsg;
+ long irqflags;
+
+ DEBUG_REQ(D_INFO, req, "replace reqmsg for larger EA %u\n",
+ body->eadatasize);
+ memcpy(new_msg, old_msg, old_size);
+ new_msg->buflens[4] = body->eadatasize;
+
+ spin_lock_irqsave(&req->rq_lock, irqflags);
+ req->rq_reqmsg = new_msg;
+ req->rq_reqlen = new_size;
+ spin_unlock_irqrestore(&req->rq_lock, irqflags);
+
+ OBD_FREE(old_msg, old_size);
+ } else {
+ body->valid &= ~OBD_MD_FLEASIZE;
+ body->eadatasize = 0;
+ }
+}
+
/* We always reserve enough space in the reply packet for a stripe MD, because
* we don't know in advance the file type. */
int mdc_enqueue(struct obd_export *exp,
struct obd_device *obddev = class_exp2obd(exp);
struct ldlm_res_id res_id =
{ .name = {data->fid1.id, data->fid1.generation} };
- int size[6] = {sizeof(struct ldlm_request), sizeof(struct ldlm_intent)};
+ int size[5] = {sizeof(struct ldlm_request), sizeof(struct ldlm_intent)};
int rc, flags = LDLM_FL_HAS_INTENT;
int repsize[4] = {sizeof(struct ldlm_reply),
sizeof(struct mds_body),
size[2] = sizeof(struct mds_rec_create);
size[3] = data->namelen + 1;
- size[4] = obddev->u.cli.cl_max_mds_easize;
- req = ptlrpc_prep_req(class_exp2cliimp(exp), LDLM_ENQUEUE,
+ /* As an optimization, we allocate an RPC request buffer for
+ * at least a default-sized LOV EA even if we aren't sending
+ * one. We grow the whole request to the next power-of-two
+ * size since we get that much from a slab allocation anyway.
+ * This avoids an allocation below in the common case where
+ * we need to save a default-sized LOV EA for open replay. */
+ size[4] = max(lmmsize, obddev->u.cli.cl_default_mds_easize);
+ rc = lustre_msg_size(5, size);
+ if (rc & (rc - 1))
+ size[4] = min(size[4] + round_up(rc) - rc,
+ obddev->u.cli.cl_max_mds_easize);
+ req = ptlrpc_prep_req(class_exp2cliimp(exp), LDLM_ENQUEUE,
5, size, NULL);
if (!req)
RETURN(-ENOMEM);
}
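To make the power-of-two sizing above concrete (the numbers are illustrative,
not taken from the source): if lustre_msg_size(5, size) returned 920, then
rc & (rc - 1) is non-zero, round_up(920) yields 1024, and size[4] grows by
roughly 104 bytes so the whole request rounds up to the 1024-byte slab size,
capped at cl_max_mds_easize.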
if ((body->valid & OBD_MD_FLEASIZE) != 0) {
- void *replayea;
/* The eadata is opaque; just check that it is
* there. Eventually, obd_unpackmd() will check
* the contents */
CERROR ("Missing/short eadata\n");
RETURN (-EPROTO);
}
+ /* We save the reply LOV EA in case we have to replay
+ * a create for recovery. If we didn't allocate a
+ * large enough request buffer above, we need to
+ * reallocate it here to hold the actual LOV EA. */
if (it->it_op & IT_OPEN) {
- replayea = lustre_msg_buf(req->rq_reqmsg, 4,
- obddev->u.cli.cl_max_mds_easize);
- LASSERT(replayea);
- memcpy(replayea, eadata, body->eadatasize);
+ if (req->rq_reqmsg->buflens[4] <
+ body->eadatasize)
+ mdc_realloc_openmsg(req, body, size);
+
+ lmm = lustre_msg_buf(req->rq_reqmsg, 4,
+ body->eadatasize);
+ if (lmm)
+ memcpy(lmm, eadata, body->eadatasize);
}
}
}
RETURN(rc);
}
-/* Initialize the maximum LOV EA and cookie sizes. This allows
+/* Initialize the default and maximum LOV EA and cookie sizes. This allows
* us to make MDS RPCs with large enough reply buffers to hold the
* maximum-sized (= maximum striped) EA and cookie without having to
* calculate this (via a call into the LOV + OSCs) each time we make an RPC. */
{
struct obd_device *obd = mdc_exp->exp_obd;
struct client_obd *cli = &obd->u.cli;
- struct lov_stripe_md lsm = { .lsm_magic = LOV_MAGIC };
struct lov_desc desc;
__u32 valsize = sizeof(desc);
int rc, size;
ENTRY;
+ size = obd_size_diskmd(lov_exp, NULL);
+ if (cli->cl_max_mds_easize < size)
+ cli->cl_max_mds_easize = size;
+
rc = obd_get_info(lov_exp, strlen("lovdesc") + 1, "lovdesc",
&valsize, &desc);
- if (rc < 0)
+ if (rc)
RETURN(rc);
- lsm.lsm_stripe_count = desc.ld_tgt_count;
- size = obd_size_diskmd(lov_exp, &lsm);
- if (cli->cl_max_mds_easize < size)
- cli->cl_max_mds_easize = size;
+ /* If default_stripe_count is zero we stripe over all OSTs */
+ if (desc.ld_default_stripe_count != 0) {
+ struct lov_stripe_md lsm = { .lsm_magic = LOV_MAGIC,
+ .lsm_stripe_count =
+ desc.ld_default_stripe_count };
+ size = obd_size_diskmd(lov_exp, &lsm);
+ }
+ if (cli->cl_default_mds_easize < size)
+ cli->cl_default_mds_easize = size;
size = desc.ld_tgt_count * sizeof(struct llog_cookie);
if (cli->cl_max_mds_cookiesize < size)
int rc = 0;
ENTRY;
- if (!lcfg->lcfg_inlbuf1 || !lcfg->lcfg_inlbuf2)
+ if (lcfg->lcfg_bufcount < 3)
RETURN(rc = -EINVAL);
- obd->obd_fsops = fsfilt_get_ops(lcfg->lcfg_inlbuf2);
+ if (LUSTRE_CFG_BUFLEN(lcfg, 1) == 0 || LUSTRE_CFG_BUFLEN(lcfg, 2) == 0)
+ RETURN(rc = -EINVAL);
+
+ obd->obd_fsops = fsfilt_get_ops(lustre_cfg_string(lcfg, 2));
if (IS_ERR(obd->obd_fsops))
RETURN(rc = PTR_ERR(obd->obd_fsops));
options = (char *)page;
memset(options, 0, PAGE_SIZE);
-
+
/* here we use "iopen_nopriv" hardcoded, because it affects MDS utility
* and the rest of options are passed by mount options. Probably this
* should be moved to somewhere else like startup scripts or lconf. */
sprintf(options, "iopen_nopriv");
- if (lcfg->lcfg_inllen4 > 0 && lcfg->lcfg_inlbuf4)
+ if (LUSTRE_CFG_BUFLEN(lcfg, 4) > 0 && lustre_cfg_buf(lcfg, 4))
sprintf(options + strlen(options), ",%s",
- lcfg->lcfg_inlbuf4);
+ lustre_cfg_string(lcfg, 4));
- mnt = do_kern_mount(lcfg->lcfg_inlbuf2, 0,
- lcfg->lcfg_inlbuf1, (void *)options);
+ mnt = do_kern_mount(lustre_cfg_string(lcfg, 2), 0,
+ lustre_cfg_string(lcfg, 1), (void *)options);
free_page(page);
if (IS_ERR(mnt)) {
rc = PTR_ERR(mnt);
GOTO(err_ops, rc);
}
- CDEBUG(D_SUPER, "%s: mnt = %p\n", lcfg->lcfg_inlbuf1, mnt);
-
+ CDEBUG(D_SUPER, "%s: mnt = %p\n", lustre_cfg_string(lcfg, 1), mnt);
+
LASSERT(!ll_check_rdonly(ll_sbdev(mnt->mnt_sb)));
sema_init(&mds->mds_orphan_recovery_sem, 1);
if (rc < 0)
GOTO(err_fs, rc);
- if (lcfg->lcfg_inllen3 > 0 && lcfg->lcfg_inlbuf3) {
+ if (lcfg->lcfg_bufcount >= 4 && LUSTRE_CFG_BUFLEN(lcfg, 3) > 0) {
class_uuid_t uuid;
generate_random_uuid(uuid);
class_uuid_unparse(uuid, &mds->mds_lov_uuid);
- OBD_ALLOC(mds->mds_profile, lcfg->lcfg_inllen3);
+ OBD_ALLOC(mds->mds_profile, LUSTRE_CFG_BUFLEN(lcfg, 3));
if (mds->mds_profile == NULL)
GOTO(err_fs, rc = -ENOMEM);
- memcpy(mds->mds_profile, lcfg->lcfg_inlbuf3,
- lcfg->lcfg_inllen3);
+ strncpy(mds->mds_profile, lustre_cfg_string(lcfg, 3),
+ LUSTRE_CFG_BUFLEN(lcfg, 3));
}
"Recovery progress can be monitored by watching "
"/proc/fs/lustre/mds/%s/recovery_status.\n",
obd->obd_name,
- lcfg->lcfg_inlbuf1,
+ lustre_cfg_string(lcfg, 1),
obd->obd_recoverable_clients,
(obd->obd_recoverable_clients == 1)
? "client" : "clients",
obd->obd_name);
} else {
LCONSOLE_INFO("MDT %s now serving %s with recovery %s.\n",
- obd->obd_name, lcfg->lcfg_inlbuf1,
+ obd->obd_name,
+ lustre_cfg_string(lcfg, 1),
obd->obd_replayable ? "enabled" : "disabled");
}
#include "mds_internal.h"
-void le_lov_desc_to_cpu (struct lov_desc *ld)
-{
- ld->ld_tgt_count = le32_to_cpu (ld->ld_tgt_count);
- ld->ld_default_stripe_count = le32_to_cpu (ld->ld_default_stripe_count);
- ld->ld_default_stripe_size = le32_to_cpu (ld->ld_default_stripe_size);
- ld->ld_pattern = le32_to_cpu (ld->ld_pattern);
-}
-
-void cpu_to_le_lov_desc (struct lov_desc *ld)
-{
- ld->ld_tgt_count = cpu_to_le32 (ld->ld_tgt_count);
- ld->ld_default_stripe_count = cpu_to_le32 (ld->ld_default_stripe_count);
- ld->ld_default_stripe_size = cpu_to_le32 (ld->ld_default_stripe_size);
- ld->ld_pattern = cpu_to_le32 (ld->ld_pattern);
-}
-
void mds_lov_update_objids(struct obd_device *obd, obd_id *ids)
{
struct mds_obd *mds = &obd->u.mds;
struct obd_run_ctxt saved;
int rc = 0;
+ ENTRY;
+ CDEBUG(D_IOCTL, "handling ioctl cmd %#x\n", cmd);
+
switch (cmd) {
case OBD_IOC_RECORD: {
char *name = data->ioc_inlbuf1;
MODULES := obdclass llog_test
-obdclass-objs := llog.o llog_cat.o llog_lvfs.o llog_obd.o class_obd.o
+obdclass-objs := llog.o llog_cat.o llog_lvfs.o llog_obd.o llog_swab.o
+obdclass-objs += class_obd.o
obdclass-objs += debug.o genops.o sysctl.o uuid.o llog_ioctl.o
obdclass-objs += lprocfs_status.o lustre_handles.o lustre_peer.o
obdclass-objs += statfs_pack.o obdo.o obd_config.o
noinst_LIBRARIES = liblustreclass.a
liblustreclass_a_SOURCES = class_obd.c debug.c genops.c statfs_pack.c uuid.c
liblustreclass_a_SOURCES += lustre_handles.c lustre_peer.c lprocfs_status.c
-liblustreclass_a_SOURCES += obdo.c obd_config.c llog.c llog_obd.c llog_cat.c
-liblustreclass_a_SOURCES += llog_lvfs.c #llog_ioctl.c rbtree.c
+liblustreclass_a_SOURCES += obdo.c obd_config.c llog.c llog_obd.c llog_cat.c
+liblustreclass_a_SOURCES += llog_lvfs.c llog_swab.c #llog_ioctl.c rbtree.c
liblustreclass_a_CPPFLAGS = $(LLCPPFLAGS) -DLUSTRE_VERSION=\"32\" -DBUILD_VERSION=\"1\"
liblustreclass_a_CFLAGS = $(LLCFLAGS)
switch (cmd) {
case OBD_IOC_PROCESS_CFG: {
- char *buf;
struct lustre_cfg *lcfg;
if (!data->ioc_plen1 || !data->ioc_pbuf1) {
CERROR("No config buffer passed!\n");
GOTO(out, err = -EINVAL);
}
- err = lustre_cfg_getdata(&buf, data->ioc_plen1,
- data->ioc_pbuf1, 0);
+
+ err = lustre_cfg_sanity_check(data->ioc_pbuf1,
+ data->ioc_plen1);
+ if (err)
+ GOTO(out, err);
+
+ OBD_ALLOC(lcfg, data->ioc_plen1);
+ err = copy_from_user(lcfg, data->ioc_pbuf1, data->ioc_plen1);
if (err)
GOTO(out, err);
- lcfg = (struct lustre_cfg* ) buf;
err = class_process_config(lcfg);
- lustre_cfg_freedata(buf, data->ioc_plen1);
+ OBD_FREE(lcfg, data->ioc_plen1);
GOTO(out, err);
}
};
#endif
+#define OBD_INIT_CHECK
+#ifdef OBD_INIT_CHECK
+int obd_init_checks(void)
+{
+ long long llval;
+ __u64 u64val;
+ char buf[64];
+ int len, ret = 0;
+
+ CDEBUG(D_INFO, "LPU64=%s, LPD64=%s, LPX64=%s, LPSZ=%s, LPSSZ=%s\n",
+ LPU64, LPD64, LPX64, LPSZ, LPSSZ);
+
+ CDEBUG(D_INFO, "OBD_OBJECT_EOF = "LPX64"\n", OBD_OBJECT_EOF);
+
+ llval = OBD_OBJECT_EOF;
+ CDEBUG(D_INFO, "llval OBD_OBJECT_EOF = "LPX64"\n", llval);
+ if (llval != OBD_OBJECT_EOF) {
+ CDEBUG(D_ERROR, "long long "LPX64"(%d) != 0xffffffffffffffff\n",
+ llval, sizeof(llval));
+ ret = -EINVAL;
+ }
+ len = snprintf(buf, sizeof(buf), LPX64, llval);
+ if (len != 18) {
+ CDEBUG(D_WARNING, "LPX64 wrong length! strlen(%s)=%d != 18\n",
+ buf, len);
+ ret = -EINVAL;
+ }
+
+ u64val = OBD_OBJECT_EOF;
+ CDEBUG(D_INFO, "u64val OBD_OBJECT_EOF = "LPX64"\n", u64val);
+ if (u64val != OBD_OBJECT_EOF) {
+ CDEBUG(D_ERROR, "__u64 "LPX64"(%d) != 0xffffffffffffffff\n",
+ u64val, sizeof(u64val));
+ ret = -EINVAL;
+ }
+ if (u64val >> 8 != OBD_OBJECT_EOF >> 8) {
+ CDEBUG(D_ERROR, "__u64 "LPX64"(%d) != 0xffffffffffffffff\n",
+ u64val, sizeof(u64val));
+ ret = -EINVAL;
+ }
+ len = snprintf(buf, sizeof(buf), LPX64, u64val);
+ if (len != 18) {
+ CDEBUG(D_WARNING, "LPX64 wrong length! strlen(%s)=%d != 18\n",
+ buf, len);
+ ret = -EINVAL;
+ }
+ len = snprintf(buf, sizeof(buf), LPU64, u64val);
+ if (len != 20) {
+ CDEBUG(D_WARNING, "LPU64 wrong length! strlen(%s)=%d != 20\n",
+ buf, len);
+ ret = -EINVAL;
+ }
+ len = snprintf(buf, sizeof(buf), LPD64, u64val);
+ if (len != 2) {
+ CDEBUG(D_WARNING, "LPD64 wrong length! strlen(%s)=%d != 2\n",
+ buf, len);
+ ret = -EINVAL;
+ }
+ if ((u64val & ~PAGE_MASK) >= PAGE_SIZE) {
+ CDEBUG(D_WARNING, "mask failed: u64val "LPU64" >= %lu\n",
+ u64val, PAGE_SIZE);
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+#else
+#define obd_init_checks() do {} while(0)
+#endif
+
#ifdef __KERNEL__
static int __init init_obdclass(void)
#else
#ifdef __KERNEL__
obd_sysctl_init();
#endif
+ obd_init_checks();
#ifdef LPROCFS
proc_lustre_root = proc_mkdir("lustre", proc_root_fs);
/* Iterate the obd_device list looking devices have grp_uuid. Start
searching at *next, and if a device is found, the next index to look
- it is saved in *next. If next is NULL, then the first matching device
+ at is saved in *next. If next is NULL, then the first matching device
will always be returned. */
struct obd_device * class_devices_in_group(struct obd_uuid *grp_uuid, int *next)
{
if (flags & LLOG_F_IS_CAT) {
INIT_LIST_HEAD(&handle->u.chd.chd_head);
llh->llh_size = sizeof(struct llog_logid_rec);
- }
- else if (flags & LLOG_F_IS_PLAIN)
+ } else if (flags & LLOG_F_IS_PLAIN) {
INIT_LIST_HEAD(&handle->u.phd.phd_entry);
- else
+ } else {
+ CERROR("Unknown flags: %#x (Expected %#x or %#x\n",
+ flags, LLOG_F_IS_CAT, LLOG_F_IS_PLAIN);
LBUG();
+ }
if (rc) {
OBD_FREE(llh, sizeof(*llh));
{
struct llog_log_hdr *llh = loghandle->lgh_hdr;
struct llog_process_cat_data *cd = catdata;
- void *buf;
+ char *buf;
__u64 cur_offset = LLOG_CHUNK_SIZE;
- int rc = 0, index = 1, last_index, idx;
+ int rc = 0, index = 1, last_index;
int saved_index = 0;
ENTRY;
if (index == last_index + 1)
break;
+ CDEBUG(D_OTHER, "index: %d last_index %d\n",
+ index, last_index);
+
/* get the buf with our target record; avoid old garbage */
memset(buf, 0, LLOG_CHUNK_SIZE);
rc = llog_next_block(loghandle, &saved_index, index,
if (rc)
GOTO(out, rc);
- rec = buf;
- idx = rec->lrh_index;
- if (idx < index)
- CDEBUG(D_HA, "index %u : idx %u\n", index, idx);
- while (idx < index) {
- rec = (struct llog_rec_hdr *)
- ((char *)rec + rec->lrh_len);
- idx ++;
- }
+ /* NB: rec->lrh_len is already swabbed by the time the loop
+ * increment reads it, because each record is swabbed at the
+ * top of the loop body. */
+ for (rec = (struct llog_rec_hdr *)buf;
+ (char *)rec < buf + LLOG_CHUNK_SIZE;
+ rec = (struct llog_rec_hdr *)((char *)rec + rec->lrh_len)){
+
+ CDEBUG(D_OTHER, "processing rec 0x%p type %#x\n",
+ rec, rec->lrh_type);
+
+ if (LLOG_REC_HDR_NEEDS_SWABBING(rec))
+ lustre_swab_llog_rec(rec, NULL);
+
+ CDEBUG(D_OTHER, "after swabbing, type: %#x\n",
+ rec->lrh_type);
- /* process records in buffer, starting where we found one */
- while ((void *)rec < buf + LLOG_CHUNK_SIZE) {
if (rec->lrh_index == 0)
GOTO(out, 0); /* no more records */
+ if (rec->lrh_index < index) {
+ CDEBUG(D_OTHER, "skipping lrh_index %d\n",
+ rec->lrh_index);
+ continue;
+ }
+
+ CDEBUG(D_OTHER,
+ "lrh_index: %d lrh_len: %d (%d remains)\n",
+ rec->lrh_index, rec->lrh_len,
+ (int)(buf + LLOG_CHUNK_SIZE - (char *)rec));
+
/* if set, process the callback on this record */
if (ext2_test_bit(index, llh->llh_bitmap)) {
rc = cb(loghandle, rec, data);
}
if (rc)
GOTO(out, rc);
+ } else {
+ CDEBUG(D_OTHER, "Skipped index %d\n", index);
}
/* next record, still in buffer? */
++index;
if (index > last_index)
GOTO(out, rc = 0);
- rec = (struct llog_rec_hdr *)
- ((char *)rec + rec->lrh_len);
}
}
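The loop above walks variable-length records packed into an LLOG_CHUNK_SIZE
buffer; each record carries its length in both the header and a trailing
tail. An orientation sketch of the layout implied by the fields used in
these hunks (not a normative definition):

        /*  <---------------- lrh_len ---------------->
         *  +------------------+---------+------------+
         *  | llog_rec_hdr     | payload | rec tail   |
         *  | len, index, type |         | len, index |
         *  +------------------+---------+------------+
         *  next record begins at (char *)rec + rec->lrh_len
         */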
tail.lrt_len = rec.lrh_len = len;
tail.lrt_index = rec.lrh_index = index;
- rec.lrh_type = 0;
+ rec.lrh_type = LLOG_PAD_MAGIC;
rc = fsfilt_write_record(obd, file, &rec, sizeof(rec), &file->f_pos, 0);
if (rc) {
handle->lgh_file->f_dentry->d_name.name);
} else {
struct llog_rec_hdr *llh_hdr = &handle->lgh_hdr->llh_hdr;
- /*
- * These need to be fixed for bug 1987
- */
+
+ if (LLOG_REC_HDR_NEEDS_SWABBING(llh_hdr))
+ lustre_swab_llog_hdr(handle->lgh_hdr);
+
if (llh_hdr->lrh_type != LLOG_HDR_MAGIC) {
CERROR("bad log %.*s header magic: %#x (expected %#x)\n",
handle->lgh_file->f_dentry->d_name.len,
RETURN(-EINVAL);
}
- tail = buf + rc - sizeof(struct llog_rec_tail);
+ rec = buf;
+ tail = (struct llog_rec_tail *)((char *)buf + rc - sizeof(struct llog_rec_tail));
+
+ if (LLOG_REC_HDR_NEEDS_SWABBING(rec)) {
+ lustre_swab_llog_rec(rec, tail);
+ }
+
*cur_idx = tail->lrt_index;
/* this shouldn't happen */
/* sanity check that the start of the new buffer is no farther
* than the record that we wanted. This shouldn't happen. */
- rec = buf;
if (rec->lrh_index > next_idx) {
CERROR("missed desired record? %u > %u\n",
rec->lrh_index, next_idx);
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004-2005 Cluster File Systems, Inc.
+ * Author: jacob berkman <jacob@clusterfs.com>
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Swabbing of llog datatypes (from disk or over the wire).
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+#include <linux/lustre_log.h>
+
+static void print_llogd_body(struct llogd_body *d)
+{
+ CDEBUG(D_OTHER, "llogd body: %p\n", d);
+ CDEBUG(D_OTHER, "\tlgd_logid.lgl_oid: "LPX64"\n", d->lgd_logid.lgl_oid);
+ CDEBUG(D_OTHER, "\tlgd_logid.lgl_ogr: "LPX64"\n", d->lgd_logid.lgl_ogr);
+ CDEBUG(D_OTHER, "\tlgd_logid.lgl_ogen: %#x\n", d->lgd_logid.lgl_ogen);
+ CDEBUG(D_OTHER, "\tlgd_ctxt_idx: %#x\n", d->lgd_ctxt_idx);
+ CDEBUG(D_OTHER, "\tlgd_llh_flags: %#x\n", d->lgd_llh_flags);
+ CDEBUG(D_OTHER, "\tlgd_index: %#x\n", d->lgd_index);
+ CDEBUG(D_OTHER, "\tlgd_saved_index: %#x\n", d->lgd_saved_index);
+ CDEBUG(D_OTHER, "\tlgd_len: %#x\n", d->lgd_len);
+ CDEBUG(D_OTHER, "\tlgd_cur_offset: "LPX64"\n", d->lgd_cur_offset);
+}
+
+void lustre_swab_llogd_body (struct llogd_body *d)
+{
+ ENTRY;
+ print_llogd_body(d);
+ __swab64s (&d->lgd_logid.lgl_oid);
+ __swab64s (&d->lgd_logid.lgl_ogr);
+ __swab32s (&d->lgd_logid.lgl_ogen);
+ __swab32s (&d->lgd_ctxt_idx);
+ __swab32s (&d->lgd_llh_flags);
+ __swab32s (&d->lgd_index);
+ __swab32s (&d->lgd_saved_index);
+ __swab32s (&d->lgd_len);
+ __swab64s (&d->lgd_cur_offset);
+ print_llogd_body(d);
+ EXIT;
+}
+EXPORT_SYMBOL(lustre_swab_llogd_body);
+
+void lustre_swab_llogd_conn_body (struct llogd_conn_body *d)
+{
+ __swab64s (&d->lgdc_gen.mnt_cnt);
+ __swab64s (&d->lgdc_gen.conn_cnt);
+ __swab64s (&d->lgdc_logid.lgl_oid);
+ __swab64s (&d->lgdc_logid.lgl_ogr);
+ __swab32s (&d->lgdc_logid.lgl_ogen);
+ __swab32s (&d->lgdc_ctxt_idx);
+}
+EXPORT_SYMBOL(lustre_swab_llogd_conn_body);
+
+void lustre_swab_ll_fid (struct ll_fid *fid)
+{
+ __swab64s (&fid->id);
+ __swab32s (&fid->generation);
+ __swab32s (&fid->f_type);
+}
+EXPORT_SYMBOL(lustre_swab_ll_fid);
+
+void lustre_swab_llog_rec(struct llog_rec_hdr *rec, struct llog_rec_tail *tail)
+{
+ __swab32s(&rec->lrh_len);
+ __swab32s(&rec->lrh_index);
+ __swab32s(&rec->lrh_type);
+
+ switch (rec->lrh_type) {
+ case OST_SZ_REC: {
+ struct llog_size_change_rec *lsc =
+ (struct llog_size_change_rec *)rec;
+
+ lustre_swab_ll_fid(&lsc->lsc_fid);
+ __swab32s(&lsc->lsc_io_epoch);
+
+ break;
+ }
+
+ case OST_RAID1_REC:
+ break;
+
+ case MDS_UNLINK_REC: {
+ struct llog_unlink_rec *lur = (struct llog_unlink_rec *)rec;
+
+ __swab64s(&lur->lur_oid);
+ __swab32s(&lur->lur_ogen);
+
+ break;
+ }
+
+ case OBD_CFG_REC:
+ case PTL_CFG_REC:
+ /* these are swabbed as they are consumed */
+ break;
+
+ case LLOG_HDR_MAGIC: {
+ struct llog_log_hdr *llh = (struct llog_log_hdr *)rec;
+
+ __swab64s(&llh->llh_timestamp);
+ __swab32s(&llh->llh_count);
+ __swab32s(&llh->llh_bitmap_offset);
+ __swab32s(&llh->llh_flags);
+ __swab32s(&llh->llh_size);
+ __swab32s(&llh->llh_cat_idx);
+ if (tail != &llh->llh_tail) {
+ __swab32s(&llh->llh_tail.lrt_index);
+ __swab32s(&llh->llh_tail.lrt_len);
+ }
+
+ break;
+ }
+
+ case LLOG_LOGID_MAGIC: {
+ struct llog_logid_rec *lid = (struct llog_logid_rec *)rec;
+
+ __swab64s(&lid->lid_id.lgl_oid);
+ __swab64s(&lid->lid_id.lgl_ogr);
+ __swab32s(&lid->lid_id.lgl_ogen);
+ break;
+ }
+
+ /* ignore old pad records of type 0 */
+ case 0:
+ break;
+
+ default:
+ CERROR("Unknown llog rec type %#x swabbing rec %p\n",
+ rec->lrh_type, rec);
+ }
+
+ if (tail) {
+ __swab32s(&tail->lrt_len);
+ __swab32s(&tail->lrt_index);
+ }
+}
+EXPORT_SYMBOL(lustre_swab_llog_rec);
+
+static void print_llog_hdr(struct llog_log_hdr *h)
+{
+ CDEBUG(D_OTHER, "llog header: %p\n", h);
+ CDEBUG(D_OTHER, "\tllh_hdr.lrh_index: %#x\n", h->llh_hdr.lrh_index);
+ CDEBUG(D_OTHER, "\tllh_hdr.lrh_len: %#x\n", h->llh_hdr.lrh_len);
+ CDEBUG(D_OTHER, "\tllh_hdr.lrh_type: %#x\n", h->llh_hdr.lrh_type);
+ CDEBUG(D_OTHER, "\tllh_timestamp: "LPX64"\n", h->llh_timestamp);
+ CDEBUG(D_OTHER, "\tllh_count: %#x\n", h->llh_count);
+ CDEBUG(D_OTHER, "\tllh_bitmap_offset: %#x\n", h->llh_bitmap_offset);
+ CDEBUG(D_OTHER, "\tllh_flags: %#x\n", h->llh_flags);
+ CDEBUG(D_OTHER, "\tllh_size: %#x\n", h->llh_size);
+ CDEBUG(D_OTHER, "\tllh_cat_idx: %#x\n", h->llh_cat_idx);
+ CDEBUG(D_OTHER, "\tllh_tail.lrt_index: %#x\n", h->llh_tail.lrt_index);
+ CDEBUG(D_OTHER, "\tllh_tail.lrt_len: %#x\n", h->llh_tail.lrt_len);
+}
+
+void lustre_swab_llog_hdr (struct llog_log_hdr *h)
+{
+ ENTRY;
+ print_llog_hdr(h);
+
+ lustre_swab_llog_rec(&h->llh_hdr, &h->llh_tail);
+
+ print_llog_hdr(h);
+ EXIT;
+}
+EXPORT_SYMBOL(lustre_swab_llog_hdr);
+
+#define PRINT_PCFG32(x) CDEBUG(D_OTHER, "\tpcfg->pcfg_"#x": %#x\n", pcfg->pcfg_##x)
+#define PRINT_PCFG64(x) CDEBUG(D_OTHER, "\tpcfg->pcfg_"#x": "LPX64"\n", pcfg->pcfg_##x)
+
+static void print_portals_cfg(struct portals_cfg *pcfg)
+{
+ ENTRY;
+
+ if (!(portal_debug & D_OTHER)) /* don't loop on nothing */
+ return;
+ CDEBUG(D_OTHER, "portals_cfg: %p\n", pcfg);
+ PRINT_PCFG32(version);
+ PRINT_PCFG32(command);
+
+ PRINT_PCFG32(nal);
+ PRINT_PCFG32(flags);
+
+ PRINT_PCFG32(gw_nal);
+ PRINT_PCFG64(nid);
+ PRINT_PCFG64(nid2);
+ PRINT_PCFG64(nid3);
+ PRINT_PCFG32(id);
+ PRINT_PCFG32(misc);
+ PRINT_PCFG32(fd);
+ PRINT_PCFG32(count);
+ PRINT_PCFG32(size);
+ PRINT_PCFG32(wait);
+
+ PRINT_PCFG32(plen1);
+ PRINT_PCFG32(plen2);
+
+ EXIT;
+}
+
+void lustre_swab_portals_cfg(struct portals_cfg *pcfg)
+{
+ ENTRY;
+
+ __swab32s(&pcfg->pcfg_version);
+ __swab32s(&pcfg->pcfg_command);
+
+ __swab32s(&pcfg->pcfg_nal);
+ __swab32s(&pcfg->pcfg_flags);
+
+ __swab32s(&pcfg->pcfg_gw_nal);
+ __swab64s(&pcfg->pcfg_nid);
+ __swab64s(&pcfg->pcfg_nid2);
+ __swab64s(&pcfg->pcfg_nid3);
+ __swab32s(&pcfg->pcfg_id);
+ __swab32s(&pcfg->pcfg_misc);
+ __swab32s(&pcfg->pcfg_fd);
+ __swab32s(&pcfg->pcfg_count);
+ __swab32s(&pcfg->pcfg_size);
+ __swab32s(&pcfg->pcfg_wait);
+
+ __swab32s(&pcfg->pcfg_plen1);
+ __swab32s(&pcfg->pcfg_plen2);
+
+ print_portals_cfg(pcfg);
+ EXIT;
+}
+EXPORT_SYMBOL(lustre_swab_portals_cfg);
+
+static void print_lustre_cfg(struct lustre_cfg *lcfg)
+{
+ int i;
+ ENTRY;
+
+ if (!(portal_debug & D_OTHER)) /* don't loop on nothing */
+ return;
+ CDEBUG(D_OTHER, "lustre_cfg: %p\n", lcfg);
+ CDEBUG(D_OTHER, "\tlcfg->lcfg_version: %#x\n", lcfg->lcfg_version);
+
+ CDEBUG(D_OTHER, "\tlcfg->lcfg_command: %#x\n", lcfg->lcfg_command);
+ CDEBUG(D_OTHER, "\tlcfg->lcfg_num: %#x\n", lcfg->lcfg_num);
+ CDEBUG(D_OTHER, "\tlcfg->lcfg_flags: %#x\n", lcfg->lcfg_flags);
+ CDEBUG(D_OTHER, "\tlcfg->lcfg_nid: "LPX64"\n", lcfg->lcfg_nid);
+ CDEBUG(D_OTHER, "\tlcfg->lcfg_nal: %#x\n", lcfg->lcfg_nal);
+
+ CDEBUG(D_OTHER, "\tlcfg->lcfg_bufcount: %d\n", lcfg->lcfg_bufcount);
+ if (lcfg->lcfg_bufcount < LUSTRE_CFG_MAX_BUFCOUNT)
+ for (i = 0; i < lcfg->lcfg_bufcount; i++)
+ CDEBUG(D_OTHER, "\tlcfg->lcfg_buflens[%d]: %d\n",
+ i, lcfg->lcfg_buflens[i]);
+ EXIT;
+}
+
+void lustre_swab_lustre_cfg(struct lustre_cfg *lcfg)
+{
+ int i;
+ ENTRY;
+
+ __swab32s(&lcfg->lcfg_version);
+
+ if (lcfg->lcfg_version != LUSTRE_CFG_VERSION) {
+ CERROR("not swabbing lustre_cfg version %#x (expecting %#x)\n",
+ lcfg->lcfg_version, LUSTRE_CFG_VERSION);
+ EXIT;
+ return;
+ }
+
+ __swab32s(&lcfg->lcfg_command);
+
+ __swab32s(&lcfg->lcfg_num);
+ __swab32s(&lcfg->lcfg_flags);
+ __swab64s(&lcfg->lcfg_nid);
+ __swab32s(&lcfg->lcfg_nal);
+
+ __swab32s(&lcfg->lcfg_bufcount);
+ for (i = 0; i < lcfg->lcfg_bufcount && i < LUSTRE_CFG_MAX_BUFCOUNT; i++)
+ __swab32s(&lcfg->lcfg_buflens[i]);
+
+ print_lustre_cfg(lcfg);
+ EXIT;
+ return;
+}
+EXPORT_SYMBOL(lustre_swab_lustre_cfg);
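All of the swabbing added in this patch hangs off one detection idiom: a
structure is byte-swapped only when its magic or version field equals the
byte-swapped constant, so native-endian data is never swabbed twice.
Schematically (struct foo, FOO_MAGIC, and maybe_swab_foo are placeholders,
not Lustre symbols):

        static void maybe_swab_foo(struct foo *f)
        {
                if (f->foo_magic == FOO_MAGIC)
                        return;                    /* already in host order */
                if (f->foo_magic == __swab32(FOO_MAGIC)) {
                        __swab32s(&f->foo_magic);  /* known-foreign: fix in place */
                        __swab64s(&f->foo_val);
                } else {
                        CERROR("bad foo magic: %#x\n", f->foo_magic);
                }
        }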
}
llog_init_handle(llh, LLOG_F_IS_PLAIN, &uuid);
- lcr.lcr_hdr.lrh_len = lcr.lcr_tail.lrt_len = cpu_to_le32(sizeof(lcr));
- lcr.lcr_hdr.lrh_type = cpu_to_le32(OST_SZ_REC);
+ lcr.lcr_hdr.lrh_len = lcr.lcr_tail.lrt_len = sizeof(lcr);
+ lcr.lcr_hdr.lrh_type = OST_SZ_REC;
rc = llog_write_rec(llh, &lcr.lcr_hdr, NULL, 0, NULL, -1);
if (rc) {
CERROR("7: write one log record failed: %d\n", rc);
int rc;
ENTRY;
- if (lcfg->lcfg_inllen1 < 1) {
+ if (lcfg->lcfg_bufcount < 2 || LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) {
CERROR("requires a TARGET OBD name\n");
RETURN(-EINVAL);
}
- tgt = class_name2obd(lcfg->lcfg_inlbuf1);
+ tgt = class_name2obd(lustre_cfg_string(lcfg, 1));
if (!tgt || !tgt->obd_attached || !tgt->obd_set_up) {
CERROR("target device not attached or not set up (%s)\n",
- lcfg->lcfg_inlbuf1);
+ lustre_cfg_string(lcfg, 1));
RETURN(-EINVAL);
}
int rc;
int nob = strnlen (uuid, PAGE_SIZE) + 1;
+ LASSERT(nid != 0);
+ LASSERT(nal != 0);
+
if (nob > PAGE_SIZE)
return -EINVAL;
int class_attach(struct lustre_cfg *lcfg)
{
struct obd_type *type;
- struct obd_device *obd;
+ struct obd_device *obd = NULL;
char *typename, *name, *namecopy, *uuid;
int rc, len, cleanup_phase = 0;
- if (!lcfg->lcfg_inllen1 || !lcfg->lcfg_inlbuf1) {
+ if (!LUSTRE_CFG_BUFLEN(lcfg, 1)) {
CERROR("No type passed!\n");
RETURN(-EINVAL);
}
- if (lcfg->lcfg_inlbuf1[lcfg->lcfg_inllen1 - 1] != 0) {
- CERROR("Type not nul terminated!\n");
- RETURN(-EINVAL);
- }
- typename = lcfg->lcfg_inlbuf1;
+ typename = lustre_cfg_string(lcfg, 1);
- if (!lcfg->lcfg_dev_namelen || !lcfg->lcfg_dev_name) {
+ if (!LUSTRE_CFG_BUFLEN(lcfg, 0)) {
CERROR("No name passed!\n");
RETURN(-EINVAL);
}
- if (lcfg->lcfg_dev_name[lcfg->lcfg_dev_namelen - 1] != 0) {
- CERROR("Name not nul terminated!\n");
- RETURN(-EINVAL);
- }
- name = lcfg->lcfg_dev_name;
+ name = lustre_cfg_string(lcfg, 0);
- if (!lcfg->lcfg_inllen2 || !lcfg->lcfg_inlbuf2) {
+ if (!LUSTRE_CFG_BUFLEN(lcfg, 2)) {
CERROR("No UUID passed!\n");
RETURN(-EINVAL);
}
- if (lcfg->lcfg_inlbuf2[lcfg->lcfg_inllen2 - 1] != 0) {
- CERROR("UUID not nul terminated!\n");
- RETURN(-EINVAL);
- }
- uuid = lcfg->lcfg_inlbuf2;
+ uuid = lustre_cfg_string(lcfg, 2);
CDEBUG(D_IOCTL, "attach type %s name: %s uuid: %s\n",
- MKSTR(lcfg->lcfg_inlbuf1),
- MKSTR(lcfg->lcfg_dev_name), MKSTR(lcfg->lcfg_inlbuf2));
+ MKSTR(typename), MKSTR(name), MKSTR(uuid));
/* find the type */
type = class_get_type(typename);
obd->obd_stopping = 1;
spin_unlock(&obd->obd_dev_lock);
- if (lcfg->lcfg_inlbuf1) {
- for (flag = lcfg->lcfg_inlbuf1; *flag != 0; flag++)
+ if (lcfg->lcfg_bufcount >= 2 && LUSTRE_CFG_BUFLEN(lcfg, 1) > 0) {
+ for (flag = lustre_cfg_string(lcfg, 1); *flag != 0; flag++)
switch (*flag) {
case 'F':
obd->obd_force = 1;
int class_process_config(struct lustre_cfg *lcfg)
{
struct obd_device *obd;
- char str[PTL_NALFMT_SIZE];
+ char nidstr[PTL_NALFMT_SIZE];
int err;
LASSERT(lcfg && !IS_ERR(lcfg));
-
CDEBUG(D_IOCTL, "processing cmd: %x\n", lcfg->lcfg_command);
/* Commands that don't need a device */
}
case LCFG_ADD_UUID: {
CDEBUG(D_IOCTL, "adding mapping from uuid %s to nid "LPX64
- " (%s), nal %x\n", lcfg->lcfg_inlbuf1, lcfg->lcfg_nid,
- portals_nid2str(lcfg->lcfg_nal, lcfg->lcfg_nid, str),
+ " (%s), nal %x\n", lustre_cfg_string(lcfg, 1),
+ lcfg->lcfg_nid,
+ portals_nid2str(lcfg->lcfg_nal, lcfg->lcfg_nid, nidstr),
lcfg->lcfg_nal);
- err = class_add_uuid(lcfg->lcfg_inlbuf1, lcfg->lcfg_nid,
+ err = class_add_uuid(lustre_cfg_string(lcfg, 1), lcfg->lcfg_nid,
lcfg->lcfg_nal);
GOTO(out, err);
}
case LCFG_DEL_UUID: {
CDEBUG(D_IOCTL, "removing mappings for uuid %s\n",
- lcfg->lcfg_inlbuf1 == NULL ? "<all uuids>" :
- lcfg->lcfg_inlbuf1);
+ (lcfg->lcfg_bufcount < 2 || LUSTRE_CFG_BUFLEN(lcfg, 1) == 0)
+ ? "<all uuids>" : lustre_cfg_string(lcfg, 1));
- err = class_del_uuid(lcfg->lcfg_inlbuf1);
+ err = class_del_uuid(lustre_cfg_string(lcfg, 1));
GOTO(out, err);
}
case LCFG_MOUNTOPT: {
CDEBUG(D_IOCTL, "mountopt: profile %s osc %s mdc %s\n",
- lcfg->lcfg_inlbuf1, lcfg->lcfg_inlbuf2,
- lcfg->lcfg_inlbuf3);
+ lustre_cfg_string(lcfg, 1),
+ lustre_cfg_string(lcfg, 2),
+ lustre_cfg_string(lcfg, 3));
/* set these mount options somewhere, so ll_fill_super
* can find them. */
- err = class_add_profile(lcfg->lcfg_inllen1, lcfg->lcfg_inlbuf1,
- lcfg->lcfg_inllen2, lcfg->lcfg_inlbuf2,
- lcfg->lcfg_inllen3, lcfg->lcfg_inlbuf3);
+ err = class_add_profile(LUSTRE_CFG_BUFLEN(lcfg, 1),
+ lustre_cfg_string(lcfg, 1),
+ LUSTRE_CFG_BUFLEN(lcfg, 2),
+ lustre_cfg_string(lcfg, 2),
+ LUSTRE_CFG_BUFLEN(lcfg, 3),
+ lustre_cfg_string(lcfg, 3));
GOTO(out, err);
}
case LCFG_DEL_MOUNTOPT: {
- CDEBUG(D_IOCTL, "mountopt: profile %s\n", lcfg->lcfg_inlbuf1);
+ CDEBUG(D_IOCTL, "mountopt: profile %s\n",
+ lustre_cfg_string(lcfg, 1));
/* set these mount options somewhere, so ll_fill_super
* can find them. */
- class_del_profile(lcfg->lcfg_inlbuf1);
+ class_del_profile(lustre_cfg_string(lcfg, 1));
GOTO(out, err = 0);
}
case LCFG_SET_TIMEOUT: {
}
case LCFG_SET_UPCALL: {
CDEBUG(D_IOCTL, "setting lustre ucpall to: %s\n",
- lcfg->lcfg_inlbuf1);
- if (lcfg->lcfg_inllen1 > sizeof obd_lustre_upcall)
+ lustre_cfg_string(lcfg, 1));
+ if (LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof obd_lustre_upcall)
GOTO(out, err = -EINVAL);
- memcpy(obd_lustre_upcall, lcfg->lcfg_inlbuf1,
- lcfg->lcfg_inllen1);
+ strncpy(obd_lustre_upcall, lustre_cfg_string(lcfg, 1),
+ sizeof (obd_lustre_upcall));
GOTO(out, err = 0);
}
}
/* Commands that require a device */
- obd = class_name2obd(lcfg->lcfg_dev_name);
+ obd = class_name2obd(lustre_cfg_string(lcfg, 0));
if (obd == NULL) {
- if (lcfg->lcfg_dev_name == NULL)
+ if (!LUSTRE_CFG_BUFLEN(lcfg, 0))
CERROR("this lcfg command requires a device name\n");
else
- CERROR("no device for: %s\n", lcfg->lcfg_dev_name);
+ CERROR("no device for: %s\n",
+ lustre_cfg_string(lcfg, 0));
GOTO(out, err = -EINVAL);
}
char *cfg_buf = (char*) (rec + 1);
int rc = 0;
ENTRY;
- if (rec->lrh_type == OBD_CFG_REC) {
- char *buf;
- struct lustre_cfg *lcfg;
- char *old_name = NULL;
- int old_len = 0;
- char *old_uuid = NULL;
- int old_uuid_len = 0;
+ switch (rec->lrh_type) {
+ case OBD_CFG_REC: {
+ struct lustre_cfg *lcfg, *lcfg_new;
+ struct lustre_cfg_bufs bufs;
char *inst_name = NULL;
int inst_len = 0;
+ int inst = 0;
+
+ lcfg = (struct lustre_cfg *)cfg_buf;
+ if (lcfg->lcfg_version == __swab32(LUSTRE_CFG_VERSION))
+ lustre_swab_lustre_cfg(lcfg);
- rc = lustre_cfg_getdata(&buf, cfg_len, cfg_buf, 1);
+ rc = lustre_cfg_sanity_check(cfg_buf, cfg_len);
if (rc)
GOTO(out, rc);
- lcfg = (struct lustre_cfg* ) buf;
- if (cfg && cfg->cfg_instance && lcfg->lcfg_dev_name) {
- inst_len = strlen(lcfg->lcfg_dev_name) +
- strlen(cfg->cfg_instance) + 2;
+ lustre_cfg_bufs_init(&bufs, lcfg);
+
+ if (cfg && cfg->cfg_instance && LUSTRE_CFG_BUFLEN(lcfg, 0) > 0) {
+ inst = 1;
+ inst_len = LUSTRE_CFG_BUFLEN(lcfg, 0) +
+ strlen(cfg->cfg_instance) + 1;
OBD_ALLOC(inst_name, inst_len);
if (inst_name == NULL)
GOTO(out, rc = -ENOMEM);
- sprintf(inst_name, "%s-%s", lcfg->lcfg_dev_name,
+ sprintf(inst_name, "%s-%s",
+ lustre_cfg_string(lcfg, 0),
cfg->cfg_instance);
- old_name = lcfg->lcfg_dev_name;
- old_len = lcfg->lcfg_dev_namelen;
- lcfg->lcfg_dev_name = inst_name;
- lcfg->lcfg_dev_namelen = strlen(inst_name) + 1;
+ lustre_cfg_bufs_set_string(&bufs, 0, inst_name);
}
if (cfg && lcfg->lcfg_command == LCFG_ATTACH) {
- old_uuid = lcfg->lcfg_inlbuf2;
- old_uuid_len = lcfg->lcfg_inllen2;
-
- lcfg->lcfg_inlbuf2 = (char*)&cfg->cfg_uuid.uuid;
- lcfg->lcfg_inllen2 = sizeof(cfg->cfg_uuid);
+ lustre_cfg_bufs_set_string(&bufs, 2, cfg->cfg_uuid.uuid);
}
- rc = class_process_config(lcfg);
+ lcfg_new = lustre_cfg_new(lcfg->lcfg_command, &bufs);
- if (old_name) {
- lcfg->lcfg_dev_name = old_name;
- lcfg->lcfg_dev_namelen = old_len;
- OBD_FREE(inst_name, inst_len);
- }
+ lcfg_new->lcfg_num = lcfg->lcfg_num;
+ lcfg_new->lcfg_flags = lcfg->lcfg_flags;
+ lcfg_new->lcfg_nid = lcfg->lcfg_nid;
+ lcfg_new->lcfg_nal = lcfg->lcfg_nal;
- if (old_uuid) {
- lcfg->lcfg_inlbuf2 = old_uuid;
- lcfg->lcfg_inllen2 = old_uuid_len;
- }
+ rc = class_process_config(lcfg_new);
+ lustre_cfg_free(lcfg_new);
- lustre_cfg_freedata(buf, cfg_len);
- } else if (rec->lrh_type == PTL_CFG_REC) {
+ if (inst)
+ OBD_FREE(inst_name, inst_len);
+ break;
+ }
+ case PTL_CFG_REC: {
struct portals_cfg *pcfg = (struct portals_cfg *)cfg_buf;
+ if (pcfg->pcfg_version != PORTALS_CFG_VERSION) {
+ if (pcfg->pcfg_version == __swab32(PORTALS_CFG_VERSION)) {
+ CDEBUG(D_OTHER, "swabbing portals_cfg %p\n",
+ pcfg);
+ lustre_swab_portals_cfg(pcfg);
+ } else {
+ CERROR("Unknown portals_cfg version: %#x "
+ "(expecting %#x)\n",
+ pcfg->pcfg_version,
+ PORTALS_CFG_VERSION);
+ RETURN(-EINVAL);
+ }
+ }
if (pcfg->pcfg_command ==NAL_CMD_REGISTER_MYNID &&
cfg->cfg_local_nid != PTL_NID_ANY) {
pcfg->pcfg_nid = cfg->cfg_local_nid;
}
rc = libcfs_nal_cmd(pcfg);
+ break;
+ }
+ default:
+ CERROR("Unknown llog record type %#x encountered\n",
+ rec->lrh_type);
+ break;
}
out:
RETURN(rc);
int rc, rc2;
ENTRY;
+ CDEBUG(D_INFO, "looking up llog %s\n", name);
rc = llog_create(ctxt, &llh, NULL, name);
if (rc)
RETURN(rc);
int rc = 0;
ENTRY;
if (rec->lrh_type == OBD_CFG_REC) {
- char *buf;
struct lustre_cfg *lcfg;
+ int i;
- rc = lustre_cfg_getdata(&buf, cfg_len, cfg_buf, 1);
+ rc = lustre_cfg_sanity_check(cfg_buf, cfg_len);
if (rc)
GOTO(out, rc);
- lcfg = (struct lustre_cfg* ) buf;
+ lcfg = (struct lustre_cfg *)cfg_buf;
CDEBUG(D_INFO, "lcfg command: %x\n", lcfg->lcfg_command);
- if (lcfg->lcfg_dev_name)
+ if (LUSTRE_CFG_BUFLEN(lcfg, 0) > 0)
CDEBUG(D_INFO, " devname: %s\n",
- lcfg->lcfg_dev_name);
+ lustre_cfg_string(lcfg, 0));
if (lcfg->lcfg_flags)
CDEBUG(D_INFO, " flags: %x\n", lcfg->lcfg_flags);
if (lcfg->lcfg_nid)
CDEBUG(D_INFO, "  nid: "LPX64"  nal: %x\n",
lcfg->lcfg_nid, lcfg->lcfg_nal);
if (lcfg->lcfg_num)
CDEBUG(D_INFO, "  num: %x\n", lcfg->lcfg_num);
- if (lcfg->lcfg_inlbuf1)
- CDEBUG(D_INFO, " inlbuf1: %s\n",lcfg->lcfg_inlbuf1);
- if (lcfg->lcfg_inlbuf2)
- CDEBUG(D_INFO, " inlbuf2: %s\n",lcfg->lcfg_inlbuf2);
- if (lcfg->lcfg_inlbuf3)
- CDEBUG(D_INFO, " inlbuf3: %s\n",lcfg->lcfg_inlbuf3);
- if (lcfg->lcfg_inlbuf4)
- CDEBUG(D_INFO, " inlbuf4: %s\n",lcfg->lcfg_inlbuf4);
-
- lustre_cfg_freedata(buf, cfg_len);
+ for (i = 1; i < lcfg->lcfg_bufcount; i++)
+ if (LUSTRE_CFG_BUFLEN(lcfg, i) > 0)
+ CDEBUG(D_INFO, " inlbuf%d: %s\n", i,
+ lustre_cfg_string(lcfg, i));
} else if (rec->lrh_type == PTL_CFG_REC) {
struct portals_cfg *pcfg = (struct portals_cfg *)cfg_buf;
-
CDEBUG(D_INFO, "pcfg command: %d\n", pcfg->pcfg_command);
if (pcfg->pcfg_nal)
CDEBUG(D_INFO, " nal: %x\n",
int rc;
ENTRY;
- if (lcfg->lcfg_inllen1 < 1) {
+ if (lcfg->lcfg_bufcount < 2 || LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) {
CERROR("requires a TARGET OBD name\n");
RETURN(-EINVAL);
}
- tgt = class_name2obd(lcfg->lcfg_inlbuf1);
+ tgt = class_name2obd(lustre_cfg_string(lcfg, 1));
if (!tgt || !tgt->obd_attached || !tgt->obd_set_up) {
CERROR("device not attached or not set up (%s)\n",
- lcfg->lcfg_inlbuf1);
+ lustre_cfg_string(lcfg, 1));
RETURN(-EINVAL);
}
rc = obd_connect(&conn, tgt, &echo_uuid);
if (rc) {
- CERROR("fail to connect to device %s\n", lcfg->lcfg_inlbuf1);
+ CERROR("fail to connect to device %s\n",
+ lustre_cfg_string(lcfg, 1));
return (rc);
}
ec->ec_exp = class_conn2export(&conn);
struct lustre_cfg* lcfg = buf;
struct filter_obd *filter = &obd->u.filter;
struct vfsmount *mnt;
+ char *str;
char ns_name[48];
int rc = 0;
ENTRY;
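+ /* lcfg buffers here: 1 = mount device, 2 = fstype, 3 = "f"/"n"
+ * recovery flag, 4 = extra mount options (see filter_setup) */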
- if (!lcfg->lcfg_inlbuf1 || !lcfg->lcfg_inlbuf2)
+ if (lcfg->lcfg_bufcount < 3 ||
+ LUSTRE_CFG_BUFLEN(lcfg, 1) < 1 ||
+ LUSTRE_CFG_BUFLEN(lcfg, 2) < 1)
RETURN(-EINVAL);
- obd->obd_fsops = fsfilt_get_ops(lcfg->lcfg_inlbuf2);
+ obd->obd_fsops = fsfilt_get_ops(lustre_cfg_string(lcfg, 2));
if (IS_ERR(obd->obd_fsops))
RETURN(PTR_ERR(obd->obd_fsops));
- mnt = do_kern_mount(lcfg->lcfg_inlbuf2, MS_NOATIME | MS_NODIRATIME,
- lcfg->lcfg_inlbuf1, (void *)option);
+ mnt = do_kern_mount(lustre_cfg_string(lcfg, 2), MS_NOATIME | MS_NODIRATIME,
+ lustre_cfg_string(lcfg, 1), (void *)option);
rc = PTR_ERR(mnt);
if (IS_ERR(mnt))
GOTO(err_ops, rc);
LASSERT(!ll_check_rdonly(ll_sbdev(mnt->mnt_sb)));
- if (lcfg->lcfg_inllen3 > 0 && lcfg->lcfg_inlbuf3) {
- if (*lcfg->lcfg_inlbuf3 == 'f') {
+ if (lcfg->lcfg_bufcount > 3 && LUSTRE_CFG_BUFLEN(lcfg, 3) > 0) {
+ str = lustre_cfg_string(lcfg, 3);
+ if (*str == 'f') {
obd->obd_replayable = 1;
obd_sync_filter = 1;
CWARN("%s: recovery enabled\n", obd->obd_name);
} else {
- if (*lcfg->lcfg_inlbuf3 != 'n') {
+ if (*str != 'n') {
CERROR("unrecognised flag '%c'\n",
- *lcfg->lcfg_inlbuf3);
+ *str);
}
// XXX Robert? Why do we get errors here
// GOTO(err_mntput, rc = -EINVAL);
"Recovery progress can be monitored by watching "
"/proc/fs/lustre/obdfilter/%s/recovery_status.\n",
obd->obd_name,
- lcfg->lcfg_inlbuf1,
+ lustre_cfg_string(lcfg, 1),
obd->obd_recoverable_clients,
(obd->obd_recoverable_clients == 1)
? "client" : "clients",
obd->obd_name);
} else {
LCONSOLE_INFO("OST %s now serving %s with recovery %s.\n",
- obd->obd_name, lcfg->lcfg_inlbuf1,
+ obd->obd_name,
+ lustre_cfg_string(lcfg, 1),
obd->obd_replayable ? "enabled" : "disabled");
}
struct lustre_cfg* lcfg = buf;
int rc;
- if (!lcfg->lcfg_inlbuf1 || !lcfg->lcfg_inlbuf2)
+ if (!LUSTRE_CFG_BUFLEN(lcfg, 1) || !LUSTRE_CFG_BUFLEN(lcfg, 2))
RETURN(-EINVAL);
- rc = filter_common_setup(obd, len, buf, lcfg->lcfg_inlbuf4);
+ rc = filter_common_setup(obd, len, buf, lustre_cfg_buf(lcfg, 4));
lprocfs_init_vars(filter, &lvars);
if (rc == 0 && lprocfs_obd_setup(obd, lvars.obd_vars) == 0 &&
#include <linux/lustre_fsfilt.h>
#include "filter_internal.h"
-
/* We should only change the file mtime (and not the ctime, like
* update_inode_times() in generic_file_write()) when we only change data. */
void inode_update_time(struct inode *inode, int ctime_too)
* leaves it there, sometimes generating io from it at later truncates.
* Someday very soon we'll be performing our brw_kiovec() IO to and
* from the page cache. */
-
check_pending_bhs(KIOBUF_GET_BLOCKS(iobuf), iobuf->nr_pages,
inode->i_dev, 1 << inode->i_blkbits);
int rc = 0;
ENTRY;
- if (!(le32_to_cpu(llh->lgh_hdr->llh_flags) & LLOG_F_IS_PLAIN)) {
+ if (!(llh->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN)) {
CERROR("log is not plain\n");
RETURN(-EINVAL);
}
cookie.lgc_lgl = llh->lgh_id;
cookie.lgc_subsys = LLOG_UNLINK_ORIG_CTXT;
- cookie.lgc_index = le32_to_cpu(rec->lrh_index);
+ cookie.lgc_index = rec->lrh_index;
if (rec->lrh_type == LLOG_GEN_REC) {
lgr = (struct llog_gen_rec *)rec;
struct lustre_cfg* lcfg = buf;
char *option = NULL;
- if (!lcfg->lcfg_inlbuf2)
+ if (lcfg->lcfg_bufcount < 3 || LUSTRE_CFG_BUFLEN(lcfg, 2) < 1)
RETURN(-EINVAL);
/* for ldiskfs/ext3 filesystems, we must mount with 'writeback' data mode */
- if (!strcmp(lcfg->lcfg_inlbuf2, "extN"))
+ if (!strcmp(lustre_cfg_string(lcfg, 2), "ldiskfs"))
option = "data=writeback";
- else if (!strcmp(lcfg->lcfg_inlbuf2, "ext3"))
+ else if (!strcmp(lustre_cfg_string(lcfg, 2), "ext3"))
option = "data=writeback,asyncdel";
else
LBUG(); /* just a reminder */
struct client_obd *cli = &obddev->u.cli;
ENTRY;
- if (lcfg->lcfg_inllen3 < 1) {
+ if (lcfg->lcfg_bufcount < 4 || LUSTRE_CFG_BUFLEN(lcfg, 3) < 1) {
CERROR("setup requires a SAN device pathname\n");
RETURN(-EINVAL);
}
client_obd_setup(obddev, len, buf);
- cli->cl_sandev = path2dev(lcfg->lcfg_inlbuf3);
+ cli->cl_sandev = path2dev(lustre_cfg_string(lcfg, 3));
if (!kdev_t_to_nr(cli->cl_sandev)) {
- CERROR("%s seems not a valid SAN device\n", lcfg->lcfg_inlbuf3);
+ CERROR("%s seems not a valid SAN device\n",
+ lustre_cfg_string(lcfg, 3));
RETURN(-EINVAL);
}
ptl_nid_t peer_nid;
int i;
char str[PTL_NALFMT_SIZE];
- int rc = lustre_uuid_to_peer(uuid->uuid,
- &peer_nal, &peer_nid);
+ int rc;
+
+ ENTRY;
+
+ rc = lustre_uuid_to_peer (uuid->uuid, &peer_nal, &peer_nid);
+
if (rc != 0)
RETURN (rc);
peer->peer_id.nid = peer_nid;
peer->peer_id.pid = LUSTRE_SRV_PTL_PID;
peer->peer_ni = pni;
- return (0);
+ RETURN(0);
}
}
GOTO(out, rc = -EFAULT);
}
+ /* The log records are swabbed as they are processed */
ptr = lustre_msg_buf(req->rq_repmsg, 1, len);
if (ptr == NULL) {
CERROR ("Can't unpack bitmap\n");
__swab64s (&m->last_committed);
__swab64s (&m->transno);
__swab32s (&m->status);
- __swab32s (&m->bufcount);
__swab32s (&m->flags);
+ __swab32s (&m->conn_cnt);
+ __swab32s (&m->bufcount);
}
required_len = HDR_SIZE(m->bufcount);
__swab64s(&lvb->lvb_blocks);
}
-void lustre_swab_ll_fid (struct ll_fid *fid)
-{
- __swab64s (&fid->id);
- __swab32s (&fid->generation);
- __swab32s (&fid->f_type);
-}
-
void lustre_swab_mds_status_req (struct mds_status_req *r)
{
__swab32s (&r->flags);
/* uuid endian insensitive */
}
+static void print_lum (struct lov_user_md *lum)
+{
+ CDEBUG(D_OTHER, "lov_user_md %p:\n", lum);
+ CDEBUG(D_OTHER, "\tlmm_magic: %#x\n", lum->lmm_magic);
+ CDEBUG(D_OTHER, "\tlmm_pattern: %#x\n", lum->lmm_pattern);
+ CDEBUG(D_OTHER, "\tlmm_object_id: "LPU64"\n", lum->lmm_object_id);
+ CDEBUG(D_OTHER, "\tlmm_object_gr: "LPU64"\n", lum->lmm_object_gr);
+ CDEBUG(D_OTHER, "\tlmm_stripe_size: %#x\n", lum->lmm_stripe_size);
+ CDEBUG(D_OTHER, "\tlmm_stripe_count: %#x\n", lum->lmm_stripe_count);
+ CDEBUG(D_OTHER, "\tlmm_stripe_offset: %#x\n", lum->lmm_stripe_offset);
+}
+
+void lustre_swab_lov_user_md(struct lov_user_md *lum)
+{
+ ENTRY;
+ CDEBUG(D_IOCTL, "swabbing lov_user_md\n");
+ __swab32s(&lum->lmm_magic);
+ __swab32s(&lum->lmm_pattern);
+ __swab64s(&lum->lmm_object_id);
+ __swab64s(&lum->lmm_object_gr);
+ __swab32s(&lum->lmm_stripe_size);
+ __swab16s(&lum->lmm_stripe_count);
+ __swab16s(&lum->lmm_stripe_offset);
+ print_lum(lum);
+ EXIT;
+}
+
+static void print_lum_objs(struct lov_user_md *lum)
+{
+ struct lov_user_ost_data *lod;
+ int i;
+ ENTRY;
+ if (!(portal_debug & D_OTHER)) /* don't loop on nothing */
+ return;
+ CDEBUG(D_OTHER, "lov_user_md_objects: %p\n", lum);
+ for (i = 0; i < lum->lmm_stripe_count; i++) {
+ lod = &lum->lmm_objects[i];
+ CDEBUG(D_OTHER, "(%i) lod->l_object_id: "LPX64"\n",
+ i, lod->l_object_id);
+ CDEBUG(D_OTHER, "(%i) lod->l_object_gr: "LPX64"\n",
+ i, lod->l_object_gr);
+ CDEBUG(D_OTHER, "(%i) lod->l_ost_gen: %#x\n", i, lod->l_ost_gen);
+ CDEBUG(D_OTHER, "(%i) lod->l_ost_idx: %#x\n", i, lod->l_ost_idx);
+ }
+ EXIT;
+}
+
+void lustre_swab_lov_user_md_objects(struct lov_user_md *lum)
+{
+ struct lov_user_ost_data *lod;
+ int i;
+ ENTRY;
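+ /* assumes lustre_swab_lov_user_md() already ran, so that
+ * lmm_stripe_count used as the loop bound is in host order */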
+ for (i = 0; i < lum->lmm_stripe_count; i++) {
+ lod = &lum->lmm_objects[i];
+ __swab64s(&lod->l_object_id);
+ __swab64s(&lod->l_object_gr);
+ __swab32s(&lod->l_ost_gen);
+ __swab32s(&lod->l_ost_idx);
+ }
+ print_lum_objs(lum);
+ EXIT;
+}
+
void lustre_swab_ldlm_res_id (struct ldlm_res_id *id)
{
int i;
__swab16s (&r->r_error_cnt);
}
-/* no one calls this */
-int llog_log_swabbed(struct llog_log_hdr *hdr)
-{
- if (hdr->llh_hdr.lrh_type == __swab32(LLOG_HDR_MAGIC))
- return 1;
- if (hdr->llh_hdr.lrh_type == LLOG_HDR_MAGIC)
- return 0;
- return -1;
-}
-
-void lustre_swab_llogd_body (struct llogd_body *d)
-{
- __swab64s (&d->lgd_logid.lgl_oid);
- __swab64s (&d->lgd_logid.lgl_ogr);
- __swab32s (&d->lgd_logid.lgl_ogen);
- __swab32s (&d->lgd_ctxt_idx);
- __swab32s (&d->lgd_llh_flags);
- __swab32s (&d->lgd_index);
- __swab32s (&d->lgd_saved_index);
- __swab32s (&d->lgd_len);
- __swab64s (&d->lgd_cur_offset);
-}
-
-void lustre_swab_llog_hdr (struct llog_log_hdr *h)
-{
- __swab32s (&h->llh_hdr.lrh_index);
- __swab32s (&h->llh_hdr.lrh_len);
- __swab32s (&h->llh_hdr.lrh_type);
- __swab64s (&h->llh_timestamp);
- __swab32s (&h->llh_count);
- __swab32s (&h->llh_bitmap_offset);
- __swab32s (&h->llh_flags);
- __swab32s (&h->llh_tail.lrt_index);
- __swab32s (&h->llh_tail.lrt_len);
-}
-
-void lustre_swab_llogd_conn_body (struct llogd_conn_body *d)
-{
- __swab64s (&d->lgdc_gen.mnt_cnt);
- __swab64s (&d->lgdc_gen.conn_cnt);
- __swab64s (&d->lgdc_logid.lgl_oid);
- __swab64s (&d->lgdc_logid.lgl_ogr);
- __swab32s (&d->lgdc_logid.lgl_ogen);
- __swab32s (&d->lgdc_ctxt_idx);
-}
-
void lustre_assert_wire_constants(void)
{
/* Wire protocol assertions generated by 'wirecheck'
- * running on Linux schnapps.adilger.int 2.4.22-l32 #4 Thu Jan 8 14:32:57 MST 2004 i686 i686
- * with gcc version 3.2.2 20030222 (Red Hat Linux 3.2.2-5) */
+ * running on Linux milano 2.4.21-20.EL_87k.6-b_release_1_3_3.200410121845smp #1 SMP Tue Oct
+ * with gcc version 3.3.3 20040412 (Red Hat Linux 3.3.3-7) */
/* Constants... */
(long long)offsetof(struct obdo, o_gr));
LASSERTF((int)sizeof(((struct obdo *)0)->o_gr) == 8, " found %lld\n",
(long long)(int)sizeof(((struct obdo *)0)->o_gr));
- LASSERTF(offsetof(struct obdo, o_atime) == 16, " found %lld\n",
- (long long)offsetof(struct obdo, o_atime));
- LASSERTF((int)sizeof(((struct obdo *)0)->o_atime) == 8, " found %lld\n",
- (long long)(int)sizeof(((struct obdo *)0)->o_atime));
+ LASSERTF(offsetof(struct obdo, o_size) == 16, " found %lld\n",
+ (long long)offsetof(struct obdo, o_size));
+ LASSERTF((int)sizeof(((struct obdo *)0)->o_size) == 8, " found %lld\n",
+ (long long)(int)sizeof(((struct obdo *)0)->o_size));
LASSERTF(offsetof(struct obdo, o_mtime) == 24, " found %lld\n",
(long long)offsetof(struct obdo, o_mtime));
LASSERTF((int)sizeof(((struct obdo *)0)->o_mtime) == 8, " found %lld\n",
(long long)(int)sizeof(((struct obdo *)0)->o_mtime));
- LASSERTF(offsetof(struct obdo, o_ctime) == 32, " found %lld\n",
+ LASSERTF(offsetof(struct obdo, o_atime) == 32, " found %lld\n",
+ (long long)offsetof(struct obdo, o_atime));
+ LASSERTF((int)sizeof(((struct obdo *)0)->o_atime) == 8, " found %lld\n",
+ (long long)(int)sizeof(((struct obdo *)0)->o_atime));
+ LASSERTF(offsetof(struct obdo, o_ctime) == 40, " found %lld\n",
(long long)offsetof(struct obdo, o_ctime));
LASSERTF((int)sizeof(((struct obdo *)0)->o_ctime) == 8, " found %lld\n",
(long long)(int)sizeof(((struct obdo *)0)->o_ctime));
- LASSERTF(offsetof(struct obdo, o_size) == 40, " found %lld\n",
- (long long)offsetof(struct obdo, o_size));
- LASSERTF((int)sizeof(((struct obdo *)0)->o_size) == 8, " found %lld\n",
- (long long)(int)sizeof(((struct obdo *)0)->o_size));
LASSERTF(offsetof(struct obdo, o_blocks) == 48, " found %lld\n",
(long long)offsetof(struct obdo, o_blocks));
LASSERTF((int)sizeof(((struct obdo *)0)->o_blocks) == 8, " found %lld\n",
(long long)(int)sizeof(((struct llogd_conn_body *)0)->lgdc_ctxt_idx));
}
-
EXPORT_SYMBOL(lustre_swab_ost_body);
EXPORT_SYMBOL(lustre_swab_ost_last_id);
EXPORT_SYMBOL(lustre_swab_ost_lvb);
-EXPORT_SYMBOL(lustre_swab_ll_fid);
EXPORT_SYMBOL(lustre_swab_mds_status_req);
EXPORT_SYMBOL(lustre_swab_mds_body);
EXPORT_SYMBOL(lustre_swab_mds_rec_setattr);
EXPORT_SYMBOL(lustre_swab_mds_rec_unlink);
EXPORT_SYMBOL(lustre_swab_mds_rec_rename);
EXPORT_SYMBOL(lustre_swab_lov_desc);
+EXPORT_SYMBOL(lustre_swab_lov_user_md);
+EXPORT_SYMBOL(lustre_swab_lov_user_md_objects);
EXPORT_SYMBOL(lustre_swab_ldlm_res_id);
EXPORT_SYMBOL(lustre_swab_ldlm_policy_data);
EXPORT_SYMBOL(lustre_swab_ldlm_intent);
if (req->rq_send_state == LUSTRE_IMP_FULL)
pc = &ptlrpcd_pc;
- else
+ else
pc = &ptlrpcd_recovery_pc;
ptlrpc_set_add_new_req(pc->pc_set, req);
req->rq_ptlrpcd_data = pc;
-
+
ptlrpcd_wake(req);
}
request = ptlrpc_prep_req(import, OBD_LOG_CANCEL, 1,
&llcd->llcd_cookiebytes,
bufs);
+ /* XXX FIXME bug 249, 5515: request may be NULL here, so the
+ * portal override must not run before the allocation check */
+ if (request != NULL) {
+ request->rq_request_portal = LDLM_CANCEL_REQUEST_PORTAL;
+ request->rq_reply_portal = LDLM_CANCEL_REPLY_PORTAL;
+ }
if (request == NULL) {
rc = -ENOMEM;
# include <linux/config.h>
# include <linux/module.h>
# include <linux/kmod.h>
+# include <linux/list.h>
#else
# include <liblustre.h>
#endif
list_add (&rqbd->rqbd_list, &srv_ni->sni_active_rqbds);
spin_unlock_irqrestore(&svc->srv_lock, flags);
-
+
rc = ptlrpc_register_rqbd(rqbd);
if (rc != 0)
break;
-
+
posted = 1;
}
int main(int argc, char **argv)
{
+#ifdef O_DIRECT
int fd;
char *wbuf;
int blocks, seek_blocks;
printf("PASS\n");
return 0;
+#else /* !O_DIRECT */
+#warning O_DIRECT not defined, directio test will fail
+ printf("O_DIRECT not defined\n");
+ return 1;
+#endif /* !O_DIRECT */
}
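
For readers adapting this test: on Linux, O_DIRECT transfers must use suitably aligned user buffers (historically sector- or page-aligned, depending on kernel and filesystem). A hedged sketch of one portable way to obtain such a buffer; the 4096-byte alignment is an assumption that covers common block sizes, not something this test mandates:

    #include <stdlib.h>

    /* Allocate a buffer usable for O_DIRECT I/O; returns NULL on
     * failure.  posix_memalign() is standard POSIX. */
    static void *alloc_dio_buffer(size_t size)
    {
            void *buf = NULL;

            if (posix_memalign(&buf, 4096, size) != 0)
                    return NULL;
            return buf;
    }
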
RECORDSOUT=`grep "records out" $LOG | cut -d + -f1`
FILESIZE=`ls -l $OOS | awk '{ print $5 }'`
-if [ $RECORDSOUT -ne $(($FILESIZE / 1024)) ]; then
+if [ "$RECORDSOUT" -ne $(($FILESIZE / 1024)) ]; then
echo "ERROR: blocks written by dd not equal to the size of file"
SUCCESS=0
fi
`grep "records out" $LOG2 | cut -d+ -f 1`))
FILESIZE=$((`ls -l $OOS | awk '{print $5}'` + `ls -l $OOS2 | awk '{print $5}'`))
-if [ $RECORDSOUT -ne $(($FILESIZE / 1024)) ]; then
+if [ "$RECORDSOUT" -ne $(($FILESIZE / 1024)) ]; then
echo "ERROR: blocks written by dd not equal to the size of file"
SUCCESS=0
fi
#include <sys/ioctl.h>
#include <lustre/lustre_user.h>
+#ifndef O_DIRECT
+#define O_DIRECT 0
+#endif
int main(int argc, char *argv[])
{
{"O_NONBLOCK", O_NONBLOCK},
{"O_NDELAY", O_NDELAY},
{"O_SYNC", O_SYNC},
+#ifdef O_DIRECT
{"O_DIRECT", O_DIRECT},
+#endif
{"O_LARGEFILE", O_LARGEFILE},
{"O_DIRECTORY", O_DIRECTORY},
{"O_NOFOLLOW", O_NOFOLLOW},
int mode_set=0;
int flag_set=0;
int file_set=0;
- char c;
+ int c;
char* cloned_flags = NULL;
if (argc == 1)
int main(int argc, char *argv[])
{
unsigned long n;
- char msg[100], c, *end = NULL;
+ char msg[100], *end = NULL;
int h1, h2;
- int i;
+ int i, c;
while ((c = getopt(argc, argv, "cf:n:rs:vx")) != EOF) {
switch(c) {
int main(int argc, char ** argv)
{
- long i, count, iter = LONG_MAX, mode = 0, offset;
+ long i, c, count, iter = LONG_MAX, mode = 0, offset;
long int start, length = LONG_MAX, last, rc = 0;
char parent[4096], *t;
- char c, *prog = argv[0], *base;
+ char *prog = argv[0], *base;
int seed = 0;
int fd = -1;
flags = O_RDWR | O_CREAT;
}
if (strchr(argv[3], 'd')) {
+#ifdef O_DIRECT
flags |= O_DIRECT;
+#else
+ fprintf(stderr,
+ "%s: O_DIRECT not supported in this build\n",
+ argv[0]);
+ exit(1);
+#endif
}
if (!cmd)
usage(argv[0]);
} else {
cmd = READ | WRITE;
- flags = O_RDWR | O_CREAT | O_DIRECT;
+ flags = O_RDWR | O_CREAT;
+#ifdef O_DIRECT
+ flags |= O_DIRECT;
+#else
+ fprintf(stderr, "%s: warning: not setting O_DIRECT\n",
+ argv[0]);
+#endif
}
if (argc >= 5) {
}
printf("%s: %s on %s(objid "LPX64") for "LPU64"x%ld pages \n",
- argv[0], flags & O_DIRECT ? "directio" : "i/o",
+ argv[0],
+#ifdef O_DIRECT
+ flags & O_DIRECT ? "directio" : "i/o",
+#else
+ "i/o",
+#endif
argv[1], objid, count, pg_vec);
fd = open(argv[1], flags | O_LARGEFILE);
#!/usr/bin/env python
+# -*- mode: python; indent-tabs-mode: nil; -*-
+# vim:expandtab:shiftwidth=8:tabstop=8:
#
# Copyright (C) 2002-2003 Cluster File Systems, Inc.
# Authors: Robert Read <rread@clusterfs.com>
return ""
def sys_get_local_nid(net_type, wildcard, cluster_id):
- """Return the local nid."""
- local = ""
- if sys_get_elan_position_file() and net_type == 'elan':
- local = sys_get_local_address('elan', '*', cluster_id)
- else:
- local = sys_get_local_address(net_type, wildcard, cluster_id)
- return local
-
-def sys_get_local_address(net_type, wildcard, cluster_id):
"""Return the local address for the network type."""
local = ""
- if net_type in ('tcp','openib','iib','vib','ra'):
+
+ # don't need a real nid for config log - client will replace (bug5619)
+ if config.record:
+ local = "54321"
+ elif net_type in ('tcp','openib','iib','vib','ra'):
if ':' in wildcard:
iface, star = string.split(wildcard, ':')
local = if2addr(iface)
if len(self.hostaddr) == 0:
self.hostaddr.append(self.nid)
if '*' in self.hostaddr[0]:
- self.hostaddr[0] = sys_get_local_address(self.net_type, self.hostaddr[0], self.cluster_id)
+ self.hostaddr[0] = sys_get_local_nid(self.net_type, self.hostaddr[0], self.cluster_id)
if not self.hostaddr[0]:
panic("unable to set hostaddr for", self.net_type, self.hostaddr[0], self.cluster_id)
debug("hostaddr:", self.hostaddr[0])
client_name = node_db.getName()
for prof_uuid in node_db.get_refs('profile'):
prof_db = node_db.lookup(prof_uuid)
- # refactor this into a funtion to test "clientness"
- # of a node.
+ # refactor this into a function to test "clientness" of a node.
for ref_class, ref_uuid in prof_db.get_all_refs():
if ref_class in ('mountpoint','echoclient'):
+ thing = self.db.lookup(ref_uuid)
+ fs_uuid = thing.get_first_ref('filesystem')
+ if fs_uuid not in self.filesystem_uuids:
+ continue
+
debug("recording", client_name)
old_noexec = config.noexec
config.noexec = 0
if is_prepared(self.name):
self.cleanup()
try:
- srv = choose_local_server(self.get_servers())
- if srv:
+ srv_list = find_local_servers(self.get_servers())
+ for srv in srv_list:
lctl.connect(srv)
- else:
- routes = find_route(self.get_servers())
- if len(routes) == 0:
- panic("no route to", self.target_uuid)
- for (srv, r) in routes:
- lctl.add_route_host(r[0], srv.nid_uuid, r[1], r[3])
+
+ routes = find_route(self.get_servers())
+ for (srv, r) in routes:
+ lctl.add_route_host(r[0], srv.nid_uuid, r[1], r[3])
+ srv_list.append(srv)
+
+ if len(srv_list) == 0:
+ panic("no local servers and no route to", self.target_uuid)
except CommandError, e:
if not ignore_connect_failure:
raise e
- if srv:
+ if len(srv_list) > 0:
+ srv = srv_list[0]
if self.target_uuid in config.inactive and self.permits_inactive():
debug("%s inactive" % self.target_uuid)
inactive_p = "inactive"
lctl.newdev(self.module, self.name, self.uuid,
setup ="%s %s %s %s" % (self.target_uuid, srv.nid_uuid,
inactive_p, self.mgmt_name))
+ else:
+ panic("Unable to create OSC for ", self.target_uuid)
def cleanup(self):
if is_prepared(self.name):
Module.cleanup(self)
try:
- srv = choose_local_server(self.get_servers())
- if srv:
+ srv_list = find_local_servers(self.get_servers())
+ for srv in srv_list:
lctl.disconnect(srv)
- else:
- for (srv, r) in find_route(self.get_servers()):
- lctl.del_route_host(r[0], srv.nid_uuid, r[1], r[3])
+
+ routes = find_route(self.get_servers())
+ for (srv, r) in routes:
+ lctl.del_route_host(r[0], srv.nid_uuid, r[1], r[3])
except CommandError, e:
log(self.module_name, "cleanup failed: ", self.name)
e.dump()
debug("find_local_routes:", local_routes)
-def choose_local_server(srv_list):
+def find_local_servers(srv_list):
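+ """Return every server in srv_list whose net is a local cluster."""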
+ result = []
+
for srv in srv_list:
if local_cluster(srv.net_type, srv.cluster_id):
- return srv
+ result.append(srv)
+ return result
def local_cluster(net_type, cluster_id):
for cluster in local_clusters:
if not new_uuid:
raise Lustre.LconfError("doRecovery: no active target found for: " +
tgt_uuid)
- net = choose_local_server(get_ost_net(lustreDB, new_uuid))
- if not net:
+ srv_list = find_local_servers(get_ost_net(lustreDB, new_uuid))
+ if not srv_list:
raise Lustre.LconfError("Unable to find a connection to:" + new_uuid)
- log("Reconnecting", tgt_uuid, " to ", net.nid_uuid);
- try:
- oldnet = get_server_by_nid_uuid(lustreDB, nid_uuid)
- lustreDB.close()
- if oldnet:
- lctl.disconnect(oldnet)
- except CommandError, e:
- log("recover: disconnect", nid_uuid, "failed: ")
- e.dump()
+ for srv in srv_list:
+ log("Reconnecting", tgt_uuid, " to ", srv.nid_uuid);
+ try:
+ oldsrv = get_server_by_nid_uuid(lustreDB, nid_uuid)
+ lustreDB.close()
+ if oldsrv:
+ lctl.disconnect(oldsrv)
+ except CommandError, e:
+ log("recover: disconnect", nid_uuid, "failed: ")
+ e.dump()
- try:
- lctl.connect(net)
- except CommandError, e:
- log("recover: connect failed")
- e.dump()
+ try:
+ lctl.connect(srv)
+ except CommandError, e:
+ log("recover: connect failed")
+ e.dump()
- lctl.recover(client_uuid, net.nid_uuid)
+ lctl.recover(client_uuid, srv.nid_uuid)
def setupModulePath(cmd, portals_dir = PORTALS_DIR):
#include "parser.h"
#include "obdctl.h"
+unsigned int portal_subsystem_debug = 0;
+
/* all functions */
static int lfs_setstripe(int argc, char **argv);
static int lfs_find(int argc, char **argv);
if (errno != EEXIST && errno != EALREADY)
errmsg = strerror(errno);
- fprintf(stderr, "error on ioctl for '%s' (%d): %s\n",
- name, fd, errmsg);
+ fprintf(stderr, "error on ioctl "LPX64" for '%s' (%d): %s\n",
+ (__u64)LL_IOC_LOV_SETSTRIPE, name, fd, errmsg);
rc = -errno;
}
if (close(fd) < 0) {
(param->verbose || !param->obduuid));
break;
default:
- printf("unknown lmm_magic: 0x%08X\n", *(__u32 *)param->lum);
+ printf("unknown lmm_magic: %#x (expecting %#x)\n",
+ *(__u32 *)param->lum, LOV_USER_MAGIC_V1);
return;
}
}
*
****************************************************************************/
struct opt_map {
- const char *opt; /* option name */
- int skip; /* skip in mtab option string */
- int inv; /* true if flag value should be inverted */
- int mask; /* flag mask value */
+ const char *opt; /* option name */
+ int skip; /* skip in mtab option string */
+ int inv; /* true if flag value should be inverted */
+ int mask; /* flag mask value */
};
static const struct opt_map opt_map[] = {
return 1;
case SOCKNAL:
+ /* We need to do this before the mount is started if routing */
+ system("/sbin/modprobe ksocknal");
case TCPNAL:
case OPENIBNAL:
case IIBNAL:
NULL};
int i = 0;
+ /* We need to do this before the mount is started if routing */
+ system("/sbin/modprobe kqswnal");
do {
rc = get_local_elan_id(pfiles[i], buf);
} while (rc != 0 && pfiles[++i] != NULL);
int jt_lcfg_attach(int argc, char **argv)
{
- struct lustre_cfg lcfg;
+ struct lustre_cfg_bufs bufs;
+ struct lustre_cfg *lcfg;
int rc;
- LCFG_INIT(lcfg, LCFG_ATTACH, lcfg_devname);
-
if (argc != 2 && argc != 3 && argc != 4)
return CMD_HELP;
- lcfg.lcfg_inllen1 = strlen(argv[1]) + 1;
- lcfg.lcfg_inlbuf1 = argv[1];
+ lustre_cfg_bufs_reset(&bufs, NULL);
+
+ lustre_cfg_bufs_set_string(&bufs, 1, argv[1]);
if (argc >= 3) {
- lcfg.lcfg_dev_namelen = strlen(argv[2]) + 1;
- lcfg.lcfg_dev_name = argv[2];
+ lustre_cfg_bufs_set_string(&bufs, 0, argv[2]);
} else {
fprintf(stderr, "error: %s: LCFG_ATTACH requires a name\n",
jt_cmdname(argv[0]));
}
if (argc == 4) {
- lcfg.lcfg_inllen2 = strlen(argv[3]) + 1;
- lcfg.lcfg_inlbuf2 = argv[3];
+ lustre_cfg_bufs_set_string(&bufs, 2, argv[3]);
}
- rc = lcfg_ioctl(argv[0], OBD_DEV_ID, &lcfg);
+ lcfg = lustre_cfg_new(LCFG_ATTACH, &bufs);
+ rc = lcfg_ioctl(argv[0], OBD_DEV_ID, lcfg);
+ lustre_cfg_free(lcfg);
if (rc < 0) {
fprintf(stderr, "error: %s: LCFG_ATTACH %s\n",
jt_cmdname(argv[0]), strerror(rc = errno));
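
Each converted jt_* command now follows the same four-step pattern: reset the bufs descriptor, set the argument buffers, pack them into a freshly allocated lustre_cfg, and free it after the ioctl. A condensed sketch of that pattern using only helpers that appear in this patch; argc checks and error handling are elided, and the function name is made up for illustration:

    static int lcfg_attach_sketch(char *type, char *name, char *uuid)
    {
            struct lustre_cfg_bufs bufs;
            struct lustre_cfg *lcfg;
            int rc;

            lustre_cfg_bufs_reset(&bufs, name);          /* buf 0: device name */
            lustre_cfg_bufs_set_string(&bufs, 1, type);  /* buf 1: obd type */
            lustre_cfg_bufs_set_string(&bufs, 2, uuid);  /* buf 2: obd uuid */

            lcfg = lustre_cfg_new(LCFG_ATTACH, &bufs);   /* packs bufs + lens */
            rc = lcfg_ioctl("attach", OBD_DEV_ID, lcfg);
            lustre_cfg_free(lcfg);

            return rc;
    }
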
int jt_lcfg_setup(int argc, char **argv)
{
- struct lustre_cfg lcfg;
+ struct lustre_cfg_bufs bufs;
+ struct lustre_cfg *lcfg;
+ int i;
int rc;
if (lcfg_devname == NULL) {
return -EINVAL;
}
- LCFG_INIT(lcfg, LCFG_SETUP, lcfg_devname);
+ lustre_cfg_bufs_reset(&bufs, lcfg_devname);
if (argc > 5)
return CMD_HELP;
- if (argc > 1) {
- lcfg.lcfg_inllen1 = strlen(argv[1]) + 1;
- lcfg.lcfg_inlbuf1 = argv[1];
- }
- if (argc > 2) {
- lcfg.lcfg_inllen2 = strlen(argv[2]) + 1;
- lcfg.lcfg_inlbuf2 = argv[2];
- }
- if (argc > 3) {
- lcfg.lcfg_inllen3 = strlen(argv[3]) + 1;
- lcfg.lcfg_inlbuf3 = argv[3];
- }
- if (argc > 4) {
- lcfg.lcfg_inllen4 = strlen(argv[4]) + 1;
- lcfg.lcfg_inlbuf4 = argv[4];
+ for (i = 1; i < argc; i++) {
+ lustre_cfg_bufs_set_string(&bufs, i, argv[i]);
}
- rc = lcfg_ioctl(argv[0], OBD_DEV_ID, &lcfg);
+ lcfg = lustre_cfg_new(LCFG_SETUP, &bufs);
+ rc = lcfg_ioctl(argv[0], OBD_DEV_ID, lcfg);
+ lustre_cfg_free(lcfg);
if (rc < 0)
fprintf(stderr, "error: %s: %s\n", jt_cmdname(argv[0]),
strerror(rc = errno));
int jt_obd_detach(int argc, char **argv)
{
- struct lustre_cfg lcfg;
+ struct lustre_cfg_bufs bufs;
+ struct lustre_cfg *lcfg;
int rc;
if (lcfg_devname == NULL) {
return -EINVAL;
}
- LCFG_INIT(lcfg, LCFG_DETACH, lcfg_devname);
+ lustre_cfg_bufs_reset(&bufs, lcfg_devname);
if (argc != 1)
return CMD_HELP;
- rc = lcfg_ioctl(argv[0], OBD_DEV_ID, &lcfg);
+ lcfg = lustre_cfg_new(LCFG_DETACH, &bufs);
+ rc = lcfg_ioctl(argv[0], OBD_DEV_ID, lcfg);
+ lustre_cfg_free(lcfg);
if (rc < 0)
fprintf(stderr, "error: %s: %s\n", jt_cmdname(argv[0]),
strerror(rc = errno));
int jt_obd_cleanup(int argc, char **argv)
{
- struct lustre_cfg lcfg;
+ struct lustre_cfg_bufs bufs;
+ struct lustre_cfg *lcfg;
char force = 'F';
char failover = 'A';
char flags[3];
return -EINVAL;
}
- LCFG_INIT(lcfg, LCFG_CLEANUP, lcfg_devname);
+ lustre_cfg_bufs_reset(&bufs, lcfg_devname);
if (argc < 1 || argc > 3)
return CMD_HELP;
return CMD_HELP;
}
- lcfg.lcfg_inllen1 = flag_cnt;
- if (flag_cnt)
- lcfg.lcfg_inlbuf1 = flags;
+ if (flag_cnt) {
+ lustre_cfg_bufs_set(&bufs, 1, flags, flag_cnt);
+ }
- rc = lcfg_ioctl(argv[0], OBD_DEV_ID, &lcfg);
+ lcfg = lustre_cfg_new(LCFG_CLEANUP, &bufs);
+ rc = lcfg_ioctl(argv[0], OBD_DEV_ID, lcfg);
+ lustre_cfg_free(lcfg);
if (rc < 0)
fprintf(stderr, "error: %s: %s\n", jt_cmdname(argv[0]),
strerror(rc = errno));
{
char tmp[64];
int rc;
- struct lustre_cfg lcfg;
+ struct lustre_cfg_bufs bufs;
+ struct lustre_cfg *lcfg;
+
+ lustre_cfg_bufs_reset(&bufs, lcfg_devname);
+ if (uuid)
+ lustre_cfg_bufs_set_string(&bufs, 1, uuid);
- LCFG_INIT(lcfg, LCFG_ADD_UUID, lcfg_devname);
- lcfg.lcfg_nid = nid;
- lcfg.lcfg_inllen1 = strlen(uuid) + 1;
- lcfg.lcfg_inlbuf1 = uuid;
- lcfg.lcfg_nal = nal;
+ lcfg = lustre_cfg_new(LCFG_ADD_UUID, &bufs);
+ lcfg->lcfg_nid = nid;
+ lcfg->lcfg_nal = nal;
- rc = lcfg_ioctl(func, OBD_DEV_ID, &lcfg);
+#if 0
+ fprintf(stderr, "adding\tnal: %x\tnid: "LPX64"\tuuid: %s\n",
+ lcfg->lcfg_nal, lcfg->lcfg_nid, uuid);
+#endif
+ rc = lcfg_ioctl(func, OBD_DEV_ID, lcfg);
+ lustre_cfg_free(lcfg);
if (rc) {
fprintf(stderr, "IOC_PORTAL_ADD_UUID failed: %s\n",
strerror(errno));
int jt_lcfg_del_uuid(int argc, char **argv)
{
int rc;
- struct lustre_cfg lcfg;
+ struct lustre_cfg_bufs bufs;
+ struct lustre_cfg *lcfg;
if (argc != 2) {
fprintf(stderr, "usage: %s <uuid>\n", argv[0]);
return 0;
}
- LCFG_INIT(lcfg, LCFG_DEL_UUID, lcfg_devname);
-
+ lustre_cfg_bufs_reset(&bufs, lcfg_devname);
if (strcmp (argv[1], "_all_"))
- {
- lcfg.lcfg_inllen1 = strlen(argv[1]) + 1;
- lcfg.lcfg_inlbuf1 = argv[1];
- }
+ lustre_cfg_bufs_set_string(&bufs, 1, argv[1]);
- rc = lcfg_ioctl(argv[0], OBD_DEV_ID, &lcfg);
+ lcfg = lustre_cfg_new(LCFG_DEL_UUID, &bufs);
+ rc = lcfg_ioctl(argv[0], OBD_DEV_ID, lcfg);
+ lustre_cfg_free(lcfg);
if (rc) {
fprintf(stderr, "IOC_PORTAL_DEL_UUID failed: %s\n",
strerror(errno));
int jt_lcfg_lov_setup(int argc, char **argv)
{
- struct lustre_cfg lcfg;
+ struct lustre_cfg_bufs bufs;
+ struct lustre_cfg *lcfg;
struct lov_desc desc;
- struct obd_uuid *uuidarray, *ptr;
+ struct obd_uuid *uuidarray = NULL, *ptr;
int rc, i;
char *end;
- LCFG_INIT(lcfg, LCFG_SETUP, lcfg_devname);
-
if (argc <= 6)
return CMD_HELP;
memset(&desc, 0, sizeof(desc));
obd_str2uuid(&desc.ld_uuid, argv[1]);
desc.ld_tgt_count = argc - 6;
+ desc.ld_magic = LOV_DESC_MAGIC;
desc.ld_default_stripe_count = strtoul(argv[2], &end, 0);
if (*end) {
fprintf(stderr, "error: %s: bad default stripe count '%s'\n",
strcpy((char *)ptr, argv[i]);
}
- lcfg.lcfg_inllen1 = sizeof(desc);
- lcfg.lcfg_inlbuf1 = (char *)&desc;
- lcfg.lcfg_inllen2 = desc.ld_tgt_count * sizeof(*uuidarray);
- lcfg.lcfg_inlbuf2 = (char *)uuidarray;
+ lustre_cfg_bufs_reset(&bufs, lcfg_devname);
+ lustre_cfg_bufs_set(&bufs, 1, &desc, sizeof(desc));
+ lustre_cfg_bufs_set(&bufs, 2, uuidarray,
+ desc.ld_tgt_count * sizeof(*uuidarray));
- rc = lcfg_ioctl(argv[0], OBD_DEV_ID, &lcfg);
+ lcfg = lustre_cfg_new(LCFG_SETUP, &bufs);
+ rc = lcfg_ioctl(argv[0], OBD_DEV_ID, lcfg);
+ lustre_cfg_free(lcfg);
if (rc)
fprintf(stderr, "error: %s: ioctl error: %s\n",
jt_cmdname(argv[0]), strerror(rc = errno));
int jt_lcfg_mount_option(int argc, char **argv)
{
int rc;
- struct lustre_cfg lcfg;
-
- LCFG_INIT(lcfg, LCFG_MOUNTOPT, lcfg_devname);
+ struct lustre_cfg_bufs bufs;
+ struct lustre_cfg *lcfg;
+ int i;
if (argc < 3 || argc > 4)
return CMD_HELP;
- /* profile name */
- lcfg.lcfg_inllen1 = strlen(argv[1]) + 1;
- lcfg.lcfg_inlbuf1 = argv[1];
- /* osc name */
- lcfg.lcfg_inllen2 = strlen(argv[2]) + 1;
- lcfg.lcfg_inlbuf2 = argv[2];
- if (argc == 4) {
- /* mdc name */
- lcfg.lcfg_inllen3 = strlen(argv[3]) + 1;
- lcfg.lcfg_inlbuf3 = argv[3];
- }
- rc = lcfg_ioctl(argv[0], OBD_DEV_ID, &lcfg);
+ lustre_cfg_bufs_reset(&bufs, lcfg_devname);
+
+ for (i = 1; i < argc; i++)
+ lustre_cfg_bufs_set_string(&bufs, i, argv[i]);
+
+ lcfg = lustre_cfg_new(LCFG_MOUNTOPT, &bufs);
+ rc = lcfg_ioctl(argv[0], OBD_DEV_ID, lcfg);
+ lustre_cfg_free(lcfg);
if (rc < 0) {
fprintf(stderr, "error: %s: %s\n", jt_cmdname(argv[0]),
strerror(rc = errno));
}
-
return rc;
}
int jt_lcfg_del_mount_option(int argc, char **argv)
{
int rc;
- struct lustre_cfg lcfg;
-
- LCFG_INIT(lcfg, LCFG_DEL_MOUNTOPT, lcfg_devname);
+ struct lustre_cfg_bufs bufs;
+ struct lustre_cfg *lcfg;
if (argc != 2)
return CMD_HELP;
+ lustre_cfg_bufs_reset(&bufs, lcfg_devname);
+
/* profile name */
- lcfg.lcfg_inllen1 = strlen(argv[1]) + 1;
- lcfg.lcfg_inlbuf1 = argv[1];
+ lustre_cfg_bufs_set_string(&bufs, 1, argv[1]);
- rc = lcfg_ioctl(argv[0], OBD_DEV_ID, &lcfg);
+ lcfg = lustre_cfg_new(LCFG_DEL_MOUNTOPT, &bufs);
+ rc = lcfg_ioctl(argv[0], OBD_DEV_ID, lcfg);
+ lustre_cfg_free(lcfg);
if (rc < 0) {
fprintf(stderr, "error: %s: %s\n", jt_cmdname(argv[0]),
strerror(rc = errno));
}
-
return rc;
}
int jt_lcfg_set_timeout(int argc, char **argv)
{
int rc;
- struct lustre_cfg lcfg;
-
- LCFG_INIT(lcfg, LCFG_SET_TIMEOUT, lcfg_devname);
+ struct lustre_cfg_bufs bufs;
+ struct lustre_cfg *lcfg;
if (argc != 2)
return CMD_HELP;
- lcfg.lcfg_num = atoi(argv[1]);
+ lustre_cfg_bufs_reset(&bufs, lcfg_devname);
+ lcfg = lustre_cfg_new(LCFG_SET_TIMEOUT, &bufs);
+ lcfg->lcfg_num = atoi(argv[1]);
- rc = lcfg_ioctl(argv[0], OBD_DEV_ID, &lcfg);
+ rc = lcfg_ioctl(argv[0], OBD_DEV_ID, lcfg);
+ lustre_cfg_free(lcfg);
if (rc < 0) {
fprintf(stderr, "error: %s: %s\n", jt_cmdname(argv[0]),
strerror(rc = errno));
}
-
return rc;
}
int jt_lcfg_set_lustre_upcall(int argc, char **argv)
{
int rc;
- struct lustre_cfg lcfg;
-
- LCFG_INIT(lcfg, LCFG_SET_UPCALL, lcfg_devname);
+ struct lustre_cfg_bufs bufs;
+ struct lustre_cfg *lcfg;
if (argc != 2)
return CMD_HELP;
+ lustre_cfg_bufs_reset(&bufs, lcfg_devname);
+
/* profile name */
- lcfg.lcfg_inllen1 = strlen(argv[1]) + 1;
- lcfg.lcfg_inlbuf1 = argv[1];
+ lustre_cfg_bufs_set_string(&bufs, 1, argv[1]);
- rc = lcfg_ioctl(argv[0], OBD_DEV_ID, &lcfg);
+ lcfg = lustre_cfg_new(LCFG_SET_UPCALL, &bufs);
+ rc = lcfg_ioctl(argv[0], OBD_DEV_ID, lcfg);
+ lustre_cfg_free(lcfg);
if (rc < 0) {
fprintf(stderr, "error: %s: %s\n", jt_cmdname(argv[0]),
strerror(rc = errno));
}
-
return rc;
}
-
int lcfg_ioctl(char * func, int dev_id, struct lustre_cfg *lcfg)
{
int opc;
- char lcfg_rawbuf[8192];
- char * lcfg_buf= lcfg_rawbuf;
struct obd_ioctl_data data;
- int len;
int rc;
- memset(lcfg_buf, 0, sizeof(lcfg_rawbuf));
- if (lustre_cfg_pack(lcfg, &lcfg_buf, sizeof(lcfg_rawbuf), &len)) {
- fprintf(stderr, "error: %s: invalid ioctl\n",
- jt_cmdname(func));
- return -2;
- }
-
IOC_INIT(data);
data.ioc_type = LUSTRE_CFG_TYPE;
- data.ioc_plen1 = len;
- data.ioc_pbuf1 = lcfg_buf;
+ data.ioc_plen1 = lustre_cfg_len(lcfg->lcfg_bufcount,
+ lcfg->lcfg_buflens);
+ data.ioc_pbuf1 = (void *)lcfg;
IOC_PACK(func, data);
- if (jt_recording)
+ if (jt_recording) {
opc = OBD_IOC_DORECORD;
- else
+ } else {
opc = OBD_IOC_PROCESS_CFG;
-
+ }
rc = l_ioctl(dev_id, opc, buf);
- if (rc == 0)
- rc = lustre_cfg_unpack(lcfg, lcfg_buf, sizeof(lcfg_rawbuf));
return rc;
}
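
data.ioc_plen1 above is the full packed size of the message. A hedged sketch of what lustre_cfg_len() presumably computes, given the wire layout used throughout this patch: a fixed header, a buflens array of bufcount entries, then each buffer, with everything rounded to 8-byte boundaries so the kernel can walk the buffers in place. size_round() is shown inline for self-containment; struct lustre_cfg is the one declared in lustre_cfg.h in this patch:

    #include <stddef.h>
    #include <stdint.h>

    static int size_round(int val)
    {
            return (val + 7) & (~7);   /* round up to 8-byte boundary */
    }

    /* Assumed layout: header + buflens array, then the rounded buffers. */
    static int cfg_len_sketch(uint32_t bufcount, uint32_t *buflens)
    {
            uint32_t i;
            int len;

            len = size_round(offsetof(struct lustre_cfg,
                                      lcfg_buflens[bufcount]));
            for (i = 0; i < bufcount; i++)
                    len += size_round(buflens[i]);
            return size_round(len);
    }
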
CHECK_STRUCT(obdo);
CHECK_MEMBER(obdo, o_id);
CHECK_MEMBER(obdo, o_gr);
- CHECK_MEMBER(obdo, o_atime);
+ CHECK_MEMBER(obdo, o_size);
CHECK_MEMBER(obdo, o_mtime);
+ CHECK_MEMBER(obdo, o_atime);
CHECK_MEMBER(obdo, o_ctime);
- CHECK_MEMBER(obdo, o_size);
CHECK_MEMBER(obdo, o_blocks);
CHECK_MEMBER(obdo, o_grant);
CHECK_MEMBER(obdo, o_blksize);
void lustre_assert_wire_constants(void)
{
/* Wire protocol assertions generated by 'wirecheck'
- * running on Linux schnapps.adilger.int 2.4.22-l32 #4 Thu Jan 8 14:32:57 MST 2004 i686 i686
- * with gcc version 3.2.2 20030222 (Red Hat Linux 3.2.2-5) */
+ * running on Linux milano 2.4.21-20.EL_87k.6-b_release_1_3_3.200410121845smp #1 SMP Tue Oct
+ * with gcc version 3.3.3 20040412 (Red Hat Linux 3.3.3-7) */
/* Constants... */
(long long)offsetof(struct obdo, o_gr));
LASSERTF((int)sizeof(((struct obdo *)0)->o_gr) == 8, " found %lld\n",
(long long)(int)sizeof(((struct obdo *)0)->o_gr));
- LASSERTF(offsetof(struct obdo, o_atime) == 16, " found %lld\n",
- (long long)offsetof(struct obdo, o_atime));
- LASSERTF((int)sizeof(((struct obdo *)0)->o_atime) == 8, " found %lld\n",
- (long long)(int)sizeof(((struct obdo *)0)->o_atime));
+ LASSERTF(offsetof(struct obdo, o_size) == 16, " found %lld\n",
+ (long long)offsetof(struct obdo, o_size));
+ LASSERTF((int)sizeof(((struct obdo *)0)->o_size) == 8, " found %lld\n",
+ (long long)(int)sizeof(((struct obdo *)0)->o_size));
LASSERTF(offsetof(struct obdo, o_mtime) == 24, " found %lld\n",
(long long)offsetof(struct obdo, o_mtime));
LASSERTF((int)sizeof(((struct obdo *)0)->o_mtime) == 8, " found %lld\n",
(long long)(int)sizeof(((struct obdo *)0)->o_mtime));
- LASSERTF(offsetof(struct obdo, o_ctime) == 32, " found %lld\n",
+ LASSERTF(offsetof(struct obdo, o_atime) == 32, " found %lld\n",
+ (long long)offsetof(struct obdo, o_atime));
+ LASSERTF((int)sizeof(((struct obdo *)0)->o_atime) == 8, " found %lld\n",
+ (long long)(int)sizeof(((struct obdo *)0)->o_atime));
+ LASSERTF(offsetof(struct obdo, o_ctime) == 40, " found %lld\n",
(long long)offsetof(struct obdo, o_ctime));
LASSERTF((int)sizeof(((struct obdo *)0)->o_ctime) == 8, " found %lld\n",
(long long)(int)sizeof(((struct obdo *)0)->o_ctime));
- LASSERTF(offsetof(struct obdo, o_size) == 40, " found %lld\n",
- (long long)offsetof(struct obdo, o_size));
- LASSERTF((int)sizeof(((struct obdo *)0)->o_size) == 8, " found %lld\n",
- (long long)(int)sizeof(((struct obdo *)0)->o_size));
LASSERTF(offsetof(struct obdo, o_blocks) == 48, " found %lld\n",
(long long)offsetof(struct obdo, o_blocks));
LASSERTF((int)sizeof(((struct obdo *)0)->o_blocks) == 8, " found %lld\n",