From: jacob
Date: Thu, 24 Mar 2005 22:50:56 +0000 (+0000)
Subject: Land b1_4_bgl on b1_4.
X-Git-Tag: v1_8_0_110~486^7~113
X-Git-Url: https://git.whamcloud.com/gitweb?a=commitdiff_plain;h=f24168f10c68991c15f74ae2b770181c6a9b15f9;p=fs%2Flustre-release.git

Land b1_4_bgl on b1_4.

Portals b1_4_bgl will land on HEAD after 1.4.2 is released, so for now
children of b1_4 need to use the b1_4_bgl branch of portals.

Existing filesystems will need to --write_conf on the MDS as the log
format has incompatible changes.
---

diff --git a/lustre/ChangeLog b/lustre/ChangeLog
index 91d78e4c..40edd50 100644
--- a/lustre/ChangeLog
+++ b/lustre/ChangeLog
@@ -3,8 +3,14 @@
        * bug fixes
        - fix deadlock in obdfilter statistics vs. object create (5811)
        - fix for HPUX NFS client breakage when NFS exporting Lustre (5781)
+       - mdc_enqueue does not need max_mds_easize request buffer on send (5707)
+       - swab llog records of type '0' so we get proper header size/idx (5861)
+       - send llog cancel req to DLM cancel portal instead of cb portal (5515)
        * miscellania
        - by default create 1 inode per 4kB space on MDS, per 16kB on OSTs
+       - allow --write-conf on an MDS with different nettype than client (5619)
+       - don't write config llogs to MDS for mounts not from that MDS (5617)
+       - lconf should create multiple TCP connections from a client (5201)
        - init scripts are now turned off by default; run chkconfig --on
          lustre and chkconfig --on lustrefs to use them
diff --git a/lustre/autoconf/lustre-version.ac b/lustre/autoconf/lustre-version.ac
index 68e6b6d..c3e7cb9 100644
--- a/lustre/autoconf/lustre-version.ac
+++ b/lustre/autoconf/lustre-version.ac
@@ -1 +1 @@
-m4_define([LUSTRE_VERSION],[1.4.1])
+m4_define([LUSTRE_VERSION],[1.4.1.1])
diff --git a/lustre/include/liblustre.h b/lustre/include/liblustre.h
index 8e130d0..de915f8 100644
--- a/lustre/include/liblustre.h
+++ b/lustre/include/liblustre.h
@@ -34,8 +34,11 @@
 #ifdef HAVE_SYS_USER_H
 # include <sys/user.h>
 #endif
-
-#include "ioctl.h"
+#ifdef HAVE_SYS_IOCTL_H
+# include <sys/ioctl.h>
+#else
+# include "ioctl.h"
+#endif /* !HAVE_SYS_IOCTL_H */
 
 #include
 #include
diff --git a/lustre/include/linux/lustre_cfg.h b/lustre/include/linux/lustre_cfg.h
index 4f230d2..20c28f6 100644
--- a/lustre/include/linux/lustre_cfg.h
+++ b/lustre/include/linux/lustre_cfg.h
@@ -23,7 +23,15 @@
 #ifndef _LUSTRE_CFG_H
 #define _LUSTRE_CFG_H
 
-#define LUSTRE_CFG_VERSION 0x00010001
+/*
+ * 1cf6
+ * lcfG
+ */
+#define LUSTRE_CFG_VERSION 0x1cf60001
+#define LUSTRE_CFG_MAX_BUFCOUNT 8
+
+#define LCFG_HDR_SIZE(count) \
+    size_round(offsetof (struct lustre_cfg, lcfg_buflens[(count)]))
 
 enum lcfg_command_type {
         LCFG_ATTACH         = 0x00cf001,
@@ -38,6 +46,12 @@ enum lcfg_command_type {
         LCFG_SET_UPCALL     = 0x00cf010,
 };
 
+struct lustre_cfg_bufs {
+        void    *lcfg_buf[LUSTRE_CFG_MAX_BUFCOUNT];
+        uint32_t lcfg_buflen[LUSTRE_CFG_MAX_BUFCOUNT];
+        uint32_t lcfg_bufcount;
+};
+
 struct lustre_cfg {
         uint32_t lcfg_version;
         uint32_t lcfg_command;
@@ -47,198 +61,171 @@ struct lustre_cfg {
         uint64_t lcfg_nid;
         uint32_t lcfg_nal;
 
-        /* inline buffers for various arguments */
-        uint32_t lcfg_dev_namelen;
-        char    *lcfg_dev_name;
-        uint32_t lcfg_inllen1;
-        char    *lcfg_inlbuf1;
-        uint32_t lcfg_inllen2;
-        char    *lcfg_inlbuf2;
-        uint32_t lcfg_inllen3;
-        char    *lcfg_inlbuf3;
-        uint32_t lcfg_inllen4;
-        char    *lcfg_inlbuf4;
-
-        char     lcfg_bulk[0];
-
+        uint32_t lcfg_bufcount;
+        uint32_t lcfg_buflens[0];
 };
 
-#define LCFG_INIT(l, cmd, name)                                 \
-do {                                                            \
-        memset(&(l), 0, sizeof(l));                             \
-        (l).lcfg_version = LUSTRE_CFG_VERSION;                  \
-        (l).lcfg_command = (cmd);                               \
-        if (name) {                                             \
-                (l).lcfg_dev_namelen =
strlen(name) + 1; \ - (l).lcfg_dev_name = name; \ - } \ - \ -} while (0) - -#ifndef __KERNEL__ -static inline int lustre_cfg_packlen(struct lustre_cfg *lcfg) +#define LUSTRE_CFG_BUFLEN(lcfg, idx) \ + ((lcfg)->lcfg_bufcount <= (idx) \ + ? 0 \ + : (lcfg)->lcfg_buflens[(idx)]) + +static inline void lustre_cfg_bufs_set(struct lustre_cfg_bufs *bufs, + uint32_t index, + void *buf, + uint32_t buflen) { - int len = size_round(sizeof(struct lustre_cfg)); - len += size_round(lcfg->lcfg_dev_namelen); - len += size_round(lcfg->lcfg_inllen1); - len += size_round(lcfg->lcfg_inllen2); - len += size_round(lcfg->lcfg_inllen3); - len += size_round(lcfg->lcfg_inllen4); - return size_round(len); + if (index >= LUSTRE_CFG_MAX_BUFCOUNT) + return; + if (bufs == NULL) + return; + + if (bufs->lcfg_bufcount <= index) + bufs->lcfg_bufcount = index + 1; + + bufs->lcfg_buf[index] = buf; + bufs->lcfg_buflen[index] = buflen; } -static inline int lustre_cfg_pack(struct lustre_cfg *data, char **pbuf, - int max, int *plen) +static inline void lustre_cfg_bufs_set_string(struct lustre_cfg_bufs *bufs, + uint32_t index, + char *str) { - char *ptr; - struct lustre_cfg *overlay; - int len; + lustre_cfg_bufs_set(bufs, index, str, str ? strlen(str) + 1 : 0); +} - len = lustre_cfg_packlen(data); +static inline void lustre_cfg_bufs_reset(struct lustre_cfg_bufs *bufs, char *name) +{ + memset((bufs), 0, sizeof(*bufs)); + if (name) + lustre_cfg_bufs_set_string(bufs, 0, name); +} - data->lcfg_version = LUSTRE_CFG_VERSION; +static inline void *lustre_cfg_buf(struct lustre_cfg *lcfg, int index) +{ + int i; + int offset; + int bufcount; + LASSERT (lcfg != NULL); + LASSERT (index >= 0); + + bufcount = lcfg->lcfg_bufcount; + if (index >= bufcount) + return NULL; + + offset = LCFG_HDR_SIZE(lcfg->lcfg_bufcount); + for (i = 0; i < index; i++) + offset += size_round(lcfg->lcfg_buflens[i]); + return (char *)lcfg + offset; +} - if (*pbuf && len > max) - return 1; - if (*pbuf == NULL) { - *pbuf = malloc(len); +static inline void lustre_cfg_bufs_init(struct lustre_cfg_bufs *bufs, + struct lustre_cfg *lcfg) +{ + int i; + bufs->lcfg_bufcount = lcfg->lcfg_bufcount; + for (i = 0; i < bufs->lcfg_bufcount; i++) { + bufs->lcfg_buflen[i] = lcfg->lcfg_buflens[i]; + bufs->lcfg_buf[i] = lustre_cfg_buf(lcfg, i); } - if (!*pbuf) - return 1; - overlay = (struct lustre_cfg *)*pbuf; - memcpy(*pbuf, data, sizeof(*data)); - - ptr = overlay->lcfg_bulk; - if (data->lcfg_dev_name) - LOGL(data->lcfg_dev_name, data->lcfg_dev_namelen, ptr); - if (data->lcfg_inlbuf1) - LOGL(data->lcfg_inlbuf1, data->lcfg_inllen1, ptr); - if (data->lcfg_inlbuf2) - LOGL(data->lcfg_inlbuf2, data->lcfg_inllen2, ptr); - if (data->lcfg_inlbuf3) - LOGL(data->lcfg_inlbuf3, data->lcfg_inllen3, ptr); - if (data->lcfg_inlbuf4) - LOGL(data->lcfg_inlbuf4, data->lcfg_inllen4, ptr); - - *plen = len; - - return 0; } -static inline int lustre_cfg_unpack(struct lustre_cfg *data, char *pbuf, - int max) +static inline char *lustre_cfg_string(struct lustre_cfg *lcfg, int index) { - char *ptr; - struct lustre_cfg *overlay; - - if (!pbuf) - return 1; - overlay = (struct lustre_cfg *)pbuf; - - /* Preserve the caller's buffer pointers */ - overlay->lcfg_dev_name = data->lcfg_dev_name; - overlay->lcfg_inlbuf1 = data->lcfg_inlbuf1; - overlay->lcfg_inlbuf2 = data->lcfg_inlbuf2; - overlay->lcfg_inlbuf3 = data->lcfg_inlbuf3; - overlay->lcfg_inlbuf4 = data->lcfg_inlbuf4; - - memcpy(data, pbuf, sizeof(*data)); - - ptr = overlay->lcfg_bulk; - if (data->lcfg_dev_name) - LOGU(data->lcfg_dev_name, data->lcfg_dev_namelen, 
ptr); - if (data->lcfg_inlbuf1) - LOGU(data->lcfg_inlbuf1, data->lcfg_inllen1, ptr); - if (data->lcfg_inlbuf2) - LOGU(data->lcfg_inlbuf2, data->lcfg_inllen2, ptr); - if (data->lcfg_inlbuf3) - LOGU(data->lcfg_inlbuf3, data->lcfg_inllen3, ptr); - if (data->lcfg_inlbuf4) - LOGU(data->lcfg_inlbuf4, data->lcfg_inllen4, ptr); - - return 0; -} -#endif + char *s; -#include + if (!lcfg->lcfg_buflens[index]) + return NULL; + + s = lustre_cfg_buf(lcfg, index); + if (!s) + return NULL; -static inline int lustre_cfg_getdata(char **buf, int len, void *arg, int kernel) + /* make sure it's NULL terminated, even if this kills a char + * of data + */ + s[lcfg->lcfg_buflens[index] - 1] = '\0'; + return s; +} + +static inline int lustre_cfg_len(uint32_t bufcount, uint32_t *buflens) { - struct lustre_cfg *lcfg; - int err; - int offset = 0; + int i; + int len; ENTRY; - if (len > OBD_MAX_IOCTL_BUFFER) { - CERROR("User buffer len %d exceeds %d max buffer\n", - len, OBD_MAX_IOCTL_BUFFER); - return -EINVAL; - } - if (len < sizeof(struct lustre_cfg)) { - CERROR("OBD: user buffer too small for lustre_cfg\n"); - return -EINVAL; - } + len = LCFG_HDR_SIZE(bufcount); + for (i = 0; i < bufcount; i++) + len += size_round(buflens[i]); - /* XXX allocate this more intelligently, using kmalloc when - * appropriate */ - OBD_ALLOC(*buf, len); - if (*buf == NULL) { - CERROR("Cannot allocate control buffer of len %d\n", len); - RETURN(-EINVAL); - } + RETURN(size_round(len)); +} - if (kernel) { - memcpy(*buf, (void *)arg, len); - } else { - err = copy_from_user(*buf, (void *)arg, len); - if (err) - RETURN(err); - } - lcfg = (struct lustre_cfg *)*buf; +#include - if (lcfg->lcfg_version != LUSTRE_CFG_VERSION) { - CERROR("Version mismatch kernel: %#x application: %#x\n", - LUSTRE_CFG_VERSION, lcfg->lcfg_version); - return -EINVAL; - } +static inline struct lustre_cfg *lustre_cfg_new(int cmd, + struct lustre_cfg_bufs *bufs) +{ + struct lustre_cfg *lcfg; + char *ptr; + int i; + ENTRY; - if (lcfg->lcfg_dev_name) { - lcfg->lcfg_dev_name = &lcfg->lcfg_bulk[0]; - offset += size_round(lcfg->lcfg_dev_namelen); - } + OBD_ALLOC(lcfg, lustre_cfg_len(bufs->lcfg_bufcount, + bufs->lcfg_buflen)); + if (!lcfg) + RETURN(lcfg); - if (lcfg->lcfg_inllen1) { - lcfg->lcfg_inlbuf1 = &lcfg->lcfg_bulk[0] + offset; - offset += size_round(lcfg->lcfg_inllen1); - } + lcfg->lcfg_version = LUSTRE_CFG_VERSION; + lcfg->lcfg_command = cmd; + lcfg->lcfg_bufcount = bufs->lcfg_bufcount; - if (lcfg->lcfg_inllen2) { - lcfg->lcfg_inlbuf2 = &lcfg->lcfg_bulk[0] + offset; - offset += size_round(lcfg->lcfg_inllen2); + ptr = (char *)lcfg + LCFG_HDR_SIZE(lcfg->lcfg_bufcount); + for (i = 0; i < lcfg->lcfg_bufcount; i++) { + lcfg->lcfg_buflens[i] = bufs->lcfg_buflen[i]; + LOGL((char *)bufs->lcfg_buf[i], bufs->lcfg_buflen[i], ptr); } + RETURN(lcfg); +} - if (lcfg->lcfg_inllen3) { - lcfg->lcfg_inlbuf3 = &lcfg->lcfg_bulk[0] + offset; - offset += size_round(lcfg->lcfg_inllen3); - } +static inline void lustre_cfg_free(struct lustre_cfg *lcfg) +{ + int len; - if (lcfg->lcfg_inllen4) { - lcfg->lcfg_inlbuf4 = &lcfg->lcfg_bulk[0] + offset; - } + len = lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens); + OBD_FREE(lcfg, len); EXIT; - return 0; + return; } -static inline void lustre_cfg_freedata(char *buf, int len) +static inline int lustre_cfg_sanity_check(void *buf, int len) { + struct lustre_cfg *lcfg = (struct lustre_cfg *)buf; ENTRY; + if (!lcfg) + RETURN(-EINVAL); - OBD_FREE(buf, len); - EXIT; - return; + /* check that the first bits of the struct are valid */ + if (len < 
LCFG_HDR_SIZE(0)) + RETURN(-EINVAL); + + if (lcfg->lcfg_version != LUSTRE_CFG_VERSION) + RETURN(-EINVAL); + if (lcfg->lcfg_bufcount >= LUSTRE_CFG_MAX_BUFCOUNT) + RETURN(-EINVAL); + + /* check that the buflens are valid */ + if (len < LCFG_HDR_SIZE(lcfg->lcfg_bufcount)) + RETURN(-EINVAL); + + /* make sure all the pointers point inside the data */ + if (len < lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens)) + RETURN(-EINVAL); + + RETURN(0); } /* Passed by mount */ diff --git a/lustre/include/linux/lustre_idl.h b/lustre/include/linux/lustre_idl.h index 58f7b92..a409d97 100644 --- a/lustre/include/linux/lustre_idl.h +++ b/lustre/include/linux/lustre_idl.h @@ -244,10 +244,10 @@ typedef uint32_t obd_count; struct obdo { obd_id o_id; obd_gr o_gr; - obd_time o_atime; + obd_size o_size; obd_time o_mtime; + obd_time o_atime; obd_time o_ctime; - obd_size o_size; obd_blocks o_blocks; /* brw: cli sent cached bytes */ obd_size o_grant; obd_blksize o_blksize; /* optimal IO blocksize */ @@ -623,6 +623,8 @@ extern void lustre_swab_mds_rec_rename (struct mds_rec_rename *rn); * array of UUIDs returned by the MDS. With the current * protocol, this will limit the max number of OSTs per LOV */ +#define LOV_DESC_MAGIC 0xB0CCDE5C + struct lov_desc { __u32 ld_tgt_count; /* how many OBD's */ __u32 ld_active_tgt_count; /* how many active */ @@ -633,6 +635,8 @@ struct lov_desc { struct obd_uuid ld_uuid; }; +#define ld_magic ld_active_tgt_count /* for swabbing from llogs */ + extern void lustre_swab_lov_desc (struct lov_desc *ld); /* @@ -827,17 +831,30 @@ struct llog_catid { /* Log data record types - there is no specific reason that these need to * be related to the RPC opcodes, but no reason not to (may be handy later?) */ +#define LLOG_OP_MAGIC 0x10600000 +#define LLOG_OP_MASK 0xfff00000 + typedef enum { - OST_SZ_REC = 0x10600000 | (OST_SAN_WRITE << 8), - OST_RAID1_REC = 0x10600000 | ((OST_SAN_WRITE + 1) << 8), - MDS_UNLINK_REC = 0x10610000 | (MDS_REINT << 8) | REINT_UNLINK, - OBD_CFG_REC = 0x10620000, - PTL_CFG_REC = 0x10630000, - LLOG_GEN_REC = 0x10640000, - LLOG_HDR_MAGIC = 0x10645539, - LLOG_LOGID_MAGIC = 0x1064553b, + LLOG_PAD_MAGIC = LLOG_OP_MAGIC | 0, + OST_SZ_REC = LLOG_OP_MAGIC | (OST_SAN_WRITE << 8), + OST_RAID1_REC = LLOG_OP_MAGIC | ((OST_SAN_WRITE + 1) << 8), + MDS_UNLINK_REC = LLOG_OP_MAGIC | 0x10000 | (MDS_REINT << 8) | REINT_UNLINK, + OBD_CFG_REC = LLOG_OP_MAGIC | 0x20000, + PTL_CFG_REC = LLOG_OP_MAGIC | 0x30000, + LLOG_GEN_REC = LLOG_OP_MAGIC | 0x40000, + LLOG_HDR_MAGIC = LLOG_OP_MAGIC | 0x45539, + LLOG_LOGID_MAGIC = LLOG_OP_MAGIC | 0x4553b, } llog_op_type; +/* + * for now, continue to support old pad records which have 0 for their + * type but still need to be swabbed for their length + */ +#define LLOG_REC_HDR_NEEDS_SWABBING(r) \ + (((r)->lrh_type & __swab32(LLOG_OP_MASK)) == \ + __swab32(LLOG_OP_MAGIC) || \ + (((r)->lrh_type == 0) && ((r)->lrh_len > LLOG_CHUNK_SIZE))) + /* Log record header - stored in little endian order. * Each record must start with this struct, end with a llog_rec_tail, * and be a multiple of 256 bits in size. 
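
The LLOG_REC_HDR_NEEDS_SWABBING test a few hunks above decides endianness from the record itself: opcode bits that match the byte-swapped LLOG_OP_MAGIC mean the log was written on a host of the other endianness, and the type-0 clause keeps the old pad records swabbable by their length alone (the "swab llog records of type '0'" ChangeLog entry, bug 5861). A standalone sketch of the same test follows; it is illustrative only, with llog_rec_hdr trimmed to the fields the test reads and LLOG_CHUNK_SIZE assumed to be the in-tree value of 8192:

    /*
     * Standalone illustration of the LLOG_REC_HDR_NEEDS_SWABBING logic;
     * not the in-tree code.  A record whose opcode bits equal the
     * byte-swapped magic came from the other endianness; a legacy pad
     * record (type 0) is caught by an implausibly large length.
     */
    #include <stdint.h>
    #include <stdio.h>

    #define LLOG_OP_MAGIC   0x10600000U
    #define LLOG_OP_MASK    0xfff00000U
    #define LLOG_CHUNK_SIZE 8192U   /* assumption: in-tree value */

    static uint32_t swab32(uint32_t v)      /* stand-in for __swab32() */
    {
            return (v >> 24) | ((v >> 8) & 0xff00U) |
                   ((v << 8) & 0xff0000U) | (v << 24);
    }

    struct llog_rec_hdr {                   /* trimmed to the tested fields */
            uint32_t lrh_len;
            uint32_t lrh_index;
            uint32_t lrh_type;
    };

    static int needs_swabbing(const struct llog_rec_hdr *r)
    {
            return (r->lrh_type & swab32(LLOG_OP_MASK)) == swab32(LLOG_OP_MAGIC) ||
                   (r->lrh_type == 0 && r->lrh_len > LLOG_CHUNK_SIZE);
    }

    int main(void)
    {
            /* an OBD_CFG_REC as it appears when read on an opposite-endian host */
            struct llog_rec_hdr foreign = { swab32(64), swab32(1),
                                            swab32(LLOG_OP_MAGIC | 0x20000) };
            /* old-style pad record: type 0, length stored byte-swapped */
            struct llog_rec_hdr pad     = { swab32(256), 0, 0 };

            printf("foreign rec: %d, pad rec: %d\n",
                   needs_swabbing(&foreign), needs_swabbing(&pad));
            return 0;
    }

Both probes print 1: the foreign record through the masked-magic clause, the pad record through the length clause, so a reader never has to guess the writer's endianness out of band.
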
@@ -970,9 +987,21 @@ struct llogd_conn_body { __u32 lgdc_ctxt_idx; } __attribute__((packed)); +extern void lustre_swab_lov_user_md(struct lov_user_md *lum); +extern void lustre_swab_lov_user_md_objects(struct lov_user_md *lum); + +/* llog_swab.c */ extern void lustre_swab_llogd_body (struct llogd_body *d); extern void lustre_swab_llog_hdr (struct llog_log_hdr *h); extern void lustre_swab_llogd_conn_body (struct llogd_conn_body *d); +extern void lustre_swab_llog_rec(struct llog_rec_hdr *rec, + struct llog_rec_tail *tail); + +struct portals_cfg; +extern void lustre_swab_portals_cfg(struct portals_cfg *pcfg); + +struct lustre_cfg; +extern void lustre_swab_lustre_cfg(struct lustre_cfg *lcfg); static inline struct ll_fid *obdo_fid(struct obdo *oa) { diff --git a/lustre/include/linux/obd.h b/lustre/include/linux/obd.h index c6146bd..e1f6afe 100644 --- a/lustre/include/linux/obd.h +++ b/lustre/include/linux/obd.h @@ -237,7 +237,8 @@ struct client_obd { struct semaphore cl_sem; int cl_conn_count; /* max_mds_easize is purely a performance thing so we don't have to - * call obd_size_wiremd() all the time. */ + * call obd_size_diskmd() all the time. */ + int cl_default_mds_easize; int cl_max_mds_easize; int cl_max_mds_cookiesize; kdev_t cl_sandev; diff --git a/lustre/include/linux/obd_support.h b/lustre/include/linux/obd_support.h index 5ced170..a492902 100644 --- a/lustre/include/linux/obd_support.h +++ b/lustre/include/linux/obd_support.h @@ -319,7 +319,12 @@ do { \ # define OBD_GFP_MASK GFP_NOFS #endif +#ifdef __KERNEL__ #define OBD_ALLOC(ptr, size) OBD_ALLOC_GFP(ptr, size, OBD_GFP_MASK) +#else +#define OBD_ALLOC(ptr, size) (ptr = malloc(size)) +#endif + #define OBD_ALLOC_WAIT(ptr, size) OBD_ALLOC_GFP(ptr, size, GFP_KERNEL) #ifdef __arch_um__ @@ -357,6 +362,7 @@ do { \ #define POISON_PAGE(page, val) do { } while (0) #endif +#ifdef __KERNEL__ #define OBD_FREE(ptr, size) \ do { \ LASSERT(ptr); \ @@ -367,6 +373,9 @@ do { \ kfree(ptr); \ (ptr) = (void *)0xdeadbeef; \ } while (0) +#else +#define OBD_FREE(ptr, size) ((void)(size), free((ptr))) +#endif #ifdef __arch_um__ # define OBD_VFREE(ptr, size) OBD_FREE(ptr, size) diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-i686-smp.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-i686-smp.config new file mode 100644 index 0000000..f0a8a4b --- /dev/null +++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-i686-smp.config @@ -0,0 +1,2844 @@ +# +# Automatically generated make config: don't edit +# +CONFIG_X86=y +CONFIG_MMU=y +CONFIG_UID16=y +CONFIG_GENERIC_ISA_DMA=y + +# +# Code maturity level options +# +CONFIG_EXPERIMENTAL=y +CONFIG_CLEAN_COMPILE=y +# CONFIG_STANDALONE is not set + +# +# General setup +# +CONFIG_SWAP=y +CONFIG_SYSVIPC=y +CONFIG_BSD_PROCESS_ACCT=y +CONFIG_SYSCTL=y +CONFIG_LOG_BUF_SHIFT=17 +CONFIG_HOTPLUG=y +CONFIG_EVLOG=y +# CONFIG_EVLOG_FWPRINTK is not set +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y +# CONFIG_EMBEDDED is not set + +# +# Class Based Kernel Resource Management +# +CONFIG_CKRM=y +CONFIG_RCFS_FS=m +CONFIG_CKRM_TYPE_TASKCLASS=y +CONFIG_CKRM_RES_NUMTASKS=m +CONFIG_CKRM_TYPE_SOCKETCLASS=y +CONFIG_CKRM_RBCE=m +CONFIG_CKRM_CRBCE=m +CONFIG_DELAY_ACCT=y +CONFIG_KALLSYMS=y +CONFIG_FUTEX=y +CONFIG_EPOLL=y +CONFIG_IOSCHED_NOOP=y +CONFIG_IOSCHED_AS=y +CONFIG_IOSCHED_DEADLINE=y +CONFIG_IOSCHED_CFQ=y +# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set + +# +# Loadable module support +# +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y +CONFIG_MODULE_FORCE_UNLOAD=y +CONFIG_OBSOLETE_MODPARM=y 
+CONFIG_MODVERSIONS=y +CONFIG_KMOD=y +CONFIG_STOP_MACHINE=y + +# +# Processor type and features +# +# CONFIG_X86_PC is not set +# CONFIG_X86_ELAN is not set +# CONFIG_X86_VOYAGER is not set +# CONFIG_X86_NUMAQ is not set +# CONFIG_X86_SUMMIT is not set +# CONFIG_X86_BIGSMP is not set +# CONFIG_X86_VISWS is not set +CONFIG_X86_GENERICARCH=y +# CONFIG_X86_ES7000 is not set +CONFIG_X86_CYCLONE_TIMER=y +# CONFIG_M386 is not set +# CONFIG_M486 is not set +# CONFIG_M586 is not set +# CONFIG_M586TSC is not set +# CONFIG_M586MMX is not set +# CONFIG_M686 is not set +CONFIG_MPENTIUMII=y +# CONFIG_MPENTIUMIII is not set +# CONFIG_MPENTIUMM is not set +# CONFIG_MPENTIUM4 is not set +# CONFIG_MK6 is not set +# CONFIG_MK7 is not set +# CONFIG_MK8 is not set +# CONFIG_MCRUSOE is not set +# CONFIG_MWINCHIPC6 is not set +# CONFIG_MWINCHIP2 is not set +# CONFIG_MWINCHIP3D is not set +# CONFIG_MCYRIXIII is not set +# CONFIG_MVIAC3_2 is not set +CONFIG_X86_GENERIC=y +CONFIG_X86_CMPXCHG=y +CONFIG_X86_XADD=y +CONFIG_X86_L1_CACHE_SHIFT=7 +CONFIG_RWSEM_XCHGADD_ALGORITHM=y +CONFIG_X86_WP_WORKS_OK=y +CONFIG_X86_INVLPG=y +CONFIG_X86_BSWAP=y +CONFIG_X86_POPAD_OK=y +CONFIG_X86_GOOD_APIC=y +CONFIG_X86_INTEL_USERCOPY=y +CONFIG_X86_USE_PPRO_CHECKSUM=y +# CONFIG_HPET_TIMER is not set +# CONFIG_HPET_EMULATE_RTC is not set +CONFIG_SMP=y +CONFIG_NR_CPUS=128 +CONFIG_SCHED_SMT=y +# CONFIG_PREEMPT is not set +CONFIG_X86_LOCAL_APIC=y +CONFIG_X86_IO_APIC=y +CONFIG_X86_TSC=y +CONFIG_X86_MCE=y +# CONFIG_X86_MCE_NONFATAL is not set +CONFIG_X86_MCE_P4THERMAL=y +CONFIG_TOSHIBA=m +CONFIG_I8K=m +CONFIG_MICROCODE=m +CONFIG_X86_MSR=m +CONFIG_X86_CPUID=m + +# +# Firmware Drivers +# +CONFIG_EDD=m +# CONFIG_NOHIGHMEM is not set +# CONFIG_HIGHMEM4G is not set +CONFIG_HIGHMEM64G=y +CONFIG_HIGHMEM=y +CONFIG_X86_PAE=y +# CONFIG_NUMA is not set +CONFIG_HIGHPTE=y +# CONFIG_MATH_EMULATION is not set +CONFIG_MTRR=y +CONFIG_EFI=y +CONFIG_IRQBALANCE=y +CONFIG_HAVE_DEC_LOCK=y +CONFIG_BOOT_IOREMAP=y +CONFIG_REGPARM=y + +# +# Special options +# +CONFIG_PROC_MM=y + +# +# Power management options (ACPI, APM) +# +CONFIG_PM=y +# CONFIG_SOFTWARE_SUSPEND is not set +# CONFIG_PM_DISK is not set + +# +# ACPI (Advanced Configuration and Power Interface) Support +# +CONFIG_ACPI=y +CONFIG_ACPI_BOOT=y +CONFIG_ACPI_INTERPRETER=y +CONFIG_ACPI_SLEEP=y +CONFIG_ACPI_SLEEP_PROC_FS=y +CONFIG_ACPI_AC=m +CONFIG_ACPI_BATTERY=m +CONFIG_ACPI_BUTTON=m +CONFIG_ACPI_FAN=m +CONFIG_ACPI_PROCESSOR=m +CONFIG_ACPI_THERMAL=m +# CONFIG_ACPI_ASUS is not set +CONFIG_ACPI_TOSHIBA=m +# CONFIG_ACPI_DEBUG is not set +CONFIG_ACPI_BUS=y +CONFIG_ACPI_EC=y +CONFIG_ACPI_POWER=y +CONFIG_ACPI_PCI=y +CONFIG_ACPI_SYSTEM=y +CONFIG_X86_PM_TIMER=y +CONFIG_ACPI_INITRD=y + +# +# APM (Advanced Power Management) BIOS Support +# +CONFIG_APM=y +# CONFIG_APM_IGNORE_USER_SUSPEND is not set +CONFIG_APM_DO_ENABLE=y +# CONFIG_APM_CPU_IDLE is not set +CONFIG_APM_DISPLAY_BLANK=y +# CONFIG_APM_RTC_IS_GMT is not set +CONFIG_APM_ALLOW_INTS=y +# CONFIG_APM_REAL_MODE_POWER_OFF is not set + +# +# CPU Frequency scaling +# +CONFIG_CPU_FREQ=y +CONFIG_CPU_FREQ_PROC_INTF=y +CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y +# CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set +CONFIG_CPU_FREQ_GOV_PERFORMANCE=y +CONFIG_CPU_FREQ_GOV_POWERSAVE=m +CONFIG_CPU_FREQ_GOV_USERSPACE=m +CONFIG_CPU_FREQ_GOV_ONDEMAND=m +# CONFIG_CPU_FREQ_24_API is not set +CONFIG_CPU_FREQ_TABLE=m + +# +# CPUFreq processor drivers +# +CONFIG_X86_ACPI_CPUFREQ=m +# CONFIG_X86_ACPI_CPUFREQ_PROC_INTF is not set +CONFIG_X86_POWERNOW_K6=m +CONFIG_X86_POWERNOW_K7=m 
+CONFIG_X86_POWERNOW_K8=m +CONFIG_X86_POWERNOW_K8_ACPI=y +CONFIG_X86_GX_SUSPMOD=m +CONFIG_X86_SPEEDSTEP_CENTRINO=m +CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE=y +# CONFIG_X86_SPEEDSTEP_CENTRINO_ACPI is not set +CONFIG_X86_SPEEDSTEP_ICH=m +CONFIG_X86_SPEEDSTEP_SMI=m +CONFIG_X86_P4_CLOCKMOD=m +CONFIG_X86_SPEEDSTEP_LIB=m +CONFIG_X86_LONGRUN=m +CONFIG_X86_LONGHAUL=m + +# +# Bus options (PCI, PCMCIA, EISA, MCA, ISA) +# +CONFIG_PCI=y +# CONFIG_PCI_GOBIOS is not set +# CONFIG_PCI_GOMMCONFIG is not set +# CONFIG_PCI_GODIRECT is not set +CONFIG_PCI_GOANY=y +CONFIG_PCI_BIOS=y +CONFIG_PCI_DIRECT=y +CONFIG_PCI_MMCONFIG=y +# CONFIG_PCI_USE_VECTOR is not set +# CONFIG_PCI_LEGACY_PROC is not set +# CONFIG_PCI_NAMES is not set +CONFIG_ISA=y +# CONFIG_EISA is not set +# CONFIG_MCA is not set +CONFIG_SCx200=m + +# +# PCMCIA/CardBus support +# +CONFIG_PCMCIA=m +# CONFIG_PCMCIA_DEBUG is not set +CONFIG_YENTA=m +CONFIG_CARDBUS=y +CONFIG_I82092=m +CONFIG_I82365=m +CONFIG_TCIC=m +CONFIG_PCMCIA_PROBE=y + +# +# PCI Hotplug Support +# +CONFIG_HOTPLUG_PCI=m +CONFIG_HOTPLUG_PCI_FAKE=m +CONFIG_HOTPLUG_PCI_COMPAQ=m +CONFIG_HOTPLUG_PCI_COMPAQ_NVRAM=y +CONFIG_HOTPLUG_PCI_IBM=m +CONFIG_HOTPLUG_PCI_AMD=m +CONFIG_HOTPLUG_PCI_ACPI=m +CONFIG_HOTPLUG_PCI_CPCI=y +CONFIG_HOTPLUG_PCI_CPCI_ZT5550=m +CONFIG_HOTPLUG_PCI_CPCI_GENERIC=m +CONFIG_HOTPLUG_PCI_PCIE=m +# CONFIG_HOTPLUG_PCI_PCIE_POLL_EVENT_MODE is not set +# CONFIG_HOTPLUG_PCI_SHPC is not set + +# +# Executable file formats +# +CONFIG_BINFMT_ELF=y +CONFIG_BINFMT_AOUT=m +CONFIG_BINFMT_MISC=m + +# +# Device Drivers +# + +# +# Generic Driver Options +# +CONFIG_FW_LOADER=m +# CONFIG_DEBUG_DRIVER is not set + +# +# Memory Technology Devices (MTD) +# +CONFIG_MTD=m +# CONFIG_MTD_DEBUG is not set +CONFIG_MTD_PARTITIONS=m +CONFIG_MTD_CONCAT=m +CONFIG_MTD_REDBOOT_PARTS=m +CONFIG_MTD_CMDLINE_PARTS=m + +# +# User Modules And Translation Layers +# +CONFIG_MTD_CHAR=m +CONFIG_MTD_BLOCK=m +# CONFIG_MTD_BLOCK_RO is not set +# CONFIG_FTL is not set +# CONFIG_NFTL is not set +# CONFIG_INFTL is not set + +# +# RAM/ROM/Flash chip drivers +# +CONFIG_MTD_CFI=m +CONFIG_MTD_JEDECPROBE=m +CONFIG_MTD_GEN_PROBE=m +CONFIG_MTD_CFI_ADV_OPTIONS=y +CONFIG_MTD_CFI_NOSWAP=y +# CONFIG_MTD_CFI_BE_BYTE_SWAP is not set +# CONFIG_MTD_CFI_LE_BYTE_SWAP is not set +# CONFIG_MTD_CFI_GEOMETRY is not set +CONFIG_MTD_CFI_INTELEXT=m +CONFIG_MTD_CFI_AMDSTD=m +CONFIG_MTD_CFI_STAA=m +# CONFIG_MTD_RAM is not set +# CONFIG_MTD_ROM is not set +CONFIG_MTD_ABSENT=m +CONFIG_MTD_OBSOLETE_CHIPS=y +CONFIG_MTD_AMDSTD=m +CONFIG_MTD_SHARP=m +CONFIG_MTD_JEDEC=m + +# +# Mapping drivers for chip access +# +CONFIG_MTD_COMPLEX_MAPPINGS=y +CONFIG_MTD_PHYSMAP=m +CONFIG_MTD_PHYSMAP_START=0x8000000 +CONFIG_MTD_PHYSMAP_LEN=0x4000000 +CONFIG_MTD_PHYSMAP_BUSWIDTH=2 +CONFIG_MTD_PNC2000=m +CONFIG_MTD_SC520CDP=m +CONFIG_MTD_NETSC520=m +CONFIG_MTD_SBC_GXX=m +CONFIG_MTD_ELAN_104NC=m +CONFIG_MTD_OCTAGON=m +CONFIG_MTD_VMAX=m +CONFIG_MTD_SCx200_DOCFLASH=m +CONFIG_MTD_AMD76XROM=m +CONFIG_MTD_ICH2ROM=m +CONFIG_MTD_SCB2_FLASH=m +CONFIG_MTD_NETtel=m +CONFIG_MTD_DILNETPC=m +CONFIG_MTD_DILNETPC_BOOTSIZE=0x80000 +CONFIG_MTD_L440GX=m +CONFIG_MTD_PCI=m + +# +# Self-contained MTD device drivers +# +CONFIG_MTD_PMC551=m +CONFIG_MTD_PMC551_BUGFIX=y +# CONFIG_MTD_PMC551_DEBUG is not set +CONFIG_MTD_SLRAM=m +CONFIG_MTD_MTDRAM=m +CONFIG_MTDRAM_TOTAL_SIZE=4096 +CONFIG_MTDRAM_ERASE_SIZE=128 +CONFIG_MTD_BLKMTD=m + +# +# Disk-On-Chip Device Drivers +# +CONFIG_MTD_DOC2000=m +CONFIG_MTD_DOC2001=m +CONFIG_MTD_DOC2001PLUS=m +CONFIG_MTD_DOCPROBE=m +CONFIG_MTD_DOCPROBE_ADVANCED=y 
+CONFIG_MTD_DOCPROBE_ADDRESS=0x0000 +CONFIG_MTD_DOCPROBE_HIGH=y +CONFIG_MTD_DOCPROBE_55AA=y + +# +# NAND Flash Device Drivers +# +CONFIG_MTD_NAND=m +# CONFIG_MTD_NAND_VERIFY_WRITE is not set +CONFIG_MTD_NAND_IDS=m + +# +# Parallel port support +# +CONFIG_PARPORT=m +CONFIG_PARPORT_PC=m +CONFIG_PARPORT_PC_CML1=m +CONFIG_PARPORT_SERIAL=m +CONFIG_PARPORT_PC_FIFO=y +CONFIG_PARPORT_PC_SUPERIO=y +CONFIG_PARPORT_PC_PCMCIA=m +CONFIG_PARPORT_OTHER=y +CONFIG_PARPORT_1284=y + +# +# Plug and Play support +# +CONFIG_PNP=y +# CONFIG_PNP_DEBUG is not set + +# +# Protocols +# +CONFIG_ISAPNP=y +CONFIG_PNPBIOS=y +CONFIG_PNPBIOS_PROC_FS=y + +# +# Block devices +# +CONFIG_BLK_DEV_FD=y +CONFIG_BLK_DEV_XD=m +CONFIG_PARIDE=m +CONFIG_PARIDE_PARPORT=m + +# +# Parallel IDE high-level drivers +# +CONFIG_PARIDE_PD=m +CONFIG_PARIDE_PCD=m +CONFIG_PARIDE_PF=m +CONFIG_PARIDE_PT=m +CONFIG_PARIDE_PG=m + +# +# Parallel IDE protocol modules +# +CONFIG_PARIDE_ATEN=m +CONFIG_PARIDE_BPCK=m +CONFIG_PARIDE_BPCK6=m +CONFIG_PARIDE_COMM=m +CONFIG_PARIDE_DSTR=m +CONFIG_PARIDE_FIT2=m +CONFIG_PARIDE_FIT3=m +CONFIG_PARIDE_EPAT=m +CONFIG_PARIDE_EPATC8=y +CONFIG_PARIDE_EPIA=m +CONFIG_PARIDE_FRIQ=m +CONFIG_PARIDE_FRPW=m +CONFIG_PARIDE_KBIC=m +CONFIG_PARIDE_KTTI=m +CONFIG_PARIDE_ON20=m +CONFIG_PARIDE_ON26=m +CONFIG_BLK_CPQ_DA=m +CONFIG_BLK_CPQ_CISS_DA=m +CONFIG_CISS_SCSI_TAPE=y +CONFIG_BLK_CPQ_CISS_DA_NEW=m +CONFIG_BLK_DEV_DAC960=m +CONFIG_BLK_DEV_UMEM=m +CONFIG_BLK_DEV_LOOP=y +CONFIG_BLK_DEV_CRYPTOLOOP=m +CONFIG_BLK_DEV_NBD=m +CONFIG_BLK_DEV_CARMEL=m +CONFIG_BLK_DEV_RAM=y +CONFIG_BLK_DEV_RAM_SIZE=64000 +CONFIG_BLK_DEV_INITRD=y +CONFIG_LBD=y +CONFIG_CIPHER_TWOFISH=m + +# +# ATA/ATAPI/MFM/RLL support +# +CONFIG_IDE=y +CONFIG_BLK_DEV_IDE=y + +# +# Please see Documentation/ide.txt for help/info on IDE drives +# +# CONFIG_BLK_DEV_HD_IDE is not set +CONFIG_BLK_DEV_IDEDISK=y +CONFIG_IDEDISK_MULTI_MODE=y +CONFIG_IDEDISK_STROKE=y +CONFIG_BLK_DEV_IDECS=m +CONFIG_BLK_DEV_IDECD=m +CONFIG_BLK_DEV_IDETAPE=m +CONFIG_BLK_DEV_IDEFLOPPY=y +CONFIG_BLK_DEV_IDESCSI=m +# CONFIG_IDE_TASK_IOCTL is not set +# CONFIG_IDE_TASKFILE_IO is not set + +# +# IDE chipset support/bugfixes +# +CONFIG_IDE_GENERIC=y +CONFIG_BLK_DEV_CMD640=y +CONFIG_BLK_DEV_CMD640_ENHANCED=y +CONFIG_BLK_DEV_IDEPNP=y +CONFIG_BLK_DEV_IDEPCI=y +CONFIG_IDEPCI_SHARE_IRQ=y +CONFIG_BLK_DEV_OFFBOARD=y +CONFIG_BLK_DEV_GENERIC=y +CONFIG_BLK_DEV_OPTI621=y +CONFIG_BLK_DEV_RZ1000=y +CONFIG_BLK_DEV_IDEDMA_PCI=y +# CONFIG_BLK_DEV_IDEDMA_FORCED is not set +CONFIG_IDEDMA_PCI_AUTO=y +CONFIG_IDEDMA_ONLYDISK=y +CONFIG_BLK_DEV_ADMA=y +CONFIG_BLK_DEV_AEC62XX=y +CONFIG_BLK_DEV_ALI15X3=y +# CONFIG_WDC_ALI15X3 is not set +CONFIG_BLK_DEV_AMD74XX=y +CONFIG_BLK_DEV_ATIIXP=y +CONFIG_BLK_DEV_CMD64X=y +CONFIG_BLK_DEV_TRIFLEX=y +CONFIG_BLK_DEV_CY82C693=y +CONFIG_BLK_DEV_CS5520=m +CONFIG_BLK_DEV_CS5530=m +CONFIG_BLK_DEV_HPT34X=y +CONFIG_HPT34X_AUTODMA=y +CONFIG_BLK_DEV_HPT366=y +CONFIG_BLK_DEV_SC1200=y +CONFIG_BLK_DEV_PIIX=y +CONFIG_BLK_DEV_NS87415=y +CONFIG_BLK_DEV_PDC202XX_OLD=y +CONFIG_PDC202XX_BURST=y +CONFIG_BLK_DEV_PDC202XX_NEW=y +CONFIG_PDC202XX_FORCE=y +CONFIG_BLK_DEV_SVWKS=y +CONFIG_BLK_DEV_SIIMAGE=y +CONFIG_BLK_DEV_SIS5513=y +CONFIG_BLK_DEV_SLC90E66=y +CONFIG_BLK_DEV_TRM290=y +CONFIG_BLK_DEV_VIA82CXXX=y +CONFIG_IDE_CHIPSETS=y + +# +# Note: most of these also require special kernel boot parameters +# +CONFIG_BLK_DEV_4DRIVES=y +CONFIG_BLK_DEV_ALI14XX=y +CONFIG_BLK_DEV_DTC2278=y +CONFIG_BLK_DEV_HT6560B=y +# CONFIG_BLK_DEV_PDC4030 is not set +CONFIG_BLK_DEV_QD65XX=y +CONFIG_BLK_DEV_UMC8672=y +CONFIG_BLK_DEV_IDEDMA=y +# 
CONFIG_IDEDMA_IVB is not set +CONFIG_IDEDMA_AUTO=y +# CONFIG_BLK_DEV_HD is not set + +# +# SCSI device support +# +CONFIG_SCSI=m +CONFIG_SCSI_PROC_FS=y + +# +# SCSI support type (disk, tape, CD-ROM) +# +CONFIG_BLK_DEV_SD=m +CONFIG_CHR_DEV_ST=m +CONFIG_CHR_DEV_OSST=m +CONFIG_BLK_DEV_SR=m +# CONFIG_BLK_DEV_SR_VENDOR is not set +CONFIG_CHR_DEV_SG=m +CONFIG_CHR_DEV_SCH=m + +# +# Some SCSI devices (e.g. CD jukebox) support multiple LUNs +# +CONFIG_SCSI_MULTI_LUN=y +CONFIG_SCSI_CONSTANTS=y +CONFIG_SCSI_LOGGING=y + +# +# SCSI Transport Attributes +# +CONFIG_SCSI_SPI_ATTRS=m +CONFIG_SCSI_FC_ATTRS=m + +# +# SCSI low-level drivers +# +CONFIG_BLK_DEV_3W_XXXX_RAID=m +CONFIG_SCSI_7000FASST=m +CONFIG_SCSI_ACARD=m +CONFIG_SCSI_AHA152X=m +CONFIG_SCSI_AHA1542=m +CONFIG_SCSI_AACRAID=m +CONFIG_SCSI_AIC7XXX=m +CONFIG_AIC7XXX_CMDS_PER_DEVICE=32 +CONFIG_AIC7XXX_RESET_DELAY_MS=5000 +# CONFIG_AIC7XXX_BUILD_FIRMWARE is not set +# CONFIG_AIC7XXX_DEBUG_ENABLE is not set +CONFIG_AIC7XXX_DEBUG_MASK=0 +CONFIG_AIC7XXX_REG_PRETTY_PRINT=y +CONFIG_SCSI_AIC7XXX_OLD=m +CONFIG_SCSI_AIC79XX=m +CONFIG_AIC79XX_CMDS_PER_DEVICE=32 +CONFIG_AIC79XX_RESET_DELAY_MS=15000 +# CONFIG_AIC79XX_BUILD_FIRMWARE is not set +# CONFIG_AIC79XX_ENABLE_RD_STRM is not set +# CONFIG_AIC79XX_DEBUG_ENABLE is not set +CONFIG_AIC79XX_DEBUG_MASK=0 +CONFIG_AIC79XX_REG_PRETTY_PRINT=y +# CONFIG_SCSI_AIC79XX_NEW is not set +CONFIG_SCSI_ADVANSYS=m +CONFIG_SCSI_IN2000=m +CONFIG_MEGARAID_NEWGEN=y +CONFIG_MEGARAID_MM=m +CONFIG_MEGARAID_MAILBOX=m +CONFIG_MEGARAID_LEGACY=m +CONFIG_SCSI_SATA=y +CONFIG_SCSI_SATA_SVW=m +CONFIG_SCSI_ATA_PIIX=m +CONFIG_SCSI_SATA_PROMISE=m +CONFIG_SCSI_SATA_SIL=m +CONFIG_SCSI_SATA_SIS=m +CONFIG_SCSI_SATA_VIA=m +CONFIG_SCSI_SATA_VITESSE=m +CONFIG_SCSI_BUSLOGIC=m +# CONFIG_SCSI_OMIT_FLASHPOINT is not set +# CONFIG_SCSI_CPQFCTS is not set +CONFIG_SCSI_DMX3191D=m +CONFIG_SCSI_DTC3280=m +CONFIG_SCSI_EATA=m +CONFIG_SCSI_EATA_TAGGED_QUEUE=y +CONFIG_SCSI_EATA_LINKED_COMMANDS=y +CONFIG_SCSI_EATA_MAX_TAGS=16 +CONFIG_SCSI_EATA_PIO=m +CONFIG_SCSI_FUTURE_DOMAIN=m +CONFIG_SCSI_GDTH=m +CONFIG_SCSI_GENERIC_NCR5380=m +CONFIG_SCSI_GENERIC_NCR5380_MMIO=m +CONFIG_SCSI_GENERIC_NCR53C400=y +CONFIG_SCSI_IPS=m +CONFIG_SCSI_INIA100=m +CONFIG_SCSI_PPA=m +CONFIG_SCSI_IMM=m +# CONFIG_SCSI_IZIP_EPP16 is not set +# CONFIG_SCSI_IZIP_SLOW_CTR is not set +CONFIG_SCSI_NCR53C406A=m +CONFIG_SCSI_SYM53C8XX_2=m +CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=1 +CONFIG_SCSI_SYM53C8XX_DEFAULT_TAGS=16 +CONFIG_SCSI_SYM53C8XX_MAX_TAGS=64 +# CONFIG_SCSI_SYM53C8XX_IOMAPPED is not set +CONFIG_SCSI_LPFC=m +CONFIG_SCSI_IPR=m +CONFIG_SCSI_IPR_TRACE=y +CONFIG_SCSI_IPR_DUMP=y +CONFIG_SCSI_PAS16=m +CONFIG_SCSI_PSI240I=m +CONFIG_SCSI_QLOGIC_FAS=m +CONFIG_SCSI_QLOGIC_ISP=m +CONFIG_SCSI_QLOGIC_FC=m +CONFIG_SCSI_QLOGIC_FC_FIRMWARE=y +CONFIG_SCSI_QLOGIC_1280=m +CONFIG_SCSI_QLA2XXX=m +CONFIG_SCSI_QLA21XX=m +CONFIG_SCSI_QLA22XX=m +CONFIG_SCSI_QLA2300=m +CONFIG_SCSI_QLA2322=m +CONFIG_SCSI_QLA6312=m +CONFIG_SCSI_QLA6322=m +CONFIG_SCSI_QLA2XXX_FAILOVER=y +CONFIG_SCSI_QLA4XXX=m +CONFIG_SCSI_QLA4XXX_FAILOVER=y +CONFIG_SCSI_SYM53C416=m +CONFIG_SCSI_DC395x=m +CONFIG_SCSI_DC390T=m +CONFIG_SCSI_T128=m +CONFIG_SCSI_U14_34F=m +CONFIG_SCSI_U14_34F_TAGGED_QUEUE=y +CONFIG_SCSI_U14_34F_LINKED_COMMANDS=y +CONFIG_SCSI_U14_34F_MAX_TAGS=8 +CONFIG_SCSI_ULTRASTOR=m +CONFIG_SCSI_NSP32=m +CONFIG_SCSI_DEBUG=m + +# +# PCMCIA SCSI adapter support +# +CONFIG_PCMCIA_AHA152X=m +CONFIG_PCMCIA_FDOMAIN=m +CONFIG_PCMCIA_NINJA_SCSI=m +CONFIG_PCMCIA_QLOGIC=m + +# +# Old CD-ROM drivers (not SCSI, not IDE) +# +CONFIG_CD_NO_IDESCSI=y 
+CONFIG_AZTCD=m +CONFIG_GSCD=m +CONFIG_MCD=m +CONFIG_MCD_IRQ=11 +CONFIG_MCD_BASE=0x300 +CONFIG_OPTCD=m +CONFIG_SJCD=m +CONFIG_ISP16_CDI=m +CONFIG_CDU535=m + +# +# Multi-device support (RAID and LVM) +# +CONFIG_MD=y +CONFIG_BLK_DEV_MD=y +CONFIG_MD_LINEAR=m +CONFIG_MD_RAID0=m +CONFIG_MD_RAID1=m +CONFIG_MD_RAID5=m +CONFIG_MD_RAID6=m +CONFIG_MD_MULTIPATH=m +CONFIG_BLK_DEV_DM=m +CONFIG_DM_CRYPT=m +CONFIG_DM_MULTIPATH=m +CONFIG_DM_SNAPSHOT=m +CONFIG_DM_MIRROR=m +CONFIG_DM_ZERO=m +CONFIG_DM_FLAKEY=m +CONFIG_BLK_DEV_DM_BBR=m + +# +# Fusion MPT device support +# +CONFIG_FUSION=m +CONFIG_FUSION_MAX_SGE=40 +CONFIG_FUSION_CTL=m +CONFIG_FUSION_LAN=m + +# +# IEEE 1394 (FireWire) support +# +CONFIG_IEEE1394=m + +# +# Subsystem Options +# +# CONFIG_IEEE1394_VERBOSEDEBUG is not set +# CONFIG_IEEE1394_OUI_DB is not set +CONFIG_IEEE1394_EXTRA_CONFIG_ROMS=y +CONFIG_IEEE1394_CONFIG_ROM_IP1394=y + +# +# Device Drivers +# +CONFIG_IEEE1394_PCILYNX=m +CONFIG_IEEE1394_OHCI1394=m + +# +# Protocol Drivers +# +CONFIG_IEEE1394_VIDEO1394=m +CONFIG_IEEE1394_SBP2=m +# CONFIG_IEEE1394_SBP2_PHYS_DMA is not set +CONFIG_IEEE1394_ETH1394=m +CONFIG_IEEE1394_DV1394=m +CONFIG_IEEE1394_RAWIO=m +CONFIG_IEEE1394_CMP=m +CONFIG_IEEE1394_AMDTP=m + +# +# I2O device support +# +CONFIG_I2O=m +CONFIG_I2O_CONFIG=m +CONFIG_I2O_BLOCK=m +CONFIG_I2O_SCSI=m +CONFIG_I2O_PROC=m + +# +# Networking support +# +CONFIG_NET=y + +# +# Networking options +# +CONFIG_PACKET=m +CONFIG_PACKET_MMAP=y +CONFIG_NETLINK_DEV=m +CONFIG_UNIX=y +CONFIG_NET_KEY=m +CONFIG_INET=y +CONFIG_IP_MULTICAST=y +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_IP_MULTIPLE_TABLES=y +CONFIG_IP_ROUTE_FWMARK=y +CONFIG_IP_ROUTE_NAT=y +CONFIG_IP_ROUTE_MULTIPATH=y +CONFIG_IP_ROUTE_TOS=y +CONFIG_IP_ROUTE_VERBOSE=y +# CONFIG_IP_PNP is not set +CONFIG_NET_IPIP=m +CONFIG_NET_IPGRE=m +CONFIG_NET_IPGRE_BROADCAST=y +CONFIG_IP_MROUTE=y +CONFIG_IP_PIMSM_V1=y +CONFIG_IP_PIMSM_V2=y +# CONFIG_ARPD is not set +CONFIG_SYN_COOKIES=y +CONFIG_INET_AH=m +CONFIG_INET_ESP=m +CONFIG_INET_IPCOMP=m +# CONFIG_ACCEPT_QUEUES is not set + +# +# IP: Virtual Server Configuration +# +CONFIG_IP_VS=m +# CONFIG_IP_VS_DEBUG is not set +CONFIG_IP_VS_TAB_BITS=12 + +# +# IPVS transport protocol load balancing support +# +CONFIG_IP_VS_PROTO_TCP=y +CONFIG_IP_VS_PROTO_UDP=y +CONFIG_IP_VS_PROTO_ESP=y +CONFIG_IP_VS_PROTO_AH=y + +# +# IPVS scheduler +# +CONFIG_IP_VS_RR=m +CONFIG_IP_VS_WRR=m +CONFIG_IP_VS_LC=m +CONFIG_IP_VS_WLC=m +CONFIG_IP_VS_LBLC=m +CONFIG_IP_VS_LBLCR=m +CONFIG_IP_VS_DH=m +CONFIG_IP_VS_SH=m +CONFIG_IP_VS_SED=m +CONFIG_IP_VS_NQ=m + +# +# IPVS application helper +# +CONFIG_IP_VS_FTP=m +CONFIG_IPV6=m +CONFIG_IPV6_SUBTREES=y +CONFIG_IPV6_PRIVACY=y +CONFIG_IPV6_NDISC_NEW=y +CONFIG_INET6_AH=m +CONFIG_INET6_ESP=m +CONFIG_INET6_IPCOMP=m +CONFIG_IPV6_TUNNEL=m + +# +# MOBILE IPv6 (EXPERIMENTAL) +# +CONFIG_IPV6_MOBILITY=m +CONFIG_IPV6_MOBILITY_MN=m +CONFIG_IPV6_MOBILITY_HA=m +# CONFIG_IPV6_MOBILITY_DEBUG is not set +CONFIG_DECNET=m +CONFIG_DECNET_SIOCGIFCONF=y +# CONFIG_DECNET_ROUTER is not set +CONFIG_BRIDGE=m +CONFIG_NETFILTER=y +# CONFIG_NETFILTER_DEBUG is not set +CONFIG_BRIDGE_NETFILTER=y + +# +# IP: Netfilter Configuration +# +CONFIG_IP_NF_CONNTRACK=m +CONFIG_IP_NF_FTP=m +CONFIG_IP_NF_IRC=m +CONFIG_IP_NF_TFTP=m +CONFIG_IP_NF_AMANDA=m +CONFIG_IP_NF_QUEUE=m +CONFIG_IP_NF_IPTABLES=m +CONFIG_IP_NF_MATCH_LIMIT=m +CONFIG_IP_NF_MATCH_IPRANGE=m +CONFIG_IP_NF_MATCH_MAC=m +CONFIG_IP_NF_MATCH_PKTTYPE=m +CONFIG_IP_NF_MATCH_POLICY=m +CONFIG_IP_NF_MATCH_MARK=m +CONFIG_IP_NF_MATCH_MULTIPORT=m +CONFIG_IP_NF_MATCH_TOS=m 
+CONFIG_IP_NF_MATCH_RECENT=m +CONFIG_IP_NF_MATCH_ECN=m +CONFIG_IP_NF_MATCH_DSCP=m +CONFIG_IP_NF_MATCH_AH_ESP=m +CONFIG_IP_NF_MATCH_LENGTH=m +CONFIG_IP_NF_MATCH_TTL=m +CONFIG_IP_NF_MATCH_TCPMSS=m +CONFIG_IP_NF_MATCH_HELPER=m +CONFIG_IP_NF_MATCH_STATE=m +CONFIG_IP_NF_MATCH_CONNTRACK=m +CONFIG_IP_NF_MATCH_OWNER=m +CONFIG_IP_NF_MATCH_PHYSDEV=m +CONFIG_IP_NF_FILTER=m +CONFIG_IP_NF_TARGET_REJECT=m +CONFIG_IP_NF_NAT=m +CONFIG_IP_NF_NAT_NEEDED=y +CONFIG_IP_NF_TARGET_MASQUERADE=m +CONFIG_IP_NF_TARGET_REDIRECT=m +CONFIG_IP_NF_TARGET_NETMAP=m +CONFIG_IP_NF_TARGET_SAME=m +# CONFIG_IP_NF_NAT_LOCAL is not set +CONFIG_IP_NF_NAT_SNMP_BASIC=m +CONFIG_IP_NF_NAT_IRC=m +CONFIG_IP_NF_NAT_FTP=m +CONFIG_IP_NF_NAT_TFTP=m +CONFIG_IP_NF_NAT_AMANDA=m +CONFIG_IP_NF_MANGLE=m +CONFIG_IP_NF_TARGET_TOS=m +CONFIG_IP_NF_TARGET_ECN=m +CONFIG_IP_NF_TARGET_DSCP=m +CONFIG_IP_NF_TARGET_MARK=m +CONFIG_IP_NF_TARGET_CLASSIFY=m +CONFIG_IP_NF_TARGET_LOG=m +CONFIG_IP_NF_TARGET_ULOG=m +CONFIG_IP_NF_TARGET_TCPMSS=m +CONFIG_IP_NF_ARPTABLES=m +CONFIG_IP_NF_ARPFILTER=m +CONFIG_IP_NF_ARP_MANGLE=m +CONFIG_IP_NF_COMPAT_IPCHAINS=m +CONFIG_IP_NF_COMPAT_IPFWADM=m +CONFIG_IP_NF_CONNTRACK_MARK=y +CONFIG_IP_NF_TARGET_CONNMARK=m +CONFIG_IP_NF_MATCH_CONNMARK=m +CONFIG_IP_NF_TARGET_CLUSTERIP=m + +# +# IPv6: Netfilter Configuration +# +CONFIG_IP6_NF_FTP=m +CONFIG_IP6_NF_QUEUE=m +CONFIG_IP6_NF_IPTABLES=m +CONFIG_IP6_NF_MATCH_LIMIT=m +CONFIG_IP6_NF_MATCH_MAC=m +CONFIG_IP6_NF_MATCH_RT=m +CONFIG_IP6_NF_MATCH_OPTS=m +CONFIG_IP6_NF_MATCH_FRAG=m +CONFIG_IP6_NF_MATCH_HL=m +CONFIG_IP6_NF_MATCH_MULTIPORT=m +CONFIG_IP6_NF_MATCH_OWNER=m +CONFIG_IP6_NF_MATCH_MARK=m +CONFIG_IP6_NF_MATCH_IPV6HEADER=m +CONFIG_IP6_NF_MATCH_AHESP=m +CONFIG_IP6_NF_MATCH_LENGTH=m +CONFIG_IP6_NF_MATCH_EUI64=m +CONFIG_IP6_NF_CONNTRACK=m +CONFIG_IP6_NF_MATCH_STATE=m +CONFIG_IP6_NF_FILTER=m +CONFIG_IP6_NF_TARGET_LOG=m +CONFIG_IP6_NF_TARGET_REJECT=m +CONFIG_IP6_NF_MANGLE=m +CONFIG_IP6_NF_TARGET_MARK=m + +# +# DECnet: Netfilter Configuration +# +CONFIG_DECNET_NF_GRABULATOR=m + +# +# Bridge: Netfilter Configuration +# +CONFIG_BRIDGE_NF_EBTABLES=m +CONFIG_BRIDGE_EBT_BROUTE=m +CONFIG_BRIDGE_EBT_T_FILTER=m +CONFIG_BRIDGE_EBT_T_NAT=m +CONFIG_BRIDGE_EBT_802_3=m +CONFIG_BRIDGE_EBT_AMONG=m +CONFIG_BRIDGE_EBT_ARP=m +CONFIG_BRIDGE_EBT_IP=m +CONFIG_BRIDGE_EBT_LIMIT=m +CONFIG_BRIDGE_EBT_MARK=m +CONFIG_BRIDGE_EBT_PKTTYPE=m +CONFIG_BRIDGE_EBT_STP=m +CONFIG_BRIDGE_EBT_VLAN=m +CONFIG_BRIDGE_EBT_ARPREPLY=m +CONFIG_BRIDGE_EBT_DNAT=m +CONFIG_BRIDGE_EBT_MARK_T=m +CONFIG_BRIDGE_EBT_REDIRECT=m +CONFIG_BRIDGE_EBT_SNAT=m +CONFIG_BRIDGE_EBT_LOG=m +CONFIG_XFRM=y +CONFIG_XFRM_USER=m + +# +# SCTP Configuration (EXPERIMENTAL) +# +CONFIG_IP_SCTP=m +# CONFIG_SCTP_DBG_MSG is not set +# CONFIG_SCTP_DBG_OBJCNT is not set +# CONFIG_SCTP_HMAC_NONE is not set +# CONFIG_SCTP_HMAC_SHA1 is not set +CONFIG_SCTP_HMAC_MD5=y +CONFIG_ATM=y +CONFIG_ATM_CLIP=y +CONFIG_ATM_CLIP_NO_ICMP=y +CONFIG_ATM_LANE=m +CONFIG_ATM_MPOA=m +CONFIG_ATM_BR2684=m +# CONFIG_ATM_BR2684_IPFILTER is not set +CONFIG_VLAN_8021Q=m +CONFIG_LLC=y +CONFIG_LLC2=m +CONFIG_IPX=m +# CONFIG_IPX_INTERN is not set +CONFIG_ATALK=m +CONFIG_DEV_APPLETALK=y +CONFIG_LTPC=m +CONFIG_COPS=m +CONFIG_COPS_DAYNA=y +CONFIG_COPS_TANGENT=y +CONFIG_IPDDP=m +CONFIG_IPDDP_ENCAP=y +CONFIG_IPDDP_DECAP=y +CONFIG_X25=m +CONFIG_LAPB=m +# CONFIG_NET_DIVERT is not set +CONFIG_ECONET=m +# CONFIG_ECONET_AUNUDP is not set +# CONFIG_ECONET_NATIVE is not set +CONFIG_WAN_ROUTER=m +# CONFIG_NET_FASTROUTE is not set +# CONFIG_NET_HW_FLOWCONTROL is not set + +# +# QoS and/or fair queueing +# 
+CONFIG_NET_SCHED=y +CONFIG_NET_SCH_CBQ=m +CONFIG_NET_SCH_HTB=m +CONFIG_NET_SCH_HFSC=m +CONFIG_NET_SCH_CSZ=m +CONFIG_NET_SCH_ATM=y +CONFIG_NET_SCH_PRIO=m +CONFIG_NET_SCH_RED=m +CONFIG_NET_SCH_SFQ=m +CONFIG_NET_SCH_TEQL=m +CONFIG_NET_SCH_TBF=m +CONFIG_NET_SCH_GRED=m +CONFIG_NET_SCH_DSMARK=m +CONFIG_NET_SCH_DELAY=m +CONFIG_NET_SCH_INGRESS=m +CONFIG_NET_QOS=y +CONFIG_NET_ESTIMATOR=y +CONFIG_NET_CLS=y +CONFIG_NET_CLS_TCINDEX=m +CONFIG_NET_CLS_ROUTE4=m +CONFIG_NET_CLS_ROUTE=y +CONFIG_NET_CLS_FW=m +CONFIG_NET_CLS_U32=m +CONFIG_NET_CLS_RSVP=m +CONFIG_NET_CLS_RSVP6=m +CONFIG_NET_CLS_POLICE=y + +# +# Network testing +# +CONFIG_NET_PKTGEN=m +CONFIG_NETDEVICES=y + +# +# ARCnet devices +# +CONFIG_ARCNET=m +CONFIG_ARCNET_1201=m +CONFIG_ARCNET_1051=m +CONFIG_ARCNET_RAW=m +CONFIG_ARCNET_COM90xx=m +CONFIG_ARCNET_COM90xxIO=m +CONFIG_ARCNET_RIM_I=m +CONFIG_ARCNET_COM20020=m +CONFIG_ARCNET_COM20020_ISA=m +CONFIG_ARCNET_COM20020_PCI=m +CONFIG_DUMMY=m +CONFIG_BONDING=m +CONFIG_EQUALIZER=m +CONFIG_TUN=m +CONFIG_ETHERTAP=m +CONFIG_NET_SB1000=m + +# +# Ethernet (10 or 100Mbit) +# +CONFIG_NET_ETHERNET=y +CONFIG_MII=m +CONFIG_HAPPYMEAL=m +CONFIG_SUNGEM=m +CONFIG_NET_VENDOR_3COM=y +CONFIG_EL1=m +CONFIG_EL2=m +CONFIG_ELPLUS=m +CONFIG_EL16=m +CONFIG_EL3=m +CONFIG_3C515=m +CONFIG_VORTEX=m +CONFIG_TYPHOON=m +CONFIG_LANCE=m +CONFIG_NET_VENDOR_SMC=y +CONFIG_WD80x3=m +CONFIG_ULTRA=m +CONFIG_SMC9194=m +CONFIG_NET_VENDOR_RACAL=y +CONFIG_NI52=m +CONFIG_NI65=m + +# +# Tulip family network device support +# +CONFIG_NET_TULIP=y +CONFIG_DE2104X=m +CONFIG_TULIP=m +# CONFIG_TULIP_MWI is not set +# CONFIG_TULIP_MMIO is not set +CONFIG_TULIP_NAPI=y +CONFIG_TULIP_NAPI_HW_MITIGATION=y +CONFIG_DE4X5=m +CONFIG_WINBOND_840=m +CONFIG_DM9102=m +CONFIG_PCMCIA_XIRCOM=m +CONFIG_AT1700=m +CONFIG_DEPCA=m +CONFIG_HP100=m +CONFIG_NET_ISA=y +CONFIG_E2100=m +CONFIG_EWRK3=m +CONFIG_EEXPRESS=m +CONFIG_EEXPRESS_PRO=m +CONFIG_HPLAN_PLUS=m +CONFIG_HPLAN=m +CONFIG_LP486E=m +CONFIG_ETH16I=m +CONFIG_NE2000=m +CONFIG_ZNET=m +CONFIG_SEEQ8005=m +CONFIG_NET_PCI=y +CONFIG_PCNET32=m +CONFIG_AMD8111_ETH=m +CONFIG_ADAPTEC_STARFIRE=m +CONFIG_ADAPTEC_STARFIRE_NAPI=y +CONFIG_AC3200=m +CONFIG_APRICOT=m +CONFIG_B44=m +CONFIG_FORCEDETH=m +CONFIG_CS89x0=m +CONFIG_DGRS=m +CONFIG_EEPRO100=m +# CONFIG_EEPRO100_PIO is not set +CONFIG_E100=m +CONFIG_E100_NAPI=y +CONFIG_FEALNX=m +CONFIG_NATSEMI=m +CONFIG_NE2K_PCI=m +CONFIG_8139CP=m +CONFIG_8139TOO=m +# CONFIG_8139TOO_PIO is not set +# CONFIG_8139TOO_TUNE_TWISTER is not set +CONFIG_8139TOO_8129=y +# CONFIG_8139_OLD_RX_RESET is not set +CONFIG_8139_RXBUF_IDX=2 +CONFIG_SIS900=m +CONFIG_EPIC100=m +CONFIG_SUNDANCE=m +# CONFIG_SUNDANCE_MMIO is not set +CONFIG_TLAN=m +CONFIG_VIA_RHINE=m +# CONFIG_VIA_RHINE_MMIO is not set +CONFIG_NET_POCKET=y +CONFIG_ATP=m +CONFIG_DE600=m +CONFIG_DE620=m + +# +# Ethernet (1000 Mbit) +# +CONFIG_ACENIC=m +# CONFIG_ACENIC_OMIT_TIGON_I is not set +CONFIG_DL2K=m +CONFIG_E1000=m +CONFIG_E1000_NAPI=y +CONFIG_E1000_NEW=m +CONFIG_E1000_NEW_NAPI=y +CONFIG_NS83820=m +CONFIG_HAMACHI=m +CONFIG_YELLOWFIN=m +CONFIG_R8169=m +CONFIG_SIS190=m +CONFIG_SK98LIN=m +CONFIG_TIGON3=m +CONFIG_NET_BROADCOM=m +CONFIG_NET_BROADCOM_NEW=m +CONFIG_NET_BCM44=m +CONFIG_TIGON3_NEW=m + +# +# Ethernet (10000 Mbit) +# +CONFIG_IXGB=m +CONFIG_IXGB_NAPI=y +CONFIG_S2IO=m +CONFIG_S2IO_NAPI=y +CONFIG_FDDI=y +# CONFIG_DEFXX is not set +CONFIG_SKFP=m +CONFIG_HIPPI=y +CONFIG_ROADRUNNER=m +CONFIG_ROADRUNNER_LARGE_RINGS=y +CONFIG_PLIP=m +CONFIG_PPP=m +CONFIG_PPP_MULTILINK=y +CONFIG_PPP_FILTER=y +CONFIG_PPP_ASYNC=m +CONFIG_PPP_SYNC_TTY=m 
+CONFIG_PPP_DEFLATE=m +CONFIG_PPP_BSDCOMP=m +CONFIG_PPP_MPPE=m +CONFIG_PPPOE=m +CONFIG_PPPOATM=m +CONFIG_SLIP=m +CONFIG_SLIP_COMPRESSED=y +CONFIG_SLIP_SMART=y +CONFIG_SLIP_MODE_SLIP6=y + +# +# Wireless LAN (non-hamradio) +# +CONFIG_NET_RADIO=y + +# +# Obsolete Wireless cards support (pre-802.11) +# +CONFIG_STRIP=m +# CONFIG_ARLAN is not set +CONFIG_WAVELAN=m +CONFIG_PCMCIA_WAVELAN=m +CONFIG_PCMCIA_NETWAVE=m + +# +# Wireless 802.11 Frequency Hopping cards support +# +CONFIG_PCMCIA_RAYCS=m + +# +# Wireless 802.11b ISA/PCI cards support +# +CONFIG_AIRO=m +CONFIG_HERMES=m +CONFIG_PLX_HERMES=m +CONFIG_TMD_HERMES=m +CONFIG_PCI_HERMES=m +CONFIG_ATMEL=m +CONFIG_PCI_ATMEL=m + +# +# Wireless 802.11b Pcmcia/Cardbus cards support +# +CONFIG_PCMCIA_HERMES=m +CONFIG_AIRO_CS=m +CONFIG_PCMCIA_ATMEL=m +CONFIG_PCMCIA_WL3501=m + +# +# Prism GT/Duette 802.11(a/b/g) PCI/Cardbus support +# +CONFIG_PRISM54=m +CONFIG_NET_WIRELESS=y + +# +# Token Ring devices +# +CONFIG_TR=y +CONFIG_IBMTR=m +CONFIG_IBMOL=m +CONFIG_IBMLS=m +CONFIG_3C359=m +CONFIG_TMS380TR=m +CONFIG_TMSPCI=m +CONFIG_SKISA=m +CONFIG_PROTEON=m +CONFIG_ABYSS=m +CONFIG_SMCTR=m +CONFIG_NET_FC=y +CONFIG_NET_LPFC=m +CONFIG_RCPCI=m +CONFIG_SHAPER=m +CONFIG_NETCONSOLE=m + +# +# Wan interfaces +# +CONFIG_WAN=y +CONFIG_HOSTESS_SV11=m +# CONFIG_COSA is not set +CONFIG_DSCC4=m +CONFIG_DSCC4_PCISYNC=y +CONFIG_DSCC4_PCI_RST=y +CONFIG_LANMEDIA=m +CONFIG_SEALEVEL_4021=m +CONFIG_SYNCLINK_SYNCPPP=m +CONFIG_HDLC=m +CONFIG_HDLC_RAW=y +CONFIG_HDLC_RAW_ETH=y +CONFIG_HDLC_CISCO=y +CONFIG_HDLC_FR=y +CONFIG_HDLC_PPP=y +CONFIG_HDLC_X25=y +CONFIG_PCI200SYN=m +CONFIG_WANXL=m +# CONFIG_WANXL_BUILD_FIRMWARE is not set +CONFIG_PC300=m +CONFIG_PC300_MLPPP=y +CONFIG_N2=m +CONFIG_C101=m +CONFIG_FARSYNC=m +CONFIG_DLCI=m +CONFIG_DLCI_COUNT=24 +CONFIG_DLCI_MAX=8 +CONFIG_SDLA=m +# CONFIG_WAN_ROUTER_DRIVERS is not set +CONFIG_LAPBETHER=m +CONFIG_X25_ASY=m +# CONFIG_SBNI is not set + +# +# PCMCIA network device support +# +CONFIG_NET_PCMCIA=y +CONFIG_PCMCIA_3C589=m +CONFIG_PCMCIA_3C574=m +CONFIG_PCMCIA_FMVJ18X=m +CONFIG_PCMCIA_PCNET=m +CONFIG_PCMCIA_NMCLAN=m +CONFIG_PCMCIA_SMC91C92=m +CONFIG_PCMCIA_XIRC2PS=m +CONFIG_PCMCIA_AXNET=m +CONFIG_ARCNET_COM20020_CS=m +CONFIG_PCMCIA_IBMTR=m + +# +# ATM drivers +# +CONFIG_ATM_TCP=m +CONFIG_ATM_LANAI=m +CONFIG_ATM_ENI=m +# CONFIG_ATM_ENI_DEBUG is not set +# CONFIG_ATM_ENI_TUNE_BURST is not set +CONFIG_ATM_FIRESTREAM=m +CONFIG_ATM_ZATM=m +# CONFIG_ATM_ZATM_DEBUG is not set +CONFIG_ATM_NICSTAR=m +CONFIG_ATM_NICSTAR_USE_SUNI=y +CONFIG_ATM_NICSTAR_USE_IDT77105=y +CONFIG_ATM_IDT77252=m +# CONFIG_ATM_IDT77252_DEBUG is not set +CONFIG_ATM_IDT77252_RCV_ALL=y +CONFIG_ATM_IDT77252_USE_SUNI=y +CONFIG_ATM_AMBASSADOR=m +# CONFIG_ATM_AMBASSADOR_DEBUG is not set +CONFIG_ATM_HORIZON=m +# CONFIG_ATM_HORIZON_DEBUG is not set +CONFIG_ATM_IA=m +# CONFIG_ATM_IA_DEBUG is not set +CONFIG_ATM_FORE200E_MAYBE=m +CONFIG_ATM_FORE200E_PCA=y +CONFIG_ATM_FORE200E_PCA_DEFAULT_FW=y +CONFIG_ATM_FORE200E_TX_RETRY=16 +CONFIG_ATM_FORE200E_DEBUG=0 +CONFIG_ATM_FORE200E=m +CONFIG_ATM_HE=m +CONFIG_ATM_HE_USE_SUNI=y + +# +# Amateur Radio support +# +CONFIG_HAMRADIO=y + +# +# Packet Radio protocols +# +CONFIG_AX25=m +CONFIG_AX25_DAMA_SLAVE=y +CONFIG_NETROM=m +CONFIG_ROSE=m + +# +# AX.25 network device drivers +# +CONFIG_BPQETHER=m +CONFIG_SCC=m +CONFIG_SCC_DELAY=y +CONFIG_SCC_TRXECHO=y +CONFIG_BAYCOM_SER_FDX=m +CONFIG_BAYCOM_SER_HDX=m +CONFIG_BAYCOM_PAR=m +CONFIG_BAYCOM_EPP=m +CONFIG_YAM=m + +# +# IrDA (infrared) support +# +CONFIG_IRDA=m + +# +# IrDA protocols +# +CONFIG_IRLAN=m 
+CONFIG_IRNET=m +CONFIG_IRCOMM=m +CONFIG_IRDA_ULTRA=y + +# +# IrDA options +# +CONFIG_IRDA_CACHE_LAST_LSAP=y +# CONFIG_IRDA_FAST_RR is not set +# CONFIG_IRDA_DEBUG is not set + +# +# Infrared-port device drivers +# + +# +# SIR device drivers +# +CONFIG_IRTTY_SIR=m + +# +# Dongle support +# +CONFIG_DONGLE=y +CONFIG_ESI_DONGLE=m +CONFIG_ACTISYS_DONGLE=m +CONFIG_TEKRAM_DONGLE=m +CONFIG_LITELINK_DONGLE=m +CONFIG_MA600_DONGLE=m +CONFIG_GIRBIL_DONGLE=m +CONFIG_MCP2120_DONGLE=m +CONFIG_OLD_BELKIN_DONGLE=m +CONFIG_ACT200L_DONGLE=m + +# +# Old SIR device drivers +# + +# +# Old Serial dongle support +# + +# +# FIR device drivers +# +CONFIG_USB_IRDA=m +CONFIG_SIGMATEL_FIR=m +CONFIG_NSC_FIR=m +CONFIG_WINBOND_FIR=m +CONFIG_TOSHIBA_FIR=m +CONFIG_SMC_IRCC_FIR=m +CONFIG_ALI_FIR=m +CONFIG_VLSI_FIR=m +CONFIG_VIA_FIR=m + +# +# Bluetooth support +# +CONFIG_BT=m +CONFIG_BT_L2CAP=m +CONFIG_BT_SCO=m +CONFIG_BT_RFCOMM=m +CONFIG_BT_RFCOMM_TTY=y +CONFIG_BT_BNEP=m +CONFIG_BT_BNEP_MC_FILTER=y +CONFIG_BT_BNEP_PROTO_FILTER=y +CONFIG_BT_CMTP=m + +# +# Bluetooth device drivers +# +CONFIG_BT_HCIUSB=m +CONFIG_BT_HCIUSB_SCO=y +CONFIG_BT_HCIUART=m +CONFIG_BT_HCIUART_H4=y +CONFIG_BT_HCIUART_BCSP=y +CONFIG_BT_HCIUART_BCSP_TXCRC=y +CONFIG_BT_HCIBCM203X=m +CONFIG_BT_HCIBFUSB=m +CONFIG_BT_HCIDTL1=m +CONFIG_BT_HCIBT3C=m +CONFIG_BT_HCIBLUECARD=m +CONFIG_BT_HCIBTUART=m +CONFIG_BT_HCIVHCI=m +CONFIG_NETPOLL=y +CONFIG_NETPOLL_RX=y +CONFIG_NETPOLL_TRAP=y +CONFIG_NET_POLL_CONTROLLER=y + +# +# ISDN subsystem +# +CONFIG_ISDN=m + +# +# Old ISDN4Linux +# +CONFIG_ISDN_I4L=m +CONFIG_ISDN_PPP=y +CONFIG_ISDN_PPP_VJ=y +CONFIG_ISDN_MPP=y +CONFIG_IPPP_FILTER=y +CONFIG_ISDN_PPP_BSDCOMP=m +CONFIG_ISDN_AUDIO=y +CONFIG_ISDN_TTY_FAX=y +CONFIG_ISDN_X25=y + +# +# ISDN feature submodules +# + +# +# ISDN4Linux hardware drivers +# + +# +# Passive cards +# +CONFIG_ISDN_DRV_HISAX=m + +# +# D-channel protocol features +# +CONFIG_HISAX_EURO=y +CONFIG_DE_AOC=y +# CONFIG_HISAX_NO_SENDCOMPLETE is not set +# CONFIG_HISAX_NO_LLC is not set +# CONFIG_HISAX_NO_KEYPAD is not set +CONFIG_HISAX_1TR6=y +CONFIG_HISAX_NI1=y +CONFIG_HISAX_MAX_CARDS=8 + +# +# HiSax supported cards +# +CONFIG_HISAX_16_0=y +CONFIG_HISAX_16_3=y +CONFIG_HISAX_TELESPCI=y +CONFIG_HISAX_S0BOX=y +CONFIG_HISAX_AVM_A1=y +CONFIG_HISAX_FRITZPCI=y +CONFIG_HISAX_AVM_A1_PCMCIA=y +CONFIG_HISAX_ELSA=y +CONFIG_HISAX_IX1MICROR2=y +CONFIG_HISAX_DIEHLDIVA=y +CONFIG_HISAX_ASUSCOM=y +CONFIG_HISAX_TELEINT=y +CONFIG_HISAX_HFCS=y +CONFIG_HISAX_SEDLBAUER=y +CONFIG_HISAX_SPORTSTER=y +CONFIG_HISAX_MIC=y +CONFIG_HISAX_NETJET=y +CONFIG_HISAX_NETJET_U=y +CONFIG_HISAX_NICCY=y +CONFIG_HISAX_ISURF=y +CONFIG_HISAX_HSTSAPHIR=y +CONFIG_HISAX_BKM_A4T=y +CONFIG_HISAX_SCT_QUADRO=y +CONFIG_HISAX_GAZEL=y +CONFIG_HISAX_HFC_PCI=y +CONFIG_HISAX_W6692=y +CONFIG_HISAX_HFC_SX=y +CONFIG_HISAX_ENTERNOW_PCI=y +CONFIG_HISAX_DEBUG=y + +# +# HiSax PCMCIA card service modules +# +CONFIG_HISAX_SEDLBAUER_CS=m +CONFIG_HISAX_ELSA_CS=m +CONFIG_HISAX_AVM_A1_CS=m +CONFIG_HISAX_TELES_CS=m + +# +# HiSax sub driver modules +# +CONFIG_HISAX_ST5481=m +CONFIG_HISAX_HFCUSB=m +CONFIG_HISAX_FRITZ_PCIPNP=m +CONFIG_HISAX_HDLC=y + +# +# Active cards +# +CONFIG_ISDN_DRV_ICN=m +CONFIG_ISDN_DRV_PCBIT=m +CONFIG_ISDN_DRV_SC=m +CONFIG_ISDN_DRV_ACT2000=m +CONFIG_ISDN_DRV_TPAM=m + +# +# CAPI subsystem +# +CONFIG_ISDN_CAPI=m +CONFIG_ISDN_DRV_AVMB1_VERBOSE_REASON=y +CONFIG_ISDN_CAPI_MIDDLEWARE=y +CONFIG_ISDN_CAPI_CAPI20=m +CONFIG_ISDN_CAPI_CAPIFS_BOOL=y +CONFIG_ISDN_CAPI_CAPIFS=m +CONFIG_ISDN_CAPI_CAPIDRV=m + +# +# CAPI hardware drivers +# + +# +# Active AVM cards +# 
+CONFIG_CAPI_AVM=y +CONFIG_ISDN_DRV_AVMB1_B1ISA=m +CONFIG_ISDN_DRV_AVMB1_B1PCI=m +CONFIG_ISDN_DRV_AVMB1_B1PCIV4=y +CONFIG_ISDN_DRV_AVMB1_T1ISA=m +CONFIG_ISDN_DRV_AVMB1_B1PCMCIA=m +CONFIG_ISDN_DRV_AVMB1_AVM_CS=m +CONFIG_ISDN_DRV_AVMB1_T1PCI=m +CONFIG_ISDN_DRV_AVMB1_C4=m + +# +# Active Eicon DIVA Server cards +# +CONFIG_CAPI_EICON=y +CONFIG_ISDN_DIVAS=m +CONFIG_ISDN_DIVAS_BRIPCI=y +CONFIG_ISDN_DIVAS_PRIPCI=y +CONFIG_ISDN_DIVAS_DIVACAPI=m +CONFIG_ISDN_DIVAS_USERIDI=m +CONFIG_ISDN_DIVAS_MAINT=m + +# +# Telephony Support +# +CONFIG_PHONE=m +CONFIG_PHONE_IXJ=m +CONFIG_PHONE_IXJ_PCMCIA=m + +# +# Input device support +# +CONFIG_INPUT=y + +# +# Userland interfaces +# +CONFIG_INPUT_MOUSEDEV=y +CONFIG_INPUT_MOUSEDEV_PSAUX=y +CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024 +CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768 +CONFIG_INPUT_JOYDEV=m +CONFIG_INPUT_TSDEV=m +CONFIG_INPUT_TSDEV_SCREEN_X=240 +CONFIG_INPUT_TSDEV_SCREEN_Y=320 +CONFIG_INPUT_EVDEV=m +# CONFIG_INPUT_EVBUG is not set + +# +# Input I/O drivers +# +CONFIG_GAMEPORT=m +CONFIG_SOUND_GAMEPORT=m +CONFIG_GAMEPORT_NS558=m +CONFIG_GAMEPORT_L4=m +CONFIG_GAMEPORT_EMU10K1=m +CONFIG_GAMEPORT_VORTEX=m +CONFIG_GAMEPORT_FM801=m +CONFIG_GAMEPORT_CS461x=m +CONFIG_SERIO=y +CONFIG_SERIO_I8042=y +CONFIG_SERIO_SERPORT=m +CONFIG_SERIO_CT82C710=m +CONFIG_SERIO_PARKBD=m +CONFIG_SERIO_PCIPS2=m + +# +# Input Device Drivers +# +CONFIG_INPUT_KEYBOARD=y +CONFIG_KEYBOARD_ATKBD=y +CONFIG_KEYBOARD_SUNKBD=m +# CONFIG_KEYBOARD_LKKBD is not set +CONFIG_KEYBOARD_XTKBD=m +CONFIG_KEYBOARD_NEWTON=m +CONFIG_INPUT_MOUSE=y +CONFIG_MOUSE_PS2=y +CONFIG_MOUSE_SERIAL=m +CONFIG_MOUSE_INPORT=m +CONFIG_MOUSE_ATIXL=y +CONFIG_MOUSE_LOGIBM=m +CONFIG_MOUSE_PC110PAD=m +# CONFIG_MOUSE_VSXXXAA is not set +CONFIG_INPUT_JOYSTICK=y +CONFIG_JOYSTICK_ANALOG=m +CONFIG_JOYSTICK_A3D=m +CONFIG_JOYSTICK_ADI=m +CONFIG_JOYSTICK_COBRA=m +CONFIG_JOYSTICK_GF2K=m +CONFIG_JOYSTICK_GRIP=m +CONFIG_JOYSTICK_GRIP_MP=m +CONFIG_JOYSTICK_GUILLEMOT=m +CONFIG_JOYSTICK_INTERACT=m +CONFIG_JOYSTICK_SIDEWINDER=m +CONFIG_JOYSTICK_TMDC=m +CONFIG_JOYSTICK_IFORCE=m +CONFIG_JOYSTICK_IFORCE_USB=y +CONFIG_JOYSTICK_IFORCE_232=y +CONFIG_JOYSTICK_WARRIOR=m +CONFIG_JOYSTICK_MAGELLAN=m +CONFIG_JOYSTICK_SPACEORB=m +CONFIG_JOYSTICK_SPACEBALL=m +CONFIG_JOYSTICK_STINGER=m +CONFIG_JOYSTICK_TWIDDLER=m +CONFIG_JOYSTICK_DB9=m +CONFIG_JOYSTICK_GAMECON=m +CONFIG_JOYSTICK_TURBOGRAFX=m +# CONFIG_INPUT_JOYDUMP is not set +CONFIG_INPUT_TOUCHSCREEN=y +CONFIG_TOUCHSCREEN_GUNZE=m +CONFIG_INPUT_MISC=y +CONFIG_INPUT_PCSPKR=y +CONFIG_INPUT_UINPUT=m + +# +# Character devices +# +CONFIG_VT=y +CONFIG_VT_CONSOLE=y +CONFIG_HW_CONSOLE=y +CONFIG_ECC=m +CONFIG_SERIAL_NONSTANDARD=y +CONFIG_ROCKETPORT=m +CONFIG_SYNCLINK=m +CONFIG_SYNCLINKMP=m +CONFIG_N_HDLC=m +CONFIG_STALDRV=y + +# +# Serial drivers +# +CONFIG_SERIAL_8250=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_SERIAL_8250_CS=m +# CONFIG_SERIAL_8250_ACPI is not set +CONFIG_SERIAL_8250_NR_UARTS=4 +CONFIG_SERIAL_8250_EXTENDED=y +CONFIG_SERIAL_8250_MANY_PORTS=y +CONFIG_SERIAL_8250_SHARE_IRQ=y +# CONFIG_SERIAL_8250_DETECT_IRQ is not set +CONFIG_SERIAL_8250_MULTIPORT=y +CONFIG_SERIAL_8250_RSA=y + +# +# Non-8250 serial port support +# +CONFIG_SERIAL_CORE=y +CONFIG_SERIAL_CORE_CONSOLE=y +# CONFIG_SERIAL_ICOM is not set +CONFIG_SERIAL_JSM=m +CONFIG_UNIX98_PTYS=y +CONFIG_LEGACY_PTYS=y +CONFIG_LEGACY_PTY_COUNT=256 +CONFIG_PRINTER=m +# CONFIG_LP_CONSOLE is not set +CONFIG_PPDEV=m +CONFIG_TIPAR=m +CONFIG_QIC02_TAPE=m +CONFIG_QIC02_DYNCONF=y + +# +# Setting runtime QIC-02 configuration is done with qic02conf +# + +# +# from the tpqic02-support 
package. It is available at +# + +# +# metalab.unc.edu or ftp://titus.cfw.com/pub/Linux/util/ +# + +# +# IPMI +# +CONFIG_IPMI_HANDLER=m +CONFIG_IPMI_PANIC_EVENT=y +CONFIG_IPMI_PANIC_STRING=y +CONFIG_IPMI_DEVICE_INTERFACE=m +CONFIG_IPMI_KCS=m +CONFIG_IPMI_WATCHDOG=m + +# +# Watchdog Cards +# +CONFIG_WATCHDOG=y +# CONFIG_WATCHDOG_NOWAYOUT is not set + +# +# Watchdog Device Drivers +# +CONFIG_SOFT_WATCHDOG=m +CONFIG_ACQUIRE_WDT=m +CONFIG_ADVANTECH_WDT=m +CONFIG_ALIM1535_WDT=m +CONFIG_ALIM7101_WDT=m +CONFIG_AMD7XX_TCO=m +CONFIG_SC520_WDT=m +CONFIG_EUROTECH_WDT=m +CONFIG_IB700_WDT=m +CONFIG_WAFER_WDT=m +CONFIG_I8XX_TCO=m +CONFIG_SC1200_WDT=m +CONFIG_SCx200_WDT=m +CONFIG_60XX_WDT=m +CONFIG_CPU5_WDT=m +CONFIG_W83627HF_WDT=m +CONFIG_W83877F_WDT=m +CONFIG_MACHZ_WDT=m + +# +# ISA-based Watchdog Cards +# +CONFIG_PCWATCHDOG=m +CONFIG_MIXCOMWD=m +CONFIG_WDT=m +CONFIG_WDT_501=y + +# +# PCI-based Watchdog Cards +# +CONFIG_PCIPCWATCHDOG=m +CONFIG_WDTPCI=m +CONFIG_WDT_501_PCI=y + +# +# USB-based Watchdog Cards +# +CONFIG_USBPCWATCHDOG=m +CONFIG_HW_RANDOM=m +CONFIG_NVRAM=m +CONFIG_RTC=y +CONFIG_DTLK=m +CONFIG_R3964=m +CONFIG_APPLICOM=m +CONFIG_SONYPI=m + +# +# Ftape, the floppy tape device driver +# +CONFIG_AGP=m +CONFIG_AGP_ALI=m +CONFIG_AGP_ATI=m +CONFIG_AGP_AMD=m +CONFIG_AGP_AMD64=m +CONFIG_AGP_INTEL=m +CONFIG_AGP_INTEL_MCH=m +CONFIG_AGP_NVIDIA=m +CONFIG_AGP_SIS=m +CONFIG_AGP_SWORKS=m +CONFIG_AGP_VIA=m +CONFIG_AGP_EFFICEON=m +# CONFIG_DRM is not set + +# +# PCMCIA character devices +# +CONFIG_SYNCLINK_CS=m +# CONFIG_MWAVE is not set +CONFIG_SCx200_GPIO=m +CONFIG_RAW_DRIVER=m +CONFIG_MAX_RAW_DEVS=4096 +CONFIG_HANGCHECK_TIMER=m +CONFIG_VTUNE=m + +# +# Linux InfraRed Controller +# +CONFIG_LIRC_SUPPORT=m +CONFIG_LIRC_MAX_DEV=2 +CONFIG_LIRC_BT829=m +CONFIG_LIRC_IT87=m +CONFIG_LIRC_ATIUSB=m +CONFIG_LIRC_SERIAL=m +# CONFIG_LIRC_HOMEBREW is not set +CONFIG_LIRC_PORT_SERIAL=0x3f8 +CONFIG_LIRC_IRQ_SERIAL=4 +CONFIG_LIRC_SIR=m +CONFIG_LIRC_PORT_SIR=0x3f8 +CONFIG_LIRC_IRQ_SIR=4 + +# +# I2C support +# +CONFIG_I2C=m +CONFIG_I2C_CHARDEV=m + +# +# I2C Algorithms +# +CONFIG_I2C_ALGOBIT=m +CONFIG_I2C_ALGOPCF=m + +# +# I2C Hardware Bus support +# +CONFIG_I2C_ALI1535=m +CONFIG_I2C_ALI15X3=m +CONFIG_I2C_AMD756=m +CONFIG_I2C_AMD8111=m +CONFIG_I2C_I801=m +CONFIG_I2C_I810=m +CONFIG_I2C_ISA=m +CONFIG_I2C_NFORCE2=m +CONFIG_I2C_PARPORT=m +CONFIG_I2C_PARPORT_LIGHT=m +CONFIG_I2C_PIIX4=m +CONFIG_I2C_PROSAVAGE=m +CONFIG_I2C_SAVAGE4=m +CONFIG_SCx200_I2C=m +CONFIG_SCx200_I2C_SCL=12 +CONFIG_SCx200_I2C_SDA=13 +CONFIG_SCx200_ACB=m +CONFIG_I2C_SIS5595=m +CONFIG_I2C_SIS630=m +CONFIG_I2C_SIS96X=m +CONFIG_I2C_VIA=m +CONFIG_I2C_VIAPRO=m +CONFIG_I2C_VOODOO3=m + +# +# Hardware Sensors Chip support +# +CONFIG_I2C_SENSOR=m +CONFIG_SENSORS_ADM1021=m +CONFIG_SENSORS_ASB100=m +CONFIG_SENSORS_DS1621=m +CONFIG_SENSORS_FSCHER=m +CONFIG_SENSORS_GL518SM=m +CONFIG_SENSORS_IT87=m +CONFIG_SENSORS_LM75=m +CONFIG_SENSORS_LM78=m +CONFIG_SENSORS_LM80=m +CONFIG_SENSORS_LM83=m +CONFIG_SENSORS_LM85=m +CONFIG_SENSORS_LM90=m +CONFIG_SENSORS_VIA686A=m +CONFIG_SENSORS_W83781D=m +CONFIG_SENSORS_W83L785TS=m +CONFIG_SENSORS_W83627HF=m + +# +# Other I2C Chip support +# +CONFIG_SENSORS_EEPROM=m +# CONFIG_I2C_DEBUG_CORE is not set +# CONFIG_I2C_DEBUG_ALGO is not set +# CONFIG_I2C_DEBUG_BUS is not set +# CONFIG_I2C_DEBUG_CHIP is not set + +# +# Misc devices +# +CONFIG_IBM_ASM=m + +# +# Multimedia devices +# +CONFIG_VIDEO_DEV=m + +# +# Video For Linux +# + +# +# Video Adapters +# +CONFIG_VIDEO_BT848=m +CONFIG_VIDEO_PMS=m +CONFIG_VIDEO_BWQCAM=m +CONFIG_VIDEO_CQCAM=m 
+CONFIG_VIDEO_W9966=m +CONFIG_VIDEO_CPIA=m +CONFIG_VIDEO_CPIA_PP=m +CONFIG_VIDEO_CPIA_USB=m +CONFIG_VIDEO_SAA5246A=m +CONFIG_VIDEO_SAA5249=m +CONFIG_TUNER_3036=m +CONFIG_VIDEO_STRADIS=m +CONFIG_VIDEO_ZORAN=m +CONFIG_VIDEO_ZORAN_BUZ=m +CONFIG_VIDEO_ZORAN_DC10=m +CONFIG_VIDEO_ZORAN_DC30=m +CONFIG_VIDEO_ZORAN_LML33=m +CONFIG_VIDEO_ZORAN_LML33R10=m +CONFIG_VIDEO_SAA7134=m +CONFIG_VIDEO_MXB=m +CONFIG_VIDEO_DPC=m +CONFIG_VIDEO_HEXIUM_ORION=m +CONFIG_VIDEO_HEXIUM_GEMINI=m +CONFIG_VIDEO_CX88=m + +# +# Radio Adapters +# +CONFIG_RADIO_CADET=m +CONFIG_RADIO_RTRACK=m +CONFIG_RADIO_RTRACK2=m +CONFIG_RADIO_AZTECH=m +CONFIG_RADIO_GEMTEK=m +CONFIG_RADIO_GEMTEK_PCI=m +CONFIG_RADIO_MAXIRADIO=m +CONFIG_RADIO_MAESTRO=m +CONFIG_RADIO_MIROPCM20=m +# CONFIG_RADIO_MIROPCM20_RDS is not set +CONFIG_RADIO_SF16FMI=m +CONFIG_RADIO_SF16FMR2=m +CONFIG_RADIO_TERRATEC=m +CONFIG_RADIO_TRUST=m +CONFIG_RADIO_TYPHOON=m +CONFIG_RADIO_TYPHOON_PROC_FS=y +CONFIG_RADIO_ZOLTRIX=m + +# +# Digital Video Broadcasting Devices +# +CONFIG_DVB=y +CONFIG_DVB_CORE=m + +# +# Supported Frontend Modules +# +CONFIG_DVB_TWINHAN_DST=m +CONFIG_DVB_STV0299=m +CONFIG_DVB_SP887X=m +CONFIG_DVB_SP887X_FIRMWARE_FILE="/etc/dvb/sc_main.mc" +CONFIG_DVB_ALPS_TDLB7=m +CONFIG_DVB_ALPS_TDMB7=m +CONFIG_DVB_ATMEL_AT76C651=m +CONFIG_DVB_CX24110=m +CONFIG_DVB_GRUNDIG_29504_491=m +CONFIG_DVB_GRUNDIG_29504_401=m +CONFIG_DVB_MT312=m +CONFIG_DVB_VES1820=m +CONFIG_DVB_VES1X93=m +CONFIG_DVB_TDA1004X=m +CONFIG_DVB_TDA1004X_FIRMWARE_FILE="/usr/lib/hotplug/firmware/tda1004x.bin" +CONFIG_DVB_NXT6000=m + +# +# Supported SAA7146 based PCI Adapters +# +CONFIG_DVB_AV7110=m +# CONFIG_DVB_AV7110_FIRMWARE is not set +CONFIG_DVB_AV7110_OSD=y +CONFIG_DVB_BUDGET=m +CONFIG_DVB_BUDGET_CI=m +CONFIG_DVB_BUDGET_AV=m +CONFIG_DVB_BUDGET_PATCH=m + +# +# Supported USB Adapters +# +CONFIG_DVB_TTUSB_BUDGET=m +CONFIG_DVB_TTUSB_DEC=m + +# +# Supported FlexCopII (B2C2) Adapters +# +CONFIG_DVB_B2C2_SKYSTAR=m + +# +# Supported BT878 Adapters +# +CONFIG_DVB_BT8XX=m +CONFIG_VIDEO_SAA7146=m +CONFIG_VIDEO_SAA7146_VV=m +CONFIG_VIDEO_VIDEOBUF=m +CONFIG_VIDEO_TUNER=m +CONFIG_VIDEO_BUF=m +CONFIG_VIDEO_BTCX=m +CONFIG_VIDEO_IR=m + +# +# Graphics support +# +CONFIG_FB=y +CONFIG_FB_PM2=m +CONFIG_FB_PM2_FIFO_DISCONNECT=y +CONFIG_FB_CYBER2000=m +CONFIG_FB_IMSTT=y +CONFIG_FB_VGA16=m +CONFIG_FB_VESA=y +CONFIG_VIDEO_SELECT=y +CONFIG_FB_HGA=m +CONFIG_FB_RIVA=m +CONFIG_FB_I810=m +CONFIG_FB_I810_GTF=y +# CONFIG_FB_MATROX is not set +# CONFIG_FB_RADEON_OLD is not set +CONFIG_FB_RADEON=m +CONFIG_FB_RADEON_I2C=y +# CONFIG_FB_RADEON_DEBUG is not set +# CONFIG_FB_ATY128 is not set +CONFIG_FB_ATY=m +CONFIG_FB_ATY_CT=y +CONFIG_FB_ATY_GX=y +CONFIG_FB_ATY_XL_INIT=y +CONFIG_FB_SIS=m +CONFIG_FB_SIS_300=y +CONFIG_FB_SIS_315=y +CONFIG_FB_NEOMAGIC=m +CONFIG_FB_KYRO=m +CONFIG_FB_3DFX=m +CONFIG_FB_VOODOO1=m +CONFIG_FB_TRIDENT=m +# CONFIG_FB_VIRTUAL is not set + +# +# Console display driver support +# +CONFIG_VGA_CONSOLE=y +CONFIG_MDA_CONSOLE=m +CONFIG_DUMMY_CONSOLE=y +CONFIG_FRAMEBUFFER_CONSOLE=y +CONFIG_PCI_CONSOLE=y +# CONFIG_FONTS is not set +CONFIG_FONT_8x8=y +CONFIG_FONT_8x16=y + +# +# Logo configuration +# +# CONFIG_LOGO is not set + +# +# Bootsplash configuration +# +CONFIG_BOOTSPLASH=y + +# +# Sound +# +CONFIG_SOUND=m + +# +# Advanced Linux Sound Architecture +# +CONFIG_SND=m +CONFIG_SND_TIMER=m +CONFIG_SND_PCM=m +CONFIG_SND_HWDEP=m +CONFIG_SND_RAWMIDI=m +CONFIG_SND_SEQUENCER=m +CONFIG_SND_SEQ_DUMMY=m +CONFIG_SND_OSSEMUL=y +CONFIG_SND_MIXER_OSS=m +CONFIG_SND_PCM_OSS=m +CONFIG_SND_SEQUENCER_OSS=y +CONFIG_SND_RTCTIMER=m 
+CONFIG_SND_VERBOSE_PRINTK=y +CONFIG_SND_DEBUG=y +CONFIG_SND_DEBUG_MEMORY=y +# CONFIG_SND_DEBUG_DETECT is not set + +# +# Generic devices +# +CONFIG_SND_MPU401_UART=m +CONFIG_SND_OPL3_LIB=m +CONFIG_SND_OPL4_LIB=m +CONFIG_SND_VX_LIB=m +CONFIG_SND_DUMMY=m +CONFIG_SND_VIRMIDI=m +CONFIG_SND_MTPAV=m +CONFIG_SND_SERIAL_U16550=m +CONFIG_SND_MPU401=m + +# +# ISA devices +# +CONFIG_SND_AD1816A=m +CONFIG_SND_AD1848=m +CONFIG_SND_CS4231=m +CONFIG_SND_CS4232=m +CONFIG_SND_CS4236=m +CONFIG_SND_ES968=m +CONFIG_SND_ES1688=m +CONFIG_SND_ES18XX=m +CONFIG_SND_GUSCLASSIC=m +CONFIG_SND_GUSEXTREME=m +CONFIG_SND_GUSMAX=m +CONFIG_SND_INTERWAVE=m +CONFIG_SND_INTERWAVE_STB=m +CONFIG_SND_OPTI92X_AD1848=m +CONFIG_SND_OPTI92X_CS4231=m +CONFIG_SND_OPTI93X=m +CONFIG_SND_SB8=m +CONFIG_SND_SB16=m +CONFIG_SND_SBAWE=m +CONFIG_SND_SB16_CSP=y +CONFIG_SND_WAVEFRONT=m +CONFIG_SND_ALS100=m +CONFIG_SND_AZT2320=m +CONFIG_SND_CMI8330=m +CONFIG_SND_DT019X=m +CONFIG_SND_OPL3SA2=m +CONFIG_SND_SGALAXY=m +CONFIG_SND_SSCAPE=m + +# +# PCI devices +# +CONFIG_SND_AC97_CODEC=m +CONFIG_SND_ALI5451=m +CONFIG_SND_ATIIXP=m +CONFIG_SND_AU8810=m +CONFIG_SND_AU8820=m +CONFIG_SND_AU8830=m +CONFIG_SND_AZT3328=m +CONFIG_SND_BT87X=m +CONFIG_SND_CS46XX=m +CONFIG_SND_CS46XX_NEW_DSP=y +CONFIG_SND_CS4281=m +CONFIG_SND_EMU10K1=m +CONFIG_SND_KORG1212=m +CONFIG_SND_MIXART=m +CONFIG_SND_NM256=m +CONFIG_SND_RME32=m +CONFIG_SND_RME96=m +CONFIG_SND_RME9652=m +CONFIG_SND_HDSP=m +CONFIG_SND_TRIDENT=m +CONFIG_SND_YMFPCI=m +CONFIG_SND_ALS4000=m +CONFIG_SND_CMIPCI=m +CONFIG_SND_ENS1370=m +CONFIG_SND_ENS1371=m +CONFIG_SND_ES1938=m +CONFIG_SND_ES1968=m +CONFIG_SND_MAESTRO3=m +CONFIG_SND_FM801=m +CONFIG_SND_FM801_TEA575X=m +CONFIG_SND_ICE1712=m +CONFIG_SND_ICE1724=m +CONFIG_SND_INTEL8X0=m +CONFIG_SND_INTEL8X0M=m +CONFIG_SND_SONICVIBES=m +CONFIG_SND_VIA82XX=m +CONFIG_SND_VX222=m + +# +# ALSA USB devices +# +CONFIG_SND_USB_AUDIO=m + +# +# PCMCIA devices +# +# CONFIG_SND_VXPOCKET is not set +# CONFIG_SND_VXP440 is not set +# CONFIG_SND_PDAUDIOCF is not set + +# +# Open Sound System +# +CONFIG_SOUND_PRIME=m +CONFIG_SOUND_BT878=m +CONFIG_SOUND_CMPCI=m +CONFIG_SOUND_CMPCI_FM=y +CONFIG_SOUND_CMPCI_FMIO=0x388 +CONFIG_SOUND_CMPCI_MIDI=y +CONFIG_SOUND_CMPCI_MPUIO=0x330 +CONFIG_SOUND_CMPCI_JOYSTICK=y +CONFIG_SOUND_CMPCI_CM8738=y +# CONFIG_SOUND_CMPCI_SPDIFINVERSE is not set +CONFIG_SOUND_CMPCI_SPDIFLOOP=y +CONFIG_SOUND_CMPCI_SPEAKERS=2 +CONFIG_SOUND_EMU10K1=m +CONFIG_MIDI_EMU10K1=y +# CONFIG_SOUND_FUSION is not set +CONFIG_SOUND_CS4281=m +CONFIG_SOUND_ES1370=m +CONFIG_SOUND_ES1371=m +CONFIG_SOUND_ESSSOLO1=m +CONFIG_SOUND_MAESTRO=m +CONFIG_SOUND_MAESTRO3=m +CONFIG_SOUND_ICH=m +CONFIG_SOUND_SONICVIBES=m +CONFIG_SOUND_TRIDENT=m +# CONFIG_SOUND_MSNDCLAS is not set +# CONFIG_SOUND_MSNDPIN is not set +CONFIG_SOUND_VIA82CXXX=m +CONFIG_MIDI_VIA82CXXX=y +CONFIG_SOUND_OSS=m +CONFIG_SOUND_TRACEINIT=y +CONFIG_SOUND_DMAP=y +# CONFIG_SOUND_AD1816 is not set +CONFIG_SOUND_AD1889=m +CONFIG_SOUND_SGALAXY=m +CONFIG_SOUND_ADLIB=m +CONFIG_SOUND_ACI_MIXER=m +CONFIG_SOUND_CS4232=m +CONFIG_SOUND_SSCAPE=m +CONFIG_SOUND_GUS=m +# CONFIG_SOUND_GUS16 is not set +CONFIG_SOUND_GUSMAX=y +CONFIG_SOUND_VMIDI=m +CONFIG_SOUND_TRIX=m +CONFIG_SOUND_MSS=m +CONFIG_SOUND_MPU401=m +CONFIG_SOUND_NM256=m +CONFIG_SOUND_MAD16=m +CONFIG_MAD16_OLDCARD=y +CONFIG_SOUND_PAS=m +CONFIG_SOUND_PSS=m +CONFIG_PSS_MIXER=y +# CONFIG_PSS_HAVE_BOOT is not set +CONFIG_SOUND_SB=m +# CONFIG_SOUND_AWE32_SYNTH is not set +CONFIG_SOUND_WAVEFRONT=m +CONFIG_SOUND_MAUI=m +CONFIG_SOUND_YM3812=m +CONFIG_SOUND_OPL3SA1=m +CONFIG_SOUND_OPL3SA2=m 
+CONFIG_SOUND_YMFPCI=m +CONFIG_SOUND_YMFPCI_LEGACY=y +CONFIG_SOUND_UART6850=m +CONFIG_SOUND_AEDSP16=m +CONFIG_SC6600=y +CONFIG_SC6600_JOY=y +CONFIG_SC6600_CDROM=4 +CONFIG_SC6600_CDROMBASE=0x0 +# CONFIG_AEDSP16_MSS is not set +# CONFIG_AEDSP16_SBPRO is not set +CONFIG_AEDSP16_MPU401=y +CONFIG_SOUND_TVMIXER=m +CONFIG_SOUND_KAHLUA=m +CONFIG_SOUND_ALI5455=m +CONFIG_SOUND_FORTE=m +CONFIG_SOUND_RME96XX=m +CONFIG_SOUND_AD1980=m + +# +# USB support +# +CONFIG_USB=m +# CONFIG_USB_DEBUG is not set + +# +# Miscellaneous USB options +# +CONFIG_USB_DEVICEFS=y +# CONFIG_USB_BANDWIDTH is not set +# CONFIG_USB_DYNAMIC_MINORS is not set + +# +# USB Host Controller Drivers +# +CONFIG_USB_EHCI_HCD=m +CONFIG_USB_EHCI_SPLIT_ISO=y +CONFIG_USB_EHCI_ROOT_HUB_TT=y +CONFIG_USB_OHCI_HCD=m +CONFIG_USB_UHCI_HCD=m + +# +# USB Device Class drivers +# +CONFIG_USB_AUDIO=m + +# +# USB Bluetooth TTY can only be used with disabled Bluetooth subsystem +# +CONFIG_USB_MIDI=m +CONFIG_USB_ACM=m +CONFIG_USB_PRINTER=m +CONFIG_USB_STORAGE=m +# CONFIG_USB_STORAGE_DEBUG is not set +CONFIG_USB_STORAGE_DATAFAB=y +CONFIG_USB_STORAGE_FREECOM=y +CONFIG_USB_STORAGE_ISD200=y +CONFIG_USB_STORAGE_DPCM=y +CONFIG_USB_STORAGE_HP8200e=y +CONFIG_USB_STORAGE_SDDR09=y +CONFIG_USB_STORAGE_SDDR55=y +CONFIG_USB_STORAGE_JUMPSHOT=y + +# +# USB Human Interface Devices (HID) +# +CONFIG_USB_HID=m +CONFIG_USB_HIDINPUT=y +CONFIG_HID_FF=y +CONFIG_HID_PID=y +CONFIG_LOGITECH_FF=y +CONFIG_THRUSTMASTER_FF=y +CONFIG_USB_HIDDEV=y + +# +# USB HID Boot Protocol drivers +# +# CONFIG_USB_KBD is not set +# CONFIG_USB_MOUSE is not set +CONFIG_USB_AIPTEK=m +CONFIG_USB_WACOM=m +CONFIG_USB_KBTAB=m +CONFIG_USB_POWERMATE=m +CONFIG_USB_MTOUCH=m +CONFIG_USB_XPAD=m +CONFIG_USB_ATI_REMOTE=m + +# +# USB Imaging devices +# +CONFIG_USB_MDC800=m +CONFIG_USB_MICROTEK=m +CONFIG_USB_HPUSBSCSI=m + +# +# USB Multimedia devices +# +CONFIG_USB_DABUSB=m +CONFIG_USB_VICAM=m +CONFIG_USB_DSBR=m +CONFIG_USB_IBMCAM=m +CONFIG_USB_KONICAWC=m +CONFIG_USB_OV511=m +CONFIG_USB_SE401=m +CONFIG_USB_STV680=m +CONFIG_USB_W9968CF=m + +# +# USB Network adaptors +# +CONFIG_USB_CATC=m +CONFIG_USB_KAWETH=m +CONFIG_USB_PEGASUS=m +CONFIG_USB_RTL8150=m +CONFIG_USB_USBNET=m + +# +# USB Host-to-Host Cables +# +CONFIG_USB_ALI_M5632=y +CONFIG_USB_AN2720=y +CONFIG_USB_BELKIN=y +CONFIG_USB_GENESYS=y +CONFIG_USB_NET1080=y +CONFIG_USB_PL2301=y + +# +# Intelligent USB Devices/Gadgets +# +CONFIG_USB_ARMLINUX=y +CONFIG_USB_EPSON2888=y +CONFIG_USB_ZAURUS=y +CONFIG_USB_CDCETHER=y + +# +# USB Network Adapters +# +CONFIG_USB_AX8817X=y + +# +# USB port drivers +# +CONFIG_USB_USS720=m + +# +# USB Serial Converter support +# +CONFIG_USB_SERIAL=m +CONFIG_USB_SERIAL_GENERIC=y +CONFIG_USB_SERIAL_BELKIN=m +CONFIG_USB_SERIAL_DIGI_ACCELEPORT=m +CONFIG_USB_SERIAL_EMPEG=m +CONFIG_USB_SERIAL_FTDI_SIO=m +CONFIG_USB_SERIAL_VISOR=m +CONFIG_USB_SERIAL_IPAQ=m +CONFIG_USB_SERIAL_IR=m +CONFIG_USB_SERIAL_EDGEPORT=m +CONFIG_USB_SERIAL_EDGEPORT_TI=m +CONFIG_USB_SERIAL_KEYSPAN_PDA=m +CONFIG_USB_SERIAL_KEYSPAN=m +CONFIG_USB_SERIAL_KEYSPAN_MPR=y +CONFIG_USB_SERIAL_KEYSPAN_USA28=y +CONFIG_USB_SERIAL_KEYSPAN_USA28X=y +CONFIG_USB_SERIAL_KEYSPAN_USA28XA=y +CONFIG_USB_SERIAL_KEYSPAN_USA28XB=y +CONFIG_USB_SERIAL_KEYSPAN_USA19=y +CONFIG_USB_SERIAL_KEYSPAN_USA18X=y +CONFIG_USB_SERIAL_KEYSPAN_USA19W=y +CONFIG_USB_SERIAL_KEYSPAN_USA19QW=y +CONFIG_USB_SERIAL_KEYSPAN_USA19QI=y +CONFIG_USB_SERIAL_KEYSPAN_USA49W=y +CONFIG_USB_SERIAL_KEYSPAN_USA49WLC=y +CONFIG_USB_SERIAL_KLSI=m +CONFIG_USB_SERIAL_KOBIL_SCT=m +CONFIG_USB_SERIAL_MCT_U232=m +CONFIG_USB_SERIAL_PL2303=m 
+CONFIG_USB_SERIAL_SAFE=m +CONFIG_USB_SERIAL_SAFE_PADDED=y +CONFIG_USB_SERIAL_CYBERJACK=m +CONFIG_USB_SERIAL_XIRCOM=m +CONFIG_USB_SERIAL_OMNINET=m +CONFIG_USB_EZUSB=y + +# +# USB Miscellaneous drivers +# +CONFIG_USB_EMI62=m +CONFIG_USB_EMI26=m +CONFIG_USB_TIGL=m +CONFIG_USB_AUERSWALD=m +CONFIG_USB_RIO500=m +CONFIG_USB_LEGOTOWER=m +CONFIG_USB_LCD=m +CONFIG_USB_LED=m +CONFIG_USB_CYTHERM=m +CONFIG_USB_SPEEDTOUCH=m +# CONFIG_USB_TEST is not set + +# +# USB Gadget Support +# +# CONFIG_USB_GADGET is not set + +# +# InfiniBand support +# +CONFIG_INFINIBAND=m +CONFIG_INFINIBAND_IPOIB=m +# CONFIG_INFINIBAND_SDP is not set +CONFIG_INFINIBAND_SRP=m +CONFIG_INFINIBAND_UDAPL_HELPER=m +CONFIG_INFINIBAND_MELLANOX_HCA=m +CONFIG_AUDIT=m + +# +# File systems +# +CONFIG_EXT2_FS=y +CONFIG_EXT2_FS_XATTR=y +CONFIG_EXT2_FS_POSIX_ACL=y +CONFIG_EXT2_FS_SECURITY=y +CONFIG_EXT3_FS=m +CONFIG_EXT3_FS_XATTR=y +CONFIG_EXT3_FS_POSIX_ACL=y +CONFIG_EXT3_FS_SECURITY=y +CONFIG_JBD=m +CONFIG_JBD_DEBUG=y +CONFIG_FS_MBCACHE=y +CONFIG_REISERFS_FS=m +# CONFIG_REISERFS_CHECK is not set +# CONFIG_REISERFS_PROC_INFO is not set +CONFIG_REISERFS_FS_XATTR=y +CONFIG_REISERFS_FS_POSIX_ACL=y +CONFIG_REISERFS_FS_SECURITY=y +CONFIG_JFS_FS=m +CONFIG_JFS_POSIX_ACL=y +CONFIG_JFS_DMAPI=y +# CONFIG_JFS_DEBUG is not set +CONFIG_JFS_STATISTICS=y +CONFIG_FS_POSIX_ACL=y +CONFIG_XFS_FS=m +CONFIG_XFS_RT=y +CONFIG_XFS_QUOTA=m +CONFIG_XFS_DMAPI=y +CONFIG_XFS_SECURITY=y +CONFIG_XFS_POSIX_ACL=y +CONFIG_MINIX_FS=y +CONFIG_ROMFS_FS=m +CONFIG_DMAPI=m +# CONFIG_DMAPI_DEBUG is not set +CONFIG_QUOTA=y +CONFIG_QFMT_V1=m +CONFIG_QFMT_V2=m +CONFIG_QUOTACTL=y +CONFIG_AUTOFS_FS=m +CONFIG_AUTOFS4_FS=m + +# +# CD-ROM/DVD Filesystems +# +CONFIG_ISO9660_FS=y +CONFIG_JOLIET=y +CONFIG_ZISOFS=y +CONFIG_ZISOFS_FS=y +CONFIG_UDF_FS=m + +# +# DOS/FAT/NT Filesystems +# +CONFIG_FAT_FS=m +CONFIG_MSDOS_FS=m +CONFIG_VFAT_FS=m +CONFIG_NTFS_FS=m +# CONFIG_NTFS_DEBUG is not set +# CONFIG_NTFS_RW is not set + +# +# Pseudo filesystems +# +CONFIG_PROC_FS=y +CONFIG_PROC_KCORE=y +# CONFIG_DEVFS_FS is not set +CONFIG_DEVPTS_FS_XATTR=y +CONFIG_DEVPTS_FS_SECURITY=y +CONFIG_TMPFS=y +CONFIG_HUGETLBFS=y +CONFIG_HUGETLB_PAGE=y +CONFIG_RAMFS=y +CONFIG_RELAYFS_FS=m +# CONFIG_KLOG_CHANNEL is not set + +# +# Miscellaneous filesystems +# +CONFIG_ADFS_FS=m +# CONFIG_ADFS_FS_RW is not set +CONFIG_AFFS_FS=m +CONFIG_HFS_FS=m +CONFIG_HFSPLUS_FS=m +CONFIG_BEFS_FS=m +# CONFIG_BEFS_DEBUG is not set +CONFIG_BFS_FS=m +CONFIG_EFS_FS=m +CONFIG_JFFS_FS=m +CONFIG_JFFS_FS_VERBOSE=0 +CONFIG_JFFS2_FS=m +CONFIG_JFFS2_FS_DEBUG=0 +# CONFIG_JFFS2_FS_NAND is not set +CONFIG_CRAMFS=m +CONFIG_VXFS_FS=m +CONFIG_HPFS_FS=m +CONFIG_QNX4FS_FS=m +# CONFIG_QNX4FS_RW is not set +CONFIG_SYSV_FS=m +CONFIG_UFS_FS=m +# CONFIG_UFS_FS_WRITE is not set + +# +# Network File Systems +# +CONFIG_NFS_FS=y +CONFIG_NFS_V3=y +CONFIG_NFS_ACL=y +CONFIG_NFS_V4=y +CONFIG_NFS_DIRECTIO=y +CONFIG_NFSD=m +CONFIG_NFSD_V3=y +CONFIG_NFSD_ACL=y +CONFIG_NFS_ACL_SUPPORT=y +# CONFIG_NFSD_V4 is not set +CONFIG_NFSD_TCP=y +CONFIG_LOCKD=y +CONFIG_STATD=y +CONFIG_LOCKD_V4=y +CONFIG_EXPORTFS=m +CONFIG_SUNRPC=y +CONFIG_SUNRPC_GSS=y +CONFIG_RPCSEC_GSS_KRB5=y +CONFIG_SMB_FS=m +CONFIG_SMB_NLS_DEFAULT=y +CONFIG_SMB_NLS_REMOTE="cp850" +CONFIG_CIFS=m +CONFIG_CIFS_STATS=y +CONFIG_CIFS_XATTR=y +CONFIG_CIFS_POSIX=y +CONFIG_NCP_FS=m +CONFIG_NCPFS_PACKET_SIGNING=y +CONFIG_NCPFS_IOCTL_LOCKING=y +CONFIG_NCPFS_STRONG=y +CONFIG_NCPFS_NFS_NS=y +CONFIG_NCPFS_OS2_NS=y +CONFIG_NCPFS_SMALLDOS=y +CONFIG_NCPFS_NLS=y +CONFIG_NCPFS_EXTRAS=y +CONFIG_CODA_FS=m +# CONFIG_CODA_FS_OLD_API is not 
set +# CONFIG_INTERMEZZO_FS is not set +CONFIG_AFS_FS=m +CONFIG_RXRPC=m + +# +# Partition Types +# +CONFIG_PARTITION_ADVANCED=y +# CONFIG_ACORN_PARTITION is not set +CONFIG_OSF_PARTITION=y +# CONFIG_AMIGA_PARTITION is not set +CONFIG_ATARI_PARTITION=y +CONFIG_MAC_PARTITION=y +CONFIG_MSDOS_PARTITION=y +CONFIG_BSD_DISKLABEL=y +# CONFIG_MINIX_SUBPARTITION is not set +CONFIG_SOLARIS_X86_PARTITION=y +CONFIG_UNIXWARE_DISKLABEL=y +CONFIG_LDM_PARTITION=y +# CONFIG_LDM_DEBUG is not set +CONFIG_NEC98_PARTITION=y +CONFIG_SGI_PARTITION=y +CONFIG_ULTRIX_PARTITION=y +CONFIG_SUN_PARTITION=y +CONFIG_EFI_PARTITION=y + +# +# Native Language Support +# +CONFIG_NLS=y +CONFIG_NLS_DEFAULT="utf8" +CONFIG_NLS_CODEPAGE_437=m +CONFIG_NLS_CODEPAGE_737=m +CONFIG_NLS_CODEPAGE_775=m +CONFIG_NLS_CODEPAGE_850=m +CONFIG_NLS_CODEPAGE_852=m +CONFIG_NLS_CODEPAGE_855=m +CONFIG_NLS_CODEPAGE_857=m +CONFIG_NLS_CODEPAGE_860=m +CONFIG_NLS_CODEPAGE_861=m +CONFIG_NLS_CODEPAGE_862=m +CONFIG_NLS_CODEPAGE_863=m +CONFIG_NLS_CODEPAGE_864=m +CONFIG_NLS_CODEPAGE_865=m +CONFIG_NLS_CODEPAGE_866=m +CONFIG_NLS_CODEPAGE_869=m +CONFIG_NLS_CODEPAGE_936=m +CONFIG_NLS_CODEPAGE_950=m +CONFIG_NLS_CODEPAGE_932=m +CONFIG_NLS_CODEPAGE_949=m +CONFIG_NLS_CODEPAGE_874=m +CONFIG_NLS_ISO8859_8=m +CONFIG_NLS_CODEPAGE_1250=m +CONFIG_NLS_CODEPAGE_1251=m +CONFIG_NLS_ISO8859_1=m +CONFIG_NLS_ISO8859_2=m +CONFIG_NLS_ISO8859_3=m +CONFIG_NLS_ISO8859_4=m +CONFIG_NLS_ISO8859_5=m +CONFIG_NLS_ISO8859_6=m +CONFIG_NLS_ISO8859_7=m +CONFIG_NLS_ISO8859_9=m +CONFIG_NLS_ISO8859_13=m +CONFIG_NLS_ISO8859_14=m +CONFIG_NLS_ISO8859_15=m +CONFIG_NLS_KOI8_R=m +CONFIG_NLS_KOI8_U=m +CONFIG_NLS_UTF8=m +CONFIG_FSHOOKS=y + +# +# Profiling support +# +CONFIG_PROFILING=y +CONFIG_OPROFILE=m + +# +# Kernel hacking +# +CONFIG_CRASH_DUMP=m +CONFIG_KERNTYPES=y +CONFIG_CRASH_DUMP_BLOCKDEV=m +CONFIG_CRASH_DUMP_NETDEV=m +# CONFIG_CRASH_DUMP_MEMDEV is not set +CONFIG_CRASH_DUMP_COMPRESS_RLE=m +CONFIG_CRASH_DUMP_COMPRESS_GZIP=m +CONFIG_DEBUG_KERNEL=y +CONFIG_EARLY_PRINTK=y +# CONFIG_KPROBES is not set +# CONFIG_DEBUGREG is not set +CONFIG_DEBUG_STACKOVERFLOW=y +# CONFIG_DEBUG_STACK_USAGE is not set +# CONFIG_DEBUG_SLAB is not set +CONFIG_MAGIC_SYSRQ=y +# CONFIG_DEBUG_SPINLOCK is not set +# CONFIG_DEBUG_PAGEALLOC is not set +# CONFIG_DEBUG_HIGHMEM is not set +# CONFIG_DEBUG_INFO is not set +# CONFIG_DEBUG_SPINLOCK_SLEEP is not set +# CONFIG_FRAME_POINTER is not set +# CONFIG_KDB is not set +CONFIG_X86_FIND_SMP_CONFIG=y +CONFIG_X86_MPPARSE=y +# CONFIG_HOOK is not set + +# +# Security options +# +CONFIG_SECURITY=y +CONFIG_SECURITY_NETWORK=y +CONFIG_SECURITY_CAPABILITIES=m +CONFIG_SECURITY_ROOTPLUG=m +CONFIG_SECURITY_SELINUX=y +CONFIG_SECURITY_SELINUX_BOOTPARAM=y +CONFIG_SECURITY_SELINUX_DEVELOP=y +# CONFIG_SECURITY_SELINUX_MLS is not set + +# +# IBM Crypto Hardware support +# +CONFIG_IBM_CRYPTO=m +CONFIG_ICA_LEEDSLITE=m + +# +# Cryptographic options +# +CONFIG_CRYPTO=y +CONFIG_CRYPTO_HMAC=y +CONFIG_CRYPTO_NULL=m +CONFIG_CRYPTO_MD4=m +CONFIG_CRYPTO_MD5=y +CONFIG_CRYPTO_SHA1=m +CONFIG_CRYPTO_SHA256=m +CONFIG_CRYPTO_SHA512=m +CONFIG_CRYPTO_DES=y +CONFIG_CRYPTO_BLOWFISH=m +CONFIG_CRYPTO_TWOFISH=m +CONFIG_CRYPTO_SERPENT=m +CONFIG_CRYPTO_AES=m +CONFIG_CRYPTO_CAST5=m +CONFIG_CRYPTO_CAST6=m +CONFIG_CRYPTO_ARC4=m +CONFIG_CRYPTO_DEFLATE=m +CONFIG_CRYPTO_MICHAEL_MIC=m +CONFIG_CRYPTO_TEST=m + +# +# Library routines +# +CONFIG_CRC32=y +CONFIG_QSORT=y +CONFIG_ZLIB_INFLATE=y +CONFIG_ZLIB_DEFLATE=m + +# +# Build options +# +CONFIG_SUSE_KERNEL=y +CONFIG_CFGNAME="bigsmp" +CONFIG_RELEASE="SLES9_SP1_BRANCH_2004110217390391" 
+CONFIG_X86_SMP=y +CONFIG_X86_HT=y +CONFIG_X86_BIOS_REBOOT=y +CONFIG_X86_TRAMPOLINE=y +CONFIG_PC=y diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ppc-pseries64.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ppc-pseries64.config new file mode 100644 index 0000000..0048533 --- /dev/null +++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ppc-pseries64.config @@ -0,0 +1,1452 @@ +# +# Automatically generated make config: don't edit +# +CONFIG_64BIT=y +CONFIG_MMU=y +CONFIG_RWSEM_XCHGADD_ALGORITHM=y +CONFIG_GENERIC_ISA_DMA=y +CONFIG_HAVE_DEC_LOCK=y +CONFIG_EARLY_PRINTK=y +CONFIG_COMPAT=y +CONFIG_FRAME_POINTER=y +CONFIG_FORCE_MAX_ZONEORDER=13 + +# +# Code maturity level options +# +CONFIG_EXPERIMENTAL=y +CONFIG_CLEAN_COMPILE=y +CONFIG_STANDALONE=y + +# +# General setup +# +CONFIG_SWAP=y +CONFIG_SYSVIPC=y +CONFIG_POSIX_MQUEUE=y +CONFIG_BSD_PROCESS_ACCT=y +CONFIG_SYSCTL=y +CONFIG_LOG_BUF_SHIFT=19 +CONFIG_HOTPLUG=y +CONFIG_EVLOG=y +# CONFIG_EVLOG_FWPRINTK is not set +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y +# CONFIG_EMBEDDED is not set + +# +# Class Based Kernel Resource Management +# +CONFIG_CKRM=y +CONFIG_RCFS_FS=m +CONFIG_CKRM_TYPE_TASKCLASS=y +CONFIG_CKRM_RES_NUMTASKS=m +CONFIG_CKRM_CPU_SCHEDULE=y +# CONFIG_CKRM_CPU_SCHEDULE_AT_BOOT is not set +CONFIG_CKRM_TYPE_SOCKETCLASS=y +CONFIG_CKRM_RBCE=m +CONFIG_CKRM_CRBCE=m +CONFIG_DELAY_ACCT=y +CONFIG_KALLSYMS=y +CONFIG_FUTEX=y +CONFIG_EPOLL=y +CONFIG_IOSCHED_NOOP=y +CONFIG_IOSCHED_AS=y +CONFIG_IOSCHED_DEADLINE=y +CONFIG_IOSCHED_CFQ=y +# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set + +# +# Loadable module support +# +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y +# CONFIG_MODULE_FORCE_UNLOAD is not set +CONFIG_OBSOLETE_MODPARM=y +CONFIG_MODVERSIONS=y +CONFIG_KMOD=y +CONFIG_STOP_MACHINE=y + +# +# Platform support +# +# CONFIG_PPC_ISERIES is not set +CONFIG_PPC_PSERIES=y +CONFIG_PPC=y +CONFIG_PPC64=y +CONFIG_PPC_OF=y +CONFIG_ALTIVEC=y +# CONFIG_PPC_PMAC is not set +CONFIG_PPC_SPLPAR=y +# CONFIG_BOOTX_TEXT is not set +# CONFIG_POWER4_ONLY is not set +# CONFIG_IOMMU_VMERGE is not set +CONFIG_SMP=y +CONFIG_IRQ_ALL_CPUS=y +CONFIG_NR_CPUS=128 +# CONFIG_HMT is not set +CONFIG_DISCONTIGMEM=y +CONFIG_NUMA=y +CONFIG_SCHED_SMT=y +CONFIG_PPC_RTAS=y +CONFIG_RTAS_FLASH=m +CONFIG_SCANLOG=m +CONFIG_LPARCFG=y +CONFIG_PPC_VPURR=y + +# +# General setup +# +CONFIG_PCI=y +CONFIG_PCI_DOMAINS=y +CONFIG_BINFMT_ELF=y +CONFIG_BINFMT_MISC=m +# CONFIG_PCI_LEGACY_PROC is not set +# CONFIG_PCI_NAMES is not set +CONFIG_HOTPLUG_CPU=y + +# +# PCMCIA/CardBus support +# +# CONFIG_PCMCIA is not set + +# +# PCI Hotplug Support +# +CONFIG_HOTPLUG_PCI=y +# CONFIG_HOTPLUG_PCI_FAKE is not set +# CONFIG_HOTPLUG_PCI_CPCI is not set +# CONFIG_HOTPLUG_PCI_PCIE is not set +# CONFIG_HOTPLUG_PCI_SHPC is not set +CONFIG_HOTPLUG_PCI_RPA=y +CONFIG_HOTPLUG_PCI_RPA_DLPAR=y +CONFIG_PROC_DEVICETREE=y +# CONFIG_CMDLINE_BOOL is not set + +# +# Device Drivers +# + +# +# Generic Driver Options +# +CONFIG_FW_LOADER=m +# CONFIG_DEBUG_DRIVER is not set + +# +# Memory Technology Devices (MTD) +# +# CONFIG_MTD is not set + +# +# Bluesmoke - error detection and reporting (RAS) +# +# CONFIG_BLUESMOKE is not set + +# +# Parallel port support +# +CONFIG_PARPORT=m +CONFIG_PARPORT_PC=m +CONFIG_PARPORT_PC_CML1=m +CONFIG_PARPORT_SERIAL=m +CONFIG_PARPORT_PC_FIFO=y +CONFIG_PARPORT_PC_SUPERIO=y +CONFIG_PARPORT_OTHER=y +CONFIG_PARPORT_1284=y + +# +# Plug and Play support +# + +# +# Block devices +# +CONFIG_BLK_DEV_FD=y +# CONFIG_PARIDE is not set +# CONFIG_BLK_CPQ_DA is not set 
+# CONFIG_BLK_CPQ_CISS_DA is not set +# CONFIG_BLK_DEV_DAC960 is not set +# CONFIG_BLK_DEV_UMEM is not set +CONFIG_BLK_DEV_LOOP=y +CONFIG_BLK_DEV_CRYPTOLOOP=m +CONFIG_BLK_DEV_NBD=m +# CONFIG_BLK_DEV_CARMEL is not set +CONFIG_BLK_DEV_RAM=y +CONFIG_BLK_DEV_RAM_SIZE=123456 +CONFIG_BLK_DEV_INITRD=y +CONFIG_CIPHER_TWOFISH=m + +# +# ATA/ATAPI/MFM/RLL support +# +CONFIG_IDE=y +CONFIG_BLK_DEV_IDE=y + +# +# Please see Documentation/ide.txt for help/info on IDE drives +# +CONFIG_BLK_DEV_IDEDISK=y +# CONFIG_IDEDISK_MULTI_MODE is not set +# CONFIG_IDEDISK_STROKE is not set +CONFIG_BLK_DEV_IDECD=y +# CONFIG_BLK_DEV_IDETAPE is not set +# CONFIG_BLK_DEV_IDEFLOPPY is not set +# CONFIG_BLK_DEV_IDESCSI is not set +CONFIG_IDE_TASK_IOCTL=y +# CONFIG_IDE_TASKFILE_IO is not set + +# +# IDE chipset support/bugfixes +# +CONFIG_IDE_GENERIC=y +CONFIG_BLK_DEV_IDEPCI=y +CONFIG_IDEPCI_SHARE_IRQ=y +# CONFIG_BLK_DEV_OFFBOARD is not set +CONFIG_BLK_DEV_GENERIC=y +# CONFIG_BLK_DEV_OPTI621 is not set +CONFIG_BLK_DEV_SL82C105=y +CONFIG_BLK_DEV_IDEDMA_PCI=y +CONFIG_BLK_DEV_IDEDMA_FORCED=y +CONFIG_IDEDMA_PCI_AUTO=y +# CONFIG_IDEDMA_ONLYDISK is not set +CONFIG_BLK_DEV_ADMA=y +# CONFIG_BLK_DEV_AEC62XX is not set +# CONFIG_BLK_DEV_ALI15X3 is not set +CONFIG_BLK_DEV_AMD74XX=y +# CONFIG_BLK_DEV_CMD64X is not set +# CONFIG_BLK_DEV_TRIFLEX is not set +# CONFIG_BLK_DEV_CY82C693 is not set +# CONFIG_BLK_DEV_CS5520 is not set +# CONFIG_BLK_DEV_CS5530 is not set +# CONFIG_BLK_DEV_HPT34X is not set +# CONFIG_BLK_DEV_HPT366 is not set +# CONFIG_BLK_DEV_SC1200 is not set +# CONFIG_BLK_DEV_PIIX is not set +# CONFIG_BLK_DEV_NS87415 is not set +CONFIG_BLK_DEV_PDC202XX_OLD=y +CONFIG_PDC202XX_BURST=y +CONFIG_BLK_DEV_PDC202XX_NEW=y +# CONFIG_PDC202XX_FORCE is not set +# CONFIG_BLK_DEV_SVWKS is not set +CONFIG_BLK_DEV_SIIMAGE=y +# CONFIG_BLK_DEV_SLC90E66 is not set +# CONFIG_BLK_DEV_TRM290 is not set +# CONFIG_BLK_DEV_VIA82CXXX is not set +CONFIG_BLK_DEV_IDEDMA=y +# CONFIG_IDEDMA_IVB is not set +CONFIG_IDEDMA_AUTO=y +# CONFIG_BLK_DEV_HD is not set + +# +# SCSI device support +# +CONFIG_SCSI=m +CONFIG_SCSI_PROC_FS=y + +# +# SCSI support type (disk, tape, CD-ROM) +# +CONFIG_BLK_DEV_SD=m +CONFIG_SD_IOSTATS=y +CONFIG_CHR_DEV_ST=m +# CONFIG_CHR_DEV_OSST is not set +CONFIG_BLK_DEV_SR=m +CONFIG_BLK_DEV_SR_VENDOR=y +CONFIG_CHR_DEV_SG=m +CONFIG_CHR_DEV_SCH=m + +# +# Some SCSI devices (e.g. 
CD jukebox) support multiple LUNs +# +CONFIG_SCSI_MULTI_LUN=y +CONFIG_SCSI_CONSTANTS=y +CONFIG_SCSI_LOGGING=y + +# +# SCSI Transport Attributes +# +CONFIG_SCSI_SPI_ATTRS=m +CONFIG_SCSI_FC_ATTRS=m + +# +# SCSI low-level drivers +# +# CONFIG_BLK_DEV_3W_XXXX_RAID is not set +# CONFIG_SCSI_ACARD is not set +# CONFIG_SCSI_AACRAID is not set +# CONFIG_SCSI_AIC7XXX is not set +# CONFIG_SCSI_AIC7XXX_OLD is not set +# CONFIG_SCSI_AIC79XX is not set +# CONFIG_SCSI_AIC79XX_NEW is not set +# CONFIG_SCSI_ADVANSYS is not set +# CONFIG_MEGARAID_NEWGEN is not set +# CONFIG_MEGARAID_LEGACY is not set +# CONFIG_SCSI_SATA is not set +# CONFIG_SCSI_BUSLOGIC is not set +# CONFIG_SCSI_CPQFCTS is not set +# CONFIG_SCSI_DMX3191D is not set +# CONFIG_SCSI_EATA is not set +# CONFIG_SCSI_EATA_PIO is not set +# CONFIG_SCSI_FUTURE_DOMAIN is not set +# CONFIG_SCSI_GDTH is not set +# CONFIG_SCSI_IPS is not set +CONFIG_SCSI_IBMVSCSI=m +CONFIG_SCSI_IBMVSCSIS=m +# CONFIG_SCSI_INIA100 is not set +# CONFIG_SCSI_PPA is not set +# CONFIG_SCSI_IMM is not set +CONFIG_SCSI_SYM53C8XX_2=m +CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=0 +CONFIG_SCSI_SYM53C8XX_DEFAULT_TAGS=16 +CONFIG_SCSI_SYM53C8XX_MAX_TAGS=64 +# CONFIG_SCSI_SYM53C8XX_IOMAPPED is not set +CONFIG_SCSI_LPFC=m +CONFIG_SCSI_IPR=m +CONFIG_SCSI_IPR_TRACE=y +CONFIG_SCSI_IPR_DUMP=y +# CONFIG_SCSI_QLOGIC_ISP is not set +# CONFIG_SCSI_QLOGIC_FC is not set +# CONFIG_SCSI_QLOGIC_1280 is not set +CONFIG_SCSI_QLA2XXX=m +# CONFIG_SCSI_QLA21XX is not set +# CONFIG_SCSI_QLA22XX is not set +CONFIG_SCSI_QLA2300=m +# CONFIG_SCSI_QLA2322 is not set +# CONFIG_SCSI_QLA6312 is not set +# CONFIG_SCSI_QLA6322 is not set +CONFIG_SCSI_QLA2XXX_FAILOVER=y +CONFIG_SCSI_QLA4XXX=m +CONFIG_SCSI_QLA4XXX_FAILOVER=y +# CONFIG_SCSI_DC395x is not set +# CONFIG_SCSI_DC390T is not set +CONFIG_SCSI_DEBUG=m + +# +# Multi-device support (RAID and LVM) +# +CONFIG_MD=y +CONFIG_BLK_DEV_MD=y +CONFIG_MD_LINEAR=m +CONFIG_MD_RAID0=m +CONFIG_MD_RAID1=m +CONFIG_MD_RAID5=m +CONFIG_MD_RAID6=m +CONFIG_MD_MULTIPATH=m +CONFIG_BLK_DEV_DM=m +CONFIG_DM_CRYPT=m +CONFIG_DM_MULTIPATH=m +CONFIG_DM_SNAPSHOT=m +CONFIG_DM_MIRROR=m +CONFIG_DM_ZERO=m +CONFIG_DM_FLAKEY=m +CONFIG_BLK_DEV_DM_BBR=m + +# +# Fusion MPT device support +# +# CONFIG_FUSION is not set + +# +# IEEE 1394 (FireWire) support +# +CONFIG_IEEE1394=m + +# +# Subsystem Options +# +# CONFIG_IEEE1394_VERBOSEDEBUG is not set +# CONFIG_IEEE1394_OUI_DB is not set +CONFIG_IEEE1394_EXTRA_CONFIG_ROMS=y +CONFIG_IEEE1394_CONFIG_ROM_IP1394=y + +# +# Device Drivers +# + +# +# Texas Instruments PCILynx requires I2C +# +CONFIG_IEEE1394_OHCI1394=m + +# +# Protocol Drivers +# +CONFIG_IEEE1394_VIDEO1394=m +CONFIG_IEEE1394_SBP2=m +# CONFIG_IEEE1394_SBP2_PHYS_DMA is not set +CONFIG_IEEE1394_ETH1394=m +CONFIG_IEEE1394_DV1394=m +CONFIG_IEEE1394_RAWIO=m +CONFIG_IEEE1394_CMP=m +CONFIG_IEEE1394_AMDTP=m + +# +# I2O device support +# +# CONFIG_I2O is not set + +# +# Macintosh device drivers +# + +# +# Networking support +# +CONFIG_NET=y + +# +# Networking options +# +CONFIG_PACKET=y +CONFIG_PACKET_MMAP=y +CONFIG_NETLINK_DEV=y +CONFIG_UNIX=y +CONFIG_NET_KEY=y +CONFIG_INET=y +CONFIG_IP_MULTICAST=y +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_IP_MULTIPLE_TABLES=y +# CONFIG_IP_ROUTE_FWMARK is not set +CONFIG_IP_ROUTE_NAT=y +CONFIG_IP_ROUTE_MULTIPATH=y +CONFIG_IP_ROUTE_TOS=y +# CONFIG_IP_ROUTE_VERBOSE is not set +# CONFIG_IP_PNP is not set +CONFIG_NET_IPIP=m +CONFIG_NET_IPGRE=m +CONFIG_NET_IPGRE_BROADCAST=y +CONFIG_IP_MROUTE=y +CONFIG_IP_PIMSM_V1=y +CONFIG_IP_PIMSM_V2=y +# CONFIG_ARPD is not set 
+CONFIG_SYN_COOKIES=y +CONFIG_INET_AH=m +CONFIG_INET_ESP=m +CONFIG_INET_IPCOMP=m +# CONFIG_ACCEPT_QUEUES is not set + +# +# IP: Virtual Server Configuration +# +# CONFIG_IP_VS is not set +CONFIG_IPV6=m +CONFIG_IPV6_SUBTREES=y +CONFIG_IPV6_PRIVACY=y +CONFIG_IPV6_NDISC_NEW=y +CONFIG_INET6_AH=m +CONFIG_INET6_ESP=m +CONFIG_INET6_IPCOMP=m +CONFIG_IPV6_TUNNEL=m + +# +# MOBILE IPv6 (EXPERIMENTAL) +# +CONFIG_IPV6_MOBILITY=m +CONFIG_IPV6_MOBILITY_MN=m +CONFIG_IPV6_MOBILITY_HA=m +# CONFIG_IPV6_MOBILITY_DEBUG is not set +# CONFIG_DECNET is not set +CONFIG_BRIDGE=m +CONFIG_NETFILTER=y +# CONFIG_NETFILTER_DEBUG is not set +CONFIG_BRIDGE_NETFILTER=y + +# +# IP: Netfilter Configuration +# +CONFIG_IP_NF_CONNTRACK=m +CONFIG_IP_NF_FTP=m +CONFIG_IP_NF_IRC=m +CONFIG_IP_NF_TFTP=m +CONFIG_IP_NF_AMANDA=m +CONFIG_IP_NF_QUEUE=m +CONFIG_IP_NF_IPTABLES=m +CONFIG_IP_NF_MATCH_LIMIT=m +CONFIG_IP_NF_MATCH_IPRANGE=m +CONFIG_IP_NF_MATCH_MAC=m +CONFIG_IP_NF_MATCH_PKTTYPE=m +CONFIG_IP_NF_MATCH_POLICY=m +CONFIG_IP_NF_MATCH_MARK=m +CONFIG_IP_NF_MATCH_MULTIPORT=m +CONFIG_IP_NF_MATCH_TOS=m +CONFIG_IP_NF_MATCH_RECENT=m +CONFIG_IP_NF_MATCH_ECN=m +CONFIG_IP_NF_MATCH_DSCP=m +CONFIG_IP_NF_MATCH_AH_ESP=m +CONFIG_IP_NF_MATCH_LENGTH=m +CONFIG_IP_NF_MATCH_TTL=m +CONFIG_IP_NF_MATCH_TCPMSS=m +CONFIG_IP_NF_MATCH_HELPER=m +CONFIG_IP_NF_MATCH_STATE=m +CONFIG_IP_NF_MATCH_CONNTRACK=m +CONFIG_IP_NF_MATCH_OWNER=m +CONFIG_IP_NF_MATCH_PHYSDEV=m +CONFIG_IP_NF_FILTER=m +CONFIG_IP_NF_TARGET_REJECT=m +CONFIG_IP_NF_NAT=m +CONFIG_IP_NF_NAT_NEEDED=y +CONFIG_IP_NF_TARGET_MASQUERADE=m +CONFIG_IP_NF_TARGET_REDIRECT=m +CONFIG_IP_NF_TARGET_NETMAP=m +CONFIG_IP_NF_TARGET_SAME=m +CONFIG_IP_NF_NAT_LOCAL=y +CONFIG_IP_NF_NAT_SNMP_BASIC=m +CONFIG_IP_NF_NAT_IRC=m +CONFIG_IP_NF_NAT_FTP=m +CONFIG_IP_NF_NAT_TFTP=m +CONFIG_IP_NF_NAT_AMANDA=m +CONFIG_IP_NF_MANGLE=m +CONFIG_IP_NF_TARGET_TOS=m +CONFIG_IP_NF_TARGET_ECN=m +CONFIG_IP_NF_TARGET_DSCP=m +CONFIG_IP_NF_TARGET_MARK=m +CONFIG_IP_NF_TARGET_CLASSIFY=m +CONFIG_IP_NF_TARGET_LOG=m +CONFIG_IP_NF_TARGET_ULOG=m +CONFIG_IP_NF_TARGET_TCPMSS=m +CONFIG_IP_NF_ARPTABLES=m +CONFIG_IP_NF_ARPFILTER=m +CONFIG_IP_NF_ARP_MANGLE=m +# CONFIG_IP_NF_COMPAT_IPCHAINS is not set +# CONFIG_IP_NF_COMPAT_IPFWADM is not set +CONFIG_IP_NF_CONNTRACK_MARK=y +CONFIG_IP_NF_TARGET_CONNMARK=m +CONFIG_IP_NF_MATCH_CONNMARK=m +CONFIG_IP_NF_TARGET_CLUSTERIP=m + +# +# IPv6: Netfilter Configuration +# +CONFIG_IP6_NF_FTP=m +CONFIG_IP6_NF_QUEUE=m +CONFIG_IP6_NF_IPTABLES=m +CONFIG_IP6_NF_MATCH_LIMIT=m +CONFIG_IP6_NF_MATCH_MAC=m +CONFIG_IP6_NF_MATCH_RT=m +CONFIG_IP6_NF_MATCH_OPTS=m +CONFIG_IP6_NF_MATCH_FRAG=m +CONFIG_IP6_NF_MATCH_HL=m +CONFIG_IP6_NF_MATCH_MULTIPORT=m +CONFIG_IP6_NF_MATCH_OWNER=m +CONFIG_IP6_NF_MATCH_MARK=m +CONFIG_IP6_NF_MATCH_IPV6HEADER=m +CONFIG_IP6_NF_MATCH_AHESP=m +CONFIG_IP6_NF_MATCH_LENGTH=m +CONFIG_IP6_NF_MATCH_EUI64=m +CONFIG_IP6_NF_CONNTRACK=m +CONFIG_IP6_NF_MATCH_STATE=m +CONFIG_IP6_NF_FILTER=m +CONFIG_IP6_NF_TARGET_LOG=m +CONFIG_IP6_NF_TARGET_REJECT=m +CONFIG_IP6_NF_MANGLE=m +CONFIG_IP6_NF_TARGET_MARK=m + +# +# Bridge: Netfilter Configuration +# +CONFIG_BRIDGE_NF_EBTABLES=m +CONFIG_BRIDGE_EBT_BROUTE=m +CONFIG_BRIDGE_EBT_T_FILTER=m +CONFIG_BRIDGE_EBT_T_NAT=m +CONFIG_BRIDGE_EBT_802_3=m +CONFIG_BRIDGE_EBT_AMONG=m +CONFIG_BRIDGE_EBT_ARP=m +CONFIG_BRIDGE_EBT_IP=m +CONFIG_BRIDGE_EBT_LIMIT=m +CONFIG_BRIDGE_EBT_MARK=m +CONFIG_BRIDGE_EBT_PKTTYPE=m +CONFIG_BRIDGE_EBT_STP=m +CONFIG_BRIDGE_EBT_VLAN=m +CONFIG_BRIDGE_EBT_ARPREPLY=m +CONFIG_BRIDGE_EBT_DNAT=m +CONFIG_BRIDGE_EBT_MARK_T=m +CONFIG_BRIDGE_EBT_REDIRECT=m +# CONFIG_BRIDGE_EBT_SNAT is not set 
+CONFIG_BRIDGE_EBT_LOG=m +CONFIG_XFRM=y +CONFIG_XFRM_USER=m + +# +# SCTP Configuration (EXPERIMENTAL) +# +CONFIG_IP_SCTP=m +# CONFIG_SCTP_DBG_MSG is not set +# CONFIG_SCTP_DBG_OBJCNT is not set +CONFIG_SCTP_HMAC_NONE=y +# CONFIG_SCTP_HMAC_SHA1 is not set +# CONFIG_SCTP_HMAC_MD5 is not set +# CONFIG_ATM is not set +CONFIG_VLAN_8021Q=m +CONFIG_LLC=y +CONFIG_LLC2=m +CONFIG_IPX=m +CONFIG_IPX_INTERN=y +CONFIG_ATALK=m +CONFIG_DEV_APPLETALK=y +CONFIG_IPDDP=m +CONFIG_IPDDP_ENCAP=y +CONFIG_IPDDP_DECAP=y +# CONFIG_X25 is not set +# CONFIG_LAPB is not set +# CONFIG_NET_DIVERT is not set +# CONFIG_ECONET is not set +# CONFIG_WAN_ROUTER is not set +# CONFIG_NET_FASTROUTE is not set +# CONFIG_NET_HW_FLOWCONTROL is not set + +# +# QoS and/or fair queueing +# +CONFIG_NET_SCHED=y +CONFIG_NET_SCH_CBQ=m +CONFIG_NET_SCH_HTB=m +CONFIG_NET_SCH_HFSC=m +CONFIG_NET_SCH_CSZ=m +CONFIG_NET_SCH_PRIO=m +CONFIG_NET_SCH_RED=m +CONFIG_NET_SCH_SFQ=m +CONFIG_NET_SCH_TEQL=m +CONFIG_NET_SCH_TBF=m +CONFIG_NET_SCH_GRED=m +CONFIG_NET_SCH_DSMARK=m +CONFIG_NET_SCH_DELAY=m +CONFIG_NET_SCH_INGRESS=m +CONFIG_NET_QOS=y +CONFIG_NET_ESTIMATOR=y +CONFIG_NET_CLS=y +CONFIG_NET_CLS_TCINDEX=m +CONFIG_NET_CLS_ROUTE4=m +CONFIG_NET_CLS_ROUTE=y +CONFIG_NET_CLS_FW=m +CONFIG_NET_CLS_U32=m +CONFIG_NET_CLS_RSVP=m +CONFIG_NET_CLS_RSVP6=m +CONFIG_NET_CLS_POLICE=y + +# +# Network testing +# +CONFIG_NET_PKTGEN=m +CONFIG_NETDEVICES=y + +# +# ARCnet devices +# +# CONFIG_ARCNET is not set +CONFIG_DUMMY=m +CONFIG_BONDING=m +CONFIG_EQUALIZER=m +CONFIG_TUN=m +# CONFIG_ETHERTAP is not set + +# +# Ethernet (10 or 100Mbit) +# +CONFIG_NET_ETHERNET=y +CONFIG_MII=y +# CONFIG_OAKNET is not set +# CONFIG_HAPPYMEAL is not set +# CONFIG_SUNGEM is not set +CONFIG_NET_VENDOR_3COM=y +CONFIG_VORTEX=m +CONFIG_TYPHOON=m + +# +# Tulip family network device support +# +# CONFIG_NET_TULIP is not set +# CONFIG_HP100 is not set +CONFIG_NET_PCI=y +CONFIG_PCNET32=m +# CONFIG_AMD8111_ETH is not set +# CONFIG_ADAPTEC_STARFIRE is not set +# CONFIG_B44 is not set +# CONFIG_FORCEDETH is not set +# CONFIG_DGRS is not set +# CONFIG_EEPRO100 is not set +CONFIG_E100=m +# CONFIG_FEALNX is not set +# CONFIG_NATSEMI is not set +# CONFIG_NE2K_PCI is not set +# CONFIG_8139CP is not set +# CONFIG_8139TOO is not set +# CONFIG_SIS900 is not set +# CONFIG_EPIC100 is not set +# CONFIG_SUNDANCE is not set +# CONFIG_VIA_RHINE is not set + +# +# Ethernet (1000 Mbit) +# +CONFIG_ACENIC=m +CONFIG_ACENIC_OMIT_TIGON_I=y +# CONFIG_DL2K is not set +CONFIG_E1000=m +CONFIG_E1000_NAPI=y +CONFIG_E1000_NEW=m +CONFIG_E1000_NEW_NAPI=y +# CONFIG_NS83820 is not set +# CONFIG_HAMACHI is not set +# CONFIG_YELLOWFIN is not set +# CONFIG_R8169 is not set +# CONFIG_SIS190 is not set +# CONFIG_SK98LIN is not set +CONFIG_TIGON3=m +CONFIG_NET_BROADCOM=m +CONFIG_NET_BROADCOM_NEW=m +# CONFIG_NET_BCM44 is not set +CONFIG_TIGON3_NEW=m + +# +# Ethernet (10000 Mbit) +# +CONFIG_IXGB=m +CONFIG_IXGB_NAPI=y +CONFIG_S2IO=m +CONFIG_S2IO_NAPI=y +CONFIG_IBMVETH=m +# CONFIG_FDDI is not set +# CONFIG_HIPPI is not set +# CONFIG_PLIP is not set +CONFIG_PPP=m +CONFIG_PPP_MULTILINK=y +CONFIG_PPP_FILTER=y +CONFIG_PPP_ASYNC=m +CONFIG_PPP_SYNC_TTY=m +CONFIG_PPP_DEFLATE=m +CONFIG_PPP_BSDCOMP=m +CONFIG_PPP_MPPE=m +CONFIG_PPPOE=m +CONFIG_SLIP=m +CONFIG_SLIP_COMPRESSED=y +CONFIG_SLIP_SMART=y +# CONFIG_SLIP_MODE_SLIP6 is not set + +# +# Wireless LAN (non-hamradio) +# +# CONFIG_NET_RADIO is not set + +# +# Token Ring devices +# +CONFIG_TR=y +CONFIG_IBMOL=m +# CONFIG_IBMLS is not set +# CONFIG_3C359 is not set +# CONFIG_TMS380TR is not set +CONFIG_NET_FC=y 
+CONFIG_NET_LPFC=m +CONFIG_SHAPER=m +CONFIG_NETCONSOLE=m + +# +# Wan interfaces +# +# CONFIG_WAN is not set + +# +# Amateur Radio support +# +# CONFIG_HAMRADIO is not set + +# +# IrDA (infrared) support +# +# CONFIG_IRDA is not set + +# +# Bluetooth support +# +# CONFIG_BT is not set +CONFIG_NETPOLL=y +CONFIG_NETPOLL_RX=y +CONFIG_NETPOLL_TRAP=y +CONFIG_NET_POLL_CONTROLLER=y + +# +# ISDN subsystem +# +# CONFIG_ISDN is not set + +# +# Telephony Support +# +# CONFIG_PHONE is not set + +# +# Input device support +# +CONFIG_INPUT=y + +# +# Userland interfaces +# +CONFIG_INPUT_MOUSEDEV=y +CONFIG_INPUT_MOUSEDEV_PSAUX=y +CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024 +CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768 +CONFIG_INPUT_JOYDEV=m +CONFIG_INPUT_TSDEV=m +CONFIG_INPUT_TSDEV_SCREEN_X=240 +CONFIG_INPUT_TSDEV_SCREEN_Y=320 +CONFIG_INPUT_EVDEV=m +# CONFIG_INPUT_EVBUG is not set + +# +# Input I/O drivers +# +# CONFIG_GAMEPORT is not set +CONFIG_SOUND_GAMEPORT=y +CONFIG_SERIO=y +CONFIG_SERIO_I8042=y +# CONFIG_SERIO_SERPORT is not set +# CONFIG_SERIO_CT82C710 is not set +# CONFIG_SERIO_PARKBD is not set +# CONFIG_SERIO_PCIPS2 is not set + +# +# Input Device Drivers +# +CONFIG_INPUT_KEYBOARD=y +CONFIG_KEYBOARD_ATKBD=y +# CONFIG_KEYBOARD_SUNKBD is not set +# CONFIG_KEYBOARD_LKKBD is not set +# CONFIG_KEYBOARD_XTKBD is not set +# CONFIG_KEYBOARD_NEWTON is not set +# CONFIG_KEYBOARD_POSFILTER is not set +CONFIG_INPUT_MOUSE=y +CONFIG_MOUSE_PS2=y +# CONFIG_MOUSE_SERIAL is not set +# CONFIG_MOUSE_VSXXXAA is not set +# CONFIG_INPUT_JOYSTICK is not set +# CONFIG_INPUT_TOUCHSCREEN is not set +CONFIG_INPUT_MISC=y +CONFIG_INPUT_PCSPKR=m +CONFIG_INPUT_UINPUT=m + +# +# Character devices +# +CONFIG_VT=y +CONFIG_VT_CONSOLE=y +CONFIG_HW_CONSOLE=y +CONFIG_ECC=m +# CONFIG_SERIAL_NONSTANDARD is not set + +# +# Serial drivers +# +CONFIG_SERIAL_8250=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_SERIAL_8250_NR_UARTS=4 +# CONFIG_SERIAL_8250_EXTENDED is not set + +# +# Non-8250 serial port support +# +CONFIG_SERIAL_CORE=y +CONFIG_SERIAL_CORE_CONSOLE=y +# CONFIG_SERIAL_PMACZILOG is not set +CONFIG_SERIAL_ICOM=m +CONFIG_SERIAL_JSM=m +CONFIG_UNIX98_PTYS=y +CONFIG_LEGACY_PTYS=y +CONFIG_LEGACY_PTY_COUNT=256 +# CONFIG_PRINTER is not set +# CONFIG_PPDEV is not set +# CONFIG_TIPAR is not set +CONFIG_HVC_CONSOLE=y +CONFIG_HVCS=m +# CONFIG_QIC02_TAPE is not set + +# +# IPMI +# +# CONFIG_IPMI_HANDLER is not set + +# +# Watchdog Cards +# +CONFIG_WATCHDOG=y +# CONFIG_WATCHDOG_NOWAYOUT is not set + +# +# Watchdog Device Drivers +# +CONFIG_SOFT_WATCHDOG=m + +# +# PCI-based Watchdog Cards +# +# CONFIG_PCIPCWATCHDOG is not set +# CONFIG_WDTPCI is not set + +# +# USB-based Watchdog Cards +# +# CONFIG_USBPCWATCHDOG is not set +# CONFIG_RTC is not set +# CONFIG_GEN_RTC is not set +# CONFIG_DTLK is not set +# CONFIG_R3964 is not set +# CONFIG_APPLICOM is not set + +# +# Ftape, the floppy tape device driver +# +# CONFIG_AGP is not set +# CONFIG_DRM is not set +CONFIG_RAW_DRIVER=m +CONFIG_MAX_RAW_DEVS=4096 + +# +# Linux InfraRed Controller +# +# CONFIG_LIRC_SUPPORT is not set +# CONFIG_LIRC_HOMEBREW is not set + +# +# I2C support +# +# CONFIG_I2C is not set + +# +# Misc devices +# + +# +# Multimedia devices +# +# CONFIG_VIDEO_DEV is not set + +# +# Digital Video Broadcasting Devices +# +# CONFIG_DVB is not set + +# +# Graphics support +# +CONFIG_FB=y +# CONFIG_FB_PM2 is not set +# CONFIG_FB_CYBER2000 is not set +CONFIG_FB_OF=y +# CONFIG_FB_CT65550 is not set +# CONFIG_FB_IMSTT is not set +# CONFIG_FB_S3TRIO is not set +# CONFIG_FB_VGA16 is not set +# CONFIG_FB_RIVA is not set 
+CONFIG_FB_MATROX=y +CONFIG_FB_MATROX_MILLENIUM=y +CONFIG_FB_MATROX_MYSTIQUE=y +CONFIG_FB_MATROX_G450=y +CONFIG_FB_MATROX_G100=y +CONFIG_FB_MATROX_MULTIHEAD=y +# CONFIG_FB_RADEON_OLD is not set +# CONFIG_FB_RADEON is not set +# CONFIG_FB_ATY128 is not set +# CONFIG_FB_ATY is not set +# CONFIG_FB_SIS is not set +# CONFIG_FB_NEOMAGIC is not set +# CONFIG_FB_KYRO is not set +# CONFIG_FB_3DFX is not set +# CONFIG_FB_VOODOO1 is not set +# CONFIG_FB_TRIDENT is not set +# CONFIG_FB_VIRTUAL is not set + +# +# Console display driver support +# +# CONFIG_VGA_CONSOLE is not set +# CONFIG_MDA_CONSOLE is not set +CONFIG_DUMMY_CONSOLE=y +CONFIG_FRAMEBUFFER_CONSOLE=y +CONFIG_PCI_CONSOLE=y +# CONFIG_FONTS is not set +CONFIG_FONT_8x8=y +CONFIG_FONT_8x16=y + +# +# Logo configuration +# +# CONFIG_LOGO is not set + +# +# Bootsplash configuration +# + +# +# Sound +# +# CONFIG_SOUND is not set + +# +# USB support +# +CONFIG_USB=m +# CONFIG_USB_DEBUG is not set + +# +# Miscellaneous USB options +# +CONFIG_USB_DEVICEFS=y +# CONFIG_USB_BANDWIDTH is not set +# CONFIG_USB_DYNAMIC_MINORS is not set + +# +# USB Host Controller Drivers +# +CONFIG_USB_EHCI_HCD=m +CONFIG_USB_EHCI_SPLIT_ISO=y +CONFIG_USB_EHCI_ROOT_HUB_TT=y +CONFIG_USB_OHCI_HCD=m +# CONFIG_USB_UHCI_HCD is not set + +# +# USB Device Class drivers +# +# CONFIG_USB_BLUETOOTH_TTY is not set +CONFIG_USB_ACM=m +CONFIG_USB_PRINTER=m +CONFIG_USB_STORAGE=m +# CONFIG_USB_STORAGE_DEBUG is not set +CONFIG_USB_STORAGE_DATAFAB=y +CONFIG_USB_STORAGE_FREECOM=y +CONFIG_USB_STORAGE_ISD200=y +CONFIG_USB_STORAGE_DPCM=y +CONFIG_USB_STORAGE_HP8200e=y +CONFIG_USB_STORAGE_SDDR09=y +CONFIG_USB_STORAGE_SDDR55=y +CONFIG_USB_STORAGE_JUMPSHOT=y + +# +# USB Human Interface Devices (HID) +# +CONFIG_USB_HID=m +CONFIG_USB_HIDINPUT=y +# CONFIG_HID_FF is not set +CONFIG_USB_HIDDEV=y + +# +# USB HID Boot Protocol drivers +# +# CONFIG_USB_KBD is not set +# CONFIG_USB_MOUSE is not set +CONFIG_USB_AIPTEK=m +CONFIG_USB_WACOM=m +CONFIG_USB_KBTAB=m +CONFIG_USB_POWERMATE=m +CONFIG_USB_MTOUCH=m +CONFIG_USB_XPAD=m +CONFIG_USB_ATI_REMOTE=m + +# +# USB Imaging devices +# +CONFIG_USB_MDC800=m +CONFIG_USB_MICROTEK=m +CONFIG_USB_HPUSBSCSI=m + +# +# USB Multimedia devices +# +# CONFIG_USB_DABUSB is not set + +# +# Video4Linux support is needed for USB Multimedia device support +# + +# +# USB Network adaptors +# +CONFIG_USB_CATC=m +CONFIG_USB_KAWETH=m +CONFIG_USB_PEGASUS=m +CONFIG_USB_RTL8150=m +CONFIG_USB_USBNET=m + +# +# USB Host-to-Host Cables +# +CONFIG_USB_ALI_M5632=y +CONFIG_USB_AN2720=y +CONFIG_USB_BELKIN=y +CONFIG_USB_GENESYS=y +CONFIG_USB_NET1080=y +CONFIG_USB_PL2301=y + +# +# Intelligent USB Devices/Gadgets +# +CONFIG_USB_ARMLINUX=y +CONFIG_USB_EPSON2888=y +CONFIG_USB_ZAURUS=y +CONFIG_USB_CDCETHER=y + +# +# USB Network Adapters +# +CONFIG_USB_AX8817X=y + +# +# USB port drivers +# +# CONFIG_USB_USS720 is not set + +# +# USB Serial Converter support +# +CONFIG_USB_SERIAL=m +CONFIG_USB_SERIAL_GENERIC=y +CONFIG_USB_SERIAL_BELKIN=m +CONFIG_USB_SERIAL_DIGI_ACCELEPORT=m +CONFIG_USB_SERIAL_EMPEG=m +CONFIG_USB_SERIAL_FTDI_SIO=m +CONFIG_USB_SERIAL_VISOR=m +CONFIG_USB_SERIAL_IPAQ=m +CONFIG_USB_SERIAL_IR=m +CONFIG_USB_SERIAL_EDGEPORT=m +CONFIG_USB_SERIAL_EDGEPORT_TI=m +CONFIG_USB_SERIAL_KEYSPAN_PDA=m +CONFIG_USB_SERIAL_KEYSPAN=m +CONFIG_USB_SERIAL_KEYSPAN_MPR=y +CONFIG_USB_SERIAL_KEYSPAN_USA28=y +CONFIG_USB_SERIAL_KEYSPAN_USA28X=y +CONFIG_USB_SERIAL_KEYSPAN_USA28XA=y +CONFIG_USB_SERIAL_KEYSPAN_USA28XB=y +CONFIG_USB_SERIAL_KEYSPAN_USA19=y +CONFIG_USB_SERIAL_KEYSPAN_USA18X=y +CONFIG_USB_SERIAL_KEYSPAN_USA19W=y 
+CONFIG_USB_SERIAL_KEYSPAN_USA19QW=y +CONFIG_USB_SERIAL_KEYSPAN_USA19QI=y +CONFIG_USB_SERIAL_KEYSPAN_USA49W=y +CONFIG_USB_SERIAL_KEYSPAN_USA49WLC=y +CONFIG_USB_SERIAL_KLSI=m +CONFIG_USB_SERIAL_KOBIL_SCT=m +CONFIG_USB_SERIAL_MCT_U232=m +CONFIG_USB_SERIAL_PL2303=m +CONFIG_USB_SERIAL_SAFE=m +CONFIG_USB_SERIAL_SAFE_PADDED=y +CONFIG_USB_SERIAL_CYBERJACK=m +CONFIG_USB_SERIAL_XIRCOM=m +CONFIG_USB_SERIAL_OMNINET=m +CONFIG_USB_EZUSB=y + +# +# USB Miscellaneous drivers +# +CONFIG_USB_EMI62=m +CONFIG_USB_EMI26=m +# CONFIG_USB_TIGL is not set +# CONFIG_USB_AUERSWALD is not set +# CONFIG_USB_RIO500 is not set +CONFIG_USB_LEGOTOWER=m +# CONFIG_USB_LCD is not set +CONFIG_USB_LED=m +CONFIG_USB_CYTHERM=m +# CONFIG_USB_TEST is not set + +# +# USB Gadget Support +# +# CONFIG_USB_GADGET is not set + +# +# InfiniBand support +# +CONFIG_INFINIBAND=m +CONFIG_INFINIBAND_IPOIB=m +# CONFIG_INFINIBAND_SDP is not set +# CONFIG_INFINIBAND_SRP is not set +# CONFIG_INFINIBAND_UDAPL_HELPER is not set +CONFIG_INFINIBAND_MELLANOX_HCA=m +CONFIG_AUDIT=m + +# +# File systems +# +CONFIG_EXT2_FS=y +CONFIG_EXT2_FS_XATTR=y +CONFIG_EXT2_FS_POSIX_ACL=y +CONFIG_EXT2_FS_SECURITY=y +CONFIG_EXT3_FS=y +CONFIG_EXT3_FS_XATTR=y +CONFIG_EXT3_FS_POSIX_ACL=y +CONFIG_EXT3_FS_SECURITY=y +CONFIG_JBD=y +# CONFIG_JBD_DEBUG is not set +CONFIG_FS_MBCACHE=y +CONFIG_REISERFS_FS=y +# CONFIG_REISERFS_CHECK is not set +CONFIG_REISERFS_PROC_INFO=y +CONFIG_REISERFS_FS_XATTR=y +CONFIG_REISERFS_FS_POSIX_ACL=y +CONFIG_REISERFS_FS_SECURITY=y +CONFIG_JFS_FS=m +CONFIG_JFS_POSIX_ACL=y +CONFIG_JFS_DMAPI=y +# CONFIG_JFS_DEBUG is not set +CONFIG_JFS_STATISTICS=y +CONFIG_FS_POSIX_ACL=y +CONFIG_XFS_FS=m +CONFIG_XFS_RT=y +CONFIG_XFS_QUOTA=m +CONFIG_XFS_DMAPI=y +CONFIG_XFS_SECURITY=y +CONFIG_XFS_POSIX_ACL=y +CONFIG_MINIX_FS=m +# CONFIG_ROMFS_FS is not set +CONFIG_DMAPI=m +# CONFIG_DMAPI_DEBUG is not set +CONFIG_QUOTA=y +CONFIG_QFMT_V1=m +CONFIG_QFMT_V2=m +CONFIG_QUOTACTL=y +CONFIG_AUTOFS_FS=y +# CONFIG_AUTOFS4_FS is not set + +# +# CD-ROM/DVD Filesystems +# +CONFIG_ISO9660_FS=y +CONFIG_JOLIET=y +CONFIG_ZISOFS=y +CONFIG_ZISOFS_FS=y +CONFIG_UDF_FS=m + +# +# DOS/FAT/NT Filesystems +# +CONFIG_FAT_FS=y +CONFIG_MSDOS_FS=y +CONFIG_VFAT_FS=y +# CONFIG_NTFS_FS is not set + +# +# Pseudo filesystems +# +CONFIG_PROC_FS=y +CONFIG_PROC_KCORE=y +# CONFIG_DEVFS_FS is not set +CONFIG_DEVPTS_FS_XATTR=y +CONFIG_DEVPTS_FS_SECURITY=y +CONFIG_TMPFS=y +CONFIG_HUGETLBFS=y +CONFIG_HUGETLB_PAGE=y +CONFIG_RAMFS=y +CONFIG_RELAYFS_FS=m +# CONFIG_KLOG_CHANNEL is not set + +# +# Miscellaneous filesystems +# +# CONFIG_ADFS_FS is not set +# CONFIG_AFFS_FS is not set +CONFIG_HFS_FS=m +CONFIG_HFSPLUS_FS=m +# CONFIG_BEFS_FS is not set +# CONFIG_BFS_FS is not set +# CONFIG_EFS_FS is not set +CONFIG_CRAMFS=y +# CONFIG_VXFS_FS is not set +# CONFIG_HPFS_FS is not set +# CONFIG_QNX4FS_FS is not set +# CONFIG_SYSV_FS is not set +CONFIG_UFS_FS=m +# CONFIG_UFS_FS_WRITE is not set + +# +# Network File Systems +# +CONFIG_NFS_FS=y +CONFIG_NFS_V3=y +CONFIG_NFS_ACL=y +CONFIG_NFS_V4=y +CONFIG_NFS_DIRECTIO=y +CONFIG_NFSD=m +CONFIG_NFSD_V3=y +CONFIG_NFSD_ACL=y +CONFIG_NFS_ACL_SUPPORT=y +# CONFIG_NFSD_V4 is not set +CONFIG_NFSD_TCP=y +CONFIG_LOCKD=y +CONFIG_STATD=y +CONFIG_LOCKD_V4=y +CONFIG_EXPORTFS=m +CONFIG_SUNRPC=y +CONFIG_SUNRPC_GSS=y +CONFIG_RPCSEC_GSS_KRB5=y +CONFIG_SMB_FS=m +# CONFIG_SMB_NLS_DEFAULT is not set +CONFIG_CIFS=m +CONFIG_CIFS_STATS=y +CONFIG_CIFS_XATTR=y +CONFIG_CIFS_POSIX=y +CONFIG_NCP_FS=m +CONFIG_NCPFS_PACKET_SIGNING=y +CONFIG_NCPFS_IOCTL_LOCKING=y +CONFIG_NCPFS_STRONG=y +CONFIG_NCPFS_NFS_NS=y 
+CONFIG_NCPFS_OS2_NS=y +CONFIG_NCPFS_SMALLDOS=y +CONFIG_NCPFS_NLS=y +CONFIG_NCPFS_EXTRAS=y +# CONFIG_CODA_FS is not set +# CONFIG_INTERMEZZO_FS is not set +# CONFIG_AFS_FS is not set + +# +# Partition Types +# +CONFIG_PARTITION_ADVANCED=y +# CONFIG_ACORN_PARTITION is not set +CONFIG_OSF_PARTITION=y +CONFIG_AMIGA_PARTITION=y +CONFIG_ATARI_PARTITION=y +CONFIG_MAC_PARTITION=y +CONFIG_MSDOS_PARTITION=y +CONFIG_BSD_DISKLABEL=y +CONFIG_MINIX_SUBPARTITION=y +CONFIG_SOLARIS_X86_PARTITION=y +CONFIG_UNIXWARE_DISKLABEL=y +CONFIG_LDM_PARTITION=y +# CONFIG_LDM_DEBUG is not set +CONFIG_NEC98_PARTITION=y +CONFIG_SGI_PARTITION=y +CONFIG_ULTRIX_PARTITION=y +CONFIG_SUN_PARTITION=y +CONFIG_EFI_PARTITION=y + +# +# Native Language Support +# +CONFIG_NLS=y +CONFIG_NLS_DEFAULT="iso8859-1" +CONFIG_NLS_CODEPAGE_437=m +CONFIG_NLS_CODEPAGE_737=m +CONFIG_NLS_CODEPAGE_775=m +CONFIG_NLS_CODEPAGE_850=m +CONFIG_NLS_CODEPAGE_852=m +CONFIG_NLS_CODEPAGE_855=m +CONFIG_NLS_CODEPAGE_857=m +CONFIG_NLS_CODEPAGE_860=m +CONFIG_NLS_CODEPAGE_861=m +CONFIG_NLS_CODEPAGE_862=m +CONFIG_NLS_CODEPAGE_863=m +CONFIG_NLS_CODEPAGE_864=m +CONFIG_NLS_CODEPAGE_865=m +CONFIG_NLS_CODEPAGE_866=m +CONFIG_NLS_CODEPAGE_869=m +CONFIG_NLS_CODEPAGE_936=m +CONFIG_NLS_CODEPAGE_950=m +CONFIG_NLS_CODEPAGE_932=m +CONFIG_NLS_CODEPAGE_949=m +CONFIG_NLS_CODEPAGE_874=m +CONFIG_NLS_ISO8859_8=m +CONFIG_NLS_CODEPAGE_1250=m +CONFIG_NLS_CODEPAGE_1251=m +CONFIG_NLS_ISO8859_1=m +CONFIG_NLS_ISO8859_2=m +CONFIG_NLS_ISO8859_3=m +CONFIG_NLS_ISO8859_4=m +CONFIG_NLS_ISO8859_5=m +CONFIG_NLS_ISO8859_6=m +CONFIG_NLS_ISO8859_7=m +CONFIG_NLS_ISO8859_9=m +CONFIG_NLS_ISO8859_13=m +CONFIG_NLS_ISO8859_14=m +CONFIG_NLS_ISO8859_15=m +CONFIG_NLS_KOI8_R=m +CONFIG_NLS_KOI8_U=m +CONFIG_NLS_UTF8=m +CONFIG_FSHOOKS=y + +# +# Profiling support +# +CONFIG_PROFILING=y +CONFIG_OPROFILE=y + +# +# Kernel hacking +# +CONFIG_KERNTYPES=y +CONFIG_CRASH_DUMP=m +CONFIG_CRASH_DUMP_BLOCKDEV=m +CONFIG_CRASH_DUMP_NETDEV=m +# CONFIG_CRASH_DUMP_MEMDEV is not set +# CONFIG_CRASH_DUMP_SOFTBOOT is not set +CONFIG_CRASH_DUMP_COMPRESS_RLE=m +CONFIG_CRASH_DUMP_COMPRESS_GZIP=m +CONFIG_DEBUG_KERNEL=y +CONFIG_DEBUG_STACKOVERFLOW=y +CONFIG_DEBUG_STACK_USAGE=y +# CONFIG_DEBUG_SLAB is not set +CONFIG_MAGIC_SYSRQ=y +CONFIG_DEBUGGER=y +CONFIG_XMON=y +# CONFIG_XMON_DEFAULT is not set +CONFIG_KDB=y +CONFIG_KDB_MODULES=y +CONFIG_KDB_OFF=y +# CONFIG_PPCDBG is not set +# CONFIG_DEBUG_INFO is not set +CONFIG_IRQSTACKS=y + +# +# Security options +# +CONFIG_SECURITY=y +CONFIG_SECURITY_NETWORK=y +CONFIG_SECURITY_CAPABILITIES=m +CONFIG_SECURITY_ROOTPLUG=m +CONFIG_SECURITY_SELINUX=y +CONFIG_SECURITY_SELINUX_BOOTPARAM=y +CONFIG_SECURITY_SELINUX_DEVELOP=y +CONFIG_SECURITY_SELINUX_MLS=y + +# +# IBM Crypto Hardware support +# +CONFIG_IBM_CRYPTO=m +CONFIG_ICA_LEEDSLITE=m + +# +# Cryptographic options +# +CONFIG_CRYPTO=y +CONFIG_CRYPTO_HMAC=y +CONFIG_CRYPTO_NULL=m +CONFIG_CRYPTO_MD4=m +CONFIG_CRYPTO_MD5=y +CONFIG_CRYPTO_SHA1=m +CONFIG_CRYPTO_SHA256=m +CONFIG_CRYPTO_SHA512=m +CONFIG_CRYPTO_DES=y +CONFIG_CRYPTO_BLOWFISH=m +CONFIG_CRYPTO_TWOFISH=m +CONFIG_CRYPTO_SERPENT=m +CONFIG_CRYPTO_AES=m +CONFIG_CRYPTO_CAST5=m +CONFIG_CRYPTO_CAST6=m +CONFIG_CRYPTO_ARC4=m +CONFIG_CRYPTO_DEFLATE=m +CONFIG_CRYPTO_MICHAEL_MIC=m +CONFIG_CRYPTO_TEST=m + +# +# Library routines +# +CONFIG_CRC32=y +CONFIG_QSORT=y +CONFIG_ZLIB_INFLATE=y +CONFIG_ZLIB_DEFLATE=m + +# +# Build options +# +CONFIG_SUSE_KERNEL=y +CONFIG_CFGNAME="pseries64" +CONFIG_RELEASE="7.141" diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ppc.config 
b/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ppc.config new file mode 100644 index 0000000..2e87a08 --- /dev/null +++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ppc.config @@ -0,0 +1,1451 @@ +# +# Automatically generated make config: don't edit +# +CONFIG_64BIT=y +CONFIG_MMU=y +CONFIG_RWSEM_XCHGADD_ALGORITHM=y +CONFIG_GENERIC_ISA_DMA=y +CONFIG_HAVE_DEC_LOCK=y +CONFIG_EARLY_PRINTK=y +CONFIG_COMPAT=y +CONFIG_FRAME_POINTER=y +CONFIG_FORCE_MAX_ZONEORDER=13 + +# +# Code maturity level options +# +CONFIG_EXPERIMENTAL=y +CONFIG_CLEAN_COMPILE=y +CONFIG_STANDALONE=y + +# +# General setup +# +CONFIG_SWAP=y +CONFIG_SYSVIPC=y +CONFIG_POSIX_MQUEUE=y +CONFIG_BSD_PROCESS_ACCT=y +CONFIG_SYSCTL=y +CONFIG_LOG_BUF_SHIFT=19 +CONFIG_HOTPLUG=y +CONFIG_EVLOG=y +# CONFIG_EVLOG_FWPRINTK is not set +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y +# CONFIG_EMBEDDED is not set + +# +# Class Based Kernel Resource Management +# +CONFIG_CKRM=y +CONFIG_RCFS_FS=m +CONFIG_CKRM_TYPE_TASKCLASS=y +CONFIG_CKRM_RES_NUMTASKS=m +CONFIG_CKRM_CPU_SCHEDULE=y +# CONFIG_CKRM_CPU_SCHEDULE_AT_BOOT is not set +CONFIG_CKRM_TYPE_SOCKETCLASS=y +CONFIG_CKRM_RBCE=m +CONFIG_CKRM_CRBCE=m +CONFIG_DELAY_ACCT=y +CONFIG_KALLSYMS=y +CONFIG_FUTEX=y +CONFIG_EPOLL=y +CONFIG_IOSCHED_NOOP=y +CONFIG_IOSCHED_AS=y +CONFIG_IOSCHED_DEADLINE=y +CONFIG_IOSCHED_CFQ=y +# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set + +# +# Loadable module support +# +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y +# CONFIG_MODULE_FORCE_UNLOAD is not set +CONFIG_OBSOLETE_MODPARM=y +CONFIG_MODVERSIONS=y +CONFIG_KMOD=y +CONFIG_STOP_MACHINE=y + +# +# Platform support +# +# CONFIG_PPC_ISERIES is not set +CONFIG_PPC_PSERIES=y +CONFIG_PPC=y +CONFIG_PPC64=y +CONFIG_PPC_OF=y +CONFIG_ALTIVEC=y +# CONFIG_PPC_PMAC is not set +CONFIG_PPC_SPLPAR=y +# CONFIG_BOOTX_TEXT is not set +# CONFIG_POWER4_ONLY is not set +# CONFIG_IOMMU_VMERGE is not set +CONFIG_SMP=y +CONFIG_IRQ_ALL_CPUS=y +CONFIG_NR_CPUS=128 +# CONFIG_HMT is not set +CONFIG_DISCONTIGMEM=y +CONFIG_NUMA=y +CONFIG_SCHED_SMT=y +CONFIG_PPC_RTAS=y +CONFIG_RTAS_FLASH=m +CONFIG_SCANLOG=m +CONFIG_LPARCFG=y +CONFIG_PPC_VPURR=y + +# +# General setup +# +CONFIG_PCI=y +CONFIG_PCI_DOMAINS=y +CONFIG_BINFMT_ELF=y +CONFIG_BINFMT_MISC=m +# CONFIG_PCI_LEGACY_PROC is not set +# CONFIG_PCI_NAMES is not set +CONFIG_HOTPLUG_CPU=y + +# +# PCMCIA/CardBus support +# +# CONFIG_PCMCIA is not set + +# +# PCI Hotplug Support +# +CONFIG_HOTPLUG_PCI=y +# CONFIG_HOTPLUG_PCI_FAKE is not set +# CONFIG_HOTPLUG_PCI_CPCI is not set +# CONFIG_HOTPLUG_PCI_PCIE is not set +# CONFIG_HOTPLUG_PCI_SHPC is not set +CONFIG_HOTPLUG_PCI_RPA=y +CONFIG_HOTPLUG_PCI_RPA_DLPAR=y +CONFIG_PROC_DEVICETREE=y +# CONFIG_CMDLINE_BOOL is not set + +# +# Device Drivers +# + +# +# Generic Driver Options +# +CONFIG_FW_LOADER=m +# CONFIG_DEBUG_DRIVER is not set + +# +# Memory Technology Devices (MTD) +# +# CONFIG_MTD is not set + +# +# Bluesmoke - error detection and reporting (RAS) +# +# CONFIG_BLUESMOKE is not set + +# +# Parallel port support +# +CONFIG_PARPORT=m +CONFIG_PARPORT_PC=m +CONFIG_PARPORT_PC_CML1=m +CONFIG_PARPORT_SERIAL=m +CONFIG_PARPORT_PC_FIFO=y +CONFIG_PARPORT_PC_SUPERIO=y +CONFIG_PARPORT_OTHER=y +CONFIG_PARPORT_1284=y + +# +# Plug and Play support +# + +# +# Block devices +# +CONFIG_BLK_DEV_FD=y +# CONFIG_PARIDE is not set +# CONFIG_BLK_CPQ_DA is not set +# CONFIG_BLK_CPQ_CISS_DA is not set +# CONFIG_BLK_DEV_DAC960 is not set +# CONFIG_BLK_DEV_UMEM is not set +CONFIG_BLK_DEV_LOOP=y +CONFIG_BLK_DEV_CRYPTOLOOP=m +CONFIG_BLK_DEV_NBD=m +# CONFIG_BLK_DEV_CARMEL is not 
set +CONFIG_BLK_DEV_RAM=y +CONFIG_BLK_DEV_RAM_SIZE=123456 +CONFIG_BLK_DEV_INITRD=y +CONFIG_CIPHER_TWOFISH=m + +# +# ATA/ATAPI/MFM/RLL support +# +CONFIG_IDE=y +CONFIG_BLK_DEV_IDE=y + +# +# Please see Documentation/ide.txt for help/info on IDE drives +# +CONFIG_BLK_DEV_IDEDISK=y +# CONFIG_IDEDISK_MULTI_MODE is not set +# CONFIG_IDEDISK_STROKE is not set +CONFIG_BLK_DEV_IDECD=y +# CONFIG_BLK_DEV_IDETAPE is not set +# CONFIG_BLK_DEV_IDEFLOPPY is not set +# CONFIG_BLK_DEV_IDESCSI is not set +CONFIG_IDE_TASK_IOCTL=y +# CONFIG_IDE_TASKFILE_IO is not set + +# +# IDE chipset support/bugfixes +# +CONFIG_IDE_GENERIC=y +CONFIG_BLK_DEV_IDEPCI=y +CONFIG_IDEPCI_SHARE_IRQ=y +# CONFIG_BLK_DEV_OFFBOARD is not set +CONFIG_BLK_DEV_GENERIC=y +# CONFIG_BLK_DEV_OPTI621 is not set +CONFIG_BLK_DEV_SL82C105=y +CONFIG_BLK_DEV_IDEDMA_PCI=y +CONFIG_BLK_DEV_IDEDMA_FORCED=y +CONFIG_IDEDMA_PCI_AUTO=y +# CONFIG_IDEDMA_ONLYDISK is not set +CONFIG_BLK_DEV_ADMA=y +# CONFIG_BLK_DEV_AEC62XX is not set +# CONFIG_BLK_DEV_ALI15X3 is not set +CONFIG_BLK_DEV_AMD74XX=y +# CONFIG_BLK_DEV_CMD64X is not set +# CONFIG_BLK_DEV_TRIFLEX is not set +# CONFIG_BLK_DEV_CY82C693 is not set +# CONFIG_BLK_DEV_CS5520 is not set +# CONFIG_BLK_DEV_CS5530 is not set +# CONFIG_BLK_DEV_HPT34X is not set +# CONFIG_BLK_DEV_HPT366 is not set +# CONFIG_BLK_DEV_SC1200 is not set +# CONFIG_BLK_DEV_PIIX is not set +# CONFIG_BLK_DEV_NS87415 is not set +CONFIG_BLK_DEV_PDC202XX_OLD=y +CONFIG_PDC202XX_BURST=y +CONFIG_BLK_DEV_PDC202XX_NEW=y +# CONFIG_PDC202XX_FORCE is not set +# CONFIG_BLK_DEV_SVWKS is not set +CONFIG_BLK_DEV_SIIMAGE=y +# CONFIG_BLK_DEV_SLC90E66 is not set +# CONFIG_BLK_DEV_TRM290 is not set +# CONFIG_BLK_DEV_VIA82CXXX is not set +CONFIG_BLK_DEV_IDEDMA=y +# CONFIG_IDEDMA_IVB is not set +CONFIG_IDEDMA_AUTO=y +# CONFIG_BLK_DEV_HD is not set + +# +# SCSI device support +# +CONFIG_SCSI=m +CONFIG_SCSI_PROC_FS=y + +# +# SCSI support type (disk, tape, CD-ROM) +# +CONFIG_BLK_DEV_SD=m +CONFIG_CHR_DEV_ST=m +# CONFIG_CHR_DEV_OSST is not set +CONFIG_BLK_DEV_SR=m +CONFIG_BLK_DEV_SR_VENDOR=y +CONFIG_CHR_DEV_SG=m +CONFIG_CHR_DEV_SCH=m + +# +# Some SCSI devices (e.g. 
CD jukebox) support multiple LUNs +# +CONFIG_SCSI_MULTI_LUN=y +CONFIG_SCSI_CONSTANTS=y +CONFIG_SCSI_LOGGING=y + +# +# SCSI Transport Attributes +# +CONFIG_SCSI_SPI_ATTRS=m +CONFIG_SCSI_FC_ATTRS=m + +# +# SCSI low-level drivers +# +# CONFIG_BLK_DEV_3W_XXXX_RAID is not set +# CONFIG_SCSI_ACARD is not set +# CONFIG_SCSI_AACRAID is not set +# CONFIG_SCSI_AIC7XXX is not set +# CONFIG_SCSI_AIC7XXX_OLD is not set +# CONFIG_SCSI_AIC79XX is not set +# CONFIG_SCSI_AIC79XX_NEW is not set +# CONFIG_SCSI_ADVANSYS is not set +# CONFIG_MEGARAID_NEWGEN is not set +# CONFIG_MEGARAID_LEGACY is not set +# CONFIG_SCSI_SATA is not set +# CONFIG_SCSI_BUSLOGIC is not set +# CONFIG_SCSI_CPQFCTS is not set +# CONFIG_SCSI_DMX3191D is not set +# CONFIG_SCSI_EATA is not set +# CONFIG_SCSI_EATA_PIO is not set +# CONFIG_SCSI_FUTURE_DOMAIN is not set +# CONFIG_SCSI_GDTH is not set +# CONFIG_SCSI_IPS is not set +CONFIG_SCSI_IBMVSCSI=m +CONFIG_SCSI_IBMVSCSIS=m +# CONFIG_SCSI_INIA100 is not set +# CONFIG_SCSI_PPA is not set +# CONFIG_SCSI_IMM is not set +CONFIG_SCSI_SYM53C8XX_2=m +CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=0 +CONFIG_SCSI_SYM53C8XX_DEFAULT_TAGS=16 +CONFIG_SCSI_SYM53C8XX_MAX_TAGS=64 +# CONFIG_SCSI_SYM53C8XX_IOMAPPED is not set +CONFIG_SCSI_LPFC=m +CONFIG_SCSI_IPR=m +CONFIG_SCSI_IPR_TRACE=y +CONFIG_SCSI_IPR_DUMP=y +# CONFIG_SCSI_QLOGIC_ISP is not set +# CONFIG_SCSI_QLOGIC_FC is not set +# CONFIG_SCSI_QLOGIC_1280 is not set +CONFIG_SCSI_QLA2XXX=m +# CONFIG_SCSI_QLA21XX is not set +# CONFIG_SCSI_QLA22XX is not set +CONFIG_SCSI_QLA2300=m +# CONFIG_SCSI_QLA2322 is not set +# CONFIG_SCSI_QLA6312 is not set +# CONFIG_SCSI_QLA6322 is not set +CONFIG_SCSI_QLA2XXX_FAILOVER=y +CONFIG_SCSI_QLA4XXX=m +CONFIG_SCSI_QLA4XXX_FAILOVER=y +# CONFIG_SCSI_DC395x is not set +# CONFIG_SCSI_DC390T is not set +CONFIG_SCSI_DEBUG=m + +# +# Multi-device support (RAID and LVM) +# +CONFIG_MD=y +CONFIG_BLK_DEV_MD=y +CONFIG_MD_LINEAR=m +CONFIG_MD_RAID0=m +CONFIG_MD_RAID1=m +CONFIG_MD_RAID5=m +CONFIG_MD_RAID6=m +CONFIG_MD_MULTIPATH=m +CONFIG_BLK_DEV_DM=m +CONFIG_DM_CRYPT=m +CONFIG_DM_MULTIPATH=m +CONFIG_DM_SNAPSHOT=m +CONFIG_DM_MIRROR=m +CONFIG_DM_ZERO=m +CONFIG_DM_FLAKEY=m +CONFIG_BLK_DEV_DM_BBR=m + +# +# Fusion MPT device support +# +# CONFIG_FUSION is not set + +# +# IEEE 1394 (FireWire) support +# +CONFIG_IEEE1394=m + +# +# Subsystem Options +# +# CONFIG_IEEE1394_VERBOSEDEBUG is not set +# CONFIG_IEEE1394_OUI_DB is not set +CONFIG_IEEE1394_EXTRA_CONFIG_ROMS=y +CONFIG_IEEE1394_CONFIG_ROM_IP1394=y + +# +# Device Drivers +# + +# +# Texas Instruments PCILynx requires I2C +# +CONFIG_IEEE1394_OHCI1394=m + +# +# Protocol Drivers +# +CONFIG_IEEE1394_VIDEO1394=m +CONFIG_IEEE1394_SBP2=m +# CONFIG_IEEE1394_SBP2_PHYS_DMA is not set +CONFIG_IEEE1394_ETH1394=m +CONFIG_IEEE1394_DV1394=m +CONFIG_IEEE1394_RAWIO=m +CONFIG_IEEE1394_CMP=m +CONFIG_IEEE1394_AMDTP=m + +# +# I2O device support +# +# CONFIG_I2O is not set + +# +# Macintosh device drivers +# + +# +# Networking support +# +CONFIG_NET=y + +# +# Networking options +# +CONFIG_PACKET=y +CONFIG_PACKET_MMAP=y +CONFIG_NETLINK_DEV=y +CONFIG_UNIX=y +CONFIG_NET_KEY=y +CONFIG_INET=y +CONFIG_IP_MULTICAST=y +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_IP_MULTIPLE_TABLES=y +# CONFIG_IP_ROUTE_FWMARK is not set +CONFIG_IP_ROUTE_NAT=y +CONFIG_IP_ROUTE_MULTIPATH=y +CONFIG_IP_ROUTE_TOS=y +# CONFIG_IP_ROUTE_VERBOSE is not set +# CONFIG_IP_PNP is not set +CONFIG_NET_IPIP=m +CONFIG_NET_IPGRE=m +CONFIG_NET_IPGRE_BROADCAST=y +CONFIG_IP_MROUTE=y +CONFIG_IP_PIMSM_V1=y +CONFIG_IP_PIMSM_V2=y +# CONFIG_ARPD is not set 
+CONFIG_SYN_COOKIES=y +CONFIG_INET_AH=m +CONFIG_INET_ESP=m +CONFIG_INET_IPCOMP=m +# CONFIG_ACCEPT_QUEUES is not set + +# +# IP: Virtual Server Configuration +# +# CONFIG_IP_VS is not set +CONFIG_IPV6=m +CONFIG_IPV6_SUBTREES=y +CONFIG_IPV6_PRIVACY=y +CONFIG_IPV6_NDISC_NEW=y +CONFIG_INET6_AH=m +CONFIG_INET6_ESP=m +CONFIG_INET6_IPCOMP=m +CONFIG_IPV6_TUNNEL=m + +# +# MOBILE IPv6 (EXPERIMENTAL) +# +CONFIG_IPV6_MOBILITY=m +CONFIG_IPV6_MOBILITY_MN=m +CONFIG_IPV6_MOBILITY_HA=m +# CONFIG_IPV6_MOBILITY_DEBUG is not set +# CONFIG_DECNET is not set +CONFIG_BRIDGE=m +CONFIG_NETFILTER=y +# CONFIG_NETFILTER_DEBUG is not set +CONFIG_BRIDGE_NETFILTER=y + +# +# IP: Netfilter Configuration +# +CONFIG_IP_NF_CONNTRACK=m +CONFIG_IP_NF_FTP=m +CONFIG_IP_NF_IRC=m +CONFIG_IP_NF_TFTP=m +CONFIG_IP_NF_AMANDA=m +CONFIG_IP_NF_QUEUE=m +CONFIG_IP_NF_IPTABLES=m +CONFIG_IP_NF_MATCH_LIMIT=m +CONFIG_IP_NF_MATCH_IPRANGE=m +CONFIG_IP_NF_MATCH_MAC=m +CONFIG_IP_NF_MATCH_PKTTYPE=m +CONFIG_IP_NF_MATCH_POLICY=m +CONFIG_IP_NF_MATCH_MARK=m +CONFIG_IP_NF_MATCH_MULTIPORT=m +CONFIG_IP_NF_MATCH_TOS=m +CONFIG_IP_NF_MATCH_RECENT=m +CONFIG_IP_NF_MATCH_ECN=m +CONFIG_IP_NF_MATCH_DSCP=m +CONFIG_IP_NF_MATCH_AH_ESP=m +CONFIG_IP_NF_MATCH_LENGTH=m +CONFIG_IP_NF_MATCH_TTL=m +CONFIG_IP_NF_MATCH_TCPMSS=m +CONFIG_IP_NF_MATCH_HELPER=m +CONFIG_IP_NF_MATCH_STATE=m +CONFIG_IP_NF_MATCH_CONNTRACK=m +CONFIG_IP_NF_MATCH_OWNER=m +CONFIG_IP_NF_MATCH_PHYSDEV=m +CONFIG_IP_NF_FILTER=m +CONFIG_IP_NF_TARGET_REJECT=m +CONFIG_IP_NF_NAT=m +CONFIG_IP_NF_NAT_NEEDED=y +CONFIG_IP_NF_TARGET_MASQUERADE=m +CONFIG_IP_NF_TARGET_REDIRECT=m +CONFIG_IP_NF_TARGET_NETMAP=m +CONFIG_IP_NF_TARGET_SAME=m +CONFIG_IP_NF_NAT_LOCAL=y +CONFIG_IP_NF_NAT_SNMP_BASIC=m +CONFIG_IP_NF_NAT_IRC=m +CONFIG_IP_NF_NAT_FTP=m +CONFIG_IP_NF_NAT_TFTP=m +CONFIG_IP_NF_NAT_AMANDA=m +CONFIG_IP_NF_MANGLE=m +CONFIG_IP_NF_TARGET_TOS=m +CONFIG_IP_NF_TARGET_ECN=m +CONFIG_IP_NF_TARGET_DSCP=m +CONFIG_IP_NF_TARGET_MARK=m +CONFIG_IP_NF_TARGET_CLASSIFY=m +CONFIG_IP_NF_TARGET_LOG=m +CONFIG_IP_NF_TARGET_ULOG=m +CONFIG_IP_NF_TARGET_TCPMSS=m +CONFIG_IP_NF_ARPTABLES=m +CONFIG_IP_NF_ARPFILTER=m +CONFIG_IP_NF_ARP_MANGLE=m +# CONFIG_IP_NF_COMPAT_IPCHAINS is not set +# CONFIG_IP_NF_COMPAT_IPFWADM is not set +CONFIG_IP_NF_CONNTRACK_MARK=y +CONFIG_IP_NF_TARGET_CONNMARK=m +CONFIG_IP_NF_MATCH_CONNMARK=m +CONFIG_IP_NF_TARGET_CLUSTERIP=m + +# +# IPv6: Netfilter Configuration +# +CONFIG_IP6_NF_FTP=m +CONFIG_IP6_NF_QUEUE=m +CONFIG_IP6_NF_IPTABLES=m +CONFIG_IP6_NF_MATCH_LIMIT=m +CONFIG_IP6_NF_MATCH_MAC=m +CONFIG_IP6_NF_MATCH_RT=m +CONFIG_IP6_NF_MATCH_OPTS=m +CONFIG_IP6_NF_MATCH_FRAG=m +CONFIG_IP6_NF_MATCH_HL=m +CONFIG_IP6_NF_MATCH_MULTIPORT=m +CONFIG_IP6_NF_MATCH_OWNER=m +CONFIG_IP6_NF_MATCH_MARK=m +CONFIG_IP6_NF_MATCH_IPV6HEADER=m +CONFIG_IP6_NF_MATCH_AHESP=m +CONFIG_IP6_NF_MATCH_LENGTH=m +CONFIG_IP6_NF_MATCH_EUI64=m +CONFIG_IP6_NF_CONNTRACK=m +CONFIG_IP6_NF_MATCH_STATE=m +CONFIG_IP6_NF_FILTER=m +CONFIG_IP6_NF_TARGET_LOG=m +CONFIG_IP6_NF_TARGET_REJECT=m +CONFIG_IP6_NF_MANGLE=m +CONFIG_IP6_NF_TARGET_MARK=m + +# +# Bridge: Netfilter Configuration +# +CONFIG_BRIDGE_NF_EBTABLES=m +CONFIG_BRIDGE_EBT_BROUTE=m +CONFIG_BRIDGE_EBT_T_FILTER=m +CONFIG_BRIDGE_EBT_T_NAT=m +CONFIG_BRIDGE_EBT_802_3=m +CONFIG_BRIDGE_EBT_AMONG=m +CONFIG_BRIDGE_EBT_ARP=m +CONFIG_BRIDGE_EBT_IP=m +CONFIG_BRIDGE_EBT_LIMIT=m +CONFIG_BRIDGE_EBT_MARK=m +CONFIG_BRIDGE_EBT_PKTTYPE=m +CONFIG_BRIDGE_EBT_STP=m +CONFIG_BRIDGE_EBT_VLAN=m +CONFIG_BRIDGE_EBT_ARPREPLY=m +CONFIG_BRIDGE_EBT_DNAT=m +CONFIG_BRIDGE_EBT_MARK_T=m +CONFIG_BRIDGE_EBT_REDIRECT=m +# CONFIG_BRIDGE_EBT_SNAT is not set 
+CONFIG_BRIDGE_EBT_LOG=m +CONFIG_XFRM=y +CONFIG_XFRM_USER=m + +# +# SCTP Configuration (EXPERIMENTAL) +# +CONFIG_IP_SCTP=m +# CONFIG_SCTP_DBG_MSG is not set +# CONFIG_SCTP_DBG_OBJCNT is not set +CONFIG_SCTP_HMAC_NONE=y +# CONFIG_SCTP_HMAC_SHA1 is not set +# CONFIG_SCTP_HMAC_MD5 is not set +# CONFIG_ATM is not set +CONFIG_VLAN_8021Q=m +CONFIG_LLC=y +CONFIG_LLC2=m +CONFIG_IPX=m +CONFIG_IPX_INTERN=y +CONFIG_ATALK=m +CONFIG_DEV_APPLETALK=y +CONFIG_IPDDP=m +CONFIG_IPDDP_ENCAP=y +CONFIG_IPDDP_DECAP=y +# CONFIG_X25 is not set +# CONFIG_LAPB is not set +# CONFIG_NET_DIVERT is not set +# CONFIG_ECONET is not set +# CONFIG_WAN_ROUTER is not set +# CONFIG_NET_FASTROUTE is not set +# CONFIG_NET_HW_FLOWCONTROL is not set + +# +# QoS and/or fair queueing +# +CONFIG_NET_SCHED=y +CONFIG_NET_SCH_CBQ=m +CONFIG_NET_SCH_HTB=m +CONFIG_NET_SCH_HFSC=m +CONFIG_NET_SCH_CSZ=m +CONFIG_NET_SCH_PRIO=m +CONFIG_NET_SCH_RED=m +CONFIG_NET_SCH_SFQ=m +CONFIG_NET_SCH_TEQL=m +CONFIG_NET_SCH_TBF=m +CONFIG_NET_SCH_GRED=m +CONFIG_NET_SCH_DSMARK=m +CONFIG_NET_SCH_DELAY=m +CONFIG_NET_SCH_INGRESS=m +CONFIG_NET_QOS=y +CONFIG_NET_ESTIMATOR=y +CONFIG_NET_CLS=y +CONFIG_NET_CLS_TCINDEX=m +CONFIG_NET_CLS_ROUTE4=m +CONFIG_NET_CLS_ROUTE=y +CONFIG_NET_CLS_FW=m +CONFIG_NET_CLS_U32=m +CONFIG_NET_CLS_RSVP=m +CONFIG_NET_CLS_RSVP6=m +CONFIG_NET_CLS_POLICE=y + +# +# Network testing +# +CONFIG_NET_PKTGEN=m +CONFIG_NETDEVICES=y + +# +# ARCnet devices +# +# CONFIG_ARCNET is not set +CONFIG_DUMMY=m +CONFIG_BONDING=m +CONFIG_EQUALIZER=m +CONFIG_TUN=m +# CONFIG_ETHERTAP is not set + +# +# Ethernet (10 or 100Mbit) +# +CONFIG_NET_ETHERNET=y +CONFIG_MII=y +# CONFIG_OAKNET is not set +# CONFIG_HAPPYMEAL is not set +# CONFIG_SUNGEM is not set +CONFIG_NET_VENDOR_3COM=y +CONFIG_VORTEX=m +CONFIG_TYPHOON=m + +# +# Tulip family network device support +# +# CONFIG_NET_TULIP is not set +# CONFIG_HP100 is not set +CONFIG_NET_PCI=y +CONFIG_PCNET32=m +# CONFIG_AMD8111_ETH is not set +# CONFIG_ADAPTEC_STARFIRE is not set +# CONFIG_B44 is not set +# CONFIG_FORCEDETH is not set +# CONFIG_DGRS is not set +# CONFIG_EEPRO100 is not set +CONFIG_E100=m +# CONFIG_FEALNX is not set +# CONFIG_NATSEMI is not set +# CONFIG_NE2K_PCI is not set +# CONFIG_8139CP is not set +# CONFIG_8139TOO is not set +# CONFIG_SIS900 is not set +# CONFIG_EPIC100 is not set +# CONFIG_SUNDANCE is not set +# CONFIG_VIA_RHINE is not set + +# +# Ethernet (1000 Mbit) +# +CONFIG_ACENIC=m +CONFIG_ACENIC_OMIT_TIGON_I=y +# CONFIG_DL2K is not set +CONFIG_E1000=m +CONFIG_E1000_NAPI=y +CONFIG_E1000_NEW=m +CONFIG_E1000_NEW_NAPI=y +# CONFIG_NS83820 is not set +# CONFIG_HAMACHI is not set +# CONFIG_YELLOWFIN is not set +# CONFIG_R8169 is not set +# CONFIG_SIS190 is not set +# CONFIG_SK98LIN is not set +CONFIG_TIGON3=m +CONFIG_NET_BROADCOM=m +CONFIG_NET_BROADCOM_NEW=m +# CONFIG_NET_BCM44 is not set +CONFIG_TIGON3_NEW=m + +# +# Ethernet (10000 Mbit) +# +CONFIG_IXGB=m +CONFIG_IXGB_NAPI=y +CONFIG_S2IO=m +CONFIG_S2IO_NAPI=y +CONFIG_IBMVETH=m +# CONFIG_FDDI is not set +# CONFIG_HIPPI is not set +# CONFIG_PLIP is not set +CONFIG_PPP=m +CONFIG_PPP_MULTILINK=y +CONFIG_PPP_FILTER=y +CONFIG_PPP_ASYNC=m +CONFIG_PPP_SYNC_TTY=m +CONFIG_PPP_DEFLATE=m +CONFIG_PPP_BSDCOMP=m +CONFIG_PPP_MPPE=m +CONFIG_PPPOE=m +CONFIG_SLIP=m +CONFIG_SLIP_COMPRESSED=y +CONFIG_SLIP_SMART=y +# CONFIG_SLIP_MODE_SLIP6 is not set + +# +# Wireless LAN (non-hamradio) +# +# CONFIG_NET_RADIO is not set + +# +# Token Ring devices +# +CONFIG_TR=y +CONFIG_IBMOL=m +# CONFIG_IBMLS is not set +# CONFIG_3C359 is not set +# CONFIG_TMS380TR is not set +CONFIG_NET_FC=y 
+CONFIG_NET_LPFC=m +CONFIG_SHAPER=m +CONFIG_NETCONSOLE=m + +# +# Wan interfaces +# +# CONFIG_WAN is not set + +# +# Amateur Radio support +# +# CONFIG_HAMRADIO is not set + +# +# IrDA (infrared) support +# +# CONFIG_IRDA is not set + +# +# Bluetooth support +# +# CONFIG_BT is not set +CONFIG_NETPOLL=y +CONFIG_NETPOLL_RX=y +CONFIG_NETPOLL_TRAP=y +CONFIG_NET_POLL_CONTROLLER=y + +# +# ISDN subsystem +# +# CONFIG_ISDN is not set + +# +# Telephony Support +# +# CONFIG_PHONE is not set + +# +# Input device support +# +CONFIG_INPUT=y + +# +# Userland interfaces +# +CONFIG_INPUT_MOUSEDEV=y +CONFIG_INPUT_MOUSEDEV_PSAUX=y +CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024 +CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768 +CONFIG_INPUT_JOYDEV=m +CONFIG_INPUT_TSDEV=m +CONFIG_INPUT_TSDEV_SCREEN_X=240 +CONFIG_INPUT_TSDEV_SCREEN_Y=320 +CONFIG_INPUT_EVDEV=m +# CONFIG_INPUT_EVBUG is not set + +# +# Input I/O drivers +# +# CONFIG_GAMEPORT is not set +CONFIG_SOUND_GAMEPORT=y +CONFIG_SERIO=y +CONFIG_SERIO_I8042=y +# CONFIG_SERIO_SERPORT is not set +# CONFIG_SERIO_CT82C710 is not set +# CONFIG_SERIO_PARKBD is not set +# CONFIG_SERIO_PCIPS2 is not set + +# +# Input Device Drivers +# +CONFIG_INPUT_KEYBOARD=y +CONFIG_KEYBOARD_ATKBD=y +# CONFIG_KEYBOARD_SUNKBD is not set +# CONFIG_KEYBOARD_LKKBD is not set +# CONFIG_KEYBOARD_XTKBD is not set +# CONFIG_KEYBOARD_NEWTON is not set +# CONFIG_KEYBOARD_POSFILTER is not set +CONFIG_INPUT_MOUSE=y +CONFIG_MOUSE_PS2=y +# CONFIG_MOUSE_SERIAL is not set +# CONFIG_MOUSE_VSXXXAA is not set +# CONFIG_INPUT_JOYSTICK is not set +# CONFIG_INPUT_TOUCHSCREEN is not set +CONFIG_INPUT_MISC=y +CONFIG_INPUT_PCSPKR=m +CONFIG_INPUT_UINPUT=m + +# +# Character devices +# +CONFIG_VT=y +CONFIG_VT_CONSOLE=y +CONFIG_HW_CONSOLE=y +CONFIG_ECC=m +# CONFIG_SERIAL_NONSTANDARD is not set + +# +# Serial drivers +# +CONFIG_SERIAL_8250=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_SERIAL_8250_NR_UARTS=4 +# CONFIG_SERIAL_8250_EXTENDED is not set + +# +# Non-8250 serial port support +# +CONFIG_SERIAL_CORE=y +CONFIG_SERIAL_CORE_CONSOLE=y +# CONFIG_SERIAL_PMACZILOG is not set +CONFIG_SERIAL_ICOM=m +CONFIG_SERIAL_JSM=m +CONFIG_UNIX98_PTYS=y +CONFIG_LEGACY_PTYS=y +CONFIG_LEGACY_PTY_COUNT=256 +# CONFIG_PRINTER is not set +# CONFIG_PPDEV is not set +# CONFIG_TIPAR is not set +CONFIG_HVC_CONSOLE=y +CONFIG_HVCS=m +# CONFIG_QIC02_TAPE is not set + +# +# IPMI +# +# CONFIG_IPMI_HANDLER is not set + +# +# Watchdog Cards +# +CONFIG_WATCHDOG=y +# CONFIG_WATCHDOG_NOWAYOUT is not set + +# +# Watchdog Device Drivers +# +CONFIG_SOFT_WATCHDOG=m + +# +# PCI-based Watchdog Cards +# +# CONFIG_PCIPCWATCHDOG is not set +# CONFIG_WDTPCI is not set + +# +# USB-based Watchdog Cards +# +# CONFIG_USBPCWATCHDOG is not set +# CONFIG_RTC is not set +# CONFIG_GEN_RTC is not set +# CONFIG_DTLK is not set +# CONFIG_R3964 is not set +# CONFIG_APPLICOM is not set + +# +# Ftape, the floppy tape device driver +# +# CONFIG_AGP is not set +# CONFIG_DRM is not set +CONFIG_RAW_DRIVER=m +CONFIG_MAX_RAW_DEVS=4096 + +# +# Linux InfraRed Controller +# +# CONFIG_LIRC_SUPPORT is not set +# CONFIG_LIRC_HOMEBREW is not set + +# +# I2C support +# +# CONFIG_I2C is not set + +# +# Misc devices +# + +# +# Multimedia devices +# +# CONFIG_VIDEO_DEV is not set + +# +# Digital Video Broadcasting Devices +# +# CONFIG_DVB is not set + +# +# Graphics support +# +CONFIG_FB=y +# CONFIG_FB_PM2 is not set +# CONFIG_FB_CYBER2000 is not set +CONFIG_FB_OF=y +# CONFIG_FB_CT65550 is not set +# CONFIG_FB_IMSTT is not set +# CONFIG_FB_S3TRIO is not set +# CONFIG_FB_VGA16 is not set +# CONFIG_FB_RIVA is not set 
+CONFIG_FB_MATROX=y +CONFIG_FB_MATROX_MILLENIUM=y +CONFIG_FB_MATROX_MYSTIQUE=y +CONFIG_FB_MATROX_G450=y +CONFIG_FB_MATROX_G100=y +CONFIG_FB_MATROX_MULTIHEAD=y +# CONFIG_FB_RADEON_OLD is not set +# CONFIG_FB_RADEON is not set +# CONFIG_FB_ATY128 is not set +# CONFIG_FB_ATY is not set +# CONFIG_FB_SIS is not set +# CONFIG_FB_NEOMAGIC is not set +# CONFIG_FB_KYRO is not set +# CONFIG_FB_3DFX is not set +# CONFIG_FB_VOODOO1 is not set +# CONFIG_FB_TRIDENT is not set +# CONFIG_FB_VIRTUAL is not set + +# +# Console display driver support +# +# CONFIG_VGA_CONSOLE is not set +# CONFIG_MDA_CONSOLE is not set +CONFIG_DUMMY_CONSOLE=y +CONFIG_FRAMEBUFFER_CONSOLE=y +CONFIG_PCI_CONSOLE=y +# CONFIG_FONTS is not set +CONFIG_FONT_8x8=y +CONFIG_FONT_8x16=y + +# +# Logo configuration +# +# CONFIG_LOGO is not set + +# +# Bootsplash configuration +# + +# +# Sound +# +# CONFIG_SOUND is not set + +# +# USB support +# +CONFIG_USB=m +# CONFIG_USB_DEBUG is not set + +# +# Miscellaneous USB options +# +CONFIG_USB_DEVICEFS=y +# CONFIG_USB_BANDWIDTH is not set +# CONFIG_USB_DYNAMIC_MINORS is not set + +# +# USB Host Controller Drivers +# +CONFIG_USB_EHCI_HCD=m +CONFIG_USB_EHCI_SPLIT_ISO=y +CONFIG_USB_EHCI_ROOT_HUB_TT=y +CONFIG_USB_OHCI_HCD=m +# CONFIG_USB_UHCI_HCD is not set + +# +# USB Device Class drivers +# +# CONFIG_USB_BLUETOOTH_TTY is not set +CONFIG_USB_ACM=m +CONFIG_USB_PRINTER=m +CONFIG_USB_STORAGE=m +# CONFIG_USB_STORAGE_DEBUG is not set +CONFIG_USB_STORAGE_DATAFAB=y +CONFIG_USB_STORAGE_FREECOM=y +CONFIG_USB_STORAGE_ISD200=y +CONFIG_USB_STORAGE_DPCM=y +CONFIG_USB_STORAGE_HP8200e=y +CONFIG_USB_STORAGE_SDDR09=y +CONFIG_USB_STORAGE_SDDR55=y +CONFIG_USB_STORAGE_JUMPSHOT=y + +# +# USB Human Interface Devices (HID) +# +CONFIG_USB_HID=m +CONFIG_USB_HIDINPUT=y +# CONFIG_HID_FF is not set +CONFIG_USB_HIDDEV=y + +# +# USB HID Boot Protocol drivers +# +# CONFIG_USB_KBD is not set +# CONFIG_USB_MOUSE is not set +CONFIG_USB_AIPTEK=m +CONFIG_USB_WACOM=m +CONFIG_USB_KBTAB=m +CONFIG_USB_POWERMATE=m +CONFIG_USB_MTOUCH=m +CONFIG_USB_XPAD=m +CONFIG_USB_ATI_REMOTE=m + +# +# USB Imaging devices +# +CONFIG_USB_MDC800=m +CONFIG_USB_MICROTEK=m +CONFIG_USB_HPUSBSCSI=m + +# +# USB Multimedia devices +# +# CONFIG_USB_DABUSB is not set + +# +# Video4Linux support is needed for USB Multimedia device support +# + +# +# USB Network adaptors +# +CONFIG_USB_CATC=m +CONFIG_USB_KAWETH=m +CONFIG_USB_PEGASUS=m +CONFIG_USB_RTL8150=m +CONFIG_USB_USBNET=m + +# +# USB Host-to-Host Cables +# +CONFIG_USB_ALI_M5632=y +CONFIG_USB_AN2720=y +CONFIG_USB_BELKIN=y +CONFIG_USB_GENESYS=y +CONFIG_USB_NET1080=y +CONFIG_USB_PL2301=y + +# +# Intelligent USB Devices/Gadgets +# +CONFIG_USB_ARMLINUX=y +CONFIG_USB_EPSON2888=y +CONFIG_USB_ZAURUS=y +CONFIG_USB_CDCETHER=y + +# +# USB Network Adapters +# +CONFIG_USB_AX8817X=y + +# +# USB port drivers +# +# CONFIG_USB_USS720 is not set + +# +# USB Serial Converter support +# +CONFIG_USB_SERIAL=m +CONFIG_USB_SERIAL_GENERIC=y +CONFIG_USB_SERIAL_BELKIN=m +CONFIG_USB_SERIAL_DIGI_ACCELEPORT=m +CONFIG_USB_SERIAL_EMPEG=m +CONFIG_USB_SERIAL_FTDI_SIO=m +CONFIG_USB_SERIAL_VISOR=m +CONFIG_USB_SERIAL_IPAQ=m +CONFIG_USB_SERIAL_IR=m +CONFIG_USB_SERIAL_EDGEPORT=m +CONFIG_USB_SERIAL_EDGEPORT_TI=m +CONFIG_USB_SERIAL_KEYSPAN_PDA=m +CONFIG_USB_SERIAL_KEYSPAN=m +CONFIG_USB_SERIAL_KEYSPAN_MPR=y +CONFIG_USB_SERIAL_KEYSPAN_USA28=y +CONFIG_USB_SERIAL_KEYSPAN_USA28X=y +CONFIG_USB_SERIAL_KEYSPAN_USA28XA=y +CONFIG_USB_SERIAL_KEYSPAN_USA28XB=y +CONFIG_USB_SERIAL_KEYSPAN_USA19=y +CONFIG_USB_SERIAL_KEYSPAN_USA18X=y +CONFIG_USB_SERIAL_KEYSPAN_USA19W=y 
+CONFIG_USB_SERIAL_KEYSPAN_USA19QW=y +CONFIG_USB_SERIAL_KEYSPAN_USA19QI=y +CONFIG_USB_SERIAL_KEYSPAN_USA49W=y +CONFIG_USB_SERIAL_KEYSPAN_USA49WLC=y +CONFIG_USB_SERIAL_KLSI=m +CONFIG_USB_SERIAL_KOBIL_SCT=m +CONFIG_USB_SERIAL_MCT_U232=m +CONFIG_USB_SERIAL_PL2303=m +CONFIG_USB_SERIAL_SAFE=m +CONFIG_USB_SERIAL_SAFE_PADDED=y +CONFIG_USB_SERIAL_CYBERJACK=m +CONFIG_USB_SERIAL_XIRCOM=m +CONFIG_USB_SERIAL_OMNINET=m +CONFIG_USB_EZUSB=y + +# +# USB Miscellaneous drivers +# +CONFIG_USB_EMI62=m +CONFIG_USB_EMI26=m +# CONFIG_USB_TIGL is not set +# CONFIG_USB_AUERSWALD is not set +# CONFIG_USB_RIO500 is not set +CONFIG_USB_LEGOTOWER=m +# CONFIG_USB_LCD is not set +CONFIG_USB_LED=m +CONFIG_USB_CYTHERM=m +# CONFIG_USB_TEST is not set + +# +# USB Gadget Support +# +# CONFIG_USB_GADGET is not set + +# +# InfiniBand support +# +CONFIG_INFINIBAND=m +CONFIG_INFINIBAND_IPOIB=m +# CONFIG_INFINIBAND_SDP is not set +# CONFIG_INFINIBAND_SRP is not set +# CONFIG_INFINIBAND_UDAPL_HELPER is not set +CONFIG_INFINIBAND_MELLANOX_HCA=m +CONFIG_AUDIT=m + +# +# File systems +# +CONFIG_EXT2_FS=y +CONFIG_EXT2_FS_XATTR=y +CONFIG_EXT2_FS_POSIX_ACL=y +CONFIG_EXT2_FS_SECURITY=y +CONFIG_EXT3_FS=y +CONFIG_EXT3_FS_XATTR=y +CONFIG_EXT3_FS_POSIX_ACL=y +CONFIG_EXT3_FS_SECURITY=y +CONFIG_JBD=y +# CONFIG_JBD_DEBUG is not set +CONFIG_FS_MBCACHE=y +CONFIG_REISERFS_FS=y +# CONFIG_REISERFS_CHECK is not set +CONFIG_REISERFS_PROC_INFO=y +CONFIG_REISERFS_FS_XATTR=y +CONFIG_REISERFS_FS_POSIX_ACL=y +CONFIG_REISERFS_FS_SECURITY=y +CONFIG_JFS_FS=m +CONFIG_JFS_POSIX_ACL=y +CONFIG_JFS_DMAPI=y +# CONFIG_JFS_DEBUG is not set +CONFIG_JFS_STATISTICS=y +CONFIG_FS_POSIX_ACL=y +CONFIG_XFS_FS=m +CONFIG_XFS_RT=y +CONFIG_XFS_QUOTA=m +CONFIG_XFS_DMAPI=y +CONFIG_XFS_SECURITY=y +CONFIG_XFS_POSIX_ACL=y +CONFIG_MINIX_FS=m +# CONFIG_ROMFS_FS is not set +CONFIG_DMAPI=m +# CONFIG_DMAPI_DEBUG is not set +CONFIG_QUOTA=y +CONFIG_QFMT_V1=m +CONFIG_QFMT_V2=m +CONFIG_QUOTACTL=y +CONFIG_AUTOFS_FS=y +# CONFIG_AUTOFS4_FS is not set + +# +# CD-ROM/DVD Filesystems +# +CONFIG_ISO9660_FS=y +CONFIG_JOLIET=y +CONFIG_ZISOFS=y +CONFIG_ZISOFS_FS=y +CONFIG_UDF_FS=m + +# +# DOS/FAT/NT Filesystems +# +CONFIG_FAT_FS=y +CONFIG_MSDOS_FS=y +CONFIG_VFAT_FS=y +# CONFIG_NTFS_FS is not set + +# +# Pseudo filesystems +# +CONFIG_PROC_FS=y +CONFIG_PROC_KCORE=y +# CONFIG_DEVFS_FS is not set +CONFIG_DEVPTS_FS_XATTR=y +CONFIG_DEVPTS_FS_SECURITY=y +CONFIG_TMPFS=y +CONFIG_HUGETLBFS=y +CONFIG_HUGETLB_PAGE=y +CONFIG_RAMFS=y +CONFIG_RELAYFS_FS=m +# CONFIG_KLOG_CHANNEL is not set + +# +# Miscellaneous filesystems +# +# CONFIG_ADFS_FS is not set +# CONFIG_AFFS_FS is not set +CONFIG_HFS_FS=m +CONFIG_HFSPLUS_FS=m +# CONFIG_BEFS_FS is not set +# CONFIG_BFS_FS is not set +# CONFIG_EFS_FS is not set +CONFIG_CRAMFS=y +# CONFIG_VXFS_FS is not set +# CONFIG_HPFS_FS is not set +# CONFIG_QNX4FS_FS is not set +# CONFIG_SYSV_FS is not set +CONFIG_UFS_FS=m +# CONFIG_UFS_FS_WRITE is not set + +# +# Network File Systems +# +CONFIG_NFS_FS=y +CONFIG_NFS_V3=y +CONFIG_NFS_ACL=y +CONFIG_NFS_V4=y +CONFIG_NFS_DIRECTIO=y +CONFIG_NFSD=m +CONFIG_NFSD_V3=y +CONFIG_NFSD_ACL=y +CONFIG_NFS_ACL_SUPPORT=y +# CONFIG_NFSD_V4 is not set +CONFIG_NFSD_TCP=y +CONFIG_LOCKD=y +CONFIG_STATD=y +CONFIG_LOCKD_V4=y +CONFIG_EXPORTFS=m +CONFIG_SUNRPC=y +CONFIG_SUNRPC_GSS=y +CONFIG_RPCSEC_GSS_KRB5=y +CONFIG_SMB_FS=m +# CONFIG_SMB_NLS_DEFAULT is not set +CONFIG_CIFS=m +CONFIG_CIFS_STATS=y +CONFIG_CIFS_XATTR=y +CONFIG_CIFS_POSIX=y +CONFIG_NCP_FS=m +CONFIG_NCPFS_PACKET_SIGNING=y +CONFIG_NCPFS_IOCTL_LOCKING=y +CONFIG_NCPFS_STRONG=y +CONFIG_NCPFS_NFS_NS=y 
+CONFIG_NCPFS_OS2_NS=y +CONFIG_NCPFS_SMALLDOS=y +CONFIG_NCPFS_NLS=y +CONFIG_NCPFS_EXTRAS=y +# CONFIG_CODA_FS is not set +# CONFIG_INTERMEZZO_FS is not set +# CONFIG_AFS_FS is not set + +# +# Partition Types +# +CONFIG_PARTITION_ADVANCED=y +# CONFIG_ACORN_PARTITION is not set +CONFIG_OSF_PARTITION=y +CONFIG_AMIGA_PARTITION=y +CONFIG_ATARI_PARTITION=y +CONFIG_MAC_PARTITION=y +CONFIG_MSDOS_PARTITION=y +CONFIG_BSD_DISKLABEL=y +CONFIG_MINIX_SUBPARTITION=y +CONFIG_SOLARIS_X86_PARTITION=y +CONFIG_UNIXWARE_DISKLABEL=y +CONFIG_LDM_PARTITION=y +# CONFIG_LDM_DEBUG is not set +CONFIG_NEC98_PARTITION=y +CONFIG_SGI_PARTITION=y +CONFIG_ULTRIX_PARTITION=y +CONFIG_SUN_PARTITION=y +CONFIG_EFI_PARTITION=y + +# +# Native Language Support +# +CONFIG_NLS=y +CONFIG_NLS_DEFAULT="iso8859-1" +CONFIG_NLS_CODEPAGE_437=m +CONFIG_NLS_CODEPAGE_737=m +CONFIG_NLS_CODEPAGE_775=m +CONFIG_NLS_CODEPAGE_850=m +CONFIG_NLS_CODEPAGE_852=m +CONFIG_NLS_CODEPAGE_855=m +CONFIG_NLS_CODEPAGE_857=m +CONFIG_NLS_CODEPAGE_860=m +CONFIG_NLS_CODEPAGE_861=m +CONFIG_NLS_CODEPAGE_862=m +CONFIG_NLS_CODEPAGE_863=m +CONFIG_NLS_CODEPAGE_864=m +CONFIG_NLS_CODEPAGE_865=m +CONFIG_NLS_CODEPAGE_866=m +CONFIG_NLS_CODEPAGE_869=m +CONFIG_NLS_CODEPAGE_936=m +CONFIG_NLS_CODEPAGE_950=m +CONFIG_NLS_CODEPAGE_932=m +CONFIG_NLS_CODEPAGE_949=m +CONFIG_NLS_CODEPAGE_874=m +CONFIG_NLS_ISO8859_8=m +CONFIG_NLS_CODEPAGE_1250=m +CONFIG_NLS_CODEPAGE_1251=m +CONFIG_NLS_ISO8859_1=m +CONFIG_NLS_ISO8859_2=m +CONFIG_NLS_ISO8859_3=m +CONFIG_NLS_ISO8859_4=m +CONFIG_NLS_ISO8859_5=m +CONFIG_NLS_ISO8859_6=m +CONFIG_NLS_ISO8859_7=m +CONFIG_NLS_ISO8859_9=m +CONFIG_NLS_ISO8859_13=m +CONFIG_NLS_ISO8859_14=m +CONFIG_NLS_ISO8859_15=m +CONFIG_NLS_KOI8_R=m +CONFIG_NLS_KOI8_U=m +CONFIG_NLS_UTF8=m +CONFIG_FSHOOKS=y + +# +# Profiling support +# +CONFIG_PROFILING=y +CONFIG_OPROFILE=y + +# +# Kernel hacking +# +CONFIG_KERNTYPES=y +CONFIG_CRASH_DUMP=m +CONFIG_CRASH_DUMP_BLOCKDEV=m +CONFIG_CRASH_DUMP_NETDEV=m +# CONFIG_CRASH_DUMP_MEMDEV is not set +# CONFIG_CRASH_DUMP_SOFTBOOT is not set +CONFIG_CRASH_DUMP_COMPRESS_RLE=m +CONFIG_CRASH_DUMP_COMPRESS_GZIP=m +CONFIG_DEBUG_KERNEL=y +CONFIG_DEBUG_STACKOVERFLOW=y +CONFIG_DEBUG_STACK_USAGE=y +# CONFIG_DEBUG_SLAB is not set +CONFIG_MAGIC_SYSRQ=y +CONFIG_DEBUGGER=y +CONFIG_XMON=y +# CONFIG_XMON_DEFAULT is not set +CONFIG_KDB=y +CONFIG_KDB_MODULES=y +CONFIG_KDB_OFF=y +# CONFIG_PPCDBG is not set +# CONFIG_DEBUG_INFO is not set +CONFIG_IRQSTACKS=y + +# +# Security options +# +CONFIG_SECURITY=y +CONFIG_SECURITY_NETWORK=y +CONFIG_SECURITY_CAPABILITIES=m +CONFIG_SECURITY_ROOTPLUG=m +CONFIG_SECURITY_SELINUX=y +CONFIG_SECURITY_SELINUX_BOOTPARAM=y +CONFIG_SECURITY_SELINUX_DEVELOP=y +CONFIG_SECURITY_SELINUX_MLS=y + +# +# IBM Crypto Hardware support +# +CONFIG_IBM_CRYPTO=m +CONFIG_ICA_LEEDSLITE=m + +# +# Cryptographic options +# +CONFIG_CRYPTO=y +CONFIG_CRYPTO_HMAC=y +CONFIG_CRYPTO_NULL=m +CONFIG_CRYPTO_MD4=m +CONFIG_CRYPTO_MD5=y +CONFIG_CRYPTO_SHA1=m +CONFIG_CRYPTO_SHA256=m +CONFIG_CRYPTO_SHA512=m +CONFIG_CRYPTO_DES=y +CONFIG_CRYPTO_BLOWFISH=m +CONFIG_CRYPTO_TWOFISH=m +CONFIG_CRYPTO_SERPENT=m +CONFIG_CRYPTO_AES=m +CONFIG_CRYPTO_CAST5=m +CONFIG_CRYPTO_CAST6=m +CONFIG_CRYPTO_ARC4=m +CONFIG_CRYPTO_DEFLATE=m +CONFIG_CRYPTO_MICHAEL_MIC=m +CONFIG_CRYPTO_TEST=m + +# +# Library routines +# +CONFIG_CRC32=y +CONFIG_QSORT=y +CONFIG_ZLIB_INFLATE=y +CONFIG_ZLIB_DEFLATE=m + +# +# Build options +# +CONFIG_SUSE_KERNEL=y +CONFIG_CFGNAME="pseries64" +CONFIG_RELEASE="SLES9_SP1_BRANCH_91" diff --git a/lustre/kernel_patches/patches/2.4.19-ext3.patch 
b/lustre/kernel_patches/patches/2.4.19-ext3.patch new file mode 100644 index 0000000..a167c6a --- /dev/null +++ b/lustre/kernel_patches/patches/2.4.19-ext3.patch @@ -0,0 +1,7892 @@ +diff -rup --new-file linux.mcp2/fs/ext3/Makefile linux_tmp/fs/ext3/Makefile +--- linux.mcp2/fs/ext3/Makefile 1969-12-31 16:00:00.000000000 -0800 ++++ linux_tmp/fs/ext3/Makefile 2001-12-21 09:41:55.000000000 -0800 +@@ -0,0 +1,16 @@ ++# ++# Makefile for the linux ext2-filesystem routines. ++# ++# Note! Dependencies are done automagically by 'make dep', which also ++# removes any old dependencies. DON'T put your own dependencies here ++# unless it's something special (ie not a .c file). ++# ++# Note 2! The CFLAGS definitions are now in the main makefile... ++ ++O_TARGET := ext3.o ++ ++obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ ++ ioctl.o namei.o super.o symlink.o ++obj-m := $(O_TARGET) ++ ++include $(TOPDIR)/Rules.make +diff -rup --new-file linux.mcp2/fs/ext3/balloc.c linux_tmp/fs/ext3/balloc.c +--- linux.mcp2/fs/ext3/balloc.c 1969-12-31 16:00:00.000000000 -0800 ++++ linux_tmp/fs/ext3/balloc.c 2002-08-02 17:39:45.000000000 -0700 +@@ -0,0 +1,999 @@ ++/* ++ * linux/fs/ext3/balloc.c ++ * ++ * Copyright (C) 1992, 1993, 1994, 1995 ++ * Remy Card (card@masi.ibp.fr) ++ * Laboratoire MASI - Institut Blaise Pascal ++ * Universite Pierre et Marie Curie (Paris VI) ++ * ++ * Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993 ++ * Big-endian to little-endian byte-swapping/bitmaps by ++ * David S. Miller (davem@caip.rutgers.edu), 1995 ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * balloc.c contains the blocks allocation and deallocation routines ++ */ ++ ++/* ++ * The free blocks are managed by bitmaps. A file system contains several ++ * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap ++ * block for inodes, N blocks for the inode table and data blocks. ++ * ++ * The file system contains group descriptors which are located after the ++ * super block. Each descriptor contains the number of the bitmap block and ++ * the free blocks count in the block. The descriptors are loaded in memory ++ * when a file system is mounted (see ext3_read_super). ++ */ ++ ++ ++#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) ++ ++struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb, ++ unsigned int block_group, ++ struct buffer_head ** bh) ++{ ++ unsigned long group_desc; ++ unsigned long desc; ++ struct ext3_group_desc * gdp; ++ ++ if (block_group >= sb->u.ext3_sb.s_groups_count) { ++ ext3_error (sb, "ext3_get_group_desc", ++ "block_group >= groups_count - " ++ "block_group = %d, groups_count = %lu", ++ block_group, sb->u.ext3_sb.s_groups_count); ++ ++ return NULL; ++ } ++ ++ group_desc = block_group / EXT3_DESC_PER_BLOCK(sb); ++ desc = block_group % EXT3_DESC_PER_BLOCK(sb); ++ if (!sb->u.ext3_sb.s_group_desc[group_desc]) { ++ ext3_error (sb, "ext3_get_group_desc", ++ "Group descriptor not loaded - " ++ "block_group = %d, group_desc = %lu, desc = %lu", ++ block_group, group_desc, desc); ++ return NULL; ++ } ++ ++ gdp = (struct ext3_group_desc *) ++ sb->u.ext3_sb.s_group_desc[group_desc]->b_data; ++ if (bh) ++ *bh = sb->u.ext3_sb.s_group_desc[group_desc]; ++ return gdp + desc; ++} ++ ++/* ++ * Read the bitmap for a given block_group, reading into the specified ++ * slot in the superblock's bitmap cache. ++ * ++ * Return >=0 on success or a -ve error code. 
++ */ ++ ++static int read_block_bitmap (struct super_block * sb, ++ unsigned int block_group, ++ unsigned long bitmap_nr) ++{ ++ struct ext3_group_desc * gdp; ++ struct buffer_head * bh = NULL; ++ int retval = -EIO; ++ ++ gdp = ext3_get_group_desc (sb, block_group, NULL); ++ if (!gdp) ++ goto error_out; ++ retval = 0; ++ bh = sb_bread(sb, le32_to_cpu(gdp->bg_block_bitmap)); ++ if (!bh) { ++ ext3_error (sb, "read_block_bitmap", ++ "Cannot read block bitmap - " ++ "block_group = %d, block_bitmap = %lu", ++ block_group, (unsigned long) gdp->bg_block_bitmap); ++ retval = -EIO; ++ } ++ /* ++ * On IO error, just leave a zero in the superblock's block pointer for ++ * this group. The IO will be retried next time. ++ */ ++error_out: ++ sb->u.ext3_sb.s_block_bitmap_number[bitmap_nr] = block_group; ++ sb->u.ext3_sb.s_block_bitmap[bitmap_nr] = bh; ++ return retval; ++} ++ ++/* ++ * load_block_bitmap loads the block bitmap for a blocks group ++ * ++ * It maintains a cache for the last bitmaps loaded. This cache is managed ++ * with a LRU algorithm. ++ * ++ * Notes: ++ * 1/ There is one cache per mounted file system. ++ * 2/ If the file system contains less than EXT3_MAX_GROUP_LOADED groups, ++ * this function reads the bitmap without maintaining a LRU cache. ++ * ++ * Return the slot used to store the bitmap, or a -ve error code. ++ */ ++static int __load_block_bitmap (struct super_block * sb, ++ unsigned int block_group) ++{ ++ int i, j, retval = 0; ++ unsigned long block_bitmap_number; ++ struct buffer_head * block_bitmap; ++ ++ if (block_group >= sb->u.ext3_sb.s_groups_count) ++ ext3_panic (sb, "load_block_bitmap", ++ "block_group >= groups_count - " ++ "block_group = %d, groups_count = %lu", ++ block_group, sb->u.ext3_sb.s_groups_count); ++ ++ if (sb->u.ext3_sb.s_groups_count <= EXT3_MAX_GROUP_LOADED) { ++ if (sb->u.ext3_sb.s_block_bitmap[block_group]) { ++ if (sb->u.ext3_sb.s_block_bitmap_number[block_group] == ++ block_group) ++ return block_group; ++ ext3_error (sb, "__load_block_bitmap", ++ "block_group != block_bitmap_number"); ++ } ++ retval = read_block_bitmap (sb, block_group, block_group); ++ if (retval < 0) ++ return retval; ++ return block_group; ++ } ++ ++ for (i = 0; i < sb->u.ext3_sb.s_loaded_block_bitmaps && ++ sb->u.ext3_sb.s_block_bitmap_number[i] != block_group; i++) ++ ; ++ if (i < sb->u.ext3_sb.s_loaded_block_bitmaps && ++ sb->u.ext3_sb.s_block_bitmap_number[i] == block_group) { ++ block_bitmap_number = sb->u.ext3_sb.s_block_bitmap_number[i]; ++ block_bitmap = sb->u.ext3_sb.s_block_bitmap[i]; ++ for (j = i; j > 0; j--) { ++ sb->u.ext3_sb.s_block_bitmap_number[j] = ++ sb->u.ext3_sb.s_block_bitmap_number[j - 1]; ++ sb->u.ext3_sb.s_block_bitmap[j] = ++ sb->u.ext3_sb.s_block_bitmap[j - 1]; ++ } ++ sb->u.ext3_sb.s_block_bitmap_number[0] = block_bitmap_number; ++ sb->u.ext3_sb.s_block_bitmap[0] = block_bitmap; ++ ++ /* ++ * There's still one special case here --- if block_bitmap == 0 ++ * then our last attempt to read the bitmap failed and we have ++ * just ended up caching that failure. Try again to read it. 
++ */ ++ if (!block_bitmap) ++ retval = read_block_bitmap (sb, block_group, 0); ++ } else { ++ if (sb->u.ext3_sb.s_loaded_block_bitmaps < EXT3_MAX_GROUP_LOADED) ++ sb->u.ext3_sb.s_loaded_block_bitmaps++; ++ else ++ brelse (sb->u.ext3_sb.s_block_bitmap ++ [EXT3_MAX_GROUP_LOADED - 1]); ++ for (j = sb->u.ext3_sb.s_loaded_block_bitmaps - 1; ++ j > 0; j--) { ++ sb->u.ext3_sb.s_block_bitmap_number[j] = ++ sb->u.ext3_sb.s_block_bitmap_number[j - 1]; ++ sb->u.ext3_sb.s_block_bitmap[j] = ++ sb->u.ext3_sb.s_block_bitmap[j - 1]; ++ } ++ retval = read_block_bitmap (sb, block_group, 0); ++ } ++ return retval; ++} ++ ++/* ++ * Load the block bitmap for a given block group. First of all do a couple ++ * of fast lookups for common cases and then pass the request onto the guts ++ * of the bitmap loader. ++ * ++ * Return the slot number of the group in the superblock bitmap cache's on ++ * success, or a -ve error code. ++ * ++ * There is still one inconsistency here --- if the number of groups in this ++ * filesystems is <= EXT3_MAX_GROUP_LOADED, then we have no way of ++ * differentiating between a group for which we have never performed a bitmap ++ * IO request, and a group for which the last bitmap read request failed. ++ */ ++static inline int load_block_bitmap (struct super_block * sb, ++ unsigned int block_group) ++{ ++ int slot; ++ ++ /* ++ * Do the lookup for the slot. First of all, check if we're asking ++ * for the same slot as last time, and did we succeed that last time? ++ */ ++ if (sb->u.ext3_sb.s_loaded_block_bitmaps > 0 && ++ sb->u.ext3_sb.s_block_bitmap_number[0] == block_group && ++ sb->u.ext3_sb.s_block_bitmap[0]) { ++ return 0; ++ } ++ /* ++ * Or can we do a fast lookup based on a loaded group on a filesystem ++ * small enough to be mapped directly into the superblock? ++ */ ++ else if (sb->u.ext3_sb.s_groups_count <= EXT3_MAX_GROUP_LOADED && ++ sb->u.ext3_sb.s_block_bitmap_number[block_group]==block_group ++ && sb->u.ext3_sb.s_block_bitmap[block_group]) { ++ slot = block_group; ++ } ++ /* ++ * If not, then do a full lookup for this block group. ++ */ ++ else { ++ slot = __load_block_bitmap (sb, block_group); ++ } ++ ++ /* ++ * <0 means we just got an error ++ */ ++ if (slot < 0) ++ return slot; ++ ++ /* ++ * If it's a valid slot, we may still have cached a previous IO error, ++ * in which case the bh in the superblock cache will be zero. ++ */ ++ if (!sb->u.ext3_sb.s_block_bitmap[slot]) ++ return -EIO; ++ ++ /* ++ * Must have been read in OK to get this far.
++ */ ++ return slot; ++} ++ ++/* Free given blocks, update quota and i_blocks field */ ++void ext3_free_blocks (handle_t *handle, struct inode * inode, ++ unsigned long block, unsigned long count) ++{ ++ struct buffer_head *bitmap_bh; ++ struct buffer_head *gd_bh; ++ unsigned long block_group; ++ unsigned long bit; ++ unsigned long i; ++ int bitmap_nr; ++ unsigned long overflow; ++ struct super_block * sb; ++ struct ext3_group_desc * gdp; ++ struct ext3_super_block * es; ++ int err = 0, ret; ++ int dquot_freed_blocks = 0; ++ ++ sb = inode->i_sb; ++ if (!sb) { ++ printk ("ext3_free_blocks: nonexistent device"); ++ return; ++ } ++ lock_super (sb); ++ es = sb->u.ext3_sb.s_es; ++ if (block < le32_to_cpu(es->s_first_data_block) || ++ (block + count) > le32_to_cpu(es->s_blocks_count)) { ++ ext3_error (sb, "ext3_free_blocks", ++ "Freeing blocks not in datazone - " ++ "block = %lu, count = %lu", block, count); ++ goto error_return; ++ } ++ ++ ext3_debug ("freeing block %lu\n", block); ++ ++do_more: ++ overflow = 0; ++ block_group = (block - le32_to_cpu(es->s_first_data_block)) / ++ EXT3_BLOCKS_PER_GROUP(sb); ++ bit = (block - le32_to_cpu(es->s_first_data_block)) % ++ EXT3_BLOCKS_PER_GROUP(sb); ++ /* ++ * Check to see if we are freeing blocks across a group ++ * boundary. ++ */ ++ if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) { ++ overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb); ++ count -= overflow; ++ } ++ bitmap_nr = load_block_bitmap (sb, block_group); ++ if (bitmap_nr < 0) ++ goto error_return; ++ ++ bitmap_bh = sb->u.ext3_sb.s_block_bitmap[bitmap_nr]; ++ gdp = ext3_get_group_desc (sb, block_group, &gd_bh); ++ if (!gdp) ++ goto error_return; ++ ++ if (in_range (le32_to_cpu(gdp->bg_block_bitmap), block, count) || ++ in_range (le32_to_cpu(gdp->bg_inode_bitmap), block, count) || ++ in_range (block, le32_to_cpu(gdp->bg_inode_table), ++ sb->u.ext3_sb.s_itb_per_group) || ++ in_range (block + count - 1, le32_to_cpu(gdp->bg_inode_table), ++ sb->u.ext3_sb.s_itb_per_group)) ++ ext3_error (sb, "ext3_free_blocks", ++ "Freeing blocks in system zones - " ++ "Block = %lu, count = %lu", ++ block, count); ++ ++ /* ++ * We are about to start releasing blocks in the bitmap, ++ * so we need undo access. ++ */ ++ /* @@@ check errors */ ++ BUFFER_TRACE(bitmap_bh, "getting undo access"); ++ err = ext3_journal_get_undo_access(handle, bitmap_bh); ++ if (err) ++ goto error_return; ++ ++ /* ++ * We are about to modify some metadata. Call the journal APIs ++ * to unshare ->b_data if a currently-committing transaction is ++ * using it ++ */ ++ BUFFER_TRACE(gd_bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, gd_bh); ++ if (err) ++ goto error_return; ++ ++ BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh); ++ if (err) ++ goto error_return; ++ ++ for (i = 0; i < count; i++) { ++ /* ++ * An HJ special. This is expensive... 
++ */ ++#ifdef CONFIG_JBD_DEBUG ++ { ++ struct buffer_head *debug_bh; ++ debug_bh = sb_get_hash_table(sb, block + i); ++ if (debug_bh) { ++ BUFFER_TRACE(debug_bh, "Deleted!"); ++ if (!bh2jh(bitmap_bh)->b_committed_data) ++ BUFFER_TRACE(debug_bh, ++ "No commited data in bitmap"); ++ BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap"); ++ __brelse(debug_bh); ++ } ++ } ++#endif ++ BUFFER_TRACE(bitmap_bh, "clear bit"); ++ if (!ext3_clear_bit (bit + i, bitmap_bh->b_data)) { ++ ext3_error (sb, __FUNCTION__, ++ "bit already cleared for block %lu", ++ block + i); ++ BUFFER_TRACE(bitmap_bh, "bit already cleared"); ++ } else { ++ dquot_freed_blocks++; ++ gdp->bg_free_blocks_count = ++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)+1); ++ es->s_free_blocks_count = ++ cpu_to_le32(le32_to_cpu(es->s_free_blocks_count)+1); ++ } ++ /* @@@ This prevents newly-allocated data from being ++ * freed and then reallocated within the same ++ * transaction. ++ * ++ * Ideally we would want to allow that to happen, but to ++ * do so requires making journal_forget() capable of ++ * revoking the queued write of a data block, which ++ * implies blocking on the journal lock. *forget() ++ * cannot block due to truncate races. ++ * ++ * Eventually we can fix this by making journal_forget() ++ * return a status indicating whether or not it was able ++ * to revoke the buffer. On successful revoke, it is ++ * safe not to set the allocation bit in the committed ++ * bitmap, because we know that there is no outstanding ++ * activity on the buffer any more and so it is safe to ++ * reallocate it. ++ */ ++ BUFFER_TRACE(bitmap_bh, "clear in b_committed_data"); ++ J_ASSERT_BH(bitmap_bh, ++ bh2jh(bitmap_bh)->b_committed_data != NULL); ++ ext3_set_bit(bit + i, bh2jh(bitmap_bh)->b_committed_data); ++ } ++ ++ /* We dirtied the bitmap block */ ++ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); ++ err = ext3_journal_dirty_metadata(handle, bitmap_bh); ++ ++ /* And the group descriptor block */ ++ BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); ++ ret = ext3_journal_dirty_metadata(handle, gd_bh); ++ if (!err) err = ret; ++ ++ /* And the superblock */ ++ BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "dirtied superblock"); ++ ret = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh); ++ if (!err) err = ret; ++ ++ if (overflow && !err) { ++ block += count; ++ count = overflow; ++ goto do_more; ++ } ++ sb->s_dirt = 1; ++error_return: ++ ext3_std_error(sb, err); ++ unlock_super(sb); ++ if (dquot_freed_blocks) ++ DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); ++ return; ++} ++ ++/* For ext3 allocations, we must not reuse any blocks which are ++ * allocated in the bitmap buffer's "last committed data" copy. This ++ * prevents deletes from freeing up the page for reuse until we have ++ * committed the delete transaction. ++ * ++ * If we didn't do this, then deleting something and reallocating it as ++ * data would allow the old block to be overwritten before the ++ * transaction committed (because we force data to disk before commit). ++ * This would lead to corruption if we crashed between overwriting the ++ * data and committing the delete. ++ * ++ * @@@ We may want to make this allocation behaviour conditional on ++ * data-writes at some point, and disable it for metadata allocations or ++ * sync-data inodes. 
++ */ ++static int ext3_test_allocatable(int nr, struct buffer_head *bh) ++{ ++ if (ext3_test_bit(nr, bh->b_data)) ++ return 0; ++ if (!buffer_jbd(bh) || !bh2jh(bh)->b_committed_data) ++ return 1; ++ return !ext3_test_bit(nr, bh2jh(bh)->b_committed_data); ++} ++ ++/* ++ * Find an allocatable block in a bitmap. We honour both the bitmap and ++ * its last-committed copy (if that exists), and perform the "most ++ * appropriate allocation" algorithm of looking for a free block near ++ * the initial goal; then for a free byte somewhere in the bitmap; then ++ * for any free bit in the bitmap. ++ */ ++static int find_next_usable_block(int start, ++ struct buffer_head *bh, int maxblocks) ++{ ++ int here, next; ++ char *p, *r; ++ ++ if (start > 0) { ++ /* ++ * The goal was occupied; search forward for a free ++ * block within the next XX blocks. ++ * ++ * end_goal is more or less random, but it has to be ++ * less than EXT3_BLOCKS_PER_GROUP. Aligning up to the ++ * next 64-bit boundary is simple.. ++ */ ++ int end_goal = (start + 63) & ~63; ++ here = ext3_find_next_zero_bit(bh->b_data, end_goal, start); ++ if (here < end_goal && ext3_test_allocatable(here, bh)) ++ return here; ++ ++ ext3_debug ("Bit not found near goal\n"); ++ ++ } ++ ++ here = start; ++ if (here < 0) ++ here = 0; ++ ++ /* ++ * There has been no free block found in the near vicinity of ++ * the goal: do a search forward through the block groups, ++ * searching in each group first for an entire free byte in the ++ * bitmap and then for any free bit. ++ * ++ * Search first in the remainder of the current group ++ */ ++ p = ((char *) bh->b_data) + (here >> 3); ++ r = memscan(p, 0, (maxblocks - here + 7) >> 3); ++ next = (r - ((char *) bh->b_data)) << 3; ++ ++ if (next < maxblocks && ext3_test_allocatable(next, bh)) ++ return next; ++ ++ /* The bitmap search --- search forward alternately ++ * through the actual bitmap and the last-committed copy ++ * until we find a bit free in both. */ ++ ++ while (here < maxblocks) { ++ next = ext3_find_next_zero_bit ((unsigned long *) bh->b_data, ++ maxblocks, here); ++ if (next >= maxblocks) ++ return -1; ++ if (ext3_test_allocatable(next, bh)) ++ return next; ++ ++ J_ASSERT_BH(bh, bh2jh(bh)->b_committed_data); ++ here = ext3_find_next_zero_bit ++ ((unsigned long *) bh2jh(bh)->b_committed_data, ++ maxblocks, next); ++ } ++ return -1; ++} ++ ++/* ++ * ext3_new_block uses a goal block to assist allocation. If the goal is ++ * free, or there is a free block within 32 blocks of the goal, that block ++ * is allocated. Otherwise a forward search is made for a free block; within ++ * each block group the search first looks for an entire free byte in the block ++ * bitmap, and then for any free bit if that fails. ++ * This function also updates quota and i_blocks field. ++ */ ++int ext3_new_block (handle_t *handle, struct inode * inode, ++ unsigned long goal, u32 * prealloc_count, ++ u32 * prealloc_block, int * errp) ++{ ++ struct buffer_head * bh, *bhtmp; ++ struct buffer_head * bh2; ++#if 0 ++ char * p, * r; ++#endif ++ int i, j, k, tmp, alloctmp; ++ int bitmap_nr; ++ int fatal = 0, err; ++ int performed_allocation = 0; ++ struct super_block * sb; ++ struct ext3_group_desc * gdp; ++ struct ext3_super_block * es; ++#ifdef EXT3FS_DEBUG ++ static int goal_hits = 0, goal_attempts = 0; ++#endif ++ *errp = -ENOSPC; ++ sb = inode->i_sb; ++ if (!sb) { ++ printk ("ext3_new_block: nonexistent device"); ++ return 0; ++ } ++ ++ /* ++ * Check quota for allocation of this block. 
++ */ ++ if (DQUOT_ALLOC_BLOCK(inode, 1)) { ++ *errp = -EDQUOT; ++ return 0; ++ } ++ ++ lock_super (sb); ++ es = sb->u.ext3_sb.s_es; ++ if (le32_to_cpu(es->s_free_blocks_count) <= ++ le32_to_cpu(es->s_r_blocks_count) && ++ ((sb->u.ext3_sb.s_resuid != current->fsuid) && ++ (sb->u.ext3_sb.s_resgid == 0 || ++ !in_group_p (sb->u.ext3_sb.s_resgid)) && ++ !capable(CAP_SYS_RESOURCE))) ++ goto out; ++ ++ ext3_debug ("goal=%lu.\n", goal); ++ ++ /* ++ * First, test whether the goal block is free. ++ */ ++ if (goal < le32_to_cpu(es->s_first_data_block) || ++ goal >= le32_to_cpu(es->s_blocks_count)) ++ goal = le32_to_cpu(es->s_first_data_block); ++ i = (goal - le32_to_cpu(es->s_first_data_block)) / ++ EXT3_BLOCKS_PER_GROUP(sb); ++ gdp = ext3_get_group_desc (sb, i, &bh2); ++ if (!gdp) ++ goto io_error; ++ ++ if (le16_to_cpu(gdp->bg_free_blocks_count) > 0) { ++ j = ((goal - le32_to_cpu(es->s_first_data_block)) % ++ EXT3_BLOCKS_PER_GROUP(sb)); ++#ifdef EXT3FS_DEBUG ++ if (j) ++ goal_attempts++; ++#endif ++ bitmap_nr = load_block_bitmap (sb, i); ++ if (bitmap_nr < 0) ++ goto io_error; ++ ++ bh = sb->u.ext3_sb.s_block_bitmap[bitmap_nr]; ++ ++ ext3_debug ("goal is at %d:%d.\n", i, j); ++ ++ if (ext3_test_allocatable(j, bh)) { ++#ifdef EXT3FS_DEBUG ++ goal_hits++; ++ ext3_debug ("goal bit allocated.\n"); ++#endif ++ goto got_block; ++ } ++ ++ j = find_next_usable_block(j, bh, EXT3_BLOCKS_PER_GROUP(sb)); ++ if (j >= 0) ++ goto search_back; ++ } ++ ++ ext3_debug ("Bit not found in block group %d.\n", i); ++ ++ /* ++ * Now search the rest of the groups. We assume that ++ * i and gdp correctly point to the last group visited. ++ */ ++ for (k = 0; k < sb->u.ext3_sb.s_groups_count; k++) { ++ i++; ++ if (i >= sb->u.ext3_sb.s_groups_count) ++ i = 0; ++ gdp = ext3_get_group_desc (sb, i, &bh2); ++ if (!gdp) { ++ *errp = -EIO; ++ goto out; ++ } ++ if (le16_to_cpu(gdp->bg_free_blocks_count) > 0) { ++ bitmap_nr = load_block_bitmap (sb, i); ++ if (bitmap_nr < 0) ++ goto io_error; ++ ++ bh = sb->u.ext3_sb.s_block_bitmap[bitmap_nr]; ++ j = find_next_usable_block(-1, bh, ++ EXT3_BLOCKS_PER_GROUP(sb)); ++ if (j >= 0) ++ goto search_back; ++ } ++ } ++ ++ /* No space left on the device */ ++ goto out; ++ ++search_back: ++ /* ++ * We have succeeded in finding a free byte in the block ++ * bitmap. Now search backwards up to 7 bits to find the ++ * start of this group of free blocks. ++ */ ++ for ( k = 0; ++ k < 7 && j > 0 && ext3_test_allocatable(j - 1, bh); ++ k++, j--) ++ ; ++ ++got_block: ++ ++ ext3_debug ("using block group %d(%d)\n", i, gdp->bg_free_blocks_count); ++ ++ /* Make sure we use undo access for the bitmap, because it is ++ critical that we do the frozen_data COW on bitmap buffers in ++ all cases even if the buffer is in BJ_Forget state in the ++ committing transaction. 
*/ ++ BUFFER_TRACE(bh, "get undo access for marking new block"); ++ fatal = ext3_journal_get_undo_access(handle, bh); ++ if (fatal) goto out; ++ ++ BUFFER_TRACE(bh2, "get_write_access"); ++ fatal = ext3_journal_get_write_access(handle, bh2); ++ if (fatal) goto out; ++ ++ BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access"); ++ fatal = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh); ++ if (fatal) goto out; ++ ++ tmp = j + i * EXT3_BLOCKS_PER_GROUP(sb) ++ + le32_to_cpu(es->s_first_data_block); ++ ++ if (tmp == le32_to_cpu(gdp->bg_block_bitmap) || ++ tmp == le32_to_cpu(gdp->bg_inode_bitmap) || ++ in_range (tmp, le32_to_cpu(gdp->bg_inode_table), ++ sb->u.ext3_sb.s_itb_per_group)) ++ ext3_error (sb, "ext3_new_block", ++ "Allocating block in system zone - " ++ "block = %u", tmp); ++ ++ /* The superblock lock should guard against anybody else beating ++ * us to this point! */ ++ J_ASSERT_BH(bh, !ext3_test_bit(j, bh->b_data)); ++ BUFFER_TRACE(bh, "setting bitmap bit"); ++ ext3_set_bit(j, bh->b_data); ++ performed_allocation = 1; ++ ++#ifdef CONFIG_JBD_DEBUG ++ { ++ struct buffer_head *debug_bh; ++ ++ /* Record bitmap buffer state in the newly allocated block */ ++ debug_bh = sb_get_hash_table(sb, tmp); ++ if (debug_bh) { ++ BUFFER_TRACE(debug_bh, "state when allocated"); ++ BUFFER_TRACE2(debug_bh, bh, "bitmap state"); ++ brelse(debug_bh); ++ } ++ } ++#endif ++ if (buffer_jbd(bh) && bh2jh(bh)->b_committed_data) ++ J_ASSERT_BH(bh, !ext3_test_bit(j, bh2jh(bh)->b_committed_data)); ++ bhtmp = bh; ++ alloctmp = j; ++ ++ ext3_debug ("found bit %d\n", j); ++ ++ /* ++ * Do block preallocation now if required. ++ */ ++#ifdef EXT3_PREALLOCATE ++ /* ++ * akpm: this is not enabled for ext3. Need to use ++ * ext3_test_allocatable() ++ */ ++ /* Writer: ->i_prealloc* */ ++ if (prealloc_count && !*prealloc_count) { ++ int prealloc_goal; ++ unsigned long next_block = tmp + 1; ++ ++ prealloc_goal = es->s_prealloc_blocks ? ++ es->s_prealloc_blocks : EXT3_DEFAULT_PREALLOC_BLOCKS; ++ ++ *prealloc_block = next_block; ++ /* Writer: end */ ++ for (k = 1; ++ k < prealloc_goal && (j + k) < EXT3_BLOCKS_PER_GROUP(sb); ++ k++, next_block++) { ++ if (DQUOT_PREALLOC_BLOCK(inode, 1)) ++ break; ++ /* Writer: ->i_prealloc* */ ++ if (*prealloc_block + *prealloc_count != next_block || ++ ext3_set_bit (j + k, bh->b_data)) { ++ /* Writer: end */ ++ DQUOT_FREE_BLOCK(inode, 1); ++ break; ++ } ++ (*prealloc_count)++; ++ /* Writer: end */ ++ } ++ /* ++ * As soon as we go for per-group spinlocks we'll need these ++ * done inside the loop above. ++ */ ++ gdp->bg_free_blocks_count = ++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) - ++ (k - 1)); ++ es->s_free_blocks_count = ++ cpu_to_le32(le32_to_cpu(es->s_free_blocks_count) - ++ (k - 1)); ++ ext3_debug ("Preallocated a further %lu bits.\n", ++ (k - 1)); ++ } ++#endif ++ ++ j = tmp; ++ ++ BUFFER_TRACE(bh, "journal_dirty_metadata for bitmap block"); ++ err = ext3_journal_dirty_metadata(handle, bh); ++ if (!fatal) fatal = err; ++ ++ if (j >= le32_to_cpu(es->s_blocks_count)) { ++ ext3_error (sb, "ext3_new_block", ++ "block(%d) >= blocks count(%d) - " ++ "block_group = %d, es == %p ",j, ++ le32_to_cpu(es->s_blocks_count), i, es); ++ goto out; ++ } ++ ++ /* ++ * It is up to the caller to add the new buffer to a journal ++ * list of some description. We don't know in advance whether ++ * the caller wants to use it as metadata or data. ++ */ ++ ++ ext3_debug ("allocating block %d. 
" ++ "Goal hits %d of %d.\n", j, goal_hits, goal_attempts); ++ ++ gdp->bg_free_blocks_count = ++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) - 1); ++ es->s_free_blocks_count = ++ cpu_to_le32(le32_to_cpu(es->s_free_blocks_count) - 1); ++ ++ BUFFER_TRACE(bh2, "journal_dirty_metadata for group descriptor"); ++ err = ext3_journal_dirty_metadata(handle, bh2); ++ if (!fatal) fatal = err; ++ ++ BUFFER_TRACE(bh, "journal_dirty_metadata for superblock"); ++ err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh); ++ if (!fatal) fatal = err; ++ ++ sb->s_dirt = 1; ++ if (fatal) ++ goto out; ++ ++ unlock_super (sb); ++ *errp = 0; ++ return j; ++ ++io_error: ++ *errp = -EIO; ++out: ++ if (fatal) { ++ *errp = fatal; ++ ext3_std_error(sb, fatal); ++ } ++ unlock_super (sb); ++ /* ++ * Undo the block allocation ++ */ ++ if (!performed_allocation) ++ DQUOT_FREE_BLOCK(inode, 1); ++ return 0; ++ ++} ++ ++unsigned long ext3_count_free_blocks (struct super_block * sb) ++{ ++#ifdef EXT3FS_DEBUG ++ struct ext3_super_block * es; ++ unsigned long desc_count, bitmap_count, x; ++ int bitmap_nr; ++ struct ext3_group_desc * gdp; ++ int i; ++ ++ lock_super (sb); ++ es = sb->u.ext3_sb.s_es; ++ desc_count = 0; ++ bitmap_count = 0; ++ gdp = NULL; ++ for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) { ++ gdp = ext3_get_group_desc (sb, i, NULL); ++ if (!gdp) ++ continue; ++ desc_count += le16_to_cpu(gdp->bg_free_blocks_count); ++ bitmap_nr = load_block_bitmap (sb, i); ++ if (bitmap_nr < 0) ++ continue; ++ ++ x = ext3_count_free (sb->u.ext3_sb.s_block_bitmap[bitmap_nr], ++ sb->s_blocksize); ++ printk ("group %d: stored = %d, counted = %lu\n", ++ i, le16_to_cpu(gdp->bg_free_blocks_count), x); ++ bitmap_count += x; ++ } ++ printk("ext3_count_free_blocks: stored = %lu, computed = %lu, %lu\n", ++ le32_to_cpu(es->s_free_blocks_count), desc_count, bitmap_count); ++ unlock_super (sb); ++ return bitmap_count; ++#else ++ return le32_to_cpu(sb->u.ext3_sb.s_es->s_free_blocks_count); ++#endif ++} ++ ++static inline int block_in_use (unsigned long block, ++ struct super_block * sb, ++ unsigned char * map) ++{ ++ return ext3_test_bit ((block - ++ le32_to_cpu(sb->u.ext3_sb.s_es->s_first_data_block)) % ++ EXT3_BLOCKS_PER_GROUP(sb), map); ++} ++ ++static inline int test_root(int a, int b) ++{ ++ if (a == 0) ++ return 1; ++ while (1) { ++ if (a == 1) ++ return 1; ++ if (a % b) ++ return 0; ++ a = a / b; ++ } ++} ++ ++int ext3_group_sparse(int group) ++{ ++ return (test_root(group, 3) || test_root(group, 5) || ++ test_root(group, 7)); ++} ++ ++/** ++ * ext3_bg_has_super - number of blocks used by the superblock in group ++ * @sb: superblock for filesystem ++ * @group: group number to check ++ * ++ * Return the number of blocks used by the superblock (primary or backup) ++ * in this group. Currently this will be only 0 or 1. ++ */ ++int ext3_bg_has_super(struct super_block *sb, int group) ++{ ++ if (EXT3_HAS_RO_COMPAT_FEATURE(sb,EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)&& ++ !ext3_group_sparse(group)) ++ return 0; ++ return 1; ++} ++ ++/** ++ * ext3_bg_num_gdb - number of blocks used by the group table in group ++ * @sb: superblock for filesystem ++ * @group: group number to check ++ * ++ * Return the number of blocks used by the group descriptor table ++ * (primary or backup) in this group. In the future there may be a ++ * different number of descriptor blocks in each group. 
++ */ ++unsigned long ext3_bg_num_gdb(struct super_block *sb, int group) ++{ ++ if (EXT3_HAS_RO_COMPAT_FEATURE(sb,EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)&& ++ !ext3_group_sparse(group)) ++ return 0; ++ return EXT3_SB(sb)->s_gdb_count; ++} ++ ++#ifdef CONFIG_EXT3_CHECK ++/* Called at mount-time, super-block is locked */ ++void ext3_check_blocks_bitmap (struct super_block * sb) ++{ ++ struct buffer_head * bh; ++ struct ext3_super_block * es; ++ unsigned long desc_count, bitmap_count, x, j; ++ unsigned long desc_blocks; ++ int bitmap_nr; ++ struct ext3_group_desc * gdp; ++ int i; ++ ++ es = sb->u.ext3_sb.s_es; ++ desc_count = 0; ++ bitmap_count = 0; ++ gdp = NULL; ++ for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) { ++ gdp = ext3_get_group_desc (sb, i, NULL); ++ if (!gdp) ++ continue; ++ desc_count += le16_to_cpu(gdp->bg_free_blocks_count); ++ bitmap_nr = load_block_bitmap (sb, i); ++ if (bitmap_nr < 0) ++ continue; ++ ++ bh = EXT3_SB(sb)->s_block_bitmap[bitmap_nr]; ++ ++ if (ext3_bg_has_super(sb, i) && !ext3_test_bit(0, bh->b_data)) ++ ext3_error(sb, __FUNCTION__, ++ "Superblock in group %d is marked free", i); ++ ++ desc_blocks = ext3_bg_num_gdb(sb, i); ++ for (j = 0; j < desc_blocks; j++) ++ if (!ext3_test_bit(j + 1, bh->b_data)) ++ ext3_error(sb, __FUNCTION__, ++ "Descriptor block #%ld in group " ++ "%d is marked free", j, i); ++ ++ if (!block_in_use (le32_to_cpu(gdp->bg_block_bitmap), ++ sb, bh->b_data)) ++ ext3_error (sb, "ext3_check_blocks_bitmap", ++ "Block bitmap for group %d is marked free", ++ i); ++ ++ if (!block_in_use (le32_to_cpu(gdp->bg_inode_bitmap), ++ sb, bh->b_data)) ++ ext3_error (sb, "ext3_check_blocks_bitmap", ++ "Inode bitmap for group %d is marked free", ++ i); ++ ++ for (j = 0; j < sb->u.ext3_sb.s_itb_per_group; j++) ++ if (!block_in_use (le32_to_cpu(gdp->bg_inode_table) + j, ++ sb, bh->b_data)) ++ ext3_error (sb, "ext3_check_blocks_bitmap", ++ "Block #%d of the inode table in " ++ "group %d is marked free", j, i); ++ ++ x = ext3_count_free (bh, sb->s_blocksize); ++ if (le16_to_cpu(gdp->bg_free_blocks_count) != x) ++ ext3_error (sb, "ext3_check_blocks_bitmap", ++ "Wrong free blocks count for group %d, " ++ "stored = %d, counted = %lu", i, ++ le16_to_cpu(gdp->bg_free_blocks_count), x); ++ bitmap_count += x; ++ } ++ if (le32_to_cpu(es->s_free_blocks_count) != bitmap_count) ++ ext3_error (sb, "ext3_check_blocks_bitmap", ++ "Wrong free blocks count in super block, " ++ "stored = %lu, counted = %lu", ++ (unsigned long)le32_to_cpu(es->s_free_blocks_count), ++ bitmap_count); ++} ++#endif +diff -rup --new-file linux.mcp2/fs/ext3/bitmap.c linux_tmp/fs/ext3/bitmap.c +--- linux.mcp2/fs/ext3/bitmap.c 1969-12-31 16:00:00.000000000 -0800 ++++ linux_tmp/fs/ext3/bitmap.c 2001-11-09 14:25:04.000000000 -0800 +@@ -0,0 +1,26 @@ ++/* ++ * linux/fs/ext3/bitmap.c ++ * ++ * Copyright (C) 1992, 1993, 1994, 1995 ++ * Remy Card (card@masi.ibp.fr) ++ * Laboratoire MASI - Institut Blaise Pascal ++ * Universite Pierre et Marie Curie (Paris VI) ++ */ ++ ++#include ++ ++ ++static int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; ++ ++unsigned long ext3_count_free (struct buffer_head * map, unsigned int numchars) ++{ ++ unsigned int i; ++ unsigned long sum = 0; ++ ++ if (!map) ++ return (0); ++ for (i = 0; i < numchars; i++) ++ sum += nibblemap[map->b_data[i] & 0xf] + ++ nibblemap[(map->b_data[i] >> 4) & 0xf]; ++ return (sum); ++} +diff -rup --new-file linux.mcp2/fs/ext3/dir.c linux_tmp/fs/ext3/dir.c +--- linux.mcp2/fs/ext3/dir.c 1969-12-31 16:00:00.000000000 -0800 ++++ 
linux_tmp/fs/ext3/dir.c 2001-11-09 14:25:04.000000000 -0800 +@@ -0,0 +1,190 @@ ++/* ++ * linux/fs/ext3/dir.c ++ * ++ * Copyright (C) 1992, 1993, 1994, 1995 ++ * Remy Card (card@masi.ibp.fr) ++ * Laboratoire MASI - Institut Blaise Pascal ++ * Universite Pierre et Marie Curie (Paris VI) ++ * ++ * from ++ * ++ * linux/fs/minix/dir.c ++ * ++ * Copyright (C) 1991, 1992 Linus Torvalds ++ * ++ * ext3 directory handling functions ++ * ++ * Big-endian to little-endian byte-swapping/bitmaps by ++ * David S. Miller (davem@caip.rutgers.edu), 1995 ++ */ ++ ++#include ++#include ++#include ++ ++static unsigned char ext3_filetype_table[] = { ++ DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK ++}; ++ ++static int ext3_readdir(struct file *, void *, filldir_t); ++ ++struct file_operations ext3_dir_operations = { ++ read: generic_read_dir, ++ readdir: ext3_readdir, /* BKL held */ ++ ioctl: ext3_ioctl, /* BKL held */ ++ fsync: ext3_sync_file, /* BKL held */ ++}; ++ ++int ext3_check_dir_entry (const char * function, struct inode * dir, ++ struct ext3_dir_entry_2 * de, ++ struct buffer_head * bh, ++ unsigned long offset) ++{ ++ const char * error_msg = NULL; ++ const int rlen = le16_to_cpu(de->rec_len); ++ ++ if (rlen < EXT3_DIR_REC_LEN(1)) ++ error_msg = "rec_len is smaller than minimal"; ++ else if (rlen % 4 != 0) ++ error_msg = "rec_len % 4 != 0"; ++ else if (rlen < EXT3_DIR_REC_LEN(de->name_len)) ++ error_msg = "rec_len is too small for name_len"; ++ else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize) ++ error_msg = "directory entry across blocks"; ++ else if (le32_to_cpu(de->inode) > ++ le32_to_cpu(dir->i_sb->u.ext3_sb.s_es->s_inodes_count)) ++ error_msg = "inode out of bounds"; ++ ++ if (error_msg != NULL) ++ ext3_error (dir->i_sb, function, ++ "bad entry in directory #%lu: %s - " ++ "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", ++ dir->i_ino, error_msg, offset, ++ (unsigned long) le32_to_cpu(de->inode), ++ rlen, de->name_len); ++ return error_msg == NULL ? 1 : 0; ++} ++ ++static int ext3_readdir(struct file * filp, ++ void * dirent, filldir_t filldir) ++{ ++ int error = 0; ++ unsigned long offset, blk; ++ int i, num, stored; ++ struct buffer_head * bh, * tmp, * bha[16]; ++ struct ext3_dir_entry_2 * de; ++ struct super_block * sb; ++ int err; ++ struct inode *inode = filp->f_dentry->d_inode; ++ ++ sb = inode->i_sb; ++ ++ stored = 0; ++ bh = NULL; ++ offset = filp->f_pos & (sb->s_blocksize - 1); ++ ++ while (!error && !stored && filp->f_pos < inode->i_size) { ++ blk = (filp->f_pos) >> EXT3_BLOCK_SIZE_BITS(sb); ++ bh = ext3_bread (0, inode, blk, 0, &err); ++ if (!bh) { ++ ext3_error (sb, "ext3_readdir", ++ "directory #%lu contains a hole at offset %lu", ++ inode->i_ino, (unsigned long)filp->f_pos); ++ filp->f_pos += sb->s_blocksize - offset; ++ continue; ++ } ++ ++ /* ++ * Do the readahead ++ */ ++ if (!offset) { ++ for (i = 16 >> (EXT3_BLOCK_SIZE_BITS(sb) - 9), num = 0; ++ i > 0; i--) { ++ tmp = ext3_getblk (NULL, inode, ++blk, 0, &err); ++ if (tmp && !buffer_uptodate(tmp) && ++ !buffer_locked(tmp)) ++ bha[num++] = tmp; ++ else ++ brelse (tmp); ++ } ++ if (num) { ++ ll_rw_block (READA, num, bha); ++ for (i = 0; i < num; i++) ++ brelse (bha[i]); ++ } ++ } ++ ++revalidate: ++ /* If the dir block has changed since the last call to ++ * readdir(2), then we might be pointing to an invalid ++ * dirent right now. Scan from the start of the block ++ * to make sure. 
*/ ++ if (filp->f_version != inode->i_version) { ++ for (i = 0; i < sb->s_blocksize && i < offset; ) { ++ de = (struct ext3_dir_entry_2 *) ++ (bh->b_data + i); ++ /* It's too expensive to do a full ++ * dirent test each time round this ++ * loop, but we do have to test at ++ * least that it is non-zero. A ++ * failure will be detected in the ++ * dirent test below. */ ++ if (le16_to_cpu(de->rec_len) < ++ EXT3_DIR_REC_LEN(1)) ++ break; ++ i += le16_to_cpu(de->rec_len); ++ } ++ offset = i; ++ filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) ++ | offset; ++ filp->f_version = inode->i_version; ++ } ++ ++ while (!error && filp->f_pos < inode->i_size ++ && offset < sb->s_blocksize) { ++ de = (struct ext3_dir_entry_2 *) (bh->b_data + offset); ++ if (!ext3_check_dir_entry ("ext3_readdir", inode, de, ++ bh, offset)) { ++ /* On error, skip the f_pos to the ++ next block. */ ++ filp->f_pos = (filp->f_pos | ++ (sb->s_blocksize - 1)) + 1; ++ brelse (bh); ++ return stored; ++ } ++ offset += le16_to_cpu(de->rec_len); ++ if (le32_to_cpu(de->inode)) { ++ /* We might block in the next section ++ * if the data destination is ++ * currently swapped out. So, use a ++ * version stamp to detect whether or ++ * not the directory has been modified ++ * during the copy operation. ++ */ ++ unsigned long version = filp->f_version; ++ unsigned char d_type = DT_UNKNOWN; ++ ++ if (EXT3_HAS_INCOMPAT_FEATURE(sb, ++ EXT3_FEATURE_INCOMPAT_FILETYPE) ++ && de->file_type < EXT3_FT_MAX) ++ d_type = ++ ext3_filetype_table[de->file_type]; ++ error = filldir(dirent, de->name, ++ de->name_len, ++ filp->f_pos, ++ le32_to_cpu(de->inode), ++ d_type); ++ if (error) ++ break; ++ if (version != filp->f_version) ++ goto revalidate; ++ stored ++; ++ } ++ filp->f_pos += le16_to_cpu(de->rec_len); ++ } ++ offset = 0; ++ brelse (bh); ++ } ++ UPDATE_ATIME(inode); ++ return 0; ++} +diff -rup --new-file linux.mcp2/fs/ext3/file.c linux_tmp/fs/ext3/file.c +--- linux.mcp2/fs/ext3/file.c 1969-12-31 16:00:00.000000000 -0800 ++++ linux_tmp/fs/ext3/file.c 2001-11-15 13:37:55.000000000 -0800 +@@ -0,0 +1,94 @@ ++/* ++ * linux/fs/ext3/file.c ++ * ++ * Copyright (C) 1992, 1993, 1994, 1995 ++ * Remy Card (card@masi.ibp.fr) ++ * Laboratoire MASI - Institut Blaise Pascal ++ * Universite Pierre et Marie Curie (Paris VI) ++ * ++ * from ++ * ++ * linux/fs/minix/file.c ++ * ++ * Copyright (C) 1991, 1992 Linus Torvalds ++ * ++ * ext3 fs regular file handling primitives ++ * ++ * 64-bit file support on 64-bit platforms by Jakub Jelinek ++ * (jj@sunsite.ms.mff.cuni.cz) ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * Called when an inode is released. Note that this is different ++ * from ext3_file_open: open gets called at every open, but release ++ * gets called only when /all/ the files are closed. ++ */ ++static int ext3_release_file (struct inode * inode, struct file * filp) ++{ ++ if (filp->f_mode & FMODE_WRITE) ++ ext3_discard_prealloc (inode); ++ return 0; ++} ++ ++/* ++ * Called when an inode is about to be opened. ++ * We use this to disallow opening RW large files on 32bit systems if ++ * the caller didn't specify O_LARGEFILE. On 64bit systems we force ++ * on this flag in sys_open. ++ */ ++static int ext3_open_file (struct inode * inode, struct file * filp) ++{ ++ if (!(filp->f_flags & O_LARGEFILE) && ++ inode->i_size > 0x7FFFFFFFLL) ++ return -EFBIG; ++ return 0; ++} ++ ++/* ++ * ext3_file_write(). ++ * ++ * Most things are done in ext3_prepare_write() and ext3_commit_write(). 
++ */ ++ ++static ssize_t ++ext3_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos) ++{ ++ struct inode *inode = file->f_dentry->d_inode; ++ ++ /* ++ * Nasty: if the file is subject to synchronous writes then we need ++ * to force generic_osync_inode() to call ext3_write_inode(). ++ * We do that by marking the inode dirty. This adds much more ++ * computational expense than we need, but we're going to sync ++ * anyway. ++ */ ++ if (IS_SYNC(inode) || (file->f_flags & O_SYNC)) ++ mark_inode_dirty(inode); ++ ++ return generic_file_write(file, buf, count, ppos); ++} ++ ++struct file_operations ext3_file_operations = { ++ llseek: generic_file_llseek, /* BKL held */ ++ read: generic_file_read, /* BKL not held. Don't need */ ++ write: ext3_file_write, /* BKL not held. Don't need */ ++ ioctl: ext3_ioctl, /* BKL held */ ++ mmap: generic_file_mmap, ++ open: ext3_open_file, /* BKL not held. Don't need */ ++ release: ext3_release_file, /* BKL not held. Don't need */ ++ fsync: ext3_sync_file, /* BKL held */ ++}; ++ ++struct inode_operations ext3_file_inode_operations = { ++ truncate: ext3_truncate, /* BKL held */ ++ setattr: ext3_setattr, /* BKL held */ ++}; ++ +diff -rup --new-file linux.mcp2/fs/ext3/fsync.c linux_tmp/fs/ext3/fsync.c +--- linux.mcp2/fs/ext3/fsync.c 1969-12-31 16:00:00.000000000 -0800 ++++ linux_tmp/fs/ext3/fsync.c 2001-11-20 21:34:13.000000000 -0800 +@@ -0,0 +1,70 @@ ++/* ++ * linux/fs/ext3/fsync.c ++ * ++ * Copyright (C) 1993 Stephen Tweedie (sct@redhat.com) ++ * from ++ * Copyright (C) 1992 Remy Card (card@masi.ibp.fr) ++ * Laboratoire MASI - Institut Blaise Pascal ++ * Universite Pierre et Marie Curie (Paris VI) ++ * from ++ * linux/fs/minix/truncate.c Copyright (C) 1991, 1992 Linus Torvalds ++ * ++ * ext3fs fsync primitive ++ * ++ * Big-endian to little-endian byte-swapping/bitmaps by ++ * David S. Miller (davem@caip.rutgers.edu), 1995 ++ * ++ * Removed unnecessary code duplication for little endian machines ++ * and excessive __inline__s. ++ * Andi Kleen, 1997 ++ * ++ * Major simplications and cleanup - we only need to do the metadata, because ++ * we can depend on generic_block_fdatasync() to sync the data blocks. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * akpm: A new design for ext3_sync_file(). ++ * ++ * This is only called from sys_fsync(), sys_fdatasync() and sys_msync(). ++ * There cannot be a transaction open by this task. (AKPM: quotas?) ++ * Another task could have dirtied this inode. Its data can be in any ++ * state in the journalling system. ++ * ++ * What we do is just kick off a commit and wait on it. This will snapshot the ++ * inode to disk. ++ * ++ * Note that there is a serious optimisation we can make here: if the current ++ * inode is not part of j_running_transaction or j_committing_transaction ++ * then we have nothing to do. That would require implementation of t_ilist, ++ * which isn't too hard. ++ */ ++ ++int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync) ++{ ++ struct inode *inode = dentry->d_inode; ++ int ret; ++ ++ J_ASSERT(ext3_journal_current_handle() == 0); ++ ++ /* ++ * fsync_inode_buffers() just walks i_dirty_buffers and waits ++ * on them. It's a no-op for full data journalling because ++ * i_dirty_buffers will be ampty. ++ * Really, we only need to start I/O on the dirty buffers - ++ * we'll end up waiting on them in commit. 
++ */ ++ ret = fsync_inode_buffers(inode); ++ ret |= fsync_inode_data_buffers(inode); ++ ++ ext3_force_commit(inode->i_sb); ++ ++ return ret; ++} +diff -rup --new-file linux.mcp2/fs/ext3/ialloc.c linux_tmp/fs/ext3/ialloc.c +--- linux.mcp2/fs/ext3/ialloc.c 1969-12-31 16:00:00.000000000 -0800 ++++ linux_tmp/fs/ext3/ialloc.c 2002-02-25 11:38:08.000000000 -0800 +@@ -0,0 +1,663 @@ ++/* ++ * linux/fs/ext3/ialloc.c ++ * ++ * Copyright (C) 1992, 1993, 1994, 1995 ++ * Remy Card (card@masi.ibp.fr) ++ * Laboratoire MASI - Institut Blaise Pascal ++ * Universite Pierre et Marie Curie (Paris VI) ++ * ++ * BSD ufs-inspired inode and directory allocation by ++ * Stephen Tweedie (sct@redhat.com), 1993 ++ * Big-endian to little-endian byte-swapping/bitmaps by ++ * David S. Miller (davem@caip.rutgers.edu), 1995 ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++/* ++ * ialloc.c contains the inode allocation and deallocation routines ++ */ ++ ++/* ++ * The free inodes are managed by bitmaps. A file system contains several ++ * block groups. Each group contains 1 bitmap block for blocks, 1 bitmap ++ * block for inodes, N blocks for the inode table and data blocks. ++ * ++ * The file system contains group descriptors which are located after the ++ * super block. Each descriptor contains the number of the bitmap block and ++ * the free blocks count in the block. The descriptors are loaded in memory ++ * when a file system is mounted (see ext3_read_super). ++ */ ++ ++ ++/* ++ * Read the inode allocation bitmap for a given block_group, reading ++ * into the specified slot in the superblock's bitmap cache. ++ * ++ * Return >=0 on success or a -ve error code. ++ */ ++static int read_inode_bitmap (struct super_block * sb, ++ unsigned long block_group, ++ unsigned int bitmap_nr) ++{ ++ struct ext3_group_desc * gdp; ++ struct buffer_head * bh = NULL; ++ int retval = 0; ++ ++ gdp = ext3_get_group_desc (sb, block_group, NULL); ++ if (!gdp) { ++ retval = -EIO; ++ goto error_out; ++ } ++ bh = sb_bread(sb, le32_to_cpu(gdp->bg_inode_bitmap)); ++ if (!bh) { ++ ext3_error (sb, "read_inode_bitmap", ++ "Cannot read inode bitmap - " ++ "block_group = %lu, inode_bitmap = %lu", ++ block_group, (unsigned long) gdp->bg_inode_bitmap); ++ retval = -EIO; ++ } ++ /* ++ * On IO error, just leave a zero in the superblock's block pointer for ++ * this group. The IO will be retried next time. ++ */ ++error_out: ++ sb->u.ext3_sb.s_inode_bitmap_number[bitmap_nr] = block_group; ++ sb->u.ext3_sb.s_inode_bitmap[bitmap_nr] = bh; ++ return retval; ++} ++ ++/* ++ * load_inode_bitmap loads the inode bitmap for a block group ++ * ++ * It maintains a cache for the last bitmaps loaded. This cache is managed ++ * with an LRU algorithm. ++ * ++ * Notes: ++ * 1/ There is one cache per mounted file system. ++ * 2/ If the file system contains less than EXT3_MAX_GROUP_LOADED groups, ++ * this function reads the bitmap without maintaining an LRU cache. ++ * ++ * Return the slot used to store the bitmap, or a -ve error code.
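++ *
++ * Illustrative example (slot numbers assumed, not normative): after
++ * loading groups 3, 5 and then 3 again, slot 0 holds group 3's bitmap
++ * and slot 1 holds group 5's; once every slot is in use, a miss
++ * evicts the bitmap in the last (least recently used) slot.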
++ */ ++static int load_inode_bitmap (struct super_block * sb, ++ unsigned int block_group) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ unsigned long inode_bitmap_number; ++ struct buffer_head * inode_bitmap; ++ int i, j, retval = 0; ++ ++ if (block_group >= sbi->s_groups_count) ++ ext3_panic (sb, "load_inode_bitmap", ++ "block_group >= groups_count - " ++ "block_group = %d, groups_count = %lu", ++ block_group, sbi->s_groups_count); ++ if (sbi->s_loaded_inode_bitmaps > 0 && ++ sbi->s_inode_bitmap_number[0] == block_group && ++ sbi->s_inode_bitmap[0] != NULL) ++ return 0; ++ if (sbi->s_groups_count <= EXT3_MAX_GROUP_LOADED) { ++ if (sbi->s_inode_bitmap[block_group]) { ++ if (sbi->s_inode_bitmap_number[block_group] != ++ block_group) ++ ext3_panic(sb, "load_inode_bitmap", ++ "block_group != inode_bitmap_number"); ++ return block_group; ++ } ++ retval = read_inode_bitmap(sb, block_group, block_group); ++ if (retval < 0) ++ return retval; ++ return block_group; ++ } ++ ++ for (i = 0; i < sbi->s_loaded_inode_bitmaps && ++ sbi->s_inode_bitmap_number[i] != block_group; i++) ++ /* do nothing */; ++ if (i < sbi->s_loaded_inode_bitmaps && ++ sbi->s_inode_bitmap_number[i] == block_group) { ++ inode_bitmap_number = sbi->s_inode_bitmap_number[i]; ++ inode_bitmap = sbi->s_inode_bitmap[i]; ++ for (j = i; j > 0; j--) { ++ sbi->s_inode_bitmap_number[j] = ++ sbi->s_inode_bitmap_number[j - 1]; ++ sbi->s_inode_bitmap[j] = sbi->s_inode_bitmap[j - 1]; ++ } ++ sbi->s_inode_bitmap_number[0] = inode_bitmap_number; ++ sbi->s_inode_bitmap[0] = inode_bitmap; ++ ++ /* ++ * There's still one special case here --- if inode_bitmap == 0 ++ * then our last attempt to read the bitmap failed and we have ++ * just ended up caching that failure. Try again to read it. ++ */ ++ if (!inode_bitmap) ++ retval = read_inode_bitmap (sb, block_group, 0); ++ } else { ++ if (sbi->s_loaded_inode_bitmaps < EXT3_MAX_GROUP_LOADED) ++ sbi->s_loaded_inode_bitmaps++; ++ else ++ brelse(sbi->s_inode_bitmap[EXT3_MAX_GROUP_LOADED - 1]); ++ for (j = sbi->s_loaded_inode_bitmaps - 1; j > 0; j--) { ++ sbi->s_inode_bitmap_number[j] = ++ sbi->s_inode_bitmap_number[j - 1]; ++ sbi->s_inode_bitmap[j] = sbi->s_inode_bitmap[j - 1]; ++ } ++ retval = read_inode_bitmap (sb, block_group, 0); ++ } ++ return retval; ++} ++ ++/* ++ * NOTE! When we get the inode, we're the only people ++ * that have access to it, and as such there are no ++ * race conditions we have to worry about. The inode ++ * is not on the hash-lists, and it cannot be reached ++ * through the filesystem because the directory entry ++ * has been deleted earlier. ++ * ++ * HOWEVER: we must make sure that we get no aliases, ++ * which means that we have to call "clear_inode()" ++ * _before_ we mark the inode not in use in the inode ++ * bitmaps. Otherwise a newly created file might use ++ * the same inode number (not actually the same pointer ++ * though), and then we'd have two inodes sharing the ++ * same inode number and space on the harddisk. 
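++ *
++ * In short, the order below is deliberately
++ *
++ *	clear_inode(inode);		- drop the in-core inode first
++ *	ext3_clear_bit(bit, ...);	- then free the inode number
++ *
++ * doing it the other way around would let ext3_new_inode() hand the
++ * number out again while an in-core alias still exists.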
++ */ ++void ext3_free_inode (handle_t *handle, struct inode * inode) ++{ ++ struct super_block * sb = inode->i_sb; ++ int is_directory; ++ unsigned long ino; ++ struct buffer_head * bh; ++ struct buffer_head * bh2; ++ unsigned long block_group; ++ unsigned long bit; ++ int bitmap_nr; ++ struct ext3_group_desc * gdp; ++ struct ext3_super_block * es; ++ int fatal = 0, err; ++ ++ if (!inode->i_dev) { ++ printk ("ext3_free_inode: inode has no device\n"); ++ return; ++ } ++ if (atomic_read(&inode->i_count) > 1) { ++ printk ("ext3_free_inode: inode has count=%d\n", ++ atomic_read(&inode->i_count)); ++ return; ++ } ++ if (inode->i_nlink) { ++ printk ("ext3_free_inode: inode has nlink=%d\n", ++ inode->i_nlink); ++ return; ++ } ++ if (!sb) { ++ printk("ext3_free_inode: inode on nonexistent device\n"); ++ return; ++ } ++ ++ ino = inode->i_ino; ++ ext3_debug ("freeing inode %lu\n", ino); ++ ++ /* ++ * Note: we must free any quota before locking the superblock, ++ * as writing the quota to disk may need the lock as well. ++ */ ++ DQUOT_INIT(inode); ++ DQUOT_FREE_INODE(inode); ++ DQUOT_DROP(inode); ++ ++ is_directory = S_ISDIR(inode->i_mode); ++ ++ /* Do this BEFORE marking the inode not in use or returning an error */ ++ clear_inode (inode); ++ ++ lock_super (sb); ++ es = sb->u.ext3_sb.s_es; ++ if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { ++ ext3_error (sb, "ext3_free_inode", ++ "reserved or nonexistent inode %lu", ino); ++ goto error_return; ++ } ++ block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb); ++ bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb); ++ bitmap_nr = load_inode_bitmap (sb, block_group); ++ if (bitmap_nr < 0) ++ goto error_return; ++ ++ bh = sb->u.ext3_sb.s_inode_bitmap[bitmap_nr]; ++ ++ BUFFER_TRACE(bh, "get_write_access"); ++ fatal = ext3_journal_get_write_access(handle, bh); ++ if (fatal) ++ goto error_return; ++ ++ /* Ok, now we can actually update the inode bitmaps.. */ ++ if (!ext3_clear_bit (bit, bh->b_data)) ++ ext3_error (sb, "ext3_free_inode", ++ "bit already cleared for inode %lu", ino); ++ else { ++ gdp = ext3_get_group_desc (sb, block_group, &bh2); ++ ++ BUFFER_TRACE(bh2, "get_write_access"); ++ fatal = ext3_journal_get_write_access(handle, bh2); ++ if (fatal) goto error_return; ++ ++ BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get write access"); ++ fatal = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh); ++ if (fatal) goto error_return; ++ ++ if (gdp) { ++ gdp->bg_free_inodes_count = cpu_to_le16( ++ le16_to_cpu(gdp->bg_free_inodes_count) + 1); ++ if (is_directory) ++ gdp->bg_used_dirs_count = cpu_to_le16( ++ le16_to_cpu(gdp->bg_used_dirs_count) - 1); ++ } ++ BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata"); ++ err = ext3_journal_dirty_metadata(handle, bh2); ++ if (!fatal) fatal = err; ++ es->s_free_inodes_count = ++ cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) + 1); ++ BUFFER_TRACE(sb->u.ext3_sb.s_sbh, ++ "call ext3_journal_dirty_metadata"); ++ err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh); ++ if (!fatal) fatal = err; ++ } ++ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); ++ err = ext3_journal_dirty_metadata(handle, bh); ++ if (!fatal) ++ fatal = err; ++ sb->s_dirt = 1; ++error_return: ++ ext3_std_error(sb, fatal); ++ unlock_super(sb); ++} ++ ++/* ++ * There are two policies for allocating an inode. 
If the new inode is ++ * a directory, then a forward search is made for a block group with both ++ * free space and a low directory-to-inode ratio; if that fails, then of ++ * the groups with above-average free space, that group with the fewest ++ * directories already is chosen. ++ * ++ * For other inodes, search forward from the parent directory's block ++ * group to find a free inode. ++ */ ++struct inode * ext3_new_inode (handle_t *handle, ++ const struct inode * dir, int mode) ++{ ++ struct super_block * sb; ++ struct buffer_head * bh; ++ struct buffer_head * bh2; ++ int i, j, avefreei; ++ struct inode * inode; ++ int bitmap_nr; ++ struct ext3_group_desc * gdp; ++ struct ext3_group_desc * tmp; ++ struct ext3_super_block * es; ++ int err = 0; ++ ++ /* Cannot create files in a deleted directory */ ++ if (!dir || !dir->i_nlink) ++ return ERR_PTR(-EPERM); ++ ++ sb = dir->i_sb; ++ inode = new_inode(sb); ++ if (!inode) ++ return ERR_PTR(-ENOMEM); ++ init_rwsem(&inode->u.ext3_i.truncate_sem); ++ ++ lock_super (sb); ++ es = sb->u.ext3_sb.s_es; ++repeat: ++ gdp = NULL; ++ i = 0; ++ ++ if (S_ISDIR(mode)) { ++ avefreei = le32_to_cpu(es->s_free_inodes_count) / ++ sb->u.ext3_sb.s_groups_count; ++ if (!gdp) { ++ for (j = 0; j < sb->u.ext3_sb.s_groups_count; j++) { ++ struct buffer_head *temp_buffer; ++ tmp = ext3_get_group_desc (sb, j, &temp_buffer); ++ if (tmp && ++ le16_to_cpu(tmp->bg_free_inodes_count) && ++ le16_to_cpu(tmp->bg_free_inodes_count) >= ++ avefreei) { ++ if (!gdp || (le16_to_cpu(tmp->bg_free_blocks_count) > ++ le16_to_cpu(gdp->bg_free_blocks_count))) { ++ i = j; ++ gdp = tmp; ++ bh2 = temp_buffer; ++ } ++ } ++ } ++ } ++ } else { ++ /* ++ * Try to place the inode in its parent directory ++ */ ++ i = dir->u.ext3_i.i_block_group; ++ tmp = ext3_get_group_desc (sb, i, &bh2); ++ if (tmp && le16_to_cpu(tmp->bg_free_inodes_count)) ++ gdp = tmp; ++ else ++ { ++ /* ++ * Use a quadratic hash to find a group with a ++ * free inode ++ */ ++ for (j = 1; j < sb->u.ext3_sb.s_groups_count; j <<= 1) { ++ i += j; ++ if (i >= sb->u.ext3_sb.s_groups_count) ++ i -= sb->u.ext3_sb.s_groups_count; ++ tmp = ext3_get_group_desc (sb, i, &bh2); ++ if (tmp && ++ le16_to_cpu(tmp->bg_free_inodes_count)) { ++ gdp = tmp; ++ break; ++ } ++ } ++ } ++ if (!gdp) { ++ /* ++ * That failed: try linear search for a free inode ++ */ ++ i = dir->u.ext3_i.i_block_group + 1; ++ for (j = 2; j < sb->u.ext3_sb.s_groups_count; j++) { ++ if (++i >= sb->u.ext3_sb.s_groups_count) ++ i = 0; ++ tmp = ext3_get_group_desc (sb, i, &bh2); ++ if (tmp && ++ le16_to_cpu(tmp->bg_free_inodes_count)) { ++ gdp = tmp; ++ break; ++ } ++ } ++ } ++ } ++ ++ err = -ENOSPC; ++ if (!gdp) ++ goto fail; ++ ++ err = -EIO; ++ bitmap_nr = load_inode_bitmap (sb, i); ++ if (bitmap_nr < 0) ++ goto fail; ++ ++ bh = sb->u.ext3_sb.s_inode_bitmap[bitmap_nr]; ++ ++ if ((j = ext3_find_first_zero_bit ((unsigned long *) bh->b_data, ++ EXT3_INODES_PER_GROUP(sb))) < ++ EXT3_INODES_PER_GROUP(sb)) { ++ BUFFER_TRACE(bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, bh); ++ if (err) goto fail; ++ ++ if (ext3_set_bit (j, bh->b_data)) { ++ ext3_error (sb, "ext3_new_inode", ++ "bit already set for inode %d", j); ++ goto repeat; ++ } ++ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); ++ err = ext3_journal_dirty_metadata(handle, bh); ++ if (err) goto fail; ++ } else { ++ if (le16_to_cpu(gdp->bg_free_inodes_count) != 0) { ++ ext3_error (sb, "ext3_new_inode", ++ "Free inodes count corrupted in group %d", ++ i); ++ /* Is it really ENOSPC? 
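++ * The descriptor claimed free inodes but the bitmap is full;
++ * zeroing bg_free_inodes_count below makes the two agree
++ * again before we go back and try another group.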
*/ ++ err = -ENOSPC; ++ if (sb->s_flags & MS_RDONLY) ++ goto fail; ++ ++ BUFFER_TRACE(bh2, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, bh2); ++ if (err) goto fail; ++ gdp->bg_free_inodes_count = 0; ++ BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata"); ++ err = ext3_journal_dirty_metadata(handle, bh2); ++ if (err) goto fail; ++ } ++ goto repeat; ++ } ++ j += i * EXT3_INODES_PER_GROUP(sb) + 1; ++ if (j < EXT3_FIRST_INO(sb) || j > le32_to_cpu(es->s_inodes_count)) { ++ ext3_error (sb, "ext3_new_inode", ++ "reserved inode or inode > inodes count - " ++ "block_group = %d,inode=%d", i, j); ++ err = -EIO; ++ goto fail; ++ } ++ ++ BUFFER_TRACE(bh2, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, bh2); ++ if (err) goto fail; ++ gdp->bg_free_inodes_count = ++ cpu_to_le16(le16_to_cpu(gdp->bg_free_inodes_count) - 1); ++ if (S_ISDIR(mode)) ++ gdp->bg_used_dirs_count = ++ cpu_to_le16(le16_to_cpu(gdp->bg_used_dirs_count) + 1); ++ BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata"); ++ err = ext3_journal_dirty_metadata(handle, bh2); ++ if (err) goto fail; ++ ++ BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh); ++ if (err) goto fail; ++ es->s_free_inodes_count = ++ cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) - 1); ++ BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "call ext3_journal_dirty_metadata"); ++ err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh); ++ sb->s_dirt = 1; ++ if (err) goto fail; ++ ++ inode->i_uid = current->fsuid; ++ if (test_opt (sb, GRPID)) ++ inode->i_gid = dir->i_gid; ++ else if (dir->i_mode & S_ISGID) { ++ inode->i_gid = dir->i_gid; ++ if (S_ISDIR(mode)) ++ mode |= S_ISGID; ++ } else ++ inode->i_gid = current->fsgid; ++ inode->i_mode = mode; ++ ++ inode->i_ino = j; ++ /* This is the optimal IO size (for stat), not the fs block size */ ++ inode->i_blksize = PAGE_SIZE; ++ inode->i_blocks = 0; ++ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; ++ inode->u.ext3_i.i_flags = dir->u.ext3_i.i_flags & ~EXT3_INDEX_FL; ++ if (S_ISLNK(mode)) ++ inode->u.ext3_i.i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL); ++#ifdef EXT3_FRAGMENTS ++ inode->u.ext3_i.i_faddr = 0; ++ inode->u.ext3_i.i_frag_no = 0; ++ inode->u.ext3_i.i_frag_size = 0; ++#endif ++ inode->u.ext3_i.i_file_acl = 0; ++ inode->u.ext3_i.i_dir_acl = 0; ++ inode->u.ext3_i.i_dtime = 0; ++ INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan); ++#ifdef EXT3_PREALLOCATE ++ inode->u.ext3_i.i_prealloc_count = 0; ++#endif ++ inode->u.ext3_i.i_block_group = i; ++ ++ if (inode->u.ext3_i.i_flags & EXT3_SYNC_FL) ++ inode->i_flags |= S_SYNC; ++ if (IS_SYNC(inode)) ++ handle->h_sync = 1; ++ insert_inode_hash(inode); ++ inode->i_generation = sb->u.ext3_sb.s_next_generation++; ++ ++ inode->u.ext3_i.i_state = EXT3_STATE_NEW; ++ err = ext3_mark_inode_dirty(handle, inode); ++ if (err) goto fail; ++ ++ unlock_super (sb); ++ if(DQUOT_ALLOC_INODE(inode)) { ++ DQUOT_DROP(inode); ++ inode->i_flags |= S_NOQUOTA; ++ inode->i_nlink = 0; ++ iput(inode); ++ return ERR_PTR(-EDQUOT); ++ } ++ ext3_debug ("allocating inode %lu\n", inode->i_ino); ++ return inode; ++ ++fail: ++ unlock_super(sb); ++ iput(inode); ++ ext3_std_error(sb, err); ++ return ERR_PTR(err); ++} ++ ++/* Verify that we are loading a valid orphan from disk */ ++struct inode *ext3_orphan_get (struct super_block * sb, ino_t ino) ++{ ++ ino_t max_ino = le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count); ++ unsigned long block_group; ++ int bit; ++ int bitmap_nr; ++ struct buffer_head 
*bh; ++ struct inode *inode = NULL; ++ ++ /* Error cases - e2fsck has already cleaned up for us */ ++ if (ino > max_ino) { ++ ext3_warning(sb, __FUNCTION__, ++ "bad orphan ino %ld! e2fsck was run?\n", ino); ++ return NULL; ++ } ++ ++ block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb); ++ bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb); ++ if ((bitmap_nr = load_inode_bitmap(sb, block_group)) < 0 || ++ !(bh = EXT3_SB(sb)->s_inode_bitmap[bitmap_nr])) { ++ ext3_warning(sb, __FUNCTION__, ++ "inode bitmap error for orphan %ld\n", ino); ++ return NULL; ++ } ++ ++ /* Having the inode bit set should be a 100% indicator that this ++ * is a valid orphan (no e2fsck run on fs). Orphans also include ++ * inodes that were being truncated, so we can't check i_nlink==0. ++ */ ++ if (!ext3_test_bit(bit, bh->b_data) || !(inode = iget(sb, ino)) || ++ is_bad_inode(inode) || NEXT_ORPHAN(inode) > max_ino) { ++ ext3_warning(sb, __FUNCTION__, ++ "bad orphan inode %ld! e2fsck was run?\n", ino); ++ printk(KERN_NOTICE "ext3_test_bit(bit=%d, block=%ld) = %d\n", ++ bit, bh->b_blocknr, ext3_test_bit(bit, bh->b_data)); ++ printk(KERN_NOTICE "inode=%p\n", inode); ++ if (inode) { ++ printk(KERN_NOTICE "is_bad_inode(inode)=%d\n", ++ is_bad_inode(inode)); ++ printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%d\n", ++ NEXT_ORPHAN(inode)); ++ printk(KERN_NOTICE "max_ino=%ld\n", max_ino); ++ } ++ /* Avoid freeing blocks if we got a bad deleted inode */ ++ if (inode && inode->i_nlink == 0) ++ inode->i_blocks = 0; ++ iput(inode); ++ return NULL; ++ } ++ ++ return inode; ++} ++ ++unsigned long ext3_count_free_inodes (struct super_block * sb) ++{ ++#ifdef EXT3FS_DEBUG ++ struct ext3_super_block * es; ++ unsigned long desc_count, bitmap_count, x; ++ int bitmap_nr; ++ struct ext3_group_desc * gdp; ++ int i; ++ ++ lock_super (sb); ++ es = sb->u.ext3_sb.s_es; ++ desc_count = 0; ++ bitmap_count = 0; ++ gdp = NULL; ++ for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) { ++ gdp = ext3_get_group_desc (sb, i, NULL); ++ if (!gdp) ++ continue; ++ desc_count += le16_to_cpu(gdp->bg_free_inodes_count); ++ bitmap_nr = load_inode_bitmap (sb, i); ++ if (bitmap_nr < 0) ++ continue; ++ ++ x = ext3_count_free (sb->u.ext3_sb.s_inode_bitmap[bitmap_nr], ++ EXT3_INODES_PER_GROUP(sb) / 8); ++ printk ("group %d: stored = %d, counted = %lu\n", ++ i, le16_to_cpu(gdp->bg_free_inodes_count), x); ++ bitmap_count += x; ++ } ++ printk("ext3_count_free_inodes: stored = %lu, computed = %lu, %lu\n", ++ le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count); ++ unlock_super (sb); ++ return desc_count; ++#else ++ return le32_to_cpu(sb->u.ext3_sb.s_es->s_free_inodes_count); ++#endif ++} ++ ++#ifdef CONFIG_EXT3_CHECK ++/* Called at mount-time, super-block is locked */ ++void ext3_check_inodes_bitmap (struct super_block * sb) ++{ ++ struct ext3_super_block * es; ++ unsigned long desc_count, bitmap_count, x; ++ int bitmap_nr; ++ struct ext3_group_desc * gdp; ++ int i; ++ ++ es = sb->u.ext3_sb.s_es; ++ desc_count = 0; ++ bitmap_count = 0; ++ gdp = NULL; ++ for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) { ++ gdp = ext3_get_group_desc (sb, i, NULL); ++ if (!gdp) ++ continue; ++ desc_count += le16_to_cpu(gdp->bg_free_inodes_count); ++ bitmap_nr = load_inode_bitmap (sb, i); ++ if (bitmap_nr < 0) ++ continue; ++ ++ x = ext3_count_free (sb->u.ext3_sb.s_inode_bitmap[bitmap_nr], ++ EXT3_INODES_PER_GROUP(sb) / 8); ++ if (le16_to_cpu(gdp->bg_free_inodes_count) != x) ++ ext3_error (sb, "ext3_check_inodes_bitmap", ++ "Wrong free inodes count in group %d, " ++ "stored = %d, counted = 
%lu", i, ++ le16_to_cpu(gdp->bg_free_inodes_count), x); ++ bitmap_count += x; ++ } ++ if (le32_to_cpu(es->s_free_inodes_count) != bitmap_count) ++ ext3_error (sb, "ext3_check_inodes_bitmap", ++ "Wrong free inodes count in super block, " ++ "stored = %lu, counted = %lu", ++ (unsigned long)le32_to_cpu(es->s_free_inodes_count), ++ bitmap_count); ++} ++#endif +diff -rup --new-file linux.mcp2/fs/ext3/inode.c linux_tmp/fs/ext3/inode.c +--- linux.mcp2/fs/ext3/inode.c 1969-12-31 16:00:00.000000000 -0800 ++++ linux_tmp/fs/ext3/inode.c 2002-08-02 17:39:45.000000000 -0700 +@@ -0,0 +1,2699 @@ ++/* ++ * linux/fs/ext3/inode.c ++ * ++ * Copyright (C) 1992, 1993, 1994, 1995 ++ * Remy Card (card@masi.ibp.fr) ++ * Laboratoire MASI - Institut Blaise Pascal ++ * Universite Pierre et Marie Curie (Paris VI) ++ * ++ * from ++ * ++ * linux/fs/minix/inode.c ++ * ++ * Copyright (C) 1991, 1992 Linus Torvalds ++ * ++ * Goal-directed block allocation by Stephen Tweedie ++ * (sct@redhat.com), 1993, 1998 ++ * Big-endian to little-endian byte-swapping/bitmaps by ++ * David S. Miller (davem@caip.rutgers.edu), 1995 ++ * 64-bit file support on 64-bit platforms by Jakub Jelinek ++ * (jj@sunsite.ms.mff.cuni.cz) ++ * ++ * Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000 ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * SEARCH_FROM_ZERO forces each block allocation to search from the start ++ * of the filesystem. This is to force rapid reallocation of recently-freed ++ * blocks. The file fragmentation is horrendous. ++ */ ++#undef SEARCH_FROM_ZERO ++ ++/* The ext3 forget function must perform a revoke if we are freeing data ++ * which has been journaled. Metadata (eg. indirect blocks) must be ++ * revoked in all cases. ++ * ++ * "bh" may be NULL: a metadata block may have been freed from memory ++ * but there may still be a record of it in the journal, and that record ++ * still needs to be revoked. ++ */ ++ ++static int ext3_forget(handle_t *handle, int is_metadata, ++ struct inode *inode, struct buffer_head *bh, ++ int blocknr) ++{ ++ int err; ++ ++ BUFFER_TRACE(bh, "enter"); ++ ++ jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " ++ "data mode %lx\n", ++ bh, is_metadata, inode->i_mode, ++ test_opt(inode->i_sb, DATA_FLAGS)); ++ ++ /* Never use the revoke function if we are doing full data ++ * journaling: there is no need to, and a V1 superblock won't ++ * support it. Otherwise, only skip the revoke on un-journaled ++ * data blocks. */ ++ ++ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA || ++ (!is_metadata && !ext3_should_journal_data(inode))) { ++ if (bh) { ++ BUFFER_TRACE(bh, "call journal_forget"); ++ ext3_journal_forget(handle, bh); ++ } ++ return 0; ++ } ++ ++ /* ++ * data!=journal && (is_metadata || should_journal_data(inode)) ++ */ ++ BUFFER_TRACE(bh, "call ext3_journal_revoke"); ++ err = ext3_journal_revoke(handle, blocknr, bh); ++ if (err) ++ ext3_abort(inode->i_sb, __FUNCTION__, ++ "error %d when attempting revoke", err); ++ BUFFER_TRACE(bh, "exit"); ++ return err; ++} ++ ++/* ++ * Truncate transactions can be complex and absolutely huge. So we need to ++ * be able to restart the transaction at a conventient checkpoint to make ++ * sure we don't overflow the journal. ++ * ++ * start_transaction gets us a new handle for a truncate transaction, ++ * and extend_transaction tries to extend the existing one a bit. 
If ++ * extend fails, we need to propagate the failure up and restart the ++ * transaction in the top-level truncate loop. --sct ++ */ ++ ++static handle_t *start_transaction(struct inode *inode) ++{ ++ long needed; ++ handle_t *result; ++ ++ needed = inode->i_blocks; ++ if (needed > EXT3_MAX_TRANS_DATA) ++ needed = EXT3_MAX_TRANS_DATA; ++ ++ result = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS + needed); ++ if (!IS_ERR(result)) ++ return result; ++ ++ ext3_std_error(inode->i_sb, PTR_ERR(result)); ++ return result; ++} ++ ++/* ++ * Try to extend this transaction for the purposes of truncation. ++ * ++ * Returns 0 if we managed to create more room. If we can't create more ++ * room, and the transaction must be restarted we return 1. ++ */ ++static int try_to_extend_transaction(handle_t *handle, struct inode *inode) ++{ ++ long needed; ++ ++ if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS) ++ return 0; ++ needed = inode->i_blocks; ++ if (needed > EXT3_MAX_TRANS_DATA) ++ needed = EXT3_MAX_TRANS_DATA; ++ if (!ext3_journal_extend(handle, EXT3_RESERVE_TRANS_BLOCKS + needed)) ++ return 0; ++ return 1; ++} ++ ++/* ++ * Restart the transaction associated with *handle. This does a commit, ++ * so before we call here everything must be consistently dirtied against ++ * this transaction. ++ */ ++static int ext3_journal_test_restart(handle_t *handle, struct inode *inode) ++{ ++ long needed = inode->i_blocks; ++ if (needed > EXT3_MAX_TRANS_DATA) ++ needed = EXT3_MAX_TRANS_DATA; ++ jbd_debug(2, "restarting handle %p\n", handle); ++ return ext3_journal_restart(handle, EXT3_DATA_TRANS_BLOCKS + needed); ++} ++ ++/* ++ * Called at each iput() ++ */ ++void ext3_put_inode (struct inode * inode) ++{ ++ ext3_discard_prealloc (inode); ++} ++ ++/* ++ * Called at the last iput() if i_nlink is zero. ++ */ ++void ext3_delete_inode (struct inode * inode) ++{ ++ handle_t *handle; ++ ++ if (is_bad_inode(inode) || ++ inode->i_ino == EXT3_ACL_IDX_INO || ++ inode->i_ino == EXT3_ACL_DATA_INO) ++ goto no_delete; ++ ++ lock_kernel(); ++ handle = start_transaction(inode); ++ if (IS_ERR(handle)) { ++ /* If we're going to skip the normal cleanup, we still ++ * need to make sure that the in-core orphan linked list ++ * is properly cleaned up. */ ++ ext3_orphan_del(NULL, inode); ++ ++ ext3_std_error(inode->i_sb, PTR_ERR(handle)); ++ unlock_kernel(); ++ goto no_delete; ++ } ++ ++ if (IS_SYNC(inode)) ++ handle->h_sync = 1; ++ inode->i_size = 0; ++ if (inode->i_blocks) ++ ext3_truncate(inode); ++ /* ++ * Kill off the orphan record which ext3_truncate created. ++ * AKPM: I think this can be inside the above `if'. ++ * Note that ext3_orphan_del() has to be able to cope with the ++ * deletion of a non-existent orphan - this is because we don't ++ * know if ext3_truncate() actually created an orphan record. ++ * (Well, we could do this if we need to, but heck - it works) ++ */ ++ ext3_orphan_del(handle, inode); ++ inode->u.ext3_i.i_dtime = CURRENT_TIME; ++ ++ /* ++ * One subtle ordering requirement: if anything has gone wrong ++ * (transaction abort, IO errors, whatever), then we can still ++ * do these next steps (the fs will already have been marked as ++ * having errors), but we can't free the inode if the mark_dirty ++ * fails. ++ */ ++ if (ext3_mark_inode_dirty(handle, inode)) ++ /* If that failed, just do the required in-core inode clear. 
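++ * The on-disk inode is then leaked until the next fsck, but
++ * the filesystem has already been marked as having errors.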
*/ ++ clear_inode(inode); ++ else ++ ext3_free_inode(handle, inode); ++ ext3_journal_stop(handle, inode); ++ unlock_kernel(); ++ return; ++no_delete: ++ clear_inode(inode); /* We must guarantee clearing of inode... */ ++} ++ ++void ext3_discard_prealloc (struct inode * inode) ++{ ++#ifdef EXT3_PREALLOCATE ++ lock_kernel(); ++ /* Writer: ->i_prealloc* */ ++ if (inode->u.ext3_i.i_prealloc_count) { ++ unsigned short total = inode->u.ext3_i.i_prealloc_count; ++ unsigned long block = inode->u.ext3_i.i_prealloc_block; ++ inode->u.ext3_i.i_prealloc_count = 0; ++ inode->u.ext3_i.i_prealloc_block = 0; ++ /* Writer: end */ ++ ext3_free_blocks (inode, block, total); ++ } ++ unlock_kernel(); ++#endif ++} ++ ++static int ext3_alloc_block (handle_t *handle, ++ struct inode * inode, unsigned long goal, int *err) ++{ ++#ifdef EXT3FS_DEBUG ++ static unsigned long alloc_hits = 0, alloc_attempts = 0; ++#endif ++ unsigned long result; ++ ++#ifdef EXT3_PREALLOCATE ++ /* Writer: ->i_prealloc* */ ++ if (inode->u.ext3_i.i_prealloc_count && ++ (goal == inode->u.ext3_i.i_prealloc_block || ++ goal + 1 == inode->u.ext3_i.i_prealloc_block)) ++ { ++ result = inode->u.ext3_i.i_prealloc_block++; ++ inode->u.ext3_i.i_prealloc_count--; ++ /* Writer: end */ ++ ext3_debug ("preallocation hit (%lu/%lu).\n", ++ ++alloc_hits, ++alloc_attempts); ++ } else { ++ ext3_discard_prealloc (inode); ++ ext3_debug ("preallocation miss (%lu/%lu).\n", ++ alloc_hits, ++alloc_attempts); ++ if (S_ISREG(inode->i_mode)) ++ result = ext3_new_block (inode, goal, ++ &inode->u.ext3_i.i_prealloc_count, ++ &inode->u.ext3_i.i_prealloc_block, err); ++ else ++ result = ext3_new_block (inode, goal, 0, 0, err); ++ /* ++ * AKPM: this is somewhat sticky. I'm not surprised it was ++ * disabled in 2.2's ext3. Need to integrate b_committed_data ++ * guarding with preallocation, if indeed preallocation is ++ * effective. ++ */ ++ } ++#else ++ result = ext3_new_block (handle, inode, goal, 0, 0, err); ++#endif ++ return result; ++} ++ ++ ++typedef struct { ++ u32 *p; ++ u32 key; ++ struct buffer_head *bh; ++} Indirect; ++ ++static inline void add_chain(Indirect *p, struct buffer_head *bh, u32 *v) ++{ ++ p->key = *(p->p = v); ++ p->bh = bh; ++} ++ ++static inline int verify_chain(Indirect *from, Indirect *to) ++{ ++ while (from <= to && from->key == *from->p) ++ from++; ++ return (from > to); ++} ++ ++/** ++ * ext3_block_to_path - parse the block number into array of offsets ++ * @inode: inode in question (we are only interested in its superblock) ++ * @i_block: block number to be parsed ++ * @offsets: array to store the offsets in ++ * ++ * To store the locations of file's data ext3 uses a data structure common ++ * for UNIX filesystems - tree of pointers anchored in the inode, with ++ * data blocks at leaves and indirect blocks in intermediate nodes. ++ * This function translates the block number into path in that tree - ++ * return value is the path length and @offsets[n] is the offset of ++ * pointer to (n+1)th node in the nth one. If @block is out of range ++ * (negative or too large) warning is printed and zero returned. ++ * ++ * Note: function doesn't find node addresses, so no IO is needed. All ++ * we need to know is the capacity of indirect blocks (taken from the ++ * inode->i_sb). ++ */ ++ ++/* ++ * Portability note: the last comparison (check that we fit into triple ++ * indirect block) is spelled differently, because otherwise on an ++ * architecture with 32-bit longs and 8Kb pages we might get into trouble ++ * if our filesystem had 8Kb blocks. 
We might use long long, but that would ++ * kill us on x86. Oh, well, at least the sign propagation does not matter - ++ * i_block would have to be negative in the very beginning, so we would not ++ * get there at all. ++ */ ++ ++static int ext3_block_to_path(struct inode *inode, long i_block, int offsets[4]) ++{ ++ int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb); ++ int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb); ++ const long direct_blocks = EXT3_NDIR_BLOCKS, ++ indirect_blocks = ptrs, ++ double_blocks = (1 << (ptrs_bits * 2)); ++ int n = 0; ++ ++ if (i_block < 0) { ++ ext3_warning (inode->i_sb, "ext3_block_to_path", "block < 0"); ++ } else if (i_block < direct_blocks) { ++ offsets[n++] = i_block; ++ } else if ( (i_block -= direct_blocks) < indirect_blocks) { ++ offsets[n++] = EXT3_IND_BLOCK; ++ offsets[n++] = i_block; ++ } else if ((i_block -= indirect_blocks) < double_blocks) { ++ offsets[n++] = EXT3_DIND_BLOCK; ++ offsets[n++] = i_block >> ptrs_bits; ++ offsets[n++] = i_block & (ptrs - 1); ++ } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { ++ offsets[n++] = EXT3_TIND_BLOCK; ++ offsets[n++] = i_block >> (ptrs_bits * 2); ++ offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); ++ offsets[n++] = i_block & (ptrs - 1); ++ } else { ++ ext3_warning (inode->i_sb, "ext3_block_to_path", "block > big"); ++ } ++ return n; ++} ++ ++/** ++ * ext3_get_branch - read the chain of indirect blocks leading to data ++ * @inode: inode in question ++ * @depth: depth of the chain (1 - direct pointer, etc.) ++ * @offsets: offsets of pointers in inode/indirect blocks ++ * @chain: place to store the result ++ * @err: here we store the error value ++ * ++ * Function fills the array of triples and returns %NULL ++ * if everything went OK or the pointer to the last filled triple ++ * (incomplete one) otherwise. Upon the return chain[i].key contains ++ * the number of (i+1)-th block in the chain (as it is stored in memory, ++ * i.e. little-endian 32-bit), chain[i].p contains the address of that ++ * number (it points into struct inode for i==0 and into the bh->b_data ++ * for i>0) and chain[i].bh points to the buffer_head of i-th indirect ++ * block for i>0 and NULL for i==0. In other words, it holds the block ++ * numbers of the chain, addresses they were taken from (and where we can ++ * verify that chain did not change) and buffer_heads hosting these ++ * numbers. ++ * ++ * Function stops when it stumbles upon zero pointer (absent block) ++ * (pointer to last triple returned, *@err == 0) ++ * or when it gets an IO error reading an indirect block ++ * (ditto, *@err == -EIO) ++ * or when it notices that chain had been changed while it was reading ++ * (ditto, *@err == -EAGAIN) ++ * or when it reads all @depth-1 indirect blocks successfully and finds ++ * the whole chain, all way to the data (returns %NULL, *err == 0). 
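++ *
++ * Worked example (illustrative): a fully mapped double-indirect
++ * lookup has depth == 3 and returns NULL with *err == 0, leaving
++ *
++ *	chain[0].p -> slot in inode->u.ext3_i.i_data, chain[0].bh == NULL
++ *	chain[1].p -> slot in chain[1].bh->b_data (double-indirect block)
++ *	chain[2].p -> slot in chain[2].bh->b_data (indirect block)
++ *	chain[2].key == number of the data block itself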
++ */ ++static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets, ++ Indirect chain[4], int *err) ++{ ++ struct super_block *sb = inode->i_sb; ++ Indirect *p = chain; ++ struct buffer_head *bh; ++ ++ *err = 0; ++ /* i_data is not going away, no lock needed */ ++ add_chain (chain, NULL, inode->u.ext3_i.i_data + *offsets); ++ if (!p->key) ++ goto no_block; ++ while (--depth) { ++ bh = sb_bread(sb, le32_to_cpu(p->key)); ++ if (!bh) ++ goto failure; ++ /* Reader: pointers */ ++ if (!verify_chain(chain, p)) ++ goto changed; ++ add_chain(++p, bh, (u32*)bh->b_data + *++offsets); ++ /* Reader: end */ ++ if (!p->key) ++ goto no_block; ++ } ++ return NULL; ++ ++changed: ++ *err = -EAGAIN; ++ goto no_block; ++failure: ++ *err = -EIO; ++no_block: ++ return p; ++} ++ ++/** ++ * ext3_find_near - find a place for allocation with sufficient locality ++ * @inode: owner ++ * @ind: descriptor of indirect block. ++ * ++ * This function returns the preferred place for block allocation. ++ * It is used when heuristic for sequential allocation fails. ++ * Rules are: ++ * + if there is a block to the left of our position - allocate near it. ++ * + if pointer will live in indirect block - allocate near that block. ++ * + if pointer will live in inode - allocate in the same ++ * cylinder group. ++ * Caller must make sure that @ind is valid and will stay that way. ++ */ ++ ++static inline unsigned long ext3_find_near(struct inode *inode, Indirect *ind) ++{ ++ u32 *start = ind->bh ? (u32*) ind->bh->b_data : inode->u.ext3_i.i_data; ++ u32 *p; ++ ++ /* Try to find previous block */ ++ for (p = ind->p - 1; p >= start; p--) ++ if (*p) ++ return le32_to_cpu(*p); ++ ++ /* No such thing, so let's try location of indirect block */ ++ if (ind->bh) ++ return ind->bh->b_blocknr; ++ ++ /* ++ * It is going to be referred from inode itself? OK, just put it into ++ * the same cylinder group then. ++ */ ++ return (inode->u.ext3_i.i_block_group * ++ EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + ++ le32_to_cpu(inode->i_sb->u.ext3_sb.s_es->s_first_data_block); ++} ++ ++/** ++ * ext3_find_goal - find a preferred place for allocation. ++ * @inode: owner ++ * @block: block we want ++ * @chain: chain of indirect blocks ++ * @partial: pointer to the last triple within a chain ++ * @goal: place to store the result. ++ * ++ * Normally this function finds the preferred place for block allocation, ++ * stores it in *@goal and returns zero. If the branch had been changed ++ * under us we return -EAGAIN. ++ */ ++ ++static int ext3_find_goal(struct inode *inode, long block, Indirect chain[4], ++ Indirect *partial, unsigned long *goal) ++{ ++ /* Writer: ->i_next_alloc* */ ++ if (block == inode->u.ext3_i.i_next_alloc_block + 1) { ++ inode->u.ext3_i.i_next_alloc_block++; ++ inode->u.ext3_i.i_next_alloc_goal++; ++ } ++#ifdef SEARCH_FROM_ZERO ++ inode->u.ext3_i.i_next_alloc_block = 0; ++ inode->u.ext3_i.i_next_alloc_goal = 0; ++#endif ++ /* Writer: end */ ++ /* Reader: pointers, ->i_next_alloc* */ ++ if (verify_chain(chain, partial)) { ++ /* ++ * try the heuristic for sequential allocation, ++ * failing that at least try to get decent locality. ++ */ ++ if (block == inode->u.ext3_i.i_next_alloc_block) ++ *goal = inode->u.ext3_i.i_next_alloc_goal; ++ if (!*goal) ++ *goal = ext3_find_near(inode, partial); ++#ifdef SEARCH_FROM_ZERO ++ *goal = 0; ++#endif ++ return 0; ++ } ++ /* Reader: end */ ++ return -EAGAIN; ++} ++ ++/** ++ * ext3_alloc_branch - allocate and set up a chain of blocks.
++ * @inode: owner ++ * @num: depth of the chain (number of blocks to allocate) ++ * @offsets: offsets (in the blocks) to store the pointers to next. ++ * @branch: place to store the chain in. ++ * ++ * This function allocates @num blocks, zeroes out all but the last one, ++ * links them into chain and (if we are synchronous) writes them to disk. ++ * In other words, it prepares a branch that can be spliced onto the ++ * inode. It stores the information about that chain in the branch[], in ++ * the same format as ext3_get_branch() would do. We are calling it after ++ * we had read the existing part of chain and partial points to the last ++ * triple of that (one with zero ->key). Upon the exit we have the same ++ * picture as after the successful ext3_get_block(), except that in one ++ * place chain is disconnected - *branch->p is still zero (we did not ++ * set the last link), but branch->key contains the number that should ++ * be placed into *branch->p to fill that gap. ++ * ++ * If allocation fails we free all blocks we've allocated (and forget ++ * their buffer_heads) and return the error value from the failed ++ * ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain ++ * as described above and return 0. ++ */ ++ ++static int ext3_alloc_branch(handle_t *handle, struct inode *inode, ++ int num, ++ unsigned long goal, ++ int *offsets, ++ Indirect *branch) ++{ ++ int blocksize = inode->i_sb->s_blocksize; ++ int n = 0, keys = 0; ++ int err = 0; ++ int i; ++ int parent = ext3_alloc_block(handle, inode, goal, &err); ++ ++ branch[0].key = cpu_to_le32(parent); ++ if (parent) { ++ for (n = 1; n < num; n++) { ++ struct buffer_head *bh; ++ /* Allocate the next block */ ++ int nr = ext3_alloc_block(handle, inode, parent, &err); ++ if (!nr) ++ break; ++ branch[n].key = cpu_to_le32(nr); ++ keys = n+1; ++ ++ /* ++ * Get buffer_head for parent block, zero it out ++ * and set the pointer to new one, then send ++ * parent to disk. ++ */ ++ bh = sb_getblk(inode->i_sb, parent); ++ branch[n].bh = bh; ++ lock_buffer(bh); ++ BUFFER_TRACE(bh, "call get_create_access"); ++ err = ext3_journal_get_create_access(handle, bh); ++ if (err) { ++ unlock_buffer(bh); ++ brelse(bh); ++ break; ++ } ++ ++ memset(bh->b_data, 0, blocksize); ++ branch[n].p = (u32*) bh->b_data + offsets[n]; ++ *branch[n].p = branch[n].key; ++ BUFFER_TRACE(bh, "marking uptodate"); ++ mark_buffer_uptodate(bh, 1); ++ unlock_buffer(bh); ++ ++ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); ++ err = ext3_journal_dirty_metadata(handle, bh); ++ if (err) ++ break; ++ ++ parent = nr; ++ } ++ } ++ if (n == num) ++ return 0; ++ ++ /* Allocation failed, free what we already allocated */ ++ for (i = 1; i < keys; i++) { ++ BUFFER_TRACE(branch[i].bh, "call journal_forget"); ++ ext3_journal_forget(handle, branch[i].bh); ++ } ++ for (i = 0; i < keys; i++) ++ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1); ++ return err; ++} ++ ++/** ++ * ext3_splice_branch - splice the allocated branch onto inode. ++ * @inode: owner ++ * @block: (logical) number of block we are adding ++ * @chain: chain of indirect blocks (with a missing link - see ++ * ext3_alloc_branch) ++ * @where: location of missing link ++ * @num: number of blocks we are adding ++ * ++ * This function verifies that chain (up to the missing link) had not ++ * changed, fills the missing link and does all housekeeping needed in ++ * inode (->i_blocks, etc.). In case of success we end up with the full ++ * chain to new block and return 0.
Otherwise (== chain had been changed) ++ * we free the new blocks (forgetting their buffer_heads, indeed) and ++ * return -EAGAIN. ++ */ ++ ++static int ext3_splice_branch(handle_t *handle, struct inode *inode, long block, ++ Indirect chain[4], Indirect *where, int num) ++{ ++ int i; ++ int err = 0; ++ ++ /* ++ * If we're splicing into a [td]indirect block (as opposed to the ++ * inode) then we need to get write access to the [td]indirect block ++ * before the splice. ++ */ ++ if (where->bh) { ++ BUFFER_TRACE(where->bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, where->bh); ++ if (err) ++ goto err_out; ++ } ++ /* Verify that place we are splicing to is still there and vacant */ ++ ++ /* Writer: pointers, ->i_next_alloc* */ ++ if (!verify_chain(chain, where-1) || *where->p) ++ /* Writer: end */ ++ goto changed; ++ ++ /* That's it */ ++ ++ *where->p = where->key; ++ inode->u.ext3_i.i_next_alloc_block = block; ++ inode->u.ext3_i.i_next_alloc_goal = le32_to_cpu(where[num-1].key); ++#ifdef SEARCH_FROM_ZERO ++ inode->u.ext3_i.i_next_alloc_block = 0; ++ inode->u.ext3_i.i_next_alloc_goal = 0; ++#endif ++ /* Writer: end */ ++ ++ /* We are done with atomic stuff, now do the rest of housekeeping */ ++ ++ inode->i_ctime = CURRENT_TIME; ++ ext3_mark_inode_dirty(handle, inode); ++ ++ /* had we spliced it onto indirect block? */ ++ if (where->bh) { ++ /* ++ * akpm: If we spliced it onto an indirect block, we haven't ++ * altered the inode. Note however that if it is being spliced ++ * onto an indirect block at the very end of the file (the ++ * file is growing) then we *will* alter the inode to reflect ++ * the new i_size. But that is not done here - it is done in ++ * generic_commit_write->__mark_inode_dirty->ext3_dirty_inode. ++ */ ++ jbd_debug(5, "splicing indirect only\n"); ++ BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata"); ++ err = ext3_journal_dirty_metadata(handle, where->bh); ++ if (err) ++ goto err_out; ++ } else { ++ /* ++ * OK, we spliced it into the inode itself on a direct block. ++ * Inode was dirtied above. ++ */ ++ jbd_debug(5, "splicing direct\n"); ++ } ++ return err; ++ ++changed: ++ /* ++ * AKPM: if where[i].bh isn't part of the current updating ++ * transaction then we explode nastily. Test this code path. ++ */ ++ jbd_debug(1, "the chain changed: try again\n"); ++ err = -EAGAIN; ++ ++err_out: ++ for (i = 1; i < num; i++) { ++ BUFFER_TRACE(where[i].bh, "call journal_forget"); ++ ext3_journal_forget(handle, where[i].bh); ++ } ++ /* For the normal collision cleanup case, we free up the blocks. ++ * On genuine filesystem errors we don't even think about doing ++ * that. */ ++ if (err == -EAGAIN) ++ for (i = 0; i < num; i++) ++ ext3_free_blocks(handle, inode, ++ le32_to_cpu(where[i].key), 1); ++ return err; ++} ++ ++/* ++ * Allocation strategy is simple: if we have to allocate something, we will ++ * have to go the whole way to leaf. So let's do it before attaching anything ++ * to tree, set linkage between the newborn blocks, write them if sync is ++ * required, recheck the path, free and repeat if check fails, otherwise ++ * set the last missing link (that will protect us from any truncate-generated ++ * removals - all blocks on the path are immune now) and possibly force the ++ * write on the parent block. ++ * That has a nice additional property: no special recovery from the failed ++ * allocations is needed - we simply release blocks and do not touch anything ++ * reachable from inode. ++ * ++ * akpm: `handle' can be NULL if create == 0. 
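++ *
++ * Summarising the helpers used below for the create case:
++ *	1. ext3_get_branch()	- walk the part of the tree that exists
++ *	2. ext3_find_goal()	- choose a goal block for the allocation
++ *	3. ext3_alloc_branch()	- build the missing tail, off-tree
++ *	4. ext3_splice_branch()	- recheck, then set the one missing link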
++ * ++ * The BKL may not be held on entry here. Be sure to take it early. ++ */ ++ ++static int ext3_get_block_handle(handle_t *handle, struct inode *inode, ++ long iblock, ++ struct buffer_head *bh_result, int create) ++{ ++ int err = -EIO; ++ int offsets[4]; ++ Indirect chain[4]; ++ Indirect *partial; ++ unsigned long goal; ++ int left; ++ int depth = ext3_block_to_path(inode, iblock, offsets); ++ loff_t new_size; ++ ++ J_ASSERT(handle != NULL || create == 0); ++ ++ if (depth == 0) ++ goto out; ++ ++ lock_kernel(); ++reread: ++ partial = ext3_get_branch(inode, depth, offsets, chain, &err); ++ ++ /* Simplest case - block found, no allocation needed */ ++ if (!partial) { ++ bh_result->b_state &= ~(1UL << BH_New); ++got_it: ++ bh_result->b_dev = inode->i_dev; ++ bh_result->b_blocknr = le32_to_cpu(chain[depth-1].key); ++ bh_result->b_state |= (1UL << BH_Mapped); ++ /* Clean up and exit */ ++ partial = chain+depth-1; /* the whole chain */ ++ goto cleanup; ++ } ++ ++ /* Next simple case - plain lookup or failed read of indirect block */ ++ if (!create || err == -EIO) { ++cleanup: ++ while (partial > chain) { ++ BUFFER_TRACE(partial->bh, "call brelse"); ++ brelse(partial->bh); ++ partial--; ++ } ++ BUFFER_TRACE(bh_result, "returned"); ++ unlock_kernel(); ++out: ++ return err; ++ } ++ ++ /* ++ * Indirect block might be removed by truncate while we were ++ * reading it. Handling of that case (forget what we've got and ++ * reread) is taken out of the main path. ++ */ ++ if (err == -EAGAIN) ++ goto changed; ++ ++ if (ext3_find_goal(inode, iblock, chain, partial, &goal) < 0) ++ goto changed; ++ ++ left = (chain + depth) - partial; ++ ++ /* ++ * Block out ext3_truncate while we alter the tree ++ */ ++ down_read(&inode->u.ext3_i.truncate_sem); ++ err = ext3_alloc_branch(handle, inode, left, goal, ++ offsets+(partial-chain), partial); ++ ++ /* The ext3_splice_branch call will free and forget any buffers ++ * on the new chain if there is a failure, but that risks using ++ * up transaction credits, especially for bitmaps where the ++ * credits cannot be returned. Can we handle this somehow? We ++ * may need to return -EAGAIN upwards in the worst case. --sct */ ++ if (!err) ++ err = ext3_splice_branch(handle, inode, iblock, chain, ++ partial, left); ++ up_read(&inode->u.ext3_i.truncate_sem); ++ if (err == -EAGAIN) ++ goto changed; ++ if (err) ++ goto cleanup; ++ ++ new_size = inode->i_size; ++ /* ++ * This is not racy against ext3_truncate's modification of i_disksize ++ * because VM/VFS ensures that the file cannot be extended while ++ * truncate is in progress. It is racy between multiple parallel ++ * instances of get_block, but we have the BKL. ++ */ ++ if (new_size > inode->u.ext3_i.i_disksize) ++ inode->u.ext3_i.i_disksize = new_size; ++ ++ bh_result->b_state |= (1UL << BH_New); ++ goto got_it; ++ ++changed: ++ while (partial > chain) { ++ jbd_debug(1, "buffer chain changed, retrying\n"); ++ BUFFER_TRACE(partial->bh, "brelsing"); ++ brelse(partial->bh); ++ partial--; ++ } ++ goto reread; ++} ++ ++/* ++ * The BKL is not held on entry here. 
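++ * When create != 0 the caller must already have a transaction
++ * open (ext3_prepare_write() and ext3_writepage() do), since the
++ * handle is picked up from ext3_journal_current_handle() below.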
++ */ ++static int ext3_get_block(struct inode *inode, long iblock, ++ struct buffer_head *bh_result, int create) ++{ ++ handle_t *handle = 0; ++ int ret; ++ ++ if (create) { ++ handle = ext3_journal_current_handle(); ++ J_ASSERT(handle != 0); ++ } ++ ret = ext3_get_block_handle(handle, inode, iblock, bh_result, create); ++ return ret; ++} ++ ++/* ++ * `handle' can be NULL if create is zero ++ */ ++struct buffer_head *ext3_getblk(handle_t *handle, struct inode * inode, ++ long block, int create, int * errp) ++{ ++ struct buffer_head dummy; ++ int fatal = 0, err; ++ ++ J_ASSERT(handle != NULL || create == 0); ++ ++ dummy.b_state = 0; ++ dummy.b_blocknr = -1000; ++ buffer_trace_init(&dummy.b_history); ++ *errp = ext3_get_block_handle(handle, inode, block, &dummy, create); ++ if (!*errp && buffer_mapped(&dummy)) { ++ struct buffer_head *bh; ++ bh = sb_getblk(inode->i_sb, dummy.b_blocknr); ++ if (buffer_new(&dummy)) { ++ J_ASSERT(create != 0); ++ J_ASSERT(handle != 0); ++ ++ /* Now that we do not always journal data, we ++ should keep in mind whether this should ++ always journal the new buffer as metadata. ++ For now, regular file writes use ++ ext3_get_block instead, so it's not a ++ problem. */ ++ lock_kernel(); ++ lock_buffer(bh); ++ BUFFER_TRACE(bh, "call get_create_access"); ++ fatal = ext3_journal_get_create_access(handle, bh); ++ if (!fatal) { ++ memset(bh->b_data, 0, ++ inode->i_sb->s_blocksize); ++ mark_buffer_uptodate(bh, 1); ++ } ++ unlock_buffer(bh); ++ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); ++ err = ext3_journal_dirty_metadata(handle, bh); ++ if (!fatal) fatal = err; ++ unlock_kernel(); ++ } else { ++ BUFFER_TRACE(bh, "not a new buffer"); ++ } ++ if (fatal) { ++ *errp = fatal; ++ brelse(bh); ++ bh = NULL; ++ } ++ return bh; ++ } ++ return NULL; ++} ++ ++struct buffer_head *ext3_bread(handle_t *handle, struct inode * inode, ++ int block, int create, int *err) ++{ ++ struct buffer_head * bh; ++ int prev_blocks; ++ ++ prev_blocks = inode->i_blocks; ++ ++ bh = ext3_getblk (handle, inode, block, create, err); ++ if (!bh) ++ return bh; ++#ifdef EXT3_PREALLOCATE ++ /* ++ * If the inode has grown, and this is a directory, then use a few ++ * more of the preallocated blocks to keep directory fragmentation ++ * down. The preallocated blocks are guaranteed to be contiguous. 
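++ * (This branch is only built when EXT3_PREALLOCATE is defined;
++ * otherwise ext3_bread() is just ext3_getblk() plus the read
++ * below.)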
++ */ ++ if (create && ++ S_ISDIR(inode->i_mode) && ++ inode->i_blocks > prev_blocks && ++ EXT3_HAS_COMPAT_FEATURE(inode->i_sb, ++ EXT3_FEATURE_COMPAT_DIR_PREALLOC)) { ++ int i; ++ struct buffer_head *tmp_bh; ++ ++ for (i = 1; ++ inode->u.ext3_i.i_prealloc_count && ++ i < EXT3_SB(inode->i_sb)->s_es->s_prealloc_dir_blocks; ++ i++) { ++ /* ++ * ext3_getblk will zero out the contents of the ++ * directory for us ++ */ ++ tmp_bh = ext3_getblk(handle, inode, ++ block+i, create, err); ++ if (!tmp_bh) { ++ brelse (bh); ++ return 0; ++ } ++ brelse (tmp_bh); ++ } ++ } ++#endif ++ if (buffer_uptodate(bh)) ++ return bh; ++ ll_rw_block (READ, 1, &bh); ++ wait_on_buffer (bh); ++ if (buffer_uptodate(bh)) ++ return bh; ++ brelse (bh); ++ *err = -EIO; ++ return NULL; ++} ++ ++static int walk_page_buffers( handle_t *handle, ++ struct buffer_head *head, ++ unsigned from, ++ unsigned to, ++ int *partial, ++ int (*fn)( handle_t *handle, ++ struct buffer_head *bh)) ++{ ++ struct buffer_head *bh; ++ unsigned block_start, block_end; ++ unsigned blocksize = head->b_size; ++ int err, ret = 0; ++ ++ for ( bh = head, block_start = 0; ++ ret == 0 && (bh != head || !block_start); ++ block_start = block_end, bh = bh->b_this_page) ++ { ++ block_end = block_start + blocksize; ++ if (block_end <= from || block_start >= to) { ++ if (partial && !buffer_uptodate(bh)) ++ *partial = 1; ++ continue; ++ } ++ err = (*fn)(handle, bh); ++ if (!ret) ++ ret = err; ++ } ++ return ret; ++} ++ ++/* ++ * To preserve ordering, it is essential that the hole instantiation and ++ * the data write be encapsulated in a single transaction. We cannot ++ * close off a transaction and start a new one between the ext3_get_block() ++ * and the commit_write(). So doing the journal_start at the start of ++ * prepare_write() is the right place. ++ * ++ * Also, this function can nest inside ext3_writepage() -> ++ * block_write_full_page(). In that case, we *know* that ext3_writepage() ++ * has generated enough buffer credits to do the whole page. So we won't ++ * block on the journal in that case, which is good, because the caller may ++ * be PF_MEMALLOC. ++ * ++ * By accident, ext3 can be reentered when a transaction is open via ++ * quota file writes. If we were to commit the transaction while thus ++ * reentered, there can be a deadlock - we would be holding a quota ++ * lock, and the commit would never complete if another thread had a ++ * transaction open and was blocking on the quota lock - a ranking ++ * violation. ++ * ++ * So what we do is to rely on the fact that journal_stop/journal_start ++ * will _not_ run commit under these circumstances because handle->h_ref ++ * is elevated. We'll still have enough credits for the tiny quotafile ++ * write. 
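++ *
++ * So, per write, the transaction brackets the whole page update
++ * (a summary of the above, not extra behaviour):
++ *
++ *	ext3_prepare_write()	- journal_start, then any hole filling
++ *	    ... caller copies the user data into the page ...
++ *	ext3_commit_write()	- dirty the buffers, then journal_stop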
++ */ ++ ++static int do_journal_get_write_access(handle_t *handle, ++ struct buffer_head *bh) ++{ ++ return ext3_journal_get_write_access(handle, bh); ++} ++ ++static int ext3_prepare_write(struct file *file, struct page *page, ++ unsigned from, unsigned to) ++{ ++ struct inode *inode = page->mapping->host; ++ int ret, needed_blocks = ext3_writepage_trans_blocks(inode); ++ handle_t *handle; ++ ++ lock_kernel(); ++ handle = ext3_journal_start(inode, needed_blocks); ++ if (IS_ERR(handle)) { ++ ret = PTR_ERR(handle); ++ goto out; ++ } ++ unlock_kernel(); ++ ret = block_prepare_write(page, from, to, ext3_get_block); ++ lock_kernel(); ++ if (ret != 0) ++ goto prepare_write_failed; ++ ++ if (ext3_should_journal_data(inode)) { ++ ret = walk_page_buffers(handle, page->buffers, ++ from, to, NULL, do_journal_get_write_access); ++ if (ret) { ++ /* ++ * We're going to fail this prepare_write(), ++ * so commit_write() will not be called. ++ * We need to undo block_prepare_write()'s kmap(). ++ * AKPM: Do we need to clear PageUptodate? I don't ++ * think so. ++ */ ++ kunmap(page); ++ } ++ } ++prepare_write_failed: ++ if (ret) ++ ext3_journal_stop(handle, inode); ++out: ++ unlock_kernel(); ++ return ret; ++} ++ ++static int journal_dirty_sync_data(handle_t *handle, struct buffer_head *bh) ++{ ++ return ext3_journal_dirty_data(handle, bh, 0); ++} ++ ++/* ++ * For ext3_writepage(). We also brelse() the buffer to account for ++ * the bget() which ext3_writepage() performs. ++ */ ++static int journal_dirty_async_data(handle_t *handle, struct buffer_head *bh) ++{ ++ int ret = ext3_journal_dirty_data(handle, bh, 1); ++ __brelse(bh); ++ return ret; ++} ++ ++/* For commit_write() in data=journal mode */ ++static int commit_write_fn(handle_t *handle, struct buffer_head *bh) ++{ ++ set_bit(BH_Uptodate, &bh->b_state); ++ return ext3_journal_dirty_metadata(handle, bh); ++} ++ ++/* ++ * We need to pick up the new inode size which generic_commit_write gave us ++ * `file' can be NULL - eg, when called from block_symlink(). ++ * ++ * ext3 inode->i_dirty_buffers policy: If we're journalling data we ++ * definitely don't want them to appear on the inode at all - instead ++ * we need to manage them at the JBD layer and we need to intercept ++ * the relevant sync operations and translate them into journal operations. ++ * ++ * If we're not journalling data then we can just leave the buffers ++ * on ->i_dirty_buffers. If someone writes them out for us then thanks. ++ * Otherwise we'll do it in commit, if we're using ordered data. ++ */ ++ ++static int ext3_commit_write(struct file *file, struct page *page, ++ unsigned from, unsigned to) ++{ ++ handle_t *handle = ext3_journal_current_handle(); ++ struct inode *inode = page->mapping->host; ++ int ret = 0, ret2; ++ ++ lock_kernel(); ++ if (ext3_should_journal_data(inode)) { ++ /* ++ * Here we duplicate the generic_commit_write() functionality ++ */ ++ int partial = 0; ++ loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; ++ ++ ret = walk_page_buffers(handle, page->buffers, ++ from, to, &partial, commit_write_fn); ++ if (!partial) ++ SetPageUptodate(page); ++ kunmap(page); ++ if (pos > inode->i_size) ++ inode->i_size = pos; ++ EXT3_I(inode)->i_state |= EXT3_STATE_JDATA; ++ } else { ++ if (ext3_should_order_data(inode)) { ++ ret = walk_page_buffers(handle, page->buffers, ++ from, to, NULL, journal_dirty_sync_data); ++ } ++ /* Be careful here if generic_commit_write becomes a ++ * required invocation after block_prepare_write. 
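++ * (If it ever does, the by-hand kunmap() in the else branch
++ * below will need rethinking as well.)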
*/ ++ if (ret == 0) { ++ ret = generic_commit_write(file, page, from, to); ++ } else { ++ /* ++ * block_prepare_write() was called, but we're not ++ * going to call generic_commit_write(). So we ++ * need to perform generic_commit_write()'s kunmap ++ * by hand. ++ */ ++ kunmap(page); ++ } ++ } ++ if (inode->i_size > inode->u.ext3_i.i_disksize) { ++ inode->u.ext3_i.i_disksize = inode->i_size; ++ ret2 = ext3_mark_inode_dirty(handle, inode); ++ if (!ret) ++ ret = ret2; ++ } ++ ret2 = ext3_journal_stop(handle, inode); ++ unlock_kernel(); ++ if (!ret) ++ ret = ret2; ++ return ret; ++} ++ ++/* ++ * bmap() is special. It gets used by applications such as lilo and by ++ * the swapper to find the on-disk block of a specific piece of data. ++ * ++ * Naturally, this is dangerous if the block concerned is still in the ++ * journal. If somebody makes a swapfile on an ext3 data-journaling ++ * filesystem and enables swap, then they may get a nasty shock when the ++ * data getting swapped to that swapfile suddenly gets overwritten by ++ * the original zeroes written out previously to the journal and ++ * awaiting writeback in the kernel's buffer cache. ++ * ++ * So, if we see any bmap calls here on a modified, data-journaled file, ++ * take extra steps to flush any blocks which might be in the cache. ++ */ ++static int ext3_bmap(struct address_space *mapping, long block) ++{ ++ struct inode *inode = mapping->host; ++ journal_t *journal; ++ int err; ++ ++ if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) { ++ /* ++ * This is a REALLY heavyweight approach, but the use of ++ * bmap on dirty files is expected to be extremely rare: ++ * only if we run lilo or swapon on a freshly made file ++ * do we expect this to happen. ++ * ++ * (bmap requires CAP_SYS_RAWIO so this does not ++ * represent an unprivileged user DOS attack --- we'd be ++ * in trouble if mortal users could trigger this path at ++ * will.) ++ * ++ * NB. EXT3_STATE_JDATA is not set on files other than ++ * regular files. If somebody wants to bmap a directory ++ * or symlink and gets confused because the buffer ++ * hasn't yet been flushed to disk, they deserve ++ * everything they get. ++ */ ++ ++ EXT3_I(inode)->i_state &= ~EXT3_STATE_JDATA; ++ journal = EXT3_JOURNAL(inode); ++ journal_lock_updates(journal); ++ err = journal_flush(journal); ++ journal_unlock_updates(journal); ++ ++ if (err) ++ return 0; ++ } ++ ++ return generic_block_bmap(mapping,block,ext3_get_block); ++} ++ ++static int bget_one(handle_t *handle, struct buffer_head *bh) ++{ ++ atomic_inc(&bh->b_count); ++ return 0; ++} ++ ++/* ++ * Note that we always start a transaction even if we're not journalling ++ * data. This is to preserve ordering: any hole instantiation within ++ * __block_write_full_page -> ext3_get_block() should be journalled ++ * along with the data so we don't crash and then get metadata which ++ * refers to old data. ++ * ++ * In all journalling modes block_write_full_page() will start the I/O. ++ * ++ * Problem: ++ * ++ * ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> ++ * ext3_writepage() ++ * ++ * Similar for: ++ * ++ * ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ... ++ * ++ * Same applies to ext3_get_block(). We will deadlock on various things like ++ * lock_journal and i_truncate_sem. ++ * ++ * Setting PF_MEMALLOC here doesn't work - too many internal memory ++ * allocations fail. ++ * ++ * 16May01: If we're reentered then journal_current_handle() will be ++ * non-zero. We simply *return*.
++ *
++ * 1 July 2001: @@@ FIXME:
++ * In journalled data mode, a data buffer may be metadata against the
++ * current transaction. But the same file is part of a shared mapping
++ * and someone does a writepage() on it.
++ *
++ * We will move the buffer onto the async_data list, but *after* it has
++ * been dirtied. So there's a small window where we have dirty data on
++ * BJ_Metadata.
++ *
++ * Note that this only applies to the last partial page in the file. The
++ * bit which block_write_full_page() uses prepare/commit for. (That's
++ * broken code anyway: it's wrong for msync()).
++ *
++ * It's a rare case: affects the final partial page, for journalled data
++ * where the file is subject to both write() and writepage() in the same
++ * transaction. To fix it we'll need a custom block_write_full_page().
++ * We'll probably need that anyway for journalling writepage() output.
++ *
++ * We don't honour synchronous mounts for writepage(). That would be
++ * disastrous. Any write() or metadata operation will sync the fs for
++ * us.
++ */
++static int ext3_writepage(struct page *page)
++{
++ struct inode *inode = page->mapping->host;
++ struct buffer_head *page_buffers;
++ handle_t *handle = NULL;
++ int ret = 0, err;
++ int needed;
++ int order_data;
++
++ J_ASSERT(PageLocked(page));
++
++ /*
++ * We give up here if we're reentered, because it might be
++ * for a different filesystem. One *could* look for a
++ * nested transaction opportunity.
++ */
++ lock_kernel();
++ if (ext3_journal_current_handle())
++ goto out_fail;
++
++ needed = ext3_writepage_trans_blocks(inode);
++ if (current->flags & PF_MEMALLOC)
++ handle = ext3_journal_try_start(inode, needed);
++ else
++ handle = ext3_journal_start(inode, needed);
++
++ if (IS_ERR(handle)) {
++ ret = PTR_ERR(handle);
++ goto out_fail;
++ }
++
++ order_data = ext3_should_order_data(inode) ||
++ ext3_should_journal_data(inode);
++
++ unlock_kernel();
++
++ page_buffers = NULL; /* Purely to prevent compiler warning */
++
++ /* bget() all the buffers */
++ if (order_data) {
++ if (!page->buffers)
++ create_empty_buffers(page,
++ inode->i_dev, inode->i_sb->s_blocksize);
++ page_buffers = page->buffers;
++ walk_page_buffers(handle, page_buffers, 0,
++ PAGE_CACHE_SIZE, NULL, bget_one);
++ }
++
++ ret = block_write_full_page(page, ext3_get_block);
++
++ /*
++ * The page can become unlocked at any point now, and
++ * truncate can then come in and change things. So we
++ * can't touch *page from now on. But *page_buffers is
++ * safe due to elevated refcount.
++ */
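++ /*
++ * For illustration, the refcounting that makes this safe pairs up
++ * as follows (a sketch of the calls above and below, not new code):
++ *
++ * walk_page_buffers(..., bget_one); atomic_inc(&bh->b_count)
++ * block_write_full_page(...); page may unlock here
++ * walk_page_buffers(..., journal_dirty_async_data); __brelse(bh)
++ *
++ * so each buffer_head stays pinned across the window where truncate
++ * could otherwise free it.
++ */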
++
++ handle = ext3_journal_current_handle();
++ lock_kernel();
++
++ /* And attach them to the current transaction */
++ if (order_data) {
++ err = walk_page_buffers(handle, page_buffers,
++ 0, PAGE_CACHE_SIZE, NULL, journal_dirty_async_data);
++ if (!ret)
++ ret = err;
++ }
++
++ err = ext3_journal_stop(handle, inode);
++ if (!ret)
++ ret = err;
++ unlock_kernel();
++ return ret;
++
++out_fail:
++
++ unlock_kernel();
++ SetPageDirty(page);
++ UnlockPage(page);
++ return ret;
++}
++
++static int ext3_readpage(struct file *file, struct page *page)
++{
++ return block_read_full_page(page,ext3_get_block);
++}
++
++
++static int ext3_flushpage(struct page *page, unsigned long offset)
++{
++ journal_t *journal = EXT3_JOURNAL(page->mapping->host);
++ return journal_flushpage(journal, page, offset);
++}
++
++static int ext3_releasepage(struct page *page, int wait)
++{
++ journal_t *journal = EXT3_JOURNAL(page->mapping->host);
++ return journal_try_to_free_buffers(journal, page, wait);
++}
++
++
++struct address_space_operations ext3_aops = {
++ readpage: ext3_readpage, /* BKL not held. Don't need */
++ writepage: ext3_writepage, /* BKL not held. We take it */
++ sync_page: block_sync_page,
++ prepare_write: ext3_prepare_write, /* BKL not held. We take it */
++ commit_write: ext3_commit_write, /* BKL not held. We take it */
++ bmap: ext3_bmap, /* BKL held */
++ flushpage: ext3_flushpage, /* BKL not held. Don't need */
++ releasepage: ext3_releasepage, /* BKL not held. Don't need */
++};
++
++/*
++ * ext3_block_truncate_page() zeroes out a mapping from file offset `from'
++ * up to the end of the block which corresponds to `from'.
++ * This is required during truncate. We need to physically zero the tail end
++ * of that block so it doesn't yield old data if the file is later grown.
++ */
++static int ext3_block_truncate_page(handle_t *handle,
++ struct address_space *mapping, loff_t from)
++{
++ unsigned long index = from >> PAGE_CACHE_SHIFT;
++ unsigned offset = from & (PAGE_CACHE_SIZE-1);
++ unsigned blocksize, iblock, length, pos;
++ struct inode *inode = mapping->host;
++ struct page *page;
++ struct buffer_head *bh;
++ int err;
++
++ blocksize = inode->i_sb->s_blocksize;
++ length = offset & (blocksize - 1);
++
++ /* Block boundary? Nothing to do */
++ if (!length)
++ return 0;
++
++ length = blocksize - length;
++ iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
++
++ page = grab_cache_page(mapping, index);
++ err = -ENOMEM;
++ if (!page)
++ goto out;
++
++ if (!page->buffers)
++ create_empty_buffers(page, inode->i_dev, blocksize);
++
++ /* Find the buffer that contains "offset" */
++ bh = page->buffers;
++ pos = blocksize;
++ while (offset >= pos) {
++ bh = bh->b_this_page;
++ iblock++;
++ pos += blocksize;
++ }
++
++ err = 0;
++ if (!buffer_mapped(bh)) {
++ /* Hole? Nothing to do */
++ if (buffer_uptodate(bh))
++ goto unlock;
++ ext3_get_block(inode, iblock, bh, 0);
++ /* Still unmapped? Nothing to do */
++ if (!buffer_mapped(bh))
++ goto unlock;
++ }
++
++ /* Ok, it's mapped. Make sure it's up-to-date */
++ if (Page_Uptodate(page))
++ set_bit(BH_Uptodate, &bh->b_state);
++
++ if (!buffer_uptodate(bh)) {
++ err = -EIO;
++ ll_rw_block(READ, 1, &bh);
++ wait_on_buffer(bh);
++ /* Uhhuh. Read error. Complain and punt. */
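++ /*
++ * (For reference, the synchronous single-block read idiom used
++ * above:
++ *
++ * ll_rw_block(READ, 1, &bh); submit the read
++ * wait_on_buffer(bh); sleep until it completes
++ * if (!buffer_uptodate(bh)) I/O error - give up
++ *
++ * The block must be read in before part of it is zeroed, or the
++ * untouched portion would be written back as garbage.)
++ */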
++ if (!buffer_uptodate(bh))
++ goto unlock;
++ }
++
++ if (ext3_should_journal_data(inode)) {
++ BUFFER_TRACE(bh, "get write access");
++ err = ext3_journal_get_write_access(handle, bh);
++ if (err)
++ goto unlock;
++ }
++
++ memset(kmap(page) + offset, 0, length);
++ flush_dcache_page(page);
++ kunmap(page);
++
++ BUFFER_TRACE(bh, "zeroed end of block");
++
++ err = 0;
++ if (ext3_should_journal_data(inode)) {
++ err = ext3_journal_dirty_metadata(handle, bh);
++ } else {
++ if (ext3_should_order_data(inode))
++ err = ext3_journal_dirty_data(handle, bh, 0);
++ __mark_buffer_dirty(bh);
++ }
++
++unlock:
++ UnlockPage(page);
++ page_cache_release(page);
++out:
++ return err;
++}
++
++/*
++ * Probably it should be a library function... search for first non-zero word
++ * or memcmp with zero_page, whatever is better for particular architecture.
++ * Linus?
++ */
++static inline int all_zeroes(u32 *p, u32 *q)
++{
++ while (p < q)
++ if (*p++)
++ return 0;
++ return 1;
++}
++
++/**
++ * ext3_find_shared - find the indirect blocks for partial truncation.
++ * @inode: inode in question
++ * @depth: depth of the affected branch
++ * @offsets: offsets of pointers in that branch (see ext3_block_to_path)
++ * @chain: place to store the pointers to partial indirect blocks
++ * @top: place to the (detached) top of branch
++ *
++ * This is a helper function used by ext3_truncate().
++ *
++ * When we do truncate() we may have to clean the ends of several
++ * indirect blocks but leave the blocks themselves alive. Block is
++ * partially truncated if some data below the new i_size is referred
++ * from it (and it is on the path to the first completely truncated
++ * data block, indeed). We have to free the top of that path along
++ * with everything to the right of the path. Since no allocation
++ * past the truncation point is possible until ext3_truncate()
++ * finishes, we may safely do the latter, but top of branch may
++ * require special attention - pageout below the truncation point
++ * might try to populate it.
++ *
++ * We atomically detach the top of branch from the tree, store the
++ * block number of its root in *@top, pointers to buffer_heads of
++ * partially truncated blocks - in @chain[].bh and pointers to
++ * their last elements that should not be removed - in
++ * @chain[].p. Return value is the pointer to last filled element
++ * of @chain.
++ *
++ * The work left to caller to do the actual freeing of subtrees:
++ * a) free the subtree starting from *@top
++ * b) free the subtrees whose roots are stored in
++ * (@chain[i].p+1 .. end of @chain[i].bh->b_data)
++ * c) free the subtrees growing from the inode past the @chain[0].
++ * (no partially truncated stuff there). */
++
++static Indirect *ext3_find_shared(struct inode *inode,
++ int depth,
++ int offsets[4],
++ Indirect chain[4],
++ u32 *top)
++{
++ Indirect *partial, *p;
++ int k, err;
++
++ *top = 0;
++ /* Make k index the deepest non-null offset + 1 */
++ for (k = depth; k > 1 && !offsets[k-1]; k--)
++ ;
++ partial = ext3_get_branch(inode, k, offsets, chain, &err);
++ /* Writer: pointers */
++ if (!partial)
++ partial = chain + k-1;
++ /*
++ * If the branch acquired continuation since we've looked at it -
++ * fine, it should all survive and (new) top doesn't belong to us.
++ */
++ if (!partial->key && *partial->p)
++ /* Writer: end */
++ goto no_top;
++ for (p=partial; p>chain && all_zeroes((u32*)p->bh->b_data,p->p); p--)
++ ;
++ /*
++ * OK, we've found the last block that must survive. The rest of our
++ * branch should be detached before unlocking. However, if that rest
++ * of branch is all ours and does not grow immediately from the inode
++ * it's easier to cheat and just decrement partial->p.
++ */
++ if (p == chain + k - 1 && p > chain) {
++ p->p--;
++ } else {
++ *top = *p->p;
++ /* Nope, don't do this in ext3. Must leave the tree intact */
++#if 0
++ *p->p = 0;
++#endif
++ }
++ /* Writer: end */
++
++ while(partial > p)
++ {
++ brelse(partial->bh);
++ partial--;
++ }
++no_top:
++ return partial;
++}
++
++/*
++ * Zero a number of block pointers in either an inode or an indirect block.
++ * If we restart the transaction we must again get write access to the
++ * indirect block for further modification.
++ *
++ * We release `count' blocks on disk, but (last - first) may be greater
++ * than `count' because there can be holes in there.
++ */
++static void
++ext3_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh,
++ unsigned long block_to_free, unsigned long count,
++ u32 *first, u32 *last)
++{
++ u32 *p;
++ if (try_to_extend_transaction(handle, inode)) {
++ if (bh) {
++ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
++ ext3_journal_dirty_metadata(handle, bh);
++ }
++ ext3_mark_inode_dirty(handle, inode);
++ ext3_journal_test_restart(handle, inode);
++ BUFFER_TRACE(bh, "get_write_access");
++ ext3_journal_get_write_access(handle, bh);
++ }
++
++ /*
++ * Any buffers which are on the journal will be in memory. We find
++ * them on the hash table so journal_revoke() will run journal_forget()
++ * on them. We've already detached each block from the file, so
++ * bforget() in journal_forget() should be safe.
++ *
++ * AKPM: turn on bforget in journal_forget()!!!
++ */
++ for (p = first; p < last; p++) {
++ u32 nr = le32_to_cpu(*p);
++ if (nr) {
++ struct buffer_head *bh;
++
++ *p = 0;
++ bh = sb_get_hash_table(inode->i_sb, nr);
++ ext3_forget(handle, 0, inode, bh, nr);
++ }
++ }
++
++ ext3_free_blocks(handle, inode, block_to_free, count);
++}
++
++/**
++ * ext3_free_data - free a list of data blocks
++ * @handle: handle for this transaction
++ * @inode: inode we are dealing with
++ * @this_bh: indirect buffer_head which contains *@first and *@last
++ * @first: array of block numbers
++ * @last: points immediately past the end of array
++ *
++ * We are freeing all blocks referred from that array (numbers are stored as
++ * little-endian 32-bit) and updating @inode->i_blocks appropriately.
++ *
++ * We accumulate contiguous runs of blocks to free. Conveniently, if these
++ * blocks are contiguous then releasing them at one time will only affect one
++ * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
++ * actually use a lot of journal space.
++ *
++ * @this_bh will be %NULL if @first and @last point into the inode's direct
++ * block pointers.
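++ *
++ * For example (illustrative numbers): freeing blocks 100, 101, 102, 200
++ * costs two calls,
++ *
++ * ext3_clear_blocks(handle, inode, this_bh, 100, 3, p, p+3);
++ * ext3_clear_blocks(handle, inode, this_bh, 200, 1, p+3, p+4);
++ *
++ * where p points at the first of the four block pointers, so each
++ * contiguous run is released with a single pass over the bitmaps.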
++ */
++static void ext3_free_data(handle_t *handle, struct inode *inode,
++ struct buffer_head *this_bh, u32 *first, u32 *last)
++{
++ unsigned long block_to_free = 0; /* Starting block # of a run */
++ unsigned long count = 0; /* Number of blocks in the run */
++ u32 *block_to_free_p = NULL; /* Pointer into inode/ind
++ corresponding to
++ block_to_free */
++ unsigned long nr; /* Current block # */
++ u32 *p; /* Pointer into inode/ind
++ for current block */
++ int err;
++
++ if (this_bh) { /* For indirect block */
++ BUFFER_TRACE(this_bh, "get_write_access");
++ err = ext3_journal_get_write_access(handle, this_bh);
++ /* Important: if we can't update the indirect pointers
++ * to the blocks, we can't free them. */
++ if (err)
++ return;
++ }
++
++ for (p = first; p < last; p++) {
++ nr = le32_to_cpu(*p);
++ if (nr) {
++ /* accumulate blocks to free if they're contiguous */
++ if (count == 0) {
++ block_to_free = nr;
++ block_to_free_p = p;
++ count = 1;
++ } else if (nr == block_to_free + count) {
++ count++;
++ } else {
++ ext3_clear_blocks(handle, inode, this_bh,
++ block_to_free,
++ count, block_to_free_p, p);
++ block_to_free = nr;
++ block_to_free_p = p;
++ count = 1;
++ }
++ }
++ }
++
++ if (count > 0)
++ ext3_clear_blocks(handle, inode, this_bh, block_to_free,
++ count, block_to_free_p, p);
++
++ if (this_bh) {
++ BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata");
++ ext3_journal_dirty_metadata(handle, this_bh);
++ }
++}
++
++/**
++ * ext3_free_branches - free an array of branches
++ * @handle: JBD handle for this transaction
++ * @inode: inode we are dealing with
++ * @parent_bh: the buffer_head which contains *@first and *@last
++ * @first: array of block numbers
++ * @last: pointer immediately past the end of array
++ * @depth: depth of the branches to free
++ *
++ * We are freeing all blocks referred from these branches (numbers are
++ * stored as little-endian 32-bit) and updating @inode->i_blocks
++ * appropriately.
++ */
++static void ext3_free_branches(handle_t *handle, struct inode *inode,
++ struct buffer_head *parent_bh,
++ u32 *first, u32 *last, int depth)
++{
++ unsigned long nr;
++ u32 *p;
++
++ if (is_handle_aborted(handle))
++ return;
++
++ if (depth--) {
++ struct buffer_head *bh;
++ int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
++ p = last;
++ while (--p >= first) {
++ nr = le32_to_cpu(*p);
++ if (!nr)
++ continue; /* A hole */
++
++ /* Go read the buffer for the next level down */
++ bh = sb_bread(inode->i_sb, nr);
++
++ /*
++ * A read failure? Report error and clear slot
++ * (should be rare).
++ */
++ if (!bh) {
++ ext3_error(inode->i_sb, "ext3_free_branches",
++ "Read failure, inode=%ld, block=%ld",
++ inode->i_ino, nr);
++ continue;
++ }
++
++ /* This zaps the entire block. Bottom up. */
++ BUFFER_TRACE(bh, "free child branches");
++ ext3_free_branches(handle, inode, bh, (u32*)bh->b_data,
++ (u32*)bh->b_data + addr_per_block,
++ depth);
++
++ /*
++ * We've probably journalled the indirect block several
++ * times during the truncate. But it's no longer
++ * needed and we now drop it from the transaction via
++ * journal_revoke().
++ *
++ * That's easy if it's exclusively part of this
++ * transaction. But if it's part of the committing
++ * transaction then journal_forget() will simply
++ * brelse() it. That means that if the underlying
++ * block is reallocated in ext3_get_block(),
++ * unmap_underlying_metadata() will find this block
++ * and will try to get rid of it. damn, damn.
++ *
++ * If this block has already been committed to the
++ * journal, a revoke record will be written. And
++ * revoke records must be emitted *before* clearing
++ * this block's bit in the bitmaps.
++ */
++ ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
++
++ /*
++ * Everything below this pointer has been
++ * released. Now let this top-of-subtree go.
++ *
++ * We want the freeing of this indirect block to be
++ * atomic in the journal with the updating of the
++ * bitmap block which owns it. So make some room in
++ * the journal.
++ *
++ * We zero the parent pointer *after* freeing its
++ * pointee in the bitmaps, so if extend_transaction()
++ * for some reason fails to put the bitmap changes and
++ * the release into the same transaction, recovery
++ * will merely complain about releasing a free block,
++ * rather than leaking blocks.
++ */
++ if (is_handle_aborted(handle))
++ return;
++ if (try_to_extend_transaction(handle, inode)) {
++ ext3_mark_inode_dirty(handle, inode);
++ ext3_journal_test_restart(handle, inode);
++ }
++
++ ext3_free_blocks(handle, inode, nr, 1);
++
++ if (parent_bh) {
++ /*
++ * The block which we have just freed is
++ * pointed to by an indirect block: journal it
++ */
++ BUFFER_TRACE(parent_bh, "get_write_access");
++ if (!ext3_journal_get_write_access(handle,
++ parent_bh)){
++ *p = 0;
++ BUFFER_TRACE(parent_bh,
++ "call ext3_journal_dirty_metadata");
++ ext3_journal_dirty_metadata(handle,
++ parent_bh);
++ }
++ }
++ }
++ } else {
++ /* We have reached the bottom of the tree. */
++ BUFFER_TRACE(parent_bh, "free data blocks");
++ ext3_free_data(handle, inode, parent_bh, first, last);
++ }
++}
++
++/*
++ * ext3_truncate()
++ *
++ * We block out ext3_get_block() block instantiations across the entire
++ * transaction, and VFS/VM ensures that ext3_truncate() cannot run
++ * simultaneously on behalf of the same inode.
++ *
++ * As we work through the truncate and commit bits of it to the journal there
++ * is one core, guiding principle: the file's tree must always be consistent on
++ * disk. We must be able to restart the truncate after a crash.
++ *
++ * The file's tree may be transiently inconsistent in memory (although it
++ * probably isn't), but whenever we close off and commit a journal transaction,
++ * the contents of (the filesystem + the journal) must be consistent and
++ * restartable. It's pretty simple, really: bottom up, right to left (although
++ * left-to-right works OK too).
++ *
++ * Note that at recovery time, journal replay occurs *before* the restart of
++ * truncate against the orphan inode list.
++ *
++ * The committed inode has the new, desired i_size (which is the same as
++ * i_disksize in this case). After a crash, ext3_orphan_cleanup() will see
++ * that this inode's truncate did not complete and it will again call
++ * ext3_truncate() to have another go. So there will be instantiated blocks
++ * to the right of the truncation point in a crashed ext3 filesystem. But
++ * that's fine - as long as they are linked from the inode, the post-crash
++ * ext3_truncate() run will find them and release them.
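++ *
++ * (Timeline sketch of that crash-safety argument, for illustration:
++ *
++ * ext3_orphan_add(handle, inode); inode on on-disk orphan list
++ * i_disksize = i_size; committed size is the target
++ * ... free branches, possibly over several transactions ...
++ * ext3_orphan_del(handle, inode); only once the tree is trimmed
++ *
++ * A crash anywhere between add and del leaves the inode on the orphan
++ * list, and ext3_orphan_cleanup() re-runs ext3_truncate() at mount.)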
++ */ ++ ++void ext3_truncate(struct inode * inode) ++{ ++ handle_t *handle; ++ u32 *i_data = inode->u.ext3_i.i_data; ++ int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb); ++ int offsets[4]; ++ Indirect chain[4]; ++ Indirect *partial; ++ int nr = 0; ++ int n; ++ long last_block; ++ unsigned blocksize; ++ ++ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || ++ S_ISLNK(inode->i_mode))) ++ return; ++ if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) ++ return; ++ ++ ext3_discard_prealloc(inode); ++ ++ handle = start_transaction(inode); ++ if (IS_ERR(handle)) ++ return; /* AKPM: return what? */ ++ ++ blocksize = inode->i_sb->s_blocksize; ++ last_block = (inode->i_size + blocksize-1) ++ >> EXT3_BLOCK_SIZE_BITS(inode->i_sb); ++ ++ ext3_block_truncate_page(handle, inode->i_mapping, inode->i_size); ++ ++ ++ n = ext3_block_to_path(inode, last_block, offsets); ++ if (n == 0) ++ goto out_stop; /* error */ ++ ++ /* ++ * OK. This truncate is going to happen. We add the inode to the ++ * orphan list, so that if this truncate spans multiple transactions, ++ * and we crash, we will resume the truncate when the filesystem ++ * recovers. It also marks the inode dirty, to catch the new size. ++ * ++ * Implication: the file must always be in a sane, consistent ++ * truncatable state while each transaction commits. ++ */ ++ if (ext3_orphan_add(handle, inode)) ++ goto out_stop; ++ ++ /* ++ * The orphan list entry will now protect us from any crash which ++ * occurs before the truncate completes, so it is now safe to propagate ++ * the new, shorter inode size (held for now in i_size) into the ++ * on-disk inode. We do this via i_disksize, which is the value which ++ * ext3 *really* writes onto the disk inode. ++ */ ++ inode->u.ext3_i.i_disksize = inode->i_size; ++ ++ /* ++ * From here we block out all ext3_get_block() callers who want to ++ * modify the block allocation tree. ++ */ ++ down_write(&inode->u.ext3_i.truncate_sem); ++ ++ if (n == 1) { /* direct blocks */ ++ ext3_free_data(handle, inode, NULL, i_data+offsets[0], ++ i_data + EXT3_NDIR_BLOCKS); ++ goto do_indirects; ++ } ++ ++ partial = ext3_find_shared(inode, n, offsets, chain, &nr); ++ /* Kill the top of shared branch (not detached) */ ++ if (nr) { ++ if (partial == chain) { ++ /* Shared branch grows from the inode */ ++ ext3_free_branches(handle, inode, NULL, ++ &nr, &nr+1, (chain+n-1) - partial); ++ *partial->p = 0; ++ /* ++ * We mark the inode dirty prior to restart, ++ * and prior to stop. No need for it here. 
++ */ ++ } else { ++ /* Shared branch grows from an indirect block */ ++ BUFFER_TRACE(partial->bh, "get_write_access"); ++ ext3_free_branches(handle, inode, partial->bh, ++ partial->p, ++ partial->p+1, (chain+n-1) - partial); ++ } ++ } ++ /* Clear the ends of indirect blocks on the shared branch */ ++ while (partial > chain) { ++ ext3_free_branches(handle, inode, partial->bh, partial->p + 1, ++ (u32*)partial->bh->b_data + addr_per_block, ++ (chain+n-1) - partial); ++ BUFFER_TRACE(partial->bh, "call brelse"); ++ brelse (partial->bh); ++ partial--; ++ } ++do_indirects: ++ /* Kill the remaining (whole) subtrees */ ++ switch (offsets[0]) { ++ default: ++ nr = i_data[EXT3_IND_BLOCK]; ++ if (nr) { ++ ext3_free_branches(handle, inode, NULL, ++ &nr, &nr+1, 1); ++ i_data[EXT3_IND_BLOCK] = 0; ++ } ++ case EXT3_IND_BLOCK: ++ nr = i_data[EXT3_DIND_BLOCK]; ++ if (nr) { ++ ext3_free_branches(handle, inode, NULL, ++ &nr, &nr+1, 2); ++ i_data[EXT3_DIND_BLOCK] = 0; ++ } ++ case EXT3_DIND_BLOCK: ++ nr = i_data[EXT3_TIND_BLOCK]; ++ if (nr) { ++ ext3_free_branches(handle, inode, NULL, ++ &nr, &nr+1, 3); ++ i_data[EXT3_TIND_BLOCK] = 0; ++ } ++ case EXT3_TIND_BLOCK: ++ ; ++ } ++ up_write(&inode->u.ext3_i.truncate_sem); ++ inode->i_mtime = inode->i_ctime = CURRENT_TIME; ++ ext3_mark_inode_dirty(handle, inode); ++ ++ /* In a multi-transaction truncate, we only make the final ++ * transaction synchronous */ ++ if (IS_SYNC(inode)) ++ handle->h_sync = 1; ++out_stop: ++ /* ++ * If this was a simple ftruncate(), and the file will remain alive ++ * then we need to clear up the orphan record which we created above. ++ * However, if this was a real unlink then we were called by ++ * ext3_delete_inode(), and we allow that function to clean up the ++ * orphan info for us. ++ */ ++ if (inode->i_nlink) ++ ext3_orphan_del(handle, inode); ++ ++ ext3_journal_stop(handle, inode); ++} ++ ++/* ++ * ext3_get_inode_loc returns with an extra refcount against the ++ * inode's underlying buffer_head on success. 
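++ *
++ * (Usage sketch, for illustration:
++ *
++ * struct ext3_iloc iloc;
++ * if (ext3_get_inode_loc(inode, &iloc) == 0) {
++ * ... read or modify iloc.raw_inode ...
++ * brelse(iloc.bh); drop the reference we were given
++ * }
++ *
++ * Callers which hand iloc.bh on to the journalling path instead let
++ * ext3_do_update_inode() consume the reference - see below.)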
++ */ ++ ++int ext3_get_inode_loc (struct inode *inode, struct ext3_iloc *iloc) ++{ ++ struct buffer_head *bh = 0; ++ unsigned long block; ++ unsigned long block_group; ++ unsigned long group_desc; ++ unsigned long desc; ++ unsigned long offset; ++ struct ext3_group_desc * gdp; ++ ++ if ((inode->i_ino != EXT3_ROOT_INO && ++ inode->i_ino != EXT3_ACL_IDX_INO && ++ inode->i_ino != EXT3_ACL_DATA_INO && ++ inode->i_ino != EXT3_JOURNAL_INO && ++ inode->i_ino < EXT3_FIRST_INO(inode->i_sb)) || ++ inode->i_ino > le32_to_cpu( ++ inode->i_sb->u.ext3_sb.s_es->s_inodes_count)) { ++ ext3_error (inode->i_sb, "ext3_get_inode_loc", ++ "bad inode number: %lu", inode->i_ino); ++ goto bad_inode; ++ } ++ block_group = (inode->i_ino - 1) / EXT3_INODES_PER_GROUP(inode->i_sb); ++ if (block_group >= inode->i_sb->u.ext3_sb.s_groups_count) { ++ ext3_error (inode->i_sb, "ext3_get_inode_loc", ++ "group >= groups count"); ++ goto bad_inode; ++ } ++ group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(inode->i_sb); ++ desc = block_group & (EXT3_DESC_PER_BLOCK(inode->i_sb) - 1); ++ bh = inode->i_sb->u.ext3_sb.s_group_desc[group_desc]; ++ if (!bh) { ++ ext3_error (inode->i_sb, "ext3_get_inode_loc", ++ "Descriptor not loaded"); ++ goto bad_inode; ++ } ++ ++ gdp = (struct ext3_group_desc *) bh->b_data; ++ /* ++ * Figure out the offset within the block group inode table ++ */ ++ offset = ((inode->i_ino - 1) % EXT3_INODES_PER_GROUP(inode->i_sb)) * ++ EXT3_INODE_SIZE(inode->i_sb); ++ block = le32_to_cpu(gdp[desc].bg_inode_table) + ++ (offset >> EXT3_BLOCK_SIZE_BITS(inode->i_sb)); ++ if (!(bh = sb_bread(inode->i_sb, block))) { ++ ext3_error (inode->i_sb, "ext3_get_inode_loc", ++ "unable to read inode block - " ++ "inode=%lu, block=%lu", inode->i_ino, block); ++ goto bad_inode; ++ } ++ offset &= (EXT3_BLOCK_SIZE(inode->i_sb) - 1); ++ ++ iloc->bh = bh; ++ iloc->raw_inode = (struct ext3_inode *) (bh->b_data + offset); ++ iloc->block_group = block_group; ++ ++ return 0; ++ ++ bad_inode: ++ return -EIO; ++} ++ ++void ext3_read_inode(struct inode * inode) ++{ ++ struct ext3_iloc iloc; ++ struct ext3_inode *raw_inode; ++ struct buffer_head *bh; ++ int block; ++ ++ if(ext3_get_inode_loc(inode, &iloc)) ++ goto bad_inode; ++ bh = iloc.bh; ++ raw_inode = iloc.raw_inode; ++ init_rwsem(&inode->u.ext3_i.truncate_sem); ++ inode->i_mode = le16_to_cpu(raw_inode->i_mode); ++ inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); ++ inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); ++ if(!(test_opt (inode->i_sb, NO_UID32))) { ++ inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; ++ inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; ++ } ++ inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); ++ inode->i_size = le32_to_cpu(raw_inode->i_size); ++ inode->i_atime = le32_to_cpu(raw_inode->i_atime); ++ inode->i_ctime = le32_to_cpu(raw_inode->i_ctime); ++ inode->i_mtime = le32_to_cpu(raw_inode->i_mtime); ++ inode->u.ext3_i.i_dtime = le32_to_cpu(raw_inode->i_dtime); ++ /* We now have enough fields to check if the inode was active or not. 
++ * This is needed because nfsd might try to access dead inodes;
++ * the test is the same one that e2fsck uses
++ * NeilBrown 1999oct15
++ */
++ if (inode->i_nlink == 0) {
++ if (inode->i_mode == 0 ||
++ !(inode->i_sb->u.ext3_sb.s_mount_state & EXT3_ORPHAN_FS)) {
++ /* this inode is deleted */
++ brelse (bh);
++ goto bad_inode;
++ }
++ /* The only unlinked inodes we let through here have
++ * valid i_mode and are being read by the orphan
++ * recovery code: that's fine, we're about to complete
++ * the process of deleting those. */
++ }
++ inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size
++ * (for stat), not the fs block
++ * size */
++ inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
++ inode->i_version = ++event;
++ inode->u.ext3_i.i_flags = le32_to_cpu(raw_inode->i_flags);
++#ifdef EXT3_FRAGMENTS
++ inode->u.ext3_i.i_faddr = le32_to_cpu(raw_inode->i_faddr);
++ inode->u.ext3_i.i_frag_no = raw_inode->i_frag;
++ inode->u.ext3_i.i_frag_size = raw_inode->i_fsize;
++#endif
++ inode->u.ext3_i.i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
++ if (!S_ISREG(inode->i_mode)) {
++ inode->u.ext3_i.i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
++ } else {
++ inode->i_size |=
++ ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
++ }
++ inode->u.ext3_i.i_disksize = inode->i_size;
++ inode->i_generation = le32_to_cpu(raw_inode->i_generation);
++#ifdef EXT3_PREALLOCATE
++ inode->u.ext3_i.i_prealloc_count = 0;
++#endif
++ inode->u.ext3_i.i_block_group = iloc.block_group;
++
++ /*
++ * NOTE! The in-memory inode i_data array is in little-endian order
++ * even on big-endian machines: we do NOT byteswap the block numbers!
++ */
++ for (block = 0; block < EXT3_N_BLOCKS; block++)
++ inode->u.ext3_i.i_data[block] = iloc.raw_inode->i_block[block];
++ INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan);
++
++ brelse (iloc.bh);
++
++ if (inode->i_ino == EXT3_ACL_IDX_INO ||
++ inode->i_ino == EXT3_ACL_DATA_INO)
++ /* Nothing to do */ ;
++ else if (S_ISREG(inode->i_mode)) {
++ inode->i_op = &ext3_file_inode_operations;
++ inode->i_fop = &ext3_file_operations;
++ inode->i_mapping->a_ops = &ext3_aops;
++ } else if (S_ISDIR(inode->i_mode)) {
++ inode->i_op = &ext3_dir_inode_operations;
++ inode->i_fop = &ext3_dir_operations;
++ } else if (S_ISLNK(inode->i_mode)) {
++ if (!inode->i_blocks)
++ inode->i_op = &ext3_fast_symlink_inode_operations;
++ else {
++ inode->i_op = &page_symlink_inode_operations;
++ inode->i_mapping->a_ops = &ext3_aops;
++ }
++ } else
++ init_special_inode(inode, inode->i_mode,
++ le32_to_cpu(iloc.raw_inode->i_block[0]));
++ /* inode->i_attr_flags = 0; unused */
++ if (inode->u.ext3_i.i_flags & EXT3_SYNC_FL) {
++ /* inode->i_attr_flags |= ATTR_FLAG_SYNCRONOUS; unused */
++ inode->i_flags |= S_SYNC;
++ }
++ if (inode->u.ext3_i.i_flags & EXT3_APPEND_FL) {
++ /* inode->i_attr_flags |= ATTR_FLAG_APPEND; unused */
++ inode->i_flags |= S_APPEND;
++ }
++ if (inode->u.ext3_i.i_flags & EXT3_IMMUTABLE_FL) {
++ /* inode->i_attr_flags |= ATTR_FLAG_IMMUTABLE; unused */
++ inode->i_flags |= S_IMMUTABLE;
++ }
++ if (inode->u.ext3_i.i_flags & EXT3_NOATIME_FL) {
++ /* inode->i_attr_flags |= ATTR_FLAG_NOATIME; unused */
++ inode->i_flags |= S_NOATIME;
++ }
++ return;
++
++bad_inode:
++ make_bad_inode(inode);
++ return;
++}
++
++/*
++ * Post the struct inode info into an on-disk inode location in the
++ * buffer-cache. This gobbles the caller's reference to the
++ * buffer_head in the inode location struct.
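++ *
++ * (Because the reference is consumed here, a caller that wants to
++ * keep its own copy of the buffer_head alive must bump the count
++ * first, as ext3_mark_iloc_dirty() below does:
++ *
++ * atomic_inc(&iloc->bh->b_count);
++ * ext3_do_update_inode(handle, inode, iloc);
++ * brelse(iloc->bh);
++ * )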
++ */ ++ ++static int ext3_do_update_inode(handle_t *handle, ++ struct inode *inode, ++ struct ext3_iloc *iloc) ++{ ++ struct ext3_inode *raw_inode = iloc->raw_inode; ++ struct buffer_head *bh = iloc->bh; ++ int err = 0, rc, block; ++ ++ if (handle) { ++ BUFFER_TRACE(bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, bh); ++ if (err) ++ goto out_brelse; ++ } ++ raw_inode->i_mode = cpu_to_le16(inode->i_mode); ++ if(!(test_opt(inode->i_sb, NO_UID32))) { ++ raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); ++ raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); ++/* ++ * Fix up interoperability with old kernels. Otherwise, old inodes get ++ * re-used with the upper 16 bits of the uid/gid intact ++ */ ++ if(!inode->u.ext3_i.i_dtime) { ++ raw_inode->i_uid_high = ++ cpu_to_le16(high_16_bits(inode->i_uid)); ++ raw_inode->i_gid_high = ++ cpu_to_le16(high_16_bits(inode->i_gid)); ++ } else { ++ raw_inode->i_uid_high = 0; ++ raw_inode->i_gid_high = 0; ++ } ++ } else { ++ raw_inode->i_uid_low = ++ cpu_to_le16(fs_high2lowuid(inode->i_uid)); ++ raw_inode->i_gid_low = ++ cpu_to_le16(fs_high2lowgid(inode->i_gid)); ++ raw_inode->i_uid_high = 0; ++ raw_inode->i_gid_high = 0; ++ } ++ raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); ++ raw_inode->i_size = cpu_to_le32(inode->u.ext3_i.i_disksize); ++ raw_inode->i_atime = cpu_to_le32(inode->i_atime); ++ raw_inode->i_ctime = cpu_to_le32(inode->i_ctime); ++ raw_inode->i_mtime = cpu_to_le32(inode->i_mtime); ++ raw_inode->i_blocks = cpu_to_le32(inode->i_blocks); ++ raw_inode->i_dtime = cpu_to_le32(inode->u.ext3_i.i_dtime); ++ raw_inode->i_flags = cpu_to_le32(inode->u.ext3_i.i_flags); ++#ifdef EXT3_FRAGMENTS ++ raw_inode->i_faddr = cpu_to_le32(inode->u.ext3_i.i_faddr); ++ raw_inode->i_frag = inode->u.ext3_i.i_frag_no; ++ raw_inode->i_fsize = inode->u.ext3_i.i_frag_size; ++#else ++ /* If we are not tracking these fields in the in-memory inode, ++ * then preserve them on disk, but still initialise them to zero ++ * for new inodes. */ ++ if (EXT3_I(inode)->i_state & EXT3_STATE_NEW) { ++ raw_inode->i_faddr = 0; ++ raw_inode->i_frag = 0; ++ raw_inode->i_fsize = 0; ++ } ++#endif ++ raw_inode->i_file_acl = cpu_to_le32(inode->u.ext3_i.i_file_acl); ++ if (!S_ISREG(inode->i_mode)) { ++ raw_inode->i_dir_acl = cpu_to_le32(inode->u.ext3_i.i_dir_acl); ++ } else { ++ raw_inode->i_size_high = ++ cpu_to_le32(inode->u.ext3_i.i_disksize >> 32); ++ if (inode->u.ext3_i.i_disksize > 0x7fffffffULL) { ++ struct super_block *sb = inode->i_sb; ++ if (!EXT3_HAS_RO_COMPAT_FEATURE(sb, ++ EXT3_FEATURE_RO_COMPAT_LARGE_FILE) || ++ EXT3_SB(sb)->s_es->s_rev_level == ++ cpu_to_le32(EXT3_GOOD_OLD_REV)) { ++ /* If this is the first large file ++ * created, add a flag to the superblock. 
++ */
++ err = ext3_journal_get_write_access(handle,
++ sb->u.ext3_sb.s_sbh);
++ if (err)
++ goto out_brelse;
++ ext3_update_dynamic_rev(sb);
++ EXT3_SET_RO_COMPAT_FEATURE(sb,
++ EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
++ sb->s_dirt = 1;
++ handle->h_sync = 1;
++ err = ext3_journal_dirty_metadata(handle,
++ sb->u.ext3_sb.s_sbh);
++ }
++ }
++ }
++ raw_inode->i_generation = le32_to_cpu(inode->i_generation);
++ if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
++ raw_inode->i_block[0] =
++ cpu_to_le32(kdev_t_to_nr(inode->i_rdev));
++ else for (block = 0; block < EXT3_N_BLOCKS; block++)
++ raw_inode->i_block[block] = inode->u.ext3_i.i_data[block];
++
++ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
++ rc = ext3_journal_dirty_metadata(handle, bh);
++ if (!err)
++ err = rc;
++ EXT3_I(inode)->i_state &= ~EXT3_STATE_NEW;
++
++out_brelse:
++ brelse (bh);
++ ext3_std_error(inode->i_sb, err);
++ return err;
++}
++
++/*
++ * ext3_write_inode()
++ *
++ * We are called from a few places:
++ *
++ * - Within generic_file_write() for O_SYNC files.
++ * Here, there will be no transaction running. We wait for any running
++ * transaction to commit.
++ *
++ * - Within sys_sync(), kupdate and such.
++ * We wait on commit, if told to.
++ *
++ * - Within prune_icache() (PF_MEMALLOC == true)
++ * Here we simply return. We can't afford to block kswapd on the
++ * journal commit.
++ *
++ * In all cases it is actually safe for us to return without doing anything,
++ * because the inode has been copied into a raw inode buffer in
++ * ext3_mark_inode_dirty(). This is a correctness thing for O_SYNC and for
++ * knfsd.
++ *
++ * Note that we are absolutely dependent upon all inode dirtiers doing the
++ * right thing: they *must* call mark_inode_dirty() after dirtying info in
++ * which we are interested.
++ *
++ * It would be a bug for them to not do this. The code:
++ *
++ * mark_inode_dirty(inode)
++ * stuff();
++ * inode->i_size = expr;
++ *
++ * is in error because a kswapd-driven write_inode() could occur while
++ * `stuff()' is running, and the new i_size will be lost. Plus the inode
++ * will no longer be on the superblock's dirty inode list.
++ */
++void ext3_write_inode(struct inode *inode, int wait)
++{
++ if (current->flags & PF_MEMALLOC)
++ return;
++
++ if (ext3_journal_current_handle()) {
++ jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n");
++ return;
++ }
++
++ if (!wait)
++ return;
++
++ ext3_force_commit(inode->i_sb);
++}
++
++/*
++ * ext3_setattr()
++ *
++ * Called from notify_change.
++ *
++ * We want to trap VFS attempts to truncate the file as soon as
++ * possible. In particular, we want to make sure that when the VFS
++ * shrinks i_size, we put the inode on the orphan list and modify
++ * i_disksize immediately, so that during the subsequent flushing of
++ * dirty pages and freeing of disk blocks, we can guarantee that any
++ * commit will leave the blocks being flushed in an unused state on
++ * disk. (On recovery, the inode will get truncated and the blocks will
++ * be freed, so we have a strong guarantee that no future commit will
++ * leave these blocks visible to the user.)
++ *
++ * This is only needed for regular files. rmdir() has its own path, and
++ * we can never truncate a directory except on final unlink (at which
++ * point i_nlink is zero so recovery is easy.)
++ *
++ * Called with the BKL.
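++ *
++ * (Sketch of the ordering this gives us for a shrinking truncate,
++ * for illustration:
++ *
++ * handle = ext3_journal_start(inode, 3);
++ * ext3_orphan_add(handle, inode); crash protection first
++ * i_disksize = attr->ia_size; then the new on-disk size
++ * ext3_mark_inode_dirty(handle, inode);
++ * ext3_journal_stop(handle, inode);
++ * inode_setattr(inode, attr); VFS frees the pages/blocks
++ *
++ * so the orphan record reaches the journal before any block is freed
++ * on behalf of the smaller size.)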
++ */
++
++int ext3_setattr(struct dentry *dentry, struct iattr *attr)
++{
++ struct inode *inode = dentry->d_inode;
++ int error, rc = 0;
++ const unsigned int ia_valid = attr->ia_valid;
++
++ error = inode_change_ok(inode, attr);
++ if (error)
++ return error;
++
++ if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
++ (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
++ error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
++ if (error)
++ return error;
++ }
++
++ if (attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
++ handle_t *handle;
++
++ handle = ext3_journal_start(inode, 3);
++ if (IS_ERR(handle)) {
++ error = PTR_ERR(handle);
++ goto err_out;
++ }
++
++ error = ext3_orphan_add(handle, inode);
++ inode->u.ext3_i.i_disksize = attr->ia_size;
++ rc = ext3_mark_inode_dirty(handle, inode);
++ if (!error)
++ error = rc;
++ ext3_journal_stop(handle, inode);
++ }
++
++ rc = inode_setattr(inode, attr);
++
++ /* If inode_setattr's call to ext3_truncate failed to get a
++ * transaction handle at all, we need to clean up the in-core
++ * orphan list manually. */
++ if (inode->i_nlink)
++ ext3_orphan_del(NULL, inode);
++
++err_out:
++ ext3_std_error(inode->i_sb, error);
++ if (!error)
++ error = rc;
++ return error;
++}
++
++
++/*
++ * akpm: how many blocks doth make a writepage()?
++ *
++ * With N blocks per page, it may be:
++ * N data blocks
++ * 2 indirect block
++ * 2 dindirect
++ * 1 tindirect
++ * N+5 bitmap blocks (from the above)
++ * N+5 group descriptor summary blocks
++ * 1 inode block
++ * 1 superblock.
++ * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quota files
++ *
++ * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
++ *
++ * With ordered or writeback data it's the same, less the N data blocks.
++ *
++ * If the inode's direct blocks can hold an integral number of pages then a
++ * page cannot straddle two indirect blocks, and we can only touch one indirect
++ * and dindirect block, and the "5" above becomes "3".
++ *
++ * This still overestimates under most circumstances. If we were to pass the
++ * start and end offsets in here as well we could do block_to_path() on each
++ * block and work out the exact number of indirects which are touched. Pah.
++ */
++
++int ext3_writepage_trans_blocks(struct inode *inode)
++{
++ int bpp = ext3_journal_blocks_per_page(inode);
++ int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3;
++ int ret;
++
++ if (ext3_should_journal_data(inode))
++ ret = 3 * (bpp + indirects) + 2;
++ else
++ ret = 2 * (bpp + indirects) + 2;
++
++#ifdef CONFIG_QUOTA
++ ret += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS;
++#endif
++
++ return ret;
++}
++
++int
++ext3_mark_iloc_dirty(handle_t *handle,
++ struct inode *inode,
++ struct ext3_iloc *iloc)
++{
++ int err = 0;
++
++ if (handle) {
++ /* the do_update_inode consumes one bh->b_count */
++ atomic_inc(&iloc->bh->b_count);
++ err = ext3_do_update_inode(handle, inode, iloc);
++ /* ext3_do_update_inode() does journal_dirty_metadata */
++ brelse(iloc->bh);
++ } else {
++ printk(KERN_EMERG __FUNCTION__ ": called with no handle!\n");
++ }
++ return err;
++}
++
++/*
++ * On success, we end up with an outstanding reference count against
++ * iloc->bh. This _must_ be cleaned up later.
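++ *
++ * (Canonical pairing, as used by ext3_mark_inode_dirty() below --
++ * assuming a valid handle:
++ *
++ * struct ext3_iloc iloc;
++ * err = ext3_reserve_inode_write(handle, inode, &iloc);
++ * if (!err)
++ * err = ext3_mark_iloc_dirty(handle, inode, &iloc);
++ *
++ * ext3_mark_iloc_dirty() drops the iloc.bh reference once the inode
++ * has been copied into the buffer.)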
++ */ ++ ++int ++ext3_reserve_inode_write(handle_t *handle, struct inode *inode, ++ struct ext3_iloc *iloc) ++{ ++ int err = 0; ++ if (handle) { ++ err = ext3_get_inode_loc(inode, iloc); ++ if (!err) { ++ BUFFER_TRACE(iloc->bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, iloc->bh); ++ if (err) { ++ brelse(iloc->bh); ++ iloc->bh = NULL; ++ } ++ } ++ } ++ ext3_std_error(inode->i_sb, err); ++ return err; ++} ++ ++/* ++ * akpm: What we do here is to mark the in-core inode as clean ++ * with respect to inode dirtiness (it may still be data-dirty). ++ * This means that the in-core inode may be reaped by prune_icache ++ * without having to perform any I/O. This is a very good thing, ++ * because *any* task may call prune_icache - even ones which ++ * have a transaction open against a different journal. ++ * ++ * Is this cheating? Not really. Sure, we haven't written the ++ * inode out, but prune_icache isn't a user-visible syncing function. ++ * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) ++ * we start and wait on commits. ++ * ++ * Is this efficient/effective? Well, we're being nice to the system ++ * by cleaning up our inodes proactively so they can be reaped ++ * without I/O. But we are potentially leaving up to five seconds' ++ * worth of inodes floating about which prune_icache wants us to ++ * write out. One way to fix that would be to get prune_icache() ++ * to do a write_super() to free up some memory. It has the desired ++ * effect. ++ */ ++int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode) ++{ ++ struct ext3_iloc iloc; ++ int err; ++ ++ err = ext3_reserve_inode_write(handle, inode, &iloc); ++ if (!err) ++ err = ext3_mark_iloc_dirty(handle, inode, &iloc); ++ return err; ++} ++ ++/* ++ * akpm: ext3_dirty_inode() is called from __mark_inode_dirty() ++ * ++ * We're really interested in the case where a file is being extended. ++ * i_size has been changed by generic_commit_write() and we thus need ++ * to include the updated inode in the current transaction. ++ * ++ * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks ++ * are allocated to the file. ++ * ++ * If the inode is marked synchronous, we don't honour that here - doing ++ * so would cause a commit on atime updates, which we don't bother doing. ++ * We handle synchronous inodes at the highest possible level. ++ */ ++void ext3_dirty_inode(struct inode *inode) ++{ ++ handle_t *current_handle = ext3_journal_current_handle(); ++ handle_t *handle; ++ ++ lock_kernel(); ++ handle = ext3_journal_start(inode, 1); ++ if (IS_ERR(handle)) ++ goto out; ++ if (current_handle && ++ current_handle->h_transaction != handle->h_transaction) { ++ /* This task has a transaction open against a different fs */ ++ printk(KERN_EMERG __FUNCTION__": transactions do not match!\n"); ++ } else { ++ jbd_debug(5, "marking dirty. outer handle=%p\n", ++ current_handle); ++ ext3_mark_inode_dirty(handle, inode); ++ } ++ ext3_journal_stop(handle, inode); ++out: ++ unlock_kernel(); ++} ++ ++#ifdef AKPM ++/* ++ * Bind an inode's backing buffer_head into this transaction, to prevent ++ * it from being flushed to disk early. Unlike ++ * ext3_reserve_inode_write, this leaves behind no bh reference and ++ * returns no iloc structure, so the caller needs to repeat the iloc ++ * lookup to mark the inode dirty later. 
++ */ ++static inline int ++ext3_pin_inode(handle_t *handle, struct inode *inode) ++{ ++ struct ext3_iloc iloc; ++ ++ int err = 0; ++ if (handle) { ++ err = ext3_get_inode_loc(inode, &iloc); ++ if (!err) { ++ BUFFER_TRACE(iloc.bh, "get_write_access"); ++ err = journal_get_write_access(handle, iloc.bh); ++ if (!err) ++ err = ext3_journal_dirty_metadata(handle, ++ iloc.bh); ++ brelse(iloc.bh); ++ } ++ } ++ ext3_std_error(inode->i_sb, err); ++ return err; ++} ++#endif ++ ++int ext3_change_inode_journal_flag(struct inode *inode, int val) ++{ ++ journal_t *journal; ++ handle_t *handle; ++ int err; ++ ++ /* ++ * We have to be very careful here: changing a data block's ++ * journaling status dynamically is dangerous. If we write a ++ * data block to the journal, change the status and then delete ++ * that block, we risk forgetting to revoke the old log record ++ * from the journal and so a subsequent replay can corrupt data. ++ * So, first we make sure that the journal is empty and that ++ * nobody is changing anything. ++ */ ++ ++ journal = EXT3_JOURNAL(inode); ++ if (is_journal_aborted(journal) || IS_RDONLY(inode)) ++ return -EROFS; ++ ++ journal_lock_updates(journal); ++ journal_flush(journal); ++ ++ /* ++ * OK, there are no updates running now, and all cached data is ++ * synced to disk. We are now in a completely consistent state ++ * which doesn't have anything in the journal, and we know that ++ * no filesystem updates are running, so it is safe to modify ++ * the inode's in-core data-journaling state flag now. ++ */ ++ ++ if (val) ++ inode->u.ext3_i.i_flags |= EXT3_JOURNAL_DATA_FL; ++ else ++ inode->u.ext3_i.i_flags &= ~EXT3_JOURNAL_DATA_FL; ++ ++ journal_unlock_updates(journal); ++ ++ /* Finally we can mark the inode as dirty. */ ++ ++ handle = ext3_journal_start(inode, 1); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ err = ext3_mark_inode_dirty(handle, inode); ++ handle->h_sync = 1; ++ ext3_journal_stop(handle, inode); ++ ext3_std_error(inode->i_sb, err); ++ ++ return err; ++} ++ ++ ++/* ++ * ext3_aops_journal_start(). ++ * ++ * ++ * ++ * We need to take the inode semaphore *outside* the ++ * journal_start/journal_stop. Otherwise, a different task could do a ++ * wait_for_commit() while holding ->i_sem, which deadlocks. The rule ++ * is: transaction open/closes are considered to be a locking operation ++ * and they nest *inside* ->i_sem. ++ * ---------------------------------------------------------------------------- ++ * Possible problem: ++ * ext3_file_write() ++ * -> generic_file_write() ++ * -> __alloc_pages() ++ * -> page_launder() ++ * -> ext3_writepage() ++ * ++ * And the writepage can be on a different fs while we have a ++ * transaction open against this one! Bad. ++ * ++ * I tried making the task PF_MEMALLOC here, but that simply results in ++ * 0-order allocation failures passed back to generic_file_write(). ++ * Instead, we rely on the reentrancy protection in ext3_writepage(). ++ * ---------------------------------------------------------------------------- ++ * When we do the journal_start() here we don't really need to reserve ++ * any blocks - we won't need any until we hit ext3_prepare_write(), ++ * which does all the needed journal extending. However! 
There is a ++ * problem with quotas: ++ * ++ * Thread 1: ++ * sys_sync ++ * ->sync_dquots ++ * ->commit_dquot ++ * ->lock_dquot ++ * ->write_dquot ++ * ->ext3_file_write ++ * ->journal_start ++ * ->ext3_prepare_write ++ * ->journal_extend ++ * ->journal_start ++ * Thread 2: ++ * ext3_create (for example) ++ * ->ext3_new_inode ++ * ->dquot_initialize ++ * ->lock_dquot ++ * ++ * Deadlock. Thread 1's journal_start blocks because thread 2 has a ++ * transaction open. Thread 2's transaction will never close because ++ * thread 2 is stuck waiting for the dquot lock. ++ * ++ * So. We must ensure that thread 1 *never* needs to extend the journal ++ * for quota writes. We do that by reserving enough journal blocks ++ * here, in ext3_aops_journal_start() to ensure that the forthcoming "see if we ++ * need to extend" test in ext3_prepare_write() succeeds. ++ */ +diff -rup --new-file linux.mcp2/fs/ext3/ioctl.c linux_tmp/fs/ext3/ioctl.c +--- linux.mcp2/fs/ext3/ioctl.c 1969-12-31 16:00:00.000000000 -0800 ++++ linux_tmp/fs/ext3/ioctl.c 2001-11-09 14:25:04.000000000 -0800 +@@ -0,0 +1,170 @@ ++/* ++ * linux/fs/ext3/ioctl.c ++ * ++ * Copyright (C) 1993, 1994, 1995 ++ * Remy Card (card@masi.ibp.fr) ++ * Laboratoire MASI - Institut Blaise Pascal ++ * Universite Pierre et Marie Curie (Paris VI) ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++ ++int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, ++ unsigned long arg) ++{ ++ unsigned int flags; ++ ++ ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg); ++ ++ switch (cmd) { ++ case EXT3_IOC_GETFLAGS: ++ flags = inode->u.ext3_i.i_flags & EXT3_FL_USER_VISIBLE; ++ return put_user(flags, (int *) arg); ++ case EXT3_IOC_SETFLAGS: { ++ handle_t *handle = NULL; ++ int err; ++ struct ext3_iloc iloc; ++ unsigned int oldflags; ++ unsigned int jflag; ++ ++ if (IS_RDONLY(inode)) ++ return -EROFS; ++ ++ if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) ++ return -EPERM; ++ ++ if (get_user(flags, (int *) arg)) ++ return -EFAULT; ++ ++ oldflags = inode->u.ext3_i.i_flags; ++ ++ /* The JOURNAL_DATA flag is modifiable only by root */ ++ jflag = flags & EXT3_JOURNAL_DATA_FL; ++ ++ /* ++ * The IMMUTABLE and APPEND_ONLY flags can only be changed by ++ * the relevant capability. ++ * ++ * This test looks nicer. Thanks to Pauline Middelink ++ */ ++ if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) { ++ if (!capable(CAP_LINUX_IMMUTABLE)) ++ return -EPERM; ++ } ++ ++ /* ++ * The JOURNAL_DATA flag can only be changed by ++ * the relevant capability. 
++ */ ++ if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) { ++ if (!capable(CAP_SYS_RESOURCE)) ++ return -EPERM; ++ } ++ ++ ++ handle = ext3_journal_start(inode, 1); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ if (IS_SYNC(inode)) ++ handle->h_sync = 1; ++ err = ext3_reserve_inode_write(handle, inode, &iloc); ++ if (err) ++ goto flags_err; ++ ++ flags = flags & EXT3_FL_USER_MODIFIABLE; ++ flags |= oldflags & ~EXT3_FL_USER_MODIFIABLE; ++ inode->u.ext3_i.i_flags = flags; ++ ++ if (flags & EXT3_SYNC_FL) ++ inode->i_flags |= S_SYNC; ++ else ++ inode->i_flags &= ~S_SYNC; ++ if (flags & EXT3_APPEND_FL) ++ inode->i_flags |= S_APPEND; ++ else ++ inode->i_flags &= ~S_APPEND; ++ if (flags & EXT3_IMMUTABLE_FL) ++ inode->i_flags |= S_IMMUTABLE; ++ else ++ inode->i_flags &= ~S_IMMUTABLE; ++ if (flags & EXT3_NOATIME_FL) ++ inode->i_flags |= S_NOATIME; ++ else ++ inode->i_flags &= ~S_NOATIME; ++ inode->i_ctime = CURRENT_TIME; ++ ++ err = ext3_mark_iloc_dirty(handle, inode, &iloc); ++flags_err: ++ ext3_journal_stop(handle, inode); ++ if (err) ++ return err; ++ ++ if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) ++ err = ext3_change_inode_journal_flag(inode, jflag); ++ return err; ++ } ++ case EXT3_IOC_GETVERSION: ++ case EXT3_IOC_GETVERSION_OLD: ++ return put_user(inode->i_generation, (int *) arg); ++ case EXT3_IOC_SETVERSION: ++ case EXT3_IOC_SETVERSION_OLD: { ++ handle_t *handle; ++ struct ext3_iloc iloc; ++ __u32 generation; ++ int err; ++ ++ if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) ++ return -EPERM; ++ if (IS_RDONLY(inode)) ++ return -EROFS; ++ if (get_user(generation, (int *) arg)) ++ return -EFAULT; ++ ++ handle = ext3_journal_start(inode, 1); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ err = ext3_reserve_inode_write(handle, inode, &iloc); ++ if (err) ++ return err; ++ ++ inode->i_ctime = CURRENT_TIME; ++ inode->i_generation = generation; ++ ++ err = ext3_mark_iloc_dirty(handle, inode, &iloc); ++ ext3_journal_stop(handle, inode); ++ return err; ++ } ++#ifdef CONFIG_JBD_DEBUG ++ case EXT3_IOC_WAIT_FOR_READONLY: ++ /* ++ * This is racy - by the time we're woken up and running, ++ * the superblock could be released. And the module could ++ * have been unloaded. So sue me. ++ * ++ * Returns 1 if it slept, else zero. ++ */ ++ { ++ struct super_block *sb = inode->i_sb; ++ DECLARE_WAITQUEUE(wait, current); ++ int ret = 0; ++ ++ set_current_state(TASK_INTERRUPTIBLE); ++ add_wait_queue(&sb->u.ext3_sb.ro_wait_queue, &wait); ++ if (timer_pending(&sb->u.ext3_sb.turn_ro_timer)) { ++ schedule(); ++ ret = 1; ++ } ++ remove_wait_queue(&sb->u.ext3_sb.ro_wait_queue, &wait); ++ return ret; ++ } ++#endif ++ default: ++ return -ENOTTY; ++ } ++} +diff -rup --new-file linux.mcp2/fs/ext3/namei.c linux_tmp/fs/ext3/namei.c +--- linux.mcp2/fs/ext3/namei.c 1969-12-31 16:00:00.000000000 -0800 ++++ linux_tmp/fs/ext3/namei.c 2001-11-09 14:25:04.000000000 -0800 +@@ -0,0 +1,1125 @@ ++/* ++ * linux/fs/ext3/namei.c ++ * ++ * Copyright (C) 1992, 1993, 1994, 1995 ++ * Remy Card (card@masi.ibp.fr) ++ * Laboratoire MASI - Institut Blaise Pascal ++ * Universite Pierre et Marie Curie (Paris VI) ++ * ++ * from ++ * ++ * linux/fs/minix/namei.c ++ * ++ * Copyright (C) 1991, 1992 Linus Torvalds ++ * ++ * Big-endian to little-endian byte-swapping/bitmaps by ++ * David S. 
Miller (davem@caip.rutgers.edu), 1995 ++ * Directory entry file type support and forward compatibility hooks ++ * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998 ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++ ++/* ++ * define how far ahead to read directories while searching them. ++ */ ++#define NAMEI_RA_CHUNKS 2 ++#define NAMEI_RA_BLOCKS 4 ++#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) ++#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) ++ ++/* ++ * NOTE! unlike strncmp, ext3_match returns 1 for success, 0 for failure. ++ * ++ * `len <= EXT3_NAME_LEN' is guaranteed by caller. ++ * `de != NULL' is guaranteed by caller. ++ */ ++static inline int ext3_match (int len, const char * const name, ++ struct ext3_dir_entry_2 * de) ++{ ++ if (len != de->name_len) ++ return 0; ++ if (!de->inode) ++ return 0; ++ return !memcmp(name, de->name, len); ++} ++ ++/* ++ * Returns 0 if not found, -1 on failure, and 1 on success ++ */ ++static int inline search_dirblock(struct buffer_head * bh, ++ struct inode *dir, ++ struct dentry *dentry, ++ unsigned long offset, ++ struct ext3_dir_entry_2 ** res_dir) ++{ ++ struct ext3_dir_entry_2 * de; ++ char * dlimit; ++ int de_len; ++ const char *name = dentry->d_name.name; ++ int namelen = dentry->d_name.len; ++ ++ de = (struct ext3_dir_entry_2 *) bh->b_data; ++ dlimit = bh->b_data + dir->i_sb->s_blocksize; ++ while ((char *) de < dlimit) { ++ /* this code is executed quadratically often */ ++ /* do minimal checking `by hand' */ ++ ++ if ((char *) de + namelen <= dlimit && ++ ext3_match (namelen, name, de)) { ++ /* found a match - just to be sure, do a full check */ ++ if (!ext3_check_dir_entry("ext3_find_entry", ++ dir, de, bh, offset)) ++ return -1; ++ *res_dir = de; ++ return 1; ++ } ++ /* prevent looping on a bad block */ ++ de_len = le16_to_cpu(de->rec_len); ++ if (de_len <= 0) ++ return -1; ++ offset += de_len; ++ de = (struct ext3_dir_entry_2 *) ((char *) de + de_len); ++ } ++ return 0; ++} ++ ++/* ++ * ext3_find_entry() ++ * ++ * finds an entry in the specified directory with the wanted name. It ++ * returns the cache buffer in which the entry was found, and the entry ++ * itself (as a parameter - res_dir). It does NOT read the inode of the ++ * entry - you'll have to do that yourself if you want to. ++ * ++ * The returned buffer_head has ->b_count elevated. The caller is expected ++ * to brelse() it when appropriate. ++ */ ++static struct buffer_head * ext3_find_entry (struct dentry *dentry, ++ struct ext3_dir_entry_2 ** res_dir) ++{ ++ struct super_block * sb; ++ struct buffer_head * bh_use[NAMEI_RA_SIZE]; ++ struct buffer_head * bh, *ret = NULL; ++ unsigned long start, block, b; ++ int ra_max = 0; /* Number of bh's in the readahead ++ buffer, bh_use[] */ ++ int ra_ptr = 0; /* Current index into readahead ++ buffer */ ++ int num = 0; ++ int nblocks, i, err; ++ struct inode *dir = dentry->d_parent->d_inode; ++ ++ *res_dir = NULL; ++ sb = dir->i_sb; ++ ++ nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb); ++ start = dir->u.ext3_i.i_dir_start_lookup; ++ if (start >= nblocks) ++ start = 0; ++ block = start; ++restart: ++ do { ++ /* ++ * We deal with the read-ahead logic here. 
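++ *
++ * (Shape of the loop, for illustration, with NAMEI_RA_SIZE == 8
++ * blocks per window:
++ *
++ * if (ra_ptr >= ra_max) window exhausted -- refill with
++ * ext3_getblk() + ll_rw_block()
++ * bh = bh_use[ra_ptr++]; then consume one block per pass
++ * wait_on_buffer(bh); only wait for the block we search next
++ *
++ * so the directory is read in batches while we only ever block on
++ * the buffer we are about to look at.)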
++ */ ++ if (ra_ptr >= ra_max) { ++ /* Refill the readahead buffer */ ++ ra_ptr = 0; ++ b = block; ++ for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) { ++ /* ++ * Terminate if we reach the end of the ++ * directory and must wrap, or if our ++ * search has finished at this block. ++ */ ++ if (b >= nblocks || (num && block == start)) { ++ bh_use[ra_max] = NULL; ++ break; ++ } ++ num++; ++ bh = ext3_getblk(NULL, dir, b++, 0, &err); ++ bh_use[ra_max] = bh; ++ if (bh) ++ ll_rw_block(READ, 1, &bh); ++ } ++ } ++ if ((bh = bh_use[ra_ptr++]) == NULL) ++ goto next; ++ wait_on_buffer(bh); ++ if (!buffer_uptodate(bh)) { ++ /* read error, skip block & hope for the best */ ++ brelse(bh); ++ goto next; ++ } ++ i = search_dirblock(bh, dir, dentry, ++ block << EXT3_BLOCK_SIZE_BITS(sb), res_dir); ++ if (i == 1) { ++ dir->u.ext3_i.i_dir_start_lookup = block; ++ ret = bh; ++ goto cleanup_and_exit; ++ } else { ++ brelse(bh); ++ if (i < 0) ++ goto cleanup_and_exit; ++ } ++ next: ++ if (++block >= nblocks) ++ block = 0; ++ } while (block != start); ++ ++ /* ++ * If the directory has grown while we were searching, then ++ * search the last part of the directory before giving up. ++ */ ++ block = nblocks; ++ nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb); ++ if (block < nblocks) { ++ start = 0; ++ goto restart; ++ } ++ ++cleanup_and_exit: ++ /* Clean up the read-ahead blocks */ ++ for (; ra_ptr < ra_max; ra_ptr++) ++ brelse (bh_use[ra_ptr]); ++ return ret; ++} ++ ++static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry) ++{ ++ struct inode * inode; ++ struct ext3_dir_entry_2 * de; ++ struct buffer_head * bh; ++ ++ if (dentry->d_name.len > EXT3_NAME_LEN) ++ return ERR_PTR(-ENAMETOOLONG); ++ ++ bh = ext3_find_entry(dentry, &de); ++ inode = NULL; ++ if (bh) { ++ unsigned long ino = le32_to_cpu(de->inode); ++ brelse (bh); ++ inode = iget(dir->i_sb, ino); ++ ++ if (!inode) ++ return ERR_PTR(-EACCES); ++ } ++ d_add(dentry, inode); ++ return NULL; ++} ++ ++#define S_SHIFT 12 ++static unsigned char ext3_type_by_mode[S_IFMT >> S_SHIFT] = { ++ [S_IFREG >> S_SHIFT] EXT3_FT_REG_FILE, ++ [S_IFDIR >> S_SHIFT] EXT3_FT_DIR, ++ [S_IFCHR >> S_SHIFT] EXT3_FT_CHRDEV, ++ [S_IFBLK >> S_SHIFT] EXT3_FT_BLKDEV, ++ [S_IFIFO >> S_SHIFT] EXT3_FT_FIFO, ++ [S_IFSOCK >> S_SHIFT] EXT3_FT_SOCK, ++ [S_IFLNK >> S_SHIFT] EXT3_FT_SYMLINK, ++}; ++ ++static inline void ext3_set_de_type(struct super_block *sb, ++ struct ext3_dir_entry_2 *de, ++ umode_t mode) { ++ if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE)) ++ de->file_type = ext3_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; ++} ++ ++/* ++ * ext3_add_entry() ++ * ++ * adds a file entry to the specified directory, using the same ++ * semantics as ext3_find_entry(). It returns NULL if it failed. ++ * ++ * NOTE!! The inode part of 'de' is left at 0 - which means you ++ * may not sleep between calling this and putting something into ++ * the entry, as someone else might have used it while you slept. 
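++ *
++ * (Illustration of the space test used below: a live entry with name
++ * length L needs EXT3_DIR_REC_LEN(L) bytes but owns rec_len bytes;
++ * the new name fits behind it when
++ *
++ * le16_to_cpu(de->rec_len) >=
++ * EXT3_DIR_REC_LEN(de->name_len) + EXT3_DIR_REC_LEN(namelen)
++ *
++ * in which case the entry is split: de keeps EXT3_DIR_REC_LEN(L) and
++ * the rest of its rec_len becomes the new entry's slot.)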
++ */ ++ ++/* ++ * AKPM: the journalling code here looks wrong on the error paths ++ */ ++static int ext3_add_entry (handle_t *handle, struct dentry *dentry, ++ struct inode *inode) ++{ ++ struct inode *dir = dentry->d_parent->d_inode; ++ const char *name = dentry->d_name.name; ++ int namelen = dentry->d_name.len; ++ unsigned long offset; ++ unsigned short rec_len; ++ struct buffer_head * bh; ++ struct ext3_dir_entry_2 * de, * de1; ++ struct super_block * sb; ++ int retval; ++ ++ sb = dir->i_sb; ++ ++ if (!namelen) ++ return -EINVAL; ++ bh = ext3_bread (handle, dir, 0, 0, &retval); ++ if (!bh) ++ return retval; ++ rec_len = EXT3_DIR_REC_LEN(namelen); ++ offset = 0; ++ de = (struct ext3_dir_entry_2 *) bh->b_data; ++ while (1) { ++ if ((char *)de >= sb->s_blocksize + bh->b_data) { ++ brelse (bh); ++ bh = NULL; ++ bh = ext3_bread (handle, dir, ++ offset >> EXT3_BLOCK_SIZE_BITS(sb), 1, &retval); ++ if (!bh) ++ return retval; ++ if (dir->i_size <= offset) { ++ if (dir->i_size == 0) { ++ brelse(bh); ++ return -ENOENT; ++ } ++ ++ ext3_debug ("creating next block\n"); ++ ++ BUFFER_TRACE(bh, "get_write_access"); ++ ext3_journal_get_write_access(handle, bh); ++ de = (struct ext3_dir_entry_2 *) bh->b_data; ++ de->inode = 0; ++ de->rec_len = le16_to_cpu(sb->s_blocksize); ++ dir->u.ext3_i.i_disksize = ++ dir->i_size = offset + sb->s_blocksize; ++ dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; ++ ext3_mark_inode_dirty(handle, dir); ++ } else { ++ ++ ext3_debug ("skipping to next block\n"); ++ ++ de = (struct ext3_dir_entry_2 *) bh->b_data; ++ } ++ } ++ if (!ext3_check_dir_entry ("ext3_add_entry", dir, de, bh, ++ offset)) { ++ brelse (bh); ++ return -ENOENT; ++ } ++ if (ext3_match (namelen, name, de)) { ++ brelse (bh); ++ return -EEXIST; ++ } ++ if ((le32_to_cpu(de->inode) == 0 && ++ le16_to_cpu(de->rec_len) >= rec_len) || ++ (le16_to_cpu(de->rec_len) >= ++ EXT3_DIR_REC_LEN(de->name_len) + rec_len)) { ++ BUFFER_TRACE(bh, "get_write_access"); ++ ext3_journal_get_write_access(handle, bh); ++ /* By now the buffer is marked for journaling */ ++ offset += le16_to_cpu(de->rec_len); ++ if (le32_to_cpu(de->inode)) { ++ de1 = (struct ext3_dir_entry_2 *) ((char *) de + ++ EXT3_DIR_REC_LEN(de->name_len)); ++ de1->rec_len = ++ cpu_to_le16(le16_to_cpu(de->rec_len) - ++ EXT3_DIR_REC_LEN(de->name_len)); ++ de->rec_len = cpu_to_le16( ++ EXT3_DIR_REC_LEN(de->name_len)); ++ de = de1; ++ } ++ de->file_type = EXT3_FT_UNKNOWN; ++ if (inode) { ++ de->inode = cpu_to_le32(inode->i_ino); ++ ext3_set_de_type(dir->i_sb, de, inode->i_mode); ++ } else ++ de->inode = 0; ++ de->name_len = namelen; ++ memcpy (de->name, name, namelen); ++ /* ++ * XXX shouldn't update any times until successful ++ * completion of syscall, but too many callers depend ++ * on this. ++ * ++ * XXX similarly, too many callers depend on ++ * ext3_new_inode() setting the times, but error ++ * recovery deletes the inode, so the worst that can ++ * happen is that the times are slightly out of date ++ * and/or different from the directory change time. 
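++			 *
++			 * Worked example of the entry split above: reusing
++			 * a live entry "a" (name_len 1) that ends a 1KB
++			 * block with rec_len 1012 shrinks it to
++			 * EXT3_DIR_REC_LEN(1) = 12 bytes and hands the new
++			 * name the remaining 1000 bytes at de + 12.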
++ */ ++ dir->i_mtime = dir->i_ctime = CURRENT_TIME; ++ dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; ++ ext3_mark_inode_dirty(handle, dir); ++ dir->i_version = ++event; ++ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); ++ ext3_journal_dirty_metadata(handle, bh); ++ brelse(bh); ++ return 0; ++ } ++ offset += le16_to_cpu(de->rec_len); ++ de = (struct ext3_dir_entry_2 *) ++ ((char *) de + le16_to_cpu(de->rec_len)); ++ } ++ brelse (bh); ++ return -ENOSPC; ++} ++ ++/* ++ * ext3_delete_entry deletes a directory entry by merging it with the ++ * previous entry ++ */ ++static int ext3_delete_entry (handle_t *handle, ++ struct inode * dir, ++ struct ext3_dir_entry_2 * de_del, ++ struct buffer_head * bh) ++{ ++ struct ext3_dir_entry_2 * de, * pde; ++ int i; ++ ++ i = 0; ++ pde = NULL; ++ de = (struct ext3_dir_entry_2 *) bh->b_data; ++ while (i < bh->b_size) { ++ if (!ext3_check_dir_entry("ext3_delete_entry", dir, de, bh, i)) ++ return -EIO; ++ if (de == de_del) { ++ BUFFER_TRACE(bh, "get_write_access"); ++ ext3_journal_get_write_access(handle, bh); ++ if (pde) ++ pde->rec_len = ++ cpu_to_le16(le16_to_cpu(pde->rec_len) + ++ le16_to_cpu(de->rec_len)); ++ else ++ de->inode = 0; ++ dir->i_version = ++event; ++ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); ++ ext3_journal_dirty_metadata(handle, bh); ++ return 0; ++ } ++ i += le16_to_cpu(de->rec_len); ++ pde = de; ++ de = (struct ext3_dir_entry_2 *) ++ ((char *) de + le16_to_cpu(de->rec_len)); ++ } ++ return -ENOENT; ++} ++ ++/* ++ * ext3_mark_inode_dirty is somewhat expensive, so unlike ext2 we ++ * do not perform it in these functions. We perform it at the call site, ++ * if it is needed. ++ */ ++static inline void ext3_inc_count(handle_t *handle, struct inode *inode) ++{ ++ inode->i_nlink++; ++} ++ ++static inline void ext3_dec_count(handle_t *handle, struct inode *inode) ++{ ++ inode->i_nlink--; ++} ++ ++static int ext3_add_nondir(handle_t *handle, ++ struct dentry *dentry, struct inode *inode) ++{ ++ int err = ext3_add_entry(handle, dentry, inode); ++ if (!err) { ++ d_instantiate(dentry, inode); ++ return 0; ++ } ++ ext3_dec_count(handle, inode); ++ iput(inode); ++ return err; ++} ++ ++/* ++ * By the time this is called, we already have created ++ * the directory cache entry for the new file, but it ++ * is so far negative - it has no inode. ++ * ++ * If the create succeeds, we fill in the inode information ++ * with d_instantiate(). 
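++ *
++ * In sketch form, the create-family pattern used below is:
++ *
++ *	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3);
++ *	inode = ext3_new_inode(handle, dir, mode);
++ *	...set i_op/i_fop/a_ops...
++ *	err = ext3_add_nondir(handle, dentry, inode);	/* d_instantiate */
++ *	ext3_journal_stop(handle, dir);
++ *
++ * ext3_add_nondir() drops the new inode again on failure, so callers
++ * only need to stop the transaction.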
++ */ ++static int ext3_create (struct inode * dir, struct dentry * dentry, int mode) ++{ ++ handle_t *handle; ++ struct inode * inode; ++ int err; ++ ++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ if (IS_SYNC(dir)) ++ handle->h_sync = 1; ++ ++ inode = ext3_new_inode (handle, dir, mode); ++ err = PTR_ERR(inode); ++ if (!IS_ERR(inode)) { ++ inode->i_op = &ext3_file_inode_operations; ++ inode->i_fop = &ext3_file_operations; ++ inode->i_mapping->a_ops = &ext3_aops; ++ ext3_mark_inode_dirty(handle, inode); ++ err = ext3_add_nondir(handle, dentry, inode); ++ } ++ ext3_journal_stop(handle, dir); ++ return err; ++} ++ ++static int ext3_mknod (struct inode * dir, struct dentry *dentry, ++ int mode, int rdev) ++{ ++ handle_t *handle; ++ struct inode *inode; ++ int err; ++ ++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ if (IS_SYNC(dir)) ++ handle->h_sync = 1; ++ ++ inode = ext3_new_inode (handle, dir, mode); ++ err = PTR_ERR(inode); ++ if (!IS_ERR(inode)) { ++ init_special_inode(inode, mode, rdev); ++ ext3_mark_inode_dirty(handle, inode); ++ err = ext3_add_nondir(handle, dentry, inode); ++ } ++ ext3_journal_stop(handle, dir); ++ return err; ++} ++ ++static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode) ++{ ++ handle_t *handle; ++ struct inode * inode; ++ struct buffer_head * dir_block; ++ struct ext3_dir_entry_2 * de; ++ int err; ++ ++ if (dir->i_nlink >= EXT3_LINK_MAX) ++ return -EMLINK; ++ ++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ if (IS_SYNC(dir)) ++ handle->h_sync = 1; ++ ++ inode = ext3_new_inode (handle, dir, S_IFDIR); ++ err = PTR_ERR(inode); ++ if (IS_ERR(inode)) ++ goto out_stop; ++ ++ inode->i_op = &ext3_dir_inode_operations; ++ inode->i_fop = &ext3_dir_operations; ++ inode->i_size = inode->u.ext3_i.i_disksize = inode->i_sb->s_blocksize; ++ inode->i_blocks = 0; ++ dir_block = ext3_bread (handle, inode, 0, 1, &err); ++ if (!dir_block) { ++ inode->i_nlink--; /* is this nlink == 0? 
*/ ++ ext3_mark_inode_dirty(handle, inode); ++ iput (inode); ++ goto out_stop; ++ } ++ BUFFER_TRACE(dir_block, "get_write_access"); ++ ext3_journal_get_write_access(handle, dir_block); ++ de = (struct ext3_dir_entry_2 *) dir_block->b_data; ++ de->inode = cpu_to_le32(inode->i_ino); ++ de->name_len = 1; ++ de->rec_len = cpu_to_le16(EXT3_DIR_REC_LEN(de->name_len)); ++ strcpy (de->name, "."); ++ ext3_set_de_type(dir->i_sb, de, S_IFDIR); ++ de = (struct ext3_dir_entry_2 *) ++ ((char *) de + le16_to_cpu(de->rec_len)); ++ de->inode = cpu_to_le32(dir->i_ino); ++ de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize-EXT3_DIR_REC_LEN(1)); ++ de->name_len = 2; ++ strcpy (de->name, ".."); ++ ext3_set_de_type(dir->i_sb, de, S_IFDIR); ++ inode->i_nlink = 2; ++ BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata"); ++ ext3_journal_dirty_metadata(handle, dir_block); ++ brelse (dir_block); ++ inode->i_mode = S_IFDIR | mode; ++ if (dir->i_mode & S_ISGID) ++ inode->i_mode |= S_ISGID; ++ ext3_mark_inode_dirty(handle, inode); ++ err = ext3_add_entry (handle, dentry, inode); ++ if (err) ++ goto out_no_entry; ++ dir->i_nlink++; ++ dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; ++ ext3_mark_inode_dirty(handle, dir); ++ d_instantiate(dentry, inode); ++out_stop: ++ ext3_journal_stop(handle, dir); ++ return err; ++ ++out_no_entry: ++ inode->i_nlink = 0; ++ ext3_mark_inode_dirty(handle, inode); ++ iput (inode); ++ goto out_stop; ++} ++ ++/* ++ * routine to check that the specified directory is empty (for rmdir) ++ */ ++static int empty_dir (struct inode * inode) ++{ ++ unsigned long offset; ++ struct buffer_head * bh; ++ struct ext3_dir_entry_2 * de, * de1; ++ struct super_block * sb; ++ int err; ++ ++ sb = inode->i_sb; ++ if (inode->i_size < EXT3_DIR_REC_LEN(1) + EXT3_DIR_REC_LEN(2) || ++ !(bh = ext3_bread (NULL, inode, 0, 0, &err))) { ++ ext3_warning (inode->i_sb, "empty_dir", ++ "bad directory (dir #%lu) - no data block", ++ inode->i_ino); ++ return 1; ++ } ++ de = (struct ext3_dir_entry_2 *) bh->b_data; ++ de1 = (struct ext3_dir_entry_2 *) ++ ((char *) de + le16_to_cpu(de->rec_len)); ++ if (le32_to_cpu(de->inode) != inode->i_ino || ++ !le32_to_cpu(de1->inode) || ++ strcmp (".", de->name) || ++ strcmp ("..", de1->name)) { ++ ext3_warning (inode->i_sb, "empty_dir", ++ "bad directory (dir #%lu) - no `.' 
or `..'", ++ inode->i_ino); ++ brelse (bh); ++ return 1; ++ } ++ offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len); ++ de = (struct ext3_dir_entry_2 *) ++ ((char *) de1 + le16_to_cpu(de1->rec_len)); ++ while (offset < inode->i_size ) { ++ if (!bh || ++ (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { ++ brelse (bh); ++ bh = ext3_bread (NULL, inode, ++ offset >> EXT3_BLOCK_SIZE_BITS(sb), 0, &err); ++ if (!bh) { ++#if 0 ++ ext3_error (sb, "empty_dir", ++ "directory #%lu contains a hole at offset %lu", ++ inode->i_ino, offset); ++#endif ++ offset += sb->s_blocksize; ++ continue; ++ } ++ de = (struct ext3_dir_entry_2 *) bh->b_data; ++ } ++ if (!ext3_check_dir_entry ("empty_dir", inode, de, bh, ++ offset)) { ++ brelse (bh); ++ return 1; ++ } ++ if (le32_to_cpu(de->inode)) { ++ brelse (bh); ++ return 0; ++ } ++ offset += le16_to_cpu(de->rec_len); ++ de = (struct ext3_dir_entry_2 *) ++ ((char *) de + le16_to_cpu(de->rec_len)); ++ } ++ brelse (bh); ++ return 1; ++} ++ ++/* ext3_orphan_add() links an unlinked or truncated inode into a list of ++ * such inodes, starting at the superblock, in case we crash before the ++ * file is closed/deleted, or in case the inode truncate spans multiple ++ * transactions and the last transaction is not recovered after a crash. ++ * ++ * At filesystem recovery time, we walk this list deleting unlinked ++ * inodes and truncating linked inodes in ext3_orphan_cleanup(). ++ */ ++int ext3_orphan_add(handle_t *handle, struct inode *inode) ++{ ++ struct super_block *sb = inode->i_sb; ++ struct ext3_iloc iloc; ++ int err = 0, rc; ++ ++ lock_super(sb); ++ if (!list_empty(&inode->u.ext3_i.i_orphan)) ++ goto out_unlock; ++ ++ /* Orphan handling is only valid for files with data blocks ++ * being truncated, or files being unlinked. */ ++ ++ /* @@@ FIXME: Observation from aviro: ++ * I think I can trigger J_ASSERT in ext3_orphan_add(). We block ++ * here (on lock_super()), so race with ext3_link() which might bump ++ * ->i_nlink. For, say it, character device. Not a regular file, ++ * not a directory, not a symlink and ->i_nlink > 0. ++ */ ++ J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || ++ S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); ++ ++ BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh); ++ if (err) ++ goto out_unlock; ++ ++ err = ext3_reserve_inode_write(handle, inode, &iloc); ++ if (err) ++ goto out_unlock; ++ ++ /* Insert this inode at the head of the on-disk orphan list... */ ++ NEXT_ORPHAN(inode) = le32_to_cpu(EXT3_SB(sb)->s_es->s_last_orphan); ++ EXT3_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); ++ err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh); ++ rc = ext3_mark_iloc_dirty(handle, inode, &iloc); ++ if (!err) ++ err = rc; ++ ++ /* Only add to the head of the in-memory list if all the ++ * previous operations succeeded. If the orphan_add is going to ++ * fail (possibly taking the journal offline), we can't risk ++ * leaving the inode on the orphan list: stray orphan-list ++ * entries can cause panics at unmount time. ++ * ++ * This is safe: on error we're going to ignore the orphan list ++ * anyway on the next recovery. 
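++	 *
++	 * In sketch form, the on-disk chain built here: s_last_orphan is
++	 * the head and each inode's NEXT_ORPHAN() holds the next inode
++	 * number, so after adding inode 12 to a chain already holding
++	 * 7 and 5:
++	 *
++	 *	es->s_last_orphan = 12;    NEXT_ORPHAN(12) = 7;
++	 *	NEXT_ORPHAN(7)    = 5;     NEXT_ORPHAN(5)  = 0;  /* end */
++	 *
++	 * ext3_orphan_del() below unlinks an inode by pointing its
++	 * predecessor (or s_last_orphan) back at NEXT_ORPHAN(inode).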
*/ ++ if (!err) ++ list_add(&inode->u.ext3_i.i_orphan, &EXT3_SB(sb)->s_orphan); ++ ++ jbd_debug(4, "superblock will point to %ld\n", inode->i_ino); ++ jbd_debug(4, "orphan inode %ld will point to %d\n", ++ inode->i_ino, NEXT_ORPHAN(inode)); ++out_unlock: ++ unlock_super(sb); ++ ext3_std_error(inode->i_sb, err); ++ return err; ++} ++ ++/* ++ * ext3_orphan_del() removes an unlinked or truncated inode from the list ++ * of such inodes stored on disk, because it is finally being cleaned up. ++ */ ++int ext3_orphan_del(handle_t *handle, struct inode *inode) ++{ ++ struct list_head *prev; ++ struct ext3_sb_info *sbi; ++ ino_t ino_next; ++ struct ext3_iloc iloc; ++ int err = 0; ++ ++ lock_super(inode->i_sb); ++ if (list_empty(&inode->u.ext3_i.i_orphan)) { ++ unlock_super(inode->i_sb); ++ return 0; ++ } ++ ++ ino_next = NEXT_ORPHAN(inode); ++ prev = inode->u.ext3_i.i_orphan.prev; ++ sbi = EXT3_SB(inode->i_sb); ++ ++ jbd_debug(4, "remove inode %ld from orphan list\n", inode->i_ino); ++ ++ list_del(&inode->u.ext3_i.i_orphan); ++ INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan); ++ ++ /* If we're on an error path, we may not have a valid ++ * transaction handle with which to update the orphan list on ++ * disk, but we still need to remove the inode from the linked ++ * list in memory. */ ++ if (!handle) ++ goto out; ++ ++ err = ext3_reserve_inode_write(handle, inode, &iloc); ++ if (err) ++ goto out_err; ++ ++ if (prev == &sbi->s_orphan) { ++ jbd_debug(4, "superblock will point to %ld\n", ino_next); ++ BUFFER_TRACE(sbi->s_sbh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, sbi->s_sbh); ++ if (err) ++ goto out_brelse; ++ sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); ++ err = ext3_journal_dirty_metadata(handle, sbi->s_sbh); ++ } else { ++ struct ext3_iloc iloc2; ++ struct inode *i_prev = ++ list_entry(prev, struct inode, u.ext3_i.i_orphan); ++ ++ jbd_debug(4, "orphan inode %ld will point to %ld\n", ++ i_prev->i_ino, ino_next); ++ err = ext3_reserve_inode_write(handle, i_prev, &iloc2); ++ if (err) ++ goto out_brelse; ++ NEXT_ORPHAN(i_prev) = ino_next; ++ err = ext3_mark_iloc_dirty(handle, i_prev, &iloc2); ++ } ++ if (err) ++ goto out_brelse; ++ NEXT_ORPHAN(inode) = 0; ++ err = ext3_mark_iloc_dirty(handle, inode, &iloc); ++ if (err) ++ goto out_brelse; ++ ++out_err: ++ ext3_std_error(inode->i_sb, err); ++out: ++ unlock_super(inode->i_sb); ++ return err; ++ ++out_brelse: ++ brelse(iloc.bh); ++ goto out_err; ++} ++ ++static int ext3_rmdir (struct inode * dir, struct dentry *dentry) ++{ ++ int retval; ++ struct inode * inode; ++ struct buffer_head * bh; ++ struct ext3_dir_entry_2 * de; ++ handle_t *handle; ++ ++ handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ retval = -ENOENT; ++ bh = ext3_find_entry (dentry, &de); ++ if (!bh) ++ goto end_rmdir; ++ ++ if (IS_SYNC(dir)) ++ handle->h_sync = 1; ++ ++ inode = dentry->d_inode; ++ DQUOT_INIT(inode); ++ ++ retval = -EIO; ++ if (le32_to_cpu(de->inode) != inode->i_ino) ++ goto end_rmdir; ++ ++ retval = -ENOTEMPTY; ++ if (!empty_dir (inode)) ++ goto end_rmdir; ++ ++ retval = ext3_delete_entry(handle, dir, de, bh); ++ if (retval) ++ goto end_rmdir; ++ if (inode->i_nlink != 2) ++ ext3_warning (inode->i_sb, "ext3_rmdir", ++ "empty directory has nlink!=2 (%d)", ++ inode->i_nlink); ++ inode->i_version = ++event; ++ inode->i_nlink = 0; ++ /* There's no need to set i_disksize: the fact that i_nlink is ++ * zero will ensure that the right thing happens during any ++ * recovery. 
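++	 *
++	 * (With i_nlink == 0, ext3_orphan_cleanup() will delete the
++	 * orphan outright at recovery time rather than truncate it,
++	 * which is what makes rmdir crash-safe without i_disksize.)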
*/ ++ inode->i_size = 0; ++ ext3_orphan_add(handle, inode); ++ ext3_mark_inode_dirty(handle, inode); ++ dir->i_nlink--; ++ inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; ++ dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; ++ ext3_mark_inode_dirty(handle, dir); ++ ++end_rmdir: ++ ext3_journal_stop(handle, dir); ++ brelse (bh); ++ return retval; ++} ++ ++static int ext3_unlink(struct inode * dir, struct dentry *dentry) ++{ ++ int retval; ++ struct inode * inode; ++ struct buffer_head * bh; ++ struct ext3_dir_entry_2 * de; ++ handle_t *handle; ++ ++ handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ if (IS_SYNC(dir)) ++ handle->h_sync = 1; ++ ++ retval = -ENOENT; ++ bh = ext3_find_entry (dentry, &de); ++ if (!bh) ++ goto end_unlink; ++ ++ inode = dentry->d_inode; ++ DQUOT_INIT(inode); ++ ++ retval = -EIO; ++ if (le32_to_cpu(de->inode) != inode->i_ino) ++ goto end_unlink; ++ ++ if (!inode->i_nlink) { ++ ext3_warning (inode->i_sb, "ext3_unlink", ++ "Deleting nonexistent file (%lu), %d", ++ inode->i_ino, inode->i_nlink); ++ inode->i_nlink = 1; ++ } ++ retval = ext3_delete_entry(handle, dir, de, bh); ++ if (retval) ++ goto end_unlink; ++ dir->i_ctime = dir->i_mtime = CURRENT_TIME; ++ dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; ++ ext3_mark_inode_dirty(handle, dir); ++ inode->i_nlink--; ++ if (!inode->i_nlink) ++ ext3_orphan_add(handle, inode); ++ ext3_mark_inode_dirty(handle, inode); ++ inode->i_ctime = dir->i_ctime; ++ retval = 0; ++ ++end_unlink: ++ ext3_journal_stop(handle, dir); ++ brelse (bh); ++ return retval; ++} ++ ++static int ext3_symlink (struct inode * dir, ++ struct dentry *dentry, const char * symname) ++{ ++ handle_t *handle; ++ struct inode * inode; ++ int l, err; ++ ++ l = strlen(symname)+1; ++ if (l > dir->i_sb->s_blocksize) ++ return -ENAMETOOLONG; ++ ++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 5); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ if (IS_SYNC(dir)) ++ handle->h_sync = 1; ++ ++ inode = ext3_new_inode (handle, dir, S_IFLNK|S_IRWXUGO); ++ err = PTR_ERR(inode); ++ if (IS_ERR(inode)) ++ goto out_stop; ++ ++ if (l > sizeof (inode->u.ext3_i.i_data)) { ++ inode->i_op = &page_symlink_inode_operations; ++ inode->i_mapping->a_ops = &ext3_aops; ++ /* ++ * block_symlink() calls back into ext3_prepare/commit_write. ++ * We have a transaction open. All is sweetness. It also sets ++ * i_size in generic_commit_write(). 
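++		 *
++		 * Note that this branch is only taken when the target no
++		 * longer fits in the inode body: i_data is EXT3_N_BLOCKS
++		 * (15) 32-bit block slots, so targets of up to 60 bytes
++		 * including the trailing NUL become "fast" symlinks stored
++		 * entirely in the inode, with no data block at all.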
++ */ ++ err = block_symlink(inode, symname, l); ++ if (err) ++ goto out_no_entry; ++ } else { ++ inode->i_op = &ext3_fast_symlink_inode_operations; ++ memcpy((char*)&inode->u.ext3_i.i_data,symname,l); ++ inode->i_size = l-1; ++ } ++ inode->u.ext3_i.i_disksize = inode->i_size; ++ ext3_mark_inode_dirty(handle, inode); ++ err = ext3_add_nondir(handle, dentry, inode); ++out_stop: ++ ext3_journal_stop(handle, dir); ++ return err; ++ ++out_no_entry: ++ ext3_dec_count(handle, inode); ++ ext3_mark_inode_dirty(handle, inode); ++ iput (inode); ++ goto out_stop; ++} ++ ++static int ext3_link (struct dentry * old_dentry, ++ struct inode * dir, struct dentry *dentry) ++{ ++ handle_t *handle; ++ struct inode *inode = old_dentry->d_inode; ++ int err; ++ ++ if (S_ISDIR(inode->i_mode)) ++ return -EPERM; ++ ++ if (inode->i_nlink >= EXT3_LINK_MAX) ++ return -EMLINK; ++ ++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ if (IS_SYNC(dir)) ++ handle->h_sync = 1; ++ ++ inode->i_ctime = CURRENT_TIME; ++ ext3_inc_count(handle, inode); ++ atomic_inc(&inode->i_count); ++ ++ ext3_mark_inode_dirty(handle, inode); ++ err = ext3_add_nondir(handle, dentry, inode); ++ ext3_journal_stop(handle, dir); ++ return err; ++} ++ ++#define PARENT_INO(buffer) \ ++ ((struct ext3_dir_entry_2 *) ((char *) buffer + \ ++ le16_to_cpu(((struct ext3_dir_entry_2 *) buffer)->rec_len)))->inode ++ ++/* ++ * Anybody can rename anything with this: the permission checks are left to the ++ * higher-level routines. ++ */ ++static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry, ++ struct inode * new_dir,struct dentry *new_dentry) ++{ ++ handle_t *handle; ++ struct inode * old_inode, * new_inode; ++ struct buffer_head * old_bh, * new_bh, * dir_bh; ++ struct ext3_dir_entry_2 * old_de, * new_de; ++ int retval; ++ ++ old_bh = new_bh = dir_bh = NULL; ++ ++ handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS + 2); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ if (IS_SYNC(old_dir) || IS_SYNC(new_dir)) ++ handle->h_sync = 1; ++ ++ old_bh = ext3_find_entry (old_dentry, &old_de); ++ /* ++ * Check for inode number is _not_ due to possible IO errors. ++ * We might rmdir the source, keep it as pwd of some process ++ * and merrily kill the link to whatever was created under the ++ * same name. 
Goodbye sticky bit ;-< ++ */ ++ old_inode = old_dentry->d_inode; ++ retval = -ENOENT; ++ if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino) ++ goto end_rename; ++ ++ new_inode = new_dentry->d_inode; ++ new_bh = ext3_find_entry (new_dentry, &new_de); ++ if (new_bh) { ++ if (!new_inode) { ++ brelse (new_bh); ++ new_bh = NULL; ++ } else { ++ DQUOT_INIT(new_inode); ++ } ++ } ++ if (S_ISDIR(old_inode->i_mode)) { ++ if (new_inode) { ++ retval = -ENOTEMPTY; ++ if (!empty_dir (new_inode)) ++ goto end_rename; ++ } ++ retval = -EIO; ++ dir_bh = ext3_bread (handle, old_inode, 0, 0, &retval); ++ if (!dir_bh) ++ goto end_rename; ++ if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino) ++ goto end_rename; ++ retval = -EMLINK; ++ if (!new_inode && new_dir!=old_dir && ++ new_dir->i_nlink >= EXT3_LINK_MAX) ++ goto end_rename; ++ } ++ if (!new_bh) { ++ retval = ext3_add_entry (handle, new_dentry, old_inode); ++ if (retval) ++ goto end_rename; ++ } else { ++ BUFFER_TRACE(new_bh, "get write access"); ++ BUFFER_TRACE(new_bh, "get_write_access"); ++ ext3_journal_get_write_access(handle, new_bh); ++ new_de->inode = le32_to_cpu(old_inode->i_ino); ++ if (EXT3_HAS_INCOMPAT_FEATURE(new_dir->i_sb, ++ EXT3_FEATURE_INCOMPAT_FILETYPE)) ++ new_de->file_type = old_de->file_type; ++ new_dir->i_version = ++event; ++ BUFFER_TRACE(new_bh, "call ext3_journal_dirty_metadata"); ++ ext3_journal_dirty_metadata(handle, new_bh); ++ brelse(new_bh); ++ new_bh = NULL; ++ } ++ ++ /* ++ * Like most other Unix systems, set the ctime for inodes on a ++ * rename. ++ */ ++ old_inode->i_ctime = CURRENT_TIME; ++ ext3_mark_inode_dirty(handle, old_inode); ++ ++ /* ++ * ok, that's it ++ */ ++ ext3_delete_entry(handle, old_dir, old_de, old_bh); ++ ++ if (new_inode) { ++ new_inode->i_nlink--; ++ new_inode->i_ctime = CURRENT_TIME; ++ } ++ old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; ++ old_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; ++ if (dir_bh) { ++ BUFFER_TRACE(dir_bh, "get_write_access"); ++ ext3_journal_get_write_access(handle, dir_bh); ++ PARENT_INO(dir_bh->b_data) = le32_to_cpu(new_dir->i_ino); ++ BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata"); ++ ext3_journal_dirty_metadata(handle, dir_bh); ++ old_dir->i_nlink--; ++ if (new_inode) { ++ new_inode->i_nlink--; ++ } else { ++ new_dir->i_nlink++; ++ new_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; ++ ext3_mark_inode_dirty(handle, new_dir); ++ } ++ } ++ ext3_mark_inode_dirty(handle, old_dir); ++ if (new_inode) { ++ ext3_mark_inode_dirty(handle, new_inode); ++ if (!new_inode->i_nlink) ++ ext3_orphan_add(handle, new_inode); ++ } ++ retval = 0; ++ ++end_rename: ++ brelse (dir_bh); ++ brelse (old_bh); ++ brelse (new_bh); ++ ext3_journal_stop(handle, old_dir); ++ return retval; ++} ++ ++/* ++ * directories can handle most operations... 
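++ *
++ * A note on ext3_rename() above: when a directory changes parents,
++ * PARENT_INO() rewrites its ".." entry, the old parent drops the link
++ * that ".." pinned and the new parent gains one - unless the rename
++ * replaced an existing directory, whose own links are released
++ * instead.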
++ */
++struct inode_operations ext3_dir_inode_operations = {
++	create:		ext3_create,		/* BKL held */
++	lookup:		ext3_lookup,		/* BKL held */
++	link:		ext3_link,		/* BKL held */
++	unlink:		ext3_unlink,		/* BKL held */
++	symlink:	ext3_symlink,		/* BKL held */
++	mkdir:		ext3_mkdir,		/* BKL held */
++	rmdir:		ext3_rmdir,		/* BKL held */
++	mknod:		ext3_mknod,		/* BKL held */
++	rename:		ext3_rename,		/* BKL held */
++};
+diff -rup --new-file linux.mcp2/fs/ext3/super.c linux_tmp/fs/ext3/super.c
+--- linux.mcp2/fs/ext3/super.c	1969-12-31 16:00:00.000000000 -0800
++++ linux_tmp/fs/ext3/super.c	2002-02-25 11:38:08.000000000 -0800
+@@ -0,0 +1,1753 @@
++/*
++ * linux/fs/ext3/super.c
++ *
++ * Copyright (C) 1992, 1993, 1994, 1995
++ * Remy Card (card@masi.ibp.fr)
++ * Laboratoire MASI - Institut Blaise Pascal
++ * Universite Pierre et Marie Curie (Paris VI)
++ *
++ * from
++ *
++ * linux/fs/minix/inode.c
++ *
++ * Copyright (C) 1991, 1992  Linus Torvalds
++ *
++ * Big-endian to little-endian byte-swapping/bitmaps by
++ *        David S. Miller (davem@caip.rutgers.edu), 1995
++ */
++
++#include <linux/config.h>
++#include <linux/module.h>
++#include <linux/string.h>
++#include <linux/fs.h>
++#include <linux/time.h>
++#include <linux/jbd.h>
++#include <linux/ext3_fs.h>
++#include <linux/ext3_jbd.h>
++#include <linux/slab.h>
++#include <linux/init.h>
++#include <linux/locks.h>
++#include <linux/blkdev.h>
++#include <linux/smp_lock.h>
++#include <linux/random.h>
++#include <asm/uaccess.h>
++
++#ifdef CONFIG_JBD_DEBUG
++static int ext3_ro_after;	/* Make fs read-only after this many jiffies */
++#endif
++
++static int ext3_load_journal(struct super_block *, struct ext3_super_block *);
++static int ext3_create_journal(struct super_block *, struct ext3_super_block *,
++			       int);
++static void ext3_commit_super (struct super_block * sb,
++			       struct ext3_super_block * es,
++			       int sync);
++static void ext3_mark_recovery_complete(struct super_block * sb,
++					struct ext3_super_block * es);
++static void ext3_clear_journal_err(struct super_block * sb,
++				   struct ext3_super_block * es);
++
++#ifdef CONFIG_JBD_DEBUG
++int journal_no_write[2];
++
++/*
++ * Debug code for turning filesystems "read-only" after a specified
++ * amount of time. This is for crash/recovery testing.
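++ *
++ * This is driven by the debug-only "ro-after=<jiffies>" mount option
++ * parsed further down, e.g. (hypothetical invocation)
++ *
++ *	mount -t ext3 -o ro-after=3000 /dev/hda1 /mnt
++ *
++ * to cut off writes 3000 jiffies after mount.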
++ */
++
++static void make_rdonly(kdev_t dev, int *no_write)
++{
++	if (dev) {
++		printk(KERN_WARNING "Turning device %s read-only\n",
++		       bdevname(dev));
++		*no_write = 0xdead0000 + dev;
++	}
++}
++
++static void turn_fs_readonly(unsigned long arg)
++{
++	struct super_block *sb = (struct super_block *)arg;
++
++	make_rdonly(sb->s_dev, &journal_no_write[0]);
++	make_rdonly(EXT3_SB(sb)->s_journal->j_dev, &journal_no_write[1]);
++	wake_up(&EXT3_SB(sb)->ro_wait_queue);
++}
++
++static void setup_ro_after(struct super_block *sb)
++{
++	struct ext3_sb_info *sbi = EXT3_SB(sb);
++	init_timer(&sbi->turn_ro_timer);
++	if (ext3_ro_after) {
++		printk(KERN_DEBUG "fs will go read-only in %d jiffies\n",
++		       ext3_ro_after);
++		init_waitqueue_head(&sbi->ro_wait_queue);
++		journal_no_write[0] = 0;
++		journal_no_write[1] = 0;
++		sbi->turn_ro_timer.function = turn_fs_readonly;
++		sbi->turn_ro_timer.data = (unsigned long)sb;
++		sbi->turn_ro_timer.expires = jiffies + ext3_ro_after;
++		ext3_ro_after = 0;
++		add_timer(&sbi->turn_ro_timer);
++	}
++}
++
++static void clear_ro_after(struct super_block *sb)
++{
++	del_timer_sync(&EXT3_SB(sb)->turn_ro_timer);
++	journal_no_write[0] = 0;
++	journal_no_write[1] = 0;
++	ext3_ro_after = 0;
++}
++#else
++#define setup_ro_after(sb) do {} while (0)
++#define clear_ro_after(sb) do {} while (0)
++#endif
++
++
++static char error_buf[1024];
++
++/* Determine the appropriate response to ext3_error on a given filesystem */
++
++static int ext3_error_behaviour(struct super_block *sb)
++{
++	/* First check for mount-time options */
++	if (test_opt (sb, ERRORS_PANIC))
++		return EXT3_ERRORS_PANIC;
++	if (test_opt (sb, ERRORS_RO))
++		return EXT3_ERRORS_RO;
++	if (test_opt (sb, ERRORS_CONT))
++		return EXT3_ERRORS_CONTINUE;
++
++	/* If no overrides were specified on the mount, then fall back
++	 * to the default behaviour set in the filesystem's superblock
++	 * on disk. */
++	switch (le16_to_cpu(sb->u.ext3_sb.s_es->s_errors)) {
++	case EXT3_ERRORS_PANIC:
++		return EXT3_ERRORS_PANIC;
++	case EXT3_ERRORS_RO:
++		return EXT3_ERRORS_RO;
++	default:
++		break;
++	}
++	return EXT3_ERRORS_CONTINUE;
++}
++
++/* Deal with the reporting of failure conditions on a filesystem such as
++ * inconsistencies detected or read IO failures.
++ *
++ * On ext2, we can store the error state of the filesystem in the
++ * superblock. That is not possible on ext3, because we may have other
++ * write ordering constraints on the superblock which prevent us from
++ * writing it out straight away; and given that the journal is about to
++ * be aborted, we can't rely on the current, or future, transactions to
++ * write out the superblock safely.
++ *
++ * We'll just use the journal_abort() error code to record an error in
++ * the journal instead. On recovery, the journal will complain about
++ * that error until we've noted it down and cleared it.
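++ *
++ * Note the precedence in ext3_error_behaviour() above: mount options
++ * are consulted before the superblock's s_errors default, so e.g. a
++ * filesystem tuned with "tune2fs -e panic" can still be mounted with
++ * "-o errors=remount-ro" for a gentler failure mode.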
++ */ ++ ++static void ext3_handle_error(struct super_block *sb) ++{ ++ struct ext3_super_block *es = EXT3_SB(sb)->s_es; ++ ++ EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; ++ es->s_state |= cpu_to_le32(EXT3_ERROR_FS); ++ ++ if (sb->s_flags & MS_RDONLY) ++ return; ++ ++ if (ext3_error_behaviour(sb) != EXT3_ERRORS_CONTINUE) { ++ EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT; ++ journal_abort(EXT3_SB(sb)->s_journal, -EIO); ++ } ++ ++ if (ext3_error_behaviour(sb) == EXT3_ERRORS_PANIC) ++ panic ("EXT3-fs (device %s): panic forced after error\n", ++ bdevname(sb->s_dev)); ++ ++ if (ext3_error_behaviour(sb) == EXT3_ERRORS_RO) { ++ printk (KERN_CRIT "Remounting filesystem read-only\n"); ++ sb->s_flags |= MS_RDONLY; ++ } ++ ++ ext3_commit_super(sb, es, 1); ++} ++ ++void ext3_error (struct super_block * sb, const char * function, ++ const char * fmt, ...) ++{ ++ va_list args; ++ ++ va_start (args, fmt); ++ vsprintf (error_buf, fmt, args); ++ va_end (args); ++ ++ printk (KERN_CRIT "EXT3-fs error (device %s): %s: %s\n", ++ bdevname(sb->s_dev), function, error_buf); ++ ++ ext3_handle_error(sb); ++} ++ ++const char *ext3_decode_error(struct super_block * sb, int errno, char nbuf[16]) ++{ ++ char *errstr = NULL; ++ ++ switch (errno) { ++ case -EIO: ++ errstr = "IO failure"; ++ break; ++ case -ENOMEM: ++ errstr = "Out of memory"; ++ break; ++ case -EROFS: ++ if (!sb || EXT3_SB(sb)->s_journal->j_flags & JFS_ABORT) ++ errstr = "Journal has aborted"; ++ else ++ errstr = "Readonly filesystem"; ++ break; ++ default: ++ /* If the caller passed in an extra buffer for unknown ++ * errors, textualise them now. Else we just return ++ * NULL. */ ++ if (nbuf) { ++ /* Check for truncated error codes... */ ++ if (snprintf(nbuf, 16, "error %d", -errno) >= 0) ++ errstr = nbuf; ++ } ++ ++ break; ++ } ++ ++ return errstr; ++} ++ ++/* __ext3_std_error decodes expected errors from journaling functions ++ * automatically and invokes the appropriate error response. */ ++ ++void __ext3_std_error (struct super_block * sb, const char * function, ++ int errno) ++{ ++ char nbuf[16]; ++ const char *errstr = ext3_decode_error(sb, errno, nbuf); ++ ++ printk (KERN_CRIT "EXT3-fs error (device %s) in %s: %s\n", ++ bdevname(sb->s_dev), function, errstr); ++ ++ ext3_handle_error(sb); ++} ++ ++/* ++ * ext3_abort is a much stronger failure handler than ext3_error. The ++ * abort function may be used to deal with unrecoverable failures such ++ * as journal IO errors or ENOMEM at a critical moment in log management. ++ * ++ * We unconditionally force the filesystem into an ABORT|READONLY state, ++ * unless the error response on the fs has been set to panic in which ++ * case we take the easy way out and panic immediately. ++ */ ++ ++void ext3_abort (struct super_block * sb, const char * function, ++ const char * fmt, ...) 
++{ ++ va_list args; ++ ++ printk (KERN_CRIT "ext3_abort called.\n"); ++ ++ va_start (args, fmt); ++ vsprintf (error_buf, fmt, args); ++ va_end (args); ++ ++ if (ext3_error_behaviour(sb) == EXT3_ERRORS_PANIC) ++ panic ("EXT3-fs panic (device %s): %s: %s\n", ++ bdevname(sb->s_dev), function, error_buf); ++ ++ printk (KERN_CRIT "EXT3-fs abort (device %s): %s: %s\n", ++ bdevname(sb->s_dev), function, error_buf); ++ ++ if (sb->s_flags & MS_RDONLY) ++ return; ++ ++ printk (KERN_CRIT "Remounting filesystem read-only\n"); ++ sb->u.ext3_sb.s_mount_state |= EXT3_ERROR_FS; ++ sb->s_flags |= MS_RDONLY; ++ sb->u.ext3_sb.s_mount_opt |= EXT3_MOUNT_ABORT; ++ journal_abort(EXT3_SB(sb)->s_journal, -EIO); ++} ++ ++/* Deal with the reporting of failure conditions while running, such as ++ * inconsistencies in operation or invalid system states. ++ * ++ * Use ext3_error() for cases of invalid filesystem states, as that will ++ * record an error on disk and force a filesystem check on the next boot. ++ */ ++NORET_TYPE void ext3_panic (struct super_block * sb, const char * function, ++ const char * fmt, ...) ++{ ++ va_list args; ++ ++ va_start (args, fmt); ++ vsprintf (error_buf, fmt, args); ++ va_end (args); ++ ++ /* this is to prevent panic from syncing this filesystem */ ++ /* AKPM: is this sufficient? */ ++ sb->s_flags |= MS_RDONLY; ++ panic ("EXT3-fs panic (device %s): %s: %s\n", ++ bdevname(sb->s_dev), function, error_buf); ++} ++ ++void ext3_warning (struct super_block * sb, const char * function, ++ const char * fmt, ...) ++{ ++ va_list args; ++ ++ va_start (args, fmt); ++ vsprintf (error_buf, fmt, args); ++ va_end (args); ++ printk (KERN_WARNING "EXT3-fs warning (device %s): %s: %s\n", ++ bdevname(sb->s_dev), function, error_buf); ++} ++ ++void ext3_update_dynamic_rev(struct super_block *sb) ++{ ++ struct ext3_super_block *es = EXT3_SB(sb)->s_es; ++ ++ if (le32_to_cpu(es->s_rev_level) > EXT3_GOOD_OLD_REV) ++ return; ++ ++ ext3_warning(sb, __FUNCTION__, ++ "updating to rev %d because of new feature flag, " ++ "running e2fsck is recommended", ++ EXT3_DYNAMIC_REV); ++ ++ es->s_first_ino = cpu_to_le32(EXT3_GOOD_OLD_FIRST_INO); ++ es->s_inode_size = cpu_to_le16(EXT3_GOOD_OLD_INODE_SIZE); ++ es->s_rev_level = cpu_to_le32(EXT3_DYNAMIC_REV); ++ /* leave es->s_feature_*compat flags alone */ ++ /* es->s_uuid will be set by e2fsck if empty */ ++ ++ /* ++ * The rest of the superblock fields should be zero, and if not it ++ * means they are likely already in use, so leave them alone. We ++ * can leave it up to e2fsck to clean up any inconsistencies there. 
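++	 *
++	 * In sketch form, the intended caller pattern (as used by
++	 * ext3_setup_super() at mount time) is:
++	 *
++	 *	ext3_update_dynamic_rev(sb);
++	 *	EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
++	 *	ext3_commit_super(sb, es, 1);
++	 *
++	 * i.e. the revision is bumped before the new flag hits the disk.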
++ */
++}
++
++/*
++ * Open the external journal device
++ */
++static struct block_device *ext3_blkdev_get(kdev_t dev)
++{
++	struct block_device *bdev;
++	int err = -ENODEV;
++
++	bdev = bdget(kdev_t_to_nr(dev));
++	if (bdev == NULL)
++		goto fail;
++	err = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_FS);
++	if (err < 0)
++		goto fail;
++	return bdev;
++
++fail:
++	printk(KERN_ERR "EXT3: failed to open journal device %s: %d\n",
++	       bdevname(dev), err);
++	return NULL;
++}
++
++/*
++ * Release the journal device
++ */
++static int ext3_blkdev_put(struct block_device *bdev)
++{
++	return blkdev_put(bdev, BDEV_FS);
++}
++
++static int ext3_blkdev_remove(struct ext3_sb_info *sbi)
++{
++	struct block_device *bdev;
++	int ret = -ENODEV;
++
++	bdev = sbi->journal_bdev;
++	if (bdev) {
++		ret = ext3_blkdev_put(bdev);
++		sbi->journal_bdev = 0;
++	}
++	return ret;
++}
++
++#define orphan_list_entry(l) list_entry((l), struct inode, u.ext3_i.i_orphan)
++
++static void dump_orphan_list(struct super_block *sb, struct ext3_sb_info *sbi)
++{
++	struct list_head *l;
++
++	printk(KERN_ERR "sb orphan head is %d\n",
++	       le32_to_cpu(sbi->s_es->s_last_orphan));
++
++	printk(KERN_ERR "sb_info orphan list:\n");
++	list_for_each(l, &sbi->s_orphan) {
++		struct inode *inode = orphan_list_entry(l);
++		printk(KERN_ERR " "
++		       "inode 0x%04x:%ld at %p: mode %o, nlink %d, next %d\n",
++		       inode->i_dev, inode->i_ino, inode,
++		       inode->i_mode, inode->i_nlink,
++		       le32_to_cpu(NEXT_ORPHAN(inode)));
++	}
++}
++
++void ext3_put_super (struct super_block * sb)
++{
++	struct ext3_sb_info *sbi = EXT3_SB(sb);
++	struct ext3_super_block *es = sbi->s_es;
++	kdev_t j_dev = sbi->s_journal->j_dev;
++	int i;
++
++	journal_destroy(sbi->s_journal);
++	if (!(sb->s_flags & MS_RDONLY)) {
++		EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
++		es->s_state = le16_to_cpu(sbi->s_mount_state);
++		BUFFER_TRACE(sbi->s_sbh, "marking dirty");
++		mark_buffer_dirty(sbi->s_sbh);
++		ext3_commit_super(sb, es, 1);
++	}
++
++	for (i = 0; i < sbi->s_gdb_count; i++)
++		brelse(sbi->s_group_desc[i]);
++	kfree(sbi->s_group_desc);
++	for (i = 0; i < EXT3_MAX_GROUP_LOADED; i++)
++		brelse(sbi->s_inode_bitmap[i]);
++	for (i = 0; i < EXT3_MAX_GROUP_LOADED; i++)
++		brelse(sbi->s_block_bitmap[i]);
++	brelse(sbi->s_sbh);
++
++	/* Debugging code just in case the in-memory inode orphan list
++	 * isn't empty. The on-disk one can be non-empty if we've
++	 * detected an error and taken the fs readonly, but the
++	 * in-memory list had better be clean by this point. */
++	if (!list_empty(&sbi->s_orphan))
++		dump_orphan_list(sb, sbi);
++	J_ASSERT(list_empty(&sbi->s_orphan));
++
++	invalidate_buffers(sb->s_dev);
++	if (j_dev != sb->s_dev) {
++		/*
++		 * Invalidate the journal device's buffers. We don't want them
++		 * floating about in memory - the physical journal device may
++		 * be hotswapped, and it breaks the `ro-after' testing code.
++		 */
++		fsync_no_super(j_dev);
++		invalidate_buffers(j_dev);
++		ext3_blkdev_remove(sbi);
++	}
++	clear_ro_after(sb);
++
++	return;
++}
++
++static struct super_operations ext3_sops = {
++	read_inode:	ext3_read_inode,	/* BKL held */
++	write_inode:	ext3_write_inode,	/* BKL not held. Don't need */
++	dirty_inode:	ext3_dirty_inode,	/* BKL not held. We take it */
++	put_inode:	ext3_put_inode,		/* BKL not held. Don't need */
++	delete_inode:	ext3_delete_inode,	/* BKL not held. We take it */
++	put_super:	ext3_put_super,		/* BKL held */
++	write_super:	ext3_write_super,	/* BKL held */
++	write_super_lockfs: ext3_write_super_lockfs, /* BKL not held.
Take it */ ++ unlockfs: ext3_unlockfs, /* BKL not held. We take it */ ++ statfs: ext3_statfs, /* BKL held */ ++ remount_fs: ext3_remount, /* BKL held */ ++}; ++ ++static int want_value(char *value, char *option) ++{ ++ if (!value || !*value) { ++ printk(KERN_NOTICE "EXT3-fs: the %s option needs an argument\n", ++ option); ++ return -1; ++ } ++ return 0; ++} ++ ++static int want_null_value(char *value, char *option) ++{ ++ if (*value) { ++ printk(KERN_NOTICE "EXT3-fs: Invalid %s argument: %s\n", ++ option, value); ++ return -1; ++ } ++ return 0; ++} ++ ++static int want_numeric(char *value, char *option, unsigned long *number) ++{ ++ if (want_value(value, option)) ++ return -1; ++ *number = simple_strtoul(value, &value, 0); ++ if (want_null_value(value, option)) ++ return -1; ++ return 0; ++} ++ ++/* ++ * This function has been shamelessly adapted from the msdos fs ++ */ ++static int parse_options (char * options, unsigned long * sb_block, ++ struct ext3_sb_info *sbi, ++ unsigned long * inum, ++ int is_remount) ++{ ++ unsigned long *mount_options = &sbi->s_mount_opt; ++ uid_t *resuid = &sbi->s_resuid; ++ gid_t *resgid = &sbi->s_resgid; ++ char * this_char; ++ char * value; ++ ++ if (!options) ++ return 1; ++ for (this_char = strtok (options, ","); ++ this_char != NULL; ++ this_char = strtok (NULL, ",")) { ++ if ((value = strchr (this_char, '=')) != NULL) ++ *value++ = 0; ++ if (!strcmp (this_char, "bsddf")) ++ clear_opt (*mount_options, MINIX_DF); ++ else if (!strcmp (this_char, "nouid32")) { ++ set_opt (*mount_options, NO_UID32); ++ } ++ else if (!strcmp (this_char, "abort")) ++ set_opt (*mount_options, ABORT); ++ else if (!strcmp (this_char, "check")) { ++ if (!value || !*value || !strcmp (value, "none")) ++ clear_opt (*mount_options, CHECK); ++ else ++#ifdef CONFIG_EXT3_CHECK ++ set_opt (*mount_options, CHECK); ++#else ++ printk(KERN_ERR ++ "EXT3 Check option not supported\n"); ++#endif ++ } ++ else if (!strcmp (this_char, "debug")) ++ set_opt (*mount_options, DEBUG); ++ else if (!strcmp (this_char, "errors")) { ++ if (want_value(value, "errors")) ++ return 0; ++ if (!strcmp (value, "continue")) { ++ clear_opt (*mount_options, ERRORS_RO); ++ clear_opt (*mount_options, ERRORS_PANIC); ++ set_opt (*mount_options, ERRORS_CONT); ++ } ++ else if (!strcmp (value, "remount-ro")) { ++ clear_opt (*mount_options, ERRORS_CONT); ++ clear_opt (*mount_options, ERRORS_PANIC); ++ set_opt (*mount_options, ERRORS_RO); ++ } ++ else if (!strcmp (value, "panic")) { ++ clear_opt (*mount_options, ERRORS_CONT); ++ clear_opt (*mount_options, ERRORS_RO); ++ set_opt (*mount_options, ERRORS_PANIC); ++ } ++ else { ++ printk (KERN_ERR ++ "EXT3-fs: Invalid errors option: %s\n", ++ value); ++ return 0; ++ } ++ } ++ else if (!strcmp (this_char, "grpid") || ++ !strcmp (this_char, "bsdgroups")) ++ set_opt (*mount_options, GRPID); ++ else if (!strcmp (this_char, "minixdf")) ++ set_opt (*mount_options, MINIX_DF); ++ else if (!strcmp (this_char, "nocheck")) ++ clear_opt (*mount_options, CHECK); ++ else if (!strcmp (this_char, "nogrpid") || ++ !strcmp (this_char, "sysvgroups")) ++ clear_opt (*mount_options, GRPID); ++ else if (!strcmp (this_char, "resgid")) { ++ unsigned long v; ++ if (want_numeric(value, "resgid", &v)) ++ return 0; ++ *resgid = v; ++ } ++ else if (!strcmp (this_char, "resuid")) { ++ unsigned long v; ++ if (want_numeric(value, "resuid", &v)) ++ return 0; ++ *resuid = v; ++ } ++ else if (!strcmp (this_char, "sb")) { ++ if (want_numeric(value, "sb", sb_block)) ++ return 0; ++ } ++#ifdef CONFIG_JBD_DEBUG ++ else 
if (!strcmp (this_char, "ro-after")) { ++ unsigned long v; ++ if (want_numeric(value, "ro-after", &v)) ++ return 0; ++ ext3_ro_after = v; ++ } ++#endif ++ /* Silently ignore the quota options */ ++ else if (!strcmp (this_char, "grpquota") ++ || !strcmp (this_char, "noquota") ++ || !strcmp (this_char, "quota") ++ || !strcmp (this_char, "usrquota")) ++ /* Don't do anything ;-) */ ; ++ else if (!strcmp (this_char, "journal")) { ++ /* @@@ FIXME */ ++ /* Eventually we will want to be able to create ++ a journal file here. For now, only allow the ++ user to specify an existing inode to be the ++ journal file. */ ++ if (is_remount) { ++ printk(KERN_ERR "EXT3-fs: cannot specify " ++ "journal on remount\n"); ++ return 0; ++ } ++ ++ if (want_value(value, "journal")) ++ return 0; ++ if (!strcmp (value, "update")) ++ set_opt (*mount_options, UPDATE_JOURNAL); ++ else if (want_numeric(value, "journal", inum)) ++ return 0; ++ } ++ else if (!strcmp (this_char, "noload")) ++ set_opt (*mount_options, NOLOAD); ++ else if (!strcmp (this_char, "data")) { ++ int data_opt = 0; ++ ++ if (want_value(value, "data")) ++ return 0; ++ if (!strcmp (value, "journal")) ++ data_opt = EXT3_MOUNT_JOURNAL_DATA; ++ else if (!strcmp (value, "ordered")) ++ data_opt = EXT3_MOUNT_ORDERED_DATA; ++ else if (!strcmp (value, "writeback")) ++ data_opt = EXT3_MOUNT_WRITEBACK_DATA; ++ else { ++ printk (KERN_ERR ++ "EXT3-fs: Invalid data option: %s\n", ++ value); ++ return 0; ++ } ++ if (is_remount) { ++ if ((*mount_options & EXT3_MOUNT_DATA_FLAGS) != ++ data_opt) { ++ printk(KERN_ERR ++ "EXT3-fs: cannot change data " ++ "mode on remount\n"); ++ return 0; ++ } ++ } else { ++ *mount_options &= ~EXT3_MOUNT_DATA_FLAGS; ++ *mount_options |= data_opt; ++ } ++ } else { ++ printk (KERN_ERR ++ "EXT3-fs: Unrecognized mount option %s\n", ++ this_char); ++ return 0; ++ } ++ } ++ return 1; ++} ++ ++static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es, ++ int read_only) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int res = 0; ++ ++ if (le32_to_cpu(es->s_rev_level) > EXT3_MAX_SUPP_REV) { ++ printk (KERN_ERR "EXT3-fs warning: revision level too high, " ++ "forcing read-only mode\n"); ++ res = MS_RDONLY; ++ } ++ if (read_only) ++ return res; ++ if (!(sbi->s_mount_state & EXT3_VALID_FS)) ++ printk (KERN_WARNING "EXT3-fs warning: mounting unchecked fs, " ++ "running e2fsck is recommended\n"); ++ else if ((sbi->s_mount_state & EXT3_ERROR_FS)) ++ printk (KERN_WARNING ++ "EXT3-fs warning: mounting fs with errors, " ++ "running e2fsck is recommended\n"); ++ else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && ++ le16_to_cpu(es->s_mnt_count) >= ++ (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) ++ printk (KERN_WARNING ++ "EXT3-fs warning: maximal mount count reached, " ++ "running e2fsck is recommended\n"); ++ else if (le32_to_cpu(es->s_checkinterval) && ++ (le32_to_cpu(es->s_lastcheck) + ++ le32_to_cpu(es->s_checkinterval) <= CURRENT_TIME)) ++ printk (KERN_WARNING ++ "EXT3-fs warning: checktime reached, " ++ "running e2fsck is recommended\n"); ++#if 0 ++ /* @@@ We _will_ want to clear the valid bit if we find ++ inconsistencies, to force a fsck at reboot. But for ++ a plain journaled filesystem we can keep it set as ++ valid forever! 
:) */ ++ es->s_state = cpu_to_le16(le16_to_cpu(es->s_state) & ~EXT3_VALID_FS); ++#endif ++ if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) ++ es->s_max_mnt_count = ++ (__s16) cpu_to_le16(EXT3_DFL_MAX_MNT_COUNT); ++ es->s_mnt_count=cpu_to_le16(le16_to_cpu(es->s_mnt_count) + 1); ++ es->s_mtime = cpu_to_le32(CURRENT_TIME); ++ ext3_update_dynamic_rev(sb); ++ EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); ++ ext3_commit_super (sb, es, 1); ++ if (test_opt (sb, DEBUG)) ++ printk (KERN_INFO ++ "[EXT3 FS %s, %s, bs=%lu, gc=%lu, " ++ "bpg=%lu, ipg=%lu, mo=%04lx]\n", ++ EXT3FS_VERSION, EXT3FS_DATE, sb->s_blocksize, ++ sbi->s_groups_count, ++ EXT3_BLOCKS_PER_GROUP(sb), ++ EXT3_INODES_PER_GROUP(sb), ++ sbi->s_mount_opt); ++ printk(KERN_INFO "EXT3 FS " EXT3FS_VERSION ", " EXT3FS_DATE " on %s, ", ++ bdevname(sb->s_dev)); ++ if (EXT3_SB(sb)->s_journal->j_inode == NULL) { ++ printk("external journal on %s\n", ++ bdevname(EXT3_SB(sb)->s_journal->j_dev)); ++ } else { ++ printk("internal journal\n"); ++ } ++#ifdef CONFIG_EXT3_CHECK ++ if (test_opt (sb, CHECK)) { ++ ext3_check_blocks_bitmap (sb); ++ ext3_check_inodes_bitmap (sb); ++ } ++#endif ++ setup_ro_after(sb); ++ return res; ++} ++ ++static int ext3_check_descriptors (struct super_block * sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ unsigned long block = le32_to_cpu(sbi->s_es->s_first_data_block); ++ struct ext3_group_desc * gdp = NULL; ++ int desc_block = 0; ++ int i; ++ ++ ext3_debug ("Checking group descriptors"); ++ ++ for (i = 0; i < sbi->s_groups_count; i++) ++ { ++ if ((i % EXT3_DESC_PER_BLOCK(sb)) == 0) ++ gdp = (struct ext3_group_desc *) ++ sbi->s_group_desc[desc_block++]->b_data; ++ if (le32_to_cpu(gdp->bg_block_bitmap) < block || ++ le32_to_cpu(gdp->bg_block_bitmap) >= ++ block + EXT3_BLOCKS_PER_GROUP(sb)) ++ { ++ ext3_error (sb, "ext3_check_descriptors", ++ "Block bitmap for group %d" ++ " not in group (block %lu)!", ++ i, (unsigned long) ++ le32_to_cpu(gdp->bg_block_bitmap)); ++ return 0; ++ } ++ if (le32_to_cpu(gdp->bg_inode_bitmap) < block || ++ le32_to_cpu(gdp->bg_inode_bitmap) >= ++ block + EXT3_BLOCKS_PER_GROUP(sb)) ++ { ++ ext3_error (sb, "ext3_check_descriptors", ++ "Inode bitmap for group %d" ++ " not in group (block %lu)!", ++ i, (unsigned long) ++ le32_to_cpu(gdp->bg_inode_bitmap)); ++ return 0; ++ } ++ if (le32_to_cpu(gdp->bg_inode_table) < block || ++ le32_to_cpu(gdp->bg_inode_table) + sbi->s_itb_per_group >= ++ block + EXT3_BLOCKS_PER_GROUP(sb)) ++ { ++ ext3_error (sb, "ext3_check_descriptors", ++ "Inode table for group %d" ++ " not in group (block %lu)!", ++ i, (unsigned long) ++ le32_to_cpu(gdp->bg_inode_table)); ++ return 0; ++ } ++ block += EXT3_BLOCKS_PER_GROUP(sb); ++ gdp++; ++ } ++ return 1; ++} ++ ++ ++/* ext3_orphan_cleanup() walks a singly-linked list of inodes (starting at ++ * the superblock) which were deleted from all directories, but held open by ++ * a process at the time of a crash. We walk the list and try to delete these ++ * inodes at recovery time (only with a read-write filesystem). ++ * ++ * In order to keep the orphan inode chain consistent during traversal (in ++ * case of crash during recovery), we link each inode into the superblock ++ * orphan list_head and handle it the same way as an inode deletion during ++ * normal operation (which journals the operations for us). ++ * ++ * We only do an iget() and an iput() on each inode, which is very safe if we ++ * accidentally point at an in-use or already deleted inode. 
The worst that ++ * can happen in this case is that we get a "bit already cleared" message from ++ * ext3_free_inode(). The only reason we would point at a wrong inode is if ++ * e2fsck was run on this filesystem, and it must have already done the orphan ++ * inode cleanup for us, so we can safely abort without any further action. ++ */ ++static void ext3_orphan_cleanup (struct super_block * sb, ++ struct ext3_super_block * es) ++{ ++ unsigned int s_flags = sb->s_flags; ++ int nr_orphans = 0, nr_truncates = 0; ++ if (!es->s_last_orphan) { ++ jbd_debug(4, "no orphan inodes to clean up\n"); ++ return; ++ } ++ ++ if (s_flags & MS_RDONLY) { ++ printk(KERN_INFO "EXT3-fs: %s: orphan cleanup on readonly fs\n", ++ bdevname(sb->s_dev)); ++ sb->s_flags &= ~MS_RDONLY; ++ } ++ ++ if (sb->u.ext3_sb.s_mount_state & EXT3_ERROR_FS) { ++ if (es->s_last_orphan) ++ jbd_debug(1, "Errors on filesystem, " ++ "clearing orphan list.\n"); ++ es->s_last_orphan = 0; ++ jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); ++ return; ++ } ++ ++ while (es->s_last_orphan) { ++ struct inode *inode; ++ ++ if (!(inode = ++ ext3_orphan_get(sb, le32_to_cpu(es->s_last_orphan)))) { ++ es->s_last_orphan = 0; ++ break; ++ } ++ ++ list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan); ++ if (inode->i_nlink) { ++ printk(KERN_DEBUG __FUNCTION__ ++ ": truncating inode %ld to %Ld bytes\n", ++ inode->i_ino, inode->i_size); ++ jbd_debug(2, "truncating inode %ld to %Ld bytes\n", ++ inode->i_ino, inode->i_size); ++ ext3_truncate(inode); ++ nr_truncates++; ++ } else { ++ printk(KERN_DEBUG __FUNCTION__ ++ ": deleting unreferenced inode %ld\n", ++ inode->i_ino); ++ jbd_debug(2, "deleting unreferenced inode %ld\n", ++ inode->i_ino); ++ nr_orphans++; ++ } ++ iput(inode); /* The delete magic happens here! */ ++ } ++ ++#define PLURAL(x) (x), ((x)==1) ? "" : "s" ++ ++ if (nr_orphans) ++ printk(KERN_INFO "EXT3-fs: %s: %d orphan inode%s deleted\n", ++ bdevname(sb->s_dev), PLURAL(nr_orphans)); ++ if (nr_truncates) ++ printk(KERN_INFO "EXT3-fs: %s: %d truncate%s cleaned up\n", ++ bdevname(sb->s_dev), PLURAL(nr_truncates)); ++ sb->s_flags = s_flags; /* Restore MS_RDONLY status */ ++} ++ ++#define log2(n) ffz(~(n)) ++ ++/* ++ * Maximal file size. There is a direct, and {,double-,triple-}indirect ++ * block limit, and also a limit of (2^32 - 1) 512-byte sectors in i_blocks. ++ * We need to be 1 filesystem block less than the 2^32 sector limit. ++ */ ++static loff_t ext3_max_size(int bits) ++{ ++ loff_t res = EXT3_NDIR_BLOCKS; ++ res += 1LL << (bits-2); ++ res += 1LL << (2*(bits-2)); ++ res += 1LL << (3*(bits-2)); ++ res <<= bits; ++ if (res > (512LL << 32) - (1 << bits)) ++ res = (512LL << 32) - (1 << bits); ++ return res; ++} ++ ++struct super_block * ext3_read_super (struct super_block * sb, void * data, ++ int silent) ++{ ++ struct buffer_head * bh; ++ struct ext3_super_block *es = 0; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ unsigned long sb_block = 1; ++ unsigned long logic_sb_block = 1; ++ unsigned long offset = 0; ++ unsigned long journal_inum = 0; ++ kdev_t dev = sb->s_dev; ++ int blocksize; ++ int hblock; ++ int db_count; ++ int i; ++ int needs_recovery; ++ ++#ifdef CONFIG_JBD_DEBUG ++ ext3_ro_after = 0; ++#endif ++ /* ++ * See what the current blocksize for the device is, and ++ * use that as the blocksize. Otherwise (or if the blocksize ++ * is smaller than the default) use the default. ++ * This is important for devices that have a hardware ++ * sectorsize that is larger than the default. 
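++	 *
++	 * Worked example for ext3_max_size() above, with bits = 12
++	 * (4KB blocks):
++	 *
++	 *	res = 12 + 2^10 + 2^20 + 2^30 blocks, then <<= 12
++	 *	    = ~4TB, which the 2^32-sector i_blocks clamp cuts to
++	 *	      (512 << 32) - 4096 bytes = 2TB minus one block.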
++ */ ++ blocksize = EXT3_MIN_BLOCK_SIZE; ++ hblock = get_hardsect_size(dev); ++ if (blocksize < hblock) ++ blocksize = hblock; ++ ++ sbi->s_mount_opt = 0; ++ sbi->s_resuid = EXT3_DEF_RESUID; ++ sbi->s_resgid = EXT3_DEF_RESGID; ++ if (!parse_options ((char *) data, &sb_block, sbi, &journal_inum, 0)) { ++ sb->s_dev = 0; ++ goto out_fail; ++ } ++ ++ sb->s_blocksize = blocksize; ++ set_blocksize (dev, blocksize); ++ ++ /* ++ * The ext3 superblock will not be buffer aligned for other than 1kB ++ * block sizes. We need to calculate the offset from buffer start. ++ */ ++ if (blocksize != EXT3_MIN_BLOCK_SIZE) { ++ logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize; ++ offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize; ++ } ++ ++ if (!(bh = sb_bread(sb, logic_sb_block))) { ++ printk (KERN_ERR "EXT3-fs: unable to read superblock\n"); ++ goto out_fail; ++ } ++ /* ++ * Note: s_es must be initialized as soon as possible because ++ * some ext3 macro-instructions depend on its value ++ */ ++ es = (struct ext3_super_block *) (((char *)bh->b_data) + offset); ++ sbi->s_es = es; ++ sb->s_magic = le16_to_cpu(es->s_magic); ++ if (sb->s_magic != EXT3_SUPER_MAGIC) { ++ if (!silent) ++ printk(KERN_ERR ++ "VFS: Can't find ext3 filesystem on dev %s.\n", ++ bdevname(dev)); ++ goto failed_mount; ++ } ++ if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV && ++ (EXT3_HAS_COMPAT_FEATURE(sb, ~0U) || ++ EXT3_HAS_RO_COMPAT_FEATURE(sb, ~0U) || ++ EXT3_HAS_INCOMPAT_FEATURE(sb, ~0U))) ++ printk(KERN_WARNING ++ "EXT3-fs warning: feature flags set on rev 0 fs, " ++ "running e2fsck is recommended\n"); ++ /* ++ * Check feature flags regardless of the revision level, since we ++ * previously didn't change the revision level when setting the flags, ++ * so there is a chance incompat flags are set on a rev 0 filesystem. ++ */ ++ if ((i = EXT3_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP))) { ++ printk(KERN_ERR "EXT3-fs: %s: couldn't mount because of " ++ "unsupported optional features (%x).\n", ++ bdevname(dev), i); ++ goto failed_mount; ++ } ++ if (!(sb->s_flags & MS_RDONLY) && ++ (i = EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP))){ ++ printk(KERN_ERR "EXT3-fs: %s: couldn't mount RDWR because of " ++ "unsupported optional features (%x).\n", ++ bdevname(dev), i); ++ goto failed_mount; ++ } ++ sb->s_blocksize_bits = le32_to_cpu(es->s_log_block_size) + 10; ++ sb->s_blocksize = 1 << sb->s_blocksize_bits; ++ ++ if (sb->s_blocksize < EXT3_MIN_BLOCK_SIZE || ++ sb->s_blocksize > EXT3_MAX_BLOCK_SIZE) { ++ printk(KERN_ERR ++ "EXT3-fs: Unsupported filesystem blocksize %d on %s.\n", ++ blocksize, bdevname(dev)); ++ goto failed_mount; ++ } ++ ++ sb->s_maxbytes = ext3_max_size(sb->s_blocksize_bits); ++ ++ if (sb->s_blocksize != blocksize) { ++ blocksize = sb->s_blocksize; ++ ++ /* ++ * Make sure the blocksize for the filesystem is larger ++ * than the hardware sectorsize for the machine. 
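++		 *
++		 * Worked example: the superblock always lives at byte
++		 * offset 1024, so with sb_block = 1 and a 4KB blocksize
++		 * the recomputation below yields
++		 *
++		 *	logic_sb_block = (1 * 1024) / 4096 = 0
++		 *	offset         = (1 * 1024) % 4096 = 1024
++		 *
++		 * i.e. the superblock is re-read from inside block 0.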
++ */ ++ if (sb->s_blocksize < hblock) { ++ printk(KERN_ERR "EXT3-fs: blocksize %d too small for " ++ "device blocksize %d.\n", blocksize, hblock); ++ goto failed_mount; ++ } ++ ++ brelse (bh); ++ set_blocksize (dev, sb->s_blocksize); ++ logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize; ++ offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize; ++ bh = sb_bread(sb, logic_sb_block); ++ if (!bh) { ++ printk(KERN_ERR ++ "EXT3-fs: Can't read superblock on 2nd try.\n"); ++ return NULL; ++ } ++ es = (struct ext3_super_block *)(((char *)bh->b_data) + offset); ++ sbi->s_es = es; ++ if (es->s_magic != le16_to_cpu(EXT3_SUPER_MAGIC)) { ++ printk (KERN_ERR ++ "EXT3-fs: Magic mismatch, very weird !\n"); ++ goto failed_mount; ++ } ++ } ++ ++ if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV) { ++ sbi->s_inode_size = EXT3_GOOD_OLD_INODE_SIZE; ++ sbi->s_first_ino = EXT3_GOOD_OLD_FIRST_INO; ++ } else { ++ sbi->s_inode_size = le16_to_cpu(es->s_inode_size); ++ sbi->s_first_ino = le32_to_cpu(es->s_first_ino); ++ if (sbi->s_inode_size != EXT3_GOOD_OLD_INODE_SIZE) { ++ printk (KERN_ERR ++ "EXT3-fs: unsupported inode size: %d\n", ++ sbi->s_inode_size); ++ goto failed_mount; ++ } ++ } ++ sbi->s_frag_size = EXT3_MIN_FRAG_SIZE << ++ le32_to_cpu(es->s_log_frag_size); ++ if (blocksize != sbi->s_frag_size) { ++ printk(KERN_ERR ++ "EXT3-fs: fragsize %lu != blocksize %u (unsupported)\n", ++ sbi->s_frag_size, blocksize); ++ goto failed_mount; ++ } ++ sbi->s_frags_per_block = 1; ++ sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); ++ sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group); ++ sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); ++ sbi->s_inodes_per_block = blocksize / EXT3_INODE_SIZE(sb); ++ sbi->s_itb_per_group = sbi->s_inodes_per_group /sbi->s_inodes_per_block; ++ sbi->s_desc_per_block = blocksize / sizeof(struct ext3_group_desc); ++ sbi->s_sbh = bh; ++ if (sbi->s_resuid == EXT3_DEF_RESUID) ++ sbi->s_resuid = le16_to_cpu(es->s_def_resuid); ++ if (sbi->s_resgid == EXT3_DEF_RESGID) ++ sbi->s_resgid = le16_to_cpu(es->s_def_resgid); ++ sbi->s_mount_state = le16_to_cpu(es->s_state); ++ sbi->s_addr_per_block_bits = log2(EXT3_ADDR_PER_BLOCK(sb)); ++ sbi->s_desc_per_block_bits = log2(EXT3_DESC_PER_BLOCK(sb)); ++ ++ if (sbi->s_blocks_per_group > blocksize * 8) { ++ printk (KERN_ERR ++ "EXT3-fs: #blocks per group too big: %lu\n", ++ sbi->s_blocks_per_group); ++ goto failed_mount; ++ } ++ if (sbi->s_frags_per_group > blocksize * 8) { ++ printk (KERN_ERR ++ "EXT3-fs: #fragments per group too big: %lu\n", ++ sbi->s_frags_per_group); ++ goto failed_mount; ++ } ++ if (sbi->s_inodes_per_group > blocksize * 8) { ++ printk (KERN_ERR ++ "EXT3-fs: #inodes per group too big: %lu\n", ++ sbi->s_inodes_per_group); ++ goto failed_mount; ++ } ++ ++ sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) - ++ le32_to_cpu(es->s_first_data_block) + ++ EXT3_BLOCKS_PER_GROUP(sb) - 1) / ++ EXT3_BLOCKS_PER_GROUP(sb); ++ db_count = (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) / ++ EXT3_DESC_PER_BLOCK(sb); ++ sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *), ++ GFP_KERNEL); ++ if (sbi->s_group_desc == NULL) { ++ printk (KERN_ERR "EXT3-fs: not enough memory\n"); ++ goto failed_mount; ++ } ++ for (i = 0; i < db_count; i++) { ++ sbi->s_group_desc[i] = sb_bread(sb, logic_sb_block + i + 1); ++ if (!sbi->s_group_desc[i]) { ++ printk (KERN_ERR "EXT3-fs: " ++ "can't read group descriptor %d\n", i); ++ db_count = i; ++ goto failed_mount2; ++ } ++ } ++ if (!ext3_check_descriptors 
(sb)) { ++ printk (KERN_ERR "EXT3-fs: group descriptors corrupted !\n"); ++ goto failed_mount2; ++ } ++ for (i = 0; i < EXT3_MAX_GROUP_LOADED; i++) { ++ sbi->s_inode_bitmap_number[i] = 0; ++ sbi->s_inode_bitmap[i] = NULL; ++ sbi->s_block_bitmap_number[i] = 0; ++ sbi->s_block_bitmap[i] = NULL; ++ } ++ sbi->s_loaded_inode_bitmaps = 0; ++ sbi->s_loaded_block_bitmaps = 0; ++ sbi->s_gdb_count = db_count; ++ get_random_bytes(&sbi->s_next_generation, sizeof(u32)); ++ /* ++ * set up enough so that it can read an inode ++ */ ++ sb->s_op = &ext3_sops; ++ INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ ++ ++ sb->s_root = 0; ++ ++ needs_recovery = (es->s_last_orphan != 0 || ++ EXT3_HAS_INCOMPAT_FEATURE(sb, ++ EXT3_FEATURE_INCOMPAT_RECOVER)); ++ ++ /* ++ * The first inode we look at is the journal inode. Don't try ++ * root first: it may be modified in the journal! ++ */ ++ if (!test_opt(sb, NOLOAD) && ++ EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) { ++ if (ext3_load_journal(sb, es)) ++ goto failed_mount2; ++ } else if (journal_inum) { ++ if (ext3_create_journal(sb, es, journal_inum)) ++ goto failed_mount2; ++ } else { ++ if (!silent) ++ printk (KERN_ERR ++ "ext3: No journal on filesystem on %s\n", ++ bdevname(dev)); ++ goto failed_mount2; ++ } ++ ++ /* We have now updated the journal if required, so we can ++ * validate the data journaling mode. */ ++ switch (test_opt(sb, DATA_FLAGS)) { ++ case 0: ++ /* No mode set, assume a default based on the journal ++ capabilities: ORDERED_DATA if the journal can ++ cope, else JOURNAL_DATA */ ++ if (journal_check_available_features ++ (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) ++ set_opt(sbi->s_mount_opt, ORDERED_DATA); ++ else ++ set_opt(sbi->s_mount_opt, JOURNAL_DATA); ++ break; ++ ++ case EXT3_MOUNT_ORDERED_DATA: ++ case EXT3_MOUNT_WRITEBACK_DATA: ++ if (!journal_check_available_features ++ (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) { ++ printk(KERN_ERR "EXT3-fs: Journal does not support " ++ "requested data journaling mode\n"); ++ goto failed_mount3; ++ } ++ default: ++ break; ++ } ++ ++ /* ++ * The journal_load will have done any necessary log recovery, ++ * so we can safely mount the rest of the filesystem now. ++ */ ++ ++ sb->s_root = d_alloc_root(iget(sb, EXT3_ROOT_INO)); ++ if (!sb->s_root || !S_ISDIR(sb->s_root->d_inode->i_mode) || ++ !sb->s_root->d_inode->i_blocks || !sb->s_root->d_inode->i_size) { ++ if (sb->s_root) { ++ dput(sb->s_root); ++ sb->s_root = NULL; ++ printk(KERN_ERR ++ "EXT3-fs: corrupt root inode, run e2fsck\n"); ++ } else ++ printk(KERN_ERR "EXT3-fs: get root inode failed\n"); ++ goto failed_mount3; ++ } ++ ++ ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY); ++ /* ++ * akpm: core read_super() calls in here with the superblock locked. ++ * That deadlocks, because orphan cleanup needs to lock the superblock ++ * in numerous places. Here we just pop the lock - it's relatively ++ * harmless, because we are now ready to accept write_super() requests, ++ * and aviro says that's the only reason for hanging onto the ++ * superblock lock. ++ */ ++ EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS; ++ unlock_super(sb); /* akpm: sigh */ ++ ext3_orphan_cleanup(sb, es); ++ lock_super(sb); ++ EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS; ++ if (needs_recovery) ++ printk (KERN_INFO "EXT3-fs: recovery complete.\n"); ++ ext3_mark_recovery_complete(sb, es); ++ printk (KERN_INFO "EXT3-fs: mounted filesystem with %s data mode.\n", ++ test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? 
"journal": ++ test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered": ++ "writeback"); ++ ++ return sb; ++ ++failed_mount3: ++ journal_destroy(sbi->s_journal); ++failed_mount2: ++ for (i = 0; i < db_count; i++) ++ brelse(sbi->s_group_desc[i]); ++ kfree(sbi->s_group_desc); ++failed_mount: ++ ext3_blkdev_remove(sbi); ++ brelse(bh); ++out_fail: ++ return NULL; ++} ++ ++static journal_t *ext3_get_journal(struct super_block *sb, int journal_inum) ++{ ++ struct inode *journal_inode; ++ journal_t *journal; ++ ++ /* First, test for the existence of a valid inode on disk. Bad ++ * things happen if we iget() an unused inode, as the subsequent ++ * iput() will try to delete it. */ ++ ++ journal_inode = iget(sb, journal_inum); ++ if (!journal_inode) { ++ printk(KERN_ERR "EXT3-fs: no journal found.\n"); ++ return NULL; ++ } ++ if (!journal_inode->i_nlink) { ++ make_bad_inode(journal_inode); ++ iput(journal_inode); ++ printk(KERN_ERR "EXT3-fs: journal inode is deleted.\n"); ++ return NULL; ++ } ++ ++ jbd_debug(2, "Journal inode found at %p: %Ld bytes\n", ++ journal_inode, journal_inode->i_size); ++ if (is_bad_inode(journal_inode) || !S_ISREG(journal_inode->i_mode)) { ++ printk(KERN_ERR "EXT3-fs: invalid journal inode.\n"); ++ iput(journal_inode); ++ return NULL; ++ } ++ ++ journal = journal_init_inode(journal_inode); ++ if (!journal) { ++ printk(KERN_ERR "EXT3-fs: Could not load journal inode\n"); ++ iput(journal_inode); ++ } ++ ++ return journal; ++} ++ ++static journal_t *ext3_get_dev_journal(struct super_block *sb, ++ int dev) ++{ ++ struct buffer_head * bh; ++ journal_t *journal; ++ int start; ++ int len; ++ int hblock, blocksize; ++ unsigned long sb_block; ++ unsigned long offset; ++ kdev_t journal_dev = to_kdev_t(dev); ++ struct ext3_super_block * es; ++ struct block_device *bdev; ++ ++ bdev = ext3_blkdev_get(journal_dev); ++ if (bdev == NULL) ++ return NULL; ++ ++ blocksize = sb->s_blocksize; ++ hblock = get_hardsect_size(journal_dev); ++ if (blocksize < hblock) { ++ printk(KERN_ERR ++ "EXT3-fs: blocksize too small for journal device.\n"); ++ goto out_bdev; ++ } ++ ++ sb_block = EXT3_MIN_BLOCK_SIZE / blocksize; ++ offset = EXT3_MIN_BLOCK_SIZE % blocksize; ++ set_blocksize(dev, blocksize); ++ if (!(bh = bread(dev, sb_block, blocksize))) { ++ printk(KERN_ERR "EXT3-fs: couldn't read superblock of " ++ "external journal\n"); ++ goto out_bdev; ++ } ++ ++ es = (struct ext3_super_block *) (((char *)bh->b_data) + offset); ++ if ((le16_to_cpu(es->s_magic) != EXT3_SUPER_MAGIC) || ++ !(le32_to_cpu(es->s_feature_incompat) & ++ EXT3_FEATURE_INCOMPAT_JOURNAL_DEV)) { ++ printk(KERN_ERR "EXT3-fs: external journal has " ++ "bad superblock\n"); ++ brelse(bh); ++ goto out_bdev; ++ } ++ ++ if (memcmp(EXT3_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { ++ printk(KERN_ERR "EXT3-fs: journal UUID does not match\n"); ++ brelse(bh); ++ goto out_bdev; ++ } ++ ++ len = le32_to_cpu(es->s_blocks_count); ++ start = sb_block + 1; ++ brelse(bh); /* we're done with the superblock */ ++ ++ journal = journal_init_dev(journal_dev, sb->s_dev, ++ start, len, blocksize); ++ if (!journal) { ++ printk(KERN_ERR "EXT3-fs: failed to create device journal\n"); ++ goto out_bdev; ++ } ++ ll_rw_block(READ, 1, &journal->j_sb_buffer); ++ wait_on_buffer(journal->j_sb_buffer); ++ if (!buffer_uptodate(journal->j_sb_buffer)) { ++ printk(KERN_ERR "EXT3-fs: I/O error on journal device\n"); ++ goto out_journal; ++ } ++ if (ntohl(journal->j_superblock->s_nr_users) != 1) { ++ printk(KERN_ERR "EXT3-fs: External journal has more than one " ++ 
"user (unsupported) - %d\n", ++ ntohl(journal->j_superblock->s_nr_users)); ++ goto out_journal; ++ } ++ EXT3_SB(sb)->journal_bdev = bdev; ++ return journal; ++out_journal: ++ journal_destroy(journal); ++out_bdev: ++ ext3_blkdev_put(bdev); ++ return NULL; ++} ++ ++static int ext3_load_journal(struct super_block * sb, ++ struct ext3_super_block * es) ++{ ++ journal_t *journal; ++ int journal_inum = le32_to_cpu(es->s_journal_inum); ++ int journal_dev = le32_to_cpu(es->s_journal_dev); ++ int err = 0; ++ int really_read_only; ++ ++ really_read_only = is_read_only(sb->s_dev); ++ ++ /* ++ * Are we loading a blank journal or performing recovery after a ++ * crash? For recovery, we need to check in advance whether we ++ * can get read-write access to the device. ++ */ ++ ++ if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER)) { ++ if (sb->s_flags & MS_RDONLY) { ++ printk(KERN_INFO "EXT3-fs: INFO: recovery " ++ "required on readonly filesystem.\n"); ++ if (really_read_only) { ++ printk(KERN_ERR "EXT3-fs: write access " ++ "unavailable, cannot proceed.\n"); ++ return -EROFS; ++ } ++ printk (KERN_INFO "EXT3-fs: write access will " ++ "be enabled during recovery.\n"); ++ } ++ } ++ ++ if (journal_inum && journal_dev) { ++ printk(KERN_ERR "EXT3-fs: filesystem has both journal " ++ "and inode journals!\n"); ++ return -EINVAL; ++ } ++ ++ if (journal_inum) { ++ if (!(journal = ext3_get_journal(sb, journal_inum))) ++ return -EINVAL; ++ } else { ++ if (!(journal = ext3_get_dev_journal(sb, journal_dev))) ++ return -EINVAL; ++ } ++ ++ ++ if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { ++ err = journal_update_format(journal); ++ if (err) { ++ printk(KERN_ERR "EXT3-fs: error updating journal.\n"); ++ journal_destroy(journal); ++ return err; ++ } ++ } ++ ++ if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER)) ++ err = journal_wipe(journal, !really_read_only); ++ if (!err) ++ err = journal_load(journal); ++ ++ if (err) { ++ printk(KERN_ERR "EXT3-fs: error loading journal.\n"); ++ journal_destroy(journal); ++ return err; ++ } ++ ++ EXT3_SB(sb)->s_journal = journal; ++ ext3_clear_journal_err(sb, es); ++ return 0; ++} ++ ++static int ext3_create_journal(struct super_block * sb, ++ struct ext3_super_block * es, ++ int journal_inum) ++{ ++ journal_t *journal; ++ ++ if (sb->s_flags & MS_RDONLY) { ++ printk(KERN_ERR "EXT3-fs: readonly filesystem when trying to " ++ "create journal.\n"); ++ return -EROFS; ++ } ++ ++ if (!(journal = ext3_get_journal(sb, journal_inum))) ++ return -EINVAL; ++ ++ printk(KERN_INFO "EXT3-fs: creating new journal on inode %d\n", ++ journal_inum); ++ ++ if (journal_create(journal)) { ++ printk(KERN_ERR "EXT3-fs: error creating journal.\n"); ++ journal_destroy(journal); ++ return -EIO; ++ } ++ ++ EXT3_SB(sb)->s_journal = journal; ++ ++ ext3_update_dynamic_rev(sb); ++ EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); ++ EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL); ++ ++ es->s_journal_inum = cpu_to_le32(journal_inum); ++ sb->s_dirt = 1; ++ ++ /* Make sure we flush the recovery flag to disk. 
*/ ++ ext3_commit_super(sb, es, 1); ++ ++ return 0; ++} ++ ++static void ext3_commit_super (struct super_block * sb, ++ struct ext3_super_block * es, ++ int sync) ++{ ++ es->s_wtime = cpu_to_le32(CURRENT_TIME); ++ BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "marking dirty"); ++ mark_buffer_dirty(sb->u.ext3_sb.s_sbh); ++ if (sync) { ++ ll_rw_block(WRITE, 1, &sb->u.ext3_sb.s_sbh); ++ wait_on_buffer(sb->u.ext3_sb.s_sbh); ++ } ++} ++ ++ ++/* ++ * Have we just finished recovery? If so, and if we are mounting (or ++ * remounting) the filesystem readonly, then we will end up with a ++ * consistent fs on disk. Record that fact. ++ */ ++static void ext3_mark_recovery_complete(struct super_block * sb, ++ struct ext3_super_block * es) ++{ ++ journal_flush(EXT3_SB(sb)->s_journal); ++ if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) && ++ sb->s_flags & MS_RDONLY) { ++ EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); ++ sb->s_dirt = 0; ++ ext3_commit_super(sb, es, 1); ++ } ++} ++ ++/* ++ * If we are mounting (or read-write remounting) a filesystem whose journal ++ * has recorded an error from a previous lifetime, move that error to the ++ * main filesystem now. ++ */ ++static void ext3_clear_journal_err(struct super_block * sb, ++ struct ext3_super_block * es) ++{ ++ journal_t *journal; ++ int j_errno; ++ const char *errstr; ++ ++ journal = EXT3_SB(sb)->s_journal; ++ ++ /* ++ * Now check for any error status which may have been recorded in the ++ * journal by a prior ext3_error() or ext3_abort() ++ */ ++ ++ j_errno = journal_errno(journal); ++ if (j_errno) { ++ char nbuf[16]; ++ ++ errstr = ext3_decode_error(sb, j_errno, nbuf); ++ ext3_warning(sb, __FUNCTION__, "Filesystem error recorded " ++ "from previous mount: %s", errstr); ++ ext3_warning(sb, __FUNCTION__, "Marking fs in need of " ++ "filesystem check."); ++ ++ sb->u.ext3_sb.s_mount_state |= EXT3_ERROR_FS; ++ es->s_state |= cpu_to_le16(EXT3_ERROR_FS); ++ ext3_commit_super (sb, es, 1); ++ ++ journal_clear_err(journal); ++ } ++} ++ ++/* ++ * Force the running and committing transactions to commit, ++ * and wait on the commit. ++ */ ++int ext3_force_commit(struct super_block *sb) ++{ ++ journal_t *journal; ++ int ret; ++ ++ if (sb->s_flags & MS_RDONLY) ++ return 0; ++ ++ journal = EXT3_SB(sb)->s_journal; ++ sb->s_dirt = 0; ++ lock_kernel(); /* important: lock down j_running_transaction */ ++ ret = ext3_journal_force_commit(journal); ++ unlock_kernel(); ++ return ret; ++} ++ ++/* ++ * Ext3 always journals updates to the superblock itself, so we don't ++ * have to propagate any other updates to the superblock on disk at this ++ * point. Just start an async writeback to get the buffers on their way ++ * to the disk. ++ * ++ * This implicitly triggers the writebehind on sync(). ++ */ ++ ++static int do_sync_supers = 0; ++MODULE_PARM(do_sync_supers, "i"); ++MODULE_PARM_DESC(do_sync_supers, "Write superblocks synchronously"); ++ ++void ext3_write_super (struct super_block * sb) ++{ ++ tid_t target; ++ ++ if (down_trylock(&sb->s_lock) == 0) ++ BUG(); /* aviro detector */ ++ sb->s_dirt = 0; ++ target = log_start_commit(EXT3_SB(sb)->s_journal, NULL); ++ ++ if (do_sync_supers) { ++ unlock_super(sb); ++ log_wait_commit(EXT3_SB(sb)->s_journal, target); ++ lock_super(sb); ++ } ++} ++ ++/* ++ * LVM calls this function before a (read-only) snapshot is created. This ++ * gives us a chance to flush the journal completely and mark the fs clean. 
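++ * With the journal flushed and the RECOVER flag cleared, the snapshot ++ * captures a fully consistent filesystem that mounts without any log ++ * replay. ext3_unlockfs() below is the other half: it re-sets the flag ++ * and re-enables updates once the snapshot has been taken.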
++ */ ++void ext3_write_super_lockfs(struct super_block *sb) ++{ ++ sb->s_dirt = 0; ++ ++ lock_kernel(); /* 2.4.5 forgot to do this for us */ ++ if (!(sb->s_flags & MS_RDONLY)) { ++ journal_t *journal = EXT3_SB(sb)->s_journal; ++ ++ /* Now we set up the journal barrier. */ ++ journal_lock_updates(journal); ++ journal_flush(journal); ++ ++ /* Journal blocked and flushed, clear needs_recovery flag. */ ++ EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); ++ ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); ++ } ++ unlock_kernel(); ++} ++ ++/* ++ * Called by LVM after the snapshot is done. We need to reset the RECOVER ++ * flag here, even though the filesystem is not technically dirty yet. ++ */ ++void ext3_unlockfs(struct super_block *sb) ++{ ++ if (!(sb->s_flags & MS_RDONLY)) { ++ lock_kernel(); ++ lock_super(sb); ++ /* Reser the needs_recovery flag before the fs is unlocked. */ ++ EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); ++ ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); ++ unlock_super(sb); ++ journal_unlock_updates(EXT3_SB(sb)->s_journal); ++ unlock_kernel(); ++ } ++} ++ ++int ext3_remount (struct super_block * sb, int * flags, char * data) ++{ ++ struct ext3_super_block * es; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ unsigned long tmp; ++ ++ clear_ro_after(sb); ++ ++ /* ++ * Allow the "check" option to be passed as a remount option. ++ */ ++ if (!parse_options(data, &tmp, sbi, &tmp, 1)) ++ return -EINVAL; ++ ++ if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) ++ ext3_abort(sb, __FUNCTION__, "Abort forced by user"); ++ ++ es = sbi->s_es; ++ ++ if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { ++ if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) ++ return -EROFS; ++ ++ if (*flags & MS_RDONLY) { ++ /* ++ * First of all, the unconditional stuff we have to do ++ * to disable replay of the journal when we next remount ++ */ ++ sb->s_flags |= MS_RDONLY; ++ ++ /* ++ * OK, test if we are remounting a valid rw partition ++ * readonly, and if so set the rdonly flag and then ++ * mark the partition as valid again. ++ */ ++ if (!(es->s_state & cpu_to_le16(EXT3_VALID_FS)) && ++ (sbi->s_mount_state & EXT3_VALID_FS)) ++ es->s_state = cpu_to_le16(sbi->s_mount_state); ++ ++ ext3_mark_recovery_complete(sb, es); ++ } else { ++ int ret; ++ if ((ret = EXT3_HAS_RO_COMPAT_FEATURE(sb, ++ ~EXT3_FEATURE_RO_COMPAT_SUPP))) { ++ printk(KERN_WARNING "EXT3-fs: %s: couldn't " ++ "remount RDWR because of unsupported " ++ "optional features (%x).\n", ++ bdevname(sb->s_dev), ret); ++ return -EROFS; ++ } ++ /* ++ * Mounting a RDONLY partition read-write, so reread ++ * and store the current valid flag. (It may have ++ * been changed by e2fsck since we originally mounted ++ * the partition.) ++ */ ++ ext3_clear_journal_err(sb, es); ++ sbi->s_mount_state = le16_to_cpu(es->s_state); ++ if (!ext3_setup_super (sb, es, 0)) ++ sb->s_flags &= ~MS_RDONLY; ++ } ++ } ++ setup_ro_after(sb); ++ return 0; ++} ++ ++int ext3_statfs (struct super_block * sb, struct statfs * buf) ++{ ++ struct ext3_super_block *es = EXT3_SB(sb)->s_es; ++ unsigned long overhead; ++ int i; ++ ++ if (test_opt (sb, MINIX_DF)) ++ overhead = 0; ++ else { ++ /* ++ * Compute the overhead (FS structures) ++ */ ++ ++ /* ++ * All of the blocks before first_data_block are ++ * overhead ++ */ ++ overhead = le32_to_cpu(es->s_first_data_block); ++ ++ /* ++ * Add the overhead attributed to the superblock and ++ * block group descriptors. If the sparse superblocks ++ * feature is turned on, then not all groups have this. 
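++ * ext3_bg_has_super() returns 0 or 1 for a given group and ++ * ext3_bg_num_gdb() returns how many descriptor blocks that group ++ * carries, so the loop below counts only the copies that actually exist.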
++ */ ++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) ++ overhead += ext3_bg_has_super(sb, i) + ++ ext3_bg_num_gdb(sb, i); ++ ++ /* ++ * Every block group has an inode bitmap, a block ++ * bitmap, and an inode table. ++ */ ++ overhead += (EXT3_SB(sb)->s_groups_count * ++ (2 + EXT3_SB(sb)->s_itb_per_group)); ++ } ++ ++ buf->f_type = EXT3_SUPER_MAGIC; ++ buf->f_bsize = sb->s_blocksize; ++ buf->f_blocks = le32_to_cpu(es->s_blocks_count) - overhead; ++ buf->f_bfree = ext3_count_free_blocks (sb); ++ buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count); ++ if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count)) ++ buf->f_bavail = 0; ++ buf->f_files = le32_to_cpu(es->s_inodes_count); ++ buf->f_ffree = ext3_count_free_inodes (sb); ++ buf->f_namelen = EXT3_NAME_LEN; ++ return 0; ++} ++ ++static DECLARE_FSTYPE_DEV(ext3_fs_type, "ext3", ext3_read_super); ++ ++static int __init init_ext3_fs(void) ++{ ++ return register_filesystem(&ext3_fs_type); ++} ++ ++static void __exit exit_ext3_fs(void) ++{ ++ unregister_filesystem(&ext3_fs_type); ++} ++ ++EXPORT_NO_SYMBOLS; ++ ++MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); ++MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions"); ++MODULE_LICENSE("GPL"); ++module_init(init_ext3_fs) ++module_exit(exit_ext3_fs) +diff -rup --new-file linux.mcp2/fs/ext3/symlink.c linux_tmp/fs/ext3/symlink.c +--- linux.mcp2/fs/ext3/symlink.c 1969-12-31 16:00:00.000000000 -0800 ++++ linux_tmp/fs/ext3/symlink.c 2001-11-09 14:25:04.000000000 -0800 +@@ -0,0 +1,39 @@ ++/* ++ * linux/fs/ext3/symlink.c ++ * ++ * Only fast symlinks left here - the rest is done by generic code. AV, 1999 ++ * ++ * Copyright (C) 1992, 1993, 1994, 1995 ++ * Remy Card (card@masi.ibp.fr) ++ * Laboratoire MASI - Institut Blaise Pascal ++ * Universite Pierre et Marie Curie (Paris VI) ++ * ++ * from ++ * ++ * linux/fs/minix/symlink.c ++ * ++ * Copyright (C) 1991, 1992 Linus Torvalds ++ * ++ * ext3 symlink handling code ++ */ ++ ++#include ++#include ++#include ++ ++static int ext3_readlink(struct dentry *dentry, char *buffer, int buflen) ++{ ++ char *s = (char *)dentry->d_inode->u.ext3_i.i_data; ++ return vfs_readlink(dentry, buffer, buflen, s); ++} ++ ++static int ext3_follow_link(struct dentry *dentry, struct nameidata *nd) ++{ ++ char *s = (char *)dentry->d_inode->u.ext3_i.i_data; ++ return vfs_follow_link(nd, s); ++} ++ ++struct inode_operations ext3_fast_symlink_inode_operations = { ++ readlink: ext3_readlink, /* BKL not held. Don't need */ ++ follow_link: ext3_follow_link, /* BKL not held. Don't need */ ++}; diff --git a/lustre/kernel_patches/patches/2.4.19-jbd.patch b/lustre/kernel_patches/patches/2.4.19-jbd.patch new file mode 100644 index 0000000..4f4b38e --- /dev/null +++ b/lustre/kernel_patches/patches/2.4.19-jbd.patch @@ -0,0 +1,6524 @@ +diff -ruP linux.mcp2/fs/jbd/Makefile linuxppc_2.4.19_final/fs/jbd/Makefile +--- linux.mcp2/fs/jbd/Makefile 1969-12-31 16:00:00.000000000 -0800 ++++ linuxppc_2.4.19_final/fs/jbd/Makefile 2004-05-17 13:56:17.000000000 -0700 +@@ -0,0 +1,15 @@ ++# ++# fs/jbd/Makefile ++# ++# Makefile for the linux journaling routines. 
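++# journal.o is the one file here that EXPORT_SYMBOLs the jbd API, ++# hence its listing in export-objs.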
++# ++ ++export-objs := journal.o ++O_TARGET := jbd.o ++ ++obj-y := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o ++ ++obj-m := $(O_TARGET) ++ ++include $(TOPDIR)/Rules.make ++ +diff -ruP linux.mcp2/fs/jbd/checkpoint.c linuxppc_2.4.19_final/fs/jbd/checkpoint.c +--- linux.mcp2/fs/jbd/checkpoint.c 1969-12-31 16:00:00.000000000 -0800 ++++ linuxppc_2.4.19_final/fs/jbd/checkpoint.c 2004-05-17 13:56:17.000000000 -0700 +@@ -0,0 +1,605 @@ ++/* ++ * linux/fs/checkpoint.c ++ * ++ * Written by Stephen C. Tweedie , 1999 ++ * ++ * Copyright 1999 Red Hat Software --- All Rights Reserved ++ * ++ * This file is part of the Linux kernel and is made available under ++ * the terms of the GNU General Public License, version 2, or at your ++ * option, any later version, incorporated herein by reference. ++ * ++ * Checkpoint routines for the generic filesystem journaling code. ++ * Part of the ext2fs journaling system. ++ * ++ * Checkpointing is the process of ensuring that a section of the log is ++ * committed fully to disk, so that that portion of the log can be ++ * reused. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++extern spinlock_t journal_datalist_lock; ++ ++/* ++ * Unlink a buffer from a transaction. ++ * ++ * Called with journal_datalist_lock held. ++ */ ++ ++static inline void __buffer_unlink(struct journal_head *jh) ++{ ++ transaction_t *transaction; ++ ++ transaction = jh->b_cp_transaction; ++ jh->b_cp_transaction = NULL; ++ ++ jh->b_cpnext->b_cpprev = jh->b_cpprev; ++ jh->b_cpprev->b_cpnext = jh->b_cpnext; ++ if (transaction->t_checkpoint_list == jh) ++ transaction->t_checkpoint_list = jh->b_cpnext; ++ if (transaction->t_checkpoint_list == jh) ++ transaction->t_checkpoint_list = NULL; ++} ++ ++/* ++ * Try to release a checkpointed buffer from its transaction. ++ * Returns 1 if we released it. ++ * Requires journal_datalist_lock ++ */ ++static int __try_to_free_cp_buf(struct journal_head *jh) ++{ ++ int ret = 0; ++ struct buffer_head *bh = jh2bh(jh); ++ ++ if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) { ++ JBUFFER_TRACE(jh, "remove from checkpoint list"); ++ __journal_remove_checkpoint(jh); ++ __journal_remove_journal_head(bh); ++ BUFFER_TRACE(bh, "release"); ++ /* BUF_LOCKED -> BUF_CLEAN (fwiw) */ ++ refile_buffer(bh); ++ __brelse(bh); ++ ret = 1; ++ } ++ return ret; ++} ++ ++/* ++ * log_wait_for_space: wait until there is space in the journal. ++ * ++ * Called with the journal already locked, but it will be unlocked if we have ++ * to wait for a checkpoint to free up some space in the log. ++ */ ++ ++void log_wait_for_space(journal_t *journal, int nblocks) ++{ ++ while (log_space_left(journal) < nblocks) { ++ if (journal->j_flags & JFS_ABORT) ++ return; ++ unlock_journal(journal); ++ down(&journal->j_checkpoint_sem); ++ lock_journal(journal); ++ ++ /* Test again, another process may have checkpointed ++ * while we were waiting for the checkpoint lock */ ++ if (log_space_left(journal) < nblocks) { ++ log_do_checkpoint(journal, nblocks); ++ } ++ up(&journal->j_checkpoint_sem); ++ } ++} ++ ++/* ++ * Clean up a transaction's checkpoint list. ++ * ++ * We wait for any pending IO to complete and make sure any clean ++ * buffers are removed from the transaction. ++ * ++ * Return 1 if we performed any actions which might have destroyed the ++ * checkpoint. (journal_remove_checkpoint() deletes the transaction when ++ * the last checkpoint buffer is cleansed) ++ * ++ * Called with the journal locked. 
++ * Called with journal_datalist_lock held. ++ */ ++static int __cleanup_transaction(journal_t *journal, transaction_t *transaction) ++{ ++ struct journal_head *jh, *next_jh, *last_jh; ++ struct buffer_head *bh; ++ int ret = 0; ++ ++ assert_spin_locked(&journal_datalist_lock); ++ jh = transaction->t_checkpoint_list; ++ if (!jh) ++ return 0; ++ ++ last_jh = jh->b_cpprev; ++ next_jh = jh; ++ do { ++ jh = next_jh; ++ bh = jh2bh(jh); ++ if (buffer_locked(bh)) { ++ atomic_inc(&bh->b_count); ++ spin_unlock(&journal_datalist_lock); ++ unlock_journal(journal); ++ wait_on_buffer(bh); ++ /* the journal_head may have gone by now */ ++ BUFFER_TRACE(bh, "brelse"); ++ __brelse(bh); ++ goto out_return_1; ++ } ++ ++ if (jh->b_transaction != NULL) { ++ transaction_t *transaction = jh->b_transaction; ++ tid_t tid = transaction->t_tid; ++ ++ spin_unlock(&journal_datalist_lock); ++ log_start_commit(journal, transaction); ++ unlock_journal(journal); ++ log_wait_commit(journal, tid); ++ goto out_return_1; ++ } ++ ++ /* ++ * We used to test for (jh->b_list != BUF_CLEAN) here. ++ * But unmap_underlying_metadata() can place buffer onto ++ * BUF_CLEAN. Since refile_buffer() no longer takes buffers ++ * off checkpoint lists, we cope with it here ++ */ ++ /* ++ * AKPM: I think the buffer_jdirty test is redundant - it ++ * shouldn't have NULL b_transaction? ++ */ ++ next_jh = jh->b_cpnext; ++ if (!buffer_dirty(bh) && !buffer_jdirty(bh)) { ++ BUFFER_TRACE(bh, "remove from checkpoint"); ++ __journal_remove_checkpoint(jh); ++ __journal_remove_journal_head(bh); ++ refile_buffer(bh); ++ __brelse(bh); ++ ret = 1; ++ } ++ ++ jh = next_jh; ++ } while (jh != last_jh); ++ ++ return ret; ++out_return_1: ++ lock_journal(journal); ++ spin_lock(&journal_datalist_lock); ++ return 1; ++} ++ ++#define NR_BATCH 64 ++ ++static void __flush_batch(struct buffer_head **bhs, int *batch_count) ++{ ++ int i; ++ ++ spin_unlock(&journal_datalist_lock); ++ ll_rw_block(WRITE, *batch_count, bhs); ++ run_task_queue(&tq_disk); ++ spin_lock(&journal_datalist_lock); ++ for (i = 0; i < *batch_count; i++) { ++ struct buffer_head *bh = bhs[i]; ++ clear_bit(BH_JWrite, &bh->b_state); ++ BUFFER_TRACE(bh, "brelse"); ++ __brelse(bh); ++ } ++ *batch_count = 0; ++} ++ ++/* ++ * Try to flush one buffer from the checkpoint list to disk. ++ * ++ * Return 1 if something happened which requires us to abort the current ++ * scan of the checkpoint list. ++ * ++ * Called with journal_datalist_lock held. ++ */ ++static int __flush_buffer(journal_t *journal, struct journal_head *jh, ++ struct buffer_head **bhs, int *batch_count, ++ int *drop_count) ++{ ++ struct buffer_head *bh = jh2bh(jh); ++ int ret = 0; ++ ++ if (buffer_dirty(bh) && !buffer_locked(bh) && jh->b_jlist == BJ_None) { ++ J_ASSERT_JH(jh, jh->b_transaction == NULL); ++ ++ /* ++ * Important: we are about to write the buffer, and ++ * possibly block, while still holding the journal lock. ++ * We cannot afford to let the transaction logic start ++ * messing around with this buffer before we write it to ++ * disk, as that would break recoverability. ++ */ ++ BUFFER_TRACE(bh, "queue"); ++ atomic_inc(&bh->b_count); ++ J_ASSERT_BH(bh, !test_bit(BH_JWrite, &bh->b_state)); ++ set_bit(BH_JWrite, &bh->b_state); ++ bhs[*batch_count] = bh; ++ (*batch_count)++; ++ if (*batch_count == NR_BATCH) { ++ __flush_batch(bhs, batch_count); ++ ret = 1; ++ } ++ } else { ++ int last_buffer = 0; ++ if (jh->b_cpnext == jh) { ++ /* We may be about to drop the transaction. Tell the ++ * caller that the lists have changed. 
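++ * (jh->b_cpnext == jh means this is the only buffer left on the ++ * transaction's checkpoint list, so freeing it may drop the whole ++ * transaction.)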
++ */ ++ last_buffer = 1; ++ } ++ if (__try_to_free_cp_buf(jh)) { ++ (*drop_count)++; ++ ret = last_buffer; ++ } ++ } ++ return ret; ++} ++ ++ ++/* ++ * Perform an actual checkpoint. We don't write out only enough to ++ * satisfy the current blocked requests: rather we submit a reasonably ++ * sized chunk of the outstanding data to disk at once for ++ * efficiency. log_wait_for_space() will retry if we didn't free enough. ++ * ++ * However, we _do_ take into account the amount requested so that once ++ * the IO has been queued, we can return as soon as enough of it has ++ * completed to disk. ++ * ++ * The journal should be locked before calling this function. ++ */ ++ ++/* @@@ `nblocks' is unused. Should it be used? */ ++int log_do_checkpoint (journal_t *journal, int nblocks) ++{ ++ transaction_t *transaction, *last_transaction, *next_transaction; ++ int result; ++ int target; ++ int batch_count = 0; ++ struct buffer_head *bhs[NR_BATCH]; ++ ++ jbd_debug(1, "Start checkpoint\n"); ++ ++ /* ++ * First thing: if there are any transactions in the log which ++ * don't need checkpointing, just eliminate them from the ++ * journal straight away. ++ */ ++ result = cleanup_journal_tail(journal); ++ jbd_debug(1, "cleanup_journal_tail returned %d\n", result); ++ if (result <= 0) ++ return result; ++ ++ /* ++ * OK, we need to start writing disk blocks. Try to free up a ++ * quarter of the log in a single checkpoint if we can. ++ */ ++ /* ++ * AKPM: check this code. I had a feeling a while back that it ++ * degenerates into a busy loop at unmount time. ++ */ ++ target = (journal->j_last - journal->j_first) / 4; ++ ++ spin_lock(&journal_datalist_lock); ++repeat: ++ transaction = journal->j_checkpoint_transactions; ++ if (transaction == NULL) ++ goto done; ++ last_transaction = transaction->t_cpprev; ++ next_transaction = transaction; ++ ++ do { ++ struct journal_head *jh, *last_jh, *next_jh; ++ int drop_count = 0; ++ int cleanup_ret, retry = 0; ++ ++ transaction = next_transaction; ++ next_transaction = transaction->t_cpnext; ++ jh = transaction->t_checkpoint_list; ++ last_jh = jh->b_cpprev; ++ next_jh = jh; ++ do { ++ jh = next_jh; ++ next_jh = jh->b_cpnext; ++ retry = __flush_buffer(journal, jh, bhs, &batch_count, ++ &drop_count); ++ } while (jh != last_jh && !retry); ++ if (batch_count) { ++ __flush_batch(bhs, &batch_count); ++ goto repeat; ++ } ++ if (retry) ++ goto repeat; ++ /* ++ * We have walked the whole transaction list without ++ * finding anything to write to disk. We had better be ++ * able to make some progress or we are in trouble. ++ */ ++ cleanup_ret = __cleanup_transaction(journal, transaction); ++ J_ASSERT(drop_count != 0 || cleanup_ret != 0); ++ goto repeat; /* __cleanup may have dropped lock */ ++ } while (transaction != last_transaction); ++ ++done: ++ spin_unlock(&journal_datalist_lock); ++ result = cleanup_journal_tail(journal); ++ if (result < 0) ++ return result; ++ ++ return 0; ++} ++ ++/* ++ * Check the list of checkpoint transactions for the journal to see if ++ * we have already got rid of any since the last update of the log tail ++ * in the journal superblock. If so, we can instantly roll the ++ * superblock forward to remove those transactions from the log. ++ * ++ * Return <0 on error, 0 on success, 1 if there was nothing to clean up. ++ * ++ * Called with the journal lock held. ++ * ++ * This is the only part of the journaling code which really needs to be ++ * aware of transaction aborts. 
Checkpointing involves writing to the ++ * main filesystem area rather than to the journal, so it can proceed ++ * even in abort state, but we must not update the journal superblock if ++ * we have an abort error outstanding. ++ */ ++ ++int cleanup_journal_tail(journal_t *journal) ++{ ++ transaction_t * transaction; ++ tid_t first_tid; ++ unsigned long blocknr, freed; ++ ++ /* OK, work out the oldest transaction remaining in the log, and ++ * the log block it starts at. ++ * ++ * If the log is now empty, we need to work out which is the ++ * next transaction ID we will write, and where it will ++ * start. */ ++ ++ /* j_checkpoint_transactions needs locking */ ++ spin_lock(&journal_datalist_lock); ++ transaction = journal->j_checkpoint_transactions; ++ if (transaction) { ++ first_tid = transaction->t_tid; ++ blocknr = transaction->t_log_start; ++ } else if ((transaction = journal->j_committing_transaction) != NULL) { ++ first_tid = transaction->t_tid; ++ blocknr = transaction->t_log_start; ++ } else if ((transaction = journal->j_running_transaction) != NULL) { ++ first_tid = transaction->t_tid; ++ blocknr = journal->j_head; ++ } else { ++ first_tid = journal->j_transaction_sequence; ++ blocknr = journal->j_head; ++ } ++ spin_unlock(&journal_datalist_lock); ++ J_ASSERT (blocknr != 0); ++ ++ /* If the oldest pinned transaction is at the tail of the log ++ already then there's not much we can do right now. */ ++ if (journal->j_tail_sequence == first_tid) ++ return 1; ++ ++ /* OK, update the superblock to recover the freed space. ++ * Physical blocks come first: have we wrapped beyond the end of ++ * the log? */ ++ freed = blocknr - journal->j_tail; ++ if (blocknr < journal->j_tail) ++ freed = freed + journal->j_last - journal->j_first; ++ ++ jbd_debug(1, ++ "Cleaning journal tail from %d to %d (offset %lu), " ++ "freeing %lu\n", ++ journal->j_tail_sequence, first_tid, blocknr, freed); ++ ++ journal->j_free += freed; ++ journal->j_tail_sequence = first_tid; ++ journal->j_tail = blocknr; ++ if (!(journal->j_flags & JFS_ABORT)) ++ journal_update_superblock(journal, 1); ++ return 0; ++} ++ ++ ++/* Checkpoint list management */ ++ ++/* ++ * journal_clean_checkpoint_list ++ * ++ * Find all the written-back checkpoint buffers in the journal and release them. ++ * ++ * Called with the journal locked. ++ * Called with journal_datalist_lock held. ++ * Returns number of bufers reaped (for debug) ++ */ ++ ++int __journal_clean_checkpoint_list(journal_t *journal) ++{ ++ transaction_t *transaction, *last_transaction, *next_transaction; ++ int ret = 0; ++ ++ transaction = journal->j_checkpoint_transactions; ++ if (transaction == 0) ++ goto out; ++ ++ last_transaction = transaction->t_cpprev; ++ next_transaction = transaction; ++ do { ++ struct journal_head *jh; ++ ++ transaction = next_transaction; ++ next_transaction = transaction->t_cpnext; ++ jh = transaction->t_checkpoint_list; ++ if (jh) { ++ struct journal_head *last_jh = jh->b_cpprev; ++ struct journal_head *next_jh = jh; ++ do { ++ jh = next_jh; ++ next_jh = jh->b_cpnext; ++ ret += __try_to_free_cp_buf(jh); ++ } while (jh != last_jh); ++ } ++ } while (transaction != last_transaction); ++out: ++ return ret; ++} ++ ++/* ++ * journal_remove_checkpoint: called after a buffer has been committed ++ * to disk (either by being write-back flushed to disk, or being ++ * committed to the log). 
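++ * Removing the last buffer on a transaction's checkpoint list lets the ++ * whole transaction be dropped from the log (unless that transaction is ++ * still committing); see __journal_drop_transaction() below.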
++ * ++ * We cannot safely clean a transaction out of the log until all of the ++ * buffer updates committed in that transaction have safely been stored ++ * elsewhere on disk. To achieve this, all of the buffers in a ++ * transaction need to be maintained on the transaction's checkpoint ++ * list until they have been rewritten, at which point this function is ++ * called to remove the buffer from the existing transaction's ++ * checkpoint list. ++ * ++ * This function is called with the journal locked. ++ * This function is called with journal_datalist_lock held. ++ */ ++ ++void __journal_remove_checkpoint(struct journal_head *jh) ++{ ++ transaction_t *transaction; ++ journal_t *journal; ++ ++ JBUFFER_TRACE(jh, "entry"); ++ ++ if ((transaction = jh->b_cp_transaction) == NULL) { ++ JBUFFER_TRACE(jh, "not on transaction"); ++ goto out; ++ } ++ ++ journal = transaction->t_journal; ++ ++ __buffer_unlink(jh); ++ ++ if (transaction->t_checkpoint_list != NULL) ++ goto out; ++ JBUFFER_TRACE(jh, "transaction has no more buffers"); ++ ++ /* There is one special case to worry about: if we have just ++ pulled the buffer off a committing transaction's forget list, ++ then even if the checkpoint list is empty, the transaction ++ obviously cannot be dropped! */ ++ ++ if (transaction == journal->j_committing_transaction) { ++ JBUFFER_TRACE(jh, "belongs to committing transaction"); ++ goto out; ++ } ++ ++ /* OK, that was the last buffer for the transaction: we can now ++ safely remove this transaction from the log */ ++ ++ __journal_drop_transaction(journal, transaction); ++ ++ /* Just in case anybody was waiting for more transactions to be ++ checkpointed... */ ++ wake_up(&journal->j_wait_logspace); ++out: ++ JBUFFER_TRACE(jh, "exit"); ++} ++ ++void journal_remove_checkpoint(struct journal_head *jh) ++{ ++ spin_lock(&journal_datalist_lock); ++ __journal_remove_checkpoint(jh); ++ spin_unlock(&journal_datalist_lock); ++} ++ ++/* ++ * journal_insert_checkpoint: put a committed buffer onto a checkpoint ++ * list so that we know when it is safe to clean the transaction out of ++ * the log. ++ * ++ * Called with the journal locked. ++ * Called with journal_datalist_lock held. ++ */ ++void __journal_insert_checkpoint(struct journal_head *jh, ++ transaction_t *transaction) ++{ ++ JBUFFER_TRACE(jh, "entry"); ++ J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jdirty(jh2bh(jh))); ++ J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); ++ ++ assert_spin_locked(&journal_datalist_lock); ++ jh->b_cp_transaction = transaction; ++ ++ if (!transaction->t_checkpoint_list) { ++ jh->b_cpnext = jh->b_cpprev = jh; ++ } else { ++ jh->b_cpnext = transaction->t_checkpoint_list; ++ jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev; ++ jh->b_cpprev->b_cpnext = jh; ++ jh->b_cpnext->b_cpprev = jh; ++ } ++ transaction->t_checkpoint_list = jh; ++} ++ ++void journal_insert_checkpoint(struct journal_head *jh, ++ transaction_t *transaction) ++{ ++ spin_lock(&journal_datalist_lock); ++ __journal_insert_checkpoint(jh, transaction); ++ spin_unlock(&journal_datalist_lock); ++} ++ ++/* ++ * We've finished with this transaction structure: adios... ++ * ++ * The transaction must have no links except for the checkpoint by this ++ * point. ++ * ++ * Called with the journal locked. ++ * Called with journal_datalist_lock held. 
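++ * ++ * Everything else (buffer lists, t_updates) is asserted empty below ++ * before the transaction_t is kfree()d.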
++ */ ++ ++void __journal_drop_transaction(journal_t *journal, transaction_t *transaction) ++{ ++ assert_spin_locked(&journal_datalist_lock); ++ if (transaction->t_cpnext) { ++ transaction->t_cpnext->t_cpprev = transaction->t_cpprev; ++ transaction->t_cpprev->t_cpnext = transaction->t_cpnext; ++ if (journal->j_checkpoint_transactions == transaction) ++ journal->j_checkpoint_transactions = ++ transaction->t_cpnext; ++ if (journal->j_checkpoint_transactions == transaction) ++ journal->j_checkpoint_transactions = NULL; ++ } ++ ++ J_ASSERT (transaction->t_ilist == NULL); ++ J_ASSERT (transaction->t_buffers == NULL); ++ J_ASSERT (transaction->t_sync_datalist == NULL); ++ J_ASSERT (transaction->t_async_datalist == NULL); ++ J_ASSERT (transaction->t_forget == NULL); ++ J_ASSERT (transaction->t_iobuf_list == NULL); ++ J_ASSERT (transaction->t_shadow_list == NULL); ++ J_ASSERT (transaction->t_log_list == NULL); ++ J_ASSERT (transaction->t_checkpoint_list == NULL); ++ J_ASSERT (transaction->t_updates == 0); ++ ++ J_ASSERT (transaction->t_journal->j_committing_transaction != ++ transaction); ++ ++ jbd_debug (1, "Dropping transaction %d, all done\n", ++ transaction->t_tid); ++ kfree (transaction); ++} ++ +diff -ruP linux.mcp2/fs/jbd/commit.c linuxppc_2.4.19_final/fs/jbd/commit.c +--- linux.mcp2/fs/jbd/commit.c 1969-12-31 16:00:00.000000000 -0800 ++++ linuxppc_2.4.19_final/fs/jbd/commit.c 2004-05-17 13:56:17.000000000 -0700 +@@ -0,0 +1,719 @@ ++/* ++ * linux/fs/commit.c ++ * ++ * Written by Stephen C. Tweedie , 1998 ++ * ++ * Copyright 1998 Red Hat corp --- All Rights Reserved ++ * ++ * This file is part of the Linux kernel and is made available under ++ * the terms of the GNU General Public License, version 2, or at your ++ * option, any later version, incorporated herein by reference. ++ * ++ * Journal commit routines for the generic filesystem journaling code; ++ * part of the ext2fs journaling system. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++extern spinlock_t journal_datalist_lock; ++ ++/* ++ * Default IO end handler for temporary BJ_IO buffer_heads. ++ */ ++void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) ++{ ++ BUFFER_TRACE(bh, ""); ++ mark_buffer_uptodate(bh, uptodate); ++ unlock_buffer(bh); ++} ++ ++/* ++ * journal_commit_transaction ++ * ++ * The primary function for committing a transaction to the log. This ++ * function is called by the journal thread to begin a complete commit. ++ */ ++void journal_commit_transaction(journal_t *journal) ++{ ++ transaction_t *commit_transaction; ++ struct journal_head *jh, *new_jh, *descriptor; ++ struct journal_head *next_jh, *last_jh; ++ struct buffer_head *wbuf[64]; ++ int bufs; ++ int flags; ++ int err; ++ unsigned long blocknr; ++ char *tagp = NULL; ++ journal_header_t *header; ++ journal_block_tag_t *tag = NULL; ++ int space_left = 0; ++ int first_tag = 0; ++ int tag_flag; ++ int i; ++ ++ /* ++ * First job: lock down the current transaction and wait for ++ * all outstanding updates to complete. 
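++ * ++ * t_updates counts open handles; roughly, each filesystem update does ++ * ++ * handle = journal_start(journal, nblocks); ++ * journal_get_write_access(handle, bh); ++ * (modify the buffer) ++ * journal_dirty_metadata(handle, bh); ++ * journal_stop(handle); ++ * ++ * journal_stop() decrements t_updates and wakes j_wait_updates, so ++ * marking the transaction T_LOCKED and sleeping here drains the updates.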
++ */ ++ ++ lock_journal(journal); /* Protect journal->j_running_transaction */ ++ ++#ifdef COMMIT_STATS ++ spin_lock(&journal_datalist_lock); ++ summarise_journal_usage(journal); ++ spin_unlock(&journal_datalist_lock); ++#endif ++ ++ lock_kernel(); ++ ++ J_ASSERT (journal->j_running_transaction != NULL); ++ J_ASSERT (journal->j_committing_transaction == NULL); ++ ++ commit_transaction = journal->j_running_transaction; ++ J_ASSERT (commit_transaction->t_state == T_RUNNING); ++ ++ jbd_debug (1, "JBD: starting commit of transaction %d\n", ++ commit_transaction->t_tid); ++ ++ commit_transaction->t_state = T_LOCKED; ++ while (commit_transaction->t_updates != 0) { ++ unlock_journal(journal); ++ sleep_on(&journal->j_wait_updates); ++ lock_journal(journal); ++ } ++ ++ J_ASSERT (commit_transaction->t_outstanding_credits <= ++ journal->j_max_transaction_buffers); ++ ++ /* Do we need to erase the effects of a prior journal_flush? */ ++ if (journal->j_flags & JFS_FLUSHED) { ++ jbd_debug(3, "super block updated\n"); ++ journal_update_superblock(journal, 1); ++ } else { ++ jbd_debug(3, "superblock not updated\n"); ++ } ++ ++ /* ++ * First thing we are allowed to do is to discard any remaining ++ * BJ_Reserved buffers. Note, it is _not_ permissible to assume ++ * that there are no such buffers: if a large filesystem ++ * operation like a truncate needs to split itself over multiple ++ * transactions, then it may try to do a journal_restart() while ++ * there are still BJ_Reserved buffers outstanding. These must ++ * be released cleanly from the current transaction. ++ * ++ * In this case, the filesystem must still reserve write access ++ * again before modifying the buffer in the new transaction, but ++ * we do not require it to remember exactly which old buffers it ++ * has reserved. This is consistent with the existing behaviour ++ * that multiple journal_get_write_access() calls to the same ++ * buffer are perfectly permissable. ++ */ ++ ++ while (commit_transaction->t_reserved_list) { ++ jh = commit_transaction->t_reserved_list; ++ JBUFFER_TRACE(jh, "reserved, unused: refile"); ++ journal_refile_buffer(jh); ++ } ++ ++ /* ++ * Now try to drop any written-back buffers from the journal's ++ * checkpoint lists. We do this *before* commit because it potentially ++ * frees some memory ++ */ ++ spin_lock(&journal_datalist_lock); ++ __journal_clean_checkpoint_list(journal); ++ spin_unlock(&journal_datalist_lock); ++ ++ /* First part of the commit: force the revoke list out to disk. ++ * The revoke code generates its own metadata blocks on disk for this. ++ * ++ * It is important that we do this while the transaction is ++ * still locked. Generating the revoke records should not ++ * generate any IO stalls, so this should be quick; and doing ++ * the work while we have the transaction locked means that we ++ * only ever have to maintain the revoke list for one ++ * transaction at a time. ++ */ ++ ++ jbd_debug (3, "JBD: commit phase 1\n"); ++ ++ journal_write_revoke_records(journal, commit_transaction); ++ ++ /* ++ * Now that we have built the revoke records, we can start ++ * reusing the revoke list for a new running transaction. 
We ++ * can now safely start committing the old transaction: time to ++ * get a new running transaction for incoming filesystem updates ++ */ ++ ++ commit_transaction->t_state = T_FLUSH; ++ ++ wake_up(&journal->j_wait_transaction_locked); ++ ++ journal->j_committing_transaction = commit_transaction; ++ journal->j_running_transaction = NULL; ++ ++ commit_transaction->t_log_start = journal->j_head; ++ ++ unlock_kernel(); ++ ++ jbd_debug (3, "JBD: commit phase 2\n"); ++ ++ /* ++ * Now start flushing things to disk, in the order they appear ++ * on the transaction lists. Data blocks go first. ++ */ ++ ++ /* ++ * Whenever we unlock the journal and sleep, things can get added ++ * onto ->t_datalist, so we have to keep looping back to write_out_data ++ * until we *know* that the list is empty. ++ */ ++write_out_data: ++ ++ /* ++ * Cleanup any flushed data buffers from the data list. Even in ++ * abort mode, we want to flush this out as soon as possible. ++ * ++ * We take journal_datalist_lock to protect the lists from ++ * journal_try_to_free_buffers(). ++ */ ++ spin_lock(&journal_datalist_lock); ++ ++write_out_data_locked: ++ bufs = 0; ++ next_jh = commit_transaction->t_sync_datalist; ++ if (next_jh == NULL) ++ goto sync_datalist_empty; ++ last_jh = next_jh->b_tprev; ++ ++ do { ++ struct buffer_head *bh; ++ ++ jh = next_jh; ++ next_jh = jh->b_tnext; ++ bh = jh2bh(jh); ++ if (!buffer_locked(bh)) { ++ if (buffer_dirty(bh)) { ++ BUFFER_TRACE(bh, "start journal writeout"); ++ atomic_inc(&bh->b_count); ++ wbuf[bufs++] = bh; ++ } else { ++ BUFFER_TRACE(bh, "writeout complete: unfile"); ++ __journal_unfile_buffer(jh); ++ jh->b_transaction = NULL; ++ __journal_remove_journal_head(bh); ++ refile_buffer(bh); ++ __brelse(bh); ++ } ++ } ++ if (bufs == ARRAY_SIZE(wbuf)) { ++ /* ++ * Major speedup: start here on the next scan ++ */ ++ J_ASSERT(commit_transaction->t_sync_datalist != 0); ++ commit_transaction->t_sync_datalist = jh; ++ break; ++ } ++ } while (jh != last_jh); ++ ++ if (bufs || current->need_resched) { ++ jbd_debug(2, "submit %d writes\n", bufs); ++ spin_unlock(&journal_datalist_lock); ++ unlock_journal(journal); ++ if (bufs) ++ ll_rw_block(WRITE, bufs, wbuf); ++ if (current->need_resched) ++ schedule(); ++ journal_brelse_array(wbuf, bufs); ++ lock_journal(journal); ++ spin_lock(&journal_datalist_lock); ++ if (bufs) ++ goto write_out_data_locked; ++ } ++ ++ /* ++ * Wait for all previously submitted IO on the data list to complete. ++ */ ++ jh = commit_transaction->t_sync_datalist; ++ if (jh == NULL) ++ goto sync_datalist_empty; ++ ++ do { ++ struct buffer_head *bh; ++ jh = jh->b_tprev; /* Wait on the last written */ ++ bh = jh2bh(jh); ++ if (buffer_locked(bh)) { ++ spin_unlock(&journal_datalist_lock); ++ unlock_journal(journal); ++ wait_on_buffer(bh); ++ /* the journal_head may have been removed now */ ++ lock_journal(journal); ++ goto write_out_data; ++ } else if (buffer_dirty(bh)) { ++ goto write_out_data_locked; ++ } ++ } while (jh != commit_transaction->t_sync_datalist); ++ goto write_out_data_locked; ++ ++sync_datalist_empty: ++ /* ++ * Wait for all the async writepage data. As they become unlocked ++ * in end_buffer_io_async(), the only place where they can be ++ * reaped is in try_to_free_buffers(), and we're locked against ++ * that. 
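++ * ("locked against that" here means we still hold ++ * journal_datalist_lock, which journal_try_to_free_buffers() must also ++ * take.)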
++ */ ++ while ((jh = commit_transaction->t_async_datalist)) { ++ struct buffer_head *bh = jh2bh(jh); ++ if (buffer_locked(bh)) { ++ spin_unlock(&journal_datalist_lock); ++ unlock_journal(journal); ++ wait_on_buffer(bh); ++ lock_journal(journal); ++ spin_lock(&journal_datalist_lock); ++ continue; /* List may have changed */ ++ } ++ if (jh->b_next_transaction) { ++ /* ++ * For writepage() buffers in journalled data mode: a ++ * later transaction may want the buffer for "metadata" ++ */ ++ __journal_refile_buffer(jh); ++ } else { ++ BUFFER_TRACE(bh, "finished async writeout: unfile"); ++ __journal_unfile_buffer(jh); ++ jh->b_transaction = NULL; ++ __journal_remove_journal_head(bh); ++ BUFFER_TRACE(bh, "finished async writeout: refile"); ++ /* It can sometimes be on BUF_LOCKED due to migration ++ * from syncdata to asyncdata */ ++ if (bh->b_list != BUF_CLEAN) ++ refile_buffer(bh); ++ __brelse(bh); ++ } ++ } ++ spin_unlock(&journal_datalist_lock); ++ ++ /* ++ * If we found any dirty or locked buffers, then we should have ++ * looped back up to the write_out_data label. If there weren't ++ * any then journal_clean_data_list should have wiped the list ++ * clean by now, so check that it is in fact empty. ++ */ ++ J_ASSERT (commit_transaction->t_sync_datalist == NULL); ++ J_ASSERT (commit_transaction->t_async_datalist == NULL); ++ ++ jbd_debug (3, "JBD: commit phase 3\n"); ++ ++ /* ++ * Way to go: we have now written out all of the data for a ++ * transaction! Now comes the tricky part: we need to write out ++ * metadata. Loop over the transaction's entire buffer list: ++ */ ++ commit_transaction->t_state = T_COMMIT; ++ ++ descriptor = 0; ++ bufs = 0; ++ while (commit_transaction->t_buffers) { ++ ++ /* Find the next buffer to be journaled... */ ++ ++ jh = commit_transaction->t_buffers; ++ ++ /* If we're in abort mode, we just un-journal the buffer and ++ release it for background writing. */ ++ ++ if (is_journal_aborted(journal)) { ++ JBUFFER_TRACE(jh, "journal is aborting: refile"); ++ journal_refile_buffer(jh); ++ /* If that was the last one, we need to clean up ++ * any descriptor buffers which may have been ++ * already allocated, even if we are now ++ * aborting. */ ++ if (!commit_transaction->t_buffers) ++ goto start_journal_io; ++ continue; ++ } ++ ++ /* Make sure we have a descriptor block in which to ++ record the metadata buffer. */ ++ ++ if (!descriptor) { ++ struct buffer_head *bh; ++ ++ J_ASSERT (bufs == 0); ++ ++ jbd_debug(4, "JBD: get descriptor\n"); ++ ++ descriptor = journal_get_descriptor_buffer(journal); ++ if (!descriptor) { ++ __journal_abort_hard(journal); ++ continue; ++ } ++ ++ bh = jh2bh(descriptor); ++ jbd_debug(4, "JBD: got buffer %ld (%p)\n", ++ bh->b_blocknr, bh->b_data); ++ header = (journal_header_t *)&bh->b_data[0]; ++ header->h_magic = htonl(JFS_MAGIC_NUMBER); ++ header->h_blocktype = htonl(JFS_DESCRIPTOR_BLOCK); ++ header->h_sequence = htonl(commit_transaction->t_tid); ++ ++ tagp = &bh->b_data[sizeof(journal_header_t)]; ++ space_left = bh->b_size - sizeof(journal_header_t); ++ first_tag = 1; ++ set_bit(BH_JWrite, &bh->b_state); ++ wbuf[bufs++] = bh; ++ ++ /* Record it so that we can wait for IO ++ completion later */ ++ BUFFER_TRACE(bh, "ph3: file as descriptor"); ++ journal_file_buffer(descriptor, commit_transaction, ++ BJ_LogCtl); ++ } ++ ++ /* Where is the buffer to be written? 
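++ * journal_next_log_block() hands back the next free block in the ++ * on-disk journal area, advancing j_head.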
*/ ++ ++ err = journal_next_log_block(journal, &blocknr); ++ /* If the block mapping failed, just abandon the buffer ++ and repeat this loop: we'll fall into the ++ refile-on-abort condition above. */ ++ if (err) { ++ __journal_abort_hard(journal); ++ continue; ++ } ++ ++ /* Bump b_count to prevent truncate from stumbling over ++ the shadowed buffer! @@@ This can go if we ever get ++ rid of the BJ_IO/BJ_Shadow pairing of buffers. */ ++ atomic_inc(&jh2bh(jh)->b_count); ++ ++ /* Make a temporary IO buffer with which to write it out ++ (this will requeue both the metadata buffer and the ++ temporary IO buffer). new_bh goes on BJ_IO*/ ++ ++ set_bit(BH_JWrite, &jh2bh(jh)->b_state); ++ /* ++ * akpm: journal_write_metadata_buffer() sets ++ * new_bh->b_transaction to commit_transaction. ++ * We need to clean this up before we release new_bh ++ * (which is of type BJ_IO) ++ */ ++ JBUFFER_TRACE(jh, "ph3: write metadata"); ++ flags = journal_write_metadata_buffer(commit_transaction, ++ jh, &new_jh, blocknr); ++ set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); ++ set_bit(BH_Lock, &jh2bh(new_jh)->b_state); ++ wbuf[bufs++] = jh2bh(new_jh); ++ ++ /* Record the new block's tag in the current descriptor ++ buffer */ ++ ++ tag_flag = 0; ++ if (flags & 1) ++ tag_flag |= JFS_FLAG_ESCAPE; ++ if (!first_tag) ++ tag_flag |= JFS_FLAG_SAME_UUID; ++ ++ tag = (journal_block_tag_t *) tagp; ++ tag->t_blocknr = htonl(jh2bh(jh)->b_blocknr); ++ tag->t_flags = htonl(tag_flag); ++ tagp += sizeof(journal_block_tag_t); ++ space_left -= sizeof(journal_block_tag_t); ++ ++ if (first_tag) { ++ memcpy (tagp, journal->j_uuid, 16); ++ tagp += 16; ++ space_left -= 16; ++ first_tag = 0; ++ } ++ ++ /* If there's no more to do, or if the descriptor is full, ++ let the IO rip! */ ++ ++ if (bufs == ARRAY_SIZE(wbuf) || ++ commit_transaction->t_buffers == NULL || ++ space_left < sizeof(journal_block_tag_t) + 16) { ++ ++ jbd_debug(4, "JBD: Submit %d IOs\n", bufs); ++ ++ /* Write an end-of-descriptor marker before ++ submitting the IOs. "tag" still points to ++ the last tag we set up. */ ++ ++ tag->t_flags |= htonl(JFS_FLAG_LAST_TAG); ++ ++start_journal_io: ++ unlock_journal(journal); ++ for (i=0; ib_state); ++ bh->b_end_io = journal_end_buffer_io_sync; ++ submit_bh(WRITE, bh); ++ } ++ if (current->need_resched) ++ schedule(); ++ lock_journal(journal); ++ ++ /* Force a new descriptor to be generated next ++ time round the loop. */ ++ descriptor = NULL; ++ bufs = 0; ++ } ++ } ++ ++ /* Lo and behold: we have just managed to send a transaction to ++ the log. Before we can commit it, wait for the IO so far to ++ complete. Control buffers being written are on the ++ transaction's t_log_list queue, and metadata buffers are on ++ the t_iobuf_list queue. ++ ++ Wait for the transactions in reverse order. That way we are ++ less likely to be woken up until all IOs have completed, and ++ so we incur less scheduling load. 
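++ * (Waiting on the most recently submitted buffer first means a single ++ * sleep usually covers the whole batch.)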
++ */ ++ ++ jbd_debug(3, "JBD: commit phase 4\n"); ++ ++ /* akpm: these are BJ_IO, and journal_datalist_lock is not needed */ ++ wait_for_iobuf: ++ while (commit_transaction->t_iobuf_list != NULL) { ++ struct buffer_head *bh; ++ jh = commit_transaction->t_iobuf_list->b_tprev; ++ bh = jh2bh(jh); ++ if (buffer_locked(bh)) { ++ unlock_journal(journal); ++ wait_on_buffer(bh); ++ lock_journal(journal); ++ goto wait_for_iobuf; ++ } ++ ++ clear_bit(BH_JWrite, &jh2bh(jh)->b_state); ++ ++ JBUFFER_TRACE(jh, "ph4: unfile after journal write"); ++ journal_unfile_buffer(jh); ++ ++ /* ++ * akpm: don't put back a buffer_head with stale pointers ++ * dangling around. ++ */ ++ J_ASSERT_JH(jh, jh->b_transaction != NULL); ++ jh->b_transaction = NULL; ++ ++ /* ++ * ->t_iobuf_list should contain only dummy buffer_heads ++ * which were created by journal_write_metadata_buffer(). ++ */ ++ bh = jh2bh(jh); ++ BUFFER_TRACE(bh, "dumping temporary bh"); ++ journal_unlock_journal_head(jh); ++ __brelse(bh); ++ J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0); ++ put_unused_buffer_head(bh); ++ ++ /* We also have to unlock and free the corresponding ++ shadowed buffer */ ++ jh = commit_transaction->t_shadow_list->b_tprev; ++ bh = jh2bh(jh); ++ clear_bit(BH_JWrite, &bh->b_state); ++ J_ASSERT_BH(bh, buffer_jdirty(bh)); ++ ++ /* The metadata is now released for reuse, but we need ++ to remember it against this transaction so that when ++ we finally commit, we can do any checkpointing ++ required. */ ++ JBUFFER_TRACE(jh, "file as BJ_Forget"); ++ journal_file_buffer(jh, commit_transaction, BJ_Forget); ++ /* Wake up any transactions which were waiting for this ++ IO to complete */ ++ wake_up(&bh->b_wait); ++ JBUFFER_TRACE(jh, "brelse shadowed buffer"); ++ __brelse(bh); ++ } ++ ++ J_ASSERT (commit_transaction->t_shadow_list == NULL); ++ ++ jbd_debug(3, "JBD: commit phase 5\n"); ++ ++ /* Here we wait for the revoke record and descriptor record buffers */ ++ wait_for_ctlbuf: ++ while (commit_transaction->t_log_list != NULL) { ++ struct buffer_head *bh; ++ ++ jh = commit_transaction->t_log_list->b_tprev; ++ bh = jh2bh(jh); ++ if (buffer_locked(bh)) { ++ unlock_journal(journal); ++ wait_on_buffer(bh); ++ lock_journal(journal); ++ goto wait_for_ctlbuf; ++ } ++ ++ BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); ++ clear_bit(BH_JWrite, &bh->b_state); ++ journal_unfile_buffer(jh); ++ jh->b_transaction = NULL; ++ journal_unlock_journal_head(jh); ++ put_bh(bh); /* One for getblk */ ++ } ++ ++ jbd_debug(3, "JBD: commit phase 6\n"); ++ ++ if (is_journal_aborted(journal)) ++ goto skip_commit; ++ ++ /* Done it all: now write the commit record. We should have ++ * cleaned up our previous buffers by now, so if we are in abort ++ * mode we can now just skip the rest of the journal write ++ * entirely. */ ++ ++ descriptor = journal_get_descriptor_buffer(journal); ++ if (!descriptor) { ++ __journal_abort_hard(journal); ++ goto skip_commit; ++ } ++ ++ /* AKPM: buglet - add `i' to tmp! 
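++ * As written, `tmp' never has `i' added to it, so every pass of the ++ * loop stamps the header at offset 0 and only the first 512 bytes of ++ * the commit block are actually initialised.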
*/ ++ for (i = 0; i < jh2bh(descriptor)->b_size; i += 512) { ++ journal_header_t *tmp = ++ (journal_header_t*)jh2bh(descriptor)->b_data; ++ tmp->h_magic = htonl(JFS_MAGIC_NUMBER); ++ tmp->h_blocktype = htonl(JFS_COMMIT_BLOCK); ++ tmp->h_sequence = htonl(commit_transaction->t_tid); ++ } ++ ++ unlock_journal(journal); ++ JBUFFER_TRACE(descriptor, "write commit block"); ++ { ++ struct buffer_head *bh = jh2bh(descriptor); ++ clear_bit(BH_Dirty, &bh->b_state); ++ bh->b_end_io = journal_end_buffer_io_sync; ++ submit_bh(WRITE, bh); ++ wait_on_buffer(bh); ++ put_bh(bh); /* One for getblk() */ ++ journal_unlock_journal_head(descriptor); ++ } ++ lock_journal(journal); ++ ++ /* End of a transaction! Finally, we can do checkpoint ++ processing: any buffers committed as a result of this ++ transaction can be removed from any checkpoint list it was on ++ before. */ ++ ++skip_commit: ++ ++ jbd_debug(3, "JBD: commit phase 7\n"); ++ ++ J_ASSERT(commit_transaction->t_sync_datalist == NULL); ++ J_ASSERT(commit_transaction->t_async_datalist == NULL); ++ J_ASSERT(commit_transaction->t_buffers == NULL); ++ J_ASSERT(commit_transaction->t_checkpoint_list == NULL); ++ J_ASSERT(commit_transaction->t_iobuf_list == NULL); ++ J_ASSERT(commit_transaction->t_shadow_list == NULL); ++ J_ASSERT(commit_transaction->t_log_list == NULL); ++ ++ while (commit_transaction->t_forget) { ++ transaction_t *cp_transaction; ++ struct buffer_head *bh; ++ ++ jh = commit_transaction->t_forget; ++ J_ASSERT_JH(jh, jh->b_transaction == commit_transaction || ++ jh->b_transaction == journal->j_running_transaction); ++ ++ /* ++ * If there is undo-protected committed data against ++ * this buffer, then we can remove it now. If it is a ++ * buffer needing such protection, the old frozen_data ++ * field now points to a committed version of the ++ * buffer, so rotate that field to the new committed ++ * data. ++ * ++ * Otherwise, we can just throw away the frozen data now. ++ */ ++ if (jh->b_committed_data) { ++ kfree(jh->b_committed_data); ++ jh->b_committed_data = NULL; ++ if (jh->b_frozen_data) { ++ jh->b_committed_data = jh->b_frozen_data; ++ jh->b_frozen_data = NULL; ++ } ++ } else if (jh->b_frozen_data) { ++ kfree(jh->b_frozen_data); ++ jh->b_frozen_data = NULL; ++ } ++ ++ spin_lock(&journal_datalist_lock); ++ cp_transaction = jh->b_cp_transaction; ++ if (cp_transaction) { ++ JBUFFER_TRACE(jh, "remove from old cp transaction"); ++ J_ASSERT_JH(jh, commit_transaction != cp_transaction); ++ __journal_remove_checkpoint(jh); ++ } ++ ++ /* Only re-checkpoint the buffer_head if it is marked ++ * dirty. If the buffer was added to the BJ_Forget list ++ * by journal_forget, it may no longer be dirty and ++ * there's no point in keeping a checkpoint record for ++ * it. */ ++ bh = jh2bh(jh); ++ if (buffer_jdirty(bh)) { ++ JBUFFER_TRACE(jh, "add to new checkpointing trans"); ++ __journal_insert_checkpoint(jh, commit_transaction); ++ JBUFFER_TRACE(jh, "refile for checkpoint writeback"); ++ __journal_refile_buffer(jh); ++ } else { ++ J_ASSERT_BH(bh, !buffer_dirty(bh)); ++ J_ASSERT_JH(jh, jh->b_next_transaction == NULL); ++ __journal_unfile_buffer(jh); ++ jh->b_transaction = 0; ++ __journal_remove_journal_head(bh); ++ __brelse(bh); ++ } ++ spin_unlock(&journal_datalist_lock); ++ } ++ ++ /* Done with this transaction! 
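++ * The transaction moves T_COMMIT -> T_FINISHED below, records its tid ++ * as j_commit_sequence, and then either joins the journal's checkpoint ++ * ring or is dropped immediately if it has nothing left to checkpoint.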
*/ ++ ++ jbd_debug(3, "JBD: commit phase 8\n"); ++ ++ J_ASSERT (commit_transaction->t_state == T_COMMIT); ++ commit_transaction->t_state = T_FINISHED; ++ ++ J_ASSERT (commit_transaction == journal->j_committing_transaction); ++ journal->j_commit_sequence = commit_transaction->t_tid; ++ journal->j_committing_transaction = NULL; ++ ++ spin_lock(&journal_datalist_lock); ++ if (commit_transaction->t_checkpoint_list == NULL) { ++ __journal_drop_transaction(journal, commit_transaction); ++ } else { ++ if (journal->j_checkpoint_transactions == NULL) { ++ journal->j_checkpoint_transactions = commit_transaction; ++ commit_transaction->t_cpnext = commit_transaction; ++ commit_transaction->t_cpprev = commit_transaction; ++ } else { ++ commit_transaction->t_cpnext = ++ journal->j_checkpoint_transactions; ++ commit_transaction->t_cpprev = ++ commit_transaction->t_cpnext->t_cpprev; ++ commit_transaction->t_cpnext->t_cpprev = ++ commit_transaction; ++ commit_transaction->t_cpprev->t_cpnext = ++ commit_transaction; ++ } ++ } ++ spin_unlock(&journal_datalist_lock); ++ ++ jbd_debug(1, "JBD: commit %d complete, head %d\n", ++ journal->j_commit_sequence, journal->j_tail_sequence); ++ ++ unlock_journal(journal); ++ wake_up(&journal->j_wait_done_commit); ++} +diff -ruP linux.mcp2/fs/jbd/journal.c linuxppc_2.4.19_final/fs/jbd/journal.c +--- linux.mcp2/fs/jbd/journal.c 1969-12-31 16:00:00.000000000 -0800 ++++ linuxppc_2.4.19_final/fs/jbd/journal.c 2004-05-17 13:56:17.000000000 -0700 +@@ -0,0 +1,1877 @@ ++/* ++ * linux/fs/journal.c ++ * ++ * Written by Stephen C. Tweedie , 1998 ++ * ++ * Copyright 1998 Red Hat corp --- All Rights Reserved ++ * ++ * This file is part of the Linux kernel and is made available under ++ * the terms of the GNU General Public License, version 2, or at your ++ * option, any later version, incorporated herein by reference. ++ * ++ * Generic filesystem journal-writing code; part of the ext2fs ++ * journaling system. ++ * ++ * This file manages journals: areas of disk reserved for logging ++ * transactional updates. This includes the kernel journaling thread ++ * which is responsible for scheduling updates to the log. ++ * ++ * We do not actually manage the physical storage of the journal in this ++ * file: that is left to a per-journal policy function, which allows us ++ * to store the journal within a filesystem-specified area for ext2 ++ * journaling (ext2 can use a reserved inode for storing the log). 
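++ *
++ * The per-journal commit thread, kjournald(), also lives here: it is
++ * started by journal_start_thread() and woken either by an explicit
++ * log_start_commit() or by its commit-interval timer.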
++ */
++
++#include 
++#include 
++#include 
++#include 
++#include 
++#include 
++#include 
++#include 
++#include 
++#include 
++#include 
++#include 
++#include 
++#include 
++
++EXPORT_SYMBOL(journal_start);
++EXPORT_SYMBOL(journal_try_start);
++EXPORT_SYMBOL(journal_restart);
++EXPORT_SYMBOL(journal_extend);
++EXPORT_SYMBOL(journal_stop);
++EXPORT_SYMBOL(journal_lock_updates);
++EXPORT_SYMBOL(journal_unlock_updates);
++EXPORT_SYMBOL(journal_get_write_access);
++EXPORT_SYMBOL(journal_get_create_access);
++EXPORT_SYMBOL(journal_get_undo_access);
++EXPORT_SYMBOL(journal_dirty_data);
++EXPORT_SYMBOL(journal_dirty_metadata);
++#if 0
++EXPORT_SYMBOL(journal_release_buffer);
++#endif
++EXPORT_SYMBOL(journal_forget);
++#if 0
++EXPORT_SYMBOL(journal_sync_buffer);
++#endif
++EXPORT_SYMBOL(journal_flush);
++EXPORT_SYMBOL(journal_revoke);
++
++EXPORT_SYMBOL(journal_init_dev);
++EXPORT_SYMBOL(journal_init_inode);
++EXPORT_SYMBOL(journal_update_format);
++EXPORT_SYMBOL(journal_check_used_features);
++EXPORT_SYMBOL(journal_check_available_features);
++EXPORT_SYMBOL(journal_set_features);
++EXPORT_SYMBOL(journal_create);
++EXPORT_SYMBOL(journal_load);
++EXPORT_SYMBOL(journal_destroy);
++EXPORT_SYMBOL(journal_recover);
++EXPORT_SYMBOL(journal_update_superblock);
++EXPORT_SYMBOL(journal_abort);
++EXPORT_SYMBOL(journal_errno);
++EXPORT_SYMBOL(journal_ack_err);
++EXPORT_SYMBOL(journal_clear_err);
++EXPORT_SYMBOL(log_wait_commit);
++EXPORT_SYMBOL(log_start_commit);
++EXPORT_SYMBOL(journal_wipe);
++EXPORT_SYMBOL(journal_blocks_per_page);
++EXPORT_SYMBOL(journal_flushpage);
++EXPORT_SYMBOL(journal_try_to_free_buffers);
++EXPORT_SYMBOL(journal_bmap);
++EXPORT_SYMBOL(journal_force_commit);
++
++static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
++
++/*
++ * journal_datalist_lock is used to protect data buffers:
++ *
++ * bh->b_transaction
++ * bh->b_tprev
++ * bh->b_tnext
++ *
++ * journal_free_buffer() is called from journal_try_to_free_buffer(), and is
++ * async wrt everything else.
++ *
++ * It is also used for checkpoint data, also to protect against
++ * journal_try_to_free_buffer():
++ *
++ * bh->b_cp_transaction
++ * bh->b_cpnext
++ * bh->b_cpprev
++ * transaction->t_checkpoint_list
++ * transaction->t_cpnext
++ * transaction->t_cpprev
++ * journal->j_checkpoint_transactions
++ *
++ * It is global at this time rather than per-journal because it's
++ * impossible for __journal_free_buffer to go from a buffer_head
++ * back to a journal_t unracily (well, not true. Fix later)
++ *
++ *
++ * The `datalist' and `checkpoint list' functions are quite
++ * separate and we could use two spinlocks here.
++ *
++ * lru_list_lock nests inside journal_datalist_lock.
++ */
++spinlock_t journal_datalist_lock = SPIN_LOCK_UNLOCKED;
++
++/*
++ * jh_splice_lock needs explanation.
++ *
++ * In a number of places we want to do things like:
++ *
++ * if (buffer_jbd(bh) && bh2jh(bh)->foo)
++ *
++ * This is racy on SMP, because another CPU could remove the journal_head
++ * in the middle of this expression. We need locking.
++ *
++ * But we can greatly optimise the locking cost by testing BH_JBD
++ * outside the lock. So, effectively:
++ *
++ * ret = 0;
++ * if (buffer_jbd(bh)) {
++ * spin_lock(&jh_splice_lock);
++ * if (buffer_jbd(bh)) { (* Still there? *)
++ * ret = bh2jh(bh)->foo;
++ * }
++ * spin_unlock(&jh_splice_lock);
++ * }
++ * return ret;
++ *
++ * Now, that protects us from races where another CPU can remove the
++ * journal_head.
But it doesn't defend us from the situation where another ++ * CPU can *add* a journal_head. This is a correctness issue. But it's not ++ * a problem because a) the calling code was *already* racy and b) it often ++ * can't happen at the call site and c) the places where we add journal_heads ++ * tend to be under external locking. ++ */ ++spinlock_t jh_splice_lock = SPIN_LOCK_UNLOCKED; ++ ++/* ++ * List of all journals in the system. Protected by the BKL. ++ */ ++static LIST_HEAD(all_journals); ++ ++/* ++ * Helper function used to manage commit timeouts ++ */ ++ ++static void commit_timeout(unsigned long __data) ++{ ++ struct task_struct * p = (struct task_struct *) __data; ++ ++ wake_up_process(p); ++} ++ ++/* Static check for data structure consistency. There's no code ++ * invoked --- we'll just get a linker failure if things aren't right. ++ */ ++void __journal_internal_check(void) ++{ ++ extern void journal_bad_superblock_size(void); ++ if (sizeof(struct journal_superblock_s) != 1024) ++ journal_bad_superblock_size(); ++} ++ ++/* ++ * kjournald: The main thread function used to manage a logging device ++ * journal. ++ * ++ * This kernel thread is responsible for two things: ++ * ++ * 1) COMMIT: Every so often we need to commit the current state of the ++ * filesystem to disk. The journal thread is responsible for writing ++ * all of the metadata buffers to disk. ++ * ++ * 2) CHECKPOINT: We cannot reuse a used section of the log file until all ++ * of the data in that part of the log has been rewritten elsewhere on ++ * the disk. Flushing these old buffers to reclaim space in the log is ++ * known as checkpointing, and this thread is responsible for that job. ++ */ ++ ++journal_t *current_journal; // AKPM: debug ++ ++int kjournald(void *arg) ++{ ++ journal_t *journal = (journal_t *) arg; ++ transaction_t *transaction; ++ struct timer_list timer; ++ ++ current_journal = journal; ++ ++ lock_kernel(); ++ daemonize(); ++ reparent_to_init(); ++ spin_lock_irq(¤t->sigmask_lock); ++ sigfillset(¤t->blocked); ++ recalc_sigpending(current); ++ spin_unlock_irq(¤t->sigmask_lock); ++ ++ sprintf(current->comm, "kjournald"); ++ ++ /* Set up an interval timer which can be used to trigger a ++ commit wakeup after the commit interval expires */ ++ init_timer(&timer); ++ timer.data = (unsigned long) current; ++ timer.function = commit_timeout; ++ journal->j_commit_timer = &timer; ++ ++ /* Record that the journal thread is running */ ++ journal->j_task = current; ++ wake_up(&journal->j_wait_done_commit); ++ ++ printk(KERN_INFO "kjournald starting. Commit interval %ld seconds\n", ++ journal->j_commit_interval / HZ); ++ list_add(&journal->j_all_journals, &all_journals); ++ ++ /* And now, wait forever for commit wakeup events. */ ++ while (1) { ++ if (journal->j_flags & JFS_UNMOUNT) ++ break; ++ ++ jbd_debug(1, "commit_sequence=%d, commit_request=%d\n", ++ journal->j_commit_sequence, journal->j_commit_request); ++ ++ if (journal->j_commit_sequence != journal->j_commit_request) { ++ jbd_debug(1, "OK, requests differ\n"); ++ if (journal->j_commit_timer_active) { ++ journal->j_commit_timer_active = 0; ++ del_timer(journal->j_commit_timer); ++ } ++ ++ journal_commit_transaction(journal); ++ continue; ++ } ++ ++ wake_up(&journal->j_wait_done_commit); ++ interruptible_sleep_on(&journal->j_wait_commit); ++ ++ jbd_debug(1, "kjournald wakes\n"); ++ ++ /* Were we woken up by a commit wakeup event? 
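++ * If so, log_start_commit() will already have bumped
++ * j_commit_request; the test below instead catches the timer case,
++ * where the running transaction's t_expires deadline has passed.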
*/
++ if ((transaction = journal->j_running_transaction) != NULL &&
++ time_after_eq(jiffies, transaction->t_expires)) {
++ journal->j_commit_request = transaction->t_tid;
++ jbd_debug(1, "woke because of timeout\n");
++ }
++ }
++
++ if (journal->j_commit_timer_active) {
++ journal->j_commit_timer_active = 0;
++ del_timer_sync(journal->j_commit_timer);
++ }
++
++ list_del(&journal->j_all_journals);
++
++ journal->j_task = NULL;
++ wake_up(&journal->j_wait_done_commit);
++ unlock_kernel();
++ jbd_debug(1, "Journal thread exiting.\n");
++ return 0;
++}
++
++static void journal_start_thread(journal_t *journal)
++{
++ kernel_thread(kjournald, (void *) journal,
++ CLONE_VM | CLONE_FS | CLONE_FILES);
++ while (!journal->j_task)
++ sleep_on(&journal->j_wait_done_commit);
++}
++
++static void journal_kill_thread(journal_t *journal)
++{
++ journal->j_flags |= JFS_UNMOUNT;
++
++ while (journal->j_task) {
++ wake_up(&journal->j_wait_commit);
++ sleep_on(&journal->j_wait_done_commit);
++ }
++}
++
++#if 0
++
++This is no longer needed - we do it in commit quite efficiently.
++Note that if this function is resurrected, the loop needs to
++be reorganised into the next_jh/last_jh algorithm.
++
++/*
++ * journal_clean_data_list: cleanup after data IO.
++ *
++ * Once the IO system has finished writing the buffers on the transaction's
++ * data list, we can remove those buffers from the list. This function
++ * scans the list for such buffers and removes them cleanly.
++ *
++ * We assume that the journal is already locked.
++ * We are called with journal_datalist_lock held.
++ *
++ * AKPM: This function looks inefficient. Approximately O(n^2)
++ * for potentially thousands of buffers. It no longer shows on profiles
++ * because these buffers are mainly dropped in journal_commit_transaction().
++ */
++
++void __journal_clean_data_list(transaction_t *transaction)
++{
++ struct journal_head *jh, *next;
++
++ assert_spin_locked(&journal_datalist_lock);
++
++restart:
++ jh = transaction->t_sync_datalist;
++ if (!jh)
++ goto out;
++ do {
++ next = jh->b_tnext;
++ if (!buffer_locked(jh2bh(jh)) && !buffer_dirty(jh2bh(jh))) {
++ struct buffer_head *bh = jh2bh(jh);
++ BUFFER_TRACE(bh, "data writeout complete: unfile");
++ __journal_unfile_buffer(jh);
++ jh->b_transaction = NULL;
++ __journal_remove_journal_head(bh);
++ refile_buffer(bh);
++ __brelse(bh);
++ goto restart;
++ }
++ jh = next;
++ } while (transaction->t_sync_datalist &&
++ jh != transaction->t_sync_datalist);
++out:
++ return;
++}
++#endif
++
++/*
++ * journal_write_metadata_buffer: write a metadata buffer to the journal.
++ *
++ * Writes a metadata buffer to a given disk block. The actual IO is not
++ * performed but a new buffer_head is constructed which labels the data
++ * to be written with the correct destination disk block.
++ *
++ * Any magic-number escaping which needs to be done will cause a
++ * copy-out here. If the buffer happens to start with the
++ * JFS_MAGIC_NUMBER, then we can't write it to the log directly: the
++ * magic number is only written to the log for descriptor blocks. In
++ * this case, we copy the data and replace the first word with 0, and we
++ * return a result code which indicates that this buffer needs to be
++ * marked as an escaped buffer in the corresponding log descriptor
++ * block. The missing word can then be restored when the block is read
++ * during recovery.
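++ *
++ * In outline, the escaping rule is: if the first word of the block
++ * equals htonl(JFS_MAGIC_NUMBER), write the block with that word
++ * zeroed and have the commit code set JFS_FLAG_ESCAPE in the
++ * corresponding descriptor tag, roughly:
++ *
++ *     if (*((unsigned int *) data) == htonl(JFS_MAGIC_NUMBER)) {
++ *             *((unsigned int *) copy) = 0;
++ *             tag->t_flags |= htonl(JFS_FLAG_ESCAPE);
++ *     }
++ *
++ * Recovery restores the zeroed word when it sees that flag set.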
++ * ++ * If the source buffer has already been modified by a new transaction ++ * since we took the last commit snapshot, we use the frozen copy of ++ * that data for IO. If we end up using the existing buffer_head's data ++ * for the write, then we *have* to lock the buffer to prevent anyone ++ * else from using and possibly modifying it while the IO is in ++ * progress. ++ * ++ * The function returns a pointer to the buffer_heads to be used for IO. ++ * ++ * We assume that the journal has already been locked in this function. ++ * ++ * Return value: ++ * <0: Error ++ * >=0: Finished OK ++ * ++ * On success: ++ * Bit 0 set == escape performed on the data ++ * Bit 1 set == buffer copy-out performed (kfree the data after IO) ++ */ ++ ++static inline unsigned long virt_to_offset(void *p) ++{return ((unsigned long) p) & ~PAGE_MASK;} ++ ++int journal_write_metadata_buffer(transaction_t *transaction, ++ struct journal_head *jh_in, ++ struct journal_head **jh_out, ++ int blocknr) ++{ ++ int need_copy_out = 0; ++ int done_copy_out = 0; ++ int do_escape = 0; ++ char *mapped_data; ++ struct buffer_head *new_bh; ++ struct journal_head * new_jh; ++ struct page *new_page; ++ unsigned int new_offset; ++ ++ /* ++ * The buffer really shouldn't be locked: only the current committing ++ * transaction is allowed to write it, so nobody else is allowed ++ * to do any IO. ++ * ++ * akpm: except if we're journalling data, and write() output is ++ * also part of a shared mapping, and another thread has ++ * decided to launch a writepage() against this buffer. ++ */ ++ J_ASSERT_JH(jh_in, buffer_jdirty(jh2bh(jh_in))); ++ ++ /* ++ * If a new transaction has already done a buffer copy-out, then ++ * we use that version of the data for the commit. ++ */ ++ ++ if (jh_in->b_frozen_data) { ++ done_copy_out = 1; ++ new_page = virt_to_page(jh_in->b_frozen_data); ++ new_offset = virt_to_offset(jh_in->b_frozen_data); ++ } else { ++ new_page = jh2bh(jh_in)->b_page; ++ new_offset = virt_to_offset(jh2bh(jh_in)->b_data); ++ } ++ ++ mapped_data = ((char *) kmap(new_page)) + new_offset; ++ ++ /* ++ * Check for escaping ++ */ ++ if (* ((unsigned int *) mapped_data) == htonl(JFS_MAGIC_NUMBER)) { ++ need_copy_out = 1; ++ do_escape = 1; ++ } ++ ++ /* ++ * Do we need to do a data copy? ++ */ ++ ++ if (need_copy_out && !done_copy_out) { ++ char *tmp; ++ tmp = jbd_rep_kmalloc(jh2bh(jh_in)->b_size, GFP_NOFS); ++ ++ jh_in->b_frozen_data = tmp; ++ memcpy (tmp, mapped_data, jh2bh(jh_in)->b_size); ++ ++ /* If we get to this path, we'll always need the new ++ address kmapped so that we can clear the escaped ++ magic number below. */ ++ kunmap(new_page); ++ new_page = virt_to_page(tmp); ++ new_offset = virt_to_offset(tmp); ++ mapped_data = ((char *) kmap(new_page)) + new_offset; ++ ++ done_copy_out = 1; ++ } ++ ++ /* ++ * Right, time to make up the new buffer_head. 
++ */
++ do {
++ new_bh = get_unused_buffer_head(0);
++ if (!new_bh) {
++ printk (KERN_NOTICE __FUNCTION__
++ ": ENOMEM at get_unused_buffer_head, "
++ "trying again.\n");
++ current->policy |= SCHED_YIELD;
++ schedule();
++ }
++ } while (!new_bh);
++ /* keep subsequent assertions sane */
++ new_bh->b_prev_free = 0;
++ new_bh->b_next_free = 0;
++ new_bh->b_state = 0;
++ init_buffer(new_bh, NULL, NULL);
++ atomic_set(&new_bh->b_count, 1);
++ new_jh = journal_add_journal_head(new_bh);
++
++ set_bh_page(new_bh, new_page, new_offset);
++
++ new_jh->b_transaction = NULL;
++ new_bh->b_size = jh2bh(jh_in)->b_size;
++ new_bh->b_dev = transaction->t_journal->j_dev;
++ new_bh->b_blocknr = blocknr;
++ new_bh->b_state |= (1 << BH_Mapped) | (1 << BH_Dirty);
++
++ *jh_out = new_jh;
++
++ /*
++ * Did we need to do an escaping? Now we've done all the
++ * copying, we can finally do so.
++ */
++
++ if (do_escape)
++ * ((unsigned int *) mapped_data) = 0;
++ kunmap(new_page);
++
++ /*
++ * The to-be-written buffer needs to get moved to the io queue,
++ * and the original buffer whose contents we are shadowing or
++ * copying is moved to the transaction's shadow queue.
++ */
++ JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
++ journal_file_buffer(jh_in, transaction, BJ_Shadow);
++ JBUFFER_TRACE(new_jh, "file as BJ_IO");
++ journal_file_buffer(new_jh, transaction, BJ_IO);
++
++ return do_escape | (done_copy_out << 1);
++}
++
++/*
++ * Allocation code for the journal file. Manage the space left in the
++ * journal, so that we can begin checkpointing when appropriate.
++ */
++
++/*
++ * log_space_left: Return the number of free blocks left in the journal.
++ *
++ * Called with the journal already locked.
++ */
++
++int log_space_left (journal_t *journal)
++{
++ int left = journal->j_free;
++
++ /* Be pessimistic here about the number of those free blocks
++ * which might be required for log descriptor control blocks. */
++
++#define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */
++
++ left -= MIN_LOG_RESERVED_BLOCKS;
++
++ if (left <= 0)
++ return 0;
++ left -= (left >> 3);
++ return left;
++}
++
++/*
++ * This function must be non-allocating for PF_MEMALLOC tasks
++ */
++tid_t log_start_commit (journal_t *journal, transaction_t *transaction)
++{
++ tid_t target = journal->j_commit_request;
++
++ lock_kernel(); /* Protect journal->j_running_transaction */
++
++ /*
++ * A NULL transaction asks us to commit the currently running
++ * transaction, if there is one.
++ */
++ if (transaction)
++ target = transaction->t_tid;
++ else {
++ transaction = journal->j_running_transaction;
++ if (!transaction)
++ goto out;
++ target = transaction->t_tid;
++ }
++
++ /*
++ * Are we already doing a recent enough commit?
++ */
++ if (tid_geq(journal->j_commit_request, target))
++ goto out;
++
++ /*
++ * We want a new commit: OK, mark the request and wake up the
++ * commit thread. We do _not_ do the commit ourselves.
++ */
++
++ journal->j_commit_request = target;
++ jbd_debug(1, "JBD: requesting commit %d/%d\n",
++ journal->j_commit_request,
++ journal->j_commit_sequence);
++ wake_up(&journal->j_wait_commit);
++
++out:
++ unlock_kernel();
++ return target;
++}
++
++/*
++ * Wait for a specified commit to complete.
++ * The caller may not hold the journal lock.
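++ *
++ * tid comparisons here use serial-number arithmetic so that they
++ * remain correct across 32-bit sequence wraparound; tid_gt() and
++ * tid_geq() (in the jbd header) are essentially:
++ *
++ *     static inline int tid_gt(tid_t x, tid_t y)
++ *     {
++ *             int difference = (int)(x - y);
++ *             return difference > 0;
++ *     }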
++ */
++void log_wait_commit (journal_t *journal, tid_t tid)
++{
++ lock_kernel();
++#ifdef CONFIG_JBD_DEBUG
++ lock_journal(journal);
++ if (!tid_geq(journal->j_commit_request, tid)) {
++ printk(KERN_EMERG __FUNCTION__
++ ": error: j_commit_request=%d, tid=%d\n",
++ journal->j_commit_request, tid);
++ }
++ unlock_journal(journal);
++#endif
++ while (tid_gt(tid, journal->j_commit_sequence)) {
++ jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n",
++ tid, journal->j_commit_sequence);
++ wake_up(&journal->j_wait_commit);
++ sleep_on(&journal->j_wait_done_commit);
++ }
++ unlock_kernel();
++}
++
++/*
++ * Log buffer allocation routines:
++ */
++
++int journal_next_log_block(journal_t *journal, unsigned long *retp)
++{
++ unsigned long blocknr;
++
++ J_ASSERT(journal->j_free > 1);
++
++ blocknr = journal->j_head;
++ journal->j_head++;
++ journal->j_free--;
++ if (journal->j_head == journal->j_last)
++ journal->j_head = journal->j_first;
++ return journal_bmap(journal, blocknr, retp);
++}
++
++/*
++ * Conversion of logical to physical block numbers for the journal
++ *
++ * On external journals the journal blocks are identity-mapped, so
++ * this is a no-op. If needed, we can use j_blk_offset - everything is
++ * ready.
++ */
++int journal_bmap(journal_t *journal, unsigned long blocknr,
++ unsigned long *retp)
++{
++ int err = 0;
++ unsigned long ret;
++
++ if (journal->j_inode) {
++ ret = bmap(journal->j_inode, blocknr);
++ if (ret)
++ *retp = ret;
++ else {
++ printk (KERN_ALERT __FUNCTION__
++ ": journal block not found "
++ "at offset %lu on %s\n",
++ blocknr, bdevname(journal->j_dev));
++ err = -EIO;
++ __journal_abort_soft(journal, err);
++ }
++ } else {
++ *retp = blocknr; /* +journal->j_blk_offset */
++ }
++ return err;
++}
++
++/*
++ * We play buffer_head aliasing tricks to write data/metadata blocks to
++ * the journal without copying their contents, but for journal
++ * descriptor blocks we do need to generate bona fide buffers.
++ *
++ * We return a jh whose bh is locked and ready to be populated.
++ */
++
++struct journal_head * journal_get_descriptor_buffer(journal_t *journal)
++{
++ struct buffer_head *bh;
++ unsigned long blocknr;
++ int err;
++
++ err = journal_next_log_block(journal, &blocknr);
++
++ if (err)
++ return NULL;
++
++ bh = getblk(journal->j_dev, blocknr, journal->j_blocksize);
++ lock_buffer(bh);
++ BUFFER_TRACE(bh, "return this buffer");
++ return journal_add_journal_head(bh);
++}
++
++/*
++ * Management for journal control blocks: functions to create and
++ * destroy journal_t structures, and to initialise and read existing
++ * journal blocks from disk. */
++
++/* First: create and setup a journal_t object in memory. We initialise
++ * very few fields yet: that has to wait until we have created the
++ * journal structures from scratch, or loaded them from disk.
*/
++
++static journal_t * journal_init_common (void)
++{
++ journal_t *journal;
++ int err;
++
++ MOD_INC_USE_COUNT;
++
++ journal = jbd_kmalloc(sizeof(*journal), GFP_KERNEL);
++ if (!journal)
++ goto fail;
++ memset(journal, 0, sizeof(*journal));
++
++ init_waitqueue_head(&journal->j_wait_transaction_locked);
++ init_waitqueue_head(&journal->j_wait_logspace);
++ init_waitqueue_head(&journal->j_wait_done_commit);
++ init_waitqueue_head(&journal->j_wait_checkpoint);
++ init_waitqueue_head(&journal->j_wait_commit);
++ init_waitqueue_head(&journal->j_wait_updates);
++ init_MUTEX(&journal->j_barrier);
++ init_MUTEX(&journal->j_checkpoint_sem);
++ init_MUTEX(&journal->j_sem);
++
++ journal->j_commit_interval = (HZ * 5);
++
++ /* The journal is marked for error until we succeed with recovery! */
++ journal->j_flags = JFS_ABORT;
++
++ /* Set up a default-sized revoke table for the new mount. */
++ err = journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
++ if (err) {
++ kfree(journal);
++ goto fail;
++ }
++ return journal;
++fail:
++ MOD_DEC_USE_COUNT;
++ return NULL;
++}
++
++/* journal_init_dev and journal_init_inode:
++ *
++ * Create a journal structure assigned some fixed set of disk blocks to
++ * the journal. We don't actually touch those disk blocks yet, but we
++ * need to set up all of the mapping information to tell the journaling
++ * system where the journal blocks are.
++ *
++ * journal_init_dev creates a journal which maps a fixed contiguous
++ * range of blocks on an arbitrary block device.
++ *
++ * journal_init_inode creates a journal which maps an on-disk inode as
++ * the journal. The inode must exist already, must support bmap() and
++ * must have all data blocks preallocated.
++ */
++
++journal_t * journal_init_dev(kdev_t dev, kdev_t fs_dev,
++ int start, int len, int blocksize)
++{
++ journal_t *journal = journal_init_common();
++ struct buffer_head *bh;
++
++ if (!journal)
++ return NULL;
++
++ journal->j_dev = dev;
++ journal->j_fs_dev = fs_dev;
++ journal->j_blk_offset = start;
++ journal->j_maxlen = len;
++ journal->j_blocksize = blocksize;
++
++ bh = getblk(journal->j_dev, start, journal->j_blocksize);
++ J_ASSERT(bh != NULL);
++ journal->j_sb_buffer = bh;
++ journal->j_superblock = (journal_superblock_t *)bh->b_data;
++
++ return journal;
++}
++
++journal_t * journal_init_inode (struct inode *inode)
++{
++ struct buffer_head *bh;
++ journal_t *journal = journal_init_common();
++ int err;
++ unsigned long blocknr;
++
++ if (!journal)
++ return NULL;
++
++ journal->j_dev = inode->i_dev;
++ journal->j_fs_dev = inode->i_dev;
++ journal->j_inode = inode;
++ jbd_debug(1,
++ "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
++ journal, bdevname(inode->i_dev), inode->i_ino,
++ (long long) inode->i_size,
++ inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize);
++
++ journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits;
++ journal->j_blocksize = inode->i_sb->s_blocksize;
++
++ err = journal_bmap(journal, 0, &blocknr);
++ /* If that failed, give up */
++ if (err) {
++ printk(KERN_ERR __FUNCTION__ ": Cannot locate journal "
++ "superblock\n");
++ kfree(journal);
++ return NULL;
++ }
++
++ bh = getblk(journal->j_dev, blocknr, journal->j_blocksize);
++ J_ASSERT(bh != NULL);
++ journal->j_sb_buffer = bh;
++ journal->j_superblock = (journal_superblock_t *)bh->b_data;
++
++ return journal;
++}
++
++/*
++ * If the journal init or create aborts, we need to mark the journal
++ * superblock as being NULL to prevent the journal destroy from writing
++
* back a bogus superblock. ++ */ ++static void journal_fail_superblock (journal_t *journal) ++{ ++ struct buffer_head *bh = journal->j_sb_buffer; ++ brelse(bh); ++ journal->j_sb_buffer = NULL; ++} ++ ++/* ++ * Given a journal_t structure, initialise the various fields for ++ * startup of a new journaling session. We use this both when creating ++ * a journal, and after recovering an old journal to reset it for ++ * subsequent use. ++ */ ++ ++static int journal_reset (journal_t *journal) ++{ ++ journal_superblock_t *sb = journal->j_superblock; ++ unsigned int first, last; ++ ++ first = ntohl(sb->s_first); ++ last = ntohl(sb->s_maxlen); ++ ++ journal->j_first = first; ++ journal->j_last = last; ++ ++ journal->j_head = first; ++ journal->j_tail = first; ++ journal->j_free = last - first; ++ ++ journal->j_tail_sequence = journal->j_transaction_sequence; ++ journal->j_commit_sequence = journal->j_transaction_sequence - 1; ++ journal->j_commit_request = journal->j_commit_sequence; ++ ++ journal->j_max_transaction_buffers = journal->j_maxlen / 4; ++ ++ /* Add the dynamic fields and write it to disk. */ ++ journal_update_superblock(journal, 1); ++ ++ lock_journal(journal); ++ journal_start_thread(journal); ++ unlock_journal(journal); ++ ++ return 0; ++} ++ ++/* ++ * Given a journal_t structure which tells us which disk blocks we can ++ * use, create a new journal superblock and initialise all of the ++ * journal fields from scratch. */ ++ ++int journal_create (journal_t *journal) ++{ ++ unsigned long blocknr; ++ struct buffer_head *bh; ++ journal_superblock_t *sb; ++ int i, err; ++ ++ if (journal->j_maxlen < JFS_MIN_JOURNAL_BLOCKS) { ++ printk (KERN_ERR "Journal length (%d blocks) too short.\n", ++ journal->j_maxlen); ++ journal_fail_superblock(journal); ++ return -EINVAL; ++ } ++ ++ if (journal->j_inode == NULL) { ++ /* ++ * We don't know what block to start at! ++ */ ++ printk(KERN_EMERG __FUNCTION__ ++ ": creation of journal on external device!\n"); ++ BUG(); ++ } ++ ++ /* Zero out the entire journal on disk. We cannot afford to ++ have any blocks on disk beginning with JFS_MAGIC_NUMBER. */ ++ jbd_debug(1, "JBD: Zeroing out journal blocks...\n"); ++ for (i = 0; i < journal->j_maxlen; i++) { ++ err = journal_bmap(journal, i, &blocknr); ++ if (err) ++ return err; ++ bh = getblk(journal->j_dev, blocknr, journal->j_blocksize); ++ wait_on_buffer(bh); ++ memset (bh->b_data, 0, journal->j_blocksize); ++ BUFFER_TRACE(bh, "marking dirty"); ++ mark_buffer_dirty(bh); ++ BUFFER_TRACE(bh, "marking uptodate"); ++ mark_buffer_uptodate(bh, 1); ++ __brelse(bh); ++ } ++ ++ sync_dev(journal->j_dev); ++ jbd_debug(1, "JBD: journal cleared.\n"); ++ ++ /* OK, fill in the initial static fields in the new superblock */ ++ sb = journal->j_superblock; ++ ++ sb->s_header.h_magic = htonl(JFS_MAGIC_NUMBER); ++ sb->s_header.h_blocktype = htonl(JFS_SUPERBLOCK_V2); ++ ++ sb->s_blocksize = htonl(journal->j_blocksize); ++ sb->s_maxlen = htonl(journal->j_maxlen); ++ sb->s_first = htonl(1); ++ ++ journal->j_transaction_sequence = 1; ++ ++ journal->j_flags &= ~JFS_ABORT; ++ journal->j_format_version = 2; ++ ++ return journal_reset(journal); ++} ++ ++/* ++ * Update a journal's dynamic superblock fields and write it to disk, ++ * optionally waiting for the IO to complete. 
++*/ ++ ++void journal_update_superblock(journal_t *journal, int wait) ++{ ++ journal_superblock_t *sb = journal->j_superblock; ++ struct buffer_head *bh = journal->j_sb_buffer; ++ ++ jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n", ++ journal->j_tail, journal->j_tail_sequence, journal->j_errno); ++ ++ sb->s_sequence = htonl(journal->j_tail_sequence); ++ sb->s_start = htonl(journal->j_tail); ++ sb->s_errno = htonl(journal->j_errno); ++ ++ BUFFER_TRACE(bh, "marking dirty"); ++ mark_buffer_dirty(bh); ++ ll_rw_block(WRITE, 1, &bh); ++ if (wait) ++ wait_on_buffer(bh); ++ ++ /* If we have just flushed the log (by marking s_start==0), then ++ * any future commit will have to be careful to update the ++ * superblock again to re-record the true start of the log. */ ++ ++ if (sb->s_start) ++ journal->j_flags &= ~JFS_FLUSHED; ++ else ++ journal->j_flags |= JFS_FLUSHED; ++} ++ ++ ++/* ++ * Read the superblock for a given journal, performing initial ++ * validation of the format. ++ */ ++ ++static int journal_get_superblock(journal_t *journal) ++{ ++ struct buffer_head *bh; ++ journal_superblock_t *sb; ++ int err = -EIO; ++ ++ bh = journal->j_sb_buffer; ++ ++ J_ASSERT(bh != NULL); ++ if (!buffer_uptodate(bh)) { ++ ll_rw_block(READ, 1, &bh); ++ wait_on_buffer(bh); ++ if (!buffer_uptodate(bh)) { ++ printk (KERN_ERR ++ "JBD: IO error reading journal superblock\n"); ++ goto out; ++ } ++ } ++ ++ sb = journal->j_superblock; ++ ++ err = -EINVAL; ++ ++ if (sb->s_header.h_magic != htonl(JFS_MAGIC_NUMBER) || ++ sb->s_blocksize != htonl(journal->j_blocksize)) { ++ printk(KERN_WARNING "JBD: no valid journal superblock found\n"); ++ goto out; ++ } ++ ++ switch(ntohl(sb->s_header.h_blocktype)) { ++ case JFS_SUPERBLOCK_V1: ++ journal->j_format_version = 1; ++ break; ++ case JFS_SUPERBLOCK_V2: ++ journal->j_format_version = 2; ++ break; ++ default: ++ printk(KERN_WARNING "JBD: unrecognised superblock format ID\n"); ++ goto out; ++ } ++ ++ if (ntohl(sb->s_maxlen) < journal->j_maxlen) ++ journal->j_maxlen = ntohl(sb->s_maxlen); ++ else if (ntohl(sb->s_maxlen) > journal->j_maxlen) { ++ printk (KERN_WARNING "JBD: journal file too short\n"); ++ goto out; ++ } ++ ++ return 0; ++ ++out: ++ journal_fail_superblock(journal); ++ return err; ++} ++ ++/* ++ * Load the on-disk journal superblock and read the key fields into the ++ * journal_t. ++ */ ++ ++static int load_superblock(journal_t *journal) ++{ ++ int err; ++ journal_superblock_t *sb; ++ ++ err = journal_get_superblock(journal); ++ if (err) ++ return err; ++ ++ sb = journal->j_superblock; ++ ++ journal->j_tail_sequence = ntohl(sb->s_sequence); ++ journal->j_tail = ntohl(sb->s_start); ++ journal->j_first = ntohl(sb->s_first); ++ journal->j_last = ntohl(sb->s_maxlen); ++ journal->j_errno = ntohl(sb->s_errno); ++ ++ return 0; ++} ++ ++ ++/* ++ * Given a journal_t structure which tells us which disk blocks contain ++ * a journal, read the journal from disk to initialise the in-memory ++ * structures. ++ */ ++ ++int journal_load(journal_t *journal) ++{ ++ int err; ++ ++ err = load_superblock(journal); ++ if (err) ++ return err; ++ ++ /* If this is a V2 superblock, then we have to check the ++ * features flags on it. 
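++ * The check mirrors ext2's feature scheme: unknown COMPAT bits
++ * are simply ignored here, while any unknown RO_COMPAT or
++ * INCOMPAT bit makes the load fail below.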
*/ ++ ++ if (journal->j_format_version >= 2) { ++ journal_superblock_t *sb = journal->j_superblock; ++ ++ if ((sb->s_feature_ro_compat & ++ ~cpu_to_be32(JFS_KNOWN_ROCOMPAT_FEATURES)) || ++ (sb->s_feature_incompat & ++ ~cpu_to_be32(JFS_KNOWN_INCOMPAT_FEATURES))) { ++ printk (KERN_WARNING ++ "JBD: Unrecognised features on journal\n"); ++ return -EINVAL; ++ } ++ } ++ ++ /* Let the recovery code check whether it needs to recover any ++ * data from the journal. */ ++ if (journal_recover(journal)) ++ goto recovery_error; ++ ++ /* OK, we've finished with the dynamic journal bits: ++ * reinitialise the dynamic contents of the superblock in memory ++ * and reset them on disk. */ ++ if (journal_reset(journal)) ++ goto recovery_error; ++ ++ journal->j_flags &= ~JFS_ABORT; ++ journal->j_flags |= JFS_LOADED; ++ return 0; ++ ++recovery_error: ++ printk (KERN_WARNING "JBD: recovery failed\n"); ++ return -EIO; ++} ++ ++/* ++ * Release a journal_t structure once it is no longer in use by the ++ * journaled object. ++ */ ++ ++void journal_destroy (journal_t *journal) ++{ ++ /* Wait for the commit thread to wake up and die. */ ++ journal_kill_thread(journal); ++ ++ /* Force a final log commit */ ++ if (journal->j_running_transaction) ++ journal_commit_transaction(journal); ++ ++ /* Force any old transactions to disk */ ++ lock_journal(journal); ++ while (journal->j_checkpoint_transactions != NULL) ++ log_do_checkpoint(journal, 1); ++ ++ J_ASSERT(journal->j_running_transaction == NULL); ++ J_ASSERT(journal->j_committing_transaction == NULL); ++ J_ASSERT(journal->j_checkpoint_transactions == NULL); ++ ++ /* We can now mark the journal as empty. */ ++ journal->j_tail = 0; ++ journal->j_tail_sequence = ++journal->j_transaction_sequence; ++ if (journal->j_sb_buffer) { ++ journal_update_superblock(journal, 1); ++ brelse(journal->j_sb_buffer); ++ } ++ ++ if (journal->j_inode) ++ iput(journal->j_inode); ++ if (journal->j_revoke) ++ journal_destroy_revoke(journal); ++ ++ unlock_journal(journal); ++ kfree(journal); ++ MOD_DEC_USE_COUNT; ++} ++ ++ ++/* Published API: Check whether the journal uses all of a given set of ++ * features. Return true (non-zero) if it does. */ ++ ++int journal_check_used_features (journal_t *journal, unsigned long compat, ++ unsigned long ro, unsigned long incompat) ++{ ++ journal_superblock_t *sb; ++ ++ if (!compat && !ro && !incompat) ++ return 1; ++ if (journal->j_format_version == 1) ++ return 0; ++ ++ sb = journal->j_superblock; ++ ++ if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) && ++ ((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) && ++ ((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat)) ++ return 1; ++ ++ return 0; ++} ++ ++/* Published API: Check whether the journaling code supports the use of ++ * all of a given set of features on this journal. Return true ++ * (non-zero) if it can. */ ++ ++int journal_check_available_features (journal_t *journal, unsigned long compat, ++ unsigned long ro, unsigned long incompat) ++{ ++ journal_superblock_t *sb; ++ ++ if (!compat && !ro && !incompat) ++ return 1; ++ ++ sb = journal->j_superblock; ++ ++ /* We can support any known requested features iff the ++ * superblock is in version 2. Otherwise we fail to support any ++ * extended sb features. 
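++ *
++ * A typical caller checks availability before setting, roughly:
++ *
++ *	if (journal_check_available_features(journal, 0, 0, incompat))
++ *		journal_set_features(journal, 0, 0, incompat);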
*/ ++ ++ if (journal->j_format_version != 2) ++ return 0; ++ ++ if ((compat & JFS_KNOWN_COMPAT_FEATURES) == compat && ++ (ro & JFS_KNOWN_ROCOMPAT_FEATURES) == ro && ++ (incompat & JFS_KNOWN_INCOMPAT_FEATURES) == incompat) ++ return 1; ++ ++ return 0; ++} ++ ++/* Published API: Mark a given journal feature as present on the ++ * superblock. Returns true if the requested features could be set. */ ++ ++int journal_set_features (journal_t *journal, unsigned long compat, ++ unsigned long ro, unsigned long incompat) ++{ ++ journal_superblock_t *sb; ++ ++ if (journal_check_used_features(journal, compat, ro, incompat)) ++ return 1; ++ ++ if (!journal_check_available_features(journal, compat, ro, incompat)) ++ return 0; ++ ++ jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n", ++ compat, ro, incompat); ++ ++ sb = journal->j_superblock; ++ ++ sb->s_feature_compat |= cpu_to_be32(compat); ++ sb->s_feature_ro_compat |= cpu_to_be32(ro); ++ sb->s_feature_incompat |= cpu_to_be32(incompat); ++ ++ return 1; ++} ++ ++ ++/* ++ * Published API: ++ * Given an initialised but unloaded journal struct, poke about in the ++ * on-disk structure to update it to the most recent supported version. ++ */ ++ ++int journal_update_format (journal_t *journal) ++{ ++ journal_superblock_t *sb; ++ int err; ++ ++ err = journal_get_superblock(journal); ++ if (err) ++ return err; ++ ++ sb = journal->j_superblock; ++ ++ switch (ntohl(sb->s_header.h_blocktype)) { ++ case JFS_SUPERBLOCK_V2: ++ return 0; ++ case JFS_SUPERBLOCK_V1: ++ return journal_convert_superblock_v1(journal, sb); ++ default: ++ break; ++ } ++ return -EINVAL; ++} ++ ++static int journal_convert_superblock_v1(journal_t *journal, ++ journal_superblock_t *sb) ++{ ++ int offset, blocksize; ++ struct buffer_head *bh; ++ ++ printk(KERN_WARNING ++ "JBD: Converting superblock from version 1 to 2.\n"); ++ ++ /* Pre-initialise new fields to zero */ ++ offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb); ++ blocksize = ntohl(sb->s_blocksize); ++ memset(&sb->s_feature_compat, 0, blocksize-offset); ++ ++ sb->s_nr_users = cpu_to_be32(1); ++ sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2); ++ journal->j_format_version = 2; ++ ++ bh = journal->j_sb_buffer; ++ BUFFER_TRACE(bh, "marking dirty"); ++ mark_buffer_dirty(bh); ++ ll_rw_block(WRITE, 1, &bh); ++ wait_on_buffer(bh); ++ return 0; ++} ++ ++ ++/* ++ * Flush all data for a given journal to disk and empty the journal. ++ * Filesystems can use this when remounting readonly to ensure that ++ * recovery does not need to happen on remount. ++ */ ++ ++int journal_flush (journal_t *journal) ++{ ++ int err = 0; ++ transaction_t *transaction = NULL; ++ unsigned long old_tail; ++ ++ lock_kernel(); ++ ++ /* Force everything buffered to the log... */ ++ if (journal->j_running_transaction) { ++ transaction = journal->j_running_transaction; ++ log_start_commit(journal, transaction); ++ } else if (journal->j_committing_transaction) ++ transaction = journal->j_committing_transaction; ++ ++ /* Wait for the log commit to complete... */ ++ if (transaction) ++ log_wait_commit(journal, transaction->t_tid); ++ ++ /* ...and flush everything in the log out to disk. */ ++ lock_journal(journal); ++ while (!err && journal->j_checkpoint_transactions != NULL) ++ err = log_do_checkpoint(journal, journal->j_maxlen); ++ cleanup_journal_tail(journal); ++ ++ /* Finally, mark the journal as really needing no recovery. ++ * This sets s_start==0 in the underlying superblock, which is ++ * the magic code for a fully-recovered superblock. 
Any future
++ * commits of data to the journal will restore the current
++ * s_start value. */
++ old_tail = journal->j_tail;
++ journal->j_tail = 0;
++ journal_update_superblock(journal, 1);
++ journal->j_tail = old_tail;
++
++ unlock_journal(journal);
++
++ J_ASSERT(!journal->j_running_transaction);
++ J_ASSERT(!journal->j_committing_transaction);
++ J_ASSERT(!journal->j_checkpoint_transactions);
++ J_ASSERT(journal->j_head == journal->j_tail);
++ J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
++
++ unlock_kernel();
++
++ return err;
++}
++
++/*
++ * Wipe out all of the contents of a journal, safely. This will produce
++ * a warning if the journal contains any valid recovery information.
++ * Must be called between journal_init_*() and journal_load().
++ *
++ * If (write) is non-zero, then we wipe out the journal on disk; otherwise
++ * we merely suppress recovery.
++ */
++
++int journal_wipe (journal_t *journal, int write)
++{
++ journal_superblock_t *sb;
++ int err = 0;
++
++ J_ASSERT (!(journal->j_flags & JFS_LOADED));
++
++ err = load_superblock(journal);
++ if (err)
++ return err;
++
++ sb = journal->j_superblock;
++
++ if (!journal->j_tail)
++ goto no_recovery;
++
++ printk (KERN_WARNING "JBD: %s recovery information on journal\n",
++ write ? "Clearing" : "Ignoring");
++
++ err = journal_skip_recovery(journal);
++ if (write)
++ journal_update_superblock(journal, 1);
++
++ no_recovery:
++ return err;
++}
++
++/*
++ * journal_dev_name: format a character string to describe on what
++ * device this journal is present.
++ */
++
++const char * journal_dev_name(journal_t *journal)
++{
++ kdev_t dev;
++
++ if (journal->j_inode)
++ dev = journal->j_inode->i_dev;
++ else
++ dev = journal->j_dev;
++
++ return bdevname(dev);
++}
++
++/*
++ * journal_abort: perform a complete, immediate shutdown of the ENTIRE
++ * journal (not of a single transaction). This operation cannot be
++ * undone without closing and reopening the journal.
++ *
++ * The journal_abort function is intended to support higher level error
++ * recovery mechanisms such as the ext2/ext3 remount-readonly error
++ * mode.
++ *
++ * Journal abort has very specific semantics. Any existing dirty,
++ * unjournaled buffers in the main filesystem will still be written to
++ * disk by bdflush, but the journaling mechanism will be suspended
++ * immediately and no further transaction commits will be honoured.
++ *
++ * Any dirty, journaled buffers will be written back to disk without
++ * hitting the journal. Atomicity cannot be guaranteed on an aborted
++ * filesystem, but we _do_ attempt to leave as much data as possible
++ * behind for fsck to use for cleanup.
++ *
++ * Any attempt to get a new transaction handle on a journal which is in
++ * ABORT state will just result in an -EROFS error return. A
++ * journal_stop on an existing handle will return -EIO if we have
++ * entered abort state during the update.
++ *
++ * Recursive transactions are not disturbed by journal abort until the
++ * final journal_stop, which will receive the -EIO error.
++ *
++ * Finally, the journal_abort call allows the caller to supply an errno
++ * which will be recorded (if possible) in the journal superblock. This
++ * allows a client to record failure conditions in the middle of a
++ * transaction without having to complete the transaction to record the
++ * failure to disk. ext3_error, for example, now uses this
++ * functionality.
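++ *
++ * A filesystem error path would therefore look something like:
++ *
++ *	if (fatal_error)
++ *		journal_abort(journal, -EIO);
++ *
++ * after which new handles fail with -EROFS and the errno is left
++ * recorded in the superblock for the next mount to find.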
++ *
++ * Errors which originate from within the journaling layer will NOT
++ * supply an errno; a null errno implies that absolutely no further
++ * writes are done to the journal (unless there are any already in
++ * progress).
++ */
++
++/* Quick version for internal journal use (doesn't lock the journal).
++ * Aborts hard --- we mark the abort as occurred, but do _nothing_ else,
++ * and don't attempt to make any other journal updates. */
++void __journal_abort_hard (journal_t *journal)
++{
++ transaction_t *transaction;
++
++ if (journal->j_flags & JFS_ABORT)
++ return;
++
++ printk (KERN_ERR "Aborting journal on device %s.\n",
++ journal_dev_name(journal));
++
++ journal->j_flags |= JFS_ABORT;
++ transaction = journal->j_running_transaction;
++ if (transaction)
++ log_start_commit(journal, transaction);
++}
++
++/* Soft abort: record the abort error status in the journal superblock,
++ * but don't do any other IO. */
++void __journal_abort_soft (journal_t *journal, int errno)
++{
++ if (journal->j_flags & JFS_ABORT)
++ return;
++
++ if (!journal->j_errno)
++ journal->j_errno = errno;
++
++ __journal_abort_hard(journal);
++
++ if (errno)
++ journal_update_superblock(journal, 1);
++}
++
++/* Full version for external use */
++void journal_abort (journal_t *journal, int errno)
++{
++ lock_journal(journal);
++ __journal_abort_soft(journal, errno);
++ unlock_journal(journal);
++}
++
++int journal_errno (journal_t *journal)
++{
++ int err;
++
++ lock_journal(journal);
++ if (journal->j_flags & JFS_ABORT)
++ err = -EROFS;
++ else
++ err = journal->j_errno;
++ unlock_journal(journal);
++ return err;
++}
++
++int journal_clear_err (journal_t *journal)
++{
++ int err = 0;
++
++ lock_journal(journal);
++ if (journal->j_flags & JFS_ABORT)
++ err = -EROFS;
++ else
++ journal->j_errno = 0;
++ unlock_journal(journal);
++ return err;
++}
++
++void journal_ack_err (journal_t *journal)
++{
++ lock_journal(journal);
++ if (journal->j_errno)
++ journal->j_flags |= JFS_ACK_ERR;
++ unlock_journal(journal);
++}
++
++int journal_blocks_per_page(struct inode *inode)
++{
++ return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
++}
++
++/*
++ * shrink_journal_memory().
++ * Called when we're under memory pressure. Free up all the written-back
++ * checkpointed metadata buffers.
++ */
++void shrink_journal_memory(void)
++{
++ struct list_head *list;
++
++ lock_kernel();
++ list_for_each(list, &all_journals) {
++ journal_t *journal =
++ list_entry(list, journal_t, j_all_journals);
++ spin_lock(&journal_datalist_lock);
++ __journal_clean_checkpoint_list(journal);
++ spin_unlock(&journal_datalist_lock);
++ }
++ unlock_kernel();
++}
++
++/*
++ * Simple support for retrying memory allocations. Introduced to help to
++ * debug different VM deadlock avoidance strategies.
++ */
++void * __jbd_kmalloc (char *where, size_t size, int flags, int retry)
++{
++ void *p;
++ static unsigned long last_warning;
++
++ while (1) {
++ p = kmalloc(size, flags);
++ if (p)
++ return p;
++ if (!retry)
++ return NULL;
++ /* Log every retry for debugging. Also log them to the
++ * syslog, but do rate-limiting on the non-debugging
++ * messages.
*/ ++ jbd_debug(1, "ENOMEM in %s, retrying.\n", where); ++ ++ if (time_after(jiffies, last_warning + 5*HZ)) { ++ printk(KERN_NOTICE ++ "ENOMEM in %s, retrying.\n", where); ++ last_warning = jiffies; ++ } ++ ++ current->policy |= SCHED_YIELD; ++ schedule(); ++ } ++} ++ ++/* ++ * Journal_head storage management ++ */ ++static kmem_cache_t *journal_head_cache; ++#ifdef CONFIG_JBD_DEBUG ++static atomic_t nr_journal_heads = ATOMIC_INIT(0); ++#endif ++ ++static int journal_init_journal_head_cache(void) ++{ ++ int retval; ++ ++ J_ASSERT(journal_head_cache == 0); ++ journal_head_cache = kmem_cache_create("journal_head", ++ sizeof(struct journal_head), ++ 0, /* offset */ ++ 0, /* flags */ ++ NULL, /* ctor */ ++ NULL); /* dtor */ ++ retval = 0; ++ if (journal_head_cache == 0) { ++ retval = -ENOMEM; ++ printk(KERN_EMERG "JBD: no memory for journal_head cache\n"); ++ } ++ return retval; ++} ++ ++static void journal_destroy_journal_head_cache(void) ++{ ++ J_ASSERT(journal_head_cache != NULL); ++ kmem_cache_destroy(journal_head_cache); ++ journal_head_cache = 0; ++} ++ ++/* ++ * journal_head splicing and dicing ++ */ ++static struct journal_head *journal_alloc_journal_head(void) ++{ ++ struct journal_head *ret; ++ static unsigned long last_warning; ++ ++#ifdef CONFIG_JBD_DEBUG ++ atomic_inc(&nr_journal_heads); ++#endif ++ ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS); ++ if (ret == 0) { ++ jbd_debug(1, "out of memory for journal_head\n"); ++ if (time_after(jiffies, last_warning + 5*HZ)) { ++ printk(KERN_NOTICE "ENOMEM in " __FUNCTION__ ++ ", retrying.\n"); ++ last_warning = jiffies; ++ } ++ while (ret == 0) { ++ current->policy |= SCHED_YIELD; ++ schedule(); ++ ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS); ++ } ++ } ++ return ret; ++} ++ ++static void journal_free_journal_head(struct journal_head *jh) ++{ ++#ifdef CONFIG_JBD_DEBUG ++ atomic_dec(&nr_journal_heads); ++ memset(jh, 0x5b, sizeof(*jh)); ++#endif ++ kmem_cache_free(journal_head_cache, jh); ++} ++ ++/* ++ * A journal_head is attached to a buffer_head whenever JBD has an ++ * interest in the buffer. ++ * ++ * Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit ++ * is set. This bit is tested in core kernel code where we need to take ++ * JBD-specific actions. Testing the zeroness of ->b_private is not reliable ++ * there. ++ * ++ * When a buffer has its BH_JBD bit set, its ->b_count is elevated by one. ++ * ++ * When a buffer has its BH_JBD bit set it is immune from being released by ++ * core kernel code, mainly via ->b_count. ++ * ++ * A journal_head may be detached from its buffer_head when the journal_head's ++ * b_transaction, b_cp_transaction and b_next_transaction pointers are NULL. ++ * Various places in JBD call journal_remove_journal_head() to indicate that the ++ * journal_head can be dropped if needed. ++ * ++ * Various places in the kernel want to attach a journal_head to a buffer_head ++ * _before_ attaching the journal_head to a transaction. To protect the ++ * journal_head in this situation, journal_add_journal_head elevates the ++ * journal_head's b_jcount refcount by one. The caller must call ++ * journal_unlock_journal_head() to undo this. ++ * ++ * So the typical usage would be: ++ * ++ * (Attach a journal_head if needed. Increments b_jcount) ++ * struct journal_head *jh = journal_add_journal_head(bh); ++ * ... 
++ * jh->b_transaction = xxx; ++ * journal_unlock_journal_head(jh); ++ * ++ * Now, the journal_head's b_jcount is zero, but it is safe from being released ++ * because it has a non-zero b_transaction. ++ */ ++ ++/* ++ * Give a buffer_head a journal_head. ++ * ++ * Doesn't need the journal lock. ++ * May sleep. ++ * Cannot be called with journal_datalist_lock held. ++ */ ++struct journal_head *journal_add_journal_head(struct buffer_head *bh) ++{ ++ struct journal_head *jh; ++ ++ spin_lock(&journal_datalist_lock); ++ if (buffer_jbd(bh)) { ++ jh = bh2jh(bh); ++ } else { ++ J_ASSERT_BH(bh, ++ (atomic_read(&bh->b_count) > 0) || ++ (bh->b_page && bh->b_page->mapping)); ++ spin_unlock(&journal_datalist_lock); ++ jh = journal_alloc_journal_head(); ++ memset(jh, 0, sizeof(*jh)); ++ spin_lock(&journal_datalist_lock); ++ ++ if (buffer_jbd(bh)) { ++ /* Someone did it for us! */ ++ J_ASSERT_BH(bh, bh->b_private != NULL); ++ journal_free_journal_head(jh); ++ jh = bh->b_private; ++ } else { ++ /* ++ * We actually don't need jh_splice_lock when ++ * adding a journal_head - only on removal. ++ */ ++ spin_lock(&jh_splice_lock); ++ set_bit(BH_JBD, &bh->b_state); ++ bh->b_private = jh; ++ jh->b_bh = bh; ++ atomic_inc(&bh->b_count); ++ spin_unlock(&jh_splice_lock); ++ BUFFER_TRACE(bh, "added journal_head"); ++ } ++ } ++ jh->b_jcount++; ++ spin_unlock(&journal_datalist_lock); ++ return bh->b_private; ++} ++ ++/* ++ * journal_remove_journal_head(): if the buffer isn't attached to a transaction ++ * and has a zero b_jcount then remove and release its journal_head. If we did ++ * see that the buffer is not used by any transaction we also "logically" ++ * decrement ->b_count. ++ * ++ * We in fact take an additional increment on ->b_count as a convenience, ++ * because the caller usually wants to do additional things with the bh ++ * after calling here. ++ * The caller of journal_remove_journal_head() *must* run __brelse(bh) at some ++ * time. Once the caller has run __brelse(), the buffer is eligible for ++ * reaping by try_to_free_buffers(). ++ * ++ * Requires journal_datalist_lock. 
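++ *
++ * so the usual calling pattern is:
++ *
++ *	spin_lock(&journal_datalist_lock);
++ *	__journal_remove_journal_head(bh);
++ *	spin_unlock(&journal_datalist_lock);
++ *	__brelse(bh);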
++ */ ++void __journal_remove_journal_head(struct buffer_head *bh) ++{ ++ struct journal_head *jh = bh2jh(bh); ++ ++ assert_spin_locked(&journal_datalist_lock); ++ J_ASSERT_JH(jh, jh->b_jcount >= 0); ++ atomic_inc(&bh->b_count); ++ if (jh->b_jcount == 0) { ++ if (jh->b_transaction == NULL && ++ jh->b_next_transaction == NULL && ++ jh->b_cp_transaction == NULL) { ++ J_ASSERT_BH(bh, buffer_jbd(bh)); ++ J_ASSERT_BH(bh, jh2bh(jh) == bh); ++ BUFFER_TRACE(bh, "remove journal_head"); ++ spin_lock(&jh_splice_lock); ++ bh->b_private = NULL; ++ jh->b_bh = NULL; /* debug, really */ ++ clear_bit(BH_JBD, &bh->b_state); ++ __brelse(bh); ++ spin_unlock(&jh_splice_lock); ++ journal_free_journal_head(jh); ++ } else { ++ BUFFER_TRACE(bh, "journal_head was locked"); ++ } ++ } ++} ++ ++void journal_unlock_journal_head(struct journal_head *jh) ++{ ++ spin_lock(&journal_datalist_lock); ++ J_ASSERT_JH(jh, jh->b_jcount > 0); ++ --jh->b_jcount; ++ if (!jh->b_jcount && !jh->b_transaction) { ++ struct buffer_head *bh; ++ bh = jh2bh(jh); ++ __journal_remove_journal_head(bh); ++ __brelse(bh); ++ } ++ ++ spin_unlock(&journal_datalist_lock); ++} ++ ++void journal_remove_journal_head(struct buffer_head *bh) ++{ ++ spin_lock(&journal_datalist_lock); ++ __journal_remove_journal_head(bh); ++ spin_unlock(&journal_datalist_lock); ++} ++ ++/* ++ * /proc tunables ++ */ ++#if defined(CONFIG_JBD_DEBUG) ++int journal_enable_debug; ++EXPORT_SYMBOL(journal_enable_debug); ++#endif ++ ++#if defined(CONFIG_JBD_DEBUG) && defined(CONFIG_PROC_FS) ++ ++static struct proc_dir_entry *proc_jbd_debug; ++ ++int read_jbd_debug(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ int ret; ++ ++ ret = sprintf(page + off, "%d\n", journal_enable_debug); ++ *eof = 1; ++ return ret; ++} ++ ++int write_jbd_debug(struct file *file, const char *buffer, ++ unsigned long count, void *data) ++{ ++ char buf[32]; ++ ++ if (count > ARRAY_SIZE(buf) - 1) ++ count = ARRAY_SIZE(buf) - 1; ++ if (copy_from_user(buf, buffer, count)) ++ return -EFAULT; ++ buf[ARRAY_SIZE(buf) - 1] = '\0'; ++ journal_enable_debug = simple_strtoul(buf, NULL, 10); ++ return count; ++} ++ ++#define JBD_PROC_NAME "sys/fs/jbd-debug" ++ ++static void __init create_jbd_proc_entry(void) ++{ ++ proc_jbd_debug = create_proc_entry(JBD_PROC_NAME, 0644, NULL); ++ if (proc_jbd_debug) { ++ /* Why is this so hard? 
*/ ++ proc_jbd_debug->read_proc = read_jbd_debug; ++ proc_jbd_debug->write_proc = write_jbd_debug; ++ } ++} ++ ++static void __exit remove_jbd_proc_entry(void) ++{ ++ if (proc_jbd_debug) ++ remove_proc_entry(JBD_PROC_NAME, NULL); ++} ++ ++#else ++ ++#define create_jbd_proc_entry() do {} while (0) ++#define remove_jbd_proc_entry() do {} while (0) ++ ++#endif ++ ++/* ++ * Module startup and shutdown ++ */ ++ ++static int __init journal_init_caches(void) ++{ ++ int ret; ++ ++ ret = journal_init_revoke_caches(); ++ if (ret == 0) ++ ret = journal_init_journal_head_cache(); ++ return ret; ++} ++ ++static void journal_destroy_caches(void) ++{ ++ journal_destroy_revoke_caches(); ++ journal_destroy_journal_head_cache(); ++} ++ ++static int __init journal_init(void) ++{ ++ int ret; ++ ++ printk(KERN_INFO "Journalled Block Device driver loaded\n"); ++ ret = journal_init_caches(); ++ if (ret != 0) ++ journal_destroy_caches(); ++ create_jbd_proc_entry(); ++ return ret; ++} ++ ++static void __exit journal_exit(void) ++{ ++#ifdef CONFIG_JBD_DEBUG ++ int n = atomic_read(&nr_journal_heads); ++ if (n) ++ printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n); ++#endif ++ remove_jbd_proc_entry(); ++ journal_destroy_caches(); ++} ++ ++MODULE_LICENSE("GPL"); ++module_init(journal_init); ++module_exit(journal_exit); ++ +diff -ruP linux.mcp2/fs/jbd/recovery.c linuxppc_2.4.19_final/fs/jbd/recovery.c +--- linux.mcp2/fs/jbd/recovery.c 1969-12-31 16:00:00.000000000 -0800 ++++ linuxppc_2.4.19_final/fs/jbd/recovery.c 2004-05-17 13:56:17.000000000 -0700 +@@ -0,0 +1,589 @@ ++/* ++ * linux/fs/recovery.c ++ * ++ * Written by Stephen C. Tweedie , 1999 ++ * ++ * Copyright 1999-2000 Red Hat Software --- All Rights Reserved ++ * ++ * This file is part of the Linux kernel and is made available under ++ * the terms of the GNU General Public License, version 2, or at your ++ * option, any later version, incorporated herein by reference. ++ * ++ * Journal recovery routines for the generic filesystem journaling code; ++ * part of the ext2fs journaling system. ++ */ ++ ++#ifndef __KERNEL__ ++#include "jfs_user.h" ++#else ++#include ++#include ++#include ++#include ++#include ++#include ++#endif ++ ++/* ++ * Maintain information about the progress of the recovery job, so that ++ * the different passes can carry information between them. ++ */ ++struct recovery_info ++{ ++ tid_t start_transaction; ++ tid_t end_transaction; ++ ++ int nr_replays; ++ int nr_revokes; ++ int nr_revoke_hits; ++}; ++ ++enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY}; ++static int do_one_pass(journal_t *journal, ++ struct recovery_info *info, enum passtype pass); ++static int scan_revoke_records(journal_t *, struct buffer_head *, ++ tid_t, struct recovery_info *); ++ ++#ifdef __KERNEL__ ++ ++/* Release readahead buffers after use */ ++void journal_brelse_array(struct buffer_head *b[], int n) ++{ ++ while (--n >= 0) ++ brelse (b[n]); ++} ++ ++ ++/* ++ * When reading from the journal, we are going through the block device ++ * layer directly and so there is no readahead being done for us. We ++ * need to implement any readahead ourselves if we want it to happen at ++ * all. Recovery is basically one long sequential read, so make sure we ++ * do the IO in reasonably large chunks. ++ * ++ * This is not so critical that we need to be enormously clever about ++ * the readahead size, though. 128K is a purely arbitrary, good-enough ++ * fixed value. 
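++ *
++ * With 4K journal blocks, for example, that is 128*1024/4096 = 32
++ * blocks per readahead window, submitted to ll_rw_block() in batches
++ * of MAXBUF (8) buffer_heads.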
++ */ ++ ++#define MAXBUF 8 ++static int do_readahead(journal_t *journal, unsigned int start) ++{ ++ int err; ++ unsigned int max, nbufs, next; ++ unsigned long blocknr; ++ struct buffer_head *bh; ++ ++ struct buffer_head * bufs[MAXBUF]; ++ ++ /* Do up to 128K of readahead */ ++ max = start + (128 * 1024 / journal->j_blocksize); ++ if (max > journal->j_maxlen) ++ max = journal->j_maxlen; ++ ++ /* Do the readahead itself. We'll submit MAXBUF buffer_heads at ++ * a time to the block device IO layer. */ ++ ++ nbufs = 0; ++ ++ for (next = start; next < max; next++) { ++ err = journal_bmap(journal, next, &blocknr); ++ ++ if (err) { ++ printk (KERN_ERR "JBD: bad block at offset %u\n", ++ next); ++ goto failed; ++ } ++ ++ bh = getblk(journal->j_dev, blocknr, journal->j_blocksize); ++ if (!bh) { ++ err = -ENOMEM; ++ goto failed; ++ } ++ ++ if (!buffer_uptodate(bh) && !buffer_locked(bh)) { ++ bufs[nbufs++] = bh; ++ if (nbufs == MAXBUF) { ++ ll_rw_block(READ, nbufs, bufs); ++ journal_brelse_array(bufs, nbufs); ++ nbufs = 0; ++ } ++ } else ++ brelse(bh); ++ } ++ ++ if (nbufs) ++ ll_rw_block(READ, nbufs, bufs); ++ err = 0; ++ ++failed: ++ if (nbufs) ++ journal_brelse_array(bufs, nbufs); ++ return err; ++} ++ ++#endif /* __KERNEL__ */ ++ ++ ++/* ++ * Read a block from the journal ++ */ ++ ++static int jread(struct buffer_head **bhp, journal_t *journal, ++ unsigned int offset) ++{ ++ int err; ++ unsigned long blocknr; ++ struct buffer_head *bh; ++ ++ *bhp = NULL; ++ ++ J_ASSERT (offset < journal->j_maxlen); ++ ++ err = journal_bmap(journal, offset, &blocknr); ++ ++ if (err) { ++ printk (KERN_ERR "JBD: bad block at offset %u\n", ++ offset); ++ return err; ++ } ++ ++ bh = getblk(journal->j_dev, blocknr, journal->j_blocksize); ++ if (!bh) ++ return -ENOMEM; ++ ++ if (!buffer_uptodate(bh)) { ++ /* If this is a brand new buffer, start readahead. ++ Otherwise, we assume we are already reading it. */ ++ if (!buffer_req(bh)) ++ do_readahead(journal, offset); ++ wait_on_buffer(bh); ++ } ++ ++ if (!buffer_uptodate(bh)) { ++ printk (KERN_ERR "JBD: Failed to read block at offset %u\n", ++ offset); ++ brelse(bh); ++ return -EIO; ++ } ++ ++ *bhp = bh; ++ return 0; ++} ++ ++ ++/* ++ * Count the number of in-use tags in a journal descriptor block. ++ */ ++ ++static int count_tags(struct buffer_head *bh, int size) ++{ ++ char * tagp; ++ journal_block_tag_t * tag; ++ int nr = 0; ++ ++ tagp = &bh->b_data[sizeof(journal_header_t)]; ++ ++ while ((tagp - bh->b_data + sizeof(journal_block_tag_t)) <= size) { ++ tag = (journal_block_tag_t *) tagp; ++ ++ nr++; ++ tagp += sizeof(journal_block_tag_t); ++ if (!(tag->t_flags & htonl(JFS_FLAG_SAME_UUID))) ++ tagp += 16; ++ ++ if (tag->t_flags & htonl(JFS_FLAG_LAST_TAG)) ++ break; ++ } ++ ++ return nr; ++} ++ ++ ++/* Make sure we wrap around the log correctly! */ ++#define wrap(journal, var) \ ++do { \ ++ if (var >= (journal)->j_last) \ ++ var -= ((journal)->j_last - (journal)->j_first); \ ++} while (0) ++ ++/* ++ * journal_recover ++ * ++ * The primary function for recovering the log contents when mounting a ++ * journaled device. ++ * ++ * Recovery is done in three passes. In the first pass, we look for the ++ * end of the log. In the second, we assemble the list of revoke ++ * blocks. In the third and final pass, we replay any un-revoked blocks ++ * in the log. 
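++ *
++ * Log offsets are circular: whenever the walk runs past j_last, the
++ * wrap() macro above pulls it back by (j_last - j_first), so with
++ * j_first == 1 and j_last == 1024, block 1024 wraps back to block 1.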
++ */
++
++int journal_recover(journal_t *journal)
++{
++	int err;
++	journal_superblock_t * sb;
++
++	struct recovery_info info;
++
++	memset(&info, 0, sizeof(info));
++	sb = journal->j_superblock;
++
++	/*
++	 * The journal superblock's s_start field (the current log head)
++	 * is always zero if, and only if, the journal was cleanly
++	 * unmounted.
++	 */
++
++	if (!sb->s_start) {
++		jbd_debug(1, "No recovery required, last transaction %d\n",
++			  ntohl(sb->s_sequence));
++		journal->j_transaction_sequence = ntohl(sb->s_sequence) + 1;
++		return 0;
++	}
++
++
++	err = do_one_pass(journal, &info, PASS_SCAN);
++	if (!err)
++		err = do_one_pass(journal, &info, PASS_REVOKE);
++	if (!err)
++		err = do_one_pass(journal, &info, PASS_REPLAY);
++
++	jbd_debug(0, "JBD: recovery, exit status %d, "
++		  "recovered transactions %u to %u\n",
++		  err, info.start_transaction, info.end_transaction);
++	jbd_debug(0, "JBD: Replayed %d and revoked %d/%d blocks\n",
++		  info.nr_replays, info.nr_revoke_hits, info.nr_revokes);
++
++	/* Restart the log at the next transaction ID, thus invalidating
++	 * any existing commit records in the log. */
++	journal->j_transaction_sequence = ++info.end_transaction;
++
++	journal_clear_revoke(journal);
++	fsync_no_super(journal->j_fs_dev);
++	return err;
++}
++
++/*
++ * journal_skip_recovery
++ *
++ * Locate any valid recovery information from the journal and set up the
++ * journal structures in memory to ignore it (presumably because the
++ * caller has evidence that it is out of date).
++ *
++ * We perform one pass over the journal to allow us to tell the user how
++ * much recovery information is being erased, and to let us initialise
++ * the journal transaction sequence numbers to the next unused ID.
++ */
++
++int journal_skip_recovery(journal_t *journal)
++{
++	int err;
++	journal_superblock_t * sb;
++
++	struct recovery_info info;
++
++	memset (&info, 0, sizeof(info));
++	sb = journal->j_superblock;
++
++	err = do_one_pass(journal, &info, PASS_SCAN);
++
++	if (err) {
++		printk(KERN_ERR "JBD: error %d scanning journal\n", err);
++		++journal->j_transaction_sequence;
++	} else {
++#ifdef CONFIG_JBD_DEBUG
++		int dropped = info.end_transaction - ntohl(sb->s_sequence);
++#endif
++
++		jbd_debug(0,
++			  "JBD: ignoring %d transaction%s from the journal.\n",
++			  dropped, (dropped == 1) ? "" : "s");
++		journal->j_transaction_sequence = ++info.end_transaction;
++	}
++
++	journal->j_tail = 0;
++
++	return err;
++}
++
++static int do_one_pass(journal_t *journal,
++			struct recovery_info *info, enum passtype pass)
++{
++
++	unsigned int first_commit_ID, next_commit_ID;
++	unsigned long next_log_block;
++	int err, success = 0;
++	journal_superblock_t * sb;
++	journal_header_t * tmp;
++	struct buffer_head * bh;
++	unsigned int sequence;
++	int blocktype;
++
++	/* Precompute the maximum metadata descriptors in a descriptor block */
++	int MAX_BLOCKS_PER_DESC;
++	MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t))
++			       / sizeof(journal_block_tag_t));
++
++	/*
++	 * First thing is to establish what we expect to find in the log
++	 * (in terms of transaction IDs), and where (in terms of log
++	 * block offsets): query the superblock.
++	 */
++
++	sb = journal->j_superblock;
++	next_commit_ID = ntohl(sb->s_sequence);
++	next_log_block = ntohl(sb->s_start);
++
++	first_commit_ID = next_commit_ID;
++	if (pass == PASS_SCAN)
++		info->start_transaction = first_commit_ID;
++
++	jbd_debug(1, "Starting recovery pass %d\n", pass);
++
++	/*
++	 * Now we walk through the log, transaction by transaction,
++	 * making sure that each transaction has a commit block in the
++	 * expected place.  Each complete transaction gets replayed back
++	 * into the main filesystem.
++	 */
++
++	while (1) {
++		int flags;
++		char * tagp;
++		journal_block_tag_t * tag;
++		struct buffer_head * obh;
++		struct buffer_head * nbh;
++
++		/* If we already know where to stop the log traversal,
++		 * check right now that we haven't gone past the end of
++		 * the log. */
++
++		if (pass != PASS_SCAN)
++			if (tid_geq(next_commit_ID, info->end_transaction))
++				break;
++
++		jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
++			  next_commit_ID, next_log_block, journal->j_last);
++
++		/* Skip over each chunk of the transaction looking for
++		 * either the next descriptor block or the final commit
++		 * record. */
++
++		jbd_debug(3, "JBD: checking block %ld\n", next_log_block);
++		err = jread(&bh, journal, next_log_block);
++		if (err)
++			goto failed;
++
++		next_log_block++;
++		wrap(journal, next_log_block);
++
++		/* What kind of buffer is it?
++		 *
++		 * If it is a descriptor block, check that it has the
++		 * expected sequence number.  Otherwise, we're all done
++		 * here. */
++
++		tmp = (journal_header_t *)bh->b_data;
++
++		if (tmp->h_magic != htonl(JFS_MAGIC_NUMBER)) {
++			brelse(bh);
++			break;
++		}
++
++		blocktype = ntohl(tmp->h_blocktype);
++		sequence = ntohl(tmp->h_sequence);
++		jbd_debug(3, "Found magic %d, sequence %d\n",
++			  blocktype, sequence);
++
++		if (sequence != next_commit_ID) {
++			brelse(bh);
++			break;
++		}
++
++		/* OK, we have a valid descriptor block which matches
++		 * all of the sequence number checks.  What are we going
++		 * to do with it?  That depends on the pass... */
++
++		switch(blocktype) {
++		case JFS_DESCRIPTOR_BLOCK:
++			/* If it is a valid descriptor block, replay it
++			 * in pass REPLAY; otherwise, just skip over the
++			 * blocks it describes. */
++			if (pass != PASS_REPLAY) {
++				next_log_block +=
++					count_tags(bh, journal->j_blocksize);
++				wrap(journal, next_log_block);
++				brelse(bh);
++				continue;
++			}
++
++			/* A descriptor block: we can now write all of
++			 * the data blocks.  Yay, useful work is finally
++			 * getting done here! */
++
++			tagp = &bh->b_data[sizeof(journal_header_t)];
++			while ((tagp - bh->b_data + sizeof(journal_block_tag_t))
++			       <= journal->j_blocksize) {
++				unsigned long io_block;
++
++				tag = (journal_block_tag_t *) tagp;
++				flags = ntohl(tag->t_flags);
++
++				io_block = next_log_block++;
++				wrap(journal, next_log_block);
++				err = jread(&obh, journal, io_block);
++				if (err) {
++					/* Recover what we can, but
++					 * report failure at the end. */
++					success = err;
++					printk (KERN_ERR
++						"JBD: IO error %d recovering "
++						"block %ld in log\n",
++						err, io_block);
++				} else {
++					unsigned long blocknr;
++
++					J_ASSERT(obh != NULL);
++					blocknr = ntohl(tag->t_blocknr);
++
++					/* If the block has been
++					 * revoked, then we're all done
++					 * here. */
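++					/* (A revoke record in this or any
++					 * later transaction in the log
++					 * suppresses the replay; see
++					 * journal_test_revoke() in
++					 * revoke.c.) */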
++					if (journal_test_revoke
++					    (journal, blocknr,
++					     next_commit_ID)) {
++						brelse(obh);
++						++info->nr_revoke_hits;
++						goto skip_write;
++					}
++
++					/* Find a buffer for the new
++					 * data being restored */
++					nbh = getblk(journal->j_fs_dev, blocknr,
++						     journal->j_blocksize);
++					if (nbh == NULL) {
++						printk(KERN_ERR
++						       "JBD: Out of memory "
++						       "during recovery.\n");
++						err = -ENOMEM;
++						brelse(bh);
++						brelse(obh);
++						goto failed;
++					}
++
++					lock_buffer(nbh);
++					memcpy(nbh->b_data, obh->b_data,
++					       journal->j_blocksize);
++					if (flags & JFS_FLAG_ESCAPE) {
++						/* restore the escaped magic in
++						 * the buffer being written back */
++						*((unsigned int *)nbh->b_data) =
++							htonl(JFS_MAGIC_NUMBER);
++					}
++
++					BUFFER_TRACE(nbh, "marking dirty");
++					mark_buffer_dirty(nbh);
++					BUFFER_TRACE(nbh, "marking uptodate");
++					mark_buffer_uptodate(nbh, 1);
++					unlock_buffer(nbh);
++					++info->nr_replays;
++					/* ll_rw_block(WRITE, 1, &nbh); */
++					brelse(obh);
++					brelse(nbh);
++				}
++
++			skip_write:
++				tagp += sizeof(journal_block_tag_t);
++				if (!(flags & JFS_FLAG_SAME_UUID))
++					tagp += 16;
++
++				if (flags & JFS_FLAG_LAST_TAG)
++					break;
++			}
++
++			brelse(bh);
++			continue;
++
++		case JFS_COMMIT_BLOCK:
++			/* Found an expected commit block: not much to
++			 * do other than move on to the next sequence
++			 * number. */
++			brelse(bh);
++			next_commit_ID++;
++			continue;
++
++		case JFS_REVOKE_BLOCK:
++			/* If we aren't in the REVOKE pass, then we can
++			 * just skip over this block. */
++			if (pass != PASS_REVOKE) {
++				brelse(bh);
++				continue;
++			}
++
++			err = scan_revoke_records(journal, bh,
++						  next_commit_ID, info);
++			brelse(bh);
++			if (err)
++				goto failed;
++			continue;
++
++		default:
++			jbd_debug(3, "Unrecognised magic %d, end of scan.\n",
++				  blocktype);
++			goto done;
++		}
++	}
++
++ done:
++	/*
++	 * We broke out of the log scan loop: either we came to the
++	 * known end of the log or we found an unexpected block in the
++	 * log.  If the latter happened, then we know that the "current"
++	 * transaction marks the end of the valid log.
++	 */
++
++	if (pass == PASS_SCAN)
++		info->end_transaction = next_commit_ID;
++	else {
++		/* It's really bad news if different passes end up at
++		 * different places (but possible due to IO errors). */
++		if (info->end_transaction != next_commit_ID) {
++			printk (KERN_ERR "JBD: recovery pass %d ended at "
++				"transaction %u, expected %u\n",
++				pass, next_commit_ID, info->end_transaction);
++			if (!success)
++				success = -EIO;
++		}
++	}
++
++	return success;
++
++ failed:
++	return err;
++}
++
++
++/* Scan a revoke record, marking all blocks mentioned as revoked. */
++
++static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
++			       tid_t sequence, struct recovery_info *info)
++{
++	journal_revoke_header_t *header;
++	int offset, max;
++
++	header = (journal_revoke_header_t *) bh->b_data;
++	offset = sizeof(journal_revoke_header_t);
++	max = ntohl(header->r_count);
++
++	while (offset < max) {
++		unsigned long blocknr;
++		int err;
++
++		blocknr = ntohl(* ((unsigned int *) (bh->b_data+offset)));
++		offset += 4;
++		err = journal_set_revoke(journal, blocknr, sequence);
++		if (err)
++			return err;
++		++info->nr_revokes;
++	}
++	return 0;
++}
+diff -ruP linux.mcp2/fs/jbd/revoke.c linuxppc_2.4.19_final/fs/jbd/revoke.c
+--- linux.mcp2/fs/jbd/revoke.c	1969-12-31 16:00:00.000000000 -0800
++++ linuxppc_2.4.19_final/fs/jbd/revoke.c	2004-05-17 13:56:17.000000000 -0700
+@@ -0,0 +1,636 @@
++/*
++ * linux/fs/revoke.c
++ *
++ * Written by Stephen C. Tweedie <sct@redhat.com>, 2000
++ *
++ * Copyright 2000 Red Hat corp --- All Rights Reserved
++ *
++ * This file is part of the Linux kernel and is made available under
++ * the terms of the GNU General Public License, version 2, or at your
++ * option, any later version, incorporated herein by reference.
++ *
++ * Journal revoke routines for the generic filesystem journaling code;
++ * part of the ext2fs journaling system.
++ *
++ * Revoke is the mechanism used to prevent old log records for deleted
++ * metadata from being replayed on top of newer data using the same
++ * blocks.  The revoke mechanism is used in two separate places:
++ *
++ * + Commit: during commit we write the entire list of the current
++ *   transaction's revoked blocks to the journal
++ *
++ * + Recovery: during recovery we record the transaction ID of all
++ *   revoked blocks.  If there are multiple revoke records in the log
++ *   for a single block, only the last one counts, and if there is a log
++ *   entry for a block beyond the last revoke, then that log entry still
++ *   gets replayed.
++ *
++ * We can get interactions between revokes and new log data within a
++ * single transaction:
++ *
++ * Block is revoked and then journaled:
++ *   The desired end result is the journaling of the new block, so we
++ *   cancel the revoke before the transaction commits.
++ *
++ * Block is journaled and then revoked:
++ *   The revoke must take precedence over the write of the block, so we
++ *   need either to cancel the journal entry or to write the revoke
++ *   later in the log than the log block.  In this case, we choose the
++ *   latter: journaling a block cancels any revoke record for that block
++ *   in the current transaction, so any revoke for that block in the
++ *   transaction must have happened after the block was journaled and so
++ *   the revoke must take precedence.
++ *
++ * Block is revoked and then written as data:
++ *   The data write is allowed to succeed, but the revoke is _not_
++ *   cancelled.  We still need to prevent old log records from
++ *   overwriting the new data.  We don't even need to clear the revoke
++ *   bit here.
++ *
++ * Revoke information on buffers is a tri-state value:
++ *
++ * RevokeValid clear:	no cached revoke status, need to look it up
++ * RevokeValid set, Revoked clear:
++ *		buffer has not been revoked, and cancel_revoke
++ *		need do nothing.
++ * RevokeValid set, Revoked set:
++ *		buffer has been revoked.
++ */
++
++#ifndef __KERNEL__
++#include "jfs_user.h"
++#else
++#include <linux/sched.h>
++#include <linux/fs.h>
++#include <linux/jbd.h>
++#include <linux/errno.h>
++#include <linux/slab.h>
++#include <linux/locks.h>
++#include <linux/list.h>
++#include <linux/smp_lock.h>
++#include <linux/init.h>
++#endif
++
++static kmem_cache_t *revoke_record_cache;
++static kmem_cache_t *revoke_table_cache;
++
++/* Each revoke record represents one single revoked block.  During
++   journal replay, this involves recording the transaction ID of the
++   last transaction to revoke this block. */
++
++struct jbd_revoke_record_s
++{
++	struct list_head  hash;
++	tid_t		  sequence;	/* Used for recovery only */
++	unsigned long	  blocknr;
++};
++
++
++/* The revoke table is just a simple hash table of revoke records. */
++struct jbd_revoke_table_s
++{
++	/* It is conceivable that we might want a larger hash table
++	 * for recovery.  Must be a power of two. */
++	int		  hash_size;
++	int		  hash_shift;
++	struct list_head *hash_table;
++};
++
++
++#ifdef __KERNEL__
++static void write_one_revoke_record(journal_t *, transaction_t *,
++				    struct journal_head **, int *,
++				    struct jbd_revoke_record_s *);
++static void flush_descriptor(journal_t *, struct journal_head *, int);
++#endif
++
++/* Utility functions to maintain the revoke table */
++
++/* Borrowed from buffer.c: this is a tried and tested block hash function */
++static inline int hash(journal_t *journal, unsigned long block)
++{
++	struct jbd_revoke_table_s *table = journal->j_revoke;
++	int hash_shift = table->hash_shift;
++
++	return ((block << (hash_shift - 6)) ^
++		(block >> 13) ^
++		(block << (hash_shift - 12))) & (table->hash_size - 1);
++}
++
++int insert_revoke_hash(journal_t *journal, unsigned long blocknr, tid_t seq)
++{
++	struct list_head *hash_list;
++	struct jbd_revoke_record_s *record;
++
++repeat:
++	record = kmem_cache_alloc(revoke_record_cache, GFP_NOFS);
++	if (!record)
++		goto oom;
++
++	record->sequence = seq;
++	record->blocknr = blocknr;
++	hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
++	list_add(&record->hash, hash_list);
++	return 0;
++
++oom:
++	if (!journal_oom_retry)
++		return -ENOMEM;
++	jbd_debug(1, "ENOMEM in " __FUNCTION__ ", retrying.\n");
++	current->policy |= SCHED_YIELD;
++	schedule();
++	goto repeat;
++}
++
++/* Find a revoke record in the journal's hash table. */
++
++static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal,
++						      unsigned long blocknr)
++{
++	struct list_head *hash_list;
++	struct jbd_revoke_record_s *record;
++
++	hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
++
++	record = (struct jbd_revoke_record_s *) hash_list->next;
++	while (&(record->hash) != hash_list) {
++		if (record->blocknr == blocknr)
++			return record;
++		record = (struct jbd_revoke_record_s *) record->hash.next;
++	}
++	return NULL;
++}
++
++int __init journal_init_revoke_caches(void)
++{
++	revoke_record_cache = kmem_cache_create("revoke_record",
++					sizeof(struct jbd_revoke_record_s),
++					0, SLAB_HWCACHE_ALIGN, NULL, NULL);
++	if (revoke_record_cache == 0)
++		return -ENOMEM;
++
++	revoke_table_cache = kmem_cache_create("revoke_table",
++					sizeof(struct jbd_revoke_table_s),
++					0, 0, NULL, NULL);
++	if (revoke_table_cache == 0) {
++		kmem_cache_destroy(revoke_record_cache);
++		revoke_record_cache = NULL;
++		return -ENOMEM;
++	}
++	return 0;
++}
++
++void journal_destroy_revoke_caches(void)
++{
++	kmem_cache_destroy(revoke_record_cache);
++	revoke_record_cache = 0;
++	kmem_cache_destroy(revoke_table_cache);
++	revoke_table_cache = 0;
++}
++
++/* Initialise the revoke table for a given journal to a given size. */
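++/* (hash_size must be a power of two, as the J_ASSERT below enforces;
++ * e.g. 256 buckets gives hash_shift == 8.) */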
++
++int journal_init_revoke(journal_t *journal, int hash_size)
++{
++	int shift, tmp;
++
++	J_ASSERT (journal->j_revoke == NULL);
++
++	journal->j_revoke = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL);
++	if (!journal->j_revoke)
++		return -ENOMEM;
++
++	/* Check that the hash_size is a power of two */
++	J_ASSERT ((hash_size & (hash_size-1)) == 0);
++
++	journal->j_revoke->hash_size = hash_size;
++
++	shift = 0;
++	tmp = hash_size;
++	while((tmp >>= 1UL) != 0UL)
++		shift++;
++	journal->j_revoke->hash_shift = shift;
++
++	journal->j_revoke->hash_table =
++		kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
++	if (!journal->j_revoke->hash_table) {
++		kmem_cache_free(revoke_table_cache, journal->j_revoke);
++		journal->j_revoke = NULL;
++		return -ENOMEM;
++	}
++
++	for (tmp = 0; tmp < hash_size; tmp++)
++		INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]);
++
++	return 0;
++}
++
++/* Destroy a journal's revoke table.  The table must already be empty! */
++
++void journal_destroy_revoke(journal_t *journal)
++{
++	struct jbd_revoke_table_s *table;
++	struct list_head *hash_list;
++	int i;
++
++	table = journal->j_revoke;
++	if (!table)
++		return;
++
++	for (i = 0; i < table->hash_size; i++) {
++		hash_list = &table->hash_table[i];
++		J_ASSERT (list_empty(hash_list));
++	}
++
++	kfree(table->hash_table);
++	kmem_cache_free(revoke_table_cache, table);
++	journal->j_revoke = NULL;
++}
++
++
++#ifdef __KERNEL__
++
++/*
++ * journal_revoke: revoke a given buffer_head from the journal.  This
++ * prevents the block from being replayed during recovery if we take a
++ * crash after this current transaction commits.  Any subsequent
++ * metadata writes of the buffer in this transaction cancel the
++ * revoke.
++ *
++ * Note that this call may block --- it is up to the caller to make
++ * sure that there are no further calls to journal_write_metadata
++ * before the revoke is complete.  In ext3, this implies calling the
++ * revoke before clearing the block bitmap when we are deleting
++ * metadata.
++ *
++ * Revoke performs a journal_forget on any buffer_head passed in as a
++ * parameter, but does _not_ forget the buffer_head if the bh was only
++ * found implicitly.
++ *
++ * bh_in may not be a journalled buffer - it may have come off
++ * the hash tables without an attached journal_head.
++ *
++ * If bh_in is non-zero, journal_revoke() will decrement its b_count
++ * by one.
++ */
++
++int journal_revoke(handle_t *handle, unsigned long blocknr,
++		   struct buffer_head *bh_in)
++{
++	struct buffer_head *bh = NULL;
++	journal_t *journal;
++	kdev_t dev;
++	int err;
++
++	if (bh_in)
++		BUFFER_TRACE(bh_in, "enter");
++
++	journal = handle->h_transaction->t_journal;
++	if (!journal_set_features(journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)){
++		J_ASSERT (!"Cannot set revoke feature!");
++		return -EINVAL;
++	}
++
++	dev = journal->j_fs_dev;
++	bh = bh_in;
++
++	if (!bh) {
++		bh = get_hash_table(dev, blocknr, journal->j_blocksize);
++		if (bh)
++			BUFFER_TRACE(bh, "found on hash");
++	}
++#ifdef JBD_EXPENSIVE_CHECKING
++	else {
++		struct buffer_head *bh2;
++
++		/* If there is a different buffer_head lying around in
++		 * memory anywhere... */
++		bh2 = get_hash_table(dev, blocknr, journal->j_blocksize);
++		if (bh2) {
++			/* ... and it has RevokeValid status... */
++			if ((bh2 != bh) &&
++			    test_bit(BH_RevokeValid, &bh2->b_state))
++				/* ...then it better be revoked too,
++				 * since it's illegal to create a revoke
++				 * record against a buffer_head which is
++				 * not marked revoked --- that would
++				 * risk missing a subsequent revoke
++				 * cancel. */
++				J_ASSERT_BH(bh2, test_bit(BH_Revoked, &
++							  bh2->b_state));
++			__brelse(bh2);
++		}
++	}
++#endif
++
++	/* We really ought not ever to revoke twice in a row without
++	   first having the revoke cancelled: it's illegal to free a
++	   block twice without allocating it in between! */
++	if (bh) {
++		J_ASSERT_BH(bh, !test_bit(BH_Revoked, &bh->b_state));
++		set_bit(BH_Revoked, &bh->b_state);
++		set_bit(BH_RevokeValid, &bh->b_state);
++		if (bh_in) {
++			BUFFER_TRACE(bh_in, "call journal_forget");
++			journal_forget(handle, bh_in);
++		} else {
++			BUFFER_TRACE(bh, "call brelse");
++			__brelse(bh);
++		}
++	}
++
++	lock_journal(journal);
++	jbd_debug(2, "insert revoke for block %lu, bh_in=%p\n", blocknr, bh_in);
++	err = insert_revoke_hash(journal, blocknr,
++				 handle->h_transaction->t_tid);
++	unlock_journal(journal);
++	BUFFER_TRACE(bh_in, "exit");
++	return err;
++}
++
++/*
++ * Cancel an outstanding revoke.  For use only internally by the
++ * journaling code (called from journal_get_write_access).
++ *
++ * We trust the BH_Revoked bit on the buffer if the buffer is already
++ * being journaled: if there is no revoke pending on the buffer, then we
++ * don't do anything here.
++ *
++ * This would break if it were possible for a buffer to be revoked and
++ * discarded, and then reallocated within the same transaction.  In such
++ * a case we would have lost the revoked bit, but when we arrived here
++ * the second time we would still have a pending revoke to cancel.  So,
++ * do not trust the Revoked bit on buffers unless RevokeValid is also
++ * set.
++ *
++ * The caller must have the journal locked.
++ */
++int journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
++{
++	struct jbd_revoke_record_s *record;
++	journal_t *journal = handle->h_transaction->t_journal;
++	int need_cancel;
++	int did_revoke = 0;	/* akpm: debug */
++	struct buffer_head *bh = jh2bh(jh);
++
++	jbd_debug(4, "journal_head %p, cancelling revoke\n", jh);
++
++	/* Is the existing Revoke bit valid?  If so, we trust it, and
++	 * only perform the full cancel if the revoke bit is set.  If
++	 * not, we can't trust the revoke bit, and we need to do the
++	 * full search for a revoke record. */
++	if (test_and_set_bit(BH_RevokeValid, &bh->b_state))
++		need_cancel = (test_and_clear_bit(BH_Revoked, &bh->b_state));
++	else {
++		need_cancel = 1;
++		clear_bit(BH_Revoked, &bh->b_state);
++	}
++
++	if (need_cancel) {
++		record = find_revoke_record(journal, bh->b_blocknr);
++		if (record) {
++			jbd_debug(4, "cancelled existing revoke on "
++				  "blocknr %lu\n", bh->b_blocknr);
++			list_del(&record->hash);
++			kmem_cache_free(revoke_record_cache, record);
++			did_revoke = 1;
++		}
++	}
++
++#ifdef JBD_EXPENSIVE_CHECKING
++	/* There better not be one left behind by now! */
++	record = find_revoke_record(journal, bh->b_blocknr);
++	J_ASSERT_JH(jh, record == NULL);
++#endif
++
++	/* Finally, have we just cleared revoke on an unhashed
++	 * buffer_head?  If so, we'd better make sure we clear the
++	 * revoked status on any hashed alias too, otherwise the revoke
++	 * state machine will get very upset later on. */
++	if (need_cancel && !bh->b_pprev) {
++		struct buffer_head *bh2;
++		bh2 = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size);
++		if (bh2) {
++			clear_bit(BH_Revoked, &bh2->b_state);
++			__brelse(bh2);
++		}
++	}
++
++	return did_revoke;
++}
++
++
++/*
++ * Write revoke records to the journal for all entries in the current
++ * revoke hash, deleting the entries as we go.
++ *
++ * Called with the journal lock held.
++ */
++
++void journal_write_revoke_records(journal_t *journal,
++				  transaction_t *transaction)
++{
++	struct journal_head *descriptor;
++	struct jbd_revoke_record_s *record;
++	struct jbd_revoke_table_s *revoke;
++	struct list_head *hash_list;
++	int i, offset, count;
++
++	descriptor = NULL;
++	offset = 0;
++	count = 0;
++	revoke = journal->j_revoke;
++
++	for (i = 0; i < revoke->hash_size; i++) {
++		hash_list = &revoke->hash_table[i];
++
++		while (!list_empty(hash_list)) {
++			record = (struct jbd_revoke_record_s *)
++				hash_list->next;
++			write_one_revoke_record(journal, transaction,
++						&descriptor, &offset,
++						record);
++			count++;
++			list_del(&record->hash);
++			kmem_cache_free(revoke_record_cache, record);
++		}
++	}
++	if (descriptor)
++		flush_descriptor(journal, descriptor, offset);
++	jbd_debug(1, "Wrote %d revoke records\n", count);
++}
++
++/*
++ * Write out one revoke record.  We need to create a new descriptor
++ * block if the old one is full or if we have not already created one.
++ */
++
++static void write_one_revoke_record(journal_t *journal,
++				    transaction_t *transaction,
++				    struct journal_head **descriptorp,
++				    int *offsetp,
++				    struct jbd_revoke_record_s *record)
++{
++	struct journal_head *descriptor;
++	int offset;
++	journal_header_t *header;
++
++	/* If we are already aborting, this all becomes a noop.  We
++	   still need to go round the loop in
++	   journal_write_revoke_records in order to free all of the
++	   revoke records: only the IO to the journal is omitted. */
++	if (is_journal_aborted(journal))
++		return;
++
++	descriptor = *descriptorp;
++	offset = *offsetp;
++
++	/* Make sure we have a descriptor with space left for the record */
++	if (descriptor) {
++		if (offset == journal->j_blocksize) {
++			flush_descriptor(journal, descriptor, offset);
++			descriptor = NULL;
++		}
++	}
++
++	if (!descriptor) {
++		descriptor = journal_get_descriptor_buffer(journal);
++		if (!descriptor)
++			return;
++		header = (journal_header_t *) &jh2bh(descriptor)->b_data[0];
++		header->h_magic     = htonl(JFS_MAGIC_NUMBER);
++		header->h_blocktype = htonl(JFS_REVOKE_BLOCK);
++		header->h_sequence  = htonl(transaction->t_tid);
++
++		/* Record it so that we can wait for IO completion later */
++		JBUFFER_TRACE(descriptor, "file as BJ_LogCtl");
++		journal_file_buffer(descriptor, transaction, BJ_LogCtl);
++
++		offset = sizeof(journal_revoke_header_t);
++		*descriptorp = descriptor;
++	}
++
++	* ((unsigned int *)(&jh2bh(descriptor)->b_data[offset])) =
++		htonl(record->blocknr);
++	offset += 4;
++	*offsetp = offset;
++}
++
++/*
++ * Flush a revoke descriptor out to the journal.  If we are aborting,
++ * this is a noop; otherwise we are generating a buffer which needs to
++ * be waited for during commit, so it has to go onto the appropriate
++ * journal buffer list.
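++ *
++ * Note that r_count, as written by this function, is a byte offset
++ * marking the end of the valid records (including the revoke header
++ * itself), not a record count; scan_revoke_records() in recovery.c
++ * relies on this when it iterates while (offset < r_count).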
++ */
++
++static void flush_descriptor(journal_t *journal,
++			     struct journal_head *descriptor,
++			     int offset)
++{
++	journal_revoke_header_t *header;
++
++	if (is_journal_aborted(journal)) {
++		JBUFFER_TRACE(descriptor, "brelse");
++		unlock_buffer(jh2bh(descriptor));
++		__brelse(jh2bh(descriptor));
++		return;
++	}
++
++	header = (journal_revoke_header_t *) jh2bh(descriptor)->b_data;
++	header->r_count = htonl(offset);
++	set_bit(BH_JWrite, &jh2bh(descriptor)->b_state);
++	{
++		struct buffer_head *bh = jh2bh(descriptor);
++		BUFFER_TRACE(bh, "write");
++		clear_bit(BH_Dirty, &bh->b_state);
++		bh->b_end_io = journal_end_buffer_io_sync;
++		submit_bh(WRITE, bh);
++	}
++}
++
++#endif
++
++/*
++ * Revoke support for recovery.
++ *
++ * Recovery needs to be able to:
++ *
++ *  record all revoke records, including the tid of the latest instance
++ *  of each revoke in the journal
++ *
++ *  check whether a given block in a given transaction should be replayed
++ *  (ie. has not been revoked by a revoke record in that or a subsequent
++ *  transaction)
++ *
++ *  empty the revoke table after recovery.
++ */
++
++/*
++ * First, setting revoke records.  We create a new revoke record for
++ * every block ever revoked in the log as we scan it for recovery, and
++ * we update the existing records if we find multiple revokes for a
++ * single block.
++ */
++
++int journal_set_revoke(journal_t *journal,
++		       unsigned long blocknr,
++		       tid_t sequence)
++{
++	struct jbd_revoke_record_s *record;
++
++	record = find_revoke_record(journal, blocknr);
++	if (record) {
++		/* If we have multiple occurrences, only record the
++		 * latest sequence number in the hashed record */
++		if (tid_gt(sequence, record->sequence))
++			record->sequence = sequence;
++		return 0;
++	}
++	return insert_revoke_hash(journal, blocknr, sequence);
++}
++
++/*
++ * Test revoke records.  For a given block referenced in the log, has
++ * that block been revoked?  A revoke record with a given transaction
++ * sequence number revokes all blocks in that transaction and earlier
++ * ones, but later transactions still need to be replayed.
++ */
++
++int journal_test_revoke(journal_t *journal,
++			unsigned long blocknr,
++			tid_t sequence)
++{
++	struct jbd_revoke_record_s *record;
++
++	record = find_revoke_record(journal, blocknr);
++	if (!record)
++		return 0;
++	if (tid_gt(sequence, record->sequence))
++		return 0;
++	return 1;
++}
++
++/*
++ * Finally, once recovery is over, we need to clear the revoke table so
++ * that it can be reused by the running filesystem.
++ */
++
++void journal_clear_revoke(journal_t *journal)
++{
++	int i;
++	struct list_head *hash_list;
++	struct jbd_revoke_record_s *record;
++	struct jbd_revoke_table_s *revoke;
++
++	revoke = journal->j_revoke;
++
++	for (i = 0; i < revoke->hash_size; i++) {
++		hash_list = &revoke->hash_table[i];
++		while (!list_empty(hash_list)) {
++			record = (struct jbd_revoke_record_s*) hash_list->next;
++			list_del(&record->hash);
++			kmem_cache_free(revoke_record_cache, record);
++		}
++	}
++}
++
+diff -ruP linux.mcp2/fs/jbd/transaction.c linuxppc_2.4.19_final/fs/jbd/transaction.c
+--- linux.mcp2/fs/jbd/transaction.c	1969-12-31 16:00:00.000000000 -0800
++++ linuxppc_2.4.19_final/fs/jbd/transaction.c	2004-05-17 13:56:17.000000000 -0700
+@@ -0,0 +1,2055 @@
++/*
++ * linux/fs/transaction.c
++ *
++ * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
++ *
++ * Copyright 1998 Red Hat corp --- All Rights Reserved
++ *
++ * This file is part of the Linux kernel and is made available under
++ * the terms of the GNU General Public License, version 2, or at your
++ * option, any later version, incorporated herein by reference.
++ *
++ * Generic filesystem transaction handling code; part of the ext2fs
++ * journaling system.
++ *
++ * This file manages transactions (compound commits managed by the
++ * journaling code) and handles (individual atomic operations by the
++ * filesystem).
++ */
++
++#include <linux/sched.h>
++#include <linux/fs.h>
++#include <linux/jbd.h>
++#include <linux/errno.h>
++#include <linux/slab.h>
++#include <linux/locks.h>
++#include <linux/timer.h>
++#include <linux/smp_lock.h>
++#include <linux/mm.h>
++
++extern spinlock_t journal_datalist_lock;
++
++/*
++ * get_transaction: obtain a new transaction_t object.
++ *
++ * Simply allocate and initialise a new transaction.  Create it in
++ * RUNNING state and add it to the current journal (which should not
++ * have an existing running transaction: we only make a new transaction
++ * once we have started to commit the old one).
++ *
++ * Preconditions:
++ * The journal MUST be locked.  We don't perform atomic mallocs on the
++ * new transaction and we can't block without protecting against other
++ * processes trying to touch the journal while it is in transition.
++ */
++
++static transaction_t * get_transaction (journal_t * journal, int is_try)
++{
++	transaction_t * transaction;
++
++	transaction = jbd_kmalloc (sizeof (transaction_t), GFP_NOFS);
++	if (!transaction)
++		return NULL;
++
++	memset (transaction, 0, sizeof (transaction_t));
++
++	transaction->t_journal = journal;
++	transaction->t_state = T_RUNNING;
++	transaction->t_tid = journal->j_transaction_sequence++;
++	transaction->t_expires = jiffies + journal->j_commit_interval;
++
++	/* Set up the commit timer for the new transaction. */
++	J_ASSERT (!journal->j_commit_timer_active);
++	journal->j_commit_timer_active = 1;
++	journal->j_commit_timer->expires = transaction->t_expires;
++	add_timer(journal->j_commit_timer);
++
++	J_ASSERT (journal->j_running_transaction == NULL);
++	journal->j_running_transaction = transaction;
++
++	return transaction;
++}
++
++/*
++ * Handle management.
++ *
++ * A handle_t is an object which represents a single atomic update to a
++ * filesystem, and which tracks all of the modifications which form part
++ * of that one update.
++ */
++
++/*
++ * start_this_handle: Given a handle, deal with any locking or stalling
++ * needed to make sure that there is enough journal space for the handle
++ * to begin.  Attach the handle to a transaction and set up the
++ * transaction's buffer credits.
++ */
++
++static int start_this_handle(journal_t *journal, handle_t *handle)
++{
++	transaction_t *transaction;
++	int needed;
++	int nblocks = handle->h_buffer_credits;
++
++	jbd_debug(3, "New handle %p going live.\n", handle);
++
++repeat:
++
++	lock_journal(journal);
++
++repeat_locked:
++
++	if (is_journal_aborted(journal) ||
++	    (journal->j_errno != 0 && !(journal->j_flags & JFS_ACK_ERR))) {
++		unlock_journal(journal);
++		return -EROFS;
++	}
++
++	/* Wait on the journal's transaction barrier if necessary */
++	if (journal->j_barrier_count) {
++		unlock_journal(journal);
++		sleep_on(&journal->j_wait_transaction_locked);
++		goto repeat;
++	}
++
++	if (!journal->j_running_transaction)
++		get_transaction(journal, 0);
++	/* @@@ Error? */
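++	/* (get_transaction() returns NULL if jbd_kmalloc() fails, in which
++	 * case the J_ASSERT below would trip; the non-blocking
++	 * try_start_this_handle() path does check this return value.) */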
++	J_ASSERT(journal->j_running_transaction);
++
++	transaction = journal->j_running_transaction;
++
++	/* If the current transaction is locked down for commit, wait
++	 * for the lock to be released. */
++
++	if (transaction->t_state == T_LOCKED) {
++		unlock_journal(journal);
++		jbd_debug(3, "Handle %p stalling...\n", handle);
++		sleep_on(&journal->j_wait_transaction_locked);
++		goto repeat;
++	}
++
++	/* If there is not enough space left in the log to write all
++	 * potential buffers requested by this operation, we need to
++	 * stall pending a log checkpoint to free some more log
++	 * space. */
++
++	needed = transaction->t_outstanding_credits + nblocks;
++
++	if (needed > journal->j_max_transaction_buffers) {
++		/* If the current transaction is already too large, then
++		 * start to commit it: we can then go back and attach
++		 * this handle to a new transaction. */
++
++		jbd_debug(2, "Handle %p starting new commit...\n", handle);
++		log_start_commit(journal, transaction);
++		unlock_journal(journal);
++		sleep_on(&journal->j_wait_transaction_locked);
++		lock_journal(journal);
++		goto repeat_locked;
++	}
++
++	/*
++	 * The commit code assumes that it can get enough log space
++	 * without forcing a checkpoint.  This is *critical* for
++	 * correctness: a checkpoint of a buffer which is also
++	 * associated with a committing transaction creates a deadlock,
++	 * so commit simply cannot force through checkpoints.
++	 *
++	 * We must therefore ensure the necessary space in the journal
++	 * *before* starting to dirty potentially checkpointed buffers
++	 * in the new transaction.
++	 *
++	 * The worst part is, any transaction currently committing can
++	 * reduce the free space arbitrarily.  Be careful to account for
++	 * those buffers when checkpointing.
++	 */
++
++	/*
++	 * @@@ AKPM: This seems rather over-defensive.  We're giving commit
++	 * a _lot_ of headroom: 1/4 of the journal plus the size of
++	 * the committing transaction.  Really, we only need to give it
++	 * committing_transaction->t_outstanding_credits plus "enough" for
++	 * the log control blocks.
++	 * Also, this test is inconsistent with the matching one in
++	 * journal_extend().
++	 */
++	needed = journal->j_max_transaction_buffers;
++	if (journal->j_committing_transaction)
++		needed += journal->j_committing_transaction->
++			t_outstanding_credits;
++
++	if (log_space_left(journal) < needed) {
++		jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
++		log_wait_for_space(journal, needed);
++		goto repeat_locked;
++	}
++
++	/* OK, account for the buffers that this operation expects to
++	 * use and add the handle to the running transaction. */
++
++	handle->h_transaction = transaction;
++	transaction->t_outstanding_credits += nblocks;
++	transaction->t_updates++;
++	transaction->t_handle_count++;
++	jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
++		  handle, nblocks, transaction->t_outstanding_credits,
++		  log_space_left(journal));
++
++	unlock_journal(journal);
++
++	return 0;
++}
++
++/*
++ * Obtain a new handle.
++ *
++ * We make sure that the transaction can guarantee at least nblocks of
++ * modified buffers in the log.  We block until the log can guarantee
++ * that much space.
++ *
++ * This function is visible to journal users (like ext2fs), so is not
++ * called with the journal already locked.
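++ *
++ * A sketch of the usual calling pattern (illustrative only;
++ * journal_stop() appears later in this file):
++ *
++ *	handle = journal_start(journal, nblocks);
++ *	if (IS_ERR(handle))
++ *		return PTR_ERR(handle);
++ *	err = journal_get_write_access(handle, bh);
++ *	... modify the buffer ...
++ *	err = journal_dirty_metadata(handle, bh);
++ *	journal_stop(handle);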
++ *
++ * Return a pointer to a newly allocated handle, or NULL on failure
++ */
++
++handle_t *journal_start(journal_t *journal, int nblocks)
++{
++	handle_t *handle = journal_current_handle();
++	int err;
++
++	if (!journal)
++		return ERR_PTR(-EROFS);
++
++	if (handle) {
++		J_ASSERT(handle->h_transaction->t_journal == journal);
++		handle->h_ref++;
++		return handle;
++	}
++
++	handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS);
++	if (!handle)
++		return ERR_PTR(-ENOMEM);
++	memset (handle, 0, sizeof (handle_t));
++
++	handle->h_buffer_credits = nblocks;
++	handle->h_ref = 1;
++	current->journal_info = handle;
++
++	err = start_this_handle(journal, handle);
++	if (err < 0) {
++		kfree(handle);
++		current->journal_info = NULL;
++		return ERR_PTR(err);
++	}
++
++	return handle;
++}
++
++/*
++ * Return zero on success
++ */
++static int try_start_this_handle(journal_t *journal, handle_t *handle)
++{
++	transaction_t *transaction;
++	int needed;
++	int nblocks = handle->h_buffer_credits;
++	int ret = 0;
++
++	jbd_debug(3, "New handle %p maybe going live.\n", handle);
++
++	lock_journal(journal);
++
++	if (is_journal_aborted(journal) ||
++	    (journal->j_errno != 0 && !(journal->j_flags & JFS_ACK_ERR))) {
++		ret = -EROFS;
++		goto fail_unlock;
++	}
++
++	if (journal->j_barrier_count)
++		goto fail_unlock;
++
++	if (!journal->j_running_transaction && get_transaction(journal, 1) == 0)
++		goto fail_unlock;
++
++	transaction = journal->j_running_transaction;
++	if (transaction->t_state == T_LOCKED)
++		goto fail_unlock;
++
++	needed = transaction->t_outstanding_credits + nblocks;
++	/* We could run log_start_commit here */
++	if (needed > journal->j_max_transaction_buffers)
++		goto fail_unlock;
++
++	needed = journal->j_max_transaction_buffers;
++	if (journal->j_committing_transaction)
++		needed += journal->j_committing_transaction->
++			t_outstanding_credits;
++
++	if (log_space_left(journal) < needed)
++		goto fail_unlock;
++
++	handle->h_transaction = transaction;
++	transaction->t_outstanding_credits += nblocks;
++	transaction->t_updates++;
++	jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
++		  handle, nblocks, transaction->t_outstanding_credits,
++		  log_space_left(journal));
++	unlock_journal(journal);
++	return 0;
++
++fail_unlock:
++	unlock_journal(journal);
++	if (ret >= 0)
++		ret = -1;
++	return ret;
++}
++
++/*
++ * Try to start a handle, but non-blockingly.  If we weren't able
++ * to, return an ERR_PTR value.
++ */
++handle_t *journal_try_start(journal_t *journal, int nblocks)
++{
++	handle_t *handle = journal_current_handle();
++	int err;
++
++	if (!journal)
++		return ERR_PTR(-EROFS);
++
++	if (handle) {
++		jbd_debug(4, "h_ref %d -> %d\n",
++			  handle->h_ref,
++			  handle->h_ref + 1);
++		J_ASSERT(handle->h_transaction->t_journal == journal);
++		if (is_handle_aborted(handle))
++			return ERR_PTR(-EIO);
++		handle->h_ref++;
++		return handle;
++	} else {
++		jbd_debug(4, "no current transaction\n");
++	}
++
++	if (is_journal_aborted(journal))
++		return ERR_PTR(-EIO);
++
++	handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS);
++	if (!handle)
++		return ERR_PTR(-ENOMEM);
++	memset (handle, 0, sizeof (handle_t));
++
++	handle->h_buffer_credits = nblocks;
++	handle->h_ref = 1;
++	current->journal_info = handle;
++
++	err = try_start_this_handle(journal, handle);
++	if (err < 0) {
++		kfree(handle);
++		current->journal_info = NULL;
++		return ERR_PTR(err);
++	}
++
++	return handle;
++}
++
++/*
++ * journal_extend: extend buffer credits.
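++ *
++ * (Sketch of the usual fallback, illustrative only: a caller that is
++ * denied extra credits commits what it has and reattaches via
++ * journal_restart(), defined below.
++ *
++ *	if (journal_extend(handle, nblocks) != 0)
++ *		err = journal_restart(handle, nblocks);
++ * )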
++ *
++ * Some transactions, such as large extends and truncates, can be done
++ * atomically all at once or in several stages.  The operation requests
++ * a credit for a number of buffer modifications in advance, but can
++ * extend its credit if it needs more.
++ *
++ * journal_extend tries to give the running handle more buffer credits.
++ * It does not guarantee that allocation: this is a best-effort only.
++ * The calling process MUST be able to deal cleanly with a failure to
++ * extend here.
++ *
++ * Return 0 on success, non-zero on failure.
++ *
++ * return code < 0 implies an error
++ * return code > 0 implies normal transaction-full status.
++ */
++
++int journal_extend (handle_t *handle, int nblocks)
++{
++	transaction_t *transaction = handle->h_transaction;
++	journal_t *journal = transaction->t_journal;
++	int result;
++	int wanted;
++
++	lock_journal (journal);
++
++	result = -EIO;
++	if (is_handle_aborted(handle))
++		goto error_out;
++
++	result = 1;
++
++	/* Don't extend a locked-down transaction! */
++	if (handle->h_transaction->t_state != T_RUNNING) {
++		jbd_debug(3, "denied handle %p %d blocks: "
++			  "transaction not running\n", handle, nblocks);
++		goto error_out;
++	}
++
++	wanted = transaction->t_outstanding_credits + nblocks;
++
++	if (wanted > journal->j_max_transaction_buffers) {
++		jbd_debug(3, "denied handle %p %d blocks: "
++			  "transaction too large\n", handle, nblocks);
++		goto error_out;
++	}
++
++	if (wanted > log_space_left(journal)) {
++		jbd_debug(3, "denied handle %p %d blocks: "
++			  "insufficient log space\n", handle, nblocks);
++		goto error_out;
++	}
++
++	handle->h_buffer_credits += nblocks;
++	transaction->t_outstanding_credits += nblocks;
++	result = 0;
++
++	jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
++
++error_out:
++	unlock_journal (journal);
++	return result;
++}
++
++
++/*
++ * journal_restart: restart a handle for a multi-transaction filesystem
++ * operation.
++ *
++ * If the journal_extend() call above fails to grant new buffer credits
++ * to a running handle, a call to journal_restart will commit the
++ * handle's transaction so far and reattach the handle to a new
++ * transaction capable of guaranteeing the requested number of
++ * credits.
++ */
++
++int journal_restart(handle_t *handle, int nblocks)
++{
++	transaction_t *transaction = handle->h_transaction;
++	journal_t *journal = transaction->t_journal;
++	int ret;
++
++	/* If we've had an abort of any type, don't even think about
++	 * actually doing the restart! */
++	if (is_handle_aborted(handle))
++		return 0;
++
++	/* First unlink the handle from its current transaction, and
++	 * start the commit on that. */
++
++	J_ASSERT (transaction->t_updates > 0);
++	J_ASSERT (journal_current_handle() == handle);
++
++	transaction->t_outstanding_credits -= handle->h_buffer_credits;
++	transaction->t_updates--;
++
++	if (!transaction->t_updates)
++		wake_up(&journal->j_wait_updates);
++
++	jbd_debug(2, "restarting handle %p\n", handle);
++	log_start_commit(journal, transaction);
++
++	handle->h_buffer_credits = nblocks;
++	ret = start_this_handle(journal, handle);
++	return ret;
++}
++
++
++/*
++ * Barrier operation: establish a transaction barrier.
++ *
++ * This locks out any further updates from being started, and blocks
++ * until all existing updates have completed, returning only once the
++ * journal is in a quiescent state with no updates running.
++ *
++ * The journal lock should not be held on entry.
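++ *
++ * Sketch of typical use (illustrative only; journal_flush() lives in
++ * journal.c):
++ *
++ *	journal_lock_updates(journal);
++ *	journal_flush(journal);
++ *	journal_unlock_updates(journal);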
++ */
++
++void journal_lock_updates (journal_t *journal)
++{
++	lock_journal(journal);
++	++journal->j_barrier_count;
++
++	/* Wait until there are no running updates */
++	while (1) {
++		transaction_t *transaction = journal->j_running_transaction;
++		if (!transaction)
++			break;
++		if (!transaction->t_updates)
++			break;
++
++		unlock_journal(journal);
++		sleep_on(&journal->j_wait_updates);
++		lock_journal(journal);
++	}
++
++	unlock_journal(journal);
++
++	/* We have now established a barrier against other normal
++	 * updates, but we also need to barrier against other
++	 * journal_lock_updates() calls to make sure that we serialise
++	 * special journal-locked operations too. */
++	down(&journal->j_barrier);
++}
++
++/*
++ * Release a transaction barrier obtained with journal_lock_updates().
++ *
++ * Should be called without the journal lock held.
++ */
++
++void journal_unlock_updates (journal_t *journal)
++{
++	lock_journal(journal);
++
++	J_ASSERT (journal->j_barrier_count != 0);
++
++	up(&journal->j_barrier);
++	--journal->j_barrier_count;
++	wake_up(&journal->j_wait_transaction_locked);
++	unlock_journal(journal);
++}
++
++/*
++ * journal_get_write_access: notify intent to modify a buffer for metadata
++ * (not data) update.
++ *
++ * If the buffer is already part of the current transaction, then there
++ * is nothing we need to do.  If it is already part of a prior
++ * transaction which we are still committing to disk, then we need to
++ * make sure that we do not overwrite the old copy: we do copy-out to
++ * preserve the copy going to disk.  We also account the buffer against
++ * the handle's metadata buffer credits (unless the buffer is already
++ * part of the transaction, that is).
++ *
++ * Returns an error code or 0 on success.
++ *
++ * In full data journalling mode the buffer may be of type BJ_AsyncData,
++ * because we're write()ing a buffer which is also part of a shared mapping.
++ */
++
++static int
++do_get_write_access(handle_t *handle, struct journal_head *jh, int force_copy)
++{
++	transaction_t *transaction = handle->h_transaction;
++	journal_t *journal = transaction->t_journal;
++	int error;
++	char *frozen_buffer = NULL;
++	int need_copy = 0;
++
++	jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy);
++
++	JBUFFER_TRACE(jh, "entry");
++repeat:
++	/* @@@ Need to check for errors here at some point. */
++
++	/*
++	 * AKPM: neither bdflush nor kupdate run with the BKL.  There's
++	 * nothing we can do to prevent them from starting writeout of a
++	 * BUF_DIRTY buffer at any time.  And checkpointing buffers are on
++	 * BUF_DIRTY.  So.  We no longer assert that the buffer is unlocked.
++	 *
++	 * However.  It is very wrong for us to allow ext3 to start directly
++	 * altering the ->b_data of buffers which may at that very time be
++	 * undergoing writeout to the client filesystem.  This can leave
++	 * the filesystem in an inconsistent, transient state if we crash.
++	 * So what we do is to steal the buffer if it is in checkpoint
++	 * mode and dirty.  The journal lock will keep out checkpoint-mode
++	 * state transitions within journal_remove_checkpoint() and the buffer
++	 * is locked to keep bdflush/kupdate/whoever away from it as well.
++	 *
++	 * AKPM: we have replaced all the lock_journal_bh_wait() stuff with a
++	 * simple lock_journal().  This code here will care for locked buffers.
++	 */
++	/*
++	 * The buffer_locked() || buffer_dirty() tests here are simply an
++	 * optimisation tweak.  If anyone else in the system decides to
++	 * lock this buffer later on, we'll blow up.  There doesn't seem
++	 * to be a good reason why they should do this.
++	 */
++	if (jh->b_cp_transaction &&
++	    (buffer_locked(jh2bh(jh)) || buffer_dirty(jh2bh(jh)))) {
++		unlock_journal(journal);
++		lock_buffer(jh2bh(jh));
++		spin_lock(&journal_datalist_lock);
++		if (jh->b_cp_transaction && buffer_dirty(jh2bh(jh))) {
++			/* OK, we need to steal it */
++			JBUFFER_TRACE(jh, "stealing from checkpoint mode");
++			J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
++			J_ASSERT_JH(jh, jh->b_frozen_data == NULL);
++
++			J_ASSERT(handle->h_buffer_credits > 0);
++			handle->h_buffer_credits--;
++
++			/* This will clear BH_Dirty and set BH_JBDDirty. */
++			JBUFFER_TRACE(jh, "file as BJ_Reserved");
++			__journal_file_buffer(jh, transaction, BJ_Reserved);
++
++			/* And pull it off BUF_DIRTY, onto BUF_CLEAN */
++			refile_buffer(jh2bh(jh));
++
++			/*
++			 * The buffer is now hidden from bdflush.   It is
++			 * metadata against the current transaction.
++			 */
++			JBUFFER_TRACE(jh, "steal from cp mode is complete");
++		}
++		spin_unlock(&journal_datalist_lock);
++		unlock_buffer(jh2bh(jh));
++		lock_journal(journal);
++		goto repeat;
++	}
++
++	J_ASSERT_JH(jh, !buffer_locked(jh2bh(jh)));
++
++	error = -EROFS;
++	if (is_handle_aborted(handle))
++		goto out_unlocked;
++	error = 0;
++
++	spin_lock(&journal_datalist_lock);
++
++	/* The buffer is already part of this transaction if
++	 * b_transaction or b_next_transaction points to it. */
++
++	if (jh->b_transaction == transaction ||
++	    jh->b_next_transaction == transaction)
++		goto done_locked;
++
++	/* If there is already a copy-out version of this buffer, then
++	 * we don't need to make another one. */
++
++	if (jh->b_frozen_data) {
++		JBUFFER_TRACE(jh, "has frozen data");
++		J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
++		jh->b_next_transaction = transaction;
++
++		J_ASSERT_JH(jh, handle->h_buffer_credits > 0);
++		handle->h_buffer_credits--;
++		goto done_locked;
++	}
++
++	/* Is there data here we need to preserve? */
++
++	if (jh->b_transaction && jh->b_transaction != transaction) {
++		JBUFFER_TRACE(jh, "owned by older transaction");
++		J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
++		J_ASSERT_JH(jh, jh->b_transaction ==
++				journal->j_committing_transaction);
++
++		/* There is one case we have to be very careful about.
++		 * If the committing transaction is currently writing
++		 * this buffer out to disk and has NOT made a copy-out,
++		 * then we cannot modify the buffer contents at all
++		 * right now.  The essence of copy-out is that it is the
++		 * extra copy, not the primary copy, which gets
++		 * journaled.  If the primary copy is already going to
++		 * disk then we cannot do copy-out here. */
++
++		if (jh->b_jlist == BJ_Shadow) {
++			JBUFFER_TRACE(jh, "on shadow: sleep");
++			spin_unlock(&journal_datalist_lock);
++			unlock_journal(journal);
++			/* commit wakes up all shadow buffers after IO */
++			sleep_on(&jh2bh(jh)->b_wait);
++			lock_journal(journal);
++			goto repeat;
++		}
++
++		/* Only do the copy if the currently-owning transaction
++		 * still needs it.  If it is on the Forget list, the
++		 * committing transaction is past that stage.  The
++		 * buffer had better remain locked during the kmalloc,
++		 * but that should be true --- we hold the journal lock
++		 * still and the buffer is already on the BUF_JOURNAL
++		 * list so won't be flushed.
++		 *
++		 * Subtle point, though: if this is a get_undo_access,
++		 * then we will be relying on the frozen_data to contain
++		 * the new value of the committed_data record after the
++		 * transaction, so we HAVE to force the frozen_data copy
++		 * in that case. */
++
++		if (jh->b_jlist != BJ_Forget || force_copy) {
++			JBUFFER_TRACE(jh, "generate frozen data");
++			if (!frozen_buffer) {
++				JBUFFER_TRACE(jh, "allocate memory for buffer");
++				spin_unlock(&journal_datalist_lock);
++				unlock_journal(journal);
++				frozen_buffer = jbd_kmalloc(jh2bh(jh)->b_size,
++							    GFP_NOFS);
++				lock_journal(journal);
++				if (!frozen_buffer) {
++					printk(KERN_EMERG __FUNCTION__
++						": OOM for frozen_buffer\n");
++					JBUFFER_TRACE(jh, "oom!");
++					error = -ENOMEM;
++					spin_lock(&journal_datalist_lock);
++					goto done_locked;
++				}
++				goto repeat;
++			}
++
++			jh->b_frozen_data = frozen_buffer;
++			frozen_buffer = NULL;
++			need_copy = 1;
++		}
++		jh->b_next_transaction = transaction;
++	}
++
++	J_ASSERT(handle->h_buffer_credits > 0);
++	handle->h_buffer_credits--;
++
++	/* Finally, if the buffer is not journaled right now, we need to
++	 * make sure it doesn't get written to disk before the caller
++	 * actually commits the new data. */
++
++	if (!jh->b_transaction) {
++		JBUFFER_TRACE(jh, "no transaction");
++		J_ASSERT_JH(jh, !jh->b_next_transaction);
++		jh->b_transaction = transaction;
++		JBUFFER_TRACE(jh, "file as BJ_Reserved");
++		__journal_file_buffer(jh, transaction, BJ_Reserved);
++	}
++
++done_locked:
++	spin_unlock(&journal_datalist_lock);
++	if (need_copy) {
++		struct page *page;
++		int offset;
++		char *source;
++
++		J_ASSERT_JH(jh, buffer_uptodate(jh2bh(jh)));
++		page = jh2bh(jh)->b_page;
++		offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK;
++		source = kmap(page);
++		memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
++		kunmap(page);
++	}
++
++
++	/* If we are about to journal a buffer, then any revoke pending
++	   on it is no longer valid. */
++	journal_cancel_revoke(handle, jh);
++
++out_unlocked:
++	if (frozen_buffer)
++		kfree(frozen_buffer);
++
++	JBUFFER_TRACE(jh, "exit");
++	return error;
++}
++
++int journal_get_write_access (handle_t *handle, struct buffer_head *bh)
++{
++	transaction_t *transaction = handle->h_transaction;
++	journal_t *journal = transaction->t_journal;
++	struct journal_head *jh = journal_add_journal_head(bh);
++	int rc;
++
++	/* We do not want to get caught playing with fields which the
++	 * log thread also manipulates.  Make sure that the buffer
++	 * completes any outstanding IO before proceeding. */
++	lock_journal(journal);
++	rc = do_get_write_access(handle, jh, 0);
++	journal_unlock_journal_head(jh);
++	unlock_journal(journal);
++	return rc;
++}
++
++
++/*
++ * When the user wants to journal a newly created buffer_head
++ * (ie. getblk() returned a new buffer and we are going to populate it
++ * manually rather than reading off disk), then we need to keep the
++ * buffer_head locked until it has been completely filled with new
++ * data.  In this case, we should be able to make the assertion that
++ * the bh is not already part of an existing transaction.
++ *
++ * The buffer should already be locked by the caller by this point.
++ * There is no lock ranking violation: it was a newly created,
++ * unlocked buffer beforehand. */
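++/* Sketch of the expected calling sequence (illustrative only, modelled
++ * on ext3-style block allocation):
++ *
++ *	bh = getblk(dev, blocknr, blocksize);
++ *	lock_buffer(bh);
++ *	err = journal_get_create_access(handle, bh);
++ *	memset(bh->b_data, 0, blocksize);
++ *	mark_buffer_uptodate(bh, 1);
++ *	unlock_buffer(bh);
++ *	err = journal_dirty_metadata(handle, bh);
++ */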
++
++int journal_get_create_access (handle_t *handle, struct buffer_head *bh)
++{
++	transaction_t *transaction = handle->h_transaction;
++	journal_t *journal = transaction->t_journal;
++	struct journal_head *jh = journal_add_journal_head(bh);
++	int err;
++
++	jbd_debug(5, "journal_head %p\n", jh);
++	lock_journal(journal);
++	err = -EROFS;
++	if (is_handle_aborted(handle))
++		goto out;
++	err = 0;
++
++	JBUFFER_TRACE(jh, "entry");
++	/* The buffer may already belong to this transaction due to
++	 * pre-zeroing in the filesystem's new_block code.  It may also
++	 * be on the previous, committing transaction's lists, but it
++	 * HAS to be in Forget state in that case: the transaction must
++	 * have deleted the buffer for it to be reused here. */
++	J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
++		jh->b_transaction == NULL ||
++		(jh->b_transaction == journal->j_committing_transaction &&
++			  jh->b_jlist == BJ_Forget)));
++
++	J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
++	J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
++
++	J_ASSERT_JH(jh, handle->h_buffer_credits > 0);
++	handle->h_buffer_credits--;
++
++	spin_lock(&journal_datalist_lock);
++	if (jh->b_transaction == NULL) {
++		jh->b_transaction = transaction;
++		JBUFFER_TRACE(jh, "file as BJ_Reserved");
++		__journal_file_buffer(jh, transaction, BJ_Reserved);
++		JBUFFER_TRACE(jh, "refile");
++		refile_buffer(jh2bh(jh));
++	} else if (jh->b_transaction == journal->j_committing_transaction) {
++		JBUFFER_TRACE(jh, "set next transaction");
++		jh->b_next_transaction = transaction;
++	}
++	spin_unlock(&journal_datalist_lock);
++
++	/*
++	 * akpm: I added this.  ext3_alloc_branch can pick up new indirect
++	 * blocks which contain freed but then revoked metadata.  We need
++	 * to cancel the revoke in case we end up freeing it yet again
++	 * and then reallocating it as data - this would cause a second
++	 * revoke, which hits an assertion error.
++	 */
++	JBUFFER_TRACE(jh, "cancelling revoke");
++	journal_cancel_revoke(handle, jh);
++	journal_unlock_journal_head(jh);
++out:
++	unlock_journal(journal);
++	return err;
++}
++
++
++
++/*
++ * journal_get_undo_access: Notify intent to modify metadata with non-
++ * rewindable consequences
++ *
++ * Sometimes there is a need to distinguish between metadata which has
++ * been committed to disk and that which has not.  The ext3fs code uses
++ * this for freeing and allocating space: we have to make sure that we
++ * do not reuse freed space until the deallocation has been committed,
++ * since if we overwrote that space we would make the delete
++ * un-rewindable in case of a crash.
++ *
++ * To deal with that, journal_get_undo_access requests write access to a
++ * buffer for parts of non-rewindable operations such as delete
++ * operations on the bitmaps.  The journaling code must keep a copy of
++ * the buffer's contents prior to the undo_access call until such time
++ * as we know that the buffer has definitely been committed to disk.
++ *
++ * We never need to know which transaction the committed data is part
++ * of: buffers touched here are guaranteed to be dirtied later and so
++ * will be committed to a new transaction in due course, at which point
++ * we can discard the old committed data pointer.
++ *
++ * Returns error number or 0 on success.
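++ *
++ * Sketch of the intended use for bitmap deallocation (illustrative
++ * only):
++ *
++ *	err = journal_get_undo_access(handle, bitmap_bh);
++ *	... clear bits, consulting the preserved b_committed_data copy
++ *	    to avoid reusing blocks whose deallocation has not yet
++ *	    committed ...
++ *	err = journal_dirty_metadata(handle, bitmap_bh);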
++ */
++
++int journal_get_undo_access (handle_t *handle, struct buffer_head *bh)
++{
++	journal_t *journal = handle->h_transaction->t_journal;
++	int err;
++	struct journal_head *jh = journal_add_journal_head(bh);
++
++	JBUFFER_TRACE(jh, "entry");
++	lock_journal(journal);
++
++	/* Do this first --- it can drop the journal lock, so we want to
++	 * make sure that obtaining the committed_data is done
++	 * atomically wrt. completion of any outstanding commits. */
++	err = do_get_write_access (handle, jh, 1);
++	if (err)
++		goto out;
++
++	if (!jh->b_committed_data) {
++		/* Copy out the current buffer contents into the
++		 * preserved, committed copy. */
++		JBUFFER_TRACE(jh, "generate b_committed data");
++		jh->b_committed_data = jbd_kmalloc(jh2bh(jh)->b_size,
++						   GFP_NOFS);
++		if (!jh->b_committed_data) {
++			printk(KERN_EMERG __FUNCTION__
++				": No memory for committed data!\n");
++			err = -ENOMEM;
++			goto out;
++		}
++
++		memcpy (jh->b_committed_data, jh2bh(jh)->b_data,
++				jh2bh(jh)->b_size);
++	}
++
++out:
++	if (!err)
++		J_ASSERT_JH(jh, jh->b_committed_data);
++	journal_unlock_journal_head(jh);
++	unlock_journal(journal);
++	return err;
++}
++
++/*
++ * journal_dirty_data: mark a buffer as containing dirty data which
++ * needs to be flushed before we can commit the current transaction.
++ *
++ * The buffer is placed on the transaction's data list and is marked as
++ * belonging to the transaction.
++ *
++ * If `async' is set then the writeback will be initiated by the caller
++ * using submit_bh -> end_buffer_io_async.  We put the buffer onto
++ * t_async_datalist.
++ *
++ * Returns error number or 0 on success.
++ *
++ * journal_dirty_data() can be called via page_launder->ext3_writepage
++ * by kswapd.  So it cannot block.  Happily, there's nothing here
++ * which needs lock_journal if `async' is set.
++ *
++ * When the buffer is on the current transaction we freely move it
++ * between BJ_AsyncData and BJ_SyncData according to who tried to
++ * change its state last.
++ */
++
++int journal_dirty_data (handle_t *handle, struct buffer_head *bh, int async)
++{
++	journal_t *journal = handle->h_transaction->t_journal;
++	int need_brelse = 0;
++	int wanted_jlist = async ? BJ_AsyncData : BJ_SyncData;
++	struct journal_head *jh;
++
++	if (is_handle_aborted(handle))
++		return 0;
++
++	jh = journal_add_journal_head(bh);
++	JBUFFER_TRACE(jh, "entry");
++
++	/*
++	 * The buffer could *already* be dirty.  Writeout can start
++	 * at any time.
++	 */
++	jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid);
++
++	/*
++	 * What if the buffer is already part of a running transaction?
++	 *
++	 * There are two cases:
++	 * 1) It is part of the current running transaction.  Refile it,
++	 *    just in case we have allocated it as metadata, deallocated
++	 *    it, then reallocated it as data.
++	 * 2) It is part of the previous, still-committing transaction.
++	 *    If all we want to do is to guarantee that the buffer will be
++	 *    written to disk before this new transaction commits, then
++	 *    being sure that the *previous* transaction has this same
++	 *    property is sufficient for us!  Just leave it on its old
++	 *    transaction.
++	 *
++	 * In case (2), the buffer must not already exist as metadata
++	 * --- that would violate write ordering (a transaction is free
++	 * to write its data at any point, even before the previous
++	 * committing transaction has committed).  The caller must
++	 * never, ever allow this to happen: there's nothing we can do
++	 * about it in this layer.
++	 */
++	spin_lock(&journal_datalist_lock);
++	if (jh->b_transaction) {
++		JBUFFER_TRACE(jh, "has transaction");
++		if (jh->b_transaction != handle->h_transaction) {
++			JBUFFER_TRACE(jh, "belongs to older transaction");
++			J_ASSERT_JH(jh, jh->b_transaction ==
++					journal->j_committing_transaction);
++
++			/* @@@ IS THIS TRUE  ? */
++			/*
++			 * Not any more.  Scenario: someone does a write()
++			 * in data=journal mode.  The buffer's transaction has
++			 * moved into commit.  Then someone does another
++			 * write() to the file.  We do the frozen data copyout
++			 * and set b_next_transaction to point to j_running_t.
++			 * And while we're in that state, someone does a
++			 * writepage() in an attempt to pageout the same area
++			 * of the file via a shared mapping.  At present that
++			 * calls journal_dirty_data(), and we get right here.
++			 * It may be too late to journal the data.  Simply
++			 * falling through to the next test will suffice: the
++			 * data will be dirty and will be checkpointed.  The
++			 * ordering comments in the next comment block still
++			 * apply.
++			 */
++			//J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
++
++			/*
++			 * If we're journalling data, and this buffer was
++			 * subject to a write(), it could be metadata, forget
++			 * or shadow against the committing transaction.  Now,
++			 * someone has dirtied the same darn page via a mapping
++			 * and it is being writepage()'d.
++			 * We *could* just steal the page from commit, with some
++			 * fancy locking there.  Instead, we just skip it -
++			 * don't tie the page's buffers to the new transaction
++			 * at all.
++			 * Implication: if we crash before the writepage() data
++			 * is written into the filesystem, recovery will replay
++			 * the write() data.
++			 */
++			if (jh->b_jlist != BJ_None &&
++			    jh->b_jlist != BJ_SyncData &&
++			    jh->b_jlist != BJ_AsyncData) {
++				JBUFFER_TRACE(jh, "Not stealing");
++				goto no_journal;
++			}
++
++			/*
++			 * This buffer may be undergoing writeout in commit.  We
++			 * can't return from here and let the caller dirty it
++			 * again because that can cause the write-out loop in
++			 * commit to never terminate.
++			 */
++			if (!async && buffer_dirty(bh)) {
++				atomic_inc(&bh->b_count);
++				spin_unlock(&journal_datalist_lock);
++				need_brelse = 1;
++				ll_rw_block(WRITE, 1, &bh);
++				wait_on_buffer(bh);
++				spin_lock(&journal_datalist_lock);
++				/* The buffer may become locked again at any
++				   time if it is redirtied */
++			}
++
++			/* journal_clean_data_list() may have got there first */
++			if (jh->b_transaction != NULL) {
++				JBUFFER_TRACE(jh, "unfile from commit");
++				__journal_unfile_buffer(jh);
++				jh->b_transaction = NULL;
++			}
++			/* The buffer will be refiled below */
++
++		}
++		/*
++		 * Special case --- the buffer might actually have been
++		 * allocated and then immediately deallocated in the previous,
++		 * committing transaction, so might still be left on that
++		 * transaction's metadata lists.
++ */ ++ if (jh->b_jlist != wanted_jlist) { ++ JBUFFER_TRACE(jh, "not on correct data list: unfile"); ++ J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow); ++ __journal_unfile_buffer(jh); ++ jh->b_transaction = NULL; ++ JBUFFER_TRACE(jh, "file as data"); ++ __journal_file_buffer(jh, handle->h_transaction, ++ wanted_jlist); ++ } ++ } else { ++ JBUFFER_TRACE(jh, "not on a transaction"); ++ __journal_file_buffer(jh, handle->h_transaction, wanted_jlist); ++ } ++no_journal: ++ spin_unlock(&journal_datalist_lock); ++ if (need_brelse) { ++ BUFFER_TRACE(bh, "brelse"); ++ __brelse(bh); ++ } ++ JBUFFER_TRACE(jh, "exit"); ++ journal_unlock_journal_head(jh); ++ return 0; ++} ++ ++/* ++ * journal_dirty_metadata: mark a buffer as containing dirty metadata ++ * which needs to be journaled as part of the current transaction. ++ * ++ * The buffer is placed on the transaction's metadata list and is marked ++ * as belonging to the transaction. ++ * ++ * Special care needs to be taken if the buffer already belongs to the ++ * current committing transaction (in which case we should have frozen ++ * data present for that commit). In that case, we don't relink the ++ * buffer: that only gets done when the old transaction finally ++ * completes its commit. ++ * ++ * Returns error number or 0 on success. ++ */ ++ ++int journal_dirty_metadata (handle_t *handle, struct buffer_head *bh) ++{ ++ transaction_t *transaction = handle->h_transaction; ++ journal_t *journal = transaction->t_journal; ++ struct journal_head *jh = bh2jh(bh); ++ ++ jbd_debug(5, "journal_head %p\n", jh); ++ JBUFFER_TRACE(jh, "entry"); ++ lock_journal(journal); ++ if (is_handle_aborted(handle)) ++ goto out_unlock; ++ ++ spin_lock(&journal_datalist_lock); ++ set_bit(BH_JBDDirty, &bh->b_state); ++ set_buffer_flushtime(bh); ++ ++ J_ASSERT_JH(jh, jh->b_transaction != NULL); ++ ++ /* ++ * Metadata already on the current transaction list doesn't ++ * need to be filed. Metadata on another transaction's list must ++ * be committing, and will be refiled once the commit completes: ++ * leave it alone for now. ++ */ ++ ++ if (jh->b_transaction != transaction) { ++ JBUFFER_TRACE(jh, "already on other transaction"); ++ J_ASSERT_JH(jh, jh->b_transaction == ++ journal->j_committing_transaction); ++ J_ASSERT_JH(jh, jh->b_next_transaction == transaction); ++ /* And this case is illegal: we can't reuse another ++ * transaction's data buffer, ever. */ ++ /* FIXME: writepage() should be journalled */ ++ J_ASSERT_JH(jh, jh->b_jlist != BJ_SyncData); ++ goto done_locked; ++ } ++ ++ /* That test should have eliminated the following case: */ ++ J_ASSERT_JH(jh, jh->b_frozen_data == 0); ++ ++ JBUFFER_TRACE(jh, "file as BJ_Metadata"); ++ __journal_file_buffer(jh, handle->h_transaction, BJ_Metadata); ++ ++done_locked: ++ spin_unlock(&journal_datalist_lock); ++ JBUFFER_TRACE(jh, "exit"); ++out_unlock: ++ unlock_journal(journal); ++ return 0; ++} ++ ++#if 0 ++/* ++ * journal_release_buffer: undo a get_write_access without any buffer ++ * updates, if the update decided in the end that it didn't need access. ++ * ++ * journal_get_write_access() can block, so it is quite possible for a ++ * journaling component to decide after the write access is returned ++ * that global state has changed and the update is no longer required. 
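++ *
++ * A hypothetical caller sequence (a sketch only --- this function is
++ * compiled out under #if 0 here, and still_needed() is an
++ * illustrative predicate, not a real one):
++ *
++ *	journal_get_write_access(handle, bh);
++ *	if (!still_needed(bh))
++ *		journal_release_buffer(handle, bh);
++ *
++ * Releasing an unmodified reserved buffer returns its credit to the
++ * handle, as the h_buffer_credits++ below shows.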
*/ ++ ++void journal_release_buffer (handle_t *handle, struct buffer_head *bh) ++{ ++ transaction_t *transaction = handle->h_transaction; ++ journal_t *journal = transaction->t_journal; ++ struct journal_head *jh = bh2jh(bh); ++ ++ lock_journal(journal); ++ JBUFFER_TRACE(jh, "entry"); ++ ++ /* If the buffer is reserved but not modified by this ++ * transaction, then it is safe to release it. In all other ++ * cases, just leave the buffer as it is. */ ++ ++ spin_lock(&journal_datalist_lock); ++ if (jh->b_jlist == BJ_Reserved && jh->b_transaction == transaction && ++ !buffer_jdirty(jh2bh(jh))) { ++ JBUFFER_TRACE(jh, "unused: refiling it"); ++ handle->h_buffer_credits++; ++ __journal_refile_buffer(jh); ++ } ++ spin_unlock(&journal_datalist_lock); ++ ++ JBUFFER_TRACE(jh, "exit"); ++ unlock_journal(journal); ++} ++#endif ++ ++/* ++ * journal_forget: bforget() for potentially-journaled buffers. We can ++ * only do the bforget if there are no commits pending against the ++ * buffer. If the buffer is dirty in the current running transaction we ++ * can safely unlink it. ++ * ++ * bh may not be a journalled buffer at all - it may be a non-JBD ++ * buffer which came off the hashtable. Check for this. ++ * ++ * Decrements bh->b_count by one. ++ * ++ * Allow this call even if the handle has aborted --- it may be part of ++ * the caller's cleanup after an abort. ++ */ ++ ++void journal_forget (handle_t *handle, struct buffer_head *bh) ++{ ++ transaction_t *transaction = handle->h_transaction; ++ journal_t *journal = transaction->t_journal; ++ struct journal_head *jh; ++ ++ BUFFER_TRACE(bh, "entry"); ++ ++ lock_journal(journal); ++ spin_lock(&journal_datalist_lock); ++ ++ if (!buffer_jbd(bh)) ++ goto not_jbd; ++ jh = bh2jh(bh); ++ ++ if (jh->b_transaction == handle->h_transaction) { ++ J_ASSERT_JH(jh, !jh->b_frozen_data); ++ ++ /* If we are forgetting a buffer which is already part ++ * of this transaction, then we can just drop it from ++ * the transaction immediately. */ ++ clear_bit(BH_Dirty, &bh->b_state); ++ clear_bit(BH_JBDDirty, &bh->b_state); ++ ++ JBUFFER_TRACE(jh, "belongs to current transaction: unfile"); ++ J_ASSERT_JH(jh, !jh->b_committed_data); ++ ++ __journal_unfile_buffer(jh); ++ jh->b_transaction = 0; ++ ++ /* ++ * We are no longer going to journal this buffer. ++ * However, the commit of this transaction is still ++ * important to the buffer: the delete that we are now ++ * processing might obsolete an old log entry, so by ++ * committing, we can satisfy the buffer's checkpoint. ++ * ++ * So, if we have a checkpoint on the buffer, we should ++ * now refile the buffer on our BJ_Forget list so that ++ * we know to remove the checkpoint after we commit. ++ */ ++ ++ if (jh->b_cp_transaction) { ++ __journal_file_buffer(jh, transaction, BJ_Forget); ++ } else { ++ __journal_remove_journal_head(bh); ++ __brelse(bh); ++ if (!buffer_jbd(bh)) { ++ spin_unlock(&journal_datalist_lock); ++ unlock_journal(journal); ++ __bforget(bh); ++ return; ++ } ++ } ++ ++ } else if (jh->b_transaction) { ++ J_ASSERT_JH(jh, (jh->b_transaction == ++ journal->j_committing_transaction)); ++ /* However, if the buffer is still owned by a prior ++ * (committing) transaction, we can't drop it yet... */ ++ JBUFFER_TRACE(jh, "belongs to older transaction"); ++ /* ... but we CAN drop it from the new transaction if we ++ * have also modified it since the original commit. 
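++ *
++ * Schematically, the only cleanup possible in that case is what the
++ * code below does:
++ *
++ *	if (jh->b_next_transaction == transaction)
++ *		jh->b_next_transaction = NULL;
++ *
++ * i.e. withdraw this handle's pending claim on the buffer while
++ * leaving the committing transaction's claim intact.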
*/ ++ ++ if (jh->b_next_transaction) { ++ J_ASSERT(jh->b_next_transaction == transaction); ++ jh->b_next_transaction = NULL; ++ } ++ } ++ ++not_jbd: ++ spin_unlock(&journal_datalist_lock); ++ unlock_journal(journal); ++ __brelse(bh); ++ return; ++} ++ ++#if 0 /* Unused */ ++/* ++ * journal_sync_buffer: flush a potentially-journaled buffer to disk. ++ * ++ * Used for O_SYNC filesystem operations. If the buffer is journaled, ++ * we need to complete the O_SYNC by waiting for the transaction to ++ * complete. It is an error to call journal_sync_buffer before ++ * journal_stop! ++ */ ++ ++void journal_sync_buffer(struct buffer_head *bh) ++{ ++ transaction_t *transaction; ++ journal_t *journal; ++ long sequence; ++ struct journal_head *jh; ++ ++ /* If the buffer isn't journaled, this is easy: just sync it to ++ * disk. */ ++ BUFFER_TRACE(bh, "entry"); ++ ++ spin_lock(&journal_datalist_lock); ++ if (!buffer_jbd(bh)) { ++ spin_unlock(&journal_datalist_lock); ++ return; ++ } ++ jh = bh2jh(bh); ++ if (jh->b_transaction == NULL) { ++ /* If the buffer has already been journaled, then this ++ * is a noop. */ ++ if (jh->b_cp_transaction == NULL) { ++ spin_unlock(&journal_datalist_lock); ++ return; ++ } ++ atomic_inc(&bh->b_count); ++ spin_unlock(&journal_datalist_lock); ++ ll_rw_block (WRITE, 1, &bh); ++ wait_on_buffer(bh); ++ __brelse(bh); ++ goto out; ++ } ++ ++ /* Otherwise, just wait until the transaction is synced to disk. */ ++ transaction = jh->b_transaction; ++ journal = transaction->t_journal; ++ sequence = transaction->t_tid; ++ spin_unlock(&journal_datalist_lock); ++ ++ jbd_debug(2, "requesting commit for jh %p\n", jh); ++ log_start_commit (journal, transaction); ++ ++ while (tid_gt(sequence, journal->j_commit_sequence)) { ++ wake_up(&journal->j_wait_done_commit); ++ sleep_on(&journal->j_wait_done_commit); ++ } ++ JBUFFER_TRACE(jh, "exit"); ++out: ++ return; ++} ++#endif ++ ++/* ++ * All done for a particular handle. ++ * ++ * There is not much action needed here. We just return any remaining ++ * buffer credits to the transaction and remove the handle. The only ++ * complication is that we need to start a commit operation if the ++ * filesystem is marked for synchronous update. ++ * ++ * journal_stop itself will not usually return an error, but it may ++ * do so in unusual circumstances. In particular, expect it to ++ * return -EIO if a journal_abort has been executed since the ++ * transaction began. ++ */ ++ ++int journal_stop(handle_t *handle) ++{ ++ transaction_t *transaction = handle->h_transaction; ++ journal_t *journal = transaction->t_journal; ++ int old_handle_count, err; ++ ++ if (!handle) ++ return 0; ++ ++ J_ASSERT (transaction->t_updates > 0); ++ J_ASSERT (journal_current_handle() == handle); ++ ++ if (is_handle_aborted(handle)) ++ err = -EIO; ++ else ++ err = 0; ++ ++ if (--handle->h_ref > 0) { ++ jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1, ++ handle->h_ref); ++ return err; ++ } ++ ++ jbd_debug(4, "Handle %p going down\n", handle); ++ ++ /* ++ * Implement synchronous transaction batching. If the handle ++ * was synchronous, don't force a commit immediately. Let's ++ * yield and let another thread piggyback onto this transaction. ++ * Keep doing that while new threads continue to arrive. ++ * It doesn't cost much - we're about to run a commit and sleep ++ * on IO anyway. Speeds up many-threaded, many-dir operations ++ * by 30x or more... 
++ */
++	if (handle->h_sync) {
++		do {
++			old_handle_count = transaction->t_handle_count;
++			set_current_state(TASK_RUNNING);
++			current->policy |= SCHED_YIELD;
++			schedule();
++		} while (old_handle_count != transaction->t_handle_count);
++	}
++
++	current->journal_info = NULL;
++	transaction->t_outstanding_credits -= handle->h_buffer_credits;
++	transaction->t_updates--;
++	if (!transaction->t_updates) {
++		wake_up(&journal->j_wait_updates);
++		if (journal->j_barrier_count)
++			wake_up(&journal->j_wait_transaction_locked);
++	}
++
++	/*
++	 * If the handle is marked SYNC, we need to set another commit
++	 * going!  We also want to force a commit if the current
++	 * transaction is occupying too much of the log, or if the
++	 * transaction is too old now.
++	 */
++	if (handle->h_sync ||
++			transaction->t_outstanding_credits >
++				journal->j_max_transaction_buffers ||
++			time_after_eq(jiffies, transaction->t_expires)) {
++		/* Do this even for aborted journals: an abort still
++		 * completes the commit thread, it just doesn't write
++		 * anything to disk. */
++		tid_t tid = transaction->t_tid;
++
++		jbd_debug(2, "transaction too old, requesting commit for "
++					"handle %p\n", handle);
++		/* This is non-blocking */
++		log_start_commit(journal, transaction);
++
++		/*
++		 * Special case: JFS_SYNC synchronous updates require us
++		 * to wait for the commit to complete.
++		 */
++		if (handle->h_sync && !(current->flags & PF_MEMALLOC))
++			log_wait_commit(journal, tid);
++	}
++	kfree(handle);
++	return err;
++}
++
++/*
++ * For synchronous operations: force any uncommitted transactions
++ * to disk.  May seem kludgy, but it reuses all the handle batching
++ * code in a very simple manner.
++ */
++int journal_force_commit(journal_t *journal)
++{
++	handle_t *handle;
++	int ret = 0;
++
++	lock_kernel();
++	handle = journal_start(journal, 1);
++	if (IS_ERR(handle)) {
++		ret = PTR_ERR(handle);
++		goto out;
++	}
++	handle->h_sync = 1;
++	journal_stop(handle);
++out:
++	unlock_kernel();
++	return ret;
++}
++
++/*
++ *
++ * List management code snippets: various functions for manipulating the
++ * transaction buffer lists.
++ *
++ */
++
++/*
++ * Append a buffer to a transaction list, given the transaction's list head
++ * pointer.
++ * journal_datalist_lock is held.
++ */
++
++static inline void
++__blist_add_buffer(struct journal_head **list, struct journal_head *jh)
++{
++	if (!*list) {
++		jh->b_tnext = jh->b_tprev = jh;
++		*list = jh;
++	} else {
++		/* Insert at the tail of the list to preserve order */
++		struct journal_head *first = *list, *last = first->b_tprev;
++		jh->b_tprev = last;
++		jh->b_tnext = first;
++		last->b_tnext = first->b_tprev = jh;
++	}
++}
++
++/*
++ * Remove a buffer from a transaction list, given the transaction's list
++ * head pointer.
++ *
++ * Called with journal_datalist_lock held, and the journal may not
++ * be locked.
++ */
++
++static inline void
++__blist_del_buffer(struct journal_head **list, struct journal_head *jh)
++{
++	if (*list == jh) {
++		*list = jh->b_tnext;
++		if (*list == jh)
++			*list = 0;
++	}
++	jh->b_tprev->b_tnext = jh->b_tnext;
++	jh->b_tnext->b_tprev = jh->b_tprev;
++}
++
++/*
++ * Remove a buffer from the appropriate transaction list.
++ *
++ * Note that this function can *change* the value of
++ * bh->b_transaction->t_sync_datalist, t_async_datalist, t_buffers, t_forget,
++ * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list.  If the caller
++ * is holding onto a copy of one of these pointers, it could go bad.
++ * Generally the caller needs to re-read the pointer from the transaction_t. ++ * ++ * If bh->b_jlist is BJ_SyncData or BJ_AsyncData then we may have been called ++ * via journal_try_to_free_buffer() or journal_clean_data_list(). In that ++ * case, journal_datalist_lock will be held, and the journal may not be locked. ++ */ ++void __journal_unfile_buffer(struct journal_head *jh) ++{ ++ struct journal_head **list = 0; ++ transaction_t * transaction; ++ ++ assert_spin_locked(&journal_datalist_lock); ++ transaction = jh->b_transaction; ++ ++#ifdef __SMP__ ++ J_ASSERT (current->lock_depth >= 0); ++#endif ++ J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); ++ ++ if (jh->b_jlist != BJ_None) ++ J_ASSERT_JH(jh, transaction != 0); ++ ++ switch (jh->b_jlist) { ++ case BJ_None: ++ return; ++ case BJ_SyncData: ++ list = &transaction->t_sync_datalist; ++ break; ++ case BJ_AsyncData: ++ list = &transaction->t_async_datalist; ++ break; ++ case BJ_Metadata: ++ transaction->t_nr_buffers--; ++ J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0); ++ list = &transaction->t_buffers; ++ break; ++ case BJ_Forget: ++ list = &transaction->t_forget; ++ break; ++ case BJ_IO: ++ list = &transaction->t_iobuf_list; ++ break; ++ case BJ_Shadow: ++ list = &transaction->t_shadow_list; ++ break; ++ case BJ_LogCtl: ++ list = &transaction->t_log_list; ++ break; ++ case BJ_Reserved: ++ list = &transaction->t_reserved_list; ++ break; ++ } ++ ++ __blist_del_buffer(list, jh); ++ jh->b_jlist = BJ_None; ++ if (test_and_clear_bit(BH_JBDDirty, &jh2bh(jh)->b_state)) { ++ set_bit(BH_Dirty, &jh2bh(jh)->b_state); ++ } ++} ++ ++void journal_unfile_buffer(struct journal_head *jh) ++{ ++ spin_lock(&journal_datalist_lock); ++ __journal_unfile_buffer(jh); ++ spin_unlock(&journal_datalist_lock); ++} ++ ++/* ++ * Called from journal_try_to_free_buffers(). The journal is not ++ * locked. lru_list_lock is not held. ++ * ++ * Here we see why journal_datalist_lock is global and not per-journal. ++ * We cannot get back to this buffer's journal pointer without locking ++ * out journal_clean_data_list() in some manner. ++ * ++ * One could use journal_datalist_lock to get unracy access to a ++ * per-journal lock. ++ * ++ * Called with journal_datalist_lock held. ++ * ++ * Returns non-zero iff we were able to free the journal_head. ++ */ ++static int __journal_try_to_free_buffer(struct buffer_head *bh, ++ int *locked_or_dirty) ++{ ++ struct journal_head *jh; ++ ++ assert_spin_locked(&journal_datalist_lock); ++ ++ jh = bh2jh(bh); ++ ++ if (buffer_locked(bh) || buffer_dirty(bh)) { ++ *locked_or_dirty = 1; ++ goto out; ++ } ++ ++ if (!buffer_uptodate(bh)) ++ goto out; ++ ++ if (jh->b_next_transaction != 0) ++ goto out; ++ ++ if (jh->b_transaction != 0 && jh->b_cp_transaction == 0) { ++ if (jh->b_jlist == BJ_SyncData || jh->b_jlist==BJ_AsyncData) { ++ /* A written-back ordered data buffer */ ++ JBUFFER_TRACE(jh, "release data"); ++ __journal_unfile_buffer(jh); ++ jh->b_transaction = 0; ++ __journal_remove_journal_head(bh); ++ __brelse(bh); ++ } ++ } ++ else if (jh->b_cp_transaction != 0 && jh->b_transaction == 0) { ++ /* written-back checkpointed metadata buffer */ ++ if (jh->b_jlist == BJ_None) { ++ JBUFFER_TRACE(jh, "remove from checkpoint list"); ++ __journal_remove_checkpoint(jh); ++ __journal_remove_journal_head(bh); ++ __brelse(bh); ++ } ++ } ++ return !buffer_jbd(bh); ++ ++out: ++ return 0; ++} ++ ++/* ++ * journal_try_to_free_buffers(). 
For all the buffers on this page,
++ * if they are fully written out ordered data, move them onto BUF_CLEAN
++ * so try_to_free_buffers() can reap them.  Called with lru_list_lock
++ * not held.  Does its own locking.
++ *
++ * This complicates JBD locking somewhat.  We aren't protected by the
++ * BKL here.  We wish to remove the buffer from its committing or
++ * running transaction's ->t_datalist via __journal_unfile_buffer.
++ *
++ * This may *change* the value of transaction_t->t_datalist, so anyone
++ * who looks at t_datalist needs to lock against this function.
++ *
++ * Even worse, someone may be doing a journal_dirty_data on this
++ * buffer.  So we need to lock against that.  journal_dirty_data()
++ * will come out of the lock with the buffer dirty, which makes it
++ * ineligible for release here.
++ *
++ * Who else is affected by this?  hmm...  Really the only contender
++ * is do_get_write_access() - it could be looking at the buffer while
++ * journal_try_to_free_buffer() is changing its state.  But that
++ * cannot happen because we never reallocate freed data as metadata
++ * while the data is part of a transaction.  Yes?
++ *
++ * This function returns non-zero if we wish try_to_free_buffers()
++ * to be called.  We do this if the page is releasable by try_to_free_buffers().
++ * We also do it if the page has locked or dirty buffers and the caller wants
++ * us to perform sync or async writeout.
++ */
++int journal_try_to_free_buffers(journal_t *journal,
++				struct page *page, int gfp_mask)
++{
++	struct buffer_head *bh;
++	struct buffer_head *tmp;
++	int locked_or_dirty = 0;
++	int call_ttfb = 1;
++
++	J_ASSERT(PageLocked(page));
++
++	bh = page->buffers;
++	tmp = bh;
++	spin_lock(&journal_datalist_lock);
++	do {
++		struct buffer_head *p = tmp;
++
++		tmp = tmp->b_this_page;
++		if (buffer_jbd(p))
++			if (!__journal_try_to_free_buffer(p, &locked_or_dirty))
++				call_ttfb = 0;
++	} while (tmp != bh);
++	spin_unlock(&journal_datalist_lock);
++
++	if (!(gfp_mask & (__GFP_IO|__GFP_WAIT)))
++		goto out;
++	if (!locked_or_dirty)
++		goto out;
++	/*
++	 * The VM wants us to do writeout, or to block on IO, or both.
++	 * So we allow try_to_free_buffers to be called even if the page
++	 * still has journalled buffers.
++	 */
++	call_ttfb = 1;
++out:
++	return call_ttfb;
++}
++
++/*
++ * This buffer is no longer needed.  If it is on an older transaction's
++ * checkpoint list we need to record it on this transaction's forget list
++ * to pin this buffer (and hence its checkpointing transaction) down until
++ * this transaction commits.  If the buffer isn't on a checkpoint list, we
++ * release it.
++ * Returns non-zero if JBD no longer has an interest in the buffer.
++ */
++static int dispose_buffer(struct journal_head *jh,
++		transaction_t *transaction)
++{
++	int may_free = 1;
++	struct buffer_head *bh = jh2bh(jh);
++
++	spin_lock(&journal_datalist_lock);
++	__journal_unfile_buffer(jh);
++	jh->b_transaction = 0;
++
++	if (jh->b_cp_transaction) {
++		JBUFFER_TRACE(jh, "on running+cp transaction");
++		__journal_file_buffer(jh, transaction, BJ_Forget);
++		clear_bit(BH_JBDDirty, &bh->b_state);
++		may_free = 0;
++	} else {
++		JBUFFER_TRACE(jh, "on running transaction");
++		__journal_remove_journal_head(bh);
++		__brelse(bh);
++	}
++	spin_unlock(&journal_datalist_lock);
++	return may_free;
++}
++
++/*
++ * journal_flushpage
++ *
++ * This code is tricky.  It has a number of cases to deal with.
++ * ++ * There are two invariants which this code relies on: ++ * ++ * i_size must be updated on disk before we start calling flushpage on the ++ * data. ++ * ++ * This is done in ext3 by defining an ext3_setattr method which ++ * updates i_size before truncate gets going. By maintaining this ++ * invariant, we can be sure that it is safe to throw away any buffers ++ * attached to the current transaction: once the transaction commits, ++ * we know that the data will not be needed. ++ * ++ * Note however that we can *not* throw away data belonging to the ++ * previous, committing transaction! ++ * ++ * Any disk blocks which *are* part of the previous, committing ++ * transaction (and which therefore cannot be discarded immediately) are ++ * not going to be reused in the new running transaction ++ * ++ * The bitmap committed_data images guarantee this: any block which is ++ * allocated in one transaction and removed in the next will be marked ++ * as in-use in the committed_data bitmap, so cannot be reused until ++ * the next transaction to delete the block commits. This means that ++ * leaving committing buffers dirty is quite safe: the disk blocks ++ * cannot be reallocated to a different file and so buffer aliasing is ++ * not possible. ++ * ++ * ++ * The above applies mainly to ordered data mode. In writeback mode we ++ * don't make guarantees about the order in which data hits disk --- in ++ * particular we don't guarantee that new dirty data is flushed before ++ * transaction commit --- so it is always safe just to discard data ++ * immediately in that mode. --sct ++ */ ++ ++/* ++ * The journal_unmap_buffer helper function returns zero if the buffer ++ * concerned remains pinned as an anonymous buffer belonging to an older ++ * transaction. ++ * ++ * We're outside-transaction here. Either or both of j_running_transaction ++ * and j_committing_transaction may be NULL. ++ */ ++static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) ++{ ++ transaction_t *transaction; ++ struct journal_head *jh; ++ int may_free = 1; ++ ++ BUFFER_TRACE(bh, "entry"); ++ ++ if (!buffer_mapped(bh)) ++ return 1; ++ ++ /* It is safe to proceed here without the ++ * journal_datalist_spinlock because the buffers cannot be ++ * stolen by try_to_free_buffers as long as we are holding the ++ * page lock. --sct */ ++ ++ if (!buffer_jbd(bh)) ++ goto zap_buffer; ++ ++ jh = bh2jh(bh); ++ transaction = jh->b_transaction; ++ if (transaction == NULL) { ++ /* First case: not on any transaction. If it ++ * has no checkpoint link, then we can zap it: ++ * it's a writeback-mode buffer so we don't care ++ * if it hits disk safely. */ ++ if (!jh->b_cp_transaction) { ++ JBUFFER_TRACE(jh, "not on any transaction: zap"); ++ goto zap_buffer; ++ } ++ ++ if (!buffer_dirty(bh)) { ++ /* bdflush has written it. We can drop it now */ ++ goto zap_buffer; ++ } ++ ++ /* OK, it must be in the journal but still not ++ * written fully to disk: it's metadata or ++ * journaled data... */ ++ ++ if (journal->j_running_transaction) { ++ /* ... and once the current transaction has ++ * committed, the buffer won't be needed any ++ * longer. */ ++ JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget"); ++ return dispose_buffer(jh, ++ journal->j_running_transaction); ++ } else { ++ /* There is no currently-running transaction. So the ++ * orphan record which we wrote for this file must have ++ * passed into commit. We must attach this buffer to ++ * the committing transaction, if it exists. 
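++			 *
++			 * Schematically, the fallback below is:
++			 *
++			 *	if (journal->j_committing_transaction)
++			 *		return dispose_buffer(jh,
++			 *			j_committing_transaction);
++			 *	else
++			 *		clear BH_JBDDirty and zap the
++			 *		buffer, since the orphan record's
++			 *		transaction has already committed.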
*/
++			if (journal->j_committing_transaction) {
++				JBUFFER_TRACE(jh, "give to committing trans");
++				return dispose_buffer(jh,
++					journal->j_committing_transaction);
++			} else {
++				/* The orphan record's transaction has
++				 * committed.  We can cleanse this buffer */
++				clear_bit(BH_JBDDirty, &bh->b_state);
++				goto zap_buffer;
++			}
++		}
++	} else if (transaction == journal->j_committing_transaction) {
++		/* If it is committing, we simply cannot touch it.  We
++		 * can remove its next_transaction pointer from the
++		 * running transaction if that is set, but nothing
++		 * else. */
++		JBUFFER_TRACE(jh, "on committing transaction");
++		if (jh->b_next_transaction) {
++			J_ASSERT(jh->b_next_transaction ==
++					journal->j_running_transaction);
++			jh->b_next_transaction = NULL;
++		}
++		return 0;
++	} else {
++		/* Good, the buffer belongs to the running transaction.
++		 * We are writing our own transaction's data, not any
++		 * previous one's, so it is safe to throw it away
++		 * (remember that we expect the filesystem to have set
++		 * i_size already for this truncate so recovery will not
++		 * expose the disk blocks we are discarding here.) */
++		J_ASSERT_JH(jh, transaction == journal->j_running_transaction);
++		may_free = dispose_buffer(jh, transaction);
++	}
++
++zap_buffer:
++	if (buffer_dirty(bh))
++		mark_buffer_clean(bh);
++	J_ASSERT_BH(bh, !buffer_jdirty(bh));
++	clear_bit(BH_Uptodate, &bh->b_state);
++	clear_bit(BH_Mapped, &bh->b_state);
++	clear_bit(BH_Req, &bh->b_state);
++	clear_bit(BH_New, &bh->b_state);
++	return may_free;
++}
++
++/*
++ * Return non-zero if the page's buffers were successfully reaped
++ */
++int journal_flushpage(journal_t *journal,
++		      struct page *page,
++		      unsigned long offset)
++{
++	struct buffer_head *head, *bh, *next;
++	unsigned int curr_off = 0;
++	int may_free = 1;
++
++	if (!PageLocked(page))
++		BUG();
++	if (!page->buffers)
++		return 1;
++
++	/* We will potentially be playing with lists other than just the
++	 * data lists (especially for journaled data mode), so be
++	 * cautious in our locking. */
++	lock_journal(journal);
++
++	head = bh = page->buffers;
++	do {
++		unsigned int next_off = curr_off + bh->b_size;
++		next = bh->b_this_page;
++
++		/* AKPM: doing lock_buffer here may be overly paranoid */
++		if (offset <= curr_off) {
++			/* This block is wholly outside the truncation point */
++			lock_buffer(bh);
++			may_free &= journal_unmap_buffer(journal, bh);
++			unlock_buffer(bh);
++		}
++		curr_off = next_off;
++		bh = next;
++
++	} while (bh != head);
++
++	unlock_journal(journal);
++
++	if (!offset) {
++		if (!may_free || !try_to_free_buffers(page, 0))
++			return 0;
++		J_ASSERT(page->buffers == NULL);
++	}
++	return 1;
++}
++
++/*
++ * File a buffer on the given transaction list.
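++ *
++ * Callers normally go through the locking wrapper; a minimal usage
++ * sketch (mirroring journal_file_buffer() further below):
++ *
++ *	spin_lock(&journal_datalist_lock);
++ *	__journal_file_buffer(jh, transaction, BJ_Metadata);
++ *	spin_unlock(&journal_datalist_lock);
++ *
++ * jlist must be one of the BJ_* types handled in the switch below.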
++ */ ++void __journal_file_buffer(struct journal_head *jh, ++ transaction_t *transaction, int jlist) ++{ ++ struct journal_head **list = 0; ++ ++ assert_spin_locked(&journal_datalist_lock); ++ ++#ifdef __SMP__ ++ J_ASSERT (current->lock_depth >= 0); ++#endif ++ J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); ++ J_ASSERT_JH(jh, jh->b_transaction == transaction || ++ jh->b_transaction == 0); ++ ++ if (jh->b_transaction) { ++ if (jh->b_jlist == jlist) ++ return; ++ __journal_unfile_buffer(jh); ++ } else { ++ jh->b_transaction = transaction; ++ } ++ ++ switch (jlist) { ++ case BJ_None: ++ J_ASSERT_JH(jh, !jh->b_committed_data); ++ J_ASSERT_JH(jh, !jh->b_frozen_data); ++ return; ++ case BJ_SyncData: ++ list = &transaction->t_sync_datalist; ++ break; ++ case BJ_AsyncData: ++ list = &transaction->t_async_datalist; ++ break; ++ case BJ_Metadata: ++ transaction->t_nr_buffers++; ++ list = &transaction->t_buffers; ++ break; ++ case BJ_Forget: ++ list = &transaction->t_forget; ++ break; ++ case BJ_IO: ++ list = &transaction->t_iobuf_list; ++ break; ++ case BJ_Shadow: ++ list = &transaction->t_shadow_list; ++ break; ++ case BJ_LogCtl: ++ list = &transaction->t_log_list; ++ break; ++ case BJ_Reserved: ++ list = &transaction->t_reserved_list; ++ break; ++ } ++ ++ __blist_add_buffer(list, jh); ++ jh->b_jlist = jlist; ++ ++ if (jlist == BJ_Metadata || jlist == BJ_Reserved || ++ jlist == BJ_Shadow || jlist == BJ_Forget) { ++ if (atomic_set_buffer_clean(jh2bh(jh))) { ++ set_bit(BH_JBDDirty, &jh2bh(jh)->b_state); ++ } ++ } ++} ++ ++void journal_file_buffer(struct journal_head *jh, ++ transaction_t *transaction, int jlist) ++{ ++ spin_lock(&journal_datalist_lock); ++ __journal_file_buffer(jh, transaction, jlist); ++ spin_unlock(&journal_datalist_lock); ++} ++ ++/* ++ * Remove a buffer from its current buffer list in preparation for ++ * dropping it from its current transaction entirely. If the buffer has ++ * already started to be used by a subsequent transaction, refile the ++ * buffer on that transaction's metadata list. ++ */ ++ ++void __journal_refile_buffer(struct journal_head *jh) ++{ ++ assert_spin_locked(&journal_datalist_lock); ++#ifdef __SMP__ ++ J_ASSERT_JH(jh, current->lock_depth >= 0); ++#endif ++ __journal_unfile_buffer(jh); ++ ++ /* If the buffer is now unused, just drop it. If it has been ++ modified by a later transaction, add it to the new ++ transaction's metadata list. */ ++ ++ jh->b_transaction = jh->b_next_transaction; ++ jh->b_next_transaction = NULL; ++ ++ if (jh->b_transaction != NULL) { ++ __journal_file_buffer(jh, jh->b_transaction, BJ_Metadata); ++ J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING); ++ } else { ++ /* Onto BUF_DIRTY for writeback */ ++ refile_buffer(jh2bh(jh)); ++ } ++} ++ ++/* ++ * For the unlocked version of this call, also make sure that any ++ * hanging journal_head is cleaned up if necessary. ++ * ++ * __journal_refile_buffer is usually called as part of a single locked ++ * operation on a buffer_head, in which the caller is probably going to ++ * be hooking the journal_head onto other lists. In that case it is up ++ * to the caller to remove the journal_head if necessary. For the ++ * unlocked journal_refile_buffer call, the caller isn't going to be ++ * doing anything else to the buffer so we need to do the cleanup ++ * ourselves to avoid a jh leak. ++ * ++ * *** The journal_head may be freed by this call! 
*** ++ */ ++void journal_refile_buffer(struct journal_head *jh) ++{ ++ struct buffer_head *bh; ++ ++ spin_lock(&journal_datalist_lock); ++ bh = jh2bh(jh); ++ ++ __journal_refile_buffer(jh); ++ __journal_remove_journal_head(bh); ++ ++ spin_unlock(&journal_datalist_lock); ++ __brelse(bh); ++} diff --git a/lustre/kernel_patches/patches/add_page_private-2.4.19-bgl.patch b/lustre/kernel_patches/patches/add_page_private-2.4.19-bgl.patch new file mode 100644 index 0000000..9bb754a --- /dev/null +++ b/lustre/kernel_patches/patches/add_page_private-2.4.19-bgl.patch @@ -0,0 +1,15 @@ + include/linux/mm.h | 1 + + 1 files changed, 1 insertion(+) + +Index: linux.mcp2/include/linux/mm.h +=================================================================== +--- linux.mcp2.orig/include/linux/mm.h 2004-05-05 14:32:29.000000000 -0700 ++++ linux.mcp2/include/linux/mm.h 2004-05-05 14:46:54.000000000 -0700 +@@ -162,6 +162,7 @@ + protected by pagemap_lru_lock !! */ + struct page **pprev_hash; /* Complement to *next_hash. */ + struct buffer_head * buffers; /* Buffer maps us to a disk block. */ ++ unsigned long private; + + /* + * On machines where all RAM is mapped into kernel address space, diff --git a/lustre/kernel_patches/patches/dev_read_only_hp_2.4.19-suse.patch b/lustre/kernel_patches/patches/dev_read_only_hp_2.4.19-suse.patch new file mode 100644 index 0000000..f3de3c3 --- /dev/null +++ b/lustre/kernel_patches/patches/dev_read_only_hp_2.4.19-suse.patch @@ -0,0 +1,77 @@ + drivers/block/blkpg.c | 36 ++++++++++++++++++++++++++++++++++++ + drivers/block/loop.c | 3 +++ + drivers/ide/ide-disk.c | 4 ++++ + 3 files changed, 43 insertions(+) + +Index: linux-2.4.19.SuSE/drivers/block/blkpg.c +=================================================================== +--- linux-2.4.19.SuSE.orig/drivers/block/blkpg.c 2004-04-29 16:19:25.000000000 -0700 ++++ linux-2.4.19.SuSE/drivers/block/blkpg.c 2004-04-29 16:35:09.000000000 -0700 +@@ -296,3 +296,38 @@ + } + + EXPORT_SYMBOL(blk_ioctl); ++ ++ ++#define NUM_DEV_NO_WRITE 16 ++static int dev_no_write[NUM_DEV_NO_WRITE]; ++/* ++ * Debug code for turning block devices "read-only" (will discard writes ++ * silently). This is for filesystem crash/recovery testing. 
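++ *
++ * Intended use, sketched from the functions defined below:
++ *
++ *	dev_set_rdonly(dev, i);		-- writes to dev now dropped
++ *	... run test workload, simulate a crash ...
++ *	dev_clear_rdonly(i);		-- normal behaviour restored
++ *
++ * The block drivers consult dev_check_rdonly(dev) before issuing a
++ * WRITE, as the loop.c and ide-disk.c hunks below do.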
++ */ ++void dev_set_rdonly(kdev_t dev, int no_write) ++{ ++ if (dev) { ++ printk(KERN_WARNING "Turning device %s read-only\n", ++ bdevname(dev)); ++ dev_no_write[no_write] = 0xdead0000 + dev; ++ } ++} ++ ++int dev_check_rdonly(kdev_t dev) { ++ int i; ++ ++ for (i = 0; i < NUM_DEV_NO_WRITE; i++) { ++ if ((dev_no_write[i] & 0xffff0000) == 0xdead0000 && ++ dev == (dev_no_write[i] & 0xffff)) ++ return 1; ++ } ++ return 0; ++} ++ ++void dev_clear_rdonly(int no_write) { ++ dev_no_write[no_write] = 0; ++} ++ ++EXPORT_SYMBOL(dev_set_rdonly); ++EXPORT_SYMBOL(dev_check_rdonly); ++EXPORT_SYMBOL(dev_clear_rdonly); +Index: linux-2.4.19.SuSE/drivers/block/loop.c +=================================================================== +--- linux-2.4.19.SuSE.orig/drivers/block/loop.c 2004-04-29 16:19:25.000000000 -0700 ++++ linux-2.4.19.SuSE/drivers/block/loop.c 2004-04-29 16:32:56.000000000 -0700 +@@ -478,6 +478,9 @@ + spin_unlock_irq(&lo->lo_lock); + + if (rw == WRITE) { ++ if (dev_check_rdonly(rbh->b_rdev)) ++ goto err; ++ + if (lo->lo_flags & LO_FLAGS_READ_ONLY) + goto err; + } else if (rw == READA) { +Index: linux-2.4.19.SuSE/drivers/ide/ide-disk.c +=================================================================== +--- linux-2.4.19.SuSE.orig/drivers/ide/ide-disk.c 2004-04-29 16:18:55.000000000 -0700 ++++ linux-2.4.19.SuSE/drivers/ide/ide-disk.c 2004-04-29 16:32:56.000000000 -0700 +@@ -558,6 +558,10 @@ + */ + static ide_startstop_t do_rw_disk (ide_drive_t *drive, struct request *rq, unsigned long block) + { ++ if (rq->cmd == WRITE && dev_check_rdonly(rq->rq_dev)) { ++ ide_end_request(1, HWGROUP(drive)); ++ return ide_stopped; ++ } + if (IDE_CONTROL_REG) + OUT_BYTE(drive->ctl,IDE_CONTROL_REG); + diff --git a/lustre/kernel_patches/patches/export-show_task-2.4-bgl.patch b/lustre/kernel_patches/patches/export-show_task-2.4-bgl.patch new file mode 100644 index 0000000..a7bdb63 --- /dev/null +++ b/lustre/kernel_patches/patches/export-show_task-2.4-bgl.patch @@ -0,0 +1,32 @@ +Index: linux-bgl/kernel/sched.c +=================================================================== +--- linux-bgl.orig/kernel/sched.c 2003-07-02 08:43:33.000000000 -0700 ++++ linux-bgl/kernel/sched.c 2004-10-26 23:37:44.314193755 -0700 +@@ -1124,7 +1124,7 @@ + return retval; + } + +-static void show_task(struct task_struct * p) ++void show_task(struct task_struct * p) + { + unsigned long free = 0; + int state; +Index: linux-bgl/kernel/ksyms.c +=================================================================== +--- linux-bgl.orig/kernel/ksyms.c 2004-10-26 23:23:00.518654978 -0700 ++++ linux-bgl/kernel/ksyms.c 2004-10-26 23:38:29.289071295 -0700 +@@ -76,6 +76,7 @@ + }; + #endif + ++void show_task(struct task_struct *); + + EXPORT_SYMBOL(inter_module_register); + EXPORT_SYMBOL(inter_module_unregister); +@@ -595,3 +596,6 @@ + + EXPORT_SYMBOL(tasklist_lock); + EXPORT_SYMBOL(pidhash); ++ ++/* debug */ ++EXPORT_SYMBOL(show_task); diff --git a/lustre/kernel_patches/patches/export-truncate-bgl.patch b/lustre/kernel_patches/patches/export-truncate-bgl.patch new file mode 100644 index 0000000..9508215 --- /dev/null +++ b/lustre/kernel_patches/patches/export-truncate-bgl.patch @@ -0,0 +1,37 @@ + include/linux/mm.h | 1 + + mm/filemap.c | 3 ++- + 2 files changed, 3 insertions(+), 1 deletion(-) + +Index: linux-ion/include/linux/mm.h +=================================================================== +--- linux-ion.orig/include/linux/mm.h 2004-07-28 14:34:57.000000000 -0700 ++++ linux-ion/include/linux/mm.h 2004-09-27 15:07:50.000000000 -0700 +@@ 
-593,6 +593,7 @@ + /* filemap.c */ + extern void remove_inode_page(struct page *); + extern unsigned long page_unuse(struct page *); ++extern void truncate_complete_page(struct page *); + extern void truncate_inode_pages(struct address_space *, loff_t); + + /* generic vm_area_ops exported for stackable file systems */ +Index: linux-ion/mm/filemap.c +=================================================================== +--- linux-ion.orig/mm/filemap.c 2004-07-28 14:34:57.000000000 -0700 ++++ linux-ion/mm/filemap.c 2004-09-27 15:08:13.000000000 -0700 +@@ -231,7 +231,7 @@ + do_flushpage(page, partial); + } + +-static void truncate_complete_page(struct page *page) ++void truncate_complete_page(struct page *page) + { + /* Leave it on the LRU if it gets converted into anonymous buffers */ + if (!page->buffers || do_flushpage(page, 0)) +@@ -249,6 +249,7 @@ + remove_inode_page(page); + page_cache_release(page); + } ++EXPORT_SYMBOL(truncate_complete_page); + + static int FASTCALL(truncate_list_pages(struct list_head *, unsigned long, unsigned *)); + static int truncate_list_pages(struct list_head *head, unsigned long start, unsigned *partial) diff --git a/lustre/kernel_patches/patches/exports_2.4.19-bgl.patch b/lustre/kernel_patches/patches/exports_2.4.19-bgl.patch new file mode 100644 index 0000000..82a0182 --- /dev/null +++ b/lustre/kernel_patches/patches/exports_2.4.19-bgl.patch @@ -0,0 +1,42 @@ + + + +Index: linux-ion/kernel/ksyms.c +=================================================================== +--- linux-ion.orig/kernel/ksyms.c 2004-07-28 14:34:57.000000000 -0700 ++++ linux-ion/kernel/ksyms.c 2004-09-27 15:04:52.000000000 -0700 +@@ -286,6 +286,10 @@ + EXPORT_SYMBOL(dcache_readdir); + EXPORT_SYMBOL(dcache_dir_ops); + ++/* lustre */ ++EXPORT_SYMBOL(panic_notifier_list); ++EXPORT_SYMBOL(do_kern_mount); ++ + /* for stackable file systems (lofs, wrapfs, cryptfs, etc.) */ + EXPORT_SYMBOL(default_llseek); + EXPORT_SYMBOL(dentry_open); +Index: linux-ion/include/linux/fs.h +=================================================================== +--- linux-ion.orig/include/linux/fs.h 2004-07-28 14:34:57.000000000 -0700 ++++ linux-ion/include/linux/fs.h 2004-09-27 15:04:52.000000000 -0700 +@@ -1050,6 +1050,7 @@ + extern struct vfsmount *kern_mount(struct file_system_type *); + extern int may_umount(struct vfsmount *); + extern long do_mount(char *, char *, char *, unsigned long, void *); ++struct vfsmount *do_kern_mount(const char *fstype, int flags, char *name, void *data); + extern void umount_tree(struct vfsmount *); + + #define kern_umount mntput +Index: linux-ion/mm/memory.c +=================================================================== +--- linux-ion.orig/mm/memory.c 2004-07-28 14:34:57.000000000 -0700 ++++ linux-ion/mm/memory.c 2004-09-27 15:05:56.000000000 -0700 +@@ -401,6 +401,7 @@ + mm->rss = 0; + spin_unlock(&mm->page_table_lock); + } ++EXPORT_SYMBOL(zap_page_range); + + /* + * Do a quick page-table lookup for a single page. 
diff --git a/lustre/kernel_patches/patches/exports_2.4.19-suse2.patch b/lustre/kernel_patches/patches/exports_2.4.19-suse2.patch new file mode 100644 index 0000000..41744b9 --- /dev/null +++ b/lustre/kernel_patches/patches/exports_2.4.19-suse2.patch @@ -0,0 +1,59 @@ + fs/ext3/Makefile | 2 ++ + fs/ext3/super.c | 2 +- + include/linux/fs.h | 1 + + kernel/ksyms.c | 4 ++++ + 4 files changed, 8 insertions(+), 1 deletion(-) + +Index: linux-2.4.19.SuSE/fs/ext3/Makefile +=================================================================== +--- linux-2.4.19.SuSE.orig/fs/ext3/Makefile 2004-04-29 16:18:08.000000000 -0700 ++++ linux-2.4.19.SuSE/fs/ext3/Makefile 2004-04-29 16:36:09.000000000 -0700 +@@ -9,6 +9,8 @@ + + O_TARGET := ext3.o + ++export-objs := super.o inode.o ++ + obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ + ioctl.o namei.o super.o symlink.o + obj-m := $(O_TARGET) +Index: linux-2.4.19.SuSE/fs/ext3/super.c +=================================================================== +--- linux-2.4.19.SuSE.orig/fs/ext3/super.c 2004-04-29 16:18:08.000000000 -0700 ++++ linux-2.4.19.SuSE/fs/ext3/super.c 2004-04-29 16:36:09.000000000 -0700 +@@ -1821,7 +1821,7 @@ + exit_ext3_xattr(); + } + +-EXPORT_NO_SYMBOLS; ++EXPORT_SYMBOL(ext3_bread); + + MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); + MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions"); +Index: linux-2.4.19.SuSE/include/linux/fs.h +=================================================================== +--- linux-2.4.19.SuSE.orig/include/linux/fs.h 2004-04-29 16:19:41.000000000 -0700 ++++ linux-2.4.19.SuSE/include/linux/fs.h 2004-04-29 16:36:52.000000000 -0700 +@@ -1174,6 +1174,7 @@ + extern struct vfsmount *kern_mount(struct file_system_type *); + extern int may_umount(struct vfsmount *); + extern long do_mount(char *, char *, char *, unsigned long, void *); ++struct vfsmount *do_kern_mount(const char *type, int flags, char *name, void *data); + extern void umount_tree(struct vfsmount *); + + #define kern_umount mntput +Index: linux-2.4.19.SuSE/kernel/ksyms.c +=================================================================== +--- linux-2.4.19.SuSE.orig/kernel/ksyms.c 2004-04-29 16:19:35.000000000 -0700 ++++ linux-2.4.19.SuSE/kernel/ksyms.c 2004-04-29 16:36:09.000000000 -0700 +@@ -330,6 +330,10 @@ + EXPORT_SYMBOL(dcache_readdir); + EXPORT_SYMBOL(dcache_dir_ops); + ++/* lustre */ ++EXPORT_SYMBOL(pagecache_lock_cacheline); ++EXPORT_SYMBOL(do_kern_mount); ++ + /* for stackable file systems (lofs, wrapfs, cryptfs, etc.) 
*/ + EXPORT_SYMBOL(default_llseek); + EXPORT_SYMBOL(dentry_open); diff --git a/lustre/kernel_patches/patches/ext-2.4-patch-1-2.4.19-suse.patch b/lustre/kernel_patches/patches/ext-2.4-patch-1-2.4.19-suse.patch new file mode 100644 index 0000000..a6b126e --- /dev/null +++ b/lustre/kernel_patches/patches/ext-2.4-patch-1-2.4.19-suse.patch @@ -0,0 +1,2552 @@ + fs/ext3/Makefile | 2 + fs/ext3/dir.c | 299 +++++++++ + fs/ext3/file.c | 3 + fs/ext3/hash.c | 215 ++++++ + fs/ext3/namei.c | 1388 ++++++++++++++++++++++++++++++++++++++++----- + fs/ext3/super.c | 7 + include/linux/ext3_fs.h | 85 ++ + include/linux/ext3_fs_sb.h | 2 + include/linux/ext3_jbd.h | 2 + include/linux/rbtree.h | 2 + lib/rbtree.c | 42 + + 11 files changed, 1887 insertions(+), 160 deletions(-) + +Index: linux-2.4.19.SuSE/fs/ext3/Makefile +=================================================================== +--- linux-2.4.19.SuSE.orig/fs/ext3/Makefile 2004-05-27 11:07:21.000000000 -0700 ++++ linux-2.4.19.SuSE/fs/ext3/Makefile 2004-05-27 11:08:28.000000000 -0700 +@@ -12,7 +12,7 @@ + export-objs := super.o inode.o + + obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ +- ioctl.o namei.o super.o symlink.o ++ ioctl.o namei.o super.o symlink.o hash.o + obj-m := $(O_TARGET) + + obj-$(CONFIG_EXT3_FS_XATTR) += xattr.o +Index: linux-2.4.19.SuSE/fs/ext3/dir.c +=================================================================== +--- linux-2.4.19.SuSE.orig/fs/ext3/dir.c 2001-11-09 14:25:04.000000000 -0800 ++++ linux-2.4.19.SuSE/fs/ext3/dir.c 2004-05-27 11:08:28.000000000 -0700 +@@ -21,12 +21,16 @@ + #include + #include + #include ++#include ++#include + + static unsigned char ext3_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK + }; + + static int ext3_readdir(struct file *, void *, filldir_t); ++static int ext3_dx_readdir(struct file * filp, ++ void * dirent, filldir_t filldir); + + struct file_operations ext3_dir_operations = { + read: generic_read_dir, +@@ -35,6 +39,17 @@ + fsync: ext3_sync_file, /* BKL held */ + }; + ++ ++static unsigned char get_dtype(struct super_block *sb, int filetype) ++{ ++ if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE) || ++ (filetype >= EXT3_FT_MAX)) ++ return DT_UNKNOWN; ++ ++ return (ext3_filetype_table[filetype]); ++} ++ ++ + int ext3_check_dir_entry (const char * function, struct inode * dir, + struct ext3_dir_entry_2 * de, + struct buffer_head * bh, +@@ -79,6 +94,16 @@ + + sb = inode->i_sb; + ++ if (is_dx(inode)) { ++ err = ext3_dx_readdir(filp, dirent, filldir); ++ if (err != ERR_BAD_DX_DIR) ++ return err; ++ /* ++ * We don't set the inode dirty flag since it's not ++ * critical that it get flushed back to the disk. ++ */ ++ EXT3_I(filp->f_dentry->d_inode)->i_flags &= ~EXT3_INDEX_FL; ++ } + stored = 0; + bh = NULL; + offset = filp->f_pos & (sb->s_blocksize - 1); +@@ -162,18 +187,12 @@ + * during the copy operation. + */ + unsigned long version = filp->f_version; +- unsigned char d_type = DT_UNKNOWN; + +- if (EXT3_HAS_INCOMPAT_FEATURE(sb, +- EXT3_FEATURE_INCOMPAT_FILETYPE) +- && de->file_type < EXT3_FT_MAX) +- d_type = +- ext3_filetype_table[de->file_type]; + error = filldir(dirent, de->name, + de->name_len, + filp->f_pos, + le32_to_cpu(de->inode), +- d_type); ++ get_dtype(sb, de->file_type)); + if (error) + break; + if (version != filp->f_version) +@@ -188,3 +207,269 @@ + UPDATE_ATIME(inode); + return 0; + } ++ ++#ifdef CONFIG_EXT3_INDEX ++/* ++ * These functions convert from the major/minor hash to an f_pos ++ * value. 
++ *
++ * Currently we only use major hash number.  This is unfortunate, but
++ * on 32-bit machines, the same VFS interface is used for lseek and
++ * llseek, so if we use the 64 bit offset, then the 32-bit versions of
++ * lseek/telldir/seekdir will blow out spectacularly, and from within
++ * the ext2 low-level routine, we don't know if we're being called by
++ * a 64-bit version of the system call or the 32-bit version of the
++ * system call.  Worse yet, NFSv2 only allows for a 32-bit readdir
++ * cookie.  Sigh.
++ */
++#define hash2pos(major, minor)	(major >> 1)
++#define pos2maj_hash(pos)	((pos << 1) & 0xffffffff)
++#define pos2min_hash(pos)	(0)
++
++/*
++ * This structure holds the nodes of the red-black tree used to store
++ * the directory entry in hash order.
++ */
++struct fname {
++	__u32		hash;
++	__u32		minor_hash;
++	rb_node_t	rb_hash;
++	struct fname	*next;
++	__u32		inode;
++	__u8		name_len;
++	__u8		file_type;
++	char		name[0];
++};
++
++/*
++ * This function implements a non-recursive way of freeing all of the
++ * nodes in the red-black tree.
++ */
++static void free_rb_tree_fname(rb_root_t *root)
++{
++	rb_node_t	*n = root->rb_node;
++	rb_node_t	*parent;
++	struct fname	*fname;
++
++	while (n) {
++		/* Do the node's children first */
++		if ((n)->rb_left) {
++			n = n->rb_left;
++			continue;
++		}
++		if (n->rb_right) {
++			n = n->rb_right;
++			continue;
++		}
++		/*
++		 * The node has no children; free it, and then zero
++		 * out parent's link to it.  Finally go to the
++		 * beginning of the loop and try to free the parent
++		 * node.
++		 */
++		parent = n->rb_parent;
++		fname = rb_entry(n, struct fname, rb_hash);
++		kfree(fname);
++		if (!parent)
++			root->rb_node = 0;
++		else if (parent->rb_left == n)
++			parent->rb_left = 0;
++		else if (parent->rb_right == n)
++			parent->rb_right = 0;
++		n = parent;
++	}
++	root->rb_node = 0;
++}
++
++
++struct dir_private_info *create_dir_info(loff_t pos)
++{
++	struct dir_private_info *p;
++
++	p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL);
++	if (!p)
++		return NULL;
++	p->root.rb_node = 0;
++	p->curr_node = 0;
++	p->extra_fname = 0;
++	p->last_pos = 0;
++	p->curr_hash = pos2maj_hash(pos);
++	p->curr_minor_hash = pos2min_hash(pos);
++	p->next_hash = 0;
++	return p;
++}
++
++void ext3_htree_free_dir_info(struct dir_private_info *p)
++{
++	free_rb_tree_fname(&p->root);
++	kfree(p);
++}
++
++/*
++ * Given a directory entry, enter it into the fname rb tree.
++ */
++void ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
++			     __u32 minor_hash,
++			     struct ext3_dir_entry_2 *dirent)
++{
++	rb_node_t **p, *parent = NULL;
++	struct fname * fname, *new_fn;
++	struct dir_private_info *info;
++	int len;
++
++	info = (struct dir_private_info *) dir_file->private_data;
++	p = &info->root.rb_node;
++
++	/* Create and allocate the fname structure */
++	len = sizeof(struct fname) + dirent->name_len + 1;
++	new_fn = kmalloc(len, GFP_KERNEL);
++	memset(new_fn, 0, len);
++	new_fn->hash = hash;
++	new_fn->minor_hash = minor_hash;
++	new_fn->inode = le32_to_cpu(dirent->inode);
++	new_fn->name_len = dirent->name_len;
++	new_fn->file_type = dirent->file_type;
++	memcpy(new_fn->name, dirent->name, dirent->name_len);
++	new_fn->name[dirent->name_len] = 0;
++
++	while (*p) {
++		parent = *p;
++		fname = rb_entry(parent, struct fname, rb_hash);
++
++		/*
++		 * If the hash and minor hash match up, then we put
++		 * them on a linked list.  This rarely happens...
++		 */
++		if ((new_fn->hash == fname->hash) &&
++		    (new_fn->minor_hash == fname->minor_hash)) {
++			new_fn->next = fname->next;
++			fname->next = new_fn;
++			return;
++		}
++
++		if (new_fn->hash < fname->hash)
++			p = &(*p)->rb_left;
++		else if (new_fn->hash > fname->hash)
++			p = &(*p)->rb_right;
++		else if (new_fn->minor_hash < fname->minor_hash)
++			p = &(*p)->rb_left;
++		else /* if (new_fn->minor_hash > fname->minor_hash) */
++			p = &(*p)->rb_right;
++	}
++
++	rb_link_node(&new_fn->rb_hash, parent, p);
++	rb_insert_color(&new_fn->rb_hash, &info->root);
++}
++
++
++
++/*
++ * This is a helper function for ext3_dx_readdir.  It calls filldir
++ * for all entries on the fname linked list.  (Normally there is only
++ * one entry on the linked list, unless there are 62 bit hash collisions.)
++ */
++static int call_filldir(struct file * filp, void * dirent,
++			filldir_t filldir, struct fname *fname)
++{
++	struct dir_private_info *info = filp->private_data;
++	loff_t	curr_pos;
++	struct inode *inode = filp->f_dentry->d_inode;
++	struct super_block * sb;
++	int error;
++
++	sb = inode->i_sb;
++
++	if (!fname) {
++		printk("call_filldir: called with null fname?!?\n");
++		return 0;
++	}
++	curr_pos = hash2pos(fname->hash, fname->minor_hash);
++	while (fname) {
++		error = filldir(dirent, fname->name,
++				fname->name_len, curr_pos,
++				fname->inode,
++				get_dtype(sb, fname->file_type));
++		if (error) {
++			filp->f_pos = curr_pos;
++			info->extra_fname = fname->next;
++			return error;
++		}
++		fname = fname->next;
++	}
++	return 0;
++}
++
++static int ext3_dx_readdir(struct file * filp,
++			 void * dirent, filldir_t filldir)
++{
++	struct dir_private_info *info = filp->private_data;
++	struct inode *inode = filp->f_dentry->d_inode;
++	struct fname *fname;
++	int	ret;
++
++	if (!info) {
++		info = create_dir_info(filp->f_pos);
++		if (!info)
++			return -ENOMEM;
++		filp->private_data = info;
++	}
++
++	/* Someone has messed with f_pos; reset the world */
++	if (info->last_pos != filp->f_pos) {
++		free_rb_tree_fname(&info->root);
++		info->curr_node = 0;
++		info->extra_fname = 0;
++		info->curr_hash = pos2maj_hash(filp->f_pos);
++		info->curr_minor_hash = pos2min_hash(filp->f_pos);
++	}
++
++	/*
++	 * If there are any leftover names on the hash collision
++	 * chain, return them first.
++	 */
++	if (info->extra_fname &&
++	    call_filldir(filp, dirent, filldir, info->extra_fname))
++		goto finished;
++
++	if (!info->curr_node)
++		info->curr_node = rb_get_first(&info->root);
++
++	while (1) {
++		/*
++		 * Fill the rbtree if we have no more entries,
++		 * or the inode has changed since we last read in the
++		 * cached entries.
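++		 *
++		 * In outline, each pass of the loop below is:
++		 *
++		 *	if (tree empty || inode changed)
++		 *		ret = ext3_htree_fill_tree(...);
++		 *	emit cached nodes via call_filldir();
++		 *	if (tree exhausted)
++		 *		curr_hash = next_hash;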
++ */ ++ if ((!info->curr_node) || ++ (filp->f_version != inode->i_version)) { ++ info->curr_node = 0; ++ free_rb_tree_fname(&info->root); ++ filp->f_version = inode->i_version; ++ ret = ext3_htree_fill_tree(filp, info->curr_hash, ++ info->curr_minor_hash, ++ &info->next_hash); ++ if (ret < 0) ++ return ret; ++ if (ret == 0) ++ break; ++ info->curr_node = rb_get_first(&info->root); ++ } ++ ++ fname = rb_entry(info->curr_node, struct fname, rb_hash); ++ info->curr_hash = fname->hash; ++ info->curr_minor_hash = fname->minor_hash; ++ if (call_filldir(filp, dirent, filldir, fname)) ++ break; ++ ++ info->curr_node = rb_get_next(info->curr_node); ++ if (!info->curr_node) { ++ info->curr_hash = info->next_hash; ++ info->curr_minor_hash = 0; ++ } ++ } ++finished: ++ info->last_pos = filp->f_pos; ++ UPDATE_ATIME(inode); ++ return 0; ++} ++#endif +Index: linux-2.4.19.SuSE/fs/ext3/namei.c +=================================================================== +--- linux-2.4.19.SuSE.orig/fs/ext3/namei.c 2002-12-04 09:46:03.000000000 -0800 ++++ linux-2.4.19.SuSE/fs/ext3/namei.c 2004-05-27 11:08:52.000000000 -0700 +@@ -16,6 +16,12 @@ + * David S. Miller (davem@caip.rutgers.edu), 1995 + * Directory entry file type support and forward compatibility hooks + * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998 ++ * Hash Tree Directory indexing (c) ++ * Daniel Phillips, 2001 ++ * Hash Tree Directory indexing porting ++ * Christopher Li, 2002 ++ * Hash Tree Directory indexing cleanup ++ * Theodore Ts'o, 2002 + */ + + #include +@@ -40,6 +46,630 @@ + #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) + #define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) + ++static struct buffer_head *ext3_append(handle_t *handle, ++ struct inode *inode, ++ u32 *block, int *err) ++{ ++ struct buffer_head *bh; ++ ++ *block = inode->i_size >> inode->i_sb->s_blocksize_bits; ++ ++ if ((bh = ext3_bread(handle, inode, *block, 1, err))) { ++ inode->i_size += inode->i_sb->s_blocksize; ++ EXT3_I(inode)->i_disksize = inode->i_size; ++ ext3_journal_get_write_access(handle,bh); ++ } ++ return bh; ++} ++ ++#ifndef assert ++#define assert(test) J_ASSERT(test) ++#endif ++ ++#ifndef swap ++#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0) ++#endif ++ ++typedef struct { u32 v; } le_u32; ++typedef struct { u16 v; } le_u16; ++ ++#ifdef DX_DEBUG ++#define dxtrace(command) command ++#else ++#define dxtrace(command) ++#endif ++ ++struct fake_dirent ++{ ++ /*le*/u32 inode; ++ /*le*/u16 rec_len; ++ u8 name_len; ++ u8 file_type; ++}; ++ ++struct dx_countlimit ++{ ++ le_u16 limit; ++ le_u16 count; ++}; ++ ++struct dx_entry ++{ ++ le_u32 hash; ++ le_u32 block; ++}; ++ ++/* ++ * dx_root_info is laid out so that if it should somehow get overlaid by a ++ * dirent the two low bits of the hash version will be zero. Therefore, the ++ * hash version mod 4 should never be 0. Sincerely, the paranoia department. 
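++ *
++ * On-disk layout of the root index block, as implied by struct
++ * dx_root below (a sketch, field sizes elided):
++ *
++ *	[fake "." dirent][fake ".." dirent][dx_root_info][dx_entry 0..N]
++ *
++ * The two fake dirents keep the block parseable as an ordinary
++ * directory block by kernels that do not understand the index.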
++ */ ++ ++struct dx_root ++{ ++ struct fake_dirent dot; ++ char dot_name[4]; ++ struct fake_dirent dotdot; ++ char dotdot_name[4]; ++ struct dx_root_info ++ { ++ le_u32 reserved_zero; ++ u8 hash_version; ++ u8 info_length; /* 8 */ ++ u8 indirect_levels; ++ u8 unused_flags; ++ } ++ info; ++ struct dx_entry entries[0]; ++}; ++ ++struct dx_node ++{ ++ struct fake_dirent fake; ++ struct dx_entry entries[0]; ++}; ++ ++ ++struct dx_frame ++{ ++ struct buffer_head *bh; ++ struct dx_entry *entries; ++ struct dx_entry *at; ++}; ++ ++struct dx_map_entry ++{ ++ u32 hash; ++ u32 offs; ++}; ++ ++#ifdef CONFIG_EXT3_INDEX ++static inline unsigned dx_get_block (struct dx_entry *entry); ++static void dx_set_block (struct dx_entry *entry, unsigned value); ++static inline unsigned dx_get_hash (struct dx_entry *entry); ++static void dx_set_hash (struct dx_entry *entry, unsigned value); ++static unsigned dx_get_count (struct dx_entry *entries); ++static unsigned dx_get_limit (struct dx_entry *entries); ++static void dx_set_count (struct dx_entry *entries, unsigned value); ++static void dx_set_limit (struct dx_entry *entries, unsigned value); ++static unsigned dx_root_limit (struct inode *dir, unsigned infosize); ++static unsigned dx_node_limit (struct inode *dir); ++static struct dx_frame *dx_probe(struct dentry *dentry, ++ struct inode *dir, ++ struct dx_hash_info *hinfo, ++ struct dx_frame *frame, ++ int *err); ++static void dx_release (struct dx_frame *frames); ++static int dx_make_map (struct ext3_dir_entry_2 *de, int size, ++ struct dx_hash_info *hinfo, struct dx_map_entry map[]); ++static void dx_sort_map(struct dx_map_entry *map, unsigned count); ++static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to, ++ struct dx_map_entry *offsets, int count); ++static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size); ++static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block); ++static int ext3_htree_next_block(struct inode *dir, __u32 hash, ++ struct dx_frame *frame, ++ struct dx_frame *frames, int *err, ++ __u32 *start_hash); ++static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, ++ struct ext3_dir_entry_2 **res_dir, int *err); ++static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, ++ struct inode *inode); ++ ++/* ++ * Future: use high four bits of block for coalesce-on-delete flags ++ * Mask them off for now. 
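++ *
++ * e.g. dx_get_block() below keeps only the low 24 bits, so a stored
++ * value of 0x01000005 reads back as logical block 5 once the high
++ * coalesce-on-delete bits are masked off.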
++ */ ++ ++static inline unsigned dx_get_block (struct dx_entry *entry) ++{ ++ return le32_to_cpu(entry->block.v) & 0x00ffffff; ++} ++ ++static inline void dx_set_block (struct dx_entry *entry, unsigned value) ++{ ++ entry->block.v = cpu_to_le32(value); ++} ++ ++static inline unsigned dx_get_hash (struct dx_entry *entry) ++{ ++ return le32_to_cpu(entry->hash.v); ++} ++ ++static inline void dx_set_hash (struct dx_entry *entry, unsigned value) ++{ ++ entry->hash.v = cpu_to_le32(value); ++} ++ ++static inline unsigned dx_get_count (struct dx_entry *entries) ++{ ++ return le16_to_cpu(((struct dx_countlimit *) entries)->count.v); ++} ++ ++static inline unsigned dx_get_limit (struct dx_entry *entries) ++{ ++ return le16_to_cpu(((struct dx_countlimit *) entries)->limit.v); ++} ++ ++static inline void dx_set_count (struct dx_entry *entries, unsigned value) ++{ ++ ((struct dx_countlimit *) entries)->count.v = cpu_to_le16(value); ++} ++ ++static inline void dx_set_limit (struct dx_entry *entries, unsigned value) ++{ ++ ((struct dx_countlimit *) entries)->limit.v = cpu_to_le16(value); ++} ++ ++static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize) ++{ ++ unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) - ++ EXT3_DIR_REC_LEN(2) - infosize; ++ return 0? 20: entry_space / sizeof(struct dx_entry); ++} ++ ++static inline unsigned dx_node_limit (struct inode *dir) ++{ ++ unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0); ++ return 0? 22: entry_space / sizeof(struct dx_entry); ++} ++ ++/* ++ * Debug ++ */ ++#ifdef DX_DEBUG ++struct stats ++{ ++ unsigned names; ++ unsigned space; ++ unsigned bcount; ++}; ++ ++static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3_dir_entry_2 *de, ++ int size, int show_names) ++{ ++ unsigned names = 0, space = 0; ++ char *base = (char *) de; ++ struct dx_hash_info h = *hinfo; ++ ++ printk("names: "); ++ while ((char *) de < base + size) ++ { ++ if (de->inode) ++ { ++ if (show_names) ++ { ++ int len = de->name_len; ++ char *name = de->name; ++ while (len--) printk("%c", *name++); ++ ext3fs_dirhash(de->name, de->name_len, &h); ++ printk(":%x.%u ", h.hash, ++ ((char *) de - base)); ++ } ++ space += EXT3_DIR_REC_LEN(de->name_len); ++ names++; ++ } ++ de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len)); ++ } ++ printk("(%i)\n", names); ++ return (struct stats) { names, space, 1 }; ++} ++ ++struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, ++ struct dx_entry *entries, int levels) ++{ ++ unsigned blocksize = dir->i_sb->s_blocksize; ++ unsigned count = dx_get_count (entries), names = 0, space = 0, i; ++ unsigned bcount = 0; ++ struct buffer_head *bh; ++ int err; ++ printk("%i indexed blocks...\n", count); ++ for (i = 0; i < count; i++, entries++) ++ { ++ u32 block = dx_get_block(entries), hash = i? dx_get_hash(entries): 0; ++ u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash; ++ struct stats stats; ++ printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range); ++ if (!(bh = ext3_bread (NULL,dir, block, 0,&err))) continue; ++ stats = levels? 
++ dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1): ++ dx_show_leaf(hinfo, (struct ext3_dir_entry_2 *) bh->b_data, blocksize, 0); ++ names += stats.names; ++ space += stats.space; ++ bcount += stats.bcount; ++ brelse (bh); ++ } ++ if (bcount) ++ printk("%snames %u, fullness %u (%u%%)\n", levels?"":" ", ++ names, space/bcount,(space/bcount)*100/blocksize); ++ return (struct stats) { names, space, bcount}; ++} ++#endif /* DX_DEBUG */ ++ ++/* ++ * Probe for a directory leaf block to search. ++ * ++ * dx_probe can return ERR_BAD_DX_DIR, which means there was a format ++ * error in the directory index, and the caller should fall back to ++ * searching the directory normally. The callers of dx_probe **MUST** ++ * check for this error code, and make sure it never gets reflected ++ * back to userspace. ++ */ ++static struct dx_frame * ++dx_probe(struct dentry *dentry, struct inode *dir, ++ struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) ++{ ++ unsigned count, indirect; ++ struct dx_entry *at, *entries, *p, *q, *m; ++ struct dx_root *root; ++ struct buffer_head *bh; ++ struct dx_frame *frame = frame_in; ++ u32 hash; ++ ++ frame->bh = NULL; ++ if (dentry) ++ dir = dentry->d_parent->d_inode; ++ if (!(bh = ext3_bread (NULL,dir, 0, 0, err))) ++ goto fail; ++ root = (struct dx_root *) bh->b_data; ++ if (root->info.hash_version != DX_HASH_TEA && ++ root->info.hash_version != DX_HASH_HALF_MD4 && ++ root->info.hash_version != DX_HASH_LEGACY) { ++ ext3_warning(dir->i_sb, __FUNCTION__, ++ "Unrecognised inode hash code %d", ++ root->info.hash_version); ++ brelse(bh); ++ *err = ERR_BAD_DX_DIR; ++ goto fail; ++ } ++ hinfo->hash_version = root->info.hash_version; ++ hinfo->seed = dir->i_sb->u.ext3_sb.s_hash_seed; ++ if (dentry) ++ ext3fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo); ++ hash = hinfo->hash; ++ ++ if (root->info.unused_flags & 1) { ++ ext3_warning(dir->i_sb, __FUNCTION__, ++ "Unimplemented inode hash flags: %#06x", ++ root->info.unused_flags); ++ brelse(bh); ++ *err = ERR_BAD_DX_DIR; ++ goto fail; ++ } ++ ++ if ((indirect = root->info.indirect_levels) > 1) { ++ ext3_warning(dir->i_sb, __FUNCTION__, ++ "Unimplemented inode hash depth: %#06x", ++ root->info.indirect_levels); ++ brelse(bh); ++ *err = ERR_BAD_DX_DIR; ++ goto fail; ++ } ++ ++ entries = (struct dx_entry *) (((char *)&root->info) + ++ root->info.info_length); ++ assert(dx_get_limit(entries) == dx_root_limit(dir, ++ root->info.info_length)); ++ dxtrace (printk("Look up %x", hash)); ++ while (1) ++ { ++ count = dx_get_count(entries); ++ assert (count && count <= dx_get_limit(entries)); ++ p = entries + 1; ++ q = entries + count - 1; ++ while (p <= q) ++ { ++ m = p + (q - p)/2; ++ dxtrace(printk(".")); ++ if (dx_get_hash(m) > hash) ++ q = m - 1; ++ else ++ p = m + 1; ++ } ++ ++ if (0) // linear search cross check ++ { ++ unsigned n = count - 1; ++ at = entries; ++ while (n--) ++ { ++ dxtrace(printk(",")); ++ if (dx_get_hash(++at) > hash) ++ { ++ at--; ++ break; ++ } ++ } ++ assert (at == p - 1); ++ } ++ ++ at = p - 1; ++ dxtrace(printk(" %x->%u\n", at == entries? 
0: dx_get_hash(at), dx_get_block(at))); ++ frame->bh = bh; ++ frame->entries = entries; ++ frame->at = at; ++ if (!indirect--) return frame; ++ if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err))) ++ goto fail2; ++ at = entries = ((struct dx_node *) bh->b_data)->entries; ++ assert (dx_get_limit(entries) == dx_node_limit (dir)); ++ frame++; ++ } ++fail2: ++ while (frame >= frame_in) { ++ brelse(frame->bh); ++ frame--; ++ } ++fail: ++ return NULL; ++} ++ ++static void dx_release (struct dx_frame *frames) ++{ ++ if (frames[0].bh == NULL) ++ return; ++ ++ if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels) ++ brelse(frames[1].bh); ++ brelse(frames[0].bh); ++} ++ ++/* ++ * This function increments the frame pointer to search the next leaf ++ * block, and reads in the necessary intervening nodes if the search ++ * should be necessary. Whether or not the search is necessary is ++ * controlled by the hash parameter. If the hash value is even, then ++ * the search is only continued if the next block starts with that ++ * hash value. This is used if we are searching for a specific file. ++ * ++ * If the hash value is HASH_NB_ALWAYS, then always go to the next block. ++ * ++ * This function returns 1 if the caller should continue to search, ++ * or 0 if it should not. If there is an error reading one of the ++ * index blocks, it will return -1. ++ * ++ * If start_hash is non-null, it will be filled in with the starting ++ * hash of the next page. ++ */ ++static int ext3_htree_next_block(struct inode *dir, __u32 hash, ++ struct dx_frame *frame, ++ struct dx_frame *frames, int *err, ++ __u32 *start_hash) ++{ ++ struct dx_frame *p; ++ struct buffer_head *bh; ++ int num_frames = 0; ++ __u32 bhash; ++ ++ *err = ENOENT; ++ p = frame; ++ /* ++ * Find the next leaf page by incrementing the frame pointer. ++ * If we run out of entries in the interior node, loop around and ++ * increment pointer in the parent node. When we break out of ++ * this loop, num_frames indicates the number of interior ++ * nodes that need to be read. ++ */ ++ while (1) { ++ if (++(p->at) < p->entries + dx_get_count(p->entries)) ++ break; ++ if (p == frames) ++ return 0; ++ num_frames++; ++ p--; ++ } ++ ++ /* ++ * If the hash is 1, then continue only if the next page has a ++ * continuation hash of any value. This is used for readdir ++ * handling. Otherwise, check to see if the hash matches the ++ * desired continuation hash. If it doesn't, return since ++ * there's no point reading in the successive index pages. ++ */ ++ bhash = dx_get_hash(p->at); ++ if (start_hash) ++ *start_hash = bhash; ++ if ((hash & 1) == 0) { ++ if ((bhash & ~1) != hash) ++ return 0; ++ } ++ /* ++ * If the hash is HASH_NB_ALWAYS, we always go to the next ++ * block so no check is necessary ++ */ ++ while (num_frames--) { ++ if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at), ++ 0, err))) ++ return -1; /* Failure */ ++ p++; ++ brelse (p->bh); ++ p->bh = bh; ++ p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; ++ } ++ return 1; ++} ++ ++ ++/* ++ * p is at least 6 bytes before the end of page ++ */ ++static inline struct ext3_dir_entry_2 *ext3_next_entry(struct ext3_dir_entry_2 *p) ++{ ++ return (struct ext3_dir_entry_2 *)((char*)p + le16_to_cpu(p->rec_len)); ++} ++ ++/* ++ * This function fills a red-black tree with information from a ++ * directory. We start scanning the directory in hash order, starting ++ * at start_hash and start_minor_hash.
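ext3_htree_next_block() leans on the convention that real hashes always have their low bit clear (ext3fs_dirhash() masks it off), so an odd value such as HASH_NB_ALWAYS can never match a real hash and means "always continue". A stand-alone sketch of just that continuation test (names are mine, not the patch's):

    #include <stdio.h>

    #define HASH_NB_ALWAYS 1

    /* return 1 to keep walking leaf blocks, 0 to stop */
    static int should_continue(unsigned wanted, unsigned next_block_hash)
    {
            if (wanted & 1)                     /* HASH_NB_ALWAYS: readdir */
                    return 1;
            return (next_block_hash & ~1u) == wanted; /* same major hash? */
    }

    int main(void)
    {
            printf("%d\n", should_continue(0x1234, 0x1234));        /* 1 */
            printf("%d\n", should_continue(0x1234, 0x1235));        /* 1: collision spill */
            printf("%d\n", should_continue(0x1234, 0x5678));        /* 0 */
            printf("%d\n", should_continue(HASH_NB_ALWAYS, 0x5678)); /* 1 */
            return 0;
    }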
++ * ++ * This function returns the number of entries inserted into the tree, ++ * or a negative error code. ++ */ ++int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash, ++ __u32 start_minor_hash, __u32 *next_hash) ++{ ++ struct dx_hash_info hinfo; ++ struct buffer_head *bh; ++ struct ext3_dir_entry_2 *de, *top; ++ static struct dx_frame frames[2], *frame; ++ struct inode *dir; ++ int block, err; ++ int count = 0; ++ int ret; ++ __u32 hashval; ++ ++ dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash, ++ start_minor_hash)); ++ dir = dir_file->f_dentry->d_inode; ++ hinfo.hash = start_hash; ++ hinfo.minor_hash = 0; ++ frame = dx_probe(0, dir_file->f_dentry->d_inode, &hinfo, frames, &err); ++ if (!frame) ++ return err; ++ ++ while (1) { ++ block = dx_get_block(frame->at); ++ dxtrace(printk("Reading block %d\n", block)); ++ if (!(bh = ext3_bread (NULL, dir, block, 0, &err))) ++ goto errout; ++ ++ de = (struct ext3_dir_entry_2 *) bh->b_data; ++ top = (struct ext3_dir_entry_2 *) ((char *) de + dir->i_sb->s_blocksize - ++ EXT3_DIR_REC_LEN(0)); ++ for (; de < top; de = ext3_next_entry(de)) { ++ ext3fs_dirhash(de->name, de->name_len, &hinfo); ++ if ((hinfo.hash < start_hash) || ++ ((hinfo.hash == start_hash) && ++ (hinfo.minor_hash < start_minor_hash))) ++ continue; ++ ext3_htree_store_dirent(dir_file, hinfo.hash, ++ hinfo.minor_hash, de); ++ count++; ++ } ++ brelse (bh); ++ hashval = ~1; ++ ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS, ++ frame, frames, &err, &hashval); ++ if (next_hash) ++ *next_hash = hashval; ++ if (ret == -1) ++ goto errout; ++ /* ++ * Stop if: (a) there are no more entries, or ++ * (b) we have inserted at least one entry and the ++ * next hash value is not a continuation ++ */ ++ if ((ret == 0) || ++ (count && ((hashval & 1) == 0))) ++ break; ++ } ++ dx_release(frames); ++ dxtrace(printk("Fill tree: returned %d entries\n", count)); ++ return count; ++errout: ++ dx_release(frames); ++ return (err); ++} ++ ++ ++/* ++ * Directory block splitting, compacting ++ */ ++ ++static int dx_make_map (struct ext3_dir_entry_2 *de, int size, ++ struct dx_hash_info *hinfo, struct dx_map_entry *map_tail) ++{ ++ int count = 0; ++ char *base = (char *) de; ++ struct dx_hash_info h = *hinfo; ++ ++ while ((char *) de < base + size) ++ { ++ if (de->name_len && de->inode) { ++ ext3fs_dirhash(de->name, de->name_len, &h); ++ map_tail--; ++ map_tail->hash = h.hash; ++ map_tail->offs = (u32) ((char *) de - base); ++ count++; ++ } ++ /* XXX: do we need to check rec_len == 0 case? 
-Chris */ ++ de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len)); ++ } ++ return count; ++} ++ ++static void dx_sort_map (struct dx_map_entry *map, unsigned count) ++{ ++ struct dx_map_entry *p, *q, *top = map + count - 1; ++ int more; ++ /* Combsort until bubble sort doesn't suck */ ++ while (count > 2) ++ { ++ count = count*10/13; ++ if (count - 9 < 2) /* 9, 10 -> 11 */ ++ count = 11; ++ for (p = top, q = p - count; q >= map; p--, q--) ++ if (p->hash < q->hash) ++ swap(*p, *q); ++ } ++ /* Garden variety bubble sort */ ++ do { ++ more = 0; ++ q = top; ++ while (q-- > map) ++ { ++ if (q[1].hash >= q[0].hash) ++ continue; ++ swap(*(q+1), *q); ++ more = 1; ++ } ++ } while(more); ++} ++ ++static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block) ++{ ++ struct dx_entry *entries = frame->entries; ++ struct dx_entry *old = frame->at, *new = old + 1; ++ int count = dx_get_count(entries); ++ ++ assert(count < dx_get_limit(entries)); ++ assert(old < entries + count); ++ memmove(new + 1, new, (char *)(entries + count) - (char *)(new)); ++ dx_set_hash(new, hash); ++ dx_set_block(new, block); ++ dx_set_count(entries, count + 1); ++} ++#endif ++ ++ ++static void ext3_update_dx_flag(struct inode *inode) ++{ ++ if (!EXT3_HAS_COMPAT_FEATURE(inode->i_sb, ++ EXT3_FEATURE_COMPAT_DIR_INDEX)) ++ EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL; ++} ++ + /* + * NOTE! unlike strncmp, ext3_match returns 1 for success, 0 for failure. + * +@@ -96,6 +726,7 @@ + return 0; + } + ++ + /* + * ext3_find_entry() + * +@@ -107,6 +738,8 @@ + * The returned buffer_head has ->b_count elevated. The caller is expected + * to brelse() it when appropriate. + */ ++ ++ + static struct buffer_head * ext3_find_entry (struct dentry *dentry, + struct ext3_dir_entry_2 ** res_dir) + { +@@ -121,12 +754,32 @@ + int num = 0; + int nblocks, i, err; + struct inode *dir = dentry->d_parent->d_inode; ++ int namelen; ++ const u8 *name; ++ unsigned blocksize; + + *res_dir = NULL; + sb = dir->i_sb; +- ++ blocksize = sb->s_blocksize; ++ namelen = dentry->d_name.len; ++ name = dentry->d_name.name; ++ if (namelen > EXT3_NAME_LEN) ++ return NULL; ++#ifdef CONFIG_EXT3_INDEX ++ if (is_dx(dir)) { ++ bh = ext3_dx_find_entry(dentry, res_dir, &err); ++ /* ++ * On success, or if the error was file not found, ++ * return. Otherwise, fall back to doing a search the ++ * old fashioned way. 
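dx_sort_map() above is a comb sort with the usual 10/13 shrink factor and the 9,10 -> 11 gap fixup, finished by a bubble pass once the gap is small. The same strategy runs unchanged in user space; a sketch sorting plain ints instead of dx_map_entry (note that count must be unsigned for the count - 9 < 2 test to behave as in the original):

    #include <stdio.h>

    #define SWAP(x, y) do { int z = (x); (x) = (y); (y) = z; } while (0)

    static void comb_sort(int *map, unsigned count)
    {
            int *p, *q, *top = map + count - 1;
            int more;

            /* Combsort until bubble sort doesn't suck */
            while (count > 2) {
                    count = count * 10 / 13;
                    if (count - 9 < 2)      /* 9, 10 -> 11; unsigned wrap otherwise */
                            count = 11;
                    for (p = top, q = p - count; q >= map; p--, q--)
                            if (*p < *q)
                                    SWAP(*p, *q);
            }
            /* Garden variety bubble sort */
            do {
                    more = 0;
                    for (q = top; q-- > map; )
                            if (q[1] < q[0]) {
                                    SWAP(q[1], q[0]);
                                    more = 1;
                            }
            } while (more);
    }

    int main(void)
    {
            int v[] = { 5, 1, 4, 9, 2, 8, 3, 7, 6, 0 };
            unsigned i, n = sizeof(v) / sizeof(v[0]);

            comb_sort(v, n);
            for (i = 0; i < n; i++)
                    printf("%d ", v[i]);
            printf("\n");               /* 0 1 2 3 4 5 6 7 8 9 */
            return 0;
    }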
++ */ ++ if (bh || (err != ERR_BAD_DX_DIR)) ++ return bh; ++ dxtrace(printk("ext3_find_entry: dx failed, falling back\n")); ++ } ++#endif + nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb); +- start = dir->u.ext3_i.i_dir_start_lookup; ++ start = EXT3_I(dir)->i_dir_start_lookup; + if (start >= nblocks) + start = 0; + block = start; +@@ -167,7 +820,7 @@ + i = search_dirblock(bh, dir, dentry, + block << EXT3_BLOCK_SIZE_BITS(sb), res_dir); + if (i == 1) { +- dir->u.ext3_i.i_dir_start_lookup = block; ++ EXT3_I(dir)->i_dir_start_lookup = block; + ret = bh; + goto cleanup_and_exit; + } else { +@@ -198,6 +851,66 @@ + return ret; + } + ++#ifdef CONFIG_EXT3_INDEX ++static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, ++ struct ext3_dir_entry_2 **res_dir, int *err) ++{ ++ struct super_block * sb; ++ struct dx_hash_info hinfo; ++ u32 hash; ++ struct dx_frame frames[2], *frame; ++ struct ext3_dir_entry_2 *de, *top; ++ struct buffer_head *bh; ++ unsigned long block; ++ int retval; ++ int namelen = dentry->d_name.len; ++ const u8 *name = dentry->d_name.name; ++ struct inode *dir = dentry->d_parent->d_inode; ++ ++ sb = dir->i_sb; ++ if (!(frame = dx_probe (dentry, 0, &hinfo, frames, err))) ++ return NULL; ++ hash = hinfo.hash; ++ do { ++ block = dx_get_block(frame->at); ++ if (!(bh = ext3_bread (NULL,dir, block, 0, err))) ++ goto errout; ++ de = (struct ext3_dir_entry_2 *) bh->b_data; ++ top = (struct ext3_dir_entry_2 *) ((char *) de + sb->s_blocksize - ++ EXT3_DIR_REC_LEN(0)); ++ for (; de < top; de = ext3_next_entry(de)) ++ if (ext3_match (namelen, name, de)) { ++ if (!ext3_check_dir_entry("ext3_find_entry", ++ dir, de, bh, ++ (block<<EXT3_BLOCK_SIZE_BITS(sb)) ++ +((char *)de - bh->b_data))) { ++ brelse (bh); ++ goto errout; ++ } ++ *res_dir = de; ++ dx_release (frames); ++ return bh; ++ } ++ brelse (bh); ++ /* Check to see if we should continue to search */ ++ retval = ext3_htree_next_block(dir, hash, frame, ++ frames, err, 0); ++ if (retval == -1) { ++ ext3_warning(sb, __FUNCTION__, ++ "error reading index page in directory #%lu", ++ dir->i_ino); ++ goto errout; ++ } ++ } while (retval == 1); ++ ++ *err = -ENOENT; ++errout: ++ dxtrace(printk("%s not found\n", name)); ++ dx_release (frames); ++ return NULL; ++} ++#endif ++ + static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry) + { + struct inode * inode; +@@ -214,8 +927,9 @@ + brelse (bh); + inode = iget(dir->i_sb, ino); + +- if (!inode) ++ if (!inode) { + return ERR_PTR(-EACCES); ++ } + } + d_add(dentry, inode); + return NULL; +@@ -239,6 +953,301 @@ + de->file_type = ext3_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; + } + ++#ifdef CONFIG_EXT3_INDEX ++static struct ext3_dir_entry_2 * ++dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count) ++{ ++ unsigned rec_len = 0; ++ ++ while (count--) { ++ struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs); ++ rec_len = EXT3_DIR_REC_LEN(de->name_len); ++ memcpy (to, de, rec_len); ++ ((struct ext3_dir_entry_2 *)to)->rec_len = cpu_to_le16(rec_len); ++ de->inode = 0; ++ map++; ++ to += rec_len; ++ } ++ return (struct ext3_dir_entry_2 *) (to - rec_len); ++} ++ ++static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size) ++{ ++ struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base; ++ unsigned rec_len = 0; ++ ++ prev = to = de; ++ while ((char*)de < base + size) { ++ next = (struct ext3_dir_entry_2 *) ((char *) de + ++ le16_to_cpu(de->rec_len)); ++ if (de->inode && de->name_len) { ++ rec_len = EXT3_DIR_REC_LEN(de->name_len); ++ if (de > to) ++
memmove(to, de, rec_len); ++ to->rec_len = cpu_to_le16(rec_len); ++ prev = to; ++ to = (struct ext3_dir_entry_2 *) (((char *) to) + rec_len); ++ } ++ de = next; ++ } ++ return prev; ++} ++ ++static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, ++ struct buffer_head **bh,struct dx_frame *frame, ++ struct dx_hash_info *hinfo, int *error) ++{ ++ unsigned blocksize = dir->i_sb->s_blocksize; ++ unsigned count, continued; ++ struct buffer_head *bh2; ++ u32 newblock; ++ u32 hash2; ++ struct dx_map_entry *map; ++ char *data1 = (*bh)->b_data, *data2; ++ unsigned split; ++ struct ext3_dir_entry_2 *de = NULL, *de2; ++ int err; ++ ++ bh2 = ext3_append (handle, dir, &newblock, error); ++ if (!(bh2)) { ++ brelse(*bh); ++ *bh = NULL; ++ goto errout; ++ } ++ ++ BUFFER_TRACE(*bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, *bh); ++ if (err) { ++ journal_error: ++ brelse(*bh); ++ brelse(bh2); ++ *bh = NULL; ++ ext3_std_error(dir->i_sb, err); ++ goto errout; ++ } ++ BUFFER_TRACE(frame->bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, frame->bh); ++ if (err) ++ goto journal_error; ++ ++ data2 = bh2->b_data; ++ ++ /* create map in the end of data2 block */ ++ map = (struct dx_map_entry *) (data2 + blocksize); ++ count = dx_make_map ((struct ext3_dir_entry_2 *) data1, ++ blocksize, hinfo, map); ++ map -= count; ++ split = count/2; // need to adjust to actual middle ++ dx_sort_map (map, count); ++ hash2 = map[split].hash; ++ continued = hash2 == map[split - 1].hash; ++ dxtrace(printk("Split block %i at %x, %i/%i\n", ++ dx_get_block(frame->at), hash2, split, count-split)); ++ ++ /* Fancy dance to stay within two buffers */ ++ de2 = dx_move_dirents(data1, data2, map + split, count - split); ++ de = dx_pack_dirents(data1,blocksize); ++ de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de); ++ de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2); ++ dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1)); ++ dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1)); ++ ++ /* Which block gets the new entry? */ ++ if (hinfo->hash >= hash2) ++ { ++ swap(*bh, bh2); ++ de = de2; ++ } ++ dx_insert_block (frame, hash2 + continued, newblock); ++ err = ext3_journal_dirty_metadata (handle, bh2); ++ if (err) ++ goto journal_error; ++ err = ext3_journal_dirty_metadata (handle, frame->bh); ++ if (err) ++ goto journal_error; ++ brelse (bh2); ++ dxtrace(dx_show_index ("frame", frame->entries)); ++errout: ++ return de; ++} ++#endif ++ ++ ++/* ++ * Add a new entry into a directory (leaf) block. If de is non-NULL, ++ * it points to a directory entry which is guaranteed to be large ++ * enough for new directory entry. If de is NULL, then ++ * add_dirent_to_buf will attempt search the directory block for ++ * space. It will return -ENOSPC if no space is available, and -EIO ++ * and -EEXIST if directory entry already exists. ++ * ++ * NOTE! bh is NOT released in the case where ENOSPC is returned. In ++ * all other cases bh is released. 
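do_split() above cuts the sorted map at the middle entry and, when equal hashes straddle the cut, adds one to the new index key; since real hashes are even, that sets the low "continued" bit so lookups still visit both blocks. A toy rendering of just the split-key choice (types and values are made up):

    #include <stdio.h>

    struct map_entry { unsigned hash; unsigned offs; };

    static unsigned split_key(const struct map_entry *map, unsigned count)
    {
            unsigned split = count / 2;  /* "need to adjust to actual middle" */
            unsigned continued = map[split].hash == map[split - 1].hash;

            return map[split].hash + continued;
    }

    int main(void)
    {
            struct map_entry a[] = { {2,0}, {4,0}, {4,0}, {8,0} };
            struct map_entry b[] = { {2,0}, {4,0}, {6,0}, {8,0} };

            printf("%u\n", split_key(a, 4));    /* 5: 4 with continued bit */
            printf("%u\n", split_key(b, 4));    /* 6: clean cut */
            return 0;
    }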
++ */ ++static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, ++ struct inode *inode, struct ext3_dir_entry_2 *de, ++ struct buffer_head * bh) ++{ ++ struct inode *dir = dentry->d_parent->d_inode; ++ const char *name = dentry->d_name.name; ++ int namelen = dentry->d_name.len; ++ unsigned long offset = 0; ++ unsigned short reclen; ++ int nlen, rlen, err; ++ char *top; ++ ++ reclen = EXT3_DIR_REC_LEN(namelen); ++ if (!de) { ++ de = (struct ext3_dir_entry_2 *)bh->b_data; ++ top = bh->b_data + dir->i_sb->s_blocksize - reclen; ++ while ((char *) de <= top) { ++ if (!ext3_check_dir_entry("ext3_add_entry", dir, de, ++ bh, offset)) { ++ brelse (bh); ++ return -EIO; ++ } ++ if (ext3_match (namelen, name, de)) { ++ brelse (bh); ++ return -EEXIST; ++ } ++ nlen = EXT3_DIR_REC_LEN(de->name_len); ++ rlen = le16_to_cpu(de->rec_len); ++ if ((de->inode? rlen - nlen: rlen) >= reclen) ++ break; ++ de = (struct ext3_dir_entry_2 *)((char *)de + rlen); ++ offset += rlen; ++ } ++ if ((char *) de > top) ++ return -ENOSPC; ++ } ++ BUFFER_TRACE(bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, bh); ++ if (err) { ++ ext3_std_error(dir->i_sb, err); ++ brelse(bh); ++ return err; ++ } ++ ++ /* By now the buffer is marked for journaling */ ++ nlen = EXT3_DIR_REC_LEN(de->name_len); ++ rlen = le16_to_cpu(de->rec_len); ++ if (de->inode) { ++ struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen); ++ de1->rec_len = cpu_to_le16(rlen - nlen); ++ de->rec_len = cpu_to_le16(nlen); ++ de = de1; ++ } ++ de->file_type = EXT3_FT_UNKNOWN; ++ if (inode) { ++ de->inode = cpu_to_le32(inode->i_ino); ++ ext3_set_de_type(dir->i_sb, de, inode->i_mode); ++ } else ++ de->inode = 0; ++ de->name_len = namelen; ++ memcpy (de->name, name, namelen); ++ /* ++ * XXX shouldn't update any times until successful ++ * completion of syscall, but too many callers depend ++ * on this. ++ * ++ * XXX similarly, too many callers depend on ++ * ext3_new_inode() setting the times, but error ++ * recovery deletes the inode, so the worst that can ++ * happen is that the times are slightly out of date ++ * and/or different from the directory change time. ++ */ ++ dir->i_mtime = dir->i_ctime = CURRENT_TIME; ++ ext3_update_dx_flag(dir); ++ dir->i_version = ++event; ++ ext3_mark_inode_dirty(handle, dir); ++ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); ++ err = ext3_journal_dirty_metadata(handle, bh); ++ if (err) ++ ext3_std_error(dir->i_sb, err); ++ brelse(bh); ++ return 0; ++} ++ ++#ifdef CONFIG_EXT3_INDEX ++/* ++ * This converts a one block unindexed directory to a 3 block indexed ++ * directory, and adds the dentry to the indexed directory. 
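The space test inside add_dirent_to_buf() can be read on its own: a live entry can donate rec_len minus its own rounded length, while a deleted one (inode == 0) donates its whole rec_len. A small sketch with illustrative sizes; the rounding mirrors EXT3_DIR_REC_LEN from ext3_fs.h:

    #include <stdio.h>

    #define DIR_ROUND 3
    #define DIR_REC_LEN(name_len) (((name_len) + 8 + DIR_ROUND) & ~DIR_ROUND)

    static int fits(unsigned inode, unsigned rec_len, unsigned name_len,
                    unsigned new_name_len)
    {
            unsigned needed = DIR_REC_LEN(new_name_len);
            unsigned nlen = DIR_REC_LEN(name_len);

            return (inode ? rec_len - nlen : rec_len) >= needed;
    }

    int main(void)
    {
            /* live 3-char entry padded to 40 bytes: 40 - 12 = 28 spare */
            printf("%d\n", fits(11, 40, 3, 10)); /* needs 20 -> 1 */
            printf("%d\n", fits(11, 24, 3, 10)); /* only 12 spare -> 0 */
            printf("%d\n", fits(0, 20, 3, 10));  /* deleted slot -> 1 */
            return 0;
    }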
++ */ ++static int make_indexed_dir(handle_t *handle, struct dentry *dentry, ++ struct inode *inode, struct buffer_head *bh) ++{ ++ struct inode *dir = dentry->d_parent->d_inode; ++ const char *name = dentry->d_name.name; ++ int namelen = dentry->d_name.len; ++ struct buffer_head *bh2; ++ struct dx_root *root; ++ struct dx_frame frames[2], *frame; ++ struct dx_entry *entries; ++ struct ext3_dir_entry_2 *de, *de2; ++ char *data1, *top; ++ unsigned len; ++ int retval; ++ unsigned blocksize; ++ struct dx_hash_info hinfo; ++ u32 block; ++ ++ blocksize = dir->i_sb->s_blocksize; ++ dxtrace(printk("Creating index\n")); ++ retval = ext3_journal_get_write_access(handle, bh); ++ if (retval) { ++ ext3_std_error(dir->i_sb, retval); ++ brelse(bh); ++ return retval; ++ } ++ root = (struct dx_root *) bh->b_data; ++ ++ EXT3_I(dir)->i_flags |= EXT3_INDEX_FL; ++ bh2 = ext3_append (handle, dir, &block, &retval); ++ if (!(bh2)) { ++ brelse(bh); ++ return retval; ++ } ++ data1 = bh2->b_data; ++ ++ /* The 0th block becomes the root, move the dirents out */ ++ de = (struct ext3_dir_entry_2 *)&root->dotdot; ++ de = (struct ext3_dir_entry_2 *)((char *)de + le16_to_cpu(de->rec_len)); ++ len = ((char *) root) + blocksize - (char *) de; ++ memcpy (data1, de, len); ++ de = (struct ext3_dir_entry_2 *) data1; ++ top = data1 + len; ++ while (((char *) de2=(char*)de+le16_to_cpu(de->rec_len)) < top) ++ de = de2; ++ de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de); ++ /* Initialize the root; the dot dirents already exist */ ++ de = (struct ext3_dir_entry_2 *) (&root->dotdot); ++ de->rec_len = cpu_to_le16(blocksize - EXT3_DIR_REC_LEN(2)); ++ memset (&root->info, 0, sizeof(root->info)); ++ root->info.info_length = sizeof(root->info); ++ root->info.hash_version = dir->i_sb->u.ext3_sb.s_def_hash_version; ++ entries = root->entries; ++ dx_set_block (entries, 1); ++ dx_set_count (entries, 1); ++ dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info))); ++ ++ /* Initialize as for dx_probe */ ++ hinfo.hash_version = root->info.hash_version; ++ hinfo.seed = dir->i_sb->u.ext3_sb.s_hash_seed; ++ ext3fs_dirhash(name, namelen, &hinfo); ++ frame = frames; ++ frame->entries = entries; ++ frame->at = entries; ++ frame->bh = bh; ++ bh = bh2; ++ de = do_split(handle,dir, &bh, frame, &hinfo, &retval); ++ dx_release (frames); ++ if (!(de)) ++ return retval; ++ ++ return add_dirent_to_buf(handle, dentry, inode, de, bh); ++} ++#endif ++ + /* + * ext3_add_entry() + * +@@ -249,127 +1258,198 @@ + * may not sleep between calling this and putting something into + * the entry, as someone else might have used it while you slept. 
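make_indexed_dir() leaves the root block holding ".", "..", the 8-byte dx_root_info and dx_entry slots in whatever room remains. A back-of-envelope sketch of the resulting fan-out, assuming a 4096-byte block and an 8-byte dx_entry (the arithmetic mirrors dx_root_limit() and dx_node_limit() above):

    #include <stdio.h>

    #define DIR_ROUND 3
    #define DIR_REC_LEN(name_len) (((name_len) + 8 + DIR_ROUND) & ~DIR_ROUND)

    int main(void)
    {
            unsigned bs = 4096, entry = 8, info = 8;
            unsigned root = (bs - DIR_REC_LEN(1) - DIR_REC_LEN(2) - info) / entry;
            unsigned node = (bs - DIR_REC_LEN(0)) / entry;

            printf("root holds %u entries, a node %u\n", root, node); /* 508, 511 */
            printf("leaf blocks with one indirect level: %u\n", root * node);
            return 0;
    }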
+ */ +- +-/* +- * AKPM: the journalling code here looks wrong on the error paths +- */ + static int ext3_add_entry (handle_t *handle, struct dentry *dentry, + struct inode *inode) + { + struct inode *dir = dentry->d_parent->d_inode; +- const char *name = dentry->d_name.name; +- int namelen = dentry->d_name.len; + unsigned long offset; +- unsigned short rec_len; + struct buffer_head * bh; +- struct ext3_dir_entry_2 * de, * de1; ++ struct ext3_dir_entry_2 *de; + struct super_block * sb; + int retval; ++#ifdef CONFIG_EXT3_INDEX ++ int dx_fallback=0; ++#endif ++ unsigned blocksize; ++ unsigned nlen, rlen; ++ u32 block, blocks; + + sb = dir->i_sb; +- +- if (!namelen) ++ blocksize = sb->s_blocksize; ++ if (!dentry->d_name.len) + return -EINVAL; +- bh = ext3_bread (handle, dir, 0, 0, &retval); ++#ifdef CONFIG_EXT3_INDEX ++ if (is_dx(dir)) { ++ retval = ext3_dx_add_entry(handle, dentry, inode); ++ if (!retval || (retval != ERR_BAD_DX_DIR)) ++ return retval; ++ EXT3_I(dir)->i_flags &= ~EXT3_INDEX_FL; ++ dx_fallback++; ++ ext3_mark_inode_dirty(handle, dir); ++ } ++#endif ++ blocks = dir->i_size >> sb->s_blocksize_bits; ++ for (block = 0, offset = 0; block < blocks; block++) { ++ bh = ext3_bread(handle, dir, block, 0, &retval); ++ if(!bh) ++ return retval; ++ retval = add_dirent_to_buf(handle, dentry, inode, 0, bh); ++ if (retval != -ENOSPC) ++ return retval; ++ ++#ifdef CONFIG_EXT3_INDEX ++ if (blocks == 1 && !dx_fallback && ++ EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX)) ++ return make_indexed_dir(handle, dentry, inode, bh); ++#endif ++ brelse(bh); ++ } ++ bh = ext3_append(handle, dir, &block, &retval); + if (!bh) + return retval; +- rec_len = EXT3_DIR_REC_LEN(namelen); +- offset = 0; + de = (struct ext3_dir_entry_2 *) bh->b_data; +- while (1) { +- if ((char *)de >= sb->s_blocksize + bh->b_data) { +- brelse (bh); +- bh = NULL; +- bh = ext3_bread (handle, dir, +- offset >> EXT3_BLOCK_SIZE_BITS(sb), 1, &retval); +- if (!bh) +- return retval; +- if (dir->i_size <= offset) { +- if (dir->i_size == 0) { +- brelse(bh); +- return -ENOENT; +- } +- +- ext3_debug ("creating next block\n"); +- +- BUFFER_TRACE(bh, "get_write_access"); +- ext3_journal_get_write_access(handle, bh); +- de = (struct ext3_dir_entry_2 *) bh->b_data; +- de->inode = 0; +- de->rec_len = le16_to_cpu(sb->s_blocksize); +- dir->u.ext3_i.i_disksize = +- dir->i_size = offset + sb->s_blocksize; +- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; +- ext3_mark_inode_dirty(handle, dir); +- } else { +- +- ext3_debug ("skipping to next block\n"); ++ de->inode = 0; ++ de->rec_len = cpu_to_le16(rlen = blocksize); ++ nlen = 0; ++ return add_dirent_to_buf(handle, dentry, inode, de, bh); ++} + +- de = (struct ext3_dir_entry_2 *) bh->b_data; +- } +- } +- if (!ext3_check_dir_entry ("ext3_add_entry", dir, de, bh, +- offset)) { +- brelse (bh); +- return -ENOENT; +- } +- if (ext3_match (namelen, name, de)) { +- brelse (bh); +- return -EEXIST; ++#ifdef CONFIG_EXT3_INDEX ++/* ++ * Returns 0 for success, or a negative error value ++ */ ++static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, ++ struct inode *inode) ++{ ++ struct dx_frame frames[2], *frame; ++ struct dx_entry *entries, *at; ++ struct dx_hash_info hinfo; ++ struct buffer_head * bh; ++ struct inode *dir = dentry->d_parent->d_inode; ++ struct super_block * sb = dir->i_sb; ++ struct ext3_dir_entry_2 *de; ++ int err; ++ ++ frame = dx_probe(dentry, 0, &hinfo, frames, &err); ++ if (!frame) ++ return err; ++ entries = frame->entries; ++ at = frame->at; ++ ++ if (!(bh = 
ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err))) ++ goto cleanup; ++ ++ BUFFER_TRACE(bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, bh); ++ if (err) ++ goto journal_error; ++ ++ err = add_dirent_to_buf(handle, dentry, inode, 0, bh); ++ if (err != -ENOSPC) { ++ bh = 0; ++ goto cleanup; ++ } ++ ++ /* Block full, should compress but for now just split */ ++ dxtrace(printk("using %u of %u node entries\n", ++ dx_get_count(entries), dx_get_limit(entries))); ++ /* Need to split index? */ ++ if (dx_get_count(entries) == dx_get_limit(entries)) { ++ u32 newblock; ++ unsigned icount = dx_get_count(entries); ++ int levels = frame - frames; ++ struct dx_entry *entries2; ++ struct dx_node *node2; ++ struct buffer_head *bh2; ++ ++ if (levels && (dx_get_count(frames->entries) == ++ dx_get_limit(frames->entries))) { ++ ext3_warning(sb, __FUNCTION__, ++ "Directory index full!\n"); ++ err = -ENOSPC; ++ goto cleanup; + } +- if ((le32_to_cpu(de->inode) == 0 && +- le16_to_cpu(de->rec_len) >= rec_len) || +- (le16_to_cpu(de->rec_len) >= +- EXT3_DIR_REC_LEN(de->name_len) + rec_len)) { +- BUFFER_TRACE(bh, "get_write_access"); +- ext3_journal_get_write_access(handle, bh); +- /* By now the buffer is marked for journaling */ +- offset += le16_to_cpu(de->rec_len); +- if (le32_to_cpu(de->inode)) { +- de1 = (struct ext3_dir_entry_2 *) ((char *) de + +- EXT3_DIR_REC_LEN(de->name_len)); +- de1->rec_len = +- cpu_to_le16(le16_to_cpu(de->rec_len) - +- EXT3_DIR_REC_LEN(de->name_len)); +- de->rec_len = cpu_to_le16( +- EXT3_DIR_REC_LEN(de->name_len)); +- de = de1; ++ ++ bh2 = ext3_append (handle, dir, &newblock, &err); ++ if (!(bh2)) ++ goto cleanup; ++ node2 = (struct dx_node *)(bh2->b_data); ++ entries2 = node2->entries; ++ node2->fake.rec_len = cpu_to_le16(sb->s_blocksize); ++ node2->fake.inode = 0; ++ BUFFER_TRACE(frame->bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, frame->bh); ++ if (err) ++ goto journal_error; ++ if (levels) { ++ unsigned icount1 = icount/2, icount2 = icount - icount1; ++ unsigned hash2 = dx_get_hash(entries + icount1); ++ dxtrace(printk("Split index %i/%i\n", icount1, icount2)); ++ ++ BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ ++ err = ext3_journal_get_write_access(handle, ++ frames[0].bh); ++ if (err) ++ goto journal_error; ++ ++ memcpy ((char *) entries2, (char *) (entries + icount1),+ icount2 * sizeof(struct dx_entry)); ++ dx_set_count (entries, icount1); ++ dx_set_count (entries2, icount2); ++ dx_set_limit (entries2, dx_node_limit(dir)); ++ ++ /* Which index block gets the new entry? */ ++ if (at - entries >= icount1) { ++ frame->at = at = at - entries - icount1 + entries2; ++ frame->entries = entries = entries2; ++ swap(frame->bh, bh2); + } +- de->file_type = EXT3_FT_UNKNOWN; +- if (inode) { +- de->inode = cpu_to_le32(inode->i_ino); +- ext3_set_de_type(dir->i_sb, de, inode->i_mode); +- } else +- de->inode = 0; +- de->name_len = namelen; +- memcpy (de->name, name, namelen); +- /* +- * XXX shouldn't update any times until successful +- * completion of syscall, but too many callers depend +- * on this. +- * +- * XXX similarly, too many callers depend on +- * ext3_new_inode() setting the times, but error +- * recovery deletes the inode, so the worst that can +- * happen is that the times are slightly out of date +- * and/or different from the directory change time. 
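When an interior index node fills, the code above moves the upper half of its entries into a freshly appended node and gives the parent a key equal to the hash of the first moved entry. The same bookkeeping on plain arrays (a sketch; the values are arbitrary):

    #include <stdio.h>
    #include <string.h>

    struct dx { unsigned hash, block; };

    int main(void)
    {
            struct dx node[6] = { {0,10}, {100,11}, {200,12},
                                  {300,13}, {400,14}, {500,15} };
            struct dx node2[6];
            unsigned icount = 6;
            unsigned icount1 = icount / 2, icount2 = icount - icount1;
            unsigned hash2 = node[icount1].hash;  /* new key for the parent */

            memcpy(node2, node + icount1, icount2 * sizeof(struct dx));
            printf("left keeps %u entries, right gets %u, parent key %u\n",
                   icount1, icount2, hash2);      /* 3, 3, 300 */
            return 0;
    }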
+- */ +- dir->i_mtime = dir->i_ctime = CURRENT_TIME; +- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; +- ext3_mark_inode_dirty(handle, dir); +- dir->i_version = ++event; +- BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); +- ext3_journal_dirty_metadata(handle, bh); +- brelse(bh); +- return 0; ++ dx_insert_block (frames + 0, hash2, newblock); ++ dxtrace(dx_show_index ("node", frames[1].entries)); ++ dxtrace(dx_show_index ("node", ++ ((struct dx_node *) bh2->b_data)->entries)); ++ err = ext3_journal_dirty_metadata(handle, bh2); ++ if (err) ++ goto journal_error; ++ brelse (bh2); ++ } else { ++ dxtrace(printk("Creating second level index...\n")); ++ memcpy((char *) entries2, (char *) entries, ++ icount * sizeof(struct dx_entry)); ++ dx_set_limit(entries2, dx_node_limit(dir)); ++ ++ /* Set up root */ ++ dx_set_count(entries, 1); ++ dx_set_block(entries + 0, newblock); ++ ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; ++ ++ /* Add new access path frame */ ++ frame = frames + 1; ++ frame->at = at = at - entries + entries2; ++ frame->entries = entries = entries2; ++ frame->bh = bh2; ++ err = ext3_journal_get_write_access(handle, ++ frame->bh); ++ if (err) ++ goto journal_error; + } +- offset += le16_to_cpu(de->rec_len); +- de = (struct ext3_dir_entry_2 *) +- ((char *) de + le16_to_cpu(de->rec_len)); ++ ext3_journal_dirty_metadata(handle, frames[0].bh); + } +- brelse (bh); +- return -ENOSPC; ++ de = do_split(handle, dir, &bh, frame, &hinfo, &err); ++ if (!de) ++ goto cleanup; ++ err = add_dirent_to_buf(handle, dentry, inode, de, bh); ++ bh = 0; ++ goto cleanup; ++ ++journal_error: ++ ext3_std_error(dir->i_sb, err); ++cleanup: ++ if (bh) ++ brelse(bh); ++ dx_release(frames); ++ return err; + } ++#endif + + /* + * ext3_delete_entry deletes a directory entry by merging it with the +@@ -453,9 +1533,11 @@ + struct inode * inode; + int err; + +- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3); +- if (IS_ERR(handle)) ++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + ++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3); ++ if (IS_ERR(handle)) { + return PTR_ERR(handle); ++ } + + if (IS_SYNC(dir)) + handle->h_sync = 1; +@@ -480,9 +1562,11 @@ + struct inode *inode; + int err; + +- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3); +- if (IS_ERR(handle)) ++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + ++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3); ++ if (IS_ERR(handle)) { + return PTR_ERR(handle); ++ } + + if (IS_SYNC(dir)) + handle->h_sync = 1; +@@ -512,9 +1596,11 @@ + if (dir->i_nlink >= EXT3_LINK_MAX) + return -EMLINK; + +- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3); +- if (IS_ERR(handle)) ++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + ++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3); ++ if (IS_ERR(handle)) { + return PTR_ERR(handle); ++ } + + if (IS_SYNC(dir)) + handle->h_sync = 1; +@@ -526,7 +1612,8 @@ + + inode->i_op = &ext3_dir_inode_operations; + inode->i_fop = &ext3_dir_operations; +- inode->i_size = inode->u.ext3_i.i_disksize = inode->i_sb->s_blocksize; ++ inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize; ++ inode->i_blocks = 0; + dir_block = ext3_bread (handle, inode, 0, 1, &err); + if (!dir_block) { + inode->i_nlink--; /* is this nlink == 0? 
*/ +@@ -555,21 +1642,19 @@ + brelse (dir_block); + ext3_mark_inode_dirty(handle, inode); + err = ext3_add_entry (handle, dentry, inode); +- if (err) +- goto out_no_entry; ++ if (err) { ++ inode->i_nlink = 0; ++ ext3_mark_inode_dirty(handle, inode); ++ iput (inode); ++ goto out_stop; ++ } + dir->i_nlink++; +- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; ++ ext3_update_dx_flag(dir); + ext3_mark_inode_dirty(handle, dir); + d_instantiate(dentry, inode); + out_stop: + ext3_journal_stop(handle, dir); + return err; +- +-out_no_entry: +- inode->i_nlink = 0; +- ext3_mark_inode_dirty(handle, inode); +- iput (inode); +- goto out_stop; + } + + /* +@@ -656,7 +1741,7 @@ + int err = 0, rc; + + lock_super(sb); +- if (!list_empty(&inode->u.ext3_i.i_orphan)) ++ if (!list_empty(&EXT3_I(inode)->i_orphan)) + goto out_unlock; + + /* Orphan handling is only valid for files with data blocks +@@ -697,7 +1782,7 @@ + * This is safe: on error we're going to ignore the orphan list + * anyway on the next recovery. */ + if (!err) +- list_add(&inode->u.ext3_i.i_orphan, &EXT3_SB(sb)->s_orphan); ++ list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan); + + jbd_debug(4, "superblock will point to %ld\n", inode->i_ino); + jbd_debug(4, "orphan inode %ld will point to %d\n", +@@ -715,25 +1800,26 @@ + int ext3_orphan_del(handle_t *handle, struct inode *inode) + { + struct list_head *prev; ++ struct ext3_inode_info *ei = EXT3_I(inode); + struct ext3_sb_info *sbi; + ino_t ino_next; + struct ext3_iloc iloc; + int err = 0; + + lock_super(inode->i_sb); +- if (list_empty(&inode->u.ext3_i.i_orphan)) { ++ if (list_empty(&ei->i_orphan)) { + unlock_super(inode->i_sb); + return 0; + } + + ino_next = NEXT_ORPHAN(inode); +- prev = inode->u.ext3_i.i_orphan.prev; ++ prev = ei->i_orphan.prev; + sbi = EXT3_SB(inode->i_sb); + + jbd_debug(4, "remove inode %ld from orphan list\n", inode->i_ino); + +- list_del(&inode->u.ext3_i.i_orphan); +- INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan); ++ list_del(&ei->i_orphan); ++ INIT_LIST_HEAD(&ei->i_orphan); + + /* If we're on an error path, we may not have a valid + * transaction handle with which to update the orphan list on +@@ -794,8 +1880,9 @@ + handle_t *handle; + + handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS); +- if (IS_ERR(handle)) ++ if (IS_ERR(handle)) { + return PTR_ERR(handle); ++ } + + retval = -ENOENT; + bh = ext3_find_entry (dentry, &de); +@@ -833,7 +1920,7 @@ + ext3_mark_inode_dirty(handle, inode); + dir->i_nlink--; + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; +- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; ++ ext3_update_dx_flag(dir); + ext3_mark_inode_dirty(handle, dir); + + end_rmdir: +@@ -851,8 +1938,9 @@ + handle_t *handle; + + handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS); +- if (IS_ERR(handle)) ++ if (IS_ERR(handle)) { + return PTR_ERR(handle); ++ } + + if (IS_SYNC(dir)) + handle->h_sync = 1; +@@ -879,7 +1967,7 @@ + if (retval) + goto end_unlink; + dir->i_ctime = dir->i_mtime = CURRENT_TIME; +- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; ++ ext3_update_dx_flag(dir); + ext3_mark_inode_dirty(handle, dir); + inode->i_nlink--; + if (!inode->i_nlink) +@@ -905,9 +1993,11 @@ + if (l > dir->i_sb->s_blocksize) + return -ENAMETOOLONG; + +- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 5); +- if (IS_ERR(handle)) ++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + ++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5); ++ if (IS_ERR(handle)) { + return PTR_ERR(handle); ++ } + + if (IS_SYNC(dir)) + handle->h_sync = 1; +@@ -917,7 +2007,7 @@ + if (IS_ERR(inode)) + 
goto out_stop; + +- if (l > sizeof (inode->u.ext3_i.i_data)) { ++ if (l > sizeof (EXT3_I(inode)->i_data)) { + inode->i_op = &ext3_symlink_inode_operations; + inode->i_mapping->a_ops = &ext3_aops; + /* +@@ -926,25 +2016,23 @@ + * i_size in generic_commit_write(). + */ + err = block_symlink(inode, symname, l); +- if (err) +- goto out_no_entry; ++ if (err) { ++ ext3_dec_count(handle, inode); ++ ext3_mark_inode_dirty(handle, inode); ++ iput (inode); ++ goto out_stop; ++ } + } else { + inode->i_op = &ext3_fast_symlink_inode_operations; +- memcpy((char*)&inode->u.ext3_i.i_data,symname,l); ++ memcpy((char*)&EXT3_I(inode)->i_data,symname,l); + inode->i_size = l-1; + } +- inode->u.ext3_i.i_disksize = inode->i_size; ++ EXT3_I(inode)->i_disksize = inode->i_size; + ext3_mark_inode_dirty(handle, inode); + err = ext3_add_nondir(handle, dentry, inode); + out_stop: + ext3_journal_stop(handle, dir); + return err; +- +-out_no_entry: +- ext3_dec_count(handle, inode); +- ext3_mark_inode_dirty(handle, inode); +- iput (inode); +- goto out_stop; + } + + static int ext3_link (struct dentry * old_dentry, +@@ -957,12 +2045,15 @@ + if (S_ISDIR(inode->i_mode)) + return -EPERM; + +- if (inode->i_nlink >= EXT3_LINK_MAX) ++ if (inode->i_nlink >= EXT3_LINK_MAX) { + return -EMLINK; ++ } + +- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS); +- if (IS_ERR(handle)) ++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + ++ EXT3_INDEX_EXTRA_TRANS_BLOCKS); ++ if (IS_ERR(handle)) { + return PTR_ERR(handle); ++ } + + if (IS_SYNC(dir)) + handle->h_sync = 1; +@@ -996,9 +2087,11 @@ + + old_bh = new_bh = dir_bh = NULL; + +- handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS + 2); +- if (IS_ERR(handle)) ++ handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS + ++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2); ++ if (IS_ERR(handle)) { + return PTR_ERR(handle); ++ } + + if (IS_SYNC(old_dir) || IS_SYNC(new_dir)) + handle->h_sync = 1; +@@ -1078,7 +2171,7 @@ + new_inode->i_ctime = CURRENT_TIME; + } + old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; +- old_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; ++ ext3_update_dx_flag(old_dir); + if (dir_bh) { + BUFFER_TRACE(dir_bh, "get_write_access"); + ext3_journal_get_write_access(handle, dir_bh); +@@ -1090,7 +2183,7 @@ + new_inode->i_nlink--; + } else { + new_dir->i_nlink++; +- new_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; ++ ext3_update_dx_flag(new_dir); + ext3_mark_inode_dirty(handle, new_dir); + } + } +Index: linux-2.4.19.SuSE/fs/ext3/super.c +=================================================================== +--- linux-2.4.19.SuSE.orig/fs/ext3/super.c 2004-05-27 11:07:21.000000000 -0700 ++++ linux-2.4.19.SuSE/fs/ext3/super.c 2004-05-27 11:08:28.000000000 -0700 +@@ -741,6 +741,7 @@ + es->s_mtime = cpu_to_le32(CURRENT_TIME); + ext3_update_dynamic_rev(sb); + EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); ++ + ext3_commit_super (sb, es, 1); + if (test_opt (sb, DEBUG)) + printk (KERN_INFO +@@ -751,6 +752,7 @@ + EXT3_BLOCKS_PER_GROUP(sb), + EXT3_INODES_PER_GROUP(sb), + sbi->s_mount_opt); ++ + printk(KERN_INFO "EXT3 FS " EXT3FS_VERSION ", " EXT3FS_DATE " on %s, ", + bdevname(sb->s_dev)); + if (EXT3_SB(sb)->s_journal->j_inode == NULL) { +@@ -925,6 +927,7 @@ + return res; + } + ++ + struct super_block * ext3_read_super (struct super_block * sb, void * data, + int silent) + { +@@ -1113,6 +1116,9 @@ + sbi->s_mount_state = le16_to_cpu(es->s_state); + sbi->s_addr_per_block_bits = log2(EXT3_ADDR_PER_BLOCK(sb)); + sbi->s_desc_per_block_bits = 
log2(EXT3_DESC_PER_BLOCK(sb)); ++ for (i=0; i < 4; i++) ++ sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); ++ sbi->s_def_hash_version = es->s_def_hash_version; + + if (sbi->s_blocks_per_group > blocksize * 8) { + printk (KERN_ERR +@@ -1821,6 +1827,7 @@ + exit_ext3_xattr(); + } + ++EXPORT_SYMBOL(ext3_force_commit); + EXPORT_SYMBOL(ext3_bread); + + MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); +Index: linux-2.4.19.SuSE/fs/ext3/file.c +=================================================================== +--- linux-2.4.19.SuSE.orig/fs/ext3/file.c 2002-12-04 09:46:18.000000000 -0800 ++++ linux-2.4.19.SuSE/fs/ext3/file.c 2004-05-27 11:08:28.000000000 -0700 +@@ -38,6 +38,9 @@ + { + if (filp->f_mode & FMODE_WRITE) + ext3_discard_prealloc (inode); ++ if (is_dx(inode) && filp->private_data) ++ ext3_htree_free_dir_info(filp->private_data); ++ + return 0; + } + +Index: linux-2.4.19.SuSE/fs/ext3/hash.c +=================================================================== +--- linux-2.4.19.SuSE.orig/fs/ext3/hash.c 1970-01-02 14:15:01.000000000 -0800 ++++ linux-2.4.19.SuSE/fs/ext3/hash.c 2004-05-27 11:08:28.000000000 -0700 +@@ -0,0 +1,215 @@ ++/* ++ * linux/fs/ext3/hash.c ++ * ++ * Copyright (C) 2002 by Theodore Ts'o ++ * ++ * This file is released under the GPL v2. ++ * ++ * This file may be redistributed under the terms of the GNU Public ++ * License. ++ */ ++ ++#include ++#include ++#include ++#include ++ ++#define DELTA 0x9E3779B9 ++ ++static void TEA_transform(__u32 buf[4], __u32 const in[]) ++{ ++ __u32 sum = 0; ++ __u32 b0 = buf[0], b1 = buf[1]; ++ __u32 a = in[0], b = in[1], c = in[2], d = in[3]; ++ int n = 16; ++ ++ do { ++ sum += DELTA; ++ b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); ++ b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); ++ } while(--n); ++ ++ buf[0] += b0; ++ buf[1] += b1; ++} ++ ++/* F, G and H are basic MD4 functions: selection, majority, parity */ ++#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) ++#define G(x, y, z) (((x) & (y)) + (((x) ^ (y)) & (z))) ++#define H(x, y, z) ((x) ^ (y) ^ (z)) ++ ++/* ++ * The generic round function. The application is so specific that ++ * we don't bother protecting all the arguments with parens, as is generally ++ * good macro practice, in favor of extra legibility. ++ * Rotation is separate from addition to prevent recomputation ++ */ ++#define ROUND(f, a, b, c, d, x, s) \ ++ (a += f(b, c, d) + x, a = (a << s) | (a >> (32-s))) ++#define K1 0 ++#define K2 013240474631UL ++#define K3 015666365641UL ++ ++/* ++ * Basic cut-down MD4 transform. Returns only 32 bits of result. 
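The TEA rounds above are self-contained and can be exercised stand-alone. A harness feeding an arbitrary 4-word input into the MD4-style initial buffer used later by ext3fs_dirhash() (the input words are made up, not a test vector from the patch):

    #include <stdio.h>
    #include <stdint.h>

    #define DELTA 0x9E3779B9

    static void TEA_transform(uint32_t buf[4], const uint32_t in[])
    {
            uint32_t sum = 0;
            uint32_t b0 = buf[0], b1 = buf[1];
            uint32_t a = in[0], b = in[1], c = in[2], d = in[3];
            int n = 16;

            do {
                    sum += DELTA;
                    b0 += ((b1 << 4) + a) ^ (b1 + sum) ^ ((b1 >> 5) + b);
                    b1 += ((b0 << 4) + c) ^ (b0 + sum) ^ ((b0 >> 5) + d);
            } while (--n);

            buf[0] += b0;
            buf[1] += b1;
    }

    int main(void)
    {
            uint32_t buf[4] = { 0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476 };
            uint32_t in[4] = { 'n', 'a', 'm', 'e' };

            TEA_transform(buf, in);
            printf("%08x %08x\n", (unsigned)buf[0], (unsigned)buf[1]);
            return 0;
    }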
++ */ ++static void halfMD4Transform (__u32 buf[4], __u32 const in[]) ++{ ++ __u32 a = buf[0], b = buf[1], c = buf[2], d = buf[3]; ++ ++ /* Round 1 */ ++ ROUND(F, a, b, c, d, in[0] + K1, 3); ++ ROUND(F, d, a, b, c, in[1] + K1, 7); ++ ROUND(F, c, d, a, b, in[2] + K1, 11); ++ ROUND(F, b, c, d, a, in[3] + K1, 19); ++ ROUND(F, a, b, c, d, in[4] + K1, 3); ++ ROUND(F, d, a, b, c, in[5] + K1, 7); ++ ROUND(F, c, d, a, b, in[6] + K1, 11); ++ ROUND(F, b, c, d, a, in[7] + K1, 19); ++ ++ /* Round 2 */ ++ ROUND(G, a, b, c, d, in[1] + K2, 3); ++ ROUND(G, d, a, b, c, in[3] + K2, 5); ++ ROUND(G, c, d, a, b, in[5] + K2, 9); ++ ROUND(G, b, c, d, a, in[7] + K2, 13); ++ ROUND(G, a, b, c, d, in[0] + K2, 3); ++ ROUND(G, d, a, b, c, in[2] + K2, 5); ++ ROUND(G, c, d, a, b, in[4] + K2, 9); ++ ROUND(G, b, c, d, a, in[6] + K2, 13); ++ ++ /* Round 3 */ ++ ROUND(H, a, b, c, d, in[3] + K3, 3); ++ ROUND(H, d, a, b, c, in[7] + K3, 9); ++ ROUND(H, c, d, a, b, in[2] + K3, 11); ++ ROUND(H, b, c, d, a, in[6] + K3, 15); ++ ROUND(H, a, b, c, d, in[1] + K3, 3); ++ ROUND(H, d, a, b, c, in[5] + K3, 9); ++ ROUND(H, c, d, a, b, in[0] + K3, 11); ++ ROUND(H, b, c, d, a, in[4] + K3, 15); ++ ++ buf[0] += a; ++ buf[1] += b; ++ buf[2] += c; ++ buf[3] += d; ++} ++ ++#undef ROUND ++#undef F ++#undef G ++#undef H ++#undef K1 ++#undef K2 ++#undef K3 ++ ++/* The old legacy hash */ ++static __u32 dx_hack_hash (const char *name, int len) ++{ ++ __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; ++ while (len--) { ++ __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373)); ++ ++ if (hash & 0x80000000) hash -= 0x7fffffff; ++ hash1 = hash0; ++ hash0 = hash; ++ } ++ return (hash0 << 1); ++} ++ ++static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) ++{ ++ __u32 pad, val; ++ int i; ++ ++ pad = (__u32)len | ((__u32)len << 8); ++ pad |= pad << 16; ++ ++ val = pad; ++ if (len > num*4) ++ len = num * 4; ++ for (i=0; i < len; i++) { ++ if ((i % 4) == 0) ++ val = pad; ++ val = msg[i] + (val << 8); ++ if ((i % 4) == 3) { ++ *buf++ = val; ++ val = pad; ++ num--; ++ } ++ } ++ if (--num >= 0) ++ *buf++ = val; ++ while (--num >= 0) ++ *buf++ = pad; ++} ++ ++/* ++ * Returns the hash of a filename. If len is 0 and name is NULL, then ++ * this function can be used to test whether or not a hash version is ++ * supported. ++ * ++ * The seed is a 4-longword (32-bit) "secret" which can be used to ++ * uniquify a hash. If the seed is all zeros, then some default seed ++ * may be used. ++ * ++ * A particular hash version specifies whether or not the seed is ++ * represented, and whether or not the returned hash is 32 bits or 64 ++ * bits. 32 bit hashes will return 0 for the minor hash.
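dx_hack_hash() above also runs unmodified in user space; ext3fs_dirhash() later clears the low bit of whatever the hash functions return, reserving it for the continuation convention. A sketch (the sample name is arbitrary, and as in the kernel the result depends on the platform's char signedness):

    #include <stdio.h>
    #include <stdint.h>

    static uint32_t dx_hack_hash(const char *name, int len)
    {
            uint32_t hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;

            while (len--) {
                    uint32_t hash = hash1 + (hash0 ^ (*name++ * 7152373));

                    if (hash & 0x80000000)
                            hash -= 0x7fffffff;
                    hash1 = hash0;
                    hash0 = hash;
            }
            return hash0 << 1;
    }

    int main(void)
    {
            const char *n = "lost+found";
            uint32_t h = dx_hack_hash(n, 10) & ~1u; /* as ext3fs_dirhash() does */

            printf("%s -> %08x\n", n, (unsigned)h);
            return 0;
    }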
++ */ ++int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) ++{ ++ __u32 hash; ++ __u32 minor_hash = 0; ++ const char *p; ++ int i; ++ __u32 in[8], buf[4]; ++ ++ /* Initialize the default seed for the hash checksum functions */ ++ buf[0] = 0x67452301; ++ buf[1] = 0xefcdab89; ++ buf[2] = 0x98badcfe; ++ buf[3] = 0x10325476; ++ ++ /* Check to see if the seed is all zero's */ ++ if (hinfo->seed) { ++ for (i=0; i < 4; i++) { ++ if (hinfo->seed[i]) ++ break; ++ } ++ if (i < 4) ++ memcpy(buf, hinfo->seed, sizeof(buf)); ++ } ++ ++ switch (hinfo->hash_version) { ++ case DX_HASH_LEGACY: ++ hash = dx_hack_hash(name, len); ++ break; ++ case DX_HASH_HALF_MD4: ++ p = name; ++ while (len > 0) { ++ str2hashbuf(p, len, in, 8); ++ halfMD4Transform(buf, in); ++ len -= 32; ++ p += 32; ++ } ++ minor_hash = buf[2]; ++ hash = buf[1]; ++ break; ++ case DX_HASH_TEA: ++ p = name; ++ while (len > 0) { ++ str2hashbuf(p, len, in, 4); ++ TEA_transform(buf, in); ++ len -= 16; ++ p += 16; ++ } ++ hash = buf[0]; ++ minor_hash = buf[1]; ++ break; ++ default: ++ hinfo->hash = 0; ++ return -1; ++ } ++ hinfo->hash = hash & ~1; ++ hinfo->minor_hash = minor_hash; ++ return 0; ++} +Index: linux-2.4.19.SuSE/lib/rbtree.c +=================================================================== +--- linux-2.4.19.SuSE.orig/lib/rbtree.c 2002-08-02 17:39:46.000000000 -0700 ++++ linux-2.4.19.SuSE/lib/rbtree.c 2004-05-27 11:08:28.000000000 -0700 +@@ -17,6 +17,8 @@ + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + linux/lib/rbtree.c ++ ++ rb_get_first and rb_get_next written by Theodore Ts'o, 9/8/2002 + */ + + #include +@@ -294,3 +296,43 @@ + __rb_erase_color(child, parent, root); + } + EXPORT_SYMBOL(rb_erase); ++ ++/* ++ * This function returns the first node (in sort order) of the tree. ++ */ ++rb_node_t *rb_get_first(rb_root_t *root) ++{ ++ rb_node_t *n; ++ ++ n = root->rb_node; ++ if (!n) ++ return 0; ++ while (n->rb_left) ++ n = n->rb_left; ++ return n; ++} ++EXPORT_SYMBOL(rb_get_first); ++ ++/* ++ * Given a node, this function will return the next node in the tree. 
++ */ ++rb_node_t *rb_get_next(rb_node_t *n) ++{ ++ rb_node_t *parent; ++ ++ if (n->rb_right) { ++ n = n->rb_right; ++ while (n->rb_left) ++ n = n->rb_left; ++ return n; ++ } else { ++ while ((parent = n->rb_parent)) { ++ if (n == parent->rb_left) ++ return parent; ++ n = parent; ++ } ++ return 0; ++ } ++} ++EXPORT_SYMBOL(rb_get_next); ++ +Index: linux-2.4.19.SuSE/include/linux/ext3_fs.h +=================================================================== +--- linux-2.4.19.SuSE.orig/include/linux/ext3_fs.h 2003-10-05 09:30:34.000000000 -0700 ++++ linux-2.4.19.SuSE/include/linux/ext3_fs.h 2004-05-27 11:08:28.000000000 -0700 +@@ -40,6 +40,11 @@ + #define EXT3FS_VERSION "2.4-0.9.18" + + /* ++ * Always enable hashed directories ++ */ ++#define CONFIG_EXT3_INDEX ++ ++/* + * Debug code + */ + #ifdef EXT3FS_DEBUG +@@ -414,8 +419,11 @@ + /*E0*/ __u32 s_journal_inum; /* inode number of journal file */ + __u32 s_journal_dev; /* device number of journal file */ + __u32 s_last_orphan; /* start of list of inodes to delete */ +- +-/*EC*/ __u32 s_reserved[197]; /* Padding to the end of the block */ ++ __u32 s_hash_seed[4]; /* HTREE hash seed */ ++ __u8 s_def_hash_version; /* Default hash version to use */ ++ __u8 s_reserved_char_pad; ++ __u16 s_reserved_word_pad; ++ __u32 s_reserved[192]; /* Padding to the end of the block */ + }; + + #ifdef __KERNEL__ +@@ -552,9 +560,46 @@ + #define EXT3_DIR_ROUND (EXT3_DIR_PAD - 1) + #define EXT3_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT3_DIR_ROUND) & \ + ~EXT3_DIR_ROUND) ++/* ++ * Hash Tree Directory indexing ++ * (c) Daniel Phillips, 2001 ++ */ ++ ++#ifdef CONFIG_EXT3_INDEX ++ #define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \ ++ EXT3_FEATURE_COMPAT_DIR_INDEX) && \ ++ (EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) ++#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX) ++#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) ++#else ++ #define is_dx(dir) 0 ++#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX) ++#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2) ++#endif ++ ++/* Legal values for the dx_root hash_version field: */ ++ ++#define DX_HASH_LEGACY 0 ++#define DX_HASH_HALF_MD4 1 ++#define DX_HASH_TEA 2 ++ ++/* hash info structure used by the directory hash */ ++struct dx_hash_info ++{ ++ u32 hash; ++ u32 minor_hash; ++ int hash_version; ++ u32 *seed; ++}; + + #ifdef __KERNEL__ + /* ++ * Control parameters used by ext3_htree_next_block ++ */ ++#define HASH_NB_ALWAYS 1 ++ ++ ++/* + * Describe an inode's exact location on disk and in memory + */ + struct ext3_iloc +@@ -564,6 +609,27 @@ + unsigned long block_group; + }; + ++ ++/* ++ * This structure is stuffed into the struct file's private_data field ++ * for directories. It is where we put information so that we can do ++ * readdir operations in hash tree order. ++ */ ++struct dir_private_info { ++ rb_root_t root; ++ rb_node_t *curr_node; ++ struct fname *extra_fname; ++ loff_t last_pos; ++ __u32 curr_hash; ++ __u32 curr_minor_hash; ++ __u32 next_hash; ++}; ++ ++/* ++ * Special error return code only used by dx_probe() and its callers. 
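rb_get_first() and rb_get_next() above touch only the left/right/parent links, never the node colors, so the same in-order walk works on any parent-linked binary search tree. A stand-alone demonstration on a hand-built three-node tree (plain structs here, not the kernel rbtree API):

    #include <stdio.h>
    #include <stddef.h>

    struct node { struct node *left, *right, *parent; int key; };

    static struct node *first(struct node *root)
    {
            if (!root)
                    return NULL;
            while (root->left)
                    root = root->left;
            return root;
    }

    static struct node *next(struct node *n)
    {
            struct node *parent;

            if (n->right) {                 /* leftmost of right subtree */
                    n = n->right;
                    while (n->left)
                            n = n->left;
                    return n;
            }
            while ((parent = n->parent)) {  /* climb until we arrive from a left child */
                    if (n == parent->left)
                            return parent;
                    n = parent;
            }
            return NULL;
    }

    int main(void)
    {
            struct node n1 = { 0 }, n2 = { 0 }, n3 = { 0 };
            struct node *p;

            n2.key = 2; n1.key = 1; n3.key = 3;   /* tree: 1 <- 2 -> 3 */
            n2.left = &n1; n2.right = &n3;
            n1.parent = &n2; n3.parent = &n2;

            for (p = first(&n2); p; p = next(p))
                    printf("%d ", p->key);
            printf("\n");                         /* prints: 1 2 3 */
            return 0;
    }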
++ */ ++#define ERR_BAD_DX_DIR -75000 ++ + /* + * Function prototypes + */ +@@ -591,11 +657,20 @@ + + /* dir.c */ + extern int ext3_check_dir_entry(const char *, struct inode *, +- struct ext3_dir_entry_2 *, struct buffer_head *, +- unsigned long); ++ struct ext3_dir_entry_2 *, ++ struct buffer_head *, unsigned long); ++extern void ext3_htree_store_dirent(struct file *dir_file, __u32 hash, ++ __u32 minor_hash, ++ struct ext3_dir_entry_2 *dirent); ++extern void ext3_htree_free_dir_info(struct dir_private_info *p); ++ + /* fsync.c */ + extern int ext3_sync_file (struct file *, struct dentry *, int); + ++/* hash.c */ ++extern int ext3fs_dirhash(const char *name, int len, struct ++ dx_hash_info *hinfo); ++ + /* ialloc.c */ + extern struct inode * ext3_new_inode (handle_t *, struct inode *, int); + extern void ext3_free_inode (handle_t *, struct inode *); +@@ -628,6 +703,8 @@ + /* namei.c */ + extern int ext3_orphan_add(handle_t *, struct inode *); + extern int ext3_orphan_del(handle_t *, struct inode *); ++extern int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash, ++ __u32 start_minor_hash, __u32 *next_hash); + + /* super.c */ + extern void ext3_error (struct super_block *, const char *, const char *, ...) +Index: linux-2.4.19.SuSE/include/linux/ext3_fs_sb.h +=================================================================== +--- linux-2.4.19.SuSE.orig/include/linux/ext3_fs_sb.h 2003-10-05 09:16:36.000000000 -0700 ++++ linux-2.4.19.SuSE/include/linux/ext3_fs_sb.h 2004-05-27 11:08:28.000000000 -0700 +@@ -62,6 +62,8 @@ + int s_inode_size; + int s_first_ino; + u32 s_next_generation; ++ u32 s_hash_seed[4]; ++ int s_def_hash_version; + + /* Journaling */ + struct inode * s_journal_inode; +Index: linux-2.4.19.SuSE/include/linux/ext3_jbd.h +=================================================================== +--- linux-2.4.19.SuSE.orig/include/linux/ext3_jbd.h 2003-10-05 09:30:34.000000000 -0700 ++++ linux-2.4.19.SuSE/include/linux/ext3_jbd.h 2004-05-27 11:08:28.000000000 -0700 +@@ -69,6 +69,8 @@ + + #define EXT3_RESERVE_TRANS_BLOCKS 12 + ++#define EXT3_INDEX_EXTRA_TRANS_BLOCKS 8 ++ + int + ext3_mark_iloc_dirty(handle_t *handle, + struct inode *inode, +Index: linux-2.4.19.SuSE/include/linux/rbtree.h +=================================================================== +--- linux-2.4.19.SuSE.orig/include/linux/rbtree.h 2003-10-05 09:16:36.000000000 -0700 ++++ linux-2.4.19.SuSE/include/linux/rbtree.h 2004-05-27 11:08:28.000000000 -0700 +@@ -120,6 +120,8 @@ + + extern void rb_insert_color(rb_node_t *, rb_root_t *); + extern void rb_erase(rb_node_t *, rb_root_t *); ++extern rb_node_t *rb_get_first(rb_root_t *root); ++extern rb_node_t *rb_get_next(rb_node_t *n); + + static inline void rb_link_node(rb_node_t * node, rb_node_t * parent, rb_node_t ** rb_link) + { diff --git a/lustre/kernel_patches/patches/ext3-delete_thread-2.4.20.patch b/lustre/kernel_patches/patches/ext3-delete_thread-2.4.20.patch new file mode 100644 index 0000000..ca05893 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-delete_thread-2.4.20.patch @@ -0,0 +1,541 @@ + fs/ext3/file.c | 4 + fs/ext3/inode.c | 116 ++++++++++++++++++++++ + fs/ext3/super.c | 230 +++++++++++++++++++++++++++++++++++++++++++++ + include/linux/ext3_fs.h | 5 + include/linux/ext3_fs_sb.h | 10 + + 5 files changed, 365 insertions(+) + +Index: linux-2.4.20/fs/ext3/super.c +=================================================================== +--- linux-2.4.20.orig/fs/ext3/super.c 2004-01-12 20:13:37.000000000 +0300 ++++ linux-2.4.20/fs/ext3/super.c 
2004-01-13 16:59:54.000000000 +0300 +@@ -48,6 +48,8 @@ + static void ext3_clear_journal_err(struct super_block * sb, + struct ext3_super_block * es); + ++static int ext3_sync_fs(struct super_block * sb); ++ + #ifdef CONFIG_JBD_DEBUG + int journal_no_write[2]; + +@@ -398,6 +400,221 @@ + } + } + ++#ifdef EXT3_DELETE_THREAD ++/* ++ * Delete inodes in a loop until there are no more to be deleted. ++ * Normally, we run in the background doing the deletes and sleeping again, ++ * and clients just add new inodes to be deleted onto the end of the list. ++ * If someone is concerned about free space (e.g. block allocation or similar) ++ * then they can sleep on s_delete_waiter_queue and be woken up when space ++ * has been freed. ++ */ ++int ext3_delete_thread(void *data) ++{ ++ struct super_block *sb = data; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct task_struct *tsk = current; ++ ++ /* Almost like daemonize, but not quite */ ++ exit_mm(current); ++ tsk->session = 1; ++ tsk->pgrp = 1; ++ tsk->tty = NULL; ++ exit_files(current); ++ reparent_to_init(); ++ ++ sprintf(tsk->comm, "kdelext3-%s", kdevname(sb->s_dev)); ++ sigfillset(&tsk->blocked); ++ ++ /*tsk->flags |= PF_KERNTHREAD;*/ ++ ++ INIT_LIST_HEAD(&sbi->s_delete_list); ++ wake_up(&sbi->s_delete_waiter_queue); ++ ext3_debug("delete thread on %s started\n", kdevname(sb->s_dev)); ++ ++ /* main loop */ ++ for (;;) { ++ wait_event_interruptible(sbi->s_delete_thread_queue, ++ !list_empty(&sbi->s_delete_list) || ++ !test_opt(sb, ASYNCDEL)); ++ ext3_debug("%s woken up: %lu inodes, %lu blocks\n", ++ tsk->comm,sbi->s_delete_inodes,sbi->s_delete_blocks); ++ ++ spin_lock(&sbi->s_delete_lock); ++ if (list_empty(&sbi->s_delete_list)) { ++ clear_opt(sbi->s_mount_opt, ASYNCDEL); ++ memset(&sbi->s_delete_list, 0, ++ sizeof(sbi->s_delete_list)); ++ spin_unlock(&sbi->s_delete_lock); ++ ext3_debug("delete thread on %s exiting\n", ++ kdevname(sb->s_dev)); ++ wake_up(&sbi->s_delete_waiter_queue); ++ break; ++ } ++ ++ while (!list_empty(&sbi->s_delete_list)) { ++ struct inode *inode=list_entry(sbi->s_delete_list.next, ++ struct inode, i_dentry); ++ unsigned long blocks = inode->i_blocks >> ++ (inode->i_blkbits - 9); ++ ++ list_del_init(&inode->i_dentry); ++ spin_unlock(&sbi->s_delete_lock); ++ ext3_debug("%s delete ino %lu blk %lu\n", ++ tsk->comm, inode->i_ino, blocks); ++ ++ iput(inode); ++ ++ spin_lock(&sbi->s_delete_lock); ++ sbi->s_delete_blocks -= blocks; ++ sbi->s_delete_inodes--; ++ } ++ if (sbi->s_delete_blocks != 0 || sbi->s_delete_inodes != 0) { ++ ext3_warning(sb, __FUNCTION__, ++ "%lu blocks, %lu inodes on list?\n", ++ sbi->s_delete_blocks,sbi->s_delete_inodes); ++ sbi->s_delete_blocks = 0; ++ sbi->s_delete_inodes = 0; ++ } ++ spin_unlock(&sbi->s_delete_lock); ++ wake_up(&sbi->s_delete_waiter_queue); ++ } ++ ++ return 0; ++} ++ ++static void ext3_start_delete_thread(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int rc; ++ ++ spin_lock_init(&sbi->s_delete_lock); ++ init_waitqueue_head(&sbi->s_delete_thread_queue); ++ init_waitqueue_head(&sbi->s_delete_waiter_queue); ++ ++ if (!test_opt(sb, ASYNCDEL)) ++ return; ++ ++ rc = kernel_thread(ext3_delete_thread, sb, CLONE_VM | CLONE_FILES); ++ if (rc < 0) ++ printk(KERN_ERR "EXT3-fs: cannot start delete thread: rc %d\n", ++ rc); ++ else ++ wait_event(sbi->s_delete_waiter_queue, sbi->s_delete_list.next); ++} ++ ++static void ext3_stop_delete_thread(struct ext3_sb_info *sbi) ++{ ++ if (sbi->s_delete_list.next == 0) /* thread never started */ ++ return; ++ ++ 
clear_opt(sbi->s_mount_opt, ASYNCDEL); ++ wake_up(&sbi->s_delete_thread_queue); ++ wait_event(sbi->s_delete_waiter_queue, ++ sbi->s_delete_list.next == 0 && sbi->s_delete_inodes == 0); ++} ++ ++/* Instead of playing games with the inode flags, destruction, etc we just ++ * create a new inode locally and put it on a list for the truncate thread. ++ * We need large parts of the inode struct in order to complete the ++ * truncate and unlink, so we may as well just have a real inode to do it. ++ * ++ * If we have any problem deferring the delete, just delete it right away. ++ * If we defer it, we also mark how many blocks it would free, so that we ++ * can keep the statfs data correct, and we know if we should sleep on the ++ * delete thread when we run out of space. ++ */ ++static void ext3_delete_inode_thread(struct inode *old_inode) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb); ++ struct ext3_inode_info *nei, *oei = EXT3_I(old_inode); ++ struct inode *new_inode; ++ unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9); ++ ++ if (is_bad_inode(old_inode)) { ++ clear_inode(old_inode); ++ return; ++ } ++ ++ if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next) ++ goto out_delete; ++ ++ /* We may want to delete the inode immediately and not defer it */ ++ if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS) ++ goto out_delete; ++ ++ /* We can't use the delete thread as-is during real orphan recovery, ++ * as we add to the orphan list here, causing ext3_orphan_cleanup() ++ * to loop endlessly. It would be nice to do so, but needs work. ++ */ ++ if (oei->i_state & EXT3_STATE_DELETE || ++ sbi->s_mount_state & EXT3_ORPHAN_FS) { ++ ext3_debug("doing deferred inode %lu delete (%lu blocks)\n", ++ old_inode->i_ino, blocks); ++ goto out_delete; ++ } ++ ++ /* We can iget this inode again here, because our caller has unhashed ++ * old_inode, so new_inode will be in a different inode struct. ++ * ++ * We need to ensure that the i_orphan pointers in the other inodes ++ * point at the new inode copy instead of the old one so the orphan ++ * list doesn't get corrupted when the old orphan inode is freed. ++ */ ++ down(&sbi->s_orphan_lock); ++ ++ sbi->s_mount_state |= EXT3_ORPHAN_FS; ++ new_inode = iget(old_inode->i_sb, old_inode->i_ino); ++ sbi->s_mount_state &= ~EXT3_ORPHAN_FS; ++ if (is_bad_inode(new_inode)) { ++ printk(KERN_WARNING "read bad inode %lu\n", old_inode->i_ino); ++ iput(new_inode); ++ new_inode = NULL; ++ } ++ if (!new_inode) { ++ up(&sbi->s_orphan_lock); ++ ext3_debug("delete inode %lu directly (bad read)\n", ++ old_inode->i_ino); ++ goto out_delete; ++ } ++ J_ASSERT(new_inode != old_inode); ++ ++ J_ASSERT(!list_empty(&oei->i_orphan)); ++ ++ nei = EXT3_I(new_inode); ++ /* Ugh. We need to insert new_inode into the same spot on the list ++ * as old_inode was, to ensure the in-memory orphan list is still ++ * in the same order as the on-disk orphan list (badness otherwise). 
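The three assignments that follow are a replace-in-place on a doubly linked list: the new node takes over the old node's exact position so list order is preserved. Factored out as a generic helper it would look like this — a sketch only, with a hypothetical name; the patch open-codes it on i_orphan:

    static inline void list_replace_node(struct list_head *old,
                                         struct list_head *new)
    {
            new->next = old->next;          /* take over old's links */
            new->prev = old->prev;
            new->next->prev = new;          /* point both neighbours at new */
            new->prev->next = new;
    }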
++ */ ++ nei->i_orphan = oei->i_orphan; ++ nei->i_orphan.next->prev = &nei->i_orphan; ++ nei->i_orphan.prev->next = &nei->i_orphan; ++ nei->i_state |= EXT3_STATE_DELETE; ++ up(&sbi->s_orphan_lock); ++ ++ clear_inode(old_inode); ++ ++ spin_lock(&sbi->s_delete_lock); ++ J_ASSERT(list_empty(&new_inode->i_dentry)); ++ list_add_tail(&new_inode->i_dentry, &sbi->s_delete_list); ++ sbi->s_delete_blocks += blocks; ++ sbi->s_delete_inodes++; ++ spin_unlock(&sbi->s_delete_lock); ++ ++ ext3_debug("delete inode %lu (%lu blocks) by thread\n", ++ new_inode->i_ino, blocks); ++ ++ wake_up(&sbi->s_delete_thread_queue); ++ return; ++ ++out_delete: ++ ext3_delete_inode(old_inode); ++} ++#else ++#define ext3_start_delete_thread(sbi) do {} while(0) ++#define ext3_stop_delete_thread(sbi) do {} while(0) ++#endif /* EXT3_DELETE_THREAD */ ++ + void ext3_put_super (struct super_block * sb) + { + struct ext3_sb_info *sbi = EXT3_SB(sb); +@@ -405,6 +622,7 @@ + kdev_t j_dev = sbi->s_journal->j_dev; + int i; + ++ J_ASSERT(sbi->s_delete_inodes == 0); + ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); + if (!(sb->s_flags & MS_RDONLY)) { +@@ -453,9 +671,14 @@ + write_inode: ext3_write_inode, /* BKL not held. Don't need */ + dirty_inode: ext3_dirty_inode, /* BKL not held. We take it */ + put_inode: ext3_put_inode, /* BKL not held. Don't need */ ++#ifdef EXT3_DELETE_THREAD ++ delete_inode: ext3_delete_inode_thread,/* BKL not held. We take it */ ++#else + delete_inode: ext3_delete_inode, /* BKL not held. We take it */ ++#endif + put_super: ext3_put_super, /* BKL held */ + write_super: ext3_write_super, /* BKL held */ ++ sync_fs: ext3_sync_fs, + write_super_lockfs: ext3_write_super_lockfs, /* BKL not held. Take it */ + unlockfs: ext3_unlockfs, /* BKL not held. We take it */ + statfs: ext3_statfs, /* BKL held */ +@@ -521,6 +744,13 @@ + clear_opt (*mount_options, XATTR_USER); + else + #endif ++#ifdef EXT3_DELETE_THREAD ++ if (!strcmp(this_char, "asyncdel")) ++ set_opt(*mount_options, ASYNCDEL); ++ else if (!strcmp(this_char, "noasyncdel")) ++ clear_opt(*mount_options, ASYNCDEL); ++ else ++#endif + if (!strcmp (this_char, "bsddf")) + clear_opt (*mount_options, MINIX_DF); + else if (!strcmp (this_char, "nouid32")) { +@@ -1220,6 +1450,7 @@ + } + + ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY); ++ ext3_start_delete_thread(sb); + /* + * akpm: core read_super() calls in here with the superblock locked. + * That deadlocks, because orphan cleanup needs to lock the superblock +@@ -1625,6 +1856,21 @@ + } + } + ++static int ext3_sync_fs(struct super_block *sb) ++{ ++ tid_t target; ++ ++ if (atomic_read(&sb->s_active) == 0) { ++ /* fs is being umounted: time to stop delete thread */ ++ ext3_stop_delete_thread(EXT3_SB(sb)); ++ } ++ ++ sb->s_dirt = 0; ++ target = log_start_commit(EXT3_SB(sb)->s_journal, NULL); ++ log_wait_commit(EXT3_SB(sb)->s_journal, target); ++ return 0; ++} ++ + /* + * LVM calls this function before a (read-only) snapshot is created. This + * gives us a chance to flush the journal completely and mark the fs clean. 
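ext3_sync_fs() above is the heart of the new hook: start a commit of the running transaction, then block until that tid reaches disk. Isolated as a standalone helper it would read as below — a minimal sketch using journal_t, log_start_commit() and log_wait_commit() exactly as the hunk above does:

    /* Flush the running transaction to the journal and wait for it:
     * the same commit-and-wait pair ext3_sync_fs() uses above. */
    static int flush_journal_sync(journal_t *journal)
    {
            tid_t target = log_start_commit(journal, NULL);

            log_wait_commit(journal, target);
            return 0;
    }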
+@@ -1682,6 +1928,9 @@ + if (!parse_options(data, &tmp, sbi, &tmp, 1)) + return -EINVAL; + ++ if (!test_opt(sb, ASYNCDEL) || (*flags & MS_RDONLY)) ++ ext3_stop_delete_thread(sbi); ++ + if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) + ext3_abort(sb, __FUNCTION__, "Abort forced by user"); + +Index: linux-2.4.20/fs/ext3/inode.c +=================================================================== +--- linux-2.4.20.orig/fs/ext3/inode.c 2004-01-12 20:13:37.000000000 +0300 ++++ linux-2.4.20/fs/ext3/inode.c 2004-01-13 16:55:45.000000000 +0300 +@@ -2552,6 +2552,118 @@ + return err; + } + ++#ifdef EXT3_DELETE_THREAD ++/* Move blocks from to-be-truncated inode over to a new inode, and delete ++ * that one from the delete thread instead. This avoids a lot of latency ++ * when truncating large files. ++ * ++ * If we have any problem deferring the truncate, just truncate it right away. ++ * If we defer it, we also mark how many blocks it would free, so that we ++ * can keep the statfs data correct, and we know if we should sleep on the ++ * delete thread when we run out of space. ++ */ ++void ext3_truncate_thread(struct inode *old_inode) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb); ++ struct ext3_inode_info *nei, *oei = EXT3_I(old_inode); ++ struct inode *new_inode; ++ handle_t *handle; ++ unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9); ++ ++ if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next) ++ goto out_truncate; ++ ++ /* XXX This is a temporary limitation for code simplicity. ++ * We could truncate to arbitrary sizes at some later time. ++ */ ++ if (old_inode->i_size != 0) ++ goto out_truncate; ++ ++ /* We may want to truncate the inode immediately and not defer it */ ++ if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS || ++ old_inode->i_size > oei->i_disksize) ++ goto out_truncate; ++ ++ /* We can't use the delete thread as-is during real orphan recovery, ++ * as we add to the orphan list here, causing ext3_orphan_cleanup() ++ * to loop endlessly. It would be nice to do so, but needs work. ++ */ ++ if (oei->i_state & EXT3_STATE_DELETE || ++ sbi->s_mount_state & EXT3_ORPHAN_FS) { ++ ext3_debug("doing deferred inode %lu delete (%lu blocks)\n", ++ old_inode->i_ino, blocks); ++ goto out_truncate; ++ } ++ ++ ext3_discard_prealloc(old_inode); ++ ++ /* old_inode = 1 ++ * new_inode = sb + GDT + ibitmap ++ * orphan list = 1 inode/superblock for add, 2 inodes for del ++ * quota files = 2 * EXT3_SINGLEDATA_TRANS_BLOCKS ++ */ ++ handle = ext3_journal_start(old_inode, 7); ++ if (IS_ERR(handle)) ++ goto out_truncate; ++ ++ new_inode = ext3_new_inode(handle, old_inode, old_inode->i_mode); ++ if (IS_ERR(new_inode)) { ++ ext3_debug("truncate inode %lu directly (no new inodes)\n", ++ old_inode->i_ino); ++ goto out_journal; ++ } ++ ++ nei = EXT3_I(new_inode); ++ ++ down_write(&oei->truncate_sem); ++ new_inode->i_size = old_inode->i_size; ++ new_inode->i_blocks = old_inode->i_blocks; ++ new_inode->i_uid = old_inode->i_uid; ++ new_inode->i_gid = old_inode->i_gid; ++ new_inode->i_nlink = 0; ++ ++ /* FIXME when we do arbitrary truncates */ ++ old_inode->i_blocks = oei->i_file_acl ? 
old_inode->i_blksize / 512 : 0; ++ old_inode->i_mtime = old_inode->i_ctime = CURRENT_TIME; ++ ++ memcpy(nei->i_data, oei->i_data, sizeof(nei->i_data)); ++ memset(oei->i_data, 0, sizeof(oei->i_data)); ++ ++ nei->i_disksize = oei->i_disksize; ++ nei->i_state |= EXT3_STATE_DELETE; ++ up_write(&oei->truncate_sem); ++ ++ if (ext3_orphan_add(handle, new_inode) < 0) ++ goto out_journal; ++ ++ if (ext3_orphan_del(handle, old_inode) < 0) { ++ ext3_orphan_del(handle, new_inode); ++ iput(new_inode); ++ goto out_journal; ++ } ++ ++ ext3_journal_stop(handle, old_inode); ++ ++ spin_lock(&sbi->s_delete_lock); ++ J_ASSERT(list_empty(&new_inode->i_dentry)); ++ list_add_tail(&new_inode->i_dentry, &sbi->s_delete_list); ++ sbi->s_delete_blocks += blocks; ++ sbi->s_delete_inodes++; ++ spin_unlock(&sbi->s_delete_lock); ++ ++ ext3_debug("delete inode %lu (%lu blocks) by thread\n", ++ new_inode->i_ino, blocks); ++ ++ wake_up(&sbi->s_delete_thread_queue); ++ return; ++ ++out_journal: ++ ext3_journal_stop(handle, old_inode); ++out_truncate: ++ ext3_truncate(old_inode); ++} ++#endif /* EXT3_DELETE_THREAD */ ++ + /* + * On success, We end up with an outstanding reference count against + * iloc->bh. This _must_ be cleaned up later. +Index: linux-2.4.20/fs/ext3/file.c +=================================================================== +--- linux-2.4.20.orig/fs/ext3/file.c 2004-01-12 20:13:36.000000000 +0300 ++++ linux-2.4.20/fs/ext3/file.c 2004-01-13 16:55:45.000000000 +0300 +@@ -125,7 +125,11 @@ + }; + + struct inode_operations ext3_file_inode_operations = { ++#ifdef EXT3_DELETE_THREAD ++ truncate: ext3_truncate_thread, /* BKL held */ ++#else + truncate: ext3_truncate, /* BKL held */ ++#endif + setattr: ext3_setattr, /* BKL held */ + setxattr: ext3_setxattr, /* BKL held */ + getxattr: ext3_getxattr, /* BKL held */ +Index: linux-2.4.20/fs/buffer.c +=================================================================== +--- linux-2.4.20.orig/fs/buffer.c 2003-05-16 05:29:12.000000000 +0400 ++++ linux-2.4.20/fs/buffer.c 2004-01-13 16:55:45.000000000 +0300 +@@ -328,6 +328,8 @@ + if (sb->s_dirt && sb->s_op && sb->s_op->write_super) + sb->s_op->write_super(sb); + unlock_super(sb); ++ if (sb->s_op && sb->s_op->sync_fs) ++ sb->s_op->sync_fs(sb); + unlock_kernel(); + + return sync_buffers(dev, 1); +Index: linux-2.4.20/include/linux/ext3_fs.h +=================================================================== +--- linux-2.4.20.orig/include/linux/ext3_fs.h 2004-01-12 20:13:37.000000000 +0300 ++++ linux-2.4.20/include/linux/ext3_fs.h 2004-01-13 16:55:45.000000000 +0300 +@@ -193,6 +193,7 @@ + */ + #define EXT3_STATE_JDATA 0x00000001 /* journaled data exists */ + #define EXT3_STATE_NEW 0x00000002 /* inode is newly created */ ++#define EXT3_STATE_DELETE 0x00000010 /* deferred delete inode */ + + /* + * ioctl commands +@@ -320,6 +321,7 @@ + #define EXT3_MOUNT_UPDATE_JOURNAL 0x1000 /* Update the journal format */ + #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */ + #define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */ ++#define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef _LINUX_EXT2_FS_H +@@ -696,6 +698,9 @@ + extern void ext3_dirty_inode(struct inode *); + extern int ext3_change_inode_journal_flag(struct inode *, int); + extern void ext3_truncate (struct inode *); ++#ifdef EXT3_DELETE_THREAD ++extern void ext3_truncate_thread(struct inode *inode); ++#endif + + /* ioctl.c */ + extern int ext3_ioctl (struct inode 
*, struct file *, unsigned int, +Index: linux-2.4.20/include/linux/ext3_fs_sb.h +=================================================================== +--- linux-2.4.20.orig/include/linux/ext3_fs_sb.h 2004-01-12 20:13:37.000000000 +0300 ++++ linux-2.4.20/include/linux/ext3_fs_sb.h 2004-01-13 16:55:45.000000000 +0300 +@@ -29,6 +29,8 @@ + + #define EXT3_MAX_GROUP_LOADED 8 + ++#define EXT3_DELETE_THREAD ++ + /* + * third extended-fs super-block data in memory + */ +@@ -76,6 +78,14 @@ + struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ + wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ + #endif ++#ifdef EXT3_DELETE_THREAD ++ spinlock_t s_delete_lock; ++ struct list_head s_delete_list; ++ unsigned long s_delete_blocks; ++ unsigned long s_delete_inodes; ++ wait_queue_head_t s_delete_thread_queue; ++ wait_queue_head_t s_delete_waiter_queue; ++#endif + }; + + #endif /* _LINUX_EXT3_FS_SB */ +Index: linux-2.4.20/include/linux/fs.h +=================================================================== +--- linux-2.4.20.orig/include/linux/fs.h 2004-01-12 20:13:36.000000000 +0300 ++++ linux-2.4.20/include/linux/fs.h 2004-01-13 16:55:45.000000000 +0300 +@@ -917,6 +917,7 @@ + void (*delete_inode) (struct inode *); + void (*put_super) (struct super_block *); + void (*write_super) (struct super_block *); ++ int (*sync_fs) (struct super_block *); + void (*write_super_lockfs) (struct super_block *); + void (*unlockfs) (struct super_block *); + int (*statfs) (struct super_block *, struct statfs *); diff --git a/lustre/kernel_patches/patches/ext3-htree-2.4.19-bgl.patch b/lustre/kernel_patches/patches/ext3-htree-2.4.19-bgl.patch new file mode 100644 index 0000000..6e4c834 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-htree-2.4.19-bgl.patch @@ -0,0 +1,2584 @@ + fs/ext3/Makefile | 2 + fs/ext3/dir.c | 302 +++++++++ + fs/ext3/file.c | 3 + fs/ext3/hash.c | 215 ++++++ + fs/ext3/namei.c | 1420 ++++++++++++++++++++++++++++++++++++++++----- + fs/ext3/super.c | 7 + include/linux/ext3_fs.h | 85 ++ + include/linux/ext3_fs_sb.h | 2 + include/linux/ext3_jbd.h | 2 + include/linux/rbtree.h | 2 + lib/rbtree.c | 42 + + 11 files changed, 1921 insertions(+), 161 deletions(-) + +Index: linux.mcp2/fs/ext3/dir.c +=================================================================== +--- linux.mcp2.orig/fs/ext3/dir.c 2004-05-17 15:03:55.000000000 -0700 ++++ linux.mcp2/fs/ext3/dir.c 2004-05-17 15:07:06.000000000 -0700 +@@ -21,12 +21,16 @@ + #include + #include + #include ++#include ++#include + + static unsigned char ext3_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK + }; + + static int ext3_readdir(struct file *, void *, filldir_t); ++static int ext3_dx_readdir(struct file * filp, ++ void * dirent, filldir_t filldir); + + struct file_operations ext3_dir_operations = { + read: generic_read_dir, +@@ -35,6 +39,17 @@ + fsync: ext3_sync_file, /* BKL held */ + }; + ++ ++static unsigned char get_dtype(struct super_block *sb, int filetype) ++{ ++ if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE) || ++ (filetype >= EXT3_FT_MAX)) ++ return DT_UNKNOWN; ++ ++ return (ext3_filetype_table[filetype]); ++} ++ ++ + int ext3_check_dir_entry (const char * function, struct inode * dir, + struct ext3_dir_entry_2 * de, + struct buffer_head * bh, +@@ -79,6 +94,16 @@ + + sb = inode->i_sb; + ++ if (is_dx(inode)) { ++ err = ext3_dx_readdir(filp, dirent, filldir); ++ if (err != ERR_BAD_DX_DIR) ++ return err; ++ /* ++ * We don't set 
the inode dirty flag since it's not ++ * critical that it get flushed back to the disk. ++ */ ++ EXT3_I(filp->f_dentry->d_inode)->i_flags &= ~EXT3_INDEX_FL; ++ } + stored = 0; + bh = NULL; + offset = filp->f_pos & (sb->s_blocksize - 1); +@@ -162,18 +187,12 @@ + * during the copy operation. + */ + unsigned long version = filp->f_version; +- unsigned char d_type = DT_UNKNOWN; + +- if (EXT3_HAS_INCOMPAT_FEATURE(sb, +- EXT3_FEATURE_INCOMPAT_FILETYPE) +- && de->file_type < EXT3_FT_MAX) +- d_type = +- ext3_filetype_table[de->file_type]; + error = filldir(dirent, de->name, + de->name_len, + filp->f_pos, + le32_to_cpu(de->inode), +- d_type); ++ get_dtype(sb, de->file_type)); + if (error) + break; + if (version != filp->f_version) +@@ -188,3 +207,272 @@ + UPDATE_ATIME(inode); + return 0; + } ++ ++#ifdef CONFIG_EXT3_INDEX ++/* ++ * These functions convert from the major/minor hash to an f_pos ++ * value. ++ * ++ * Currently we only use major hash numer. This is unfortunate, but ++ * on 32-bit machines, the same VFS interface is used for lseek and ++ * llseek, so if we use the 64 bit offset, then the 32-bit versions of ++ * lseek/telldir/seekdir will blow out spectacularly, and from within ++ * the ext2 low-level routine, we don't know if we're being called by ++ * a 64-bit version of the system call or the 32-bit version of the ++ * system call. Worse yet, NFSv2 only allows for a 32-bit readdir ++ * cookie. Sigh. ++ */ ++#define hash2pos(major, minor) (major >> 1) ++#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff) ++#define pos2min_hash(pos) (0) ++ ++/* ++ * This structure holds the nodes of the red-black tree used to store ++ * the directory entry in hash order. ++ */ ++struct fname { ++ __u32 hash; ++ __u32 minor_hash; ++ rb_node_t rb_hash; ++ struct fname *next; ++ __u32 inode; ++ __u8 name_len; ++ __u8 file_type; ++ char name[0]; ++}; ++ ++/* ++ * This functoin implements a non-recursive way of freeing all of the ++ * nodes in the red-black tree. ++ */ ++static void free_rb_tree_fname(rb_root_t *root) ++{ ++ rb_node_t *n = root->rb_node; ++ rb_node_t *parent; ++ struct fname *fname; ++ ++ while (n) { ++ /* Do the node's children first */ ++ if ((n)->rb_left) { ++ n = n->rb_left; ++ continue; ++ } ++ if (n->rb_right) { ++ n = n->rb_right; ++ continue; ++ } ++ /* ++ * The node has no children; free it, and then zero ++ * out parent's link to it. Finally go to the ++ * beginning of the loop and try to free the parent ++ * node. ++ */ ++ parent = n->rb_parent; ++ fname = rb_entry(n, struct fname, rb_hash); ++ kfree(fname); ++ if (!parent) ++ root->rb_node = 0; ++ else if (parent->rb_left == n) ++ parent->rb_left = 0; ++ else if (parent->rb_right == n) ++ parent->rb_right = 0; ++ n = parent; ++ } ++ root->rb_node = 0; ++} ++ ++ ++struct dir_private_info *create_dir_info(loff_t pos) ++{ ++ struct dir_private_info *p; ++ ++ p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL); ++ if (!p) ++ return NULL; ++ p->root.rb_node = 0; ++ p->curr_node = 0; ++ p->extra_fname = 0; ++ p->last_pos = 0; ++ p->curr_hash = pos2maj_hash(pos); ++ p->curr_minor_hash = pos2min_hash(pos); ++ p->next_hash = 0; ++ return p; ++} ++ ++void ext3_htree_free_dir_info(struct dir_private_info *p) ++{ ++ free_rb_tree_fname(&p->root); ++ kfree(p); ++} ++ ++/* ++ * Given a directory entry, enter it into the fname rb tree. 
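One more note on the cookie mapping defined above before the insert helper: only the even major hash survives a telldir()/seekdir() round trip. A worked sketch through the hash2pos()/pos2maj_hash() macros — hinfo and minor_hash stand for any pair produced by ext3fs_dirhash():

    __u32 hash = hinfo.hash;                  /* even: dirhash clears bit 0 */
    loff_t pos = hash2pos(hash, minor_hash);  /* == hash >> 1 */
    __u32 back = pos2maj_hash(pos);           /* == (pos << 1) & 0xffffffff */
    /* back == hash exactly, but pos2min_hash(pos) is always 0: the
     * minor hash is deliberately dropped to fit 32-bit f_pos users. */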
++ */ ++int ext3_htree_store_dirent(struct file *dir_file, __u32 hash, ++ __u32 minor_hash, ++ struct ext3_dir_entry_2 *dirent) ++{ ++ rb_node_t **p, *parent = NULL; ++ struct fname * fname, *new_fn; ++ struct dir_private_info *info; ++ int len; ++ ++ info = (struct dir_private_info *) dir_file->private_data; ++ p = &info->root.rb_node; ++ ++ /* Create and allocate the fname structure */ ++ len = sizeof(struct fname) + dirent->name_len + 1; ++ new_fn = kmalloc(len, GFP_KERNEL); ++ if (!new_fn) ++ return -ENOMEM; ++ memset(new_fn, 0, len); ++ new_fn->hash = hash; ++ new_fn->minor_hash = minor_hash; ++ new_fn->inode = le32_to_cpu(dirent->inode); ++ new_fn->name_len = dirent->name_len; ++ new_fn->file_type = dirent->file_type; ++ memcpy(new_fn->name, dirent->name, dirent->name_len); ++ new_fn->name[dirent->name_len] = 0; ++ ++ while (*p) { ++ parent = *p; ++ fname = rb_entry(parent, struct fname, rb_hash); ++ ++ /* ++ * If the hash and minor hash match up, then we put ++ * them on a linked list. This rarely happens... ++ */ ++ if ((new_fn->hash == fname->hash) && ++ (new_fn->minor_hash == fname->minor_hash)) { ++ new_fn->next = fname->next; ++ fname->next = new_fn; ++ return 0; ++ } ++ ++ if (new_fn->hash < fname->hash) ++ p = &(*p)->rb_left; ++ else if (new_fn->hash > fname->hash) ++ p = &(*p)->rb_right; ++ else if (new_fn->minor_hash < fname->minor_hash) ++ p = &(*p)->rb_left; ++ else /* if (new_fn->minor_hash > fname->minor_hash) */ ++ p = &(*p)->rb_right; ++ } ++ ++ rb_link_node(&new_fn->rb_hash, parent, p); ++ rb_insert_color(&new_fn->rb_hash, &info->root); ++ return 0; ++} ++ ++ ++ ++/* ++ * This is a helper function for ext3_dx_readdir. It calls filldir ++ * for all entres on the fname linked list. (Normally there is only ++ * one entry on the linked list, unless there are 62 bit hash collisions.) ++ */ ++static int call_filldir(struct file * filp, void * dirent, ++ filldir_t filldir, struct fname *fname) ++{ ++ struct dir_private_info *info = filp->private_data; ++ loff_t curr_pos; ++ struct inode *inode = filp->f_dentry->d_inode; ++ struct super_block * sb; ++ int error; ++ ++ sb = inode->i_sb; ++ ++ if (!fname) { ++ printk("call_filldir: called with null fname?!?\n"); ++ return 0; ++ } ++ curr_pos = hash2pos(fname->hash, fname->minor_hash); ++ while (fname) { ++ error = filldir(dirent, fname->name, ++ fname->name_len, curr_pos, ++ fname->inode, ++ get_dtype(sb, fname->file_type)); ++ if (error) { ++ filp->f_pos = curr_pos; ++ info->extra_fname = fname->next; ++ return error; ++ } ++ fname = fname->next; ++ } ++ return 0; ++} ++ ++static int ext3_dx_readdir(struct file * filp, ++ void * dirent, filldir_t filldir) ++{ ++ struct dir_private_info *info = filp->private_data; ++ struct inode *inode = filp->f_dentry->d_inode; ++ struct fname *fname; ++ int ret; ++ ++ if (!info) { ++ info = create_dir_info(filp->f_pos); ++ if (!info) ++ return -ENOMEM; ++ filp->private_data = info; ++ } ++ ++ /* Some one has messed with f_pos; reset the world */ ++ if (info->last_pos != filp->f_pos) { ++ free_rb_tree_fname(&info->root); ++ info->curr_node = 0; ++ info->extra_fname = 0; ++ info->curr_hash = pos2maj_hash(filp->f_pos); ++ info->curr_minor_hash = pos2min_hash(filp->f_pos); ++ } ++ ++ /* ++ * If there are any leftover names on the hash collision ++ * chain, return them first. 
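The chain in question comes from ext3_htree_store_dirent() above, which strings full 64-bit collisions off a single tree node via fname->next rather than widening the tree. Consuming one node therefore always means draining that list, roughly as follows — process_dirent() is a hypothetical consumer standing in for filldir:

    struct fname *f;

    for (f = rb_entry(n, struct fname, rb_hash); f != NULL; f = f->next)
            /* every f on this chain shares the same (hash, minor_hash) */
            process_dirent(f->name, f->name_len, f->inode);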
++ */ ++ if (info->extra_fname && ++ call_filldir(filp, dirent, filldir, info->extra_fname)) ++ goto finished; ++ ++ if (!info->curr_node) ++ info->curr_node = rb_get_first(&info->root); ++ ++ while (1) { ++ /* ++ * Fill the rbtree if we have no more entries, ++ * or the inode has changed since we last read in the ++ * cached entries. ++ */ ++ if ((!info->curr_node) || ++ (filp->f_version != inode->i_version)) { ++ info->curr_node = 0; ++ free_rb_tree_fname(&info->root); ++ filp->f_version = inode->i_version; ++ ret = ext3_htree_fill_tree(filp, info->curr_hash, ++ info->curr_minor_hash, ++ &info->next_hash); ++ if (ret < 0) ++ return ret; ++ if (ret == 0) ++ break; ++ info->curr_node = rb_get_first(&info->root); ++ } ++ ++ fname = rb_entry(info->curr_node, struct fname, rb_hash); ++ info->curr_hash = fname->hash; ++ info->curr_minor_hash = fname->minor_hash; ++ if (call_filldir(filp, dirent, filldir, fname)) ++ break; ++ ++ info->curr_node = rb_get_next(info->curr_node); ++ if (!info->curr_node) { ++ info->curr_hash = info->next_hash; ++ info->curr_minor_hash = 0; ++ } ++ } ++finished: ++ info->last_pos = filp->f_pos; ++ UPDATE_ATIME(inode); ++ return 0; ++} ++#endif +Index: linux.mcp2/fs/ext3/file.c +=================================================================== +--- linux.mcp2.orig/fs/ext3/file.c 2004-05-17 15:03:55.000000000 -0700 ++++ linux.mcp2/fs/ext3/file.c 2004-05-17 15:07:06.000000000 -0700 +@@ -35,6 +35,9 @@ + { + if (filp->f_mode & FMODE_WRITE) + ext3_discard_prealloc (inode); ++ if (is_dx(inode) && filp->private_data) ++ ext3_htree_free_dir_info(filp->private_data); ++ + return 0; + } + +Index: linux.mcp2/fs/ext3/hash.c +=================================================================== +--- linux.mcp2.orig/fs/ext3/hash.c 2002-04-11 07:25:15.000000000 -0700 ++++ linux.mcp2/fs/ext3/hash.c 2004-05-17 15:07:06.000000000 -0700 +@@ -0,0 +1,215 @@ ++/* ++ * linux/fs/ext3/hash.c ++ * ++ * Copyright (C) 2002 by Theodore Ts'o ++ * ++ * This file is released under the GPL v2. ++ * ++ * This file may be redistributed under the terms of the GNU Public ++ * License. ++ */ ++ ++#include ++#include ++#include ++#include ++ ++#define DELTA 0x9E3779B9 ++ ++static void TEA_transform(__u32 buf[4], __u32 const in[]) ++{ ++ __u32 sum = 0; ++ __u32 b0 = buf[0], b1 = buf[1]; ++ __u32 a = in[0], b = in[1], c = in[2], d = in[3]; ++ int n = 16; ++ ++ do { ++ sum += DELTA; ++ b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); ++ b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); ++ } while(--n); ++ ++ buf[0] += b0; ++ buf[1] += b1; ++} ++ ++/* F, G and H are basic MD4 functions: selection, majority, parity */ ++#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) ++#define G(x, y, z) (((x) & (y)) + (((x) ^ (y)) & (z))) ++#define H(x, y, z) ((x) ^ (y) ^ (z)) ++ ++/* ++ * The generic round function. The application is so specific that ++ * we don't bother protecting all the arguments with parens, as is generally ++ * good macro practice, in favor of extra legibility. ++ * Rotation is separate from addition to prevent recomputation ++ */ ++#define ROUND(f, a, b, c, d, x, s) \ ++ (a += f(b, c, d) + x, a = (a << s) | (a >> (32-s))) ++#define K1 0 ++#define K2 013240474631UL ++#define K3 015666365641UL ++ ++/* ++ * Basic cut-down MD4 transform. Returns only 32 bits of result. 
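For orientation before the transform itself: ext3fs_dirhash() further down drives it 32 input bytes per call. The loop below is a condensed copy of that DX_HASH_HALF_MD4 case, assuming p and len are the name pointer and its length:

    __u32 in[8], buf[4] = { 0x67452301, 0xefcdab89,
                            0x98badcfe, 0x10325476 };   /* MD4 initial state */

    while (len > 0) {
            str2hashbuf(p, len, in, 8);     /* pack and pad 32 bytes */
            halfMD4Transform(buf, in);
            len -= 32;
            p += 32;
    }
    /* hash = buf[1]; minor_hash = buf[2]; */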
++ */ ++static void halfMD4Transform (__u32 buf[4], __u32 const in[]) ++{ ++ __u32 a = buf[0], b = buf[1], c = buf[2], d = buf[3]; ++ ++ /* Round 1 */ ++ ROUND(F, a, b, c, d, in[0] + K1, 3); ++ ROUND(F, d, a, b, c, in[1] + K1, 7); ++ ROUND(F, c, d, a, b, in[2] + K1, 11); ++ ROUND(F, b, c, d, a, in[3] + K1, 19); ++ ROUND(F, a, b, c, d, in[4] + K1, 3); ++ ROUND(F, d, a, b, c, in[5] + K1, 7); ++ ROUND(F, c, d, a, b, in[6] + K1, 11); ++ ROUND(F, b, c, d, a, in[7] + K1, 19); ++ ++ /* Round 2 */ ++ ROUND(G, a, b, c, d, in[1] + K2, 3); ++ ROUND(G, d, a, b, c, in[3] + K2, 5); ++ ROUND(G, c, d, a, b, in[5] + K2, 9); ++ ROUND(G, b, c, d, a, in[7] + K2, 13); ++ ROUND(G, a, b, c, d, in[0] + K2, 3); ++ ROUND(G, d, a, b, c, in[2] + K2, 5); ++ ROUND(G, c, d, a, b, in[4] + K2, 9); ++ ROUND(G, b, c, d, a, in[6] + K2, 13); ++ ++ /* Round 3 */ ++ ROUND(H, a, b, c, d, in[3] + K3, 3); ++ ROUND(H, d, a, b, c, in[7] + K3, 9); ++ ROUND(H, c, d, a, b, in[2] + K3, 11); ++ ROUND(H, b, c, d, a, in[6] + K3, 15); ++ ROUND(H, a, b, c, d, in[1] + K3, 3); ++ ROUND(H, d, a, b, c, in[5] + K3, 9); ++ ROUND(H, c, d, a, b, in[0] + K3, 11); ++ ROUND(H, b, c, d, a, in[4] + K3, 15); ++ ++ buf[0] += a; ++ buf[1] += b; ++ buf[2] += c; ++ buf[3] += d; ++} ++ ++#undef ROUND ++#undef F ++#undef G ++#undef H ++#undef K1 ++#undef K2 ++#undef K3 ++ ++/* The old legacy hash */ ++static __u32 dx_hack_hash (const char *name, int len) ++{ ++ __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; ++ while (len--) { ++ __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373)); ++ ++ if (hash & 0x80000000) hash -= 0x7fffffff; ++ hash1 = hash0; ++ hash0 = hash; ++ } ++ return (hash0 << 1); ++} ++ ++static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) ++{ ++ __u32 pad, val; ++ int i; ++ ++ pad = (__u32)len | ((__u32)len << 8); ++ pad |= pad << 16; ++ ++ val = pad; ++ if (len > num*4) ++ len = num * 4; ++ for (i=0; i < len; i++) { ++ if ((i % 4) == 0) ++ val = pad; ++ val = msg[i] + (val << 8); ++ if ((i % 4) == 3) { ++ *buf++ = val; ++ val = pad; ++ num--; ++ } ++ } ++ if (--num >= 0) ++ *buf++ = val; ++ while (--num >= 0) ++ *buf++ = pad; ++} ++ ++/* ++ * Returns the hash of a filename. If len is 0 and name is NULL, then ++ * this function can be used to test whether or not a hash version is ++ * supported. ++ * ++ * The seed is an 4 longword (32 bits) "secret" which can be used to ++ * uniquify a hash. If the seed is all zero's, then some default seed ++ * may be used. ++ * ++ * A particular hash version specifies whether or not the seed is ++ * represented, and whether or not the returned hash is 32 bits or 64 ++ * bits. 32 bit hashes will return 0 for the minor hash. 
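A minimal caller sketch for the function that follows — the name and version here are arbitrary, and a NULL seed falls back to the built-in MD4 initial values, as does an all-zero one:

    struct dx_hash_info hinfo;

    hinfo.hash_version = DX_HASH_TEA;
    hinfo.seed = NULL;                      /* use the default seed */
    if (ext3fs_dirhash("lost+found", 10, &hinfo) == 0)
            printk("hash %#x minor %#x\n", hinfo.hash, hinfo.minor_hash);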
++ */ ++int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) ++{ ++ __u32 hash; ++ __u32 minor_hash = 0; ++ const char *p; ++ int i; ++ __u32 in[8], buf[4]; ++ ++ /* Initialize the default seed for the hash checksum functions */ ++ buf[0] = 0x67452301; ++ buf[1] = 0xefcdab89; ++ buf[2] = 0x98badcfe; ++ buf[3] = 0x10325476; ++ ++ /* Check to see if the seed is all zero's */ ++ if (hinfo->seed) { ++ for (i=0; i < 4; i++) { ++ if (hinfo->seed[i]) ++ break; ++ } ++ if (i < 4) ++ memcpy(buf, hinfo->seed, sizeof(buf)); ++ } ++ ++ switch (hinfo->hash_version) { ++ case DX_HASH_LEGACY: ++ hash = dx_hack_hash(name, len); ++ break; ++ case DX_HASH_HALF_MD4: ++ p = name; ++ while (len > 0) { ++ str2hashbuf(p, len, in, 8); ++ halfMD4Transform(buf, in); ++ len -= 32; ++ p += 32; ++ } ++ minor_hash = buf[2]; ++ hash = buf[1]; ++ break; ++ case DX_HASH_TEA: ++ p = name; ++ while (len > 0) { ++ str2hashbuf(p, len, in, 4); ++ TEA_transform(buf, in); ++ len -= 16; ++ p += 16; ++ } ++ hash = buf[0]; ++ minor_hash = buf[1]; ++ break; ++ default: ++ hinfo->hash = 0; ++ return -1; ++ } ++ hinfo->hash = hash & ~1; ++ hinfo->minor_hash = minor_hash; ++ return 0; ++} +Index: linux.mcp2/fs/ext3/Makefile +=================================================================== +--- linux.mcp2.orig/fs/ext3/Makefile 2004-05-17 15:03:55.000000000 -0700 ++++ linux.mcp2/fs/ext3/Makefile 2004-05-17 15:07:06.000000000 -0700 +@@ -10,7 +10,7 @@ + O_TARGET := ext3.o + + obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ +- ioctl.o namei.o super.o symlink.o ++ ioctl.o namei.o super.o symlink.o hash.o + obj-m := $(O_TARGET) + + include $(TOPDIR)/Rules.make +Index: linux.mcp2/fs/ext3/namei.c +=================================================================== +--- linux.mcp2.orig/fs/ext3/namei.c 2004-05-17 15:03:55.000000000 -0700 ++++ linux.mcp2/fs/ext3/namei.c 2004-05-17 15:07:06.000000000 -0700 +@@ -16,6 +16,12 @@ + * David S. 
Miller (davem@caip.rutgers.edu), 1995 + * Directory entry file type support and forward compatibility hooks + * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998 ++ * Hash Tree Directory indexing (c) ++ * Daniel Phillips, 2001 ++ * Hash Tree Directory indexing porting ++ * Christopher Li, 2002 ++ * Hash Tree Directory indexing cleanup ++ * Theodore Ts'o, 2002 + */ + + #include +@@ -38,6 +44,642 @@ + #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) + #define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) + ++static struct buffer_head *ext3_append(handle_t *handle, ++ struct inode *inode, ++ u32 *block, int *err) ++{ ++ struct buffer_head *bh; ++ ++ *block = inode->i_size >> inode->i_sb->s_blocksize_bits; ++ ++ if ((bh = ext3_bread(handle, inode, *block, 1, err))) { ++ inode->i_size += inode->i_sb->s_blocksize; ++ EXT3_I(inode)->i_disksize = inode->i_size; ++ ext3_journal_get_write_access(handle,bh); ++ } ++ return bh; ++} ++ ++#ifndef assert ++#define assert(test) J_ASSERT(test) ++#endif ++ ++#ifndef swap ++#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0) ++#endif ++ ++typedef struct { u32 v; } le_u32; ++typedef struct { u16 v; } le_u16; ++ ++#ifdef DX_DEBUG ++#define dxtrace(command) command ++#else ++#define dxtrace(command) ++#endif ++ ++struct fake_dirent ++{ ++ /*le*/u32 inode; ++ /*le*/u16 rec_len; ++ u8 name_len; ++ u8 file_type; ++}; ++ ++struct dx_countlimit ++{ ++ le_u16 limit; ++ le_u16 count; ++}; ++ ++struct dx_entry ++{ ++ le_u32 hash; ++ le_u32 block; ++}; ++ ++/* ++ * dx_root_info is laid out so that if it should somehow get overlaid by a ++ * dirent the two low bits of the hash version will be zero. Therefore, the ++ * hash version mod 4 should never be 0. Sincerely, the paranoia department. 
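A practical consequence of this self-describing layout: consumers must locate the entry array relative to info_length rather than by a fixed offset, so the info block can grow in later formats. That is exactly what dx_probe() does further down; in sketch form, with bh the directory's block 0 buffer:

    struct dx_root *root = (struct dx_root *)bh->b_data;
    struct dx_entry *entries;

    /* skip the fake "." / ".." dirents and the variable-size info block */
    entries = (struct dx_entry *)((char *)&root->info +
                                  root->info.info_length);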
++ */ ++ ++struct dx_root ++{ ++ struct fake_dirent dot; ++ char dot_name[4]; ++ struct fake_dirent dotdot; ++ char dotdot_name[4]; ++ struct dx_root_info ++ { ++ le_u32 reserved_zero; ++ u8 hash_version; ++ u8 info_length; /* 8 */ ++ u8 indirect_levels; ++ u8 unused_flags; ++ } ++ info; ++ struct dx_entry entries[0]; ++}; ++ ++struct dx_node ++{ ++ struct fake_dirent fake; ++ struct dx_entry entries[0]; ++}; ++ ++ ++struct dx_frame ++{ ++ struct buffer_head *bh; ++ struct dx_entry *entries; ++ struct dx_entry *at; ++}; ++ ++struct dx_map_entry ++{ ++ u32 hash; ++ u32 offs; ++}; ++ ++#ifdef CONFIG_EXT3_INDEX ++static inline unsigned dx_get_block (struct dx_entry *entry); ++static void dx_set_block (struct dx_entry *entry, unsigned value); ++static inline unsigned dx_get_hash (struct dx_entry *entry); ++static void dx_set_hash (struct dx_entry *entry, unsigned value); ++static unsigned dx_get_count (struct dx_entry *entries); ++static unsigned dx_get_limit (struct dx_entry *entries); ++static void dx_set_count (struct dx_entry *entries, unsigned value); ++static void dx_set_limit (struct dx_entry *entries, unsigned value); ++static unsigned dx_root_limit (struct inode *dir, unsigned infosize); ++static unsigned dx_node_limit (struct inode *dir); ++static struct dx_frame *dx_probe(struct dentry *dentry, ++ struct inode *dir, ++ struct dx_hash_info *hinfo, ++ struct dx_frame *frame, ++ int *err); ++static void dx_release (struct dx_frame *frames); ++static int dx_make_map (struct ext3_dir_entry_2 *de, int size, ++ struct dx_hash_info *hinfo, struct dx_map_entry map[]); ++static void dx_sort_map(struct dx_map_entry *map, unsigned count); ++static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to, ++ struct dx_map_entry *offsets, int count); ++static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size); ++static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block); ++static int ext3_htree_next_block(struct inode *dir, __u32 hash, ++ struct dx_frame *frame, ++ struct dx_frame *frames, int *err, ++ __u32 *start_hash); ++static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, ++ struct ext3_dir_entry_2 **res_dir, int *err); ++static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, ++ struct inode *inode); ++ ++/* ++ * Future: use high four bits of block for coalesce-on-delete flags ++ * Mask them off for now. 
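Two representational tricks hide behind the accessors that follow: block numbers reserve their upper bits for future flags, and entries[0] is overlaid by a count/limit header, which is why the binary search in dx_probe() starts at entries + 1. In sketch form, mirroring dx_get_block() and dx_get_count() below:

    /* upper bits of a dx_entry block pointer are reserved flag space */
    unsigned block = le32_to_cpu(entry->block.v) & 0x00ffffff;

    /* entries[0] doubles as the header: real entries begin at entries + 1 */
    unsigned count = le16_to_cpu(((struct dx_countlimit *)entries)->count.v);
    unsigned limit = le16_to_cpu(((struct dx_countlimit *)entries)->limit.v);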
++ */ ++ ++static inline unsigned dx_get_block (struct dx_entry *entry) ++{ ++ return le32_to_cpu(entry->block.v) & 0x00ffffff; ++} ++ ++static inline void dx_set_block (struct dx_entry *entry, unsigned value) ++{ ++ entry->block.v = cpu_to_le32(value); ++} ++ ++static inline unsigned dx_get_hash (struct dx_entry *entry) ++{ ++ return le32_to_cpu(entry->hash.v); ++} ++ ++static inline void dx_set_hash (struct dx_entry *entry, unsigned value) ++{ ++ entry->hash.v = cpu_to_le32(value); ++} ++ ++static inline unsigned dx_get_count (struct dx_entry *entries) ++{ ++ return le16_to_cpu(((struct dx_countlimit *) entries)->count.v); ++} ++ ++static inline unsigned dx_get_limit (struct dx_entry *entries) ++{ ++ return le16_to_cpu(((struct dx_countlimit *) entries)->limit.v); ++} ++ ++static inline void dx_set_count (struct dx_entry *entries, unsigned value) ++{ ++ ((struct dx_countlimit *) entries)->count.v = cpu_to_le16(value); ++} ++ ++static inline void dx_set_limit (struct dx_entry *entries, unsigned value) ++{ ++ ((struct dx_countlimit *) entries)->limit.v = cpu_to_le16(value); ++} ++ ++static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize) ++{ ++ unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) - ++ EXT3_DIR_REC_LEN(2) - infosize; ++ return 0? 20: entry_space / sizeof(struct dx_entry); ++} ++ ++static inline unsigned dx_node_limit (struct inode *dir) ++{ ++ unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0); ++ return 0? 22: entry_space / sizeof(struct dx_entry); ++} ++ ++/* ++ * Debug ++ */ ++#ifdef DX_DEBUG ++struct stats ++{ ++ unsigned names; ++ unsigned space; ++ unsigned bcount; ++}; ++ ++static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3_dir_entry_2 *de, ++ int size, int show_names) ++{ ++ unsigned names = 0, space = 0; ++ char *base = (char *) de; ++ struct dx_hash_info h = *hinfo; ++ ++ printk("names: "); ++ while ((char *) de < base + size) ++ { ++ if (de->inode) ++ { ++ if (show_names) ++ { ++ int len = de->name_len; ++ char *name = de->name; ++ while (len--) printk("%c", *name++); ++ ext3fs_dirhash(de->name, de->name_len, &h); ++ printk(":%x.%u ", h.hash, ++ ((char *) de - base)); ++ } ++ space += EXT3_DIR_REC_LEN(de->name_len); ++ names++; ++ } ++ de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len)); ++ } ++ printk("(%i)\n", names); ++ return (struct stats) { names, space, 1 }; ++} ++ ++struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, ++ struct dx_entry *entries, int levels) ++{ ++ unsigned blocksize = dir->i_sb->s_blocksize; ++ unsigned count = dx_get_count (entries), names = 0, space = 0, i; ++ unsigned bcount = 0; ++ struct buffer_head *bh; ++ int err; ++ printk("%i indexed blocks...\n", count); ++ for (i = 0; i < count; i++, entries++) ++ { ++ u32 block = dx_get_block(entries), hash = i? dx_get_hash(entries): 0; ++ u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash; ++ struct stats stats; ++ printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range); ++ if (!(bh = ext3_bread (NULL,dir, block, 0,&err))) continue; ++ stats = levels? 
++ dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1): ++ dx_show_leaf(hinfo, (struct ext3_dir_entry_2 *) bh->b_data, blocksize, 0); ++ names += stats.names; ++ space += stats.space; ++ bcount += stats.bcount; ++ brelse (bh); ++ } ++ if (bcount) ++ printk("%snames %u, fullness %u (%u%%)\n", levels?"":" ", ++ names, space/bcount,(space/bcount)*100/blocksize); ++ return (struct stats) { names, space, bcount}; ++} ++#endif /* DX_DEBUG */ ++ ++/* ++ * Probe for a directory leaf block to search. ++ * ++ * dx_probe can return ERR_BAD_DX_DIR, which means there was a format ++ * error in the directory index, and the caller should fall back to ++ * searching the directory normally. The callers of dx_probe **MUST** ++ * check for this error code, and make sure it never gets reflected ++ * back to userspace. ++ */ ++static struct dx_frame * ++dx_probe(struct dentry *dentry, struct inode *dir, ++ struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) ++{ ++ unsigned count, indirect; ++ struct dx_entry *at, *entries, *p, *q, *m; ++ struct dx_root *root; ++ struct buffer_head *bh; ++ struct dx_frame *frame = frame_in; ++ u32 hash; ++ ++ frame->bh = NULL; ++ if (dentry) ++ dir = dentry->d_parent->d_inode; ++ if (!(bh = ext3_bread (NULL,dir, 0, 0, err))) ++ goto fail; ++ root = (struct dx_root *) bh->b_data; ++ if (root->info.hash_version != DX_HASH_TEA && ++ root->info.hash_version != DX_HASH_HALF_MD4 && ++ root->info.hash_version != DX_HASH_LEGACY) { ++ ext3_warning(dir->i_sb, __FUNCTION__, ++ "Unrecognised inode hash code %d", ++ root->info.hash_version); ++ brelse(bh); ++ *err = ERR_BAD_DX_DIR; ++ goto fail; ++ } ++ hinfo->hash_version = root->info.hash_version; ++ hinfo->seed = dir->i_sb->u.ext3_sb.s_hash_seed; ++ if (dentry) ++ ext3fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo); ++ hash = hinfo->hash; ++ ++ if (root->info.unused_flags & 1) { ++ ext3_warning(dir->i_sb, __FUNCTION__, ++ "Unimplemented inode hash flags: %#06x", ++ root->info.unused_flags); ++ brelse(bh); ++ *err = ERR_BAD_DX_DIR; ++ goto fail; ++ } ++ ++ if ((indirect = root->info.indirect_levels) > 1) { ++ ext3_warning(dir->i_sb, __FUNCTION__, ++ "Unimplemented inode hash depth: %#06x", ++ root->info.indirect_levels); ++ brelse(bh); ++ *err = ERR_BAD_DX_DIR; ++ goto fail; ++ } ++ ++ entries = (struct dx_entry *) (((char *)&root->info) + ++ root->info.info_length); ++ assert(dx_get_limit(entries) == dx_root_limit(dir, ++ root->info.info_length)); ++ dxtrace (printk("Look up %x", hash)); ++ while (1) ++ { ++ count = dx_get_count(entries); ++ assert (count && count <= dx_get_limit(entries)); ++ p = entries + 1; ++ q = entries + count - 1; ++ while (p <= q) ++ { ++ m = p + (q - p)/2; ++ dxtrace(printk(".")); ++ if (dx_get_hash(m) > hash) ++ q = m - 1; ++ else ++ p = m + 1; ++ } ++ ++ if (0) // linear search cross check ++ { ++ unsigned n = count - 1; ++ at = entries; ++ while (n--) ++ { ++ dxtrace(printk(",")); ++ if (dx_get_hash(++at) > hash) ++ { ++ at--; ++ break; ++ } ++ } ++ assert (at == p - 1); ++ } ++ ++ at = p - 1; ++ dxtrace(printk(" %x->%u\n", at == entries? 
0: dx_get_hash(at), dx_get_block(at))); ++ frame->bh = bh; ++ frame->entries = entries; ++ frame->at = at; ++ if (!indirect--) return frame; ++ if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err))) ++ goto fail2; ++ at = entries = ((struct dx_node *) bh->b_data)->entries; ++ assert (dx_get_limit(entries) == dx_node_limit (dir)); ++ frame++; ++ } ++fail2: ++ while (frame >= frame_in) { ++ brelse(frame->bh); ++ frame--; ++ } ++fail: ++ return NULL; ++} ++ ++static void dx_release (struct dx_frame *frames) ++{ ++ if (frames[0].bh == NULL) ++ return; ++ ++ if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels) ++ brelse(frames[1].bh); ++ brelse(frames[0].bh); ++} ++ ++/* ++ * This function increments the frame pointer to search the next leaf ++ * block, and reads in the necessary intervening nodes if the search ++ * should be necessary. Whether or not the search is necessary is ++ * controlled by the hash parameter. If the hash value is even, then ++ * the search is only continued if the next block starts with that ++ * hash value. This is used if we are searching for a specific file. ++ * ++ * If the hash value is HASH_NB_ALWAYS, then always go to the next block. ++ * ++ * This function returns 1 if the caller should continue to search, ++ * or 0 if it should not. If there is an error reading one of the ++ * index blocks, it will return -1. ++ * ++ * If start_hash is non-null, it will be filled in with the starting ++ * hash of the next page. ++ */ ++static int ext3_htree_next_block(struct inode *dir, __u32 hash, ++ struct dx_frame *frame, ++ struct dx_frame *frames, int *err, ++ __u32 *start_hash) ++{ ++ struct dx_frame *p; ++ struct buffer_head *bh; ++ int num_frames = 0; ++ __u32 bhash; ++ ++ *err = ENOENT; ++ p = frame; ++ /* ++ * Find the next leaf page by incrementing the frame pointer. ++ * If we run out of entries in the interior node, loop around and ++ * increment pointer in the parent node. When we break out of ++ * this loop, num_frames indicates the number of interior ++ * nodes need to be read. ++ */ ++ while (1) { ++ if (++(p->at) < p->entries + dx_get_count(p->entries)) ++ break; ++ if (p == frames) ++ return 0; ++ num_frames++; ++ p--; ++ } ++ ++ /* ++ * If the hash is 1, then continue only if the next page has a ++ * continuation hash of any value. This is used for readdir ++ * handling. Otherwise, check to see if the hash matches the ++ * desired contiuation hash. If it doesn't, return since ++ * there's no point to read in the successive index pages. ++ */ ++ bhash = dx_get_hash(p->at); ++ if (start_hash) ++ *start_hash = bhash; ++ if ((hash & 1) == 0) { ++ if ((bhash & ~1) != hash) ++ return 0; ++ } ++ /* ++ * If the hash is HASH_NB_ALWAYS, we always go to the next ++ * block so no check is necessary ++ */ ++ while (num_frames--) { ++ if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at), ++ 0, err))) ++ return -1; /* Failure */ ++ p++; ++ brelse (p->bh); ++ p->bh = bh; ++ p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; ++ } ++ return 1; ++} ++ ++ ++/* ++ * p is at least 6 bytes before the end of page ++ */ ++static inline struct ext3_dir_entry_2 *ext3_next_entry(struct ext3_dir_entry_2 *p) ++{ ++ return (struct ext3_dir_entry_2 *)((char*)p + le16_to_cpu(p->rec_len)); ++} ++ ++/* ++ * This function fills a red-black tree with information from a ++ * directory. We start scanning the directory in hash order, starting ++ * at start_hash and start_minor_hash. 
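It leans on the continuation-hash convention of ext3_htree_next_block() above: real hashes always have bit 0 clear (ext3fs_dirhash() masks with ~1), so the odd value HASH_NB_ALWAYS can never match a stored hash and simply means "advance unconditionally". The specific-lookup case reduces to:

    bhash = dx_get_hash(p->at);     /* first hash of the next leaf */
    if ((hash & 1) == 0) {          /* searching for one exact hash */
            if ((bhash & ~1) != hash)
                    return 0;       /* next leaf cannot contain it */
    }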
++ * ++ * This function returns the number of entries inserted into the tree, ++ * or a negative error code. ++ */ ++int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash, ++ __u32 start_minor_hash, __u32 *next_hash) ++{ ++ struct dx_hash_info hinfo; ++ struct buffer_head *bh; ++ struct ext3_dir_entry_2 *de, *top; ++ static struct dx_frame frames[2], *frame; ++ struct inode *dir; ++ int block, err; ++ int count = 0; ++ int ret; ++ __u32 hashval; ++ ++ dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash, ++ start_minor_hash)); ++ dir = dir_file->f_dentry->d_inode; ++ hinfo.hash = start_hash; ++ hinfo.minor_hash = 0; ++ frame = dx_probe(0, dir_file->f_dentry->d_inode, &hinfo, frames, &err); ++ if (!frame) ++ return err; ++ ++ /* Add '.' and '..' from the htree header */ ++ if (!start_hash && !start_minor_hash) { ++ de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data; ++ if ((err = ext3_htree_store_dirent(dir_file, 0, 0, de)) != 0) ++ goto errout; ++ de = ext3_next_entry(de); ++ if ((err = ext3_htree_store_dirent(dir_file, 0, 0, de)) != 0) ++ goto errout; ++ count += 2; ++ } ++ ++ while (1) { ++ block = dx_get_block(frame->at); ++ dxtrace(printk("Reading block %d\n", block)); ++ if (!(bh = ext3_bread (NULL, dir, block, 0, &err))) ++ goto errout; ++ ++ de = (struct ext3_dir_entry_2 *) bh->b_data; ++ top = (struct ext3_dir_entry_2 *) ((char *) de + dir->i_sb->s_blocksize - ++ EXT3_DIR_REC_LEN(0)); ++ for (; de < top; de = ext3_next_entry(de)) { ++ ext3fs_dirhash(de->name, de->name_len, &hinfo); ++ if ((hinfo.hash < start_hash) || ++ ((hinfo.hash == start_hash) && ++ (hinfo.minor_hash < start_minor_hash))) ++ continue; ++ if ((err = ext3_htree_store_dirent(dir_file, ++ hinfo.hash, hinfo.minor_hash, de)) != 0) ++ goto errout; ++ count++; ++ } ++ brelse (bh); ++ hashval = ~1; ++ ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS, ++ frame, frames, &err, &hashval); ++ if (next_hash) ++ *next_hash = hashval; ++ if (ret == -1) ++ goto errout; ++ /* ++ * Stop if: (a) there are no more entries, or ++ * (b) we have inserted at least one entry and the ++ * next hash value is not a continuation ++ */ ++ if ((ret == 0) || ++ (count && ((hashval & 1) == 0))) ++ break; ++ } ++ dx_release(frames); ++ dxtrace(printk("Fill tree: returned %d entries\n", count)); ++ return count; ++errout: ++ dx_release(frames); ++ return (err); ++} ++ ++ ++/* ++ * Directory block splitting, compacting ++ */ ++ ++static int dx_make_map (struct ext3_dir_entry_2 *de, int size, ++ struct dx_hash_info *hinfo, struct dx_map_entry *map_tail) ++{ ++ int count = 0; ++ char *base = (char *) de; ++ struct dx_hash_info h = *hinfo; ++ ++ while ((char *) de < base + size) ++ { ++ if (de->name_len && de->inode) { ++ ext3fs_dirhash(de->name, de->name_len, &h); ++ map_tail--; ++ map_tail->hash = h.hash; ++ map_tail->offs = (u32) ((char *) de - base); ++ count++; ++ } ++ /* XXX: do we need to check rec_len == 0 case? 
-Chris */ ++ de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len)); ++ } ++ return count; ++} ++ ++static void dx_sort_map (struct dx_map_entry *map, unsigned count) ++{ ++ struct dx_map_entry *p, *q, *top = map + count - 1; ++ int more; ++ /* Combsort until bubble sort doesn't suck */ ++ while (count > 2) ++ { ++ count = count*10/13; ++ if (count - 9 < 2) /* 9, 10 -> 11 */ ++ count = 11; ++ for (p = top, q = p - count; q >= map; p--, q--) ++ if (p->hash < q->hash) ++ swap(*p, *q); ++ } ++ /* Garden variety bubble sort */ ++ do { ++ more = 0; ++ q = top; ++ while (q-- > map) ++ { ++ if (q[1].hash >= q[0].hash) ++ continue; ++ swap(*(q+1), *q); ++ more = 1; ++ } ++ } while(more); ++} ++ ++static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block) ++{ ++ struct dx_entry *entries = frame->entries; ++ struct dx_entry *old = frame->at, *new = old + 1; ++ int count = dx_get_count(entries); ++ ++ assert(count < dx_get_limit(entries)); ++ assert(old < entries + count); ++ memmove(new + 1, new, (char *)(entries + count) - (char *)(new)); ++ dx_set_hash(new, hash); ++ dx_set_block(new, block); ++ dx_set_count(entries, count + 1); ++} ++#endif ++ ++ ++static void ext3_update_dx_flag(struct inode *inode) ++{ ++ if (!EXT3_HAS_COMPAT_FEATURE(inode->i_sb, ++ EXT3_FEATURE_COMPAT_DIR_INDEX)) ++ EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL; ++} ++ + /* + * NOTE! unlike strncmp, ext3_match returns 1 for success, 0 for failure. + * +@@ -94,6 +736,7 @@ + return 0; + } + ++ + /* + * ext3_find_entry() + * +@@ -105,6 +748,8 @@ + * The returned buffer_head has ->b_count elevated. The caller is expected + * to brelse() it when appropriate. + */ ++ ++ + static struct buffer_head * ext3_find_entry (struct dentry *dentry, + struct ext3_dir_entry_2 ** res_dir) + { +@@ -119,12 +764,32 @@ + int num = 0; + int nblocks, i, err; + struct inode *dir = dentry->d_parent->d_inode; ++ int namelen; ++ const u8 *name; ++ unsigned blocksize; + + *res_dir = NULL; + sb = dir->i_sb; +- ++ blocksize = sb->s_blocksize; ++ namelen = dentry->d_name.len; ++ name = dentry->d_name.name; ++ if (namelen > EXT3_NAME_LEN) ++ return NULL; ++#ifdef CONFIG_EXT3_INDEX ++ if (is_dx(dir)) { ++ bh = ext3_dx_find_entry(dentry, res_dir, &err); ++ /* ++ * On success, or if the error was file not found, ++ * return. Otherwise, fall back to doing a search the ++ * old fashioned way. 
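Returning to the map helpers above: do_split() (later in this patch) chains them into a small pipeline to find the median hash of a full leaf. Condensed from that call site, with data1/map/hinfo as declared there:

    /* build the (hash, offset) map at the end of the new block's buffer */
    count = dx_make_map((struct ext3_dir_entry_2 *)data1, blocksize,
                        hinfo, map);
    map -= count;                   /* dx_make_map() fills the tail first */
    dx_sort_map(map, count);        /* combsort, then a bubble pass */
    hash2 = map[count / 2].hash;    /* median hash: the new split point */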
++ */ ++ if (bh || (err != ERR_BAD_DX_DIR)) ++ return bh; ++ dxtrace(printk("ext3_find_entry: dx failed, falling back\n")); ++ } ++#endif + nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb); +- start = dir->u.ext3_i.i_dir_start_lookup; ++ start = EXT3_I(dir)->i_dir_start_lookup; + if (start >= nblocks) + start = 0; + block = start; +@@ -165,7 +830,7 @@ + i = search_dirblock(bh, dir, dentry, + block << EXT3_BLOCK_SIZE_BITS(sb), res_dir); + if (i == 1) { +- dir->u.ext3_i.i_dir_start_lookup = block; ++ EXT3_I(dir)->i_dir_start_lookup = block; + ret = bh; + goto cleanup_and_exit; + } else { +@@ -196,6 +861,66 @@ + return ret; + } + ++#ifdef CONFIG_EXT3_INDEX ++static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, ++ struct ext3_dir_entry_2 **res_dir, int *err) ++{ ++ struct super_block * sb; ++ struct dx_hash_info hinfo; ++ u32 hash; ++ struct dx_frame frames[2], *frame; ++ struct ext3_dir_entry_2 *de, *top; ++ struct buffer_head *bh; ++ unsigned long block; ++ int retval; ++ int namelen = dentry->d_name.len; ++ const u8 *name = dentry->d_name.name; ++ struct inode *dir = dentry->d_parent->d_inode; ++ ++ sb = dir->i_sb; ++ if (!(frame = dx_probe (dentry, 0, &hinfo, frames, err))) ++ return NULL; ++ hash = hinfo.hash; ++ do { ++ block = dx_get_block(frame->at); ++ if (!(bh = ext3_bread (NULL,dir, block, 0, err))) ++ goto errout; ++ de = (struct ext3_dir_entry_2 *) bh->b_data; ++ top = (struct ext3_dir_entry_2 *) ((char *) de + sb->s_blocksize - ++ EXT3_DIR_REC_LEN(0)); ++ for (; de < top; de = ext3_next_entry(de)) ++ if (ext3_match (namelen, name, de)) { ++ if (!ext3_check_dir_entry("ext3_find_entry", ++ dir, de, bh, ++ (block<b_data))) { ++ brelse (bh); ++ goto errout; ++ } ++ *res_dir = de; ++ dx_release (frames); ++ return bh; ++ } ++ brelse (bh); ++ /* Check to see if we should continue to search */ ++ retval = ext3_htree_next_block(dir, hash, frame, ++ frames, err, 0); ++ if (retval == -1) { ++ ext3_warning(sb, __FUNCTION__, ++ "error reading index page in directory #%lu", ++ dir->i_ino); ++ goto errout; ++ } ++ } while (retval == 1); ++ ++ *err = -ENOENT; ++errout: ++ dxtrace(printk("%s not found\n", name)); ++ dx_release (frames); ++ return NULL; ++} ++#endif ++ + static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry) + { + struct inode * inode; +@@ -212,8 +937,9 @@ + brelse (bh); + inode = iget(dir->i_sb, ino); + +- if (!inode) ++ if (!inode) { + return ERR_PTR(-EACCES); ++ } + } + d_add(dentry, inode); + return NULL; +@@ -237,6 +963,301 @@ + de->file_type = ext3_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; + } + ++#ifdef CONFIG_EXT3_INDEX ++static struct ext3_dir_entry_2 * ++dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count) ++{ ++ unsigned rec_len = 0; ++ ++ while (count--) { ++ struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs); ++ rec_len = EXT3_DIR_REC_LEN(de->name_len); ++ memcpy (to, de, rec_len); ++ ((struct ext3_dir_entry_2 *)to)->rec_len = cpu_to_le16(rec_len); ++ de->inode = 0; ++ map++; ++ to += rec_len; ++ } ++ return (struct ext3_dir_entry_2 *) (to - rec_len); ++} ++ ++static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size) ++{ ++ struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base; ++ unsigned rec_len = 0; ++ ++ prev = to = de; ++ while ((char*)de < base + size) { ++ next = (struct ext3_dir_entry_2 *) ((char *) de + ++ le16_to_cpu(de->rec_len)); ++ if (de->inode && de->name_len) { ++ rec_len = EXT3_DIR_REC_LEN(de->name_len); ++ if (de > to) ++ 
memmove(to, de, rec_len); ++ to->rec_len = cpu_to_le16(rec_len); ++ prev = to; ++ to = (struct ext3_dir_entry_2 *)((char *)to + rec_len); ++ } ++ de = next; ++ } ++ return prev; ++} ++ ++static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, ++ struct buffer_head **bh,struct dx_frame *frame, ++ struct dx_hash_info *hinfo, int *error) ++{ ++ unsigned blocksize = dir->i_sb->s_blocksize; ++ unsigned count, continued; ++ struct buffer_head *bh2; ++ u32 newblock; ++ u32 hash2; ++ struct dx_map_entry *map; ++ char *data1 = (*bh)->b_data, *data2; ++ unsigned split; ++ struct ext3_dir_entry_2 *de = NULL, *de2; ++ int err; ++ ++ bh2 = ext3_append (handle, dir, &newblock, error); ++ if (!(bh2)) { ++ brelse(*bh); ++ *bh = NULL; ++ goto errout; ++ } ++ ++ BUFFER_TRACE(*bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, *bh); ++ if (err) { ++ journal_error: ++ brelse(*bh); ++ brelse(bh2); ++ *bh = NULL; ++ ext3_std_error(dir->i_sb, err); ++ goto errout; ++ } ++ BUFFER_TRACE(frame->bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, frame->bh); ++ if (err) ++ goto journal_error; ++ ++ data2 = bh2->b_data; ++ ++ /* create map in the end of data2 block */ ++ map = (struct dx_map_entry *) (data2 + blocksize); ++ count = dx_make_map ((struct ext3_dir_entry_2 *) data1, ++ blocksize, hinfo, map); ++ map -= count; ++ split = count/2; // need to adjust to actual middle ++ dx_sort_map (map, count); ++ hash2 = map[split].hash; ++ continued = hash2 == map[split - 1].hash; ++ dxtrace(printk("Split block %i at %x, %i/%i\n", ++ dx_get_block(frame->at), hash2, split, count-split)); ++ ++ /* Fancy dance to stay within two buffers */ ++ de2 = dx_move_dirents(data1, data2, map + split, count - split); ++ de = dx_pack_dirents(data1,blocksize); ++ de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de); ++ de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2); ++ dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1)); ++ dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1)); ++ ++ /* Which block gets the new entry? */ ++ if (hinfo->hash >= hash2) ++ { ++ swap(*bh, bh2); ++ de = de2; ++ } ++ dx_insert_block (frame, hash2 + continued, newblock); ++ err = ext3_journal_dirty_metadata (handle, bh2); ++ if (err) ++ goto journal_error; ++ err = ext3_journal_dirty_metadata (handle, frame->bh); ++ if (err) ++ goto journal_error; ++ brelse (bh2); ++ dxtrace(dx_show_index ("frame", frame->entries)); ++errout: ++ return de; ++} ++#endif ++ ++ ++/* ++ * Add a new entry into a directory (leaf) block. If de is non-NULL, ++ * it points to a directory entry which is guaranteed to be large ++ * enough for new directory entry. If de is NULL, then ++ * add_dirent_to_buf will attempt search the directory block for ++ * space. It will return -ENOSPC if no space is available, and -EIO ++ * and -EEXIST if directory entry already exists. ++ * ++ * NOTE! bh is NOT released in the case where ENOSPC is returned. In ++ * all other cases bh is released. 
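++ *
++ * A typical caller therefore looks like this (a sketch; compare
++ * the loop in ext3_add_entry() below):
++ *
++ *	retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
++ *	if (retval != -ENOSPC)
++ *		return retval;
++ *	brelse(bh);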
++ */ ++static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, ++ struct inode *inode, struct ext3_dir_entry_2 *de, ++ struct buffer_head * bh) ++{ ++ struct inode *dir = dentry->d_parent->d_inode; ++ const char *name = dentry->d_name.name; ++ int namelen = dentry->d_name.len; ++ unsigned long offset = 0; ++ unsigned short reclen; ++ int nlen, rlen, err; ++ char *top; ++ ++ reclen = EXT3_DIR_REC_LEN(namelen); ++ if (!de) { ++ de = (struct ext3_dir_entry_2 *)bh->b_data; ++ top = bh->b_data + dir->i_sb->s_blocksize - reclen; ++ while ((char *) de <= top) { ++ if (!ext3_check_dir_entry("ext3_add_entry", dir, de, ++ bh, offset)) { ++ brelse (bh); ++ return -EIO; ++ } ++ if (ext3_match (namelen, name, de)) { ++ brelse (bh); ++ return -EEXIST; ++ } ++ nlen = EXT3_DIR_REC_LEN(de->name_len); ++ rlen = le16_to_cpu(de->rec_len); ++ if ((de->inode? rlen - nlen: rlen) >= reclen) ++ break; ++ de = (struct ext3_dir_entry_2 *)((char *)de + rlen); ++ offset += rlen; ++ } ++ if ((char *) de > top) ++ return -ENOSPC; ++ } ++ BUFFER_TRACE(bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, bh); ++ if (err) { ++ ext3_std_error(dir->i_sb, err); ++ brelse(bh); ++ return err; ++ } ++ ++ /* By now the buffer is marked for journaling */ ++ nlen = EXT3_DIR_REC_LEN(de->name_len); ++ rlen = le16_to_cpu(de->rec_len); ++ if (de->inode) { ++ struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen); ++ de1->rec_len = cpu_to_le16(rlen - nlen); ++ de->rec_len = cpu_to_le16(nlen); ++ de = de1; ++ } ++ de->file_type = EXT3_FT_UNKNOWN; ++ if (inode) { ++ de->inode = cpu_to_le32(inode->i_ino); ++ ext3_set_de_type(dir->i_sb, de, inode->i_mode); ++ } else ++ de->inode = 0; ++ de->name_len = namelen; ++ memcpy (de->name, name, namelen); ++ /* ++ * XXX shouldn't update any times until successful ++ * completion of syscall, but too many callers depend ++ * on this. ++ * ++ * XXX similarly, too many callers depend on ++ * ext3_new_inode() setting the times, but error ++ * recovery deletes the inode, so the worst that can ++ * happen is that the times are slightly out of date ++ * and/or different from the directory change time. ++ */ ++ dir->i_mtime = dir->i_ctime = CURRENT_TIME; ++ ext3_update_dx_flag(dir); ++ dir->i_version = ++event; ++ ext3_mark_inode_dirty(handle, dir); ++ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); ++ err = ext3_journal_dirty_metadata(handle, bh); ++ if (err) ++ ext3_std_error(dir->i_sb, err); ++ brelse(bh); ++ return 0; ++} ++ ++#ifdef CONFIG_EXT3_INDEX ++/* ++ * This converts a one block unindexed directory to a 3 block indexed ++ * directory, and adds the dentry to the indexed directory. 
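++ *
++ * Layout afterwards (sketch): block 0 keeps "." and ".." and
++ * becomes the dx_root with a one-entry index, the remaining
++ * dirents move to block 1, and do_split() then divides them
++ * between blocks 1 and 2 by hash value.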
++ */ ++static int make_indexed_dir(handle_t *handle, struct dentry *dentry, ++ struct inode *inode, struct buffer_head *bh) ++{ ++ struct inode *dir = dentry->d_parent->d_inode; ++ const char *name = dentry->d_name.name; ++ int namelen = dentry->d_name.len; ++ struct buffer_head *bh2; ++ struct dx_root *root; ++ struct dx_frame frames[2], *frame; ++ struct dx_entry *entries; ++ struct ext3_dir_entry_2 *de, *de2; ++ char *data1, *top; ++ unsigned len; ++ int retval; ++ unsigned blocksize; ++ struct dx_hash_info hinfo; ++ u32 block; ++ ++ blocksize = dir->i_sb->s_blocksize; ++ dxtrace(printk("Creating index\n")); ++ retval = ext3_journal_get_write_access(handle, bh); ++ if (retval) { ++ ext3_std_error(dir->i_sb, retval); ++ brelse(bh); ++ return retval; ++ } ++ root = (struct dx_root *) bh->b_data; ++ ++ EXT3_I(dir)->i_flags |= EXT3_INDEX_FL; ++ bh2 = ext3_append (handle, dir, &block, &retval); ++ if (!(bh2)) { ++ brelse(bh); ++ return retval; ++ } ++ data1 = bh2->b_data; ++ ++ /* The 0th block becomes the root, move the dirents out */ ++ de = (struct ext3_dir_entry_2 *)&root->dotdot; ++ de = (struct ext3_dir_entry_2 *)((char *)de + le16_to_cpu(de->rec_len)); ++ len = ((char *) root) + blocksize - (char *) de; ++ memcpy (data1, de, len); ++ de = (struct ext3_dir_entry_2 *) data1; ++ top = data1 + len; ++ while (((char *) de2=(char*)de+le16_to_cpu(de->rec_len)) < top) ++ de = de2; ++ de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de); ++ /* Initialize the root; the dot dirents already exist */ ++ de = (struct ext3_dir_entry_2 *) (&root->dotdot); ++ de->rec_len = cpu_to_le16(blocksize - EXT3_DIR_REC_LEN(2)); ++ memset (&root->info, 0, sizeof(root->info)); ++ root->info.info_length = sizeof(root->info); ++ root->info.hash_version = dir->i_sb->u.ext3_sb.s_def_hash_version; ++ entries = root->entries; ++ dx_set_block (entries, 1); ++ dx_set_count (entries, 1); ++ dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info))); ++ ++ /* Initialize as for dx_probe */ ++ hinfo.hash_version = root->info.hash_version; ++ hinfo.seed = dir->i_sb->u.ext3_sb.s_hash_seed; ++ ext3fs_dirhash(name, namelen, &hinfo); ++ frame = frames; ++ frame->entries = entries; ++ frame->at = entries; ++ frame->bh = bh; ++ bh = bh2; ++ de = do_split(handle,dir, &bh, frame, &hinfo, &retval); ++ dx_release (frames); ++ if (!(de)) ++ return retval; ++ ++ return add_dirent_to_buf(handle, dentry, inode, de, bh); ++} ++#endif ++ + /* + * ext3_add_entry() + * +@@ -247,127 +1268,198 @@ + * may not sleep between calling this and putting something into + * the entry, as someone else might have used it while you slept. 
+ */ +- +-/* +- * AKPM: the journalling code here looks wrong on the error paths +- */ + static int ext3_add_entry (handle_t *handle, struct dentry *dentry, + struct inode *inode) + { + struct inode *dir = dentry->d_parent->d_inode; +- const char *name = dentry->d_name.name; +- int namelen = dentry->d_name.len; + unsigned long offset; +- unsigned short rec_len; + struct buffer_head * bh; +- struct ext3_dir_entry_2 * de, * de1; ++ struct ext3_dir_entry_2 *de; + struct super_block * sb; + int retval; ++#ifdef CONFIG_EXT3_INDEX ++ int dx_fallback=0; ++#endif ++ unsigned blocksize; ++ unsigned nlen, rlen; ++ u32 block, blocks; + + sb = dir->i_sb; +- +- if (!namelen) ++ blocksize = sb->s_blocksize; ++ if (!dentry->d_name.len) + return -EINVAL; +- bh = ext3_bread (handle, dir, 0, 0, &retval); ++#ifdef CONFIG_EXT3_INDEX ++ if (is_dx(dir)) { ++ retval = ext3_dx_add_entry(handle, dentry, inode); ++ if (!retval || (retval != ERR_BAD_DX_DIR)) ++ return retval; ++ EXT3_I(dir)->i_flags &= ~EXT3_INDEX_FL; ++ dx_fallback++; ++ ext3_mark_inode_dirty(handle, dir); ++ } ++#endif ++ blocks = dir->i_size >> sb->s_blocksize_bits; ++ for (block = 0, offset = 0; block < blocks; block++) { ++ bh = ext3_bread(handle, dir, block, 0, &retval); ++ if(!bh) ++ return retval; ++ retval = add_dirent_to_buf(handle, dentry, inode, 0, bh); ++ if (retval != -ENOSPC) ++ return retval; ++ ++#ifdef CONFIG_EXT3_INDEX ++ if (blocks == 1 && !dx_fallback && ++ EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX)) ++ return make_indexed_dir(handle, dentry, inode, bh); ++#endif ++ brelse(bh); ++ } ++ bh = ext3_append(handle, dir, &block, &retval); + if (!bh) + return retval; +- rec_len = EXT3_DIR_REC_LEN(namelen); +- offset = 0; + de = (struct ext3_dir_entry_2 *) bh->b_data; +- while (1) { +- if ((char *)de >= sb->s_blocksize + bh->b_data) { +- brelse (bh); +- bh = NULL; +- bh = ext3_bread (handle, dir, +- offset >> EXT3_BLOCK_SIZE_BITS(sb), 1, &retval); +- if (!bh) +- return retval; +- if (dir->i_size <= offset) { +- if (dir->i_size == 0) { +- brelse(bh); +- return -ENOENT; +- } ++ de->inode = 0; ++ de->rec_len = cpu_to_le16(rlen = blocksize); ++ nlen = 0; ++ return add_dirent_to_buf(handle, dentry, inode, de, bh); ++} + +- ext3_debug ("creating next block\n"); ++#ifdef CONFIG_EXT3_INDEX ++/* ++ * Returns 0 for success, or a negative error value ++ */ ++static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, ++ struct inode *inode) ++{ ++ struct dx_frame frames[2], *frame; ++ struct dx_entry *entries, *at; ++ struct dx_hash_info hinfo; ++ struct buffer_head * bh; ++ struct inode *dir = dentry->d_parent->d_inode; ++ struct super_block * sb = dir->i_sb; ++ struct ext3_dir_entry_2 *de; ++ int err; + +- BUFFER_TRACE(bh, "get_write_access"); +- ext3_journal_get_write_access(handle, bh); +- de = (struct ext3_dir_entry_2 *) bh->b_data; +- de->inode = 0; +- de->rec_len = le16_to_cpu(sb->s_blocksize); +- dir->u.ext3_i.i_disksize = +- dir->i_size = offset + sb->s_blocksize; +- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; +- ext3_mark_inode_dirty(handle, dir); +- } else { ++ frame = dx_probe(dentry, 0, &hinfo, frames, &err); ++ if (!frame) ++ return err; ++ entries = frame->entries; ++ at = frame->at; + +- ext3_debug ("skipping to next block\n"); ++ if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err))) ++ goto cleanup; + +- de = (struct ext3_dir_entry_2 *) bh->b_data; +- } +- } +- if (!ext3_check_dir_entry ("ext3_add_entry", dir, de, bh, +- offset)) { +- brelse (bh); +- return -ENOENT; +- } +- if (ext3_match 
(namelen, name, de)) { +- brelse (bh); +- return -EEXIST; ++ BUFFER_TRACE(bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, bh); ++ if (err) ++ goto journal_error; ++ ++ err = add_dirent_to_buf(handle, dentry, inode, 0, bh); ++ if (err != -ENOSPC) { ++ bh = 0; ++ goto cleanup; ++ } ++ ++ /* Block full, should compress but for now just split */ ++ dxtrace(printk("using %u of %u node entries\n", ++ dx_get_count(entries), dx_get_limit(entries))); ++ /* Need to split index? */ ++ if (dx_get_count(entries) == dx_get_limit(entries)) { ++ u32 newblock; ++ unsigned icount = dx_get_count(entries); ++ int levels = frame - frames; ++ struct dx_entry *entries2; ++ struct dx_node *node2; ++ struct buffer_head *bh2; ++ ++ if (levels && (dx_get_count(frames->entries) == ++ dx_get_limit(frames->entries))) { ++ ext3_warning(sb, __FUNCTION__, ++ "Directory index full!\n"); ++ err = -ENOSPC; ++ goto cleanup; + } +- if ((le32_to_cpu(de->inode) == 0 && +- le16_to_cpu(de->rec_len) >= rec_len) || +- (le16_to_cpu(de->rec_len) >= +- EXT3_DIR_REC_LEN(de->name_len) + rec_len)) { +- BUFFER_TRACE(bh, "get_write_access"); +- ext3_journal_get_write_access(handle, bh); +- /* By now the buffer is marked for journaling */ +- offset += le16_to_cpu(de->rec_len); +- if (le32_to_cpu(de->inode)) { +- de1 = (struct ext3_dir_entry_2 *) ((char *) de + +- EXT3_DIR_REC_LEN(de->name_len)); +- de1->rec_len = +- cpu_to_le16(le16_to_cpu(de->rec_len) - +- EXT3_DIR_REC_LEN(de->name_len)); +- de->rec_len = cpu_to_le16( +- EXT3_DIR_REC_LEN(de->name_len)); +- de = de1; ++ bh2 = ext3_append (handle, dir, &newblock, &err); ++ if (!(bh2)) ++ goto cleanup; ++ node2 = (struct dx_node *)(bh2->b_data); ++ entries2 = node2->entries; ++ node2->fake.rec_len = cpu_to_le16(sb->s_blocksize); ++ node2->fake.inode = 0; ++ BUFFER_TRACE(frame->bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, frame->bh); ++ if (err) ++ goto journal_error; ++ if (levels) { ++ unsigned icount1 = icount/2, icount2 = icount - icount1; ++ unsigned hash2 = dx_get_hash(entries + icount1); ++ dxtrace(printk("Split index %i/%i\n", icount1, icount2)); ++ ++ BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ ++ err = ext3_journal_get_write_access(handle, ++ frames[0].bh); ++ if (err) ++ goto journal_error; ++ ++ memcpy ((char *) entries2, (char *) (entries + icount1), ++ icount2 * sizeof(struct dx_entry)); ++ dx_set_count (entries, icount1); ++ dx_set_count (entries2, icount2); ++ dx_set_limit (entries2, dx_node_limit(dir)); ++ ++ /* Which index block gets the new entry? */ ++ if (at - entries >= icount1) { ++ frame->at = at = at - entries - icount1 + entries2; ++ frame->entries = entries = entries2; ++ swap(frame->bh, bh2); + } +- de->file_type = EXT3_FT_UNKNOWN; +- if (inode) { +- de->inode = cpu_to_le32(inode->i_ino); +- ext3_set_de_type(dir->i_sb, de, inode->i_mode); +- } else +- de->inode = 0; +- de->name_len = namelen; +- memcpy (de->name, name, namelen); +- /* +- * XXX shouldn't update any times until successful +- * completion of syscall, but too many callers depend +- * on this. +- * +- * XXX similarly, too many callers depend on +- * ext3_new_inode() setting the times, but error +- * recovery deletes the inode, so the worst that can +- * happen is that the times are slightly out of date +- * and/or different from the directory change time. 
+- */ +- dir->i_mtime = dir->i_ctime = CURRENT_TIME; +- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; +- ext3_mark_inode_dirty(handle, dir); +- dir->i_version = ++event; +- BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); +- ext3_journal_dirty_metadata(handle, bh); +- brelse(bh); +- return 0; ++ dx_insert_block (frames + 0, hash2, newblock); ++ dxtrace(dx_show_index ("node", frames[1].entries)); ++ dxtrace(dx_show_index ("node", ++ ((struct dx_node *) bh2->b_data)->entries)); ++ err = ext3_journal_dirty_metadata(handle, bh2); ++ if (err) ++ goto journal_error; ++ brelse (bh2); ++ } else { ++ dxtrace(printk("Creating second level index...\n")); ++ memcpy((char *) entries2, (char *) entries, ++ icount * sizeof(struct dx_entry)); ++ dx_set_limit(entries2, dx_node_limit(dir)); ++ ++ /* Set up root */ ++ dx_set_count(entries, 1); ++ dx_set_block(entries + 0, newblock); ++ ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; ++ ++ /* Add new access path frame */ ++ frame = frames + 1; ++ frame->at = at = at - entries + entries2; ++ frame->entries = entries = entries2; ++ frame->bh = bh2; ++ err = ext3_journal_get_write_access(handle, ++ frame->bh); ++ if (err) ++ goto journal_error; + } +- offset += le16_to_cpu(de->rec_len); +- de = (struct ext3_dir_entry_2 *) +- ((char *) de + le16_to_cpu(de->rec_len)); ++ ext3_journal_dirty_metadata(handle, frames[0].bh); + } +- brelse (bh); +- return -ENOSPC; ++ de = do_split(handle, dir, &bh, frame, &hinfo, &err); ++ if (!de) ++ goto cleanup; ++ err = add_dirent_to_buf(handle, dentry, inode, de, bh); ++ bh = 0; ++ goto cleanup; ++ ++journal_error: ++ ext3_std_error(dir->i_sb, err); ++cleanup: ++ if (bh) ++ brelse(bh); ++ dx_release(frames); ++ return err; + } ++#endif + + /* + * ext3_delete_entry deletes a directory entry by merging it with the +@@ -451,9 +1543,11 @@ + struct inode * inode; + int err; + +- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3); +- if (IS_ERR(handle)) ++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + ++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3); ++ if (IS_ERR(handle)) { + return PTR_ERR(handle); ++ } + + if (IS_SYNC(dir)) + handle->h_sync = 1; +@@ -478,9 +1572,11 @@ + struct inode *inode; + int err; + +- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3); +- if (IS_ERR(handle)) ++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + ++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3); ++ if (IS_ERR(handle)) { + return PTR_ERR(handle); ++ } + + if (IS_SYNC(dir)) + handle->h_sync = 1; +@@ -507,9 +1603,11 @@ + if (dir->i_nlink >= EXT3_LINK_MAX) + return -EMLINK; + +- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3); +- if (IS_ERR(handle)) ++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + ++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3); ++ if (IS_ERR(handle)) { + return PTR_ERR(handle); ++ } + + if (IS_SYNC(dir)) + handle->h_sync = 1; +@@ -521,7 +1619,7 @@ + + inode->i_op = &ext3_dir_inode_operations; + inode->i_fop = &ext3_dir_operations; +- inode->i_size = inode->u.ext3_i.i_disksize = inode->i_sb->s_blocksize; ++ inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize; + inode->i_blocks = 0; + dir_block = ext3_bread (handle, inode, 0, 1, &err); + if (!dir_block) { +@@ -554,21 +1652,19 @@ + inode->i_mode |= S_ISGID; + ext3_mark_inode_dirty(handle, inode); + err = ext3_add_entry (handle, dentry, inode); +- if (err) +- goto out_no_entry; ++ if (err) { ++ inode->i_nlink = 0; ++ ext3_mark_inode_dirty(handle, inode); ++ iput (inode); ++ goto out_stop; ++ } + dir->i_nlink++; +- 
dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; ++ ext3_update_dx_flag(dir); + ext3_mark_inode_dirty(handle, dir); + d_instantiate(dentry, inode); + out_stop: + ext3_journal_stop(handle, dir); + return err; +- +-out_no_entry: +- inode->i_nlink = 0; +- ext3_mark_inode_dirty(handle, inode); +- iput (inode); +- goto out_stop; + } + + /* +@@ -655,7 +1751,7 @@ + int err = 0, rc; + + lock_super(sb); +- if (!list_empty(&inode->u.ext3_i.i_orphan)) ++ if (!list_empty(&EXT3_I(inode)->i_orphan)) + goto out_unlock; + + /* Orphan handling is only valid for files with data blocks +@@ -696,7 +1792,7 @@ + * This is safe: on error we're going to ignore the orphan list + * anyway on the next recovery. */ + if (!err) +- list_add(&inode->u.ext3_i.i_orphan, &EXT3_SB(sb)->s_orphan); ++ list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan); + + jbd_debug(4, "superblock will point to %ld\n", inode->i_ino); + jbd_debug(4, "orphan inode %ld will point to %d\n", +@@ -714,25 +1810,26 @@ + int ext3_orphan_del(handle_t *handle, struct inode *inode) + { + struct list_head *prev; ++ struct ext3_inode_info *ei = EXT3_I(inode); + struct ext3_sb_info *sbi; + ino_t ino_next; + struct ext3_iloc iloc; + int err = 0; + + lock_super(inode->i_sb); +- if (list_empty(&inode->u.ext3_i.i_orphan)) { ++ if (list_empty(&ei->i_orphan)) { + unlock_super(inode->i_sb); + return 0; + } + + ino_next = NEXT_ORPHAN(inode); +- prev = inode->u.ext3_i.i_orphan.prev; ++ prev = ei->i_orphan.prev; + sbi = EXT3_SB(inode->i_sb); + + jbd_debug(4, "remove inode %ld from orphan list\n", inode->i_ino); + +- list_del(&inode->u.ext3_i.i_orphan); +- INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan); ++ list_del(&ei->i_orphan); ++ INIT_LIST_HEAD(&ei->i_orphan); + + /* If we're on an error path, we may not have a valid + * transaction handle with which to update the orphan list on +@@ -793,8 +1890,9 @@ + handle_t *handle; + + handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS); +- if (IS_ERR(handle)) ++ if (IS_ERR(handle)) { + return PTR_ERR(handle); ++ } + + retval = -ENOENT; + bh = ext3_find_entry (dentry, &de); +@@ -832,7 +1930,7 @@ + ext3_mark_inode_dirty(handle, inode); + dir->i_nlink--; + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; +- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; ++ ext3_update_dx_flag(dir); + ext3_mark_inode_dirty(handle, dir); + + end_rmdir: +@@ -850,8 +1948,9 @@ + handle_t *handle; + + handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS); +- if (IS_ERR(handle)) ++ if (IS_ERR(handle)) { + return PTR_ERR(handle); ++ } + + if (IS_SYNC(dir)) + handle->h_sync = 1; +@@ -878,7 +1977,7 @@ + if (retval) + goto end_unlink; + dir->i_ctime = dir->i_mtime = CURRENT_TIME; +- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; ++ ext3_update_dx_flag(dir); + ext3_mark_inode_dirty(handle, dir); + inode->i_nlink--; + if (!inode->i_nlink) +@@ -904,9 +2003,11 @@ + if (l > dir->i_sb->s_blocksize) + return -ENAMETOOLONG; + +- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 5); +- if (IS_ERR(handle)) ++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + ++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5); ++ if (IS_ERR(handle)) { + return PTR_ERR(handle); ++ } + + if (IS_SYNC(dir)) + handle->h_sync = 1; +@@ -916,7 +2017,7 @@ + if (IS_ERR(inode)) + goto out_stop; + +- if (l > sizeof (inode->u.ext3_i.i_data)) { ++ if (l > sizeof (EXT3_I(inode)->i_data)) { + inode->i_op = &page_symlink_inode_operations; + inode->i_mapping->a_ops = &ext3_aops; + /* +@@ -925,8 +2026,12 @@ + * i_size in generic_commit_write(). 
+ */ + err = block_symlink(inode, symname, l); +- if (err) +- goto out_no_entry; ++ if (err) { ++ ext3_dec_count(handle, inode); ++ ext3_mark_inode_dirty(handle, inode); ++ iput (inode); ++ goto out_stop; ++ } + } else { + inode->i_op = &ext3_fast_symlink_inode_operations; + memcpy((char*)&inode->u.ext3_i.i_data,symname,l); +@@ -938,12 +2043,6 @@ + out_stop: + ext3_journal_stop(handle, dir); + return err; +- +-out_no_entry: +- ext3_dec_count(handle, inode); +- ext3_mark_inode_dirty(handle, inode); +- iput (inode); +- goto out_stop; + } + + static int ext3_link (struct dentry * old_dentry, +@@ -956,12 +2055,15 @@ + if (S_ISDIR(inode->i_mode)) + return -EPERM; + +- if (inode->i_nlink >= EXT3_LINK_MAX) ++ if (inode->i_nlink >= EXT3_LINK_MAX) { + return -EMLINK; ++ } + +- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS); +- if (IS_ERR(handle)) ++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + ++ EXT3_INDEX_EXTRA_TRANS_BLOCKS); ++ if (IS_ERR(handle)) { + return PTR_ERR(handle); ++ } + + if (IS_SYNC(dir)) + handle->h_sync = 1; +@@ -995,9 +2097,11 @@ + + old_bh = new_bh = dir_bh = NULL; + +- handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS + 2); +- if (IS_ERR(handle)) ++ handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS + ++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2); ++ if (IS_ERR(handle)) { + return PTR_ERR(handle); ++ } + + if (IS_SYNC(old_dir) || IS_SYNC(new_dir)) + handle->h_sync = 1; +@@ -1070,14 +2174,33 @@ + /* + * ok, that's it + */ +- ext3_delete_entry(handle, old_dir, old_de, old_bh); ++ retval = ext3_delete_entry(handle, old_dir, old_de, old_bh); ++ if (retval == -ENOENT) { ++ /* ++ * old_de could have moved out from under us. ++ */ ++ struct buffer_head *old_bh2; ++ struct ext3_dir_entry_2 *old_de2; ++ ++ old_bh2 = ext3_find_entry(old_dentry, &old_de2); ++ if (old_bh2) { ++ retval = ext3_delete_entry(handle, old_dir, ++ old_de2, old_bh2); ++ brelse(old_bh2); ++ } ++ } ++ if (retval) { ++ ext3_warning(old_dir->i_sb, "ext3_rename", ++ "Deleting old file (%lu), %d, error=%d", ++ old_dir->i_ino, old_dir->i_nlink, retval); ++ } + + if (new_inode) { + new_inode->i_nlink--; + new_inode->i_ctime = CURRENT_TIME; + } + old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; +- old_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; ++ ext3_update_dx_flag(old_dir); + if (dir_bh) { + BUFFER_TRACE(dir_bh, "get_write_access"); + ext3_journal_get_write_access(handle, dir_bh); +@@ -1089,7 +2212,7 @@ + new_inode->i_nlink--; + } else { + new_dir->i_nlink++; +- new_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; ++ ext3_update_dx_flag(new_dir); + ext3_mark_inode_dirty(handle, new_dir); + } + } +Index: linux.mcp2/fs/ext3/super.c +=================================================================== +--- linux.mcp2.orig/fs/ext3/super.c 2004-05-17 15:03:55.000000000 -0700 ++++ linux.mcp2/fs/ext3/super.c 2004-05-17 15:08:50.000000000 -0700 +@@ -702,6 +702,7 @@ + es->s_mtime = cpu_to_le32(CURRENT_TIME); + ext3_update_dynamic_rev(sb); + EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); ++ + ext3_commit_super (sb, es, 1); + if (test_opt (sb, DEBUG)) + printk (KERN_INFO +@@ -712,6 +713,7 @@ + EXT3_BLOCKS_PER_GROUP(sb), + EXT3_INODES_PER_GROUP(sb), + sbi->s_mount_opt); ++ + printk(KERN_INFO "EXT3 FS " EXT3FS_VERSION ", " EXT3FS_DATE " on %s, ", + bdevname(sb->s_dev)); + if (EXT3_SB(sb)->s_journal->j_inode == NULL) { +@@ -886,6 +888,7 @@ + return res; + } + ++ + struct super_block * ext3_read_super (struct super_block * sb, void * data, + int silent) + { +@@ -1062,6 +1065,9 @@ + 
sbi->s_mount_state = le16_to_cpu(es->s_state); + sbi->s_addr_per_block_bits = log2(EXT3_ADDR_PER_BLOCK(sb)); + sbi->s_desc_per_block_bits = log2(EXT3_DESC_PER_BLOCK(sb)); ++ for (i=0; i < 4; i++) ++ sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); ++ sbi->s_def_hash_version = es->s_def_hash_version; + + if (sbi->s_blocks_per_group > blocksize * 8) { + printk (KERN_ERR +@@ -1744,7 +1750,7 @@ + unregister_filesystem(&ext3_fs_type); + } + +-EXPORT_NO_SYMBOLS; ++EXPORT_SYMBOL(ext3_force_commit); + + MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); + MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions"); +Index: linux.mcp2/include/linux/ext3_fs.h +=================================================================== +--- linux.mcp2.orig/include/linux/ext3_fs.h 2004-05-17 14:53:17.000000000 -0700 ++++ linux.mcp2/include/linux/ext3_fs.h 2004-05-17 15:07:07.000000000 -0700 +@@ -40,6 +40,11 @@ + #define EXT3FS_VERSION "2.4-0.9.17" + + /* ++ * Always enable hashed directories ++ */ ++#define CONFIG_EXT3_INDEX ++ ++/* + * Debug code + */ + #ifdef EXT3FS_DEBUG +@@ -437,8 +442,11 @@ + /*E0*/ __u32 s_journal_inum; /* inode number of journal file */ + __u32 s_journal_dev; /* device number of journal file */ + __u32 s_last_orphan; /* start of list of inodes to delete */ +- +-/*EC*/ __u32 s_reserved[197]; /* Padding to the end of the block */ ++ __u32 s_hash_seed[4]; /* HTREE hash seed */ ++ __u8 s_def_hash_version; /* Default hash version to use */ ++ __u8 s_reserved_char_pad; ++ __u16 s_reserved_word_pad; ++ __u32 s_reserved[192]; /* Padding to the end of the block */ + }; + + #ifdef __KERNEL__ +@@ -575,9 +583,46 @@ + #define EXT3_DIR_ROUND (EXT3_DIR_PAD - 1) + #define EXT3_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT3_DIR_ROUND) & \ + ~EXT3_DIR_ROUND) ++/* ++ * Hash Tree Directory indexing ++ * (c) Daniel Phillips, 2001 ++ */ ++ ++#ifdef CONFIG_EXT3_INDEX ++ #define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \ ++ EXT3_FEATURE_COMPAT_DIR_INDEX) && \ ++ (EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) ++#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX) ++#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) ++#else ++ #define is_dx(dir) 0 ++#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX) ++#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2) ++#endif ++ ++/* Legal values for the dx_root hash_version field: */ ++ ++#define DX_HASH_LEGACY 0 ++#define DX_HASH_HALF_MD4 1 ++#define DX_HASH_TEA 2 ++ ++/* hash info structure used by the directory hash */ ++struct dx_hash_info ++{ ++ u32 hash; ++ u32 minor_hash; ++ int hash_version; ++ u32 *seed; ++}; + + #ifdef __KERNEL__ + /* ++ * Control parameters used by ext3_htree_next_block ++ */ ++#define HASH_NB_ALWAYS 1 ++ ++ ++/* + * Describe an inode's exact location on disk and in memory + */ + struct ext3_iloc +@@ -587,6 +632,27 @@ + unsigned long block_group; + }; + ++ ++/* ++ * This structure is stuffed into the struct file's private_data field ++ * for directories. It is where we put information so that we can do ++ * readdir operations in hash tree order. ++ */ ++struct dir_private_info { ++ rb_root_t root; ++ rb_node_t *curr_node; ++ struct fname *extra_fname; ++ loff_t last_pos; ++ __u32 curr_hash; ++ __u32 curr_minor_hash; ++ __u32 next_hash; ++}; ++ ++/* ++ * Special error return code only used by dx_probe() and its callers. 
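++ * The value lies far outside the valid errno range, so it cannot
++ * collide with a real error code travelling through the same
++ * *err slot as, say, the result of ext3_bread().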
++ */ ++#define ERR_BAD_DX_DIR -75000 ++ + /* + * Function prototypes + */ +@@ -614,11 +680,20 @@ + + /* dir.c */ + extern int ext3_check_dir_entry(const char *, struct inode *, +- struct ext3_dir_entry_2 *, struct buffer_head *, +- unsigned long); ++ struct ext3_dir_entry_2 *, ++ struct buffer_head *, unsigned long); ++extern int ext3_htree_store_dirent(struct file *dir_file, __u32 hash, ++ __u32 minor_hash, ++ struct ext3_dir_entry_2 *dirent); ++extern void ext3_htree_free_dir_info(struct dir_private_info *p); ++ + /* fsync.c */ + extern int ext3_sync_file (struct file *, struct dentry *, int); + ++/* hash.c */ ++extern int ext3fs_dirhash(const char *name, int len, struct ++ dx_hash_info *hinfo); ++ + /* ialloc.c */ + extern struct inode * ext3_new_inode (handle_t *, const struct inode *, int); + extern void ext3_free_inode (handle_t *, struct inode *); +@@ -650,6 +725,8 @@ + /* namei.c */ + extern int ext3_orphan_add(handle_t *, struct inode *); + extern int ext3_orphan_del(handle_t *, struct inode *); ++extern int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash, ++ __u32 start_minor_hash, __u32 *next_hash); + + /* super.c */ + extern void ext3_error (struct super_block *, const char *, const char *, ...) +Index: linux.mcp2/include/linux/ext3_fs_sb.h +=================================================================== +--- linux.mcp2.orig/include/linux/ext3_fs_sb.h 2004-05-17 14:41:25.000000000 -0700 ++++ linux.mcp2/include/linux/ext3_fs_sb.h 2004-05-17 15:07:07.000000000 -0700 +@@ -62,6 +62,8 @@ + int s_inode_size; + int s_first_ino; + u32 s_next_generation; ++ u32 s_hash_seed[4]; ++ int s_def_hash_version; + + /* Journaling */ + struct inode * s_journal_inode; +Index: linux.mcp2/include/linux/ext3_jbd.h +=================================================================== +--- linux.mcp2.orig/include/linux/ext3_jbd.h 2004-05-17 14:53:17.000000000 -0700 ++++ linux.mcp2/include/linux/ext3_jbd.h 2004-05-17 15:07:07.000000000 -0700 +@@ -63,6 +63,8 @@ + + #define EXT3_RESERVE_TRANS_BLOCKS 12 + ++#define EXT3_INDEX_EXTRA_TRANS_BLOCKS 8 ++ + int + ext3_mark_iloc_dirty(handle_t *handle, + struct inode *inode, +Index: linux.mcp2/include/linux/rbtree.h +=================================================================== +--- linux.mcp2.orig/include/linux/rbtree.h 2004-05-17 14:41:25.000000000 -0700 ++++ linux.mcp2/include/linux/rbtree.h 2004-05-17 15:07:07.000000000 -0700 +@@ -120,6 +120,8 @@ + + extern void rb_insert_color(rb_node_t *, rb_root_t *); + extern void rb_erase(rb_node_t *, rb_root_t *); ++extern rb_node_t *rb_get_first(rb_root_t *root); ++extern rb_node_t *rb_get_next(rb_node_t *n); + + static inline void rb_link_node(rb_node_t * node, rb_node_t * parent, rb_node_t ** rb_link) + { +Index: linux.mcp2/lib/rbtree.c +=================================================================== +--- linux.mcp2.orig/lib/rbtree.c 2004-01-19 07:49:44.000000000 -0800 ++++ linux.mcp2/lib/rbtree.c 2004-05-17 15:10:39.000000000 -0700 +@@ -17,6 +17,8 @@ + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + linux/lib/rbtree.c ++ ++ rb_get_first and rb_get_next written by Theodore Ts'o, 9/8/2002 + */ + + #include +@@ -294,3 +296,42 @@ + __rb_erase_color(child, parent, root); + } + EXPORT_SYMBOL(rb_erase); ++ ++/* ++ * This function returns the first node (in sort order) of the tree. 
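++ *
++ * Together with rb_get_next() this yields an in-order walk, e.g.
++ * (a sketch; process() stands in for the per-node work):
++ *
++ *	rb_node_t *n;
++ *	for (n = rb_get_first(root); n != NULL; n = rb_get_next(n))
++ *		process(n);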
++ */ ++rb_node_t *rb_get_first(rb_root_t *root) ++{ ++ rb_node_t *n; ++ ++ n = root->rb_node; ++ if (!n) ++ return 0; ++ while (n->rb_left) ++ n = n->rb_left; ++ return n; ++} ++EXPORT_SYMBOL(rb_get_first); ++ ++/* ++ * Given a node, this function will return the next node in the tree. ++ */ ++rb_node_t *rb_get_next(rb_node_t *n) ++{ ++ rb_node_t *parent; ++ ++ if (n->rb_right) { ++ n = n->rb_right; ++ while (n->rb_left) ++ n = n->rb_left; ++ return n; ++ } else { ++ while ((parent = n->rb_parent)) { ++ if (n == parent->rb_left) ++ return parent; ++ n = parent; ++ } ++ return 0; ++ } ++} ++EXPORT_SYMBOL(rb_get_next); diff --git a/lustre/kernel_patches/patches/ext3-no-write-super.patch b/lustre/kernel_patches/patches/ext3-no-write-super.patch new file mode 100644 index 0000000..d2dcdae --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-no-write-super.patch @@ -0,0 +1,22 @@ + 0 files changed + +--- linux-2.4.20/fs/ext3/super.c~ext3-no-write-super 2003-08-11 13:20:17.000000000 +0400 ++++ linux-2.4.20-alexey/fs/ext3/super.c 2003-08-11 13:31:35.000000000 +0400 +@@ -1849,7 +1849,6 @@ void ext3_write_super (struct super_bloc + if (down_trylock(&sb->s_lock) == 0) + BUG(); /* aviro detector */ + sb->s_dirt = 0; +- target = log_start_commit(EXT3_SB(sb)->s_journal, NULL); + + /* + * Tricky --- if we are unmounting, the write really does need +@@ -1857,6 +1856,7 @@ void ext3_write_super (struct super_bloc + * sb->s_root. + */ + if (do_sync_supers || !sb->s_root) { ++ target = log_start_commit(EXT3_SB(sb)->s_journal, NULL); + unlock_super(sb); + log_wait_commit(EXT3_SB(sb)->s_journal, target); + lock_super(sb); + +_ diff --git a/lustre/kernel_patches/patches/ext3-unmount_sync.patch b/lustre/kernel_patches/patches/ext3-unmount_sync.patch new file mode 100644 index 0000000..c57903c --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-unmount_sync.patch @@ -0,0 +1,21 @@ + fs/ext3/super.c | 7 ++++++- + 1 files changed, 6 insertions(+), 1 deletion(-) + +--- linux-2.4.20/fs/ext3/super.c~ext3-unmount_sync 2003-04-08 23:35:44.000000000 -0600 ++++ linux-2.4.20-braam/fs/ext3/super.c 2003-04-08 23:35:44.000000000 -0600 +@@ -1612,7 +1612,12 @@ void ext3_write_super (struct super_bloc + sb->s_dirt = 0; + target = log_start_commit(EXT3_SB(sb)->s_journal, NULL); + +- if (do_sync_supers) { ++ /* ++ * Tricky --- if we are unmounting, the write really does need ++ * to be synchronous. We can detect that by looking for NULL in ++ * sb->s_root. 
++ */ ++ if (do_sync_supers || !sb->s_root) { + unlock_super(sb); + log_wait_commit(EXT3_SB(sb)->s_journal, target); + lock_super(sb); + +_ diff --git a/lustre/kernel_patches/patches/ext3-use-after-free-2.4.19-pre1.patch b/lustre/kernel_patches/patches/ext3-use-after-free-2.4.19-pre1.patch new file mode 100644 index 0000000..595db54 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-use-after-free-2.4.19-pre1.patch @@ -0,0 +1,53 @@ + ./fs/ext3/namei.c | 11 +++++------ + 1 files changed, 5 insertions(+), 6 deletions(-) + +Index: linux-2.4.19-pre1/./fs/ext3/namei.c +=================================================================== +--- linux-2.4.19-pre1.orig/./fs/ext3/namei.c 2003-11-21 01:52:06.000000000 +0300 ++++ linux-2.4.19-pre1/./fs/ext3/namei.c 2003-11-21 01:58:15.000000000 +0300 +@@ -1522,8 +1522,11 @@ + { + int err = ext3_add_entry(handle, dentry, inode); + if (!err) { +- d_instantiate(dentry, inode); +- return 0; ++ err = ext3_mark_inode_dirty(handle, inode); ++ if (err == 0) { ++ d_instantiate(dentry, inode); ++ return 0; ++ } + } + ext3_dec_count(handle, inode); + iput(inode); +@@ -1559,7 +1562,6 @@ + inode->i_op = &ext3_file_inode_operations; + inode->i_fop = &ext3_file_operations; + inode->i_mapping->a_ops = &ext3_aops; +- ext3_mark_inode_dirty(handle, inode); + err = ext3_add_nondir(handle, dentry, inode); + } + ext3_journal_stop(handle, dir); +@@ -1586,7 +1588,6 @@ + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + init_special_inode(inode, mode, rdev); +- ext3_mark_inode_dirty(handle, inode); + err = ext3_add_nondir(handle, dentry, inode); + } + ext3_journal_stop(handle, dir); +@@ -2035,7 +2036,6 @@ + inode->i_size = l-1; + } + inode->u.ext3_i.i_disksize = inode->i_size; +- ext3_mark_inode_dirty(handle, inode); + err = ext3_add_nondir(handle, dentry, inode); + out_stop: + ext3_journal_stop(handle, dir); +@@ -2069,7 +2069,6 @@ + ext3_inc_count(handle, inode); + atomic_inc(&inode->i_count); + +- ext3_mark_inode_dirty(handle, inode); + err = ext3_add_nondir(handle, dentry, inode); + ext3_journal_stop(handle, dir); + return err; diff --git a/lustre/kernel_patches/patches/invalidate_show-2.4.19-bgl.patch b/lustre/kernel_patches/patches/invalidate_show-2.4.19-bgl.patch new file mode 100644 index 0000000..85bdf9e --- /dev/null +++ b/lustre/kernel_patches/patches/invalidate_show-2.4.19-bgl.patch @@ -0,0 +1,121 @@ + + + + fs/inode.c | 21 ++++++++++++++------- + fs/smbfs/inode.c | 2 +- + fs/super.c | 4 ++-- + include/linux/fs.h | 2 +- + 4 files changed, 18 insertions(+), 11 deletions(-) + +Index: linux.mcp2/fs/inode.c +=================================================================== +--- linux.mcp2.orig/fs/inode.c 2004-01-19 07:49:43.000000000 -0800 ++++ linux.mcp2/fs/inode.c 2004-05-05 14:31:31.000000000 -0700 +@@ -553,7 +553,8 @@ + /* + * Invalidate all inodes for a device. 
+ */ +-static int invalidate_list(struct list_head *head, struct super_block * sb, struct list_head * dispose) ++static int invalidate_list(struct list_head *head, struct super_block * sb, ++ struct list_head * dispose, int show) + { + struct list_head *next; + int busy = 0, count = 0; +@@ -578,6 +579,11 @@ + count++; + continue; + } ++ if (show) ++ printk(KERN_ERR ++ "inode busy: dev %s:%lu (%p) mode %o count %u\n", ++ kdevname(sb->s_dev), inode->i_ino, inode, ++ inode->i_mode, atomic_read(&inode->i_count)); + busy = 1; + } + /* only unused inodes may be cached with i_count zero */ +@@ -596,22 +602,23 @@ + /** + * invalidate_inodes - discard the inodes on a device + * @sb: superblock ++ * @show: whether we should display any busy inodes found + * + * Discard all of the inodes for a given superblock. If the discard + * fails because there are busy inodes then a non zero value is returned. + * If the discard is successful all the inodes have been discarded. + */ + +-int invalidate_inodes(struct super_block * sb) ++int invalidate_inodes(struct super_block * sb, int show) + { + int busy; + LIST_HEAD(throw_away); + + spin_lock(&inode_lock); +- busy = invalidate_list(&inode_in_use, sb, &throw_away); +- busy |= invalidate_list(&inode_unused, sb, &throw_away); +- busy |= invalidate_list(&sb->s_dirty, sb, &throw_away); +- busy |= invalidate_list(&sb->s_locked_inodes, sb, &throw_away); ++ busy = invalidate_list(&inode_in_use, sb, &throw_away, show); ++ busy |= invalidate_list(&inode_unused, sb, &throw_away, show); ++ busy |= invalidate_list(&sb->s_dirty, sb, &throw_away, show); ++ busy |= invalidate_list(&sb->s_locked_inodes, sb, &throw_away, show); + spin_unlock(&inode_lock); + + dispose_list(&throw_away); +@@ -637,7 +644,7 @@ + * hold). + */ + shrink_dcache_sb(sb); +- res = invalidate_inodes(sb); ++ res = invalidate_inodes(sb, 0); + drop_super(sb); + } + invalidate_buffers(dev); +Index: linux.mcp2/fs/super.c +=================================================================== +--- linux.mcp2.orig/fs/super.c 2004-01-19 07:49:43.000000000 -0800 ++++ linux.mcp2/fs/super.c 2004-05-05 14:32:06.000000000 -0700 +@@ -838,7 +838,7 @@ + lock_super(sb); + lock_kernel(); + sb->s_flags &= ~MS_ACTIVE; +- invalidate_inodes(sb); /* bad name - it should be evict_inodes() */ ++ invalidate_inodes(sb, 0); /* bad name - it should be evict_inodes() */ + if (sop) { + if (sop->write_super && sb->s_dirt) + sop->write_super(sb); +@@ -847,7 +847,7 @@ + } + + /* Forget any remaining inodes */ +- if (invalidate_inodes(sb)) { ++ if (invalidate_inodes(sb, 1)) { + printk(KERN_ERR "VFS: Busy inodes after unmount. " + "Self-destruct in 5 seconds. 
Have a nice day...\n");
+ 	}
+Index: linux.mcp2/fs/smbfs/inode.c
+===================================================================
+--- linux.mcp2.orig/fs/smbfs/inode.c	2004-01-19 07:49:43.000000000 -0800
++++ linux.mcp2/fs/smbfs/inode.c	2004-05-05 14:31:31.000000000 -0700
+@@ -166,7 +166,7 @@
+ {
+ 	VERBOSE("\n");
+ 	shrink_dcache_sb(SB_of(server));
+-	invalidate_inodes(SB_of(server));
++	invalidate_inodes(SB_of(server), 0);
+ }
+ 
+ /*
+Index: linux.mcp2/include/linux/fs.h
+===================================================================
+--- linux.mcp2.orig/include/linux/fs.h	2004-05-05 14:31:06.000000000 -0700
++++ linux.mcp2/include/linux/fs.h	2004-05-05 14:31:31.000000000 -0700
+@@ -1283,7 +1283,7 @@
+ extern void set_buffer_flushtime(struct buffer_head *);
+ extern void balance_dirty(void);
+ extern int check_disk_change(kdev_t);
+-extern int invalidate_inodes(struct super_block *);
++extern int invalidate_inodes(struct super_block *, int);
+ extern int invalidate_device(kdev_t, int);
+ extern void invalidate_inode_pages(struct inode *);
+ extern void invalidate_inode_pages2(struct address_space *);
diff --git a/lustre/kernel_patches/patches/iod-stock-24-exports-2.4.19-bgl.patch b/lustre/kernel_patches/patches/iod-stock-24-exports-2.4.19-bgl.patch
new file mode 100644
index 0000000..2466af6
--- /dev/null
+++ b/lustre/kernel_patches/patches/iod-stock-24-exports-2.4.19-bgl.patch
@@ -0,0 +1,52 @@
+ fs/Makefile | 2 +-
+ fs/inode.c | 4 +++-
+ mm/page_alloc.c | 1 +
+ 3 files changed, 5 insertions(+), 2 deletions(-)
+
+Index: linux-ion/fs/inode.c
+===================================================================
+--- linux-ion.orig/fs/inode.c	2004-09-27 14:58:03.000000000 -0700
++++ linux-ion/fs/inode.c	2004-09-27 14:58:34.000000000 -0700
+@@ -5,6 +5,7 @@
+  */
+ 
+ #include <linux/config.h>
++#include <linux/module.h>
+ #include <linux/fs.h>
+ #include <linux/string.h>
+ #include <linux/mm.h>
+@@ -66,7 +67,8 @@
+  * NOTE! You also have to own the lock if you change
+  * the i_state of an inode while it is in use..
+  */
+-static spinlock_t inode_lock = SPIN_LOCK_UNLOCKED;
++spinlock_t inode_lock = SPIN_LOCK_UNLOCKED;
++EXPORT_SYMBOL(inode_lock);
+ 
+ /*
+  * Statistics gathering.. 
+Index: linux-ion/fs/Makefile +=================================================================== +--- linux-ion.orig/fs/Makefile 2004-07-28 14:34:57.000000000 -0700 ++++ linux-ion/fs/Makefile 2004-09-27 14:59:37.000000000 -0700 +@@ -7,7 +7,7 @@ + + O_TARGET := fs.o + +-export-objs := filesystems.o open.o dcache.o buffer.o ++export-objs := filesystems.o open.o dcache.o buffer.o inode.o + mod-subdirs := nls + + obj-y := open.o read_write.o devices.o file_table.o buffer.o \ +Index: linux-ion/mm/page_alloc.c +=================================================================== +--- linux-ion.orig/mm/page_alloc.c 2004-07-28 14:34:57.000000000 -0700 ++++ linux-ion/mm/page_alloc.c 2004-09-27 14:58:34.000000000 -0700 +@@ -28,6 +28,7 @@ + LIST_HEAD(inactive_list); + LIST_HEAD(active_list); + pg_data_t *pgdat_list; ++EXPORT_SYMBOL(pgdat_list); + + /* Used to look up the address of the struct zone encoded in page->zone */ + zone_t *zone_table[MAX_NR_ZONES*MAX_NR_NODES]; diff --git a/lustre/kernel_patches/patches/iopen-2.4.19-bgl.patch b/lustre/kernel_patches/patches/iopen-2.4.19-bgl.patch new file mode 100644 index 0000000..511cf37 --- /dev/null +++ b/lustre/kernel_patches/patches/iopen-2.4.19-bgl.patch @@ -0,0 +1,494 @@ + Documentation/filesystems/ext2.txt | 16 ++ + fs/ext3/Makefile | 2 + fs/ext3/inode.c | 4 + fs/ext3/iopen.c | 259 +++++++++++++++++++++++++++++++++++++ + fs/ext3/iopen.h | 13 + + fs/ext3/namei.c | 13 + + fs/ext3/super.c | 11 + + include/linux/ext3_fs.h | 2 + 8 files changed, 318 insertions(+), 2 deletions(-) + +Index: linux-2.4.19/Documentation/filesystems/ext2.txt +=================================================================== +--- linux-2.4.19.orig/Documentation/filesystems/ext2.txt 2001-07-11 18:44:45.000000000 -0400 ++++ linux-2.4.19/Documentation/filesystems/ext2.txt 2004-04-23 22:37:48.000000000 -0400 +@@ -35,6 +35,22 @@ + + sb=n Use alternate superblock at this location. + ++iopen Makes an invisible pseudo-directory called ++ __iopen__ available in the root directory ++ of the filesystem. Allows open-by-inode- ++ number. i.e., inode 3145 can be accessed ++ via /mntpt/__iopen__/3145 ++ ++iopen_nopriv This option makes the iopen directory be ++ world-readable. This may be safer since it ++ allows daemons to run as an unprivileged user, ++ however it significantly changes the security ++ model of a Unix filesystem, since previously ++ all files under a mode 700 directory were not ++ generally avilable even if the ++ permissions on the file itself is ++ world-readable. ++ + grpquota,noquota,quota,usrquota Quota options are silently ignored by ext2. 
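++
++Example, for the iopen option above: after "mount -o iopen
++/dev/hda1 /mnt", the command "stat /mnt/__iopen__/3145" acts on
++inode 3145 directly, with no pathname lookup involved.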
+ + +Index: linux.mcp2/fs/ext3/Makefile +=================================================================== +--- linux.mcp2.orig/fs/ext3/Makefile 2004-05-17 15:20:52.000000000 -0700 ++++ linux.mcp2/fs/ext3/Makefile 2004-05-17 15:21:55.000000000 -0700 +@@ -11,7 +11,7 @@ + + export-objs := ext3-exports.o + +-obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ ++obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ + ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o + obj-m := $(O_TARGET) + +Index: linux.mcp2/fs/ext3/inode.c +=================================================================== +--- linux.mcp2.orig/fs/ext3/inode.c 2004-05-17 15:20:59.000000000 -0700 ++++ linux.mcp2/fs/ext3/inode.c 2004-05-17 15:21:55.000000000 -0700 +@@ -31,6 +31,7 @@ + #include + #include + #include ++#include "iopen.h" + + /* + * SEARCH_FROM_ZERO forces each block allocation to search from the start +@@ -2125,6 +2126,9 @@ + struct buffer_head *bh; + int block; + ++ if (ext3_iopen_get_inode(inode)) ++ return; ++ + if(ext3_get_inode_loc(inode, &iloc)) + goto bad_inode; + bh = iloc.bh; +Index: linux.mcp2/fs/ext3/iopen.c +=================================================================== +--- linux.mcp2.orig/fs/ext3/iopen.c 2002-04-11 07:25:15.000000000 -0700 ++++ linux.mcp2/fs/ext3/iopen.c 2004-05-17 15:21:55.000000000 -0700 +@@ -0,0 +1,282 @@ ++/* ++ * linux/fs/ext3/iopen.c ++ * ++ * Special support for open by inode number ++ * ++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu). ++ * ++ * This file may be redistributed under the terms of the GNU General ++ * Public License. ++ * ++ * ++ * Invariants: ++ * - there is only ever a single DCACHE_NFSD_DISCONNECTED dentry alias ++ * for an inode at one time. ++ * - there are never both connected and DCACHE_NFSD_DISCONNECTED dentry ++ * aliases on an inode at the same time. ++ * ++ * If we have any connected dentry aliases for an inode, use one of those ++ * in iopen_lookup(). Otherwise, we instantiate a single NFSD_DISCONNECTED ++ * dentry for this inode, which thereafter will be found by the dcache ++ * when looking up this inode number in __iopen__, so we don't return here ++ * until it is gone. ++ * ++ * If we get an inode via a regular name lookup, then we "rename" the ++ * NFSD_DISCONNECTED dentry to the proper name and parent. This ensures ++ * existing users of the disconnected dentry will continue to use the same ++ * dentry as the connected users, and there will never be both kinds of ++ * dentry aliases at one time. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "iopen.h" ++ ++#ifndef assert ++#define assert(test) J_ASSERT(test) ++#endif ++ ++#define IOPEN_NAME_LEN 32 ++ ++/* ++ * This implements looking up an inode by number. 
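++ *
++ * Name handling, as implemented below: "." resolves to the
++ * directory's own inode, ".." to EXT3_ROOT_INO, and any other
++ * name is parsed by simple_strtoul() (base 0, so decimal and
++ * 0x-prefixed forms both work) as the inode number.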
++ */ ++static struct dentry *iopen_lookup(struct inode *dir, struct dentry *dentry) ++{ ++ struct inode *inode; ++ unsigned long ino; ++ struct list_head *lp; ++ struct dentry *alternate; ++ char buf[IOPEN_NAME_LEN]; ++ ++ if (dentry->d_name.len >= IOPEN_NAME_LEN) ++ return ERR_PTR(-ENAMETOOLONG); ++ ++ memcpy(buf, dentry->d_name.name, dentry->d_name.len); ++ buf[dentry->d_name.len] = 0; ++ ++ if (strcmp(buf, ".") == 0) ++ ino = dir->i_ino; ++ else if (strcmp(buf, "..") == 0) ++ ino = EXT3_ROOT_INO; ++ else ++ ino = simple_strtoul(buf, 0, 0); ++ ++ if ((ino != EXT3_ROOT_INO && ++ //ino != EXT3_ACL_IDX_INO && ++ //ino != EXT3_ACL_DATA_INO && ++ ino < EXT3_FIRST_INO(dir->i_sb)) || ++ ino > le32_to_cpu(dir->i_sb->u.ext3_sb.s_es->s_inodes_count)) ++ return ERR_PTR(-ENOENT); ++ ++ inode = iget(dir->i_sb, ino); ++ if (!inode) ++ return ERR_PTR(-EACCES); ++ if (is_bad_inode(inode)) { ++ iput(inode); ++ return ERR_PTR(-ENOENT); ++ } ++ ++ assert(list_empty(&dentry->d_alias)); /* d_instantiate */ ++ assert(list_empty(&dentry->d_hash)); /* d_rehash */ ++ ++ /* preferrably return a connected dentry */ ++ spin_lock(&dcache_lock); ++ list_for_each(lp, &inode->i_dentry) { ++ alternate = list_entry(lp, struct dentry, d_alias); ++ assert(!(alternate->d_flags & DCACHE_NFSD_DISCONNECTED)); ++ } ++ ++ if (!list_empty(&inode->i_dentry)) { ++ alternate = list_entry(inode->i_dentry.next, ++ struct dentry, d_alias); ++ dget_locked(alternate); ++ alternate->d_vfs_flags |= DCACHE_REFERENCED; ++ iput(inode); ++ spin_unlock(&dcache_lock); ++ return alternate; ++ } ++ dentry->d_flags |= DCACHE_NFSD_DISCONNECTED; ++ ++ /* d_add(), but don't drop dcache_lock before adding dentry to inode */ ++ list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */ ++ dentry->d_inode = inode; ++ ++ __d_rehash(dentry, 0); /* d_rehash */ ++ spin_unlock(&dcache_lock); ++ ++ return NULL; ++} ++ ++#define do_switch(x,y) do { \ ++ __typeof__ (x) __tmp = x; \ ++ x = y; y = __tmp; } while (0) ++ ++static inline void switch_names(struct dentry *dentry, struct dentry *target) ++{ ++ const unsigned char *old_name, *new_name; ++ ++ memcpy(dentry->d_iname, target->d_iname, DNAME_INLINE_LEN); ++ old_name = target->d_name.name; ++ new_name = dentry->d_name.name; ++ if (old_name == target->d_iname) ++ old_name = dentry->d_iname; ++ if (new_name == dentry->d_iname) ++ new_name = target->d_iname; ++ target->d_name.name = new_name; ++ dentry->d_name.name = old_name; ++} ++ ++/* This function is spliced into ext3_lookup and does the move of a ++ * disconnected dentry (if it exists) to a connected dentry. 
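++ *
++ * Example sequence: an open through __iopen__ first instantiates
++ * a DCACHE_NFSD_DISCONNECTED alias; a later lookup by real name
++ * lands here, finds that alias, and grafts it onto the correct
++ * parent, so both users share a single dentry from then on.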
++ */ ++struct dentry *iopen_connect_dentry(struct dentry *dentry, struct inode *inode, ++ int rehash) ++{ ++ struct dentry *tmp, *goal = NULL; ++ struct list_head *lp; ++ ++ /* verify this dentry is really new */ ++ assert(dentry->d_inode == NULL); ++ assert(list_empty(&dentry->d_alias)); /* d_instantiate */ ++ if (rehash) ++ assert(list_empty(&dentry->d_hash)); /* d_rehash */ ++ assert(list_empty(&dentry->d_subdirs)); ++ ++ spin_lock(&dcache_lock); ++ if (!inode) ++ goto do_rehash; ++ ++ /* preferrably return a connected dentry */ ++ list_for_each(lp, &inode->i_dentry) { ++ tmp = list_entry(lp, struct dentry, d_alias); ++ if (tmp->d_flags & DCACHE_NFSD_DISCONNECTED) { ++ assert(tmp->d_alias.next == &inode->i_dentry); ++ assert(tmp->d_alias.prev == &inode->i_dentry); ++ goal = tmp; ++ dget_locked(goal); ++ break; ++ } ++ } ++ ++ if (!goal) ++ goto do_instantiate; ++ ++ /* Move the goal to the de hash queue - like d_move() */ ++ goal->d_flags &= ~DCACHE_NFSD_DISCONNECTED; ++ list_del_init(&goal->d_hash); ++ ++ list_del(&goal->d_child); ++ list_del(&dentry->d_child); ++ ++ /* Switch the parents and the names.. */ ++ switch_names(goal, dentry); ++ do_switch(goal->d_parent, dentry->d_parent); ++ do_switch(goal->d_name.len, dentry->d_name.len); ++ do_switch(goal->d_name.hash, dentry->d_name.hash); ++ ++ /* And add them back to the (new) parent lists */ ++ list_add(&goal->d_child, &goal->d_parent->d_subdirs); ++ list_add(&dentry->d_child, &dentry->d_parent->d_subdirs); ++ __d_rehash(goal, 0); ++ spin_unlock(&dcache_lock); ++ iput(inode); ++ ++ return goal; ++ ++ /* d_add(), but don't drop dcache_lock before adding dentry to inode */ ++do_instantiate: ++ list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */ ++ dentry->d_inode = inode; ++do_rehash: ++ if (rehash) ++ __d_rehash(dentry, 0); /* d_rehash */ ++ spin_unlock(&dcache_lock); ++ ++ return NULL; ++} ++ ++/* ++ * These are the special structures for the iopen pseudo directory. ++ */ ++ ++static struct inode_operations iopen_inode_operations = { ++ lookup: iopen_lookup, /* BKL held */ ++}; ++ ++static struct file_operations iopen_file_operations = { ++ read: generic_read_dir, ++}; ++ ++static int match_dentry(struct dentry *dentry, const char *name) ++{ ++ int len; ++ ++ len = strlen(name); ++ if (dentry->d_name.len != len) ++ return 0; ++ if (strncmp(dentry->d_name.name, name, len)) ++ return 0; ++ return 1; ++} ++ ++/* ++ * This function is spliced into ext3_lookup and returns 1 the file ++ * name is __iopen__ and dentry has been filled in appropriately. ++ */ ++int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry) ++{ ++ struct inode *inode; ++ ++ if (dir->i_ino != EXT3_ROOT_INO || ++ !test_opt(dir->i_sb, IOPEN) || ++ !match_dentry(dentry, "__iopen__")) ++ return 0; ++ ++ inode = iget(dir->i_sb, EXT3_BAD_INO); ++ ++ if (!inode) ++ return 0; ++ d_add(dentry, inode); ++ return 1; ++} ++ ++/* ++ * This function is spliced into read_inode; it returns 1 if inode ++ * number is the one for /__iopen__, in which case the inode is filled ++ * in appropriately. Otherwise, this fuction returns 0. 
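++ *
++ * (The pseudo-directory reuses EXT3_BAD_INO as its inode number,
++ * so it consumes no on-disk inode; ext3_check_for_iopen() above
++ * igets that same reserved inode.)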
++ */ ++int ext3_iopen_get_inode(struct inode *inode) ++{ ++ if (inode->i_ino != EXT3_BAD_INO) ++ return 0; ++ ++ inode->i_mode = S_IFDIR | S_IRUSR | S_IXUSR; ++ if (test_opt(inode->i_sb, IOPEN_NOPRIV)) ++ inode->i_mode |= 0777; ++ inode->i_uid = 0; ++ inode->i_gid = 0; ++ inode->i_nlink = 1; ++ inode->i_size = 4096; ++ inode->i_atime = CURRENT_TIME; ++ inode->i_ctime = CURRENT_TIME; ++ inode->i_mtime = CURRENT_TIME; ++ inode->u.ext3_i.i_dtime = 0; ++ inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size ++ * (for stat), not the fs block ++ * size */ ++ inode->i_blocks = 0; ++ inode->i_version = 1; ++ inode->i_generation = 0; ++ ++ inode->i_op = &iopen_inode_operations; ++ inode->i_fop = &iopen_file_operations; ++ inode->i_mapping->a_ops = 0; ++ ++ return 1; ++} +Index: linux.mcp2/fs/ext3/iopen.h +=================================================================== +--- linux.mcp2.orig/fs/ext3/iopen.h 2002-04-11 07:25:15.000000000 -0700 ++++ linux.mcp2/fs/ext3/iopen.h 2004-05-17 15:21:55.000000000 -0700 +@@ -0,0 +1,15 @@ ++/* ++ * iopen.h ++ * ++ * Special support for opening files by inode number. ++ * ++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu). ++ * ++ * This file may be redistributed under the terms of the GNU General ++ * Public License. ++ */ ++ ++extern int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry); ++extern int ext3_iopen_get_inode(struct inode *inode); ++extern struct dentry *iopen_connect_dentry(struct dentry *dentry, ++ struct inode *inode, int rehash); +Index: linux.mcp2/fs/ext3/namei.c +=================================================================== +--- linux.mcp2.orig/fs/ext3/namei.c 2004-05-17 15:20:59.000000000 -0700 ++++ linux.mcp2/fs/ext3/namei.c 2004-05-17 15:21:55.000000000 -0700 +@@ -35,7 +35,7 @@ + #include + #include + #include +- ++#include "iopen.h" + + /* + * define how far ahead to read directories while searching them. +@@ -931,6 +931,9 @@ + if (dentry->d_name.len > EXT3_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + ++ if (ext3_check_for_iopen(dir, dentry)) ++ return NULL; ++ + bh = ext3_find_entry(dentry, &de); + inode = NULL; + if (bh) { +@@ -942,8 +945,8 @@ + return ERR_PTR(-EACCES); + } + } +- d_add(dentry, inode); +- return NULL; ++ ++ return iopen_connect_dentry(dentry, inode, 1); + } + + #define S_SHIFT 12 +@@ -1932,10 +1935,6 @@ + inode->i_nlink); + inode->i_version = ++event; + inode->i_nlink = 0; +- /* There's no need to set i_disksize: the fact that i_nlink is +- * zero will ensure that the right thing happens during any +- * recovery. 
*/ +- inode->i_size = 0; + ext3_orphan_add(handle, inode); + ext3_mark_inode_dirty(handle, inode); + dir->i_nlink--; +@@ -2054,6 +2053,23 @@ + return err; + } + ++/* Like ext3_add_nondir() except for call to iopen_connect_dentry */ ++static int ext3_add_link(handle_t *handle, struct dentry *dentry, ++ struct inode *inode) ++{ ++ int err = ext3_add_entry(handle, dentry, inode); ++ if (!err) { ++ err = ext3_mark_inode_dirty(handle, inode); ++ if (err == 0) { ++ dput(iopen_connect_dentry(dentry, inode, 0)); ++ return 0; ++ } ++ } ++ ext3_dec_count(handle, inode); ++ iput(inode); ++ return err; ++} ++ + static int ext3_link (struct dentry * old_dentry, + struct inode * dir, struct dentry *dentry) + { +@@ -2081,7 +2097,8 @@ + ext3_inc_count(handle, inode); + atomic_inc(&inode->i_count); + +- err = ext3_add_nondir(handle, dentry, inode); ++ err = ext3_add_link(handle, dentry, inode); ++ ext3_orphan_del(handle, inode); + ext3_journal_stop(handle, dir); + return err; + } +Index: linux.mcp2/fs/ext3/super.c +=================================================================== +--- linux.mcp2.orig/fs/ext3/super.c 2004-05-17 15:20:59.000000000 -0700 ++++ linux.mcp2/fs/ext3/super.c 2004-05-17 15:21:55.000000000 -0700 +@@ -836,6 +836,18 @@ + || !strcmp (this_char, "quota") + || !strcmp (this_char, "usrquota")) + /* Don't do anything ;-) */ ; ++ else if (!strcmp (this_char, "iopen")) { ++ set_opt (sbi->s_mount_opt, IOPEN); ++ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ } ++ else if (!strcmp (this_char, "noiopen")) { ++ clear_opt (sbi->s_mount_opt, IOPEN); ++ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ } ++ else if (!strcmp (this_char, "iopen_nopriv")) { ++ set_opt (sbi->s_mount_opt, IOPEN); ++ set_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ } + else if (!strcmp (this_char, "journal")) { + /* @@@ FIXME */ + /* Eventually we will want to be able to create +Index: linux.mcp2/include/linux/ext3_fs.h +=================================================================== +--- linux.mcp2.orig/include/linux/ext3_fs.h 2004-05-17 15:20:59.000000000 -0700 ++++ linux.mcp2/include/linux/ext3_fs.h 2004-05-17 15:21:55.000000000 -0700 +@@ -323,6 +323,8 @@ + #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */ + #define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */ + #define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */ ++#define EXT3_MOUNT_IOPEN 0x40000 /* Allow access via iopen */ ++#define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef _LINUX_EXT2_FS_H diff --git a/lustre/kernel_patches/patches/jbd-2.4.19-pre1-jcberr.patch b/lustre/kernel_patches/patches/jbd-2.4.19-pre1-jcberr.patch new file mode 100644 index 0000000..bbbf613 --- /dev/null +++ b/lustre/kernel_patches/patches/jbd-2.4.19-pre1-jcberr.patch @@ -0,0 +1,274 @@ +Index: linux-2.4.19-pre1/include/linux/jbd.h +=================================================================== +--- linux-2.4.19-pre1.orig/include/linux/jbd.h 2003-11-21 03:00:11.000000000 +0300 ++++ linux-2.4.19-pre1/include/linux/jbd.h 2003-11-21 03:04:47.000000000 +0300 +@@ -275,6 +275,13 @@ + return bh->b_private; + } + ++#define HAVE_JOURNAL_CALLBACK_STATUS ++struct journal_callback { ++ struct list_head jcb_list; ++ void (*jcb_func)(struct journal_callback *jcb, int error); ++ /* user data goes here */ ++}; ++ + struct jbd_revoke_table_s; + + /* The handle_t type represents a single atomic update being performed +@@ -305,6 +312,12 @@ + operations */ + int 
h_err; + ++ /* List of application registered callbacks for this handle. ++ * The function(s) will be called after the transaction that ++ * this handle is part of has been committed to disk. ++ */ ++ struct list_head h_jcb; ++ + /* Flags */ + unsigned int h_sync: 1; /* sync-on-close */ + unsigned int h_jdata: 1; /* force data journaling */ +@@ -424,6 +437,10 @@ + + /* How many handles used this transaction? */ + int t_handle_count; ++ ++ /* List of registered callback functions for this transaction. ++ * Called when the transaction is committed. */ ++ struct list_head t_jcb; + }; + + +@@ -672,6 +689,9 @@ + extern int journal_try_to_free_buffers(journal_t *, struct page *, int); + extern int journal_stop(handle_t *); + extern int journal_flush (journal_t *); ++extern void journal_callback_set(handle_t *handle, ++ void (*fn)(struct journal_callback *,int), ++ struct journal_callback *jcb); + + extern void journal_lock_updates (journal_t *); + extern void journal_unlock_updates (journal_t *); +Index: linux-2.4.19-pre1/fs/jbd/checkpoint.c +=================================================================== +--- linux-2.4.19-pre1.orig/fs/jbd/checkpoint.c 2003-11-21 02:53:20.000000000 +0300 ++++ linux-2.4.19-pre1/fs/jbd/checkpoint.c 2003-11-21 03:04:47.000000000 +0300 +@@ -601,7 +601,8 @@ + J_ASSERT (transaction->t_log_list == NULL); + J_ASSERT (transaction->t_checkpoint_list == NULL); + J_ASSERT (transaction->t_updates == 0); +- ++ J_ASSERT (list_empty(&transaction->t_jcb)); ++ + J_ASSERT (transaction->t_journal->j_committing_transaction != + transaction); + +Index: linux-2.4.19-pre1/fs/jbd/commit.c +=================================================================== +--- linux-2.4.19-pre1.orig/fs/jbd/commit.c 2003-11-21 02:53:20.000000000 +0300 ++++ linux-2.4.19-pre1/fs/jbd/commit.c 2003-11-21 03:04:47.000000000 +0300 +@@ -480,7 +480,7 @@ + transaction's t_log_list queue, and metadata buffers are on + the t_iobuf_list queue. + +- Wait for the transactions in reverse order. That way we are ++ Wait for the buffers in reverse order. That way we are + less likely to be woken up until all IOs have completed, and + so we incur less scheduling load. + */ +@@ -571,8 +571,10 @@ + + jbd_debug(3, "JBD: commit phase 6\n"); + +- if (is_journal_aborted(journal)) ++ if (is_journal_aborted(journal)) { ++ unlock_journal(journal); + goto skip_commit; ++ } + + /* Done it all: now write the commit record. We should have + * cleaned up our previous buffers by now, so if we are in abort +@@ -582,9 +584,10 @@ + descriptor = journal_get_descriptor_buffer(journal); + if (!descriptor) { + __journal_abort_hard(journal); ++ unlock_journal(journal); + goto skip_commit; + } +- ++ + /* AKPM: buglet - add `i' to tmp! */ + for (i = 0; i < jh2bh(descriptor)->b_size; i += 512) { + journal_header_t *tmp = +@@ -605,14 +608,32 @@ + put_bh(bh); /* One for getblk() */ + journal_unlock_journal_head(descriptor); + } +- lock_journal(journal); + + /* End of a transaction! Finally, we can do checkpoint + processing: any buffers committed as a result of this + transaction can be removed from any checkpoint list it was on + before. */ + +-skip_commit: ++skip_commit: /* The journal should be unlocked by now. */ ++ ++ /* Call any callbacks that had been registered for handles in this ++ * transaction. It is up to the callback to free any allocated ++ * memory. 
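The "user data goes here" comment in struct journal_callback above describes an intrusive convention: the caller allocates one structure with the journal_callback header at offset 0 and its private data behind it, and the commit-time loop hands each callback its own allocation back so it can be freed there. A sketch of that convention; my_commit_cb and note_committed are illustrative names, not part of the patch:

    /* Sketch: embedding struct journal_callback at offset 0 of a larger,
     * caller-owned structure, as the comment above prescribes. */
    #include <linux/jbd.h>
    #include <linux/slab.h>

    struct my_commit_cb {
            struct journal_callback jcb;    /* must be the first member */
            ino_t ino;                      /* caller-private data */
            __u32 generation;
    };

    static void my_commit_callback(struct journal_callback *jcb, int error)
    {
            struct my_commit_cb *cb = (struct my_commit_cb *)jcb;

            if (!error)                     /* error != 0: journal aborted */
                    note_committed(cb->ino, cb->generation); /* hypothetical */
            kfree(cb);      /* the callback owns the memory, per the comment */
    }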
++ */ ++ if (!list_empty(&commit_transaction->t_jcb)) { ++ struct list_head *p, *n; ++ int error = is_journal_aborted(journal); ++ ++ list_for_each_safe(p, n, &commit_transaction->t_jcb) { ++ struct journal_callback *jcb; ++ ++ jcb = list_entry(p, struct journal_callback, jcb_list); ++ list_del(p); ++ jcb->jcb_func(jcb, error); ++ } ++ } ++ ++ lock_journal(journal); + + jbd_debug(3, "JBD: commit phase 7\n"); + +Index: linux-2.4.19-pre1/fs/jbd/journal.c +=================================================================== +--- linux-2.4.19-pre1.orig/fs/jbd/journal.c 2003-11-21 02:53:20.000000000 +0300 ++++ linux-2.4.19-pre1/fs/jbd/journal.c 2003-11-21 03:04:47.000000000 +0300 +@@ -58,6 +58,7 @@ + #endif + EXPORT_SYMBOL(journal_flush); + EXPORT_SYMBOL(journal_revoke); ++EXPORT_SYMBOL(journal_callback_set); + + EXPORT_SYMBOL(journal_init_dev); + EXPORT_SYMBOL(journal_init_inode); +Index: linux-2.4.19-pre1/fs/jbd/transaction.c +=================================================================== +--- linux-2.4.19-pre1.orig/fs/jbd/transaction.c 2003-11-21 02:53:20.000000000 +0300 ++++ linux-2.4.19-pre1/fs/jbd/transaction.c 2003-11-21 03:05:14.000000000 +0300 +@@ -57,6 +57,7 @@ + transaction->t_state = T_RUNNING; + transaction->t_tid = journal->j_transaction_sequence++; + transaction->t_expires = jiffies + journal->j_commit_interval; ++ INIT_LIST_HEAD(&transaction->t_jcb); + + /* Set up the commit timer for the new transaction. */ + J_ASSERT (!journal->j_commit_timer_active); +@@ -90,7 +91,14 @@ + transaction_t *transaction; + int needed; + int nblocks = handle->h_buffer_credits; +- ++ ++ if (nblocks > journal->j_max_transaction_buffers) { ++ jbd_debug(1, "JBD: %s wants too many credits (%d > %d)\n", ++ current->comm, nblocks, ++ journal->j_max_transaction_buffers); ++ return -ENOSPC; ++ } ++ + jbd_debug(3, "New handle %p going live.\n", handle); + + repeat: +@@ -196,6 +204,20 @@ + return 0; + } + ++/* Allocate a new handle. This should probably be in a slab... */ ++static handle_t *new_handle(int nblocks) ++{ ++ handle_t *handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS); ++ if (!handle) ++ return NULL; ++ memset(handle, 0, sizeof (handle_t)); ++ handle->h_buffer_credits = nblocks; ++ handle->h_ref = 1; ++ INIT_LIST_HEAD(&handle->h_jcb); ++ ++ return handle; ++} ++ + /* + * Obtain a new handle. + * +@@ -222,14 +244,11 @@ + handle->h_ref++; + return handle; + } +- +- handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS); ++ ++ handle = new_handle(nblocks); + if (!handle) + return ERR_PTR(-ENOMEM); +- memset (handle, 0, sizeof (handle_t)); + +- handle->h_buffer_credits = nblocks; +- handle->h_ref = 1; + current->journal_info = handle; + + err = start_this_handle(journal, handle); +@@ -328,14 +347,11 @@ + + if (is_journal_aborted(journal)) + return ERR_PTR(-EIO); +- +- handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS); ++ ++ handle = new_handle(nblocks); + if (!handle) + return ERR_PTR(-ENOMEM); +- memset (handle, 0, sizeof (handle_t)); + +- handle->h_buffer_credits = nblocks; +- handle->h_ref = 1; + current->journal_info = handle; + + err = try_start_this_handle(journal, handle); +@@ -1324,6 +1340,28 @@ + #endif + + /* ++ * Register a callback function for this handle. The function will be ++ * called when the transaction that this handle is part of has been ++ * committed to disk with the original callback data struct and the ++ * error status of the journal as parameters. 
There is no guarantee of ++ * ordering between handles within a single transaction, nor between ++ * callbacks registered on the same handle. ++ * ++ * The caller is responsible for allocating the journal_callback struct. ++ * This is to allow the caller to add as much extra data to the callback ++ * as needed, but reduce the overhead of multiple allocations. The caller ++ * allocated struct must start with a struct journal_callback at offset 0, ++ * and has the caller-specific data afterwards. ++ */ ++void journal_callback_set(handle_t *handle, ++ void (*func)(struct journal_callback *jcb, int error), ++ struct journal_callback *jcb) ++{ ++ list_add_tail(&jcb->jcb_list, &handle->h_jcb); ++ jcb->jcb_func = func; ++} ++ ++/* + * All done for a particular handle. + * + * There is not much action needed here. We just return any remaining +@@ -1389,7 +1427,10 @@ + wake_up(&journal->j_wait_transaction_locked); + } + +- /* ++ /* Move callbacks from the handle to the transaction. */ ++ list_splice(&handle->h_jcb, &transaction->t_jcb); ++ ++ /* + * If the handle is marked SYNC, we need to set another commit + * going! We also want to force a commit if the current + * transaction is occupying too much of the log, or if the diff --git a/lustre/kernel_patches/patches/kallsyms-2.4-bgl.patch b/lustre/kernel_patches/patches/kallsyms-2.4-bgl.patch new file mode 100644 index 0000000..8f7188a --- /dev/null +++ b/lustre/kernel_patches/patches/kallsyms-2.4-bgl.patch @@ -0,0 +1,685 @@ +Index: linux-bgl/arch/arm/vmlinux-armo.lds.in +=================================================================== +--- linux-bgl.orig/arch/arm/vmlinux-armo.lds.in 2003-07-02 08:44:12.000000000 -0700 ++++ linux-bgl/arch/arm/vmlinux-armo.lds.in 2004-10-26 22:52:50.037677957 -0700 +@@ -62,6 +62,10 @@ + *(__ksymtab) + __stop___ksymtab = .; + ++ __start___kallsyms = .; /* All kernel symbols */ ++ *(__kallsyms) ++ __stop___kallsyms = .; ++ + *(.got) /* Global offset table */ + + _etext = .; /* End of text section */ +Index: linux-bgl/arch/arm/vmlinux-armv.lds.in +=================================================================== +--- linux-bgl.orig/arch/arm/vmlinux-armv.lds.in 2003-07-02 08:44:12.000000000 -0700 ++++ linux-bgl/arch/arm/vmlinux-armv.lds.in 2004-10-26 22:52:50.038677801 -0700 +@@ -67,6 +67,12 @@ + __stop___ksymtab = .; + } + ++ __kallsyms : { /* Kernel debugging table */ ++ __start___kallsyms = .; /* All kernel symbols */ ++ *(__kallsyms) ++ __stop___kallsyms = .; ++ } ++ + . 
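To tie the jbd pieces above together: callbacks are registered while a handle is open, journal_stop() splices handle->h_jcb onto the running transaction's t_jcb, and the commit path invokes the whole list with the journal's abort status. A sketch continuing the hypothetical my_commit_cb type:

    /* Sketch: registering a commit callback on an open handle. */
    static int note_on_commit(journal_t *journal, struct inode *inode)
    {
            handle_t *handle = journal_start(journal, 1);
            struct my_commit_cb *cb;

            if (IS_ERR(handle))
                    return PTR_ERR(handle);
            cb = kmalloc(sizeof(*cb), GFP_NOFS);
            if (cb) {
                    cb->ino = inode->i_ino;
                    cb->generation = inode->i_generation;
                    /* queued on handle->h_jcb; moved to the transaction's
                     * t_jcb by journal_stop() and run from commit */
                    journal_callback_set(handle, my_commit_callback, &cb->jcb);
            }
            return journal_stop(handle);
    }

This is the hook Lustre's fsfilt layer uses to learn when an operation is durably on disk.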
= ALIGN(8192); + + .data : { +Index: linux-bgl/arch/ppc/config.in +=================================================================== +--- linux-bgl.orig/arch/ppc/config.in 2004-10-04 09:55:49.000000000 -0700 ++++ linux-bgl/arch/ppc/config.in 2004-10-26 23:11:56.416643929 -0700 +@@ -732,6 +732,7 @@ + string 'Additional compile arguments' CONFIG_COMPILE_OPTIONS "-g -ggdb" + fi + fi ++bool 'Load all symbols for debugging/kksymoops' CONFIG_KALLSYMS + + if [ "$CONFIG_ALL_PPC" = "y" ]; then + bool 'Support for early boot text console (BootX or OpenFirmware only)' CONFIG_BOOTX_TEXT +Index: linux-bgl/arch/ppc/vmlinux.lds +=================================================================== +--- linux-bgl.orig/arch/ppc/vmlinux.lds 2003-07-02 08:43:30.000000000 -0700 ++++ linux-bgl/arch/ppc/vmlinux.lds 2004-10-26 22:52:50.043677020 -0700 +@@ -73,6 +73,10 @@ + __ksymtab : { *(__ksymtab) } + __stop___ksymtab = .; + ++ __start___kallsyms = .; /* All kernel symbols */ ++ __kallsyms : { *(__kallsyms) } ++ __stop___kallsyms = .; ++ + __start___ftr_fixup = .; + __ftr_fixup : { *(__ftr_fixup) } + __stop___ftr_fixup = .; +Index: linux-bgl/arch/i386/config.in +=================================================================== +--- linux-bgl.orig/arch/i386/config.in 2003-07-02 08:43:46.000000000 -0700 ++++ linux-bgl/arch/i386/config.in 2004-10-26 22:52:50.040677488 -0700 +@@ -363,6 +363,7 @@ + if [ "$CONFIG_ISDN" != "n" ]; then + source drivers/isdn/Config.in + fi ++ bool ' Load all symbols for debugging/kksymoops' CONFIG_KALLSYMS + fi + endmenu + +Index: linux-bgl/arch/i386/vmlinux.lds +=================================================================== +--- linux-bgl.orig/arch/i386/vmlinux.lds 2003-07-02 08:44:32.000000000 -0700 ++++ linux-bgl/arch/i386/vmlinux.lds 2004-10-26 22:52:50.040677488 -0700 +@@ -27,6 +27,9 @@ + __start___ksymtab = .; /* Kernel symbol table */ + __ksymtab : { *(__ksymtab) } + __stop___ksymtab = .; ++ __start___kallsyms = .; /* All kernel symbols */ ++ __kallsyms : { *(__kallsyms) } ++ __stop___kallsyms = .; + + .data : { /* Data */ + *(.data) +Index: linux-bgl/arch/ia64/config.in +=================================================================== +--- linux-bgl.orig/arch/ia64/config.in 2003-07-02 08:44:12.000000000 -0700 ++++ linux-bgl/arch/ia64/config.in 2004-10-26 22:52:50.055675147 -0700 +@@ -278,4 +278,6 @@ + bool ' Turn on irq debug checks (slow!)' CONFIG_IA64_DEBUG_IRQ + fi + ++bool ' Load all symbols for debugging/kksymoops' CONFIG_KALLSYMS ++ + endmenu +Index: linux-bgl/arch/alpha/vmlinux.lds.in +=================================================================== +--- linux-bgl.orig/arch/alpha/vmlinux.lds.in 2003-07-02 08:43:45.000000000 -0700 ++++ linux-bgl/arch/alpha/vmlinux.lds.in 2004-10-26 22:52:50.036678113 -0700 +@@ -28,6 +28,10 @@ + __stop___ksymtab = .; + .kstrtab : { *(.kstrtab) } + ++ __start___kallsyms = .; /* All kernel symbols */ ++ __kallsyms : { *(__kallsyms) } ++ __stop___kallsyms = .; ++ + /* Startup code */ + . 
= ALIGN(8192); + __init_begin = .; +Index: linux-bgl/Makefile +=================================================================== +--- linux-bgl.orig/Makefile 2004-10-04 09:55:49.000000000 -0700 ++++ linux-bgl/Makefile 2004-10-26 22:54:44.018588371 -0700 +@@ -38,10 +38,13 @@ + MAKEFILES = $(TOPDIR)/.config + GENKSYMS = /sbin/genksyms + DEPMOD = /sbin/depmod ++KALLSYMS = /sbin/kallsyms + MODFLAGS = -DMODULE + CFLAGS_KERNEL = + PERL = perl + ++TMPPREFIX = ++ + export VERSION PATCHLEVEL SUBLEVEL EXTRAVERSION KERNELRELEASE ARCH \ + CONFIG_SHELL TOPDIR HPATH HOSTCC HOSTCFLAGS CROSS_COMPILE AS LD CC \ + CPP AR NM STRIP OBJCOPY OBJDUMP MAKE MAKEFILES GENKSYMS MODFLAGS PERL +@@ -198,7 +201,7 @@ + CLEAN_FILES = \ + kernel/ksyms.lst include/linux/compile.h \ + vmlinux System.map \ +- .tmp* \ ++ $(TMPPREFIX).tmp* \ + drivers/char/consolemap_deftbl.c drivers/video/promcon_tbl.c \ + drivers/char/conmakehash \ + drivers/char/drm/*-mod.c \ +@@ -278,16 +281,39 @@ + boot: vmlinux + @$(MAKE) CFLAGS="$(CFLAGS) $(CFLAGS_KERNEL)" -C arch/$(ARCH)/boot + ++LD_VMLINUX := $(LD) $(LINKFLAGS) $(HEAD) init/main.o init/version.o init/do_mounts.o \ ++ --start-group \ ++ $(CORE_FILES) \ ++ $(DRIVERS) \ ++ $(NETWORKS) \ ++ $(LIBS) \ ++ --end-group ++ifeq ($(CONFIG_KALLSYMS),y) ++LD_VMLINUX_KALLSYMS := $(TMPPREFIX).tmp_kallsyms3.o ++else ++LD_VMLINUX_KALLSYMS := ++endif ++ + vmlinux: include/linux/version.h $(CONFIGURATION) init/main.o init/version.o init/do_mounts.o linuxsubdirs +- $(LD) $(LINKFLAGS) $(HEAD) init/main.o init/version.o init/do_mounts.o \ +- --start-group \ +- $(CORE_FILES) \ +- $(DRIVERS) \ +- $(NETWORKS) \ +- $(LIBS) \ +- --end-group \ +- -o vmlinux ++ @$(MAKE) CFLAGS="$(CFLAGS) $(CFLAGS_KERNEL)" kallsyms ++ ++.PHONY: kallsyms ++ ++kallsyms: ++ifeq ($(CONFIG_KALLSYMS),y) ++ @echo kallsyms pass 1 ++ $(LD_VMLINUX) -o $(TMPPREFIX).tmp_vmlinux1 ++ @$(KALLSYMS) $(TMPPREFIX).tmp_vmlinux1 > $(TMPPREFIX).tmp_kallsyms1.o ++ @echo kallsyms pass 2 ++ @$(LD_VMLINUX) $(TMPPREFIX).tmp_kallsyms1.o -o $(TMPPREFIX).tmp_vmlinux2 ++ @$(KALLSYMS) $(TMPPREFIX).tmp_vmlinux2 > $(TMPPREFIX).tmp_kallsyms2.o ++ @echo kallsyms pass 3 ++ @$(LD_VMLINUX) $(TMPPREFIX).tmp_kallsyms2.o -o $(TMPPREFIX).tmp_vmlinux3 ++ @$(KALLSYMS) $(TMPPREFIX).tmp_vmlinux3 > $(TMPPREFIX).tmp_kallsyms3.o ++endif ++ $(LD_VMLINUX) $(LD_VMLINUX_KALLSYMS) -o vmlinux + $(NM) vmlinux | grep -v '\(compiled\)\|\(\.o$$\)\|\( [aUw] \)\|\(\.\.ng$$\)\|\(LASH[RL]DI\)' | sort > System.map ++ @rm -f $(TMPPREFIX).tmp_vmlinux* $(TMPPREFIX).tmp_kallsyms* + + symlinks: + rm -f include/asm +Index: linux-bgl/kernel/Makefile +=================================================================== +--- linux-bgl.orig/kernel/Makefile 2003-07-02 08:44:29.000000000 -0700 ++++ linux-bgl/kernel/Makefile 2004-10-26 22:59:34.101037916 -0700 +@@ -19,6 +19,7 @@ + obj-$(CONFIG_UID16) += uid16.o + obj-$(CONFIG_MODULES) += ksyms.o + obj-$(CONFIG_PM) += pm.o ++obj-$(CONFIG_KALLSYMS) += kallsyms.o + + ifneq ($(CONFIG_IA64),y) + # According to Alan Modra , the -fno-omit-frame-pointer is +Index: linux-bgl/kernel/ksyms.c +=================================================================== +--- linux-bgl.orig/kernel/ksyms.c 2004-10-26 21:49:59.922431839 -0700 ++++ linux-bgl/kernel/ksyms.c 2004-10-26 22:52:50.050675927 -0700 +@@ -56,6 +56,9 @@ + #ifdef CONFIG_KMOD + #include + #endif ++#ifdef CONFIG_KALLSYMS ++#include ++#endif + + extern void set_device_ro(kdev_t dev,int flag); + +@@ -81,6 +84,15 @@ + EXPORT_SYMBOL(inter_module_put); + EXPORT_SYMBOL(try_inc_mod_count); + ++#ifdef CONFIG_KALLSYMS 
++extern const char __start___kallsyms[]; ++extern const char __stop___kallsyms[]; ++EXPORT_SYMBOL(__start___kallsyms); ++EXPORT_SYMBOL(__stop___kallsyms); ++ ++ ++#endif ++ + /* process memory management */ + EXPORT_SYMBOL(do_mmap_pgoff); + EXPORT_SYMBOL(do_munmap); +Index: linux-bgl/kernel/kallsyms.c +=================================================================== +--- linux-bgl.orig/kernel/kallsyms.c 2004-10-26 17:10:51.404753448 -0700 ++++ linux-bgl/kernel/kallsyms.c 2004-10-26 22:52:50.048676240 -0700 +@@ -0,0 +1,306 @@ ++/* An example of using kallsyms data in a kernel debugger. ++ ++ Copyright 2000 Keith Owens April 2000 ++ ++ This file is part of the Linux modutils. ++ ++ This program is free software; you can redistribute it and/or modify it ++ under the terms of the GNU General Public License as published by the ++ Free Software Foundation; either version 2 of the License, or (at your ++ option) any later version. ++ ++ This program is distributed in the hope that it will be useful, but ++ WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ General Public License for more details. ++ ++ You should have received a copy of the GNU General Public License ++ along with this program; if not, write to the Free Software Foundation, ++ Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ++ */ ++ ++#ident "$Id: kallsyms-2.4-bgl.patch,v 1.1.4.1 2004/10/29 00:51:21 jacob Exp $" ++ ++/* ++ This code uses the list of all kernel and module symbols to :- ++ ++ * Find any non-stack symbol in a kernel or module. Symbols do ++ not have to be exported for debugging. ++ ++ * Convert an address to the module (or kernel) that owns it, the ++ section it is in and the nearest symbol. This finds all non-stack ++ symbols, not just exported ones. ++ ++ You need modutils >= 2.3.11 and a kernel with the kallsyms patch ++ which was compiled with CONFIG_KALLSYMS. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++/* These external symbols are only set on kernels compiled with ++ * CONFIG_KALLSYMS. ++ */ ++ ++extern const char __start___kallsyms[]; ++extern const char __stop___kallsyms[]; ++ ++static struct module **kallsyms_module_list; ++ ++static void kallsyms_get_module_list(void) ++{ ++ const struct kallsyms_header *ka_hdr; ++ const struct kallsyms_section *ka_sec; ++ const struct kallsyms_symbol *ka_sym; ++ const char *ka_str; ++ int i; ++ const char *p; ++ ++ if (__start___kallsyms >= __stop___kallsyms) ++ return; ++ ka_hdr = (struct kallsyms_header *)__start___kallsyms; ++ ka_sec = (struct kallsyms_section *) ++ ((char *)(ka_hdr) + ka_hdr->section_off); ++ ka_sym = (struct kallsyms_symbol *) ++ ((char *)(ka_hdr) + ka_hdr->symbol_off); ++ ka_str = ++ ((char *)(ka_hdr) + ka_hdr->string_off); ++ ++ for (i = 0; i < ka_hdr->symbols; kallsyms_next_sym(ka_hdr, ka_sym), ++i) { ++ p = ka_str + ka_sym->name_off; ++ if (strcmp(p, "module_list") == 0) { ++ if (ka_sym->symbol_addr) ++ kallsyms_module_list = (struct module **)(ka_sym->symbol_addr); ++ break; ++ } ++ } ++} ++ ++static inline void kallsyms_do_first_time(void) ++{ ++ static int first_time = 1; ++ if (first_time) ++ kallsyms_get_module_list(); ++ first_time = 0; ++} ++ ++/* A symbol can appear in more than one module. A token is used to ++ * restart the scan at the next module, set the token to 0 for the ++ * first scan of each symbol. 
++ */ ++ ++int kallsyms_symbol_to_address( ++ const char *name, /* Name to lookup */ ++ unsigned long *token, /* Which module to start at */ ++ const char **mod_name, /* Set to module name */ ++ unsigned long *mod_start, /* Set to start address of module */ ++ unsigned long *mod_end, /* Set to end address of module */ ++ const char **sec_name, /* Set to section name */ ++ unsigned long *sec_start, /* Set to start address of section */ ++ unsigned long *sec_end, /* Set to end address of section */ ++ const char **sym_name, /* Set to full symbol name */ ++ unsigned long *sym_start, /* Set to start address of symbol */ ++ unsigned long *sym_end /* Set to end address of symbol */ ++ ) ++{ ++ const struct kallsyms_header *ka_hdr = NULL; /* stupid gcc */ ++ const struct kallsyms_section *ka_sec; ++ const struct kallsyms_symbol *ka_sym = NULL; ++ const char *ka_str = NULL; ++ const struct module *m; ++ int i = 0, l; ++ const char *p, *pt_R; ++ char *p2; ++ ++ kallsyms_do_first_time(); ++ if (!kallsyms_module_list) ++ return(0); ++ ++ /* Restart? */ ++ m = *kallsyms_module_list; ++ if (token && *token) { ++ for (; m; m = m->next) ++ if ((unsigned long)m == *token) ++ break; ++ if (m) ++ m = m->next; ++ } ++ ++ for (; m; m = m->next) { ++ if (!mod_member_present(m, kallsyms_start) || ++ !mod_member_present(m, kallsyms_end) || ++ m->kallsyms_start >= m->kallsyms_end) ++ continue; ++ ka_hdr = (struct kallsyms_header *)m->kallsyms_start; ++ ka_sym = (struct kallsyms_symbol *) ++ ((char *)(ka_hdr) + ka_hdr->symbol_off); ++ ka_str = ++ ((char *)(ka_hdr) + ka_hdr->string_off); ++ for (i = 0; i < ka_hdr->symbols; ++i, kallsyms_next_sym(ka_hdr, ka_sym)) { ++ p = ka_str + ka_sym->name_off; ++ if (strcmp(p, name) == 0) ++ break; ++ /* Unversioned requests match versioned names */ ++ if (!(pt_R = strstr(p, "_R"))) ++ continue; ++ l = strlen(pt_R); ++ if (l < 10) ++ continue; /* Not _R.*xxxxxxxx */ ++ (void)simple_strtoul(pt_R+l-8, &p2, 16); ++ if (*p2) ++ continue; /* Not _R.*xxxxxxxx */ ++ if (strncmp(p, name, pt_R-p) == 0) ++ break; /* Match with version */ ++ } ++ if (i < ka_hdr->symbols) ++ break; ++ } ++ ++ if (token) ++ *token = (unsigned long)m; ++ if (!m) ++ return(0); /* not found */ ++ ++ ka_sec = (const struct kallsyms_section *) ++ ((char *)ka_hdr + ka_hdr->section_off + ka_sym->section_off); ++ *mod_name = *(m->name) ? 
m->name : "kernel"; ++ *mod_start = ka_hdr->start; ++ *mod_end = ka_hdr->end; ++ *sec_name = ka_sec->name_off + ka_str; ++ *sec_start = ka_sec->start; ++ *sec_end = ka_sec->start + ka_sec->size; ++ *sym_name = ka_sym->name_off + ka_str; ++ *sym_start = ka_sym->symbol_addr; ++ if (i < ka_hdr->symbols-1) { ++ const struct kallsyms_symbol *ka_symn = ka_sym; ++ kallsyms_next_sym(ka_hdr, ka_symn); ++ *sym_end = ka_symn->symbol_addr; ++ } ++ else ++ *sym_end = *sec_end; ++ return(1); ++} ++ ++int kallsyms_address_to_symbol( ++ unsigned long address, /* Address to lookup */ ++ const char **mod_name, /* Set to module name */ ++ unsigned long *mod_start, /* Set to start address of module */ ++ unsigned long *mod_end, /* Set to end address of module */ ++ const char **sec_name, /* Set to section name */ ++ unsigned long *sec_start, /* Set to start address of section */ ++ unsigned long *sec_end, /* Set to end address of section */ ++ const char **sym_name, /* Set to full symbol name */ ++ unsigned long *sym_start, /* Set to start address of symbol */ ++ unsigned long *sym_end /* Set to end address of symbol */ ++ ) ++{ ++ const struct kallsyms_header *ka_hdr = NULL; /* stupid gcc */ ++ const struct kallsyms_section *ka_sec = NULL; ++ const struct kallsyms_symbol *ka_sym; ++ const char *ka_str; ++ const struct module *m; ++ int i; ++ unsigned long end; ++ ++ kallsyms_do_first_time(); ++ if (!kallsyms_module_list) ++ return(0); ++ ++ for (m = *kallsyms_module_list; m; m = m->next) { ++ if (!mod_member_present(m, kallsyms_start) || ++ !mod_member_present(m, kallsyms_end) || ++ m->kallsyms_start >= m->kallsyms_end) ++ continue; ++ ka_hdr = (struct kallsyms_header *)m->kallsyms_start; ++ ka_sec = (const struct kallsyms_section *) ++ ((char *)ka_hdr + ka_hdr->section_off); ++ /* Is the address in any section in this module? */ ++ for (i = 0; i < ka_hdr->sections; ++i, kallsyms_next_sec(ka_hdr, ka_sec)) { ++ if (ka_sec->start <= address && ++ (ka_sec->start + ka_sec->size) > address) ++ break; ++ } ++ if (i < ka_hdr->sections) ++ break; /* Found a matching section */ ++ } ++ ++ if (!m) ++ return(0); /* not found */ ++ ++ ka_sym = (struct kallsyms_symbol *) ++ ((char *)(ka_hdr) + ka_hdr->symbol_off); ++ ka_str = ++ ((char *)(ka_hdr) + ka_hdr->string_off); ++ *mod_name = *(m->name) ? m->name : "kernel"; ++ *mod_start = ka_hdr->start; ++ *mod_end = ka_hdr->end; ++ *sec_name = ka_sec->name_off + ka_str; ++ *sec_start = ka_sec->start; ++ *sec_end = ka_sec->start + ka_sec->size; ++ *sym_name = *sec_name; /* In case we find no matching symbol */ ++ *sym_start = *sec_start; ++ *sym_end = *sec_end; ++ ++ for (i = 0; i < ka_hdr->symbols; ++i, kallsyms_next_sym(ka_hdr, ka_sym)) { ++ if (ka_sym->symbol_addr > address) ++ continue; ++ if (i < ka_hdr->symbols-1) { ++ const struct kallsyms_symbol *ka_symn = ka_sym; ++ kallsyms_next_sym(ka_hdr, ka_symn); ++ end = ka_symn->symbol_addr; ++ } ++ else ++ end = *sec_end; ++ if (end <= address) ++ continue; ++ if ((char *)ka_hdr + ka_hdr->section_off + ka_sym->section_off ++ != (char *)ka_sec) ++ continue; /* wrong section */ ++ *sym_name = ka_str + ka_sym->name_off; ++ *sym_start = ka_sym->symbol_addr; ++ *sym_end = end; ++ break; ++ } ++ return(1); ++} ++ ++/* List all sections in all modules. The callback routine is invoked with ++ * token, module name, section name, section start, section end, section flags. 
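kallsyms_address_to_symbol() above is the primitive that the kksymoops patch later in this commit wraps as lookup_symbol(). A sketch of resolving a single text address into the usual "symbol+offset [module]" form:

    /* Sketch: resolve one kernel text address with the API above. */
    #include <linux/kallsyms.h>
    #include <linux/kernel.h>

    static void print_addr(unsigned long addr)
    {
            const char *mod, *sec, *sym;
            unsigned long ms, me, ss, se, ys, ye;

            if (kallsyms_address_to_symbol(addr, &mod, &ms, &me,
                                           &sec, &ss, &se, &sym, &ys, &ye))
                    printk("[<%08lx>] %s+0x%lx [%s]\n",
                           addr, sym, addr - ys, mod);
            else
                    printk("[<%08lx>] (unresolved)\n", addr);
    }

Every output pointer must be valid, since the function stores through all of them on success.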
++ */ ++int kallsyms_sections(void *token, ++ int (*callback)(void *, const char *, const char *, ElfW(Addr), ElfW(Addr), ElfW(Word))) ++{ ++ const struct kallsyms_header *ka_hdr = NULL; /* stupid gcc */ ++ const struct kallsyms_section *ka_sec = NULL; ++ const char *ka_str; ++ const struct module *m; ++ int i; ++ ++ kallsyms_do_first_time(); ++ if (!kallsyms_module_list) ++ return(0); ++ ++ for (m = *kallsyms_module_list; m; m = m->next) { ++ if (!mod_member_present(m, kallsyms_start) || ++ !mod_member_present(m, kallsyms_end) || ++ m->kallsyms_start >= m->kallsyms_end) ++ continue; ++ ka_hdr = (struct kallsyms_header *)m->kallsyms_start; ++ ka_sec = (const struct kallsyms_section *) ((char *)ka_hdr + ka_hdr->section_off); ++ ka_str = ((char *)(ka_hdr) + ka_hdr->string_off); ++ for (i = 0; i < ka_hdr->sections; ++i, kallsyms_next_sec(ka_hdr, ka_sec)) { ++ if (callback( ++ token, ++ *(m->name) ? m->name : "kernel", ++ ka_sec->name_off + ka_str, ++ ka_sec->start, ++ ka_sec->start + ka_sec->size, ++ ka_sec->flags)) ++ return(0); ++ } ++ } ++ return(1); ++} +Index: linux-bgl/include/linux/kallsyms.h +=================================================================== +--- linux-bgl.orig/include/linux/kallsyms.h 2004-10-26 17:10:51.404753448 -0700 ++++ linux-bgl/include/linux/kallsyms.h 2004-10-26 22:52:50.045676708 -0700 +@@ -0,0 +1,141 @@ ++/* kallsyms headers ++ Copyright 2000 Keith Owens ++ ++ This file is part of the Linux modutils. It is exported to kernel ++ space so debuggers can access the kallsyms data. ++ ++ The kallsyms data contains all the non-stack symbols from a kernel ++ or a module. The kernel symbols are held between __start___kallsyms ++ and __stop___kallsyms. The symbols for a module are accessed via ++ the struct module chain which is based at module_list. ++ ++ This program is free software; you can redistribute it and/or modify it ++ under the terms of the GNU General Public License as published by the ++ Free Software Foundation; either version 2 of the License, or (at your ++ option) any later version. ++ ++ This program is distributed in the hope that it will be useful, but ++ WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ General Public License for more details. ++ ++ You should have received a copy of the GNU General Public License ++ along with this program; if not, write to the Free Software Foundation, ++ Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ++ */ ++ ++#ident "$Id: kallsyms-2.4-bgl.patch,v 1.1.4.1 2004/10/29 00:51:21 jacob Exp $" ++ ++#ifndef MODUTILS_KALLSYMS_H ++#define MODUTILS_KALLSYMS_H 1 ++ ++/* Have to (re)define these ElfW entries here because external kallsyms ++ * code does not have access to modutils/include/obj.h. This code is ++ * included from user spaces tools (modutils) and kernel, they need ++ * different includes. ++ */ ++ ++#ifndef ELFCLASS32 ++#ifdef __KERNEL__ ++#include ++#else /* __KERNEL__ */ ++#include ++#endif /* __KERNEL__ */ ++#endif /* ELFCLASS32 */ ++ ++#ifndef ELFCLASSM ++#define ELFCLASSM ELF_CLASS ++#endif ++ ++#ifndef ElfW ++# if ELFCLASSM == ELFCLASS32 ++# define ElfW(x) Elf32_ ## x ++# define ELFW(x) ELF32_ ## x ++# else ++# define ElfW(x) Elf64_ ## x ++# define ELFW(x) ELF64_ ## x ++# endif ++#endif ++ ++/* Format of data in the kallsyms section. ++ * Most of the fields are small numbers but the total size and all ++ * offsets can be large so use the 32/64 bit types for these fields. 
++ * ++ * Do not use sizeof() on these structures, modutils may be using extra ++ * fields. Instead use the size fields in the header to access the ++ * other bits of data. ++ */ ++ ++struct kallsyms_header { ++ int size; /* Size of this header */ ++ ElfW(Word) total_size; /* Total size of kallsyms data */ ++ int sections; /* Number of section entries */ ++ ElfW(Off) section_off; /* Offset to first section entry */ ++ int section_size; /* Size of one section entry */ ++ int symbols; /* Number of symbol entries */ ++ ElfW(Off) symbol_off; /* Offset to first symbol entry */ ++ int symbol_size; /* Size of one symbol entry */ ++ ElfW(Off) string_off; /* Offset to first string */ ++ ElfW(Addr) start; /* Start address of first section */ ++ ElfW(Addr) end; /* End address of last section */ ++}; ++ ++struct kallsyms_section { ++ ElfW(Addr) start; /* Start address of section */ ++ ElfW(Word) size; /* Size of this section */ ++ ElfW(Off) name_off; /* Offset to section name */ ++ ElfW(Word) flags; /* Flags from section */ ++}; ++ ++struct kallsyms_symbol { ++ ElfW(Off) section_off; /* Offset to section that owns this symbol */ ++ ElfW(Addr) symbol_addr; /* Address of symbol */ ++ ElfW(Off) name_off; /* Offset to symbol name */ ++}; ++ ++#define KALLSYMS_SEC_NAME "__kallsyms" ++#define KALLSYMS_IDX 2 /* obj_kallsyms creates kallsyms as section 2 */ ++ ++#define kallsyms_next_sec(h,s) \ ++ ((s) = (struct kallsyms_section *)((char *)(s) + (h)->section_size)) ++#define kallsyms_next_sym(h,s) \ ++ ((s) = (struct kallsyms_symbol *)((char *)(s) + (h)->symbol_size)) ++ ++int kallsyms_symbol_to_address( ++ const char *name, /* Name to lookup */ ++ unsigned long *token, /* Which module to start with */ ++ const char **mod_name, /* Set to module name or "kernel" */ ++ unsigned long *mod_start, /* Set to start address of module */ ++ unsigned long *mod_end, /* Set to end address of module */ ++ const char **sec_name, /* Set to section name */ ++ unsigned long *sec_start, /* Set to start address of section */ ++ unsigned long *sec_end, /* Set to end address of section */ ++ const char **sym_name, /* Set to full symbol name */ ++ unsigned long *sym_start, /* Set to start address of symbol */ ++ unsigned long *sym_end /* Set to end address of symbol */ ++ ); ++ ++int kallsyms_address_to_symbol( ++ unsigned long address, /* Address to lookup */ ++ const char **mod_name, /* Set to module name */ ++ unsigned long *mod_start, /* Set to start address of module */ ++ unsigned long *mod_end, /* Set to end address of module */ ++ const char **sec_name, /* Set to section name */ ++ unsigned long *sec_start, /* Set to start address of section */ ++ unsigned long *sec_end, /* Set to end address of section */ ++ const char **sym_name, /* Set to full symbol name */ ++ unsigned long *sym_start, /* Set to start address of symbol */ ++ unsigned long *sym_end /* Set to end address of symbol */ ++ ); ++ ++int kallsyms_sections(void *token, ++ int (*callback)(void *, /* token */ ++ const char *, /* module name */ ++ const char *, /* section name */ ++ ElfW(Addr), /* Section start */ ++ ElfW(Addr), /* Section end */ ++ ElfW(Word) /* Section flags */ ++ ) ++ ); ++ ++#endif /* kallsyms.h */ diff --git a/lustre/kernel_patches/patches/kksymoops-2.4-bgl.patch b/lustre/kernel_patches/patches/kksymoops-2.4-bgl.patch new file mode 100644 index 0000000..9d33973 --- /dev/null +++ b/lustre/kernel_patches/patches/kksymoops-2.4-bgl.patch @@ -0,0 +1,678 @@ +Index: linux-bgl/arch/i386/kernel/traps.c 
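The token argument in the prototypes above is what allows a name defined in several modules to be found everywhere it appears: each successful call records the module it matched, and the next call resumes at the following module. A sketch, starting from token 0 as the documentation requires:

    /* Sketch: enumerate every definition of a symbol name across the
     * kernel and all loaded modules using the restart token. */
    static void find_all(const char *name)
    {
            const char *mod, *sec, *sym;
            unsigned long ms, me, ss, se, ys, ye;
            unsigned long token = 0;        /* 0 = first scan */

            while (kallsyms_symbol_to_address(name, &token, &mod, &ms, &me,
                                              &sec, &ss, &se, &sym, &ys, &ye))
                    printk("%s: %s:%s at 0x%08lx\n", name, mod, sec, ys);
    }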
+=================================================================== +--- linux-bgl.orig/arch/i386/kernel/traps.c 2003-07-02 08:43:23.000000000 -0700 ++++ linux-bgl/arch/i386/kernel/traps.c 2004-10-26 23:25:17.950442396 -0700 +@@ -24,6 +24,7 @@ + #include + #include + #include ++#include + + #ifdef CONFIG_MCA + #include +@@ -135,6 +136,8 @@ + { + int i; + unsigned long addr; ++ /* static to not take up stackspace; if we race here too bad */ ++ static char buffer[512]; + + if (!stack) + stack = (unsigned long*)&stack; +@@ -144,9 +147,8 @@ + while (((long) stack & (THREAD_SIZE-1)) != 0) { + addr = *stack++; + if (kernel_text_address(addr)) { +- if (i && ((i % 6) == 0)) +- printk("\n "); +- printk(" [<%08lx>]", addr); ++ lookup_symbol(addr, buffer, 512); ++ printk("[<%08lx>] %s (0x%p)\n", addr,buffer,stack-1); + i++; + } + } +@@ -186,12 +188,19 @@ + show_trace(esp); + } + ++#ifdef CONFIG_MK7 ++#define ARCHIT "/athlon" ++#else ++#define ARCHIT "/i686" ++#endif ++ + void show_registers(struct pt_regs *regs) + { + int i; + int in_kernel = 1; + unsigned long esp; + unsigned short ss; ++ static char buffer[512]; + + esp = (unsigned long) (®s->esp); + ss = __KERNEL_DS; +@@ -200,8 +209,12 @@ + esp = regs->esp; + ss = regs->xss & 0xffff; + } ++ ++ print_modules(); ++ lookup_symbol(regs->eip, buffer, 512); + printk("CPU: %d\nEIP: %04x:[<%08lx>] %s\nEFLAGS: %08lx\n", + smp_processor_id(), 0xffff & regs->xcs, regs->eip, print_tainted(), regs->eflags); ++ printk("\nEIP is at %s (" UTS_RELEASE ARCHIT ")\n",buffer); + printk("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", + regs->eax, regs->ebx, regs->ecx, regs->edx); + printk("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", +@@ -261,7 +274,7 @@ + if (__get_user(file, (char **)(eip + 4)) || + (unsigned long)file < PAGE_OFFSET || __get_user(c, file)) + file = ""; +- ++ printk("------------[ cut here ]------------\n"); + printk("kernel BUG at %s:%d!\n", file, line); + + no_bug: +Index: linux-bgl/arch/i386/kernel/process.c +=================================================================== +--- linux-bgl.orig/arch/i386/kernel/process.c 2003-07-02 08:44:07.000000000 -0700 ++++ linux-bgl/arch/i386/kernel/process.c 2004-10-26 23:28:53.017015082 -0700 +@@ -33,6 +33,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -437,10 +438,14 @@ + void show_regs(struct pt_regs * regs) + { + unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; ++ static char buffer[512]; ++ ++ lookup_symbol(regs->eip, buffer, 512); + + printk("\n"); + printk("Pid: %d, comm: %20s\n", current->pid, current->comm); + printk("EIP: %04x:[<%08lx>] CPU: %d",0xffff & regs->xcs,regs->eip, smp_processor_id()); ++ printk("\nEIP is at %s (" UTS_RELEASE ")\n", buffer); + if (regs->xcs & 3) + printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp); + printk(" EFLAGS: %08lx %s\n",regs->eflags, print_tainted()); +Index: linux-bgl/arch/ia64/kernel/process.c +=================================================================== +--- linux-bgl.orig/arch/ia64/kernel/process.c 2003-07-02 08:43:26.000000000 -0700 ++++ linux-bgl/arch/ia64/kernel/process.c 2004-10-26 23:29:56.340005959 -0700 +@@ -18,6 +18,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -33,9 +34,10 @@ + #include + #endif + +-static void +-do_show_stack (struct unw_frame_info *info, void *arg) ++void ++ia64_do_show_stack (struct unw_frame_info *info, void *arg) + { ++ static char buffer[512]; + unsigned long ip, sp, bsp; + + printk("\nCall Trace: "); +@@ -46,7 +48,8 @@ + + unw_get_sp(info, &sp); 
+ unw_get_bsp(info, &bsp); +- printk("[<%016lx>] sp=0x%016lx bsp=0x%016lx\n", ip, sp, bsp); ++ lookup_symbol(ip, buffer, 512); ++ printk("[<%016lx>] sp=0x%016lx bsp=0x%016lx %s\n", ip, sp, bsp, buffer); + } while (unw_unwind(info) >= 0); + } + +@@ -56,19 +59,19 @@ + struct unw_frame_info info; + + unw_init_from_blocked_task(&info, task); +- do_show_stack(&info, 0); ++ ia64_do_show_stack(&info, 0); + } + + void + show_stack (struct task_struct *task) + { + if (!task) +- unw_init_running(do_show_stack, 0); ++ unw_init_running(ia64_do_show_stack, 0); + else { + struct unw_frame_info info; + + unw_init_from_blocked_task(&info, task); +- do_show_stack(&info, 0); ++ ia64_do_show_stack(&info, 0); + } + } + +@@ -76,8 +79,11 @@ + show_regs (struct pt_regs *regs) + { + unsigned long ip = regs->cr_iip + ia64_psr(regs)->ri; ++ static char buffer[512]; + + printk("\nPid: %d, comm: %20s\n", current->pid, current->comm); ++ lookup_symbol(ip, buffer, 512); ++ printk("EIP is at %s (" UTS_RELEASE ")\n", buffer); + printk("psr : %016lx ifs : %016lx ip : [<%016lx>] %s\n", + regs->cr_ipsr, regs->cr_ifs, ip, print_tainted()); + printk("unat: %016lx pfs : %016lx rsc : %016lx\n", +Index: linux-bgl/arch/s390/config.in +=================================================================== +--- linux-bgl.orig/arch/s390/config.in 2003-07-02 08:43:27.000000000 -0700 ++++ linux-bgl/arch/s390/config.in 2004-10-26 23:25:17.961440685 -0700 +@@ -73,5 +73,6 @@ + # bool 'Remote GDB kernel debugging' CONFIG_REMOTE_DEBUG + #fi + bool 'Magic SysRq key' CONFIG_MAGIC_SYSRQ ++bool 'Load all symbols for debugging/kksymoops' CONFIG_KALLSYMS + endmenu + +Index: linux-bgl/arch/s390/kernel/traps.c +=================================================================== +--- linux-bgl.orig/arch/s390/kernel/traps.c 2003-07-02 08:44:02.000000000 -0700 ++++ linux-bgl/arch/s390/kernel/traps.c 2004-10-26 23:25:17.964440218 -0700 +@@ -27,6 +27,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -108,27 +109,26 @@ + + void show_trace(unsigned long * stack) + { ++ static char buffer[512]; + unsigned long backchain, low_addr, high_addr, ret_addr; + int i; + + if (!stack) + stack = (unsigned long*)&stack; + +- printk("Call Trace: "); + low_addr = ((unsigned long) stack) & PSW_ADDR_MASK; + high_addr = (low_addr & (-THREAD_SIZE)) + THREAD_SIZE; + /* Skip the first frame (biased stack) */ + backchain = *((unsigned long *) low_addr) & PSW_ADDR_MASK; +- /* Print up to 8 lines */ +- for (i = 0; i < 8; i++) { ++ /* Print up to 20 lines */ ++ for (i = 0; i < 20; i++) { + if (backchain < low_addr || backchain >= high_addr) + break; + ret_addr = *((unsigned long *) (backchain+56)) & PSW_ADDR_MASK; + if (!kernel_text_address(ret_addr)) + break; +- if (i && ((i % 6) == 0)) +- printk("\n "); +- printk("[<%08lx>] ", ret_addr); ++ lookup_symbol(ret_addr, buffer, 512); ++ printk("[<%08lx>] %s (0x%lx)\n", ret_addr,buffer,backchain+56); + low_addr = backchain; + backchain = *((unsigned long *) backchain) & PSW_ADDR_MASK; + } +@@ -171,6 +171,7 @@ + + void show_registers(struct pt_regs *regs) + { ++ static char buffer[512]; + mm_segment_t old_fs; + char *mode; + int i; +@@ -179,6 +180,10 @@ + printk("%s PSW : %08lx %08lx\n", + mode, (unsigned long) regs->psw.mask, + (unsigned long) regs->psw.addr); ++ if (!(regs->psw.mask & PSW_PROBLEM_STATE)) { ++ lookup_symbol(regs->psw.addr & 0x7FFFFFFF, buffer, 512); ++ printk(" %s (" UTS_RELEASE ")\n", buffer); ++ } + printk("%s GPRS: %08x %08x %08x %08x\n", mode, + regs->gprs[0], regs->gprs[1], 
regs->gprs[2], regs->gprs[3]); + printk(" %08x %08x %08x %08x\n", +Index: linux-bgl/arch/s390x/config.in +=================================================================== +--- linux-bgl.orig/arch/s390x/config.in 2003-07-02 08:43:07.000000000 -0700 ++++ linux-bgl/arch/s390x/config.in 2004-10-26 23:25:17.964440218 -0700 +@@ -75,5 +75,6 @@ + # bool 'Remote GDB kernel debugging' CONFIG_REMOTE_DEBUG + #fi + bool 'Magic SysRq key' CONFIG_MAGIC_SYSRQ ++bool 'Load all symbols for debugging/kksymoops' CONFIG_KALLSYMS + endmenu + +Index: linux-bgl/arch/s390x/kernel/traps.c +=================================================================== +--- linux-bgl.orig/arch/s390x/kernel/traps.c 2003-07-02 08:43:25.000000000 -0700 ++++ linux-bgl/arch/s390x/kernel/traps.c 2004-10-26 23:25:17.966439907 -0700 +@@ -27,6 +27,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -112,25 +113,25 @@ + { + unsigned long backchain, low_addr, high_addr, ret_addr; + int i; ++ /* static to not take up stackspace; if we race here too bad */ ++ static char buffer[512]; + + if (!stack) + stack = (unsigned long*)&stack; + +- printk("Call Trace: "); + low_addr = ((unsigned long) stack) & PSW_ADDR_MASK; + high_addr = (low_addr & (-THREAD_SIZE)) + THREAD_SIZE; + /* Skip the first frame (biased stack) */ + backchain = *((unsigned long *) low_addr) & PSW_ADDR_MASK; +- /* Print up to 8 lines */ +- for (i = 0; i < 8; i++) { ++ /* Print up to 20 lines */ ++ for (i = 0; i < 20; i++) { + if (backchain < low_addr || backchain >= high_addr) + break; + ret_addr = *((unsigned long *) (backchain+112)) & PSW_ADDR_MASK; + if (!kernel_text_address(ret_addr)) + break; +- if (i && ((i % 3) == 0)) +- printk("\n "); +- printk("[<%016lx>] ", ret_addr); ++ lookup_symbol(ret_addr, buffer, 512); ++ printk("[<%016lx>] %s (0x%lx)\n", ret_addr, buffer, backchain+112); + low_addr = backchain; + backchain = *((unsigned long *) backchain) & PSW_ADDR_MASK; + } +@@ -173,6 +174,7 @@ + + void show_registers(struct pt_regs *regs) + { ++ static char buffer[512]; + mm_segment_t old_fs; + char *mode; + int i; +@@ -181,6 +183,10 @@ + printk("%s PSW : %016lx %016lx\n", + mode, (unsigned long) regs->psw.mask, + (unsigned long) regs->psw.addr); ++ if (!(regs->psw.mask & PSW_PROBLEM_STATE)) { ++ lookup_symbol(regs->psw.addr, buffer, 512); ++ printk(" %s (" UTS_RELEASE ")\n", buffer); ++ } + printk("%s GPRS: %016lx %016lx %016lx %016lx\n", mode, + regs->gprs[0], regs->gprs[1], regs->gprs[2], regs->gprs[3]); + printk(" %016lx %016lx %016lx %016lx\n", +Index: linux-bgl/arch/ppc64/mm/fault.c +=================================================================== +--- linux-bgl.orig/arch/ppc64/mm/fault.c 2003-07-02 08:43:12.000000000 -0700 ++++ linux-bgl/arch/ppc64/mm/fault.c 2004-10-26 23:30:24.467942247 -0700 +@@ -224,7 +224,6 @@ + if (debugger_kernel_faults) + debugger(regs); + #endif +- print_backtrace( (unsigned long *)regs->gpr[1] ); + panic("kernel access of bad area pc %lx lr %lx address %lX tsk %s/%d", + regs->nip,regs->link,address,current->comm,current->pid); + } +Index: linux-bgl/arch/ppc64/kernel/traps.c +=================================================================== +--- linux-bgl.orig/arch/ppc64/kernel/traps.c 2003-07-02 08:44:03.000000000 -0700 ++++ linux-bgl/arch/ppc64/kernel/traps.c 2004-10-26 23:33:45.297572484 -0700 +@@ -89,7 +89,6 @@ + #if defined(CONFIG_KDB) + kdb(KDB_REASON_OOPS, 0, (kdb_eframe_t) regs); + #endif +- print_backtrace((unsigned long *)regs->gpr[1]); + panic("Exception in kernel pc %lx signal 
%d",regs->nip,signr); + #if defined(CONFIG_PPCDBG) && (defined(CONFIG_XMON) || defined(CONFIG_KGDB)) + /* Allow us to catch SIGILLs for 64-bit app/glibc debugging. -Peter */ +@@ -187,7 +186,6 @@ + if (kdb(KDB_REASON_FAULT, 0, regs)) + return ; + #endif +- print_backtrace((unsigned long *)regs->gpr[1]); + panic("machine check"); + } + _exception(SIGSEGV, regs); +@@ -209,7 +207,6 @@ + } + #endif + show_regs(regs); +- print_backtrace((unsigned long *)regs->gpr[1]); + panic("System Management Interrupt"); + } + +Index: linux-bgl/arch/ppc64/kernel/process.c +=================================================================== +--- linux-bgl.orig/arch/ppc64/kernel/process.c 2003-07-02 08:44:31.000000000 -0700 ++++ linux-bgl/arch/ppc64/kernel/process.c 2004-10-26 23:33:01.060713583 -0700 +@@ -30,6 +30,8 @@ + #include + #include + #include ++#include ++#include + + #include + #include +@@ -130,12 +132,61 @@ + __restore_flags(s); + } + ++/* ++ * If the address is either in the .text section of the ++ * kernel, or in the vmalloc'ed module regions, it *may* ++ * be the address of a calling routine ++ */ ++ ++#ifdef CONFIG_MODULES ++ ++extern struct module *module_list; ++extern struct module kernel_module; ++extern char _stext[], _etext[]; ++ ++static inline int kernel_text_address(unsigned long addr) ++{ ++ int retval = 0; ++ struct module *mod; ++ ++ if (addr >= (unsigned long) &_stext && ++ addr <= (unsigned long) &_etext) ++ return 1; ++ ++ for (mod = module_list; mod != &kernel_module; mod = mod->next) { ++ /* mod_bound tests for addr being inside the vmalloc'ed ++ * module area. Of course it'd be better to test only ++ * for the .text subset... */ ++ if (mod_bound(addr, 0, mod)) { ++ retval = 1; ++ break; ++ } ++ } ++ ++ return retval; ++} ++ ++#else ++ ++static inline int kernel_text_address(unsigned long addr) ++{ ++ return (addr >= (unsigned long) &_stext && ++ addr <= (unsigned long) &_etext); ++} ++ ++#endif ++ ++ + void show_regs(struct pt_regs * regs) + { + int i; ++ static char buffer[512]; + +- printk("NIP: %016lX XER: %016lX LR: %016lX REGS: %p TRAP: %04lx %s\n", ++ print_modules(); ++ printk("NIP: %016lx XER: %016lx LR: %016lx REGS: %p TRAP: %04lx %s\n", + regs->nip, regs->xer, regs->link, regs,regs->trap, print_tainted()); ++ lookup_symbol(regs->nip, buffer, 512); ++ printk("NIP is at %s (" UTS_RELEASE ")\n", buffer); + printk("MSR: %016lx EE: %01x PR: %01x FP: %01x ME: %01x IR/DR: %01x%01x\n", + regs->msr, regs->msr&MSR_EE ? 1 : 0, regs->msr&MSR_PR ? 1 : 0, + regs->msr & MSR_FP ? 1 : 0,regs->msr&MSR_ME ? 
1 : 0, +@@ -147,27 +198,22 @@ + printk("\nlast math %p ", last_task_used_math); + + #ifdef CONFIG_SMP +- /* printk(" CPU: %d last CPU: %d", current->processor,current->last_processor); */ ++ printk("CPU: %d", smp_processor_id()); + #endif /* CONFIG_SMP */ + +- printk("\n"); + for (i = 0; i < 32; i++) + { + long r; + if ((i % 4) == 0) +- { +- printk("GPR%02d: ", i); +- } ++ printk("\nGPR%02d: ", i); + + if ( __get_user(r, &(regs->gpr[i])) ) + return; + +- printk("%016lX ", r); +- if ((i % 4) == 3) +- { +- printk("\n"); +- } ++ printk("%016lx ", r); + } ++ printk("\n"); ++ print_backtrace((unsigned long *)regs->gpr[1]); + } + + void exit_thread(void) +@@ -415,67 +461,24 @@ + } + } + +-extern char _stext[], _etext[]; +- +-char * ppc_find_proc_name( unsigned * p, char * buf, unsigned buflen ) +-{ +- unsigned long tb_flags; +- unsigned short name_len; +- unsigned long tb_start, code_start, code_ptr, code_offset; +- unsigned code_len; +- strcpy( buf, "Unknown" ); +- code_ptr = (unsigned long)p; +- code_offset = 0; +- if ( ( (unsigned long)p >= (unsigned long)_stext ) && ( (unsigned long)p <= (unsigned long)_etext ) ) { +- while ( (unsigned long)p <= (unsigned long)_etext ) { +- if ( *p == 0 ) { +- tb_start = (unsigned long)p; +- ++p; /* Point to traceback flags */ +- tb_flags = *((unsigned long *)p); +- p += 2; /* Skip over traceback flags */ +- if ( tb_flags & TB_NAME_PRESENT ) { +- if ( tb_flags & TB_PARMINFO ) +- ++p; /* skip over parminfo data */ +- if ( tb_flags & TB_HAS_TBOFF ) { +- code_len = *p; /* get code length */ +- code_start = tb_start - code_len; +- code_offset = code_ptr - code_start + 1; +- if ( code_offset > 0x100000 ) +- break; +- ++p; /* skip over code size */ +- } +- name_len = *((unsigned short *)p); +- if ( name_len > (buflen-20) ) +- name_len = buflen-20; +- memcpy( buf, ((char *)p)+2, name_len ); +- buf[name_len] = 0; +- if ( code_offset ) +- sprintf( buf+name_len, "+0x%lx", code_offset-1 ); +- } +- break; +- } +- ++p; +- } +- } +- return buf; +-} +- + void + print_backtrace(unsigned long *sp) + { + int cnt = 0; + unsigned long i; +- char name_buf[256]; ++ char buffer[512]; + +- printk("Call backtrace: \n"); ++ printk("Call Trace: \n"); + while (sp) { + if (__get_user( i, &sp[2] )) + break; +- printk("%016lX ", i); +- printk("%s\n", ppc_find_proc_name( (unsigned *)i, name_buf, 256 )); ++ if (kernel_text_address(i)) { ++ if (__get_user(sp, (unsigned long **)sp)) ++ break; ++ lookup_symbol(i, buffer, 512); ++ printk("[<%016lx>] %s\n", i, buffer); ++ } + if (cnt > 32) break; +- if (__get_user(sp, (unsigned long **)sp)) +- break; + } + printk("\n"); + } +@@ -515,6 +518,7 @@ + unsigned long ip, sp; + unsigned long stack_page = (unsigned long)p; + int count = 0; ++ static char buffer[512]; + + if (!p) + return; +@@ -528,7 +532,8 @@ + break; + if (count > 0) { + ip = *(unsigned long *)(sp + 16); +- printk("[%016lx] ", ip); ++ lookup_symbol(ip, buffer, 512); ++ printk("[<%016lx>] %s\n", ip, buffer); + } + } while (count++ < 16); + printk("\n"); +Index: linux-bgl/kernel/Makefile +=================================================================== +--- linux-bgl.orig/kernel/Makefile 2004-10-26 23:23:00.516655289 -0700 ++++ linux-bgl/kernel/Makefile 2004-10-26 23:35:04.930451186 -0700 +@@ -14,7 +14,7 @@ + obj-y = sched.o dma.o fork.o exec_domain.o panic.o printk.o \ + module.o exit.o itimer.o info.o time.o softirq.o resource.o \ + sysctl.o acct.o capability.o ptrace.o timer.o user.o \ +- signal.o sys.o kmod.o context.o ++ signal.o sys.o kmod.o context.o kksymoops.o + + 
obj-$(CONFIG_UID16) += uid16.o + obj-$(CONFIG_MODULES) += ksyms.o +Index: linux-bgl/kernel/kksymoops.c +=================================================================== +--- linux-bgl.orig/kernel/kksymoops.c 2004-10-26 17:10:51.404753448 -0700 ++++ linux-bgl/kernel/kksymoops.c 2004-10-26 23:25:17.971439129 -0700 +@@ -0,0 +1,82 @@ ++#include ++#include ++#include ++#include ++#include ++#ifdef CONFIG_KALLSYMS ++#include ++#endif ++ ++ ++ ++int lookup_symbol(unsigned long address, char *buffer, int buflen) ++{ ++ struct module *this_mod; ++ unsigned long bestsofar; ++ ++ const char *mod_name = NULL, *sec_name = NULL, *sym_name = NULL; ++ unsigned long mod_start,mod_end,sec_start,sec_end,sym_start,sym_end; ++ ++ if (!buffer) ++ return -EFAULT; ++ ++ if (buflen<256) ++ return -ENOMEM; ++ ++ memset(buffer,0,buflen); ++ ++#ifdef CONFIG_KALLSYMS ++ if (!kallsyms_address_to_symbol(address,&mod_name,&mod_start,&mod_end,&sec_name, ++ &sec_start, &sec_end, &sym_name, &sym_start, &sym_end)) { ++ /* kallsyms doesn't have a clue; let's try harder */ ++ bestsofar = 0; ++ snprintf(buffer,buflen-1,"[unresolved]"); ++ ++ this_mod = module_list; ++ ++ while (this_mod != NULL) { ++ int i; ++ /* walk the symbol list of this module. Only symbols ++ whose address is smaller than the searched-for address ++ are relevant; and only if it's better than the best so far */ ++ for (i=0; i< this_mod->nsyms; i++) ++ if ((this_mod->syms[i].value<=address) && ++ (bestsofar < this_mod->syms[i].value)) { ++ snprintf(buffer,buflen-1,"%s [%s] 0x%x", ++ this_mod->syms[i].name, ++ this_mod->name, ++ (unsigned int)(address - this_mod->syms[i].value)); ++ bestsofar = this_mod->syms[i].value; ++ } ++ this_mod = this_mod->next; ++ } ++ ++ } else { /* kallsyms success */ ++ snprintf(buffer,buflen-1,"%s [%s] 0x%x",sym_name,mod_name,(unsigned int)(address-sym_start)); ++ } ++#endif ++ return strlen(buffer); ++} ++ ++static char modlist[4096]; ++/* this function isn't smp safe but that's not really a problem; it's called from ++ * oops context only and any locking could actually prevent the oops from going out; ++ * the line that is generated is informational only and should NEVER prevent the real oops ++ * from going out. ++ */ ++void print_modules(void) ++{ ++ struct module *this_mod; ++ int pos = 0, i; ++ memset(modlist,0,4096); ++ ++#ifdef CONFIG_KALLSYMS ++ this_mod = module_list; ++ while (this_mod != NULL) { ++ if (this_mod->name != NULL) ++ pos +=snprintf(modlist+pos,160-pos-1,"%s ",this_mod->name); ++ this_mod = this_mod->next; ++ } ++ printk("%s\n",modlist); ++#endif ++} +Index: linux-bgl/include/linux/kernel.h +=================================================================== +--- linux-bgl.orig/include/linux/kernel.h 2003-07-02 08:44:16.000000000 -0700 ++++ linux-bgl/include/linux/kernel.h 2004-10-26 23:25:17.968439596 -0700 +@@ -107,6 +107,9 @@ + extern int tainted; + extern const char *print_tainted(void); + ++extern int lookup_symbol(unsigned long address, char *buffer, int buflen); ++extern void print_modules(void); ++ + #if DEBUG + #define pr_debug(fmt,arg...) 
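Note the conventions lookup_symbol() imposes on its callers: a buffer of at least 256 bytes (it returns -ENOMEM otherwise), typically declared static because it runs in oops context where the stack may be scarce or already corrupt. Condensed to the pattern the arch hunks above all follow:

    /* Sketch: the oops-path calling convention used by the traps.c hunks. */
    static char namebuf[512];       /* static: avoid oops-time stack use */

    static void show_one_frame(unsigned long addr, unsigned long *frame)
    {
            lookup_symbol(addr, namebuf, sizeof(namebuf));
            printk("[<%08lx>] %s (0x%p)\n", addr, namebuf, frame);
    }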
\ + printk(KERN_DEBUG fmt,##arg) diff --git a/lustre/kernel_patches/patches/linux-2.4.18-netdump.patch b/lustre/kernel_patches/patches/linux-2.4.18-netdump.patch new file mode 100644 index 0000000..f8db708 --- /dev/null +++ b/lustre/kernel_patches/patches/linux-2.4.18-netdump.patch @@ -0,0 +1,1842 @@ +Index: linux-2.4.24/arch/i386/kernel/i386_ksyms.c +=================================================================== +--- linux-2.4.24.orig/arch/i386/kernel/i386_ksyms.c 2003-11-28 13:26:19.000000000 -0500 ++++ linux-2.4.24/arch/i386/kernel/i386_ksyms.c 2004-05-07 16:58:39.000000000 -0400 +@@ -186,3 +186,8 @@ + EXPORT_SYMBOL(edd); + EXPORT_SYMBOL(eddnr); + #endif ++ ++EXPORT_SYMBOL_GPL(show_mem); ++EXPORT_SYMBOL_GPL(show_state); ++EXPORT_SYMBOL_GPL(show_regs); ++ +Index: linux-2.4.24/arch/i386/kernel/process.c +=================================================================== +--- linux-2.4.24.orig/arch/i386/kernel/process.c 2003-11-28 13:26:19.000000000 -0500 ++++ linux-2.4.24/arch/i386/kernel/process.c 2004-05-07 17:08:18.000000000 -0400 +@@ -400,7 +400,8 @@ + * Stop all CPUs and turn off local APICs and the IO-APIC, so + * other OSs see a clean IRQ state. + */ +- smp_send_stop(); ++ if (!netdump_func) ++ smp_send_stop(); + #elif CONFIG_X86_LOCAL_APIC + if (cpu_has_apic) { + __cli(); +Index: linux-2.4.24/arch/i386/kernel/traps.c +=================================================================== +--- linux-2.4.24.orig/arch/i386/kernel/traps.c 2004-05-07 16:57:00.000000000 -0400 ++++ linux-2.4.24/arch/i386/kernel/traps.c 2004-05-07 17:09:17.000000000 -0400 +@@ -280,6 +280,9 @@ + printk("Kernel BUG\n"); + } + ++void (*netdump_func) (struct pt_regs *regs) = NULL; ++int netdump_mode = 0; ++ + spinlock_t die_lock = SPIN_LOCK_UNLOCKED; + + void die(const char * str, struct pt_regs * regs, long err) +@@ -290,6 +293,8 @@ + handle_BUG(regs); + printk("%s: %04lx\n", str, err & 0xffff); + show_registers(regs); ++ if (netdump_func) ++ netdump_func(regs); + bust_spinlocks(0); + spin_unlock_irq(&die_lock); + do_exit(SIGSEGV); +@@ -1041,3 +1046,9 @@ + + EXPORT_SYMBOL_GPL(is_kernel_text_address); + EXPORT_SYMBOL_GPL(lookup_symbol); ++ ++EXPORT_SYMBOL_GPL(netdump_func); ++EXPORT_SYMBOL_GPL(netdump_mode); ++#if CONFIG_X86_LOCAL_APIC ++EXPORT_SYMBOL_GPL(nmi_watchdog); ++#endif +Index: linux-2.4.24/arch/x86_64/kernel/x8664_ksyms.c +=================================================================== +--- linux-2.4.24.orig/arch/x86_64/kernel/x8664_ksyms.c 2003-11-28 13:26:19.000000000 -0500 ++++ linux-2.4.24/arch/x86_64/kernel/x8664_ksyms.c 2004-05-07 17:01:51.000000000 -0400 +@@ -41,6 +41,9 @@ + EXPORT_SYMBOL(drive_info); + #endif + ++//extern void (*netdump_func) (struct pt_regs *regs) = NULL; ++int netdump_mode = 0; ++ + /* platform dependent support */ + EXPORT_SYMBOL(boot_cpu_data); + EXPORT_SYMBOL(dump_fpu); +@@ -229,3 +232,6 @@ + EXPORT_SYMBOL(touch_nmi_watchdog); + + EXPORT_SYMBOL(do_fork); ++ ++EXPORT_SYMBOL_GPL(netdump_func); ++EXPORT_SYMBOL_GPL(netdump_mode); +Index: linux-2.4.24/drivers/net/3c59x.c +=================================================================== +--- linux-2.4.24.orig/drivers/net/3c59x.c 2003-11-28 13:26:20.000000000 -0500 ++++ linux-2.4.24/drivers/net/3c59x.c 2004-05-07 17:01:00.000000000 -0400 +@@ -874,6 +874,7 @@ + static int vortex_ioctl(struct net_device *dev, struct ifreq *rq, int cmd); + static void vortex_tx_timeout(struct net_device *dev); + static void acpi_set_WOL(struct net_device *dev); ++static void vorboom_poll(struct net_device *dev); + static struct 
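The traps.c hunk above adds the two hooks the rest of this patch revolves around: die() invokes netdump_func, if a module has installed one, after dumping registers, and drivers consult netdump_mode to avoid touching IRQ state while a dump is running. A sketch of how a dump module might attach; my_netdump is illustrative:

    /* Sketch: installing a handler on the exported netdump_func hook. */
    #include <linux/init.h>
    #include <asm/ptrace.h>

    extern void (*netdump_func)(struct pt_regs *regs);
    extern int netdump_mode;

    static void my_netdump(struct pt_regs *regs)
    {
            netdump_mode = 1;   /* drivers now skip disable_irq/enable_irq */
            /* ... push registers and memory out through the NIC ... */
    }

    static int __init my_dump_init(void)
    {
            netdump_func = my_netdump;
            return 0;
    }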
ethtool_ops vortex_ethtool_ops; + + /* This driver uses 'options' to pass the media type, full-duplex flag, etc. */ +@@ -1343,6 +1344,9 @@ + dev->set_multicast_list = set_rx_mode; + dev->tx_timeout = vortex_tx_timeout; + dev->watchdog_timeo = (watchdog * HZ) / 1000; ++#ifdef HAVE_POLL_CONTROLLER ++ dev->poll_controller = &vorboom_poll; ++#endif + if (pdev && vp->enable_wol) { + vp->pm_state_valid = 1; + pci_save_state(vp->pdev, vp->power_state); +@@ -2322,6 +2326,29 @@ + spin_unlock(&vp->lock); + } + ++#ifdef HAVE_POLL_CONTROLLER ++ ++/* ++ * Polling 'interrupt' - used by things like netconsole to send skbs ++ * without having to re-enable interrupts. It's not called while ++ * the interrupt routine is executing. ++ */ ++ ++static void vorboom_poll (struct net_device *dev) ++{ ++ struct vortex_private *vp = (struct vortex_private *)dev->priv; ++ ++ if (!netdump_mode) disable_irq(dev->irq); ++ if (vp->full_bus_master_tx) ++ boomerang_interrupt(dev->irq, dev, 0); ++ else ++ vortex_interrupt(dev->irq, dev, 0); ++ if (!netdump_mode) enable_irq(dev->irq); ++} ++ ++#endif ++ ++ + static int vortex_rx(struct net_device *dev) + { + struct vortex_private *vp = (struct vortex_private *)dev->priv; +Index: linux-2.4.24/drivers/net/Config.in +=================================================================== +--- linux-2.4.24.orig/drivers/net/Config.in 2003-11-28 13:26:20.000000000 -0500 ++++ linux-2.4.24/drivers/net/Config.in 2004-05-07 16:58:39.000000000 -0400 +@@ -295,6 +295,8 @@ + dep_tristate ' SysKonnect FDDI PCI support' CONFIG_SKFP $CONFIG_PCI + fi + ++tristate 'Network logging support' CONFIG_NETCONSOLE ++ + if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then + if [ "$CONFIG_INET" = "y" ]; then + bool 'HIPPI driver support (EXPERIMENTAL)' CONFIG_HIPPI +Index: linux-2.4.24/drivers/net/eepro100.c +=================================================================== +--- linux-2.4.24.orig/drivers/net/eepro100.c 2003-08-25 07:44:42.000000000 -0400 ++++ linux-2.4.24/drivers/net/eepro100.c 2004-05-07 16:58:39.000000000 -0400 +@@ -543,6 +543,7 @@ + static int speedo_rx(struct net_device *dev); + static void speedo_tx_buffer_gc(struct net_device *dev); + static void speedo_interrupt(int irq, void *dev_instance, struct pt_regs *regs); ++static void poll_speedo (struct net_device *dev); + static int speedo_close(struct net_device *dev); + static struct net_device_stats *speedo_get_stats(struct net_device *dev); + static int speedo_ioctl(struct net_device *dev, struct ifreq *rq, int cmd); +@@ -879,6 +880,9 @@ + dev->get_stats = &speedo_get_stats; + dev->set_multicast_list = &set_rx_mode; + dev->do_ioctl = &speedo_ioctl; ++#ifdef HAVE_POLL_CONTROLLER ++ dev->poll_controller = &poll_speedo; ++#endif + + return 0; + } +@@ -1176,10 +1180,8 @@ + + + /* Media monitoring and control. */ +-static void speedo_timer(unsigned long data) ++static void speedo_timeout(struct net_device *dev, struct speedo_private *sp) + { +- struct net_device *dev = (struct net_device *)data; +- struct speedo_private *sp = (struct speedo_private *)dev->priv; + long ioaddr = dev->base_addr; + int phy_num = sp->phy[0] & 0x1f; + +@@ -1217,6 +1219,15 @@ + dev->name, sp->rx_mode, jiffies, sp->last_rx_time); + set_rx_mode(dev); + } ++} ++ ++static void speedo_timer(unsigned long data) ++{ ++ struct net_device *dev = (struct net_device *)data; ++ struct speedo_private *sp = (struct speedo_private *)dev->priv; ++ ++ speedo_timeout(dev, sp); ++ + /* We must continue to monitor the media. */ + sp->timer.expires = RUN_AT(2*HZ); /* 2.0 sec. 
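vorboom_poll() above shows the general shape of a HAVE_POLL_CONTROLLER hook: run the driver's own interrupt handler by hand so netconsole can move packets with interrupts disabled, and skip the disable_irq()/enable_irq() pair in netdump mode, where interrupts are already off. Any NIC driver opts in the same way; my_interrupt stands in for the real ISR here:

    /* Sketch: the generic poll_controller pattern used by the 3c59x and
     * eepro100 hunks in this patch. */
    #ifdef HAVE_POLL_CONTROLLER
    static void my_poll_controller(struct net_device *dev)
    {
            if (!netdump_mode)
                    disable_irq(dev->irq);
            my_interrupt(dev->irq, dev, NULL);      /* driver's real ISR */
            if (!netdump_mode)
                    enable_irq(dev->irq);
    }
    #endif

    /* ...and in the probe routine:
     *      dev->poll_controller = &my_poll_controller;
     */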
*/ + add_timer(&sp->timer); +@@ -1661,6 +1672,29 @@ + return; + } + ++#ifdef HAVE_POLL_CONTROLLER ++ ++/* ++ * Polling 'interrupt' - used by things like netconsole to send skbs ++ * without having to re-enable interrupts. It's not called while ++ * the interrupt routine is executing. ++ */ ++ ++static void poll_speedo (struct net_device *dev) ++{ ++ struct speedo_private *sp = (struct speedo_private *)dev->priv; ++ ++ if (!netdump_mode) disable_irq(dev->irq); ++ if (sp->timer.expires == jiffies) { ++ sp->timer.expires = RUN_AT(2*HZ); ++ speedo_timeout(dev, sp); ++ } ++ speedo_interrupt (dev->irq, dev, NULL); ++ if (!netdump_mode) enable_irq(dev->irq); ++} ++ ++#endif ++ + static inline struct RxFD *speedo_rx_alloc(struct net_device *dev, int entry) + { + struct speedo_private *sp = (struct speedo_private *)dev->priv; +Index: linux-2.4.24/drivers/net/Makefile +=================================================================== +--- linux-2.4.24.orig/drivers/net/Makefile 2003-11-28 13:26:20.000000000 -0500 ++++ linux-2.4.24/drivers/net/Makefile 2004-05-07 16:58:39.000000000 -0400 +@@ -250,6 +250,8 @@ + obj-y += ../acorn/net/acorn-net.o + endif + ++obj-$(CONFIG_NETCONSOLE) += netconsole.o ++ + # + # HIPPI adapters + # +Index: linux-2.4.24/drivers/net/netconsole.c +=================================================================== +--- linux-2.4.24.orig/drivers/net/netconsole.c 1969-12-31 19:00:00.000000000 -0500 ++++ linux-2.4.24/drivers/net/netconsole.c 2004-05-07 16:58:39.000000000 -0400 +@@ -0,0 +1,1246 @@ ++/* ++ * linux/drivers/net/netconsole.c ++ * ++ * Copyright (C) 2001 Ingo Molnar ++ * Copyright (C) 2002 Red Hat, Inc. ++ * ++ * This file contains the implementation of an IRQ-safe, crash-safe ++ * kernel console implementation that outputs kernel messages to the ++ * network. ++ * ++ * Modification history: ++ * ++ * 2001-09-17 started by Ingo Molnar. ++ * 2002-03-14 simultaneous syslog packet option by Michael K. Johnson ++ */ ++ ++/**************************************************************** ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2, or (at your option) ++ * any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
++ * ++ ****************************************************************/ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#if CONFIG_X86_LOCAL_APIC ++#include ++#endif ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static struct net_device *netconsole_dev; ++static u16 source_port, netdump_target_port, netlog_target_port, syslog_target_port; ++static u32 source_ip, netdump_target_ip, netlog_target_ip, syslog_target_ip; ++static unsigned char netdump_daddr[6] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff} ; ++static unsigned char netlog_daddr[6] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff} ; ++static unsigned char syslog_daddr[6] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff} ; ++ ++static unsigned int mhz = 500, idle_timeout; ++static unsigned long long mhz_cycles, jiffy_cycles; ++ ++#include "netconsole.h" ++ ++#define MAX_UDP_CHUNK 1460 ++#define MAX_PRINT_CHUNK (MAX_UDP_CHUNK-HEADER_LEN) ++ ++#define DEBUG 0 ++#if DEBUG ++# define Dprintk(x...) printk(KERN_INFO x) ++#else ++# define Dprintk(x...) ++#endif ++/* ++ * We maintain a small pool of fully-sized skbs, ++ * to make sure the message gets out even in ++ * extreme OOM situations. ++ */ ++#define MAX_NETCONSOLE_SKBS 128 ++ ++static spinlock_t netconsole_lock = SPIN_LOCK_UNLOCKED; ++static int nr_netconsole_skbs; ++static struct sk_buff *netconsole_skbs; ++ ++#define MAX_SKB_SIZE \ ++ (MAX_UDP_CHUNK + sizeof(struct udphdr) + \ ++ sizeof(struct iphdr) + sizeof(struct ethhdr)) ++ ++static int new_arp = 0; ++static unsigned char arp_sha[ETH_ALEN], arp_tha[ETH_ALEN]; ++static u32 arp_sip, arp_tip; ++ ++static void send_netconsole_arp(struct net_device *dev); ++ ++static void __refill_netconsole_skbs(void) ++{ ++ struct sk_buff *skb; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&netconsole_lock, flags); ++ while (nr_netconsole_skbs < MAX_NETCONSOLE_SKBS) { ++ skb = alloc_skb(MAX_SKB_SIZE, GFP_ATOMIC); ++ if (!skb) ++ break; ++ if (netconsole_skbs) ++ skb->next = netconsole_skbs; ++ else ++ skb->next = NULL; ++ netconsole_skbs = skb; ++ nr_netconsole_skbs++; ++ } ++ spin_unlock_irqrestore(&netconsole_lock, flags); ++} ++ ++static struct sk_buff * get_netconsole_skb(void) ++{ ++ struct sk_buff *skb; ++ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&netconsole_lock, flags); ++ skb = netconsole_skbs; ++ if (skb) { ++ netconsole_skbs = skb->next; ++ skb->next = NULL; ++ nr_netconsole_skbs--; ++ } ++ spin_unlock_irqrestore(&netconsole_lock, flags); ++ ++ return skb; ++} ++ ++static unsigned long long t0; ++ ++/* ++ * Do cleanups: ++ * - zap completed output skbs. ++ * - send ARPs if requested ++ * - reboot the box if inactive for more than N seconds. 
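++ * - maintain jiffies from the CPU cycle counter, for when normal
++ *   timer interrupts are not running.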
++ */ ++static void zap_completion_queue(void) ++{ ++ unsigned long long t1; ++ int cpu = smp_processor_id(); ++ ++ if (softnet_data[cpu].completion_queue) { ++ struct sk_buff *clist; ++ ++ local_irq_disable(); ++ clist = softnet_data[cpu].completion_queue; ++ softnet_data[cpu].completion_queue = NULL; ++ local_irq_enable(); ++ ++ while (clist != NULL) { ++ struct sk_buff *skb = clist; ++ clist = clist->next; ++ __kfree_skb(skb); ++ } ++ } ++ ++ if (new_arp) { ++ Dprintk("got ARP req - sending reply.\n"); ++ new_arp = 0; ++ send_netconsole_arp(netconsole_dev); ++ } ++ ++ rdtscll(t1); ++ if (idle_timeout) { ++ if (t0) { ++ if (((t1 - t0) >> 20) > mhz_cycles * (unsigned long long)idle_timeout) { ++ t0 = t1; ++ printk("netdump idle timeout - rebooting in 3 seconds.\n"); ++ mdelay(3000); ++ machine_restart(NULL); ++ } ++ } ++ } ++ /* maintain jiffies in a polling fashion, based on rdtsc. */ ++ { ++ static unsigned long long prev_tick; ++ ++ if (t1 - prev_tick >= jiffy_cycles) { ++ prev_tick += jiffy_cycles; ++ jiffies++; ++ } ++ } ++} ++ ++static struct sk_buff * alloc_netconsole_skb(struct net_device *dev, int len, int reserve) ++{ ++ int once = 1; ++ int count = 0; ++ struct sk_buff *skb = NULL; ++ ++repeat: ++ zap_completion_queue(); ++ if (nr_netconsole_skbs < MAX_NETCONSOLE_SKBS) ++ __refill_netconsole_skbs(); ++ ++ skb = alloc_skb(len, GFP_ATOMIC); ++ if (!skb) { ++ skb = get_netconsole_skb(); ++ if (!skb) { ++ count++; ++ if (once && (count == 1000000)) { ++ printk("possibly FATAL: out of netconsole skbs!!! will keep retrying.\n"); ++ once = 0; ++ } ++ Dprintk("alloc skb: polling controller ...\n"); ++ dev->poll_controller(dev); ++ goto repeat; ++ } ++ } ++ ++ atomic_set(&skb->users, 1); ++ skb_reserve(skb, reserve); ++ return skb; ++} ++ ++static void transmit_raw_skb(struct sk_buff *skb, struct net_device *dev) ++{ ++ ++repeat_poll: ++ spin_lock(&dev->xmit_lock); ++ dev->xmit_lock_owner = smp_processor_id(); ++ ++ if (netif_queue_stopped(dev)) { ++ dev->xmit_lock_owner = -1; ++ spin_unlock(&dev->xmit_lock); ++ ++ Dprintk("xmit skb: polling controller ...\n"); ++ dev->poll_controller(dev); ++ zap_completion_queue(); ++ goto repeat_poll; ++ } ++ ++ dev->hard_start_xmit(skb, dev); ++ ++ dev->xmit_lock_owner = -1; ++ spin_unlock(&dev->xmit_lock); ++} ++ ++static void transmit_netconsole_skb(struct sk_buff *skb, struct net_device *dev, ++ int ip_len, int udp_len, ++ u16 source_port, u16 target_port, u32 source_ip, u32 target_ip, ++ unsigned char * macdaddr) ++{ ++ struct udphdr *udph; ++ struct iphdr *iph; ++ struct ethhdr *eth; ++ ++ udph = (struct udphdr *) skb_push(skb, sizeof(*udph)); ++ udph->source = source_port; ++ udph->dest = target_port; ++ udph->len = htons(udp_len); ++ udph->check = 0; ++ ++ iph = (struct iphdr *)skb_push(skb, sizeof(*iph)); ++ ++ iph->version = 4; ++ iph->ihl = 5; ++ iph->tos = 0; ++ iph->tot_len = htons(ip_len); ++ iph->id = 0; ++ iph->frag_off = 0; ++ iph->ttl = 64; ++ iph->protocol = IPPROTO_UDP; ++ iph->check = 0; ++ iph->saddr = source_ip; ++ iph->daddr = target_ip; ++ iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); ++ ++ eth = (struct ethhdr *) skb_push(skb, ETH_HLEN); ++ ++ eth->h_proto = htons(ETH_P_IP); ++ memcpy(eth->h_source, dev->dev_addr, dev->addr_len); ++ memcpy(eth->h_dest, macdaddr, dev->addr_len); ++ ++ transmit_raw_skb(skb, dev); ++} ++ ++static void send_netconsole_arp(struct net_device *dev) ++{ ++ int total_len, arp_len, arp_data_len; ++ struct sk_buff *skb; ++ unsigned char *arp; ++ struct arphdr *arph; ++ struct ethhdr *eth; 
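++	/*
++	 * Build a standard Ethernet/IPv4 ARP reply (RFC 826): the payload
++	 * after the arphdr is 2*ETH_ALEN + 2*4 = 20 bytes, in the order
++	 * sender MAC, sender IP, target MAC, target IP. The requester's
++	 * addresses saved in arp_sha/arp_sip by the rx hook become the
++	 * target fields of this reply.
++	 */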
++ ++ arp_data_len = 2*4 + 2*ETH_ALEN; ++ arp_len = arp_data_len + sizeof(struct arphdr); ++ total_len = arp_len + ETH_HLEN; ++ ++ skb = alloc_netconsole_skb(dev, total_len, total_len - arp_data_len); ++ ++ arp = skb->data; ++ ++ memcpy(arp, dev->dev_addr, ETH_ALEN); ++ arp += ETH_ALEN; ++ ++ memcpy(arp, &source_ip, 4); ++ arp += 4; ++ ++ memcpy(arp, arp_sha, ETH_ALEN); ++ arp += ETH_ALEN; ++ ++ memcpy(arp, &arp_sip, 4); ++ arp += 4; ++ ++ skb->len += 2*4 + 2*ETH_ALEN; ++ ++ arph = (struct arphdr *)skb_push(skb, sizeof(*arph)); ++ ++ arph->ar_hrd = htons(dev->type); ++ arph->ar_pro = __constant_htons(ETH_P_IP); ++ arph->ar_hln = ETH_ALEN; ++ arph->ar_pln = 4; ++ arph->ar_op = __constant_htons(ARPOP_REPLY); ++ ++ eth = (struct ethhdr *) skb_push(skb, ETH_HLEN); ++ ++ eth->h_proto = htons(ETH_P_ARP); ++ memcpy(eth->h_source, dev->dev_addr, dev->addr_len); ++ memcpy(eth->h_dest, arp_sha, dev->addr_len); ++ ++ transmit_raw_skb(skb, dev); ++} ++ ++static void send_netdump_skb(struct net_device *dev, const char *msg, unsigned int msg_len, reply_t *reply) ++{ ++ int total_len, ip_len, udp_len; ++ struct sk_buff *skb; ++ ++ udp_len = msg_len + HEADER_LEN + sizeof(struct udphdr); ++ ip_len = udp_len + sizeof(struct iphdr); ++ total_len = ip_len + ETH_HLEN; ++ ++ skb = alloc_netconsole_skb(dev, total_len, total_len - msg_len - HEADER_LEN); ++ ++ skb->data[0] = NETCONSOLE_VERSION; ++ put_unaligned(htonl(reply->nr), (u32 *) (skb->data + 1)); ++ put_unaligned(htonl(reply->code), (u32 *) (skb->data + 5)); ++ put_unaligned(htonl(reply->info), (u32 *) (skb->data + 9)); ++ ++ memcpy(skb->data + HEADER_LEN, msg, msg_len); ++ skb->len += msg_len + HEADER_LEN; ++ ++ transmit_netconsole_skb(skb, dev, ip_len, udp_len, ++ source_port, netdump_target_port, source_ip, netdump_target_ip, netdump_daddr); ++} ++ ++#define SYSLOG_HEADER_LEN 4 ++ ++static void send_netlog_skb(struct net_device *dev, const char *msg, unsigned int msg_len, reply_t *reply) ++{ ++ int total_len, ip_len, udp_len; ++ struct sk_buff *skb; ++ ++ udp_len = msg_len + HEADER_LEN + sizeof(struct udphdr); ++ ip_len = udp_len + sizeof(struct iphdr); ++ total_len = ip_len + ETH_HLEN; ++ ++ skb = alloc_netconsole_skb(dev, total_len, total_len - msg_len - HEADER_LEN); ++ ++ skb->data[0] = NETCONSOLE_VERSION; ++ put_unaligned(htonl(reply->nr), (u32 *) (skb->data + 1)); ++ put_unaligned(htonl(reply->code), (u32 *) (skb->data + 5)); ++ put_unaligned(htonl(reply->info), (u32 *) (skb->data + 9)); ++ ++ memcpy(skb->data + HEADER_LEN, msg, msg_len); ++ skb->len += msg_len + HEADER_LEN; ++ ++ transmit_netconsole_skb(skb, dev, ip_len, udp_len, ++ source_port, netlog_target_port, source_ip, netlog_target_ip, netlog_daddr); ++} ++ ++#define SYSLOG_HEADER_LEN 4 ++ ++static void send_syslog_skb(struct net_device *dev, const char *msg, unsigned int msg_len, int pri) ++{ ++ int total_len, ip_len, udp_len; ++ struct sk_buff *skb; ++ ++ udp_len = msg_len + SYSLOG_HEADER_LEN + sizeof(struct udphdr); ++ ip_len = udp_len + sizeof(struct iphdr); ++ total_len = ip_len + ETH_HLEN; ++ ++ skb = alloc_netconsole_skb(dev, total_len, total_len - msg_len - SYSLOG_HEADER_LEN); ++ ++ skb->data[0] = '<'; ++ skb->data[1] = pri + '0'; ++ skb->data[2]= '>'; ++ skb->data[3]= ' '; ++ ++ memcpy(skb->data + SYSLOG_HEADER_LEN, msg, msg_len); ++ skb->len += msg_len + SYSLOG_HEADER_LEN; ++ ++ transmit_netconsole_skb(skb, dev, ip_len, udp_len, source_port, ++ syslog_target_port, source_ip, syslog_target_ip, syslog_daddr); ++} ++ ++#define MAX_SYSLOG_CHARS 1000 ++ ++static spinlock_t syslog_lock 
= SPIN_LOCK_UNLOCKED;
++static int syslog_chars;
++static unsigned char syslog_line [MAX_SYSLOG_CHARS + 10];
++
++/*
++ * We feed kernel messages char by char, and send the UDP packet
++ * once per linefeed. We buffer all characters received.
++ */
++static inline void feed_syslog_char(struct net_device *dev, const unsigned char c)
++{
++	if (syslog_chars == MAX_SYSLOG_CHARS)
++		syslog_chars--;
++	syslog_line[syslog_chars] = c;
++	syslog_chars++;
++	if (c == '\n') {
++		send_syslog_skb(dev, syslog_line, syslog_chars, 5);
++		syslog_chars = 0;
++	}
++}
++
++static spinlock_t sequence_lock = SPIN_LOCK_UNLOCKED;
++static unsigned int log_offset;
++
++static void write_netconsole_msg(struct console *con, const char *msg0, unsigned int msg_len)
++{
++	int len, left, i;
++	struct net_device *dev;
++	const char *msg = msg0;
++	reply_t reply;
++
++	dev = netconsole_dev;
++	if (!dev || netdump_mode)
++		return;
++
++	if (dev->poll_controller && netif_running(dev)) {
++		unsigned long flags;
++
++		__save_flags(flags);
++		__cli();
++		left = msg_len;
++		if (netlog_target_ip) {
++			while (left) {
++				if (left > MAX_PRINT_CHUNK)
++					len = MAX_PRINT_CHUNK;
++				else
++					len = left;
++				reply.code = REPLY_LOG;
++				reply.nr = 0;
++				spin_lock(&sequence_lock);
++				reply.info = log_offset;
++				log_offset += len;
++				spin_unlock(&sequence_lock);
++				send_netlog_skb(dev, msg, len, &reply);
++				msg += len;
++				left -= len;
++			}
++		}
++		if (syslog_target_ip) {
++			spin_lock(&syslog_lock);
++			for (i = 0; i < msg_len; i++)
++				feed_syslog_char(dev, msg0[i]);
++			spin_unlock(&syslog_lock);
++		}
++
++		__restore_flags(flags);
++	}
++}
++
++static unsigned short udp_check(struct udphdr *uh, int len, unsigned long saddr, unsigned long daddr, unsigned long base)
++{
++	return(csum_tcpudp_magic(saddr, daddr, len, IPPROTO_UDP, base));
++}
++
++static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh,
++			     unsigned short ulen, u32 saddr, u32 daddr)
++{
++	if (uh->check == 0) {
++		skb->ip_summed = CHECKSUM_UNNECESSARY;
++	} else if (skb->ip_summed == CHECKSUM_HW) {
++		skb->ip_summed = CHECKSUM_UNNECESSARY;
++		if (!udp_check(uh, ulen, saddr, daddr, skb->csum))
++			return 0;
++		skb->ip_summed = CHECKSUM_NONE;
++	}
++	if (skb->ip_summed != CHECKSUM_UNNECESSARY)
++		skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP,
++0);
++	/* Probably, we should checksum udp header (it should be in cache
++	 * in any case) and data in tiny packets (< rx copybreak).
++	 */
++	return 0;
++}
++
++static __inline__ int __udp_checksum_complete(struct sk_buff *skb)
++{
++	return (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum));
++}
++
++static __inline__ int udp_checksum_complete(struct sk_buff *skb)
++{
++	return skb->ip_summed != CHECKSUM_UNNECESSARY &&
++		__udp_checksum_complete(skb);
++}
++
++/*
++ * NOTE: security depends on the trusted path between the netconsole
++ * server and netconsole client, since none of the packets are
++ * encrypted. The random magic number protects the protocol
++ * against spoofing.
++ */ ++static u64 netconsole_magic; ++static u32 magic1, magic2; ++ ++static spinlock_t req_lock = SPIN_LOCK_UNLOCKED; ++static int nr_req = 0; ++static LIST_HEAD(request_list); ++ ++static void add_new_req(req_t *req) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&req_lock, flags); ++ list_add_tail(&req->list, &request_list); ++ nr_req++; ++ Dprintk("pending requests: %d.\n", nr_req); ++ spin_unlock_irqrestore(&req_lock, flags); ++ ++ rdtscll(t0); ++} ++ ++static req_t *get_new_req(void) ++{ ++ req_t *req = NULL; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&req_lock, flags); ++ if (nr_req) { ++ req = list_entry(request_list.next, req_t, list); ++ list_del(&req->list); ++ nr_req--; ++ } ++ spin_unlock_irqrestore(&req_lock, flags); ++ ++ return req; ++} ++ ++static req_t *alloc_req(void) ++{ ++ req_t *req; ++ ++ req = (req_t *) kmalloc(sizeof(*req), GFP_ATOMIC); ++ return req; ++} ++ ++static int netconsole_rx_hook(struct sk_buff *skb) ++{ ++ int proto; ++ struct iphdr *iph; ++ struct udphdr *uh; ++ __u32 len, saddr, daddr, ulen; ++ req_t *__req; ++ req_t *req; ++ struct net_device *dev; ++ ++ if (!netdump_mode) ++ return NET_RX_SUCCESS; ++#if DEBUG ++ { ++ static int packet_count; ++ Dprintk(" %d\r", ++packet_count); ++ } ++#endif ++ dev = skb->dev; ++ if (dev->type != ARPHRD_ETHER) ++ goto out; ++ proto = ntohs(skb->mac.ethernet->h_proto); ++ Dprintk("rx got skb %p (len: %d, users: %d), dev %s, h_proto: %04x.\n", skb, skb->len, atomic_read(&skb->users), dev->name, proto); ++ #define D(x) skb->mac.ethernet->h_dest[x] ++ Dprintk("... h_dest: %02X:%02X:%02X:%02X:%02X:%02X.\n", D(0), D(1), D(2), D(3), D(4), D(5)); ++ #undef D ++ #define D(x) skb->mac.ethernet->h_source[x] ++ Dprintk("... h_source: %02X:%02X:%02X:%02X:%02X:%02X.\n", D(0), D(1), D(2), D(3), D(4), D(5)); ++ #undef D ++ if (skb->pkt_type == PACKET_OTHERHOST) ++ goto out; ++ if (skb_shared(skb)) ++ goto out; ++ if (proto == ETH_P_ARP) { ++ struct arphdr *arp; ++ unsigned char *arp_ptr; ++ ++ Dprintk("got arp skb.\n"); ++ arp = (struct arphdr *)skb->data; ++ if (!pskb_may_pull(skb, sizeof(struct arphdr) + 2*4 + 2*ETH_ALEN)) ++ goto out; ++ if (htons(dev->type) != arp->ar_hrd) ++ goto out; ++ if (arp->ar_pro != __constant_htons(ETH_P_IP)) ++ goto out; ++ if (arp->ar_hln != ETH_ALEN) ++ goto out; ++ if (arp->ar_pln != 4) ++ goto out; ++ if (arp->ar_op != __constant_htons(ARPOP_REQUEST)) ++ goto out; ++ /* ++ * ARP header looks ok so far, extract fields: ++ */ ++ arp_ptr = (unsigned char *)(arp + 1); ++ ++ memcpy(arp_sha, arp_ptr, ETH_ALEN); ++ arp_ptr += ETH_ALEN; ++ ++ memcpy(&arp_sip, arp_ptr, 4); ++ arp_ptr += 4; ++ ++ memcpy(arp_tha, arp_ptr, ETH_ALEN); ++ arp_ptr += ETH_ALEN; ++ ++ memcpy(&arp_tip, arp_ptr, 4); ++ ++ #define D(x) arp_sha[x] ++ Dprintk("... arp_sha: %02X:%02X:%02X:%02X:%02X:%02X.\n", D(0), D(1), D(2), D(3), D(4), D(5)); ++ #undef D ++ #define D(x) ((unsigned char *)&arp_sip)[x] ++ Dprintk("... arp_sip: %d.%d.%d.%d.\n", D(0), D(1), D(2), D(3)); ++ #undef D ++ #define D(x) arp_tha[x] ++ Dprintk("... arp_tha: %02X:%02X:%02X:%02X:%02X:%02X.\n", D(0), D(1), D(2), D(3), D(4), D(5)); ++ #undef D ++ #define D(x) ((unsigned char *)&arp_tip)[x] ++ Dprintk("... arp_tip: %d.%d.%d.%d.\n", D(0), D(1), D(2), D(3)); ++ #undef D ++ #define D(x) ((unsigned char *)&source_ip)[x] ++ Dprintk("... 
(source_ip): %d.%d.%d.%d.\n", D(0), D(1), D(2), D(3)); ++ #undef D ++ ++ if (LOOPBACK(arp_tip) || MULTICAST(arp_tip)) ++ goto out; ++ ++ if (arp_tip != source_ip) ++ goto out; ++ new_arp = 1; ++ goto out; ++ } ++ if (proto != ETH_P_IP) ++ goto out; ++ /* ++ * IP header correctness testing: ++ */ ++ iph = (struct iphdr *)skb->data; ++ if (!pskb_may_pull(skb, sizeof(struct iphdr))) ++ goto out; ++ Dprintk("... IP ihl*4: %d, version: %d.\n", iph->ihl*4, iph->version); ++ if (iph->ihl < 5 || iph->version != 4) ++ goto out; ++ if (!pskb_may_pull(skb, iph->ihl*4)) ++ goto out; ++ if (ip_fast_csum((u8 *)iph, iph->ihl) != 0) ++ goto out; ++ len = ntohs(iph->tot_len); ++ Dprintk("... IP len: %d.\n", len); ++ if (skb->len < len || len < iph->ihl*4) ++ goto out; ++ saddr = iph->saddr; ++ daddr = iph->daddr; ++ Dprintk("... IP src: %08x, dst: %08x.\n", saddr, daddr); ++ Dprintk("... IP protocol: %d.\n", iph->protocol); ++ if (iph->protocol != IPPROTO_UDP) ++ goto out; ++ Dprintk("... netdump src: %08x, dst: %08x.\n", source_ip, netlog_target_ip); ++ if (source_ip != daddr) ++ goto out; ++ if (netlog_target_ip != saddr) ++ goto out; ++ len -= iph->ihl*4; ++ uh = (struct udphdr *)(((char *)iph) + iph->ihl*4); ++ ulen = ntohs(uh->len); ++ Dprintk("... UDP len: %d (left %d).\n", ulen, len); ++ ++#define MIN_COMM_SIZE (sizeof(*uh) + NETDUMP_REQ_SIZE) ++ if (ulen != len || ulen < MIN_COMM_SIZE) { ++ Dprintk("... UDP, hm, len not ok.\n"); ++ goto out; ++ } ++ if (udp_checksum_init(skb, uh, ulen, saddr, daddr) < 0) { ++ Dprintk("... UDP, hm, checksum init not ok.\n"); ++ goto out; ++ } ++ if (udp_checksum_complete(skb)) { ++ Dprintk("... UDP, hm, checksum complete not ok.\n"); ++ goto out; ++ } ++ Dprintk("... UDP packet OK!\n"); ++ Dprintk("... UDP src port: %d, dst port: %d.\n", uh->source, uh->dest); ++ if (source_port != uh->source) ++ goto out; ++ if (netlog_target_port != uh->dest) ++ goto out; ++ __req = (req_t *)(uh + 1); ++ Dprintk("... UDP netdump packet OK!\n"); ++ ++ req = alloc_req(); ++ if (!req) { ++ printk("no more RAM to allocate request - dropping it.\n"); ++ goto out; ++ } ++ ++ req->magic = ntohl(__req->magic); ++ req->command = ntohl(__req->command); ++ req->from = ntohl(__req->from); ++ req->to = ntohl(__req->to); ++ req->nr = ntohl(__req->nr); ++ ++ Dprintk("... netdump magic: %08Lx.\n", req->magic); ++ Dprintk("... netdump command: %08x.\n", req->command); ++ Dprintk("... netdump from: %08x.\n", req->from); ++ Dprintk("... 
netdump to: %08x.\n", req->to); ++ ++ add_new_req(req); ++out: ++ return NET_RX_DROP; ++} ++ ++#define INVALID_PAGE "page is not valid!\n" ++ ++static void send_netdump_mem (struct net_device *dev, req_t *req) ++{ ++ int i; ++ char *kaddr; ++ char str[1024]; ++ struct page *page; ++ unsigned long nr = req->from; ++ int nr_chunks = PAGE_SIZE/1024; ++ reply_t reply; ++ ++ reply.nr = req->nr; ++ reply.info = 0; ++ if (req->from >= max_mapnr) { ++ sprintf(str, "page %08lx is bigger than max page # %08lx!\n", nr, max_mapnr); ++ reply.code = REPLY_ERROR; ++ send_netdump_skb(dev, str, strlen(str), &reply); ++ return; ++ } ++ page = mem_map + nr; ++ if (PageReserved(page)) ++ page = ZERO_PAGE(0); ++ ++ kaddr = (char *)kmap_atomic(page, KM_NETDUMP); ++ ++ for (i = 0; i < nr_chunks; i++) { ++ unsigned int offset = i*1024; ++ reply.code = REPLY_MEM; ++ reply.info = offset; ++ send_netdump_skb(dev, kaddr + offset, 1024, &reply); ++ } ++ ++ kunmap_atomic(kaddr, KM_NETDUMP); ++} ++ ++/* ++ * This function waits for the client to acknowledge the receipt ++ * of the netdump startup reply, with the possibility of packets ++ * getting lost. We resend the startup packet if no ACK is received, ++ * after a 1 second delay. ++ * ++ * (The client can test the success of the handshake via the HELLO ++ * command, and send ACKs until we enter netdump mode.) ++ */ ++static void netdump_startup_handshake(struct net_device *dev) ++{ ++ char tmp[200]; ++ reply_t reply; ++ req_t *req = NULL; ++ int i; ++ ++ netdump_mode = 1; ++ ++repeat: ++ sprintf(tmp, "NETDUMP start, waiting for start-ACK.\n"); ++ reply.code = REPLY_START_NETDUMP; ++ reply.nr = 0; ++ reply.info = 0; ++ send_netdump_skb(dev, tmp, strlen(tmp), &reply); ++ ++ for (i = 0; i < 10000; i++) { ++ // wait 1 sec. ++ udelay(100); ++ Dprintk("handshake: polling controller ...\n"); ++ dev->poll_controller(dev); ++ zap_completion_queue(); ++ req = get_new_req(); ++ if (req) ++ break; ++ } ++ if (!req) ++ goto repeat; ++ if (req->command != COMM_START_NETDUMP_ACK) { ++ kfree(req); ++ goto repeat; ++ } ++ kfree(req); ++ ++ printk("NETDUMP START!\n"); ++} ++ ++#if 0 ++ ++static inline void print_status (req_t *req) ++{ ++ static int count = 0; ++ ++ switch (++count & 3) { ++ case 0: printk("/\r"); break; ++ case 1: printk("|\r"); break; ++ case 2: printk("\\\r"); break; ++ case 3: printk("-\r"); break; ++ } ++} ++ ++#else ++ ++static inline void print_status (req_t *req) ++{ ++ static int count = 0; ++ static int prev_jiffies = 0; ++ ++ if (jiffies/HZ != prev_jiffies/HZ) { ++ prev_jiffies = jiffies; ++ count++; ++ switch (count & 3) { ++ case 0: printk("%d(%ld)/\r", nr_req, jiffies); break; ++ case 1: printk("%d(%ld)|\r", nr_req, jiffies); break; ++ case 2: printk("%d(%ld)\\\r", nr_req, jiffies); break; ++ case 3: printk("%d(%ld)-\r", nr_req, jiffies); break; ++ } ++ } ++} ++ ++#endif ++ ++#define CLI 1 ++ ++#if CONFIG_SMP ++static void freeze_cpu (void * dummy) ++{ ++ printk("CPU#%d is frozen.\n", smp_processor_id()); ++#if CLI ++ for (;;) __cli(); ++#else ++ for (;;) __sti(); ++#endif ++} ++#endif ++ ++static void netconsole_netdump (struct pt_regs *regs) ++{ ++ reply_t reply; ++ char tmp[200]; ++ unsigned long flags; ++ struct net_device *dev = netconsole_dev; ++ unsigned long esp; ++ unsigned short ss; ++ struct pt_regs myregs; ++ req_t *req; ++ ++ __save_flags(flags); ++ __cli(); ++#if CONFIG_X86_LOCAL_APIC ++ nmi_watchdog = 0; ++#endif ++#if CONFIG_SMP ++ smp_call_function(freeze_cpu, NULL, 1, 0); ++#endif ++ mdelay(1000); ++ /* ++ * Just in case we are crashing 
within the networking code ++ * ... attempt to fix up. ++ */ ++ spin_lock_init(&dev->xmit_lock); ++ ++ esp = (unsigned long) ((char *)regs + sizeof (struct pt_regs)); ++ ss = __KERNEL_DS; ++ if (regs->xcs & 3) { ++ esp = regs->esp; ++ ss = regs->xss & 0xffff; ++ } ++ myregs = *regs; ++ myregs.esp = esp; ++ myregs.xss = (myregs.xss & 0xffff0000) | ss; ++ ++ rdtscll(t0); ++ ++ printk("< netdump activated - performing handshake with the client. >\n"); ++ netdump_startup_handshake(dev); ++ ++ printk("< handshake completed - listening for dump requests. >\n"); ++ ++ while (netdump_mode) { ++ __cli(); ++ Dprintk("main netdump loop: polling controller ...\n"); ++ dev->poll_controller(dev); ++ zap_completion_queue(); ++#if !CLI ++ __sti(); ++#endif ++ req = get_new_req(); ++ if (!req) ++ continue; ++ Dprintk("got new req, command %d.\n", req->command); ++ print_status(req); ++ switch (req->command) { ++ case COMM_NONE: ++ Dprintk("got NO command.\n"); ++ break; ++ ++ case COMM_SEND_MEM: ++ Dprintk("got MEM command.\n"); ++ // send ->from ->to. ++ send_netdump_mem(dev, req); ++ break; ++ ++ case COMM_EXIT: ++ Dprintk("got EXIT command.\n"); ++ netdump_mode = 0; ++ break; ++ ++ case COMM_REBOOT: ++ Dprintk("got REBOOT command.\n"); ++ printk("netdump: rebooting in 3 seconds.\n"); ++ mdelay(3000); ++ machine_restart(NULL); ++ break; ++ ++ case COMM_HELLO: ++ sprintf(tmp, "Hello, this is netdump version 0.%02d\n", NETCONSOLE_VERSION); ++ reply.code = REPLY_HELLO; ++ reply.nr = req->nr; ++ reply.info = NETCONSOLE_VERSION; ++ send_netdump_skb(dev, tmp, strlen(tmp), &reply); ++ break; ++ ++ case COMM_GET_PAGE_SIZE: ++ sprintf(tmp, "PAGE_SIZE: %ld\n", PAGE_SIZE); ++ reply.code = REPLY_PAGE_SIZE; ++ reply.nr = req->nr; ++ reply.info = PAGE_SIZE; ++ send_netdump_skb(dev, tmp, strlen(tmp), &reply); ++ break; ++ ++ case COMM_GET_REGS: ++ { ++ char *tmp2 = tmp; ++ elf_gregset_t elf_regs; ++ ++ reply.code = REPLY_REGS; ++ reply.nr = req->nr; ++ reply.info = max_mapnr; ++ tmp2 = tmp + sprintf(tmp, "Sending register info.\n"); ++ ELF_CORE_COPY_REGS(elf_regs, regs); ++ memcpy(tmp2, &elf_regs, sizeof(elf_regs)); ++ send_netdump_skb(dev, tmp, strlen(tmp) + sizeof(elf_regs), &reply); ++ break; ++ } ++ ++ case COMM_GET_NR_PAGES: ++ reply.code = REPLY_NR_PAGES; ++ reply.nr = req->nr; ++ reply.info = max_mapnr; ++ sprintf(tmp, "Number of pages: %ld\n", max_mapnr); ++ send_netdump_skb(dev, tmp, strlen(tmp), &reply); ++ break; ++ ++ case COMM_SHOW_STATE: ++ netdump_mode = 0; ++ if (regs) ++ show_regs(regs); ++ show_state(); ++ show_mem(); ++ netdump_mode = 1; ++ reply.code = REPLY_SHOW_STATE; ++ reply.nr = req->nr; ++ reply.info = 0; ++ send_netdump_skb(dev, tmp, strlen(tmp), &reply); ++ break; ++ ++ default: ++ reply.code = REPLY_ERROR; ++ reply.nr = req->nr; ++ reply.info = req->command; ++ Dprintk("got UNKNOWN command!\n"); ++ sprintf(tmp, "Got unknown command code %d!\n", req->command); ++ send_netdump_skb(dev, tmp, strlen(tmp), &reply); ++ break; ++ } ++ kfree(req); ++ req = NULL; ++ } ++ sprintf(tmp, "NETDUMP end.\n"); ++ reply.code = REPLY_END_NETDUMP; ++ reply.nr = 0; ++ reply.info = 0; ++ send_netdump_skb(dev, tmp, strlen(tmp), &reply); ++ printk("NETDUMP END!\n"); ++ __restore_flags(flags); ++} ++ ++static char *dev; ++static int netdump_target_eth_byte0 = 255; ++static int netdump_target_eth_byte1 = 255; ++static int netdump_target_eth_byte2 = 255; ++static int netdump_target_eth_byte3 = 255; ++static int netdump_target_eth_byte4 = 255; ++static int netdump_target_eth_byte5 = 255; ++ ++static int 
netlog_target_eth_byte0 = 255; ++static int netlog_target_eth_byte1 = 255; ++static int netlog_target_eth_byte2 = 255; ++static int netlog_target_eth_byte3 = 255; ++static int netlog_target_eth_byte4 = 255; ++static int netlog_target_eth_byte5 = 255; ++ ++static int syslog_target_eth_byte0 = 255; ++static int syslog_target_eth_byte1 = 255; ++static int syslog_target_eth_byte2 = 255; ++static int syslog_target_eth_byte3 = 255; ++static int syslog_target_eth_byte4 = 255; ++static int syslog_target_eth_byte5 = 255; ++ ++MODULE_PARM(netdump_target_ip, "i"); ++MODULE_PARM_DESC(netdump_target_ip, ++ "remote netdump IP address as a native (not network) endian integer"); ++MODULE_PARM(netlog_target_ip, "i"); ++MODULE_PARM_DESC(netlog_target_ip, ++ "remote netlog IP address as a native (not network) endian integer"); ++MODULE_PARM(syslog_target_ip, "i"); ++MODULE_PARM_DESC(syslog_target_ip, ++ "remote syslog IP address as a native (not network) endian integer"); ++ ++MODULE_PARM(source_port, "h"); ++MODULE_PARM_DESC(source_port, ++ "local port from which to send netdump packets"); ++ ++MODULE_PARM(netdump_target_port, "h"); ++MODULE_PARM_DESC(netdump_target_port, ++ "remote port to which to send netdump packets"); ++MODULE_PARM(netlog_target_port, "h"); ++MODULE_PARM_DESC(netlog_target_port, ++ "remote port to which to send netlog packets"); ++MODULE_PARM(syslog_target_port, "h"); ++MODULE_PARM_DESC(syslog_target_port, ++ "remote port to which to send syslog packets"); ++ ++#define ETH_BYTE(name,nr) \ ++ MODULE_PARM(name##_target_eth_byte##nr, "i"); \ ++ MODULE_PARM_DESC(name##_target_eth_byte##nr, \ ++ "byte "#nr" of the netdump server MAC address") ++ ++#define ETH_BYTES(name) \ ++ ETH_BYTE(name, 0); ETH_BYTE(name, 1); ETH_BYTE(name, 2); \ ++ ETH_BYTE(name, 3); ETH_BYTE(name, 4); ETH_BYTE(name, 5); ++ ++ETH_BYTES(netdump); ++ETH_BYTES(netlog); ++ETH_BYTES(syslog); ++ ++MODULE_PARM(magic1, "i"); ++MODULE_PARM_DESC(magic1, ++ "lower 32 bits of magic cookie shared between client and server"); ++MODULE_PARM(magic2, "i"); ++MODULE_PARM_DESC(magic2, ++ "upper 32 bits of magic cookie shared between client and server"); ++MODULE_PARM(dev, "s"); ++MODULE_PARM_DESC(dev, ++ "name of the device from which to send netdump and syslog packets"); ++MODULE_PARM(mhz, "i"); ++MODULE_PARM_DESC(mhz, ++ "one second wall clock time takes this many million CPU cycles"); ++MODULE_PARM(idle_timeout, "i"); ++MODULE_PARM_DESC(idle_timeout, ++ "reboot system after this many idle seconds"); ++ ++static struct console netconsole = ++ { flags: CON_ENABLED, write: write_netconsole_msg }; ++ ++static int init_netconsole(void) ++{ ++ struct net_device *ndev = NULL; ++ struct in_device *in_dev; ++ ++ printk(KERN_INFO "netlog: using network device <%s>\n", dev); ++ // this will be valid once the device goes up. 
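++	/*
++	 * Usage sketch (made-up values): the *_target_ip parameters are
++	 * plain native-endian integers, so a server at 192.168.0.5 is
++	 * passed as (192<<24)|(168<<16)|(0<<8)|5 = 3232235525, e.g.:
++	 *
++	 *   insmod netconsole dev=eth0 netdump_target_ip=3232235525 \
++	 *          source_port=6666 netdump_target_port=6666 \
++	 *          magic1=... magic2=...
++	 */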
++ if (dev) ++ ndev = dev_get_by_name(dev); ++ if (!ndev) { ++ printk(KERN_ERR "netlog: network device %s does not exist, aborting.\n", dev); ++ return -1; ++ } ++ if (!ndev->poll_controller) { ++ printk(KERN_ERR "netlog: %s's network driver does not implement netlogging yet, aborting.\n", dev); ++ return -1; ++ } ++ in_dev = in_dev_get(ndev); ++ if (!in_dev) { ++ printk(KERN_ERR "netlog: network device %s is not an IP protocol device, aborting.\n", dev); ++ return -1; ++ } ++ ++ if (!magic1 || !magic2) { ++ printk(KERN_ERR "netlog: magic cookie (magic1,magic2) not specified.\n"); ++ return -1; ++ } ++ netconsole_magic = magic1 + (((u64)magic2)<<32); ++ ++ source_ip = ntohl(in_dev->ifa_list->ifa_local); ++ if (!source_ip) { ++ printk(KERN_ERR "netlog: network device %s has no local address, aborting.\n", dev); ++ return -1; ++ } ++#define IP(x) ((unsigned char *)&source_ip)[x] ++ printk(KERN_INFO "netlog: using source IP %u.%u.%u.%u\n", ++ IP(3), IP(2), IP(1), IP(0)); ++#undef IP ++ source_ip = htonl(source_ip); ++ if (!source_port) { ++ printk(KERN_ERR "netlog: source_port parameter not specified, aborting.\n"); ++ return -1; ++ } ++ printk(KERN_INFO "netlog: using source UDP port: %u\n", source_port); ++ source_port = htons(source_port); ++ ++ if (!netdump_target_ip && !netlog_target_ip && !syslog_target_ip) { ++ printk(KERN_ERR "netlog: target_ip parameter not specified, aborting.\n"); ++ return -1; ++ } ++ if (netdump_target_ip) { ++#define IP(x) ((unsigned char *)&netdump_target_ip)[x] ++ printk(KERN_INFO "netlog: using netdump target IP %u.%u.%u.%u\n", ++ IP(3), IP(2), IP(1), IP(0)); ++#undef IP ++ netdump_target_ip = htonl(netdump_target_ip); ++ } ++ if (netlog_target_ip) { ++#define IP(x) ((unsigned char *)&netlog_target_ip)[x] ++ printk(KERN_INFO "netlog: using netlog target IP %u.%u.%u.%u\n", ++ IP(3), IP(2), IP(1), IP(0)); ++#undef IP ++ netlog_target_ip = htonl(netlog_target_ip); ++ } ++ if (syslog_target_ip) { ++ if (!syslog_target_port) ++ syslog_target_port = 514; ++#define IP(x) ((unsigned char *)&syslog_target_ip)[x] ++ printk("netlog: using syslog target IP %u.%u.%u.%u, port: %d\n", IP(3), IP(2), IP(1), IP(0), syslog_target_port); ++#undef IP ++ syslog_target_ip = htonl(syslog_target_ip); ++ syslog_target_port = htons(syslog_target_port); ++ } ++ if (!netdump_target_port && !netlog_target_port && !syslog_target_port) { ++ printk(KERN_ERR "netlog: target_port parameter not specified, aborting.\n"); ++ return -1; ++ } ++ if (netdump_target_port) { ++ printk(KERN_INFO "netlog: using target UDP port: %u\n", netdump_target_port); ++ netdump_target_port = htons(netdump_target_port); ++ } ++ if (netlog_target_port) { ++ printk(KERN_INFO "netlog: using target UDP port: %u\n", netlog_target_port); ++ netlog_target_port = htons(netlog_target_port); ++ } ++ ++ netdump_daddr[0] = netdump_target_eth_byte0; ++ netdump_daddr[1] = netdump_target_eth_byte1; ++ netdump_daddr[2] = netdump_target_eth_byte2; ++ netdump_daddr[3] = netdump_target_eth_byte3; ++ netdump_daddr[4] = netdump_target_eth_byte4; ++ netdump_daddr[5] = netdump_target_eth_byte5; ++ ++ if ((netdump_daddr[0] & netdump_daddr[1] & netdump_daddr[2] & netdump_daddr[3] & netdump_daddr[4] & netdump_daddr[5]) == 255) ++ printk(KERN_INFO "netlog: using broadcast ethernet frames to send netdump packets.\n"); ++ else ++ printk(KERN_INFO "netlog: using netdump target ethernet address %02x:%02x:%02x:%02x:%02x:%02x.\n", ++ netdump_daddr[0], netdump_daddr[1], netdump_daddr[2], netdump_daddr[3], netdump_daddr[4], netdump_daddr[5]); ++ ++ 
netlog_daddr[0] = netlog_target_eth_byte0;
++	netlog_daddr[1] = netlog_target_eth_byte1;
++	netlog_daddr[2] = netlog_target_eth_byte2;
++	netlog_daddr[3] = netlog_target_eth_byte3;
++	netlog_daddr[4] = netlog_target_eth_byte4;
++	netlog_daddr[5] = netlog_target_eth_byte5;
++
++	if ((netlog_daddr[0] & netlog_daddr[1] & netlog_daddr[2] & netlog_daddr[3] & netlog_daddr[4] & netlog_daddr[5]) == 255)
++		printk(KERN_INFO "netlog: using broadcast ethernet frames to send netlog packets.\n");
++	else
++		printk(KERN_INFO "netlog: using netlog target ethernet address %02x:%02x:%02x:%02x:%02x:%02x.\n",
++			netlog_daddr[0], netlog_daddr[1], netlog_daddr[2], netlog_daddr[3], netlog_daddr[4], netlog_daddr[5]);
++	syslog_daddr[0] = syslog_target_eth_byte0;
++	syslog_daddr[1] = syslog_target_eth_byte1;
++	syslog_daddr[2] = syslog_target_eth_byte2;
++	syslog_daddr[3] = syslog_target_eth_byte3;
++	syslog_daddr[4] = syslog_target_eth_byte4;
++	syslog_daddr[5] = syslog_target_eth_byte5;
++
++	if ((syslog_daddr[0] & syslog_daddr[1] & syslog_daddr[2] & syslog_daddr[3] & syslog_daddr[4] & syslog_daddr[5]) == 255)
++		printk(KERN_INFO "netlog: using broadcast ethernet frames to send syslog packets.\n");
++	else
++		printk(KERN_INFO "netlog: using syslog target ethernet address %02x:%02x:%02x:%02x:%02x:%02x.\n",
++			syslog_daddr[0], syslog_daddr[1], syslog_daddr[2], syslog_daddr[3], syslog_daddr[4], syslog_daddr[5]);
++
++	mhz_cycles = (unsigned long long)mhz * 1000000ULL;
++	jiffy_cycles = (unsigned long long)mhz * (1000000/HZ);
++
++	INIT_LIST_HEAD(&request_list);
++
++	ndev->rx_hook = netconsole_rx_hook;
++	netdump_func = netconsole_netdump;
++	netconsole_dev = ndev;
++#define STARTUP_MSG "[...network console startup...]\n"
++	write_netconsole_msg(NULL, STARTUP_MSG, strlen(STARTUP_MSG));
++
++	register_console(&netconsole);
++	printk(KERN_INFO "netlog: network logging started up successfully!\n");
++	return 0;
++}
++
++static void cleanup_netconsole(void)
++{
++	printk(KERN_INFO "netlog: network logging shut down.\n");
++	unregister_console(&netconsole);
++
++#define SHUTDOWN_MSG "[...network console shutdown...]\n"
++	write_netconsole_msg(NULL, SHUTDOWN_MSG, strlen(SHUTDOWN_MSG));
++	netconsole_dev->rx_hook = NULL;
++	netconsole_dev = NULL;
++}
++
++module_init(init_netconsole);
++module_exit(cleanup_netconsole);
++
++MODULE_LICENSE("GPL");
++
+Index: linux-2.4.24/drivers/net/netconsole.h
+===================================================================
+--- linux-2.4.24.orig/drivers/net/netconsole.h	1969-12-31 19:00:00.000000000 -0500
++++ linux-2.4.24/drivers/net/netconsole.h	2004-05-07 16:58:39.000000000 -0400
+@@ -0,0 +1,81 @@
++/*
++ * linux/drivers/net/netconsole.h
++ *
++ * Copyright (C) 2001 Ingo Molnar
++ *
++ * This file contains the implementation of an IRQ-safe, crash-safe
++ * kernel console implementation that outputs kernel messages to the
++ * network.
++ *
++ * Modification history:
++ *
++ * 2001-09-17    started by Ingo Molnar.
++ */
++
++/****************************************************************
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2, or (at your option)
++ * any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. ++ * ++ ****************************************************************/ ++ ++#define NETCONSOLE_VERSION 0x04 ++ ++enum netdump_commands { ++ COMM_NONE = 0, ++ COMM_SEND_MEM = 1, ++ COMM_EXIT = 2, ++ COMM_REBOOT = 3, ++ COMM_HELLO = 4, ++ COMM_GET_NR_PAGES = 5, ++ COMM_GET_PAGE_SIZE = 6, ++ COMM_START_NETDUMP_ACK = 7, ++ COMM_GET_REGS = 8, ++ COMM_SHOW_STATE = 9, ++}; ++ ++#define NETDUMP_REQ_SIZE (8+4*4) ++ ++typedef struct netdump_req_s { ++ u64 magic; ++ u32 nr; ++ u32 command; ++ u32 from; ++ u32 to; ++ struct list_head list; ++} req_t; ++ ++enum netdump_replies { ++ REPLY_NONE = 0, ++ REPLY_ERROR = 1, ++ REPLY_LOG = 2, ++ REPLY_MEM = 3, ++ REPLY_RESERVED = 4, ++ REPLY_HELLO = 5, ++ REPLY_NR_PAGES = 6, ++ REPLY_PAGE_SIZE = 7, ++ REPLY_START_NETDUMP = 8, ++ REPLY_END_NETDUMP = 9, ++ REPLY_REGS = 10, ++ REPLY_MAGIC = 11, ++ REPLY_SHOW_STATE = 12, ++}; ++ ++typedef struct netdump_reply_s { ++ u32 nr; ++ u32 code; ++ u32 info; ++} reply_t; ++ ++#define HEADER_LEN (1 + sizeof(reply_t)) ++ +Index: linux-2.4.24/drivers/net/tlan.c +=================================================================== +--- linux-2.4.24.orig/drivers/net/tlan.c 2003-11-28 13:26:20.000000000 -0500 ++++ linux-2.4.24/drivers/net/tlan.c 2004-05-07 16:58:39.000000000 -0400 +@@ -345,6 +345,8 @@ + static void TLan_EeReceiveByte( u16, u8 *, int ); + static int TLan_EeReadByte( struct net_device *, u8, u8 * ); + ++static void TLan_Poll(struct net_device *); ++ + + static void + TLan_StoreSKB( struct tlan_list_tag *tag, struct sk_buff *skb) +@@ -891,6 +893,9 @@ + dev->get_stats = &TLan_GetStats; + dev->set_multicast_list = &TLan_SetMulticastList; + dev->do_ioctl = &TLan_ioctl; ++#ifdef HAVE_POLL_CONTROLLER ++ dev->poll_controller = &TLan_Poll; ++#endif + dev->tx_timeout = &TLan_tx_timeout; + dev->watchdog_timeo = TX_TIMEOUT; + +@@ -1176,7 +1181,14 @@ + + } /* TLan_HandleInterrupts */ + +- ++#ifdef HAVE_POLL_CONTROLLER ++static void TLan_Poll(struct net_device *dev) ++{ ++ if (!netdump_mode) disable_irq(dev->irq); ++ TLan_HandleInterrupt(dev->irq, dev, NULL); ++ if (!netdump_mode) enable_irq(dev->irq); ++} ++#endif + + + /*************************************************************** +Index: linux-2.4.24/drivers/net/tulip/tulip_core.c +=================================================================== +--- linux-2.4.24.orig/drivers/net/tulip/tulip_core.c 2003-11-28 13:26:20.000000000 -0500 ++++ linux-2.4.24/drivers/net/tulip/tulip_core.c 2004-05-07 16:58:39.000000000 -0400 +@@ -266,6 +266,7 @@ + static struct net_device_stats *tulip_get_stats(struct net_device *dev); + static int private_ioctl(struct net_device *dev, struct ifreq *rq, int cmd); + static void set_rx_mode(struct net_device *dev); ++static void poll_tulip(struct net_device *dev); + + + +@@ -1728,6 +1729,9 @@ + dev->get_stats = tulip_get_stats; + dev->do_ioctl = private_ioctl; + dev->set_multicast_list = set_rx_mode; ++#ifdef HAVE_POLL_CONTROLLER ++ dev->poll_controller = &poll_tulip; ++#endif + + if (register_netdev(dev)) + goto err_out_free_ring; +@@ -1902,6 +1906,24 @@ + } + + ++#ifdef HAVE_POLL_CONTROLLER ++ ++/* ++ * Polling 'interrupt' - used by things like netconsole to send skbs ++ * without having to re-enable interrupts. It's not called while ++ * the interrupt routine is executing. 
++ */ ++ ++static void poll_tulip (struct net_device *dev) ++{ ++ if (!netdump_mode) disable_irq(dev->irq); ++ tulip_interrupt (dev->irq, dev, NULL); ++ if (!netdump_mode) enable_irq(dev->irq); ++} ++ ++#endif ++ ++ + static struct pci_driver tulip_driver = { + name: DRV_NAME, + id_table: tulip_pci_tbl, +Index: linux-2.4.24/drivers/net/e100/e100_main.c +=================================================================== +--- linux-2.4.24.orig/drivers/net/e100/e100_main.c 2004-05-07 16:58:39.000000000 -0400 ++++ linux-2.4.24/drivers/net/e100/e100_main.c 2004-05-07 17:00:21.000000000 -0400 +@@ -664,6 +664,10 @@ + goto err_unregister_netdev; + } + ++#ifdef HAVE_POLL_CONTROLLER ++ dev->poll_controller = e100_netpoll; ++#endif ++ + e100nics++; + + e100_get_speed_duplex_caps(bdp); +Index: linux-2.4.24/drivers/net/e1000/e1000_main.c +=================================================================== +--- linux-2.4.24.orig/drivers/net/e1000/e1000_main.c 2003-11-28 13:26:20.000000000 -0500 ++++ linux-2.4.24/drivers/net/e1000/e1000_main.c 2004-05-07 16:58:39.000000000 -0400 +@@ -182,6 +182,9 @@ + static int e1000_resume(struct pci_dev *pdev); + #endif + ++/* for netdump / net console */ ++static void e1000_netpoll (struct net_device *dev); ++ + struct notifier_block e1000_notifier_reboot = { + .notifier_call = e1000_notify_reboot, + .next = NULL, +@@ -434,6 +437,10 @@ + netdev->vlan_rx_add_vid = e1000_vlan_rx_add_vid; + netdev->vlan_rx_kill_vid = e1000_vlan_rx_kill_vid; + ++#ifdef HAVE_POLL_CONTROLLER ++ netdev->poll_controller = e1000_netpoll; ++#endif ++ + netdev->irq = pdev->irq; + netdev->mem_start = mmio_start; + netdev->mem_end = mmio_start + mmio_len; +@@ -2899,4 +2906,20 @@ + } + #endif + ++#ifdef HAVE_POLL_CONTROLLER ++/* ++ * Polling 'interrupt' - used by things like netconsole to send skbs ++ * without having to re-enable interrupts. It's not called while ++ * the interrupt routine is executing. ++ */ ++ ++static void e1000_netpoll (struct net_device *dev) ++{ ++ if (!netdump_mode) disable_irq(dev->irq); ++ e1000_intr (dev->irq, dev, NULL); ++ if (!netdump_mode) enable_irq(dev->irq); ++} ++ ++#endif ++ + /* e1000_main.c */ +Index: linux-2.4.24/drivers/net/tg3.c +=================================================================== +--- linux-2.4.24.orig/drivers/net/tg3.c 2003-11-28 13:26:20.000000000 -0500 ++++ linux-2.4.24/drivers/net/tg3.c 2004-05-07 16:58:39.000000000 -0400 +@@ -216,6 +216,9 @@ + #define tr16(reg) readw(tp->regs + (reg)) + #define tr8(reg) readb(tp->regs + (reg)) + ++/* Added by mark.fasheh@oracle.com to help enable netdump on these cards */ ++static void poll_tg3 (struct net_device *dev); ++ + static void tg3_write_mem(struct tg3 *tp, u32 off, u32 val) + { + unsigned long flags; +@@ -7630,6 +7633,9 @@ + dev->watchdog_timeo = TG3_TX_TIMEOUT; + dev->change_mtu = tg3_change_mtu; + dev->irq = pdev->irq; ++#ifdef HAVE_POLL_CONTROLLER ++ dev->poll_controller = &poll_tg3; ++#endif + + err = tg3_get_invariants(tp); + if (err) { +@@ -7862,5 +7868,23 @@ + pci_unregister_driver(&tg3_driver); + } + ++#ifdef HAVE_POLL_CONTROLLER ++ ++/* ++ * Polling 'interrupt' - used by things like netconsole to send skbs ++ * without having to re-enable interrupts. It's not called while ++ * the interrupt routine is executing. 
++ */ ++ ++static void poll_tg3 (struct net_device *dev) ++{ ++ if (!netdump_mode) disable_irq(dev->irq); ++ tg3_interrupt (dev->irq, dev, NULL); ++ if (!netdump_mode) enable_irq(dev->irq); ++} ++ ++#endif ++ ++ + module_init(tg3_init); + module_exit(tg3_cleanup); +Index: linux-2.4.24/include/asm-i386/kmap_types.h +=================================================================== +--- linux-2.4.24.orig/include/asm-i386/kmap_types.h 2003-08-25 07:44:43.000000000 -0400 ++++ linux-2.4.24/include/asm-i386/kmap_types.h 2004-05-07 16:59:12.000000000 -0400 +@@ -10,6 +10,7 @@ + KM_BH_IRQ, + KM_SOFTIRQ0, + KM_SOFTIRQ1, ++ KM_NETDUMP, + KM_TYPE_NR + }; + +Index: linux-2.4.24/include/linux/kernel.h +=================================================================== +--- linux-2.4.24.orig/include/linux/kernel.h 2004-05-07 16:56:55.000000000 -0400 ++++ linux-2.4.24/include/linux/kernel.h 2004-05-07 16:58:39.000000000 -0400 +@@ -104,6 +104,9 @@ + + extern void bust_spinlocks(int yes); + extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */ ++struct pt_regs; ++extern void (*netdump_func) (struct pt_regs *regs); ++extern int netdump_mode; + + extern int tainted; + extern const char *print_tainted(void); +Index: linux-2.4.24/include/linux/netdevice.h +=================================================================== +--- linux-2.4.24.orig/include/linux/netdevice.h 2003-11-28 13:26:21.000000000 -0500 ++++ linux-2.4.24/include/linux/netdevice.h 2004-05-07 16:58:39.000000000 -0400 +@@ -435,6 +435,9 @@ + unsigned char *haddr); + int (*neigh_setup)(struct net_device *dev, struct neigh_parms *); + int (*accept_fastpath)(struct net_device *, struct dst_entry*); ++#define HAVE_POLL_CONTROLLER ++ void (*poll_controller)(struct net_device *dev); ++ int (*rx_hook)(struct sk_buff *skb); + + /* open/release and usage marking */ + struct module *owner; +Index: linux-2.4.24/kernel/panic.c +=================================================================== +--- linux-2.4.24.orig/kernel/panic.c 2004-05-07 16:56:56.000000000 -0400 ++++ linux-2.4.24/kernel/panic.c 2004-05-07 16:58:39.000000000 -0400 +@@ -62,6 +62,8 @@ + vsprintf(buf, fmt, args); + va_end(args); + printk(KERN_EMERG "Kernel panic: %s\n",buf); ++ if (netdump_func) ++ BUG(); + if (in_interrupt()) + printk(KERN_EMERG "In interrupt handler - not syncing\n"); + else if (!current->pid) +Index: linux-2.4.24/net/core/dev.c +=================================================================== +--- linux-2.4.24.orig/net/core/dev.c 2003-11-28 13:26:21.000000000 -0500 ++++ linux-2.4.24/net/core/dev.c 2004-05-07 16:58:39.000000000 -0400 +@@ -1288,6 +1288,13 @@ + + local_irq_save(flags); + ++ if (unlikely(skb->dev->rx_hook != NULL)) { ++ int ret; ++ ++ ret = skb->dev->rx_hook(skb); ++ if (ret == NET_RX_DROP) ++ goto drop; ++ } + netdev_rx_stat[this_cpu].total++; + if (queue->input_pkt_queue.qlen <= netdev_max_backlog) { + if (queue->input_pkt_queue.qlen) { diff --git a/lustre/kernel_patches/patches/linux-2.4.19-bgl-xattr-0.8.54.patch b/lustre/kernel_patches/patches/linux-2.4.19-bgl-xattr-0.8.54.patch new file mode 100644 index 0000000..a6a7e12 --- /dev/null +++ b/lustre/kernel_patches/patches/linux-2.4.19-bgl-xattr-0.8.54.patch @@ -0,0 +1,5242 @@ + Documentation/Configure.help | 66 ++ + arch/alpha/defconfig | 7 + arch/alpha/kernel/entry.S | 12 + arch/arm/defconfig | 7 + arch/arm/kernel/calls.S | 24 + arch/i386/defconfig | 7 + arch/ia64/defconfig | 7 + arch/ia64/kernel/entry.S | 24 + arch/m68k/defconfig | 7 + arch/mips/defconfig | 7 
+ arch/mips64/defconfig | 7 + arch/ppc/defconfig | 14 + arch/ppc64/kernel/misc.S | 2 + arch/s390/defconfig | 7 + arch/s390/kernel/entry.S | 24 + arch/s390x/defconfig | 7 + arch/s390x/kernel/entry.S | 24 + arch/s390x/kernel/wrapper32.S | 92 +++ + arch/sparc/defconfig | 7 + arch/sparc/kernel/systbls.S | 10 + arch/sparc64/defconfig | 7 + arch/sparc64/kernel/systbls.S | 20 + fs/Config.in | 14 + fs/Makefile | 3 + fs/ext2/Makefile | 4 + fs/ext2/file.c | 5 + fs/ext2/ialloc.c | 2 + fs/ext2/inode.c | 34 - + fs/ext2/namei.c | 14 + fs/ext2/super.c | 29 + fs/ext2/symlink.c | 14 + fs/ext2/xattr.c | 1212 +++++++++++++++++++++++++++++++++++++++++ + fs/ext2/xattr_user.c | 103 +++ + fs/ext3/Makefile | 10 + fs/ext3/file.c | 5 + fs/ext3/ialloc.c | 2 + fs/ext3/inode.c | 35 - + fs/ext3/namei.c | 21 + fs/ext3/super.c | 36 + + fs/ext3/symlink.c | 14 + fs/ext3/xattr.c | 1225 ++++++++++++++++++++++++++++++++++++++++++ + fs/ext3/xattr_user.c | 111 +++ + fs/jfs/jfs_xattr.h | 6 + fs/jfs/xattr.c | 6 + fs/mbcache.c | 648 ++++++++++++++++++++++ + include/asm-arm/unistd.h | 2 + include/asm-ia64/unistd.h | 13 + include/asm-ppc64/unistd.h | 2 + include/asm-s390/unistd.h | 15 + include/asm-s390x/unistd.h | 15 + include/asm-sparc/unistd.h | 24 + include/asm-sparc64/unistd.h | 24 + include/linux/cache_def.h | 15 + include/linux/errno.h | 4 + include/linux/ext2_fs.h | 31 - + include/linux/ext2_xattr.h | 157 +++++ + include/linux/ext3_fs.h | 31 - + include/linux/ext3_jbd.h | 8 + include/linux/ext3_xattr.h | 157 +++++ + include/linux/fs.h | 2 + include/linux/mbcache.h | 69 ++ + kernel/ksyms.c | 4 + mm/vmscan.c | 35 + + fs/ext3/ext3-exports.c | 14 + + 64 files changed, 4355 insertions(+), 195 deletions(-) + +Index: linux-DRV401/arch/ppc/defconfig +=================================================================== +--- linux-DRV401.orig/arch/ppc/defconfig 2004-10-15 10:24:32.000000000 -0700 ++++ linux-DRV401/arch/ppc/defconfig 2004-10-15 11:03:51.000000000 -0700 +@@ -1,6 +1,13 @@ + # + # Automatically generated by make menuconfig: don't edit + # ++CONFIG_EXT3_FS_XATTR=y ++# CONFIG_EXT3_FS_XATTR_SHARING is not set ++# CONFIG_EXT3_FS_XATTR_USER is not set ++# CONFIG_EXT2_FS_XATTR is not set ++# CONFIG_EXT2_FS_XATTR_SHARING is not set ++# CONFIG_EXT2_FS_XATTR_USER is not set ++# CONFIG_FS_MBCACHE is not set + # CONFIG_UID16 is not set + # CONFIG_RWSEM_GENERIC_SPINLOCK is not set + CONFIG_RWSEM_XCHGADD_ALGORITHM=y +Index: linux-DRV401/fs/Config.in +=================================================================== +--- linux-DRV401.orig/fs/Config.in 2004-10-15 10:24:06.000000000 -0700 ++++ linux-DRV401/fs/Config.in 2004-10-15 11:03:51.000000000 -0700 +@@ -22,6 +22,11 @@ + dep_tristate 'BFS file system support (EXPERIMENTAL)' CONFIG_BFS_FS $CONFIG_EXPERIMENTAL + + tristate 'Ext3 journalling file system support' CONFIG_EXT3_FS ++dep_mbool ' Ext3 extended attributes' CONFIG_EXT3_FS_XATTR $CONFIG_EXT3_FS ++dep_bool ' Ext3 extended attribute block sharing' \ ++ CONFIG_EXT3_FS_XATTR_SHARING $CONFIG_EXT3_FS_XATTR ++dep_bool ' Ext3 extended user attributes' \ ++ CONFIG_EXT3_FS_XATTR_USER $CONFIG_EXT3_FS_XATTR + # CONFIG_JBD could be its own option (even modular), but until there are + # other users than ext3, we will simply make it be the same as CONFIG_EXT3_FS + # dep_tristate ' Journal Block Device support (JBD for ext3)' CONFIG_JBD $CONFIG_EXT3_FS +@@ -77,6 +82,11 @@ + tristate 'ROM file system support' CONFIG_ROMFS_FS + + tristate 'Second extended fs support' CONFIG_EXT2_FS ++dep_mbool ' Ext2 extended attributes' CONFIG_EXT2_FS_XATTR 
$CONFIG_EXT2_FS ++dep_bool ' Ext2 extended attribute block sharing' \ ++ CONFIG_EXT2_FS_XATTR_SHARING $CONFIG_EXT2_FS_XATTR ++dep_bool ' Ext2 extended user attributes' \ ++ CONFIG_EXT2_FS_XATTR_USER $CONFIG_EXT2_FS_XATTR + + tristate 'System V/Xenix/V7/Coherent file system support' CONFIG_SYSV_FS + +@@ -156,6 +166,10 @@ + fi + fi + ++# Meta block cache for Extended Attributes (ext2/ext3) ++#tristate 'Meta block cache' CONFIG_FS_MBCACHE ++define_tristate CONFIG_FS_MBCACHE y ++ + mainmenu_option next_comment + comment 'Partition Types' + source fs/partitions/Config.in +Index: linux-DRV401/fs/Makefile +=================================================================== +--- linux-DRV401.orig/fs/Makefile 2004-10-15 10:39:15.000000000 -0700 ++++ linux-DRV401/fs/Makefile 2004-10-15 11:03:51.000000000 -0700 +@@ -14,7 +14,7 @@ + super.o block_dev.o char_dev.o stat.o exec.o pipe.o namei.o \ + fcntl.o ioctl.o readdir.o select.o fifo.o locks.o \ + dcache.o inode.o attr.o bad_inode.o file.o iobuf.o dnotify.o \ +- filesystems.o namespace.o seq_file.o quota.o ++ filesystems.o namespace.o seq_file.o quota.o xattr.o + + ifeq ($(CONFIG_QUOTA),y) + obj-y += dquot.o +@@ -76,6 +76,9 @@ + + obj-$(CONFIG_BINFMT_ELF) += binfmt_elf.o + ++export-objs += mbcache.o ++obj-$(CONFIG_FS_MBCACHE) += mbcache.o ++ + # persistent filesystems + obj-y += $(join $(subdir-y),$(subdir-y:%=/%.o)) + +Index: linux-DRV401/fs/ext2/Makefile +=================================================================== +--- linux-DRV401.orig/fs/ext2/Makefile 2004-10-15 10:23:59.000000000 -0700 ++++ linux-DRV401/fs/ext2/Makefile 2004-10-15 11:03:51.000000000 -0700 +@@ -13,4 +13,8 @@ + ioctl.o namei.o super.o symlink.o + obj-m := $(O_TARGET) + ++export-objs += xattr.o ++obj-$(CONFIG_EXT2_FS_XATTR) += xattr.o ++obj-$(CONFIG_EXT2_FS_XATTR_USER) += xattr_user.o ++ + include $(TOPDIR)/Rules.make +Index: linux-DRV401/fs/ext2/file.c +=================================================================== +--- linux-DRV401.orig/fs/ext2/file.c 2004-10-15 10:23:59.000000000 -0700 ++++ linux-DRV401/fs/ext2/file.c 2004-10-15 11:03:51.000000000 -0700 +@@ -20,6 +20,7 @@ + + #include + #include ++#include + #include + + /* +@@ -51,4 +52,8 @@ + + struct inode_operations ext2_file_inode_operations = { + truncate: ext2_truncate, ++ setxattr: ext2_setxattr, ++ getxattr: ext2_getxattr, ++ listxattr: ext2_listxattr, ++ removexattr: ext2_removexattr, + }; +Index: linux-DRV401/fs/ext2/ialloc.c +=================================================================== +--- linux-DRV401.orig/fs/ext2/ialloc.c 2004-10-15 10:23:59.000000000 -0700 ++++ linux-DRV401/fs/ext2/ialloc.c 2004-10-15 11:03:51.000000000 -0700 +@@ -15,6 +15,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -167,6 +168,7 @@ + */ + if (!is_bad_inode(inode)) { + /* Quota is already initialized in iput() */ ++ ext2_xattr_delete_inode(inode); + DQUOT_FREE_INODE(inode); + DQUOT_DROP(inode); + } +Index: linux-DRV401/fs/ext2/inode.c +=================================================================== +--- linux-DRV401.orig/fs/ext2/inode.c 2004-10-15 10:24:00.000000000 -0700 ++++ linux-DRV401/fs/ext2/inode.c 2004-10-15 11:03:51.000000000 -0700 +@@ -39,6 +39,18 @@ + static int ext2_update_inode(struct inode * inode, int do_sync); + + /* ++ * Test whether an inode is a fast symlink. ++ */ ++static inline int ext2_inode_is_fast_symlink(struct inode *inode) ++{ ++ int ea_blocks = inode->u.ext2_i.i_file_acl ? 
++ (inode->i_sb->s_blocksize >> 9) : 0; ++ ++ return (S_ISLNK(inode->i_mode) && ++ inode->i_blocks - ea_blocks == 0); ++} ++ ++/* + * Called at each iput() + */ + void ext2_put_inode (struct inode * inode) +@@ -53,9 +65,7 @@ + { + lock_kernel(); + +- if (is_bad_inode(inode) || +- inode->i_ino == EXT2_ACL_IDX_INO || +- inode->i_ino == EXT2_ACL_DATA_INO) ++ if (is_bad_inode(inode)) + goto no_delete; + inode->u.ext2_i.i_dtime = CURRENT_TIME; + mark_inode_dirty(inode); +@@ -792,6 +802,8 @@ + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return; ++ if (ext2_inode_is_fast_symlink(inode)) ++ return; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return; + +@@ -879,8 +891,7 @@ + unsigned long offset; + struct ext2_group_desc * gdp; + +- if ((inode->i_ino != EXT2_ROOT_INO && inode->i_ino != EXT2_ACL_IDX_INO && +- inode->i_ino != EXT2_ACL_DATA_INO && ++ if ((inode->i_ino != EXT2_ROOT_INO && + inode->i_ino < EXT2_FIRST_INO(inode->i_sb)) || + inode->i_ino > le32_to_cpu(inode->i_sb->u.ext2_sb.s_es->s_inodes_count)) { + ext2_error (inode->i_sb, "ext2_read_inode", +@@ -965,10 +976,7 @@ + for (block = 0; block < EXT2_N_BLOCKS; block++) + inode->u.ext2_i.i_data[block] = raw_inode->i_block[block]; + +- if (inode->i_ino == EXT2_ACL_IDX_INO || +- inode->i_ino == EXT2_ACL_DATA_INO) +- /* Nothing to do */ ; +- else if (S_ISREG(inode->i_mode)) { ++ if (S_ISREG(inode->i_mode)) { + inode->i_op = &ext2_file_inode_operations; + inode->i_fop = &ext2_file_operations; + inode->i_mapping->a_ops = &ext2_aops; +@@ -977,15 +985,17 @@ + inode->i_fop = &ext2_dir_operations; + inode->i_mapping->a_ops = &ext2_aops; + } else if (S_ISLNK(inode->i_mode)) { +- if (!inode->i_blocks) ++ if (ext2_inode_is_fast_symlink(inode)) + inode->i_op = &ext2_fast_symlink_inode_operations; + else { +- inode->i_op = &page_symlink_inode_operations; ++ inode->i_op = &ext2_symlink_inode_operations; + inode->i_mapping->a_ops = &ext2_aops; + } +- } else ++ } else { ++ inode->i_op = &ext2_special_inode_operations; + init_special_inode(inode, inode->i_mode, + le32_to_cpu(raw_inode->i_block[0])); ++ } + brelse (bh); + inode->i_attr_flags = 0; + if (inode->u.ext2_i.i_flags & EXT2_SYNC_FL) { +Index: linux-DRV401/fs/ext2/namei.c +=================================================================== +--- linux-DRV401.orig/fs/ext2/namei.c 2004-10-15 10:23:59.000000000 -0700 ++++ linux-DRV401/fs/ext2/namei.c 2004-10-15 11:03:51.000000000 -0700 +@@ -31,6 +31,7 @@ + + #include + #include ++#include + #include + + /* +@@ -136,7 +137,7 @@ + + if (l > sizeof (inode->u.ext2_i.i_data)) { + /* slow symlink */ +- inode->i_op = &page_symlink_inode_operations; ++ inode->i_op = &ext2_symlink_inode_operations; + inode->i_mapping->a_ops = &ext2_aops; + err = block_symlink(inode, symname, l); + if (err) +@@ -345,4 +346,15 @@ + rmdir: ext2_rmdir, + mknod: ext2_mknod, + rename: ext2_rename, ++ setxattr: ext2_setxattr, ++ getxattr: ext2_getxattr, ++ listxattr: ext2_listxattr, ++ removexattr: ext2_removexattr, ++}; ++ ++struct inode_operations ext2_special_inode_operations = { ++ setxattr: ext2_setxattr, ++ getxattr: ext2_getxattr, ++ listxattr: ext2_listxattr, ++ removexattr: ext2_removexattr, + }; +Index: linux-DRV401/fs/ext2/super.c +=================================================================== +--- linux-DRV401.orig/fs/ext2/super.c 2004-10-15 10:23:59.000000000 -0700 ++++ linux-DRV401/fs/ext2/super.c 2004-10-15 11:03:51.000000000 -0700 +@@ -21,6 +21,7 @@ + #include + #include + #include ++#include + #include + #include + 
#include +@@ -125,6 +126,7 @@ + int db_count; + int i; + ++ ext2_xattr_put_super(sb); + if (!(sb->s_flags & MS_RDONLY)) { + struct ext2_super_block *es = EXT2_SB(sb)->s_es; + +@@ -175,6 +177,13 @@ + this_char = strtok (NULL, ",")) { + if ((value = strchr (this_char, '=')) != NULL) + *value++ = 0; ++#ifdef CONFIG_EXT2_FS_XATTR_USER ++ if (!strcmp (this_char, "user_xattr")) ++ set_opt (*mount_options, XATTR_USER); ++ else if (!strcmp (this_char, "nouser_xattr")) ++ clear_opt (*mount_options, XATTR_USER); ++ else ++#endif + if (!strcmp (this_char, "bsddf")) + clear_opt (*mount_options, MINIX_DF); + else if (!strcmp (this_char, "nouid32")) { +@@ -424,6 +433,9 @@ + blocksize = BLOCK_SIZE; + + sb->u.ext2_sb.s_mount_opt = 0; ++#ifdef CONFIG_EXT2_FS_XATTR_USER ++ /* set_opt (sb->u.ext2_sb.s_mount_opt, XATTR_USER); */ ++#endif + if (!parse_options ((char *) data, &sb_block, &resuid, &resgid, + &sb->u.ext2_sb.s_mount_opt)) { + return NULL; +@@ -810,12 +822,27 @@ + + static int __init init_ext2_fs(void) + { +- return register_filesystem(&ext2_fs_type); ++ int error = init_ext2_xattr(); ++ if (error) ++ return error; ++ error = init_ext2_xattr_user(); ++ if (error) ++ goto fail; ++ error = register_filesystem(&ext2_fs_type); ++ if (!error) ++ return 0; ++ ++ exit_ext2_xattr_user(); ++fail: ++ exit_ext2_xattr(); ++ return error; + } + + static void __exit exit_ext2_fs(void) + { + unregister_filesystem(&ext2_fs_type); ++ exit_ext2_xattr_user(); ++ exit_ext2_xattr(); + } + + EXPORT_NO_SYMBOLS; +Index: linux-DRV401/fs/ext2/symlink.c +=================================================================== +--- linux-DRV401.orig/fs/ext2/symlink.c 2004-10-15 10:23:59.000000000 -0700 ++++ linux-DRV401/fs/ext2/symlink.c 2004-10-15 11:03:51.000000000 -0700 +@@ -19,6 +19,7 @@ + + #include + #include ++#include + + static int ext2_readlink(struct dentry *dentry, char *buffer, int buflen) + { +@@ -32,7 +33,20 @@ + return vfs_follow_link(nd, s); + } + ++struct inode_operations ext2_symlink_inode_operations = { ++ readlink: page_readlink, ++ follow_link: page_follow_link, ++ setxattr: ext2_setxattr, ++ getxattr: ext2_getxattr, ++ listxattr: ext2_listxattr, ++ removexattr: ext2_removexattr, ++}; ++ + struct inode_operations ext2_fast_symlink_inode_operations = { + readlink: ext2_readlink, + follow_link: ext2_follow_link, ++ setxattr: ext2_setxattr, ++ getxattr: ext2_getxattr, ++ listxattr: ext2_listxattr, ++ removexattr: ext2_removexattr, + }; +Index: linux-DRV401/fs/ext2/xattr.c +=================================================================== +--- linux-DRV401.orig/fs/ext2/xattr.c 2004-10-12 08:56:38.404764448 -0700 ++++ linux-DRV401/fs/ext2/xattr.c 2004-10-15 11:03:51.000000000 -0700 +@@ -0,0 +1,1212 @@ ++/* ++ * linux/fs/ext2/xattr.c ++ * ++ * Copyright (C) 2001 by Andreas Gruenbacher, ++ * ++ * Fix by Harrison Xing . ++ * Extended attributes for symlinks and special files added per ++ * suggestion of Luka Renko . ++ */ ++ ++/* ++ * Extended attributes are stored on disk blocks allocated outside of ++ * any inode. The i_file_acl field is then made to point to this allocated ++ * block. If all extended attributes of an inode are identical, these ++ * inodes may share the same extended attribute block. Such situations ++ * are automatically detected by keeping a cache of recent attribute block ++ * numbers and hashes over the block's contents in memory. 
++ *
++ * Extended attribute block layout:
++ *
++ *   +------------------+
++ *   | header           |
++ *   | entry 1          | |
++ *   | entry 2          | | growing downwards
++ *   | entry 3          | v
++ *   | four null bytes  |
++ *   | . . .            |
++ *   | value 1          | ^
++ *   | value 3          | | growing upwards
++ *   | value 2          | |
++ *   +------------------+
++ *
++ * The block header is followed by multiple entry descriptors. These entry
++ * descriptors are variable in size, and aligned to EXT2_XATTR_PAD
++ * byte boundaries. The entry descriptors are sorted by attribute name,
++ * so that two extended attribute blocks can be compared efficiently.
++ *
++ * Attribute values are aligned to the end of the block, stored in
++ * no specific order. They are also padded to EXT2_XATTR_PAD byte
++ * boundaries. No additional gaps are left between them.
++ *
++ * Locking strategy
++ * ----------------
++ * The VFS already holds the BKL and the inode->i_sem semaphore when any of
++ * the xattr inode operations are called, so we are guaranteed that only one
++ * process accesses extended attributes of an inode at any time.
++ *
++ * For writing we also grab the ext2_xattr_sem semaphore. This ensures that
++ * only a single process is modifying an extended attribute block, even
++ * if the block is shared among inodes.
++ *
++ * Note for porting to 2.5
++ * -----------------------
++ * The BKL will no longer be held in the xattr inode operations.
++ */
++
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++
++/* These symbols may be needed by a module. */
++EXPORT_SYMBOL(ext2_xattr_register);
++EXPORT_SYMBOL(ext2_xattr_unregister);
++EXPORT_SYMBOL(ext2_xattr_get);
++EXPORT_SYMBOL(ext2_xattr_list);
++EXPORT_SYMBOL(ext2_xattr_set);
++
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0)
++# define mark_buffer_dirty(bh) mark_buffer_dirty(bh, 1)
++#endif
++
++#define HDR(bh) ((struct ext2_xattr_header *)((bh)->b_data))
++#define ENTRY(ptr) ((struct ext2_xattr_entry *)(ptr))
++#define FIRST_ENTRY(bh) ENTRY(HDR(bh)+1)
++#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
++
++#ifdef EXT2_XATTR_DEBUG
++# define ea_idebug(inode, f...) do { \
++		printk(KERN_DEBUG "inode %s:%ld: ", \
++			kdevname(inode->i_dev), inode->i_ino); \
++		printk(f); \
++		printk("\n"); \
++	} while (0)
++# define ea_bdebug(bh, f...) do { \
++		printk(KERN_DEBUG "block %s:%ld: ", \
++			kdevname(bh->b_dev), bh->b_blocknr); \
++		printk(f); \
++		printk("\n"); \
++	} while (0)
++#else
++# define ea_idebug(f...)
++# define ea_bdebug(f...)
++#endif
++
++static int ext2_xattr_set2(struct inode *, struct buffer_head *,
++			   struct ext2_xattr_header *);
++
++#ifdef CONFIG_EXT2_FS_XATTR_SHARING
++
++static int ext2_xattr_cache_insert(struct buffer_head *);
++static struct buffer_head *ext2_xattr_cache_find(struct inode *,
++						 struct ext2_xattr_header *);
++static void ext2_xattr_cache_remove(struct buffer_head *);
++static void ext2_xattr_rehash(struct ext2_xattr_header *,
++			      struct ext2_xattr_entry *);
++
++static struct mb_cache *ext2_xattr_cache;
++
++#else
++# define ext2_xattr_cache_insert(bh) 0
++# define ext2_xattr_cache_find(inode, header) NULL
++# define ext2_xattr_cache_remove(bh) while(0) {}
++# define ext2_xattr_rehash(header, entry) while(0) {}
++#endif
++
++/*
++ * If a file system does not share extended attributes among inodes,
++ * we should not need the ext2_xattr_sem semaphore. However, the
++ * filesystem may still contain shared blocks, so we always take
++ * the lock.
++ */
++
++DECLARE_MUTEX(ext2_xattr_sem);
++
++static inline int
++ext2_xattr_new_block(struct inode *inode, int * errp, int force)
++{
++	struct super_block *sb = inode->i_sb;
++	int goal = le32_to_cpu(EXT2_SB(sb)->s_es->s_first_data_block) +
++		EXT2_I(inode)->i_block_group * EXT2_BLOCKS_PER_GROUP(sb);
++
++	/* How can we enforce the allocation? */
++	int block = ext2_new_block(inode, goal, 0, 0, errp);
++#ifdef OLD_QUOTAS
++	if (!*errp)
++		inode->i_blocks += inode->i_sb->s_blocksize >> 9;
++#endif
++	return block;
++}
++
++static inline int
++ext2_xattr_quota_alloc(struct inode *inode, int force)
++{
++	/* How can we enforce the allocation? */
++#ifdef OLD_QUOTAS
++	int error = DQUOT_ALLOC_BLOCK(inode->i_sb, inode, 1);
++	if (!error)
++		inode->i_blocks += inode->i_sb->s_blocksize >> 9;
++#else
++	int error = DQUOT_ALLOC_BLOCK(inode, 1);
++#endif
++	return error;
++}
++
++#ifdef OLD_QUOTAS
++
++static inline void
++ext2_xattr_quota_free(struct inode *inode)
++{
++	DQUOT_FREE_BLOCK(inode->i_sb, inode, 1);
++	inode->i_blocks -= inode->i_sb->s_blocksize >> 9;
++}
++
++static inline void
++ext2_xattr_free_block(struct inode * inode, unsigned long block)
++{
++	ext2_free_blocks(inode, block, 1);
++	inode->i_blocks -= inode->i_sb->s_blocksize >> 9;
++}
++
++#else
++# define ext2_xattr_quota_free(inode) \
++	DQUOT_FREE_BLOCK(inode, 1)
++# define ext2_xattr_free_block(inode, block) \
++	ext2_free_blocks(inode, block, 1)
++#endif
++
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,18)
++
++static inline struct buffer_head *
++sb_bread(struct super_block *sb, int block)
++{
++	return bread(sb->s_dev, block, sb->s_blocksize);
++}
++
++static inline struct buffer_head *
++sb_getblk(struct super_block *sb, int block)
++{
++	return getblk(sb->s_dev, block, sb->s_blocksize);
++}
++
++#endif
++
++struct ext2_xattr_handler *ext2_xattr_handlers[EXT2_XATTR_INDEX_MAX];
++rwlock_t ext2_handler_lock = RW_LOCK_UNLOCKED;
++
++int
++ext2_xattr_register(int name_index, struct ext2_xattr_handler *handler)
++{
++	int error = -EINVAL;
++
++	if (name_index > 0 && name_index <= EXT2_XATTR_INDEX_MAX) {
++		write_lock(&ext2_handler_lock);
++		if (!ext2_xattr_handlers[name_index-1]) {
++			ext2_xattr_handlers[name_index-1] = handler;
++			error = 0;
++		}
++		write_unlock(&ext2_handler_lock);
++	}
++	return error;
++}
++
++void
++ext2_xattr_unregister(int name_index, struct ext2_xattr_handler *handler)
++{
++	if (name_index > 0 && name_index <= EXT2_XATTR_INDEX_MAX) {
++		write_lock(&ext2_handler_lock);
++		ext2_xattr_handlers[name_index-1] = NULL;
++		write_unlock(&ext2_handler_lock);
++	}
++}
++
++static inline const char *
++strcmp_prefix(const char *a, const char *a_prefix)
++{
++	while (*a_prefix && *a == *a_prefix) {
++		a++;
++		a_prefix++;
++	}
++	return *a_prefix ? NULL : a;
++}
++
++/*
++ * Decode the extended attribute name, and translate it into
++ * the name_index and name suffix.
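++ * For example, "user.mime_type" is matched against each registered
++ * handler prefix; the "user." handler wins, and *name is advanced to
++ * the remaining suffix "mime_type".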
++ */
++static struct ext2_xattr_handler *
++ext2_xattr_resolve_name(const char **name)
++{
++	struct ext2_xattr_handler *handler = NULL;
++	int i;
++
++	if (!*name)
++		return NULL;
++	read_lock(&ext2_handler_lock);
++	for (i=0; i < EXT2_XATTR_INDEX_MAX; i++) {
++		if (ext2_xattr_handlers[i]) {
++			const char *n = strcmp_prefix(*name,
++				ext2_xattr_handlers[i]->prefix);
++			if (n) {
++				handler = ext2_xattr_handlers[i];
++				*name = n;
++				break;
++			}
++		}
++	}
++	read_unlock(&ext2_handler_lock);
++	return handler;
++}
++
++static inline struct ext2_xattr_handler *
++ext2_xattr_handler(int name_index)
++{
++	struct ext2_xattr_handler *handler = NULL;
++	if (name_index > 0 && name_index <= EXT2_XATTR_INDEX_MAX) {
++		read_lock(&ext2_handler_lock);
++		handler = ext2_xattr_handlers[name_index-1];
++		read_unlock(&ext2_handler_lock);
++	}
++	return handler;
++}
++
++/*
++ * Inode operation getxattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++ssize_t
++ext2_getxattr(struct dentry *dentry, const char *name,
++	      void *buffer, size_t size)
++{
++	struct ext2_xattr_handler *handler;
++	struct inode *inode = dentry->d_inode;
++
++	handler = ext2_xattr_resolve_name(&name);
++	if (!handler)
++		return -ENOTSUP;
++	return handler->get(inode, name, buffer, size);
++}
++
++/*
++ * Inode operation listxattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++ssize_t
++ext2_listxattr(struct dentry *dentry, char *buffer, size_t size)
++{
++	return ext2_xattr_list(dentry->d_inode, buffer, size);
++}
++
++/*
++ * Inode operation setxattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++int
++ext2_setxattr(struct dentry *dentry, const char *name,
++	      const void *value, size_t size, int flags)
++{
++	struct ext2_xattr_handler *handler;
++	struct inode *inode = dentry->d_inode;
++
++	if (size == 0)
++		value = "";  /* empty EA, do not remove */
++	handler = ext2_xattr_resolve_name(&name);
++	if (!handler)
++		return -ENOTSUP;
++	return handler->set(inode, name, value, size, flags);
++}
++
++/*
++ * Inode operation removexattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++int
++ext2_removexattr(struct dentry *dentry, const char *name)
++{
++	struct ext2_xattr_handler *handler;
++	struct inode *inode = dentry->d_inode;
++
++	handler = ext2_xattr_resolve_name(&name);
++	if (!handler)
++		return -ENOTSUP;
++	return handler->set(inode, name, NULL, 0, XATTR_REPLACE);
++}
++
++/*
++ * ext2_xattr_get()
++ *
++ * Copy an extended attribute into the buffer
++ * provided, or compute the buffer size required.
++ * Buffer is NULL to compute the size of the buffer required.
++ *
++ * Returns a negative error number on failure, or the number of bytes
++ * used / required on success.
++ */ ++int ++ext2_xattr_get(struct inode *inode, int name_index, const char *name, ++ void *buffer, size_t buffer_size) ++{ ++ struct buffer_head *bh = NULL; ++ struct ext2_xattr_entry *entry; ++ unsigned int block, size; ++ char *end; ++ int name_len, error; ++ ++ ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", ++ name_index, name, buffer, (long)buffer_size); ++ ++ if (name == NULL) ++ return -EINVAL; ++ if (!EXT2_I(inode)->i_file_acl) ++ return -ENOATTR; ++ block = EXT2_I(inode)->i_file_acl; ++ ea_idebug(inode, "reading block %d", block); ++ bh = sb_bread(inode->i_sb, block); ++ if (!bh) ++ return -EIO; ++ ea_bdebug(bh, "b_count=%d, refcount=%d", ++ atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount)); ++ end = bh->b_data + bh->b_size; ++ if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) || ++ HDR(bh)->h_blocks != cpu_to_le32(1)) { ++bad_block: ext2_error(inode->i_sb, "ext2_xattr_get", ++ "inode %ld: bad block %d", inode->i_ino, block); ++ error = -EIO; ++ goto cleanup; ++ } ++ /* find named attribute */ ++ name_len = strlen(name); ++ ++ error = -ERANGE; ++ if (name_len > 255) ++ goto cleanup; ++ entry = FIRST_ENTRY(bh); ++ while (!IS_LAST_ENTRY(entry)) { ++ struct ext2_xattr_entry *next = ++ EXT2_XATTR_NEXT(entry); ++ if ((char *)next >= end) ++ goto bad_block; ++ if (name_index == entry->e_name_index && ++ name_len == entry->e_name_len && ++ memcmp(name, entry->e_name, name_len) == 0) ++ goto found; ++ entry = next; ++ } ++ /* Check the remaining name entries */ ++ while (!IS_LAST_ENTRY(entry)) { ++ struct ext2_xattr_entry *next = ++ EXT2_XATTR_NEXT(entry); ++ if ((char *)next >= end) ++ goto bad_block; ++ entry = next; ++ } ++ if (ext2_xattr_cache_insert(bh)) ++ ea_idebug(inode, "cache insert failed"); ++ error = -ENOATTR; ++ goto cleanup; ++found: ++ /* check the buffer size */ ++ if (entry->e_value_block != 0) ++ goto bad_block; ++ size = le32_to_cpu(entry->e_value_size); ++ if (size > inode->i_sb->s_blocksize || ++ le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize) ++ goto bad_block; ++ ++ if (ext2_xattr_cache_insert(bh)) ++ ea_idebug(inode, "cache insert failed"); ++ if (buffer) { ++ error = -ERANGE; ++ if (size > buffer_size) ++ goto cleanup; ++ /* return value of attribute */ ++ memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs), ++ size); ++ } ++ error = size; ++ ++cleanup: ++ brelse(bh); ++ ++ return error; ++} ++ ++/* ++ * ext2_xattr_list() ++ * ++ * Copy a list of attribute names into the buffer ++ * provided, or compute the buffer size required. ++ * Buffer is NULL to compute the size of the buffer required. ++ * ++ * Returns a negative error number on failure, or the number of bytes ++ * used / required on success. 
++ */ ++int ++ext2_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) ++{ ++ struct buffer_head *bh = NULL; ++ struct ext2_xattr_entry *entry; ++ unsigned int block, size = 0; ++ char *buf, *end; ++ int error; ++ ++ ea_idebug(inode, "buffer=%p, buffer_size=%ld", ++ buffer, (long)buffer_size); ++ ++ if (!EXT2_I(inode)->i_file_acl) ++ return 0; ++ block = EXT2_I(inode)->i_file_acl; ++ ea_idebug(inode, "reading block %d", block); ++ bh = sb_bread(inode->i_sb, block); ++ if (!bh) ++ return -EIO; ++ ea_bdebug(bh, "b_count=%d, refcount=%d", ++ atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount)); ++ end = bh->b_data + bh->b_size; ++ if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) || ++ HDR(bh)->h_blocks != cpu_to_le32(1)) { ++bad_block: ext2_error(inode->i_sb, "ext2_xattr_list", ++ "inode %ld: bad block %d", inode->i_ino, block); ++ error = -EIO; ++ goto cleanup; ++ } ++ /* compute the size required for the list of attribute names */ ++ for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry); ++ entry = EXT2_XATTR_NEXT(entry)) { ++ struct ext2_xattr_handler *handler; ++ struct ext2_xattr_entry *next = ++ EXT2_XATTR_NEXT(entry); ++ if ((char *)next >= end) ++ goto bad_block; ++ ++ handler = ext2_xattr_handler(entry->e_name_index); ++ if (handler) ++ size += handler->list(NULL, inode, entry->e_name, ++ entry->e_name_len); ++ } ++ ++ if (ext2_xattr_cache_insert(bh)) ++ ea_idebug(inode, "cache insert failed"); ++ if (!buffer) { ++ error = size; ++ goto cleanup; ++ } else { ++ error = -ERANGE; ++ if (size > buffer_size) ++ goto cleanup; ++ } ++ ++ /* list the attribute names */ ++ buf = buffer; ++ for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry); ++ entry = EXT2_XATTR_NEXT(entry)) { ++ struct ext2_xattr_handler *handler; ++ ++ handler = ext2_xattr_handler(entry->e_name_index); ++ if (handler) ++ buf += handler->list(buf, inode, entry->e_name, ++ entry->e_name_len); ++ } ++ error = size; ++ ++cleanup: ++ brelse(bh); ++ ++ return error; ++} ++ ++/* ++ * If the EXT2_FEATURE_COMPAT_EXT_ATTR feature of this file system is ++ * not set, set it. ++ */ ++static void ext2_xattr_update_super_block(struct super_block *sb) ++{ ++ if (EXT2_HAS_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR)) ++ return; ++ ++ lock_super(sb); ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0) ++ EXT2_SB(sb)->s_feature_compat |= EXT2_FEATURE_COMPAT_EXT_ATTR; ++#endif ++ EXT2_SB(sb)->s_es->s_feature_compat |= ++ cpu_to_le32(EXT2_FEATURE_COMPAT_EXT_ATTR); ++ sb->s_dirt = 1; ++ mark_buffer_dirty(EXT2_SB(sb)->s_sbh); ++ unlock_super(sb); ++} ++ ++/* ++ * ext2_xattr_set() ++ * ++ * Create, replace or remove an extended attribute for this inode. Buffer ++ * is NULL to remove an existing extended attribute, and non-NULL to ++ * either replace an existing extended attribute, or create a new extended ++ * attribute. The flags XATTR_REPLACE and XATTR_CREATE ++ * specify that an extended attribute must exist and must not exist ++ * previous to the call, respectively. ++ * ++ * Returns 0, or a negative error number on failure. 
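++ *
++ * For example, replacing an existing "mime_type" attribute in the
++ * user namespace (failing with -ENOATTR if it is absent) would be:
++ *
++ *	ext2_xattr_set(inode, EXT2_XATTR_INDEX_USER, "mime_type",
++ *		       "text/plain", 10, XATTR_REPLACE);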
++ */ ++int ++ext2_xattr_set(struct inode *inode, int name_index, const char *name, ++ const void *value, size_t value_len, int flags) ++{ ++ struct super_block *sb = inode->i_sb; ++ struct buffer_head *bh = NULL; ++ struct ext2_xattr_header *header = NULL; ++ struct ext2_xattr_entry *here, *last; ++ unsigned int name_len; ++ int block = EXT2_I(inode)->i_file_acl; ++ int min_offs = sb->s_blocksize, not_found = 1, free, error; ++ char *end; ++ ++ /* ++ * header -- Points either into bh, or to a temporarily ++ * allocated buffer. ++ * here -- The named entry found, or the place for inserting, within ++ * the block pointed to by header. ++ * last -- Points right after the last named entry within the block ++ * pointed to by header. ++ * min_offs -- The offset of the first value (values are aligned ++ * towards the end of the block). ++ * end -- Points right after the block pointed to by header. ++ */ ++ ++ ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", ++ name_index, name, value, (long)value_len); ++ ++ if (IS_RDONLY(inode)) ++ return -EROFS; ++ if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) ++ return -EPERM; ++ if (value == NULL) ++ value_len = 0; ++ if (name == NULL) ++ return -EINVAL; ++ name_len = strlen(name); ++ if (name_len > 255 || value_len > sb->s_blocksize) ++ return -ERANGE; ++ down(&ext2_xattr_sem); ++ ++ if (block) { ++ /* The inode already has an extended attribute block. */ ++ ++ bh = sb_bread(sb, block); ++ error = -EIO; ++ if (!bh) ++ goto cleanup; ++ ea_bdebug(bh, "b_count=%d, refcount=%d", ++ atomic_read(&(bh->b_count)), ++ le32_to_cpu(HDR(bh)->h_refcount)); ++ header = HDR(bh); ++ end = bh->b_data + bh->b_size; ++ if (header->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) || ++ header->h_blocks != cpu_to_le32(1)) { ++bad_block: ext2_error(sb, "ext2_xattr_set", ++ "inode %ld: bad block %d", inode->i_ino, block); ++ error = -EIO; ++ goto cleanup; ++ } ++ /* Find the named attribute. */ ++ here = FIRST_ENTRY(bh); ++ while (!IS_LAST_ENTRY(here)) { ++ struct ext2_xattr_entry *next = EXT2_XATTR_NEXT(here); ++ if ((char *)next >= end) ++ goto bad_block; ++ if (!here->e_value_block && here->e_value_size) { ++ int offs = le16_to_cpu(here->e_value_offs); ++ if (offs < min_offs) ++ min_offs = offs; ++ } ++ not_found = name_index - here->e_name_index; ++ if (!not_found) ++ not_found = name_len - here->e_name_len; ++ if (!not_found) ++ not_found = memcmp(name, here->e_name,name_len); ++ if (not_found <= 0) ++ break; ++ here = next; ++ } ++ last = here; ++ /* We still need to compute min_offs and last. */ ++ while (!IS_LAST_ENTRY(last)) { ++ struct ext2_xattr_entry *next = EXT2_XATTR_NEXT(last); ++ if ((char *)next >= end) ++ goto bad_block; ++ if (!last->e_value_block && last->e_value_size) { ++ int offs = le16_to_cpu(last->e_value_offs); ++ if (offs < min_offs) ++ min_offs = offs; ++ } ++ last = next; ++ } ++ ++ /* Check whether we have enough space left. */ ++ free = min_offs - ((char*)last - (char*)header) - sizeof(__u32); ++ } else { ++ /* We will use a new extended attribute block. */ ++ free = sb->s_blocksize - ++ sizeof(struct ext2_xattr_header) - sizeof(__u32); ++ here = last = NULL; /* avoid gcc uninitialized warning. */ ++ } ++ ++ if (not_found) { ++ /* Request to remove a nonexistent attribute? */ ++ error = -ENOATTR; ++ if (flags & XATTR_REPLACE) ++ goto cleanup; ++ error = 0; ++ if (value == NULL) ++ goto cleanup; ++ else ++ free -= EXT2_XATTR_LEN(name_len); ++ } else { ++ /* Request to create an existing attribute? 
*/ ++ error = -EEXIST; ++ if (flags & XATTR_CREATE) ++ goto cleanup; ++ if (!here->e_value_block && here->e_value_size) { ++ unsigned int size = le32_to_cpu(here->e_value_size); ++ ++ if (le16_to_cpu(here->e_value_offs) + size > ++ sb->s_blocksize || size > sb->s_blocksize) ++ goto bad_block; ++ free += EXT2_XATTR_SIZE(size); ++ } ++ } ++ free -= EXT2_XATTR_SIZE(value_len); ++ error = -ENOSPC; ++ if (free < 0) ++ goto cleanup; ++ ++ /* Here we know that we can set the new attribute. */ ++ ++ if (header) { ++ if (header->h_refcount == cpu_to_le32(1)) { ++ ea_bdebug(bh, "modifying in-place"); ++ ext2_xattr_cache_remove(bh); ++ } else { ++ int offset; ++ ++ ea_bdebug(bh, "cloning"); ++ header = kmalloc(bh->b_size, GFP_KERNEL); ++ error = -ENOMEM; ++ if (header == NULL) ++ goto cleanup; ++ memcpy(header, HDR(bh), bh->b_size); ++ header->h_refcount = cpu_to_le32(1); ++ offset = (char *)header - bh->b_data; ++ here = ENTRY((char *)here + offset); ++ last = ENTRY((char *)last + offset); ++ } ++ } else { ++ /* Allocate a buffer where we construct the new block. */ ++ header = kmalloc(sb->s_blocksize, GFP_KERNEL); ++ error = -ENOMEM; ++ if (header == NULL) ++ goto cleanup; ++ memset(header, 0, sb->s_blocksize); ++ end = (char *)header + sb->s_blocksize; ++ header->h_magic = cpu_to_le32(EXT2_XATTR_MAGIC); ++ header->h_blocks = header->h_refcount = cpu_to_le32(1); ++ last = here = ENTRY(header+1); ++ } ++ ++ if (not_found) { ++ /* Insert the new name. */ ++ int size = EXT2_XATTR_LEN(name_len); ++ int rest = (char *)last - (char *)here; ++ memmove((char *)here + size, here, rest); ++ memset(here, 0, size); ++ here->e_name_index = name_index; ++ here->e_name_len = name_len; ++ memcpy(here->e_name, name, name_len); ++ } else { ++ /* Remove the old value. */ ++ if (!here->e_value_block && here->e_value_size) { ++ char *first_val = (char *)header + min_offs; ++ int offs = le16_to_cpu(here->e_value_offs); ++ char *val = (char *)header + offs; ++ size_t size = EXT2_XATTR_SIZE( ++ le32_to_cpu(here->e_value_size)); ++ memmove(first_val + size, first_val, val - first_val); ++ memset(first_val, 0, size); ++ here->e_value_offs = 0; ++ min_offs += size; ++ ++ /* Adjust all value offsets. */ ++ last = ENTRY(header+1); ++ while (!IS_LAST_ENTRY(last)) { ++ int o = le16_to_cpu(last->e_value_offs); ++ if (!last->e_value_block && o < offs) ++ last->e_value_offs = ++ cpu_to_le16(o + size); ++ last = EXT2_XATTR_NEXT(last); ++ } ++ } ++ if (value == NULL) { ++ /* Remove this attribute. */ ++ if (EXT2_XATTR_NEXT(ENTRY(header+1)) == last) { ++ /* This block is now empty. */ ++ error = ext2_xattr_set2(inode, bh, NULL); ++ goto cleanup; ++ } else { ++ /* Remove the old name. */ ++ int size = EXT2_XATTR_LEN(name_len); ++ last = ENTRY((char *)last - size); ++ memmove(here, (char*)here + size, ++ (char*)last - (char*)here); ++ memset(last, 0, size); ++ } ++ } ++ } ++ ++ if (value != NULL) { ++ /* Insert the new value. */ ++ here->e_value_size = cpu_to_le32(value_len); ++ if (value_len) { ++ size_t size = EXT2_XATTR_SIZE(value_len); ++ char *val = (char *)header + min_offs - size; ++ here->e_value_offs = ++ cpu_to_le16((char *)val - (char *)header); ++ memset(val + size - EXT2_XATTR_PAD, 0, ++ EXT2_XATTR_PAD); /* Clear the pad bytes. 
*/ ++ memcpy(val, value, value_len); ++ } ++ } ++ ext2_xattr_rehash(header, here); ++ ++ error = ext2_xattr_set2(inode, bh, header); ++ ++cleanup: ++ brelse(bh); ++ if (!(bh && header == HDR(bh))) ++ kfree(header); ++ up(&ext2_xattr_sem); ++ ++ return error; ++} ++ ++/* ++ * Second half of ext2_xattr_set(): Update the file system. ++ */ ++static int ++ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, ++ struct ext2_xattr_header *header) ++{ ++ struct super_block *sb = inode->i_sb; ++ struct buffer_head *new_bh = NULL; ++ int error; ++ ++ if (header) { ++ new_bh = ext2_xattr_cache_find(inode, header); ++ if (new_bh) { ++ /* ++ * We found an identical block in the cache. ++ * The old block will be released after updating ++ * the inode. ++ */ ++ ea_bdebug(old_bh, "reusing block %ld", ++ new_bh->b_blocknr); ++ ++ error = -EDQUOT; ++ if (ext2_xattr_quota_alloc(inode, 1)) ++ goto cleanup; ++ ++ HDR(new_bh)->h_refcount = cpu_to_le32( ++ le32_to_cpu(HDR(new_bh)->h_refcount) + 1); ++ ea_bdebug(new_bh, "refcount now=%d", ++ le32_to_cpu(HDR(new_bh)->h_refcount)); ++ } else if (old_bh && header == HDR(old_bh)) { ++ /* Keep this block. */ ++ new_bh = old_bh; ++ ext2_xattr_cache_insert(new_bh); ++ } else { ++ /* We need to allocate a new block */ ++ int force = EXT2_I(inode)->i_file_acl != 0; ++ int block = ext2_xattr_new_block(inode, &error, force); ++ if (error) ++ goto cleanup; ++ ea_idebug(inode, "creating block %d", block); ++ ++ new_bh = sb_getblk(sb, block); ++ if (!new_bh) { ++ ext2_xattr_free_block(inode, block); ++ error = -EIO; ++ goto cleanup; ++ } ++ lock_buffer(new_bh); ++ memcpy(new_bh->b_data, header, new_bh->b_size); ++ mark_buffer_uptodate(new_bh, 1); ++ unlock_buffer(new_bh); ++ ext2_xattr_cache_insert(new_bh); ++ ++ ext2_xattr_update_super_block(sb); ++ } ++ mark_buffer_dirty(new_bh); ++ if (IS_SYNC(inode)) { ++ ll_rw_block(WRITE, 1, &new_bh); ++ wait_on_buffer(new_bh); ++ error = -EIO; ++ if (buffer_req(new_bh) && !buffer_uptodate(new_bh)) ++ goto cleanup; ++ } ++ } ++ ++ /* Update the inode. */ ++ EXT2_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; ++ inode->i_ctime = CURRENT_TIME; ++ if (IS_SYNC(inode)) { ++ error = ext2_sync_inode (inode); ++ if (error) ++ goto cleanup; ++ } else ++ mark_inode_dirty(inode); ++ ++ error = 0; ++ if (old_bh && old_bh != new_bh) { ++ /* ++ * If there was an old block, and we are not still using it, ++ * we now release the old block. ++ */ ++ unsigned int refcount = le32_to_cpu(HDR(old_bh)->h_refcount); ++ ++ if (refcount == 1) { ++ /* Free the old block. */ ++ ea_bdebug(old_bh, "freeing"); ++ ext2_xattr_free_block(inode, old_bh->b_blocknr); ++ mark_buffer_clean(old_bh); ++ } else { ++ /* Decrement the refcount only. */ ++ refcount--; ++ HDR(old_bh)->h_refcount = cpu_to_le32(refcount); ++ ext2_xattr_quota_free(inode); ++ mark_buffer_dirty(old_bh); ++ ea_bdebug(old_bh, "refcount now=%d", refcount); ++ } ++ } ++ ++cleanup: ++ if (old_bh != new_bh) ++ brelse(new_bh); ++ ++ return error; ++} ++ ++/* ++ * ext2_xattr_delete_inode() ++ * ++ * Free extended attribute resources associated with this inode. This ++ * is called immediately before an inode is freed. 
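++ * If the attribute block is shared, only its reference count is
++ * dropped here; the block itself is freed when the last inode
++ * referencing it goes away.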
++ */ ++void ++ext2_xattr_delete_inode(struct inode *inode) ++{ ++ struct buffer_head *bh; ++ unsigned int block = EXT2_I(inode)->i_file_acl; ++ ++ if (!block) ++ return; ++ down(&ext2_xattr_sem); ++ ++ bh = sb_bread(inode->i_sb, block); ++ if (!bh) { ++ ext2_error(inode->i_sb, "ext2_xattr_delete_inode", ++ "inode %ld: block %d read error", inode->i_ino, block); ++ goto cleanup; ++ } ++ ea_bdebug(bh, "b_count=%d", atomic_read(&(bh->b_count))); ++ if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) || ++ HDR(bh)->h_blocks != cpu_to_le32(1)) { ++ ext2_error(inode->i_sb, "ext2_xattr_delete_inode", ++ "inode %ld: bad block %d", inode->i_ino, block); ++ goto cleanup; ++ } ++ ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount) - 1); ++ if (HDR(bh)->h_refcount == cpu_to_le32(1)) { ++ ext2_xattr_cache_remove(bh); ++ ext2_xattr_free_block(inode, block); ++ bforget(bh); ++ bh = NULL; ++ } else { ++ HDR(bh)->h_refcount = cpu_to_le32( ++ le32_to_cpu(HDR(bh)->h_refcount) - 1); ++ mark_buffer_dirty(bh); ++ if (IS_SYNC(inode)) { ++ ll_rw_block(WRITE, 1, &bh); ++ wait_on_buffer(bh); ++ } ++ ext2_xattr_quota_free(inode); ++ } ++ EXT2_I(inode)->i_file_acl = 0; ++ ++cleanup: ++ brelse(bh); ++ up(&ext2_xattr_sem); ++} ++ ++/* ++ * ext2_xattr_put_super() ++ * ++ * This is called when a file system is unmounted. ++ */ ++void ++ext2_xattr_put_super(struct super_block *sb) ++{ ++#ifdef CONFIG_EXT2_FS_XATTR_SHARING ++ mb_cache_shrink(ext2_xattr_cache, sb->s_dev); ++#endif ++} ++ ++#ifdef CONFIG_EXT2_FS_XATTR_SHARING ++ ++/* ++ * ext2_xattr_cache_insert() ++ * ++ * Create a new entry in the extended attribute cache, and insert ++ * it unless such an entry is already in the cache. ++ * ++ * Returns 0, or a negative error number on failure. ++ */ ++static int ++ext2_xattr_cache_insert(struct buffer_head *bh) ++{ ++ __u32 hash = le32_to_cpu(HDR(bh)->h_hash); ++ struct mb_cache_entry *ce; ++ int error; ++ ++ ce = mb_cache_entry_alloc(ext2_xattr_cache); ++ if (!ce) ++ return -ENOMEM; ++ error = mb_cache_entry_insert(ce, bh->b_dev, bh->b_blocknr, &hash); ++ if (error) { ++ mb_cache_entry_free(ce); ++ if (error == -EBUSY) { ++ ea_bdebug(bh, "already in cache (%d cache entries)", ++ atomic_read(&ext2_xattr_cache->c_entry_count)); ++ error = 0; ++ } ++ } else { ++ ea_bdebug(bh, "inserting [%x] (%d cache entries)", (int)hash, ++ atomic_read(&ext2_xattr_cache->c_entry_count)); ++ mb_cache_entry_release(ce); ++ } ++ return error; ++} ++ ++/* ++ * ext2_xattr_cmp() ++ * ++ * Compare two extended attribute blocks for equality. ++ * ++ * Returns 0 if the blocks are equal, 1 if they differ, and ++ * a negative error number on errors. 
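++ * A match here is what allows inodes with identical attribute sets
++ * to share a single on-disk block through the cache.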
++ */ ++static int ++ext2_xattr_cmp(struct ext2_xattr_header *header1, ++ struct ext2_xattr_header *header2) ++{ ++ struct ext2_xattr_entry *entry1, *entry2; ++ ++ entry1 = ENTRY(header1+1); ++ entry2 = ENTRY(header2+1); ++ while (!IS_LAST_ENTRY(entry1)) { ++ if (IS_LAST_ENTRY(entry2)) ++ return 1; ++ if (entry1->e_hash != entry2->e_hash || ++ entry1->e_name_len != entry2->e_name_len || ++ entry1->e_value_size != entry2->e_value_size || ++ memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len)) ++ return 1; ++ if (entry1->e_value_block != 0 || entry2->e_value_block != 0) ++ return -EIO; ++ if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), ++ (char *)header2 + le16_to_cpu(entry2->e_value_offs), ++ le32_to_cpu(entry1->e_value_size))) ++ return 1; ++ ++ entry1 = EXT2_XATTR_NEXT(entry1); ++ entry2 = EXT2_XATTR_NEXT(entry2); ++ } ++ if (!IS_LAST_ENTRY(entry2)) ++ return 1; ++ return 0; ++} ++ ++/* ++ * ext2_xattr_cache_find() ++ * ++ * Find an identical extended attribute block. ++ * ++ * Returns a pointer to the block found, or NULL if such a block was ++ * not found or an error occurred. ++ */ ++static struct buffer_head * ++ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header) ++{ ++ __u32 hash = le32_to_cpu(header->h_hash); ++ struct mb_cache_entry *ce; ++ ++ if (!header->h_hash) ++ return NULL; /* never share */ ++ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); ++ ce = mb_cache_entry_find_first(ext2_xattr_cache, 0, inode->i_dev, hash); ++ while (ce) { ++ struct buffer_head *bh = sb_bread(inode->i_sb, ce->e_block); ++ ++ if (!bh) { ++ ext2_error(inode->i_sb, "ext2_xattr_cache_find", ++ "inode %ld: block %ld read error", ++ inode->i_ino, ce->e_block); ++ } else if (le32_to_cpu(HDR(bh)->h_refcount) > ++ EXT2_XATTR_REFCOUNT_MAX) { ++ ea_idebug(inode, "block %ld refcount %d>%d",ce->e_block, ++ le32_to_cpu(HDR(bh)->h_refcount), ++ EXT2_XATTR_REFCOUNT_MAX); ++ } else if (!ext2_xattr_cmp(header, HDR(bh))) { ++ ea_bdebug(bh, "b_count=%d",atomic_read(&(bh->b_count))); ++ mb_cache_entry_release(ce); ++ return bh; ++ } ++ brelse(bh); ++ ce = mb_cache_entry_find_next(ce, 0, inode->i_dev, hash); ++ } ++ return NULL; ++} ++ ++/* ++ * ext2_xattr_cache_remove() ++ * ++ * Remove the cache entry of a block from the cache. Called when a ++ * block becomes invalid. ++ */ ++static void ++ext2_xattr_cache_remove(struct buffer_head *bh) ++{ ++ struct mb_cache_entry *ce; ++ ++ ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_dev, bh->b_blocknr); ++ if (ce) { ++ ea_bdebug(bh, "removing (%d cache entries remaining)", ++ atomic_read(&ext2_xattr_cache->c_entry_count)-1); ++ mb_cache_entry_free(ce); ++ } else ++ ea_bdebug(bh, "no cache entry"); ++} ++ ++#define NAME_HASH_SHIFT 5 ++#define VALUE_HASH_SHIFT 16 ++ ++/* ++ * ext2_xattr_hash_entry() ++ * ++ * Compute the hash of an extended attribute. 
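++ * The name is folded into the hash 5 bits at a time and the value
++ * 16 bits at a time (see NAME_HASH_SHIFT and VALUE_HASH_SHIFT above),
++ * so blocks with identical contents always end up with identical hashes.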
++ */ ++static inline void ext2_xattr_hash_entry(struct ext2_xattr_header *header, ++ struct ext2_xattr_entry *entry) ++{ ++ __u32 hash = 0; ++ char *name = entry->e_name; ++ int n; ++ ++ for (n=0; n < entry->e_name_len; n++) { ++ hash = (hash << NAME_HASH_SHIFT) ^ ++ (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ ++ *name++; ++ } ++ ++ if (entry->e_value_block == 0 && entry->e_value_size != 0) { ++ __u32 *value = (__u32 *)((char *)header + ++ le16_to_cpu(entry->e_value_offs)); ++ for (n = (le32_to_cpu(entry->e_value_size) + ++ EXT2_XATTR_ROUND) >> EXT2_XATTR_PAD_BITS; n; n--) { ++ hash = (hash << VALUE_HASH_SHIFT) ^ ++ (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ ++ le32_to_cpu(*value++); ++ } ++ } ++ entry->e_hash = cpu_to_le32(hash); ++} ++ ++#undef NAME_HASH_SHIFT ++#undef VALUE_HASH_SHIFT ++ ++#define BLOCK_HASH_SHIFT 16 ++ ++/* ++ * ext2_xattr_rehash() ++ * ++ * Re-compute the extended attribute hash value after an entry has changed. ++ */ ++static void ext2_xattr_rehash(struct ext2_xattr_header *header, ++ struct ext2_xattr_entry *entry) ++{ ++ struct ext2_xattr_entry *here; ++ __u32 hash = 0; ++ ++ ext2_xattr_hash_entry(header, entry); ++ here = ENTRY(header+1); ++ while (!IS_LAST_ENTRY(here)) { ++ if (!here->e_hash) { ++ /* Block is not shared if an entry's hash value == 0 */ ++ hash = 0; ++ break; ++ } ++ hash = (hash << BLOCK_HASH_SHIFT) ^ ++ (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^ ++ le32_to_cpu(here->e_hash); ++ here = EXT2_XATTR_NEXT(here); ++ } ++ header->h_hash = cpu_to_le32(hash); ++} ++ ++#undef BLOCK_HASH_SHIFT ++ ++int __init ++init_ext2_xattr(void) ++{ ++ ext2_xattr_cache = mb_cache_create("ext2_xattr", NULL, ++ sizeof(struct mb_cache_entry) + ++ sizeof(struct mb_cache_entry_index), 1, 61); ++ if (!ext2_xattr_cache) ++ return -ENOMEM; ++ ++ return 0; ++} ++ ++void ++exit_ext2_xattr(void) ++{ ++ mb_cache_destroy(ext2_xattr_cache); ++} ++ ++#else /* CONFIG_EXT2_FS_XATTR_SHARING */ ++ ++int __init ++init_ext2_xattr(void) ++{ ++ return 0; ++} ++ ++void ++exit_ext2_xattr(void) ++{ ++} ++ ++#endif /* CONFIG_EXT2_FS_XATTR_SHARING */ +Index: linux-DRV401/fs/ext2/xattr_user.c +=================================================================== +--- linux-DRV401.orig/fs/ext2/xattr_user.c 2004-10-12 08:56:38.404764448 -0700 ++++ linux-DRV401/fs/ext2/xattr_user.c 2004-10-15 11:03:51.000000000 -0700 +@@ -0,0 +1,103 @@ ++/* ++ * linux/fs/ext2/xattr_user.c ++ * Handler for extended user attributes. ++ * ++ * Copyright (C) 2001 by Andreas Gruenbacher, ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef CONFIG_EXT2_FS_POSIX_ACL ++# include ++#endif ++ ++#define XATTR_USER_PREFIX "user." 
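++
++/*
++ * The handlers below see attribute names with the "user." prefix
++ * already stripped by ext2_xattr_resolve_name().
++ */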
++ ++static size_t ++ext2_xattr_user_list(char *list, struct inode *inode, ++ const char *name, int name_len) ++{ ++ const int prefix_len = sizeof(XATTR_USER_PREFIX)-1; ++ ++ if (!test_opt(inode->i_sb, XATTR_USER)) ++ return 0; ++ ++ if (list) { ++ memcpy(list, XATTR_USER_PREFIX, prefix_len); ++ memcpy(list+prefix_len, name, name_len); ++ list[prefix_len + name_len] = '\0'; ++ } ++ return prefix_len + name_len + 1; ++} ++ ++static int ++ext2_xattr_user_get(struct inode *inode, const char *name, ++ void *buffer, size_t size) ++{ ++ int error; ++ ++ if (strcmp(name, "") == 0) ++ return -EINVAL; ++ if (!test_opt(inode->i_sb, XATTR_USER)) ++ return -ENOTSUP; ++#ifdef CONFIG_EXT2_FS_POSIX_ACL ++ error = ext2_permission_locked(inode, MAY_READ); ++#else ++ error = permission(inode, MAY_READ); ++#endif ++ if (error) ++ return error; ++ ++ return ext2_xattr_get(inode, EXT2_XATTR_INDEX_USER, name, ++ buffer, size); ++} ++ ++static int ++ext2_xattr_user_set(struct inode *inode, const char *name, ++ const void *value, size_t size, int flags) ++{ ++ int error; ++ ++ if (strcmp(name, "") == 0) ++ return -EINVAL; ++ if (!test_opt(inode->i_sb, XATTR_USER)) ++ return -ENOTSUP; ++ if ( !S_ISREG(inode->i_mode) && ++ (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX)) ++ return -EPERM; ++#ifdef CONFIG_EXT2_FS_POSIX_ACL ++ error = ext2_permission_locked(inode, MAY_WRITE); ++#else ++ error = permission(inode, MAY_WRITE); ++#endif ++ if (error) ++ return error; ++ ++ return ext2_xattr_set(inode, EXT2_XATTR_INDEX_USER, name, ++ value, size, flags); ++} ++ ++struct ext2_xattr_handler ext2_xattr_user_handler = { ++ prefix: XATTR_USER_PREFIX, ++ list: ext2_xattr_user_list, ++ get: ext2_xattr_user_get, ++ set: ext2_xattr_user_set, ++}; ++ ++int __init ++init_ext2_xattr_user(void) ++{ ++ return ext2_xattr_register(EXT2_XATTR_INDEX_USER, ++ &ext2_xattr_user_handler); ++} ++ ++void ++exit_ext2_xattr_user(void) ++{ ++ ext2_xattr_unregister(EXT2_XATTR_INDEX_USER, ++ &ext2_xattr_user_handler); ++} +Index: linux-DRV401/fs/ext3/Makefile +=================================================================== +--- linux-DRV401.orig/fs/ext3/Makefile 2004-10-15 10:39:16.000000000 -0700 ++++ linux-DRV401/fs/ext3/Makefile 2004-10-15 11:03:51.000000000 -0700 +@@ -1,5 +1,5 @@ + # +-# Makefile for the linux ext2-filesystem routines. ++# Makefile for the linux ext3-filesystem routines. + # + # Note! Dependencies are done automagically by 'make dep', which also + # removes any old dependencies. 
DON'T put your own dependencies here +@@ -9,8 +9,14 @@ + + O_TARGET := ext3.o + ++export-objs := ext3-exports.o ++ + obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ +- ioctl.o namei.o super.o symlink.o hash.o ++ ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o + obj-m := $(O_TARGET) + ++export-objs += xattr.o ++obj-$(CONFIG_EXT3_FS_XATTR) += xattr.o ++obj-$(CONFIG_EXT3_FS_XATTR_USER) += xattr_user.o ++ + include $(TOPDIR)/Rules.make +Index: linux-DRV401/fs/ext3/file.c +=================================================================== +--- linux-DRV401.orig/fs/ext3/file.c 2004-10-15 10:39:16.000000000 -0700 ++++ linux-DRV401/fs/ext3/file.c 2004-10-15 11:03:51.000000000 -0700 +@@ -23,6 +23,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -93,5 +94,9 @@ + struct inode_operations ext3_file_inode_operations = { + truncate: ext3_truncate, /* BKL held */ + setattr: ext3_setattr, /* BKL held */ ++ setxattr: ext3_setxattr, /* BKL held */ ++ getxattr: ext3_getxattr, /* BKL held */ ++ listxattr: ext3_listxattr, /* BKL held */ ++ removexattr: ext3_removexattr, /* BKL held */ + }; + +Index: linux-DRV401/fs/ext3/ialloc.c +=================================================================== +--- linux-DRV401.orig/fs/ext3/ialloc.c 2004-10-15 10:24:00.000000000 -0700 ++++ linux-DRV401/fs/ext3/ialloc.c 2004-10-15 11:03:52.000000000 -0700 +@@ -17,6 +17,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -216,6 +217,7 @@ + * as writing the quota to disk may need the lock as well. + */ + DQUOT_INIT(inode); ++ ext3_xattr_delete_inode(handle, inode); + DQUOT_FREE_INODE(inode); + DQUOT_DROP(inode); + +Index: linux-DRV401/fs/ext3/inode.c +=================================================================== +--- linux-DRV401.orig/fs/ext3/inode.c 2004-10-15 10:24:00.000000000 -0700 ++++ linux-DRV401/fs/ext3/inode.c 2004-10-15 11:03:52.000000000 -0700 +@@ -39,6 +39,18 @@ + */ + #undef SEARCH_FROM_ZERO + ++/* ++ * Test whether an inode is a fast symlink. ++ */ ++static inline int ext3_inode_is_fast_symlink(struct inode *inode) ++{ ++ int ea_blocks = inode->u.ext3_i.i_file_acl ? ++ (inode->i_sb->s_blocksize >> 9) : 0; ++ ++ return (S_ISLNK(inode->i_mode) && ++ inode->i_blocks - ea_blocks == 0); ++} ++ + /* The ext3 forget function must perform a revoke if we are freeing data + * which has been journaled. Metadata (eg. indirect blocks) must be + * revoked in all cases. +@@ -48,7 +60,7 @@ + * still needs to be revoked. 
+ */ + +-static int ext3_forget(handle_t *handle, int is_metadata, ++int ext3_forget(handle_t *handle, int is_metadata, + struct inode *inode, struct buffer_head *bh, + int blocknr) + { +@@ -164,9 +176,7 @@ + { + handle_t *handle; + +- if (is_bad_inode(inode) || +- inode->i_ino == EXT3_ACL_IDX_INO || +- inode->i_ino == EXT3_ACL_DATA_INO) ++ if (is_bad_inode(inode)) + goto no_delete; + + lock_kernel(); +@@ -1843,6 +1853,8 @@ + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return; ++ if (ext3_inode_is_fast_symlink(inode)) ++ return; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return; + +@@ -1990,8 +2002,6 @@ + struct ext3_group_desc * gdp; + + if ((inode->i_ino != EXT3_ROOT_INO && +- inode->i_ino != EXT3_ACL_IDX_INO && +- inode->i_ino != EXT3_ACL_DATA_INO && + inode->i_ino != EXT3_JOURNAL_INO && + inode->i_ino < EXT3_FIRST_INO(inode->i_sb)) || + inode->i_ino > le32_to_cpu( +@@ -2118,10 +2128,7 @@ + + brelse (iloc.bh); + +- if (inode->i_ino == EXT3_ACL_IDX_INO || +- inode->i_ino == EXT3_ACL_DATA_INO) +- /* Nothing to do */ ; +- else if (S_ISREG(inode->i_mode)) { ++ if (S_ISREG(inode->i_mode)) { + inode->i_op = &ext3_file_inode_operations; + inode->i_fop = &ext3_file_operations; + inode->i_mapping->a_ops = &ext3_aops; +@@ -2129,15 +2136,17 @@ + inode->i_op = &ext3_dir_inode_operations; + inode->i_fop = &ext3_dir_operations; + } else if (S_ISLNK(inode->i_mode)) { +- if (!inode->i_blocks) ++ if (ext3_inode_is_fast_symlink(inode)) + inode->i_op = &ext3_fast_symlink_inode_operations; + else { +- inode->i_op = &page_symlink_inode_operations; ++ inode->i_op = &ext3_symlink_inode_operations; + inode->i_mapping->a_ops = &ext3_aops; + } +- } else ++ } else { ++ inode->i_op = &ext3_special_inode_operations; + init_special_inode(inode, inode->i_mode, + le32_to_cpu(iloc.raw_inode->i_block[0])); ++ } + /* inode->i_attr_flags = 0; unused */ + if (inode->u.ext3_i.i_flags & EXT3_SYNC_FL) { + /* inode->i_attr_flags |= ATTR_FLAG_SYNCRONOUS; unused */ +Index: linux-DRV401/fs/ext3/namei.c +=================================================================== +--- linux-DRV401.orig/fs/ext3/namei.c 2004-10-15 10:39:16.000000000 -0700 ++++ linux-DRV401/fs/ext3/namei.c 2004-10-15 11:03:52.000000000 -0700 +@@ -29,6 +29,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -1612,7 +1613,7 @@ + if (IS_SYNC(dir)) + handle->h_sync = 1; + +- inode = ext3_new_inode (handle, dir, S_IFDIR); ++ inode = ext3_new_inode (handle, dir, S_IFDIR | mode); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_stop; +@@ -1620,7 +1621,6 @@ + inode->i_op = &ext3_dir_inode_operations; + inode->i_fop = &ext3_dir_operations; + inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize; +- inode->i_blocks = 0; + dir_block = ext3_bread (handle, inode, 0, 1, &err); + if (!dir_block) { + inode->i_nlink--; /* is this nlink == 0? 
*/ +@@ -1647,9 +1647,6 @@ + BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata"); + ext3_journal_dirty_metadata(handle, dir_block); + brelse (dir_block); +- inode->i_mode = S_IFDIR | mode; +- if (dir->i_mode & S_ISGID) +- inode->i_mode |= S_ISGID; + ext3_mark_inode_dirty(handle, inode); + err = ext3_add_entry (handle, dentry, inode); + if (err) { +@@ -2018,7 +2015,7 @@ + goto out_stop; + + if (l > sizeof (EXT3_I(inode)->i_data)) { +- inode->i_op = &page_symlink_inode_operations; ++ inode->i_op = &ext3_symlink_inode_operations; + inode->i_mapping->a_ops = &ext3_aops; + /* + * block_symlink() calls back into ext3_prepare/commit_write. +@@ -2245,4 +2242,16 @@ + rmdir: ext3_rmdir, /* BKL held */ + mknod: ext3_mknod, /* BKL held */ + rename: ext3_rename, /* BKL held */ ++ setxattr: ext3_setxattr, /* BKL held */ ++ getxattr: ext3_getxattr, /* BKL held */ ++ listxattr: ext3_listxattr, /* BKL held */ ++ removexattr: ext3_removexattr, /* BKL held */ + }; ++ ++struct inode_operations ext3_special_inode_operations = { ++ setxattr: ext3_setxattr, /* BKL held */ ++ getxattr: ext3_getxattr, /* BKL held */ ++ listxattr: ext3_listxattr, /* BKL held */ ++ removexattr: ext3_removexattr, /* BKL held */ ++}; ++ +Index: linux-DRV401/fs/ext3/super.c +=================================================================== +--- linux-DRV401.orig/fs/ext3/super.c 2004-10-15 10:39:16.000000000 -0700 ++++ linux-DRV401/fs/ext3/super.c 2004-10-15 11:03:52.000000000 -0700 +@@ -24,6 +24,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -404,6 +405,7 @@ + kdev_t j_dev = sbi->s_journal->j_dev; + int i; + ++ ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); + if (!(sb->s_flags & MS_RDONLY)) { + EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); +@@ -499,6 +501,7 @@ + int is_remount) + { + unsigned long *mount_options = &sbi->s_mount_opt; ++ + uid_t *resuid = &sbi->s_resuid; + gid_t *resgid = &sbi->s_resgid; + char * this_char; +@@ -511,6 +514,13 @@ + this_char = strtok (NULL, ",")) { + if ((value = strchr (this_char, '=')) != NULL) + *value++ = 0; ++#ifdef CONFIG_EXT3_FS_XATTR_USER ++ if (!strcmp (this_char, "user_xattr")) ++ set_opt (*mount_options, XATTR_USER); ++ else if (!strcmp (this_char, "nouser_xattr")) ++ clear_opt (*mount_options, XATTR_USER); ++ else ++#endif + if (!strcmp (this_char, "bsddf")) + clear_opt (*mount_options, MINIX_DF); + else if (!strcmp (this_char, "nouid32")) { +@@ -924,6 +934,12 @@ + sbi->s_mount_opt = 0; + sbi->s_resuid = EXT3_DEF_RESUID; + sbi->s_resgid = EXT3_DEF_RESGID; ++ ++ /* Default extended attribute flags */ ++#ifdef CONFIG_EXT3_FS_XATTR_USER ++ /* set_opt(sbi->s_mount_opt, XATTR_USER); */ ++#endif ++ + if (!parse_options ((char *) data, &sb_block, sbi, &journal_inum, 0)) { + sb->s_dev = 0; + goto out_fail; +@@ -1742,12 +1758,27 @@ + + static int __init init_ext3_fs(void) + { +- return register_filesystem(&ext3_fs_type); ++ int error = init_ext3_xattr(); ++ if (error) ++ return error; ++ error = init_ext3_xattr_user(); ++ if (error) ++ goto fail; ++ error = register_filesystem(&ext3_fs_type); ++ if (!error) ++ return 0; ++ ++ exit_ext3_xattr_user(); ++fail: ++ exit_ext3_xattr(); ++ return error; + } + + static void __exit exit_ext3_fs(void) + { + unregister_filesystem(&ext3_fs_type); ++ exit_ext3_xattr_user(); ++ exit_ext3_xattr(); + } + + EXPORT_SYMBOL(ext3_force_commit); +Index: linux-DRV401/fs/ext3/symlink.c +=================================================================== +--- linux-DRV401.orig/fs/ext3/symlink.c 
2004-10-15 10:24:00.000000000 -0700
++++ linux-DRV401/fs/ext3/symlink.c	2004-10-15 11:03:52.000000000 -0700
+@@ -20,6 +20,7 @@
+ #include
+ #include
+ #include
++#include
+ 
+ static int ext3_readlink(struct dentry *dentry, char *buffer, int buflen)
+ {
+@@ -33,7 +34,20 @@
+ 	return vfs_follow_link(nd, s);
+ }
+ 
++struct inode_operations ext3_symlink_inode_operations = {
++	readlink:	page_readlink,		/* BKL not held.  Don't need */
++	follow_link:	page_follow_link,	/* BKL not held.  Don't need */
++	setxattr:	ext3_setxattr,		/* BKL held */
++	getxattr:	ext3_getxattr,		/* BKL held */
++	listxattr:	ext3_listxattr,		/* BKL held */
++	removexattr:	ext3_removexattr,	/* BKL held */
++};
++
+ struct inode_operations ext3_fast_symlink_inode_operations = {
+ 	readlink:	ext3_readlink,		/* BKL not held.  Don't need */
+ 	follow_link:	ext3_follow_link,	/* BKL not held.  Don't need */
++	setxattr:	ext3_setxattr,		/* BKL held */
++	getxattr:	ext3_getxattr,		/* BKL held */
++	listxattr:	ext3_listxattr,		/* BKL held */
++	removexattr:	ext3_removexattr,	/* BKL held */
+ };
+Index: linux-DRV401/fs/ext3/xattr.c
+===================================================================
+--- linux-DRV401.orig/fs/ext3/xattr.c	2004-10-12 08:56:38.404764448 -0700
++++ linux-DRV401/fs/ext3/xattr.c	2004-10-15 11:03:52.000000000 -0700
+@@ -0,0 +1,1225 @@
++/*
++ * linux/fs/ext3/xattr.c
++ *
++ * Copyright (C) 2001 by Andreas Gruenbacher,
++ *
++ * Fix by Harrison Xing .
++ * Ext3 code with a lot of help from Eric Jarman .
++ * Extended attributes for symlinks and special files added per
++ * suggestion of Luka Renko .
++ */
++
++/*
++ * Extended attributes are stored on disk blocks allocated outside of
++ * any inode. The i_file_acl field is then made to point to this allocated
++ * block. If all extended attributes of an inode are identical, these
++ * inodes may share the same extended attribute block. Such situations
++ * are automatically detected by keeping a cache of recent attribute block
++ * numbers and hashes over the block's contents in memory.
++ *
++ *
++ * Extended attribute block layout:
++ *
++ *   +------------------+
++ *   | header           |
++ *   | entry 1          | |
++ *   | entry 2          | | growing downwards
++ *   | entry 3          | v
++ *   | four null bytes  |
++ *   | . . .            |
++ *   | value 1          | ^
++ *   | value 3          | | growing upwards
++ *   | value 2          | |
++ *   +------------------+
++ *
++ * The block header is followed by multiple entry descriptors. These entry
++ * descriptors are variable in size, and aligned to EXT3_XATTR_PAD
++ * byte boundaries. The entry descriptors are sorted by attribute name,
++ * so that two extended attribute blocks can be compared efficiently.
++ *
++ * Attribute values are aligned to the end of the block, stored in
++ * no specific order. They are also padded to EXT3_XATTR_PAD byte
++ * boundaries. No additional gaps are left between them.
++ *
++ * Locking strategy
++ * ----------------
++ * The VFS already holds the BKL and the inode->i_sem semaphore when any of
++ * the xattr inode operations are called, so we are guaranteed that only one
++ * process accesses extended attributes of an inode at any time.
++ *
++ * For writing we also grab the ext3_xattr_sem semaphore. This ensures that
++ * only a single process is modifying an extended attribute block, even
++ * if the block is shared among inodes.
++ *
++ * Note for porting to 2.5
++ * -----------------------
++ * The BKL will no longer be held in the xattr inode operations.
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define EXT3_EA_USER "user." ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0) ++# define mark_buffer_dirty(bh) mark_buffer_dirty(bh, 1) ++#endif ++ ++#define HDR(bh) ((struct ext3_xattr_header *)((bh)->b_data)) ++#define ENTRY(ptr) ((struct ext3_xattr_entry *)(ptr)) ++#define FIRST_ENTRY(bh) ENTRY(HDR(bh)+1) ++#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) ++ ++#ifdef EXT3_XATTR_DEBUG ++# define ea_idebug(inode, f...) do { \ ++ printk(KERN_DEBUG "inode %s:%ld: ", \ ++ kdevname(inode->i_dev), inode->i_ino); \ ++ printk(f); \ ++ printk("\n"); \ ++ } while (0) ++# define ea_bdebug(bh, f...) do { \ ++ printk(KERN_DEBUG "block %s:%ld: ", \ ++ kdevname(bh->b_dev), bh->b_blocknr); \ ++ printk(f); \ ++ printk("\n"); \ ++ } while (0) ++#else ++# define ea_idebug(f...) ++# define ea_bdebug(f...) ++#endif ++ ++static int ext3_xattr_set2(handle_t *, struct inode *, struct buffer_head *, ++ struct ext3_xattr_header *); ++ ++#ifdef CONFIG_EXT3_FS_XATTR_SHARING ++ ++static int ext3_xattr_cache_insert(struct buffer_head *); ++static struct buffer_head *ext3_xattr_cache_find(struct inode *, ++ struct ext3_xattr_header *); ++static void ext3_xattr_cache_remove(struct buffer_head *); ++static void ext3_xattr_rehash(struct ext3_xattr_header *, ++ struct ext3_xattr_entry *); ++ ++static struct mb_cache *ext3_xattr_cache; ++ ++#else ++# define ext3_xattr_cache_insert(bh) 0 ++# define ext3_xattr_cache_find(inode, header) NULL ++# define ext3_xattr_cache_remove(bh) while(0) {} ++# define ext3_xattr_rehash(header, entry) while(0) {} ++#endif ++ ++/* ++ * If a file system does not share extended attributes among inodes, ++ * we should not need the ext3_xattr_sem semaphore. However, the ++ * filesystem may still contain shared blocks, so we always take ++ * the lock. ++ */ ++ ++DECLARE_MUTEX(ext3_xattr_sem); ++ ++static inline int ++ext3_xattr_new_block(handle_t *handle, struct inode *inode, ++ int * errp, int force) ++{ ++ struct super_block *sb = inode->i_sb; ++ int goal = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) + ++ EXT3_I(inode)->i_block_group * EXT3_BLOCKS_PER_GROUP(sb); ++ ++ /* How can we enforce the allocation? */ ++ int block = ext3_new_block(handle, inode, goal, 0, 0, errp); ++#ifdef OLD_QUOTAS ++ if (!*errp) ++ inode->i_blocks += inode->i_sb->s_blocksize >> 9; ++#endif ++ return block; ++} ++ ++static inline int ++ext3_xattr_quota_alloc(struct inode *inode, int force) ++{ ++ /* How can we enforce the allocation? 
*/
++#ifdef OLD_QUOTAS
++	int error = DQUOT_ALLOC_BLOCK(inode->i_sb, inode, 1);
++	if (!error)
++		inode->i_blocks += inode->i_sb->s_blocksize >> 9;
++#else
++	int error = DQUOT_ALLOC_BLOCK(inode, 1);
++#endif
++	return error;
++}
++
++#ifdef OLD_QUOTAS
++
++static inline void
++ext3_xattr_quota_free(struct inode *inode)
++{
++	DQUOT_FREE_BLOCK(inode->i_sb, inode, 1);
++	inode->i_blocks -= inode->i_sb->s_blocksize >> 9;
++}
++
++static inline void
++ext3_xattr_free_block(handle_t *handle, struct inode * inode,
++		      unsigned long block)
++{
++	ext3_free_blocks(handle, inode, block, 1);
++	inode->i_blocks -= inode->i_sb->s_blocksize >> 9;
++}
++
++#else
++# define ext3_xattr_quota_free(inode) \
++	DQUOT_FREE_BLOCK(inode, 1)
++# define ext3_xattr_free_block(handle, inode, block) \
++	ext3_free_blocks(handle, inode, block, 1)
++#endif
++
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,18)
++
++static inline struct buffer_head *
++sb_bread(struct super_block *sb, int block)
++{
++	return bread(sb->s_dev, block, sb->s_blocksize);
++}
++
++static inline struct buffer_head *
++sb_getblk(struct super_block *sb, int block)
++{
++	return getblk(sb->s_dev, block, sb->s_blocksize);
++}
++
++#endif
++
++struct ext3_xattr_handler *ext3_xattr_handlers[EXT3_XATTR_INDEX_MAX];
++rwlock_t ext3_handler_lock = RW_LOCK_UNLOCKED;
++
++int
++ext3_xattr_register(int name_index, struct ext3_xattr_handler *handler)
++{
++	int error = -EINVAL;
++
++	if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) {
++		write_lock(&ext3_handler_lock);
++		if (!ext3_xattr_handlers[name_index-1]) {
++			ext3_xattr_handlers[name_index-1] = handler;
++			error = 0;
++		}
++		write_unlock(&ext3_handler_lock);
++	}
++	return error;
++}
++
++void
++ext3_xattr_unregister(int name_index, struct ext3_xattr_handler *handler)
++{
++	if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) {
++		write_lock(&ext3_handler_lock);
++		ext3_xattr_handlers[name_index-1] = NULL;
++		write_unlock(&ext3_handler_lock);
++	}
++}
++
++static inline const char *
++strcmp_prefix(const char *a, const char *a_prefix)
++{
++	while (*a_prefix && *a == *a_prefix) {
++		a++;
++		a_prefix++;
++	}
++	return *a_prefix ? NULL : a;
++}
++
++/*
++ * Decode the extended attribute name, and translate it into
++ * the name_index and name suffix.
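++ * For example, "user.mime_type" selects the handler registered for
++ * the "user." prefix and leaves the suffix "mime_type" in *name.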
++ */
++static inline struct ext3_xattr_handler *
++ext3_xattr_resolve_name(const char **name)
++{
++	struct ext3_xattr_handler *handler = NULL;
++	int i;
++
++	if (!*name)
++		return NULL;
++	read_lock(&ext3_handler_lock);
++	for (i=0; i < EXT3_XATTR_INDEX_MAX; i++) {
++		if (ext3_xattr_handlers[i]) {
++			const char *n = strcmp_prefix(*name,
++				ext3_xattr_handlers[i]->prefix);
++			if (n) {
++				handler = ext3_xattr_handlers[i];
++				*name = n;
++				break;
++			}
++		}
++	}
++	read_unlock(&ext3_handler_lock);
++	return handler;
++}
++
++static inline struct ext3_xattr_handler *
++ext3_xattr_handler(int name_index)
++{
++	struct ext3_xattr_handler *handler = NULL;
++	if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) {
++		read_lock(&ext3_handler_lock);
++		handler = ext3_xattr_handlers[name_index-1];
++		read_unlock(&ext3_handler_lock);
++	}
++	return handler;
++}
++
++/*
++ * Inode operation getxattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++ssize_t
++ext3_getxattr(struct dentry *dentry, const char *name,
++	      void *buffer, size_t size)
++{
++	struct ext3_xattr_handler *handler;
++	struct inode *inode = dentry->d_inode;
++
++	handler = ext3_xattr_resolve_name(&name);
++	if (!handler)
++		return -ENOTSUP;
++	return handler->get(inode, name, buffer, size);
++}
++
++/*
++ * Inode operation listxattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++ssize_t
++ext3_listxattr(struct dentry *dentry, char *buffer, size_t size)
++{
++	return ext3_xattr_list(dentry->d_inode, buffer, size);
++}
++
++/*
++ * Inode operation setxattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++int
++ext3_setxattr(struct dentry *dentry, const char *name,
++	      const void *value, size_t size, int flags)
++{
++	struct ext3_xattr_handler *handler;
++	struct inode *inode = dentry->d_inode;
++
++	if (size == 0)
++		value = "";  /* empty EA, do not remove */
++	handler = ext3_xattr_resolve_name(&name);
++	if (!handler)
++		return -ENOTSUP;
++	return handler->set(inode, name, value, size, flags);
++}
++
++/*
++ * Inode operation removexattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++int
++ext3_removexattr(struct dentry *dentry, const char *name)
++{
++	struct ext3_xattr_handler *handler;
++	struct inode *inode = dentry->d_inode;
++
++	handler = ext3_xattr_resolve_name(&name);
++	if (!handler)
++		return -ENOTSUP;
++	return handler->set(inode, name, NULL, 0, XATTR_REPLACE);
++}
++
++/*
++ * ext3_xattr_get()
++ *
++ * Copy an extended attribute into the buffer
++ * provided, or compute the buffer size required.
++ * Buffer is NULL to compute the size of the buffer required.
++ *
++ * Returns a negative error number on failure, or the number of bytes
++ * used / required on success.
++ */ ++int ++ext3_xattr_get(struct inode *inode, int name_index, const char *name, ++ void *buffer, size_t buffer_size) ++{ ++ struct buffer_head *bh = NULL; ++ struct ext3_xattr_entry *entry; ++ unsigned int block, size; ++ char *end; ++ int name_len, error; ++ ++ ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", ++ name_index, name, buffer, (long)buffer_size); ++ ++ if (name == NULL) ++ return -EINVAL; ++ if (!EXT3_I(inode)->i_file_acl) ++ return -ENOATTR; ++ block = EXT3_I(inode)->i_file_acl; ++ ea_idebug(inode, "reading block %d", block); ++ bh = sb_bread(inode->i_sb, block); ++ if (!bh) ++ return -EIO; ++ ea_bdebug(bh, "b_count=%d, refcount=%d", ++ atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount)); ++ end = bh->b_data + bh->b_size; ++ if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || ++ HDR(bh)->h_blocks != cpu_to_le32(1)) { ++bad_block: ext3_error(inode->i_sb, "ext3_xattr_get", ++ "inode %ld: bad block %d", inode->i_ino, block); ++ error = -EIO; ++ goto cleanup; ++ } ++ /* find named attribute */ ++ name_len = strlen(name); ++ ++ error = -ERANGE; ++ if (name_len > 255) ++ goto cleanup; ++ entry = FIRST_ENTRY(bh); ++ while (!IS_LAST_ENTRY(entry)) { ++ struct ext3_xattr_entry *next = ++ EXT3_XATTR_NEXT(entry); ++ if ((char *)next >= end) ++ goto bad_block; ++ if (name_index == entry->e_name_index && ++ name_len == entry->e_name_len && ++ memcmp(name, entry->e_name, name_len) == 0) ++ goto found; ++ entry = next; ++ } ++ /* Check the remaining name entries */ ++ while (!IS_LAST_ENTRY(entry)) { ++ struct ext3_xattr_entry *next = ++ EXT3_XATTR_NEXT(entry); ++ if ((char *)next >= end) ++ goto bad_block; ++ entry = next; ++ } ++ if (ext3_xattr_cache_insert(bh)) ++ ea_idebug(inode, "cache insert failed"); ++ error = -ENOATTR; ++ goto cleanup; ++found: ++ /* check the buffer size */ ++ if (entry->e_value_block != 0) ++ goto bad_block; ++ size = le32_to_cpu(entry->e_value_size); ++ if (size > inode->i_sb->s_blocksize || ++ le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize) ++ goto bad_block; ++ ++ if (ext3_xattr_cache_insert(bh)) ++ ea_idebug(inode, "cache insert failed"); ++ if (buffer) { ++ error = -ERANGE; ++ if (size > buffer_size) ++ goto cleanup; ++ /* return value of attribute */ ++ memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs), ++ size); ++ } ++ error = size; ++ ++cleanup: ++ brelse(bh); ++ ++ return error; ++} ++ ++/* ++ * ext3_xattr_list() ++ * ++ * Copy a list of attribute names into the buffer ++ * provided, or compute the buffer size required. ++ * Buffer is NULL to compute the size of the buffer required. ++ * ++ * Returns a negative error number on failure, or the number of bytes ++ * used / required on success. 
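++ * ++ * The names are returned as consecutive NUL-terminated strings; the user ++ * handler, for instance, emits "user.foo\0" for an attribute "foo". A ++ * sketch of walking a successfully filled buffer (illustrative only): ++ * ++ * for (p = buf; p < buf + len; p += strlen(p) + 1) ++ * printk("%s\n", p);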
++ */ ++int ++ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) ++{ ++ struct buffer_head *bh = NULL; ++ struct ext3_xattr_entry *entry; ++ unsigned int block, size = 0; ++ char *buf, *end; ++ int error; ++ ++ ea_idebug(inode, "buffer=%p, buffer_size=%ld", ++ buffer, (long)buffer_size); ++ ++ if (!EXT3_I(inode)->i_file_acl) ++ return 0; ++ block = EXT3_I(inode)->i_file_acl; ++ ea_idebug(inode, "reading block %d", block); ++ bh = sb_bread(inode->i_sb, block); ++ if (!bh) ++ return -EIO; ++ ea_bdebug(bh, "b_count=%d, refcount=%d", ++ atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount)); ++ end = bh->b_data + bh->b_size; ++ if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || ++ HDR(bh)->h_blocks != cpu_to_le32(1)) { ++bad_block: ext3_error(inode->i_sb, "ext3_xattr_list", ++ "inode %ld: bad block %d", inode->i_ino, block); ++ error = -EIO; ++ goto cleanup; ++ } ++ /* compute the size required for the list of attribute names */ ++ for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry); ++ entry = EXT3_XATTR_NEXT(entry)) { ++ struct ext3_xattr_handler *handler; ++ struct ext3_xattr_entry *next = ++ EXT3_XATTR_NEXT(entry); ++ if ((char *)next >= end) ++ goto bad_block; ++ ++ handler = ext3_xattr_handler(entry->e_name_index); ++ if (handler) ++ size += handler->list(NULL, inode, entry->e_name, ++ entry->e_name_len); ++ } ++ ++ if (ext3_xattr_cache_insert(bh)) ++ ea_idebug(inode, "cache insert failed"); ++ if (!buffer) { ++ error = size; ++ goto cleanup; ++ } else { ++ error = -ERANGE; ++ if (size > buffer_size) ++ goto cleanup; ++ } ++ ++ /* list the attribute names */ ++ buf = buffer; ++ for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry); ++ entry = EXT3_XATTR_NEXT(entry)) { ++ struct ext3_xattr_handler *handler; ++ ++ handler = ext3_xattr_handler(entry->e_name_index); ++ if (handler) ++ buf += handler->list(buf, inode, entry->e_name, ++ entry->e_name_len); ++ } ++ error = size; ++ ++cleanup: ++ brelse(bh); ++ ++ return error; ++} ++ ++/* ++ * If the EXT3_FEATURE_COMPAT_EXT_ATTR feature of this file system is ++ * not set, set it. ++ */ ++static void ext3_xattr_update_super_block(handle_t *handle, ++ struct super_block *sb) ++{ ++ if (EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR)) ++ return; ++ ++ lock_super(sb); ++ ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0) ++ EXT3_SB(sb)->s_feature_compat |= EXT3_FEATURE_COMPAT_EXT_ATTR; ++#endif ++ EXT3_SB(sb)->s_es->s_feature_compat |= ++ cpu_to_le32(EXT3_FEATURE_COMPAT_EXT_ATTR); ++ sb->s_dirt = 1; ++ ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); ++ unlock_super(sb); ++} ++ ++/* ++ * ext3_xattr_set() ++ * ++ * Create, replace or remove an extended attribute for this inode. Buffer ++ * is NULL to remove an existing extended attribute, and non-NULL to ++ * either replace an existing extended attribute, or create a new extended ++ * attribute. The flags XATTR_REPLACE and XATTR_CREATE ++ * specify that an extended attribute must exist and must not exist ++ * previous to the call, respectively. ++ * ++ * Returns 0, or a negative error number on failure. 
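++ * ++ * For example (sketch only, with a made-up attribute "foo"): ++ * ++ * ext3_xattr_set(handle, inode, EXT3_XATTR_INDEX_USER, "foo", ++ * "bar", 3, XATTR_CREATE); - fails with -EEXIST if "foo" exists ++ * ext3_xattr_set(handle, inode, EXT3_XATTR_INDEX_USER, "foo", ++ * NULL, 0, 0); - removes "foo", returns 0 if absent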
++ */ ++int ++ext3_xattr_set(handle_t *handle, struct inode *inode, int name_index, ++ const char *name, const void *value, size_t value_len, int flags) ++{ ++ struct super_block *sb = inode->i_sb; ++ struct buffer_head *bh = NULL; ++ struct ext3_xattr_header *header = NULL; ++ struct ext3_xattr_entry *here, *last; ++ unsigned int name_len; ++ int block = EXT3_I(inode)->i_file_acl; ++ int min_offs = sb->s_blocksize, not_found = 1, free, error; ++ char *end; ++ ++ /* ++ * header -- Points either into bh, or to a temporarily ++ * allocated buffer. ++ * here -- The named entry found, or the place for inserting, within ++ * the block pointed to by header. ++ * last -- Points right after the last named entry within the block ++ * pointed to by header. ++ * min_offs -- The offset of the first value (values are aligned ++ * towards the end of the block). ++ * end -- Points right after the block pointed to by header. ++ */ ++ ++ ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", ++ name_index, name, value, (long)value_len); ++ ++ if (IS_RDONLY(inode)) ++ return -EROFS; ++ if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) ++ return -EPERM; ++ if (value == NULL) ++ value_len = 0; ++ if (name == NULL) ++ return -EINVAL; ++ name_len = strlen(name); ++ if (name_len > 255 || value_len > sb->s_blocksize) ++ return -ERANGE; ++ down(&ext3_xattr_sem); ++ ++ if (block) { ++ /* The inode already has an extended attribute block. */ ++ bh = sb_bread(sb, block); ++ error = -EIO; ++ if (!bh) ++ goto cleanup; ++ ea_bdebug(bh, "b_count=%d, refcount=%d", ++ atomic_read(&(bh->b_count)), ++ le32_to_cpu(HDR(bh)->h_refcount)); ++ header = HDR(bh); ++ end = bh->b_data + bh->b_size; ++ if (header->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || ++ header->h_blocks != cpu_to_le32(1)) { ++bad_block: ext3_error(sb, "ext3_xattr_set", ++ "inode %ld: bad block %d", inode->i_ino, block); ++ error = -EIO; ++ goto cleanup; ++ } ++ /* Find the named attribute. */ ++ here = FIRST_ENTRY(bh); ++ while (!IS_LAST_ENTRY(here)) { ++ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(here); ++ if ((char *)next >= end) ++ goto bad_block; ++ if (!here->e_value_block && here->e_value_size) { ++ int offs = le16_to_cpu(here->e_value_offs); ++ if (offs < min_offs) ++ min_offs = offs; ++ } ++ not_found = name_index - here->e_name_index; ++ if (!not_found) ++ not_found = name_len - here->e_name_len; ++ if (!not_found) ++ not_found = memcmp(name, here->e_name,name_len); ++ if (not_found <= 0) ++ break; ++ here = next; ++ } ++ last = here; ++ /* We still need to compute min_offs and last. */ ++ while (!IS_LAST_ENTRY(last)) { ++ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(last); ++ if ((char *)next >= end) ++ goto bad_block; ++ if (!last->e_value_block && last->e_value_size) { ++ int offs = le16_to_cpu(last->e_value_offs); ++ if (offs < min_offs) ++ min_offs = offs; ++ } ++ last = next; ++ } ++ ++ /* Check whether we have enough space left. */ ++ free = min_offs - ((char*)last - (char*)header) - sizeof(__u32); ++ } else { ++ /* We will use a new extended attribute block. */ ++ free = sb->s_blocksize - ++ sizeof(struct ext3_xattr_header) - sizeof(__u32); ++ here = last = NULL; /* avoid gcc uninitialized warning. */ ++ } ++ ++ if (not_found) { ++ /* Request to remove a nonexistent attribute? */ ++ error = -ENOATTR; ++ if (flags & XATTR_REPLACE) ++ goto cleanup; ++ error = 0; ++ if (value == NULL) ++ goto cleanup; ++ else ++ free -= EXT3_XATTR_LEN(name_len); ++ } else { ++ /* Request to create an existing attribute? 
*/ ++ error = -EEXIST; ++ if (flags & XATTR_CREATE) ++ goto cleanup; ++ if (!here->e_value_block && here->e_value_size) { ++ unsigned int size = le32_to_cpu(here->e_value_size); ++ ++ if (le16_to_cpu(here->e_value_offs) + size > ++ sb->s_blocksize || size > sb->s_blocksize) ++ goto bad_block; ++ free += EXT3_XATTR_SIZE(size); ++ } ++ } ++ free -= EXT3_XATTR_SIZE(value_len); ++ error = -ENOSPC; ++ if (free < 0) ++ goto cleanup; ++ ++ /* Here we know that we can set the new attribute. */ ++ ++ if (header) { ++ if (header->h_refcount == cpu_to_le32(1)) { ++ ea_bdebug(bh, "modifying in-place"); ++ ext3_xattr_cache_remove(bh); ++ error = ext3_journal_get_write_access(handle, bh); ++ if (error) ++ goto cleanup; ++ } else { ++ int offset; ++ ++ ea_bdebug(bh, "cloning"); ++ header = kmalloc(bh->b_size, GFP_KERNEL); ++ error = -ENOMEM; ++ if (header == NULL) ++ goto cleanup; ++ memcpy(header, HDR(bh), bh->b_size); ++ header->h_refcount = cpu_to_le32(1); ++ offset = (char *)header - bh->b_data; ++ here = ENTRY((char *)here + offset); ++ last = ENTRY((char *)last + offset); ++ } ++ } else { ++ /* Allocate a buffer where we construct the new block. */ ++ header = kmalloc(sb->s_blocksize, GFP_KERNEL); ++ error = -ENOMEM; ++ if (header == NULL) ++ goto cleanup; ++ memset(header, 0, sb->s_blocksize); ++ end = (char *)header + sb->s_blocksize; ++ header->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC); ++ header->h_blocks = header->h_refcount = cpu_to_le32(1); ++ last = here = ENTRY(header+1); ++ } ++ ++ if (not_found) { ++ /* Insert the new name. */ ++ int size = EXT3_XATTR_LEN(name_len); ++ int rest = (char *)last - (char *)here; ++ memmove((char *)here + size, here, rest); ++ memset(here, 0, size); ++ here->e_name_index = name_index; ++ here->e_name_len = name_len; ++ memcpy(here->e_name, name, name_len); ++ } else { ++ /* Remove the old value. */ ++ if (!here->e_value_block && here->e_value_size) { ++ char *first_val = (char *)header + min_offs; ++ int offs = le16_to_cpu(here->e_value_offs); ++ char *val = (char *)header + offs; ++ size_t size = EXT3_XATTR_SIZE( ++ le32_to_cpu(here->e_value_size)); ++ memmove(first_val + size, first_val, val - first_val); ++ memset(first_val, 0, size); ++ here->e_value_offs = 0; ++ min_offs += size; ++ ++ /* Adjust all value offsets. */ ++ last = ENTRY(header+1); ++ while (!IS_LAST_ENTRY(last)) { ++ int o = le16_to_cpu(last->e_value_offs); ++ if (!last->e_value_block && o < offs) ++ last->e_value_offs = ++ cpu_to_le16(o + size); ++ last = EXT3_XATTR_NEXT(last); ++ } ++ } ++ if (value == NULL) { ++ /* Remove this attribute. */ ++ if (EXT3_XATTR_NEXT(ENTRY(header+1)) == last) { ++ /* This block is now empty. */ ++ error = ext3_xattr_set2(handle, inode, bh,NULL); ++ goto cleanup; ++ } else { ++ /* Remove the old name. */ ++ int size = EXT3_XATTR_LEN(name_len); ++ last = ENTRY((char *)last - size); ++ memmove(here, (char*)here + size, ++ (char*)last - (char*)here); ++ memset(last, 0, size); ++ } ++ } ++ } ++ ++ if (value != NULL) { ++ /* Insert the new value. */ ++ here->e_value_size = cpu_to_le32(value_len); ++ if (value_len) { ++ size_t size = EXT3_XATTR_SIZE(value_len); ++ char *val = (char *)header + min_offs - size; ++ here->e_value_offs = ++ cpu_to_le16((char *)val - (char *)header); ++ memset(val + size - EXT3_XATTR_PAD, 0, ++ EXT3_XATTR_PAD); /* Clear the pad bytes. 
*/ ++ memcpy(val, value, value_len); ++ } ++ } ++ ext3_xattr_rehash(header, here); ++ ++ error = ext3_xattr_set2(handle, inode, bh, header); ++ ++cleanup: ++ brelse(bh); ++ if (!(bh && header == HDR(bh))) ++ kfree(header); ++ up(&ext3_xattr_sem); ++ ++ return error; ++} ++ ++/* ++ * Second half of ext3_xattr_set(): Update the file system. ++ */ ++static int ++ext3_xattr_set2(handle_t *handle, struct inode *inode, ++ struct buffer_head *old_bh, struct ext3_xattr_header *header) ++{ ++ struct super_block *sb = inode->i_sb; ++ struct buffer_head *new_bh = NULL; ++ int error; ++ ++ if (header) { ++ new_bh = ext3_xattr_cache_find(inode, header); ++ if (new_bh) { ++ /* ++ * We found an identical block in the cache. ++ * The old block will be released after updating ++ * the inode. ++ */ ++ ea_bdebug(old_bh, "reusing block %ld", ++ new_bh->b_blocknr); ++ ++ error = -EDQUOT; ++ if (ext3_xattr_quota_alloc(inode, 1)) ++ goto cleanup; ++ ++ error = ext3_journal_get_write_access(handle, new_bh); ++ if (error) ++ goto cleanup; ++ HDR(new_bh)->h_refcount = cpu_to_le32( ++ le32_to_cpu(HDR(new_bh)->h_refcount) + 1); ++ ea_bdebug(new_bh, "refcount now=%d", ++ le32_to_cpu(HDR(new_bh)->h_refcount)); ++ } else if (old_bh && header == HDR(old_bh)) { ++ /* Keep this block. */ ++ new_bh = old_bh; ++ ext3_xattr_cache_insert(new_bh); ++ } else { ++ /* We need to allocate a new block */ ++ int force = EXT3_I(inode)->i_file_acl != 0; ++ int block = ext3_xattr_new_block(handle, inode, ++ &error, force); ++ if (error) ++ goto cleanup; ++ ea_idebug(inode, "creating block %d", block); ++ ++ new_bh = sb_getblk(sb, block); ++ if (!new_bh) { ++getblk_failed: ext3_xattr_free_block(handle, inode, block); ++ error = -EIO; ++ goto cleanup; ++ } ++ lock_buffer(new_bh); ++ error = ext3_journal_get_create_access(handle, new_bh); ++ if (error) { ++ unlock_buffer(new_bh); ++ goto getblk_failed; ++ } ++ memcpy(new_bh->b_data, header, new_bh->b_size); ++ mark_buffer_uptodate(new_bh, 1); ++ unlock_buffer(new_bh); ++ ext3_xattr_cache_insert(new_bh); ++ ++ ext3_xattr_update_super_block(handle, sb); ++ } ++ error = ext3_journal_dirty_metadata(handle, new_bh); ++ if (error) ++ goto cleanup; ++ } ++ ++ /* Update the inode. */ ++ EXT3_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; ++ inode->i_ctime = CURRENT_TIME; ++ ext3_mark_inode_dirty(handle, inode); ++ if (IS_SYNC(inode)) ++ handle->h_sync = 1; ++ ++ error = 0; ++ if (old_bh && old_bh != new_bh) { ++ /* ++ * If there was an old block, and we are not still using it, ++ * we now release the old block. ++ */ ++ unsigned int refcount = le32_to_cpu(HDR(old_bh)->h_refcount); ++ ++ error = ext3_journal_get_write_access(handle, old_bh); ++ if (error) ++ goto cleanup; ++ if (refcount == 1) { ++ /* Free the old block. */ ++ ea_bdebug(old_bh, "freeing"); ++ ext3_xattr_free_block(handle, inode, old_bh->b_blocknr); ++ ++ /* ext3_forget() calls bforget() for us, but we ++ let our caller release old_bh, so we need to ++ duplicate the handle before. */ ++ get_bh(old_bh); ++ ext3_forget(handle, 1, inode, old_bh,old_bh->b_blocknr); ++ } else { ++ /* Decrement the refcount only. */ ++ refcount--; ++ HDR(old_bh)->h_refcount = cpu_to_le32(refcount); ++ ext3_xattr_quota_free(inode); ++ ext3_journal_dirty_metadata(handle, old_bh); ++ ea_bdebug(old_bh, "refcount now=%d", refcount); ++ } ++ } ++ ++cleanup: ++ if (old_bh != new_bh) ++ brelse(new_bh); ++ ++ return error; ++} ++ ++/* ++ * ext3_xattr_delete_inode() ++ * ++ * Free extended attribute resources associated with this inode. 
This ++ * is called immediately before an inode is freed. ++ */ ++void ++ext3_xattr_delete_inode(handle_t *handle, struct inode *inode) ++{ ++ struct buffer_head *bh; ++ unsigned int block = EXT3_I(inode)->i_file_acl; ++ ++ if (!block) ++ return; ++ down(&ext3_xattr_sem); ++ ++ bh = sb_bread(inode->i_sb, block); ++ if (!bh) { ++ ext3_error(inode->i_sb, "ext3_xattr_delete_inode", ++ "inode %ld: block %d read error", inode->i_ino, block); ++ goto cleanup; ++ } ++ ea_bdebug(bh, "b_count=%d", atomic_read(&(bh->b_count))); ++ if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || ++ HDR(bh)->h_blocks != cpu_to_le32(1)) { ++ ext3_error(inode->i_sb, "ext3_xattr_delete_inode", ++ "inode %ld: bad block %d", inode->i_ino, block); ++ goto cleanup; ++ } ++ ext3_journal_get_write_access(handle, bh); ++ ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount) - 1); ++ if (HDR(bh)->h_refcount == cpu_to_le32(1)) { ++ ext3_xattr_cache_remove(bh); ++ ext3_xattr_free_block(handle, inode, block); ++ ext3_forget(handle, 1, inode, bh, block); ++ bh = NULL; ++ } else { ++ HDR(bh)->h_refcount = cpu_to_le32( ++ le32_to_cpu(HDR(bh)->h_refcount) - 1); ++ ext3_journal_dirty_metadata(handle, bh); ++ if (IS_SYNC(inode)) ++ handle->h_sync = 1; ++ ext3_xattr_quota_free(inode); ++ } ++ EXT3_I(inode)->i_file_acl = 0; ++ ++cleanup: ++ brelse(bh); ++ up(&ext3_xattr_sem); ++} ++ ++/* ++ * ext3_xattr_put_super() ++ * ++ * This is called when a file system is unmounted. ++ */ ++void ++ext3_xattr_put_super(struct super_block *sb) ++{ ++#ifdef CONFIG_EXT3_FS_XATTR_SHARING ++ mb_cache_shrink(ext3_xattr_cache, sb->s_dev); ++#endif ++} ++ ++#ifdef CONFIG_EXT3_FS_XATTR_SHARING ++ ++/* ++ * ext3_xattr_cache_insert() ++ * ++ * Create a new entry in the extended attribute cache, and insert ++ * it unless such an entry is already in the cache. ++ * ++ * Returns 0, or a negative error number on failure. ++ */ ++static int ++ext3_xattr_cache_insert(struct buffer_head *bh) ++{ ++ __u32 hash = le32_to_cpu(HDR(bh)->h_hash); ++ struct mb_cache_entry *ce; ++ int error; ++ ++ ce = mb_cache_entry_alloc(ext3_xattr_cache); ++ if (!ce) ++ return -ENOMEM; ++ error = mb_cache_entry_insert(ce, bh->b_dev, bh->b_blocknr, &hash); ++ if (error) { ++ mb_cache_entry_free(ce); ++ if (error == -EBUSY) { ++ ea_bdebug(bh, "already in cache (%d cache entries)", ++ atomic_read(&ext3_xattr_cache->c_entry_count)); ++ error = 0; ++ } ++ } else { ++ ea_bdebug(bh, "inserting [%x] (%d cache entries)", (int)hash, ++ atomic_read(&ext3_xattr_cache->c_entry_count)); ++ mb_cache_entry_release(ce); ++ } ++ return error; ++} ++ ++/* ++ * ext3_xattr_cmp() ++ * ++ * Compare two extended attribute blocks for equality. ++ * ++ * Returns 0 if the blocks are equal, 1 if they differ, and ++ * a negative error number on errors. 
++ */ ++static int ++ext3_xattr_cmp(struct ext3_xattr_header *header1, ++ struct ext3_xattr_header *header2) ++{ ++ struct ext3_xattr_entry *entry1, *entry2; ++ ++ entry1 = ENTRY(header1+1); ++ entry2 = ENTRY(header2+1); ++ while (!IS_LAST_ENTRY(entry1)) { ++ if (IS_LAST_ENTRY(entry2)) ++ return 1; ++ if (entry1->e_hash != entry2->e_hash || ++ entry1->e_name_len != entry2->e_name_len || ++ entry1->e_value_size != entry2->e_value_size || ++ memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len)) ++ return 1; ++ if (entry1->e_value_block != 0 || entry2->e_value_block != 0) ++ return -EIO; ++ if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), ++ (char *)header2 + le16_to_cpu(entry2->e_value_offs), ++ le32_to_cpu(entry1->e_value_size))) ++ return 1; ++ ++ entry1 = EXT3_XATTR_NEXT(entry1); ++ entry2 = EXT3_XATTR_NEXT(entry2); ++ } ++ if (!IS_LAST_ENTRY(entry2)) ++ return 1; ++ return 0; ++} ++ ++/* ++ * ext3_xattr_cache_find() ++ * ++ * Find an identical extended attribute block. ++ * ++ * Returns a pointer to the block found, or NULL if such a block was ++ * not found or an error occurred. ++ */ ++static struct buffer_head * ++ext3_xattr_cache_find(struct inode *inode, struct ext3_xattr_header *header) ++{ ++ __u32 hash = le32_to_cpu(header->h_hash); ++ struct mb_cache_entry *ce; ++ ++ if (!header->h_hash) ++ return NULL; /* never share */ ++ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); ++ ce = mb_cache_entry_find_first(ext3_xattr_cache, 0, inode->i_dev, hash); ++ while (ce) { ++ struct buffer_head *bh = sb_bread(inode->i_sb, ce->e_block); ++ ++ if (!bh) { ++ ext3_error(inode->i_sb, "ext3_xattr_cache_find", ++ "inode %ld: block %ld read error", ++ inode->i_ino, ce->e_block); ++ } else if (le32_to_cpu(HDR(bh)->h_refcount) > ++ EXT3_XATTR_REFCOUNT_MAX) { ++ ea_idebug(inode, "block %ld refcount %d>%d",ce->e_block, ++ le32_to_cpu(HDR(bh)->h_refcount), ++ EXT3_XATTR_REFCOUNT_MAX); ++ } else if (!ext3_xattr_cmp(header, HDR(bh))) { ++ ea_bdebug(bh, "b_count=%d",atomic_read(&(bh->b_count))); ++ mb_cache_entry_release(ce); ++ return bh; ++ } ++ brelse(bh); ++ ce = mb_cache_entry_find_next(ce, 0, inode->i_dev, hash); ++ } ++ return NULL; ++} ++ ++/* ++ * ext3_xattr_cache_remove() ++ * ++ * Remove the cache entry of a block from the cache. Called when a ++ * block becomes invalid. ++ */ ++static void ++ext3_xattr_cache_remove(struct buffer_head *bh) ++{ ++ struct mb_cache_entry *ce; ++ ++ ce = mb_cache_entry_get(ext3_xattr_cache, bh->b_dev, bh->b_blocknr); ++ if (ce) { ++ ea_bdebug(bh, "removing (%d cache entries remaining)", ++ atomic_read(&ext3_xattr_cache->c_entry_count)-1); ++ mb_cache_entry_free(ce); ++ } else ++ ea_bdebug(bh, "no cache entry"); ++} ++ ++#define NAME_HASH_SHIFT 5 ++#define VALUE_HASH_SHIFT 16 ++ ++/* ++ * ext3_xattr_hash_entry() ++ * ++ * Compute the hash of an extended attribute. 
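++ * ++ * Each input element c is folded into a 32-bit rotating XOR, matching the ++ * code below: ++ * ++ * hash = (hash << SHIFT) ^ (hash >> (32 - SHIFT)) ^ c ++ * ++ * with SHIFT = NAME_HASH_SHIFT (5) over the name bytes and ++ * VALUE_HASH_SHIFT (16) over the little-endian value words.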
++ */ ++static inline void ext3_xattr_hash_entry(struct ext3_xattr_header *header, ++ struct ext3_xattr_entry *entry) ++{ ++ __u32 hash = 0; ++ char *name = entry->e_name; ++ int n; ++ ++ for (n=0; n < entry->e_name_len; n++) { ++ hash = (hash << NAME_HASH_SHIFT) ^ ++ (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ ++ *name++; ++ } ++ ++ if (entry->e_value_block == 0 && entry->e_value_size != 0) { ++ __u32 *value = (__u32 *)((char *)header + ++ le16_to_cpu(entry->e_value_offs)); ++ for (n = (le32_to_cpu(entry->e_value_size) + ++ EXT3_XATTR_ROUND) >> EXT3_XATTR_PAD_BITS; n; n--) { ++ hash = (hash << VALUE_HASH_SHIFT) ^ ++ (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ ++ le32_to_cpu(*value++); ++ } ++ } ++ entry->e_hash = cpu_to_le32(hash); ++} ++ ++#undef NAME_HASH_SHIFT ++#undef VALUE_HASH_SHIFT ++ ++#define BLOCK_HASH_SHIFT 16 ++ ++/* ++ * ext3_xattr_rehash() ++ * ++ * Re-compute the extended attribute hash value after an entry has changed. ++ */ ++static void ext3_xattr_rehash(struct ext3_xattr_header *header, ++ struct ext3_xattr_entry *entry) ++{ ++ struct ext3_xattr_entry *here; ++ __u32 hash = 0; ++ ++ ext3_xattr_hash_entry(header, entry); ++ here = ENTRY(header+1); ++ while (!IS_LAST_ENTRY(here)) { ++ if (!here->e_hash) { ++ /* Block is not shared if an entry's hash value == 0 */ ++ hash = 0; ++ break; ++ } ++ hash = (hash << BLOCK_HASH_SHIFT) ^ ++ (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^ ++ le32_to_cpu(here->e_hash); ++ here = EXT3_XATTR_NEXT(here); ++ } ++ header->h_hash = cpu_to_le32(hash); ++} ++ ++#undef BLOCK_HASH_SHIFT ++ ++int __init ++init_ext3_xattr(void) ++{ ++ ext3_xattr_cache = mb_cache_create("ext3_xattr", NULL, ++ sizeof(struct mb_cache_entry) + ++ sizeof(struct mb_cache_entry_index), 1, 61); ++ if (!ext3_xattr_cache) ++ return -ENOMEM; ++ ++ return 0; ++} ++ ++void ++exit_ext3_xattr(void) ++{ ++ if (ext3_xattr_cache) ++ mb_cache_destroy(ext3_xattr_cache); ++ ext3_xattr_cache = NULL; ++} ++ ++#else /* CONFIG_EXT3_FS_XATTR_SHARING */ ++ ++int __init ++init_ext3_xattr(void) ++{ ++ return 0; ++} ++ ++void ++exit_ext3_xattr(void) ++{ ++} ++ ++#endif /* CONFIG_EXT3_FS_XATTR_SHARING */ +Index: linux-DRV401/fs/ext3/xattr_user.c +=================================================================== +--- linux-DRV401.orig/fs/ext3/xattr_user.c 2004-10-12 08:56:38.404764448 -0700 ++++ linux-DRV401/fs/ext3/xattr_user.c 2004-10-15 11:03:52.000000000 -0700 +@@ -0,0 +1,111 @@ ++/* ++ * linux/fs/ext3/xattr_user.c ++ * Handler for extended user attributes. ++ * ++ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org> ++ */ ++ ++#include <linux/module.h> ++#include <linux/string.h> ++#include <linux/fs.h> ++#include <linux/ext3_jbd.h> ++#include <linux/ext3_fs.h> ++#include <linux/ext3_xattr.h> ++ ++#ifdef CONFIG_EXT3_FS_POSIX_ACL ++# include <linux/ext3_acl.h> ++#endif ++ ++#define XATTR_USER_PREFIX "user."
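++ ++/* ++ * The handlers below see names with the "user." prefix already stripped ++ * by ext3_xattr_resolve_name(); e.g. for a stored attribute "foo" (an ++ * illustrative name) the list callback emits the 9 bytes "user.foo\0" ++ * (5 prefix + 3 name + 1 NUL). ++ */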
++ ++static size_t ++ext3_xattr_user_list(char *list, struct inode *inode, ++ const char *name, int name_len) ++{ ++ const int prefix_len = sizeof(XATTR_USER_PREFIX)-1; ++ ++ if (!test_opt(inode->i_sb, XATTR_USER)) ++ return 0; ++ ++ if (list) { ++ memcpy(list, XATTR_USER_PREFIX, prefix_len); ++ memcpy(list+prefix_len, name, name_len); ++ list[prefix_len + name_len] = '\0'; ++ } ++ return prefix_len + name_len + 1; ++} ++ ++static int ++ext3_xattr_user_get(struct inode *inode, const char *name, ++ void *buffer, size_t size) ++{ ++ int error; ++ ++ if (strcmp(name, "") == 0) ++ return -EINVAL; ++ if (!test_opt(inode->i_sb, XATTR_USER)) ++ return -ENOTSUP; ++#ifdef CONFIG_EXT3_FS_POSIX_ACL ++ error = ext3_permission_locked(inode, MAY_READ); ++#else ++ error = permission(inode, MAY_READ); ++#endif ++ if (error) ++ return error; ++ ++ return ext3_xattr_get(inode, EXT3_XATTR_INDEX_USER, name, ++ buffer, size); ++} ++ ++static int ++ext3_xattr_user_set(struct inode *inode, const char *name, ++ const void *value, size_t size, int flags) ++{ ++ handle_t *handle; ++ int error; ++ ++ if (strcmp(name, "") == 0) ++ return -EINVAL; ++ if (!test_opt(inode->i_sb, XATTR_USER)) ++ return -ENOTSUP; ++ if ( !S_ISREG(inode->i_mode) && ++ (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX)) ++ return -EPERM; ++#ifdef CONFIG_EXT3_FS_POSIX_ACL ++ error = ext3_permission_locked(inode, MAY_WRITE); ++#else ++ error = permission(inode, MAY_WRITE); ++#endif ++ if (error) ++ return error; ++ ++ handle = ext3_journal_start(inode, EXT3_XATTR_TRANS_BLOCKS); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ error = ext3_xattr_set(handle, inode, EXT3_XATTR_INDEX_USER, name, ++ value, size, flags); ++ ext3_journal_stop(handle, inode); ++ ++ return error; ++} ++ ++struct ext3_xattr_handler ext3_xattr_user_handler = { ++ prefix: XATTR_USER_PREFIX, ++ list: ext3_xattr_user_list, ++ get: ext3_xattr_user_get, ++ set: ext3_xattr_user_set, ++}; ++ ++int __init ++init_ext3_xattr_user(void) ++{ ++ return ext3_xattr_register(EXT3_XATTR_INDEX_USER, ++ &ext3_xattr_user_handler); ++} ++ ++void ++exit_ext3_xattr_user(void) ++{ ++ ext3_xattr_unregister(EXT3_XATTR_INDEX_USER, ++ &ext3_xattr_user_handler); ++} +Index: linux-DRV401/fs/ext3/ext3-exports.c +=================================================================== +--- linux-DRV401.orig/fs/ext3/ext3-exports.c 2004-10-12 08:56:38.404764448 -0700 ++++ linux-DRV401/fs/ext3/ext3-exports.c 2004-10-15 11:03:52.000000000 -0700 +@@ -0,0 +1,13 @@ ++#include <linux/fs.h> ++#include <linux/module.h> ++#include <linux/ext3_fs.h> ++#include <linux/ext3_jbd.h> ++#include <linux/ext3_xattr.h> ++ ++EXPORT_SYMBOL(ext3_force_commit); ++EXPORT_SYMBOL(ext3_bread); ++EXPORT_SYMBOL(ext3_xattr_register); ++EXPORT_SYMBOL(ext3_xattr_unregister); ++EXPORT_SYMBOL(ext3_xattr_get); ++EXPORT_SYMBOL(ext3_xattr_list); ++EXPORT_SYMBOL(ext3_xattr_set); +Index: linux-DRV401/fs/mbcache.c +=================================================================== +--- linux-DRV401.orig/fs/mbcache.c 2004-10-12 08:56:38.404764448 -0700 ++++ linux-DRV401/fs/mbcache.c 2004-10-15 11:03:52.000000000 -0700 +@@ -0,0 +1,648 @@ ++/* ++ * linux/fs/mbcache.c ++ * (C) 2001-2002 Andreas Gruenbacher, <a.gruenbacher@computer.org> ++ */ ++ ++/* ++ * Filesystem Meta Information Block Cache (mbcache) ++ * ++ * The mbcache caches blocks of block devices that need to be located ++ * by their device/block number, as well as by other criteria (such ++ * as the block's contents). ++ * ++ * There can only be one cache entry in a cache per device and block number. ++ * Additional indexes need not be unique in this sense.
The number of ++ * additional indexes (=other criteria) can be hardwired at compile time ++ * or specified at cache create time. ++ * ++ * Each cache entry is of fixed size. An entry may be `valid' or `invalid' ++ * in the cache. A valid entry is in the main hash tables of the cache, ++ * and may also be in the lru list. An invalid entry is not in any hashes ++ * or lists. ++ * ++ * A valid cache entry is only in the lru list if no handles refer to it. ++ * Invalid cache entries will be freed when the last handle to the cache ++ * entry is released. Entries that cannot be freed immediately are put ++ * back on the lru list. ++ */ ++ ++#include <linux/kernel.h> ++#include <linux/module.h> ++ ++#include <linux/hash.h> ++#include <linux/fs.h> ++#include <linux/mm.h> ++#include <linux/slab.h> ++#include <linux/sched.h> ++#include <linux/cache_def.h> ++#include <linux/mbcache.h> ++ ++ ++#ifdef MB_CACHE_DEBUG ++# define mb_debug(f...) do { \ ++ printk(KERN_DEBUG f); \ ++ printk("\n"); \ ++ } while (0) ++#define mb_assert(c) do { if (!(c)) \ ++ printk(KERN_ERR "assertion " #c " failed\n"); \ ++ } while(0) ++#else ++# define mb_debug(f...) do { } while(0) ++# define mb_assert(c) do { } while(0) ++#endif ++#define mb_error(f...) do { \ ++ printk(KERN_ERR f); \ ++ printk("\n"); \ ++ } while(0) ++ ++MODULE_AUTHOR("Andreas Gruenbacher <a.gruenbacher@computer.org>"); ++MODULE_DESCRIPTION("Meta block cache (for extended attributes)"); ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,0) ++MODULE_LICENSE("GPL"); ++#endif ++ ++EXPORT_SYMBOL(mb_cache_create); ++EXPORT_SYMBOL(mb_cache_shrink); ++EXPORT_SYMBOL(mb_cache_destroy); ++EXPORT_SYMBOL(mb_cache_entry_alloc); ++EXPORT_SYMBOL(mb_cache_entry_insert); ++EXPORT_SYMBOL(mb_cache_entry_release); ++EXPORT_SYMBOL(mb_cache_entry_takeout); ++EXPORT_SYMBOL(mb_cache_entry_free); ++EXPORT_SYMBOL(mb_cache_entry_dup); ++EXPORT_SYMBOL(mb_cache_entry_get); ++#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) ++EXPORT_SYMBOL(mb_cache_entry_find_first); ++EXPORT_SYMBOL(mb_cache_entry_find_next); ++#endif ++ ++ ++/* ++ * Global data: list of all mbcache's, lru list, and a spinlock for ++ * accessing cache data structures on SMP machines. The lru list is ++ * global across all mbcaches. ++ */ ++ ++static LIST_HEAD(mb_cache_list); ++static LIST_HEAD(mb_cache_lru_list); ++static spinlock_t mb_cache_spinlock = SPIN_LOCK_UNLOCKED; ++ ++static inline int ++mb_cache_indexes(struct mb_cache *cache) ++{ ++#ifdef MB_CACHE_INDEXES_COUNT ++ return MB_CACHE_INDEXES_COUNT; ++#else ++ return cache->c_indexes_count; ++#endif ++} ++ ++/* ++ * What the mbcache registers as to get shrunk dynamically. ++ */ ++ ++static void ++mb_cache_memory_pressure(int priority, unsigned int gfp_mask); ++ ++static struct cache_definition mb_cache_definition = { ++ "mb_cache", ++ mb_cache_memory_pressure ++}; ++ ++ ++static inline int ++__mb_cache_entry_is_hashed(struct mb_cache_entry *ce) ++{ ++ return !list_empty(&ce->e_block_list); ++} ++ ++ ++static inline void ++__mb_cache_entry_unhash(struct mb_cache_entry *ce) ++{ ++ int n; ++ ++ if (__mb_cache_entry_is_hashed(ce)) { ++ list_del_init(&ce->e_block_list); ++ for (n=0; n<mb_cache_indexes(ce->e_cache); n++) ++ list_del(&ce->e_indexes[n].o_list); ++ } ++} ++ ++ ++static inline void ++__mb_cache_entry_forget(struct mb_cache_entry *ce, int gfp_mask) ++{ ++ struct mb_cache *cache = ce->e_cache; ++ ++ mb_assert(atomic_read(&ce->e_used) == 0); ++ if (cache->c_op.free && cache->c_op.free(ce, gfp_mask)) { ++ /* free failed -- put back on the lru list ++ for freeing later.
*/ ++ spin_lock(&mb_cache_spinlock); ++ list_add(&ce->e_lru_list, &mb_cache_lru_list); ++ spin_unlock(&mb_cache_spinlock); ++ } else { ++ kmem_cache_free(cache->c_entry_cache, ce); ++ atomic_dec(&cache->c_entry_count); ++ } ++} ++ ++ ++static inline void ++__mb_cache_entry_release_unlock(struct mb_cache_entry *ce) ++{ ++ if (atomic_dec_and_test(&ce->e_used)) { ++ if (__mb_cache_entry_is_hashed(ce)) ++ list_add_tail(&ce->e_lru_list, &mb_cache_lru_list); ++ else { ++ spin_unlock(&mb_cache_spinlock); ++ __mb_cache_entry_forget(ce, GFP_KERNEL); ++ return; ++ } ++ } ++ spin_unlock(&mb_cache_spinlock); ++} ++ ++ ++/* ++ * mb_cache_memory_pressure() memory pressure callback ++ * ++ * This function is called by the kernel memory management when memory ++ * gets low. ++ * ++ * @priority: Amount by which to shrink the cache (0 = highest priority) ++ * @gfp_mask: (ignored) ++ */ ++static void ++mb_cache_memory_pressure(int priority, unsigned int gfp_mask) ++{ ++ LIST_HEAD(free_list); ++ struct list_head *l, *ltmp; ++ int count = 0; ++ ++ spin_lock(&mb_cache_spinlock); ++ list_for_each(l, &mb_cache_list) { ++ struct mb_cache *cache = ++ list_entry(l, struct mb_cache, c_cache_list); ++ mb_debug("cache %s (%d)", cache->c_name, ++ atomic_read(&cache->c_entry_count)); ++ count += atomic_read(&cache->c_entry_count); ++ } ++ mb_debug("trying to free %d of %d entries", ++ count / (priority ? priority : 1), count); ++ if (priority) ++ count /= priority; ++ while (count-- && !list_empty(&mb_cache_lru_list)) { ++ struct mb_cache_entry *ce = ++ list_entry(mb_cache_lru_list.next, ++ struct mb_cache_entry, e_lru_list); ++ list_del(&ce->e_lru_list); ++ __mb_cache_entry_unhash(ce); ++ list_add_tail(&ce->e_lru_list, &free_list); ++ } ++ spin_unlock(&mb_cache_spinlock); ++ list_for_each_safe(l, ltmp, &free_list) { ++ __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry, ++ e_lru_list), gfp_mask); ++ } ++} ++ ++ ++/* ++ * mb_cache_create() create a new cache ++ * ++ * All entries in one cache are equal size. Cache entries may be from ++ * multiple devices. If this is the first mbcache created, registers ++ * the cache with kernel memory management. Returns NULL if no more ++ * memory was available. ++ * ++ * @name: name of the cache (informal) ++ * @cache_op: contains the callback called when freeing a cache entry ++ * @entry_size: The size of a cache entry, including ++ * struct mb_cache_entry ++ * @indexes_count: number of additional indexes in the cache. Must equal ++ * MB_CACHE_INDEXES_COUNT if the number of indexes is ++ * hardwired.
++ * @bucket_count: number of hash buckets ++ */ ++struct mb_cache * ++mb_cache_create(const char *name, struct mb_cache_op *cache_op, ++ size_t entry_size, int indexes_count, int bucket_count) ++{ ++ int m=0, n; ++ struct mb_cache *cache = NULL; ++ ++ if(entry_size < sizeof(struct mb_cache_entry) + ++ indexes_count * sizeof(struct mb_cache_entry_index)) ++ return NULL; ++ ++ MOD_INC_USE_COUNT; ++ cache = kmalloc(sizeof(struct mb_cache) + ++ indexes_count * sizeof(struct list_head), GFP_KERNEL); ++ if (!cache) ++ goto fail; ++ cache->c_name = name; ++ cache->c_op.free = NULL; ++ if (cache_op) ++ cache->c_op.free = cache_op->free; ++ atomic_set(&cache->c_entry_count, 0); ++ cache->c_bucket_count = bucket_count; ++#ifdef MB_CACHE_INDEXES_COUNT ++ mb_assert(indexes_count == MB_CACHE_INDEXES_COUNT); ++#else ++ cache->c_indexes_count = indexes_count; ++#endif ++ cache->c_block_hash = kmalloc(bucket_count * sizeof(struct list_head), ++ GFP_KERNEL); ++ if (!cache->c_block_hash) ++ goto fail; ++ for (n=0; n<bucket_count; n++) ++ INIT_LIST_HEAD(&cache->c_block_hash[n]); ++ for (m=0; m<indexes_count; m++) { ++ cache->c_indexes_hash[m] = kmalloc(bucket_count * ++ sizeof(struct list_head), ++ GFP_KERNEL); ++ if (!cache->c_indexes_hash[m]) ++ goto fail; ++ for (n=0; n<bucket_count; n++) ++ INIT_LIST_HEAD(&cache->c_indexes_hash[m][n]); ++ } ++ cache->c_entry_cache = kmem_cache_create(name, entry_size, 0, ++ 0 /*SLAB_POISON | SLAB_RED_ZONE*/, NULL, NULL); ++ if (!cache->c_entry_cache) ++ goto fail; ++ ++ spin_lock(&mb_cache_spinlock); ++ list_add(&cache->c_cache_list, &mb_cache_list); ++ spin_unlock(&mb_cache_spinlock); ++ return cache; ++ ++fail: ++ if (cache) { ++ while (--m >= 0) ++ kfree(cache->c_indexes_hash[m]); ++ if (cache->c_block_hash) ++ kfree(cache->c_block_hash); ++ kfree(cache); ++ } ++ MOD_DEC_USE_COUNT; ++ return NULL; ++} ++ ++ ++/* ++ * mb_cache_shrink() ++ * ++ * Removes all cache entries of a device from the cache. All cache entries ++ * currently in use cannot be freed, and thus remain in the cache. ++ * ++ * @cache: which cache to shrink ++ * @dev: which device's cache entries to shrink ++ */ ++void ++mb_cache_shrink(struct mb_cache *cache, kdev_t dev) ++{ ++ LIST_HEAD(free_list); ++ struct list_head *l, *ltmp; ++ ++ spin_lock(&mb_cache_spinlock); ++ list_for_each_safe(l, ltmp, &mb_cache_lru_list) { ++ struct mb_cache_entry *ce = ++ list_entry(l, struct mb_cache_entry, e_lru_list); ++ if (ce->e_dev == dev) { ++ list_del(&ce->e_lru_list); ++ list_add_tail(&ce->e_lru_list, &free_list); ++ __mb_cache_entry_unhash(ce); ++ } ++ } ++ spin_unlock(&mb_cache_spinlock); ++ list_for_each_safe(l, ltmp, &free_list) { ++ __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry, ++ e_lru_list), GFP_KERNEL); ++ } ++} ++ ++ ++/* ++ * mb_cache_destroy() ++ * ++ * Shrinks the cache to its minimum possible size (hopefully 0 entries), ++ * and then destroys it. If this was the last mbcache, un-registers the ++ * mbcache from kernel memory management.
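++ * ++ * Creation and teardown typically pair up in module init/exit; the ext3 ++ * xattr code above, for instance, does (sketch): ++ * ++ * cache = mb_cache_create("ext3_xattr", NULL, entry_size, 1, 61); ++ * ... ++ * mb_cache_destroy(cache);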
++ */ ++void ++mb_cache_destroy(struct mb_cache *cache) ++{ ++ LIST_HEAD(free_list); ++ struct list_head *l, *ltmp; ++ int n; ++ ++ spin_lock(&mb_cache_spinlock); ++ list_for_each_safe(l, ltmp, &mb_cache_lru_list) { ++ struct mb_cache_entry *ce = ++ list_entry(l, struct mb_cache_entry, e_lru_list); ++ if (ce->e_cache == cache) { ++ list_del(&ce->e_lru_list); ++ list_add_tail(&ce->e_lru_list, &free_list); ++ __mb_cache_entry_unhash(ce); ++ } ++ } ++ list_del(&cache->c_cache_list); ++ spin_unlock(&mb_cache_spinlock); ++ list_for_each_safe(l, ltmp, &free_list) { ++ __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry, ++ e_lru_list), GFP_KERNEL); ++ } ++ ++ if (atomic_read(&cache->c_entry_count) > 0) { ++ mb_error("cache %s: %d orphaned entries", ++ cache->c_name, ++ atomic_read(&cache->c_entry_count)); ++ } ++ ++#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,3,0)) ++ /* We don't have kmem_cache_destroy() in 2.2.x */ ++ kmem_cache_shrink(cache->c_entry_cache); ++#else ++ kmem_cache_destroy(cache->c_entry_cache); ++#endif ++ for (n=0; n < mb_cache_indexes(cache); n++) ++ kfree(cache->c_indexes_hash[n]); ++ kfree(cache->c_block_hash); ++ kfree(cache); ++ ++ MOD_DEC_USE_COUNT; ++} ++ ++ ++/* ++ * mb_cache_entry_alloc() ++ * ++ * Allocates a new cache entry. The new entry will not be valid initially, ++ * and thus cannot be looked up yet. It should be filled with data, and ++ * then inserted into the cache using mb_cache_entry_insert(). Returns NULL ++ * if no more memory was available. ++ */ ++struct mb_cache_entry * ++mb_cache_entry_alloc(struct mb_cache *cache) ++{ ++ struct mb_cache_entry *ce; ++ ++ ce = kmem_cache_alloc(cache->c_entry_cache, GFP_KERNEL); ++ if (ce) { ++ /* count only entries that were actually allocated */ ++ atomic_inc(&cache->c_entry_count); ++ INIT_LIST_HEAD(&ce->e_lru_list); ++ INIT_LIST_HEAD(&ce->e_block_list); ++ ce->e_cache = cache; ++ atomic_set(&ce->e_used, 1); ++ } ++ return ce; ++} ++ ++ ++/* ++ * mb_cache_entry_insert() ++ * ++ * Inserts an entry that was allocated using mb_cache_entry_alloc() into ++ * the cache. After this, the cache entry can be looked up, but is not yet ++ * in the lru list as the caller still holds a handle to it. Returns 0 on ++ * success, or -EBUSY if a cache entry for that device + block exists ++ * already (this may happen after a failed lookup, if another process has ++ * inserted the same cache entry in the meantime). ++ * ++ * @dev: device the cache entry belongs to ++ * @block: block number ++ * @keys: array of additional keys. There must be indexes_count entries ++ * in the array (as specified when creating the cache).
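++ * ++ * A typical sequence (sketch only; assumes a cache created with one ++ * additional index, and ignores allocation failure): ++ * ++ * ce = mb_cache_entry_alloc(cache); ++ * (fill in the entry's data) ++ * error = mb_cache_entry_insert(ce, dev, block, &key); ++ * (-EBUSY here means another process inserted it first) ++ * mb_cache_entry_release(ce);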
++ */ ++int ++mb_cache_entry_insert(struct mb_cache_entry *ce, kdev_t dev, ++ unsigned long block, unsigned int keys[]) ++{ ++ struct mb_cache *cache = ce->e_cache; ++ unsigned int bucket = (HASHDEV(dev) + block) % cache->c_bucket_count; ++ struct list_head *l; ++ int error = -EBUSY, n; ++ ++ spin_lock(&mb_cache_spinlock); ++ list_for_each(l, &cache->c_block_hash[bucket]) { ++ struct mb_cache_entry *ce = ++ list_entry(l, struct mb_cache_entry, e_block_list); ++ if (ce->e_dev == dev && ce->e_block == block) ++ goto out; ++ } ++ __mb_cache_entry_unhash(ce); ++ ce->e_dev = dev; ++ ce->e_block = block; ++ list_add(&ce->e_block_list, &cache->c_block_hash[bucket]); ++ for (n=0; n<mb_cache_indexes(cache); n++) { ++ ce->e_indexes[n].o_key = keys[n]; ++ bucket = keys[n] % cache->c_bucket_count; ++ list_add(&ce->e_indexes[n].o_list, ++ &cache->c_indexes_hash[n][bucket]); ++ } ++ error = 0; ++out: ++ spin_unlock(&mb_cache_spinlock); ++ return error; ++} ++ ++ ++/* ++ * mb_cache_entry_release() ++ * ++ * Release a handle to a cache entry. When the last handle to a cache entry ++ * is released it is either freed (if it is invalid) or otherwise inserted ++ * into the lru list. ++ */ ++void ++mb_cache_entry_release(struct mb_cache_entry *ce) ++{ ++ spin_lock(&mb_cache_spinlock); ++ __mb_cache_entry_release_unlock(ce); ++} ++ ++ ++/* ++ * mb_cache_entry_takeout() ++ * ++ * Take a cache entry out of the cache, making it invalid. The entry can later ++ * be re-inserted using mb_cache_entry_insert(), or released using ++ * mb_cache_entry_release(). ++ */ ++void ++mb_cache_entry_takeout(struct mb_cache_entry *ce) ++{ ++ spin_lock(&mb_cache_spinlock); ++ mb_assert(list_empty(&ce->e_lru_list)); ++ __mb_cache_entry_unhash(ce); ++ spin_unlock(&mb_cache_spinlock); ++} ++ ++ ++/* ++ * mb_cache_entry_free() ++ * ++ * This is equivalent to the sequence mb_cache_entry_takeout() -- ++ * mb_cache_entry_release(). ++ */ ++void ++mb_cache_entry_free(struct mb_cache_entry *ce) ++{ ++ spin_lock(&mb_cache_spinlock); ++ mb_assert(list_empty(&ce->e_lru_list)); ++ __mb_cache_entry_unhash(ce); ++ __mb_cache_entry_release_unlock(ce); ++} ++ ++ ++/* ++ * mb_cache_entry_dup() ++ * ++ * Duplicate a handle to a cache entry (does not duplicate the cache entry ++ * itself). After the call, both the old and the new handle must be released. ++ */ ++struct mb_cache_entry * ++mb_cache_entry_dup(struct mb_cache_entry *ce) ++{ ++ atomic_inc(&ce->e_used); ++ return ce; ++} ++ ++ ++/* ++ * mb_cache_entry_get() ++ * ++ * Get a cache entry by device / block number. (There can only be one entry ++ * in the cache per device and block.) Returns NULL if no such cache entry ++ * exists.
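++ * ++ * The returned entry carries a handle that must be dropped again, e.g. ++ * (sketch only): ++ * ++ * ce = mb_cache_entry_get(cache, dev, block); ++ * if (ce) { ++ * (use the entry's fields) ++ * mb_cache_entry_release(ce); ++ * }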
++ */ ++struct mb_cache_entry * ++mb_cache_entry_get(struct mb_cache *cache, kdev_t dev, unsigned long block) ++{ ++ unsigned int bucket = (HASHDEV(dev) + block) % cache->c_bucket_count; ++ struct list_head *l; ++ struct mb_cache_entry *ce; ++ ++ spin_lock(&mb_cache_spinlock); ++ list_for_each(l, &cache->c_block_hash[bucket]) { ++ ce = list_entry(l, struct mb_cache_entry, e_block_list); ++ if (ce->e_dev == dev && ce->e_block == block) { ++ if (!list_empty(&ce->e_lru_list)) ++ list_del_init(&ce->e_lru_list); ++ atomic_inc(&ce->e_used); ++ goto cleanup; ++ } ++ } ++ ce = NULL; ++ ++cleanup: ++ spin_unlock(&mb_cache_spinlock); ++ return ce; ++} ++ ++#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) ++ ++static struct mb_cache_entry * ++__mb_cache_entry_find(struct list_head *l, struct list_head *head, ++ int index, kdev_t dev, unsigned int key) ++{ ++ while (l != head) { ++ struct mb_cache_entry *ce = ++ list_entry(l, struct mb_cache_entry, ++ e_indexes[index].o_list); ++ if (ce->e_dev == dev && ce->e_indexes[index].o_key == key) { ++ if (!list_empty(&ce->e_lru_list)) ++ list_del_init(&ce->e_lru_list); ++ atomic_inc(&ce->e_used); ++ return ce; ++ } ++ l = l->next; ++ } ++ return NULL; ++} ++ ++ ++/* ++ * mb_cache_entry_find_first() ++ * ++ * Find the first cache entry on a given device with a certain key in ++ * an additional index. Additional matches can be found with ++ * mb_cache_entry_find_next(). Returns NULL if no match was found. ++ * ++ * @cache: the cache to search ++ * @index: the number of the additional index to search (0<=index<indexes_count) ++ * @dev: the device the cache entry should belong to ++ * @key: the key in the additional index ++ */ ++struct mb_cache_entry * ++mb_cache_entry_find_first(struct mb_cache *cache, int index, kdev_t dev, ++ unsigned int key) ++{ ++ unsigned int bucket = key % cache->c_bucket_count; ++ struct list_head *l; ++ struct mb_cache_entry *ce; ++ ++ mb_assert(index < mb_cache_indexes(cache)); ++ spin_lock(&mb_cache_spinlock); ++ l = cache->c_indexes_hash[index][bucket].next; ++ ce = __mb_cache_entry_find(l, &cache->c_indexes_hash[index][bucket], ++ index, dev, key); ++ spin_unlock(&mb_cache_spinlock); ++ return ce; ++} ++ ++ ++/* ++ * mb_cache_entry_find_next() ++ * ++ * Find the next cache entry on a given device with a certain key in an ++ * additional index. Returns NULL if no match could be found. The previous ++ * entry is automatically released, so that mb_cache_entry_find_next() can ++ * be called like this: ++ * ++ * entry = mb_cache_entry_find_first(); ++ * while (entry) { ++ * ... ++ * entry = mb_cache_entry_find_next(entry, ...); ++ * } ++ * ++ * @prev: The previous match ++ * @index: the number of the additional index to search (0<=index<indexes_count) ++ * @dev: the device the cache entry should belong to ++ * @key: the key in the additional index ++ */ ++struct mb_cache_entry * ++mb_cache_entry_find_next(struct mb_cache_entry *prev, int index, kdev_t dev, ++ unsigned int key) ++{ ++ struct mb_cache *cache = prev->e_cache; ++ unsigned int bucket = key % cache->c_bucket_count; ++ struct list_head *l; ++ struct mb_cache_entry *ce; ++ ++ mb_assert(index < mb_cache_indexes(cache)); ++ spin_lock(&mb_cache_spinlock); ++ l = prev->e_indexes[index].o_list.next; ++ ce = __mb_cache_entry_find(l, &cache->c_indexes_hash[index][bucket], ++ index, dev, key); ++ __mb_cache_entry_release_unlock(prev); ++ return ce; ++} ++ ++#endif /* !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) */ ++ ++static int __init init_mbcache(void) ++{ ++ register_cache(&mb_cache_definition); ++ return 0; ++} ++ ++static void __exit exit_mbcache(void) ++{ ++ unregister_cache(&mb_cache_definition); ++} ++ ++module_init(init_mbcache) ++module_exit(exit_mbcache) ++ +Index: linux-DRV401/fs/xattr.c +=================================================================== +--- linux-DRV401.orig/fs/xattr.c 2004-10-12 08:56:38.404764448 -0700 ++++ linux-DRV401/fs/xattr.c 2004-10-15 11:03:52.000000000 -0700 +@@ -0,0 +1,355 @@ ++/* ++ File: fs/xattr.c ++ ++ Extended attribute handling.
++ ++ Copyright (C) 2001 by Andreas Gruenbacher <a.gruenbacher@computer.org> ++ Copyright (C) 2001 SGI - Silicon Graphics, Inc ++ */ ++#include <linux/fs.h> ++#include <linux/slab.h> ++#include <linux/vmalloc.h> ++#include <linux/smp_lock.h> ++#include <linux/file.h> ++#include <linux/xattr.h> ++#include <asm/uaccess.h> ++ ++/* ++ * Extended attribute memory allocation wrappers, originally ++ * based on the Intermezzo PRESTO_ALLOC/PRESTO_FREE macros. ++ * The vmalloc use here is very uncommon - extended attributes ++ * are supposed to be small chunks of metadata, and it is quite ++ * unusual to have very many extended attributes, so lists tend ++ * to be quite short as well. The 64K upper limit is derived ++ * from the extended attribute size limit used by XFS. ++ * Intentionally allow zero @size for value/list size requests. ++ */ ++static void * ++xattr_alloc(size_t size, size_t limit) ++{ ++ void *ptr; ++ ++ if (size > limit) ++ return ERR_PTR(-E2BIG); ++ ++ if (!size) /* size request, no buffer is needed */ ++ return NULL; ++ else if (size <= PAGE_SIZE) ++ ptr = kmalloc((unsigned long) size, GFP_KERNEL); ++ else ++ ptr = vmalloc((unsigned long) size); ++ if (!ptr) ++ return ERR_PTR(-ENOMEM); ++ return ptr; ++} ++ ++static void ++xattr_free(void *ptr, size_t size) ++{ ++ if (!size) /* size request, no buffer was needed */ ++ return; ++ else if (size <= PAGE_SIZE) ++ kfree(ptr); ++ else ++ vfree(ptr); ++} ++ ++/* ++ * Extended attribute SET operations ++ */ ++static long ++setxattr(struct dentry *d, char *name, void *value, size_t size, int flags) ++{ ++ int error; ++ void *kvalue; ++ char kname[XATTR_NAME_MAX + 1]; ++ ++ if (flags & ~(XATTR_CREATE|XATTR_REPLACE)) ++ return -EINVAL; ++ ++ error = strncpy_from_user(kname, name, sizeof(kname)); ++ if (error == 0 || error == sizeof(kname)) ++ error = -ERANGE; ++ if (error < 0) ++ return error; ++ ++ kvalue = xattr_alloc(size, XATTR_SIZE_MAX); ++ if (IS_ERR(kvalue)) ++ return PTR_ERR(kvalue); ++ ++ if (size > 0 && copy_from_user(kvalue, value, size)) { ++ xattr_free(kvalue, size); ++ return -EFAULT; ++ } ++ ++ error = -EOPNOTSUPP; ++ if (d->d_inode->i_op && d->d_inode->i_op->setxattr) { ++ down(&d->d_inode->i_sem); ++ lock_kernel(); ++ error = d->d_inode->i_op->setxattr(d, kname, kvalue, size, flags); ++ unlock_kernel(); ++ up(&d->d_inode->i_sem); ++ } ++ ++ xattr_free(kvalue, size); ++ return error; ++} ++ ++asmlinkage long ++sys_setxattr(char *path, char *name, void *value, size_t size, int flags) ++{ ++ struct nameidata nd; ++ int error; ++ ++ error = user_path_walk(path, &nd); ++ if (error) ++ return error; ++ error = setxattr(nd.dentry, name, value, size, flags); ++ path_release(&nd); ++ return error; ++} ++ ++asmlinkage long ++sys_lsetxattr(char *path, char *name, void *value, size_t size, int flags) ++{ ++ struct nameidata nd; ++ int error; ++ ++ error = user_path_walk_link(path, &nd); ++ if (error) ++ return error; ++ error = setxattr(nd.dentry, name, value, size, flags); ++ path_release(&nd); ++ return error; ++} ++ ++asmlinkage long ++sys_fsetxattr(int fd, char *name, void *value, size_t size, int flags) ++{ ++ struct file *f; ++ int error = -EBADF; ++ ++ f = fget(fd); ++ if (!f) ++ return error; ++ error = setxattr(f->f_dentry, name, value, size, flags); ++ fput(f); ++ return error; ++} ++ ++/* ++ * Extended attribute GET operations ++ */ ++static ssize_t ++getxattr(struct dentry *d, char *name, void *value, size_t size) ++{ ++ ssize_t error; ++ void *kvalue; ++ char kname[XATTR_NAME_MAX + 1]; ++ ++ error = strncpy_from_user(kname, name, sizeof(kname)); ++ if (error == 0 || error == sizeof(kname)) ++ error = -ERANGE; ++ if (error < 0) ++ return error; ++ ++ kvalue =
xattr_alloc(size, XATTR_SIZE_MAX); ++ if (IS_ERR(kvalue)) ++ return PTR_ERR(kvalue); ++ ++ error = -EOPNOTSUPP; ++ if (d->d_inode->i_op && d->d_inode->i_op->getxattr) { ++ down(&d->d_inode->i_sem); ++ lock_kernel(); ++ error = d->d_inode->i_op->getxattr(d, kname, kvalue, size); ++ unlock_kernel(); ++ up(&d->d_inode->i_sem); ++ } ++ ++ if (kvalue && error > 0) ++ if (copy_to_user(value, kvalue, error)) ++ error = -EFAULT; ++ xattr_free(kvalue, size); ++ return error; ++} ++ ++asmlinkage ssize_t ++sys_getxattr(char *path, char *name, void *value, size_t size) ++{ ++ struct nameidata nd; ++ ssize_t error; ++ ++ error = user_path_walk(path, &nd); ++ if (error) ++ return error; ++ error = getxattr(nd.dentry, name, value, size); ++ path_release(&nd); ++ return error; ++} ++ ++asmlinkage ssize_t ++sys_lgetxattr(char *path, char *name, void *value, size_t size) ++{ ++ struct nameidata nd; ++ ssize_t error; ++ ++ error = user_path_walk_link(path, &nd); ++ if (error) ++ return error; ++ error = getxattr(nd.dentry, name, value, size); ++ path_release(&nd); ++ return error; ++} ++ ++asmlinkage ssize_t ++sys_fgetxattr(int fd, char *name, void *value, size_t size) ++{ ++ struct file *f; ++ ssize_t error = -EBADF; ++ ++ f = fget(fd); ++ if (!f) ++ return error; ++ error = getxattr(f->f_dentry, name, value, size); ++ fput(f); ++ return error; ++} ++ ++/* ++ * Extended attribute LIST operations ++ */ ++static ssize_t ++listxattr(struct dentry *d, char *list, size_t size) ++{ ++ ssize_t error; ++ char *klist; ++ ++ klist = (char *)xattr_alloc(size, XATTR_LIST_MAX); ++ if (IS_ERR(klist)) ++ return PTR_ERR(klist); ++ ++ error = -EOPNOTSUPP; ++ if (d->d_inode->i_op && d->d_inode->i_op->listxattr) { ++ down(&d->d_inode->i_sem); ++ lock_kernel(); ++ error = d->d_inode->i_op->listxattr(d, klist, size); ++ unlock_kernel(); ++ up(&d->d_inode->i_sem); ++ } ++ ++ if (klist && error > 0) ++ if (copy_to_user(list, klist, error)) ++ error = -EFAULT; ++ xattr_free(klist, size); ++ return error; ++} ++ ++asmlinkage ssize_t ++sys_listxattr(char *path, char *list, size_t size) ++{ ++ struct nameidata nd; ++ ssize_t error; ++ ++ error = user_path_walk(path, &nd); ++ if (error) ++ return error; ++ error = listxattr(nd.dentry, list, size); ++ path_release(&nd); ++ return error; ++} ++ ++asmlinkage ssize_t ++sys_llistxattr(char *path, char *list, size_t size) ++{ ++ struct nameidata nd; ++ ssize_t error; ++ ++ error = user_path_walk_link(path, &nd); ++ if (error) ++ return error; ++ error = listxattr(nd.dentry, list, size); ++ path_release(&nd); ++ return error; ++} ++ ++asmlinkage ssize_t ++sys_flistxattr(int fd, char *list, size_t size) ++{ ++ struct file *f; ++ ssize_t error = -EBADF; ++ ++ f = fget(fd); ++ if (!f) ++ return error; ++ error = listxattr(f->f_dentry, list, size); ++ fput(f); ++ return error; ++} ++ ++/* ++ * Extended attribute REMOVE operations ++ */ ++static long ++removexattr(struct dentry *d, char *name) ++{ ++ int error; ++ char kname[XATTR_NAME_MAX + 1]; ++ ++ error = strncpy_from_user(kname, name, sizeof(kname)); ++ if (error == 0 || error == sizeof(kname)) ++ error = -ERANGE; ++ if (error < 0) ++ return error; ++ ++ error = -EOPNOTSUPP; ++ if (d->d_inode->i_op && d->d_inode->i_op->removexattr) { ++ down(&d->d_inode->i_sem); ++ lock_kernel(); ++ error = d->d_inode->i_op->removexattr(d, kname); ++ unlock_kernel(); ++ up(&d->d_inode->i_sem); ++ } ++ return error; ++} ++ ++asmlinkage long ++sys_removexattr(char *path, char *name) ++{ ++ struct nameidata nd; ++ int error; ++ ++ error = user_path_walk(path, 
&nd); ++ if (error) ++ return error; ++ error = removexattr(nd.dentry, name); ++ path_release(&nd); ++ return error; ++} ++ ++asmlinkage long ++sys_lremovexattr(char *path, char *name) ++{ ++ struct nameidata nd; ++ int error; ++ ++ error = user_path_walk_link(path, &nd); ++ if (error) ++ return error; ++ error = removexattr(nd.dentry, name); ++ path_release(&nd); ++ return error; ++} ++ ++asmlinkage long ++sys_fremovexattr(int fd, char *name) ++{ ++ struct file *f; ++ int error = -EBADF; ++ ++ f = fget(fd); ++ if (!f) ++ return error; ++ error = removexattr(f->f_dentry, name); ++ fput(f); ++ return error; ++} +Index: linux-DRV401/include/linux/cache_def.h +=================================================================== +--- linux-DRV401.orig/include/linux/cache_def.h 2004-10-12 08:56:38.404764448 -0700 ++++ linux-DRV401/include/linux/cache_def.h 2004-10-15 11:03:52.000000000 -0700 +@@ -0,0 +1,15 @@ ++/* ++ * linux/cache_def.h ++ * Handling of caches defined in drivers, filesystems, ... ++ * ++ * Copyright (C) 2002 by Andreas Gruenbacher, ++ */ ++ ++struct cache_definition { ++ const char *name; ++ void (*shrink)(int, unsigned int); ++ struct list_head link; ++}; ++ ++extern void register_cache(struct cache_definition *); ++extern void unregister_cache(struct cache_definition *); +Index: linux-DRV401/include/linux/errno.h +=================================================================== +--- linux-DRV401.orig/include/linux/errno.h 2004-10-15 10:26:15.000000000 -0700 ++++ linux-DRV401/include/linux/errno.h 2004-10-15 11:03:52.000000000 -0700 +@@ -23,4 +23,8 @@ + + #endif + ++/* Defined for extended attributes */ ++#define ENOATTR ENODATA /* No such attribute */ ++#define ENOTSUP EOPNOTSUPP /* Operation not supported */ ++ + #endif +Index: linux-DRV401/include/linux/ext2_fs.h +=================================================================== +--- linux-DRV401.orig/include/linux/ext2_fs.h 2004-10-15 10:26:11.000000000 -0700 ++++ linux-DRV401/include/linux/ext2_fs.h 2004-10-15 11:03:52.000000000 -0700 +@@ -57,8 +57,6 @@ + */ + #define EXT2_BAD_INO 1 /* Bad blocks inode */ + #define EXT2_ROOT_INO 2 /* Root inode */ +-#define EXT2_ACL_IDX_INO 3 /* ACL inode */ +-#define EXT2_ACL_DATA_INO 4 /* ACL inode */ + #define EXT2_BOOT_LOADER_INO 5 /* Boot loader inode */ + #define EXT2_UNDEL_DIR_INO 6 /* Undelete directory inode */ + +@@ -86,7 +84,6 @@ + #else + # define EXT2_BLOCK_SIZE(s) (EXT2_MIN_BLOCK_SIZE << (s)->s_log_block_size) + #endif +-#define EXT2_ACLE_PER_BLOCK(s) (EXT2_BLOCK_SIZE(s) / sizeof (struct ext2_acl_entry)) + #define EXT2_ADDR_PER_BLOCK(s) (EXT2_BLOCK_SIZE(s) / sizeof (__u32)) + #ifdef __KERNEL__ + # define EXT2_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) +@@ -121,28 +118,6 @@ + #endif + + /* +- * ACL structures +- */ +-struct ext2_acl_header /* Header of Access Control Lists */ +-{ +- __u32 aclh_size; +- __u32 aclh_file_count; +- __u32 aclh_acle_count; +- __u32 aclh_first_acle; +-}; +- +-struct ext2_acl_entry /* Access Control List Entry */ +-{ +- __u32 acle_size; +- __u16 acle_perms; /* Access permissions */ +- __u16 acle_type; /* Type of entry */ +- __u16 acle_tag; /* User or group identity */ +- __u16 acle_pad1; +- __u32 acle_next; /* Pointer on next entry for the */ +- /* same inode or on next free entry */ +-}; +- +-/* + * Structure of a blocks group descriptor + */ + struct ext2_group_desc +@@ -314,6 +289,7 @@ + #define EXT2_MOUNT_ERRORS_PANIC 0x0040 /* Panic on errors */ + #define EXT2_MOUNT_MINIX_DF 0x0080 /* Mimics the Minix statfs */ + #define EXT2_MOUNT_NO_UID32 
0x0200 /* Disable 32-bit UIDs */ ++#define EXT2_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */ + + #define clear_opt(o, opt) o &= ~EXT2_MOUNT_##opt + #define set_opt(o, opt) o |= EXT2_MOUNT_##opt +@@ -397,6 +373,7 @@ + + #ifdef __KERNEL__ + #define EXT2_SB(sb) (&((sb)->u.ext2_sb)) ++#define EXT2_I(inode) (&((inode)->u.ext2_i)) + #else + /* Assume that user mode programs are passing in an ext2fs superblock, not + * a kernel struct super_block. This will allow us to call the feature-test +@@ -466,7 +443,7 @@ + #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 + #define EXT2_FEATURE_INCOMPAT_ANY 0xffffffff + +-#define EXT2_FEATURE_COMPAT_SUPP 0 ++#define EXT2_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR + #define EXT2_FEATURE_INCOMPAT_SUPP EXT2_FEATURE_INCOMPAT_FILETYPE + #define EXT2_FEATURE_RO_COMPAT_SUPP (EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT2_FEATURE_RO_COMPAT_LARGE_FILE| \ +@@ -623,8 +600,10 @@ + + /* namei.c */ + extern struct inode_operations ext2_dir_inode_operations; ++extern struct inode_operations ext2_special_inode_operations; + + /* symlink.c */ ++extern struct inode_operations ext2_symlink_inode_operations; + extern struct inode_operations ext2_fast_symlink_inode_operations; + + #endif /* __KERNEL__ */ +Index: linux-DRV401/include/linux/ext2_xattr.h +=================================================================== +--- linux-DRV401.orig/include/linux/ext2_xattr.h 2004-10-12 08:56:38.404764448 -0700 ++++ linux-DRV401/include/linux/ext2_xattr.h 2004-10-15 11:03:52.000000000 -0700 +@@ -0,0 +1,157 @@ ++/* ++ File: linux/ext2_xattr.h ++ ++ On-disk format of extended attributes for the ext2 filesystem. ++ ++ (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org> ++*/ ++ ++#include <linux/config.h> ++#include <linux/init.h> ++#include <linux/xattr.h> ++ ++/* Magic value in attribute blocks */ ++#define EXT2_XATTR_MAGIC 0xEA020000 ++ ++/* Maximum number of references to one attribute block */ ++#define EXT2_XATTR_REFCOUNT_MAX 1024 ++ ++/* Name indexes */ ++#define EXT2_XATTR_INDEX_MAX 10 ++#define EXT2_XATTR_INDEX_USER 1 ++#define EXT2_XATTR_INDEX_POSIX_ACL_ACCESS 2 ++#define EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT 3 ++ ++struct ext2_xattr_header { ++ __u32 h_magic; /* magic number for identification */ ++ __u32 h_refcount; /* reference count */ ++ __u32 h_blocks; /* number of disk blocks used */ ++ __u32 h_hash; /* hash value of all attributes */ ++ __u32 h_reserved[4]; /* zero right now */ ++}; ++ ++struct ext2_xattr_entry { ++ __u8 e_name_len; /* length of name */ ++ __u8 e_name_index; /* attribute name index */ ++ __u16 e_value_offs; /* offset in disk block of value */ ++ __u32 e_value_block; /* disk block attribute is stored on (n/i) */ ++ __u32 e_value_size; /* size of attribute value */ ++ __u32 e_hash; /* hash value of name and value */ ++ char e_name[0]; /* attribute name */ ++}; ++ ++#define EXT2_XATTR_PAD_BITS 2 ++#define EXT2_XATTR_PAD (1<<EXT2_XATTR_PAD_BITS) ++#define EXT2_XATTR_ROUND (EXT2_XATTR_PAD-1) ++#define EXT2_XATTR_LEN(name_len) \ ++ (((name_len) + EXT2_XATTR_ROUND + \ ++ sizeof(struct ext2_xattr_entry)) & ~EXT2_XATTR_ROUND) ++#define EXT2_XATTR_NEXT(entry) \ ++ ( (struct ext2_xattr_entry *)( \ ++ (char *)(entry) + EXT2_XATTR_LEN((entry)->e_name_len)) ) ++#define EXT2_XATTR_SIZE(size) \ ++ (((size) + EXT2_XATTR_ROUND) & ~EXT2_XATTR_ROUND) ++ ++#ifdef __KERNEL__ ++ ++# ifdef CONFIG_EXT2_FS_XATTR ++ ++struct ext2_xattr_handler { ++ char *prefix; ++ size_t (*list)(char *list, struct inode *inode, const char *name, ++ int name_len); ++ int (*get)(struct inode *inode, const char *name, void *buffer, ++ size_t size); ++ int (*set)(struct inode *inode, const char *name, const void *buffer, ++ size_t size, int flags); ++}; ++ ++extern int ext2_xattr_register(int, struct ext2_xattr_handler *); ++extern void ext2_xattr_unregister(int, struct ext2_xattr_handler *); ++ ++extern int ext2_setxattr(struct dentry *, const char *, const void *,
size_t, int); ++extern ssize_t ext2_getxattr(struct dentry *, const char *, void *, size_t); ++extern ssize_t ext2_listxattr(struct dentry *, char *, size_t); ++extern int ext2_removexattr(struct dentry *, const char *); ++ ++extern int ext2_xattr_get(struct inode *, int, const char *, void *, size_t); ++extern int ext2_xattr_list(struct inode *, char *, size_t); ++extern int ext2_xattr_set(struct inode *, int, const char *, const void *, size_t, int); ++ ++extern void ext2_xattr_delete_inode(struct inode *); ++extern void ext2_xattr_put_super(struct super_block *); ++ ++extern int init_ext2_xattr(void) __init; ++extern void exit_ext2_xattr(void); ++ ++# else /* CONFIG_EXT2_FS_XATTR */ ++# define ext2_setxattr NULL ++# define ext2_getxattr NULL ++# define ext2_listxattr NULL ++# define ext2_removexattr NULL ++ ++static inline int ++ext2_xattr_get(struct inode *inode, int name_index, ++ const char *name, void *buffer, size_t size) ++{ ++ return -ENOTSUP; ++} ++ ++static inline int ++ext2_xattr_list(struct inode *inode, char *buffer, size_t size) ++{ ++ return -ENOTSUP; ++} ++ ++static inline int ++ext2_xattr_set(struct inode *inode, int name_index, const char *name, ++ const void *value, size_t size, int flags) ++{ ++ return -ENOTSUP; ++} ++ ++static inline void ++ext2_xattr_delete_inode(struct inode *inode) ++{ ++} ++ ++static inline void ++ext2_xattr_put_super(struct super_block *sb) ++{ ++} ++ ++static inline int ++init_ext2_xattr(void) ++{ ++ return 0; ++} ++ ++static inline void ++exit_ext2_xattr(void) ++{ ++} ++ ++# endif /* CONFIG_EXT2_FS_XATTR */ ++ ++# ifdef CONFIG_EXT2_FS_XATTR_USER ++ ++extern int init_ext2_xattr_user(void) __init; ++extern void exit_ext2_xattr_user(void); ++ ++# else /* CONFIG_EXT2_FS_XATTR_USER */ ++ ++static inline int ++init_ext2_xattr_user(void) ++{ ++ return 0; ++} ++ ++static inline void ++exit_ext2_xattr_user(void) ++{ ++} ++ ++# endif /* CONFIG_EXT2_FS_XATTR_USER */ ++ ++#endif /* __KERNEL__ */ ++ +Index: linux-DRV401/include/linux/ext3_fs.h +=================================================================== +--- linux-DRV401.orig/include/linux/ext3_fs.h 2004-10-15 10:39:16.000000000 -0700 ++++ linux-DRV401/include/linux/ext3_fs.h 2004-10-15 11:03:52.000000000 -0700 +@@ -63,8 +63,6 @@ + */ + #define EXT3_BAD_INO 1 /* Bad blocks inode */ + #define EXT3_ROOT_INO 2 /* Root inode */ +-#define EXT3_ACL_IDX_INO 3 /* ACL inode */ +-#define EXT3_ACL_DATA_INO 4 /* ACL inode */ + #define EXT3_BOOT_LOADER_INO 5 /* Boot loader inode */ + #define EXT3_UNDEL_DIR_INO 6 /* Undelete directory inode */ + #define EXT3_RESIZE_INO 7 /* Reserved group descriptors inode */ +@@ -94,7 +92,6 @@ + #else + # define EXT3_BLOCK_SIZE(s) (EXT3_MIN_BLOCK_SIZE << (s)->s_log_block_size) + #endif +-#define EXT3_ACLE_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (struct ext3_acl_entry)) + #define EXT3_ADDR_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (__u32)) + #ifdef __KERNEL__ + # define EXT3_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) +@@ -129,28 +126,6 @@ + #endif + + /* +- * ACL structures +- */ +-struct ext3_acl_header /* Header of Access Control Lists */ +-{ +- __u32 aclh_size; +- __u32 aclh_file_count; +- __u32 aclh_acle_count; +- __u32 aclh_first_acle; +-}; +- +-struct ext3_acl_entry /* Access Control List Entry */ +-{ +- __u32 acle_size; +- __u16 acle_perms; /* Access permissions */ +- __u16 acle_type; /* Type of entry */ +- __u16 acle_tag; /* User or group identity */ +- __u16 acle_pad1; +- __u32 acle_next; /* Pointer on next entry for the */ +- /* same inode or on next free entry */ 
+-}; +- +-/* + * Structure of a blocks group descriptor + */ + struct ext3_group_desc +@@ -344,6 +319,7 @@ + #define EXT3_MOUNT_WRITEBACK_DATA 0x0C00 /* No data ordering */ + #define EXT3_MOUNT_UPDATE_JOURNAL 0x1000 /* Update the journal format */ + #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */ ++#define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef _LINUX_EXT2_FS_H +@@ -520,7 +496,7 @@ + #define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ + #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ + +-#define EXT3_FEATURE_COMPAT_SUPP 0 ++#define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR + #define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \ + EXT3_FEATURE_INCOMPAT_RECOVER) + #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ +@@ -703,6 +679,7 @@ + extern unsigned long ext3_count_free (struct buffer_head *, unsigned); + + /* inode.c */ ++extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int); + extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); + extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); + +@@ -771,8 +748,10 @@ + + /* namei.c */ + extern struct inode_operations ext3_dir_inode_operations; ++extern struct inode_operations ext3_special_inode_operations; + + /* symlink.c */ ++extern struct inode_operations ext3_symlink_inode_operations; + extern struct inode_operations ext3_fast_symlink_inode_operations; + + +Index: linux-DRV401/include/linux/ext3_jbd.h +=================================================================== +--- linux-DRV401.orig/include/linux/ext3_jbd.h 2004-10-15 10:39:16.000000000 -0700 ++++ linux-DRV401/include/linux/ext3_jbd.h 2004-10-15 11:03:52.000000000 -0700 +@@ -30,13 +30,19 @@ + + #define EXT3_SINGLEDATA_TRANS_BLOCKS 8 + ++/* Extended attributes may touch two data buffers, two bitmap buffers, ++ * and two group and summaries. */ ++ ++#define EXT3_XATTR_TRANS_BLOCKS 8 ++ + /* Define the minimum size for a transaction which modifies data. This + * needs to take into account the fact that we may end up modifying two + * quota files too (one for the group, one for the user quota). The + * superblock only gets updated once, of course, so don't bother + * counting that again for the quota updates. */ + +-#define EXT3_DATA_TRANS_BLOCKS (3 * EXT3_SINGLEDATA_TRANS_BLOCKS - 2) ++#define EXT3_DATA_TRANS_BLOCKS (3 * EXT3_SINGLEDATA_TRANS_BLOCKS + \ ++ EXT3_XATTR_TRANS_BLOCKS - 2) + + extern int ext3_writepage_trans_blocks(struct inode *inode); + +Index: linux-DRV401/include/linux/ext3_xattr.h +=================================================================== +--- linux-DRV401.orig/include/linux/ext3_xattr.h 2004-10-12 08:56:38.404764448 -0700 ++++ linux-DRV401/include/linux/ext3_xattr.h 2004-10-15 11:03:52.000000000 -0700 +@@ -0,0 +1,157 @@ ++/* ++ File: linux/ext3_xattr.h ++ ++ On-disk format of extended attributes for the ext3 filesystem. 
++
++  (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
++*/
++
++#include <linux/config.h>
++#include <linux/init.h>
++#include <linux/xattr.h>
++
++/* Magic value in attribute blocks */
++#define EXT3_XATTR_MAGIC		0xEA020000
++
++/* Maximum number of references to one attribute block */
++#define EXT3_XATTR_REFCOUNT_MAX		1024
++
++/* Name indexes */
++#define EXT3_XATTR_INDEX_MAX			10
++#define EXT3_XATTR_INDEX_USER			1
++#define EXT3_XATTR_INDEX_POSIX_ACL_ACCESS	2
++#define EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT	3
++
++struct ext3_xattr_header {
++	__u32	h_magic;	/* magic number for identification */
++	__u32	h_refcount;	/* reference count */
++	__u32	h_blocks;	/* number of disk blocks used */
++	__u32	h_hash;		/* hash value of all attributes */
++	__u32	h_reserved[4];	/* zero right now */
++};
++
++struct ext3_xattr_entry {
++	__u8	e_name_len;	/* length of name */
++	__u8	e_name_index;	/* attribute name index */
++	__u16	e_value_offs;	/* offset in disk block of value */
++	__u32	e_value_block;	/* disk block attribute is stored on (n/i) */
++	__u32	e_value_size;	/* size of attribute value */
++	__u32	e_hash;		/* hash value of name and value */
++	char	e_name[0];	/* attribute name */
++};
++
++#define EXT3_XATTR_PAD_BITS		2
++#define EXT3_XATTR_PAD		(1<<EXT3_XATTR_PAD_BITS)
++#define EXT3_XATTR_ROUND	(EXT3_XATTR_PAD-1)
++#define EXT3_XATTR_LEN(name_len) \
++	(((name_len) + EXT3_XATTR_ROUND + \
++	sizeof(struct ext3_xattr_entry)) & ~EXT3_XATTR_ROUND)
++#define EXT3_XATTR_NEXT(entry) \
++	( (struct ext3_xattr_entry *)( \
++	  (char *)(entry) + EXT3_XATTR_LEN((entry)->e_name_len)) )
++#define EXT3_XATTR_SIZE(size) \
++	(((size) + EXT3_XATTR_ROUND) & ~EXT3_XATTR_ROUND)
++
++#ifdef __KERNEL__
++
++# ifdef CONFIG_EXT3_FS_XATTR
++
++struct ext3_xattr_handler {
++	char *prefix;
++	size_t (*list)(char *list, struct inode *inode, const char *name,
++		       int name_len);
++	int (*get)(struct inode *inode, const char *name, void *buffer,
++		   size_t size);
++	int (*set)(struct inode *inode, const char *name, const void *buffer,
++		   size_t size, int flags);
++};
++
++extern int ext3_xattr_register(int, struct ext3_xattr_handler *);
++extern void ext3_xattr_unregister(int, struct ext3_xattr_handler *);
++
++extern int ext3_setxattr(struct dentry *, const char *, const void *, size_t, int);
++extern ssize_t ext3_getxattr(struct dentry *, const char *, void *, size_t);
++extern ssize_t ext3_listxattr(struct dentry *, char *, size_t);
++extern int ext3_removexattr(struct dentry *, const char *);
++
++extern int ext3_xattr_get(struct inode *, int, const char *, void *, size_t);
++extern int ext3_xattr_list(struct inode *, char *, size_t);
++extern int ext3_xattr_set(handle_t *handle, struct inode *, int, const char *, const void *, size_t, int);
++
++extern void ext3_xattr_delete_inode(handle_t *, struct inode *);
++extern void ext3_xattr_put_super(struct super_block *);
++
++extern int init_ext3_xattr(void) __init;
++extern void exit_ext3_xattr(void);
++
++# else  /* CONFIG_EXT3_FS_XATTR */
++#  define ext3_setxattr		NULL
++#  define ext3_getxattr		NULL
++#  define ext3_listxattr	NULL
++#  define ext3_removexattr	NULL
++
++static inline int
++ext3_xattr_get(struct inode *inode, int name_index, const char *name,
++	       void *buffer, size_t size)
++{
++	return -ENOTSUP;
++}
++
++static inline int
++ext3_xattr_list(struct inode *inode, void *buffer, size_t size)
++{
++	return -ENOTSUP;
++}
++
++static inline int
++ext3_xattr_set(handle_t *handle, struct inode *inode, int name_index,
++	       const char *name, const void *value, size_t size, int flags)
++{
++	return -ENOTSUP;
++}
++
++static inline void
++ext3_xattr_delete_inode(handle_t *handle, struct inode *inode)
++{
++}
++
++static inline void
++ext3_xattr_put_super(struct super_block *sb)
++{
++}
++
++static inline int
++init_ext3_xattr(void)
++{
++	return 0;
++}
++
++static inline void
++exit_ext3_xattr(void)
++{
++}
++
++# endif  /* CONFIG_EXT3_FS_XATTR */
++
++# ifdef 
CONFIG_EXT3_FS_XATTR_USER ++ ++extern int init_ext3_xattr_user(void) __init; ++extern void exit_ext3_xattr_user(void); ++ ++# else /* CONFIG_EXT3_FS_XATTR_USER */ ++ ++static inline int ++init_ext3_xattr_user(void) ++{ ++ return 0; ++} ++ ++static inline void ++exit_ext3_xattr_user(void) ++{ ++} ++ ++#endif /* CONFIG_EXT3_FS_XATTR_USER */ ++ ++#endif /* __KERNEL__ */ ++ +Index: linux-DRV401/include/linux/fs.h +=================================================================== +--- linux-DRV401.orig/include/linux/fs.h 2004-10-15 10:39:15.000000000 -0700 ++++ linux-DRV401/include/linux/fs.h 2004-10-15 11:03:52.000000000 -0700 +@@ -936,6 +936,10 @@ + int (*setattr) (struct dentry *, struct iattr *); + int (*setattr_raw) (struct inode *, struct iattr *); + int (*getattr) (struct dentry *, struct iattr *); ++ int (*setxattr) (struct dentry *, const char *, const void *, size_t, int); ++ ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); ++ ssize_t (*listxattr) (struct dentry *, char *, size_t); ++ int (*removexattr) (struct dentry *, const char *); + }; + + struct seq_file; +Index: linux-DRV401/include/linux/mbcache.h +=================================================================== +--- linux-DRV401.orig/include/linux/mbcache.h 2004-10-12 08:56:38.404764448 -0700 ++++ linux-DRV401/include/linux/mbcache.h 2004-10-15 11:03:52.000000000 -0700 +@@ -0,0 +1,69 @@ ++/* ++ File: linux/mbcache.h ++ ++ (C) 2001 by Andreas Gruenbacher, ++*/ ++ ++/* Hardwire the number of additional indexes */ ++#define MB_CACHE_INDEXES_COUNT 1 ++ ++struct mb_cache_entry; ++ ++struct mb_cache_op { ++ int (*free)(struct mb_cache_entry *, int); ++}; ++ ++struct mb_cache { ++ struct list_head c_cache_list; ++ const char *c_name; ++ struct mb_cache_op c_op; ++ atomic_t c_entry_count; ++ int c_bucket_count; ++#ifndef MB_CACHE_INDEXES_COUNT ++ int c_indexes_count; ++#endif ++ kmem_cache_t *c_entry_cache; ++ struct list_head *c_block_hash; ++ struct list_head *c_indexes_hash[0]; ++}; ++ ++struct mb_cache_entry_index { ++ struct list_head o_list; ++ unsigned int o_key; ++}; ++ ++struct mb_cache_entry { ++ struct list_head e_lru_list; ++ struct mb_cache *e_cache; ++ atomic_t e_used; ++ kdev_t e_dev; ++ unsigned long e_block; ++ struct list_head e_block_list; ++ struct mb_cache_entry_index e_indexes[0]; ++}; ++ ++/* Functions on caches */ ++ ++struct mb_cache * mb_cache_create(const char *, struct mb_cache_op *, size_t, ++ int, int); ++void mb_cache_shrink(struct mb_cache *, kdev_t); ++void mb_cache_destroy(struct mb_cache *); ++ ++/* Functions on cache entries */ ++ ++struct mb_cache_entry *mb_cache_entry_alloc(struct mb_cache *); ++int mb_cache_entry_insert(struct mb_cache_entry *, kdev_t, unsigned long, ++ unsigned int[]); ++void mb_cache_entry_rehash(struct mb_cache_entry *, unsigned int[]); ++void mb_cache_entry_release(struct mb_cache_entry *); ++void mb_cache_entry_takeout(struct mb_cache_entry *); ++void mb_cache_entry_free(struct mb_cache_entry *); ++struct mb_cache_entry *mb_cache_entry_dup(struct mb_cache_entry *); ++struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *, kdev_t, ++ unsigned long); ++#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) ++struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache, int, ++ kdev_t, unsigned int); ++struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache_entry *, int, ++ kdev_t, unsigned int); ++#endif +Index: linux-DRV401/include/linux/xattr.h +=================================================================== 
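Aside: the name-index dispatch declared in ext3_xattr.h above is how a namespace such as "user." plugs into ext3_xattr_get/ext3_xattr_set. A condensed sketch of a handler registration, modeled on the fs/ext3/xattr_user.c this series adds (permission checks omitted; identifiers are illustrative, not the verbatim file):

    #include <linux/init.h>
    #include <linux/fs.h>
    #include <linux/string.h>
    #include <linux/errno.h>
    #include <linux/ext3_jbd.h>
    #include <linux/ext3_xattr.h>

    #define XATTR_USER_PREFIX "user."

    static size_t
    user_list(char *list, struct inode *inode, const char *name, int name_len)
    {
            const int prefix_len = sizeof(XATTR_USER_PREFIX) - 1;

            if (list) {             /* NULL list: caller only wants the size */
                    memcpy(list, XATTR_USER_PREFIX, prefix_len);
                    memcpy(list + prefix_len, name, name_len);
                    list[prefix_len + name_len] = '\0';
            }
            return prefix_len + name_len + 1;
    }

    static int
    user_get(struct inode *inode, const char *name, void *buffer, size_t size)
    {
            if (*name == '\0')
                    return -EINVAL;  /* a bare "user." is not a valid name */
            return ext3_xattr_get(inode, EXT3_XATTR_INDEX_USER, name,
                                  buffer, size);
    }

    static int
    user_set(struct inode *inode, const char *name, const void *value,
             size_t size, int flags)
    {
            handle_t *handle;
            int error;

            if (*name == '\0')
                    return -EINVAL;
            /* ext3_xattr_set() needs a running transaction */
            handle = ext3_journal_start(inode, EXT3_XATTR_TRANS_BLOCKS);
            if (IS_ERR(handle))
                    return PTR_ERR(handle);
            error = ext3_xattr_set(handle, inode, EXT3_XATTR_INDEX_USER,
                                   name, value, size, flags);
            ext3_journal_stop(handle, inode);
            return error;
    }

    static struct ext3_xattr_handler user_handler = {
            prefix: XATTR_USER_PREFIX,      /* 2.4-era gcc initializer style */
            list:   user_list,
            get:    user_get,
            set:    user_set,
    };

    int __init init_demo_xattr_user(void)
    {
            return ext3_xattr_register(EXT3_XATTR_INDEX_USER, &user_handler);
    }
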
+--- linux-DRV401.orig/include/linux/xattr.h 2004-10-12 08:56:38.404764448 -0700 ++++ linux-DRV401/include/linux/xattr.h 2004-10-15 11:03:52.000000000 -0700 +@@ -0,0 +1,15 @@ ++/* ++ File: linux/xattr.h ++ ++ Extended attributes handling. ++ ++ Copyright (C) 2001 by Andreas Gruenbacher ++ Copyright (c) 2001-2002 Silicon Graphics, Inc. All Rights Reserved. ++*/ ++#ifndef _LINUX_XATTR_H ++#define _LINUX_XATTR_H ++ ++#define XATTR_CREATE 0x1 /* set the value, fail if attr already exists */ ++#define XATTR_REPLACE 0x2 /* set the value, fail if attr does not exist */ ++ ++#endif /* _LINUX_XATTR_H */ +Index: linux-DRV401/include/linux/limits.h +=================================================================== +--- linux-DRV401.orig/include/linux/limits.h 2004-10-15 10:26:20.000000000 -0700 ++++ linux-DRV401/include/linux/limits.h 2004-10-15 11:03:52.000000000 -0700 +@@ -13,6 +13,9 @@ + #define NAME_MAX 255 /* # chars in a file name */ + #define PATH_MAX 4096 /* # chars in a path name including nul */ + #define PIPE_BUF 4096 /* # bytes in atomic write to a pipe */ ++#define XATTR_NAME_MAX 255 /* # chars in an extended attribute name */ ++#define XATTR_SIZE_MAX 65536 /* size of an extended attribute value (64k) */ ++#define XATTR_LIST_MAX 65536 /* size of extended attribute namelist (64k) */ + + #define RTSIG_MAX 32 + +Index: linux-DRV401/kernel/ksyms.c +=================================================================== +--- linux-DRV401.orig/kernel/ksyms.c 2004-10-15 10:39:15.000000000 -0700 ++++ linux-DRV401/kernel/ksyms.c 2004-10-15 11:03:52.000000000 -0700 +@@ -11,6 +11,7 @@ + + #include + #include ++#include + #include + #include + #include +@@ -88,6 +89,7 @@ + EXPORT_SYMBOL(exit_files); + EXPORT_SYMBOL(exit_fs); + EXPORT_SYMBOL(exit_sighand); ++EXPORT_SYMBOL(copy_fs_struct); + EXPORT_SYMBOL(unshare_files); + + /* internal kernel memory management */ +@@ -105,6 +107,8 @@ + EXPORT_SYMBOL(kmem_cache_shrink); + EXPORT_SYMBOL(kmem_cache_alloc); + EXPORT_SYMBOL(kmem_cache_free); ++EXPORT_SYMBOL(register_cache); ++EXPORT_SYMBOL(unregister_cache); + EXPORT_SYMBOL(kmalloc); + EXPORT_SYMBOL(kfree); + EXPORT_SYMBOL(vfree); +Index: linux-DRV401/mm/vmscan.c +=================================================================== +--- linux-DRV401.orig/mm/vmscan.c 2004-10-15 10:24:07.000000000 -0700 ++++ linux-DRV401/mm/vmscan.c 2004-10-15 11:08:53.000000000 -0700 +@@ -15,6 +15,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -31,6 +32,39 @@ + */ + #define DEF_PRIORITY (6) + ++static DECLARE_MUTEX(other_caches_sem); ++static LIST_HEAD(cache_definitions); ++ ++void register_cache(struct cache_definition *cache) ++{ ++ down(&other_caches_sem); ++ list_add(&cache->link, &cache_definitions); ++ up(&other_caches_sem); ++} ++ ++void unregister_cache(struct cache_definition *cache) ++{ ++ down(&other_caches_sem); ++ list_del(&cache->link); ++ up(&other_caches_sem); ++} ++ ++static void shrink_other_caches(unsigned int priority, int gfp_mask) ++{ ++ struct list_head *p; ++ ++ if (down_trylock(&other_caches_sem)) ++ return; ++ ++ list_for_each_prev(p, &cache_definitions) { ++ struct cache_definition *cache = ++ list_entry(p, struct cache_definition, link); ++ ++ cache->shrink(priority, gfp_mask); ++ } ++ up(&other_caches_sem); ++} ++ + /* + * The swap-out function returns 1 if it successfully + * scanned all the pages it was asked to (`count'). 
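Aside: register_cache() above is the hook the rest of this series depends on; fs/mbcache.c registers the shared xattr block cache here so VM pressure reaches caches defined outside mm/. A minimal sketch of a client of the cache_def.h interface (cache name and shrink policy are illustrative):

    #include <linux/init.h>
    #include <linux/list.h>
    #include <linux/cache_def.h>

    static void demo_shrink(int priority, unsigned int gfp_mask)
    {
            /* free some cached entries; a lower priority value means
             * the VM wants us to shrink more aggressively */
    }

    static struct cache_definition demo_cache = {
            name:   "demo",
            shrink: demo_shrink,
    };

    int __init demo_init(void)
    {
            register_cache(&demo_cache);  /* shrink_other_caches() now sees us */
            return 0;
    }

    void demo_exit(void)
    {
            unregister_cache(&demo_cache);  /* before freeing the entries */
    }

Note that shrink() runs with other_caches_sem held (taken via down_trylock above), so a shrink callback must not itself call register_cache() or unregister_cache().
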
+@@ -584,6 +618,7 @@ + + shrink_dcache_memory(priority, gfp_mask); + shrink_icache_memory(priority, gfp_mask); ++ shrink_other_caches(priority, gfp_mask); + #ifdef CONFIG_QUOTA + shrink_dqcache_memory(DEF_PRIORITY, gfp_mask); + #endif diff --git a/lustre/kernel_patches/patches/linux-2.4.19-suse-xattr-0.8.54-hp.patch b/lustre/kernel_patches/patches/linux-2.4.19-suse-xattr-0.8.54-hp.patch new file mode 100644 index 0000000..1becfbc --- /dev/null +++ b/lustre/kernel_patches/patches/linux-2.4.19-suse-xattr-0.8.54-hp.patch @@ -0,0 +1,346 @@ + Documentation/Configure.help | 66 ++ + arch/ia64/defconfig | 7 + fs/Config.in | 14 + fs/Makefile | 3 + fs/ext2/Makefile | 4 + fs/ext2/file.c | 5 + fs/ext2/ialloc.c | 2 + fs/ext2/inode.c | 34 - + fs/ext2/namei.c | 14 + fs/ext2/super.c | 29 + fs/ext2/symlink.c | 14 + fs/ext2/xattr.c | 1212 +++++++++++++++++++++++++++++++++++++++++ + fs/ext2/xattr_user.c | 103 +++ + fs/ext3/Makefile | 9 + fs/ext3/ext3-exports.c | 13 + fs/ext3/file.c | 5 + fs/ext3/ialloc.c | 2 + fs/ext3/inode.c | 35 - + fs/ext3/namei.c | 21 + fs/ext3/super.c | 36 + + fs/ext3/symlink.c | 14 + fs/ext3/xattr.c | 1225 ++++++++++++++++++++++++++++++++++++++++++ + fs/ext3/xattr_user.c | 111 +++ + fs/jfs/jfs_xattr.h | 6 + fs/jfs/xattr.c | 6 + fs/mbcache.c | 648 ++++++++++++++++++++++ + include/linux/cache_def.h | 15 + include/linux/errno.h | 4 + include/linux/ext2_fs.h | 31 - + include/linux/ext2_xattr.h | 157 +++++ + include/linux/ext3_fs.h | 31 - + include/linux/ext3_jbd.h | 8 + include/linux/ext3_xattr.h | 157 +++++ + include/linux/fs.h | 2 + include/linux/mbcache.h | 69 ++ + kernel/ksyms.c | 4 + mm/vmscan.c | 35 + + 62 files changed, 4343 insertions(+), 182 deletions(-) + +Index: linux-2.4.19.SuSE/Documentation/Configure.help +=================================================================== +--- linux-2.4.19.SuSE.orig/Documentation/Configure.help 2004-05-03 11:20:17.000000000 -0700 ++++ linux-2.4.19.SuSE/Documentation/Configure.help 2004-05-03 11:50:22.000000000 -0700 +@@ -15296,6 +15296,39 @@ + + If unsure, say N. + ++Ext2 extended attributes ++CONFIG_EXT2_FS_XATTR ++ Extended attributes are name:value pairs associated with inodes by ++ the kernel or by users (see the attr(5) manual page, or visit ++ for details). ++ ++ If unsure, say N. ++ ++Ext2 extended attribute block sharing ++CONFIG_EXT2_FS_XATTR_SHARING ++ This options enables code for sharing identical extended attribute ++ blocks among multiple inodes. ++ ++ Usually, say Y. ++ ++Ext2 extended user attributes ++CONFIG_EXT2_FS_XATTR_USER ++ This option enables extended user attributes on ext2. Processes can ++ associate extended user attributes with inodes to store additional ++ information such as the character encoding of files, etc. (see the ++ attr(5) manual page, or visit for details). ++ ++ If unsure, say N. ++ ++Ext2 trusted extended attributes ++CONFIG_EXT2_FS_XATTR_TRUSTED ++ This option enables extended attributes on ext2 that are accessible ++ (and visible) only to users capable of CAP_SYS_ADMIN. Usually this ++ is only the super user. Trusted extended attributes are meant for ++ implementing system/security services. ++ ++ If unsure, say N. ++ + Ext3 journalling file system support (EXPERIMENTAL) + CONFIG_EXT3_FS + This is the journalling version of the Second extended file system +@@ -15354,6 +15387,39 @@ + + If unsure, say N. 
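For context, what the CONFIG_EXT*_FS_XATTR_USER options described here buy from user space, sketched against the syscalls wired up earlier in this series (assumes the attr package's <attr/xattr.h> declarations; the path and attribute name are examples only):

    #include <stdio.h>
    #include <string.h>
    #include <errno.h>
    #include <attr/xattr.h>

    int main(void)
    {
            const char *val = "iso8859-1";
            char buf[64];
            ssize_t n;

            /* the "user." namespace is gated by CONFIG_EXT*_FS_XATTR_USER
             * plus the corresponding EXT*_MOUNT_XATTR_USER mount option */
            if (setxattr("/mnt/test/file", "user.charset", val,
                         strlen(val), 0 /* 0 = create or replace */))
                    perror("setxattr");

            n = getxattr("/mnt/test/file", "user.charset", buf, sizeof(buf));
            if (n < 0 && errno == ENOATTR)  /* ENOATTR == ENODATA, per the
                                             * errno.h hunk in this series */
                    printf("attribute not set\n");
            else if (n >= 0)
                    printf("user.charset = %.*s\n", (int)n, buf);
            return 0;
    }
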
+ ++Ext3 extended attributes ++CONFIG_EXT3_FS_XATTR ++ Extended attributes are name:value pairs associated with inodes by ++ the kernel or by users (see the attr(5) manual page, or visit ++ for details). ++ ++ If unsure, say N. ++ ++Ext3 extended attribute block sharing ++CONFIG_EXT3_FS_XATTR_SHARING ++ This options enables code for sharing identical extended attribute ++ blocks among multiple inodes. ++ ++ Usually, say Y. ++ ++Ext3 extended user attributes ++CONFIG_EXT3_FS_XATTR_USER ++ This option enables extended user attributes on ext3. Processes can ++ associate extended user attributes with inodes to store additional ++ information such as the character encoding of files, etc. (see the ++ attr(5) manual page, or visit for details). ++ ++ If unsure, say N. ++ ++Ext3 trusted extended attributes ++CONFIG_EXT3_FS_XATTR_TRUSTED ++ This option enables extended attributes on ext3 that are accessible ++ (and visible) only to users capable of CAP_SYS_ADMIN. Usually this ++ is only the super user. Trusted extended attributes are meant for ++ implementing system/security services. ++ ++ If unsure, say N. ++ + Journal Block Device support (JBD for ext3) (EXPERIMENTAL) + CONFIG_JBD + This is a generic journalling layer for block devices. It is +Index: linux-2.4.19.SuSE/arch/ia64/defconfig +=================================================================== +--- linux-2.4.19.SuSE.orig/arch/ia64/defconfig 2004-05-03 11:19:10.000000000 -0700 ++++ linux-2.4.19.SuSE/arch/ia64/defconfig 2004-05-03 11:50:22.000000000 -0700 +@@ -1,6 +1,13 @@ + # + # Automatically generated make config: don't edit + # ++CONFIG_EXT3_FS_XATTR=y ++# CONFIG_EXT3_FS_XATTR_SHARING is not set ++# CONFIG_EXT3_FS_XATTR_USER is not set ++# CONFIG_EXT2_FS_XATTR is not set ++# CONFIG_EXT2_FS_XATTR_SHARING is not set ++# CONFIG_EXT2_FS_XATTR_USER is not set ++# CONFIG_FS_MBCACHE is not set + + # + # Code maturity level options +Index: linux-2.4.19.SuSE/fs/Config.in +=================================================================== +--- linux-2.4.19.SuSE.orig/fs/Config.in 2004-05-03 11:18:52.000000000 -0700 ++++ linux-2.4.19.SuSE/fs/Config.in 2004-05-03 11:50:22.000000000 -0700 +@@ -203,6 +203,10 @@ + #tristate 'Meta block cache' CONFIG_FS_MBCACHE + define_tristate CONFIG_FS_MBCACHE y + ++# Meta block cache for Extended Attributes (ext2/ext3) ++#tristate 'Meta block cache' CONFIG_FS_MBCACHE ++define_tristate CONFIG_FS_MBCACHE y ++ + mainmenu_option next_comment + comment 'Partition Types' + source fs/partitions/Config.in +Index: linux-2.4.19.SuSE/fs/Makefile +=================================================================== +--- linux-2.4.19.SuSE.orig/fs/Makefile 2004-05-03 11:22:49.000000000 -0700 ++++ linux-2.4.19.SuSE/fs/Makefile 2004-05-03 11:50:22.000000000 -0700 +@@ -104,6 +104,9 @@ + obj-$(CONFIG_FS_MBCACHE) += mbcache.o + obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o + ++export-objs += mbcache.o ++obj-$(CONFIG_FS_MBCACHE) += mbcache.o ++ + # persistent filesystems + obj-y += $(join $(subdir-y),$(subdir-y:%=/%.o)) + +Index: linux-2.4.19.SuSE/fs/ext2/Makefile +=================================================================== +--- linux-2.4.19.SuSE.orig/fs/ext2/Makefile 2004-05-03 11:18:46.000000000 -0700 ++++ linux-2.4.19.SuSE/fs/ext2/Makefile 2004-05-03 11:50:22.000000000 -0700 +@@ -18,4 +18,8 @@ + obj-$(CONFIG_EXT2_FS_XATTR_USER) += xattr_user.o + obj-$(CONFIG_EXT2_FS_POSIX_ACL) += acl.o + ++export-objs += xattr.o ++obj-$(CONFIG_EXT2_FS_XATTR) += xattr.o ++obj-$(CONFIG_EXT2_FS_XATTR_USER) += xattr_user.o ++ + 
include $(TOPDIR)/Rules.make +Index: linux-2.4.19.SuSE/fs/ext2/inode.c +=================================================================== +--- linux-2.4.19.SuSE.orig/fs/ext2/inode.c 2004-05-03 11:18:47.000000000 -0700 ++++ linux-2.4.19.SuSE/fs/ext2/inode.c 2004-05-03 11:50:22.000000000 -0700 +@@ -52,6 +52,18 @@ + } + + /* ++ * Test whether an inode is a fast symlink. ++ */ ++static inline int ext2_inode_is_fast_symlink(struct inode *inode) ++{ ++ int ea_blocks = inode->u.ext2_i.i_file_acl ? ++ (inode->i_sb->s_blocksize >> 9) : 0; ++ ++ return (S_ISLNK(inode->i_mode) && ++ inode->i_blocks - ea_blocks == 0); ++} ++ ++/* + * Called at each iput() + */ + void ext2_put_inode (struct inode * inode) +@@ -806,6 +818,8 @@ + return; + if (ext2_inode_is_fast_symlink(inode)) + return; ++ if (ext2_inode_is_fast_symlink(inode)) ++ return; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return; + +Index: linux-2.4.19.SuSE/fs/ext2/super.c +=================================================================== +--- linux-2.4.19.SuSE.orig/fs/ext2/super.c 2004-05-03 11:18:47.000000000 -0700 ++++ linux-2.4.19.SuSE/fs/ext2/super.c 2004-05-03 11:50:22.000000000 -0700 +@@ -70,6 +70,7 @@ + { + va_list args; + ++ ext2_xattr_put_super(sb); + if (!(sb->s_flags & MS_RDONLY)) { + sb->u.ext2_sb.s_mount_state |= EXT2_ERROR_FS; + sb->u.ext2_sb.s_es->s_state = +Index: linux-2.4.19.SuSE/fs/ext3/inode.c +=================================================================== +--- linux-2.4.19.SuSE.orig/fs/ext3/inode.c 2004-05-03 11:18:47.000000000 -0700 ++++ linux-2.4.19.SuSE/fs/ext3/inode.c 2004-05-03 11:50:22.000000000 -0700 +@@ -54,6 +54,18 @@ + inode->i_blocks - ea_blocks == 0); + } + ++/* ++ * Test whether an inode is a fast symlink. ++ */ ++static inline int ext3_inode_is_fast_symlink(struct inode *inode) ++{ ++ int ea_blocks = inode->u.ext3_i.i_file_acl ? ++ (inode->i_sb->s_blocksize >> 9) : 0; ++ ++ return (S_ISLNK(inode->i_mode) && ++ inode->i_blocks - ea_blocks == 0); ++} ++ + /* The ext3 forget function must perform a revoke if we are freeing data + * which has been journaled. Metadata (eg. indirect blocks) must be + * revoked in all cases. 
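Aside: the ea_blocks arithmetic in the two fast-symlink helpers above exists because i_blocks counts 512-byte sectors, so an inode whose only block is the xattr block referenced by i_file_acl would otherwise stop looking like a fast symlink. Restating the test with the units spelled out (a sketch of the same check, not new behavior):

    /* one EA block on a 4096-byte-blocksize fs contributes
     * 4096 >> 9 = 8 sectors to i_blocks; ignore them before the
     * classic "fast symlink has i_blocks == 0" test */
    static inline int demo_is_fast_symlink(struct inode *inode)
    {
            int ea_sectors = inode->u.ext3_i.i_file_acl ?
                    (inode->i_sb->s_blocksize >> 9) : 0;

            return S_ISLNK(inode->i_mode) &&
                    inode->i_blocks - ea_sectors == 0;
    }
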
+@@ -1968,6 +1980,8 @@ + return; + if (ext3_inode_is_fast_symlink(inode)) + return; ++ if (ext3_inode_is_fast_symlink(inode)) ++ return; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return; + +Index: linux-2.4.19.SuSE/fs/ext3/ext3-exports.c +=================================================================== +--- linux-2.4.19.SuSE.orig/fs/ext3/ext3-exports.c 2004-02-18 07:26:44.000000000 -0800 ++++ linux-2.4.19.SuSE/fs/ext3/ext3-exports.c 2004-05-03 11:50:22.000000000 -0700 +@@ -0,0 +1,13 @@ ++#include ++#include ++#include ++#include ++#include ++ ++EXPORT_SYMBOL(ext3_force_commit); ++EXPORT_SYMBOL(ext3_bread); ++EXPORT_SYMBOL(ext3_xattr_register); ++EXPORT_SYMBOL(ext3_xattr_unregister); ++EXPORT_SYMBOL(ext3_xattr_get); ++EXPORT_SYMBOL(ext3_xattr_list); ++EXPORT_SYMBOL(ext3_xattr_set); +Index: linux-2.4.19.SuSE/include/linux/errno.h +=================================================================== +--- linux-2.4.19.SuSE.orig/include/linux/errno.h 2004-05-03 11:20:21.000000000 -0700 ++++ linux-2.4.19.SuSE/include/linux/errno.h 2004-05-03 11:50:22.000000000 -0700 +@@ -30,4 +30,8 @@ + + #endif + ++/* Defined for extended attributes */ ++#define ENOATTR ENODATA /* No such attribute */ ++#define ENOTSUP EOPNOTSUPP /* Operation not supported */ ++ + #endif +Index: linux-2.4.19.SuSE/kernel/ksyms.c +=================================================================== +--- linux-2.4.19.SuSE.orig/kernel/ksyms.c 2004-05-03 11:22:48.000000000 -0700 ++++ linux-2.4.19.SuSE/kernel/ksyms.c 2004-05-03 11:50:22.000000000 -0700 +@@ -12,6 +12,7 @@ + #define __KERNEL_SYSCALLS__ + #include + #include ++#include + #include + #include + #include +Index: linux-2.4.19.SuSE/mm/vmscan.c +=================================================================== +--- linux-2.4.19.SuSE.orig/mm/vmscan.c 2004-05-03 11:18:53.000000000 -0700 ++++ linux-2.4.19.SuSE/mm/vmscan.c 2004-05-03 11:50:22.000000000 -0700 +@@ -32,6 +32,39 @@ + */ + int vm_passes = 60; + ++static DECLARE_MUTEX(other_caches_sem); ++static LIST_HEAD(cache_definitions); ++ ++void register_cache(struct cache_definition *cache) ++{ ++ down(&other_caches_sem); ++ list_add(&cache->link, &cache_definitions); ++ up(&other_caches_sem); ++} ++ ++void unregister_cache(struct cache_definition *cache) ++{ ++ down(&other_caches_sem); ++ list_del(&cache->link); ++ up(&other_caches_sem); ++} ++ ++static void shrink_other_caches(unsigned int priority, int gfp_mask) ++{ ++ struct list_head *p; ++ ++ if (down_trylock(&other_caches_sem)) ++ return; ++ ++ list_for_each_prev(p, &cache_definitions) { ++ struct cache_definition *cache = ++ list_entry(p, struct cache_definition, link); ++ ++ cache->shrink(priority, gfp_mask); ++ } ++ up(&other_caches_sem); ++} ++ + /* + * "vm_cache_scan_ratio" is how much of the inactive LRU queue we will scan + * in one go. 
A value of 6 for vm_cache_scan_ratio implies that we'll diff --git a/lustre/kernel_patches/patches/listman-2.4.19-bgl.patch b/lustre/kernel_patches/patches/listman-2.4.19-bgl.patch new file mode 100644 index 0000000..19ad959 --- /dev/null +++ b/lustre/kernel_patches/patches/listman-2.4.19-bgl.patch @@ -0,0 +1,72 @@ +Index: linux-2.4.18-chaos/include/linux/list.h +=================================================================== +--- linux-2.4.18-chaos.orig/include/linux/list.h 2003-11-23 00:07:05.000000000 +0300 ++++ linux-2.4.18-chaos/include/linux/list.h 2003-12-11 00:25:15.000000000 +0300 +@@ -173,6 +173,67 @@ + for (pos = (head)->prev, prefetch(pos->prev); pos != (head); \ + pos = pos->prev, prefetch(pos->prev)) + ++/** ++ * list_for_each_entry - iterate over list of given type ++ * @pos: the type * to use as a loop counter. ++ * @head: the head for your list. ++ * @member: the name of the list_struct within the struct. ++ */ ++#define list_for_each_entry(pos, head, member) \ ++ for (pos = list_entry((head)->next, typeof(*pos), member), \ ++ prefetch(pos->member.next); \ ++ &pos->member != (head); \ ++ pos = list_entry(pos->member.next, typeof(*pos), member), \ ++ prefetch(pos->member.next)) ++ ++#ifndef list_for_each_entry_safe ++/** ++ * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry ++ * @pos: the type * to use as a loop counter. ++ * @n: another type * to use as temporary storage ++ * @head: the head for your list. ++ * @member: the name of the list_struct within the struct. ++ */ ++#define list_for_each_entry_safe(pos, n, head, member) \ ++ for (pos = list_entry((head)->next, typeof(*pos), member), \ ++ n = list_entry(pos->member.next, typeof(*pos), member); \ ++ &pos->member != (head); \ ++ pos = n, n = list_entry(n->member.next, typeof(*n), member)) ++#endif ++ ++/** ++ * list_move - delete from one list and add as another's head ++ * @list: the entry to move ++ * @head: the head that will precede our entry ++ */ ++static inline void list_move(struct list_head *list, struct list_head *head) ++{ ++ __list_del(list->prev, list->next); ++ list_add(list, head); ++} ++ ++/** ++ * list_move_tail - delete from one list and add as another's tail ++ * @list: the entry to move ++ * @head: the head that will follow our entry ++ */ ++static inline void list_move_tail(struct list_head *list, ++ struct list_head *head) ++{ ++ __list_del(list->prev, list->next); ++ list_add_tail(list, head); ++} ++ ++/* 2.5 uses hlists for some things, like the d_hash. we'll treat them ++ * as 2.5 and let macros drop back.. 
*/ ++#define hlist_entry list_entry ++#define hlist_head list_head ++#define hlist_node list_head ++#define HLIST_HEAD LIST_HEAD ++#define INIT_HLIST_HEAD INIT_LIST_HEAD ++#define hlist_del_init list_del_init ++#define hlist_add_head list_add ++#define hlist_for_each_safe list_for_each_safe + + #endif /* __KERNEL__ || _LVM_H_INCLUDE */ + diff --git a/lustre/kernel_patches/patches/netconsole-2.4.24-ppc.patch b/lustre/kernel_patches/patches/netconsole-2.4.24-ppc.patch new file mode 100644 index 0000000..701c56a --- /dev/null +++ b/lustre/kernel_patches/patches/netconsole-2.4.24-ppc.patch @@ -0,0 +1,489 @@ +Index: linux-2.4.24/drivers/net/netconsole.c +=================================================================== +Index: bglio/drivers/net/netconsole.c +=================================================================== +--- bglio.orig/drivers/net/netconsole.c 2004-05-07 15:50:22.000000000 -0700 ++++ bglio/drivers/net/netconsole.c 2004-05-07 17:15:28.000000000 -0700 +@@ -12,6 +12,8 @@ + * + * 2001-09-17 started by Ingo Molnar. + * 2002-03-14 simultaneous syslog packet option by Michael K. Johnson ++ * 2003-10-30 Add sysrq command processing by Wangdi ++ * + */ + + /**************************************************************** +@@ -51,6 +53,7 @@ + #include + #include + #include ++#include "netconsole.h" + + static struct net_device *netconsole_dev; + static u16 source_port, netdump_target_port, netlog_target_port, syslog_target_port; +@@ -62,12 +65,11 @@ + static unsigned int mhz = 500, idle_timeout; + static unsigned long long mhz_cycles, jiffy_cycles; + +-#include "netconsole.h" + + #define MAX_UDP_CHUNK 1460 + #define MAX_PRINT_CHUNK (MAX_UDP_CHUNK-HEADER_LEN) + +-#define DEBUG 0 ++#define DEBUG 0 + #if DEBUG + # define Dprintk(x...) printk(KERN_INFO x) + #else +@@ -187,6 +189,22 @@ + } + } + } ++void (*irqfunc)(int, void *, struct pt_regs *); ++ ++static void netdump_poll(struct net_device *dev) ++{ ++ int budget = 1; ++ ++ disable_irq(dev->irq); ++ ++ irqfunc(dev->irq, dev, 0); ++ ++ if(dev->poll && test_bit(__LINK_STATE_RX_SCHED, &dev->state)) ++ dev->poll(dev, &budget); ++ ++ enable_irq(dev->irq); ++ ++} + + static struct sk_buff * alloc_netconsole_skb(struct net_device *dev, int len, int reserve) + { +@@ -209,7 +227,7 @@ + once = 0; + } + Dprintk("alloc skb: polling controller ...\n"); +- dev->poll_controller(dev); ++ netdump_poll(dev); + goto repeat; + } + } +@@ -231,7 +249,7 @@ + spin_unlock(&dev->xmit_lock); + + Dprintk("xmit skb: polling controller ...\n"); +- dev->poll_controller(dev); ++ netdump_poll(dev); + zap_completion_queue(); + goto repeat_poll; + } +@@ -426,18 +444,79 @@ + static spinlock_t sequence_lock = SPIN_LOCK_UNLOCKED; + static unsigned int log_offset; + ++static int thread_stopped = 0; ++/*Interrupt function for netdump */ ++static int sysrq_mode = 0; ++static int stop_sysrq_thread = 0; ++#define Set_Sysrq_mode() (sysrq_mode = 1) ++#define Clear_Sysrq_mode() (sysrq_mode = 0) ++static char send_cache[MAX_PRINT_CHUNK]; ++static unsigned int send_cache_pos = 0; ++wait_queue_head_t sysrq_thread_queue; ++wait_queue_head_t sysrq_thread_waiter_queue; ++ ++#define SEND_MSG_BUFFER(buf, len) \ ++do \ ++{ \ ++ reply_t reply; \ ++ unsigned int flags; \ ++ __save_flags(flags); \ ++ __cli(); \ ++ reply.code = REPLY_LOG; \ ++ reply.nr = 0; \ ++ reply.info = 0; \ ++ spin_lock(&sequence_lock); \ ++ send_netlog_skb(dev, buf, len, &reply); \ ++ spin_unlock(&sequence_lock); \ ++ __restore_flags(flags); \ ++}while(0); ++ ++void netconsole_do_sysrq(req_t *req) ++{ ++ struct 
pt_regs regs; ++ struct net_device *dev = netconsole_dev; ++ ++ if (!dev) ++ return; ++ Set_Sysrq_mode(); ++ get_current_regs(®s); ++ handle_sysrq((int)req->from, ®s, NULL); ++ ++ if (send_cache_pos != 0){ ++ SEND_MSG_BUFFER(send_cache, send_cache_pos); ++ memset(send_cache, 0, MAX_PRINT_CHUNK); ++ send_cache_pos = 0; ++ } ++ ++ Clear_Sysrq_mode(); ++} + static void write_netconsole_msg(struct console *con, const char *msg0, unsigned int msg_len) + { + int len, left, i; + struct net_device *dev; + const char *msg = msg0; + reply_t reply; +- ++ + dev = netconsole_dev; + if (!dev || netdump_mode) + return; +- +- if (dev->poll_controller && netif_running(dev)) { ++ if (sysrq_mode){ ++ unsigned long total_len = send_cache_pos + msg_len; ++ unsigned long left_len = msg_len; ++ while (total_len >= MAX_PRINT_CHUNK){ ++ unsigned long send_len = MAX_PRINT_CHUNK - send_cache_pos; ++ memcpy(send_cache + send_cache_pos, msg, send_len); ++ SEND_MSG_BUFFER(send_cache, MAX_PRINT_CHUNK); ++ send_cache_pos = 0; ++ total_len -= MAX_PRINT_CHUNK; ++ left_len -= send_len; ++ } ++ if (left_len > 0){ ++ memcpy(send_cache + send_cache_pos, msg + (msg_len -left_len), left_len); ++ send_cache_pos += left_len; ++ } ++ return; ++ }else if (netif_running(dev)) { + unsigned long flags; + + __save_flags(flags); +@@ -567,8 +646,6 @@ + req_t *req; + struct net_device *dev; + +- if (!netdump_mode) +- return NET_RX_SUCCESS; + #if DEBUG + { + static int packet_count; +@@ -722,8 +799,16 @@ + Dprintk("... netdump from: %08x.\n", req->from); + Dprintk("... netdump to: %08x.\n", req->to); + +- add_new_req(req); ++ if (netdump_mode) ++ add_new_req(req); ++ else if (req->command == COMM_SYSRQ){ ++ add_new_req(req); ++ wake_up(&sysrq_thread_queue); ++ return NET_RX_DROP; ++ } + out: ++ if (!netdump_mode) ++ return NET_RX_SUCCESS; + return NET_RX_DROP; + } + +@@ -763,6 +848,7 @@ + kunmap_atomic(kaddr, KM_NETDUMP); + } + ++ + /* + * This function waits for the client to acknowledge the receipt + * of the netdump startup reply, with the possibility of packets +@@ -792,7 +878,7 @@ + // wait 1 sec. 
+ udelay(100); + Dprintk("handshake: polling controller ...\n"); +- dev->poll_controller(dev); ++ netdump_poll(dev); + zap_completion_queue(); + req = get_new_req(); + if (req) +@@ -884,6 +970,7 @@ + */ + spin_lock_init(&dev->xmit_lock); + ++#ifdef __i386__ + esp = (unsigned long) ((char *)regs + sizeof (struct pt_regs)); + ss = __KERNEL_DS; + if (regs->xcs & 3) { +@@ -893,6 +980,7 @@ + myregs = *regs; + myregs.esp = esp; + myregs.xss = (myregs.xss & 0xffff0000) | ss; ++#endif + + rdtscll(t0); + +@@ -904,7 +992,7 @@ + while (netdump_mode) { + __cli(); + Dprintk("main netdump loop: polling controller ...\n"); +- dev->poll_controller(dev); ++ netdump_poll(dev); + zap_completion_queue(); + #if !CLI + __sti(); +@@ -1009,6 +1097,32 @@ + printk("NETDUMP END!\n"); + __restore_flags(flags); + } ++static int netconsole_sysrq_schedule(void *arg) ++{ ++ struct task_struct *tsk = current; ++ ++ sprintf(tsk->comm, "sysrq_schedule"); ++ sigfillset(&tsk->blocked); ++ ++ /* main loop */ ++ thread_stopped = 0; ++ for (;;) { ++ wait_event_interruptible(sysrq_thread_queue, ++ !list_empty(&request_list) || stop_sysrq_thread); ++ while (!list_empty(&request_list)) { ++ req_t *req = get_new_req(); ++ if (req->command == COMM_SYSRQ) ++ netconsole_do_sysrq(req); ++ } ++ if (stop_sysrq_thread) ++ break; ++ wake_up(&sysrq_thread_waiter_queue); ++ } ++ thread_stopped = 1; ++ wake_up(&sysrq_thread_waiter_queue); ++ return 0; ++} ++ + + static char *dev; + static int netdump_target_eth_byte0 = 255; +@@ -1087,11 +1201,12 @@ + + static struct console netconsole = + { flags: CON_ENABLED, write: write_netconsole_msg }; +- + static int init_netconsole(void) + { + struct net_device *ndev = NULL; + struct in_device *in_dev; ++ struct irqaction *action; ++ int rc = 0; + + printk(KERN_INFO "netlog: using network device <%s>\n", dev); + // this will be valid once the device goes up. 
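Aside: the ordering being wired up here matters. netconsole_rx_hook() runs in interrupt context, so COMM_SYSRQ requests are only queued and the wakeup hands them to the netconsole_sysrq_schedule kernel thread, which may sleep and call handle_sysrq() safely. The handoff pattern, reduced to a sketch (the driver's request-list locking is omitted and identifiers are illustrative):

    #include <linux/sched.h>
    #include <linux/list.h>
    #include <linux/wait.h>

    static LIST_HEAD(demo_reqs);
    static DECLARE_WAIT_QUEUE_HEAD(demo_waitq);
    static int demo_stop;

    static void demo_rx(struct list_head *req)   /* interrupt context */
    {
            list_add_tail(req, &demo_reqs);
            wake_up(&demo_waitq);                /* kick the thread */
    }

    static int demo_thread(void *arg)            /* kernel_thread() body */
    {
            for (;;) {
                    wait_event_interruptible(demo_waitq,
                                    !list_empty(&demo_reqs) || demo_stop);
                    while (!list_empty(&demo_reqs)) {
                            struct list_head *req = demo_reqs.next;

                            list_del(req);
                            /* process req: process context, free to sleep */
                    }
                    if (demo_stop)
                            break;
            }
            return 0;
    }
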
+@@ -1101,10 +1216,6 @@ + printk(KERN_ERR "netlog: network device %s does not exist, aborting.\n", dev); + return -1; + } +- if (!ndev->poll_controller) { +- printk(KERN_ERR "netlog: %s's network driver does not implement netlogging yet, aborting.\n", dev); +- return -1; +- } + in_dev = in_dev_get(ndev); + if (!in_dev) { + printk(KERN_ERR "netlog: network device %s is not an IP protocol device, aborting.\n", dev); +@@ -1137,8 +1248,6 @@ + if (!netdump_target_ip && !netlog_target_ip && !syslog_target_ip) { + printk(KERN_ERR "netlog: target_ip parameter not specified, aborting.\n"); + return -1; +- } +- if (netdump_target_ip) { + #define IP(x) ((unsigned char *)&netdump_target_ip)[x] + printk(KERN_INFO "netlog: using netdump target IP %u.%u.%u.%u\n", + IP(3), IP(2), IP(1), IP(0)); +@@ -1214,12 +1323,27 @@ + + mhz_cycles = (unsigned long long)mhz * 1000000ULL; + jiffy_cycles = (unsigned long long)mhz * (1000000/HZ); +- +- INIT_LIST_HEAD(&request_list); +- ++ + ndev->rx_hook = netconsole_rx_hook; + netdump_func = netconsole_netdump; + netconsole_dev = ndev; ++ /* find irq function of the ndev*/ ++ action=find_irq_action(ndev->irq, ndev); ++ if (!action) { ++ printk(KERN_ERR "couldn't find irq handler for <%s>", dev); ++ return -1; ++ } ++ irqfunc = action->handler; ++ ++ stop_sysrq_thread = 0; ++ INIT_LIST_HEAD(&request_list); ++ init_waitqueue_head(&sysrq_thread_queue); ++ init_waitqueue_head(&sysrq_thread_waiter_queue); ++ if ((rc = kernel_thread(netconsole_sysrq_schedule, NULL, 0)) < 0 ){ ++ printk(KERN_ERR "Can not start netconsole sysrq thread: rc %d\n", rc); ++ return -1; ++ } ++ + #define STARTUP_MSG "[...network console startup...]\n" + write_netconsole_msg(NULL, STARTUP_MSG, strlen(STARTUP_MSG)); + +@@ -1230,7 +1354,11 @@ + + static void cleanup_netconsole(void) + { +- printk(KERN_INFO "netlog: network logging shut down.\n"); ++ stop_sysrq_thread = 1; ++ ++ wake_up(&sysrq_thread_queue); ++ wait_event(sysrq_thread_waiter_queue, thread_stopped); ++ printk(KERN_INFO"netlog: network logging shut down.\n"); + unregister_console(&netconsole); + + #define SHUTDOWN_MSG "[...network console shutdown...]\n" +Index: bglio/drivers/net/netconsole.h +=================================================================== +--- bglio.orig/drivers/net/netconsole.h 2004-05-07 15:50:22.000000000 -0700 ++++ bglio/drivers/net/netconsole.h 2004-05-07 17:11:01.000000000 -0700 +@@ -29,7 +29,7 @@ + * + ****************************************************************/ + +-#define NETCONSOLE_VERSION 0x04 ++#define NETCONSOLE_VERSION 0x03 + + enum netdump_commands { + COMM_NONE = 0, +@@ -42,6 +42,8 @@ + COMM_START_NETDUMP_ACK = 7, + COMM_GET_REGS = 8, + COMM_SHOW_STATE = 9, ++ COMM_START_WRITE_NETDUMP_ACK = 10, ++ COMM_SYSRQ = 11, + }; + + #define NETDUMP_REQ_SIZE (8+4*4) +@@ -69,6 +71,7 @@ + REPLY_REGS = 10, + REPLY_MAGIC = 11, + REPLY_SHOW_STATE = 12, ++ REPLY_SYSRQ = 13, + }; + + typedef struct netdump_reply_s { +@@ -78,4 +81,24 @@ + } reply_t; + + #define HEADER_LEN (1 + sizeof(reply_t)) +- ++/* for netconsole */ ++static inline void get_current_regs(struct pt_regs *regs) ++{ ++#ifdef __i386__ ++ __asm__ __volatile__("movl %%ebx,%0" : "=m"(regs->ebx)); ++ __asm__ __volatile__("movl %%ecx,%0" : "=m"(regs->ecx)); ++ __asm__ __volatile__("movl %%edx,%0" : "=m"(regs->edx)); ++ __asm__ __volatile__("movl %%esi,%0" : "=m"(regs->esi)); ++ __asm__ __volatile__("movl %%edi,%0" : "=m"(regs->edi)); ++ __asm__ __volatile__("movl %%ebp,%0" : "=m"(regs->ebp)); ++ __asm__ __volatile__("movl %%eax,%0" : "=m"(regs->eax)); ++ 
__asm__ __volatile__("movl %%esp,%0" : "=m"(regs->esp)); ++ __asm__ __volatile__("movw %%ss, %%ax;" :"=a"(regs->xss)); ++ __asm__ __volatile__("movw %%cs, %%ax;" :"=a"(regs->xcs)); ++ __asm__ __volatile__("movw %%ds, %%ax;" :"=a"(regs->xds)); ++ __asm__ __volatile__("movw %%es, %%ax;" :"=a"(regs->xes)); ++ __asm__ __volatile__("pushfl; popl %0" :"=m"(regs->eflags)); ++ regs->eip = (unsigned long)current_text_addr(); ++#endif ++} ++ +Index: bglio/arch/i386/kernel/irq.c +=================================================================== +--- bglio.orig/arch/i386/kernel/irq.c 2004-05-07 15:50:17.000000000 -0700 ++++ bglio/arch/i386/kernel/irq.c 2004-05-07 17:11:01.000000000 -0700 +@@ -182,7 +182,20 @@ + + return 0; + } ++struct irqaction *find_irq_action(unsigned int irq, void *dev_id) ++{ ++ struct irqaction *a, *r=0; + ++ spin_lock_irq(&irq_desc[irq].lock); ++ for(a=irq_desc[irq].action; a; a=a->next) { ++ if(a->dev_id == dev_id) { ++ r=a; ++ break; ++ } ++ } ++ spin_unlock_irq(&irq_desc[irq].lock); ++ return r; ++} + + /* + * Global interrupt locks for SMP. Allow interrupts to come in on any +Index: bglio/arch/i386/kernel/i386_ksyms.c +=================================================================== +--- bglio.orig/arch/i386/kernel/i386_ksyms.c 2004-05-07 15:50:22.000000000 -0700 ++++ bglio/arch/i386/kernel/i386_ksyms.c 2004-05-07 17:11:01.000000000 -0700 +@@ -66,6 +66,7 @@ + EXPORT_SYMBOL(iounmap); + EXPORT_SYMBOL(enable_irq); + EXPORT_SYMBOL(disable_irq); ++EXPORT_SYMBOL(find_irq_action); + EXPORT_SYMBOL(disable_irq_nosync); + EXPORT_SYMBOL(probe_irq_mask); + EXPORT_SYMBOL(kernel_thread); +@@ -186,7 +187,6 @@ + EXPORT_SYMBOL(edd); + EXPORT_SYMBOL(eddnr); + #endif +- + EXPORT_SYMBOL_GPL(show_mem); + EXPORT_SYMBOL_GPL(show_state); + EXPORT_SYMBOL_GPL(show_regs); +Index: bglio/net/core/dev.c +=================================================================== +--- bglio.orig/net/core/dev.c 2004-05-07 15:50:22.000000000 -0700 ++++ bglio/net/core/dev.c 2004-05-07 17:11:01.000000000 -0700 +@@ -1476,6 +1476,16 @@ + + skb_bond(skb); + ++ if (unlikely(skb->dev->rx_hook != NULL)) { ++ int ret; ++ ++ ret = skb->dev->rx_hook(skb); ++ if (ret == NET_RX_DROP){ ++ kfree_skb(skb); ++ return ret; ++ } ++ } ++ + netdev_rx_stat[smp_processor_id()].total++; + + #ifdef CONFIG_NET_FASTROUTE +Index: bglio/include/asm-i386/irq.h +=================================================================== +--- bglio.orig/include/asm-i386/irq.h 2004-05-07 15:25:28.000000000 -0700 ++++ bglio/include/asm-i386/irq.h 2004-05-07 17:11:01.000000000 -0700 +@@ -38,7 +38,7 @@ + extern void disable_irq_nosync(unsigned int); + extern void enable_irq(unsigned int); + extern void release_x86_irqs(struct task_struct *); +- ++extern struct irqaction *find_irq_action(unsigned int irq, void *dev_id); + #ifdef CONFIG_X86_LOCAL_APIC + #define ARCH_HAS_NMI_WATCHDOG /* See include/linux/nmi.h */ + #endif +Index: bglio/kernel/panic.c +=================================================================== +--- bglio.orig/kernel/panic.c 2004-05-07 15:50:22.000000000 -0700 ++++ bglio/kernel/panic.c 2004-05-07 17:11:01.000000000 -0700 +@@ -66,8 +66,6 @@ + vsprintf(buf, fmt, args); + va_end(args); + printk(KERN_EMERG "Kernel panic: %s\n",buf); +- if (netdump_func) +- BUG(); + if (in_interrupt()) + printk(KERN_EMERG "In interrupt handler - not syncing\n"); + else if (!current->pid) diff --git a/lustre/kernel_patches/patches/nfs_export_kernel-2.4.19-bgl.patch b/lustre/kernel_patches/patches/nfs_export_kernel-2.4.19-bgl.patch new file mode 100644 
index 0000000..983da60 --- /dev/null +++ b/lustre/kernel_patches/patches/nfs_export_kernel-2.4.19-bgl.patch @@ -0,0 +1,741 @@ + fs/Makefile | 3 + fs/file_table.c | 11 ++ + fs/inode.c | 23 ++++- + fs/namei.c | 12 ++ + fs/nfsd/export.c | 5 + + fs/nfsd/nfsfh.c | 65 +++++++++++++- + fs/nfsd/vfs.c | 240 ++++++++++++++++++++++++++++++++++++++++++++++++----- + include/linux/fs.h | 10 ++ + kernel/ksyms.c | 2 + 9 files changed, 337 insertions(+), 34 deletions(-) + +Index: linux-bgl/fs/nfsd/vfs.c +=================================================================== +--- linux-bgl.orig/fs/nfsd/vfs.c 2003-07-02 08:44:33.000000000 -0700 ++++ linux-bgl/fs/nfsd/vfs.c 2004-12-28 17:13:59.940919832 -0800 +@@ -77,6 +77,128 @@ + static struct raparms * raparml; + static struct raparms * raparm_cache; + ++static int link_raw(struct dentry *dold, struct dentry *ddir, ++ struct dentry *dnew) ++{ ++ int err; ++ ++ struct nameidata old_nd = { .dentry = dold }; ++ struct nameidata nd = { .dentry = ddir, .last = dnew->d_name }; ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ err = op->link_raw(&old_nd, &nd); ++ d_instantiate(dnew, dold->d_inode); ++ if(dold->d_inode->i_op && dold->d_inode->i_op->revalidate_it) ++ dold->d_inode->i_op->revalidate_it(dnew, NULL); ++ ++ return err; ++} ++ ++static int unlink_raw(struct dentry *dentry, char *fname, int flen, ++ struct dentry *rdentry) ++{ ++ int err; ++ struct qstr last = { .name = fname, .len = flen }; ++ struct nameidata nd = { .dentry = dentry, .last = last }; ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ err = op->unlink_raw(&nd); ++ if (!err) ++ d_delete(rdentry); ++ ++ return err; ++} ++ ++static int rmdir_raw(struct dentry *dentry, char *fname, int flen, ++ struct dentry *rdentry) ++{ ++ int err; ++ struct qstr last = { .name = fname, .len = flen }; ++ struct nameidata nd = { .dentry = dentry, .last = last }; ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ err = op->rmdir_raw(&nd); ++ if(!err) { ++ rdentry->d_inode->i_flags |= S_DEAD; ++ d_delete(rdentry); ++ } ++ ++ return err; ++} ++ ++static int symlink_raw(struct dentry *dentry, char *fname, int flen, ++ char *path) ++{ ++ int err; ++ struct qstr last = { .name = fname, .len = flen }; ++ struct nameidata nd = { .dentry = dentry, .last = last }; ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ err = op->symlink_raw(&nd, path); ++ ++ return err; ++} ++ ++static int mkdir_raw(struct dentry *dentry, char *fname, int flen, int mode) ++{ ++ int err; ++ struct qstr last = { .name = fname, .len = flen }; ++ struct nameidata nd = { .dentry = dentry, .last = last }; ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ err = op->mkdir_raw(&nd, mode); ++ ++ return err; ++} ++ ++static int mknod_raw(struct dentry *dentry, char *fname, int flen, int mode, ++ dev_t dev) ++{ ++ int err; ++ struct qstr last = { .name = fname, .len = flen }; ++ struct nameidata nd = { .dentry = dentry, .last = last }; ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ err = op->mknod_raw(&nd, mode, dev); ++ ++ return err; ++} ++ ++static int rename_raw(struct dentry *fdentry, struct dentry *tdentry, ++ struct dentry *odentry, struct dentry *ndentry) ++{ ++ int err; ++ ++ struct nameidata old_nd = { .dentry = fdentry, .last = odentry->d_name}; ++ struct nameidata new_nd = { .dentry = tdentry, .last = ndentry->d_name}; ++ struct inode_operations *op = old_nd.dentry->d_inode->i_op; ++ err = op->rename_raw(&old_nd, &new_nd); ++ d_move(odentry, ndentry); ++ ++ return err; 
++} ++ ++static int setattr_raw(struct inode *inode, struct iattr *iap) ++{ ++ int err; ++ ++ iap->ia_valid |= ATTR_RAW; ++ err = inode->i_op->setattr_raw(inode, iap); ++ ++ return err; ++} ++ ++int revalidate_it(struct dentry *dentry, struct lookup_intent *it) ++{ ++ int err = 0; ++ ++ if (dentry && dentry->d_op && dentry->d_op->d_revalidate_it) { ++ if (!dentry->d_op->d_revalidate_it(dentry, 0, it) && ++ !d_invalidate(dentry)) { ++ dput(dentry); ++ err = -EINVAL; ++ dentry = NULL; ++ return err; ++ } ++ } ++ ++ return err; ++} ++ + /* + * Look up one component of a pathname. + * N.B. After this call _both_ fhp and resfh need an fh_put +@@ -304,7 +426,10 @@ + } + err = nfserr_notsync; + if (!check_guard || guardtime == inode->i_ctime) { +- err = notify_change(dentry, iap); ++ if ( dentry->d_inode->i_op && dentry->d_inode->i_op->setattr_raw) ++ err = setattr_raw(dentry->d_inode, iap); ++ else ++ err = notify_change(dentry, iap); + err = nfserrno(err); + } + if (size_change) { +@@ -431,6 +556,7 @@ + { + struct dentry *dentry; + struct inode *inode; ++ struct lookup_intent it; + int err; + + /* If we get here, then the client has already done an "open", and (hopefully) +@@ -477,6 +603,14 @@ + filp->f_mode = FMODE_READ; + } + ++ intent_init(&it, IT_OPEN, (filp->f_flags & ~O_ACCMODE) | filp->f_mode); ++ ++ err = revalidate_it(dentry, &it); ++ if (err) ++ goto out_nfserr; ++ ++ filp->f_it = ⁢ ++ + err = 0; + if (filp->f_op && filp->f_op->open) { + err = filp->f_op->open(inode, filp); +@@ -491,7 +625,11 @@ + atomic_dec(&filp->f_count); + } + } ++ + out_nfserr: ++ if (it.it_op_release) ++ intent_release(&it); ++ + if (err) + err = nfserrno(err); + out: +@@ -822,7 +960,7 @@ + { + struct dentry *dentry, *dchild; + struct inode *dirp; +- int err; ++ int err, error = -EOPNOTSUPP; + + err = nfserr_perm; + if (!flen) +@@ -838,20 +976,44 @@ + dentry = fhp->fh_dentry; + dirp = dentry->d_inode; + ++ switch (type) { ++ case S_IFDIR: ++ if (dirp->i_op->mkdir_raw) ++ error = mkdir_raw(dentry, fname, flen, iap->ia_mode); ++ break; ++ case S_IFCHR: ++ case S_IFBLK: ++ case S_IFIFO: ++ case S_IFSOCK: ++ case S_IFREG: ++ if (dirp->i_op->mknod_raw) { ++ if (type == S_IFREG) ++ rdev = 0; ++ error = mknod_raw(dentry, fname, flen, iap->ia_mode, rdev); ++ } ++ break; ++ default: ++ printk("nfsd: bad file type %o in nfsd_create\n", type); ++ } ++ + err = nfserr_notdir; +- if(!dirp->i_op || !dirp->i_op->lookup) ++ if(!dirp->i_op || !(dirp->i_op->lookup || dirp->i_op->lookup_it)) + goto out; + /* + * Check whether the response file handle has been verified yet. + * If it has, the parent directory should already be locked. + */ +- if (!resfhp->fh_dentry) { +- /* called from nfsd_proc_mkdir, or possibly nfsd3_proc_create */ +- fh_lock(fhp); ++ if (!resfhp->fh_dentry || dirp->i_op->lookup_it) { ++ /* called from nfsd_proc_mkdir, or possibly nfsd3_proc_create ++ and nfsd_proc_create in case of lustre ++ */ ++ if (!resfhp->fh_dentry) ++ fh_lock(fhp); + dchild = lookup_one_len(fname, dentry, flen); + err = PTR_ERR(dchild); + if (IS_ERR(dchild)) + goto out_nfserr; ++ resfhp->fh_dentry = NULL; + err = fh_compose(resfhp, fhp->fh_export, dchild, fhp); + if (err) + goto out; +@@ -872,10 +1034,12 @@ + * Make sure the child dentry is still negative ... 
+ */ + err = nfserr_exist; +- if (dchild->d_inode) { +- dprintk("nfsd_create: dentry %s/%s not negative!\n", +- dentry->d_name.name, dchild->d_name.name); +- goto out; ++ if ( error == -EOPNOTSUPP) { ++ if (dchild->d_inode) { ++ dprintk("nfsd_create: dentry %s/%s not negative!\n", ++ dentry->d_name.name, dchild->d_name.name); ++ goto out; ++ } + } + + if (!(iap->ia_valid & ATTR_MODE)) +@@ -888,16 +1052,19 @@ + err = nfserr_perm; + switch (type) { + case S_IFREG: +- err = vfs_create(dirp, dchild, iap->ia_mode); ++ if (error == -EOPNOTSUPP) ++ err = vfs_create(dirp, dchild, iap->ia_mode); + break; + case S_IFDIR: +- err = vfs_mkdir(dirp, dchild, iap->ia_mode); ++ if (error == -EOPNOTSUPP) ++ err = vfs_mkdir(dirp, dchild, iap->ia_mode); + break; + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: +- err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev); ++ if (error == -EOPNOTSUPP) ++ err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev); + break; + default: + printk("nfsd: bad file type %o in nfsd_create\n", type); +@@ -966,7 +1133,13 @@ + /* Get all the sanity checks out of the way before + * we lock the parent. */ + err = nfserr_notdir; +- if(!dirp->i_op || !dirp->i_op->lookup) ++ if (dirp->i_op->mknod_raw) { ++ err = mknod_raw(dentry, fname, flen, iap->ia_mode, 0); ++ if (err && err != -EOPNOTSUPP) ++ goto out; ++ } ++ ++ if(!dirp->i_op || !(dirp->i_op->lookup || dirp->i_op->lookup_it)) + goto out; + fh_lock(fhp); + +@@ -1017,6 +1190,8 @@ + case NFS3_CREATE_GUARDED: + err = nfserr_exist; + } ++ if(dirp->i_op->mknod_raw) ++ err = 0; + goto out; + } + +@@ -1123,7 +1298,7 @@ + struct iattr *iap) + { + struct dentry *dentry, *dnew; +- int err, cerr; ++ int err, cerr, error = -EOPNOTSUPP; + + err = nfserr_noent; + if (!flen || !plen) +@@ -1137,12 +1312,18 @@ + goto out; + fh_lock(fhp); + dentry = fhp->fh_dentry; ++ ++ if (dentry->d_inode->i_op->symlink_raw) ++ error = symlink_raw(dentry, fname, flen, path); ++ + dnew = lookup_one_len(fname, dentry, flen); + err = PTR_ERR(dnew); + if (IS_ERR(dnew)) + goto out_nfserr; + +- err = vfs_symlink(dentry->d_inode, dnew, path); ++ err = error; ++ if (err == -EOPNOTSUPP || !dentry->d_inode->i_op->symlink_raw) ++ err = vfs_symlink(dentry->d_inode, dnew, path); + if (!err) { + if (EX_ISSYNC(fhp->fh_export)) + nfsd_sync_dir(dentry); +@@ -1152,7 +1333,10 @@ + iap->ia_valid |= ATTR_CTIME; + iap->ia_mode = (iap->ia_mode&S_IALLUGO) + | S_IFLNK; +- err = notify_change(dnew, iap); ++ if (dnew->d_inode->i_op && dnew->d_inode->i_op->setattr_raw) ++ err = setattr_raw(dnew->d_inode, iap); ++ else ++ err = notify_change(dnew, iap); + if (!err && EX_ISSYNC(fhp->fh_export)) + write_inode_now(dentry->d_inode, 1); + } +@@ -1210,7 +1394,10 @@ + dold = tfhp->fh_dentry; + dest = dold->d_inode; + +- err = vfs_link(dold, dirp, dnew); ++ if (dirp->i_op->link_raw) ++ err = link_raw(dold, ddir, dnew); ++ else ++ err = vfs_link(dold, dirp, dnew); + if (!err) { + if (EX_ISSYNC(ffhp->fh_export)) { + nfsd_sync_dir(ddir); +@@ -1295,7 +1482,10 @@ + err = nfserr_perm; + } else + #endif +- err = vfs_rename(fdir, odentry, tdir, ndentry); ++ if(fdir->i_op->rename_raw) ++ err = rename_raw(fdentry, tdentry, odentry, ndentry); ++ else ++ err = vfs_rename(fdir, odentry, tdir, ndentry); + if (!err && EX_ISSYNC(tfhp->fh_export)) { + nfsd_sync_dir(tdentry); + nfsd_sync_dir(fdentry); +@@ -1316,7 +1506,7 @@ + fill_post_wcc(tfhp); + double_up(&tdir->i_sem, &fdir->i_sem); + ffhp->fh_locked = tfhp->fh_locked = 0; +- ++ + out: + return err; + } +@@ -1362,9 +1552,15 @@ + err = nfserr_perm; + } else 
+ #endif +- err = vfs_unlink(dirp, rdentry); ++ if (dirp->i_op->unlink_raw) ++ err = unlink_raw(dentry, fname, flen, rdentry); ++ else ++ err = vfs_unlink(dirp, rdentry); + } else { /* It's RMDIR */ +- err = vfs_rmdir(dirp, rdentry); ++ if (dirp->i_op->rmdir_raw) ++ err = rmdir_raw(dentry, fname, flen, rdentry); ++ else ++ err = vfs_rmdir(dirp, rdentry); + } + + dput(rdentry); +Index: linux-bgl/fs/nfsd/nfsfh.c +=================================================================== +--- linux-bgl.orig/fs/nfsd/nfsfh.c 2003-07-02 08:44:08.000000000 -0700 ++++ linux-bgl/fs/nfsd/nfsfh.c 2004-12-28 17:13:59.942919514 -0800 +@@ -36,6 +36,15 @@ + int sequence; /* sequence counter */ + }; + ++static struct dentry *lookup_it(struct inode *inode, struct dentry * dentry) ++{ ++ if (inode->i_op->lookup_it) ++ return inode->i_op->lookup_it(inode, dentry, NULL, 0); ++ else ++ return inode->i_op->lookup(inode, dentry); ++ ++} ++ + /* + * A rather strange filldir function to capture + * the name matching the specified inode number. +@@ -75,6 +84,8 @@ + int error; + struct file file; + struct nfsd_getdents_callback buffer; ++ struct lookup_intent it; ++ struct file *filp = NULL; + + error = -ENOTDIR; + if (!dir || !S_ISDIR(dir->i_mode)) +@@ -85,9 +96,37 @@ + /* + * Open the directory ... + */ +- error = init_private_file(&file, dentry, FMODE_READ); +- if (error) ++ if (dentry->d_op && dentry->d_op->d_revalidate_it) { ++ if ((dentry->d_flags & DCACHE_NFSD_DISCONNECTED) && ++ (dentry->d_parent == dentry) ) { ++ it.it_op_release = NULL; ++ /* ++ * XXX Temporary Hack: Simulating init_private_file without ++ * f_op->open for disconnected dentry Since we don't have actual ++ * dentry->d_name to revalidate in revalidate_it() ++ */ ++ filp = &file; ++ memset(filp, 0, sizeof(*filp)); ++ filp->f_mode = FMODE_READ; ++ atomic_set(&filp->f_count, 1); ++ filp->f_dentry = dentry; ++ filp->f_uid = current->fsuid; ++ filp->f_gid = current->fsgid; ++ filp->f_op = dentry->d_inode->i_fop; ++ error = 0; ++ } else { ++ intent_init(&it, IT_OPEN, 0); ++ error = revalidate_it(dentry, &it); ++ if (error) ++ goto out; ++ error = init_private_file_it(&file, dentry, FMODE_READ, &it); ++ } ++ } else { ++ error = init_private_file_it(&file, dentry, FMODE_READ, NULL); ++ } ++ if (error) + goto out; ++ + error = -EINVAL; + if (!file.f_op->readdir) + goto out_close; +@@ -113,9 +152,13 @@ + } + + out_close: +- if (file.f_op->release) ++ if (file.f_op->release && !filp) + file.f_op->release(dir, &file); + out: ++ if (dentry->d_op && ++ dentry->d_op->d_revalidate_it && ++ it.it_op_release && !filp) ++ intent_release(&it); + return error; + } + +@@ -273,7 +316,7 @@ + /* I'm going to assume that if the returned dentry is different, then + * it is well connected. But nobody returns different dentrys do they? + */ +- pdentry = child->d_inode->i_op->lookup(child->d_inode, tdentry); ++ pdentry = lookup_it(child->d_inode, tdentry); + d_drop(tdentry); /* we never want ".." hashed */ + if (!pdentry && tdentry->d_inode == NULL) { + /* File system cannot find ".." ... 
sad but possible */ +@@ -304,6 +347,8 @@ + igrab(tdentry->d_inode); + pdentry->d_flags |= DCACHE_NFSD_DISCONNECTED; + } ++ if (child->d_op && child->d_op->d_revalidate_it) ++ pdentry->d_op = child->d_op; + } + if (pdentry == NULL) + pdentry = ERR_PTR(-ENOMEM); +@@ -461,6 +506,8 @@ + struct dentry *pdentry; + struct inode *parent; + ++ if (result->d_op && result->d_op->d_revalidate_it) ++ dentry->d_op = result->d_op; + pdentry = nfsd_findparent(dentry); + err = PTR_ERR(pdentry); + if (IS_ERR(pdentry)) +@@ -648,6 +695,11 @@ + + inode = dentry->d_inode; + ++ /* cache coherency for non-device filesystems */ ++ if (inode->i_op && inode->i_op->revalidate_it) { ++ inode->i_op->revalidate_it(dentry, NULL); ++ } ++ + /* Type check. The correct error return for type mismatches + * does not seem to be generally agreed upon. SunOS seems to + * use EISDIR if file isn't S_IFREG; a comment in the NFSv3 +@@ -878,8 +930,9 @@ + dentry->d_parent->d_name.name, dentry->d_name.name); + goto out; + out_uptodate: +- printk(KERN_ERR "fh_update: %s/%s already up-to-date!\n", +- dentry->d_parent->d_name.name, dentry->d_name.name); ++ if(!dentry->d_parent->d_inode->i_op->mkdir_raw) ++ printk(KERN_ERR "fh_update: %s/%s already up-to-date!\n", ++ dentry->d_parent->d_name.name, dentry->d_name.name); + goto out; + } + +Index: linux-bgl/fs/Makefile +=================================================================== +--- linux-bgl.orig/fs/Makefile 2004-12-28 17:13:56.898868625 -0800 ++++ linux-bgl/fs/Makefile 2004-12-28 17:13:59.943919356 -0800 +@@ -7,7 +7,8 @@ + + O_TARGET := fs.o + +-export-objs := filesystems.o open.o dcache.o buffer.o inode.o ++export-objs := filesystems.o open.o dcache.o buffer.o inode.o namei.o \ ++ file_table.o + mod-subdirs := nls + + obj-y := open.o read_write.o devices.o file_table.o buffer.o \ +Index: linux-bgl/fs/namei.c +=================================================================== +--- linux-bgl.orig/fs/namei.c 2004-12-28 17:13:56.265835195 -0800 ++++ linux-bgl/fs/namei.c 2004-12-28 17:13:59.947918720 -0800 +@@ -22,6 +22,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -100,6 +101,7 @@ + it->it_op_release(it); + + } ++EXPORT_SYMBOL(intent_release); + + /* In order to reduce some races, while at the same time doing additional + * checking and hopefully speeding things up, we copy filenames to the +@@ -889,7 +891,8 @@ + + + /* SMP-safe */ +-struct dentry * lookup_one_len(const char * name, struct dentry * base, int len) ++struct dentry * lookup_one_len_it(const char * name, struct dentry * base, ++ int len, struct lookup_intent *it) + { + unsigned long hash; + struct qstr this; +@@ -909,11 +912,16 @@ + } + this.hash = end_name_hash(hash); + +- return lookup_hash_it(&this, base, NULL); ++ return lookup_hash_it(&this, base, it); + access: + return ERR_PTR(-EACCES); + } + ++struct dentry * lookup_one_len(const char * name, struct dentry * base, int len) ++{ ++ return lookup_one_len_it(name, base, len, NULL); ++} ++ + /* + * namei() + * +Index: linux-bgl/fs/file_table.c +=================================================================== +--- linux-bgl.orig/fs/file_table.c 2003-07-02 08:44:42.000000000 -0700 ++++ linux-bgl/fs/file_table.c 2004-12-28 17:13:59.948918562 -0800 +@@ -82,7 +82,8 @@ + * and call the open function (if any). The caller must verify that + * inode->i_fop is not NULL. 
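++ * (This patch turns init_private_file() into a wrapper around
++ * init_private_file_it(), which additionally takes a struct
++ * lookup_intent: when an intent is supplied it is stashed in
++ * filp->f_it before f_op->open runs, so an intent-aware filesystem
++ * can consume it there.)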
+ */ +-int init_private_file(struct file *filp, struct dentry *dentry, int mode) ++int init_private_file_it(struct file *filp, struct dentry *dentry, int mode, ++ struct lookup_intent *it) + { + memset(filp, 0, sizeof(*filp)); + filp->f_mode = mode; +@@ -90,12 +91,20 @@ + filp->f_dentry = dentry; + filp->f_uid = current->fsuid; + filp->f_gid = current->fsgid; ++ if (it) ++ filp->f_it = it; + filp->f_op = dentry->d_inode->i_fop; + if (filp->f_op->open) + return filp->f_op->open(dentry->d_inode, filp); + else + return 0; + } ++EXPORT_SYMBOL(init_private_file_it); ++ ++int init_private_file(struct file *filp, struct dentry *dentry, int mode) ++{ ++ return init_private_file_it(filp, dentry, mode, NULL); ++} + + void fput(struct file * file) + { +Index: linux-bgl/fs/inode.c +=================================================================== +--- linux-bgl.orig/fs/inode.c 2004-12-28 17:13:56.635910389 -0800 ++++ linux-bgl/fs/inode.c 2004-12-28 17:13:59.950918244 -0800 +@@ -971,9 +971,10 @@ + } + + +-struct inode *iget4(struct super_block *sb, unsigned long ino, find_inode_t find_actor, void *opaque) ++static inline struct inode *ifind(struct super_block *sb, unsigned long ino, ++ struct list_head *head, ++ find_inode_t find_actor, void *opaque) + { +- struct list_head * head = inode_hashtable + hash(sb,ino); + struct inode * inode; + + spin_lock(&inode_lock); +@@ -986,6 +987,24 @@ + } + spin_unlock(&inode_lock); + ++ return NULL; ++} ++ ++struct inode *ilookup4(struct super_block *sb, unsigned long ino, ++ find_inode_t find_actor, void *opaque) ++{ ++ struct list_head * head = inode_hashtable + hash(sb,ino); ++ return ifind(sb, ino, head, find_actor, opaque); ++} ++ ++struct inode *iget4(struct super_block *sb, unsigned long ino, ++ find_inode_t find_actor, void *opaque) ++{ ++ struct list_head * head = inode_hashtable + hash(sb,ino); ++ struct inode *inode = ifind(sb, ino, head, find_actor, opaque); ++ if (inode) ++ return inode; ++ + /* + * get_new_inode() will do the right thing, re-trying the search + * in case it had to block at any point. +Index: linux-bgl/kernel/ksyms.c +=================================================================== +--- linux-bgl.orig/kernel/ksyms.c 2004-12-28 17:13:56.978855920 -0800 ++++ linux-bgl/kernel/ksyms.c 2004-12-28 17:13:59.951918085 -0800 +@@ -142,6 +142,7 @@ + EXPORT_SYMBOL(igrab); + EXPORT_SYMBOL(iunique); + EXPORT_SYMBOL(iget4); ++EXPORT_SYMBOL(ilookup4); + EXPORT_SYMBOL(iput); + EXPORT_SYMBOL(force_delete); + EXPORT_SYMBOL(follow_up); +@@ -152,6 +153,7 @@ + EXPORT_SYMBOL(path_release); + EXPORT_SYMBOL(__user_walk); + EXPORT_SYMBOL(lookup_one_len); ++EXPORT_SYMBOL(lookup_one_len_it); + EXPORT_SYMBOL(lookup_hash); + EXPORT_SYMBOL(sys_close); + EXPORT_SYMBOL(dcache_lock); +Index: linux-bgl/include/linux/fs.h +=================================================================== +--- linux-bgl.orig/include/linux/fs.h 2004-12-28 17:13:59.471860200 -0800 ++++ linux-bgl/include/linux/fs.h 2004-12-28 17:13:59.955917450 -0800 +@@ -93,6 +93,9 @@ + #define FS_SINGLE 8 /* Filesystem that can have only one superblock */ + #define FS_NOMOUNT 16 /* Never mount from userland */ + #define FS_LITTER 32 /* Keeps the tree in dcache */ ++#define FS_NFSEXP_FSID 64 /* Use file system specific fsid for ++ * exporting non device filesystems. 
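++ * That is, the export is identified by a filesystem-supplied
++ * fsid rather than by a block device number.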
++ */ + #define FS_ODD_RENAME 32768 /* Temporary stuff; will go away as soon + * as nfs_rename() will be cleaned up + */ +@@ -1149,6 +1152,9 @@ + struct nameidata *nd, struct lookup_intent *it); + extern struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt, + int flags, struct lookup_intent *it); ++extern int revalidate_it(struct dentry *dentry, struct lookup_intent *it); ++extern int init_private_file_it(struct file *, struct dentry *dentry, int mode, ++ struct lookup_intent *it); + extern int filp_close(struct file *, fl_owner_t id); + extern char * getname(const char *); + +@@ -1418,6 +1424,8 @@ + extern int follow_down(struct vfsmount **, struct dentry **); + extern int follow_up(struct vfsmount **, struct dentry **); + extern struct dentry * lookup_one_len(const char *, struct dentry *, int); ++extern struct dentry * lookup_one_len_it(const char *, struct dentry *, int, ++ struct lookup_intent *); + extern struct dentry * lookup_hash(struct qstr *, struct dentry *); + #define user_path_walk(name,nd) __user_walk(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd) + #define user_path_walk_link(name,nd) __user_walk(name, LOOKUP_POSITIVE, nd) +@@ -1431,6 +1439,8 @@ + + typedef int (*find_inode_t)(struct inode *, unsigned long, void *); + extern struct inode * iget4(struct super_block *, unsigned long, find_inode_t, void *); ++extern struct inode * ilookup4(struct super_block *, unsigned long, ++ find_inode_t, void *); + static inline struct inode *iget(struct super_block *sb, unsigned long ino) + { + return iget4(sb, ino, NULL, NULL); diff --git a/lustre/kernel_patches/patches/resched-2.4.19-pre1.patch b/lustre/kernel_patches/patches/resched-2.4.19-pre1.patch new file mode 100644 index 0000000..567e1e8 --- /dev/null +++ b/lustre/kernel_patches/patches/resched-2.4.19-pre1.patch @@ -0,0 +1,16 @@ +Index: linux-2.4.19-pre1/include/linux/sched.h +=================================================================== +--- linux-2.4.19-pre1.orig/include/linux/sched.h 2003-11-21 04:05:05.000000000 +0300 ++++ linux-2.4.19-pre1/include/linux/sched.h 2003-11-21 04:10:29.000000000 +0300 +@@ -927,6 +927,11 @@ + return res; + } + ++static inline int need_resched(void) ++{ ++ return (unlikely(current->need_resched)); ++} ++ + #endif /* __KERNEL__ */ + + #endif diff --git a/lustre/kernel_patches/patches/socket-exports-2.4.19-bgl.patch b/lustre/kernel_patches/patches/socket-exports-2.4.19-bgl.patch new file mode 100644 index 0000000..e60f473 --- /dev/null +++ b/lustre/kernel_patches/patches/socket-exports-2.4.19-bgl.patch @@ -0,0 +1,46 @@ + include/linux/socket.h | 4 ++++ + net/netsyms.c | 2 ++ + net/socket.c | 2 +- + 3 files changed, 7 insertions(+), 1 deletion(-) + +Index: linux-DRV401/include/linux/socket.h +=================================================================== +--- linux-DRV401.orig/include/linux/socket.h 2004-10-15 10:26:20.000000000 -0700 ++++ linux-DRV401/include/linux/socket.h 2004-10-15 11:11:09.000000000 -0700 +@@ -260,6 +260,10 @@ + extern int move_addr_to_user(void *kaddr, int klen, void *uaddr, int *ulen); + extern int move_addr_to_kernel(void *uaddr, int ulen, void *kaddr); + extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data); ++struct socket; ++extern int sock_map_fd(struct socket *sock); ++extern struct socket *sockfd_lookup(int fd, int *err); ++ + #endif + #endif /* not kernel and not glibc */ + #endif /* _LINUX_SOCKET_H */ +Index: linux-DRV401/net/netsyms.c +=================================================================== +--- 
linux-DRV401.orig/net/netsyms.c	2004-10-15 11:10:52.000000000 -0700
++++ linux-DRV401/net/netsyms.c	2004-10-15 11:11:09.000000000 -0700
+@@ -159,6 +159,8 @@
+ EXPORT_SYMBOL(put_cmsg);
+ EXPORT_SYMBOL(sock_kmalloc);
+ EXPORT_SYMBOL(sock_kfree_s);
++EXPORT_SYMBOL(sockfd_lookup);
++EXPORT_SYMBOL(sock_map_fd);
+
+ #ifdef CONFIG_FILTER
+ EXPORT_SYMBOL(sk_run_filter);
+Index: linux-DRV401/net/socket.c
+===================================================================
+--- linux-DRV401.orig/net/socket.c	2004-10-15 10:24:16.000000000 -0700
++++ linux-DRV401/net/socket.c	2004-10-15 11:11:09.000000000 -0700
+@@ -326,7 +326,7 @@
+  * but we take care of internal coherence yet.
+  */
+
+-static int sock_map_fd(struct socket *sock)
++int sock_map_fd(struct socket *sock)
+ {
+ 	int fd;
+ 	struct qstr this;
diff --git a/lustre/kernel_patches/patches/tcp-zero-copy-2.4.19-pre1.patch b/lustre/kernel_patches/patches/tcp-zero-copy-2.4.19-pre1.patch
new file mode 100644
index 0000000..bcd3f73
--- /dev/null
+++ b/lustre/kernel_patches/patches/tcp-zero-copy-2.4.19-pre1.patch
@@ -0,0 +1,461 @@
+Index: linux-2.4.19-pre1/include/linux/skbuff.h
+===================================================================
+--- linux-2.4.19-pre1.orig/include/linux/skbuff.h	2001-11-22 22:46:26.000000000 +0300
++++ linux-2.4.19-pre1/include/linux/skbuff.h	2004-01-14 01:15:13.000000000 +0300
+@@ -116,6 +116,30 @@
+ 	__u16 size;
+ };
+
++/* Support for callback when skb data has been released */
++typedef struct zccd			/* Zero Copy Callback Descriptor */
++{					/* (embed as first member of custom struct) */
++	atomic_t	zccd_count;	/* reference count */
++	void		(*zccd_destructor)(struct zccd *); /* callback when refcount reaches zero */
++} zccd_t;
++
++static inline void zccd_init (zccd_t *d, void (*callback)(zccd_t *))
++{
++	atomic_set (&d->zccd_count, 1);
++	d->zccd_destructor = callback;
++}
++
++static inline void zccd_get (zccd_t *d)		/* take a reference */
++{
++	atomic_inc (&d->zccd_count);
++}
++
++static inline void zccd_put (zccd_t *d)		/* release a reference */
++{
++	if (atomic_dec_and_test (&d->zccd_count))
++		(d->zccd_destructor)(d);
++}
++
+ /* This data is invariant across clones and lives at
+  * the end of the header data, ie. at skb->end.
+  */
+@@ -123,6 +147,12 @@
+ 	atomic_t	dataref;
+ 	unsigned int	nr_frags;
+ 	struct sk_buff	*frag_list;
++	zccd_t		*zccd;			/* zero copy descriptor */
++	zccd_t		*zccd2;			/* 2nd zero copy descriptor */
++	/* NB we expect zero-copy data to be at least 1 packet, so
++	 * having 2 zccds means we don't unnecessarily split the packet
++	 * where consecutive zero-copy sends abut. 
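++	 * A sender embeds a zccd_t first in its own descriptor, initialises
++	 * it with zccd_init(&d, callback), and a reference is taken with
++	 * zccd_get() each time it is attached to an skb; zccd_put() drops a
++	 * reference and fires the callback once the count reaches zero,
++	 * i.e. when the last skb holding the zero-copy pages is released.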
++ */ + skb_frag_t frags[MAX_SKB_FRAGS]; + }; + +Index: linux-2.4.19-pre1/include/net/tcp.h +=================================================================== +--- linux-2.4.19-pre1.orig/include/net/tcp.h 2001-11-22 22:47:22.000000000 +0300 ++++ linux-2.4.19-pre1/include/net/tcp.h 2004-01-14 01:15:13.000000000 +0300 +@@ -640,6 +640,8 @@ + + extern int tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size); + extern ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags); ++extern ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size, ++ int flags, zccd_t *zccd); + + extern int tcp_ioctl(struct sock *sk, + int cmd, +@@ -733,6 +735,9 @@ + struct msghdr *msg, + int len, int nonblock, + int flags, int *addr_len); ++extern int tcp_recvpackets(struct sock *sk, ++ struct sk_buff_head *packets, ++ int len, int nonblock); + + extern int tcp_listen_start(struct sock *sk); + +Index: linux-2.4.19-pre1/net/netsyms.c +=================================================================== +--- linux-2.4.19-pre1.orig/net/netsyms.c 2004-01-14 01:10:37.000000000 +0300 ++++ linux-2.4.19-pre1/net/netsyms.c 2004-01-14 01:15:54.000000000 +0300 +@@ -409,6 +409,9 @@ + + #endif + ++EXPORT_SYMBOL(tcp_sendpage_zccd); ++EXPORT_SYMBOL(tcp_recvpackets); ++ + EXPORT_SYMBOL(netlink_set_err); + EXPORT_SYMBOL(netlink_broadcast); + EXPORT_SYMBOL(netlink_unicast); +Index: linux-2.4.19-pre1/net/core/skbuff.c +=================================================================== +--- linux-2.4.19-pre1.orig/net/core/skbuff.c 2001-12-21 20:42:05.000000000 +0300 ++++ linux-2.4.19-pre1/net/core/skbuff.c 2004-01-14 01:15:13.000000000 +0300 +@@ -208,6 +208,8 @@ + atomic_set(&(skb_shinfo(skb)->dataref), 1); + skb_shinfo(skb)->nr_frags = 0; + skb_shinfo(skb)->frag_list = NULL; ++ skb_shinfo(skb)->zccd = NULL; /* skbuffs kick off with NO user zero copy descriptors */ ++ skb_shinfo(skb)->zccd2 = NULL; + return skb; + + nodata: +@@ -276,6 +278,10 @@ + { + if (!skb->cloned || + atomic_dec_and_test(&(skb_shinfo(skb)->dataref))) { ++ if (skb_shinfo(skb)->zccd != NULL) /* zero copy callback descriptor? */ ++ zccd_put (skb_shinfo(skb)->zccd); /* release hold */ ++ if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd zero copy callback descriptor? */ ++ zccd_put (skb_shinfo(skb)->zccd2); /* release hold */ + if (skb_shinfo(skb)->nr_frags) { + int i; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) +@@ -532,6 +538,8 @@ + atomic_set(&(skb_shinfo(skb)->dataref), 1); + skb_shinfo(skb)->nr_frags = 0; + skb_shinfo(skb)->frag_list = NULL; ++ skb_shinfo(skb)->zccd = NULL; /* copied data => no user zero copy descriptor */ ++ skb_shinfo(skb)->zccd2 = NULL; + + /* We are no longer a clone, even if we were. */ + skb->cloned = 0; +@@ -578,6 +586,14 @@ + n->data_len = skb->data_len; + n->len = skb->len; + ++ if (skb_shinfo(skb)->zccd != NULL) /* user zero copy descriptor? */ ++ zccd_get (skb_shinfo(skb)->zccd); /* 1 more ref (pages are shared) */ ++ skb_shinfo(n)->zccd = skb_shinfo(skb)->zccd; ++ ++ if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd user zero copy descriptor? 
*/ ++ zccd_get (skb_shinfo(skb)->zccd2); /* 1 more ref (pages are shared) */ ++ skb_shinfo(n)->zccd2 = skb_shinfo(skb)->zccd2; ++ + if (skb_shinfo(skb)->nr_frags) { + int i; + +@@ -620,6 +636,8 @@ + u8 *data; + int size = nhead + (skb->end - skb->head) + ntail; + long off; ++ zccd_t *zccd = skb_shinfo(skb)->zccd; /* stash user zero copy descriptor */ ++ zccd_t *zccd2 = skb_shinfo(skb)->zccd2; /* stash 2nd user zero copy descriptor */ + + if (skb_shared(skb)) + BUG(); +@@ -641,6 +659,11 @@ + if (skb_shinfo(skb)->frag_list) + skb_clone_fraglist(skb); + ++ if (zccd != NULL) /* user zero copy descriptor? */ ++ zccd_get (zccd); /* extra ref (pages are shared) */ ++ if (zccd2 != NULL) /* 2nd user zero copy descriptor? */ ++ zccd_get (zccd2); /* extra ref (pages are shared) */ ++ + skb_release_data(skb); + + off = (data+nhead) - skb->head; +@@ -655,6 +678,8 @@ + skb->nh.raw += off; + skb->cloned = 0; + atomic_set(&skb_shinfo(skb)->dataref, 1); ++ skb_shinfo(skb)->zccd = zccd; ++ skb_shinfo(skb)->zccd2 = zccd2; + return 0; + + nodata: +Index: linux-2.4.19-pre1/net/ipv4/tcp.c +=================================================================== +--- linux-2.4.19-pre1.orig/net/ipv4/tcp.c 2001-12-21 20:42:05.000000000 +0300 ++++ linux-2.4.19-pre1/net/ipv4/tcp.c 2004-01-14 01:15:13.000000000 +0300 +@@ -744,7 +744,7 @@ + goto out; + } + +-ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags); ++ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags, zccd_t *zccd); + + static inline int + can_coalesce(struct sk_buff *skb, int i, struct page *page, int off) +@@ -823,7 +823,8 @@ + return err; + } + +-ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags) ++/* Extra parameter: user zero copy descriptor (or NULL if not doing that) */ ++ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags, zccd_t *zccd) + { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int mss_now; +@@ -871,6 +872,17 @@ + copy = size; + + i = skb_shinfo(skb)->nr_frags; ++ ++ if (zccd != NULL && /* this is a zcc I/O */ ++ skb_shinfo(skb)->zccd != NULL && /* skb is part of a zcc I/O */ ++ skb_shinfo(skb)->zccd2 != NULL && ++ skb_shinfo(skb)->zccd != zccd && /* not the same one */ ++ skb_shinfo(skb)->zccd2 != zccd) ++ { ++ tcp_mark_push (tp, skb); ++ goto new_segment; ++ } ++ + if (can_coalesce(skb, i, page, offset)) { + skb_shinfo(skb)->frags[i-1].size += copy; + } else if (i < MAX_SKB_FRAGS) { +@@ -881,6 +893,20 @@ + goto new_segment; + } + ++ if (zccd != NULL && /* this is a zcc I/O */ ++ skb_shinfo(skb)->zccd != zccd && /* not already referencing this zccd */ ++ skb_shinfo(skb)->zccd2 != zccd) ++ { ++ zccd_get (zccd); /* bump ref count */ ++ ++ BUG_TRAP (skb_shinfo(skb)->zccd2 == NULL); ++ ++ if (skb_shinfo(skb)->zccd == NULL) /* reference this zccd */ ++ skb_shinfo(skb)->zccd = zccd; ++ else ++ skb_shinfo(skb)->zccd2 = zccd; ++ } ++ + skb->len += copy; + skb->data_len += copy; + skb->ip_summed = CHECKSUM_HW; +@@ -944,7 +970,31 @@ + + lock_sock(sk); + TCP_CHECK_TIMER(sk); +- res = do_tcp_sendpages(sk, &page, offset, size, flags); ++ res = do_tcp_sendpages(sk, &page, offset, size, flags, NULL); ++ TCP_CHECK_TIMER(sk); ++ release_sock(sk); ++ return res; ++} ++ ++ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size, ++ int flags, zccd_t *zccd) ++{ ++ ssize_t res; ++ struct sock *sk = sock->sk; ++ ++#define 
TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM)
++
++	if (!(sk->route_caps & NETIF_F_SG) ||	/* caller shouldn't waste her time */
++	    !(sk->route_caps & TCP_ZC_CSUM_FLAGS)) /* on double mapping */
++		BUG ();
++
++#undef TCP_ZC_CSUM_FLAGS
++
++	lock_sock(sk);
++	TCP_CHECK_TIMER(sk);
++
++	res = do_tcp_sendpages(sk, &page, offset, size, flags, zccd);
++
+ 	TCP_CHECK_TIMER(sk);
+ 	release_sock(sk);
+ 	return res;
+ }
+@@ -1683,6 +1733,202 @@
+ 	goto out;
+ }
+
++int tcp_recvpackets (struct sock *sk, struct sk_buff_head *packets,
++		     int len, int nonblock)
++{
++	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
++	int copied;
++	long timeo;
++
++	BUG_TRAP (len > 0);
++	/*BUG_TRAP ((flags & (MSG_OOB | MSG_PEEK | MSG_TRUNC)) == 0);*/
++
++	lock_sock(sk);
++
++	TCP_CHECK_TIMER(sk);
++
++	copied = -ENOTCONN;
++	if (sk->state == TCP_LISTEN)
++		goto out;
++
++	copied = 0;
++	timeo = sock_rcvtimeo(sk, nonblock);
++
++	do {
++		struct sk_buff * skb;
++		u32 offset;
++		unsigned long used;
++		int exhausted;
++		int eaten;
++
++		/* Are we at urgent data? Stop if we have read anything. */
++		if (copied && tp->urg_data && tp->urg_seq == tp->copied_seq)
++			break;
++
++		/* We need to check signals first, to get correct SIGURG
++		 * handling.  FIXME: Need to check this doesn't impact 1003.1g
++		 * and move it down to the bottom of the loop
++		 */
++		if (signal_pending(current)) {
++			if (copied)
++				break;
++			copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
++			break;
++		}
++
++		/* Next get a buffer. */
++
++		skb = skb_peek(&sk->receive_queue);
++
++		if (skb == NULL)		/* nothing ready */
++		{
++			if (copied) {
++				if (sk->err ||
++				    sk->state == TCP_CLOSE ||
++				    (sk->shutdown & RCV_SHUTDOWN) ||
++				    !timeo ||
++				    (0))
++					break;
++			} else {
++				if (sk->done)
++					break;
++
++				if (sk->err) {
++					copied = sock_error(sk);
++					break;
++				}
++
++				if (sk->shutdown & RCV_SHUTDOWN)
++					break;
++
++				if (sk->state == TCP_CLOSE) {
++					if (!sk->done) {
++						/* This occurs when user tries to read
++						 * from never connected socket. 
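++						 * (The same check and comment
++						 * appear in tcp_recvmsg(), on
++						 * which this loop is modelled.)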
++						 */
++						copied = -ENOTCONN;
++						break;
++					}
++					break;
++				}
++
++				if (!timeo) {
++					copied = -EAGAIN;
++					break;
++				}
++			}
++
++			cleanup_rbuf(sk, copied);
++			timeo = tcp_data_wait(sk, timeo);
++			continue;
++		}
++
++		BUG_TRAP (atomic_read (&skb->users) == 1);
++
++		exhausted = eaten = 0;
++
++		offset = tp->copied_seq - TCP_SKB_CB(skb)->seq;
++		if (skb->h.th->syn)
++			offset--;
++
++		used = skb->len - offset;
++
++		if (tp->urg_data) {
++			u32 urg_offset = tp->urg_seq - tp->copied_seq;
++			if (urg_offset < used) {
++				if (!urg_offset) {		/* at urgent data */
++					if (!sk->urginline) {
++						tp->copied_seq++;	/* discard the single byte of urgent data */
++						offset++;
++						used--;
++					}
++				} else				/* truncate read */
++					used = urg_offset;
++			}
++		}
++
++		BUG_TRAP (used >= 0);
++		if (len < used)
++			used = len;
++
++		if (used == 0)
++			exhausted = 1;
++		else
++		{
++			if (skb_is_nonlinear (skb))
++			{
++				int rc = skb_linearize (skb, GFP_KERNEL);
++
++				printk ("tcp_recvpackets(): linearising: %d\n", rc);
++
++				if (rc)
++				{
++					if (!copied)
++						copied = rc;
++					break;
++				}
++			}
++
++			if ((offset + used) == skb->len)	/* consuming the whole packet */
++			{
++				__skb_unlink (skb, &sk->receive_queue);
++				dst_release (skb->dst);
++				skb_orphan (skb);
++				__skb_pull (skb, offset);
++				__skb_queue_tail (packets, skb);
++				exhausted = eaten = 1;
++			}
++			else			/* consuming only part of the packet */
++			{
++				struct sk_buff *skb2 = skb_clone (skb, GFP_KERNEL);
++
++				if (skb2 == NULL)
++				{
++					if (!copied)
++						copied = -ENOMEM;
++					break;
++				}
++
++				dst_release (skb2->dst);
++				__skb_pull (skb2, offset);
++				__skb_trim (skb2, used);
++				__skb_queue_tail (packets, skb2);
++			}
++
++			tp->copied_seq += used;
++			copied += used;
++			len -= used;
++		}
++
++		if (tp->urg_data && after(tp->copied_seq,tp->urg_seq)) {
++			tp->urg_data = 0;
++			tcp_fast_path_check(sk, tp);
++		}
++
++		if (!exhausted)
++			continue;
++
++		if (skb->h.th->fin)
++		{
++			tp->copied_seq++;
++			if (!eaten)
++				tcp_eat_skb (sk, skb);
++			break;
++		}
++
++		if (!eaten)
++			tcp_eat_skb (sk, skb);
++
++	} while (len > 0);
++
++ out:
++	/* Clean up data we have read: This will do ACK frames. */
++	cleanup_rbuf(sk, copied);
++	TCP_CHECK_TIMER(sk);
++	release_sock(sk);
++	return copied;
++}
++
+ /*
+  * State processing on a close. This implements the state shift for
+  * sending our FIN frame. 
Note that we only send a FIN for some diff --git a/lustre/kernel_patches/patches/vfs_intent-2.4.19-bgl.patch b/lustre/kernel_patches/patches/vfs_intent-2.4.19-bgl.patch new file mode 100644 index 0000000..eec0362 --- /dev/null +++ b/lustre/kernel_patches/patches/vfs_intent-2.4.19-bgl.patch @@ -0,0 +1,1849 @@ + fs/dcache.c | 19 ++ + fs/exec.c | 17 +- + fs/namei.c | 295 +++++++++++++++++++++++++++++++++++++++------- + fs/namespace.c | 28 +++- + fs/open.c | 172 +++++++++++++++++++------- + fs/stat.c | 52 +++++--- + include/linux/dcache.h | 60 +++++++++ + include/linux/fs.h | 32 ++++ + include/linux/fs_struct.h | 4 + kernel/exit.c | 3 + kernel/fork.c | 3 + kernel/ksyms.c | 1 + 12 files changed, 558 insertions(+), 128 deletions(-) + +Index: linux.mcp2/fs/dcache.c +=================================================================== +--- linux.mcp2.orig/fs/dcache.c 2004-01-19 07:49:43.000000000 -0800 ++++ linux.mcp2/fs/dcache.c 2004-05-05 14:19:59.000000000 -0700 +@@ -181,6 +181,13 @@ + spin_unlock(&dcache_lock); + return 0; + } ++ ++ /* network invalidation by Lustre */ ++ if (dentry->d_flags & DCACHE_LUSTRE_INVALID) { ++ spin_unlock(&dcache_lock); ++ return 0; ++ } ++ + /* + * Check whether to do a partial shrink_dcache + * to get rid of unused child entries. +@@ -830,13 +837,19 @@ + * Adds a dentry to the hash according to its name. + */ + +-void d_rehash(struct dentry * entry) ++void __d_rehash(struct dentry * entry, int lock) + { + struct list_head *list = d_hash(entry->d_parent, entry->d_name.hash); + if (!list_empty(&entry->d_hash)) BUG(); +- spin_lock(&dcache_lock); ++ if (lock) spin_lock(&dcache_lock); + list_add(&entry->d_hash, list); +- spin_unlock(&dcache_lock); ++ if (lock) spin_unlock(&dcache_lock); ++} ++EXPORT_SYMBOL(__d_rehash); ++ ++void d_rehash(struct dentry * entry) ++{ ++ __d_rehash(entry, 1); + } + + #define do_switch(x,y) do { \ +Index: linux.mcp2/fs/exec.c +=================================================================== +--- linux.mcp2.orig/fs/exec.c 2004-01-19 07:49:43.000000000 -0800 ++++ linux.mcp2/fs/exec.c 2004-05-05 14:19:59.000000000 -0700 +@@ -107,8 +107,10 @@ + struct file * file; + struct nameidata nd; + int error; ++ struct lookup_intent it = { .it_op = IT_OPEN, ++ .it_flags = FMODE_READ|FMODE_EXEC }; + +- error = user_path_walk(library, &nd); ++ error = user_path_walk_it(library, &nd, &it); + if (error) + goto out; + +@@ -120,7 +122,8 @@ + if (error) + goto exit; + +- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY); ++ file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &it); ++ intent_release(&it); + error = PTR_ERR(file); + if (IS_ERR(file)) + goto out; +@@ -342,9 +345,11 @@ + struct inode *inode; + struct file *file; + int err = 0; ++ struct lookup_intent it = { .it_op = IT_OPEN, ++ .it_flags = FMODE_READ|FMODE_EXEC }; + + if (path_init(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd)) +- err = path_walk(name, &nd); ++ err = path_walk_it(name, &nd, &it); + file = ERR_PTR(err); + if (!err) { + inode = nd.dentry->d_inode; +@@ -356,7 +361,8 @@ + err = -EACCES; + file = ERR_PTR(err); + if (!err) { +- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY); ++ file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &it); ++ intent_release(&it); + if (!IS_ERR(file)) { + err = deny_write_access(file); + if (err) { +@@ -368,6 +374,7 @@ + return file; + } + } ++ intent_release(&it); + path_release(&nd); + } + goto out; +@@ -969,7 +976,7 @@ + goto close_fail; + if (!file->f_op->write) + goto close_fail; +- if (do_truncate(file->f_dentry, 0) != 0) ++ if (do_truncate(file->f_dentry, 0, 
0) != 0) + goto close_fail; + + retval = binfmt->core_dump(signr, regs, file); +Index: linux.mcp2/fs/namei.c +=================================================================== +--- linux.mcp2.orig/fs/namei.c 2004-01-19 07:49:43.000000000 -0800 ++++ linux.mcp2/fs/namei.c 2004-05-05 14:28:26.000000000 -0700 +@@ -94,6 +94,13 @@ + * XEmacs seems to be relying on it... + */ + ++void intent_release(struct lookup_intent *it) ++{ ++ if (it && it->it_op_release) ++ it->it_op_release(it); ++ ++} ++ + /* In order to reduce some races, while at the same time doing additional + * checking and hopefully speeding things up, we copy filenames to the + * kernel data space before using them.. +@@ -260,10 +267,19 @@ + * Internal lookup() using the new generic dcache. + * SMP-safe + */ +-static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, int flags) ++static struct dentry *cached_lookup(struct dentry *parent, struct qstr *name, ++ int flags, struct lookup_intent *it) + { + struct dentry * dentry = d_lookup(parent, name); + ++ if (dentry && dentry->d_op && dentry->d_op->d_revalidate_it) { ++ if (!dentry->d_op->d_revalidate_it(dentry, flags, it) && ++ !d_invalidate(dentry)) { ++ dput(dentry); ++ dentry = NULL; ++ } ++ return dentry; ++ } else + if (dentry && dentry->d_op && dentry->d_op->d_revalidate) { + if (!dentry->d_op->d_revalidate(dentry, flags) && !d_invalidate(dentry)) { + dput(dentry); +@@ -281,11 +297,15 @@ + * make sure that nobody added the entry to the dcache in the meantime.. + * SMP-safe + */ +-static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, int flags) ++static struct dentry *real_lookup(struct dentry *parent, struct qstr *name, ++ int flags, struct lookup_intent *it) + { + struct dentry * result; + struct inode *dir = parent->d_inode; ++ int counter = 0; + ++again: ++ counter++; + down(&dir->i_sem); + /* + * First re-do the cached lookup just in case it was created +@@ -300,6 +320,9 @@ + result = ERR_PTR(-ENOMEM); + if (dentry) { + lock_kernel(); ++ if (dir->i_op->lookup_it) ++ result = dir->i_op->lookup_it(dir, dentry, it, flags); ++ else + result = dir->i_op->lookup(dir, dentry); + unlock_kernel(); + if (result) +@@ -321,6 +344,15 @@ + dput(result); + result = ERR_PTR(-ENOENT); + } ++ } else if (result->d_op && result->d_op->d_revalidate_it) { ++ if (!result->d_op->d_revalidate_it(result, flags, it) && ++ !d_invalidate(result)) { ++ dput(result); ++ if (counter > 10) ++ result = ERR_PTR(-ESTALE); ++ if (!IS_ERR(result)) ++ goto again; ++ } + } + return result; + } +@@ -332,7 +364,8 @@ + * Without that kind of total limit, nasty chains of consecutive + * symlinks can cause almost arbitrarily long lookups. + */ +-static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd) ++static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd, ++ struct lookup_intent *it) + { + int err; + if (current->link_count >= 5) +@@ -346,10 +379,12 @@ + current->link_count++; + current->total_link_count++; + UPDATE_ATIME(dentry->d_inode); ++ nd->intent = it; + err = dentry->d_inode->i_op->follow_link(dentry, nd); + current->link_count--; + return err; + loop: ++ intent_release(it); + path_release(nd); + return -ELOOP; + } +@@ -447,7 +482,8 @@ + * + * We expect 'base' to be positive and a directory. 
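++ * (link_path_walk_it() extends this walk with a struct lookup_intent
++ * that is passed to the final-component lookup and to follow_link,
++ * so intent-aware filesystems can see what the walk is for; plain
++ * link_path_walk() simply passes a NULL intent.)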
+ */ +-int link_path_walk(const char * name, struct nameidata *nd) ++int link_path_walk_it(const char *name, struct nameidata *nd, ++ struct lookup_intent *it) + { + struct dentry *dentry; + struct inode *inode; +@@ -520,9 +556,10 @@ + break; + } + /* This does the actual lookups.. */ +- dentry = cached_lookup(nd->dentry, &this, LOOKUP_CONTINUE); ++ dentry = cached_lookup(nd->dentry, &this, LOOKUP_CONTINUE, NULL); + if (!dentry) { +- dentry = real_lookup(nd->dentry, &this, LOOKUP_CONTINUE); ++ dentry = real_lookup(nd->dentry, &this, LOOKUP_CONTINUE, ++ NULL); + err = PTR_ERR(dentry); + if (IS_ERR(dentry)) + break; +@@ -540,7 +577,7 @@ + goto out_dput; + + if (inode->i_op->follow_link) { +- err = do_follow_link(dentry, nd); ++ err = do_follow_link(dentry, nd, NULL); + dput(dentry); + if (err) + goto return_err; +@@ -556,7 +593,7 @@ + nd->dentry = dentry; + } + err = -ENOTDIR; +- if (!inode->i_op->lookup) ++ if (!inode->i_op->lookup && !inode->i_op->lookup_it) + break; + continue; + /* here ends the main loop */ +@@ -583,9 +620,9 @@ + if (err < 0) + break; + } +- dentry = cached_lookup(nd->dentry, &this, 0); ++ dentry = cached_lookup(nd->dentry, &this, 0, it); + if (!dentry) { +- dentry = real_lookup(nd->dentry, &this, 0); ++ dentry = real_lookup(nd->dentry, &this, 0, it); + err = PTR_ERR(dentry); + if (IS_ERR(dentry)) + break; +@@ -595,7 +632,7 @@ + inode = dentry->d_inode; + if ((lookup_flags & LOOKUP_FOLLOW) + && inode && inode->i_op && inode->i_op->follow_link) { +- err = do_follow_link(dentry, nd); ++ err = do_follow_link(dentry, nd, it); + dput(dentry); + if (err) + goto return_err; +@@ -609,7 +646,8 @@ + goto no_inode; + if (lookup_flags & LOOKUP_DIRECTORY) { + err = -ENOTDIR; +- if (!inode->i_op || !inode->i_op->lookup) ++ if (!inode->i_op || ++ (!inode->i_op->lookup && !inode->i_op->lookup_it)) + break; + } + goto return_base; +@@ -633,6 +671,34 @@ + * Check the cached dentry for staleness. + */ + dentry = nd->dentry; ++ if (dentry && dentry->d_op && dentry->d_op->d_revalidate_it) { ++ err = -ESTALE; ++ if (!dentry->d_op->d_revalidate_it(dentry, 0, it)) { ++ struct dentry *new; ++ err = permission(dentry->d_parent->d_inode, ++ MAY_EXEC); ++ if (err) ++ break; ++ new = real_lookup(dentry->d_parent, ++ &dentry->d_name, 0, NULL); ++ if (IS_ERR(new)) { ++ err = PTR_ERR(new); ++ break; ++ } ++ d_invalidate(dentry); ++ dput(dentry); ++ nd->dentry = new; ++ } ++ if (!nd->dentry->d_inode) ++ goto no_inode; ++ if (lookup_flags & LOOKUP_DIRECTORY) { ++ err = -ENOTDIR; ++ if (!nd->dentry->d_inode->i_op || ++ (!nd->dentry->d_inode->i_op->lookup && ++ !nd->dentry->d_inode->i_op->lookup_it)) ++ break; ++ } ++ } else + if (dentry && dentry->d_op && dentry->d_op->d_revalidate) { + err = -ESTALE; + if (!dentry->d_op->d_revalidate(dentry, 0)) { +@@ -646,15 +703,28 @@ + dput(dentry); + break; + } ++ if (err) ++ intent_release(it); + path_release(nd); + return_err: + return err; + } + ++int link_path_walk(const char * name, struct nameidata *nd) ++{ ++ return link_path_walk_it(name, nd, NULL); ++} ++ ++int path_walk_it(const char * name, struct nameidata *nd, struct lookup_intent *it) ++{ ++ current->total_link_count = 0; ++ return link_path_walk_it(name, nd, it); ++} ++ + int path_walk(const char * name, struct nameidata *nd) + { + current->total_link_count = 0; +- return link_path_walk(name, nd); ++ return link_path_walk_it(name, nd, NULL); + } + + /* SMP-safe */ +@@ -743,6 +813,7 @@ + { + nd->last_type = LAST_ROOT; /* if there are only slashes... 
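++	 * (With an intent, the lookup is widened to IT_CREAT and carries
++	 * the requested creation mode, so the filesystem may create the
++	 * file during the final lookup itself via lookup_it/create_it.)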
*/ + nd->flags = flags; ++ nd->intent = NULL; + if (*name=='/') + return walk_init_root(name,nd); + read_lock(¤t->fs->lock); +@@ -757,7 +828,8 @@ + * needs parent already locked. Doesn't follow mounts. + * SMP-safe. + */ +-struct dentry * lookup_hash(struct qstr *name, struct dentry * base) ++struct dentry * lookup_hash_it(struct qstr *name, struct dentry * base, ++ struct lookup_intent *it) + { + struct dentry * dentry; + struct inode *inode; +@@ -780,13 +852,16 @@ + goto out; + } + +- dentry = cached_lookup(base, name, 0); ++ dentry = cached_lookup(base, name, 0, it); + if (!dentry) { + struct dentry *new = d_alloc(base, name); + dentry = ERR_PTR(-ENOMEM); + if (!new) + goto out; + lock_kernel(); ++ if (inode->i_op->lookup_it) ++ dentry = inode->i_op->lookup_it(inode, new, it, 0); ++ else + dentry = inode->i_op->lookup(inode, new); + unlock_kernel(); + if (!dentry) +@@ -798,6 +873,12 @@ + return dentry; + } + ++struct dentry * lookup_hash(struct qstr *name, struct dentry * base) ++{ ++ return lookup_hash_it(name, base, NULL); ++} ++ ++ + /* SMP-safe */ + struct dentry * lookup_one_len(const char * name, struct dentry * base, int len) + { +@@ -819,7 +900,7 @@ + } + this.hash = end_name_hash(hash); + +- return lookup_hash(&this, base); ++ return lookup_hash_it(&this, base, NULL); + access: + return ERR_PTR(-EACCES); + } +@@ -851,6 +932,23 @@ + return err; + } + ++int __user_walk_it(const char *name, unsigned flags, struct nameidata *nd, ++ struct lookup_intent *it) ++{ ++ char *tmp; ++ int err; ++ ++ tmp = getname(name); ++ err = PTR_ERR(tmp); ++ if (!IS_ERR(tmp)) { ++ err = 0; ++ if (path_init(tmp, flags, nd)) ++ err = path_walk_it(tmp, nd, it); ++ putname(tmp); ++ } ++ return err; ++} ++ + /* + * It's inline, so penalty for filesystems that don't use sticky bit is + * minimal. +@@ -946,7 +1044,8 @@ + return retval; + } + +-int vfs_create(struct inode *dir, struct dentry *dentry, int mode) ++static int vfs_create_it(struct inode *dir, struct dentry *dentry, int mode, ++ struct lookup_intent *it) + { + int error; + +@@ -959,12 +1058,15 @@ + goto exit_lock; + + error = -EACCES; /* shouldn't it be ENOSYS? */ +- if (!dir->i_op || !dir->i_op->create) ++ if (!dir->i_op || (!dir->i_op->create && !dir->i_op->create_it)) + goto exit_lock; + + DQUOT_INIT(dir); + lock_kernel(); +- error = dir->i_op->create(dir, dentry, mode); ++ if (dir->i_op->create_it) ++ error = dir->i_op->create_it(dir, dentry, mode, it); ++ else ++ error = dir->i_op->create(dir, dentry, mode); + unlock_kernel(); + exit_lock: + up(&dir->i_zombie); +@@ -973,6 +1075,11 @@ + return error; + } + ++int vfs_create(struct inode *dir, struct dentry *dentry, int mode) ++{ ++ return vfs_create_it(dir, dentry, mode, NULL); ++} ++ + /* + * open_namei() + * +@@ -987,7 +1094,8 @@ + * for symlinks (where the permissions are checked later). + * SMP-safe + */ +-int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd) ++int open_namei_it(const char *pathname, int flag, int mode, ++ struct nameidata *nd, struct lookup_intent *it) + { + int acc_mode, error = 0; + struct inode *inode; +@@ -997,12 +1105,14 @@ + + acc_mode = ACC_MODE(flag); + ++ if (it) ++ it->it_flags = flag; + /* + * The simplest case - just a plain lookup. + */ + if (!(flag & O_CREAT)) { + if (path_init(pathname, lookup_flags(flag), nd)) +- error = path_walk(pathname, nd); ++ error = path_walk_it(pathname, nd, it); + if (error) + return error; + dentry = nd->dentry; +@@ -1012,6 +1122,10 @@ + /* + * Create - we need to know the parent. 
+ */ ++ if (it) { ++ it->it_create_mode = mode; ++ it->it_op |= IT_CREAT; ++ } + if (path_init(pathname, LOOKUP_PARENT, nd)) + error = path_walk(pathname, nd); + if (error) +@@ -1028,7 +1142,7 @@ + + dir = nd->dentry; + down(&dir->d_inode->i_sem); +- dentry = lookup_hash(&nd->last, nd->dentry); ++ dentry = lookup_hash_it(&nd->last, nd->dentry, it); + + do_last: + error = PTR_ERR(dentry); +@@ -1037,10 +1151,11 @@ + goto exit; + } + ++ it->it_create_mode = mode; + /* Negative dentry, just create the file */ + if (!dentry->d_inode) { +- error = vfs_create(dir->d_inode, dentry, +- mode & ~current->fs->umask); ++ error = vfs_create_it(dir->d_inode, dentry, ++ mode & ~current->fs->umask, it); + up(&dir->d_inode->i_sem); + dput(nd->dentry); + nd->dentry = dentry; +@@ -1144,7 +1259,7 @@ + if (!error) { + DQUOT_INIT(inode); + +- error = do_truncate(dentry, 0); ++ error = do_truncate(dentry, 0, 1); + } + put_write_access(inode); + if (error) +@@ -1156,8 +1271,10 @@ + return 0; + + exit_dput: ++ intent_release(it); + dput(dentry); + exit: ++ intent_release(it); + path_release(nd); + return error; + +@@ -1176,7 +1293,10 @@ + * are done. Procfs-like symlinks just set LAST_BIND. + */ + UPDATE_ATIME(dentry->d_inode); ++ nd->intent = it; + error = dentry->d_inode->i_op->follow_link(dentry, nd); ++ if (error) ++ intent_release(it); + dput(dentry); + if (error) + return error; +@@ -1198,13 +1318,20 @@ + } + dir = nd->dentry; + down(&dir->d_inode->i_sem); +- dentry = lookup_hash(&nd->last, nd->dentry); ++ dentry = lookup_hash_it(&nd->last, nd->dentry, it); + putname(nd->last.name); + goto do_last; + } + ++int open_namei(const char *pathname, int flag, int mode, struct nameidata *nd) ++{ ++ return open_namei_it(pathname, flag, mode, nd, NULL); ++} ++ ++ + /* SMP-safe */ +-static struct dentry *lookup_create(struct nameidata *nd, int is_dir) ++static struct dentry *lookup_create(struct nameidata *nd, int is_dir, ++ struct lookup_intent *it) + { + struct dentry *dentry; + +@@ -1212,7 +1339,7 @@ + dentry = ERR_PTR(-EEXIST); + if (nd->last_type != LAST_NORM) + goto fail; +- dentry = lookup_hash(&nd->last, nd->dentry); ++ dentry = lookup_hash_it(&nd->last, nd->dentry, it); + if (IS_ERR(dentry)) + goto fail; + if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode) +@@ -1269,7 +1396,20 @@ + error = path_walk(tmp, &nd); + if (error) + goto out; +- dentry = lookup_create(&nd, 0); ++ ++ if (nd.last_type != LAST_NORM) { ++ error = -EEXIST; ++ goto out2; ++ } ++ if (nd.dentry->d_inode->i_op->mknod_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->mknod_raw(&nd, mode, dev); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out2; ++ } ++ ++ dentry = lookup_create(&nd, 0, NULL); + error = PTR_ERR(dentry); + + mode &= ~current->fs->umask; +@@ -1290,6 +1426,7 @@ + dput(dentry); + } + up(&nd.dentry->d_inode->i_sem); ++out2: + path_release(&nd); + out: + putname(tmp); +@@ -1338,7 +1475,18 @@ + error = path_walk(tmp, &nd); + if (error) + goto out; +- dentry = lookup_create(&nd, 1); ++ if (nd.last_type != LAST_NORM) { ++ error = -EEXIST; ++ goto out2; ++ } ++ if (nd.dentry->d_inode->i_op->mkdir_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->mkdir_raw(&nd, mode); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out2; ++ } ++ dentry = lookup_create(&nd, 1, NULL); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { + error = vfs_mkdir(nd.dentry->d_inode, dentry, +@@ 
-1346,6 +1490,7 @@ + dput(dentry); + } + up(&nd.dentry->d_inode->i_sem); ++out2: + path_release(&nd); + out: + putname(tmp); +@@ -1447,8 +1592,16 @@ + error = -EBUSY; + goto exit1; + } ++ if (nd.dentry->d_inode->i_op->rmdir_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ ++ error = op->rmdir_raw(&nd); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto exit1; ++ } + down(&nd.dentry->d_inode->i_sem); +- dentry = lookup_hash(&nd.last, nd.dentry); ++ dentry = lookup_hash_it(&nd.last, nd.dentry, NULL); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { + error = vfs_rmdir(nd.dentry->d_inode, dentry); +@@ -1507,8 +1660,15 @@ + error = -EISDIR; + if (nd.last_type != LAST_NORM) + goto exit1; ++ if (nd.dentry->d_inode->i_op->unlink_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->unlink_raw(&nd); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto exit1; ++ } + down(&nd.dentry->d_inode->i_sem); +- dentry = lookup_hash(&nd.last, nd.dentry); ++ dentry = lookup_hash_it(&nd.last, nd.dentry, NULL); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { + /* Why not before? Because we want correct error value */ +@@ -1576,15 +1736,27 @@ + error = path_walk(to, &nd); + if (error) + goto out; +- dentry = lookup_create(&nd, 0); ++ if (nd.last_type != LAST_NORM) { ++ error = -EEXIST; ++ goto out2; ++ } ++ if (nd.dentry->d_inode->i_op->symlink_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->symlink_raw(&nd, from); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out2; ++ } ++ dentry = lookup_create(&nd, 0, NULL); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { + error = vfs_symlink(nd.dentry->d_inode, dentry, from); + dput(dentry); + } + up(&nd.dentry->d_inode->i_sem); ++ out2: + path_release(&nd); +-out: ++ out: + putname(to); + } + putname(from); +@@ -1667,7 +1835,18 @@ + error = -EXDEV; + if (old_nd.mnt != nd.mnt) + goto out_release; +- new_dentry = lookup_create(&nd, 0); ++ if (nd.last_type != LAST_NORM) { ++ error = -EEXIST; ++ goto out_release; ++ } ++ if (nd.dentry->d_inode->i_op->link_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->link_raw(&old_nd, &nd); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out_release; ++ } ++ new_dentry = lookup_create(&nd, 0, NULL); + error = PTR_ERR(new_dentry); + if (!IS_ERR(new_dentry)) { + error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry); +@@ -1713,7 +1888,7 @@ + * locking]. 
+ */ + int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, +- struct inode *new_dir, struct dentry *new_dentry) ++ struct inode *new_dir, struct dentry *new_dentry) + { + int error; + struct inode *target; +@@ -1792,7 +1967,7 @@ + } + + int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, +- struct inode *new_dir, struct dentry *new_dentry) ++ struct inode *new_dir, struct dentry *new_dentry) + { + int error; + +@@ -1883,9 +2058,18 @@ + if (newnd.last_type != LAST_NORM) + goto exit2; + ++ if (old_dir->d_inode->i_op->rename_raw) { ++ lock_kernel(); ++ error = old_dir->d_inode->i_op->rename_raw(&oldnd, &newnd); ++ unlock_kernel(); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto exit2; ++ } ++ + double_lock(new_dir, old_dir); + +- old_dentry = lookup_hash(&oldnd.last, old_dir); ++ old_dentry = lookup_hash_it(&oldnd.last, old_dir, NULL); + error = PTR_ERR(old_dentry); + if (IS_ERR(old_dentry)) + goto exit3; +@@ -1901,16 +2085,16 @@ + if (newnd.last.name[newnd.last.len]) + goto exit4; + } +- new_dentry = lookup_hash(&newnd.last, new_dir); ++ new_dentry = lookup_hash_it(&newnd.last, new_dir, NULL); + error = PTR_ERR(new_dentry); + if (IS_ERR(new_dentry)) + goto exit4; + ++ + lock_kernel(); + error = vfs_rename(old_dir->d_inode, old_dentry, + new_dir->d_inode, new_dentry); + unlock_kernel(); +- + dput(new_dentry); + exit4: + dput(old_dentry); +@@ -1961,20 +2145,26 @@ + } + + static inline int +-__vfs_follow_link(struct nameidata *nd, const char *link) ++__vfs_follow_link(struct nameidata *nd, const char *link, ++ struct lookup_intent *it) + { + int res = 0; + char *name; + if (IS_ERR(link)) + goto fail; + ++ if (it == NULL) ++ it = nd->intent; ++ else if (it != nd->intent) ++ printk("it != nd->intent: tell phil@clusterfs.com\n"); ++ + if (*link == '/') { + path_release(nd); + if (!walk_init_root(link, nd)) + /* weird __emul_prefix() stuff did it */ + goto out; + } +- res = link_path_walk(link, nd); ++ res = link_path_walk_it(link, nd, it); + out: + if (current->link_count || res || nd->last_type!=LAST_NORM) + return res; +@@ -1996,7 +2186,13 @@ + + int vfs_follow_link(struct nameidata *nd, const char *link) + { +- return __vfs_follow_link(nd, link); ++ return __vfs_follow_link(nd, link, NULL); ++} ++ ++int vfs_follow_link_it(struct nameidata *nd, const char *link, ++ struct lookup_intent *it) ++{ ++ return __vfs_follow_link(nd, link, it); + } + + /* get the link contents into pagecache */ +@@ -2038,7 +2234,7 @@ + { + struct page *page = NULL; + char *s = page_getlink(dentry, &page); +- int res = __vfs_follow_link(nd, s); ++ int res = __vfs_follow_link(nd, s, NULL); + if (page) { + kunmap(page); + page_cache_release(page); +Index: linux.mcp2/fs/namespace.c +=================================================================== +--- linux.mcp2.orig/fs/namespace.c 2004-01-19 07:49:43.000000000 -0800 ++++ linux.mcp2/fs/namespace.c 2004-05-05 14:22:06.000000000 -0700 +@@ -97,6 +97,7 @@ + { + old_nd->dentry = mnt->mnt_mountpoint; + old_nd->mnt = mnt->mnt_parent; ++ UNPIN(old_nd->dentry, old_nd->mnt, 1); + mnt->mnt_parent = mnt; + mnt->mnt_mountpoint = mnt->mnt_root; + list_del_init(&mnt->mnt_child); +@@ -108,6 +109,7 @@ + { + mnt->mnt_parent = mntget(nd->mnt); + mnt->mnt_mountpoint = dget(nd->dentry); ++ PIN(nd->dentry, nd->mnt, 1); + list_add(&mnt->mnt_hash, mount_hashtable+hash(nd->mnt, nd->dentry)); + list_add(&mnt->mnt_child, &nd->mnt->mnt_mounts); + nd->dentry->d_mounted++; +@@ -491,15 +493,18 @@ + { + struct nameidata 
old_nd; + struct vfsmount *mnt = NULL; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + int err = mount_is_safe(nd); + if (err) + return err; + if (!old_name || !*old_name) + return -EINVAL; + if (path_init(old_name, LOOKUP_POSITIVE|LOOKUP_FOLLOW, &old_nd)) +- err = path_walk(old_name, &old_nd); +- if (err) ++ err = path_walk_it(old_name, &old_nd, &it); ++ if (err) { ++ intent_release(&it); + return err; ++ } + + down_write(¤t->namespace->sem); + err = -EINVAL; +@@ -522,6 +527,7 @@ + } + + up_write(¤t->namespace->sem); ++ intent_release(&it); + path_release(&old_nd); + return err; + } +@@ -706,6 +712,7 @@ + unsigned long flags, void *data_page) + { + struct nameidata nd; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + int retval = 0; + int mnt_flags = 0; + +@@ -731,9 +738,11 @@ + + /* ... and get the mountpoint */ + if (path_init(dir_name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd)) +- retval = path_walk(dir_name, &nd); +- if (retval) ++ retval = path_walk_it(dir_name, &nd, &it); ++ if (retval) { ++ intent_release(&it); + return retval; ++ } + + if (flags & MS_REMOUNT) + retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags, +@@ -745,6 +754,8 @@ + else + retval = do_add_mount(&nd, type_page, flags, mnt_flags, + dev_name, data_page); ++ ++ intent_release(&it); + path_release(&nd); + return retval; + } +@@ -910,6 +921,8 @@ + { + struct vfsmount *tmp; + struct nameidata new_nd, old_nd, parent_nd, root_parent, user_nd; ++ struct lookup_intent new_it = { .it_op = IT_GETATTR }; ++ struct lookup_intent old_it = { .it_op = IT_GETATTR }; + char *name; + int error; + +@@ -924,7 +937,7 @@ + goto out0; + error = 0; + if (path_init(name, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &new_nd)) +- error = path_walk(name, &new_nd); ++ error = path_walk_it(name, &new_nd, &new_it); + putname(name); + if (error) + goto out0; +@@ -938,7 +951,7 @@ + goto out1; + error = 0; + if (path_init(name, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &old_nd)) +- error = path_walk(name, &old_nd); ++ error = path_walk_it(name, &old_nd, &old_it); + putname(name); + if (error) + goto out1; +@@ -994,8 +1007,10 @@ + up(&old_nd.dentry->d_inode->i_zombie); + up_write(¤t->namespace->sem); + path_release(&user_nd); ++ intent_release(&old_it); + path_release(&old_nd); + out1: ++ intent_release(&new_it); + path_release(&new_nd); + out0: + unlock_kernel(); +Index: linux.mcp2/fs/open.c +=================================================================== +--- linux.mcp2.orig/fs/open.c 2004-01-19 07:49:43.000000000 -0800 ++++ linux.mcp2/fs/open.c 2004-05-05 14:30:34.000000000 -0700 +@@ -19,6 +19,8 @@ + #include + + #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) ++extern int path_walk_it(const char *name, struct nameidata *nd, ++ struct lookup_intent *it); + + int vfs_statfs(struct super_block *sb, struct statfs *buf) + { +@@ -95,9 +97,10 @@ + write_unlock(&files->file_lock); + } + +-int do_truncate(struct dentry *dentry, loff_t length) ++int do_truncate(struct dentry *dentry, loff_t length, int called_from_open) + { + struct inode *inode = dentry->d_inode; ++ struct inode_operations *op = dentry->d_inode->i_op; + int error; + struct iattr newattrs; + +@@ -108,7 +111,13 @@ + down(&inode->i_sem); + newattrs.ia_size = length; + newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; +- error = notify_change(dentry, &newattrs); ++ if (called_from_open) ++ newattrs.ia_valid |= ATTR_FROM_OPEN; ++ if (op->setattr_raw) { ++ newattrs.ia_valid |= ATTR_RAW; ++ error = op->setattr_raw(inode, &newattrs); ++ } else ++ 
error = notify_change(dentry, &newattrs); + up(&inode->i_sem); + return error; + } +@@ -118,12 +127,13 @@ + struct nameidata nd; + struct inode * inode; + int error; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + + error = -EINVAL; + if (length < 0) /* sorry, but loff_t says... */ + goto out; + +- error = user_path_walk(path, &nd); ++ error = user_path_walk_it(path, &nd, &it); + if (error) + goto out; + inode = nd.dentry->d_inode; +@@ -163,11 +173,13 @@ + error = locks_verify_truncate(inode, NULL, length); + if (!error) { + DQUOT_INIT(inode); +- error = do_truncate(nd.dentry, length); ++ intent_release(&it); ++ error = do_truncate(nd.dentry, length, 0); + } + put_write_access(inode); + + dput_and_out: ++ intent_release(&it); + path_release(&nd); + out: + return error; +@@ -215,7 +227,7 @@ + + error = locks_verify_truncate(inode, file, length); + if (!error) +- error = do_truncate(dentry, length); ++ error = do_truncate(dentry, length, 0); + out_putf: + fput(file); + out: +@@ -260,11 +272,13 @@ + struct inode * inode; + struct iattr newattrs; + +- error = user_path_walk(filename, &nd); ++ error = user_path_walk_it(filename, &nd, NULL); + if (error) + goto out; + inode = nd.dentry->d_inode; + ++ /* this is safe without a Lustre lock because it only depends ++ on the super block */ + error = -EROFS; + if (IS_RDONLY(inode)) + goto dput_and_out; +@@ -279,11 +293,25 @@ + goto dput_and_out; + + newattrs.ia_valid |= ATTR_ATIME_SET | ATTR_MTIME_SET; +- } else { ++ } ++ ++ if (inode->i_op->setattr_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ ++ newattrs.ia_valid |= ATTR_RAW; ++ error = op->setattr_raw(inode, &newattrs); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto dput_and_out; ++ } ++ ++ error = -EPERM; ++ if (!times) { + if (current->fsuid != inode->i_uid && + (error = permission(inode,MAY_WRITE)) != 0) + goto dput_and_out; + } ++ + error = notify_change(nd.dentry, &newattrs); + dput_and_out: + path_release(&nd); +@@ -304,12 +332,14 @@ + struct inode * inode; + struct iattr newattrs; + +- error = user_path_walk(filename, &nd); ++ error = user_path_walk_it(filename, &nd, NULL); + + if (error) + goto out; + inode = nd.dentry->d_inode; + ++ /* this is safe without a Lustre lock because it only depends ++ on the super block */ + error = -EROFS; + if (IS_RDONLY(inode)) + goto dput_and_out; +@@ -324,7 +354,20 @@ + newattrs.ia_atime = times[0].tv_sec; + newattrs.ia_mtime = times[1].tv_sec; + newattrs.ia_valid |= ATTR_ATIME_SET | ATTR_MTIME_SET; +- } else { ++ } ++ ++ if (inode->i_op->setattr_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ ++ newattrs.ia_valid |= ATTR_RAW; ++ error = op->setattr_raw(inode, &newattrs); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto dput_and_out; ++ } ++ ++ error = -EPERM; ++ if (!utimes) { + if (current->fsuid != inode->i_uid && + (error = permission(inode,MAY_WRITE)) != 0) + goto dput_and_out; +@@ -347,6 +390,7 @@ + int old_fsuid, old_fsgid; + kernel_cap_t old_cap; + int res; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + + if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? 
*/ + return -EINVAL; +@@ -364,13 +408,14 @@ + else + current->cap_effective = current->cap_permitted; + +- res = user_path_walk(filename, &nd); ++ res = user_path_walk_it(filename, &nd, &it); + if (!res) { + res = permission(nd.dentry->d_inode, mode); + /* SuS v2 requires we report a read only fs too */ + if(!res && (mode & S_IWOTH) && IS_RDONLY(nd.dentry->d_inode) + && !special_file(nd.dentry->d_inode->i_mode)) + res = -EROFS; ++ intent_release(&it); + path_release(&nd); + } + +@@ -386,6 +431,7 @@ + int error; + struct nameidata nd; + char *name; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + + name = getname(filename); + error = PTR_ERR(name); +@@ -394,7 +440,7 @@ + + error = 0; + if (path_init(name,LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY,&nd)) +- error = path_walk(name, &nd); ++ error = path_walk_it(name, &nd, &it); + putname(name); + if (error) + goto out; +@@ -406,6 +452,7 @@ + set_fs_pwd(current->fs, nd.mnt, nd.dentry); + + dput_and_out: ++ intent_release(&it); + path_release(&nd); + out: + return error; +@@ -446,6 +493,7 @@ + int error; + struct nameidata nd; + char *name; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + + name = getname(filename); + error = PTR_ERR(name); +@@ -454,7 +502,7 @@ + + path_init(name, LOOKUP_POSITIVE | LOOKUP_FOLLOW | + LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd); +- error = path_walk(name, &nd); ++ error = path_walk_it(name, &nd, &it); + putname(name); + if (error) + goto out; +@@ -471,39 +519,56 @@ + set_fs_altroot(); + error = 0; + dput_and_out: ++ intent_release(&it); + path_release(&nd); + out: + return error; + } + +-asmlinkage long sys_fchmod(unsigned int fd, mode_t mode) ++int chmod_common(struct dentry *dentry, mode_t mode) + { +- struct inode * inode; +- struct dentry * dentry; +- struct file * file; +- int err = -EBADF; ++ struct inode *inode = dentry->d_inode; + struct iattr newattrs; ++ int err = -EROFS; + +- file = fget(fd); +- if (!file) ++ if (IS_RDONLY(inode)) + goto out; + +- dentry = file->f_dentry; +- inode = dentry->d_inode; ++ if (inode->i_op->setattr_raw) { ++ newattrs.ia_mode = mode; ++ newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; ++ newattrs.ia_valid |= ATTR_RAW; ++ err = inode->i_op->setattr_raw(inode, &newattrs); ++ /* the file system wants to use normal vfs path now */ ++ if (err != -EOPNOTSUPP) ++ goto out; ++ } + +- err = -EROFS; +- if (IS_RDONLY(inode)) +- goto out_putf; + err = -EPERM; + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) +- goto out_putf; ++ goto out; ++ + if (mode == (mode_t) -1) + mode = inode->i_mode; + newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); + newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; + err = notify_change(dentry, &newattrs); + +-out_putf: ++out: ++ return err; ++} ++ ++asmlinkage long sys_fchmod(unsigned int fd, mode_t mode) ++{ ++ struct file * file; ++ int err = -EBADF; ++ ++ file = fget(fd); ++ if (!file) ++ goto out; ++ ++ err = chmod_common(file->f_dentry, mode); ++ + fput(file); + out: + return err; +@@ -512,30 +577,14 @@ + asmlinkage long sys_chmod(const char * filename, mode_t mode) + { + struct nameidata nd; +- struct inode * inode; + int error; +- struct iattr newattrs; + + error = user_path_walk(filename, &nd); + if (error) + goto out; +- inode = nd.dentry->d_inode; +- +- error = -EROFS; +- if (IS_RDONLY(inode)) +- goto dput_and_out; + +- error = -EPERM; +- if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) +- goto dput_and_out; ++ error = chmod_common(nd.dentry, mode); + +- if (mode == (mode_t) -1) +- mode = inode->i_mode; +- newattrs.ia_mode = (mode & 
S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); +- newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; +- error = notify_change(nd.dentry, &newattrs); +- +-dput_and_out: + path_release(&nd); + out: + return error; +@@ -555,6 +604,20 @@ + error = -EROFS; + if (IS_RDONLY(inode)) + goto out; ++ ++ if (inode->i_op->setattr_raw) { ++ struct inode_operations *op = dentry->d_inode->i_op; ++ ++ newattrs.ia_uid = user; ++ newattrs.ia_gid = group; ++ newattrs.ia_valid = ATTR_UID | ATTR_GID | ATTR_CTIME; ++ newattrs.ia_valid |= ATTR_RAW; ++ error = op->setattr_raw(inode, &newattrs); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ return error; ++ } ++ + error = -EPERM; + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + goto out; +@@ -659,6 +722,7 @@ + { + int namei_flags, error; + struct nameidata nd; ++ struct lookup_intent it = { .it_op = IT_OPEN }; + + namei_flags = flags; + if ((namei_flags+1) & O_ACCMODE) +@@ -666,14 +730,15 @@ + if (namei_flags & O_TRUNC) + namei_flags |= 2; + +- error = open_namei(filename, namei_flags, mode, &nd); +- if (!error) +- return dentry_open(nd.dentry, nd.mnt, flags); ++ error = open_namei_it(filename, namei_flags, mode, &nd, &it); ++ if (error) ++ return ERR_PTR(error); + +- return ERR_PTR(error); ++ return dentry_open_it(nd.dentry, nd.mnt, flags, &it); + } + +-struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags) ++struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt, ++ int flags, struct lookup_intent *it) + { + struct file * f; + struct inode *inode; +@@ -710,12 +775,15 @@ + } + + if (f->f_op && f->f_op->open) { ++ f->f_it = it; + error = f->f_op->open(inode,f); ++ f->f_it = NULL; + if (error) + goto cleanup_all; + } + f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); + ++ intent_release(it); + return f; + + cleanup_all: +@@ -730,11 +798,17 @@ + cleanup_file: + put_filp(f); + cleanup_dentry: ++ intent_release(it); + dput(dentry); + mntput(mnt); + return ERR_PTR(error); + } + ++struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags) ++{ ++ return dentry_open_it(dentry, mnt, flags, NULL); ++} ++ + /* + * Find an empty file descriptor entry, and mark it busy. + */ +Index: linux.mcp2/fs/stat.c +=================================================================== +--- linux.mcp2.orig/fs/stat.c 2004-01-19 07:49:43.000000000 -0800 ++++ linux.mcp2/fs/stat.c 2004-05-05 14:19:59.000000000 -0700 +@@ -17,10 +17,12 @@ + * Revalidate the inode. This is required for proper NFS attribute caching. 
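An aside on the open-path hunks above: the lookup intent now travels from filp_open() through open_namei_it() into dentry_open_it(), which publishes it via f->f_it only while f_op->open() runs and releases it on every exit path. A condensed sketch of that contract, using exactly the names the patch introduces:

    struct lookup_intent it = { .it_op = IT_OPEN };
    struct nameidata nd;
    int error;

    error = open_namei_it(filename, namei_flags, mode, &nd, &it);
    if (error)
            return ERR_PTR(error);

    /* dentry_open_it() sets f->f_it around f_op->open(), clears it,
     * and calls intent_release() on both the success return and the
     * cleanup_dentry error path, so a filesystem should copy out of
     * the intent anything it still needs before open returns. */
    return dentry_open_it(nd.dentry, nd.mnt, flags, &it);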
+ */ + static __inline__ int +-do_revalidate(struct dentry *dentry) ++do_revalidate(struct dentry *dentry, struct lookup_intent *it) + { + struct inode * inode = dentry->d_inode; +- if (inode->i_op && inode->i_op->revalidate) ++ if (inode->i_op && inode->i_op->revalidate_it) ++ return inode->i_op->revalidate_it(dentry, it); ++ else if (inode->i_op && inode->i_op->revalidate) + return inode->i_op->revalidate(dentry); + return 0; + } +@@ -135,13 +139,15 @@ + asmlinkage long sys_stat(char * filename, struct __old_kernel_stat * statbuf) + { + struct nameidata nd; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + int error; + +- error = user_path_walk(filename, &nd); ++ error = user_path_walk_it(filename, &nd, &it); + if (!error) { +- error = do_revalidate(nd.dentry); ++ error = do_revalidate(nd.dentry, &it); + if (!error) + error = cp_old_stat(nd.dentry->d_inode, statbuf); ++ intent_release(&it); + path_release(&nd); + } + return error; +@@ -151,13 +157,15 @@ + asmlinkage long sys_newstat(char * filename, struct stat * statbuf) + { + struct nameidata nd; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + int error; + +- error = user_path_walk(filename, &nd); ++ error = user_path_walk_it(filename, &nd, &it); + if (!error) { +- error = do_revalidate(nd.dentry); ++ error = do_revalidate(nd.dentry, &it); + if (!error) + error = cp_new_stat(nd.dentry->d_inode, statbuf); ++ intent_release(&it); + path_release(&nd); + } + return error; +@@ -172,13 +180,15 @@ + asmlinkage long sys_lstat(char * filename, struct __old_kernel_stat * statbuf) + { + struct nameidata nd; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + int error; + +- error = user_path_walk_link(filename, &nd); ++ error = user_path_walk_link_it(filename, &nd, &it); + if (!error) { +- error = do_revalidate(nd.dentry); ++ error = do_revalidate(nd.dentry, &it); + if (!error) + error = cp_old_stat(nd.dentry->d_inode, statbuf); ++ intent_release(&it); + path_release(&nd); + } + return error; +@@ -189,13 +199,15 @@ + asmlinkage long sys_newlstat(char * filename, struct stat * statbuf) + { + struct nameidata nd; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + int error; + +- error = user_path_walk_link(filename, &nd); ++ error = user_path_walk_link_it(filename, &nd, &it); + if (!error) { +- error = do_revalidate(nd.dentry); ++ error = do_revalidate(nd.dentry, &it); + if (!error) + error = cp_new_stat(nd.dentry->d_inode, statbuf); ++ intent_release(&it); + path_release(&nd); + } + return error; +@@ -216,7 +228,7 @@ + if (f) { + struct dentry * dentry = f->f_dentry; + +- err = do_revalidate(dentry); ++ err = do_revalidate(dentry, NULL); + if (!err) + err = cp_old_stat(dentry->d_inode, statbuf); + fput(f); +@@ -235,7 +247,7 @@ + if (f) { + struct dentry * dentry = f->f_dentry; + +- err = do_revalidate(dentry); ++ err = do_revalidate(dentry, NULL); + if (!err) + err = cp_new_stat(dentry->d_inode, statbuf); + fput(f); +@@ -257,7 +269,7 @@ + + error = -EINVAL; + if (inode->i_op && inode->i_op->readlink && +- !(error = do_revalidate(nd.dentry))) { ++ !(error = do_revalidate(nd.dentry, NULL))) { + UPDATE_ATIME(inode); + error = inode->i_op->readlink(nd.dentry, buf, bufsiz); + } +@@ -333,12 +345,14 @@ + { + struct nameidata nd; + int error; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + +- error = user_path_walk(filename, &nd); ++ error = user_path_walk_it(filename, &nd, &it); + if (!error) { +- error = do_revalidate(nd.dentry); ++ error = do_revalidate(nd.dentry, &it); + if (!error) + error = 
cp_new_stat64(nd.dentry->d_inode, statbuf); ++ intent_release(&it); + path_release(&nd); + } + return error; +@@ -348,12 +362,14 @@ + { + struct nameidata nd; + int error; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + +- error = user_path_walk_link(filename, &nd); ++ error = user_path_walk_link_it(filename, &nd, &it); + if (!error) { +- error = do_revalidate(nd.dentry); ++ error = do_revalidate(nd.dentry, &it); + if (!error) + error = cp_new_stat64(nd.dentry->d_inode, statbuf); ++ intent_release(&it); + path_release(&nd); + } + return error; +@@ -368,7 +384,7 @@ + if (f) { + struct dentry * dentry = f->f_dentry; + +- err = do_revalidate(dentry); ++ err = do_revalidate(dentry, NULL); + if (!err) + err = cp_new_stat64(dentry->d_inode, statbuf); + fput(f); +Index: linux.mcp2/include/linux/dcache.h +=================================================================== +--- linux.mcp2.orig/include/linux/dcache.h 2004-04-23 16:52:28.000000000 -0700 ++++ linux.mcp2/include/linux/dcache.h 2004-05-05 14:19:59.000000000 -0700 +@@ -5,6 +5,51 @@ + + #include + #include ++#include ++ ++#define IT_OPEN 0x0001 ++#define IT_CREAT 0x0002 ++#define IT_READDIR 0x0004 ++#define IT_GETATTR 0x0008 ++#define IT_LOOKUP 0x0010 ++#define IT_UNLINK 0x0020 ++#define IT_GETXATTR 0x0040 ++#define IT_EXEC 0x0080 ++#define IT_PIN 0x0100 ++ ++#define IT_FL_LOCKED 0x0001 ++#define IT_FL_FOLLOWED 0x0002 /* set by vfs_follow_link */ ++ ++#define INTENT_MAGIC 0x19620323 ++ ++ ++struct lustre_intent_data { ++ int it_disposition; ++ int it_status; ++ __u64 it_lock_handle; ++ void *it_data; ++ int it_lock_mode; ++ int it_int_flags; ++}; ++struct lookup_intent { ++ int it_magic; ++ void (*it_op_release)(struct lookup_intent *); ++ int it_op; ++ int it_flags; ++ int it_create_mode; ++ union { ++ struct lustre_intent_data lustre; ++ } d; ++}; ++ ++static inline void intent_init(struct lookup_intent *it, int op, int flags) ++{ ++ memset(it, 0, sizeof(*it)); ++ it->it_magic = INTENT_MAGIC; ++ it->it_op = op; ++ it->it_flags = flags; ++} ++ + + /* + * linux/include/linux/dcache.h +@@ -90,8 +135,22 @@ + int (*d_delete)(struct dentry *); + void (*d_release)(struct dentry *); + void (*d_iput)(struct dentry *, struct inode *); ++ int (*d_revalidate_it)(struct dentry *, int, struct lookup_intent *); ++ void (*d_pin)(struct dentry *, struct vfsmount * , int); ++ void (*d_unpin)(struct dentry *, struct vfsmount *, int); + }; + ++#define PIN(de,mnt,flag) if (de && de->d_op && de->d_op->d_pin) \ ++ de->d_op->d_pin(de, mnt, flag); ++#define UNPIN(de,mnt,flag) if (de && de->d_op && de->d_op->d_unpin) \ ++ de->d_op->d_unpin(de, mnt, flag); ++ ++ ++/* defined in fs/namei.c */ ++extern void intent_release(struct lookup_intent *it); ++/* defined in fs/dcache.c */ ++extern void __d_rehash(struct dentry * entry, int lock); ++ + /* the dentry parameter passed to d_hash and d_compare is the parent + * directory of the entries to be compared. It is used in case these + * functions need any directory specific information for determining +@@ -123,6 +182,7 @@ + * s_nfsd_free_path semaphore will be down + */ + #define DCACHE_REFERENCED 0x0008 /* Recently used, don't discard. 
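The intent machinery added to dcache.h above is deliberately minimal: intent_init() zero-fills the structure and stamps INTENT_MAGIC, while intent_release() is only declared here (its body lives in fs/namei.c, outside this hunk). A sketch of the release hook's assumed shape, inferred from intent_init() and the magic field; the magic guard is an assumption, not quoted from the patch:

    void intent_release(struct lookup_intent *it)
    {
            /* ignore intents that were never run through intent_init()
             * or that no filesystem claimed during the path walk */
            if (it == NULL || it->it_magic != INTENT_MAGIC)
                    return;
            if (it->it_op_release)
                    it->it_op_release(it);
    }

Under this reading, the on-stack intents built with designated initializers in the fs/open.c and fs/stat.c hunks (which leave it_magic zero) release as no-ops unless the filesystem stamps them while the lookup is in flight.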
*/ ++#define DCACHE_LUSTRE_INVALID 0x0010 /* Lustre invalidated */ + + extern spinlock_t dcache_lock; + +Index: linux.mcp2/include/linux/fs.h +=================================================================== +--- linux.mcp2.orig/include/linux/fs.h 2004-05-05 14:12:28.000000000 -0700 ++++ linux.mcp2/include/linux/fs.h 2004-05-05 14:19:59.000000000 -0700 +@@ -73,6 +73,7 @@ + + #define FMODE_READ 1 + #define FMODE_WRITE 2 ++#define FMODE_EXEC 4 + + #define READ 0 + #define WRITE 1 +@@ -335,6 +336,9 @@ + #define ATTR_MTIME_SET 256 + #define ATTR_FORCE 512 /* Not a change, but a change it */ + #define ATTR_ATTR_FLAG 1024 ++#define ATTR_RAW 0x0800 /* file system, not vfs will massage attrs */ ++#define ATTR_FROM_OPEN 0x1000 /* called from open path, ie O_TRUNC */ ++#define ATTR_CTIME_SET 0x2000 + + /* + * This is the Inode Attributes structure, used for notify_change(). It +@@ -470,6 +474,7 @@ + struct pipe_inode_info *i_pipe; + struct block_device *i_bdev; + struct char_device *i_cdev; ++ void *i_filterdata; + + unsigned long i_dnotify_mask; /* Directory notify events */ + struct dnotify_struct *i_dnotify; /* for directory notifications */ +@@ -574,6 +579,7 @@ + + /* needed for tty driver, and maybe others */ + void *private_data; ++ struct lookup_intent *f_it; + + /* preallocated helper kiobuf to speedup O_DIRECT */ + struct kiobuf *f_iobuf; +@@ -692,6 +698,7 @@ + struct qstr last; + unsigned int flags; + int last_type; ++ struct lookup_intent *intent; + }; + + #define DQUOT_USR_ENABLED 0x01 /* User diskquotas enabled */ +@@ -840,7 +847,8 @@ + extern int vfs_link(struct dentry *, struct inode *, struct dentry *); + extern int vfs_rmdir(struct inode *, struct dentry *); + extern int vfs_unlink(struct inode *, struct dentry *); +-extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); ++int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, ++ struct inode *new_dir, struct dentry *new_dentry); + + /* + * File types +@@ -900,21 +908,32 @@ + + struct inode_operations { + int (*create) (struct inode *,struct dentry *,int); ++ int (*create_it) (struct inode *,struct dentry *,int, struct lookup_intent *); + struct dentry * (*lookup) (struct inode *,struct dentry *); ++ struct dentry * (*lookup_it) (struct inode *,struct dentry *, struct lookup_intent *, int flags); + int (*link) (struct dentry *,struct inode *,struct dentry *); ++ int (*link_raw) (struct nameidata *,struct nameidata *); + int (*unlink) (struct inode *,struct dentry *); ++ int (*unlink_raw) (struct nameidata *); + int (*symlink) (struct inode *,struct dentry *,const char *); ++ int (*symlink_raw) (struct nameidata *,const char *); + int (*mkdir) (struct inode *,struct dentry *,int); ++ int (*mkdir_raw) (struct nameidata *,int); + int (*rmdir) (struct inode *,struct dentry *); ++ int (*rmdir_raw) (struct nameidata *); + int (*mknod) (struct inode *,struct dentry *,int,int); ++ int (*mknod_raw) (struct nameidata *,int,dev_t); + int (*rename) (struct inode *, struct dentry *, + struct inode *, struct dentry *); ++ int (*rename_raw) (struct nameidata *, struct nameidata *); + int (*readlink) (struct dentry *, char *,int); + int (*follow_link) (struct dentry *, struct nameidata *); + void (*truncate) (struct inode *); + int (*permission) (struct inode *, int); + int (*revalidate) (struct dentry *); ++ int (*revalidate_it) (struct dentry *, struct lookup_intent *); + int (*setattr) (struct dentry *, struct iattr *); ++ int (*setattr_raw) (struct inode *, struct iattr *); + int (*getattr) (struct 
dentry *, struct iattr *); + }; + +@@ -1115,10 +1134,14 @@ + + asmlinkage long sys_open(const char *, int, int); + asmlinkage long sys_close(unsigned int); /* yes, it's really unsigned */ +-extern int do_truncate(struct dentry *, loff_t start); ++extern int do_truncate(struct dentry *, loff_t start, int called_from_open); + + extern struct file *filp_open(const char *, int, int); + extern struct file * dentry_open(struct dentry *, struct vfsmount *, int); ++extern int open_namei_it(const char *filename, int namei_flags, int mode, ++ struct nameidata *nd, struct lookup_intent *it); ++extern struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt, ++ int flags, struct lookup_intent *it); + extern int filp_close(struct file *, fl_owner_t id); + extern char * getname(const char *); + +@@ -1380,6 +1403,7 @@ + extern loff_t default_llseek(struct file *file, loff_t offset, int origin); + + extern int FASTCALL(__user_walk(const char *, unsigned, struct nameidata *)); ++extern int FASTCALL(__user_walk_it(const char *, unsigned, struct nameidata *, struct lookup_intent *it)); + extern int FASTCALL(path_init(const char *, unsigned, struct nameidata *)); + extern int FASTCALL(path_walk(const char *, struct nameidata *)); + extern int FASTCALL(link_path_walk(const char *, struct nameidata *)); +@@ -1390,6 +1414,8 @@ + extern struct dentry * lookup_hash(struct qstr *, struct dentry *); + #define user_path_walk(name,nd) __user_walk(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd) + #define user_path_walk_link(name,nd) __user_walk(name, LOOKUP_POSITIVE, nd) ++#define user_path_walk_it(name,nd,it) __user_walk_it(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd, it) ++#define user_path_walk_link_it(name,nd,it) __user_walk_it(name, LOOKUP_POSITIVE, nd, it) + + extern void iput(struct inode *); + extern void force_delete(struct inode *); +@@ -1499,6 +1525,8 @@ + + extern int vfs_readlink(struct dentry *, char *, int, const char *); + extern int vfs_follow_link(struct nameidata *, const char *); ++extern int vfs_follow_link_it(struct nameidata *, const char *, ++ struct lookup_intent *it); + extern int page_readlink(struct dentry *, char *, int); + extern int page_follow_link(struct dentry *, struct nameidata *); + extern struct inode_operations page_symlink_inode_operations; +Index: linux.mcp2/include/linux/fs_struct.h +=================================================================== +--- linux.mcp2.orig/include/linux/fs_struct.h 2004-01-19 07:49:42.000000000 -0800 ++++ linux.mcp2/include/linux/fs_struct.h 2004-05-05 14:19:59.000000000 -0700 +@@ -34,10 +34,12 @@ + write_lock(&fs->lock); + old_root = fs->root; + old_rootmnt = fs->rootmnt; ++ PIN(dentry, mnt, 1); + fs->rootmnt = mntget(mnt); + fs->root = dget(dentry); + write_unlock(&fs->lock); + if (old_root) { ++ UNPIN(old_root, old_rootmnt, 1); + dput(old_root); + mntput(old_rootmnt); + } +@@ -57,10 +59,12 @@ + write_lock(&fs->lock); + old_pwd = fs->pwd; + old_pwdmnt = fs->pwdmnt; ++ PIN(dentry, mnt, 0); + fs->pwdmnt = mntget(mnt); + fs->pwd = dget(dentry); + write_unlock(&fs->lock); + if (old_pwd) { ++ UNPIN(old_pwd, old_pwdmnt, 0); + dput(old_pwd); + mntput(old_pwdmnt); + } +Index: linux.mcp2/kernel/exit.c +=================================================================== +--- linux.mcp2.orig/kernel/exit.c 2004-01-19 07:49:44.000000000 -0800 ++++ linux.mcp2/kernel/exit.c 2004-05-05 14:19:59.000000000 -0700 +@@ -252,11 +252,14 @@ + { + /* No need to hold fs->lock if we are killing it */ + if (atomic_dec_and_test(&fs->count)) { ++ UNPIN(fs->pwd, 
fs->pwdmnt, 0); ++ UNPIN(fs->root, fs->rootmnt, 1); + dput(fs->root); + mntput(fs->rootmnt); + dput(fs->pwd); + mntput(fs->pwdmnt); + if (fs->altroot) { ++ UNPIN(fs->altroot, fs->altrootmnt, 1); + dput(fs->altroot); + mntput(fs->altrootmnt); + } +Index: linux.mcp2/kernel/fork.c +=================================================================== +--- linux.mcp2.orig/kernel/fork.c 2004-01-19 07:49:44.000000000 -0800 ++++ linux.mcp2/kernel/fork.c 2004-05-05 14:19:59.000000000 -0700 +@@ -384,10 +384,13 @@ + fs->umask = old->umask; + read_lock(&old->lock); + fs->rootmnt = mntget(old->rootmnt); ++ PIN(old->pwd, old->pwdmnt, 0); ++ PIN(old->root, old->rootmnt, 1); + fs->root = dget(old->root); + fs->pwdmnt = mntget(old->pwdmnt); + fs->pwd = dget(old->pwd); + if (old->altroot) { ++ PIN(old->altroot, old->altrootmnt, 1); + fs->altrootmnt = mntget(old->altrootmnt); + fs->altroot = dget(old->altroot); + } else { +Index: linux.mcp2/kernel/ksyms.c +=================================================================== +--- linux.mcp2.orig/kernel/ksyms.c 2004-05-05 14:12:28.000000000 -0700 ++++ linux.mcp2/kernel/ksyms.c 2004-05-05 14:19:59.000000000 -0700 +@@ -264,6 +264,7 @@ + EXPORT_SYMBOL(set_page_dirty); + EXPORT_SYMBOL(vfs_readlink); + EXPORT_SYMBOL(vfs_follow_link); ++EXPORT_SYMBOL(vfs_follow_link_it); + EXPORT_SYMBOL(page_readlink); + EXPORT_SYMBOL(page_follow_link); + EXPORT_SYMBOL(page_symlink_inode_operations); diff --git a/lustre/kernel_patches/patches/vmalloc_to_page-2.4.19-bgl.patch b/lustre/kernel_patches/patches/vmalloc_to_page-2.4.19-bgl.patch new file mode 100644 index 0000000..1ff2f5d --- /dev/null +++ b/lustre/kernel_patches/patches/vmalloc_to_page-2.4.19-bgl.patch @@ -0,0 +1,12 @@ +Index: linux.mcp2/kernel/ksyms.c +=================================================================== +--- linux.mcp2.orig/kernel/ksyms.c 2004-05-05 14:57:48.000000000 -0700 ++++ linux.mcp2/kernel/ksyms.c 2004-05-05 15:32:44.000000000 -0700 +@@ -108,6 +108,7 @@ + EXPORT_SYMBOL(kfree); + EXPORT_SYMBOL(vfree); + EXPORT_SYMBOL(__vmalloc); ++extern struct page * vmalloc_to_page(void *addr); + EXPORT_SYMBOL(vmalloc_to_page); + EXPORT_SYMBOL(mem_map); + EXPORT_SYMBOL(remap_page_range); diff --git a/lustre/kernel_patches/series/bgl-2.4.19 b/lustre/kernel_patches/series/bgl-2.4.19 new file mode 100644 index 0000000..0a03eda --- /dev/null +++ b/lustre/kernel_patches/series/bgl-2.4.19 @@ -0,0 +1,46 @@ +dev_read_only_2.4.20-rh.patch +exports_2.4.19-bgl.patch +lustre_version.patch +vfs_intent-2.4.19-bgl.patch +invalidate_show-2.4.19-bgl.patch +export-truncate-bgl.patch +iod-stock-24-exports-2.4.19-bgl.patch +ext3-htree-2.4.19-bgl.patch +linux-2.4.19-bgl-xattr-0.8.54.patch +ext3-2.4.20-fixes.patch +ext3-2.4-ino_t.patch +ext3-largefile.patch +ext3-truncate_blocks.patch +ext3-unmount_sync.patch +ext3-use-after-free-2.4.19-pre1.patch +ext3-orphan_lock.patch +ext3-noread-2.4.20.patch +ext3-delete_thread-2.4.20.patch +extN-wantedi.patch +ext3-san-2.4.20.patch +ext3-map_inode_page.patch +ext3-error-export.patch +iopen-2.4.19-bgl.patch +tcp-zero-copy-2.4.19-pre1.patch +jbd-dont-account-blocks-twice.patch +jbd-commit-tricks.patch +ext3-no-write-super.patch +add_page_private-2.4.19-bgl.patch +socket-exports-2.4.19-bgl.patch +removepage-2.4.20.patch +jbd-ctx_switch.patch +jbd-flushtime-2.4.19-suse.patch +jbd-get_write_access.patch +nfs_export_kernel-2.4.19-bgl.patch +ext3-raw-lookup.patch +ext3-ea-in-inode-2.4.20.patch +listman-2.4.19-bgl.patch +ext3-trusted_ea-2.4.20.patch +jbd-2.4.19-pre1-jcberr.patch +resched-2.4.19-pre1.patch 
+ext3-xattr-ptr-arith-fix.patch +vmalloc_to_page-2.4.19-bgl.patch +ext3-truncate-buffer-head.patch +kallsyms-2.4-bgl.patch +kksymoops-2.4-bgl.patch +export-show_task-2.4-bgl.patch diff --git a/lustre/kernel_patches/targets/2.6-suse.target.in b/lustre/kernel_patches/targets/2.6-suse.target.in index b0aa7f1..4150cd1 100644 --- a/lustre/kernel_patches/targets/2.6-suse.target.in +++ b/lustre/kernel_patches/targets/2.6-suse.target.in @@ -15,7 +15,8 @@ BIGMEM_ARCHS="" BOOT_ARCHS="" JENSEN_ARCHS="" SMP_ARCHS="x86_64 ia64" -BIGSMP_ARCHS="i686 ppc" +BIGSMP_ARCHS="i686" +PSERIES64_ARCHS="ppc" UP_ARCHS="" SRC_ARCHS="" diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index 85830fc..28e6a7f 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -71,30 +71,31 @@ int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf) RETURN(-EINVAL); } - if (lcfg->lcfg_inllen1 < 1) { + if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) { CERROR("requires a TARGET UUID\n"); RETURN(-EINVAL); } - if (lcfg->lcfg_inllen1 > 37) { + if (LUSTRE_CFG_BUFLEN(lcfg, 1) > 37) { CERROR("client UUID must be less than 38 characters\n"); RETURN(-EINVAL); } - if (lcfg->lcfg_inllen2 < 1) { + if (LUSTRE_CFG_BUFLEN(lcfg, 2) < 1) { CERROR("setup requires a SERVER UUID\n"); RETURN(-EINVAL); } - if (lcfg->lcfg_inllen2 > 37) { + if (LUSTRE_CFG_BUFLEN(lcfg, 2) > 37) { CERROR("target UUID must be less than 38 characters\n"); RETURN(-EINVAL); } sema_init(&cli->cl_sem, 1); cli->cl_conn_count = 0; - memcpy(server_uuid.uuid, lcfg->lcfg_inlbuf2, - min_t(unsigned int, lcfg->lcfg_inllen2, sizeof(server_uuid))); + memcpy(server_uuid.uuid, lustre_cfg_buf(lcfg, 2), + min_t(unsigned int, LUSTRE_CFG_BUFLEN(lcfg, 2), + sizeof(server_uuid))); cli->cl_dirty = 0; cli->cl_avail_grant = 0; @@ -151,8 +152,8 @@ int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf) imp->imp_generation = 0; imp->imp_initial_recov = 1; INIT_LIST_HEAD(&imp->imp_pinger_chain); - memcpy(imp->imp_target_uuid.uuid, lcfg->lcfg_inlbuf1, - lcfg->lcfg_inllen1); + memcpy(imp->imp_target_uuid.uuid, lustre_cfg_buf(lcfg, 1), + LUSTRE_CFG_BUFLEN(lcfg, 1)); class_import_put(imp); cli->cl_import = imp; @@ -161,17 +162,17 @@ int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf) cli->cl_max_mds_cookiesize = sizeof(struct llog_cookie); cli->cl_sandev = to_kdev_t(0); - if (lcfg->lcfg_inllen3 != 0) { - if (!strcmp(lcfg->lcfg_inlbuf3, "inactive")) { + if (LUSTRE_CFG_BUFLEN(lcfg, 3) > 0) { + if (!strcmp(lustre_cfg_string(lcfg, 3), "inactive")) { CDEBUG(D_HA, "marking %s %s->%s as inactive\n", name, obddev->obd_name, imp->imp_target_uuid.uuid); imp->imp_invalid = 1; - if (lcfg->lcfg_inllen4 != 0) - mgmt_name = lcfg->lcfg_inlbuf4; + if (LUSTRE_CFG_BUFLEN(lcfg, 4) > 0) + mgmt_name = lustre_cfg_string(lcfg, 4); } else { - mgmt_name = lcfg->lcfg_inlbuf3; + mgmt_name = lustre_cfg_string(lcfg, 3); } } diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c index e46c0fe..f5616ad 100644 --- a/lustre/ldlm/ldlm_lockd.c +++ b/lustre/ldlm/ldlm_lockd.c @@ -1103,7 +1103,7 @@ static int ldlm_callback_handler(struct ptlrpc_request *req) case LDLM_GL_CALLBACK: OBD_FAIL_RETURN(OBD_FAIL_LDLM_GL_CALLBACK, 0); break; - case OBD_LOG_CANCEL: + case OBD_LOG_CANCEL: /* remove this eventually - for 1.4.0 compat */ OBD_FAIL_RETURN(OBD_FAIL_OBD_LOG_CANCEL_NET, 0); rc = llog_origin_handle_cancel(req); ldlm_callback_reply(req, rc); @@ -1200,15 +1200,16 @@ static int ldlm_cancel_handler(struct ptlrpc_request *req) if (req->rq_export == NULL) { struct ldlm_request 
*dlm_req; - CERROR("operation %d with bad export from %s\n", - req->rq_reqmsg->opc, - req->rq_peerstr); - CERROR("--> export cookie: "LPX64"\n", + + CERROR("operation %d from %s with bad export cookie "LPU64"\n", + req->rq_reqmsg->opc, req->rq_peerstr, req->rq_reqmsg->handle.cookie); + dlm_req = lustre_swab_reqbuf(req, 0, sizeof (*dlm_req), lustre_swab_ldlm_request); if (dlm_req != NULL) ldlm_lock_dump_handle(D_ERROR, &dlm_req->lock_handle1); + ldlm_callback_reply(req, -ENOTCONN); RETURN(0); } @@ -1223,7 +1224,11 @@ static int ldlm_cancel_handler(struct ptlrpc_request *req) if (rc) break; RETURN(0); - + case OBD_LOG_CANCEL: + OBD_FAIL_RETURN(OBD_FAIL_OBD_LOG_CANCEL_NET, 0); + rc = llog_origin_handle_cancel(req); + ldlm_callback_reply(req, rc); + RETURN(0); default: CERROR("invalid opcode %d\n", req->rq_reqmsg->opc); ldlm_callback_reply(req, -EINVAL); diff --git a/lustre/liblustre/Makefile.am b/lustre/liblustre/Makefile.am index aca2897..a776768 100644 --- a/lustre/liblustre/Makefile.am +++ b/lustre/liblustre/Makefile.am @@ -46,7 +46,8 @@ endif libllite_a_SOURCES = llite_lib.c super.c namei.c rw.c file.c dir.c llite_lib.h # for make rpms -- need cleanup -liblustre_a_SOURCES = llite_lib.c super.c namei.c rw.c file.c dir.c +liblustre_a_SOURCES = llite_lib.c super.c namei.c rw.c file.c dir.c \ + llite_lib.h liblustre.a : $(LUSTRE_LIBS) $(PTL_LIBS) $(SYSIO_LIBS) $(srcdir)/genlib.sh $(SYSIO) $(AR) $(LINK) || ( rm -f $@; exit 1 ) diff --git a/lustre/liblustre/llite_lib.c b/lustre/liblustre/llite_lib.c index da29c9f..2ec8d83 100644 --- a/lustre/liblustre/llite_lib.c +++ b/lustre/liblustre/llite_lib.c @@ -246,7 +246,8 @@ static void llu_check_request() int liblustre_process_log(struct config_llog_instance *cfg, int allow_recov) { - struct lustre_cfg lcfg; + struct lustre_cfg_bufs bufs; + struct lustre_cfg *lcfg; char *peer = "MDS_PEER_UUID"; struct obd_device *obd; struct lustre_handle mdc_conn = {0, }; @@ -272,30 +273,32 @@ int liblustre_process_log(struct config_llog_instance *cfg, int allow_recov) CERROR("Can't parse NAL tcp\n"); RETURN(-EINVAL); } - LCFG_INIT(lcfg, LCFG_ADD_UUID, NULL); - lcfg.lcfg_nid = nid; - lcfg.lcfg_inllen1 = strlen(peer) + 1; - lcfg.lcfg_inlbuf1 = peer; - lcfg.lcfg_nal = nal; - err = class_process_config(&lcfg); + + lustre_cfg_bufs_reset(&bufs, NULL); + lustre_cfg_bufs_set_string(&bufs, 1, peer); + lcfg = lustre_cfg_new(LCFG_ADD_UUID, &bufs); + lcfg->lcfg_nid = nid; + lcfg->lcfg_nal = nal; + err = class_process_config(lcfg); + lustre_cfg_free(lcfg); if (err < 0) GOTO(out, err); - LCFG_INIT(lcfg, LCFG_ATTACH, name); - lcfg.lcfg_inlbuf1 = "mdc"; - lcfg.lcfg_inllen1 = strlen(lcfg.lcfg_inlbuf1) + 1; - lcfg.lcfg_inlbuf2 = mdc_uuid.uuid; - lcfg.lcfg_inllen2 = strlen(lcfg.lcfg_inlbuf2) + 1; - err = class_process_config(&lcfg); + lustre_cfg_bufs_reset(&bufs, name); + lustre_cfg_bufs_set_string(&bufs, 1, LUSTRE_MDC_NAME); + lustre_cfg_bufs_set_string(&bufs, 2, mdc_uuid.uuid); + lcfg = lustre_cfg_new(LCFG_ATTACH, &bufs); + err = class_process_config(lcfg); + lustre_cfg_free(lcfg); if (err < 0) GOTO(out_del_uuid, err); - LCFG_INIT(lcfg, LCFG_SETUP, name); - lcfg.lcfg_inlbuf1 = g_zconf_mdsname; - lcfg.lcfg_inllen1 = strlen(lcfg.lcfg_inlbuf1) + 1; - lcfg.lcfg_inlbuf2 = peer; - lcfg.lcfg_inllen2 = strlen(lcfg.lcfg_inlbuf2) + 1; - err = class_process_config(&lcfg); + lustre_cfg_bufs_reset(&bufs, name); + lustre_cfg_bufs_set_string(&bufs, 1, g_zconf_mdsname); + lustre_cfg_bufs_set_string(&bufs, 2, peer); + lcfg = lustre_cfg_new(LCFG_SETUP, &bufs); + err = class_process_config(lcfg); + 
lustre_cfg_free(lcfg); if (err < 0) GOTO(out_detach, err); @@ -326,23 +329,27 @@ int liblustre_process_log(struct config_llog_instance *cfg, int allow_recov) err = obd_disconnect(exp); out_cleanup: - LCFG_INIT(lcfg, LCFG_CLEANUP, name); - err = class_process_config(&lcfg); + lustre_cfg_bufs_reset(&bufs, name); + lcfg = lustre_cfg_new(LCFG_CLEANUP, &bufs); + err = class_process_config(lcfg); + lustre_cfg_free(lcfg); if (err < 0) GOTO(out, err); out_detach: - LCFG_INIT(lcfg, LCFG_DETACH, name); - err = class_process_config(&lcfg); + lustre_cfg_bufs_reset(&bufs, name); + lcfg = lustre_cfg_new(LCFG_DETACH, &bufs); + err = class_process_config(lcfg); + lustre_cfg_free(lcfg); if (err < 0) GOTO(out, err); out_del_uuid: - LCFG_INIT(lcfg, LCFG_DEL_UUID, name); - lcfg.lcfg_inllen1 = strlen(peer) + 1; - lcfg.lcfg_inlbuf1 = peer; - err = class_process_config(&lcfg); - + lustre_cfg_bufs_reset(&bufs, name); + lustre_cfg_bufs_set_string(&bufs, 1, peer); + lcfg = lustre_cfg_new(LCFG_DEL_UUID, &bufs); + err = class_process_config(lcfg); + lustre_cfg_free(lcfg); out: if (rc == 0) rc = err; diff --git a/lustre/liblustre/tests/Makefile.am b/lustre/liblustre/tests/Makefile.am index a8a5545..0a9a1c0 100644 --- a/lustre/liblustre/tests/Makefile.am +++ b/lustre/liblustre/tests/Makefile.am @@ -19,7 +19,7 @@ endif # LIBLUSTRE libtestcommon_a_SOURCES = test_common.c test_common.h -echo_test_SOURCES = echo_test.c ../../utils/parser.c ../../utils/obd.c ../../utils/lustre_cfg.c +echo_test_SOURCES = echo_test.c $(top_srcdir)/lustre/utils/parser.c $(top_srcdir)/lustre/utils/obd.c $(top_srcdir)/lustre/utils/lustre_cfg.c echo_test_CFLAGS = $(LL_CFLAGS) echo_test_LDADD = ../liblsupport.a $(LIBREADLINE) -lpthread echo_test_DEPENDENCIES=$(top_builddir)/lustre/liblustre/liblsupport.a diff --git a/lustre/liblustre/tests/echo_test.c b/lustre/liblustre/tests/echo_test.c index 92018a8..1b70246 100644 --- a/lustre/liblustre/tests/echo_test.c +++ b/lustre/liblustre/tests/echo_test.c @@ -11,6 +11,7 @@ #include #include #include +#include #define LIBLUSTRE_TEST 1 #include "../utils/lctl.c" @@ -175,7 +176,8 @@ static char *echo_dev_name = "ECHO_CLIENT_DEV_NAME"; static int connect_echo_client(void) { - struct lustre_cfg lcfg; + struct lustre_cfg *lcfg; + struct lustre_cfg_bufs bufs; ptl_nid_t nid; char *peer = "ECHO_PEER_NID"; class_uuid_t osc_uuid, echo_uuid; @@ -199,60 +201,60 @@ static int connect_echo_client(void) } /* add uuid */ - LCFG_INIT(lcfg, LCFG_ADD_UUID, NULL); - lcfg.lcfg_nid = nid; - lcfg.lcfg_inllen1 = strlen(peer) + 1; - lcfg.lcfg_inlbuf1 = peer; - lcfg.lcfg_nal = nal; - err = class_process_config(&lcfg); + lustre_cfg_bufs_reset(&bufs, NULL); + lustre_cfg_bufs_set_string(&bufs, 1, peer); + lcfg = lustre_cfg_new(LCFG_ADD_UUID, &bufs); + lcfg->lcfg_nid = nid; + lcfg->lcfg_nal = nal; + err = class_process_config(lcfg); + lustre_cfg_free(lcfg); if (err < 0) { CERROR("failed add_uuid\n"); RETURN(-EINVAL); } /* attach osc */ - LCFG_INIT(lcfg, LCFG_ATTACH, osc_dev_name); - lcfg.lcfg_inlbuf1 = "osc"; - lcfg.lcfg_inllen1 = strlen(lcfg.lcfg_inlbuf1) + 1; - lcfg.lcfg_inlbuf2 = osc_uuid_str.uuid; - lcfg.lcfg_inllen2 = strlen(lcfg.lcfg_inlbuf2) + 1; - err = class_process_config(&lcfg); + lustre_cfg_bufs_reset(&bufs, LUSTRE_OSC_NAME); + lustre_cfg_bufs_set_string(&bufs, 2, osc_uuid_str.uuid); + lcfg = lustre_cfg_new(LCFG_ATTACH, &bufs); + err = class_process_config(lcfg); + lustre_cfg_free(lcfg); if (err < 0) { CERROR("failed attach osc\n"); RETURN(-EINVAL); } /* setup osc */ - LCFG_INIT(lcfg, LCFG_SETUP, osc_dev_name); - 
lcfg.lcfg_inlbuf1 = echo_server_ostname; - lcfg.lcfg_inllen1 = strlen(lcfg.lcfg_inlbuf1) + 1; - lcfg.lcfg_inlbuf2 = peer; - lcfg.lcfg_inllen2 = strlen(lcfg.lcfg_inlbuf2) + 1; - err = class_process_config(&lcfg); + lustre_cfg_bufs_reset(&bufs, osc_dev_name); + lustre_cfg_bufs_set_string(&bufs, 1, echo_server_ostname); + lustre_cfg_bufs_set_string(&bufs, 2, peer); + lcfg = lustre_cfg_new(LCFG_SETUP, &bufs); + err = class_process_config(lcfg); + lustre_cfg_free(lcfg); if (err < 0) { CERROR("failed setup osc\n"); RETURN(-EINVAL); } /* attach echo_client */ - LCFG_INIT(lcfg, LCFG_ATTACH, echo_dev_name); - lcfg.lcfg_inlbuf1 = "echo_client"; - lcfg.lcfg_inllen1 = strlen(lcfg.lcfg_inlbuf1) + 1; - lcfg.lcfg_inlbuf2 = echo_uuid_str.uuid; - lcfg.lcfg_inllen2 = strlen(lcfg.lcfg_inlbuf2) + 1; - err = class_process_config(&lcfg); + lustre_cfg_bufs_reset(&bufs, echo_dev_name); + lustre_cfg_bufs_set_string(&bufs, 1, "echo_client"); + lustre_cfg_bufs_set_string(&bufs, 2, echo_uuid_str.uuid); + lcfg = lustre_cfg_new(LCFG_ATTACH, &bufs); + err = class_process_config(lcfg); + lustre_cfg_free(lcfg); if (err < 0) { CERROR("failed attach echo_client\n"); RETURN(-EINVAL); } /* setup echo_client */ - LCFG_INIT(lcfg, LCFG_SETUP, echo_dev_name); - lcfg.lcfg_inlbuf1 = osc_dev_name; - lcfg.lcfg_inllen1 = strlen(lcfg.lcfg_inlbuf1) + 1; - lcfg.lcfg_inlbuf2 = NULL; - lcfg.lcfg_inllen2 = 0; - err = class_process_config(&lcfg); + lustre_cfg_bufs_reset(&bufs, echo_dev_name); + lustre_cfg_bufs_set_string(&bufs, 1, osc_dev_name); + lustre_cfg_bufs_set_string(&bufs, 2, NULL); + lcfg = lustre_cfg_new(LCFG_SETUP, &bufs); + err = class_process_config(lcfg); + lustre_cfg_free(lcfg); if (err < 0) { CERROR("failed setup echo_client\n"); RETURN(-EINVAL); @@ -263,37 +265,44 @@ static int connect_echo_client(void) static int disconnect_echo_client(void) { - struct lustre_cfg lcfg; + struct lustre_cfg_bufs bufs; + struct lustre_cfg *lcfg = NULL; int err; ENTRY; /* cleanup echo_client */ - LCFG_INIT(lcfg, LCFG_CLEANUP, echo_dev_name); - err = class_process_config(&lcfg); + lustre_cfg_bufs_reset(&bufs, echo_dev_name); + lcfg = lustre_cfg_new(LCFG_CLEANUP, &bufs); + err = class_process_config(lcfg); if (err < 0) { + lustre_cfg_free(lcfg); CERROR("failed cleanup echo_client\n"); RETURN(-EINVAL); } /* detach echo_client */ - LCFG_INIT(lcfg, LCFG_DETACH, echo_dev_name); - err = class_process_config(&lcfg); + lcfg->lcfg_command = LCFG_DETACH; + err = class_process_config(lcfg); + lustre_cfg_free(lcfg); if (err < 0) { CERROR("failed detach echo_client\n"); RETURN(-EINVAL); } /* cleanup osc */ - LCFG_INIT(lcfg, LCFG_CLEANUP, osc_dev_name); - err = class_process_config(&lcfg); + lustre_cfg_bufs_reset(&bufs, osc_dev_name); + lcfg = lustre_cfg_new(LCFG_CLEANUP, &bufs); + err = class_process_config(lcfg); if (err < 0) { + lustre_cfg_free(lcfg); CERROR("failed cleanup osc device\n"); RETURN(-EINVAL); } /* detach osc */ - LCFG_INIT(lcfg, LCFG_DETACH, osc_dev_name); - err = class_process_config(&lcfg); + lcfg->lcfg_command = LCFG_DETACH; + err = class_process_config(lcfg); + lustre_cfg_free(lcfg); if (err < 0) { CERROR("failed detach osc device\n"); RETURN(-EINVAL); diff --git a/lustre/llite/dir.c b/lustre/llite/dir.c index 1fba7a7..43f884e 100644 --- a/lustre/llite/dir.c +++ b/lustre/llite/dir.c @@ -447,9 +447,18 @@ static int ll_dir_ioctl(struct inode *inode, struct file *file, if (rc) return(-EFAULT); + /* + * This is coming from userspace, so should be in + * local endian. 
But the MDS would like it in little + * endian, so we swab it before we send it. + */ if (lum.lmm_magic != LOV_USER_MAGIC) RETURN(-EINVAL); + if (lum.lmm_magic != cpu_to_le32(LOV_USER_MAGIC)) + lustre_swab_lov_user_md(&lum); + + /* swabbing is done in lov_setstripe() on server side */ rc = mdc_setattr(sbi->ll_mdc_exp, &op_data, &attr, &lum, sizeof(lum), NULL, 0, &request); if (rc) { @@ -493,6 +502,17 @@ static int ll_dir_ioctl(struct inode *inode, struct file *file, lmm = lustre_msg_buf(request->rq_repmsg, 1, lmmsize); LASSERT(lmm != NULL); LASSERT_REPSWABBED(request, 1); + + /* + * This is coming from the MDS, so is probably in + * little endian. We convert it to host endian before + * passing it to userspace. + */ + if (lmm->lmm_magic == __swab32(LOV_MAGIC)) { + lustre_swab_lov_user_md((struct lov_user_md *)lmm); + lustre_swab_lov_user_md_objects((struct lov_user_md *)lmm); + } + rc = copy_to_user(lump, lmm, lmmsize); if (rc) GOTO(out_get, rc = -EFAULT); @@ -543,6 +563,16 @@ static int ll_dir_ioctl(struct inode *inode, struct file *file, LASSERT(lmm != NULL); LASSERT_REPSWABBED(request, 1); + /* + * This is coming from the MDS, so is probably in + * little endian. We convert it to host endian before + * passing it to userspace. + */ + if (lmm->lmm_magic == __swab32(LOV_MAGIC)) { + lustre_swab_lov_user_md((struct lov_user_md *)lmm); + lustre_swab_lov_user_md_objects((struct lov_user_md *)lmm); + } + if (cmd == IOC_MDC_GETFILEINFO) { struct lov_user_mds_data *lmdp; lstat_t st = { 0 }; diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c index fbc9200..e1c4ccf 100644 --- a/lustre/llite/llite_lib.c +++ b/lustre/llite/llite_lib.c @@ -46,6 +46,32 @@ extern struct address_space_operations ll_dir_aops; #define log2(n) ffz(~(n)) #endif +/* We need to have some extra twiddling here because some systems have + * no random state when they start up. 
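A standalone, userspace rendition of the mixing loop that follows, to make the byte rotation easy to see. The stdint types replace the kernel ones and the seed values are arbitrary stand-ins for jiffies and tv_usec; the masks in the kernel version are only needed because j and k are signed ints there, while plain rotates suffice for unsigned 32-bit values:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint32_t uuid[4] = { 0, 0, 0, 0 }; /* worst case: RNG returned nothing */
            uint32_t j = 12345, k = 678;       /* stand-ins for jiffies, tv_usec */
            int n;

            for (n = 0; n < 4; n++) {
                    uuid[n] ^= j ^ k;
                    j = (j << 8) | (j >> 24);  /* rotate j left one byte */
                    k = (k >> 8) | (k << 24);  /* rotate k right one byte */
            }
            /* each word now differs even though both seeds were small */
            printf("%08x %08x %08x %08x\n", uuid[0], uuid[1], uuid[2], uuid[3]);
            return 0;
    }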
*/ +static void +lustre_generate_random_uuid(class_uuid_t uuid) +{ + struct timeval t; + int *i, j, k; + + ENTRY; + LASSERT(sizeof(class_uuid_t) % sizeof(*i) == 0); + + j = jiffies; + do_gettimeofday(&t); + k = t.tv_usec; + + generate_random_uuid(uuid); + + for (i = (int *)uuid; (char *)i < (char *)uuid + sizeof(class_uuid_t); i++) { + *i ^= j ^ k; + j = ((j << 8) & 0xffffff00) | ((j >> 24) & 0x000000ff); + k = ((k >> 8) & 0x00ffffff) | ((k << 24) & 0xff000000); + } + + EXIT; +} + struct ll_sb_info *lustre_init_sbi(struct super_block *sb) { struct ll_sb_info *sbi = NULL; @@ -70,8 +96,9 @@ struct ll_sb_info *lustre_init_sbi(struct super_block *sb) INIT_HLIST_HEAD(&sbi->ll_orphan_dentry_list); ll_s2sbi_nocast(sb) = sbi; - generate_random_uuid(uuid); + lustre_generate_random_uuid(uuid); class_uuid_unparse(uuid, &sbi->ll_sb_uuid); + CDEBUG(D_HA, "generated uuid: %s\n", sbi->ll_sb_uuid.uuid); spin_lock(&ll_sb_lock); list_add_tail(&sbi->ll_list, &ll_super_blocks); @@ -107,6 +134,7 @@ int lustre_common_fill_super(struct super_block *sb, char *mdc, char *osc) struct lustre_md md; kdev_t devno; int err; + ENTRY; obd = class_name2obd(mdc); if (!obd) { @@ -324,7 +352,6 @@ void lustre_common_put_super(struct super_block *sb) EXIT; } - char *ll_read_opt(const char *opt, char *data) { char *value; @@ -376,10 +403,11 @@ void ll_options(char *options, char **ost, char **mdc, int *flags) #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) for (this_char = strtok (options, ","); this_char != NULL; - this_char = strtok (NULL, ",")) { + this_char = strtok (NULL, ",")) #else - while ((this_char = strsep (&opt_ptr, ",")) != NULL) { + while ((this_char = strsep (&opt_ptr, ",")) != NULL) #endif + { CDEBUG(D_SUPER, "this_char %s\n", this_char); if (!*ost && (*ost = ll_read_opt("osc", this_char))) continue; @@ -448,7 +476,8 @@ out: int lustre_process_log(struct lustre_mount_data *lmd, char * profile, struct config_llog_instance *cfg, int allow_recov) { - struct lustre_cfg lcfg; + struct lustre_cfg *lcfg = NULL; + struct lustre_cfg_bufs bufs; struct portals_cfg pcfg; char * peer = "MDS_PEER_UUID"; struct obd_device *obd; @@ -465,8 +494,9 @@ int lustre_process_log(struct lustre_mount_data *lmd, char * profile, if (lmd_bad_magic(lmd)) RETURN(-EINVAL); - generate_random_uuid(uuid); + lustre_generate_random_uuid(uuid); class_uuid_unparse(uuid, &mdc_uuid); + CDEBUG(D_HA, "generated uuid: %s\n", mdc_uuid.uuid); if (lmd->lmd_local_nid) { PCFG_INIT(pcfg, NAL_CMD_REGISTER_MYNID); @@ -492,30 +522,36 @@ int lustre_process_log(struct lustre_mount_data *lmd, char * profile, GOTO(out, err); } - LCFG_INIT(lcfg, LCFG_ADD_UUID, name); - lcfg.lcfg_nid = lmd->lmd_server_nid; - lcfg.lcfg_inllen1 = strlen(peer) + 1; - lcfg.lcfg_inlbuf1 = peer; - lcfg.lcfg_nal = lmd->lmd_nal; - err = class_process_config(&lcfg); + lustre_cfg_bufs_reset(&bufs, name); + lustre_cfg_bufs_set_string(&bufs, 1, peer); + + lcfg = lustre_cfg_new(LCFG_ADD_UUID, &bufs); + lcfg->lcfg_nal = lmd->lmd_nal; + lcfg->lcfg_nid = lmd->lmd_server_nid; + LASSERT(lcfg->lcfg_nal); + LASSERT(lcfg->lcfg_nid); + err = class_process_config(lcfg); + lustre_cfg_free(lcfg); if (err < 0) GOTO(out_del_conn, err); - LCFG_INIT(lcfg, LCFG_ATTACH, name); - lcfg.lcfg_inlbuf1 = "mdc"; - lcfg.lcfg_inllen1 = strlen(lcfg.lcfg_inlbuf1) + 1; - lcfg.lcfg_inlbuf2 = mdc_uuid.uuid; - lcfg.lcfg_inllen2 = strlen(lcfg.lcfg_inlbuf2) + 1; - err = class_process_config(&lcfg); + lustre_cfg_bufs_reset(&bufs, name); + lustre_cfg_bufs_set_string(&bufs, 1, LUSTRE_MDC_NAME); + lustre_cfg_bufs_set_string(&bufs, 2, 
mdc_uuid.uuid); + + lcfg = lustre_cfg_new(LCFG_ATTACH, &bufs); + err = class_process_config(lcfg); + lustre_cfg_free(lcfg); if (err < 0) GOTO(out_del_uuid, err); - LCFG_INIT(lcfg, LCFG_SETUP, name); - lcfg.lcfg_inlbuf1 = lmd->lmd_mds; - lcfg.lcfg_inllen1 = strlen(lcfg.lcfg_inlbuf1) + 1; - lcfg.lcfg_inlbuf2 = peer; - lcfg.lcfg_inllen2 = strlen(lcfg.lcfg_inlbuf2) + 1; - err = class_process_config(&lcfg); + lustre_cfg_bufs_reset(&bufs, name); + lustre_cfg_bufs_set_string(&bufs, 1, lmd->lmd_mds); + lustre_cfg_bufs_set_string(&bufs, 2, peer); + + lcfg = lustre_cfg_new(LCFG_SETUP, &bufs); + err = class_process_config(lcfg); + lustre_cfg_free(lcfg); if (err < 0) GOTO(out_detach, err); @@ -565,22 +601,27 @@ int lustre_process_log(struct lustre_mount_data *lmd, char * profile, err = obd_disconnect(exp); out_cleanup: - LCFG_INIT(lcfg, LCFG_CLEANUP, name); - err = class_process_config(&lcfg); + lustre_cfg_bufs_reset(&bufs, name); + lcfg = lustre_cfg_new(LCFG_CLEANUP, &bufs); + err = class_process_config(lcfg); + lustre_cfg_free(lcfg); if (err < 0) GOTO(out, err); out_detach: - LCFG_INIT(lcfg, LCFG_DETACH, name); - err = class_process_config(&lcfg); + lustre_cfg_bufs_reset(&bufs, name); + lcfg = lustre_cfg_new(LCFG_DETACH, &bufs); + err = class_process_config(lcfg); + lustre_cfg_free(lcfg); if (err < 0) GOTO(out, err); out_del_uuid: - LCFG_INIT(lcfg, LCFG_DEL_UUID, name); - lcfg.lcfg_inllen1 = strlen(peer) + 1; - lcfg.lcfg_inlbuf1 = peer; - err = class_process_config(&lcfg); + lustre_cfg_bufs_reset(&bufs, name); + lustre_cfg_bufs_set_string(&bufs, 1, peer); + lcfg = lustre_cfg_new(LCFG_DEL_UUID, &bufs); + err = class_process_config(lcfg); + lustre_cfg_free(lcfg); out_del_conn: if (lmd->lmd_nal == SOCKNAL || @@ -605,23 +646,27 @@ out: static void lustre_manual_cleanup(struct ll_sb_info *sbi) { - struct lustre_cfg lcfg; + struct lustre_cfg *lcfg; + struct lustre_cfg_bufs bufs; struct obd_device *obd; int next = 0; - while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, &next)) != NULL) - { + while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, &next)) != NULL){ int err; - LCFG_INIT(lcfg, LCFG_CLEANUP, obd->obd_name); - err = class_process_config(&lcfg); + /* the lcfg is almost the same for both ops */ + lustre_cfg_bufs_reset(&bufs, obd->obd_name); + lcfg = lustre_cfg_new(LCFG_CLEANUP, &bufs); + + err = class_process_config(lcfg); if (err) { CERROR("cleanup failed: %s\n", obd->obd_name); //continue; } - LCFG_INIT(lcfg, LCFG_DETACH, obd->obd_name); - err = class_process_config(&lcfg); + lcfg->lcfg_command = LCFG_DETACH; + err = class_process_config(lcfg); + lustre_cfg_free(lcfg); if (err) { CERROR("detach failed: %s\n", obd->obd_name); //continue; @@ -738,8 +783,7 @@ out_free: OBD_ALLOC(cln_prof, len); sprintf(cln_prof, "%s-clean", sbi->ll_lmd->lmd_profile); - err = lustre_process_log(sbi->ll_lmd, cln_prof, &cfg, - 0); + err = lustre_process_log(sbi->ll_lmd, cln_prof, &cfg,0); if (err < 0) { CERROR("Unable to process log: %s\n", cln_prof); lustre_manual_cleanup(sbi); @@ -770,6 +814,7 @@ void lustre_put_super(struct super_block *sb) lustre_common_put_super(sb); if (sbi->ll_lmd != NULL) { +#if 0 char * cln_prof; int len = strlen(sbi->ll_lmd->lmd_profile) + sizeof("-clean")+1; int err; @@ -796,6 +841,9 @@ void lustre_put_super(struct super_block *sb) OBD_FREE(cln_prof, len); free_lmd: +#else + lustre_manual_cleanup(sbi); +#endif OBD_FREE(sbi->ll_lmd, sizeof(*sbi->ll_lmd)); OBD_FREE(sbi->ll_instance, strlen(sbi->ll_instance) + 1); } diff --git a/lustre/lov/lov_obd.c b/lustre/lov/lov_obd.c index 
d9e52c3..98c2350 100644 --- a/lustre/lov/lov_obd.c +++ b/lustre/lov/lov_obd.c @@ -305,28 +305,42 @@ static int lov_setup(struct obd_device *obd, obd_count len, void *buf) int count; ENTRY; - if (lcfg->lcfg_inllen1 < 1) { + if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) { CERROR("LOV setup requires a descriptor\n"); RETURN(-EINVAL); } - if (lcfg->lcfg_inllen2 < 1) { + if (LUSTRE_CFG_BUFLEN(lcfg, 2) < 1) { CERROR("LOV setup requires an OST UUID list\n"); RETURN(-EINVAL); } - desc = (struct lov_desc *)lcfg->lcfg_inlbuf1; - if (sizeof(*desc) > lcfg->lcfg_inllen1) { + desc = (struct lov_desc *)lustre_cfg_buf(lcfg, 1); + if (sizeof(*desc) > LUSTRE_CFG_BUFLEN(lcfg, 1)) { CERROR("descriptor size wrong: %d > %d\n", - (int)sizeof(*desc), lcfg->lcfg_inllen1); + (int)sizeof(*desc), LUSTRE_CFG_BUFLEN(lcfg, 1)); RETURN(-EINVAL); } + if (desc->ld_magic != LOV_DESC_MAGIC) { + if (desc->ld_magic == __swab32(LOV_DESC_MAGIC)) { + CDEBUG(D_OTHER, "%s: Swabbing lov desc %p\n", + obd->obd_name, desc); + lustre_swab_lov_desc(desc); + } else { + CERROR("%s: Bad lov desc magic: %#x\n", + obd->obd_name, desc->ld_magic); + RETURN(-EINVAL); + } + } + + desc->ld_active_tgt_count = 0; count = desc->ld_tgt_count; - uuids = (struct obd_uuid *)lcfg->lcfg_inlbuf2; - if (sizeof(*uuids) * count != lcfg->lcfg_inllen2) { + uuids = (struct obd_uuid *)lustre_cfg_buf(lcfg, 2); + if (sizeof(*uuids) * count != LUSTRE_CFG_BUFLEN(lcfg, 2)) { CERROR("UUID array size wrong: %u * %u != %u\n", - (int)sizeof(*uuids), count, lcfg->lcfg_inllen2); + (int)sizeof(*uuids), count, + LUSTRE_CFG_BUFLEN(lcfg, 2)); RETURN(-EINVAL); } diff --git a/lustre/lov/lov_pack.c b/lustre/lov/lov_pack.c index 968de87..342ad47 100644 --- a/lustre/lov/lov_pack.c +++ b/lustre/lov/lov_pack.c @@ -343,9 +343,14 @@ int lov_setstripe(struct obd_export *exp, struct lov_stripe_md **lsmp, RETURN(-EFAULT); if (lum.lmm_magic != LOV_USER_MAGIC) { - CDEBUG(D_IOCTL, "bad userland LOV MAGIC: %#08x != %#08x\n", - lum.lmm_magic, LOV_USER_MAGIC); - RETURN(-EINVAL); + if (lum.lmm_magic == __swab32(LOV_USER_MAGIC)) { + lustre_swab_lov_user_md(&lum); + } else { + CDEBUG(D_IOCTL, "bad userland LOV MAGIC:" + " %#08x != %#08x\n", + lum.lmm_magic, LOV_USER_MAGIC); + RETURN(-EINVAL); + } } if (lum.lmm_pattern == 0) { diff --git a/lustre/lvfs/fsfilt_ext3.c b/lustre/lvfs/fsfilt_ext3.c index b9f17ef..c2198ba 100644 --- a/lustre/lvfs/fsfilt_ext3.c +++ b/lustre/lvfs/fsfilt_ext3.c @@ -654,6 +654,9 @@ static int fsfilt_ext3_sync(struct super_block *sb) #warning "kernel code has old extents/mballoc patch, disabling" #undef EXT3_MULTIBLOCK_ALLOCATOR #endif +#ifndef EXT3_EXTENTS_FL +#define EXT3_EXTENTS_FL 0x00080000 /* Inode uses extents */ +#endif #ifdef EXT3_MULTIBLOCK_ALLOCATOR #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) @@ -771,7 +774,7 @@ static int ext3_ext_new_extent_cb(struct ext3_extents_tree *tree, ext3_up_truncate_sem(inode); lock_24kernel(); - handle = journal_start(EXT3_JOURNAL(inode), count + EXT3_ALLOC_NEEDED + 1); + handle = journal_start(EXT3_JOURNAL(inode), count+EXT3_ALLOC_NEEDED+1); unlock_24kernel(); if (IS_ERR(handle)) { ext3_down_truncate_sem(inode); diff --git a/lustre/mdc/mdc_locks.c b/lustre/mdc/mdc_locks.c index ca1eb61..d6e7b50 100644 --- a/lustre/mdc/mdc_locks.c +++ b/lustre/mdc/mdc_locks.c @@ -186,6 +186,60 @@ static inline void mdc_clear_replay_flag(struct ptlrpc_request *req, int rc) } } +static int round_up(int val) +{ + int ret = 1; + while (val) { + val >>= 1; + ret <<= 1; + } + return ret; +} + +/* Save a large LOV EA into the request buffer so that it is available + 
* for replay. We don't do this in the initial request because the + * original request doesn't need this buffer (at most it sends just the + * lov_mds_md) and it is a waste of RAM/bandwidth to send the empty + * buffer and may also be difficult to allocate and save a very large + * request buffer for each open. (bug 5707) + * + * OOM here may cause recovery failure if lmm is needed (only for the + * original open if the MDS crashed just when this client also OOM'd) + * but this is incredibly unlikely, and questionable whether the client + * could do MDS recovery under OOM anyways... */ +static void mdc_realloc_openmsg(struct ptlrpc_request *req, + struct mds_body *body, int size[5]) +{ + int new_size, old_size; + struct lustre_msg *new_msg; + + /* save old size */ + old_size = lustre_msg_size(5, size); + + size[4] = body->eadatasize; + new_size = lustre_msg_size(5, size); + OBD_ALLOC(new_msg, new_size); + if (new_msg != NULL) { + struct lustre_msg *old_msg = req->rq_reqmsg; + long irqflags; + + DEBUG_REQ(D_INFO, req, "replace reqmsg for larger EA %u\n", + body->eadatasize); + memcpy(new_msg, old_msg, old_size); + new_msg->buflens[4] = body->eadatasize; + + spin_lock_irqsave(&req->rq_lock, irqflags); + req->rq_reqmsg = new_msg; + req->rq_reqlen = new_size; + spin_unlock_irqrestore(&req->rq_lock, irqflags); + + OBD_FREE(old_msg, old_size); + } else { + body->valid &= ~OBD_MD_FLEASIZE; + body->eadatasize = 0; + } +} + /* We always reserve enough space in the reply packet for a stripe MD, because * we don't know in advance the file type. */ int mdc_enqueue(struct obd_export *exp, @@ -204,7 +258,7 @@ int mdc_enqueue(struct obd_export *exp, struct obd_device *obddev = class_exp2obd(exp); struct ldlm_res_id res_id = { .name = {data->fid1.id, data->fid1.generation} }; - int size[6] = {sizeof(struct ldlm_request), sizeof(struct ldlm_intent)}; + int size[5] = {sizeof(struct ldlm_request), sizeof(struct ldlm_intent)}; int rc, flags = LDLM_FL_HAS_INTENT; int repsize[4] = {sizeof(struct ldlm_reply), sizeof(struct mds_body), @@ -227,8 +281,18 @@ int mdc_enqueue(struct obd_export *exp, size[2] = sizeof(struct mds_rec_create); size[3] = data->namelen + 1; - size[4] = obddev->u.cli.cl_max_mds_easize; - req = ptlrpc_prep_req(class_exp2cliimp(exp), LDLM_ENQUEUE, + /* As an optimization, we allocate an RPC request buffer for + * at least a default-sized LOV EA even if we aren't sending + * one. We grow the whole request to the next power-of-two + * size since we get that much from a slab allocation anyways. + * This avoids an allocation below in the common case where + * we need to save a default-sized LOV EA for open replay. */ + size[4] = max(lmmsize, obddev->u.cli.cl_default_mds_easize); + rc = lustre_msg_size(5, size); + if (rc & (rc - 1)) + size[4] = min(size[4] + round_up(rc) - rc, + obddev->u.cli.cl_max_mds_easize); + req = ptlrpc_prep_req(class_exp2cliimp(exp), LDLM_ENQUEUE, 5, size, NULL); if (!req) RETURN(-ENOMEM); @@ -367,7 +431,6 @@ int mdc_enqueue(struct obd_export *exp, } if ((body->valid & OBD_MD_FLEASIZE) != 0) { - void *replayea; /* The eadata is opaque; just check that it is * there. Eventually, obd_unpackmd() will check * the contents */ @@ -377,11 +440,19 @@ int mdc_enqueue(struct obd_export *exp, CERROR ("Missing/short eadata\n"); RETURN (-EPROTO); } + /* We save the reply LOV EA in case we have to replay + * a create for recovery. If we didn't allocate a + * large enough request buffer above we need to + * reallocate it here to hold the actual LOV EA. 
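The power-of-two growth above is easier to follow with numbers. Below is a standalone check of round_up() (same body as the helper added at the top of this file) under an assumed lustre_msg_size() result of 1000 bytes: 1000 & 999 is non-zero, round_up(1000) returns 1024, so size[4] grows by 24 bytes and the whole request lands exactly on a 1024-byte slab boundary, still capped by cl_max_mds_easize in the real code:

    #include <stdio.h>

    /* identical to the round_up() in mdc_locks.c: returns the next power
     * of two strictly greater than val */
    static int round_up(int val)
    {
            int ret = 1;
            while (val) {
                    val >>= 1;
                    ret <<= 1;
            }
            return ret;
    }

    int main(void)
    {
            int rc = 1000;              /* assumed message size in bytes */

            if (rc & (rc - 1))          /* skip sizes already a power of two */
                    printf("pad by %d to reach %d\n",
                           round_up(rc) - rc, round_up(rc));
            return 0;
    }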
*/ if (it->it_op & IT_OPEN) { - replayea = lustre_msg_buf(req->rq_reqmsg, 4, - obddev->u.cli.cl_max_mds_easize); - LASSERT(replayea); - memcpy(replayea, eadata, body->eadatasize); + if (req->rq_reqmsg->buflens[4] < + body->eadatasize) + mdc_realloc_openmsg(req, body, size); + + lmm = lustre_msg_buf(req->rq_reqmsg, 4, + body->eadatasize); + if (lmm) + memcpy(lmm, eadata, body->eadatasize); } } } diff --git a/lustre/mdc/mdc_request.c b/lustre/mdc/mdc_request.c index 64b589c..239e874 100644 --- a/lustre/mdc/mdc_request.c +++ b/lustre/mdc/mdc_request.c @@ -913,7 +913,7 @@ err_rpc_lock: RETURN(rc); } -/* Initialize the maximum LOV EA and cookie sizes. This allows +/* Initialize the default and maximum LOV EA and cookie sizes. This allows * us to make MDS RPCs with large enough reply buffers to hold the * maximum-sized (= maximum striped) EA and cookie without having to * calculate this (via a call into the LOV + OSCs) each time we make an RPC. */ @@ -921,21 +921,29 @@ int mdc_init_ea_size(struct obd_export *mdc_exp, struct obd_export *lov_exp) { struct obd_device *obd = mdc_exp->exp_obd; struct client_obd *cli = &obd->u.cli; - struct lov_stripe_md lsm = { .lsm_magic = LOV_MAGIC }; struct lov_desc desc; __u32 valsize = sizeof(desc); int rc, size; ENTRY; + size = obd_size_diskmd(lov_exp, NULL); + if (cli->cl_max_mds_easize < size) + cli->cl_max_mds_easize = size; + rc = obd_get_info(lov_exp, strlen("lovdesc") + 1, "lovdesc", &valsize, &desc); - if (rc < 0) + if (rc) RETURN(rc); - lsm.lsm_stripe_count = desc.ld_tgt_count; - size = obd_size_diskmd(lov_exp, &lsm); - if (cli->cl_max_mds_easize < size) - cli->cl_max_mds_easize = size; + /* If default_stripe_count is zero we stripe over all OSTs */ + if (desc.ld_default_stripe_count != 0) { + struct lov_stripe_md lsm = { .lsm_magic = LOV_MAGIC, + .lsm_stripe_count = + desc.ld_default_stripe_count }; + size = obd_size_diskmd(lov_exp, &lsm); + } + if (cli->cl_default_mds_easize < size) + cli->cl_default_mds_easize = size; size = desc.ld_tgt_count * sizeof(struct llog_cookie); if (cli->cl_max_mds_cookiesize < size) diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c index 5dcdadb..2ebe618c 100644 --- a/lustre/mds/handler.c +++ b/lustre/mds/handler.c @@ -1386,10 +1386,13 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf) int rc = 0; ENTRY; - if (!lcfg->lcfg_inlbuf1 || !lcfg->lcfg_inlbuf2) + if (lcfg->lcfg_bufcount < 3) RETURN(rc = -EINVAL); - obd->obd_fsops = fsfilt_get_ops(lcfg->lcfg_inlbuf2); + if (LUSTRE_CFG_BUFLEN(lcfg, 1) == 0 || LUSTRE_CFG_BUFLEN(lcfg, 2) == 0) + RETURN(rc = -EINVAL); + + obd->obd_fsops = fsfilt_get_ops(lustre_cfg_string(lcfg, 2)); if (IS_ERR(obd->obd_fsops)) RETURN(rc = PTR_ERR(obd->obd_fsops)); @@ -1399,18 +1402,18 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf) options = (char *)page; memset(options, 0, PAGE_SIZE); - + /* here we use "iopen_nopriv" hardcoded, because it affects MDS utility * and the rest of options are passed by mount options. Probably this * should be moved to somewhere else like startup scripts or lconf. 
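The mds_setup() conversion below swaps the old fixed lcfg_inlbuf*/lcfg_inllen* pairs for indexed accessors. The idiom relied on throughout is that LUSTRE_CFG_BUFLEN() evaluates to 0 for an absent or out-of-range buffer, so a single length check guards the dereference. A hypothetical helper, written only to condense the pattern (cfg_string_or() does not exist in the patch):

    static char *cfg_string_or(struct lustre_cfg *lcfg, int idx, char *dflt)
    {
            if (LUSTRE_CFG_BUFLEN(lcfg, idx) == 0)
                    return dflt;                 /* buffer absent or empty */
            return lustre_cfg_string(lcfg, idx); /* payload as a string */
    }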
*/ sprintf(options, "iopen_nopriv"); - if (lcfg->lcfg_inllen4 > 0 && lcfg->lcfg_inlbuf4) + if (LUSTRE_CFG_BUFLEN(lcfg, 4) > 0 && lustre_cfg_buf(lcfg, 4)) sprintf(options + strlen(options), ",%s", - lcfg->lcfg_inlbuf4); + lustre_cfg_string(lcfg, 4)); - mnt = do_kern_mount(lcfg->lcfg_inlbuf2, 0, - lcfg->lcfg_inlbuf1, (void *)options); + mnt = do_kern_mount(lustre_cfg_string(lcfg, 2), 0, + lustre_cfg_string(lcfg, 1), (void *)options); free_page(page); if (IS_ERR(mnt)) { rc = PTR_ERR(mnt); @@ -1418,8 +1421,8 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf) GOTO(err_ops, rc); } - CDEBUG(D_SUPER, "%s: mnt = %p\n", lcfg->lcfg_inlbuf1, mnt); - + CDEBUG(D_SUPER, "%s: mnt = %p\n", lustre_cfg_string(lcfg, 1), mnt); + LASSERT(!ll_check_rdonly(ll_sbdev(mnt->mnt_sb))); sema_init(&mds->mds_orphan_recovery_sem, 1); @@ -1446,18 +1449,18 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf) if (rc < 0) GOTO(err_fs, rc); - if (lcfg->lcfg_inllen3 > 0 && lcfg->lcfg_inlbuf3) { + if (lcfg->lcfg_bufcount >= 4 && LUSTRE_CFG_BUFLEN(lcfg, 3) > 0) { class_uuid_t uuid; generate_random_uuid(uuid); class_uuid_unparse(uuid, &mds->mds_lov_uuid); - OBD_ALLOC(mds->mds_profile, lcfg->lcfg_inllen3); + OBD_ALLOC(mds->mds_profile, LUSTRE_CFG_BUFLEN(lcfg, 3)); if (mds->mds_profile == NULL) GOTO(err_fs, rc = -ENOMEM); - memcpy(mds->mds_profile, lcfg->lcfg_inlbuf3, - lcfg->lcfg_inllen3); + strncpy(mds->mds_profile, lustre_cfg_string(lcfg, 3), + LUSTRE_CFG_BUFLEN(lcfg, 3)); } @@ -1480,7 +1483,7 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf) "Recovery progress can be monitored by watching " "/proc/fs/lustre/mds/%s/recovery_status.\n", obd->obd_name, - lcfg->lcfg_inlbuf1, + lustre_cfg_string(lcfg, 1), obd->obd_recoverable_clients, (obd->obd_recoverable_clients == 1) ? "client" : "clients", @@ -1489,7 +1492,8 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf) obd->obd_name); } else { LCONSOLE_INFO("MDT %s now serving %s with recovery %s.\n", - obd->obd_name, lcfg->lcfg_inlbuf1, + obd->obd_name, + lustre_cfg_string(lcfg, 1), obd->obd_replayable ? 
"enabled" : "disabled"); } diff --git a/lustre/mds/mds_lov.c b/lustre/mds/mds_lov.c index 92eac6b..7641aa3 100644 --- a/lustre/mds/mds_lov.c +++ b/lustre/mds/mds_lov.c @@ -38,22 +38,6 @@ #include "mds_internal.h" -void le_lov_desc_to_cpu (struct lov_desc *ld) -{ - ld->ld_tgt_count = le32_to_cpu (ld->ld_tgt_count); - ld->ld_default_stripe_count = le32_to_cpu (ld->ld_default_stripe_count); - ld->ld_default_stripe_size = le32_to_cpu (ld->ld_default_stripe_size); - ld->ld_pattern = le32_to_cpu (ld->ld_pattern); -} - -void cpu_to_le_lov_desc (struct lov_desc *ld) -{ - ld->ld_tgt_count = cpu_to_le32 (ld->ld_tgt_count); - ld->ld_default_stripe_count = cpu_to_le32 (ld->ld_default_stripe_count); - ld->ld_default_stripe_size = cpu_to_le32 (ld->ld_default_stripe_size); - ld->ld_pattern = cpu_to_le32 (ld->ld_pattern); -} - void mds_lov_update_objids(struct obd_device *obd, obd_id *ids) { struct mds_obd *mds = &obd->u.mds; @@ -323,6 +307,9 @@ int mds_iocontrol(unsigned int cmd, struct obd_export *exp, int len, struct obd_run_ctxt saved; int rc = 0; + ENTRY; + CDEBUG(D_IOCTL, "handling ioctl cmd %#x\n", cmd); + switch (cmd) { case OBD_IOC_RECORD: { char *name = data->ioc_inlbuf1; diff --git a/lustre/obdclass/Makefile.in b/lustre/obdclass/Makefile.in index d092fc5..f761599 100644 --- a/lustre/obdclass/Makefile.in +++ b/lustre/obdclass/Makefile.in @@ -1,6 +1,7 @@ MODULES := obdclass llog_test -obdclass-objs := llog.o llog_cat.o llog_lvfs.o llog_obd.o class_obd.o +obdclass-objs := llog.o llog_cat.o llog_lvfs.o llog_obd.o llog_swab.o +obdclass-objs += class_obd.o obdclass-objs += debug.o genops.o sysctl.o uuid.o llog_ioctl.o obdclass-objs += lprocfs_status.o lustre_handles.o lustre_peer.o obdclass-objs += statfs_pack.o obdo.o obd_config.o diff --git a/lustre/obdclass/autoMakefile.am b/lustre/obdclass/autoMakefile.am index 79ba5ac..8d670dc 100644 --- a/lustre/obdclass/autoMakefile.am +++ b/lustre/obdclass/autoMakefile.am @@ -3,8 +3,8 @@ if LIBLUSTRE noinst_LIBRARIES = liblustreclass.a liblustreclass_a_SOURCES = class_obd.c debug.c genops.c statfs_pack.c uuid.c liblustreclass_a_SOURCES += lustre_handles.c lustre_peer.c lprocfs_status.c -liblustreclass_a_SOURCES += obdo.c obd_config.c llog.c llog_obd.c llog_cat.c -liblustreclass_a_SOURCES += llog_lvfs.c #llog_ioctl.c rbtree.c +liblustreclass_a_SOURCES += obdo.c obd_config.c llog.c llog_obd.c llog_cat.c +liblustreclass_a_SOURCES += llog_lvfs.c llog_swab.c #llog_ioctl.c rbtree.c liblustreclass_a_CPPFLAGS = $(LLCPPFLAGS) -DLUSTRE_VERSION=\"32\" -DBUILD_VERSION=\"1\" liblustreclass_a_CFLAGS = $(LLCFLAGS) diff --git a/lustre/obdclass/class_obd.c b/lustre/obdclass/class_obd.c index 150948c..5173165 100644 --- a/lustre/obdclass/class_obd.c +++ b/lustre/obdclass/class_obd.c @@ -194,21 +194,25 @@ int class_handle_ioctl(unsigned int cmd, unsigned long arg) switch (cmd) { case OBD_IOC_PROCESS_CFG: { - char *buf; struct lustre_cfg *lcfg; if (!data->ioc_plen1 || !data->ioc_pbuf1) { CERROR("No config buffer passed!\n"); GOTO(out, err = -EINVAL); } - err = lustre_cfg_getdata(&buf, data->ioc_plen1, - data->ioc_pbuf1, 0); + + err = lustre_cfg_sanity_check(data->ioc_pbuf1, + data->ioc_plen1); + if (err) + GOTO(out, err); + + OBD_ALLOC(lcfg, data->ioc_plen1); + err = copy_from_user(lcfg, data->ioc_pbuf1, data->ioc_plen1); if (err) GOTO(out, err); - lcfg = (struct lustre_cfg* ) buf; err = class_process_config(lcfg); - lustre_cfg_freedata(buf, data->ioc_plen1); + OBD_FREE(lcfg, data->ioc_plen1); GOTO(out, err); } @@ -541,6 +545,76 @@ struct file_operations obd_device_list_fops = { 
}; #endif +#define OBD_INIT_CHECK +#ifdef OBD_INIT_CHECK +int obd_init_checks(void) +{ + long long llval; + __u64 u64val; + char buf[64]; + int len, ret = 0; + + CDEBUG(D_INFO, "LPU64=%s, LPD64=%s, LPX64=%s, LPSZ=%s, LPSSZ=%s\n", + LPU64, LPD64, LPX64, LPSZ, LPSSZ); + + CDEBUG(D_INFO, "OBD_OBJECT_EOF = "LPX64"\n", OBD_OBJECT_EOF); + + llval = OBD_OBJECT_EOF; + CDEBUG(D_INFO, "llval OBD_OBJECT_EOF = "LPX64"\n", llval); + if (llval != OBD_OBJECT_EOF) { + CDEBUG(D_ERROR, "long long "LPX64"(%d) != 0xffffffffffffffff\n", + llval, sizeof(llval)); + ret = -EINVAL; + } + len = snprintf(buf, sizeof(buf), LPX64, llval); + if (len != 18) { + CDEBUG(D_WARNING, "LPX64 wrong length! strlen(%s)=%d != 18\n", + buf, len); + ret = -EINVAL; + } + + u64val = OBD_OBJECT_EOF; + CDEBUG(D_INFO, "u64val OBD_OBJECT_EOF = "LPX64"\n", u64val); + if (u64val != OBD_OBJECT_EOF) { + CDEBUG(D_ERROR, "__u64 "LPX64"(%d) != 0xffffffffffffffff\n", + u64val, sizeof(u64val)); + ret = -EINVAL; + } + if (u64val >> 8 != OBD_OBJECT_EOF >> 8) { + CDEBUG(D_ERROR, "__u64 "LPX64"(%d) != 0xffffffffffffffff\n", + u64val, sizeof(u64val)); + ret = -EINVAL; + } + len = snprintf(buf, sizeof(buf), LPX64, u64val); + if (len != 18) { + CDEBUG(D_WARNING, "LPX64 wrong length! strlen(%s)=%d != 18\n", + buf, len); + ret = -EINVAL; + } + len = snprintf(buf, sizeof(buf), LPU64, u64val); + if (len != 20) { + CDEBUG(D_WARNING, "LPU64 wrong length! strlen(%s)=%d != 20\n", + buf, len); + ret = -EINVAL; + } + len = snprintf(buf, sizeof(buf), LPD64, u64val); + if (len != 2) { + CDEBUG(D_WARNING, "LPD64 wrong length! strlen(%s)=%d != 2\n", + buf, len); + ret = -EINVAL; + } + if ((u64val & ~PAGE_MASK) >= PAGE_SIZE) { + CDEBUG(D_WARNING, "mask failed: u64val "LPU64" >= %lu\n", + u64val, PAGE_SIZE); + ret = -EINVAL; + } + + return ret; +} +#else +#define obd_init_checks() do {} while(0) +#endif + #ifdef __KERNEL__ static int __init init_obdclass(void) #else @@ -582,6 +656,7 @@ int init_obdclass(void) #ifdef __KERNEL__ obd_sysctl_init(); #endif + obd_init_checks(); #ifdef LPROCFS proc_lustre_root = proc_mkdir("lustre", proc_root_fs); diff --git a/lustre/obdclass/genops.c b/lustre/obdclass/genops.c index 4cb163d..bea932d 100644 --- a/lustre/obdclass/genops.c +++ b/lustre/obdclass/genops.c @@ -311,7 +311,7 @@ struct obd_device * class_find_client_obd(struct obd_uuid *tgt_uuid, /* Iterate the obd_device list looking devices have grp_uuid. Start searching at *next, and if a device is found, the next index to look - it is saved in *next. If next is NULL, then the first matching device + at is saved in *next. If next is NULL, then the first matching device will always be returned. 
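For reference, a user-space analogue of the obd_init_checks() hunk above, assuming C99 <inttypes.h> formats in place of Lustre's LPX64/LPU64/LPD64 macros; all demo_* names are illustrative, not Lustre's:

/* Standalone sketch, not Lustre code: check that a 64-bit sentinel
 * survives assignment and that its printf formats produce the widths
 * the kernel-side checks above expect. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_OBJECT_EOF 0xffffffffffffffffULL

int demo_init_checks(void)
{
        uint64_t u64val = DEMO_OBJECT_EOF;
        char buf[64];
        int len, ret = 0;

        if (u64val != DEMO_OBJECT_EOF)
                ret = -1;       /* 64-bit value was truncated somewhere */

        /* "0x" plus 16 hex digits = 18 characters */
        len = snprintf(buf, sizeof(buf), "%#" PRIx64, u64val);
        if (len != 18)
                ret = -1;

        /* 2^64 - 1 has 20 decimal digits */
        len = snprintf(buf, sizeof(buf), "%" PRIu64, u64val);
        if (len != 20)
                ret = -1;

        /* printed as signed, all-ones is just "-1", hence the length-2 check */
        len = snprintf(buf, sizeof(buf), "%" PRId64, (int64_t)u64val);
        if (len != 2)
                ret = -1;

        return ret;
}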
*/ struct obd_device * class_devices_in_group(struct obd_uuid *grp_uuid, int *next) { diff --git a/lustre/obdclass/llog.c b/lustre/obdclass/llog.c index 6d76716..f39d614 100644 --- a/lustre/obdclass/llog.c +++ b/lustre/obdclass/llog.c @@ -166,11 +166,13 @@ out: if (flags & LLOG_F_IS_CAT) { INIT_LIST_HEAD(&handle->u.chd.chd_head); llh->llh_size = sizeof(struct llog_logid_rec); - } - else if (flags & LLOG_F_IS_PLAIN) + } else if (flags & LLOG_F_IS_PLAIN) { INIT_LIST_HEAD(&handle->u.phd.phd_entry); - else + } else { + CERROR("Unknown flags: %#x (expected %#x or %#x)\n", + flags, LLOG_F_IS_CAT, LLOG_F_IS_PLAIN); + LBUG(); + } if (rc) { OBD_FREE(llh, sizeof(*llh)); @@ -203,9 +205,9 @@ int llog_process(struct llog_handle *loghandle, llog_cb_t cb, { struct llog_log_hdr *llh = loghandle->lgh_hdr; struct llog_process_cat_data *cd = catdata; - void *buf; + char *buf; __u64 cur_offset = LLOG_CHUNK_SIZE; - int rc = 0, index = 1, last_index, idx; + int rc = 0, index = 1, last_index; int saved_index = 0; ENTRY; @@ -232,6 +234,9 @@ int llog_process(struct llog_handle *loghandle, llog_cb_t cb, if (index == last_index + 1) break; + CDEBUG(D_OTHER, "index: %d last_index %d\n", + index, last_index); + /* get the buf with our target record; avoid old garbage */ memset(buf, 0, LLOG_CHUNK_SIZE); rc = llog_next_block(loghandle, &saved_index, index, @@ -239,21 +244,36 @@ int llog_process(struct llog_handle *loghandle, llog_cb_t cb, if (rc) GOTO(out, rc); - rec = buf; - idx = rec->lrh_index; - if (idx < index) - CDEBUG(D_HA, "index %u : idx %u\n", index, idx); - while (idx < index) { - rec = (struct llog_rec_hdr *) - ((char *)rec + rec->lrh_len); - idx ++; - } + /* NB: when rec->lrh_len is accessed it is already swabbed + * since it is used at the "end" of the loop and the rec + * swabbing is done at the beginning of the loop. */ + for (rec = (struct llog_rec_hdr *)buf; + (char *)rec < buf + LLOG_CHUNK_SIZE; + rec = (struct llog_rec_hdr *)((char *)rec + rec->lrh_len)) { + + CDEBUG(D_OTHER, "processing rec 0x%p type %#x\n", + rec, rec->lrh_type); + + if (LLOG_REC_HDR_NEEDS_SWABBING(rec)) + lustre_swab_llog_rec(rec, NULL); + + CDEBUG(D_OTHER, "after swabbing, type: %#x\n", + rec->lrh_type); - /* process records in buffer, starting where we found one */ - while ((void *)rec < buf + LLOG_CHUNK_SIZE) { if (rec->lrh_index == 0) GOTO(out, 0); /* no more records */ + if (rec->lrh_index < index) { + CDEBUG(D_OTHER, "skipping lrh_index %d\n", + rec->lrh_index); + continue; + } + + CDEBUG(D_OTHER, + "lrh_index: %d lrh_len: %d (%d remains)\n", + rec->lrh_index, rec->lrh_len, + (int)(buf + LLOG_CHUNK_SIZE - (char *)rec)); + /* if set, process the callback on this record */ if (ext2_test_bit(index, llh->llh_bitmap)) { rc = cb(loghandle, rec, data); @@ -266,14 +286,14 @@ int llog_process(struct llog_handle *loghandle, llog_cb_t cb, } if (rc) GOTO(out, rc); + } else { + CDEBUG(D_OTHER, "Skipped index %d\n", index); } /* next record, still in buffer? 
*/ ++index; if (index > last_index) GOTO(out, rc = 0); - rec = (struct llog_rec_hdr *) - ((char *)rec + rec->lrh_len); } } diff --git a/lustre/obdclass/llog_lvfs.c b/lustre/obdclass/llog_lvfs.c index 40d0753..2cae0a7 100644 --- a/lustre/obdclass/llog_lvfs.c +++ b/lustre/obdclass/llog_lvfs.c @@ -61,7 +61,7 @@ static int llog_lvfs_pad(struct obd_device *obd, struct l_file *file, tail.lrt_len = rec.lrh_len = len; tail.lrt_index = rec.lrh_index = index; - rec.lrh_type = 0; + rec.lrh_type = LLOG_PAD_MAGIC; rc = fsfilt_write_record(obd, file, &rec, sizeof(rec), &file->f_pos, 0); if (rc) { @@ -168,9 +168,10 @@ static int llog_lvfs_read_header(struct llog_handle *handle) handle->lgh_file->f_dentry->d_name.name); } else { struct llog_rec_hdr *llh_hdr = &handle->lgh_hdr->llh_hdr; - /* - * These need to be fixed for bug 1987 - */ + + if (LLOG_REC_HDR_NEEDS_SWABBING(llh_hdr)) + lustre_swab_llog_hdr(handle->lgh_hdr); + if (llh_hdr->lrh_type != LLOG_HDR_MAGIC) { CERROR("bad log %.*s header magic: %#x (expected %#x)\n", handle->lgh_file->f_dentry->d_name.len, @@ -387,7 +388,13 @@ static int llog_lvfs_next_block(struct llog_handle *loghandle, int *cur_idx, RETURN(-EINVAL); } - tail = buf + rc - sizeof(struct llog_rec_tail); + rec = buf; + tail = (struct llog_rec_tail *)((char *)buf + rc - sizeof(struct llog_rec_tail)); + + if (LLOG_REC_HDR_NEEDS_SWABBING(rec)) { + lustre_swab_llog_rec(rec, tail); + } + *cur_idx = tail->lrt_index; /* this shouldn't happen */ @@ -402,7 +409,6 @@ static int llog_lvfs_next_block(struct llog_handle *loghandle, int *cur_idx, /* sanity check that the start of the new buffer is no farther * than the record that we wanted. This shouldn't happen. */ - rec = buf; if (rec->lrh_index > next_idx) { CERROR("missed desired record? %u > %u\n", rec->lrh_index, next_idx); diff --git a/lustre/obdclass/llog_swab.c b/lustre/obdclass/llog_swab.c new file mode 100644 index 0000000..3a2ae51 --- /dev/null +++ b/lustre/obdclass/llog_swab.c @@ -0,0 +1,300 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2004-2005 Cluster File Systems, Inc. + * Author: jacob berkman + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Swabbing of llog datatypes (from disk or over the wire). 
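The pattern this new file implements is detect-then-swab: records are stored in the writer's native byte order, and the reader byte-swaps only when a known magic value reads back reversed. A simplified sketch of that idea, with stand-in types and a made-up magic rather than the real Lustre definitions:

/* Illustrative only: demo_* names are not part of Lustre. */
#include <stdint.h>

#define DEMO_REC_MAGIC 0x10600000u      /* stand-in record magic */

struct demo_rec_hdr {
        uint32_t lrh_len;
        uint32_t lrh_index;
        uint32_t lrh_type;
};

static uint32_t demo_swab32(uint32_t v)
{
        return (v >> 24) | ((v >> 8) & 0x0000ff00u) |
               ((v << 8) & 0x00ff0000u) | (v << 24);
}

/* analogue of LLOG_REC_HDR_NEEDS_SWABBING: magic appears byte-reversed */
static int demo_rec_needs_swabbing(const struct demo_rec_hdr *rec)
{
        return rec->lrh_type == demo_swab32(DEMO_REC_MAGIC);
}

static void demo_swab_rec_hdr(struct demo_rec_hdr *rec)
{
        rec->lrh_len = demo_swab32(rec->lrh_len);
        rec->lrh_index = demo_swab32(rec->lrh_index);
        rec->lrh_type = demo_swab32(rec->lrh_type);
}

This detection only works when lrh_type carries a recognizable magic, which is presumably why llog_lvfs_pad() above now stamps pad records with LLOG_PAD_MAGIC instead of type 0, and why old type-0 records get a dedicated (ignored) case in lustre_swab_llog_rec().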
+ * + */ + +#define DEBUG_SUBSYSTEM S_LOG + +#include + +static void print_llogd_body(struct llogd_body *d) +{ + CDEBUG(D_OTHER, "llogd body: %p\n", d); + CDEBUG(D_OTHER, "\tlgd_logid.lgl_oid: "LPX64"\n", d->lgd_logid.lgl_oid); + CDEBUG(D_OTHER, "\tlgd_logid.lgl_ogr: "LPX64"\n", d->lgd_logid.lgl_ogr); + CDEBUG(D_OTHER, "\tlgd_logid.lgl_ogen: %#x\n", d->lgd_logid.lgl_ogen); + CDEBUG(D_OTHER, "\tlgd_ctxt_idx: %#x\n", d->lgd_ctxt_idx); + CDEBUG(D_OTHER, "\tlgd_llh_flags: %#x\n", d->lgd_llh_flags); + CDEBUG(D_OTHER, "\tlgd_index: %#x\n", d->lgd_index); + CDEBUG(D_OTHER, "\tlgd_saved_index: %#x\n", d->lgd_saved_index); + CDEBUG(D_OTHER, "\tlgd_len: %#x\n", d->lgd_len); + CDEBUG(D_OTHER, "\tlgd_cur_offset: "LPX64"\n", d->lgd_cur_offset); +} + +void lustre_swab_llogd_body (struct llogd_body *d) +{ + ENTRY; + print_llogd_body(d); + __swab64s (&d->lgd_logid.lgl_oid); + __swab64s (&d->lgd_logid.lgl_ogr); + __swab32s (&d->lgd_logid.lgl_ogen); + __swab32s (&d->lgd_ctxt_idx); + __swab32s (&d->lgd_llh_flags); + __swab32s (&d->lgd_index); + __swab32s (&d->lgd_saved_index); + __swab32s (&d->lgd_len); + __swab64s (&d->lgd_cur_offset); + print_llogd_body(d); + EXIT; +} +EXPORT_SYMBOL(lustre_swab_llogd_body); + +void lustre_swab_llogd_conn_body (struct llogd_conn_body *d) +{ + __swab64s (&d->lgdc_gen.mnt_cnt); + __swab64s (&d->lgdc_gen.conn_cnt); + __swab64s (&d->lgdc_logid.lgl_oid); + __swab64s (&d->lgdc_logid.lgl_ogr); + __swab32s (&d->lgdc_logid.lgl_ogen); + __swab32s (&d->lgdc_ctxt_idx); +} +EXPORT_SYMBOL(lustre_swab_llogd_conn_body); + +void lustre_swab_ll_fid (struct ll_fid *fid) +{ + __swab64s (&fid->id); + __swab32s (&fid->generation); + __swab32s (&fid->f_type); +} +EXPORT_SYMBOL(lustre_swab_ll_fid); + +void lustre_swab_llog_rec(struct llog_rec_hdr *rec, struct llog_rec_tail *tail) +{ + __swab32s(&rec->lrh_len); + __swab32s(&rec->lrh_index); + __swab32s(&rec->lrh_type); + + switch (rec->lrh_type) { + case OST_SZ_REC: { + struct llog_size_change_rec *lsc = + (struct llog_size_change_rec *)rec; + + lustre_swab_ll_fid(&lsc->lsc_fid); + __swab32s(&lsc->lsc_io_epoch); + + break; + } + + case OST_RAID1_REC: + break; + + case MDS_UNLINK_REC: { + struct llog_unlink_rec *lur = (struct llog_unlink_rec *)rec; + + __swab64s(&lur->lur_oid); + __swab32s(&lur->lur_ogen); + + break; + } + + case OBD_CFG_REC: + case PTL_CFG_REC: + /* these are swabbed as they are consumed */ + break; + + case LLOG_HDR_MAGIC: { + struct llog_log_hdr *llh = (struct llog_log_hdr *)rec; + + __swab64s(&llh->llh_timestamp); + __swab32s(&llh->llh_count); + __swab32s(&llh->llh_bitmap_offset); + __swab32s(&llh->llh_flags); + __swab32s(&llh->llh_size); + __swab32s(&llh->llh_cat_idx); + if (tail != &llh->llh_tail) { + __swab32s(&llh->llh_tail.lrt_index); + __swab32s(&llh->llh_tail.lrt_len); + } + + break; + } + + case LLOG_LOGID_MAGIC: { + struct llog_logid_rec *lid = (struct llog_logid_rec *)rec; + + __swab64s(&lid->lid_id.lgl_oid); + __swab64s(&lid->lid_id.lgl_ogr); + __swab32s(&lid->lid_id.lgl_ogen); + break; + } + + /* ignore old pad records of type 0 */ + case 0: + break; + + default: + CERROR("Unknown llog rec type %#x swabbing rec %p\n", + rec->lrh_type, rec); + } + + if (tail) { + __swab32s(&tail->lrt_len); + __swab32s(&tail->lrt_index); + } +} +EXPORT_SYMBOL(lustre_swab_llog_rec); + +static void print_llog_hdr(struct llog_log_hdr *h) +{ + CDEBUG(D_OTHER, "llog header: %p\n", h); + CDEBUG(D_OTHER, "\tllh_hdr.lrh_index: %#x\n", h->llh_hdr.lrh_index); + CDEBUG(D_OTHER, "\tllh_hdr.lrh_len: %#x\n", h->llh_hdr.lrh_len); + CDEBUG(D_OTHER, 
"\tllh_hdr.lrh_type: %#x\n", h->llh_hdr.lrh_type); + CDEBUG(D_OTHER, "\tllh_timestamp: "LPX64"\n", h->llh_timestamp); + CDEBUG(D_OTHER, "\tllh_count: %#x\n", h->llh_count); + CDEBUG(D_OTHER, "\tllh_bitmap_offset: %#x\n", h->llh_bitmap_offset); + CDEBUG(D_OTHER, "\tllh_flags: %#x\n", h->llh_flags); + CDEBUG(D_OTHER, "\tllh_size: %#x\n", h->llh_size); + CDEBUG(D_OTHER, "\tllh_cat_idx: %#x\n", h->llh_cat_idx); + CDEBUG(D_OTHER, "\tllh_tail.lrt_index: %#x\n", h->llh_tail.lrt_index); + CDEBUG(D_OTHER, "\tllh_tail.lrt_len: %#x\n", h->llh_tail.lrt_len); +} + +void lustre_swab_llog_hdr (struct llog_log_hdr *h) +{ + ENTRY; + print_llog_hdr(h); + + lustre_swab_llog_rec(&h->llh_hdr, &h->llh_tail); + + print_llog_hdr(h); + EXIT; +} +EXPORT_SYMBOL(lustre_swab_llog_hdr); + +#define PRINT_PCFG32(x) CDEBUG(D_OTHER, "\tpcfg->pcfg_"#x": %#x\n", pcfg->pcfg_##x) +#define PRINT_PCFG64(x) CDEBUG(D_OTHER, "\tpcfg->pcfg_"#x": "LPX64"\n", pcfg->pcfg_##x) + +static void print_portals_cfg(struct portals_cfg *pcfg) +{ + ENTRY; + + if (!(portal_debug & D_OTHER)) /* don't loop on nothing */ + return; + CDEBUG(D_OTHER, "portals_cfg: %p\n", pcfg); + PRINT_PCFG32(version); + PRINT_PCFG32(command); + + PRINT_PCFG32(nal); + PRINT_PCFG32(flags); + + PRINT_PCFG32(gw_nal); + PRINT_PCFG64(nid); + PRINT_PCFG64(nid2); + PRINT_PCFG64(nid3); + PRINT_PCFG32(id); + PRINT_PCFG32(misc); + PRINT_PCFG32(fd); + PRINT_PCFG32(count); + PRINT_PCFG32(size); + PRINT_PCFG32(wait); + + PRINT_PCFG32(plen1); + PRINT_PCFG32(plen2); + + EXIT; +} + +void lustre_swab_portals_cfg(struct portals_cfg *pcfg) +{ + ENTRY; + + __swab32s(&pcfg->pcfg_version); + __swab32s(&pcfg->pcfg_command); + + __swab32s(&pcfg->pcfg_nal); + __swab32s(&pcfg->pcfg_flags); + + __swab32s(&pcfg->pcfg_gw_nal); + __swab64s(&pcfg->pcfg_nid); + __swab64s(&pcfg->pcfg_nid2); + __swab64s(&pcfg->pcfg_nid3); + __swab32s(&pcfg->pcfg_id); + __swab32s(&pcfg->pcfg_misc); + __swab32s(&pcfg->pcfg_fd); + __swab32s(&pcfg->pcfg_count); + __swab32s(&pcfg->pcfg_size); + __swab32s(&pcfg->pcfg_wait); + + __swab32s(&pcfg->pcfg_plen1); + __swab32s(&pcfg->pcfg_plen2); + + print_portals_cfg(pcfg); + EXIT; +} +EXPORT_SYMBOL(lustre_swab_portals_cfg); + +static void print_lustre_cfg(struct lustre_cfg *lcfg) +{ + int i; + ENTRY; + + if (!(portal_debug & D_OTHER)) /* don't loop on nothing */ + return; + CDEBUG(D_OTHER, "lustre_cfg: %p\n", lcfg); + CDEBUG(D_OTHER, "\tlcfg->lcfg_version: %#x\n", lcfg->lcfg_version); + + CDEBUG(D_OTHER, "\tlcfg->lcfg_command: %#x\n", lcfg->lcfg_command); + CDEBUG(D_OTHER, "\tlcfg->lcfg_num: %#x\n", lcfg->lcfg_num); + CDEBUG(D_OTHER, "\tlcfg->lcfg_flags: %#x\n", lcfg->lcfg_flags); + CDEBUG(D_OTHER, "\tlcfg->lcfg_nid: "LPX64"\n", lcfg->lcfg_nid); + CDEBUG(D_OTHER, "\tlcfg->lcfg_nal: %#x\n", lcfg->lcfg_nal); + + CDEBUG(D_OTHER, "\tlcfg->lcfg_bufcount: %d\n", lcfg->lcfg_bufcount); + if (lcfg->lcfg_bufcount < LUSTRE_CFG_MAX_BUFCOUNT) + for (i = 0; i < lcfg->lcfg_bufcount; i++) + CDEBUG(D_OTHER, "\tlcfg->lcfg_buflens[%d]: %d\n", + i, lcfg->lcfg_buflens[i]); + EXIT; +} + +void lustre_swab_lustre_cfg(struct lustre_cfg *lcfg) +{ + int i; + ENTRY; + + __swab32s(&lcfg->lcfg_version); + + if (lcfg->lcfg_version != LUSTRE_CFG_VERSION) { + CERROR("not swabbing lustre_cfg version %#x (expecting %#x)\n", + lcfg->lcfg_version, LUSTRE_CFG_VERSION); + EXIT; + return; + } + + __swab32s(&lcfg->lcfg_command); + + __swab32s(&lcfg->lcfg_num); + __swab32s(&lcfg->lcfg_flags); + __swab64s(&lcfg->lcfg_nid); + __swab32s(&lcfg->lcfg_nal); + + __swab32s(&lcfg->lcfg_bufcount); + for (i = 0; i < 
lcfg->lcfg_bufcount && i < LUSTRE_CFG_MAX_BUFCOUNT; i++) + __swab32s(&lcfg->lcfg_buflens[i]); + + print_lustre_cfg(lcfg); + EXIT; + return; +} +EXPORT_SYMBOL(lustre_swab_lustre_cfg); diff --git a/lustre/obdclass/llog_test.c b/lustre/obdclass/llog_test.c index 5c8af06..7d851fe 100644 --- a/lustre/obdclass/llog_test.c +++ b/lustre/obdclass/llog_test.c @@ -507,8 +507,8 @@ static int llog_test_7(struct obd_device *obd) } llog_init_handle(llh, LLOG_F_IS_PLAIN, &uuid); - lcr.lcr_hdr.lrh_len = lcr.lcr_tail.lrt_len = cpu_to_le32(sizeof(lcr)); - lcr.lcr_hdr.lrh_type = cpu_to_le32(OST_SZ_REC); + lcr.lcr_hdr.lrh_len = lcr.lcr_tail.lrt_len = sizeof(lcr); + lcr.lcr_hdr.lrh_type = OST_SZ_REC; rc = llog_write_rec(llh, &lcr.lcr_hdr, NULL, 0, NULL, -1); if (rc) { CERROR("7: write one log record failed: %d\n", rc); @@ -621,15 +621,20 @@ static int llog_test_setup(struct obd_device *obd, obd_count len, void *buf) int rc; ENTRY; - if (lcfg->lcfg_inllen1 < 1) { + if (lcfg->lcfg_bufcount < 2) { + CERROR("requires a TARGET OBD name\n"); + RETURN(-EINVAL); + } + + if (lcfg->lcfg_buflens[1] < 1) { CERROR("requires a TARGET OBD name\n"); RETURN(-EINVAL); } - tgt = class_name2obd(lcfg->lcfg_inlbuf1); + tgt = class_name2obd(lustre_cfg_string(lcfg, 1)); if (!tgt || !tgt->obd_attached || !tgt->obd_set_up) { CERROR("target device not attached or not set up (%s)\n", - lcfg->lcfg_inlbuf1); + lustre_cfg_string(lcfg, 1)); RETURN(-EINVAL); } diff --git a/lustre/obdclass/lustre_peer.c b/lustre/obdclass/lustre_peer.c index 9b1ec40..0252602 100644 --- a/lustre/obdclass/lustre_peer.c +++ b/lustre/obdclass/lustre_peer.c @@ -89,6 +89,9 @@ int class_add_uuid(char *uuid, __u64 nid, __u32 nal) int rc; int nob = strnlen (uuid, PAGE_SIZE) + 1; + LASSERT(nid != 0); + LASSERT(nal != 0); + if (nob > PAGE_SIZE) return -EINVAL; diff --git a/lustre/obdclass/obd_config.c b/lustre/obdclass/obd_config.c index cd04ae0..3105e22 100644 --- a/lustre/obdclass/obd_config.c +++ b/lustre/obdclass/obd_config.c @@ -46,43 +46,30 @@ int class_attach(struct lustre_cfg *lcfg) { struct obd_type *type; - struct obd_device *obd; + struct obd_device *obd = NULL; char *typename, *name, *namecopy, *uuid; int rc, len, cleanup_phase = 0; - if (!lcfg->lcfg_inllen1 || !lcfg->lcfg_inlbuf1) { + if (!LUSTRE_CFG_BUFLEN(lcfg, 1)) { CERROR("No type passed!\n"); RETURN(-EINVAL); } - if (lcfg->lcfg_inlbuf1[lcfg->lcfg_inllen1 - 1] != 0) { - CERROR("Type not nul terminated!\n"); - RETURN(-EINVAL); - } - typename = lcfg->lcfg_inlbuf1; + typename = lustre_cfg_string(lcfg, 1); - if (!lcfg->lcfg_dev_namelen || !lcfg->lcfg_dev_name) { + if (!LUSTRE_CFG_BUFLEN(lcfg, 0)) { CERROR("No name passed!\n"); RETURN(-EINVAL); } - if (lcfg->lcfg_dev_name[lcfg->lcfg_dev_namelen - 1] != 0) { - CERROR("Name not nul terminated!\n"); - RETURN(-EINVAL); - } - name = lcfg->lcfg_dev_name; + name = lustre_cfg_string(lcfg, 0); - if (!lcfg->lcfg_inllen2 || !lcfg->lcfg_inlbuf2) { + if (!LUSTRE_CFG_BUFLEN(lcfg, 2)) { CERROR("No UUID passed!\n"); RETURN(-EINVAL); } - if (lcfg->lcfg_inlbuf2[lcfg->lcfg_inllen2 - 1] != 0) { - CERROR("UUID not nul terminated!\n"); - RETURN(-EINVAL); - } - uuid = lcfg->lcfg_inlbuf2; + uuid = lustre_cfg_string(lcfg, 2); CDEBUG(D_IOCTL, "attach type %s name: %s uuid: %s\n", - MKSTR(lcfg->lcfg_inlbuf1), - MKSTR(lcfg->lcfg_dev_name), MKSTR(lcfg->lcfg_inlbuf2)); + MKSTR(typename), MKSTR(name), MKSTR(uuid)); /* find the type */ type = class_get_type(typename); @@ -311,8 +298,8 @@ int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg) obd->obd_stopping = 1; 
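Note how both the swab loop and print_lustre_cfg() above bound lcfg_bufcount by LUSTRE_CFG_MAX_BUFCOUNT before touching lcfg_buflens[]: the count arrives from an untrusted buffer and cannot serve as an array bound on its own. The same guard in miniature, with simplified stand-in types:

#include <stdint.h>

#define DEMO_MAX_BUFS 8

struct demo_msg {
        uint32_t count;                 /* from the wire: untrusted */
        uint32_t lens[DEMO_MAX_BUFS];
};

static void demo_swab_msg(struct demo_msg *m)
{
        uint32_t i;

        m->count = __builtin_bswap32(m->count); /* GCC/Clang builtin */
        /* clamp the (now host-order) count before indexing */
        for (i = 0; i < m->count && i < DEMO_MAX_BUFS; i++)
                m->lens[i] = __builtin_bswap32(m->lens[i]);
}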
spin_unlock(&obd->obd_dev_lock); - if (lcfg->lcfg_inlbuf1) { - for (flag = lcfg->lcfg_inlbuf1; *flag != 0; flag++) + if (lcfg->lcfg_bufcount >= 2 && LUSTRE_CFG_BUFLEN(lcfg, 1) > 0) { + for (flag = lustre_cfg_string(lcfg, 1); *flag != 0; flag++) switch (*flag) { case 'F': obd->obd_force = 1; @@ -461,11 +448,10 @@ void class_del_profile(char *prof) int class_process_config(struct lustre_cfg *lcfg) { struct obd_device *obd; - char str[PTL_NALFMT_SIZE]; + char nidstr[PTL_NALFMT_SIZE]; int err; LASSERT(lcfg && !IS_ERR(lcfg)); - CDEBUG(D_IOCTL, "processing cmd: %x\n", lcfg->lcfg_command); /* Commands that don't need a device */ @@ -476,38 +462,44 @@ int class_process_config(struct lustre_cfg *lcfg) } case LCFG_ADD_UUID: { CDEBUG(D_IOCTL, "adding mapping from uuid %s to nid "LPX64 - " (%s), nal %x\n", lcfg->lcfg_inlbuf1, lcfg->lcfg_nid, - portals_nid2str(lcfg->lcfg_nal, lcfg->lcfg_nid, str), + " (%s), nal %x\n", lustre_cfg_string(lcfg, 1), + lcfg->lcfg_nid, + portals_nid2str(lcfg->lcfg_nal, lcfg->lcfg_nid, nidstr), lcfg->lcfg_nal); - err = class_add_uuid(lcfg->lcfg_inlbuf1, lcfg->lcfg_nid, + err = class_add_uuid(lustre_cfg_string(lcfg, 1), lcfg->lcfg_nid, lcfg->lcfg_nal); GOTO(out, err); } case LCFG_DEL_UUID: { CDEBUG(D_IOCTL, "removing mappings for uuid %s\n", - lcfg->lcfg_inlbuf1 == NULL ? "" : - lcfg->lcfg_inlbuf1); + (lcfg->lcfg_bufcount < 2 || LUSTRE_CFG_BUFLEN(lcfg, 1) == 0) + ? "" : lustre_cfg_string(lcfg, 1)); - err = class_del_uuid(lcfg->lcfg_inlbuf1); + err = class_del_uuid(lustre_cfg_string(lcfg, 1)); GOTO(out, err); } case LCFG_MOUNTOPT: { CDEBUG(D_IOCTL, "mountopt: profile %s osc %s mdc %s\n", - lcfg->lcfg_inlbuf1, lcfg->lcfg_inlbuf2, - lcfg->lcfg_inlbuf3); + lustre_cfg_string(lcfg, 1), + lustre_cfg_string(lcfg, 2), + lustre_cfg_string(lcfg, 3)); /* set these mount options somewhere, so ll_fill_super * can find them. */ - err = class_add_profile(lcfg->lcfg_inllen1, lcfg->lcfg_inlbuf1, - lcfg->lcfg_inllen2, lcfg->lcfg_inlbuf2, - lcfg->lcfg_inllen3, lcfg->lcfg_inlbuf3); + err = class_add_profile(LUSTRE_CFG_BUFLEN(lcfg, 1), + lustre_cfg_string(lcfg, 1), + LUSTRE_CFG_BUFLEN(lcfg, 2), + lustre_cfg_string(lcfg, 2), + LUSTRE_CFG_BUFLEN(lcfg, 3), + lustre_cfg_string(lcfg, 3)); GOTO(out, err); } case LCFG_DEL_MOUNTOPT: { - CDEBUG(D_IOCTL, "mountopt: profile %s\n", lcfg->lcfg_inlbuf1); + CDEBUG(D_IOCTL, "mountopt: profile %s\n", + lustre_cfg_string(lcfg, 1)); /* set these mount options somewhere, so ll_fill_super * can find them. 
*/ - class_del_profile(lcfg->lcfg_inlbuf1); + class_del_profile(lustre_cfg_string(lcfg, 1)); GOTO(out, err = 0); } case LCFG_SET_TIMEOUT: { @@ -519,22 +511,23 @@ int class_process_config(struct lustre_cfg *lcfg) } case LCFG_SET_UPCALL: { CDEBUG(D_IOCTL, "setting lustre ucpall to: %s\n", - lcfg->lcfg_inlbuf1); - if (lcfg->lcfg_inllen1 > sizeof obd_lustre_upcall) + lustre_cfg_string(lcfg, 1)); + if (LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof obd_lustre_upcall) GOTO(out, err = -EINVAL); - memcpy(obd_lustre_upcall, lcfg->lcfg_inlbuf1, - lcfg->lcfg_inllen1); + strncpy(obd_lustre_upcall, lustre_cfg_string(lcfg, 1), + sizeof (obd_lustre_upcall)); GOTO(out, err = 0); } } /* Commands that require a device */ - obd = class_name2obd(lcfg->lcfg_dev_name); + obd = class_name2obd(lustre_cfg_string(lcfg, 0)); if (obd == NULL) { - if (lcfg->lcfg_dev_name == NULL) + if (!LUSTRE_CFG_BUFLEN(lcfg, 0)) CERROR("this lcfg command requires a device name\n"); else - CERROR("no device for: %s\n", lcfg->lcfg_dev_name); + CERROR("no device for: %s\n", + lustre_cfg_string(lcfg, 0)); GOTO(out, err = -EINVAL); } @@ -570,65 +563,82 @@ static int class_config_llog_handler(struct llog_handle * handle, char *cfg_buf = (char*) (rec + 1); int rc = 0; ENTRY; - if (rec->lrh_type == OBD_CFG_REC) { - char *buf; - struct lustre_cfg *lcfg; - char *old_name = NULL; - int old_len = 0; - char *old_uuid = NULL; - int old_uuid_len = 0; + switch (rec->lrh_type) { + case OBD_CFG_REC: { + struct lustre_cfg *lcfg, *lcfg_new; + struct lustre_cfg_bufs bufs; char *inst_name = NULL; int inst_len = 0; + int inst = 0; + + lcfg = (struct lustre_cfg *)cfg_buf; + if (lcfg->lcfg_version == __swab32(LUSTRE_CFG_VERSION)) + lustre_swab_lustre_cfg(lcfg); - rc = lustre_cfg_getdata(&buf, cfg_len, cfg_buf, 1); + rc = lustre_cfg_sanity_check(cfg_buf, cfg_len); if (rc) GOTO(out, rc); - lcfg = (struct lustre_cfg* ) buf; - if (cfg && cfg->cfg_instance && lcfg->lcfg_dev_name) { - inst_len = strlen(lcfg->lcfg_dev_name) + - strlen(cfg->cfg_instance) + 2; + lustre_cfg_bufs_init(&bufs, lcfg); + + if (cfg && cfg->cfg_instance && LUSTRE_CFG_BUFLEN(lcfg, 0) > 0) { + inst = 1; + inst_len = LUSTRE_CFG_BUFLEN(lcfg, 0) + + strlen(cfg->cfg_instance) + 1; OBD_ALLOC(inst_name, inst_len); if (inst_name == NULL) GOTO(out, rc = -ENOMEM); - sprintf(inst_name, "%s-%s", lcfg->lcfg_dev_name, + sprintf(inst_name, "%s-%s", + lustre_cfg_string(lcfg, 0), cfg->cfg_instance); - old_name = lcfg->lcfg_dev_name; - old_len = lcfg->lcfg_dev_namelen; - lcfg->lcfg_dev_name = inst_name; - lcfg->lcfg_dev_namelen = strlen(inst_name) + 1; + lustre_cfg_bufs_set_string(&bufs, 0, inst_name); } if (cfg && lcfg->lcfg_command == LCFG_ATTACH) { - old_uuid = lcfg->lcfg_inlbuf2; - old_uuid_len = lcfg->lcfg_inllen2; - - lcfg->lcfg_inlbuf2 = (char*)&cfg->cfg_uuid.uuid; - lcfg->lcfg_inllen2 = sizeof(cfg->cfg_uuid); + lustre_cfg_bufs_set_string(&bufs, 2, cfg->cfg_uuid.uuid); } - rc = class_process_config(lcfg); + lcfg_new = lustre_cfg_new(lcfg->lcfg_command, &bufs); - if (old_name) { - lcfg->lcfg_dev_name = old_name; - lcfg->lcfg_dev_namelen = old_len; - OBD_FREE(inst_name, inst_len); - } + lcfg_new->lcfg_num = lcfg->lcfg_num; + lcfg_new->lcfg_flags = lcfg->lcfg_flags; + lcfg_new->lcfg_nid = lcfg->lcfg_nid; + lcfg_new->lcfg_nal = lcfg->lcfg_nal; - if (old_uuid) { - lcfg->lcfg_inlbuf2 = old_uuid; - lcfg->lcfg_inllen2 = old_uuid_len; - } + rc = class_process_config(lcfg_new); + lustre_cfg_free(lcfg_new); - lustre_cfg_freedata(buf, cfg_len); - } else if (rec->lrh_type == PTL_CFG_REC) { + if (inst) + OBD_FREE(inst_name, 
inst_len); + break; + } + case PTL_CFG_REC: { struct portals_cfg *pcfg = (struct portals_cfg *)cfg_buf; + if (pcfg->pcfg_version != PORTALS_CFG_VERSION) { + if (pcfg->pcfg_version == __swab32(PORTALS_CFG_VERSION)) { + CDEBUG(D_OTHER, "swabbing portals_cfg %p\n", + pcfg); + lustre_swab_portals_cfg(pcfg); + } else { + CERROR("Unknown portals_cfg version: %#x " + "(expecting %#x)\n", + pcfg->pcfg_version, + PORTALS_CFG_VERSION); + RETURN(-EINVAL); + } + } if (pcfg->pcfg_command ==NAL_CMD_REGISTER_MYNID && cfg->cfg_local_nid != PTL_NID_ANY) { pcfg->pcfg_nid = cfg->cfg_local_nid; } rc = libcfs_nal_cmd(pcfg); + break; + } + default: + CERROR("Unknown llog record type %#x encountered\n", + rec->lrh_type); + break; } out: RETURN(rc); @@ -641,6 +651,7 @@ int class_config_parse_llog(struct llog_ctxt *ctxt, char *name, int rc, rc2; ENTRY; + CDEBUG(D_INFO, "looking up llog %s\n", name); rc = llog_create(ctxt, &llh, NULL, name); if (rc) RETURN(rc); @@ -667,18 +678,18 @@ int class_config_dump_handler(struct llog_handle * handle, int rc = 0; ENTRY; if (rec->lrh_type == OBD_CFG_REC) { - char *buf; struct lustre_cfg *lcfg; + int i; - rc = lustre_cfg_getdata(&buf, cfg_len, cfg_buf, 1); + rc = lustre_cfg_sanity_check(cfg_buf, cfg_len); if (rc) GOTO(out, rc); - lcfg = (struct lustre_cfg* ) buf; + lcfg = (struct lustre_cfg *)cfg_buf; CDEBUG(D_INFO, "lcfg command: %x\n", lcfg->lcfg_command); - if (lcfg->lcfg_dev_name) + if (LUSTRE_CFG_BUFLEN(lcfg, 0) > 0) CDEBUG(D_INFO, " devname: %s\n", - lcfg->lcfg_dev_name); + lustre_cfg_string(lcfg, 0)); if (lcfg->lcfg_flags) CDEBUG(D_INFO, " flags: %x\n", lcfg->lcfg_flags); if (lcfg->lcfg_nid) @@ -688,19 +699,12 @@ int class_config_dump_handler(struct llog_handle * handle, CDEBUG(D_INFO, " nal: %x\n", lcfg->lcfg_nal); if (lcfg->lcfg_num) CDEBUG(D_INFO, " nal: %x\n", lcfg->lcfg_num); - if (lcfg->lcfg_inlbuf1) - CDEBUG(D_INFO, " inlbuf1: %s\n",lcfg->lcfg_inlbuf1); - if (lcfg->lcfg_inlbuf2) - CDEBUG(D_INFO, " inlbuf2: %s\n",lcfg->lcfg_inlbuf2); - if (lcfg->lcfg_inlbuf3) - CDEBUG(D_INFO, " inlbuf3: %s\n",lcfg->lcfg_inlbuf3); - if (lcfg->lcfg_inlbuf4) - CDEBUG(D_INFO, " inlbuf4: %s\n",lcfg->lcfg_inlbuf4); - - lustre_cfg_freedata(buf, cfg_len); + for (i = 1; i < lcfg->lcfg_bufcount; i++) + if (LUSTRE_CFG_BUFLEN(lcfg, i) > 0) + CDEBUG(D_INFO, " inlbuf%d: %s\n", i, + lustre_cfg_string(lcfg, i)); } else if (rec->lrh_type == PTL_CFG_REC) { struct portals_cfg *pcfg = (struct portals_cfg *)cfg_buf; - CDEBUG(D_INFO, "pcfg command: %d\n", pcfg->pcfg_command); if (pcfg->pcfg_nal) CDEBUG(D_INFO, " nal: %x\n", diff --git a/lustre/obdecho/echo_client.c b/lustre/obdecho/echo_client.c index fb67492..f07ee99 100644 --- a/lustre/obdecho/echo_client.c +++ b/lustre/obdecho/echo_client.c @@ -1329,15 +1329,15 @@ echo_client_setup(struct obd_device *obddev, obd_count len, void *buf) int rc; ENTRY; - if (lcfg->lcfg_inllen1 < 1) { + if (lcfg->lcfg_bufcount < 2 || LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) { CERROR("requires a TARGET OBD name\n"); RETURN(-EINVAL); } - tgt = class_name2obd(lcfg->lcfg_inlbuf1); + tgt = class_name2obd(lustre_cfg_string(lcfg, 1)); if (!tgt || !tgt->obd_attached || !tgt->obd_set_up) { CERROR("device not attached or not set up (%s)\n", - lcfg->lcfg_inlbuf1); + lustre_cfg_string(lcfg, 1)); RETURN(-EINVAL); } @@ -1347,7 +1347,8 @@ echo_client_setup(struct obd_device *obddev, obd_count len, void *buf) rc = obd_connect(&conn, tgt, &echo_uuid); if (rc) { - CERROR("fail to connect to device %s\n", lcfg->lcfg_inlbuf1); + CERROR("fail to connect to device %s\n", + lustre_cfg_string(lcfg, 1)); 
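Both record types take the same approach in this handler: the leading version field doubles as an endianness marker, so a record is swabbed only when its stored version equals the expected constant with the bytes reversed, and anything else is rejected. Schematically (made-up constant, simplified types):

#include <stdint.h>

#define DEMO_CFG_VERSION 0x00c0ffeeu    /* stand-in version constant */

/* returns 0 for native order, 1 if the caller must swab the remaining
 * fields, -1 for an unrecognized version */
static int demo_version_check(uint32_t version)
{
        if (version == DEMO_CFG_VERSION)
                return 0;
        if (version == __builtin_bswap32(DEMO_CFG_VERSION))
                return 1;
        return -1;
}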
return (rc); } ec->ec_exp = class_conn2export(&conn); diff --git a/lustre/obdfilter/filter.c b/lustre/obdfilter/filter.c index d7fb733..ff6fe1e 100644 --- a/lustre/obdfilter/filter.c +++ b/lustre/obdfilter/filter.c @@ -1186,34 +1186,38 @@ int filter_common_setup(struct obd_device *obd, obd_count len, void *buf, struct lustre_cfg* lcfg = buf; struct filter_obd *filter = &obd->u.filter; struct vfsmount *mnt; + char *str; char ns_name[48]; int rc = 0; ENTRY; - if (!lcfg->lcfg_inlbuf1 || !lcfg->lcfg_inlbuf2) + if (lcfg->lcfg_bufcount < 3 || + LUSTRE_CFG_BUFLEN(lcfg, 1) < 1 || + LUSTRE_CFG_BUFLEN(lcfg, 2) < 1) RETURN(-EINVAL); - obd->obd_fsops = fsfilt_get_ops(lcfg->lcfg_inlbuf2); + obd->obd_fsops = fsfilt_get_ops(lustre_cfg_string(lcfg, 2)); if (IS_ERR(obd->obd_fsops)) RETURN(PTR_ERR(obd->obd_fsops)); - mnt = do_kern_mount(lcfg->lcfg_inlbuf2, MS_NOATIME | MS_NODIRATIME, - lcfg->lcfg_inlbuf1, (void *)option); + mnt = do_kern_mount(lustre_cfg_string(lcfg, 2), MS_NOATIME | MS_NODIRATIME, + lustre_cfg_string(lcfg, 1), (void *)option); rc = PTR_ERR(mnt); if (IS_ERR(mnt)) GOTO(err_ops, rc); LASSERT(!ll_check_rdonly(ll_sbdev(mnt->mnt_sb))); - if (lcfg->lcfg_inllen3 > 0 && lcfg->lcfg_inlbuf3) { - if (*lcfg->lcfg_inlbuf3 == 'f') { + if (lcfg->lcfg_bufcount > 3 && LUSTRE_CFG_BUFLEN(lcfg, 3) > 0) { + str = lustre_cfg_string(lcfg, 3); + if (*str == 'f') { obd->obd_replayable = 1; obd_sync_filter = 1; CWARN("%s: recovery enabled\n", obd->obd_name); } else { - if (*lcfg->lcfg_inlbuf3 != 'n') { + if (*str != 'n') { CERROR("unrecognised flag '%c'\n", - *lcfg->lcfg_inlbuf3); + *str); } // XXX Robert? Why do we get errors here // GOTO(err_mntput, rc = -EINVAL); @@ -1282,7 +1286,7 @@ int filter_common_setup(struct obd_device *obd, obd_count len, void *buf, "Recovery progress can be monitored by watching " "/proc/fs/lustre/obdfilter/%s/recovery_status.\n", obd->obd_name, - lcfg->lcfg_inlbuf1, + lustre_cfg_string(lcfg, 1), obd->obd_recoverable_clients, (obd->obd_recoverable_clients == 1) ? "client" : "clients", @@ -1291,7 +1295,8 @@ int filter_common_setup(struct obd_device *obd, obd_count len, void *buf, obd->obd_name); } else { LCONSOLE_INFO("OST %s now serving %s with recovery %s.\n", - obd->obd_name, lcfg->lcfg_inlbuf1, + obd->obd_name, + lustre_cfg_string(lcfg, 1), obd->obd_replayable ? "enabled" : "disabled"); } @@ -1315,10 +1320,10 @@ static int filter_setup(struct obd_device *obd, obd_count len, void *buf) struct lustre_cfg* lcfg = buf; int rc; - if (!lcfg->lcfg_inlbuf1 || !lcfg->lcfg_inlbuf2) + if (!LUSTRE_CFG_BUFLEN(lcfg, 1) || !LUSTRE_CFG_BUFLEN(lcfg, 2)) RETURN(-EINVAL); - rc = filter_common_setup(obd, len, buf, lcfg->lcfg_inlbuf4); + rc = filter_common_setup(obd, len, buf, lustre_cfg_buf(lcfg, 4)); lprocfs_init_vars(filter, &lvars); if (rc == 0 && lprocfs_obd_setup(obd, lvars.obd_vars) == 0 && diff --git a/lustre/obdfilter/filter_io_24.c b/lustre/obdfilter/filter_io_24.c index 6d59e9f..d4327ca 100644 --- a/lustre/obdfilter/filter_io_24.c +++ b/lustre/obdfilter/filter_io_24.c @@ -38,7 +38,6 @@ #include #include "filter_internal.h" - /* We should only change the file mtime (and not the ctime, like * update_inode_times() in generic_file_write()) when we only change data. */ void inode_update_time(struct inode *inode, int ctime_too) @@ -203,7 +202,6 @@ int filter_direct_io(int rw, struct dentry *dchild, void *buf, * leaves it there, sometimes generating io from it at later truncates. * Someday very soon we'll be performing our brw_kiovec() IO to and * from the page cache. 
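The flag buffers in these setup/cleanup paths are plain strings scanned one character at a time ('F'/'A' for force/failover in class_cleanup(), 'f'/'n' for recovery here). A minimal parser of the same shape, as a sketch rather than the actual handlers:

static void demo_parse_flags(const char *flags, int *force, int *failover)
{
        const char *p;

        for (p = flags; *p != '\0'; p++) {
                switch (*p) {
                case 'F':
                        *force = 1;
                        break;
                case 'A':
                        *failover = 1;
                        break;
                default:
                        /* unrecognized letters are reported but tolerated
                         * in the real handlers; here we just skip them */
                        break;
                }
        }
}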
*/ - check_pending_bhs(KIOBUF_GET_BLOCKS(iobuf), iobuf->nr_pages, inode->i_dev, 1 << inode->i_blkbits); diff --git a/lustre/obdfilter/filter_log.c b/lustre/obdfilter/filter_log.c index 97675bb..d8fe7c8 100644 --- a/lustre/obdfilter/filter_log.c +++ b/lustre/obdfilter/filter_log.c @@ -129,7 +129,7 @@ int filter_recov_log_unlink_cb(struct llog_handle *llh, int rc = 0; ENTRY; - if (!(le32_to_cpu(llh->lgh_hdr->llh_flags) & LLOG_F_IS_PLAIN)) { + if (!(llh->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN)) { CERROR("log is not plain\n"); RETURN(-EINVAL); } @@ -141,7 +141,7 @@ int filter_recov_log_unlink_cb(struct llog_handle *llh, cookie.lgc_lgl = llh->lgh_id; cookie.lgc_subsys = LLOG_UNLINK_ORIG_CTXT; - cookie.lgc_index = le32_to_cpu(rec->lrh_index); + cookie.lgc_index = rec->lrh_index; if (rec->lrh_type == LLOG_GEN_REC) { lgr = (struct llog_gen_rec *)rec; diff --git a/lustre/obdfilter/filter_san.c b/lustre/obdfilter/filter_san.c index 64ddf68..4b1e14d 100644 --- a/lustre/obdfilter/filter_san.c +++ b/lustre/obdfilter/filter_san.c @@ -40,13 +40,13 @@ int filter_san_setup(struct obd_device *obd, obd_count len, void *buf) struct lustre_cfg* lcfg = buf; char *option = NULL; - if (!lcfg->lcfg_inlbuf2) + if (lcfg->lcfg_bufcount < 3 || LUSTRE_CFG_BUFLEN(lcfg, 2) < 1) RETURN(-EINVAL); /* for extN/ext3 filesystem, we must mount it with 'writeback' mode */ - if (!strcmp(lcfg->lcfg_inlbuf2, "extN")) + if (!strcmp(lustre_cfg_string(lcfg, 2), "ldiskfs")) option = "data=writeback"; - else if (!strcmp(lcfg->lcfg_inlbuf2, "ext3")) + else if (!strcmp(lustre_cfg_string(lcfg, 2), "ext3")) option = "data=writeback,asyncdel"; else LBUG(); /* just a reminder */ diff --git a/lustre/osc/osc_lib.c b/lustre/osc/osc_lib.c index 79b4b6b..cedce02 100644 --- a/lustre/osc/osc_lib.c +++ b/lustre/osc/osc_lib.c @@ -58,16 +58,17 @@ int client_sanobd_setup(struct obd_device *obddev, obd_count len, void *buf) struct client_obd *cli = &obddev->u.cli; ENTRY; - if (lcfg->lcfg_inllen3 < 1) { + if (lcfg->lcfg_bufcount < 4 || LUSTRE_CFG_BUFLEN(lcfg, 3) < 1) { CERROR("setup requires a SAN device pathname\n"); RETURN(-EINVAL); } client_obd_setup(obddev, len, buf); - cli->cl_sandev = path2dev(lcfg->lcfg_inlbuf3); + cli->cl_sandev = path2dev(lustre_cfg_string(lcfg, 3)); if (!kdev_t_to_nr(cli->cl_sandev)) { - CERROR("%s seems not a valid SAN device\n", lcfg->lcfg_inlbuf3); + CERROR("%s seems not a valid SAN device\n", + lustre_cfg_string(lcfg, 3)); RETURN(-EINVAL); } diff --git a/lustre/ptlrpc/events.c b/lustre/ptlrpc/events.c index 693ee23..735fc20 100644 --- a/lustre/ptlrpc/events.c +++ b/lustre/ptlrpc/events.c @@ -359,8 +359,12 @@ int ptlrpc_uuid_to_peer (struct obd_uuid *uuid, struct ptlrpc_peer *peer) ptl_nid_t peer_nid; int i; char str[PTL_NALFMT_SIZE]; - int rc = lustre_uuid_to_peer(uuid->uuid, - &peer_nal, &peer_nid); + int rc; + + ENTRY; + + rc = lustre_uuid_to_peer (uuid->uuid, &peer_nal, &peer_nid); + if (rc != 0) RETURN (rc); @@ -371,7 +375,7 @@ int ptlrpc_uuid_to_peer (struct obd_uuid *uuid, struct ptlrpc_peer *peer) peer->peer_id.nid = peer_nid; peer->peer_id.pid = LUSTRE_SRV_PTL_PID; peer->peer_ni = pni; - return (0); + RETURN(0); } } diff --git a/lustre/ptlrpc/llog_client.c b/lustre/ptlrpc/llog_client.c index 5b8e33f..591c5fc 100644 --- a/lustre/ptlrpc/llog_client.c +++ b/lustre/ptlrpc/llog_client.c @@ -145,6 +145,7 @@ static int llog_client_next_block(struct llog_handle *loghandle, GOTO(out, rc =-EFAULT); } + /* The log records are swabbed as they are processed */ ptr = lustre_msg_buf(req->rq_repmsg, 1, len); if (ptr == NULL) { CERROR 
("Can't unpack bitmap\n"); diff --git a/lustre/ptlrpc/pack_generic.c b/lustre/ptlrpc/pack_generic.c index d865afe..aa4c917 100644 --- a/lustre/ptlrpc/pack_generic.c +++ b/lustre/ptlrpc/pack_generic.c @@ -221,8 +221,9 @@ int lustre_unpack_msg(struct lustre_msg *m, int len) __swab64s (&m->last_committed); __swab64s (&m->transno); __swab32s (&m->status); - __swab32s (&m->bufcount); __swab32s (&m->flags); + __swab32s (&m->conn_cnt); + __swab32s (&m->bufcount); } required_len = HDR_SIZE(m->bufcount); @@ -424,13 +425,6 @@ void lustre_swab_ost_lvb(struct ost_lvb *lvb) __swab64s(&lvb->lvb_blocks); } -void lustre_swab_ll_fid (struct ll_fid *fid) -{ - __swab64s (&fid->id); - __swab32s (&fid->generation); - __swab32s (&fid->f_type); -} - void lustre_swab_mds_status_req (struct mds_status_req *r) { __swab32s (&r->flags); @@ -544,6 +538,67 @@ void lustre_swab_lov_desc (struct lov_desc *ld) /* uuid endian insensitive */ } +static void print_lum (struct lov_user_md *lum) +{ + CDEBUG(D_OTHER, "lov_user_md %p:\n", lum); + CDEBUG(D_OTHER, "\tlmm_magic: %#x\n", lum->lmm_magic); + CDEBUG(D_OTHER, "\tlmm_pattern: %#x\n", lum->lmm_pattern); + CDEBUG(D_OTHER, "\tlmm_object_id: "LPU64"\n", lum->lmm_object_id); + CDEBUG(D_OTHER, "\tlmm_object_gr: "LPU64"\n", lum->lmm_object_gr); + CDEBUG(D_OTHER, "\tlmm_stripe_size: %#x\n", lum->lmm_stripe_size); + CDEBUG(D_OTHER, "\tlmm_stripe_count: %#x\n", lum->lmm_stripe_count); + CDEBUG(D_OTHER, "\tlmm_stripe_offset: %#x\n", lum->lmm_stripe_offset); +} + +void lustre_swab_lov_user_md(struct lov_user_md *lum) +{ + ENTRY; + CDEBUG(D_IOCTL, "swabbing lov_user_md\n"); + __swab32s(&lum->lmm_magic); + __swab32s(&lum->lmm_pattern); + __swab64s(&lum->lmm_object_id); + __swab64s(&lum->lmm_object_gr); + __swab32s(&lum->lmm_stripe_size); + __swab16s(&lum->lmm_stripe_count); + __swab16s(&lum->lmm_stripe_offset); + print_lum(lum); + EXIT; +} + +static void print_lum_objs(struct lov_user_md *lum) +{ + struct lov_user_ost_data *lod; + int i; + ENTRY; + if (!(portal_debug & D_OTHER)) /* don't loop on nothing */ + return; + CDEBUG(D_OTHER, "lov_user_md_objects: %p\n", lum); + for (i = 0; i < lum->lmm_stripe_count; i++) { + lod = &lum->lmm_objects[i]; + CDEBUG(D_OTHER, "(%i) lod->l_object_id: "LPX64"\n", i, lod->l_object_id); + CDEBUG(D_OTHER, "(%i) lod->l_object_gr: "LPX64"\n", i, lod->l_object_gr); + CDEBUG(D_OTHER, "(%i) lod->l_ost_gen: %#x\n", i, lod->l_ost_gen); + CDEBUG(D_OTHER, "(%i) lod->l_ost_idx: %#x\n", i, lod->l_ost_idx); + } + EXIT; +} + +void lustre_swab_lov_user_md_objects(struct lov_user_md *lum) +{ + struct lov_user_ost_data *lod; + int i; + ENTRY; + for (i = 0; i < lum->lmm_stripe_count; i++) { + lod = &lum->lmm_objects[i]; + __swab64s(&lod->l_object_id); + __swab64s(&lod->l_object_gr); + __swab32s(&lod->l_ost_gen); + __swab32s(&lod->l_ost_idx); + } + print_lum_objs(lum); + EXIT; +} + void lustre_swab_ldlm_res_id (struct ldlm_res_id *id) { int i; @@ -621,57 +676,11 @@ void lustre_swab_ptlbd_rsp (struct ptlbd_rsp *r) __swab16s (&r->r_error_cnt); } -/* no one calls this */ -int llog_log_swabbed(struct llog_log_hdr *hdr) -{ - if (hdr->llh_hdr.lrh_type == __swab32(LLOG_HDR_MAGIC)) - return 1; - if (hdr->llh_hdr.lrh_type == LLOG_HDR_MAGIC) - return 0; - return -1; -} - -void lustre_swab_llogd_body (struct llogd_body *d) -{ - __swab64s (&d->lgd_logid.lgl_oid); - __swab64s (&d->lgd_logid.lgl_ogr); - __swab32s (&d->lgd_logid.lgl_ogen); - __swab32s (&d->lgd_ctxt_idx); - __swab32s (&d->lgd_llh_flags); - __swab32s (&d->lgd_index); - __swab32s (&d->lgd_saved_index); - __swab32s 
(&d->lgd_len); - __swab64s (&d->lgd_cur_offset); -} - -void lustre_swab_llog_hdr (struct llog_log_hdr *h) -{ - __swab32s (&h->llh_hdr.lrh_index); - __swab32s (&h->llh_hdr.lrh_len); - __swab32s (&h->llh_hdr.lrh_type); - __swab64s (&h->llh_timestamp); - __swab32s (&h->llh_count); - __swab32s (&h->llh_bitmap_offset); - __swab32s (&h->llh_flags); - __swab32s (&h->llh_tail.lrt_index); - __swab32s (&h->llh_tail.lrt_len); -} - -void lustre_swab_llogd_conn_body (struct llogd_conn_body *d) -{ - __swab64s (&d->lgdc_gen.mnt_cnt); - __swab64s (&d->lgdc_gen.conn_cnt); - __swab64s (&d->lgdc_logid.lgl_oid); - __swab64s (&d->lgdc_logid.lgl_ogr); - __swab32s (&d->lgdc_logid.lgl_ogen); - __swab32s (&d->lgdc_ctxt_idx); -} - void lustre_assert_wire_constants(void) { /* Wire protocol assertions generated by 'wirecheck' - * running on Linux schnapps.adilger.int 2.4.22-l32 #4 Thu Jan 8 14:32:57 MST 2004 i686 i686 - * with gcc version 3.2.2 20030222 (Red Hat Linux 3.2.2-5) */ + * running on Linux milano 2.4.21-20.EL_87k.6-b_release_1_3_3.200410121845smp #1 SMP Tue Oct + * with gcc version 3.3.3 20040412 (Red Hat Linux 3.3.3-7) */ /* Constants... */ @@ -921,22 +930,22 @@ void lustre_assert_wire_constants(void) (long long)offsetof(struct obdo, o_gr)); LASSERTF((int)sizeof(((struct obdo *)0)->o_gr) == 8, " found %lld\n", (long long)(int)sizeof(((struct obdo *)0)->o_gr)); - LASSERTF(offsetof(struct obdo, o_atime) == 16, " found %lld\n", - (long long)offsetof(struct obdo, o_atime)); - LASSERTF((int)sizeof(((struct obdo *)0)->o_atime) == 8, " found %lld\n", - (long long)(int)sizeof(((struct obdo *)0)->o_atime)); + LASSERTF(offsetof(struct obdo, o_size) == 16, " found %lld\n", + (long long)offsetof(struct obdo, o_size)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_size) == 8, " found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_size)); LASSERTF(offsetof(struct obdo, o_mtime) == 24, " found %lld\n", (long long)offsetof(struct obdo, o_mtime)); LASSERTF((int)sizeof(((struct obdo *)0)->o_mtime) == 8, " found %lld\n", (long long)(int)sizeof(((struct obdo *)0)->o_mtime)); - LASSERTF(offsetof(struct obdo, o_ctime) == 32, " found %lld\n", + LASSERTF(offsetof(struct obdo, o_atime) == 32, " found %lld\n", + (long long)offsetof(struct obdo, o_atime)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_atime) == 8, " found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_atime)); + LASSERTF(offsetof(struct obdo, o_ctime) == 40, " found %lld\n", (long long)offsetof(struct obdo, o_ctime)); LASSERTF((int)sizeof(((struct obdo *)0)->o_ctime) == 8, " found %lld\n", (long long)(int)sizeof(((struct obdo *)0)->o_ctime)); - LASSERTF(offsetof(struct obdo, o_size) == 40, " found %lld\n", - (long long)offsetof(struct obdo, o_size)); - LASSERTF((int)sizeof(((struct obdo *)0)->o_size) == 8, " found %lld\n", - (long long)(int)sizeof(((struct obdo *)0)->o_size)); LASSERTF(offsetof(struct obdo, o_blocks) == 48, " found %lld\n", (long long)offsetof(struct obdo, o_blocks)); LASSERTF((int)sizeof(((struct obdo *)0)->o_blocks) == 8, " found %lld\n", @@ -2129,4 +2138,3 @@ void lustre_assert_wire_constants(void) (long long)(int)sizeof(((struct llogd_conn_body *)0)->lgdc_ctxt_idx)); } - diff --git a/lustre/ptlrpc/ptlrpc_module.c b/lustre/ptlrpc/ptlrpc_module.c index ff19d6a..8f408d7 100644 --- a/lustre/ptlrpc/ptlrpc_module.c +++ b/lustre/ptlrpc/ptlrpc_module.c @@ -155,7 +155,6 @@ EXPORT_SYMBOL(lustre_swab_niobuf_remote); EXPORT_SYMBOL(lustre_swab_ost_body); EXPORT_SYMBOL(lustre_swab_ost_last_id); EXPORT_SYMBOL(lustre_swab_ost_lvb); 
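Ordering matters in these helpers: lmm_stripe_count must already be in host byte order before lustre_swab_lov_user_md_objects() can walk the per-stripe array, so the count is swabbed in lustre_swab_lov_user_md() first. The dependency in miniature (stand-in types, GCC/Clang bswap builtins):

#include <stdint.h>

struct demo_stripe {
        uint64_t object_id;
        uint32_t ost_idx;
};

struct demo_md {
        uint16_t stripe_count;          /* wire order until swabbed */
        struct demo_stripe objects[0];  /* variable-length trailer */
};

/* called only when the message is known to need swabbing */
static void demo_swab_md(struct demo_md *md)
{
        uint16_t i;

        /* swab the count first: it bounds the loop below */
        md->stripe_count = __builtin_bswap16(md->stripe_count);
        for (i = 0; i < md->stripe_count; i++) {
                md->objects[i].object_id =
                        __builtin_bswap64(md->objects[i].object_id);
                md->objects[i].ost_idx =
                        __builtin_bswap32(md->objects[i].ost_idx);
        }
}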
-EXPORT_SYMBOL(lustre_swab_ll_fid); EXPORT_SYMBOL(lustre_swab_mds_status_req); EXPORT_SYMBOL(lustre_swab_mds_body); EXPORT_SYMBOL(lustre_swab_mds_rec_setattr); @@ -164,6 +163,8 @@ EXPORT_SYMBOL(lustre_swab_mds_rec_link); EXPORT_SYMBOL(lustre_swab_mds_rec_unlink); EXPORT_SYMBOL(lustre_swab_mds_rec_rename); EXPORT_SYMBOL(lustre_swab_lov_desc); +EXPORT_SYMBOL(lustre_swab_lov_user_md); +EXPORT_SYMBOL(lustre_swab_lov_user_md_objects); EXPORT_SYMBOL(lustre_swab_ldlm_res_id); EXPORT_SYMBOL(lustre_swab_ldlm_policy_data); EXPORT_SYMBOL(lustre_swab_ldlm_intent); diff --git a/lustre/ptlrpc/ptlrpcd.c b/lustre/ptlrpc/ptlrpcd.c index fc048fb..1c2b462 100644 --- a/lustre/ptlrpc/ptlrpcd.c +++ b/lustre/ptlrpc/ptlrpcd.c @@ -85,12 +85,12 @@ void ptlrpcd_add_req(struct ptlrpc_request *req) if (req->rq_send_state == LUSTRE_IMP_FULL) pc = &ptlrpcd_pc; - else + else pc = &ptlrpcd_recovery_pc; ptlrpc_set_add_new_req(pc->pc_set, req); req->rq_ptlrpcd_data = pc; - + ptlrpcd_wake(req); } diff --git a/lustre/ptlrpc/recov_thread.c b/lustre/ptlrpc/recov_thread.c index 27c31a3..f48d9bc 100644 --- a/lustre/ptlrpc/recov_thread.c +++ b/lustre/ptlrpc/recov_thread.c @@ -343,6 +343,9 @@ static int log_commit_thread(void *arg) request = ptlrpc_prep_req(import, OBD_LOG_CANCEL, 1, &llcd->llcd_cookiebytes, bufs); + /* XXX FIXME bug 249, 5515 */ + request->rq_request_portal = LDLM_CANCEL_REQUEST_PORTAL; + request->rq_reply_portal = LDLM_CANCEL_REPLY_PORTAL; if (request == NULL) { rc = -ENOMEM; diff --git a/lustre/ptlrpc/recover.c b/lustre/ptlrpc/recover.c index 1b264cea..a5c9e21 100644 --- a/lustre/ptlrpc/recover.c +++ b/lustre/ptlrpc/recover.c @@ -27,6 +27,7 @@ # include # include # include +# include #else # include #endif diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c index 5f837e4..829c078 100644 --- a/lustre/ptlrpc/service.c +++ b/lustre/ptlrpc/service.c @@ -238,11 +238,11 @@ ptlrpc_server_post_idle_rqbds (struct ptlrpc_service *svc) list_add (&rqbd->rqbd_list, &srv_ni->sni_active_rqbds); spin_unlock_irqrestore(&svc->srv_lock, flags); - + rc = ptlrpc_register_rqbd(rqbd); if (rc != 0) break; - + posted = 1; } diff --git a/lustre/tests/directio.c b/lustre/tests/directio.c index d64d59d..8896f15 100644 --- a/lustre/tests/directio.c +++ b/lustre/tests/directio.c @@ -14,6 +14,7 @@ int main(int argc, char **argv) { +#ifdef O_DIRECT int fd; char *wbuf; int blocks, seek_blocks; @@ -113,4 +114,9 @@ int main(int argc, char **argv) printf("PASS\n"); return 0; +#else /* !O_DIRECT */ +#warning O_DIRECT not defined, directio test will fail + printf("O_DIRECT not defined\n"); + return 1; +#endif /* !O_DIRECT */ } diff --git a/lustre/tests/oos.sh b/lustre/tests/oos.sh index 375f959..9e08890 100755 --- a/lustre/tests/oos.sh +++ b/lustre/tests/oos.sh @@ -67,7 +67,7 @@ fi RECORDSOUT=`grep "records out" $LOG | cut -d + -f1` FILESIZE=`ls -l $OOS | awk '{ print $5 }'` -if [ $RECORDSOUT -ne $(($FILESIZE / 1024)) ]; then +if [ "$RECORDSOUT" -ne $(($FILESIZE / 1024)) ]; then echo "ERROR: blocks written by dd not equal to the size of file" SUCCESS=0 fi diff --git a/lustre/tests/oos2.sh b/lustre/tests/oos2.sh index 43e8edf..782784d 100644 --- a/lustre/tests/oos2.sh +++ b/lustre/tests/oos2.sh @@ -69,7 +69,7 @@ RECORDSOUT=$((`grep "records out" $LOG | cut -d+ -f 1` + \ `grep "records out" $LOG2 | cut -d+ -f 1`)) FILESIZE=$((`ls -l $OOS | awk '{print $5}'` + `ls -l $OOS2 | awk '{print $5}'`)) -if [ $RECORDSOUT -ne $(($FILESIZE / 1024)) ]; then +if [ "$RECORDSOUT" -ne $(($FILESIZE / 1024)) ]; then echo "ERROR: blocks written by dd not equal 
to the size of file" SUCCESS=0 fi diff --git a/lustre/tests/openclose.c b/lustre/tests/openclose.c index 0def4b2..0a517fc 100644 --- a/lustre/tests/openclose.c +++ b/lustre/tests/openclose.c @@ -17,6 +17,9 @@ #include #include +#ifndef O_DIRECT +#define O_DIRECT 0 +#endif int main(int argc, char *argv[]) { diff --git a/lustre/tests/openfile.c b/lustre/tests/openfile.c index 1cb36ea..563baff 100644 --- a/lustre/tests/openfile.c +++ b/lustre/tests/openfile.c @@ -36,7 +36,9 @@ FLAG_MAPPING flag_table[] = { {"O_NONBLOCK", O_NONBLOCK}, {"O_NDELAY", O_NDELAY}, {"O_SYNC", O_SYNC}, +#ifdef O_DIRECT {"O_DIRECT", O_DIRECT}, +#endif {"O_LARGEFILE", O_LARGEFILE}, {"O_DIRECTORY", O_DIRECTORY}, {"O_NOFOLLOW", O_NOFOLLOW}, @@ -60,7 +62,7 @@ int main(int argc, char** argv) int mode_set=0; int flag_set=0; int file_set=0; - char c; + int c; char* cloned_flags = NULL; if (argc == 1) diff --git a/lustre/tests/rename_many.c b/lustre/tests/rename_many.c index 484cfc4..5bf46d2 100644 --- a/lustre/tests/rename_many.c +++ b/lustre/tests/rename_many.c @@ -71,9 +71,9 @@ extern int optind; int main(int argc, char *argv[]) { unsigned long n; - char msg[100], c, *end = NULL; + char msg[100], *end = NULL; int h1, h2; - int i; + int i, c; while ((c = getopt(argc, argv, "cf:n:rs:vx")) != EOF) { switch(c) { diff --git a/lustre/tests/statmany.c b/lustre/tests/statmany.c index 7d4d2c9..af7c41f 100644 --- a/lustre/tests/statmany.c +++ b/lustre/tests/statmany.c @@ -44,10 +44,10 @@ static int usage(char *prog, FILE *out) int main(int argc, char ** argv) { - long i, count, iter = LONG_MAX, mode = 0, offset; + long i, c, count, iter = LONG_MAX, mode = 0, offset; long int start, length = LONG_MAX, last, rc = 0; char parent[4096], *t; - char c, *prog = argv[0], *base; + char *prog = argv[0], *base; int seed = 0; int fd = -1; diff --git a/lustre/tests/test_brw.c b/lustre/tests/test_brw.c index 54126a0..ecefe92 100644 --- a/lustre/tests/test_brw.c +++ b/lustre/tests/test_brw.c @@ -118,13 +118,26 @@ int main(int argc, char **argv) flags = O_RDWR | O_CREAT; } if (strchr(argv[3], 'd')) { +#ifdef O_DIRECT flags |= O_DIRECT; +#else + fprintf(stderr, + "%s: O_DIRECT not supported in this build\n", + argv[0]); + exit(1); +#endif } if (!cmd) usage(argv[0]); } else { cmd = READ | WRITE; - flags = O_RDWR | O_CREAT | O_DIRECT; + flags = O_RDWR | O_CREAT; +#ifdef O_DIRECT + flags |= O_DIRECT; +#else + fprintf(stderr, "%s: warning: not setting O_DIRECT\n", + argv[0]); +#endif } if (argc >= 5) { @@ -150,7 +163,12 @@ int main(int argc, char **argv) } printf("%s: %s on %s(objid "LPX64") for "LPU64"x%ld pages \n", - argv[0], flags & O_DIRECT ? "directio" : "i/o", + argv[0], +#ifdef O_DIRECT + flags & O_DIRECT ? "directio" : "i/o", +#else + "i/o", +#endif argv[1], objid, count, pg_vec); fd = open(argv[1], flags | O_LARGEFILE); diff --git a/lustre/utils/lconf b/lustre/utils/lconf index 1777fd0..2343844 100755 --- a/lustre/utils/lconf +++ b/lustre/utils/lconf @@ -1,4 +1,6 @@ #!/usr/bin/env python +# -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- +# vim:expandtab:shiftwidth=8:tabstop=8: # # Copyright (C) 2002-2003 Cluster File Systems, Inc. 
# Authors: Robert Read @@ -985,18 +987,13 @@ def sys_get_elan_position_file(): return "" def sys_get_local_nid(net_type, wildcard, cluster_id): - """Return the local nid.""" - local = "" - if sys_get_elan_position_file() and net_type == 'elan': - local = sys_get_local_address('elan', '*', cluster_id) - else: - local = sys_get_local_address(net_type, wildcard, cluster_id) - return local - -def sys_get_local_address(net_type, wildcard, cluster_id): """Return the local address for the network type.""" local = "" - if net_type in ('tcp','openib','iib','vib','ra'): + + # don't need a real nid for config log - client will replace (bug5619) + if config.record: + local = "54321" + elif net_type in ('tcp','openib','iib','vib','ra'): if ':' in wildcard: iface, star = string.split(wildcard, ':') local = if2addr(iface) @@ -1234,7 +1231,7 @@ class Network(Module): if len(self.hostaddr) == 0: self.hostaddr.append(self.nid) if '*' in self.hostaddr[0]: - self.hostaddr[0] = sys_get_local_address(self.net_type, self.hostaddr[0], self.cluster_id) + self.hostaddr[0] = sys_get_local_nid(self.net_type, self.hostaddr[0], self.cluster_id) if not self.hostaddr[0]: panic("unable to set hostaddr for", self.net_type, self.hostaddr[0], self.cluster_id) debug("hostaddr:", self.hostaddr[0]) @@ -1672,10 +1669,14 @@ class MDSDEV(Module): client_name = node_db.getName() for prof_uuid in node_db.get_refs('profile'): prof_db = node_db.lookup(prof_uuid) - # refactor this into a funtion to test "clientness" - # of a node. + # refactor this into a function to test "clientness" of a node. for ref_class, ref_uuid in prof_db.get_all_refs(): if ref_class in ('mountpoint','echoclient'): + thing = self.db.lookup(ref_uuid) + fs_uuid = thing.get_first_ref('filesystem') + if not fs_uuid in self.filesystem_uuids: + continue + debug("recording", client_name) old_noexec = config.noexec config.noexec = 0 @@ -1947,20 +1948,23 @@ class Client(Module): if is_prepared(self.name): self.cleanup() try: - srv = choose_local_server(self.get_servers()) - if srv: + srv_list = find_local_servers(self.get_servers()) + for srv in srv_list: lctl.connect(srv) - else: - routes = find_route(self.get_servers()) - if len(routes) == 0: - panic("no route to", self.target_uuid) - for (srv, r) in routes: - lctl.add_route_host(r[0], srv.nid_uuid, r[1], r[3]) + + routes = find_route(self.get_servers()) + for (srv, r) in routes: + lctl.add_route_host(r[0], srv.nid_uuid, r[1], r[3]) + srv_list.append(srv) + + if len(srv_list) == 0: + panic("no local servers and no route to", self.target_uuid) except CommandError, e: if not ignore_connect_failure: raise e - if srv: + if srv_list: + srv = srv_list[0] if self.target_uuid in config.inactive and self.permits_inactive(): debug("%s inactive" % self.target_uuid) inactive_p = "inactive" @@ -1970,17 +1974,20 @@ class Client(Module): lctl.newdev(self.module, self.name, self.uuid, setup ="%s %s %s %s" % (self.target_uuid, srv.nid_uuid, inactive_p, self.mgmt_name)) + else: + panic("Unable to create OSC for ", self.target_uuid) def cleanup(self): if is_prepared(self.name): Module.cleanup(self) try: - srv = choose_local_server(self.get_servers()) - if srv: + srv_list = find_local_servers(self.get_servers()) + for srv in srv_list: lctl.disconnect(srv) - else: - for (srv, r) in find_route(self.get_servers()): - lctl.del_route_host(r[0], srv.nid_uuid, r[1], r[3]) + + routes = find_route(self.get_servers()) + for (srv, r) in routes: + lctl.del_route_host(r[0], srv.nid_uuid, r[1], r[3]) except CommandError, e: log(self.module_name, 
"cleanup failed: ", self.name) e.dump() @@ -2342,10 +2349,13 @@ def find_local_routes(lustre): debug("find_local_routes:", local_routes) -def choose_local_server(srv_list): +def find_local_servers(srv_list): + result = [] + for srv in srv_list: if local_cluster(srv.net_type, srv.cluster_id): - return srv + result.append(srv) + return result def local_cluster(net_type, cluster_id): for cluster in local_clusters: @@ -2581,27 +2591,28 @@ def doRecovery(lustreDB, lctl, tgt_uuid, client_uuid, nid_uuid): if not new_uuid: raise Lustre.LconfError("doRecovery: no active target found for: " + tgt_uuid) - net = choose_local_server(get_ost_net(lustreDB, new_uuid)) - if not net: + srv_list = find_local_servers(get_ost_net(lustreDB, new_uuid)) + if not srv_list[0]: raise Lustre.LconfError("Unable to find a connection to:" + new_uuid) - log("Reconnecting", tgt_uuid, " to ", net.nid_uuid); - try: - oldnet = get_server_by_nid_uuid(lustreDB, nid_uuid) - lustreDB.close() - if oldnet: - lctl.disconnect(oldnet) - except CommandError, e: - log("recover: disconnect", nid_uuid, "failed: ") - e.dump() + for srv in srv_list: + log("Reconnecting", tgt_uuid, " to ", srv.nid_uuid); + try: + oldsrv = get_server_by_nid_uuid(lustreDB, nid_uuid) + lustreDB.close() + if oldsrv: + lctl.disconnect(oldsrv) + except CommandError, e: + log("recover: disconnect", nid_uuid, "failed: ") + e.dump() - try: - lctl.connect(net) - except CommandError, e: - log("recover: connect failed") - e.dump() + try: + lctl.connect(srv) + except CommandError, e: + log("recover: connect failed") + e.dump() - lctl.recover(client_uuid, net.nid_uuid) + lctl.recover(client_uuid, srv.nid_uuid) def setupModulePath(cmd, portals_dir = PORTALS_DIR): diff --git a/lustre/utils/lfs.c b/lustre/utils/lfs.c index 659ad92..fdce3d6 100644 --- a/lustre/utils/lfs.c +++ b/lustre/utils/lfs.c @@ -40,6 +40,8 @@ #include "parser.h" #include "obdctl.h" +unsigned int portal_subsystem_debug = 0; + /* all functions */ static int lfs_setstripe(int argc, char **argv); static int lfs_find(int argc, char **argv); diff --git a/lustre/utils/liblustreapi.c b/lustre/utils/liblustreapi.c index 3e01b25..bb2e949 100644 --- a/lustre/utils/liblustreapi.c +++ b/lustre/utils/liblustreapi.c @@ -89,8 +89,8 @@ int llapi_file_create(char *name, long stripe_size, int stripe_offset, if (errno != EEXIST && errno != EALREADY) errmsg = strerror(errno); - fprintf(stderr, "error on ioctl for '%s' (%d): %s\n", - name, fd, errmsg); + fprintf(stderr, "error on ioctl "LPX64" for '%s' (%d): %s\n", + (__u64)LL_IOC_LOV_SETSTRIPE, name, fd, errmsg); rc = -errno; } if (close(fd) < 0) { @@ -293,7 +293,8 @@ void llapi_lov_dump_user_lmm(struct find_param *param, char *dname, char *fname) (param->verbose || !param->obduuid)); break; default: - printf("unknown lmm_magic: 0x%08X\n", *(__u32 *)param->lum); + printf("unknown lmm_magic: %#x (expecting %#x)\n", + *(__u32 *)param->lum, LOV_USER_MAGIC_V1); return; } } diff --git a/lustre/utils/llmount.c b/lustre/utils/llmount.c index 65ace13..ab4cb63 100644 --- a/lustre/utils/llmount.c +++ b/lustre/utils/llmount.c @@ -231,10 +231,10 @@ static int parse_route(char *opteq, char *opttgts) * ****************************************************************************/ struct opt_map { - const char *opt; /* option name */ - int skip; /* skip in mtab option string */ - int inv; /* true if flag value should be inverted */ - int mask; /* flag mask value */ + const char *opt; /* option name */ + int skip; /* skip in mtab option string */ + int inv; /* true if flag value should be 
inverted */ + int mask; /* flag mask value */ }; static const struct opt_map opt_map[] = { @@ -382,6 +382,8 @@ set_local(struct lustre_mount_data *lmd) return 1; case SOCKNAL: + /* We need to do this before the mount is started if routing */ + system("/sbin/modprobe ksocknal"); case TCPNAL: case OPENIBNAL: case IIBNAL: @@ -401,6 +403,8 @@ set_local(struct lustre_mount_data *lmd) NULL}; int i = 0; + /* We need to do this before the mount is started if routing */ + system("/sbin/modprobe kqswnal"); do { rc = get_local_elan_id(pfiles[i], buf); } while (rc != 0 && pfiles[++i] != NULL); diff --git a/lustre/utils/lustre_cfg.c b/lustre/utils/lustre_cfg.c index adbc384..33f0264 100644 --- a/lustre/utils/lustre_cfg.c +++ b/lustre/utils/lustre_cfg.c @@ -93,19 +93,18 @@ int jt_lcfg_newdev(int argc, char **argv) int jt_lcfg_attach(int argc, char **argv) { - struct lustre_cfg lcfg; + struct lustre_cfg_bufs bufs; + struct lustre_cfg *lcfg; int rc; - LCFG_INIT(lcfg, LCFG_ATTACH, lcfg_devname); - if (argc != 2 && argc != 3 && argc != 4) return CMD_HELP; - lcfg.lcfg_inllen1 = strlen(argv[1]) + 1; - lcfg.lcfg_inlbuf1 = argv[1]; + lustre_cfg_bufs_reset(&bufs, NULL); + + lustre_cfg_bufs_set_string(&bufs, 1, argv[1]); if (argc >= 3) { - lcfg.lcfg_dev_namelen = strlen(argv[2]) + 1; - lcfg.lcfg_dev_name = argv[2]; + lustre_cfg_bufs_set_string(&bufs, 0, argv[2]); } else { fprintf(stderr, "error: %s: LCFG_ATTACH requires a name\n", jt_cmdname(argv[0])); @@ -113,11 +112,12 @@ int jt_lcfg_attach(int argc, char **argv) } if (argc == 4) { - lcfg.lcfg_inllen2 = strlen(argv[3]) + 1; - lcfg.lcfg_inlbuf2 = argv[3]; + lustre_cfg_bufs_set_string(&bufs, 2, argv[3]); } - rc = lcfg_ioctl(argv[0], OBD_DEV_ID, &lcfg); + lcfg = lustre_cfg_new(LCFG_ATTACH, &bufs); + rc = lcfg_ioctl(argv[0], OBD_DEV_ID, lcfg); + lustre_cfg_free(lcfg); if (rc < 0) { fprintf(stderr, "error: %s: LCFG_ATTACH %s\n", jt_cmdname(argv[0]), strerror(rc = errno)); @@ -143,7 +143,9 @@ int jt_lcfg_attach(int argc, char **argv) int jt_lcfg_setup(int argc, char **argv) { - struct lustre_cfg lcfg; + struct lustre_cfg_bufs bufs; + struct lustre_cfg *lcfg; + int i; int rc; if (lcfg_devname == NULL) { @@ -153,29 +155,18 @@ int jt_lcfg_setup(int argc, char **argv) return -EINVAL; } - LCFG_INIT(lcfg, LCFG_SETUP, lcfg_devname); + lustre_cfg_bufs_reset(&bufs, lcfg_devname); if (argc > 5) return CMD_HELP; - if (argc > 1) { - lcfg.lcfg_inllen1 = strlen(argv[1]) + 1; - lcfg.lcfg_inlbuf1 = argv[1]; - } - if (argc > 2) { - lcfg.lcfg_inllen2 = strlen(argv[2]) + 1; - lcfg.lcfg_inlbuf2 = argv[2]; - } - if (argc > 3) { - lcfg.lcfg_inllen3 = strlen(argv[3]) + 1; - lcfg.lcfg_inlbuf3 = argv[3]; - } - if (argc > 4) { - lcfg.lcfg_inllen4 = strlen(argv[4]) + 1; - lcfg.lcfg_inlbuf4 = argv[4]; + for (i = 1; i < argc; i++) { + lustre_cfg_bufs_set_string(&bufs, i, argv[i]); } - rc = lcfg_ioctl(argv[0], OBD_DEV_ID, &lcfg); + lcfg = lustre_cfg_new(LCFG_SETUP, &bufs); + rc = lcfg_ioctl(argv[0], OBD_DEV_ID, lcfg); + lustre_cfg_free(lcfg); if (rc < 0) fprintf(stderr, "error: %s: %s\n", jt_cmdname(argv[0]), strerror(rc = errno)); @@ -185,7 +176,8 @@ int jt_lcfg_setup(int argc, char **argv) int jt_obd_detach(int argc, char **argv) { - struct lustre_cfg lcfg; + struct lustre_cfg_bufs bufs; + struct lustre_cfg *lcfg; int rc; if (lcfg_devname == NULL) { @@ -195,12 +187,14 @@ int jt_obd_detach(int argc, char **argv) return -EINVAL; } - LCFG_INIT(lcfg, LCFG_DETACH, lcfg_devname); + lustre_cfg_bufs_reset(&bufs, lcfg_devname); if (argc != 1) return CMD_HELP; - rc = lcfg_ioctl(argv[0], OBD_DEV_ID, 
diff --git a/lustre/utils/lustre_cfg.c b/lustre/utils/lustre_cfg.c
index adbc384..33f0264 100644
--- a/lustre/utils/lustre_cfg.c
+++ b/lustre/utils/lustre_cfg.c
@@ -93,19 +93,18 @@ int jt_lcfg_newdev(int argc, char **argv)
 
 int jt_lcfg_attach(int argc, char **argv)
 {
-        struct lustre_cfg lcfg;
+        struct lustre_cfg_bufs bufs;
+        struct lustre_cfg *lcfg;
         int rc;
 
-        LCFG_INIT(lcfg, LCFG_ATTACH, lcfg_devname);
-
         if (argc != 2 && argc != 3 && argc != 4)
                 return CMD_HELP;
 
-        lcfg.lcfg_inllen1 = strlen(argv[1]) + 1;
-        lcfg.lcfg_inlbuf1 = argv[1];
+        lustre_cfg_bufs_reset(&bufs, NULL);
+
+        lustre_cfg_bufs_set_string(&bufs, 1, argv[1]);
         if (argc >= 3) {
-                lcfg.lcfg_dev_namelen = strlen(argv[2]) + 1;
-                lcfg.lcfg_dev_name = argv[2];
+                lustre_cfg_bufs_set_string(&bufs, 0, argv[2]);
         } else {
                 fprintf(stderr, "error: %s: LCFG_ATTACH requires a name\n",
                         jt_cmdname(argv[0]));
@@ -113,11 +112,12 @@ int jt_lcfg_attach(int argc, char **argv)
         }
 
         if (argc == 4) {
-                lcfg.lcfg_inllen2 = strlen(argv[3]) + 1;
-                lcfg.lcfg_inlbuf2 = argv[3];
+                lustre_cfg_bufs_set_string(&bufs, 2, argv[3]);
         }
 
-        rc = lcfg_ioctl(argv[0], OBD_DEV_ID, &lcfg);
+        lcfg = lustre_cfg_new(LCFG_ATTACH, &bufs);
+        rc = lcfg_ioctl(argv[0], OBD_DEV_ID, lcfg);
+        lustre_cfg_free(lcfg);
         if (rc < 0) {
                 fprintf(stderr, "error: %s: LCFG_ATTACH %s\n",
                         jt_cmdname(argv[0]), strerror(rc = errno));
@@ -143,7 +143,9 @@ int jt_lcfg_attach(int argc, char **argv)
 
 int jt_lcfg_setup(int argc, char **argv)
 {
-        struct lustre_cfg lcfg;
+        struct lustre_cfg_bufs bufs;
+        struct lustre_cfg *lcfg;
+        int i;
         int rc;
 
         if (lcfg_devname == NULL) {
@@ -153,29 +155,18 @@ int jt_lcfg_setup(int argc, char **argv)
                 return -EINVAL;
         }
 
-        LCFG_INIT(lcfg, LCFG_SETUP, lcfg_devname);
+        lustre_cfg_bufs_reset(&bufs, lcfg_devname);
 
         if (argc > 5)
                 return CMD_HELP;
 
-        if (argc > 1) {
-                lcfg.lcfg_inllen1 = strlen(argv[1]) + 1;
-                lcfg.lcfg_inlbuf1 = argv[1];
-        }
-        if (argc > 2) {
-                lcfg.lcfg_inllen2 = strlen(argv[2]) + 1;
-                lcfg.lcfg_inlbuf2 = argv[2];
-        }
-        if (argc > 3) {
-                lcfg.lcfg_inllen3 = strlen(argv[3]) + 1;
-                lcfg.lcfg_inlbuf3 = argv[3];
-        }
-        if (argc > 4) {
-                lcfg.lcfg_inllen4 = strlen(argv[4]) + 1;
-                lcfg.lcfg_inlbuf4 = argv[4];
+        for (i = 1; i < argc; i++) {
+                lustre_cfg_bufs_set_string(&bufs, i, argv[i]);
         }
 
-        rc = lcfg_ioctl(argv[0], OBD_DEV_ID, &lcfg);
+        lcfg = lustre_cfg_new(LCFG_SETUP, &bufs);
+        rc = lcfg_ioctl(argv[0], OBD_DEV_ID, lcfg);
+        lustre_cfg_free(lcfg);
         if (rc < 0)
                 fprintf(stderr, "error: %s: %s\n", jt_cmdname(argv[0]),
                         strerror(rc = errno));
@@ -185,7 +176,8 @@ int jt_lcfg_setup(int argc, char **argv)
 
 int jt_obd_detach(int argc, char **argv)
 {
-        struct lustre_cfg lcfg;
+        struct lustre_cfg_bufs bufs;
+        struct lustre_cfg *lcfg;
         int rc;
 
         if (lcfg_devname == NULL) {
@@ -195,12 +187,14 @@ int jt_obd_detach(int argc, char **argv)
                 return -EINVAL;
         }
 
-        LCFG_INIT(lcfg, LCFG_DETACH, lcfg_devname);
+        lustre_cfg_bufs_reset(&bufs, lcfg_devname);
 
         if (argc != 1)
                 return CMD_HELP;
 
-        rc = lcfg_ioctl(argv[0], OBD_DEV_ID, &lcfg);
+        lcfg = lustre_cfg_new(LCFG_DETACH, &bufs);
+        rc = lcfg_ioctl(argv[0], OBD_DEV_ID, lcfg);
+        lustre_cfg_free(lcfg);
         if (rc < 0)
                 fprintf(stderr, "error: %s: %s\n", jt_cmdname(argv[0]),
                         strerror(rc = errno));
@@ -210,7 +204,8 @@ int jt_obd_detach(int argc, char **argv)
 
 int jt_obd_cleanup(int argc, char **argv)
 {
-        struct lustre_cfg lcfg;
+        struct lustre_cfg_bufs bufs;
+        struct lustre_cfg *lcfg;
         char force = 'F';
         char failover = 'A';
         char flags[3];
@@ -224,7 +219,7 @@ int jt_obd_cleanup(int argc, char **argv)
                 return -EINVAL;
         }
 
-        LCFG_INIT(lcfg, LCFG_CLEANUP, lcfg_devname);
+        lustre_cfg_bufs_reset(&bufs, lcfg_devname);
 
         if (argc < 1 || argc > 3)
                 return CMD_HELP;
@@ -239,11 +234,13 @@ int jt_obd_cleanup(int argc, char **argv)
                 return CMD_HELP;
         }
 
-        lcfg.lcfg_inllen1 = flag_cnt;
-        if (flag_cnt)
-                lcfg.lcfg_inlbuf1 = flags;
+        if (flag_cnt) {
+                lustre_cfg_bufs_set(&bufs, 1, flags, flag_cnt);
+        }
 
-        rc = lcfg_ioctl(argv[0], OBD_DEV_ID, &lcfg);
+        lcfg = lustre_cfg_new(LCFG_CLEANUP, &bufs);
+        rc = lcfg_ioctl(argv[0], OBD_DEV_ID, lcfg);
+        lustre_cfg_free(lcfg);
         if (rc < 0)
                 fprintf(stderr, "error: %s: %s\n", jt_cmdname(argv[0]),
                         strerror(rc = errno));
@@ -256,15 +253,23 @@ int do_add_uuid(char * func, char *uuid, ptl_nid_t nid, int nal)
 {
         char tmp[64];
         int rc;
-        struct lustre_cfg lcfg;
+        struct lustre_cfg_bufs bufs;
+        struct lustre_cfg *lcfg;
+
+        lustre_cfg_bufs_reset(&bufs, lcfg_devname);
+        if (uuid)
+                lustre_cfg_bufs_set_string(&bufs, 1, uuid);
 
-        LCFG_INIT(lcfg, LCFG_ADD_UUID, lcfg_devname);
-        lcfg.lcfg_nid = nid;
-        lcfg.lcfg_inllen1 = strlen(uuid) + 1;
-        lcfg.lcfg_inlbuf1 = uuid;
-        lcfg.lcfg_nal = nal;
+        lcfg = lustre_cfg_new(LCFG_ADD_UUID, &bufs);
+        lcfg->lcfg_nid = nid;
+        lcfg->lcfg_nal = nal;
 
-        rc = lcfg_ioctl(func, OBD_DEV_ID, &lcfg);
+#if 0
+        fprintf(stderr, "adding\tnal: %d\tnid: %d\tuuid: %s\n",
+                lcfg->lcfg_nid, lcfg->lcfg_nal, uuid);
+#endif
+        rc = lcfg_ioctl(func, OBD_DEV_ID, lcfg);
+        lustre_cfg_free(lcfg);
         if (rc) {
                 fprintf(stderr, "IOC_PORTAL_ADD_UUID failed: %s\n",
                         strerror(errno));
@@ -307,22 +312,21 @@ int obd_add_uuid(char *uuid, ptl_nid_t nid, int nal)
 int jt_lcfg_del_uuid(int argc, char **argv)
 {
         int rc;
-        struct lustre_cfg lcfg;
+        struct lustre_cfg_bufs bufs;
+        struct lustre_cfg *lcfg;
 
         if (argc != 2) {
                 fprintf(stderr, "usage: %s <uuid>\n", argv[0]);
                 return 0;
         }
 
-        LCFG_INIT(lcfg, LCFG_DEL_UUID, lcfg_devname);
-
+        lustre_cfg_bufs_reset(&bufs, lcfg_devname);
         if (strcmp (argv[1], "_all_"))
-        {
-                lcfg.lcfg_inllen1 = strlen(argv[1]) + 1;
-                lcfg.lcfg_inlbuf1 = argv[1];
-        }
+                lustre_cfg_bufs_set_string(&bufs, 1, argv[1]);
 
-        rc = lcfg_ioctl(argv[0], OBD_DEV_ID, &lcfg);
+        lcfg = lustre_cfg_new(LCFG_DEL_UUID, &bufs);
+        rc = lcfg_ioctl(argv[0], OBD_DEV_ID, lcfg);
+        lustre_cfg_free(lcfg);
         if (rc) {
                 fprintf(stderr, "IOC_PORTAL_DEL_UUID failed: %s\n",
                         strerror(errno));
@@ -333,14 +337,13 @@ int jt_lcfg_del_uuid(int argc, char **argv)
 
 int jt_lcfg_lov_setup(int argc, char **argv)
 {
-        struct lustre_cfg lcfg;
+        struct lustre_cfg_bufs bufs;
+        struct lustre_cfg *lcfg;
         struct lov_desc desc;
-        struct obd_uuid *uuidarray, *ptr;
+        struct obd_uuid *uuidarray = NULL, *ptr;
         int rc, i;
         char *end;
 
-        LCFG_INIT(lcfg, LCFG_SETUP, lcfg_devname);
-
         if (argc <= 6)
                 return CMD_HELP;
 
@@ -354,6 +357,7 @@ int jt_lcfg_lov_setup(int argc, char **argv)
         memset(&desc, 0, sizeof(desc));
         obd_str2uuid(&desc.ld_uuid, argv[1]);
         desc.ld_tgt_count = argc - 6;
+        desc.ld_magic = LOV_DESC_MAGIC;
         desc.ld_default_stripe_count = strtoul(argv[2], &end, 0);
         if (*end) {
                 fprintf(stderr, "error: %s: bad default stripe count '%s'\n",
@@ -422,12 +426,14 @@ int jt_lcfg_lov_setup(int argc, char **argv)
                 strcpy((char *)ptr, argv[i]);
         }
 
-        lcfg.lcfg_inllen1 = sizeof(desc);
-        lcfg.lcfg_inlbuf1 = (char *)&desc;
-        lcfg.lcfg_inllen2 = desc.ld_tgt_count * sizeof(*uuidarray);
-        lcfg.lcfg_inlbuf2 = (char *)uuidarray;
+        lustre_cfg_bufs_reset(&bufs, lcfg_devname);
+        lustre_cfg_bufs_set(&bufs, 1, &desc, sizeof(desc));
+        lustre_cfg_bufs_set(&bufs, 2, uuidarray,
+                            desc.ld_tgt_count * sizeof(*uuidarray));
 
-        rc = lcfg_ioctl(argv[0], OBD_DEV_ID, &lcfg);
+        lcfg = lustre_cfg_new(LCFG_SETUP, &bufs);
+        rc = lcfg_ioctl(argv[0], OBD_DEV_ID, lcfg);
+        lustre_cfg_free(lcfg);
         if (rc)
                 fprintf(stderr, "error: %s: ioctl error: %s\n",
                         jt_cmdname(argv[0]), strerror(rc = errno));
@@ -439,74 +445,71 @@ out:
 
 int jt_lcfg_mount_option(int argc, char **argv)
 {
         int rc;
-        struct lustre_cfg lcfg;
-
-        LCFG_INIT(lcfg, LCFG_MOUNTOPT, lcfg_devname);
+        struct lustre_cfg_bufs bufs;
+        struct lustre_cfg *lcfg;
+        int i;
 
         if (argc < 3 || argc > 4)
                 return CMD_HELP;
 
-        /* profile name */
-        lcfg.lcfg_inllen1 = strlen(argv[1]) + 1;
-        lcfg.lcfg_inlbuf1 = argv[1];
-        /* osc name */
-        lcfg.lcfg_inllen2 = strlen(argv[2]) + 1;
-        lcfg.lcfg_inlbuf2 = argv[2];
-        if (argc == 4) {
-                /* mdc name */
-                lcfg.lcfg_inllen3 = strlen(argv[3]) + 1;
-                lcfg.lcfg_inlbuf3 = argv[3];
-        }
-        rc = lcfg_ioctl(argv[0], OBD_DEV_ID, &lcfg);
+        lustre_cfg_bufs_reset(&bufs, lcfg_devname);
+
+        for (i = 1; i < argc; i++)
+                lustre_cfg_bufs_set_string(&bufs, i, argv[i]);
+
+        lcfg = lustre_cfg_new(LCFG_MOUNTOPT, &bufs);
+        rc = lcfg_ioctl(argv[0], OBD_DEV_ID, lcfg);
+        lustre_cfg_free(lcfg);
         if (rc < 0) {
                 fprintf(stderr, "error: %s: %s\n", jt_cmdname(argv[0]),
                         strerror(rc = errno));
         }
-
         return rc;
 }
 
 int jt_lcfg_del_mount_option(int argc, char **argv)
 {
         int rc;
-        struct lustre_cfg lcfg;
-
-        LCFG_INIT(lcfg, LCFG_DEL_MOUNTOPT, lcfg_devname);
+        struct lustre_cfg_bufs bufs;
+        struct lustre_cfg *lcfg;
 
         if (argc != 2)
                 return CMD_HELP;
 
+        lustre_cfg_bufs_reset(&bufs, lcfg_devname);
+
         /* profile name */
-        lcfg.lcfg_inllen1 = strlen(argv[1]) + 1;
-        lcfg.lcfg_inlbuf1 = argv[1];
+        lustre_cfg_bufs_set_string(&bufs, 1, argv[1]);
 
-        rc = lcfg_ioctl(argv[0], OBD_DEV_ID, &lcfg);
+        lcfg = lustre_cfg_new(LCFG_DEL_MOUNTOPT, &bufs);
+        rc = lcfg_ioctl(argv[0], OBD_DEV_ID, lcfg);
+        lustre_cfg_free(lcfg);
         if (rc < 0) {
                 fprintf(stderr, "error: %s: %s\n", jt_cmdname(argv[0]),
                         strerror(rc = errno));
         }
-
         return rc;
 }
 
 int jt_lcfg_set_timeout(int argc, char **argv)
 {
         int rc;
-        struct lustre_cfg lcfg;
-
-        LCFG_INIT(lcfg, LCFG_SET_TIMEOUT, lcfg_devname);
+        struct lustre_cfg_bufs bufs;
+        struct lustre_cfg *lcfg;
 
         if (argc != 2)
                 return CMD_HELP;
 
-        lcfg.lcfg_num = atoi(argv[1]);
+        lustre_cfg_bufs_reset(&bufs, lcfg_devname);
+        lcfg = lustre_cfg_new(LCFG_SET_TIMEOUT, &bufs);
+        lcfg->lcfg_num = atoi(argv[1]);
 
-        rc = lcfg_ioctl(argv[0], OBD_DEV_ID, &lcfg);
+        rc = lcfg_ioctl(argv[0], OBD_DEV_ID, lcfg);
+        lustre_cfg_free(lcfg);
         if (rc < 0) {
                 fprintf(stderr, "error: %s: %s\n", jt_cmdname(argv[0]),
                         strerror(rc = errno));
         }
-
         return rc;
 }
 
@@ -514,23 +517,23 @@ int jt_lcfg_set_timeout(int argc, char **argv)
 int jt_lcfg_set_lustre_upcall(int argc, char **argv)
 {
         int rc;
-        struct lustre_cfg lcfg;
-
-        LCFG_INIT(lcfg, LCFG_SET_UPCALL, lcfg_devname);
+        struct lustre_cfg_bufs bufs;
+        struct lustre_cfg *lcfg;
 
         if (argc != 2)
                 return CMD_HELP;
 
+        lustre_cfg_bufs_reset(&bufs, lcfg_devname);
+
         /* profile name */
-        lcfg.lcfg_inllen1 = strlen(argv[1]) + 1;
-        lcfg.lcfg_inlbuf1 = argv[1];
+        lustre_cfg_bufs_set_string(&bufs, 1, argv[1]);
 
-        rc = lcfg_ioctl(argv[0], OBD_DEV_ID, &lcfg);
+        lcfg = lustre_cfg_new(LCFG_SET_UPCALL, &bufs);
+        rc = lcfg_ioctl(argv[0], OBD_DEV_ID, lcfg);
+        lustre_cfg_free(lcfg);
         if (rc < 0) {
                 fprintf(stderr, "error: %s: %s\n", jt_cmdname(argv[0]),
                         strerror(rc = errno));
         }
-
         return rc;
 }
-
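Every lustre_cfg.c conversion above follows the same pattern: collect pointer/length pairs in a stack-allocated lustre_cfg_bufs, let lustre_cfg_new() pack them into one contiguous lustre_cfg record, hand that record to lcfg_ioctl(), and free it. A minimal sketch of a new-style caller follows; example_lcfg_send, "my-device", and "some-arg" are illustrative placeholders, not names from this commit:

/* Sketch: the packing pattern shared by the converted jt_lcfg_* commands.
 * The device name lands in buffer 0, positional arguments in buffers 1+. */
static int example_lcfg_send(void)
{
        struct lustre_cfg_bufs bufs;
        struct lustre_cfg *lcfg;
        int rc;

        lustre_cfg_bufs_reset(&bufs, "my-device");        /* buffer 0 */
        lustre_cfg_bufs_set_string(&bufs, 1, "some-arg"); /* buffer 1 */

        lcfg = lustre_cfg_new(LCFG_SETUP, &bufs);  /* allocate and pack once */
        rc = lcfg_ioctl("example", OBD_DEV_ID, lcfg);
        lustre_cfg_free(lcfg);                     /* freed on every path */
        return rc;
}

lustre_cfg_new() and lustre_cfg_free() are not themselves in this diff. Given the lcfg_buflens[0] flexible array in the new struct and the lustre_cfg_len() call site in the obd.c hunk below, the total record length is presumably the rounded header, including the variable-length buflens array, plus each buffer rounded to wire alignment, along these lines (an assumption, not the shipped code; size_round() is the existing alignment helper):

/* Assumed shape of lustre_cfg_len(). */
static inline int example_cfg_len(uint32_t bufcount, uint32_t *buflens)
{
        uint32_t i;
        int len;

        len = size_round(offsetof(struct lustre_cfg, lcfg_buflens[bufcount]));
        for (i = 0; i < bufcount; i++)
                len += size_round(buflens[i]);  /* each buffer is padded */
        return len;
}

Because the record is built directly in its wire layout, lcfg_ioctl() below can point data.ioc_pbuf1 at it as-is, which is what lets the old 8 KB staging buffer and the lustre_cfg_pack()/lustre_cfg_unpack() round trip be deleted.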
diff --git a/lustre/utils/obd.c b/lustre/utils/obd.c
index 24bd1eb..2190928 100644
--- a/lustre/utils/obd.c
+++ b/lustre/utils/obd.c
@@ -136,33 +136,22 @@ int obd_record(enum cfg_record_type type, int len, void *ptr)
 int lcfg_ioctl(char * func, int dev_id, struct lustre_cfg *lcfg)
 {
         int opc;
-        char lcfg_rawbuf[8192];
-        char * lcfg_buf= lcfg_rawbuf;
         struct obd_ioctl_data data;
-        int len;
         int rc;
 
-        memset(lcfg_buf, 0, sizeof(lcfg_rawbuf));
-        if (lustre_cfg_pack(lcfg, &lcfg_buf, sizeof(lcfg_rawbuf), &len)) {
-                fprintf(stderr, "error: %s: invalid ioctl\n",
-                        jt_cmdname(func));
-                return -2;
-        }
-
         IOC_INIT(data);
         data.ioc_type = LUSTRE_CFG_TYPE;
-        data.ioc_plen1 = len;
-        data.ioc_pbuf1 = lcfg_buf;
+        data.ioc_plen1 = lustre_cfg_len(lcfg->lcfg_bufcount,
+                                        lcfg->lcfg_buflens);
+        data.ioc_pbuf1 = (void *)lcfg;
         IOC_PACK(func, data);
 
-        if (jt_recording)
+        if (jt_recording) {
                 opc = OBD_IOC_DORECORD;
-        else
+        } else {
                 opc = OBD_IOC_PROCESS_CFG;
-
+        }
         rc = l_ioctl(dev_id, opc, buf);
-        if (rc == 0)
-                rc = lustre_cfg_unpack(lcfg, lcfg_buf, sizeof(lcfg_rawbuf));
 
         return rc;
 }
diff --git a/lustre/utils/wirecheck.c b/lustre/utils/wirecheck.c
index 6b50727..907db58 100644
--- a/lustre/utils/wirecheck.c
+++ b/lustre/utils/wirecheck.c
@@ -100,10 +100,10 @@ check_obdo(void)
         CHECK_STRUCT(obdo);
         CHECK_MEMBER(obdo, o_id);
         CHECK_MEMBER(obdo, o_gr);
-        CHECK_MEMBER(obdo, o_atime);
+        CHECK_MEMBER(obdo, o_size);
         CHECK_MEMBER(obdo, o_mtime);
+        CHECK_MEMBER(obdo, o_atime);
         CHECK_MEMBER(obdo, o_ctime);
-        CHECK_MEMBER(obdo, o_size);
         CHECK_MEMBER(obdo, o_blocks);
         CHECK_MEMBER(obdo, o_grant);
         CHECK_MEMBER(obdo, o_blksize);
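The wirecheck.c reordering above and the regenerated wiretest.c assertions below record an on-the-wire change to struct obdo: o_size moves up beside o_id and o_gr, and the three timestamps become contiguous. The asserted offsets imply a layout like the sketch below; only members covered by the visible assertions are shown, and o_id at offset 0 is an assumption:

/* Layout implied by the LASSERTF offsets in the wiretest.c hunk below. */
struct obdo_layout_sketch {
        __u64 o_id;     /* offset  0 (assumed) */
        __u64 o_gr;     /* offset  8 */
        __u64 o_size;   /* offset 16, previously 40 */
        __u64 o_mtime;  /* offset 24, unchanged */
        __u64 o_atime;  /* offset 32, previously 16 */
        __u64 o_ctime;  /* offset 40, previously 32 */
        __u64 o_blocks; /* offset 48, unchanged */
        /* o_grant, o_blksize, ... continue unchanged */
};

Both ends of the wire must agree on these offsets, so the regenerated assertions make a mismatched build fail loudly when the constants are checked rather than corrupting attributes silently.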
diff --git a/lustre/utils/wiretest.c b/lustre/utils/wiretest.c
index 4af12bc..e02b32d 100644
--- a/lustre/utils/wiretest.c
+++ b/lustre/utils/wiretest.c
@@ -25,8 +25,8 @@ int main()
 void lustre_assert_wire_constants(void)
 {
         /* Wire protocol assertions generated by 'wirecheck'
-         * running on Linux schnapps.adilger.int 2.4.22-l32 #4 Thu Jan 8 14:32:57 MST 2004 i686 i686
-         * with gcc version 3.2.2 20030222 (Red Hat Linux 3.2.2-5) */
+         * running on Linux milano 2.4.21-20.EL_87k.6-b_release_1_3_3.200410121845smp #1 SMP Tue Oct
+         * with gcc version 3.3.3 20040412 (Red Hat Linux 3.3.3-7) */
 
         /* Constants... */
@@ -276,22 +276,22 @@ void lustre_assert_wire_constants(void)
                  (long long)offsetof(struct obdo, o_gr));
         LASSERTF((int)sizeof(((struct obdo *)0)->o_gr) == 8, " found %lld\n",
                  (long long)(int)sizeof(((struct obdo *)0)->o_gr));
-        LASSERTF(offsetof(struct obdo, o_atime) == 16, " found %lld\n",
-                 (long long)offsetof(struct obdo, o_atime));
-        LASSERTF((int)sizeof(((struct obdo *)0)->o_atime) == 8, " found %lld\n",
-                 (long long)(int)sizeof(((struct obdo *)0)->o_atime));
+        LASSERTF(offsetof(struct obdo, o_size) == 16, " found %lld\n",
+                 (long long)offsetof(struct obdo, o_size));
+        LASSERTF((int)sizeof(((struct obdo *)0)->o_size) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct obdo *)0)->o_size));
         LASSERTF(offsetof(struct obdo, o_mtime) == 24, " found %lld\n",
                  (long long)offsetof(struct obdo, o_mtime));
         LASSERTF((int)sizeof(((struct obdo *)0)->o_mtime) == 8, " found %lld\n",
                  (long long)(int)sizeof(((struct obdo *)0)->o_mtime));
-        LASSERTF(offsetof(struct obdo, o_ctime) == 32, " found %lld\n",
+        LASSERTF(offsetof(struct obdo, o_atime) == 32, " found %lld\n",
+                 (long long)offsetof(struct obdo, o_atime));
+        LASSERTF((int)sizeof(((struct obdo *)0)->o_atime) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct obdo *)0)->o_atime));
+        LASSERTF(offsetof(struct obdo, o_ctime) == 40, " found %lld\n",
                  (long long)offsetof(struct obdo, o_ctime));
         LASSERTF((int)sizeof(((struct obdo *)0)->o_ctime) == 8, " found %lld\n",
                  (long long)(int)sizeof(((struct obdo *)0)->o_ctime));
-        LASSERTF(offsetof(struct obdo, o_size) == 40, " found %lld\n",
-                 (long long)offsetof(struct obdo, o_size));
-        LASSERTF((int)sizeof(((struct obdo *)0)->o_size) == 8, " found %lld\n",
-                 (long long)(int)sizeof(((struct obdo *)0)->o_size));
         LASSERTF(offsetof(struct obdo, o_blocks) == 48, " found %lld\n",
                  (long long)offsetof(struct obdo, o_blocks));
         LASSERTF((int)sizeof(((struct obdo *)0)->o_blocks) == 8, " found %lld\n",