From fb22bcd4877be9ce5f583e14cd9158662636433c Mon Sep 17 00:00:00 2001 From: green Date: Mon, 5 Dec 2005 11:51:45 +0000 Subject: [PATCH] Updated to b1_4 --- .../patches/ext3-mballoc2-2.6-suse.patch | 30 +- .../patches/ext3-mballoc2-2.6.9-rhel4.patch | 30 +- lustre/ChangeLog | 152 +- lustre/autoconf/lustre-core.m4 | 1 + lustre/autoconf/lustre-version.ac | 32 +- lustre/doc/lctl.8 | 2 +- lustre/doc/lctl.lyx | 2 +- lustre/doc/lmc.1 | 2 +- lustre/doc/lmc.lyx | 2 +- lustre/include/liblustre.h | 1 + lustre/include/linux/Makefile.am | 2 +- lustre/include/linux/lustre_cfg.h | 50 +- lustre/include/linux/lustre_dlm.h | 10 +- lustre/include/linux/lustre_idl.h | 144 +- lustre/include/linux/lustre_import.h | 7 +- lustre/include/linux/lustre_lib.h | 312 ++-- lustre/include/linux/lustre_lite.h | 53 + lustre/include/linux/lustre_log.h | 6 +- lustre/include/linux/lustre_mds.h | 6 +- lustre/include/linux/lustre_net.h | 48 +- lustre/include/linux/lustre_ver.h.in | 23 + lustre/include/linux/obd.h | 202 ++- lustre/include/linux/obd_class.h | 49 +- lustre/include/linux/obd_ost.h | 8 +- .../patches/ext3-mballoc2-2.4.24.patch | 1766 -------------------- .../patches/ext3-mballoc2-2.6-suse.patch | 30 +- .../patches/ext3-mballoc2-2.6.9-rhel4.patch | 30 +- lustre/ldlm/ldlm_lib.c | 73 +- lustre/ldlm/ldlm_lock.c | 5 +- lustre/ldlm/ldlm_lockd.c | 87 +- lustre/ldlm/ldlm_request.c | 27 +- lustre/ldlm/ldlm_resource.c | 4 +- lustre/liblustre/Makefile.am | 2 +- lustre/liblustre/file.c | 22 +- lustre/liblustre/genlib.sh | 3 +- lustre/liblustre/llite_lib.c | 13 +- lustre/liblustre/llite_lib.h | 24 +- lustre/liblustre/rw.c | 53 +- lustre/liblustre/super.c | 61 +- lustre/liblustre/tests/echo_test.c | 2 +- lustre/liblustre/tests/sanity.c | 22 +- lustre/llite/file.c | 2 +- lustre/llite/llite_internal.h | 2 +- lustre/llite/llite_lib.c | 150 +- lustre/llite/namei.c | 3 +- lustre/llite/rw.c | 2 +- lustre/lov/lov_obd.c | 248 ++- lustre/lov/lov_pack.c | 4 +- lustre/lov/lov_request.c | 4 +- 
lustre/lvfs/fsfilt_ext3.c | 43 +- lustre/lvfs/lvfs_linux.c | 20 + lustre/mdc/mdc_request.c | 42 +- lustre/mds/handler.c | 122 +- lustre/mds/mds_fs.c | 33 +- lustre/mds/mds_internal.h | 5 +- lustre/mds/mds_lib.c | 15 +- lustre/mds/mds_lov.c | 55 +- lustre/mds/mds_open.c | 14 +- lustre/mds/mds_reint.c | 11 +- lustre/mds/mds_xattr.c | 5 +- lustre/obdclass/class_obd.c | 6 +- lustre/obdclass/genops.c | 14 +- lustre/obdclass/lprocfs_status.c | 1 + lustre/obdclass/lustre_handles.c | 34 +- lustre/obdclass/lustre_peer.c | 1 + lustre/obdclass/obd_config.c | 49 +- lustre/obdecho/echo_client.c | 7 +- lustre/obdfilter/filter.c | 181 +- lustre/obdfilter/filter_internal.h | 17 +- lustre/obdfilter/filter_io.c | 29 +- lustre/obdfilter/filter_io_24.c | 44 +- lustre/obdfilter/filter_io_26.c | 221 +-- lustre/osc/osc_request.c | 123 +- lustre/ost/ost_handler.c | 168 +- lustre/ost/ost_internal.h | 12 +- lustre/ptlrpc/client.c | 2 +- lustre/ptlrpc/events.c | 2 +- lustre/ptlrpc/import.c | 84 +- lustre/ptlrpc/lproc_ptlrpc.c | 6 - lustre/ptlrpc/pack_generic.c | 324 ++-- lustre/ptlrpc/ptlrpc_internal.h | 10 +- lustre/ptlrpc/ptlrpc_module.c | 3 - lustre/ptlrpc/service.c | 21 +- lustre/tests/conf-sanity.sh | 15 +- lustre/tests/insanity.sh | 2 +- lustre/tests/llmount.sh | 1 + lustre/tests/llrmount.sh | 3 +- lustre/tests/oos.sh | 19 +- lustre/tests/oos2.sh | 2 +- lustre/tests/recovery-small.sh | 3 +- lustre/tests/replay-dual.sh | 4 +- lustre/tests/replay-ost-single.sh | 5 +- lustre/tests/replay-single.sh | 2 +- lustre/tests/sanity.sh | 40 +- lustre/tests/sanityN.sh | 22 +- lustre/tests/test-framework.sh | 4 +- lustre/utils/lconf | 8 +- lustre/utils/lfs.c | 6 +- lustre/utils/llmount.c | 47 +- lustre/utils/llog_reader.c | 44 +- lustre/utils/wirecheck.c | 213 +-- lustre/utils/wirehdr.c | 1 + lustre/utils/wiretest.c | 247 ++- 103 files changed, 2868 insertions(+), 3357 deletions(-) create mode 100644 lustre/include/linux/lustre_ver.h.in delete mode 100644 
lustre/kernel_patches/patches/ext3-mballoc2-2.4.24.patch diff --git a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch index 5decb55..f36b90c 100644 --- a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch +++ b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch @@ -446,31 +446,31 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c +static inline int mb_test_bit(int bit, void *addr) +{ + mb_correct_addr_and_bit(bit,addr); -+ return ext2_test_bit(bit, addr); ++ return ext3_test_bit(bit, addr); +} + +static inline void mb_set_bit(int bit, void *addr) +{ + mb_correct_addr_and_bit(bit,addr); -+ ext2_set_bit(bit, addr); ++ ext3_set_bit(bit, addr); +} + +static inline void mb_set_bit_atomic(int bit, void *addr) +{ + mb_correct_addr_and_bit(bit,addr); -+ ext2_set_bit_atomic(NULL, bit, addr); ++ ext3_set_bit_atomic(NULL, bit, addr); +} + +static inline void mb_clear_bit(int bit, void *addr) +{ + mb_correct_addr_and_bit(bit,addr); -+ ext2_clear_bit(bit, addr); ++ ext3_clear_bit(bit, addr); +} + +static inline void mb_clear_bit_atomic(int bit, void *addr) +{ + mb_correct_addr_and_bit(bit,addr); -+ ext2_clear_bit_atomic(NULL, bit, addr); ++ ext3_clear_bit_atomic(NULL, bit, addr); +} + +static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max) @@ -1015,7 +1015,7 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + i = e3b->bd_bd->bb_first_free; + + while (free && ac->ac_status != AC_STATUS_FOUND) { -+ i = ext2_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i); ++ i = ext3_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i); + if (i >= sb->s_blocksize * 8) { + J_ASSERT(free == 0); + break; @@ -1205,13 +1205,13 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + + if (ac.ac_status == AC_STATUS_BREAK && + !(ac.ac_flags & EXT3_MB_HINT_FIRST)) { -+ /* -+ * We've been searching too long. 
Let's try to allocate -+ * the best chunk we've found so far -+ */ -+ ext3_warning(inode->i_sb, __FUNCTION__, -+ "too long searching: got %d want %d\n", -+ ac.ac_b_ex.fe_len, ac.ac_g_ex.fe_len); ++ /* We've been searching too long. Let's try to allocate ++ * the best chunk we've found so far. */ ++ if (ac.ac_g_ex.fe_len >= 128 && ++ ac.ac_b_ex.fe_len < ac.ac_g_ex.fe_len / 4) ++ ext3_warning(inode->i_sb, __FUNCTION__, ++ "too long searching: got %d want %d\n", ++ ac.ac_b_ex.fe_len, ac.ac_g_ex.fe_len); + ext3_mb_try_best_found(&ac, &e3b); + if (ac.ac_status != AC_STATUS_FOUND) { + /* @@ -1219,7 +1219,7 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + * The only thing we can do is just take first + * found block(s) + */ -+ printk(KERN_ERR "EXT3-fs: and someone won our chunk\n"); ++ mb_debug(KERN_ERR "EXT3-fs: and someone won our chunk\n"); + ac.ac_b_ex.fe_group = 0; + ac.ac_b_ex.fe_start = 0; + ac.ac_b_ex.fe_len = 0; @@ -2413,7 +2413,7 @@ Index: linux-2.6.5-7.201/fs/ext3/super.c {Opt_extents, "extents"}, {Opt_extdebug, "extdebug"}, + {Opt_mballoc, "mballoc"}, -+ {Opt_mballoc, "mbfactor=%u"}, ++ {Opt_mbfactor, "mbfactor=%u"}, {Opt_err, NULL} }; diff --git a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch index fbd014f..72b7926 100644 --- a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch +++ b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch @@ -463,31 +463,31 @@ Index: linux-2.6.9/fs/ext3/mballoc.c +static inline int mb_test_bit(int bit, void *addr) +{ + mb_correct_addr_and_bit(bit,addr); -+ return ext2_test_bit(bit, addr); ++ return ext3_test_bit(bit, addr); +} + +static inline void mb_set_bit(int bit, void *addr) +{ + mb_correct_addr_and_bit(bit,addr); -+ ext2_set_bit(bit, addr); ++ ext3_set_bit(bit, addr); +} + +static inline void mb_set_bit_atomic(int bit, void *addr) +{ + mb_correct_addr_and_bit(bit,addr); -+ ext2_set_bit_atomic(NULL, bit, addr); ++ 
ext3_set_bit_atomic(NULL, bit, addr); +} + +static inline void mb_clear_bit(int bit, void *addr) +{ + mb_correct_addr_and_bit(bit,addr); -+ ext2_clear_bit(bit, addr); ++ ext3_clear_bit(bit, addr); +} + +static inline void mb_clear_bit_atomic(int bit, void *addr) +{ + mb_correct_addr_and_bit(bit,addr); -+ ext2_clear_bit_atomic(NULL, bit, addr); ++ ext3_clear_bit_atomic(NULL, bit, addr); +} + +static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max) @@ -1032,7 +1032,7 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + i = e3b->bd_bd->bb_first_free; + + while (free && ac->ac_status != AC_STATUS_FOUND) { -+ i = ext2_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i); ++ i = ext3_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i); + if (i >= sb->s_blocksize * 8) { + J_ASSERT(free == 0); + break; @@ -1222,13 +1222,13 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + + if (ac.ac_status == AC_STATUS_BREAK && + !(ac.ac_flags & EXT3_MB_HINT_FIRST)) { -+ /* -+ * We've been searching too long. Let's try to allocate -+ * the best chunk we've found so far -+ */ -+ ext3_warning(inode->i_sb, __FUNCTION__, -+ "too long searching: got %d want %d\n", -+ ac.ac_b_ex.fe_len, ac.ac_g_ex.fe_len); ++ /* We've been searching too long. Let's try to allocate ++ * the best chunk we've found so far. 
*/ ++ if (ac.ac_g_ex.fe_len >= 128 && ++ ac.ac_b_ex.fe_len < ac.ac_g_ex.fe_len / 4) ++ ext3_warning(inode->i_sb, __FUNCTION__, ++ "too long searching: got %d want %d\n", ++ ac.ac_b_ex.fe_len, ac.ac_g_ex.fe_len); + ext3_mb_try_best_found(&ac, &e3b); + if (ac.ac_status != AC_STATUS_FOUND) { + /* @@ -1236,7 +1236,7 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + * The only thing we can do is just take first + * found block(s) + */ -+ printk(KERN_ERR "EXT3-fs: and someone won our chunk\n"); ++ mb_debug(KERN_ERR "EXT3-fs: and someone won our chunk\n"); + ac.ac_b_ex.fe_group = 0; + ac.ac_b_ex.fe_start = 0; + ac.ac_b_ex.fe_len = 0; @@ -2428,7 +2428,7 @@ Index: linux-2.6.9/fs/ext3/super.c {Opt_extents, "extents"}, {Opt_extdebug, "extdebug"}, + {Opt_mballoc, "mballoc"}, -+ {Opt_mballoc, "mbfactor=%u"}, ++ {Opt_mbfactor, "mbfactor=%u"}, {Opt_barrier, "barrier=%u"}, {Opt_err, NULL}, {Opt_resize, "resize"}, diff --git a/lustre/ChangeLog b/lustre/ChangeLog index 42526aa..0d08023 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -1,3 +1,5 @@ +------------------------------------------------------------------------------ + 12-31-2005 Cluster File Systems, Inc. * version 1.4.6 * WIRE PROTOCOL CHANGE. This version of Lustre networking WILL NOT @@ -26,7 +28,7 @@ Details : LNET is new networking infrastructure for Lustre, it includes Severity : enhancement Bugzilla : 7982 -Description: Configuration change for the XT3 +Description: Configuration change for the XT3 The PTLLND is now used to run Lustre over Portals on the XT3 The configure option(s) --with-cray-portals are no longer used. Rather --with-portals= is used to @@ -277,6 +279,126 @@ Details : lconf --force (implied by --failover) sets the global obd_timeout other RPCs to time out too quickly. Do not change the global obd_timeout for force cleanup, only set it for DISCONNECT RPCs. 
+Severity : enhancement +Frequency : if MDS is started with down OST +Bugzilla : 9439,5706 +Description: Allow startup/shutdown of an MDS without depending on the + availability of the OSTs. +Details : Asynchronously call mds_lov_synchronize during MDS startup. + Add appropriate locking and lov-osc refcounts for safe + cleaning. Add osc abort_inflight calls in case the + synchronize never started. + +Severity : minor +Frequency : occasional (Cray XT3 only) +Bugzilla : 7305 +Description: root not authorized to access files in CRAY_PORTALS environment +Details : The client process capabilities were not honoured on the MDS in + a CRAY_PORTALS/CRAY_XT3 environment. If the file had previously + been accessed by an authorized user then root was able to access + the file on the local client also. The root user capabilities + are now allowed on the MDS, as this environment has secure UID. + +Severity : minor +Frequency : occasional +Bugzilla : 6449 +Description: ldiskfs "too long searching" message happens too often +Details : A debugging message (otherwise harmless) prints too often on + the OST console. This has been reduced to only happen when + there are fragmentation problems on the filesystem. + +Severity : minor +Frequency : rare +Bugzilla : 9598 +Description: Division by zero in statfs when all OSCs are inactive +Details : lov_get_stripecnt() returns zero due to incorrect order of checks, + lov_statfs divides by value returned by lov_get_stripecnt(). + +Severity : minor +Frequency : common +Bugzilla : 9489, 3273 +Description: First write from each client to each OST was only 4kB in size, + to initialize client writeback cache, which caused sub-optimal + RPCs and poor layout on disk for the first writen file. +Details : Clients now request an initial cache grant at (re)connect time + and so that they can start streaming writes to the cache right + away and always do full-sized RPCs if there is enough data. 
+ If the OST is rebooted the client also re-establishes its grant + so that client cached writes will be honoured under the grant. + +Severity : minor +Frequency : common +Bugzilla : 7198 +Description: Slow ls (and stat(2) syscall) on files residing on IO-loaded OSTs +Details : Now I/O RPCs go to different portal number and (presumably) fast + lock requests (and glimses) and other RPCs get their own service + threads pool that should be able to service those RPCs + immediatelly. + +Severity : enhancement +Bugzilla : 7417 +Description: Ability to exchange lustre version between client and servers and + issue warnings at client side if client is too old. Also for + liblustre clients there is ability to refuse connection of too old + clients. +Details : New 'version' field is added to connect data structure that is + filled with version info. That info is later checked by server and + by client. + +Severity : minor +Frequency : rare, liblustre only. +Bugzilla : 9296, 9581 +Description: Two simultaneous writes from liblustre at offset within same page + might proceed at the same time overwriting eachother with stale + data. +Details : I/O lock withing llu_file_prwv was released too early, before data + actually was hitting the wire. Extended lock-holding time until + server acknowledges receiving data. + +Severity : minor +Frequency : extremely rare. Never observed in practice. +Bugzilla : 9652 +Description: avoid generating lustre_handle cookie of 0. +Details : class_handle_hash() generates handle cookies by incrementing + global counter, and can hit 0 occasionaly (this is unlikely, but + not impossible, because initial value of cookie counter is + selected randonly). Value of 0 is used as a sentinel meaning + "unassigned handle" --- avoid it. Also coalesce two critical + sections in this function into one. 
+ +Severity : enhancement +Bugzilla : 9528 +Description: allow liblustre clients to delegate truncate locking to OST +Details : To avoid overhead of locking, liblustre client instructs OST to + take extent lock in ost_punch() on client's behalf. New connection + flag is added to handle backward compatibility. + +Severity : enhancement +Bugzilla : 4928, 7341, 9758 +Description: allow number of OST service threads to be specified +Details : a module parameter allows the number of OST service threads + to be specified via "options ost ost_num_threads=X" in + /etc/modules.conf or /etc/modutils.conf. + +Severity : major +Frequency : rare +Bugzilla : 9635 +Description: servers crash with bad pointer in target_handle_connect() +Details : In rare cases when a client is reconnecting it was possible that + the connection request was the last reference for that export. + We would temporarily drop the export reference and get a new + one, but this may have been the last reference and the export + was just destroyed. Get new reference before dropping old one. + +Severity : enhancement +Frequency : if client is started with failover MDS +Bugzilla : 9818 +Description: Allow multiple MDS hostnames in the mount command +Details : Try to read the configuration from all specified MDS + hostnames during a client mount in case the "primary" + MDS is down. + + ------------------------------------------------------------------------------ 08-26-2005 Cluster File Systems, Inc. @@ -391,7 +513,7 @@ Details : lconf was attempting to abort recovery on the MDT device and not * bug fixes Severity : major -Frequency : rare (only unsupported configurations with a node running as an +Frequency : rare (only unsupported configurations with a node running as an OST and a client) Bugzilla : 6514, 5137 Description: Mounting a Lustre file system on a node running as an OST could @@ -420,7 +542,7 @@ Details : By default, OSTs will now run in failover mode. 
To return to Severity : enhancement Bugzilla : 1693 Description: Health checks are now provided for MDS and OSTs -Details : Additional detailed health check information on MSD and OSTs +Details : Additional detailed health check information on MSD and OSTs is now provided through the procfs health_check value. Severity : minor @@ -470,7 +592,7 @@ Details : It was possible under high-load situations to have an extent Severity : minor Bugzilla : 7241 -Frequency : filesystems with default stripe_count larger than 77 +Frequency : filesystems with default stripe_count larger than 77 Description: lconf+mke2fs fail when formatting filesystem with > 77 stripes Details : lconf specifies an inode size of 4096 bytes when the default stripe_count is larger than 77. This conflicts with the default @@ -537,7 +659,7 @@ Severity: : enhancement Bugzilla : 3262, 6359 Description: Attempts to reconnect to servers are now more aggressive. Details : This builds on the enhanced upcall-less recovery that was added - in 1.4.2. When trying to reconnect to servers, clients will + in 1.4.2. When trying to reconnect to servers, clients will now try each server in the failover group every 10 seconds. By default, clients would previously try one server every 25 seconds. @@ -547,13 +669,13 @@ Bugzilla : 6371 Description: After recovery, certain operations trigger a failed assertion on a client. Details : Failing over an mds, using lconf -d --failover, while a - client was doing a readdir() call would cause the client to + client was doing a readdir() call would cause the client to LBUG after recovery completed and the readdir() was resent. Severity : enhancement Bugzilla : 6296 Description: Default groups are now added by lconf -Details : You can now run lconf --group without having to +Details : You can now run lconf --group without having to manually add groups with lmc. 
Severity : major @@ -612,7 +734,7 @@ Details : Creating a new file via mkdir or mknod (starting a transaction Severity : minor Frequency : occasional -Description: While starting a server, the fsfilt_ext3 module could not be +Description: While starting a server, the fsfilt_ext3 module could not be loaded. Details : CFS's improved ext3 filesystem is named ldiskfs for 2.6 kernels. Previously, lconf would still use the ext3 name @@ -718,11 +840,11 @@ Description: Changes the "SCSI I/O Stats" kernel patch to default to "enabled" - lconf should create multiple TCP connections from a client (5201) - init scripts are now turned off by default; run chkconfig --on lustre and chkconfig --on lustrefs to use them - - upcalls are no longer needed for clients to recover to failover + - upcalls are no longer needed for clients to recover to failover servers (3262) - add --abort-recovery option to lconf to abort recovery on device startup (6017) - - add support for an arbitrary number of OSTs (3026) + - add support for an arbitrary number of OSTs (3026) - Quota support protocol changes. - forward compatibility changes to wire structs (6007) - rmmod NALs that might be loaded because of /etc/modules.conf (6133) @@ -1500,9 +1622,9 @@ tbd Cluster File Systems, Inc. - fix dbench 2, extN refcount problem (170, 258, 356, 418) - fix double-O_EXCL intent crash (424) - avoid sending multiple lock CANCELs (352) - * Features + * Features - MDS can do multi-client recovery (modulo bugs in new code) - * Documentation + * Documentation - many updates, edits, cleanups 2002-11-18 Phil Schwan @@ -1686,8 +1808,8 @@ tbd Cluster File Systems, Inc. * small changes in the DLM wire protocol 2002-07-25 Peter J. Braam - * version 0_5_1 with some initial stability, - * locking on MD and file I/O. + * version 0_5_1 with some initial stability, + * locking on MD and file I/O. 
* documentation updates * several bug fixes since 0.5.0 * small changes in wire protocol @@ -1721,4 +1843,4 @@ tbd Cluster File Systems, Inc. * move forward to latest Lustre kernel 2002-06-25 Peter Braam - * release version v0_4_1. Hopefully stable on single node use. + * release version v0_4_1. Hopefully stable on single node use. diff --git a/lustre/autoconf/lustre-core.m4 b/lustre/autoconf/lustre-core.m4 index 84d7fc8..d9c5618 100644 --- a/lustre/autoconf/lustre-core.m4 +++ b/lustre/autoconf/lustre-core.m4 @@ -605,6 +605,7 @@ lustre/conf/Makefile lustre/doc/Makefile lustre/include/Makefile lustre/include/linux/Makefile +lustre/include/linux/lustre_ver.h lustre/include/lustre/Makefile lustre/kernel_patches/targets/2.6-suse.target lustre/kernel_patches/targets/2.6-vanilla.target diff --git a/lustre/autoconf/lustre-version.ac b/lustre/autoconf/lustre-version.ac index 8f1e376..257516c 100644 --- a/lustre/autoconf/lustre-version.ac +++ b/lustre/autoconf/lustre-version.ac @@ -1 +1,31 @@ -m4_define([LUSTRE_VERSION],[1.4.5.94]) +m4_define([LUSTRE_MAJOR],[1]) +m4_define([LUSTRE_MINOR],[4]) +m4_define([LUSTRE_PATCH],[5]) +m4_define([LUSTRE_FIX],[94]) + +dnl # 288 stands for 0.0.1.32 , next version with fixes is ok, but next after +dnl # next release candidate/beta would spill this warning already. +m4_define([LUSTRE_VER_ALLOWED_OFFSET],[288]) +m4_define([LUSTRE_VER_OFFSET_WARN],[288]) + +dnl # User editable part ends here. 
----------------------------------------- + +m4_pattern_allow(AC_LUSTRE) +m4_define([LUSTRE_VERSION],m4_if(LUSTRE_FIX,[0],LUSTRE_MAJOR.LUSTRE_MINOR.LUSTR +E_PATCH,LUSTRE_MAJOR.LUSTRE_MINOR.LUSTRE_PATCH.LUSTRE_FIX)) + +[AC_LUSTRE_MAJOR]=LUSTRE_MAJOR +[AC_LUSTRE_MINOR]=LUSTRE_MINOR +[AC_LUSTRE_PATCH]=LUSTRE_PATCH +[AC_LUSTRE_FIX]=LUSTRE_FIX +[AC_LUSTRE_VERSION_STRING]=LUSTRE_VERSION +[AC_LUSTRE_VER_ALLOWED_OFFSET]=LUSTRE_VER_ALLOWED_OFFSET +[AC_LUSTRE_VER_OFFSET_WARN]=LUSTRE_VER_OFFSET_WARN + +AC_SUBST([AC_LUSTRE_MAJOR]) +AC_SUBST([AC_LUSTRE_MINOR]) +AC_SUBST([AC_LUSTRE_PATCH]) +AC_SUBST([AC_LUSTRE_FIX]) +AC_SUBST([AC_LUSTRE_VERSION_STRING]) +AC_SUBST([AC_LUSTRE_VER_ALLOWED_OFFSET]) +AC_SUBST([AC_LUSTRE_VER_OFFSET_WARN]) diff --git a/lustre/doc/lctl.8 b/lustre/doc/lctl.8 index 58e5a80..69c6ece 100644 --- a/lustre/doc/lctl.8 +++ b/lustre/doc/lctl.8 @@ -374,7 +374,7 @@ Finished (success) .B setup -lctl > setup /dev/loop0 extN +lctl > setup /dev/loop0 ldiskfs .br lctl > quit diff --git a/lustre/doc/lctl.lyx b/lustre/doc/lctl.lyx index 087a2cd..c3a769f 100644 --- a/lustre/doc/lctl.lyx +++ b/lustre/doc/lctl.lyx @@ -910,7 +910,7 @@ setup \size small -lctl > setup /dev/loop0 extN +lctl > setup /dev/loop0 ldiskfs \newline lctl > quit \size default diff --git a/lustre/doc/lmc.1 b/lustre/doc/lmc.1 index 0377c33..d755de8 100644 --- a/lustre/doc/lmc.1 +++ b/lustre/doc/lmc.1 @@ -192,7 +192,7 @@ Optional arguement. Name of LOV to which this OSC will be attached. Specify the UUID of the OST device. .TP --fstype -extN|ext3 Optional arguement used to specify the file system type. Default is ext3. +ldiskfs|ext3 Optional arguement used to specify the file system type. Default is ext3. .TP --inode_size Specify new inode size for underlying ext3 file system. diff --git a/lustre/doc/lmc.lyx b/lustre/doc/lmc.lyx index 1c27a15..e42e64b 100644 --- a/lustre/doc/lmc.lyx +++ b/lustre/doc/lmc.lyx @@ -454,7 +454,7 @@ UUID Specify the UUID of the OST device. 
\layout Description --fstype\SpecialChar ~ -extN|ext3 Optional arguement used to specify the file system type. +ldiskfs|ext3 Optional arguement used to specify the file system type. Default is ext3. \layout Description diff --git a/lustre/include/liblustre.h b/lustre/include/liblustre.h index 98669cb..ea0ef65 100644 --- a/lustre/include/liblustre.h +++ b/lustre/include/liblustre.h @@ -291,6 +291,7 @@ typedef __u64 kdev_t; #define SPIN_LOCK_UNLOCKED (spinlock_t) { } #define LASSERT_SPIN_LOCKED(lock) do {} while(0) +#define LASSERT_SEM_LOCKED(sem) do {} while(0) static inline void spin_lock(spinlock_t *l) {return;} static inline void spin_unlock(spinlock_t *l) {return;} diff --git a/lustre/include/linux/Makefile.am b/lustre/include/linux/Makefile.am index fae4b80..6523129 100644 --- a/lustre/include/linux/Makefile.am +++ b/lustre/include/linux/Makefile.am @@ -15,4 +15,4 @@ EXTRA_DIST = lprocfs_status.h lustre_debug.h lustre_ha.h lustre_lib.h \ lustre_export.h lustre_log.h obd_echo.h \ lustre_compat25.h lustre_fsfilt.h lustre_import.h lustre_mds.h obd.h \ lvfs.h lvfs_linux.h lustre_cfg.h lustre_lite.h lustre_idl.h \ - lustre_quota.h lustre_ucache.h + lustre_quota.h lustre_ucache.h lustre_ver.h.in diff --git a/lustre/include/linux/lustre_cfg.h b/lustre/include/linux/lustre_cfg.h index acc8fe8..17e0c41 100644 --- a/lustre/include/linux/lustre_cfg.h +++ b/lustre/include/linux/lustre_cfg.h @@ -47,7 +47,9 @@ enum lcfg_command_type { LCFG_ADD_CONN = 0x00cf00b, LCFG_DEL_CONN = 0x00cf00c, LCFG_LOV_ADD_OBD = 0x00cf00d, - LCFG_LOV_DEL_OBD = 0x00cf00e + LCFG_LOV_DEL_OBD = 0x00cf00e, + LCFG_PARAM = 0x00cf00f, + LCFG_MARKER = 0x00cf010 }; struct lustre_cfg_bufs { @@ -56,6 +58,9 @@ struct lustre_cfg_bufs { uint32_t lcfg_bufcount; }; +/* Mountconf transitional hack, should go away after 1.6 */ +#define LCFG_FLG_MOUNTCONF 0x400 + struct lustre_cfg { uint32_t lcfg_version; uint32_t lcfg_command; @@ -232,13 +237,46 @@ static inline int lustre_cfg_sanity_check(void *buf, int len) 
RETURN(0); } + +#define LMD_MAGIC 0xbdacbd03 +#define LMD_MAGIC_MASK (0xffffff00 & LMD_MAGIC) + +#define lmd_bad_magic(LMDP) \ +({ \ + struct lustre_mount_data *_lmd__ = (LMDP); \ + int _ret__ = 0; \ + if (!_lmd__) { \ + LCONSOLE_ERROR("Missing mount data: " \ + "check that /sbin/mount.lustre is installed.\n");\ + _ret__ = 1; \ + } else if (_lmd__->lmd_magic == LMD_MAGIC) { \ + _ret__ = 0; \ + } else if ((_lmd__->lmd_magic & LMD_MAGIC_MASK) == LMD_MAGIC_MASK) { \ + LCONSOLE_ERROR("You're using an old version of " \ + "/sbin/mount.lustre. Please install version " \ + "1.%d\n", LMD_MAGIC & 0xFF); \ + _ret__ = 1; \ + } else { \ + LCONSOLE_ERROR("Invalid mount data (%#x != %#x): " \ + "check that /sbin/mount.lustre is installed\n", \ + _lmd__->lmd_magic, LMD_MAGIC); \ + _ret__ = 1; \ + } \ + _ret__; \ +}) + +#define MAX_FAILOVER_NIDS 10 + /* Passed by mount */ +/* Any changes in the alignment of elements in this stuct require a change to + LMD_MAGIC */ struct lustre_mount_data { - uint32_t lmd_magic; - uint32_t lmd_flags; - uint64_t lmd_nid; - char lmd_mds[64]; - char lmd_profile[64]; + uint32_t lmd_magic; + uint32_t lmd_flags; + uint16_t lmd_nid_count; /* how many failover nids we have for the MDS */ + lnet_nid_t lmd_nid[MAX_FAILOVER_NIDS]; + char lmd_mds[64]; + char lmd_profile[64]; }; #define LMD_FLG_FLOCK 0x0001 diff --git a/lustre/include/linux/lustre_dlm.h b/lustre/include/linux/lustre_dlm.h index 0721f4b..3251fcf 100644 --- a/lustre/include/linux/lustre_dlm.h +++ b/lustre/include/linux/lustre_dlm.h @@ -122,15 +122,7 @@ typedef enum { #define LCK_COMPAT_NL (LCK_COMPAT_CR | LCK_EX) #define LCK_COMPAT_GROUP (LCK_GROUP | LCK_NL) -static ldlm_mode_t lck_compat_array[] = { - [LCK_EX] LCK_COMPAT_EX, - [LCK_PW] LCK_COMPAT_PW, - [LCK_PR] LCK_COMPAT_PR, - [LCK_CW] LCK_COMPAT_CW, - [LCK_CR] LCK_COMPAT_CR, - [LCK_NL] LCK_COMPAT_NL, - [LCK_GROUP] LCK_COMPAT_GROUP -}; +extern ldlm_mode_t lck_compat_array[]; static inline void lockmode_verify(ldlm_mode_t mode) { diff --git 
a/lustre/include/linux/lustre_idl.h b/lustre/include/linux/lustre_idl.h index b3300f1..498dac0 100644 --- a/lustre/include/linux/lustre_idl.h +++ b/lustre/include/linux/lustre_idl.h @@ -76,7 +76,7 @@ //#define OSC_REQUEST_PORTAL 3 #define OSC_REPLY_PORTAL 4 //#define OSC_BULK_PORTAL 5 -#define OST_REQUEST_PORTAL 6 +#define OST_IO_PORTAL 6 #define OST_CREATE_PORTAL 7 #define OST_BULK_PORTAL 8 //#define MDC_REQUEST_PORTAL 9 @@ -89,15 +89,13 @@ #define LDLM_CB_REPLY_PORTAL 16 #define LDLM_CANCEL_REQUEST_PORTAL 17 #define LDLM_CANCEL_REPLY_PORTAL 18 -#define PTLBD_REQUEST_PORTAL 19 -#define PTLBD_REPLY_PORTAL 20 -#define PTLBD_BULK_PORTAL 21 +//#define PTLBD_REQUEST_PORTAL 19 +//#define PTLBD_REPLY_PORTAL 20 +//#define PTLBD_BULK_PORTAL 21 #define MDS_SETATTR_PORTAL 22 #define MDS_READPAGE_PORTAL 23 -#define MGMT_REQUEST_PORTAL 24 -#define MGMT_REPLY_PORTAL 25 -#define MGMT_CLI_REQUEST_PORTAL 26 -#define MGMT_CLI_REPLY_PORTAL 27 + +#define OST_REQUEST_PORTAL 28 #define SVC_KILLED 1 #define SVC_EVENT 2 @@ -128,6 +126,23 @@ struct lustre_handle { }; #define DEAD_HANDLE_MAGIC 0xdeadbeefcafebabeULL +static inline int lustre_handle_is_used(struct lustre_handle *lh) +{ + return lh->cookie != 0ull; +} + +static inline int lustre_handle_equal(struct lustre_handle *lh1, + struct lustre_handle *lh2) +{ + return lh1->cookie == lh2->cookie; +} + +static inline void lustre_handle_copy(struct lustre_handle *tgt, + struct lustre_handle *src) +{ + tgt->cookie = src->cookie; +} + /* we depend on this structure to be 8-byte aligned */ /* this type is only endian-adjusted in lustre_unpack_msg() */ struct lustre_msg { @@ -206,20 +221,38 @@ static inline void lustre_msg_set_op_flags(struct lustre_msg *msg, int flags) #define MSG_CONNECT_ASYNC 0x40 /* Connect flags */ -#define OBD_CONNECT_RDONLY 0x0001ULL -#define OBD_CONNECT_SRVLOCK 0x0010ULL /* server takes locks for client */ -#define OBD_CONNECT_ACL 0x0080ULL -#define OBD_CONNECT_USER_XATTR 0x0100ULL -#define OBD_CONNECT_CROW 0x0200ULL 
/* OST is CROW able */ -#define OBD_CONNECT_IBITS 0x1000ULL /* support for inodebits locks */ - -#define MDS_CONNECT_SUPPORTED (OBD_CONNECT_RDONLY | \ - OBD_CONNECT_ACL | \ - OBD_CONNECT_USER_XATTR | \ +#define OBD_CONNECT_RDONLY 0x1ULL /* client allowed read-only access */ +#define OBD_CONNECT_INDEX 0x2ULL /* connect to specific LOV idx */ +#define OBD_CONNECT_GRANT 0x8ULL /* OSC acquires grant at connect */ +#define OBD_CONNECT_SRVLOCK 0x10ULL /* server takes locks for client */ +#define OBD_CONNECT_VERSION 0x20ULL /* Server supports versions in ocd */ +#define OBD_CONNECT_REQPORTAL 0x40ULL /* Separate portal for non-IO reqs */ +#define OBD_CONNECT_ACL 0x80ULL /* client using access control lists */ +#define OBD_CONNECT_XATTR 0x100ULL /* client using extended attributes*/ +#define OBD_CONNECT_CROW 0x0200ULL /* OST is CROW able */ +/* + * set by servers supporting taking extent locks during obd_punch(). Currently + * is requested by liblustre clients only. See bug 9528. + */ +#define OBD_CONNECT_TRUNCLOCK 0x400ULL /* server gets extent lock on punch */ +#define OBD_CONNECT_TRANSNO 0x800ULL /* replay is sending initial transno */ +#define OBD_CONNECT_IBITS 0x1000ULL /* support for inodebits locks */ + +#define MDS_CONNECT_SUPPORTED (OBD_CONNECT_RDONLY|OBD_CONNECT_VERSION| \ + OBD_CONNECT_ACL|OBD_CONNECT_XATTR| \ OBD_CONNECT_IBITS) -#define OST_CONNECT_SUPPORTED (OBD_CONNECT_SRVLOCK | OBD_CONNECT_CROW) +#define OST_CONNECT_SUPPORTED (OBD_CONNECT_SRVLOCK | OBD_CONNECT_GRANT | \ + OBD_CONNECT_REQPORTAL | OBD_CONNECT_VERSION | \ + OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_CROW) #define ECHO_CONNECT_SUPPORTED (0) +#define OBD_OCD_VERSION(major,minor,patch,fix) (((major)<<24) + ((minor)<<16) +\ + ((patch)<<8) + (fix)) +#define OBD_OCD_VERSION_MAJOR(version) ((int)((version)>>24)&255) +#define OBD_OCD_VERSION_MINOR(version) ((int)((version)>>16)&255) +#define OBD_OCD_VERSION_PATCH(version) ((int)((version)>>8)&255) +#define OBD_OCD_VERSION_FIX(version) ((int)(version)&255) + /* 
This structure is used for both request and reply. * * If we eventually have separate connect data for different types, which we @@ -294,6 +327,12 @@ typedef uint32_t obd_count; #define OBD_FL_NO_USRQUOTA (0x00000100) /* the object's owner is over quota */ #define OBD_FL_NO_GRPQUOTA (0x00000200) /* the object's group is over quota */ #define OBD_FL_CREATE_CROW (0x00000400) /* object should be created with crow */ +/* + * set this to delegate DLM locking during obd_punch() to the OSTs. Only OSTs + * that declared OBD_CONNECT_TRUNCLOCK in their connect flags support this + * functionality. + */ +#define OBD_FL_TRUNCLOCK (0x00000800) /* this should be not smaller than sizeof(struct lustre_handle) + sizeof(struct * llog_cookie) + sizeof(ll_fid). Nevertheless struct ll_fid is not longer @@ -893,7 +932,7 @@ extern void lustre_swab_ldlm_intent (struct ldlm_intent *i); struct ldlm_resource_desc { ldlm_type_t lr_type; - __u32 lr_padding; + __u32 lr_padding; /* also fix lustre_swab_ldlm_resource_desc */ struct ldlm_res_id lr_name; }; @@ -910,7 +949,7 @@ extern void lustre_swab_ldlm_lock_desc (struct ldlm_lock_desc *l); struct ldlm_request { __u32 lock_flags; - __u32 lock_padding; + __u32 lock_padding; /* also fix lustre_swab_ldlm_request */ struct ldlm_lock_desc lock_desc; struct lustre_handle lock_handle1; struct lustre_handle lock_handle2; @@ -920,7 +959,7 @@ extern void lustre_swab_ldlm_request (struct ldlm_request *rq); struct ldlm_reply { __u32 lock_flags; - __u32 lock_padding; + __u32 lock_padding; /* also fix lustre_swab_ldlm_reply */ struct ldlm_lock_desc lock_desc; struct lustre_handle lock_handle; __u64 lock_policy_res1; @@ -930,57 +969,6 @@ struct ldlm_reply { extern void lustre_swab_ldlm_reply (struct ldlm_reply *r); /* - * ptlbd, portal block device requests - */ -typedef enum { - PTLBD_QUERY = 200, - PTLBD_READ = 201, - PTLBD_WRITE = 202, - PTLBD_FLUSH = 203, - PTLBD_CONNECT = 204, - PTLBD_DISCONNECT = 205, - PTLBD_LAST_OPC -} ptlbd_cmd_t; -#define 
PTLBD_FIRST_OPC PTLBD_QUERY - -struct ptlbd_op { - __u16 op_cmd; - __u16 op_lun; - __u16 op_niob_cnt; - __u16 op__padding; - __u32 op_block_cnt; -}; - -extern void lustre_swab_ptlbd_op (struct ptlbd_op *op); - -struct ptlbd_niob { - __u64 n_xid; - __u64 n_block_nr; - __u32 n_offset; - __u32 n_length; -}; - -extern void lustre_swab_ptlbd_niob (struct ptlbd_niob *n); - -struct ptlbd_rsp { - __u16 r_status; - __u16 r_error_cnt; -}; - -extern void lustre_swab_ptlbd_rsp (struct ptlbd_rsp *r); - -/* - * Opcodes for management/monitoring node. - */ -typedef enum { - MGMT_CONNECT = 250, - MGMT_DISCONNECT, - MGMT_EXCEPTION, /* node died, etc. */ - MGMT_LAST_OPC -} mgmt_cmd_t; -#define MGMT_FIRST_OPC MGMT_CONNECT - -/* * Opcodes for multiple servers. */ @@ -1005,7 +993,9 @@ struct llog_logid { #define CATLIST "CATALOGS" struct llog_catid { struct llog_logid lci_logid; - __u32 lci_padding[3]; + __u32 lci_padding1; + __u32 lci_padding2; + __u32 lci_padding3; } __attribute__((packed)); /* Log data record types - there is no specific reason that these need to @@ -1055,7 +1045,11 @@ struct llog_rec_tail { struct llog_logid_rec { struct llog_rec_hdr lid_hdr; struct llog_logid lid_id; - __u32 padding[5]; + __u32 padding1; + __u32 padding2; + __u32 padding3; + __u32 padding4; + __u32 padding5; struct llog_rec_tail lid_tail; } __attribute__((packed)); diff --git a/lustre/include/linux/lustre_import.h b/lustre/include/linux/lustre_import.h index 8ff15be..b0445bb 100644 --- a/lustre/include/linux/lustre_import.h +++ b/lustre/include/linux/lustre_import.h @@ -38,6 +38,7 @@ enum obd_import_event { IMP_EVENT_INACTIVE = 0x808002, IMP_EVENT_INVALIDATE = 0x808003, IMP_EVENT_ACTIVE = 0x808004, + IMP_EVENT_OCD = 0x808005, }; struct obd_import_conn { @@ -87,9 +88,9 @@ struct obd_import { /* flags */ unsigned int imp_invalid:1, imp_replayable:1, imp_dlm_fake:1, imp_server_timeout:1, - imp_initial_recov:1, imp_force_verify:1, - imp_pingable:1, imp_resend_replay:1, - imp_deactive:1; + 
imp_initial_recov:1, imp_initial_recov_bk:1, + imp_force_verify:1, imp_pingable:1, + imp_resend_replay:1, imp_deactive:1; __u32 imp_connect_op; struct obd_connect_data imp_connect_data; __u64 imp_connect_flags_orig; diff --git a/lustre/include/linux/lustre_lib.h b/lustre/include/linux/lustre_lib.h index 29677a2..8dfbb60 100644 --- a/lustre/include/linux/lustre_lib.h +++ b/lustre/include/linux/lustre_lib.h @@ -485,68 +485,64 @@ static inline int ll_insecure_random_int(void) * configuration of interrupt and timeout sensitivity along with actions to * be performed in the event of either exception. * - * Common usage looks like this: + * The first form of usage looks like this: * * struct l_wait_info lwi = LWI_TIMEOUT_INTR(timeout, timeout_handler, * intr_handler, callback_data); * rc = l_wait_event(waitq, condition, &lwi); * - * (LWI_TIMEOUT and LWI_INTR macros are available for timeout- and - * interrupt-only variants, respectively.) + * l_wait_event() makes the current process wait on 'waitq' until 'condition' + * is TRUE or a "killable" signal (SIGTERM, SIKGILL, SIGINT) is pending. It + * returns 0 to signify 'condition' is TRUE, but if a signal wakes it before + * 'condition' becomes true, it optionally calls the specified 'intr_handler' + * if not NULL, and returns -EINTR. * - * If a timeout is specified, the timeout_handler will be invoked in the event - * that the timeout expires before the process is awakened. (Note that any - * waking of the process will restart the timeout, even if the condition is - * not satisfied and the process immediately returns to sleep. This might be - * considered a bug.) If the timeout_handler returns non-zero, l_wait_event - * will return -ETIMEDOUT and the caller will continue. If the handler returns - * zero instead, the process will go back to sleep until it is awakened by the - * waitq or some similar mechanism, or an interrupt occurs (if the caller has - * asked for interrupts to be detected). 
The timeout will only fire once, so - * callers should take care that a timeout_handler which returns zero will take - * future steps to awaken the process. N.B. that these steps must include - * making the provided condition become true. + * If a non-zero timeout is specified, signals are ignored until the timeout + * has expired. At this time, if 'timeout_handler' is not NULL it is called. + * If it returns FALSE l_wait_event() continues to wait as described above with + * signals enabled. Otherwise it returns -ETIMEDOUT. * - * If the interrupt flag (lwi_signals) is non-zero, then the process will be - * interruptible, and will be awakened by any "killable" signal (SIGTERM, - * SIGKILL or SIGINT). If a timeout is also specified, then the process will - * only become interruptible _after_ the timeout has expired, though it can be - * awakened by a signal that was delivered before the timeout and is still - * pending when the timeout expires. If a timeout is not specified, the process - * will be interruptible at all times during l_wait_event. + * LWI_INTR(intr_handler, callback_data) is shorthand for + * LWI_TIMEOUT_INTR(0, NULL, intr_handler, callback_data) + * + * The second form of usage looks like this: + * + * struct l_wait_info lwi = LWI_TIMEOUT(timeout, timeout_handler); + * rc = l_wait_event(waitq, condition, &lwi); + * + * This form is the same as the first except that it COMPLETELY IGNORES + * SIGNALS. The caller must therefore beware that if 'timeout' is zero, or if + * 'timeout_handler' is not NULL and returns FALSE, then the ONLY thing that + * can unblock the current process is 'condition' becoming TRUE. 
*/ +#define LWI_ON_SIGNAL_NOOP ((void (*)(void *))(-1)) + struct l_wait_info { long lwi_timeout; int (*lwi_on_timeout)(void *); - long lwi_signals; void (*lwi_on_signal)(void *); void *lwi_cb_data; }; -#define LWI_TIMEOUT(time, cb, data) \ -((struct l_wait_info) { \ - lwi_timeout: time, \ - lwi_on_timeout: cb, \ - lwi_cb_data: data \ -}) - -#define LWI_INTR(cb, data) \ -((struct l_wait_info) { \ - lwi_signals: 1, \ - lwi_on_signal: cb, \ - lwi_cb_data: data \ +/* NB: LWI_TIMEOUT ignores signals completely */ +#define LWI_TIMEOUT(time, cb, data) \ +((struct l_wait_info) { \ + .lwi_timeout = time, \ + .lwi_on_timeout = cb, \ + .lwi_cb_data = data \ }) #define LWI_TIMEOUT_INTR(time, time_cb, sig_cb, data) \ ((struct l_wait_info) { \ - lwi_timeout: time, \ - lwi_on_timeout: time_cb, \ - lwi_signals: 1, \ - lwi_on_signal: sig_cb, \ - lwi_cb_data: data \ + .lwi_timeout = time, \ + .lwi_on_timeout = time_cb, \ + .lwi_on_signal = (sig_cb == NULL) ? LWI_ON_SIGNAL_NOOP : sig_cb, \ + .lwi_cb_data = data \ }) +#define LWI_INTR(cb, data) LWI_TIMEOUT_INTR(0, NULL, cb, data) + #define LUSTRE_FATAL_SIGS (sigmask(SIGKILL) | sigmask(SIGINT) | \ sigmask(SIGTERM) | sigmask(SIGQUIT) | \ sigmask(SIGALRM)) @@ -568,161 +564,139 @@ static inline sigset_t l_w_e_set_sigs(int sigs) #define __l_wait_event(wq, condition, info, ret, excl) \ do { \ - wait_queue_t __wait; \ - int __timed_out = 0; \ - unsigned long irqflags; \ - sigset_t blocked; \ - signed long timeout_remaining; \ + wait_queue_t __wait; \ + signed long __timeout = info->lwi_timeout; \ + unsigned long __irqflags; \ + sigset_t __blocked; \ + \ + ret = 0; \ + if (condition) \ + break; \ \ init_waitqueue_entry(&__wait, current); \ if (excl) \ - add_wait_queue_exclusive(&wq, &__wait); \ + add_wait_queue_exclusive(&wq, &__wait); \ else \ - add_wait_queue(&wq, &__wait); \ + add_wait_queue(&wq, &__wait); \ \ /* Block all signals (just the non-fatal ones if no timeout). 
*/ \ - if (info->lwi_signals && !info->lwi_timeout) \ - blocked = l_w_e_set_sigs(LUSTRE_FATAL_SIGS); \ + if (info->lwi_on_signal != NULL && __timeout == 0) \ + __blocked = l_w_e_set_sigs(LUSTRE_FATAL_SIGS); \ else \ - blocked = l_w_e_set_sigs(0); \ - \ - timeout_remaining = info->lwi_timeout; \ + __blocked = l_w_e_set_sigs(0); \ \ for (;;) { \ - set_current_state(TASK_INTERRUPTIBLE); \ - if (condition) \ - break; \ - if (info->lwi_timeout && !__timed_out) { \ - timeout_remaining = schedule_timeout(timeout_remaining); \ - if (timeout_remaining == 0) { \ - __timed_out = 1; \ - if (!info->lwi_on_timeout || \ - info->lwi_on_timeout(info->lwi_cb_data)) { \ - ret = -ETIMEDOUT; \ + set_current_state(TASK_INTERRUPTIBLE); \ + \ + if (condition) \ break; \ - } \ - /* We'll take signals after a timeout. */ \ - if (info->lwi_signals) \ - (void)l_w_e_set_sigs(LUSTRE_FATAL_SIGS); \ + \ + if (__timeout == 0) { \ + schedule(); \ + } else { \ + __timeout = schedule_timeout(__timeout); \ + if (__timeout == 0) { \ + if (info->lwi_on_timeout == NULL || \ + info->lwi_on_timeout(info->lwi_cb_data)) { \ + ret = -ETIMEDOUT; \ + break; \ + } \ + /* Take signals after the timeout expires. */ \ + if (info->lwi_on_signal != NULL) \ + (void)l_w_e_set_sigs(LUSTRE_FATAL_SIGS); \ + } \ } \ - } else { \ - schedule(); \ - } \ - if (condition) \ - break; \ - if (signal_pending(current)) { \ - if (!info->lwi_timeout || __timed_out) { \ - break; \ - } else { \ - /* We have to do this here because some signals */ \ - /* are not blockable - ie from strace(1). */ \ - /* In these cases we want to schedule_timeout() */ \ - /* again, because we don't want that to return */ \ - /* -EINTR when the RPC actually succeeded. */ \ - /* the RECALC_SIGPENDING below will deliver the */ \ - /* signal properly. 
*/ \ - SIGNAL_MASK_LOCK(current, irqflags); \ - CLEAR_SIGPENDING; \ - SIGNAL_MASK_UNLOCK(current, irqflags); \ - } \ - } \ - } \ \ - SIGNAL_MASK_LOCK(current, irqflags); \ - current->blocked = blocked; \ - RECALC_SIGPENDING; \ - SIGNAL_MASK_UNLOCK(current, irqflags); \ + if (condition) \ + break; \ \ - if ((!info->lwi_timeout || __timed_out) && signal_pending(current)) { \ - if (info->lwi_on_signal) \ - info->lwi_on_signal(info->lwi_cb_data); \ - ret = -EINTR; \ + if (signal_pending(current)) { \ + if (info->lwi_on_signal != NULL && __timeout == 0) { \ + if (info->lwi_on_signal != LWI_ON_SIGNAL_NOOP) \ + info->lwi_on_signal(info->lwi_cb_data);\ + ret = -EINTR; \ + break; \ + } \ + /* We have to do this here because some signals */ \ + /* are not blockable - ie from strace(1). */ \ + /* In these cases we want to schedule_timeout() */ \ + /* again, because we don't want that to return */ \ + /* -EINTR when the RPC actually succeeded. */ \ + /* the RECALC_SIGPENDING below will deliver the */ \ + /* signal properly. 
*/ \ + SIGNAL_MASK_LOCK(current, __irqflags); \ + CLEAR_SIGPENDING; \ + SIGNAL_MASK_UNLOCK(current, __irqflags); \ + } \ } \ \ + SIGNAL_MASK_LOCK(current, __irqflags); \ + current->blocked = __blocked; \ + RECALC_SIGPENDING; \ + SIGNAL_MASK_UNLOCK(current, __irqflags); \ + \ current->state = TASK_RUNNING; \ remove_wait_queue(&wq, &__wait); \ } while(0) #else /* !__KERNEL__ */ -#define __l_wait_event(wq, condition, info, ret, excl) \ -do { \ - long timeout = info->lwi_timeout, elapse, last = 0; \ - int __timed_out = 0; \ - \ - if (info->lwi_timeout == 0) \ - timeout = 1000000000; \ - else \ - last = time(NULL); \ - \ - for (;;) { \ - if (condition) \ - break; \ - if (liblustre_wait_event(timeout)) { \ - if (timeout == 0 || info->lwi_timeout == 0) \ - continue; \ - elapse = time(NULL) - last; \ - if (elapse) { \ - last += elapse; \ - timeout -= elapse; \ - if (timeout < 0) \ - timeout = 0; \ - } \ - continue; \ - } \ - if (info->lwi_timeout && !__timed_out) { \ - __timed_out = 1; \ - if (info->lwi_on_timeout == NULL || \ - info->lwi_on_timeout(info->lwi_cb_data)) { \ - ret = -ETIMEDOUT; \ - break; \ - } \ - } \ - } \ +#define __l_wait_event(wq, condition, info, ret, excl) \ +do { \ + long __timeout = info->lwi_timeout; \ + long __now; \ + long __then = 0; \ + int __timed_out = 0; \ + \ + ret = 0; \ + if (condition) \ + break; \ + \ + if (__timeout == 0) \ + __timeout = 1000000000; \ + else \ + __then = time(NULL); \ + \ + while (!(condition)) { \ + if (liblustre_wait_event(__timeout)) { \ + if (__timeout != 0 && info->lwi_timeout != 0) { \ + __now = time(NULL); \ + __timeout -= __now - __then; \ + if (__timeout < 0) \ + __timeout = 0; \ + __then = __now; \ + } \ + continue; \ + } \ + \ + if (info->lwi_timeout != 0 && !__timed_out) { \ + __timed_out = 1; \ + if (info->lwi_on_timeout == NULL || \ + info->lwi_on_timeout(info->lwi_cb_data)) { \ + ret = -ETIMEDOUT; \ + break; \ + } \ + } \ + } \ } while (0) #endif /* __KERNEL__ */ -#define l_wait_event(wq, condition, 
info) \ -({ \ - int __ret = 0; \ - struct l_wait_info *__info = (info); \ - if (!(condition)) \ - __l_wait_event(wq, condition, __info, __ret, 0); \ - __ret; \ -}) - -#define l_wait_event_exclusive(wq, condition, info) \ -({ \ - int __ret = 0; \ - struct l_wait_info *__info = (info); \ - if (!(condition)) \ - __l_wait_event(wq, condition, __info, __ret, 1); \ - __ret; \ +#define l_wait_event(wq, condition, info) \ +({ \ + int __ret; \ + struct l_wait_info *__info = (info); \ + \ + __l_wait_event(wq, condition, __info, __ret, 0); \ + __ret; \ }) -#define LMD_MAGIC_R1 0xbdacbdac -#define LMD_MAGIC 0xbdacbd02 - -#define lmd_bad_magic(LMDP) \ -({ \ - struct lustre_mount_data *_lmd__ = (LMDP); \ - int _ret__ = 0; \ - if (!_lmd__) { \ - LCONSOLE_ERROR("Missing mount data: " \ - "check that /sbin/mount.lustre is installed.\n");\ - _ret__ = 1; \ - } else if (_lmd__->lmd_magic == LMD_MAGIC_R1) { \ - LCONSOLE_ERROR("You're using an old version of " \ - "/sbin/mount.lustre. Please install version " \ - "1.%d\n", LMD_MAGIC & 0xFF); \ - _ret__ = 1; \ - } else if (_lmd__->lmd_magic != LMD_MAGIC) { \ - LCONSOLE_ERROR("Invalid mount data (%#x != %#x): " \ - "check that /sbin/mount.lustre is installed\n", \ - _lmd__->lmd_magic, LMD_MAGIC); \ - _ret__ = 1; \ - } \ - _ret__; \ +#define l_wait_event_exclusive(wq, condition, info) \ +({ \ + int __ret; \ + struct l_wait_info *__info = (info); \ + \ + __l_wait_event(wq, condition, __info, __ret, 1); \ + __ret; \ }) #ifdef __KERNEL__ diff --git a/lustre/include/linux/lustre_lite.h b/lustre/include/linux/lustre_lite.h index 28ee37b..146bae0 100644 --- a/lustre/include/linux/lustre_lite.h +++ b/lustre/include/linux/lustre_lite.h @@ -126,3 +126,56 @@ static inline void lustre_build_lock_params(int cmd, unsigned long open_flags, params->lrp_ast_flags = (open_flags & O_NONBLOCK) ? 
LDLM_FL_BLOCK_NOWAIT : 0; } + +/* + * This is embedded into liblustre and llite super-blocks to keep track of + * connect flags (capabilities) supported by all imports given mount is + * connected to. + */ +struct lustre_client_ocd { + /* + * This is conjunction of connect_flags across all imports (LOVs) this + * mount is connected to. This field is updated by ll_ocd_update() + * under ->lco_lock. + */ + __u64 lco_flags; + spinlock_t lco_lock; +}; + +/* + * This function is used as an upcall-callback hooked by liblustre and llite + * clients into obd_notify() listeners chain to handle notifications about + * change of import connect_flags. See llu_fsswop_mount() and + * lustre_common_fill_super(). + * + * Again, it is dumped into this header for the lack of a better place. + */ +static inline int ll_ocd_update(struct obd_device *host, + struct obd_device *watched, + enum obd_notify_event ev, void *owner) +{ + struct lustre_client_ocd *lco; + struct client_obd *cli; + __u64 flags; + int result; + + ENTRY; + if (!strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME)) { + cli = &watched->u.cli; + lco = owner; + flags = cli->cl_import->imp_connect_data.ocd_connect_flags; + CDEBUG(D_SUPER, "Changing connect_flags: "LPX64" -> "LPX64"\n", + lco->lco_flags, flags); + spin_lock(&lco->lco_lock); + lco->lco_flags &= flags; + spin_unlock(&lco->lco_lock); + result = 0; + } else { + CERROR("unexpected notification of %s %s!\n", + watched->obd_type->typ_name, + watched->obd_name); + result = -EINVAL; + } + RETURN(result); +} + diff --git a/lustre/include/linux/lustre_log.h b/lustre/include/linux/lustre_log.h index 54e791f..2b4950c 100644 --- a/lustre/include/linux/lustre_log.h +++ b/lustre/include/linux/lustre_log.h @@ -204,9 +204,9 @@ static inline void llog_gen_init(struct llog_ctxt *ctxt) { struct obd_device *obd = ctxt->loc_exp->exp_obd; - if (!strcmp(obd->obd_type->typ_name, "mds")) + if (!strcmp(obd->obd_type->typ_name, LUSTRE_MDS_NAME)) ctxt->loc_gen.mnt_cnt = 
obd->u.mds.mds_mount_count; - else if (!strstr(obd->obd_type->typ_name, "filter")) + else if (!strstr(obd->obd_type->typ_name, LUSTRE_FILTER_NAME)) ctxt->loc_gen.mnt_cnt = obd->u.filter.fo_mount_count; else ctxt->loc_gen.mnt_cnt = 0; @@ -221,7 +221,7 @@ static inline int llog_gen_lt(struct llog_gen a, struct llog_gen b) return(a.conn_cnt < b.conn_cnt ? 1 : 0); } -#define LLOG_GEN_INC(gen) ((gen).conn_cnt) ++ +#define LLOG_GEN_INC(gen) ((gen).conn_cnt ++) #define LLOG_PROC_BREAK 0x0001 static inline int llog_obd2ops(struct llog_ctxt *ctxt, diff --git a/lustre/include/linux/lustre_mds.h b/lustre/include/linux/lustre_mds.h index d684683..f3d4e61 100644 --- a/lustre/include/linux/lustre_mds.h +++ b/lustre/include/linux/lustre_mds.h @@ -34,10 +34,6 @@ struct ptlrpc_request; struct obd_device; struct ll_file_data; -#define LUSTRE_MDS_NAME "mds" -#define LUSTRE_MDT_NAME "mdt" -#define LUSTRE_MDC_NAME "mdc" - struct lustre_md { struct mds_body *body; struct lov_stripe_md *lsm; @@ -122,7 +118,7 @@ int it_disposition(struct lookup_intent *it, int flag); void it_set_disposition(struct lookup_intent *it, int flag); int it_open_error(int phase, struct lookup_intent *it); void mdc_set_lock_data(__u64 *lockh, void *data); -int mdc_change_cbdata(struct obd_export *exp, struct ll_fid *fid, +int mdc_change_cbdata(struct obd_export *exp, struct ll_fid *fid, ldlm_iterator_t it, void *data); int mdc_intent_lock(struct obd_export *exp, struct mdc_op_data *, diff --git a/lustre/include/linux/lustre_net.h b/lustre/include/linux/lustre_net.h index bb31f68..efc8b00 100644 --- a/lustre/include/linux/lustre_net.h +++ b/lustre/include/linux/lustre_net.h @@ -43,7 +43,7 @@ /* MD flags we _always_ use */ #define PTLRPC_MD_OPTIONS 0 -/* Define maxima for bulk I/O +/* Define maxima for bulk I/O * CAVEAT EMPTOR, with multinet (i.e. routers forwarding between networks) * these limits are system wide and not interface-local. 
*/ #define PTLRPC_MAX_BRW_SIZE LNET_MTU @@ -83,15 +83,15 @@ * considered full when less than ?_MAXREQSIZE is left in them. */ -#define LDLM_NUM_THREADS min((int)(smp_num_cpus * smp_num_cpus * 8), 64) +#define LDLM_NUM_THREADS min((int)(smp_num_cpus * smp_num_cpus * 8), 64) #define LDLM_NBUFS 64 #define LDLM_BUFSIZE (8 * 1024) #define LDLM_MAXREQSIZE (5 * 1024) #define LDLM_MAXREPSIZE (1024) #define MDT_MAX_THREADS 32UL -#define MDT_NUM_THREADS max(min_t(unsigned long, num_physpages / 8192, \ - MDT_MAX_THREADS), 2UL) +#define MDT_NUM_THREADS max(min_t(unsigned long, MDT_MAX_THREADS, \ + num_physpages >> (25 - PAGE_SHIFT)), 2UL) #define MDS_NBUFS (64 * smp_num_cpus) #define MDS_BUFSIZE (8 * 1024) /* Assume file name length = FNAME_MAX = 256 (true for ext3). @@ -104,6 +104,7 @@ * * MDS_MAXREQSIZE ~= 4736 bytes = * lustre_msg + ldlm_request + mds_body + mds_rec_create + FNAME_MAX + PATH_MAX + * MDS_MAXREPSIZE ~= 8300 bytes = lustre_msg + llog_header * * Realistic size is about 512 bytes (20 character name + 128 char symlink), * except in the open case where there are a large number of OSTs in a LOV. 
@@ -111,9 +112,9 @@ #define MDS_MAXREQSIZE (5 * 1024) #define MDS_MAXREPSIZE (9 * 1024) -#define OST_MAX_THREADS 36UL -#define OST_NUM_THREADS max(min_t(unsigned long, num_physpages / 8192, \ - OST_MAX_THREADS), 2UL) +#define OST_MAX_THREADS 512UL +#define OST_DEF_THREADS max_t(unsigned long, 2, \ + (num_physpages >> (26-PAGE_SHIFT)) * smp_num_cpus) #define OST_NBUFS (64 * smp_num_cpus) #define OST_BUFSIZE (8 * 1024) /* OST_MAXREQSIZE ~= 4768 bytes = @@ -125,11 +126,6 @@ #define OST_MAXREQSIZE (5 * 1024) #define OST_MAXREPSIZE (9 * 1024) -#define PTLBD_NUM_THREADS 4 -#define PTLBD_NBUFS 64 -#define PTLBD_BUFSIZE (32 * 1024) -#define PTLBD_MAXREQSIZE 1024 - struct ptlrpc_connection { struct list_head c_link; lnet_nid_t c_self; @@ -296,7 +292,7 @@ struct ptlrpc_request { int rq_reqlen; struct lustre_msg *rq_reqmsg; - int rq_timeout; /* seconds */ + int rq_timeout; /* time to wait for reply (seconds) */ int rq_replen; struct lustre_msg *rq_repmsg; __u64 rq_transno; @@ -320,7 +316,6 @@ struct ptlrpc_request { struct ptlrpc_reply_state *rq_reply_state; /* separated reply state */ struct ptlrpc_request_buffer_desc *rq_rqbd; /* incoming request buffer*/ #if CRAY_XT3 -# error "Need to get the uid from the event?" 
__u32 rq_uid; /* peer uid, used in MDS only */ #endif @@ -444,7 +439,7 @@ struct ptlrpc_bulk_desc { struct ptlrpc_cb_id bd_cbid; /* network callback info */ lnet_handle_md_t bd_md_h; /* associated MD */ - + #if defined(__KERNEL__) lnet_kiov_t bd_iov[0]; #else @@ -492,8 +487,8 @@ struct ptlrpc_service { int srv_num_threads; /* # threads to start/started */ unsigned srv_cpu_affinity:1; /* bind threads to CPUs */ - __u32 srv_req_portal; - __u32 srv_rep_portal; + __u32 srv_req_portal; + __u32 srv_rep_portal; int srv_n_queued_reqs; /* # reqs waiting to be served */ struct list_head srv_request_queue; /* reqs waiting for service */ @@ -517,9 +512,8 @@ struct ptlrpc_service { wait_queue_head_t srv_waitq; /* all threads sleep on this */ struct list_head srv_threads; - struct obd_device *srv_obddev; svc_handler_t srv_handler; - + char *srv_name; /* only statically allocated strings here; we don't clean them */ spinlock_t srv_lock; @@ -528,10 +522,10 @@ struct ptlrpc_service { struct lprocfs_stats *srv_stats; /* List of free reply_states */ - struct list_head srv_free_rs_list; + struct list_head srv_free_rs_list; /* waitq to run, when adding stuff to srv_free_rs_list */ - wait_queue_head_t srv_free_rs_waitq; - + wait_queue_head_t srv_free_rs_waitq; + /* * if non-NULL called during thread creation (ptlrpc_start_thread()) * to initialize service specific per-thread state. 
@@ -548,7 +542,7 @@ struct ptlrpc_service { /* ptlrpc/events.c */ extern lnet_handle_eq_t ptlrpc_eq_h; -extern int ptlrpc_uuid_to_peer(struct obd_uuid *uuid, +extern int ptlrpc_uuid_to_peer(struct obd_uuid *uuid, lnet_process_id_t *peer, lnet_nid_t *self); extern void request_out_callback (lnet_event_t *ev); extern void reply_in_callback(lnet_event_t *ev); @@ -574,7 +568,7 @@ void ptlrpc_abort_bulk(struct ptlrpc_bulk_desc *desc); int ptlrpc_register_bulk(struct ptlrpc_request *req); void ptlrpc_unregister_bulk (struct ptlrpc_request *req); -static inline int ptlrpc_bulk_active (struct ptlrpc_bulk_desc *desc) +static inline int ptlrpc_bulk_active (struct ptlrpc_bulk_desc *desc) { unsigned long flags; int rc; @@ -604,7 +598,7 @@ ptlrpc_client_receiving_reply (struct ptlrpc_request *req) { unsigned long flags; int rc; - + spin_lock_irqsave(&req->rq_lock, flags); rc = req->rq_receiving_reply; spin_unlock_irqrestore(&req->rq_lock, flags); @@ -616,7 +610,7 @@ ptlrpc_client_replied (struct ptlrpc_request *req) { unsigned long flags; int rc; - + spin_lock_irqsave(&req->rq_lock, flags); rc = req->rq_replied; spin_unlock_irqrestore(&req->rq_lock, flags); @@ -677,7 +671,7 @@ __u64 ptlrpc_sample_next_xid(void); __u64 ptlrpc_req_xid(struct ptlrpc_request *request); /* ptlrpc/service.c */ -void ptlrpc_save_lock (struct ptlrpc_request *req, +void ptlrpc_save_lock (struct ptlrpc_request *req, struct lustre_handle *lock, int mode); void ptlrpc_commit_replies (struct obd_device *obd); void ptlrpc_schedule_difficult_reply (struct ptlrpc_reply_state *rs); diff --git a/lustre/include/linux/lustre_ver.h.in b/lustre/include/linux/lustre_ver.h.in new file mode 100644 index 0000000..4abf818 --- /dev/null +++ b/lustre/include/linux/lustre_ver.h.in @@ -0,0 +1,23 @@ +#ifndef _LUSTRE_VER_H_ +#define _LUSTRE_VER_H_ + +#include + +#define LUSTRE_MAJOR @AC_LUSTRE_MAJOR@ +#define LUSTRE_MINOR @AC_LUSTRE_MINOR@ +#define LUSTRE_PATCH @AC_LUSTRE_PATCH@ +#define LUSTRE_FIX @AC_LUSTRE_FIX@ +#define 
LUSTRE_VERSION_STRING "@AC_LUSTRE_VERSION_STRING@" + +// liblustre clients are only allowed to connect if their LUSTRE_FIX mismatches +// by this amount (set in lustre/autoconf/lustre-version.ac) +#define LUSTRE_VERSION_ALLOWED_OFFSET @AC_LUSTRE_VER_ALLOWED_OFFSET@ + +// if lustre version of client and servers it connects to differs by more than +// this amount, client would issue a warning +// (set in lustre/autoconf/lustre-version.ac) +#define LUSTRE_VERSION_OFFSET_WARN @AC_LUSTRE_VER_OFFSET_WARN@ + +#define LUSTRE_VERSION_CODE OBD_OCD_VERSION(LUSTRE_MAJOR,LUSTRE_MINOR,LUSTRE_PATCH,LUSTRE_FIX) + +#endif diff --git a/lustre/include/linux/obd.h b/lustre/include/linux/obd.h index 7daf6f5..9ae46cd 100644 --- a/lustre/include/linux/obd.h +++ b/lustre/include/linux/obd.h @@ -86,26 +86,40 @@ struct lov_stripe_md { spinlock_t lsm_lock; void *lsm_lock_owner; /* debugging */ - /* Public members. */ - __u64 lsm_object_id; /* lov object id */ - __u64 lsm_object_gr; /* lov object id */ - __u64 lsm_maxbytes; /* maximum possible file size */ - unsigned long lsm_xfersize; /* optimal transfer size */ - - /* LOV-private members start here -- only for use in lov/. */ - __u32 lsm_magic; - __u32 lsm_stripe_size; /* size of the stripe */ - __u32 lsm_pattern; /* striping pattern (RAID0, RAID1) */ - unsigned lsm_stripe_count; /* number of objects being striped over */ + struct { + /* Public members. */ + __u64 lw_object_id; /* lov object id */ + __u64 lw_object_gr; /* lov object id */ + __u64 lw_maxbytes; /* maximum possible file size */ + unsigned long lw_xfersize; /* optimal transfer size */ + + /* LOV-private members start here -- only for use in lov/. 
*/ + __u32 lw_magic; + __u32 lw_stripe_size; /* size of the stripe */ + __u32 lw_pattern; /* striping pattern (RAID0, RAID1) */ + unsigned lw_stripe_count; /* number of objects being striped over */ + } lsm_wire; struct lov_oinfo lsm_oinfo[0]; }; -/* compare all fields except for semaphore */ +#define lsm_object_id lsm_wire.lw_object_id +#define lsm_object_gr lsm_wire.lw_object_gr +#define lsm_maxbytes lsm_wire.lw_maxbytes +#define lsm_xfersize lsm_wire.lw_xfersize +#define lsm_magic lsm_wire.lw_magic +#define lsm_stripe_size lsm_wire.lw_stripe_size +#define lsm_pattern lsm_wire.lw_pattern +#define lsm_stripe_count lsm_wire.lw_stripe_count + +/* compare all relevant fields. */ static inline int lov_stripe_md_cmp(struct lov_stripe_md *m1, struct lov_stripe_md *m2) { - return memcmp(&m1->lsm_object_id, &m2->lsm_object_id, - (char *)&m2->lsm_oinfo[0] - (char *)&m2->lsm_object_id); + /* + * ->lsm_wire contains padding, but it should be zeroed out during + * allocation. + */ + return memcmp(&m1->lsm_wire, &m2->lsm_wire, sizeof m1->lsm_wire); } void lov_stripe_lock(struct lov_stripe_md *md); @@ -135,7 +149,7 @@ enum async_flags { or cancel the size of the io */ ASYNC_GROUP_SYNC = 0x8, /* ap_completion will not be called, instead the page is accounted for in the - obd_io_group given to + obd_io_group given to obd_queue_group_io */ }; @@ -163,7 +177,7 @@ struct obd_io_group { struct oig_callback_context { struct list_head occ_oig_item; /* called when the caller has received a signal while sleeping. - * callees of this method are encouraged to abort their state + * callees of this method are encouraged to abort their state * in the oig. This may be called multiple times. 
*/ void (*occ_interrupted)(struct oig_callback_context *occ); unsigned int interrupted:1; @@ -210,6 +224,7 @@ struct filter_obd { * increment */ struct file *fo_rcvd_filp; + struct file *fo_health_check_filp; struct filter_server_data *fo_fsd; unsigned long *fo_last_rcvd_slots; __u64 fo_mount_count; @@ -259,7 +274,7 @@ struct filter_obd { * This is (void *) array, because 2.4 and 2.6 use different iobuf * structures. */ - void **fo_iobuf_pool; + struct filter_iobuf **fo_iobuf_pool; int fo_iobuf_count; struct obd_histogram fo_r_pages; @@ -384,6 +399,7 @@ struct mds_obd { obd_id *mds_lov_objids; int mds_lov_nextid_set; struct file *mds_lov_objid_filp; + struct file *mds_health_check_filp; unsigned long *mds_client_bitmap; struct semaphore mds_orphan_recovery_sem; struct upcall_cache *mds_group_hash; @@ -404,37 +420,10 @@ struct echo_obd { atomic_t eo_prep; }; -/* - * this struct does double-duty acting as either a client or - * server instance .. maybe not wise. - */ -struct ptlbd_obd { - /* server's */ - struct ptlrpc_service *ptlbd_service; - struct file *filp; - /* client's */ - struct ptlrpc_client bd_client; - struct obd_import *bd_import; - struct obd_uuid bd_server_uuid; - struct obd_export *bd_exp; - int refcount; /* XXX sigh */ -}; - -struct recovd_obd { - spinlock_t recovd_lock; - struct list_head recovd_managed_items; /* items managed */ - struct list_head recovd_troubled_items; /* items in recovery */ - - wait_queue_head_t recovd_recovery_waitq; - wait_queue_head_t recovd_ctl_waitq; - wait_queue_head_t recovd_waitq; - struct task_struct *recovd_thread; - __u32 recovd_state; -}; - struct ost_obd { struct ptlrpc_service *ost_service; struct ptlrpc_service *ost_create_service; + struct ptlrpc_service *ost_io_service; struct semaphore ost_health_sem; }; @@ -446,23 +435,22 @@ struct echo_client_obd { __u64 ec_unique; }; -struct cache_obd { - struct obd_export *cobd_target_exp;/* local connection to target obd */ - struct obd_export *cobd_cache_exp; /* local 
connection to cache obd */ -}; - struct lov_tgt_desc { struct obd_uuid uuid; __u32 ltd_gen; struct obd_export *ltd_exp; - int active; /* is this target up for requests */ + unsigned int active:1, /* is this target up for requests */ + reap:1; /* should this target be deleted */ }; struct lov_obd { - spinlock_t lov_lock; + struct semaphore lov_lock; + atomic_t refcount; struct lov_desc desc; int bufsize; - int refcount; + int connects; + int death_row; /* Do we have tgts scheduled to be deleted? + (Make this a linked list?) */ unsigned int lo_catalog_loaded:1; struct lov_tgt_desc *tgts; }; @@ -477,6 +465,15 @@ struct niobuf_local { int rc; }; +#define LUSTRE_MDS_NAME "mds" +#define LUSTRE_MDT_NAME "mdt" +#define LUSTRE_MDC_NAME "mdc" +#define LUSTRE_OST_NAME "ost" +#define LUSTRE_OSC_NAME "osc" +#define LUSTRE_FILTER_NAME "filter" +#define LUSTRE_SANOSC_NAME "sanosc" +#define LUSTRE_SANOST_NAME "sanost" + /* Don't conflict with on-wire flags OBD_BRW_WRITE, etc */ #define N_LOCAL_TEMP_PAGE 0x10000000 @@ -495,9 +492,24 @@ struct obd_trans_info { int oti_numcookies; /* initial thread handling transaction */ - struct ptlrpc_thread *oti_thread; + int oti_thread_id; }; +static inline void oti_init(struct obd_trans_info *oti, + struct ptlrpc_request *req) +{ + if (oti == NULL) + return; + memset(oti, 0, sizeof *oti); + + if (req == NULL) + return; + + if (req->rq_repmsg && req->rq_reqmsg != 0) + oti->oti_transno = req->rq_repmsg->transno; + oti->oti_thread_id = req->rq_svc_thread ? req->rq_svc_thread->t_id : -1; +} + static inline void oti_alloc_cookies(struct obd_trans_info *oti,int num_cookies) { if (!oti) @@ -543,6 +555,29 @@ enum llog_ctxt_id { LLOG_MAX_CTXTS }; +/* + * Events signalled through obd_notify() upcall-chain. 
+ */ +enum obd_notify_event { + /* Device activated */ + OBD_NOTIFY_ACTIVE, + /* Device deactivated */ + OBD_NOTIFY_INACTIVE, + /* Connect data for import were changed */ + OBD_NOTIFY_OCD +}; + +/* + * Data structure used to pass obd_notify()-event to non-obd listeners (llite + * and liblustre being main examples). + */ +struct obd_notify_upcall { + int (*onu_upcall)(struct obd_device *host, struct obd_device *watched, + enum obd_notify_event ev, void *owner); + /* Opaque datum supplied by upper layer listener */ + void *onu_owner; +}; + /* corresponds to one of the obd's */ struct obd_device { @@ -574,6 +609,7 @@ struct obd_device { struct lvfs_run_ctxt obd_lvfs_ctxt; struct llog_ctxt *obd_llog_ctxt[LLOG_MAX_CTXTS]; struct obd_device *obd_observer; + struct obd_notify_upcall obd_upcall; struct obd_export *obd_self_export; /* list of exports in LRU order, for ping evictor, with obd_dev_lock */ struct list_head obd_exports_timed; @@ -606,12 +642,9 @@ struct obd_device { struct ost_obd ost; struct echo_client_obd echo_client; struct echo_obd echo; - struct recovd_obd recovd; struct lov_obd lov; - struct cache_obd cobd; - struct ptlbd_obd ptlbd; } u; - /* Fields used by LProcFS */ + /* Fields used by LProcFS */ unsigned int obd_cntr_base; struct lprocfs_stats *obd_stats; struct proc_dir_entry *obd_svc_procroot; @@ -623,6 +656,19 @@ struct obd_device { #define OBD_LLOG_FL_SENDNOW 0x0001 + +/* Special case hack for MDS LOVs */ +#define OBD_CLEANUP_EARLY 0 +/* Precleanup stage 1, we must make sure all exports (other than the + self-export) get destroyed. */ +#define OBD_CLEANUP_EXPORTS 1 +/* Precleanup stage 2, do other type-specific cleanup requiring the + self-export. */ +#define OBD_CLEANUP_SELF_EXP 2 +/* FIXME we should eliminate the "precleanup" function and make them stages + of the "cleanup" function. 
*/ +#define OBD_CLEANUP_OBD 3 + struct obd_ops { struct module *o_owner; int (*o_iocontrol)(unsigned int cmd, struct obd_export *exp, int len, @@ -648,6 +694,9 @@ struct obd_ops { * asked for. If @ocd == NULL, use default parameters. */ int (*o_connect)(struct lustre_handle *conn, struct obd_device *src, struct obd_uuid *cluuid, struct obd_connect_data *ocd); + int (*o_reconnect)(struct obd_export *exp, struct obd_device *src, + struct obd_uuid *cluuid, + struct obd_connect_data *ocd); int (*o_disconnect)(struct obd_export *exp); int (*o_statfs)(struct obd_device *obd, struct obd_statfs *osfs, @@ -678,26 +727,26 @@ struct obd_ops { struct lov_stripe_md *ea, obd_count oa_bufs, struct brw_page *pgarr, struct ptlrpc_request_set *, struct obd_trans_info *oti); - int (*o_prep_async_page)(struct obd_export *exp, + int (*o_prep_async_page)(struct obd_export *exp, struct lov_stripe_md *lsm, - struct lov_oinfo *loi, - struct page *page, obd_off offset, + struct lov_oinfo *loi, + struct page *page, obd_off offset, struct obd_async_page_ops *ops, void *data, void **res); - int (*o_queue_async_io)(struct obd_export *exp, - struct lov_stripe_md *lsm, - struct lov_oinfo *loi, void *cookie, - int cmd, obd_off off, int count, + int (*o_queue_async_io)(struct obd_export *exp, + struct lov_stripe_md *lsm, + struct lov_oinfo *loi, void *cookie, + int cmd, obd_off off, int count, obd_flag brw_flags, obd_flag async_flags); - int (*o_queue_group_io)(struct obd_export *exp, - struct lov_stripe_md *lsm, - struct lov_oinfo *loi, - struct obd_io_group *oig, - void *cookie, int cmd, obd_off off, int count, + int (*o_queue_group_io)(struct obd_export *exp, + struct lov_stripe_md *lsm, + struct lov_oinfo *loi, + struct obd_io_group *oig, + void *cookie, int cmd, obd_off off, int count, obd_flag brw_flags, obd_flag async_flags); - int (*o_trigger_group_io)(struct obd_export *exp, - struct lov_stripe_md *lsm, - struct lov_oinfo *loi, + int (*o_trigger_group_io)(struct obd_export *exp, + struct 
lov_stripe_md *lsm, + struct lov_oinfo *loi, struct obd_io_group *oig); int (*o_set_async_flags)(struct obd_export *exp, struct lov_stripe_md *lsm, @@ -744,7 +793,7 @@ struct obd_ops { __u32 mode, struct lustre_handle *); int (*o_cancel_unused)(struct obd_export *, struct lov_stripe_md *, int flags, void *opaque); - int (*o_join_lru)(struct obd_export *, struct lov_stripe_md *, + int (*o_join_lru)(struct obd_export *, struct lov_stripe_md *, int join); int (*o_san_preprw)(int cmd, struct obd_export *exp, struct obdo *oa, int objcount, @@ -767,7 +816,7 @@ struct obd_ops { enum obd_import_event); int (*o_notify)(struct obd_device *obd, struct obd_device *watched, - int active); + enum obd_notify_event ev); int (*o_health_check)(struct obd_device *); @@ -775,7 +824,7 @@ struct obd_ops { int (*o_quotacheck)(struct obd_export *, struct obd_quotactl *); int (*o_quotactl)(struct obd_export *, struct obd_quotactl *); - /* + /* * NOTE: If adding ops, add another LPROCFS_OBD_OP_INIT() line * to lprocfs_alloc_obd_stats() in obdclass/lprocfs_status.c. * Also, add a wrapper function in include/linux/obd_class.h. @@ -786,6 +835,7 @@ struct obd_ops { */ }; +int lvfs_check_io_health(struct obd_device *obd, struct file *file); static inline void obd_transno_commit_cb(struct obd_device *obd, __u64 transno, int error) diff --git a/lustre/include/linux/obd_class.h b/lustre/include/linux/obd_class.h index 38d7da4..65441bd 100644 --- a/lustre/include/linux/obd_class.h +++ b/lustre/include/linux/obd_class.h @@ -546,6 +546,26 @@ static inline int obd_connect(struct lustre_handle *conn, struct obd_device *obd RETURN(rc); } +static inline int obd_reconnect(struct obd_export *exp, + struct obd_device *obd, + struct obd_uuid *cluuid, + struct obd_connect_data *d) +{ + int rc; + __u64 ocf = d ? 
d->ocd_connect_flags : 0; /* for post-condition check */ + ENTRY; + + OBD_CHECK_DEV_ACTIVE(obd); + OBD_CHECK_OP(obd, reconnect, 0); + OBD_COUNTER_INCREMENT(obd, reconnect); + + rc = OBP(obd, reconnect)(exp, obd, cluuid, d); + /* check that only subset is granted */ + LASSERT(ergo(d != NULL, + (d->ocd_connect_flags & ocf) == d->ocd_connect_flags)); + RETURN(rc); +} + static inline int obd_disconnect(struct obd_export *exp) { int rc; @@ -1006,7 +1026,7 @@ static inline void obd_import_event(struct obd_device *obd, static inline int obd_notify(struct obd_device *obd, struct obd_device *watched, - int active) + enum obd_notify_event ev) { OBD_CHECK_DEV(obd); if (!obd->obd_set_up) { @@ -1020,7 +1040,32 @@ static inline int obd_notify(struct obd_device *obd, } OBD_COUNTER_INCREMENT(obd, notify); - return OBP(obd, notify)(obd, watched, active); + return OBP(obd, notify)(obd, watched, ev); +} + +static inline int obd_notify_observer(struct obd_device *observer, + struct obd_device *observed, + enum obd_notify_event ev) +{ + int rc1; + int rc2; + + struct obd_notify_upcall *onu; + + if (observer->obd_observer) + rc1 = obd_notify(observer->obd_observer, observed, ev); + else + rc1 = 0; + /* + * Also, call non-obd listener, if any + */ + onu = &observer->obd_upcall; + if (onu->onu_upcall != NULL) + rc2 = onu->onu_upcall(observer, observed, ev, onu->onu_owner); + else + rc2 = 0; + + return rc1 ?: rc2; } static inline int obd_quotacheck(struct obd_export *exp, diff --git a/lustre/include/linux/obd_ost.h b/lustre/include/linux/obd_ost.h index 0b15af6..4a2a344 100644 --- a/lustre/include/linux/obd_ost.h +++ b/lustre/include/linux/obd_ost.h @@ -4,7 +4,7 @@ * This file is part of Lustre, http://www.lustre.org * * Data structures for object storage targets and client: OST & OSC's - * + * * See also lustre_idl.h for wire formats of requests. 
*/ @@ -13,12 +13,6 @@ #include -#define LUSTRE_FILTER_NAME "obdfilter" -#define LUSTRE_OST_NAME "ost" -#define LUSTRE_OSC_NAME "osc" -#define LUSTRE_SANOSC_NAME "sanosc" -#define LUSTRE_SANOST_NAME "sanost" - struct osc_brw_async_args { struct obdo *aa_oa; int aa_requested_nob; diff --git a/lustre/kernel_patches/patches/ext3-mballoc2-2.4.24.patch b/lustre/kernel_patches/patches/ext3-mballoc2-2.4.24.patch deleted file mode 100644 index 172432a..0000000 --- a/lustre/kernel_patches/patches/ext3-mballoc2-2.4.24.patch +++ /dev/null @@ -1,1766 +0,0 @@ -Index: linux-2.4.20-rh-20.9/fs/ext3/mballoc.c -=================================================================== ---- linux-2.4.20-rh-20.9.orig/fs/ext3/mballoc.c 2003-01-30 13:24:37.000000000 +0300 -+++ linux-2.4.20-rh-20.9/fs/ext3/mballoc.c 2004-10-20 22:28:51.000000000 +0400 -@@ -0,0 +1,1459 @@ -+/* -+ * Copyright (c) 2004, Cluster File Systems, Inc, info@clusterfs.com -+ * Written by Alex Tomas -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License version 2 as -+ * published by the Free Software Foundation. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU General Public Licens -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- -+ */ -+ -+ -+/* -+ * mballoc.c contains the multiblocks allocation routines -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* -+ * TODO: -+ * - do not scan from the beginning, try to remember first free block -+ * - mb_mark_used_* may allocate chunk right after splitting buddy -+ * - special flag to advice allocator to look for requested + N blocks -+ * this may improve interaction between extents and mballoc -+ */ -+ -+/* -+ * with AGRESSIVE_CHECK allocator runs consistency checks over -+ * structures. this checks slow things down a lot -+ */ -+#define AGGRESSIVE_CHECK__ -+ -+/* -+ */ -+#define MB_DEBUG__ -+#ifdef MB_DEBUG -+#define mb_debug(fmt,a...) printk(fmt, ##a) -+#else -+#define mb_debug(fmt,a...) -+#endif -+ -+/* -+ * where to save buddies structures beetween umount/mount (clean case only) -+ */ -+#define EXT3_BUDDY_FILE ".buddy" -+ -+/* -+ * max. number of chunks to be tracked in ext3_free_extent struct -+ */ -+#define MB_ARR_SIZE 32 -+ -+struct ext3_allocation_context { -+ struct super_block *ac_sb; -+ -+ /* search goals */ -+ int ac_g_group; -+ int ac_g_start; -+ int ac_g_len; -+ int ac_g_flags; -+ -+ /* the best found extent */ -+ int ac_b_group; -+ int ac_b_start; -+ int ac_b_len; -+ -+ /* number of iterations done. 
we have to track to limit searching */ -+ int ac_repeats; -+ int ac_groups_scanned; -+ int ac_status; -+}; -+ -+#define AC_STATUS_CONTINUE 1 -+#define AC_STATUS_FOUND 2 -+ -+ -+struct ext3_buddy { -+ void *bd_bitmap; -+ void *bd_buddy; -+ int bd_blkbits; -+ struct buffer_head *bd_bh; -+ struct buffer_head *bd_bh2; -+ struct ext3_buddy_group_blocks *bd_bd; -+ struct super_block *bd_sb; -+}; -+ -+struct ext3_free_extent { -+ int fe_start; -+ int fe_len; -+ unsigned char fe_orders[MB_ARR_SIZE]; -+ unsigned char fe_nums; -+ unsigned char fe_back; -+}; -+ -+#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) -+ -+ -+int ext3_create (struct inode *, struct dentry *, int, struct nameidata *); -+void ext3_free_blocks_old(handle_t *, struct inode *, unsigned long, unsigned long); -+int ext3_new_block_old(handle_t *, struct inode *, unsigned long, u32 *, u32 *, int *); -+int ext3_mb_reserve_blocks(struct super_block *, int); -+void ext3_mb_release_blocks(struct super_block *, int); -+void ext3_mb_poll_new_transaction(struct super_block *, handle_t *); -+void ext3_mb_free_committed_blocks(struct super_block *); -+int load_block_bitmap (struct super_block *, unsigned int); -+ -+#define mb_correct_addr_and_bit(bit,addr) \ -+{ \ -+ if ((unsigned long) addr & 1) { \ -+ bit += 8; \ -+ addr--; \ -+ } \ -+ if ((unsigned long) addr & 2) { \ -+ bit += 16; \ -+ addr--; \ -+ addr--; \ -+ } \ -+} -+ -+static inline int mb_test_bit(int bit, void *addr) -+{ -+ mb_correct_addr_and_bit(bit,addr); -+ return test_bit(bit, addr); -+} -+ -+static inline void mb_set_bit(int bit, void *addr) -+{ -+ mb_correct_addr_and_bit(bit,addr); -+ set_bit(bit, addr); -+} -+ -+static inline void mb_clear_bit(int bit, void *addr) -+{ -+ mb_correct_addr_and_bit(bit,addr); -+ clear_bit(bit, addr); -+} -+ -+struct buffer_head * -+read_block_bitmap_bh(struct super_block *sb, unsigned int block_group) -+{ -+ struct buffer_head *bh; -+ int bitmap_nr; -+ -+ bitmap_nr = load_block_bitmap(sb, 
block_group); -+ if (bitmap_nr < 0) -+ return NULL; -+ -+ bh = EXT3_SB(sb)->s_block_bitmap[bitmap_nr]; -+ return bh; -+} -+ -+static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max) -+{ -+ int i = 1; -+ void *bb; -+ -+ J_ASSERT(e3b->bd_bitmap != e3b->bd_buddy); -+ J_ASSERT(max != NULL); -+ -+ if (order > e3b->bd_blkbits + 1) -+ return NULL; -+ -+ /* at order 0 we see each particular block */ -+ *max = 1 << (e3b->bd_blkbits + 3); -+ if (order == 0) -+ return e3b->bd_bitmap; -+ -+ bb = e3b->bd_buddy; -+ *max = *max >> 1; -+ while (i < order) { -+ bb += 1 << (e3b->bd_blkbits - i); -+ i++; -+ *max = *max >> 1; -+ } -+ return bb; -+} -+ -+static int ext3_mb_load_desc(struct super_block *sb, int group, -+ struct ext3_buddy *e3b) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ -+ J_ASSERT(sbi->s_buddy_blocks[group]->bb_bitmap); -+ J_ASSERT(sbi->s_buddy_blocks[group]->bb_buddy); -+ -+ /* load bitmap */ -+ e3b->bd_bh = sb_getblk(sb, sbi->s_buddy_blocks[group]->bb_bitmap); -+ if (e3b->bd_bh == NULL) { -+ ext3_error(sb, "ext3_mb_load_desc", -+ "can't get block for buddy bitmap\n"); -+ goto out; -+ } -+ if (!buffer_uptodate(e3b->bd_bh)) { -+ ll_rw_block(READ, 1, &e3b->bd_bh); -+ wait_on_buffer(e3b->bd_bh); -+ } -+ J_ASSERT(buffer_uptodate(e3b->bd_bh)); -+ -+ /* load buddy */ -+ e3b->bd_bh2 = sb_getblk(sb, sbi->s_buddy_blocks[group]->bb_buddy); -+ if (e3b->bd_bh2 == NULL) { -+ ext3_error(sb, "ext3_mb_load_desc", -+ "can't get block for buddy bitmap\n"); -+ goto out; -+ } -+ if (!buffer_uptodate(e3b->bd_bh2)) { -+ ll_rw_block(READ, 1, &e3b->bd_bh2); -+ wait_on_buffer(e3b->bd_bh2); -+ } -+ J_ASSERT(buffer_uptodate(e3b->bd_bh2)); -+ -+ e3b->bd_bitmap = e3b->bd_bh->b_data; -+ e3b->bd_buddy = e3b->bd_bh2->b_data; -+ e3b->bd_blkbits = sb->s_blocksize_bits; -+ e3b->bd_bd = sbi->s_buddy_blocks[group]; -+ e3b->bd_sb = sb; -+ -+ return 0; -+out: -+ brelse(e3b->bd_bh); -+ brelse(e3b->bd_bh2); -+ e3b->bd_bh = NULL; -+ e3b->bd_bh2 = NULL; -+ return -EIO; -+} -+ 
-+static void ext3_mb_dirty_buddy(struct ext3_buddy *e3b) -+{ -+ mark_buffer_dirty(e3b->bd_bh); -+ mark_buffer_dirty(e3b->bd_bh2); -+} -+ -+static void ext3_mb_release_desc(struct ext3_buddy *e3b) -+{ -+ brelse(e3b->bd_bh); -+ brelse(e3b->bd_bh2); -+} -+ -+#ifdef AGGRESSIVE_CHECK -+static void mb_check_buddy(struct ext3_buddy *e3b) -+{ -+ int order = e3b->bd_blkbits + 1; -+ int max, max2, i, j, k, count; -+ void *buddy, *buddy2; -+ -+ if (!test_opt(e3b->bd_sb, MBALLOC)) -+ return; -+ -+ while (order > 1) { -+ buddy = mb_find_buddy(e3b, order, &max); -+ J_ASSERT(buddy); -+ buddy2 = mb_find_buddy(e3b, order - 1, &max2); -+ J_ASSERT(buddy2); -+ J_ASSERT(buddy != buddy2); -+ J_ASSERT(max * 2 == max2); -+ -+ count = 0; -+ for (i = 0; i < max; i++) { -+ -+ if (!mb_test_bit(i, buddy)) { -+ /* only single bit in buddy2 may be 1 */ -+ if (mb_test_bit(i << 1, buddy2)) -+ J_ASSERT(!mb_test_bit((i<<1)+1, buddy2)); -+ else if (mb_test_bit((i << 1) + 1, buddy2)) -+ J_ASSERT(!mb_test_bit(i << 1, buddy2)); -+ continue; -+ } -+ -+ /* both bits in buddy2 must be 0 */ -+ J_ASSERT(!mb_test_bit(i << 1, buddy2)); -+ J_ASSERT(!mb_test_bit((i << 1) + 1, buddy2)); -+ -+ for (j = 0; j < (1 << order); j++) { -+ k = (i * (1 << order)) + j; -+ J_ASSERT(mb_test_bit(k, e3b->bd_bitmap)); -+ } -+ count++; -+ } -+ J_ASSERT(e3b->bd_bd->bb_counters[order] == count); -+ order--; -+ } -+ -+ buddy = mb_find_buddy(e3b, 0, &max); -+ for (i = 0; i < max; i++) { -+ if (mb_test_bit(i, buddy)) -+ continue; -+ /* check used bits only */ -+ for (j = 0; j < e3b->bd_blkbits + 1; j++) { -+ buddy2 = mb_find_buddy(e3b, j, &max2); -+ k = i >> j; -+ J_ASSERT(k < max2); -+ J_ASSERT(!mb_test_bit(k, buddy2)); -+ } -+ } -+} -+#else -+#define mb_check_buddy(e3b) -+#endif -+ -+static inline void -+ext3_lock_group(struct super_block *sb, int group) -+{ -+ spin_lock(&EXT3_SB(sb)->s_buddy_blocks[group]->bb_lock); -+} -+ -+static inline void -+ext3_unlock_group(struct super_block *sb, int group) -+{ -+ 
spin_unlock(&EXT3_SB(sb)->s_buddy_blocks[group]->bb_lock); -+} -+ -+static int mb_find_order_for_block(struct ext3_buddy *e3b, int block) -+{ -+ int order = 1; -+ void *bb; -+ -+ J_ASSERT(e3b->bd_bitmap != e3b->bd_buddy); -+ J_ASSERT(block < (1 << (e3b->bd_blkbits + 3))); -+ -+ bb = e3b->bd_buddy; -+ while (order <= e3b->bd_blkbits + 1) { -+ block = block >> 1; -+ if (mb_test_bit(block, bb)) { -+ /* this block is part of buddy of order 'order' */ -+ return order; -+ } -+ bb += 1 << (e3b->bd_blkbits - order); -+ order++; -+ } -+ return 0; -+} -+ -+static inline void mb_clear_bits(void *bm, int cur, int len) -+{ -+ __u32 *addr; -+ -+ len = cur + len; -+ while (cur < len) { -+ if ((cur & 31) == 0 && (len - cur) >= 32) { -+ /* fast path: clear whole word at once */ -+ addr = bm + (cur >> 3); -+ *addr = 0; -+ cur += 32; -+ continue; -+ } -+ mb_clear_bit(cur, bm); -+ cur++; -+ } -+} -+ -+static inline void mb_set_bits(void *bm, int cur, int len) -+{ -+ __u32 *addr; -+ -+ len = cur + len; -+ while (cur < len) { -+ if ((cur & 31) == 0 && (len - cur) >= 32) { -+ /* fast path: clear whole word at once */ -+ addr = bm + (cur >> 3); -+ *addr = 0xffffffff; -+ cur += 32; -+ continue; -+ } -+ mb_set_bit(cur, bm); -+ cur++; -+ } -+} -+ -+static int mb_free_blocks(struct ext3_buddy *e3b, int first, int count) -+{ -+ int block, max, order; -+ void *buddy, *buddy2; -+ -+ mb_check_buddy(e3b); -+ while (count-- > 0) { -+ block = first++; -+ order = 0; -+ -+ J_ASSERT(!mb_test_bit(block, e3b->bd_bitmap)); -+ mb_set_bit(block, e3b->bd_bitmap); -+ e3b->bd_bd->bb_counters[order]++; -+ -+ /* start of the buddy */ -+ buddy = mb_find_buddy(e3b, order, &max); -+ -+ do { -+ block &= ~1UL; -+ if (!mb_test_bit(block, buddy) || -+ !mb_test_bit(block + 1, buddy)) -+ break; -+ -+ /* both the buddies are free, try to coalesce them */ -+ buddy2 = mb_find_buddy(e3b, order + 1, &max); -+ -+ if (!buddy2) -+ break; -+ -+ if (order > 0) { -+ /* for special purposes, we don't clear -+ * free bits in bitmap 
*/ -+ mb_clear_bit(block, buddy); -+ mb_clear_bit(block + 1, buddy); -+ } -+ e3b->bd_bd->bb_counters[order]--; -+ e3b->bd_bd->bb_counters[order]--; -+ -+ block = block >> 1; -+ order++; -+ e3b->bd_bd->bb_counters[order]++; -+ -+ mb_set_bit(block, buddy2); -+ buddy = buddy2; -+ } while (1); -+ } -+ mb_check_buddy(e3b); -+ -+ return 0; -+} -+ -+/* -+ * returns 1 if out extent is enough to fill needed space -+ */ -+int mb_make_backward_extent(struct ext3_free_extent *in, -+ struct ext3_free_extent *out, int needed) -+{ -+ int i; -+ -+ J_ASSERT(in); -+ J_ASSERT(out); -+ J_ASSERT(in->fe_nums < MB_ARR_SIZE); -+ -+ out->fe_len = 0; -+ out->fe_start = in->fe_start + in->fe_len; -+ out->fe_nums = 0; -+ -+ /* for single-chunk extent we need not back order -+ * also, if an extent doesn't fill needed space -+ * then it makes no sense to try back order becase -+ * if we select this extent then it'll be use as is */ -+ if (in->fe_nums < 2 || in->fe_len < needed) -+ return 0; -+ -+ i = in->fe_nums - 1; -+ while (i >= 0 && out->fe_len < needed) { -+ out->fe_len += (1 << in->fe_orders[i]); -+ out->fe_start -= (1 << in->fe_orders[i]); -+ i--; -+ } -+ /* FIXME: in some situation fe_orders may be too small to hold -+ * all the buddies */ -+ J_ASSERT(out->fe_len >= needed); -+ -+ for (i++; i < in->fe_nums; i++) -+ out->fe_orders[out->fe_nums++] = in->fe_orders[i]; -+ J_ASSERT(out->fe_nums < MB_ARR_SIZE); -+ out->fe_back = 1; -+ -+ return 1; -+} -+ -+int mb_find_extent(struct ext3_buddy *e3b, int order, int block, -+ int needed, struct ext3_free_extent *ex) -+{ -+ int space = needed; -+ int next, max, ord; -+ void *buddy; -+ -+ J_ASSERT(ex != NULL); -+ -+ ex->fe_nums = 0; -+ ex->fe_len = 0; -+ -+ buddy = mb_find_buddy(e3b, order, &max); -+ J_ASSERT(buddy); -+ J_ASSERT(block < max); -+ if (!mb_test_bit(block, buddy)) -+ goto nofree; -+ -+ if (order == 0) { -+ /* find actual order */ -+ order = mb_find_order_for_block(e3b, block); -+ block = block >> order; -+ } -+ -+ 
ex->fe_orders[ex->fe_nums++] = order; -+ ex->fe_len = 1 << order; -+ ex->fe_start = block << order; -+ ex->fe_back = 0; -+ -+ while ((space = space - (1 << order)) > 0) { -+ -+ buddy = mb_find_buddy(e3b, order, &max); -+ J_ASSERT(buddy); -+ -+ if (block + 1 >= max) -+ break; -+ -+ next = (block + 1) * (1 << order); -+ if (!mb_test_bit(next, e3b->bd_bitmap)) -+ break; -+ -+ ord = mb_find_order_for_block(e3b, next); -+ -+ if ((1 << ord) >= needed) { -+ /* we dont want to coalesce with self-enough buddies */ -+ break; -+ } -+ order = ord; -+ block = next >> order; -+ ex->fe_len += 1 << order; -+ -+ if (ex->fe_nums < MB_ARR_SIZE) -+ ex->fe_orders[ex->fe_nums++] = order; -+ } -+ -+nofree: -+ J_ASSERT(ex->fe_start + ex->fe_len <= (1 << (e3b->bd_blkbits + 3))); -+ return ex->fe_len; -+} -+ -+static int mb_mark_used_backward(struct ext3_buddy *e3b, -+ struct ext3_free_extent *ex, int len) -+{ -+ int start = ex->fe_start, len0 = len; -+ int ord, mlen, max, cur; -+ void *buddy; -+ -+ start = ex->fe_start + ex->fe_len - 1; -+ while (len) { -+ ord = mb_find_order_for_block(e3b, start); -+ if (((start >> ord) << ord) == (start - (1 << ord) + 1) && -+ len >= (1 << ord)) { -+ /* the whole chunk may be allocated at once! 
*/ -+ mlen = 1 << ord; -+ buddy = mb_find_buddy(e3b, ord, &max); -+ J_ASSERT((start >> ord) < max); -+ mb_clear_bit(start >> ord, buddy); -+ e3b->bd_bd->bb_counters[ord]--; -+ start -= mlen; -+ len -= mlen; -+ J_ASSERT(len >= 0); -+ J_ASSERT(start >= 0); -+ continue; -+ } -+ -+ /* we have to split large buddy */ -+ J_ASSERT(ord > 0); -+ buddy = mb_find_buddy(e3b, ord, &max); -+ mb_clear_bit(start >> ord, buddy); -+ e3b->bd_bd->bb_counters[ord]--; -+ -+ ord--; -+ cur = (start >> ord) & ~1U; -+ buddy = mb_find_buddy(e3b, ord, &max); -+ mb_set_bit(cur, buddy); -+ mb_set_bit(cur + 1, buddy); -+ e3b->bd_bd->bb_counters[ord]++; -+ e3b->bd_bd->bb_counters[ord]++; -+ } -+ -+ /* now drop all the bits in bitmap */ -+ mb_clear_bits(e3b->bd_bitmap, ex->fe_start + ex->fe_len - len0, len0); -+ -+ mb_check_buddy(e3b); -+ -+ return 0; -+} -+ -+static int mb_mark_used_forward(struct ext3_buddy *e3b, -+ struct ext3_free_extent *ex, int len) -+{ -+ int start = ex->fe_start, len0 = len; -+ int ord, mlen, max, cur; -+ void *buddy; -+ -+ while (len) { -+ ord = mb_find_order_for_block(e3b, start); -+ -+ if (((start >> ord) << ord) == start && len >= (1 << ord)) { -+ /* the whole chunk may be allocated at once! 
*/ -+ mlen = 1 << ord; -+ buddy = mb_find_buddy(e3b, ord, &max); -+ J_ASSERT((start >> ord) < max); -+ mb_clear_bit(start >> ord, buddy); -+ e3b->bd_bd->bb_counters[ord]--; -+ start += mlen; -+ len -= mlen; -+ J_ASSERT(len >= 0); -+ continue; -+ } -+ -+ /* we have to split large buddy */ -+ J_ASSERT(ord > 0); -+ buddy = mb_find_buddy(e3b, ord, &max); -+ mb_clear_bit(start >> ord, buddy); -+ e3b->bd_bd->bb_counters[ord]--; -+ -+ ord--; -+ cur = (start >> ord) & ~1U; -+ buddy = mb_find_buddy(e3b, ord, &max); -+ mb_set_bit(cur, buddy); -+ mb_set_bit(cur + 1, buddy); -+ e3b->bd_bd->bb_counters[ord]++; -+ e3b->bd_bd->bb_counters[ord]++; -+ } -+ -+ /* now drop all the bits in bitmap */ -+ mb_clear_bits(e3b->bd_bitmap, ex->fe_start, len0); -+ -+ mb_check_buddy(e3b); -+ -+ return 0; -+} -+ -+int inline mb_mark_used(struct ext3_buddy *e3b, -+ struct ext3_free_extent *ex, int len) -+{ -+ int err; -+ -+ J_ASSERT(ex); -+ if (ex->fe_back == 0) -+ err = mb_mark_used_forward(e3b, ex, len); -+ else -+ err = mb_mark_used_backward(e3b, ex, len); -+ return err; -+} -+ -+int ext3_mb_new_in_group(struct ext3_allocation_context *ac, -+ struct ext3_buddy *e3b, int group) -+{ -+ struct super_block *sb = ac->ac_sb; -+ int err, gorder, max, i; -+ struct ext3_free_extent curex; -+ -+ /* let's know order of allocation */ -+ gorder = 0; -+ while (ac->ac_g_len > (1 << gorder)) -+ gorder++; -+ -+ if ((ac->ac_g_flags & 1) && ac->ac_g_group == group) { -+ /* someone asks for space at this specified block -+ * probably he wants to merge it into existing extent */ -+ if (mb_test_bit(ac->ac_g_start, e3b->bd_bitmap)) { -+ /* good. 
at least one block is free */ -+ max = mb_find_extent(e3b, 0, ac->ac_g_start, -+ ac->ac_g_len, &curex); -+ max = min(curex.fe_len, ac->ac_g_len); -+ mb_mark_used(e3b, &curex, max); -+ -+ ac->ac_b_group = group; -+ ac->ac_b_start = curex.fe_start; -+ ac->ac_b_len = max; -+ ac->ac_status = AC_STATUS_FOUND; -+ err = 0; -+ goto out; -+ } -+ /* don't try to find goal anymore */ -+ ac->ac_g_flags &= ~1; -+ } -+ -+ i = 0; -+ while (1) { -+ i = find_next_bit(e3b->bd_bitmap, sb->s_blocksize * 8, i); -+ if (i >= sb->s_blocksize * 8) -+ break; -+ -+ max = mb_find_extent(e3b, 0, i, ac->ac_g_len, &curex); -+ if (max >= ac->ac_g_len) { -+ max = min(curex.fe_len, ac->ac_g_len); -+ mb_mark_used(e3b, &curex, max); -+ -+ ac->ac_b_group = group; -+ ac->ac_b_start = curex.fe_start; -+ ac->ac_b_len = max; -+ ac->ac_status = AC_STATUS_FOUND; -+ break; -+ } -+ i += max; -+ } -+ -+ return 0; -+ -+out: -+ return err; -+} -+ -+int mb_good_group(struct ext3_allocation_context *ac, int group, int cr) -+{ -+ struct ext3_group_desc *gdp; -+ int free_blocks; -+ -+ gdp = ext3_get_group_desc(ac->ac_sb, group, NULL); -+ if (!gdp) -+ return 0; -+ free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); -+ if (free_blocks == 0) -+ return 0; -+ -+ /* someone wants this block very much */ -+ if ((ac->ac_g_flags & 1) && ac->ac_g_group == group) -+ return 1; -+ -+ /* FIXME: I'd like to take fragmentation into account here */ -+ if (cr == 0) { -+ if (free_blocks >= ac->ac_g_len >> 1) -+ return 1; -+ } else if (cr == 1) { -+ if (free_blocks >= ac->ac_g_len >> 2) -+ return 1; -+ } else if (cr == 2) { -+ return 1; -+ } else { -+ BUG(); -+ } -+ return 0; -+} -+ -+int ext3_mb_new_blocks(handle_t *handle, struct inode *inode, -+ unsigned long goal, int *len, int flags, int *errp) -+{ -+ struct buffer_head *bitmap_bh = NULL; -+ struct ext3_allocation_context ac; -+ int i, group, block, cr, err = 0; -+ struct ext3_group_desc *gdp; -+ struct ext3_super_block *es; -+ struct buffer_head *gdp_bh; -+ struct ext3_sb_info 
*sbi; -+ struct super_block *sb; -+ struct ext3_buddy e3b; -+ -+ J_ASSERT(len != NULL); -+ J_ASSERT(*len > 0); -+ -+ sb = inode->i_sb; -+ if (!sb) { -+ printk("ext3_mb_new_nblocks: nonexistent device"); -+ return 0; -+ } -+ -+ if (!test_opt(sb, MBALLOC)) { -+ static int ext3_mballoc_warning = 0; -+ if (ext3_mballoc_warning == 0) { -+ printk(KERN_ERR "EXT3-fs: multiblock request with " -+ "mballoc disabled!\n"); -+ ext3_mballoc_warning++; -+ } -+ *len = 1; -+ err = ext3_new_block_old(handle, inode, goal, NULL,NULL, errp); -+ return err; -+ } -+ -+ ext3_mb_poll_new_transaction(sb, handle); -+ -+ sbi = EXT3_SB(sb); -+ es = EXT3_SB(sb)->s_es; -+ -+ if (!(flags & 2)) { -+ /* someone asks for non-reserved blocks */ -+ BUG_ON(*len > 1); -+ err = ext3_mb_reserve_blocks(sb, 1); -+ if (err) { -+ *errp = err; -+ return 0; -+ } -+ } -+ -+ /* -+ * Check quota for allocation of this blocks. -+ */ -+ while (*len && DQUOT_ALLOC_BLOCK(inode, *len)) -+ *len -= 1; -+ if (*len == 0) { -+ *errp = -EDQUOT; -+ block = 0; -+ goto out; -+ } -+ -+ /* start searching from the goal */ -+ if (goal < le32_to_cpu(es->s_first_data_block) || -+ goal >= le32_to_cpu(es->s_blocks_count)) -+ goal = le32_to_cpu(es->s_first_data_block); -+ group = (goal - le32_to_cpu(es->s_first_data_block)) / -+ EXT3_BLOCKS_PER_GROUP(sb); -+ block = ((goal - le32_to_cpu(es->s_first_data_block)) % -+ EXT3_BLOCKS_PER_GROUP(sb)); -+ -+ /* set up allocation goals */ -+ ac.ac_b_group = ac.ac_b_start = ac.ac_b_len = 0; -+ ac.ac_status = 0; -+ ac.ac_groups_scanned = 0; -+ ac.ac_sb = inode->i_sb; -+ ac.ac_g_group = group; -+ ac.ac_g_start = block; -+ ac.ac_g_len = *len; -+ ac.ac_g_flags = flags; -+ -+ /* loop over the groups */ -+ for (cr = 0; cr < 3 && ac.ac_status != AC_STATUS_FOUND; cr++) { -+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; group++, i++) { -+ if (group == EXT3_SB(sb)->s_groups_count) -+ group = 0; -+ -+ /* check is group good for our criteries */ -+ if (!mb_good_group(&ac, group, cr)) -+ continue; -+ -+ err = 
ext3_mb_load_desc(ac.ac_sb, group, &e3b); -+ if (err) -+ goto out_err; -+ -+ ext3_lock_group(sb, group); -+ if (!mb_good_group(&ac, group, cr)) { -+ /* someone did allocation from this group */ -+ ext3_unlock_group(sb, group); -+ ext3_mb_release_desc(&e3b); -+ continue; -+ } -+ -+ err = ext3_mb_new_in_group(&ac, &e3b, group); -+ ext3_unlock_group(sb, group); -+ if (ac.ac_status == AC_STATUS_FOUND) -+ ext3_mb_dirty_buddy(&e3b); -+ ext3_mb_release_desc(&e3b); -+ if (err) -+ goto out_err; -+ if (ac.ac_status == AC_STATUS_FOUND) -+ break; -+ } -+ } -+ -+ if (ac.ac_status != AC_STATUS_FOUND) { -+ /* unfortunately, we can't satisfy this request */ -+ J_ASSERT(ac.ac_b_len == 0); -+ DQUOT_FREE_BLOCK(inode, *len); -+ *errp = -ENOSPC; -+ block = 0; -+ goto out; -+ } -+ -+ /* good news - free block(s) have been found. now it's time -+ * to mark block(s) in good old journaled bitmap */ -+ block = ac.ac_b_group * EXT3_BLOCKS_PER_GROUP(sb) -+ + ac.ac_b_start + le32_to_cpu(es->s_first_data_block); -+ -+ /* we made a desicion, now mark found blocks in good old -+ * bitmap to be journaled */ -+ -+ ext3_debug("using block group %d(%d)\n", -+ ac.ac_b_group.group, gdp->bg_free_blocks_count); -+ -+ bitmap_bh = read_block_bitmap_bh(sb, ac.ac_b_group); -+ if (!bitmap_bh) { -+ *errp = -EIO; -+ goto out_err; -+ } -+ -+ err = ext3_journal_get_write_access(handle, bitmap_bh); -+ if (err) { -+ *errp = err; -+ goto out_err; -+ } -+ -+ gdp = ext3_get_group_desc(sb, ac.ac_b_group, &gdp_bh); -+ if (!gdp) { -+ *errp = -EIO; -+ goto out_err; -+ } -+ -+ err = ext3_journal_get_write_access(handle, gdp_bh); -+ if (err) -+ goto out_err; -+ -+ block = ac.ac_b_start + ac.ac_b_group * EXT3_BLOCKS_PER_GROUP(sb) -+ + le32_to_cpu(es->s_first_data_block); -+ -+ if (block == le32_to_cpu(gdp->bg_block_bitmap) || -+ block == le32_to_cpu(gdp->bg_inode_bitmap) || -+ in_range(block, le32_to_cpu(gdp->bg_inode_table), -+ EXT3_SB(sb)->s_itb_per_group)) -+ ext3_error(sb, "ext3_new_block", -+ "Allocating block in system 
zone - " -+ "block = %u", block); -+#if 0 -+ for (i = 0; i < ac.ac_b_len; i++) -+ J_ASSERT(!mb_test_bit(ac.ac_b_start + i, bitmap_bh->b_data)); -+#endif -+ mb_set_bits(bitmap_bh->b_data, ac.ac_b_start, ac.ac_b_len); -+ -+ ext3_lock_group(sb, ac.ac_b_group); -+ gdp->bg_free_blocks_count = -+ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) - -+ ac.ac_b_len); -+ ext3_unlock_group(sb, ac.ac_b_group); -+ spin_lock(&sbi->s_md_lock); -+ es->s_free_blocks_count = -+ cpu_to_le32(le32_to_cpu(es->s_free_blocks_count) - ac.ac_b_len); -+ spin_unlock(&sbi->s_md_lock); -+ -+ err = ext3_journal_dirty_metadata(handle, bitmap_bh); -+ if (err) -+ goto out_err; -+ err = ext3_journal_dirty_metadata(handle, gdp_bh); -+ if (err) -+ goto out_err; -+ -+ sb->s_dirt = 1; -+ *errp = 0; -+ -+ /* drop non-allocated, but dquote'd blocks */ -+ J_ASSERT(*len >= ac.ac_b_len); -+ DQUOT_FREE_BLOCK(inode, *len - ac.ac_b_len); -+ -+ *len = ac.ac_b_len; -+ J_ASSERT(block != 0); -+ goto out; -+ -+out_err: -+ /* if we've already allocated something, roll it back */ -+ if (ac.ac_status == AC_STATUS_FOUND) { -+ /* FIXME: free blocks here */ -+ } -+ -+ DQUOT_FREE_BLOCK(inode, *len); -+ *errp = err; -+ block = 0; -+out: -+ if (!(flags & 2)) { -+ /* block wasn't reserved before and we reserved it -+ * at the beginning of allocation. it doesn't matter -+ * whether we allocated anything or we failed: time -+ * to release reservation. 
NOTE: because I expect -+ * any multiblock request from delayed allocation -+ * path only, here is single block always */ -+ ext3_mb_release_blocks(sb, 1); -+ } -+ return block; -+} -+ -+int ext3_mb_generate_buddy(struct super_block *sb, int group) -+{ -+ struct buffer_head *bh; -+ int i, err, count = 0; -+ struct ext3_buddy e3b; -+ -+ err = ext3_mb_load_desc(sb, group, &e3b); -+ if (err) -+ goto out; -+ memset(e3b.bd_bh->b_data, 0, sb->s_blocksize); -+ memset(e3b.bd_bh2->b_data, 0, sb->s_blocksize); -+ -+ bh = read_block_bitmap_bh(sb, group); -+ if (bh == NULL) { -+ err = -EIO; -+ goto out2; -+ } -+ -+ /* loop over the blocks, nad create buddies for free ones */ -+ for (i = 0; i < sb->s_blocksize * 8; i++) { -+ if (!mb_test_bit(i, (void *) bh->b_data)) { -+ mb_free_blocks(&e3b, i, 1); -+ count++; -+ } -+ } -+ mb_check_buddy(&e3b); -+ ext3_mb_dirty_buddy(&e3b); -+ -+out2: -+ ext3_mb_release_desc(&e3b); -+out: -+ return err; -+} -+ -+EXPORT_SYMBOL(ext3_mb_new_blocks); -+ -+#define MB_CREDITS \ -+ (EXT3_DATA_TRANS_BLOCKS + 3 + EXT3_INDEX_EXTRA_TRANS_BLOCKS) -+ -+int ext3_mb_init_backend(struct super_block *sb) -+{ -+ struct inode *root = sb->s_root->d_inode; -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ struct dentry *db; -+ tid_t target; -+ int err, i; -+ -+ sbi->s_buddy_blocks = kmalloc(sizeof(struct ext3_buddy_group_blocks *) * -+ sbi->s_groups_count, GFP_KERNEL); -+ if (sbi->s_buddy_blocks == NULL) { -+ printk("EXT3-fs: can't allocate mem for buddy maps\n"); -+ return -ENOMEM; -+ } -+ memset(sbi->s_buddy_blocks, 0, -+ sizeof(struct ext3_buddy_group_blocks *) * sbi->s_groups_count); -+ sbi->s_buddy = NULL; -+ -+ down(&root->i_sem); -+ db = lookup_one_len(EXT3_BUDDY_FILE, sb->s_root, -+ strlen(EXT3_BUDDY_FILE)); -+ if (IS_ERR(db)) { -+ err = PTR_ERR(db); -+ printk("EXT3-fs: can't lookup buddy file: %d\n", err); -+ goto out; -+ } -+ -+ if (db->d_inode != NULL) { -+ sbi->s_buddy = igrab(db->d_inode); -+ goto map; -+ } -+ -+ err = ext3_create(root, db, S_IFREG, NULL); 
-+ if (err) { -+ printk("error while creation buddy file: %d\n", err); -+ } else { -+ sbi->s_buddy = igrab(db->d_inode); -+ } -+ -+map: -+ for (i = 0; i < sbi->s_groups_count; i++) { -+ struct buffer_head *bh = NULL; -+ handle_t *handle; -+ -+ sbi->s_buddy_blocks[i] = -+ kmalloc(sizeof(struct ext3_buddy_group_blocks), -+ GFP_KERNEL); -+ if (sbi->s_buddy_blocks[i] == NULL) { -+ printk("EXT3-fs: can't allocate mem for buddy\n"); -+ err = -ENOMEM; -+ goto out2; -+ } -+ -+ handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS); -+ if (IS_ERR(handle)) { -+ err = PTR_ERR(handle); -+ goto out2; -+ } -+ -+ /* allocate block for bitmap */ -+ bh = ext3_getblk(handle, sbi->s_buddy, i * 2, 1, &err); -+ if (bh == NULL) { -+ printk("can't get block for buddy bitmap: %d\n", err); -+ goto out2; -+ } -+ sbi->s_buddy_blocks[i]->bb_bitmap = bh->b_blocknr; -+ brelse(bh); -+ -+ /* allocate block for buddy */ -+ bh = ext3_getblk(handle, sbi->s_buddy, i * 2 + 1, 1, &err); -+ if (bh == NULL) { -+ printk("can't get block for buddy: %d\n", err); -+ goto out2; -+ } -+ sbi->s_buddy_blocks[i]->bb_buddy = bh->b_blocknr; -+ brelse(bh); -+ ext3_journal_stop(handle, sbi->s_buddy); -+ spin_lock_init(&sbi->s_buddy_blocks[i]->bb_lock); -+ sbi->s_buddy_blocks[i]->bb_md_cur = NULL; -+ sbi->s_buddy_blocks[i]->bb_tid = 0; -+ } -+ -+ if ((target = log_start_commit(sbi->s_journal, NULL))) -+ log_wait_commit(sbi->s_journal, target); -+ -+out2: -+ dput(db); -+out: -+ up(&root->i_sem); -+ return err; -+} -+ -+int ext3_mb_release(struct super_block *sb) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int i; -+ -+ if (!test_opt(sb, MBALLOC)) -+ return 0; -+ -+ /* release freed, non-committed blocks */ -+ spin_lock(&sbi->s_md_lock); -+ list_splice_init(&sbi->s_closed_transaction, -+ &sbi->s_committed_transaction); -+ list_splice_init(&sbi->s_active_transaction, -+ &sbi->s_committed_transaction); -+ spin_unlock(&sbi->s_md_lock); -+ ext3_mb_free_committed_blocks(sb); -+ -+ if (sbi->s_buddy_blocks) { -+ for (i = 
0; i < sbi->s_groups_count; i++) -+ if (sbi->s_buddy_blocks[i]) -+ kfree(sbi->s_buddy_blocks[i]); -+ kfree(sbi->s_buddy_blocks); -+ } -+ if (sbi->s_buddy) -+ iput(sbi->s_buddy); -+ if (sbi->s_blocks_reserved) -+ printk("ext3-fs: %ld blocks being reserved at umount!\n", -+ sbi->s_blocks_reserved); -+ return 0; -+} -+ -+int ext3_mb_init(struct super_block *sb) -+{ -+ struct ext3_super_block *es; -+ int i; -+ -+ if (!test_opt(sb, MBALLOC)) -+ return 0; -+ -+ /* init file for buddy data */ -+ clear_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC); -+ ext3_mb_init_backend(sb); -+ -+ es = EXT3_SB(sb)->s_es; -+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) -+ ext3_mb_generate_buddy(sb, i); -+ spin_lock_init(&EXT3_SB(sb)->s_reserve_lock); -+ spin_lock_init(&EXT3_SB(sb)->s_md_lock); -+ INIT_LIST_HEAD(&EXT3_SB(sb)->s_active_transaction); -+ INIT_LIST_HEAD(&EXT3_SB(sb)->s_closed_transaction); -+ INIT_LIST_HEAD(&EXT3_SB(sb)->s_committed_transaction); -+ set_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC); -+ printk("EXT3-fs: mballoc enabled\n"); -+ return 0; -+} -+ -+void ext3_mb_free_committed_blocks(struct super_block *sb) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int err, i, count = 0, count2 = 0; -+ struct ext3_free_metadata *md; -+ struct ext3_buddy e3b; -+ -+ if (list_empty(&sbi->s_committed_transaction)) -+ return; -+ -+ /* there is committed blocks to be freed yet */ -+ do { -+ /* get next array of blocks */ -+ md = NULL; -+ spin_lock(&sbi->s_md_lock); -+ if (!list_empty(&sbi->s_committed_transaction)) { -+ md = list_entry(sbi->s_committed_transaction.next, -+ struct ext3_free_metadata, list); -+ list_del(&md->list); -+ } -+ spin_unlock(&sbi->s_md_lock); -+ -+ if (md == NULL) -+ break; -+ -+ mb_debug("gonna free %u blocks in group %u (0x%p):", -+ md->num, md->group, md); -+ -+ err = ext3_mb_load_desc(sb, md->group, &e3b); -+ BUG_ON(err != 0); -+ -+ /* there are blocks to put in buddy to make them really free */ -+ count += md->num; -+ count2++; -+ ext3_lock_group(sb, 
md->group); -+ for (i = 0; i < md->num; i++) { -+ mb_debug(" %u", md->blocks[i]); -+ mb_free_blocks(&e3b, md->blocks[i], 1); -+ } -+ mb_debug("\n"); -+ ext3_unlock_group(sb, md->group); -+ -+ kfree(md); -+ ext3_mb_dirty_buddy(&e3b); -+ ext3_mb_release_desc(&e3b); -+ -+ } while (md); -+ mb_debug("freed %u blocks in %u structures\n", count, count2); -+} -+ -+void ext3_mb_poll_new_transaction(struct super_block *sb, handle_t *handle) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ -+ if (sbi->s_last_transaction == handle->h_transaction->t_tid) -+ return; -+ -+ /* new transaction! time to close last one and free blocks for -+ * committed transaction. we know that only transaction can be -+ * active, so previos transaction can be being logged and we -+ * know that transaction before previous is known to be alreade -+ * logged. this means that now we may free blocks freed in all -+ * transactions before previous one. hope I'm clear enough ... */ -+ -+ spin_lock(&sbi->s_md_lock); -+ if (sbi->s_last_transaction != handle->h_transaction->t_tid) { -+ mb_debug("new transaction %lu, old %lu\n", -+ (unsigned long) handle->h_transaction->t_tid, -+ (unsigned long) sbi->s_last_transaction); -+ list_splice_init(&sbi->s_closed_transaction, -+ &sbi->s_committed_transaction); -+ list_splice_init(&sbi->s_active_transaction, -+ &sbi->s_closed_transaction); -+ sbi->s_last_transaction = handle->h_transaction->t_tid; -+ } -+ spin_unlock(&sbi->s_md_lock); -+ -+ ext3_mb_free_committed_blocks(sb); -+} -+ -+int ext3_mb_free_metadata(handle_t *handle, struct ext3_buddy *e3b, -+ int group, int block, int count) -+{ -+ struct ext3_buddy_group_blocks *db = e3b->bd_bd; -+ struct super_block *sb = e3b->bd_sb; -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ struct ext3_free_metadata *md; -+ int i; -+ -+ ext3_lock_group(sb, group); -+ for (i = 0; i < count; i++) { -+ md = db->bb_md_cur; -+ if (md && db->bb_tid != handle->h_transaction->t_tid) { -+ db->bb_md_cur = NULL; -+ md = NULL; -+ } -+ -+ if (md 
== NULL) { -+ ext3_unlock_group(sb, group); -+ md = kmalloc(sizeof(*md), GFP_KERNEL); -+ if (md == NULL) -+ return -ENOMEM; -+ md->num = 0; -+ md->group = group; -+ -+ ext3_lock_group(sb, group); -+ if (db->bb_md_cur == NULL) { -+ spin_lock(&sbi->s_md_lock); -+ list_add(&md->list, &sbi->s_active_transaction); -+ spin_unlock(&sbi->s_md_lock); -+ db->bb_md_cur = md; -+ db->bb_tid = handle->h_transaction->t_tid; -+ mb_debug("new md 0x%p for group %u\n", -+ md, md->group); -+ } else { -+ kfree(md); -+ md = db->bb_md_cur; -+ } -+ } -+ -+ BUG_ON(md->num >= EXT3_BB_MAX_BLOCKS); -+ md->blocks[md->num] = block + i; -+ md->num++; -+ if (md->num == EXT3_BB_MAX_BLOCKS) { -+ /* no more space, put full container on a sb's list */ -+ db->bb_md_cur = NULL; -+ } -+ } -+ ext3_unlock_group(sb, group); -+ return 0; -+} -+ -+void ext3_mb_free_blocks(handle_t *handle, struct inode *inode, -+ unsigned long block, unsigned long count, int metadata) -+{ -+ struct buffer_head *bitmap_bh = NULL; -+ struct ext3_group_desc *gdp; -+ struct ext3_super_block *es; -+ unsigned long bit, overflow; -+ struct buffer_head *gd_bh; -+ unsigned long block_group; -+ struct ext3_sb_info *sbi; -+ struct super_block *sb; -+ struct ext3_buddy e3b; -+ int err = 0, ret; -+ -+ sb = inode->i_sb; -+ if (!sb) { -+ printk ("ext3_free_blocks: nonexistent device"); -+ return; -+ } -+ -+ ext3_mb_poll_new_transaction(sb, handle); -+ -+ sbi = EXT3_SB(sb); -+ es = EXT3_SB(sb)->s_es; -+ if (block < le32_to_cpu(es->s_first_data_block) || -+ block + count < block || -+ block + count > le32_to_cpu(es->s_blocks_count)) { -+ ext3_error (sb, "ext3_free_blocks", -+ "Freeing blocks not in datazone - " -+ "block = %lu, count = %lu", block, count); -+ goto error_return; -+ } -+ -+ ext3_debug("freeing block %lu\n", block); -+ -+do_more: -+ overflow = 0; -+ block_group = (block - le32_to_cpu(es->s_first_data_block)) / -+ EXT3_BLOCKS_PER_GROUP(sb); -+ bit = (block - le32_to_cpu(es->s_first_data_block)) % -+ EXT3_BLOCKS_PER_GROUP(sb); -+ 
/* -+ * Check to see if we are freeing blocks across a group -+ * boundary. -+ */ -+ if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) { -+ overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb); -+ count -= overflow; -+ } -+ bitmap_bh = read_block_bitmap_bh(sb, block_group); -+ if (!bitmap_bh) -+ goto error_return; -+ gdp = ext3_get_group_desc (sb, block_group, &gd_bh); -+ if (!gdp) -+ goto error_return; -+ -+ if (in_range (le32_to_cpu(gdp->bg_block_bitmap), block, count) || -+ in_range (le32_to_cpu(gdp->bg_inode_bitmap), block, count) || -+ in_range (block, le32_to_cpu(gdp->bg_inode_table), -+ EXT3_SB(sb)->s_itb_per_group) || -+ in_range (block + count - 1, le32_to_cpu(gdp->bg_inode_table), -+ EXT3_SB(sb)->s_itb_per_group)) -+ ext3_error (sb, "ext3_free_blocks", -+ "Freeing blocks in system zones - " -+ "Block = %lu, count = %lu", -+ block, count); -+ -+ BUFFER_TRACE(bitmap_bh, "getting write access"); -+ err = ext3_journal_get_write_access(handle, bitmap_bh); -+ if (err) -+ goto error_return; -+ -+ /* -+ * We are about to modify some metadata. Call the journal APIs -+ * to unshare ->b_data if a currently-committing transaction is -+ * using it -+ */ -+ BUFFER_TRACE(gd_bh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, gd_bh); -+ if (err) -+ goto error_return; -+ -+ err = ext3_mb_load_desc(sb, block_group, &e3b); -+ if (err) -+ goto error_return; -+ -+ if (metadata) { -+ /* blocks being freed are metadata. 
these blocks shouldn't -+ * be used until this transaction is committed */ -+ ext3_mb_free_metadata(handle, &e3b, block_group, bit, count); -+ } else { -+ ext3_lock_group(sb, block_group); -+ mb_free_blocks(&e3b, bit, count); -+ gdp->bg_free_blocks_count = -+ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count); -+ ext3_unlock_group(sb, block_group); -+ spin_lock(&sbi->s_md_lock); -+ es->s_free_blocks_count = -+ cpu_to_le32(le32_to_cpu(es->s_free_blocks_count) + count); -+ spin_unlock(&sbi->s_md_lock); -+ } -+ -+ ext3_mb_dirty_buddy(&e3b); -+ ext3_mb_release_desc(&e3b); -+ -+ /* FIXME: undo logic will be implemented later and another way */ -+ mb_clear_bits(bitmap_bh->b_data, bit, count); -+ DQUOT_FREE_BLOCK(inode, count); -+ -+ /* We dirtied the bitmap block */ -+ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); -+ err = ext3_journal_dirty_metadata(handle, bitmap_bh); -+ -+ /* And the group descriptor block */ -+ BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); -+ ret = ext3_journal_dirty_metadata(handle, gd_bh); -+ if (!err) err = ret; -+ -+ if (overflow && !err) { -+ block += count; -+ count = overflow; -+ goto do_more; -+ } -+ sb->s_dirt = 1; -+error_return: -+ ext3_std_error(sb, err); -+ return; -+} -+ -+int ext3_mb_reserve_blocks(struct super_block *sb, int blocks) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ struct ext3_super_block *es; -+ int free, ret = -ENOSPC; -+ -+ BUG_ON(blocks < 0); -+ es = EXT3_SB(sb)->s_es; -+ spin_lock(&sbi->s_reserve_lock); -+ free = le32_to_cpu(es->s_free_blocks_count); -+ if (blocks <= free - sbi->s_blocks_reserved) { -+ sbi->s_blocks_reserved += blocks; -+ ret = 0; -+ } -+ spin_unlock(&sbi->s_reserve_lock); -+ return ret; -+} -+ -+void ext3_mb_release_blocks(struct super_block *sb, int blocks) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ -+ BUG_ON(blocks < 0); -+ spin_lock(&sbi->s_reserve_lock); -+ sbi->s_blocks_reserved -= blocks; -+ if (sbi->s_blocks_reserved < 0) -+ printk("EXT3-fs: reserve leak 
%ld\n", sbi->s_blocks_reserved); -+ if (sbi->s_blocks_reserved < 0) -+ sbi->s_blocks_reserved = 0; -+ spin_unlock(&sbi->s_reserve_lock); -+} -+ -+int ext3_new_block(handle_t *handle, struct inode *inode, -+ unsigned long goal, u32 *pc, u32 *pb, int *errp) -+{ -+ int ret, len; -+ -+ if (!test_opt(inode->i_sb, MBALLOC)) { -+ ret = ext3_new_block_old(handle, inode, goal, pc, pb, errp); -+ goto out; -+ } -+ len = 1; -+ ret = ext3_mb_new_blocks(handle, inode, goal, &len, 0, errp); -+out: -+ return ret; -+} -+ -+ -+void ext3_free_blocks(handle_t *handle, struct inode * inode, -+ unsigned long block, unsigned long count, int metadata) -+{ -+ if (!test_opt(inode->i_sb, MBALLOC)) -+ ext3_free_blocks_old(handle, inode, block, count); -+ else -+ ext3_mb_free_blocks(handle, inode, block, count, metadata); -+ return; -+} -+ -Index: linux-2.4.20-rh-20.9/fs/ext3/super.c -=================================================================== ---- linux-2.4.20-rh-20.9.orig/fs/ext3/super.c 2004-10-15 20:43:32.000000000 +0400 -+++ linux-2.4.20-rh-20.9/fs/ext3/super.c 2004-10-15 20:57:33.000000000 +0400 -@@ -622,6 +622,7 @@ - kdev_t j_dev = sbi->s_journal->j_dev; - int i; - -+ ext3_mb_release(sb); - J_ASSERT(sbi->s_delete_inodes == 0); - ext3_ext_release(sb); - ext3_xattr_put_super(sb); -@@ -877,6 +878,8 @@ - else if (want_numeric(value, "journal", inum)) - return 0; - } -+ else if (!strcmp (this_char, "mballoc")) -+ set_opt (*mount_options, MBALLOC); - else if (!strcmp (this_char, "noload")) - set_opt (*mount_options, NOLOAD); - else if (!strcmp (this_char, "data")) { -@@ -1506,6 +1509,7 @@ - } - - ext3_ext_init(sb); -+ ext3_mb_init(sb); - - return sb; - -Index: linux-2.4.20-rh-20.9/fs/ext3/Makefile -=================================================================== ---- linux-2.4.20-rh-20.9.orig/fs/ext3/Makefile 2004-10-15 20:43:32.000000000 +0400 -+++ linux-2.4.20-rh-20.9/fs/ext3/Makefile 2004-10-15 22:00:29.000000000 +0400 -@@ -13,8 +13,8 @@ - - obj-y := balloc.o bitmap.o dir.o 
file.o fsync.o ialloc.o inode.o iopen.o \ - ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o \ -- xattr_trusted.o extents.o --export-objs += extents.o -+ xattr_trusted.o extents.o mballoc.o -+export-objs += extents.o mballoc.o - - obj-m := $(O_TARGET) - -Index: linux-2.4.20-rh-20.9/fs/ext3/balloc.c -=================================================================== ---- linux-2.4.20-rh-20.9.orig/fs/ext3/balloc.c 2004-10-15 20:43:28.000000000 +0400 -+++ linux-2.4.20-rh-20.9/fs/ext3/balloc.c 2004-10-15 20:57:33.000000000 +0400 -@@ -203,8 +203,7 @@ - * differentiating between a group for which we have never performed a bitmap - * IO request, and a group for which the last bitmap read request failed. - */ --static inline int load_block_bitmap (struct super_block * sb, -- unsigned int block_group) -+int load_block_bitmap (struct super_block * sb, unsigned int block_group) - { - int slot; - -@@ -253,8 +252,8 @@ - } - - /* Free given blocks, update quota and i_blocks field */ --void ext3_free_blocks (handle_t *handle, struct inode * inode, -- unsigned long block, unsigned long count) -+void ext3_free_blocks_old (handle_t *handle, struct inode * inode, -+ unsigned long block, unsigned long count) - { - struct buffer_head *bitmap_bh; - struct buffer_head *gd_bh; -@@ -531,9 +530,9 @@ - * bitmap, and then for any free bit if that fails. - * This function also updates quota and i_blocks field. 
- */ --int ext3_new_block (handle_t *handle, struct inode * inode, -- unsigned long goal, u32 * prealloc_count, -- u32 * prealloc_block, int * errp) -+int ext3_new_block_old (handle_t *handle, struct inode * inode, -+ unsigned long goal, u32 * prealloc_count, -+ u32 * prealloc_block, int * errp) - { - struct buffer_head * bh, *bhtmp; - struct buffer_head * bh2; -Index: linux-2.4.20-rh-20.9/fs/ext3/namei.c -=================================================================== ---- linux-2.4.20-rh-20.9.orig/fs/ext3/namei.c 2004-10-15 20:43:30.000000000 +0400 -+++ linux-2.4.20-rh-20.9/fs/ext3/namei.c 2004-10-15 20:57:33.000000000 +0400 -@@ -1877,7 +1877,7 @@ - * If the create succeeds, we fill in the inode information - * with d_instantiate(). - */ --static int ext3_create (struct inode * dir, struct dentry * dentry, int mode) -+int ext3_create (struct inode * dir, struct dentry * dentry, int mode) - { - handle_t *handle; - struct inode * inode; -Index: linux-2.4.20-rh-20.9/fs/ext3/inode.c -=================================================================== ---- linux-2.4.20-rh-20.9.orig/fs/ext3/inode.c 2004-10-15 20:43:32.000000000 +0400 -+++ linux-2.4.20-rh-20.9/fs/ext3/inode.c 2004-10-15 20:57:33.000000000 +0400 -@@ -255,7 +255,7 @@ - inode->u.ext3_i.i_prealloc_count = 0; - inode->u.ext3_i.i_prealloc_block = 0; - /* Writer: end */ -- ext3_free_blocks (inode, block, total); -+ ext3_free_blocks (inode, block, total, 1); - } - unlock_kernel(); - #endif -@@ -619,7 +619,7 @@ - ext3_journal_forget(handle, branch[i].bh); - } - for (i = 0; i < keys; i++) -- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1); -+ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1); - return err; - } - -@@ -723,7 +723,7 @@ - if (err == -EAGAIN) - for (i = 0; i < num; i++) - ext3_free_blocks(handle, inode, -- le32_to_cpu(where[i].key), 1); -+ le32_to_cpu(where[i].key), 1, 1); - return err; - } - -@@ -1751,7 +1751,7 @@ - } - } - -- ext3_free_blocks(handle, inode, 
block_to_free, count); -+ ext3_free_blocks(handle, inode, block_to_free, count, 1); - } - - /** -@@ -1923,7 +1923,7 @@ - ext3_journal_test_restart(handle, inode); - } - -- ext3_free_blocks(handle, inode, nr, 1); -+ ext3_free_blocks(handle, inode, nr, 1, 1); - - if (parent_bh) { - /* -Index: linux-2.4.20-rh-20.9/fs/ext3/extents.c -=================================================================== ---- linux-2.4.20-rh-20.9.orig/fs/ext3/extents.c 2004-10-15 20:43:32.000000000 +0400 -+++ linux-2.4.20-rh-20.9/fs/ext3/extents.c 2004-10-15 20:57:33.000000000 +0400 -@@ -741,7 +741,7 @@ - for (i = 0; i < depth; i++) { - if (!ablocks[i]) - continue; -- ext3_free_blocks(handle, tree->inode, ablocks[i], 1); -+ ext3_free_blocks(handle, tree->inode, ablocks[i], 1, 1); - } - } - kfree(ablocks); -@@ -1389,7 +1389,7 @@ - path->p_idx->ei_leaf); - bh = sb_get_hash_table(tree->inode->i_sb, path->p_idx->ei_leaf); - ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); -- ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); -+ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1, 1); - return err; - } - -@@ -1847,10 +1847,12 @@ - int needed = ext3_remove_blocks_credits(tree, ex, from, to); - handle_t *handle = ext3_journal_start(tree->inode, needed); - struct buffer_head *bh; -- int i; -+ int i, metadata = 0; - - if (IS_ERR(handle)) - return PTR_ERR(handle); -+ if (S_ISDIR(tree->inode->i_mode)) -+ metadata = 1; - if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { - /* tail removal */ - unsigned long num, start; -@@ -1862,7 +1864,7 @@ - bh = sb_get_hash_table(tree->inode->i_sb, start + i); - ext3_forget(handle, 0, tree->inode, bh, start + i); - } -- ext3_free_blocks(handle, tree->inode, start, num); -+ ext3_free_blocks(handle, tree->inode, start, num, metadata); - } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { - printk("strange request: removal %lu-%lu from %u:%u\n", - from, to, ex->ee_block, ex->ee_len); -Index: 
linux-2.4.20-rh-20.9/fs/ext3/xattr.c -=================================================================== ---- linux-2.4.20-rh-20.9.orig/fs/ext3/xattr.c 2004-10-15 20:43:31.000000000 +0400 -+++ linux-2.4.20-rh-20.9/fs/ext3/xattr.c 2004-10-15 20:57:33.000000000 +0400 -@@ -174,7 +174,7 @@ - ext3_xattr_free_block(handle_t *handle, struct inode * inode, - unsigned long block) - { -- ext3_free_blocks(handle, inode, block, 1); -+ ext3_free_blocks(handle, inode, block, 1, 1); - inode->i_blocks -= inode->i_sb->s_blocksize >> 9; - } - -@@ -182,7 +182,7 @@ - # define ext3_xattr_quota_free(inode) \ - DQUOT_FREE_BLOCK(inode, 1) - # define ext3_xattr_free_block(handle, inode, block) \ -- ext3_free_blocks(handle, inode, block, 1) -+ ext3_free_blocks(handle, inode, block, 1, 1) - #endif - - #if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,18) -Index: linux-2.4.20-rh-20.9/include/linux/ext3_fs.h -=================================================================== ---- linux-2.4.20-rh-20.9.orig/include/linux/ext3_fs.h 2004-10-15 20:43:32.000000000 +0400 -+++ linux-2.4.20-rh-20.9/include/linux/ext3_fs.h 2004-10-15 20:57:33.000000000 +0400 -@@ -334,6 +334,7 @@ - #define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */ - #define EXT3_MOUNT_EXTENTS 0x100000/* Extents support */ - #define EXT3_MOUNT_EXTDEBUG 0x200000/* Extents debug */ -+#define EXT3_MOUNT_MBALLOC 0x400000/* buddy allocation support */ - - /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ - #ifndef _LINUX_EXT2_FS_H -@@ -664,7 +665,7 @@ - extern int ext3_new_block (handle_t *, struct inode *, unsigned long, - __u32 *, __u32 *, int *); - extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long, -- unsigned long); -+ unsigned long, int); - extern unsigned long ext3_count_free_blocks (struct super_block *); - extern void ext3_check_blocks_bitmap (struct super_block *); - extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb, -@@ -727,6 +728,13 @@ - 
extern int ext3_ioctl (struct inode *, struct file *, unsigned int, - unsigned long); - -+/* mballoc.c */ -+extern int ext3_mb_init(struct super_block *sb); -+extern int ext3_mb_new_blocks(handle_t *handle, struct inode *inode, -+ unsigned long goal,int *len, int flags,int *errp); -+extern int ext3_mb_release(struct super_block *sb); -+extern void ext3_mb_release_blocks(struct super_block *, int); -+ - /* namei.c */ - extern int ext3_orphan_add(handle_t *, struct inode *); - extern int ext3_orphan_del(handle_t *, struct inode *); -Index: linux-2.4.20-rh-20.9/include/linux/ext3_fs_sb.h -=================================================================== ---- linux-2.4.20-rh-20.9.orig/include/linux/ext3_fs_sb.h 2004-10-15 20:43:29.000000000 +0400 -+++ linux-2.4.20-rh-20.9/include/linux/ext3_fs_sb.h 2004-10-20 22:08:40.000000000 +0400 -@@ -19,6 +19,7 @@ - #ifdef __KERNEL__ - #include - #include -+#include - #endif - - /* -@@ -31,6 +32,25 @@ - - #define EXT3_DELETE_THREAD - -+#define EXT3_BB_MAX_BLOCKS 30 -+struct ext3_free_metadata { -+ unsigned short group; -+ unsigned short num; -+ unsigned short blocks[EXT3_BB_MAX_BLOCKS]; -+ struct list_head list; -+}; -+ -+#define EXT3_BB_MAX_ORDER 14 -+ -+struct ext3_buddy_group_blocks { -+ unsigned long bb_bitmap; -+ unsigned long bb_buddy; -+ spinlock_t bb_lock; -+ unsigned bb_counters[EXT3_BB_MAX_ORDER]; -+ struct ext3_free_metadata *bb_md_cur; -+ unsigned long bb_tid; -+}; -+ - /* - * third extended-fs super-block data in memory - */ -@@ -86,6 +106,17 @@ - wait_queue_head_t s_delete_thread_queue; - wait_queue_head_t s_delete_waiter_queue; - #endif -+ -+ /* for buddy allocator */ -+ struct ext3_buddy_group_blocks **s_buddy_blocks; -+ struct inode *s_buddy; -+ long s_blocks_reserved; -+ spinlock_t s_reserve_lock; -+ struct list_head s_active_transaction; -+ struct list_head s_closed_transaction; -+ struct list_head s_committed_transaction; -+ spinlock_t s_md_lock; -+ unsigned int s_last_transaction; - }; - - #endif /* 
_LINUX_EXT3_FS_SB */ diff --git a/lustre/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch b/lustre/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch index 5decb55..f36b90c 100644 --- a/lustre/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch +++ b/lustre/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch @@ -446,31 +446,31 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c +static inline int mb_test_bit(int bit, void *addr) +{ + mb_correct_addr_and_bit(bit,addr); -+ return ext2_test_bit(bit, addr); ++ return ext3_test_bit(bit, addr); +} + +static inline void mb_set_bit(int bit, void *addr) +{ + mb_correct_addr_and_bit(bit,addr); -+ ext2_set_bit(bit, addr); ++ ext3_set_bit(bit, addr); +} + +static inline void mb_set_bit_atomic(int bit, void *addr) +{ + mb_correct_addr_and_bit(bit,addr); -+ ext2_set_bit_atomic(NULL, bit, addr); ++ ext3_set_bit_atomic(NULL, bit, addr); +} + +static inline void mb_clear_bit(int bit, void *addr) +{ + mb_correct_addr_and_bit(bit,addr); -+ ext2_clear_bit(bit, addr); ++ ext3_clear_bit(bit, addr); +} + +static inline void mb_clear_bit_atomic(int bit, void *addr) +{ + mb_correct_addr_and_bit(bit,addr); -+ ext2_clear_bit_atomic(NULL, bit, addr); ++ ext3_clear_bit_atomic(NULL, bit, addr); +} + +static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max) @@ -1015,7 +1015,7 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + i = e3b->bd_bd->bb_first_free; + + while (free && ac->ac_status != AC_STATUS_FOUND) { -+ i = ext2_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i); ++ i = ext3_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i); + if (i >= sb->s_blocksize * 8) { + J_ASSERT(free == 0); + break; @@ -1205,13 +1205,13 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + + if (ac.ac_status == AC_STATUS_BREAK && + !(ac.ac_flags & EXT3_MB_HINT_FIRST)) { -+ /* -+ * We've been searching too long. 
Let's try to allocate -+ * the best chunk we've found so far -+ */ -+ ext3_warning(inode->i_sb, __FUNCTION__, -+ "too long searching: got %d want %d\n", -+ ac.ac_b_ex.fe_len, ac.ac_g_ex.fe_len); ++ /* We've been searching too long. Let's try to allocate ++ * the best chunk we've found so far. */ ++ if (ac.ac_g_ex.fe_len >= 128 && ++ ac.ac_b_ex.fe_len < ac.ac_g_ex.fe_len / 4) ++ ext3_warning(inode->i_sb, __FUNCTION__, ++ "too long searching: got %d want %d\n", ++ ac.ac_b_ex.fe_len, ac.ac_g_ex.fe_len); + ext3_mb_try_best_found(&ac, &e3b); + if (ac.ac_status != AC_STATUS_FOUND) { + /* @@ -1219,7 +1219,7 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + * The only thing we can do is just take first + * found block(s) + */ -+ printk(KERN_ERR "EXT3-fs: and someone won our chunk\n"); ++ mb_debug(KERN_ERR "EXT3-fs: and someone won our chunk\n"); + ac.ac_b_ex.fe_group = 0; + ac.ac_b_ex.fe_start = 0; + ac.ac_b_ex.fe_len = 0; @@ -2413,7 +2413,7 @@ Index: linux-2.6.5-7.201/fs/ext3/super.c {Opt_extents, "extents"}, {Opt_extdebug, "extdebug"}, + {Opt_mballoc, "mballoc"}, -+ {Opt_mballoc, "mbfactor=%u"}, ++ {Opt_mbfactor, "mbfactor=%u"}, {Opt_err, NULL} }; diff --git a/lustre/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch b/lustre/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch index fbd014f..72b7926 100644 --- a/lustre/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch +++ b/lustre/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch @@ -463,31 +463,31 @@ Index: linux-2.6.9/fs/ext3/mballoc.c +static inline int mb_test_bit(int bit, void *addr) +{ + mb_correct_addr_and_bit(bit,addr); -+ return ext2_test_bit(bit, addr); ++ return ext3_test_bit(bit, addr); +} + +static inline void mb_set_bit(int bit, void *addr) +{ + mb_correct_addr_and_bit(bit,addr); -+ ext2_set_bit(bit, addr); ++ ext3_set_bit(bit, addr); +} + +static inline void mb_set_bit_atomic(int bit, void *addr) +{ + mb_correct_addr_and_bit(bit,addr); -+ ext2_set_bit_atomic(NULL, bit, addr); ++ 
ext3_set_bit_atomic(NULL, bit, addr); +} + +static inline void mb_clear_bit(int bit, void *addr) +{ + mb_correct_addr_and_bit(bit,addr); -+ ext2_clear_bit(bit, addr); ++ ext3_clear_bit(bit, addr); +} + +static inline void mb_clear_bit_atomic(int bit, void *addr) +{ + mb_correct_addr_and_bit(bit,addr); -+ ext2_clear_bit_atomic(NULL, bit, addr); ++ ext3_clear_bit_atomic(NULL, bit, addr); +} + +static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max) @@ -1032,7 +1032,7 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + i = e3b->bd_bd->bb_first_free; + + while (free && ac->ac_status != AC_STATUS_FOUND) { -+ i = ext2_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i); ++ i = ext3_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i); + if (i >= sb->s_blocksize * 8) { + J_ASSERT(free == 0); + break; @@ -1222,13 +1222,13 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + + if (ac.ac_status == AC_STATUS_BREAK && + !(ac.ac_flags & EXT3_MB_HINT_FIRST)) { -+ /* -+ * We've been searching too long. Let's try to allocate -+ * the best chunk we've found so far -+ */ -+ ext3_warning(inode->i_sb, __FUNCTION__, -+ "too long searching: got %d want %d\n", -+ ac.ac_b_ex.fe_len, ac.ac_g_ex.fe_len); ++ /* We've been searching too long. Let's try to allocate ++ * the best chunk we've found so far. 
*/ ++ if (ac.ac_g_ex.fe_len >= 128 && ++ ac.ac_b_ex.fe_len < ac.ac_g_ex.fe_len / 4) ++ ext3_warning(inode->i_sb, __FUNCTION__, ++ "too long searching: got %d want %d\n", ++ ac.ac_b_ex.fe_len, ac.ac_g_ex.fe_len); + ext3_mb_try_best_found(&ac, &e3b); + if (ac.ac_status != AC_STATUS_FOUND) { + /* @@ -1236,7 +1236,7 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + * The only thing we can do is just take first + * found block(s) + */ -+ printk(KERN_ERR "EXT3-fs: and someone won our chunk\n"); ++ mb_debug(KERN_ERR "EXT3-fs: and someone won our chunk\n"); + ac.ac_b_ex.fe_group = 0; + ac.ac_b_ex.fe_start = 0; + ac.ac_b_ex.fe_len = 0; @@ -2428,7 +2428,7 @@ Index: linux-2.6.9/fs/ext3/super.c {Opt_extents, "extents"}, {Opt_extdebug, "extdebug"}, + {Opt_mballoc, "mballoc"}, -+ {Opt_mballoc, "mbfactor=%u"}, ++ {Opt_mbfactor, "mbfactor=%u"}, {Opt_barrier, "barrier=%u"}, {Opt_err, NULL}, {Opt_resize, "resize"}, diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index 90bb3a9..7f49fda 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -36,6 +36,7 @@ #include /* for LUSTRE_OSC_NAME */ #include /* for LUSTRE_MDC_NAME */ #include +#include #include /* @priority: if non-zero, move the selected to the list head @@ -200,7 +201,7 @@ int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf) /* In a more perfect world, we would hang a ptlrpc_client off of * obd_type and just use the values from there. 
*/ if (!strcmp(name, LUSTRE_OSC_NAME)) { - rq_portal = OST_REQUEST_PORTAL; + rq_portal = OST_IO_PORTAL; rp_portal = OSC_REPLY_PORTAL; connect_op = OST_CONNECT; } else if (!strcmp(name, LUSTRE_MDC_NAME)) { @@ -261,7 +262,7 @@ int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf) if (num_physpages >> (20 - PAGE_SHIFT) <= 128) { /* <= 128 MB */ cli->cl_max_pages_per_rpc = PTLRPC_MAX_BRW_PAGES / 4; cli->cl_max_rpcs_in_flight = OSC_MAX_RIF_DEFAULT / 4; - } else if (num_physpages >> (20 - PAGE_SHIFT) <= 512) { /* <= 512 MB */ + } else if (num_physpages >> (20 - PAGE_SHIFT) <= 256) { /* <= 256 MB */ cli->cl_max_pages_per_rpc = PTLRPC_MAX_BRW_PAGES / 2; cli->cl_max_rpcs_in_flight = OSC_MAX_RIF_DEFAULT / 2; } else { @@ -479,7 +480,8 @@ int target_handle_reconnect(struct lustre_handle *conn, struct obd_export *exp, CWARN("%s reconnecting\n", cluuid->uuid); conn->cookie = exp->exp_handle.h_cookie; /* target_handle_connect() treats EALREADY and - * -EALREADY differently */ + * -EALREADY differently. EALREADY means we are + * doing a valid reconnect from the same client. */ RETURN(EALREADY); } else { CERROR("%s reconnecting from %s, " @@ -489,7 +491,8 @@ int target_handle_reconnect(struct lustre_handle *conn, struct obd_export *exp, hdl->cookie, conn->cookie); memset(conn, 0, sizeof *conn); /* target_handle_connect() treats EALREADY and - * -EALREADY differently */ + * -EALREADY differently. -EALREADY is an error + * (same UUID, different handle). 
*/ RETURN(-EALREADY); } } @@ -582,6 +585,22 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler) if (rc) GOTO(out, rc); + if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_LIBCLIENT) { + if (!data || (data->ocd_version < LUSTRE_VERSION_CODE - + LUSTRE_VERSION_ALLOWED_OFFSET)) { + DEBUG_REQ(D_INFO, req, "Refusing old (%d.%d.%d.%d) " + "libclient connection attempt\n", + OBD_OCD_VERSION_MAJOR(data->ocd_version), + OBD_OCD_VERSION_MINOR(data->ocd_version), + OBD_OCD_VERSION_PATCH(data->ocd_version), + OBD_OCD_VERSION_FIX(data->ocd_version)); + data = lustre_msg_buf(req->rq_repmsg, 0, sizeof(*data)); + data->ocd_connect_flags = OBD_CONNECT_VERSION; + data->ocd_version = LUSTRE_VERSION_CODE; + GOTO(out, rc = -EPROTO); + } + } + /* lctl gets a backstage, all-access pass. */ if (obd_uuid_equals(&cluuid, &target->obd_uuid)) goto dont_check_exports; @@ -607,6 +626,12 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler) GOTO(out, rc = -EALREADY); } + /* We indicate the reconnection in a flag, not an error code. */ + if (rc == EALREADY) { + lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_RECONNECT); + rc = 0; + } + /* Tell the client if we're in recovery. */ /* If this is the first client, start the recovery timer */ if (target->obd_recovering) { @@ -630,8 +655,15 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler) dont_check_exports: rc = obd_connect(&conn, target, &cluuid, data); } + } else { + rc = obd_reconnect(export, target, &cluuid, data); } + /* we want to handle EALREADY but *not* -EALREADY from + * target_handle_reconnect() */ + if (rc && rc != EALREADY) + GOTO(out, rc); + /* Return only the parts of obd_connect_data that we understand, so the * client knows that we don't understand the rest. */ if (data) @@ -641,13 +673,14 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler) /* If all else goes well, this is our RPC return code. 
*/ req->rq_status = 0; - /* we want to handle EALREADY but *not* -EALREADY from - * target_handle_reconnect() */ - if (rc && rc != EALREADY) - GOTO(out, rc); - req->rq_repmsg->handle = conn; + /* ownership of this export ref transfers to the request AFTER we + * drop any previous reference the request had, but we don't want + * that to go to zero before we get our new export reference. */ + export = class_conn2export(&conn); + LASSERT(export != NULL); + /* If the client and the server are the same node, we will already * have an export that really points to the client's DLM export, * because we have a shared handles table. @@ -658,9 +691,7 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler) if (req->rq_export != NULL) class_export_put(req->rq_export); - /* ownership of this export ref transfers to the request */ - export = req->rq_export = class_conn2export(&conn); - LASSERT(export != NULL); + req->rq_export = export; spin_lock_irqsave(&export->exp_lock, flags); if (export->exp_conn_cnt >= req->rq_reqmsg->conn_cnt) { @@ -686,11 +717,9 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler) export->exp_connection = ptlrpc_get_connection(req->rq_peer, req->rq_self, &remote_uuid); - if (rc == EALREADY) { - /* We indicate the reconnection in a flag, not an error code. 
*/ - lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_RECONNECT); + + if (lustre_msg_get_op_flags(req->rq_repmsg) & MSG_CONNECT_RECONNECT) GOTO(out, rc = 0); - } if (target->obd_recovering) target->obd_connected_clients++; @@ -1169,7 +1198,7 @@ int target_queue_final_reply(struct ptlrpc_request *req, int rc) OBD_ALLOC(reqmsg, req->rq_reqlen); if (!reqmsg) LBUG(); - memcpy(saved_req, req, sizeof *saved_req); + *saved_req = *req; memcpy(reqmsg, req->rq_reqmsg, req->rq_reqlen); /* Don't race cleanup */ @@ -1402,3 +1431,13 @@ int target_handle_dqacq_callback(struct ptlrpc_request *req) #endif /* !__KERNEL__ */ } #endif /* HAVE_QUOTA_SUPPORT */ + +ldlm_mode_t lck_compat_array[] = { + [LCK_EX] LCK_COMPAT_EX, + [LCK_PW] LCK_COMPAT_PW, + [LCK_PR] LCK_COMPAT_PR, + [LCK_CW] LCK_COMPAT_CW, + [LCK_CR] LCK_COMPAT_CR, + [LCK_NL] LCK_COMPAT_NL, + [LCK_GROUP] LCK_COMPAT_GROUP +}; diff --git a/lustre/ldlm/ldlm_lock.c b/lustre/ldlm/ldlm_lock.c index b8312eb..8d6eb94 100644 --- a/lustre/ldlm/ldlm_lock.c +++ b/lustre/ldlm/ldlm_lock.c @@ -551,7 +551,7 @@ void ldlm_lock_decref_internal(struct ldlm_lock *lock, __u32 mode) void ldlm_lock_decref(struct lustre_handle *lockh, __u32 mode) { struct ldlm_lock *lock = __ldlm_handle2lock(lockh, 0); - LASSERT(lock != NULL); + LASSERTF(lock != NULL, "Non-existing lock: "LPX64"\n", lockh->cookie); ldlm_lock_decref_internal(lock, mode); LDLM_LOCK_PUT(lock); } @@ -847,6 +847,7 @@ ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *ns, ldlm_error_t rc = ELDLM_OK; ENTRY; + do_gettimeofday(&lock->l_enqueued_time); /* policies are not executed on the client or during replay */ if ((*flags & (LDLM_FL_HAS_INTENT|LDLM_FL_REPLAY)) == LDLM_FL_HAS_INTENT && !local && ns->ns_policy) { @@ -882,7 +883,7 @@ ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *ns, /* Some flags from the enqueue want to make it into the AST, via the * lock's l_flags. 
*/ - lock->l_flags |= (*flags & (LDLM_AST_DISCARD_DATA|LDLM_INHERIT_FLAGS)); + lock->l_flags |= *flags & LDLM_AST_DISCARD_DATA; /* This distinction between local lock trees is very important; a client * namespace only has information about locks taken by that client, and diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c index 109584c..846feee 100644 --- a/lustre/ldlm/ldlm_lockd.c +++ b/lustre/ldlm/ldlm_lockd.c @@ -263,6 +263,7 @@ static int ldlm_add_waiting_lock(struct ldlm_lock *lock) unsigned long timeout_rounded; l_check_ns_lock(lock->l_resource->lr_namespace); + LASSERT(!(lock->l_flags & LDLM_FL_CANCEL_ON_BLOCK)); spin_lock_bh(&waiting_locks_spinlock); if (lock->l_destroyed) { @@ -348,6 +349,7 @@ int ldlm_del_waiting_lock(struct ldlm_lock *lock) static int ldlm_add_waiting_lock(struct ldlm_lock *lock) { + LASSERT(!(lock->l_flags & LDLM_FL_CANCEL_ON_BLOCK)); RETURN(1); } @@ -369,7 +371,7 @@ static void ldlm_failed_ast(struct ldlm_lock *lock, int rc, LDLM_ERROR(lock, "%s AST failed (%d): evicting client %s@%s NID %s" " (%s)", ast_type, rc, lock->l_export->exp_client_uuid.uuid, - conn->c_remote_uuid.uuid, libcfs_nid2str(conn->c_peer.nid), + conn->c_remote_uuid.uuid, libcfs_nid2str(conn->c_peer.nid), str); if (obd_dump_on_timeout) @@ -471,19 +473,17 @@ int ldlm_server_blocking_ast(struct ldlm_lock *lock, } body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body)); - memcpy(&body->lock_handle1, &lock->l_remote_handle, - sizeof(body->lock_handle1)); - memcpy(&body->lock_desc, desc, sizeof(*desc)); + body->lock_handle1 = lock->l_remote_handle; + body->lock_desc = *desc; body->lock_flags |= (lock->l_flags & LDLM_AST_FLAGS); LDLM_DEBUG(lock, "server preparing blocking AST"); req->rq_replen = lustre_msg_size(0, NULL); - if (instant_cancel) { + if (instant_cancel) ldlm_lock_cancel(lock); -// ldlm_reprocess_all(lock->l_resource); - } else if (lock->l_granted_mode == lock->l_req_mode) { + else if (lock->l_granted_mode == lock->l_req_mode) 
ldlm_add_waiting_lock(lock); - } + l_unlock(&lock->l_resource->lr_namespace->ns_lock); req->rq_send_state = LUSTRE_IMP_FULL; @@ -540,8 +540,7 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags, void *data) RETURN(-ENOMEM); body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body)); - memcpy(&body->lock_handle1, &lock->l_remote_handle, - sizeof(body->lock_handle1)); + body->lock_handle1 = lock->l_remote_handle; body->lock_flags = flags; ldlm_lock2desc(lock, &body->lock_desc); @@ -567,8 +566,18 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags, void *data) l_lock(&lock->l_resource->lr_namespace->ns_lock); if (lock->l_flags & LDLM_FL_AST_SENT) { body->lock_flags |= LDLM_FL_AST_SENT; - body->lock_flags &= ~LDLM_FL_CANCEL_ON_BLOCK; - ldlm_add_waiting_lock(lock); /* start the lock-timeout clock */ + + /* We might get here prior to ldlm_handle_enqueue setting + LDLM_FL_CANCEL_ON_BLOCK flag. Then we will put this lock into + waiting list, but this is safe and similar code in + ldlm_handle_enqueue will call ldlm_lock_cancel() still, that + would not only cancel the lock, but will also remove it from + waiting list */ + if (lock->l_flags & LDLM_FL_CANCEL_ON_BLOCK) + ldlm_lock_cancel(lock); + else + ldlm_add_waiting_lock(lock); /* start the lock-timeout + clock */ } l_unlock(&lock->l_resource->lr_namespace->ns_lock); @@ -598,8 +607,7 @@ int ldlm_server_glimpse_ast(struct ldlm_lock *lock, void *data) RETURN(-ENOMEM); body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof(*body)); - memcpy(&body->lock_handle1, &lock->l_remote_handle, - sizeof(body->lock_handle1)); + body->lock_handle1 = lock->l_remote_handle; ldlm_lock2desc(lock, &body->lock_desc); down(&lock->l_resource->lr_lvb_sem); @@ -643,6 +651,10 @@ find_existing_lock(struct obd_export *exp, struct lustre_handle *remote_hdl) } +/* + * Main server-side entry point into LDLM. This is called by ptlrpc service + * threads to carry out client lock enqueueing requests. 
+ */ int ldlm_handle_enqueue(struct ptlrpc_request *req, ldlm_completion_callback completion_callback, ldlm_blocking_callback blocking_callback, @@ -720,8 +732,7 @@ int ldlm_handle_enqueue(struct ptlrpc_request *req, GOTO(out, rc = -ENOMEM); do_gettimeofday(&lock->l_enqueued_time); - memcpy(&lock->l_remote_handle, &dlm_req->lock_handle1, - sizeof(lock->l_remote_handle)); + lock->l_remote_handle = dlm_req->lock_handle1; LDLM_DEBUG(lock, "server-side enqueue handler, new lock created"); OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_ENQUEUE_BLOCKED, obd_timeout * 2); @@ -763,11 +774,9 @@ existing_lock: } if (dlm_req->lock_desc.l_resource.lr_type != LDLM_PLAIN) - memcpy(&lock->l_policy_data, &dlm_req->lock_desc.l_policy_data, - sizeof(ldlm_policy_data_t)); + lock->l_policy_data = dlm_req->lock_desc.l_policy_data; if (dlm_req->lock_desc.l_resource.lr_type == LDLM_EXTENT) - memcpy(&lock->l_req_extent, &lock->l_policy_data.l_extent, - sizeof(lock->l_req_extent)); + lock->l_req_extent = lock->l_policy_data.l_extent; err = ldlm_lock_enqueue(obddev->obd_namespace, &lock, cookie, &flags); if (err) @@ -782,6 +791,12 @@ existing_lock: /* We never send a blocking AST until the lock is granted, but * we can tell it right now */ l_lock(&lock->l_resource->lr_namespace->ns_lock); + + /* Now take into account flags to be inherited from original lock + request both in reply to client and in our own lock flags. */ + dlm_rep->lock_flags |= dlm_req->lock_flags & LDLM_INHERIT_FLAGS; + lock->l_flags |= dlm_req->lock_flags & LDLM_INHERIT_FLAGS; + /* Don't move a pending lock onto the export if it has already * been evicted. Cancel it now instead. 
(bug 5683) */ if (req->rq_export->exp_failed || @@ -790,10 +805,33 @@ existing_lock: rc = -ENOTCONN; } else if (lock->l_flags & LDLM_FL_AST_SENT) { dlm_rep->lock_flags |= LDLM_FL_AST_SENT; - dlm_rep->lock_flags &= ~LDLM_FL_CANCEL_ON_BLOCK; - if (lock->l_granted_mode == lock->l_req_mode) + if (dlm_rep->lock_flags & LDLM_FL_CANCEL_ON_BLOCK) + ldlm_lock_cancel(lock); + else if (lock->l_granted_mode == lock->l_req_mode) ldlm_add_waiting_lock(lock); } + if ((dlm_req->lock_desc.l_resource.lr_type == LDLM_PLAIN) && + req->rq_export->exp_libclient) { + if (!(lock->l_flags & LDLM_FL_CANCEL_ON_BLOCK) || + !(dlm_rep->lock_flags & LDLM_FL_CANCEL_ON_BLOCK)) { + CERROR("Granting sync lock to libclient. " + "req fl %d, rep fl %d, lock fl %d\n", + dlm_req->lock_flags, dlm_rep->lock_flags, + lock->l_flags); + LDLM_ERROR(lock, "sync lock"); + if (dlm_req->lock_flags & LDLM_FL_HAS_INTENT) { + struct ldlm_intent *it; + it = lustre_msg_buf(req->rq_reqmsg, 1, + sizeof(*it)); + if (it != NULL) { + CERROR("This is intent %s (" + LPU64 ")\n", + ldlm_it2str(it->opc), it->opc); + } + } + } + } + l_unlock(&lock->l_resource->lr_namespace->ns_lock); EXIT; @@ -834,8 +872,10 @@ existing_lock: if (!err && dlm_req->lock_desc.l_resource.lr_type != LDLM_FLOCK) ldlm_reprocess_all(lock->l_resource); + LDLM_LOCK_PUT(lock); } + LDLM_DEBUG_NOLOCK("server-side enqueue handler END (lock %p, rc %d)", lock, rc); @@ -1023,8 +1063,7 @@ static void ldlm_handle_cp_callback(struct ptlrpc_request *req, } if (lock->l_resource->lr_type != LDLM_PLAIN) { - memcpy(&lock->l_policy_data, &dlm_req->lock_desc.l_policy_data, - sizeof(lock->l_policy_data)); + lock->l_policy_data = dlm_req->lock_desc.l_policy_data; LDLM_DEBUG(lock, "completion AST, new policy data"); } diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c index 3aedfc5..0aab832 100644 --- a/lustre/ldlm/ldlm_request.c +++ b/lustre/ldlm/ldlm_request.c @@ -254,22 +254,20 @@ static int ldlm_cli_enqueue_local(struct ldlm_namespace *ns, 
ldlm_lock_addref_internal(lock, mode); ldlm_lock2handle(lock, lockh); lock->l_flags |= LDLM_FL_LOCAL; - lock->l_flags |= *flags & LDLM_INHERIT_FLAGS; lock->l_lvb_swabber = lvb_swabber; if (policy != NULL) - memcpy(&lock->l_policy_data, policy, sizeof(*policy)); + lock->l_policy_data = *policy; if (type == LDLM_EXTENT) - memcpy(&lock->l_req_extent, &policy->l_extent, - sizeof(policy->l_extent)); + lock->l_req_extent = policy->l_extent; err = ldlm_lock_enqueue(ns, &lock, policy, flags); if (err != ELDLM_OK) GOTO(out, err); if (policy != NULL) - memcpy(policy, &lock->l_policy_data, sizeof(*policy)); + *policy = lock->l_policy_data; if ((*flags) & LDLM_FL_LOCK_CHANGED) - memcpy(&res_id, &lock->l_resource->lr_name, sizeof(res_id)); + res_id = lock->l_resource->lr_name; LDLM_DEBUG_NOLOCK("client-side local enqueue handler END (lock %p)", lock); @@ -369,8 +367,7 @@ int ldlm_cli_enqueue(struct obd_export *exp, } if (type == LDLM_EXTENT) - memcpy(&lock->l_req_extent, &policy->l_extent, - sizeof(policy->l_extent)); + lock->l_req_extent = policy->l_extent; LDLM_DEBUG(lock, "client-side enqueue START"); } @@ -401,7 +398,7 @@ int ldlm_cli_enqueue(struct obd_export *exp, ldlm_lock2desc(lock, &body->lock_desc); body->lock_flags = *flags; - memcpy(&body->lock_handle1, lockh, sizeof(*lockh)); + body->lock_handle1 = *lockh; /* Continue as normal. 
*/ if (!req_passed_in) { @@ -454,8 +451,7 @@ int ldlm_cli_enqueue(struct obd_export *exp, /* lock enqueued on the server */ cleanup_phase = 1; - memcpy(&lock->l_remote_handle, &reply->lock_handle, - sizeof(lock->l_remote_handle)); + lock->l_remote_handle = reply->lock_handle; *flags = reply->lock_flags; lock->l_flags |= reply->lock_flags & LDLM_INHERIT_FLAGS; @@ -608,8 +604,7 @@ int ldlm_cli_convert(struct lustre_handle *lockh, int new_mode, int *flags) GOTO(out, rc = -ENOMEM); body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body)); - memcpy(&body->lock_handle1, &lock->l_remote_handle, - sizeof(body->lock_handle1)); + body->lock_handle1 = lock->l_remote_handle; body->lock_desc.l_req_mode = new_mode; body->lock_flags = *flags; @@ -703,8 +698,7 @@ int ldlm_cli_cancel(struct lustre_handle *lockh) req->rq_reply_portal = LDLM_CANCEL_REPLY_PORTAL; body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body)); - memcpy(&body->lock_handle1, &lock->l_remote_handle, - sizeof(body->lock_handle1)); + body->lock_handle1 = lock->l_remote_handle; req->rq_replen = lustre_msg_size(0, NULL); @@ -1114,8 +1108,7 @@ static int replay_lock_interpret(struct ptlrpc_request *req, GOTO (out, rc = -EPROTO); } - memcpy(&lock->l_remote_handle, &reply->lock_handle, - sizeof(lock->l_remote_handle)); + lock->l_remote_handle = reply->lock_handle; LDLM_DEBUG(lock, "replayed lock:"); ptlrpc_import_recovery_state_machine(req->rq_import); out: diff --git a/lustre/ldlm/ldlm_resource.c b/lustre/ldlm/ldlm_resource.c index 72e3230..19bd0e5 100644 --- a/lustre/ldlm/ldlm_resource.c +++ b/lustre/ldlm/ldlm_resource.c @@ -479,7 +479,7 @@ ldlm_resource_add(struct ldlm_namespace *ns, struct ldlm_resource *parent, RETURN(NULL); l_lock(&ns->ns_lock); - memcpy(&res->lr_name, &name, sizeof(res->lr_name)); + res->lr_name = name; res->lr_namespace = ns; atomic_inc(&ns->ns_refcount); @@ -688,7 +688,7 @@ EXPORT_SYMBOL(ldlm_resource_unlink_lock); void ldlm_res2desc(struct ldlm_resource *res, struct ldlm_resource_desc *desc) 
{ desc->lr_type = res->lr_type; - memcpy(&desc->lr_name, &res->lr_name, sizeof(desc->lr_name)); + desc->lr_name = res->lr_name; } void ldlm_dump_all_namespaces(int level) diff --git a/lustre/liblustre/Makefile.am b/lustre/liblustre/Makefile.am index 72b71e6..872960f 100644 --- a/lustre/liblustre/Makefile.am +++ b/lustre/liblustre/Makefile.am @@ -61,7 +61,7 @@ liblustre_a_SOURCES = llite_lib.c super.c namei.c rw.c file.c dir.c \ llite_lib.h liblustre.a : $(LUSTRE_LIBS) $(LND_LIBS) $(LNET_LIBS) $(SYSIO_LIBS) $(QUOTA_LIBS) - sh $(srcdir)/genlib.sh "$(SYSIO)" "$(LIBS)" "$(LND_LIBS)" "$(PTHREAD_LIBS)" "$(QUOTA_LIBS)" + sh $(srcdir)/genlib.sh "$(SYSIO)" "$(LIBS)" "$(LND_LIBS)" "$(PTHREAD_LIBS)" "$(QUOTA_LIBS)" "$(CAP_LIBS)" EXTRA_DIST = genlib.sh diff --git a/lustre/liblustre/file.c b/lustre/liblustre/file.c index 04d0894..5cdcde2 100644 --- a/lustre/liblustre/file.c +++ b/lustre/liblustre/file.c @@ -237,7 +237,6 @@ int llu_objects_destroy(struct ptlrpc_request *request, struct inode *dir) int rc; ENTRY; - oti.oti_thread = request->rq_svc_thread; /* req is swabbed so this is safe */ body = lustre_msg_buf(request->rq_repmsg, 0, sizeof(*body)); @@ -418,7 +417,7 @@ _SYSIO_OFF_T llu_iop_pos(struct inode *ino, _SYSIO_OFF_T off) /* this isn't where truncate starts. roughly: * llu_iop_{open,setattr}->llu_setattr_raw->llu_vmtruncate->llu_truncate * we grab the lock back in setattr_raw to avoid races. 
*/ -static void llu_truncate(struct inode *inode) +static void llu_truncate(struct inode *inode, obd_flag flags) { struct llu_inode_info *lli = llu_i2info(inode); struct intnl_stat *st = llu_i2stat(inode); @@ -438,9 +437,12 @@ static void llu_truncate(struct inode *inode) } oa.o_id = lsm->lsm_object_id; - oa.o_valid = OBD_MD_FLID; - obdo_from_inode(&oa, inode, OBD_MD_FLTYPE|OBD_MD_FLMODE|OBD_MD_FLATIME| - OBD_MD_FLMTIME | OBD_MD_FLCTIME); + oa.o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS; + oa.o_flags = flags; /* We don't actually want to copy inode flags */ + + obdo_from_inode(&oa, inode, + OBD_MD_FLTYPE | OBD_MD_FLMODE | OBD_MD_FLATIME | + OBD_MD_FLMTIME | OBD_MD_FLCTIME); obd_adjust_kms(llu_i2obdexp(inode), lsm, st->st_size, 1); @@ -462,11 +464,17 @@ static void llu_truncate(struct inode *inode) return; } /* llu_truncate */ -int llu_vmtruncate(struct inode * inode, loff_t offset) +int llu_vmtruncate(struct inode * inode, loff_t offset, obd_flag flags) { llu_i2stat(inode)->st_size = offset; - llu_truncate(inode); + /* + * llu_truncate() is only called from this + * point. llu_vmtruncate/llu_truncate split exists to mimic the + * structure of Linux VFS truncate code path. + */ + + llu_truncate(inode, flags); return 0; } diff --git a/lustre/liblustre/genlib.sh b/lustre/liblustre/genlib.sh index c7e7e06..6d977b3 100755 --- a/lustre/liblustre/genlib.sh +++ b/lustre/liblustre/genlib.sh @@ -22,6 +22,7 @@ LIBS=$2 LND_LIBS=$3 PTHREAD_LIBS=$4 QUOTA_LIBS=$5 +CAP_LIBS=$6 if [ ! 
-f $SYSIO/lib/libsysio.a ]; then echo "ERROR: $SYSIO/lib/libsysio.a dosen't exist" @@ -103,7 +104,7 @@ if test x$OS = xAIX; then gcc -shared -o $CWD/liblustre.so $ALL_OBJS -lpthread -Xlinker -bnoipath ../../libsyscall.so else $LD -shared -o $CWD/liblustre.so -init __liblustre_setup_ -fini __liblustre_cleanup_ \ - $ALL_OBJS -lcap $PTHREAD_LIBS + $ALL_OBJS $CAP_LIBS $PTHREAD_LIBS fi rm -rf $sysio_tmp diff --git a/lustre/liblustre/llite_lib.c b/lustre/liblustre/llite_lib.c index 10e3472..9e90805 100644 --- a/lustre/liblustre/llite_lib.c +++ b/lustre/liblustre/llite_lib.c @@ -56,6 +56,7 @@ #include "lutil.h" #include "llite_lib.h" +#include static int lllib_init(void) { @@ -87,6 +88,7 @@ int liblustre_process_log(struct config_llog_instance *cfg, struct llog_ctxt *ctxt; lnet_nid_t nid = 0; int err, rc = 0; + struct obd_connect_data *ocd = NULL; ENTRY; generate_random_uuid(uuid); @@ -129,12 +131,18 @@ int liblustre_process_log(struct config_llog_instance *cfg, if (obd == NULL) GOTO(out_cleanup, rc = -EINVAL); + OBD_ALLOC(ocd, sizeof(*ocd)); + if (ocd == NULL) + GOTO(out_cleanup, rc = -ENOMEM); + + ocd->ocd_version = LUSTRE_VERSION_CODE; + /* Disable initial recovery on this import */ rc = obd_set_info(obd->obd_self_export, strlen("initial_recov"), "initial_recov", sizeof(allow_recov), &allow_recov); - rc = obd_connect(&mdc_conn, obd, &mdc_uuid, NULL /*connect_flags*/); + rc = obd_connect(&mdc_conn, obd, &mdc_uuid, ocd); if (rc) { CERROR("cannot connect to %s: rc = %d\n", mdsname, rc); GOTO(out_cleanup, rc); @@ -155,6 +163,9 @@ int liblustre_process_log(struct config_llog_instance *cfg, CERROR("obd_disconnect failed: rc = %d\n", err); out_cleanup: + if (ocd) + OBD_FREE(ocd, sizeof(*ocd)); + lustre_cfg_bufs_reset(&bufs, name); lcfg = lustre_cfg_new(LCFG_CLEANUP, &bufs); err = class_process_config(lcfg); diff --git a/lustre/liblustre/llite_lib.h b/lustre/liblustre/llite_lib.h index a6371924..edfe806 100644 --- a/lustre/liblustre/llite_lib.h +++ 
b/lustre/liblustre/llite_lib.h @@ -25,17 +25,17 @@ struct ll_file_data { struct llu_sb_info { - struct obd_uuid ll_sb_uuid; - struct obd_export *ll_mdc_exp; - struct obd_export *ll_osc_exp; - obd_id ll_rootino; - int ll_flags; - __u64 ll_connect_flags; - struct list_head ll_conn_chain; - - struct obd_uuid ll_mds_uuid; - struct obd_uuid ll_mds_peer_uuid; - char *ll_instance; + struct obd_uuid ll_sb_uuid; + struct obd_export *ll_mdc_exp; + struct obd_export *ll_osc_exp; + obd_id ll_rootino; + int ll_flags; + struct lustre_client_ocd ll_lco; + struct list_head ll_conn_chain; + + struct obd_uuid ll_mds_uuid; + struct obd_uuid ll_mds_peer_uuid; + char *ll_instance; }; #define LL_SBI_NOLCK 0x1 @@ -202,7 +202,7 @@ int llu_iop_open(struct pnode *pnode, int flags, mode_t mode); int llu_mdc_close(struct obd_export *mdc_exp, struct inode *inode); int llu_iop_close(struct inode *inode); _SYSIO_OFF_T llu_iop_pos(struct inode *ino, _SYSIO_OFF_T off); -int llu_vmtruncate(struct inode * inode, loff_t offset); +int llu_vmtruncate(struct inode * inode, loff_t offset, obd_flag obd_flags); void obdo_refresh_inode(struct inode *dst, struct obdo *src, obd_flag valid); int llu_objects_destroy(struct ptlrpc_request *request, struct inode *dir); diff --git a/lustre/liblustre/rw.c b/lustre/liblustre/rw.c index 5bd5642..3804c93 100644 --- a/lustre/liblustre/rw.c +++ b/lustre/liblustre/rw.c @@ -262,7 +262,7 @@ int llu_extent_lock(struct ll_file_data *fd, struct inode *inode, int rc; ENTRY; - LASSERT(lockh->cookie == 0); + LASSERT(!lustre_handle_is_used(lockh)); CLASSERT(ELDLM_OK == 0); /* XXX phil: can we do this? won't it screw the file size up? 
*/ @@ -459,7 +459,7 @@ static int llu_queue_pio(int cmd, struct llu_io_group *group, * The root of the problem is that * * kms = lov_merge_size(lsm, 1); - * if (end > kms) + * if (end >= kms) * glimpse_size(inode); * else * st->st_size = kms; @@ -587,7 +587,7 @@ ssize_t llu_file_prwv(const struct iovec *iovec, int iovlen, RETURN(-ERANGE); lustre_build_lock_params(session->lis_cmd, lli->lli_open_flags, - lli->lli_sbi->ll_connect_flags, + lli->lli_sbi->ll_lco.lco_flags, pos, len, &p); iogroup = get_io_group(inode, max_io_pages(len, iovlen), &p); @@ -609,7 +609,8 @@ ssize_t llu_file_prwv(const struct iovec *iovec, int iovlen, * detection. Rely in OST to handle short reads in that case. */ kms = lov_merge_size(lsm, 1); - if (p.lrp_policy.l_extent.end > kms) { + /* extent.end is last byte of the range */ + if (p.lrp_policy.l_extent.end >= kms) { /* A glimpse is necessary to determine whether * we return a short read or some zeroes at * the end of the buffer @@ -620,14 +621,14 @@ ssize_t llu_file_prwv(const struct iovec *iovec, int iovlen, * comment. 
*/ if ((err = llu_glimpse_size(inode))) { - llu_extent_unlock(fd, inode, lsm, - p.lrp_lock_mode, &lockh); - GOTO(err_put, err); + GOTO(err_unlock, err); } - } else + } else { st->st_size = kms; - } else if (lli->lli_open_flags & O_APPEND) + } + } else if (lli->lli_open_flags & O_APPEND) { pos = st->st_size; + } for (iovidx = 0; iovidx < iovlen; iovidx++) { char *buf = (char *) iovec[iovidx].iov_base; @@ -638,9 +639,7 @@ ssize_t llu_file_prwv(const struct iovec *iovec, int iovlen, if (len < count) count = len; if (IS_BAD_PTR(buf) || IS_BAD_PTR(buf + count)) { - llu_extent_unlock(fd, inode, - lsm, p.lrp_lock_mode, &lockh); - GOTO(err_put, err = -EFAULT); + GOTO(err_unlock, err = -EFAULT); } if (is_read) { @@ -648,9 +647,7 @@ ssize_t llu_file_prwv(const struct iovec *iovec, int iovlen, break; } else { if (pos >= lli->lli_maxbytes) { - llu_extent_unlock(fd, inode, lsm, - p.lrp_lock_mode, &lockh); - GOTO(err_put, err = -EFBIG); + GOTO(err_unlock, err = -EFBIG); } if (pos + count >= lli->lli_maxbytes) count = lli->lli_maxbytes - pos; @@ -658,9 +655,7 @@ ssize_t llu_file_prwv(const struct iovec *iovec, int iovlen, ret = llu_queue_pio(session->lis_cmd, iogroup, buf, count, pos); if (ret < 0) { - llu_extent_unlock(fd, inode, - lsm, p.lrp_lock_mode, &lockh); - GOTO(err_put, err = ret); + GOTO(err_unlock, err = ret); } else { pos += ret; if (!is_read) { @@ -677,19 +672,25 @@ ssize_t llu_file_prwv(const struct iovec *iovec, int iovlen, } LASSERT(len == 0 || is_read); /* libsysio should guarantee this */ - /* - * BUG: lock is released too early. Fix is in bug 9296. 
- */ - err = llu_extent_unlock(fd, inode, lsm, p.lrp_lock_mode, &lockh); - if (err) - CERROR("extent unlock error %d\n", err); - err = obd_trigger_group_io(exp, lsm, NULL, iogroup->lig_oig); if (err) - GOTO(err_put, err); + GOTO(err_unlock, err); + + err = oig_wait(iogroup->lig_oig); + if (err) { + CERROR("sync error %d, data corruption possible\n", err); + GOTO(err_unlock, err); + } + + ret = llu_extent_unlock(fd, inode, lsm, p.lrp_lock_mode, &lockh); + if (ret) + CERROR("extent unlock error %d\n", ret); session->lis_groups[session->lis_ngroups++] = iogroup; RETURN(0); + +err_unlock: + llu_extent_unlock(fd, inode, lsm, p.lrp_lock_mode, &lockh); err_put: put_io_group(iogroup); RETURN((ssize_t)err); diff --git a/lustre/liblustre/super.c b/lustre/liblustre/super.c index 8889cd1..b84ff27 100644 --- a/lustre/liblustre/super.c +++ b/lustre/liblustre/super.c @@ -51,6 +51,7 @@ #undef LIST_HEAD #include "llite_lib.h" +#include #ifndef MAY_EXEC #define MAY_EXEC 1 @@ -583,12 +584,14 @@ static int inode_setattr(struct inode * inode, struct iattr * attr) struct intnl_stat *st = llu_i2stat(inode); int error = 0; - if (ia_valid & ATTR_SIZE) { - error = llu_vmtruncate(inode, attr->ia_size); - if (error) - goto out; - } + /* + * inode_setattr() is only ever invoked with ATTR_SIZE (by + * llu_setattr_raw()) when file has no bodies. Check this. 
+ */ LASSERT(ergo(ia_valid & ATTR_SIZE, llu_i2info(inode)->lli_smd == NULL)); + if (ia_valid & ATTR_SIZE) + st->st_size = attr->ia_size; if (ia_valid & ATTR_UID) st->st_uid = attr->ia_uid; if (ia_valid & ATTR_GID) @@ -605,7 +608,6 @@ static int inode_setattr(struct inode * inode, struct iattr * attr) st->st_mode &= ~S_ISGID; } /* mark_inode_dirty(inode); */ -out: return error; } @@ -730,27 +732,46 @@ int llu_setattr_raw(struct inode *inode, struct iattr *attr) if (ia_valid & ATTR_SIZE) { ldlm_policy_data_t policy = { .l_extent = {attr->ia_size, OBD_OBJECT_EOF} }; - struct lustre_handle lockh = { 0 }; - int err, ast_flags = 0; + struct lustre_handle lockh = { 0, }; + struct lustre_handle match_lockh = { 0, }; + + int err; + int flags = LDLM_FL_TEST_LOCK; /* for assertion check below */ + int lock_mode; + obd_flag obd_flags; + + /* check that there are no matching locks */ + LASSERT(obd_match(sbi->ll_osc_exp, lsm, LDLM_EXTENT, &policy, + LCK_PW, &flags, inode, &match_lockh) <= 0); + /* XXX when we fix the AST intents to pass the discard-range * XXX extent, make ast_flags always LDLM_AST_DISCARD_DATA * XXX here. */ - if (attr->ia_size == 0) - ast_flags = LDLM_AST_DISCARD_DATA; + flags = (attr->ia_size == 0) ? LDLM_AST_DISCARD_DATA : 0; - rc = llu_extent_lock(NULL, inode, lsm, LCK_PW, &policy, - &lockh, ast_flags); + if (sbi->ll_lco.lco_flags & OBD_CONNECT_TRUNCLOCK) { + lock_mode = LCK_NL; + obd_flags = OBD_FL_TRUNCLOCK; + CDEBUG(D_INODE, "delegating locking to the OST"); + } else { + lock_mode = LCK_PW; + obd_flags = 0; + } + + /* with lock_mode == LCK_NL no lock is taken. */ + rc = llu_extent_lock(NULL, inode, lsm, lock_mode, &policy, + &lockh, flags); if (rc != ELDLM_OK) { if (rc > 0) RETURN(-ENOLCK); RETURN(rc); } - rc = llu_vmtruncate(inode, attr->ia_size); + rc = llu_vmtruncate(inode, attr->ia_size, obd_flags); /* unlock now as we don't mind others file lockers racing with * the mds updates below? 
*/ - err = llu_extent_unlock(NULL, inode, lsm, LCK_PW, &lockh); + err = llu_extent_unlock(NULL, inode, lsm, lock_mode, &lockh); if (err) { CERROR("llu_extent_unlock failed: %d\n", err); if (!rc) @@ -1744,8 +1765,9 @@ llu_fsswop_mount(const char *source, obd_set_info(obd->obd_self_export, strlen("async"), "async", sizeof(async), &async); - ocd.ocd_connect_flags = OBD_CONNECT_IBITS; + ocd.ocd_connect_flags = OBD_CONNECT_IBITS|OBD_CONNECT_VERSION; ocd.ocd_ibits_known = MDS_INODELOCK_FULL; + ocd.ocd_version = LUSTRE_VERSION_CODE; /* setup mdc */ err = obd_connect(&mdc_conn, obd, &sbi->ll_sb_uuid, &ocd); @@ -1772,14 +1794,19 @@ llu_fsswop_mount(const char *source, obd_set_info(obd->obd_self_export, strlen("async"), "async", sizeof(async), &async); - ocd.ocd_connect_flags = OBD_CONNECT_SRVLOCK; + obd->obd_upcall.onu_owner = &sbi->ll_lco; + obd->obd_upcall.onu_upcall = ll_ocd_update; + + ocd.ocd_connect_flags = OBD_CONNECT_SRVLOCK|OBD_CONNECT_REQPORTAL| + OBD_CONNECT_VERSION|OBD_CONNECT_TRUNCLOCK; + ocd.ocd_version = LUSTRE_VERSION_CODE; err = obd_connect(&osc_conn, obd, &sbi->ll_sb_uuid, &ocd); if (err) { CERROR("cannot connect to %s: rc = %d\n", osc, err); GOTO(out_mdc, err); } sbi->ll_osc_exp = class_conn2export(&osc_conn); - sbi->ll_connect_flags = ocd.ocd_connect_flags; + sbi->ll_lco.lco_flags = ocd.ocd_connect_flags; mdc_init_ea_size(sbi->ll_mdc_exp, sbi->ll_osc_exp); diff --git a/lustre/liblustre/tests/echo_test.c b/lustre/liblustre/tests/echo_test.c index f1da0f5..f24f93f 100644 --- a/lustre/liblustre/tests/echo_test.c +++ b/lustre/liblustre/tests/echo_test.c @@ -32,7 +32,7 @@ #include "../lutil.h" -#ifdef CRAY_XT3 +#if CRAY_XT3 int _sysio_lustre_init(void) { /* diff --git a/lustre/liblustre/tests/sanity.c b/lustre/liblustre/tests/sanity.c index 5ee1d98..bf0021d 100644 --- a/lustre/liblustre/tests/sanity.c +++ b/lustre/liblustre/tests/sanity.c @@ -532,7 +532,7 @@ static int check_file_size(char *file, off_t size) { struct stat statbuf; - if(stat(file, &statbuf) != 
0) { + if (stat(file, &statbuf) != 0) { printf("Error stat(%s)\n", file); return(1); } @@ -948,6 +948,7 @@ int t50(char *name) ENTRY("4k aligned i/o sanity"); while (np <= _npages) { printf("%3d per xfer(total %d)...\t", np, _npages); + fflush(stdout); pages_io(np, offset); np += np; } @@ -990,12 +991,12 @@ int t51(char *name) int result; ENTRY("truncate() should truncate file to proper length"); - snprintf(file, MAX_PATH_LENGTH, "%s/test_t19_file", lustre_path); + snprintf(file, MAX_PATH_LENGTH, "%s/test_t51_file", lustre_path); for (size = 0; size < T51_NR * T51_STEP; size += T51_STEP) { t_echo_create(file, ""); if (truncate(file, size) != 0) { - printf("error truncating file: %s\n", strerror(errno)); + printf("\nerror truncating file: %s\n",strerror(errno)); return(-1); } result = check_file_size(file, size); @@ -1006,11 +1007,11 @@ int t51(char *name) t_echo_create(file, ""); fd = open(file, O_RDWR|O_CREAT, (mode_t)0666); if (fd < 0) { - printf("error open file: %s\n", strerror(errno)); + printf("\nerror open file: %s\n", strerror(errno)); return(-1); } if (ftruncate(fd, size) != 0) { - printf("error ftruncating file: %s\n", strerror(errno)); + printf("\nerror ftruncating file:%s\n",strerror(errno)); return(-1); } close(fd); @@ -1018,7 +1019,12 @@ int t51(char *name) if (result != 0) return result; t_unlink(file); + if (size % (T51_STEP * (T51_NR / 75)) == 0) { + printf("."); + fflush(stdout); + } } + printf("\n"); LEAVE(); } @@ -1112,7 +1118,11 @@ int main(int argc, char * const argv[]) __liblustre_setup_(); buf_size = _npages * PAGE_SIZE; - buf_alloc = malloc(buf_size); + buf_alloc = calloc(1, buf_size); + if (buf_alloc == NULL) { + fprintf(stderr, "error allocating %d\n", buf_size); + exit(-ENOMEM); + } for (test = testlist; test->test != NULL; test++) { int run = 1, i; diff --git a/lustre/llite/file.c b/lustre/llite/file.c index cdaca99..ab7b1aa 100644 --- a/lustre/llite/file.c +++ b/lustre/llite/file.c @@ -728,7 +728,7 @@ int ll_extent_lock(struct 
ll_file_data *fd, struct inode *inode, int rc; ENTRY; - LASSERT(lockh->cookie == 0); + LASSERT(!lustre_handle_is_used(lockh)); LASSERT(lsm != NULL); /* don't drop the mmapped file to LRU */ diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h index 49f4cd0..6e0593c 100644 --- a/lustre/llite/llite_internal.h +++ b/lustre/llite/llite_internal.h @@ -157,7 +157,7 @@ struct ll_sb_info { int ll_flags; struct list_head ll_conn_chain; /* per-conn chain of SBs */ - __u64 ll_connect_flags; + struct lustre_client_ocd ll_lco; struct hlist_head ll_orphan_dentry_list; /*please don't ask -p*/ struct ll_close_queue *ll_lcq; diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c index 024655f..173c7f4 100644 --- a/lustre/llite/llite_lib.c +++ b/lustre/llite/llite_lib.c @@ -30,6 +30,7 @@ #include #include +#include #include #include #include "llite_internal.h" @@ -83,6 +84,7 @@ struct ll_sb_info *lustre_init_sbi(struct super_block *sb) RETURN(NULL); spin_lock_init(&sbi->ll_lock); + spin_lock_init(&sbi->ll_lco.lco_lock); INIT_LIST_HEAD(&sbi->ll_pglist); sbi->ll_pglist_gen = 0; if (num_physpages >> (20 - PAGE_SHIFT) < 512) @@ -160,7 +162,7 @@ int lustre_common_fill_super(struct super_block *sb, char *mdc, char *osc) if (sb->s_flags & MS_RDONLY) data->ocd_connect_flags |= OBD_CONNECT_RDONLY; if (sbi->ll_flags & LL_SBI_USER_XATTR) - data->ocd_connect_flags |= OBD_CONNECT_USER_XATTR; + data->ocd_connect_flags |= OBD_CONNECT_XATTR; data->ocd_connect_flags |= OBD_CONNECT_ACL; if (sbi->ll_flags & LL_SBI_FLOCK) { @@ -169,6 +171,8 @@ int lustre_common_fill_super(struct super_block *sb, char *mdc, char *osc) sbi->ll_fop = &ll_file_operations; } + data->ocd_connect_flags |= OBD_CONNECT_VERSION; + data->ocd_version = LUSTRE_VERSION_CODE; err = obd_connect(&mdc_conn, obd, &sbi->ll_sb_uuid, data); if (err == -EBUSY) { CERROR("An MDS (mdc %s) is performing recovery, of which this" @@ -196,7 +200,7 @@ int lustre_common_fill_super(struct super_block *sb, char *mdc, 
char *osc) sbi->ll_namelen = osfs.os_namelen; if ((sbi->ll_flags & LL_SBI_USER_XATTR) && - !(data->ocd_connect_flags & OBD_CONNECT_USER_XATTR)) { + !(data->ocd_connect_flags & OBD_CONNECT_XATTR)) { LCONSOLE_INFO("Disabling user_xattr feature because " "it is not supported on the server\n"); sbi->ll_flags &= ~LL_SBI_USER_XATTR; @@ -226,6 +230,16 @@ int lustre_common_fill_super(struct super_block *sb, char *mdc, char *osc) GOTO(out_mdc, err); } + data->ocd_connect_flags = + OBD_CONNECT_GRANT|OBD_CONNECT_VERSION|OBD_CONNECT_REQPORTAL; + + CDEBUG(D_RPCTRACE, "ocd_connect_flags: "LPX64" ocd_version: %d " + "ocd_grant: %d\n", data->ocd_connect_flags, + data->ocd_version, data->ocd_grant); + + obd->obd_upcall.onu_owner = &sbi->ll_lco; + obd->obd_upcall.onu_upcall = ll_ocd_update; + err = obd_connect(&osc_conn, obd, &sbi->ll_sb_uuid, data); if (err == -EBUSY) { CERROR("An OST (osc %s) is performing recovery, of which this" @@ -237,7 +251,9 @@ int lustre_common_fill_super(struct super_block *sb, char *mdc, char *osc) GOTO(out_mdc, err); } sbi->ll_osc_exp = class_conn2export(&osc_conn); - sbi->ll_connect_flags = data->ocd_connect_flags; + spin_lock(&sbi->ll_lco.lco_lock); + sbi->ll_lco.lco_flags = data->ocd_connect_flags; + spin_unlock(&sbi->ll_lco.lco_lock); mdc_init_ea_size(sbi->ll_mdc_exp, sbi->ll_osc_exp); @@ -459,9 +475,9 @@ void ll_options(char *options, char **ost, char **mdc, int *flags) #endif { CDEBUG(D_SUPER, "this_char %s\n", this_char); - if (!*ost && (*ost = ll_read_opt("osc", this_char))) + if (!*ost && (*ost = ll_read_opt(LUSTRE_OSC_NAME, this_char))) continue; - if (!*mdc && (*mdc = ll_read_opt("mdc", this_char))) + if (!*mdc && (*mdc = ll_read_opt(LUSTRE_MDC_NAME, this_char))) continue; tmp = ll_set_opt("nolock", this_char, LL_SBI_NOLCK); if (tmp) { @@ -553,12 +569,31 @@ out: RETURN(err); } /* ll_read_super */ -int lustre_process_log(struct lustre_mount_data *lmd, char * profile, - struct config_llog_instance *cfg, int allow_recov) +static int do_lcfg(char 
*cfgname, lnet_nid_t nid, int cmd, + char *s1, char *s2) { - struct lustre_cfg *lcfg = NULL; struct lustre_cfg_bufs bufs; - char * peer = "MDS_PEER_UUID"; + struct lustre_cfg * lcfg = NULL; + int err; + + CDEBUG(D_TRACE, "lcfg %s %#x %s %s\n", cfgname, cmd, s1, s2); + + lustre_cfg_bufs_reset(&bufs, cfgname); + if (s1) + lustre_cfg_bufs_set_string(&bufs, 1, s1); + if (s2) + lustre_cfg_bufs_set_string(&bufs, 2, s2); + + lcfg = lustre_cfg_new(cmd, &bufs); + lcfg->lcfg_nid = nid; + err = class_process_config(lcfg); + lustre_cfg_free(lcfg); + return(err); +} + +static int lustre_process_log(struct lustre_mount_data *lmd, char * profile, + struct config_llog_instance *cfg) +{ struct obd_device *obd; struct lustre_handle mdc_conn = {0, }; struct obd_export *exp; @@ -567,7 +602,8 @@ int lustre_process_log(struct lustre_mount_data *lmd, char * profile, struct obd_uuid mdc_uuid; struct llog_ctxt *ctxt; struct obd_connect_data ocd = { 0 }; - int rc = 0; + lnet_nid_t nid; + int i, rc = 0, recov_bk = 1; int err; ENTRY; @@ -577,35 +613,18 @@ int lustre_process_log(struct lustre_mount_data *lmd, char * profile, lustre_generate_random_uuid(uuid); class_uuid_unparse(uuid, &mdc_uuid); CDEBUG(D_HA, "generated uuid: %s\n", mdc_uuid.uuid); - - lustre_cfg_bufs_reset(&bufs, name); - lustre_cfg_bufs_set_string(&bufs, 1, peer); - - lcfg = lustre_cfg_new(LCFG_ADD_UUID, &bufs); - lcfg->lcfg_nid = lmd->lmd_nid; - LASSERT(lcfg->lcfg_nid != LNET_NID_ANY); - rc = class_process_config(lcfg); - lustre_cfg_free(lcfg); + + nid = lmd->lmd_nid[0]; + LASSERT(nid != LNET_NID_ANY); + rc = do_lcfg(name, nid, LCFG_ADD_UUID, libcfs_nid2str(nid), 0); if (rc < 0) GOTO(out, rc); - lustre_cfg_bufs_reset(&bufs, name); - lustre_cfg_bufs_set_string(&bufs, 1, LUSTRE_MDC_NAME); - lustre_cfg_bufs_set_string(&bufs, 2, mdc_uuid.uuid); - - lcfg = lustre_cfg_new(LCFG_ATTACH, &bufs); - rc = class_process_config(lcfg); - lustre_cfg_free(lcfg); + rc = do_lcfg(name, 0, LCFG_ATTACH, LUSTRE_MDC_NAME, mdc_uuid.uuid); if (rc < 
0) GOTO(out_del_uuid, rc); - lustre_cfg_bufs_reset(&bufs, name); - lustre_cfg_bufs_set_string(&bufs, 1, lmd->lmd_mds); - lustre_cfg_bufs_set_string(&bufs, 2, peer); - - lcfg = lustre_cfg_new(LCFG_SETUP, &bufs); - rc = class_process_config(lcfg); - lustre_cfg_free(lcfg); + rc = do_lcfg(name, 0, LCFG_SETUP, lmd->lmd_mds, libcfs_nid2str(nid)); if (rc < 0) { LCONSOLE_ERROR("I couldn't establish a connection with the MDS." " Check that the MDS host NID is correct and the" @@ -617,10 +636,25 @@ int lustre_process_log(struct lustre_mount_data *lmd, char * profile, if (obd == NULL) GOTO(out_cleanup, rc = -EINVAL); - /* Disable initial recovery on this import */ + /* Add the redundant MDS nids */ + for (i = 1; i < lmd->lmd_nid_count; i++) { + nid = lmd->lmd_nid[i]; + rc = do_lcfg(name, nid, LCFG_ADD_UUID, libcfs_nid2str(nid), 0); + if (rc) { + CERROR("Add uuid for %s failed %d\n", + libcfs_nid2str(nid), rc); + continue; + } + rc = do_lcfg(name, 0, LCFG_ADD_CONN, libcfs_nid2str(nid), 0); + if (rc) + CERROR("Add conn for %s failed %d\n", + libcfs_nid2str(nid), rc); + } + + /* Try all connections, but only once. 
*/ rc = obd_set_info(obd->obd_self_export, - strlen("initial_recov"), "initial_recov", - sizeof(allow_recov), &allow_recov); + strlen("init_recov_bk"), "init_recov_bk", + sizeof(recov_bk), &recov_bk); if (rc) GOTO(out_cleanup, rc); @@ -665,30 +699,26 @@ int lustre_process_log(struct lustre_mount_data *lmd, char * profile, CERROR("obd_disconnect failed: rc = %d\n", err); out_cleanup: - lustre_cfg_bufs_reset(&bufs, name); - lcfg = lustre_cfg_new(LCFG_CLEANUP, &bufs); - err = class_process_config(lcfg); - lustre_cfg_free(lcfg); + err = do_lcfg(name, 0, LCFG_CLEANUP, 0, 0); if (err) CERROR("mdc_cleanup failed: rc = %d\n", err); out_detach: - lustre_cfg_bufs_reset(&bufs, name); - lcfg = lustre_cfg_new(LCFG_DETACH, &bufs); - err = class_process_config(lcfg); - lustre_cfg_free(lcfg); + err = do_lcfg(name, 0, LCFG_DETACH, 0, 0); if (err) CERROR("mdc_detach failed: rc = %d\n", err); out_del_uuid: - lustre_cfg_bufs_reset(&bufs, name); - lustre_cfg_bufs_set_string(&bufs, 1, peer); - lcfg = lustre_cfg_new(LCFG_DEL_UUID, &bufs); - err = class_process_config(lcfg); - lustre_cfg_free(lcfg); - if (err) - CERROR("del MDC UUID failed: rc = %d\n", err); - + /* class_add_uuid adds a nid even if the same uuid exists; we might + delete any copy here. So they all better match. 
*/ + for (i = 0; i < lmd->lmd_nid_count; i++) { + nid = lmd->lmd_nid[i]; + err = do_lcfg(name, nid, LCFG_DEL_UUID, libcfs_nid2str(nid), 0); + if (err) + CERROR("del MDC UUID %s failed: rc = %d\n", + libcfs_nid2str(nid), err); + } + /* class_import_put will get rid of the additional connections */ out: RETURN(rc); } @@ -700,7 +730,7 @@ static void lustre_manual_cleanup(struct ll_sb_info *sbi) while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, &next)) !=NULL) { class_manual_cleanup(obd); - } + } if (sbi->ll_lmd != NULL) class_del_profile(sbi->ll_lmd->lmd_profile); @@ -748,7 +778,7 @@ int lustre_fill_super(struct super_block *sb, void *data, int silent) cfg.cfg_instance = ll_instance; cfg.cfg_uuid = sbi->ll_sb_uuid; - err = lustre_process_log(lmd, lmd->lmd_profile, &cfg, 0); + err = lustre_process_log(lmd, lmd->lmd_profile, &cfg); if (err < 0) { CERROR("Unable to process log: %s\n", lmd->lmd_profile); GOTO(out_free, err); @@ -816,13 +846,13 @@ void lustre_put_super(struct super_block *sb) obd = class_exp2obd(sbi->ll_mdc_exp); if (obd) { int next = 0; - /* We need to set force before the lov_disconnect in + /* We need to set force before the lov_disconnect in lustre_common_put_super, since l_d cleans up osc's as well. 
*/ force = obd->obd_no_recov; - while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, &next)) + while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, &next)) !=NULL) { obd->obd_force = force; - } + } } lustre_common_put_super(sb); @@ -1034,14 +1064,14 @@ int ll_setattr_raw(struct inode *inode, struct iattr *attr) * above to avoid invoking vmtruncate, otherwise it is important * to call vmtruncate in inode_setattr to update inode->i_size * (bug 6196) */ - inode_setattr(inode, attr); + rc = inode_setattr(inode, attr); ll_update_inode(inode, &md); ptlrpc_req_finished(request); if (!lsm || !S_ISREG(inode->i_mode)) { CDEBUG(D_INODE, "no lsm: not setting attrs on OST\n"); - RETURN(0); + RETURN(rc); } } else { /* The OST doesn't check permissions, but the alternative is @@ -1063,7 +1093,7 @@ int ll_setattr_raw(struct inode *inode, struct iattr *attr) } /* Won't invoke vmtruncate, as we already cleared ATTR_SIZE */ - inode_setattr(inode, attr); + rc = inode_setattr(inode, attr); } /* We really need to get our PW lock before we change inode->i_size. diff --git a/lustre/llite/namei.c b/lustre/llite/namei.c index aaac23a..7e3a78c 100644 --- a/lustre/llite/namei.c +++ b/lustre/llite/namei.c @@ -37,6 +37,7 @@ /* methods */ +/* called from iget{4,5_locked}->find_inode() under inode_lock spinlock */ #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) static int ll_test_inode(struct inode *inode, unsigned long ino, void *opaque) #else @@ -760,8 +761,6 @@ int ll_objects_destroy(struct ptlrpc_request *request, struct inode *dir) int rc; ENTRY; - oti.oti_thread = request->rq_svc_thread; - /* req is swabbed so this is safe */ body = lustre_msg_buf(request->rq_repmsg, 0, sizeof(*body)); diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c index fcbd874..5675f36 100644 --- a/lustre/llite/rw.c +++ b/lustre/llite/rw.c @@ -103,7 +103,7 @@ static int ll_brw(int cmd, struct inode *inode, struct obdo *oa, /* this isn't where truncate starts. 
roughly: * sys_truncate->ll_setattr_raw->vmtruncate->ll_truncate. setattr_raw grabs - * DLM lock on [0, EOF], i_sem, ->lli_size_sem, and WRITE_I_ALLOC_SEM to + * DLM lock on [size, EOF], i_sem, ->lli_size_sem, and WRITE_I_ALLOC_SEM to * avoid races. * * must be called under ->lli_size_sem */ diff --git a/lustre/lov/lov_obd.c b/lustre/lov/lov_obd.c index 628dae2..0337fc9 100644 --- a/lustre/lov/lov_obd.c +++ b/lustre/lov/lov_obd.c @@ -55,7 +55,44 @@ #include "lov_internal.h" -/* obd methods */ + +/* FIXME add lov_get/putrefs around every access to lov->tgts for on-line non- + quiescent ost removal */ +/* Keep a refcount of lov->tgt usage to prevent racing with deletion */ +static void lov_getref(struct obd_device *obd) +{ + struct lov_obd *lov = &obd->u.lov; + + /* nobody gets through here until lov_putref is done */ + down(&lov->lov_lock); + atomic_inc(&lov->refcount); + up(&lov->lov_lock); + return; +} + +static void __lov_del_obd(struct obd_device *obd, struct lov_tgt_desc *tgt); + +static void lov_putref(struct obd_device *obd) +{ + struct lov_obd *lov = &obd->u.lov; + down(&lov->lov_lock); + /* ok to dec to 0 more than once -- ltd_exp's will be null */ + if (atomic_dec_and_test(&lov->refcount) && lov->death_row) { + struct lov_tgt_desc *tgt; + int i; + CDEBUG(D_CONFIG, "destroying %d lov targets\n", lov->death_row); + for (i = 0, tgt = lov->tgts; i < lov->desc.ld_tgt_count; + i++, tgt++) { + if (!tgt->reap) + continue; + /* Disconnect and delete from list */ + __lov_del_obd(obd, tgt); + lov->death_row--; + } + } + up(&lov->lov_lock); +} + #define MAX_STRING_SIZE 128 static int lov_connect_obd(struct obd_device *obd, struct lov_tgt_desc *tgt, int activate, struct obd_connect_data *data) @@ -124,6 +161,7 @@ static int lov_connect_obd(struct obd_device *obd, struct lov_tgt_desc *tgt, } tgt->active = 1; + tgt->reap = 0; lov->desc.ld_active_tgt_count++; #ifdef __KERNEL__ @@ -173,8 +211,8 @@ static int lov_connect(struct lustre_handle *conn, struct obd_device *obd, 
/* We don't want to actually do the underlying connections more than * once, so keep track. */ - lov->refcount++; - if (lov->refcount > 1) { + lov->connects++; + if (lov->connects > 1) { class_export_put(exp); RETURN(0); } @@ -265,16 +303,15 @@ static int lov_disconnect_obd(struct obd_device *obd, struct lov_tgt_desc *tgt) RETURN(0); } -static int -lov_del_obd(struct obd_device *obd, struct obd_uuid *uuidp, int index, int gen); +static int lov_del_obd(struct obd_device *obd, struct obd_uuid *uuidp, + int index, int gen); static int lov_disconnect(struct obd_export *exp) { struct obd_device *obd = class_exp2obd(exp); - struct obd_device *osc_obd; struct lov_obd *lov = &obd->u.lov; struct lov_tgt_desc *tgt; - int rc, i; + int i, rc; ENTRY; rc = class_disconnect(exp); @@ -283,24 +320,20 @@ static int lov_disconnect(struct obd_export *exp) RETURN(rc); /* Only disconnect the underlying layers on the final disconnect. */ - lov->refcount--; - if (lov->refcount != 0) + lov->connects--; + if (lov->connects != 0) RETURN(rc); + /* Let's hold another reference so lov_del_obd doesn't spin through + putref every time */ + lov_getref(obd); for (i = 0, tgt = lov->tgts; i < lov->desc.ld_tgt_count; i++, tgt++) { if (tgt->ltd_exp) { - osc_obd = class_exp2obd(tgt->ltd_exp); - /* Disconnect and delete from list */ + /* Disconnection is the last we know about an obd */ lov_del_obd(obd, &tgt->uuid, i, tgt->ltd_gen); - /* Cleanup the osc now - can't do it from - lov_cleanup because we just lost our only reference - to it. */ - /* Use lov's force/fail flags. 
*/ - osc_obd->obd_force = obd->obd_force; - osc_obd->obd_fail = obd->obd_fail; - class_manual_cleanup(osc_obd); } } + lov_putref(obd); RETURN(rc); } @@ -321,7 +354,6 @@ static int lov_set_osc_active(struct lov_obd *lov, struct obd_uuid *uuid, CDEBUG(D_INFO, "Searching in lov %p for uuid %s (activate=%d)\n", lov, uuid->uuid, activate); - spin_lock(&lov->lov_lock); for (i = 0, tgt = lov->tgts; i < lov->desc.ld_tgt_count; i++, tgt++) { if (tgt->ltd_exp == NULL) continue; @@ -352,17 +384,16 @@ static int lov_set_osc_active(struct lov_obd *lov, struct obd_uuid *uuid, EXIT; out: - spin_unlock(&lov->lov_lock); return rc; } static int lov_notify(struct obd_device *obd, struct obd_device *watched, - int active) + enum obd_notify_event ev) { int rc; struct obd_uuid *uuid; - if (strcmp(watched->obd_type->typ_name, "osc")) { + if (strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME)) { CERROR("unexpected notification of %s %s!\n", watched->obd_type->typ_name, watched->obd_name); @@ -370,19 +401,24 @@ static int lov_notify(struct obd_device *obd, struct obd_device *watched, } uuid = &watched->u.cli.cl_import->imp_target_uuid; - /* Set OSC as active before notifying the observer, so the - * observer can use the OSC normally. - */ - rc = lov_set_osc_active(&obd->u.lov, uuid, active); - if (rc) { - CERROR("%sactivation of %s failed: %d\n", - active ? "" : "de", uuid->uuid, rc); - RETURN(rc); + if (ev == OBD_NOTIFY_ACTIVE || ev == OBD_NOTIFY_INACTIVE) { + /* Set OSC as active before notifying the observer, so the + * observer can use the OSC normally. + */ + lov_getref(obd); + rc = lov_set_osc_active(&obd->u.lov, uuid, + ev == OBD_NOTIFY_ACTIVE); + lov_putref(obd); + if (rc) { + CERROR("%sactivation of %s failed: %d\n", + (ev == OBD_NOTIFY_ACTIVE) ? "" : "de", + uuid->uuid, rc); + RETURN(rc); + } } - if (obd->obd_observer) - /* Pass the notification up the chain. */ - rc = obd_notify(obd->obd_observer, watched, active); + /* Pass the notification up the chain. 
*/ + rc = obd_notify_observer(obd, watched, ev); RETURN(rc); } @@ -392,8 +428,9 @@ lov_add_obd(struct obd_device *obd, struct obd_uuid *uuidp, int index, int gen) { struct lov_obd *lov = &obd->u.lov; struct lov_tgt_desc *tgt; + obd_id params[2]; int rc, old_count; - __u32 bufsize; + __u32 bufsize, size = 2; ENTRY; CDEBUG(D_CONFIG, "uuid: %s idx: %d gen: %d\n", @@ -450,7 +487,7 @@ lov_add_obd(struct obd_device *obd, struct obd_uuid *uuidp, int index, int gen) CDEBUG(D_CONFIG, "idx=%d ltd_gen=%d ld_tgt_count=%d\n", index, tgt->ltd_gen, lov->desc.ld_tgt_count); - if (lov->refcount == 0) + if (lov->connects == 0) /* lov_connect hasn't been called yet. So we'll do the lov_connect_obd on this obd when that fn first runs. */ RETURN(0); @@ -472,6 +509,17 @@ lov_add_obd(struct obd_device *obd, struct obd_uuid *uuidp, int index, int gen) obd_llog_finish(obd->obd_observer, old_count); llog_cat_initialize(obd->obd_observer, lov->desc.ld_tgt_count); + params[0] = index; + rc = obd_get_info(tgt->ltd_exp, strlen("last_id"), "last_id", &size, + &params[1]); + if (rc) + GOTO(out, rc); + + rc = obd_set_info(obd->obd_observer->obd_self_export, + strlen("next_id"),"next_id", 2, params); + if (rc) + GOTO(out, rc); + rc = lov_notify(obd, tgt->ltd_exp->exp_obd, 1); GOTO(out, rc); out: @@ -480,6 +528,7 @@ lov_add_obd(struct obd_device *obd, struct obd_uuid *uuidp, int index, int gen) return rc; } +/* Schedule a target for deletion */ static int lov_del_obd(struct obd_device *obd, struct obd_uuid *uuidp, int index, int gen) { @@ -489,9 +538,6 @@ lov_del_obd(struct obd_device *obd, struct obd_uuid *uuidp, int index, int gen) int rc = 0; ENTRY; - CDEBUG(D_CONFIG, "uuid: %s idx: %d gen: %d\n", - uuidp->uuid, index, gen); - if (index >= count) { CERROR("LOV target index %d >= number of LOV OBDs %d.\n", index, count); @@ -511,6 +557,25 @@ lov_del_obd(struct obd_device *obd, struct obd_uuid *uuidp, int index, int gen) RETURN(-EINVAL); } + CDEBUG(D_CONFIG, "uuid: %s idx: %d gen: %d exp: %p active: 
%d\n", + tgt->uuid.uuid, index, tgt->ltd_gen, tgt->ltd_exp, tgt->active); + + lov_getref(obd); + tgt->reap = 1; + lov->death_row++; + /* we really delete it from lov_putref */ + lov_putref(obd); + + RETURN(rc); +} + +static void __lov_del_obd(struct obd_device *obd, struct lov_tgt_desc *tgt) +{ + struct obd_device *osc_obd; + + LASSERT(tgt->reap); + osc_obd = class_exp2obd(tgt->ltd_exp); + if (tgt->ltd_exp) lov_disconnect_obd(obd, tgt); @@ -521,10 +586,15 @@ lov_del_obd(struct obd_device *obd, struct obd_uuid *uuidp, int index, int gen) /* lt_gen = 0 will mean it will not match the gen of any valid loi */ memset(tgt, 0, sizeof(*tgt)); - CDEBUG(D_CONFIG, "uuid: %s idx: %d gen: %d exp: %p active: %d\n", - tgt->uuid.uuid, index, tgt->ltd_gen, tgt->ltd_exp, tgt->active); - - RETURN(rc); + /* Manual cleanup - no cleanup logs to clean up the osc's. We must + do it ourselves. And we can't do it from lov_cleanup, + because we just lost our only reference to it. */ + if (osc_obd) { + /* Use lov's force/fail flags. 
*/ + osc_obd->obd_force = obd->obd_force; + osc_obd->obd_fail = obd->obd_fail; + class_manual_cleanup(osc_obd); + } } static int lov_setup(struct obd_device *obd, obd_count len, void *buf) @@ -602,7 +672,8 @@ static int lov_setup(struct obd_device *obd, obd_count len, void *buf) desc->ld_active_tgt_count = 0; lov->desc = *desc; - spin_lock_init(&lov->lov_lock); + sema_init(&lov->lov_lock, 1); + atomic_set(&lov->refcount, 0); lprocfs_init_vars(lov, &lvars); lprocfs_obd_setup(obd, lvars.obd_vars); @@ -627,13 +698,23 @@ static int lov_precleanup(struct obd_device *obd, int stage) int rc = 0; ENTRY; - if (stage < 2) - RETURN(0); - - rc = obd_llog_finish(obd, 0); - if (rc != 0) - CERROR("failed to cleanup llogging subsystems\n"); - + switch (stage) { + case OBD_CLEANUP_EARLY: { + struct lov_obd *lov = &obd->u.lov; + int i; + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + if (!lov->tgts[i].active) + continue; + obd_precleanup(class_exp2obd(lov->tgts[i].ltd_exp), + OBD_CLEANUP_EARLY); + } + break; + } + case OBD_CLEANUP_SELF_EXP: + rc = obd_llog_finish(obd, 0); + if (rc != 0) + CERROR("failed to cleanup llogging subsystems\n"); + } RETURN(rc); } @@ -647,8 +728,12 @@ static int lov_cleanup(struct obd_device *obd) struct lov_tgt_desc *tgt; for (i = 0, tgt = lov->tgts; i < lov->desc.ld_tgt_count; i++, tgt++) { - if (!obd_uuid_empty(&tgt->uuid)) + /* We should never get here - these should have + been removed in the disconnect. 
*/ + if (!obd_uuid_empty(&tgt->uuid)) { + CERROR("lov tgt %d not cleaned!\n", i); lov_del_obd(obd, &tgt->uuid, i, 0); + } } OBD_FREE(lov->tgts, lov->bufsize); } @@ -990,7 +1075,7 @@ static int lov_setattr(struct obd_export *exp, struct obdo *src_oa, OBD_MD_FLUID | OBD_MD_FLGID | OBD_MD_FLINLINE | OBD_MD_FLFID | OBD_MD_FLGENER))); lov = &exp->exp_obd->u.lov; - rc = lov_prep_setattr_set(exp, src_oa, lsm, NULL, &set); + rc = lov_prep_setattr_set(exp, src_oa, lsm, oti, &set); if (rc) RETURN(rc); @@ -1787,7 +1872,7 @@ static int lov_statfs(struct obd_device *obd, struct obd_statfs *osfs, } else { #ifdef MIN_DF /* Sandia requested that df (and so, statfs) only - returned minimal available space on + returned minimal available space on a single OST, so people would be able to write this much data guaranteed. */ if (osfs->os_bavail > lov_sfs.os_bavail) { @@ -1939,12 +2024,14 @@ static int lov_get_info(struct obd_export *exp, __u32 keylen, { struct obd_device *obddev = class_exp2obd(exp); struct lov_obd *lov = &obddev->u.lov; - int i; + int i, rc; ENTRY; if (!vallen || !val) RETURN(-EFAULT); + lov_getref(obddev); + if (keylen > strlen("lock_to_stripe") && strcmp(key, "lock_to_stripe") == 0) { struct { @@ -1956,7 +2043,7 @@ static int lov_get_info(struct obd_export *exp, __u32 keylen, __u32 *stripe = val; if (*vallen < sizeof(*stripe)) - RETURN(-EFAULT); + GOTO(out, rc = -EFAULT); *vallen = sizeof(*stripe); /* XXX This is another one of those bits that will need to @@ -1971,46 +2058,35 @@ static int lov_get_info(struct obd_export *exp, __u32 keylen, if (lov->tgts[loi->loi_ost_idx].ltd_exp == data->lock->l_conn_export) { *stripe = i; - RETURN(0); + GOTO(out, rc = 0); } } LDLM_ERROR(data->lock, "lock on inode without such object\n"); dump_lsm(D_ERROR, data->lsm); - RETURN(-ENXIO); - } else if (keylen >= strlen("size_to_stripe") && - strcmp(key, "size_to_stripe") == 0) { - struct { - int stripe_number; - __u64 size; - struct lov_stripe_md *lsm; - } *data = val; - - if (*vallen 
< sizeof(*data)) - RETURN(-EFAULT); - - data->size = lov_size_to_stripe(data->lsm, data->size, - data->stripe_number); - RETURN(0); + GOTO(out, rc = -ENXIO); } else if (keylen >= strlen("last_id") && strcmp(key, "last_id") == 0) { obd_id *ids = val; - int rc, size = sizeof(obd_id); + int size = sizeof(obd_id); for (i = 0; i < lov->desc.ld_tgt_count; i++) { if (!lov->tgts[i].active) continue; rc = obd_get_info(lov->tgts[i].ltd_exp, keylen, key, &size, &(ids[i])); if (rc != 0) - RETURN(rc); + GOTO(out, rc); } - RETURN(0); + GOTO(out, rc = 0); } else if (keylen >= strlen("lovdesc") && strcmp(key, "lovdesc") == 0) { struct lov_desc *desc_ret = val; *desc_ret = lov->desc; - RETURN(0); + GOTO(out, rc = 0); } - RETURN(-EINVAL); + rc = -EINVAL; +out: + lov_putref(obddev); + RETURN(rc); } static int lov_set_info(struct obd_export *exp, obd_count keylen, @@ -2021,7 +2097,15 @@ static int lov_set_info(struct obd_export *exp, obd_count keylen, int i, rc = 0, err; ENTRY; - if (KEY_IS("checksum")) { + if (KEY_IS("next_id")) { + if (vallen != lov->desc.ld_tgt_count) + RETURN(-EINVAL); + vallen = sizeof(obd_id); + } + + lov_getref(obddev); + + if (KEY_IS("next_id") || KEY_IS("checksum")) { for (i = 0; i < lov->desc.ld_tgt_count; i++) { /* OST was disconnected */ if (!lov->tgts[i].ltd_exp) @@ -2033,7 +2117,7 @@ static int lov_set_info(struct obd_export *exp, obd_count keylen, if (!rc) rc = err; } - RETURN(rc); + GOTO(out, rc); } if (KEY_IS("evict_by_nid")) { @@ -2047,14 +2131,14 @@ static int lov_set_info(struct obd_export *exp, obd_count keylen, if (!rc) rc = err; } - RETURN(rc); + GOTO(out, rc); } if (KEY_IS("mds_conn") || KEY_IS("unlinked")) { if (vallen != 0) - RETURN(-EINVAL); + GOTO(out, rc = -EINVAL); } else { - RETURN(-EINVAL); + GOTO(out, rc = -EINVAL); } for (i = 0; i < lov->desc.ld_tgt_count; i++) { @@ -2073,6 +2157,8 @@ static int lov_set_info(struct obd_export *exp, obd_count keylen, if (!rc) rc = err; } +out: + lov_putref(obddev); RETURN(rc); } diff --git 
a/lustre/lov/lov_pack.c b/lustre/lov/lov_pack.c index a471eea..a7ca4f0 100644 --- a/lustre/lov/lov_pack.c +++ b/lustre/lov/lov_pack.c @@ -139,10 +139,10 @@ int lov_get_stripecnt(struct lov_obd *lov, int stripe_count) { if (!stripe_count) stripe_count = lov->desc.ld_default_stripe_count; - if (!stripe_count) - stripe_count = 1; if (stripe_count > lov->desc.ld_active_tgt_count) stripe_count = lov->desc.ld_active_tgt_count; + if (!stripe_count) + stripe_count = 1; /* for now, we limit the stripe count directly, when bug 4424 is * fixed this needs to be somewhat dynamic based on whether ext3 * can handle larger EA sizes. */ diff --git a/lustre/lov/lov_request.c b/lustre/lov/lov_request.c index 4547ee6..a1e82c4 100644 --- a/lustre/lov/lov_request.c +++ b/lustre/lov/lov_request.c @@ -202,7 +202,7 @@ static int enqueue_done(struct lov_request_set *set, __u32 mode) lov_lockhp = set->set_lockh->llh_handles + req->rq_stripe; LASSERT(lov_lockhp); - if (lov_lockhp->cookie == 0) + if (!lustre_handle_is_used(lov_lockhp)) continue; rc = obd_cancel(lov->tgts[req->rq_idx].ltd_exp, req->rq_md, @@ -458,7 +458,7 @@ int lov_prep_cancel_set(struct obd_export *exp, struct lov_stripe_md *lsm, struct lustre_handle *lov_lockhp; lov_lockhp = set->set_lockh->llh_handles + i; - if (lov_lockhp->cookie == 0) { + if (!lustre_handle_is_used(lov_lockhp)) { CDEBUG(D_HA, "lov idx %d subobj "LPX64" no lock?\n", loi->loi_ost_idx, loi->loi_id); continue; diff --git a/lustre/lvfs/fsfilt_ext3.c b/lustre/lvfs/fsfilt_ext3.c index a0c9b34..d338d89 100644 --- a/lustre/lvfs/fsfilt_ext3.c +++ b/lustre/lvfs/fsfilt_ext3.c @@ -448,6 +448,12 @@ static int fsfilt_ext3_iocontrol(struct inode * inode, struct file *file, int rc = 0; ENTRY; + /* FIXME: Can't do this because of nested transaction deadlock */ + if (cmd == EXT3_IOC_SETFLAGS && (*(int *)arg) & EXT3_JOURNAL_DATA_FL) { + CERROR("can't set data journal flag on file\n"); + RETURN(-EPERM); + } + if (inode->i_fop->ioctl) rc = inode->i_fop->ioctl(inode, file, cmd, 
arg); else @@ -461,7 +467,7 @@ static int fsfilt_ext3_set_md(struct inode *inode, void *handle, { int rc; - LASSERT(down_trylock(&inode->i_sem) != 0); + LASSERT_SEM_LOCKED(&inode->i_sem); if (EXT3_I(inode)->i_file_acl /* || large inode EA flag */) CWARN("setting EA on %lu/%u again... interesting\n", @@ -484,7 +490,7 @@ static int fsfilt_ext3_get_md(struct inode *inode, void *lmm, int lmm_size) { int rc; - LASSERT(down_trylock(&inode->i_sem) != 0); + LASSERT_SEM_LOCKED(&inode->i_sem); lock_24kernel(); rc = ext3_xattr_get(inode, EXT3_XATTR_INDEX_TRUSTED, @@ -740,6 +746,26 @@ static int ext3_ext_find_goal(struct inode *inode, struct ext3_ext_path *path, return bg_start + colour + block; } +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) +#include +static void ll_unmap_underlying_metadata(struct super_block *sb, + unsigned long blocknr) +{ + struct buffer_head *old_bh; + + old_bh = get_hash_table(sb->s_dev, blocknr, sb->s_blocksize); + if (old_bh) { + mark_buffer_clean(old_bh); + wait_on_buffer(old_bh); + clear_bit(BH_Req, &old_bh->b_state); + __brelse(old_bh); + } +} +#else +#define ll_unmap_underlying_metadata(sb, blocknr) \ + unmap_underlying_metadata((sb)->s_bdev, blocknr) +#endif + static int ext3_ext_new_extent_cb(struct ext3_extents_tree *tree, struct ext3_ext_path *path, struct ext3_ext_cache *cex) @@ -847,8 +873,6 @@ out: unlock_24kernel(); map: if (err >= 0) { - struct block_device *bdev = inode->i_sb->s_bdev; - /* map blocks */ if (bp->num == 0) { CERROR("hmm. why do we find this extent?\n"); @@ -871,10 +895,9 @@ map: } else { *(bp->created) = 1; /* unmap any possible underlying metadata from - * the block device mapping. bug 6998. - * This only compiles on 2.6, but there are - * no users of mballoc on 2.4. */ - unmap_underlying_metadata(bdev, *(bp->blocks)); + * the block device mapping. bug 6998. 
*/ + ll_unmap_underlying_metadata(inode->i_sb, + *(bp->blocks)); } bp->created++; bp->blocks++; @@ -961,7 +984,7 @@ int fsfilt_ext3_map_ext_inode_pages(struct inode *inode, struct page **page, cleanup: return rc; } -#endif +#endif /* EXT3_MULTIBLOCK_ALLOCATOR */ extern int ext3_map_inode_page(struct inode *inode, struct page *page, unsigned long *blocks, int *created, int create); @@ -1164,6 +1187,8 @@ static int fsfilt_ext3_setup(struct super_block *sb) set_opt(EXT3_SB(sb)->s_mount_opt, PDIROPS); sb->s_flags |= S_PDIROPS; #endif + if (!EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX)) + CWARN("filesystem doesn't have dir_index feature enabled\n"); return 0; } diff --git a/lustre/lvfs/lvfs_linux.c b/lustre/lvfs/lvfs_linux.c index afe58d9..0a0e8b3 100644 --- a/lustre/lvfs/lvfs_linux.c +++ b/lustre/lvfs/lvfs_linux.c @@ -469,6 +469,26 @@ EXPORT_SYMBOL(lvfs_set_rdonly); EXPORT_SYMBOL(lvfs_check_rdonly); EXPORT_SYMBOL(lvfs_clear_rdonly); +int lvfs_check_io_health(struct obd_device *obd, struct file *file) +{ + char *write_page = NULL; + loff_t offset = 0; + int rc = 0; + ENTRY; + + OBD_ALLOC(write_page, PAGE_SIZE); + if (!write_page) + RETURN(-ENOMEM); + + rc = fsfilt_write_record(obd, file, write_page, PAGE_SIZE, &offset, 1); + + OBD_FREE(write_page, PAGE_SIZE); + + CDEBUG(D_INFO, "write 1 page synchronously for checking io rc %d\n",rc); + RETURN(rc); +} +EXPORT_SYMBOL(lvfs_check_io_health); + static int __init lvfs_linux_init(void) { RETURN(0); diff --git a/lustre/mdc/mdc_request.c b/lustre/mdc/mdc_request.c index b7e3848..7fea16e 100644 --- a/lustre/mdc/mdc_request.c +++ b/lustre/mdc/mdc_request.c @@ -474,7 +474,7 @@ static void mdc_replay_open(struct ptlrpc_request *req) struct mdc_open_data *mod = req->rq_cb_data; struct obd_client_handle *och; struct ptlrpc_request *close_req; - struct lustre_handle old; + struct lustre_handle old; struct mds_body *body; ENTRY; @@ -490,14 +490,14 @@ static void mdc_replay_open(struct ptlrpc_request *req) och = 
mod->mod_och; if (och != NULL) { - struct lustre_handle *file_fh; + struct lustre_handle *file_fh; LASSERT(och->och_magic == OBD_CLIENT_HANDLE_MAGIC); file_fh = &och->och_fh; CDEBUG(D_HA, "updating handle from "LPX64" to "LPX64"\n", file_fh->cookie, body->handle.cookie); memcpy(&old, file_fh, sizeof(old)); memcpy(file_fh, &body->handle, sizeof(*file_fh)); - } + } close_req = mod->mod_close_req; if (close_req != NULL) { @@ -662,7 +662,7 @@ int mdc_close(struct obd_export *exp, struct obdo *oa, mod->mod_close_req = req; if (mod->mod_open_req->rq_type == LI_POISON) { /* FIXME This should be an ASSERT, but until we - figure out why it can be poisoned here, give + figure out why it can be poisoned here, give a reasonable return. bug 6155 */ CERROR("LBUG POISONED open %p!\n", mod->mod_open_req); ptlrpc_req_finished(req); @@ -866,12 +866,14 @@ out: return rc; } +#define INIT_RECOV_BACKUP "init_recov_bk" int mdc_set_info(struct obd_export *exp, obd_count keylen, void *key, obd_count vallen, void *val) { struct obd_import *imp = class_exp2cliimp(exp); int rc = -EINVAL; + /* Try to "recover" the initial connection; i.e. 
retry */ if (keylen == strlen("initial_recov") && memcmp(key, "initial_recov", strlen("initial_recov")) == 0) { if (vallen != sizeof(int)) @@ -881,6 +883,18 @@ int mdc_set_info(struct obd_export *exp, obd_count keylen, exp->exp_obd->obd_name, imp->imp_initial_recov); RETURN(0); } + /* Turn off initial_recov after we try all backup servers once */ + if (keylen == strlen(INIT_RECOV_BACKUP) && + memcmp(key, INIT_RECOV_BACKUP, strlen(INIT_RECOV_BACKUP)) == 0) { + if (vallen != sizeof(int)) + RETURN(-EINVAL); + imp->imp_initial_recov_bk = *(int *)val; + if (imp->imp_initial_recov_bk) + imp->imp_initial_recov = 1; + CDEBUG(D_HA, "%s: set imp_initial_recov_bk = %d\n", + exp->exp_obd->obd_name, imp->imp_initial_recov_bk); + RETURN(0); + } if (keylen == strlen("read-only") && memcmp(key, "read-only", strlen("read-only")) == 0) { struct ptlrpc_request *req; @@ -908,7 +922,7 @@ int mdc_set_info(struct obd_export *exp, obd_count keylen, ptlrpc_req_finished(req); RETURN(rc); } - + RETURN(rc); } @@ -1060,8 +1074,7 @@ int mdc_sync(struct obd_export *exp, struct ll_fid *fid, RETURN(rc); } -static int mdc_import_event(struct obd_device *obd, - struct obd_import *imp, +static int mdc_import_event(struct obd_device *obd, struct obd_import *imp, enum obd_import_event event) { int rc = 0; @@ -1073,8 +1086,7 @@ static int mdc_import_event(struct obd_device *obd, break; } case IMP_EVENT_INACTIVE: { - if (obd->obd_observer) - rc = obd_notify(obd->obd_observer, obd, 0); + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE); break; } case IMP_EVENT_INVALIDATE: { @@ -1085,12 +1097,14 @@ static int mdc_import_event(struct obd_device *obd, break; } case IMP_EVENT_ACTIVE: { - if (obd->obd_observer) - rc = obd_notify(obd->obd_observer, obd, 1); + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVE); break; } + case IMP_EVENT_OCD: + break; + default: - CERROR("Unknown import event %d\n", event); + CERROR("Unknown import event %x\n", event); LBUG(); } RETURN(rc); @@ -1177,8 +1191,8 @@ static 
int mdc_precleanup(struct obd_device *obd, int stage) { int rc = 0; ENTRY; - - if (stage < 2) + + if (stage < OBD_CLEANUP_SELF_EXP) RETURN(0); rc = obd_llog_finish(obd, 0); diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c index fc7d151..dae235d 100644 --- a/lustre/mds/handler.c +++ b/lustre/mds/handler.c @@ -57,6 +57,7 @@ #include #include #include +#include #include "mds_internal.h" @@ -299,9 +300,10 @@ static int mds_connect(struct lustre_handle *conn, struct obd_device *obd, data->ocd_connect_flags &= ~OBD_CONNECT_ACL; if (!obd->u.mds.mds_fl_user_xattr) - data->ocd_connect_flags &= ~OBD_CONNECT_USER_XATTR; + data->ocd_connect_flags &= ~OBD_CONNECT_XATTR; exp->exp_connect_flags = data->ocd_connect_flags; + data->ocd_version = LUSTRE_VERSION_CODE; exp->exp_mds_data.med_ibits_known = data->ocd_ibits_known; } @@ -336,7 +338,7 @@ out: RETURN(rc); } -static int mds_init_export(struct obd_export *exp) +static int mds_init_export(struct obd_export *exp) { struct mds_export_data *med = &exp->exp_mds_data; @@ -512,7 +514,7 @@ int mds_pack_md(struct obd_device *obd, struct lustre_msg *msg, int offset, inode->i_ino, lmm_size, mds->mds_max_mdsize); // RETURN(-EINVAL); } - + rc = mds_get_md(obd, inode, lmm, &lmm_size, lock); if (rc > 0) { if (S_ISDIR(inode->i_mode)) @@ -598,7 +600,7 @@ static int mds_getattr_internal(struct obd_device *obd, struct dentry *dentry, inode, 1); /* If we have LOV EA data, the OST holds size, atime, mtime */ - if (!(body->valid & OBD_MD_FLEASIZE) && + if (!(body->valid & OBD_MD_FLEASIZE) && !(body->valid & OBD_MD_FLDIREA)) body->valid |= (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS | OBD_MD_FLATIME | OBD_MD_FLMTIME); @@ -750,7 +752,7 @@ static int mds_getattr_name(int offset, struct ptlrpc_request *req, char *name; ENTRY; - LASSERT(!strcmp(obd->obd_type->typ_name, "mds")); + LASSERT(!strcmp(obd->obd_type->typ_name, LUSTRE_MDS_NAME)); /* Swab now, before anyone looks inside the request */ @@ -807,7 +809,7 @@ static int mds_getattr_name(int offset, 
struct ptlrpc_request *req, } #endif - if (child_lockh->cookie != 0) { + if (lustre_handle_is_used(child_lockh)) { LASSERT(lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT); resent_req = 1; } @@ -1434,7 +1436,7 @@ int mds_handle(struct ptlrpc_request *req) break; case MDS_GETATTR_NAME: { - struct lustre_handle lockh; + struct lustre_handle lockh = { 0 }; DEBUG_REQ(D_INODE, req, "getattr_name"); OBD_FAIL_RETURN(OBD_FAIL_MDS_GETATTR_NAME_NET, 0); @@ -1442,12 +1444,11 @@ int mds_handle(struct ptlrpc_request *req) * acquiring any new locks in mds_getattr_name, so we don't * want to cancel. */ - lockh.cookie = 0; rc = mds_getattr_name(MDS_REQ_REC_OFF, req, MDS_INODELOCK_UPDATE, &lockh); /* this non-intent call (from an ioctl) is special */ req->rq_status = rc; - if (rc == 0 && lockh.cookie) + if (rc == 0 && lustre_handle_is_used(&lockh)) ldlm_lock_decref(&lockh, LCK_CR); break; } @@ -1764,7 +1765,7 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf) CDEBUG(D_SUPER, "%s: mnt = %p\n", lustre_cfg_string(lcfg, 1), mnt); LASSERT(!lvfs_check_rdonly(lvfs_sbdev(mnt->mnt_sb))); - + sema_init(&mds->mds_orphan_recovery_sem, 1); sema_init(&mds->mds_epoch_sem, 1); spin_lock_init(&mds->mds_transno_lock); @@ -1820,8 +1821,8 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf) GOTO(err_qctxt, rc); } - /* Wait for mds_postrecov trying to clear orphans until 9439 is fixed */ - obd->obd_async_recov = 0; + /* Don't wait for mds_postrecov trying to clear orphans */ + obd->obd_async_recov = 1; rc = mds_postsetup(obd); if (rc) GOTO(err_qctxt, rc); @@ -1840,7 +1841,7 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf) obd->obd_name, lustre_cfg_string(lcfg, 1), obd->obd_recoverable_clients, - (obd->obd_recoverable_clients == 1) + (obd->obd_recoverable_clients == 1) ? 
"client" : "clients", (int)(OBD_RECOVERY_TIMEOUT / HZ) / 60, (int)(OBD_RECOVERY_TIMEOUT / HZ) % 60, @@ -1877,6 +1878,36 @@ err_ops: return rc; } +static int mds_lov_clean(struct obd_device *obd) +{ + struct mds_obd *mds = &obd->u.mds; + struct obd_device *osc = mds->mds_osc_obd; + ENTRY; + + if (mds->mds_profile) { + class_del_profile(mds->mds_profile); + OBD_FREE(mds->mds_profile, strlen(mds->mds_profile) + 1); + mds->mds_profile = NULL; + } + + /* There better be a lov */ + if (!osc) + RETURN(0); + + obd_register_observer(osc, NULL); + + /* Give lov our same shutdown flags */ + osc->obd_force = obd->obd_force; + osc->obd_fail = obd->obd_fail; + + /* Cleanup the lov */ + obd_disconnect(mds->mds_osc_exp); + class_manual_cleanup(osc); + mds->mds_osc_exp = NULL; + + RETURN(0); +} + static int mds_postsetup(struct obd_device *obd) { struct mds_obd *mds = &obd->u.mds; @@ -1940,7 +1971,7 @@ int mds_postrecov(struct obd_device *obd) int rc, item = 0; ENTRY; - if (obd->obd_fail) + if (obd->obd_fail) RETURN(0); LASSERT(!obd->obd_recovering); @@ -1964,34 +1995,17 @@ out: RETURN(rc < 0 ? 
rc : item); } -int mds_lov_clean(struct obd_device *obd) +/* We need to be able to stop an mds_lov_synchronize */ +static int mds_lov_early_clean(struct obd_device *obd) { struct mds_obd *mds = &obd->u.mds; struct obd_device *osc = mds->mds_osc_obd; - ENTRY; - - if (mds->mds_profile) { - class_del_profile(mds->mds_profile); - OBD_FREE(mds->mds_profile, strlen(mds->mds_profile) + 1); - mds->mds_profile = NULL; - } - - /* There better be a lov */ - if (!osc) - RETURN(0); - obd_register_observer(osc, NULL); + if (!osc || (!obd->obd_force && !obd->obd_fail)) + return(0); - /* Give lov our same shutdown flags */ - osc->obd_force = obd->obd_force; - osc->obd_fail = obd->obd_fail; - - /* Cleanup the lov */ - obd_disconnect(mds->mds_osc_exp); - class_manual_cleanup(osc); - mds->mds_osc_exp = NULL; - - RETURN(0); + CDEBUG(D_HA, "abort inflight\n"); + return (obd_precleanup(osc, OBD_CLEANUP_EARLY)); } static int mds_precleanup(struct obd_device *obd, int stage) @@ -2000,11 +2014,11 @@ static int mds_precleanup(struct obd_device *obd, int stage) ENTRY; switch (stage) { - case 1: - mds_lov_set_cleanup_flags(obd); + case OBD_CLEANUP_EXPORTS: target_cleanup_recovery(obd); + mds_lov_early_clean(obd); break; - case 2: + case OBD_CLEANUP_SELF_EXP: mds_lov_disconnect(obd); mds_lov_clean(obd); llog_cleanup(llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT)); @@ -2119,7 +2133,7 @@ static void fixup_handle_for_resent_req(struct ptlrpc_request *req, int offset, /* If the xid matches, then we know this is a resent request, * and allow it. 
(It's probably an OPEN, for which we don't * send a lock */ - if (req->rq_xid == + if (req->rq_xid == le64_to_cpu(exp->exp_mds_data.med_mcd->mcd_last_xid)) return; @@ -2216,7 +2230,7 @@ static int mds_intent_policy(struct ldlm_namespace *ns, RETURN(ELDLM_LOCK_ABORTED); if (intent_disposition(rep, DISP_LOOKUP_NEG) && !intent_disposition(rep, DISP_OPEN_OPEN)) -#endif +#endif RETURN(ELDLM_LOCK_ABORTED); break; case IT_LOOKUP: @@ -2238,7 +2252,6 @@ static int mds_intent_policy(struct ldlm_namespace *ns, rep->lock_policy_res2 = mds_getattr_name(offset, req, getattr_part, &lockh); - /* FIXME: LDLM can set req->rq_status. MDS sets policy_res{1,2} with disposition and status. - replay: returns 0 & req->status is old status @@ -2333,8 +2346,8 @@ static int mdt_setup(struct obd_device *obd, obd_count len, void *buf) ptlrpc_init_svc(MDS_NBUFS, MDS_BUFSIZE, MDS_MAXREQSIZE, MDS_MAXREPSIZE, MDS_REQUEST_PORTAL, MDC_REPLY_PORTAL, MDS_SERVICE_WATCHDOG_TIMEOUT, - mds_handle, "mds", obd->obd_proc_entry, NULL, - MDT_NUM_THREADS); + mds_handle, LUSTRE_MDS_NAME, + obd->obd_proc_entry, NULL, MDT_NUM_THREADS); if (!mds->mds_service) { CERROR("failed to start service\n"); @@ -2417,7 +2430,7 @@ static int mdt_health_check(struct obd_device *obd) { struct mds_obd *mds = &obd->u.mds; int rc = 0; - + down(&mds->mds_health_sem); rc |= ptlrpc_service_health_check(mds->mds_readpage_service); rc |= ptlrpc_service_health_check(mds->mds_setattr_service); @@ -2430,7 +2443,7 @@ static int mdt_health_check(struct obd_device *obd) */ if(rc != 0) rc = 1; - + return rc; } @@ -2444,6 +2457,20 @@ static struct dentry *mds_lvfs_fid2dentry(__u64 id, __u32 gen, __u64 gr, fid.generation = gen; return mds_fid2dentry(&obd->u.mds, &fid, NULL); } +static int mds_health_check(struct obd_device *obd) +{ + struct obd_device_target *odt = &obd->u.obt; + struct mds_obd *mds = &obd->u.mds; + int rc = 0; + + if (odt->obt_sb->s_flags & MS_RDONLY) + rc = 1; + + LASSERT(mds->mds_health_check_filp != NULL); + rc |= 
!!lvfs_check_io_health(obd, mds->mds_health_check_filp); + + return rc; +} struct lvfs_callback_ops mds_lvfs_ops = { l_fid2dentry: mds_lvfs_fid2dentry, @@ -2467,6 +2494,7 @@ static struct obd_ops mds_obd_ops = { .o_llog_init = mds_llog_init, .o_llog_finish = mds_llog_finish, .o_notify = mds_notify, + .o_health_check = mds_health_check, }; static struct obd_ops mdt_obd_ops = { diff --git a/lustre/mds/mds_fs.c b/lustre/mds/mds_fs.c index ba85c94..23efc66 100644 --- a/lustre/mds/mds_fs.c +++ b/lustre/mds/mds_fs.c @@ -48,6 +48,8 @@ #include "mds_internal.h" +#define HEALTH_CHECK "health_check" + /* Add client data to the MDS. We use a bitmap to locate a free space * in the last_rcvd file if cl_off is -1 (i.e. a new client). * Otherwise, we have just read the data from the last_rcvd file and @@ -506,11 +508,32 @@ int mds_fs_setup(struct obd_device *obd, struct vfsmount *mnt) file->f_dentry->d_inode->i_mode); GOTO(err_lov_objid, rc = -ENOENT); } + + /* open and test the check io file junk */ + file = filp_open(HEALTH_CHECK, O_RDWR | O_CREAT, 0644); + if (IS_ERR(file)) { + rc = PTR_ERR(file); + CERROR("cannot open/create %s file: rc = %d\n", HEALTH_CHECK, rc); + GOTO(err_lov_objid, rc = PTR_ERR(file)); + } + mds->mds_health_check_filp = file; + if (!S_ISREG(file->f_dentry->d_inode->i_mode)) { + CERROR("%s is not a regular file!: mode = %o\n", HEALTH_CHECK, + file->f_dentry->d_inode->i_mode); + GOTO(err_health_check, rc = -ENOENT); + } + rc = lvfs_check_io_health(obd, file); + if (rc) + GOTO(err_health_check, rc); err_pop: pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); return rc; +err_health_check: + if (mds->mds_health_check_filp && + filp_close(mds->mds_health_check_filp, 0)) + CERROR("can't close %s after error\n", HEALTH_CHECK); err_lov_objid: if (mds->mds_lov_objid_filp && filp_close(mds->mds_lov_objid_filp, 0)) CERROR("can't close %s after error\n", LOV_OBJID); @@ -538,8 +561,8 @@ int mds_fs_cleanup(struct obd_device *obd) int rc = 0; if (obd->obd_fail) - CERROR("%s: 
shutting down for failover; client state will" - " be preserved.\n", obd->obd_name); + CWARN("%s: shutting down for failover; client state will " + "be preserved.\n", obd->obd_name); class_disconnect_exports(obd); /* cleans up client info too */ mds_server_free_data(mds); @@ -557,6 +580,12 @@ int mds_fs_cleanup(struct obd_device *obd) if (rc) CERROR("%s file won't close, rc=%d\n", LOV_OBJID, rc); } + if (mds->mds_health_check_filp) { + rc = filp_close(mds->mds_health_check_filp, 0); + mds->mds_health_check_filp = NULL; + if (rc) + CERROR("%s file won't close, rc=%d\n", HEALTH_CHECK, rc); + } if (mds->mds_objects_dir != NULL) { l_dput(mds->mds_objects_dir); mds->mds_objects_dir = NULL; diff --git a/lustre/mds/mds_internal.h b/lustre/mds/mds_internal.h index a7fccd3..c625236 100644 --- a/lustre/mds/mds_internal.h +++ b/lustre/mds/mds_internal.h @@ -189,7 +189,6 @@ int mds_llog_finish(struct obd_device *obd, int count); /* mds/mds_lov.c */ int mds_lov_connect(struct obd_device *obd, char * lov_name); int mds_lov_disconnect(struct obd_device *obd); -void mds_lov_set_cleanup_flags(struct obd_device *); int mds_lov_write_objids(struct obd_device *obd); void mds_lov_update_objids(struct obd_device *obd, obd_id *ids); int mds_lov_clear_orphans(struct mds_obd *mds, struct obd_uuid *ost_uuid); @@ -197,7 +196,8 @@ int mds_lov_set_nextid(struct obd_device *obd); int mds_lov_start_synchronize(struct obd_device *obd, struct obd_uuid *uuid, int nonblock); int mds_post_mds_lovconf(struct obd_device *obd); -int mds_notify(struct obd_device *obd, struct obd_device *watched, int active); +int mds_notify(struct obd_device *obd, struct obd_device *watched, + enum obd_notify_event ev); int mds_convert_lov_ea(struct obd_device *obd, struct inode *inode, struct lov_mds_md *lmm, int lmm_size); void mds_objids_from_lmm(obd_id *ids, struct lov_mds_md *lmm, @@ -226,7 +226,6 @@ int mds_obd_destroy(struct obd_export *exp, struct obdo *oa, /* mds/handler.c */ extern struct lvfs_callback_ops 
mds_lvfs_ops; -int mds_lov_clean(struct obd_device *obd); extern int mds_iocontrol(unsigned int cmd, struct obd_export *exp, int len, void *karg, void *uarg); int mds_postrecov(struct obd_device *obd); diff --git a/lustre/mds/mds_lib.c b/lustre/mds/mds_lib.c index 2047fdb..4bc0f1b 100644 --- a/lustre/mds/mds_lib.c +++ b/lustre/mds/mds_lib.c @@ -365,12 +365,17 @@ int mds_init_ucred(struct lvfs_ucred *ucred, struct ptlrpc_request *req, LASSERT(body != NULL); /* previously verified & swabbed by caller */ #if CRAY_XT3 - ucred->luc_fsuid = req->rq_uid; -#else - ucred->luc_fsuid = body->fsuid; - ucred->luc_fsgid = body->fsgid; - ucred->luc_cap = body->capability; + if (req->rq_uid != LNET_UID_ANY) { + /* Non-root local cluster client */ + LASSERT (req->rq_uid != 0); + ucred->luc_fsuid = req->rq_uid; + } else #endif + { + ucred->luc_fsuid = body->fsuid; + ucred->luc_fsgid = body->fsgid; + ucred->luc_cap = body->capability; + } ucred->luc_uce = upcall_cache_get_entry(mds->mds_group_hash, ucred->luc_fsuid, diff --git a/lustre/mds/mds_lov.c b/lustre/mds/mds_lov.c index 900ee04..103807c 100644 --- a/lustre/mds/mds_lov.c +++ b/lustre/mds/mds_lov.c @@ -258,34 +258,6 @@ int mds_lov_disconnect(struct obd_device *obd) RETURN(rc); } -/* for consistency, let's make the lov and the lov's - * osc's see the same cleanup flags as our mds */ -void mds_lov_set_cleanup_flags(struct obd_device *obd) -{ - struct mds_obd *mds = &obd->u.mds; - struct lov_obd *lov; - - if (IS_ERR(mds->mds_osc_obd) || (mds->mds_osc_exp == NULL)) - return; - - lov = &mds->mds_osc_obd->u.lov; - mds->mds_osc_obd->obd_force = obd->obd_force; - mds->mds_osc_obd->obd_fail = obd->obd_fail; - if (lov->tgts) { - struct obd_export *osc_exp; - int i; - spin_lock(&lov->lov_lock); - for (i = 0; i < lov->desc.ld_tgt_count; i++) { - if (lov->tgts[i].ltd_exp != NULL) { - osc_exp = lov->tgts[i].ltd_exp; - osc_exp->exp_obd->obd_force = obd->obd_force; - osc_exp->exp_obd->obd_fail = obd->obd_fail; - } - } - 
spin_unlock(&lov->lov_lock); - } -} - int mds_iocontrol(unsigned int cmd, struct obd_export *exp, int len, void *karg, void *uarg) { @@ -517,7 +489,10 @@ static int __mds_lov_syncronize(void *data) CWARN("MDS %s: %s now active, resetting orphans\n", obd->obd_name, uuid ? (char *)uuid->uuid : "All OSC's"); - + + if (obd->obd_stopping) + GOTO(out, rc = -ENODEV); + rc = mds_lov_clear_orphans(&obd->u.mds, uuid); if (rc != 0) { CERROR("%s: failed at mds_lov_clear_orphans: %d\n", @@ -526,7 +501,7 @@ static int __mds_lov_syncronize(void *data) } out: - class_export_put(obd->obd_self_export); + class_decref(obd); RETURN(rc); } @@ -560,9 +535,16 @@ int mds_lov_start_synchronize(struct obd_device *obd, struct obd_uuid *uuid, mlsi->mlsi_obd = obd; mlsi->mlsi_uuid = uuid; - - /* We need to lock the mds in place for our new thread context. */ - class_export_get(obd->obd_self_export); + + /* Although class_export_get(obd->obd_self_export) would lock + the MDS in place, since it's only a self-export + it doesn't lock the LOV in place. The LOV can be disconnected + during MDS precleanup, leaving nothing for __mds_lov_syncronize. + Simply taking an export ref on the LOV doesn't help, because it's + still disconnected. Taking an obd reference insures that we don't + disconnect the LOV. This of course means a cleanup won't + finish for as long as the sync is blocking. 
*/ + atomic_inc(&obd->obd_refcount); if (nonblock) { /* Syncronize in the background */ @@ -583,16 +565,17 @@ int mds_lov_start_synchronize(struct obd_device *obd, struct obd_uuid *uuid, RETURN(rc); } -int mds_notify(struct obd_device *obd, struct obd_device *watched, int active) +int mds_notify(struct obd_device *obd, struct obd_device *watched, + enum obd_notify_event ev) { struct obd_uuid *uuid; int rc = 0; ENTRY; - if (!active) + if (ev != OBD_NOTIFY_ACTIVE) RETURN(0); - if (strcmp(watched->obd_type->typ_name, "osc")) { + if (strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME)) { CERROR("unexpected notification of %s %s!\n", watched->obd_type->typ_name, watched->obd_name); RETURN(-EINVAL); diff --git a/lustre/mds/mds_open.c b/lustre/mds/mds_open.c index f6fa43b..b8d1431 100644 --- a/lustre/mds/mds_open.c +++ b/lustre/mds/mds_open.c @@ -320,22 +320,22 @@ static int mds_create_objects(struct ptlrpc_request *req, int offset, void *lmm_buf; ENTRY; + if (!S_ISREG(inode->i_mode)) + RETURN(0); if (rec->ur_flags & MDS_OPEN_DELAY_CREATE || !(rec->ur_flags & FMODE_WRITE)) RETURN(0); body = lustre_msg_buf(req->rq_repmsg, 1, sizeof(*body)); - if (!S_ISREG(inode->i_mode)) - RETURN(0); if (body->valid & OBD_MD_FLEASIZE) RETURN(0); OBD_ALLOC(*ids, mds->mds_lov_desc.ld_tgt_count * sizeof(**ids)); if (*ids == NULL) RETURN(-ENOMEM); + oti_init(&oti, req); oti.oti_objid = *ids; - oti.oti_thread = req->rq_svc_thread; /* replay case */ if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) { @@ -1052,10 +1052,6 @@ int mds_open(struct mds_update_record *rec, int offset, (acc_mode & MAY_WRITE)) GOTO(cleanup, rc = -EROFS); - /* Can't write to a read-only file */ - if (IS_RDONLY(dchild->d_inode) && (acc_mode & MAY_WRITE) != 0) - GOTO(cleanup, rc = -EPERM); - /* An append-only file must be opened in append mode for * writing */ if (IS_APPEND(dchild->d_inode) && (acc_mode & MAY_WRITE) != 0 && @@ -1130,7 +1126,7 @@ int mds_open(struct mds_update_record *rec, int offset, else 
ptlrpc_save_lock (req, &parent_lockh, parent_mode); } - + /* trigger dqacq on the owner of child and parent */ lquota_adjust(quota_interface, obd, qcids, qpids, rc, FSFILT_OP_CREATE); RETURN(rc); @@ -1343,7 +1339,7 @@ int mds_close(struct ptlrpc_request *req, int offset) if (rc) { CERROR("lustre_pack_reply: rc = %d\n", rc); req->rq_status = rc; - /* Continue on to drop local open count even if we can't send the reply */ + /* continue on to drop local open even if we can't send reply */ } else { MDS_CHECK_RESENT(req, mds_reconstruct_generic(req)); } diff --git a/lustre/mds/mds_reint.c b/lustre/mds/mds_reint.c index a06e886..d7282ee 100644 --- a/lustre/mds/mds_reint.c +++ b/lustre/mds/mds_reint.c @@ -706,7 +706,8 @@ static int mds_reint_create(struct mds_update_record *rec, int offset, ENTRY; LASSERT(offset == MDS_REQ_REC_OFF); - LASSERT(!strcmp(req->rq_export->exp_obd->obd_type->typ_name, "mds")); + LASSERT(!strcmp(req->rq_export->exp_obd->obd_type->typ_name, + LUSTRE_MDS_NAME)); DEBUG_REQ(D_INODE, req, "parent "LPU64"/%u name %s mode %o", rec->ur_fid1->id, rec->ur_fid1->generation, @@ -2174,7 +2175,13 @@ int mds_reint_rec(struct mds_update_record *rec, int offset, ENTRY; #if CRAY_XT3 - rec->ur_uc.luc_fsuid = req->rq_uid; + if (req->rq_uid != LNET_UID_ANY) { + /* non-root local cluster client + * NB root's creds are believed... 
*/ + LASSERT (req->rq_uid != 0); + rec->ur_uc.luc_fsuid = req->rq_uid; + rec->ur_uc.luc_cap = 0; + } #endif /* get group info of this user */ diff --git a/lustre/mds/mds_xattr.c b/lustre/mds/mds_xattr.c index aa12812..668ed50 100644 --- a/lustre/mds/mds_xattr.c +++ b/lustre/mds/mds_xattr.c @@ -62,8 +62,7 @@ static int mds_getxattr_pack_msg(struct ptlrpc_request *req, return -EFAULT; } - if (!(req->rq_export->exp_connect_flags & - OBD_CONNECT_USER_XATTR) && + if (!(req->rq_export->exp_connect_flags & OBD_CONNECT_XATTR) && (strncmp(xattr_name, "user.", 5) == 0)) return -EOPNOTSUPP; @@ -255,7 +254,7 @@ int mds_setxattr_internal(struct ptlrpc_request *req, struct mds_body *body) GOTO(out_dput, rc = -EACCES); } - if (!(req->rq_export->exp_connect_flags & OBD_CONNECT_USER_XATTR) && + if (!(req->rq_export->exp_connect_flags & OBD_CONNECT_XATTR) && (strncmp(xattr_name, "user.", 5) == 0)) { GOTO(out_dput, rc = -EOPNOTSUPP); } diff --git a/lustre/obdclass/class_obd.c b/lustre/obdclass/class_obd.c index baeba10..2ac6c5d 100644 --- a/lustre/obdclass/class_obd.c +++ b/lustre/obdclass/class_obd.c @@ -321,9 +321,8 @@ int class_handle_ioctl(unsigned int cmd, unsigned long arg) CERROR("Device %d not attached\n", obd->obd_minor); GOTO(out, err = -ENODEV); } - CDEBUG(D_IOCTL, - "disabling committed-transno notifications on %d\n", - obd->obd_minor); + CDEBUG(D_HA, "%s: disabling committed-transno notification\n", + obd->obd_name); obd->obd_no_transno = 1; GOTO(out, err = 0); } @@ -418,6 +417,7 @@ EXPORT_SYMBOL(class_handle_unhash); EXPORT_SYMBOL(class_handle2object); /* config.c */ +EXPORT_SYMBOL(class_decref); EXPORT_SYMBOL(class_get_profile); EXPORT_SYMBOL(class_del_profile); EXPORT_SYMBOL(class_process_config); diff --git a/lustre/obdclass/genops.c b/lustre/obdclass/genops.c index 2040cd8..cb1eb32 100644 --- a/lustre/obdclass/genops.c +++ b/lustre/obdclass/genops.c @@ -1218,8 +1218,8 @@ search_again: for (i = 0; i < num_to_evict; i++) { exports_evicted++; - CERROR("evicting NID 
'%s' (%s) #%d at adminstrative request\n", - nid, doomed_exp[i]->exp_client_uuid.uuid, + CERROR("%s: evict NID '%s' (%s) #%d at adminstrative request\n", + obd->obd_name, nid, doomed_exp[i]->exp_client_uuid.uuid, exports_evicted); class_fail_export(doomed_exp[i]); class_export_put(doomed_exp[i]); @@ -1230,7 +1230,8 @@ search_again: } if (!exports_evicted) - CERROR("can't disconnect NID '%s': no exports found\n", nid); + CERROR("%s: can't disconnect NID '%s': no exports found\n", + obd->obd_name, nid); return exports_evicted; } EXPORT_SYMBOL(obd_export_evict_by_nid); @@ -1257,10 +1258,11 @@ int obd_export_evict_by_uuid(struct obd_device *obd, char *uuid) spin_unlock(&obd->obd_dev_lock); if (doomed_exp == NULL) { - CERROR("can't disconnect %s: no exports found\n", uuid); + CERROR("%s: can't disconnect %s: no exports found\n", + obd->obd_name, uuid); } else { - CERROR("evicting %s at adminstrative request\n", - doomed_exp->exp_client_uuid.uuid); + CERROR("%s: evicting %s at adminstrative request\n", + obd->obd_name, doomed_exp->exp_client_uuid.uuid); class_fail_export(doomed_exp); class_export_put(doomed_exp); exports_evicted++; diff --git a/lustre/obdclass/lprocfs_status.c b/lustre/obdclass/lprocfs_status.c index 9e10b18..dc0e8fb 100644 --- a/lustre/obdclass/lprocfs_status.c +++ b/lustre/obdclass/lprocfs_status.c @@ -630,6 +630,7 @@ int lprocfs_alloc_obd_stats(struct obd_device *obd, unsigned num_private_stats) LPROCFS_OBD_OP_INIT(num_private_stats, stats, add_conn); LPROCFS_OBD_OP_INIT(num_private_stats, stats, del_conn); LPROCFS_OBD_OP_INIT(num_private_stats, stats, connect); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, reconnect); LPROCFS_OBD_OP_INIT(num_private_stats, stats, disconnect); LPROCFS_OBD_OP_INIT(num_private_stats, stats, statfs); LPROCFS_OBD_OP_INIT(num_private_stats, stats, packmd); diff --git a/lustre/obdclass/lustre_handles.c b/lustre/obdclass/lustre_handles.c index f3626b2..ef7639d 100644 --- a/lustre/obdclass/lustre_handles.c +++ 
b/lustre/obdclass/lustre_handles.c @@ -27,9 +27,9 @@ #ifdef __KERNEL__ # include # include -#else +#else # include -#endif +#endif #include #include @@ -43,6 +43,10 @@ static int handle_count = 0; #define HANDLE_HASH_SIZE (1 << 14) #define HANDLE_HASH_MASK (HANDLE_HASH_SIZE - 1) +/* + * Generate a unique 64bit cookie (hash) for a handle and insert it into + * global (per-node) hash-table. + */ void class_handle_hash(struct portals_handle *h, portals_handle_addref_cb cb) { struct list_head *bucket; @@ -52,19 +56,33 @@ void class_handle_hash(struct portals_handle *h, portals_handle_addref_cb cb) LASSERT(list_empty(&h->h_link)); spin_lock(&handle_lock); + + /* + * This is fast, but simplistic cookie generation algorithm, it will + * need a re-do at some point in the future for security. + */ h->h_cookie = handle_base; handle_base += HANDLE_INCR; - spin_unlock(&handle_lock); - h->h_addref = cb; bucket = handle_hash + (h->h_cookie & HANDLE_HASH_MASK); - CDEBUG(D_INFO, "adding object %p with handle "LPX64" to hash\n", - h, h->h_cookie); - - spin_lock(&handle_lock); list_add(&h->h_link, bucket); handle_count++; + + if (unlikely(handle_base == 0)) { + /* + * Cookie of zero is "dangerous", because in many places it's + * assumed that 0 means "unassigned" handle, not bound to any + * object. 
+ */ + CWARN("The universe has been exhausted: cookie wrap-around.\n"); + handle_base += HANDLE_INCR; + } + spin_unlock(&handle_lock); + + h->h_addref = cb; + CDEBUG(D_INFO, "added object %p with handle "LPX64" to hash\n", + h, h->h_cookie); EXIT; } diff --git a/lustre/obdclass/lustre_peer.c b/lustre/obdclass/lustre_peer.c index a1d89e1..499e313 100644 --- a/lustre/obdclass/lustre_peer.c +++ b/lustre/obdclass/lustre_peer.c @@ -155,6 +155,7 @@ int class_del_uuid (char *uuid) data = list_entry(deathrow.next, struct uuid_nid_data, un_list); list_del (&data->un_list); + CDEBUG(D_INFO, "del uuid %s\n", data->un_uuid); OBD_FREE(data->un_uuid, strlen(data->un_uuid) + 1); OBD_FREE(data, sizeof(*data)); diff --git a/lustre/obdclass/obd_config.c b/lustre/obdclass/obd_config.c index e8cda72..622af7d 100644 --- a/lustre/obdclass/obd_config.c +++ b/lustre/obdclass/obd_config.c @@ -73,6 +73,26 @@ int class_attach(struct lustre_cfg *lcfg) CDEBUG(D_IOCTL, "attach type %s name: %s uuid: %s\n", MKSTR(typename), MKSTR(name), MKSTR(uuid)); + + /* Mountconf transitional hack, should go away after 1.6. + 1.4.7 uses the old names, so translate back if the + mountconf flag is set. + 1.6 should set this flag, and translate the other way here + if not set. */ + if (lcfg->lcfg_flags & LCFG_FLG_MOUNTCONF){ + char *tmp = NULL; + if (strcmp(typename, "mds") == 0) + tmp = "mdt"; + if (strcmp(typename, "mdt") == 0) + tmp = "mds"; + if (strcmp(typename, "osd") == 0) + tmp = "obdfilter"; + if (tmp) { + LCONSOLE_WARN("Using type %s for %s %s\n", tmp, + MKSTR(typename), MKSTR(name)); + typename = tmp; + } + } /* find the type */ type = class_get_type(typename); @@ -359,7 +379,7 @@ int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg) /* Precleanup stage 1, we must make sure all exports (other than the self-export) get destroyed. 
*/ - err = obd_precleanup(obd, 1); + err = obd_precleanup(obd, OBD_CLEANUP_EXPORTS); if (err) CERROR("Precleanup %s returned %d\n", obd->obd_name, err); @@ -394,7 +414,7 @@ void class_decref(struct obd_device *obd) /* if we're not stopping, we didn't finish setup */ /* Precleanup stage 2, do other type-specific cleanup requiring the self-export. */ - err = obd_precleanup(obd, 2); + err = obd_precleanup(obd, OBD_CLEANUP_SELF_EXP); if (err) CERROR("Precleanup %s returned %d\n", obd->obd_name, err); @@ -435,8 +455,8 @@ int class_add_conn(struct obd_device *obd, struct lustre_cfg *lcfg) CERROR("invalid conn_uuid\n"); RETURN(-EINVAL); } - if (strcmp(obd->obd_type->typ_name, "mdc") && - strcmp(obd->obd_type->typ_name, "osc")) { + if (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME)) { CERROR("can't add connection on non-client dev\n"); RETURN(-EINVAL); } @@ -465,8 +485,8 @@ int class_del_conn(struct obd_device *obd, struct lustre_cfg *lcfg) CERROR("invalid conn_uuid\n"); RETURN(-EINVAL); } - if (strcmp(obd->obd_type->typ_name, "mdc") && - strcmp(obd->obd_type->typ_name, "osc")) { + if (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME)) { CERROR("can't del connection on non-client dev\n"); RETURN(-EINVAL); } @@ -619,6 +639,11 @@ int class_process_config(struct lustre_cfg *lcfg) sizeof (obd_lustre_upcall)); GOTO(out, err = 0); } + case LCFG_PARAM: + case LCFG_MARKER: { + LCONSOLE_WARN("LCFG_MARKER not yet implemented.\n"); + GOTO(out, err = 0); + } } /* Commands that require a device */ @@ -851,7 +876,7 @@ void class_manual_cleanup(struct obd_device *obd) int err; char flags[3]=""; ENTRY; - + if (!obd) { CERROR("empty cleanup\n"); EXIT; @@ -863,22 +888,22 @@ void class_manual_cleanup(struct obd_device *obd) if (obd->obd_fail) strcat(flags, "A"); - CDEBUG(D_CONFIG, "Manual cleanup of %s (flags='%s')\n", + CDEBUG(D_CONFIG, "Manual cleanup of %s (flags='%s')\n", 
obd->obd_name, flags); lustre_cfg_bufs_reset(&bufs, obd->obd_name); lustre_cfg_bufs_set_string(&bufs, 1, flags); lcfg = lustre_cfg_new(LCFG_CLEANUP, &bufs); - + err = class_process_config(lcfg); - if (err) + if (err) CERROR("cleanup failed %d: %s\n", err, obd->obd_name); - + /* the lcfg is almost the same for both ops */ lcfg->lcfg_command = LCFG_DETACH; err = class_process_config(lcfg); lustre_cfg_free(lcfg); - if (err) + if (err) CERROR("detach failed %d: %s\n", err, obd->obd_name); EXIT; } diff --git a/lustre/obdecho/echo_client.c b/lustre/obdecho/echo_client.c index 11e4ba2..200d2fb 100644 --- a/lustre/obdecho/echo_client.c +++ b/lustre/obdecho/echo_client.c @@ -938,9 +938,8 @@ static int echo_client_prep_commit(struct obd_export *exp, int rw, rnb[i].len = PAGE_SIZE; } - /* XXX this can't be the best.. */ - memset(oti, 0, sizeof(*oti)); ioo.ioo_bufcnt = npages; + oti->oti_transno = 0; ret = obd_preprw(rw, exp, oa, 1, &ioo, npages, rnb, lnb, oti); if (ret != 0) @@ -988,7 +987,7 @@ int echo_client_brw_ioctl(int rw, struct obd_export *exp, { struct obd_device *obd = class_exp2obd(exp); struct echo_client_obd *ec = &obd->u.echo_client; - struct obd_trans_info dummy_oti; + struct obd_trans_info dummy_oti = { .oti_thread_id = -1 }; struct ec_object *eco; int rc; ENTRY; @@ -997,8 +996,6 @@ int echo_client_brw_ioctl(int rw, struct obd_export *exp, if (rc) RETURN(rc); - memset(&dummy_oti, 0, sizeof(dummy_oti)); - data->ioc_obdo1.o_valid &= ~OBD_MD_FLHANDLE; data->ioc_obdo1.o_valid |= OBD_MD_FLGROUP; data->ioc_obdo1.o_gr = FILTER_GROUP_ECHO; diff --git a/lustre/obdfilter/filter.c b/lustre/obdfilter/filter.c index 827085c..b086e15 100644 --- a/lustre/obdfilter/filter.c +++ b/lustre/obdfilter/filter.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include #include @@ -368,9 +369,9 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp) int rc; /* ensure padding in the struct is the correct size */ - LASSERT (offsetof(struct 
filter_server_data, fsd_padding) + + CLASSERT(offsetof(struct filter_server_data, fsd_padding) + sizeof(fsd->fsd_padding) == LR_SERVER_SIZE); - LASSERT (offsetof(struct filter_client_data, fcd_padding) + + CLASSERT(offsetof(struct filter_client_data, fcd_padding) + sizeof(fcd->fcd_padding) == LR_CLIENT_SIZE); OBD_ALLOC(fsd, sizeof(*fsd)); @@ -782,7 +783,7 @@ static int filter_prep(struct obd_device *obd) LAST_RCVD, rc); GOTO(out, rc); } - + filter->fo_rcvd_filp = file; if (!S_ISREG(file->f_dentry->d_inode->i_mode)) { CERROR("%s is not a regular file!: mode = %o\n", LAST_RCVD, file->f_dentry->d_inode->i_mode); @@ -800,12 +801,27 @@ static int filter_prep(struct obd_device *obd) CERROR("cannot read %s: rc = %d\n", LAST_RCVD, rc); GOTO(err_filp, rc); } - filter->fo_rcvd_filp = file; + /* open and create health check io file*/ + file = filp_open(HEALTH_CHECK, O_RDWR | O_CREAT, 0644); + if (IS_ERR(file)) { + rc = PTR_ERR(file); + CERROR("OBD filter: cannot open/create %s rc = %d\n", + HEALTH_CHECK, rc); + GOTO(err_filp, rc); + } + filter->fo_health_check_filp = file; + if (!S_ISREG(file->f_dentry->d_inode->i_mode)) { + CERROR("%s is not a regular file!: mode = %o\n", HEALTH_CHECK, + file->f_dentry->d_inode->i_mode); + GOTO(err_health_check, rc = -ENOENT); + } + rc = lvfs_check_io_health(obd, file); + if (rc) + GOTO(err_health_check, rc); rc = filter_prep_groups(obd); if (rc) GOTO(err_server_data, rc); - out: pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); @@ -814,8 +830,12 @@ static int filter_prep(struct obd_device *obd) err_server_data: //class_disconnect_exports(obd, 0); filter_free_server_data(filter); + err_health_check: + if (filp_close(filter->fo_health_check_filp, 0)) + CERROR("can't close %s after error\n", HEALTH_CHECK); + filter->fo_health_check_filp = NULL; err_filp: - if (filp_close(file, 0)) + if (filp_close(filter->fo_rcvd_filp, 0)) CERROR("can't close %s after error\n", LAST_RCVD); filter->fo_rcvd_filp = NULL; goto out; @@ -850,6 +870,11 @@ static void 
filter_post(struct obd_device *obd) if (rc) CERROR("error closing %s: rc = %d\n", LAST_RCVD, rc); + rc = filp_close(filter->fo_health_check_filp, 0); + filter->fo_health_check_filp = NULL; + if (rc) + CERROR("error closing %s: rc = %d\n", HEALTH_CHECK, rc); + filter_cleanup_groups(obd); filter_free_server_data(filter); pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); @@ -982,10 +1007,8 @@ struct dentry *filter_fid2dentry(struct obd_device *obd, int len; ENTRY; - if (OBD_FAIL_CHECK(OBD_FAIL_OST_ENOENT)) { - CERROR("test case OBD_FAIL_OST_ENOENT\n"); + if (OBD_FAIL_CHECK(OBD_FAIL_OST_ENOENT)) RETURN(ERR_PTR(-ENOENT)); - } if (id == 0) { CERROR("fatal: invalid object id 0\n"); @@ -1211,6 +1234,7 @@ static int filter_intent_policy(struct ldlm_namespace *ns, } RETURN(ELDLM_LOCK_ABORTED); } + /* * This check is for lock taken in filter_prepare_destroy() that does * not have l_glimpse_ast set. So the logic is: if there is a lock @@ -1259,7 +1283,7 @@ static int filter_intent_policy(struct ldlm_namespace *ns, * unknown at the time of OST thread creation. * * Instead array of iobuf's is attached to struct filter_obd (->fo_iobuf_pool - * field). This array has size OST_NUM_THREADS, so that each OST thread uses + * field). This array has size OST_MAX_THREADS, so that each OST thread uses * it's very own iobuf. 
* * Functions below @@ -1279,18 +1303,18 @@ static int filter_intent_policy(struct ldlm_namespace *ns, */ static void filter_iobuf_pool_done(struct filter_obd *filter) { - void **pool; + struct filter_iobuf **pool; int i; ENTRY; pool = filter->fo_iobuf_pool; if (pool != NULL) { - for (i = 0; i < OST_NUM_THREADS; ++ i) { + for (i = 0; i < filter->fo_iobuf_count; ++ i) { if (pool[i] != NULL) filter_free_iobuf(pool[i]); } - OBD_FREE(pool, OST_NUM_THREADS * sizeof pool[0]); + OBD_FREE(pool, filter->fo_iobuf_count * sizeof pool[0]); filter->fo_iobuf_pool = NULL; } EXIT; @@ -1299,48 +1323,37 @@ static void filter_iobuf_pool_done(struct filter_obd *filter) /* * pre-allocate pool of iobuf's to be used by filter_{prep,commit}rw_write(). */ -static int filter_iobuf_pool_init(struct filter_obd *filter, int count) +static int filter_iobuf_pool_init(struct filter_obd *filter) { void **pool; - int i; - int result = 0; ENTRY; - LASSERT(count <= OST_NUM_THREADS); - - OBD_ALLOC_GFP(pool, OST_NUM_THREADS * sizeof pool[0], GFP_KERNEL); - if (pool == NULL) + OBD_ALLOC_GFP(filter->fo_iobuf_pool, OST_MAX_THREADS * sizeof(*pool), + GFP_KERNEL); + if (filter->fo_iobuf_pool == NULL) RETURN(-ENOMEM); - filter->fo_iobuf_pool = pool; - filter->fo_iobuf_count = count; - for (i = 0; i < count; ++ i) { - /* - * allocate kiobuf to be used by i-th OST thread. - */ - result = filter_alloc_iobuf(filter, OBD_BRW_WRITE, - PTLRPC_MAX_BRW_PAGES, - &pool[i]); - if (result != 0) { - filter_iobuf_pool_done(filter); - break; - } - } - RETURN(result); + filter->fo_iobuf_count = OST_MAX_THREADS; + + RETURN(0); } -/* - * return iobuf preallocated by filter_iobuf_pool_init() for @thread. - */ -void *filter_iobuf_get(struct ptlrpc_thread *thread, struct filter_obd *filter) +/* Return iobuf allocated for @thread_id. We don't know in advance how + * many threads there will be so we allocate a large empty array and only + * fill in those slots that are actually in use. 
+ * If we haven't allocated a pool entry for this thread before, do so now. */ +void *filter_iobuf_get(struct filter_obd *filter, struct obd_trans_info *oti) { - void *kio; + int thread_id = oti ? oti->oti_thread_id : -1; + struct filter_iobuf **pool = &filter->fo_iobuf_pool[thread_id]; - LASSERT(thread->t_id < filter->fo_iobuf_count); - kio = filter->fo_iobuf_pool[thread->t_id]; - LASSERT(kio != NULL); - return kio; + LASSERT(thread_id < filter->fo_iobuf_count); + if (unlikely(thread_id < 0 || *pool == NULL)) + filter_alloc_iobuf(filter, OBD_BRW_WRITE, + PTLRPC_MAX_BRW_PAGES, pool); + + return *pool; } /* mount the file system (secretly). lustre_cfg parameters are: @@ -1369,7 +1382,7 @@ int filter_common_setup(struct obd_device *obd, obd_count len, void *buf, if (IS_ERR(obd->obd_fsops)) RETURN(PTR_ERR(obd->obd_fsops)); - rc = filter_iobuf_pool_init(filter, OST_NUM_THREADS); + rc = filter_iobuf_pool_init(filter); if (rc != 0) GOTO(err_ops, rc); @@ -1591,10 +1604,10 @@ static int filter_precleanup(struct obd_device *obd, int stage) ENTRY; switch(stage) { - case 1: + case OBD_CLEANUP_EXPORTS: target_cleanup_recovery(obd); break; - case 2: + case OBD_CLEANUP_SELF_EXP: rc = filter_llog_finish(obd, 0); } RETURN(rc); @@ -1669,6 +1682,54 @@ static int filter_cleanup(struct obd_device *obd) RETURN(0); } +static int filter_connect_internal(struct obd_export *exp, + struct obd_connect_data *data) +{ + if (data != NULL) { + CDEBUG(D_RPCTRACE, "%s: cli %s/%p ocd_connect_flags: "LPX64 + " ocd_version: %x ocd_grant: %d\n", + exp->exp_obd->obd_name, exp->exp_client_uuid.uuid, exp, + data->ocd_connect_flags, data->ocd_version, + data->ocd_grant); + + data->ocd_connect_flags &= OST_CONNECT_SUPPORTED; + exp->exp_connect_flags = data->ocd_connect_flags; + data->ocd_version = LUSTRE_VERSION_CODE; + + if (exp->exp_connect_flags & OBD_CONNECT_GRANT) { + obd_size left, want; + + spin_lock(&exp->exp_obd->obd_osfs_lock); + left = filter_grant_space_left(exp); + want = data->ocd_grant; + 
data->ocd_grant = filter_grant(exp, 0, want, left); + spin_unlock(&exp->exp_obd->obd_osfs_lock); + + CDEBUG(D_CACHE, "%s: cli %s/%p ocd_grant: %d want: " + "%lld left: %lld\n", exp->exp_obd->obd_name, + exp->exp_client_uuid.uuid, exp, + data->ocd_grant, want, left); + } + } + + RETURN(0); +} + +static int filter_reconnect(struct obd_export *exp, struct obd_device *obd, + struct obd_uuid *cluuid, + struct obd_connect_data *data) +{ + int rc; + ENTRY; + + if (exp == NULL || obd == NULL || cluuid == NULL) + RETURN(-EINVAL); + + rc = filter_connect_internal(exp, data); + + RETURN(rc); +} + /* nearly identical to mds_connect */ static int filter_connect(struct lustre_handle *conn, struct obd_device *obd, struct obd_uuid *cluuid, struct obd_connect_data *data) @@ -1720,6 +1781,9 @@ static int filter_connect(struct lustre_handle *conn, struct obd_device *obd, fed->fed_fcd = fcd; rc = filter_client_add(obd, filter, fed, -1); + if (!rc) + filter_connect_internal(exp, data); + GOTO(cleanup, rc); cleanup: @@ -1855,6 +1919,8 @@ static int filter_destroy_export(struct obd_export *exp) if (exp->exp_obd->obd_replayable) filter_client_free(exp); + else + fsfilt_sync(exp->exp_obd, exp->exp_obd->u.obt.obt_sb); filter_grant_discard(exp); @@ -1991,9 +2057,8 @@ int filter_setattr_internal(struct obd_export *exp, struct dentry *dentry, GOTO(out_unlock, rc = PTR_ERR(handle)); if (oa->o_valid & OBD_MD_FLFLAGS) { - rc = fsfilt_iocontrol(exp->exp_obd, dentry->d_inode, - NULL, EXT3_IOC_SETFLAGS, - (long)&iattr.ia_attr_flags); + rc = fsfilt_iocontrol(exp->exp_obd, dentry->d_inode, NULL, + EXT3_IOC_SETFLAGS, (long)&oa->o_flags); } else { rc = fsfilt_setattr(exp->exp_obd, dentry, handle, &iattr, 1); if (fcc != NULL) @@ -2157,9 +2222,9 @@ static int filter_statfs(struct obd_device *obd, struct obd_statfs *osfs, filter_grant_sanity_check(obd, __FUNCTION__); - osfs->os_bavail -= min(osfs->os_bavail, - (filter->fo_tot_dirty + filter->fo_tot_pending + - osfs->os_bsize - 1) >> blockbits); + 
osfs->os_bavail -= min(osfs->os_bavail, GRANT_FOR_LLOG(obd) + + ((filter->fo_tot_dirty + filter->fo_tot_pending + + osfs->os_bsize - 1) >> blockbits)); /* set EROFS to state field if FS is mounted as RDONLY. The goal is to * stop creating files on MDS if OST is not good shape to create @@ -2680,9 +2745,11 @@ static int filter_truncate(struct obd_export *exp, struct obdo *oa, int rc; ENTRY; - if (end != OBD_OBJECT_EOF) + if (end != OBD_OBJECT_EOF) { CERROR("PUNCH not supported, only truncate: end = "LPX64"\n", end); + RETURN(-EFAULT); + } CDEBUG(D_INODE, "calling truncate for object "LPU64", valid = "LPX64 ", o_size = "LPD64"\n", oa->o_id, oa->o_valid, start); @@ -2882,6 +2949,7 @@ int filter_iocontrol(unsigned int cmd, struct obd_export *exp, static int filter_health_check(struct obd_device *obd) { + struct filter_obd *filter = &obd->u.filter; int rc = 0; /* @@ -2891,6 +2959,9 @@ static int filter_health_check(struct obd_device *obd) if(obd->u.obt.obt_sb->s_flags & MS_RDONLY) rc = 1; + LASSERT(filter->fo_health_check_filp != NULL); + rc |= !!lvfs_check_io_health(obd, filter->fo_health_check_filp); + return rc; } @@ -2912,6 +2983,7 @@ static struct obd_ops filter_obd_ops = { .o_precleanup = filter_precleanup, .o_cleanup = filter_cleanup, .o_connect = filter_connect, + .o_reconnect = filter_reconnect, .o_disconnect = filter_disconnect, .o_statfs = filter_statfs, .o_getattr = filter_getattr, @@ -2939,6 +3011,7 @@ static struct obd_ops filter_sanobd_ops = { .o_precleanup = filter_precleanup, .o_cleanup = filter_cleanup, .o_connect = filter_connect, + .o_reconnect = filter_reconnect, .o_disconnect = filter_disconnect, .o_statfs = filter_statfs, .o_getattr = filter_getattr, diff --git a/lustre/obdfilter/filter_internal.h b/lustre/obdfilter/filter_internal.h index 3998e2a..fb3f741 100644 --- a/lustre/obdfilter/filter_internal.h +++ b/lustre/obdfilter/filter_internal.h @@ -23,6 +23,7 @@ # define OBD_FILTER_SAN_DEVICENAME "sanobdfilter" #endif +#define HEALTH_CHECK 
"health_check" #define FILTER_INIT_OBJID 0 #define FILTER_SUBDIR_COUNT 32 /* set to zero for no subdirs */ @@ -35,6 +36,7 @@ #define FILTER_INCOMPAT_SUPP (OBD_INCOMPAT_GROUPS) #define FILTER_GRANT_CHUNK (2ULL * PTLRPC_MAX_BRW_SIZE) +#define GRANT_FOR_LLOG(obd) 16 /* Data stored per server at the head of the last_rcvd file. In le32 order. * Try to keep this the same as mds_server_data so we might one day merge. */ @@ -138,6 +140,7 @@ int filter_brw(int cmd, struct obd_export *, struct obdo *, void flip_into_page_cache(struct inode *inode, struct page *new_page); /* filter_io_*.c */ +struct filter_iobuf; int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, int objcount, struct obd_ioobj *obj, int niocount, struct niobuf_local *res, struct obd_trans_info *oti, @@ -147,13 +150,15 @@ long filter_grant(struct obd_export *exp, obd_size current_grant, obd_size want, obd_size fs_space_left); void filter_grant_commit(struct obd_export *exp, int niocount, struct niobuf_local *res); -int filter_alloc_iobuf(struct filter_obd *, int rw, int num_pages, void **ret); -void filter_free_iobuf(void *iobuf); -int filter_iobuf_add_page(struct obd_device *obd, void *iobuf, +int filter_alloc_iobuf(struct filter_obd *, int rw, int num_pages, + struct filter_iobuf **ret); +void filter_free_iobuf(struct filter_iobuf *iobuf); +int filter_iobuf_add_page(struct obd_device *obd, struct filter_iobuf *iobuf, struct inode *inode, struct page *page); -void *filter_iobuf_get(struct ptlrpc_thread *thread, struct filter_obd *filter); -void filter_iobuf_put(void *iobuf); -int filter_direct_io(int rw, struct dentry *dchild, void *iobuf, +void *filter_iobuf_get(struct filter_obd *filter, struct obd_trans_info *oti); +void filter_iobuf_put(struct filter_obd *filter, struct filter_iobuf *iobuf, + struct obd_trans_info *oti); +int filter_direct_io(int rw, struct dentry *dchild, struct filter_iobuf *iobuf, struct obd_export *exp, struct iattr *attr, struct obd_trans_info *oti, void 
**wait_handle); diff --git a/lustre/obdfilter/filter_io.c b/lustre/obdfilter/filter_io.c index 41097b5..c2e7219 100644 --- a/lustre/obdfilter/filter_io.c +++ b/lustre/obdfilter/filter_io.c @@ -142,8 +142,6 @@ static void filter_grant_incoming(struct obd_export *exp, struct obdo *oa) EXIT; } -#define GRANT_FOR_LLOG(obd) 16 - /* Figure out how much space is available between what we've granted * and what remains in the filesystem. Compensate for ext3 indirect * block overhead when computing how much free space is left ungranted. @@ -182,7 +180,7 @@ restat: if (left >= tot_granted) { left -= tot_granted; } else { - if (left < tot_granted - obd->u.filter.fo_tot_pending + 65536) { + if (left < tot_granted - obd->u.filter.fo_tot_pending) { CERROR("%s: cli %s/%p grant "LPU64" > available " LPU64" and pending "LPU64"\n", obd->obd_name, exp->exp_client_uuid.uuid, exp, tot_granted, @@ -228,12 +226,16 @@ long filter_grant(struct obd_export *exp, obd_size current_grant, obd->obd_name, exp->exp_client_uuid.uuid, exp, want); } else if (current_grant < want && current_grant < fed->fed_grant + FILTER_GRANT_CHUNK) { - grant = min((want >> blockbits) / 2, + grant = min((want >> blockbits), (fs_space_left >> blockbits) / 8); grant <<= blockbits; if (grant) { - if (grant > FILTER_GRANT_CHUNK) + /* Allow >FILTER_GRANT_CHUNK size when clients + * reconnect due to a server reboot. 
+ */ + if ((grant > FILTER_GRANT_CHUNK) && + (!obd->obd_recovering)) grant = FILTER_GRANT_CHUNK; obd->u.filter.fo_tot_granted += grant; @@ -290,9 +292,9 @@ static int filter_preprw_read(int cmd, struct obd_export *exp, struct obdo *oa, spin_unlock(&obd->obd_osfs_lock); } - push_ctxt(&saved, &exp->exp_obd->obd_lvfs_ctxt, NULL); + push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); - iobuf = filter_iobuf_get(oti->oti_thread, &exp->exp_obd->u.filter); + iobuf = filter_iobuf_get(&obd->u.filter, oti); dentry = filter_oa2dentry_quiet(obd, oa); if (IS_ERR(dentry)) { @@ -367,9 +369,9 @@ static int filter_preprw_read(int cmd, struct obd_export *exp, struct obdo *oa, f_dput(dentry); } - filter_iobuf_put(iobuf); + filter_iobuf_put(&obd->u.filter, iobuf, oti); - pop_ctxt(&saved, &exp->exp_obd->obd_lvfs_ctxt, NULL); + pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); if (rc) CERROR("io error %d\n", rc); @@ -442,7 +444,7 @@ static int filter_grant_check(struct obd_export *exp, int objcount, * marked BRW_GRANTED are already mapped and we can * ignore this error. 
*/ lnb[n].rc = -ENOSPC; - rnb[n].flags &= OBD_BRW_GRANTED; + rnb[n].flags &= ~OBD_BRW_GRANTED; CDEBUG(D_CACHE,"%s: cli %s/%p idx %d no space for %d\n", exp->exp_obd->obd_name, exp->exp_client_uuid.uuid, exp, n, bytes); @@ -521,7 +523,7 @@ static int filter_preprw_write(int cmd, struct obd_export *exp, struct obdo *oa, OBD_RACE(OBD_FAIL_OST_CLEAR_ORPHANS_RACE); - iobuf = filter_iobuf_get(oti->oti_thread, &exp->exp_obd->u.filter); + iobuf = filter_iobuf_get(&exp->exp_obd->u.filter, oti); cleanup_phase = 1; push_ctxt(&saved, &exp->exp_obd->obd_lvfs_ctxt, NULL); @@ -636,19 +638,20 @@ cleanup: switch(cleanup_phase) { case 4: case 3: - filter_iobuf_put(iobuf); + filter_iobuf_put(&exp->exp_obd->u.filter, iobuf, oti); case 2: pop_ctxt(&saved, &exp->exp_obd->obd_lvfs_ctxt, NULL); if (rc && dentry && !IS_ERR(dentry)) f_dput(dentry); break; case 1: + filter_iobuf_put(&exp->exp_obd->u.filter, iobuf, oti); + case 0: spin_lock(&exp->exp_obd->obd_osfs_lock); if (oa) filter_grant_incoming(exp, oa); spin_unlock(&exp->exp_obd->obd_osfs_lock); pop_ctxt(&saved, &exp->exp_obd->obd_lvfs_ctxt, NULL); - filter_iobuf_put(iobuf); break; default:; } diff --git a/lustre/obdfilter/filter_io_24.c b/lustre/obdfilter/filter_io_24.c index b0c498e..1b69b0a 100644 --- a/lustre/obdfilter/filter_io_24.c +++ b/lustre/obdfilter/filter_io_24.c @@ -149,13 +149,13 @@ static int filter_clear_page_cache(struct inode *inode, struct kiobuf *iobuf) } /* Must be called with i_sem taken for writes; this will drop it */ -int filter_direct_io(int rw, struct dentry *dchild, void *buf, +int filter_direct_io(int rw, struct dentry *dchild, struct filter_iobuf *buf, struct obd_export *exp, struct iattr *attr, struct obd_trans_info *oti, void **wait_handle) { struct obd_device *obd = exp->exp_obd; struct inode *inode = dchild->d_inode; - struct kiobuf *iobuf = buf; + struct kiobuf *iobuf = (void *)buf; int rc, create = (rw == OBD_BRW_WRITE), committed = 0; int blocks_per_page = PAGE_SIZE >> inode->i_blkbits, 
cleanup_phase = 0; struct semaphore *sem = NULL; @@ -296,13 +296,8 @@ static void clear_kiobuf(struct kiobuf *iobuf) iobuf->length = 0; } -void filter_iobuf_put(void *iobuf) -{ - clear_kiobuf(iobuf); -} - int filter_alloc_iobuf(struct filter_obd *filter, int rw, int num_pages, - void **ret) + struct filter_iobuf **ret) { int rc; struct kiobuf *iobuf; @@ -310,6 +305,7 @@ int filter_alloc_iobuf(struct filter_obd *filter, int rw, int num_pages, LASSERTF(rw == OBD_BRW_WRITE || rw == OBD_BRW_READ, "%x\n", rw); + *ret = NULL; rc = alloc_kiovec(1, &iobuf); if (rc) RETURN(rc); @@ -324,22 +320,38 @@ int filter_alloc_iobuf(struct filter_obd *filter, int rw, int num_pages, iobuf->dovary = 0; /* this prevents corruption, not present in 2.4.20 */ #endif clear_kiobuf(iobuf); - *ret = iobuf; + *ret = (void *)iobuf; RETURN(0); } -void filter_free_iobuf(void *buf) +void filter_free_iobuf(struct filter_iobuf *buf) { - struct kiobuf *iobuf = buf; + struct kiobuf *iobuf = (void *)buf; clear_kiobuf(iobuf); free_kiovec(1, &iobuf); } -int filter_iobuf_add_page(struct obd_device *obd, void *buf, +void filter_iobuf_put(struct filter_obd *filter, struct filter_iobuf *iobuf, + struct obd_trans_info *oti) +{ + int thread_id = oti ? 
oti->oti_thread_id : -1; + + if (unlikely(thread_id < 0)) { + filter_free_iobuf(iobuf); + return; + } + + LASSERTF(filter->fo_iobuf_pool[thread_id] == iobuf, + "iobuf mismatch for thread %d: pool %p iobuf %p\n", + thread_id, filter->fo_iobuf_pool[thread_id], iobuf); + clear_kiobuf((void *)iobuf); +} + +int filter_iobuf_add_page(struct obd_device *obd, struct filter_iobuf *buf, struct inode *inode, struct page *page) { - struct kiobuf *iobuf = buf; + struct kiobuf *iobuf = (void *)buf; iobuf->maplist[iobuf->nr_pages++] = page; iobuf->length += PAGE_SIZE; @@ -370,7 +382,9 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, int objcount, if (rc != 0) GOTO(cleanup, rc); - iobuf = filter_iobuf_get(oti->oti_thread, &exp->exp_obd->u.filter); + iobuf = filter_iobuf_get(&obd->u.filter, oti); + if (iobuf == NULL) + GOTO(cleanup, rc = -ENOMEM); cleanup_phase = 1; fso.fso_dentry = res->dentry; @@ -442,7 +456,7 @@ cleanup: pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); LASSERT(current->journal_info == NULL); case 1: - filter_iobuf_put(iobuf); + filter_iobuf_put(&obd->u.filter, iobuf, oti); case 0: /* * lnb->page automatically returns back into per-thread page diff --git a/lustre/obdfilter/filter_io_26.c b/lustre/obdfilter/filter_io_26.c index 1ab866c..13141b1 100644 --- a/lustre/obdfilter/filter_io_26.c +++ b/lustre/obdfilter/filter_io_26.c @@ -42,7 +42,7 @@ /* 512byte block min */ #define MAX_BLOCKS_PER_PAGE (PAGE_SIZE / 512) -struct dio_request { +struct filter_iobuf { atomic_t dr_numreqs; /* number of reqs being processed */ struct bio *dr_bios; /* list of completed bios */ wait_queue_head_t dr_wait; @@ -57,12 +57,12 @@ struct dio_request { struct filter_obd *dr_filter; }; -static void record_start_io(struct dio_request *dreq, int rw, int size) +static void record_start_io(struct filter_iobuf *iobuf, int rw, int size) { - struct filter_obd *filter = dreq->dr_filter; + struct filter_obd *filter = iobuf->dr_filter; unsigned long flags; - 
atomic_inc(&dreq->dr_numreqs); + atomic_inc(&iobuf->dr_numreqs); if (rw == OBD_BRW_READ) { lprocfs_oh_tally(&filter->fo_read_rpc_hist, @@ -79,12 +79,12 @@ static void record_start_io(struct dio_request *dreq, int rw, int size) else filter->fo_w_in_flight++; spin_unlock_irqrestore(&filter->fo_stats_lock, flags); - dreq->dr_start_time = jiffies; + iobuf->dr_start_time = jiffies; } -static void record_finish_io(struct dio_request *dreq, int rw, int rc) +static void record_finish_io(struct filter_iobuf *iobuf, int rw, int rc) { - struct filter_obd *filter = dreq->dr_filter; + struct filter_obd *filter = iobuf->dr_filter; unsigned long flags, stop_time = jiffies; spin_lock_irqsave(&filter->fo_stats_lock, flags); @@ -94,24 +94,24 @@ static void record_finish_io(struct dio_request *dreq, int rw, int rc) filter->fo_w_in_flight--; spin_unlock_irqrestore(&filter->fo_stats_lock, flags); - if (atomic_dec_and_test(&dreq->dr_numreqs)) - wake_up(&dreq->dr_wait); + if (atomic_dec_and_test(&iobuf->dr_numreqs)) + wake_up(&iobuf->dr_wait); if (rc != 0) return; if (rw == OBD_BRW_READ) { lprocfs_oh_tally_log2(&filter->fo_r_io_time, - stop_time - dreq->dr_start_time); + stop_time - iobuf->dr_start_time); } else { lprocfs_oh_tally_log2(&filter->fo_w_io_time, - stop_time - dreq->dr_start_time); + stop_time - iobuf->dr_start_time); } } static int dio_complete_routine(struct bio *bio, unsigned int done, int error) { - struct dio_request *dreq = bio->bi_private; + struct filter_iobuf *iobuf = bio->bi_private; unsigned long flags; if (bio->bi_size) { @@ -120,7 +120,7 @@ static int dio_complete_routine(struct bio *bio, unsigned int done, int error) return 1; } - if (dreq == NULL) { + if (iobuf == NULL) { CERROR("***** bio->bi_private is NULL! This should never " "happen. Normally, I would crash here, but instead I " "will dump the bio contents to the console. 
Please " @@ -138,14 +138,14 @@ static int dio_complete_routine(struct bio *bio, unsigned int done, int error) return 0; } - spin_lock_irqsave(&dreq->dr_lock, flags); - bio->bi_private = dreq->dr_bios; - dreq->dr_bios = bio; - if (dreq->dr_error == 0) - dreq->dr_error = error; - spin_unlock_irqrestore(&dreq->dr_lock, flags); + spin_lock_irqsave(&iobuf->dr_lock, flags); + bio->bi_private = iobuf->dr_bios; + iobuf->dr_bios = bio; + if (iobuf->dr_error == 0) + iobuf->dr_error = error; + spin_unlock_irqrestore(&iobuf->dr_lock, flags); - record_finish_io(dreq, test_bit(BIO_RW, &bio->bi_rw) ? + record_finish_io(iobuf, test_bit(BIO_RW, &bio->bi_rw) ? OBD_BRW_WRITE : OBD_BRW_READ, error); return 0; @@ -163,91 +163,103 @@ static int can_be_merged(struct bio *bio, sector_t sector) } int filter_alloc_iobuf(struct filter_obd *filter, int rw, int num_pages, - void **ret) + struct filter_iobuf **ret) { - struct dio_request *dreq; + struct filter_iobuf *iobuf; LASSERTF(rw == OBD_BRW_WRITE || rw == OBD_BRW_READ, "%x\n", rw); - OBD_ALLOC(dreq, sizeof(*dreq)); - if (dreq == NULL) + *ret = NULL; + OBD_ALLOC(iobuf, sizeof(*iobuf)); + if (iobuf == NULL) goto failed_0; - OBD_ALLOC(dreq->dr_pages, num_pages * sizeof(*dreq->dr_pages)); - if (dreq->dr_pages == NULL) + OBD_ALLOC(iobuf->dr_pages, num_pages * sizeof(*iobuf->dr_pages)); + if (iobuf->dr_pages == NULL) goto failed_1; - OBD_ALLOC(dreq->dr_blocks, - MAX_BLOCKS_PER_PAGE * num_pages * sizeof(*dreq->dr_blocks)); - if (dreq->dr_blocks == NULL) + OBD_ALLOC(iobuf->dr_blocks, + MAX_BLOCKS_PER_PAGE * num_pages * sizeof(*iobuf->dr_blocks)); + if (iobuf->dr_blocks == NULL) goto failed_2; - dreq->dr_filter = filter; - dreq->dr_bios = NULL; - init_waitqueue_head(&dreq->dr_wait); - atomic_set(&dreq->dr_numreqs, 0); - spin_lock_init(&dreq->dr_lock); - dreq->dr_max_pages = num_pages; - dreq->dr_npages = 0; + iobuf->dr_filter = filter; + iobuf->dr_bios = NULL; + init_waitqueue_head(&iobuf->dr_wait); + atomic_set(&iobuf->dr_numreqs, 0); + 
spin_lock_init(&iobuf->dr_lock); + iobuf->dr_max_pages = num_pages; + iobuf->dr_npages = 0; - *ret = dreq; + *ret = iobuf; RETURN(0); - + failed_2: - OBD_FREE(dreq->dr_pages, - num_pages * sizeof(*dreq->dr_pages)); + OBD_FREE(iobuf->dr_pages, + num_pages * sizeof(*iobuf->dr_pages)); failed_1: - OBD_FREE(dreq, sizeof(*dreq)); + OBD_FREE(iobuf, sizeof(*iobuf)); failed_0: RETURN(-ENOMEM); } -void filter_iobuf_put(void *iobuf) +static void filter_clear_iobuf(struct filter_iobuf *iobuf) { - struct dio_request *dreq = iobuf; - /* free all bios */ - while (dreq->dr_bios) { - struct bio *bio = dreq->dr_bios; - dreq->dr_bios = bio->bi_private; + while (iobuf->dr_bios) { + struct bio *bio = iobuf->dr_bios; + iobuf->dr_bios = bio->bi_private; bio_put(bio); } - dreq->dr_npages = 0; - atomic_set(&dreq->dr_numreqs, 0); + iobuf->dr_npages = 0; + atomic_set(&iobuf->dr_numreqs, 0); } -void filter_free_iobuf(void *iobuf) +void filter_free_iobuf(struct filter_iobuf *iobuf) { - struct dio_request *dreq = iobuf; - int num_pages = dreq->dr_max_pages; + int num_pages = iobuf->dr_max_pages; - filter_iobuf_put(dreq); + filter_clear_iobuf(iobuf); - OBD_FREE(dreq->dr_blocks, - MAX_BLOCKS_PER_PAGE * num_pages * sizeof(*dreq->dr_blocks)); - OBD_FREE(dreq->dr_pages, - num_pages * sizeof(*dreq->dr_pages)); - OBD_FREE_PTR(dreq); + OBD_FREE(iobuf->dr_blocks, + MAX_BLOCKS_PER_PAGE * num_pages * sizeof(*iobuf->dr_blocks)); + OBD_FREE(iobuf->dr_pages, + num_pages * sizeof(*iobuf->dr_pages)); + OBD_FREE_PTR(iobuf); } -int filter_iobuf_add_page(struct obd_device *obd, void *iobuf, - struct inode *inode, struct page *page) +void filter_iobuf_put(struct filter_obd *filter, struct filter_iobuf *iobuf, + struct obd_trans_info *oti) { - struct dio_request *dreq = iobuf; + int thread_id = oti ? 
oti->oti_thread_id : -1; - LASSERT (dreq->dr_npages < dreq->dr_max_pages); - dreq->dr_pages[dreq->dr_npages++] = page; + if (unlikely(thread_id < 0)) { + filter_free_iobuf(iobuf); + return; + } + + LASSERTF(filter->fo_iobuf_pool[thread_id] == iobuf, + "iobuf mismatch for thread %d: pool %p iobuf %p\n", + thread_id, filter->fo_iobuf_pool[thread_id], iobuf); + filter_clear_iobuf(iobuf); +} + +int filter_iobuf_add_page(struct obd_device *obd, struct filter_iobuf *iobuf, + struct inode *inode, struct page *page) +{ + LASSERT(iobuf->dr_npages < iobuf->dr_max_pages); + iobuf->dr_pages[iobuf->dr_npages++] = page; return 0; } int filter_do_bio(struct obd_device *obd, struct inode *inode, - struct dio_request *dreq, int rw) + struct filter_iobuf *iobuf, int rw) { int blocks_per_page = PAGE_SIZE >> inode->i_blkbits; - struct page **pages = dreq->dr_pages; - int npages = dreq->dr_npages; - unsigned long *blocks = dreq->dr_blocks; + struct page **pages = iobuf->dr_pages; + int npages = iobuf->dr_npages; + unsigned long *blocks = iobuf->dr_blocks; int total_blocks = npages * blocks_per_page; int sector_bits = inode->i_sb->s_blocksize_bits - 9; unsigned int blocksize = inode->i_sb->s_blocksize; @@ -262,17 +274,17 @@ int filter_do_bio(struct obd_device *obd, struct inode *inode, int rc = 0; ENTRY; - LASSERT(dreq->dr_npages == npages); + LASSERT(iobuf->dr_npages == npages); LASSERT(total_blocks <= OBDFILTER_CREATED_SCRATCHPAD_ENTRIES); - for (page_idx = 0, block_idx = 0; - page_idx < npages; + for (page_idx = 0, block_idx = 0; + page_idx < npages; page_idx++, block_idx += blocks_per_page) { - + page = pages[page_idx]; LASSERT (block_idx + blocks_per_page <= total_blocks); - for (i = 0, page_offset = 0; + for (i = 0, page_offset = 0; i < blocks_per_page; i += nblocks, page_offset += blocksize * nblocks) { @@ -295,7 +307,7 @@ int filter_do_bio(struct obd_device *obd, struct inode *inode, if (bio != NULL && can_be_merged(bio, sector) && - bio_add_page(bio, page, + bio_add_page(bio, 
page, blocksize * nblocks, page_offset) != 0) continue; /* added this frag OK */ @@ -306,25 +318,25 @@ int filter_do_bio(struct obd_device *obd, struct inode *inode, /* Dang! I have to fragment this I/O */ CDEBUG(D_INODE, "bio++ sz %d vcnt %d(%d) " "sectors %d(%d) psg %d(%d) hsg %d(%d)\n", - bio->bi_size, + bio->bi_size, bio->bi_vcnt, bio->bi_max_vecs, bio->bi_size >> 9, q->max_sectors, - bio_phys_segments(q, bio), + bio_phys_segments(q, bio), q->max_phys_segments, - bio_hw_segments(q, bio), + bio_hw_segments(q, bio), q->max_hw_segments); - record_start_io(dreq, rw, bio->bi_size); + record_start_io(iobuf, rw, bio->bi_size); rc = fsfilt_send_bio(rw, obd, inode, bio); if (rc < 0) { CERROR("Can't send bio: %d\n", rc); - record_finish_io(dreq, rw, rc); + record_finish_io(iobuf, rw, rc); goto out; } } /* allocate new bio */ - bio = bio_alloc(GFP_NOIO, + bio = bio_alloc(GFP_NOIO, (npages - page_idx) * blocks_per_page); if (bio == NULL) { CERROR ("Can't allocate bio\n"); @@ -335,30 +347,30 @@ int filter_do_bio(struct obd_device *obd, struct inode *inode, bio->bi_bdev = inode->i_sb->s_bdev; bio->bi_sector = sector; bio->bi_end_io = dio_complete_routine; - bio->bi_private = dreq; + bio->bi_private = iobuf; - rc = bio_add_page(bio, page, + rc = bio_add_page(bio, page, blocksize * nblocks, page_offset); LASSERT (rc != 0); } } if (bio != NULL) { - record_start_io(dreq, rw, bio->bi_size); + record_start_io(iobuf, rw, bio->bi_size); rc = fsfilt_send_bio(rw, obd, inode, bio); if (rc >= 0) { rc = 0; } else { CERROR("Can't send bio: %d\n", rc); - record_finish_io(dreq, rw, rc); + record_finish_io(iobuf, rw, rc); } } out: - wait_event(dreq->dr_wait, atomic_read(&dreq->dr_numreqs) == 0); + wait_event(iobuf->dr_wait, atomic_read(&iobuf->dr_numreqs) == 0); if (rc == 0) - rc = dreq->dr_error; + rc = iobuf->dr_error; RETURN(rc); } @@ -375,7 +387,7 @@ int filter_do_bio(struct obd_device *obd, struct inode *inode, * not be dirty, because we already called fdatasync/fdatawait on them. 
*/ static int filter_clear_page_cache(struct inode *inode, - struct dio_request *iobuf) + struct filter_iobuf *iobuf) { struct page *page; int i, rc, rc2; @@ -414,38 +426,37 @@ static int filter_clear_page_cache(struct inode *inode, } /* Must be called with i_sem taken for writes; this will drop it */ -int filter_direct_io(int rw, struct dentry *dchild, void *iobuf, +int filter_direct_io(int rw, struct dentry *dchild, struct filter_iobuf *iobuf, struct obd_export *exp, struct iattr *attr, struct obd_trans_info *oti, void **wait_handle) { struct obd_device *obd = exp->exp_obd; - struct dio_request *dreq = iobuf; struct inode *inode = dchild->d_inode; int blocks_per_page = PAGE_SIZE >> inode->i_blkbits; int rc, rc2, create; struct semaphore *sem; ENTRY; - LASSERTF(dreq->dr_npages <= dreq->dr_max_pages, "%d,%d\n", - dreq->dr_npages, dreq->dr_max_pages); - LASSERT(dreq->dr_npages <= OBDFILTER_CREATED_SCRATCHPAD_ENTRIES); + LASSERTF(iobuf->dr_npages <= iobuf->dr_max_pages, "%d,%d\n", + iobuf->dr_npages, iobuf->dr_max_pages); + LASSERT(iobuf->dr_npages <= OBDFILTER_CREATED_SCRATCHPAD_ENTRIES); if (rw == OBD_BRW_READ) { - if (dreq->dr_npages == 0) + if (iobuf->dr_npages == 0) RETURN(0); create = 0; sem = NULL; } else { LASSERTF(rw == OBD_BRW_WRITE, "%x\n", rw); - LASSERT(dreq->dr_npages > 0); + LASSERT(iobuf->dr_npages > 0); create = 1; sem = &obd->u.filter.fo_alloc_lock; lquota_enforce(quota_interface, obd, dreq->dr_ignore_quota); } remap: - rc = fsfilt_map_inode_pages(obd, inode, dreq->dr_pages, - dreq->dr_npages, dreq->dr_blocks, + rc = fsfilt_map_inode_pages(obd, inode, iobuf->dr_pages, + iobuf->dr_npages, iobuf->dr_blocks, obdfilter_created_scratchpad, create, sem); if (rc == -EDQUOT) { @@ -464,9 +475,9 @@ remap: if (rw == OBD_BRW_WRITE) { if (rc == 0) { filter_tally_write(&obd->u.filter, - dreq->dr_pages, - dreq->dr_npages, - dreq->dr_blocks, + iobuf->dr_pages, + iobuf->dr_npages, + iobuf->dr_blocks, blocks_per_page); if (attr->ia_size > inode->i_size) 
attr->ia_valid |= ATTR_SIZE; @@ -490,11 +501,11 @@ remap: RETURN(rc); } - rc = filter_clear_page_cache(inode, dreq); + rc = filter_clear_page_cache(inode, iobuf); if (rc != 0) RETURN(rc); - RETURN(filter_do_bio(obd, inode, dreq, rw)); + RETURN(filter_do_bio(obd, inode, iobuf, rw)); } /* See if there are unallocated parts in given file region */ @@ -524,7 +535,7 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, int rc) { struct niobuf_local *lnb; - struct dio_request *dreq = NULL; + struct filter_iobuf *iobuf = NULL; struct lvfs_run_ctxt saved; struct fsfilt_objinfo fso; struct iattr iattr = { 0 }; @@ -544,7 +555,7 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, if (rc != 0) GOTO(cleanup, rc); - dreq = filter_iobuf_get(oti->oti_thread, &exp->exp_obd->u.filter); + iobuf = filter_iobuf_get(&obd->u.filter, oti); cleanup_phase = 1; fso.fso_dentry = res->dentry; @@ -565,7 +576,7 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, continue; } - err = filter_iobuf_add_page(obd, dreq, inode, lnb->page); + err = filter_iobuf_add_page(obd, iobuf, inode, lnb->page); LASSERT (err == 0); total_size += lnb->len; @@ -604,7 +615,7 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, iattr_from_obdo(&iattr,oa,OBD_MD_FLATIME|OBD_MD_FLMTIME|OBD_MD_FLCTIME); /* filter_direct_io drops i_sem */ - rc = filter_direct_io(OBD_BRW_WRITE, res->dentry, dreq, exp, &iattr, + rc = filter_direct_io(OBD_BRW_WRITE, res->dentry, iobuf, exp, &iattr, oti, &wait_handle); if (rc == 0) obdo_from_inode(oa, inode, @@ -635,7 +646,7 @@ cleanup: pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); LASSERT(current->journal_info == NULL); case 1: - filter_iobuf_put(dreq); + filter_iobuf_put(&obd->u.filter, iobuf, oti); case 0: /* * lnb->page automatically returns back into per-thread page diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index 6052b2e..89a4d83 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ 
-635,6 +635,17 @@ void osc_wake_cache_waiters(struct client_obd *cli) EXIT; } +static void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd) +{ + spin_lock(&cli->cl_loi_list_lock); + cli->cl_avail_grant = ocd->ocd_grant; + spin_unlock(&cli->cl_loi_list_lock); + + CDEBUG(D_CACHE, "setting cl_avail_grant: %ld cl_lost_grant: %ld\n", + cli->cl_avail_grant, cli->cl_lost_grant); + LASSERT(cli->cl_avail_grant >= 0); +} + static void osc_update_grant(struct client_obd *cli, struct ost_body *body) { spin_lock(&cli->cl_loi_list_lock); @@ -794,6 +805,10 @@ static int osc_brw_prep_request(int cmd, struct obd_import *imp,struct obdo *oa, if (req == NULL) return (-ENOMEM); + /* FIXME bug 249. Also see bug 7198 */ + if (imp->imp_connect_data.ocd_connect_flags & OBD_CONNECT_REQPORTAL) + req->rq_request_portal = OST_IO_PORTAL; + if (opc == OST_WRITE) desc = ptlrpc_prep_bulk_imp (req, page_count, BULK_GET_SOURCE, OST_BULK_PORTAL); @@ -1140,24 +1155,24 @@ static obd_count max_unfragmented_pages(struct brw_page *pg, obd_count pages) int count = 1; int offset; - LASSERT (pages > 0); + LASSERT (pages > 0); offset = pg->off & (PAGE_SIZE - 1); - for (;;) { - pages--; - if (pages == 0) /* that's all */ + for (;;) { + pages--; + if (pages == 0) /* that's all */ return count; if (offset + pg->count < PAGE_SIZE) - return count; /* doesn't end on page boundary */ + return count; /* doesn't end on page boundary */ - pg++; + pg++; offset = pg->off & (PAGE_SIZE - 1); - if (offset != 0) /* doesn't start on page boundary */ - return count; + if (offset != 0) /* doesn't start on page boundary */ + return count; - count++; - } + count++; + } } static int osc_brw(int cmd, struct obd_export *exp, struct obdo *oa, @@ -1243,6 +1258,9 @@ static void osc_check_rpcs(struct client_obd *cli); static void osc_exit_cache(struct client_obd *cli, struct osc_async_page *oap, int sent); +/* This maintains the lists of pending pages to read/write for a given object + * (lop). 
This is used by osc_check_rpcs->osc_next_loi() and loi_list_maint() + * to quickly find objects that are ready to send an RPC. */ static int lop_makes_rpc(struct client_obd *cli, struct loi_oap_pages *lop, int cmd) { @@ -1770,6 +1788,8 @@ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi, !list_empty(&(LOI)->loi_read_lop.lop_urgent), \ args) \ +/* This is called by osc_check_rpcs() to find which objects have pages that + * we could be sending. These lists are maintained by lop_makes_rpc(). */ struct lov_oinfo *osc_next_loi(struct client_obd *cli) { ENTRY; @@ -2040,7 +2060,7 @@ static int osc_queue_async_io(struct obd_export *exp, struct lov_stripe_md *lsm, #ifdef HAVE_QUOTA_SUPPORT if ((cmd & OBD_BRW_WRITE) && !(cmd & OBD_BRW_NOQUOTA)){ struct obd_async_page_ops *ops; - struct obdo *oa = NULL; + struct obdo *oa; oa = obdo_alloc(); if (oa == NULL) @@ -2311,6 +2331,7 @@ static int sanosc_brw_read(struct obd_export *exp, struct obdo *oa, struct niobuf_remote *nioptr; struct obd_ioobj *iooptr; int rc, size[3] = {sizeof(*body)}, mapped = 0; + struct obd_import *imp = class_exp2cliimp(exp); int swab; ENTRY; @@ -2324,6 +2345,11 @@ static int sanosc_brw_read(struct obd_export *exp, struct obdo *oa, if (!request) RETURN(-ENOMEM); + /* FIXME bug 249 */ + /* See bug 7198 */ + if (imp->imp_connect_data.ocd_connect_flags & OBD_CONNECT_REQPORTAL) + request->rq_request_portal = OST_IO_PORTAL; + body = lustre_msg_buf(request->rq_reqmsg, 0, sizeof(*body)); iooptr = lustre_msg_buf(request->rq_reqmsg, 1, sizeof(*iooptr)); nioptr = lustre_msg_buf(request->rq_reqmsg, 2, @@ -2441,6 +2467,7 @@ static int sanosc_brw_write(struct obd_export *exp, struct obdo *oa, struct ost_body *body; struct niobuf_remote *nioptr; struct obd_ioobj *iooptr; + struct obd_import *imp = class_exp2cliimp(exp); int rc, size[3] = {sizeof(*body)}, mapped = 0; int swab; ENTRY; @@ -2454,6 +2481,11 @@ static int sanosc_brw_write(struct obd_export *exp, struct obdo *oa, if (!request) 
RETURN(-ENOMEM); + /* FIXME bug 249 */ + /* See bug 7198 */ + if (imp->imp_connect_data.ocd_connect_flags & OBD_CONNECT_REQPORTAL) + request->rq_request_portal = OST_IO_PORTAL; + body = lustre_msg_buf(request->rq_reqmsg, 0, sizeof (*body)); iooptr = lustre_msg_buf(request->rq_reqmsg, 1, sizeof (*iooptr)); nioptr = lustre_msg_buf(request->rq_reqmsg, 2, @@ -3131,6 +3163,33 @@ static int osc_llog_finish(struct obd_device *obd, int count) RETURN(rc); } +static int osc_reconnect(struct obd_export *exp, struct obd_device *obd, + struct obd_uuid *cluuid, + struct obd_connect_data *data) +{ + struct client_obd *cli = &obd->u.cli; + + if (data != NULL && (data->ocd_connect_flags & OBD_CONNECT_GRANT)) { + long lost_grant; + + spin_lock(&cli->cl_loi_list_lock); + data->ocd_grant = cli->cl_avail_grant ?: + 2 * cli->cl_max_pages_per_rpc << PAGE_SHIFT; + lost_grant = cli->cl_lost_grant; + cli->cl_lost_grant = 0; + spin_unlock(&cli->cl_loi_list_lock); + + CDEBUG(D_CACHE, "request ocd_grant: %d cl_avail_grant: %ld " + "cl_lost_grant: %ld\n", data->ocd_grant, + cli->cl_avail_grant, lost_grant); + CDEBUG(D_RPCTRACE, "ocd_connect_flags: "LPX64" ocd_version: %d" + " ocd_grant: %d\n", data->ocd_connect_flags, + data->ocd_version, data->ocd_grant); + } + + RETURN(0); +} + static int osc_disconnect(struct obd_export *exp) { struct obd_device *obd = class_exp2obd(exp); @@ -3168,8 +3227,7 @@ static int osc_import_event(struct obd_device *obd, break; } case IMP_EVENT_INACTIVE: { - if (obd->obd_observer) - rc = obd_notify(obd->obd_observer, obd, 0); + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE); break; } case IMP_EVENT_INVALIDATE: { @@ -3197,8 +3255,20 @@ static int osc_import_event(struct obd_device *obd, oscc->oscc_flags &= ~OSCC_FLAG_NOSPC; spin_unlock(&oscc->oscc_lock); } - if (obd->obd_observer) - rc = obd_notify(obd->obd_observer, obd, 1); + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVE); + break; + } + case IMP_EVENT_OCD: { + struct obd_connect_data *ocd = 
&imp->imp_connect_data; + + if (ocd->ocd_connect_flags & OBD_CONNECT_GRANT) + osc_init_grant(&obd->u.cli, ocd); + + /* See bug 7198 */ + if (ocd->ocd_connect_flags & OBD_CONNECT_REQPORTAL) + imp->imp_client->cli_request_portal =OST_REQUEST_PORTAL; + + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_OCD); break; } default: @@ -3248,13 +3318,20 @@ static int osc_precleanup(struct obd_device *obd, int stage) int rc = 0; ENTRY; - if (stage < 2) - RETURN(0); - - rc = obd_llog_finish(obd, 0); - if (rc != 0) - CERROR("failed to cleanup llogging subsystems\n"); - + switch (stage) { + case OBD_CLEANUP_EARLY: { + struct obd_import *imp; + imp = obd->u.cli.cl_import; + CDEBUG(D_HA, "Deactivating import %s\n", obd->obd_name); + /* ptlrpc_abort_inflight to stop an mds_lov_synchronize */ + ptlrpc_deactivate_import(imp); + break; + } + case OBD_CLEANUP_SELF_EXP: + rc = obd_llog_finish(obd, 0); + if (rc != 0) + CERROR("failed to cleanup llogging subsystems\n"); + } RETURN(rc); } @@ -3291,6 +3368,7 @@ struct obd_ops osc_obd_ops = { .o_add_conn = client_import_add_conn, .o_del_conn = client_import_del_conn, .o_connect = client_connect_import, + .o_reconnect = osc_reconnect, .o_disconnect = osc_disconnect, .o_statfs = osc_statfs, .o_packmd = osc_packmd, @@ -3332,6 +3410,7 @@ struct obd_ops sanosc_obd_ops = { .o_add_conn = client_import_add_conn, .o_del_conn = client_import_del_conn, .o_connect = client_connect_import, + .o_reconnect = osc_reconnect, .o_disconnect = client_disconnect_export, .o_statfs = osc_statfs, .o_packmd = osc_packmd, diff --git a/lustre/ost/ost_handler.c b/lustre/ost/ost_handler.c index 4e52544..e621936 100644 --- a/lustre/ost/ost_handler.c +++ b/lustre/ost/ost_handler.c @@ -51,16 +51,9 @@ #include #include "ost_internal.h" -void oti_init(struct obd_trans_info *oti, struct ptlrpc_request *req) -{ - if (oti == NULL) - return; - memset(oti, 0, sizeof *oti); - - if (req->rq_repmsg && req->rq_reqmsg != 0) - oti->oti_transno = req->rq_repmsg->transno; - oti->oti_thread 
= req->rq_svc_thread; -} +static int ost_num_threads; +CFS_MODULE_PARM(ost_num_threads, "i", int, 0444, + "number of OST service threads to start"); void oti_to_request(struct obd_trans_info *oti, struct ptlrpc_request *req) { @@ -169,18 +162,91 @@ static int ost_create(struct obd_export *exp, struct ptlrpc_request *req, RETURN(0); } +/* + * Helper function for ost_punch(): if asked by client, acquire [size, EOF] + * lock on the file being truncated. + */ +static int ost_punch_lock_get(struct obd_export *exp, struct obdo *oa, + struct lustre_handle *lh) +{ + int flags; + struct ldlm_res_id res_id = { .name = { oa->o_id } }; + ldlm_policy_data_t policy; + __u64 start; + __u64 finis; + + ENTRY; + + LASSERT(!lustre_handle_is_used(lh)); + + if (!(oa->o_valid & OBD_MD_FLFLAGS) || + !(oa->o_flags & OBD_FL_TRUNCLOCK)) + RETURN(0); + + CDEBUG(D_INODE, "OST-side truncate lock.\n"); + + start = oa->o_size; + finis = start + oa->o_blocks; + + /* + * standard truncate optimization: if file body is completely + * destroyed, don't send data back to the server. + */ + flags = (start == 0) ? LDLM_AST_DISCARD_DATA : 0; + + policy.l_extent.start = start & CFS_PAGE_MASK; + + /* + * If ->o_blocks is EOF it means "lock till the end of the + * file". Otherwise, it's size of a hole being punched (in bytes) + */ + if (oa->o_blocks == OBD_OBJECT_EOF || finis < start) + policy.l_extent.end = OBD_OBJECT_EOF; + else + policy.l_extent.end = finis | ~CFS_PAGE_MASK; + + RETURN(ldlm_cli_enqueue(NULL, NULL, exp->exp_obd->obd_namespace, + res_id, LDLM_EXTENT, &policy, LCK_PW, &flags, + ldlm_blocking_ast, ldlm_completion_ast, + ldlm_glimpse_ast, + NULL, NULL, 0, NULL, lh)); +} + +/* + * Helper function for ost_punch(): release lock acquired by + * ost_punch_lock_get(), if any. 
+ */ +static void ost_punch_lock_put(struct obd_export *exp, struct obdo *oa, + struct lustre_handle *lh) +{ + ENTRY; + if (lustre_handle_is_used(lh)) + ldlm_lock_decref(lh, LCK_PW); + EXIT; +} + static int ost_punch(struct obd_export *exp, struct ptlrpc_request *req, struct obd_trans_info *oti) { + struct obdo *oa; struct ost_body *body, *repbody; + struct lustre_handle lh = {0,}; + int rc, size = sizeof(*repbody); + ENTRY; - body = lustre_swab_reqbuf(req, 0, sizeof(*body), lustre_swab_ost_body); + /* + * check that we do support OBD_CONNECT_TRUNCLOCK. + */ + CLASSERT(OST_CONNECT_SUPPORTED & OBD_CONNECT_TRUNCLOCK); + + body = lustre_swab_reqbuf(req, 0, sizeof *body, lustre_swab_ost_body); if (body == NULL) RETURN(-EFAULT); - if ((body->oa.o_valid & (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS)) != + oa = &body->oa; + if ((oa->o_valid & (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS)) != (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS)) RETURN(-EINVAL); @@ -189,10 +255,23 @@ static int ost_punch(struct obd_export *exp, struct ptlrpc_request *req, RETURN(rc); repbody = lustre_msg_buf(req->rq_repmsg, 0, sizeof(*repbody)); - memcpy(&repbody->oa, &body->oa, sizeof(body->oa)); - req->rq_status = obd_punch(exp, &repbody->oa, NULL, repbody->oa.o_size, - repbody->oa.o_blocks, oti); - RETURN(0); + repbody->oa = *oa; + rc = ost_punch_lock_get(exp, oa, &lh); + if (rc == 0) { + if (oa->o_valid & OBD_MD_FLFLAGS && + oa->o_flags == OBD_FL_TRUNCLOCK) + /* + * If OBD_FL_TRUNCLOCK is the only bit set in + * ->o_flags, clear OBD_MD_FLFLAGS to avoid falling + * through filter_setattr() to filter_iocontrol(). 
+ */ + oa->o_valid &= ~OBD_MD_FLFLAGS; + + req->rq_status = obd_punch(exp, oa, NULL, + oa->o_size, oa->o_blocks, oti); + ost_punch_lock_put(exp, oa, &lh); + } + RETURN(rc); } static int ost_sync(struct obd_export *exp, struct ptlrpc_request *req) @@ -463,15 +542,16 @@ static int ost_brw_lock_get(int mode, struct obd_export *exp, ENTRY; LASSERT(mode == LCK_PR || mode == LCK_PW); + LASSERT(!lustre_handle_is_used(lh)); + + if (nrbufs == 0 || !(nb[0].flags & OBD_BRW_SRVLOCK)) + RETURN(0); /* EXPENSIVE ASSERTION */ for (i = 1; i < nrbufs; i ++) LASSERT((nb[0].flags & OBD_BRW_SRVLOCK) == (nb[i].flags & OBD_BRW_SRVLOCK)); - if (nrbufs == 0 || !(nb[0].flags & OBD_BRW_SRVLOCK)) - RETURN(0); - policy.l_extent.start = nb[0].offset & CFS_PAGE_MASK; policy.l_extent.end = (nb[nrbufs - 1].offset + nb[nrbufs - 1].len - 1) | ~CFS_PAGE_MASK; @@ -489,7 +569,9 @@ static void ost_brw_lock_put(int mode, { ENTRY; LASSERT(mode == LCK_PR || mode == LCK_PW); - if (obj->ioo_bufcnt > 0 && niob[0].flags & OBD_BRW_SRVLOCK) + LASSERT((obj->ioo_bufcnt > 0 && (niob[0].flags & OBD_BRW_SRVLOCK)) == + lustre_handle_is_used(lh)); + if (lustre_handle_is_used(lh)) ldlm_lock_decref(lh, mode); EXIT; } @@ -503,7 +585,7 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti) struct obd_ioobj *ioo; struct ost_body *body, *repbody; struct l_wait_info lwi; - struct lustre_handle lockh; + struct lustre_handle lockh = {0}; int size[1] = { sizeof(*body) }; int comms_error = 0; int niocount; @@ -709,7 +791,7 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti) struct obd_ioobj *ioo; struct ost_body *body, *repbody; struct l_wait_info lwi; - struct lustre_handle lockh; + struct lustre_handle lockh = {0}; __u32 *rcs; int size[2] = { sizeof(*body) }; int objcount, niocount, npages; @@ -860,7 +942,7 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti) repbody->oa.o_valid |= OBD_MD_FLCKSUM; } else if ((cksum_counter & 
(-cksum_counter)) == cksum_counter) { - CWARN("Checksum %u from %s: %x OK\n", cksum_counter, + CWARN("Checksum %u from %s: %x OK\n", cksum_counter, libcfs_id2str(req->rq_peer), cksum); } else { cksum_counter++; @@ -1446,7 +1528,6 @@ static void ost_thread_done(struct ptlrpc_thread *thread) ENTRY; LASSERT(thread != NULL); - LASSERT(thread->t_data != NULL); /* * be prepared to handle partially-initialized pools (because this is @@ -1477,7 +1558,7 @@ static int ost_thread_init(struct ptlrpc_thread *thread) LASSERT(thread != NULL); LASSERT(thread->t_data == NULL); - LASSERT(thread->t_id < OST_NUM_THREADS); + LASSERT(thread->t_id < OST_MAX_THREADS); OBD_ALLOC_PTR(tls); if (tls != NULL) { @@ -1519,21 +1600,23 @@ static int ost_setup(struct obd_device *obd, obd_count len, void *buf) sema_init(&ost->ost_health_sem, 1); + if (ost_num_threads < 2) + ost_num_threads = OST_DEF_THREADS; + if (ost_num_threads > OST_MAX_THREADS) + ost_num_threads = OST_MAX_THREADS; + ost->ost_service = ptlrpc_init_svc(OST_NBUFS, OST_BUFSIZE, OST_MAXREQSIZE, OST_MAXREPSIZE, OST_REQUEST_PORTAL, OSC_REPLY_PORTAL, - obd_timeout * 1000, ost_handle, "ost", + obd_timeout * 1000, ost_handle, LUSTRE_OST_NAME, obd->obd_proc_entry, ost_print_req, - OST_NUM_THREADS); + ost_num_threads); if (ost->ost_service == NULL) { CERROR("failed to start service\n"); GOTO(out_lprocfs, rc = -ENOMEM); } - ost->ost_service->srv_init = ost_thread_init; - ost->ost_service->srv_done = ost_thread_done; - ost->ost_service->srv_cpu_affinity = 1; rc = ptlrpc_start_threads(obd, ost->ost_service, "ll_ost"); if (rc) GOTO(out_service, rc = -EINVAL); @@ -1554,8 +1637,31 @@ static int ost_setup(struct obd_device *obd, obd_count len, void *buf) if (rc) GOTO(out_create, rc = -EINVAL); + ost->ost_io_service = + ptlrpc_init_svc(OST_NBUFS, OST_BUFSIZE, OST_MAXREQSIZE, + OST_MAXREPSIZE, OST_IO_PORTAL, + OSC_REPLY_PORTAL, + obd_timeout * 1000, ost_handle, "ost_io", + obd->obd_proc_entry, ost_print_req, + ost_num_threads); + if 
(ost->ost_io_service == NULL) { + CERROR("failed to start OST I/O service\n"); + GOTO(out_create, rc = -ENOMEM); + } + + ost->ost_io_service->srv_init = ost_thread_init; + ost->ost_io_service->srv_done = ost_thread_done; + ost->ost_io_service->srv_cpu_affinity = 1; + rc = ptlrpc_start_threads(obd, ost->ost_io_service, + "ll_ost_io"); + if (rc) + GOTO(out_io, rc = -EINVAL); + RETURN(0); +out_io: + ptlrpc_unregister_service(ost->ost_io_service); + ost->ost_io_service = NULL; out_create: ptlrpc_unregister_service(ost->ost_create_service); ost->ost_create_service = NULL; @@ -1583,6 +1689,7 @@ static int ost_cleanup(struct obd_device *obd) down(&ost->ost_health_sem); ptlrpc_unregister_service(ost->ost_service); ptlrpc_unregister_service(ost->ost_create_service); + ptlrpc_unregister_service(ost->ost_io_service); ost->ost_service = NULL; ost->ost_create_service = NULL; up(&ost->ost_health_sem); @@ -1600,6 +1707,7 @@ static int ost_health_check(struct obd_device *obd) down(&ost->ost_health_sem); rc |= ptlrpc_service_health_check(ost->ost_service); rc |= ptlrpc_service_health_check(ost->ost_create_service); + rc |= ptlrpc_service_health_check(ost->ost_io_service); up(&ost->ost_health_sem); /* diff --git a/lustre/ost/ost_internal.h b/lustre/ost/ost_internal.h index 51ae8c9..3407a96 100644 --- a/lustre/ost/ost_internal.h +++ b/lustre/ost/ost_internal.h @@ -14,16 +14,8 @@ extern void ost_print_req(void *seq_file, struct ptlrpc_request *req); /* * tunables for per-thread page pool (bug 5137) */ -enum { - /* - * pool size in pages - */ - OST_THREAD_POOL_SIZE = PTLRPC_MAX_BRW_PAGES, - /* - * GFP mask used to allocate pages for pool - */ - OST_THREAD_POOL_GFP = GFP_HIGHUSER -}; +#define OST_THREAD_POOL_SIZE PTLRPC_MAX_BRW_PAGES /* pool size in pages */ +#define OST_THREAD_POOL_GFP GFP_HIGHUSER /* GFP mask for pool pages */ struct page; struct niobuf_local; diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index c6dc70b..fd640ce 100644 --- a/lustre/ptlrpc/client.c +++ 
b/lustre/ptlrpc/client.c @@ -1805,7 +1805,7 @@ void ptlrpc_abort_inflight(struct obd_import *imp) } /* Last chance to free reqs left on the replay list, but we - * will still leak reqs that haven't comitted. */ + * will still leak reqs that haven't committed. */ if (imp->imp_replayable) ptlrpc_free_committed(imp); diff --git a/lustre/ptlrpc/events.c b/lustre/ptlrpc/events.c index da6f8ad..501abce 100644 --- a/lustre/ptlrpc/events.c +++ b/lustre/ptlrpc/events.c @@ -392,7 +392,7 @@ int ptlrpc_uuid_to_peer (struct obd_uuid *uuid, } } - CDEBUG(D_WARNING,"%s->%s\n", uuid->uuid, libcfs_id2str(*peer)); + CDEBUG(D_NET,"%s->%s\n", uuid->uuid, libcfs_id2str(*peer)); if (rc != 0) CERROR("No NID found for %s\n", uuid->uuid); return rc; diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index 906f2f5..1213342 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -39,6 +39,7 @@ #include #include #include +#include #include "ptlrpc_internal.h" @@ -130,14 +131,13 @@ int ptlrpc_set_import_discon(struct obd_import *imp) "service will %s.\n", target_len, target_start, libcfs_nid2str(imp->imp_connection->c_peer.nid), - imp->imp_replayable - ? "wait for recovery to complete" - : "fail"); + imp->imp_replayable ? 
+ "wait for recovery to complete" : "fail"); if (obd_dump_on_timeout) libcfs_debug_dumplog(); - CWARN("%s: connection lost to %s@%s\n", + CDEBUG(D_HA, "%s: connection lost to %s@%s\n", imp->imp_obd->obd_name, imp->imp_target_uuid.uuid, imp->imp_connection->c_remote_uuid.uuid); @@ -333,7 +333,7 @@ int ptlrpc_connect_import(struct obd_import *imp, char * new_uuid) imp->imp_conn_cnt++; imp->imp_resend_replay = 0; - if (imp->imp_remote_handle.cookie == 0) { + if (!lustre_handle_is_used(&imp->imp_remote_handle)) { initial_connect = 1; } else { committed_before_reconnect = imp->imp_peer_committed_transno; @@ -354,6 +354,23 @@ int ptlrpc_connect_import(struct obd_import *imp, char * new_uuid) if (rc) GOTO(out, rc); + if (imp->imp_initial_recov_bk && initial_connect && + /* last in list */ + (imp->imp_conn_current->oic_item.next == &imp->imp_conn_list)) { + CDEBUG(D_HA, "Last connection attempt (%d) for %s\n", + imp->imp_conn_cnt, imp->imp_target_uuid.uuid); + /* Don't retry if connect fails */ + rc = 0; + obd_set_info(obd->obd_self_export, + strlen("initial_recov"), "initial_recov", + sizeof(rc), &rc); + } + + rc = obd_reconnect(imp->imp_obd->obd_self_export, obd, + &obd->obd_uuid, &imp->imp_connect_data); + if (rc) + GOTO(out, rc); + request = ptlrpc_prep_req(imp, LUSTRE_OBD_VERSION, imp->imp_connect_op, 4, size, tmp); if (!request) @@ -369,7 +386,7 @@ int ptlrpc_connect_import(struct obd_import *imp, char * new_uuid) request->rq_replen = lustre_msg_size(1, size); request->rq_interpret_reply = ptlrpc_connect_interpret; - LASSERT (sizeof (*aa) <= sizeof (request->rq_async_args)); + CLASSERT(sizeof (*aa) <= sizeof (request->rq_async_args)); aa = (struct ptlrpc_connect_async_args *)&request->rq_async_args; memset(aa, 0, sizeof *aa); @@ -378,7 +395,7 @@ int ptlrpc_connect_import(struct obd_import *imp, char * new_uuid) if (aa->pcaa_initial_connect) { imp->imp_replayable = 1; - /* On an initial connect, we don't know which one of a + /* On an initial connect, we don't know 
which one of a failover server pair is up. Don't wait long. */ request->rq_timeout = max((int)(obd_timeout / 20), 5); } @@ -564,11 +581,35 @@ finish: LASSERT(exp); exp->exp_connect_flags = ocd->ocd_connect_flags; class_export_put(exp); - + if (IMP_CROW_ABLE(imp)) { CDEBUG(D_HA, "connected to CROW capable target: %s\n", imp->imp_target_uuid.uuid); } + + obd_import_event(imp->imp_obd, imp, IMP_EVENT_OCD); + + if ((ocd->ocd_connect_flags & OBD_CONNECT_VERSION) && + (ocd->ocd_version > LUSTRE_VERSION_CODE + + LUSTRE_VERSION_OFFSET_WARN)) { + /* Sigh, some compilers do not like #ifdef in the middle + of macro arguments */ +#ifdef __KERNEL__ + char *action = "upgrading this client"; +#else + char *action = "recompiling this application"; +#endif + + CWARN("Server %s version (%d.%d.%d.%d) is much newer. " + "Consider %s (%s).\n", + imp->imp_target_uuid.uuid, + OBD_OCD_VERSION_MAJOR(ocd->ocd_version), + OBD_OCD_VERSION_MINOR(ocd->ocd_version), + OBD_OCD_VERSION_PATCH(ocd->ocd_version), + OBD_OCD_VERSION_FIX(ocd->ocd_version), + action, LUSTRE_VERSION_STRING); + } + if (imp->imp_conn_current != NULL) { list_del(&imp->imp_conn_current->oic_item); list_add(&imp->imp_conn_current->oic_item, @@ -587,11 +628,38 @@ finish: out: if (rc != 0) { + IMPORT_SET_STATE(imp, LUSTRE_IMP_DISCON); if (aa->pcaa_initial_connect && !imp->imp_initial_recov) { ptlrpc_deactivate_import(imp); } + if (rc == -EPROTO) { + struct obd_connect_data *ocd; + ocd = lustre_swab_repbuf(request, 0, + sizeof *ocd, + lustre_swab_connect); + if (ocd && + (ocd->ocd_connect_flags & OBD_CONNECT_VERSION) && + (ocd->ocd_version != LUSTRE_VERSION_CODE)) { + /* Actually servers are only supposed to refuse + connection from liblustre clients, so we should + never see this from VFS context */ + CERROR("Server %s version (%d.%d.%d.%d) refused" + " connection from this client as too old " + "version (%s). 
Client must be " + "recompiled\n", + imp->imp_target_uuid.uuid, + OBD_OCD_VERSION_MAJOR(ocd->ocd_version), + OBD_OCD_VERSION_MINOR(ocd->ocd_version), + OBD_OCD_VERSION_PATCH(ocd->ocd_version), + OBD_OCD_VERSION_FIX(ocd->ocd_version), + LUSTRE_VERSION_STRING); + IMPORT_SET_STATE(imp, LUSTRE_IMP_CLOSED); + RETURN(-EPROTO); + } + } + ptlrpc_maybe_ping_import_soon(imp); CDEBUG(D_HA, "recovery of %s on %s failed (%d)\n", diff --git a/lustre/ptlrpc/lproc_ptlrpc.c b/lustre/ptlrpc/lproc_ptlrpc.c index b19a309..9952b49 100644 --- a/lustre/ptlrpc/lproc_ptlrpc.c +++ b/lustre/ptlrpc/lproc_ptlrpc.c @@ -82,12 +82,6 @@ struct ll_rpc_opcode { { LDLM_BL_CALLBACK, "ldlm_bl_callback" }, { LDLM_CP_CALLBACK, "ldlm_cp_callback" }, { LDLM_GL_CALLBACK, "ldlm_gl_callback" }, - { PTLBD_QUERY, "ptlbd_query" }, - { PTLBD_READ, "ptlbd_read" }, - { PTLBD_WRITE, "ptlbd_write" }, - { PTLBD_FLUSH, "ptlbd_flush" }, - { PTLBD_CONNECT, "ptlbd_connect" }, - { PTLBD_DISCONNECT, "ptlbd_disconnect" }, { OBD_PING, "obd_ping" }, { OBD_LOG_CANCEL, "llog_origin_handle_cancel"}, }; diff --git a/lustre/ptlrpc/pack_generic.c b/lustre/ptlrpc/pack_generic.c index 6fd3eea..c838de6 100644 --- a/lustre/ptlrpc/pack_generic.c +++ b/lustre/ptlrpc/pack_generic.c @@ -502,6 +502,16 @@ void *lustre_swab_repbuf(struct ptlrpc_request *req, int index, int min_size, void lustre_swab_connect(struct obd_connect_data *ocd) { __swab64s (&ocd->ocd_connect_flags); + __swab32s (&ocd->ocd_version); + __swab32s (&ocd->ocd_grant); + __swab32s (&ocd->ocd_index); + __swab32s (&ocd->ocd_unused); + __swab64s (&ocd->ocd_ibits_known); + CLASSERT(offsetof(typeof(*ocd), padding2) != 0); + CLASSERT(offsetof(typeof(*ocd), padding3) != 0); + CLASSERT(offsetof(typeof(*ocd), padding4) != 0); + CLASSERT(offsetof(typeof(*ocd), padding5) != 0); + CLASSERT(offsetof(typeof(*ocd), padding6) != 0); } void lustre_swab_obdo (struct obdo *o) @@ -539,7 +549,7 @@ void lustre_swab_obd_statfs (struct obd_statfs *os) __swab64s (&os->os_bavail); __swab64s 
(&os->os_files); __swab64s (&os->os_ffree); - /* no need to swap os_fsid */ + /* no need to swab os_fsid */ __swab32s (&os->os_bsize); __swab32s (&os->os_namelen); __swab64s (&os->os_maxbytes); @@ -637,7 +647,7 @@ static void lustre_swab_obd_dqblk (struct obd_dqblk *b) __swab64s (&b->dqb_btime); __swab64s (&b->dqb_itime); __swab32s (&b->dqb_valid); - __swab32s (&b->padding); + CLASSERT(offsetof(typeof(*b), padding) != 0); } void lustre_swab_obd_quotactl (struct obd_quotactl *q) @@ -667,7 +677,7 @@ void lustre_swab_mds_rec_setattr (struct mds_rec_setattr *sa) __swab32s (&sa->sa_uid); __swab32s (&sa->sa_gid); __swab32s (&sa->sa_attr_flags); - __swab32s (&sa->sa_padding); + CLASSERT(offsetof(typeof(*sa), sa_padding) != 0); } void lustre_swab_mds_rec_create (struct mds_rec_create *cr) @@ -683,11 +693,11 @@ void lustre_swab_mds_rec_create (struct mds_rec_create *cr) __swab64s (&cr->cr_time); __swab64s (&cr->cr_rdev); __swab32s (&cr->cr_suppgid); - __swab32s (&cr->cr_padding_1); - __swab32s (&cr->cr_padding_2); - __swab32s (&cr->cr_padding_3); - __swab32s (&cr->cr_padding_4); - __swab32s (&cr->cr_padding_5); + CLASSERT(offsetof(typeof(*cr), cr_padding_1) != 0); + CLASSERT(offsetof(typeof(*cr), cr_padding_2) != 0); + CLASSERT(offsetof(typeof(*cr), cr_padding_3) != 0); + CLASSERT(offsetof(typeof(*cr), cr_padding_4) != 0); + CLASSERT(offsetof(typeof(*cr), cr_padding_5) != 0); } void lustre_swab_mds_rec_link (struct mds_rec_link *lk) @@ -701,10 +711,10 @@ void lustre_swab_mds_rec_link (struct mds_rec_link *lk) lustre_swab_ll_fid (&lk->lk_fid1); lustre_swab_ll_fid (&lk->lk_fid2); __swab64s (&lk->lk_time); - __swab32s (&lk->lk_padding_1); - __swab32s (&lk->lk_padding_2); - __swab32s (&lk->lk_padding_3); - __swab32s (&lk->lk_padding_4); + CLASSERT(offsetof(typeof(*lk), lk_padding_1) != 0); + CLASSERT(offsetof(typeof(*lk), lk_padding_2) != 0); + CLASSERT(offsetof(typeof(*lk), lk_padding_3) != 0); + CLASSERT(offsetof(typeof(*lk), lk_padding_4) != 0); } void 
lustre_swab_mds_rec_unlink (struct mds_rec_unlink *ul) @@ -718,10 +728,10 @@ void lustre_swab_mds_rec_unlink (struct mds_rec_unlink *ul) lustre_swab_ll_fid (&ul->ul_fid1); lustre_swab_ll_fid (&ul->ul_fid2); __swab64s (&ul->ul_time); - __swab32s (&ul->ul_padding_1); - __swab32s (&ul->ul_padding_2); - __swab32s (&ul->ul_padding_3); - __swab32s (&ul->ul_padding_4); + CLASSERT(offsetof(typeof(*ul), ul_padding_1) != 0); + CLASSERT(offsetof(typeof(*ul), ul_padding_2) != 0); + CLASSERT(offsetof(typeof(*ul), ul_padding_3) != 0); + CLASSERT(offsetof(typeof(*ul), ul_padding_4) != 0); } void lustre_swab_mds_rec_rename (struct mds_rec_rename *rn) @@ -735,10 +745,10 @@ void lustre_swab_mds_rec_rename (struct mds_rec_rename *rn) lustre_swab_ll_fid (&rn->rn_fid1); lustre_swab_ll_fid (&rn->rn_fid2); __swab64s (&rn->rn_time); - __swab32s (&rn->rn_padding_1); - __swab32s (&rn->rn_padding_2); - __swab32s (&rn->rn_padding_3); - __swab32s (&rn->rn_padding_4); + CLASSERT(offsetof(typeof(*rn), rn_padding_1) != 0); + CLASSERT(offsetof(typeof(*rn), rn_padding_2) != 0); + CLASSERT(offsetof(typeof(*rn), rn_padding_3) != 0); + CLASSERT(offsetof(typeof(*rn), rn_padding_4) != 0); } void lustre_swab_lov_desc (struct lov_desc *ld) @@ -840,6 +850,7 @@ void lustre_swab_ldlm_intent (struct ldlm_intent *i) void lustre_swab_ldlm_resource_desc (struct ldlm_resource_desc *r) { __swab32s (&r->lr_type); + CLASSERT(offsetof(typeof(*r), lr_padding) != 0); lustre_swab_ldlm_res_id (&r->lr_name); } @@ -854,6 +865,7 @@ void lustre_swab_ldlm_lock_desc (struct ldlm_lock_desc *l) void lustre_swab_ldlm_request (struct ldlm_request *rq) { __swab32s (&rq->lock_flags); + CLASSERT(offsetof(typeof(*rq), lock_padding) != 0); lustre_swab_ldlm_lock_desc (&rq->lock_desc); /* lock_handle1 opaque */ /* lock_handle2 opaque */ @@ -862,35 +874,13 @@ void lustre_swab_ldlm_request (struct ldlm_request *rq) void lustre_swab_ldlm_reply (struct ldlm_reply *r) { __swab32s (&r->lock_flags); + CLASSERT(offsetof(typeof(*r), lock_padding) 
!= 0); lustre_swab_ldlm_lock_desc (&r->lock_desc); /* lock_handle opaque */ __swab64s (&r->lock_policy_res1); __swab64s (&r->lock_policy_res2); } -void lustre_swab_ptlbd_op (struct ptlbd_op *op) -{ - __swab16s (&op->op_cmd); - __swab16s (&op->op_lun); - __swab16s (&op->op_niob_cnt); - /* ignore op__padding */ - __swab32s (&op->op_block_cnt); -} - -void lustre_swab_ptlbd_niob (struct ptlbd_niob *n) -{ - __swab64s (&n->n_xid); - __swab64s (&n->n_block_nr); - __swab32s (&n->n_offset); - __swab32s (&n->n_length); -} - -void lustre_swab_ptlbd_rsp (struct ptlbd_rsp *r) -{ - __swab16s (&r->r_status); - __swab16s (&r->r_error_cnt); -} - /* no one calls this */ int llog_log_swabbed(struct llog_log_hdr *hdr) { @@ -912,8 +902,8 @@ void lustre_swab_qdata(struct qunit_data *d) void lustre_assert_wire_constants(void) { /* Wire protocol assertions generated by 'wirecheck' - * running on Linux mustang 2.6.12-1.1456_FC4smp #1 SMP Thu Sep 22 02:22:14 EDT 2005 i686 i68 - * with gcc version 4.0.1 20050727 (Red Hat 4.0.1-5) */ + * running on Linux schatzie.adilger.int 2.6.12-1.1381_FC3 #1 Fri Oct 21 03:46:55 EDT 2005 i6 + * with gcc version 3.3.4 20040817 (Red Hat Linux 3.3.4-2) */ /* Constants... 
*/ @@ -1045,8 +1035,6 @@ void lustre_assert_wire_constants(void) (long long)MDS_STATUS_CONN); LASSERTF(MDS_STATUS_LOV == 2, " found %lld\n", (long long)MDS_STATUS_LOV); - LASSERTF(MDS_OPEN_HAS_EA == 1073741824, " found %lld\n", - (long long)MDS_OPEN_HAS_EA); LASSERTF(LDLM_ENQUEUE == 101, " found %lld\n", (long long)LDLM_ENQUEUE); LASSERTF(LDLM_CONVERT == 102, " found %lld\n", @@ -1057,6 +1045,8 @@ void lustre_assert_wire_constants(void) (long long)LDLM_BL_CALLBACK); LASSERTF(LDLM_CP_CALLBACK == 105, " found %lld\n", (long long)LDLM_CP_CALLBACK); + LASSERTF(LDLM_GL_CALLBACK == 106, " found %lld\n", + (long long)LDLM_GL_CALLBACK); LASSERTF(LDLM_LAST_OPC == 107, " found %lld\n", (long long)LDLM_LAST_OPC); LASSERTF(LCK_EX == 1, " found %lld\n", @@ -1071,26 +1061,10 @@ void lustre_assert_wire_constants(void) (long long)LCK_CR); LASSERTF(LCK_NL == 32, " found %lld\n", (long long)LCK_NL); - LASSERTF(PTLBD_QUERY == 200, " found %lld\n", - (long long)PTLBD_QUERY); - LASSERTF(PTLBD_READ == 201, " found %lld\n", - (long long)PTLBD_READ); - LASSERTF(PTLBD_WRITE == 202, " found %lld\n", - (long long)PTLBD_WRITE); - LASSERTF(PTLBD_FLUSH == 203, " found %lld\n", - (long long)PTLBD_FLUSH); - LASSERTF(PTLBD_CONNECT == 204, " found %lld\n", - (long long)PTLBD_CONNECT); - LASSERTF(PTLBD_DISCONNECT == 205, " found %lld\n", - (long long)PTLBD_DISCONNECT); - LASSERTF(PTLBD_LAST_OPC == 206, " found %lld\n", - (long long)PTLBD_LAST_OPC); - LASSERTF(MGMT_CONNECT == 250, " found %lld\n", - (long long)MGMT_CONNECT); - LASSERTF(MGMT_DISCONNECT == 251, " found %lld\n", - (long long)MGMT_DISCONNECT); - LASSERTF(MGMT_EXCEPTION == 252, " found %lld\n", - (long long)MGMT_EXCEPTION); + LASSERTF(LCK_GROUP == 64, " found %lld\n", + (long long)LCK_GROUP); + LASSERTF(LCK_MAXMODE == 65, " found %lld\n", + (long long)LCK_MAXMODE); LASSERTF(OBD_PING == 400, " found %lld\n", (long long)OBD_PING); LASSERTF(OBD_LOG_CANCEL == 401, " found %lld\n", @@ -1103,6 +1077,28 @@ void lustre_assert_wire_constants(void) 
(long long)QUOTA_DQACQ); LASSERTF(QUOTA_DQREL == 602, " found %lld\n", (long long)QUOTA_DQREL); + LASSERTF(OBD_CONNECT_RDONLY == 1, " found %lld\n", + (long long)OBD_CONNECT_RDONLY); + LASSERTF(OBD_CONNECT_INDEX == 2, " found %lld\n", + (long long)OBD_CONNECT_INDEX); + LASSERTF(OBD_CONNECT_GRANT == 8, " found %lld\n", + (long long)OBD_CONNECT_GRANT); + LASSERTF(OBD_CONNECT_SRVLOCK == 16, " found %lld\n", + (long long)OBD_CONNECT_SRVLOCK); + LASSERTF(OBD_CONNECT_VERSION == 32, " found %lld\n", + (long long)OBD_CONNECT_VERSION); + LASSERTF(OBD_CONNECT_REQPORTAL == 64, " found %lld\n", + (long long)OBD_CONNECT_REQPORTAL); + LASSERTF(OBD_CONNECT_ACL == 128, " found %lld\n", + (long long)OBD_CONNECT_ACL); + LASSERTF(OBD_CONNECT_XATTR == 256, " found %lld\n", + (long long)OBD_CONNECT_XATTR); + LASSERTF(OBD_CONNECT_CROW == 512, " found %lld\n", + (long long)OBD_CONNECT_CROW); + LASSERTF(OBD_CONNECT_TRUNCLOCK == 1024, " found %lld\n", + (long long)OBD_CONNECT_TRUNCLOCK); + LASSERTF(OBD_CONNECT_TRANSNO == 2048, " found %lld\n", + (long long)OBD_CONNECT_TRANSNO); /* Sizes and Offsets */ @@ -1249,6 +1245,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct obdo, o_mds)); LASSERTF((int)sizeof(((struct obdo *)0)->o_mds) == 4, " found %lld\n", (long long)(int)sizeof(((struct obdo *)0)->o_mds)); + LASSERTF((int)offsetof(struct obdo, o_padding_1) == 124, " found %lld\n", + (long long)(int)offsetof(struct obdo, o_padding_1)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_1) == 4, " found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_padding_1)); LASSERTF((int)offsetof(struct obdo, o_inline) == 128, " found %lld\n", (long long)(int)offsetof(struct obdo, o_inline)); LASSERTF((int)sizeof(((struct obdo *)0)->o_inline) == 80, " found %lld\n", @@ -1541,6 +1541,18 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct obd_dqblk, dqb_valid)); LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_valid) == 4, " found %lld\n", (long 
long)(int)sizeof(((struct obd_dqblk *)0)->dqb_valid)); + LASSERTF((int)offsetof(struct obd_dqblk, padding) == 68, " found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, padding)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->padding) == 4, " found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->padding)); + LASSERTF(Q_QUOTACHECK == 0x800100," found %lld\n", + (long long)Q_QUOTACHECK); + LASSERTF(Q_INITQUOTA == 0x800101," found %lld\n", + (long long)Q_INITQUOTA); + LASSERTF(Q_GETOINFO == 0x800102," found %lld\n", + (long long)Q_GETOINFO); + LASSERTF(Q_GETOQUOTA == 0x800103," found %lld\n", + (long long)Q_GETOQUOTA); /* Checks for struct niobuf_remote */ LASSERTF((int)sizeof(struct niobuf_remote) == 16, " found %lld\n", @@ -1985,6 +1997,26 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct lov_desc, ld_default_stripe_offset)); LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset) == 8, " found %lld\n", (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset)); + LASSERTF((int)offsetof(struct lov_desc, ld_default_stripe_offset) == 24, " found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_default_stripe_offset)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset) == 8, " found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset)); + LASSERTF((int)offsetof(struct lov_desc, ld_padding_1) == 32, " found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_padding_1)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_1) == 4, " found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_1)); + LASSERTF((int)offsetof(struct lov_desc, ld_padding_2) == 36, " found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_padding_2)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_2) == 4, " found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_2)); + LASSERTF((int)offsetof(struct 
lov_desc, ld_padding_3) == 40, " found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_padding_3)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_3) == 4, " found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_3)); + LASSERTF((int)offsetof(struct lov_desc, ld_padding_4) == 44, " found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_padding_4)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_4) == 4, " found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_4)); LASSERTF((int)offsetof(struct lov_desc, ld_uuid) == 48, " found %lld\n", (long long)(int)offsetof(struct lov_desc, ld_uuid)); LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_uuid) == 40, " found %lld\n", @@ -2049,6 +2081,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct ldlm_resource_desc, lr_type)); LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_type) == 4, " found %lld\n", (long long)(int)sizeof(((struct ldlm_resource_desc *)0)->lr_type)); + LASSERTF((int)offsetof(struct ldlm_resource_desc, lr_padding) == 4, " found %lld\n", + (long long)(int)offsetof(struct ldlm_resource_desc, lr_padding)); + LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_padding) == 4, " found %lld\n", + (long long)(int)sizeof(((struct ldlm_resource_desc *)0)->lr_padding)); LASSERTF((int)offsetof(struct ldlm_resource_desc, lr_name) == 8, " found %lld\n", (long long)(int)offsetof(struct ldlm_resource_desc, lr_name)); LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_name) == 32, " found %lld\n", @@ -2081,6 +2117,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct ldlm_request, lock_flags)); LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_flags) == 4, " found %lld\n", (long long)(int)sizeof(((struct ldlm_request *)0)->lock_flags)); + LASSERTF((int)offsetof(struct ldlm_request, lock_padding) == 4, " found %lld\n", + (long long)(int)offsetof(struct ldlm_request, 
lock_padding)); + LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_padding) == 4, " found %lld\n", + (long long)(int)sizeof(((struct ldlm_request *)0)->lock_padding)); LASSERTF((int)offsetof(struct ldlm_request, lock_desc) == 8, " found %lld\n", (long long)(int)offsetof(struct ldlm_request, lock_desc)); LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_desc) == 80, " found %lld\n", @@ -2101,6 +2141,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct ldlm_reply, lock_flags)); LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_flags) == 4, " found %lld\n", (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_flags)); + LASSERTF((int)offsetof(struct ldlm_request, lock_padding) == 4, " found %lld\n", + (long long)(int)offsetof(struct ldlm_request, lock_padding)); + LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_padding) == 4, " found %lld\n", + (long long)(int)sizeof(((struct ldlm_request *)0)->lock_padding)); LASSERTF((int)offsetof(struct ldlm_request, lock_desc) == 8, " found %lld\n", (long long)(int)offsetof(struct ldlm_request, lock_desc)); LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_desc) == 80, " found %lld\n", @@ -2142,62 +2186,6 @@ void lustre_assert_wire_constants(void) LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_blocks) == 8, " found %lld\n", (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_blocks)); - /* Checks for struct ptlbd_op */ - LASSERTF((int)sizeof(struct ptlbd_op) == 12, " found %lld\n", - (long long)(int)sizeof(struct ptlbd_op)); - LASSERTF((int)offsetof(struct ptlbd_op, op_cmd) == 0, " found %lld\n", - (long long)(int)offsetof(struct ptlbd_op, op_cmd)); - LASSERTF((int)sizeof(((struct ptlbd_op *)0)->op_cmd) == 2, " found %lld\n", - (long long)(int)sizeof(((struct ptlbd_op *)0)->op_cmd)); - LASSERTF((int)offsetof(struct ptlbd_op, op_lun) == 2, " found %lld\n", - (long long)(int)offsetof(struct ptlbd_op, op_lun)); - LASSERTF((int)sizeof(((struct ptlbd_op *)0)->op_lun) == 2, " found %lld\n", 
- (long long)(int)sizeof(((struct ptlbd_op *)0)->op_lun)); - LASSERTF((int)offsetof(struct ptlbd_op, op_niob_cnt) == 4, " found %lld\n", - (long long)(int)offsetof(struct ptlbd_op, op_niob_cnt)); - LASSERTF((int)sizeof(((struct ptlbd_op *)0)->op_niob_cnt) == 2, " found %lld\n", - (long long)(int)sizeof(((struct ptlbd_op *)0)->op_niob_cnt)); - LASSERTF((int)offsetof(struct ptlbd_op, op__padding) == 6, " found %lld\n", - (long long)(int)offsetof(struct ptlbd_op, op__padding)); - LASSERTF((int)sizeof(((struct ptlbd_op *)0)->op__padding) == 2, " found %lld\n", - (long long)(int)sizeof(((struct ptlbd_op *)0)->op__padding)); - LASSERTF((int)offsetof(struct ptlbd_op, op_block_cnt) == 8, " found %lld\n", - (long long)(int)offsetof(struct ptlbd_op, op_block_cnt)); - LASSERTF((int)sizeof(((struct ptlbd_op *)0)->op_block_cnt) == 4, " found %lld\n", - (long long)(int)sizeof(((struct ptlbd_op *)0)->op_block_cnt)); - - /* Checks for struct ptlbd_niob */ - LASSERTF((int)sizeof(struct ptlbd_niob) == 24, " found %lld\n", - (long long)(int)sizeof(struct ptlbd_niob)); - LASSERTF((int)offsetof(struct ptlbd_niob, n_xid) == 0, " found %lld\n", - (long long)(int)offsetof(struct ptlbd_niob, n_xid)); - LASSERTF((int)sizeof(((struct ptlbd_niob *)0)->n_xid) == 8, " found %lld\n", - (long long)(int)sizeof(((struct ptlbd_niob *)0)->n_xid)); - LASSERTF((int)offsetof(struct ptlbd_niob, n_block_nr) == 8, " found %lld\n", - (long long)(int)offsetof(struct ptlbd_niob, n_block_nr)); - LASSERTF((int)sizeof(((struct ptlbd_niob *)0)->n_block_nr) == 8, " found %lld\n", - (long long)(int)sizeof(((struct ptlbd_niob *)0)->n_block_nr)); - LASSERTF((int)offsetof(struct ptlbd_niob, n_offset) == 16, " found %lld\n", - (long long)(int)offsetof(struct ptlbd_niob, n_offset)); - LASSERTF((int)sizeof(((struct ptlbd_niob *)0)->n_offset) == 4, " found %lld\n", - (long long)(int)sizeof(((struct ptlbd_niob *)0)->n_offset)); - LASSERTF((int)offsetof(struct ptlbd_niob, n_length) == 20, " found %lld\n", - (long 
long)(int)offsetof(struct ptlbd_niob, n_length)); - LASSERTF((int)sizeof(((struct ptlbd_niob *)0)->n_length) == 4, " found %lld\n", - (long long)(int)sizeof(((struct ptlbd_niob *)0)->n_length)); - - /* Checks for struct ptlbd_rsp */ - LASSERTF((int)sizeof(struct ptlbd_rsp) == 4, " found %lld\n", - (long long)(int)sizeof(struct ptlbd_rsp)); - LASSERTF((int)offsetof(struct ptlbd_rsp, r_status) == 0, " found %lld\n", - (long long)(int)offsetof(struct ptlbd_rsp, r_status)); - LASSERTF((int)sizeof(((struct ptlbd_rsp *)0)->r_status) == 2, " found %lld\n", - (long long)(int)sizeof(((struct ptlbd_rsp *)0)->r_status)); - LASSERTF((int)offsetof(struct ptlbd_rsp, r_error_cnt) == 2, " found %lld\n", - (long long)(int)offsetof(struct ptlbd_rsp, r_error_cnt)); - LASSERTF((int)sizeof(((struct ptlbd_rsp *)0)->r_error_cnt) == 2, " found %lld\n", - (long long)(int)sizeof(((struct ptlbd_rsp *)0)->r_error_cnt)); - /* Checks for struct llog_logid */ LASSERTF((int)sizeof(struct llog_logid) == 20, " found %lld\n", (long long)(int)sizeof(struct llog_logid)); @@ -2223,6 +2211,8 @@ void lustre_assert_wire_constants(void) (long long)MDS_SETATTR_REC); LASSERTF(OBD_CFG_REC == 274857984, " found %lld\n", (long long)OBD_CFG_REC); + LASSERTF(PTL_CFG_REC == 274923520, " found %lld\n", + (long long)PTL_CFG_REC); LASSERTF(LLOG_GEN_REC == 274989056, " found %lld\n", (long long)LLOG_GEN_REC); LASSERTF(LLOG_HDR_MAGIC == 275010873, " found %lld\n", @@ -2237,6 +2227,18 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct llog_catid, lci_logid)); LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_logid) == 20, " found %lld\n", (long long)(int)sizeof(((struct llog_catid *)0)->lci_logid)); + LASSERTF((int)offsetof(struct llog_catid, lci_padding1) == 20, " found %lld\n", + (long long)(int)offsetof(struct llog_catid, lci_padding1)); + LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding1) == 4, " found %lld\n", + (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding1)); + 
LASSERTF((int)offsetof(struct llog_catid, lci_padding2) == 24, " found %lld\n", + (long long)(int)offsetof(struct llog_catid, lci_padding2)); + LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding2) == 4, " found %lld\n", + (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding2)); + LASSERTF((int)offsetof(struct llog_catid, lci_padding3) == 28, " found %lld\n", + (long long)(int)offsetof(struct llog_catid, lci_padding3)); + LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding3) == 4, " found %lld\n", + (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding3)); /* Checks for struct llog_rec_hdr */ LASSERTF((int)sizeof(struct llog_rec_hdr) == 16, " found %lld\n", @@ -2253,6 +2255,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct llog_rec_hdr, lrh_type)); LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_type) == 4, " found %lld\n", (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_type)); + LASSERTF((int)offsetof(struct llog_rec_hdr, padding) == 12, " found %lld\n", + (long long)(int)offsetof(struct llog_rec_hdr, padding)); + LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->padding) == 4, " found %lld\n", + (long long)(int)sizeof(((struct llog_rec_hdr *)0)->padding)); /* Checks for struct llog_rec_tail */ LASSERTF((int)sizeof(struct llog_rec_tail) == 8, " found %lld\n", @@ -2277,6 +2283,26 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct llog_logid_rec, lid_id)); LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_id) == 20, " found %lld\n", (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_id)); + LASSERTF((int)offsetof(struct llog_logid_rec, padding1) == 36, " found %lld\n", + (long long)(int)offsetof(struct llog_logid_rec, padding1)); + LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->padding1) == 4, " found %lld\n", + (long long)(int)sizeof(((struct llog_logid_rec *)0)->padding1)); + LASSERTF((int)offsetof(struct llog_logid_rec, padding2) == 40, " found %lld\n", + 
(long long)(int)offsetof(struct llog_logid_rec, padding2)); + LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->padding2) == 4, " found %lld\n", + (long long)(int)sizeof(((struct llog_logid_rec *)0)->padding2)); + LASSERTF((int)offsetof(struct llog_logid_rec, padding3) == 44, " found %lld\n", + (long long)(int)offsetof(struct llog_logid_rec, padding3)); + LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->padding3) == 4, " found %lld\n", + (long long)(int)sizeof(((struct llog_logid_rec *)0)->padding3)); + LASSERTF((int)offsetof(struct llog_logid_rec, padding4) == 48, " found %lld\n", + (long long)(int)offsetof(struct llog_logid_rec, padding4)); + LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->padding4) == 4, " found %lld\n", + (long long)(int)sizeof(((struct llog_logid_rec *)0)->padding4)); + LASSERTF((int)offsetof(struct llog_logid_rec, padding5) == 52, " found %lld\n", + (long long)(int)offsetof(struct llog_logid_rec, padding5)); + LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->padding5) == 4, " found %lld\n", + (long long)(int)sizeof(((struct llog_logid_rec *)0)->padding5)); LASSERTF((int)offsetof(struct llog_logid_rec, lid_tail) == 56, " found %lld\n", (long long)(int)offsetof(struct llog_logid_rec, lid_tail)); LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_tail) == 8, " found %lld\n", @@ -2301,6 +2327,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct llog_create_rec, lcr_ogen)); LASSERTF((int)sizeof(((struct llog_create_rec *)0)->lcr_ogen) == 4, " found %lld\n", (long long)(int)sizeof(((struct llog_create_rec *)0)->lcr_ogen)); + LASSERTF((int)offsetof(struct llog_create_rec, padding) == 44, " found %lld\n", + (long long)(int)offsetof(struct llog_create_rec, padding)); + LASSERTF((int)sizeof(((struct llog_create_rec *)0)->padding) == 4, " found %lld\n", + (long long)(int)sizeof(((struct llog_create_rec *)0)->padding)); /* Checks for struct llog_orphan_rec */ LASSERTF((int)sizeof(struct llog_orphan_rec) == 40, " found 
%lld\n", @@ -2317,6 +2347,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct llog_orphan_rec, lor_ogen)); LASSERTF((int)sizeof(((struct llog_orphan_rec *)0)->lor_ogen) == 4, " found %lld\n", (long long)(int)sizeof(((struct llog_orphan_rec *)0)->lor_ogen)); + LASSERTF((int)offsetof(struct llog_orphan_rec, padding) == 28, " found %lld\n", + (long long)(int)offsetof(struct llog_orphan_rec, padding)); + LASSERTF((int)sizeof(((struct llog_orphan_rec *)0)->padding) == 4, " found %lld\n", + (long long)(int)sizeof(((struct llog_orphan_rec *)0)->padding)); LASSERTF((int)offsetof(struct llog_orphan_rec, lor_tail) == 32, " found %lld\n", (long long)(int)offsetof(struct llog_orphan_rec, lor_tail)); LASSERTF((int)sizeof(((struct llog_orphan_rec *)0)->lor_tail) == 8, " found %lld\n", @@ -2337,11 +2371,47 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct llog_unlink_rec, lur_ogen)); LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_ogen) == 4, " found %lld\n", (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_ogen)); + LASSERTF((int)offsetof(struct llog_unlink_rec, padding) == 28, " found %lld\n", + (long long)(int)offsetof(struct llog_unlink_rec, padding)); + LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->padding) == 4, " found %lld\n", + (long long)(int)sizeof(((struct llog_unlink_rec *)0)->padding)); LASSERTF((int)offsetof(struct llog_unlink_rec, lur_tail) == 32, " found %lld\n", (long long)(int)offsetof(struct llog_unlink_rec, lur_tail)); LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_tail) == 8, " found %lld\n", (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_tail)); + /* Checks for struct llog_setattr_rec */ + LASSERTF((int)sizeof(struct llog_setattr_rec) == 48, " found %lld\n", + (long long)(int)sizeof(struct llog_setattr_rec)); + LASSERTF((int)offsetof(struct llog_setattr_rec, lsr_hdr) == 0, " found %lld\n", + (long long)(int)offsetof(struct llog_setattr_rec, lsr_hdr)); + 
LASSERTF((int)sizeof(((struct llog_setattr_rec *)0)->lsr_hdr) == 16, " found %lld\n", + (long long)(int)sizeof(((struct llog_setattr_rec *)0)->lsr_hdr)); + LASSERTF((int)offsetof(struct llog_setattr_rec, lsr_oid) == 16, " found %lld\n", + (long long)(int)offsetof(struct llog_setattr_rec, lsr_oid)); + LASSERTF((int)sizeof(((struct llog_setattr_rec *)0)->lsr_oid) == 8, " found %lld\n", + (long long)(int)sizeof(((struct llog_setattr_rec *)0)->lsr_oid)); + LASSERTF((int)offsetof(struct llog_setattr_rec, lsr_ogen) == 24, " found %lld\n", + (long long)(int)offsetof(struct llog_setattr_rec, lsr_ogen)); + LASSERTF((int)sizeof(((struct llog_setattr_rec *)0)->lsr_ogen) == 4, " found %lld\n", + (long long)(int)sizeof(((struct llog_setattr_rec *)0)->lsr_ogen)); + LASSERTF((int)offsetof(struct llog_setattr_rec, lsr_uid) == 28, " found %lld\n", + (long long)(int)offsetof(struct llog_setattr_rec, lsr_uid)); + LASSERTF((int)sizeof(((struct llog_setattr_rec *)0)->lsr_uid) == 4, " found %lld\n", + (long long)(int)sizeof(((struct llog_setattr_rec *)0)->lsr_uid)); + LASSERTF((int)offsetof(struct llog_setattr_rec, lsr_gid) == 32, " found %lld\n", + (long long)(int)offsetof(struct llog_setattr_rec, lsr_gid)); + LASSERTF((int)sizeof(((struct llog_setattr_rec *)0)->lsr_gid) == 4, " found %lld\n", + (long long)(int)sizeof(((struct llog_setattr_rec *)0)->lsr_gid)); + LASSERTF((int)offsetof(struct llog_setattr_rec, padding) == 36, " found %lld\n", + (long long)(int)offsetof(struct llog_setattr_rec, padding)); + LASSERTF((int)sizeof(((struct llog_setattr_rec *)0)->padding) == 4, " found %lld\n", + (long long)(int)sizeof(((struct llog_setattr_rec *)0)->padding)); + LASSERTF((int)offsetof(struct llog_setattr_rec, lsr_tail) == 40, " found %lld\n", + (long long)(int)offsetof(struct llog_setattr_rec, lsr_tail)); + LASSERTF((int)sizeof(((struct llog_setattr_rec *)0)->lsr_tail) == 8, " found %lld\n", + (long long)(int)sizeof(((struct llog_setattr_rec *)0)->lsr_tail)); + /* Checks for struct 
llog_size_change_rec */ LASSERTF((int)sizeof(struct llog_size_change_rec) == 48, " found %lld\n", (long long)(int)sizeof(struct llog_size_change_rec)); @@ -2357,6 +2427,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct llog_size_change_rec, lsc_io_epoch)); LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_io_epoch) == 4, " found %lld\n", (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_io_epoch)); + LASSERTF((int)offsetof(struct llog_size_change_rec, padding) == 36, " found %lld\n", + (long long)(int)offsetof(struct llog_size_change_rec, padding)); + LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->padding) == 4, " found %lld\n", + (long long)(int)sizeof(((struct llog_size_change_rec *)0)->padding)); LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_tail) == 40, " found %lld\n", (long long)(int)offsetof(struct llog_size_change_rec, lsc_tail)); LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_tail) == 8, " found %lld\n", @@ -2453,6 +2527,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct llog_cookie, lgc_index)); LASSERTF((int)sizeof(((struct llog_cookie *)0)->lgc_index) == 4, " found %lld\n", (long long)(int)sizeof(((struct llog_cookie *)0)->lgc_index)); + LASSERTF((int)offsetof(struct llog_cookie, lgc_padding) == 28, " found %lld\n", + (long long)(int)offsetof(struct llog_cookie, lgc_padding)); + LASSERTF((int)sizeof(((struct llog_cookie *)0)->lgc_padding) == 4, " found %lld\n", + (long long)(int)sizeof(((struct llog_cookie *)0)->lgc_padding)); /* Checks for struct llogd_body */ LASSERTF((int)sizeof(struct llogd_body) == 48, " found %lld\n", diff --git a/lustre/ptlrpc/ptlrpc_internal.h b/lustre/ptlrpc/ptlrpc_internal.h index 364fbca..0ce3872 100644 --- a/lustre/ptlrpc/ptlrpc_internal.h +++ b/lustre/ptlrpc/ptlrpc_internal.h @@ -75,16 +75,9 @@ static inline int opcode_offset(__u32 opc) { return (opc - LDLM_FIRST_OPC + (MDS_LAST_OPC - MDS_FIRST_OPC) + 
(OST_LAST_OPC - OST_FIRST_OPC)); - } else if (opc < PTLBD_LAST_OPC) { - /* Portals Block Device */ - return (opc - PTLBD_FIRST_OPC + - (LDLM_LAST_OPC - LDLM_FIRST_OPC) + - (MDS_LAST_OPC - MDS_FIRST_OPC) + - (OST_LAST_OPC - OST_FIRST_OPC)); } else if (opc < OBD_LAST_OPC) { /* OBD Ping */ return (opc - OBD_FIRST_OPC + - (PTLBD_LAST_OPC - PTLBD_FIRST_OPC) + (LDLM_LAST_OPC - LDLM_FIRST_OPC) + (MDS_LAST_OPC - MDS_FIRST_OPC) + (OST_LAST_OPC - OST_FIRST_OPC)); @@ -94,8 +87,7 @@ static inline int opcode_offset(__u32 opc) { } } -#define LUSTRE_MAX_OPCODES ((PTLBD_LAST_OPC - PTLBD_FIRST_OPC) + \ - (LDLM_LAST_OPC - LDLM_FIRST_OPC) + \ +#define LUSTRE_MAX_OPCODES ((LDLM_LAST_OPC - LDLM_FIRST_OPC) + \ (MDS_LAST_OPC - MDS_FIRST_OPC) + \ (OST_LAST_OPC - OST_FIRST_OPC) + \ (OBD_LAST_OPC - OBD_FIRST_OPC)) diff --git a/lustre/ptlrpc/ptlrpc_module.c b/lustre/ptlrpc/ptlrpc_module.c index 3c059c4..ee1c002 100644 --- a/lustre/ptlrpc/ptlrpc_module.c +++ b/lustre/ptlrpc/ptlrpc_module.c @@ -184,9 +184,6 @@ EXPORT_SYMBOL(lustre_swab_ldlm_resource_desc); EXPORT_SYMBOL(lustre_swab_ldlm_lock_desc); EXPORT_SYMBOL(lustre_swab_ldlm_request); EXPORT_SYMBOL(lustre_swab_ldlm_reply); -EXPORT_SYMBOL(lustre_swab_ptlbd_op); -EXPORT_SYMBOL(lustre_swab_ptlbd_niob); -EXPORT_SYMBOL(lustre_swab_ptlbd_rsp); EXPORT_SYMBOL(lustre_swab_qdata); /* recover.c */ diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c index af61984..b3492e8 100644 --- a/lustre/ptlrpc/service.c +++ b/lustre/ptlrpc/service.c @@ -570,17 +570,22 @@ put_conn: timediff = timeval_sub(&work_end, &work_start); if (timediff / 1000000 > (long)obd_timeout) - CERROR("request "LPU64" opc %u from %s processed in %lds\n", + CERROR("request "LPU64" opc %u from %s processed in %lds " + "trans "LPU64" rc %d/%d\n", request->rq_xid, request->rq_reqmsg->opc, libcfs_id2str(request->rq_peer), timeval_sub(&work_end, - &request->rq_arrival_time) / 1000000); + &request->rq_arrival_time) / 1000000, + request->rq_transno, request->rq_status, + 
request->rq_repmsg ? request->rq_repmsg->status : -999); else - CDEBUG(D_HA,"request "LPU64" opc %u from %s processed in %ldus" - " (%ldus total)\n", request->rq_xid, - request->rq_reqmsg->opc, + CDEBUG(D_HA, "request "LPU64" opc %u from %s processed in " + "%ldus (%ldus total) trans "LPU64" rc %d/%d\n", + request->rq_xid, request->rq_reqmsg->opc, libcfs_id2str(request->rq_peer), timediff, - timeval_sub(&work_end, &request->rq_arrival_time)); + timeval_sub(&work_end, &request->rq_arrival_time), + request->rq_transno, request->rq_status, + request->rq_repmsg ? request->rq_repmsg->status : -999); if (svc->srv_stats != NULL) { int opc = opcode_offset(request->rq_reqmsg->opc); @@ -625,7 +630,7 @@ ptlrpc_server_handle_reply (struct ptlrpc_service *svc) list_del_init (&rs->rs_list); - /* Disengage from notifiers carefully (lock ordering!) */ + /* Disengage from notifiers carefully (lock order - irqrestore below!)*/ spin_unlock(&svc->srv_lock); spin_lock (&obd->obd_uncommitted_replies_lock); @@ -970,7 +975,7 @@ void ptlrpc_stop_all_threads(struct ptlrpc_service *svc) spin_unlock_irqrestore(&svc->srv_lock, flags); } -/* @base_name should be 12 characters or less - 3 will be added on */ +/* @base_name should be 11 characters or less - 3 will be added on */ int ptlrpc_start_threads(struct obd_device *dev, struct ptlrpc_service *svc, char *base_name) { diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh index 0674610..75c20ce 100644 --- a/lustre/tests/conf-sanity.sh +++ b/lustre/tests/conf-sanity.sh @@ -11,11 +11,9 @@ set -e ONLY=${ONLY:-"$*"} # bug number for skipped test: -ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-""} +ALWAYS_EXCEPT=" $CONF_SANITY_EXCEPT" # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT! 
-[ "$ALWAYS_EXCEPT$EXCEPT" ] && echo "Skipping tests: $ALWAYS_EXCEPT $EXCEPT" - SRCDIR=`dirname $0` PATH=$PWD/$SRCDIR:$SRCDIR:$SRCDIR/../utils:$PATH @@ -746,4 +744,15 @@ test_18() { } run_test 18 "check lconf creates large journals" +test_19() { + # first format the ost/mdt + start_ost + start_mds + stop_mds + stop_ost + start mds $MDSLCONFARGS || return 1 + stop mds --force || return 2 +} +run_test 19 "start/stop MDS without OSTs" + equals_msg "Done" diff --git a/lustre/tests/insanity.sh b/lustre/tests/insanity.sh index 4e89693..03a8f7d 100755 --- a/lustre/tests/insanity.sh +++ b/lustre/tests/insanity.sh @@ -10,7 +10,7 @@ init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/insanity-local.sh} -ALWAYS_EXCEPT="10" +ALWAYS_EXCEPT="10 $INSANITY_EXCEPT" SETUP=${SETUP:-"setup"} CLEANUP=${CLEANUP:-"cleanup"} diff --git a/lustre/tests/llmount.sh b/lustre/tests/llmount.sh index 24e5521..a21d8e1 100755 --- a/lustre/tests/llmount.sh +++ b/lustre/tests/llmount.sh @@ -27,6 +27,7 @@ fi [ "$NODE" ] && node_opt="--node $NODE" [ "$DEBUG" ] && debug_opt="--ptldebug=$DEBUG" +[ "$PTLDEBUG" ] && debug_opt="--ptldebug=$PTLDEBUG" ${LCONF} $NOMOD $portals_opt $lustre_opt $debug_opt $node_opt ${REFORMAT:---reformat} $@ \ $conf_opt || exit 2 diff --git a/lustre/tests/llrmount.sh b/lustre/tests/llrmount.sh index 5e49d89..ba38ba0 100755 --- a/lustre/tests/llrmount.sh +++ b/lustre/tests/llrmount.sh @@ -27,10 +27,11 @@ else fi [ "$NODE" ] && node_opt="--node $NODE" +[ "$DEBUG" ] && portals_opt="$portals_opt --ptldebug=$DEBUG" +[ "$PTLDEBUG" ] && portals_opt="$portals_opt --ptldebug=$PTLDEBUG" ${LCONF} $NOMOD $portals_opt $lustre_opt $node_opt $@ $conf_opt || exit 2 -[ $DEBUG ] && sysctl -w lnet.debug=$DEBUG if [ "$MOUNT2" ]; then $LLMOUNT -v -o user_xattr,acl `hostname`:/mds1/client $MOUNT2 || exit 3 diff --git a/lustre/tests/oos.sh b/lustre/tests/oos.sh index 28d1b2b..0d12568 100755 --- a/lustre/tests/oos.sh +++ b/lustre/tests/oos.sh @@ -13,9 +13,9 @@ LOG=$TMP/ooslog SUCCESS=1 -rm -f $OOS +rm -f 
$OOS $LOG -sleep 1 # to ensure we get up-to-date statfs info +sync; sleep 1; sync # to ensure we get up-to-date statfs info #echo -1 > /proc/sys/lnet/debug #echo 0x40a8 > /proc/sys/lnet/subsystem_debug @@ -34,7 +34,6 @@ fi export LANG=C LC_LANG=C # for "No space left on device" message -rm -f $LOG >/dev/null 2>&1 [ -f $LOG ] && echo "ERROR: log file wasn't removed?" && exit 1 # make sure we stripe over all OSTs to avoid OOS on only a subset of OSTs @@ -45,7 +44,8 @@ if dd if=/dev/zero of=$OOS count=$(($ORIGFREE + 100)) bs=1k 2> $LOG; then fi if [ "`grep -c 'No space left on device' $LOG`" -ne 1 ]; then - echo "ERROR: dd not return ENOSPC" + echo "ERROR: dd not return ENOSPC" + sed "s/^/LOG: /" $LOG SUCCESS=0 fi @@ -65,11 +65,14 @@ if [ -z "$OSCFULL" ]; then fi RECORDSOUT=`grep "records out" $LOG | cut -d + -f1` - FILESIZE=`ls -l $OOS | awk '{ print $5 }'` -if [ "$RECORDSOUT" -ne $((FILESIZE / 1024)) ]; then - echo "ERROR: blocks written by dd not equal to the size of file" - SUCCESS=0 +if [ -z "$RECORDSOUT" ]; then + echo "ERROR: no blocks written by dd?" 
+ sed "s/^/LOG: /" $LOG + SUCCESS=0 +elif [ "$RECORDSOUT" -ne $((FILESIZE / 1024)) ]; then + echo "ERROR: blocks written by dd not equal to the size of file" + SUCCESS=0 fi #lctl debug_daemon stop diff --git a/lustre/tests/oos2.sh b/lustre/tests/oos2.sh index b028760..f7682bb 100644 --- a/lustre/tests/oos2.sh +++ b/lustre/tests/oos2.sh @@ -18,7 +18,7 @@ SUCCESS=1 rm -f $OOS $OOS2 $LOG $LOG2 -sleep 1 # to ensure we get up-to-date statfs info +sync; sleep 1; sync # to ensure we get up-to-date statfs info STRIPECOUNT=`cat /proc/fs/lustre/lov/*/activeobd | head -n 1` ORIGFREE=`cat /proc/fs/lustre/llite/*/kbytesavail | head -n 1` diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index c784c50..ecf3719 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -3,8 +3,7 @@ set -e # bug 2986 5494 7288 -ALWAYS_EXCEPT="20b 24 27" - +ALWAYS_EXCEPT="20b 24 27 $RECOVERY_SMALL_EXCEPT" LUSTRE=${LUSTRE:-`dirname $0`/..} diff --git a/lustre/tests/replay-dual.sh b/lustre/tests/replay-dual.sh index a5f461b..b71f318 100755 --- a/lustre/tests/replay-dual.sh +++ b/lustre/tests/replay-dual.sh @@ -3,7 +3,7 @@ set -e # bug 6088 -ALWAYS_EXCEPT="8" +ALWAYS_EXCEPT="8 $REPLAY_DUAL_EXCEPT" LUSTRE=${LUSTRE:-`dirname $0`/..} . $LUSTRE/tests/test-framework.sh @@ -253,9 +253,9 @@ test_12() { sysctl -w lustre.fail_loc=0 ls $DIR/$tfile - $CHECKSTAT -t file $DIR/$tfile || return 2 kill -USR1 $MULTIPID || return 3 wait $MULTIPID || return 4 + $CHECKSTAT -t file $DIR/$tfile || return 2 rm $DIR/$tfile return 0 diff --git a/lustre/tests/replay-ost-single.sh b/lustre/tests/replay-ost-single.sh index cbe7da1..b7979c7 100755 --- a/lustre/tests/replay-ost-single.sh +++ b/lustre/tests/replay-ost-single.sh @@ -13,7 +13,7 @@ ostfailover_HOST=${ostfailover_HOST:-$ost_HOST} # Skip these tests # BUG NUMBER: 2766? 
-ALWAYS_EXCEPT="5" +ALWAYS_EXCEPT="5 $REPLAY_OST_SINGLE_EXCEPT" gen_config() { rm -f $XMLCONFIG @@ -154,6 +154,7 @@ test_6() { sync && sleep 2 && sync # wait for delete thread before=`kbytesfree` dd if=/dev/urandom bs=4096 count=1280 of=$f + lfs getstripe $f #define OBD_FAIL_MDS_REINT_NET_REP 0x119 do_facet mds "sysctl -w lustre.fail_loc=0x80000119" sync @@ -166,7 +167,7 @@ test_6() { $CHECKSTAT -t file $f && return 2 || true sync # let the delete happen - sleep 2 + sleep 5 after=`kbytesfree` log "before: $before after: $after" (( $before <= $after + 40 )) || return 3 # take OST logs into account diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index 4fbeaf3..9546a3f 100755 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -15,7 +15,7 @@ init_test_env $@ # Skip these tests # bug number: 2766 4176 -ALWAYS_EXCEPT="0b 39 48" +ALWAYS_EXCEPT="0b 39 $REPLAY_SINGLE_EXCEPT" gen_config() { rm -f $XMLCONFIG diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 59e2a8f..6de2c9f 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -11,7 +11,7 @@ ONLY=${ONLY:-"$*"} ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"42a 42c 45 68"} # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT! 
-[ "$SLOW" = "no" ] && EXCEPT="$EXCEPT 24o 51b 51c 64b 71 101" +[ "$SLOW" = "no" ] && EXCEPT="$EXCEPT 24o 27m 51b 51c 64b 71 101" case `uname -r` in 2.4*) FSTYPE=${FSTYPE:-ext3} ;; @@ -19,8 +19,8 @@ case `uname -r` in *) error "unsupported kernel" ;; esac -[ "$ALWAYS_EXCEPT$EXCEPT" ] && \ - echo "Skipping tests: `echo $ALWAYS_EXCEPT $EXCEPT`" +[ "$ALWAYS_EXCEPT$EXCEPT$SANITY_EXCEPT" ] && \ + echo "Skipping tests: `echo $ALWAYS_EXCEPT $EXCEPT $SANITY_EXCEPT`" SRCDIR=`dirname $0` export PATH=$PWD/$SRCDIR:$SRCDIR:$SRCDIR/../utils:$PATH:/sbin @@ -123,7 +123,7 @@ build_test_filter() { for O in $ONLY; do eval ONLY_${O}=true done - for E in $EXCEPT $ALWAYS_EXCEPT; do + for E in $EXCEPT $ALWAYS_EXCEPT $SANITY_EXCEPT; do eval EXCEPT_${E}=true done } @@ -1412,8 +1412,9 @@ test_33a() { rm -fr $DIR/d33 mkdir -p $DIR/d33 chown $RUNAS_ID $DIR/d33 - $RUNAS $OPENFILE -f O_RDWR:O_CREAT -m 0444 $DIR/d33/f33 || error - $RUNAS $OPENFILE -f O_RDWR:O_CREAT -m 0444 $DIR/d33/f33 && error || true + $RUNAS $OPENFILE -f O_RDWR:O_CREAT -m 0444 $DIR/d33/f33|| error "create" + $RUNAS $OPENFILE -f O_RDWR:O_CREAT -m 0444 $DIR/d33/f33 && \ + error "open RDWR" || true } run_test 33a "test open file(mode=0444) with O_RDWR (should return error)" @@ -2002,17 +2003,17 @@ test_52a() { [ -f $DIR/d52a/foo ] && chattr -a $DIR/d52a/foo mkdir -p $DIR/d52a touch $DIR/d52a/foo - chattr =a $DIR/d52a/foo || error - echo bar >> $DIR/d52a/foo || error - cp /etc/hosts $DIR/d52a/foo && error - rm -f $DIR/d52a/foo 2>/dev/null && error - link $DIR/d52a/foo $DIR/d52a/foo_link 2>/dev/null && error - echo foo >> $DIR/d52a/foo || error - mrename $DIR/d52a/foo $DIR/d52a/foo_ren && error - lsattr $DIR/d52a/foo | egrep -q "^-+a-+ $DIR/d52a/foo" || error - chattr -a $DIR/d52a/foo || error - - rm -fr $DIR/d52a || error + chattr =a $DIR/d52a/foo || error "chattr =a failed" + echo bar >> $DIR/d52a/foo || error "append bar failed" + cp /etc/hosts $DIR/d52a/foo && error "cp worked" + rm -f $DIR/d52a/foo 2>/dev/null && error "rm 
worked" + link $DIR/d52a/foo $DIR/d52a/foo_link 2>/dev/null && error "link worked" + echo foo >> $DIR/d52a/foo || error "append foo failed" + mrename $DIR/d52a/foo $DIR/d52a/foo_ren && error "rename worked" + lsattr $DIR/d52a/foo | egrep -q "^-+a-+ $DIR/d52a/foo" || error "lsattr" + chattr -a $DIR/d52a/foo || error "chattr -a failed" + + rm -fr $DIR/d52a || error "cleanup rm failed" } run_test 52a "append-only flag test (should return errors) =====" @@ -2054,7 +2055,7 @@ test_54a() { $SOCKETCLIENT $DIR/socket || error $MUNLINK $DIR/socket } -run_test 54a "unix damain socket test ==========================" +run_test 54a "unix domain socket test ==========================" test_54b() { f="$DIR/f54b" @@ -2310,7 +2311,7 @@ test_63() { for i in /proc/fs/lustre/osc/*/max_dirty_mb ; do echo $MAX_DIRTY_MB > $i done - true + rm -f $DIR/f63 || true } run_test 63 "Verify oig_wait interruption does not crash =======" @@ -2714,6 +2715,7 @@ test_101() { cat $LPROC/llite/*/read_ahead_stats error "too many ($discard) discarded pages" fi + rm -f $DIR/f101 || true } run_test 101 "check read-ahead for random reads ===========" diff --git a/lustre/tests/sanityN.sh b/lustre/tests/sanityN.sh index 234b12c..f9ae4e7 100644 --- a/lustre/tests/sanityN.sh +++ b/lustre/tests/sanityN.sh @@ -7,7 +7,8 @@ ONLY=${ONLY:-"$*"} ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"4 14b 14c"} # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT! -[ "$ALWAYS_EXCEPT$EXCEPT" ] && echo "Skipping tests: $ALWAYS_EXCEPT $EXCEPT" +[ "$ALWAYS_EXCEPT$EXCEPT$SANITYN_EXCEPT" ] && \ + echo "Skipping tests: `echo $ALWAYS_EXCEPT $EXCEPT $SANITYN_EXCEPT`" SRCDIR=`dirname $0` PATH=$PWD/$SRCDIR:$SRCDIR:$SRCDIR/../utils:$PATH @@ -86,7 +87,7 @@ run_test() { echo -n "." 
fi done - for X in $EXCEPT $ALWAYS_EXCEPT; do + for X in $EXCEPT $ALWAYS_EXCEPT $SANITYN_EXCEPT; do if [ "`echo $1 | grep '\<'$X'[a-z]*\>'`" ]; then echo "skipping excluded test $1" return 0 @@ -421,24 +422,25 @@ test_20() { [ $CNTD -gt 0 ] && \ error $CNTD" page left in cache after lock cancel" || true } - run_test 20 "test extra readahead page left in cache ====" +cleanup_21() { + umount $DIR1/d21 +} + test_21() { # Bug 5907 mkdir $DIR1/d21 - mount /etc $DIR1/d21 --bind # Poor man's mount. - rmdir $DIR1/d21 && error "Removed mounted directory" - rmdir $DIR2/d21 && echo "Removed mounted directory from another mountpoint, needs to be fixed" - test -d $DIR1/d21 || error "Monted directory disappeared" - umount $DIR1/d21 + mount /etc $DIR1/d21 --bind || error "mount failed" # Poor man's mount. + trap cleanup_21 EXIT + rmdir -v $DIR1/d21 && error "Removed mounted directory" + rmdir -v $DIR2/d21 && echo "Removed mounted directory from another mountpoint, needs to be fixed" + test -d $DIR1/d21 || error "Mounted directory disappeared" test -d $DIR2/d21 || test -d $DIR1/d21 && error "Removed dir still visible after umount" true } - run_test 21 " Try to remove mountpoint on another dir ====" - log "cleanup: ======================================================" rm -rf $DIR1/[df][0-9]* $DIR1/lnk || true diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index 339cbb5..1055789 100644 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -31,7 +31,7 @@ init_test_env() { export XMLCONFIG=${XMLCONFIG:-${TESTSUITE}.xml} export LTESTDIR=${LTESTDIR:-$LUSTRE/../ltest} - [ -d /r ] && export ROOT=/r + [ -d /r ] && export ROOT=${ROOT:-/r} export TMP=${TMP:-$ROOT/tmp} export PATH=:$PATH:$LUSTRE/utils:$LUSTRE/tests @@ -549,7 +549,7 @@ build_test_filter() { eval ONLY_${O}=true done [ "$EXCEPT$ALWAYS_EXCEPT" ] && \ - log "skipping test `echo $EXCEPT $ALWAYS_EXCEPT`" + log "skipping tests: `echo $EXCEPT $ALWAYS_EXCEPT`" for E in $EXCEPT 
$ALWAYS_EXCEPT; do eval EXCEPT_${E}=true done diff --git a/lustre/utils/lconf b/lustre/utils/lconf index 398d463..107b87f 100755 --- a/lustre/utils/lconf +++ b/lustre/utils/lconf @@ -261,7 +261,7 @@ class LCTLInterface: if config.noexec: return (0, []) child = popen2.Popen3(cmd_line, 1) # Capture stdout and stderr from command - child.tochild.write(cmds + "\n") + child.tochild.write(cmds + "\nq\n") child.tochild.close() # From "Python Cookbook" from O'Reilly @@ -1336,7 +1336,11 @@ class MDSDEV(Module): ret, out = runcmd("mount -o loop %s /tmp/lustre-XXXX/" %self.devpath) if ret: print out[0] - os.utime("/tmp/lustre-XXXX/LOGS", (mtime, mtime)) + try: + os.utime("/tmp/lustre-XXXX/LOGS", (mtime, mtime)) + except OSError: + runcmd("umount -f /tmp/lustre-XXXX/") + panic("Can't adjust config creation time!") runcmd("umount -f /tmp/lustre-XXXX/") else: print "XML file does not contain mtime, skip mtime checking." diff --git a/lustre/utils/lfs.c b/lustre/utils/lfs.c index 79a3c58..89c4dc9 100644 --- a/lustre/utils/lfs.c +++ b/lustre/utils/lfs.c @@ -75,8 +75,8 @@ command_t cmdlist[] = { "usage: setstripe \n" " or \n" " setstripe -d (to delete default striping)\n" - "\tstripe size: Number of bytes in each stripe (0 default)\n" - "\tstripe start: OST index of first stripe (-1 default)\n" + "\tstripe size: Number of bytes on each OST (0 filesystem default)\n" + "\tstripe start: OST index of first stripe (-1 filesystem default)\n" "\tstripe count: Number of OSTs to stripe over (0 default, -1 all)"}, {"find", lfs_find, 0, "To list the extended attributes for a given filename or files in a\n" @@ -779,7 +779,7 @@ static inline char *type2name(int check_type) static void grace2str(time_t seconds,char *buf) { uint minutes, hours, days; - + minutes = (seconds + 30) / 60; hours = minutes / 60; minutes %= 60; diff --git a/lustre/utils/llmount.c b/lustre/utils/llmount.c index 5f93784..dd5841f 100644 --- a/lustre/utils/llmount.c +++ b/lustre/utils/llmount.c @@ -46,8 +46,8 @@ static char 
*progname = NULL; void usage(FILE *out) { fprintf(out, "%s v1.%d\n", progname, LMD_MAGIC & 0xFF); - fprintf(out, "usage: %s :// " - "[-fhnv] [-o mntopt]\n", progname); + fprintf(out, "usage: %s [,]://" + " [-fhnv] [-o mntopt]\n", progname); fprintf(out, "\t: nid of MDS (config) node\n" "\t: name of MDS service (e.g. mds1)\n" "\t: name of client config (e.g. client)\n" @@ -59,7 +59,8 @@ void usage(FILE *out) "\t-v|--verbose: print verbose config settings\n" "\t-o: filesystem mount options:\n" "\t\tflock/noflock: enable/disable flock support\n" - "\t\tuser_xattr/nouser_xattr: enable/disable user extended attributes\n" + "\t\tuser_xattr/nouser_xattr: enable/disable user extended " + "attributes\n" ); exit(out != stdout); } @@ -126,21 +127,48 @@ init_options(struct lustre_mount_data *lmd) { memset(lmd, 0, sizeof(*lmd)); lmd->lmd_magic = LMD_MAGIC; - lmd->lmd_nid = LNET_NID_ANY; return 0; } int print_options(struct lustre_mount_data *lmd, const char *options) { - printf("nid: %s\n", libcfs_nid2str(lmd->lmd_nid)); - printf("mds: %s\n", lmd->lmd_mds); + int i; + for (i = 0; i < lmd->lmd_nid_count; i++) { + printf("mds nid %d: %s\n", i, + libcfs_nid2str(lmd->lmd_nid[i])); + } + printf("mds name: %s\n", lmd->lmd_mds); printf("profile: %s\n", lmd->lmd_profile); printf("options: %s\n", options); return 0; } +static int parse_nids(struct lustre_mount_data *lmd, char *nids) +{ + int i = 0; + char *tmp = 0; + lnet_nid_t nid; + + while ((tmp = strsep(&nids, ",:"))) { + nid = libcfs_str2nid(tmp); + if (nid == LNET_NID_ANY) { + fprintf(stderr, "%s: Can't parse NID '%s'\n", + progname, tmp); + continue; + } + lmd->lmd_nid[lmd->lmd_nid_count++] = nid; + if (lmd->lmd_nid_count >= MAX_FAILOVER_NIDS) { + fprintf(stderr, "%s: Too many: ignoring nids after %s\n", + progname, tmp); + break; + } + } + return (lmd->lmd_nid_count); +} + + /***************************************************************************** * * This part was cribbed from util-linux/mount/mount.c. 
There was no clear @@ -282,10 +310,9 @@ build_data(char *source, char *options, struct lustre_mount_data *lmd, if (rc) return rc; - lmd->lmd_nid = libcfs_str2nid(nid); - if (lmd->lmd_nid == LNET_NID_ANY) { - fprintf(stderr, "%s: can't parse nid '%s'\n", progname, nid); - return 1; + if (parse_nids(lmd, nid) == 0) { + fprintf(stderr, "%s: Can't parse any mds nids\n", progname); + return(1); } if (strlen(mds) + 1 > sizeof(lmd->lmd_mds)) { diff --git a/lustre/utils/llog_reader.c b/lustre/utils/llog_reader.c index bb53091..bbb827f 100644 --- a/lustre/utils/llog_reader.c +++ b/lustre/utils/llog_reader.c @@ -213,6 +213,9 @@ void print_llog_header(struct llog_log_hdr* llog_buf) static void print_1_cfg(struct lustre_cfg *lcfg) { int i; + if (lcfg->lcfg_nid) + printf("nid=%s("LPX64") ", libcfs_nid2str(lcfg->lcfg_nid), + lcfg->lcfg_nid); for (i = 0; i < lcfg->lcfg_bufcount; i++) printf("%d:%.*s ", i, lcfg->lcfg_buflens[i], (char*)lustre_cfg_buf(lcfg, i)); @@ -226,16 +229,16 @@ static void print_setup_cfg(struct lustre_cfg *lcfg) if ((lcfg->lcfg_bufcount == 2) && (lcfg->lcfg_buflens[1] == sizeof(*desc))) { printf("lov_setup "); - printf("0:%s ", lustre_cfg_string(lcfg, 0)); + printf("0:%s ", lustre_cfg_string(lcfg, 0)); printf("1:(struct lov_desc)\n"); desc = (struct lov_desc*)(lustre_cfg_string(lcfg, 1)); - printf(" uuid=%s, ", (char*)desc->ld_uuid.uuid); - printf("stripe count=%d, ", desc->ld_default_stripe_count); - printf("size=%lld, ", desc->ld_default_stripe_size); - printf("offset=%lld, ", desc->ld_default_stripe_offset); + printf("\t\tuuid=%s ", (char*)desc->ld_uuid.uuid); + printf("stripe:cnt=%d ", desc->ld_default_stripe_count); + printf("size=%lld ", desc->ld_default_stripe_size); + printf("offset=%lld ", desc->ld_default_stripe_offset); printf("pattern=%d", desc->ld_pattern); } else { - printf("setup "); + printf("setup "); print_1_cfg(lcfg); } return; @@ -247,7 +250,7 @@ void print_lustre_cfg(struct lustre_cfg *lcfg) switch(cmd){ case(LCFG_ATTACH):{ - 
printf("attach "); + printf("attach "); print_1_cfg(lcfg); break; } @@ -256,37 +259,32 @@ void print_lustre_cfg(struct lustre_cfg *lcfg) break; } case(LCFG_DETACH):{ - printf("detach "); + printf("detach "); print_1_cfg(lcfg); break; } case(LCFG_CLEANUP):{ - printf("cleanup "); + printf("cleanup "); print_1_cfg(lcfg); break; } case(LCFG_ADD_UUID):{ - printf("add_uuid "); - printf("nid=%s("LPX64") ", - libcfs_nid2str(lcfg->lcfg_nid), lcfg->lcfg_nid); - /* obsolete */ - if (lcfg->lcfg_nal) - printf("nal=%d ", lcfg->lcfg_nal); + printf("add_uuid "); print_1_cfg(lcfg); break; } case(LCFG_DEL_UUID):{ - printf("del_uuid "); + printf("del_uuid "); print_1_cfg(lcfg); break; } case(LCFG_ADD_CONN):{ - printf("add_conn "); + printf("add_conn "); print_1_cfg(lcfg); break; } case(LCFG_DEL_CONN):{ - printf("del_conn "); + printf("del_conn "); print_1_cfg(lcfg); break; } @@ -320,6 +318,16 @@ void print_lustre_cfg(struct lustre_cfg *lcfg) print_1_cfg(lcfg); break; } + case(LCFG_PARAM):{ + printf("param "); + print_1_cfg(lcfg); + break; + } + case(LCFG_MARKER):{ + printf("marker "); + print_1_cfg(lcfg); + break; + } default: printf("unsupported cmd_code = %x\n",cmd); } diff --git a/lustre/utils/wirecheck.c b/lustre/utils/wirecheck.c index 128472c..5244770 100644 --- a/lustre/utils/wirecheck.c +++ b/lustre/utils/wirecheck.c @@ -20,6 +20,13 @@ do { \ #define STRINGIFY(a) #a +#if 0 +#define CHECK_DEFINE(a) \ + printf(" CLASSERT("#a" == "STRINGIFY(a) ");\n", (long long)a) + +#define CHECK_VALUE(a) \ + printf(" CLASSERT("#a" == %lld);\n", (long long)a) +#else #define CHECK_DEFINE(a) \ do { \ printf(" LASSERTF("#a" == "STRINGIFY(a) \ @@ -40,6 +47,7 @@ do { \ " == %lldULL, \" found %%lld\\n\",\n "\ "(long long)"#a");\n", (long long)a); \ } while(0) +#endif #define CHECK_MEMBER_OFFSET(s,m) \ do { \ @@ -64,16 +72,7 @@ do { \ } while(0) - -void check1(void) -{ -#define VALUE 1234567 - - CHECK_VALUE(VALUE); - CHECK_DEFINE(VALUE); -} - -void +static void check_lustre_handle(void) { 
BLANK_LINE(); @@ -81,7 +80,7 @@ check_lustre_handle(void) CHECK_MEMBER(lustre_handle, cookie); } -void +static void check_lustre_msg(void) { BLANK_LINE(); @@ -100,7 +99,7 @@ check_lustre_msg(void) CHECK_MEMBER(lustre_msg, buflens[7]); } -void +static void check_obdo(void) { BLANK_LINE(); @@ -125,6 +124,7 @@ check_obdo(void) CHECK_MEMBER(obdo, o_misc); CHECK_MEMBER(obdo, o_easize); CHECK_MEMBER(obdo, o_mds); + CHECK_MEMBER(obdo, o_padding_1); CHECK_MEMBER(obdo, o_inline); CHECK_VALUE(OBD_INLINESZ); @@ -172,7 +172,7 @@ check_obdo(void) CHECK_VALUE(OBD_FL_NO_GRPQUOTA); } -void +static void check_lov_mds_md_v1(void) { BLANK_LINE(); @@ -198,7 +198,7 @@ check_lov_mds_md_v1(void) CHECK_VALUE(LOV_PATTERN_RAID1); } -void +static void check_obd_statfs(void) { BLANK_LINE(); @@ -214,7 +214,7 @@ check_obd_statfs(void) CHECK_MEMBER(obd_statfs, os_state); } -void +static void check_obd_ioobj(void) { BLANK_LINE(); @@ -225,7 +225,7 @@ check_obd_ioobj(void) CHECK_MEMBER(obd_ioobj, ioo_bufcnt); } -void +static void check_obd_quotactl(void) { BLANK_LINE(); @@ -255,9 +255,15 @@ check_obd_quotactl(void) CHECK_MEMBER(obd_dqblk, dqb_btime); CHECK_MEMBER(obd_dqblk, dqb_itime); CHECK_MEMBER(obd_dqblk, dqb_valid); + CHECK_MEMBER(obd_dqblk, padding); + + CHECK_DEFINE(Q_QUOTACHECK); + CHECK_DEFINE(Q_INITQUOTA); + CHECK_DEFINE(Q_GETOINFO); + CHECK_DEFINE(Q_GETOQUOTA); } -void +static void check_niobuf_remote(void) { BLANK_LINE(); @@ -273,7 +279,7 @@ check_niobuf_remote(void) CHECK_VALUE(OBD_BRW_NOQUOTA); } -void +static void check_ost_body(void) { BLANK_LINE(); @@ -281,7 +287,7 @@ check_ost_body(void) CHECK_MEMBER(ost_body, oa); } -void +static void check_ll_fid(void) { BLANK_LINE(); @@ -291,7 +297,7 @@ check_ll_fid(void) CHECK_MEMBER(ll_fid, f_type); } -void +static void check_mds_status_req(void) { BLANK_LINE(); @@ -300,7 +306,7 @@ check_mds_status_req(void) CHECK_MEMBER(mds_status_req, repbuf); } -void +static void check_mds_body(void) { BLANK_LINE(); @@ -336,6 +342,7 @@ check_mds_body(void) 
CHECK_VALUE(FMODE_READ); CHECK_VALUE(FMODE_WRITE); CHECK_VALUE(FMODE_EXEC); + CHECK_VALUE(MDS_OPEN_CREAT); CHECK_VALUE(MDS_OPEN_EXCL); CHECK_VALUE(MDS_OPEN_TRUNC); @@ -346,7 +353,7 @@ check_mds_body(void) CHECK_VALUE(MDS_OPEN_HAS_EA); } -void +static void check_mds_rec_setattr(void) { BLANK_LINE(); @@ -368,7 +375,7 @@ check_mds_rec_setattr(void) CHECK_MEMBER(mds_rec_setattr, sa_attr_flags); } -void +static void check_mds_rec_create(void) { BLANK_LINE(); @@ -386,7 +393,7 @@ check_mds_rec_create(void) CHECK_MEMBER(mds_rec_create, cr_suppgid); } -void +static void check_mds_rec_link(void) { BLANK_LINE(); @@ -402,7 +409,7 @@ check_mds_rec_link(void) CHECK_MEMBER(mds_rec_link, lk_time); } -void +static void check_mds_rec_unlink(void) { BLANK_LINE(); @@ -418,7 +425,7 @@ check_mds_rec_unlink(void) CHECK_MEMBER(mds_rec_unlink, ul_time); } -void +static void check_mds_rec_rename(void) { BLANK_LINE(); @@ -434,7 +441,7 @@ check_mds_rec_rename(void) CHECK_MEMBER(mds_rec_rename, rn_time); } -void +static void check_lov_desc(void) { BLANK_LINE(); @@ -445,10 +452,15 @@ check_lov_desc(void) CHECK_MEMBER(lov_desc, ld_pattern); CHECK_MEMBER(lov_desc, ld_default_stripe_size); CHECK_MEMBER(lov_desc, ld_default_stripe_offset); + CHECK_MEMBER(lov_desc, ld_default_stripe_offset); + CHECK_MEMBER(lov_desc, ld_padding_1); + CHECK_MEMBER(lov_desc, ld_padding_2); + CHECK_MEMBER(lov_desc, ld_padding_3); + CHECK_MEMBER(lov_desc, ld_padding_4); CHECK_MEMBER(lov_desc, ld_uuid); } -void +static void check_ldlm_res_id(void) { BLANK_LINE(); @@ -456,7 +468,7 @@ check_ldlm_res_id(void) CHECK_MEMBER(ldlm_res_id, name[RES_NAME_SIZE]); } -void +static void check_ldlm_extent(void) { BLANK_LINE(); @@ -466,7 +478,7 @@ check_ldlm_extent(void) CHECK_MEMBER(ldlm_extent, gid); } -void +static void check_ldlm_flock(void) { BLANK_LINE(); @@ -477,7 +489,7 @@ check_ldlm_flock(void) CHECK_MEMBER(ldlm_flock, pid); } -void +static void check_ldlm_intent(void) { BLANK_LINE(); @@ -485,16 +497,17 @@ 
check_ldlm_intent(void) CHECK_MEMBER(ldlm_intent, opc); } -void +static void check_ldlm_resource_desc(void) { BLANK_LINE(); CHECK_STRUCT(ldlm_resource_desc); CHECK_MEMBER(ldlm_resource_desc, lr_type); + CHECK_MEMBER(ldlm_resource_desc, lr_padding); CHECK_MEMBER(ldlm_resource_desc, lr_name); } -void +static void check_ldlm_lock_desc(void) { BLANK_LINE(); @@ -505,30 +518,32 @@ check_ldlm_lock_desc(void) CHECK_MEMBER(ldlm_lock_desc, l_policy_data); } -void +static void check_ldlm_request(void) { BLANK_LINE(); CHECK_STRUCT(ldlm_request); CHECK_MEMBER(ldlm_request, lock_flags); + CHECK_MEMBER(ldlm_request, lock_padding); CHECK_MEMBER(ldlm_request, lock_desc); CHECK_MEMBER(ldlm_request, lock_handle1); CHECK_MEMBER(ldlm_request, lock_handle2); } -void +static void check_ldlm_reply(void) { BLANK_LINE(); CHECK_STRUCT(ldlm_reply); CHECK_MEMBER(ldlm_reply, lock_flags); + CHECK_MEMBER(ldlm_request, lock_padding); CHECK_MEMBER(ldlm_request, lock_desc); CHECK_MEMBER(ldlm_reply, lock_handle); CHECK_MEMBER(ldlm_reply, lock_policy_res1); CHECK_MEMBER(ldlm_reply, lock_policy_res2); } -void +static void check_ldlm_lvb(void) { BLANK_LINE(); @@ -540,39 +555,8 @@ check_ldlm_lvb(void) CHECK_MEMBER(ost_lvb, lvb_blocks); } -void -check_ptlbd_op(void) -{ - BLANK_LINE(); - CHECK_STRUCT(ptlbd_op); - CHECK_MEMBER(ptlbd_op, op_cmd); - CHECK_MEMBER(ptlbd_op, op_lun); - CHECK_MEMBER(ptlbd_op, op_niob_cnt); - CHECK_MEMBER(ptlbd_op, op__padding); - CHECK_MEMBER(ptlbd_op, op_block_cnt); -} -void -check_ptlbd_niob(void) -{ - BLANK_LINE(); - CHECK_STRUCT(ptlbd_niob); - CHECK_MEMBER(ptlbd_niob, n_xid); - CHECK_MEMBER(ptlbd_niob, n_block_nr); - CHECK_MEMBER(ptlbd_niob, n_offset); - CHECK_MEMBER(ptlbd_niob, n_length); -} - -void -check_ptlbd_rsp(void) -{ - BLANK_LINE(); - CHECK_STRUCT(ptlbd_rsp); - CHECK_MEMBER(ptlbd_rsp, r_status); - CHECK_MEMBER(ptlbd_rsp, r_error_cnt); -} - -void +static void check_llog_logid(void) { BLANK_LINE(); @@ -592,15 +576,18 @@ check_llog_logid(void) 
CHECK_VALUE(LLOG_LOGID_MAGIC); } -void +static void check_llog_catid(void) { BLANK_LINE(); CHECK_STRUCT(llog_catid); CHECK_MEMBER(llog_catid, lci_logid); + CHECK_MEMBER(llog_catid, lci_padding1); + CHECK_MEMBER(llog_catid, lci_padding2); + CHECK_MEMBER(llog_catid, lci_padding3); } -void +static void check_llog_rec_hdr(void) { BLANK_LINE(); @@ -608,9 +595,10 @@ check_llog_rec_hdr(void) CHECK_MEMBER(llog_rec_hdr, lrh_len); CHECK_MEMBER(llog_rec_hdr, lrh_index); CHECK_MEMBER(llog_rec_hdr, lrh_type); + CHECK_MEMBER(llog_rec_hdr, padding); } -void +static void check_llog_rec_tail(void) { BLANK_LINE(); @@ -619,17 +607,22 @@ check_llog_rec_tail(void) CHECK_MEMBER(llog_rec_tail, lrt_index); } -void +static void check_llog_logid_rec(void) { BLANK_LINE(); CHECK_STRUCT(llog_logid_rec); CHECK_MEMBER(llog_logid_rec, lid_hdr); CHECK_MEMBER(llog_logid_rec, lid_id); + CHECK_MEMBER(llog_logid_rec, padding1); + CHECK_MEMBER(llog_logid_rec, padding2); + CHECK_MEMBER(llog_logid_rec, padding3); + CHECK_MEMBER(llog_logid_rec, padding4); + CHECK_MEMBER(llog_logid_rec, padding5); CHECK_MEMBER(llog_logid_rec, lid_tail); } -void +static void check_llog_create_rec(void) { BLANK_LINE(); @@ -638,9 +631,10 @@ check_llog_create_rec(void) CHECK_MEMBER(llog_create_rec, lcr_fid); CHECK_MEMBER(llog_create_rec, lcr_oid); CHECK_MEMBER(llog_create_rec, lcr_ogen); + CHECK_MEMBER(llog_create_rec, padding); } -void +static void check_llog_orphan_rec(void) { BLANK_LINE(); @@ -648,10 +642,11 @@ check_llog_orphan_rec(void) CHECK_MEMBER(llog_orphan_rec, lor_hdr); CHECK_MEMBER(llog_orphan_rec, lor_oid); CHECK_MEMBER(llog_orphan_rec, lor_ogen); + CHECK_MEMBER(llog_orphan_rec, padding); CHECK_MEMBER(llog_orphan_rec, lor_tail); } -void +static void check_llog_unlink_rec(void) { BLANK_LINE(); @@ -659,10 +654,25 @@ check_llog_unlink_rec(void) CHECK_MEMBER(llog_unlink_rec, lur_hdr); CHECK_MEMBER(llog_unlink_rec, lur_oid); CHECK_MEMBER(llog_unlink_rec, lur_ogen); + CHECK_MEMBER(llog_unlink_rec, padding); 
CHECK_MEMBER(llog_unlink_rec, lur_tail); } -void +static void +check_llog_setattr_rec(void) +{ + BLANK_LINE(); + CHECK_STRUCT(llog_setattr_rec); + CHECK_MEMBER(llog_setattr_rec, lsr_hdr); + CHECK_MEMBER(llog_setattr_rec, lsr_oid); + CHECK_MEMBER(llog_setattr_rec, lsr_ogen); + CHECK_MEMBER(llog_setattr_rec, lsr_uid); + CHECK_MEMBER(llog_setattr_rec, lsr_gid); + CHECK_MEMBER(llog_setattr_rec, padding); + CHECK_MEMBER(llog_setattr_rec, lsr_tail); +} + +static void check_llog_size_change_rec(void) { BLANK_LINE(); @@ -670,10 +680,11 @@ check_llog_size_change_rec(void) CHECK_MEMBER(llog_size_change_rec, lsc_hdr); CHECK_MEMBER(llog_size_change_rec, lsc_fid); CHECK_MEMBER(llog_size_change_rec, lsc_io_epoch); + CHECK_MEMBER(llog_size_change_rec, padding); CHECK_MEMBER(llog_size_change_rec, lsc_tail); } -void +static void check_llog_gen(void) { BLANK_LINE(); @@ -682,7 +693,7 @@ check_llog_gen(void) CHECK_MEMBER(llog_gen, conn_cnt); } -void +static void check_llog_gen_rec(void) { BLANK_LINE(); @@ -692,7 +703,7 @@ check_llog_gen_rec(void) CHECK_MEMBER(llog_gen_rec, lgr_tail); } -void +static void check_llog_log_hdr(void) { BLANK_LINE(); @@ -710,7 +721,7 @@ check_llog_log_hdr(void) CHECK_MEMBER(llog_log_hdr, llh_tail); } -void +static void check_llog_cookie(void) { BLANK_LINE(); @@ -718,9 +729,10 @@ check_llog_cookie(void) CHECK_MEMBER(llog_cookie, lgc_lgl); CHECK_MEMBER(llog_cookie, lgc_subsys); CHECK_MEMBER(llog_cookie, lgc_index); + CHECK_MEMBER(llog_cookie, lgc_padding); } -void +static void check_llogd_body(void) { BLANK_LINE(); @@ -742,7 +754,7 @@ check_llogd_body(void) CHECK_VALUE(LLOG_CATINFO); } -void +static void check_llogd_conn_body(void) { BLANK_LINE(); @@ -752,7 +764,7 @@ check_llogd_conn_body(void) CHECK_MEMBER(llogd_conn_body, lgdc_ctxt_idx); } -void +static void check_qunit_data(void) { BLANK_LINE(); @@ -763,7 +775,7 @@ check_qunit_data(void) CHECK_MEMBER(qunit_data, qd_isblk); } -void +static void system_string (char *cmdline, char *str, int len) { int fds[2]; 
@@ -907,13 +919,12 @@ main(int argc, char **argv) CHECK_VALUE(MDS_STATUS_CONN); CHECK_VALUE(MDS_STATUS_LOV); - CHECK_VALUE(MDS_OPEN_HAS_EA); - CHECK_VALUE(LDLM_ENQUEUE); CHECK_VALUE(LDLM_CONVERT); CHECK_VALUE(LDLM_CANCEL); CHECK_VALUE(LDLM_BL_CALLBACK); CHECK_VALUE(LDLM_CP_CALLBACK); + CHECK_VALUE(LDLM_GL_CALLBACK); CHECK_VALUE(LDLM_LAST_OPC); CHECK_VALUE(LCK_EX); @@ -922,18 +933,8 @@ main(int argc, char **argv) CHECK_VALUE(LCK_CW); CHECK_VALUE(LCK_CR); CHECK_VALUE(LCK_NL); - - CHECK_VALUE(PTLBD_QUERY); - CHECK_VALUE(PTLBD_READ); - CHECK_VALUE(PTLBD_WRITE); - CHECK_VALUE(PTLBD_FLUSH); - CHECK_VALUE(PTLBD_CONNECT); - CHECK_VALUE(PTLBD_DISCONNECT); - CHECK_VALUE(PTLBD_LAST_OPC); - - CHECK_VALUE(MGMT_CONNECT); - CHECK_VALUE(MGMT_DISCONNECT); - CHECK_VALUE(MGMT_EXCEPTION); + CHECK_VALUE(LCK_GROUP); + CHECK_VALUE(LCK_MAXMODE); CHECK_VALUE(OBD_PING); CHECK_VALUE(OBD_LOG_CANCEL); @@ -943,6 +944,18 @@ main(int argc, char **argv) CHECK_VALUE(QUOTA_DQACQ); CHECK_VALUE(QUOTA_DQREL); + CHECK_VALUE(OBD_CONNECT_RDONLY); + CHECK_VALUE(OBD_CONNECT_INDEX); + CHECK_VALUE(OBD_CONNECT_GRANT); + CHECK_VALUE(OBD_CONNECT_SRVLOCK); + CHECK_VALUE(OBD_CONNECT_VERSION); + CHECK_VALUE(OBD_CONNECT_REQPORTAL); + CHECK_VALUE(OBD_CONNECT_ACL); + CHECK_VALUE(OBD_CONNECT_XATTR); + CHECK_VALUE(OBD_CONNECT_CROW); + CHECK_VALUE(OBD_CONNECT_TRUNCLOCK); + CHECK_VALUE(OBD_CONNECT_TRANSNO); + COMMENT("Sizes and Offsets"); BLANK_LINE(); check_lustre_handle(); @@ -972,9 +985,6 @@ main(int argc, char **argv) check_ldlm_request(); check_ldlm_reply(); check_ldlm_lvb(); - check_ptlbd_op(); - check_ptlbd_niob(); - check_ptlbd_rsp(); check_llog_logid(); check_llog_catid(); check_llog_rec_hdr(); @@ -983,6 +993,7 @@ main(int argc, char **argv) check_llog_create_rec(); check_llog_orphan_rec(); check_llog_unlink_rec(); + check_llog_setattr_rec(); check_llog_size_change_rec(); check_llog_gen(); check_llog_gen_rec(); diff --git a/lustre/utils/wirehdr.c b/lustre/utils/wirehdr.c index b513d58..0b65ac1 100644 --- 
a/lustre/utils/wirehdr.c +++ b/lustre/utils/wirehdr.c @@ -5,6 +5,7 @@ #undef LASSERT #undef LASSERTF +#define CLASSERT(cond) ({ switch(42) { case (cond): case 0: break; } }) #define LASSERT(cond) if (!(cond)) { printf("failed " #cond "\n"); ret = 1; } #define LASSERTF(cond, fmt, arg) if (!(cond)) { printf("failed '" #cond "'" fmt, arg);ret = 1;} diff --git a/lustre/utils/wiretest.c b/lustre/utils/wiretest.c index 7c38ad8..0a56ab2 100644 --- a/lustre/utils/wiretest.c +++ b/lustre/utils/wiretest.c @@ -5,6 +5,7 @@ #undef LASSERT #undef LASSERTF +#define CLASSERT(cond) ({ switch(42) { case (cond): case 0: break; } }) #define LASSERT(cond) if (!(cond)) { printf("failed " #cond "\n"); ret = 1; } #define LASSERTF(cond, fmt, arg) if (!(cond)) { printf("failed '" #cond "'" fmt, arg);ret = 1;} @@ -25,8 +26,8 @@ int main() void lustre_assert_wire_constants(void) { /* Wire protocol assertions generated by 'wirecheck' - * running on Linux mustang 2.6.12-1.1456_FC4smp #1 SMP Thu Sep 22 02:22:14 EDT 2005 i686 i68 - * with gcc version 4.0.1 20050727 (Red Hat 4.0.1-5) */ + * running on Linux schatzie.adilger.int 2.6.12-1.1381_FC3 #1 Fri Oct 21 03:46:55 EDT 2005 i6 + * with gcc version 3.3.4 20040817 (Red Hat Linux 3.3.4-2) */ /* Constants... 
*/ @@ -158,8 +159,6 @@ void lustre_assert_wire_constants(void) (long long)MDS_STATUS_CONN); LASSERTF(MDS_STATUS_LOV == 2, " found %lld\n", (long long)MDS_STATUS_LOV); - LASSERTF(MDS_OPEN_HAS_EA == 1073741824, " found %lld\n", - (long long)MDS_OPEN_HAS_EA); LASSERTF(LDLM_ENQUEUE == 101, " found %lld\n", (long long)LDLM_ENQUEUE); LASSERTF(LDLM_CONVERT == 102, " found %lld\n", @@ -170,6 +169,8 @@ void lustre_assert_wire_constants(void) (long long)LDLM_BL_CALLBACK); LASSERTF(LDLM_CP_CALLBACK == 105, " found %lld\n", (long long)LDLM_CP_CALLBACK); + LASSERTF(LDLM_GL_CALLBACK == 106, " found %lld\n", + (long long)LDLM_GL_CALLBACK); LASSERTF(LDLM_LAST_OPC == 107, " found %lld\n", (long long)LDLM_LAST_OPC); LASSERTF(LCK_EX == 1, " found %lld\n", @@ -184,26 +185,10 @@ void lustre_assert_wire_constants(void) (long long)LCK_CR); LASSERTF(LCK_NL == 32, " found %lld\n", (long long)LCK_NL); - LASSERTF(PTLBD_QUERY == 200, " found %lld\n", - (long long)PTLBD_QUERY); - LASSERTF(PTLBD_READ == 201, " found %lld\n", - (long long)PTLBD_READ); - LASSERTF(PTLBD_WRITE == 202, " found %lld\n", - (long long)PTLBD_WRITE); - LASSERTF(PTLBD_FLUSH == 203, " found %lld\n", - (long long)PTLBD_FLUSH); - LASSERTF(PTLBD_CONNECT == 204, " found %lld\n", - (long long)PTLBD_CONNECT); - LASSERTF(PTLBD_DISCONNECT == 205, " found %lld\n", - (long long)PTLBD_DISCONNECT); - LASSERTF(PTLBD_LAST_OPC == 206, " found %lld\n", - (long long)PTLBD_LAST_OPC); - LASSERTF(MGMT_CONNECT == 250, " found %lld\n", - (long long)MGMT_CONNECT); - LASSERTF(MGMT_DISCONNECT == 251, " found %lld\n", - (long long)MGMT_DISCONNECT); - LASSERTF(MGMT_EXCEPTION == 252, " found %lld\n", - (long long)MGMT_EXCEPTION); + LASSERTF(LCK_GROUP == 64, " found %lld\n", + (long long)LCK_GROUP); + LASSERTF(LCK_MAXMODE == 65, " found %lld\n", + (long long)LCK_MAXMODE); LASSERTF(OBD_PING == 400, " found %lld\n", (long long)OBD_PING); LASSERTF(OBD_LOG_CANCEL == 401, " found %lld\n", @@ -216,6 +201,28 @@ void lustre_assert_wire_constants(void) (long 
long)QUOTA_DQACQ); LASSERTF(QUOTA_DQREL == 602, " found %lld\n", (long long)QUOTA_DQREL); + LASSERTF(OBD_CONNECT_RDONLY == 1, " found %lld\n", + (long long)OBD_CONNECT_RDONLY); + LASSERTF(OBD_CONNECT_INDEX == 2, " found %lld\n", + (long long)OBD_CONNECT_INDEX); + LASSERTF(OBD_CONNECT_GRANT == 8, " found %lld\n", + (long long)OBD_CONNECT_GRANT); + LASSERTF(OBD_CONNECT_SRVLOCK == 16, " found %lld\n", + (long long)OBD_CONNECT_SRVLOCK); + LASSERTF(OBD_CONNECT_VERSION == 32, " found %lld\n", + (long long)OBD_CONNECT_VERSION); + LASSERTF(OBD_CONNECT_REQPORTAL == 64, " found %lld\n", + (long long)OBD_CONNECT_REQPORTAL); + LASSERTF(OBD_CONNECT_ACL == 128, " found %lld\n", + (long long)OBD_CONNECT_ACL); + LASSERTF(OBD_CONNECT_XATTR == 256, " found %lld\n", + (long long)OBD_CONNECT_XATTR); + LASSERTF(OBD_CONNECT_CROW == 512, " found %lld\n", + (long long)OBD_CONNECT_CROW); + LASSERTF(OBD_CONNECT_TRUNCLOCK == 1024, " found %lld\n", + (long long)OBD_CONNECT_TRUNCLOCK); + LASSERTF(OBD_CONNECT_TRANSNO == 2048, " found %lld\n", + (long long)OBD_CONNECT_TRANSNO); /* Sizes and Offsets */ @@ -362,6 +369,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct obdo, o_mds)); LASSERTF((int)sizeof(((struct obdo *)0)->o_mds) == 4, " found %lld\n", (long long)(int)sizeof(((struct obdo *)0)->o_mds)); + LASSERTF((int)offsetof(struct obdo, o_padding_1) == 124, " found %lld\n", + (long long)(int)offsetof(struct obdo, o_padding_1)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_1) == 4, " found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_padding_1)); LASSERTF((int)offsetof(struct obdo, o_inline) == 128, " found %lld\n", (long long)(int)offsetof(struct obdo, o_inline)); LASSERTF((int)sizeof(((struct obdo *)0)->o_inline) == 80, " found %lld\n", @@ -654,6 +665,18 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct obd_dqblk, dqb_valid)); LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_valid) == 4, " found %lld\n", (long 
long)(int)sizeof(((struct obd_dqblk *)0)->dqb_valid)); + LASSERTF((int)offsetof(struct obd_dqblk, padding) == 68, " found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, padding)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->padding) == 4, " found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->padding)); + LASSERTF(Q_QUOTACHECK == 0x800100," found %lld\n", + (long long)Q_QUOTACHECK); + LASSERTF(Q_INITQUOTA == 0x800101," found %lld\n", + (long long)Q_INITQUOTA); + LASSERTF(Q_GETOINFO == 0x800102," found %lld\n", + (long long)Q_GETOINFO); + LASSERTF(Q_GETOQUOTA == 0x800103," found %lld\n", + (long long)Q_GETOQUOTA); /* Checks for struct niobuf_remote */ LASSERTF((int)sizeof(struct niobuf_remote) == 16, " found %lld\n", @@ -1098,6 +1121,26 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct lov_desc, ld_default_stripe_offset)); LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset) == 8, " found %lld\n", (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset)); + LASSERTF((int)offsetof(struct lov_desc, ld_default_stripe_offset) == 24, " found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_default_stripe_offset)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset) == 8, " found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset)); + LASSERTF((int)offsetof(struct lov_desc, ld_padding_1) == 32, " found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_padding_1)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_1) == 4, " found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_1)); + LASSERTF((int)offsetof(struct lov_desc, ld_padding_2) == 36, " found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_padding_2)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_2) == 4, " found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_2)); + LASSERTF((int)offsetof(struct 
lov_desc, ld_padding_3) == 40, " found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_padding_3)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_3) == 4, " found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_3)); + LASSERTF((int)offsetof(struct lov_desc, ld_padding_4) == 44, " found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_padding_4)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_4) == 4, " found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_4)); LASSERTF((int)offsetof(struct lov_desc, ld_uuid) == 48, " found %lld\n", (long long)(int)offsetof(struct lov_desc, ld_uuid)); LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_uuid) == 40, " found %lld\n", @@ -1162,6 +1205,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct ldlm_resource_desc, lr_type)); LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_type) == 4, " found %lld\n", (long long)(int)sizeof(((struct ldlm_resource_desc *)0)->lr_type)); + LASSERTF((int)offsetof(struct ldlm_resource_desc, lr_padding) == 4, " found %lld\n", + (long long)(int)offsetof(struct ldlm_resource_desc, lr_padding)); + LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_padding) == 4, " found %lld\n", + (long long)(int)sizeof(((struct ldlm_resource_desc *)0)->lr_padding)); LASSERTF((int)offsetof(struct ldlm_resource_desc, lr_name) == 8, " found %lld\n", (long long)(int)offsetof(struct ldlm_resource_desc, lr_name)); LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_name) == 32, " found %lld\n", @@ -1194,6 +1241,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct ldlm_request, lock_flags)); LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_flags) == 4, " found %lld\n", (long long)(int)sizeof(((struct ldlm_request *)0)->lock_flags)); + LASSERTF((int)offsetof(struct ldlm_request, lock_padding) == 4, " found %lld\n", + (long long)(int)offsetof(struct ldlm_request, 
lock_padding)); + LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_padding) == 4, " found %lld\n", + (long long)(int)sizeof(((struct ldlm_request *)0)->lock_padding)); LASSERTF((int)offsetof(struct ldlm_request, lock_desc) == 8, " found %lld\n", (long long)(int)offsetof(struct ldlm_request, lock_desc)); LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_desc) == 80, " found %lld\n", @@ -1214,6 +1265,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct ldlm_reply, lock_flags)); LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_flags) == 4, " found %lld\n", (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_flags)); + LASSERTF((int)offsetof(struct ldlm_request, lock_padding) == 4, " found %lld\n", + (long long)(int)offsetof(struct ldlm_request, lock_padding)); + LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_padding) == 4, " found %lld\n", + (long long)(int)sizeof(((struct ldlm_request *)0)->lock_padding)); LASSERTF((int)offsetof(struct ldlm_request, lock_desc) == 8, " found %lld\n", (long long)(int)offsetof(struct ldlm_request, lock_desc)); LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_desc) == 80, " found %lld\n", @@ -1255,62 +1310,6 @@ void lustre_assert_wire_constants(void) LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_blocks) == 8, " found %lld\n", (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_blocks)); - /* Checks for struct ptlbd_op */ - LASSERTF((int)sizeof(struct ptlbd_op) == 12, " found %lld\n", - (long long)(int)sizeof(struct ptlbd_op)); - LASSERTF((int)offsetof(struct ptlbd_op, op_cmd) == 0, " found %lld\n", - (long long)(int)offsetof(struct ptlbd_op, op_cmd)); - LASSERTF((int)sizeof(((struct ptlbd_op *)0)->op_cmd) == 2, " found %lld\n", - (long long)(int)sizeof(((struct ptlbd_op *)0)->op_cmd)); - LASSERTF((int)offsetof(struct ptlbd_op, op_lun) == 2, " found %lld\n", - (long long)(int)offsetof(struct ptlbd_op, op_lun)); - LASSERTF((int)sizeof(((struct ptlbd_op *)0)->op_lun) == 2, " found %lld\n", 
- (long long)(int)sizeof(((struct ptlbd_op *)0)->op_lun)); - LASSERTF((int)offsetof(struct ptlbd_op, op_niob_cnt) == 4, " found %lld\n", - (long long)(int)offsetof(struct ptlbd_op, op_niob_cnt)); - LASSERTF((int)sizeof(((struct ptlbd_op *)0)->op_niob_cnt) == 2, " found %lld\n", - (long long)(int)sizeof(((struct ptlbd_op *)0)->op_niob_cnt)); - LASSERTF((int)offsetof(struct ptlbd_op, op__padding) == 6, " found %lld\n", - (long long)(int)offsetof(struct ptlbd_op, op__padding)); - LASSERTF((int)sizeof(((struct ptlbd_op *)0)->op__padding) == 2, " found %lld\n", - (long long)(int)sizeof(((struct ptlbd_op *)0)->op__padding)); - LASSERTF((int)offsetof(struct ptlbd_op, op_block_cnt) == 8, " found %lld\n", - (long long)(int)offsetof(struct ptlbd_op, op_block_cnt)); - LASSERTF((int)sizeof(((struct ptlbd_op *)0)->op_block_cnt) == 4, " found %lld\n", - (long long)(int)sizeof(((struct ptlbd_op *)0)->op_block_cnt)); - - /* Checks for struct ptlbd_niob */ - LASSERTF((int)sizeof(struct ptlbd_niob) == 24, " found %lld\n", - (long long)(int)sizeof(struct ptlbd_niob)); - LASSERTF((int)offsetof(struct ptlbd_niob, n_xid) == 0, " found %lld\n", - (long long)(int)offsetof(struct ptlbd_niob, n_xid)); - LASSERTF((int)sizeof(((struct ptlbd_niob *)0)->n_xid) == 8, " found %lld\n", - (long long)(int)sizeof(((struct ptlbd_niob *)0)->n_xid)); - LASSERTF((int)offsetof(struct ptlbd_niob, n_block_nr) == 8, " found %lld\n", - (long long)(int)offsetof(struct ptlbd_niob, n_block_nr)); - LASSERTF((int)sizeof(((struct ptlbd_niob *)0)->n_block_nr) == 8, " found %lld\n", - (long long)(int)sizeof(((struct ptlbd_niob *)0)->n_block_nr)); - LASSERTF((int)offsetof(struct ptlbd_niob, n_offset) == 16, " found %lld\n", - (long long)(int)offsetof(struct ptlbd_niob, n_offset)); - LASSERTF((int)sizeof(((struct ptlbd_niob *)0)->n_offset) == 4, " found %lld\n", - (long long)(int)sizeof(((struct ptlbd_niob *)0)->n_offset)); - LASSERTF((int)offsetof(struct ptlbd_niob, n_length) == 20, " found %lld\n", - (long 
long)(int)offsetof(struct ptlbd_niob, n_length)); - LASSERTF((int)sizeof(((struct ptlbd_niob *)0)->n_length) == 4, " found %lld\n", - (long long)(int)sizeof(((struct ptlbd_niob *)0)->n_length)); - - /* Checks for struct ptlbd_rsp */ - LASSERTF((int)sizeof(struct ptlbd_rsp) == 4, " found %lld\n", - (long long)(int)sizeof(struct ptlbd_rsp)); - LASSERTF((int)offsetof(struct ptlbd_rsp, r_status) == 0, " found %lld\n", - (long long)(int)offsetof(struct ptlbd_rsp, r_status)); - LASSERTF((int)sizeof(((struct ptlbd_rsp *)0)->r_status) == 2, " found %lld\n", - (long long)(int)sizeof(((struct ptlbd_rsp *)0)->r_status)); - LASSERTF((int)offsetof(struct ptlbd_rsp, r_error_cnt) == 2, " found %lld\n", - (long long)(int)offsetof(struct ptlbd_rsp, r_error_cnt)); - LASSERTF((int)sizeof(((struct ptlbd_rsp *)0)->r_error_cnt) == 2, " found %lld\n", - (long long)(int)sizeof(((struct ptlbd_rsp *)0)->r_error_cnt)); - /* Checks for struct llog_logid */ LASSERTF((int)sizeof(struct llog_logid) == 20, " found %lld\n", (long long)(int)sizeof(struct llog_logid)); @@ -1352,6 +1351,18 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct llog_catid, lci_logid)); LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_logid) == 20, " found %lld\n", (long long)(int)sizeof(((struct llog_catid *)0)->lci_logid)); + LASSERTF((int)offsetof(struct llog_catid, lci_padding1) == 20, " found %lld\n", + (long long)(int)offsetof(struct llog_catid, lci_padding1)); + LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding1) == 4, " found %lld\n", + (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding1)); + LASSERTF((int)offsetof(struct llog_catid, lci_padding2) == 24, " found %lld\n", + (long long)(int)offsetof(struct llog_catid, lci_padding2)); + LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding2) == 4, " found %lld\n", + (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding2)); + LASSERTF((int)offsetof(struct llog_catid, lci_padding3) == 28, " found %lld\n", + (long 
long)(int)offsetof(struct llog_catid, lci_padding3)); + LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding3) == 4, " found %lld\n", + (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding3)); /* Checks for struct llog_rec_hdr */ LASSERTF((int)sizeof(struct llog_rec_hdr) == 16, " found %lld\n", @@ -1368,6 +1379,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct llog_rec_hdr, lrh_type)); LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_type) == 4, " found %lld\n", (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_type)); + LASSERTF((int)offsetof(struct llog_rec_hdr, padding) == 12, " found %lld\n", + (long long)(int)offsetof(struct llog_rec_hdr, padding)); + LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->padding) == 4, " found %lld\n", + (long long)(int)sizeof(((struct llog_rec_hdr *)0)->padding)); /* Checks for struct llog_rec_tail */ LASSERTF((int)sizeof(struct llog_rec_tail) == 8, " found %lld\n", @@ -1392,6 +1407,26 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct llog_logid_rec, lid_id)); LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_id) == 20, " found %lld\n", (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_id)); + LASSERTF((int)offsetof(struct llog_logid_rec, padding1) == 36, " found %lld\n", + (long long)(int)offsetof(struct llog_logid_rec, padding1)); + LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->padding1) == 4, " found %lld\n", + (long long)(int)sizeof(((struct llog_logid_rec *)0)->padding1)); + LASSERTF((int)offsetof(struct llog_logid_rec, padding2) == 40, " found %lld\n", + (long long)(int)offsetof(struct llog_logid_rec, padding2)); + LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->padding2) == 4, " found %lld\n", + (long long)(int)sizeof(((struct llog_logid_rec *)0)->padding2)); + LASSERTF((int)offsetof(struct llog_logid_rec, padding3) == 44, " found %lld\n", + (long long)(int)offsetof(struct llog_logid_rec, padding3)); + 
LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->padding3) == 4, " found %lld\n", + (long long)(int)sizeof(((struct llog_logid_rec *)0)->padding3)); + LASSERTF((int)offsetof(struct llog_logid_rec, padding4) == 48, " found %lld\n", + (long long)(int)offsetof(struct llog_logid_rec, padding4)); + LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->padding4) == 4, " found %lld\n", + (long long)(int)sizeof(((struct llog_logid_rec *)0)->padding4)); + LASSERTF((int)offsetof(struct llog_logid_rec, padding5) == 52, " found %lld\n", + (long long)(int)offsetof(struct llog_logid_rec, padding5)); + LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->padding5) == 4, " found %lld\n", + (long long)(int)sizeof(((struct llog_logid_rec *)0)->padding5)); LASSERTF((int)offsetof(struct llog_logid_rec, lid_tail) == 56, " found %lld\n", (long long)(int)offsetof(struct llog_logid_rec, lid_tail)); LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_tail) == 8, " found %lld\n", @@ -1416,6 +1451,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct llog_create_rec, lcr_ogen)); LASSERTF((int)sizeof(((struct llog_create_rec *)0)->lcr_ogen) == 4, " found %lld\n", (long long)(int)sizeof(((struct llog_create_rec *)0)->lcr_ogen)); + LASSERTF((int)offsetof(struct llog_create_rec, padding) == 44, " found %lld\n", + (long long)(int)offsetof(struct llog_create_rec, padding)); + LASSERTF((int)sizeof(((struct llog_create_rec *)0)->padding) == 4, " found %lld\n", + (long long)(int)sizeof(((struct llog_create_rec *)0)->padding)); /* Checks for struct llog_orphan_rec */ LASSERTF((int)sizeof(struct llog_orphan_rec) == 40, " found %lld\n", @@ -1432,6 +1471,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct llog_orphan_rec, lor_ogen)); LASSERTF((int)sizeof(((struct llog_orphan_rec *)0)->lor_ogen) == 4, " found %lld\n", (long long)(int)sizeof(((struct llog_orphan_rec *)0)->lor_ogen)); + LASSERTF((int)offsetof(struct llog_orphan_rec, padding) == 28, " found 
%lld\n", + (long long)(int)offsetof(struct llog_orphan_rec, padding)); + LASSERTF((int)sizeof(((struct llog_orphan_rec *)0)->padding) == 4, " found %lld\n", + (long long)(int)sizeof(((struct llog_orphan_rec *)0)->padding)); LASSERTF((int)offsetof(struct llog_orphan_rec, lor_tail) == 32, " found %lld\n", (long long)(int)offsetof(struct llog_orphan_rec, lor_tail)); LASSERTF((int)sizeof(((struct llog_orphan_rec *)0)->lor_tail) == 8, " found %lld\n", @@ -1452,11 +1495,47 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct llog_unlink_rec, lur_ogen)); LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_ogen) == 4, " found %lld\n", (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_ogen)); + LASSERTF((int)offsetof(struct llog_unlink_rec, padding) == 28, " found %lld\n", + (long long)(int)offsetof(struct llog_unlink_rec, padding)); + LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->padding) == 4, " found %lld\n", + (long long)(int)sizeof(((struct llog_unlink_rec *)0)->padding)); LASSERTF((int)offsetof(struct llog_unlink_rec, lur_tail) == 32, " found %lld\n", (long long)(int)offsetof(struct llog_unlink_rec, lur_tail)); LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_tail) == 8, " found %lld\n", (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_tail)); + /* Checks for struct llog_setattr_rec */ + LASSERTF((int)sizeof(struct llog_setattr_rec) == 48, " found %lld\n", + (long long)(int)sizeof(struct llog_setattr_rec)); + LASSERTF((int)offsetof(struct llog_setattr_rec, lsr_hdr) == 0, " found %lld\n", + (long long)(int)offsetof(struct llog_setattr_rec, lsr_hdr)); + LASSERTF((int)sizeof(((struct llog_setattr_rec *)0)->lsr_hdr) == 16, " found %lld\n", + (long long)(int)sizeof(((struct llog_setattr_rec *)0)->lsr_hdr)); + LASSERTF((int)offsetof(struct llog_setattr_rec, lsr_oid) == 16, " found %lld\n", + (long long)(int)offsetof(struct llog_setattr_rec, lsr_oid)); + LASSERTF((int)sizeof(((struct llog_setattr_rec *)0)->lsr_oid) == 
8, " found %lld\n", + (long long)(int)sizeof(((struct llog_setattr_rec *)0)->lsr_oid)); + LASSERTF((int)offsetof(struct llog_setattr_rec, lsr_ogen) == 24, " found %lld\n", + (long long)(int)offsetof(struct llog_setattr_rec, lsr_ogen)); + LASSERTF((int)sizeof(((struct llog_setattr_rec *)0)->lsr_ogen) == 4, " found %lld\n", + (long long)(int)sizeof(((struct llog_setattr_rec *)0)->lsr_ogen)); + LASSERTF((int)offsetof(struct llog_setattr_rec, lsr_uid) == 28, " found %lld\n", + (long long)(int)offsetof(struct llog_setattr_rec, lsr_uid)); + LASSERTF((int)sizeof(((struct llog_setattr_rec *)0)->lsr_uid) == 4, " found %lld\n", + (long long)(int)sizeof(((struct llog_setattr_rec *)0)->lsr_uid)); + LASSERTF((int)offsetof(struct llog_setattr_rec, lsr_gid) == 32, " found %lld\n", + (long long)(int)offsetof(struct llog_setattr_rec, lsr_gid)); + LASSERTF((int)sizeof(((struct llog_setattr_rec *)0)->lsr_gid) == 4, " found %lld\n", + (long long)(int)sizeof(((struct llog_setattr_rec *)0)->lsr_gid)); + LASSERTF((int)offsetof(struct llog_setattr_rec, padding) == 36, " found %lld\n", + (long long)(int)offsetof(struct llog_setattr_rec, padding)); + LASSERTF((int)sizeof(((struct llog_setattr_rec *)0)->padding) == 4, " found %lld\n", + (long long)(int)sizeof(((struct llog_setattr_rec *)0)->padding)); + LASSERTF((int)offsetof(struct llog_setattr_rec, lsr_tail) == 40, " found %lld\n", + (long long)(int)offsetof(struct llog_setattr_rec, lsr_tail)); + LASSERTF((int)sizeof(((struct llog_setattr_rec *)0)->lsr_tail) == 8, " found %lld\n", + (long long)(int)sizeof(((struct llog_setattr_rec *)0)->lsr_tail)); + /* Checks for struct llog_size_change_rec */ LASSERTF((int)sizeof(struct llog_size_change_rec) == 48, " found %lld\n", (long long)(int)sizeof(struct llog_size_change_rec)); @@ -1472,6 +1551,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct llog_size_change_rec, lsc_io_epoch)); LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_io_epoch) == 4, " found 
%lld\n", (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_io_epoch)); + LASSERTF((int)offsetof(struct llog_size_change_rec, padding) == 36, " found %lld\n", + (long long)(int)offsetof(struct llog_size_change_rec, padding)); + LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->padding) == 4, " found %lld\n", + (long long)(int)sizeof(((struct llog_size_change_rec *)0)->padding)); LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_tail) == 40, " found %lld\n", (long long)(int)offsetof(struct llog_size_change_rec, lsc_tail)); LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_tail) == 8, " found %lld\n", @@ -1568,6 +1651,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct llog_cookie, lgc_index)); LASSERTF((int)sizeof(((struct llog_cookie *)0)->lgc_index) == 4, " found %lld\n", (long long)(int)sizeof(((struct llog_cookie *)0)->lgc_index)); + LASSERTF((int)offsetof(struct llog_cookie, lgc_padding) == 28, " found %lld\n", + (long long)(int)offsetof(struct llog_cookie, lgc_padding)); + LASSERTF((int)sizeof(((struct llog_cookie *)0)->lgc_padding) == 4, " found %lld\n", + (long long)(int)sizeof(((struct llog_cookie *)0)->lgc_padding)); /* Checks for struct llogd_body */ LASSERTF((int)sizeof(struct llogd_body) == 48, " found %lld\n", -- 1.8.3.1