Whamcloud - gitweb
e2fsck: add Lustre lfsck tool
authorAndreas Dilger <adilger@whamcloud.com>
Fri, 13 Apr 2012 08:32:19 +0000 (02:32 -0600)
committerAndreas Dilger <adilger@whamcloud.com>
Tue, 29 May 2012 08:09:27 +0000 (02:09 -0600)
The lfsck tool, in conjunction with e2fsck, builds a DB4 database
of all the inodes and objects on the MDT and OST filesystems.

The lfsck tool combines the databases on the Lustre client,
and can verify that all of the objects referenced by inodes
exist, are not referenced by two inodes, and have a parent
inode.

Signed-off-by: Andreas Dilger <adilger@whamcloud.com>
21 files changed:
.gitignore
MCONFIG.in
configure
configure.in
e2fsck/Makefile.in
e2fsck/e2fsck.8.in
e2fsck/e2fsck.c
e2fsck/e2fsck.h
e2fsck/lfsck.8.in [new file with mode: 0644]
e2fsck/lfsck.c [new file with mode: 0644]
e2fsck/lfsck_common.c [new file with mode: 0644]
e2fsck/pass1.c
e2fsck/pass6.c [new file with mode: 0644]
e2fsck/unix.c
e2fsprogs-RHEL-6.spec.in
e2fsprogs-SUSE_LINUX-11.spec.in
e2fsprogs.spec.in
lib/config.h.in
lib/ext2fs/lfsck.h [new file with mode: 0644]
misc/mke2fs.c
util/subst.conf.in

index d9fa6fb..d4405c5 100644 (file)
@@ -37,6 +37,8 @@ e2fsck/e2fsck.conf.5
 e2fsck/e2fsck.shared
 e2fsck/e2fsck.static
 e2fsck/gen_crc32table
+e2fsck/lfsck
+e2fsck/lfsck.8
 e2fsck/prof_err.c
 e2fsck/prof_err.h
 e2fsck/tst_crc32
index 6503fec..79bf968 100644 (file)
@@ -97,6 +97,9 @@ DEPLIBCOM_ERR = $(LIB)/libcom_err@LIB_EXT@
 DEPLIBUUID = @DEPLIBUUID@
 DEPLIBQUOTA = @DEPSTATIC_LIBQUOTA@
 DEPLIBBLKID = @DEPLIBBLKID@ @PRIVATE_LIBS_CMT@ $(DEPLIBUUID)
+#Version of libdb lib found by configure
+LIBDB = @libdb@
+
 
 STATIC_LIBSS = $(LIB)/libss@STATIC_LIB_EXT@ @DLOPEN_LIB@
 STATIC_LIBCOM_ERR = $(LIB)/libcom_err@STATIC_LIB_EXT@ @SEM_INIT_LIB@
index 20c8fff..d7ac2fc 100755 (executable)
--- a/configure
+++ b/configure
@@ -611,6 +611,7 @@ CYGWIN_CMT
 LINUX_CMT
 UNI_DIFF_OPTS
 SEM_INIT_LIB
+DB4VERSION
 SOCKET_LIB
 SIZEOF_LONG_LONG
 SIZEOF_LONG
@@ -712,6 +713,11 @@ PROFILE_CMT
 BSDLIB_CMT
 ELF_CMT
 HTREE_CMT
+WITH_LUSTRE
+LUSTRE
+ENABLE_LFSCK
+LFSCK_MAN
+LFSCK_CMT
 Q
 E
 LINK_BUILD_FLAGS
@@ -799,6 +805,8 @@ enable_symlink_install
 enable_symlink_relative_symlinks
 enable_symlink_build
 enable_verbose_makecmds
+enable_lfsck
+with_lustre
 enable_compression
 enable_htree
 enable_elf_shlibs
@@ -1454,6 +1462,7 @@ Optional Features:
   --enable-relative-symlinks use relative symlinks when installing
   --enable-symlink-build  use symlinks while building instead of hard links
   --enable-verbose-makecmds enable verbose make command output
+  --disable-lfsck        disable EXPERIMENTAL Lustre lfsck support
   --enable-compression   enable EXPERIMENTAL compression support
   --enable-htree         enable EXPERIMENTAL htree directory support
   --enable-elf-shlibs    select ELF shared libraries
@@ -1485,6 +1494,7 @@ Optional Packages:
   --with-ccopts           no longer supported, use CFLAGS= instead
   --with-ldopts           no longer supported, use LDFLAGS= instead
   --with-root-prefix=PREFIX override prefix variable for files to be placed in the root
+  --with-lustre=LUSTRE path to Lustre sources for lfsck (default=/usr for RPM)
   --with-gnu-ld           assume the C compiler uses GNU ld default=no
   --with-libiconv-prefix[=DIR]  search for libiconv in DIR/include and DIR/lib
   --without-libiconv-prefix     don't search for libiconv in includedir and libdir
@@ -4452,6 +4462,104 @@ fi
 
 
 
+# Check whether --enable-lfsck was given.
+if test "${enable_lfsck+set}" = set; then :
+  enableval=$enable_lfsck; if test "$enableval" = "no"
+then
+       LFSCK_CMT=#
+       LFSCK_MAN=.\"
+       ENABLE_LFSCK="--disable-lfsck"
+       echo "Disabling Lustre lfsck support"
+else
+       LFSCK_CMT=
+       LFSCK_MAN=
+       ENABLE_LFSCK="--enable-lfsck"
+
+$as_echo "#define HAVE_LFSCK 1" >>confdefs.h
+
+       echo "Enabling Lustre lfsck support"
+fi
+
+else
+  LFSCK_CMT=
+LFSCK_MAN=
+ENABLE_LFSCK=
+
+$as_echo "#define HAVE_LFSCK 1" >>confdefs.h
+
+echo "Enabling Lustre lfsck support by default"
+
+fi
+
+
+
+
+
+# Check whether --with-lustre was given.
+if test "${with_lustre+set}" = set; then :
+  withval=$with_lustre; { $as_echo "$as_me:${as_lineno-$LINENO}: result: LUSTRE is in $withval" >&5
+$as_echo "LUSTRE is in $withval" >&6; }
+       LUSTRE="$withval"
+       WITH_LUSTRE="--with-lustre=$withval"
+else
+  LUSTRE="/usr"
+       WITH_LUSTRE=
+
+fi
+
+
+if test "x$enable_lfsck" != "xno"; then :
+
+       as_ac_File=`$as_echo "ac_cv_file_$LUSTRE/include/lustre/liblustreapi.h" | $as_tr_sh`
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $LUSTRE/include/lustre/liblustreapi.h" >&5
+$as_echo_n "checking for $LUSTRE/include/lustre/liblustreapi.h... " >&6; }
+if { as_var=$as_ac_File; eval "test \"\${$as_var+set}\" = set"; }; then :
+  $as_echo_n "(cached) " >&6
+else
+  test "$cross_compiling" = yes &&
+  as_fn_error "cannot check for file existence when cross compiling" "$LINENO" 5
+if test -r "$LUSTRE/include/lustre/liblustreapi.h"; then
+  eval "$as_ac_File=yes"
+else
+  eval "$as_ac_File=no"
+fi
+fi
+eval ac_res=\$$as_ac_File
+              { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
+$as_echo "$ac_res" >&6; }
+eval as_val=\$$as_ac_File
+   if test "x$as_val" = x""yes; then :
+
+else
+  as_ac_File=`$as_echo "ac_cv_file_$LUSTRE/lustre/include/lustre/liblustreapi.h" | $as_tr_sh`
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $LUSTRE/lustre/include/lustre/liblustreapi.h" >&5
+$as_echo_n "checking for $LUSTRE/lustre/include/lustre/liblustreapi.h... " >&6; }
+if { as_var=$as_ac_File; eval "test \"\${$as_var+set}\" = set"; }; then :
+  $as_echo_n "(cached) " >&6
+else
+  test "$cross_compiling" = yes &&
+  as_fn_error "cannot check for file existence when cross compiling" "$LINENO" 5
+if test -r "$LUSTRE/lustre/include/lustre/liblustreapi.h"; then
+  eval "$as_ac_File=yes"
+else
+  eval "$as_ac_File=no"
+fi
+fi
+eval ac_res=\$$as_ac_File
+              { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
+$as_echo "$ac_res" >&6; }
+eval as_val=\$$as_ac_File
+   if test "x$as_val" = x""yes; then :
+
+else
+  as_fn_error "No lustre includes found." "$LINENO" 5
+fi
+
+
+fi
+
+
+fi
 # Check whether --enable-compression was given.
 if test "${enable_compression+set}" = set; then :
   enableval=$enable_compression; if test "$enableval" = "no"
 
 done
 
+for ac_header in db.h
+do :
+  ac_fn_c_check_header_mongrel "$LINENO" "db.h" "ac_cv_header_db_h" "$ac_includes_default"
+if test "x$ac_cv_header_db_h" = x""yes; then :
+  cat >>confdefs.h <<_ACEOF
+#define HAVE_DB_H 1
+_ACEOF
+
+fi
+
+done
+
 for ac_func in vprintf
 do :
   ac_fn_c_check_func "$LINENO" "vprintf" "ac_cv_func_vprintf"
@@ -10972,6 +11092,315 @@ if test "x$ac_cv_lib_socket_socket" = x""yes; then :
 fi
 
 
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for db_env_create in -ldb-4.1" >&5
+$as_echo_n "checking for db_env_create in -ldb-4.1... " >&6; }
+if test "${ac_cv_lib_db_4_1_db_env_create+set}" = set; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_check_lib_save_LIBS=$LIBS
+LIBS="-ldb-4.1  $LIBS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char db_env_create ();
+int
+main ()
+{
+return db_env_create ();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  ac_cv_lib_db_4_1_db_env_create=yes
+else
+  ac_cv_lib_db_4_1_db_env_create=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_db_4_1_db_env_create" >&5
+$as_echo "$ac_cv_lib_db_4_1_db_env_create" >&6; }
+if test "x$ac_cv_lib_db_4_1_db_env_create" = x""yes; then :
+  DB4VERSION=4.1
+
+$as_echo "#define HAVE_DB4 1" >>confdefs.h
+
+
+fi
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for db_env_create in -ldb-4.2" >&5
+$as_echo_n "checking for db_env_create in -ldb-4.2... " >&6; }
+if test "${ac_cv_lib_db_4_2_db_env_create+set}" = set; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_check_lib_save_LIBS=$LIBS
+LIBS="-ldb-4.2  $LIBS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char db_env_create ();
+int
+main ()
+{
+return db_env_create ();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  ac_cv_lib_db_4_2_db_env_create=yes
+else
+  ac_cv_lib_db_4_2_db_env_create=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_db_4_2_db_env_create" >&5
+$as_echo "$ac_cv_lib_db_4_2_db_env_create" >&6; }
+if test "x$ac_cv_lib_db_4_2_db_env_create" = x""yes; then :
+  DB4VERSION=4.2
+
+$as_echo "#define HAVE_DB4 1" >>confdefs.h
+
+
+fi
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for db_env_create in -ldb-4.3" >&5
+$as_echo_n "checking for db_env_create in -ldb-4.3... " >&6; }
+if test "${ac_cv_lib_db_4_3_db_env_create+set}" = set; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_check_lib_save_LIBS=$LIBS
+LIBS="-ldb-4.3  $LIBS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char db_env_create ();
+int
+main ()
+{
+return db_env_create ();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  ac_cv_lib_db_4_3_db_env_create=yes
+else
+  ac_cv_lib_db_4_3_db_env_create=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_db_4_3_db_env_create" >&5
+$as_echo "$ac_cv_lib_db_4_3_db_env_create" >&6; }
+if test "x$ac_cv_lib_db_4_3_db_env_create" = x""yes; then :
+  DB4VERSION=4.3
+
+$as_echo "#define HAVE_DB4 1" >>confdefs.h
+
+
+fi
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for db_env_create in -ldb-4.5" >&5
+$as_echo_n "checking for db_env_create in -ldb-4.5... " >&6; }
+if test "${ac_cv_lib_db_4_5_db_env_create+set}" = set; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_check_lib_save_LIBS=$LIBS
+LIBS="-ldb-4.5  $LIBS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char db_env_create ();
+int
+main ()
+{
+return db_env_create ();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  ac_cv_lib_db_4_5_db_env_create=yes
+else
+  ac_cv_lib_db_4_5_db_env_create=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_db_4_5_db_env_create" >&5
+$as_echo "$ac_cv_lib_db_4_5_db_env_create" >&6; }
+if test "x$ac_cv_lib_db_4_5_db_env_create" = x""yes; then :
+  DB4VERSION=4.5
+
+$as_echo "#define HAVE_DB4 1" >>confdefs.h
+
+
+fi
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for db_env_create in -ldb-4.6" >&5
+$as_echo_n "checking for db_env_create in -ldb-4.6... " >&6; }
+if test "${ac_cv_lib_db_4_6_db_env_create+set}" = set; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_check_lib_save_LIBS=$LIBS
+LIBS="-ldb-4.6  $LIBS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char db_env_create ();
+int
+main ()
+{
+return db_env_create ();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  ac_cv_lib_db_4_6_db_env_create=yes
+else
+  ac_cv_lib_db_4_6_db_env_create=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_db_4_6_db_env_create" >&5
+$as_echo "$ac_cv_lib_db_4_6_db_env_create" >&6; }
+if test "x$ac_cv_lib_db_4_6_db_env_create" = x""yes; then :
+  DB4VERSION=4.6
+
+$as_echo "#define HAVE_DB4 1" >>confdefs.h
+
+
+fi
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for db_env_create in -ldb-4.7" >&5
+$as_echo_n "checking for db_env_create in -ldb-4.7... " >&6; }
+if test "${ac_cv_lib_db_4_7_db_env_create+set}" = set; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_check_lib_save_LIBS=$LIBS
+LIBS="-ldb-4.7  $LIBS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char db_env_create ();
+int
+main ()
+{
+return db_env_create ();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  ac_cv_lib_db_4_7_db_env_create=yes
+else
+  ac_cv_lib_db_4_7_db_env_create=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_db_4_7_db_env_create" >&5
+$as_echo "$ac_cv_lib_db_4_7_db_env_create" >&6; }
+if test "x$ac_cv_lib_db_4_7_db_env_create" = x""yes; then :
+  DB4VERSION=4.7
+
+$as_echo "#define HAVE_DB4 1" >>confdefs.h
+
+
+fi
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for db_env_create in -ldb-4.8" >&5
+$as_echo_n "checking for db_env_create in -ldb-4.8... " >&6; }
+if test "${ac_cv_lib_db_4_8_db_env_create+set}" = set; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_check_lib_save_LIBS=$LIBS
+LIBS="-ldb-4.8  $LIBS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char db_env_create ();
+int
+main ()
+{
+return db_env_create ();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  ac_cv_lib_db_4_8_db_env_create=yes
+else
+  ac_cv_lib_db_4_8_db_env_create=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_db_4_8_db_env_create" >&5
+$as_echo "$ac_cv_lib_db_4_8_db_env_create" >&6; }
+if test "x$ac_cv_lib_db_4_8_db_env_create" = x""yes; then :
+  DB4VERSION=4.8
+
+$as_echo "#define HAVE_DB4 1" >>confdefs.h
+
+
+fi
+
+
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for optreset" >&5
 $as_echo_n "checking for optreset... " >&6; }
 if test "${ac_cv_have_optreset+set}" = set; then :
index fc48a7e..9905742 100644 (file)
@@ -226,6 +226,62 @@ Q=@
 AC_SUBST(E)
 AC_SUBST(Q)
 dnl
+dnl handle --enable-lfsck
+dnl
+AC_ARG_ENABLE([lfsck],
+[  --disable-lfsck               disable EXPERIMENTAL Lustre lfsck support],
+if test "$enableval" = "no"
+then
+       LFSCK_CMT=#
+       LFSCK_MAN=.\"
+       ENABLE_LFSCK="--disable-lfsck"
+       echo "Disabling Lustre lfsck support"
+else
+       LFSCK_CMT=
+       LFSCK_MAN=
+       ENABLE_LFSCK="--enable-lfsck"
+       AC_DEFINE(HAVE_LFSCK, 1, [Define to 1 if Lustre lfsck is enabled])
+       echo "Enabling Lustre lfsck support"
+fi
+,
+LFSCK_CMT=
+LFSCK_MAN=
+ENABLE_LFSCK=
+AC_DEFINE(HAVE_LFSCK, 1, [Define to 1 if Lustre lfsck is enabled])
+echo "Enabling Lustre lfsck support by default"
+)
+AC_SUBST(LFSCK_CMT)
+AC_SUBST(LFSCK_MAN)
+AC_SUBST(ENABLE_LFSCK)
+dnl
+dnl set lustre include path and build lfsck
+dnl
+AC_ARG_WITH([lustre],
+[  --with-lustre=LUSTRE path to Lustre sources for lfsck (default=/usr for RPM)],
+AC_MSG_RESULT(LUSTRE is in $withval)
+       LUSTRE="$withval"
+       WITH_LUSTRE="--with-lustre=$withval",
+       LUSTRE="/usr"
+       WITH_LUSTRE=
+)dnl
+AC_SUBST(LUSTRE)
+AC_SUBST(WITH_LUSTRE)
+dnl
+AS_IF([test "x$enable_lfsck" != "xno"],
+[
+       AC_CHECK_FILE($LUSTRE/include/lustre/liblustreapi.h,,
+               AC_CHECK_FILE($LUSTRE/lustre/include/lustre/liblustreapi.h,,
+                       AC_MSG_ERROR([No lustre includes found.]))
+       )
+dnl    # Disable for old autoconf that doesn't check header usability.
+dnl    # Supposed to check if lustre_idl.h is buildable from userspace.
+dnl    AC_CHECK_HEADER($LUSTRE/lustre/include/lustre/lustre_idl.h,
+dnl                    AC_DEFINE(HAVE_LUSTRE_LUSTRE_IDL_H),
+dnl            AC_CHECK_HEADER($LUSTRE/include/lustre/lustre_idl.h,
+dnl                            AC_DEFINE(HAVE_LUSTRE_LUSTRE_IDL_H),)
+dnl    )
+])
+dnl
 dnl handle --enable-compression
 dnl
 AC_ARG_ENABLE([compression],
@@ -861,6 +917,7 @@ AC_CHECK_HEADERS(net/if.h,,,
 #include <sys/socket.h>
 #endif
 ]])
+AC_CHECK_HEADERS(db.h)
 AC_FUNC_VPRINTF
 dnl Check to see if dirent has member d_reclen. On cygwin those d_reclen
 dnl is not decleared.
@@ -1028,6 +1085,38 @@ SOCKET_LIB=''
 AC_CHECK_LIB(socket, socket, [SOCKET_LIB=-lsocket])
 AC_SUBST(SOCKET_LIB)
 dnl
+dnl Check to see if libdb exists for the sake of lfsck
+dnl
+AC_CHECK_LIB(db-4.1, db_env_create,
+       [DB4VERSION=4.1
+       AC_DEFINE(HAVE_DB4, 1, [Define to 1 if DB4 library is present])
+])
+AC_CHECK_LIB(db-4.2, db_env_create,
+       [DB4VERSION=4.2
+       AC_DEFINE(HAVE_DB4, 1, [Define to 1 if DB4 library is present])
+])
+AC_CHECK_LIB(db-4.3, db_env_create,
+       [DB4VERSION=4.3
+       AC_DEFINE(HAVE_DB4, 1, [Define to 1 if DB4 library is present])
+])
+AC_CHECK_LIB(db-4.5, db_env_create,
+       [DB4VERSION=4.5
+       AC_DEFINE(HAVE_DB4, 1, [Define to 1 if DB4 library is present])
+])
+AC_CHECK_LIB(db-4.6, db_env_create,
+       [DB4VERSION=4.6
+       AC_DEFINE(HAVE_DB4, 1, [Define to 1 if DB4 library is present])
+])
+AC_CHECK_LIB(db-4.7, db_env_create,
+       [DB4VERSION=4.7
+       AC_DEFINE(HAVE_DB4, 1, [Define to 1 if DB4 library is present])
+])
+AC_CHECK_LIB(db-4.8, db_env_create,
+       [DB4VERSION=4.8
+       AC_DEFINE(HAVE_DB4, 1, [Define to 1 if DB4 library is present])
+])
+AC_SUBST(DB4VERSION)
+dnl
 dnl See if optreset exists
 dnl
 AC_MSG_CHECKING(for optreset)
index d558985..7ec03d5 100644 (file)
@@ -12,33 +12,40 @@ INSTALL = @INSTALL@
 @MCONFIG@
 
 PROGS=         e2fsck
+@LFSCK_CMT@USPROGS= lfsck
 MANPAGES=      e2fsck.8
 FMANPAGES=     e2fsck.conf.5
+@LFSCK_CMT@MANPAGES   += lfsck.8
+XTRA_CFLAGS=   -DRESOURCE_TRACK -I.
 
+@LFSCK_CMT@LFSCK_LIBS=-ldb-@DB4VERSION@
+@LFSCK_CMT@LUSTRE_INC=-I @LUSTRE@/lustre/include -I @LUSTRE@/include -I @LUSTRE@/libcfs/include -Wall
+@LFSCK_CMT@LUSTRE_LIB=-L @LUSTRE@/lustre/utils -L @LUSTRE@/lib64 -L @LUSTRE@/lib
 LIBS= $(LIBQUOTA) $(LIBEXT2FS) $(LIBCOM_ERR) $(LIBBLKID) $(LIBUUID) \
-       $(LIBINTL) $(LIBE2P)
+       $(LIBINTL) $(LIBE2P) $(LFSCK_LIBS)
 DEPLIBS= $(DEPLIBQUOTA) $(LIBEXT2FS) $(DEPLIBCOM_ERR) $(DEPLIBBLKID) \
-        $(DEPLIBUUID) $(DEPLIBE2P)
+        $(DEPLIBUUID) $(DEPLIBE2P) $(LFSCK_LIBS)
 
 STATIC_LIBS= $(STATIC_LIBQUOTA) $(STATIC_LIBEXT2FS) $(STATIC_LIBCOM_ERR) \
-            $(STATIC_LIBBLKID) $(STATIC_LIBUUID) $(LIBINTL) $(STATIC_LIBE2P)
+            $(STATIC_LIBBLKID) $(STATIC_LIBUUID) $(LIBINTL) $(STATIC_LIBE2P) \
+            $(LFSCK_LIBS)
 STATIC_DEPLIBS= $(DEPSTATIC_LIBQUOTA) $(STATIC_LIBEXT2FS) \
                $(DEPSTATIC_LIBCOM_ERR) $(DEPSTATIC_LIBBLKID) \
-               $(DEPSTATIC_LIBUUID) $(DEPSTATIC_LIBE2P)
+               $(DEPSTATIC_LIBUUID) $(DEPSTATIC_LIBE2P) $(LFSCK_LIBS)
 
 PROFILED_LIBS= $(PROFILED_LIBQUOTA) $(PROFILED_LIBEXT2FS) \
               $(PROFILED_LIBCOM_ERR) $(PROFILED_LIBBLKID) $(PROFILED_LIBUUID) \
-              $(PROFILED_LIBE2P) $(LIBINTL) \
+              $(PROFILED_LIBE2P) $(LIBINTL) $(LFSCK_LIBS)
 PROFILED_DEPLIBS= $(DEPPROFILED_LIBQUOTA) $(PROFILED_LIBEXT2FS) \
                  $(DEPPROFILED_LIBCOM_ERR) $(DEPPROFILED_LIBBLKID) \
-                 $(DEPPROFILED_LIBUUID) $(DEPPROFILED_LIBE2P)
+                 $(DEPPROFILED_LIBUUID) $(DEPPROFILED_LIBE2P) $(LFSCK_LIBS)
 
 COMPILE_ET=$(top_builddir)/lib/et/compile_et --build-tree
 
 .c.o:
        $(E) "  CC $<"
-       $(Q) $(CC) -c $(ALL_CFLAGS) $< -o $@
-@PROFILE_CMT@  $(Q) $(CC) $(ALL_CFLAGS) -g -pg -o profiled/$*.o -c $<
+       $(Q) $(CC) -c $(ALL_CFLAGS) $(LUSTRE_INC) $< -o $@
+@PROFILE_CMT@  $(Q) $(CC) $(ALL_CFLAGS) $(LUSTRE_INC) -g -pg -o profiled/$*.o -c $<
 
 #
 # Flags for using Checker
@@ -65,21 +72,25 @@ COMPILE_ET=$(top_builddir)/lib/et/compile_et --build-tree
 #MCHECK= -DMCHECK
 
 OBJS= crc32.o dict.o unix.o e2fsck.o super.o pass1.o pass1b.o pass2.o \
-       pass3.o pass4.o pass5.o journal.o badblocks.o util.o dirinfo.o \
+       pass3.o pass4.o pass5.o pass6.o journal.o badblocks.o util.o dirinfo.o \
        dx_dirinfo.o ehandler.o problem.o message.o quota.o recovery.o \
        region.o revoke.o ea_refcount.o rehash.o profile.o prof_err.o \
        logfile.o sigcatcher.o $(MTRACE_OBJ)
+@LFSCK_CMT@OBJS += lfsck_common.o
+
+@LFSCK_CMT@LFSCK_OBJS = lfsck_common.o lfsck.o
 
 PROFILED_OBJS= profiled/dict.o profiled/unix.o profiled/e2fsck.o \
        profiled/super.o profiled/pass1.o profiled/pass1b.o \
        profiled/pass2.o profiled/pass3.o profiled/pass4.o profiled/pass5.o \
-       profiled/journal.o profiled/badblocks.o profiled/util.o \
+       profiled/pass6.o profiled/journal.o profiled/badblocks.o profiled/util.o \
        profiled/dirinfo.o profiled/dx_dirinfo.o profiled/ehandler.o \
        profiled/message.o profiled/problem.o profiled/quota.o \
        profiled/recovery.o profiled/region.o profiled/revoke.o \
        profiled/ea_refcount.o profiled/rehash.o profiled/profile.o \
        profiled/crc32.o profiled/prof_err.o profiled/logfile.o \
        profiled/sigcatcher.o
+@LFSCK_CMT@PROFILED_OBJS += profiled/lfsck_common.o
 
 SRCS= $(srcdir)/e2fsck.c \
        $(srcdir)/crc32.c \
@@ -92,6 +103,7 @@ SRCS= $(srcdir)/e2fsck.c \
        $(srcdir)/pass3.c \
        $(srcdir)/pass4.c \
        $(srcdir)/pass5.c \
+       $(srcdir)/pass6.c \
        $(srcdir)/journal.c \
        $(srcdir)/recovery.c \
        $(srcdir)/revoke.c \
@@ -113,7 +125,10 @@ SRCS= $(srcdir)/e2fsck.c \
        $(srcdir)/quota.c \
        $(MTRACE_SRC)
 
-all:: profiled $(PROGS) e2fsck $(MANPAGES) $(FMANPAGES)
+@LFSCK_CMT@SRCS += $(srcdir)/lfsck_common.c
+
+@LFSCK_CMT@LFSCK_SRCS = $(srcdir)/lfsck_common.c $(srcdir)/lfsck.c
+all:: profiled $(PROGS) $(USPROGS) e2fsck $(MANPAGES) $(FMANPAGES)
 
 @PROFILE_CMT@all:: e2fsck.profiled
 
@@ -178,6 +193,10 @@ check:: tst_refcount tst_region tst_crc32 tst_problem
        LD_LIBRARY_PATH=$(LIB) DYLD_LIBRARY_PATH=$(LIB) ./tst_crc32
        LD_LIBRARY_PATH=$(LIB) DYLD_LIBRARY_PATH=$(LIB) ./tst_problem
 
+@LFSCK_CMT@lfsck: $(LFSCK_OBJS)
+@LFSCK_CMT@    @echo " LD $@"
+@LFSCK_CMT@    $(LD) -pthread $(ALL_LDFLAGS) $(LUSTRE_INC) $(LFSCK_OBJS) -o lfsck $(LUSTRE_LIB) -llustreapi $(DEPLIBS)
+
 extend: extend.o
        $(E) "  LD $@"
        $(Q) $(LD) $(ALL_LDFLAGS) -o extend extend.o $(CHECKLIB)
@@ -201,6 +220,9 @@ profiled:
 @PROFILE_CMT@  $(E) "  MKDIR $@"
 @PROFILE_CMT@  $(Q) mkdir profiled
 
+lfsck.8: $(DEP_SUBSTITUTE) $(srcdir)/lfsck.8.in
+       $(SUBSTITUTE_UPTIME) $(srcdir)/lfsck.8.in lfsck.8
+
 e2fsck.8: $(DEP_SUBSTITUTE) $(srcdir)/e2fsck.8.in
        $(E) "  SUBST $@"
        $(Q) $(SUBSTITUTE_UPTIME) $(srcdir)/e2fsck.8.in e2fsck.8
@@ -210,15 +232,19 @@ e2fsck.conf.5: $(DEP_SUBSTITUTE) $(srcdir)/e2fsck.conf.5.in
        $(Q) $(SUBSTITUTE_UPTIME) $(srcdir)/e2fsck.conf.5.in e2fsck.conf.5
 
 installdirs:
-       $(E) "  MKINSTALLDIRS $(root_sbindir) $(man8dir)"
+       $(E) "  MKINSTALLDIRS $(root_sbindir) $(sbindir) $(man8dir)"
        $(Q) $(MKINSTALLDIRS) $(DESTDIR)$(root_sbindir) \
-               $(DESTDIR)$(man8dir) $(DESTDIR)$(man5dir)
+               $(DESTDIR)$(man8dir) $(DESTDIR)$(man5dir) $(DESTDIR)$(sbindir)
 
-install: $(PROGS) $(MANPAGES) $(FMANPAGES) installdirs
+install: $(PROGS)  $(USPROGS) $(MANPAGES) $(FMANPAGES) installdirs
        $(Q) for i in $(PROGS); do \
                echo "  INSTALL $(root_sbindir)/$$i"; \
                $(INSTALL_PROGRAM) $$i $(DESTDIR)$(root_sbindir)/$$i; \
        done
+       @for i in $(USPROGS); do \
+               echo "  INSTALL $(sbindir)/$$i"; \
+               $(INSTALL_PROGRAM) $$i $(DESTDIR)$(sbindir)/$$i; \
+       done
        $(Q) for i in ext2 ext3 ext4 ext4dev; do \
                echo "  LINK $(root_sbindir)/fsck.$$i"; \
                (cd $(DESTDIR)$(root_sbindir); \
@@ -254,6 +280,9 @@ uninstall:
        for i in $(PROGS); do \
                $(RM) -f $(DESTDIR)$(root_sbindir)/$$i; \
        done
+       for i in $(USPROGS); do \
+               $(RM) -f $(DESTDIR)$(sbindir)/$$i; \
+       done
        $(RM) -f $(DESTDIR)$(root_sbindir)/fsck.ext2 \
                $(DESTDIR)$(root_sbindir)/fsck.ext3 \
                $(DESTDIR)$(root_sbindir)/fsck.ext4 \
@@ -270,7 +299,7 @@ uninstall:
                        $(DESTDIR)$(root_sbindir)/fsck.ext4dev
 
 clean:
-       $(RM) -f $(PROGS) \#* *\# *.s *.o *.a *~ core e2fsck.static \
+       $(RM) -f $(PROGS) $(USPROGS) \#* *\# *.s *.o *.a *~ core e2fsck.static \
                e2fsck.shared e2fsck.profiled flushb e2fsck.8 \
                tst_problem tst_crc32 tst_region tst_refcount gen_crc32table \
                crc32table.h e2fsck.conf.5 prof_err.c prof_err.h \
@@ -379,6 +408,12 @@ pass5.o: $(srcdir)/pass5.c $(top_builddir)/lib/config.h \
  $(srcdir)/profile.h prof_err.h $(top_srcdir)/lib/quota/mkquota.h \
  $(top_srcdir)/lib/quota/quota.h $(top_srcdir)/lib/../e2fsck/dict.h \
  $(srcdir)/problem.h
+pass6.o: $(srcdir)/pass6.c $(top_builddir)/lib/config.h $(srcdir)/e2fsck.h \
+ $(top_srcdir)/lib/ext2fs/ext2_fs.h $(top_builddir)/lib/ext2fs/ext2_types.h \
+ $(top_srcdir)/lib/ext2fs/ext2fs.h $(top_srcdir)/lib/ext2fs/ext2_fs.h \
+ $(top_srcdir)/lib/et/com_err.h $(top_srcdir)/lib/ext2fs/ext2_io.h \
+ $(top_builddir)/lib/ext2fs/ext2_err.h $(top_srcdir)/lib/ext2fs/bitops.h \
+ $(srcdir)/problem.h $(top_srcdir)/lib/ext2fs/lfsck.h
 journal.o: $(srcdir)/journal.c $(top_builddir)/lib/config.h \
  $(top_builddir)/lib/dirpaths.h $(srcdir)/jfs_user.h $(srcdir)/e2fsck.h \
  $(top_srcdir)/lib/ext2fs/ext2_fs.h $(top_builddir)/lib/ext2fs/ext2_types.h \
@@ -519,6 +554,11 @@ region.o: $(srcdir)/region.c $(top_builddir)/lib/config.h \
 profile.o: $(srcdir)/profile.c $(top_builddir)/lib/config.h \
  $(top_builddir)/lib/dirpaths.h $(top_srcdir)/lib/et/com_err.h \
  $(srcdir)/profile.h prof_err.h
+lfsck.o: $(srcdir)/lfsck.c $(top_builddir)/lib/config.h \
+ $(top_builddir)/lib/dirpaths.h $(srcdir)/e2fsck.h \
+ $(top_srcdir)/lib/ext2fs/lfsck.h $(srcdir)/lfsck_common.c \
+ $(top_srcdir)/lib/ext2fs/ext2_fs.h $(top_builddir)/lib/ext2fs/ext2_types.h \
+ $(top_srcdir)/lib/ext2fs/ext2fs.h
 sigcatcher.o: $(srcdir)/sigcatcher.c $(top_builddir)/lib/config.h \
  $(top_builddir)/lib/dirpaths.h $(srcdir)/e2fsck.h \
  $(top_srcdir)/lib/ext2fs/ext2_fs.h $(top_builddir)/lib/ext2fs/ext2_types.h \
index f64afcf..08f5fba 100644 (file)
@@ -333,6 +333,20 @@ may not be specified at the same time as the
 or
 .B \-p
 options.
+@LFSCK_MAN@.TP
+@LFSCK_MAN@.BI \-\-mdsdb " mds_database_filename"
+@LFSCK_MAN@Specify which file to use for an MDS database on a Lustre filesystem.
+@LFSCK_MAN@The mdsdb also needs to be specified when creating an OST database with
+@LFSCK_MAN@.B \-\-ostdb
+@LFSCK_MAN@but the mdsdb file is used in a read-only manner.
+@LFSCK_MAN@.TP
+@LFSCK_MAN@.BI \-\-ostdb " ostN_database_filename"
+@LFSCK_MAN@Specify which file to use for an OST database on a Lustre filesystem.
+@LFSCK_MAN@A separate database file should be specified for each OST device in
+@LFSCK_MAN@the filesystem.  For convenience a stub
+@LFSCK_MAN@.BR mdsdb .hdr
+@LFSCK_MAN@file is generated for use by the OST e2fsck to avoid the need to
+@LFSCK_MAN@wait for the MDS e2fsck to finish or copy the full mdsdb to the OSTs.
 .SH EXIT CODE
 The exit code returned by
 .B e2fsck
index d8609d6..de99e56 100644 (file)
@@ -197,8 +197,11 @@ void e2fsck_free_context(e2fsck_t ctx)
 typedef void (*pass_t)(e2fsck_t ctx);
 
 pass_t e2fsck_passes[] = {
-       e2fsck_pass1, e2fsck_pass2, e2fsck_pass3, e2fsck_pass4,
-       e2fsck_pass5, 0 };
+       e2fsck_pass1, e2fsck_pass2, e2fsck_pass3, e2fsck_pass4, e2fsck_pass5,
+#ifdef HAVE_LFSCK
+       e2fsck_pass6,
+#endif
+       0 };
 
 #define E2F_FLAG_RUN_RETURN    (E2F_FLAG_SIGNAL_MASK|E2F_FLAG_RESTART)
 
index c5a4083..2dc43df 100644 (file)
@@ -168,6 +168,7 @@ struct resource_track {
 #define E2F_OPT_FRAGCHECK      0x0800
 #define E2F_OPT_JOURNAL_ONLY   0x1000 /* only replay the journal */
 #define E2F_OPT_DISCARD                0x2000
+#define E2F_OPT_VERBOSE                0x4000
 
 /*
  * E2fsck flags
@@ -194,6 +195,13 @@ struct resource_track {
 
 #define E2F_RESET_FLAGS (E2F_FLAG_TIME_INSANE)
 
+/* Defines for Lustre */
+#define LUSTRE_NULL 0x0000
+#define LUSTRE_MDS  0x0001
+#define LUSTRE_OST  0x0002
+#define LUSTRE_TYPE 0x000f
+#define LUSTRE_ONLY 0x1000
+
 /*
  * Defines for indicating the e2fsck pass number
  */
@@ -354,6 +362,13 @@ struct e2fsck_struct {
         * Ext4 quota support
         */
        quota_ctx_t qctx;
+
+       /* lustre support */
+       int                     lustre_devtype;
+       char                    *lustre_mdsdb;
+       char                    *lustre_ostdb;
+       struct lfsck_outdb_info *lfsck_oinfo;
+
 #ifdef RESOURCE_TRACK
        /*
         * For timing purposes
@@ -437,6 +452,7 @@ extern void e2fsck_pass2(e2fsck_t ctx);
 extern void e2fsck_pass3(e2fsck_t ctx);
 extern void e2fsck_pass4(e2fsck_t ctx);
 extern void e2fsck_pass5(e2fsck_t ctx);
+extern void e2fsck_pass6(e2fsck_t ctx);
 
 /* e2fsck.c */
 extern errcode_t e2fsck_allocate_context(e2fsck_t *ret);
diff --git a/e2fsck/lfsck.8.in b/e2fsck/lfsck.8.in
new file mode 100644 (file)
index 0000000..1090c60
--- /dev/null
@@ -0,0 +1,78 @@
+.\" -*- nroff -*-
+.\" Copyright 2004 by Andreas Dilger.  All Rights Reserved.
+.\" This file may be copied under the terms of the GNU Public License.
+.\"
+.TH LFSCK 8 "@E2FSPROGS_MONTH@ @E2FSPROGS_YEAR@" "E2fsprogs version @E2FSPROGS_VERSION@"
+.SH NAME
+lfsck \- check a Lustre distributed filesystem
+.SH SYNOPSIS
+.B lfsck
+[
+.B \-cdfhlnv
+]
+.B \-\-mdsdb
+.I mds_database_file
+.B \-\-ostdb
+.IR "ost1_database_file " [ "ost2_database_file " ...]
+.I filesystem
+.SH DESCRIPTION
+.B lfsck
+is used to check and repair the distributed coherency of a Lustre filesystem.
+.PP
+.I filesystem
+is the mountpoint of the Lustre filesystem to be checked (e.g.
+.IR /mnt/lustre ).
+.SH DESCRIPTION
+Lfsck is used to do the distributed coherency checking of a Lustre filesystem
+after
+.B e2fsck
+has been run on each of the local MDS and OST device filesystems.
+.B Lfsck
+will ensure that objects are not referenced by multiple MDS files, that there
+are no orphan objects on the OSTs (objects that do not have any file on the
+MDS which references them), and that all of the objects referenced by the
+MDS exist.  Under normal circumstances Lustre will maintain such coherency
+by distributed logging mechanisms, but under exceptional circumstances that
+may fail (e.g. disk failure, filesystem corruption leading to e2fsck repair).
+.SH OPTIONS
+.TP
+.B \-c
+Create (empty) missing OST objects referenced by MDS inodes.
+.TP
+.B \-d
+Delete orphaned objects from the filesystem.  Since objects on the OST are
+often only one of several stripes of a file it can be difficult to put
+multiple objects back together into a single usable file.
+.TP
+.B \-h
+Print a brief help message.
+.TP
+.B \-l
+Put orphaned objects into a lost+found directory in the root of the filesystem.
+.TP
+.B \-n
+Do not repair the filesystem, just perform a read-only check (default).
+.TP
+.B \-v
+Verbose operation - more verbosity by specifying the option multiple times.
+.TP
+.BI \-\-mdsdb " mds_database_file"
+The MDS database file created by running
+.B e2fsck \-\-mdsdb mds_database_file device
+on the MDS backing device.  This is required.
+.TP
+.BI \-\-ostdb " ost1_database_file " [ "ost2_database_file " ...]
+The OST database files created by running
+.B e2fsck \-\-ostdb ost_database_file device
+on each of the OST backing devices.  These are required, unless an OST is
+unavailable, in which case all objects thereon will be considered missing.
+.SH REPORTING BUGS
+Bugs should be reported to Sun Microsystems, Inc. via Bugzilla:
+http://bugzilla.lustre.org/
+.SH AUTHOR
+This version of
+.B lfsck
+was originally written by Liam Kelleher <liam.kelleher@hp.com>
+and maintained by Andreas Dilger <adilger@whamcloud.com>
+.SH SEE ALSO
+.BR e2fsck (8)
diff --git a/e2fsck/lfsck.c b/e2fsck/lfsck.c
new file mode 100644 (file)
index 0000000..68f3e46
--- /dev/null
@@ -0,0 +1,2111 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004  Hewlett-Packard Co.
+ * Copyright 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ * Copyright (c) 2011  Whamcloud, Inc.
+ */
+/***********************************************************************
+ * This program takes as input files generated by running e2fsck on
+ * the mds and ost filesystems. The file generated for each ost
+ * contains a table including the object id and size for each object
+ * extant on the ost in each entry.
+ * The file generated from scanning the mds filesystem with e2fsck
+ * contains multiple tables one for each ost. In each table an entry
+ * contains the mds fid as well as the object id on the appropriate
+ * ost. In addition there is an additional table that holds the mds_fid
+ * and the containing directory fid for each entry. This is used for
+ * name lookup.
+ * There are three basic checks
+ * 1) Make sure that multiple mds entries do not reference the same object
+ * 2) Cross reference each object on each ost to make sure a "containing"
+ *    file for this exists on the mds
+ * 3) For each file on the mds make sure that the associated objects exist
+ *    on the osts
+ * These checks and potential correction for errors found are run from
+ * run_pass*
+ * Each of these checks just iterates through the appropriate table,
+ * cross-checks it against another table, and repairs any errors found.
+ ***************************************************************************/
+#include "config.h"
+#define _GNU_SOURCE
+#include <pthread.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <string.h>
+#include <time.h>
+#include <errno.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <utime.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <getopt.h>
+#include <mntent.h>
+#include <dirent.h>
+#ifdef HAVE_SYS_IOCTL_H
+#include <sys/ioctl.h>
+#endif
+#include "../version.h"
+#include "e2fsck.h"
+#include "ext2fs/lfsck.h"
+#include "ext2fs/ext2fs.h"
+
+/* FID chain produced by lfsck_get_fids() and walked by lfsck_get_path(). */
+struct lfsck_fids {
+       /* index of the last entry in fids[]; lfsck_get_fids() allocates
+        * depth + 1 entries */
+       int depth;
+       /* malloc()ed array, freed by lfsck_get_path(); fids[0] comes from
+        * the record of the FID that was looked up, each higher index from
+        * the record of the next parent directory */
+       struct lu_fid *fids;
+};
+
+/* Per-thread work description.
+ * NOTE(review): the code that fills in and consumes this struct is outside
+ * this chunk; presumably each worker handles the OST indexes from
+ * start_ost_idx to end_ost_idx and reports its result in status - confirm. */
+struct lfsck_thread_info {
+       struct lfsck_mds_hdr *mds_hdr;
+       DB *mds_direntdb;
+       DB *mds_sizeinfodb;
+       __u32 start_ost_idx;
+       __u32 end_ost_idx;
+       int status;
+};
+
+/* One saved MDS reference to an OST object that has more than one reference;
+ * entries are appended to lfsck_duplicates[] by lfsck_save_duplicate(). */
+struct lfsck_saved_duplicates {
+       struct lu_fid   ld_mds_fid;     /* copied from mds_obj->mds_fid */
+       struct ost_id   ld_oi;          /* copied from mds_obj->mds_oi */
+       __u32           ld_ost_idx;     /* copied from mds_obj->mds_ostidx */
+       /* set to NULL when the entry is saved.
+        * NOTE(review): presumably filled in later by the repair pass,
+        * which is outside this chunk - confirm */
+       char            *ld_link;
+};
+
+/* Let O_LARGEFILE be OR-ed into open() flags even where it is not defined. */
+#ifndef O_LARGEFILE
+#define O_LARGEFILE 0
+#endif
+
+/* Log file that log_open() opens in append mode. */
+#define LOG_PATH "/var/log/lfsck.log"
+/* lfsck_duplicates[] is allocated and grown in multiples of RLIMIT entries. */
+#define RLIMIT 1024
+
+/* Procedure declarations */
+
+char *progname = "lfsck";
+/* Set by log_open(), reset to NULL by log_close(); log_write() skips the
+ * log file while this is NULL. */
+FILE *logfile;
+
+/* Command-line switches; each is a counter bumped by parse_args()
+ * (-h, -l, -d, -c, -f, -v, -y respectively).  -n resets save/delete/create. */
+int lfsck_help;
+int lfsck_save;
+int lfsck_delete;
+int lfsck_create;
+int lfsck_force;
+int lfsck_verbose;
+int lfsck_yes;
+
+/* Value of -t/--threads as parsed by parse_args(); defaults to 1. */
+int num_threads = 1;
+
+/* realpath() of the filesystem argument, filled in by parse_args(). */
+char mnt_path[PATH_MAX];
+/* Absolute path of the --mdsdb file (malloc()ed in parse_args()). */
+char *mds_file;
+char lostandfounddir[PATH_MAX];
+char dupedir[PATH_MAX];
+/* Absolute paths of the --ostdb files and how many of them were given. */
+char *ost_files[LOV_MAX_OSTS];
+int num_ost_files;
+
+/* Both are handed to llapi_lov_get_uuids() by get_lov_config().
+ * NOTE(review): lov_tgt_count is passed by address, so it is presumably
+ * updated to the real target count - confirm against the llapi docs. */
+struct obd_uuid lfsck_uuid[LOV_MAX_OSTS];
+int lov_tgt_count = LOV_MAX_OSTS;
+
+/* Growable array of saved duplicates and the number of entries in use;
+ * both are modified under phase_lock in lfsck_save_duplicate(). */
+struct lfsck_saved_duplicates *lfsck_duplicates;
+int lfsck_dup_saved;
+/* Counters of successful and failed repairs. */
+int fixed;
+int fix_failed;
+
+/* NOTE(review): init_cond, init_mutex and all_started are not used in the
+ * part of the file visible here; presumably thread start-up handshake. */
+pthread_cond_t init_cond = PTHREAD_COND_INITIALIZER;
+pthread_mutex_t init_mutex = PTHREAD_MUTEX_INITIALIZER;
+/* Held by lfsck_save_duplicate() while it updates lfsck_duplicates[]. */
+pthread_mutex_t phase_lock = PTHREAD_MUTEX_INITIALIZER;
+/* Held by lfsck_calc_size() around its sizeinfo database get/put. */
+pthread_mutex_t size_lock = PTHREAD_MUTEX_INITIALIZER;
+int all_started;
+
+/*
+ * Translate a pathname into a FID.
+ *
+ * Returns 0 (or whatever non-negative value the final ioctl returned) on
+ * success and a negative errno on failure.  On the emulation path *fid may
+ * be partially filled in when an error is returned.
+ */
+int lfsck_path2fid(const char *path, struct lu_fid *fid)
+{
+       struct stat stbuf;
+       long gen;
+       int result;
+       int file;
+
+#ifdef LL_IOC_PATH2FID
+       /* Anything but -ENOTTY (success included) is the final answer. */
+       result = llapi_path2fid(path, fid);
+       if (result != -ENOTTY)
+               return result;
+#endif
+       /* LL_IOC_PATH2FID was landed in 1.8.2.  If it doesn't exist at compile
+        * time, or it fails at runtime with a return ENOTTY indicating that
+        * the ioctl is unimplemented, emulate it here for the older clients.
+        * Assume the server is running Lustre 1.x and create an IGIF FID,
+        * since < 1.8.4 will not work properly with 2.x servers anyway. */
+       file = open(path, O_RDONLY);
+       if (file < 0)
+               return -errno;
+
+       result = fstat(file, &stbuf);
+       if (result < 0) {
+               result = -errno;
+       } else {
+               /* IGIF FID: sequence is the inode number ... */
+               fid->f_seq = stbuf.st_ino;
+
+               result = ioctl(file, FSFILT_IOC_GETVERSION, &gen);
+               if (result < 0) {
+                       result = -errno;
+               } else {
+                       /* ... and the object id is the inode generation */
+                       fid->f_oid = gen;
+                       fid->f_ver = 0;
+               }
+       }
+
+       close(file);
+       return result;
+}
+
+/* printf() the message on stdout when the global lfsck_verbose level is at
+ * least lvl; wrapped in do/while (0) so it behaves as a single statement. */
+#define VERBOSE(lvl, fmt, args...)                                     \
+do { if (lfsck_verbose >= lvl) printf(fmt, ## args); } while (0)
+
+/*
+ * Open LOG_PATH for appending and write the start-of-run banner.
+ * Returns 0 on success, -EPERM if the file cannot be opened.
+ */
+int log_open()
+{
+       static const char separator[] =
+               "===============================================\n\n";
+       time_t now;
+
+       logfile = fopen(LOG_PATH, "a");
+       if (!logfile) {
+               fprintf(stderr, "%s: Failed to open log file %s\n",
+                       progname, LOG_PATH);
+               return -EPERM;
+       }
+
+       now = time(NULL);
+       fputs(separator, logfile);
+       /* ctime() already ends in '\n', so this leaves a blank line */
+       fprintf(logfile, "Starting lfsck %s\n", ctime(&now));
+       return 0;
+}
+
+/*
+ * Write the end-of-run banner and close the log file, if one is open.
+ * A negative status is logged as an aborted run.  Always returns 0.
+ */
+int log_close(int status)
+{
+       time_t now;
+
+       if (!logfile)
+               return 0;
+
+       now = time(NULL);
+       if (status >= 0)
+               fprintf(logfile, "lfsck run completed:  %s\n", ctime(&now));
+       else
+               fputs("ERROR: lfsck aborted\n", logfile);
+
+       fputs("===============================================\n\n", logfile);
+
+       fclose(logfile);
+       logfile = NULL;
+       return 0;
+}
+
+static void log_write(char *fmt, ...) __attribute__ ((format (printf, 1, 2)));
+
+/* printf-style message written to the log file (when it is open) and then
+ * to stderr. */
+static void log_write(char *fmt, ...)
+{
+       va_list ap;
+
+       va_start(ap, fmt);
+       if (logfile != NULL) {
+               va_list ap_log;
+
+               /* a va_list may only be consumed once, so format the
+                * log-file copy from a duplicate */
+               va_copy(ap_log, ap);
+               vfprintf(logfile, fmt, ap_log);
+               va_end(ap_log);
+       }
+       vfprintf(stderr, fmt, ap);
+       va_end(ap);
+}
+
+/* Print the command-line synopsis and the option summary on stdout. */
+void usage()
+{
+       printf("\n");
+       printf("usage: lfsck [-cdfhlnv] [-t threads] --mdsdb mdsdb "
+              "--ostdb ostdb1 [ostdb2 ...] filesystem\n\n");
+       printf("\t-m|--mdsdb mdsdb  MDS database from e2fsck --mdsdb\n");
+       printf("\t-o|--ostdb ostdb  OST databases from e2fsck --ostdb\n");
+       printf("\tfilesystem        full path of lustre mountpoint\n");
+       printf("\t[-c|--create]     create missing objects\n");
+       printf("\t[-d|--delete]     delete orphan objects\n");
+       printf("\t[-f|--force]      force running if fs appears unmounted\n");
+       printf("\t[-h|--help]       print this message\n");
+       printf("\t[-l|--lostfound]  save orphan objects to lost+found\n");
+       printf("\t[-n|--nofix]      do not fix filesystem errors (default)\n");
+       /* parse_args() accepts -t and clamps it to the range 1..128 */
+       printf("\t[-t|--threads n]  number of threads to use (1-128)\n");
+       printf("\t[-v|--verbose]    print verbose runtime messages\n");
+       /* printf("\t[-y|--yes]        do all cleanup automatically\n"); */
+       printf("\n");
+}
+
+/*
+ * Get the lov config for the filesystem - this is primarily used to correlate
+ * each ost db file with its index in the lov configuration. Obviously this is
+ * critical.
+ */
+/* Returns the result of llapi_lov_get_uuids(), or a negative errno if the
+ * mountpoint cannot be opened. */
+int get_lov_config()
+{
+       int fd, rc;
+
+       fd = open(mnt_path, O_RDONLY);
+       if (fd < 0) {
+               /* capture errno first: log_write() does stdio output,
+                * which is allowed to change it */
+               rc = -errno;
+               log_write("Error: opening %s\n", mnt_path);
+               return rc;
+       }
+
+       rc = llapi_lov_get_uuids(fd, lfsck_uuid, &lov_tgt_count);
+
+       close(fd);
+       return rc;
+}
+
+/*
+ * Build the absolute path of database file @arg: the directory part is
+ * resolved with realpath() and the base name is appended to it, so the file
+ * itself does not have to be resolvable.
+ *
+ * On success returns 0 and stores a malloc()ed string in *@out.  Returns
+ * -ENOMEM if the buffer cannot be allocated, -ENAMETOOLONG if the name does
+ * not fit in PATH_MAX, or the negative errno left by realpath().
+ */
+static int lfsck_resolve_dbpath(const char *arg, char **out)
+{
+       char tmp[PATH_MAX];
+       const char *base;
+       char *dbpath;
+       size_t dirlen;
+       int rc;
+
+       *out = NULL;
+       if (strlen(arg) >= sizeof(tmp))
+               return -ENAMETOOLONG;
+
+       dbpath = malloc(PATH_MAX);
+       if (dbpath == NULL)
+               return -ENOMEM;
+
+       /* my_dirname()/my_basename() are always handed a scratch copy here,
+        * presumably because they may modify their argument */
+       strcpy(tmp, arg);
+       if (realpath(my_dirname(tmp), dbpath) == NULL) {
+               rc = errno ? -errno : -EINVAL;
+               free(dbpath);
+               return rc;
+       }
+
+       strcpy(tmp, arg);
+       base = my_basename(tmp);
+       dirlen = strlen(dbpath);
+       /* directory + '/' + base name + NUL must fit in the buffer */
+       if (dirlen + 1 + strlen(base) >= PATH_MAX) {
+               free(dbpath);
+               return -ENAMETOOLONG;
+       }
+       sprintf(dbpath + dirlen, "/%s", base);
+
+       *out = dbpath;
+       return 0;
+}
+
+/*
+ * Parse the command line into the lfsck_* option globals, mds_file,
+ * ost_files[]/num_ost_files, num_threads and mnt_path.
+ *
+ * Non-option words are OST database files, except for the last word on the
+ * command line, which is the Lustre mountpoint.  The mountpoint must be
+ * listed as a Lustre filesystem in the mount table unless --force is given.
+ *
+ * Returns 0 on success, -EINVAL for bad usage and -ENOMEM on allocation
+ * failure; exits with status 1 if a database path cannot be resolved.
+ */
+int parse_args(int argc, char *argv[])
+{
+       int option_index = 0;
+       char *path_name = NULL;
+       struct option long_options[] = {
+               /* --create takes no argument, exactly like -c */
+               { "create", 0, NULL, 'c' },
+               { "delete", 0, NULL, 'd' },
+               { "force", 0, NULL, 'f' },
+               { "help", 0, NULL, 'h' },
+               { "lostfound", 0, NULL, 'l' },
+               { "mdsdb", 1, NULL, 'm' },
+               { "mdtdb", 1, NULL, 'm' },
+               { "nofix", 0, NULL, 'n' },
+               { "ostdb", 1, NULL, 'o' },
+               { "threads", 1, NULL, 't' },
+               { "verbose", 0, NULL, 'v' },
+               /* { "yes", 0, NULL, 'y' }, */
+               { 0, 0, 0, 0 }
+       };
+       struct mntent *mnt_info;
+       char *dbpath;
+       long nthreads;
+       int c, found, rc;
+       char *p1;
+       FILE *fp;
+
+       if (argc < 6 )
+               return -EINVAL;
+
+       /* The leading '-' makes getopt_long() return every non-option word
+        * as the argument of option code 1, in command-line order. */
+       while ((c = getopt_long(argc, argv, "-cdfhlm:no:t:vy",
+                               long_options, &option_index)) != -1) {
+               switch (c) {
+               case 'c':
+                       lfsck_create++;
+                       break;
+               case 'd':
+                       lfsck_delete++;
+                       break;
+               case 'f':
+                       lfsck_force++;
+                       break;
+               case 'h':
+                       lfsck_help++;
+                       break;
+               case 'l':
+                       lfsck_save++;
+                       break;
+               case 'm':
+                       VERBOSE(1, "MDSDB: %s\n", optarg);
+                       rc = lfsck_resolve_dbpath(optarg, &dbpath);
+                       if (rc == -ENOMEM) {
+                               fprintf(stderr, "error allocating dbpath\n");
+                               return -ENOMEM;
+                       }
+                       if (rc != 0) {
+                               fprintf(stderr, "Failure to resolve path %s\n",
+                                       optarg);
+                               exit(1);
+                       }
+                       mds_file = dbpath;
+                       break;
+               case 'n':
+                       lfsck_create = 0;
+                       lfsck_delete = 0;
+                       lfsck_save = 0;
+                       break;
+               case 1:
+                       if (optind == argc) { /* last one is mountpoint */
+                               VERBOSE(1, "MOUNTPOINT: %s\n", optarg);
+                               path_name = optarg;
+                               break;
+                       }
+                       /* Otherwise it is another ostdb file */
+                       /* fall through */
+               case 'o':
+                       VERBOSE(1, "OSTDB[%u]: %s\n", num_ost_files, optarg);
+                       p1 = optarg;
+                       do {
+                               char *ost_path;
+
+                               /* Old-style arguments are comma separated */
+                               ost_path = strsep(&p1, ",");
+
+                               /* never write past the end of ost_files[] */
+                               if (num_ost_files >= LOV_MAX_OSTS) {
+                                       fprintf(stderr, "too many OST database"
+                                               " files (max %d)\n",
+                                               (int)LOV_MAX_OSTS);
+                                       return -EINVAL;
+                               }
+
+                               rc = lfsck_resolve_dbpath(ost_path, &dbpath);
+                               if (rc == -ENOMEM) {
+                                       fprintf(stderr,
+                                              "error allocate ost_files[%d]\n",
+                                              num_ost_files);
+                                       return -ENOMEM;
+                               }
+                               if (rc != 0) {
+                                       fprintf(stderr, "Failure to resolve "
+                                               "path %s\n", ost_path);
+                                       for (c = 0; c < num_ost_files; c++)
+                                               free(ost_files[c]);
+                                       exit(1);
+                               }
+
+                               ost_files[num_ost_files] = dbpath;
+                               num_ost_files++;
+                       } while (p1 != NULL);
+                       break;
+               case 't':
+                       /* reject out-of-range and negative counts, then
+                        * clamp to 1..128; non-numeric input parses as 0
+                        * and therefore selects a single thread */
+                       errno = 0;
+                       nthreads = strtol(optarg, NULL, 0);
+                       if (errno == ERANGE || nthreads < 0)
+                               return -EINVAL;
+
+                       if (nthreads > 128)
+                               nthreads = 128;
+                       if (nthreads == 0)
+                               nthreads = 1;
+                       num_threads = nthreads;
+                       break;
+               case 'v':
+                       lfsck_verbose++;
+                       break;
+               case 'y':
+                       lfsck_yes++;
+                       break;
+               default:
+                       fprintf(stderr, "unknown option %c\n", c);
+                       return -EINVAL;
+               }
+       }
+
+       if (path_name == NULL)
+               path_name = argv[optind];
+
+       if (lfsck_yes && !lfsck_save && !lfsck_delete && !lfsck_create) {
+               fprintf(stderr, "--yes requires either --lostfound or "
+                       "--delete, or --create\n");
+               return -EINVAL;
+       }
+
+       /* argv[optind] is NULL when no mountpoint was given at all */
+       if (path_name == NULL || realpath(path_name, mnt_path) == NULL) {
+               fprintf(stderr, "error getting real mount path_name\n");
+               return -EINVAL;
+       }
+       fp = setmntent(MOUNTED, "r");
+       if (fp == NULL) {
+               fprintf(stderr, "error opening /etc/mtab\n");
+               return -EINVAL;
+       }
+
+       /* look for a Lustre mount whose mountpoint is exactly mnt_path */
+       found = 0;
+       while ((mnt_info = getmntent(fp)) != NULL) {
+               if (strcmp(mnt_info->mnt_dir, mnt_path) == 0 &&
+                   llapi_is_lustre_mnttype(mnt_info->mnt_type)) {
+                       found = 1;
+                       break;
+               }
+       }
+       endmntent(fp);
+
+       if (!found) {
+               if (lfsck_force) {
+                       fprintf(stderr, "lfsck: %s unmounted?  forcing\n",
+                               mnt_path);
+               } else {
+                       fprintf(stderr, "lfsck: %s not mounted\n", mnt_path);
+                       return -EINVAL;
+               }
+       }
+
+       if (mds_file == NULL || ost_files[0] == NULL) {
+               fprintf(stderr, "--mdsdb or --ostdb unspecified\n");
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+/*
+ * This is called from lfsck_get_path and also recursively.
+ * This function is used on error paths when the name of an mds fid has
+ * to be determined. It relies on the order of directory search that was
+ * run in e2fsck when generating the mds_fid + containing dir table.
+ * It searches through the table for the correct mds_fid. When this
+ * is found a list of fids which are the fids of the directory tree up
+ * to the containing directory of the sought for fid is returned.
+ * When called recursively it continues search from the current point and
+ * when the recursive call returns the search is continued from the
+ * current search point as well. Basically it just traverses the list once.
+ * For a file like <mntpt>/aaa/ccc/ddd the fids of aaa ccc and the fid
+ * for ddd would also be returned.
+ */
+/* Returns 0 with lfidp->fids allocated (the caller frees it), -ENOENT if a
+ * FID is not in the dirent database, or -ENOMEM.  @depth is 0 for the
+ * initial call and grows by one per recursion level. */
+static int lfsck_get_fids(struct lu_fid *mds_fid, DB *mds_direntdb,
+                         int depth, struct lfsck_fids *lfidp)
+{
+       struct lfsck_mds_dirent mds_dirent;
+       int rc = 0;
+       DBT key, data;
+
+       /* Keyed lookup of mds_fid; the record is copied into mds_dirent
+        * (DB_DBT_USERMEM = caller-supplied buffer). */
+       memset(&key, 0, sizeof(key));
+       memset(&data, 0, sizeof(data));
+       key.data = mds_fid;
+       key.size = sizeof(*mds_fid);
+       data.data = &mds_dirent;
+       data.size = data.ulen = sizeof(mds_dirent);
+       data.flags = DB_DBT_USERMEM;
+
+       rc = mds_direntdb->get(mds_direntdb, NULL, &key, &data, 0);
+       if (rc && !lfsck_is_dirfid_root(mds_fid)) {
+               log_write("Failed to find fid "DFID": %s\n", PFID(mds_fid),
+                         db_strerror(rc));
+               return -ENOENT;
+       }
+       /* NOTE(review): if the lookup failed but mds_fid is a root FID we
+        * still get here, and mds_dirent is then uninitialised - confirm
+        * whether that case can happen and how it should be handled. */
+       letocpu_mds_dirent(&mds_dirent);
+       if (lfsck_is_dirfid_root(&mds_dirent.mds_dirfid)) {
+               /* End of the chain: only now is the total depth known, so
+                * allocate the array (indexes 0..depth) and fill the last
+                * slot; the callers fill theirs as the recursion unwinds. */
+               lfidp->fids = malloc(sizeof(*lfidp->fids) * (depth + 1));
+               if (lfidp->fids == NULL)
+                       return -ENOMEM;
+
+               lfidp->depth = depth;
+               lfidp->fids[depth] = mds_dirent.mds_fid;
+               return 0;
+       }
+       /* Continue with the containing directory, one level further up. */
+       rc = lfsck_get_fids(&mds_dirent.mds_dirfid, mds_direntdb,
+                           depth + 1, lfidp);
+       if (rc)
+               return rc;
+
+       lfidp->fids[depth] = mds_dirent.mds_fid;
+       return 0;
+}
+
+/* This function determines a path to a file given an mds fid.
+ * The workhorse function is lfsck_get_fids which once given a
+ * fid return a list of directory fids from the "root" directory to
+ * the fid in question. Using these fids we can construct the path to
+ * the file by using readdir()
+ */
+/*
+ * @path receives mnt_path followed by one "/name" component per directory
+ * level; @path_len is the size of the @path buffer including the NUL.
+ *
+ * Returns 0 on success, -ENOENT if the FID chain cannot be built or a
+ * component is not found in its directory, -ENOMEM if @path is too small,
+ * -ENAMETOOLONG if a candidate name exceeds PATH_MAX, or the negative
+ * errno from opendir().
+ */
+static int lfsck_get_path(struct lu_fid *mds_fid, DB *mds_direntdb,
+                         char *path, int path_len)
+{
+       struct lfsck_fids lfids;
+       DIR *dir;
+       struct dirent *dent;
+       size_t mnt_len = strlen(mnt_path);
+       int rc, i;
+       int cur_len = 0;
+
+       VERBOSE(2, "lookup path for FID "DFID"\n", PFID(mds_fid));
+
+       lfids.fids = NULL;
+       lfids.depth = 0;
+
+       rc = lfsck_get_fids(mds_fid, mds_direntdb, 0, &lfids);
+       if (rc != 0) {
+               rc = -ENOENT;
+               goto out;
+       }
+
+       if (path_len <= 0 || mnt_len + 1 > (size_t)path_len) {
+               rc = -ENOMEM;
+               goto out;
+       }
+       memcpy(path, mnt_path, mnt_len);
+       path[mnt_len] = 0;
+       cur_len = mnt_len;
+       /* Skip the first dir since this would be "ROOT" */
+       rc = 0;
+       for (i = lfids.depth - 1; i >= 0; i--) {
+               dir = opendir(path);
+               if (dir == NULL) {
+                       rc = -errno;
+                       goto out;
+               }
+               /* scan the directory for the entry whose FID is fids[i] */
+               while (1) {
+                       char path_tmp[PATH_MAX];
+                       struct lu_fid fid;
+                       size_t name_len;
+
+                       dent = readdir(dir);
+                       if (dent == NULL) {
+                               closedir(dir);
+                               rc = -ENOENT;
+                               goto out;
+                       }
+
+                       /* path is always NUL-terminated at cur_len */
+                       name_len = strlen(dent->d_name);
+                       if (cur_len + name_len + 2 >= PATH_MAX) {
+                               closedir(dir);
+                               rc = -ENAMETOOLONG;
+                               goto out;
+                       }
+
+                       sprintf(path_tmp, "%s/%s", path, dent->d_name);
+                       rc = lfsck_path2fid(path_tmp, &fid);
+                       if (rc)
+                               continue;
+
+                       if (lfsck_fidcmp(&fid, &lfids.fids[i]) != 0)
+                               continue;
+
+                       /* '/' + name + terminating NUL must all fit: the NUL
+                        * lands at index cur_len + 1 + name_len, which has to
+                        * stay below path_len */
+                       if (cur_len + 1 + name_len >= (size_t)path_len) {
+                               rc = -ENOMEM;
+                               closedir(dir);
+                               goto out;
+                       }
+                       path[cur_len] = '/';
+                       cur_len++;
+                       memcpy(&path[cur_len], dent->d_name, name_len);
+                       cur_len += name_len;
+                       path[cur_len] = 0;
+                       closedir(dir);
+                       break;
+               }
+       }
+out:
+       if (lfids.fids)
+               free(lfids.fids);
+       return rc;
+}
+
+/*
+ * Used by pass1 to save the ids of files which reference the same
+ * objects. This is then used by pass4 to repair these files
+ */
+/*
+ * Append @mds_obj to the global lfsck_duplicates[] array, growing the array
+ * in units of RLIMIT entries.  The array and its counter are updated under
+ * phase_lock, which is released on every exit path.
+ *
+ * Returns 0 on success or -ENOMEM if the array cannot be (re)allocated; the
+ * entries saved so far are kept in that case.
+ */
+static int lfsck_save_duplicate(const struct lfsck_mds_objent *mds_obj)
+{
+       struct lfsck_saved_duplicates *dup;
+       int rc = 0;
+
+       VERBOSE(2, "save duplicate object %u:"DOIF" FID "DFID"\n",
+               mds_obj->mds_ostidx, POIF(&mds_obj->mds_oi),
+               PFID(&mds_obj->mds_fid));
+
+       pthread_mutex_lock(&phase_lock);
+       if (lfsck_duplicates == NULL) {
+               lfsck_duplicates = malloc(sizeof(*lfsck_duplicates) *
+                                         RLIMIT);
+               if (lfsck_duplicates == NULL) {
+                       rc = -ENOMEM;
+                       goto out_unlock;
+               }
+       } else if (!((lfsck_dup_saved + 1) % RLIMIT)) {
+               /* about to fill the last slot of the current allocation:
+                * add room for another RLIMIT entries */
+               size_t size = (((lfsck_dup_saved + 1) / RLIMIT) + 1) *
+                             sizeof(*lfsck_duplicates) * RLIMIT;
+               void *tmp = realloc(lfsck_duplicates, size);
+
+               if (tmp == NULL) {
+                       rc = -ENOMEM;
+                       goto out_unlock;
+               }
+
+               lfsck_duplicates = tmp;
+       }
+       dup = &lfsck_duplicates[lfsck_dup_saved];
+       dup->ld_mds_fid = mds_obj->mds_fid;
+       dup->ld_oi = mds_obj->mds_oi;
+       dup->ld_ost_idx = mds_obj->mds_ostidx;
+       dup->ld_link = NULL;
+       lfsck_dup_saved++;
+out_unlock:
+       pthread_mutex_unlock(&phase_lock);
+       return rc;
+}
+
+/*
+ * Check for duplicate ost objects on mds. Run through the table of
+ * mds_fid/ost object to make sure that each ost object is only
+ * referenced by one mds entry. If a duplicate is found save the information
+ * for repair in pass4
+ */
+/*
+ * Walks @mds_ostdb with a cursor.  For every key that has more than one
+ * record, the first record is saved and then each further record is saved
+ * unless its MDS FID equals that of the first record (logged as a hard
+ * link).  @ost_db and @mds_direntdb are not used by this pass.
+ *
+ * Returns 0 when the whole table was read, a negative errno on a database
+ * error, or the raw Berkeley DB error if the cursor cannot be created.
+ */
+int lfsck_run_pass1(__u32 ost_idx, DB *mds_ostdb, DB *ost_db, DB *mds_direntdb)
+{
+       db_recno_t i;
+       int error = 0;          /* number of duplicate references found */
+       int rc = 0;
+       struct lfsck_mds_objent mds_obj, mds_obj2;
+       unsigned long count = 0;
+       DBT key, data;
+       DBC *dbcp = NULL;
+
+       log_write("%s: ost_idx %d: pass1: check for duplicate objects\n",
+                 progname, ost_idx);
+
+       rc = mds_ostdb->cursor(mds_ostdb, NULL, &dbcp, 0);
+       if (rc != 0) {
+               log_write("%s: error acquiring cursor for database: %s\n",
+                         progname, db_strerror(rc));
+               goto out;
+       }
+       memset(&key, 0, sizeof(key));
+       memset(&data, 0, sizeof(data));
+       data.data = &mds_obj;
+       data.size = data.ulen = sizeof(mds_obj);
+       data.flags = DB_DBT_USERMEM;
+
+       while ((rc = dbcp->c_get(dbcp, &key, &data, DB_NEXT)) == 0)  {
+               DBT data_dup;
+               db_recno_t num_dup;
+
+               count++;
+               /* number of records stored under the current key */
+               rc = dbcp->c_count(dbcp, &num_dup, 0);
+               if (rc != 0) {
+                       log_write("%s: [%u] getting object refcount: %s\n",
+                                 progname, ost_idx, db_strerror(rc));
+                       rc = -EINVAL;
+                       goto out;
+               }
+               if (num_dup <= 1)
+                       continue;
+
+               letocpu_mds_objent(&mds_obj);
+
+               /* NOTE(review): the first reference is saved even if every
+                * other record turns out to be a hard link of the same FID;
+                * kept as-is because the repair pass is outside this chunk */
+               if (lfsck_save_duplicate(&mds_obj))
+                       fix_failed++;
+
+               /* step the same cursor over the remaining num_dup - 1
+                * records of this key */
+               for (i = 1; i < num_dup; i++) {
+                       memset(&data_dup, 0, sizeof(data_dup));
+                       data_dup.data = &mds_obj2;
+                       data_dup.size = data_dup.ulen = sizeof(mds_obj2);
+                       data_dup.flags = DB_DBT_USERMEM;
+                       rc = dbcp->c_get(dbcp, &key, &data_dup, DB_NEXT);
+                       if (rc != 0) {
+                               log_write("%s: acquiring duplicate info: %s\n",
+                                         progname, db_strerror(rc));
+                               rc = -EINVAL;
+                               goto out;
+                       }
+                       letocpu_mds_objent(&mds_obj2);
+
+                       if (!lfsck_fidcmp(&mds_obj.mds_fid, &mds_obj2.mds_fid)){
+                               log_write("%s: [%u] hard link on FID "DFID" is"
+                                         " not a duplicate object "DOIF"\n",
+                                         progname, ost_idx,
+                                         PFID(&mds_obj.mds_fid),
+                                         POIF(&mds_obj.mds_oi));
+                               continue;
+                       }
+
+                       /* a different file references the same object; count
+                        * it so the summary below reports it instead of
+                        * always claiming "pass1 OK" */
+                       error++;
+                       if (lfsck_save_duplicate(&mds_obj2))
+                               fix_failed++;
+               }
+       }
+
+       if (rc != DB_NOTFOUND) {
+               log_write("%s: error reading from inode database: %s\n",
+                         progname, db_strerror(rc));
+               rc = -EINVAL;
+               goto out;
+       }
+
+       rc = 0;
+       if (error == 0) {
+               log_write("%s: ost_idx %d: pass1 OK (%lu files total)\n",
+                         progname, ost_idx, count);
+       } else {
+               log_write("%s: ost_idx %d: pass1 ERROR: %d duplicate "
+                         "entries found (fixed in pass4) (%lu files total)\n",
+                         progname, ost_idx, error, count);
+       }
+out:
+       if (dbcp)
+               dbcp->c_close(dbcp);
+
+       return rc;
+}
+
+/* Fallback definitions of the object re-creation ioctls for builds whose
+ * headers do not provide them.  Both use number 157 in the 'f' group and
+ * differ only in the argument type given to _IOW(). */
+#ifndef LL_IOC_RECREATE_OBJ
+#define LL_IOC_RECREATE_OBJ _IOW('f', 157, long)           /* 1.x object IDs */
+#endif
+#ifndef LL_IOC_RECREATE_FID
+#define LL_IOC_RECREATE_FID _IOW('f', 157, struct lu_fid)   /* 2.x FIDs */
+#endif
+
+/* If an MDS file is missing an object recreate object using an ioctl call */
+/*
+ * @cmd is the re-creation ioctl to issue on the file at @path and @creat
+ * points to the request structure for that ioctl.  @oi and @ost_idx are
+ * only used in the log messages.
+ *
+ * Without --create the missing object is only reported.  Returns 0 when
+ * nothing was attempted or the object was recreated, otherwise the negative
+ * errno from open() or ioctl().  Updates the fixed/fix_failed counters.
+ */
+static int lfsck_recreate_obj(int cmd, void *creat, struct ost_id *oi,
+                             __u32 ost_idx, char *path)
+{
+
+       int fd;
+       int rc;
+
+       if (!lfsck_create) {
+               log_write("[%u]: %s object %s "DOIF" not created\n", ost_idx,
+                         path, cmd == LL_IOC_RECREATE_FID ? "FID":"ID",
+                         POIF(oi));
+               return 0;
+       }
+
+       fd = open(path, O_LARGEFILE | O_RDONLY, 0);
+       if (fd < 0) {
+               rc = -errno;
+               log_write("[%u]: FAILED to open %s missing obj "DOIF"\n",
+                         ost_idx, path, POIF(oi));
+               fix_failed++;
+               return rc;
+       }
+
+       /* hand the ioctl the request structure itself; "&creat" would be the
+        * address of this function's own pointer variable */
+       rc = ioctl(fd, cmd, creat);
+       if (rc) {
+               rc = -errno;
+               log_write("[%u]: failed to recreate %s missing obj "DOIF"\n",
+                         ost_idx, path, POIF(oi));
+               fix_failed++;
+       } else {
+               log_write("[%u]: recreated %s missing obj "DOIF"\n",
+                         ost_idx, path, POIF(oi));
+               fixed++;
+       }
+       close(fd);
+
+       return rc;
+}
+
+/*
+ * If size checking is enabled see if this ost is "adding" to the file size
+ * if it is then just calculate the new size and save.
+ */
+/*
+ * Only does work when built with LFSCK_CHECK_SIZE; otherwise returns 0.
+ *
+ * Looks up the size record of @mds_obj's file in @mds_sizeinfodb, derives
+ * the file size implied by this object's size (@ost_obj->ost_size) and its
+ * stripe position, and writes the record back if that is larger than the
+ * size stored so far.  The get/put pair runs under size_lock.
+ *
+ * Returns 0 on success, -ENOENT if the record is missing, -EINVAL if the
+ * record has a zero stripe size, or -EIO if the update fails.
+ */
+static int lfsck_calc_size(struct lfsck_mds_objent *mds_obj,
+                          struct lfsck_ost_objent *ost_obj,
+                          DB *mds_sizeinfodb)
+{
+       int rc = 0;
+#ifdef LFSCK_CHECK_SIZE
+       struct lfsck_mds_szinfo mds_szinfo1;
+       __u64 calc_size;
+       DBT key, data;
+       __u64 chunks, rem;
+
+       /* an empty object cannot extend the file */
+       if (ost_obj->ost_size == 0)
+               return 0;
+
+       pthread_mutex_lock(&size_lock);
+       memset(&key, 0, sizeof(key));
+       memset(&data, 0, sizeof(data));
+       key.data = &mds_obj->mds_fid;
+       key.size = sizeof(mds_obj->mds_fid);
+       data.data = &mds_szinfo1;
+       data.size = data.ulen = sizeof(mds_szinfo1);
+       data.flags = DB_DBT_USERMEM;
+       rc = mds_sizeinfodb->get(mds_sizeinfodb, NULL, &key, &data, 0);
+       if (rc != 0) {
+               /* the key is a FID, so print it with the FID format */
+               log_write("Failure to get sizeinfo "DFID"\n",
+                         PFID(&mds_obj->mds_fid));
+               rc = -ENOENT;
+               goto out_unlock;
+       }
+       letocpu_mds_szinfo(&mds_szinfo1);
+       assert (mds_szinfo1.mds_stripe_pattern == LOV_PATTERN_RAID0);
+       if (mds_szinfo1.mds_stripe_size == 0) {
+               /* would divide by zero below */
+               log_write("Invalid zero stripe size for "DFID"\n",
+                         PFID(&mds_obj->mds_fid));
+               rc = -EINVAL;
+               goto out_unlock;
+       }
+       /* full stripe-size chunks held by this object, plus the remainder */
+       chunks = ost_obj->ost_size / mds_szinfo1.mds_stripe_size;
+       rem = ost_obj->ost_size % mds_szinfo1.mds_stripe_size;
+       if (rem == 0) {
+               /* object ends on a chunk boundary: all earlier stripe rows
+                * are complete and the last row extends up to and including
+                * this object's chunk */
+               calc_size = (((chunks - 1 ) * mds_szinfo1.mds_stripe_size) *
+                            mds_szinfo1.mds_stripe_count);
+               calc_size += mds_szinfo1.mds_stripe_size *
+                            (mds_obj->mds_ostoffset + 1);
+       } else {
+               /* partial last chunk: complete rows, then the chunks of the
+                * objects before this one in the row, then the remainder */
+               calc_size = ((chunks * mds_szinfo1.mds_stripe_size)
+                           * mds_szinfo1.mds_stripe_count);
+               calc_size += mds_szinfo1.mds_stripe_size *
+                            mds_obj->mds_ostoffset;
+               calc_size += rem;
+       }
+       if (calc_size > mds_szinfo1.mds_calc_size) {
+               mds_szinfo1.mds_calc_size = calc_size;
+               memset(&key, 0, sizeof(key));
+               memset(&data, 0, sizeof(data));
+               key.data = &mds_obj->mds_fid;
+               key.size = sizeof(mds_obj->mds_fid);
+               data.data = &mds_szinfo1;
+               data.size = sizeof(mds_szinfo1);
+               cputole_mds_szinfo(&mds_szinfo1);
+               /* Make sure we overwrite */
+               rc = mds_sizeinfodb->put(mds_sizeinfodb, NULL, &key, &data, 0);
+               if (rc != 0) {
+                       log_write("Failure to update sizeinfo data\n");
+                       rc = -EIO;
+               }
+       }
+out_unlock:
+       pthread_mutex_unlock(&size_lock);
+#endif /* LFSCK_CHECK_SIZE */
+       return rc;
+}
+
+/*
+ * Check for dangling inode.
+ * This pass runs through the mds table for an ost and checks against the ost
+ * table that each object referenced on the mds exists on the ost
+ */
+int lfsck_run_pass2(__u32 ost_idx, struct lfsck_mds_hdr *mds_hdr,
+                   DB *mds_ostdb, DB *ostdb,
+                   DB *mds_direntdb, DB *mds_sizeinfodb)
+{
+       struct lfsck_mds_objent mds_obj;
+       struct lfsck_ost_objent ost_obj;
+       int error = 0, rc = 0;
+       unsigned long count = 0;
+       char *path;
+       DBC *dbcp = NULL;
+       DBT key, data;
+       __u64 max_objid = mds_hdr->mds_max_ost_id[ost_idx];
+       __u64 mds_connect_flags = 0;
+
+       rc = llapi_get_connect_flags(mnt_path, &mds_connect_flags);
+       /* Ignore the error here, and assume it is an older 1.8.x without
+        * LL_IOC_GET_CONNECT_FLAGS.  We only use this for 2.x detection. */
+
+       log_write("lfsck: ost_idx %d: pass2: check for missing inode objects\n",
+                 ost_idx);
+
+       path = malloc(PATH_MAX);
+       if (path == NULL) {
+               log_write("lfsck: [%u]: pass2 ERROR: out of memory\n",
+                          ost_idx);
+               return -ENOMEM;
+       }
+
+       rc = mds_ostdb->cursor(mds_ostdb, NULL, &dbcp, 0);
+       if (rc != 0) {
+               log_write("[%u]: error acquiring cursor for mds table: %s\n",
+                         ost_idx, db_strerror(rc));
+               rc = -EINVAL;
+               goto out;
+       }
+
+       memset(&key, 0, sizeof(key));
+       memset(&data, 0, sizeof(data));
+       data.data = &mds_obj;
+       data.size = data.ulen = sizeof(mds_obj);
+       data.flags = DB_DBT_USERMEM;
+       while ((rc = dbcp->c_get(dbcp, &key, &data, DB_NEXT)) == 0) {
+               DBT key_ost, data_ost;
+
+               count++;
+               letocpu_mds_objent(&mds_obj);
+
+               if (mds_hdr->mds_flags & E2F_OPT_READONLY &&
+                   mds_obj.mds_oi.oi_id > max_objid) {
+                       VERBOSE(2, "[%u] skipping MDS FID "DFID": object "DOIF
+                               " > max "LPU64"\n", ost_idx,
+                               PFID(&mds_obj.mds_fid), POIF(&mds_obj.mds_oi),
+                               max_objid);
+                       continue;
+               }
+
+               memset(&key_ost, 0, sizeof(key_ost));
+               memset(&data_ost, 0, sizeof(data_ost));
+               key_ost.data = &mds_obj.mds_oi;
+               key_ost.size = sizeof(mds_obj.mds_oi);
+               data_ost.data = &ost_obj;
+               data_ost.size = data_ost.ulen = sizeof(ost_obj);
+               data_ost.flags = DB_DBT_USERMEM;
+               rc = ostdb->get(ostdb, NULL, &key_ost, &data_ost, 0);
+               if (rc != 0) {
+                       letocpu_ost_objent(&ost_obj);
+                       if (rc == DB_NOTFOUND) {
+                               struct lu_fid fid;
+                               struct create18 {
+                                       __u64 lrc_id;
+                                       __u32 lrc_ost_idx;
+                               } create18;
+                               void *create;
+                               int cmd;
+
+                               if (lfsck_get_path(&mds_obj.mds_fid,
+                                                  mds_direntdb, path,
+                                                  PATH_MAX)) {
+                                       VERBOSE(1,"[%u]: MDS FID "DFID" object "
+                                               DOIF" deleted?\n", ost_idx,
+                                               PFID(&mds_obj.mds_fid),
+                                               POIF(&mds_obj.mds_oi));
+                                       continue;
+                               }
+                               error++;
+                               if (mds_connect_flags & OBD_CONNECT_FID) {
+                                       ostid_idif_unpack(&mds_obj.mds_oi,
+                                                         &fid, ost_idx);
+                                       create = &fid;
+                                       cmd = LL_IOC_RECREATE_FID;
+                               } else {
+                                       create18.lrc_id = mds_obj.mds_oi.oi_id;
+                                       create18.lrc_ost_idx = ost_idx;
+                                       create = &create18;
+                                       cmd = LL_IOC_RECREATE_OBJ;
+                               }
+
+                               lfsck_recreate_obj(cmd, create,&mds_obj.mds_oi,
+                                                  ost_idx, path);
+                       } else {
+                               log_write("[%u]: error looking up object "DOIF
+                                         ": %s\n", ost_idx,
+                                         POIF(&mds_obj.mds_oi),
+                                         db_strerror(rc));
+                               rc = -EINVAL;
+                               goto out;
+                       }
+               }
+               if (lfsck_calc_size(&mds_obj, &ost_obj, mds_sizeinfodb)) {
+                       log_write("[%u]: error updating file size for object "
+                                 DOIF": %s\n", ost_idx, POIF(&mds_obj.mds_oi),
+                                 strerror(rc));
+                       rc = -EINVAL;
+                       goto out;
+               }
+       }
+       if (rc != DB_NOTFOUND) {
+               log_write("[%u]: error getting next inode: %s\n",
+                         ost_idx, db_strerror(rc));
+               rc = -EINVAL;
+               goto out;
+       }
+       rc = 0;
+       if (error == 0) {
+               log_write("lfsck: ost_idx %d: pass2 OK (%lu objects)\n",
+                         ost_idx, count);
+       } else {
+               log_write("lfsck: ost_idx %d: pass2 ERROR: %d dangling inodes "
+                         "found (%lu files total)\n", ost_idx, error, count);
+       }
+
+out:
+       dbcp->c_close(dbcp);
+       free(path);
+       return 0;
+}
+
+/*
+ * If an object exists on an ost but is not referenced by an entry on the mds
+ * then create a lost+found entry and set the EA on the file so that the
+ * orphaned object is picked up. If the object is requested to be deleted
+ * an unlink on this lost+found file will now delete same.
+ *
+ * The lost+found file is also unlinked again if setting the EA fails.
+ *
+ * Returns 0 on success or a negative errno value on failure.
+ */
+int lfsck_fix_orphan(__u32 ost_idx, struct ost_id *oi,
+                    struct obd_uuid *uuid, int delete)
+{
+       struct lov_user_md *lum;
+       char file[PATH_MAX];
+       int fd, lum_size, len, rc = 0;
+       struct utimbuf utimbuf = { 0, 0 };
+
+       lum_size = LOV_EA_SIZE(lum, 1);
+
+       lum = malloc(lum_size);
+       if (lum == NULL) {
+               log_write("%s: out of memory on EA (%u) orphan %u:"DOIF"\n",
+                         progname, lum_size, ost_idx, POIF(oi));
+               return -ENOMEM;
+       }
+       /* zero the EA so that any member not assigned below is not handed
+        * to the ioctl as uninitialised heap contents */
+       memset(lum, 0, lum_size);
+
+       /* bounded formatting: refuse a name that does not fit instead of
+        * overrunning the buffer */
+       len = snprintf(file, sizeof(file), "%s/%s-"DOIF, lostandfounddir,
+                      uuid->uuid, POIF(oi));
+       if (len < 0 || len >= (int)sizeof(file)) {
+               rc = -ENAMETOOLONG;
+               log_write("%s: lost+found name too long for orphan %u:"DOIF
+                         "\n", progname, ost_idx, POIF(oi));
+               goto out_free;
+       }
+
+       fd = open(file, O_CREAT|O_EXCL|O_LOV_DELAY_CREATE, 0600);
+       if (fd < 0) {
+               rc = -errno;
+               log_write("%s: unable to open %s for orphan %u:"DOIF": %s\n",
+                         progname, file, ost_idx, POIF(oi), strerror(-rc));
+               goto out_free;
+       }
+       lum->lmm_magic = LOV_USER_MAGIC;
+       lum->lmm_pattern = LOV_PATTERN_RAID0;
+       lum->lmm_stripe_size = 1048576;
+       lum->lmm_stripe_offset = 0;
+       lum->lmm_stripe_count = 1;
+       lum->lmm_objects[0].l_object_id = oi->oi_id;
+       lum->lmm_objects[0].l_object_seq = oi->oi_seq;
+       lum->lmm_objects[0].l_ost_gen = 0;
+       lum->lmm_objects[0].l_ost_idx = ost_idx;
+
+       /* reset the MDS timestamps so we can see the OST timestamps
+        * (best effort: a failure here is deliberately not fatal) */
+       utime(file, &utimbuf);
+
+       if (ioctl(fd, LL_IOC_LOV_SETEA, lum) < 0) {
+               rc = -errno;
+               log_write("%s: unable to set LOV EA on %s for orphan %u:"DOIF
+                         ": %s\n", progname, file, ost_idx, POIF(oi),
+                         strerror(-rc));
+       }
+
+       close(fd);
+       if (rc != 0 || delete) {
+               if (unlink(file) != 0 && errno != ENOENT) {
+                       /* report the unlink failure itself, but keep an
+                        * earlier error as the function result */
+                       int unlink_err = errno;
+
+                       log_write("%s: failed to unlink %s for orphan %u:"DOIF
+                                 ": %s\n", progname, file, ost_idx,
+                                 POIF(oi), strerror(unlink_err));
+                       if (rc == 0)
+                               rc = -unlink_err;
+               }
+       }
+out_free:
+       free(lum);
+       return rc;
+}
+
+/*
+ * Pass 3: check for orphans.
+ *
+ * Run through each entry in the ost table and check the mds ost table for
+ * a corresponding entry.  If not found report and, when lfsck_save or
+ * lfsck_delete is set, repair it through lfsck_fix_orphan().  Objects with
+ * an id above last_id are skipped, and zero-length orphans are handled
+ * without being counted as errors.
+ *
+ * Returns 0 once the whole table has been scanned (a failed repair is
+ * counted in fix_failed, not treated as fatal) or -EINVAL on a database
+ * error.
+ */
+int lfsck_run_pass3(__u32 ost_idx, DB *mds_ostdb, DB *ostdb,
+                   struct obd_uuid *uuid, __u64 last_id)
+{
+       int rc = 0;
+       struct lfsck_mds_objent mds_obj;
+       struct lfsck_ost_objent ost_obj;
+       unsigned long error = 0, count = 0;
+       DBT key, data;
+       DBC *dbcp = NULL;
+       __u64 bytes = 0;
+
+       log_write("lfsck: ost_idx %d: pass3: check for orphan objects\n",
+                 ost_idx);
+
+       VERBOSE(1, "[%u] uuid %s\n", ost_idx, uuid->uuid);
+       VERBOSE(1, "[%u] last_id "LPU64"\n", ost_idx, last_id);
+
+       rc = ostdb->cursor(ostdb, NULL, &dbcp, 0);
+       if (rc != 0) {
+               log_write("[%u]: error acquiring cursor for ost table: %s\n",
+                         ost_idx, db_strerror(rc));
+               rc = -EINVAL;
+               goto out;
+       }
+
+       memset(&key, 0, sizeof(key));
+       memset(&data, 0, sizeof(data));
+       data.data = &ost_obj;
+       data.size = data.ulen = sizeof(ost_obj);
+       data.flags = DB_DBT_USERMEM;
+
+       while ((rc = dbcp->c_get(dbcp, &key, &data, DB_NEXT)) == 0) {
+               DBT key_mdt, data_mdt;
+               struct ost_id *oi;
+
+               count++;
+               letocpu_ost_objent(&ost_obj);
+               oi = &ost_obj.ost_oi;
+
+               if (oi->oi_id > last_id) {
+                       VERBOSE(2, "[%u] skipping objid "DOIF" > "LPU64"\n",
+                               ost_idx, POIF(oi), last_id);
+                       continue;
+               }
+               VERBOSE(2, "[%u] processing objid "DOIF"\n", ost_idx, POIF(oi));
+
+               memset(&key_mdt, 0, sizeof(key_mdt));
+               memset(&data_mdt, 0, sizeof(data_mdt));
+               key_mdt.data = oi;
+               key_mdt.size = sizeof(*oi);
+               data_mdt.data = &mds_obj;
+               data_mdt.size = data_mdt.ulen = sizeof(mds_obj);
+               data_mdt.flags = DB_DBT_USERMEM;
+               rc = mds_ostdb->get(mds_ostdb, NULL, &key_mdt, &data_mdt, 0);
+               if (rc == 0) {
+                       VERBOSE(2, "[%u] found object "DOIF" reference\n",
+                               ost_idx, POIF(oi));
+                       continue;
+               }
+
+               /* mds_obj was not filled in by the failed lookup and is not
+                * used below, so there is nothing to byte-swap here */
+               if (rc != DB_NOTFOUND) {
+                       log_write("Failed to check mds db for entry\n");
+                       rc = -EINVAL;
+                       goto out;
+               }
+               if (ost_obj.ost_size == 0) {
+                       /* don't report errors for normal orphan recovery */
+                       VERBOSE(1, "[%u] zero-length orphan objid "DOIF"\n",
+                               ost_idx, POIF(oi));
+                       if (lfsck_save || lfsck_delete) {
+                               /* No reason to save just delete*/
+                               rc = lfsck_fix_orphan(ost_idx, oi, uuid, 1);
+                               if (rc) {
+                                       log_write("lfsck: [%u]: pass3 "
+                                                 "error fixing zero-length "
+                                                 "orphan objid "DOIF"\n",
+                                                 ost_idx, POIF(oi));
+                                       fix_failed++;
+                               } else {
+                                       fixed++;
+                               }
+                       }
+                       continue;
+               }
+
+               /* count each orphan exactly once, whether or not it is
+                * going to be repaired */
+               error++;
+               bytes += ost_obj.ost_bytes;
+               if (lfsck_save || lfsck_delete) {
+                       rc = lfsck_fix_orphan(ost_idx, oi, uuid, lfsck_delete);
+                       if (rc) {
+                               log_write("lfsck: [%u]: failed to fix orphan "
+                                         "object "DOIF", "LPU64" bytes\n",
+                                         ost_idx, POIF(oi), ost_obj.ost_bytes);
+                               fix_failed++;
+                       } else {
+                               log_write("lfsck: [%u]: pass3 %s orphan object "
+                                         DOIF", "LPU64" bytes\n", ost_idx,
+                                         lfsck_save ? "saved" : "unlinked",
+                                         POIF(oi), ost_obj.ost_bytes);
+                               fixed++;
+                       }
+               } else {
+                       log_write("lfsck: [%u]: pass3 orphan found objid "
+                                 DOIF", "LPU64" bytes\n", ost_idx,
+                                 POIF(oi), ost_obj.ost_bytes);
+               }
+       }
+       if (rc != DB_NOTFOUND) {
+               log_write("[%u]: error getting next object in db %s\n",
+                         ost_idx, db_strerror(rc));
+               rc = -EINVAL;
+               goto out;
+       }
+       /* DB_NOTFOUND only marks the end of the table */
+       rc = 0;
+
+       if (error == 0) {
+               log_write("lfsck: ost_idx %d: pass3 OK (%lu files total)\n",
+                         ost_idx, count);
+       } else {
+               log_write("lfsck: ost_idx %d: pass3 %s: %4gMB of orphan "
+                         "data (%lu of %lu files total)\n", ost_idx,
+                         (lfsck_save | lfsck_delete) ? "FIXED" : "ERROR",
+                         (double)bytes / (1024 * 1024), error, count);
+       }
+out:
+       if (dbcp)
+               dbcp->c_close(dbcp);
+       return rc;
+}
+
+/*
+ * Missing ost information: report affected file names.
+ *
+ * Opens the per-OST MDS table "<MDS_OSTDB>.<ost_idx>" in mds_file, and for
+ * every record logs the path resolved through mds_direntdb.  A record whose
+ * path cannot be resolved is logged and counted in fix_failed.
+ *
+ * Returns 0 on success, -ENOMEM if the path buffer cannot be allocated, or
+ * -EINVAL on a database error.  mds_hdr is not used here.
+ */
+int lfsck_list_affected_files(char *mds_file, struct lfsck_mds_hdr *mds_hdr,
+                             DB *mds_direntdb, __u32 ost_idx)
+{
+       struct lfsck_mds_objent mds_obj;
+       char dbname[256];
+       char *path;
+       DB *mds_db = NULL;
+       DBT key, data;
+       DBC *dbcp = NULL;
+       int rc = 0;
+
+       path = malloc(PATH_MAX);
+       if (path == NULL)
+               return -ENOMEM;
+
+       snprintf(dbname, sizeof(dbname), "%s.%d", MDS_OSTDB, ost_idx);
+       rc = lfsck_opendb(mds_file, dbname, &mds_db, 1, 0, 0);
+       if (rc != 0) {
+               log_write("%s: failed to open mds db file %s: rc %d\n",
+                         progname, mds_file, rc);
+               rc = -EINVAL;
+               goto out;
+       }
+
+       if ((rc = mds_db->cursor(mds_db, NULL, &dbcp, 0)) != 0) {
+               log_write("Failed to acquire cursor for mds table\n");
+               rc = -EINVAL;
+               goto out;
+       }
+
+       memset(&key, 0, sizeof(key));
+       memset(&data, 0, sizeof(data));
+       data.data = &mds_obj;
+       data.size = data.ulen = sizeof(mds_obj);
+       data.flags = DB_DBT_USERMEM;
+
+       log_write("Files affected by missing ost info are : -\n");
+       while ((rc = dbcp->c_get(dbcp, &key, &data, DB_NEXT)) == 0) {
+               letocpu_mds_objent(&mds_obj);
+
+               if (lfsck_get_path(&mds_obj.mds_fid, mds_direntdb,
+                                  path, PATH_MAX)) {
+                       log_write("Failed to get path for fid "DFID"\n",
+                                 PFID(&mds_obj.mds_fid));
+                       fix_failed++;
+               } else {
+                       log_write("%s\n", path);
+               }
+       }
+       if (rc != DB_NOTFOUND) {
+               log_write("Error getting next element in db %d\n", rc);
+               rc = -EINVAL;
+               goto out;
+       }
+       /* DB_NOTFOUND only marks the end of the table */
+       rc = 0;
+
+out:
+       if (dbcp)
+               dbcp->c_close(dbcp);
+       if (mds_db)
+               mds_db->close(mds_db, 0);
+       /* the path buffer is owned by this function on every exit path */
+       free(path);
+       return rc;
+}
+
+/*
+ * Run passes 1, 2 and 3 for a single ost index:
+ * 1) Check for objects referenced by more than one file
+ * 2) Check that objects referenced on the mds exist on the ost
+ * 3) Check that a containing mds entry exists for every ost object
+ *
+ * The ost database file is selected by comparing the UUID stored in each
+ * file's header against lfsck_uuid[ost_idx].  If no file matches, the
+ * files affected by the missing ost data are listed instead.
+ */
+int run_test(__u32 ost_idx, struct lfsck_mds_hdr *mds_hdr,
+            DB *mds_direntdb, DB *mds_sizeinfodb)
+{
+       struct lfsck_ost_hdr *ost_hdr = NULL;
+       DB *mds_ostdb = NULL;
+       DB *ost_db = NULL;
+       char dbname[256];
+       DBT hdr_key, hdr_val;
+       __u64 last_id;
+       int file_idx, rc;
+
+       sprintf(dbname, "%s.%d", MDS_OSTDB, ost_idx);
+
+       VERBOSE(2, "testing ost_idx %d\n", ost_idx);
+
+       rc = lfsck_opendb(mds_file, dbname, &mds_ostdb, 1, 0, 0);
+       if (rc != 0) {
+               log_write("%s: failed to open mds db file %s: rc %d\n",
+                         progname, mds_file, rc);
+               goto out;
+       }
+
+       ost_hdr = malloc(sizeof(*ost_hdr));
+       if (ost_hdr == NULL) {
+               log_write("Failure to alloc memory\n");
+               rc = -ENOMEM;
+               goto out;
+       }
+
+       VERBOSE(2, "looking for index %u UUID %s\n", ost_idx,
+               lfsck_uuid[ost_idx].uuid);
+
+       /* Scan the ost files for the one whose header UUID and index both
+        * match; file_idx == num_ost_files afterwards means none did. */
+       for (file_idx = 0; file_idx < num_ost_files; file_idx++) {
+               VERBOSE(2, "checking file %s\n", ost_files[file_idx]);
+               rc = lfsck_opendb(ost_files[file_idx], OST_HDR, &ost_db,
+                                 0, 0, 0);
+               if (rc != 0) {
+                       log_write("%s: error opening ost_data_file %s: rc %d\n",
+                                 progname, ost_files[file_idx], rc);
+                       goto out;
+               }
+
+               /* the header record is keyed by the magic value */
+               ost_hdr->ost_magic = OST_MAGIC;
+               memset(&hdr_key, 0, sizeof(hdr_key));
+               hdr_key.data = &ost_hdr->ost_magic;
+               hdr_key.size = sizeof(ost_hdr->ost_magic);
+               memset(&hdr_val, 0, sizeof(hdr_val));
+               hdr_val.data = ost_hdr;
+               hdr_val.size = hdr_val.ulen = sizeof(*ost_hdr);
+               hdr_val.flags = DB_DBT_USERMEM;
+
+               /* the header db is only needed for this one lookup */
+               rc = ost_db->get(ost_db, NULL, &hdr_key, &hdr_val, 0);
+               ost_db->close(ost_db, 0);
+               ost_db = NULL;
+               if (rc != 0) {
+                       log_write("Invalid ost magic on file %s: rc %s\n",
+                                 ost_files[file_idx], db_strerror(rc));
+                       continue;
+               }
+
+               letocpu_ost_hdr(ost_hdr);
+               VERBOSE(2, "%s has ost UUID %s\n", ost_files[file_idx],
+                       ost_hdr->ost_uuid.uuid);
+
+               if (!obd_uuid_equals(&lfsck_uuid[ost_idx], &ost_hdr->ost_uuid))
+                       continue;
+
+               if (ost_hdr->ost_index == ost_idx)
+                       break;
+
+               log_write("Requested ost_idx %u doesn't match "
+                         "index %u found in %s\n", ost_idx,
+                         ost_hdr->ost_index, ost_files[file_idx]);
+       }
+
+       if (file_idx == num_ost_files) {
+               log_write("lfsck: can't find file for ost_idx %d\n", ost_idx);
+               rc = lfsck_list_affected_files(mds_file, mds_hdr,
+                                              mds_direntdb, ost_idx);
+               goto out;
+       }
+
+       rc = lfsck_opendb(ost_files[file_idx], OST_OSTDB, &ost_db, 0, 0, 0);
+       if (rc != 0) {
+               log_write("%s: error opening ost_data_file %s: rc %d\n",
+                         progname, ost_files[file_idx], rc);
+               goto out;
+       }
+
+       VERBOSE(1, "MDS: max_id "LPU64" OST: max_id "LPU64"\n",
+               mds_hdr->mds_max_ost_id[ost_idx], ost_hdr->ost_last_id);
+
+       rc = lfsck_run_pass1(ost_idx, mds_ostdb, ost_db, mds_direntdb);
+       if (rc != 0) {
+               log_write("error in running pass1\n");
+               goto out;
+       }
+
+       rc = lfsck_run_pass2(ost_idx, mds_hdr, mds_ostdb, ost_db, mds_direntdb,
+                            mds_sizeinfodb);
+       if (rc != 0) {
+               log_write("error in running pass2\n");
+               goto out;
+       }
+
+       /* if either side was scanned read-only, use the MDS view of the
+        * highest object id rather than the OST's own last_id */
+       if (ost_hdr->ost_flags & E2F_OPT_READONLY ||
+           mds_hdr->mds_flags & E2F_OPT_READONLY)
+               last_id = mds_hdr->mds_max_ost_id[ost_idx];
+       else
+               last_id = ost_hdr->ost_last_id;
+
+       rc = lfsck_run_pass3(ost_idx, mds_ostdb, ost_db, &ost_hdr->ost_uuid,
+                            last_id);
+       if (rc != 0)
+               log_write("error in running pass3\n");
+
+out:
+       free(ost_hdr);
+       if (mds_ostdb)
+               mds_ostdb->close(mds_ostdb, 0);
+       if (ost_db)
+               ost_db->close(ost_db, 0);
+
+       return rc;
+}
+
+/*
+ * Confirm that a saved duplicate reference is still accurate: the path must
+ * exist, must still resolve to the recorded MDS FID, and its LOV EA must
+ * still contain the recorded object on the recorded ost index.
+ *
+ * Returns a negative errno value if any of those checks fails, otherwise
+ * the (non-negative) result of llapi_file_get_stripe().
+ */
+static int lfsck_validate_duplicate(struct lfsck_saved_duplicates *dup,
+                                   const char *path)
+{
+       struct lov_user_md *lum;
+       struct lov_user_ost_data_v1 *stripe;
+       struct stat64 st;
+       struct lu_fid cur_fid;
+       int rc, idx, nstripes;
+
+       VERBOSE(2,"[%u] check duplicate FID "DFID" object "DOIF"\n  for\t%s\n",
+               dup->ld_ost_idx, PFID(&dup->ld_mds_fid), POIF(&dup->ld_oi),
+               path);
+
+       /* first, validate that the paths are still valid */
+       if (stat64(path, &st) < 0) {
+               rc = -errno;
+               log_write("%s: duplicate file %s error: %s\n",
+                         progname, path, strerror(-rc));
+               return rc;
+       }
+
+       rc = lfsck_path2fid(path, &cur_fid);
+       if (rc < 0) {
+               log_write("%s: unable to get LMA EA on %s: %s\n",
+                         progname, path, strerror(-rc));
+               return rc;
+       }
+       if (lfsck_fidcmp(&dup->ld_mds_fid, &cur_fid) != 0) {
+               log_write("%s: duplicate file %s is no longer FID "DFID"\n",
+                         progname, path, PFID(&dup->ld_mds_fid));
+               return -EBADF;
+       }
+
+       lum = malloc(LOV_EA_MAX(lum));
+       if (lum == NULL) {
+               log_write("%s: out of memory allocating LOV EA (%lu)\n",
+                         progname, LOV_EA_MAX(lum));
+               return -ENOMEM;
+       }
+
+       rc = llapi_file_get_stripe(path, lum);
+       if (rc < 0) {
+               log_write("%s: unable to get LOV EA on %s: %s\n",
+                         progname, path, strerror(-rc));
+               goto out;
+       }
+
+       if (lum->lmm_pattern != LOV_PATTERN_RAID0) {
+               log_write("%s: unknown LOV stripe pattern %#08x\n",
+                         progname, lum->lmm_pattern);
+               rc = -EINVAL;
+               goto out;
+       }
+
+       /* the object array sits at a different offset in the V1 and V3
+        * layouts, so pick it according to the magic */
+       if (lum->lmm_magic == LOV_USER_MAGIC_V3) {
+               stripe = ((struct lov_user_md_v3 *)lum)->lmm_objects;
+       } else if (lum->lmm_magic == LOV_USER_MAGIC_V1) {
+               stripe = lum->lmm_objects;
+       } else {
+               log_write("%s: unknown LOV magic %#08x\n",
+                         progname, lum->lmm_magic);
+               rc = -EINVAL;
+               goto out;
+       }
+
+       /* Verify that the object in question is still in the file */
+       nstripes = lum->lmm_stripe_count;
+       idx = 0;
+       while (idx < nstripes) {
+               if (stripe->l_ost_idx == dup->ld_ost_idx &&
+                   stripe->l_object_id == dup->ld_oi.oi_id &&
+                   stripe->l_object_seq == dup->ld_oi.oi_seq)
+                       break;
+               idx++;
+               stripe++;
+       }
+
+       if (idx == nstripes) {
+               log_write("%s: cannot find object %u:"DOIF" in\n\t%s\n",
+                         progname, dup->ld_ost_idx, POIF(&dup->ld_oi), path);
+               rc = -EBADF;
+       }
+
+out:
+       free(lum);
+       return rc;
+}
+
+#ifndef HAVE_LLAPI_CANCEL_OSC_LOCKS
+#define NAMESPACES "/proc/fs/lustre/ldlm/namespaces"
+/*
+ * Fallback used when liblustreapi does not provide this call: write "clear"
+ * to the lru_size file of every entry under NAMESPACES whose name contains
+ * "osc" or "OSC".  Entries that cannot be opened are logged and skipped.
+ * mnt_path is not used by this implementation.
+ */
+void llapi_cancel_osc_locks(const char *mnt_path)
+{
+       DIR *ns_dir;
+       struct dirent *entry;
+       char lru_path[PATH_MAX];
+       int rc, lru_fd;
+
+       ns_dir = opendir(NAMESPACES);
+       if (ns_dir == NULL) {
+               rc = -errno;
+               log_write("%s: error opening %s: %s\n",
+                         progname, NAMESPACES, strerror(-rc));
+               return;
+       }
+
+       for (entry = readdir(ns_dir); entry != NULL;
+            entry = readdir(ns_dir)) {
+               const char *name = entry->d_name;
+               int is_dot, is_osc;
+
+               is_dot = !strcmp(name, ".") || !strcmp(name, "..");
+               is_osc = strstr(name, "osc") != NULL ||
+                        strstr(name, "OSC") != NULL;
+               if (is_dot || !is_osc)
+                       continue;
+
+               snprintf(lru_path, sizeof(lru_path) - 1, "%s/%s/lru_size",
+                        NAMESPACES, name);
+
+               lru_fd = open(lru_path, O_WRONLY);
+               if (lru_fd < 0) {
+                       log_write("%s: error opening %s to cancel locks: %s\n",
+                                 progname, lru_path, strerror(errno));
+                       continue;
+               }
+               VERBOSE(3, "clearing locks in %s\n", lru_path);
+               /* the length of 6 includes the terminating NUL; the result
+                * of the write is stored but not acted upon */
+               rc = write(lru_fd, "clear", 6);
+               close(lru_fd);
+       }
+
+       closedir(ns_dir);
+}
+#endif
+
+/* Flush cached state on the client: sync, cancel the OSC locks, then write
+ * "3" to /proc/sys/vm/drop_caches.  This removes inodes from the client
+ * cache to avoid hitting an LASSERTF() on the client if it tries to attach
+ * two inodes to the same object.  Nothing is reported if drop_caches
+ * cannot be opened. */
+static void lfsck_drop_caches(void)
+{
+       int cache_fd, rc;
+
+       sync();
+       llapi_cancel_osc_locks(mnt_path);
+
+       cache_fd = open("/proc/sys/vm/drop_caches", O_WRONLY);
+       if (cache_fd >= 0) {
+               VERBOSE(3, "flushing vm cache\n");
+               /* the length of 2 includes the terminating NUL */
+               rc = write(cache_fd, "3", 2);
+               close(cache_fd);
+       }
+}
+
+/* Write a single-quoted copy of src to dst that a POSIX shell reads back
+ * as exactly src: every embedded ' becomes '\'' (close the quote, emit an
+ * escaped quote, reopen the quote).  dst must have room for
+ * 4 * strlen(src) + 3 bytes.  Returns a pointer to the terminating NUL. */
+static char *lfsck_shell_quote(char *dst, const char *src)
+{
+       *dst++ = '\'';
+       for (; *src != '\0'; src++) {
+               if (*src == '\'') {
+                       memcpy(dst, "'\\''", 4);
+                       dst += 4;
+               } else {
+                       *dst++ = *src;
+               }
+       }
+       *dst++ = '\'';
+       *dst = '\0';
+       return dst;
+}
+
+/*
+ * Duplicate an object that is referenced by multiple files and point one
+ * of the files to use the duplicated object.
+ *
+ * The file is copied to "<path>.lfsck_tmp" with cp -p, the original is
+ * hard-linked into dupedir (the link name is kept in dup->ld_link for the
+ * caller to remove later), and the copy is renamed over path.
+ *
+ * Returns 0 on success, or when lfsck_create is off and nothing is done;
+ * otherwise a negative errno value, in which case dup->ld_link is NULL and
+ * the temporary copy has been removed.  mds_direntdb is not used here.
+ */
+static int lfsck_fix_duplicate(struct lfsck_saved_duplicates *dup,
+                              DB *mds_direntdb, const char *path)
+{
+       char path_tmp[PATH_MAX] = { 0 };
+       char *cmd = NULL, *end;
+       const char *base;
+       int len, rc;
+
+       lfsck_drop_caches();
+
+       if (!lfsck_create) {
+               VERBOSE(1, "%s: [%u]: not duplicating FID "DFID
+                       " object "DOIF" by request\n  on\t%s\n",
+                       progname, dup->ld_ost_idx, PFID(&dup->ld_mds_fid),
+                       POIF(&dup->ld_oi), path);
+               return 0;
+       }
+
+       rc = lfsck_validate_duplicate(dup, path);
+       if (rc < 0)
+               goto out;
+
+       len = snprintf(path_tmp, sizeof(path_tmp), "%s.lfsck_tmp", path);
+       if (len < 0 || len >= (int)sizeof(path_tmp)) {
+               /* a truncated name could refer to some other file, so make
+                * sure it is never copied to, renamed or unlinked */
+               path_tmp[0] = '\0';
+               rc = -ENAMETOOLONG;
+               log_write("%s: temporary name for %s is too long\n",
+                         progname, path);
+               goto out;
+       }
+
+       /* File names come from the filesystem being checked, so they must
+        * not be able to break out of the quoting in the shell command.
+        * Worst case each name quadruples in size, plus quotes. */
+       cmd = malloc(sizeof("cp -p -- ") +
+                    4 * (strlen(path) + strlen(path_tmp)) + 8);
+       if (cmd == NULL) {
+               rc = -ENOMEM;
+               goto out;
+       }
+       strcpy(cmd, "cp -p -- ");
+       end = lfsck_shell_quote(cmd + strlen(cmd), path);
+       *end++ = ' ';
+       lfsck_shell_quote(end, path_tmp);
+
+       VERBOSE(2, "%s\n", cmd);
+       errno = 0;
+       rc = system(cmd);
+       if (rc != 0) {
+               /* errno is only meaningful if system() itself failed; a
+                * non-zero exit status from cp must still be an error */
+               rc = (rc == -1 && errno != 0) ? -errno : -EIO;
+               log_write("%s: duplicating object for %u:"DOIF" %s: %s\n",
+                         progname, dup->ld_ost_idx, POIF(&dup->ld_oi),
+                         path, strerror(-rc));
+               goto out;
+       }
+
+       base = strrchr(path, '/');
+       if (base == NULL)
+               base = path;
+       else
+               base++;
+
+       rc = asprintf(&dup->ld_link, "%s/%u-"DOIF"-"DFID":%s", dupedir,
+                     dup->ld_ost_idx, POIF(&dup->ld_oi),
+                     PFID(&dup->ld_mds_fid), base);
+       if (rc < 0) {
+               /* the pointer is undefined after a failed asprintf() */
+               dup->ld_link = NULL;
+               rc = -ENOMEM;
+               goto out;
+       }
+
+       VERBOSE(2, "ln %s %s\n", path, dup->ld_link);
+       rc = link(path, dup->ld_link);
+       if (rc) {
+               rc = -errno;
+               log_write("%s: error linking %s to %s: %s\n",
+                         progname, path, dup->ld_link, strerror(-rc));
+               free(dup->ld_link);
+               dup->ld_link = NULL;
+               goto out;
+       }
+
+       VERBOSE(2, "rename %s %s\n", path_tmp, path);
+       rc = rename(path_tmp, path);
+       if (rc) {
+               rc = -errno;
+               log_write("%s: error renaming %s to %s: %s\n",
+                         progname, path_tmp, path, strerror(-rc));
+               free(dup->ld_link);
+               dup->ld_link = NULL;
+       } else {
+               log_write("%s: [%u]: fixed duplicate FID "DFID" object "
+                         DOIF":\n\t%s\n", progname, dup->ld_ost_idx,
+                         PFID(&dup->ld_mds_fid), POIF(&dup->ld_oi), path);
+       }
+out:
+       free(cmd);
+       /* only clean up a temporary name that was actually generated; it
+        * may legitimately not exist if the copy never got that far */
+       if (rc && path_tmp[0] != '\0') {
+               VERBOSE(2, "unlink %s\n", path_tmp);
+               if (unlink(path_tmp) && errno != ENOENT)
+                       log_write("%s: unlink %s failed: %s\n", progname,
+                                 path_tmp, strerror(errno));
+       }
+
+       return rc;
+}
+
+/*
+ * Pass 4: repair the duplicate object references saved by pass1.
+ *
+ * Each saved duplicate whose path can be resolved is handed to
+ * lfsck_fix_duplicate().  The scan is repeated while some path lookups
+ * failed and at least one duplicate was fixed in the same round.
+ * Afterwards the hard links left in dupedir are unlinked and dupedir
+ * itself is removed.  Always returns 0.
+ */
+int lfsck_run_pass4(DB *mds_direntdb)
+{
+       char cmd[PATH_MAX + 512];
+       char path[PATH_MAX];
+       int unresolved, repaired;
+       int cur, other;
+
+       log_write("lfsck: pass4: check for %u duplicate object references\n",
+                 lfsck_dup_saved);
+       if (lfsck_dup_saved == 0) {
+               log_write("lfsck: pass4 OK (no duplicates)\n");
+               return 0;
+       }
+
+       do {
+               unresolved = 0;
+               repaired = 0;
+
+               lfsck_drop_caches();
+
+               for (cur = 0; cur < lfsck_dup_saved; cur++) {
+                       struct lfsck_saved_duplicates *dup;
+                       struct lu_fid *mds_fid;
+
+                       dup = &lfsck_duplicates[cur];
+                       mds_fid = &dup->ld_mds_fid;
+
+                       /* an f_oid of zero marks an entry already handled */
+                       if (mds_fid->f_oid == 0)
+                               continue;
+
+                       if (lfsck_get_path(mds_fid, mds_direntdb, path,
+                                          sizeof(path))) {
+                               unresolved++;
+                               continue;
+                       }
+                       if (lfsck_fix_duplicate(dup, mds_direntdb, path)) {
+                               fix_failed++;
+                               continue;
+                       }
+
+                       fixed++;
+                       repaired++;
+
+                       /* don't duplicate a file multiple times even if it has
+                        * multiple shared objects */
+                       for (other = cur + 1; other < lfsck_dup_saved;
+                            other++) {
+                               struct lu_fid *other_fid;
+
+                               other_fid = &lfsck_duplicates[other].ld_mds_fid;
+                               if (lfsck_fidcmp(other_fid, mds_fid) == 0)
+                                       other_fid->f_oid = 0;
+                       }
+                       mds_fid->f_oid = 0;
+               }
+       } while (unresolved && repaired);
+
+       /* remove the hard links that lfsck_fix_duplicate() left in dupedir */
+       for (cur = 0; cur < lfsck_dup_saved; cur++) {
+               char *link_name;
+
+               lfsck_drop_caches();
+
+               link_name = lfsck_duplicates[cur].ld_link;
+               if (link_name == NULL)
+                       continue;
+
+               if (unlink(link_name) == 0)
+                       log_write("%s: %s unlinked\n", progname,
+                                 link_name);
+               else
+                       log_write("%s: failed to unlink %s: %s\n", progname,
+                                 link_name, strerror(errno));
+               free(link_name);
+               lfsck_duplicates[cur].ld_link = NULL;
+       }
+
+       snprintf(cmd, sizeof(cmd) - 1, "rm -rvf '%s'", dupedir);
+       VERBOSE(1, "%s\n", cmd);
+       if (system(cmd) == -1)
+               VERBOSE(1, "%s failed", cmd);
+
+       log_write("lfsck: pass4 finished\n");
+
+       return 0;
+}
+
+/*
+ * This is a placeholder to check for filesize correctness no fixup is in
+ * place right now since file size is still obtained from osts.
+ *
+ * Only does anything when built with LFSCK_CHECK_SIZE: it then walks
+ * mds_sizeinfodb and, for every record whose stored and calculated sizes
+ * differ, compares the calculated size with what stat64() reports for the
+ * file and logs a mismatch.  Without LFSCK_CHECK_SIZE it returns 0.
+ *
+ * Returns 0 on success or -EINVAL on a database error.
+ */
+int lfsck_run_pass5(DB *mds_direntdb, DB *mds_sizeinfodb)
+{
+       int rc = 0;
+#ifdef LFSCK_CHECK_SIZE
+       struct lfsck_mds_szinfo mds_szinfo1;
+       char path[PATH_MAX];
+       struct stat64 statbuf;
+       DBT key,data;
+       DBC *dbcp = NULL;
+
+       log_write("lfsck: pass5: file size correctness\n");
+
+       rc = mds_sizeinfodb->cursor(mds_sizeinfodb, NULL, &dbcp, 0);
+       if (rc != 0) {
+               log_write("%s: error acquiring cursor for database: %s\n",
+                         progname, db_strerror(rc));
+               rc = -EINVAL;
+               goto out;
+       }
+       memset(&key, 0, sizeof(key));
+       memset(&data, 0, sizeof(data));
+       data.data = &mds_szinfo1;
+       data.size = data.ulen = sizeof(mds_szinfo1);
+       data.flags = DB_DBT_USERMEM;
+       while ((rc = dbcp->c_get(dbcp, &key, &data, DB_NEXT)) == 0) {
+               letocpu_mds_szinfo(&mds_szinfo1);
+
+               if (mds_szinfo1.mds_size != mds_szinfo1.mds_calc_size) {
+                       if (lfsck_get_path(mds_szinfo1.mds_fid, mds_direntdb,
+                                          path, sizeof(path))) {
+                               log_write("%s: failed to get path and update "
+                                         "size for fid "LPU64"\n",
+                                         progname, mds_szinfo1.mds_fid);
+                               fix_failed++;
+                               continue;
+                       }
+
+                       if (stat64(path, &statbuf)) {
+                               log_write("%s: pass5: failed to stat %s\n",
+                                         progname, path);
+                               fix_failed++;
+                               continue;
+                       }
+                       if (statbuf.st_size == mds_szinfo1.mds_calc_size) {
+                               VERBOSE(2, "%s: %s really has right size\n",
+                                       progname, path);
+                       } else {
+                               log_write("%s: %s size "LPU64" != "LPU64"\n",
+                                         progname, path, statbuf.st_size,
+                                         mds_szinfo1.mds_calc_size);
+                               fixed++;
+                       }
+               }
+       }
+       if (rc != DB_NOTFOUND) {
+               log_write("%s: error getting next element in db: %s\n",
+                         progname, db_strerror(rc));
+               rc = -EINVAL;
+               goto out;
+       }
+       /* DB_NOTFOUND only marks the end of the table */
+       rc = 0;
+       log_write("%s: pass5 finished\n", progname);
+out:
+       /* the cursor is still NULL if it could not be acquired */
+       if (dbcp)
+               dbcp->c_close(dbcp);
+#endif /* LFSCK_CHECK_SIZE */
+       return rc;
+}
+
+/*
+ * Read characters from stdin until a yes/no answer is seen.
+ *
+ * Returns 1 for 'Y' or 'y', 0 for 'N' or 'n', and -1 if getchar() reports
+ * EOF before an answer is read.  Every other character is skipped.
+ */
+int get_response()
+{
+       /* getchar() returns an int so that EOF is distinct from every byte
+        * value; storing it in a char loses that distinction */
+       int c;
+
+       while ((c = getchar()) != EOF) {
+               /* explicit cases: a NUL byte must not count as an answer */
+               switch (c) {
+               case 'Y':
+               case 'y':
+                       return 1;
+               case 'N':
+               case 'n':
+                       return 0;
+               default:
+                       break;
+               }
+       }
+
+       return -1;
+}
+
+/*
+ * Starting point for each thread.
+ *
+ * @arg is the struct lfsck_thread_info for this worker.  The thread waits
+ * until lfsck_run_checks() broadcasts on init_cond, then calls run_test()
+ * for every OST index in [start_ost_idx, end_ost_idx).  tinfo->status is
+ * left at 0 on success or holds the last non-zero run_test() result.
+ *
+ * If the thread is woken while all_started is still 0 (thread creation
+ * failed in lfsck_run_checks()) it exits without running any test.
+ */
+void *lfsck_start_thread(void *arg)
+{
+       struct lfsck_thread_info *tinfo = (struct lfsck_thread_info *)arg;
+       int started;
+       int i, rc;
+
+       tinfo->status = 0;
+
+       /*
+        * pthread_cond_wait() returns with init_mutex held, so the mutex
+        * must be released on both paths; otherwise the first thread to
+        * wake up keeps it and the remaining waiters can never return.
+        * NOTE(review): a single wait (not a predicate loop) is kept on
+        * purpose so the error-path broadcast still releases the thread.
+        */
+       pthread_mutex_lock(&init_mutex);
+       if (!all_started)
+               pthread_cond_wait(&init_cond, &init_mutex);
+       started = all_started;
+       pthread_mutex_unlock(&init_mutex);
+
+       if (!started)
+               pthread_exit(NULL);
+
+       for (i = tinfo->start_ost_idx; i < tinfo->end_ost_idx; i++) {
+               rc = run_test(i, tinfo->mds_hdr, tinfo->mds_direntdb,
+                             tinfo->mds_sizeinfodb);
+               if (rc) {
+                       log_write("lfsck: ost_idx %d: error running check\n",i);
+                       tinfo->status = rc;
+               }
+       }
+       pthread_exit(NULL);
+}
+
+/*
+ * Start threads and run filesystem checks and repair.
+ *
+ * Opens the MDS header, dirinfo and sizeinfo databases in mds_file, reads
+ * the MDS header record, and splits the OST index range across num_threads
+ * worker threads (lfsck_start_thread).  After all workers are joined, pass 4
+ * and pass 5 are run from the calling thread.
+ *
+ * Returns 0 on success and non-zero on failure.  A worker failure does not
+ * prevent pass 4/5 from running, but is reported to the caller if those
+ * passes succeed.  If the user declines to continue when more OSTs are
+ * configured than the MDS references, the get_response() result (0 or -1)
+ * is returned.
+ */
+int lfsck_run_checks()
+{
+       struct lfsck_mds_hdr *mds_hdr = NULL;
+       struct lfsck_thread_info *tinfo = NULL;
+       pthread_t *threads = NULL;
+       int rc, i;
+       int thread_rc = 0;
+       DB *mds_direntdb = NULL;
+       DB *mds_hdrdb = NULL;
+       DB *mds_sizeinfodb = NULL;
+       DBT key, data;
+       int num_osts;
+
+       rc = lfsck_opendb(mds_file, MDS_HDR, &mds_hdrdb, 0, 0, 0);
+       if (rc != 0) {
+               log_write("%s: error opening mds_hdr in %s: rc %d\n",
+                         progname, mds_file, rc);
+               return -EINVAL;
+       }
+       mds_hdr = malloc(sizeof(*mds_hdr));
+       if (mds_hdr == NULL) {
+               /* sizeof yields size_t; cast to match the %lu conversion */
+               log_write("%s: out of memory allocating DB header (%lu)\n",
+                         progname, (unsigned long)sizeof(*mds_hdr));
+               rc = -ENOMEM;
+               goto out;
+       }
+
+       /* the header record is keyed by MDS_MAGIC and read into mds_hdr */
+       memset(&key, 0, sizeof(key));
+       memset(&data, 0, sizeof(data));
+       mds_hdr->mds_magic = MDS_MAGIC;
+       key.data = &mds_hdr->mds_magic;
+       key.size = sizeof(mds_hdr->mds_magic);
+       data.data = mds_hdr;
+       data.size = sizeof(*mds_hdr);
+       data.ulen = sizeof(*mds_hdr);
+       data.flags = DB_DBT_USERMEM;
+       rc = mds_hdrdb->get(mds_hdrdb, NULL, &key, &data, 0);
+       if (rc != 0) {
+               log_write("%s: error getting mds_hdr info %s: %s\n",
+                         progname, mds_file, db_strerror(rc));
+               goto out;
+       }
+       letocpu_mds_hdr(mds_hdr);
+
+       rc = lfsck_opendb(mds_file, MDS_DIRINFO, &mds_direntdb, 0, 0, 0);
+       if (rc != 0) {
+               log_write("%s: error opening dirinfo db %s: rc %d\n",
+                         progname, mds_file, rc);
+               goto out;
+       }
+
+       rc = lfsck_opendb(mds_file, MDS_SIZEINFO, &mds_sizeinfodb, 0, 0, 0);
+       if (rc != 0) {
+               log_write("%s: error opening sizeinfo db %s: rc %d\n",
+                         progname, mds_file, rc);
+               goto out;
+       }
+
+       if (lov_tgt_count > mds_hdr->mds_num_osts) {
+               fprintf(stderr, "%s: number of osts in lov (%u) > "
+                               "num referenced in mds (%u) (new ost or "
+                               "empty filesystem?)\n", progname,
+                               lov_tgt_count, mds_hdr->mds_num_osts);
+               fprintf(stderr, "Do you wish to continue? (y/n)\n");
+               rc = get_response();
+               if (rc != 1) {
+                       log_write("%s: exiting \n", progname);
+                       goto out;
+               }
+               fprintf(stderr, "\n");
+
+               num_osts = lov_tgt_count;
+       } else {
+               num_osts = mds_hdr->mds_num_osts;
+       }
+       /* never start more threads than there are OSTs to check */
+       if (num_threads > num_osts)
+               num_threads = num_osts;
+
+       tinfo = calloc(num_threads, sizeof(*tinfo));
+       if (tinfo == NULL) {
+               log_write("%s: out of memory for thread info\n", progname);
+               rc = -ENOMEM;
+               goto out;
+       }
+       threads = calloc(num_threads, sizeof(pthread_t));
+       if (threads == NULL) {
+               log_write("%s: out of memory for threads\n", progname);
+               rc =  -ENOMEM;
+               goto out;
+       }
+
+       all_started = 0;
+       for (i = 0; i < num_threads; i++) {
+               __u32 end_ost_idx;
+               __u32 chunk;
+
+               /* OSTs per thread, rounded up so every OST is covered */
+               chunk = num_osts / num_threads;
+               if (num_osts % num_threads)
+                       chunk++;
+               tinfo[i].mds_hdr = mds_hdr;
+               tinfo[i].mds_direntdb = mds_direntdb;
+               tinfo[i].mds_sizeinfodb = mds_sizeinfodb;
+               tinfo[i].status = 0;
+               tinfo[i].start_ost_idx = (chunk) * i;
+               end_ost_idx = (chunk) * (i + 1);
+               end_ost_idx = end_ost_idx > num_osts ?
+                             num_osts : end_ost_idx;
+               tinfo[i].end_ost_idx = end_ost_idx;
+               rc = pthread_create(&threads[i], NULL, lfsck_start_thread,
+                                   &tinfo[i]);
+               if (rc) {
+                       log_write("%s: error starting thread waiting for other"
+                                 " threads to exit\n", progname);
+                       /* wake the waiters while all_started is still 0 */
+                       pthread_mutex_lock(&init_mutex);
+                       pthread_cond_broadcast(&init_cond);
+                       pthread_mutex_unlock(&init_mutex);
+                       /* NOTE(review): the cancelled threads are not joined
+                        * before tinfo and mds_hdr are freed below */
+                       for (--i; i >= 0; i--)
+                               pthread_cancel(threads[i]);
+
+                       rc = -ENOMEM;
+                       goto out;
+               }
+       }
+       /* release all workers at once */
+       pthread_mutex_lock(&init_mutex);
+       all_started = 1;
+       pthread_cond_broadcast(&init_cond);
+       pthread_mutex_unlock(&init_mutex);
+       for (i = 0; i < num_threads; i++) {
+               rc = pthread_join(threads[i], NULL);
+               if (tinfo[i].status) {
+                       log_write("%s: error running thread %u\n", progname, i);
+                       /* kept separately: rc is reused by the passes below */
+                       thread_rc = -EINVAL;
+               }
+       }
+
+       rc = lfsck_run_pass4(mds_direntdb);
+       if (rc != 0)
+               goto out;
+
+       rc = lfsck_run_pass5(mds_direntdb, mds_sizeinfodb);
+
+out:
+       /* do not let a later successful pass hide a failed OST check */
+       if (rc == 0 && thread_rc != 0)
+               rc = thread_rc;
+
+       if (threads)
+               free(threads);
+       if (tinfo)
+               free(tinfo);
+       if (mds_hdr)
+               free(mds_hdr);
+       if (mds_direntdb)
+               mds_direntdb->close(mds_direntdb, 0);
+       if (mds_hdrdb)
+               mds_hdrdb->close(mds_hdrdb, 0);
+       if (mds_sizeinfodb)
+               mds_sizeinfodb->close(mds_sizeinfodb, 0);
+
+       return rc;
+}
+
+/*
+ * Create directory @dir with mode 0700, accepting a directory that already
+ * exists.  Returns 0 on success, a negative errno value if mkdir() or
+ * stat() fails, or -EINVAL if @dir exists but is not a directory.
+ */
+static int lfsck_mkdir_exist_ok(const char *dir)
+{
+       struct stat statbuf;
+       int err;
+
+       VERBOSE(2, "%s: creating %s\n", progname, dir);
+       if (mkdir(dir, 0700) == 0)
+               return 0;
+
+       /* save errno first: the fprintf() calls below may overwrite it */
+       err = errno;
+       if (err != EEXIST) {
+               fprintf(stderr, "%s: error creating %s: %s\n",
+                       progname, dir, strerror(err));
+               return -err;
+       }
+
+       if (stat(dir, &statbuf)) {
+               err = errno;
+               fprintf(stderr, "%s: error stat %s: %s\n",
+                       progname, dir, strerror(err));
+               return -err;
+       }
+
+       if (!S_ISDIR(statbuf.st_mode)) {
+               fprintf(stderr, "%s: error %s is not a directory\n",
+                       progname, dir);
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+/*
+ * Build the lost+found path under mnt_path and the "duplicates" path below
+ * it (stored in lostandfounddir and dupedir), and make sure both exist as
+ * directories.  Returns 0 on success or a negative errno value.
+ */
+int create_lostandfound()
+{
+       int rc;
+
+       snprintf(lostandfounddir, PATH_MAX - 1, "%s/lost+found", mnt_path);
+       lostandfounddir[PATH_MAX - 1] = '\0';
+
+       rc = lfsck_mkdir_exist_ok(lostandfounddir);
+       if (rc != 0)
+               return rc;
+
+       snprintf(dupedir, sizeof(dupedir), "%s/duplicates", lostandfounddir);
+       dupedir[PATH_MAX - 1] = '\0';
+
+       /* the existing-entry check must look at dupedir itself, not at its
+        * parent lost+found directory */
+       return lfsck_mkdir_exist_ok(dupedir);
+}
+
+/*
+ * lfsck entry point: parse arguments, set up logging and lost+found, read
+ * the LOV configuration, run the checks and release global allocations.
+ *
+ * Exit status: 16 on a usage error or when not run as root without
+ * lfsck_force, 8 when lost+found setup or get_lov_config() fails, 2 when
+ * fix_failed is non-zero, 1 when fixed is non-zero, otherwise 0.
+ */
+int main(int argc, char *argv[])
+{
+       int idx;
+
+       fprintf(stderr, "lfsck %s (%s)\n", E2FSPROGS_VERSION, E2FSPROGS_DATE);
+
+       if (parse_args(argc, argv) != 0) {
+               usage();
+               exit(16);
+       }
+
+       /* only root may run the tool unless lfsck_force is set */
+       if (!lfsck_force && getuid() != 0) {
+               fprintf(stderr, "%s: can only be run by root user\n", progname);
+               exit(16);
+       }
+
+       log_open();
+
+       /* lost+found is only needed when saving or deleting objects */
+       if (lfsck_save || lfsck_delete) {
+               if (create_lostandfound() != 0) {
+                       log_write("%s: failed to create lost+found directory\n",
+                                 progname);
+                       log_close(-1);
+                       exit(8);
+               }
+       }
+
+       if (get_lov_config() != 0) {
+               log_close(-1);
+               exit(8);
+       }
+
+       if (lfsck_run_checks() != 0)
+               log_close(-1);
+
+       /* free(NULL) is a no-op, so unset pointers need no test */
+       free(mds_file);
+       for (idx = 0; idx < LOV_MAX_OSTS; idx++)
+               free(ost_files[idx]);
+       free(lfsck_duplicates);
+
+       log_close(0);
+
+       if (fix_failed) {
+               fprintf(stderr, "%s: exit with %u unfixed errors\n",
+                       progname, fix_failed);
+               return 2;
+       }
+
+       printf("%s: fixed %u errors\n", progname, fixed);
+       return fixed != 0;
+}
diff --git a/e2fsck/lfsck_common.c b/e2fsck/lfsck_common.c
new file mode 100644 (file)
index 0000000..165c2bf
--- /dev/null
@@ -0,0 +1,442 @@
+/*
+ * Copyright (c) 2004  Hewlett-Packard Co.
+ * Copyright (c) 2011  Whamcloud, Inc
+ */
+/*****************************************************************************
+ * e2fsck extensions: code for gathering data from the OST & MDT filesystems
+ * when e2fsck is run against them. The best description and knowledge of
+ * the layout and information gathered is in lfsck.h where the structures
+ * defining each entry in the tables are declared. Basically the ost file
+ * contains one table with each entry holding the object id and size.
+ * In addition there is header information at the start of the file.
+ * The MDT file contains multiple tables, one per OST. Each MDT/OST table
+ * contains an entry describing the MDT FID and the OST object associated
+ * with this FID on an OST. In addition the MDT also contains a table
+ * with the mds_fid and the FID of the containing directory. Header information
+ * for each table is also included.
+ * lfsck is run afterwards where the data gathered and stored here is cross
+ * checked to ensure consistency and correctness
+ *
+ *****************************************************************************/
+#include "config.h"
+#include <string.h>
+#include <time.h>
+#include <errno.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "ext2fs/lfsck.h"
+
+#ifdef HAVE_LFSCK
+/*
+ * Cut @path in place at its last '/' and return @path.
+ * Returns "." when @path is NULL or contains no '/'.
+ */
+char *my_dirname(char *path)
+{
+       char *slash = (path != NULL) ? strrchr(path, '/') : NULL;
+
+       if (slash == NULL)
+               return ".";
+
+       /* terminate the string where the last component begins */
+       *slash = '\0';
+       return path;
+}
+
+/*
+ * Return a pointer to the part of @path after its last '/', or @path
+ * itself when it contains no '/' (including when @path is NULL).
+ */
+const char *my_basename(const char *path)
+{
+       const char *slash;
+
+       if (path == NULL)
+               return path;
+
+       slash = strrchr(path, '/');
+       return (slash != NULL) ? slash + 1 : path;
+}
+
+/* Database environment handle: created by lfsck_create_dbenv() and passed
+ * to db_create() by lfsck_opendb(). */
+DB_ENV *dbenv;
+
+/*
+ * Hash the first 32-bit word at @p: the word is shifted right by 7 bits and
+ * masked with (HASH_SIZE - 1).
+ */
+u_int32_t lfsck_hash_raw_fn(const void *p)
+{
+       const u_int32_t *word = (const u_int32_t *)p;
+       u_int32_t hash = (*word >> 7) & (HASH_SIZE - 1);
+
+       return hash;
+}
+
+
+/*
+ * Hash callback installed with DB->set_h_hash() in lfsck_opendb().
+ * Keys shorter than 32 bits cannot be hashed: the process prints an error
+ * and exits with status 128.  @dbp is unused.
+ */
+u_int32_t lfsck_hash_fn(DB *dbp, const void *p, u_int32_t len)
+{
+       if (len >= sizeof(u_int32_t))
+               return lfsck_hash_raw_fn(p);
+
+       printf("Hash size error");
+       exit(128);
+}
+
+/*
+ * Create and open the global database environment (dbenv).
+ *
+ * The cache is sized at three quarters of the currently available physical
+ * memory, clamped to the range [10MB, 500MB].  If the amount of available
+ * memory cannot be determined the minimum is used.
+ *
+ * @progname is only used as a prefix for the db_env_create() error message.
+ *
+ * Returns 0 on success or -EINVAL on failure; after a failure past
+ * db_env_create() the environment is closed and dbenv is reset to NULL.
+ */
+int lfsck_create_dbenv(const char *progname)
+{
+       const unsigned long cache_max = 500UL * 1024 * 1024;
+       const unsigned long cache_min = 10UL * 1024 * 1024;
+       unsigned long per_page;
+       unsigned long cachesize;
+       long pages;
+       int rc;
+
+       /* bytes of cache to budget for each available page (3/4 of it) */
+       per_page = ((unsigned long)getpagesize() * 3) / 4;
+       pages = sysconf(_SC_AVPHYS_PAGES);
+
+       if (pages <= 0 || per_page == 0) {
+               /* sysconf() failed (-1): do not let the error value turn
+                * into a huge unsigned product */
+               cachesize = cache_min;
+       } else if ((unsigned long)pages > cache_max / per_page) {
+               /* compared by division so the product cannot overflow */
+               cachesize = cache_max;
+       } else {
+               cachesize = per_page * (unsigned long)pages;
+       }
+       if (cachesize < cache_min)
+               cachesize = cache_min;
+
+       rc = db_env_create(&dbenv, 0);
+       if (rc != 0) {
+               fprintf(stderr, "%s: error creating dbenv: %s\n",
+                       progname, db_strerror(rc));
+               return -EINVAL;
+       }
+
+       rc = dbenv->set_cachesize(dbenv, 0, cachesize,  0);
+       if (rc != 0) {
+               dbenv->err(dbenv, rc, "set_cachesize");
+               goto fail;
+       }
+       rc = dbenv->set_data_dir(dbenv, "/");
+       if (rc != 0) {
+               dbenv->err(dbenv, rc, "set_data_dir");
+               goto fail;
+       }
+
+       /* Open a private environment in /tmp with the memory pool and
+        * locking subsystems; no transaction or logging subsystem is
+        * requested. */
+       rc = dbenv->open(dbenv, "/tmp", DB_CREATE | DB_PRIVATE |
+                        DB_INIT_MPOOL | DB_INIT_LOCK | DB_THREAD, 0);
+       if (rc != 0) {
+               dbenv->err(dbenv, rc, "environment open: ");
+               goto fail;
+       }
+       return 0;
+
+fail:
+       dbenv->close(dbenv, 0);
+       /* the handle must not be used after close() */
+       dbenv = NULL;
+       return -EINVAL;
+}
+
+/*
+ * Open (creating if necessary) the hash database @dbname stored in file
+ * @fname and return its handle in *@dbpp.
+ *
+ * The shared environment is created on the first call.  @allow_dup, when
+ * non-zero, sets DB_DUPSORT on the database.  When both @keydata_size and
+ * @num_files are non-zero they are used to set the hash fill factor and
+ * the expected element count; failures of those two calls are reported
+ * but are not fatal.
+ *
+ * Returns 0 on success.  On failure *@dbpp is not written and the return
+ * value is -EIO if the environment could not be created, or (positive)
+ * EIO for any later error.
+ */
+int lfsck_opendb(const char *fname, const char *dbname, DB **dbpp,
+                int allow_dup, int keydata_size, int num_files)
+{
+       static int dbenv_set;
+       const int pagesize = 512;
+       DB *db;
+       int rc;
+
+       if (!dbenv_set) {
+               if (lfsck_create_dbenv(dbname))
+                       return -EIO;
+               dbenv_set = 1;
+       }
+
+       rc = db_create(&db, dbenv, 0);
+       if (rc) {
+               fprintf(stderr, "%s: error db_create: %s\n",
+                       dbname, db_strerror(rc));
+               return EIO;
+       }
+
+       rc = db->set_pagesize(db, pagesize);
+       if (rc != 0) {
+               db->err(db, rc, "set_pagesize");
+               goto fail;
+       }
+
+       rc = db->set_lorder(db, 1234);
+       if (rc != 0) {
+               db->err(db, rc, "set_lorder");
+               goto fail;
+       }
+
+       if (keydata_size && num_files) {
+               int fill = (pagesize - 32) / (keydata_size + 8);
+
+               /* tuning only: report problems but keep going */
+               rc = db->set_h_ffactor(db, fill);
+               if (rc != 0)
+                       db->err(db, rc, "set_h_ffactor");
+
+               rc = db->set_h_nelem(db, num_files);
+               if (rc != 0)
+                       db->err(db, rc, "set_h_nelem");
+       }
+
+       rc = db->set_h_hash(db, lfsck_hash_fn);
+       if (rc != 0) {
+               db->err(db, rc, "set_h_hash");
+               goto fail;
+       }
+
+       if (allow_dup) {
+               rc = db->set_flags(db, DB_DUPSORT);
+               if (rc != 0) {
+                       fprintf(stderr, "Failure to allow duplicates\n");
+                       goto fail;
+               }
+       }
+
+       rc = db->open(db, NULL, fname, dbname, DB_HASH,
+                     DB_CREATE | DB_THREAD, 0664);
+       if (rc != 0) {
+               db->err(db, rc, "%s:%s\n", fname, dbname);
+               goto fail;
+       }
+
+       *dbpp = db;
+       return 0;
+
+fail:
+       /* common cleanup for every failure after db_create() succeeded */
+       db->close(db, 0);
+       return EIO;
+}
+
+/*
+ * Convert an MDS header from CPU byte order to little-endian, including
+ * the first mds_num_osts entries of mds_max_ost_id[].
+ */
+void cputole_mds_hdr(struct lfsck_mds_hdr *mds_hdr)
+{
+       /* take the count while it is still in CPU order */
+       int num_osts = mds_hdr->mds_num_osts;
+       int idx = 0;
+
+       while (idx < num_osts) {
+               mds_hdr->mds_max_ost_id[idx] =
+                       ext2fs_cpu_to_le64(mds_hdr->mds_max_ost_id[idx]);
+               idx++;
+       }
+
+       mds_hdr->mds_num_osts = ext2fs_cpu_to_le64(mds_hdr->mds_num_osts);
+       mds_hdr->mds_max_files = ext2fs_cpu_to_le64(mds_hdr->mds_max_files);
+       mds_hdr->mds_flags = ext2fs_cpu_to_le64(mds_hdr->mds_flags);
+       mds_hdr->mds_magic = ext2fs_cpu_to_le64(mds_hdr->mds_magic);
+}
+
+/*
+ * Convert an MDS header from little-endian to CPU byte order, including
+ * the first mds_num_osts entries of mds_max_ost_id[].
+ */
+void letocpu_mds_hdr(struct lfsck_mds_hdr *mds_hdr)
+{
+       int idx = 0;
+
+       /* the count has to be in CPU order before it can bound the loop */
+       mds_hdr->mds_num_osts = ext2fs_le64_to_cpu(mds_hdr->mds_num_osts);
+       mds_hdr->mds_max_files = ext2fs_le64_to_cpu(mds_hdr->mds_max_files);
+       mds_hdr->mds_flags = ext2fs_le64_to_cpu(mds_hdr->mds_flags);
+       mds_hdr->mds_magic = ext2fs_le64_to_cpu(mds_hdr->mds_magic);
+
+       while (idx < mds_hdr->mds_num_osts) {
+               mds_hdr->mds_max_ost_id[idx] =
+                       ext2fs_le64_to_cpu(mds_hdr->mds_max_ost_id[idx]);
+               idx++;
+       }
+}
+
+/* Convert every field of an OST header from CPU order to little-endian. */
+void cputole_ost_hdr(struct lfsck_ost_hdr *ost_hdr)
+{
+       struct lfsck_ost_hdr *hdr = ost_hdr;
+
+       hdr->ost_last_id = ext2fs_cpu_to_le64(hdr->ost_last_id);
+       hdr->ost_num_files = ext2fs_cpu_to_le64(hdr->ost_num_files);
+       hdr->ost_flags = ext2fs_cpu_to_le64(hdr->ost_flags);
+       hdr->ost_magic = ext2fs_cpu_to_le64(hdr->ost_magic);
+}
+
+/* Convert every field of an OST header from little-endian to CPU order. */
+void letocpu_ost_hdr(struct lfsck_ost_hdr *ost_hdr)
+{
+       struct lfsck_ost_hdr *hdr = ost_hdr;
+
+       hdr->ost_last_id = ext2fs_le64_to_cpu(hdr->ost_last_id);
+       hdr->ost_num_files = ext2fs_le64_to_cpu(hdr->ost_num_files);
+       hdr->ost_flags = ext2fs_le64_to_cpu(hdr->ost_flags);
+       hdr->ost_magic = ext2fs_le64_to_cpu(hdr->ost_magic);
+}
+
+/* Convert a FID from CPU order to little-endian (64-bit f_seq, 32-bit
+ * f_oid and f_ver). */
+void cputole_fid(struct lu_fid *fid)
+{
+       fid->f_ver = ext2fs_cpu_to_le32(fid->f_ver);
+       fid->f_oid = ext2fs_cpu_to_le32(fid->f_oid);
+       fid->f_seq = ext2fs_cpu_to_le64(fid->f_seq);
+}
+
+/* Convert a FID from little-endian to CPU order (64-bit f_seq, 32-bit
+ * f_oid and f_ver). */
+void letocpu_fid(struct lu_fid *fid)
+{
+       fid->f_ver = ext2fs_le32_to_cpu(fid->f_ver);
+       fid->f_oid = ext2fs_le32_to_cpu(fid->f_oid);
+       fid->f_seq = ext2fs_le64_to_cpu(fid->f_seq);
+}
+
+/* Convert both 64-bit fields of an OST id from CPU order to little-endian. */
+void cputole_oi(struct ost_id *oi)
+{
+       oi->oi_seq = ext2fs_cpu_to_le64(oi->oi_seq);
+       oi->oi_id = ext2fs_cpu_to_le64(oi->oi_id);
+}
+
+/* Convert both 64-bit fields of an OST id from little-endian to CPU order. */
+void letocpu_oi(struct ost_id *oi)
+{
+       oi->oi_seq = ext2fs_le64_to_cpu(oi->oi_seq);
+       oi->oi_id = ext2fs_le64_to_cpu(oi->oi_id);
+}
+
+/* Convert both FIDs of an MDS directory entry to little-endian. */
+void cputole_mds_dirent(struct lfsck_mds_dirent *mds_dirent)
+{
+       struct lfsck_mds_dirent *ent = mds_dirent;
+
+       cputole_fid(&ent->mds_fid);
+       cputole_fid(&ent->mds_dirfid);
+}
+
+/* Convert both FIDs of an MDS directory entry to CPU byte order. */
+void letocpu_mds_dirent(struct lfsck_mds_dirent *mds_dirent)
+{
+       struct lfsck_mds_dirent *ent = mds_dirent;
+
+       letocpu_fid(&ent->mds_fid);
+       letocpu_fid(&ent->mds_dirfid);
+}
+
+/*
+ * Convert an MDS size-info record from CPU order to little-endian.
+ * Fields are grouped below by the width of the conversion applied.
+ */
+void cputole_mds_szinfo(struct lfsck_mds_szinfo *mds_szinfo)
+{
+       struct lfsck_mds_szinfo *sz = mds_szinfo;
+
+       /* 16-bit fields */
+       sz->mds_stripe_start = ext2fs_cpu_to_le16(sz->mds_stripe_start);
+       sz->mds_stripe_count = ext2fs_cpu_to_le16(sz->mds_stripe_count);
+
+       /* 32-bit fields */
+       sz->mds_stripe_pattern = ext2fs_cpu_to_le32(sz->mds_stripe_pattern);
+       sz->mds_stripe_size = ext2fs_cpu_to_le32(sz->mds_stripe_size);
+
+       /* 64-bit fields */
+       sz->mds_calc_size = ext2fs_cpu_to_le64(sz->mds_calc_size);
+       sz->mds_size = ext2fs_cpu_to_le64(sz->mds_size);
+       sz->mds_seq = ext2fs_cpu_to_le64(sz->mds_seq);
+       sz->mds_fid = ext2fs_cpu_to_le64(sz->mds_fid);
+}
+
+/*
+ * Convert an MDS size-info record from little-endian to CPU order.
+ * Fields are grouped below by the width of the conversion applied.
+ */
+void letocpu_mds_szinfo(struct lfsck_mds_szinfo *mds_szinfo)
+{
+       struct lfsck_mds_szinfo *sz = mds_szinfo;
+
+       /* 16-bit fields */
+       sz->mds_stripe_start = ext2fs_le16_to_cpu(sz->mds_stripe_start);
+       sz->mds_stripe_count = ext2fs_le16_to_cpu(sz->mds_stripe_count);
+
+       /* 32-bit fields */
+       sz->mds_stripe_pattern = ext2fs_le32_to_cpu(sz->mds_stripe_pattern);
+       sz->mds_stripe_size = ext2fs_le32_to_cpu(sz->mds_stripe_size);
+
+       /* 64-bit fields */
+       sz->mds_calc_size = ext2fs_le64_to_cpu(sz->mds_calc_size);
+       sz->mds_size = ext2fs_le64_to_cpu(sz->mds_size);
+       sz->mds_seq = ext2fs_le64_to_cpu(sz->mds_seq);
+       sz->mds_fid = ext2fs_le64_to_cpu(sz->mds_fid);
+}
+
+/* Convert an MDS object entry (FID, OST id, OST index and offset) from
+ * CPU order to little-endian. */
+void cputole_mds_objent(struct lfsck_mds_objent *mds_objent)
+{
+       struct lfsck_mds_objent *ent = mds_objent;
+
+       ent->mds_ostoffset = ext2fs_cpu_to_le32(ent->mds_ostoffset);
+       ent->mds_ostidx = ext2fs_cpu_to_le32(ent->mds_ostidx);
+       cputole_oi(&ent->mds_oi);
+       cputole_fid(&ent->mds_fid);
+}
+
+/* Convert an MDS object entry (FID, OST id, OST index and offset) from
+ * little-endian to CPU order. */
+void letocpu_mds_objent(struct lfsck_mds_objent *mds_objent)
+{
+       struct lfsck_mds_objent *ent = mds_objent;
+
+       ent->mds_ostoffset = ext2fs_le32_to_cpu(ent->mds_ostoffset);
+       ent->mds_ostidx = ext2fs_le32_to_cpu(ent->mds_ostidx);
+       letocpu_oi(&ent->mds_oi);
+       letocpu_fid(&ent->mds_fid);
+}
+
+/* Convert an OST object entry (OST id, size and bytes) from CPU order to
+ * little-endian. */
+void cputole_ost_objent(struct lfsck_ost_objent *ost_objent)
+{
+       struct lfsck_ost_objent *ent = ost_objent;
+
+       ent->ost_bytes = ext2fs_cpu_to_le64(ent->ost_bytes);
+       ent->ost_size = ext2fs_cpu_to_le64(ent->ost_size);
+       cputole_oi(&ent->ost_oi);
+}
+
+/*
+ * Convert an OST object entry (OST id, size and bytes) from little-endian
+ * to CPU byte order.
+ */
+void letocpu_ost_objent(struct lfsck_ost_objent *ost_objent)
+{
+       /* letocpu_oi() already converts oi_id and oi_seq; converting them a
+        * second time here would swap them back on big-endian hosts */
+       letocpu_oi(&ost_objent->ost_oi);
+       ost_objent->ost_size = ext2fs_le64_to_cpu(ost_objent->ost_size);
+       ost_objent->ost_bytes = ext2fs_le64_to_cpu(ost_objent->ost_bytes);
+}
+
+/*
+ * Convert a LOV user metadata blob from little-endian to CPU byte order:
+ * the header fields first, then lmm_stripe_count per-stripe object entries.
+ */
+void letocpu_lov_user_md(struct lov_user_md *lmm)
+{
+       struct lov_user_ost_data_v1 *objs;
+       int idx;
+
+       lmm->lmm_magic = ext2fs_le32_to_cpu(lmm->lmm_magic);
+       lmm->lmm_pattern = ext2fs_le32_to_cpu(lmm->lmm_pattern);
+       letocpu_oi((struct ost_id *)&lmm->lmm_object_id);
+       lmm->lmm_stripe_size = ext2fs_le32_to_cpu(lmm->lmm_stripe_size);
+       lmm->lmm_stripe_count = ext2fs_le16_to_cpu(lmm->lmm_stripe_count);
+       /* No swabbing needed for the lov_user_md_v3 lmm_pool_name */
+
+       /* Anything that is not V3 is treated as V1 here.  If there is a bad
+        * magic, this will be found immediately in the call to
+        * lfsck_check_lov_ea() following this function. */
+       if (lmm->lmm_magic != LOV_USER_MAGIC_V3)
+               objs = lmm->lmm_objects;
+       else
+               objs = ((struct lov_user_md_v3 *)lmm)->lmm_objects;
+
+       for (idx = 0; idx < lmm->lmm_stripe_count; idx++) {
+               letocpu_oi((struct ost_id *)&objs[idx].l_object_id);
+               objs[idx].l_ost_gen = ext2fs_le32_to_cpu(objs[idx].l_ost_gen);
+               objs[idx].l_ost_idx = ext2fs_le32_to_cpu(objs[idx].l_ost_idx);
+       }
+}
+
+/*
+ * Look up the FID of inode @ino on filesystem @fs and store it in *@fid.
+ *
+ * The FID is taken from the "lma" trusted extended attribute.  When that
+ * attribute is not found (or the EA magic is bad) an IGIF is composed
+ * instead: f_seq = inode number, f_oid = inode generation, f_ver = 0.
+ *
+ * Returns 0 on success or the ext2fs error code of the failing call.
+ */
+int lfsck_get_fid(ext2_filsys fs, ino_t ino, struct lu_fid *fid)
+{
+       struct lustre_mdt_attrs lma;
+       struct ext2_inode *inode;
+       errcode_t rc;
+       int size;
+
+       rc = ext2fs_get_mem(EXT2_INODE_SIZE(fs->super), &inode);
+       if (rc) {
+               com_err("ext2fs_get_mem", rc, "allocating %d bytes\n",
+                       EXT2_INODE_SIZE(fs->super));
+               return rc;
+       }
+
+       rc = ext2fs_read_inode_full(fs, ino, inode, EXT2_INODE_SIZE(fs->super));
+       if (rc) {
+               com_err("ext2fs_read_inode_full", rc,
+                       "reading inode %lu\n", ino);
+               goto out;
+       }
+
+       rc = ext2fs_attr_get(fs, inode, EXT2_ATTR_INDEX_TRUSTED, "lma",
+                            (char *)&lma, sizeof(lma), &size);
+       if (rc == 0) {
+               *fid = lma.lma_self_fid;
+       } else if (rc == EXT2_ET_EA_NAME_NOT_FOUND ||
+                  rc == EXT2_ET_EA_BAD_MAGIC) {
+               /* compose igif */
+               fid->f_seq = ino;
+               fid->f_oid = inode->i_generation;
+               fid->f_ver = 0;
+               rc = 0;
+       }
+       /* any other error falls through and is returned unchanged */
+
+out:
+       ext2fs_free_mem(&inode);
+       return rc;
+}
+
+/*
+ * Return 1 if @dirfid has f_seq == EXT2_ROOT_INO with f_oid and f_ver both
+ * zero, otherwise 0.
+ */
+int lfsck_is_dirfid_root(const struct lu_fid *dirfid)
+{
+       return dirfid->f_seq == EXT2_ROOT_INO &&
+              dirfid->f_oid == 0 && dirfid->f_ver == 0;
+}
+
+/*
+ * Compare two FIDs.  Returns 0 if they match and 1 otherwise.
+ *
+ * An IGIF never matches a non-IGIF.  f_seq and f_oid must always be equal;
+ * f_ver is only compared when neither FID is an IGIF.
+ */
+int lfsck_fidcmp(const struct lu_fid *fid1, const struct lu_fid *fid2)
+{
+       int igif1 = !!fid_is_igif(fid1);
+       int igif2 = !!fid_is_igif(fid2);
+
+       if (igif1 != igif2)
+               return 1;
+
+       if (fid1->f_seq != fid2->f_seq || fid1->f_oid != fid2->f_oid)
+               return 1;
+
+       /* do not compare f_ver for comparing igif-s */
+       if (!igif1 && fid1->f_ver != fid2->f_ver)
+               return 1;
+
+       return 0;
+}
+#endif
index bdf2616..1edf041 100644 (file)
@@ -50,6 +50,7 @@
 
 #include "e2fsck.h"
 #include <ext2fs/ext2_ext_attr.h>
+#include "ext2fs/lfsck.h"
 
 #include "problem.h"
 
@@ -394,6 +395,8 @@ static void check_ea_in_inode(e2fsck_t ctx, struct problem_context *pctx)
        struct ext2_ext_attr_entry *entry;
        char *start, *end;
        unsigned int storage_size, remain;
+       struct lov_user_md_v1 *lmm = NULL;
+       struct lustre_mdt_attrs *lma = NULL;
        int problem = 0;
 
        inode = (struct ext2_inode_large *) pctx->inode;
@@ -464,6 +467,9 @@ static void check_ea_in_inode(e2fsck_t ctx, struct problem_context *pctx)
                        goto fix;
                }
 
+               e2fsck_lfsck_find_ea(ctx, inode, entry,
+                                    start + entry->e_value_offs, &lmm, &lma);
+
                /* If EA value is stored in external inode then it does not
                 * consume space here */
                if (entry->e_value_inum == 0)
@@ -471,7 +477,15 @@ static void check_ea_in_inode(e2fsck_t ctx, struct problem_context *pctx)
 
                entry = EXT2_EXT_ATTR_NEXT(entry);
        }
+
+       if (lmm)
+               e2fsck_lfsck_save_ea(ctx, pctx->ino, inode->i_generation,
+                                    lmm, lma);
 fix:
+       if (lmm)
+               ext2fs_free_mem(&lmm);
+       if (lma)
+               ext2fs_free_mem(&lma);
        /*
         * it seems like a corruption. it's very unlikely we could repair
         * EA(s) in automatic fashion -bzzz
@@ -1031,6 +1045,12 @@ void e2fsck_pass1(e2fsck_t ctx)
                ext2fs_mark_block_bitmap2(ctx->block_found_map,
                                          fs->super->s_mmp_block);
 
+       if (!(ctx->options & E2F_OPT_READONLY) &&
+           (ctx->lustre_devtype & LUSTRE_TYPE) == LUSTRE_MDS) {
+               if (e2fsck_lfsck_remove_pending(ctx, NULL))
+                       return;
+       }
+
        while (1) {
                if (ino % (fs->super->s_inodes_per_group * 4) == 1) {
                        if (e2fsck_mmp_update(fs))
@@ -1579,6 +1599,9 @@ void e2fsck_pass1(e2fsck_t ctx)
                }
                e2fsck_pass1_dupblocks(ctx, block_buf);
        }
+
+       e2fsck_lfsck_flush_ea(ctx);
+
        ext2fs_free_mem(&inodes_to_process);
 endit:
        e2fsck_use_inode_shortcuts(ctx, 0);
@@ -1852,6 +1875,8 @@ static int check_ext_attr(e2fsck_t ctx, struct problem_context *pctx,
        struct ext2_ext_attr_entry *entry;
        int             count;
        region_t        region = 0;
+       struct lov_user_md_v1 *lmm = NULL;
+       struct lustre_mdt_attrs *lma = NULL;
        int ret;
 
        blk = ext2fs_file_acl_block(fs, inode);
@@ -2012,13 +2037,28 @@ static int check_ext_attr(e2fsck_t ctx, struct problem_context *pctx,
                        entry->e_hash = hash;
                }
 
+               if (e2fsck_lfsck_find_ea(ctx, (struct ext2_inode_large *)inode,
+                                        entry, block_buf + entry->e_value_offs,
+                                        &lmm, &lma) != 0) {
+                       if (ctx->flags & E2F_FLAG_SIGNAL_MASK)
+                               return 0;
+               }
+
                entry = EXT2_EXT_ATTR_NEXT(entry);
        }
+
+       if (lmm)
+               e2fsck_lfsck_save_ea(ctx, ino, inode->i_generation, lmm, lma);
+
        if (region_allocate(region, (char *)entry - (char *)header, 4)) {
                if (fix_problem(ctx, PR_1_EA_ALLOC_COLLISION, pctx))
                        goto clear_extattr;
        }
        region_free(region);
+       if (lmm)
+               ext2fs_free_mem(&lmm);
+       if (lma)
+               ext2fs_free_mem(&lma);
 
        count = header->h_refcount - 1;
        if (count)
@@ -2028,6 +2068,11 @@ static int check_ext_attr(e2fsck_t ctx, struct problem_context *pctx,
        return 1;
 
 clear_extattr:
+       if (lmm)
+               ext2fs_free_mem(&lmm);
+       if (lma)
+               ext2fs_free_mem(&lma);
+
        if (region)
                region_free(region);
        ext2fs_file_acl_block_set(fs, inode, 0);
diff --git a/e2fsck/pass6.c b/e2fsck/pass6.c
new file mode 100644 (file)
index 0000000..5937d05
--- /dev/null
@@ -0,0 +1,1560 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ *
+ * Copyright (c) 2004  Hewlett-Packard Co.
+ */
+/*****************************************************************************
+ * e2fsck extensions: code for gathering data from the ost & mds filesystems
+ * when e2fsck is run against them. The best description and knowledge of the
+ * layout and information gathered is in lfsck.h where the structures
+ * defining each entry in the tables are declared. Basically the ost file
+ * contains one table with each entry holding the object id and size.
+ * In addition there is header information at the start of the file.
+ * The mds file contains multiple tables one per ost. Each mds/ost table
+ * contains an entry describing the mds fid and the ost object associated
+ * with this fid on an ost. In addition the mds also contains a table
+ * with the mds_fid and the fid of the containing directory. Header information
+ * for each table is also included.
+ * lfsck is run afterwards where the data gathered and stored here is cross
+ * checked to ensure consistency and correctness
+ *
+ *****************************************************************************/
+#include "config.h"
+#include <string.h>
+#include <time.h>
+#include <errno.h>
+#include <limits.h>
+#include <stdlib.h>
+#include "ext2fs/ext2_fs.h"
+#include "ext2fs/ext2fs.h"
+
+#ifdef HAVE_LFSCK
+#include "e2fsck.h"
+#include "ext2fs/lfsck.h"
+#include "problem.h"
+//#define LOG_REMOVAL
+
+/* printf()-style message that is emitted only when E2F_OPT_VERBOSE is set
+ * in ctx->options. */
+#define VERBOSE(ctx, fmt, args...) \
+do { if (ctx->options & E2F_OPT_VERBOSE) printf(fmt, ##args); } while (0)
+
+/* printf()-style message that is emitted only when E2F_OPT_DEBUG is set
+ * in ctx->options. */
+#define DEBUG(ctx, fmt, args...) \
+do { if (ctx->options & E2F_OPT_DEBUG) printf(fmt, ##args); } while (0)
+
+/*
+ * Private state handed to lfsck_mds_dirs() while walking the MDS namespace.
+ *
+ *   ctx      - e2fsck context (filesystem handle, option/abort flags)
+ *   outdb    - database receiving one record per visited entry
+ *   dot      - inode of the directory being iterated; entries pointing at
+ *              it are skipped
+ *   dotdot   - inode of that directory's parent; entries pointing at it
+ *              are skipped as well
+ *   dotfid   - FID recorded as the parent directory FID of each entry
+ *   numfiles - running count of directory/regular-file entries seen
+ */
+struct lfsck_mds_ctx {
+       e2fsck_t        ctx;
+       DB              *outdb;
+       ext2_ino_t      dot;
+       ext2_ino_t      dotdot;
+       struct lu_fid   dotfid;
+       int             numfiles;
+};
+
+/*
+ * Private state handed to the OST object directory iterators
+ * (lfsck_iterate_obj_dirs() / lfsck_list_objs()).
+ *
+ *   ctx       - e2fsck context (filesystem handle, option/abort flags)
+ *   outdb     - database receiving one record per object
+ *   dirinode  - not referenced by the iterators visible in this part of
+ *               the file; presumably the object directory inode (TODO
+ *               confirm against the caller)
+ *   numfiles  - number of object records successfully stored
+ *   status    - set non-zero by lfsck_list_objs() on a fatal error so the
+ *               outer iteration stops
+ *   max_objid - largest object id seen so far
+ */
+struct lfsck_ost_ctx {
+       e2fsck_t        ctx;
+       DB              *outdb;
+       ext2_ino_t      dirinode;
+       int             numfiles;
+       int             status;
+       __u64           max_objid;
+};
+
+/*
+ * Close every lfsck output database that is still open and release the
+ * bookkeeping hanging off ctx->lfsck_oinfo.
+ *
+ * Returns 0 when there is nothing to clean up; otherwise the sum of the
+ * return codes of the individual DB close() calls (0 if all succeeded).
+ */
+int e2fsck_lfsck_cleanupdb(e2fsck_t ctx)
+{
+       int i;
+       int rc = 0;
+       DB *dbp;
+
+       if (ctx->lfsck_oinfo == NULL)
+               return 0;
+
+       /* Only walk the per-OST array when it has actually been allocated. */
+       for (i = 0; ctx->lfsck_oinfo->ofile_ctx != NULL &&
+                   i < ctx->lfsck_oinfo->ost_count; i++) {
+               if (ctx->lfsck_oinfo->ofile_ctx[i].dbp != NULL) {
+                       dbp = ctx->lfsck_oinfo->ofile_ctx[i].dbp;
+                       rc += dbp->close(dbp, 0);
+                       ctx->lfsck_oinfo->ofile_ctx[i].dbp = NULL;
+               }
+       }
+       if (ctx->lfsck_oinfo->mds_sizeinfo_dbp != NULL) {
+               dbp = ctx->lfsck_oinfo->mds_sizeinfo_dbp;
+               rc += dbp->close(dbp, 0);
+               ctx->lfsck_oinfo->mds_sizeinfo_dbp = NULL;
+       }
+       /* ext2fs_free_mem() is handed the ADDRESS of the pointer to release
+        * (as in the lfsck_oinfo call just below), not the pointer itself;
+        * passing the array would make it free whatever value is stored in
+        * the array's first bytes. */
+       if (ctx->lfsck_oinfo->ofile_ctx)
+               ext2fs_free_mem(&ctx->lfsck_oinfo->ofile_ctx);
+       ext2fs_free_mem(&ctx->lfsck_oinfo);
+
+       return rc;
+}
+
+/*
+ * What is the last object id for the OST on the MDS
+ *
+ * Looks up the LOV_OBJID file in the root directory, reads up to
+ * LOV_MAX_OSTS 64-bit entries from it and records them in
+ * outdb->ofile_ctx[i].max_id, setting have_max_id for each entry as well as
+ * outdb->ost_count and outdb->have_ost_count.
+ *
+ * Returns 0 on success, otherwise the error of the failing lookup, open,
+ * read or close.  outdb is left untouched when the lookup, open or read
+ * fails.
+ */
+int e2fsck_get_lov_objids(e2fsck_t ctx, struct lfsck_outdb_info *outdb)
+{
+       ext2_filsys fs = ctx->fs;
+       ext2_ino_t inode;
+       ext2_file_t e2_file;
+       __u64 *lov_objids = NULL;
+       unsigned int got = 0;
+       char *block_buf;
+       int i, rc = 0;
+
+       block_buf = e2fsck_allocate_memory(ctx, fs->blocksize * 3,
+                                          "block iterate buffer");
+
+       rc = ext2fs_lookup(fs, EXT2_ROOT_INO, LOV_OBJID,
+                          strlen(LOV_OBJID), block_buf, &inode);
+       if (rc)
+               goto out;
+
+       lov_objids = e2fsck_allocate_memory(ctx,
+                                           sizeof(*lov_objids) * LOV_MAX_OSTS,
+                                           "lov_objids array");
+       if (lov_objids == NULL) {
+               rc = ENOMEM;
+               goto out;
+       }
+
+       rc = ext2fs_file_open(fs, inode, 0, &e2_file);
+       if (rc)
+               goto out;
+
+       rc = ext2fs_file_read(e2_file, lov_objids,
+                             sizeof(*lov_objids) * LOV_MAX_OSTS, &got);
+       if (rc) {
+               /* Keep the read error: it must not be masked by the result
+                * of the close, and 'got' cannot be trusted after it. */
+               ext2fs_file_close(e2_file);
+               goto out;
+       }
+       rc = ext2fs_file_close(e2_file);
+
+       outdb->ost_count = got / sizeof(*lov_objids);
+       for (i = 0; i < outdb->ost_count; i++) {
+               VERBOSE(ctx,"MDS: ost_idx %d max_id "LPU64"\n",i,lov_objids[i]);
+               outdb->ofile_ctx[i].max_id = lov_objids[i];
+               outdb->ofile_ctx[i].have_max_id = 1;
+               outdb->have_ost_count = 1;
+       }
+
+out:
+       ext2fs_free_mem(&block_buf);
+       if (lov_objids)
+               ext2fs_free_mem(&lov_objids);
+       if (rc)
+               VERBOSE(ctx, "MDS: unable to read lov_objids: rc %d\n", rc);
+       else
+               VERBOSE(ctx, "MDS: got %d bytes = %d entries in lov_objids\n",
+                       got, outdb->ost_count);
+       return rc;
+}
+
+/*
+ * Write the MDS header record (magic, flags, in-use inode count, number of
+ * OSTs and the per-OST maximum object id) into a fresh "<mdsdb>.mdshdr"
+ * database.
+ *
+ * Any previous header file is removed first.  On every failure
+ * E2F_FLAG_ABORT is set in ctx->flags.
+ *
+ * Returns 0 on success, -EINVAL if the old file cannot be removed, or the
+ * database error from opening or writing the header.
+ */
+static int lfsck_write_mds_hdrinfo(e2fsck_t ctx, struct lfsck_outdb_info *outdb)
+{
+       struct lfsck_mds_hdr mds_hdr;
+       ext2_filsys fs = ctx->fs;
+       char *mds_hdrname;
+       DB *mds_hdrdb = NULL;
+       DBT key, data;
+       int rc = 0;
+       int i;
+
+       mds_hdrname = e2fsck_allocate_memory(ctx, PATH_MAX,
+                                            "mds_hdr filename");
+       /* Bounded formatting: the database path comes from the command line */
+       snprintf(mds_hdrname, PATH_MAX, "%s.mdshdr", ctx->lustre_mdsdb);
+
+       if (unlink(mds_hdrname) && errno != ENOENT) {
+               fprintf(stderr, "Failure to remove old db file %s\n",
+                       mds_hdrname);
+               ctx->flags |= E2F_FLAG_ABORT;
+               rc = -EINVAL;
+               goto out_free;
+       }
+
+       rc = lfsck_opendb(mds_hdrname, MDS_HDR, &mds_hdrdb, 0, 0, 0);
+       if (rc != 0) {
+               fprintf(stderr, "failure to open database for mdsdhr "
+                       "info%s: %s\n", MDS_HDR, db_strerror(rc));
+               ctx->flags |= E2F_FLAG_ABORT;
+               goto out_free;
+       }
+
+       /* read in e2fsck_lfsck_save_ea() already if we opened read/write */
+       if (ctx->lfsck_oinfo->ost_count == 0)
+               e2fsck_get_lov_objids(ctx, ctx->lfsck_oinfo);
+
+       memset(&mds_hdr, 0, sizeof(mds_hdr));
+       mds_hdr.mds_magic = MDS_MAGIC;
+       mds_hdr.mds_flags = ctx->options & E2F_OPT_READONLY;
+       mds_hdr.mds_max_files = fs->super->s_inodes_count -
+                           fs->super->s_free_inodes_count;
+       VERBOSE(ctx, "MDS: max_files = "LPU64"\n", mds_hdr.mds_max_files);
+       mds_hdr.mds_num_osts = ctx->lfsck_oinfo->ost_count;
+       VERBOSE(ctx, "MDS: num_osts = %u\n", mds_hdr.mds_num_osts);
+       for (i = 0; i < mds_hdr.mds_num_osts; i++) {
+               mds_hdr.mds_max_ost_id[i] =
+                       ctx->lfsck_oinfo->ofile_ctx[i].max_id;
+       }
+
+       /* The key aliases the magic field inside the record, so it is
+        * stored in the same converted form as the record itself. */
+       memset(&key, 0, sizeof(key));
+       memset(&data, 0, sizeof(data));
+       key.data = &mds_hdr.mds_magic;
+       key.size = sizeof(mds_hdr.mds_magic);
+       cputole_mds_hdr(&mds_hdr);
+       data.data = &mds_hdr;
+       data.size = sizeof(mds_hdr);
+       rc = mds_hdrdb->put(mds_hdrdb, NULL, &key, &data, 0);
+       if (rc != 0) {
+               fprintf(stderr, "error: db put %s: %s\n", MDS_HDR,
+                       db_strerror(rc));
+               ctx->flags |= E2F_FLAG_ABORT;
+       }
+
+       mds_hdrdb->close(mds_hdrdb, 0);
+       if (rc == 0) {
+               printf("mds info db file written \n");
+               fflush(stdout);
+       }
+
+out_free:
+       /* Single exit so the filename buffer is released on every path,
+        * including the unlink failure that used to leak it. */
+       ext2fs_free_mem(&mds_hdrname);
+       return rc;
+}
+
+/*
+ * Record the striping information of one MDS inode in the lfsck databases.
+ *
+ * On the first call the old database file is removed and
+ * ctx->lfsck_oinfo (with its LOV_MAX_OSTS-entry ofile_ctx array) is
+ * allocated; when running read-only the lov_objids file is read and the
+ * MDS header database is written at that point too.
+ *
+ * For every stripe in the LOV EA one record, keyed by the OST object id,
+ * is stored in that OST's table.  The record carries the inode FID, taken
+ * from 'lma' when present and otherwise built from 'ino' and 'generation'.
+ * Stripes with an out-of-range OST index, or with an object id above a
+ * known maximum, are skipped.
+ *
+ * NOTE(review): lmm->lmm_stripe_count is trusted here; nothing in this
+ * function checks it against the size of the EA buffer - confirm that the
+ * caller validates it.
+ *
+ * Returns 0 on success.  On failure E2F_FLAG_ABORT is set in ctx->flags and
+ * an errno-style value is returned.
+ */
+int e2fsck_lfsck_save_ea(e2fsck_t ctx, ext2_ino_t ino, __u32 generation,
+                        struct lov_user_md *lmm, struct lustre_mdt_attrs *lma)
+{
+       ext2_filsys fs = ctx->fs;
+#ifdef LFSCK_CHECK_SIZE
+       struct lfsck_mds_szinfo szinfo;
+#endif /* LFSCK_CHECK_SIZE */
+       struct lov_user_ost_data_v1 *loi;
+       int rc, i;
+       DBT key, data;
+       DB *dbp;
+       __u32 numfiles = fs->super->s_inodes_count -
+                        fs->super->s_free_inodes_count;
+
+       if (!ctx->lfsck_oinfo) {
+               /* remove old db file */
+               if (unlink(ctx->lustre_mdsdb)) {
+                       rc = errno;
+                       if (rc != ENOENT) {
+                               fprintf(stderr,"Error removing old db %s: %s\n",
+                                       ctx->lustre_mdsdb, strerror(rc));
+                               ctx->flags |= E2F_FLAG_ABORT;
+                               return rc;
+                       }
+               }
+
+               rc = ext2fs_get_mem(sizeof(struct lfsck_outdb_info),
+                                   &ctx->lfsck_oinfo);
+               if (rc) {
+                       ctx->lfsck_oinfo = NULL;
+                       ctx->flags |= E2F_FLAG_ABORT;
+                       return rc;
+               }
+               memset(ctx->lfsck_oinfo, 0, sizeof(struct lfsck_outdb_info));
+               rc = ext2fs_get_mem(sizeof(struct lfsck_ofile_ctx)*LOV_MAX_OSTS,
+                                   &ctx->lfsck_oinfo->ofile_ctx);
+               if (rc) {
+                       ext2fs_free_mem(&ctx->lfsck_oinfo);
+                       ctx->flags |= E2F_FLAG_ABORT;
+                       return rc;
+               }
+               memset(ctx->lfsck_oinfo->ofile_ctx, 0,
+                      sizeof(struct lfsck_ofile_ctx) * LOV_MAX_OSTS);
+#ifdef LFSCK_CHECK_SIZE
+               if (lfsck_opendb(ctx->lustre_mdsdb, MDS_SIZEINFO,
+                                &ctx->lfsck_oinfo->mds_sizeinfo_dbp, 0,
+                                sizeof(szinfo.mds_fid) + sizeof(szinfo),
+                                numfiles)) {
+                       fprintf(stderr, "Failed to open db file %s\n",
+                               MDS_SIZEINFO);
+                       /* Do not leave half-initialised state behind: a
+                        * later call would otherwise use the size-info
+                        * database handle that was never opened. */
+                       e2fsck_lfsck_cleanupdb(ctx);
+                       ctx->flags |= E2F_FLAG_ABORT;
+                       return EIO;
+               }
+#endif /* LFSCK_CHECK_SIZE */
+
+               if (ctx->options & E2F_OPT_READONLY) {
+                       e2fsck_get_lov_objids(ctx, ctx->lfsck_oinfo);
+                       lfsck_write_mds_hdrinfo(ctx, ctx->lfsck_oinfo);
+               }
+       }
+       if (lmm->lmm_magic == LOV_USER_MAGIC_V3)
+               loi = ((struct lov_user_md_v3 *)lmm)->lmm_objects;
+       else /* if (lmm->lmm_magic == LOV_USER_MAGIC_V1) */
+               loi = lmm->lmm_objects;
+
+#ifdef LFSCK_CHECK_SIZE
+       /* XXX: We don't save the layout type here.  This doesn't matter for
+        *      now, we don't really need the pool information for lfsck, but
+        *      in the future we may need it for RAID-1 and other layouts. */
+       memset(&szinfo, 0, sizeof(szinfo));
+       szinfo.mds_fid = ino;
+       szinfo.mds_seq = lmm->lmm_object_seq;
+       szinfo.mds_stripe_size = lmm->lmm_stripe_size;
+       szinfo.mds_stripe_start = loi->l_ost_idx;
+       szinfo.mds_calc_size = 0;
+       szinfo.mds_stripe_pattern = lmm->lmm_pattern;
+
+       memset(&key, 0, sizeof(key));
+       memset(&data, 0, sizeof(data));
+       key.data = &szinfo.mds_fid;
+       key.size = sizeof(szinfo.mds_fid);
+       cputole_mds_szinfo(&szinfo);
+       data.data = &szinfo;
+       data.size = sizeof(szinfo);
+       dbp = ctx->lfsck_oinfo->mds_sizeinfo_dbp;
+
+       rc = dbp->put(dbp, NULL, &key, &data, 0);
+       if (rc != 0) {
+               dbp->err(ctx->lfsck_oinfo->mds_sizeinfo_dbp, rc,
+                        "db->put failed\n");
+               e2fsck_lfsck_cleanupdb(ctx);
+               ctx->flags |= E2F_FLAG_ABORT;
+
+               return EIO;
+       }
+#endif /* LFSCK_CHECK_SIZE */
+       for (i = 0; i < lmm->lmm_stripe_count; i++, loi++) {
+               int ost_idx = loi->l_ost_idx;
+               struct lfsck_mds_objent mds_ent;
+               struct lfsck_ofile_ctx *ofile_ctx;
+
+               /* The index comes straight from disk.  A very large value
+                * turns negative once stored in an int, so both ends of
+                * the range must be checked before it is used to index
+                * the LOV_MAX_OSTS-entry ofile_ctx array. */
+               if (ost_idx < 0 || ost_idx >= LOV_MAX_OSTS) {
+                       fprintf(stderr, "invalid OST index %u ino %u[%d]\n",
+                               ost_idx, ino, i);
+                       continue;
+               }
+               ofile_ctx = &ctx->lfsck_oinfo->ofile_ctx[ost_idx];
+
+               if (ost_idx + 1 > ctx->lfsck_oinfo->ost_count) {
+                       if (ctx->lfsck_oinfo->have_ost_count) {
+                               fprintf(stderr, "bad OST index %u ino %u[%d]\n",
+                                       ost_idx, ino, i);
+                               continue;
+                       }
+                       ctx->lfsck_oinfo->ost_count = ost_idx + 1;
+               }
+
+               /* Open this OST's table lazily, on first use */
+               if (ofile_ctx->dbp == NULL) {
+                       char dbname[256];
+
+                       snprintf(dbname, sizeof(dbname), "%s.%d", MDS_OSTDB,
+                                ost_idx);
+                       rc = lfsck_opendb(ctx->lustre_mdsdb, dbname,
+                                         &ofile_ctx->dbp, 1,
+                                         sizeof(mds_ent), numfiles);
+                       if (rc) {
+                               e2fsck_lfsck_cleanupdb(ctx);
+                               ctx->flags |= E2F_FLAG_ABORT;
+                               return EIO;
+                       }
+               }
+
+               memset(&mds_ent, 0, sizeof(mds_ent));
+               if (lma) {
+                       mds_ent.mds_fid = lma->lma_self_fid;
+               } else {
+                       mds_ent.mds_fid.f_seq = ino;
+                       mds_ent.mds_fid.f_oid = generation;
+                       mds_ent.mds_fid.f_ver = 0;
+               }
+               mds_ent.mds_oi = *(struct ost_id *)&loi->l_object_id;
+               mds_ent.mds_ostidx = ost_idx;
+               mds_ent.mds_ostoffset = i;
+
+               /* Either honour a known per-OST maximum or keep track of
+                * the largest id seen so far. */
+               if (mds_ent.mds_oi.oi_id > ofile_ctx->max_id) {
+                       if (ofile_ctx->have_max_id) {
+                               DEBUG(ctx,
+                                     "[%d] skip obj "LPU64" > max "LPU64"\n",
+                                     ost_idx, mds_ent.mds_oi.oi_id,
+                                     ofile_ctx->max_id);
+                               continue;
+                       }
+                       ofile_ctx->max_id = mds_ent.mds_oi.oi_id;
+               }
+
+               memset(&key, 0, sizeof(key));
+               memset(&data, 0, sizeof(data));
+               key.data = &mds_ent.mds_oi;
+               key.size = sizeof(mds_ent.mds_oi);
+               cputole_mds_objent(&mds_ent);
+               data.data = &mds_ent;
+               data.size = sizeof(mds_ent);
+               dbp = ofile_ctx->dbp;
+#if 0
+               DEBUG(ctx, "OST[%u]: inode FID "DFID" oi "DOIF"\n", ost_idx,
+                     PFID(&mds_ent.mds_fid), POIF(&mds_ent.mds_oi));
+#endif
+               rc = dbp->put(dbp, NULL, &key, &data, 0);
+               if (rc != 0) {
+                       dbp->err(dbp, rc, "db->put failed\n");
+                       e2fsck_lfsck_cleanupdb(ctx);
+                       ctx->flags |= E2F_FLAG_ABORT;
+                       /* XXX - Free lctx memory */
+                       return EIO;
+               }
+       }
+       return 0;
+}
+
+/*
+ * Sanity-check a LOV EA: only the v1 and v3 layouts are understood.
+ * Returns 0 for those two magics and -EINVAL (with a verbose message) for
+ * anything else.
+ */
+static int lfsck_check_lov_ea(e2fsck_t ctx, struct lov_user_md *lmm)
+{
+       if (lmm->lmm_magic == LOV_USER_MAGIC_V1 ||
+           lmm->lmm_magic == LOV_USER_MAGIC_V3)
+               return 0;
+
+       VERBOSE(ctx, "error: only handle v1/v3 LOV EAs, not %08x\n",
+               lmm->lmm_magic);
+       return -EINVAL;
+}
+
+/*
+ * Placeholder validation hook for the LMA EA: no checks are performed and
+ * every EA is accepted (always returns 0).
+ */
+static int lfsck_check_lma_ea(e2fsck_t ctx, struct lustre_mdt_attrs *lma)
+{
+       return 0;
+}
+
+/*
+ * e2fsck pass1 has found a file with an EA let's save the information in
+ * the correct table(s).  This is only called for an MDS search.
+ *
+ * Looks at one extended attribute entry of a regular file and, if it is the
+ * Lustre LOV or LMA attribute, hands a private copy of its value back
+ * through *lmm or *lma.  The copy is owned by the caller, who releases it
+ * with ext2fs_free_mem().  'value' points at the in-block value and is only
+ * used when the value is not stored in a separate inode.
+ *
+ * Returns 0 when the entry was consumed or is of no interest (including a
+ * value that could not be read), and -EINVAL when the same attribute is
+ * seen twice or fails validation (the latter also sets E2F_FLAG_ABORT).
+ */
+int e2fsck_lfsck_find_ea(e2fsck_t ctx, struct ext2_inode_large *inode,
+                        struct ext2_ext_attr_entry *entry, void *value,
+                        struct lov_user_md **lmm,
+                        struct lustre_mdt_attrs **lma)
+{
+       void *ea = NULL;
+       int retval;
+       unsigned int got;
+       ext2_file_t file;
+
+       /* This ensures that we don't open the file here if traversing an OST */
+       if ((ctx->lustre_devtype & LUSTRE_TYPE) != LUSTRE_MDS)
+               return 0;
+
+       if (!LINUX_S_ISREG(inode->i_mode))
+               return 0;
+
+       if (entry->e_name_len == 0)
+               return 0;
+
+       /* Only the trusted and lustre namespaces are of interest.  The test
+        * has to be "neither of them": requiring the index to equal both
+        * values at once can never be true, so nothing was filtered out. */
+       if (entry->e_name_index != EXT3_XATTR_INDEX_TRUSTED &&
+           entry->e_name_index != EXT3_XATTR_INDEX_LUSTRE)
+               return 0;
+
+       ea = e2fsck_allocate_memory(ctx, entry->e_value_size, "EA");
+
+       if (entry->e_value_inum != 0) {
+               /* EA in external inode */
+               retval = ext2fs_file_open(ctx->fs, entry->e_value_inum,
+                                         0, &file);
+               if (!retval) {
+                       retval = ext2fs_file_read(file, ea,
+                                                 entry->e_value_size, &got);
+                       ext2fs_file_close(file);
+                       if (retval != 0) {
+                               ext2fs_free_mem(&ea);
+                               return 0;
+                       }
+               } else {
+                       ext2fs_free_mem(&ea);
+                       return 0;
+               }
+       } else {
+               memcpy(ea, value, entry->e_value_size);
+       }
+
+       if (entry->e_name_len == strlen(XATTR_LUSTRE_MDS_LOV_EA) &&
+           strncmp(entry->e_name, XATTR_LUSTRE_MDS_LOV_EA,
+                   entry->e_name_len) == 0) {
+               /* A second LOV EA on the same inode is an error */
+               if (*lmm) {
+                       ext2fs_free_mem(&ea);
+                       return -EINVAL;
+               }
+               *lmm = ea;
+               letocpu_lov_user_md(*lmm);
+
+               if (lfsck_check_lov_ea(ctx, *lmm)) {
+                       *lmm = NULL;
+                       ctx->flags |= E2F_FLAG_ABORT;
+                       ext2fs_free_mem(&ea);
+                       return -EINVAL;
+               }
+       } else if (entry->e_name_len == strlen(XATTR_LUSTRE_MDT_LMA_EA) &&
+                  strncmp(entry->e_name, XATTR_LUSTRE_MDT_LMA_EA,
+                          entry->e_name_len) == 0) {
+               /* A second LMA EA on the same inode is an error */
+               if (*lma) {
+                       ext2fs_free_mem(&ea);
+                       return -EINVAL;
+               }
+               *lma = ea;
+               if (lfsck_check_lma_ea(ctx, *lma)) {
+                       *lma = NULL;
+                       ctx->flags |= E2F_FLAG_ABORT;
+                       ext2fs_free_mem(&ea);
+                       return -EINVAL;
+               }
+       } else {
+               /* Some other attribute: the copy is not needed */
+               ext2fs_free_mem(&ea);
+       }
+
+       return 0;
+}
+
+/*
+ * Make sure that the MDS data is on file: close every per-OST database and
+ * the size-info database so their contents are written out.  Does nothing
+ * unless the device is an MDS and lfsck data has been collected.
+ *
+ * Returns the sum of the DB close() results; a non-zero sum also sets
+ * E2F_FLAG_ABORT in ctx->flags.
+ */
+int e2fsck_lfsck_flush_ea(e2fsck_t ctx)
+{
+       struct lfsck_outdb_info *oinfo;
+       int idx, err = 0;
+
+       if ((ctx->lustre_devtype & LUSTRE_TYPE) != LUSTRE_MDS)
+               return 0;
+
+       oinfo = ctx->lfsck_oinfo;
+       if (!oinfo)
+               return 0;
+
+       /* Without a per-OST array there is nothing to walk */
+       if (oinfo->ofile_ctx != NULL) {
+               for (idx = 0; idx < oinfo->ost_count; idx++) {
+                       DB *ostdb = oinfo->ofile_ctx[idx].dbp;
+
+                       if (ostdb == NULL)
+                               continue;
+                       err += ostdb->close(ostdb, 0);
+                       oinfo->ofile_ctx[idx].dbp = NULL;
+               }
+       }
+
+       if (oinfo->mds_sizeinfo_dbp != NULL) {
+               DB *szdb = oinfo->mds_sizeinfo_dbp;
+
+               err += szdb->close(szdb, 0);
+               oinfo->mds_sizeinfo_dbp = NULL;
+       }
+
+       if (err != 0)
+               ctx->flags |= E2F_FLAG_ABORT;
+
+       return err;
+}
+
+/*
+ * Block-iterator callback (from debugfs.c, used for file removal): mark
+ * the visited block as free in the allocation statistics.  Always returns 0
+ * so the iteration continues.
+ */
+static int lfsck_release_blocks_proc(ext2_filsys fs, blk_t *blocknr,
+                              int blockcnt, void *private)
+{
+       ext2fs_block_alloc_stats(fs, *blocknr, -1);
+       return 0;
+}
+
+/*
+ * Release an inode: stamp its deletion time, give back all of its blocks
+ * and finally drop the inode from the allocation statistics.  Gives up
+ * silently if the inode cannot be read or written back.
+ */
+static void lfsck_kill_file_by_inode(ext2_filsys fs, ext2_ino_t inode)
+{
+       struct ext2_inode victim;
+       int is_dir;
+
+       if (ext2fs_read_inode(fs, inode, &victim) != 0)
+               return;
+
+       victim.i_dtime = time(NULL);
+       if (ext2fs_write_inode(fs, inode, &victim) != 0)
+               return;
+
+       ext2fs_block_iterate(fs, inode, 0, NULL,
+                            lfsck_release_blocks_proc, NULL);
+
+       is_dir = LINUX_S_ISDIR(victim.i_mode);
+       ext2fs_inode_alloc_stats2(fs, inode, -1, is_dir);
+}
+
+/*
+ * remove a file. Currently this removes the lov_objids file
+ * since otherwise the automatic deletion of precreated objects on
+ * mds/ost connection could potentially remove objects with
+ * data - this would be especially the case if the mds has been
+ * restored from backup
+ *
+ * Looks 'name' up in directory 'dir', drops one link from the inode,
+ * removes the directory entry and releases the inode once no links remain.
+ *
+ * Returns 0 on success or when the name does not exist, -EINVAL if the
+ * inode cannot be read or written, -EIO if the entry cannot be unlinked.
+ */
+static int lfsck_rm_file(e2fsck_t ctx, ext2_ino_t dir, char *name)
+{
+       ext2_filsys fs = ctx->fs;
+       ext2_ino_t ino;
+       struct ext2_inode inode;
+       int rc;
+
+       rc = ext2fs_lookup(fs, dir, name, strlen(name),
+                          NULL, &ino);
+       if (rc)
+               return 0;
+
+       if (ext2fs_read_inode(fs, ino, &inode))
+               return -EINVAL;
+
+       /* Never decrement a link count that is already 0: it would wrap
+        * around to a huge value and the inode would never be released. */
+       if (inode.i_links_count > 0)
+               --inode.i_links_count;
+
+       if (ext2fs_write_inode(fs, ino, &inode))
+               return -EINVAL;
+
+       if (ext2fs_unlink(fs, dir, name, ino, 0))
+               return -EIO;
+
+       if (inode.i_links_count == 0)
+               lfsck_kill_file_by_inode(fs, ino);
+
+       return 0;
+}
+
+/*
+ * called for each ost object - save the object id and size
+ *
+ * Directory-iterator callback.  Entries that are directories are ignored.
+ * For everything else the entry name is parsed as a decimal object id and
+ * a record holding the id, the size and the allocated bytes is stored in
+ * lctx->outdb, keyed by the object id.  lctx->max_objid and lctx->numfiles
+ * are updated along the way.
+ *
+ * Returns 0 to continue the iteration.  On failure it sets lctx->status and
+ * E2F_FLAG_ABORT and returns DIRENT_ABORT.
+ */
+static int lfsck_list_objs(ext2_ino_t dir, int entry,
+                          struct ext2_dir_entry *dirent, int offset,
+                          int blocksize, char *buf, void *priv_data)
+{
+       struct lfsck_ost_ctx *lctx = priv_data;
+       struct lfsck_ost_objent objent;
+       struct ext2_inode inode;
+       DBT key, data;
+       DB *dbp;
+       char name[32]; /* same as filter_fid2dentry() */
+       unsigned int namelen;
+
+       if (!ext2fs_check_directory(lctx->ctx->fs, dirent->inode))
+               return 0;
+
+       /* A directory entry name can be up to 255 bytes long, far more than
+        * name[] holds; never copy more than fits, and keep the trailing
+        * NUL that the number parser relies on. */
+       namelen = dirent->name_len & 0xFF;
+       if (namelen > sizeof(name) - 1)
+               namelen = sizeof(name) - 1;
+       memset(name, 0, sizeof(name));
+       strncpy(name, dirent->name, namelen);
+       memset(&objent, 0, sizeof(objent));
+       objent.ost_oi.oi_id = STRTOUL(name, NULL, 10);
+       if (objent.ost_oi.oi_id == STRTOUL_MAX) {
+               lctx->status = 1;
+               lctx->ctx->flags |= E2F_FLAG_ABORT;
+               return DIRENT_ABORT;
+       }
+
+       if (ext2fs_read_inode(lctx->ctx->fs, dirent->inode, &inode)) {
+               lctx->status = 1;
+               lctx->ctx->flags |= E2F_FLAG_ABORT;
+               return DIRENT_ABORT;
+       }
+
+       if (LINUX_S_ISREG(inode.i_mode))
+               objent.ost_size = EXT2_I_SIZE(&inode);
+       else
+               objent.ost_size = inode.i_size;
+       objent.ost_bytes = (__u64)inode.i_blocks * 512;
+
+       if (objent.ost_oi.oi_id > lctx->max_objid)
+               lctx->max_objid = objent.ost_oi.oi_id;
+
+       memset(&key, 0, sizeof(key));
+       memset(&data, 0, sizeof(data));
+       key.data = &objent.ost_oi;
+       key.size = sizeof(objent.ost_oi);
+       cputole_ost_objent(&objent);
+       data.data = &objent;
+       data.size = sizeof(objent);
+       dbp = lctx->outdb;
+       if (dbp->put(dbp, NULL, &key, &data, 0) != 0) {
+               fprintf(stderr, "Failure to put data into db\n");
+               /* Flag the failure like the other error paths do, so the
+                * outer directory walk stops as well. */
+               lctx->status = 1;
+               lctx->ctx->flags |= E2F_FLAG_ABORT;
+               return DIRENT_ABORT;
+       }
+
+       lctx->numfiles++;
+       return 0;
+}
+
+/*
+ * For each file on the mds save the fid and the containing directory
+ *
+ * Directory-iterator callback.  Entries pointing at the current directory
+ * or its parent, and entries that are neither directories nor regular
+ * files, are ignored.  For the rest a record pairing the entry's FID with
+ * the FID of the containing directory (lctx->dotfid) is stored in
+ * lctx->outdb; an already existing key is not treated as an error.
+ * Sub-directories are walked recursively.
+ *
+ * Returns 0 to continue the iteration, DIRENT_ABORT when storing a record
+ * or walking a sub-directory fails.
+ */
+static int lfsck_mds_dirs(ext2_ino_t dir, int entry,
+                         struct ext2_dir_entry *de, int offset,
+                         int blocksize, char *buf, void *priv_data)
+{
+       struct ext2_dir_entry_2 *dirent = (struct ext2_dir_entry_2 *)de;
+       struct lfsck_mds_ctx *lctx = priv_data;
+       struct lfsck_mds_ctx lctx2;
+       struct lfsck_mds_dirent mds_dirent;
+       DBT key, data;
+       DB *dbp = lctx->outdb;
+       int file_type;
+       int rc = 0;
+
+       /* only the low 4 bits are used to specify file type */
+       file_type = dirent->file_type & 15;
+
+       DEBUG(lctx->ctx, "MDT: inode %u, file %.*s, type %u\n",
+             dirent->inode, dirent->name_len, dirent->name,
+             file_type);
+       if (dirent->inode == lctx->dot || dirent->inode == lctx->dotdot)
+               return 0;
+
+       if (file_type != EXT2_FT_DIR && file_type != EXT2_FT_REG_FILE)
+               return 0;
+
+       lctx->numfiles++;
+
+       /* Start from a zeroed record, like the other database records in
+        * this file, so no indeterminate bytes are written out. */
+       memset(&mds_dirent, 0, sizeof(mds_dirent));
+       rc = lfsck_get_fid(lctx->ctx->fs, dirent->inode, &mds_dirent.mds_fid);
+       if (rc != 0)
+               return 0;
+
+       /* Fill in the parent FID before it is printed below */
+       mds_dirent.mds_dirfid = lctx->dotfid;
+
+       DEBUG(lctx->ctx, "MDT: dirfid "DFID" child "DFID" file %.*s\n",
+             PFID(&mds_dirent.mds_dirfid), PFID(&mds_dirent.mds_fid),
+             dirent->name_len, dirent->name);
+
+       cputole_mds_dirent(&mds_dirent);
+
+       memset(&key, 0, sizeof(key));
+       memset(&data, 0, sizeof(data));
+       key.data = &mds_dirent.mds_fid;
+       key.size = sizeof(mds_dirent.mds_fid);
+       data.data = &mds_dirent;
+       data.size = sizeof(mds_dirent);
+
+       rc = dbp->put(dbp, NULL, &key, &data, 0);
+       if (rc != 0) {
+               if (rc != DB_KEYEXIST) {
+                       fprintf(stderr,
+                               "error adding MDS inode %.*s (inum %u): %s\n",
+                               dirent->name_len & 0xFF, dirent->name,
+                               dirent->inode, db_strerror(rc));
+                       lctx->ctx->flags |= E2F_FLAG_ABORT;
+                       return DIRENT_ABORT;
+               }
+       }
+       if (file_type == EXT2_FT_DIR) {
+               /* Recurse with a copy of the context that describes the
+                * sub-directory, then carry its file count back. */
+               lctx2 = *lctx;
+               lctx2.dot = dirent->inode;
+               lctx2.dotdot = lctx->dot;
+               lctx2.dotfid = mds_dirent.mds_fid;
+               if (ext2fs_dir_iterate2(lctx->ctx->fs, dirent->inode, 0, NULL,
+                                       lfsck_mds_dirs, &lctx2)) {
+                       return DIRENT_ABORT;
+               }
+               lctx->numfiles = lctx2.numfiles;
+       }
+       return 0;
+}
+
+/*
+ * For each directory get the objects and save the data.
+ *
+ * Directory-iterator callback: only entries that are directories and whose
+ * name starts with 'd' are descended into, with lfsck_list_objs() doing the
+ * per-object work.  Returns DIRENT_ABORT once the shared context reports a
+ * failure, 0 otherwise.
+ */
+static int lfsck_iterate_obj_dirs(ext2_ino_t dir, int entry,
+                                 struct ext2_dir_entry *dirent, int offset,
+                                 int blocksize, char *buf, void *priv_data)
+{
+       struct lfsck_ost_ctx *ost = priv_data;
+       ext2_filsys fs = ost->ctx->fs;
+
+       /* Skip anything that is not one of the d* directories */
+       if (ext2fs_check_directory(fs, dirent->inode) != 0 ||
+           dirent->name[0] != 'd')
+               return 0;
+
+       ext2fs_dir_iterate2(fs, dirent->inode, 0, NULL,
+                           lfsck_list_objs, priv_data);
+
+       return ost->status != 0 ? DIRENT_ABORT : 0;
+}
+
+/*
+ * Get the starting point of where the objects reside.
+ *
+ * Resolves OBJECT_DIR below the root directory and then its
+ * OBJECT_DIR_V1 child (falling back to OBJECT_DIR_V2), storing the result
+ * in *inode.  Both levels must be directories.
+ *
+ * Returns 0 on success.  Failures at the first level return ENOENT,
+ * failures at the second level return -ENOENT; the differing signs are
+ * kept exactly as callers have always seen them.
+ */
+static int lfsck_get_object_dir(e2fsck_t ctx, char *block_buf,ext2_ino_t *inode)
+{
+       ext2_filsys fs = ctx->fs;
+       ext2_ino_t  parent;
+
+       if (ext2fs_lookup(fs, EXT2_ROOT_INO, OBJECT_DIR, strlen(OBJECT_DIR),
+                         block_buf, &parent)) {
+               fprintf(stderr, "error looking up OST object parent dir\n");
+               return ENOENT;
+       }
+       if (ext2fs_check_directory(fs, parent))
+               return ENOENT;
+
+       /* The second lookup only runs when the first one failed */
+       if (ext2fs_lookup(fs, parent, OBJECT_DIR_V1, strlen(OBJECT_DIR_V1),
+                         block_buf, inode) &&
+           ext2fs_lookup(fs, parent, OBJECT_DIR_V2, strlen(OBJECT_DIR_V2),
+                         block_buf, inode)) {
+               fprintf(stderr, "error looking up OST object subdir\n");
+               return -ENOENT;
+       }
+       if (ext2fs_check_directory(fs, *inode))
+               return -ENOENT;
+
+       return 0;
+}
+
+/*
+ * What is the last object id for the OST.
+ *
+ * Reads the 64-bit little-endian value stored in the LAST_ID file of the
+ * object directory into *last_id, converted to CPU byte order.
+ *
+ * Returns 0 on success, EIO on a short read, or the error of the failing
+ * lookup, open, read or close.
+ */
+static int lfsck_get_last_id(e2fsck_t ctx, __u64 *last_id)
+{
+       ext2_filsys fs = ctx->fs;
+       ext2_ino_t  objdir, idfile;
+       ext2_file_t  handle;
+       char *lookup_buf;
+       unsigned int nread;
+       int err;
+
+       lookup_buf = e2fsck_allocate_memory(ctx, fs->blocksize * 3,
+                                           "lookup buffer");
+
+       err = lfsck_get_object_dir(ctx, lookup_buf, &objdir);
+       if (err == 0)
+               err = ext2fs_lookup(fs, objdir, LAST_ID,
+                                   strlen(LAST_ID), lookup_buf, &idfile);
+       if (err == 0)
+               err = ext2fs_file_open(fs, idfile, 0, &handle);
+       if (err != 0)
+               goto done;
+
+       err = ext2fs_file_read(handle, last_id, sizeof(__u64), &nread);
+       if (err == 0 && nread != sizeof(__u64))
+               err = EIO;      /* short read */
+       if (err != 0) {
+               ext2fs_file_close(handle);
+               goto done;
+       }
+
+       err = ext2fs_file_close(handle);
+
+       *last_id = ext2fs_le64_to_cpu(*last_id);
+done:
+       ext2fs_free_mem(&lookup_buf);
+       return err;
+}
+
+/*
+ * Store 'last_id', converted to little-endian, in the LAST_ID file of the
+ * object directory.
+ *
+ * Returns 0 on success, EIO on a short write, or the error of the failing
+ * lookup, open, write or close.
+ */
+int lfsck_set_last_id(e2fsck_t ctx,  __u64 last_id)
+{
+       ext2_filsys fs = ctx->fs;
+       ext2_ino_t  objdir, idfile;
+       ext2_file_t  handle;
+       char *lookup_buf;
+       unsigned int nwritten;
+       int err;
+
+       lookup_buf = e2fsck_allocate_memory(ctx, fs->blocksize * 3,
+                                           "lookup buffer");
+
+       err = lfsck_get_object_dir(ctx, lookup_buf, &objdir);
+       if (err == 0)
+               err = ext2fs_lookup(fs, objdir, LAST_ID,
+                                   strlen(LAST_ID), lookup_buf, &idfile);
+       if (err == 0)
+               err = ext2fs_file_open(fs, idfile, EXT2_FILE_WRITE, &handle);
+       if (err != 0)
+               goto done;
+
+       last_id = ext2fs_cpu_to_le64(last_id);
+
+       err = ext2fs_file_write(handle, &last_id, sizeof(__u64), &nwritten);
+       if (err == 0 && nwritten != sizeof(__u64))
+               err = EIO;      /* short write */
+       if (err != 0) {
+               fprintf(stderr, "Failure to update last id on file\n");
+               ext2fs_file_close(handle);
+               goto done;
+       }
+
+       err = ext2fs_file_close(handle);
+
+done:
+       ext2fs_free_mem(&lookup_buf);
+       return err;
+}
+
+/*
+ * Read the lustre_server_data header from the LAST_RCVD file in the root
+ * directory and hand selected fields back to the caller.
+ *
+ * Every output pointer may be NULL, in which case that field is skipped.
+ * @index is taken from lsd_ost_index when the OST compat/incompat flag is
+ * set, from lsd_mdt_index when the MDT flag is set, and is -1 otherwise.
+ *
+ * Returns 0 on success, EIO on a short read, ENOMEM if the header buffer
+ * could not be allocated, or the error of the failing libext2fs call.
+ * Once LAST_RCVD has been opened it is closed on every path.
+ */
+int e2fsck_get_last_rcvd_info(e2fsck_t ctx, struct obd_uuid *local_uuid,
+                             struct obd_uuid *peer_uuid, __u32 *subdircount,
+                             __u32 *index, __u32 *compat, __u32 *rocompat,
+                             __u32 *incompat)
+{
+       ext2_filsys fs = ctx->fs;
+       ext2_ino_t inode;
+       ext2_file_t e2_file;
+       struct lustre_server_data *lsd = NULL;
+       unsigned int got;
+       char *block_buf;
+       __u32 cmp, inc;
+       int rc = 0;
+       int close_rc;
+
+       block_buf = e2fsck_allocate_memory(ctx, fs->blocksize * 3,
+                                          "block iterate buffer");
+
+       rc = ext2fs_lookup(fs, EXT2_ROOT_INO, LAST_RCVD, strlen(LAST_RCVD),
+                          block_buf, &inode);
+       if (rc)
+               goto out;
+
+       rc = ext2fs_file_open(fs, inode, 0, &e2_file);
+       if (rc)
+               goto out;
+
+       /* From here on the file is open: leave through out_close. */
+       lsd = e2fsck_allocate_memory(ctx, sizeof(*lsd), "lustre server data");
+       if (lsd == NULL) {
+               rc = ENOMEM;
+               goto out_close;
+       }
+
+       rc = ext2fs_file_read(e2_file, lsd, sizeof(*lsd), &got);
+       if (rc)
+               goto out_close;
+       if (got != sizeof(*lsd)) {
+               rc = EIO;
+               goto out_close;
+       }
+
+       if (local_uuid)
+               memcpy(local_uuid, &lsd->lsd_uuid, sizeof(lsd->lsd_uuid));
+
+       if (peer_uuid)
+               memcpy(peer_uuid, &lsd->lsd_peeruuid,sizeof(lsd->lsd_peeruuid));
+
+       if (subdircount)
+               *subdircount = ext2fs_le16_to_cpu(lsd->lsd_subdir_count);
+
+       /* compat/incompat are needed below to pick the index, so fall back
+        * to local storage when the caller is not interested in them. */
+       if (compat == NULL)
+               compat = &cmp;
+       *compat = ext2fs_le32_to_cpu(lsd->lsd_feature_compat);
+       if (rocompat)
+               *rocompat = ext2fs_le32_to_cpu(lsd->lsd_feature_rocompat);
+       if (incompat == NULL)
+               incompat = &inc;
+       *incompat = ext2fs_le32_to_cpu(lsd->lsd_feature_incompat);
+       if (index) {
+               if (*compat & OBD_COMPAT_OST || *incompat & OBD_INCOMPAT_OST)
+                       *index = ext2fs_le32_to_cpu(lsd->lsd_ost_index);
+               else if (*compat & OBD_COMPAT_MDT||*incompat & OBD_INCOMPAT_MDT)
+                       *index = ext2fs_le32_to_cpu(lsd->lsd_mdt_index);
+               else
+                       *index = -1;
+       }
+
+out_close:
+       /* Always close the file, but never let the close status hide an
+        * earlier error. */
+       close_rc = ext2fs_file_close(e2_file);
+       if (rc == 0)
+               rc = close_rc;
+
+out:
+       ext2fs_free_mem(&block_buf);
+       if (lsd)
+               ext2fs_free_mem(&lsd);
+       return rc;
+}
+
+/*
+ * ext2fs_dir_iterate2() callback: remove one entry from the directory
+ * described by @priv_data (a struct lfsck_ost_ctx).
+ *
+ * Entries for which ext2fs_check_directory() succeeds (i.e. directories)
+ * and entries whose name starts with LAST_ID are left alone.  Returns
+ * DIRENT_ABORT if lfsck_rm_file() fails, 0 otherwise.
+ */
+int lfsck_rm_log(ext2_ino_t dir, int entry, struct ext2_dir_entry *dirent,
+                int offset, int blocksize, char *buf, void *priv_data)
+{
+       struct lfsck_ost_ctx *lctx = priv_data;
+       char name[EXT2_NAME_LEN + 1];
+       int name_len = dirent->name_len & 0xFF;
+
+       if (!ext2fs_check_directory(lctx->ctx->fs, dirent->inode))
+               return 0;
+
+       /* dirent->name is not NUL-terminated: terminate the copy right
+        * after the real name rather than at the end of the buffer, which
+        * would leave uninitialized stack bytes inside the string. */
+       strncpy(name, dirent->name, name_len);
+       name[name_len] = '\0';
+       if (strncmp(name, LAST_ID, strlen(LAST_ID)) == 0)
+               return 0;
+
+       if (lfsck_rm_file(lctx->ctx, lctx->dirinode, name))
+               return DIRENT_ABORT;
+
+       return 0;
+}
+
+/*
+ * Remove CATLIST from the root directory, then every removable entry of
+ * the LOG_DIR directory found under OBJECT_DIR.
+ *
+ * Not 100% sure that this is correct so not activated yet.
+ *
+ * On any failure E2F_FLAG_ABORT is set in ctx->flags and a negative errno
+ * value is returned; 0 is returned on success.
+ */
+int lfsck_remove_ost_logs(e2fsck_t ctx, char *block_buf)
+{
+       ext2_filsys fs = ctx->fs;
+       struct lfsck_ost_ctx lctx;
+       ext2_ino_t objdir_ino, logdir_ino;
+       int err;
+
+       if (lfsck_rm_file(ctx, EXT2_ROOT_INO, CATLIST)) {
+               err = -EINVAL;
+               goto abort;
+       }
+
+       /* OBJECT_DIR must exist in the root and be a directory ... */
+       if (ext2fs_lookup(fs, EXT2_ROOT_INO, OBJECT_DIR, strlen(OBJECT_DIR),
+                         block_buf, &objdir_ino) ||
+           ext2fs_check_directory(fs, objdir_ino)) {
+               err = -ENOENT;
+               goto abort;
+       }
+
+       /* ... and so must LOG_DIR inside it. */
+       if (ext2fs_lookup(fs, objdir_ino, LOG_DIR, strlen(LOG_DIR),
+                         block_buf, &logdir_ino) ||
+           ext2fs_check_directory(fs, logdir_ino)) {
+               err = -ENOENT;
+               goto abort;
+       }
+
+       lctx.ctx = ctx;
+       lctx.dirinode = logdir_ino;
+
+       if (ext2fs_dir_iterate2(fs, logdir_ino, 0, block_buf,
+                               lfsck_rm_log, &lctx)) {
+               err = -EIO;
+               goto abort;
+       }
+       return 0;
+
+abort:
+       ctx->flags |= E2F_FLAG_ABORT;
+       return err;
+}
+
+/*
+ * Remove the files in the PENDING directory.  This has to be done before
+ * the EAs are read from the blocks, but it needs the bitmaps loaded first:
+ * so load the bitmaps, remove the entries, write any changes back and then
+ * drop the in-memory inode and block bitmaps again.
+ *
+ * Returns 0 on success, -ENOENT if PENDING_DIR is missing or is not a
+ * directory, -EIO if iterating it failed.  E2F_FLAG_ABORT is set in
+ * ctx->flags on every failure.
+ */
+int e2fsck_lfsck_remove_pending(e2fsck_t ctx, char *block_buf)
+{
+       ext2_filsys fs = ctx->fs;
+       struct lfsck_ost_ctx lctx;
+       ext2_ino_t pending_ino;
+       int status = 0;
+
+       if (ext2fs_lookup(fs, EXT2_ROOT_INO, PENDING_DIR, strlen(PENDING_DIR),
+                         block_buf, &pending_ino) ||
+           ext2fs_check_directory(fs, pending_ino)) {
+               ctx->flags |= E2F_FLAG_ABORT;
+               return -ENOENT;
+       }
+
+       lctx.ctx = ctx;
+       lctx.dirinode = pending_ino;
+
+       e2fsck_read_bitmaps(ctx);
+
+       if (ext2fs_dir_iterate2(fs, pending_ino, 0, block_buf,
+                               lfsck_rm_log, &lctx)) {
+               ctx->flags |= E2F_FLAG_ABORT;
+               status = -EIO;
+       }
+
+       /* Flush and release the bitmaps whether or not the removal worked. */
+       e2fsck_write_bitmaps(ctx);
+       ext2fs_free_inode_bitmap(fs->inode_map);
+       fs->inode_map = NULL;
+       ext2fs_free_block_bitmap(fs->block_map);
+       fs->block_map = NULL;
+
+       return status;
+}
+
+/*
+ * Create an empty regular file named after @objid (printed with LPU64)
+ * in the "d<N>" sub-directory of the object directory, where
+ * N = objid & (subdircount - 1) and subdircount comes from LAST_RCVD.
+ *
+ * Partially using code from debugfs do_write().
+ *
+ * Refuses to run when LAST_RCVD carries an MDT compat/incompat flag or
+ * when an entry with that name already exists.  Returns 0 on success and
+ * EINVAL on any failure.
+ */
+int lfsck_create_objid(e2fsck_t ctx, __u64 objid)
+{
+       int rc = 0;
+       char dirname[32];
+       char name[32];
+       int dirlen;
+       __u32 compat, incompat, subdircount;
+       ext2_ino_t  inode, tinode, cinode;
+       struct ext2_inode ext2inode;
+       char *block_buf;
+
+       block_buf = e2fsck_allocate_memory(ctx, ctx->fs->blocksize * 3,
+                                          "lookup buffer");
+
+       memset(name, 0, 32);
+       memset(dirname, 0, 32);
+
+       sprintf(name, LPU64, objid);
+
+       fprintf(stderr, "creating %s\n", name);
+
+       rc = e2fsck_get_last_rcvd_info(ctx, NULL, NULL, &subdircount, NULL,
+                                      &compat, NULL, &incompat);
+       if (rc) {
+               fprintf(stderr, "Error: reading OST last_rcvd file\n");
+               rc = EINVAL;
+               goto out;
+       }
+
+       if (compat & OBD_COMPAT_MDT || incompat & OBD_INCOMPAT_MDT) {
+               fprintf(stderr, "Error: MDS last_rcvd file doing OST check\n");
+               rc = EINVAL;
+               goto out;
+       }
+
+       if (lfsck_get_object_dir(ctx, block_buf, &inode)) {
+               rc = EINVAL;
+               goto out;
+       }
+
+       /* NOTE(review): the mask only selects a valid sub-directory when
+        * subdircount is a power of two - confirm against the on-disk
+        * format. */
+       dirlen = sprintf(dirname, "d%u", (int)objid & (subdircount - 1));
+
+       rc = ext2fs_lookup(ctx->fs, inode, dirname,
+                          dirlen, block_buf, &tinode);
+       if (rc) {
+               rc = EINVAL;
+               goto out;
+       }
+
+       /* A successful lookup means the object already exists. */
+       if (ext2fs_namei(ctx->fs, EXT2_ROOT_INO, tinode, name, &cinode) == 0) {
+               fprintf(stderr, "Failure to create obj\n");
+               rc = EINVAL;
+               goto out;
+       }
+
+       rc = ext2fs_new_inode(ctx->fs, tinode, 010755, 0, &cinode);
+       if (rc) {
+               fprintf(stderr, "Failure to create obj\n");
+               rc = EINVAL;
+               goto out;
+       }
+
+       rc = ext2fs_link(ctx->fs, tinode, name, cinode, EXT2_FT_REG_FILE);
+       if (rc) {
+               fprintf(stderr, "Failure to create obj\n");
+               rc = EINVAL;
+               goto out;
+       }
+
+       if (ext2fs_test_inode_bitmap2(ctx->fs->inode_map, cinode))
+               fprintf(stderr, "Warning: inode already set\n");
+
+       ext2fs_inode_alloc_stats2(ctx->fs, cinode, +1, 0);
+       memset(&ext2inode, 0, sizeof(ext2inode));
+       ext2inode.i_mode = LINUX_S_IFREG;
+       ext2inode.i_atime = ext2inode.i_ctime = ext2inode.i_mtime = time(NULL);
+       ext2inode.i_links_count = 1;
+       ext2inode.i_size = 0;
+       if (ext2fs_write_inode(ctx->fs, cinode, &ext2inode)) {
+               fprintf(stderr, "Failure to create obj\n");
+               rc = EINVAL;
+               goto out;
+       }
+
+out:
+       ext2fs_free_mem((void *)&(block_buf));
+       return rc;
+}
+
+/*
+ * For on ost iterate for the direcories and save the object information.
+ */
+void e2fsck_pass6_ost(e2fsck_t ctx)
+{
+       ext2_filsys fs = ctx->fs;
+       struct lfsck_ost_ctx lctx;
+       struct lfsck_ost_hdr ost_hdr;
+       struct lfsck_mds_hdr mds_hdr;
+       struct lfsck_ost_objent objent;
+       DB *outdb = NULL;
+       DB *mds_hdrdb = NULL;
+       DB *osthdr = NULL;
+       DBT key, data;
+       ext2_ino_t dir;
+       __u32 compat, rocompat, incompat;
+       int i, rc;
+       char *block_buf = NULL;
+
+       if (unlink(ctx->lustre_ostdb)) {
+               if (errno != ENOENT) {
+                       fprintf(stderr, "Failure to remove old db file %s\n",
+                               ctx->lustre_ostdb);
+                       ctx->flags |= E2F_FLAG_ABORT;
+                       return;
+               }
+       }
+
+       block_buf = e2fsck_allocate_memory(ctx, fs->blocksize * 3,
+                                          "block iterate buffer");
+
+       rc = lfsck_opendb(ctx->lustre_mdsdb, MDS_HDR, &mds_hdrdb, 0, 0, 0);
+       if (rc != 0) {
+               fprintf(stderr, "failure to open database %s: %s\n",
+                       MDS_HDR, db_strerror(rc));
+               ctx->flags |= E2F_FLAG_ABORT;
+               goto out;
+       }
+
+       memset(&mds_hdr, 0, sizeof(mds_hdr));
+       mds_hdr.mds_magic = MDS_MAGIC;
+       memset(&key, 0, sizeof(key));
+       memset(&data, 0, sizeof(data));
+       key.data = &mds_hdr.mds_magic;
+       key.size = sizeof(mds_hdr.mds_magic);
+       data.data = &mds_hdr;
+       data.size = sizeof(mds_hdr);
+       data.ulen = sizeof(mds_hdr);
+       data.flags = DB_DBT_USERMEM;
+       rc = mds_hdrdb->get(mds_hdrdb, NULL, &key, &data, 0);
+       if (rc) {
+               fprintf(stderr,"error getting mds_hdr ("LPU64":%u) in %s: %s\n",
+                       mds_hdr.mds_magic, (int)sizeof(mds_hdr.mds_magic),
+                       ctx->lustre_mdsdb, db_strerror(rc));
+               ctx->flags |= E2F_FLAG_ABORT;
+               goto out;
+       }
+
+       memcpy(&mds_hdr, data.data, sizeof(mds_hdr));
+       letocpu_mds_hdr(&mds_hdr);
+
+       rc = lfsck_opendb(ctx->lustre_ostdb, OST_HDR, &osthdr, 0, 0, 0);
+       if (rc != 0) {
+               fprintf(stderr, "failure to open database %s: %s\n",
+                       OST_HDR, db_strerror(rc));
+               ctx->flags |= E2F_FLAG_ABORT;
+               goto out;
+       }
+
+       rc = lfsck_opendb(ctx->lustre_ostdb, OST_OSTDB, &outdb, 0,
+                         sizeof(objent.ost_oi) + sizeof(objent),
+                         fs->super->s_inodes_count -
+                         fs->super->s_free_inodes_count);
+       if (rc != 0) {
+               fprintf(stderr, "error getting ost_hdr in %s: %s\n",
+                       ctx->lustre_ostdb, db_strerror(rc));
+               ctx->flags |= E2F_FLAG_ABORT;
+               goto out;
+       }
+
+       memset(&ost_hdr, 0, sizeof(ost_hdr));
+       if (e2fsck_get_last_rcvd_info(ctx, &ost_hdr.ost_uuid,
+                                     &ost_hdr.ost_mds_uuid, NULL,
+                                     &ost_hdr.ost_index,
+                                     &compat, &rocompat, &incompat)) {
+               fprintf(stderr, "Failure to read OST last_rcvd file\n");
+               ctx->flags |= E2F_FLAG_ABORT;
+               goto out;
+       }
+
+       VERBOSE(ctx, "OST: '%s' ost idx %u: compat %#x rocomp %#x incomp %#x\n",
+               (char *)&ost_hdr.ost_uuid.uuid, ost_hdr.ost_index,
+               compat, rocompat, incompat);
+
+       if (compat & OBD_COMPAT_MDT) {
+               fprintf(stderr, "Found MDS last_rcvd file doing OST check\n");
+               ctx->flags |= E2F_FLAG_ABORT;
+               goto out;
+       }
+
+       /*
+        * Get /O/R or /O/0 directory
+        * for each entry scan all the dirents and get the object id
+        */
+       if (lfsck_get_object_dir(ctx, block_buf, &dir)) {
+               ctx->flags |= E2F_FLAG_ABORT;
+               goto out;
+       }
+
+       /*
+        * Okay so we have the containing directory so let's iterate over the
+        * containing d* dirs and then iterate again inside
+        */
+       lctx.ctx = ctx;
+       lctx.outdb = outdb;
+       lctx.status = 0;
+       lctx.numfiles = 0;
+       lctx.max_objid = 0;
+       lctx.status = ext2fs_dir_iterate2(fs, dir, 0, block_buf,
+                                         lfsck_iterate_obj_dirs, &lctx);
+       if (lctx.status) {
+               fprintf(stderr, "Failure in iterating object dirs\n");
+               ctx->flags |= E2F_FLAG_ABORT;
+               return;
+       }
+
+       ost_hdr.ost_magic = OST_MAGIC;
+       ost_hdr.ost_flags = ctx->options & E2F_OPT_READONLY;
+       ost_hdr.ost_num_files = lctx.numfiles;
+       VERBOSE(ctx, "OST: num files = %u\n", lctx.numfiles);
+
+       if (lfsck_get_last_id(ctx, &ost_hdr.ost_last_id)) {
+               fprintf(stderr, "Failure to get last id for objects\n");
+               ctx->flags |= E2F_FLAG_ABORT;
+               goto out;
+       }
+       VERBOSE(ctx, "OST: last_id = "LPU64"\n", ost_hdr.ost_last_id);
+
+       /* Update the last_id value on the OST if necessary/possible to the
+        * MDS value if larger.  Otherwise we risk creating duplicate objects.
+        * If running read-only, we skip this so new objects are ignored. */
+       ost_hdr.ost_last_id = lctx.max_objid;
+       if (!(ctx->options & E2F_OPT_READONLY) &&
+           !(mds_hdr.mds_flags & E2F_OPT_READONLY)) {
+               for (i = 0; i < mds_hdr.mds_num_osts; i++) {
+                       if (strcmp((char *)mds_hdr.mds_ost_info[i].uuid,
+                                  (char *)ost_hdr.ost_uuid.uuid) == 0 &&
+                           mds_hdr.mds_max_ost_id[i] >= ost_hdr.ost_last_id)
+                               ost_hdr.ost_last_id=mds_hdr.mds_max_ost_id[i]+1;
+               }
+
+               if (lfsck_set_last_id(ctx, ost_hdr.ost_last_id)) {
+                       fprintf(stderr, "Failure to set last id\n");
+                       ctx->flags |= E2F_FLAG_ABORT;
+                       goto out;
+               }
+
+#ifdef LOG_REMOVAL
+               if (lfsck_remove_ost_logs(ctx, block_buf))
+                       ctx->flags |= E2F_FLAG_ABORT;
+#endif
+       }
+
+       memset(&key, 0, sizeof(key));
+       memset(&data, 0, sizeof(data));
+       key.data = &ost_hdr.ost_magic;
+       key.size = sizeof(ost_hdr.ost_magic);
+       cputole_ost_hdr(&ost_hdr);
+       data.data = &ost_hdr;
+       data.size = sizeof(ost_hdr);
+       if (osthdr->put(osthdr, NULL, &key, &data, 0)) {
+               fprintf(stderr, "Failed to db_put data\n");
+               ctx->flags |= E2F_FLAG_ABORT;
+               goto out;
+       }
+
+out:
+       if (mds_hdrdb)
+               mds_hdrdb->close(mds_hdrdb, 0);
+       if (outdb)
+               outdb->close(outdb, 0);
+       if (osthdr)
+               osthdr->close(osthdr, 0);
+       if (block_buf)
+               ext2fs_free_mem((void *)&(block_buf));
+       return;
+}
+
+/*
+ * Remove CATLIST from the root directory, then every removable entry of
+ * the OBJECTS directory.
+ *
+ * Returns 0 on success, -EINVAL if CATLIST could not be removed, -ENOENT
+ * if OBJECTS is missing or is not a directory, -EIO if iterating it
+ * failed.  E2F_FLAG_ABORT is set in ctx->flags on every failure.
+ */
+int lfsck_remove_mds_logs(e2fsck_t ctx)
+{
+       ext2_filsys fs = ctx->fs;
+       struct lfsck_ost_ctx lctx;
+       ext2_ino_t objects_ino;
+       int err;
+
+       if (lfsck_rm_file(ctx, EXT2_ROOT_INO, CATLIST)) {
+               err = -EINVAL;
+               goto abort;
+       }
+
+       if (ext2fs_lookup(fs, EXT2_ROOT_INO, OBJECTS, strlen(OBJECTS),
+                         NULL, &objects_ino) ||
+           ext2fs_check_directory(fs, objects_ino)) {
+               err = -ENOENT;
+               goto abort;
+       }
+
+       lctx.ctx = ctx;
+       lctx.dirinode = objects_ino;
+
+       if (ext2fs_dir_iterate2(fs, objects_ino, 0, NULL,
+                               lfsck_rm_log, &lctx)) {
+               err = -EIO;
+               goto abort;
+       }
+       return 0;
+
+abort:
+       ctx->flags |= E2F_FLAG_ABORT;
+       return err;
+}
+
+
+/*
+ * On the mds save the fid and directory information for each file.
+ * The mds ost tables have already been populated by pass1
+ */
+void e2fsck_pass6_mdt(e2fsck_t ctx)
+{
+       ext2_filsys fs = ctx->fs;
+       struct problem_context pctx;
+       struct lfsck_mds_ctx lctx;
+       struct lfsck_mds_dirent mds_dirent;
+       struct lfsck_mds_hdr mds_hdr;
+       DBT key, data;
+       DB *outdb = NULL, *dbhdr = NULL;
+       __u32 compat, rocompat, incompat, index;
+       int rc, i;
+
+       clear_problem_context(&pctx);
+
+       memset(&lctx, 0, sizeof(lctx));
+       lctx.ctx = ctx;
+
+       /* Found no files with EA on filesystem - empty */
+       if (ctx->lfsck_oinfo == NULL) {
+               if (unlink(ctx->lustre_mdsdb)) {
+                       if (errno != ENOENT) {
+                               fprintf(stderr, "Failure to remove old "
+                                       "db file %s\n", ctx->lustre_mdsdb);
+                               ctx->flags |= E2F_FLAG_ABORT;
+                               goto out;
+                       }
+               }
+               rc = ext2fs_get_mem(sizeof(struct lfsck_outdb_info),
+                                   &ctx->lfsck_oinfo);
+               if (rc) {
+                       ctx->lfsck_oinfo = NULL;
+                       ctx->flags |= E2F_FLAG_ABORT;
+                       goto out;
+               }
+               memset(ctx->lfsck_oinfo, 0, sizeof(struct lfsck_outdb_info));
+               rc = ext2fs_get_mem(sizeof(struct lfsck_ofile_ctx)*LOV_MAX_OSTS,
+                                   &ctx->lfsck_oinfo->ofile_ctx);
+               if (rc) {
+                       ext2fs_free_mem(&ctx->lfsck_oinfo);
+                       ctx->lfsck_oinfo = NULL;
+                       ctx->flags |= E2F_FLAG_ABORT;
+                       goto out;
+               }
+               memset(ctx->lfsck_oinfo->ofile_ctx, 0,
+                      sizeof(struct lfsck_ofile_ctx) * LOV_MAX_OSTS);
+       }
+
+       if (!(ctx->options & E2F_OPT_READONLY))
+                lfsck_write_mds_hdrinfo(ctx, ctx->lfsck_oinfo);
+
+       if (lfsck_opendb(ctx->lustre_mdsdb, MDS_DIRINFO, &outdb, 1,
+                        sizeof(mds_dirent.mds_fid) + sizeof(mds_dirent),
+                        fs->super->s_inodes_count -
+                        fs->super->s_free_inodes_count)) {
+               fprintf(stderr, "failure to open database %s\n", MDS_DIRINFO);
+               ctx->flags |= E2F_FLAG_ABORT;
+               goto out;
+       }
+
+       lctx.outdb = outdb;
+       lctx.numfiles = 0;
+       lctx.dot = EXT2_ROOT_INO;
+       lctx.dotdot = EXT2_ROOT_INO;
+       lctx.dotfid.f_seq = EXT2_ROOT_INO;
+
+       rc = ext2fs_dir_iterate2(fs, EXT2_ROOT_INO,0,NULL,lfsck_mds_dirs,&lctx);
+       if (rc != 0) {
+               fprintf(stderr, "Error iterating directories: %d\n", rc);
+               ctx->flags |= E2F_FLAG_ABORT;
+               goto out;
+       }
+
+       /* read in e2fsck_lfsck_save_ea() already if we opened read/write */
+       if (ctx->lfsck_oinfo->ost_count == 0)
+               e2fsck_get_lov_objids(ctx, ctx->lfsck_oinfo);
+
+       memset(&mds_hdr, 0, sizeof(mds_hdr));
+       mds_hdr.mds_magic = MDS_MAGIC;
+       mds_hdr.mds_flags = ctx->options & E2F_OPT_READONLY;
+       mds_hdr.mds_max_files = fs->super->s_inodes_count -
+                           fs->super->s_free_inodes_count;
+       VERBOSE(ctx, "MDS: max_files = "LPU64"\n", mds_hdr.mds_max_files);
+       mds_hdr.mds_num_osts = ctx->lfsck_oinfo->ost_count;
+       VERBOSE(ctx, "MDS: num_osts = %u\n", mds_hdr.mds_num_osts);
+       for (i = 0; i < mds_hdr.mds_num_osts; i++) {
+               mds_hdr.mds_max_ost_id[i] =
+                       ctx->lfsck_oinfo->ofile_ctx[i].max_id;
+       }
+
+       if (e2fsck_get_last_rcvd_info(ctx, &mds_hdr.mds_uuid, NULL, NULL,
+                                     &index, &compat, &rocompat, &incompat)) {
+               fprintf(stderr, "Failure to read MDS last_rcvd file\n");
+               ctx->flags |= E2F_FLAG_ABORT;
+               goto out;
+       }
+
+       VERBOSE(ctx, "MDS: '%s' mdt idx %u: compat %#x rocomp %#x incomp %#x\n",
+               (char *)&mds_hdr.mds_uuid.uuid, index,compat,rocompat,incompat);
+
+       if (compat & OBD_COMPAT_OST || incompat & OBD_INCOMPAT_OST) {
+               fprintf(stderr, "Found OST last_rcvd file doing MDS check\n");
+               ctx->flags |= E2F_FLAG_ABORT;
+               goto out;
+       }
+
+       if (!(ctx->options & E2F_OPT_READONLY)) {
+               if (lfsck_rm_file(ctx, EXT2_ROOT_INO, LOV_OBJID)) {
+                       ctx->flags |= E2F_FLAG_ABORT;
+                       goto out;
+               }
+#ifdef LOG_REMOVAL
+               if (lfsck_remove_mds_logs(ctx)) {
+                       ctx->flags |= E2F_FLAG_ABORT;
+                       return;
+               }
+#endif
+       }
+
+       rc = lfsck_opendb(ctx->lustre_mdsdb, MDS_HDR, &dbhdr, 0, 0, 0);
+       if (rc != 0) {
+               fprintf(stderr, "failure to open database %s: %s\n", MDS_HDR,
+                       db_strerror(rc));
+               ctx->flags |= E2F_FLAG_ABORT;
+               goto out;
+       }
+
+       memset(&key, 0, sizeof(key));
+       memset(&data, 0, sizeof(data));
+       key.data = &mds_hdr.mds_magic;
+       key.size = sizeof(mds_hdr.mds_magic);
+       cputole_mds_hdr(&mds_hdr);
+       data.data = &mds_hdr;
+       data.size = sizeof(mds_hdr);
+       rc = dbhdr->put(dbhdr, NULL, &key, &data, 0);
+       if (rc != 0) {
+               fprintf(stderr, "error: db put %s: %s\n", MDS_HDR,
+                       db_strerror(rc));
+               ctx->flags |= E2F_FLAG_ABORT;
+               goto out;
+       }
+out:
+       if (dbhdr)
+               dbhdr->close(dbhdr, 0);
+       if (outdb)
+               outdb->close(outdb, 0);
+}
+
+/*
+ * Pass 6 entry point: when a Lustre device type was requested, announce
+ * the pass and collect the lfsck data for it.  The OST flag is tested
+ * before the MDS flag; a device type with neither set is reported on
+ * stderr.
+ */
+void e2fsck_pass6(e2fsck_t ctx)
+{
+       const char *devname;
+
+       if (ctx->lustre_devtype == LUSTRE_NULL)
+               return;
+
+       if (ctx->lustre_devtype & LUSTRE_OST)
+               devname = "OST";
+       else if (ctx->lustre_devtype & LUSTRE_MDS)
+               devname = "MDT";
+       else
+               devname = "device";
+
+       printf("Pass 6: Acquiring %s information for lfsck\n", devname);
+       fflush(stdout);
+
+       if (ctx->lustre_devtype & LUSTRE_OST) {
+               e2fsck_pass6_ost(ctx);
+               return;
+       }
+       if (ctx->lustre_devtype & LUSTRE_MDS) {
+               e2fsck_pass6_mdt(ctx);
+               return;
+       }
+
+       fprintf(stderr, "Invalid lustre dev %x\n", ctx->lustre_devtype);
+}
+#endif /* HAVE_LFSCK */
index 75b82d3..3be3fdf 100644 (file)
@@ -10,6 +10,7 @@
  */
 
 #define _XOPEN_SOURCE 600 /* for inclusion of sa_handler in Solaris */
+#define _GNU_SOURCE
 
 #include "config.h"
 #include <stdio.h>
@@ -45,6 +46,9 @@ extern int optind;
 #ifdef HAVE_DIRENT_H
 #include <dirent.h>
 #endif
+#ifdef HAVE_LIMITS_H
+#include <limits.h>
+#endif
 
 #include "e2p/e2p.h"
 #include "et/com_err.h"
@@ -53,6 +57,17 @@ extern int optind;
 #include "problem.h"
 #include "../version.h"
 
+#include "ext2fs/lfsck.h"
+
+/*
+ * Long-only options handed to getopt_long() in PRS().  Each entry takes a
+ * required argument (has_arg == 1) and, because its flag pointer is NULL,
+ * makes getopt_long() return the last field: 1 for --mdsdb/--mdtdb and
+ * 2 for --ostdb, which PRS() handles as "case 1" and "case 2".
+ */
+static struct option long_options[] = {
+#ifdef HAVE_LFSCK
+       { "mdsdb", 1, NULL, 1 },
+       { "mdtdb", 1, NULL, 1 },
+       { "ostdb", 1, NULL, 2 },
+#endif
+       { 0, 0, 0, 0 }
+};
+
 /* Command line options */
 static int cflag;              /* check disk */
 static int show_version_only;
@@ -363,6 +378,15 @@ static void check_if_skip(e2fsck_t ctx)
                             fs->super->s_checkinterval*2))
                        reason = 0;
        }
+#ifdef HAVE_LFSCK
+       if (ctx->lustre_devtype & LUSTRE_TYPE) {
+               if (!reason || ctx->options & E2F_OPT_READONLY)
+                       ctx->lustre_devtype |= LUSTRE_ONLY;
+               if (!reason)
+                       reason = _(" lustre database creation");
+       }
+#endif
+
        if (reason) {
                log_out(ctx, "%s", ctx->device_name);
                log_out(ctx, reason, reason_arg);
@@ -429,6 +453,15 @@ static void check_if_skip(e2fsck_t ctx)
 skip:
        ext2fs_close(fs);
        ctx->fs = NULL;
+#ifdef HAVE_LFSCK
+       if (ctx->lustre_mdsdb)
+               free(ctx->lustre_mdsdb);
+       if (ctx->lustre_ostdb)
+               free(ctx->lustre_ostdb);
+       if (ctx->lfsck_oinfo)
+               e2fsck_lfsck_cleanupdb(ctx);
+#endif /* HAVE_LFSCK */
+
        e2fsck_free_context(ctx);
        exit(FSCK_OK);
 }
@@ -800,6 +833,7 @@ static errcode_t PRS(int argc, char *argv[], e2fsck_t *ret_ctx)
 {
        int             flush = 0;
        int             c, fd;
+       int             option_index;
 #ifdef MTRACE
        extern void     *mallwatch;
 #endif
@@ -849,8 +883,75 @@ static errcode_t PRS(int argc, char *argv[], e2fsck_t *ret_ctx)
 
        ctx->inode_badness_threshold = BADNESS_THRESHOLD;
 
-       while ((c = getopt (argc, argv, "panyrcC:B:dE:fvtFVM:b:I:j:P:l:L:N:SsDk")) != EOF)
+       ctx->lustre_devtype = LUSTRE_NULL;
+
+       while ((c = getopt_long(argc, argv,
+                               "panyrcC:B:dE:fvtFVM:b:I:j:P:l:L:N:SsDk",
+                               long_options, &option_index)) != EOF)
                switch (c) {
+#ifdef HAVE_LFSCK
+               case 1: {
+                       char *dbpath, *tmp;
+
+                       if (!optarg)
+                               usage(ctx);
+
+                       dbpath = malloc(PATH_MAX);
+                       if (dbpath == NULL) {
+                               fprintf(stderr, "Out of memory\n");
+                               exit(1);
+                       }
+                       tmp = malloc(PATH_MAX);
+                       if (tmp == NULL) {
+                               fprintf(stderr, "Out of memory\n");
+                               exit(1);
+                       }
+
+                       strcpy(tmp, optarg);
+                       if (realpath(my_dirname(tmp), dbpath) == NULL) {
+                               fprintf(stderr, "Failure to resolve path %s\n",
+                                       optarg);
+                               exit(1);
+                       }
+
+                       strcpy(tmp, optarg);
+                       sprintf(dbpath+strlen(dbpath), "/%s", my_basename(tmp));
+                       ctx->lustre_mdsdb = dbpath;
+                       ctx->lustre_devtype |= LUSTRE_MDS;
+
+                       free(tmp);
+                       break;
+               }
+               case 2: {
+                       char *dbpath, *tmp;
+
+                       dbpath = malloc(PATH_MAX);
+                       if (dbpath == NULL) {
+                               fprintf(stderr, "Out of memory\n");
+                               exit(1);
+                       }
+                       tmp = malloc(PATH_MAX);
+                       if (tmp == NULL) {
+                               fprintf(stderr, "Out of memory\n");
+                               exit(1);
+                       }
+
+                       strcpy(tmp, optarg);
+                       if (realpath(my_dirname(tmp), dbpath) == NULL) {
+                               fprintf(stderr, "Failure to resolve path %s\n",
+                                       optarg);
+                               exit(1);
+                       }
+
+                       strcpy(tmp, optarg);
+                       sprintf(dbpath+strlen(dbpath), "/%s", my_basename(tmp));
+                       ctx->lustre_ostdb = dbpath;
+                       ctx->lustre_devtype |= LUSTRE_OST;
+
+                       free(tmp);
+                       break;
+               }
+#endif /* HAVE_LFSCK */
                case 'C':
                        ctx->progress = e2fsck_update_progress;
                        res = sscanf(optarg, "%d", &ctx->progress_fd);
@@ -964,6 +1065,7 @@ static errcode_t PRS(int argc, char *argv[], e2fsck_t *ret_ctx)
                        break;
                case 'v':
                        verbose = 1;
+                       ctx->options |= E2F_OPT_VERBOSE;
                        break;
                case 'V':
                        show_version_only = 1;
@@ -982,6 +1084,16 @@ static errcode_t PRS(int argc, char *argv[], e2fsck_t *ret_ctx)
                default:
                        usage(ctx);
                }
+#ifdef HAVE_LFSCK
+       if (ctx->lustre_devtype) {
+               if ((ctx->lustre_devtype != LUSTRE_MDS) &&
+                   ctx->lustre_devtype != (LUSTRE_MDS | LUSTRE_OST)) {
+                       com_err(ctx->program_name, 0,
+                               _("must specify --mdsdb with --ostdb"));
+                       usage(ctx);
+               }
+       }
+#endif /* HAVE_LFSCK */
        if (show_version_only)
                return 0;
        if (optind != argc - 1)
@@ -1874,6 +1986,14 @@ no_journal:
        ext2fs_close(fs);
        ctx->fs = NULL;
        free(ctx->journal_name);
+#ifdef HAVE_LFSCK
+       if (ctx->lfsck_oinfo)
+               e2fsck_lfsck_cleanupdb(ctx);
+       if (ctx->lustre_mdsdb)
+               free(ctx->lustre_mdsdb);
+       if (ctx->lustre_ostdb)
+               free(ctx->lustre_ostdb);
+#endif /* HAVE_LFSCK */
 
        e2fsck_free_context(ctx);
        remove_error_table(&et_ext2_error_table);
index 0a5915c..eb6d879 100644 (file)
@@ -17,6 +17,8 @@ Source0: http://downloads.sourceforge.net/%{name}/%{name}-%{version}.tar.gz
 
 Url: http://downloads.whamcloud.com/public/
 Provides:       ldiskfsprogs = %{version}
+@LFSCK_CMT@Requires: db4 >= @DB4VERSION@
+@LFSCK_CMT@BuildRequires: db4-devel >= 4.1 texinfo
 BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n)
 Requires: e2fsprogs-libs = %{version}-%{release}
 
@@ -156,6 +158,7 @@ It was originally inspired by the Multics SubSystem library.
 %configure --enable-elf-shlibs --enable-nls --disable-uuidd --disable-fsck \
           --disable-e2initrd-helper --disable-libblkid --disable-libuuid \
           --disable-defrag --enable-symlink-install \
+          @WITH_LUSTRE@ @ENABLE_LFSCK@ \
           %{?extra_config_flags:%extra_config_flags}
 make %{?_smp_mflags}
 
@@ -223,6 +226,7 @@ exit 0
 %{_root_sbindir}/fsck.ext3
 %{_root_sbindir}/fsck.ext4
 %{_root_sbindir}/fsck.ext4dev
+@LFSCK_CMT@%{_sbindir}/lfsck
 %{_root_sbindir}/logsave
 %{_root_sbindir}/mke2fs
 %{_root_sbindir}/mkfs.ext2
@@ -256,6 +260,7 @@ exit 0
 %{_mandir}/man8/e2image.8*
 %{_mandir}/man8/e2label.8*
 %{_mandir}/man8/e2undo.8*
+@LFSCK_CMT@%{_mandir}/man8/lfsck.8*
 %{_mandir}/man8/logsave.8*
 %{_mandir}/man8/mke2fs.8*
 %{_mandir}/man8/mkfs.ext2.8*
index 2ee50f8..a524f4e 100644 (file)
@@ -172,6 +172,7 @@ autoreconf --force --install
   --disable-libuuid \
   --disable-uuidd \
   --disable-fsck \
+  @WITH_LUSTRE@ @ENABLE_LFSCK@ \
   %{?extra_config_flags:%extra_config_flags} CFLAGS="$RPM_OPT_FLAGS"
 make V=1
 
@@ -220,6 +221,7 @@ rm -rf $RPM_BUILD_ROOT
 /sbin/fsck.ext2
 /sbin/fsck.ext3
 /sbin/fsck.ext4
+@LFSCK_CMT@/usr/sbin/lfsck
 /sbin/mke2fs
 /sbin/mkfs.ext2
 /sbin/mkfs.ext3
index 3f6e578..3bf0410 100644 (file)
@@ -15,10 +15,14 @@ Prereq: /sbin/ldconfig
 BuildRoot: %{_tmppath}/%{name}-root
 %if %{_vendor} == "suse"
 Group: System/Filesystems
+@LFSCK_CMT@Requires: db >= @DB4VERSION@
+@LFSCK_CMT@BuildRequires: db-devel >= 4.1 texinfo
 Provides: e2fsbn ext2fs libcom_err = %{version} ldiskfsprogs = %{version}
 Obsoletes: ext2fs libcom_err < %{version}
 %else
 Group: System Environment/Base
+@LFSCK_CMT@Requires: db4 >= @DB4VERSION@
+@LFSCK_CMT@BuildRequires: db4-devel >= 4.1 texinfo
 Provides: e2fsprogs-libs = %{version} ldiskfsprogs = %{version}
 Obsoletes: e4fsprogs e2fsprogs-libs < %{version}
 %endif
@@ -72,6 +76,7 @@ SMP systems.
 
 %build
 %configure --enable-elf-shlibs --enable-nls --disable-defrag \
+       @WITH_LUSTRE@ @ENABLE_LFSCK@ \
        %{?extra_config_flags:%extra_config_flags}
 make
 make check
@@ -132,6 +137,7 @@ exit 0
 %{_root_sbindir}/fsck.ext3
 %{_root_sbindir}/fsck.ext4
 %{_root_sbindir}/fsck.ext4dev
+@LFSCK_CMT@%{_sbindir}/lfsck
 %{_root_sbindir}/logsave
 %{_root_sbindir}/mke2fs
 %{_root_etcdir}/mke2fs.conf
@@ -169,6 +175,7 @@ exit 0
 %{_mandir}/man8/debugfs.8*
 %{_mandir}/man8/dumpe2fs.8*
 %{_mandir}/man8/e2fsck.8*
+@LFSCK_CMT@%{_mandir}/man8/lfsck.8*
 %{_mandir}/man8/findfs.8*
 %{_mandir}/man8/fsck.ext2.8*
 %{_mandir}/man8/fsck.ext3.8*
index 263af06..3e00db9 100644 (file)
 /* Define to 1 if you have the `chflags' function. */
 #undef HAVE_CHFLAGS
 
+/* Define to 1 if DB4 library is present */
+#undef HAVE_DB4
+
+/* Define to 1 if you have the <db.h> header file. */
+#undef HAVE_DB_H
+
 /* Define if the GNU dcgettext() function is already present or preinstalled.
    */
 #undef HAVE_DCGETTEXT
 /* Define if your <locale.h> file defines LC_MESSAGES. */
 #undef HAVE_LC_MESSAGES
 
+/* Define to 1 if Lustre lfsck is enabled */
+#undef HAVE_LFSCK
+
 /* Define to 1 if you have the <limits.h> header file. */
 #undef HAVE_LIMITS_H
 
diff --git a/lib/ext2fs/lfsck.h b/lib/ext2fs/lfsck.h
new file mode 100644 (file)
index 0000000..a29f1e9
--- /dev/null
@@ -0,0 +1,323 @@
+#undef PACKAGE
+#undef VERSION
+#ifndef LFSCK_H
+#define LFSCK_H
+
+#ifdef HAVE_LFSCK
+/* These are unfortunately needed for lustre_user.h to be usable.
+ * CLASSERT is a compile-time check: a cond that evaluates to 0 duplicates
+ * the "case 0" label and fails to compile.  LASSERT and LASSERTF expand to
+ * an empty statement here, so their arguments are never evaluated. */
+#define CLASSERT(cond)         ({ switch(42) { case (cond): case 0: break; } })
+#define LASSERT(cond)          do { } while (0)
+#define LASSERTF(cond, fmt, a) do { } while (0)
+
+#include "../lib/ext2fs/ext2fsP.h"
+#include <ext2fs/ext2_ext_attr.h>
+#include <lustre/liblustreapi.h>
+
+#ifdef HAVE_LIMITS_H
+#include <limits.h>
+#endif
+
+#include <db.h>
+
+/* printf() conversion strings, defined only when no earlier header has
+ * already provided LPU64.  The first branch (32-bit word size, or x86_64)
+ * selects the "long long" forms for the 64-bit macros; the second branch
+ * (other 64-bit word sizes) selects the "long" forms.
+ * NOTE(review): on x86_64 LPSZ/LPSSZ are "%u"/"%d" -- confirm callers do
+ * not pass size_t/ssize_t values with them there.  When __WORDSIZE is
+ * neither 32 nor 64 (and __x86_64__ is unset) no branch matches and the
+ * macros stay undefined. */
+#ifndef LPU64
+#if (__WORDSIZE == 32) || defined(__x86_64__)
+# define LPU64 "%llu"
+# define LPD64 "%lld"
+# define LPX64 "%#llx"
+# define LPSZ  "%u"
+# define LPSSZ "%d"
+#elif (__WORDSIZE == 64)
+# define LPU64 "%lu"
+# define LPD64 "%ld"
+# define LPX64 "%#lx"
+# define LPSZ  "%lu"
+# define LPSSZ "%ld"
+#endif
+#endif /* !LPU64 */
+
+/* Compatibility to allow 1.x lustre_user.h to be used with 2.x fields.
+ * There are also structures from lustre_idl.h below that are defined in
+ * terms of the 2.x field names that would have to be handled for 1.x if
+ * that lustre_idl.h was ever fixed to allow inclusion from userspace. */
+#ifndef IDENTITY_DOWNCALL_MAGIC
+#define l_object_seq   l_object_gr             /* for lov_ost_data_v1 */
+#define lmm_object_seq lmm_object_gr           /* for lov_mds_md_v1/3 */
+#endif /* IDENTITY_DOWNCALL_MAGIC */
+
+/* Unfortunately, neither the 1.8 nor the 2.x lustre_idl.h file is suitable
+ * for inclusion by userspace programs because of external dependencies.
+ * Define the minimum set of replacement functions here until that is fixed. */
+#ifndef HAVE_LUSTRE_LUSTRE_IDL_H
+#define fid_seq(fid) ((fid)->f_seq)
+#define fid_oid(fid) ((fid)->f_oid)
+#define fid_ver(fid) ((fid)->f_ver)
+
+#ifndef LL_IOC_PATH2FID
+#define DFID "["LPX64":0x%x:0x%x]"
+#define PFID(fid)     \
+        fid_seq(fid), \
+        fid_oid(fid), \
+        fid_ver(fid)
+#define llapi_get_connect_flags(mnt, flags) (0)
+struct lu_fid {
+       __u64   f_seq;
+       __u32   f_oid;
+       __u32   f_ver;
+};
+#endif
+
+#define OBD_CONNECT_FID                0x40000000ULL
+
+struct lustre_mdt_attrs {
+       __u32           lma_compat;
+       __u32           lma_incompat;
+       struct lu_fid   lma_self_fid;
+       __u64           lma_flags;
+       __u64           lma_ioepoch;
+       __u64           lma_som_size;
+       __u64           lma_som_blocks;
+       __u64           lma_som_mountid;
+};
+
+struct ost_id {
+       __u64   oi_id;
+       __u64   oi_seq;
+};
+
+/* FID sequence constants used by the helpers in this header:
+ * fid_seq_is_igif() treats [FID_SEQ_IGIF, FID_SEQ_IGIF_MAX] as the IGIF
+ * range, and fid_idif_seq() ORs FID_SEQ_IDIF into the sequence it builds. */
+enum fid_seq {
+       FID_SEQ_IGIF            = 12ULL,
+       FID_SEQ_IGIF_MAX        = 0x0ffffffffULL,
+       FID_SEQ_IDIF            = 0x100000000ULL,
+};
+
+/* Return 1 when seq lies within [FID_SEQ_IGIF, FID_SEQ_IGIF_MAX], else 0. */
+static inline int fid_seq_is_igif(const __u64 seq)
+{
+       if (seq < FID_SEQ_IGIF)
+               return 0;
+
+       return seq <= FID_SEQ_IGIF_MAX;
+}
+
+/* Return nonzero when the sequence number of *fid is in the IGIF range. */
+static inline int fid_is_igif(const struct lu_fid *fid)
+{
+       const __u64 seq = fid_seq(fid);
+
+       return fid_seq_is_igif(seq);
+}
+
+/* convert an OST objid + index into an IDIF FID SEQ number:
+ * FID_SEQ_IDIF, ORed with ost_idx shifted left by 16, ORed with bits
+ * 32-47 of id.
+ * NOTE(review): ost_idx is a __u32 and is shifted before being combined
+ * with the 64-bit terms, so presumably only indices below 0x10000 are
+ * expected here -- confirm against callers. */
+static inline __u64 fid_idif_seq(__u64 id, __u32 ost_idx)
+{
+       return FID_SEQ_IDIF | (ost_idx << 16) | ((id >> 32) & 0xffff);
+}
+
+/* Fill *fid with the IDIF FID for the 1.x style object id held in *oi on
+ * the OST with index idx.  The fields are written in seq, oid, ver order and
+ * oi->oi_id is re-read for each of them. */
+static inline void ostid_idif_unpack(struct ost_id *oi, struct lu_fid *fid,
+                                    __u32 idx)
+{
+       fid->f_seq = fid_idif_seq(oi->oi_id, idx);
+       fid->f_oid = (__u32)oi->oi_id;          /* keep only the low 32 bits */
+       fid->f_ver = (__u32)(oi->oi_id >> 48);  /* in theory, not currently used */
+}
+#endif /* HAVE_LUSTRE_LUSTRE_IDL_H */
+
+#ifndef DOIF
+#define DOIF LPU64":"LPU64
+#define POIF(oi) (oi)->oi_seq, (oi)->oi_id
+#endif
+
+/* Get O/R or O/0 dir */
+#define OBJECT_DIR  "O"
+#define OBJECT_DIR_V1 "R"
+#define OBJECT_DIR_V2 "0"
+#define LOG_DIR "1"
+#define PENDING_DIR "PENDING"
+#define OBJECTS "OBJECTS"
+#define CATLIST "CATALOGS"
+#define LAST_ID "LAST_ID"
+#define LAST_RCVD "last_rcvd"
+#define LOV_OBJID "lov_objid"
+
+#ifndef EXT3_XATTR_INDEX_TRUSTED       /* temporary until we hit l28 kernel */
+#define EXT3_XATTR_INDEX_TRUSTED       4
+#endif
+#ifndef EXT3_XATTR_INDEX_LUSTRE
+#define EXT3_XATTR_INDEX_LUSTRE                5
+#endif
+#define XATTR_LUSTRE_MDS_LOV_EA                "lov"
+#define XATTR_LUSTRE_MDT_LMA_EA                "lma"
+
+/* Database names */
+#define MDS_HDR       "mdshdr"
+#define MDS_DIRINFO   "mds_dirinfo"
+#define MDS_SIZEINFO  "mds_sizeinfo"
+#define MDS_OSTDB     "mds_ostdb"
+#define OST_HDR       "osthdr"
+#define OST_OSTDB     "ost_db"
+
+#define MDS_MAGIC     0xDBABCD01
+#define OST_MAGIC     0xDB123402
+
+#define OBD_COMPAT_OST         0x00000002 /* this is an OST (1.6+) */
+#define OBD_COMPAT_MDT         0x00000004 /* this is an MDT (1.6+) */
+
+#define OBD_INCOMPAT_OST       0x00000002 /* this is an OST (1.8+) */
+#define OBD_INCOMPAT_MDT       0x00000004 /* this is an MDS (1.8+) */
+
+#define LOV_MAX_OSTS 2048       /* Arbitrary limit, can be increased */
+/* Size in bytes of *lum plus num entries of lum->lmm_objects[].  Both macro
+ * arguments are parenthesized so that expression arguments (for example
+ * "count + 1") expand with the intended precedence. */
+#define LOV_EA_SIZE(lum, num) \
+       (sizeof(*(lum)) + (num) * sizeof(*(lum)->lmm_objects))
+/* LOV_EA_SIZE() for the maximum of LOV_MAX_OSTS objects. */
+#define LOV_EA_MAX(lum) LOV_EA_SIZE(lum, LOV_MAX_OSTS)
+
+/*XXX*/
+#define STRTOUL strtoul
+#define STRTOUL_MAX ULONG_MAX
+
+#define HASH_SIZE 131072
+
+struct lustre_server_data {
+       __u8  lsd_uuid[40];        /* server UUID */
+       __u64 lsd_last_transno;    /* last completed transaction ID */
+       __u64 lsd_compat14;        /* reserved - compat with old last_rcvd */
+       __u64 lsd_mount_count;     /* incarnation number */
+       __u32 lsd_feature_compat;  /* compatible feature flags */
+       __u32 lsd_feature_rocompat;/* read-only compatible feature flags */
+       __u32 lsd_feature_incompat;/* incompatible feature flags */
+       __u32 lsd_server_size;     /* size of server data area */
+       __u32 lsd_client_start;    /* start of per-client data area */
+       __u16 lsd_client_size;     /* size of per-client data area */
+       __u16 lsd_subdir_count;    /* number of subdirectories for objects */
+       __u64 lsd_catalog_oid;     /* recovery catalog object id */
+       __u32 lsd_catalog_ogen;    /* recovery catalog inode generation */
+       __u8  lsd_peeruuid[40];    /* UUID of LOV/OSC associated with MDS */
+       __u32 lsd_ost_index;       /* index number of OST in LOV */
+       __u32 lsd_mdt_index;       /* index number of MDT in LMV */
+};
+
+struct lfsck_mds_hdr {
+       __u64 mds_magic;
+       __u64 mds_flags;
+       __u64 mds_max_files;
+       __u32 mds_num_osts;
+       __u32 mds_unused;
+       __u64 mds_max_ost_id[LOV_MAX_OSTS];
+       struct obd_uuid mds_uuid;
+       struct obd_uuid mds_ost_info[LOV_MAX_OSTS];
+};
+
+struct lfsck_ost_hdr  {
+       __u64 ost_magic;
+       __u64 ost_flags;
+       __u64 ost_num_files;
+       __u64 ost_last_id;
+       __u32 ost_index;
+       __u32 ost_unused;
+       struct obd_uuid ost_mds_uuid;
+       struct obd_uuid ost_uuid;
+};
+
+struct lfsck_mds_dirent {
+       struct lu_fid mds_dirfid;
+       struct lu_fid mds_fid;
+};
+
+struct lfsck_mds_szinfo {
+       __u64 mds_fid;
+       __u64 mds_seq;
+       __u64 mds_size;
+       __u64 mds_calc_size;
+       __u32 mds_stripe_size;
+       __u32 mds_stripe_pattern;
+       __u16 mds_stripe_count;
+       __u16 mds_stripe_start;
+};
+
+struct lfsck_mds_objent {
+       struct lu_fid   mds_fid;
+       struct ost_id   mds_oi;
+       __u32           mds_ostidx;
+       __u32           mds_ostoffset;
+};
+
+struct lfsck_ost_objent {
+       struct ost_id   ost_oi;
+       __u64           ost_size;
+       __u64           ost_bytes;
+};
+
+struct lfsck_ofile_ctx {
+       DB *dbp;
+       __u64 max_id;
+       int have_max_id;
+};
+
+struct lfsck_outdb_info {
+       __u32 ost_count;
+       int have_ost_count;
+       DB *mds_sizeinfo_dbp;
+       struct lfsck_ofile_ctx *ofile_ctx;
+};
+
+/* pass6.c */
+#ifdef FSCK_OK /* compiling for e2fsck or lfsck */
+extern int e2fsck_lfsck_find_ea(e2fsck_t ctx, struct ext2_inode_large *inode,
+                               struct ext2_ext_attr_entry *entry, void *value,
+                               struct lov_user_md_v1 **lmm,
+                               struct lustre_mdt_attrs **lma);
+extern int e2fsck_lfsck_save_ea(e2fsck_t ctx, ext2_ino_t ino, __u32 generation,
+                                struct lov_user_md_v1 *lmm,
+                                struct lustre_mdt_attrs *lma);
+extern int e2fsck_lfsck_flush_ea(e2fsck_t ctx);
+extern int e2fsck_lfsck_cleanupdb(e2fsck_t ctx);
+extern int e2fsck_lfsck_remove_pending(e2fsck_t ctx, char *block_buf);
+
+/* lfsck_common.c */
+extern char *my_dirname(char *path);
+extern const char *my_basename(const char *path);
+extern int lfsck_create_dbenv(const char *progname);
+extern int lfsck_opendb(const char *fname, const char *dbname, DB **dbpp,
+                       int allow_dup, int keydata_size, int num_files);
+extern void cputole_mds_hdr(struct lfsck_mds_hdr *mds_hdr);
+extern void letocpu_mds_hdr(struct lfsck_mds_hdr *mds_hdr);
+extern void cputole_ost_hdr(struct lfsck_ost_hdr *ost_hdr);
+extern void letocpu_ost_hdr(struct lfsck_ost_hdr *ost_hdr);
+extern void cputole_fid(struct lu_fid *fid);
+extern void letocpu_fid(struct lu_fid *fid);
+extern void cputole_mds_dirent(struct lfsck_mds_dirent *mds_dirent);
+extern void letocpu_mds_dirent(struct lfsck_mds_dirent *mds_dirent);
+extern void cputole_mds_szinfo(struct lfsck_mds_szinfo *mds_szinfo);
+extern void letocpu_mds_szinfo(struct lfsck_mds_szinfo *mds_szinfo);
+extern void cputole_mds_objent(struct lfsck_mds_objent *mds_objent);
+extern void letocpu_mds_objent(struct lfsck_mds_objent *mds_objent);
+extern void cputole_ost_objent(struct lfsck_ost_objent *ost_objent);
+extern void letocpu_ost_objent(struct lfsck_ost_objent *ost_objent);
+extern void letocpu_lov_user_md(struct lov_user_md *lmm);
+
+int lfsck_get_fid(ext2_filsys fs, ino_t ino, struct lu_fid *fid);
+int lfsck_is_dirfid_root(const struct lu_fid *dirfid);
+int lfsck_fidcmp(const struct lu_fid *fid1, const struct lu_fid *fid2);
+#endif /* FSCK_OK */
+
+/* Byte offsets computed from the sizes of struct lfsck_mds_hdr and of the
+ * dirent, szinfo and objent tables, each counted as numfiles entries.
+ * The numfiles argument is parenthesized everywhere so that an expression
+ * argument such as "n + 1" scales the whole entry size.
+ * NOTE(review): struct lfsck_mds_objent_hdr is not declared in this header,
+ * so the two *_OST_* macros only compile where it is declared elsewhere.
+ * In both of them (idx) multiplies only the objent table size, not the
+ * objent_hdr size inside the same parentheses -- confirm this is intended. */
+#define MDS_START_DIRENT_TABLE sizeof(struct lfsck_mds_hdr)
+
+#define MDS_START_SZINFO_TABLE(numfiles)                               \
+  (sizeof(struct lfsck_mds_hdr) +                                      \
+   (sizeof(struct lfsck_mds_dirent) * (numfiles)))
+
+#define MDS_START_OST_TABLE_OFFSET(idx, numfiles)                      \
+  (sizeof(struct lfsck_mds_hdr) +                                      \
+   (sizeof(struct lfsck_mds_dirent) * (numfiles)) +                    \
+   (sizeof(struct lfsck_mds_szinfo) * (numfiles)) +                    \
+   (sizeof(struct lfsck_mds_objent_hdr) +                              \
+   ((sizeof(struct lfsck_mds_objent) * (numfiles))) * (idx)) +         \
+   sizeof(struct lfsck_mds_objent_hdr))
+
+#define MDS_START_OST_HDR_OFFSET(idx, numfiles)                        \
+  (sizeof(struct lfsck_mds_hdr) +                                      \
+   (sizeof(struct lfsck_mds_dirent) * (numfiles)) +                    \
+   (sizeof(struct lfsck_mds_szinfo) * (numfiles)) +                    \
+   (sizeof(struct lfsck_mds_objent_hdr) +                              \
+   ((sizeof(struct lfsck_mds_objent) * (numfiles))) * (idx)))
+
+#define OST_START_OFFSET  sizeof(struct lfsck_ost_hdr)
+
+#else /* !HAVE_LFSCK */
+/* Without HAVE_LFSCK the e2fsck_lfsck_*() entry points expand to constants
+ * and none of their arguments are evaluated.
+ * NOTE(review): e2fsck_lfsck_save_ea() expands to an empty statement rather
+ * than a value, unlike its int prototype in the HAVE_LFSCK branch, so
+ * callers cannot use its result. */
+#define e2fsck_lfsck_find_ea(ctx, inode, entry, value, lmm, lma) (0)
+#define e2fsck_lfsck_save_ea(ctx, ino, generation, lmm, lma) do {} while(0)
+#define e2fsck_lfsck_flush_ea(ctx) (0)
+#define e2fsck_lfsck_cleanupdb(ctx) (0)
+#define e2fsck_lfsck_remove_pending(ctx, block_buf) (0)
+#endif /* HAVE_LFSCK */
+
+#endif /* LFSCK_H */
index 936bb04..377a9be 100644 (file)
@@ -1458,7 +1458,7 @@ profile_error:
                        }
                        break;
                case 'v':
-                       verbose = 1;
+                       verbose++;
                        break;
                case 'F':
                        force++;
index 0e074d8..d6cb4ff 100644 (file)
@@ -5,6 +5,8 @@ SS_DIR                  @SS_DIR@
 E2FSPROGS_MONTH                @E2FSPROGS_MONTH@
 E2FSPROGS_YEAR         @E2FSPROGS_YEAR@
 E2FSPROGS_VERSION      @E2FSPROGS_VERSION@
+LFSCK_CMT              @LFSCK_CMT@
+LFSCK_MAN              @LFSCK_MAN@
 SIZEOF_LONG_LONG       @SIZEOF_LONG_LONG@
 SIZEOF_LONG            @SIZEOF_LONG@
 SIZEOF_INT             @SIZEOF_INT@