Whamcloud - gitweb
LUDOC-394 manual: remove 'dbdoclet.' from crossrefs
[doc/manual.git] / TroubleShootingRecovery.xml
index fde2b4e..b9c97ce 100644 (file)
@@ -1,7 +1,7 @@
 <?xml version='1.0' encoding='utf-8'?>
 <chapter xmlns="http://docbook.org/ns/docbook"
-xmlns:xl="http://www.w3.org/1999/xlink" version="5.0" xml:lang="en-US"
-xml:id="troubleshootingrecovery">
+ xmlns:xl="http://www.w3.org/1999/xlink" version="5.0" xml:lang="en-US"
+ xml:id="troubleshootingrecovery">
   <title xml:id="troubleshootingrecovery.title">Troubleshooting
   Recovery</title>
   <para>This chapter describes what to do if something goes wrong during
@@ -9,26 +9,26 @@ xml:id="troubleshootingrecovery">
   <itemizedlist>
     <listitem>
       <para>
-        <xref linkend="dbdoclet.50438225_71141" />
+        <xref linkend="recover_ldiskfs_errors" />
       </para>
     </listitem>
     <listitem>
       <para>
-        <xref linkend="dbdoclet.50438225_37365" />
+        <xref linkend="recover_lustreFS_corruption" />
       </para>
     </listitem>
     <listitem>
       <para>
-        <xref linkend="dbdoclet.50438225_12316" />
+        <xref linkend="recover_unavailable_ost" />
       </para>
     </listitem>
     <listitem>
       <para>
-        <xref linkend="dbdoclet.lfsckadmin" />
+        <xref linkend="lfsckadmin" />
       </para>
     </listitem>
   </itemizedlist>
-  <section xml:id="dbdoclet.50438225_71141">
+  <section xml:id="recover_ldiskfs_errors">
     <title>
     <indexterm>
       <primary>recovery</primary>
@@ -57,14 +57,9 @@ Dec 29 14:11:32 mookie kernel: Remounting filesystem read-only </screen>
     <para>In the vast majority of cases, the Lustre software can cope with any
     inconsistencies found on the disk and between other devices in the file
     system.</para>
-    <note>
-         <para>The legacy offline-LFSCK tool included with e2fsprogs is rarely
-      required for Lustre file system operation. offline-LFSCK is not to be
-      confused with LFSCK tool, which is part of Lustre and provides online
-      consistency checking.</para>
-    </note>
     <para>For problem analysis, it is strongly recommended that
-    <literal>e2fsck</literal> be run under a logger, like script, to record all
+    <literal>e2fsck</literal> be run under a logger, like
+    <literal>script</literal>, to record all
     of the output and changes that are made to the file system in case this
     information is needed later.</para>
     <para>If time permits, it is also a good idea to first run
@@ -98,14 +93,14 @@ root# e2fsck -fn /dev/sda   # don't fix file system, just check for corruption
 :
 root# e2fsck -fp /dev/sda   # fix errors with prudent answers (usually <literal>yes</literal>)</screen>
   </section>
-  <section xml:id="dbdoclet.50438225_37365">
+  <section xml:id="recover_lustreFS_corruption">
     <title>
     <indexterm>
       <primary>recovery</primary>
       <secondary>corruption of Lustre file system</secondary>
     </indexterm>Recovering from Corruption in the Lustre File System</title>
     <para>In cases where an ldiskfs MDT or OST becomes corrupt, you need to run
-    e2fsck to correct the local filesystem consistency, then use
+    <literal>e2fsck</literal> to ensure local filesystem consistency, then use
     <literal>LFSCK</literal> to run a distributed check on the file system to
     resolve any inconsistencies between the MDTs and OSTs, or among MDTs.</para>
     <orderedlist>
@@ -123,7 +118,7 @@ root# e2fsck -fp /dev/sda   # fix errors with prudent answers (usually <literal>
         necessary, to reduce the outage window.</para>
       </listitem>
     </orderedlist>
-    <section xml:id="dbdoclet.50438225_13916">
+    <section xml:id="orphan_objects">
       <title>
       <indexterm>
         <primary>recovery</primary>
@@ -140,7 +135,7 @@ root# e2fsck -fp /dev/sda   # fix errors with prudent answers (usually <literal>
        identify and process orphan objects found on MDTs as well.</para>
     </section>
   </section>
-  <section xml:id="dbdoclet.50438225_12316">
+  <section xml:id="recover_unavailable_ost">
     <title>
     <indexterm>
       <primary>recovery</primary>
@@ -186,7 +181,7 @@ root# e2fsck -fp /dev/sda   # fix errors with prudent answers (usually <literal>
       <xref linkend="lustrerecovery" />(Version-based Recovery).</para>
     </note>
   </section>
-  <section xml:id="dbdoclet.lfsckadmin" condition='l23'>
+  <section xml:id="lfsckadmin">
     <title>
     <indexterm>
       <primary>recovery</primary>
@@ -196,36 +191,34 @@ root# e2fsck -fp /dev/sda   # fix errors with prudent answers (usually <literal>
       <primary>recovery</primary>
       <secondary>LFSCK</secondary>
     </indexterm>Checking the file system with LFSCK</title>
-       <para condition='l23'>LFSCK is an administrative tool introduced in Lustre
-    software release 2.3 for checking and repair of the attributes specific to a
-    mounted Lustre file system. It is similar in concept to an offline fsck repair
-    tool for a local filesystem, but LFSCK is implemented to run as part of the
-    Lustre file system while the file system is mounted and in use. This allows
-    consistency of checking and repair by the Lustre software without unnecessary
-    downtime, and can be run on the largest Lustre file systems with negligible
-    disruption to normal operations.</para>
-    <para condition='l23'>Since Lustre software release 2.3, LFSCK can verify
-    and repair the Object Index (OI) table that is used internally to map
-    Lustre File Identifiers (FIDs) to MDT internal ldiskfs inode numbers, in
-    an internal table called the OI Table. An OI Scrub traverses this the IO
-    Table and makes corrections where necessary. An OI Scrub is required after
-    restoring from a file-level MDT backup (
-    <xref linkend="dbdoclet.backup_device" />), or in case the OI Table is
-    otherwise corrupted. Later phases of LFSCK will add further checks to the
-    Lustre distributed file system state.</para>
-    <para condition='l24'>In Lustre software release 2.4, LFSCK namespace
-    scanning can verify and repair the directory FID-in-Dirent and LinkEA
-    consistency.</para>
+    <para>LFSCK is an administrative tool for checking and repair of the
+      attributes specific to a mounted Lustre file system. It is similar
+      in concept to an offline fsck repair tool for a local filesystem,
+      but LFSCK is implemented to run as part of the Lustre file system
+      while the file system is mounted and in use. This allows consistency
+      checking and repair of Lustre-specific metadata without unnecessary
+      downtime, and can be run on the largest Lustre file systems with
+      minimal impact to normal operations.</para>
+    <para>LFSCK can verify
+      and repair the Object Index (OI) Table, which is used internally to map
+      Lustre File Identifiers (FIDs) to MDT internal ldiskfs inode numbers.
+      An OI Scrub traverses the OI Table
+      and makes corrections where necessary. An OI Scrub is required after
+      restoring from a file-level MDT backup (
+      <xref linkend="backup_device" />), or in case the OI Table is
+      otherwise corrupted. Later phases of LFSCK will add further checks to the
+      Lustre distributed file system state. LFSCK namespace scanning can verify
+      and repair the directory FID-in-dirent and LinkEA consistency.</para>
     <para condition='l26'>In Lustre software release 2.6, LFSCK layout scanning
-    can verify and repair MDT-OST file layout inconsistencies. File layout
-    inconsistencies between MDT-objects and OST-objects that are checked and
-    corrected include dangling reference, unreferenced OST-objects, mismatched
-    references and multiple references.</para>
+      can verify and repair MDT-OST file layout inconsistencies. File layout
+      inconsistencies between MDT-objects and OST-objects that are checked and
+      corrected include dangling reference, unreferenced OST-objects, mismatched
+      references and multiple references.</para>
     <para condition='l27'>In Lustre software release 2.7, LFSCK layout scanning
-    is enhanced to support verify and repair inconsistencies between multiple
-    MDTs.</para>
+      is enhanced to support verifying and repairing inconsistencies between
+      multiple MDTs.</para>
     <para>Control and monitoring of LFSCK is through LFSCK and the
-    <literal>/proc</literal> file system interfaces. LFSCK supports three types
+    <literal>lctl get_param</literal> command. LFSCK supports three types
     of interface: switch interface, status interface, and adjustment interface.
     These interfaces are detailed below.</para>
     <section>
@@ -243,6 +236,7 @@ root# e2fsck -fp /dev/sda   # fix errors with prudent answers (usually <literal>
                     [-A | --all] \
                     [-c | --create_ostobj <replaceable>on | off</replaceable>] \
                     [-C | --create_mdtobj <replaceable>on | off</replaceable>] \
+                    [-d | --delay_create_ostobj <replaceable>on | off</replaceable>] \
                     [-e | --error <replaceable>{continue | abort}</replaceable>] \
                     [-h | --help] \
                     [-n | --dryrun <replaceable>on | off</replaceable>] \
@@ -333,6 +327,21 @@ root# e2fsck -fp /dev/sda   # fix errors with prudent answers (usually <literal>
                 <row>
                   <entry>
                     <para>
+                      <literal>-d | --delay_create_ostobj</literal>
+                    </para>
+                  </entry>
+                  <entry>
+                    <para condition='l29'>
+                      Delay creating the lost OST-object for dangling LOV EA
+                      until the orphan OST-objects are handled.
+                      <literal>off</literal> (default) or
+                      <literal>on</literal>.
+                    </para>
+                  </entry>
+                </row>
+                <row>
+                  <entry>
+                    <para>
                       <literal>-e | --error</literal>
                     </para>
                   </entry>
@@ -425,9 +434,8 @@ root# e2fsck -fp /dev/sda   # fix errors with prudent answers (usually <literal>
                     started. Anytime the LFSCK is triggered, the OI scrub will
                     run automatically, so there is no need to specify
                     OI_scrub in that case.</para>
-                    <para condition='l24'>
-                    <literal>namespace</literal>: check and repair
-                    FID-in-Dirent and LinkEA consistency.</para>
+                    <para><literal>namespace</literal>: check and repair
+                    FID-in-dirent and LinkEA consistency.</para>
                     <para condition='l27'> Lustre-2.7 enhances
                     namespace consistency verification under DNE mode.</para>
                     <para condition='l26'>
@@ -530,6 +538,91 @@ root# e2fsck -fp /dev/sda   # fix errors with prudent answers (usually <literal>
         </section>
       </section>
     </section>
+    <section condition="l29">
+      <title>Check the LFSCK global status</title>
+      <section>
+        <title>Description</title>
+        <para>Check the LFSCK global status via a single
+        <literal>lctl lfsck_query</literal> command on the MDS.</para>
+      </section>
+      <section>
+        <title>Usage</title>
+<screen>lctl lfsck_query &lt;-M | --device <replaceable>MDT_device</replaceable>&gt; \
+                    [-h | --help] \
+                    [-t | --type <replaceable>lfsck_type[,lfsck_type...]</replaceable>] \
+                    [-w | --wait]</screen>
+      </section>
+      <section>
+        <title>Options</title>
+        <para>The various
+        <literal>lfsck_query</literal> options are listed and described below.
+        For a complete list of available options, type
+        <literal>lctl lfsck_query -h</literal>.</para>
+        <informaltable frame="all">
+          <tgroup cols="2">
+            <colspec colname="c1" colwidth="3*" />
+            <colspec colname="c2" colwidth="7*" />
+            <thead>
+              <row>
+                <entry>
+                  <para>
+                    <emphasis role="bold">Option</emphasis>
+                  </para>
+                </entry>
+                <entry>
+                  <para>
+                    <emphasis role="bold">Description</emphasis>
+                  </para>
+                </entry>
+              </row>
+            </thead>
+            <tbody>
+              <row>
+                <entry>
+                  <para>
+                    <literal>-M | --device</literal>
+                  </para>
+                </entry>
+                <entry>
+                  <para>The device to query for LFSCK status.</para>
+                </entry>
+              </row>
+              <row>
+                <entry>
+                  <para>
+                    <literal>-h | --help</literal>
+                  </para>
+                </entry>
+                <entry>
+                  <para>Display help information.</para>
+                </entry>
+              </row>
+              <row>
+                <entry>
+                  <para>
+                    <literal>-t | --type</literal>
+                  </para>
+                </entry>
+                <entry>
+                  <para>The LFSCK type(s) that should be queried,
+                  including: layout, namespace.</para>
+                </entry>
+              </row>
+              <row>
+                <entry>
+                  <para>
+                    <literal>-w | --wait</literal>
+                  </para>
+                </entry>
+                <entry>
+                  <para>Wait if the LFSCK is currently scanning.</para>
+                </entry>
+              </row>
+            </tbody>
+          </tgroup>
+        </informaltable>
+      </section>
+    </section>
     <section>
       <title>LFSCK status interface</title>
       <section>
@@ -722,14 +815,14 @@ root# e2fsck -fp /dev/sda   # fix errors with prudent answers (usually <literal>
           </informaltable>
         </section>
       </section>
-      <section condition='l24'>
+      <section>
         <title>LFSCK status of namespace via
         <literal>procfs</literal></title>
         <section>
           <title>Description</title>
           <para>The
           <literal>namespace</literal> component is responsible for checks
-          described in <xref linkend="dbdoclet.lfsckadmin" />. The
+          described in <xref linkend="lfsckadmin" />. The
           <literal>procfs</literal> interface for this component is in the
           MDD layer, named
           <literal>lfsck_namespace</literal>. To show the status of this
@@ -806,7 +899,7 @@ root# e2fsck -fp /dev/sda   # fix errors with prudent answers (usually <literal>
                         <literal>scanned-once</literal>(the first cycle
                         scanning has been completed),
                         <literal>inconsistent</literal>(one or more
-                        inconsistent FID-in-Dirent or LinkEA entries that have
+                        inconsistent FID-in-dirent or LinkEA entries that have
                         been discovered),
                         <literal>upgrade</literal>(from Lustre software release
                         1.8 IGIF format.)</para>
@@ -1241,7 +1334,7 @@ lctl get_param -n obdfilter.
                       <listitem>
                         <para>
                         <literal>Repaired Unmatched Pairs</literal> total number
-                        of unmatched MDT and OST-object paris have been
+                        of unmatched MDT and OST-object pairs that have been
                         repaired in the scanning-phase1</para>
                       </listitem>
                       <listitem>
@@ -1389,7 +1482,7 @@ lctl set_param obdfilter.${FSNAME}-${OST_target}.lfsck_speed_limit=
           </informaltable>
         </section>
       </section>
-      <section xml:id="dbdoclet.lfsck_auto_scrub">
+      <section xml:id="lfsck_auto_scrub">
         <title>Auto scrub</title>
         <section>
           <title>Description</title>
@@ -1400,9 +1493,9 @@ lctl set_param obdfilter.${FSNAME}-${OST_target}.lfsck_speed_limit=
           below.</para>
           <para>There is also a
           <literal>noscrub</literal> mount option (see
-          <xref linkend="dbdoclet.50438219_12635" />) which can be used to
-          disable automatic OI scrub upon detection of a file-level backup at
-          mount time. If the
+          <xref linkend="mount.lustre" />) which
+          can be used to disable automatic OI scrub upon detection of a
+          file-level backup at mount time. If the
           <literal>noscrub</literal> mount option is specified,
           <literal>auto_scrub</literal> will also be disabled, so OI scrub will
           not be triggered when an OI inconsistency is detected. Auto scrub can
@@ -1451,3 +1544,6 @@ lctl set_param obdfilter.${FSNAME}-${OST_target}.lfsck_speed_limit=
     </section>
   </section>
 </chapter>
+<!--
+  vim:expandtab:shiftwidth=2:tabstop=8:
+  -->