From 316d692b7a2913757ee1aafafcf2c751e34adc60 Mon Sep 17 00:00:00 2001 From: Fan Yong Date: Thu, 4 Sep 2014 08:10:10 +0800 Subject: [PATCH] LUDOC-254 lfsck: update Lustre manual for LFSCK 3 Include LFSCK command lines and proc interfaces changes. Signed-off-by: Fan Yong Change-Id: Id568699b7eb5694a21a1c7db4f05e59b71605386 Reviewed-on: http://review.whamcloud.com/12277 Tested-by: Jenkins Reviewed-by: James Nunez Reviewed-by: Ryan Haasken Reviewed-by: Richard Henwood --- TroubleShootingRecovery.xml | 100 +++++++++++++++++++++++++++++++------------- 1 file changed, 70 insertions(+), 30 deletions(-) diff --git a/TroubleShootingRecovery.xml b/TroubleShootingRecovery.xml index b48ffd5..f7bebcb 100644 --- a/TroubleShootingRecovery.xml +++ b/TroubleShootingRecovery.xml @@ -126,15 +126,17 @@ root# e2fsck -fp /dev/sda # fix errors with prudent answers (usually
Synopsis lctl lfsck_start -M | --device [MDT,OST]_device \ - [-e | --error error_handle] \ + [-A | --all] \ + [-c | --create_ostobj [on | off]] \ + [-C | --create_mdtobj [on | off]] \ + [-e | --error {continue | abort}] \ [-h | --help] \ - [-n | --dryrun switch] \ + [-n | --dryrun [on | off]] \ + [-o | --orphan] \ [-r | --reset] \ - [-s | --speed speed_limit] \ - [-A | --all] \ + [-s | --speed ops_per_sec_limit] \ [-t | --type lfsck_type[,lfsck_type...]] \ - [-w | --windows win_size] \ - [-o | --orphan] + [-w | --window_size size]
@@ -169,6 +171,30 @@ root# e2fsck -fp /dev/sda # fix errors with prudent answers (usually + -A | --all + + + Start LFSCK on all devices via a single lctl command. This applies to both layout and namespace consistency checking and repair. + + + + + -c | --create_ostobj + + + Create the lost OST-object for dangling LOV EA, off (default) or on. If not specified, then the default behaviour is to keep the dangling LOV EA there without creating the lost OST-object. + + + + + -C | --create_mdtobj + + + Create the lost MDT-object for dangling name entry, off (default) or on. If not specified, then the default behaviour is to keep the dangling name entry there without creating the lost MDT-object. + + + + -e | --error @@ -193,26 +219,26 @@ root# e2fsck -fp /dev/sda # fix errors with prudent answers (usually - -r | --reset + -o | --orphan - Reset the start position for the object iteration to the beginning for the specified MDT. By default the iterator will resume scanning from the last checkpoint (saved periodically by LFSCK) provided it is available. + Repair orphan OST-objects for layout LFSCK. - -s | --speed + -r | --reset - Set the upper speed limit of LFSCK processing in objects per second. If it is not specified, the saved value (when resuming from checkpoint) or default value of 0 (0 = run as fast as possible) is used. Speed can be adjusted while LFSCK is running with the adjustment interface. + Reset the start position for the object iteration to the beginning for the specified MDT. By default the iterator will resume scanning from the last checkpoint (saved periodically by LFSCK) provided it is available. - -A | --all + -s | --speed - Start LFSCK on all devices via a single lctl command. It is not only used for layout consistency check/repair, but also for other LFSCK components, such as LFSCK for namespace consistency (LFSCK 1.5) and for DNE consistency check/repair in the future. + Set the upper speed limit of LFSCK processing in objects per second. 
If it is not specified, the saved value (when resuming from checkpoint) or default value of 0 (0 = run as fast as possible) is used. Speed can be adjusted while LFSCK is running with the adjustment interface. @@ -222,24 +248,16 @@ root# e2fsck -fp /dev/sda # fix errors with prudent answers (usually The type of checking/repairing that should be performed. The new LFSCK framework provides a single interface for a variety of system consistency checking/repairing operations including: Without a specified option, the LFSCK component(s) which ran last time and did not finish or the component(s) corresponding to some known system inconsistency, will be started. Anytime the LFSCK is triggered, the OI scrub will run automatically, so there is no need to specify OI_scrub. -namespace: check and repair FID-in-Dirent and LinkEA consistency. +namespace: check and repair FID-in-Dirent and LinkEA consistency. Lustre-2.7 enhances namespace consistency verification under DNE mode. layout: check and repair MDT-OST inconsistency. - -w | --windows + -w | --window_size - The windows size for async requests pipeline. - - - - - -o | --orphan - - - Handle orphan objects, such as orphan OST-objects for layout LFSCK. + The window size for the async request pipeline. The LFSCK async request pipeline's input/output may have quite different processing speeds, and there may be so many requests in the pipeline that they cause abnormal memory/network pressure. If not specified, then the default window size for the async request pipeline is 1024. @@ -429,14 +447,14 @@ root# e2fsck -fp /dev/sda # fix errors with prudent answers (usually Name: lfsck_namespace LFSCK namespace magic. LFSCK namespace version.. - Status: one of the status - init, scanning-phase1, scanning-phase2, completed, failed, stopped, paused, or crashed. + Status: one of the status - init, scanning-phase1, scanning-phase2, completed, failed, stopped, paused, partial, co-failed, co-stopped or co-paused. 
Flags: including - scanned-once (the first cycle scanning has been completed), inconsistent (one or more inconsistent FID-in-Dirent or LinkEA - entries have been discovered), + entries that have been discovered), upgrade (from Lustre software release 1.8 IGIF format.) - Parameters: including dryrun, all_targets and failout. + Parameters: including dryrun, all_targets, failout, broadcast, orphan, create_ostobj and create_mdtobj. Time Since Last Completed. Time Since Latest Start. Time Since Last Checkpoint. @@ -459,10 +477,32 @@ root# e2fsck -fp /dev/sda # fix errors with prudent answers (usually Updated Phase2 total number of objects repaired during scanning-phase2. Failed Phase1 total number of objets that failed to be repaired during scanning-phase1. Failed Phase2 total number of objets that failed to be repaired during scanning-phase2. - Dirs total number of directories scanned. - M-linked total number of multiple-linked objects that have been scanned. - Nlinks Repaired total number of objects with nlink attributes that have been repaired. - Lost_found total number of objects that have had a name entry added back to the namespace. + directories total number of directories scanned. + multiple_linked_checked total number of multiple-linked objects that have been scanned. + dirent_repaired total number of FID-in-dirent entries that have been repaired. + linkea_repaired total number of linkEA entries that have been repaired. + unknown_inconsistency total number of undefined inconsistencies found in scanning-phase2. + unmatched_pairs_repaired total number of unmatched pairs that have been repaired. + dangling_repaired total number of dangling name entries that have been found/repaired. + multi_referenced_repaired total number of multiple referenced name entries that have been found/repaired. + bad_file_type_repaired total number of name entries with bad file type that have been repaired. 
+ lost_dirent_repaired total number of lost name entries that have been re-inserted. + striped_dirs_scanned total number of striped directories (master) that have been scanned. + striped_dirs_repaired total number of striped directories (master) that have been repaired. + striped_dirs_failed total number of striped directories (master) that have failed to be verified. + striped_dirs_disabled total number of striped directories (master) that have been disabled. + striped_dirs_skipped total number of striped directories (master) that have been skipped (for shards verification) because of lost master LMV EA. + striped_shards_scanned total number of striped directory shards (slave) that have been scanned. + striped_shards_repaired total number of striped directory shards (slave) that have been repaired. + striped_shards_failed total number of striped directory shards (slave) that have failed to be verified. + striped_shards_skipped total number of striped directory shards (slave) that have been skipped (for name hash verification) because LFSCK does not know whether the slave LMV EA is valid or not. + name_hash_repaired total number of name entries under striped directory with bad name hash that have been repaired. + nlinks_repaired total number of objects with nlink fixed. + mul_linked_repaired total number of multiple-linked objects that have been repaired. + local_lost_found_scanned total number of objects under /lost+found that have been scanned. + local_lost_found_moved total number of objects under /lost+found that have been moved to namespace visible directory. + local_lost_found_skipped total number of objects under /lost+found that have been skipped. + local_lost_found_failed total number of objects under /lost+found that have failed to be processed. Success Count the total number of completed LFSCK runs on the device. Run Time Phase1 the duration of the LFSCK run during scanning-phase1. Excluding the time spent paused between checkpoints. 
Run Time Phase2 the duration of the LFSCK run during scanning-phase2. Excluding the time spent paused between checkpoints. -- 1.8.3.1