LUDOC-370 snapshots: add snapshot feature documentation
[doc/manual.git] / LustreTuning.xml
index 6e6ebb0..d7f2e04 100644
@@ -626,6 +626,429 @@ cpu_partition_table=
       default values are automatically set and are chosen to work well across a
       number of typical scenarios.</para>
     </note>
+    <section>
+       <title>ko2iblnd Tuning</title>
+       <para>The following table outlines the ko2iblnd module parameters to be used
+    for tuning:</para>
+       <informaltable frame="all">
+         <tgroup cols="3">
+           <colspec colname="c1" colwidth="50*" />
+           <colspec colname="c2" colwidth="50*" />
+           <colspec colname="c3" colwidth="50*" />
+           <thead>
+             <row>
+               <entry>
+                 <para>
+                   <emphasis role="bold">Module Parameter</emphasis>
+                 </para>
+               </entry>
+               <entry>
+                 <para>
+                   <emphasis role="bold">Default Value</emphasis>
+                 </para>
+               </entry>
+               <entry>
+                 <para>
+                   <emphasis role="bold">Description</emphasis>
+                 </para>
+               </entry>
+             </row>
+           </thead>
+           <tbody>
+             <row>
+               <entry>
+                 <para>
+                   <literal>service</literal>
+                 </para>
+               </entry>
+               <entry>
+                 <para>
+                   <literal>987</literal>
+                 </para>
+               </entry>
+               <entry>
+                 <para>Service number (within RDMA_PS_TCP).</para>
+               </entry>
+             </row>
+             <row>
+               <entry>
+                 <para>
+                   <literal>cksum</literal>
+                 </para>
+               </entry>
+               <entry>
+                 <para>
+                   <literal>0</literal>
+                 </para>
+               </entry>
+               <entry>
+                 <para>Set non-zero to enable message (not RDMA) checksums.</para>
+               </entry>
+             </row>
+             <row>
+               <entry>
+                 <para>
+                   <literal>timeout</literal>
+                 </para>
+               </entry>
+               <entry>
+               <para>
+                 <literal>50</literal>
+               </para>
+             </entry>
+               <entry>
+                 <para>Timeout in seconds.</para>
+               </entry>
+             </row>
+             <row>
+               <entry>
+                 <para>
+                   <literal>nscheds</literal>
+                 </para>
+               </entry>
+               <entry>
+                 <para>
+                   <literal>0</literal>
+                 </para>
+               </entry>
+               <entry>
+                 <para>Number of threads in each scheduler pool (per CPT).  A value
+          of zero means the number is derived from the number of cores.</para>
+               </entry>
+             </row>
+             <row>
+               <entry>
+                 <para>
+                   <literal>conns_per_peer</literal>
+                 </para>
+               </entry>
+               <entry>
+                 <para>
+                   <literal>4 (OmniPath), 1 (Everything else)</literal>
+                 </para>
+               </entry>
+               <entry>
+                 <para>Introduced in 2.10. Number of connections to each peer. Messages
+          are sent round-robin over the connection pool.  Provides a significant
+          performance improvement with OmniPath.</para>
+               </entry>
+             </row>
+             <row>
+               <entry>
+                 <para>
+                   <literal>ntx</literal>
+                 </para>
+               </entry>
+               <entry>
+                 <para>
+                   <literal>512</literal>
+                 </para>
+               </entry>
+               <entry>
+                 <para>Number of message descriptors allocated for each pool at
+          startup. Grows at runtime. Shared by all CPTs.</para>
+               </entry>
+             </row>
+             <row>
+               <entry>
+                 <para>
+                   <literal>credits</literal>
+                 </para>
+               </entry>
+               <entry>
+                 <para>
+                   <literal>256</literal>
+                 </para>
+               </entry>
+               <entry>
+                 <para>Number of concurrent sends on network.</para>
+               </entry>
+             </row>
+             <row>
+               <entry>
+                 <para>
+                   <literal>peer_credits</literal>
+                 </para>
+               </entry>
+               <entry>
+                 <para>
+                   <literal>8</literal>
+                 </para>
+               </entry>
+               <entry>
+                 <para>Number of concurrent sends to a single peer. Related to and
+          limited by the IB queue size.</para>
+               </entry>
+             </row>
+             <row>
+               <entry>
+                 <para>
+                   <literal>peer_credits_hiw</literal>
+                 </para>
+               </entry>
+               <entry>
+                 <para>
+                   <literal>0</literal>
+                 </para>
+               </entry>
+               <entry>
+                 <para>Threshold (high water mark) at which to eagerly return credits.</para>
+               </entry>
+             </row>
+             <row>
+               <entry>
+                 <para>
+                   <literal>peer_buffer_credits</literal>
+                 </para>
+               </entry>
+               <entry>
+                 <para>
+                   <literal>0</literal>
+                 </para>
+               </entry>
+               <entry>
+                 <para>Number of per-peer router buffer credits.</para>
+               </entry>
+             </row>
+             <row>
+               <entry>
+                 <para>
+                   <literal>peer_timeout</literal>
+                 </para>
+               </entry>
+               <entry>
+                 <para>
+                   <literal>180</literal>
+                 </para>
+               </entry>
+               <entry>
+                 <para>Seconds without aliveness news before declaring a peer dead
+          (a value less than or equal to 0 disables this).</para>
+               </entry>
+             </row>
+             <row>
+               <entry>
+                 <para>
+                   <literal>ipif_name</literal>
+                 </para>
+               </entry>
+               <entry>
+                 <para>
+                   <literal>ib0</literal>
+                 </para>
+               </entry>
+               <entry>
+                 <para>IPoIB interface name.</para>
+               </entry>
+             </row>
+             <row>
+               <entry>
+                 <para>
+                   <literal>retry_count</literal>
+                 </para>
+               </entry>
+               <entry>
+                 <para>
+                   <literal>5</literal>
+                 </para>
+               </entry>
+               <entry>
+                 <para>Number of retransmissions when no ACK is received.</para>
+               </entry>
+             </row>
+             <row>
+               <entry>
+                 <para>
+                   <literal>rnr_retry_count</literal>
+                 </para>
+               </entry>
+               <entry>
+                 <para>
+                   <literal>6</literal>
+                 </para>
+               </entry>
+               <entry>
+                 <para>Number of RNR (receiver not ready) retransmissions.</para>
+               </entry>
+             </row>
+             <row>
+               <entry>
+                 <para>
+                   <literal>keepalive</literal>
+                 </para>
+               </entry>
+               <entry>
+                 <para>
+                   <literal>100</literal>
+                 </para>
+               </entry>
+               <entry>
+                 <para>Idle time in seconds before sending a keepalive.</para>
+               </entry>
+             </row>
+             <row>
+               <entry>
+                 <para>
+                   <literal>ib_mtu</literal>
+                 </para>
+               </entry>
+               <entry>
+                 <para>
+                   <literal>0</literal>
+                 </para>
+               </entry>
+               <entry>
+                 <para>IB MTU 256/512/1024/2048/4096.</para>
+               </entry>
+             </row>
+             <row>
+               <entry>
+                 <para>
+                   <literal>concurrent_sends</literal>
+                 </para>
+               </entry>
+               <entry>
+                 <para>
+                   <literal>0</literal>
+                 </para>
+               </entry>
+               <entry>
+                 <para>Send work-queue sizing. If zero, derived from
+          <literal>map_on_demand</literal> and <literal>peer_credits</literal>.
+          </para>
+               </entry>
+             </row>
+             <row>
+               <entry>
+                 <para>
+                   <literal>map_on_demand</literal>
+                 </para>
+               </entry>
+               <entry>
+                 <para>
+            <literal>0 (pre-4.8 Linux), 1 (4.8 Linux onward), 32 (OmniPath)</literal>
+                 </para>
+               </entry>
+               <entry>
+                 <para>Number of fragments reserved for a connection.  If zero, a
+          global memory region is used (found to be a security issue).  If
+          non-zero, FMR or FastReg is used for memory registration.  The value
+          must agree between both peers of a connection.</para>
+               </entry>
+             </row>
+             <row>
+               <entry>
+                 <para>
+                   <literal>fmr_pool_size</literal>
+                 </para>
+               </entry>
+               <entry>
+                 <para>
+                   <literal>512</literal>
+                 </para>
+               </entry>
+               <entry>
+                 <para>Size of fmr pool on each CPT (>= ntx / 4).  Grows at runtime.
+          </para>
+               </entry>
+             </row>
+             <row>
+               <entry>
+                 <para>
+                   <literal>fmr_flush_trigger</literal>
+                 </para>
+               </entry>
+               <entry>
+                 <para>
+                   <literal>384</literal>
+                 </para>
+               </entry>
+               <entry>
+                 <para>Number of dirty FMRs that triggers a pool flush.</para>
+               </entry>
+             </row>
+             <row>
+               <entry>
+                 <para>
+                   <literal>fmr_cache</literal>
+                 </para>
+               </entry>
+               <entry>
+                 <para>
+                   <literal>1</literal>
+                 </para>
+               </entry>
+               <entry>
+                 <para>Non-zero to enable FMR caching.</para>
+               </entry>
+             </row>
+             <row>
+               <entry>
+                 <para>
+                   <literal>dev_failover</literal>
+                 </para>
+               </entry>
+               <entry>
+                 <para>
+                   <literal>0</literal>
+                 </para>
+               </entry>
+               <entry>
+                 <para>HCA failover for bonding (0 OFF, 1 ON, other values reserved).
+          </para>
+               </entry>
+             </row>
+             <row>
+               <entry>
+                 <para>
+                   <literal>require_privileged_port</literal>
+                 </para>
+               </entry>
+               <entry>
+                 <para>
+                   <literal>0</literal>
+                 </para>
+               </entry>
+               <entry>
+                 <para>Require a privileged port when accepting a connection.</para>
+               </entry>
+             </row>
+             <row>
+               <entry>
+                 <para>
+                   <literal>use_privileged_port</literal>
+                 </para>
+               </entry>
+               <entry>
+                 <para>
+                   <literal>1</literal>
+                 </para>
+               </entry>
+               <entry>
+                 <para>Use a privileged port when initiating a connection.</para>
+               </entry>
+             </row>
+             <row>
+               <entry>
+                 <para>
+                   <literal>wrq_sge</literal>
+                 </para>
+               </entry>
+               <entry>
+                 <para>
+                   <literal>2</literal>
+                 </para>
+               </entry>
+               <entry>
+                 <para>Introduced in 2.10. Number of scatter/gather element groups
+          per work request.  Used to deal with fragmentation, which can consume
+          double the number of work requests.</para>
+               </entry>
+             </row>
+           </tbody>
+         </tgroup>
+       </informaltable>
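+       <para>These parameters are set as options on the
+       <literal>ko2iblnd</literal> kernel module, typically in a file under
+       <literal>/etc/modprobe.d/</literal>, and take effect the next time the
+       module is loaded.  The following is an illustrative sketch only; the
+       file name is arbitrary and the values shown are examples rather than
+       recommendations:</para>
+       <screen>
+server# cat /etc/modprobe.d/ko2iblnd.conf
+options ko2iblnd peer_credits=128 peer_credits_hiw=64 conns_per_peer=4
+server# cat /sys/module/ko2iblnd/parameters/peer_credits
+128
+</screen>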
+    </section>
   </section>
   <section xml:id="dbdoclet.nrstuning" condition='l24'>
     <title>
@@ -676,6 +1099,18 @@ regular_requests:
     queued: 2420
     active: 268
 
+  - name: tbf
+    state: stopped
+    fallback: no
+    queued: 0
+    active: 0
+
+  - name: delay
+    state: stopped
+    fallback: no
+    queued: 0
+    active: 0
+
 high_priority_requests:
   - name: fifo
     state: started
@@ -700,7 +1135,19 @@ high_priority_requests:
     fallback: no
     queued: 0
     active: 0
-      
+
+  - name: tbf
+    state: stopped
+    fallback: no
+    queued: 0
+    active: 0
+
+  - name: delay
+    state: stopped
+    fallback: no
+    queued: 0
+    active: 0
+
 </screen>
     <para>NRS policy state is shown in either one or two sections, depending on
     the PTLRPC service being queried. The first section is named 
@@ -1138,7 +1585,7 @@ ost.OSS.ost_io.nrs_orr_supported=reg_supported:reads_and_writes
         </listitem>
       </itemizedlist>
     </section>
-    <section condition='l26'>
+    <section xml:id="dbdoclet.tbftuning" condition='l26'>
       <title>
       <indexterm>
         <primary>tuning</primary>
@@ -1190,9 +1637,7 @@ ost.OSS.ost_io.nrs_orr_supported=reg_supported:reads_and_writes
           follows:</para>
           <screen>
 $ lctl set_param x.x.x.nrs_tbf_rule=
-                  "[reg|hp] start 
-<replaceable>rule_name</replaceable> 
-<replaceable>arguments</replaceable>..."
+          "[reg|hp] start <replaceable>rule_name</replaceable> <replaceable>arguments</replaceable>..."
 </screen>
           <para>The '
           <replaceable>rule_name</replaceable>' argument is a string which
@@ -1202,10 +1647,7 @@ $ lctl set_param x.x.x.nrs_tbf_rule=
           as follows:</para>
           <screen>
 $ lctl set_param x.x.x.nrs_tbf_rule=
-                  "[reg|hp] start 
-<replaceable>rule_name</replaceable> {
-<replaceable>nidlist</replaceable>} 
-<replaceable>rate</replaceable>"
+          "[reg|hp] start <replaceable>rule_name</replaceable> {<replaceable>nidlist</replaceable>} <replaceable>rate</replaceable>"
 </screen>
           <para>The format of '
           <replaceable>nidlist</replaceable>' argument is the same as the
@@ -1217,64 +1659,59 @@ $ lctl set_param x.x.x.nrs_tbf_rule=
           critical too.</para>
           <screen>
 $ lctl set_param ost.OSS.ost_io.nrs_tbf_rule=
-                  "start other_clients {192.168.*.*@tcp} 50"
+          "start other_clients {192.168.*.*@tcp} 50"
 </screen>
           <screen>
 $ lctl set_param ost.OSS.ost_io.nrs_tbf_rule=
-                  "start loginnode {192.168.1.1@tcp} 100"
+          "start loginnode {192.168.1.1@tcp} 100"
 </screen>
           <para>General rule can be replaced by two rules (reg and hp) as
           follows:</para>
           <screen>
 $ lctl set_param ost.OSS.ost_io.nrs_tbf_rule=
-                  "reg start loginnode {192.168.1.1@tcp} 100"
+          "reg start loginnode {192.168.1.1@tcp} 100"
 </screen>
           <screen>
 $ lctl set_param ost.OSS.ost_io.nrs_tbf_rule=
-                  "hp start loginnode {192.168.1.1@tcp} 100"
+          "hp start loginnode {192.168.1.1@tcp} 100"
 </screen>
           <screen>
 $ lctl set_param ost.OSS.ost_io.nrs_tbf_rule=
-                  "start computes {192.168.1.[2-128]@tcp} 500"
+          "start computes {192.168.1.[2-128]@tcp} 500"
 </screen>
           <para>The above rules limit the servers to processing at most 5x as
           many RPCs from compute nodes as from login nodes.</para>
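+          <para>The TBF rules currently configured on a service can be inspected
+          by reading the same parameter, assuming the installed Lustre version
+          supports reading <literal>nrs_tbf_rule</literal>:</para>
+          <screen>
+$ lctl get_param ost.OSS.ost_io.nrs_tbf_rule
+</screen>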
-          <para>For the JobID (please see 
+          <para>For the JobID (please see
           <xref xmlns:xlink="http://www.w3.org/1999/xlink"
-          linkend="dbdoclet.jobstats" />for more details) based TBF policy, its
-          format is as follows:</para>
+                linkend="dbdoclet.jobstats" /> for more details) based TBF
+          policy, its format is as follows:</para>
           <screen>
 $ lctl set_param x.x.x.nrs_tbf_rule=
-                  "[reg|hp] start 
-<replaceable>name</replaceable> {
-<replaceable>jobid_list</replaceable>} 
-<replaceable>rate</replaceable>"
+          "[reg|hp] start <replaceable>name</replaceable> {<replaceable>jobid_list</replaceable>} <replaceable>rate</replaceable>"
 </screen>
           <para>The following commands are valid:</para>
           <screen>
 $ lctl set_param ost.OSS.ost_io.nrs_tbf_rule=
-                  "start user1 {iozone.500 dd.500} 100"
+          "start user1 {iozone.500 dd.500} 100"
 </screen>
           <screen>
 $ lctl set_param ost.OSS.ost_io.nrs_tbf_rule=
-                  "start iozone_user1 {iozone.500} 100"
+          "start iozone_user1 {iozone.500} 100"
 </screen>
           <para>As with NID-based rules, reg and hp rules can be used separately:</para>
           <screen>
 $ lctl set_param ost.OSS.ost_io.nrs_tbf_rule=
-                  "hp start iozone_user1 {iozone.500} 100"
+          "hp start iozone_user1 {iozone.500} 100"
 </screen>
           <screen>
 $ lctl set_param ost.OSS.ost_io.nrs_tbf_rule=
-                  "reg start iozone_user1 {iozone.500} 100"
+          "reg start iozone_user1 {iozone.500} 100"
 </screen>
           <para>The format of the rule change command of TBF policy is as
           follows:</para>
           <screen>
 $ lctl set_param x.x.x.nrs_tbf_rule=
-                  "[reg|hp] change 
-<replaceable>rule_name</replaceable> 
-<replaceable>rate</replaceable>"
+          "[reg|hp] change <replaceable>rule_name</replaceable> <replaceable>rate</replaceable>"
 </screen>
           <para>The following commands are valid:</para>
           <screen>
@@ -1305,6 +1742,151 @@ $ lctl set_param ost.OSS.ost_io.nrs_tbf_rule="hp stop loginnode"
         </listitem>
       </itemizedlist>
     </section>
+    <section xml:id="dbdoclet.delaytuning" condition='l2A'>
+      <title>
+      <indexterm>
+        <primary>tuning</primary>
+        <secondary>Network Request Scheduler (NRS) Tuning</secondary>
+        <tertiary>Delay policy</tertiary>
+      </indexterm>Delay policy</title>
+      <para>The NRS Delay policy seeks to perturb the timing of request
+      processing at the PtlRPC layer, with the goal of simulating high server
+      load, and finding and exposing timing-related problems. When this policy
+      is active, upon arrival of a request the policy will calculate an offset,
+      within a defined, user-configurable range, from the request arrival
+      time, to determine a time after which the request should be handled.
+      The request is then stored using the cfs_binheap implementation,
+      which sorts the request according to the assigned start time.
+      Requests are removed from the binheap for handling once their start
+      time has been passed.</para>
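+      <para>Like the other NRS policies described in this section, the Delay
+      policy is activated for a service via the
+      <literal>nrs_policies</literal> parameter.  The following is a sketch
+      only (the echoed output may differ between Lustre versions); for
+      example, to enable the Delay policy on the ost_io service, run:</para>
+      <screen>
+$ lctl set_param ost.OSS.ost_io.nrs_policies="delay"
+ost.OSS.ost_io.nrs_policies=delay
+</screen>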
+      <para>The Delay policy can be enabled on all types of PtlRPC services,
+      and has the following tunables that can be used to adjust its behavior:
+      </para>
+      <itemizedlist>
+        <listitem>
+          <para>
+            <literal>{service}.nrs_delay_min</literal>
+          </para>
+          <para>The
+          <literal>{service}.nrs_delay_min</literal> tunable controls the
+          minimum amount of time, in seconds, that a request will be delayed by
+          this policy.  The default is 5 seconds. To read this value run:</para>
+          <screen>
+lctl get_param {service}.nrs_delay_min</screen>
+          <para>For example, to read the minimum delay set on the ost_io
+          service, run:</para>
+          <screen>
+$ lctl get_param ost.OSS.ost_io.nrs_delay_min
+ost.OSS.ost_io.nrs_delay_min=reg_delay_min:5
+hp_delay_min:5</screen>
+        <para>To set the minimum delay in RPC processing, run:</para>
+        <screen>
+lctl set_param {service}.nrs_delay_min=<replaceable>0-65535</replaceable></screen>
+        <para>This will set the minimum delay time on a given service, for both
+        regular and high-priority RPCs (if the PtlRPC service supports
+        high-priority RPCs), to the indicated value.</para>
+        <para>For example, to set the minimum delay time on the ost_io service
+        to 10, run:</para>
+        <screen>
+$ lctl set_param ost.OSS.ost_io.nrs_delay_min=10
+ost.OSS.ost_io.nrs_delay_min=10</screen>
+        <para>For PtlRPC services that support high-priority RPCs, to set a
+        different minimum delay time for regular and high-priority RPCs, run:
+        </para>
+        <screen>
+lctl set_param {service}.nrs_delay_min=<replaceable>reg_delay_min|hp_delay_min</replaceable>:<replaceable>0-65535</replaceable>
+        </screen>
+        <para>For example, to set the minimum delay time on the ost_io service
+        for high-priority RPCs to 3, run:</para>
+        <screen>
+$ lctl set_param ost.OSS.ost_io.nrs_delay_min=hp_delay_min:3
+ost.OSS.ost_io.nrs_delay_min=hp_delay_min:3</screen>
+        <para>Note, in all cases the minimum delay time cannot exceed the
+        maximum delay time.</para>
+        </listitem>
+        <listitem>
+          <para>
+            <literal>{service}.nrs_delay_max</literal>
+          </para>
+          <para>The
+          <literal>{service}.nrs_delay_max</literal> tunable controls the
+          maximum amount of time, in seconds, that a request will be delayed by
+          this policy.  The default is 300 seconds. To read this value run:
+          </para>
+          <screen>lctl get_param {service}.nrs_delay_max</screen>
+          <para>For example, to read the maximum delay set on the ost_io
+          service, run:</para>
+          <screen>
+$ lctl get_param ost.OSS.ost_io.nrs_delay_max
+ost.OSS.ost_io.nrs_delay_max=reg_delay_max:300
+hp_delay_max:300</screen>
+        <para>To set the maximum delay in RPC processing, run:</para>
+        <screen>lctl set_param {service}.nrs_delay_max=<replaceable>0-65535</replaceable>
+</screen>
+        <para>This will set the maximum delay time on a given service, for both
+        regular and high-priority RPCs (if the PtlRPC service supports
+        high-priority RPCs), to the indicated value.</para>
+        <para>For example, to set the maximum delay time on the ost_io service
+        to 60, run:</para>
+        <screen>
+$ lctl set_param ost.OSS.ost_io.nrs_delay_max=60
+ost.OSS.ost_io.nrs_delay_max=60</screen>
+        <para>For PtlRPC services that support high-priority RPCs, to set a
+        different maximum delay time for regular and high-priority RPCs, run:
+        </para>
+        <screen>lctl set_param {service}.nrs_delay_max=<replaceable>reg_delay_max|hp_delay_max</replaceable>:<replaceable>0-65535</replaceable></screen>
+        <para>For example, to set the maximum delay time on the ost_io service
+        for high-priority RPCs to 30, run:</para>
+        <screen>
+$ lctl set_param ost.OSS.ost_io.nrs_delay_max=hp_delay_max:30
+ost.OSS.ost_io.nrs_delay_max=hp_delay_max:30</screen>
+        <para>Note, in all cases the maximum delay time cannot be less than the
+        minimum delay time.</para>
+        </listitem>
+        <listitem>
+          <para>
+            <literal>{service}.nrs_delay_pct</literal>
+          </para>
+          <para>The
+          <literal>{service}.nrs_delay_pct</literal> tunable controls the
+          percentage of requests that will be delayed by this policy. The
+          default is 100. Note, when a request is not selected for handling by
+          the delay policy due to this variable then the request will be handled
+          by whatever fallback policy is defined for that service. If no other
+          fallback policy is defined then the request will be handled by the
+          FIFO policy.  To read this value run:</para>
+          <screen>lctl get_param {service}.nrs_delay_pct</screen>
+          <para>For example, to read the percentage of requests being delayed on
+          the ost_io service, run:</para>
+          <screen>
+$ lctl get_param ost.OSS.ost_io.nrs_delay_pct
+ost.OSS.ost_io.nrs_delay_pct=reg_delay_pct:100
+hp_delay_pct:100</screen>
+        <para>To set the percentage of delayed requests, run:</para>
+        <screen>
+lctl set_param {service}.nrs_delay_pct=<replaceable>0-100</replaceable></screen>
+        <para>This will set the percentage of requests delayed on a given
+        service, for both regular and high-priority RPCs (if the PtlRPC service
+        supports high-priority RPCs), to the indicated value.</para>
+        <para>For example, to set the percentage of delayed requests on the
+        ost_io service to 50, run:</para>
+        <screen>
+$ lctl set_param ost.OSS.ost_io.nrs_delay_pct=50
+ost.OSS.ost_io.nrs_delay_pct=50
+</screen>
+        <para>For PtlRPC services that support high-priority RPCs, to set a
+        different delay percentage for regular and high-priority RPCs, run:
+        </para>
+        <screen>lctl set_param {service}.nrs_delay_pct=<replaceable>reg_delay_pct|hp_delay_pct</replaceable>:<replaceable>0-100</replaceable>
+</screen>
+        <para>For example, to set the percentage of delayed requests on the
+        ost_io service for high-priority RPCs to 5, run:</para>
+        <screen>$ lctl set_param ost.OSS.ost_io.nrs_delay_pct=hp_delay_pct:5
+ost.OSS.ost_io.nrs_delay_pct=hp_delay_pct:5
+</screen>
+        </listitem>
+      </itemizedlist>
+    </section>
   </section>
   <section xml:id="dbdoclet.50438272_25884">
     <title>
@@ -1313,8 +1895,8 @@ $ lctl set_param ost.OSS.ost_io.nrs_tbf_rule="hp stop loginnode"
       <secondary>lockless I/O</secondary>
     </indexterm>Lockless I/O Tunables</title>
     <para>The lockless I/O tunable feature allows servers to ask clients to do
-    lockless I/O (liblustre-style where the server does the locking) on
-    contended files.</para>
+    lockless I/O (the server does the locking on behalf of clients) for
+    contended files to avoid lock ping-pong.</para>
     <para>The lockless I/O patch introduces these tunables:</para>
     <itemizedlist>
       <listitem>
@@ -1322,7 +1904,7 @@ $ lctl set_param ost.OSS.ost_io.nrs_tbf_rule="hp stop loginnode"
           <emphasis role="bold">OST-side:</emphasis>
         </para>
         <screen>
-/proc/fs/lustre/ldlm/namespaces/filter-lustre-*
+ldlm.namespaces.filter-<replaceable>fsname</replaceable>-*.
 </screen>
         <para>
         <literal>contended_locks</literal>- If the number of lock conflicts in
@@ -1333,9 +1915,9 @@ $ lctl set_param ost.OSS.ost_io.nrs_tbf_rule="hp stop loginnode"
         contended state as set in the parameter.</para>
         <para>
         <literal>max_nolock_bytes</literal>- Server-side locking set only for
-        requests less than the blocks set in the 
-        <literal>max_nolock_bytes</literal> parameter. If this tunable is set to
-        zero (0), it disables server-side locking for read/write
+        requests less than the blocks set in the
+        <literal>max_nolock_bytes</literal> parameter. If this tunable is
+        set to zero (0), it disables server-side locking for read/write
         requests.</para>
       </listitem>
       <listitem>
@@ -1493,28 +2075,91 @@ $ lctl set_param ost.OSS.ost_io.nrs_tbf_rule="hp stop loginnode"
         </screen>
       </section>
   </section>
+  <section condition="l29">
+      <title>
+          <indexterm>
+              <primary>tuning</primary>
+              <secondary>Large Bulk IO</secondary>
+          </indexterm>
+          Large Bulk IO (16MB RPC)
+      </title>
+      <section><title>Overview</title>
+          <para>Beginning with Lustre 2.9, Lustre is extended to support RPCs up
+          to 16MB in size. By enabling a larger RPC size, fewer RPCs will be
+          required to transfer the same amount of data between clients and
+          servers.  With a larger RPC size, the OSS can submit more data to the
+          underlying disks at once, therefore it can produce larger disk I/Os
+          to fully utilize the increasing bandwidth of disks.</para>
+          <para>At connection time, clients negotiate with servers the maximum
+          RPC size that can be used, but the client can always send RPCs
+          smaller than this maximum.</para>
+          <para>The parameter <literal>brw_size</literal> is used on the OST
+          to tell the client the maximum (preferred) IO size.  Clients that
+          communicate with this target should never send an RPC larger than
+          this size.  Clients can individually set a smaller RPC size limit
+          via the <literal>osc.*.max_pages_per_rpc</literal> tunable.
+          </para>
+         <note>
+         <para>The smallest <literal>brw_size</literal> that can be set for
+         ZFS OSTs is the <literal>recordsize</literal> of that dataset.  This
+         ensures that the client can always write a full ZFS file block if it
+         has enough dirty data, and does not otherwise force it to do
+         read-modify-write operations for every RPC.
+          </para>
+         </note>
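+          <para>The <literal>recordsize</literal> of the OST dataset can be
+          checked with the standard ZFS tools; the pool and dataset names in
+          this example are placeholders:</para>
+          <screen>oss# zfs get recordsize <replaceable>ostpool</replaceable>/<replaceable>fsname</replaceable>-ost0</screen>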
+      </section>
+      <section><title>Usage</title>
+          <para>In order to enable a larger RPC size,
+          <literal>brw_size</literal> must be changed to an IO size value up to
+          16MB.  To temporarily change <literal>brw_size</literal>, the
+          following command should be run on the OSS:</para>
+          <screen>oss# lctl set_param obdfilter.<replaceable>fsname</replaceable>-OST*.brw_size=16</screen>
+          <para>To persistently change <literal>brw_size</literal>, one of the following
+          commands should be run on the OSS:</para>
+          <screen>oss# lctl set_param -P obdfilter.<replaceable>fsname</replaceable>-OST*.brw_size=16</screen>
+          <screen>oss# lctl conf_param <replaceable>fsname</replaceable>-OST*.obdfilter.brw_size=16</screen>
+          <para>When a client connects to an OST target, it will fetch
+          <literal>brw_size</literal> from the target and use the smaller of
+          <literal>brw_size</literal> and its local setting for
+          <literal>max_pages_per_rpc</literal> as the actual RPC size.
+          Therefore, the <literal>max_pages_per_rpc</literal> on the client side
+          would have to be set to 16M, or 4096 if the PAGESIZE is 4KB, to enable
+          a 16MB RPC.  To temporarily make the change, the following command
+          should be run on the client to set
+          <literal>max_pages_per_rpc</literal>:</para>
+          <screen>client$ lctl set_param osc.<replaceable>fsname</replaceable>-OST*.max_pages_per_rpc=16M</screen>
+          <para>To persistently make this change, the following command should
+          be run:</para>
+          <screen>client$ lctl conf_param <replaceable>fsname</replaceable>-OST*.osc.max_pages_per_rpc=16M</screen>
+          <caution><para>The <literal>brw_size</literal> of an OST can be
+          changed on the fly.  However, clients have to be remounted to
+          renegotiate the new maximum RPC size.</para></caution>
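+          <para>After the clients have been remounted, the negotiated limit can
+          be verified on a client (a sketch; the value is reported as a number
+          of pages):</para>
+          <screen>client$ lctl get_param osc.<replaceable>fsname</replaceable>-OST*.max_pages_per_rpc</screen>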
+      </section>
+  </section>
   <section xml:id="dbdoclet.50438272_80545">
     <title>
     <indexterm>
       <primary>tuning</primary>
       <secondary>for small files</secondary>
-    </indexterm>Improving Lustre File System Performance When Working with
-    Small Files</title>
+    </indexterm>Improving Lustre I/O Performance for Small Files</title>
     <para>An environment where an application writes small file chunks from
-    many clients to a single file will result in bad I/O performance. To
+    many clients to a single file can result in poor I/O performance. To
     improve the performance of the Lustre file system with small files:</para>
     <itemizedlist>
       <listitem>
         <para>Have the application aggregate writes some amount before
         submitting them to the Lustre file system. By default, the Lustre
         software enforces POSIX coherency semantics, so it results in lock
-        ping-pong between client nodes if they are all writing to the same file
-        at one time.</para>
+        ping-pong between client nodes if they are all writing to the same
+        file at one time.</para>
+        <para>Using the MPI-IO Collective Write functionality in
+        the Lustre ADIO driver is one way to achieve this in a
+        straightforward manner if the application is already using
+        MPI-IO.</para>
       </listitem>
       <listitem>
-        <para>Have the application do 4kB 
-        <literal>O_DIRECT</literal> sized I/O to the file and disable locking on
-        the output file. This avoids partial-page IO submissions and, by
+        <para>Have the application do 4kB
+        <literal>O_DIRECT</literal> sized I/O to the file and disable locking
+        on the output file. This avoids partial-page IO submissions and, by
         disabling locking, you avoid contention between clients.</para>
       </listitem>
       <listitem>