onnxruntime_session_options_config_keys.h

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

/*
 * This file defines SessionOptions Config Keys and the format of the Config Values.
 *
 * The Naming Convention for a SessionOptions Config Key is
 * "[Area][.[SubArea1].[SubArea2]...].[Keyname]"
 * Such as "ep.cuda.use_arena"
 * The Config Key cannot be empty.
 * The maximum length of the Config Key is 128.
 *
 * The string format of a SessionOptions Config Value is defined individually for each Config.
 * The maximum length of the Config Value is 1024.
 */
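
// Example (illustrative sketch, not part of this header): the keys below are plain string key/value pairs set on
// a session options object before the session is created, e.g. via the C++ wrapper
// Ort::SessionOptions::AddConfigEntry (which wraps OrtApi::AddSessionConfigEntry in the C API).
// The model path below is hypothetical.
//
//   #include <onnxruntime_cxx_api.h>
//
//   Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "example");
//   Ort::SessionOptions so;
//   so.AddConfigEntry("session.disable_prepacking", "1");  // any key defined in this file
//   Ort::Session session(env, ORT_TSTR("model.onnx"), so);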

// Key for disabling PrePacking.
// If the config value is set to "1" then prepacking is disabled, otherwise prepacking is enabled (default value).
static const char* const kOrtSessionOptionsConfigDisablePrepacking = "session.disable_prepacking";

// A value of "1" means allocators registered in the env will be used. "0" means the allocators created in the session
// will be used. Use this to override the usage of env allocators on a per session level.
static const char* const kOrtSessionOptionsConfigUseEnvAllocators = "session.use_env_allocators";

// Set to 'ORT' (case sensitive) to load an ORT format model.
// If unset, the model type will default to ONNX unless it is inferred from the filename ('.ort' == ORT format) or
// the bytes are determined to be an ORT format model.
static const char* const kOrtSessionOptionsConfigLoadModelFormat = "session.load_model_format";

// Set to 'ORT' (case sensitive) to save optimized model in ORT format when SessionOptions.optimized_model_path is set.
// If unset, format will default to ONNX unless optimized_model_filepath ends in '.ort'.
static const char* const kOrtSessionOptionsConfigSaveModelFormat = "session.save_model_format";

// If the value is "1", flush-to-zero and denormal-as-zero are applied. The default is "0".
// When multiple sessions are created, the main thread's setting is not overridden by succeeding session options,
// but the threads in session thread pools follow the option changes.
// When ORT runs with OpenMP, the same rule applies, i.e. the first session option to set flush-to-zero and
// denormal-as-zero is applied only to the global OpenMP thread pool, which doesn't support per-session thread pools.
// Note that an alternative to using this option at runtime is to train and export a model without denormals;
// that's recommended because turning this option on may hurt model accuracy.
static const char* const kOrtSessionOptionsConfigSetDenormalAsZero = "session.set_denormal_as_zero";

// Controls whether to run a quantized model in QDQ (QuantizeLinear/DequantizeLinear) format or not.
// "0": enable. ORT does fusion logic for QDQ format.
// "1": disable. ORT doesn't do fusion logic for QDQ format.
// Its default value is "0" unless the DirectML execution provider is registered, in which case it defaults to "1".
static const char* const kOrtSessionOptionsDisableQuantQDQ = "session.disable_quant_qdq";

// Controls whether to enable the Double QDQ remover and Identical Children Consolidation.
// "0": not disabled. ORT removes the middle 2 nodes from Q->(DQ->Q)->DQ pairs.
// "1": disabled. ORT doesn't remove the middle 2 nodes from Q->(DQ->Q)->DQ pairs.
// Its default value is "0".
static const char* const kOrtSessionOptionsDisableDoubleQDQRemover = "session.disable_double_qdq_remover";

// If set to "1", enables the removal of QuantizeLinear/DequantizeLinear node pairs once all QDQ handling has been
// completed. e.g. If after all QDQ handling has completed and we have -> FloatOp -> Q -> DQ -> FloatOp -> the
// Q -> DQ could potentially be removed. This will provide a performance benefit by avoiding going from float to
// 8-bit and back to float, but could impact accuracy. The impact on accuracy will be model specific and depend on
// other factors like whether the model was created using Quantization Aware Training or Post Training Quantization.
// As such, it's best to test to determine if enabling this works well for your scenario.
// The default value is "0".
// Available since version 1.11.
static const char* const kOrtSessionOptionsEnableQuantQDQCleanup = "session.enable_quant_qdq_cleanup";

// Enable or disable gelu approximation in graph optimization. "0": disable; "1": enable. The default is "0".
// GeluApproximation has side effects which may change the inference results. It is disabled by default due to this.
static const char* const kOrtSessionOptionsEnableGeluApproximation = "optimization.enable_gelu_approximation";

// This setting controls whether to enable AheadOfTime function inlining.
// AOT function inlining examines the graph and attempts to inline as many locally defined functions in the model
// as possible with the help of enabled execution providers.
// This can reduce the number of function calls and improve performance because it is done before
// Level1 optimizers and constant folding. However, under some circumstances, when the EPs are not available,
// one can disable the AOT inlining, produce an optimized model and postpone AOT until run time.
// "0": enable; "1": disable.
// Its default value is "0".
static const char* const kOrtSessionOptionsDisableAheadOfTimeFunctionInlining = "session.disable_aot_function_inlining";

#ifdef ENABLE_TRAINING
// Specifies a list of op types for memory footprint reduction.
// The value should be a ","-delimited list of
// <subgraph string>:<optimization strategy>:<number of subgraphs to apply> entries.
// For example, "Gelu+Cast+:1:0,Dropout+:1:1".
// A valid "subgraph string" should be one of the subgraph representations output by ORT graph transformations.
// "optimization strategy" currently has the valid values: 0 - disabled, 1 - recompute.
// "number of subgraphs to apply" controls how many subgraphs the optimization is applied to, to avoid "oversaving"
// the memory.
// See the example sketch after this block.
static const char* const kOrtSessionOptionsMemoryOptimizerEnabler = "optimization.memory_optimizer_config";

// Specifies the config for detecting subgraphs for memory footprint reduction.
// The value should be a string containing integers separated by commas. The default value is "0:0".
static const char* const kOrtSessionOptionsMemoryOptimizerProbeConfig = "optimization.enable_memory_probe_recompute_config";
#endif
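
// Example (illustrative sketch): enabling the memory optimizer with the config string format described above,
// reusing the example subgraph string from the comment. Assumes a training-enabled build (ENABLE_TRAINING), where
// the two keys above are defined.
//
//   Ort::SessionOptions so;
//   so.AddConfigEntry(kOrtSessionOptionsMemoryOptimizerEnabler, "Gelu+Cast+:1:0,Dropout+:1:1");
//   so.AddConfigEntry(kOrtSessionOptionsMemoryOptimizerProbeConfig, "0:0");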

// This setting, if set, should contain a comma-separated list of optimizer names that should be disabled.
// Optimizers may take time to execute and affect model loading time. If you feel that a specific optimizer
// does not provide runtime benefits but affects your model loading time, you may disable it using this config
// entry. This option is not available in ORT_MINIMAL_BUILD builds.
// A list of optimizers is available in onnxruntime/core/optimizer/graph_transformer_utils.cc
//
// Default is an empty string which means no optimizers are disabled.
static const char* const kOrtSessionOptionsDisableSpecifiedOptimizers = "optimization.disable_specified_optimizers";

// Enable or disable using device allocator for allocating initialized tensor memory. "1": enable; "0": disable. The default is "0".
// Using device allocators means the memory allocation is made using malloc/new.
static const char* const kOrtSessionOptionsUseDeviceAllocatorForInitializers = "session.use_device_allocator_for_initializers";

// Configure whether to allow the inter_op/intra_op threads to spin a number of times before blocking.
// "0": the thread will block if it finds no job to run.
// "1": default, the thread will spin a number of times before blocking.
static const char* const kOrtSessionOptionsConfigAllowInterOpSpinning = "session.inter_op.allow_spinning";
static const char* const kOrtSessionOptionsConfigAllowIntraOpSpinning = "session.intra_op.allow_spinning";

// Key for using model bytes directly for ORT format.
// If a session is created using an input byte array containing the ORT format model data,
// by default we will copy the model bytes at the time of session creation to ensure the model bytes
// buffer is valid.
// Setting this option to "1" will disable copying the model bytes and use the model bytes directly. The caller
// has to guarantee that the model bytes are valid until the ORT session using the model bytes is destroyed.
static const char* const kOrtSessionOptionsConfigUseORTModelBytesDirectly = "session.use_ort_model_bytes_directly";

/// <summary>
/// Key for using the ORT format model flatbuffer bytes directly for initializers.
/// This avoids copying the bytes and reduces peak memory usage during model loading and initialization.
/// Requires `session.use_ort_model_bytes_directly` to be true.
/// If set, the flatbuffer bytes provided when creating the InferenceSession MUST remain valid for the entire
/// duration of the InferenceSession.
/// </summary>
static const char* const kOrtSessionOptionsConfigUseORTModelBytesForInitializers =
    "session.use_ort_model_bytes_for_initializers";

// This should only be specified when exporting an ORT format model for use on a different platform.
// If the ORT format model will be used on ARM platforms, set to "1". For other platforms, set to "0".
// Available since version 1.11.
static const char* const kOrtSessionOptionsQDQIsInt8Allowed = "session.qdqisint8allowed";

// x64 SSE4.1/AVX2/AVX512 (with no VNNI) has an overflow problem with quantized matrix multiplication with U8S8.
// To avoid this we need to use the slower U8U8 matrix multiplication instead. This option, if
// turned on, uses the slower U8U8 matrix multiplications. Only effective on AVX2 or AVX512
// platforms.
static const char* const kOrtSessionOptionsAvx2PrecisionMode = "session.x64quantprecision";

// Specifies how minimal build graph optimizations are handled in a full build.
// These optimizations are at the extended level or higher.
// Possible values and their effects are:
// "save": Save runtime optimizations when saving an ORT format model.
// "apply": Only apply optimizations available in a minimal build.
// ""/<unspecified>: Apply optimizations available in a full build.
// Available since version 1.11.
static const char* const kOrtSessionOptionsConfigMinimalBuildOptimizations =
    "optimization.minimal_build_optimizations";

// Note: The options specific to an EP should be specified prior to appending that EP to the session options object in
// order for them to take effect.

// Specifies a list of stop op types. Nodes of a type in the stop op types and nodes downstream from them will not be
// run by the NNAPI EP.
// The value should be a ","-delimited list of op types. For example, "Add,Sub".
// If not specified, the default set of stop ops is used. To specify an empty stop ops types list and disable stop op
// exclusion, set the value to "".
static const char* const kOrtSessionOptionsConfigNnapiEpPartitioningStopOps = "ep.nnapi.partitioning_stop_ops";
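
// Example (illustrative sketch): setting the NNAPI stop ops before appending the NNAPI EP so the option takes
// effect, per the note above. Assumes an Android build where nnapi_provider_factory.h and
// OrtSessionOptionsAppendExecutionProvider_Nnapi are available; the stop op list is hypothetical.
//
//   #include <nnapi_provider_factory.h>
//
//   Ort::SessionOptions so;
//   so.AddConfigEntry(kOrtSessionOptionsConfigNnapiEpPartitioningStopOps, "Add,Sub");
//   Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_Nnapi(so, 0));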

// Enable dynamic block-sizing for multithreading.
// With a positive value, the thread pool will split a task of N iterations into blocks of size starting from:
// N / (num_of_threads * dynamic_block_base)
// As execution progresses, the size will decrease according to the diminishing residual of N,
// meaning the task will be distributed at a smaller granularity for better parallelism.
// For some models, it helps to reduce the variance of E2E inference latency and boost performance.
// The feature is disabled by default; specify any positive integer, e.g. "4", to enable it.
// Available since version 1.11.
static const char* const kOrtSessionOptionsConfigDynamicBlockBase = "session.dynamic_block_base";

// This option allows decreasing CPU usage between infrequent
// requests by forcing any spinning thread pool threads to stop immediately when the last
// concurrent Run() call returns.
// Spinning is restarted on the next Run() call.
// Applies only to internal thread pools.
static const char* const kOrtSessionOptionsConfigForceSpinningStop = "session.force_spinning_stop";
  154. // "1": all inconsistencies encountered during shape and type inference
  155. // will result in failures.
  156. // "0": in some cases warnings will be logged but processing will continue. The default.
  157. // May be useful to expose bugs in models.
  158. static const char* const kOrtSessionOptionsConfigStrictShapeTypeInference = "session.strict_shape_type_inference";
  159. // "1": every model using a more recent opset than the latest released one will fail
  160. // "0": the model may or may not work if onnxruntime cannot find an implementation, this option
  161. // is used for development purpose.
  162. static const char* const kOrtSessionOptionsConfigStrictAllowReleasedOpsetsOnly = "session.allow_released_opsets_only";

// The file that saves the configuration for partitioning nodes among logic streams.
static const char* const kNodePartitionConfigFile = "session.node_partition_config_file";

// This option allows setting affinities for intra op threads.
// The affinity string follows the format:
// logical_processor_id,logical_processor_id;logical_processor_id,logical_processor_id
// Semicolons separate the configurations of individual threads, while commas separate the processors that the
// i-th thread is expected to attach to.
// e.g. 1,2,3;4,5
// specifies affinities for two threads, with the 1st thread attached to the 1st, 2nd, and 3rd processors, and the
// 2nd thread to the 4th and 5th.
// To ease the configuration, an "interval" is also allowed:
// e.g. 1-8;8-16;17-24
// specifies that the 1st thread runs on the first eight processors, the 2nd thread runs on the next eight
// processors, and so forth.
// Note:
// 1. Once set, the number of thread affinities must equal intra_op_num_threads - 1, since ort does not set an
// affinity on the main thread, which is started and managed by the calling app;
// 2. For Windows, ort will infer the group id from a logical processor id. For example, assuming there are two
// groups, each with 64 logical processors, an id of 64 will be inferred as the last processor of the 1st group,
// while 65 will be interpreted as the 1st processor of the second group. Hence 64-65 is an invalid configuration,
// because a Windows thread cannot be attached to processors across a group boundary.
static const char* const kOrtSessionOptionsConfigIntraOpThreadAffinities = "session.intra_op_thread_affinities";
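
// Example (illustrative sketch): three intra-op threads with affinities for the two threads that ORT creates
// (intra_op_num_threads - 1 entries, since the calling thread is not affinitized). Processor ids are hypothetical.
//
//   Ort::SessionOptions so;
//   so.SetIntraOpNumThreads(3);
//   so.AddConfigEntry(kOrtSessionOptionsConfigIntraOpThreadAffinities, "1,2;3-5");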

// This option will dump out the model to assist debugging any issues with layout transformation,
// and is primarily intended for developer usage. It is only relevant if an execution provider that requests
// NHWC layout is enabled, such as NNAPI, XNNPACK or QNN.
//
// Default is off. Set to "1" to enable.
//
// If modified by layout transformation the model will be dumped after these steps:
// 1) insertion of the layout transformation Transpose nodes
// 2) after those are optimized using the transpose optimizer
// 3) after the L1 transformers are applied to the updated graph
// The model will be saved to filename post_layout_transform_step_<step_number>.onnx.
static const char* const kDebugLayoutTransformation = "session.debug_layout_transformation";

// Graph nodes that are not supported by the execution providers (EPs) explicitly added to the session are
// assigned (i.e., "fallback") to the CPU EP by default.
//
// This option allows the user to disable the fallback of unsupported graph nodes to the CPU EP.
// If this option is set to "1", session creation will fail if the execution providers other than the CPU EP cannot
// fully support all of the nodes in the graph.
//
// It is invalid to set this option and explicitly add the CPU EP to the session. In this case, session creation
// will also fail with an error.
//
// Option values:
// - "0": CPU EP fallback is not disabled. [DEFAULT]
// - "1": CPU EP fallback is disabled.
static const char* const kOrtSessionOptionsDisableCPUEPFallback = "session.disable_cpu_ep_fallback";

// Use this config when serializing a large model after optimization to specify an external initializers file.
static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersFileName =
    "session.optimized_model_external_initializers_file_name";

// Use this config to control the minimum size of the initializer when externalizing it during serialization.
static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersMinSizeInBytes =
    "session.optimized_model_external_initializers_min_size_in_bytes";

// Enable the EP context feature to dump the partitioned graph, which includes the EP context, into an ONNX file.
// The dumped ONNX model with EP context can be used for future inference to avoid the EP graph partitioning/compile overhead.
// "0": disable. (default)
// "1": enable.
static const char* const kOrtSessionOptionEpContextEnable = "ep.context_enable";

// Specify the file path for the ONNX model which has the EP context.
// Defaults to original_file_name_ctx.onnx if not specified.
static const char* const kOrtSessionOptionEpContextFilePath = "ep.context_file_path";

// Flag to specify whether to dump the EP context into the ONNX model.
// "0": dump the EP context into a separate file, keeping the file name in the ONNX model.
// "1": dump the EP context into the ONNX model. (default)
static const char* const kOrtSessionOptionEpContextEmbedMode = "ep.context_embed_mode";
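
// Example (illustrative sketch): dumping an EP context model so a later run can skip EP compilation. The output
// path is hypothetical, and the EP-specific setup (e.g. appending a compiling EP such as QNN) is omitted.
//
//   Ort::SessionOptions so;
//   so.AddConfigEntry(kOrtSessionOptionEpContextEnable, "1");
//   so.AddConfigEntry(kOrtSessionOptionEpContextFilePath, "model_ctx.onnx");
//   so.AddConfigEntry(kOrtSessionOptionEpContextEmbedMode, "1");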

// Gemm fastmath mode provides fp32 gemm acceleration with bfloat16 based matmul.
// Option values:
// - "0": Gemm FastMath mode is not enabled. [DEFAULT]
// - "1": Gemm FastMath mode is enabled.
static const char* const kOrtSessionOptionsMlasGemmFastMathArm64Bfloat16 = "mlas.enable_gemm_fastmath_arm64_bfloat16";