hang_watcher.h 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342
  1. // Copyright 2020 The Chromium Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style license that can be
  3. // found in the LICENSE file.
  4. #ifndef BASE_THREADING_HANG_WATCHER_H_
  5. #define BASE_THREADING_HANG_WATCHER_H_
  6. #include <atomic>
  7. #include <memory>
  8. #include <vector>
  9. #include "base/atomicops.h"
  10. #include "base/callback.h"
  11. #include "base/callback_forward.h"
  12. #include "base/callback_helpers.h"
  13. #include "base/compiler_specific.h"
  14. #include "base/feature_list.h"
  15. #include "base/synchronization/lock.h"
  16. #include "base/thread_annotations.h"
  17. #include "base/threading/platform_thread.h"
  18. #include "base/threading/simple_thread.h"
  19. #include "base/threading/thread_checker.h"
  20. #include "base/threading/thread_local.h"
  21. #include "base/time/tick_clock.h"
  22. #include "base/time/time.h"
  23. namespace base {
  24. class HangWatchScope;  // Forward declaration; defined later in this header.
  25. namespace internal {
  26. class HangWatchState;  // Forward declaration; defined later in this header.
  27. } // namespace internal
  28. } // namespace base
  29. namespace base {
  30. // Instantiate a HangWatchScope in a scope to register to be
  31. // watched for hangs of more than |timeout| by the HangWatcher.
  32. //
  33. // Example usage:
  34. //
  35. // void FooBar() {
  36. // HangWatchScope scope(base::TimeDelta::FromSeconds(5));
  37. // DoWork();
  38. // }
  39. //
  40. // If DoWork() takes more than 5s to run and the HangWatcher
  41. // inspects the thread state before FooBar() returns, a hang will be
  42. // reported.
  43. //
  44. // HangWatchScopes are typically meant to live on the stack. In some cases it's
  45. // necessary to keep a HangWatchScope instance as a class member but special
  46. // care is required when doing so as a HangWatchScope that stays alive longer
  47. // than intended will generate non-actionable hang reports.
  48. class BASE_EXPORT HangWatchScope {
  49. public:
  50. // A good default value needs to be large enough to represent a significant
  51. // hang and avoid noise while being small enough to not exclude too many
  52. // hangs. The nature of the work that gets executed on the thread is also
  53. // important. We can be much stricter when monitoring a UI thread compared to
  54. // a ThreadPool thread for example.
  55. static const base::TimeDelta kDefaultHangWatchTime;
  56. // |timeout| is the hang duration described in the class comment. The constructing and destructing thread must be the same thread.
  57. explicit HangWatchScope(TimeDelta timeout);
  58. ~HangWatchScope();
  59. HangWatchScope(const HangWatchScope&) = delete;  // Not copyable.
  60. HangWatchScope& operator=(const HangWatchScope&) = delete;
  61. private:
  62. // This object should always be constructed and destructed on the same thread.
  63. THREAD_CHECKER(thread_checker_);
  64. // The deadline set by the previous HangWatchScope created on this thread.
  65. // Stored so it can be restored when this HangWatchScope is destroyed.
  66. TimeTicks previous_deadline_;
  67. #if DCHECK_IS_ON()
  68. // The previous HangWatchScope created on this thread. Only present when DCHECKs are on; NOTE(review): presumably used to check scope destruction order - confirm in the .cc.
  69. HangWatchScope* previous_scope_;
  70. #endif
  71. };
  72. // Monitors registered threads for hangs by inspecting their associated
  73. // HangWatchStates for deadline overruns. This happens at a regular interval on
  74. // a separate thread. Only one instance of HangWatcher can exist at a time
  75. // within a single process. This instance must outlive all monitored threads.
  76. class BASE_EXPORT HangWatcher : public DelegateSimpleThread::Delegate {
  77. public:
  78. static const base::Feature kEnableHangWatcher;  // NOTE(review): presumably gates whether hang watching is enabled - confirm where it is checked.
  79. // The first invocation of the constructor will set the global instance
  80. // accessible through GetInstance(). This means that only one instance can
  81. // exist at a time.
  82. HangWatcher();
  83. // Clears the global instance for the class.
  84. ~HangWatcher() override;
  85. HangWatcher(const HangWatcher&) = delete;  // Not copyable.
  86. HangWatcher& operator=(const HangWatcher&) = delete;
  87. // Returns a non-owning pointer to the global HangWatcher instance.
  88. static HangWatcher* GetInstance();
  89. // Sets up the calling thread to be monitored for hangs. Returns a
  90. // ScopedClosureRunner that unregisters the thread. This closure has to be
  91. // called from the registered thread before it's joined.
  92. ScopedClosureRunner RegisterThread()
  93. LOCKS_EXCLUDED(watch_state_lock_) WARN_UNUSED_RESULT;
  94. // Choose a closure to be run at the end of each call to Monitor(). Use only
  95. // for testing. Reentering the HangWatcher in the closure must be done with
  96. // care. It should only be done through certain testing functions because
  97. // deadlocks are possible.
  98. void SetAfterMonitorClosureForTesting(base::RepeatingClosure closure);
  99. // Choose a closure to be run instead of recording the hang. Used to test
  100. // that certain conditions hold true at the time of recording. Use only
  101. // for testing. Reentering the HangWatcher in the closure must be done with
  102. // care. It should only be done through certain testing functions because
  103. // deadlocks are possible.
  104. void SetOnHangClosureForTesting(base::RepeatingClosure closure);
  105. // Set a monitoring period other than the default. Use only for
  106. // testing.
  107. void SetMonitoringPeriodForTesting(base::TimeDelta period);
  108. // Choose a callback to invoke right after waiting to monitor in Wait(). Use
  109. // only for testing.
  110. void SetAfterWaitCallbackForTesting(
  111. RepeatingCallback<void(TimeTicks)> callback);
  112. // Force the monitoring loop to resume and evaluate whether to continue.
  113. // This can trigger a call to Monitor() or not depending on why the
  114. // HangWatcher thread is sleeping. Use only for testing.
  115. void SignalMonitorEventForTesting();
  116. // Call to make sure no more monitoring takes place. The
  117. // function is thread-safe and can be called at any time but won't stop
  118. // monitoring that is currently taking place. Use only for testing.
  119. void StopMonitoringForTesting();
  120. // Replace the clock used when calculating time spent
  121. // sleeping. Use only for testing.
  122. void SetTickClockForTesting(const base::TickClock* tick_clock);
  123. // Use to block until the hang is recorded. Allows the caller to halt
  124. // execution so it does not overshoot the hang watch target and result in a
  125. // non-actionable stack trace in the crash recorded.
  126. void BlockIfCaptureInProgress();
  127. // Begin executing the monitoring loop on the HangWatcher thread.
  128. void Start();
  129. private:
  130. // Use to assert that functions are called on the monitoring thread.
  131. THREAD_CHECKER(hang_watcher_thread_checker_);
  132. // Use to assert that functions are called on the constructing thread.
  133. THREAD_CHECKER(constructing_thread_checker_);
  134. // Invoke base::debug::DumpWithoutCrashing() ensuring that the stack frame
  135. // right under it in the trace belongs to HangWatcher for easier attribution.
  136. NOINLINE static void RecordHang();
  137. using HangWatchStates =
  138. std::vector<std::unique_ptr<internal::HangWatchState>>;
  139. // Used to save a snapshot of the state of hang watching during capture.
  140. // Only the state of hung threads is retained.
  141. class BASE_EXPORT WatchStateSnapShot {
  142. public:
  143. struct WatchStateCopy {
  144. base::TimeTicks deadline;
  145. base::PlatformThreadId thread_id;
  146. };
  147. // Construct the snapshot from provided data. |snapshot_time| can be
  148. // different than now() to be coherent with other operations recently done
  149. // on |watch_states|. If any deadline in |watch_states| is before
  150. // |deadline_ignore_threshold|, the snapshot is empty.
  151. WatchStateSnapShot(const HangWatchStates& watch_states,
  152. base::TimeTicks snapshot_time,
  153. base::TimeTicks deadline_ignore_threshold);
  154. WatchStateSnapShot(const WatchStateSnapShot& other);
  155. ~WatchStateSnapShot();
  156. // Returns a string that contains the ids of the hung threads separated by a
  157. // '|'. The size of the string is capped at debug::CrashKeySize::Size256. If
  158. // no threads are hung returns an empty string.
  159. std::string PrepareHungThreadListCrashKey() const;  // NOTE(review): <string> is not among this header's direct includes; presumably pulled in transitively.
  160. // Return the highest deadline included in this snapshot.
  161. base::TimeTicks GetHighestDeadline() const;
  162. private:
  163. base::TimeTicks snapshot_time_;  // Presumably the |snapshot_time| given to the constructor - confirm in the .cc.
  164. std::vector<WatchStateCopy> hung_watch_state_copies_;  // One entry per retained (hung) thread, per the class comment.
  165. };
  166. // Return a watch state snapshot taken Now() to be inspected in tests.
  167. // NO_THREAD_SAFETY_ANALYSIS is needed because the analyzer can't figure out
  168. // that calls to this function done from |on_hang_closure_| are properly
  169. // locked.
  170. WatchStateSnapShot GrabWatchStateSnapshotForTesting() const
  171. NO_THREAD_SAFETY_ANALYSIS;
  172. // Inspects the state of all registered threads to check if they are hung and
  173. // invokes the appropriate closure if so.
  174. void Monitor() LOCKS_EXCLUDED(watch_state_lock_);
  175. // Record the hang and perform the necessary housekeeping before and after.
  176. void CaptureHang(base::TimeTicks capture_time)
  177. EXCLUSIVE_LOCKS_REQUIRED(watch_state_lock_) LOCKS_EXCLUDED(capture_lock_);
  178. // Stop all monitoring and join the HangWatcher thread.
  179. void Stop();
  180. // Wait until it's time to monitor.
  181. void Wait();
  182. // Run the loop that periodically monitors the registered thread at a
  183. // set time interval.
  184. void Run() override;
  185. base::TimeDelta monitor_period_;  // Presumably the interval between monitoring passes; see SetMonitoringPeriodForTesting().
  186. // Indicates whether Run() should return after the next monitoring (presumably once this becomes false - confirm in the .cc).
  187. std::atomic<bool> keep_monitoring_{true};
  188. // Use to make the HangWatcher thread wake or sleep to schedule the
  189. // appropriate monitoring frequency.
  190. WaitableEvent should_monitor_;
  191. bool IsWatchListEmpty() LOCKS_EXCLUDED(watch_state_lock_);  // Presumably reports whether |watch_states_| is empty - confirm in the .cc.
  192. // Stops hang watching on the calling thread by removing the entry from the
  193. // watch list.
  194. void UnregisterThread() LOCKS_EXCLUDED(watch_state_lock_);
  195. Lock watch_state_lock_;  // Guards |watch_states_|.
  196. std::vector<std::unique_ptr<internal::HangWatchState>> watch_states_
  197. GUARDED_BY(watch_state_lock_);  // Same type as the HangWatchStates alias declared above.
  198. base::DelegateSimpleThread thread_;  // Presumably the HangWatcher thread that executes Run() - confirm in the .cc.
  199. RepeatingClosure after_monitor_closure_for_testing_;  // See SetAfterMonitorClosureForTesting().
  200. RepeatingClosure on_hang_closure_for_testing_;  // See SetOnHangClosureForTesting().
  201. RepeatingCallback<void(TimeTicks)> after_wait_callback_;  // See SetAfterWaitCallbackForTesting().
  202. base::Lock capture_lock_ ACQUIRED_AFTER(watch_state_lock_);  // Per the annotation, acquired after |watch_state_lock_| when both are taken.
  203. std::atomic<bool> capture_in_progress{false};  // NOTE(review): lacks the trailing underscore used by the other data members.
  204. const base::TickClock* tick_clock_;  // See SetTickClockForTesting().
  205. // The time after which all deadlines in |watch_states_| need to be for a hang
  206. // to be reported.
  207. base::TimeTicks deadline_ignore_threshold_;
  208. FRIEND_TEST_ALL_PREFIXES(HangWatcherTest, NestedScopes);
  209. FRIEND_TEST_ALL_PREFIXES(HangWatcherSnapshotTest, HungThreadIDs);
  210. };
  211. // Classes here are exposed in the header only for testing. They are not
  212. // intended to be used outside of base.
  213. namespace internal {
  214. // Contains the information necessary for hang watching a specific
  215. // thread. Instances of this class are accessed concurrently by the associated
  216. // thread and the HangWatcher. The HangWatcher owns instances of this
  217. // class and outside of it they are accessed through
  218. // GetHangWatchStateForCurrentThread().
  219. class BASE_EXPORT HangWatchState {
  220. public:
  221. HangWatchState();
  222. ~HangWatchState();
  223. HangWatchState(const HangWatchState&) = delete;  // Not copyable.
  224. HangWatchState& operator=(const HangWatchState&) = delete;
  225. // Allocates a new state object bound to the calling thread and returns an
  226. // owning pointer to it.
  227. static std::unique_ptr<HangWatchState> CreateHangWatchStateForCurrentThread();
  228. // Retrieves the thread-local slot for the hang watch state associated with the calling thread.
  229. // NOTE(review): the declared return type is a pointer to a ThreadLocalPointer, so "no HangWatchState exists for the current thread" (see
  230. // CreateHangWatchStateForCurrentThread()) is presumably signalled by the slot holding nullptr rather than by this function returning nullptr - confirm.
  231. static ThreadLocalPointer<HangWatchState>*
  232. GetHangWatchStateForCurrentThread();
  233. // Returns the value of the current deadline. Use this function if you need to
  234. // store the value. To test if the deadline has expired use IsOverDeadline().
  235. TimeTicks GetDeadline() const;
  236. // Atomically sets the deadline to a new value.
  237. void SetDeadline(TimeTicks deadline);
  238. // Tests whether the associated thread's execution has gone over the deadline.
  239. bool IsOverDeadline() const;
  240. #if DCHECK_IS_ON()
  241. // Saves the supplied HangWatchScope as the currently active scope.
  242. void SetCurrentHangWatchScope(HangWatchScope* scope);
  243. // Retrieve the currently active scope.
  244. HangWatchScope* GetCurrentHangWatchScope();
  245. #endif
  246. PlatformThreadId GetThreadID() const;  // Presumably returns |thread_id_| - confirm in the .cc.
  247. private:
  248. // The thread that creates the instance should be the one that updates
  249. // the deadline.
  250. THREAD_CHECKER(thread_checker_);
  251. // If the deadline fails to be updated before TimeTicks::Now() ever
  252. // reaches the value contained in it this constitutes a hang.
  253. std::atomic<TimeTicks> deadline_{base::TimeTicks::Max()};
  254. const PlatformThreadId thread_id_;  // Const, so fixed at construction.
  255. #if DCHECK_IS_ON()
  256. // Used to keep track of the current HangWatchScope and detect improper usage.
  257. // Scopes should always be destructed in reverse order from the one they were
  258. // constructed in. Example of improper use:
  259. //
  260. // {
  261. // std::unique_ptr<Scope> scope = std::make_unique<Scope>(...);
  262. // Scope other_scope;
  263. // |scope| gets deallocated first, violating reverse destruction order.
  264. // scope.reset();
  265. // }
  266. HangWatchScope* current_hang_watch_scope_{nullptr};
  267. #endif
  268. };
  269. } // namespace internal
  270. } // namespace base
  271. #endif // BASE_THREADING_HANG_WATCHER_H_