// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
//   used to endorse or promote products derived from this software without
//   specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Authors: vitus@google.com (Michael Vitus),
//          dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)

#ifndef CERES_INTERNAL_PARALLEL_FOR_H_
#define CERES_INTERNAL_PARALLEL_FOR_H_

#include <mutex>
#include <vector>

#include "ceres/context_impl.h"
#include "ceres/internal/eigen.h"
#include "ceres/internal/export.h"
#include "ceres/parallel_invoke.h"
#include "ceres/partition_range_for_parallel_for.h"
#include "glog/logging.h"

namespace ceres::internal {

// Use a dummy (empty) lock if num_threads = 1.
inline decltype(auto) MakeConditionalLock(const int num_threads,
                                          std::mutex& m) {
  return (num_threads == 1) ? std::unique_lock<std::mutex>{}
                            : std::unique_lock<std::mutex>{m};
}

// Execute the function for every element in the range [start, end) with at
// most num_threads. It will execute all the work on the calling thread if
// num_threads or (end - start) is equal to 1.
// Depending on the function signature, it will be supplied with either a loop
// index or a range of loop indices; the function can also be supplied with a
// thread_id. The following function signatures are supported:
//  - Functions accepting a single loop index:
//     - [](int index) { ... }
//     - [](int thread_id, int index) { ... }
//  - Functions accepting a range of loop indices:
//     - [](std::tuple<int, int> index) { ... }
//     - [](int thread_id, std::tuple<int, int> index) { ... }
//
// When distributing workload between threads, it is assumed that each loop
// iteration takes approximately equal time to complete.
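//
// For illustration only, a minimal usage sketch (hypothetical caller code,
// not part of this header; assumes `context` points to a caller-owned
// ContextImpl, `kNumThreads` is chosen by the caller, and <cmath> is
// available for std::sqrt):
//
//   std::vector<double> values(1000);
//   ParallelFor(context, 0, static_cast<int>(values.size()), kNumThreads,
//               [&values](int i) {
//                 values[i] = std::sqrt(static_cast<double>(i));
//               });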
template <typename F>
void ParallelFor(ContextImpl* context,
                 int start,
                 int end,
                 int num_threads,
                 F&& function,
                 int min_block_size = 1) {
  CHECK_GT(num_threads, 0);
  if (start >= end) {
    return;
  }

  if (num_threads == 1 || end - start < min_block_size * 2) {
    InvokeOnSegment(0, std::make_tuple(start, end), std::forward<F>(function));
    return;
  }

  CHECK(context != nullptr);
  ParallelInvoke(context,
                 start,
                 end,
                 num_threads,
                 std::forward<F>(function),
                 min_block_size);
}

// Execute function for every element in the range [start, end) with at most
// num_threads, using the user-provided partitions array.
// When distributing workload between threads, it is assumed that each segment
// bounded by adjacent elements of the partitions array takes approximately
// equal time to process.
template <typename F>
void ParallelFor(ContextImpl* context,
                 int start,
                 int end,
                 int num_threads,
                 F&& function,
                 const std::vector<int>& partitions) {
  CHECK_GT(num_threads, 0);
  if (start >= end) {
    return;
  }
  CHECK_EQ(partitions.front(), start);
  CHECK_EQ(partitions.back(), end);
  if (num_threads == 1 || end - start <= num_threads) {
    ParallelFor(context, start, end, num_threads, std::forward<F>(function));
    return;
  }
  CHECK_GT(partitions.size(), 1);
  const int num_partitions = partitions.size() - 1;
  ParallelFor(context,
              0,
              num_partitions,
              num_threads,
              [&function, &partitions](int thread_id,
                                       std::tuple<int, int> partition_ids) {
                // partition_ids is a range of partition indices.
                const auto [partition_start, partition_end] = partition_ids;
                // Execution over several adjacent segments is equivalent to
                // execution over the union of those segments (which is also a
                // contiguous segment).
                const int range_start = partitions[partition_start];
                const int range_end = partitions[partition_end];
                // Range of original loop indices.
                const auto range = std::make_tuple(range_start, range_end);
                InvokeOnSegment(thread_id, range, function);
              });
}

// Execute function for every element in the range [start, end) with at most
// num_threads, taking into account user-provided integer cumulative costs of
// iterations. Cumulative costs of iterations for indices in range [0, end) are
// stored in objects from cumulative_cost_data. The user-provided
// cumulative_cost_fun returns non-decreasing integer values corresponding to
// the inclusive cumulative cost of loop iterations, given a reference to a
// user-defined object. Only indices from [start, end) will be referenced. This
// routine assumes that cumulative_cost_fun is non-decreasing (in other words,
// all costs are non-negative).
// When distributing workload between threads, the input range of loop indices
// will be partitioned into disjoint contiguous intervals, with the maximal
// cost being minimized.
// For example, with iteration costs of [1, 1, 5, 3, 1, 4] cumulative_cost_fun
// should return [1, 2, 7, 10, 11, 15], and with num_threads = 4 this range
// will be split into segments [0, 2) [2, 3) [3, 5) [5, 6) with costs
// [2, 5, 4, 4].
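//
// For illustration only, a sketch of how this overload might be called
// (hypothetical types and names, not part of this header; RowRecord stands in
// for any per-iteration object from which a cumulative cost can be read):
//
//   struct RowRecord {
//     int cumulative_nnz;  // inclusive prefix sum of per-row costs
//     // ...
//   };
//   std::vector<RowRecord> rows = /* one record per loop index */;
//   ParallelFor(context, 0, static_cast<int>(rows.size()), num_threads,
//               [&rows](int row) { /* process rows[row] */ },
//               rows.data(),
//               [](const RowRecord& r) { return r.cumulative_nnz; });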
template <typename F, typename CumulativeCostData, typename CumulativeCostFun>
void ParallelFor(ContextImpl* context,
                 int start,
                 int end,
                 int num_threads,
                 F&& function,
                 const CumulativeCostData* cumulative_cost_data,
                 CumulativeCostFun&& cumulative_cost_fun) {
  CHECK_GT(num_threads, 0);
  if (start >= end) {
    return;
  }
  if (num_threads == 1 || end - start <= num_threads) {
    ParallelFor(context, start, end, num_threads, std::forward<F>(function));
    return;
  }
  // Creating several partitions allows us to tolerate imperfections of
  // partitioning and user-supplied iteration costs up to a certain extent.
  constexpr int kNumPartitionsPerThread = 4;
  const int kMaxPartitions = num_threads * kNumPartitionsPerThread;
  const auto& partitions = PartitionRangeForParallelFor(
      start,
      end,
      kMaxPartitions,
      cumulative_cost_data,
      std::forward<CumulativeCostFun>(cumulative_cost_fun));
  CHECK_GT(partitions.size(), 1);
  ParallelFor(
      context, start, end, num_threads, std::forward<F>(function), partitions);
}

}  // namespace ceres::internal

#endif  // CERES_INTERNAL_PARALLEL_FOR_H_