|
- // This file is part of Eigen, a lightweight C++ template library
- // for linear algebra.
- //
- // Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com>
- //
- // This Source Code Form is subject to the terms of the Mozilla
- // Public License v. 2.0. If a copy of the MPL was not distributed
- // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <memory>
#include <random>
#include <vector>
// Runtime knobs exposed to Eigen through the EIGEN_TEST_SPECIFIC_BLOCKING_*
// macros below. benchmark_t::run() sets them before each product so that a
// benchmark can impose its own K/M/N block sizes. The macros have to be
// defined before <Eigen/Core> is included.
bool eigen_use_specific_block_size;
int eigen_block_size_k;
int eigen_block_size_m;
int eigen_block_size_n;
#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZES eigen_use_specific_block_size
#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K eigen_block_size_k
#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M eigen_block_size_m
#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N eigen_block_size_n
- #include <Eigen/Core>
- #include <bench/BenchTimer.h>
- using namespace Eigen;
- using namespace std;
- static BenchTimer timer;
- // how many times we repeat each measurement.
- // measurements are randomly shuffled - we're not doing
- // all N identical measurements in a row.
- const int measurement_repetitions = 3;
- // Timings below this value are too short to be accurate,
- // we'll repeat measurements with more iterations until
- // we get a timing above that threshold.
- const float min_accurate_time = 1e-2f;
- // See --min-working-set-size command line parameter.
- size_t min_working_set_size = 0;
- float max_clock_speed = 0.0f;
- // range of sizes that we will benchmark (in all 3 K,M,N dimensions)
- const size_t maxsize = 2048;
- const size_t minsize = 16;
- typedef MatrixXf MatrixType;
- typedef MatrixType::Scalar Scalar;
- typedef internal::packet_traits<Scalar>::type Packet;
- static_assert((maxsize & (maxsize - 1)) == 0, "maxsize must be a power of two");
- static_assert((minsize & (minsize - 1)) == 0, "minsize must be a power of two");
- static_assert(maxsize > minsize, "maxsize must be larger than minsize");
- static_assert(maxsize < (minsize << 16), "maxsize must be less than (minsize<<16)");
// Small value type holding the three dimensions (K, M, N) of a matrix product.
struct size_triple_t
{
  size_t k, m, n;

  // All-zero triple.
  size_triple_t() : k(0), m(0), n(0) {}

  // Triple with explicitly given dimensions.
  size_triple_t(size_t k_, size_t m_, size_t n_) : k(k_), m(m_), n(n_) {}

  size_triple_t(const size_triple_t& o) = default;

  // Decodes the compact form: bits 8-11, 4-7 and 0-3 hold log2 of k, m and n
  // respectively; any higher bits are ignored.
  size_triple_t(uint16_t compact)
    : k(size_t(1) << ((compact >> 8) & 0xf))
    , m(size_t(1) << ((compact >> 4) & 0xf))
    , n(size_t(1) << (compact & 0xf))
  {
  }
};
// Returns the number of times x can be shifted right by one before it
// becomes zero, minus nothing for the initial shift: i.e. floor(log2(x)) for
// x >= 1, and 0 for x == 0. For a power of two this is its exact log2.
uint8_t log2_pot(size_t x) {
  size_t shifts = 0;
  for (x >>= 1; x != 0; x >>= 1) {
    ++shifts;
  }
  return static_cast<uint8_t>(shifts);
}
- // Convert between size tripes and a compact form fitting in 12 bits
- // where each size, which must be a POT, is encoded as its log2, on 4 bits
- // so the largest representable size is 2^15 == 32k ... big enough.
- uint16_t compact_size_triple(size_t k, size_t m, size_t n)
- {
- return (log2_pot(k) << 8) | (log2_pot(m) << 4) | log2_pot(n);
- }
- uint16_t compact_size_triple(const size_triple_t& t)
- {
- return compact_size_triple(t.k, t.m, t.n);
- }
- // A single benchmark. Initially only contains benchmark params.
- // Then call run(), which stores the result in the gflops field.
- struct benchmark_t
- {
- uint16_t compact_product_size;
- uint16_t compact_block_size;
- bool use_default_block_size;
- float gflops;
- benchmark_t()
- : compact_product_size(0)
- , compact_block_size(0)
- , use_default_block_size(false)
- , gflops(0)
- {
- }
- benchmark_t(size_t pk, size_t pm, size_t pn,
- size_t bk, size_t bm, size_t bn)
- : compact_product_size(compact_size_triple(pk, pm, pn))
- , compact_block_size(compact_size_triple(bk, bm, bn))
- , use_default_block_size(false)
- , gflops(0)
- {}
- benchmark_t(size_t pk, size_t pm, size_t pn)
- : compact_product_size(compact_size_triple(pk, pm, pn))
- , compact_block_size(0)
- , use_default_block_size(true)
- , gflops(0)
- {}
- void run();
- };
- ostream& operator<<(ostream& s, const benchmark_t& b)
- {
- s << hex << b.compact_product_size << dec;
- if (b.use_default_block_size) {
- size_triple_t t(b.compact_product_size);
- Index k = t.k, m = t.m, n = t.n;
- internal::computeProductBlockingSizes<Scalar, Scalar>(k, m, n);
- s << " default(" << k << ", " << m << ", " << n << ")";
- } else {
- s << " " << hex << b.compact_block_size << dec;
- }
- s << " " << b.gflops;
- return s;
- }
- // We sort first by increasing benchmark parameters,
- // then by decreasing performance.
- bool operator<(const benchmark_t& b1, const benchmark_t& b2)
- {
- return b1.compact_product_size < b2.compact_product_size ||
- (b1.compact_product_size == b2.compact_product_size && (
- (b1.compact_block_size < b2.compact_block_size || (
- b1.compact_block_size == b2.compact_block_size &&
- b1.gflops > b2.gflops))));
- }
- void benchmark_t::run()
- {
- size_triple_t productsizes(compact_product_size);
- if (use_default_block_size) {
- eigen_use_specific_block_size = false;
- } else {
- // feed eigen with our custom blocking params
- eigen_use_specific_block_size = true;
- size_triple_t blocksizes(compact_block_size);
- eigen_block_size_k = blocksizes.k;
- eigen_block_size_m = blocksizes.m;
- eigen_block_size_n = blocksizes.n;
- }
- // set up the matrix pool
- const size_t combined_three_matrices_sizes =
- sizeof(Scalar) *
- (productsizes.k * productsizes.m +
- productsizes.k * productsizes.n +
- productsizes.m * productsizes.n);
- // 64 M is large enough that nobody has a cache bigger than that,
- // while still being small enough that everybody has this much RAM,
- // so conveniently we don't need to special-case platforms here.
- const size_t unlikely_large_cache_size = 64 << 20;
- const size_t working_set_size =
- min_working_set_size ? min_working_set_size : unlikely_large_cache_size;
- const size_t matrix_pool_size =
- 1 + working_set_size / combined_three_matrices_sizes;
- MatrixType *lhs = new MatrixType[matrix_pool_size];
- MatrixType *rhs = new MatrixType[matrix_pool_size];
- MatrixType *dst = new MatrixType[matrix_pool_size];
-
- for (size_t i = 0; i < matrix_pool_size; i++) {
- lhs[i] = MatrixType::Zero(productsizes.m, productsizes.k);
- rhs[i] = MatrixType::Zero(productsizes.k, productsizes.n);
- dst[i] = MatrixType::Zero(productsizes.m, productsizes.n);
- }
- // main benchmark loop
- int iters_at_a_time = 1;
- float time_per_iter = 0.0f;
- size_t matrix_index = 0;
- while (true) {
- double starttime = timer.getCpuTime();
- for (int i = 0; i < iters_at_a_time; i++) {
- dst[matrix_index].noalias() = lhs[matrix_index] * rhs[matrix_index];
- matrix_index++;
- if (matrix_index == matrix_pool_size) {
- matrix_index = 0;
- }
- }
- double endtime = timer.getCpuTime();
- const float timing = float(endtime - starttime);
- if (timing >= min_accurate_time) {
- time_per_iter = timing / iters_at_a_time;
- break;
- }
- iters_at_a_time *= 2;
- }
- delete[] lhs;
- delete[] rhs;
- delete[] dst;
- gflops = 2e-9 * productsizes.k * productsizes.m * productsizes.n / time_per_iter;
- }
- void print_cpuinfo()
- {
- #ifdef __linux__
- cout << "contents of /proc/cpuinfo:" << endl;
- string line;
- ifstream cpuinfo("/proc/cpuinfo");
- if (cpuinfo.is_open()) {
- while (getline(cpuinfo, line)) {
- cout << line << endl;
- }
- cpuinfo.close();
- }
- cout << endl;
- #elif defined __APPLE__
- cout << "output of sysctl hw:" << endl;
- system("sysctl hw");
- cout << endl;
- #endif
- }
// Human-readable name of a scalar type, for the benchmark header.
// The primary template is the fallback for types without a specialization.
template <typename T>
std::string type_name()
{
  return std::string("unknown");
}

// Specialization for single precision.
template <>
std::string type_name<float>()
{
  return std::string("float");
}

// Specialization for double precision.
template <>
std::string type_name<double>()
{
  return std::string("double");
}
// Base class of the actions selectable on the command line.
// Derived classes are expected to override both virtual functions; the
// implementations here abort the program if they are ever reached.
struct action_t
{
  virtual ~action_t() = default;

  // Name typed on the command line to select this action.
  virtual const char* invokation_name() const
  {
    abort();
    return nullptr;
  }

  // Performs the action.
  virtual void run() const
  {
    abort();
  }
};
- void show_usage_and_exit(int /*argc*/, char* argv[],
- const vector<unique_ptr<action_t>>& available_actions)
- {
- cerr << "usage: " << argv[0] << " <action> [options...]" << endl << endl;
- cerr << "available actions:" << endl << endl;
- for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
- cerr << " " << (*it)->invokation_name() << endl;
- }
- cerr << endl;
- cerr << "options:" << endl << endl;
- cerr << " --min-working-set-size=N:" << endl;
- cerr << " Set the minimum working set size to N bytes." << endl;
- cerr << " This is rounded up as needed to a multiple of matrix size." << endl;
- cerr << " A larger working set lowers the chance of a warm cache." << endl;
- cerr << " The default value 0 means use a large enough working" << endl;
- cerr << " set to likely outsize caches." << endl;
- cerr << " A value of 1 (that is, 1 byte) would mean don't do anything to" << endl;
- cerr << " avoid warm caches." << endl;
- exit(1);
- }
-
- float measure_clock_speed()
- {
- cerr << "Measuring clock speed... \r" << flush;
-
- vector<float> all_gflops;
- for (int i = 0; i < 8; i++) {
- benchmark_t b(1024, 1024, 1024);
- b.run();
- all_gflops.push_back(b.gflops);
- }
- sort(all_gflops.begin(), all_gflops.end());
- float stable_estimate = all_gflops[2] + all_gflops[3] + all_gflops[4] + all_gflops[5];
- // multiply by an arbitrary constant to discourage trying doing anything with the
- // returned values besides just comparing them with each other.
- float result = stable_estimate * 123.456f;
- return result;
- }
// Wrapper around a duration in whole seconds, whose only purpose is to be
// streamed in a human-friendly "H h M min S s" form.
struct human_duration_t
{
  int seconds;
  human_duration_t(int s) : seconds(s) {}
};

// Prints the duration as hours, minutes and seconds.
//  - The hours part is printed only when there is at least one full hour.
//  - The minutes part is printed only when at least one full minute remains
//    after removing the hours.
//  - The seconds part is printed only for total durations under 10 minutes;
//    beyond that, second-level precision is just noise.
// Each printed part is followed by its unit; hours and minutes are followed
// by a trailing space.
std::ostream& operator<<(std::ostream& s, const human_duration_t& d)
{
  int remainder = d.seconds;
  // ">=" so that exactly one hour prints as "1 h", not "60 min".
  if (remainder >= 3600) {
    const int hours = remainder / 3600;
    s << hours << " h ";
    remainder -= hours * 3600;
  }
  // ">=" so that exactly one minute prints as "1 min 0 s", not "60 s".
  if (remainder >= 60) {
    const int minutes = remainder / 60;
    s << minutes << " min ";
    remainder -= minutes * 60;
  }
  if (d.seconds < 600) {
    s << remainder << " s";
  }
  return s;
}
// Path of the file in which an interrupted benchmark session is saved, and
// from which the next invocation tries to resume.
const char session_filename[] = "/data/local/tmp/benchmark-blocking-sizes-session.data";
- void serialize_benchmarks(const char* filename, const vector<benchmark_t>& benchmarks, size_t first_benchmark_to_run)
- {
- FILE* file = fopen(filename, "w");
- if (!file) {
- cerr << "Could not open file " << filename << " for writing." << endl;
- cerr << "Do you have write permissions on the current working directory?" << endl;
- exit(1);
- }
- size_t benchmarks_vector_size = benchmarks.size();
- fwrite(&max_clock_speed, sizeof(max_clock_speed), 1, file);
- fwrite(&benchmarks_vector_size, sizeof(benchmarks_vector_size), 1, file);
- fwrite(&first_benchmark_to_run, sizeof(first_benchmark_to_run), 1, file);
- fwrite(benchmarks.data(), sizeof(benchmark_t), benchmarks.size(), file);
- fclose(file);
- }
- bool deserialize_benchmarks(const char* filename, vector<benchmark_t>& benchmarks, size_t& first_benchmark_to_run)
- {
- FILE* file = fopen(filename, "r");
- if (!file) {
- return false;
- }
- if (1 != fread(&max_clock_speed, sizeof(max_clock_speed), 1, file)) {
- return false;
- }
- size_t benchmarks_vector_size = 0;
- if (1 != fread(&benchmarks_vector_size, sizeof(benchmarks_vector_size), 1, file)) {
- return false;
- }
- if (1 != fread(&first_benchmark_to_run, sizeof(first_benchmark_to_run), 1, file)) {
- return false;
- }
- benchmarks.resize(benchmarks_vector_size);
- if (benchmarks.size() != fread(benchmarks.data(), sizeof(benchmark_t), benchmarks.size(), file)) {
- return false;
- }
- unlink(filename);
- return true;
- }
- void try_run_some_benchmarks(
- vector<benchmark_t>& benchmarks,
- double time_start,
- size_t& first_benchmark_to_run)
- {
- if (first_benchmark_to_run == benchmarks.size()) {
- return;
- }
- double time_last_progress_update = 0;
- double time_last_clock_speed_measurement = 0;
- double time_now = 0;
- size_t benchmark_index = first_benchmark_to_run;
- while (true) {
- float ratio_done = float(benchmark_index) / benchmarks.size();
- time_now = timer.getRealTime();
- // We check clock speed every minute and at the end.
- if (benchmark_index == benchmarks.size() ||
- time_now > time_last_clock_speed_measurement + 60.0f)
- {
- time_last_clock_speed_measurement = time_now;
- // Ensure that clock speed is as expected
- float current_clock_speed = measure_clock_speed();
- // The tolerance needs to be smaller than the relative difference between
- // clock speeds that a device could operate under.
- // It seems unlikely that a device would be throttling clock speeds by
- // amounts smaller than 2%.
- // With a value of 1%, I was getting within noise on a Sandy Bridge.
- const float clock_speed_tolerance = 0.02f;
- if (current_clock_speed > (1 + clock_speed_tolerance) * max_clock_speed) {
- // Clock speed is now higher than we previously measured.
- // Either our initial measurement was inaccurate, which won't happen
- // too many times as we are keeping the best clock speed value and
- // and allowing some tolerance; or something really weird happened,
- // which invalidates all benchmark results collected so far.
- // Either way, we better restart all over again now.
- if (benchmark_index) {
- cerr << "Restarting at " << 100.0f * ratio_done
- << " % because clock speed increased. " << endl;
- }
- max_clock_speed = current_clock_speed;
- first_benchmark_to_run = 0;
- return;
- }
- bool rerun_last_tests = false;
- if (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) {
- cerr << "Measurements completed so far: "
- << 100.0f * ratio_done
- << " % " << endl;
- cerr << "Clock speed seems to be only "
- << current_clock_speed/max_clock_speed
- << " times what it used to be." << endl;
- unsigned int seconds_to_sleep_if_lower_clock_speed = 1;
- while (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) {
- if (seconds_to_sleep_if_lower_clock_speed > 32) {
- cerr << "Sleeping longer probably won't make a difference." << endl;
- cerr << "Serializing benchmarks to " << session_filename << endl;
- serialize_benchmarks(session_filename, benchmarks, first_benchmark_to_run);
- cerr << "Now restart this benchmark, and it should pick up where we left." << endl;
- exit(2);
- }
- rerun_last_tests = true;
- cerr << "Sleeping "
- << seconds_to_sleep_if_lower_clock_speed
- << " s... \r" << endl;
- sleep(seconds_to_sleep_if_lower_clock_speed);
- current_clock_speed = measure_clock_speed();
- seconds_to_sleep_if_lower_clock_speed *= 2;
- }
- }
- if (rerun_last_tests) {
- cerr << "Redoing the last "
- << 100.0f * float(benchmark_index - first_benchmark_to_run) / benchmarks.size()
- << " % because clock speed had been low. " << endl;
- return;
- }
- // nothing wrong with the clock speed so far, so there won't be a need to rerun
- // benchmarks run so far in case we later encounter a lower clock speed.
- first_benchmark_to_run = benchmark_index;
- }
- if (benchmark_index == benchmarks.size()) {
- // We're done!
- first_benchmark_to_run = benchmarks.size();
- // Erase progress info
- cerr << " " << endl;
- return;
- }
- // Display progress info on stderr
- if (time_now > time_last_progress_update + 1.0f) {
- time_last_progress_update = time_now;
- cerr << "Measurements... " << 100.0f * ratio_done
- << " %, ETA "
- << human_duration_t(float(time_now - time_start) * (1.0f - ratio_done) / ratio_done)
- << " \r" << flush;
- }
- // This is where we actually run a benchmark!
- benchmarks[benchmark_index].run();
- benchmark_index++;
- }
- }
- void run_benchmarks(vector<benchmark_t>& benchmarks)
- {
- size_t first_benchmark_to_run;
- vector<benchmark_t> deserialized_benchmarks;
- bool use_deserialized_benchmarks = false;
- if (deserialize_benchmarks(session_filename, deserialized_benchmarks, first_benchmark_to_run)) {
- cerr << "Found serialized session with "
- << 100.0f * first_benchmark_to_run / deserialized_benchmarks.size()
- << " % already done" << endl;
- if (deserialized_benchmarks.size() == benchmarks.size() &&
- first_benchmark_to_run > 0 &&
- first_benchmark_to_run < benchmarks.size())
- {
- use_deserialized_benchmarks = true;
- }
- }
- if (use_deserialized_benchmarks) {
- benchmarks = deserialized_benchmarks;
- } else {
- // not using deserialized benchmarks, starting from scratch
- first_benchmark_to_run = 0;
- // Randomly shuffling benchmarks allows us to get accurate enough progress info,
- // as now the cheap/expensive benchmarks are randomly mixed so they average out.
- // It also means that if data is corrupted for some time span, the odds are that
- // not all repetitions of a given benchmark will be corrupted.
- random_shuffle(benchmarks.begin(), benchmarks.end());
- }
- for (int i = 0; i < 4; i++) {
- max_clock_speed = max(max_clock_speed, measure_clock_speed());
- }
-
- double time_start = 0.0;
- while (first_benchmark_to_run < benchmarks.size()) {
- if (first_benchmark_to_run == 0) {
- time_start = timer.getRealTime();
- }
- try_run_some_benchmarks(benchmarks,
- time_start,
- first_benchmark_to_run);
- }
- // Sort timings by increasing benchmark parameters, and decreasing gflops.
- // The latter is very important. It means that we can ignore all but the first
- // benchmark with given parameters.
- sort(benchmarks.begin(), benchmarks.end());
- // Collect best (i.e. now first) results for each parameter values.
- vector<benchmark_t> best_benchmarks;
- for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
- if (best_benchmarks.empty() ||
- best_benchmarks.back().compact_product_size != it->compact_product_size ||
- best_benchmarks.back().compact_block_size != it->compact_block_size)
- {
- best_benchmarks.push_back(*it);
- }
- }
- // keep and return only the best benchmarks
- benchmarks = best_benchmarks;
- }
- struct measure_all_pot_sizes_action_t : action_t
- {
- virtual const char* invokation_name() const { return "all-pot-sizes"; }
- virtual void run() const
- {
- vector<benchmark_t> benchmarks;
- for (int repetition = 0; repetition < measurement_repetitions; repetition++) {
- for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) {
- for (size_t msize = minsize; msize <= maxsize; msize *= 2) {
- for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) {
- for (size_t kblock = minsize; kblock <= ksize; kblock *= 2) {
- for (size_t mblock = minsize; mblock <= msize; mblock *= 2) {
- for (size_t nblock = minsize; nblock <= nsize; nblock *= 2) {
- benchmarks.emplace_back(ksize, msize, nsize, kblock, mblock, nblock);
- }
- }
- }
- }
- }
- }
- }
- run_benchmarks(benchmarks);
- cout << "BEGIN MEASUREMENTS ALL POT SIZES" << endl;
- for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
- cout << *it << endl;
- }
- }
- };
- struct measure_default_sizes_action_t : action_t
- {
- virtual const char* invokation_name() const { return "default-sizes"; }
- virtual void run() const
- {
- vector<benchmark_t> benchmarks;
- for (int repetition = 0; repetition < measurement_repetitions; repetition++) {
- for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) {
- for (size_t msize = minsize; msize <= maxsize; msize *= 2) {
- for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) {
- benchmarks.emplace_back(ksize, msize, nsize);
- }
- }
- }
- }
- run_benchmarks(benchmarks);
- cout << "BEGIN MEASUREMENTS DEFAULT SIZES" << endl;
- for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
- cout << *it << endl;
- }
- }
- };
- int main(int argc, char* argv[])
- {
- double time_start = timer.getRealTime();
- cout.precision(4);
- cerr.precision(4);
- vector<unique_ptr<action_t>> available_actions;
- available_actions.emplace_back(new measure_all_pot_sizes_action_t);
- available_actions.emplace_back(new measure_default_sizes_action_t);
- auto action = available_actions.end();
- if (argc <= 1) {
- show_usage_and_exit(argc, argv, available_actions);
- }
- for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
- if (!strcmp(argv[1], (*it)->invokation_name())) {
- action = it;
- break;
- }
- }
- if (action == available_actions.end()) {
- show_usage_and_exit(argc, argv, available_actions);
- }
- for (int i = 2; i < argc; i++) {
- if (argv[i] == strstr(argv[i], "--min-working-set-size=")) {
- const char* equals_sign = strchr(argv[i], '=');
- min_working_set_size = strtoul(equals_sign+1, nullptr, 10);
- } else {
- cerr << "unrecognized option: " << argv[i] << endl << endl;
- show_usage_and_exit(argc, argv, available_actions);
- }
- }
- print_cpuinfo();
- cout << "benchmark parameters:" << endl;
- cout << "pointer size: " << 8*sizeof(void*) << " bits" << endl;
- cout << "scalar type: " << type_name<Scalar>() << endl;
- cout << "packet size: " << internal::packet_traits<MatrixType::Scalar>::size << endl;
- cout << "minsize = " << minsize << endl;
- cout << "maxsize = " << maxsize << endl;
- cout << "measurement_repetitions = " << measurement_repetitions << endl;
- cout << "min_accurate_time = " << min_accurate_time << endl;
- cout << "min_working_set_size = " << min_working_set_size;
- if (min_working_set_size == 0) {
- cout << " (try to outsize caches)";
- }
- cout << endl << endl;
- (*action)->run();
- double time_end = timer.getRealTime();
- cerr << "Finished in " << human_duration_t(time_end - time_start) << endl;
- }
|