123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384 |
- ///////////////////////////////////////////////////////////////
- // Copyright 2020 Madhur Chauhan.
- // Copyright 2020 John Maddock. Distributed under the Boost
- // Software License, Version 1.0. (See accompanying file
- // LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt
- #ifndef BOOST_MP_ADD_UNSIGNED_ADDC_32_HPP
- #define BOOST_MP_ADD_UNSIGNED_ADDC_32_HPP
- #include <boost/multiprecision/cpp_int/intel_intrinsics.hpp>
- namespace boost { namespace multiprecision { namespace backends {
- template <class CppInt1, class CppInt2, class CppInt3>
- inline BOOST_MP_CXX14_CONSTEXPR void add_unsigned_constexpr(CppInt1& result, const CppInt2& a, const CppInt3& b) noexcept(is_non_throwing_cpp_int<CppInt1>::value)
- {
- using ::boost::multiprecision::std_constexpr::swap;
- //
- // This is the generic, C++ only version of addition.
- // It's also used for all constexpr branches, hence the name.
- // Nothing fancy, just let uintmax_t take the strain:
- //
- double_limb_type carry = 0;
- unsigned m(0), x(0);
- unsigned as = a.size();
- unsigned bs = b.size();
- minmax(as, bs, m, x);
- if (x == 1)
- {
- bool s = a.sign();
- result = static_cast<double_limb_type>(*a.limbs()) + static_cast<double_limb_type>(*b.limbs());
- result.sign(s);
- return;
- }
- result.resize(x, x);
- typename CppInt2::const_limb_pointer pa = a.limbs();
- typename CppInt3::const_limb_pointer pb = b.limbs();
- typename CppInt1::limb_pointer pr = result.limbs();
- typename CppInt1::limb_pointer pr_end = pr + m;
- if (as < bs)
- swap(pa, pb);
- // First where a and b overlap:
- while (pr != pr_end)
- {
- carry += static_cast<double_limb_type>(*pa) + static_cast<double_limb_type>(*pb);
- #ifdef __MSVC_RUNTIME_CHECKS
- *pr = static_cast<limb_type>(carry & ~static_cast<limb_type>(0));
- #else
- *pr = static_cast<limb_type>(carry);
- #endif
- carry >>= CppInt1::limb_bits;
- ++pr, ++pa, ++pb;
- }
- pr_end += x - m;
- // Now where only a has digits:
- while (pr != pr_end)
- {
- if (!carry)
- {
- if (pa != pr)
- std_constexpr::copy(pa, pa + (pr_end - pr), pr);
- break;
- }
- carry += static_cast<double_limb_type>(*pa);
- #ifdef __MSVC_RUNTIME_CHECKS
- *pr = static_cast<limb_type>(carry & ~static_cast<limb_type>(0));
- #else
- *pr = static_cast<limb_type>(carry);
- #endif
- carry >>= CppInt1::limb_bits;
- ++pr, ++pa;
- }
- if (carry)
- {
- // We overflowed, need to add one more limb:
- result.resize(x + 1, x + 1);
- if (result.size() > x)
- result.limbs()[x] = static_cast<limb_type>(1u);
- }
- result.normalize();
- result.sign(a.sign());
- }
- //
- // Core subtraction routine for all non-trivial cpp_int's:
- //
- template <class CppInt1, class CppInt2, class CppInt3>
- inline BOOST_MP_CXX14_CONSTEXPR void subtract_unsigned_constexpr(CppInt1& result, const CppInt2& a, const CppInt3& b) noexcept(is_non_throwing_cpp_int<CppInt1>::value)
- {
- using ::boost::multiprecision::std_constexpr::swap;
- //
- // This is the generic, C++ only version of subtraction.
- // It's also used for all constexpr branches, hence the name.
- // Nothing fancy, just let uintmax_t take the strain:
- //
- double_limb_type borrow = 0;
- unsigned m(0), x(0);
- minmax(a.size(), b.size(), m, x);
- //
- // special cases for small limb counts:
- //
- if (x == 1)
- {
- bool s = a.sign();
- limb_type al = *a.limbs();
- limb_type bl = *b.limbs();
- if (bl > al)
- {
- ::boost::multiprecision::std_constexpr::swap(al, bl);
- s = !s;
- }
- result = al - bl;
- result.sign(s);
- return;
- }
- // This isn't used till later, but comparison has to occur before we resize the result,
- // as that may also resize a or b if this is an inplace operation:
- int c = a.compare_unsigned(b);
- // Set up the result vector:
- result.resize(x, x);
- // Now that a, b, and result are stable, get pointers to their limbs:
- typename CppInt2::const_limb_pointer pa = a.limbs();
- typename CppInt3::const_limb_pointer pb = b.limbs();
- typename CppInt1::limb_pointer pr = result.limbs();
- bool swapped = false;
- if (c < 0)
- {
- swap(pa, pb);
- swapped = true;
- }
- else if (c == 0)
- {
- result = static_cast<limb_type>(0);
- return;
- }
- unsigned i = 0;
- // First where a and b overlap:
- while (i < m)
- {
- borrow = static_cast<double_limb_type>(pa[i]) - static_cast<double_limb_type>(pb[i]) - borrow;
- pr[i] = static_cast<limb_type>(borrow);
- borrow = (borrow >> CppInt1::limb_bits) & 1u;
- ++i;
- }
- // Now where only a has digits, only as long as we've borrowed:
- while (borrow && (i < x))
- {
- borrow = static_cast<double_limb_type>(pa[i]) - borrow;
- pr[i] = static_cast<limb_type>(borrow);
- borrow = (borrow >> CppInt1::limb_bits) & 1u;
- ++i;
- }
- // Any remaining digits are the same as those in pa:
- if ((x != i) && (pa != pr))
- std_constexpr::copy(pa + i, pa + x, pr + i);
- BOOST_ASSERT(0 == borrow);
- //
- // We may have lost digits, if so update limb usage count:
- //
- result.normalize();
- result.sign(a.sign());
- if (swapped)
- result.negate();
- }
- #ifdef BOOST_MP_HAS_IMMINTRIN_H
- //
- // This is the key addition routine where all the argument types are non-trivial cpp_int's:
- //
- //
- // This optimization is limited to: GCC, LLVM, ICC (Intel), MSVC for x86_64 and i386.
- // If your architecture and compiler supports ADC intrinsic, please file a bug
- //
- // As of May, 2020 major compilers don't recognize carry chain though adc
- // intrinsics are used to hint compilers to use ADC and still compilers don't
- // unroll the loop efficiently (except LLVM) so manual unrolling is done.
- //
- // Also note that these intrinsics were only introduced by Intel as part of the
- // ADX processor extensions, even though the addc instruction has been available
- // for basically all x86 processors. That means gcc-9, clang-9, msvc-14.2 and up
- // are required to support these intrinsics.
- //
- template <class CppInt1, class CppInt2, class CppInt3>
- inline BOOST_MP_CXX14_CONSTEXPR void add_unsigned(CppInt1& result, const CppInt2& a, const CppInt3& b) noexcept(is_non_throwing_cpp_int<CppInt1>::value)
- {
- #ifndef BOOST_MP_NO_CONSTEXPR_DETECTION
- if (BOOST_MP_IS_CONST_EVALUATED(a.size()))
- {
- add_unsigned_constexpr(result, a, b);
- }
- else
- #endif
- {
- using std::swap;
- // Nothing fancy, just let uintmax_t take the strain:
- unsigned m(0), x(0);
- unsigned as = a.size();
- unsigned bs = b.size();
- minmax(as, bs, m, x);
- if (x == 1)
- {
- bool s = a.sign();
- result = static_cast<double_limb_type>(*a.limbs()) + static_cast<double_limb_type>(*b.limbs());
- result.sign(s);
- return;
- }
- result.resize(x, x);
- typename CppInt2::const_limb_pointer pa = a.limbs();
- typename CppInt3::const_limb_pointer pb = b.limbs();
- typename CppInt1::limb_pointer pr = result.limbs();
- if (as < bs)
- swap(pa, pb);
- // First where a and b overlap:
- unsigned i = 0;
- unsigned char carry = 0;
- #if defined(BOOST_MSVC) && !defined(BOOST_HAS_INT128) && defined(_M_X64)
- //
- // Special case for 32-bit limbs on 64-bit architecture - we can process
- // 2 limbs with each instruction.
- //
- for (; i + 8 <= m; i += 8)
- {
- carry = _addcarry_u64(carry, *(unsigned long long*)(pa + i + 0), *(unsigned long long*)(pb + i + 0), (unsigned long long*)(pr + i));
- carry = _addcarry_u64(carry, *(unsigned long long*)(pa + i + 2), *(unsigned long long*)(pb + i + 2), (unsigned long long*)(pr + i + 2));
- carry = _addcarry_u64(carry, *(unsigned long long*)(pa + i + 4), *(unsigned long long*)(pb + i + 4), (unsigned long long*)(pr + i + 4));
- carry = _addcarry_u64(carry, *(unsigned long long*)(pa + i + 6), *(unsigned long long*)(pb + i + 6), (unsigned long long*)(pr + i + 6));
- }
- #else
- for (; i + 4 <= m; i += 4)
- {
- carry = ::boost::multiprecision::detail::addcarry_limb(carry, pa[i + 0], pb[i + 0], pr + i);
- carry = ::boost::multiprecision::detail::addcarry_limb(carry, pa[i + 1], pb[i + 1], pr + i + 1);
- carry = ::boost::multiprecision::detail::addcarry_limb(carry, pa[i + 2], pb[i + 2], pr + i + 2);
- carry = ::boost::multiprecision::detail::addcarry_limb(carry, pa[i + 3], pb[i + 3], pr + i + 3);
- }
- #endif
- for (; i < m; ++i)
- carry = ::boost::multiprecision::detail::addcarry_limb(carry, pa[i], pb[i], pr + i);
- for (; i < x && carry; ++i)
- carry = ::boost::multiprecision::detail::addcarry_limb(carry, pa[i], 0, pr + i);
- if (i == x && carry)
- {
- // We overflowed, need to add one more limb:
- result.resize(x + 1, x + 1);
- if (result.size() > x)
- result.limbs()[x] = static_cast<limb_type>(1u);
- }
- else
- std::copy(pa + i, pa + x, pr + i);
- result.normalize();
- result.sign(a.sign());
- }
- }
- template <class CppInt1, class CppInt2, class CppInt3>
- inline BOOST_MP_CXX14_CONSTEXPR void subtract_unsigned(CppInt1& result, const CppInt2& a, const CppInt3& b) noexcept(is_non_throwing_cpp_int<CppInt1>::value)
- {
- #ifndef BOOST_MP_NO_CONSTEXPR_DETECTION
- if (BOOST_MP_IS_CONST_EVALUATED(a.size()))
- {
- subtract_unsigned_constexpr(result, a, b);
- }
- else
- #endif
- {
- using std::swap;
- // Nothing fancy, just let uintmax_t take the strain:
- unsigned m(0), x(0);
- minmax(a.size(), b.size(), m, x);
- //
- // special cases for small limb counts:
- //
- if (x == 1)
- {
- bool s = a.sign();
- limb_type al = *a.limbs();
- limb_type bl = *b.limbs();
- if (bl > al)
- {
- ::boost::multiprecision::std_constexpr::swap(al, bl);
- s = !s;
- }
- result = al - bl;
- result.sign(s);
- return;
- }
- // This isn't used till later, but comparison has to occur before we resize the result,
- // as that may also resize a or b if this is an inplace operation:
- int c = a.compare_unsigned(b);
- // Set up the result vector:
- result.resize(x, x);
- // Now that a, b, and result are stable, get pointers to their limbs:
- typename CppInt2::const_limb_pointer pa = a.limbs();
- typename CppInt3::const_limb_pointer pb = b.limbs();
- typename CppInt1::limb_pointer pr = result.limbs();
- bool swapped = false;
- if (c < 0)
- {
- swap(pa, pb);
- swapped = true;
- }
- else if (c == 0)
- {
- result = static_cast<limb_type>(0);
- return;
- }
- unsigned i = 0;
- unsigned char borrow = 0;
- // First where a and b overlap:
- #if defined(BOOST_MSVC) && !defined(BOOST_HAS_INT128) && defined(_M_X64)
- //
- // Special case for 32-bit limbs on 64-bit architecture - we can process
- // 2 limbs with each instruction.
- //
- for (; i + 8 <= m; i += 8)
- {
- borrow = _subborrow_u64(borrow, *reinterpret_cast<const unsigned long long*>(pa + i), *reinterpret_cast<const unsigned long long*>(pb + i), reinterpret_cast<unsigned long long*>(pr + i));
- borrow = _subborrow_u64(borrow, *reinterpret_cast<const unsigned long long*>(pa + i + 2), *reinterpret_cast<const unsigned long long*>(pb + i + 2), reinterpret_cast<unsigned long long*>(pr + i + 2));
- borrow = _subborrow_u64(borrow, *reinterpret_cast<const unsigned long long*>(pa + i + 4), *reinterpret_cast<const unsigned long long*>(pb + i + 4), reinterpret_cast<unsigned long long*>(pr + i + 4));
- borrow = _subborrow_u64(borrow, *reinterpret_cast<const unsigned long long*>(pa + i + 6), *reinterpret_cast<const unsigned long long*>(pb + i + 6), reinterpret_cast<unsigned long long*>(pr + i + 6));
- }
- #else
- for(; i + 4 <= m; i += 4)
- {
- borrow = boost::multiprecision::detail::subborrow_limb(borrow, pa[i], pb[i], pr + i);
- borrow = boost::multiprecision::detail::subborrow_limb(borrow, pa[i + 1], pb[i + 1], pr + i + 1);
- borrow = boost::multiprecision::detail::subborrow_limb(borrow, pa[i + 2], pb[i + 2], pr + i + 2);
- borrow = boost::multiprecision::detail::subborrow_limb(borrow, pa[i + 3], pb[i + 3], pr + i + 3);
- }
- #endif
- for (; i < m; ++i)
- borrow = boost::multiprecision::detail::subborrow_limb(borrow, pa[i], pb[i], pr + i);
- // Now where only a has digits, only as long as we've borrowed:
- while (borrow && (i < x))
- {
- borrow = boost::multiprecision::detail::subborrow_limb(borrow, pa[i], 0, pr + i);
- ++i;
- }
- // Any remaining digits are the same as those in pa:
- if ((x != i) && (pa != pr))
- std_constexpr::copy(pa + i, pa + x, pr + i);
- BOOST_ASSERT(0 == borrow);
- //
- // We may have lost digits, if so update limb usage count:
- //
- result.normalize();
- result.sign(a.sign());
- if (swapped)
- result.negate();
- } // constepxr.
- }
- #else
- template <class CppInt1, class CppInt2, class CppInt3>
- inline BOOST_MP_CXX14_CONSTEXPR void add_unsigned(CppInt1& result, const CppInt2& a, const CppInt3& b) noexcept(is_non_throwing_cpp_int<CppInt1>::value)
- {
- add_unsigned_constexpr(result, a, b);
- }
- template <class CppInt1, class CppInt2, class CppInt3>
- inline BOOST_MP_CXX14_CONSTEXPR void subtract_unsigned(CppInt1& result, const CppInt2& a, const CppInt3& b) noexcept(is_non_throwing_cpp_int<CppInt1>::value)
- {
- subtract_unsigned_constexpr(result, a, b);
- }
- #endif
- } } } // namespaces
- #endif
|