Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
26 changes: 26 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ include(config_ccache)

option(AF_BUILD_CPU "Build ArrayFire with a CPU backend" ON)
option(AF_BUILD_CUDA "Build ArrayFire with a CUDA backend" ${CUDA_FOUND})
option(AF_BUILD_HIP "Build ArrayFire with a HIP/ROCm backend" OFF)
option(AF_BUILD_OPENCL "Build ArrayFire with a OpenCL backend" ${OpenCL_FOUND})
option(AF_BUILD_ONEAPI "Build ArrayFire with a oneAPI backend" OFF)
option(AF_BUILD_UNIFIED "Build Backend-Independent ArrayFire API" ON)
Expand Down Expand Up @@ -191,6 +192,30 @@ if(AF_BUILD_CUDA)
endif()
endif()

# The HIP/ROCm backend (src/backend/hip) is a sibling of the CUDA backend cloned
# from it; it builds as library `afcuda` and reports AF_BACKEND_CUDA so the
# unified dispatcher and the test suite treat it as the CUDA-model backend on
# AMD. It is therefore mutually exclusive with AF_BUILD_CUDA.
if(AF_BUILD_HIP)
if(AF_BUILD_CUDA)
message(FATAL_ERROR
"AF_BUILD_HIP and AF_BUILD_CUDA are mutually exclusive: the HIP backend "
"builds as the CUDA-identity backend (afcuda).")
endif()
if(NOT DEFINED CMAKE_HIP_ARCHITECTURES OR CMAKE_HIP_ARCHITECTURES STREQUAL "")
# Default the target GPU arch when the caller has not set one; override with
# -DCMAKE_HIP_ARCHITECTURES=<arch> (e.g. gfx90a, gfx942, gfx1100) as needed.
set(CMAKE_HIP_ARCHITECTURES "gfx90a")
endif()
check_language(HIP)
if(CMAKE_HIP_COMPILER)
enable_language(HIP)
else()
message(FATAL_ERROR "No HIP support (set CMAKE_HIP_COMPILER, e.g. /opt/rocm/llvm/bin/clang++)")
endif()
find_package(hip REQUIRED)
endif()

af_deprecate(BUILD_CPU AF_BUILD_CPU)
af_deprecate(BUILD_CUDA AF_BUILD_CUDA)
af_deprecate(BUILD_OPENCL AF_BUILD_OPENCL)
Expand Down Expand Up @@ -384,6 +409,7 @@ add_subdirectory(src/api/cpp)

conditional_directory(AF_BUILD_CPU src/backend/cpu)
conditional_directory(AF_BUILD_CUDA src/backend/cuda)
conditional_directory(AF_BUILD_HIP src/backend/hip)
conditional_directory(AF_BUILD_ONEAPI src/backend/oneapi)
conditional_directory(AF_BUILD_OPENCL src/backend/opencl)
conditional_directory(AF_BUILD_UNIFIED src/api/unified)
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ Several of ArrayFire's benefits include:
* [Easy to use](http://arrayfire.org/docs/gettingstarted.htm), stable,
[well-documented](http://arrayfire.org/docs) API
* Rigorous benchmarks and tests ensuring top performance and numerical accuracy
* Cross-platform compatibility with support for CUDA, oneAPI, OpenCL, and
native CPU on Windows, Mac, and Linux
* Cross-platform compatibility with support for CUDA (on NVIDIA GPUs, or AMD
GPUs via ROCm/HIP), oneAPI, OpenCL, and native CPU on Windows, Mac, and Linux
* Built-in visualization functions through
[Forge](https://github.com/arrayfire/forge)
* Commercially friendly open-source licensing
Expand Down
17 changes: 10 additions & 7 deletions src/backend/common/ArrayFireTypesIO.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ struct fmt::formatter<af_seq> {
}

template<typename FormatContext>
auto format(const af_seq& p, FormatContext& ctx) -> decltype(ctx.out()) {
auto format(const af_seq& p, FormatContext& ctx) const
-> decltype(ctx.out()) {
// ctx.out() is an output iterator to write to.
if (p.begin == af_span.begin && p.end == af_span.end &&
p.step == af_span.step) {
Expand Down Expand Up @@ -73,18 +74,20 @@ struct fmt::formatter<arrayfire::common::Version> {
}

template<typename FormatContext>
auto format(const arrayfire::common::Version& ver, FormatContext& ctx)
auto format(const arrayfire::common::Version& ver, FormatContext& ctx) const
-> decltype(ctx.out()) {
// fmt v10+ requires format() to be const, so do not mutate the
// show_minor/show_patch members here; use locals.
if (ver.major() == -1) return format_to(ctx.out(), "N/A");
if (ver.minor() == -1) show_minor = false;
if (ver.patch() == -1) show_patch = false;
if (show_major && !show_minor && !show_patch) {
const bool minor = show_minor && (ver.minor() != -1);
const bool patch = show_patch && (ver.patch() != -1);
if (show_major && !minor && !patch) {
return format_to(ctx.out(), "{}", ver.major());
}
if (show_major && show_minor && !show_patch) {
if (show_major && minor && !patch) {
return format_to(ctx.out(), "{}.{}", ver.major(), ver.minor());
}
if (show_major && show_minor && show_patch) {
if (show_major && minor && patch) {
return format_to(ctx.out(), "{}.{}.{}", ver.major(), ver.minor(),
ver.patch());
}
Expand Down
117 changes: 114 additions & 3 deletions src/backend/common/half.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,12 +79,122 @@ struct is_same<T, T> {
template<class T, class U>
constexpr bool is_same_v = is_same<T, U>::value;

#if defined(__HIP_RTC__)
// hipRTC's bundled std is smaller than NVRTC's, so the integer traits the half
// int2half SFINAE needs are provided here too (NVRTC already had them). NVRTC
// bundles the full <type_traits>/<limits>/<cmath>, so this block is sub-gated to
// the hipRTC path only and stays inert for the NVRTC build.
template<class T, T v>
struct integral_constant {
static constexpr T value = v;
using value_type = T;
using type = integral_constant;
constexpr operator value_type() const noexcept { return value; }
};
using true_type = integral_constant<bool, true>;
using false_type = integral_constant<bool, false>;

template<class T>
struct is_integral : false_type {};
template<>
struct is_integral<bool> : true_type {};
template<>
struct is_integral<char> : true_type {};
template<>
struct is_integral<signed char> : true_type {};
template<>
struct is_integral<unsigned char> : true_type {};
template<>
struct is_integral<short> : true_type {};
template<>
struct is_integral<unsigned short> : true_type {};
template<>
struct is_integral<int> : true_type {};
template<>
struct is_integral<unsigned int> : true_type {};
template<>
struct is_integral<long> : true_type {};
template<>
struct is_integral<unsigned long> : true_type {};
template<>
struct is_integral<long long> : true_type {};
template<>
struct is_integral<unsigned long long> : true_type {};

template<class T>
struct is_signed {
static constexpr bool value = (T(-1) < T(0));
};

// Minimal numeric_limits for the half conversion code (min/max for the integer
// target types and a float specialization for the half numeric_limits base).
// NVRTC bundles the full <limits>; hipRTC does not.
template<class T>
struct numeric_limits {
__device__ static constexpr T min() { return T(); }
__device__ static constexpr T max() { return T(); }
static constexpr int round_style = 1; // round_to_nearest
};
#define AF_RTC_NLIMITS(T, MN, MX) \
template<> \
struct numeric_limits<T> { \
__device__ static constexpr T min() { return MN; } \
__device__ static constexpr T max() { return MX; } \
static constexpr int round_style = 1; \
};
AF_RTC_NLIMITS(char, (-128), 127)
AF_RTC_NLIMITS(signed char, (-128), 127)
AF_RTC_NLIMITS(unsigned char, 0, 255)
AF_RTC_NLIMITS(short, (-32768), 32767)
AF_RTC_NLIMITS(unsigned short, 0, 65535)
AF_RTC_NLIMITS(int, (-2147483647 - 1), 2147483647)
AF_RTC_NLIMITS(unsigned int, 0u, 4294967295u)
AF_RTC_NLIMITS(long long, (-9223372036854775807LL - 1), 9223372036854775807LL)
AF_RTC_NLIMITS(unsigned long long, 0ull, 18446744073709551615ull)
#undef AF_RTC_NLIMITS
template<>
struct numeric_limits<float> {
__device__ static constexpr float min() { return 1.17549435e-38f; }
__device__ static constexpr float max() { return 3.40282347e+38f; }
__device__ static constexpr float lowest() { return -3.40282347e+38f; }
__device__ static constexpr float infinity() { return __builtin_huge_valf(); }
static constexpr int round_style = 1;
};
template<>
struct numeric_limits<double> {
__device__ static constexpr double min() { return 2.2250738585072014e-308; }
__device__ static constexpr double max() { return 1.7976931348623157e+308; }
__device__ static constexpr double lowest() {
return -1.7976931348623157e+308;
}
__device__ static constexpr double infinity() { return __builtin_huge_val(); }
static constexpr int round_style = 1;
};

// hipRTC's runtime header injects an isnan/isinf overload taking hip_bfloat16
// but no float/double overload, so an unqualified std::isnan(float) in the
// embedded device headers (math.hpp is_nan<float>) finds only the bfloat16
// candidate. Provide the float/double overloads (device builtins) so the exact
// match wins. NVRTC bundles the full <cmath> set.
__device__ inline bool isnan(float v) { return __builtin_isnan(v); }
__device__ inline bool isnan(double v) { return __builtin_isnan(v); }
__device__ inline bool isinf(float v) { return __builtin_isinf(v); }
__device__ inline bool isinf(double v) { return __builtin_isinf(v); }
#endif // __HIP_RTC__

} // namespace std

using uint16_t = unsigned short;
// we do not include the af/compilers header in nvrtc compilations so
// we are defining the AF_CONSTEXPR expression here
// we are defining the AF_CONSTEXPR expression here. Under hipRTC the half ctors
// call __float2half-style intrinsics that are not constexpr (clang rejects them
// with -Winvalid-constexpr), so AF_CONSTEXPR must be empty there; NVRTC keeps
// constexpr.
#if defined(__HIP_RTC__)
#define AF_CONSTEXPR
#else
#define AF_CONSTEXPR constexpr
#endif
#else
#include <af/compilers.h>
#include <algorithm>
Expand Down Expand Up @@ -861,7 +971,7 @@ AF_CONSTEXPR __DH__ native_half_t int2half(T value) noexcept {
/// value
/// \param value The value to convert to integer
template<std::float_round_style R, bool E, typename T>
AF_CONSTEXPR T half2int(native_half_t value) {
AF_CONSTEXPR __DH__ T half2int(native_half_t value) {
#ifdef __CUDA_ARCH__
AF_IF_CONSTEXPR(std::is_same<T, short>::value ||
std::is_same<T, char>::value ||
Expand Down Expand Up @@ -1012,6 +1122,7 @@ class alignas(2) half {
}
#endif


__DH__ explicit operator float() const noexcept {
return half2float(data_);
}
Expand Down Expand Up @@ -1089,7 +1200,7 @@ class alignas(2) half {
return *this;
}

AF_CONSTEXPR static half infinity() {
AF_CONSTEXPR __DH__ static half infinity() {
half out;
#ifdef __CUDA_ARCH__
out.data_ = __half_raw{0x7C00};
Expand Down
Loading