Program Listing for File cudaStorage.hpp#
↰ Return to documentation for file (librapid/include/librapid/cuda/cudaStorage.hpp)
#ifndef LIBRAPID_ARRAY_CUDA_STORAGE_HPP
#define LIBRAPID_ARRAY_CUDA_STORAGE_HPP
#if defined(LIBRAPID_HAS_CUDA)
/*
* Implement a storage class for CUDA data structures. It will expose
* the same functions as the `librapid::Storage` class, but the underlying
* memory buffer will be allocated on the GPU.
*/
namespace librapid {
namespace typetraits {
template<typename Scalar_>
struct TypeInfo<CudaStorage<Scalar_>> {
static constexpr bool isLibRapidType = true;
using Scalar = Scalar_;
using Backend = backend::CUDA;
};
template<typename T>
struct IsCudaStorage : std::false_type {};
template<typename Scalar>
struct IsCudaStorage<CudaStorage<Scalar>> : std::true_type {};
LIBRAPID_DEFINE_AS_TYPE(typename Scalar_, CudaStorage<Scalar_>);
} // namespace typetraits
namespace detail {
# define CUDA_REF_OPERATOR(OP) \
template<typename LHS, typename RHS> \
auto operator OP(const CudaRef<LHS> &lhs, const RHS &rhs) { \
return lhs.get() OP rhs; \
} \
\
template<typename LHS, typename RHS> \
auto operator OP(const LHS &lhs, const CudaRef<RHS> &rhs) { \
return lhs OP rhs.get(); \
} \
\
template<typename LHS, typename RHS> \
auto operator OP(const CudaRef<LHS> &lhs, const CudaRef<RHS> &rhs) { \
return lhs.get() OP rhs.get(); \
} \
\
template<typename LHS, typename RHS> \
auto operator OP##=(CudaRef<LHS> &lhs, const RHS &rhs) { \
lhs = lhs.get() OP rhs; \
} \
\
template<typename LHS, typename RHS> \
auto operator OP##=(CudaRef<LHS> &lhs, const CudaRef<RHS> &rhs) { \
lhs = lhs.get() OP rhs.get(); \
}
# define CUDA_REF_OPERATOR_NO_ASSIGN(OP) \
template<typename LHS, typename RHS> \
auto operator OP(const CudaRef<LHS> &lhs, const RHS &rhs) { \
return lhs.get() OP rhs; \
} \
\
template<typename LHS, typename RHS> \
auto operator OP(const LHS &lhs, const CudaRef<RHS> &rhs) { \
return lhs OP rhs.get(); \
} \
\
template<typename LHS, typename RHS> \
auto operator OP(const CudaRef<LHS> &lhs, const CudaRef<RHS> &rhs) { \
return lhs.get() OP rhs.get(); \
}
template<typename T>
class CudaRef {
public:
using PtrType = T *;
CudaRef(PtrType ptr, size_t offset) : m_ptr(ptr), m_offset(offset) {}
LIBRAPID_ALWAYS_INLINE CudaRef &operator=(const T &val) {
cudaSafeCall(cudaMemcpyAsync(
m_ptr + m_offset, &val, sizeof(T), cudaMemcpyHostToDevice, global::cudaStream));
return *this;
}
LIBRAPID_NODISCARD LIBRAPID_ALWAYS_INLINE T get() const {
T tmp;
cudaSafeCall(cudaMemcpyAsync(
&tmp, m_ptr + m_offset, sizeof(T), cudaMemcpyDeviceToHost, global::cudaStream));
return tmp;
}
template<typename CAST>
LIBRAPID_NODISCARD LIBRAPID_ALWAYS_INLINE operator CAST() const {
return static_cast<CAST>(get());
}
template<typename T_, typename Char, typename Ctx>
void str(const fmt::formatter<T_, Char> &format, Ctx &ctx) const {
format.format(get(), ctx);
}
private:
T *m_ptr;
size_t m_offset;
};
CUDA_REF_OPERATOR(+)
CUDA_REF_OPERATOR(-)
CUDA_REF_OPERATOR(*)
CUDA_REF_OPERATOR(/)
CUDA_REF_OPERATOR(%)
CUDA_REF_OPERATOR(^)
CUDA_REF_OPERATOR(&)
CUDA_REF_OPERATOR(|)
CUDA_REF_OPERATOR(<<)
CUDA_REF_OPERATOR(>>)
CUDA_REF_OPERATOR_NO_ASSIGN(==)
CUDA_REF_OPERATOR_NO_ASSIGN(!=)
CUDA_REF_OPERATOR_NO_ASSIGN(<)
CUDA_REF_OPERATOR_NO_ASSIGN(>)
CUDA_REF_OPERATOR_NO_ASSIGN(<=)
CUDA_REF_OPERATOR_NO_ASSIGN(>=)
} // namespace detail
template<typename Scalar_>
class CudaStorage {
public:
using Scalar = Scalar_;
using Pointer = Scalar *;
using ConstPointer = const Scalar *;
using Reference = Scalar &;
using ConstReference = const Scalar &;
using DifferenceType = std::ptrdiff_t;
using SizeType = std::size_t;
CudaStorage() = default;
LIBRAPID_ALWAYS_INLINE explicit CudaStorage(SizeType size);
LIBRAPID_ALWAYS_INLINE CudaStorage(SizeType size, ConstReference value);
LIBRAPID_ALWAYS_INLINE CudaStorage(Scalar *begin, SizeType size, bool ownsData);
LIBRAPID_ALWAYS_INLINE CudaStorage(const CudaStorage &other);
LIBRAPID_ALWAYS_INLINE CudaStorage(CudaStorage &&other) noexcept;
LIBRAPID_ALWAYS_INLINE CudaStorage(const std::initializer_list<Scalar> &list);
LIBRAPID_ALWAYS_INLINE explicit CudaStorage(const std::vector<Scalar> &vec);
template<typename ShapeType>
static ShapeType defaultShape();
static CudaStorage fromData(const std::initializer_list<Scalar> &vec);
static CudaStorage fromData(const std::vector<Scalar> &vec);
LIBRAPID_ALWAYS_INLINE CudaStorage &operator=(const CudaStorage &other);
LIBRAPID_ALWAYS_INLINE CudaStorage &operator=(CudaStorage &&other) noexcept;
~CudaStorage();
CudaStorage copy() const;
LIBRAPID_ALWAYS_INLINE void resize(SizeType newSize);
LIBRAPID_ALWAYS_INLINE void resize(SizeType newSize, int);
LIBRAPID_NODISCARD LIBRAPID_ALWAYS_INLINE SizeType size() const noexcept;
LIBRAPID_NODISCARD LIBRAPID_ALWAYS_INLINE detail::CudaRef<Scalar>
operator[](SizeType index) const;
LIBRAPID_NODISCARD LIBRAPID_ALWAYS_INLINE detail::CudaRef<Scalar>
operator[](SizeType index);
LIBRAPID_NODISCARD LIBRAPID_ALWAYS_INLINE Pointer data() const noexcept;
LIBRAPID_NODISCARD LIBRAPID_ALWAYS_INLINE Pointer begin() const noexcept;
LIBRAPID_NODISCARD LIBRAPID_ALWAYS_INLINE Pointer end() const noexcept;
private:
// Copy data from \p begin to \p end into this Storage object
template<typename P>
LIBRAPID_ALWAYS_INLINE void initData(P begin, P end);
Pointer m_begin = nullptr;
size_t m_size;
bool m_ownsData = true;
};
namespace detail {
template<typename T>
LIBRAPID_NODISCARD LIBRAPID_ALWAYS_INLINE T *__restrict cudaSafeAllocate(size_t size) {
if (size == 0) return nullptr;
static_assert(typetraits::TriviallyDefaultConstructible<T>::value,
"Data type must be trivially constructable for use with CUDA");
T *result;
// Round size up to nearest multiple of 32
size = (size + size_t(31)) & ~size_t(31);
cudaSafeCall(cudaMallocAsync(&result, sizeof(T) * size, global::cudaStream));
return result;
}
template<typename T>
LIBRAPID_ALWAYS_INLINE void cudaSafeDeallocate(T *__restrict data) {
static_assert(std::is_trivially_destructible_v<T>,
"Data type must be trivially constructable for use with CUDA");
if (data == nullptr) return;
cudaSafeCall(cudaFreeAsync(data, global::cudaStream));
}
} // namespace detail
template<typename T>
CudaStorage<T>::CudaStorage(SizeType size) :
m_size(size), m_begin(detail::cudaSafeAllocate<T>(size)), m_ownsData(true) {}
template<typename T>
CudaStorage<T>::CudaStorage(SizeType size, ConstReference value) :
m_size(size), m_begin(detail::cudaSafeAllocate<T>(size)), m_ownsData(true) {
// Fill the data with "value"
cuda::runKernel<T, T>("fill", "fillArray", size, size, m_begin, value);
}
template<typename T>
CudaStorage<T>::CudaStorage(Scalar *begin, SizeType size, bool ownsData) :
m_size(size), m_begin(begin), m_ownsData(ownsData) {}
template<typename T>
CudaStorage<T>::CudaStorage(const CudaStorage &other) : m_size(other.m_size), m_ownsData(true) {
// Copy the data
initData(other.begin(), other.end());
}
template<typename T>
CudaStorage<T>::CudaStorage(CudaStorage &&other) noexcept :
m_begin(std::move(other.begin())), m_size(std::move(other.size())),
m_ownsData(std::move(other.m_ownsData)) {
other.m_begin = nullptr;
other.m_size = 0;
other.m_ownsData = false;
}
template<typename T>
CudaStorage<T>::CudaStorage(const std::initializer_list<T> &list) :
m_size(list.size()), m_begin(detail::cudaSafeAllocate<T>(list.size())),
m_ownsData(true) {
cudaSafeCall(cudaMemcpyAsync(
m_begin, list.begin(), sizeof(T) * m_size, cudaMemcpyHostToDevice, global::cudaStream));
}
template<typename T>
CudaStorage<T>::CudaStorage(const std::vector<T> &list) :
m_size(list.size()), m_begin(detail::cudaSafeAllocate<T>(list.size())),
m_ownsData(true) {
cudaSafeCall(cudaMemcpyAsync(
m_begin, &list[0], sizeof(T) * m_size, cudaMemcpyHostToDevice, global::cudaStream));
}
template<typename T>
template<typename ShapeType>
ShapeType CudaStorage<T>::defaultShape() {
return ShapeType({0});
}
template<typename T>
auto CudaStorage<T>::fromData(const std::initializer_list<T> &list) -> CudaStorage {
CudaStorage ret;
// ret.initData(list.begin(), list.end());
ret.initData(static_cast<const T *>(list.begin()), static_cast<const T *>(list.end()));
return ret;
}
template<typename T>
auto CudaStorage<T>::fromData(const std::vector<T> &vec) -> CudaStorage {
CudaStorage ret;
// ret.initData(vec.begin(), vec.end());
ret.initData(&vec[0], &vec[0] + vec.size());
return ret;
}
template<typename T>
auto CudaStorage<T>::operator=(const CudaStorage<T> &other) -> CudaStorage & {
if (this != &other) {
if (other.m_size == 0) return *this; // Quick return
size_t oldSize = m_size;
m_size = other.m_size;
if (oldSize != m_size) LIBRAPID_UNLIKELY {
if (m_ownsData) LIBRAPID_LIKELY {
// Reallocate
detail::cudaSafeDeallocate(m_begin);
m_begin = detail::cudaSafeAllocate<T>(m_size);
}
else
LIBRAPID_UNLIKELY {
// We do not own this data, so we cannot reallocate it
LIBRAPID_ASSERT(false, "Cannot reallocate dependent CUDA storage");
}
}
// Copy the data
cudaSafeCall(cudaMemcpyAsync(m_begin,
other.begin(),
sizeof(T) * m_size,
cudaMemcpyDeviceToDevice,
global::cudaStream));
}
return *this;
}
template<typename T>
auto CudaStorage<T>::operator=(CudaStorage &&other) noexcept -> CudaStorage & {
if (this != &other) {
m_begin = std::move(other.m_begin);
m_size = std::move(other.m_size);
m_ownsData = std::move(other.m_ownsData);
other.m_begin = nullptr;
other.m_size = 0;
other.m_ownsData = false;
}
return *this;
}
template<typename T>
CudaStorage<T>::~CudaStorage() {
// If we own the data, we can free it
if (m_ownsData) detail::cudaSafeDeallocate(m_begin);
}
template<typename T>
auto CudaStorage<T>::copy() const -> CudaStorage {
CudaStorage ret(m_size);
cudaSafeCall(cudaMemcpyAsync(
ret.begin(), m_begin, sizeof(T) * m_size, cudaMemcpyDeviceToDevice, global::cudaStream));
return ret;
}
template<typename T>
template<typename P>
void CudaStorage<T>::initData(P begin, P end) {
// Quick return in the case of empty range
if (begin == nullptr || end == nullptr || begin == end) return;
auto size = std::distance(begin, end);
m_begin = detail::cudaSafeAllocate<T>(size);
m_size = size;
m_ownsData = true;
cudaSafeCall(cudaMemcpyAsync(
m_begin, begin, sizeof(T) * size, cudaMemcpyHostToDevice, global::cudaStream));
}
template<typename T>
void CudaStorage<T>::resize(SizeType newSize) {
LIBRAPID_ASSERT(newSize > 0, "Cannot resize to a zero-size array");
if (newSize == size()) { return; }
LIBRAPID_ASSERT(m_ownsData, "Dependent CUDA storage cannot be resized");
Pointer oldBegin = m_begin;
SizeType oldSize = m_size;
// Reallocate
m_begin = detail::cudaSafeAllocate<T>(newSize);
m_size = newSize;
// Copy old data
cudaSafeCall(cudaMemcpyAsync(m_begin,
oldBegin,
sizeof(T) * std::min(oldSize, newSize),
cudaMemcpyDeviceToDevice,
global::cudaStream));
// Free old data
detail::cudaSafeDeallocate(oldBegin);
}
template<typename T>
void CudaStorage<T>::resize(SizeType newSize, int) {
LIBRAPID_ASSERT(newSize > 0, "Cannot resize to a zero-size array");
if (newSize == size()) return;
LIBRAPID_ASSERT(m_ownsData, "Dependent CUDA storage cannot be resized");
detail::cudaSafeDeallocate(m_begin);
m_begin = detail::cudaSafeAllocate<T>(newSize);
m_size = newSize;
}
template<typename T>
auto CudaStorage<T>::size() const noexcept -> SizeType {
return m_size;
}
template<typename T>
auto CudaStorage<T>::operator[](SizeType index) const -> detail::CudaRef<Scalar> {
return {m_begin, index};
}
template<typename T>
auto CudaStorage<T>::operator[](SizeType index) -> detail::CudaRef<Scalar> {
return {m_begin, index};
}
template<typename T>
auto CudaStorage<T>::data() const noexcept -> Pointer {
return m_begin;
}
template<typename T>
auto CudaStorage<T>::begin() const noexcept -> Pointer {
return m_begin;
}
template<typename T>
auto CudaStorage<T>::end() const noexcept -> Pointer {
return m_begin + m_size;
}
} // namespace librapid
# if defined(FMT_API)
// LIBRAPID_SIMPLE_IO_IMPL(typename T, librapid::detail::CudaRef<T>)
template<typename T, typename Char>
struct fmt::formatter<librapid::detail::CudaRef<T>, Char> {
private:
using Base = fmt::formatter<T, Char>;
Base m_base;
public:
template<typename ParseContext>
FMT_CONSTEXPR auto parse(ParseContext &ctx) -> const char * {
return m_base.parse(ctx);
}
template<typename FormatContext>
FMT_CONSTEXPR auto format(const librapid::detail::CudaRef<T> &val, FormatContext &ctx) const
-> decltype(ctx.out()) {
val.str(m_base, ctx);
return ctx.out();
}
};
# endif // FM_API
#else
// Trait implementations
namespace librapid::typetraits {
// Define this so things still work correctly
template<typename T>
struct IsCudaStorage : std::false_type {};
} // namespace librapid::typetraits
#endif // LIBRAPID_HAS_CUDA
#endif // LIBRAPID_ARRAY_CUDA_STORAGE_HPP