Program Listing for File cudaStorage.hpp#
↰ Return to documentation for file (librapid/include/librapid/cuda/cudaStorage.hpp)
#ifndef LIBRAPID_ARRAY_CUDA_STORAGE_HPP
#define LIBRAPID_ARRAY_CUDA_STORAGE_HPP
#if defined(LIBRAPID_HAS_CUDA)
/*
* Implement a storage class for CUDA data structures. It will expose
* the same functions as the `librapid::Storage` class, but the underlying
* memory buffer will be allocated on the GPU.
*/
namespace librapid {
namespace typetraits {
template<typename Scalar_>
struct TypeInfo<CudaStorage<Scalar_>> {
static constexpr bool isLibRapidType = true;
using Scalar = Scalar_;
using Backend = backend::CUDA;
};
template<typename T>
struct IsCudaStorage : std::false_type {};
template<typename Scalar>
struct IsCudaStorage<CudaStorage<Scalar>> : std::true_type {};
LIBRAPID_DEFINE_AS_TYPE(typename Scalar_, CudaStorage<Scalar_>);
} // namespace typetraits
namespace detail {
template<typename T>
T *__restrict cudaSafeAllocate(size_t size);
template<typename T>
void cudaSafeDeallocate(T *__restrict data);
template<typename T>
std::shared_ptr<T> cudaSharedPtrAllocate(size_t size);
# define CUDA_REF_OPERATOR(OP) \
template<typename LHS, typename RHS> \
auto operator OP(const CudaRef<LHS> &lhs, const RHS &rhs) { \
return lhs.get() OP rhs; \
} \
\
template<typename LHS, typename RHS> \
auto operator OP(const LHS &lhs, const CudaRef<RHS> &rhs) { \
return lhs OP rhs.get(); \
} \
\
template<typename LHS, typename RHS> \
auto operator OP(const CudaRef<LHS> &lhs, const CudaRef<RHS> &rhs) { \
return lhs.get() OP rhs.get(); \
} \
\
template<typename LHS, typename RHS> \
auto operator OP##=(CudaRef<LHS> &lhs, const RHS &rhs) { \
lhs = lhs.get() OP rhs; \
} \
\
template<typename LHS, typename RHS> \
auto operator OP##=(CudaRef<LHS> &lhs, const CudaRef<RHS> &rhs) { \
lhs = lhs.get() OP rhs.get(); \
}
# define CUDA_REF_OPERATOR_NO_ASSIGN(OP) \
template<typename LHS, typename RHS> \
auto operator OP(const CudaRef<LHS> &lhs, const RHS &rhs) { \
return lhs.get() OP rhs; \
} \
\
template<typename LHS, typename RHS> \
auto operator OP(const LHS &lhs, const CudaRef<RHS> &rhs) { \
return lhs OP rhs.get(); \
} \
\
template<typename LHS, typename RHS> \
auto operator OP(const CudaRef<LHS> &lhs, const CudaRef<RHS> &rhs) { \
return lhs.get() OP rhs.get(); \
}
template<typename T>
class CudaRef {
public:
using PtrType = std::shared_ptr<T>;
CudaRef(const PtrType &ptr, size_t offset) : m_ptr(ptr), m_offset(offset) {}
LIBRAPID_ALWAYS_INLINE CudaRef &operator=(const T &val) {
cudaSafeCall(cudaMemcpyAsync(m_ptr.get() + m_offset,
&val,
sizeof(T),
cudaMemcpyHostToDevice,
global::cudaStream));
return *this;
}
LIBRAPID_NODISCARD LIBRAPID_ALWAYS_INLINE T get() const {
T tmp;
cudaSafeCall(cudaMemcpyAsync(&tmp,
m_ptr.get() + m_offset,
sizeof(T),
cudaMemcpyDeviceToHost,
global::cudaStream));
return tmp;
}
template<typename CAST>
LIBRAPID_NODISCARD LIBRAPID_ALWAYS_INLINE operator CAST() const {
return static_cast<CAST>(get());
}
LIBRAPID_NODISCARD std::string str(const std::string &format = "{}") const {
return fmt::format(format, get());
}
private:
std::shared_ptr<T> m_ptr;
size_t m_offset;
};
CUDA_REF_OPERATOR(+)
CUDA_REF_OPERATOR(-)
CUDA_REF_OPERATOR(*)
CUDA_REF_OPERATOR(/)
CUDA_REF_OPERATOR(%)
CUDA_REF_OPERATOR(^)
CUDA_REF_OPERATOR(&)
CUDA_REF_OPERATOR(|)
CUDA_REF_OPERATOR(<<)
CUDA_REF_OPERATOR(>>)
CUDA_REF_OPERATOR_NO_ASSIGN(==)
CUDA_REF_OPERATOR_NO_ASSIGN(!=)
CUDA_REF_OPERATOR_NO_ASSIGN(<)
CUDA_REF_OPERATOR_NO_ASSIGN(>)
CUDA_REF_OPERATOR_NO_ASSIGN(<=)
CUDA_REF_OPERATOR_NO_ASSIGN(>=)
} // namespace detail
template<typename Scalar_>
class CudaStorage {
public:
using Scalar = Scalar_;
using Pointer = std::shared_ptr<Scalar>; // Scalar *__restrict;
using ConstPointer = const std::shared_ptr<Scalar>; // const Scalar *__restrict;
using Reference = Scalar &;
using ConstReference = const Scalar &;
using DifferenceType = std::ptrdiff_t;
using SizeType = std::size_t;
CudaStorage() = default;
LIBRAPID_ALWAYS_INLINE explicit CudaStorage(SizeType size);
LIBRAPID_ALWAYS_INLINE CudaStorage(SizeType size, ConstReference value);
LIBRAPID_ALWAYS_INLINE CudaStorage(Scalar *begin, SizeType size, bool independent);
LIBRAPID_ALWAYS_INLINE CudaStorage(const CudaStorage &other);
LIBRAPID_ALWAYS_INLINE CudaStorage(CudaStorage &&other) noexcept;
LIBRAPID_ALWAYS_INLINE CudaStorage(const std::initializer_list<Scalar> &list);
LIBRAPID_ALWAYS_INLINE explicit CudaStorage(const std::vector<Scalar> &vec);
template<typename ShapeType>
static ShapeType defaultShape();
template<typename V>
static CudaStorage fromData(const std::initializer_list<V> &vec);
template<typename V>
static CudaStorage fromData(const std::vector<V> &vec);
LIBRAPID_ALWAYS_INLINE CudaStorage &operator=(const CudaStorage &other);
LIBRAPID_ALWAYS_INLINE CudaStorage &operator=(CudaStorage &&other) noexcept;
~CudaStorage();
void set(const CudaStorage &other);
CudaStorage copy() const;
LIBRAPID_ALWAYS_INLINE void resize(SizeType newSize);
LIBRAPID_ALWAYS_INLINE void resize(SizeType newSize, int);
LIBRAPID_NODISCARD LIBRAPID_ALWAYS_INLINE SizeType size() const noexcept;
LIBRAPID_NODISCARD LIBRAPID_ALWAYS_INLINE detail::CudaRef<Scalar>
operator[](SizeType index) const;
LIBRAPID_NODISCARD LIBRAPID_ALWAYS_INLINE detail::CudaRef<Scalar>
operator[](SizeType index);
LIBRAPID_NODISCARD LIBRAPID_ALWAYS_INLINE Pointer data() const noexcept;
LIBRAPID_NODISCARD LIBRAPID_ALWAYS_INLINE Pointer begin() const noexcept;
LIBRAPID_NODISCARD LIBRAPID_ALWAYS_INLINE Pointer end() const noexcept;
private:
// Copy data from \p begin to \p end into this Storage object
template<typename P>
LIBRAPID_ALWAYS_INLINE void initData(P begin, P end);
LIBRAPID_ALWAYS_INLINE void resizeImpl(SizeType newSize, int);
LIBRAPID_ALWAYS_INLINE void resizeImpl(SizeType newSize);
Pointer m_begin = nullptr;
size_t m_size;
bool m_ownsData = true;
};
namespace detail {
template<typename T>
LIBRAPID_NODISCARD LIBRAPID_ALWAYS_INLINE T *__restrict cudaSafeAllocate(size_t size) {
static_assert(typetraits::TriviallyDefaultConstructible<T>::value,
"Data type must be trivially constructable for use with CUDA");
T *result;
// Round size up to nearest multiple of 32
size = (size + size_t(31)) & ~size_t(31);
cudaSafeCall(cudaMallocAsync(&result, sizeof(T) * size, global::cudaStream));
return result;
}
template<typename T>
LIBRAPID_ALWAYS_INLINE void cudaSafeDeallocate(T *__restrict data) {
static_assert(std::is_trivially_destructible_v<T>,
"Data type must be trivially constructable for use with CUDA");
cudaSafeCall(cudaFreeAsync(data, global::cudaStream));
}
template<typename T>
std::shared_ptr<T> cudaSharedPtrAllocate(size_t size) {
return std::shared_ptr<T>(cudaSafeAllocate<T>(size), cudaSafeDeallocate<T>);
}
template<typename T>
std::shared_ptr<T> safePointerCopyCuda(T *ptr, bool ownsData = true) {
using RawPointer = T *;
using Pointer = std::shared_ptr<T>;
if (ownsData) {
return Pointer(ptr, cudaSafeDeallocate<T>);
} else {
return Pointer(ptr, [](RawPointer) {});
}
}
template<typename T>
std::shared_ptr<T> safePointerCopyCuda(std::shared_ptr<T> ptr, bool ownsData = true) {
using RawPointer = T *;
using Pointer = std::shared_ptr<T>;
if (ownsData) {
return Pointer(ptr.get(), cudaSafeDeallocate<T>);
} else {
return Pointer(ptr.get(), [](RawPointer) {});
}
}
} // namespace detail
template<typename T>
CudaStorage<T>::CudaStorage(SizeType size) :
m_size(size), m_begin(detail::cudaSharedPtrAllocate<T>(size)), m_ownsData(true) {}
template<typename T>
CudaStorage<T>::CudaStorage(SizeType size, ConstReference value) :
m_size(size), m_begin(detail::cudaSharedPtrAllocate<T>(size)), m_ownsData(true) {
// Fill the data with "value"
cuda::runKernel<T, T>("fill", "fillArray", size, size, m_begin, value);
}
template<typename T>
CudaStorage<T>::CudaStorage(Scalar *begin, SizeType size, bool ownsData) :
m_size(size), m_begin(detail::safePointerCopyCuda(begin, ownsData)),
m_ownsData(ownsData) {}
template<typename T>
CudaStorage<T>::CudaStorage(const CudaStorage &other) :
m_size(other.m_size), m_begin(other.m_begin), m_ownsData(other.m_ownsData) {}
template<typename T>
CudaStorage<T>::CudaStorage(CudaStorage &&other) noexcept :
m_begin(other.m_begin), m_size(other.m_size), m_ownsData(other.m_ownsData) {
other.m_begin = nullptr;
other.m_size = 0;
other.m_ownsData = false;
}
template<typename T>
CudaStorage<T>::CudaStorage(const std::initializer_list<T> &list) :
m_size(list.size()), m_begin(detail::cudaSharedPtrAllocate<T>(list.size())),
m_ownsData(true) {
cudaSafeCall(cudaMemcpyAsync(m_begin.get(),
list.begin(),
sizeof(T) * m_size,
cudaMemcpyHostToDevice,
global::cudaStream));
}
template<typename T>
CudaStorage<T>::CudaStorage(const std::vector<T> &list) :
m_size(list.size()), m_begin(detail::cudaSharedPtrAllocate<T>(list.size())),
m_ownsData(true) {
cudaSafeCall(cudaMemcpyAsync(m_begin.get(),
list.begin(),
sizeof(T) * m_size,
cudaMemcpyHostToDevice,
global::cudaStream));
}
template<typename T>
template<typename ShapeType>
ShapeType CudaStorage<T>::defaultShape() {
return ShapeType({0});
}
template<typename T>
template<typename V>
auto CudaStorage<T>::fromData(const std::initializer_list<V> &list) -> CudaStorage {
CudaStorage ret;
ret.initData(list.begin(), list.end());
return ret;
}
template<typename T>
template<typename V>
auto CudaStorage<T>::fromData(const std::vector<V> &vec) -> CudaStorage {
CudaStorage ret;
ret.initData(vec.begin(), vec.end());
return ret;
}
template<typename T>
auto CudaStorage<T>::operator=(const CudaStorage<T> &other) -> CudaStorage & {
if (this != &other) {
if (m_ownsData) {
// If we own the data already, we can just copy the pointer since we know it won't
// affect anything else. The shared pointer deals with the reference counting, so
// we don't need to worry about other arrays that might be using the same data.
m_begin = other.m_begin;
m_size = other.m_size;
} else {
LIBRAPID_ASSERT(m_size == other.m_size,
"Cannot copy storage with {} elements to dependent storage with "
"{} elements",
other.m_size,
m_size);
cudaSafeCall(cudaMemcpyAsync(m_begin.get(),
other.begin().get(),
sizeof(T) * m_size,
cudaMemcpyDeviceToDevice,
global::cudaStream));
}
}
return *this;
}
template<typename T>
auto CudaStorage<T>::operator=(CudaStorage &&other) noexcept -> CudaStorage & {
if (this != &other) {
if (m_ownsData) {
std::swap(m_begin, other.m_begin);
std::swap(m_size, other.m_size);
std::swap(m_ownsData, other.m_ownsData);
} else {
LIBRAPID_ASSERT(
size() == other.size(),
"Mismatched storage sizes. Cannot assign CUDA storage with {} elements to "
"dependent CUDA storage with {} elements",
other.size(),
size());
cudaSafeCall(cudaMemcpyAsync(m_begin.get(),
other.begin().get(),
sizeof(T) * m_size,
cudaMemcpyDeviceToDevice,
global::cudaStream));
}
}
return *this;
}
template<typename T>
CudaStorage<T>::~CudaStorage() {
// Data is freed automatically by the shared_ptr. A custom deleter is used to ensure that
// nothing happens if the storage is dependent.
}
template<typename T>
void CudaStorage<T>::set(const CudaStorage &other) {
m_begin = other.m_begin;
m_size = other.m_size;
m_ownsData = other.m_ownsData;
}
template<typename T>
auto CudaStorage<T>::copy() const -> CudaStorage {
CudaStorage ret(m_size);
cudaSafeCall(cudaMemcpyAsync(ret.begin().get(),
m_begin.get(),
sizeof(T) * m_size,
cudaMemcpyDeviceToDevice,
global::cudaStream));
return ret;
}
template<typename T>
template<typename P>
void CudaStorage<T>::initData(P begin, P end) {
auto size = std::distance(begin, end);
m_begin = detail::cudaSharedPtrAllocate<T>(size);
m_size = size;
auto tmpBegin = [begin]() {
if constexpr (std::is_pointer_v<P>)
return begin;
else
return &(*begin);
}();
cudaSafeCall(cudaMemcpyAsync(
m_begin.get(), tmpBegin, sizeof(T) * size, cudaMemcpyDefault, global::cudaStream));
}
template<typename T>
void CudaStorage<T>::resize(SizeType newSize) {
resizeImpl(newSize);
}
template<typename T>
void CudaStorage<T>::resize(SizeType newSize, int) {
resizeImpl(newSize, 0);
}
template<typename T>
void CudaStorage<T>::resizeImpl(SizeType newSize) {
if (newSize == size()) { return; }
LIBRAPID_ASSERT(m_ownsData, "Dependent CUDA storage cannot be resized");
Pointer oldBegin = m_begin;
SizeType oldSize = m_size;
// Reallocate
m_begin = detail::cudaSharedPtrAllocate<T>(newSize);
m_size = newSize;
// Copy old data
cudaSafeCall(cudaMemcpyAsync(m_begin.get(),
oldBegin.get(),
sizeof(T) * std::min(oldSize, newSize),
cudaMemcpyDeviceToDevice,
global::cudaStream));
m_size = newSize;
}
template<typename T>
void CudaStorage<T>::resizeImpl(SizeType newSize, int) {
if (newSize == size()) return;
LIBRAPID_ASSERT(m_ownsData, "Dependent CUDA storage cannot be resized");
m_begin = detail::cudaSharedPtrAllocate<T>(newSize);
m_size = newSize;
}
template<typename T>
auto CudaStorage<T>::size() const noexcept -> SizeType {
return m_size;
}
template<typename T>
auto CudaStorage<T>::operator[](SizeType index) const -> detail::CudaRef<Scalar> {
return {m_begin, index};
}
template<typename T>
auto CudaStorage<T>::operator[](SizeType index) -> detail::CudaRef<Scalar> {
return {m_begin, index};
}
template<typename T>
auto CudaStorage<T>::data() const noexcept -> Pointer {
return m_begin;
}
template<typename T>
auto CudaStorage<T>::begin() const noexcept -> Pointer {
return m_begin;
}
template<typename T>
auto CudaStorage<T>::end() const noexcept -> Pointer {
return m_begin + m_size;
}
} // namespace librapid
# if defined(FMT_API)
LIBRAPID_SIMPLE_IO_IMPL(typename T, librapid::detail::CudaRef<T>)
# endif // FM_API
#else
// Trait implementations
namespace librapid::typetraits {
// Define this so things still work correctly
template<typename T>
struct IsCudaStorage : std::false_type {};
} // namespace librapid::typetraits
#endif // LIBRAPID_HAS_CUDA
#endif // LIBRAPID_ARRAY_CUDA_STORAGE_HPP