[Rust] CUDA/cuBLAS/cuSOLVERを使う
8 min
1531 words
Suzuki Shun
RustでCUDA/cuBLAS/cuSOLVERを使う備忘録.
ソースはここ.
CUDA
Rustのcc
crateはCUDAのコンパイルに対応している.
また, CUDAのAPI bindingであるcuda-sys
crateもあるので比較的簡単にCUDAを使用する事ができる.
まず, 適当なプロジェクトを作成する.
cargo new --bin cuda-sample
cd cuda-sample
cargo add cc --build
cargo add cuda-sys
次に, kernal.cu
を作成し, CUDAのコードを書く.
#include <cstdint>
__global__ void add_kernel(const double *x, double *y, int n) {
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < n) {
y[i] += x[i];
}
}
#include <cstdint>
#ifdef __cplusplus
extern "C" {
#endif
void cu_add(const double *x, double *y, const int32_t len) {
int threadsPerBlock = 256;
int blocksPerGrid = (len + threadsPerBlock - 1) / threadsPerBlock;
add_kernel<<<blocksPerGrid, threadsPerBlock>>>(x, y, len);
}
#ifdef __cplusplus
}
#endif
次に, build.rs
を作成する.
fn main() {
println!("cargo:rerun-if-changed=kernel.cu");
println!("cargo:rerun-if-changed=build.rs");
cc::Build::new()
.cuda(true)
.flag("-allow-unsupported-compiler")
.flag("-cudart=shared")
.flag("-gencode=arch=compute_75,code=sm_75")
.flag("-gencode=arch=compute_80,code=sm_80")
.flag("-gencode=arch=compute_86,code=sm_86")
.flag("-gencode=arch=compute_87,code=sm_87")
.file("kernel.cu")
.compile("my_kernel");
}
なお, nvccのフラグはこの記事などを参考に, 使用するGPUに合わせる.
あとは普通に外部ライブラリとしてリンクして使用できる.
use std::mem::size_of;
use cuda_sys::cudart::{
cudaMalloc,
cudaMemcpy,
cudaMemcpyKind_cudaMemcpyDeviceToHost,
cudaMemcpyKind_cudaMemcpyHostToDevice,
cudaMemset,
};
#[link(name = "my_kernel", kind = "static")]
extern "C" {
fn cu_add(x: *const f64, y: *mut f64, n: i32);
}
macro_rules! alloc {
($ty:ty, $r:expr, $c:expr) => {{
let mut v: *mut $ty = std::ptr::null_mut();
cudaMalloc(&mut v as *mut *mut $ty as _, size_of::<$ty>() * $r * $c);
cudaMemset(v as _, 0, size_of::<$ty>() * $r * $c);
(v, $r, $c)
}};
}
macro_rules! free {
($p:expr) => {{
cuda_sys::cudart::cudaFree($p.0 as _)
}};
}
fn main() {
unsafe {
let m = 2;
let n = 1024;
let one = 1.0;
let zero = 0.0;
let a_ = vec![one; m * n];
let a = alloc!(f64, m, n);
let c = alloc!(f64, m, m);
cudaMemcpy(
a.0 as _,
a_.as_ptr() as _,
m * n * size_of::<f64>(),
cudaMemcpyKind_cudaMemcpyHostToDevice,
);
cu_add(a.0, c.0, (m * m) as _);
let mut c_: Vec<f64> = vec![zero; m * m];
cudaMemcpy(
c_.as_mut_ptr() as _,
c.0 as _,
c_.len() * size_of::<f64>(),
cudaMemcpyKind_cudaMemcpyDeviceToHost,
);
println!("{:?}", c_);
free!(a);
free!(c);
}
}
cuBLAS
cuda-sys
crateはcuBLASのAPI bindingも提供している.
use std::mem::size_of;
use cuda_sys::{
cublas::{
cublasCreate_v2, cublasDestroy_v2, cublasHandle_t, cublasOperation_t,
cublasOperation_t_CUBLAS_OP_N,
},
cudart::{
cudaMalloc, cudaMemcpy, cudaMemcpyKind_cudaMemcpyDeviceToHost,
cudaMemcpyKind_cudaMemcpyHostToDevice, cudaMemset,
},
};
#[link(name = "my_kernel", kind = "static")]
extern "C" {
fn cu_add(x: *const f64, y: *mut f64, n: i32);
}
macro_rules! alloc {
($ty:ty, $r:expr, $c:expr) => {{
let mut v: *mut $ty = std::ptr::null_mut();
cudaMalloc(&mut v as *mut *mut $ty as _, size_of::<$ty>() * $r * $c);
cudaMemset(v as _, 0, size_of::<$ty>() * $r * $c);
(v, $r, $c)
}};
}
macro_rules! free {
($p:expr) => {{
cuda_sys::cudart::cudaFree($p.0 as _)
}};
}
fn mat_mul(
handle: cublasHandle_t,
transa: cublasOperation_t,
transb: cublasOperation_t,
alpha: *const f64,
a: (*mut f64, usize, usize),
b: (*mut f64, usize, usize),
beta: *const f64,
c: (*mut f64, usize, usize),
) {
unsafe {
cuda_sys::cublas::cublasDgemm_v2(
handle,
transa,
transb,
c.1 as _,
c.2 as _,
if transa == cublasOperation_t_CUBLAS_OP_N {
a.2
} else {
a.1
} as _,
alpha,
a.0,
a.1 as _,
b.0,
b.1 as _,
beta,
c.0,
c.1 as _,
);
}
}
fn main() {
unsafe {
let m = 2;
let n = 1024;
let one = 1.0;
let zero = 0.0;
let a_ = vec![one; m * n];
let b_ = vec![one; m * n];
let a = alloc!(f64, m, n);
let b = alloc!(f64, n, m);
let c = alloc!(f64, m, m);
cudaMemcpy(
a.0 as _,
a_.as_ptr() as _,
m * n * size_of::<f64>(),
cudaMemcpyKind_cudaMemcpyHostToDevice,
);
cudaMemcpy(
b.0 as _,
b_.as_ptr() as _,
m * n * size_of::<f64>(),
cudaMemcpyKind_cudaMemcpyHostToDevice,
);
let mut handle: cublasHandle_t = std::ptr::null_mut();
cublasCreate_v2(&mut handle as *mut _);
mat_mul(
handle,
cublasOperation_t_CUBLAS_OP_N,
cublasOperation_t_CUBLAS_OP_N,
&one,
a,
b,
&zero,
c,
);
cu_add(a.0, c.0, (m * m) as _);
cublasDestroy_v2(handle);
let mut c_: Vec<f64> = vec![zero; m * m];
cudaMemcpy(
c_.as_mut_ptr() as _,
c.0 as _,
c_.len() * size_of::<f64>(),
cudaMemcpyKind_cudaMemcpyDeviceToHost,
);
println!("{:?}", c_);
free!(a);
free!(b);
free!(c);
}
}
cuSOLVER
残念ながら, cuda-sys
crateはcuSOLVERのAPI bindingを提供していない.
そのため, 自分でbindingを作成する必要がある.
手作業で作るのは苦痛なので, bindgenなどを使うと良い.
また, cusolver
ライブラリをリンクする必要がある.
まず, cuda_config
crateを追加しておく.
cargo add cuda-config --build
次に, build.rs
を更新して, 以下のようにする.
use cuda_config::*;
fn main() {
println!("cargo:rerun-if-changed=kernel.cu");
println!("cargo:rerun-if-changed=build.rs");
cc::Build::new()
.cuda(true)
.flag("-allow-unsupported-compiler")
.flag("-cudart=shared")
.flag("-gencode=arch=compute_75,code=sm_75")
.flag("-gencode=arch=compute_80,code=sm_80")
.flag("-gencode=arch=compute_86,code=sm_86")
.flag("-gencode=arch=compute_87,code=sm_87")
.file("kernel.cu")
.compile("my_kernel");
if cfg!(target_os = "windows") {
println!(
"cargo:rustc-link-search=native={}",
find_cuda_windows().display()
);
} else {
for path in find_cuda() {
println!("cargo:rustc-link-search=native={}", path.display());
}
};
println!("cargo:rustc-link-lib=dylib=cusolver");
}
これで, 無事にcuSOLVERを使うことができる.
#[allow(non_camel_case_types)]
#[allow(dead_code)]
#[allow(deref_nullptr)]
mod cusolver;
use std::mem::size_of;
use cuda_sys::{
cublas::{
cublasCreate_v2, cublasDestroy_v2, cublasHandle_t, cublasOperation_t,
cublasOperation_t_CUBLAS_OP_N,
},
cudart::{
cudaMalloc, cudaMemcpy, cudaMemcpyKind_cudaMemcpyDeviceToDevice,
cudaMemcpyKind_cudaMemcpyDeviceToHost, cudaMemcpyKind_cudaMemcpyHostToDevice, cudaMemset,
},
};
use cusolver::{
cudaDataType_t::CUDA_R_64F, cusolverDnCreate, cusolverDnDestroy, cusolverDnHandle_t,
cusolverEigMode_t::CUSOLVER_EIG_MODE_VECTOR,
};
#[link(name = "my_kernel", kind = "static")]
extern "C" {
fn cu_add(x: *const f64, y: *mut f64, n: i32);
}
macro_rules! alloc {
($ty:ty, $r:expr, $c:expr) => {{
let mut v: *mut $ty = std::ptr::null_mut();
cudaMalloc(&mut v as *mut *mut $ty as _, size_of::<$ty>() * $r * $c);
cudaMemset(v as _, 0, size_of::<$ty>() * $r * $c);
(v, $r, $c)
}};
}
macro_rules! free {
($p:expr) => {{
cuda_sys::cudart::cudaFree($p.0 as _)
}};
}
fn mat_mul(
handle: cublasHandle_t,
transa: cublasOperation_t,
transb: cublasOperation_t,
alpha: *const f64,
a: (*mut f64, usize, usize),
b: (*mut f64, usize, usize),
beta: *const f64,
c: (*mut f64, usize, usize),
) {
unsafe {
cuda_sys::cublas::cublasDgemm_v2(
handle,
transa,
transb,
c.1 as _,
c.2 as _,
if transa == cublasOperation_t_CUBLAS_OP_N {
a.2
} else {
a.1
} as _,
alpha,
a.0,
a.1 as _,
b.0,
b.1 as _,
beta,
c.0,
c.1 as _,
);
}
}
unsafe fn svd(
handle: cusolverDnHandle_t,
src: (*mut f64, usize, usize),
) -> (
(*mut f64, usize, usize),
(*mut f64, usize, usize),
(*mut f64, usize, usize),
) {
let m = src.1;
let n = src.2;
let s_size = m.min(n);
let u = alloc!(f64, m, m);
let s = alloc!(f64, s_size, 1);
let vt = alloc!(f64, n, n);
let lda = m;
let ldu = m;
let ldv = n;
let mut workspace_in_bytes_on_device: u64 = 0;
let mut workspace_in_bytes_on_host: u64 = 0;
cusolver::cusolverDnXgesvdp_bufferSize(
handle,
std::ptr::null_mut(),
CUSOLVER_EIG_MODE_VECTOR,
0,
m as _,
n as _,
CUDA_R_64F,
src.0 as _,
lda as _,
CUDA_R_64F,
s.0 as _,
CUDA_R_64F,
u.0 as _,
ldu as _,
CUDA_R_64F,
vt.0 as _,
ldv as _,
CUDA_R_64F,
&mut workspace_in_bytes_on_device as _,
&mut workspace_in_bytes_on_host as _,
);
let workspace_buffer_on_device = alloc!(u8, workspace_in_bytes_on_device as usize, 1);
let mut workspace_buffer_on_host_v = vec![0u8; workspace_in_bytes_on_host as usize];
let workspace_buffer_on_host = if workspace_in_bytes_on_host > 0 {
workspace_buffer_on_host_v.as_mut_ptr()
} else {
std::ptr::null_mut()
};
let info = alloc!(i32, 1, 1);
let mut h_err_sigma = 0.;
cusolver::cusolverDnXgesvdp(
handle,
std::ptr::null_mut(),
CUSOLVER_EIG_MODE_VECTOR,
0,
m as _,
n as _,
CUDA_R_64F,
src.0 as _,
lda as _,
CUDA_R_64F,
s.0 as _,
CUDA_R_64F,
u.0 as _,
ldu as _,
CUDA_R_64F,
vt.0 as _,
ldv as _,
CUDA_R_64F,
workspace_buffer_on_device.0 as _,
workspace_in_bytes_on_device,
workspace_buffer_on_host as _,
workspace_in_bytes_on_host,
info.0 as _,
&mut h_err_sigma as _,
);
free!(info);
free!(workspace_buffer_on_device);
(u, s, vt)
}
fn main() {
unsafe {
let m = 2;
let n = 1024;
let one = 1.0;
let zero = 0.0;
let a_ = vec![one; m * n];
let b_ = vec![one; m * n];
let a = alloc!(f64, m, n);
let b = alloc!(f64, n, m);
let c = alloc!(f64, m, m);
cudaMemcpy(
a.0 as _,
a_.as_ptr() as _,
m * n * size_of::<f64>(),
cudaMemcpyKind_cudaMemcpyHostToDevice,
);
cudaMemcpy(
b.0 as _,
b_.as_ptr() as _,
m * n * size_of::<f64>(),
cudaMemcpyKind_cudaMemcpyHostToDevice,
);
let mut handle: cublasHandle_t = std::ptr::null_mut();
cublasCreate_v2(&mut handle as *mut _);
mat_mul(
handle,
cublasOperation_t_CUBLAS_OP_N,
cublasOperation_t_CUBLAS_OP_N,
&one,
a,
b,
&zero,
c,
);
cu_add(a.0, c.0, (m * m) as _);
let mut handle_s: cusolverDnHandle_t = std::ptr::null_mut();
cusolverDnCreate(&mut handle_s as *mut _);
let (u, s, vt) = svd(handle_s, c);
let sm = alloc!(f64, m, m);
cudaMemcpy(
sm.0 as _,
s.0 as _,
size_of::<f64>(),
cudaMemcpyKind_cudaMemcpyDeviceToDevice,
);
cudaMemcpy(
sm.0.add(2) as _,
s.0.add(1) as _,
size_of::<f64>(),
cudaMemcpyKind_cudaMemcpyDeviceToDevice,
);
let tmp = alloc!(f64, m, m);
mat_mul(
handle,
cublasOperation_t_CUBLAS_OP_N,
cublasOperation_t_CUBLAS_OP_N,
&one,
u,
sm,
&zero,
tmp,
);
mat_mul(
handle,
cublasOperation_t_CUBLAS_OP_N,
cublasOperation_t_CUBLAS_OP_N,
&one,
tmp,
vt,
&zero,
c,
);
cublasDestroy_v2(handle);
cusolverDnDestroy(handle_s);
let mut c_: Vec<f64> = vec![zero; m * m];
cudaMemcpy(
c_.as_mut_ptr() as _,
c.0 as _,
c_.len() * size_of::<f64>(),
cudaMemcpyKind_cudaMemcpyDeviceToHost,
);
println!("{:?}", c_);
free!(a);
free!(b);
free!(c);
free!(u);
free!(s);
free!(vt);
}
}