ROCm全称为Radeon Open Compute,是AMD推出的一套完整的GPGPU软件生态。
其在源码层面上支持各种深度学习的框架,不过对于科学计算的话,最关键的还是矩阵操作部分。
ROCm里面对应的数学库为rocBLAS,当前已经支持到BLAS Level 3(矩阵-矩阵运算)级别的并行。
这里面我们就以双精度dgemm的计算为例,介绍一下其调用方式。
其cpp的示例程序如下(使用AMD自带的算例修改得来)
/* ************************************************************************
* Copyright 2016 Advanced Micro Devices, Inc.
* Modified by Yingjin Ma from sgemm to dgemm, for testing
* ************************************************************************ */
#include "rocblas.h"
#include <iostream>
#include <limits>
#include <stdio.h>
#include <stdlib.h>
#include <vector>
#include <hip/hip_runtime.h> // newly added on Nov. 23, 2019
//#include "utility.hpp"
// Problem sizes used by main() as m, n and k respectively. The three values
// are deliberately different from each other so that mixing up two dimensions
// or leading dimensions changes the result.
constexpr int DIM1 = 1023;
constexpr int DIM2 = 1024;
constexpr int DIM3 = 1025;
/**
 * Host reference GEMM on strided storage: C = alpha * A * B + beta * C.
 *
 * Element (i, j) of each matrix is read at base[i * s1 + j * s2], so the same
 * routine handles column-major (s1 = 1, s2 = ld) and transposed
 * (s1 = ld, s2 = 1) operands without copying.
 *
 * @param alpha    scale applied to the product A * B
 * @param beta     scale applied to the incoming C; when it is exactly zero the
 *                 old contents of C are not read, so NaN or uninitialised
 *                 values in C do not leak into the result (BLAS convention)
 * @param M        rows of A and C
 * @param N        columns of B and C
 * @param K        columns of A / rows of B; K <= 0 yields C = beta * C
 * @param A        M x K operand, strides As1 (row) and As2 (column)
 * @param B        K x N operand, strides Bs1 (row) and Bs2 (column)
 * @param C        M x N matrix updated in place, strides Cs1 and Cs2
 */
template <typename T>
void mat_mat_mult(T   alpha,
                  T   beta,
                  int M,
                  int N,
                  int K,
                  T*  A,
                  int As1,
                  int As2,
                  T*  B,
                  int Bs1,
                  int Bs2,
                  T*  C,
                  int Cs1,
                  int Cs2)
{
    for(int i1 = 0; i1 < M; i1++)
    {
        for(int i2 = 0; i2 < N; i2++)
        {
            // Dot product of row i1 of A with column i2 of B.
            T t = T(0);
            for(int i3 = 0; i3 < K; i3++)
            {
                t += A[i1 * As1 + i3 * As2] * B[i3 * Bs1 + i2 * Bs2];
            }

            T& c = C[i1 * Cs1 + i2 * Cs2];
            // Exact comparison is intentional: only a literal zero beta means
            // "overwrite C", any other value scales the existing entry.
            c = (beta == T(0)) ? alpha * t : beta * c + alpha * t;
        }
    }
}
int main()
{
rocblas_operation transa = rocblas_operation_none, transb = rocblas_operation_transpose;
double alpha = 1.1, beta = 0.9;
rocblas_int m = DIM1, n = DIM2, k = DIM3;
rocblas_int lda, ldb, ldc, size_a, size_b, size_c;
int a_stride_1, a_stride_2, b_stride_1, b_stride_2;
std::cout << "dgemm example" << std::endl;
if(transa == rocblas_operation_none)
{
lda = m;
size_a = k * lda;
a_stride_1 = 1;
a_stride_2 = lda;
std::cout << "N";
}
else
{
lda = k;
size_a = m * lda;
a_stride_1 = lda;
a_stride_2 = 1;
std::cout << "T";
}
if(transb == rocblas_operation_none)
{
ldb = k;
size_b = n * ldb;
b_stride_1 = 1;
b_stride_2 = ldb;
std::cout << "N: ";
}
else
{
ldb = n;
size_b = k * ldb;
b_stride_1 = ldb;
b_stride_2 = 1;
std::cout << "T: ";
}
ldc = m;
size_c = n * ldc;
// Naming: da is in GPU (device) memory. ha is in CPU (host) memory
std::vector ha(size_a);
std::vector hb(size_b);
std::vector hc(size_c);
std::vector hc_gold(size_c);
// initial data on host
srand(1);
for(int i = 0; i < size_a; ++i)
{
ha[i] = rand() % 17;
}
for(int i = 0; i < size_b; ++i)
{
hb[i] = rand() % 17;
}
for(int i = 0; i < size_c; ++i)
{
hc[i] = rand() % 17;
}
hc_gold = hc;
// allocate memory on device
double *da, *db, *dc;
hipMalloc(&da, size_a * sizeof(double));
hipMalloc(&db, size_b * sizeof(double));
hipMalloc(&dc, size_c * sizeof(double));
// copy matrices from host to device
hipMemcpy(da, ha.data(), sizeof(double) * size_a, hipMemcpyHostToDevice);
hipMemcpy(db, hb.data(), sizeof(double) * size_b, hipMemcpyHostToDevice);
hipMemcpy(dc, hc.data(), sizeof(double) * size_c, hipMemcpyHostToDevice);
rocblas_handle handle;
rocblas_create_handle(&handle);
rocblas_dgemm(handle, transa, transb, m, n, k, &alpha, da, lda, db, ldb, &beta, dc, ldc);
// copy output from device to CPU
hipMemcpy(hc.data(), dc, sizeof(double) * size_c, hipMemcpyDeviceToHost);
std::cout << "m, n, k, lda, ldb, ldc = " << m << ", " << n << ", " << k << ", " << lda << ", "
<< ldb << ", " << ldc << std::endl;
double max_relative_error = std::numeric_limits::min();
// calculate golden or correct result
mat_mat_mult(alpha,
beta,
m,
n,
k,
ha.data(),
a_stride_1,
a_stride_2,
hb.data(),
b_stride_1,
b_stride_2,
hc_gold.data(),
1,
ldc);
for(int i = 0; i < size_c; i++)
{
double relative_error = (hc_gold[i] - hc[i]) / hc_gold[i];
relative_error = relative_error > 0 ? relative_error : -relative_error;
max_relative_error
= relative_error < max_relative_error ? max_relative_error : relative_error;
}
double eps = std::numeric_limits::epsilon();
double tolerance = 10;
if(max_relative_error != max_relative_error || max_relative_error > eps * tolerance)
{
std::cout << "FAIL: max_relative_error = " << max_relative_error << std::endl;
}
else
{
std::cout << "PASS: max_relative_error = " << max_relative_error << std::endl;
}
hipFree(da);
hipFree(db);
hipFree(dc);
rocblas_destroy_handle(handle);
return EXIT_SUCCESS;
}
按照本机编译器和ROCm库的安装位置修改下面的Makefile,执行make之后就可以得到可执行程序rocblas_dgemm;运行./rocblas_dgemm之后,可以看到以下输出:
dgemm example
NT: m, n, k, lda, ldb, ldc = 1023, 1024, 1025, 1023, 1024, 1023
PASS: max_relative_error = 2.22026e-16
对应的Makefile如下:
# Builds rocblas_dgemm from rocblas_dgemm.cpp with hipcc and links it against
# rocBLAS. Adjust the installation paths below to the local ROCm layout.
ROCBLAS_INSTALL_DIR=/opt/rocm/rocblas
ROCBLAS_INCLUDE=$(ROCBLAS_INSTALL_DIR)/include
ROCBLAS_LIB_PATH=$(ROCBLAS_INSTALL_DIR)/lib
ROCBLAS_LIB=rocblas
HIP_INCLUDE=/opt/rocm/hip/include
# Optional extra include directories; uncomment to add them to CFLAGS.
#HPP_INCLUDE=/public/home/mayj/Quantum_Soft/rocBLAS_dev/clients/include
#CBLAS_INCLUDE=/public/home/mayj/Quantum_Soft/CBLAS/include
LDFLAGS=-L$(ROCBLAS_LIB_PATH) -l$(ROCBLAS_LIB)
LD=hipcc
# addprefix emits nothing for an unset variable, so no dangling "-I" is passed
# to the compiler while the optional directories above stay commented out.
CFLAGS=-I$(ROCBLAS_INCLUDE) -I$(HIP_INCLUDE) $(addprefix -I,$(HPP_INCLUDE) $(CBLAS_INCLUDE))
CPP=hipcc
OBJ=rocblas_dgemm.o
EXE=rocblas_dgemm

# Recipe lines must start with a TAB character.
%.o: %.cpp
	$(CPP) -c -o $@ $< $(CFLAGS)

$(EXE) : $(OBJ)
	$(LD) $(OBJ) $(LDFLAGS) -o $@

# clean produces no file, so it must run even if a file named "clean" exists.
.PHONY: clean
clean:
	rm -f $(EXE) $(OBJ)
或者也可以直接使用以下的编译指令
# Step 1: compile rocblas_dgemm.cpp into an object file with hipcc.
hipcc -c -o rocblas_dgemm.o rocblas_dgemm.cpp -I/opt/rocm/rocblas/include -I/opt/rocm/hip/include/hip
# Step 2: link the object file against librocblas to produce the executable.
hipcc rocblas_dgemm.o -L/opt/rocm/rocblas/lib -lrocblas -o rocblas_dgemm
或者
# Step 1: compile with plain g++; __HIP_PLATFORM_HCC__ is defined by hand here
# (presumably hipcc defines it automatically to select the AMD backend - confirm).
g++ -c -o rocblas_dgemm.o rocblas_dgemm.cpp -I/opt/rocm/rocblas/include -I/opt/rocm/hip/include/hip -D__HIP_PLATFORM_HCC__
# Step 2: link against librocblas plus libhip_hcc from /opt/rocm/lib
# (NOTE(review): looks like the HIP runtime library that hipcc would add itself - confirm).
g++ rocblas_dgemm.o -L/opt/rocm/rocblas/lib -lrocblas -L/opt/rocm/lib -lhip_hcc -o rocblas_dgemm
(J_Sagat)