ROCm (Radeon Open Compute platform) is AMD's complete, open GPGPU ecosystem.
It supports the major deep-learning frameworks at the source level, but for scientific computing the most critical piece is the matrix-operation support. The corresponding math library in ROCm is rocBLAS, which currently provides parallel implementations up through BLAS level 3.
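For orientation, the call that does the actual work in this note is rocblas_dgemm. As a reference sketch (its shape matches how it is invoked in the example below), it computes C = alpha * op(A) * op(B) + beta * C on column-major matrices stored in device memory:

// C = alpha * op(A) * op(B) + beta * C, column-major, A/B/C are device pointers
rocblas_status rocblas_dgemm(rocblas_handle    handle,
                             rocblas_operation transa,
                             rocblas_operation transb,
                             rocblas_int       m,
                             rocblas_int       n,
                             rocblas_int       k,
                             const double*     alpha,
                             const double*     A,
                             rocblas_int       lda,
                             const double*     B,
                             rocblas_int       ldb,
                             const double*     beta,
                             double*           C,
                             rocblas_int       ldc);

In the default (host) pointer mode, alpha and beta are passed by pointer from host memory, while A, B, and C must already be device pointers allocated with hipMalloc.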
Here we take a double-precision dgemm computation as an example and walk through how it is called. The C++ example program is as follows (adapted from a sample shipped with AMD's rocBLAS):
/* ************************************************************************
 * Copyright 2016 Advanced Micro Devices, Inc.
 * Modified by Yingjin Ma from sgemm to dgemm, for testing
 * ************************************************************************ */
#include "rocblas.h"
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <limits>
#include <string>
#include <type_traits>
#include <vector>
#include <hip/hip_runtime.h> // newly added on Nov. 23, 2019
//#include "utility.hpp"

#define DIM1 1023
#define DIM2 1024
#define DIM3 1025

// Reference matrix-matrix multiply on the host: C = beta*C + alpha*A*B
template <typename T>
void mat_mat_mult(T alpha, T beta, int M, int N, int K,
                  T* A, int As1, int As2,
                  T* B, int Bs1, int Bs2,
                  T* C, int Cs1, int Cs2)
{
    for(int i1 = 0; i1 < M; i1++)
    {
        for(int i2 = 0; i2 < N; i2++)
        {
            T t = 0.0;
            for(int i3 = 0; i3 < K; i3++)
            {
                t += A[i1 * As1 + i3 * As2] * B[i3 * Bs1 + i2 * Bs2];
            }
            C[i1 * Cs1 + i2 * Cs2] = beta * C[i1 * Cs1 + i2 * Cs2] + alpha * t;
        }
    }
}

int main()
{
    rocblas_operation transa = rocblas_operation_none, transb = rocblas_operation_transpose;
    double            alpha = 1.1, beta = 0.9;

    rocblas_int m = DIM1, n = DIM2, k = DIM3;
    rocblas_int lda, ldb, ldc, size_a, size_b, size_c;
    int         a_stride_1, a_stride_2, b_stride_1, b_stride_2;

    std::cout << "dgemm example" << std::endl;

    if(transa == rocblas_operation_none)
    {
        lda        = m;
        size_a     = k * lda;
        a_stride_1 = 1;
        a_stride_2 = lda;
        std::cout << "N";
    }
    else
    {
        lda        = k;
        size_a     = m * lda;
        a_stride_1 = lda;
        a_stride_2 = 1;
        std::cout << "T";
    }
    if(transb == rocblas_operation_none)
    {
        ldb        = k;
        size_b     = n * ldb;
        b_stride_1 = 1;
        b_stride_2 = ldb;
        std::cout << "N: ";
    }
    else
    {
        ldb        = n;
        size_b     = k * ldb;
        b_stride_1 = ldb;
        b_stride_2 = 1;
        std::cout << "T: ";
    }
    ldc    = m;
    size_c = n * ldc;

    // Naming: da is in GPU (device) memory. ha is in CPU (host) memory
    std::vector<double> ha(size_a);
    std::vector<double> hb(size_b);
    std::vector<double> hc(size_c);
    std::vector<double> hc_gold(size_c);

    // initial data on host
    srand(1);
    for(int i = 0; i < size_a; ++i)
    {
        ha[i] = rand() % 17;
    }
    for(int i = 0; i < size_b; ++i)
    {
        hb[i] = rand() % 17;
    }
    for(int i = 0; i < size_c; ++i)
    {
        hc[i] = rand() % 17;
    }
    hc_gold = hc;

    // allocate memory on device
    double *da, *db, *dc;
    hipMalloc(&da, size_a * sizeof(double));
    hipMalloc(&db, size_b * sizeof(double));
    hipMalloc(&dc, size_c * sizeof(double));

    // copy matrices from host to device
    hipMemcpy(da, ha.data(), sizeof(double) * size_a, hipMemcpyHostToDevice);
    hipMemcpy(db, hb.data(), sizeof(double) * size_b, hipMemcpyHostToDevice);
    hipMemcpy(dc, hc.data(), sizeof(double) * size_c, hipMemcpyHostToDevice);

    rocblas_handle handle;
    rocblas_create_handle(&handle);

    rocblas_dgemm(handle, transa, transb, m, n, k, &alpha, da, lda, db, ldb, &beta, dc, ldc);

    // copy output from device to CPU
    hipMemcpy(hc.data(), dc, sizeof(double) * size_c, hipMemcpyDeviceToHost);

    std::cout << "m, n, k, lda, ldb, ldc = " << m << ", " << n << ", " << k << ", " << lda
              << ", " << ldb << ", " << ldc << std::endl;

    double max_relative_error = std::numeric_limits<double>::min();

    // calculate golden or correct result
    mat_mat_mult<double>(alpha, beta, m, n, k,
                         ha.data(), a_stride_1, a_stride_2,
                         hb.data(), b_stride_1, b_stride_2,
                         hc_gold.data(), 1, ldc);

    for(int i = 0; i < size_c; i++)
    {
        double relative_error = (hc_gold[i] - hc[i]) / hc_gold[i];
        relative_error        = relative_error > 0 ? relative_error : -relative_error;
        max_relative_error
            = relative_error < max_relative_error ? max_relative_error : relative_error;
    }

    double eps       = std::numeric_limits<double>::epsilon();
    double tolerance = 10;
    if(max_relative_error != max_relative_error || max_relative_error > eps * tolerance)
    {
        std::cout << "FAIL: max_relative_error = " << max_relative_error << std::endl;
    }
    else
    {
        std::cout << "PASS: max_relative_error = " << max_relative_error << std::endl;
    }

    hipFree(da);
    hipFree(db);
    hipFree(dc);
    rocblas_destroy_handle(handle);
    return EXIT_SUCCESS;
}
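Note that the example above does not check any return codes. In real code it is usually worth checking the hipError_t and rocblas_status values returned by each HIP and rocBLAS call. A minimal sketch of such wrappers (not part of AMD's original sample; it relies on the <iostream> and <cstdlib> headers already included above) could look like this:

// Minimal error-checking helpers (a sketch, not part of the original example)
#define CHECK_HIP(cmd)                                                         \
    do {                                                                       \
        hipError_t e = (cmd);                                                  \
        if(e != hipSuccess) {                                                  \
            std::cerr << "HIP error: " << hipGetErrorString(e) << std::endl;   \
            exit(EXIT_FAILURE);                                                \
        }                                                                      \
    } while(0)

#define CHECK_ROCBLAS(cmd)                                                     \
    do {                                                                       \
        rocblas_status s = (cmd);                                              \
        if(s != rocblas_status_success) {                                      \
            std::cerr << "rocBLAS error code: " << s << std::endl;             \
            exit(EXIT_FAILURE);                                                \
        }                                                                      \
    } while(0)

// Usage, e.g.:
//   CHECK_HIP(hipMalloc(&da, size_a * sizeof(double)));
//   CHECK_ROCBLAS(rocblas_dgemm(handle, transa, transb, m, n, k,
//                               &alpha, da, lda, db, ldb, &beta, dc, ldc));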
Adjust the Makefile to match your compiler location and similar settings, then run make to obtain the executable rocblas_dgemm. Running ./rocblas_dgemm produces the following output:
dgemm example
NT: m, n, k, lda, ldb, ldc = 1023, 1024, 1025, 1023, 1024, 1023
PASS: max_relative_error = 2.22026e-16
The corresponding Makefile is as follows:
ROCBLAS_INSTALL_DIR=/opt/rocm/rocblas
ROCBLAS_INCLUDE=$(ROCBLAS_INSTALL_DIR)/include
ROCBLAS_LIB_PATH=$(ROCBLAS_INSTALL_DIR)/lib
ROCBLAS_LIB=rocblas
HIP_INCLUDE=/opt/rocm/hip/include
#HPP_INCLUDE=/public/home/mayj/Quantum_Soft/rocBLAS_dev/clients/include
#CBLAS_INCLUDE=/public/home/mayj/Quantum_Soft/CBLAS/include

LDFLAGS=-L$(ROCBLAS_LIB_PATH) -l$(ROCBLAS_LIB)
LD=hipcc

CFLAGS=-I$(ROCBLAS_INCLUDE) -I$(HIP_INCLUDE) -I$(HPP_INCLUDE) -I$(CBLAS_INCLUDE)
CPP=hipcc

OBJ=rocblas_dgemm.o
EXE=rocblas_dgemm

%.o: %.cpp
	$(CPP) -c -o $@ $< $(CFLAGS)

$(EXE) : $(OBJ)
	$(LD) $(OBJ) $(LDFLAGS) -o $@

clean:
	rm -f $(EXE) $(OBJ)
Alternatively, the program can be compiled directly with the following commands:
hipcc -c -o rocblas_dgemm.o rocblas_dgemm.cpp -I/opt/rocm/rocblas/include -I/opt/rocm/hip/include/hip
hipcc rocblas_dgemm.o -L/opt/rocm/rocblas/lib -lrocblas -o rocblas_dgemm
Or:
g++ -c -o rocblas_dgemm.o rocblas_dgemm.cpp -I/opt/rocm/rocblas/include -I/opt/rocm/hip/include/hip -D__HIP_PLATFORM_HCC__
g++ rocblas_dgemm.o -L/opt/rocm/rocblas/lib -lrocblas -L/opt/rocm/lib -lhip_hcc -o rocblas_dgemm
(J_Sagat)