Environment versions:
Project layout: WORKSPACE, BUILD, main.cc, matmul.h, matmul.cu (a single package)
Test output:
WORKSPACE
Reference repositories: CUDA rules for Bazel and its examples
load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")http_archive(name = "rules_cuda",sha256 = "76aea02d6763e0ba5bdf5f981174d6ec39c19e575812cf6956329e453e091adf", # 可选strip_prefix = "rules_cuda-main",urls = ["https://github.com/bazel-contrib/rules_cuda/archive/main.zip"],
)# CUDA rules for Bazel
# https://github.com/bazel-contrib/rules_cuda
# bazel for cuda 具体用法示例
# https://github.com/bazel-contrib/rules_cuda/tree/main/examples
# https://github.com/bazel-contrib/rules_cuda/blob/main/examples/basic/BUILD.bazelload("@rules_cuda//cuda:repositories.bzl", "rules_cuda_dependencies", "rules_cuda_toolchains")rules_cuda_dependencies()rules_cuda_toolchains(register_toolchains = True)
BUILD
load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library")
load("@rules_cuda//cuda:defs.bzl", "cuda_library", "cuda_binary")cuda_library(name = "cuda_matmul_lib",srcs = ["matmul.cu"],hdrs = ["matmul.h"],
)cc_binary(name = "cuda_matmul",srcs = ["main.cc"],deps = [":cuda_matmul_lib"],
)
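With these two targets in place, the example can be built with bazel build //:cuda_matmul and executed with bazel run //:cuda_matmul (assuming the BUILD file sits at the workspace root and a local CUDA toolkit is available for the registered toolchain).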
main.cc
#include <iostream>

#include "matmul.h"

#define N 4  // matrix size (adjustable)

void printMatrix(const float* M, int size) {
    for (int i = 0; i < size; i++) {
        for (int j = 0; j < size; j++) {
            std::cout << M[i * size + j] << " ";
        }
        std::cout << std::endl;
    }
}

int main() {
    float A[N * N] = {1, 2, 3, 4,
                      5, 6, 7, 8,
                      9, 10, 11, 12,
                      13, 14, 15, 16};
    float B[N * N] = {1, 0, 0, 0,
                      0, 1, 0, 0,
                      0, 0, 1, 0,
                      0, 0, 0, 1};
    float C[N * N] = {0};

    std::cout << "Matrix A:\n";
    printMatrix(A, N);
    std::cout << "Matrix B:\n";
    printMatrix(B, N);

    // Run the CUDA matrix multiplication
    matrixMultiply(A, B, C, N);

    std::cout << "Matrix C (A * B):\n";
    printMatrix(C, N);
    return 0;
}
matmul.h
#ifndef MATMUL_H
#define MATMUL_H

void matrixMultiply(float *A, float *B, float *C, int N);

#endif  // MATMUL_H
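This single declaration is shared by main.cc (compiled as ordinary C++ via cc_binary) and matmul.cu (compiled by the CUDA toolchain); because both sides are compiled as C++, the mangled symbol for matrixMultiply matches and no extern "C" wrapper is needed.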
matmul.cu
#include <cuda_runtime.h>

#include "matmul.h"

// CUDA kernel: each thread computes one element of C
__global__ void matmulKernel(float *A, float *B, float *C, int N) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < N && col < N) {
        float sum = 0.0f;
        for (int i = 0; i < N; i++) {
            sum += A[row * N + i] * B[i * N + col];
        }
        C[row * N + col] = sum;
    }
}

// Host-side wrapper around the kernel launch
void matrixMultiply(float *A, float *B, float *C, int N) {
    float *d_A, *d_B, *d_C;
    size_t size = N * N * sizeof(float);

    // Allocate device memory
    cudaMalloc((void**)&d_A, size);
    cudaMalloc((void**)&d_B, size);
    cudaMalloc((void**)&d_C, size);

    // Copy the inputs to the GPU
    cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice);

    // Configure the CUDA thread blocks
    dim3 blockDim(16, 16);
    dim3 gridDim((N + blockDim.x - 1) / blockDim.x, (N + blockDim.y - 1) / blockDim.y);

    // Launch the kernel
    matmulKernel<<<gridDim, blockDim>>>(d_A, d_B, d_C, N);
    cudaDeviceSynchronize();

    // Copy the result back to the CPU
    cudaMemcpy(C, d_C, size, cudaMemcpyDeviceToHost);

    // Free device memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
}
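One thing the wrapper above does not do is check the return codes of the CUDA runtime calls, so a failed allocation, copy, or launch leaves C untouched with no diagnostic. Below is a minimal error-checking sketch; the CUDA_CHECK macro is a hypothetical helper, not part of the original sources.

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Hypothetical helper: abort with a message when a CUDA runtime call fails.
#define CUDA_CHECK(expr)                                                    \
    do {                                                                    \
        cudaError_t err_ = (expr);                                          \
        if (err_ != cudaSuccess) {                                          \
            std::fprintf(stderr, "CUDA error: %s at %s:%d\n",               \
                         cudaGetErrorString(err_), __FILE__, __LINE__);     \
            std::exit(EXIT_FAILURE);                                        \
        }                                                                   \
    } while (0)

// Example usage inside matrixMultiply:
//   CUDA_CHECK(cudaMalloc((void**)&d_A, size));
//   CUDA_CHECK(cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice));
//   matmulKernel<<<gridDim, blockDim>>>(d_A, d_B, d_C, N);
//   CUDA_CHECK(cudaGetLastError());       // catches launch-configuration errors
//   CUDA_CHECK(cudaDeviceSynchronize());  // catches errors during kernel execution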