怎么在pytorch中使用cuda扩展 - 行业资讯 - 肥雀云

　　介绍

怎么在 PyTorch 中使用 CUDA 扩展？针对这个问题，这篇文章详细介绍了相应的分析和解答，希望可以帮助更多想解决这个问题的小伙伴找到更简单易行的方法。

<strong>第一步：CUDA 编程的源文件和头文件</strong>

// ===== mathutil_cuda_kernel.cu =====
// Standard headers first; curand.h is CUDA-specific.
#include <curand.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <float.h>
#include "mathutil_cuda_kernel.h"

// Compute the launch grid needed to cover n elements with BLOCK threads
// per block.
//
// The grid is 1-D (k x 1 x 1) while the block count k fits in 65535.
// Beyond that it is folded into a roughly square 2-D grid so that neither
// dimension has to hold the whole block count.
dim3 cuda_gridsize(int n)
{
    int k = (n - 1) / BLOCK + 1;  // ceil(n / BLOCK)
    int x = k;
    int y = 1;
    if (x > 65535) {
        x = (int)ceil(sqrt((double)k));
        y = (n - 1) / (x * BLOCK) + 1;
    }
    dim3 d(x, y, 1);
    return d;
}

// Device kernel: each thread updates exactly one element of `a`.
//
// Expects a 1-D block and the (possibly 2-D) grid produced by
// cuda_gridsize(size). `a` is indexed as an x-by-y row-major array and
// `b` as a vector of length y; b[k] is added to every a[j][k].
// Threads whose flat index falls past `size` do nothing.
__global__ void broadcast_sum_kernel(float *a, float *b, int x, int y, int size)
{
    // Flatten the 2-D grid + 1-D block into a single global thread index.
    int i = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
    if (i >= size) return;

    int j = i % x;        // row index, in [0, x)
    int k = (i / x) % y;  // column index, in [0, y)
    a[IDX2D(j, k, y)] += b[k];
}

// Host-side wrapper with C linkage (see the header); this is the entry
// point called from the C interface code.
//
// Launches broadcast_sum_kernel on `stream` over x * y elements and
// terminates the process if the launch itself reports an error.
void broadcast_sum_cuda(float *a, float *b, int x, int y, cudaStream_t stream)
{
    int size = x * y;
    cudaError_t err;

    // Launch the kernel defined above: BLOCK threads per block, no dynamic
    // shared memory, on the caller's stream.
    broadcast_sum_kernel<<<cuda_gridsize(size), BLOCK, 0, stream>>>(a, b, x, y, size);

    // Kernel launches return no status directly; query the launch error.
    err = cudaGetLastError();
    if (cudaSuccess != err)
    {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(1);
    }
}

// ===== mathutil_cuda_kernel.h =====
#ifndef _MATHUTIL_CUDA_KERNEL
#define _MATHUTIL_CUDA_KERNEL

// Row-major flattening helpers. Arguments are parenthesized so that
// expression arguments (e.g. i + 1) expand correctly.
#define IDX2D(i, j, dj) ((dj) * (i) + (j))
#define IDX3D(i, j, k, dj, dk) (IDX2D(IDX2D(i, j, dj), k, dk))

#define BLOCK 512        // threads per block used for every launch
#define MAX_STREAMS 512

#ifdef __cplusplus
extern "C" {
#endif

void broadcast_sum_cuda(float *a, float *b, int x, int y, cudaStream_t stream);

#ifdef __cplusplus
}
#endif

#endif

<strong>第二步：C 编程的源文件和头文件（接口函数）</strong>

//, mathutil_cuda.c//,THC是pytorch底层GPU库　　# include & lt; THC/THC.h> 　　# include “mathutil_cuda_kernel.h" 　　　　extern THCState *状态; 　　　　int broadcast_sum (THCudaTensor * a_tensor, THCudaTensor * b_tensor, int x,, int y) 　　{ 　　float 才能;* a =, THCudaTensor_data(国家,a_tensor); 　　float 才能;* b =, THCudaTensor_data(国家,b_tensor); 　　cudaStream_t 才能;stream =, THCState_getCurrentStream(状态);//才能,这里调用之前在cuda中编写的接口函数　　broadcast_sum_cuda才能(a, b,, x,, y,,流); 　　　　return 才能;1; 　　} int broadcast_sum (THCudaTensor * a_tensor, THCudaTensor * b_tensor, int x,, int y);

<strong>第三步：编译，先编译 CUDA 模块，再编译接口函数模块（不能放在一起同时编译）</strong>

nvcc -c -o mathutil_cuda_kernel.cu.o mathutil_cuda_kernel.cu -x cu -Xcompiler -fPIC 拱=sm_52 import 操作系统　　import 火炬　　得到torch.utils.ffi import create_extension 　　　　时间=this_file os.path.dirname (__file__) 　　　　时间=sources [] 　　时间=headers [] 　　时间=defines [] 　　with_cuda =False 　　　　if torch.cuda.is_available (): 　　打印才能(& # 39;Including CUDA 代码# 39;公司) 　　sources 才能+=,(& # 39;src/mathutil_cuda.c& # 39;】　　headers 才能+=,(& # 39;src/mathutil_cuda.h& # 39;】　　defines 才能+=,((& # 39;WITH_CUDA& # 39;,,没有一个)) 　　with_cuda 才能=,真的　　　　时间=this_file os.path.dirname (os.path.realpath (__file__)) 　　　　时间=extra_objects [& # 39; src/mathutil_cuda_kernel.cu.o& # 39;],, #,这里是编译好后的. o文件位置　　时间=extra_objects [os.path.join (this_file,帧),for fname 拷贝extra_objects] 　　　　　　时间=ffi create_extension ( 　　& # 39;才能_ext.cuda_util& # 39; 　　头=头,才能=消息来源,才能　　define_macros=定义,才能　　relative_to=__file__,才能　　with_cuda=with_cuda,才能　　extra_objects=extra_objects才能　　) 　　　　null 　　null 　　null 　　null 　　null 　　null 　　null 　　null 　　null 　　null 　　null 　　null