图形处理器通用计算(General-Purpose Computing on Graphics Processing Units,简称GPGPU)是一种利用图形处理单元(GPU)执行重复性计算的技术。GPU拥有大量的处理单元,非常适合执行并行计算任务。本文将通过一个简单的三角计算示例,展示如何在GPU上执行计算。此外,还提供了附加代码,展示如何使用GPGPU计算矩阵的平方(即矩阵与自身相乘),通过生成n行n列的GPU线程来实现。选择这个示例是因为各个输出元素可以相互独立地计算。
GPGPU技术已经存在了一年多:NVIDIA推出了CUDA,AMD推出了Close to Metal(CTM)和AMD Stream,许多爱好者也尝试过使用DirectX 9像素着色器来实现GPGPU。
附带的代码是在Windows 7 RC上使用VS2010 Beta 1和2009年8月版DirectX SDK编译的。这段代码无法在Windows XP上运行,因为DirectX 11不支持Windows XP。源代码的某些部分取自2009年8月版DirectX SDK的示例,并按本程序的需要做了修改。
程序的起点是Start(void*)。程序分为以下几个子部分:
使用D3D_DRIVER_TYPE_REFERENCE进行仿真,使用D3D_DRIVER_TYPE_HARDWARE在GPU上运行代码(这需要硬件支持)。
C++
// Create the Direct3D 11 device and its immediate context.
D3D11CreateDevice(
    NULL,
    D3D_DRIVER_TYPE_REFERENCE,   // emulation; use D3D_DRIVER_TYPE_HARDWARE to run on the GPU
    NULL,
    D3D11_CREATE_DEVICE_SINGLETHREADED | D3D11_CREATE_DEVICE_DEBUG,
    NULL, 0,
    D3D11_SDK_VERSION,
    &pDeviceOut, &flOut, &pContextOut );
程序员必须将缓冲区加载到GPU进行处理,这是比较困难的部分。附带的源代码将提供更多信息:
C++
// Creates a structured buffer on the device, optionally filled with initial data.
//
// The buffer is created with both the unordered-access and shader-resource
// bind flags, so it can back either kind of view.
//
// Parameters:
//   pDevice      - device used to create the buffer.
//   uElementSize - size in bytes of one element; also used as the structure stride.
//   uCount       - number of elements in the buffer.
//   pInitData    - optional initial contents; when NULL the buffer is created
//                  without initial data.
//   ppBufOut     - receives the created buffer; set to NULL before anything else.
//
// Returns E_INVALIDARG if uElementSize * uCount does not fit in a UINT,
// otherwise the HRESULT from ID3D11Device::CreateBuffer.
HRESULT CreateStructuredBufferOnGPU( ID3D11Device* pDevice, UINT uElementSize, UINT uCount, VOID* pInitData, ID3D11Buffer** ppBufOut )
{
    *ppBufOut = NULL;

    // ByteWidth is a 32-bit field: refuse sizes whose product would wrap around
    // and silently yield a buffer smaller than the caller asked for.
    if ( uElementSize != 0 && uCount > ( ~0u / uElementSize ) )
        return E_INVALIDARG;

    D3D11_BUFFER_DESC desc;
    ZeroMemory( &desc, sizeof(desc) );
    desc.BindFlags = D3D11_BIND_UNORDERED_ACCESS | D3D11_BIND_SHADER_RESOURCE;
    desc.ByteWidth = uElementSize * uCount;
    desc.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_STRUCTURED;
    desc.StructureByteStride = uElementSize;

    if ( pInitData == NULL )
        return pDevice->CreateBuffer( &desc, NULL, ppBufOut );

    // Zero the whole struct so the pitch members are not left as stack garbage.
    D3D11_SUBRESOURCE_DATA InitData;
    ZeroMemory( &InitData, sizeof(InitData) );
    InitData.pSysMem = pInitData;
    return pDevice->CreateBuffer( &desc, &InitData, ppBufOut );
}
// Creates a shader resource view over an existing raw or structured buffer.
//
// The view starts at element 0. For a raw buffer the element count is the
// byte width divided by 4; for a structured buffer it is the byte width
// divided by the structure stride. The raw flag is tested first, so it wins
// if both flags are set.
//
// Returns E_INVALIDARG when the buffer is neither raw nor structured,
// otherwise the HRESULT from ID3D11Device::CreateShaderResourceView.
HRESULT CreateBufferSRV( ID3D11Device* pDevice, ID3D11Buffer* pBuffer, ID3D11ShaderResourceView** ppSRVOut )
{
    D3D11_BUFFER_DESC bufferDesc;
    ZeroMemory( &bufferDesc, sizeof(bufferDesc) );
    pBuffer->GetDesc( &bufferDesc );

    const bool isRaw = ( bufferDesc.MiscFlags & D3D11_RESOURCE_MISC_BUFFER_ALLOW_RAW_VIEWS ) != 0;
    const bool isStructured = ( bufferDesc.MiscFlags & D3D11_RESOURCE_MISC_BUFFER_STRUCTURED ) != 0;

    // Only raw and structured buffers are supported.
    if ( !isRaw && !isStructured )
        return E_INVALIDARG;

    D3D11_SHADER_RESOURCE_VIEW_DESC viewDesc;
    ZeroMemory( &viewDesc, sizeof(viewDesc) );
    viewDesc.ViewDimension = D3D11_SRV_DIMENSION_BUFFEREX;
    viewDesc.BufferEx.FirstElement = 0;

    if ( isRaw )
    {
        // Raw buffer: typeless 32-bit format, one element per 4 bytes.
        viewDesc.Format = DXGI_FORMAT_R32_TYPELESS;
        viewDesc.BufferEx.Flags = D3D11_BUFFEREX_SRV_FLAG_RAW;
        viewDesc.BufferEx.NumElements = bufferDesc.ByteWidth / 4;
    }
    else
    {
        // Structured buffer: unknown format, one element per structure stride.
        viewDesc.Format = DXGI_FORMAT_UNKNOWN;
        viewDesc.BufferEx.NumElements = bufferDesc.ByteWidth / bufferDesc.StructureByteStride;
    }

    return pDevice->CreateShaderResourceView( pBuffer, &viewDesc, ppSRVOut );
}
// Creates an unordered access view over an existing raw or structured buffer.
//
// The view starts at element 0. For a raw buffer the element count is the
// byte width divided by 4; for a structured buffer it is the byte width
// divided by the structure stride. The raw flag is tested first, so it wins
// if both flags are set.
//
// Returns E_INVALIDARG when the buffer is neither raw nor structured,
// otherwise the HRESULT from ID3D11Device::CreateUnorderedAccessView.
HRESULT CreateBufferUAV( ID3D11Device* pDevice, ID3D11Buffer* pBuffer, ID3D11UnorderedAccessView** ppUAVOut )
{
    D3D11_BUFFER_DESC bufferDesc;
    ZeroMemory( &bufferDesc, sizeof(bufferDesc) );
    pBuffer->GetDesc( &bufferDesc );

    const bool isRaw = ( bufferDesc.MiscFlags & D3D11_RESOURCE_MISC_BUFFER_ALLOW_RAW_VIEWS ) != 0;
    const bool isStructured = ( bufferDesc.MiscFlags & D3D11_RESOURCE_MISC_BUFFER_STRUCTURED ) != 0;

    // Only raw and structured buffers are supported.
    if ( !isRaw && !isStructured )
        return E_INVALIDARG;

    D3D11_UNORDERED_ACCESS_VIEW_DESC viewDesc;
    ZeroMemory( &viewDesc, sizeof(viewDesc) );
    viewDesc.ViewDimension = D3D11_UAV_DIMENSION_BUFFER;
    viewDesc.Buffer.FirstElement = 0;

    if ( isRaw )
    {
        // Raw buffer: typeless 32-bit format, one element per 4 bytes.
        viewDesc.Format = DXGI_FORMAT_R32_TYPELESS;
        viewDesc.Buffer.Flags = D3D11_BUFFER_UAV_FLAG_RAW;
        viewDesc.Buffer.NumElements = bufferDesc.ByteWidth / 4;
    }
    else
    {
        // Structured buffer: unknown format, one element per structure stride.
        viewDesc.Format = DXGI_FORMAT_UNKNOWN;
        viewDesc.Buffer.NumElements = bufferDesc.ByteWidth / bufferDesc.StructureByteStride;
    }

    return pDevice->CreateUnorderedAccessView( pBuffer, &viewDesc, ppUAVOut );
}
这个命令将计算任务分派给GPU上可用的处理单元,其性能直接取决于硬件和驱动程序的支持情况(这是针对使用D3D_DRIVER_TYPE_HARDWARE创建的设备而言)。
C++
// Dispatch the compute work to the GPU. X, Y and Z are placeholders in this
// snippet; per the ID3D11DeviceContext::Dispatch API they are the number of
// thread groups launched in each dimension.
pd3dImmediateContext->Dispatch( X, Y, Z );
在DirectX 9中,把计算结果从GPU读回CPU是最痛苦的部分,但随着DirectX 11计算着色器的出现,这变得容易多了。首先,创建一个临时读取缓冲区,将其CPU访问标志设置为D3D11_CPU_ACCESS_READ。然后,把输出缓冲区复制到该临时缓冲区,并将其映射到指针,如下所示:
C++
// Copy the GPU output buffer (pBuffer) into the temporary CPU-readable buffer (debugbuf).
// NOTE(review): the copy goes through pd3dImmediateContext but the Map below
// goes through pContextOut -- presumably the same immediate context; confirm.
pd3dImmediateContext->CopyResource( debugbuf, pBuffer );
BufType *p;
// Map the temporary buffer for reading so the CPU can access its contents.
// NOTE(review): the HRESULT returned by Map is not checked, MappedResource is
// declared outside this snippet, and no matching Unmap is shown here.
pContextOut->Map( debugbuf, 0, D3D11_MAP_READ, 0, &MappedResource );
p = (BufType*)MappedResource.pData;
// p now points to the contents of the output buffer