How to access Tier 1 Argument Buffer struct without indexing - ios

According to this example from Apple, Tier 1 Argument Buffers cannot be accessed through pointer indexing (https://developer.apple.com/documentation/metal/buffers/about_argument_buffers). If this is not allowed, how can I index into a particular struct in my argument buffer array?
// Shader.metal
// Wrapper around a single sampleable 2D float texture. The fragment function
// in this snippet receives a pointer to an array of these at buffer(5).
struct MyTexture {
metal::texture2d<float, metal::access::sample> texture;
};
// Fragment function from the question: loops over `count` MyTexture entries
// bound at buffer(5), samples each one and mixes the sample with inputColor.
// The `....` lines are elisions from the original post; `sampler`, `pos`,
// `inputColor` and `outputColor` are not declared in the visible snippet.
fragment half4 myFragment(VertexOut vert [[stage_in]],
....,
constant int &count [[buffer(4)]],
constant MyTexture *textures [[buffer(5)]],
....)
{
for(int i = 0; i < count; i++) {
// Indexing the buffer with i > 0 is the access that the pipeline error quoted
// after this snippet ("accessed with non-zero array index") refers to.
MyTexture resource = textures[i];
float4 color = resource.texture.sample(sampler, pos.xy);
outputColor = mix(inputColor, color, 0.5); // <-- Causes error
}
}
The error that I get is from creating the MTLRenderPipelineState with this error message:
Inlining all functions due to use of indirect argument bufferbuffer(5): Argument buffer accessed with non-zero array index.

Short answer: you can't.
The reason is that Tier 1 hardware can only emulate argument buffers using regular bind points. With Tier 2 you can bind any number of textures there, so the driver can't know at bind time how many slots it will need to use, and the hardware itself can't do a dependent read for other GPU objects, like textures and samplers.

A workaround for Tier 1 would be to pass a pointer to the instance inside the Argument Buffer, rather than the entire buffer.
Example, look at the use of Material.
// Argument-buffered resource
// Groups one sampler, three 2D textures and three scalar/vector factors. Every
// member carries an explicit [[id(...)]] index; the AB_* constants are defined
// outside this snippet.
struct Material {
metal::sampler sampler [[id(AB_MaterialSampler)]];
metal::texture2d<float> base_color_texture [[id(AB_MaterialBaseColorTexture)]];
metal::texture2d<float> normal_map [[id(AB_MaterialNormalMap)]];
metal::texture2d<float> ao_metallic_roughness_map [[id(AB_MaterialAoMetallicRoughnessMap)]];
float3 base_color_factor [[id(AB_MaterialBaseColorFactor)]];
float metallic_factor [[id(AB_MaterialMetallicFactor)]];
float roughness_factor [[id(AB_MaterialRoughnessFactor)]];
};
// GPU-driven rendering kernel
// Each thread encodes one indexed draw for meshes[instance_id] into the
// indirect command buffer held by icb_container. The KB_*, VB_* and FB_* bind
// indices and the Mesh/Transform/etc. types are defined outside this snippet.
kernel void icb_frame_kernel(device IcbContainer& icb_container [[buffer(KB_IcbContainer)]],
constant VertexUniforms* vertex_uniforms [[buffer(KB_VertexUniforms)]],
constant FragmentUniforms* fragment_uniforms [[buffer(KB_FragmentUniforms)]],
device Mesh* meshes [[buffer(KB_Meshes)]],
constant Transform* transforms [[buffer(KB_Transforms)]],
device Material* materials [[buffer(KB_Materials)]],
constant ShadowMap* shadow_map [[buffer(KB_ShadowMap)]],
constant Ibl* ibl [[buffer(KB_Ibl)]],
constant Cubemap* cubemap [[buffer(KB_Cubemap)]],
device MTLIndirectCommandBufferExecutionRange& range [[buffer(KB_ExecutionRange)]],
const uint instance_id [[thread_position_in_grid]]) {
device auto& mesh = meshes[instance_id];
// Atomically increment range.length; the previous value becomes the index of
// the command this thread encodes.
device auto* range_length = reinterpret_cast<device atomic_uint*>(&range.length);
const auto index = atomic_fetch_add_explicit(range_length, 1, memory_order_relaxed);
render_command cmd(icb_container.icb, index);
cmd.set_render_pipeline_state(mesh.pipeline_state);
// Vertex-stage buffers.
cmd.set_vertex_buffer(mesh.vertex_buffer, VB_Vertices);
cmd.set_vertex_buffer(vertex_uniforms, VB_VertexUniforms);
cmd.set_vertex_buffer(transforms, VB_Transforms);
// Fragment-stage buffers.
cmd.set_fragment_buffer(fragment_uniforms, FB_FragmentUniforms);
cmd.set_fragment_buffer(transforms, FB_Transforms);
// Tier 1: use indexed access and pass pointer to instance
cmd.set_fragment_buffer(&materials[instance_id], FB_Material);
// Tier 2: pass entire buffer and use indexed access in fragment shader
// NOTE(review): both variants are shown for illustration and target the same
// FB_Material index; presumably only one of the two calls should be kept.
cmd.set_fragment_buffer(materials, FB_Material);
cmd.set_fragment_buffer(shadow_map, FB_ShadowMap);
cmd.set_fragment_buffer(ibl, FB_Ibl);
cmd.set_fragment_buffer(cubemap, FB_Cubemap);
// Cast the mesh's index buffer to the matching element type, then encode the
// indexed draw with instance_id as the last argument.
if (mesh.is_uint16_index){
constant auto* index_buffer = static_cast<constant ushort*>(mesh.index_buffer);
cmd.draw_indexed_primitives(primitive_type::triangle, mesh.index_count, index_buffer, 1, 0, instance_id);
} else {
constant auto* index_buffer = static_cast<constant uint*>(mesh.index_buffer);
cmd.draw_indexed_primitives(primitive_type::triangle, mesh.index_count, index_buffer, 1, 0, instance_id);
}
}
// Tier 1
// Receives exactly one Material by reference at FB_Material, so no indexing
// into the argument buffer happens in the fragment stage. The body is a
// placeholder in the original post (no return statement shown).
fragment half4 pbr_fragment(ProjectedVertex vert [[stage_in]],
constant FragmentUniforms& uniforms [[buffer(FB_FragmentUniforms)]],
constant Material& material [[buffer(FB_Material)]],
constant Ibl& ibl [[buffer(FB_Ibl), function_constant(HAS_IBL)]],
constant ShadowMap& shadow_map [[buffer(FB_ShadowMap), function_constant(HAS_SHADOW_MAP)]]
) {
// Use Material
}
// Tier 2
// Receives the whole Material array at FB_Material and selects the entry with
// vert.instance_id inside the fragment stage. The body is a placeholder in the
// original post (no return statement shown).
fragment half4 pbr_fragment(ProjectedVertex vert [[stage_in]],
constant FragmentUniforms& uniforms [[buffer(FB_FragmentUniforms)]],
device Material* materials [[buffer(FB_Material)]],
constant Ibl& ibl [[buffer(FB_Ibl), function_constant(HAS_IBL)]],
constant ShadowMap& shadow_map [[buffer(FB_ShadowMap), function_constant(HAS_SHADOW_MAP)]]
) {
// Use indexed Material
const auto& material = materials[vert.instance_id];
}
I did not have time to edit the example for brevity, but it should be clear enough.
Side note: the Metal spec recommends using the device address space whenever you use pointer arithmetic (indexed access). See page 61 of the spec.

Related

Error when using Metal Indirect Command Buffer: "Fragment shader cannot be used with indirect command buffers"

I’m working on a Metal, MTKView based app that takes advantage of the A11 TBDR architecture to do deferred shading in a single render pass. I used Apple’s Deferred Lighting sample code as reference, and it works great.
I’d like to try changing the geometry buffer pass to be GPU-driven, using the Indirect Command Buffer feature of Metal 2 on A11 hardware.
I’ve been using Apple’s Encoding Indirect Command Buffers on the GPU sample code as my main point of reference for this. I’m able to run this sample on my iPhone XR (although, probably off-topic, the scrolling is not smooth, it judders).
I’m running into difficulties however with my own code, when I try to move my geometry buffer pass into an indirect command buffer. When I set supportIndirectCommandBuffers to true on the MTLRenderPipelineDescriptor of the Geometry Buffer pipeline, device.makeRenderPipelineState fails with the error
AGXMetalA12 Code=3 "Fragment shader cannot be used with indirect command buffers"
I’ve not been able to find any information in the documentation on this error. I’m wondering, are there certain kinds of fragment operation that are not allowed in indirect pipelines, or some kind of limit to GPU-driven drawing that I've overlooked (the number of color attachments perhaps)?
SharedTypes.h
Header shared by Metal and Swift
// Include guard for the header shared by the Metal shaders and the Swift code.
#ifndef SharedTypes_h
#define SharedTypes_h
// When compiled as Metal (__METAL_VERSION__ is defined) provide stand-ins for
// the Foundation macro and integer type used below; otherwise import Foundation.
#ifdef __METAL_VERSION__
// Expands to a forward declaration of the enum followed by the head of its
// definition, both with the explicit underlying type _type.
#define NS_CLOSED_ENUM(_type, _name) enum _name : _type _name; enum _name : _type
#define NSInteger metal::int32_t
#else
#import <Foundation/Foundation.h>
#endif
#include <simd/simd.h>
// Per-instance record; gBufferVertex reads one of these per instance id.
typedef struct {
uint32_t meshId;
matrix_float3x3 normalViewMatrix;
matrix_float4x4 modelMatrix;
matrix_float4x4 shadowMVPTransformMatrix;
} InstanceData;
// Uniform block bound at BufferIndexVoxelUniforms in both G-buffer stages.
typedef struct {
vector_float3 cameraPosition;
float voxelScale;
float blockScale;
vector_float3 lightDirection;
matrix_float4x4 viewMatrix;
matrix_float4x4 projectionMatrix;
matrix_float4x4 projectionMatrixInverse;
matrix_float4x4 shadowViewProjectionMatrix;
} VoxelUniforms;
// Buffer binding slots used in the shaders' [[ buffer(...) ]] attributes.
typedef NS_CLOSED_ENUM(NSInteger, BufferIndex)
{
BufferIndexInstances = 0,
BufferIndexVertices = 1,
BufferIndexIndices = 2,
BufferIndexVoxelUniforms = 3,
};
// Render target slots; the Swift pipeline setup uses the raw value as the
// color attachment index.
typedef NS_CLOSED_ENUM(NSInteger, RenderTarget)
{
RenderTargetLighting = 0,
RenderTargetNormal_shadow = 1,
RenderTargetVoxelIndex = 2,
RenderTargetDepth = 3,
};
#endif /* SharedTypes_h */
GBuffer shader
#include <metal_stdlib>
using namespace metal;
#include "../SharedTypes.h"
// Layout of one element of the vertex buffer read by gBufferVertex.
// normalIndex is used there as an index into a `normals` table that is not
// part of this snippet.
struct VertexIn {
packed_half3 position;
packed_half3 texCoord3D;
half ambientOcclusion;
uchar normalIndex;
};
// Values written by gBufferVertex and received by gBufferFragment as its
// [[ stage_in ]] argument. `position` carries the [[ position ]] attribute.
struct VertexInOut {
float4 position [[ position ]];
half3 worldPos;
half3 eyeNormal;
half3 localPosition;
half3 localNormal;
float eyeDepth;
float3 shadowCoord;
half3 texCoord3D;
};
// Vertex stage of the G-buffer pass. Transforms one vertex of one instance
// into clip space and forwards the world/eye/local-space values that the
// fragment stage consumes.
vertex VertexInOut gBufferVertex(device InstanceData* instances [[ buffer( BufferIndexInstances ) ]],
                                 device VertexIn* vertices [[ buffer( BufferIndexVertices ) ]],
                                 constant VoxelUniforms &uniforms [[ buffer( BufferIndexVoxelUniforms ) ]],
                                 uint vid [[ vertex_id ]],
                                 ushort iid [[ instance_id ]])
{
    // Fetch this invocation's vertex and per-instance record.
    VertexIn srcVertex = vertices[vid];
    InstanceData perInstance = instances[iid];

    // Local -> world -> eye space.
    const float4 localPos = float4(float3(srcVertex.position), 1);
    const float4 worldSpace = perInstance.modelMatrix * localPos;
    const float4 eyeSpace = uniforms.viewMatrix * worldSpace;

    // `normals` is a lookup table declared outside this snippet.
    const half3 localNormal = normals[srcVertex.normalIndex];

    VertexInOut result;
    result.position = uniforms.projectionMatrix * eyeSpace;
    result.worldPos = half3(worldSpace.xyz);
    result.eyeDepth = eyeSpace.z;
    result.eyeNormal = half3(perInstance.normalViewMatrix * float3(localNormal));
    result.localNormal = localNormal;
    result.localPosition = half3(srcVertex.position);
    result.texCoord3D = half3(srcVertex.texCoord3D);
    result.shadowCoord = (perInstance.shadowMVPTransformMatrix * localPos).xyz;
    return result;
}
// Fragment stage of the G-buffer pass: produces the voxel index, a combined
// normal/shadow term and the eye depth for each fragment.
// GBufferData, tc2d, getAO, poissonDisk, fogOfWarSampler and the shadow
// constants are defined outside this snippet.
// NOTE(review): this function takes three [[ texture(n) ]] arguments; per the
// answer in this thread, that is what an ICB-enabled pipeline rejects.
fragment GBufferData gBufferFragment(VertexInOut in [[ stage_in ]],
constant VoxelUniforms &uniforms [[ buffer( BufferIndexVoxelUniforms ) ]],
texture3d<ushort, access::sample> voxelMap [[ texture(0) ]],
depth2d<float> shadowMap [[ texture(1) ]],
texture3d<half, access::sample> fogOfWarMap [[ texture(2) ]]
) {
// voxel index
// Read the voxel map at the rounded 3D texture coordinate and subtract 1.
half3 center = round(in.texCoord3D);
uchar voxIndex = voxelMap.read(ushort3(center)).r - 1;
// ambient occlusion
half3 neighborPos = center + in.localNormal;
half3 absNormal = abs(in.localNormal);
half2 texCoord2D = tc2d(in.localPosition / uniforms.voxelScale, absNormal);
half ao = getAO(voxelMap, neighborPos, absNormal.yzx, absNormal.zxy, texCoord2D);
// shadow
constexpr sampler shadowSampler(coord::normalized,
filter::linear,
mip_filter::none,
address::clamp_to_edge,
compare_func::less);
// Start from the ambient level and add one weighted comparison sample per
// poissonDisk offset, then clamp the sum to 1.0.
float shadow_sample = ambientLightingLevel;
for (short i = 0; i < shadowSampleCount; i++){
shadow_sample += shadowMap.sample_compare(shadowSampler, in.shadowCoord.xy + poissonDisk[i] * 0.002, in.shadowCoord.z - 0.0018) * shadowContributionPerSample;
}
shadow_sample = min(1.0, shadow_sample);
//fog-of-war
half fogOfWarSample = fogOfWarMap.sample(fogOfWarSampler, (float3(in.worldPos) / uniforms.blockScale) + float3(0.5, 0.4, 0.5)).r;
// The fog-of-war factor never drops below 0.5.
half notVisible = max(fogOfWarSample, 0.5h);
// output
GBufferData out;
out.normal_shadow = half4(in.eyeNormal, ao * half(shadow_sample) * notVisible);
out.voxelIndex = voxIndex;
out.depth = in.eyeDepth;
return out;
};
Pipeline setup
/// Swift-side additions to the shared `RenderTarget` enum.
extension RenderTarget {
/// Pixel format used for the color attachment of this render target.
var pixelFormat: MTLPixelFormat {
switch self {
case .lighting: return .bgra8Unorm
case .normal_shadow: return .rgba8Snorm
case .voxelIndex: return .r8Uint
case .depth: return .r32Float
}
}
/// Every render target, listed in the same order as the enum's raw values.
static var allCases: [RenderTarget] = [.lighting, .normal_shadow, .voxelIndex, .depth]
}
/// Owns the render pipeline state for the G-buffer pass.
public final class GBufferRenderer {
private let renderPipelineState: MTLRenderPipelineState
weak var shadowMap: MTLTexture?
/// Builds the pipeline from the `gBufferVertex` / `gBufferFragment` functions.
/// - Parameters:
///   - depthPixelFormat: used for both the depth and the stencil attachment.
///   - colorPixelFormat: accepted but not read in this initializer.
///   - sampleCount: forwarded to the pipeline descriptor.
/// - Throws: whatever `LibraryMonad.getLibrary()` or `makeRenderPipelineState` throws.
public init(depthPixelFormat: MTLPixelFormat, colorPixelFormat: MTLPixelFormat, sampleCount: Int = 1) throws {
let library = try LibraryMonad.getLibrary()
let device = library.device
let descriptor = MTLRenderPipelineDescriptor()
descriptor.vertexFunction = library.makeFunction(name: "gBufferVertex")!
descriptor.fragmentFunction = library.makeFunction(name: "gBufferFragment")!
descriptor.depthAttachmentPixelFormat = depthPixelFormat
descriptor.stencilAttachmentPixelFormat = depthPixelFormat
descriptor.sampleCount = sampleCount
// One color attachment per render target, indexed by the enum's raw value.
for target in RenderTarget.allCases {
descriptor.colorAttachments[target.rawValue].pixelFormat = target.pixelFormat
}
// uncomment below to trigger throw
// descriptor.supportIndirectCommandBuffers = true
renderPipelineState = try device.makeRenderPipelineState(descriptor: descriptor) // throws "Fragment shader cannot be used with indirect command buffers"
}
/// Convenience initializer that takes the formats and sample count from an `MTKView`.
public convenience init(mtkView: MTKView) throws {
try self.init(depthPixelFormat: mtkView.depthStencilPixelFormat, colorPixelFormat: mtkView.colorPixelFormat, sampleCount: mtkView.sampleCount)
}
}
The above works great when triggering draws from the CPU in the usual way, but when setting supportIndirectCommandBuffers in preparation for GPU drawing it throws the error.
I've tried stripping down the fragment shader to just return constant values for the GBuffers, and then makeRenderPipelineState succeeds, but when I add texture sampling back in it begins complaining again. I can't seem to pin down what exactly it doesn't like about the frag shader.
Looking through the code and through Metal documentation and Metal Shading Language specification, I think I know why you get this error.
If you look through render_command interface that is present in metal_command_buffer header in Metal, you'll find that to pass parameters to indirect render commands, you only have these functions: set_vertex_buffer and set_fragment_buffer, there is no set_vertex_texture or set_vertex_sampler like you have in MTLRenderCommandEncoder.
But, since your pipeline uses shader that in turn uses textures as arguments and you indicate by using supportIndirectCommandBuffers that you would like to use this pipeline in indirect commands, Metal has no choice but to fail pipeline creation.
Instead if you want to pass textures or samplers to indirect render commands, you should use argument buffers, that you will pass to the shader that issues indirect render commands, which in turn will bind them using set_vertex_buffer and set_fragment_buffer for each render_command.
Specification: Metal Shading Language Specification (Section 5.16)

Fragment function seems properly written but Metal complains

TL;DR: Metal doesn't seem to detect what my vertex shader returns
I have these two functions written in MSL :
// Pass-through vertex function: returns the float4 stored at index `vid` of
// the buffer bound at buffer(0). The return value is a bare float4 with no
// attribute and no enclosing struct.
vertex float4 base_image_rect(constant float4 *pos [[buffer(0)]],
uint vid [[vertex_id]]) {
return pos[vid];
}
// Samples imageToRender at (vPos.x, vPos.y) and returns the sampled color.
// NOTE(review): vPos is a non-struct [[stage_in]] argument; the link error
// quoted in this thread reports that it is not found among the vertex shader
// outputs, and the answer suggests using [[position]] instead.
fragment float4 fragment_image_display(float4 vPos [[stage_in]],
texture2d<float, access::sample> imageToRender [[texture(0)]],
sampler imageSampler [[sampler(0)]]) {
return imageToRender.sample(imageSampler, float2(vPos.x, vPos.y));
}
When I try to create my render pipeline state with those, using this code:
// Make image display render pipeline state
// Uses the view's color pixel format for attachment 0 and looks both shader
// functions up by name in `library`.
let imageDisplayStateDescriptor = MTLRenderPipelineDescriptor()
imageDisplayStateDescriptor.colorAttachments[0].pixelFormat = view.colorPixelFormat
imageDisplayStateDescriptor.vertexFunction = library.makeFunction(name: "base_image_rect")
imageDisplayStateDescriptor.fragmentFunction = library.makeFunction(name: "fragment_image_display")
// `try!` turns any pipeline creation error into a crash; this is where the
// link failure reported in the question surfaces.
displayImagePipelineState = try! device.makeRenderPipelineState(descriptor: imageDisplayStateDescriptor)
There is an error at the creation of the pipeline state:
fatal error: 'try!' expression unexpectedly raised an error: Error
Domain=CompilerError Code=1 "Link failed: fragment input vPos was not
found in vertex shader outputs" [...]
I checked and rechecked the code and can't understand what's wrong.
Any ideas? Thank you!
Try replacing stage_in with position. I think that stage_in is mostly used with structs where each field is either annotated with a specific attribute qualifier or matched by name. Apparently, when it's used with a non-struct type, it's trying to match by name. For example, if your vertex function were to output a struct one of whose fields was vPos, that would find it.

Shader reflection : variable name?

How (if at all possible) could I get the name of variables / structure members in a shader from reflection?
I'm talking about raw HLSL shaders (no Effects Framework / no D3DX, just raw DirectX).
I'm using SharpDX and found out how to get most of the information I need from the shader signature
new SharpDX.D3DCompiler.ShaderReflection(MyShaderByteCode);
I can get most of the information I need, but while I can retrieve the semantic name (TEXCOORD, POSITION, etc.) I can't retrieve the actual name of the element (there's no "name" property).
Am I going about this the wrong way? Is this even possible at all?
// Vertex shader input. The member names (Position, UVPosition) are what the
// question wants to recover; the semantics (POSITION, TEXCOORD) are what
// reflection returns.
struct Vertex
{
float4 Position : POSITION;
float2 UVPosition : TEXCOORD;
};
// Output of PerVertex and input of PerPixel.
struct Pixel
{
float4 Position : SV_POSITION;
float2 UVPosition : TEXCOORD;
};
// Matrix applied to the vertex position in PerVertex.
float4x4 worldViewProj;
// Textures bound to registers t0-t2.
Texture2D<float4> diffuse : register(t0);
Texture2D<float4> height : register(t1);
Texture2D<float4> lightmap : register(t2);
// Sampler used for every texture read in this shader.
SamplerState pictureSampler;
// Vertex shader: raises the vertex along z by half of the height map's red
// channel (sampled at mip 0), transforms it by worldViewProj and forwards
// the UV coordinate unchanged.
Pixel PerVertex(Vertex input)
{
    const float displacement = height.SampleLevel(pictureSampler, input.UVPosition, 0).r / 2;
    input.Position.z += displacement;

    Pixel result = (Pixel) 0;
    result.UVPosition = input.UVPosition;
    result.Position = mul(input.Position, worldViewProj);
    return result;
}
// Pixel shader: multiplies the diffuse sample by the lightmap sample, both
// taken at the interpolated UV coordinate.
float4 PerPixel(Pixel input) : SV_Target
{
    const float4 albedo = diffuse.Sample(pictureSampler, input.UVPosition);
    const float4 lighting = lightmap.Sample(pictureSampler, input.UVPosition);
    return albedo * lighting;
}
What I can retrieve is "POSITION" and "TEXCOORD"; what I want to retrieve is "Position" and "UVPosition".
You need to iterate through the constant buffers.
Please note that if a constant buffer is not used, it will be stripped from the bytecode.
Attached is code that iterates through all variables:
// Print the name of every variable in every constant buffer that the
// reflection object reports for the compiled shader.
var reflection = new SharpDX.D3DCompiler.ShaderReflection(mybytecode);
for (int bufferIndex = 0; bufferIndex < reflection.Description.ConstantBuffers; bufferIndex++)
{
    var buffer = reflection.GetConstantBuffer(bufferIndex);
    for (int variableIndex = 0; variableIndex < buffer.Description.VariableCount; variableIndex++)
    {
        var member = buffer.GetVariable(variableIndex);
        Console.WriteLine(member.Description.Name);
    }
}
I don't think that is possible.
What you can do alternatively, is to explicitly assign semantic indices (in addition to semantic names):
// Same vertex input as in the question, but with an explicit semantic index
// appended to each semantic name (POSITION0, TEXCOORD0).
struct Vertex
{
float4 Position : POSITION0;
float2 UVPosition : TEXCOORD0;
};
and use them to uniquely identify your inputs on CPU side by reading SemanticName and SemanticIndex fields of D3D11_SIGNATURE_PARAMETER_DESC.
Another way that comes to mind is to parse the source HLSL file directly (and so roll your own reflection engine).
Hope it helps!

cuda: involuntary memory changes during kernels [closed]

Closed. This question needs details or clarity. It is not currently accepting answers.
Want to improve this question? Add details and clarify the problem by editing this post.
Closed 8 years ago.
Improve this question
I'm a beginner CUDA programmer.
I'm trying to build an application similar to the NVIDIA particle system example (many balls in a cube).
I have a kernel launcher function as below:
// Host-side launcher for sort_Particles_And_Find_Cell_StartD.
// Copies Particle_Cell to a temporary host buffer before and after the launch
// (the buffers are freed immediately; per the question they exist only to
// inspect the contents in a debugger) and launches the kernel with a grid
// computed by computeGridSize for a block size of 512.
// Num_Cells is only referenced inside the commented-out allocation code.
void Ccuda:: sort_Particles_And_Find_Cell_Start (int *Cell_Start, // output
int *Cell_End, // output
float3 *Sorted_Pos, // output
float3 *Sorted_Vel, //output
int *Particle_Cell, // input
int *Particle_Index, // input
float3 *Old_Pos,
float3 *Old_Vel,
int Num_Particles,
int Num_Cells)
{
int numThreads, numBlocks;
/*Cell_Start = (int*) cudaAlloc (Num_Cells, sizeof(int));
Cell_End = (int*) cudaAlloc (Num_Cells, sizeof(int));
Sorted_Pos = (float3*) cudaAlloc (Num_Particles, sizeof(int));
Sorted_Vel = (float3*) cudaAlloc (Num_Particles, sizeof(int));*/
// Snapshot of Particle_Cell before the kernel runs.
int *h_p_cell = (int *) malloc (Num_Particles * sizeof (int));
cudaMemcpy (h_p_cell,Particle_Cell, Num_Particles*sizeof(int),cudaMemcpyDeviceToHost);
free (h_p_cell);
computeGridSize(Num_Particles, 512, numBlocks, numThreads);
// NOTE(review): the kernel declares `extern __shared__ int Shared_Hash[]` but
// this launch passes no dynamic shared memory size; the answer in this thread
// suggests (numThreads+1)*sizeof(int) as the third launch parameter.
sort_Particles_And_Find_Cell_StartD<<<numBlocks, numThreads>>>(Cell_Start,Cell_End, Sorted_Pos, Sorted_Vel, Particle_Cell, Particle_Index, Old_Pos, Old_Vel, Num_Particles);
// Snapshot of Particle_Cell after the kernel launch.
h_p_cell = (int *) malloc (Num_Particles * sizeof (int));
cudaMemcpy (h_p_cell,Particle_Cell, Num_Particles*sizeof(int),cudaMemcpyDeviceToHost);
free (h_p_cell);
}
And this global kernel function :
// Kernel: one thread per particle (index < Num_Particles).
//  - Writes Cell_Start[c] / Cell_End[c] wherever the cell id stored in
//    Particle_Cell changes between consecutive particles, and closes the last
//    cell at Num_Particles.
//  - Copies Old_Pos / Old_Vel, permuted through Particle_Index, into
//    Sorted_Pos / Sorted_Vel.
// Requires dynamic shared memory of (blockDim.x + 1) ints, supplied as the
// third launch parameter.
// Assumes Particle_Cell is already ordered by cell id -- TODO confirm at the
// call site.
__global__ void sort_Particles_And_Find_Cell_StartD(int *Cell_Start, // output
                                                    int *Cell_End, // output
                                                    float3 *Sorted_Pos, // output
                                                    float3 *Sorted_Vel, //output
                                                    int *Particle_Cell, // input
                                                    int *Particle_Index, // input
                                                    float3 *Old_Pos,
                                                    float3 *Old_Vel,
                                                    int Num_Particles)
{
    int hash;
    extern __shared__ int Shared_Hash[]; // blockSize + 1 elements
    int index = blockIdx.x*blockDim.x + threadIdx.x;
    if (index < Num_Particles)
    {
        hash = Particle_Cell[index];
        // Slot threadIdx.x + 1 holds this thread's hash, so slot threadIdx.x
        // holds the hash of the previous particle.
        Shared_Hash[threadIdx.x+1] = hash;
        if (index > 0 && threadIdx.x == 0)
        {
            // first thread in block load previous particle hash
            Shared_Hash[0] = Particle_Cell[index-1];
        }
    }
    // Kept outside the bounds check so every thread of the block reaches it.
    __syncthreads();
    if (index < Num_Particles)
    {
        // If this particle has a different cell index to the previous
        // particle then it must be the first particle in the cell,
        // so store the index of this particle in the cell.
        // As it isn't the first particle, it must also be the cell end of
        // the previous particle's cell
        if (index == 0 || hash != Shared_Hash[threadIdx.x]) // if its the first thread in the grid or its particle cell index is different from cell index of the previous neighboring thread
        {
            Cell_Start[hash] = index;
            if (index > 0)
                Cell_End[Shared_Hash[threadIdx.x]] = index;
        }
        if (index == Num_Particles - 1)
        {
            Cell_End[hash] = index + 1;
        }
        // Now use the sorted index to reorder the pos and vel data
        int Sorted_Index = Particle_Index[index];
        //float3 pos = FETCH(Old_Pos, Sorted_Index); // macro does either global read or texture fetch
        //float3 vel = FETCH(Old_Vel, Sorted_Index); // see particles_kernel.cuh
        float3 pos = Old_Pos[Sorted_Index];
        float3 vel = Old_Vel[Sorted_Index];
        Sorted_Pos[index] = pos;
        Sorted_Vel[index] = vel;
    }
}
During execution I get the debug error message R6010, saying that abort() has been called.
As you may see in the launcher function (the first one), I use int *h_p_cell to view
the Particle_Cell content before and after the kernel execution, and it seems like the content has been changed, although inside the kernel there is no assignment to Particle_Cell.
The Particle_Cell memory is allocated by cudaMemcpy during program init().
I have been trying for a few days to solve this issue, without success.
Can anyone help?
Your kernel is expecting dynamically allocated shared memory:
extern __shared__ int Shared_Hash[]; // blockSize + 1 elements
But you aren't allocating any in your kernel invocation:
sort_Particles_And_Find_Cell_StartD<<<numBlocks, numThreads>>>(Cell_Start,Cell_End, Sorted_Pos, Sorted_Vel, Particle_Cell, Particle_Index, Old_Pos, Old_Vel, Num_Particles);
^
|
missing shared memory size parameter
You should provide a shared memory amount in your launch configuration. You probably want something like this:
sort_Particles_And_Find_Cell_StartD<<<numBlocks, numThreads, ((numThreads+1)*sizeof(int))>>>(Cell_Start,Cell_End, Sorted_Pos, Sorted_Vel, Particle_Cell, Particle_Index, Old_Pos, Old_Vel, Num_Particles);
This error will cause your kernel to abort when it tries to access shared memory.
You should also do cuda error checking on all cuda API calls and kernel calls. I don't see any evidence of that in your code.
Once you have all the API errors sorted out, run your code with cuda-memcheck. The reason for the unexpected writes to Particle_Cell may be due to out-of-bounds accesses from your kernel, which will become evident with cuda-memcheck.

Reading output from geometry shader on CPU

I'm trying to read the output from a geometry shader which is using stream-output to output to a buffer.
The output buffer used by the geometry shader is described like this:
// Description of the buffer the geometry shader streams into; it is bindable
// both as a vertex buffer and as a stream-output target.
// Initializers are positional: ByteWidth, Usage, BindFlags, CPUAccessFlags,
// MiscFlags.
D3D10_BUFFER_DESC vbdesc =
{
numPoints * sizeof( MESH_VERTEX ),
D3D10_USAGE_DEFAULT,
D3D10_BIND_VERTEX_BUFFER | D3D10_BIND_STREAM_OUTPUT,
0,
0
};
V_RETURN( pd3dDevice->CreateBuffer( &vbdesc, NULL, &g_pDrawFrom ) );
The geometry shader creates a number of triangles based on a single point (at max 12 triangles per point), and if I understand the SDK correctly I have to create a staging resource in order to read the output from the geometry shader on the CPU.
I have declared another buffer resource (this time setting the STAGING flag) like this:
// Description of the CPU-readable staging buffer (staging usage, CPU read
// access, no bind flags).
// NOTE(review): the byte width computed here differs from the one used for
// g_pDrawFrom (numPoints * sizeof( MESH_VERTEX )); the resolution posted in
// this thread copies the source buffer's description instead.
D3D10_BUFFER_DESC sbdesc =
{
(numPoints * (12*3)) * sizeof( VERTEX_STREAMOUT ),
D3D10_USAGE_STAGING,
NULL,
D3D10_CPU_ACCESS_READ,
0
};
V_RETURN( pd3dDevice->CreateBuffer( &sbdesc, NULL, &g_pStaging ) );
After the first draw call of the application the geometry shader is done creating all triangles and can be drawn. However, after this first draw call I would like to be able to read the vertices output by the geometry shader.
Using the buffer staging resource I'm trying to do it like this (right after the first draw call):
// Copy the stream-output buffer into the staging buffer and submit the
// queued commands.
pd3dDevice->CopyResource(g_pStaging, g_pDrawFrom);
pd3dDevice->Flush();
// Map the staging buffer for CPU reads; propagate the HRESULT on failure.
void *ptr = 0;
HRESULT hr = g_pStaging->Map( D3D10_MAP_READ, NULL, &ptr );
if( FAILED( hr ) )
    return hr;
// View the mapped bytes as stream-out vertices.
// NOTE(review): mv is not read before Unmap() in this snippet; presumably the
// vertices have to be consumed while the buffer is still mapped.
VERTEX_STREAMOUT *mv = (VERTEX_STREAMOUT*)ptr;
g_pStaging->Unmap();
This compiles and doesn't give any errors at runtime. However, I don't seem to be getting any output.
The geometry shader outputs the following:
// Per-vertex record emitted by the geometry shader to the stream-output
// buffer: position, normal and texture coordinate.
struct VSSceneStreamOut
{
float4 Pos : POS;
float3 Norm : NORM;
float2 Tex : TEX;
};
and the VERTEX_STREAMOUT is declared like this:
// CPU-side mirror of VSSceneStreamOut, with members declared in the same
// order (4-, 3- and 2-component vectors).
struct VERTEX_STREAMOUT
{
D3DXVECTOR4 Pos;
D3DXVECTOR3 Norm;
D3DXVECTOR2 Tex;
};
Am I missing something here?
Problem solved by creating the staging resource buffer like this:
// Start from the source buffer's own description so the staging buffer gets
// the same ByteWidth, then override only the fields that make it a
// CPU-readable staging resource.
D3D10_BUFFER_DESC sbdesc = {};
g_pDrawFrom->GetDesc( &sbdesc );
sbdesc.Usage = D3D10_USAGE_STAGING;
sbdesc.CPUAccessFlags = D3D10_CPU_ACCESS_READ;
sbdesc.BindFlags = 0;
sbdesc.MiscFlags = 0;
V_RETURN( pd3dDevice->CreateBuffer( &sbdesc, NULL, &g_pStaging ) );
Problem was with the ByteWidth.

Resources