Is it possible to MMAP a PCI BAR memory? - driver

I want to user access memory from a PCIe board which provides a 1GB memory with BAR0.
Currently I use only read and write functionality of my character device driver, which is VERY slow (1MB/s read and 16MB/s write) on a 8x PCIe Gen3.
static ssize_t
MPD_read(
struct file *filp,
char *buffer,
size_t bufferSize,
loff_t *offset )
{
unsigned long unusedBytes = copy_to_user(
( void * ) buffer,
MPD_AdapterBoard.bars[ 0 ].barHWAddress,
bufferSize );
return 0;
}
static ssize_t
MPD_write(
struct file *filp,
const char *buffer,
size_t bufferSize,
loff_t *offset )
{
unsigned long unusedBytes = copy_from_user(
MPD_AdapterBoard.bars[ 0 ].barHWAddress,
( void * ) buffer,
bufferSize );
return 0;
}
Is it possible to use the MMAP (with the .mmap file operation) to get more speed ?
Or is DMA the only option ?
Thanks in advance!
/Jesko

I found out how it's working:
static int
MPD_mmap(
struct file *filp,
struct vm_area_struct *vma )
{
unsigned long offset;
offset = vma->vm_pgoff << PAGE_SHIFT;
if (( offset + ( vma->vm_end - vma->vm_start )) > MPD_AdapterBoard.bars[ 0 ].barSizeInBytes )
{
return -EINVAL;
}
offset += ( unsigned long ) MPD_AdapterBoard.bars[ 0 ].mmioStart;
vma->vm_page_prot = pgprot_noncached( vma->vm_page_prot );
if ( io_remap_pfn_range(vma, vma->vm_start, offset >> PAGE_SHIFT, vma->vm_end - vma->vm_start, vma->vm_page_prot ))
{
return -EAGAIN;
}
return 0;
}
Attention: This is work in progress, so error checking is fairly limited.
In the hope to help someone here, the complete code can be downloaded including a test program from here: https://github.com/jesko42/minipci

Related

Windows DPDK L2fwd- Receiving packets out of sequence

I am validating DPDK receive functionality & for this I'm shooting a pcap externally &
added code in l2fwd to dump received packets to pcap, the l2fwd dumped pcap have all the packets from shooter but some of them are not in sequence.
Shooter is already validated.
DPDK version in use-21.11
link of the pcap used : https://wiki.wireshark.org/uploads/__moin_import__/attachments/SampleCaptures/tcp-ecn-sample.pcap
Out of order packets are random. For the first run I saw no jumbled packets but was able to replicate the issue on second run with the 2nd,3rd,4th packets jumbled having order 3,4,2.
Below is snipped from l2fwd example & our modifications as //TESTCODE..
/* Read packet from RX queues. 8< */
for (i = 0; i < qconf->n_rx_port; i++) {
portid = qconf->rx_port_list[i];
nb_rx = rte_eth_rx_burst(portid, 0,
pkts_burst, MAX_PKT_BURST);
port_statistics[portid].rx += nb_rx;
for (j = 0; j < nb_rx; j++) {
m = pkts_burst[j];
// TESTCODE_STARTS
uint8_t* pkt = rte_pktmbuf_mtod(m, uint8_t*);
dump_to_pcap(pkt, rte_pktmbuf_pkt_len(m));
// TESTCODE_ENDS
rte_prefetch0(rte_pktmbuf_mtod(m, void *));
l2fwd_simple_forward(m, portid);
}
}
/* >8 End of read packet from RX queues. */
Below is code for dump_to_pcap
static int
dump_to_pcap(uint8_t* pkt, int pkt_len)
{
static FILE* fp = NULL;
static int init_file = 0;
if (0 == init_file) {
printf("Creating pcap\n");
char pcap_filename[256] = { 0 };
char Two_pcap_filename[256] = { 0 };
currentDateTime(pcap_filename);
sprintf(Two_pcap_filename,".\\Rx_%d_%s.pcap", 0, pcap_filename);
printf("FileSName to Create: %s\n", Two_pcap_filename);
fp = fopen(Two_pcap_filename, "wb");
if (NULL == fp) {
printf("Unable to open file\n");
fp = NULL;
}
else {
printf("File create success..\n");
init_file = 1;
typedef struct pcap_file_header1 {
unsigned int magic; // a 32-bit "magic number"
unsigned short version_major; //a 16-bit major version number
unsigned short version_minor; //a 16-bit minor version number
unsigned int thiszone; //a 32-bit "time zone offset" field that's actually not used, so ou can (and probably should) just make it 0
unsigned int sigfigs; //a 32-bit "time stamp accuracy" field that's not actually used,so you can (and probably should) just make it 0;
unsigned int snaplen; //a 32-bit "snapshot length" field
unsigned int linktype; //a 32-bit "link layer type" field
}dumpFileHdr;
dumpFileHdr file_hdr;
file_hdr.magic = 2712847316; //0xa1b2c3d4;
file_hdr.version_major = 2;
file_hdr.version_minor = 4;
file_hdr.thiszone = 0;
file_hdr.sigfigs = 0;
file_hdr.snaplen = 65535;
file_hdr.linktype = 1;
fwrite((void*)(&file_hdr), sizeof(dumpFileHdr), 1, fp);
//printf("Pcap Header written\n");
}
}
typedef struct pcap_pkthdr1 {
unsigned int ts_sec; /* time stamp */
unsigned int ts_usec;
unsigned int caplen; /* length of portion present */
unsigned int len; /* length this packet (off wire) */
}dumpPktHdr;
dumpPktHdr pkt_hdr;
static int ts_sec = 1;
pkt_hdr.ts_sec = ts_sec++;
pkt_hdr.ts_usec = 0;
pkt_hdr.caplen = pkt_hdr.len = pkt_len;
if (NULL != fp) {
fwrite((void*)(&pkt_hdr), sizeof(dumpPktHdr), 1, fp);
fwrite((void*)(pkt), pkt_len, 1, fp);
fflush(fp);
}
return 0;
}

histogram kernel memory issue

I am trying to implement an algorithm to process images with more than 256 bins.
The main issue to process histogram in such case comes from the impossibility to allocate more than 32 Kb as local tab in the GPU.
All the algorithms I found for 8 bits per pixel images use a fixed size tab locally.
The histogram is the first process in that tab then a barrier is up and at last an addition is made with the output vector.
I am working with IR image which has more than 32K bins of dynamic.
So I cannot allocate a fixed size tab inside the GPU.
My algorithm use an atomic_add in order to create directly the output histogram.
I am interfacing with OpenCV so, in order to manage the possible case of saturation my bins use floating points. Depending on the ability of the GPU to manage single or double precision.
OpenCV doesn't manage unsigned int, long, and unsigned long data type as matrix type.
I have an error... I do think this error is a kind of segmentation fault.
After several days I still have no idea what can be wrong.
Here is my code :
histogram.cl :
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics: enable
static void Atomic_Add_f64(__global double *val, double delta)
{
union {
double f;
ulong i;
} old;
union {
double f;
ulong i;
} new;
do {
old.f = *val;
new.f = old.f + delta;
}
while (atom_cmpxchg ( (volatile __global ulong *)val, old.i, new.i) != old.i);
}
static void Atomic_Add_f32(__global float *val, double delta)
{
union
{
float f;
uint i;
} old;
union
{
float f;
uint i;
} new;
do
{
old.f = *val;
new.f = old.f + delta;
}
while (atom_cmpxchg ( (volatile __global ulong *)val, old.i, new.i) != old.i);
}
__kernel void khist(
__global const uchar* _src,
const int src_steps,
const int src_offset,
const int rows,
const int cols,
__global uchar* _dst,
const int dst_steps,
const int dst_offset)
{
const int gid = get_global_id(0);
// printf("This message has been printed from the OpenCL kernel %d \n",gid);
if(gid < rows)
{
__global const _Sty* src = (__global const _Sty*)_src;
__global _Dty* dst = (__global _Dty*) _dst;
const int src_step1 = src_steps/sizeof(_Sty);
const int dst_step1 = dst_steps/sizeof(_Dty);
src += mad24(gid,src_step1,src_offset);
dst += mad24(gid,dst_step1,dst_offset);
_Dty one = (_Dty)1;
for(int c=0;c<cols;c++)
{
const _Rty idx = (_Rty)(*(src+c+src_offset));
ATOMIC_FUN(dst+idx+dst_offset,one);
}
}
}
The function Atomic_Add_f64 directly come from here and there
main.cpp
#include <opencv2/core.hpp>
#include <opencv2/core/ocl.hpp>
#include <fstream>
#include <sstream>
#include <chrono>
int main()
{
cv::Mat_<unsigned short> a(480,640);
cv::RNG rng(std::time(nullptr));
std::for_each(a.begin(),a.end(),[&](unsigned short& v){ v = rng.uniform(0,100);});
bool ret = false;
cv::String file_content;
{
std::ifstream file_stream("../test/histogram.cl");
std::ostringstream file_buf;
file_buf<<file_stream.rdbuf();
file_content = file_buf.str();
}
int output_flag = cv::ocl::Device::getDefault().doubleFPConfig() == 0 ? CV_32F : CV_64F;
cv::String atomic_fun = output_flag == CV_32F ? "Atomic_Add_f32" : "Atomic_Add_f64";
cv::ocl::ProgramSource source(file_content);
// std::cout<<source.source()<<std::endl;
cv::ocl::Kernel k;
cv::UMat src;
cv::UMat dst = cv::UMat::zeros(1,65536,output_flag);
a.copyTo(src);
atomic_fun = cv::format("-D _Sty=%s -D _Rty=%s -D _Dty=%s -D ATOMIC_FUN=%s",
cv::ocl::typeToStr(src.depth()),
cv::ocl::typeToStr(src.depth()), // this to manage case like a matrix of usigned short stored as a matrix of float.
cv::ocl::typeToStr(output_flag),
atomic_fun.c_str());
ret = k.create("khist",source,atomic_fun);
std::cout<<"check create : "<<ret<<std::endl;
k.args(cv::ocl::KernelArg::ReadOnly(src),cv::ocl::KernelArg::WriteOnlyNoSize(dst));
std::size_t sz = a.rows;
ret = k.run(1,&sz,nullptr,false);
std::cout<<"check "<<ret<<std::endl;
cv::Mat b;
dst.copyTo(b);
std::copy_n(b.ptr<double>(0),101,std::ostream_iterator<double>(std::cout," "));
std::cout<<std::endl;
return EXIT_SUCCESS;
}
Hello I arrived to fix it.
I don't really know where the issue come from.
But if I suppose the output as a pointer rather than a matrix it work.
The changes I made are these :
histogram.cl :
__kernel void khist(
__global const uchar* _src,
const int src_steps,
const int src_offset,
const int rows,
const int cols,
__global _Dty* _dst)
{
const int gid = get_global_id(0);
if(gid < rows)
{
__global const _Sty* src = (__global const _Sty*)_src;
__global _Dty* dst = _dst;
const int src_step1 = src_steps/sizeof(_Sty);
src += mad24(gid,src_step1,src_offset);
ulong one = 1;
for(int c=0;c<cols;c++)
{
const _Rty idx = (_Rty)(*(src+c+src_offset));
ATOMIC_FUN(dst+idx,one);
}
}
}
main.cpp
k.args(cv::ocl::KernelArg::ReadOnly(src),cv::ocl::KernelArg::PtrWriteOnly(dst));
The rest of the code is the same in the two files.
For me it work fine.
If someone know why it work when the ouput is declared as a pointer rather than a vector (matrix of one row) I am interested.
Nevertheless my issue is fix :).

cudaMalloc and cudaMemcpy not working on kernel call

I have an array already initialized that I am trying to use in each thread of the kernel call (each thread uses a different part of the array so there are no dependencies). I create the array and save memory on the device using cudaMalloc and the array is copied from host to device using cudaMemcpy.
I pass the pointer returned by cudaMalloc to the kernel call to be used by each thread.
int SIZE = 100;
int* data = new int[SIZE];
int* d_data = 0;
cutilSafeCall( cudaMalloc(&d_data, SIZE * sizeof(int)) );
for (int i = 0; i < SIZE; i++)
data[i] = i;
cutilSafeCall( cudaMemcpy(d_data, data, SIZE * sizeof(int), cudaMemcpyHostToDevice) );
This code was taken from here.
For the kernel call.
kernel<<<blocks, threads>>> (results, d_data);
I keep track of the results from each thread by using the struct Result. The next code works without errors.
__global__ void mainKernel(Result res[], int* data){
int x = data[0];
}
But when I assign that value to res:
__global__ void mainKernel(Result res[], int* data){
int threadId = (blockIdx.x * blockDim.x) + threadIdx.x;
int x = data[0];
res[threadId].x = x;
}
An error is raised:
cudaSafeCall() Runtime API error in file , line 355 : an illegal memory access was encountered.
The same error appears with any operation involving the use of that pointer
__global__ void mainKernel(Result res[], int* data){
int threadId = (blockIdx.x * blockDim.x) + threadIdx.x;
int x = data[0];
if (x > 10)
res[threadId].x = 5;
}
There is no problem with the definition of res. Assigning any other value to res[threadId].x does not give me any error.
This is the output of running cuda-memcheck:
========= Invalid __global__ read of size 4
========= at 0x00000150 in mainKernel(Result*, int*)
========= by thread (86,0,0) in block (49,0,0)
========= Address 0x13024c0000 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x2cd) [0x150d6d]
========= Host Frame:./out [0x2cc4b]
========= Host Frame:./out [0x46c23]
========= Host Frame:./out [0x3e37]
========= Host Frame:./out [0x3ca1]
========= Host Frame:./out [0x3cd6]
========= Host Frame:./out [0x39e9]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf5) [0x21ec5]
========= Host Frame:./out [0x31b9]
EDIT:
This is an example of the full code:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <iostream>
#include <assert.h>
typedef struct
{
int x,y,z;
} Result;
__global__ void mainKernel(Result pResults[], int* dataimage)
{
int threadId = (blockIdx.x * blockDim.x) + threadIdx.x;
int xVal = dataimage[0];
if (xVal > 10)
pResults[threadId].x = 5;
}
int main (int argc, char** argv)
{
int NUM_THREADS = 5*5;
int SIZE = 100;
int* data = new int[SIZE];
int* d_data = 0;
cutilSafeCall( cudaMalloc(&d_data, SIZE * sizeof(int)) );
for (int i = 0; i < SIZE; i++)
data[i] = i;
cutilSafeCall( cudaMemcpy(d_data, data, SIZE * sizeof(int), cudaMemcpyHostToDevice) );
unsigned int GPU_ID = 1; // not actually :-)
// unsigned int GPU_ID = cutGetMaxGflopsDeviceId() ;
cudaSetDevice(GPU_ID);
Result * results_GPU = 0;
cutilSafeCall( cudaMalloc( &results_GPU, NUM_THREADS * sizeof(Result)) );
Result * results_CPU = 0;
cutilSafeCall( cudaMallocHost( &results_CPU, NUM_THREADS * sizeof(Result)) );
mainKernel<<<5,5>>> ( results_GPU, d_data );
cudaThreadSynchronize();
cutilSafeCall( cudaMemcpy(results_CPU, results_GPU, NUM_THREADS * sizeof(Result),cudaMemcpyDeviceToHost) );
cutilSafeCall(cudaFree(results_GPU));
cutilSafeCall(cudaFreeHost(results_CPU));
cudaThreadExit();
} // ()
Your problem lies in this sequence of calls:
cutilSafeCall( cudaMalloc(&d_data, SIZE * sizeof(int)) );
for (int i = 0; i < SIZE; i++)
data[i] = i;
cutilSafeCall( cudaMemcpy(d_data, data, SIZE * sizeof(int), cudaMemcpyHostToDevice) );
unsigned int GPU_ID = 1;
cudaSetDevice(GPU_ID);
Result * results_GPU = 0;
cutilSafeCall( cudaMalloc( &results_GPU, NUM_THREADS * sizeof(Result)) );
Result * results_CPU = 0;
cutilSafeCall( cudaMallocHost( &results_CPU, NUM_THREADS * sizeof(Result)) );
mainKernel<<<5,5>>> ( results_GPU, d_data );
What is effectively happening is that you are allocating d_data and running your kernel on different GPUs, and d_data is not valid on the GPU you are launching the kernel on.
In detail, because you call cudaMalloc for d_data before cudaSetDevice, you are allocating d_data on the default device, and then explicitly allocating results_GPU and running the kernel on device 1. Clearly device 1 and the default device are not the same GPU (enumeration of devices usually starts at 0 in the runtime API).
If you change the code like this:
unsigned int GPU_ID = 1;
cutilSafeCall(cudaSetDevice(GPU_ID));
cutilSafeCall( cudaMalloc(&d_data, SIZE * sizeof(int)) );
for (int i = 0; i < SIZE; i++)
data[i] = i;
cutilSafeCall( cudaMemcpy(d_data, data, SIZE * sizeof(int), cudaMemcpyHostToDevice) );
Result * results_GPU = 0;
cutilSafeCall( cudaMalloc( &results_GPU, NUM_THREADS * sizeof(Result)) );
Result * results_CPU = 0;
cutilSafeCall( cudaMallocHost( &results_CPU, NUM_THREADS * sizeof(Result)) );
mainKernel<<<5,5>>> ( results_GPU, d_data );
i.e. select the non-default device before any allocations are made, the problem should disappear. The reason this doesn't happen with your very simple kernel:
__global__ void mainKernel(Result res[], int* data){
int x = data[0];
}
is simply that the CUDA compiler performs very aggressive optimisations by default, and because the result of the read of data[0] isn't actually used, the entire read can be optimised away and you are left with an empty stub kernel which doesn't do anything. Only when the result of the load from memory is used in a memory write will the code not be optimised away during compilation. You can confirm this yourself by dissassembling the code emitted by the compiler, if you are curious.
Note that there are ways to make this work on multi-GPU systems which supported it, via peer-to-peer access, but that must be explicitly configured in your code for that facility to be used.

Copy_to_user problems

I wrote module for a device, and I'm having problems in my read function:
ssize_t my_sys_read(struct file *f, char __user *buffer, size_t s, loff_t *off){
char * myBuffer = "ossec buffer";
size_t read_bytes;
if (s > ( sizeof(char) * 13 ) ) s = ( sizeof(char) * 13 );
if (!access_ok(%VERIFY_WRITE, (void *) buffer, s)) return -EFAULT;
read_bytes = copy_to_user((void *) buffer, (void *) myBuffer, s);
printk(KERN_INFO "myBuffer %s", myBuffer);
printk(KERN_INFO "buffer %s", buffer);
read_bytes = s - ( sizeof(char) * 13 );
return read_bytes;
}
I really don't know why but the copy it's not working, and the printed buffer has no sense for me.
[10038.885838] buffer \xffffff81\xffffffc3\xffffffcb\x1a
I guess the problem is in the copy because the program which use the device is simple.
int main(void)
{
int fd = open(device_name, O_RDONLY);
if(fd < 0)
{
printf("Error: Impossible to open device, action not permited.\n");
return 0;
}
char * buff;
int read_bytes;
read_bytes = read(fd, buff, (13 * sizeof(char) ) );
printf(" %s\n", buff);
}
Thanks!
char * buff;
int read_bytes;
read_bytes = read(fd, buff, (13 * sizeof(char) ) );
You are using the pointer buff without allocating memory for it, it is then a dangling pointer with a random value. You need to do
char *buff = new char(); or
char buff;
int read_bytes;
read_bytes = read(fd, &buff, (13 * sizeof(char)) );

CUDA: use of shared memory to return an array from device functions

is it possible to allocate shared memory for a kernel (inside or extern) and use it in other device functions called from the kernel?
Specially interesting for me will be, if/how i can use it as a returned parameter/array.
It seems to be no problem to use shared memory as input parameter in device functions (at least i get no problems, errors or unexpected results.
When I use it as a return parameter, I get several problems:
I can run the program when it was built from debug configuration.
But i can't debug it -> it crashes in the device functions when i use the shared memory
Also i get errors with cuda-memchecker -> invalid __global__ read
because address is out of bound an it read from shared address space
So is it possible to use shared memory for returning arrays from device functions to kernels?
EDIT:
I wrote a very simple example to exclude other errors done by me.
#define CUDA_CHECK_RETURN(value) { \
cudaError_t _m_cudaStat = (value); \
if (_m_cudaStat != cudaSuccess) { \
printf( "Error %s at line %d in file %s\n", \
cudaGetErrorString(_m_cudaStat), __LINE__, __FILE__); \
exit(-1); \
} }
__device__ void Function( const int *aInput, volatile int *aOutput )
{
for( int i = 0; i < 10; i++ )
aOutput[i] = aInput[i] * aInput[i];
}
__global__ void Kernel( int *aInOut )
{
__shared__ int aShared[10];
for(int i=0; i<10; i++)
aShared[i] = i+1;
Function( aShared, aInOut );
}
int main( int argc, char** argv )
{
int *hArray = NULL;
int *dArray = NULL;
hArray = ( int* )malloc( 10*sizeof(int) );
CUDA_CHECK_RETURN( cudaMalloc( (void**)&dArray, 10*sizeof(int) ) );
for( int i = 0; i < 10; i++ )
hArray[i] = i+1;
CUDA_CHECK_RETURN( cudaMemcpy( dArray, hArray, 10*sizeof(int), cudaMemcpyHostToDevice ) );
cudaMemcpy( dArray, hArray, 10*sizeof(int), cudaMemcpyHostToDevice );
Kernel<<<1,1>>>( dArray );
CUDA_CHECK_RETURN( cudaMemcpy( hArray, dArray, 10*sizeof(int), cudaMemcpyDeviceToHost ) );
cudaMemcpy( hArray, dArray, 10*sizeof(int), cudaMemcpyDeviceToHost );
free( hArray );
CUDA_CHECK_RETURN( cudaFree( dArray ) );
cudaFree( dArray );
return 0;
}
I excecute the kernel by one threadblock and one thread per block. It's no problem to build the program and run it. I get the expected results.
But if the program is testet with cuda-memchecker it terminates the kernel and following log appears.
Error unspecified launch failure at line 49 in file ../CuTest.cu
========= Invalid __global__ read of size 4
========= at 0x00000078 in /home/strautz/Develop/Software/CuTest/Debug/../CuTest.cu:14:Function(int const *, int volatile *)
========= by thread (0,0,0) in block (0,0,0)
========= Address 0x01000000 is out of bounds
========= Device Frame:/home/strautz/Develop/Software/CuTest/Debug/../CuTest.cu:25:Kernel(int*) (Kernel(int*) : 0xd0)
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib/libcuda.so (cuLaunchKernel + 0x34b) [0x55d0b]
========= Host Frame:/usr/lib/libcudart.so.5.0 [0x8f6a]
=========
========= Program hit error 4 on CUDA API call to cudaMemcpy
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/libcuda.so [0x24e129]
========= Host Frame:/usr/lib/libcudart.so.5.0 (cudaMemcpy + 0x2bc) [0x3772c]
========= Host Frame:[0x5400000]
=========
========= ERROR SUMMARY: 2 errors
Does the shared memory have to be aligned, do I have to do something else or can it be ignored - don't think so?
see CUDA 5.0 installation file /usr/local/cuda-5.0/samples/6_Advanced/reduction/doc/reduction.ppt
sdata is a local var of device function warpReduce(). It stores the addr of the shared mem. The shared mem can be read/write by the addr within the device function. The final reduction result is then read from shared mem outside warpReduce()
template <unsigned int blockSize>
__device__ void warpReduce(volatile int *sdata, unsigned int tid) {
if (blockSize >= 64) sdata[tid] += sdata[tid + 32];
if (blockSize >= 32) sdata[tid] += sdata[tid + 16];
if (blockSize >= 16) sdata[tid] += sdata[tid + 8];
if (blockSize >= 8) sdata[tid] += sdata[tid + 4];
if (blockSize >= 4) sdata[tid] += sdata[tid + 2];
if (blockSize >= 2) sdata[tid] += sdata[tid + 1];
}
template <unsigned int blockSize>
__global__ void reduce6(int *g_idata, int *g_odata, unsigned int n) {
extern __shared__ int sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*(blockSize*2) + tid;
unsigned int gridSize = blockSize*2*gridDim.x;
sdata[tid] = 0;
while (i < n) { sdata[tid] += g_idata[i] + g_idata[i+blockSize]; i += gridSize; }
__syncthreads();
if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads(); }
if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads(); }
if (blockSize >= 128) { if (tid < 64) { sdata[tid] += sdata[tid + 64]; } __syncthreads(); }
if (tid < 32) warpReduce(sdata, tid);
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
As here described it was just a driver problem. After I updated to the current one everything is working fine.

Resources