Windows DPDK L2fwd- Receiving packets out of sequence - network-programming

I am validating DPDK receive functionality & for this I'm shooting a pcap externally &
added code in l2fwd to dump received packets to pcap, the l2fwd dumped pcap have all the packets from shooter but some of them are not in sequence.
Shooter is already validated.
DPDK version in use-21.11
link of the pcap used : https://wiki.wireshark.org/uploads/__moin_import__/attachments/SampleCaptures/tcp-ecn-sample.pcap
Out of order packets are random. For the first run I saw no jumbled packets but was able to replicate the issue on second run with the 2nd,3rd,4th packets jumbled having order 3,4,2.
Below is snipped from l2fwd example & our modifications as //TESTCODE..
/* Read packet from RX queues. 8< */
for (i = 0; i < qconf->n_rx_port; i++) {
portid = qconf->rx_port_list[i];
nb_rx = rte_eth_rx_burst(portid, 0,
pkts_burst, MAX_PKT_BURST);
port_statistics[portid].rx += nb_rx;
for (j = 0; j < nb_rx; j++) {
m = pkts_burst[j];
// TESTCODE_STARTS
uint8_t* pkt = rte_pktmbuf_mtod(m, uint8_t*);
dump_to_pcap(pkt, rte_pktmbuf_pkt_len(m));
// TESTCODE_ENDS
rte_prefetch0(rte_pktmbuf_mtod(m, void *));
l2fwd_simple_forward(m, portid);
}
}
/* >8 End of read packet from RX queues. */
Below is code for dump_to_pcap
static int
dump_to_pcap(uint8_t* pkt, int pkt_len)
{
static FILE* fp = NULL;
static int init_file = 0;
if (0 == init_file) {
printf("Creating pcap\n");
char pcap_filename[256] = { 0 };
char Two_pcap_filename[256] = { 0 };
currentDateTime(pcap_filename);
sprintf(Two_pcap_filename,".\\Rx_%d_%s.pcap", 0, pcap_filename);
printf("FileSName to Create: %s\n", Two_pcap_filename);
fp = fopen(Two_pcap_filename, "wb");
if (NULL == fp) {
printf("Unable to open file\n");
fp = NULL;
}
else {
printf("File create success..\n");
init_file = 1;
typedef struct pcap_file_header1 {
unsigned int magic; // a 32-bit "magic number"
unsigned short version_major; //a 16-bit major version number
unsigned short version_minor; //a 16-bit minor version number
unsigned int thiszone; //a 32-bit "time zone offset" field that's actually not used, so ou can (and probably should) just make it 0
unsigned int sigfigs; //a 32-bit "time stamp accuracy" field that's not actually used,so you can (and probably should) just make it 0;
unsigned int snaplen; //a 32-bit "snapshot length" field
unsigned int linktype; //a 32-bit "link layer type" field
}dumpFileHdr;
dumpFileHdr file_hdr;
file_hdr.magic = 2712847316; //0xa1b2c3d4;
file_hdr.version_major = 2;
file_hdr.version_minor = 4;
file_hdr.thiszone = 0;
file_hdr.sigfigs = 0;
file_hdr.snaplen = 65535;
file_hdr.linktype = 1;
fwrite((void*)(&file_hdr), sizeof(dumpFileHdr), 1, fp);
//printf("Pcap Header written\n");
}
}
typedef struct pcap_pkthdr1 {
unsigned int ts_sec; /* time stamp */
unsigned int ts_usec;
unsigned int caplen; /* length of portion present */
unsigned int len; /* length this packet (off wire) */
}dumpPktHdr;
dumpPktHdr pkt_hdr;
static int ts_sec = 1;
pkt_hdr.ts_sec = ts_sec++;
pkt_hdr.ts_usec = 0;
pkt_hdr.caplen = pkt_hdr.len = pkt_len;
if (NULL != fp) {
fwrite((void*)(&pkt_hdr), sizeof(dumpPktHdr), 1, fp);
fwrite((void*)(pkt), pkt_len, 1, fp);
fflush(fp);
}
return 0;
}

Related

How to discover physical address corresponding to PCIe device memory?

I'm trying to access a PCIe device memory from a user space program. I open the file: /sys/bus/pci/devices/0000:3b:00.0/resource0 and then I call mmap that will return a virtual address.
When writing at this virtual address (VA) the MMU will translate it to a physical address (PA), the memory controller will convert the write to the PA into a TLP to request a write to the PCIe device. (AFAIU)
How can I get the physical address that is being used? I had a look to /proc//maps and I see that there is an address that coincides with the PCIe bar0 address (0xa0000000).
But this address seems too low, it overlaps with DDR memory.
I also tried this program to convert VA to PA but it doesn't seem to give sensible results for such mapping:
virt2phys$ cat v2p.c
#define _XOPEN_SOURCE 700
#include <fcntl.h> /* open */
#include <stdint.h> /* uint64_t */
#include <stdio.h> /* printf */
#include <stdlib.h> /* size_t */
#include <unistd.h> /* pread, sysconf */
typedef struct {
uint64_t pfn : 55;
unsigned int soft_dirty : 1;
unsigned int file_page : 1;
unsigned int swapped : 1;
unsigned int present : 1;
} PagemapEntry;
/* Parse the pagemap entry for the given virtual address.
*
* #param[out] entry the parsed entry
* #param[in] pagemap_fd file descriptor to an open /proc/pid/pagemap file
* #param[in] vaddr virtual address to get entry for
* #return 0 for success, 1 for failure
*/
int pagemap_get_entry(PagemapEntry *entry, int pagemap_fd, uintptr_t vaddr)
{
size_t nread;
ssize_t ret;
uint64_t data;
uintptr_t vpn;
vpn = vaddr / sysconf(_SC_PAGE_SIZE);
nread = 0;
while (nread < sizeof(data)) {
ret = pread(pagemap_fd, ((uint8_t*)&data) + nread, sizeof(data) - nread,
vpn * sizeof(data) + nread);
nread += ret;
if (ret <= 0) {
return 1;
}
}
entry->pfn = data & (((uint64_t)1 << 55) - 1);
entry->soft_dirty = (data >> 55) & 1;
entry->file_page = (data >> 61) & 1;
entry->swapped = (data >> 62) & 1;
entry->present = (data >> 63) & 1;
return 0;
}
/* Convert the given virtual address to physical using /proc/PID/pagemap.
*
* #param[out] paddr physical address
* #param[in] pid process to convert for
* #param[in] vaddr virtual address to get entry for
* #return 0 for success, 1 for failure
*/
int virt_to_phys_user(uintptr_t *paddr, pid_t pid, uintptr_t vaddr)
{
char pagemap_file[BUFSIZ];
int pagemap_fd;
snprintf(pagemap_file, sizeof(pagemap_file), "/proc/%ju/pagemap", (uintmax_t)pid);
pagemap_fd = open(pagemap_file, O_RDONLY);
if (pagemap_fd < 0) {
return 1;
}
PagemapEntry entry;
if (pagemap_get_entry(&entry, pagemap_fd, vaddr)) {
return 1;
}
close(pagemap_fd);
*paddr = (entry.pfn * sysconf(_SC_PAGE_SIZE)) + (vaddr % sysconf(_SC_PAGE_SIZE));
return 0;
}
int main(int argc, char **argv)
{
pid_t pid;
uintptr_t vaddr, paddr = 0;
if (argc < 3) {
printf("Usage: %s pid vaddr(in hex)\n", argv[0]);
return EXIT_FAILURE;
}
pid = strtoull(argv[1], NULL, 0);
vaddr = strtoull(argv[2], NULL, 16);
if (virt_to_phys_user(&paddr, pid, vaddr)) {
fprintf(stderr, "error: virt_to_phys_user\n");
return EXIT_FAILURE;
};
printf("0x%jx\n", (uintmax_t)paddr);
return EXIT_SUCCESS;
}

STM32F4 read memory flash per pages

How can I read flash memory per page? I have not been able to read it by sector.
I have to read 2 memory sectors (2*16kbytes).
I am under FreeRtos and I am using STM32F401RE.
I am using pointer, I just need to read two sectors of memory flash.
uint32_t data_store0[4096] = {0};
uint32_t data_store1[4096] = {0};
uint32_t *words0 = (uint32_t *) BASE_ADDR_SECTOR_0;
uint32_t *words1 = (uint32_t *) BASE_ADDR_SECTOR_1;
unsigned int buffer_firmware[8192] = {0};
unsigned char firmware[32768] = {0};
int k, m, i, j = 0;
int count_time = 0;
for(int i=0; i<4096; i++)
{
data_store0[i] += words0[i];
buffer_firmware[i] = data_store0[i];
}
for(int j=0; j<4096; j++)
{
data_store1[j] += words1[j];
}
for(m = 0, k = 4096; k < 8192 && m < 4096; m++, k++)
{
buffer_firmware[k] = data_store1[m];
}
SHA256Input(&ctx, (const uint8_t *) buffer_firmware, 8192);
First of all, you must use volatile keyword while accessing memory map, furthermore, while reading the two page if you accept it is 4096 bytes, then have to use uint8_t because it has 4096 bytes. However, you want to access it a uint32_t pointer then the max value of for loop must be 4096/4 since it has 4096/4 words.
uint8_t buffer_firmware[4096]
uint8_t *words0 = (volatile uint8_t *) BASE_ADDR_SECTOR_0;
uint8_t *words1 = (volatile uint8_t *) BASE_ADDR_SECTOR_1;
for(uint16_t i=0; i<4096; i++){
buffer_firmware[i] = words0[i];
}

Checking code integrity in iOS

How could I guarantee the integrity of the code of an iOS app? I've been taking a look to Apple's Security Overview document, would code signing be enough? Is there any other recommended mechanism to guarantee the code integrity?
Thanks in advance
I had a same problem. This is easy on OS X but somewhat difficult in iOS because iOS doesn't have API like SecStaticCodeCheckValidity.
There are two sections in mach-o binary that you can use to ensure integrity of the app.
LC_ENCRYPTION_INFO
LC_CODE_SIGNATURE
1. LC_ENCRYPTION_INFO
First, LC_ENCRYPTION_INFO stores informations about 'app store encryption'. Once an app is uploaded to app store, app is encrypted before it is released to users.
binary before uploading to appstore or decrypted
otool -l [binary] | grep LC_ENCRYPTION_INFO -A5
cmd LC_ENCRYPTION_INFO
cmdsize 20
cryptoff 16384
cryptsize 5783552
cryptid 0
--
cmd LC_ENCRYPTION_INFO_64
cmdsize 24
cryptoff 16384
cryptsize 6635520
cryptid 0
pad 0
binary after uploading to appstore (encrypted)
otool -l [binary] | grep LC_ENCRYPTION_INFO -A5
cmd LC_ENCRYPTION_INFO
cmdsize 20
cryptoff 16384
cryptsize 5783552
cryptid 1
--
cmd LC_ENCRYPTION_INFO_64
cmdsize 24
cryptoff 16384
cryptsize 6635520
cryptid 1
pad 0
As you can see, 'cryptid' is set to 1 when app is uploaded. So checking 'cryptid' bit will tell us if the binary is encrypted or not.
You may think that this can be bypassed easily by just setting the bit to 1, but then OS will try to decrypt the binary which will make the codes to unrecognizable bytes.
bool isBinaryEncrypted()
{
// checking current binary's LC_ENCRYPTION_INFO
const void *binaryBase;
struct load_command *machoCmd;
const struct mach_header *machoHeader;
NSString *path = [[NSBundle mainBundle] executablePath];
NSData *filedata = [NSData dataWithContentsOfFile:path];
binaryBase = (char *)[filedata bytes];
machoHeader = (const struct mach_header *) binaryBase;
if(machoHeader->magic == FAT_CIGAM)
{
unsigned int offset = 0;
struct fat_arch *fatArch = (struct fat_arch *)((struct fat_header *)machoHeader + 1);
struct fat_header *fatHeader = (struct fat_header *)machoHeader;
for(uint32_t i = 0; i < ntohl(fatHeader->nfat_arch); i++)
{
if(sizeof(int *) == 4 && !(ntohl(fatArch->cputype) & CPU_ARCH_ABI64)) // check 32bit section for 32bit architecture
{
offset = ntohl(fatArch->offset);
break;
}
else if(sizeof(int *) == 8 && (ntohl(fatArch->cputype) & CPU_ARCH_ABI64)) // and 64bit section for 64bit architecture
{
offset = ntohl(fatArch->offset);
break;
}
fatArch = (struct fat_arch *)((uint8_t *)fatArch + sizeof(struct fat_arch));
}
machoHeader = (const struct mach_header *)((uint8_t *)machoHeader + offset);
}
if(machoHeader->magic == MH_MAGIC) // 32bit
{
machoCmd = (struct load_command *)((struct mach_header *)machoHeader + 1);
}
else if(machoHeader->magic == MH_MAGIC_64) // 64bit
{
machoCmd = (struct load_command *)((struct mach_header_64 *)machoHeader + 1);
}
for(uint32_t i=0; i < machoHeader->ncmds && machoCmd != NULL; i++){
if(machoCmd->cmd == LC_ENCRYPTION_INFO)
{
struct encryption_info_command *cryptCmd = (struct encryption_info_command *) machoCmd;
return cryptCmd->cryptid;
}
if(machoCmd->cmd == LC_ENCRYPTION_INFO_64)
{
struct encryption_info_command_64 *cryptCmd = (struct encryption_info_command_64 *) machoCmd;
return cryptCmd->cryptid;
}
machoCmd = (struct load_command *)((uint8_t *)machoCmd + machoCmd->cmdsize);
}
return FALSE; // couldn't find cryptcmd
}
2. LC_CODE_SIGNATURE
LC_CODE_SIGNATURE is the section that /usr/bin/codesign actually refers when checking validity of the binary. But parsing the section is a little bit more difficult than parsing LC_ENCRYPTION_INFO, because it's undocumented and there are no types like signature_info_command.
LC_CODE_SIGNATURE contains hashes of all of the binary except the section itself, and hashes are adjusted whenever it's re-signed.
I ported the codes of /usr/bin/codesign to parse this section. check here and SecStaticCode::validateExecutable defined in here
CodeSigning.h
#ifndef CodeSigning_h
#define CodeSigning_h
#include <stdio.h>
// codes from https://opensource.apple.com/source/Security/Security-55179.1/libsecurity_codesigning/lib/cscdefs.h
enum {
CSMAGIC_REQUIREMENT = 0xfade0c00, /* single Requirement blob */
CSMAGIC_REQUIREMENTS = 0xfade0c01, /* Requirements vector (internal requirements) */
CSMAGIC_CODEDIRECTORY = 0xfade0c02, /* CodeDirectory blob */
CSMAGIC_EMBEDDED_SIGNATURE = 0xfade0cc0, /* embedded form of signature data */
CSMAGIC_DETACHED_SIGNATURE = 0xfade0cc1, /* multi-arch collection of embedded signatures */
CSSLOT_CODEDIRECTORY = 0, /* slot index for CodeDirectory */
};
/*
* Structure of an embedded-signature SuperBlob
*/
typedef struct __BlobIndex {
uint32_t type; /* type of entry */
uint32_t offset; /* offset of entry */
} CS_BlobIndex;
typedef struct __SuperBlob {
uint32_t magic; /* magic number */
uint32_t length; /* total length of SuperBlob */
uint32_t count; /* number of index entries following */
CS_BlobIndex index[]; /* (count) entries */
/* followed by Blobs in no particular order as indicated by offsets in index */
} CS_SuperBlob;
/*
* C form of a CodeDirectory.
*/
typedef struct __CodeDirectory {
uint32_t magic; /* magic number (CSMAGIC_CODEDIRECTORY) */
uint32_t length; /* total length of CodeDirectory blob */
uint32_t version; /* compatibility version */
uint32_t flags; /* setup and mode flags */
uint32_t hashOffset; /* offset of hash slot element at index zero */
uint32_t identOffset; /* offset of identifier string */
uint32_t nSpecialSlots; /* number of special hash slots */
uint32_t nCodeSlots; /* number of ordinary (code) hash slots */
uint32_t codeLimit; /* limit to main image signature range */
uint8_t hashSize; /* size of each hash in bytes */
uint8_t hashType; /* type of hash (cdHashType* constants) */
uint8_t spare1; /* unused (must be zero) */
uint8_t pageSize; /* log2(page size in bytes); 0 => infinite */
uint32_t spare2; /* unused (must be zero) */
/* followed by dynamic content as located by offset fields above */
} CS_CodeDirectory;
static inline const CS_CodeDirectory *findCodeDirectory(const CS_SuperBlob *embedded)
{
if (embedded && ntohl(embedded->magic) == CSMAGIC_EMBEDDED_SIGNATURE) {
const CS_BlobIndex *limit = &embedded->index[ntohl(embedded->count)];
const CS_BlobIndex *p;
for (p = embedded->index; p < limit; ++p)
if (ntohl(p->type) == CSSLOT_CODEDIRECTORY) {
const unsigned char *base = (const unsigned char *)embedded;
const CS_CodeDirectory *cd = (const CS_CodeDirectory *)(base + ntohl(p->offset));
if (ntohl(cd->magic) == CSMAGIC_CODEDIRECTORY){
return cd;
}
else{
break;
}
}
}
// not found
return NULL;
}
//
unsigned char validateSlot(const void *data, size_t length, size_t slot, const CS_CodeDirectory *codeDirectory);
#endif /* CodeSigning_h */
CodeSigning.c
#include "CodeSigning.h"
#include <stdio.h>
#include <string.h>
#import <CommonCrypto/CommonDigest.h>
unsigned char validateSlot(const void *data, size_t length, size_t slot, const CS_CodeDirectory *codeDirectory)
{
uint8_t digest[CC_SHA1_DIGEST_LENGTH + 1] = {0, };
CC_SHA1(data, (CC_LONG)length, digest);
return (memcmp(digest, (void *)((char *)codeDirectory + ntohl(codeDirectory->hashOffset) + 20*slot), 20) == 0);
}
parsing the section
void checkCodeSignature(void *binaryContent){
struct load_command *machoCmd;
const struct mach_header *machoHeader;
machoHeader = (const struct mach_header *) binaryContent;
if(machoHeader->magic == FAT_CIGAM){
unsigned int offset = 0;
struct fat_arch *fatArch = (struct fat_arch *)((struct fat_header *)machoHeader + 1);
struct fat_header *fatHeader = (struct fat_header *)machoHeader;
for(uint32_t i = 0; i < ntohl(fatHeader->nfat_arch); i++)
{
if(sizeof(int *) == 4 && !(ntohl(fatArch->cputype) & CPU_ARCH_ABI64)) // check 32bit section for 32bit architecture
{
offset = ntohl(fatArch->offset);
break;
}
else if(sizeof(int *) == 8 && (ntohl(fatArch->cputype) & CPU_ARCH_ABI64)) // and 64bit section for 64bit architecture
{
offset = ntohl(fatArch->offset);
break;
}
fatArch = (struct fat_arch *)((uint8_t *)fatArch + sizeof(struct fat_arch));
}
machoHeader = (const struct mach_header *)((uint8_t *)machoHeader + offset);
}
if(machoHeader->magic == MH_MAGIC) // 32bit
{
machoCmd = (struct load_command *)((struct mach_header *)machoHeader + 1);
}
else if(machoHeader->magic == MH_MAGIC_64) // 64bit
{
machoCmd = (struct load_command *)((struct mach_header_64 *)machoHeader + 1);
}
for(uint32_t i=0; i < machoHeader->ncmds && machoCmd != NULL; i++){
if(machoCmd->cmd == LC_CODE_SIGNATURE)
{
struct linkedit_data_command *codeSigCmd = (struct linkedit_data_command *) machoCmd;
const CS_SuperBlob *codeEmbedded = (const CS_SuperBlob *)&((char *)machoHeader)[codeSigCmd->dataoff];
void *binaryBase = (void *)machoHeader;
const CS_BlobIndex curIndex = codeEmbedded->index[0];
const CS_CodeDirectory *codeDirectory = (const CS_CodeDirectory *)((char *)codeEmbedded + ntohl(curIndex.offset));
size_t pageSize = codeDirectory->pageSize ? (1 << codeDirectory->pageSize) : 0;
size_t remaining = ntohl(codeDirectory->codeLimit);
size_t processed = 0;
for(size_t slot = 0; slot < ntohl(codeDirectory->nCodeSlots); ++slot){
size_t size = MIN(remaining, pageSize);
if(!validateSlot(binaryBase+processed, size, slot, codeDirectory)){
return;
}
processed += size;
remaining -= size;
}
printf("[*] Code is valid!");
}
}
machoCmd = (struct load_command *)((uint8_t *)machoCmd + machoCmd->cmdsize);
}

Converting a 2D Canny Edge image to 1D edge pixel array in CUDA - Strange behaviour

I have a CUDA kernel which takes an edge image and processes it to create a smaller, 1D array of the edge pixels. Now here is the strange behaviour. Every time I run the kernel and calculate the number of edge pixels in "d_nlist" (see the code near the printf), I get a greater pixel count each time, even when I use the same image and stop the program completely and re-run. Therefore, each time I run it, it takes longer to run, until eventually, it throws an un-caught exception.
My question is, how can I stop this from happening so that I can get consistent results each time I run the kernel?
My device is a Geforce 620.
Constants:
THREADS_X = 32
THREADS_Y = 4
PIXELS_PER_THREAD = 4
MAX_QUEUE_LENGTH = THREADS_X * THREADS_Y * PIXELS_PER_THREAD
IMG_WIDTH = 256
IMG_HEIGHT = 256
IMG_SIZE = IMG_WIDTH * IMG_HEIGHT
BLOCKS_X = IMG_WIDTH / (THREADS_X * PIXELS_PER_THREAD)
BLOCKS_Y = IMG_HEIGHT / THREADS_Y
The kernel is as follows:
__global__ void convert2DEdgeImageTo1DArray( unsigned char const * const image,
unsigned int* const list, int* const glob_index ) {
unsigned int const x = blockIdx.x * THREADS_X*PIXELS_PER_THREAD + threadIdx.x;
unsigned int const y = blockIdx.y * THREADS_Y + threadIdx.y;
volatile int qindex = -1;
volatile __shared__ int sh_qindex[THREADS_Y];
volatile __shared__ int sh_qstart[THREADS_Y];
sh_qindex[threadIdx.y] = -1;
// Start by making an array
volatile __shared__ unsigned int sh_queue[MAX_QUEUE_LENGTH];
// Fill the queue
for(int i=0; i<PIXELS_PER_THREAD; i++)
{
int const xx = i*THREADS_X + x;
// Read one image pixel from global memory
unsigned char const pixel = image[y*IMG_WIDTH + xx];
unsigned int const queue_val = (y << 16) + xx;
if(pixel)
{
do {
qindex++;
sh_qindex[threadIdx.y] = qindex;
sh_queue[threadIdx.y*THREADS_X*PIXELS_PER_THREAD + qindex] = queue_val;
} while (sh_queue[threadIdx.y*THREADS_X*PIXELS_PER_THREAD + qindex] != queue_val);
}
// Reload index from smem (last thread to write to smem will have updated it)
qindex = sh_qindex[threadIdx.y];
}
// Let thread 0 reserve the space required in the global list
__syncthreads();
if(threadIdx.x == 0 && threadIdx.y == 0)
{
// Find how many items are stored in each list
int total_index = 0;
#pragma unroll
for(int i=0; i<THREADS_Y; i++)
{
sh_qstart[i] = total_index;
total_index += (sh_qindex[i] + 1u);
}
// Calculate the offset in the global list
unsigned int global_offset = atomicAdd(glob_index, total_index);
#pragma unroll
for(int i=0; i<THREADS_Y; i++)
{
sh_qstart[i] += global_offset;
}
}
__syncthreads();
// Copy local queues to global queue
for(int i=0; i<=qindex; i+=THREADS_X)
{
if(i + threadIdx.x > qindex)
break;
unsigned int qvalue = sh_queue[threadIdx.y*THREADS_X*PIXELS_PER_THREAD + i + threadIdx.x];
list[sh_qstart[threadIdx.y] + i + threadIdx.x] = qvalue;
}
}
The following is the method which calls the kernel:
void call2DTo1DKernel(unsigned char const * const h_image)
{
// Device side allocation
unsigned char *d_image = NULL;
unsigned int *d_list = NULL;
int h_nlist, *d_nlist = NULL;
cudaMalloc((void**)&d_image, sizeof(unsigned char)*IMG_SIZE);
cudaMalloc((void**)&d_list, sizeof(unsigned int)*IMG_SIZE);
cudaMalloc((void**)&d_nlist, sizeof(int));
// Time measurement initialization
cudaEvent_t start, stop, startio, stopio;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventCreate(&startio);
cudaEventCreate(&stopio);
// Start timer w/ io
cudaEventRecord(startio,0);
// Copy image data to device
cudaMemcpy((void*)d_image, (void*)h_image, sizeof(unsigned char)*IMG_SIZE, cudaMemcpyHostToDevice);
// Start timer
cudaEventRecord(start,0);
// Kernel call
// Phase 1 : Convert 2D binary image to 1D pixel array
dim3 dimBlock1(THREADS_X, THREADS_Y);
dim3 dimGrid1(BLOCKS_X, BLOCKS_Y);
convert2DEdgeImageTo1DArray<<<dimGrid1, dimBlock1>>>(d_image, d_list, d_nlist);
// Stop timer
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
// Stop timer w/ io
cudaEventRecord(stopio,0);
cudaEventSynchronize(stopio);
// Time measurement
cudaEventElapsedTime(&et,start,stop);
cudaEventElapsedTime(&etio,startio,stopio);
// Time measurement deinitialization
cudaEventDestroy(start);
cudaEventDestroy(stop);
cudaEventDestroy(startio);
cudaEventDestroy(stopio);
// Get list size
cudaMemcpy((void*)&h_nlist, (void*)d_nlist, sizeof(int), cudaMemcpyDeviceToHost);
// Report on console
printf("%d pixels processed...\n", h_nlist);
// Device side dealloc
cudaFree(d_image);
cudaFree(d_space);
cudaFree(d_list);
cudaFree(d_nlist);
}
Thank you very much in advance for your help everyone.
As a preamble, let me suggest some troubleshooting steps that are useful:
instrument your code with proper cuda error checking
run your code with cuda-memcheck e.g. cuda-memcheck ./myapp
If you do the above steps, you'll find that your kernel is failing, and the failures have to do with global writes of size 4. So that will focus your attention on the last segment of your kernel, beginning with the comment // Copy local queues to global queue
Regarding your code, then, you have at least 2 problems:
The addressing/indexing in your final segment of your kernel, where you are writing the individual queues out to global memory, is messed up. I'm not going to try and debug this for you.
You are not initializing your d_nlist variable to zero. Therefore when you do an atomic add to it, you are adding your values to a junk value, which will tend to increase as you repeat the process.
Here's some code which has the problems removed, (I did not try to sort out your queue copy code) and error checking added. It produces repeatable results for me:
$ cat t216.cu
#include <stdio.h>
#include <stdlib.h>
#define THREADS_X 32
#define THREADS_Y 4
#define PIXELS_PER_THREAD 4
#define MAX_QUEUE_LENGTH (THREADS_X*THREADS_Y*PIXELS_PER_THREAD)
#define IMG_WIDTH 256
#define IMG_HEIGHT 256
#define IMG_SIZE (IMG_WIDTH*IMG_HEIGHT)
#define BLOCKS_X (IMG_WIDTH/(THREADS_X*PIXELS_PER_THREAD))
#define BLOCKS_Y (IMG_HEIGHT/THREADS_Y)
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
__global__ void convert2DEdgeImageTo1DArray( unsigned char const * const image,
unsigned int* const list, int* const glob_index ) {
unsigned int const x = blockIdx.x * THREADS_X*PIXELS_PER_THREAD + threadIdx.x;
unsigned int const y = blockIdx.y * THREADS_Y + threadIdx.y;
volatile int qindex = -1;
volatile __shared__ int sh_qindex[THREADS_Y];
volatile __shared__ int sh_qstart[THREADS_Y];
sh_qindex[threadIdx.y] = -1;
// Start by making an array
volatile __shared__ unsigned int sh_queue[MAX_QUEUE_LENGTH];
// Fill the queue
for(int i=0; i<PIXELS_PER_THREAD; i++)
{
int const xx = i*THREADS_X + x;
// Read one image pixel from global memory
unsigned char const pixel = image[y*IMG_WIDTH + xx];
unsigned int const queue_val = (y << 16) + xx;
if(pixel)
{
do {
qindex++;
sh_qindex[threadIdx.y] = qindex;
sh_queue[threadIdx.y*THREADS_X*PIXELS_PER_THREAD + qindex] = queue_val;
} while (sh_queue[threadIdx.y*THREADS_X*PIXELS_PER_THREAD + qindex] != queue_val);
}
// Reload index from smem (last thread to write to smem will have updated it)
qindex = sh_qindex[threadIdx.y];
}
// Let thread 0 reserve the space required in the global list
__syncthreads();
if(threadIdx.x == 0 && threadIdx.y == 0)
{
// Find how many items are stored in each list
int total_index = 0;
#pragma unroll
for(int i=0; i<THREADS_Y; i++)
{
sh_qstart[i] = total_index;
total_index += (sh_qindex[i] + 1u);
}
// Calculate the offset in the global list
unsigned int global_offset = atomicAdd(glob_index, total_index);
#pragma unroll
for(int i=0; i<THREADS_Y; i++)
{
sh_qstart[i] += global_offset;
}
}
__syncthreads();
// Copy local queues to global queue
/*
for(int i=0; i<=qindex; i+=THREADS_X)
{
if(i + threadIdx.x > qindex)
break;
unsigned int qvalue = sh_queue[threadIdx.y*THREADS_X*PIXELS_PER_THREAD + i + threadIdx.x];
list[sh_qstart[threadIdx.y] + i + threadIdx.x] = qvalue;
}
*/
}
void call2DTo1DKernel(unsigned char const * const h_image)
{
// Device side allocation
unsigned char *d_image = NULL;
unsigned int *d_list = NULL;
int h_nlist=0, *d_nlist = NULL;
cudaMalloc((void**)&d_image, sizeof(unsigned char)*IMG_SIZE);
cudaMalloc((void**)&d_list, sizeof(unsigned int)*IMG_SIZE);
cudaMalloc((void**)&d_nlist, sizeof(int));
cudaCheckErrors("cudamalloc fail");
// Time measurement initialization
cudaEvent_t start, stop, startio, stopio;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventCreate(&startio);
cudaEventCreate(&stopio);
float et, etio;
// Start timer w/ io
cudaEventRecord(startio,0);
cudaMemcpy(d_nlist, &h_nlist, sizeof(int), cudaMemcpyHostToDevice);
// Copy image data to device
cudaMemcpy((void*)d_image, (void*)h_image, sizeof(unsigned char)*IMG_SIZE, cudaMemcpyHostToDevice);
cudaCheckErrors("cudamemcpy 1");
// Start timer
cudaEventRecord(start,0);
// Kernel call
// Phase 1 : Convert 2D binary image to 1D pixel array
dim3 dimBlock1(THREADS_X, THREADS_Y);
dim3 dimGrid1(BLOCKS_X, BLOCKS_Y);
convert2DEdgeImageTo1DArray<<<dimGrid1, dimBlock1>>>(d_image, d_list, d_nlist);
cudaDeviceSynchronize();
cudaCheckErrors("kernel fail");
// Stop timer
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
// Stop timer w/ io
cudaEventRecord(stopio,0);
cudaEventSynchronize(stopio);
// Time measurement
cudaEventElapsedTime(&et,start,stop);
cudaEventElapsedTime(&etio,startio,stopio);
// Time measurement deinitialization
cudaEventDestroy(start);
cudaEventDestroy(stop);
cudaEventDestroy(startio);
cudaEventDestroy(stopio);
// Get list size
cudaMemcpy((void*)&h_nlist, (void*)d_nlist, sizeof(int), cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy 2");
// Report on console
printf("%d pixels processed...\n", h_nlist);
// Device side dealloc
cudaFree(d_image);
// cudaFree(d_space);
cudaFree(d_list);
cudaFree(d_nlist);
}
int main(){
unsigned char *image;
image = (unsigned char *)malloc(IMG_SIZE * sizeof(unsigned char));
if (image == 0) {printf("malloc fail\n"); return 0;}
for (int i =0 ; i<IMG_SIZE; i++)
image[i] = rand()%2;
call2DTo1DKernel(image);
call2DTo1DKernel(image);
call2DTo1DKernel(image);
call2DTo1DKernel(image);
call2DTo1DKernel(image);
cudaCheckErrors("some error");
return 0;
}
$ nvcc -arch=sm_20 -O3 -o t216 t216.cu
$ ./t216
32617 pixels processed...
32617 pixels processed...
32617 pixels processed...
32617 pixels processed...
32617 pixels processed...
$ ./t216
32617 pixels processed...
32617 pixels processed...
32617 pixels processed...
32617 pixels processed...
32617 pixels processed...
$

Bit field ordering on Big-Endian (SPARC) processor

Consider the code below:
#include <stdio.h>
#include <stdlib.h>
#define FORCE_CAST(var, type) *(type*)&var
struct processor_status_register
{
unsigned int cwp:5;
unsigned int et:1;
unsigned int ps:1;
unsigned int s:1;
unsigned int pil:4;
unsigned int ef:1;
unsigned int ec:1;
unsigned int reserved:6;
unsigned int c:1;
unsigned int v:1;
unsigned int z:1;
unsigned int n:1;
unsigned int ver:4;
unsigned int impl:4;
}__attribute__ ((__packed__));
struct registers
{
unsigned long* registerSet;
unsigned long* globalRegisters;
unsigned long* cwptr;
unsigned long wim, tbr, y, pc, npc;
unsigned short registerWindows;
/* Though Intel x86 architecture allows un-aligned memory access, SPARC mandates memory accesses to be 8 byte aligned. Without __attribute__ ((aligned (8))) or a preceding dummy byte e.g. unsigned short dummyByte, the code below crashes with a dreaded Bus error and Core dump. For more details, follow the links below:
http://blog.jgc.org/2007/04/debugging-solaris-bus-error-caused-by.html
https://groups.google.com/forum/?fromgroups=#!topic/comp.unix.solaris/8SgFiMudGL4
*/
struct processor_status_register __attribute__ ((aligned (8))) psr;
}__attribute__ ((__packed__));
int getBit(unsigned long bitStream, int position)
{
int bit;
bit = (bitStream & (1 << position)) >> position;
return bit;
}
char* showBits(unsigned long bitStream, int startPosition, int endPosition)
{
// Allocate one extra byte for NULL character
char* bits = (char*)malloc(endPosition - startPosition + 2);
int bitIndex;
for(bitIndex = 0; bitIndex <= endPosition; bitIndex++)
bits[bitIndex] = (getBit(bitStream, endPosition - bitIndex)) ? '1' : '0';
bits[bitIndex] = '\0';
return bits;
}
int main()
{
struct registers sparcRegisters; short isLittleEndian;
// Check for Endianness
unsigned long checkEndian = 0x00000001;
if(*((char*)(&checkEndian)))
{printf("Little Endian\n"); isLittleEndian = 1;} // Little
Endian architecture detected
else
{printf("Big Endian\n"); isLittleEndian = 0;} // Big
Endian architecture detected
unsigned long registerValue = 0xF30010A7;
unsigned long swappedRegisterValue = isLittleEndian ? registerValue :
__builtin_bswap32(registerValue);
sparcRegisters.psr = FORCE_CAST(swappedRegisterValue, struct
processor_status_register);
registerValue = isLittleEndian ? FORCE_CAST (sparcRegisters.psr,
unsigned long) : __builtin_bswap32(FORCE_CAST (sparcRegisters.psr,
unsigned long));
printf("\nPSR=0x%0X, IMPL=%u, VER=%u, CWP=%u\n", registerValue,
sparcRegisters.psr.impl, sparcRegisters.psr.ver,
sparcRegisters.psr.cwp);
printf("PSR=%s\n",showBits(registerValue, 0, 31));
sparcRegisters.psr.cwp = 7;
sparcRegisters.psr.et = 1;
sparcRegisters.psr.ps = 0;
sparcRegisters.psr.s = 1;
sparcRegisters.psr.pil = 0;
sparcRegisters.psr.ef = 0;
sparcRegisters.psr.ec = 0;
sparcRegisters.psr.reserved = 0;
sparcRegisters.psr.c = 0;
sparcRegisters.psr.v = 0;
sparcRegisters.psr.z = 0;
sparcRegisters.psr.n = 0;
sparcRegisters.psr.ver = 3;
sparcRegisters.psr.impl = 0xF;
registerValue = isLittleEndian ? FORCE_CAST (sparcRegisters.psr,
unsigned long) : __builtin_bswap32(FORCE_CAST (sparcRegisters.psr,
unsigned long));
printf("\nPSR=0x%0X, IMPL=%u, VER=%u, CWP=%u\n", registerValue,
sparcRegisters.psr.impl, sparcRegisters.psr.ver,
sparcRegisters.psr.cwp);
printf("PSR=%s\n\n",showBits(registerValue, 0, 31));
return 0;
}
I have used gcc-4.7.2 on Solaris 10 on SPARC to compile the following
code to produce the Big-Endian output:
Big Endian
PSR=0xF30010A7, IMPL=3, VER=15, CWP=20
PSR=11110011000000000001000010100111
PSR=0x3F00003D, IMPL=15, VER=3, CWP=7
PSR=00111111000000000000000000111101
I have used gcc-4.4 on Ubuntu-10.04 on Intel-x86 to compile the
following code to produce the Little-Endian output:
Little Endian
PSR=0xF30010A7, IMPL=15, VER=3, CWP=7
PSR=11110011000000000001000010100111
PSR=0xF30000A7, IMPL=15, VER=3, CWP=7
PSR=11110011000000000000000010100111
While the later one is as expected, can anyone please explain the
Big-Endian counterpart? Considering the showBits() method to be
correct, how can PSR=0x3F00003D give rise to IMPL=15, VER=3, CWP=7
values? How is the bit-field is being arranged and interpreted in
memory on a Big-Endian system?
... PSR=0x3F00003D give rise to IMPL=15, VER=3, CWP=7 values?
It cant. I don't know why you're calling __builtin_bswap32 but 0x3F00003D does not represent the memory of the sparcRegisters struct as you initialized it.
Lets check this code:
sparcRegisters.psr.cwp = 7;
sparcRegisters.psr.et = 1;
sparcRegisters.psr.ps = 0;
sparcRegisters.psr.s = 1;
sparcRegisters.psr.pil = 0;
sparcRegisters.psr.ef = 0;
sparcRegisters.psr.ec = 0;
sparcRegisters.psr.reserved = 0;
sparcRegisters.psr.c = 0;
sparcRegisters.psr.v = 0;
sparcRegisters.psr.z = 0;
sparcRegisters.psr.n = 0;
sparcRegisters.psr.ver = 3;
sparcRegisters.psr.impl = 0xF;
The individual translations are as follows:
7 => 00111
1 => 1
0 => 0
1 => 1
0 => 0000
0 => 0
0 => 0
0 => 000000
0 => 0
0 => 0
0 => 0
0 => 0
3 => 0011
F => 1111
The structure therefore in memory becomes 00111101000000000000000000111111 which is 0x3D00003F in big-endian.
You can confirm with this code (tested using CC in solaris):
#include <stdio.h>
#include <string.h>
struct processor_status_register
{
unsigned int cwp:5;
unsigned int et:1;
unsigned int ps:1;
unsigned int s:1;
unsigned int pil:4;
unsigned int ef:1;
unsigned int ec:1;
unsigned int reserved:6;
unsigned int c:1;
unsigned int v:1;
unsigned int z:1;
unsigned int n:1;
unsigned int ver:4;
unsigned int impl:4;
}__attribute__ ((__packed__));
int getBit(unsigned long bitStream, int position)
{
int bit;
bit = (bitStream & (1 << position)) >> position;
return bit;
}
char* showBits(unsigned long bitStream, int startPosition, int endPosition)
{
// Allocate one extra byte for NULL character
static char bits[33];
memset(bits, 0, 33);
int bitIndex;
for(bitIndex = 0; bitIndex <= endPosition; bitIndex++)
{
bits[bitIndex] = (getBit(bitStream, endPosition - bitIndex)) ? '1' : '0';
}
return bits;
}
int main()
{
processor_status_register psr;
psr.cwp = 7;
psr.et = 1;
psr.ps = 0;
psr.s = 1;
psr.pil = 0;
psr.ef = 0;
psr.ec = 0;
psr.reserved = 0;
psr.c = 0;
psr.v = 0;
psr.z = 0;
psr.n = 0;
psr.ver = 3;
psr.impl = 0xF;
unsigned long registerValue = 0;
memcpy(&registerValue, &psr, sizeof(registerValue));
printf("\nPSR=0x%0X, IMPL=%u, VER=%u, CWP=%u\n", registerValue,
psr.impl, psr.ver,
psr.cwp);
printf("PSR=%s\n\n",showBits(registerValue, 0, 31));
return 0;
}
The output of this is:
PSR=0x3D00003F, IMPL=15, VER=3, CWP=7
PSR=00111101000000000000000000111111

Resources