How could I guarantee the integrity of the code of an iOS app? I've been taking a look to Apple's Security Overview document, would code signing be enough? Is there any other recommended mechanism to guarantee the code integrity?
Thanks in advance
I had a same problem. This is easy on OS X but somewhat difficult in iOS because iOS doesn't have API like SecStaticCodeCheckValidity.
There are two sections in mach-o binary that you can use to ensure integrity of the app.
LC_ENCRYPTION_INFO
LC_CODE_SIGNATURE
1. LC_ENCRYPTION_INFO
First, LC_ENCRYPTION_INFO stores informations about 'app store encryption'. Once an app is uploaded to app store, app is encrypted before it is released to users.
binary before uploading to appstore or decrypted
otool -l [binary] | grep LC_ENCRYPTION_INFO -A5
cmd LC_ENCRYPTION_INFO
cmdsize 20
cryptoff 16384
cryptsize 5783552
cryptid 0
--
cmd LC_ENCRYPTION_INFO_64
cmdsize 24
cryptoff 16384
cryptsize 6635520
cryptid 0
pad 0
binary after uploading to appstore (encrypted)
otool -l [binary] | grep LC_ENCRYPTION_INFO -A5
cmd LC_ENCRYPTION_INFO
cmdsize 20
cryptoff 16384
cryptsize 5783552
cryptid 1
--
cmd LC_ENCRYPTION_INFO_64
cmdsize 24
cryptoff 16384
cryptsize 6635520
cryptid 1
pad 0
As you can see, 'cryptid' is set to 1 when app is uploaded. So checking 'cryptid' bit will tell us if the binary is encrypted or not.
You may think that this can be bypassed easily by just setting the bit to 1, but then OS will try to decrypt the binary which will make the codes to unrecognizable bytes.
bool isBinaryEncrypted()
{
// checking current binary's LC_ENCRYPTION_INFO
const void *binaryBase;
struct load_command *machoCmd;
const struct mach_header *machoHeader;
NSString *path = [[NSBundle mainBundle] executablePath];
NSData *filedata = [NSData dataWithContentsOfFile:path];
binaryBase = (char *)[filedata bytes];
machoHeader = (const struct mach_header *) binaryBase;
if(machoHeader->magic == FAT_CIGAM)
{
unsigned int offset = 0;
struct fat_arch *fatArch = (struct fat_arch *)((struct fat_header *)machoHeader + 1);
struct fat_header *fatHeader = (struct fat_header *)machoHeader;
for(uint32_t i = 0; i < ntohl(fatHeader->nfat_arch); i++)
{
if(sizeof(int *) == 4 && !(ntohl(fatArch->cputype) & CPU_ARCH_ABI64)) // check 32bit section for 32bit architecture
{
offset = ntohl(fatArch->offset);
break;
}
else if(sizeof(int *) == 8 && (ntohl(fatArch->cputype) & CPU_ARCH_ABI64)) // and 64bit section for 64bit architecture
{
offset = ntohl(fatArch->offset);
break;
}
fatArch = (struct fat_arch *)((uint8_t *)fatArch + sizeof(struct fat_arch));
}
machoHeader = (const struct mach_header *)((uint8_t *)machoHeader + offset);
}
if(machoHeader->magic == MH_MAGIC) // 32bit
{
machoCmd = (struct load_command *)((struct mach_header *)machoHeader + 1);
}
else if(machoHeader->magic == MH_MAGIC_64) // 64bit
{
machoCmd = (struct load_command *)((struct mach_header_64 *)machoHeader + 1);
}
for(uint32_t i=0; i < machoHeader->ncmds && machoCmd != NULL; i++){
if(machoCmd->cmd == LC_ENCRYPTION_INFO)
{
struct encryption_info_command *cryptCmd = (struct encryption_info_command *) machoCmd;
return cryptCmd->cryptid;
}
if(machoCmd->cmd == LC_ENCRYPTION_INFO_64)
{
struct encryption_info_command_64 *cryptCmd = (struct encryption_info_command_64 *) machoCmd;
return cryptCmd->cryptid;
}
machoCmd = (struct load_command *)((uint8_t *)machoCmd + machoCmd->cmdsize);
}
return FALSE; // couldn't find cryptcmd
}
2. LC_CODE_SIGNATURE
LC_CODE_SIGNATURE is the section that /usr/bin/codesign actually refers when checking validity of the binary. But parsing the section is a little bit more difficult than parsing LC_ENCRYPTION_INFO, because it's undocumented and there are no types like signature_info_command.
LC_CODE_SIGNATURE contains hashes of all of the binary except the section itself, and hashes are adjusted whenever it's re-signed.
I ported the codes of /usr/bin/codesign to parse this section. check here and SecStaticCode::validateExecutable defined in here
CodeSigning.h
#ifndef CodeSigning_h
#define CodeSigning_h
#include <stdio.h>
// codes from https://opensource.apple.com/source/Security/Security-55179.1/libsecurity_codesigning/lib/cscdefs.h
enum {
CSMAGIC_REQUIREMENT = 0xfade0c00, /* single Requirement blob */
CSMAGIC_REQUIREMENTS = 0xfade0c01, /* Requirements vector (internal requirements) */
CSMAGIC_CODEDIRECTORY = 0xfade0c02, /* CodeDirectory blob */
CSMAGIC_EMBEDDED_SIGNATURE = 0xfade0cc0, /* embedded form of signature data */
CSMAGIC_DETACHED_SIGNATURE = 0xfade0cc1, /* multi-arch collection of embedded signatures */
CSSLOT_CODEDIRECTORY = 0, /* slot index for CodeDirectory */
};
/*
* Structure of an embedded-signature SuperBlob
*/
typedef struct __BlobIndex {
uint32_t type; /* type of entry */
uint32_t offset; /* offset of entry */
} CS_BlobIndex;
typedef struct __SuperBlob {
uint32_t magic; /* magic number */
uint32_t length; /* total length of SuperBlob */
uint32_t count; /* number of index entries following */
CS_BlobIndex index[]; /* (count) entries */
/* followed by Blobs in no particular order as indicated by offsets in index */
} CS_SuperBlob;
/*
* C form of a CodeDirectory.
*/
typedef struct __CodeDirectory {
uint32_t magic; /* magic number (CSMAGIC_CODEDIRECTORY) */
uint32_t length; /* total length of CodeDirectory blob */
uint32_t version; /* compatibility version */
uint32_t flags; /* setup and mode flags */
uint32_t hashOffset; /* offset of hash slot element at index zero */
uint32_t identOffset; /* offset of identifier string */
uint32_t nSpecialSlots; /* number of special hash slots */
uint32_t nCodeSlots; /* number of ordinary (code) hash slots */
uint32_t codeLimit; /* limit to main image signature range */
uint8_t hashSize; /* size of each hash in bytes */
uint8_t hashType; /* type of hash (cdHashType* constants) */
uint8_t spare1; /* unused (must be zero) */
uint8_t pageSize; /* log2(page size in bytes); 0 => infinite */
uint32_t spare2; /* unused (must be zero) */
/* followed by dynamic content as located by offset fields above */
} CS_CodeDirectory;
static inline const CS_CodeDirectory *findCodeDirectory(const CS_SuperBlob *embedded)
{
if (embedded && ntohl(embedded->magic) == CSMAGIC_EMBEDDED_SIGNATURE) {
const CS_BlobIndex *limit = &embedded->index[ntohl(embedded->count)];
const CS_BlobIndex *p;
for (p = embedded->index; p < limit; ++p)
if (ntohl(p->type) == CSSLOT_CODEDIRECTORY) {
const unsigned char *base = (const unsigned char *)embedded;
const CS_CodeDirectory *cd = (const CS_CodeDirectory *)(base + ntohl(p->offset));
if (ntohl(cd->magic) == CSMAGIC_CODEDIRECTORY){
return cd;
}
else{
break;
}
}
}
// not found
return NULL;
}
//
unsigned char validateSlot(const void *data, size_t length, size_t slot, const CS_CodeDirectory *codeDirectory);
#endif /* CodeSigning_h */
CodeSigning.c
#include "CodeSigning.h"
#include <stdio.h>
#include <string.h>
#import <CommonCrypto/CommonDigest.h>
unsigned char validateSlot(const void *data, size_t length, size_t slot, const CS_CodeDirectory *codeDirectory)
{
uint8_t digest[CC_SHA1_DIGEST_LENGTH + 1] = {0, };
CC_SHA1(data, (CC_LONG)length, digest);
return (memcmp(digest, (void *)((char *)codeDirectory + ntohl(codeDirectory->hashOffset) + 20*slot), 20) == 0);
}
parsing the section
void checkCodeSignature(void *binaryContent){
struct load_command *machoCmd;
const struct mach_header *machoHeader;
machoHeader = (const struct mach_header *) binaryContent;
if(machoHeader->magic == FAT_CIGAM){
unsigned int offset = 0;
struct fat_arch *fatArch = (struct fat_arch *)((struct fat_header *)machoHeader + 1);
struct fat_header *fatHeader = (struct fat_header *)machoHeader;
for(uint32_t i = 0; i < ntohl(fatHeader->nfat_arch); i++)
{
if(sizeof(int *) == 4 && !(ntohl(fatArch->cputype) & CPU_ARCH_ABI64)) // check 32bit section for 32bit architecture
{
offset = ntohl(fatArch->offset);
break;
}
else if(sizeof(int *) == 8 && (ntohl(fatArch->cputype) & CPU_ARCH_ABI64)) // and 64bit section for 64bit architecture
{
offset = ntohl(fatArch->offset);
break;
}
fatArch = (struct fat_arch *)((uint8_t *)fatArch + sizeof(struct fat_arch));
}
machoHeader = (const struct mach_header *)((uint8_t *)machoHeader + offset);
}
if(machoHeader->magic == MH_MAGIC) // 32bit
{
machoCmd = (struct load_command *)((struct mach_header *)machoHeader + 1);
}
else if(machoHeader->magic == MH_MAGIC_64) // 64bit
{
machoCmd = (struct load_command *)((struct mach_header_64 *)machoHeader + 1);
}
for(uint32_t i=0; i < machoHeader->ncmds && machoCmd != NULL; i++){
if(machoCmd->cmd == LC_CODE_SIGNATURE)
{
struct linkedit_data_command *codeSigCmd = (struct linkedit_data_command *) machoCmd;
const CS_SuperBlob *codeEmbedded = (const CS_SuperBlob *)&((char *)machoHeader)[codeSigCmd->dataoff];
void *binaryBase = (void *)machoHeader;
const CS_BlobIndex curIndex = codeEmbedded->index[0];
const CS_CodeDirectory *codeDirectory = (const CS_CodeDirectory *)((char *)codeEmbedded + ntohl(curIndex.offset));
size_t pageSize = codeDirectory->pageSize ? (1 << codeDirectory->pageSize) : 0;
size_t remaining = ntohl(codeDirectory->codeLimit);
size_t processed = 0;
for(size_t slot = 0; slot < ntohl(codeDirectory->nCodeSlots); ++slot){
size_t size = MIN(remaining, pageSize);
if(!validateSlot(binaryBase+processed, size, slot, codeDirectory)){
return;
}
processed += size;
remaining -= size;
}
printf("[*] Code is valid!");
}
}
machoCmd = (struct load_command *)((uint8_t *)machoCmd + machoCmd->cmdsize);
}
Related
I'm trying to access a PCIe device memory from a user space program. I open the file: /sys/bus/pci/devices/0000:3b:00.0/resource0 and then I call mmap that will return a virtual address.
When writing at this virtual address (VA) the MMU will translate it to a physical address (PA), the memory controller will convert the write to the PA into a TLP to request a write to the PCIe device. (AFAIU)
How can I get the physical address that is being used? I had a look to /proc//maps and I see that there is an address that coincides with the PCIe bar0 address (0xa0000000).
But this address seems too low, it overlaps with DDR memory.
I also tried this program to convert VA to PA but it doesn't seem to give sensible results for such mapping:
virt2phys$ cat v2p.c
#define _XOPEN_SOURCE 700
#include <fcntl.h> /* open */
#include <stdint.h> /* uint64_t */
#include <stdio.h> /* printf */
#include <stdlib.h> /* size_t */
#include <unistd.h> /* pread, sysconf */
typedef struct {
uint64_t pfn : 55;
unsigned int soft_dirty : 1;
unsigned int file_page : 1;
unsigned int swapped : 1;
unsigned int present : 1;
} PagemapEntry;
/* Parse the pagemap entry for the given virtual address.
*
* #param[out] entry the parsed entry
* #param[in] pagemap_fd file descriptor to an open /proc/pid/pagemap file
* #param[in] vaddr virtual address to get entry for
* #return 0 for success, 1 for failure
*/
int pagemap_get_entry(PagemapEntry *entry, int pagemap_fd, uintptr_t vaddr)
{
size_t nread;
ssize_t ret;
uint64_t data;
uintptr_t vpn;
vpn = vaddr / sysconf(_SC_PAGE_SIZE);
nread = 0;
while (nread < sizeof(data)) {
ret = pread(pagemap_fd, ((uint8_t*)&data) + nread, sizeof(data) - nread,
vpn * sizeof(data) + nread);
nread += ret;
if (ret <= 0) {
return 1;
}
}
entry->pfn = data & (((uint64_t)1 << 55) - 1);
entry->soft_dirty = (data >> 55) & 1;
entry->file_page = (data >> 61) & 1;
entry->swapped = (data >> 62) & 1;
entry->present = (data >> 63) & 1;
return 0;
}
/* Convert the given virtual address to physical using /proc/PID/pagemap.
*
* #param[out] paddr physical address
* #param[in] pid process to convert for
* #param[in] vaddr virtual address to get entry for
* #return 0 for success, 1 for failure
*/
int virt_to_phys_user(uintptr_t *paddr, pid_t pid, uintptr_t vaddr)
{
char pagemap_file[BUFSIZ];
int pagemap_fd;
snprintf(pagemap_file, sizeof(pagemap_file), "/proc/%ju/pagemap", (uintmax_t)pid);
pagemap_fd = open(pagemap_file, O_RDONLY);
if (pagemap_fd < 0) {
return 1;
}
PagemapEntry entry;
if (pagemap_get_entry(&entry, pagemap_fd, vaddr)) {
return 1;
}
close(pagemap_fd);
*paddr = (entry.pfn * sysconf(_SC_PAGE_SIZE)) + (vaddr % sysconf(_SC_PAGE_SIZE));
return 0;
}
int main(int argc, char **argv)
{
pid_t pid;
uintptr_t vaddr, paddr = 0;
if (argc < 3) {
printf("Usage: %s pid vaddr(in hex)\n", argv[0]);
return EXIT_FAILURE;
}
pid = strtoull(argv[1], NULL, 0);
vaddr = strtoull(argv[2], NULL, 16);
if (virt_to_phys_user(&paddr, pid, vaddr)) {
fprintf(stderr, "error: virt_to_phys_user\n");
return EXIT_FAILURE;
};
printf("0x%jx\n", (uintmax_t)paddr);
return EXIT_SUCCESS;
}
I am validating DPDK receive functionality & for this I'm shooting a pcap externally &
added code in l2fwd to dump received packets to pcap, the l2fwd dumped pcap have all the packets from shooter but some of them are not in sequence.
Shooter is already validated.
DPDK version in use-21.11
link of the pcap used : https://wiki.wireshark.org/uploads/__moin_import__/attachments/SampleCaptures/tcp-ecn-sample.pcap
Out of order packets are random. For the first run I saw no jumbled packets but was able to replicate the issue on second run with the 2nd,3rd,4th packets jumbled having order 3,4,2.
Below is snipped from l2fwd example & our modifications as //TESTCODE..
/* Read packet from RX queues. 8< */
for (i = 0; i < qconf->n_rx_port; i++) {
portid = qconf->rx_port_list[i];
nb_rx = rte_eth_rx_burst(portid, 0,
pkts_burst, MAX_PKT_BURST);
port_statistics[portid].rx += nb_rx;
for (j = 0; j < nb_rx; j++) {
m = pkts_burst[j];
// TESTCODE_STARTS
uint8_t* pkt = rte_pktmbuf_mtod(m, uint8_t*);
dump_to_pcap(pkt, rte_pktmbuf_pkt_len(m));
// TESTCODE_ENDS
rte_prefetch0(rte_pktmbuf_mtod(m, void *));
l2fwd_simple_forward(m, portid);
}
}
/* >8 End of read packet from RX queues. */
Below is code for dump_to_pcap
static int
dump_to_pcap(uint8_t* pkt, int pkt_len)
{
static FILE* fp = NULL;
static int init_file = 0;
if (0 == init_file) {
printf("Creating pcap\n");
char pcap_filename[256] = { 0 };
char Two_pcap_filename[256] = { 0 };
currentDateTime(pcap_filename);
sprintf(Two_pcap_filename,".\\Rx_%d_%s.pcap", 0, pcap_filename);
printf("FileSName to Create: %s\n", Two_pcap_filename);
fp = fopen(Two_pcap_filename, "wb");
if (NULL == fp) {
printf("Unable to open file\n");
fp = NULL;
}
else {
printf("File create success..\n");
init_file = 1;
typedef struct pcap_file_header1 {
unsigned int magic; // a 32-bit "magic number"
unsigned short version_major; //a 16-bit major version number
unsigned short version_minor; //a 16-bit minor version number
unsigned int thiszone; //a 32-bit "time zone offset" field that's actually not used, so ou can (and probably should) just make it 0
unsigned int sigfigs; //a 32-bit "time stamp accuracy" field that's not actually used,so you can (and probably should) just make it 0;
unsigned int snaplen; //a 32-bit "snapshot length" field
unsigned int linktype; //a 32-bit "link layer type" field
}dumpFileHdr;
dumpFileHdr file_hdr;
file_hdr.magic = 2712847316; //0xa1b2c3d4;
file_hdr.version_major = 2;
file_hdr.version_minor = 4;
file_hdr.thiszone = 0;
file_hdr.sigfigs = 0;
file_hdr.snaplen = 65535;
file_hdr.linktype = 1;
fwrite((void*)(&file_hdr), sizeof(dumpFileHdr), 1, fp);
//printf("Pcap Header written\n");
}
}
typedef struct pcap_pkthdr1 {
unsigned int ts_sec; /* time stamp */
unsigned int ts_usec;
unsigned int caplen; /* length of portion present */
unsigned int len; /* length this packet (off wire) */
}dumpPktHdr;
dumpPktHdr pkt_hdr;
static int ts_sec = 1;
pkt_hdr.ts_sec = ts_sec++;
pkt_hdr.ts_usec = 0;
pkt_hdr.caplen = pkt_hdr.len = pkt_len;
if (NULL != fp) {
fwrite((void*)(&pkt_hdr), sizeof(dumpPktHdr), 1, fp);
fwrite((void*)(pkt), pkt_len, 1, fp);
fflush(fp);
}
return 0;
}
How can I find the text segment (AKA code segment) range in iOS? Meaning, what is the start address and the end address of the text segment?
I found this interesting post but it works for me on Android but not on iOS.
After some digging and expert help (thanks Moshe Kravchik) I got to the desired solution - getting the text segment range by parsing the mach header and retrieving the load commands, segments and sections.
#include <mach-o/dyld.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <dlfcn.h>
#import <Foundation/Foundation.h>
#define PRINT_STR "Found __text Section of %s, addr 0x%x, size %u, offset 0x%x, calc address 0x%x"
#define LC_SEGMENT_NATIVE LC_SEGMENT
#define segment_command_native segment_command
#define section_native section
struct libRange
{
uint32_t start;
uint32_t end;
};
void getTextSegmentAddr(struct libRange *txtSegRange)
{
if (txtSegRange==NULL)
return;
txtSegRange->start=0;
const struct mach_header *mach_hdr;
mach_hdr = _dyld_get_image_header(0);
const struct load_command *cmds = (const struct load_command *)(mach_hdr + 1);
uint32_t cmdsleft;
const struct load_command *lc;
for(lc = cmds, cmdsleft = mach_hdr->ncmds; cmdsleft-- && (0 == txtSegRange->start);) {
if(lc->cmd == LC_SEGMENT_NATIVE) {
const struct segment_command_native *sc = (void *) lc;
const struct section_native *sect = (void *) (sc + 1);
for(uint32_t sect_idx = 0; sect_idx < sc->nsects; sect_idx++) {
if(!strcmp("__TEXT", sect->segname) && !strcmp("__text", sect->sectname)) {
uint32_t memAddr = (sc->vmaddr + _dyld_get_image_vmaddr_slide(0) + sect->offset - sc->fileoff);
NSLog(#PRINT_STR,_dyld_get_image_name(0), sect->addr, sect->size, sect->offset, memAddr);
txtSegRange->start = memAddr;
txtSegRange->end = memAddr + sect->size;
break;
}
sect++;
}
}
lc = (void *) ((char *) lc + lc->cmdsize);
}
}
int main()
{
struct libRange txtSegRange;
getTextSegmentAddr(&txtSegRange);
return 0;
}
I have a CUDA kernel which takes an edge image and processes it to create a smaller, 1D array of the edge pixels. Now here is the strange behaviour. Every time I run the kernel and calculate the number of edge pixels in "d_nlist" (see the code near the printf), I get a greater pixel count each time, even when I use the same image and stop the program completely and re-run. Therefore, each time I run it, it takes longer to run, until eventually, it throws an un-caught exception.
My question is, how can I stop this from happening so that I can get consistent results each time I run the kernel?
My device is a Geforce 620.
Constants:
THREADS_X = 32
THREADS_Y = 4
PIXELS_PER_THREAD = 4
MAX_QUEUE_LENGTH = THREADS_X * THREADS_Y * PIXELS_PER_THREAD
IMG_WIDTH = 256
IMG_HEIGHT = 256
IMG_SIZE = IMG_WIDTH * IMG_HEIGHT
BLOCKS_X = IMG_WIDTH / (THREADS_X * PIXELS_PER_THREAD)
BLOCKS_Y = IMG_HEIGHT / THREADS_Y
The kernel is as follows:
__global__ void convert2DEdgeImageTo1DArray( unsigned char const * const image,
unsigned int* const list, int* const glob_index ) {
unsigned int const x = blockIdx.x * THREADS_X*PIXELS_PER_THREAD + threadIdx.x;
unsigned int const y = blockIdx.y * THREADS_Y + threadIdx.y;
volatile int qindex = -1;
volatile __shared__ int sh_qindex[THREADS_Y];
volatile __shared__ int sh_qstart[THREADS_Y];
sh_qindex[threadIdx.y] = -1;
// Start by making an array
volatile __shared__ unsigned int sh_queue[MAX_QUEUE_LENGTH];
// Fill the queue
for(int i=0; i<PIXELS_PER_THREAD; i++)
{
int const xx = i*THREADS_X + x;
// Read one image pixel from global memory
unsigned char const pixel = image[y*IMG_WIDTH + xx];
unsigned int const queue_val = (y << 16) + xx;
if(pixel)
{
do {
qindex++;
sh_qindex[threadIdx.y] = qindex;
sh_queue[threadIdx.y*THREADS_X*PIXELS_PER_THREAD + qindex] = queue_val;
} while (sh_queue[threadIdx.y*THREADS_X*PIXELS_PER_THREAD + qindex] != queue_val);
}
// Reload index from smem (last thread to write to smem will have updated it)
qindex = sh_qindex[threadIdx.y];
}
// Let thread 0 reserve the space required in the global list
__syncthreads();
if(threadIdx.x == 0 && threadIdx.y == 0)
{
// Find how many items are stored in each list
int total_index = 0;
#pragma unroll
for(int i=0; i<THREADS_Y; i++)
{
sh_qstart[i] = total_index;
total_index += (sh_qindex[i] + 1u);
}
// Calculate the offset in the global list
unsigned int global_offset = atomicAdd(glob_index, total_index);
#pragma unroll
for(int i=0; i<THREADS_Y; i++)
{
sh_qstart[i] += global_offset;
}
}
__syncthreads();
// Copy local queues to global queue
for(int i=0; i<=qindex; i+=THREADS_X)
{
if(i + threadIdx.x > qindex)
break;
unsigned int qvalue = sh_queue[threadIdx.y*THREADS_X*PIXELS_PER_THREAD + i + threadIdx.x];
list[sh_qstart[threadIdx.y] + i + threadIdx.x] = qvalue;
}
}
The following is the method which calls the kernel:
void call2DTo1DKernel(unsigned char const * const h_image)
{
// Device side allocation
unsigned char *d_image = NULL;
unsigned int *d_list = NULL;
int h_nlist, *d_nlist = NULL;
cudaMalloc((void**)&d_image, sizeof(unsigned char)*IMG_SIZE);
cudaMalloc((void**)&d_list, sizeof(unsigned int)*IMG_SIZE);
cudaMalloc((void**)&d_nlist, sizeof(int));
// Time measurement initialization
cudaEvent_t start, stop, startio, stopio;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventCreate(&startio);
cudaEventCreate(&stopio);
// Start timer w/ io
cudaEventRecord(startio,0);
// Copy image data to device
cudaMemcpy((void*)d_image, (void*)h_image, sizeof(unsigned char)*IMG_SIZE, cudaMemcpyHostToDevice);
// Start timer
cudaEventRecord(start,0);
// Kernel call
// Phase 1 : Convert 2D binary image to 1D pixel array
dim3 dimBlock1(THREADS_X, THREADS_Y);
dim3 dimGrid1(BLOCKS_X, BLOCKS_Y);
convert2DEdgeImageTo1DArray<<<dimGrid1, dimBlock1>>>(d_image, d_list, d_nlist);
// Stop timer
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
// Stop timer w/ io
cudaEventRecord(stopio,0);
cudaEventSynchronize(stopio);
// Time measurement
cudaEventElapsedTime(&et,start,stop);
cudaEventElapsedTime(&etio,startio,stopio);
// Time measurement deinitialization
cudaEventDestroy(start);
cudaEventDestroy(stop);
cudaEventDestroy(startio);
cudaEventDestroy(stopio);
// Get list size
cudaMemcpy((void*)&h_nlist, (void*)d_nlist, sizeof(int), cudaMemcpyDeviceToHost);
// Report on console
printf("%d pixels processed...\n", h_nlist);
// Device side dealloc
cudaFree(d_image);
cudaFree(d_space);
cudaFree(d_list);
cudaFree(d_nlist);
}
Thank you very much in advance for your help everyone.
As a preamble, let me suggest some troubleshooting steps that are useful:
instrument your code with proper cuda error checking
run your code with cuda-memcheck e.g. cuda-memcheck ./myapp
If you do the above steps, you'll find that your kernel is failing, and the failures have to do with global writes of size 4. So that will focus your attention on the last segment of your kernel, beginning with the comment // Copy local queues to global queue
Regarding your code, then, you have at least 2 problems:
The addressing/indexing in your final segment of your kernel, where you are writing the individual queues out to global memory, is messed up. I'm not going to try and debug this for you.
You are not initializing your d_nlist variable to zero. Therefore when you do an atomic add to it, you are adding your values to a junk value, which will tend to increase as you repeat the process.
Here's some code which has the problems removed, (I did not try to sort out your queue copy code) and error checking added. It produces repeatable results for me:
$ cat t216.cu
#include <stdio.h>
#include <stdlib.h>
#define THREADS_X 32
#define THREADS_Y 4
#define PIXELS_PER_THREAD 4
#define MAX_QUEUE_LENGTH (THREADS_X*THREADS_Y*PIXELS_PER_THREAD)
#define IMG_WIDTH 256
#define IMG_HEIGHT 256
#define IMG_SIZE (IMG_WIDTH*IMG_HEIGHT)
#define BLOCKS_X (IMG_WIDTH/(THREADS_X*PIXELS_PER_THREAD))
#define BLOCKS_Y (IMG_HEIGHT/THREADS_Y)
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
__global__ void convert2DEdgeImageTo1DArray( unsigned char const * const image,
unsigned int* const list, int* const glob_index ) {
unsigned int const x = blockIdx.x * THREADS_X*PIXELS_PER_THREAD + threadIdx.x;
unsigned int const y = blockIdx.y * THREADS_Y + threadIdx.y;
volatile int qindex = -1;
volatile __shared__ int sh_qindex[THREADS_Y];
volatile __shared__ int sh_qstart[THREADS_Y];
sh_qindex[threadIdx.y] = -1;
// Start by making an array
volatile __shared__ unsigned int sh_queue[MAX_QUEUE_LENGTH];
// Fill the queue
for(int i=0; i<PIXELS_PER_THREAD; i++)
{
int const xx = i*THREADS_X + x;
// Read one image pixel from global memory
unsigned char const pixel = image[y*IMG_WIDTH + xx];
unsigned int const queue_val = (y << 16) + xx;
if(pixel)
{
do {
qindex++;
sh_qindex[threadIdx.y] = qindex;
sh_queue[threadIdx.y*THREADS_X*PIXELS_PER_THREAD + qindex] = queue_val;
} while (sh_queue[threadIdx.y*THREADS_X*PIXELS_PER_THREAD + qindex] != queue_val);
}
// Reload index from smem (last thread to write to smem will have updated it)
qindex = sh_qindex[threadIdx.y];
}
// Let thread 0 reserve the space required in the global list
__syncthreads();
if(threadIdx.x == 0 && threadIdx.y == 0)
{
// Find how many items are stored in each list
int total_index = 0;
#pragma unroll
for(int i=0; i<THREADS_Y; i++)
{
sh_qstart[i] = total_index;
total_index += (sh_qindex[i] + 1u);
}
// Calculate the offset in the global list
unsigned int global_offset = atomicAdd(glob_index, total_index);
#pragma unroll
for(int i=0; i<THREADS_Y; i++)
{
sh_qstart[i] += global_offset;
}
}
__syncthreads();
// Copy local queues to global queue
/*
for(int i=0; i<=qindex; i+=THREADS_X)
{
if(i + threadIdx.x > qindex)
break;
unsigned int qvalue = sh_queue[threadIdx.y*THREADS_X*PIXELS_PER_THREAD + i + threadIdx.x];
list[sh_qstart[threadIdx.y] + i + threadIdx.x] = qvalue;
}
*/
}
void call2DTo1DKernel(unsigned char const * const h_image)
{
// Device side allocation
unsigned char *d_image = NULL;
unsigned int *d_list = NULL;
int h_nlist=0, *d_nlist = NULL;
cudaMalloc((void**)&d_image, sizeof(unsigned char)*IMG_SIZE);
cudaMalloc((void**)&d_list, sizeof(unsigned int)*IMG_SIZE);
cudaMalloc((void**)&d_nlist, sizeof(int));
cudaCheckErrors("cudamalloc fail");
// Time measurement initialization
cudaEvent_t start, stop, startio, stopio;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventCreate(&startio);
cudaEventCreate(&stopio);
float et, etio;
// Start timer w/ io
cudaEventRecord(startio,0);
cudaMemcpy(d_nlist, &h_nlist, sizeof(int), cudaMemcpyHostToDevice);
// Copy image data to device
cudaMemcpy((void*)d_image, (void*)h_image, sizeof(unsigned char)*IMG_SIZE, cudaMemcpyHostToDevice);
cudaCheckErrors("cudamemcpy 1");
// Start timer
cudaEventRecord(start,0);
// Kernel call
// Phase 1 : Convert 2D binary image to 1D pixel array
dim3 dimBlock1(THREADS_X, THREADS_Y);
dim3 dimGrid1(BLOCKS_X, BLOCKS_Y);
convert2DEdgeImageTo1DArray<<<dimGrid1, dimBlock1>>>(d_image, d_list, d_nlist);
cudaDeviceSynchronize();
cudaCheckErrors("kernel fail");
// Stop timer
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
// Stop timer w/ io
cudaEventRecord(stopio,0);
cudaEventSynchronize(stopio);
// Time measurement
cudaEventElapsedTime(&et,start,stop);
cudaEventElapsedTime(&etio,startio,stopio);
// Time measurement deinitialization
cudaEventDestroy(start);
cudaEventDestroy(stop);
cudaEventDestroy(startio);
cudaEventDestroy(stopio);
// Get list size
cudaMemcpy((void*)&h_nlist, (void*)d_nlist, sizeof(int), cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy 2");
// Report on console
printf("%d pixels processed...\n", h_nlist);
// Device side dealloc
cudaFree(d_image);
// cudaFree(d_space);
cudaFree(d_list);
cudaFree(d_nlist);
}
int main(){
unsigned char *image;
image = (unsigned char *)malloc(IMG_SIZE * sizeof(unsigned char));
if (image == 0) {printf("malloc fail\n"); return 0;}
for (int i =0 ; i<IMG_SIZE; i++)
image[i] = rand()%2;
call2DTo1DKernel(image);
call2DTo1DKernel(image);
call2DTo1DKernel(image);
call2DTo1DKernel(image);
call2DTo1DKernel(image);
cudaCheckErrors("some error");
return 0;
}
$ nvcc -arch=sm_20 -O3 -o t216 t216.cu
$ ./t216
32617 pixels processed...
32617 pixels processed...
32617 pixels processed...
32617 pixels processed...
32617 pixels processed...
$ ./t216
32617 pixels processed...
32617 pixels processed...
32617 pixels processed...
32617 pixels processed...
32617 pixels processed...
$
I have problem with passing structure to kernel local memory. Here is the kernel kode:
typedef struct data {
unsigned long wId; // group_id
unsigned long iId[1]; // global_item_id
} DATA;
__kernel void tKernel(__global DATA *x, __local DATA tmp) {
int wd = get_work_dim();
// x dimension
int xGrId = get_group_id(0);
int xLId = get_local_id(0);
int xGlId = get_global_id(0);
x += xGrId;
x->wId = tmp.wId;
x->iId[xLId] = ++tmp.wId;
}
Here is the host code:
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#define GLOBAL_ITEM_SIZE (1)
#define LOCAL_ITEM_SIZE (1)
#define MAX_SOURCE_SIZE (0x100000)
typedef struct data {
unsigned long wId;
unsigned long iId[LOCAL_ITEM_SIZE];
} DATA;
int main() {
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_context context = NULL;
cl_command_queue commandQueue = NULL;
cl_mem cmPinnedBufOut = NULL;
DATA *cDataOut = NULL;
cl_program program = NULL;
cl_kernel kernel = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret;
size_t group_size = GLOBAL_ITEM_SIZE / LOCAL_ITEM_SIZE;
FILE *fp;
const char fileName[] = "./kernel.cl";
size_t source_size;
char *source_str;
/* Load kernel source file */
fp = fopen(fileName, "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(EXIT_FAILURE);
}
source_str = (char *)malloc(MAX_SOURCE_SIZE);
source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
fclose(fp);
/* Create OpenCL Context */
context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
/* Create command queue with measurment of preformance */
commandQueue = clCreateCommandQueue(context, device_id, 0, &ret);
/* Create memory object */
cmPinnedBufOut = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, group_size * sizeof(DATA), NULL, &ret);
cDataOut = (DATA *)malloc(group_size * sizeof(DATA));
/* Create kernel program from source file */
program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);
assert(ret == CL_SUCCESS);
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
if (ret != CL_SUCCESS) {
printf("\nFail to build the program\n");
char buffer[10240];
clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL);
printf("%s\n", buffer);
exit(EXIT_FAILURE);
}
/* Create data parallel OpenCL kernel */
kernel = clCreateKernel(program, "tKernel", &ret);
assert(ret == CL_SUCCESS);
/* Set OpenCL kernel arguments */
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&cmPinnedBufOut);
assert(ret == CL_SUCCESS);
DATA tmp;
tmp.wId = 66;
ret = clSetKernelArg(kernel, 1, sizeof(DATA), &tmp);
assert(ret == CL_SUCCESS);
size_t global_item_size = GLOBAL_ITEM_SIZE;
size_t local_item_size = LOCAL_ITEM_SIZE;
/* Execute OpenCL kernel as data parallel */
ret = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
if (ret == CL_INVALID_WORK_GROUP_SIZE) {
printf("Invalid work group size: error when compute group size: %lu/%lu", global_item_size, local_item_size);
exit(EXIT_FAILURE);
}
assert(ret == CL_SUCCESS);
/* Transfer result to host */
ret = clEnqueueReadBuffer(commandQueue, cmPinnedBufOut, CL_TRUE, 0, group_size * sizeof(DATA), cDataOut, 0, NULL, NULL);
assert(ret == CL_SUCCESS);
/* Display Results */
for (int i = 0; i < group_size; i++) {
printf("%d: -> group_id %lu ~> work_item_ids: ", i, cDataOut[i].wId);
for (int j = 0; j < LOCAL_ITEM_SIZE; j++)
printf("%2lu, ", cDataOut[i].iId[j]);
printf("\n");
}
printf("\n");
/* Finalization */
ret = clFlush(commandQueue);
ret = clFinish(commandQueue); // blockink function, wait until all queue cmd are finished
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseCommandQueue(commandQueue);
ret = clReleaseContext(context);
free(source_str);
return 0;
}
So I expected as result 0: -> group_id 66 ~> work_item_ids: 67,
But I get 0: -> group_id 0 ~> work_item_ids: 1,
From this I conclude that the hh structure with the number 66 was not reading correctly
by the kernel. I try to put this same way on integer number and this works perfectly.
So my question is, am I doing something wrong, or there isn't way to copy data structure from host to device local memory, or is there another way to doing this?
The clSetKernelArg for __local buffers only specifies the size, and the pointer must be 0. See OpenCL spec 5.7.2. There is no way you can initialize local memory from the host.