pass structure to kernel local memory - memory

I have problem with passing structure to kernel local memory. Here is the kernel kode:
typedef struct data {
unsigned long wId; // group_id
unsigned long iId[1]; // global_item_id
} DATA;
__kernel void tKernel(__global DATA *x, __local DATA tmp) {
int wd = get_work_dim();
// x dimension
int xGrId = get_group_id(0);
int xLId = get_local_id(0);
int xGlId = get_global_id(0);
x += xGrId;
x->wId = tmp.wId;
x->iId[xLId] = ++tmp.wId;
}
Here is the host code:
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#define GLOBAL_ITEM_SIZE (1)
#define LOCAL_ITEM_SIZE (1)
#define MAX_SOURCE_SIZE (0x100000)
typedef struct data {
unsigned long wId;
unsigned long iId[LOCAL_ITEM_SIZE];
} DATA;
int main() {
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_context context = NULL;
cl_command_queue commandQueue = NULL;
cl_mem cmPinnedBufOut = NULL;
DATA *cDataOut = NULL;
cl_program program = NULL;
cl_kernel kernel = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret;
size_t group_size = GLOBAL_ITEM_SIZE / LOCAL_ITEM_SIZE;
FILE *fp;
const char fileName[] = "./kernel.cl";
size_t source_size;
char *source_str;
/* Load kernel source file */
fp = fopen(fileName, "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(EXIT_FAILURE);
}
source_str = (char *)malloc(MAX_SOURCE_SIZE);
source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
fclose(fp);
/* Create OpenCL Context */
context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
/* Create command queue with measurment of preformance */
commandQueue = clCreateCommandQueue(context, device_id, 0, &ret);
/* Create memory object */
cmPinnedBufOut = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, group_size * sizeof(DATA), NULL, &ret);
cDataOut = (DATA *)malloc(group_size * sizeof(DATA));
/* Create kernel program from source file */
program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);
assert(ret == CL_SUCCESS);
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
if (ret != CL_SUCCESS) {
printf("\nFail to build the program\n");
char buffer[10240];
clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL);
printf("%s\n", buffer);
exit(EXIT_FAILURE);
}
/* Create data parallel OpenCL kernel */
kernel = clCreateKernel(program, "tKernel", &ret);
assert(ret == CL_SUCCESS);
/* Set OpenCL kernel arguments */
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&cmPinnedBufOut);
assert(ret == CL_SUCCESS);
DATA tmp;
tmp.wId = 66;
ret = clSetKernelArg(kernel, 1, sizeof(DATA), &tmp);
assert(ret == CL_SUCCESS);
size_t global_item_size = GLOBAL_ITEM_SIZE;
size_t local_item_size = LOCAL_ITEM_SIZE;
/* Execute OpenCL kernel as data parallel */
ret = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
if (ret == CL_INVALID_WORK_GROUP_SIZE) {
printf("Invalid work group size: error when compute group size: %lu/%lu", global_item_size, local_item_size);
exit(EXIT_FAILURE);
}
assert(ret == CL_SUCCESS);
/* Transfer result to host */
ret = clEnqueueReadBuffer(commandQueue, cmPinnedBufOut, CL_TRUE, 0, group_size * sizeof(DATA), cDataOut, 0, NULL, NULL);
assert(ret == CL_SUCCESS);
/* Display Results */
for (int i = 0; i < group_size; i++) {
printf("%d: -> group_id %lu ~> work_item_ids: ", i, cDataOut[i].wId);
for (int j = 0; j < LOCAL_ITEM_SIZE; j++)
printf("%2lu, ", cDataOut[i].iId[j]);
printf("\n");
}
printf("\n");
/* Finalization */
ret = clFlush(commandQueue);
ret = clFinish(commandQueue); // blockink function, wait until all queue cmd are finished
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseCommandQueue(commandQueue);
ret = clReleaseContext(context);
free(source_str);
return 0;
}
So I expected as result 0: -> group_id 66 ~> work_item_ids: 67,
But I get 0: -> group_id 0 ~> work_item_ids: 1,
From this I conclude that the hh structure with the number 66 was not reading correctly
by the kernel. I try to put this same way on integer number and this works perfectly.
So my question is, am I doing something wrong, or there isn't way to copy data structure from host to device local memory, or is there another way to doing this?

The clSetKernelArg for __local buffers only specifies the size, and the pointer must be 0. See OpenCL spec 5.7.2. There is no way you can initialize local memory from the host.

Related

How to discover physical address corresponding to PCIe device memory?

I'm trying to access a PCIe device memory from a user space program. I open the file: /sys/bus/pci/devices/0000:3b:00.0/resource0 and then I call mmap that will return a virtual address.
When writing at this virtual address (VA) the MMU will translate it to a physical address (PA), the memory controller will convert the write to the PA into a TLP to request a write to the PCIe device. (AFAIU)
How can I get the physical address that is being used? I had a look to /proc//maps and I see that there is an address that coincides with the PCIe bar0 address (0xa0000000).
But this address seems too low, it overlaps with DDR memory.
I also tried this program to convert VA to PA but it doesn't seem to give sensible results for such mapping:
virt2phys$ cat v2p.c
#define _XOPEN_SOURCE 700
#include <fcntl.h> /* open */
#include <stdint.h> /* uint64_t */
#include <stdio.h> /* printf */
#include <stdlib.h> /* size_t */
#include <unistd.h> /* pread, sysconf */
typedef struct {
uint64_t pfn : 55;
unsigned int soft_dirty : 1;
unsigned int file_page : 1;
unsigned int swapped : 1;
unsigned int present : 1;
} PagemapEntry;
/* Parse the pagemap entry for the given virtual address.
*
* #param[out] entry the parsed entry
* #param[in] pagemap_fd file descriptor to an open /proc/pid/pagemap file
* #param[in] vaddr virtual address to get entry for
* #return 0 for success, 1 for failure
*/
int pagemap_get_entry(PagemapEntry *entry, int pagemap_fd, uintptr_t vaddr)
{
size_t nread;
ssize_t ret;
uint64_t data;
uintptr_t vpn;
vpn = vaddr / sysconf(_SC_PAGE_SIZE);
nread = 0;
while (nread < sizeof(data)) {
ret = pread(pagemap_fd, ((uint8_t*)&data) + nread, sizeof(data) - nread,
vpn * sizeof(data) + nread);
nread += ret;
if (ret <= 0) {
return 1;
}
}
entry->pfn = data & (((uint64_t)1 << 55) - 1);
entry->soft_dirty = (data >> 55) & 1;
entry->file_page = (data >> 61) & 1;
entry->swapped = (data >> 62) & 1;
entry->present = (data >> 63) & 1;
return 0;
}
/* Convert the given virtual address to physical using /proc/PID/pagemap.
*
* #param[out] paddr physical address
* #param[in] pid process to convert for
* #param[in] vaddr virtual address to get entry for
* #return 0 for success, 1 for failure
*/
int virt_to_phys_user(uintptr_t *paddr, pid_t pid, uintptr_t vaddr)
{
char pagemap_file[BUFSIZ];
int pagemap_fd;
snprintf(pagemap_file, sizeof(pagemap_file), "/proc/%ju/pagemap", (uintmax_t)pid);
pagemap_fd = open(pagemap_file, O_RDONLY);
if (pagemap_fd < 0) {
return 1;
}
PagemapEntry entry;
if (pagemap_get_entry(&entry, pagemap_fd, vaddr)) {
return 1;
}
close(pagemap_fd);
*paddr = (entry.pfn * sysconf(_SC_PAGE_SIZE)) + (vaddr % sysconf(_SC_PAGE_SIZE));
return 0;
}
int main(int argc, char **argv)
{
pid_t pid;
uintptr_t vaddr, paddr = 0;
if (argc < 3) {
printf("Usage: %s pid vaddr(in hex)\n", argv[0]);
return EXIT_FAILURE;
}
pid = strtoull(argv[1], NULL, 0);
vaddr = strtoull(argv[2], NULL, 16);
if (virt_to_phys_user(&paddr, pid, vaddr)) {
fprintf(stderr, "error: virt_to_phys_user\n");
return EXIT_FAILURE;
};
printf("0x%jx\n", (uintmax_t)paddr);
return EXIT_SUCCESS;
}

Checking code integrity in iOS

How could I guarantee the integrity of the code of an iOS app? I've been taking a look to Apple's Security Overview document, would code signing be enough? Is there any other recommended mechanism to guarantee the code integrity?
Thanks in advance
I had a same problem. This is easy on OS X but somewhat difficult in iOS because iOS doesn't have API like SecStaticCodeCheckValidity.
There are two sections in mach-o binary that you can use to ensure integrity of the app.
LC_ENCRYPTION_INFO
LC_CODE_SIGNATURE
1. LC_ENCRYPTION_INFO
First, LC_ENCRYPTION_INFO stores informations about 'app store encryption'. Once an app is uploaded to app store, app is encrypted before it is released to users.
binary before uploading to appstore or decrypted
otool -l [binary] | grep LC_ENCRYPTION_INFO -A5
cmd LC_ENCRYPTION_INFO
cmdsize 20
cryptoff 16384
cryptsize 5783552
cryptid 0
--
cmd LC_ENCRYPTION_INFO_64
cmdsize 24
cryptoff 16384
cryptsize 6635520
cryptid 0
pad 0
binary after uploading to appstore (encrypted)
otool -l [binary] | grep LC_ENCRYPTION_INFO -A5
cmd LC_ENCRYPTION_INFO
cmdsize 20
cryptoff 16384
cryptsize 5783552
cryptid 1
--
cmd LC_ENCRYPTION_INFO_64
cmdsize 24
cryptoff 16384
cryptsize 6635520
cryptid 1
pad 0
As you can see, 'cryptid' is set to 1 when app is uploaded. So checking 'cryptid' bit will tell us if the binary is encrypted or not.
You may think that this can be bypassed easily by just setting the bit to 1, but then OS will try to decrypt the binary which will make the codes to unrecognizable bytes.
bool isBinaryEncrypted()
{
// checking current binary's LC_ENCRYPTION_INFO
const void *binaryBase;
struct load_command *machoCmd;
const struct mach_header *machoHeader;
NSString *path = [[NSBundle mainBundle] executablePath];
NSData *filedata = [NSData dataWithContentsOfFile:path];
binaryBase = (char *)[filedata bytes];
machoHeader = (const struct mach_header *) binaryBase;
if(machoHeader->magic == FAT_CIGAM)
{
unsigned int offset = 0;
struct fat_arch *fatArch = (struct fat_arch *)((struct fat_header *)machoHeader + 1);
struct fat_header *fatHeader = (struct fat_header *)machoHeader;
for(uint32_t i = 0; i < ntohl(fatHeader->nfat_arch); i++)
{
if(sizeof(int *) == 4 && !(ntohl(fatArch->cputype) & CPU_ARCH_ABI64)) // check 32bit section for 32bit architecture
{
offset = ntohl(fatArch->offset);
break;
}
else if(sizeof(int *) == 8 && (ntohl(fatArch->cputype) & CPU_ARCH_ABI64)) // and 64bit section for 64bit architecture
{
offset = ntohl(fatArch->offset);
break;
}
fatArch = (struct fat_arch *)((uint8_t *)fatArch + sizeof(struct fat_arch));
}
machoHeader = (const struct mach_header *)((uint8_t *)machoHeader + offset);
}
if(machoHeader->magic == MH_MAGIC) // 32bit
{
machoCmd = (struct load_command *)((struct mach_header *)machoHeader + 1);
}
else if(machoHeader->magic == MH_MAGIC_64) // 64bit
{
machoCmd = (struct load_command *)((struct mach_header_64 *)machoHeader + 1);
}
for(uint32_t i=0; i < machoHeader->ncmds && machoCmd != NULL; i++){
if(machoCmd->cmd == LC_ENCRYPTION_INFO)
{
struct encryption_info_command *cryptCmd = (struct encryption_info_command *) machoCmd;
return cryptCmd->cryptid;
}
if(machoCmd->cmd == LC_ENCRYPTION_INFO_64)
{
struct encryption_info_command_64 *cryptCmd = (struct encryption_info_command_64 *) machoCmd;
return cryptCmd->cryptid;
}
machoCmd = (struct load_command *)((uint8_t *)machoCmd + machoCmd->cmdsize);
}
return FALSE; // couldn't find cryptcmd
}
2. LC_CODE_SIGNATURE
LC_CODE_SIGNATURE is the section that /usr/bin/codesign actually refers when checking validity of the binary. But parsing the section is a little bit more difficult than parsing LC_ENCRYPTION_INFO, because it's undocumented and there are no types like signature_info_command.
LC_CODE_SIGNATURE contains hashes of all of the binary except the section itself, and hashes are adjusted whenever it's re-signed.
I ported the codes of /usr/bin/codesign to parse this section. check here and SecStaticCode::validateExecutable defined in here
CodeSigning.h
#ifndef CodeSigning_h
#define CodeSigning_h
#include <stdio.h>
// codes from https://opensource.apple.com/source/Security/Security-55179.1/libsecurity_codesigning/lib/cscdefs.h
enum {
CSMAGIC_REQUIREMENT = 0xfade0c00, /* single Requirement blob */
CSMAGIC_REQUIREMENTS = 0xfade0c01, /* Requirements vector (internal requirements) */
CSMAGIC_CODEDIRECTORY = 0xfade0c02, /* CodeDirectory blob */
CSMAGIC_EMBEDDED_SIGNATURE = 0xfade0cc0, /* embedded form of signature data */
CSMAGIC_DETACHED_SIGNATURE = 0xfade0cc1, /* multi-arch collection of embedded signatures */
CSSLOT_CODEDIRECTORY = 0, /* slot index for CodeDirectory */
};
/*
* Structure of an embedded-signature SuperBlob
*/
typedef struct __BlobIndex {
uint32_t type; /* type of entry */
uint32_t offset; /* offset of entry */
} CS_BlobIndex;
typedef struct __SuperBlob {
uint32_t magic; /* magic number */
uint32_t length; /* total length of SuperBlob */
uint32_t count; /* number of index entries following */
CS_BlobIndex index[]; /* (count) entries */
/* followed by Blobs in no particular order as indicated by offsets in index */
} CS_SuperBlob;
/*
* C form of a CodeDirectory.
*/
typedef struct __CodeDirectory {
uint32_t magic; /* magic number (CSMAGIC_CODEDIRECTORY) */
uint32_t length; /* total length of CodeDirectory blob */
uint32_t version; /* compatibility version */
uint32_t flags; /* setup and mode flags */
uint32_t hashOffset; /* offset of hash slot element at index zero */
uint32_t identOffset; /* offset of identifier string */
uint32_t nSpecialSlots; /* number of special hash slots */
uint32_t nCodeSlots; /* number of ordinary (code) hash slots */
uint32_t codeLimit; /* limit to main image signature range */
uint8_t hashSize; /* size of each hash in bytes */
uint8_t hashType; /* type of hash (cdHashType* constants) */
uint8_t spare1; /* unused (must be zero) */
uint8_t pageSize; /* log2(page size in bytes); 0 => infinite */
uint32_t spare2; /* unused (must be zero) */
/* followed by dynamic content as located by offset fields above */
} CS_CodeDirectory;
static inline const CS_CodeDirectory *findCodeDirectory(const CS_SuperBlob *embedded)
{
if (embedded && ntohl(embedded->magic) == CSMAGIC_EMBEDDED_SIGNATURE) {
const CS_BlobIndex *limit = &embedded->index[ntohl(embedded->count)];
const CS_BlobIndex *p;
for (p = embedded->index; p < limit; ++p)
if (ntohl(p->type) == CSSLOT_CODEDIRECTORY) {
const unsigned char *base = (const unsigned char *)embedded;
const CS_CodeDirectory *cd = (const CS_CodeDirectory *)(base + ntohl(p->offset));
if (ntohl(cd->magic) == CSMAGIC_CODEDIRECTORY){
return cd;
}
else{
break;
}
}
}
// not found
return NULL;
}
//
unsigned char validateSlot(const void *data, size_t length, size_t slot, const CS_CodeDirectory *codeDirectory);
#endif /* CodeSigning_h */
CodeSigning.c
#include "CodeSigning.h"
#include <stdio.h>
#include <string.h>
#import <CommonCrypto/CommonDigest.h>
unsigned char validateSlot(const void *data, size_t length, size_t slot, const CS_CodeDirectory *codeDirectory)
{
uint8_t digest[CC_SHA1_DIGEST_LENGTH + 1] = {0, };
CC_SHA1(data, (CC_LONG)length, digest);
return (memcmp(digest, (void *)((char *)codeDirectory + ntohl(codeDirectory->hashOffset) + 20*slot), 20) == 0);
}
parsing the section
void checkCodeSignature(void *binaryContent){
struct load_command *machoCmd;
const struct mach_header *machoHeader;
machoHeader = (const struct mach_header *) binaryContent;
if(machoHeader->magic == FAT_CIGAM){
unsigned int offset = 0;
struct fat_arch *fatArch = (struct fat_arch *)((struct fat_header *)machoHeader + 1);
struct fat_header *fatHeader = (struct fat_header *)machoHeader;
for(uint32_t i = 0; i < ntohl(fatHeader->nfat_arch); i++)
{
if(sizeof(int *) == 4 && !(ntohl(fatArch->cputype) & CPU_ARCH_ABI64)) // check 32bit section for 32bit architecture
{
offset = ntohl(fatArch->offset);
break;
}
else if(sizeof(int *) == 8 && (ntohl(fatArch->cputype) & CPU_ARCH_ABI64)) // and 64bit section for 64bit architecture
{
offset = ntohl(fatArch->offset);
break;
}
fatArch = (struct fat_arch *)((uint8_t *)fatArch + sizeof(struct fat_arch));
}
machoHeader = (const struct mach_header *)((uint8_t *)machoHeader + offset);
}
if(machoHeader->magic == MH_MAGIC) // 32bit
{
machoCmd = (struct load_command *)((struct mach_header *)machoHeader + 1);
}
else if(machoHeader->magic == MH_MAGIC_64) // 64bit
{
machoCmd = (struct load_command *)((struct mach_header_64 *)machoHeader + 1);
}
for(uint32_t i=0; i < machoHeader->ncmds && machoCmd != NULL; i++){
if(machoCmd->cmd == LC_CODE_SIGNATURE)
{
struct linkedit_data_command *codeSigCmd = (struct linkedit_data_command *) machoCmd;
const CS_SuperBlob *codeEmbedded = (const CS_SuperBlob *)&((char *)machoHeader)[codeSigCmd->dataoff];
void *binaryBase = (void *)machoHeader;
const CS_BlobIndex curIndex = codeEmbedded->index[0];
const CS_CodeDirectory *codeDirectory = (const CS_CodeDirectory *)((char *)codeEmbedded + ntohl(curIndex.offset));
size_t pageSize = codeDirectory->pageSize ? (1 << codeDirectory->pageSize) : 0;
size_t remaining = ntohl(codeDirectory->codeLimit);
size_t processed = 0;
for(size_t slot = 0; slot < ntohl(codeDirectory->nCodeSlots); ++slot){
size_t size = MIN(remaining, pageSize);
if(!validateSlot(binaryBase+processed, size, slot, codeDirectory)){
return;
}
processed += size;
remaining -= size;
}
printf("[*] Code is valid!");
}
}
machoCmd = (struct load_command *)((uint8_t *)machoCmd + machoCmd->cmdsize);
}

OpenCL matrix multiplication

I'm a beginner in OpenCL. And I've been trying to write a matrix multiplication code.
It works fine only it gives garbage value as the output for C array. I'm unable to fix the error.
Any help will be much appreciated.
Here's is the host and kernel code.
#include <CL/cl.h>
#include <iostream>
#include <cstdio>
#include <fstream>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
using namespace std;
#define SUCCESS 0
#define FAILURE 1
// Function to convert file name into a string
int convertToString(const char *filename, std::string &s)
{
size_t size;
char *str;
std::fstream f(filename, (std::fstream::in | std::fstream::binary));
if (f.is_open())
{
size_t fileSize;
f.seekg(0, std::fstream::end);
size = fileSize = (size_t)f.tellg();
f.seekg(0, std::fstream::beg);
str = new char[size + 1];
if (!str)
{
f.close();
return 0;
}
f.read(str, fileSize);
f.close();
str[size] = '\0';
s = str;
delete[] str;
return 0;
}
cout << "Error: failed to open file\n:" << filename << endl;
return FAILURE;
}
int main()
{
cl_uint status;
cl_int *error;
int A[9] = {1, 1, 1, 1, 1, 1, 1, 1, 1};
int B[9] = {2, 2, 2, 2, 2, 2, 2, 2, 2};
int C[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
// Setting up platforms
cl_platform_id platform = NULL;
cl_uint numPlatforms = 0;
// Getting no of platforms
status = clGetPlatformIDs(0, NULL, &numPlatforms);
if (status != CL_SUCCESS)
{
cout << "\nUnable to query platforms";
return 0;
}
// Get the platform
if (numPlatforms > 0)
{
cl_platform_id*platforms=
cl_platform_id*)malloc(numPlatforms*sizeof(cl_platform_id));
status = clGetPlatformIDs(numPlatforms, platforms, NULL);
platform = platforms[0];
free(platforms);
}
cl_uint numDevices = 0;
cl_device_id *devices = NULL;
status =
clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, devices, &numDevices);
if (numDevices == 0)
{
cout << "No GPU device available! Choosing CPU.\n";
status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 0, devices,
&numDevices);
devices = (cl_device_id *)malloc(numDevices * sizeof(cl_device_id));
status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, numDevices,
devices, NULL);
}
else
{
devices = (cl_device_id *)malloc(numDevices * sizeof(cl_device_id));
status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices,
devices, NULL);
if (status == 0)
{
cout << "Device error!";
return 0;
}
}
// Creating contexts
cl_context context =
clCreateContext(NULL, 1, devices, NULL, NULL, (cl_int *)status);
if (status != CL_SUCCESS)
{
cout << status;
}
// Creating command queues
cl_command_queue command =
clCreateCommandQueue(context, devices[0], 0, NULL);
// if(error!=CL_SUCCESS)
//{
// cout<<error;
//}
// Creating buffers
cl_mem bufferA = clCreateBuffer(context, CL_MEM_READ_ONLY,
3 * 3 * sizeof(int), NULL, NULL);
cl_mem bufferB = clCreateBuffer(context, CL_MEM_READ_ONLY,
3 * 3 * sizeof(int), NULL, NULL);
cl_mem bufferC = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
3 * 3 * sizeof(int), NULL, NULL);
status = clEnqueueWriteBuffer(command, bufferA, CL_TRUE, 0, 9 * sizeof(int),
(void *)A, 0, NULL, NULL);
status = clEnqueueWriteBuffer(command, bufferB, CL_TRUE, 0, 9 * sizeof(int),
(void *)B, 0, NULL, NULL);
// status=clEnqueueReadBuffer(command,bufferA,CL_TRUE,0,9*sizeof(int),(void*)C,0,NULL,NULL);
const char *filename = "kernel.cl";
string sourceStr;
status = convertToString(filename, sourceStr);
const char *source = sourceStr.c_str();
size_t sourceSize[] = {strlen(source)};
cl_program program =
clCreateProgramWithSource(context, 1, &source, sourceSize, NULL);
status = clBuildProgram(program, numDevices, 0, NULL, NULL, NULL);
cl_kernel myKernel = clCreateKernel(program, "multiply", NULL);
// Setting kernel arguments
clSetKernelArg(myKernel, 0, sizeof(cl_mem), &bufferC);
clSetKernelArg(myKernel, 1, sizeof(cl_mem), &bufferA);
clSetKernelArg(myKernel, 2, sizeof(cl_mem), &bufferB);
size_t localws[2] = {9, 9};
size_t globalws[2] = {3, 3};
status = clEnqueueNDRangeKernel(command, myKernel, 2, NULL, globalws,
localws, 0, NULL, NULL);
status = clEnqueueReadBuffer(command, bufferC, CL_TRUE, 0, 9 * sizeof(int),
(void *)C, 0, NULL, NULL);
for (int i = 0; i < 9; i++) cout << C[i] << " ";
status = clReleaseKernel(myKernel); // Release kernel.
status = clReleaseProgram(program); // Release program object.
status = clReleaseMemObject(bufferA); // Release mem object.
status = clReleaseMemObject(bufferB);
status = clReleaseMemObject(bufferC);
status = clReleaseCommandQueue(command); // Release Command queue.
status = clReleaseContext(context); // Release context.
}
Kernel code:
__kernel void multiply(_global int outputC, _global int inputA,
_global int inputB)
{
int row = get_global_id(0);
int col = get_global_id(1);
int sum = 0;
for (int i = 0; i < 3; i++)
sum += inputA[row * 3 + 1] * inputB[i * 3 + col];
outputC[row + 3 + col] = sum;
}
As already pointed out by #Marco13 the kernel suffers from quite a few issues.
When running this kernel through a tool like clcc you can see that there are a number of compilation errors to begin with:
> clcc matmul.cl
"/tmp/OCLu7FyFF.cl", line 1: error: identifier "_global" is undefined
__kernel void multiply(_global int outputC, _global int inputA,
^
"/tmp/OCLu7FyFF.cl", line 1: error: invalid combination of type specifiers
__kernel void multiply(_global int outputC, _global int inputA,
^
"/tmp/OCLu7FyFF.cl", line 1: error: identifier "_global" is undefined
__kernel void multiply(_global int outputC, _global int inputA,
^
"/tmp/OCLu7FyFF.cl", line 1: error: invalid combination of type specifiers
__kernel void multiply(_global int outputC, _global int inputA,
^
"/tmp/OCLu7FyFF.cl", line 2: error: identifier "_global" is undefined
_global int inputB)
^
"/tmp/OCLu7FyFF.cl", line 2: error: invalid combination of type specifiers
_global int inputB)
^
6 errors detected in the compilation of "/tmp/OCLu7FyFF.cl".
A tool like clcc is very useful for catching errors early on. Most vendors also have their own version of a standalone kernel compiler/checker: e.g. Intel has its Kernel Builder, AMD's CodeXL contains a static kernel analyzer. Another option is to retrieve kernel compilation errors right from your host code, by calling clGetProgramBuildInfo to retrieve the compiler output, after clBuildProgram returned CL_BUILD_PROGRAM_FAILURE.
Once these compilation errors are fixed, it looks like your kernel is still not doing what you expect: as noted, the inputs and outputs should be pointers, as you will be passing buffers to the kernel. Also, the indexing of your input and output arrays is incorrect: In the for-loop inputA[row * 3 + 1] should be inputA[row * 3 + i] (i instead of 1). When saving the result to outputC, I would expect outputC[row * 3 + col] (row * 3) instead of row + 3).
I haven't looked in detail at the host code, but I would at least make sure, especially when just starting out with OpenCL, to always check every return code and error. This will save you a lot of time and frustration.
Finally, if you want a quick jump-start to learning OpenCL with a hands-on approach, I would strongly recommend going through the open source Hands-on OpenCL training by Simon McIntosh-Smith and Tom Deakin. It doesn't take very long, is quite pragmatic and provides lots of useful insights. Optimizing matrix multiplication is one of the use cases that is shown step-by-step.

SCTP Multistreaming: infinite loop

I have a simple client-server application on SCTP! The client connects to server opening 3 streams and the server sends a file per stream. The problem is that I don't know how to control the 3 streams, to know when sctp_rcvmsg() from a stream i 0 it would mean that the file transmission is ended for that stream...but it seems that sctp_recvmsg() never stops. here's my code.
CLIENT
#include <sys/types.h>
#include <sys/socket.h>
#include <signal.h>
#include <netinet/in.h>
#include <netinet/sctp.h>
#include <arpa/inet.h>
#include <string.h>
#include <unistd.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <net/if.h>
#include <stdlib.h>
#include <time.h>
#define BUFFERSIZE 1024
int main(int argc, char** argv) {
int i, sockCliSCTP, flags, res;
/* Server netwrok informations */
struct sockaddr_in servAddr;
/* To get which stream it has received data from */
struct sctp_sndrcvinfo sndrcvinfo;
/* Init message to setup number of streams */
struct sctp_initmsg initmsg;
/* Catching events */
struct sctp_event_subscribe events;
/* Buffer to receive files */
char buffer[BUFFERSIZE];
/* Remove previous recently used files */
remove("first.txt");
remove("second.txt");
remove("third.txt");
char ipServ[32] = "127.0.0.1";
short int servPort = 29008;
/* BEGIN SCTP PART */
/* Creating client socket for SCTP protocol */
sockCliSCTP = socket( AF_INET, SOCK_STREAM, IPPROTO_SCTP );
/* Specify that a maximum of 3 streams will be available per socket */
memset( &initmsg, 0, sizeof(initmsg) );
initmsg.sinit_num_ostreams = 3; /* output streams */
initmsg.sinit_max_instreams = 3; /* input streams */
initmsg.sinit_max_attempts = 2;
setsockopt(sockCliSCTP, IPPROTO_SCTP, SCTP_INITMSG, &initmsg, sizeof(initmsg) );
/* Initializing server network data structs */
bzero( (void *)&servAddr, sizeof(servAddr) );
servAddr.sin_family = AF_INET;
inet_pton(AF_INET, ipServ, &servAddr.sin_addr);
servAddr.sin_port = htons(29008);
int sizeServ = sizeof(servAddr);
/* Connect to server */
res = connect(sockCliSCTP, (struct sockaddr *)&servAddr, sizeof(servAddr));
if (res < 0) {
printf("Connection to server refused!\n");
exit(1);
}
memset( (void *)&events, 0, sizeof(events) );
events.sctp_data_io_event = 1;
res = setsockopt(sockCliSCTP, SOL_SCTP, SCTP_EVENTS, (const void *)&events, sizeof(events));
if (res < 0) {
printf("setsockopt failed!\n");
exit(1);
}
/* The clients simply waits and receives for three files from the server.
* The size of the files is increased each time this client is launched. */
FILE *oneF, *twoF, *threeF;
oneF = fopen("first.txt", "a"); /* Stream 0 */
twoF = fopen("second.txt", "a"); /* Stream 1 */
threeF = fopen("third.txt", "a"); /* Stream 2 */
/* To measure time */
time_t timeStart;
time_t timeEnd;
time_t timeRes = 0;
time(&timeStart);
int count0 = 0, count1 = 0, count2 = 0;
int checkRead[3];
for(i = 0; i<3; i++) {
checkRead[i] = 1;
}
/* Receiving in parallel the files from 3 streams */
while(checkRead[0] || checkRead[1] || checkRead[2]) {
printf("%d %d %d\n", checkRead[0], checkRead[1], checkRead[2]);
res = sctp_recvmsg(sockCliSCTP, (void*)buffer, sizeof(buffer), (struct sockaddr*)&servAddr, (socklen_t *)&sizeServ, &sndrcvinfo, &flags);
if (res == 0) {
printf("%d stream is zero\n", sndrcvinfo.sinfo_stream);
checkRead[sndrcvinfo.sinfo_stream] = 0;
continue;
}
/* Check from which stream the data came in */
switch(sndrcvinfo.sinfo_stream) {
/* Write on file oneF --> first.txt */
case 0:
count0++;
printf("Message received from stream 0\n");
//printf("%s\n\n", buffer);
fprintf(oneF, "%s", buffer);
break;
/* Write on file twoF --> second.txt */
case 1:
count1++;
printf("Message received from stream 1\n");
//printf("%s\n\n", buffer);
fprintf(twoF, "%s", buffer);
break;
/* Write on file threeF --> third.txt */
case 2:
count2++;
printf("Message received from stream 2\n");
//printf("%s\n\n", buffer);
fprintf(threeF, "%s", buffer);
break;
}
memset(buffer, 0, sizeof(buffer));
sleep(1);
}
close(sockCliSCTP);
time(&timeEnd);
timeRes = timeEnd - timeStart;
printf("Time elapsed is: %d seconds\n", (int)timeRes);
printf("%d messages on stream 0,\n %d messages on stream 1,\n %d messages on stream 2\n", count0, count1, count2);
}
AND THE SERVER:
#include <sys/types.h>
#include <sys/socket.h>
#include <signal.h>
#include <netinet/in.h>
#include <netinet/sctp.h>
#include <arpa/inet.h>
#include <string.h>
#include <unistd.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <net/if.h>
#include <stdlib.h>
#include <time.h>
#include <sys/stat.h>
#include <fcntl.h>
#define BUFFERSIZE 1024
int main(int argc, char** argv) {
int sockCli, sockServ, one, two, three, i, res;
struct sockaddr_in client, server;
/* data struct to declarate streams */
struct sctp_initmsg initmsg;
/* buffer to read from file */
char buffer[BUFFERSIZE];
/* socket server listening */
sockServ = socket(AF_INET, SOCK_STREAM, IPPROTO_SCTP);
bzero( (void *)&client, sizeof(client));
bzero( (void *)&server, sizeof(server));
/* Preparing sever data struct and bind() */
bzero( (void *)&server, sizeof(server) );
server.sin_family = AF_INET;
server.sin_addr.s_addr = htonl( INADDR_ANY );
server.sin_port = htons(29008);
bind(sockServ, (struct sockaddr *)&server, sizeof(server));
/* Maximum of 3 streams will be available per socket */
memset( &initmsg, 0, sizeof(initmsg) );
initmsg.sinit_num_ostreams = 3;
initmsg.sinit_max_instreams = 3;
initmsg.sinit_max_attempts = 2;
res = setsockopt(sockServ, IPPROTO_SCTP, SCTP_INITMSG, &initmsg, sizeof(initmsg));
if (res < 0) {
printf("setsockopt() failed!\n");
exit(1);
}
/* Preparing the three files to be sent */
one = open("files/first.txt", O_RDONLY);
if (one < 0) {
printf("Error on opening first file!\n");
exit(1);
}
two = open("files/second.txt", O_RDONLY);
if (two < 0) {
printf("Error on opening second file!\n");
exit(1);
}
three = open("files/third.txt", O_RDONLY);
if (three < 0) {
printf("Error on opening third files!\n");
exit(1);
}
int checkFiles[3];
for(i=0; i<3; i++) {
checkFiles[i] = 1;
}
res = listen(sockServ, 5);
if (res < 0) {
printf("listen() failed!\n");
exit(1);
}
while(1) {
ssize_t readRes;
int len = sizeof(client);
sockCli = accept(sockServ, (struct sockaddr*)&client, &len);
if (sockCli < 0) {
printf("Error on accept()!\n");
exit(1);
}
printf("Associated to client!\n");
while(1) {
memset(buffer, 0, sizeof(buffer));
if ((readRes = read(one, (void*)buffer, sizeof(buffer))) > 0) {
sctp_sendmsg(sockCli, (void*)buffer, (size_t)strlen(buffer), NULL, 0, 0, 0, 0 /* stream number */, 0, 0);
}
memset(buffer, 0, sizeof(buffer));
if ((readRes = read(two, (void*)buffer, sizeof(buffer))) > 0) {
sctp_sendmsg(sockCli, (void*)buffer, (size_t)strlen(buffer), NULL, 0, 0, 0, 1 /* stream number */, 0, 0);
}
memset(buffer, 0, sizeof(buffer));
if ((readRes = read(three, (void*)buffer, sizeof(buffer))) > 0) {
sctp_sendmsg(sockCli, (void*)buffer, (size_t)strlen(buffer), NULL, 0, 0, 0, 2 /* stream number */, 0, 0);
}
else {break;}
}
close(sockCli);
close(one);
close(two);
close(three);
}
}
Where am I making a mistake? :(
sctp_recvmsg doesn't receive a message from a single stream. It simply returns ANY message received and then the application can figure out which stream the message came from.
After all the data has been received by your client when this code executes:
res = sctp_recvmsg(sockCliSCTP, (void*)buffer, sizeof(buffer), (struct sockaddr*)&servAddr, (socklen_t *)&sizeServ, &sndrcvinfo, &flags);
if (res == 0) {
printf("%d stream is zero\n", sndrcvinfo.sinfo_stream);
checkRead[sndrcvinfo.sinfo_stream] = 0;
continue;
}
res becomes 0 since no message is received and the sndrcvinfo structure does not get changed. So sndrcvinfo.sinfo_stream will remain equal to whatever stream the last message came from and you will get stuck in a loop since you won't change the checkRead[] values.
There's some other errors that will make the server/client behave strangely.
For example you can't run the client twice in a row without a a segmentation fault since the server closes the file descriptors and won't send any data the second time. Because of this you will segfault when you do:
checkRead[sndrcvinfo.sinfo_stream] = 0;
since sndrcvinfo will be a null pointer.

HIDAPI in two threads

According to https://github.com/signal11/hidapi/issues/72 HIDAPI ought to be thread safe on Linux machines. However, I can't get it working at all. This is what I do:
#ifdef WIN32
#include <windows.h>
#endif
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <stdlib.h>
#include <assert.h>
#include "hidapi.h"
hid_device *handle;
static void *TaskCode(void *argument)
{
int res;
//hid_device *handle;
unsigned char buf[64];
// res = hid_init();
// if( res == -1 )
// {
// return (void*)1;
// }
//
// handle = hid_open(0x0911, 0x251c, NULL);
// if( handle == NULL )
// {
// return (void*)2;
// }
printf( "while 2\n");
while( 1 )
{
memset( buf, 64, 0 );
res = hid_read(handle, buf, 0);
if( res == -1 )
{
return (void*)3;
}
printf( "received %d bytes\n", res);
for (int i = 0; i < res; i++)
printf("Byte %d: %02x ", i+1, buf[i]);
//printf( "%02x ", buf[0]);
fflush(stdout);
}
return (void*)0;
}
int main(int argc, char* argv[])
{
int res;
//hid_device *handle;
unsigned char buf[65];
res = hid_init();
if( res == -1 )
{
return 1;
}
handle = hid_open(0x0911, 0x251c, NULL);
if( handle == NULL )
{
return 2;
}
hid_set_nonblocking( handle, 0 );
pthread_t thread;
int rc = pthread_create(&thread, NULL, TaskCode, NULL);
printf( "while 1\n");
while(1)
{
int a = getchar();
if( a == 'a')
{
// Get Device Type (cmd 0x82). The first byte is the report number (0x0).
buf[0] = 0x0;
buf[1] = 0x82;
res = hid_write(handle, buf, 65);
if( res != -1 )
printf( "write ok, transferred %d bytes\n", res );
else
{
printf( "write error\n" );
char* str = hid_error(handle);
printf( "error: %s\n", str );
return 1;
}
}
else if( a== 'b')
break;
}
void* trc;
rc = pthread_join(thread, &trc);
printf( "rc code: %d\n", (int)trc );
// Finalize the hidapi library
res = hid_exit();
return 0;
}
If I don't use the global handle, I get 'write error' every time. If I do, as in the example, formally everything works but hid_read always returns 0 bytes... Of course, if I do simple hid_write() followed by hid_read(), I'll get the correct reply to the command 0x82 as intended. I'm really lost here, am I overlooking something?
EDIT: to clarify, zero bytes return also for everything, incl. buttons on mouse etc. So it seems to work but the data buffer is always zero bytes.
Shame on me, a dumb mistake. The code should be:
memset( buf, 0, 64 );
res = hid_read(handle, buf, 64);
and then it works. Should sleep more and write less!

Resources