How I can apply static memory allocation rather than dynamic allocation? - memory

How I can change the function from dynamic allocation to static allocation for the tokinize function which is use the same logic of strtok. It is working fine using the dynamic allocation but could not figure out how i can modify it to work with static allocation
char **tokenize ( char *string, char delimiter) {
char **lines = NULL;
char **temp = NULL;
char limit[2] = "";
size_t skip = 0;
size_t span = 0;
size_t extent = 0;
size_t line = 0;
size_t len = strlen ( string);
limit[0] = delimiter;
if ( NULL == ( lines = malloc ( sizeof ( *lines) * 2))) {//allocate two pointers
fprintf ( stderr, "malloc problem\n");
return NULL;
}
lines[line + 1] = NULL;//sentinel
while ( extent < len) {
skip = strspn ( string + extent, limit);//get number of delimiters
extent += skip;//advance past spaces
if ( ( span = strcspn ( string + extent, limit))) {//find next delimiter or '\0'
if ( NULL == ( lines[line] = malloc ( span + 1))) {
fprintf ( stderr, "malloc problem\n");
return lines;
}
strncpy ( lines[line], string + extent, span);
lines[line][span] = '\0';
if ( NULL == ( temp = realloc ( lines, sizeof ( *lines) * ( line + 3)))) {
fprintf ( stderr, "realloc problem\n");
return lines;
}
lines = temp;
line++;
lines[line] = NULL;
lines[line + 1] = NULL;//sentinel
extent += span;//advance past the token
}
}//loop to end of string
return lines;
}

Related

What are my options to convert OpenCV reduce loop to a native iOS code. SIMD anyone?

Which native iOS framework is best used to eradicate this cpu hog written in OpenCV?
/// Reduce the channel elements of given Mat to a single channel
static func reduce(input: Mat) throws -> Mat {
let output = Mat(rows: input.rows(), cols: input.cols(), type: CvType.CV_8UC1)
for x in 0 ..< input.rows() {
for y in 0 ..< input.cols() {
let value = input.get(row: x, col: y)
let dataValue = value.reduce(0, +)
try output.put(row: x, col: y, data: [dataValue])
}
}
return output
}
takes about 20+ seconds to do those gets and puts on real world data I put this code through.
Assuming your input matrix is CV_64FC2, call computeSumX2 C function for each row.
Untested.
#include <arm_neon.h>
#include <stdint.h>
#include <stddef.h>
// Load 8 FP64 values, add pairwise, narrow uint64 to uint32, combine into a single vector
inline uint32x4_t reduce4( const double* rsi )
{
// Load 8 values
float64x2x4_t f64 = vld1q_f64_x4( rsi );
// Add them pairwise
float64x2_t f64_1 = vpaddq_f64( f64.val[ 0 ], f64.val[ 1 ] );
float64x2_t f64_2 = vpaddq_f64( f64.val[ 2 ], f64.val[ 3 ] );
// Convert FP64 to uint64
uint64x2_t i64_1 = vcvtq_u64_f64( f64_1 );
uint64x2_t i64_2 = vcvtq_u64_f64( f64_2 );
// Convert int64 to int32 in a single vector, using saturation
uint32x2_t low = vqmovn_u64( i64_1 );
return vqmovn_high_u64( low, i64_2 );
}
// Compute pairwise sum of FP64 values, cast to bytes
void computeSumX2( uint8_t* rdi, size_t length, const double* rsi )
{
const double* const rsiEnd = rsi + length * 2;
size_t lengthAligned = ( length / 16 ) * 16;
const double* const rsiEndAligned = rsi + lengthAligned * 2;
for( ; rsi < rsiEndAligned; rsi += 16 * 2, rdi += 16 )
{
// Each iteration of the loop loads 32 source values, stores 16 bytes
uint16x4_t low16 = vqmovn_u32( reduce4( rsi ) );
uint16x8_t u16 = vqmovn_high_u32( low16, reduce4( rsi + 8 ) );
uint8x8_t low8 = vqmovn_u16( u16 );
low16 = vqmovn_u32( reduce4( rsi + 8 * 2 ) );
u16 = vqmovn_high_u32( low16, reduce4( rsi + 8 * 3 ) );
uint8x16_t res = vqmovn_high_u16( low8, u16 );
vst1q_u8( rdi, res );
}
for( ; rsi < rsiEnd; rsi += 2, rdi++ )
{
// Each iteration of the loop loads 2 source values, stores a single byte
float64x2_t f64 = vld1q_f64( rsi );
double sum = vaddvq_f64( f64 );
*rdi = (uint8_t)sum;
}
}
For folks such as myself who have a poor comprehension of ARM Intrinsics
a simpler solution is to bridge into Objective C code as Soonts did
and thusly ditch crude Swift api to opencv bypassing costly memory copying with gets and puts.
void fasterSumX2( const char *input,
int rows,
int columns,
long step,
int channels,
char* output,
long output_step
)
{
for(int j = 0;j < rows;j++){
for(int i = 0;i < columns;i++){
long offset = step * j + i * channels;
const unsigned char *ptr = (const unsigned char *)(input + offset);
int res = ptr[0]+ptr[1];
if (res > 0) {
if (res > 255) {
assert(false);
}
}
*(output + output_step * j + i) = res;
}
}
}

Crypto++ CTR mode manual implement

I am trying to make CTR manually on top of ECB mode (but still) using Crypto++.
The idea is:
For single block: Just use ECB For multiple block, use CTR algorithm
(AFAIK):
//We have n block of plain data -> M
PlainData M[n];
key;
iv;
char *CTR;
cipher ="";
for(i = 0; i<n; i++ ){
if(i ==0){
CTR = iv;
}
ei = encryptECB(CTR + i)
cipherI = xor(ei, M[i])
cipher += cipherI;
}
//My xor() to XOR two char array
void xor(char *s1, char* s2, char *& result, int len){
try{
int i;
for (i = 0; i < len; i++){
int u = s1[i] ^ s2[i];
result[i] = u;
}
result[i] = '\0';
}
catch (...){
cout << "Errp";
}
}
Test 1: 100% Crypto++ CTR
string auto_ctr(char * s1, long size){
CTR_Mode< AES >::Encryption e;
e.SetKeyWithIV(key, sizeof(key), iv, sizeof(iv));
string cipherZ;
StringSource s(s1, true,
new StreamTransformationFilter(e,
new StringSink(cipherZ), BlockPaddingSchemeDef::BlockPaddingScheme::NO_PADDING
)
);
return cipherZ;
}
Test 2: Manual CTR based on ECB
string encrypt(char* s1, int size){
ECB_Mode< AES >::Encryption e;
e.SetKey(key, size);
string cipher;
string s(s1, size);
StringSource ss1(s, true,
new StreamTransformationFilter(e,
new StringSink(cipher), BlockPaddingSchemeDef::BlockPaddingScheme::NO_PADDING
) // StreamTransformationFilter
); // StringSource
return cipher;
}
static string manual_ctr(char *plain, long &size){
int nBlocks = size / BLOCK_SIZE;
char* encryptBefore = new char[BLOCK_SIZE];
char *ci = new char[BLOCK_SIZE] ;
string cipher;
for (int i = 0; i < nBlocks; i++){
//If the first loop, CTR = IV
if (i == 0){
memcpy(encryptBefore, iv, BLOCK_SIZE);
}
encryptBefore[BLOCK_SIZE] = '\0';
memcpy(encryptBefore, encryptBefore + i, BLOCK_SIZE);
char *buffer = new char[BLOCK_SIZE];
memcpy(buffer, &plain[i], BLOCK_SIZE);
buffer[BLOCK_SIZE] = '\0';
//Encrypt the CTR
string e1 = encrypt(encryptBefore, BLOCK_SIZE);
//Xor it with m[i] => c[i]
xor((char*)e1.c_str(), buffer, ci, BLOCK_SIZE);
//Append to the summary cipher
/*for (int j = 0; j < BLOCK_SIZE/2; j++){
SetChar(cipher, ci[j], i*BLOCK_SIZE + j);
}*/
cipher += ci;
//Set the cipher back to iv
//memcpy(encryptBefore, ci, BLOCK_SIZE);
}
return cipher;
}
And this is Main for testing:
void main(){
long size = 0;
char * plain = FileUtil::readAllByte("some1.txt", size);
string auto_result = auto_ctr(plain, size);
string manual_result = manual_ctr(plain, size);
getchar();
}
The auto_result is:
"Yž+eÞsÂÙ\bü´\x1a¨Ü_ÙR•L¸Ð€¦å«ÎÍÊ[w®Ÿg\fT½\ý7!p\r^ÍdžúP\bîT\x3\x1cZï.s%\x1ei{ÚMˆØ…Pä¾õ\x46\r5\tâýï‚ú\x16ç’Qiæ²\x15š€á^ªê]W
ÊNqdŒ¥ ˆ†¾j%8.Ìù\x6Þ›ÔÏ’[c\x19"
The manual_result is:
"Yž+eÞsÂÙ\bü´\x1a¨Ü_Ù·\x18ýuù\n\nl\x11Á\x19À†Žaðƒºñ®GäþŽá•\x11ÇYœf+^Q\x1a\x13B³‘QQµºëÑÌåM\"\x12\x115â\x10¿Ô„›s°‰=\x18*\x1c:²IF'n#ŠŠ¾mGÂzõžÀ\x1eÏ\SëYU¼í‘"
>
What is the problem with my implement?
Since your first block seems to be working fine, I've only searched for problems in the management of the counter itself and here is what seems me wrong :
memcpy(encryptBefore, encryptBefore + i, BLOCK_SIZE);
Here you are trying to increment your IV i times, I presume, but this is not what happens, what you do is trying to copy into your encryptBefore pointer the content of the encryptBefore+i pointer spanning over BLOCK_SIZE bytes. This is not at all incrementing the IV, but it works for the first block because then i=0.
What you want to do is actually creating a big integer using CryptoPP::Integer to use as an IV and increment that integer and then convert it into a byte array using the Encode(byte *output, size_t outputLen, Signedness sign=UNSIGNED) const function from the CryptoPP Integer class when you need to use bytes instead of integers.
Ps: when performing i/o operations, I recommend you to use hexadecimal strings, take a look at the CryptoPP::HexEncoder and HexDecoder classes, they both are well documented on CryptoPP wiki.

Output for sample code for an upcoming exam concerning pthread

pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
int token = 2;
int value = 3;
void * red ( void *arg ) {
int myid = * ((int *) arg);
pthread_mutex_lock( &mutex );
while ( myid != token) {
pthread_cond_wait( &cond, &mutex );
}
value = value + (myid + 3);
printf( "RED: id is %d \n", value);
token = (token + 1) % 3;
pthread_cond_broadcast( &cond );
pthread_mutex_unlock( &mutex );
}
void * blue ( void *arg ) {
int myid = * ((int *) arg);
pthread_mutex_lock( &mutex );
while ( myid != token) {
pthread_cond_wait( &cond, &mutex );
}
value = value * (myid + 2);
printf( "BLUE: id is %d \n", value);
token = (token + 1) % 3;
pthread_cond_broadcast( &cond );
pthread_mutex_unlock( &mutex );
}
void * white ( void *arg ) {
int myid = * ((int *) arg);
pthread_mutex_lock( &mutex );
while ( myid != token) {
pthread_cond_wait( &cond, &mutex );
}
value = value * (myid + 1);
printf( "WHITE: id is %d \n", value);
token = (token + 1) % 3;
pthread_cond_broadcast( &cond );
pthread_mutex_unlock( &mutex );
}
main( int argc, char *argv[] ) {
pthread_t tid;
int count = 0;
int id1, id2, id3;
id1 = count;
n = pthread_create( &tid, NULL, red, &id1);
id2 = ++count;
n = pthread_create( &tid, NULL, blue, &id2);
id3 = ++count;
n = pthread_create( &tid, NULL, white, &id3);
if ( n = pthread_join( tid, NULL ) ) {
fprintf( stderr, "pthread_join: %s\n", strerror( n ) );
exit( 1 );
}
}
I am just looking for comments and or notes to what the output would be. THIS IS FOR AN EXAM AND WAS OFFERED AS AN EXAMPLE. THIS IS NOT HOMEWORK OR GOING TO BE USED FOR ANY TYPE OF SUBMISSION. I am looking to understand what is going on. Any help is greatly appreciated.
I'm going to assume that you know the function of the locks, condition variables, and the waits. Basically you have three threads that each call Red, Blue, and White. Token is originally 2, and value is originally 3.
Red is called when id1 = 0, but it will stay in the while block calling wait() until the token = 0.
Blue is called when id3 = 1, and will stay in the while block called wait() until the token is 1.
White is called when id2 = 2, and will stay in the while block calling wait() until the token is 2.
So White will enter the critical section first, since it's the only one that won't enter the while loop. So value = 3 * ( 3 ) = 9; token = ( 3 ) % 3 = 0;
Broadcast wakes every waiting thread, but the only one that will enter the critical section is Red. It adds 3 to value for 12; token = ( 1 ) % 3 = 1; Broadcast wakes Blue.
Blue enters the critical section. value = 12 * 3; token = 2 ( but it doesn't matter anymore ).
This would be the order of the threads would execute, which is what I assume the test is really asking. However, what should really come out is just:
White is 9
This is because there is only one pthread_t tid. So after pthread_join( tid, NULL ), it can immediately exit. If you put different pthread_t in each of the pthread_create() then all of them would print.

opencv 3 channel image data offset for cuda kernel [duplicate]

I'm doing linear filtering on images using CUDA. I use 2D thread blocks and 2D grid to make the problem natural. Here's how I index: (height and width are image dimensions)
dim3 BlockDim(16,16);
dim3 GridDim;
GridDim.x = (width + 15) / 16;
GridDim.y = (height + 15) / 16;
In kernel I access the locations as follows:
unsigned int xIndex = blockIdx.x*16+ threadIdx.x;
unsigned int yIndex = blockIdx.y*16+ threadIdx.y;
unsigned int tid = yIndex * width + xIndex;
And I want to return four boundaries (i'll cater them later on). I do this as:
if(yIndex>=height-N || xIndex>=width-N || yIndex<N || xIndex<N)
return;
Where N is the number of pixels at each boundary I don't want to calculate.
Problem:
The code runs fine on all standard images sizes. But for some random image sizes it shows diagonal line(s). For example in my case 500x333 image (even when no dimension is multiple of 16) is showing correct output whereas 450x365 is showing diagonal lines in the output. The problem remains even if I just return the extra threads of grid and nothing else like this:
if(yIndex>=height || xIndex>=width)
return;
The code remains the same, some inputs run fine while others don't. Can anybody spot the bug? I have attached the input and output samples here: IMAGES Thanks!
Update:
Kernel Code (Simplified to return input image, but gives the same problem)
__global__ void filter_8u_c1_kernel(unsigned char* in, unsigned char* out, int width, int height, float* filter, int fSize)
{
unsigned int xIndex = blockIdx.x*BLOCK_SIZE + threadIdx.x;
unsigned int yIndex = blockIdx.y*BLOCK_SIZE + threadIdx.y;
unsigned int tid = yIndex * width + xIndex;
unsigned int N = filterSize/2;
if(yIndex>=height-N || xIndex>=width-N || yIndex<N || xIndex<N)
return;
/*Filter code removed, still gives the same problem*/
out[tid] = in[tid];
}
Update 2:
I have also removed the return statement by reversing the if condition. But the problem persists.
if(yIndex<=height-N && xIndex<=width-N && yIndex>N && xIndex>N){
/*Kernel Code*/
}
There are quite a few things you still haven't described very well, but based on the information you have posted, I built what I am guessing is a reasonable repro case with parameters which match a case you say it failing (450 x 364 with filterSize=5):
#include <stdio.h>
#include <assert.h>
template<int filterSize>
__global__ void filter_8u_c1_kernel(unsigned char* in, unsigned char* out, int width, int height, float* filter, int fSize)
{
unsigned int xIndex = blockIdx.x*blockDim.x + threadIdx.x;
unsigned int yIndex = blockIdx.y*blockDim.y + threadIdx.y;
unsigned int tid = yIndex * width + xIndex;
unsigned int N = filterSize/2;
if(yIndex>=height-N || xIndex>=width-N || yIndex<N || xIndex<N)
return;
out[tid] = in[tid];
}
int main(void)
{
const int width = 450, height = 365, filterSize=5;
const size_t isize = sizeof(unsigned char) * size_t(width * height);
unsigned char * _in, * _out, * out;
assert( cudaMalloc((void **)&_in, isize) == cudaSuccess );
assert( cudaMalloc((void **)&_out, isize) == cudaSuccess );
assert( cudaMemset(_in, 'Z', isize) == cudaSuccess );
assert( cudaMemset(_out, 'A', isize) == cudaSuccess );
const dim3 BlockDim(16,16);
dim3 GridDim;
GridDim.x = (width + BlockDim.x - 1) / BlockDim.x;
GridDim.y = (height + BlockDim.y - 1) / BlockDim.y;
filter_8u_c1_kernel<filterSize><<<GridDim,BlockDim>>>(_in,_out,width,height,0,0);
assert( cudaPeekAtLastError() == cudaSuccess );
out = (unsigned char *)malloc(isize);
assert( cudaMemcpy(out, _out, isize, cudaMemcpyDeviceToHost) == cudaSuccess);
for(int i=0; i<width; i++) {
fprintf(stdout, "%d: ", i);
for(int j=0; j<height; j++) {
unsigned int idx = i + j*width;
fprintf(stdout, "%c", out[idx]);
}
fprintf(stdout, "\n");
}
return cudaThreadExit();
}
When run it does exactly what I would expect, overwriting the output memory with the input everywhere except for the first and last two lines and the first and last two entries in all the lines in between. This is running with CUDA 3.2 on OS X 10.6.5 with a compute 1.2 GPU. So whatever is happening in you code, it isn't happening in my repro case, which either means I have misinterpreted what you have written, or there is something else you haven't described which is causing the problem.

SlimDX (DirectX10) - How to change a texel in Texture?

I try to change the texels of a Texture which is already loaded.
My assumption was to use the Texture2D::Map and UnMap functions, but there is no change when I change the data of given DataRectangle.
I need a simple example like, creating a texture of 128x128 with a gradient from black to white from each side.
Thx
ps: A Direct3D 10 C++ example may also help, SlimDX is only a wrapper and has nearly complete the same functions.
This is my D3D10 2D texture loader
bool D3D10Texture::Init( GFXHandler* pHandler, unsigned int usage, unsigned int width, unsigned int height, unsigned int textureType, bool bMipmapped, void* pTextureData )
{
mMipmapped = bMipmapped;
//SetData( pHandler, 0 );
D3D10Handler* pD3DHandler = (D3D10Handler*)pHandler;
ID3D10Device* pDevice = pD3DHandler->GetDevice();
DXGI_SAMPLE_DESC dxgiSampleDesc;
dxgiSampleDesc.Count = 1;
dxgiSampleDesc.Quality = 0;
D3D10_USAGE d3d10Usage;
if ( usage & RU_All_Dynamic ) d3d10Usage = D3D10_USAGE_DYNAMIC;
else d3d10Usage = D3D10_USAGE_DEFAULT;
//unsigned int cpuAccess = D3D10_CPU_ACCESS_WRITE;
//if ( (usage & RU_Buffer_WriteOnly) == 0 ) cpuAccess |= D3D10_CPU_ACCESS_READ;
unsigned int cpuAccess = 0;
if ( !pTextureData )
{
cpuAccess = D3D10_CPU_ACCESS_WRITE;
//if ( (usage & RU_Buffer_WriteOnly) == 0 ) cpuAccess |= D3D10_CPU_ACCESS_READ;
}
unsigned int bindFlags = D3D10_BIND_SHADER_RESOURCE;
if ( usage & RU_Texture_RenderTarget ) bindFlags |= D3D10_BIND_RENDER_TARGET;
unsigned int miscFlags = 0;
if ( usage & RU_Texture_AutoGenMipmap ) miscFlags |= D3D10_RESOURCE_MISC_GENERATE_MIPS;
D3D10_TEXTURE2D_DESC d3d10Texture2DDesc;
d3d10Texture2DDesc.Width = width;
d3d10Texture2DDesc.Height = height;
d3d10Texture2DDesc.MipLevels = GetNumMipMaps( width, height, bMipmapped );
d3d10Texture2DDesc.ArraySize = 1;
d3d10Texture2DDesc.Format = GetD3DFormat( (TextureTypes)textureType );
d3d10Texture2DDesc.SampleDesc = dxgiSampleDesc;
d3d10Texture2DDesc.Usage = d3d10Usage;
d3d10Texture2DDesc.BindFlags = D3D10_BIND_SHADER_RESOURCE;
d3d10Texture2DDesc.CPUAccessFlags = cpuAccess;
d3d10Texture2DDesc.MiscFlags = miscFlags;
//D3D10_SUBRESOURCE_DATA d3d10SubResourceData;
//d3d10SubResourceData.pSysMem = pTextureData;
//d3d10SubResourceData.SysMemPitch = GetPitch( width, (TextureTypes)textureType );
//d3d10SubResourceData.SysMemSlicePitch = 0;
D3D10_SUBRESOURCE_DATA* pSubResourceData = NULL;
if ( pTextureData )
{
pSubResourceData = new D3D10_SUBRESOURCE_DATA[d3d10Texture2DDesc.MipLevels];
char* pTexPos = (char*)pTextureData;
unsigned int pitch = GetPitch( width, (TextureTypes)textureType );
unsigned int count = 0;
unsigned int max = d3d10Texture2DDesc.MipLevels;
while( count < max )
{
pSubResourceData[count].pSysMem = pTexPos;
pSubResourceData[count].SysMemPitch = pitch;
pSubResourceData[count].SysMemSlicePitch = 0;
pTexPos += pitch * height;
pitch >>= 1;
count++;
}
}
if ( FAILED( pDevice->CreateTexture2D( &d3d10Texture2DDesc, pSubResourceData, &mpTexture ) ) )
{
return false;
}
if ( pSubResourceData )
{
delete[] pSubResourceData;
pSubResourceData = NULL;
}
mWidth = width;
mHeight = height;
mFormat = (TextureTypes)textureType;
mpTexture->AddRef();
mpTexture->Release();
D3D10_SHADER_RESOURCE_VIEW_DESC d3d10ShaderResourceViewDesc;
d3d10ShaderResourceViewDesc.Format = d3d10Texture2DDesc.Format;
d3d10ShaderResourceViewDesc.ViewDimension = D3D10_SRV_DIMENSION_TEXTURE2D;
d3d10ShaderResourceViewDesc.Texture2D.MostDetailedMip = 0;
d3d10ShaderResourceViewDesc.Texture2D.MipLevels = GetNumMipMaps( width, height, bMipmapped );
if ( FAILED( pDevice->CreateShaderResourceView( mpTexture, &d3d10ShaderResourceViewDesc, &mpView ) ) )
{
return false;
}
ResourceRecorder::Instance()->AddResource( this );
return true;
}
With that function all you need to do is pass in the whit to black texture. For example to write a 256x256 textue with each horizontal line being one brighter than the previous line the following code will work
int* pTexture = new int[256 * 256];
int count = 0;
while( count < 256 )
{
int count2 = 0;
while( count2 < 256 )
{
pTexture[(count * 256) + count2] = 0xff000000 | (count << 16) | (count << 8) | count;
count2++;
}
count++;
}
Make sure you follow the rules in the "Resource Usage Restrictions" section:
MSDN: D3D10_USAGE
public void NewData(byte[] newData)
{
DataRectangle mappedTex = null;
//assign and lock the resource
mappedTex = pTexture.Map(0, D3D10.MapMode.WriteDiscard, D3D10.MapFlags.None);
// if unable to hold texture
if (!mappedTex.Data.CanWrite)
{
throw new ApplicationException("Cannot Write to the Texture");
}
// write new data to the texture
mappedTex.Data.WriteRange<byte>(newData);
// unlock the resource
pTexture.Unmap(0);
if (samplerflag)
temptex = newData;
}
this overwrites the buffer on every new frame, you may want to use a D3D10.MapMode.readwrite or something if ur only trying to write one texel
you will also need to write to the datarectangle in a specific point using one of the other write functions

Resources