Manual CTR mode implementation with Crypto++

I am trying to implement CTR mode manually on top of ECB mode while still using Crypto++.
The idea is:
For a single block, just use ECB. For multiple blocks, use the CTR algorithm
(AFAIK):
// We have n blocks of plain data -> M
PlainData M[n];
key;
iv;
char *CTR;
cipher = "";
for (i = 0; i < n; i++) {
    if (i == 0) {
        CTR = iv;
    }
    ei = encryptECB(CTR + i)
    cipherI = xor(ei, M[i])
    cipher += cipherI;
}
// My xor() to XOR two char arrays
void xor(char *s1, char *s2, char *&result, int len) {
    try {
        int i;
        for (i = 0; i < len; i++) {
            int u = s1[i] ^ s2[i];
            result[i] = u;
        }
        result[i] = '\0';
    }
    catch (...) {
        cout << "Error";
    }
}
Test 1: 100% Crypto++ CTR
string auto_ctr(char *s1, long size) {
    CTR_Mode< AES >::Encryption e;
    e.SetKeyWithIV(key, sizeof(key), iv, sizeof(iv));
    string cipherZ;
    StringSource s(s1, true,
        new StreamTransformationFilter(e,
            new StringSink(cipherZ),
            BlockPaddingSchemeDef::BlockPaddingScheme::NO_PADDING
        )
    );
    return cipherZ;
}
Test 2: Manual CTR based on ECB
string encrypt(char *s1, int size) {
    ECB_Mode< AES >::Encryption e;
    e.SetKey(key, size);
    string cipher;
    string s(s1, size);
    StringSource ss1(s, true,
        new StreamTransformationFilter(e,
            new StringSink(cipher),
            BlockPaddingSchemeDef::BlockPaddingScheme::NO_PADDING
        ) // StreamTransformationFilter
    ); // StringSource
    return cipher;
}
static string manual_ctr(char *plain, long &size) {
    int nBlocks = size / BLOCK_SIZE;
    char *encryptBefore = new char[BLOCK_SIZE];
    char *ci = new char[BLOCK_SIZE];
    string cipher;
    for (int i = 0; i < nBlocks; i++) {
        // If this is the first loop, CTR = IV
        if (i == 0) {
            memcpy(encryptBefore, iv, BLOCK_SIZE);
        }
        encryptBefore[BLOCK_SIZE] = '\0';
        memcpy(encryptBefore, encryptBefore + i, BLOCK_SIZE);
        char *buffer = new char[BLOCK_SIZE];
        memcpy(buffer, &plain[i], BLOCK_SIZE);
        buffer[BLOCK_SIZE] = '\0';
        // Encrypt the CTR
        string e1 = encrypt(encryptBefore, BLOCK_SIZE);
        // XOR it with m[i] => c[i]
        xor((char*)e1.c_str(), buffer, ci, BLOCK_SIZE);
        // Append to the summary cipher
        /*for (int j = 0; j < BLOCK_SIZE/2; j++){
            SetChar(cipher, ci[j], i*BLOCK_SIZE + j);
        }*/
        cipher += ci;
        // Set the cipher back to iv
        //memcpy(encryptBefore, ci, BLOCK_SIZE);
    }
    return cipher;
}
And this is main() for testing:
void main() {
    long size = 0;
    char *plain = FileUtil::readAllByte("some1.txt", size);
    string auto_result = auto_ctr(plain, size);
    string manual_result = manual_ctr(plain, size);
    getchar();
}
The auto_result is:
"Yž+eÞsÂÙ\bü´\x1a¨Ü_ÙR•L¸Ð€¦å«ÎÍÊ[w®Ÿg\fT½\ý7!p\r^ÍdžúP\bîT\x3\x1cZï.s%\x1ei{ÚMˆØ…Pä¾õ\x46\r5\tâýï‚ú\x16ç’Qiæ²\x15š€á^ªê]W
ÊNqdŒ¥ ˆ†¾j%8.Ìù\x6Þ›ÔÏ’[c\x19"
The manual_result is:
"Yž+eÞsÂÙ\bü´\x1a¨Ü_Ù·\x18ýuù\n\nl\x11Á\x19À†Žaðƒºñ®GäþŽá•\x11ÇYœf+^Q\x1a\x13B³‘QQµºëÑÌåM\"\x12\x115â\x10¿Ô„›s°‰=\x18*\x1c:²IF'n#ŠŠ¾mGÂzõžÀ\x1eÏ\SëYU¼í‘"
What is the problem with my implementation?

Since your first block seems to be working fine, I've only looked for problems in the management of the counter itself, and here is what seems wrong to me:
memcpy(encryptBefore, encryptBefore + i, BLOCK_SIZE);
Here you are trying to increment your IV by i, I presume, but that is not what happens: you are copying BLOCK_SIZE bytes from the encryptBefore+i pointer back into encryptBefore. This does not increment the IV at all, but it works for the first block because then i = 0.
What you actually want to do is create a big integer with CryptoPP::Integer to use as the counter, increment that integer, and convert it back into a byte array with the Encode(byte *output, size_t outputLen, Signedness sign=UNSIGNED) const function of the CryptoPP Integer class whenever you need bytes instead of an integer.
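Something like this minimal sketch (untested; the helper name makeCounterBlock and the include paths are illustrative, not taken from your code):

#include "cryptopp/integer.h"
#include "cryptopp/aes.h"

// Derive the i-th counter block: treat the 16-byte IV as a big-endian
// integer, add i, and encode the result back into a 16-byte block.
void makeCounterBlock(const byte *iv, long i, byte counter[CryptoPP::AES::BLOCKSIZE])
{
    CryptoPP::Integer ctr(iv, CryptoPP::AES::BLOCKSIZE); // big-endian by default
    ctr += i;
    ctr.Encode(counter, CryptoPP::AES::BLOCKSIZE);       // back to raw bytes
}

You would then feed the resulting block to your encrypt() call instead of the memcpy'd buffer.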
PS: when performing I/O, I recommend working with hexadecimal strings; take a look at the CryptoPP::HexEncoder and HexDecoder classes, both of which are well documented on the Crypto++ wiki.
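For instance, a small sketch of hex-encoding a binary string with the pipeline classes (the function name toHex is mine):

#include "cryptopp/hex.h"
#include "cryptopp/filters.h"

std::string toHex(const std::string &binary)
{
    std::string hex;
    CryptoPP::StringSource ss(binary, true,
        new CryptoPP::HexEncoder(
            new CryptoPP::StringSink(hex))); // e.g. "\x01\xAB" -> "01AB"
    return hex;
}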

Related

What are my options to convert an OpenCV reduce loop to native iOS code? SIMD, anyone?

Which native iOS framework is best suited to eliminate this CPU hog written in OpenCV?
/// Reduce the channel elements of given Mat to a single channel
static func reduce(input: Mat) throws -> Mat {
    let output = Mat(rows: input.rows(), cols: input.cols(), type: CvType.CV_8UC1)
    for x in 0 ..< input.rows() {
        for y in 0 ..< input.cols() {
            let value = input.get(row: x, col: y)
            let dataValue = value.reduce(0, +)
            try output.put(row: x, col: y, data: [dataValue])
        }
    }
    return output
}
This takes 20+ seconds to do those gets and puts on the real-world data I put this code through.
Assuming your input matrix is CV_64FC2, call the computeSumX2 C function below for each row.
Untested.
#include <arm_neon.h>
#include <stdint.h>
#include <stddef.h>

// Load 8 FP64 values, add pairwise, narrow uint64 to uint32, combine into a single vector
inline uint32x4_t reduce4( const double* rsi )
{
    // Load 8 values
    float64x2x4_t f64 = vld1q_f64_x4( rsi );
    // Add them pairwise
    float64x2_t f64_1 = vpaddq_f64( f64.val[ 0 ], f64.val[ 1 ] );
    float64x2_t f64_2 = vpaddq_f64( f64.val[ 2 ], f64.val[ 3 ] );
    // Convert FP64 to uint64
    uint64x2_t i64_1 = vcvtq_u64_f64( f64_1 );
    uint64x2_t i64_2 = vcvtq_u64_f64( f64_2 );
    // Convert int64 to int32 in a single vector, using saturation
    uint32x2_t low = vqmovn_u64( i64_1 );
    return vqmovn_high_u64( low, i64_2 );
}

// Compute pairwise sum of FP64 values, cast to bytes
void computeSumX2( uint8_t* rdi, size_t length, const double* rsi )
{
    const double* const rsiEnd = rsi + length * 2;
    size_t lengthAligned = ( length / 16 ) * 16;
    const double* const rsiEndAligned = rsi + lengthAligned * 2;
    for( ; rsi < rsiEndAligned; rsi += 16 * 2, rdi += 16 )
    {
        // Each iteration of the loop loads 32 source values, stores 16 bytes
        uint16x4_t low16 = vqmovn_u32( reduce4( rsi ) );
        uint16x8_t u16 = vqmovn_high_u32( low16, reduce4( rsi + 8 ) );
        uint8x8_t low8 = vqmovn_u16( u16 );
        low16 = vqmovn_u32( reduce4( rsi + 8 * 2 ) );
        u16 = vqmovn_high_u32( low16, reduce4( rsi + 8 * 3 ) );
        uint8x16_t res = vqmovn_high_u16( low8, u16 );
        vst1q_u8( rdi, res );
    }
    for( ; rsi < rsiEnd; rsi += 2, rdi++ )
    {
        // Each iteration of the loop loads 2 source values, stores a single byte
        float64x2_t f64 = vld1q_f64( rsi );
        double sum = vaddvq_f64( f64 );
        *rdi = (uint8_t)sum;
    }
}
For folks such as myself with a poor grasp of ARM intrinsics,
a simpler solution is to bridge into Objective-C code, as Soonts did,
and thus ditch the crude Swift API to OpenCV, bypassing the costly memory copying done by the gets and puts.
void fasterSumX2( const char *input,
                  int rows,
                  int columns,
                  long step,
                  int channels,
                  char* output,
                  long output_step )
{
    for (int j = 0; j < rows; j++) {
        for (int i = 0; i < columns; i++) {
            long offset = step * j + i * channels;
            const unsigned char *ptr = (const unsigned char *)(input + offset);
            int res = ptr[0] + ptr[1];
            if (res > 0) {
                if (res > 255) {
                    assert(false);
                }
            }
            *(output + output_step * j + i) = res;
        }
    }
}

Manual CBC encryption handling with Crypto++

I am trying to play around with manual encryption in CBC mode while still using Crypto++, just to see whether I can do it manually.
The CBC algorithm is (AFAIK):
Presume we have n blocks K[1]...K[n]
0. cipher = empty
1. xor(IV, K[1]) -> t1
2. encrypt(t1) -> r1
3. cipher += r1
4. xor(r1, K[2]) -> t2
5. encrypt(t2) -> r2
6. cipher += r2
7. xor(r2, K[3]) -> t3
8. ...
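In code form, a minimal sketch of these steps using Crypto++'s low-level block API might look like the following (illustrative only: hypothetical buffer names, no padding handling; byte is CryptoPP::byte, or the global byte typedef on older versions):

#include <cstring>
#include "cryptopp/aes.h"

// c[0] = E(IV xor K[1]), c[i] = E(c[i-1] xor K[i+1])
void cbcEncryptSketch(const byte *key, unsigned int keyLen, const byte *iv,
                      const byte *plain, byte *cipher, size_t nBlocks)
{
    const int BS = CryptoPP::AES::BLOCKSIZE;            // 16 bytes
    CryptoPP::AES::Encryption enc(key, keyLen);
    byte prev[CryptoPP::AES::BLOCKSIZE];
    byte t[CryptoPP::AES::BLOCKSIZE];
    memcpy(prev, iv, BS);
    for (size_t i = 0; i < nBlocks; ++i) {
        for (int b = 0; b < BS; ++b)
            t[b] = prev[b] ^ plain[i * BS + b];          // xor(prev, K[i]) -> t
        enc.ProcessBlock(t, &cipher[i * BS]);            // encrypt(t) -> r[i]
        memcpy(prev, &cipher[i * BS], BS);               // r[i] chains into the next block
    }
}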
So I tried to implement it with Crypto++. I have a text file containing alphanumeric characters only. Test 1 reads the file chunk by chunk (16 bytes), encrypts each chunk manually in CBC fashion, and concatenates the ciphertext. Test 2 uses Crypto++'s built-in CBC mode.
Test 1
char* key;
char* iv;
//Iterate in K[n] array of n blocks
BSIZE = 16;

std::string vectorToString(vector<char> v){
    string s = "";
    for (int i = 0; i < v.size(); i++){
        s[i] = v[i];
    }
    return s;
}

vector<char> xor(vector<char> s1, vector<char> s2, int len){
    vector<char> r;
    for (int i = 0; i < len; i++){
        int u = s1[i] ^ s2[i];
        r.push_back(u);
    }
    return r;
}

vector<char> byteToVector(byte *b, int len){
    vector<char> v;
    for (int i = 0; i < len; i++){
        v.push_back(b[i]);
    }
    return v;
}
string cbc_manual(byte [n]){
    int i = 0;
    //Open a file and read from it, buffer size = 16,
    //equal to DEFAULT_BLOCK_SIZE
    std::ifstream fin(fileName, std::ios::binary | std::ios::in);
    const int BSIZE = 16;
    vector<char> encryptBefore;
    //This function will return cpc
    string cpc = "";
    while (!fin.eof()){
        char buffer[BSIZE];
        //Read a chunk of file
        fin.read(buffer, BSIZE);
        int sb = sizeof(buffer);
        if (i == 0){
            encryptBefore = byteToVector(iv, BSIZE);
        }
        //If i == 0, xor IV with current buffer
        //else, xor encryptBefore with current buffer
        vector<char> t1 = xor(encryptBefore, byteToVector((byte*) buffer, BSIZE), BSIZE);
        //After xored, encrypt the xor result, it will be current step cipher
        string r1 = encrypt(t1, BSIZE).c_str();
        cpc += r1;
        const char* end = r1.c_str();
        encryptBefore = stringToVector(r1);
        i++;
    }
    return cpc;
}
This is my encrypt() function; because we encrypt only one block at a time, I use ECB (?) mode:
string encrypt(string s, int size){
    ECB_Mode< AES >::Encryption e;
    e.SetKey(key, size);
    string cipher;
    StringSource ss1(s, true,
        new StreamTransformationFilter(e,
            new StringSink(cipher)
        ) // StreamTransformationFilter
    ); // StringSource
    return cipher;
}
And this is the 100% Crypto++ solution:
Test 2
encryptCBC(char * plain){
    CBC_Mode < AES >::Encryption encryption(key, sizeof(key), iv);
    StreamTransformationFilter encryptor(encryption, NULL);
    for (size_t j = 0; j < plain.size(); j++)
        encryptor.Put((byte)plain[j]);
    encryptor.MessageEnd();
    size_t ready = encryptor.MaxRetrievable();
    string cipher(ready, 0x00);
    encryptor.Get((byte*)&cipher[0], cipher.size());
}
The results of Test 1 and Test 2 are different. In fact, the ciphertext from Test 1 contains the result of Test 2. Example:
Test 1's result aaa[....]bbb[....]ccc[...]...
Test 2 (Crypto++ built-in CBC)'s result: aaabbbccc...
I know the xor() function may cause a problem related to "sameChar ^ sameChar = 0", but is there any problem related to the algorithm in my code?
This is my Test 2.1 after jww's first solution.
static string auto_cbc2(string plain, long size){
    CBC_Mode< AES >::Encryption e;
    e.SetKeyWithIV(key, sizeof(key), iv, sizeof(iv));
    string cipherText;
    CryptoPP::StringSource ss(plain, true,
        new CryptoPP::StreamTransformationFilter(e,
            new CryptoPP::StringSink(cipherText),
            BlockPaddingSchemeDef::NO_PADDING
        ) // StreamTransformationFilter
    ); // StringSource
    return cipherText;
}
It throws an error:
Unhandled exception at 0x7407A6F2 in AES-CRPP.exe: Microsoft C++
exception: CryptoPP::InvalidDataFormat at memory location 0x00EFEA74
I only get this error when using BlockPaddingSchemeDef::NO_PADDING; if I remove BlockPaddingSchemeDef or use BlockPaddingSchemeDef::DEFAULT_PADDING, I get no error. :?
StringSource ss1(s, true,
    new StreamTransformationFilter(e,
        new StringSink(cipher)));
This uses PKCS padding by default. It takes a 16-byte input and produces a 32-byte output due to padding. You should do one of two things.
First, you can use BlockPaddingScheme::NO_PADDING. Something like:
StringSource ss1(s, true,
    new StreamTransformationFilter(e,
        new StringSink(cipher),
        BlockPaddingScheme::NO_PADDING));
Second, you can process blocks manually, 16 bytes at a time. Something like:
AES::Encryption encryptor(key, keySize);

byte ibuff[<some size>] = ...;
byte obuff[<some size>];

ASSERT(<some size> % AES::BLOCKSIZE == 0);
unsigned int BLOCKS = <some size>/AES::BLOCKSIZE;

for (unsigned int i=0; i<BLOCKS; i++)
{
    encryptor.ProcessBlock(&ibuff[i*16], &obuff[i*16]);
    // Do the CBC XOR thing...
}
You may be able to call ProcessAndXorBlock from the BlockCipher base class and do it in one shot.
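For reference, ProcessAndXorBlock(inBlock, xorBlock, outBlock) writes cipher(inBlock) XOR xorBlock into outBlock, so it folds both steps together for CBC decryption (and for CTR); CBC encryption still needs the XOR before the cipher call. A hedged sketch with illustrative buffer names:

AES::Decryption decryptor(key, keySize);

// CBC decryption: p[i] = Decrypt(c[i]) XOR c[i-1], with c[-1] = IV
const byte* prev = iv;
for (unsigned int i = 0; i < BLOCKS; i++)
{
    decryptor.ProcessAndXorBlock(&cbuff[i*16], prev, &pbuff[i*16]);
    prev = &cbuff[i*16];
}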

_CrtIsValidHeapPointer error when trying to free memory

I get a _CrtIsValidHeapPointer(pUserData) error when running the code below.
Sometimes the code works perfectly, and sometimes this message appears, so I guess the problem is related to the memory allocation. But I've gone through the code many times and the numbers make sense to me (also when debugging).
I noticed while debugging that it happens at the line "free(str_temp)".
The relevant code is here:
int main(){
    int n;
    int len;
    char *str;
    char command[3];
    printf("Enter your string:\n");
    scanf("%d", &n);
    str = malloc(n+1);
    scanf("%s", str);
    while (1){
        printf(">");
        scanf("%s", command);
        if (compare(command, "ml")) {
            int k;
            scanf("%d", &k);
            multiply(str, n, k);
            printf("Current string is %s\n", str);
            n = ln(str);
            continue;
        }
        free(str);
        return 0;
    }
}

void multiply(char *str, int n, int k) {
    char *str_temp = malloc(n+1);
    int i;
    int j;
    int q;
    for (i = 0; i < n; i++){
        str_temp[i] = str[i];
    }
    str_temp[n] = '\0';
    free(str);
    *str = malloc(n*k+1);
    for (i = 0; i < k; i++){
        for (j = 0; j < n; j++){
            str[i*n + j] = str_temp[j];
        }
    }
    str[n*k] = '\0';
    free(str_temp);
}
Try to use this function definition:
void multiply(char **str, int n, int k) // Use **str (a double pointer) instead of *str
And call it like
multiply(&str, n, k);
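A minimal sketch of what the corrected body might look like (untested; it keeps the original logic but updates the caller's pointer through *str):

void multiply(char **str, int n, int k) {
    char *str_temp = (char*)malloc(n + 1);
    int i, j;
    for (i = 0; i < n; i++)
        str_temp[i] = (*str)[i];
    str_temp[n] = '\0';

    free(*str);                      /* release the old buffer */
    *str = (char*)malloc(n * k + 1); /* the caller's pointer now points at the new buffer */

    for (i = 0; i < k; i++)
        for (j = 0; j < n; j++)
            (*str)[i * n + j] = str_temp[j];
    (*str)[n * k] = '\0';

    free(str_temp);
}

This is presumably why the _CrtIsValidHeapPointer check fires: the original code frees str and then keeps writing through the stale pointer (the *str = malloc(...) line never reassigns it), so the heap is corrupted by the time main calls free(str).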

How to convert an int* (BGR image) from C++ to a Unity3D texture image?

I have a BGR image in uchar format from OpenCV C++.
The function is like int* texture(int* data, int width, int height); it processes the image on the C++ end and returns the pointer to the data. How do I convert this data to a texture in Unity, i.e. make it available to be used as a texture? I don't want to write it to a file. Please help.
Code snippet (I am using DLLs):
public static WebCamTexture webCamTexture;
private Color32[] data;
private int[] imageData;
private int[] imdat;

void Start () {
    ....
    data = new Color32[webCamTexture.width * webCamTexture.height];
    imageData = new int[data.Length * 3];
}

void Update()
{
    webCamTexture.GetPixels32(data);
    // Convert the Color32[] to int[] and emit it in bgr order
    for (int i = 0; i < data.Length; ++i)
    {
        imageData[i * 3] = (int)data[i].b;
        imageData[i * 3 + 1] = (int)data[i].g;
        imageData[i * 3 + 2] = (int)data[i].r;
    }
    // This is the function called from the DLL
    imdat = texture(imageData, webCamTexture.width, webCamTexture.height);
}
And the DLL end looks like:
char *tmp;

int* texture(int* imageData, int width, int height)
{
    int n = width * height * 3;
    tmp = new char[n];
    // imageData is inverted here and then copied into tmp as a 3-channel image
    for (int i = 0; i < (width * 3); ++i)
        for (int j = 0; j < height; ++j)
            tmp[i + j * (width * 3)] = (char)imageData[i + (height - j - 1) * (width * 3)];
    return (int*)tmp;
}
I'm not sure what format your texture is in, but if you can convert it into a byte[] you can use Texture2D.LoadImage(byte[]) to turn it into a working texture.
You should be able to achieve what you want with BitConverter.GetBytes() and Texture2D.LoadImage(). Make sure you take special note of the image format restrictions on the Unity manual page for the latter.
I'm not sure how the binding between your C++-land and C#-land code works, but you should be able to do something a little like this:
/* iImportedTexture = [Your c++ function here]; */
byte[] bImportedTexture = BitConverter.GetBytes(iImportedTexture);
Texture2D importedTexture = new Texture2D(2, 2);
importedTexture.LoadImage(bImportedTexture);

Cuda-memcheck not reporting out of bounds shared memory access

I am running the following code using shared memory:
__global__ void computeAddShared(int *in, int *out, int sizeInput){
    // Parameters are not named gidata and godata to emphasize that parameters
    // get a copy of the address and are different from the pointers in host code
    extern __shared__ float temp[];
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int ltid = threadIdx.x;
    temp[ltid] = 0;
    while(tid < sizeInput){
        temp[ltid] += in[tid];
        tid += gridDim.x * blockDim.x; // to handle an array of any size
    }
    __syncthreads();
    int offset = 1;
    while(offset < blockDim.x){
        if(ltid % (offset * 2) == 0){
            temp[ltid] = temp[ltid] + temp[ltid + offset];
        }
        __syncthreads();
        offset *= 2;
    }
    if(ltid == 0){
        out[blockIdx.x] = temp[0];
    }
}
int main(){
    int size = 16; // size of present input array. Changes after every loop iteration
    int cidata[] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
    /*FILE *f;
    f = fopen("invertedList.txt", "w");
    a[0] = 1 + (rand() % 8);
    fprintf(f, "%d,", a[0]);
    for (int i = 1; i < N; i++){
        a[i] = a[i-1] + (rand() % 8) + 1;
        fprintf(f, "%d,", a[i]);
    }
    fclose(f);*/
    int* gidata;
    int* godata;
    cudaMalloc((void**)&gidata, size * sizeof(int));
    cudaMemcpy(gidata, cidata, size * sizeof(int), cudaMemcpyHostToDevice);
    int TPB = 4;
    int blocks = 10; // to get things kicked off
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    while(blocks != 1){
        if(size < TPB){
            TPB = size; // size is 2^sth
        }
        blocks = (size + TPB - 1) / TPB;
        cudaMalloc((void**)&godata, blocks * sizeof(int));
        computeAddShared<<<blocks, TPB, TPB>>>(gidata, godata, size);
        cudaFree(gidata);
        gidata = godata;
        size = blocks;
    }
    //printf("The error by cuda is %s", cudaGetErrorString(cudaGetLastError()));
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float elapsedTime;
    cudaEventElapsedTime(&elapsedTime, start, stop);
    printf("time is %f ms", elapsedTime);
    int *output = (int*)malloc(sizeof(int));
    cudaMemcpy(output, gidata, sizeof(int), cudaMemcpyDeviceToHost);
    // Can't free either earlier as both point to the same location
    cudaError_t chk = cudaFree(godata);
    if(chk != 0){
        printf("First chk also printed error. Maybe error in my logic\n");
    }
    printf("The error by threadsyn is %s", cudaGetErrorString(cudaGetLastError()));
    printf("The sum of the array is %d\n", output[0]);
    getchar();
    return 0;
}
Clearly, the first while loop in computeAddShared is causing an out-of-bounds error because I am allocating only 4 bytes of shared memory. Why does cuda-memcheck not catch this? Below is the output of cuda-memcheck:
========= CUDA-MEMCHECK
time is 12.334816 msThe error by threadsyn is no errorThe sum of the array is 13
6
========= ERROR SUMMARY: 0 errors
Shared memory allocation granularity. The hardware undoubtedly has a page size for allocations (probably the same as the L1 cache line size). With only 4 threads per block, there will "accidentally" be enough shared memory in a single page to let your code work. If you used a sensible number of threads per block (i.e. a round multiple of the warp size), the error would be detected because there would not be enough allocated memory.
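As an aside, a hypothetical correction rather than part of the answer above: the kernel uses one shared element per thread (and declares the array as float while accumulating ints), so the launch presumably meant to request the size in bytes through the third launch parameter:

// inside the kernel: extern __shared__ int temp[];
size_t shmemBytes = TPB * sizeof(int);
computeAddShared<<<blocks, TPB, shmemBytes>>>(gidata, godata, size);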
