AVX slower than SSE multimedia extensions - sse

I am writing a program that is a perfect candidate for parallelization with multimedia extensions. The program transforms an image, so I go over a matrix and modify each pixel inside it. To go over it faster, I use multimedia extensions:
At first I used SSE3 extensions and achieved a 2.5x speedup. Next, I extended the SSE algorithm to use AVX extensions (double-size vectors), but I do not get any gain with respect to SSE3. The execution time of the program with SSE is more or less the same as with AVX.
Here is the summary of the code for SSE and AVX, respectively:
/* SSE version: convert the image in place, 4 pixels (12 interleaved
 * components) per iteration. The loop bound is n*n or n*n-12 depending on
 * whether n*n is a multiple of 12 (12 = 3 components * 4 pixels, 4 being the
 * vector width). */
for (i = 0; i < lim; i += 12) {
    int px; /* pixel index inside the current group of 4 */

    /* Gather one vector per colour component from the interleaved data.
     * _mm_set_ps takes the highest lane first, so lane 0 is the first pixel. */
    vectorR = _mm_set_ps(matrix[i+9],  matrix[i+6], matrix[i+3], matrix[i]);
    vectorG = _mm_set_ps(matrix[i+10], matrix[i+7], matrix[i+4], matrix[i+1]);
    vectorB = _mm_set_ps(matrix[i+11], matrix[i+8], matrix[i+5], matrix[i+2]);

    calcular_coeficientes_sse3(Ycoef0, Ycoef1, Ycoef2, Ucoef0, Vcoef0,
                               vectorR, vectorG, vectorB,
                               values0, values255, values128,
                               &vectorY, &vectorU, &vectorV);

    /* Spill the three result vectors to scratch arrays ... */
    _mm_store_ps(y_aux, vectorY);
    _mm_store_ps(u_aux, vectorU);
    _mm_store_ps(v_aux, vectorV);

    /* ... and write them back to the matrix, pixel by pixel, as Y,U,V. */
    for (px = 0; px < 4; px++) {
        matrix[i + 3*px]     = y_aux[px];
        matrix[i + 3*px + 1] = u_aux[px];
        matrix[i + 3*px + 2] = v_aux[px];
    }
}
/* AVX version: same conversion, 8 pixels (24 interleaved components) per
 * iteration. */
for (i = 0; i < lim; i += 24) {
    int px; /* pixel index inside the current group of 8 */

    /* Gather one vector per colour component from the interleaved data.
     * _mm256_set_ps takes the highest lane first, so lane 0 is the first pixel. */
    vectorR = _mm256_set_ps(matrix[i+21], matrix[i+18], matrix[i+15], matrix[i+12],
                            matrix[i+9],  matrix[i+6],  matrix[i+3],  matrix[i]);
    vectorG = _mm256_set_ps(matrix[i+22], matrix[i+19], matrix[i+16], matrix[i+13],
                            matrix[i+10], matrix[i+7],  matrix[i+4],  matrix[i+1]);
    vectorB = _mm256_set_ps(matrix[i+23], matrix[i+20], matrix[i+17], matrix[i+14],
                            matrix[i+11], matrix[i+8],  matrix[i+5],  matrix[i+2]);

    calcular_coeficientes_avx(Ycoef0, Ycoef1, Ycoef2, Ucoef0, Vcoef0,
                              vectorR, vectorG, vectorB,
                              values0, values255, values128,
                              &vectorY, &vectorU, &vectorV);

    /* Spill the three result vectors to scratch arrays ... */
    _mm256_store_ps(y_aux, vectorY);
    _mm256_store_ps(u_aux, vectorU);
    _mm256_store_ps(v_aux, vectorV);

    /* ... and write them back to the matrix, pixel by pixel, as Y,U,V. */
    for (px = 0; px < 8; px++) {
        matrix[i + 3*px]     = y_aux[px];
        matrix[i + 3*px + 1] = u_aux[px];
        matrix[i + 3*px + 2] = v_aux[px];
    }
}
void calcular_coeficientes_sse3(__m128 Ycoef0, __m128 Ycoef1, __m128 Ycoef2, __m128 Ucoef0, __m128 Vcoef0, __m128 vectorR,
__m128 vectorG, __m128 vectorB, __m128 values0, __m128 values255, __m128 values128,
__m128 *vectorY, __m128 *vectorU, __m128 *vectorV) {
//CALCULO DE Y3, Y2, Y1, Y0 (Cuatro píxeles consecutivos)
//PRIMERA VUELta
__m128 valores1 = _mm_mul_ps(Ycoef0, vectorR); // valores1 = (0.299*R[3], 0.299*R[2], 0.299*R[1], 0.299*R[0])
__m128 valores2 = _mm_mul_ps(Ycoef1, vectorG); // valores2 = (0.587*G[3], 0.587*G[2], 0.587*G[1], 0.587*G[0])
__m128 valores3 = _mm_mul_ps(Ycoef2, vectorB); // valores3 = (0.114*B[3], 0.114*B[2], 0.114*B[1], 0.114*B[0]);
valores1 = _mm_add_ps(valores1, valores2); // valores1 = (0.299*R[3] + 0.587*G[3], 0.299*R[2] + 0.587*G[2], 0.299*G[1]+ ..., ...)
*vectorY = _mm_add_ps(valores1, valores3); // vectorY = (Y[3], Y[2], Y[1], Y[0])
*vectorY = _mm_floor_ps(*vectorY);
//Calculo de U3, U2, U1, U0
//B-Y
valores1 = _mm_sub_ps(vectorB, *vectorY); // valores1 = (B[3]-Y[3], B[2]-Y[2], B[1]-Y[1], B[0]-Y[0])
valores1 = _mm_mul_ps(Ucoef0, valores1); // valores1 = (0.492*(B[3]-Y[3]), 0.492*(B[2]-Y[2]), 0.492*(B[1]-Y[1]), 0.492*(...))
*vectorU = _mm_add_ps(valores1, values128); // vectorU = (U[3], U[2], U[1], U[0])
//CALCULO DE V3, V2, V1, V0
// R-Y
valores1 = _mm_sub_ps(vectorR, *vectorY); // valores1 = (R[3]-Y[3], R[2]-Y[2], R[1]-Y[1], R[0]-Y[0])
valores1 = _mm_mul_ps(Vcoef0, valores1); // valores1 = (0.877*(R[3]-Y[3]), 0.877*(R[2]-Y[2]), 0.877*(R[1]-Y[1]), 0.877*(...))
valores1 = _mm_add_ps(valores1, values128); // valores1 = (0.877*(R[3]-Y[3]) + 128, 0.877*(R[2]-Y[2]) + 128, ..., ...)
//valores1 pueden necesitar saturacion.
//SATURACIONES a 0
//Para evitar hacer comparaciones cogemos el mayor entre 0 y el valor V[i]:
// Si V[i] > 0 se queda con V[i] pues es mayor que 0.
// Si V[i] < 0 se queda con 0 pues es mayor que un número negativo.
valores1 = _mm_max_ps(valores1, values0); // valores1 = (max(0.877*(R[3]-Y[3]) + 128,0), ..., ..., ...)
// SATURACIONES a 255
//Para evitar hacer comparacion cogemos el menor entre 255 y el valor V[i]
// Si V[i] < 255 entonces se queda con el menor, V[i]
// Si V[i] > 255 entonces se queda con el menor, 255.
*vectorV = _mm_min_ps(valores1, values255); //vectorV = (V[3], V[2], V[1], V[0])
//NOTA: Al estar las operaciones implementadas en hardware se hacen las operaciones max y min en un 1 ciclo.
//por lo que solo en dos ciclos comprobamos la saturacion de 4 valores V.
return; //El procedimiento termina devolviendo vectorY, vectorU y vectorV
}
void calcular_coeficientes_avx(__m256 Ycoef0, __m256 Ycoef1, __m256 Ycoef2, __m256 Ucoef0, __m256 Vcoef0, __m256 vectorR,
__m256 vectorG, __m256 vectorB, __m256 values0, __m256 values255, __m256 values128,
__m256 *vectorY, __m256 *vectorU, __m256 *vectorV) {
//CALCULO DE Y7, Y6, Y5, Y4, Y3, Y2, Y1, Y0 (Cuatro píxeles consecutivos)
__m256 valores1 = _mm256_mul_ps(Ycoef0, vectorR);
__m256 valores2 = _mm256_mul_ps(Ycoef1, vectorG);
__m256 valores3 = _mm256_mul_ps(Ycoef2, vectorB);
valores1 = _mm256_add_ps(valores1, valores2);
*vectorY = _mm256_add_ps(valores1, valores3);
*vectorY = _mm256_floor_ps(*vectorY);
//Calculo de U7, U6, U5, U4, U3, U2, U1, U0
valores1 = _mm256_sub_ps(vectorB, *vectorY);
valores1 = _mm256_mul_ps(Ucoef0, valores1);
*vectorU = _mm256_add_ps(valores1, values128);
//CALCULO DE V7, V6, V5, V4, V3, V2, V1, V0
// R-Y
valores1 = _mm256_sub_ps(vectorR, *vectorY);
valores1 = _mm256_mul_ps(Vcoef0, valores1);
valores1 = _mm256_add_ps(valores1, values128);
//valores1 pueden necesitar saturacion.
//SATURACIONES a 0
//Para evitar hacer comparaciones cogemos el mayor entre 0 y el valor V[i]:
valores1 = _mm256_max_ps(valores1, values0);
// SATURACIONES a 255
//Para evitar hacer comparacion cogemos el menor entre 255 y el valor V[i]
// Si V[i] < 255 entonces se queda con el menor, V[i]
// Si V[i] > 255 entonces se queda con el menor, 255.
*vectorV = _mm256_min_ps(valores1, values255); //vectorV = (V[3], V[2], V[1], V[0])
//NOTA: Al estar las operaciones implementadas en hardware se hacen las operaciones max y min en un 1 ciclo.
//por lo que solo en dos ciclos comprobamos la saturacion de 4 valores V.
return; //El procedimiento termina devolviendo vectorY, vectorU y vectorV
}
As you can see, the SSE and AVX versions are the same, except that the latter is extended to use a longer vector. Why is the execution time the same?
NOTE: I have tried on two different computers with AVX support (obviously) and I have the same problem.
Thank you very much.

Using floating point arithmetic for RGB to YUV is massive overkill, so I'll use fixed point arithmetic here. The output format is very annoying, there is probably a smarter way to shuffle the stuff into place than I used but no matter what you do it will also be annoying again when you want to do anything with the YUV data. It would make more sense to use a format such as [YYYYYYYY UUUUUUUU VVVVVVVV].
While there are many RGB to YUV converters already available, I gave it another try anyway. It can't hurt to have more alternatives available, and most of them don't quite match - for example doing chroma sub-sampling and usually using another output format.
I haven't tested it (other than checking that it compiles), so any comments about correctness or performance would be appreciated.
void toYUV(uint8_t* pixels, int size)
{
// r g b r g b r g b r g b r g b r | g b r g b r g b
short rw = 9798; // (short)round(0.299 * 2^15)
short gw = 19234;// (short)round(0.587 * 2^15)
short bw = 3736; // (short)round(0.114 * 2^15)
char _ = -1;
__m128i z = _mm_setzero_si128();
for (int i = 0; i < size; i += 24)
{
__m128i p0 = _mm_loadu_si128((__m128i*)(pixels + i));
__m128i p1 = _mm_loadl_epi64((__m128i*)(pixels + i + 16));
// widen
// w0 = r0 g0 b0 r1 g1 b1 r2 g2
__m128i w0 = _mm_unpacklo_epi8(p0, z);
// w1 = b2 r3 g3 b3 r4 g4 b4 r5
__m128i w1 = _mm_unpackhi_epi8(p0, z);
// w2 = g5 b5 r6 g6 b6 r7 g7 b7
__m128i w2 = _mm_unpacklo_epi8(p1, z);
__m128i zRGBRGBz = _mm_setr_epi16(0, rw, gw, bw, rw, gw, bw, 0);
// calculate Y
// 0+r0 g0+b0 r1+g1 b1+0
__m128i s0 = _mm_madd_epi16(_mm_bslli_si128(w0, 2), zRGBRGBz);
// 0+r2 g2+b2 r3+g3 b3+0
__m128i s1 = _mm_madd_epi16(_mm_alignr_epi8(w1, w0, 10), zRGBRGBz);
// 0+r4 g4+b4 r5+g5 b5+0
__m128i s2 = _mm_madd_epi16(_mm_alignr_epi8(w2, w1, 6), zRGBRGBz);
// 0+r6 g6+b6 r7+g7 b7+0
__m128i s3 = _mm_madd_epi16(_mm_bsrli_si128(w2, 2), zRGBRGBz);
// the math works out to make the Y's in [0 .. 254] after scaling back, so
// y0 = y0 0 y1 0 y2 0 y3 0
// y1 = y4 0 y5 0 y6 0 y7 0
__m128i y0 = _mm_srli_epi32(_mm_hadd_epi32(s0, s1), 15);
__m128i y1 = _mm_srli_epi32(_mm_hadd_epi32(s2, s3), 15);
__m128i y = _mm_packus_epi32(y0, y1);
y = _mm_packus_epi16(y, y);
// calculate U
// todo: do smarter shuffles?
// b0 = 0 b0 0 b1 0 b2 0 b3
__m128i b0 = _mm_or_si128(
_mm_shuffle_epi8(w0, _mm_setr_epi8(_, _, 4, 5, _, _, 10, 11, _, _, _, _, _, _, _, _)),
_mm_shuffle_epi8(w1, _mm_setr_epi8(_, _, _, _, _, _, _, _, _, _, 0, 1, _, _, 6, 7)));
// b1 = 0 b4 0 b5 0 b6 0 b7
__m128i b1 = _mm_or_si128(
_mm_shuffle_epi8(w1, _mm_setr_epi8(_, _, 13, 14, _, _, _, _, _, _, _, _, _, _, _, _)),
_mm_shuffle_epi8(w2, _mm_setr_epi8(_, _, _, _, _, _, 2, 3, _, _, 8, 9, _, _, 14, 15)));
// b - y in [-225 .. 226]
// u = 18492 * (b - y) >> 15 -> (18492 * b + -18492 * y) >> 15
// .. so u in [-128 .. 127]
short us = 18492;
__m128i u0 = _mm_madd_epi16(_mm_or_si128(b0, y0), _mm_setr_epi16(-us, us, -us, us, -us, us, -us, us));
u0 = _mm_srai_epi32(u0, 15);
__m128i u1 = _mm_madd_epi16(_mm_or_si128(b1, y1), _mm_setr_epi16(-us, us, -us, us, -us, us, -us, us));
u1 = _mm_srai_epi32(u1, 15);
// pack to sbytes
__m128i u = _mm_packs_epi32(u0, u1);
u = _mm_packs_epi16(u, u);
// calculate V
// todo: do smarter shuffles?
// r0 = 0 r0 0 r1 0 r2 0 r3
__m128i r0 = _mm_or_si128(
_mm_shuffle_epi8(w0, _mm_setr_epi8(_, _, 0, 1, _, _, 6, 7, _, _, 12, 13, _, _, _, _)),
_mm_shuffle_epi8(w1, _mm_setr_epi8(_, _, _, _, _, _, _, _, _, _, _, _, _, _, 2, 3)));
// r1 = 0 r4 0 r5 0 r6 0 r7
__m128i r1 = _mm_or_si128(
_mm_shuffle_epi8(w1, _mm_setr_epi8(_, _, 8, 9, _, _, 14, 15, _, _, _, _, _, _, _, _)),
_mm_shuffle_epi8(w2, _mm_setr_epi8(_, _, _, _, _, _, _, _, _, _, 4, 5, _, _, 10, 11)));
// r - y in [-178 .. 179]
// v = 23372 * (r - y) >> 15 -> (23372 * r + -23372 * y) >> 15
// .. so v in [-128 .. 127]
short vs = 23372;
__m128i v0 = _mm_madd_epi16(_mm_or_si128(r0, y0), _mm_setr_epi16(-vs, vs, -vs, vs, -vs, vs, -vs, vs));
v0 = _mm_srai_epi32(v0, 15);
__m128i v1 = _mm_madd_epi16(_mm_or_si128(r1, y1), _mm_setr_epi16(-vs, vs, -vs, vs, -vs, vs, -vs, vs));
v1 = _mm_srai_epi32(v1, 15);
// pack to sbytes
__m128i v = _mm_packs_epi32(v0, v1);
v = _mm_packs_epi16(v, v);
// interleave :(
// y0 u0 v0 y1 u1 v1 y2 u2 v2 y3 u3 v3 y4 u4 v4 y5
// u5 v5 y6 u6 v6 y7 u7 v7
__m128i shuf0 = _mm_setr_epi8(0, _, _, 1, _, _, 2, _, _, 3, _, _, 4, _, _, 5);
p0 = _mm_or_si128(_mm_or_si128(
_mm_shuffle_epi8(y, shuf0),
_mm_shuffle_epi8(u, _mm_bslli_si128(shuf0, 1))),
_mm_shuffle_epi8(v, _mm_bslli_si128(shuf0, 2)));
__m128i shuf1 = _mm_setr_epi8(_, 5, _, _, 6, _, _, 7, _, _, _, _, _, _, _, _);
p1 = _mm_or_si128(_mm_or_si128(
_mm_shuffle_epi8(y, _mm_srli_si128(shuf1, 2)),
_mm_shuffle_epi8(u, _mm_srli_si128(shuf1, 1))),
_mm_shuffle_epi8(v, shuf1));
_mm_storeu_si128((__m128i*)(pixels + i), p0);
_mm_storel_epi64((__m128i*)(pixels + i + 16), p1);
}
}
This isn't great, it's far too p5-heavy with all those shuffles and (un)packs. On Ivy that's not such a problem. Using a less interleaved output format would help a lot there, saving 6 shuffles. Of course since this is fixed point, you can't really convert this to AVX, but you could try AVX2.
Here's a "more vertical" version, based on pmulhrsw instead of pmaddwd, also not tested (it's probably broken). The asm looks better, but I haven't tested performance either.
void toYUV(uint8_t* pixels, int size)
{
// r g b r g b r g b r g b r g b r | g b r g b r g b
char _ = -1;
__m128i rshuf0 = _mm_setr_epi8(0, _, 3, _, 6, _, 9, _, 12, _, 15, _, _, _, _, _);
__m128i rshuf1 = _mm_setr_epi8(_, _, _, _, _, _, _, _, _, _, _, _, 2, _, 5, _);
__m128i gshuf0 = _mm_setr_epi8(1, _, 4, _, 7, _, 10, _, 13, _, _, _, _, _, _, _);
__m128i gshuf1 = _mm_setr_epi8(_, _, _, _, _, _, _, _, _, _, 0, _, 3, _, 6, _);
__m128i bshuf0 = _mm_setr_epi8(2, _, 5, _, 8, _, 11, _, 14, _, _, _, _, _, _, _);
__m128i bshuf1 = _mm_setr_epi8(_, _, _, _, _, _, _, _, _, _, 1, _, 2, _, 5, _);
__m128i ryweight = _mm_set1_epi16(9798); // (short)round(0.299 * 2^15)
__m128i gyweight = _mm_set1_epi16(19235);// (short)round(0.587 * 2^15)
__m128i byweight = _mm_set1_epi16(3736); // (short)round(0.114 * 2^15)
__m128i uscale = _mm_set1_epi16(18492); // (short)floor(2^15 / (2^15 - byweight) * 0.5 * 2^15)
__m128i uofs = _mm_set1_epi16(128);
__m128i vscale = _mm_set1_epi16(23372); // (short)floor(2^15 / (2^15 - ryweight) * 0.5 * 2^15)
__m128i vofs = _mm_set1_epi16(128);
__m128i yshuf0 = _mm_setr_epi8(0, _, _, 2, _, _, 4, _, _, 6, _, _, 8, _, _, _);
__m128i yshuf1 = _mm_setr_epi8(10, _, _, 12, _, _, 14, _, _, _, _, _, _, _, _, _);
__m128i ushuf0 = _mm_setr_epi8(_, 0, _, _, 1, _, _, 2, _, _, 3, _, _, 4, _, _);
__m128i ushuf1 = _mm_setr_epi8(5, _, _, 6, _, _, 7, _, _, _, _, _, _, _, _, _);
for (int i = 0; i < size; i += 24)
{
__m128i p0 = _mm_loadu_si128((__m128i*)(pixels + i));
__m128i p1 = _mm_loadl_epi64((__m128i*)(pixels + i + 16));
__m128i r = _mm_or_si128(_mm_shuffle_epi8(p0, rshuf0), _mm_shuffle_epi8(p1, rshuf1));
__m128i g = _mm_or_si128(_mm_shuffle_epi8(p0, gshuf0), _mm_shuffle_epi8(p1, gshuf1));
__m128i b = _mm_or_si128(_mm_shuffle_epi8(p0, bshuf0), _mm_shuffle_epi8(p1, bshuf1));
__m128i scaledr = _mm_mulhrs_epi16(r, ryweight);
__m128i scaledg = _mm_mulhrs_epi16(g, gyweight);
__m128i scaledb = _mm_mulhrs_epi16(b, byweight);
__m128i y = _mm_add_epi16(_mm_add_epi16(scaledr, scaledg), scaledb);
__m128i u = _mm_mulhrs_epi16(_mm_sub_epi16(b, y), uscale);
__m128i v = _mm_mulhrs_epi16(_mm_sub_epi16(r, y), vscale);
// pack back to bytes and shuffle
// words in y are guaranteed to be in [0 - 255] so just shuffle them into place
// swords in u and v may be slightly wonky so pack with saturation first
u = _mm_packs_epi16(u, u);
v = _mm_packs_epi16(v, v);
p0 = _mm_or_si128(_mm_or_si128(
_mm_shuffle_epi8(y, yshuf0),
_mm_shuffle_epi8(u, ushuf0)),
_mm_shuffle_epi8(v, _mm_bslli_si128(ushuf0, 1)));
p1 = _mm_or_si128(_mm_or_si128(
_mm_shuffle_epi8(y, yshuf1),
_mm_shuffle_epi8(u, ushuf1)),
_mm_shuffle_epi8(v, _mm_bslli_si128(ushuf1, 1)));
_mm_storeu_si128((__m128i*)(pixels + i), p0);
_mm_storel_epi64((__m128i*)(pixels + i + 16), p1);
}
}

Related

optimize Octave code for Zhang Suen thinning algorithm

I would like to know if it is possible to vectorize (cf page 599 of this doc) operations that need a whole matrix scan, but with a lot of conditions to check concerning neighbours pixels. The goal is to make it faster, because the code is working when I use a for loop with 10 iterations, but I tried using a while loop, and it never ends. It might also be me screwing the stop condition, but I think the operations could still be faster. The operations I want to optimize are described here.
Here is the code that I want to optimize:
% Excerpt: first sub-iteration of the thinning loop (the second sub-iteration
% and the closing endwhile are part of the full script).
% stopCond counts the pixels flagged during the pass; the loop is meant to
% stop once a pass flags nothing.
while (stopCond>0)
  stopCond = 0;
  ap1 = 0;
  bp1 = 0;
  tabPixel = [];   % one column [x; y] per pixel to clear after the scan
  for x=2:NL-1
    for y= 2:NC-1
      p1 = imgSeuil(x,y); %current pixel
      p2 = imgSeuil(x-1, y); %pixel neighbours, clockwise starting above p1
      p3 = imgSeuil(x-1, y+1);
      p4 = imgSeuil(x, y+1);
      p5 = imgSeuil(x+1, y+1);
      p6 = imgSeuil(x+1, y);
      p7 = imgSeuil(x+1, y-1);
      p8 = imgSeuil(x, y-1);
      p9 = imgSeuil(x-1, y-1);
      tabNeighbour = [p2, p3, p4, p5, p6, p7, p8, p9];
      % ap1: number of 0 -> 1 transitions in the cycle p2, p3, ..., p9, p2
      tmpTabl = diff([tabNeighbour, p2]);
      tmpTabl = max(tmpTabl, 0);
      ap1 = sum(tmpTabl);
      % bp1: number of neighbours that are set
      bp1 = sum(tabNeighbour);
      %%%--------Can I vectorize ops with below conditions--------
      if((p1==1)&&(bp1>=2)&&(bp1<=6)&&(ap1==1)&&
         ((p2==0)||(p4==0)||(p6==0))&&
         ((p4==0)||(p6==0)||(p8==0)))
        %%%remember the current pixel; it is only cleared once the scan is over
        tabPixel = [tabPixel, [x; y]];
        stopCond += 1;
      endif
    endfor
  endfor
  % Clear every flagged pixel. The loop has to start at column 1: starting at
  % 2 left the first flagged pixel set, so it could be flagged again on every
  % pass and keep stopCond above 0 forever.
  for i=1:columns(tabPixel)
    imgSeuil(tabPixel(1, i), tabPixel(2, i)) = 0;
  endfor
I also read that using boolean indexing is encouraged, and I would like to know if those changes would significantly improve execution time.
Here is the whole code if you want to run it :
% Load the image, plot its grey-level histogram and threshold it.
clear all;
close all;
img=imread("/home/redouane/Documents/L3/S6/TIA/TD/ED_3_6_originale.png");
imshow(img);
colorbar();
sizeImg = size(img);
NL=sizeImg(1,1); % number of rows
NC=sizeImg(1,2); % number of columns
tab=zeros(2,256); % grey-level table: row 1 = level, row 2 = pixel count
tab(1,1:256)=0:255;
% fill the grey-level table
% and display the histogram
for y=1:NL
  for x= 1:NC
    % Convert to double before adding 1: with an integer-class image (e.g.
    % uint8) val+1 saturates at the class maximum, so level 255 would be
    % counted in bin 255 instead of bin 256.
    val=double(img(y,x));
    tab(2,val+1)=tab(2,val+1)+1;
  end
end
% x-axis ticks every 10 grey levels
ticktab=zeros(1,25);
for i=1:25
  ticktab(1, i)=10*i;
end
figure(2);
plot(tab(1, 1:256),tab(2, 1:256));
set(gca,'XTick',ticktab(1, 1:25));
xlim([0, 255]);
% threshold the image: levels below 30 become 0, everything else 255
figure(3);
imgSeuil=img;
for y=1:NL
  for x= 1:NC
    val=imgSeuil(y,x);
    if(val<30)
      imgSeuil(y,x)=0;
    else
      imgSeuil(y,x)=255;
    end
  end
end
imgSeuil=~imgSeuil; % invert so that the lines are white (1)
imshow(imgSeuil);
% skeletonization: cf. Zhang Suen algorithm on Rosetta Code
%%%--------------
% Each pass of the while loop runs the two sub-iterations of the algorithm.
% A sub-iteration scans the interior pixels, records the ones that satisfy the
% removal conditions and clears them only after the scan. stopCond counts the
% pixels flagged during the pass; the loop ends when a pass flags nothing.
stopCond = 1;
while (stopCond>0)
  stopCond = 0;
  for step=1:2
    tabPixel = [];   % one column [x; y] per pixel to clear after the scan
    for x=2:NL-1
      for y= 2:NC-1
        if (imgSeuil(x,y) != 1)
          continue;   % only set pixels can be removed
        endif
        % neighbours of the pixel, clockwise starting above it
        p2 = imgSeuil(x-1, y);
        p3 = imgSeuil(x-1, y+1);
        p4 = imgSeuil(x, y+1);
        p5 = imgSeuil(x+1, y+1);
        p6 = imgSeuil(x+1, y);
        p7 = imgSeuil(x+1, y-1);
        p8 = imgSeuil(x, y-1);
        p9 = imgSeuil(x-1, y-1);
        tabNeighbour = [p2, p3, p4, p5, p6, p7, p8, p9];
        % ap1: number of 0 -> 1 transitions in the cycle p2, p3, ..., p9, p2.
        % The max(..., 0) is required in both sub-iterations: the plain sum of
        % the differences around a closed cycle is always 0.
        ap1 = sum(max(diff([tabNeighbour, p2]), 0));
        % bp1: number of neighbours that are set
        bp1 = sum(tabNeighbour);
        if (step == 1)
          condDir = ((p2==0)||(p4==0)||(p6==0)) && ((p4==0)||(p6==0)||(p8==0));
        else
          condDir = ((p2==0)||(p4==0)||(p8==0)) && ((p2==0)||(p6==0)||(p8==0));
        endif
        if ((bp1>=2)&&(bp1<=6)&&(ap1==1)&&condDir)
          tabPixel = [tabPixel, [x; y]];
          stopCond += 1;
        endif
      endfor
    endfor
    % Clear every flagged pixel, starting at column 1 so that none is skipped.
    for i=1:columns(tabPixel)
      imgSeuil(tabPixel(1, i), tabPixel(2, i)) = 0;
    endfor
  endfor
endwhile
figure(4);
imshow(imgSeuil);
%%%-------------
##tabSquel=zeros(1,10);
##hold on;
##for y=2:NL-1
## for x= 2:NC-1
## % on utilise ces valeurs pour ne pas acceder aux bords de l'image
## %Pixel1 (P1) correspond à imgSeuil(y,x), c'est le pixel du milieu et on etudie ses voisins
## A=0;%nombre de transi de 0 à 1
## B=0;%nombre de voisins
##
## tabSquel(1,2)=imgSeuil(y,x);%p1
## tabSquel(1,2)=imgSeuil(y-1,x);%p2
## tabSquel(1,3)=imgSeuil(y-1,x+1);%p3
## tabSquel(1,4)=imgSeuil(y,x+1);%p4
## tabSquel(1,5)=imgSeuil(y+1,x+1);%p5
## tabSquel(1,6)=imgSeuil(y+1,x);%p6
## tabSquel(1,7)=imgSeuil(y+1,x-1);%p7
## tabSquel(1,8)=imgSeuil(y,x-1);%p8
## tabSquel(1,9)=imgSeuil(y-1,x-1);%p9
## tabSquel(1,10)=imgSeuil(y-1,x);
##
##
##sum = (0.5*(abs(tabSquel(1,6)-tabSquel(1,1)) + abs(tabSquel(1,7)-tabSquel(1,6)) + abs(tabSquel(1,8)-tabSquel(1,7)) + abs(tabSquel(1,9)-tabSquel(1,8)) + abs(tabSquel(1,2)-tabSquel(1,9)) + abs(tabSquel(1,3)-tabSquel(1,2)) + abs(tabSquel(1,4)-tabSquel(1,3)) + abs(tabSquel(1,5)-tabSquel(1,4))));
## if(sum==3)
## plot(y,x, "ro-");
## end
## end
##end
%imshow(imgSeuil);
I decided to try out the idea of generating a lookup table for each of the 256 possible neighbor combinations. Given a kernel:
encoding_kernel =
1 128 64
2 0 32
4 8 16
we can use filter2 to encode each neighborhood of 8 pixels to an integer in the range 0..255.
Now we just need to determine which bit combinations satisfy the conditions. You'll notice that P1 is multiplied by 0 in the kernel. We'll just use the thresholded image directly to get the value of P1. I've only coded the LUT for Step 1 in the algorithm, but Step 2 is virtually identical.
% generate Step 1 lookup table for encoded neighborhood of P1
%
% encoding_kernel
% 1 128 64
% 2 0 32
% 4 8 16
% One row per key 0..255, most significant bit first, so the columns are
% [P2 P3 P4 P5 P6 P7 P8 P9]; subtracting '0' turns the characters into numbers.
binary_keys = dec2bin(0:255, 8) - '0';
% B(P1): number of set neighbours, which must lie in 2..6
B_P1 = sum(binary_keys, 2);
enough_neighbours = (B_P1 >= 2) & (B_P1 <= 6);
% A(P1): number of 0 -> 1 transitions around the closed cycle P2..P9,P2
% (a transition shows up as +1 in the row-wise difference); exactly one is required
wrapped_keys = [binary_keys, binary_keys(:, 1)];
A_P1 = sum(diff(wrapped_keys, 1, 2) == 1, 2) == 1;
% at least one of P2, P4, P6 is white (0)
p2_p4_p6_has_white = ~all(binary_keys(:, [1, 3, 5]), 2);
% at least one of P4, P6, P8 is white (0)
p4_p6_p8_has_white = ~all(binary_keys(:, [3, 5, 7]), 2);
% a key is accepted only when all four conditions hold
LUT_Step1 = enough_neighbours & A_P1 & p2_p4_p6_has_white & p4_p6_p8_has_white;
I haven't manually verified all of the values in the LUT, but from a spot check it seems to be correct. On my machine, a 3GHz Core i5, generating the table took about 0.7-0.8 msec. You could, of course, hardcode the resulting table in your script if you wished.
Once you have the lookup table and the kernel, checking the conditions is pretty easy. Just encode the current image, and then look up the encoded value in the table and if it's 1, then the image should be 0 at that location. (We don't really need to make sure that the image is 1 (black) before applying the other conditions, because changing 0 to 0 doesn't change the outcome.)
clear all;
close all;
% run the table-generation script, which defines LUT_Step1
zs_lookup_table;
img = imread("https://i.stack.imgur.com/ejNSg.png");
imshow(img);
colorbar();
% threshold: pixels darker than 30 become 1, everything else 0
figure(2)
imgSeuil = img < 30;
imshow(imgSeuil);
% weights that encode the 8 neighbours of a pixel as one integer in 0..255
% (the centre weight is 0, so the pixel itself is not part of the code)
encoding_kernel = [1, 128, 64; 2, 0, 32; 4, 8, 16];
% "valid" keeps only positions that have all 8 neighbours, so the result is
% one pixel smaller on each side than the image
encoded_img = filter2(encoding_kernel, imgSeuil, "valid");
% look every code up in the table (codes are 0..255, indices 1..256) and put
% the answers back at the interior positions of a full-size mask
interior_hits = LUT_Step1(encoded_img + 1);
conds1_4 = false(size(imgSeuil));
conds1_4(2:end-1, 2:end-1) = interior_hits;
% clearing a pixel that is already 0 changes nothing, so the pixel value
% itself is not tested
imgSeuil(conds1_4) = 0;
figure(3)
imshow(imgSeuil);
The timings for each pass through Step 1 varied a bit more than generating the table, probably because I updated the image between iterations and there were fewer pixels changing in later iterations. Each pass took between 0.7 and 1.7 msec. Again, I didn't code Step 2 and I didn't check for changes between iterations, but adding it all together you should be able to reach equilibrium in well under a second.

Creating a crc8 function with lua

Is there a simple algorithm to create a crc8 checksum from a table in lua?
Polynomial should be x^8+x^5+x^4+1 (0x31)
This algorithm will be used to check the UID of the DS28CM00 UID-chip.
Here you can find a table returned by the chip (LS-byte last) :
table = {112,232,9,80,1,0,0}
Thanks for any help
For Lua 5.3+
-- CRC-8 of a sequence of byte values, processed least significant bit first.
-- 0x8C is the polynomial x^8+x^5+x^4+1 (0x31) with its bits reversed.
--   t: array of integers
--   returns: the checksum as an integer
local function crc8(t)
   local crc = 0
   for _, byte in ipairs(t) do
      for shift = 0, 7 do
         -- feedback bit: lowest CRC bit XOR the current data bit
         local feedback = (crc ~ (byte >> shift)) & 1
         crc = (crc >> 1) ~ (feedback * 0x8C)
      end
   end
   return crc
end
print(crc8{112, 232, 9, 80, 1, 0, 0}) --> 219
print(crc8{2, 0x1C, 0xB8, 1, 0, 0, 0}) --> 0xA2 as in example from AN-27
For Lua 5.2-
-- CRC-8 of a sequence of byte values, processed least significant bit first,
-- written with arithmetic only (no bitwise operators).
--   t: array of integers
--   returns: the checksum
local function crc8(t)
   local crc = 0
   for _, value in ipairs(t) do
      local bits = value
      for _step = 1, 8 do
         -- take the lowest bit off both the CRC and the data byte
         local crc_lsb = crc % 2
         local data_lsb = bits % 2
         crc = (crc - crc_lsb) / 2
         bits = (bits - data_lsb) / 2
         if crc_lsb + data_lsb == 1 then
            -- the two bits differ: XOR the CRC with 0x8C, i.e. set bit 7
            -- (always clear after the shift) and toggle bits 3 and 2
            local toggle_bit3 = (crc % 16 < 8) and 8 or -8
            local toggle_bit2 = (crc % 8 < 4) and 4 or -4
            crc = crc + 0x80 + toggle_bit3 + toggle_bit2
         end
      end
   end
   return crc
end

SHA512 pure Lua 5.1 adaptation

I was searching for a pure Lua 5.1 adaptation for SHA512 and yielded no results anywhere I went. I found a similar question where someone tried to convert the SHA256 adaptation into SHA512 (except he was using Lua 5.3):
Adaptation of SHA2 512 gives incorrect results
Basically I couldn't use bitwise operators (not implemented in Lua 5.1) so I had to write my own implementations of them.
This is my code:
local MOD = 2^64;
local MODM = MOD-1;
-- Returns a table that behaves like a cache in front of f: reading a missing
-- key calls f(key), stores the result under that key and returns it.
local function memoize(f)
  local cache = {}
  return setmetatable(cache, {
    __index = function(_, key)
      local value = f(key)
      cache[key] = value
      return value
    end
  })
end
-- Builds a two-argument operator that combines its arguments digit by digit
-- in base m, looking each pair of digits up as t[digit_a][digit_b].
-- Once one argument is exhausted, what is left of the other is appended
-- unchanged.
local function make_bitop_uncached(t, m)
  return function(a, b)
    local result, place = 0, 1
    while a ~= 0 and b ~= 0 do
      local a_digit = a % m
      local b_digit = b % m
      result = result + t[a_digit][b_digit] * place
      a = (a - a_digit) / m
      b = (b - b_digit) / m
      place = place * m
    end
    return result + (a + b) * place
  end
end
-- Builds a faster operator from the single-bit table t: a bit-by-bit operator
-- is wrapped in a two-level memoized lookup, which then serves as the digit
-- table of an operator working in base 2^(t.n or 1).
local function make_bitop(t)
  local bitwise = make_bitop_uncached(t,2^1)
  local cached_digits = memoize(function(a)
    return memoize(function(b)
      return bitwise(a, b)
    end)
  end)
  local base = 2 ^ (t.n or 1)
  return make_bitop_uncached(cached_digits, base)
end
-- two-argument XOR, working on 4 bits (n = 4) per step
local bxor1 = make_bitop({[0] = {[0] = 0,[1] = 1}, [1] = {[0] = 1, [1] = 0}, n = 4})
-- XOR of any number of arguments, each reduced modulo MOD first.
-- With a single argument that argument (mod MOD) is returned, with none 0.
local function bxor(a, b, c, ...)
  if not b then
    if a then return a % MOD end
    return 0
  end
  local z = bxor1(a % MOD, b % MOD)
  if c then
    -- fold the remaining arguments in one at a time
    z = bxor(z, c, ...)
  end
  return z
end
-- AND of any number of arguments, each reduced modulo MOD first.
-- Uses the identity a + b = (a XOR b) + 2 * (a AND b).
-- With a single argument that argument (mod MOD) is returned, with none MODM
-- (all bits set).
local function band(a, b, c, ...)
  local z
  if b then
    a = a % MOD
    b = b % MOD
    z = ((a + b) - bxor1(a,b)) / 2
    -- fold the remaining arguments in by recursing into this function
    -- (the name bit32_band used here before is not defined anywhere)
    if c then z = band(z, c, ...) end
    return z
  elseif a then return a % MOD
  else return MODM end
end
-- Bitwise NOT modulo MOD.
local function bnot(x) return (-1 - x) % MOD end
-- rshift1 and lshift refer to each other, so lshift must be declared before
-- rshift1 is defined; otherwise the name inside rshift1 resolves to a global.
local lshift
-- Logical right shift of an already reduced value; a negative displacement
-- shifts left instead.
local function rshift1(a, disp)
  if disp < 0 then return lshift(a,-disp) end
  return math.floor(a % MOD / 2 ^ disp)
end
-- Logical right shift of x by disp bits within the MOD (2^64) word size used
-- by the other bit operations in this file; shifting by the full width or
-- more gives 0.
local function rshift(x, disp)
  if disp > 63 or disp < -63 then return 0 end
  return rshift1(x % MOD, disp)
end
-- Left shift of a by disp bits, truncated to the word size; a negative
-- displacement shifts right instead.
-- NOTE: Lua 5.1 numbers are doubles, so values that need more than 53
-- significant bits cannot be represented exactly.
function lshift(a, disp)
  if disp < 0 then return rshift(a,-disp) end
  return (a * 2 ^ disp) % MOD
end
-- UTILITY FUNCTIONS
--
-- transform a string of bytes in a string of hexadecimal digits
-- Returns s with every byte replaced by its two-digit lowercase hex code.
local function str2hexa (s)
  local function byte_to_hex(ch)
    return string.format("%02x", string.byte(ch))
  end
  -- the parentheses drop gsub's second result (the substitution count)
  return (string.gsub(s, ".", byte_to_hex))
end
-- transforms number 'l' into a big-endian sequence of 'n' bytes
--(coded as a string)
-- Encodes the n low-order bytes of l as a big-endian string of n characters.
local function num2string(l, n)
  local bytes = {}
  -- peel bytes off the low end of l and fill the output from its last
  -- position backwards
  for pos = n, 1, -1 do
    local low_byte = l % 256
    bytes[pos] = string.char(low_byte)
    l = (l - low_byte) / 256
  end
  return table.concat(bytes)
end
-- transform the big-endian sequence of eight bytes starting at
-- index 'i' in 's' into a number
-- Reads the eight bytes of s starting at index i as one big-endian number.
local function s264num (s, i)
  local value = 0
  local last = i + 7
  for pos = i, last do
    value = value * 256 + string.byte(s, pos)
  end
  return value
end
--
-- MAIN SECTION
--
-- FIRST STEP: INITIALIZE HASH VALUES
--(first 64 bits of the fractional parts of the square roots of the
-- first 8 primes 2..19)
local HH = {}
-- Returns a fresh table holding the eight initial hash words.
-- The argument is not used: the result is always a newly created table.
local function initH512(H)
  return {
    0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
    0x510e527fade682d1, 0x9b05688c2b3e6c1f, 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
  }
end
-- SECOND STEP: INITIALIZE ROUND CONSTANTS
--(first 64 bits of the fractional parts of the cube roots of the first 80 primes 2..409)
local k = {
0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc, 0x3956c25bf348b538,
0x59f111f1b605d019, 0x923f82a4af194f9b, 0xab1c5ed5da6d8118, 0xd807aa98a3030242, 0x12835b0145706fbe,
0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2, 0x72be5d74f27b896f, 0x80deb1fe3b1696b1, 0x9bdc06a725c71235,
0xc19bf174cf692694, 0xe49b69c19ef14ad2, 0xefbe4786384f25e3, 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65,
0x2de92c6f592b0275, 0x4a7484aa6ea6e483, 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5, 0x983e5152ee66dfab,
0xa831c66d2db43210, 0xb00327c898fb213f, 0xbf597fc7beef0ee4, 0xc6e00bf33da88fc2, 0xd5a79147930aa725,
0x06ca6351e003826f, 0x142929670a0e6e70, 0x27b70a8546d22ffc, 0x2e1b21385c26c926, 0x4d2c6dfc5ac42aed,
0x53380d139d95b3df, 0x650a73548baf63de, 0x766a0abb3c77b2a8, 0x81c2c92e47edaee6, 0x92722c851482353b,
0xa2bfe8a14cf10364, 0xa81a664bbc423001, 0xc24b8b70d0f89791, 0xc76c51a30654be30, 0xd192e819d6ef5218,
0xd69906245565a910, 0xf40e35855771202a, 0x106aa07032bbd1b8, 0x19a4c116b8d2d0c8, 0x1e376c085141ab53,
0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8, 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb, 0x5b9cca4f7763e373,
0x682e6ff3d6b2b8a3, 0x748f82ee5defb2fc, 0x78a5636f43172f60, 0x84c87814a1f0ab72, 0x8cc702081a6439ec,
0x90befffa23631e28, 0xa4506cebde82bde9, 0xbef9a3f7b2c67915, 0xc67178f2e372532b, 0xca273eceea26619c,
0xd186b8c721c0c207, 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178, 0x06f067aa72176fba, 0x0a637dc5a2c898a6,
0x113f9804bef90dae, 0x1b710b35131c471b, 0x28db77f523047d84, 0x32caab7b40c72493, 0x3c9ebe0a15c9bebc,
0x431d67c49c100d4c, 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a, 0x5fcb6fab3ad6faec, 0x6c44198c4a475817
}
-- THIRD STEP: PRE-PROCESSING (padding)
-- Pads the message to a whole number of 128-byte blocks:
--   * one byte 0x80 (a single '1' bit followed by zeros),
--   * zero bytes, so that with the 0x80 byte and the length field the total
--     is a multiple of 128; the last 8 of these zero bytes are the upper half
--     of the 16-byte length field,
--   * the message length in bits as 8 big-endian bytes.
-- len is the length of toProcess in bytes.
local function preprocess(toProcess, len)
  local zero_count = - (len + 17) % 128 + 8
  local bit_length = num2string(8 * len, 8)
  local padded = toProcess .. "\128" .. string.rep("\0", zero_count) .. bit_length
  assert(#padded % 128 == 0)
  return padded
end
-- Rotates the 64-bit value rot right by n bits (0 <= n <= 64).
-- The n low bits are cut off and re-inserted above the remaining bits. The
-- two parts never overlap, so adding them is the same as OR-ing them.
-- (Lua's `or` is a logical operator: for two numbers it just returns the
-- first one, so it cannot be used to merge bit patterns.)
-- NOTE: Lua 5.1 numbers are doubles, so the result is exact only when it fits
-- in 53 significant bits.
local function rrotate(rot, n)
  local x = rot % 2 ^ 64
  local low = x % 2 ^ n
  return (x - low) / 2 ^ n + low * 2 ^ (64 - n)
end
-- Processes the 128-byte block of msg that starts at index i and updates the
-- eight hash words in H in place.
-- Every addition is reduced modulo MOD so that the working values stay within
-- the word size instead of growing without bound.
-- NOTE: Lua 5.1 numbers are doubles; sums of values with more than 53
-- significant bits are rounded before the reduction, so the low bits of the
-- result are not reliable with this number representation.
local function digestblock(msg, i, H)
  -- message schedule: 16 words read from the block, extended to 80
  local w = {}
  for j = 1, 16 do w[j] = s264num(msg, i + (j - 1) * 8) end
  for j = 17, 80 do
    local v = w[j - 15]
    local s0 = bxor(rrotate(v, 1), rrotate(v, 8), rshift(v, 7))
    v = w[j - 2]
    local s1 = bxor(rrotate(v, 19), rrotate(v, 61), rshift(v, 6))
    w[j] = (w[j - 16] + s0 + w[j - 7] + s1) % MOD
  end
  -- compression: 80 rounds over the working variables a..h
  local a, b, c, d, e, f, g, h = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8]
  for round = 1, 80 do
    local s0 = bxor(rrotate(a, 28), rrotate(a, 34), rrotate(a, 39))
    local maj = bxor(band(a, b), band(a, c), band(b, c))
    local t2 = (s0 + maj) % MOD
    local s1 = bxor(rrotate(e, 14), rrotate(e, 18), rrotate(e, 41))
    local ch = bxor (band(e, f), band(bnot(e), g))
    local t1 = (h + s1 + ch + k[round] + w[round]) % MOD
    h, g, f, e, d, c, b, a = g, f, e, (d + t1) % MOD, c, b, a, (t1 + t2) % MOD
  end
  -- add the result of this block to the running hash
  H[1] = (H[1] + a) % MOD
  H[2] = (H[2] + b) % MOD
  H[3] = (H[3] + c) % MOD
  H[4] = (H[4] + d) % MOD
  H[5] = (H[5] + e) % MOD
  H[6] = (H[6] + f) % MOD
  H[7] = (H[7] + g) % MOD
  H[8] = (H[8] + h) % MOD
end
-- Produces the final hash value: the eight words of H, each written as
-- 8 big-endian bytes, concatenated and converted to hexadecimal digits.
local function finalresult512 (H)
  local words = {}
  for idx = 1, 8 do
    words[idx] = num2string(H[idx], 8)
  end
  return str2hexa(table.concat(words))
end
-- Returns the hash512 for the given string.
-- Returns the hash of the string msg as a string of hexadecimal digits.
local function hash512 (msg)
  local padded = preprocess(msg, #msg)
  local state = initH512(HH)
  -- feed the padded message through the compression function one
  -- 1024-bit (128-byte) block at a time
  for offset = 1, #padded, 128 do
    digestblock(padded, offset, state)
  end
  return finalresult512(state)
end
print( hash512("a") )
At the end, when "a" is hashed, it turns into this:
8c14f3e36400000074d6c495c0000000fd2e4ad8b40000009a78880fb00000002c13f4fdc0000000bf50f67658000000cdf76c796c000000df8163cae8000000
Instead of the actual hash (which is this):
1F40FC92DA241694750979EE6CF582F2D5D7D28E18335DE05ABC54D0560E0F5302860C652BF08D560252AA5E74210546F369FBBBCE8C12CFC7957B2652FE9A75
So my question is: why is it yielding such different results? Is it a problem with the bitwise operator functions? I am stumped.
Here is a working implementation of SHA512 for Lua 5.1
File sha2for51.lua
-- This module contains functions to calculate SHA2 digest.
-- Supported hashes: SHA-224, SHA-256, SHA-384, SHA-512, SHA-512/224, SHA-512/256
-- This is a pure-Lua module, compatible with Lua 5.1
-- It works on Lua 5.1/5.2/5.3/5.4/LuaJIT, but it doesn't use benefits of Lua versions 5.2+
-- Input data must be provided either as a whole string or as a sequence of substrings (chunk-by-chunk).
-- Result (SHA2 digest) is a string of lowercase hex digits.
--
-- Simplest usage example:
-- local your_hash = require("sha2for51").sha512("your string")
-- See file "sha2for51_test.lua" for more examples.
local unpack, table_concat, byte, char, string_rep, sub, string_format, floor, ceil, min, max =
table.unpack or unpack, table.concat, string.byte, string.char, string.rep, string.sub, string.format, math.floor, math.ceil, math.min, math.max
--------------------------------------------------------------------------------
-- BASIC BITWISE FUNCTIONS
--------------------------------------------------------------------------------
-- 32-bit bitwise functions
local AND, OR, XOR, SHL, SHR, ROL, ROR, HEX
-- Only low 32 bits of function arguments matter, high bits are ignored
-- The result of all functions (except HEX) is an integer (pair of integers) inside range 0..(2^32-1)
-- Logical shift left: scale x by 2^n and keep only the low 32 bits of the product.
function SHL(x, n)
   local widened = x * 2^n
   return widened % 4294967296
end
-- Logical shift right: reduce x to its low 32 bits, divide by 2^n and drop the fractional part.
function SHR(x, n)
   local scaled = x % 4294967296 / 2^n
   local fraction = scaled % 1
   return scaled - fraction
end
-- Rotate left: the bits pushed above bit 31 by the multiplication are re-inserted at the bottom.
function ROL(x, n)
   local widened = x % 4294967296 * 2^n
   local kept = widened % 4294967296                 -- bits that stay inside the 32-bit window
   local wrapped = (widened - kept) / 4294967296     -- bits that overflowed, moved to the low end
   return kept + wrapped
end
-- Rotate right: the bits that fall below bit 0 after the division (the fractional part)
-- are scaled back up by 2^32 and re-inserted at the top.
function ROR(x, n)
   local scaled = x % 4294967296 / 2^n
   local dropped = scaled % 1
   return dropped * 4294967296 + (scaled - dropped)
end
-- Look-up table with 256*256 entries: entry [p + q * 256] holds the bitwise AND of bytes p and q.
local AND_of_two_bytes = {}
for packed = 0, 65535 do
   local lo_byte = packed % 256
   local hi_byte = (packed - lo_byte) / 256
   local conj, bit_weight = 0, 1
   -- Peel one bit off each operand per iteration; stop as soon as either operand is exhausted.
   while lo_byte * hi_byte ~= 0 do
      local lo_bit, hi_bit = lo_byte % 2, hi_byte % 2
      conj = conj + lo_bit * hi_bit * bit_weight
      lo_byte, hi_byte = (lo_byte - lo_bit) / 2, (hi_byte - hi_bit) / 2
      bit_weight = bit_weight * 2
   end
   AND_of_two_bytes[packed] = conj
end
-- Shared implementation of 32-bit AND / OR / XOR.
--   x, y      : numbers; only their values modulo 2^32 are used
--   operation : nil = AND, 1 = OR, 2 = XOR
-- Returns the result of the selected operation.
-- The AND is assembled byte by byte from the AND_of_two_bytes table; OR and XOR are then
-- derived arithmetically from it (x + y - AND, and x + y - 2*AND respectively).
local function and_or_xor(x, y, operation)
-- operation: nil = AND, 1 = OR, 2 = XOR
local x0 = x % 4294967296
local y0 = y % 4294967296
-- byte 0 of both operands
local rx = x0 % 256
local ry = y0 % 256
local res = AND_of_two_bytes[rx + ry * 256]
-- x keeps its scale (low byte cleared) while y is shifted down by one byte,
-- so that "byte of x" * 256 + "byte of y" can be formed directly as a table index
x = x0 - rx
y = (y0 - ry) / 256
-- byte 1: rx is (byte 1 of x0) * 256, ry is byte 1 of y0
rx = x % 65536
ry = y % 256
res = res + AND_of_two_bytes[rx + ry] * 256
x = (x - rx) / 256
y = (y - ry) / 256
-- byte 2: rx becomes the complete table index for the third pair of bytes
rx = x % 65536 + y % 256
res = res + AND_of_two_bytes[rx] * 65536
-- byte 3: whatever remains of x and y after removing byte 2 forms the last index
res = res + AND_of_two_bytes[(x + y - rx) / 256] * 16777216
if operation then
-- operation == 1 gives OR, operation == 2 gives XOR
res = x0 + y0 - operation * res
end
return res
end
-- 32-bit bitwise AND of x and y.
function AND(x, y)
   local conjunction = and_or_xor(x, y)
   return conjunction
end
-- 32-bit bitwise OR of x and y (operation code 1 of and_or_xor).
function OR(x, y)
   local disjunction = and_or_xor(x, y, 1)
   return disjunction
end
-- 32-bit bitwise XOR of two or three arguments (operation code 2 of and_or_xor).
function XOR(x, y, z)
   if not z then
      return and_or_xor(x, y, 2)
   end
   -- three-argument form: fold y and z together first, then combine with x
   return and_or_xor(x, and_or_xor(y, z, 2), 2)
end
-- Format the low 32 bits of x as exactly eight lowercase hexadecimal digits.
function HEX(x)
   local low32 = x % 4294967296
   return string_format("%08x", low32)
end
-- Arrays of SHA2 "magic numbers"
-- They are declared empty here and filled in by the "CALCULATING THE MAGIC NUMBERS" section.
-- *_hi / *_lo hold the high / low 32-bit halves of each constant.
local sha2_K_lo, sha2_K_hi, sha2_H_lo, sha2_H_hi = {}, {}, {}, {}
-- Initial values for the 256-bit family, indexed by digest width;
-- the entry for width 256 is the very same table as sha2_H_hi.
local sha2_H_ext256 = {[224] = {}, [256] = sha2_H_hi}
-- Initial values for the 512-bit family, indexed by digest width;
-- the entries for width 512 are the very same tables as sha2_H_lo / sha2_H_hi.
local sha2_H_ext512_lo, sha2_H_ext512_hi = {[384] = {}, [512] = sha2_H_lo}, {[384] = {}, [512] = sha2_H_hi}
local common_W = {} -- a temporary table shared between all calculations
-- Runs the SHA-256 compression function over consecutive 64-byte blocks of str.
--   H    : array of 8 chaining words, updated in place
--   K    : array of 64 round constants
--   str  : string holding the data
--   W    : scratch array for the 64-entry message schedule (overwritten)
--   offs : number of bytes of str to skip before the first block (offs >= 0)
--   size : number of bytes to process (size >= 0, multiple of 64)
-- Returns nothing.
local function sha256_feed_64(H, K, str, W, offs, size)
   for block_start = offs, size + offs - 1, 64 do
      -- Walk through the block with a separate cursor rather than assigning to the loop
      -- control variable (the Lua manual says its value should not be changed inside the loop).
      local pos = block_start
      for j = 1, 16 do
         pos = pos + 4
         -- assemble a big-endian 32-bit word from four consecutive bytes
         local a, b, c, d = byte(str, pos - 3, pos)
         W[j] = ((a * 256 + b) * 256 + c) * 256 + d
      end
      -- Extend the schedule. The sums are left unreduced: the bitwise helpers
      -- only look at the low 32 bits of their arguments.
      for j = 17, 64 do
         local a, b = W[j-15], W[j-2]
         W[j] = XOR(ROR(a, 7), ROL(a, 14), SHR(a, 3)) + XOR(ROL(b, 15), ROL(b, 13), SHR(b, 10)) + W[j-7] + W[j-16]
      end
      local a, b, c, d, e, f, g, h, z = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8]
      for j = 1, 64 do
         -- AND(-1-e, g) is (NOT e) AND g: -1-e equals the bitwise complement of e modulo 2^32
         z = XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + AND(e, f) + AND(-1-e, g) + h + K[j] + W[j]
         h = g
         g = f
         f = e
         e = z + d
         d = c
         c = b
         b = a
         -- at this point d and c already hold the previous c and b, while a is still the previous a
         a = z + AND(d, c) + AND(a, XOR(d, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10))
      end
      -- add the working registers back into the chaining value, modulo 2^32
      H[1], H[2], H[3], H[4] = (a + H[1]) % 4294967296, (b + H[2]) % 4294967296, (c + H[3]) % 4294967296, (d + H[4]) % 4294967296
      H[5], H[6], H[7], H[8] = (e + H[5]) % 4294967296, (f + H[6]) % 4294967296, (g + H[7]) % 4294967296, (h + H[8]) % 4294967296
   end
end
-- Runs the SHA-512 compression function over consecutive 128-byte blocks of str.
-- Every 64-bit quantity is represented as a pair of 32-bit halves (hi, lo).
--   H_lo, H_hi : arrays with the low / high halves of the 8 chaining words, updated in place
--   K_lo, K_hi : arrays with the low / high halves of the 80 round constants
--   str        : string holding the data
--   W          : scratch array for the message schedule (overwritten);
--                Wk_hi = W[2*k-1], Wk_lo = W[2*k]
--   offs       : number of bytes of str to skip before the first block (offs >= 0)
--   size       : number of bytes to process (size >= 0, multiple of 128)
-- Returns nothing.
local function sha512_feed_128(H_lo, H_hi, K_lo, K_hi, str, W, offs, size)
   for block_start = offs, size + offs - 1, 128 do
      -- Walk through the block with a separate cursor rather than assigning to the loop
      -- control variable (the Lua manual says its value should not be changed inside the loop).
      local pos = block_start
      for j = 1, 32 do
         pos = pos + 4
         -- assemble a big-endian 32-bit word from four consecutive bytes
         local a, b, c, d = byte(str, pos - 3, pos)
         W[j] = ((a * 256 + b) * 256 + c) * 256 + d
      end
      local tmp1, tmp2
      -- Extend the schedule; jj is the index of the low half of the word being produced.
      for jj = 17 * 2, 80 * 2, 2 do
         local a_lo, a_hi, b_lo, b_hi = W[jj-30], W[jj-31], W[jj-4], W[jj-5]
         tmp1 = XOR(SHR(a_lo, 1) + SHL(a_hi, 31), SHR(a_lo, 8) + SHL(a_hi, 24), SHR(a_lo, 7) + SHL(a_hi, 25)) + XOR(SHR(b_lo, 19) + SHL(b_hi, 13), SHL(b_lo, 3) + SHR(b_hi, 29), SHR(b_lo, 6) + SHL(b_hi, 26)) + W[jj-14] + W[jj-32]
         tmp2 = tmp1 % 4294967296
         -- (tmp1 - tmp2) / 2^32 is the carry out of the low half
         W[jj-1] = XOR(SHR(a_hi, 1) + SHL(a_lo, 31), SHR(a_hi, 8) + SHL(a_lo, 24), SHR(a_hi, 7)) + XOR(SHR(b_hi, 19) + SHL(b_lo, 13), SHL(b_hi, 3) + SHR(b_lo, 29), SHR(b_hi, 6)) + W[jj-15] + W[jj-33] + (tmp1 - tmp2) / 4294967296
         W[jj] = tmp2
      end
      local a_lo, b_lo, c_lo, d_lo, e_lo, f_lo, g_lo, h_lo, z_lo = H_lo[1], H_lo[2], H_lo[3], H_lo[4], H_lo[5], H_lo[6], H_lo[7], H_lo[8]
      local a_hi, b_hi, c_hi, d_hi, e_hi, f_hi, g_hi, h_hi, z_hi = H_hi[1], H_hi[2], H_hi[3], H_hi[4], H_hi[5], H_hi[6], H_hi[7], H_hi[8]
      for j = 1, 80 do
         local jj = 2 * j
         -- AND(-1-e, g) is (NOT e) AND g: -1-e equals the bitwise complement of e modulo 2^32
         tmp1 = XOR(SHR(e_lo, 14) + SHL(e_hi, 18), SHR(e_lo, 18) + SHL(e_hi, 14), SHL(e_lo, 23) + SHR(e_hi, 9)) + AND(e_lo, f_lo) + AND(-1-e_lo, g_lo) + h_lo + K_lo[j] + W[jj]
         z_lo = tmp1 % 4294967296
         z_hi = XOR(SHR(e_hi, 14) + SHL(e_lo, 18), SHR(e_hi, 18) + SHL(e_lo, 14), SHL(e_hi, 23) + SHR(e_lo, 9)) + AND(e_hi, f_hi) + AND(-1-e_hi, g_hi) + h_hi + K_hi[j] + W[jj-1] + (tmp1 - z_lo) / 4294967296
         h_lo = g_lo
         h_hi = g_hi
         g_lo = f_lo
         g_hi = f_hi
         f_lo = e_lo
         f_hi = e_hi
         tmp1 = z_lo + d_lo
         e_lo = tmp1 % 4294967296
         e_hi = z_hi + d_hi + (tmp1 - e_lo) / 4294967296
         d_lo = c_lo
         d_hi = c_hi
         c_lo = b_lo
         c_hi = b_hi
         b_lo = a_lo
         b_hi = a_hi
         -- at this point d, c and b already hold the previous c, b and a
         tmp1 = z_lo + AND(d_lo, c_lo) + AND(b_lo, XOR(d_lo, c_lo)) + XOR(SHR(b_lo, 28) + SHL(b_hi, 4), SHL(b_lo, 30) + SHR(b_hi, 2), SHL(b_lo, 25) + SHR(b_hi, 7))
         a_lo = tmp1 % 4294967296
         a_hi = z_hi + (AND(d_hi, c_hi) + AND(b_hi, XOR(d_hi, c_hi))) + XOR(SHR(b_hi, 28) + SHL(b_lo, 4), SHL(b_hi, 30) + SHR(b_lo, 2), SHL(b_hi, 25) + SHR(b_lo, 7)) + (tmp1 - a_lo) / 4294967296
      end
      -- Add the working registers back into the chaining value:
      -- low halves first, then high halves plus the carry, both reduced modulo 2^32.
      local regs_lo = {a_lo, b_lo, c_lo, d_lo, e_lo, f_lo, g_lo, h_lo}
      local regs_hi = {a_hi, b_hi, c_hi, d_hi, e_hi, f_hi, g_hi, h_hi}
      for i = 1, 8 do
         tmp1 = H_lo[i] + regs_lo[i]
         tmp2 = tmp1 % 4294967296
         H_lo[i], H_hi[i] = tmp2, (H_hi[i] + regs_hi[i] + (tmp1 - tmp2) / 4294967296) % 4294967296
      end
   end
end
--------------------------------------------------------------------------------
-- CALCULATING THE MAGIC NUMBERS (roots of primes)
--------------------------------------------------------------------------------
-- Fills sha2_K_hi/sha2_K_lo (80 entries, from cube roots of successive primes) and the
-- initial-value tables (from square roots of the first 16 primes) at module load time.
do
-- Returns the lowest result_length digits of src1 * src2 * factor together with
-- a floating point approximation of that truncated value.
-- Digit arrays are least-significant-digit first (digit j has weight 2^(24*(j-1))).
-- With a negative factor the Lua modulo keeps every digit non-negative, so the result is
-- the product reduced modulo 2^(24*result_length).
local function mul(src1, src2, factor, result_length)
-- Long arithmetic multiplication: src1 * src2 * factor
-- src1, src2 - long integers (arrays of digits in base 2^24)
-- factor - short integer
local result = {}
local carry = 0
local value = 0.0
local weight = 1.0
for j = 1, result_length do
local prod = 0
-- sum of all digit products contributing to position j
for k = max(1, j + 1 - #src2), min(j, #src1) do
prod = prod + src1[k] * src2[j + 1 - k]
end
carry = carry + prod * factor
local digit = carry % 16777216
result[j] = digit
carry = floor(carry / 16777216)
value = value + digit * weight
weight = weight * 2^24
end
return
result, -- long integer
value -- and its floating point approximation
end
-- step[r] is the increment applied to a number whose remainder modulo 6 is r;
-- starting from 4 this visits 2, 3 and then only numbers with remainder 5 or 1 modulo 6
local idx, step, p, one = 0, {4, 1, 2, -2, 2}, 4, {1}
-- destination of the square-root words: the first 8 primes go to sha2_H_hi/sha2_H_lo
local sqrt_hi, sqrt_lo, idx_disp = sha2_H_hi, sha2_H_lo, 0
repeat
p = p + step[p % 6]
local d = 1
-- trial division of p by d = 5, 7, 11, ...; the inner loop ends without reaching the
-- "if" below as soon as some d divides p
repeat
d = d + step[d % 6]
if d * d > p then
-- no divisor found: p is counted as the idx-th prime
idx = idx + 1
local root = p^(1/3)
-- R = floor(root * 2^40) as a two-digit base-2^24 integer
local R = mul({floor(root * 2^40)}, one, 1, 2)
-- delta approximates -(R*R*R) reduced modulo 2^96
local _, delta = mul(R, mul(R, R, 1, 4), -1, 4)
-- hi: bits 8..39 of R
local hi = R[2] % 65536 * 65536 + floor(R[1] / 256)
-- lo: bits 0..7 of R followed by 24 further bits taken from the delta term
-- NOTE(review): the delta term looks like a first-order correction of the truncated root - confirm
local lo = R[1] % 256 * 16777216 + floor(delta * (2^-56 / 3) * root / p)
sha2_K_hi[idx], sha2_K_lo[idx] = hi, lo
if idx < 17 then
-- same procedure with the square root, for the first 16 primes only
root = p^(1/2)
R = mul({floor(root * 2^40)}, one, 1, 2)
_, delta = mul(R, R, -1, 2)
hi = R[2] % 65536 * 65536 + floor(R[1] / 256)
lo = R[1] % 256 * 16777216 + floor(delta * 2^-17 / root)
-- entries written for primes 1..8 are overwritten by those of primes 9..16 (idx_disp = -8)
sha2_H_ext256[224][idx + idx_disp] = lo
sqrt_hi[idx + idx_disp], sqrt_lo[idx + idx_disp] = hi, lo
if idx == 8 then
-- primes 9..16 are stored into the tables for width 384, at positions 1..8
sqrt_hi, sqrt_lo, idx_disp = sha2_H_ext512_hi[384], sha2_H_ext512_lo[384], -8
end
end
break
end
until p % d == 0
until idx > 79
end
-- Calculating IV for SHA512/224 and SHA512/256
-- Derive the initial values for widths 224 and 256 of the 512-bit family:
-- every half-word of the 512-bit IV is XORed with 0xa5a5a5a5 and the result is run
-- through one compression of a prebuilt 128-byte block starting with "SHA-512/<width>".
for width = 224, 256, 32 do
   local iv_lo, iv_hi = {}, {}
   for reg = 1, 8 do
      iv_lo[reg] = XOR(sha2_H_lo[reg], 0xa5a5a5a5)
      iv_hi[reg] = XOR(sha2_H_hi[reg], 0xa5a5a5a5)
   end
   -- name, the "\128" byte, 115 zero bytes and a final "\88" byte: exactly 128 bytes
   local seed_block = "SHA-512/"..tonumber(width).."\128"..string_rep("\0", 115).."\88"
   sha512_feed_128(iv_lo, iv_hi, sha2_K_lo, sha2_K_hi, seed_block, common_W, 0, 128)
   sha2_H_ext512_lo[width] = iv_lo
   sha2_H_ext512_hi[width] = iv_hi
end
--------------------------------------------------------------------------------
-- FINAL FUNCTIONS
--------------------------------------------------------------------------------
-- Digest function of the 256-bit family for the given width (a key of sha2_H_ext256).
--   width : digest width in bits
--   text  : optional string with the whole message
-- With text, returns the digest as a string of hex digits.
-- Without text, returns a function "partial": call it with a string to append a chunk
-- (it returns itself, so calls can be chained) and without an argument to get the digest.
local function sha256ext(width, text)
   -- Private state of this calculation:
   --   H           chaining values (replaced by the digest string once finished)
   --   total_bytes number of message bytes received so far
   --   pending     bytes not yet forming a whole 64-byte block; nil once finished
   local H, total_bytes, pending = {unpack(sha2_H_ext256[width])}, 0, ""

   local function partial(text_part)
      if not text_part then
         -- Final result requested; the padding is processed on the first request only.
         if pending then
            local parts = {pending, "\128", string_rep("\0", (-9 - total_bytes) % 64 + 1)}
            pending = nil
            -- Assuming user data length is shorter than 2^53 bytes:
            -- 2^53 bytes = 2^56 bits, so the bit counter fits in 7 bytes.
            -- Turn the byte counter into a bit counter scaled into [0, 1) ...
            total_bytes = total_bytes * (8 / 256^7)
            -- ... and peel off its 7 bytes, most significant first.
            for slot = 4, 10 do
               total_bytes = total_bytes % 1 * 256
               parts[slot] = char(floor(total_bytes))
            end
            local padded = table_concat(parts)
            sha256_feed_64(H, sha2_K_hi, padded, common_W, 0, #padded)
            local max_reg = width / 32
            for reg = 1, max_reg do
               H[reg] = HEX(H[reg])
            end
            H = table_concat(H, "", 1, max_reg)
         end
         return H
      end

      if not pending then
         error("Adding more chunks is not allowed after asking for final result", 2)
      end

      total_bytes = total_bytes + #text_part
      local offs = 0
      -- Complete the buffered partial block first, if the new chunk is long enough for that.
      if pending ~= "" and #pending + #text_part >= 64 then
         offs = 64 - #pending
         sha256_feed_64(H, sha2_K_hi, pending..sub(text_part, 1, offs), common_W, 0, 64)
         pending = ""
      end
      -- Process all whole blocks of the chunk and buffer what is left over.
      local size = #text_part - offs
      local size_tail = size % 64
      sha256_feed_64(H, sha2_K_hi, text_part, common_W, offs, size - size_tail)
      pending = pending..sub(text_part, #text_part + 1 - size_tail)
      return partial
   end

   if text then
      -- Whole message supplied: feed it and ask for the digest right away.
      return partial(text)()
   end
   -- No message supplied: hand out the chunk-by-chunk interface.
   return partial
end
-- Digest function of the 512-bit family for the given width (a key of sha2_H_ext512_lo/_hi).
--   width : digest width in bits
--   text  : optional string with the whole message
-- With text, returns the digest as a string of hex digits.
-- Without text, returns a function "partial": call it with a string to append a chunk
-- (it returns itself, so calls can be chained) and without an argument to get the digest.
local function sha512ext(width, text)
   -- Private state of this calculation:
   --   total_bytes number of message bytes received so far
   --   pending     bytes not yet forming a whole 128-byte block; nil once finished
   --   H_lo, H_hi  halves of the chaining values (H_lo is replaced by the digest string once finished)
   local total_bytes, pending, H_lo, H_hi = 0, "", {unpack(sha2_H_ext512_lo[width])}, {unpack(sha2_H_ext512_hi[width])}

   local function partial(text_part)
      if not text_part then
         -- Final result requested; the padding is processed on the first request only.
         if pending then
            local parts = {pending, "\128", string_rep("\0", (-17-total_bytes) % 128 + 9)}
            pending = nil
            -- Assuming user data length is shorter than 2^53 bytes:
            -- 2^53 bytes = 2^56 bits, so the bit counter fits in 7 bytes.
            -- Turn the byte counter into a bit counter scaled into [0, 1) ...
            total_bytes = total_bytes * (8 / 256^7)
            -- ... and peel off its 7 bytes, most significant first.
            for slot = 4, 10 do
               total_bytes = total_bytes % 1 * 256
               parts[slot] = char(floor(total_bytes))
            end
            local padded = table_concat(parts)
            sha512_feed_128(H_lo, H_hi, sha2_K_lo, sha2_K_hi, padded, common_W, 0, #padded)
            local max_reg = ceil(width / 64)
            for reg = 1, max_reg do
               H_lo[reg] = HEX(H_hi[reg])..HEX(H_lo[reg])
            end
            H_hi = nil
            -- keep only width / 4 hex digits (the last register may be used partially)
            H_lo = table_concat(H_lo, "", 1, max_reg):sub(1, width / 4)
         end
         return H_lo
      end

      if not pending then
         error("Adding more chunks is not allowed after asking for final result", 2)
      end

      total_bytes = total_bytes + #text_part
      local offs = 0
      -- Complete the buffered partial block first, if the new chunk is long enough for that.
      if pending ~= "" and #pending + #text_part >= 128 then
         offs = 128 - #pending
         sha512_feed_128(H_lo, H_hi, sha2_K_lo, sha2_K_hi, pending..sub(text_part, 1, offs), common_W, 0, 128)
         pending = ""
      end
      -- Process all whole blocks of the chunk and buffer what is left over.
      local size = #text_part - offs
      local size_tail = size % 128
      sha512_feed_128(H_lo, H_hi, sha2_K_lo, sha2_K_hi, text_part, common_W, offs, size - size_tail)
      pending = pending..sub(text_part, #text_part + 1 - size_tail)
      return partial
   end

   if text then
      -- Whole message supplied: feed it and ask for the digest right away.
      return partial(text)()
   end
   -- No message supplied: hand out the chunk-by-chunk interface.
   return partial
end
-- Public interface of the module: one entry per supported hash.
-- Each entry takes an optional message string, see sha256ext / sha512ext.
local sha2for51 = {}
sha2for51.sha224     = function (text) return sha256ext(224, text) end  -- SHA-224
sha2for51.sha256     = function (text) return sha256ext(256, text) end  -- SHA-256
sha2for51.sha384     = function (text) return sha512ext(384, text) end  -- SHA-384
sha2for51.sha512     = function (text) return sha512ext(512, text) end  -- SHA-512
sha2for51.sha512_224 = function (text) return sha512ext(224, text) end  -- SHA-512/224
sha2for51.sha512_256 = function (text) return sha512ext(256, text) end  -- SHA-512/256
return sha2for51
File sha2for51_test.lua
--------------------------------------------------------------------------------
-- TESTS
--------------------------------------------------------------------------------
local sha2 = require"sha2for51"
-- Checks sha2.sha256 against known digests, in whole-string and chunk-by-chunk mode.
local function test_sha256()
   local sha256 = sha2.sha256

   -- some test strings
   local known = {
      {"The quick brown fox jumps over the lazy dog", "d7a8fbb307d7809469ca9abcb0082e4f8d5651e46d3cdb762d02d0bf37c9e592"},
      {"The quick brown fox jumps over the lazy cog", "e4c4d8f3bf76b692de791a173e05321150f7a345b46484fe427f6acc7ecc81be"},
      {"abc", "ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad"},
      {"123456", "8d969eef6ecad3c29a3a629280e686cf0c3f5d5a86aff3ca12020c923adc6c92"},
      {"abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq", "248d6a61d20638b8e5c026930c3e6039a33ce45964ff2167f6ecedd419db06c1"},
      {"abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmnhijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu", "cf5b16a778af8380036ce59e7b0492370b249b11e8f07a51afac45037afee9d1"},
   }
   for _, vector in ipairs(known) do
      assert(sha256(vector[1]) == vector[2])
   end

   -- chunk-by-chunk loading: sha256("string") == sha256()("st")("ri")("ng")()
   local fox_digest = "d7a8fbb307d7809469ca9abcb0082e4f8d5651e46d3cdb762d02d0bf37c9e592"
   local feed = sha256() -- a private closure for calculating the digest of a single string
   feed("The quick brown fox")
   feed(" jumps ")
   feed("") -- a chunk may be an empty string
   feed("over the lazy dog")
   assert(feed() == fox_digest) -- asking for the final result (invocation without an argument)
   assert(feed() == fox_digest) -- the same result may be requested multiple times
   -- no more chunks are allowed after receiving the result
   assert(not pcall(feed, "more text"))
   -- a one-liner is possible because feeding a chunk returns the feeding function itself
   assert(sha256()("The quick brown fox")(" jumps ")("")("over the lazy dog")() == fox_digest)

   -- empty string
   local empty_digest = "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
   assert(sha256("") == empty_digest)
   assert(sha256()() == empty_digest)

   -- computations of different strings don't interfere with each other
   local feed_digits = sha256()
   feed_digits("123")
   local feed_fox = sha256()
   feed_fox("The quick brown fox jumps ")
   feed_digits("45")
   feed_fox("over the lazy dog")
   feed_digits("6")
   assert(feed_digits() == "8d969eef6ecad3c29a3a629280e686cf0c3f5d5a86aff3ca12020c923adc6c92")
   assert(feed_fox() == fox_digest)

   -- "00...0\n", from 50 to 70 zeroes
   local zero_line_digests = {
      [50] = "9660acb8046abf46cf27280e61abd174ebac98ad6855e093772b78df85523129",
      [51] = "31e1c552b357ace9bcb924691799a3c0d3aa10d8b428d9de28a278e3c79ecb7b",
      [52] = "0be5c4bcb6f47e30c13515594dbef4faa3a6485af67c177179fee8b33cd4f2a0",
      [53] = "d368c7f6038c1743bdbfe6a9c3a72d4e6916aa219ed8d559766c9e8f9845f3b8",
      [54] = "7080a4aa6ff030ae152fe610a62ee29464f92afeb176474551a69d35aab154a0",
      [55] = "149c1cda81fa9359c0c2a5e405ca972986f1d53e05f6282871dd1581046b3f44",
      [56] = "eb2d4d41948ce546c8adff07ee97342070c5b89789f616a33efe52c7d3ec73d4",
      [57] = "c831db596ccbbf248023461b1c05d3ae084bcc79bcb2626c5ec179fb34371f2a",
      [58] = "1345b8a930737b1069bbf9b891ce095850f6cdba6e25874ea526a2ccb611fe46",
      [59] = "380ad21e466885fae080ceeada75ac04944687e626e161c0b24e91af3eec2def",
      [60] = "b9ab06fa30ef8531c5eee11651aa86f8279a245e0a3c29bf6228c59475cc610a",
      [61] = "bcc187de6605d9e11a0cc6edf02b67fb651fe1779ec59438788093d8e376c07c",
      [62] = "ae0b3681157b83b34de8591d2453915e40c3105ae79434e241d82d4035218e01",
      [63] = "68a27b4735f6806fb5983c1805a23797aa93ea06e0ebcb6daada2ea1ab5a05af",
      [64] = "827d096d92f3deeaa0e8070d79f45beb176768e57a958a1cd325f5f4b754b048",
      [65] = "6c7bd8ec0fe9b4e05a2d27dd5e41a8687a9716a2e8926bdfa141266b12942ec1",
      [66] = "2f4b4c41017a2ddd1cc8cd75478a82e9452e445d4242f09782535376d6f4ba50",
      [67] = "b777b86e005807a446ead00986fcbf3bdd6c022524deabf017eeb3f0c30b6eed",
      [68] = "777da331f60c793f582e4ca33223778218ddfd241981f15be5886171fb8301b5",
      [69] = "06ed0c4cbf7d2b38de5f01eab2d2cd552d9cb87f97b714b96bb7a9d1b6117c6d",
      [70] = "e82223344d5f3c024514cfbe6d478b5df98bb878f34d7a07e7b064fa7fa91946"
   }
   for zeroes, expected in pairs(zero_line_digests) do
      assert(sha256(("0"):rep(zeroes).."\n") == expected)
   end

   -- "aa...a"
   assert(sha256(("a"):rep(55)) == "9f4390f8d30c2dd92ec9f095b65e2b9ae9b0a925a5258e241c9f1e910f734318")
   assert(sha256(("a"):rep(56)) == "b35439a4ac6f0948b6d6f9e3c6af0f5f590ce20f1bde7090ef7970686ec6738a")

   -- "aa...a\n" in chunk-by-chunk mode
   local feed_letters = sha256()
   for _ = 1, 65 do
      feed_letters("a")
   end
   feed_letters("\n")
   assert(feed_letters() == "574883a9977284a46845620eaa55c3fa8209eaa3ebffe44774b6eb2dba2cb325")

   -- digest of string s fed in pieces of length len
   local function split_and_calculate_sha256(s, len)
      local feed_piece = sha256()
      for start = 1, #s, len do
         feed_piece(s:sub(start, start + len - 1))
      end
      return feed_piece()
   end

   -- "00...0\n00...0\n...00...0\n" (80 lines of 80 zeroes each) in chunk-by-chunk mode with different chunk lengths
   local s = (("0"):rep(80).."\n"):rep(80)
   for _, len in ipairs{1, 2, 7, 70} do
      assert(split_and_calculate_sha256(s, len) == "736c7a8b17e2cfd44a3267a844db1a8a3e8988d739e3e95b8dd32678fb599139")
   end
end
-- Checks sha2.sha512 against known digests.
local function test_sha512()
   local sha512 = sha2.sha512
   assert(sha512("abc") == "ddaf35a193617abacc417349ae20413112e6fa4e89a97ea20a9eeee64b55d39a2192992a274fc1a836ba3c23a3feebbd454d4423643ce80e2a9ac94fa54ca49f")
   local two_block_message = "abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmnhijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu"
   assert(sha512(two_block_message) == "8e959b75dae313da8cf4f72814fc143f8f7779c6eb9f7fa17299aeadb6889018501d289e4900f7e4331b99dec4b5433ac7d329eeb6dd26545e96e55b874be909")

   -- "aa...a", from 109 to 116 letters "a"
   local digest_by_length = {
      [109] = "0cda6b04d9466bb7f3995c16732e1347f29c23a64fe0b085fadba0995644cc5aa71587423c274c10e09518310c5f866cfaceb229fabb574219f12182eb114182",
      [110] = "c825949632e509824543f7eaf159fb6041722fce3c1cdcbb613b3d37ff107c519417baac32f8e74fe29d7f4823bf6886956603dca5354a6ed6e4a542e06b7d28",
      [111] = "fa9121c7b32b9e01733d034cfc78cbf67f926c7ed83e82200ef86818196921760b4beff48404df811b953828274461673c68d04e297b0eb7b2b4d60fc6b566a2",
      [112] = "c01d080efd492776a1c43bd23dd99d0a2e626d481e16782e75d54c2503b5dc32bd05f0f1ba33e568b88fd2d970929b719ecbb152f58f130a407c8830604b70ca",
      [113] = "55ddd8ac210a6e18ba1ee055af84c966e0dbff091c43580ae1be703bdb85da31acf6948cf5bd90c55a20e5450f22fb89bd8d0085e39f85a86cc46abbca75e24d",
      [114] = "5e9eb0e4b270d086e77eeaf3ce8b1cfc615031b8c463dc34f5c139786f274f22accb4d89e8f40d1a0c2acc84c4dc0f2bab390a9d9495493bd617ed004271bb64",
      [115] = "eaa30f93760743ac7d0a6cb8ed5ef3b30c59097bc44d0ec337344301deba9fb92b20c488d55de415f6aaed0df4925b42894b81d2e1cde89d91ec7f6cc67262b4",
      [116] = "a8bff469314a1ce0c990bb3fd539d92accb6249cc674b559bc9d3898b7a126fee597197fa42c971443470053c7d7f54b09371a59b0f7af87b1917c5347e8f8e0",
   }
   for letters, expected in pairs(digest_by_length) do
      assert(sha512(("a"):rep(letters)) == expected)
   end
end
-- Runs every test of the module and prints a confirmation when none of the assertions failed.
local function all_tests_sha2()
   -- messages shared by the checks below
   local short = "abc"
   local medium = "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq"
   local long = "abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmnhijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu"

   test_sha256()
   assert(sha2.sha224(short) == "23097d223405d8228642a477bda255b32aadbce4bda0b3f7e36c9da7")
   assert(sha2.sha224(medium) == "75388b16512776cc5dba5da1fd890150b0c6455cb4f58b1952522525")

   test_sha512()
   assert(sha2.sha384(short) == "cb00753f45a35e8bb5a03d699ac65007272c32ab0eded1631a8b605a43ff5bed8086072ba1e7cc2358baeca134c825a7")
   assert(sha2.sha384(long) == "09330c33f71147e83d192fc782cd1b4753111b173b3b05d22fa08086e3b0f712fcc7c71a557e2db966c3e9fa91746039")
   assert(sha2.sha512_224(short) == "4634270f707b6a54daae7530460842e20e37ed265ceee9a43e8924aa")
   assert(sha2.sha512_224(long) == "23fec5bb94d60b23308192640b0c453335d664734fe40e7268674af9")
   assert(sha2.sha512_256(short) == "53048e2681941ef99b2e29b76b4c7dabe4c2d0c634fc6d46e0e2f13107e7af23")
   assert(sha2.sha512_256(long) == "3928e184fb8690f840da3988121d31be65cb9d3ef83ee6146feac861e19b563a")

   print"All tests passed"
end
all_tests_sha2()
-- Measures the CPU time needed to hash 2^20 letters "a" with SHA-512, fed in chunks of 2^12 bytes.
-- The measurement is repeated 2*half-1 times and the median is printed.
local function benchmark()
   print("Benchmarking (calculating SHA512 of 1MByte string of letters 'a')...")
   local total_bytes = 2^20
   local chunk = ("a"):rep(2^12)
   local chunks_per_run = total_bytes/#chunk
   local half = 2
   local timings = {}
   local digest
   for run = 1, 2*half-1 do
      local started = os.clock()
      local feed = sha2.sha512()
      for _ = 1, chunks_per_run do
         feed(chunk)
      end
      digest = feed()
      timings[run] = os.clock() - started
   end
   --print("Result = "..digest)
   -- after sorting, the middle element is the median time
   table.sort(timings)
   print('CPU seconds:', timings[half])
end
benchmark() -- about 15 seconds per megabyte

How to use Z3Py online to solve problems with Operational Amplifiers

Find the value of R in the following circuit
This problem is solved using the following code:
R, V1, V2, Vo = Reals('R V1 V2 Vo')
I1 = V1/(R -50)
I2 = V2/(R + 10)
g = - R*(I1 + I2)
print g
equations = [Vo == g]
print equations
problem = [Vo == -2 , V1 == 1, V2 == 0.5, R != -10, R != 50, R > 0]
solve(equations + problem)
and the corresponding output is:
-R·(V1/(R - 50) + V2/(R + 10))
[Vo = -R·(V1/(R - 50) + V2/(R + 10))]
[R = 143.8986691902?, V2 = 1/2, V1 = 1, Vo = -2]
Other example: Find the value of R in the following circuit
This problem is solved using the following code:
R, Vs, Ve, R1, R2 = Reals('R Vs Ve R1 R2')
g1 = (Vs - Ve)/R1
print g1
g2 = Ve/R2
print g2
equations = [g1 == g2, R1 == 2*R - 100, R2 ==R]
print equations
problem = [Vs == 35 , Ve == 15, R > 0, R1 >0, R2 >0]
solve(equations + problem)
and the corresponding output is:
(Vs - Ve)/R1
Ve/R2
[(Vs - Ve)/R1 = Ve/R2, R1 = 2·R - 100, R2 = R]
[R = 150, Ve = 15, Vs = 35, R2 = 150, R1 = 200]
Other example:
This problem is solved using the following code:
Rf, Rg, Vo, V1, V2, R1, R2, I1, I2, V = Reals('Rf Rg Vo V1 V2 R1 R2 I1 I2 V')
equations = [V1 - V == R1*I1, V - Vo == Rf*I1,
V2 - V == R2*I2, V == Rg*I2]
print equations
problem = [V1 == 10 , V2 == 15, R1 == 100, R2 == 200,
Rf == 100, Rg ==500]
solve(equations + problem)
and the corresponding output is
[V1 - V = R1·I1, V - Vo = Rf·I1, V2 - V = R2·I2, V = Rg·I2]
[I1 = -1/140,
Vo = 80/7,
Rg = 500,
Rf = 100,
R2 = 200,
R1 = 100,
V2 = 15,
V1 = 10,
I2 = 3/140,
V = 75/7]
Other example:
Code:
Rf, Rg, Vo, V1, V2, R1, R2, I1, I2, V = Reals('Rf Rg Vo V1 V2 R1 R2 I1 I2 V')
equations = [V1 - V == R1*I1, V - Vo == Rf*I1,
V2 - V == R2*I2, V == Rg*I2, Rf == 200 + Rg]
print equations
problem = [V1 == 10 , V2 == 15, R1 == 100, R2 == 200, Vo == 20, Rg > 0, Rf >0
]
solve(equations + problem)
Output:
[V1 - V = R1·I1, V - Vo = Rf·I1, V2 - V = R2·I2, V = Rg·I2, Rf = 200 + Rg]
[I1 = -0.0113999063?,
Rg = 577.2001872658?,
Vo = 20,
R2 = 200,
R1 = 100,
V2 = 15,
V1 = 10,
I2 = 0.0193000468?,
V = 11.1399906367?,
Rf = 777.2001872658?]
Other example:
Code:
Vi, R1, I1, Va, R2, I2, R4, R3, Vo= Reals('Vi R1 I1 Va R2 I2 R4 R3 Vo')
equations = [Vi == R1*I1, -Va == R2*I1, Va == R4*I2, Va - Vo ==R3*(I1-I2)]
print equations
problem = [Vi == 1, R1 == 1000, R2 == 1000, R3 == 1000, R4 == 10]
solve(equations + problem)
Output:
[Vi = R1·I1, -Va = R2·I1, Va = R4·I2, Va - Vo = R3·(I1 - I2)]
[R4 = 10,
R3 = 1000,
R2 = 1000,
R1 = 1000,
Vi = 1,
Vo = -102,
I2 = -1/10,
Va = -1,
I1 = 1/1000]
Other example:
Code:
Vi, R1, I1, Va, R2, I2, R4, R3, Vo= Reals('Vi R1 I1 Va R2 I2 R4 R3 Vo')
equations = [Vi == R1*I1, -Va == R2*I1, Va == R4*I2, Va - Vo ==R3*(I1-I2),
R2 == R1 - 100, R3 == R1 - 200]
print equations
problem = [Vi == 1, Vo == -10, R4 == 10, R1 >0, R2 > 0, R3 >0]
solve(equations + problem)
Output:
[Vi = R1·I1, -Va = R2·I1, Va = R4·I2, Va - Vo = R3·(I1 - I2), R2 = R1 - 100, R3 = R1 - 200]
[I1 = 0.0030468970?,
R1 = 328.2027496108?,
I2 = -0.0695310291?,
R4 = 10,
Vo = -10,
Vi = 1,
R3 = 128.2027496108?,
R2 = 228.2027496108?,
Va = -0.6953102918?]
Other example:
Code:
Vi, V1, R, I1, V2, Ri, I2, R1, Vo, RF, A= Reals('Vi V1 R I1 V2 Ri I2 R1 Vo RF A')
equations = [Vi-V1 == R*I1,V1 -V2 == Ri*I2, V2 == R1*I2, V1 - Vo ==RF*(I1-I2),
Vo==A* (V2-V1)]
print equations
problem = [Vi == 1, R1 == 2000, Ri == 100, RF == 1000, R == 300, A == 100]
set_option(rational_to_decimal=True)
solve(equations + problem)
Output:
[Vi - V1 = R·I1, V1 - V2 = Ri·I2, V2 = R1·I2, V1 - Vo = RF·(I1 - I2), Vo = A·(V2 - V1)]
[I2 = 0.0001658374?,
A = 100,
R = 300,
RF = 1000,
Ri = 100,
R1 = 2000,
Vi = 1,
Vo = -1.6583747927?,
I1 = 0.0021724709?,
V1 = 0.3482587064?,
V2 = 0.3316749585?]
Other example:
Code:
Vi, V1, R, I1, V2, Ri, I2, R1, Vo, RF, A= Reals('Vi V1 R I1 V2 Ri I2 R1 Vo RF A')
equations = [Vi-V1 == R*I1,V1 -V2 == Ri*I2, V2 == R1*I2, V1 - Vo ==RF*(I1-I2),
Vo==A* (V2-V1), RF == R + 1000, Ri == R + 500, R1 == R + 1500]
print equations
problem = [Vi == 1, A == 100, Vo == -2, R >0, RF >0, R1 >0]
set_option(rational_to_decimal=True)
solve(equations + problem)
Output:
[Vi - V1 = R·I1, V1 - V2 = Ri·I2, V2 = R1·I2, V1 - Vo = RF·(I1 - I2),
Vo = A·(V2 - V1), RF = R + 1000, Ri = R + 500, R1 = R + 1500]
[I2 = 0.0000150298?,
R = 830.6885937397?,
I1 = 0.0011375745?,
Vo = -2,
A = 100,
Vi = 1,
RF = 1830.6885937397?,
V1 = 0.0550298124?,
V2 = 0.0350298124?,
R1 = 2330.6885937397?,
Ri = 1330.6885937397?]
Other example:
What value of the resistance RB will provide balance of the bridge yielding Vo = 0
Code:
# Bridge balance: find RB (with RD tied to it by RD == RB + 10) such that Vo == 0.
# The unknowns are unpacked in one parenthesised statement: an assignment that is
# broken after "=" without a continuation is a syntax error in Python.
(V, RC, RD, I1, VB, RA, RB, I2, VA, V2, R1, I3, R2, V1, R3, R4, I4, Vo) = Reals(
    'V RC RD I1 VB RA RB I2 VA V2 R1 I3 R2 V1 R3 R4 I4 Vo')
# Relations between the voltages, currents and resistances of the circuit.
equations = [V == (RC+RD)*I1, VB == RD*I1, V == (RA + RB)*I2,
             VA == RB*I2, VB-V2 == R1*I3, V2 == R2*I3, VA-V1 == R3*I4,
             V1 - Vo == R4*I4, V2 == V1, RD == RB + 10]
# print() with a single argument behaves the same under Python 2 and Python 3.
print(equations)
# Known values, plus positivity constraints on the two unknown resistances.
problem = [Vo == 0, V == 5, R1 == 10, R2 == 12, R3 == 10, R4 == 22,
           RA == 1, RC == 1, RB > 0, RD > 0]
# Show algebraic solutions as decimal approximations instead of exact expressions.
set_option(rational_to_decimal=True)
solve(equations + problem)
Output:
[V = (RC + RD)·I1, VB = RD·I1, V = (RA + RB)·I2, VA = RB·I2, VB - V2 = R1·I3,
V2 = R2·I3, VA - V1 = R3·I4, V1 - Vo = R4·I4, V2 = V1, RD = RB + 10]
[I1 = 0.3626991607?,
RB = 2.7855295545?,
I2 = 1.3208191688?,
I4 = 0.1149744009?,
I3 = 0.2107864017?,
RC = 1,
RA = 1,
R4 = 22,
R3 = 10,
R2 = 12,
R1 = 10,
V = 5,
Vo = 0,
V1 = 2.5294368214?,
V2 = 2.5294368214?,
VA = 3.6791808311?,
VB = 4.6373008392?,
RD = 12.7855295545?]
Please let me know what you think, and whether you know of more efficient code for these kinds of problems. Many thanks.

How to compute with Quaternion numbers in Z3?

In Complex numbers in Z3 Leonardo de Moura was able to introduce and to compute with complex numbers in Z3.
Using the code proposed by Leonardo, I am introducing and computing with quaternion numbers in Z3 according to the code presented here. Using this "quaternion" code I am solving the following problem:
x = Quaternion("x")
s = Tactic('qfnra-nlsat').solver()
s.add(x*x + 30 == 0, x.i3 > 0, x.i2 >0, x.i1 > 0)
print(s.check())
m = s.model()
print m
and the corresponding output is:
sat
[x.r = 0, x.i1 = 1, x.i2 = 1, x.i3 = 5.2915026221?]
This result was verified using Maple.
Other example:
x = Quaternion("x")
y = Quaternion("y")
z = Quaternion("z")
s = Tactic('qfnra-nlsat').solver()
s.add(x*y + 30 + x + y*z == 0, x - y + z == 10)
print(s.check())
m = s.model()
print m
and the output is:
sat
[y.r = 1/8,
z.r = 2601/64,
y.i1 = 1/2,
z.i1 = 45/8,
y.i2 = -1/2,
z.i2 = -45/8,
y.i3 = -1/2,
z.i3 = -45/8,
x.i3 = 41/8,
x.i2 = 41/8,
x.i1 = -41/8,
x.r = -1953/64]
Other example:
Proving that
x * y != y * x
Code:
x = Quaternion("x")
y = Quaternion("y")
a1, b1, c1, d1 = Reals('a1 b1 c1 d1')
a2, b2, c2, d2 = Reals('a2 b2 c2 d2')
x.r = a1
x.i1 = b1
x.i2 = c1
x.i3 = d1
y.r = a2
y.i1 = b2
y.i2 = c2
y.i3 = d2
print simplify((x * y - y * x).r)
print simplify((x * y - y * x).i1)
print simplify((x * y - y * x).i2)
print simplify((x * y - y * x).i3)
Output:
0
2·c2·d1 + -2·c1·d2
-2·b2·d1 + 2·b1·d2
2·b2·c1 + -2·b1·c2
Other example : Proving that the quaternions
A = (1+ I)/sqrt(2),
B =(1 + J)/sqrt(2),
C = (1 + K)/sqrt(2)
generate a representation of the Braid Group, that is to say, we have that
ABA = BAB, ACA = CAC, BCB = CBC.
Code:
A = Quaternion('A')
B = Quaternion('B')
C = Quaternion('C')
A.r = 1/Sqrt(2)
A.i1 = 1/Sqrt(2)
A.i2 = 0
A.i3 = 0
B.r = 1/Sqrt(2)
B.i1 = 0
B.i2 = 1/Sqrt(2)
B.i3 = 0
C.r = 1/Sqrt(2)
C.i1 = 0
C.i2 = 0
C.i3 = 1/Sqrt(2)
print simplify((A*B*A-B*A*B).r)
print simplify((A*B*A-B*A*B).i1)
print simplify((A*B*A-B*A*B).i2)
print simplify((A*B*A-B*A*B).i3)
print "Proved : ABA = BAB:"
print simplify((A*C*A-C*A*C).r)
print simplify((A*C*A-C*A*C).i1)
print simplify((A*C*A-C*A*C).i2)
print simplify((A*C*A-C*A*C).i3)
print "Proved : ACA = CAC:"
print simplify((B*C*B-C*B*C).r)
print simplify((B*C*B-C*B*C).i1)
print simplify((B*C*B-C*B*C).i2)
print simplify((B*C*B-C*B*C).i3)
print "Proved : BCB = CBC:"
Output:
0
0
0
0
Proved : ABA = BAB.
0
0
0
0
Proved : ACA = CAC.
0
0
0
0
Proved : BCB = CBC.
Other example: Proving that
x / x = 1
for every invertible quaternion:
Code:
x = Quaternion("x")
a, a1, a2, a3 = Reals('a a1 a2 a3')
x.r = a
x.i1 = a1
x.i2 = a2
x.i3 = a3
s = Solver()
s.add(Or(a != 0, a1 != 0, a2 != 0, a3 != 0), Not((x/x).r == 1))
print s.check()
s1 = Solver()
s1.add(Or(a != 0, a1 != 0, a2 != 0, a3 != 0), Not((x/x).i1 == 0))
print s1.check()
s2 = Solver()
s2.add(Or(a != 0, a1 != 0, a2 != 0, a3 != 0), Not((x/x).i2 == 0))
print s2.check()
s3 = Solver()
s3.add(Or(a != 0, a1 != 0, a2 != 0, a3 != 0), Not((x/x).i3 == 0))
print s3.check()
Output:
unsat
unsat
unsat
unsat
Please let me know what you think about the "quaternion" code and how it can be improved. Many thanks.

Resources