Inner product of two 16bit integer vectors with AVX2 in C++ - vectorization

I am searching for the most efficient way to multiply two aligned int16_t arrays whose length can be divided by 16 with AVX2.
After multiplication into a vector x I started with _mm256_extracti128_si256 and _mm256_castsi256_si128 to have the low and high part of x and added them with _mm_add_epi16.
I copied the result register and applied _mm_move_epi64 to the original register and added both again with _mm_add_epi16. Now, I think that I have:
-, -, -, -, x15+x7+x11+x3, x14+x6+x10+x2, x13+x5+x9+x1, x12+x4+x8+x0
within the 128bit register. But now I am stuck and don't know how to efficiently sum up the remaining four entries and how to extract the 16bit result.

Following the comments and hours of google my working solution:
// AVX multiply
hash = 1;
start1 = std::chrono::high_resolution_clock::now();
for(int i=0; i<2000000; i++) {
ZTYPE* xv = al_entr1.c.data();
ZTYPE* yv = al_entr2.c.data();
__m256i tres = _mm256_setzero_si256();
for(int ii=0; ii < MAX_SIEVING_DIM; ii = ii+16/*8*/)
{
// editor's note: alignment required. Use loadu for unaligned
__m256i xr = _mm256_load_si256((__m256i*)(xv+ii));
__m256i yr = _mm256_load_si256((__m256i*)(yv+ii));
const __m256i tmp = _mm256_madd_epi16 (xr, yr);
tres = _mm256_add_epi32(tmp, tres);
}
// Reduction
const __m128i x128 = _mm_add_epi32 ( _mm256_extracti128_si256(tres, 1), _mm256_castsi256_si128(tres));
const __m128i x128_up = _mm_shuffle_epi32(x128, 78);
const __m128i x64 = _mm_add_epi32 (x128, x128_up);
const __m128i _x32 = _mm_hadd_epi32(x64, x64);
const int res = _mm_extract_epi32(_x32, 0);
hash |= res;
}
finish1 = std::chrono::high_resolution_clock::now();
elapsed1 = finish1 - start1;
std::cout << "AVX multiply: " <<elapsed1.count() << " sec. (" << hash << ")" << std::endl;
It is at least the fastest solution so far:
std::inner_product: 0.819781 sec. (-14335)
std::inner_product (aligned): 0.964058 sec. (-14335)
naive multiply: 0.588623 sec. (-14335)
Unroll multiply: 0.505639 sec. (-14335)
AVX multiply: 0.0488352 sec. (-14335)

Related

How to Calculate CRC Starting at Last Byte

I'm trying to implement a CRC-CCITT calculator in VHDL. I was able to initially do that; however, I recently found out that data is delivered starting at the least-significant byte. In my code, data is transmitted 7 bytes at a time through a frame. So let's say we have the following data: 123456789 in ASCII or 313233343536373839 in hex. The data would be transmitted as such (with the following CRC):
-- First frame of data
RxFrame.Data <= (
1 => x"39", -- LSB
2 => x"38",
3 => x"37",
4 => x"36",
5 => x"35",
6 => x"34",
7 => x"33"
);
-- Second/last frame of data
RxFrame.Data <= (
1 => x"32",
2 => x"31", -- MSB
3 => xx, -- "xx" means irrelevant data, not part of CRC calculation.
4 => xx, -- This occurs only in the last frame, when it specified in
5 => xx, -- byte 0 which bytes contain data
6 => xx,
7 => xx
);
-- Calculated CRC should be 0x31C3
Another example with data 0x4376669A1CFC048321313233343536373839 and its correct CRC is shown below:
-- First incoming frame of data
RxFrame.Data <= (
1 => x"39", -- LSB
2 => x"38",
3 => x"37",
4 => x"36",
5 => x"35",
6 => x"34",
7 => x"33"
);
-- Second incoming frame of data
RxFrame.Data <= (
1 => x"32",
2 => x"31",
3 => x"21",
4 => x"83",
5 => x"04",
6 => x"FC",
7 => x"1C"
);
-- Third/last incoming frame of data
RxFrame.Data <= (
1 => x"9A",
2 => x"66",
3 => x"76",
4 => x"43", -- MSB
5 => xx, -- Irrelevant data, specified in byte 0
6 => xx,
7 => xx
);
-- Calculated CRC should be 0x2848
Is there a concept I'm missing? Is there a way to calculate the CRC with the data being received in reverse order? I am implementing this for CANopen SDO block protocols. Thanks!
CRC calculation algorithm to verify SDO block transfer from CANopen standard
Example code to generate a CRC16 with the bytes read in reverse (last byte first), using a function to do a carryless multiply modulo the CRC polynomial. An explanation follows.
#include <stdio.h>
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
#define POLY (0x1021u)
/* carryless multiply modulo crc polynomial */
uint16_t MpyModPoly(uint16_t a, uint16_t b) /* (a*b)%poly */
{
uint16_t pd = 0;
uint16_t i;
for(i = 0; i < 16; i++){
/* assumes twos complement */
pd = (pd<<1)^((0-(pd>>15))&POLY);
pd ^= (0-(b>>15))&a;
b <<= 1;
}
return pd;
}
/* generate crc in reverse byte order */
uint16_t Crc16R(uint8_t * b, size_t sz)
{
uint8_t *e = b + sz; /* end of bfr ptr */
uint16_t crc = 0u; /* crc */
uint16_t pdm = 0x100u; /* padding multiplier */
while(e > b){ /* generate crc */
pdm = MpyModPoly(0x100, pdm);
crc ^= MpyModPoly( *--e, pdm);
}
return(crc);
}
/* msg will be processed in reverse order */
static uint8_t msg[] = {0x43,0x76,0x66,0x9A,0x1C,0xFC,0x04,0x83,
0x21,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
0x38,0x39};
int main()
{
uint16_t crc;
crc = Crc16R(msg, sizeof(msg));
printf("%04x\n", crc);
return 0;
}
Example code using X86 xmm pclmulqdq and psrlq, to emulate a 16 bit by 16 bit hardware (VHDL) carryless multiply:
/* __m128i is an intrinsic for X86 128 bit xmm register */
static __m128i poly = {.m128i_u32[0] = 0x00011021u}; /* poly */
static __m128i invpoly = {.m128i_u32[0] = 0x00008898u}; /* 2^31 / poly */
/* carryless multiply modulo crc polynomial */
/* using xmm pclmulqdq and psrlq */
uint16_t MpyModPoly(uint16_t a, uint16_t b)
{
__m128i ma, mb, mp, mt;
ma.m128i_u64[0] = a;
mb.m128i_u64[0] = b;
mp = _mm_clmulepi64_si128(ma, mb, 0x00); /* mp = a*b */
mt = _mm_srli_epi64(mp, 16); /* mt = mp>>16 */
mt = _mm_clmulepi64_si128(mt, invpoly, 0x00); /* mt = mt*ipoly */
mt = _mm_srli_epi64(mt, 15); /* mt = mt>>15 = (a*b)/poly */
mt = _mm_clmulepi64_si128(mt, poly, 0x00); /* mt = mt*poly */
return mp.m128i_u16[0] ^ mt.m128i_u16[0]; /* ret mp^mt */
}
/* external code to generate invpoly */
#define POLY (0x11021u)
static __m128i invpoly; /* 2^31 / poly */
void GenMPoly(void) /* generate __m12i8 invpoly */
{
uint32_t N = 0x10000u; /* numerator = x^16 */
uint32_t Q = 0; /* quotient = 0 */
for(size_t i = 0; i <= 15; i++){ /* 31 - 16 = 15 */
Q <<= 1;
if(N&0x10000u){
Q |= 1;
N ^= POLY;
}
N <<= 1;
}
invpoly.m128i_u16[0] = Q;
}
Explanation: consider the data as separate strings of ever increasing length, padded with zeroes at the end. For the first few bytes of your example, the logic would calculate
CRC = CRC16({39})
CRC ^= CRC16({38 00})
CRC ^= CRC16({37 00 00})
CRC ^= CRC16({36 00 00 00})
...
To speed up this calculation, rather than actually pad with n zero bytes, you can do a carryless multiply of a CRC by 2^{n·8} modulo POLY, where POLY is the 17 bit polynomial used for CRC16:
CRC = CRC16({39})
CRC ^= (CRC16({38}) · (2^08 % POLY)) % POLY
CRC ^= (CRC16({37}) · (2^10 % POLY)) % POLY
CRC ^= (CRC16({36}) · (2^18 % POLY)) % POLY
...
A carryless multiply modulo POLY is equivalent to what CRC16 does, so this translates into pseudo code (all values in hex, 2^8 = 100)
CRC = 0
PDM = 100 ;padding multiplier
PDM = (100 · PDM) % POLY ;main loop (2 lines per byte)
CRC ^= ( 39 · PDM) % POLY
PDM = (100 · PDM) % POLY
CRC ^= ( 38 · PDM) % POLY
PDM = (100 · PDM) % POLY
CRC ^= ( 37 · PDM) % POLY
PDM = (100 · PDM) % POLY
CRC ^= ( 36 · PDM) % POLY
...
Implementing (A · B) % POLY is based on binary math:
(A · B) % POLY = (A · B) ^ (((A · B) / POLY) · POLY)
Where multiply is carryless (XOR instead of add) and divide is borrowless (XOR instead of subtract). Since the divide is borrowless, and most significant term of POLY is x^16, the quotient
Q = (A · B) / POLY
only depends on the upper 16 bits of (A · B). Dividing by POLY uses multiplication by the 16 bit constant IPOLY = (2^31)/POLY followed by a right shift:
Q = (A · B) / POLY = (((A · B) >> 16) · IPOLY) >> 15
The process uses a 16 bit by 16 bit carryless multiply, producing a 31 bit product.
POLY = 0x11021u ; CRC polynomial (17 bit)
IPOLY = 0x08898u ; 2^31 / POLY
; generated by external software
MpyModPoly(A, B)
{
MP = A · B ; MP = A · B
MT = MP >> 16 ; MT = MP >> 16
MT = MT · IPOLY ; MT = MT · IPOLY
MT = MT >> 15 ; MT = (A · B) / POLY
MT = MT · POLY ; MT = ((A · B) / POLY) * POLY
return MP xor MT ; (A·B) ^ (((A · B) / POLY) · POLY)
}
A hardware based carryless multiply would look something like this 4 bit · 4 bit example.
p[] = [a3 a2 a1 a0] · [b3 b2 b1 b0]
p[] is a 7 bit product generated with 7 parallel circuits.
The time for multiply would be worst case propagation time for p3.
p6 = a3&b3
p5 = a3&b2 ^ a2&b3
p4 = a3&b1 ^ a2&b2 ^ a1&b3
p3 = a3&b0 ^ a2&b1 ^ a1&b2 ^ a0&b3
p2 = a2&b0 ^ a1&b1 ^ a0&b2
p1 = a1&b0 ^ a0&b1
p0 = a0&b0
If the xor gates available only have 2 bit inputs, the logic can
be split up. For example:
p3 = (a3&b0 ^ a2&b1) ^ (a1&b2 ^ a0&b3)
I don't know if your VHDL toolset includes a library for carryless multiply. For a 16 bit by 16 bit multiply resulting in a 31 bit product (p30 to p00), p15 has 16 outputs from the 16 ands (in parallel), which could be xor'ed using a tree like structure, 8 xors in parallel feeding into 4 xors in parallel feeding into 2 xor's in parallel into a single xor. So the propagation time would be 1 and and 4 xor propagation times.
Here is an example in C that you can adapt. Since you mentioned VHDL, this is a bit-wise implementation suitable for casting into gates and flip-flops. However, if cycles are more precious to you than memory and gates, then there is also a byte-wise table-driven version that would run in 1/8 the number of cycles.
What this does is the inverse of what is done in a normal CRC calculation. It then applies the same size input in zeros with a normal CRC to get what the normal CRC would have been on that input. Running the zeros through takes the same number of cycles as the inverse CRC, i.e. O(n) where n is the size of the input. If that latency is too large, that can be reduced to O(log n) cycles, with some investment in gates.
#include <stddef.h>
// Update crc with the CRC-16/XMODEM of n zero bytes. (This can be done in
// O(log n) time or cycles instead of O(n), with a little more effort.)
static unsigned crc16x_zeros_bit(unsigned crc, size_t n) {
for (size_t i = 0; i < n; i++)
for (int k = 0; k < 8; k++)
crc = crc & 0x8000 ? (crc << 1) ^ 0x1021 : crc << 1;
return crc & 0xffff;
}
// Update crc with the CRC-16/XMODEM of the len bytes at mem in reverse. If mem
// is NULL, then return the initial value for the CRC. When done,
// crc16x_zeros_bit() must be used to apply the total length of zero bytes, in
// order to get what the CRC would have been if it were calculated on the bytes
// fed in the opposite order.
static unsigned crc16x_inverse_bit(unsigned crc, void const *mem, size_t len) {
unsigned char const *data = mem;
if (data == NULL)
return 0;
crc &= 0xffff;
for (size_t i = 0; i < len; i++) {
for (int k = 0; k < 8; k++)
crc = crc & 1 ? (crc >> 1) ^ 0x8810 : crc >> 1;
crc ^= (unsigned)data[i] << 8;
}
return crc;
}
#include <stdio.h>
int main(void) {
// Do framed example.
unsigned crc = crc16x_inverse_bit(0, NULL, 0);
crc = crc16x_inverse_bit(crc, (void const *)"9876543", 7);
crc = crc16x_inverse_bit(crc, (void const *)"21", 2);
crc = crc16x_zeros_bit(crc, 9);
printf("%04x\n", crc);
// Do another one.
crc = crc16x_inverse_bit(0, NULL, 0);
crc = crc16x_inverse_bit(crc, (void const *)"9876543", 7);
crc = crc16x_inverse_bit(crc, (void const *)"21!\x83\x04\xfc\x1c", 7);
crc = crc16x_inverse_bit(crc, (void const *)"\x9a" "fvC", 4);
crc = crc16x_zeros_bit(crc, 18);
printf("%04x\n", crc);
return 0;
}
Here is the O(log n) version of crc16x_zeros_bit():
// Return a(x) multiplied by b(x) modulo p(x), where p(x) is the CRC
// polynomial. For speed, a cannot be zero.
static inline unsigned multmodp(unsigned a, unsigned b) {
unsigned p = 0;
for (;;) {
if (a & 1) {
p ^= b;
if (a == 1)
break;
}
a >>= 1;
b = b & 0x8000 ? (b << 1) ^ 0x1021 : b << 1;
}
return p & 0xffff;
}
// Return x^(8n) modulo p(x).
static unsigned x2nmodp(size_t n) {
unsigned p = 1; // x^0 == 1
unsigned q = 0x10; // x^2^2
while (n) {
q = multmodp(q, q); // x^2^k mod p(x), k = 3,4,...
if (n & 1)
p = multmodp(q, p);
n >>= 1;
}
return p;
}
// Update crc with the CRC-16/XMODEM of n zero bytes.
static unsigned crc16x_zeros_bit(unsigned crc, size_t n) {
return multmodp(x2nmodp(n), crc);
}

Eigen FFT library

I am trying to use Eigen unsupported FFT library using FFTW backend. Specifically I am want to do a 2D FFT. Here's my code :
void fft2(Eigen::MatrixXf * matIn,Eigen::MatrixXcf * matOut)
{
const int nRows = matIn->rows();
const int nCols = matIn->cols();
Eigen::FFT< float > fft;
for (int k = 0; k < nRows; ++k) {
Eigen::VectorXcf tmpOut(nRows);
fft.fwd(tmpOut, matIn->row(k));
matOut->row(k) = tmpOut;
}
for (int k = 0; k < nCols; ++k) {
Eigen::VectorXcf tmpOut(nCols);
fft.fwd(tmpOut, matOut->col(k));
matOut->col(k) = tmpOut;
}
}
I have 2 problems :
First, I get a segmentation fault when using this code on some matrix. This error doesn't happen for all matrixes. I guess it's related to an alignment error. I use the functions in the following way :
Eigen::MatrixXcf matFFT(mat.rows(),mat.cols());
fft2(&matFloat,&matFFT);
where mat can be any matrix. Funnily, the code plants only when I compute the FFT over the 2nd dimension, never on the first one. This doesn't happen with kissFFT backend.
Second I don't get the same result as Matlab (that uses FFTW), when the function works. Eg :
Input Matrix :
[2, 1, 2]
[3, 2, 1]
[1, 2, 3]
Eigen gives :
[ (0,5), (0.5,0.86603), (0,0.5)]
[ (-4.3301,-2.5), (-1,-1.7321), (0.31699,-1.549)]
[ (-1.5,-0.86603), (2,3.4641), (2,3.4641)]
Matlab gives :
17 + 0i 0.5 + 0.86603i 0.5 - 0.86603i
-1 + 0i -1 - 1.7321i 2 - 3.4641i
-1 + 0i 2 + 3.4641i -1 + 1.7321i
Only the central part is the same.
Any help would be welcome.
I failed to activate EIGEN_FFTW_DEFAULT in my first solution, activating it reveals an error in the fftw-support implementation of Eigen. The following works:
#define EIGEN_FFTW_DEFAULT
#include <iostream>
#include <unsupported/Eigen/FFT>
int main(int argc, char *argv[])
{
Eigen::MatrixXf A(3,3);
A << 2,1,2, 3,2,1, 1,2,3;
const int nRows = A.rows();
const int nCols = A.cols();
std::cout << A << "\n\n";
Eigen::MatrixXcf B(3,3);
Eigen::FFT< float > fft;
for (int k = 0; k < nRows; ++k) {
Eigen::VectorXcf tmpOut(nRows);
fft.fwd(tmpOut, A.row(k));
B.row(k) = tmpOut;
}
std::cout << B << "\n\n";
Eigen::FFT< float > fft2; // Workaround: Using the same FFT object for a real and a complex FFT seems not to work with FFTW
for (int k = 0; k < nCols; ++k) {
Eigen::VectorXcf tmpOut(nCols);
fft2.fwd(tmpOut, B.col(k));
B.col(k) = tmpOut;
}
std::cout << B << '\n';
}
I get this output:
2 1 2
3 2 1
1 2 3
(17,0) (0.5,0.866025) (0.5,-0.866025)
(-1,0) (-1,-1.73205) (2,-3.4641)
(-1,0) (2,3.4641) (-1,1.73205)
Which is the same as your Matlab result.
N.B.: FFTW seems to support 2D real->complex FFT natively (without using individual FFTs). This is likely more efficient.
fftwf_plan fftwf_plan_dft_r2c_2d(int n0, int n1,
float *in, fftwf_complex *out, unsigned flags);

How does #pragma simd reduction(<operator>:<variable>) work under the hood?

I would like to know in more detail how the simd reduction clause used by Intel compilers works under the hood.
In particular, for a loop of the form
double x = x_initial;
#pragma simd reduction(<operator1>:x)
for( int i = 0; i < N; i++ )
x <operator2> some_value;
my naive guess is as follows:
The compiler initializes a private copy of x for each vector lane, then iterates through the loop one vector width at a time. If the vector width is 4 doubles, for example, this would correspond to N/4 iterations plus a peel loop at the end. At each step of the iteration, each lane's private copy of x is updated using operator2, then at the end, the 4 vector lanes' private copies are combined using operator1. The auto-vectorization guide does not appear to address this directly.
I did some experimentation and found some results that agree with my expectation and some that don't. For example, I tried the case
double x = 1;
#pragma simd reduction(*:x) assert
for( int i = 0; i < 16; i++ )
x += a[i]; // All elements of a are equal to 3.0
cout << "x after (*:x), x += a[i] loop: " << x << endl;
where operator1 is * and operator2 is +=. When I compile for avx2, which has a vector width of 4 doubles, the output is 28561 = ( 1 + 4*a[i] )^4. This implies that the code first initializes 4 lane-private copies of x to 1, then adds 3 to each of those copies 4 times as the 4-double-wide vector lane iterates across the trip count of 16. Each lane-private copy of x is now equal to 13. Finally, the lane-private copies are combined (reduced) using operator2 which is *, yielding 13*13*13*13 = 28561.
However, when I switch the * and + operators, like so
x = 1;
#pragma simd reduction(+:x) assert
for( int i = 0; i < 16; i++ )
x *= a[i];
cout << "x after (+:x), x *= a[i] loop: " << x << endl;
and compile again for avx2, the output is 1.0. If my theory were correct, each vector lane should end up containing a value of 1*3^4, which would then be combined using + to yield 4*3^4 = 324. Evidently this is not the case. What am I missing?

MSE for two Vec3b images in OpenCV

I have two Vec3b images and I want to find the MSE (Mean Square Error) between them. I know how to do it when you have two uchar images, but when you have two Vec3b images where there are 3 different values stored for each pixel how do you calculate it?
You should compute the Euclidean distance for each pair of pixels:
MSE = 0;
for(int i = 0; i < width; i++)
for(int j = 0; j < height; j++)
MSE += sqrt(pow(img1.at<Vec3b>(j, i)[0] - img2.at<Vec3b>(j, i)[0]), 2) + pow(img1.at<Vec3b>(j, i)[1] - img2.at<Vec3b>(j, i)[1]), 2) + pow(img1.at<Vec3b>(j, i)[2] - img2.at<Vec3b>(j, i)[2]), 2));
MSE /= width * height;
This code can be optimized and if you convert your image from BGR to HSV, you could get better results according what you want to do.
To calculate the Mean Square Error for 1D and 3D images in opencv, you can use this post which might be faster since image scanning takes longer times.
double getMSE(Mat& I1, Mat& I2)
{
Mat s1;
// save the I! and I2 type before converting to float
int im1type = I1.type();
int im2type = I2.type();
// convert to float to avoid producing zero for negative numbers
I1.convertTo(I1, CV_32F);
I2.convertTo(I2, CV_32F);
absdiff(I1, I2, s1); // |I1 - I2|
s1.convertTo(s1, CV_32F); // cannot make a square on 8 bits
s1 = s1.mul(s1); // |I1 - I2|^2
Scalar s = sum(s1); // sum elements per channel
double sse = s.val[0] + s.val[1] + s.val[2]; // sum channels
if( sse <= 1e-10) // for small values return zero
return 0;
else
{
double mse =sse /(double)(I1.channels() * I1.total());
return mse;
// Instead of returning MSE, the tutorial code returned PSNR (below).
//double psnr = 10.0*log10((255*255)/mse);
//return psnr;
}
// return I1 and I2 to their initial types
I1.convertTo(I1, im1type);
I2.convertTo(I2, im2type);
}
The above code returns zero for small mse values (under 1e-10). Terms s.val1 and s.val[2] are zero for 1D images.
If you want to check for 1D image input, use the following code to test (with random unsigned numbers):
Mat I1(12, 12, CV_8UC1), I2(12, 12, CV_8UC1);
double low = 0;
double high = 255;
cv::randu(I1, Scalar(low), Scalar(high));
cv::randu(I2, Scalar(low), Scalar(high));
double mse = getMSE(I1, I2);
cout << mse << endl;
If you want to check for 3D image input, use the following code to test (with random unsigned numbers):
Mat I1(12, 12, CV_8UC3), I2(12, 12, CV_8UC3);
double low = 0;
double high = 255;
cv::randu(I1, Scalar(low), Scalar(high));
cv::randu(I2, Scalar(low), Scalar(high));
double mse = getMSE(I1, I2);
cout << mse << endl;

CRC Calculation Of A Mostly Static Data Stream

Background:
I have a section of memory, 1024 bytes. The last 1020 bytes will always be the same. The first 4 bytes will change (serial number of a product). I need to calculate the CRC-16 CCITT (0xFFFF starting, 0x1021 mask) for the entire section of memory, CRC_WHOLE.
Question:
Is it possible to calculate the CRC for only the first 4 bytes, CRC_A, then apply a function such as the one below to calculate the full CRC? We can assume that the checksum for the last 1020 bytes, CRC_B, is already known.
CRC_WHOLE = XOR(CRC_A, CRC_B)
I know that this formula does not work (tried it), but I am hoping that something similar exists.
Yes. You can see how in zlib's crc32_combine(). If you have two sequences A and B, then the pure CRC of AB is the exclusive-or of the CRC of A0 and the CRC of 0B, where the 0's represent a series of zero bytes with the length of the corresponding sequence, i.e. B and A respectively.
For your application, you can pre-compute a single operator that applies 1020 zeros to the CRC of your first four bytes very rapidly. Then you can exclusive-or that with the pre-computed CRC of the 1020 bytes.
Update:
Here is a post of mine from 2008 with a detailed explanation that #ArtemB discovered (that I had forgotten about):
crc32_combine() in zlib is based on two key tricks. For what follows,
we set aside the fact that the standard 32-bit CRC is pre and post-
conditioned. We can deal with that later. Assume for now a CRC that
has no such conditioning, and so starts with the register filled with
zeros.
Trick #1: CRCs are linear. So if you have stream X and stream Y of
the same length and exclusive-or the two streams bit-by-bit to get Z,
i.e. Z = X ^ Y (using the C notation for exclusive-or), then CRC(Z) =
CRC(X) ^ CRC(Y). For the problem at hand we have two streams A and B
of differing length that we want to concatenate into stream Z. What
we have available are CRC(A) and CRC(B). What we want is a quick way
to compute CRC(Z). The trick is to construct X = A concatenated with
length(B) zero bits, and Y = length(A) zero bits concatenated with B.
So if we represent concatenation simply by juxtaposition of the
symbols, X = A0, Y = 0B, then X^Y = Z = AB. Then we have CRC(Z) =
CRC(A0) ^ CRC(0B).
Now we need to know CRC(A0) and CRC(0B). CRC(0B) is easy. If we feed
a bunch of zeros to the CRC machine starting with zero, the register
is still filled with zeros. So it's as if we did nothing at all.
Therefore CRC(0B) = CRC(B).
CRC(A0) requires more work however. Taking a non-zero CRC and feeding
zeros to the CRC machine doesn't leave it alone. Every zero changes
the register contents. So to get CRC(A0), we need to set the register
to CRC(A), and then run length(B) zeros through it. Then we can
exclusive-or the result of that with CRC(B) = CRC(0B), and we get what
we want, which is CRC(Z) = CRC(AB). Voila!
Well, actually the voila is premature. I wasn't at all satisfied with
that answer. I didn't want a calculation that took a time
proportional to the length of B. That wouldn't save any time compared
to simply setting the register to CRC(A) and running the B stream
through. I figured there must be a faster way to compute the effect
of feeding n zeros into the CRC machine (where n = length(B)). So
that leads us to:
Trick #2: The CRC machine is a linear state machine. If we know the
linear transformation that occurs when we feed a zero to the machine,
then we can do operations on that transformation to more efficiently
find the transformation that results from feeding n zeros into the
machine.
The transformation of feeding a single zero bit into the CRC machine
is completely represented by a 32x32 binary matrix. To apply the
transformation we multiply the matrix by the register, taking the
register as a 32 bit column vector. For the matrix multiplication in
binary (i.e. over the Galois Field of 2), the role of multiplication
is played by and'ing, and the role of addition is played by exclusive-
or'ing.
There are a few different ways to construct the magic matrix that
represents the transformation caused by feeding the CRC machine a
single zero bit. One way is to observe that each column of the matrix
is what you get when your register starts off with a single one in
it. So the first column is what you get when the register is 100...
and then feed a zero, the second column comes from starting with
0100..., etc. (Those are referred to as basis vectors.) You can see
this simply by doing the matrix multiplication with those vectors.
The matrix multiplication selects the column of the matrix
corresponding to the location of the single one.
Now for the trick. Once we have the magic matrix, we can set aside
the initial register contents for a while, and instead use the
transformation for one zero to compute the transformation for n
zeros. We could just multiply n copies of the matrix together to get
the matrix for n zeros. But that's even worse than just running the n
zeros through the machine. However there's an easy way to avoid most
of those matrix multiplications to get the same answer. Suppose we
want to know the transformation for running eight zero bits, or one
byte through. Let's call the magic matrix that represents running one
zero through: M. We could do seven matrix multiplications to get R =
MxMxMxMxMxMxMxM. Instead, let's start with MxM and call that P. Then
PxP is MxMxMxM. Let's call that Q. Then QxQ is R. So now we've
reduced the seven multiplications to three. P = MxM, Q = PxP, and R =
QxQ.
Now I'm sure you get the idea for an arbitrary n number of zeros. We
can very rapidly generate transformation matrices Mk, where Mk is the
transformation for running 2k zeros through. (In the
paragraph above M3 is R.) We can make M1 through Mk with only k
matrix multiplications, starting with M0 = M. k only has to be as
large as the number of bits in the binary representation of n. We can
then pick those matrices where there are ones in the binary
representation of n and multiply them together to get the
transformation of running n zeros through the CRC machine. So if n =
13, compute M0 x M2 x M3.
If j is the number of one's in the binary representation of n, then we
just have j - 1 more matrix multiplications. So we have a total of k
j - 1 matrix multiplications, where j <= k = floor(logbase2(n)).
Now we take our rapidly constructed matrix for n zeros, and multiply
that by CRC(A) to get CRC(A0). We can compute CRC(A0) in O(log(n))
time, instead of O(n) time. We exclusive or that with CRC(B) and
Voila! (really this time), we have CRC(Z).
That's what zlib's crc32_combine() does.
I will leave it as an exercise for the reader as to how to deal with
the pre and post conditioning of the CRC register. You just need to
apply the linearity observations above. Hint: You don't need to know
length(A). In fact crc32_combine() only takes three arguments:
CRC(A), CRC(B), and length(B) (in bytes).
Below is example C code for an alternative approach for CRC(A0). Rather than working with a matrix, a CRC can be cycled forward n bits by muliplying (CRC · ((2^n)%POLY)%POLY . So the repeated squaring is performed on an integer rather than a matrix. If n is constant, then (2^n)%POLY can be pre-computed.
/* crcpad.c - crc - data has a large number of trailing zeroes */
#include <stdio.h>
#include <stdlib.h>
typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
#define POLY (0x04c11db7u)
static uint32_t crctbl[256];
void GenTbl(void) /* generate crc table */
{
uint32_t crc;
uint32_t c;
uint32_t i;
for(c = 0; c < 0x100; c++){
crc = c<<24;
for(i = 0; i < 8; i++)
/* assumes twos complement */
crc = (crc<<1)^((0-(crc>>31))&POLY);
crctbl[c] = crc;
}
}
uint32_t GenCrc(uint8_t * bfr, size_t size) /* generate crc */
{
uint32_t crc = 0u;
while(size--)
crc = (crc<<8)^crctbl[(crc>>24)^*bfr++];
return(crc);
}
/* carryless multiply modulo crc */
uint32_t MpyModCrc(uint32_t a, uint32_t b) /* (a*b)%crc */
{
uint32_t pd = 0;
uint32_t i;
for(i = 0; i < 32; i++){
/* assumes twos complement */
pd = (pd<<1)^((0-(pd>>31))&POLY);
pd ^= (0-(b>>31))&a;
b <<= 1;
}
return pd;
}
/* exponentiate by repeated squaring modulo crc */
uint32_t PowModCrc(uint32_t p) /* pow(2,p)%crc */
{
uint32_t prd = 0x1u; /* current product */
uint32_t sqr = 0x2u; /* current square */
while(p){
if(p&1)
prd = MpyModCrc(prd, sqr);
sqr = MpyModCrc(sqr, sqr);
p >>= 1;
}
return prd;
}
/* # data bytes */
#define DAT ( 32)
/* # zero bytes */
#define PAD (992)
/* DATA+PAD */
#define CNT (1024)
int main()
{
uint32_t pmc;
uint32_t crc;
uint32_t crf;
uint32_t i;
uint8_t *msg = malloc(CNT);
for(i = 0; i < DAT; i++) /* generate msg */
msg[i] = (uint8_t)rand();
for( ; i < CNT; i++)
msg[i] = 0;
GenTbl(); /* generate crc table */
crc = GenCrc(msg, CNT); /* generate crc normally */
crf = GenCrc(msg, DAT); /* generate crc for data */
pmc = PowModCrc(PAD*8); /* pmc = pow(2,PAD*8)%crc */
crf = MpyModCrc(crf, pmc); /* crf = (crf*pmc)%crc */
printf("%08x %08x\n", crc, crf);
free(msg);
return 0;
}
Example C code using intrinsic for carryless multiply, pclmulqdq == _mm_clmulepi64_si128:
/* crcpadm.c - crc - data has a large number of trailing zeroes */
/* pclmulqdq intrinsic version */
#include <stdio.h>
#include <stdlib.h>
#include <intrin.h>
typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
#define POLY (0x104c11db7ull)
#define POLYM ( 0x04c11db7u)
static uint32_t crctbl[256];
static __m128i poly; /* poly */
static __m128i invpoly; /* 2^64 / POLY */
void GenMPoly(void) /* generate __m12i8 poly info */
{
uint64_t N = 0x100000000ull;
uint64_t Q = 0;
for(size_t i = 0; i < 33; i++){
Q <<= 1;
if(N&0x100000000ull){
Q |= 1;
N ^= POLY;
}
N <<= 1;
}
poly.m128i_u64[0] = POLY;
invpoly.m128i_u64[0] = Q;
}
void GenTbl(void) /* generate crc table */
{
uint32_t crc;
uint32_t c;
uint32_t i;
for(c = 0; c < 0x100; c++){
crc = c<<24;
for(i = 0; i < 8; i++)
/* assumes twos complement */
crc = (crc<<1)^((0-(crc>>31))&POLYM);
crctbl[c] = crc;
}
}
uint32_t GenCrc(uint8_t * bfr, size_t size) /* generate crc */
{
uint32_t crc = 0u;
while(size--)
crc = (crc<<8)^crctbl[(crc>>24)^*bfr++];
return(crc);
}
/* carryless multiply modulo crc */
uint32_t MpyModCrc(uint32_t a, uint32_t b) /* (a*b)%crc */
{
__m128i ma, mb, mp, mt;
ma.m128i_u64[0] = a;
mb.m128i_u64[0] = b;
mp = _mm_clmulepi64_si128(ma, mb, 0x00); /* p[0] = a*b */
mt = _mm_clmulepi64_si128(mp, invpoly, 0x00); /* t[1] = (p[0]*((2^64)/POLY))>>64 */
mt = _mm_clmulepi64_si128(mt, poly, 0x01); /* t[0] = t[1]*POLY */
return mp.m128i_u32[0] ^ mt.m128i_u32[0]; /* ret = p[0] ^ t[0] */
}
/* exponentiate by repeated squaring modulo crc */
uint32_t PowModCrc(uint32_t p) /* pow(2,p)%crc */
{
uint32_t prd = 0x1u; /* current product */
uint32_t sqr = 0x2u; /* current square */
while(p){
if(p&1)
prd = MpyModCrc(prd, sqr);
sqr = MpyModCrc(sqr, sqr);
p >>= 1;
}
return prd;
}
/* # data bytes */
#define DAT ( 32)
/* # zero bytes */
#define PAD (992)
/* DATA+PAD */
#define CNT (1024)
int main()
{
uint32_t pmc;
uint32_t crc;
uint32_t crf;
uint32_t i;
uint8_t *msg = malloc(CNT);
GenMPoly(); /* generate __m128 polys */
GenTbl(); /* generate crc table */
for(i = 0; i < DAT; i++) /* generate msg */
msg[i] = (uint8_t)rand();
for( ; i < CNT; i++)
msg[i] = 0;
crc = GenCrc(msg, CNT); /* generate crc normally */
crf = GenCrc(msg, DAT); /* generate crc for data */
pmc = PowModCrc(PAD*8); /* pmc = pow(2,PAD*8)%crc */
crf = MpyModCrc(crf, pmc); /* crf = (crf*pmc)%crc */
printf("%08x %08x\n", crc, crf);
free(msg);
return 0;
}

Resources