A mod B, A and B are very large numbers - modulo

I want to know if A and B are relatively prime using Euclidean Algorithm. A and B are large numbers that cannot be stored in any data type(in C), so they are stored in a linked list. In the algorithm, the operator % is used. My question is, is there a way to compute for A mod B without actually directly using the % operator. I found out that % is distributive over addition:
A%B = ((a1%B)+(a2%B))%B.
But the problem still persists because I will still be doing %B operations.

You need calculate a % b without the % operator. OK? By definition the modulo operation finds the remainder after division of one number by another.
In python:
# mod = a % b
def mod(a, b):
return a-b*int(a/b)
>>> x = [mod(i,j) for j in range(1,100) for i in range(1,100)]
>>> y = [i % j for j in range(1,100) for i in range(1,100)]
>>> x == y
True
In C++:
#include <iostream>
#include <math.h>
using namespace std;
unsigned int mod(unsigned int a, unsigned int b) {
return (unsigned int)(a-b*floor(a/b));
}
int main() {
for (unsigned int i=1; i<=sizeof(unsigned int); ++i)
for (unsigned int j=1; j<=sizeof(unsigned int); ++j)
if (mod(i,j) != i%j)
cout << "Somthing wrong!!";
cout << "Proved for all unsigned int!";
return 0;
}
Proved for all unsigned int!
Now, just extend the result to your big numbers...!!!

Related

How to Calculate CRC Starting at Last Byte

I'm trying to implement a CRC-CCITT calculator in VHDL. I was able to initially do that; however, I recently found out that data is delivered starting at the least-significant byte. In my code, data is transmitted 7 bytes at a time through a frame. So let's say we have the following data: 123456789 in ASCII or 313233343536373839 in hex. The data would be transmitted as such (with the following CRC):
-- First frame of data
RxFrame.Data <= (
1 => x"39", -- LSB
2 => x"38",
3 => x"37",
4 => x"36",
5 => x"35",
6 => x"34",
7 => x"33"
);
-- Second/last frame of data
RxFrame.Data <= (
1 => x"32",
2 => x"31", -- MSB
3 => xx, -- "xx" means irrelevant data, not part of CRC calculation.
4 => xx, -- This occurs only in the last frame, when it specified in
5 => xx, -- byte 0 which bytes contain data
6 => xx,
7 => xx
);
-- Calculated CRC should be 0x31C3
Another example with data 0x4376669A1CFC048321313233343536373839 and its correct CRC is shown below:
-- First incoming frame of data
RxFrame.Data <= (
1 => x"39", -- LSB
2 => x"38",
3 => x"37",
4 => x"36",
5 => x"35",
6 => x"34",
7 => x"33"
);
-- Second incoming frame of data
RxFrame.Data <= (
1 => x"32",
2 => x"31",
3 => x"21",
4 => x"83",
5 => x"04",
6 => x"FC",
7 => x"1C"
);
-- Third/last incoming frame of data
RxFrame.Data <= (
1 => x"9A",
2 => x"66",
3 => x"76",
4 => x"43", -- MSB
5 => xx, -- Irrelevant data, specified in byte 0
6 => xx,
7 => xx
);
-- Calculated CRC should be 0x2848
Is there a concept I'm missing? Is there a way to calculate the CRC with the data being received in reverse order? I am implementing this for CANopen SDO block protocols. Thanks!
CRC calculation algorithm to verify SDO block transfer from CANopen standard
Example code to generate a CRC16 with the bytes read in reverse (last byte first), using a function to do a carryless multiply modulo the CRC polynomial. An explanation follows.
#include <stdio.h>
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
#define POLY (0x1021u)
/* carryless multiply modulo crc polynomial */
uint16_t MpyModPoly(uint16_t a, uint16_t b) /* (a*b)%poly */
{
uint16_t pd = 0;
uint16_t i;
for(i = 0; i < 16; i++){
/* assumes twos complement */
pd = (pd<<1)^((0-(pd>>15))&POLY);
pd ^= (0-(b>>15))&a;
b <<= 1;
}
return pd;
}
/* generate crc in reverse byte order */
uint16_t Crc16R(uint8_t * b, size_t sz)
{
uint8_t *e = b + sz; /* end of bfr ptr */
uint16_t crc = 0u; /* crc */
uint16_t pdm = 0x100u; /* padding multiplier */
while(e > b){ /* generate crc */
pdm = MpyModPoly(0x100, pdm);
crc ^= MpyModPoly( *--e, pdm);
}
return(crc);
}
/* msg will be processed in reverse order */
static uint8_t msg[] = {0x43,0x76,0x66,0x9A,0x1C,0xFC,0x04,0x83,
0x21,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
0x38,0x39};
int main()
{
uint16_t crc;
crc = Crc16R(msg, sizeof(msg));
printf("%04x\n", crc);
return 0;
}
Example code using X86 xmm pclmulqdq and psrlq, to emulate a 16 bit by 16 bit hardware (VHDL) carryless multiply:
/* __m128i is an intrinsic for X86 128 bit xmm register */
static __m128i poly = {.m128i_u32[0] = 0x00011021u}; /* poly */
static __m128i invpoly = {.m128i_u32[0] = 0x00008898u}; /* 2^31 / poly */
/* carryless multiply modulo crc polynomial */
/* using xmm pclmulqdq and psrlq */
uint16_t MpyModPoly(uint16_t a, uint16_t b)
{
__m128i ma, mb, mp, mt;
ma.m128i_u64[0] = a;
mb.m128i_u64[0] = b;
mp = _mm_clmulepi64_si128(ma, mb, 0x00); /* mp = a*b */
mt = _mm_srli_epi64(mp, 16); /* mt = mp>>16 */
mt = _mm_clmulepi64_si128(mt, invpoly, 0x00); /* mt = mt*ipoly */
mt = _mm_srli_epi64(mt, 15); /* mt = mt>>15 = (a*b)/poly */
mt = _mm_clmulepi64_si128(mt, poly, 0x00); /* mt = mt*poly */
return mp.m128i_u16[0] ^ mt.m128i_u16[0]; /* ret mp^mt */
}
/* external code to generate invpoly */
#define POLY (0x11021u)
static __m128i invpoly; /* 2^31 / poly */
void GenMPoly(void) /* generate __m12i8 invpoly */
{
uint32_t N = 0x10000u; /* numerator = x^16 */
uint32_t Q = 0; /* quotient = 0 */
for(size_t i = 0; i <= 15; i++){ /* 31 - 16 = 15 */
Q <<= 1;
if(N&0x10000u){
Q |= 1;
N ^= POLY;
}
N <<= 1;
}
invpoly.m128i_u16[0] = Q;
}
Explanation: consider the data as separate strings of ever increasing length, padded with zeroes at the end. For the first few bytes of your example, the logic would calculate
CRC = CRC16({39})
CRC ^= CRC16({38 00})
CRC ^= CRC16({37 00 00})
CRC ^= CRC16({36 00 00 00})
...
To speed up this calculation, rather than actually pad with n zero bytes, you can do a carryless multiply of a CRC by 2^{n·8} modulo POLY, where POLY is the 17 bit polynomial used for CRC16:
CRC = CRC16({39})
CRC ^= (CRC16({38}) · (2^08 % POLY)) % POLY
CRC ^= (CRC16({37}) · (2^10 % POLY)) % POLY
CRC ^= (CRC16({36}) · (2^18 % POLY)) % POLY
...
A carryless multiply modulo POLY is equivalent to what CRC16 does, so this translates into pseudo code (all values in hex, 2^8 = 100)
CRC = 0
PDM = 100 ;padding multiplier
PDM = (100 · PDM) % POLY ;main loop (2 lines per byte)
CRC ^= ( 39 · PDM) % POLY
PDM = (100 · PDM) % POLY
CRC ^= ( 38 · PDM) % POLY
PDM = (100 · PDM) % POLY
CRC ^= ( 37 · PDM) % POLY
PDM = (100 · PDM) % POLY
CRC ^= ( 36 · PDM) % POLY
...
Implementing (A · B) % POLY is based on binary math:
(A · B) % POLY = (A · B) ^ (((A · B) / POLY) · POLY)
Where multiply is carryless (XOR instead of add) and divide is borrowless (XOR instead of subtract). Since the divide is borrowless, and most significant term of POLY is x^16, the quotient
Q = (A · B) / POLY
only depends on the upper 16 bits of (A · B). Dividing by POLY uses multiplication by the 16 bit constant IPOLY = (2^31)/POLY followed by a right shift:
Q = (A · B) / POLY = (((A · B) >> 16) · IPOLY) >> 15
The process uses a 16 bit by 16 bit carryless multiply, producing a 31 bit product.
POLY = 0x11021u ; CRC polynomial (17 bit)
IPOLY = 0x08898u ; 2^31 / POLY
; generated by external software
MpyModPoly(A, B)
{
MP = A · B ; MP = A · B
MT = MP >> 16 ; MT = MP >> 16
MT = MT · IPOLY ; MT = MT · IPOLY
MT = MT >> 15 ; MT = (A · B) / POLY
MT = MT · POLY ; MT = ((A · B) / POLY) * POLY
return MP xor MT ; (A·B) ^ (((A · B) / POLY) · POLY)
}
A hardware based carryless multiply would look something like this 4 bit · 4 bit example.
p[] = [a3 a2 a1 a0] · [b3 b2 b1 b0]
p[] is a 7 bit product generated with 7 parallel circuits.
The time for multiply would be worst case propagation time for p3.
p6 = a3&b3
p5 = a3&b2 ^ a2&b3
p4 = a3&b1 ^ a2&b2 ^ a1&b3
p3 = a3&b0 ^ a2&b1 ^ a1&b2 ^ a0&b3
p2 = a2&b0 ^ a1&b1 ^ a0&b2
p1 = a1&b0 ^ a0&b1
p0 = a0&b0
If the xor gates available only have 2 bit inputs, the logic can
be split up. For example:
p3 = (a3&b0 ^ a2&b1) ^ (a1&b2 ^ a0&b3)
I don't know if your VHDL toolset includes a library for carryless multiply. For a 16 bit by 16 bit multiply resulting in a 31 bit product (p30 to p00), p15 has 16 outputs from the 16 ands (in parallel), which could be xor'ed using a tree like structure, 8 xors in parallel feeding into 4 xors in parallel feeding into 2 xor's in parallel into a single xor. So the propagation time would be 1 and and 4 xor propagation times.
Here is an example in C that you can adapt. Since you mentioned VHDL, this is a bit-wise implementation suitable for casting into gates and flip-flops. However, if cycles are more precious to you than memory and gates, then there is also a byte-wise table-driven version that would run in 1/8 the number of cycles.
What this does is the inverse of what is done in a normal CRC calculation. It then applies the same size input in zeros with a normal CRC to get what the normal CRC would have been on that input. Running the zeros through takes the same number of cycles as the inverse CRC, i.e. O(n) where n is the size of the input. If that latency is too large, that can be reduced to O(log n) cycles, with some investment in gates.
#include <stddef.h>
// Update crc with the CRC-16/XMODEM of n zero bytes. (This can be done in
// O(log n) time or cycles instead of O(n), with a little more effort.)
static unsigned crc16x_zeros_bit(unsigned crc, size_t n) {
for (size_t i = 0; i < n; i++)
for (int k = 0; k < 8; k++)
crc = crc & 0x8000 ? (crc << 1) ^ 0x1021 : crc << 1;
return crc & 0xffff;
}
// Update crc with the CRC-16/XMODEM of the len bytes at mem in reverse. If mem
// is NULL, then return the initial value for the CRC. When done,
// crc16x_zeros_bit() must be used to apply the total length of zero bytes, in
// order to get what the CRC would have been if it were calculated on the bytes
// fed in the opposite order.
static unsigned crc16x_inverse_bit(unsigned crc, void const *mem, size_t len) {
unsigned char const *data = mem;
if (data == NULL)
return 0;
crc &= 0xffff;
for (size_t i = 0; i < len; i++) {
for (int k = 0; k < 8; k++)
crc = crc & 1 ? (crc >> 1) ^ 0x8810 : crc >> 1;
crc ^= (unsigned)data[i] << 8;
}
return crc;
}
#include <stdio.h>
int main(void) {
// Do framed example.
unsigned crc = crc16x_inverse_bit(0, NULL, 0);
crc = crc16x_inverse_bit(crc, (void const *)"9876543", 7);
crc = crc16x_inverse_bit(crc, (void const *)"21", 2);
crc = crc16x_zeros_bit(crc, 9);
printf("%04x\n", crc);
// Do another one.
crc = crc16x_inverse_bit(0, NULL, 0);
crc = crc16x_inverse_bit(crc, (void const *)"9876543", 7);
crc = crc16x_inverse_bit(crc, (void const *)"21!\x83\x04\xfc\x1c", 7);
crc = crc16x_inverse_bit(crc, (void const *)"\x9a" "fvC", 4);
crc = crc16x_zeros_bit(crc, 18);
printf("%04x\n", crc);
return 0;
}
Here is the O(log n) version of crc16x_zeros_bit():
// Return a(x) multiplied by b(x) modulo p(x), where p(x) is the CRC
// polynomial. For speed, a cannot be zero.
static inline unsigned multmodp(unsigned a, unsigned b) {
unsigned p = 0;
for (;;) {
if (a & 1) {
p ^= b;
if (a == 1)
break;
}
a >>= 1;
b = b & 0x8000 ? (b << 1) ^ 0x1021 : b << 1;
}
return p & 0xffff;
}
// Return x^(8n) modulo p(x).
static unsigned x2nmodp(size_t n) {
unsigned p = 1; // x^0 == 1
unsigned q = 0x10; // x^2^2
while (n) {
q = multmodp(q, q); // x^2^k mod p(x), k = 3,4,...
if (n & 1)
p = multmodp(q, p);
n >>= 1;
}
return p;
}
// Update crc with the CRC-16/XMODEM of n zero bytes.
static unsigned crc16x_zeros_bit(unsigned crc, size_t n) {
return multmodp(x2nmodp(n), crc);
}

Eigen FFT library

I am trying to use Eigen unsupported FFT library using FFTW backend. Specifically I am want to do a 2D FFT. Here's my code :
void fft2(Eigen::MatrixXf * matIn,Eigen::MatrixXcf * matOut)
{
const int nRows = matIn->rows();
const int nCols = matIn->cols();
Eigen::FFT< float > fft;
for (int k = 0; k < nRows; ++k) {
Eigen::VectorXcf tmpOut(nRows);
fft.fwd(tmpOut, matIn->row(k));
matOut->row(k) = tmpOut;
}
for (int k = 0; k < nCols; ++k) {
Eigen::VectorXcf tmpOut(nCols);
fft.fwd(tmpOut, matOut->col(k));
matOut->col(k) = tmpOut;
}
}
I have 2 problems :
First, I get a segmentation fault when using this code on some matrix. This error doesn't happen for all matrixes. I guess it's related to an alignment error. I use the functions in the following way :
Eigen::MatrixXcf matFFT(mat.rows(),mat.cols());
fft2(&matFloat,&matFFT);
where mat can be any matrix. Funnily, the code plants only when I compute the FFT over the 2nd dimension, never on the first one. This doesn't happen with kissFFT backend.
Second I don't get the same result as Matlab (that uses FFTW), when the function works. Eg :
Input Matrix :
[2, 1, 2]
[3, 2, 1]
[1, 2, 3]
Eigen gives :
[ (0,5), (0.5,0.86603), (0,0.5)]
[ (-4.3301,-2.5), (-1,-1.7321), (0.31699,-1.549)]
[ (-1.5,-0.86603), (2,3.4641), (2,3.4641)]
Matlab gives :
17 + 0i 0.5 + 0.86603i 0.5 - 0.86603i
-1 + 0i -1 - 1.7321i 2 - 3.4641i
-1 + 0i 2 + 3.4641i -1 + 1.7321i
Only the central part is the same.
Any help would be welcome.
I failed to activate EIGEN_FFTW_DEFAULT in my first solution, activating it reveals an error in the fftw-support implementation of Eigen. The following works:
#define EIGEN_FFTW_DEFAULT
#include <iostream>
#include <unsupported/Eigen/FFT>
int main(int argc, char *argv[])
{
Eigen::MatrixXf A(3,3);
A << 2,1,2, 3,2,1, 1,2,3;
const int nRows = A.rows();
const int nCols = A.cols();
std::cout << A << "\n\n";
Eigen::MatrixXcf B(3,3);
Eigen::FFT< float > fft;
for (int k = 0; k < nRows; ++k) {
Eigen::VectorXcf tmpOut(nRows);
fft.fwd(tmpOut, A.row(k));
B.row(k) = tmpOut;
}
std::cout << B << "\n\n";
Eigen::FFT< float > fft2; // Workaround: Using the same FFT object for a real and a complex FFT seems not to work with FFTW
for (int k = 0; k < nCols; ++k) {
Eigen::VectorXcf tmpOut(nCols);
fft2.fwd(tmpOut, B.col(k));
B.col(k) = tmpOut;
}
std::cout << B << '\n';
}
I get this output:
2 1 2
3 2 1
1 2 3
(17,0) (0.5,0.866025) (0.5,-0.866025)
(-1,0) (-1,-1.73205) (2,-3.4641)
(-1,0) (2,3.4641) (-1,1.73205)
Which is the same as your Matlab result.
N.B.: FFTW seems to support 2D real->complex FFT natively (without using individual FFTs). This is likely more efficient.
fftwf_plan fftwf_plan_dft_r2c_2d(int n0, int n1,
float *in, fftwf_complex *out, unsigned flags);

nth Catalan number using Combinations

I have written a c++ program to find n-th catalan number by using combinations but I am always getting output 0. Please point out mistakes in this code:
#include <iostream>
using namespace std;
int fact(unsigned int x)
{
unsigned long long f = 1;
for (int i = 1; i <= x; i++)
{
f = f*i;
}
return f;
}
int comb(int y, int z)
{
unsigned long long int c;
c = fact(y) / (fact(z)*fact(y - z));
return c;
}
int catalan(int b)
{
unsigned long long int a;
a = (1 / (b + 1))*(comb((2 * b), b));
return a;
}
int main()
{
int n;
cout << "enter value of n for nth catalan number=";
cin >> n;
cout << n << " Catalan number=" << catalan(n) << endl;
return 0;
}
(1 / (b + 1)) is always going to be zero. Instead use
a = comb(2 * b, b) / (b + 1);
Also, you do your calculations using unsigned long long. Why not use that as return type instead of int.

Can Montgomery multiplication be used to speed up the computation of (large number)! % (some prime)

This question originates in a comment I almost wrote below this question, where Zack is computing the factorial of a large number modulo a large number (that we will assume to be prime for the sake of this question). Zack is using the traditional computation of factorial, taking the remainder at each multiplication.
I almost commented that an alternative to consider was Montgomery multiplication, but thinking more about it, I have only seen this technique used to speed up several multiplications by the same multiplicand (in particular, to speed up the computation of an mod p).
My question is: can Montgomery multiplication be used to speed up the computation of n! mod p for large n and p?
Naively, no; you need to transform each of the n terms of the product into the "Montgomery space", so you have n full reductions mod m, the same as the "usual" algorithm.
However, a factorial isn't just an arbitrary product of n terms; it's much more structured. In particular, if you already have the "Montgomerized" kr mod m, then you can use a very cheap reduction to get (k+1)r mod m.
So this is perfectly feasible, though I haven't seen it done before. I went ahead and wrote a quick-and-dirty implementation (very untested, I wouldn't trust it very far at all):
// returns m^-1 mod 2**64 via clever 2-adic arithmetic (http://arxiv.org/pdf/1209.6626.pdf)
uint64_t inverse(uint64_t m) {
assert(m % 2 == 1);
uint64_t minv = 2 - m;
uint64_t m_1 = m - 1;
for (int i=1; i<6; i+=1) { m_1 *= m_1; minv *= (1 + m_1); }
return minv;
}
uint64_t montgomery_reduce(__uint128_t x, uint64_t minv, uint64_t m) {
return x + (__uint128_t)((uint64_t)x*-minv)*m >> 64;
}
uint64_t montgomery_multiply(uint64_t x, uint64_t y, uint64_t minv, uint64_t m) {
return montgomery_reduce(full_product(x, y), minv, m);
}
uint64_t montgomery_factorial(uint64_t x, uint64_t m) {
assert(x < m && m % 2 == 1);
uint64_t minv = inverse(m); // m^-1 mod 2**64
uint64_t r_mod_m = -m % m; // 2**64 mod m
uint64_t mont_term = r_mod_m;
uint64_t mont_result = r_mod_m;
for (uint64_t k=2; k<=x; k++) {
// Compute the montgomerized product term: kr mod m = (k-1)r + r mod m.
mont_term += r_mod_m;
if (mont_term >= m) mont_term -= m;
// Update the result by multiplying in the new term.
mont_result = montgomery_multiply(mont_result, mont_term, minv, m);
}
// Final reduction
return montgomery_reduce(mont_result, minv, m);
}
and benchmarked it against the usual implementation:
__uint128_t full_product(uint64_t x, uint64_t y) {
return (__uint128_t)x*y;
}
uint64_t naive_factorial(uint64_t x, uint64_t m) {
assert(x < m);
uint64_t result = x ? x : 1;
while (x --> 2) result = full_product(result,x) % m;
return result;
}
and against the usual implementation with some inline asm to fix a minor inefficiency:
uint64_t x86_asm_factorial(uint64_t x, uint64_t m) {
assert(x < m);
uint64_t result = x ? x : 1;
while (x --> 2) {
__asm__("mov %[result], %%rax; mul %[x]; div %[m]"
: [result] "+d" (result) : [x] "r" (x), [m] "r" (m) : "%rax", "flags");
}
return result;
}
Results were as follows on my Haswell laptop for reasonably large x:
implementation speedup
---------------------------
naive 1.00x
x86_asm 1.76x
montgomery 5.68x
So this really does seem to be a pretty nice win. The codegen for the Montgomery implementation is pretty decent, but could probably be improved somewhat further with hand-written assembly as well.
This is an interesting approach for "modest" x and m. Once x gets large, the various approaches that have sub-linear complexity in x will necessarily win out; factorial has so much structure that this method doesn't take advantage of.

CRC Calculation Of A Mostly Static Data Stream

Background:
I have a section of memory, 1024 bytes. The last 1020 bytes will always be the same. The first 4 bytes will change (serial number of a product). I need to calculate the CRC-16 CCITT (0xFFFF starting, 0x1021 mask) for the entire section of memory, CRC_WHOLE.
Question:
Is it possible to calculate the CRC for only the first 4 bytes, CRC_A, then apply a function such as the one below to calculate the full CRC? We can assume that the checksum for the last 1020 bytes, CRC_B, is already known.
CRC_WHOLE = XOR(CRC_A, CRC_B)
I know that this formula does not work (tried it), but I am hoping that something similar exists.
Yes. You can see how in zlib's crc32_combine(). If you have two sequences A and B, then the pure CRC of AB is the exclusive-or of the CRC of A0 and the CRC of 0B, where the 0's represent a series of zero bytes with the length of the corresponding sequence, i.e. B and A respectively.
For your application, you can pre-compute a single operator that applies 1020 zeros to the CRC of your first four bytes very rapidly. Then you can exclusive-or that with the pre-computed CRC of the 1020 bytes.
Update:
Here is a post of mine from 2008 with a detailed explanation that #ArtemB discovered (that I had forgotten about):
crc32_combine() in zlib is based on two key tricks. For what follows,
we set aside the fact that the standard 32-bit CRC is pre and post-
conditioned. We can deal with that later. Assume for now a CRC that
has no such conditioning, and so starts with the register filled with
zeros.
Trick #1: CRCs are linear. So if you have stream X and stream Y of
the same length and exclusive-or the two streams bit-by-bit to get Z,
i.e. Z = X ^ Y (using the C notation for exclusive-or), then CRC(Z) =
CRC(X) ^ CRC(Y). For the problem at hand we have two streams A and B
of differing length that we want to concatenate into stream Z. What
we have available are CRC(A) and CRC(B). What we want is a quick way
to compute CRC(Z). The trick is to construct X = A concatenated with
length(B) zero bits, and Y = length(A) zero bits concatenated with B.
So if we represent concatenation simply by juxtaposition of the
symbols, X = A0, Y = 0B, then X^Y = Z = AB. Then we have CRC(Z) =
CRC(A0) ^ CRC(0B).
Now we need to know CRC(A0) and CRC(0B). CRC(0B) is easy. If we feed
a bunch of zeros to the CRC machine starting with zero, the register
is still filled with zeros. So it's as if we did nothing at all.
Therefore CRC(0B) = CRC(B).
CRC(A0) requires more work however. Taking a non-zero CRC and feeding
zeros to the CRC machine doesn't leave it alone. Every zero changes
the register contents. So to get CRC(A0), we need to set the register
to CRC(A), and then run length(B) zeros through it. Then we can
exclusive-or the result of that with CRC(B) = CRC(0B), and we get what
we want, which is CRC(Z) = CRC(AB). Voila!
Well, actually the voila is premature. I wasn't at all satisfied with
that answer. I didn't want a calculation that took a time
proportional to the length of B. That wouldn't save any time compared
to simply setting the register to CRC(A) and running the B stream
through. I figured there must be a faster way to compute the effect
of feeding n zeros into the CRC machine (where n = length(B)). So
that leads us to:
Trick #2: The CRC machine is a linear state machine. If we know the
linear transformation that occurs when we feed a zero to the machine,
then we can do operations on that transformation to more efficiently
find the transformation that results from feeding n zeros into the
machine.
The transformation of feeding a single zero bit into the CRC machine
is completely represented by a 32x32 binary matrix. To apply the
transformation we multiply the matrix by the register, taking the
register as a 32 bit column vector. For the matrix multiplication in
binary (i.e. over the Galois Field of 2), the role of multiplication
is played by and'ing, and the role of addition is played by exclusive-
or'ing.
There are a few different ways to construct the magic matrix that
represents the transformation caused by feeding the CRC machine a
single zero bit. One way is to observe that each column of the matrix
is what you get when your register starts off with a single one in
it. So the first column is what you get when the register is 100...
and then feed a zero, the second column comes from starting with
0100..., etc. (Those are referred to as basis vectors.) You can see
this simply by doing the matrix multiplication with those vectors.
The matrix multiplication selects the column of the matrix
corresponding to the location of the single one.
Now for the trick. Once we have the magic matrix, we can set aside
the initial register contents for a while, and instead use the
transformation for one zero to compute the transformation for n
zeros. We could just multiply n copies of the matrix together to get
the matrix for n zeros. But that's even worse than just running the n
zeros through the machine. However there's an easy way to avoid most
of those matrix multiplications to get the same answer. Suppose we
want to know the transformation for running eight zero bits, or one
byte through. Let's call the magic matrix that represents running one
zero through: M. We could do seven matrix multiplications to get R =
MxMxMxMxMxMxMxM. Instead, let's start with MxM and call that P. Then
PxP is MxMxMxM. Let's call that Q. Then QxQ is R. So now we've
reduced the seven multiplications to three. P = MxM, Q = PxP, and R =
QxQ.
Now I'm sure you get the idea for an arbitrary n number of zeros. We
can very rapidly generate transformation matrices Mk, where Mk is the
transformation for running 2k zeros through. (In the
paragraph above M3 is R.) We can make M1 through Mk with only k
matrix multiplications, starting with M0 = M. k only has to be as
large as the number of bits in the binary representation of n. We can
then pick those matrices where there are ones in the binary
representation of n and multiply them together to get the
transformation of running n zeros through the CRC machine. So if n =
13, compute M0 x M2 x M3.
If j is the number of one's in the binary representation of n, then we
just have j - 1 more matrix multiplications. So we have a total of k
j - 1 matrix multiplications, where j <= k = floor(logbase2(n)).
Now we take our rapidly constructed matrix for n zeros, and multiply
that by CRC(A) to get CRC(A0). We can compute CRC(A0) in O(log(n))
time, instead of O(n) time. We exclusive or that with CRC(B) and
Voila! (really this time), we have CRC(Z).
That's what zlib's crc32_combine() does.
I will leave it as an exercise for the reader as to how to deal with
the pre and post conditioning of the CRC register. You just need to
apply the linearity observations above. Hint: You don't need to know
length(A). In fact crc32_combine() only takes three arguments:
CRC(A), CRC(B), and length(B) (in bytes).
Below is example C code for an alternative approach for CRC(A0). Rather than working with a matrix, a CRC can be cycled forward n bits by muliplying (CRC · ((2^n)%POLY)%POLY . So the repeated squaring is performed on an integer rather than a matrix. If n is constant, then (2^n)%POLY can be pre-computed.
/* crcpad.c - crc - data has a large number of trailing zeroes */
#include <stdio.h>
#include <stdlib.h>
typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
#define POLY (0x04c11db7u)
static uint32_t crctbl[256];
void GenTbl(void) /* generate crc table */
{
uint32_t crc;
uint32_t c;
uint32_t i;
for(c = 0; c < 0x100; c++){
crc = c<<24;
for(i = 0; i < 8; i++)
/* assumes twos complement */
crc = (crc<<1)^((0-(crc>>31))&POLY);
crctbl[c] = crc;
}
}
uint32_t GenCrc(uint8_t * bfr, size_t size) /* generate crc */
{
uint32_t crc = 0u;
while(size--)
crc = (crc<<8)^crctbl[(crc>>24)^*bfr++];
return(crc);
}
/* carryless multiply modulo crc */
uint32_t MpyModCrc(uint32_t a, uint32_t b) /* (a*b)%crc */
{
uint32_t pd = 0;
uint32_t i;
for(i = 0; i < 32; i++){
/* assumes twos complement */
pd = (pd<<1)^((0-(pd>>31))&POLY);
pd ^= (0-(b>>31))&a;
b <<= 1;
}
return pd;
}
/* exponentiate by repeated squaring modulo crc */
uint32_t PowModCrc(uint32_t p) /* pow(2,p)%crc */
{
uint32_t prd = 0x1u; /* current product */
uint32_t sqr = 0x2u; /* current square */
while(p){
if(p&1)
prd = MpyModCrc(prd, sqr);
sqr = MpyModCrc(sqr, sqr);
p >>= 1;
}
return prd;
}
/* # data bytes */
#define DAT ( 32)
/* # zero bytes */
#define PAD (992)
/* DATA+PAD */
#define CNT (1024)
int main()
{
uint32_t pmc;
uint32_t crc;
uint32_t crf;
uint32_t i;
uint8_t *msg = malloc(CNT);
for(i = 0; i < DAT; i++) /* generate msg */
msg[i] = (uint8_t)rand();
for( ; i < CNT; i++)
msg[i] = 0;
GenTbl(); /* generate crc table */
crc = GenCrc(msg, CNT); /* generate crc normally */
crf = GenCrc(msg, DAT); /* generate crc for data */
pmc = PowModCrc(PAD*8); /* pmc = pow(2,PAD*8)%crc */
crf = MpyModCrc(crf, pmc); /* crf = (crf*pmc)%crc */
printf("%08x %08x\n", crc, crf);
free(msg);
return 0;
}
Example C code using intrinsic for carryless multiply, pclmulqdq == _mm_clmulepi64_si128:
/* crcpadm.c - crc - data has a large number of trailing zeroes */
/* pclmulqdq intrinsic version */
#include <stdio.h>
#include <stdlib.h>
#include <intrin.h>
typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
#define POLY (0x104c11db7ull)
#define POLYM ( 0x04c11db7u)
static uint32_t crctbl[256];
static __m128i poly; /* poly */
static __m128i invpoly; /* 2^64 / POLY */
void GenMPoly(void) /* generate __m12i8 poly info */
{
uint64_t N = 0x100000000ull;
uint64_t Q = 0;
for(size_t i = 0; i < 33; i++){
Q <<= 1;
if(N&0x100000000ull){
Q |= 1;
N ^= POLY;
}
N <<= 1;
}
poly.m128i_u64[0] = POLY;
invpoly.m128i_u64[0] = Q;
}
void GenTbl(void) /* generate crc table */
{
uint32_t crc;
uint32_t c;
uint32_t i;
for(c = 0; c < 0x100; c++){
crc = c<<24;
for(i = 0; i < 8; i++)
/* assumes twos complement */
crc = (crc<<1)^((0-(crc>>31))&POLYM);
crctbl[c] = crc;
}
}
uint32_t GenCrc(uint8_t * bfr, size_t size) /* generate crc */
{
uint32_t crc = 0u;
while(size--)
crc = (crc<<8)^crctbl[(crc>>24)^*bfr++];
return(crc);
}
/* carryless multiply modulo crc */
uint32_t MpyModCrc(uint32_t a, uint32_t b) /* (a*b)%crc */
{
__m128i ma, mb, mp, mt;
ma.m128i_u64[0] = a;
mb.m128i_u64[0] = b;
mp = _mm_clmulepi64_si128(ma, mb, 0x00); /* p[0] = a*b */
mt = _mm_clmulepi64_si128(mp, invpoly, 0x00); /* t[1] = (p[0]*((2^64)/POLY))>>64 */
mt = _mm_clmulepi64_si128(mt, poly, 0x01); /* t[0] = t[1]*POLY */
return mp.m128i_u32[0] ^ mt.m128i_u32[0]; /* ret = p[0] ^ t[0] */
}
/* exponentiate by repeated squaring modulo crc */
uint32_t PowModCrc(uint32_t p) /* pow(2,p)%crc */
{
uint32_t prd = 0x1u; /* current product */
uint32_t sqr = 0x2u; /* current square */
while(p){
if(p&1)
prd = MpyModCrc(prd, sqr);
sqr = MpyModCrc(sqr, sqr);
p >>= 1;
}
return prd;
}
/* # data bytes */
#define DAT ( 32)
/* # zero bytes */
#define PAD (992)
/* DATA+PAD */
#define CNT (1024)
int main()
{
uint32_t pmc;
uint32_t crc;
uint32_t crf;
uint32_t i;
uint8_t *msg = malloc(CNT);
GenMPoly(); /* generate __m128 polys */
GenTbl(); /* generate crc table */
for(i = 0; i < DAT; i++) /* generate msg */
msg[i] = (uint8_t)rand();
for( ; i < CNT; i++)
msg[i] = 0;
crc = GenCrc(msg, CNT); /* generate crc normally */
crf = GenCrc(msg, DAT); /* generate crc for data */
pmc = PowModCrc(PAD*8); /* pmc = pow(2,PAD*8)%crc */
crf = MpyModCrc(crf, pmc); /* crf = (crf*pmc)%crc */
printf("%08x %08x\n", crc, crf);
free(msg);
return 0;
}

Resources