CLANG is ignoring AVX2 intrinsics in this code - clang

I'm testing LLVM's ability to vectorize some code in https://rust.godbolt.org/
Options : -mavx2 -ffast-math -fno-math-errno -O3
Compiler LLVM 13, but any LLVM actually does the same thing.
#include <immintrin.h>
/// Four-lane vector whose lanes are stored as the consecutive members
/// A, B, C and D.
///
/// operator+= is implemented with SSE intrinsics that take float pointers,
/// so it can only be instantiated for T = float. The other members work for
/// any arithmetic T.
template<class T>
struct V4
{
    T A, B, C, D;

    /// Leaves all four lanes uninitialized.
    V4() { }

    /// Broadcasts x into every lane.
    V4(T x) : A(x), B(x), C(x), D(x) { }

    /// Sets the four lanes individually.
    V4(T a, T b, T c, T d) : A(a), B(b), C(c), D(d) { }

    /// Lane-wise addition: A += x.A, B += x.B, C += x.C, D += x.D.
    void operator +=(const V4& x)
    {
        // Four floats are exactly 128 bits, so the 128-bit (__m128) forms are
        // the right width. A 256-bit load/store would touch eight floats,
        // i.e. 16 bytes beyond the end of each object, and _mm256_store_ps
        // additionally requires 32-byte alignment that this struct does not
        // provide. The unaligned variants are used because V4 is only
        // float-aligned.
        const __m128 lhs = _mm_loadu_ps(&A);
        const __m128 rhs = _mm_loadu_ps(&x.A);
        _mm_storeu_ps(&A, _mm_add_ps(lhs, rhs));
    }

    /// Horizontal sum of the four lanes, evaluated left to right.
    T GetSum() const { return A + B + C + D; }
};
typedef V4<float> V4F;
double FN(float f[4], float g[4], int cnt)
{
V4F vec1(f[0], f[1], f[2], f[3]), vec2(g[0], g[1], g[2], g[3]);
for (int i=0; i<cnt; i++)
vec1 += vec2;
return vec1.GetSum();
};
This is the resulting disassembly:
FN(float*, float*, int): # #FN(float*, float*, int)
vmovddup xmm0, qword ptr [rdi + 8] # xmm0 = mem[0,0]
vaddps xmm0, xmm0, xmmword ptr [rdi]
vmovshdup xmm1, xmm0 # xmm1 = xmm0[1,1,3,3]
vaddss xmm0, xmm0, xmm1
vcvtss2sd xmm0, xmm0, xmm0
ret
So it is completely ignoring the intrinsics. If I uncomment that part that should be doing the same thing in C++, a really long code appears, so it apparently starts understanding it.
Am I missing something or is this a bug in LLVM?

Related

X86_64 call convention issue

As we know, x86-64 uses the registers rdi, rsi, rdx, rcx, r8 and r9 to pass ordinary (integer and pointer) function arguments, passes large arguments on the stack, and uses the xmm registers for float and double arguments. But in my code, the function 'myuprobe_sum_dww_ptr' behaves weirdly: it does not use rdi for the first argument, but for a local variable. Please see the code below; I have noted the register usage in comments. Could anyone help explain this?
#include <stdlib.h>
#include <unistd.h>
#include <stdio.h>
#include <stdint.h>
// Plain wrapper around a single double.
struct double_wraper
{
    double data;
};

// Aggregate that owns a heap-allocated double_wraper through dp.
//
// The destructor deletes dp, so copies must not share the pointer: the copy
// constructor and copy assignment below duplicate the pointee instead of the
// pointer. Without them, any copy (for example a by-value return that is not
// elided) would delete the same allocation twice.
//
// NOTE(review): because the destructor and copy constructor are user-provided,
// the Itanium C++ ABI used on x86-64 SysV returns objects of this type through
// a caller-supplied buffer whose address is passed as a hidden first argument.
struct double_wraper_wraper
{
    // a and d start at zero; dp points at a zero-initialized double_wraper.
    double_wraper_wraper()
        : a(0), d(), dp(new double_wraper())
    {
    }

    // Deep copy: the new object gets its own allocation.
    double_wraper_wraper(const double_wraper_wraper& other)
        : a(other.a), d(other.d), dp(new double_wraper(*other.dp))
    {
    }

    // Deep assignment: reuses this object's allocation and copies the value.
    double_wraper_wraper& operator=(const double_wraper_wraper& other)
    {
        if (this != &other)
        {
            a = other.a;
            d = other.d;
            *dp = *other.dp;
        }
        return *this;
    }

    ~double_wraper_wraper()
    {
        delete dp;
    }

    uint32_t a;
    struct double_wraper d;
    struct double_wraper* dp;
};
// Returns a + b in 32-bit unsigned arithmetic (the result wraps modulo 2^32).
// Register mapping noted by the author: a -> rdi, b -> rsi.
uint32_t myuprobe_sum_int(uint32_t a, uint32_t b)
{
    const uint32_t sum = a + b;
    return sum;
}
// Adds two wrappers field by field (plus v on the integer field), prints the
// operands and result of the heap-held doubles together with the address d1,
// and returns the combined object by value.
//
// Register mapping noted by the author: d1 -> rsi, d2 -> rdx, v -> rcx, with
// rdi referring to the returned object.
// NOTE(review): this matches the x86-64 SysV convention of returning class
// types that have a non-trivial destructor in caller-provided memory whose
// address is passed as a hidden first argument; confirm against the ABI text.
double_wraper_wraper myuprobe_sum_dww_ptr(const double_wraper_wraper* d1, const double_wraper_wraper* d2, uint32_t v)
{
    double_wraper_wraper result;

    result.a = d1->a + d2->a + v;
    result.d.data = d1->d.data + d2->d.data;

    const double lhs = d1->dp->data;
    const double rhs = d2->dp->data;
    result.dp->data = lhs + rhs;

    printf("%lf + %lf = %lf, addr = %p\n", lhs, rhs, result.dp->data, d1);
    return result;
}
// Returns a double_wraper holding d1->data + d2->data + d3->d.data.
// The parameter v is accepted but not used.
// Register mapping noted by the author: d1 -> rdi, d2 -> rsi, d3 -> rdx, v -> rcx.
double_wraper myuprobe_sum_dw_ptr(const double_wraper* d1, const double_wraper* d2, const double_wraper_wraper* d3, uint32_t v)
{
    double_wraper total;
    const double first_two = d1->data + d2->data;
    total.data = first_two + d3->d.data;
    return total;
}
int main()
{
while(1) {
double_wraper_wraper d4, d5;
d4.a = rand();
d4.d.data = rand() + (double)rand() / RAND_MAX;
d4.dp->data = rand() + (double)rand() / RAND_MAX;
d5.a = rand();
d5.d.data = rand() + (double)rand() / RAND_MAX;
d5.dp->data = rand() + (double)rand() / RAND_MAX;
auto d7 = myuprobe_sum_dww_ptr(&d4, &d5, 100);
uint32_t a,b;
a = rand();
b = rand();
auto c = myuprobe_sum_int(a, b);
double_wraper d8,d9;
d8.data = rand() + (double)rand() / RAND_MAX;
d9.data = rand() + (double)rand() / RAND_MAX;
auto d10 = myuprobe_sum_dw_ptr(&d8, &d9, &d4, 100);
usleep(5000000);
}
}
Compile: g++ test.cpp -o test -g -O0

How to Calculate CRC Starting at Last Byte

I'm trying to implement a CRC-CCITT calculator in VHDL. I was able to initially do that; however, I recently found out that data is delivered starting at the least-significant byte. In my code, data is transmitted 7 bytes at a time through a frame. So let's say we have the following data: 123456789 in ASCII or 313233343536373839 in hex. The data would be transmitted as such (with the following CRC):
-- First frame of data
RxFrame.Data <= (
1 => x"39", -- LSB
2 => x"38",
3 => x"37",
4 => x"36",
5 => x"35",
6 => x"34",
7 => x"33"
);
-- Second/last frame of data
RxFrame.Data <= (
1 => x"32",
2 => x"31", -- MSB
3 => xx, -- "xx" means irrelevant data, not part of CRC calculation.
4 => xx, -- This occurs only in the last frame, when it specified in
5 => xx, -- byte 0 which bytes contain data
6 => xx,
7 => xx
);
-- Calculated CRC should be 0x31C3
Another example with data 0x4376669A1CFC048321313233343536373839 and its correct CRC is shown below:
-- First incoming frame of data
RxFrame.Data <= (
1 => x"39", -- LSB
2 => x"38",
3 => x"37",
4 => x"36",
5 => x"35",
6 => x"34",
7 => x"33"
);
-- Second incoming frame of data
RxFrame.Data <= (
1 => x"32",
2 => x"31",
3 => x"21",
4 => x"83",
5 => x"04",
6 => x"FC",
7 => x"1C"
);
-- Third/last incoming frame of data
RxFrame.Data <= (
1 => x"9A",
2 => x"66",
3 => x"76",
4 => x"43", -- MSB
5 => xx, -- Irrelevant data, specified in byte 0
6 => xx,
7 => xx
);
-- Calculated CRC should be 0x2848
Is there a concept I'm missing? Is there a way to calculate the CRC with the data being received in reverse order? I am implementing this for CANopen SDO block protocols. Thanks!
CRC calculation algorithm to verify SDO block transfer from CANopen standard
Example code to generate a CRC16 with the bytes read in reverse (last byte first), using a function to do a carryless multiply modulo the CRC polynomial. An explanation follows.
#include <stdio.h>
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
#define POLY (0x1021u)
/* Carryless (XOR-based) multiply of a by b, reduced modulo the CRC
   polynomial, i.e. (a*b)%poly. The low 16 bits of the polynomial are POLY;
   the x^16 term is implied by the bit that falls off the 16-bit accumulator.
   b is consumed from its most significant bit down to bit 0. */
uint16_t MpyModPoly(uint16_t a, uint16_t b)
{
    uint16_t acc = 0;
    int bit;
    for (bit = 15; bit >= 0; bit--) {
        /* multiply the accumulator by x, folding POLY in on overflow */
        const unsigned carry = acc >> 15;
        acc = (uint16_t)(acc << 1);
        if (carry)
            acc ^= POLY;
        /* add (xor) a when the current bit of b is set */
        if ((b >> bit) & 1u)
            acc ^= a;
    }
    return acc;
}
/* Generate the CRC of the sz bytes at b, consuming them from the last byte
   to the first. For every byte the padding multiplier is advanced by one
   byte position (multiplied by 0x100 modulo the polynomial) and the byte
   times that multiplier is xor'ed into the running CRC. Returns 0 when
   sz is 0. */
uint16_t Crc16R(uint8_t * b, size_t sz)
{
    uint16_t crc = 0u;          /* running crc */
    uint16_t pdm = 0x100u;      /* padding multiplier */
    size_t remaining;

    for (remaining = sz; remaining != 0; remaining--) {
        pdm = MpyModPoly(0x100, pdm);
        crc ^= MpyModPoly(b[remaining - 1], pdm);
    }
    return crc;
}
/* msg will be processed in reverse order */
static uint8_t msg[] = {0x43,0x76,0x66,0x9A,0x1C,0xFC,0x04,0x83,
0x21,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
0x38,0x39};
/* Prints the reverse-order CRC of msg as four lowercase hex digits.
   For this message the question states the expected CRC is 0x2848. */
int main()
{
    const uint16_t crc = Crc16R(msg, sizeof(msg));
    printf("%04x\n", crc);
    return 0;
}
Example code using X86 xmm pclmulqdq and psrlq, to emulate a 16 bit by 16 bit hardware (VHDL) carryless multiply:
/* __m128i is an intrinsic for X86 128 bit xmm register */
static __m128i poly = {.m128i_u32[0] = 0x00011021u}; /* poly */
static __m128i invpoly = {.m128i_u32[0] = 0x00008898u}; /* 2^31 / poly */
/* carryless multiply modulo crc polynomial */
/* using xmm pclmulqdq and psrlq */
/* Returns the low 16 bits of (a*b) ^ (q*poly), where q = (a*b)/poly is */
/* obtained without a division: the upper part of the product is multiplied */
/* by the precomputed constant invpoly and shifted right. */
/* NOTE(review): the .m128i_u64 / .m128i_u16 member accesses appear to rely */
/* on a compiler that defines __m128i as a union with those members - */
/* confirm the toolchain before porting. */
uint16_t MpyModPoly(uint16_t a, uint16_t b)
{
__m128i ma, mb, mp, mt;
/* only 64-bit element 0 of each operand is assigned; the 0x00 selector */
/* passed to _mm_clmulepi64_si128 below is expected to pick exactly those */
/* low halves - TODO confirm against the intrinsic's documentation */
ma.m128i_u64[0] = a;
mb.m128i_u64[0] = b;
mp = _mm_clmulepi64_si128(ma, mb, 0x00); /* mp = a*b */
mt = _mm_srli_epi64(mp, 16); /* mt = mp>>16 */
mt = _mm_clmulepi64_si128(mt, invpoly, 0x00); /* mt = mt*ipoly */
mt = _mm_srli_epi64(mt, 15); /* mt = mt>>15 = (a*b)/poly */
mt = _mm_clmulepi64_si128(mt, poly, 0x00); /* mt = mt*poly */
/* product xor (quotient*poly) leaves the remainder in the low 16 bits */
return mp.m128i_u16[0] ^ mt.m128i_u16[0]; /* ret mp^mt */
}
/* external code to generate invpoly */
#define POLY (0x11021u)
static __m128i invpoly; /* 2^31 / poly */
/* Generate the __m128i constant invpoly = 2^31 / POLY by borrowless (xor)
   long division: 16 quotient bits are produced, one per step, and the
   result is stored in 16-bit element 0 of invpoly. */
void GenMPoly(void)
{
    uint32_t rem = 0x10000u;            /* running numerator, starts at x^16 */
    uint32_t quot = 0;                  /* quotient built msb first */
    size_t step;

    for (step = 0; step < 16; step++) { /* steps 0..15: 31 - 16 = 15 */
        const uint32_t top = rem & 0x10000u;
        quot = (quot << 1) | (top ? 1u : 0u);
        if (top)
            rem ^= POLY;                /* subtract (xor) the divisor */
        rem <<= 1;
    }
    invpoly.m128i_u16[0] = quot;
}
Explanation: consider the data as separate strings of ever increasing length, padded with zeroes at the end. For the first few bytes of your example, the logic would calculate
CRC = CRC16({39})
CRC ^= CRC16({38 00})
CRC ^= CRC16({37 00 00})
CRC ^= CRC16({36 00 00 00})
...
To speed up this calculation, rather than actually pad with n zero bytes, you can do a carryless multiply of a CRC by 2^{n·8} modulo POLY, where POLY is the 17 bit polynomial used for CRC16 (the exponents in the listing below are written in hex, so 2^10 means 2^16 and 2^18 means 2^24):
CRC = CRC16({39})
CRC ^= (CRC16({38}) · (2^08 % POLY)) % POLY
CRC ^= (CRC16({37}) · (2^10 % POLY)) % POLY
CRC ^= (CRC16({36}) · (2^18 % POLY)) % POLY
...
A carryless multiply modulo POLY is equivalent to what CRC16 does, so this translates into pseudo code (all values in hex, 2^8 = 100)
CRC = 0
PDM = 100 ;padding multiplier
PDM = (100 · PDM) % POLY ;main loop (2 lines per byte)
CRC ^= ( 39 · PDM) % POLY
PDM = (100 · PDM) % POLY
CRC ^= ( 38 · PDM) % POLY
PDM = (100 · PDM) % POLY
CRC ^= ( 37 · PDM) % POLY
PDM = (100 · PDM) % POLY
CRC ^= ( 36 · PDM) % POLY
...
Implementing (A · B) % POLY is based on binary math:
(A · B) % POLY = (A · B) ^ (((A · B) / POLY) · POLY)
Where multiply is carryless (XOR instead of add) and divide is borrowless (XOR instead of subtract). Since the divide is borrowless, and most significant term of POLY is x^16, the quotient
Q = (A · B) / POLY
only depends on the upper 16 bits of (A · B). Dividing by POLY uses multiplication by the 16 bit constant IPOLY = (2^31)/POLY followed by a right shift:
Q = (A · B) / POLY = (((A · B) >> 16) · IPOLY) >> 15
The process uses a 16 bit by 16 bit carryless multiply, producing a 31 bit product.
POLY = 0x11021u ; CRC polynomial (17 bit)
IPOLY = 0x08898u ; 2^31 / POLY
; generated by external software
MpyModPoly(A, B)
{
MP = A · B ; MP = A · B
MT = MP >> 16 ; MT = MP >> 16
MT = MT · IPOLY ; MT = MT · IPOLY
MT = MT >> 15 ; MT = (A · B) / POLY
MT = MT · POLY ; MT = ((A · B) / POLY) * POLY
return MP xor MT ; (A·B) ^ (((A · B) / POLY) · POLY)
}
A hardware based carryless multiply would look something like this 4 bit · 4 bit example.
p[] = [a3 a2 a1 a0] · [b3 b2 b1 b0]
p[] is a 7 bit product generated with 7 parallel circuits.
The time for multiply would be worst case propagation time for p3.
p6 = a3&b3
p5 = a3&b2 ^ a2&b3
p4 = a3&b1 ^ a2&b2 ^ a1&b3
p3 = a3&b0 ^ a2&b1 ^ a1&b2 ^ a0&b3
p2 = a2&b0 ^ a1&b1 ^ a0&b2
p1 = a1&b0 ^ a0&b1
p0 = a0&b0
If the xor gates available only have 2 bit inputs, the logic can
be split up. For example:
p3 = (a3&b0 ^ a2&b1) ^ (a1&b2 ^ a0&b3)
I don't know if your VHDL toolset includes a library for carryless multiply. For a 16 bit by 16 bit multiply resulting in a 31 bit product (p30 to p00), p15 has 16 outputs from the 16 ands (in parallel), which could be xor'ed using a tree like structure, 8 xors in parallel feeding into 4 xors in parallel feeding into 2 xor's in parallel into a single xor. So the propagation time would be 1 and and 4 xor propagation times.
Here is an example in C that you can adapt. Since you mentioned VHDL, this is a bit-wise implementation suitable for casting into gates and flip-flops. However, if cycles are more precious to you than memory and gates, then there is also a byte-wise table-driven version that would run in 1/8 the number of cycles.
What this does is the inverse of what is done in a normal CRC calculation. It then applies the same size input in zeros with a normal CRC to get what the normal CRC would have been on that input. Running the zeros through takes the same number of cycles as the inverse CRC, i.e. O(n) where n is the size of the input. If that latency is too large, that can be reduced to O(log n) cycles, with some investment in gates.
#include <stddef.h>
// Feed n zero bytes through the CRC-16/XMODEM register, one bit at a time,
// and return the resulting 16-bit value. Bits of crc above bit 15 are ignored.
// (An O(log n) formulation exists; this one is O(n).)
static unsigned crc16x_zeros_bit(unsigned crc, size_t n) {
    while (n--) {
        int shifts = 8;
        do {
            // shift left by one; xor in the polynomial if bit 15 was set
            const unsigned feedback = (crc & 0x8000) ? 0x1021u : 0u;
            crc = (crc << 1) ^ feedback;
        } while (--shifts);
    }
    return crc & 0xffff;
}
// Update crc with the CRC-16/XMODEM of the len bytes at mem in reverse.
//
// For each byte the register is first stepped backwards by eight bits (the
// inverse of the normal shift-left-and-xor step; 0x8810 is the 17-bit
// polynomial 0x11021 shifted right by one), then the byte is xor'ed into the
// high byte of the register.
//
// Parameters:
//   crc - running value from a previous call; only the low 16 bits are used.
//   mem - bytes to process; if NULL, the initial CRC value (0) is returned
//         and crc/len are ignored.
//   len - number of bytes at mem.
//
// When all data has been fed in, crc16x_zeros_bit() must be applied with the
// total byte count in order to get what the CRC would have been if it were
// calculated on the bytes fed in the opposite order.
static unsigned crc16x_inverse_bit(unsigned crc, void const *mem, size_t len) {
    // Explicit cast so the function builds as C++ as well as C (C++ has no
    // implicit conversion from void const * to another object pointer type).
    unsigned char const *data = (unsigned char const *)mem;
    if (data == NULL)
        return 0;
    crc &= 0xffff;
    for (size_t i = 0; i < len; i++) {
        for (int k = 0; k < 8; k++)
            crc = crc & 1 ? (crc >> 1) ^ 0x8810 : crc >> 1;
        crc ^= (unsigned)data[i] << 8;
    }
    return crc;
}
#include <stdio.h>
// Demonstrates the reverse-order CRC on the two framed examples from the
// question: each frame is fed last-byte-first, then the total byte count is
// applied with crc16x_zeros_bit(). The question gives 0x31C3 and 0x2848 as
// the expected CRCs for the two data sets.
int main(void) {
    unsigned acc;

    // First example: two frames, 9 bytes in total.
    acc = crc16x_inverse_bit(0, NULL, 0);
    acc = crc16x_inverse_bit(acc, "9876543", 7);
    acc = crc16x_inverse_bit(acc, "21", 2);
    acc = crc16x_zeros_bit(acc, 9);
    printf("%04x\n", acc);

    // Second example: three frames, 18 bytes in total.
    acc = crc16x_inverse_bit(0, NULL, 0);
    acc = crc16x_inverse_bit(acc, "9876543", 7);
    acc = crc16x_inverse_bit(acc, "21!\x83\x04\xfc\x1c", 7);
    acc = crc16x_inverse_bit(acc, "\x9a" "fvC", 4);
    acc = crc16x_zeros_bit(acc, 18);
    printf("%04x\n", acc);

    return 0;
}
Here is the O(log n) version of crc16x_zeros_bit():
// Return a(x) * b(x) mod p(x), where p(x) is the CRC polynomial and the
// multiplication is carryless. a is consumed from its least significant bit
// upwards while b is repeatedly multiplied by x. The loop only terminates
// once a has been reduced to 1, so a must not be zero.
static inline unsigned multmodp(unsigned a, unsigned b) {
    unsigned acc = 0;
    for (;;) {
        if (a & 1)
            acc ^= b;
        if (a == 1)     // highest set bit of a has just been handled
            break;
        a >>= 1;
        // b *= x, folding in the polynomial when bit 15 was set
        b = (b << 1) ^ ((b & 0x8000) ? 0x1021u : 0u);
    }
    return acc & 0xffff;
}
// Return x^(8n) modulo p(x), by square-and-multiply over the bits of n.
static unsigned x2nmodp(size_t n) {
    unsigned result = 1;        // x^0 == 1
    unsigned power = 0x10;      // x^4; squared before first use, giving x^8
    for (; n != 0; n >>= 1) {
        power = multmodp(power, power);     // x^8, x^16, x^32, ...
        if (n & 1)
            result = multmodp(power, result);
    }
    return result;
}
// Update crc with the CRC-16/XMODEM of n zero bytes by multiplying it with
// x^(8n) modulo the CRC polynomial.
static unsigned crc16x_zeros_bit(unsigned crc, size_t n) {
    const unsigned zeros_factor = x2nmodp(n);
    return multmodp(zeros_factor, crc);
}

Testing memory bandwidth - Odd results

Hi everyone,
Was asking myself the other day how much different access patterns affected memory read speed (mostly thinking about the frequency vs bus size discussion, and the impact of cache hit rate), so made a small program to test memory speed doing sequential and fully random accesses, but the results I got are quite odd, so I'm not trusting my code.
My idea was quite straightforward, just loop on an array and mov the data to a register. Made 3 versions, one moves 128 bits at a time with sse, the other 32 , and the last one 32 again but doing two movs, the first one loading a random number from an array, and the second one reading from the position specified by the prev value.
I got ~40 GB/s for the sse version, that it's reasonable considering i'm using an i7 4790K with DDR3 1600 cl9 memory at dual channel, that gives about 25 GB/s, so add to that cache and it feels ok, but then I got 3.3 GB/s for the normal sequential, and the worst, 15 GB/s for the random one. That last result makes me think that the bench is bogus.
Below is the code, if anyone could shed some light on this it would be appreciated. Did the inner loop in assembly to make sure it only did a mov.
EDIT: Managed to get a bit more performance by using vlddqu ymm0, buffL[esi] (avx) instead of movlps, went from 38 GB/s to 41 GB/s
EDIT 2: Did some more testing, unrolling the inner assembly loop, making a version that loads 4 times per iteration and another one that loads 8 times. Got ~35 GB/s for the x4 version and ~24 GB/s for the x8 version
#define PASSES 1000000
double bw = 0;
// Runs three read benchmarks, each repeated PASSES times, and prints the mean
// of the per-pass bandwidth figures in GB/s (bytes / 2^30 per second):
//   1. "Sequential (SSE)": an xmm load every 16 bytes over a 1<<16-int array.
//   2. "Sequential": a chain of 32-bit loads over a 1<<16-int array.
//   3. "Random": an index load followed by a load at that index, 1<<14 times.
// Every pass is timed on its own with QueryPerformanceCounter and the
// per-pass bandwidths are summed into the global bw, then divided by PASSES.
int main()
{
// ---- Test 1: sequential reads through xmm0 ----
cout << "Running : ";
bw = 0;
for(int n = 0; n < PASSES;n++)
{
if(n % 100000 == 0) cout << ".";
const int l = 1 << 16;
// buffL is read by the loop below without ever being written in this test
int buffL[l];
LARGE_INTEGER frequency; // ticks per second
LARGE_INTEGER t1, t2; // ticks
// get ticks per second
QueryPerformanceFrequency(&frequency);
// start timer
QueryPerformanceCounter(&t1);
// size of buffL in bytes; used as the loop bound for the byte offset in esi
int maxByte = l*4;
// NOTE(review): movlps is documented as a 64-bit load, yet the offset
// advances by 16 and the bandwidth formula below counts all 4*l bytes -
// confirm whether a 128-bit load was intended.
__asm
{
push esi
mov esi,0
loopL0:
movlps xmm0, buffL[esi]
add esi,16
cmp esi,maxByte
jb loopL0
pop esi
}
// stop timer
QueryPerformanceCounter(&t2);
// compute elapsed time in millisec
double ms = (t2.QuadPart - t1.QuadPart) * 1000.0 / frequency.QuadPart;
// bytes in buffL expressed in GB, divided by the elapsed time in seconds
bw += (double(4ull*l)/1073741824.0) / (double(ms)*0.001);
}
bw /= double(PASSES);
cout << endl;
cout << " Sequential (SSE) : " << bw << " GB/s " << endl;
// ---- Test 2: sequential 32-bit reads ----
cout << "Running : ";
bw = 0;
for(int n = 0; n < PASSES;n++)
{
if(n % 100000 == 0) cout << ".";
const int l = 1 << 16;
int buffL[l];
// each element holds the byte offset of the element that follows it
for(int t = 0;t < l;t++) buffL[t] = (t+1)*4;
LARGE_INTEGER frequency; // ticks per second
LARGE_INTEGER t1, t2; // ticks
// get ticks per second
QueryPerformanceFrequency(&frequency);
// start timer
QueryPerformanceCounter(&t1);
int maxByte = l*4;
// The loaded value becomes the next offset, so every load's address depends
// on the result of the previous load.
// NOTE(review): such a dependent chain presumably measures load latency
// rather than bandwidth - confirm before comparing with the other tests.
__asm
{
push esi
mov esi,0
loopL1:
mov esi, buffL[esi]
cmp esi,maxByte
jb loopL1
pop esi
}
// stop timer
QueryPerformanceCounter(&t2);
// compute elapsed time in millisec
double ms = (t2.QuadPart - t1.QuadPart) * 1000.0 / frequency.QuadPart;
bw += (double(4ull*l)/1073741824.0) / (double(ms)*0.001);
}
bw /= double(PASSES);
cout << endl;
cout << " Sequential : " << bw << " GB/s " << endl;
// ---- Test 3: random 32-bit reads ----
cout << "Running : ";
bw = 0;
for(int n = 0; n < PASSES;n++)
{
if(n % 100000 == 0) cout << ".";
// note: this test uses 1 << 14 elements per array, not 1 << 16 as above
const int l = 1 << 14;
// buffL is only read in this test; its contents are never initialized
int buffL[l];
int maxByte = l*4;
int roffset[l];
// byte offsets into buffL: multiples of 4 reduced modulo the array size
for(int t = 0;t < l;t++) roffset[t] = (rand()*4) % maxByte;
LARGE_INTEGER frequency; // ticks per second
LARGE_INTEGER t1, t2; // ticks
// get ticks per second
QueryPerformanceFrequency(&frequency);
// start timer
QueryPerformanceCounter(&t1);
// per iteration: load an offset from roffset, then load from buffL at that
// offset; esi walks roffset sequentially in 4-byte steps
__asm
{
push esi
push edi
mov esi,0
loopL2:
mov edi, roffset[esi]
mov edi, buffL[edi]
add esi,4
cmp esi,maxByte
jb loopL2
pop edi
pop esi
}
// stop timer
QueryPerformanceCounter(&t2);
// compute elapsed time in millisec
double ms = (t2.QuadPart - t1.QuadPart) * 1000.0 / frequency.QuadPart;
// two 4-byte loads per element are counted here (offset load + data load)
bw += (double(2*4ull*l)/1073741824.0) / (double(ms)*0.001);
}
bw /= double(PASSES);
cout << endl;
cout << " Random : " << bw << " GB/s " << endl;
return 0;
}
Gathering the measurement code into a Bandwidth class, creating some constants, having all three tests use the same buffer (and size) aligning the tops of the loops and computing random offset into the entire buffer (3rd test):
#include "stdafx.h"
#include "windows.h"
#include <iostream>
#include <vector>
using namespace std;
// Benchmark parameters shared by all three tests.
constexpr size_t passes = 1000000;                      // repetitions of each test
constexpr size_t buffsize = 64 * 1024;                  // buffer size in bytes
constexpr double gigabyte = 1024.0 * 1024.0 * 1024.0;   // bytes per GB (2^30)
// Data volume of one whole test, in GB, assuming each pass touches buffsize
// bytes. static_cast is used because `long long(x)` is not a valid
// functional cast in standard C++ (the type name consists of two tokens).
constexpr double gb_per_test = double(static_cast<long long>(buffsize) * passes) / gigabyte;
struct Bandwidth
{
LARGE_INTEGER pc_tick_per_sec;
LARGE_INTEGER start_pc;
const char* _label;
public:
Bandwidth(const char* label): _label(label)
{
cout << "Running : ";
QueryPerformanceFrequency(&pc_tick_per_sec);
QueryPerformanceCounter(&start_pc);
}
~Bandwidth() {
LARGE_INTEGER end_pc{};
QueryPerformanceCounter(&end_pc);
const auto seconds = double(end_pc.QuadPart - start_pc.QuadPart) / pc_tick_per_sec.QuadPart;
cout << "\n " << _label << ": " << gb_per_test / seconds << " GB/s " << endl;
}
};
// Runs three read benchmarks over the same buffsize-byte buffer. Each test
// lives in its own scope so that the Bandwidth object's destructor prints the
// result as soon as the test's passes are complete.
//
// Changes relative to the earlier version of this function:
//  * the DWORD test now loads esi from buff_begin; it previously named the
//    std::vector object itself (`mov esi, buff`) rather than its data pointer;
//  * the random index is scaled by RAND_MAX + 1.0 so it can never equal the
//    element count (rand() == RAND_MAX used to produce a one-past-the-end
//    pointer that the loop then dereferenced);
//  * pass counters are size_t to match the type of `passes`.
int wmain()
{
    vector<char> buff(buffsize, 0);
    const auto buff_begin = buff.data();
    const auto buff_end = buff.data() + buffsize;

    {
        // NOTE(review): movlps is documented as a 64-bit load while the
        // pointer advances by 16 bytes per iteration - confirm whether a
        // full 128-bit load was intended.
        Bandwidth b("Sequential (SSE)");
        for (size_t n = 0; n < passes; ++n) {
            __asm {
                push esi
                push edi
                mov esi, buff_begin
                mov edi, buff_end
                align 16
            loopL0:
                movlps xmm0, [esi]
                lea esi, [esi + 16]
                cmp esi, edi
                jne loopL0
                pop edi
                pop esi
            }
        }
    }

    {
        Bandwidth b("Sequential (DWORD)");
        for (size_t n = 0; n < passes; ++n) {
            __asm {
                push esi
                push edi
                mov esi, buff_begin
                mov edi, buff_end
                align 16
            loopL1:
                mov eax, [esi]
                lea esi, [esi + 4]
                cmp esi, edi
                jne loopL1
                pop edi
                pop esi
            }
        }
    }

    {
        // Table of pointers to random 4-byte slots inside buff.
        // NOTE(review): the table has buffsize entries, so one pass performs
        // buffsize pointer loads plus buffsize data loads, whereas Bandwidth
        // reports gb_per_test, which assumes buffsize bytes per pass. The
        // "Random" figure is therefore not directly comparable with the two
        // sequential figures.
        uint32_t* roffset[buffsize];
        constexpr size_t dword_count = buffsize / sizeof(uint32_t);
        for (auto& roff : roffset) {
            // dividing by RAND_MAX + 1.0 keeps the index in [0, dword_count)
            const auto index =
                static_cast<size_t>(double(rand()) / (RAND_MAX + 1.0) * dword_count);
            roff = reinterpret_cast<uint32_t*>(buff.data()) + index;
        }
        const auto roffset_end = end(roffset);

        Bandwidth b("Random");
        for (size_t n = 0; n < passes; ++n) {
            __asm {
                push esi
                push edi
                push ebx
                lea edi, roffset        //begin(roffset)
                mov ebx, roffset_end    //end(roffset)
                align 16
            loopL2:
                mov esi, [edi]          //fetch the next random offset
                mov eax, [esi]          //read from the random location
                lea edi, [edi + 4]      // point to the next random offset
                cmp edi, ebx            //are we done?
                jne loopL2
                pop ebx
                pop edi
                pop esi
            }
        }
    }
}
I have also found more consistent results if I call SetPriorityClass(GetCurrentProcess(), HIGH_PRIORITY_CLASS); and SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL);
Your second test has one array on the stack that is 1 << 16 in size. That's 64K elements. Or, in a form that is easier to read:
int buffL[65536];
Your third test has two arrays on the stack. Both at `1 << 14' in size. That's 16K each
int buffL[16384];
int roffset[16384];
So right away you are using a much smaller stack size (i.e. fewer pages being cached and swapped out). I think your loop is only iterating half as many times in the third test as it is in the second. Maybe you meant to declare 1 << 15 (or 1 << 16) as the size instead for each array instead?

MCR and MRC instruction usage

Here I have written code to find the number of cycles taken by a function, but I am getting an error at the first MCR instruction. Can anyone suggest how to solve this problem? This code is written in Xcode and runs on iOS.
#include <stdio.h>
// Returns the current cycle-counter value, read with a single MRC from
// coprocessor p15 (c9, c13, opcode 0).
// NOTE(review): on ARMv7-A this encoding should be the PMU cycle count
// register, and reading it from user mode presumably requires privileged code
// to have enabled user access first - confirm for the target.
static inline unsigned int get_cyclecount (void)
{
unsigned int value;
// Read CCNT Register
asm volatile ("MRC p15, 0, %0, c9, c13, 0\t\n": "=r"(value));
return value;
}
// Programs the performance counters through three MCR writes to coprocessor
// p15: the control register, the counter-enable register and the overflow
// flags.
//   do_reset       - non-zero: also set the "reset all counters" and "reset
//                    cycle counter" bits in the control value.
//   enable_divider - non-zero: also set the "by 64" divider bit for CCNT.
// NOTE(review): these are writes to system coprocessor registers; they
// presumably fault when executed without sufficient privilege, which would
// explain an error at the first MCR - confirm against the target's
// documentation.
static inline void init_perfcounters (int do_reset, int enable_divider)
{
// in general enable all counters (including cycle counter)
int value = 1;
// perform reset:
if (do_reset)
{
value |= 2; // reset all counters to zero.
value |= 4; // reset cycle counter to zero.
}
if (enable_divider)
value |= 8; // enable "by 64" divider for CCNT.
// bit 4 is set unconditionally; its purpose is not stated here
// (NOTE(review): presumably the export-enable bit - verify)
value |= 16;
// program the performance-counter control-register:
asm volatile ("MCR p15, 0, %0, c9, c12, 0\t\n" :: "r"(value));
// enable all counters:
asm volatile ("MCR p15, 0, %0, c9, c12, 1\t\n" :: "r"(0x8000000f));
// clear overflows:
asm volatile ("MCR p15, 0, %0, c9, c12, 3\t\n" :: "r"(0x8000000f));
}
// Measures how many cycles one call to log_10_c_function() takes.
// The counters are reset and enabled first; the cost of two back-to-back
// counter reads is measured as overhead and subtracted from the result.
int main () {
    float x = 100.0f;

    // reset the counters, no "by 64" divider
    init_perfcounters (1, 0);

    // measure the counting overhead:
    unsigned int overhead = get_cyclecount();
    overhead = get_cyclecount() - overhead;

    unsigned int t = get_cyclecount();
    // do some stuff here..
    log_10_c_function(x);
    t = get_cyclecount() - t;

    // t and overhead are unsigned, so the matching conversion is %u
    printf ("Totaly %u cycles (including function call) ", t - overhead);
    return 0;
}

OpenCV SURF comparing descriptors

The following snippet is from OpenCV's find_obj.cpp, which is a demo of using SURF:
// Returns the sum of squared differences between the descriptors d1 and d2,
// accumulated four components at a time.
//
//   d1, d2 - descriptors with at least `length` floats each.
//   best   - early-exit threshold: accumulation stops as soon as the running
//            total exceeds it, so a return value greater than `best` is only
//            a partial sum.
//   length - number of components; must be a multiple of 4 (asserted).
//
// The loop header and body had been lost from this listing (the text between
// '<' and '>' was dropped); they are restored here to match the per-statement
// source lines shown in the accompanying disassembly.
double
compareSURFDescriptors( const float* d1, const float* d2, double best, int length )
{
    double total_cost = 0;
    assert( length % 4 == 0 );
    int i;
    for( i = 0; i < length; i += 4 )
    {
        double t0 = d1[i] - d2[i];
        double t1 = d1[i+1] - d2[i+1];
        double t2 = d1[i+2] - d2[i+2];
        double t3 = d1[i+3] - d2[i+3];
        total_cost += t0*t0 + t1*t1 + t2*t2 + t3*t3;
        // checked once per group of four, not once per component
        if( total_cost > best )
            break;
    }
    return total_cost;
}
As far as I can tell, it is checking the Euclidean distance; what I do not understand is why it is doing it in groups of 4. Why not calculate the whole thing at once?
Usually things like this are done to make SSE optimizations possible. SSE registers are 128 bits long and can contain 4 floats, so you can do the 4 subtractions in parallel using one instruction.
Another upside: you have to check the loop counter only after every fourth difference. That makes the code faster even if the compiler doesn't use the opportunity to generate SSE code. For example, VS2008 didn't, not even with -O2:
double t0 = d1[i] - d2[i];
00D91666 fld dword ptr [edx-0Ch]
00D91669 fsub dword ptr [ecx-4]
double t1 = d1[i+1] - d2[i+1];
00D9166C fld dword ptr [ebx+ecx]
00D9166F fsub dword ptr [ecx]
double t2 = d1[i+2] - d2[i+2];
00D91671 fld dword ptr [edx-4]
00D91674 fsub dword ptr [ecx+4]
double t3 = d1[i+3] - d2[i+3];
00D91677 fld dword ptr [edx]
00D91679 fsub dword ptr [ecx+8]
total_cost += t0*t0 + t1*t1 + t2*t2 + t3*t3;
00D9167C fld st(2)
00D9167E fmulp st(3),st
00D91680 fld st(3)
00D91682 fmulp st(4),st
00D91684 fxch st(2)
00D91686 faddp st(3),st
00D91688 fmul st(0),st
00D9168A faddp st(2),st
00D9168C fmul st(0),st
00D9168E faddp st(1),st
00D91690 faddp st(2),st
I think it is because for each subregion we get 4 numbers. Totally 4x4x4 subregions making 64 length vector. So its basically getting the difference between 2 sub regions.

Resources