How does LLVM translate OpenMP multi-threaded code with runtime library calls? - clang

When I studied the LLVM OpenMP Runtime Library document, I found there is an example about work sharing:
extern float foo( void );
// Sums the results of ten calls to foo() into r using an OpenMP work-sharing loop.
int main () {
int i;
float r = 0.0;
// The ten iterations are handed out with schedule(dynamic); the reduction(+:r)
// clause makes each thread accumulate privately and combine the partial sums
// into r with '+' when the loop finishes.
#pragma omp parallel for schedule(dynamic) reduction(+:r)
for ( i = 0; i < 10; i ++ ) {
r += foo();
}
}
and then it shows the transformed code like below:
extern float foo( void );
int main () {
static int zero = 0;
auto int gtid;
auto float r = 0.0;
__kmpc_begin( & loc3, 0 );
// The gtid is not actually required in this example so could be omitted;
// We show its initialization here because it is often required for calls into
// the runtime and should be locally cached like this.
gtid = __kmpc_global thread num( & loc3 );
__kmpc_fork call( & loc7, 1, main_7_parallel_3, & r );
__kmpc_end( & loc0 );
return 0;
}
struct main_10_reduction_t_5 { float r_10_rpr; };
static kmp_critical_name lck = { 0 };
static ident_t loc10; // loc10.flags should contain KMP_IDENT_ATOMIC_REDUCE bit set
// if compiler has generated an atomic reduction.
// Outlined body of the parallel loop. Each thread repeatedly asks the runtime
// for a range of iterations, accumulates foo() into a private partial sum, and
// finally merges that partial sum into the shared r that r_7_shp points to.
//   gtid    - pointer to the thread id value passed on to every runtime call
//   btid    - unused in this body
//   r_7_shp - pointer to the shared reduction variable r
void main_7_parallel_3( int *gtid, int *btid, float *r_7_shp ) {
auto int i_7_pr; // private loop index
auto int lower, upper, liter, incr; // filled in by __kmpc_dispatch_next_4
auto struct main_10_reduction_t_5 reduce;
// Private partial sum starts at 0, the identity for '+'.
reduce.r_10_rpr = 0.F;
liter = 0;
// Hand the iteration space to the runtime; 0, 9, 1 match the source loop's
// bounds and increment.
// NOTE(review): 35 is presumably the runtime's code for schedule(dynamic) and
// the final 1 the chunk size - confirm against the runtime headers.
__kmpc_dispatch_init_4( & loc7,*gtid, 35, 0, 9, 1, 1 );
// Keep fetching [lower, upper] ranges until the runtime reports none are left.
while ( __kmpc_dispatch_next_4( & loc7, *gtid, & liter, & lower, & upper, & incr
) ) {
for( i_7_pr = lower; upper >= i_7_pr; i_7_pr ++ )
reduce.r_10_rpr += foo();
}
// The runtime's return value selects how this thread merges its partial sum.
switch( __kmpc_reduce_nowait( & loc10, *gtid, 1, 4, & reduce, main_10_reduce_5, &lck ) ) {
case 1:
// Plain update of the shared variable, then tell the runtime the reduction is done.
*r_7_shp += reduce.r_10_rpr;
__kmpc_end_reduce_nowait( & loc10, *gtid, & lck );
break;
case 2:
// Atomic update of the shared variable.
__kmpc_atomic_float4_add( & loc10, *gtid, r_7_shp, reduce.r_10_rpr );
break;
// Any other value: nothing to do for this thread.
default:;
}
}
I spent a lot of time trying to find out how OpenMP code is transformed like the above, but I still could not find a way to show the result like the example, or how the transformation works in OpenMP.
So, here is the question that has confused me for a long time:
Is there any way to output files or show the result directly like the example?

You can inspect the LLVM IR (see https://llvm.org/docs/LangRef.html).
For example:
clang -fopenmp -O2 -emit-llvm -S -o - example.c
Will print the following to stdout:
[...]
; Function Attrs: nounwind uwtable
; main() after OpenMP lowering: it only sets up i and r and forks the parallel
; region. Global symbols carry the '@' sigil; '#N' refers to attribute groups.
define dso_local i32 @main() local_unnamed_addr #0 {
entry:
  %i = alloca i32, align 4
  %r = alloca float, align 4
  %0 = bitcast i32* %i to i8*
  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0) #4
  %1 = bitcast float* %r to i8*
  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %1) #4
  ; r = 0.0
  store float 0.000000e+00, float* %r, align 4, !tbaa !2
  ; Fork the parallel region: @.omp_outlined. receives the two shared variables %i and %r.
  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @0, i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, float*)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* nonnull %i, float* nonnull %r) #4
  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %1) #4
  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0) #4
  ret i32 0
}
[...]
; Function Attrs: norecurse nounwind uwtable
; Outlined body of the parallel loop. Each thread pulls iteration ranges from
; the runtime dispatcher, accumulates foo() into its private copy %r1, and then
; merges %r1 into the shared %r as directed by @__kmpc_reduce_nowait.
; Global symbols carry the '@' sigil; '#N' refers to attribute groups.
define internal void @.omp_outlined.(i32* noalias nocapture readonly %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture readnone dereferenceable(4) %i, float* nocapture dereferenceable(4) %r) #2 {
entry:
  %.omp.lb = alloca i32, align 4
  %.omp.ub = alloca i32, align 4
  %.omp.stride = alloca i32, align 4
  %.omp.is_last = alloca i32, align 4
  %r1 = alloca float, align 4
  %.omp.reduction.red_list = alloca [1 x i8*], align 8
  %0 = bitcast i32* %.omp.lb to i8*
  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0) #4
  store i32 0, i32* %.omp.lb, align 4, !tbaa !6
  %1 = bitcast i32* %.omp.ub to i8*
  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %1) #4
  store i32 9, i32* %.omp.ub, align 4, !tbaa !6
  %2 = bitcast i32* %.omp.stride to i8*
  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %2) #4
  store i32 1, i32* %.omp.stride, align 4, !tbaa !6
  %3 = bitcast i32* %.omp.is_last to i8*
  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %3) #4
  store i32 0, i32* %.omp.is_last, align 4, !tbaa !6
  %4 = bitcast float* %r1 to i8*
  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %4) #4
  ; Private partial sum starts at 0.0.
  store float 0.000000e+00, float* %r1, align 4, !tbaa !2
  %5 = load i32, i32* %.global_tid., align 4, !tbaa !6
  ; Register the iteration space 0..9 with the dispatcher, then request the first range.
  tail call void @__kmpc_dispatch_init_4(%struct.ident_t* nonnull @0, i32 %5, i32 35, i32 0, i32 9, i32 1, i32 1) #4
  %6 = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* nonnull @0, i32 %5, i32* nonnull %.omp.is_last, i32* nonnull %.omp.lb, i32* nonnull %.omp.ub, i32* nonnull %.omp.stride) #4
  %tobool14 = icmp eq i32 %6, 0
  br i1 %tobool14, label %omp.dispatch.end, label %omp.dispatch.body

omp.dispatch.cond.loopexit: ; preds = %omp.inner.for.body, %omp.dispatch.body
  ; Current range is finished: ask the runtime for the next one.
  %7 = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* nonnull @0, i32 %5, i32* nonnull %.omp.is_last, i32* nonnull %.omp.lb, i32* nonnull %.omp.ub, i32* nonnull %.omp.stride) #4
  %tobool = icmp eq i32 %7, 0
  br i1 %tobool, label %omp.dispatch.end, label %omp.dispatch.body

omp.dispatch.body: ; preds = %entry, %omp.dispatch.cond.loopexit
  %8 = load i32, i32* %.omp.lb, align 4, !tbaa !6
  %9 = load i32, i32* %.omp.ub, align 4, !tbaa !6, !llvm.mem.parallel_loop_access !8
  ; Skip the inner loop when the range is empty (lb > ub).
  %cmp12 = icmp sgt i32 %8, %9
  br i1 %cmp12, label %omp.dispatch.cond.loopexit, label %omp.inner.for.body

omp.inner.for.body: ; preds = %omp.dispatch.body, %omp.inner.for.body
  %.omp.iv.013 = phi i32 [ %add4, %omp.inner.for.body ], [ %8, %omp.dispatch.body ]
  ; r1 += foo()
  %call = call float @foo() #4, !llvm.mem.parallel_loop_access !8
  %10 = load float, float* %r1, align 4, !tbaa !2, !llvm.mem.parallel_loop_access !8
  %add3 = fadd float %call, %10
  store float %add3, float* %r1, align 4, !tbaa !2, !llvm.mem.parallel_loop_access !8
  %add4 = add nsw i32 %.omp.iv.013, 1
  %11 = load i32, i32* %.omp.ub, align 4, !tbaa !6, !llvm.mem.parallel_loop_access !8
  %cmp = icmp slt i32 %.omp.iv.013, %11
  br i1 %cmp, label %omp.inner.for.body, label %omp.dispatch.cond.loopexit, !llvm.loop !8

omp.dispatch.end: ; preds = %omp.dispatch.cond.loopexit, %entry
  ; Build the one-element reduction list { &r1 } and let the runtime choose the merge strategy.
  %12 = bitcast [1 x i8*]* %.omp.reduction.red_list to float**
  store float* %r1, float** %12, align 8
  %13 = bitcast [1 x i8*]* %.omp.reduction.red_list to i8*
  %14 = call i32 @__kmpc_reduce_nowait(%struct.ident_t* nonnull @1, i32 %5, i32 1, i64 8, i8* nonnull %13, void (i8*, i8*)* nonnull @.omp.reduction.reduction_func, [8 x i32]* nonnull @.gomp_critical_user_.reduction.var) #4
  switch i32 %14, label %.omp.reduction.default [
    i32 1, label %.omp.reduction.case1
    i32 2, label %.omp.reduction.case2
  ]

.omp.reduction.case1: ; preds = %omp.dispatch.end
  ; Case 1: plain r += r1, followed by the matching end-of-reduction call.
  %15 = load float, float* %r, align 4, !tbaa !2
  %16 = load float, float* %r1, align 4, !tbaa !2
  %add5 = fadd float %15, %16
  store float %add5, float* %r, align 4, !tbaa !2
  call void @__kmpc_end_reduce_nowait(%struct.ident_t* nonnull @1, i32 %5, [8 x i32]* nonnull @.gomp_critical_user_.reduction.var) #4
  br label %.omp.reduction.default

.omp.reduction.case2: ; preds = %omp.dispatch.end
  ; Case 2: atomic r += r1, implemented as a compare-exchange loop on the float's bit pattern.
  %17 = bitcast float* %r to i32*
  %atomic-load = load atomic i32, i32* %17 monotonic, align 4, !tbaa !2
  %18 = load float, float* %r1, align 4, !tbaa !2
  br label %atomic_cont

atomic_cont: ; preds = %atomic_cont, %.omp.reduction.case2
  %19 = phi i32 [ %atomic-load, %.omp.reduction.case2 ], [ %23, %atomic_cont ]
  %20 = bitcast i32 %19 to float
  %add7 = fadd float %18, %20
  %21 = bitcast float %add7 to i32
  %22 = cmpxchg i32* %17, i32 %19, i32 %21 monotonic monotonic
  %23 = extractvalue { i32, i1 } %22, 0
  %24 = extractvalue { i32, i1 } %22, 1
  ; Retry with the freshly observed value until the exchange succeeds.
  br i1 %24, label %.omp.reduction.default, label %atomic_cont

.omp.reduction.default: ; preds = %atomic_cont, %.omp.reduction.case1, %omp.dispatch.end
  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %4) #4
  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %3) #4
  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %2) #4
  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %1) #4
  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0) #4
  ret void
}
[...]

Related

CLANG is ignoring AVX2 intrinsics in this code

I'm testing LLVM's ability to vectorize some code in https://rust.godbolt.org/
Options : -mavx2 -ffast-math -fno-math-errno -O3
Compiler LLVM 13, but any LLVM actually does the same thing.
#include <immintrin.h>
/// Pack of four values A..D with element-wise accumulation.
///
/// operator+= uses SSE float intrinsics, so it only type-checks when T is
/// float; the remaining members work for any arithmetic T.
template<class T>
struct V4
{
    T A, B, C, D;

    /// Leaves the four members uninitialized.
    V4() { }
    /// Broadcasts x into all four members.
    V4(T x) : A(x), B(x), C(x), D(x) { }
    /// Sets the four members individually.
    V4(T a, T b, T c, T d) : A(a), B(b), C(c), D(d) { }

    /// Adds x to *this member by member.
    void operator +=(const V4& x)
    {
        // The vector load/store below treats A..D as one packed run of four values.
        static_assert(sizeof(V4) == 4 * sizeof(T), "V4 members must be contiguous");
        // Four floats are 128 bits, so use 128-bit unaligned SSE operations.
        // A 256-bit load/store would touch 32 bytes of a 16-byte object, and
        // the aligned store would additionally require 32-byte alignment.
        const __m128 lhs = _mm_loadu_ps(&A);
        const __m128 rhs = _mm_loadu_ps(&x.A);
        _mm_storeu_ps(&A, _mm_add_ps(lhs, rhs));
    }

    /// Returns A + B + C + D.
    T GetSum() const { return A + B + C + D; }
};
typedef V4<float> V4F;
double FN(float f[4], float g[4], int cnt)
{
V4F vec1(f[0], f[1], f[2], f[3]), vec2(g[0], g[1], g[2], g[3]);
for (int i=0; i<cnt; i++)
vec1 += vec2;
return vec1.GetSum();
};
This is the resulting disassembly:
FN(float*, float*, int): # @FN(float*, float*, int)
vmovddup xmm0, qword ptr [rdi + 8] # xmm0 = mem[0,0]
vaddps xmm0, xmm0, xmmword ptr [rdi]
vmovshdup xmm1, xmm0 # xmm1 = xmm0[1,1,3,3]
vaddss xmm0, xmm0, xmm1
vcvtss2sd xmm0, xmm0, xmm0
ret
So it is completely ignoring the intrinsics. If I uncomment the part that should be doing the same thing in plain C++, a really long piece of code appears, so the compiler apparently starts understanding it.
Am I missing something or is this a bug in LLVM?

LLVM IR - from IR Assembly Format to C

I know you can use llvm-dis to get the Assembly Format (*.ll) from IR. Is there a way to get C from the Assembly Format? More precisely, is there a way to translate certain parts of it to C?
I.e this code:
test.c
#define NMAX 9
#include <stdio.h>
typedef short Word16;

/*
 * Returns the median of the first n entries of ind[].
 *
 * The entries are ranked from largest to smallest by repeatedly selecting the
 * current maximum of a scratch copy and knocking it out with -32768; the input
 * element whose rank is n/2 is returned. Among equal values the one with the
 * highest index is selected first.
 *
 * The scratch arrays hold NMAX entries, so the caller must keep n within
 * 1..NMAX.
 */
Word16 gmed_n( /* o : the median value */
    Word16 ind[], /* i : input values */
    Word16 n /* i : number of inputs */
)
{
    Word16 tmp[NMAX];     /* tmp[k] = index into ind[] of the k-th largest value */
    Word16 work[NMAX];    /* scratch copy of ind[]; selected entries become -32768 */
    Word16 pick = 0;      /* index selected by the latest scan (kept across scans) */
    Word16 medianIndex;
    Word16 rank, pos;

    for (rank = 0; rank < n; rank++)
    {
        work[rank] = ind[rank];
    }

    for (rank = 0; rank < n; rank++)
    {
        Word16 best = -32767;

        for (pos = 0; pos < n; pos++)
        {
            if (work[pos] < best)
            {
                continue;
            }
            best = work[pos];
            pick = pos;
        }

        work[pick] = -32768;
        tmp[rank] = pick;
    }

    medianIndex = *(tmp + (n >> 1)); /* account for complex addressing */
    return (*(ind + medianIndex));
}
/* Prints the median of a fixed five-element sample as computed by gmed_n(). */
int main()
{
    short a[5] = {1, 32767, 3, 4, 5};
    short res = gmed_n(a, 5);

    printf("%i\n", res);
    return 0;
}
How can I translate (part from LL)
85: ; preds = %34
%86 = getelementptr inbounds [9 x i16], [9 x i16]* %10, i64 0, i64 0
%87 = load i16, i16* %4, align 2
%88 = sext i16 %87 to i32
%89 = ashr i32 %88, 1
%90 = sext i32 %89 to i64
%91 = getelementptr inbounds i16, i16* %86, i64 %90
%92 = load i16, i16* %91, align 2
store i16 %92, i16* %9, align 2
%93 = load i16*, i16** %3, align 8
%94 = load i16, i16* %9, align 2
%95 = sext i16 %94 to i32
%96 = sext i32 %95 to i64
%97 = getelementptr inbounds i16, i16* %93, i64 %96
%98 = load i16, i16* %97, align 2
ret i16 %98
To C:
return (*(ind + medianIndex));
How would you approach it? What tech stack?
I would like to pinpoint C code from LLVM IR Assembly format.

MCR and MRC instruction usage

Here I have written code to find the number of cycles taken by a function, but I am getting an error at the first MCR instruction. Can anyone suggest how to solve this problem? This code is written in Xcode and runs on iOS.
#include <stdio.h>
/* Returns the current value of the CCNT register, read through coprocessor p15. */
static inline unsigned int get_cyclecount (void)
{
    unsigned int ccnt;

    asm volatile ("MRC p15, 0, %0, c9, c13, 0\t\n" : "=r"(ccnt));

    return ccnt;
}
/*
 * Programs the performance-counter control register, then enables all
 * counters and clears their overflow flags.
 *
 *   do_reset       - non-zero: also reset all counters and the cycle counter
 *   enable_divider - non-zero: also enable the "by 64" divider for CCNT
 */
static inline void init_perfcounters (int do_reset, int enable_divider)
{
    /* Bit 0 (1) enables all counters, including the cycle counter.
       NOTE(review): bit 4 (16) is set unconditionally but was not explained
       in the original - confirm its meaning in the ARM reference manual. */
    int control = 1 | 16;

    if (do_reset)
    {
        control |= 2 | 4; /* 2: reset all counters, 4: reset cycle counter */
    }
    if (enable_divider)
    {
        control |= 8; /* enable "by 64" divider for CCNT */
    }

    /* program the performance-counter control-register */
    asm volatile ("MCR p15, 0, %0, c9, c12, 0\t\n" :: "r"(control));
    /* enable all counters */
    asm volatile ("MCR p15, 0, %0, c9, c12, 1\t\n" :: "r"(0x8000000f));
    /* clear overflows */
    asm volatile ("MCR p15, 0, %0, c9, c12, 3\t\n" :: "r"(0x8000000f));
}
/*
 * Measures how many cycles one call to log_10_c_function() takes, corrected
 * for the cost of reading the cycle counter itself.
 */
int main () {
    float x = 100.0f;

    /* reset the counters, no "by 64" divider */
    init_perfcounters (1, 0);

    /* measure the counting overhead: two back-to-back reads */
    unsigned int overhead = get_cyclecount();
    overhead = get_cyclecount() - overhead;

    unsigned int t = get_cyclecount();
    /* code under test */
    log_10_c_function(x);
    t = get_cyclecount() - t;

    /* the operands are unsigned, so print with %u rather than %d */
    printf ("Totaly %u cycles (including function call) ", t - overhead);
    return 0;
}

Error on MapView (IOS) and llvm

I created a new project, placed a MapView on the controller's view, compiled, and got this error:
LLVM ERROR: Cannot select: 0xac06488: v16i8 = X86ISD::PSHUFB 0xac04da0, 0xa3c5570 [ID=502]
0xac04da0: v16i8 = llvm.x86.sse2.packuswb.128 0xac04d18, 0xac04c08, 0xac04c90 [ORD=1614] [ID=501]
0xac04d18: i32 = TargetConstant<2543> [ORD=1614] [ID=33]
0xac04c08: v8i16 = llvm.x86.sse2.packssdw.128 0xabf5f70, 0xabf5a20, 0xabf5bb8 [ORD=1612] [ID=500]
0xabf5f70: i32 = TargetConstant<2541> [ORD=1612] [ID=32]
0xabf5a20: v4i32 = llvm.x86.sse2.cvttps2dq 0xabf5998, 0xabf5910 [ORD=1602] [ID=498]
0xabf5998: i32 = TargetConstant<2528> [ORD=1602] [ID=31]
0xabf5910: v4f32 = fadd 0xabf5778, 0xac04fc0 [ORD=1601] [ID=495]
0xabf5778: v4f32 = fmul 0xabf5118, 0xabf5888 [ORD=1600] [ID=492]
0xabf5118: v4f32 = llvm.x86.sse.min.ps 0xa3d39a0, 0xabf5090, 0xa3c1cf0 [ORD=1587] [ID=489]
0xa3d39a0: i32 = TargetConstant<2700> [ORD=1406] [ID=23]
0xabf5090: v4f32 = llvm.x86.sse.max.ps 0xa3bdd70, 0xabf5008, 0xa3bed70 [ORD=1586] [ID=486]
0xa3bdd70: i32 = TargetConstant<2698> [ORD=1456] [ID=27]
0xabf5008: v4f32,ch = load 0xabfa170, 0xabd68b0, 0xa3bb570<LD16[%429](tbaa=!"omnipotent char")> [ORD=1585] [ID=482]
0xabd68b0: i32 = FrameIndex<1> [ORD=1190] [ID=4]
0xa3bb570: i32 = undef [ORD=1189] [ID=3]
0xa3bed70: v4f32 = bitcast 0xa3c81b8 [ID=124]
0xa3c81b8: v4i32 = BUILD_VECTOR 0xa3c8130, 0xa3c8130, 0xa3c8130, 0xa3c8130 [ID=98]
0xa3c1cf0: v4f32,ch = load 0x9a78fac, 0xa3c6be0, 0xa3bb570<LD16[ConstantPool]> [ID=146]
0xa3c6be0: i32 = add 0xa3c17a0, 0xa3c22c8 [ID=127]
0xa3c17a0: i32 = X86ISD::GlobalBaseReg [ID=55]
0xa3c22c8: i32 = X86ISD::Wrapper 0xa3c6718 [ID=99]
0xa3bb570: i32 = undef [ORD=1189] [ID=3]
0xabf5888: v4f32,ch = load 0x9a78fac, 0xac05eb0, 0xa3bb570<LD16[ConstantPool]> [ID=156]
0xac05eb0: i32 = add 0xa3c17a0, 0xac15ae8 [ID=137]
0xa3c17a0: i32 = X86ISD::GlobalBaseReg [ID=55]
0xac15ae8: i32 = X86ISD::Wrapper 0xabf5668 [ID=109]
0xabf5668: i32 = TargetConstantPool<<4 x float> <float 2.550000e+02, float 2.550000e+02, float 2.550000e+02, float 2.550000e+02>> 0 [TF=2] [ID=65]
0xa3bb570: i32 = undef [ORD=1189] [ID=3]
0xac04fc0: v4f32,ch = load 0x9a78fac, 0xac15d18, 0xa3bb570<LD16[ConstantPool]> [ID=157]
0xac15d18: i32 = add 0xa3c17a0, 0xac05fc0 [ID=138]
0xa3c17a0: i32 = X86ISD::GlobalBaseReg [ID=55]
0xac05fc0: i32 = X86ISD::Wrapper 0xabf5800 [ID=110]
0xabf5800: i32 = TargetConstantPool<<4 x float> <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>> 0 [TF=2] [ID=66]
0xa3bb570: i32 = undef [ORD=1189] [ID=3]
0xabf5bb8: v4i32 = llvm.x86.sse2.cvttps2dq 0xabf5998, 0xabf5b30 [ORD=1605] [ID=497]
0xabf5998: i32 = TargetConstant<2528> [ORD=1602] [ID=31]
0xabf5b30: v4f32 = fadd 0xabf5aa8, 0xac04fc0 [ORD=1604] [ID=494]
0xabf5aa8: v4f32 = fmul 0xabf52b0, 0xabf5888 [ORD=1603] [ID=491]
0xabf52b0: v4f32 = llvm.x86.sse.min.ps 0xa3d39a0, 0xabf5228, 0xa3c1cf0 [ORD=1591] [ID=488]
0xa3d39a0: i32 = TargetConstant<2700> [ORD=1406] [ID=23]
0xabf5228: v4f32 = llvm.x86.sse.max.ps 0xa3bdd70, 0xabf51a0, 0xa3bed70 [ORD=1590] [ID=484]
0xa3bdd70: i32 = TargetConstant<2698> [ORD=1456] [ID=27]
0xabf51a0: v4f32,ch = load 0xabfa170, 0xabd7240, 0xa3bb570<LD16[%433](tbaa=!"omnipotent char")> [ORD=1589] [ID=481]
0xabd7240: i32 = add 0xabd68b0, 0xabd7350 [ORD=1418] [ID=74]
0xa3bb570: i32 = undef [ORD=1189] [ID=3]
0xa3bed70: v4f32 = bitcast 0xa3c81b8 [ID=124]
0xa3c81b8: v4i32 = BUILD_VECTOR 0xa3c8130, 0xa3c8130, 0xa3c8130, 0xa3c8130 [ID=98]
0xa3c1cf0: v4f32,ch = load 0x9a78fac, 0xa3c6be0, 0xa3bb570<LD16[ConstantPool]> [ID=146]
0xa3c6be0: i32 = add 0xa3c17a0, 0xa3c22c8 [ID=127]
0xa3c17a0: i32 = X86ISD::GlobalBaseReg [ID=55]
0xa3c22c8: i32 = X86ISD::Wrapper 0xa3c6718 [ID=99]
0xa3bb570: i32 = undef [ORD=1189] [ID=3]
0xabf5888: v4f32,ch = load 0x9a78fac, 0xac05eb0, 0xa3bb570<LD16[ConstantPool]> [ID=156]
0xac05eb0: i32 = add 0xa3c17a0, 0xac15ae8 [ID=137]
0xa3c17a0: i32 = X86ISD::GlobalBaseReg [ID=55]
0xac15ae8: i32 = X86ISD::Wrapper 0xabf5668 [ID=109]
0xabf5668: i32 = TargetConstantPool<<4 x float> <float 2.550000e+02, float 2.550000e+02, float 2.550000e+02, float 2.550000e+02>> 0 [TF=2] [ID=65]
0xa3bb570: i32 = undef [ORD=1189] [ID=3]
0xac04fc0: v4f32,ch = load 0x9a78fac, 0xac15d18, 0xa3bb570<LD16[ConstantPool]> [ID=157]
0xac15d18: i32 = add 0xa3c17a0, 0xac05fc0 [ID=138]
0xa3c17a0: i32 = X86ISD::GlobalBaseReg [ID=55]
0xac05fc0: i32 = X86ISD::Wrapper 0xabf5800 [ID=110]
0xabf5800: i32 = TargetConstantPool<<4 x float> <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>> 0 [TF=2] [ID=66]
0xa3bb570: i32 = undef [ORD=1189] [ID=3]
0xac04c90: v8i16 = llvm.x86.sse2.packssdw.128 0xabf5f70, 0xabf5d50, 0xabf5ee8 [ORD=1613] [ID=499]
0xabf5f70: i32 = TargetConstant<2541> [ORD=1612] [ID=32]
0xabf5d50: v4i32 = llvm.x86.sse2.cvttps2dq 0xabf5998, 0xabf5cc8 [ORD=1608] [ID=496]
0xabf5998: i32 = TargetConstant<2528> [ORD=1602] [ID=31]
0xabf5cc8: v4f32 = fadd 0xabf5c40, 0xac04fc0 [ORD=1607] [ID=493]
0xabf5c40: v4f32 = fmul 0xabf5448, 0xabf5888 [ORD=1606] [ID=490]
0xabf5448: v4f32 = llvm.x86.sse.min.ps 0xa3d39a0, 0xabf53c0, 0xa3c1cf0 [ORD=1595] [ID=487]
0xa3d39a0: i32 = TargetConstant<2700> [ORD=1406] [ID=23]
0xabf53c0: v4f32 = llvm.x86.sse.max.ps 0xa3bdd70, 0xabf5338, 0xa3bed70 [ORD=1594] [ID=483]
0xa3bdd70: i32 = TargetConstant<2698> [ORD=1456] [ID=27]
0xabf5338: v4f32,ch = load 0xabfa170, 0xabd32d0, 0xa3bb570<LD16[%437](tbaa=!"omnipotent char")> [ORD=1593] [ID=480]
0xabd32d0: i32 = add 0xabd68b0, 0xa3c7828 [ORD=1422] [ID=76]
0xa3bb570: i32 = undef [ORD=1189] [ID=3]
0xa3bed70: v4f32 = bitcast 0xa3c81b8 [ID=124]
0xa3c81b8: v4i32 = BUILD_VECTOR 0xa3c8130, 0xa3c8130, 0xa3c8130, 0xa3c8130 [ID=98]
0xa3c1cf0: v4f32,ch = load 0x9a78fac, 0xa3c6be0, 0xa3bb570<LD16[ConstantPool]> [ID=146]
0xa3c6be0: i32 = add 0xa3c17a0, 0xa3c22c8 [ID=127]
0xa3c17a0: i32 = X86ISD::GlobalBaseReg [ID=55]
0xa3c22c8: i32 = X86ISD::Wrapper 0xa3c6718 [ID=99]
0xa3bb570: i32 = undef [ORD=1189] [ID=3]
0xabf5888: v4f32,ch = load 0x9a78fac, 0xac05eb0, 0xa3bb570<LD16[ConstantPool]> [ID=156]
0xac05eb0: i32 = add 0xa3c17a0, 0xac15ae8 [ID=137]
0xa3c17a0: i32 = X86ISD::GlobalBaseReg [ID=55]
0xac15ae8: i32 = X86ISD::Wrapper 0xabf5668 [ID=109]
0xabf5668: i32 = TargetConstantPool<<4 x float> <float 2.550000e+02, float 2.550000e+02, float 2.550000e+02, float 2.550000e+02>> 0 [TF=2] [ID=65]
0xa3bb570: i32 = undef [ORD=1189] [ID=3]
0xac04fc0: v4f32,ch = load 0x9a78fac, 0xac15d18, 0xa3bb570<LD16[ConstantPool]> [ID=157]
0xac15d18: i32 = add 0xa3c17a0, 0xac05fc0 [ID=138]
0xa3c17a0: i32 = X86ISD::GlobalBaseReg [ID=55]
0xac05fc0: i32 = X86ISD::Wrapper 0xabf5800 [ID=110]
0xabf5800: i32 = TargetConstantPool<<4 x float> <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>> 0 [TF=2] [ID=66]
0xa3bb570: i32 = undef [ORD=1189] [ID=3]
0xabf5ee8: v4i32 = llvm.x86.sse2.cvttps2dq 0xabf5998, 0xabf5e60 [ORD=1611] [ID=479]
0xabf5998: i32 = TargetConstant<2528> [ORD=1602] [ID=31]
0xabf5e60: v4f32 = fadd 0xabf5dd8, 0xac04fc0 [ORD=1610] [ID=477]
0xabf5dd8: v4f32 = fmul 0xabf55e0, 0xabf5888 [ORD=1609] [ID=475]
0xabf55e0: v4f32 = llvm.x86.sse.min.ps 0xa3d39a0, 0xabf5558, 0xa3c1cf0 [ORD=1599] [ID=473]
0xa3d39a0: i32 = TargetConstant<2700> [ORD=1406] [ID=23]
0xabf5558: v4f32 = llvm.x86.sse.max.ps 0xa3bdd70, 0xabf9ec8, 0xa3bed70 [ORD=1598] [ID=471]
0xa3bdd70: i32 = TargetConstant<2698> [ORD=1456] [ID=27]
0xabf9ec8: v4f32 = fadd 0xabf9e40, 0xabf9db8 [ORD=1575] [ID=467]
0xabf9e40: v4f32 = fmul 0xa3c8570, 0xac14f38 [ORD=1574] [ID=463]
0xabf9db8: v4f32 = fmul 0xac14da0, 0xa3be1c0 [ORD=1573] [ID=418]
0xa3bed70: v4f32 = bitcast 0xa3c81b8 [ID=124]
0xa3c81b8: v4i32 = BUILD_VECTOR 0xa3c8130, 0xa3c8130, 0xa3c8130, 0xa3c8130 [ID=98]
0xa3c1cf0: v4f32,ch = load 0x9a78fac, 0xa3c6be0, 0xa3bb570<LD16[ConstantPool]> [ID=146]
0xa3c6be0: i32 = add 0xa3c17a0, 0xa3c22c8 [ID=127]
0xa3c17a0: i32 = X86ISD::GlobalBaseReg [ID=55]
0xa3c22c8: i32 = X86ISD::Wrapper 0xa3c6718 [ID=99]
0xa3bb570: i32 = undef [ORD=1189] [ID=3]
0xabf5888: v4f32,ch = load 0x9a78fac, 0xac05eb0, 0xa3bb570<LD16[ConstantPool]> [ID=156]
0xac05eb0: i32 = add 0xa3c17a0, 0xac15ae8 [ID=137]
0xa3c17a0: i32 = X86ISD::GlobalBaseReg [ID=55]
0xac15ae8: i32 = X86ISD::Wrapper 0xabf5668 [ID=109]
0xabf5668: i32 = TargetConstantPool<<4 x float> <float 2.550000e+02, float 2.550000e+02, float 2.550000e+02, float 2.550000e+02>> 0 [TF=2] [ID=65]
0xa3bb570: i32 = undef [ORD=1189] [ID=3]
0xac04fc0: v4f32,ch = load 0x9a78fac, 0xac15d18, 0xa3bb570<LD16[ConstantPool]> [ID=157]
0xac15d18: i32 = add 0xa3c17a0, 0xac05fc0 [ID=138]
0xa3c17a0: i32 = X86ISD::GlobalBaseReg [ID=55]
0xac05fc0: i32 = X86ISD::Wrapper 0xabf5800 [ID=110]
0xabf5800: i32 = TargetConstantPool<<4 x float> <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>> 0 [TF=2] [ID=66]
0xa3bb570: i32 = undef [ORD=1189] [ID=3]
0xa3c5570: v16i8 = bitcast 0xac04f38 [ID=162]
0xac04f38: v2i64,ch = load 0x9a78fac, 0xa3c6a48, 0xa3bb570<LD16[ConstantPool]> [ID=158]
0xa3c6a48: i32 = add 0xa3c17a0, 0xabd34f0 [ID=139]
0xa3c17a0: i32 = X86ISD::GlobalBaseReg [ID=55]
0xabd34f0: i32 = X86ISD::Wrapper 0xa3d3a28 [ID=111]
0xa3d3a28: i32 = TargetConstantPool<<16 x i8> <i8 2, i8 1, i8 0, i8 3, i8 6, i8 5, i8 4, i8 7, i8 10, i8 9, i8 8, i8 11, i8 14, i8 13, i8 12, i8 15>> 0 [TF=2] [ID=67]
0xa3bb570: i32 = undef [ORD=1189] [ID=3]
In function: gldLLVMFPTransform8
Can anyone help? Thanks!

getAffineTransform fails when one of its argument matrices is not continuous?

I'm using OpenCV 2.4.6.
The following doesn't work:
// Builds two triangles from the dimensions of the loaded image and asks OpenCV
// for the affine transform that maps the first onto the second.
// Load the image named by the first command-line argument.
cv::Mat src = cv::imread(argv[1], 1);
// Source triangle (x1k, y1k).
// NOTE(review): x11 and x12 are derived from src.rows and y11 from src.cols,
// while every other x uses cols and every other y uses rows - possibly a
// rows/cols mix-up; confirm the intended coordinates.
int x11 = src.rows/3; int y11 = src.cols/3;
int x12 = src.rows/3; int y12 = src.rows*2/3;
int x13 = src.cols*2/3; int y13 = src.rows*2/3;
// Destination triangle (x2k, y2k): top-left, bottom-left and bottom-right image corners.
int x21 = 0; int y21 = 0;
int x22 = 0; int y22 = src.rows-1;
int x23 = src.cols-1; int y23 = src.rows-1;
// Variant 1: each point is stored as three floats (x, y, 0) and the 3x2 Mat is
// given a row step of 3*sizeof(float), so its rows are not packed back to back.
#if 1 // doesn't work
float src_tri_data[] = {x11, y11, 0, x12, y12, 0, x13, y13, 0};
cv::Mat src_tri(3, 2, CV_32F, src_tri_data, 3*sizeof(float));
float dst_tri_data[] = {x21, y21, 0, x22, y22, 0, x23, y23, 0};
cv::Mat dst_tri(3, 2, CV_32F, dst_tri_data, 3*sizeof(float));
// Variant 2: each point is stored as two floats (x, y) and the row step equals
// the row width, 2*sizeof(float).
#else // works
float src_tri_data[] = {x11, y11, x12, y12, x13, y13};
cv::Mat src_tri(3, 2, CV_32F, src_tri_data, 2*sizeof(float));
float dst_tri_data[] = {x21, y21, x22, y22, x23, y23};
cv::Mat dst_tri(3, 2, CV_32F, dst_tri_data, 2*sizeof(float));
#endif
// Compute the transform that maps the source triangle onto the destination triangle.
cv::Mat trans = cv::getAffineTransform(src_tri, dst_tri);
It fails with an assertion:
OpenCV Error: Assertion failed (src.checkVector(2, CV_32F) == 3 && dst.checkVector(2, CV_32F) == 3) in unknown function, file C:\slave\builds\WinInstallerMegaPack\src\opencv\modules\imgproc\src\imgwarp.cpp, line 3612
However, if I make the matrices continuous it works.
Any idea why?
The assertion checks that the row at index 2 contains 3 elements, but your matrices contain 2 elements per row.
The sizes of the matrices are not correct; modify the construction of your matrices:
cv::Mat src_tri(3,
3, // <=== 3 instead of 2
CV_32F,
src_tri_data,
3*sizeof(float));

Resources