I created a new project, placed a MapView on the view controller's view, then compiled and got this error:
LLVM ERROR: Cannot select: 0xac06488: v16i8 = X86ISD::PSHUFB 0xac04da0, 0xa3c5570 [ID=502]
0xac04da0: v16i8 = llvm.x86.sse2.packuswb.128 0xac04d18, 0xac04c08, 0xac04c90 [ORD=1614] [ID=501]
0xac04d18: i32 = TargetConstant<2543> [ORD=1614] [ID=33]
0xac04c08: v8i16 = llvm.x86.sse2.packssdw.128 0xabf5f70, 0xabf5a20, 0xabf5bb8 [ORD=1612] [ID=500]
0xabf5f70: i32 = TargetConstant<2541> [ORD=1612] [ID=32]
0xabf5a20: v4i32 = llvm.x86.sse2.cvttps2dq 0xabf5998, 0xabf5910 [ORD=1602] [ID=498]
0xabf5998: i32 = TargetConstant<2528> [ORD=1602] [ID=31]
0xabf5910: v4f32 = fadd 0xabf5778, 0xac04fc0 [ORD=1601] [ID=495]
0xabf5778: v4f32 = fmul 0xabf5118, 0xabf5888 [ORD=1600] [ID=492]
0xabf5118: v4f32 = llvm.x86.sse.min.ps 0xa3d39a0, 0xabf5090, 0xa3c1cf0 [ORD=1587] [ID=489]
0xa3d39a0: i32 = TargetConstant<2700> [ORD=1406] [ID=23]
0xabf5090: v4f32 = llvm.x86.sse.max.ps 0xa3bdd70, 0xabf5008, 0xa3bed70 [ORD=1586] [ID=486]
0xa3bdd70: i32 = TargetConstant<2698> [ORD=1456] [ID=27]
0xabf5008: v4f32,ch = load 0xabfa170, 0xabd68b0, 0xa3bb570<LD16[%429](tbaa=!"omnipotent char")> [ORD=1585] [ID=482]
0xabd68b0: i32 = FrameIndex<1> [ORD=1190] [ID=4]
0xa3bb570: i32 = undef [ORD=1189] [ID=3]
0xa3bed70: v4f32 = bitcast 0xa3c81b8 [ID=124]
0xa3c81b8: v4i32 = BUILD_VECTOR 0xa3c8130, 0xa3c8130, 0xa3c8130, 0xa3c8130 [ID=98]
0xa3c1cf0: v4f32,ch = load 0x9a78fac, 0xa3c6be0, 0xa3bb570<LD16[ConstantPool]> [ID=146]
0xa3c6be0: i32 = add 0xa3c17a0, 0xa3c22c8 [ID=127]
0xa3c17a0: i32 = X86ISD::GlobalBaseReg [ID=55]
0xa3c22c8: i32 = X86ISD::Wrapper 0xa3c6718 [ID=99]
0xa3bb570: i32 = undef [ORD=1189] [ID=3]
0xabf5888: v4f32,ch = load 0x9a78fac, 0xac05eb0, 0xa3bb570<LD16[ConstantPool]> [ID=156]
0xac05eb0: i32 = add 0xa3c17a0, 0xac15ae8 [ID=137]
0xa3c17a0: i32 = X86ISD::GlobalBaseReg [ID=55]
0xac15ae8: i32 = X86ISD::Wrapper 0xabf5668 [ID=109]
0xabf5668: i32 = TargetConstantPool<<4 x float> <float 2.550000e+02, float 2.550000e+02, float 2.550000e+02, float 2.550000e+02>> 0 [TF=2] [ID=65]
0xa3bb570: i32 = undef [ORD=1189] [ID=3]
0xac04fc0: v4f32,ch = load 0x9a78fac, 0xac15d18, 0xa3bb570<LD16[ConstantPool]> [ID=157]
0xac15d18: i32 = add 0xa3c17a0, 0xac05fc0 [ID=138]
0xa3c17a0: i32 = X86ISD::GlobalBaseReg [ID=55]
0xac05fc0: i32 = X86ISD::Wrapper 0xabf5800 [ID=110]
0xabf5800: i32 = TargetConstantPool<<4 x float> <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>> 0 [TF=2] [ID=66]
0xa3bb570: i32 = undef [ORD=1189] [ID=3]
0xabf5bb8: v4i32 = llvm.x86.sse2.cvttps2dq 0xabf5998, 0xabf5b30 [ORD=1605] [ID=497]
0xabf5998: i32 = TargetConstant<2528> [ORD=1602] [ID=31]
0xabf5b30: v4f32 = fadd 0xabf5aa8, 0xac04fc0 [ORD=1604] [ID=494]
0xabf5aa8: v4f32 = fmul 0xabf52b0, 0xabf5888 [ORD=1603] [ID=491]
0xabf52b0: v4f32 = llvm.x86.sse.min.ps 0xa3d39a0, 0xabf5228, 0xa3c1cf0 [ORD=1591] [ID=488]
0xa3d39a0: i32 = TargetConstant<2700> [ORD=1406] [ID=23]
0xabf5228: v4f32 = llvm.x86.sse.max.ps 0xa3bdd70, 0xabf51a0, 0xa3bed70 [ORD=1590] [ID=484]
0xa3bdd70: i32 = TargetConstant<2698> [ORD=1456] [ID=27]
0xabf51a0: v4f32,ch = load 0xabfa170, 0xabd7240, 0xa3bb570<LD16[%433](tbaa=!"omnipotent char")> [ORD=1589] [ID=481]
0xabd7240: i32 = add 0xabd68b0, 0xabd7350 [ORD=1418] [ID=74]
0xa3bb570: i32 = undef [ORD=1189] [ID=3]
0xa3bed70: v4f32 = bitcast 0xa3c81b8 [ID=124]
0xa3c81b8: v4i32 = BUILD_VECTOR 0xa3c8130, 0xa3c8130, 0xa3c8130, 0xa3c8130 [ID=98]
0xa3c1cf0: v4f32,ch = load 0x9a78fac, 0xa3c6be0, 0xa3bb570<LD16[ConstantPool]> [ID=146]
0xa3c6be0: i32 = add 0xa3c17a0, 0xa3c22c8 [ID=127]
0xa3c17a0: i32 = X86ISD::GlobalBaseReg [ID=55]
0xa3c22c8: i32 = X86ISD::Wrapper 0xa3c6718 [ID=99]
0xa3bb570: i32 = undef [ORD=1189] [ID=3]
0xabf5888: v4f32,ch = load 0x9a78fac, 0xac05eb0, 0xa3bb570<LD16[ConstantPool]> [ID=156]
0xac05eb0: i32 = add 0xa3c17a0, 0xac15ae8 [ID=137]
0xa3c17a0: i32 = X86ISD::GlobalBaseReg [ID=55]
0xac15ae8: i32 = X86ISD::Wrapper 0xabf5668 [ID=109]
0xabf5668: i32 = TargetConstantPool<<4 x float> <float 2.550000e+02, float 2.550000e+02, float 2.550000e+02, float 2.550000e+02>> 0 [TF=2] [ID=65]
0xa3bb570: i32 = undef [ORD=1189] [ID=3]
0xac04fc0: v4f32,ch = load 0x9a78fac, 0xac15d18, 0xa3bb570<LD16[ConstantPool]> [ID=157]
0xac15d18: i32 = add 0xa3c17a0, 0xac05fc0 [ID=138]
0xa3c17a0: i32 = X86ISD::GlobalBaseReg [ID=55]
0xac05fc0: i32 = X86ISD::Wrapper 0xabf5800 [ID=110]
0xabf5800: i32 = TargetConstantPool<<4 x float> <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>> 0 [TF=2] [ID=66]
0xa3bb570: i32 = undef [ORD=1189] [ID=3]
0xac04c90: v8i16 = llvm.x86.sse2.packssdw.128 0xabf5f70, 0xabf5d50, 0xabf5ee8 [ORD=1613] [ID=499]
0xabf5f70: i32 = TargetConstant<2541> [ORD=1612] [ID=32]
0xabf5d50: v4i32 = llvm.x86.sse2.cvttps2dq 0xabf5998, 0xabf5cc8 [ORD=1608] [ID=496]
0xabf5998: i32 = TargetConstant<2528> [ORD=1602] [ID=31]
0xabf5cc8: v4f32 = fadd 0xabf5c40, 0xac04fc0 [ORD=1607] [ID=493]
0xabf5c40: v4f32 = fmul 0xabf5448, 0xabf5888 [ORD=1606] [ID=490]
0xabf5448: v4f32 = llvm.x86.sse.min.ps 0xa3d39a0, 0xabf53c0, 0xa3c1cf0 [ORD=1595] [ID=487]
0xa3d39a0: i32 = TargetConstant<2700> [ORD=1406] [ID=23]
0xabf53c0: v4f32 = llvm.x86.sse.max.ps 0xa3bdd70, 0xabf5338, 0xa3bed70 [ORD=1594] [ID=483]
0xa3bdd70: i32 = TargetConstant<2698> [ORD=1456] [ID=27]
0xabf5338: v4f32,ch = load 0xabfa170, 0xabd32d0, 0xa3bb570<LD16[%437](tbaa=!"omnipotent char")> [ORD=1593] [ID=480]
0xabd32d0: i32 = add 0xabd68b0, 0xa3c7828 [ORD=1422] [ID=76]
0xa3bb570: i32 = undef [ORD=1189] [ID=3]
0xa3bed70: v4f32 = bitcast 0xa3c81b8 [ID=124]
0xa3c81b8: v4i32 = BUILD_VECTOR 0xa3c8130, 0xa3c8130, 0xa3c8130, 0xa3c8130 [ID=98]
0xa3c1cf0: v4f32,ch = load 0x9a78fac, 0xa3c6be0, 0xa3bb570<LD16[ConstantPool]> [ID=146]
0xa3c6be0: i32 = add 0xa3c17a0, 0xa3c22c8 [ID=127]
0xa3c17a0: i32 = X86ISD::GlobalBaseReg [ID=55]
0xa3c22c8: i32 = X86ISD::Wrapper 0xa3c6718 [ID=99]
0xa3bb570: i32 = undef [ORD=1189] [ID=3]
0xabf5888: v4f32,ch = load 0x9a78fac, 0xac05eb0, 0xa3bb570<LD16[ConstantPool]> [ID=156]
0xac05eb0: i32 = add 0xa3c17a0, 0xac15ae8 [ID=137]
0xa3c17a0: i32 = X86ISD::GlobalBaseReg [ID=55]
0xac15ae8: i32 = X86ISD::Wrapper 0xabf5668 [ID=109]
0xabf5668: i32 = TargetConstantPool<<4 x float> <float 2.550000e+02, float 2.550000e+02, float 2.550000e+02, float 2.550000e+02>> 0 [TF=2] [ID=65]
0xa3bb570: i32 = undef [ORD=1189] [ID=3]
0xac04fc0: v4f32,ch = load 0x9a78fac, 0xac15d18, 0xa3bb570<LD16[ConstantPool]> [ID=157]
0xac15d18: i32 = add 0xa3c17a0, 0xac05fc0 [ID=138]
0xa3c17a0: i32 = X86ISD::GlobalBaseReg [ID=55]
0xac05fc0: i32 = X86ISD::Wrapper 0xabf5800 [ID=110]
0xabf5800: i32 = TargetConstantPool<<4 x float> <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>> 0 [TF=2] [ID=66]
0xa3bb570: i32 = undef [ORD=1189] [ID=3]
0xabf5ee8: v4i32 = llvm.x86.sse2.cvttps2dq 0xabf5998, 0xabf5e60 [ORD=1611] [ID=479]
0xabf5998: i32 = TargetConstant<2528> [ORD=1602] [ID=31]
0xabf5e60: v4f32 = fadd 0xabf5dd8, 0xac04fc0 [ORD=1610] [ID=477]
0xabf5dd8: v4f32 = fmul 0xabf55e0, 0xabf5888 [ORD=1609] [ID=475]
0xabf55e0: v4f32 = llvm.x86.sse.min.ps 0xa3d39a0, 0xabf5558, 0xa3c1cf0 [ORD=1599] [ID=473]
0xa3d39a0: i32 = TargetConstant<2700> [ORD=1406] [ID=23]
0xabf5558: v4f32 = llvm.x86.sse.max.ps 0xa3bdd70, 0xabf9ec8, 0xa3bed70 [ORD=1598] [ID=471]
0xa3bdd70: i32 = TargetConstant<2698> [ORD=1456] [ID=27]
0xabf9ec8: v4f32 = fadd 0xabf9e40, 0xabf9db8 [ORD=1575] [ID=467]
0xabf9e40: v4f32 = fmul 0xa3c8570, 0xac14f38 [ORD=1574] [ID=463]
0xabf9db8: v4f32 = fmul 0xac14da0, 0xa3be1c0 [ORD=1573] [ID=418]
0xa3bed70: v4f32 = bitcast 0xa3c81b8 [ID=124]
0xa3c81b8: v4i32 = BUILD_VECTOR 0xa3c8130, 0xa3c8130, 0xa3c8130, 0xa3c8130 [ID=98]
0xa3c1cf0: v4f32,ch = load 0x9a78fac, 0xa3c6be0, 0xa3bb570<LD16[ConstantPool]> [ID=146]
0xa3c6be0: i32 = add 0xa3c17a0, 0xa3c22c8 [ID=127]
0xa3c17a0: i32 = X86ISD::GlobalBaseReg [ID=55]
0xa3c22c8: i32 = X86ISD::Wrapper 0xa3c6718 [ID=99]
0xa3bb570: i32 = undef [ORD=1189] [ID=3]
0xabf5888: v4f32,ch = load 0x9a78fac, 0xac05eb0, 0xa3bb570<LD16[ConstantPool]> [ID=156]
0xac05eb0: i32 = add 0xa3c17a0, 0xac15ae8 [ID=137]
0xa3c17a0: i32 = X86ISD::GlobalBaseReg [ID=55]
0xac15ae8: i32 = X86ISD::Wrapper 0xabf5668 [ID=109]
0xabf5668: i32 = TargetConstantPool<<4 x float> <float 2.550000e+02, float 2.550000e+02, float 2.550000e+02, float 2.550000e+02>> 0 [TF=2] [ID=65]
0xa3bb570: i32 = undef [ORD=1189] [ID=3]
0xac04fc0: v4f32,ch = load 0x9a78fac, 0xac15d18, 0xa3bb570<LD16[ConstantPool]> [ID=157]
0xac15d18: i32 = add 0xa3c17a0, 0xac05fc0 [ID=138]
0xa3c17a0: i32 = X86ISD::GlobalBaseReg [ID=55]
0xac05fc0: i32 = X86ISD::Wrapper 0xabf5800 [ID=110]
0xabf5800: i32 = TargetConstantPool<<4 x float> <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>> 0 [TF=2] [ID=66]
0xa3bb570: i32 = undef [ORD=1189] [ID=3]
0xa3c5570: v16i8 = bitcast 0xac04f38 [ID=162]
0xac04f38: v2i64,ch = load 0x9a78fac, 0xa3c6a48, 0xa3bb570<LD16[ConstantPool]> [ID=158]
0xa3c6a48: i32 = add 0xa3c17a0, 0xabd34f0 [ID=139]
0xa3c17a0: i32 = X86ISD::GlobalBaseReg [ID=55]
0xabd34f0: i32 = X86ISD::Wrapper 0xa3d3a28 [ID=111]
0xa3d3a28: i32 = TargetConstantPool<<16 x i8> <i8 2, i8 1, i8 0, i8 3, i8 6, i8 5, i8 4, i8 7, i8 10, i8 9, i8 8, i8 11, i8 14, i8 13, i8 12, i8 15>> 0 [TF=2] [ID=67]
0xa3bb570: i32 = undef [ORD=1189] [ID=3]
In function: gldLLVMFPTransform8
Can anyone help? Thanks!
Related
I'm testing LLVM's ability to vectorize some code in https://rust.godbolt.org/
Options : -mavx2 -ffast-math -fno-math-errno -O3
Compiler LLVM 13, but any LLVM actually does the same thing.
#include <immintrin.h>
/// Four-lane value type (A, B, C, D) used to experiment with SIMD code generation.
///
/// operator+= is written with SSE intrinsics and therefore is only meaningful
/// for T = float: it treats the four members as one contiguous 16-byte vector
/// starting at A (this relies on the members being laid out back to back).
template<class T>
struct V4
{
    T A, B, C, D;

    /// Leaves all four lanes uninitialized.
    V4() {}
    /// Broadcasts x into all four lanes.
    V4(T x) : A(x), B(x), C(x), D(x) {}
    /// Initializes each lane individually.
    V4(T a, T b, T c, T d) : A(a), B(b), C(c), D(d) {}

    /// Lane-wise addition: A += x.A; B += x.B; C += x.C; D += x.D.
    void operator +=(const V4& x)
    {
        // The struct holds four floats (16 bytes), i.e. exactly one __m128.
        // The previous 256-bit load/store touched eight floats, reading and
        // writing past the end of the object (undefined behaviour), and the
        // aligned 256-bit store additionally required 32-byte alignment that
        // this type does not guarantee. Use 128-bit unaligned operations.
        const __m128 lhs = _mm_loadu_ps(&A);
        const __m128 rhs = _mm_loadu_ps(&x.A);
        _mm_storeu_ps(&A, _mm_add_ps(lhs, rhs));
    }

    /// Returns A + B + C + D.
    T GetSum() const { return A + B + C + D; }
};
typedef V4<float> V4F;
double FN(float f[4], float g[4], int cnt)
{
V4F vec1(f[0], f[1], f[2], f[3]), vec2(g[0], g[1], g[2], g[3]);
for (int i=0; i<cnt; i++)
vec1 += vec2;
return vec1.GetSum();
};
This is the resulting disassembly:
FN(float*, float*, int): # #FN(float*, float*, int)
vmovddup xmm0, qword ptr [rdi + 8] # xmm0 = mem[0,0]
vaddps xmm0, xmm0, xmmword ptr [rdi]
vmovshdup xmm1, xmm0 # xmm1 = xmm0[1,1,3,3]
vaddss xmm0, xmm0, xmm1
vcvtss2sd xmm0, xmm0, xmm0
ret
So it is completely ignoring the intrinsics. If I instead enable the commented-out line, which should do the same thing in plain C++, a much longer listing is generated, so the compiler apparently does understand that version.
Am I missing something or is this a bug in LLVM?
I know you can use llvm-dis to get the assembly format (*.ll) from LLVM IR bitcode. Is there a way to get C back from the assembly format? Or, more precisely, to translate certain parts of it to C?
I.e this code:
test.c
#define NMAX 9
#include <stdio.h>
typedef short Word16;

/*
 * gmed_n - median of the first n entries of ind[].
 *
 * Works on a scratch copy of the inputs: it repeatedly picks the largest
 * remaining value, records the index of each pick (so the recorded indices are
 * in descending order of value), and finally returns ind[] at the index stored
 * in position n/2 of that ordering. ind[] itself is not modified.
 *
 * The scratch arrays hold NMAX entries, so n must satisfy 1 <= n <= NMAX.
 * Any other n returns 0 instead of reading or writing out of bounds.
 */
Word16 gmed_n( /* o : the median value */
    Word16 ind[], /* i : input values */
    Word16 n /* i : number of inputs */
)
{
    Word16 i, j, ix = 0;
    Word16 max;
    Word16 medianIndex;
    Word16 tmp[NMAX];  /* indices into ind[], ordered by descending value */
    Word16 tmp2[NMAX]; /* scratch copy of the inputs, consumed by the selection */

    /* Guard the fixed-size scratch buffers (and the tmp[n >> 1] read below). */
    if (n <= 0 || n > NMAX)
    {
        return 0;
    }

    for (i = 0; i < n; i++)
    {
        tmp2[i] = ind[i];
    }

    for (i = 0; i < n; i++)
    {
        /* Find the largest remaining value; ">=" makes ties pick the last index. */
        max = -32767;
        for (j = 0; j < n; j++)
        {
            if (tmp2[j] >= max)
            {
                max = tmp2[j];
                ix = j;
            }
        }
        /* Mark the pick as used: -32768 is below the initial max of -32767. */
        tmp2[ix] = -32768;
        tmp[i] = ix;
    }

    medianIndex = tmp[n >> 1]; /* account for complex addressing */
    return ind[medianIndex];
}
/* Driver: prints the median of a fixed five-element sample using gmed_n(). */
int main()
{
    short a[5]={1,32767,3,4,5};
    short res;

    res=gmed_n(a,5);
    printf("%i\n",res);
    return 0;
}
How can I translate (part from LL)
85: ; preds = %34
%86 = getelementptr inbounds [9 x i16], [9 x i16]* %10, i64 0, i64 0
%87 = load i16, i16* %4, align 2
%88 = sext i16 %87 to i32
%89 = ashr i32 %88, 1
%90 = sext i32 %89 to i64
%91 = getelementptr inbounds i16, i16* %86, i64 %90
%92 = load i16, i16* %91, align 2
store i16 %92, i16* %9, align 2
%93 = load i16*, i16** %3, align 8
%94 = load i16, i16* %9, align 2
%95 = sext i16 %94 to i32
%96 = sext i32 %95 to i64
%97 = getelementptr inbounds i16, i16* %93, i64 %96
%98 = load i16, i16* %97, align 2
ret i16 %98
To C:
return (*(ind + medianIndex));
How would you approach it? What tech stack?
I would like to pinpoint C code from LLVM IR Assembly format.
When I studied the LLVM OpenMP Runtime Library document, I found there is an example about work sharing:
extern float foo( void );
int main () {
int i;
float r = 0.0;
#pragma omp parallel for schedule(dynamic) reduction(+:r)
for ( i = 0; i < 10; i ++ ) {
r += foo();
}
}
and then it shows the transformed code like below:
extern float foo( void );
int main () {
static int zero = 0;
auto int gtid;
auto float r = 0.0;
__kmpc_begin( & loc3, 0 );
// The gtid is not actually required in this example so could be omitted;
// We show its initialization here because it is often required for calls into
// the runtime and should be locally cached like this.
gtid = __kmpc_global thread num( & loc3 );
__kmpc_fork call( & loc7, 1, main_7_parallel_3, & r );
__kmpc_end( & loc0 );
return 0;
}
struct main_10_reduction_t_5 { float r_10_rpr; };
static kmp_critical_name lck = { 0 };
static ident_t loc10; // loc10.flags should contain KMP_IDENT_ATOMIC_REDUCE bit set
// if compiler has generated an atomic reduction.
void main_7_parallel_3( int *gtid, int *btid, float *r_7_shp ) {
auto int i_7_pr;
auto int lower, upper, liter, incr;
auto struct main_10_reduction_t_5 reduce;
reduce.r_10_rpr = 0.F;
liter = 0;
__kmpc_dispatch_init_4( & loc7,*gtid, 35, 0, 9, 1, 1 );
while ( __kmpc_dispatch_next_4( & loc7, *gtid, & liter, & lower, & upper, & incr
) ) {
for( i_7_pr = lower; upper >= i_7_pr; i_7_pr ++ )
reduce.r_10_rpr += foo();
}
switch( __kmpc_reduce_nowait( & loc10, *gtid, 1, 4, & reduce, main_10_reduce_5, &lck ) ) {
case 1:
*r_7_shp += reduce.r_10_rpr;
__kmpc_end_reduce_nowait( & loc10, *gtid, & lck );
break;
case 2:
__kmpc_atomic_float4_add( & loc10, *gtid, r_7_shp, reduce.r_10_rpr );
break;
default:;
}
}
I spent a lot of time trying to find out how OpenMP transforms code like the above, but I still could not find a way to display the result as in the example, or to see how the transformation works in OpenMP.
So here is the question that has confused me for a long time:
Is there any way to write the transformed code to a file, or to show it directly, as in the example?
You can inspect the LLVM IR (see https://llvm.org/docs/LangRef.html).
For example:
clang -fopenmp -O2 -emit-llvm -S -o - example.c
Will print the following to stdout:
[...]
; Function Attrs: nounwind uwtable
define dso_local i32 @main() local_unnamed_addr #0 {
entry:
%i = alloca i32, align 4
%r = alloca float, align 4
%0 = bitcast i32* %i to i8*
call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0) #4
%1 = bitcast float* %r to i8*
call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %1) #4
store float 0.000000e+00, float* %r, align 4, !tbaa !2
call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @0, i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, float*)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* nonnull %i, float* nonnull %r) #4
call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %1) #4
call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0) #4
ret i32 0
}
[...]
; Function Attrs: norecurse nounwind uwtable
define internal void @.omp_outlined.(i32* noalias nocapture readonly %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture readnone dereferenceable(4) %i, float* nocapture dereferenceable(4) %r) #2 {
entry:
%.omp.lb = alloca i32, align 4
%.omp.ub = alloca i32, align 4
%.omp.stride = alloca i32, align 4
%.omp.is_last = alloca i32, align 4
%r1 = alloca float, align 4
%.omp.reduction.red_list = alloca [1 x i8*], align 8
%0 = bitcast i32* %.omp.lb to i8*
call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0) #4
store i32 0, i32* %.omp.lb, align 4, !tbaa !6
%1 = bitcast i32* %.omp.ub to i8*
call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %1) #4
store i32 9, i32* %.omp.ub, align 4, !tbaa !6
%2 = bitcast i32* %.omp.stride to i8*
call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %2) #4
store i32 1, i32* %.omp.stride, align 4, !tbaa !6
%3 = bitcast i32* %.omp.is_last to i8*
call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %3) #4
store i32 0, i32* %.omp.is_last, align 4, !tbaa !6
%4 = bitcast float* %r1 to i8*
call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %4) #4
store float 0.000000e+00, float* %r1, align 4, !tbaa !2
%5 = load i32, i32* %.global_tid., align 4, !tbaa !6
tail call void @__kmpc_dispatch_init_4(%struct.ident_t* nonnull @0, i32 %5, i32 35, i32 0, i32 9, i32 1, i32 1) #4
%6 = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* nonnull @0, i32 %5, i32* nonnull %.omp.is_last, i32* nonnull %.omp.lb, i32* nonnull %.omp.ub, i32* nonnull %.omp.stride) #4
%tobool14 = icmp eq i32 %6, 0
br i1 %tobool14, label %omp.dispatch.end, label %omp.dispatch.body
omp.dispatch.cond.loopexit: ; preds = %omp.inner.for.body, %omp.dispatch.body
%7 = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* nonnull @0, i32 %5, i32* nonnull %.omp.is_last, i32* nonnull %.omp.lb, i32* nonnull %.omp.ub, i32* nonnull %.omp.stride) #4
%tobool = icmp eq i32 %7, 0
br i1 %tobool, label %omp.dispatch.end, label %omp.dispatch.body
omp.dispatch.body: ; preds = %entry, %omp.dispatch.cond.loopexit
%8 = load i32, i32* %.omp.lb, align 4, !tbaa !6
%9 = load i32, i32* %.omp.ub, align 4, !tbaa !6, !llvm.mem.parallel_loop_access !8
%cmp12 = icmp sgt i32 %8, %9
br i1 %cmp12, label %omp.dispatch.cond.loopexit, label %omp.inner.for.body
omp.inner.for.body: ; preds = %omp.dispatch.body, %omp.inner.for.body
%.omp.iv.013 = phi i32 [ %add4, %omp.inner.for.body ], [ %8, %omp.dispatch.body ]
%call = call float @foo() #4, !llvm.mem.parallel_loop_access !8
%10 = load float, float* %r1, align 4, !tbaa !2, !llvm.mem.parallel_loop_access !8
%add3 = fadd float %call, %10
store float %add3, float* %r1, align 4, !tbaa !2, !llvm.mem.parallel_loop_access !8
%add4 = add nsw i32 %.omp.iv.013, 1
%11 = load i32, i32* %.omp.ub, align 4, !tbaa !6, !llvm.mem.parallel_loop_access !8
%cmp = icmp slt i32 %.omp.iv.013, %11
br i1 %cmp, label %omp.inner.for.body, label %omp.dispatch.cond.loopexit, !llvm.loop !8
omp.dispatch.end: ; preds = %omp.dispatch.cond.loopexit, %entry
%12 = bitcast [1 x i8*]* %.omp.reduction.red_list to float**
store float* %r1, float** %12, align 8
%13 = bitcast [1 x i8*]* %.omp.reduction.red_list to i8*
%14 = call i32 @__kmpc_reduce_nowait(%struct.ident_t* nonnull @1, i32 %5, i32 1, i64 8, i8* nonnull %13, void (i8*, i8*)* nonnull @.omp.reduction.reduction_func, [8 x i32]* nonnull @.gomp_critical_user_.reduction.var) #4
switch i32 %14, label %.omp.reduction.default [
i32 1, label %.omp.reduction.case1
i32 2, label %.omp.reduction.case2
]
.omp.reduction.case1: ; preds = %omp.dispatch.end
%15 = load float, float* %r, align 4, !tbaa !2
%16 = load float, float* %r1, align 4, !tbaa !2
%add5 = fadd float %15, %16
store float %add5, float* %r, align 4, !tbaa !2
call void @__kmpc_end_reduce_nowait(%struct.ident_t* nonnull @1, i32 %5, [8 x i32]* nonnull @.gomp_critical_user_.reduction.var) #4
br label %.omp.reduction.default
.omp.reduction.case2: ; preds = %omp.dispatch.end
%17 = bitcast float* %r to i32*
%atomic-load = load atomic i32, i32* %17 monotonic, align 4, !tbaa !2
%18 = load float, float* %r1, align 4, !tbaa !2
br label %atomic_cont
atomic_cont: ; preds = %atomic_cont, %.omp.reduction.case2
%19 = phi i32 [ %atomic-load, %.omp.reduction.case2 ], [ %23, %atomic_cont ]
%20 = bitcast i32 %19 to float
%add7 = fadd float %18, %20
%21 = bitcast float %add7 to i32
%22 = cmpxchg i32* %17, i32 %19, i32 %21 monotonic monotonic
%23 = extractvalue { i32, i1 } %22, 0
%24 = extractvalue { i32, i1 } %22, 1
br i1 %24, label %.omp.reduction.default, label %atomic_cont
.omp.reduction.default: ; preds = %atomic_cont, %.omp.reduction.case1, %omp.dispatch.end
call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %4) #4
call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %3) #4
call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %2) #4
call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %1) #4
call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0) #4
ret void
}
[...]
I started to make a skybox for a simple game, and everything works fine as long as I use only one rotation transformation: either X, Y, or Z.
Image:
But before moving on to OpenGL, there is a problem: for my game I want a simple square to fly inside the skybox, so I need to combine several rotation transformations (X, Y, and Z together) to rotate the camera, i.e. rotate the world around the camera. Whenever multiple rotation transformations are stacked together, the cube becomes visible because its edges are exposed.
Image:
Code:
let canvas = document.querySelector("canvas");
let gl = canvas.getContext("webgl");
gl.canvas.width = canvas.getBoundingClientRect().width;
gl.canvas.height = canvas.getBoundingClientRect().height;
let sliders = [
{
label : "rotationX",
valueStart : 0,
valueEnd : 360,
valueCurrent : 0,
measurement : "°"
}, {
label : "rotationY",
valueStart : 0,
valueEnd : 360,
valueCurrent : 0,
measurement : "°"
}, {
label: "rotationZ",
valueStart : 0,
valueEnd : 360,
valueCurrent : 0,
measurement: "°"
}, {
label: "translationCamera",
valueStart : -2,
valueEnd : 2,
valueCurrent : 0,
measurement: "t"
}
];
setSliders(sliders, drawScene, true, gl);
let vertexShaderSource = `
attribute vec4 a_position;
attribute vec3 a_texture;
varying vec3 v_texture;
uniform mat4 u_matrix;
void main() {
vec4 pos = u_matrix*a_position;
pos.z /= 2.0*sqrt(2.0); //Given the length of cube' side, the maximum value of z is 2.0*sqrt(2.0), bring it back to 1.0 or -1.0 if min value.
gl_Position = vec4(pos.xyz, 1.0);
v_texture = a_position.xyz / sqrt(2.0); //a_position is [-sqrt(2.0), sqrt(2.0)]
}
`;
let fragmentShaderSource = `
precision mediump float;
varying vec3 v_texture;
uniform samplerCube u_skybox;
void main() {
gl_FragColor = textureCube(u_skybox, v_texture);
}
`;
function createCube(x1, y1, length) {
return [
//Blue
x1+length, y1, length/2,
x1+length, y1+length, length/2,
x1+length, y1+length, -length/2,
x1+length, y1+length, -length/2,
x1+length, y1, -length/2,
x1+length, y1, length/2,
//Green
x1, y1, length/2,
x1, y1, -length/2,
x1, y1+length, -length/2,
x1, y1+length, -length/2,
x1, y1+length, length/2,
x1, y1, length/2,
//Yellow
x1, y1+length, length/2,
x1+length, y1+length, -length/2,
x1+length, y1+length, length/2,
x1+length, y1+length, -length/2,
x1, y1+length, length/2,
x1, y1+length, -length/2,
//Light Blue
x1, y1, length/2,
x1+length, y1, length/2,
x1+length, y1, -length/2,
x1+length, y1, -length/2,
x1, y1, -length/2,
x1, y1, length/2,
//Rose
x1, y1, length/2,
x1, y1+length, length/2,
x1+length, y1+length, length/2,
x1+length, y1+length, length/2,
x1+length, y1, length/2,
x1, y1, length/2,
//Red
x1, y1, -length/2,
x1+length, y1+length, -length/2,
x1, y1+length, -length/2,
x1+length, y1, -length/2,
x1+length, y1+length, -length/2,
x1, y1, -length/2
];
}
/**
 * Creates a shader of the given type, uploads its source and compiles it.
 *
 * @param gl     rendering context providing the shader API
 * @param type   shader type constant passed to gl.createShader
 * @param source shader source text
 * @returns the compiled shader; on compile failure the info log is written to
 *          the console, the shader is deleted and undefined is returned
 */
function createShader(gl, type, source) {
    const compiled = gl.createShader(type);
    gl.shaderSource(compiled, source);
    gl.compileShader(compiled);

    if (!gl.getShaderParameter(compiled, gl.COMPILE_STATUS)) {
        console.log(gl.getShaderInfoLog(compiled));
        gl.deleteShader(compiled);
        return undefined;
    }
    return compiled;
}
/**
 * Creates a program, attaches the vertex shader and then the fragment shader,
 * and links it.
 *
 * @param gl             rendering context providing the program API
 * @param vertexShader   compiled vertex shader
 * @param fragmentShader compiled fragment shader
 * @returns the linked program; on link failure the info log is written to the
 *          console, the program is deleted and undefined is returned
 */
function createProgram(gl, vertexShader, fragmentShader) {
    const linked = gl.createProgram();
    [vertexShader, fragmentShader].forEach((stage) => {
        gl.attachShader(linked, stage);
    });
    gl.linkProgram(linked);

    if (!gl.getProgramParameter(linked, gl.LINK_STATUS)) {
        console.log(gl.getProgramInfoLog(linked));
        gl.deleteProgram(linked);
        return undefined;
    }
    return linked;
}
function drawScene(gl) {
gl.clear(gl.COLOR_BUFFER_BIT | gl.DEPTH_BUFFER_BIT);
let rotationXCamera = matrices["rotationX"](Math.sin(sliders[0].valueCurrent/360*2*Math.PI), Math.cos(sliders[0].valueCurrent/360*2*Math.PI));
let rotationYCamera = matrices["rotationY"](Math.sin(sliders[1].valueCurrent/360*2*Math.PI), Math.cos(sliders[1].valueCurrent/360*2*Math.PI));
let rotationZCamera = matrices["rotationZ"](Math.sin(sliders[2].valueCurrent/360*2*Math.PI), Math.cos(sliders[2].valueCurrent/360*2*Math.PI));
let perspectiveMatrix = matrices["perspective"](1.0472, gl.canvas.clientWidth / gl.canvas.clientHeight, 1, 2000);
let viewCamera = multiplyMatrices(rotationXCamera, rotationYCamera, rotationZCamera);
let viewMatrix = inverseMatrix(viewCamera);
gl.uniformMatrix4fv(matrixCameraLocation, false, multiplyMatrices(viewMatrix));
gl.drawArrays(gl.TRIANGLES, 0, 36);
}
let vertexPositionLocation, matrixCameraLocation;
let bufferPosition;
/**
 * Sets the canvas drawing-buffer size to its CSS size multiplied by
 * window.devicePixelRatio (rounded down), writing width/height only when at
 * least one of them differs from the current value.
 *
 * @param gl rendering context whose canvas is resized
 */
function resize(gl) {
    const scale = window.devicePixelRatio;
    const targetWidth = Math.floor(gl.canvas.clientWidth * scale);
    const targetHeight = Math.floor(gl.canvas.clientHeight * scale);

    const upToDate = gl.canvas.width === targetWidth &&
                     gl.canvas.height === targetHeight;
    if (upToDate) {
        return;
    }
    gl.canvas.width = targetWidth;
    gl.canvas.height = targetHeight;
}
let imgs = [];
let promises = [];
[
"Left",
"Right",
"Up",
"Down",
"Front",
"Back"
].forEach((path) => {
let img = document.createElement("img");
img.crossOrigin = "null";
img.src = "http://localhost:8000/texture?filename=FullMoon" + path + "2048.png";
let res;
promises.push(new Promise((resolve, reject) => {
res = resolve;
}));
img.addEventListener("load", res);
imgs.push(img);
});
Promise.all(promises).then(() => {
startWebGL(gl);
});
/**
 * Reports whether value is a positive integral power of two (1, 2, 4, 8, ...).
 * Intended for sizes within the 32-bit integer range, such as image dimensions.
 *
 * @param {number} value number to test
 * @returns {boolean} true when value is a power of two
 */
function isPowerOf2(value) {
    // value & (value - 1) clears the lowest set bit, so it is 0 for powers of
    // two. On its own that test also accepts 0, and the bitwise operators
    // truncate fractions, so both cases are rejected explicitly.
    return Number.isInteger(value) && value > 0 && (value & (value - 1)) === 0;
}
function startWebGL(gl) {
resize(gl);
let vertexShader = createShader(gl, gl.VERTEX_SHADER, vertexShaderSource);
let fragmentShader = createShader(gl, gl.FRAGMENT_SHADER, fragmentShaderSource);
let program = createProgram(gl, vertexShader, fragmentShader);
vertexPositionLocation = gl.getAttribLocation(program, "a_position");
matrixCameraLocation = gl.getUniformLocation(program, "u_matrix");
let texture = gl.createTexture();
gl.bindTexture(gl.TEXTURE_CUBE_MAP, texture);
for(let g = 0; g < 6; g++) {
gl.texImage2D(gl.TEXTURE_CUBE_MAP_POSITIVE_X+g, 0, gl.RGBA, gl.RGBA, gl.UNSIGNED_BYTE, imgs[g]);
}
if(isPowerOf2(imgs[0].width) && isPowerOf2(imgs[0].height)) {
gl.generateMipmap(gl.TEXTURE_CUBE_MAP);
} else {
gl.texParameteri(gl.TEXTURE_CUBE_MAP, gl.TEXTURE_WRAP_S, gl.CLAMP_TO_EDGE);
gl.texParameteri(gl.TEXTURE_CUBE_MAP, gl.TEXTURE_WRAP_T, gl.CLAMP_TO_EDGE);
gl.texParameteri(gl.TEXTURE_CUBE_MAP, gl.TEXTURE_MIN_FILTER, gl.LINEAR);
}
bufferPosition = gl.createBuffer();
gl.useProgram(program);
gl.viewport(0, 0, gl.canvas.width, gl.canvas.height);
gl.enable(gl.CULL_FACE);
gl.disable(gl.DEPTH_TEST);
gl.depthMask(false);
gl.enableVertexAttribArray(vertexPositionLocation);
gl.bindBuffer(gl.ARRAY_BUFFER, bufferPosition);
gl.bufferData(gl.ARRAY_BUFFER, new Float32Array(createCube(-Math.sqrt(2), -Math.sqrt(2), Math.sqrt(2)*2)), gl.STATIC_DRAW);
gl.vertexAttribPointer(vertexPositionLocation, 3, gl.FLOAT, false, 0, 0);
drawScene(gl);
}
I'm using a cube that extends from -sqrt(2.0) to sqrt(2.0) along each axis because, under rotation, x and y should still reach at least 1.0 in every situation...
How can I soften that edge? Any ideas would be greatly appreciated.
EDIT:
Following gman proposal, here is the refined version with a single quad with the cubemap being mapped to it:
let canvas = document.querySelector("canvas");
let gl = canvas.getContext("webgl");
gl.canvas.width = canvas.getBoundingClientRect().width;
gl.canvas.height = canvas.getBoundingClientRect().height;
let sliders = [
{
label : "rotationX",
valueStart : 0,
valueEnd : 2*Math.PI,
value : 0,
measurement : "°"
}, {
label : "rotationY",
valueStart : 0,
valueEnd : 2*Math.PI,
value : 0,
measurement : "°"
}, {
label: "rotationZ",
valueStart : 0,
valueEnd : 2*Math.PI,
value : 0,
measurement: "°"
}
];
setSliders(sliders, drawScene, true, gl);
let vertexShaderSource = `
attribute vec4 a_position;
attribute vec3 a_texture;
uniform mat4 u_matrix;
varying vec3 v_texture;
void main() {
vec4 pos = u_matrix*a_position;
gl_Position = vec4(a_position.xy, 0, 1.0);
v_texture = pos.xyz;
}
`;
let fragmentShaderSource = `
precision mediump float;
uniform samplerCube u_skybox;
varying vec3 v_texture;
void main() {
gl_FragColor = textureCube(u_skybox, v_texture);
}
`;
/**
 * Returns the vertex positions (x, y, z triples) of two triangles that
 * together cover the square from (-1, -1) to (1, 1) at z = -1.
 *
 * @returns {number[]} 18 numbers: 6 vertices with 3 components each
 */
function createFacade() {
    const depth = -1;
    const bottomLeft = [-1, -1, depth];
    const bottomRight = [1, -1, depth];
    const topLeft = [-1, 1, depth];
    const topRight = [1, 1, depth];

    return [].concat(
        bottomLeft, topRight, topLeft,
        topRight, bottomLeft, bottomRight
    );
}
/**
 * Creates a shader of the given type, uploads its source and compiles it.
 *
 * @param gl     rendering context providing the shader API
 * @param type   shader type constant passed to gl.createShader
 * @param source shader source text
 * @returns the compiled shader; on compile failure the info log is written to
 *          the console, the shader is deleted and undefined is returned
 */
function createShader(gl, type, source) {
    const compiled = gl.createShader(type);
    gl.shaderSource(compiled, source);
    gl.compileShader(compiled);

    if (!gl.getShaderParameter(compiled, gl.COMPILE_STATUS)) {
        console.log(gl.getShaderInfoLog(compiled));
        gl.deleteShader(compiled);
        return undefined;
    }
    return compiled;
}
/**
 * Creates a program, attaches the vertex shader and then the fragment shader,
 * and links it.
 *
 * @param gl             rendering context providing the program API
 * @param vertexShader   compiled vertex shader
 * @param fragmentShader compiled fragment shader
 * @returns the linked program; on link failure the info log is written to the
 *          console, the program is deleted and undefined is returned
 */
function createProgram(gl, vertexShader, fragmentShader) {
    const linked = gl.createProgram();
    [vertexShader, fragmentShader].forEach((stage) => {
        gl.attachShader(linked, stage);
    });
    gl.linkProgram(linked);

    if (!gl.getProgramParameter(linked, gl.LINK_STATUS)) {
        console.log(gl.getProgramInfoLog(linked));
        gl.deleteProgram(linked);
        return undefined;
    }
    return linked;
}
function drawScene(gl) {
gl.clear(gl.COLOR_BUFFER_BIT | gl.DEPTH_BUFFER_BIT);
let quaternion = matrices["quarternion"]();
let quaternionRot = matrices["fromEuler"](quaternion,
sliders[0].value/Math.PI*360,
sliders[1].value/Math.PI*360,
sliders[2].value/Math.PI*360);
let quaternionMatrix = matrices["fromQuat"](matrices["idMatrix"](), quaternionRot);
let viewMatrix = inverseMatrix(quaternionMatrix);
gl.uniformMatrix4fv(matrixCameraLocation, false, viewMatrix);
gl.drawArrays(gl.TRIANGLES, 0, 6);
}
let vertexPositionLocation, matrixCameraLocation;
let bufferPosition;
/**
 * Sets the canvas drawing-buffer size to its CSS size multiplied by
 * window.devicePixelRatio (rounded down), writing width/height only when at
 * least one of them differs from the current value.
 *
 * @param gl rendering context whose canvas is resized
 */
function resize(gl) {
    const scale = window.devicePixelRatio;
    const targetWidth = Math.floor(gl.canvas.clientWidth * scale);
    const targetHeight = Math.floor(gl.canvas.clientHeight * scale);

    const upToDate = gl.canvas.width === targetWidth &&
                     gl.canvas.height === targetHeight;
    if (upToDate) {
        return;
    }
    gl.canvas.width = targetWidth;
    gl.canvas.height = targetHeight;
}
let imgs = [];
let promises = [];
[
"Left",
"Right",
"Up",
"Down",
"Front",
"Back"
].forEach((path) => {
let img = document.createElement("img");
img.crossOrigin = "null";
img.src = "http://localhost:8000/texture?filename=./SunSet/SunSet" + path + "2048.png";
let res;
promises.push(new Promise((resolve, reject) => {
res = resolve;
}));
img.addEventListener("load", res);
imgs.push(img);
});
Promise.all(promises).then(() => {
startWebGL(gl);
});
/**
 * Reports whether value is a positive integral power of two (1, 2, 4, 8, ...).
 * Intended for sizes within the 32-bit integer range, such as image dimensions.
 *
 * @param {number} value number to test
 * @returns {boolean} true when value is a power of two
 */
function isPowerOf2(value) {
    // value & (value - 1) clears the lowest set bit, so it is 0 for powers of
    // two. On its own that test also accepts 0, and the bitwise operators
    // truncate fractions, so both cases are rejected explicitly.
    return Number.isInteger(value) && value > 0 && (value & (value - 1)) === 0;
}
function startWebGL(gl) {
resize(gl);
let vertexShader = createShader(gl, gl.VERTEX_SHADER, vertexShaderSource);
let fragmentShader = createShader(gl, gl.FRAGMENT_SHADER, fragmentShaderSource);
let program = createProgram(gl, vertexShader, fragmentShader);
vertexPositionLocation = gl.getAttribLocation(program, "a_position");
matrixCameraLocation = gl.getUniformLocation(program, "u_matrix");
let texture = gl.createTexture();
gl.bindTexture(gl.TEXTURE_CUBE_MAP, texture);
for(let g = 0; g < 6; g++) {
gl.texImage2D(gl.TEXTURE_CUBE_MAP_POSITIVE_X+g, 0, gl.RGBA, gl.RGBA, gl.UNSIGNED_BYTE, imgs[g]);
}
if(isPowerOf2(imgs[0].width) && isPowerOf2(imgs[0].height)) {
gl.generateMipmap(gl.TEXTURE_CUBE_MAP);
} else {
gl.texParameteri(gl.TEXTURE_CUBE_MAP, gl.TEXTURE_WRAP_S, gl.CLAMP_TO_EDGE);
gl.texParameteri(gl.TEXTURE_CUBE_MAP, gl.TEXTURE_WRAP_T, gl.CLAMP_TO_EDGE);
gl.texParameteri(gl.TEXTURE_CUBE_MAP, gl.TEXTURE_MIN_FILTER, gl.LINEAR);
}
bufferPosition = gl.createBuffer();
gl.useProgram(program);
gl.viewport(0, 0, gl.canvas.width, gl.canvas.height);
gl.enable(gl.CULL_FACE);
gl.disable(gl.DEPTH_TEST);
gl.depthMask(false);
gl.enableVertexAttribArray(vertexPositionLocation);
gl.bindBuffer(gl.ARRAY_BUFFER, bufferPosition);
gl.bufferData(gl.ARRAY_BUFFER, new Float32Array(createFacade()), gl.STATIC_DRAW);
gl.vertexAttribPointer(vertexPositionLocation, 3, gl.FLOAT, false, 0, 0);
drawScene(gl)
By luck, and by trying every possibility :p, I found a solution: instead of rotating the world (i.e. the cube) around the camera, I keep the cube static and apply the world rotation to the texture coordinates rather than to the cube, and it works flawlessly:
The changed vertex shader:
let vertexShaderSource = `
attribute vec4 a_position;
attribute vec3 a_texture;
varying vec3 v_texture;
uniform mat4 u_matrix;
void main() {
vec4 pos = u_matrix*a_position;
pos.xyz /= 2.0*sqrt(2.0); //Here I divide by the maximum magnitude so that the vector will become unit vector
gl_Position = vec4(a_position.xy, a_position.z/sqrt(2.0), 1.0); //Doing the same trick as in the question' code
v_texture = pos.xyz;
}
`;
I'm using openCv 2.4.6.
The following doesn't work:
cv::Mat src = cv::imread(argv[1], 1);
int x11 = src.rows/3; int y11 = src.cols/3;
int x12 = src.rows/3; int y12 = src.rows*2/3;
int x13 = src.cols*2/3; int y13 = src.rows*2/3;
int x21 = 0; int y21 = 0;
int x22 = 0; int y22 = src.rows-1;
int x23 = src.cols-1; int y23 = src.rows-1;
#if 1 // doesn't work
float src_tri_data[] = {x11, y11, 0, x12, y12, 0, x13, y13, 0};
cv::Mat src_tri(3, 2, CV_32F, src_tri_data, 3*sizeof(float));
float dst_tri_data[] = {x21, y21, 0, x22, y22, 0, x23, y23, 0};
cv::Mat dst_tri(3, 2, CV_32F, dst_tri_data, 3*sizeof(float));
#else // works
float src_tri_data[] = {x11, y11, x12, y12, x13, y13};
cv::Mat src_tri(3, 2, CV_32F, src_tri_data, 2*sizeof(float));
float dst_tri_data[] = {x21, y21, x22, y22, x23, y23};
cv::Mat dst_tri(3, 2, CV_32F, dst_tri_data, 2*sizeof(float));
#endif
cv::Mat trans = cv::getAffineTransform(src_tri, dst_tri);
It fails with an assertion:
OpenCV Error: Assertion failed (src.checkVector(2, CV_32F) == 3 && dst.checkVector(2, CV_32F) == 3) in unknown function, file C:\slave\builds\WinInstallerMegaPack\src\opencv\modules\imgproc\src\imgwarp.cpp, line 3612
However, if I make the matrices continuous it works.
Any idea why?
The assertions check that the size of the row of index 2 contains 3 elements but your matrices contains 2 elements per row.
The size of the matrices are not correct, modify the construction of your matrices :
cv::Mat src_tri(3,
3, // <=== 3 instead of 2
CV_32F,
src_tri_data,
3*sizeof(float));