Clang disable inlining getelementptr

Clang disable inlining getelementptr - clang

I have the following simple c code.
int a[2] = {1,2};
int main(){
return a[0];
}
Now the IR that clang generates for this case is as follows.
%retval = alloca i32, align 4
store i32 0, i32* %retval, align 4
%0 = load i32, i32* getelementptr inbounds ([2 x i32], [2 x i32]* @a, i64 0, i64 0), align 4
ret i32 %0
As seen, the getelementptr instruction is inlined inside of the load instruction. Is there a way to disable this inlining in clang? I'm using clang version 3.8.0.

Related

LLVM11 debug info is different from LLVM8 debug info when running the same compilation command

The C source code is as follows.
#include "fdlibm.h"
#include <errno.h>
#ifdef __STDC__
double ldexp(double value, int exp)
#else
double ldexp(value, exp)
double value; int exp;
#endif
{
if(!finite(value)||value==0.0) return value;
value = scalbn(value,exp);
if(!finite(value)||value==0.0) errno = ERANGE;
return value;
}
The compilation command is "clang -emit-llvm -g -O3 -D_IEEE_LIBM -Wall -Wuninitialized -c s_ldexp.c -o s_ldexp.bc"
I transform the bitcode into human-readable LLVM assembly language using this command "llvm-dis s_ldexp.bc -o s_ldexp.ll"
The IR compiled by clang8 is as follows.
; ModuleID = 's_ldexp.llvm8.O3.bc'
source_filename = "s_ldexp.c"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; Function Attrs: nounwind uwtable
define dso_local double #ldexp(double, i32) local_unnamed_addr #0 !dbg !7 {
call void #llvm.dbg.value(metadata double %0, metadata !13, metadata !DIExpression()), !dbg !15
call void #llvm.dbg.value(metadata i32 %1, metadata !14, metadata !DIExpression()), !dbg !16
%3 = tail call double #llvm.fabs.f64(double %0) #4, !dbg !17
%4 = fcmp ueq double %3, 0x7FF0000000000000, !dbg !17
%5 = fcmp oeq double %0, 0.000000e+00, !dbg !19
%6 = or i1 %5, %4, !dbg !20
br i1 %6, label %15, label %7, !dbg !20
; <label>:7: ; preds = %2
%8 = tail call double #scalbn(double %0, i32 %1) #5, !dbg !21
call void #llvm.dbg.value(metadata double %8, metadata !13, metadata !DIExpression()), !dbg !15
%9 = tail call double #llvm.fabs.f64(double %8) #4, !dbg !22
%10 = fcmp ueq double %9, 0x7FF0000000000000, !dbg !22
%11 = fcmp oeq double %8, 0.000000e+00, !dbg !24
%12 = or i1 %11, %10, !dbg !25
br i1 %12, label %13, label %15, !dbg !25
; <label>:13: ; preds = %7
%14 = tail call i32* #__errno_location() #6, !dbg !26
store i32 34, i32* %14, align 4, !dbg !27, !tbaa !28
br label %15, !dbg !26
; <label>:15: ; preds = %13, %7, %2
%16 = phi double [ %0, %2 ], [ %8, %7 ], [ %8, %13 ], !dbg !32
ret double %16, !dbg !33
}
; Function Attrs: nounwind readnone speculatable
declare double #llvm.fabs.f64(double) #1
; Function Attrs: nounwind
declare dso_local double #scalbn(double, i32) local_unnamed_addr #2
; Function Attrs: nounwind readnone
declare dso_local i32* #__errno_location() local_unnamed_addr #3
; Function Attrs: nounwind readnone speculatable
declare void #llvm.dbg.value(metadata, metadata, metadata) #1
attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind readnone speculatable }
attributes #2 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #3 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #4 = { readnone }
attributes #5 = { nounwind }
attributes #6 = { nounwind readnone }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
!llvm.ident = !{!6}
!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 8.0.0 (tags/RELEASE_800/final)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None)
!1 = !DIFile(filename: "s_ldexp.c", directory: "/root/test/ficl/fdlibm53")
!2 = !{}
!3 = !{i32 2, !"Dwarf Version", i32 4}
!4 = !{i32 2, !"Debug Info Version", i32 3}
!5 = !{i32 1, !"wchar_size", i32 4}
!6 = !{!"clang version 8.0.0 (tags/RELEASE_800/final)"}
!7 = distinct !DISubprogram(name: "ldexp", scope: !1, file: !1, line: 18, type: !8, scopeLine: 23, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !12)
!8 = !DISubroutineType(types: !9)
!9 = !{!10, !10, !11}
!10 = !DIBasicType(name: "double", size: 64, encoding: DW_ATE_float)
!11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
!12 = !{!13, !14}
!13 = !DILocalVariable(name: "value", arg: 1, scope: !7, file: !1, line: 18, type: !10)
!14 = !DILocalVariable(name: "exp", arg: 2, scope: !7, file: !1, line: 18, type: !11)
!15 = !DILocation(line: 18, column: 22, scope: !7)
!16 = !DILocation(line: 18, column: 33, scope: !7)
!17 = !DILocation(line: 24, column: 6, scope: !18)
!18 = distinct !DILexicalBlock(scope: !7, file: !1, line: 24, column: 5)
!19 = !DILocation(line: 24, column: 26, scope: !18)
!20 = !DILocation(line: 24, column: 19, scope: !18)
!21 = !DILocation(line: 25, column: 10, scope: !7)
!22 = !DILocation(line: 26, column: 6, scope: !23)
!23 = distinct !DILexicalBlock(scope: !7, file: !1, line: 26, column: 5)
!24 = !DILocation(line: 26, column: 26, scope: !23)
!25 = !DILocation(line: 26, column: 19, scope: !23)
!26 = !DILocation(line: 26, column: 33, scope: !23)
!27 = !DILocation(line: 26, column: 39, scope: !23)
!28 = !{!29, !29, i64 0}
!29 = !{!"int", !30, i64 0}
!30 = !{!"omnipotent char", !31, i64 0}
!31 = !{!"Simple C/C++ TBAA"}
!32 = !DILocation(line: 0, scope: !7)
!33 = !DILocation(line: 28, column: 1, scope: !7)
The IR compiled by clang11 is as follows.
; ModuleID = 's_ldexp.llvm11.O3.bc'
source_filename = "s_ldexp.c"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; Function Attrs: nounwind uwtable
define dso_local double #ldexp(double %0, i32 %1) local_unnamed_addr #0 !dbg !7 {
call void #llvm.dbg.value(metadata double %0, metadata !13, metadata !DIExpression()), !dbg !15
call void #llvm.dbg.value(metadata i32 %1, metadata !14, metadata !DIExpression()), !dbg !15
%3 = tail call double #llvm.fabs.f64(double %0) #4, !dbg !16
%4 = fcmp ueq double %3, 0x7FF0000000000000, !dbg !16
%5 = fcmp oeq double %0, 0.000000e+00
%6 = or i1 %5, %4, !dbg !18
br i1 %6, label %15, label %7, !dbg !18
7: ; preds = %2
%8 = tail call double #scalbn(double %0, i32 %1) #5, !dbg !19
call void #llvm.dbg.value(metadata double %8, metadata !13, metadata !DIExpression()), !dbg !15
%9 = tail call double #llvm.fabs.f64(double %8) #4, !dbg !20
%10 = fcmp ueq double %9, 0x7FF0000000000000, !dbg !20
%11 = fcmp oeq double %8, 0.000000e+00
%12 = or i1 %11, %10, !dbg !22
br i1 %12, label %13, label %15, !dbg !22
13: ; preds = %7
%14 = tail call i32* #__errno_location() #6, !dbg !23
store i32 34, i32* %14, align 4, !dbg !24, !tbaa !25
br label %15, !dbg !23
15: ; preds = %13, %7, %2
%16 = phi double [ %0, %2 ], [ %8, %7 ], [ %8, %13 ], !dbg !15
ret double %16, !dbg !29
}
; Function Attrs: nounwind readnone speculatable willreturn
declare double #llvm.fabs.f64(double) #1
; Function Attrs: nounwind
declare dso_local double #scalbn(double, i32) local_unnamed_addr #2
; Function Attrs: nounwind readnone
declare dso_local i32* #__errno_location() local_unnamed_addr #3
; Function Attrs: nounwind readnone speculatable willreturn
declare void #llvm.dbg.value(metadata, metadata, metadata) #1
attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind readnone speculatable willreturn }
attributes #2 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #3 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #4 = { readnone }
attributes #5 = { nounwind }
attributes #6 = { nounwind readnone }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
!llvm.ident = !{!6}
!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 11.0.0 (https://github.com/llvm/llvm-project.git 0160ad802e899c2922bc9b29564080c22eb0908c)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
!1 = !DIFile(filename: "s_ldexp.c", directory: "/root/test/ficl/fdlibm53")
!2 = !{}
!3 = !{i32 7, !"Dwarf Version", i32 4}
!4 = !{i32 2, !"Debug Info Version", i32 3}
!5 = !{i32 1, !"wchar_size", i32 4}
!6 = !{!"clang version 11.0.0 (https://github.com/llvm/llvm-project.git 0160ad802e899c2922bc9b29564080c22eb0908c)"}
!7 = distinct !DISubprogram(name: "ldexp", scope: !1, file: !1, line: 18, type: !8, scopeLine: 23, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !12)
!8 = !DISubroutineType(types: !9)
!9 = !{!10, !10, !11}
!10 = !DIBasicType(name: "double", size: 64, encoding: DW_ATE_float)
!11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
!12 = !{!13, !14}
!13 = !DILocalVariable(name: "value", arg: 1, scope: !7, file: !1, line: 18, type: !10)
!14 = !DILocalVariable(name: "exp", arg: 2, scope: !7, file: !1, line: 18, type: !11)
!15 = !DILocation(line: 0, scope: !7)
!16 = !DILocation(line: 24, column: 6, scope: !17)
!17 = distinct !DILexicalBlock(scope: !7, file: !1, line: 24, column: 5)
!18 = !DILocation(line: 24, column: 19, scope: !17)
!19 = !DILocation(line: 25, column: 10, scope: !7)
!20 = !DILocation(line: 26, column: 6, scope: !21)
!21 = distinct !DILexicalBlock(scope: !7, file: !1, line: 26, column: 5)
!22 = !DILocation(line: 26, column: 19, scope: !21)
!23 = !DILocation(line: 26, column: 33, scope: !21)
!24 = !DILocation(line: 26, column: 39, scope: !21)
!25 = !{!26, !26, i64 0}
!26 = !{!"int", !27, i64 0}
!27 = !{!"omnipotent char", !28, i64 0}
!28 = !{!"Simple C/C++ TBAA"}
!29 = !DILocation(line: 28, column: 1, scope: !7)
In s_ldexp.c, there is one code snippet, “value==0.0”.
The corresponding IR compiled by clang8 is "%5 = fcmp oeq double %0, 0.000000e+00, !dbg !19".
The corresponding IR compiled by clang11 is "%5 = fcmp oeq double %0, 0.000000e+00".
Compared with clang8, the debug info "!dbg !19" is missing in the corresponding IR compiled by clang11. The debug info is important to me, and I hope that LLVM11 can preserve it. For LLVM11, there are three methods that can work around the problem.
The first method is to run these two commands.
clang -emit-llvm -O3 -D_IEEE_LIBM -Wall -Wuninitialized -c s_ldexp.c -o s_ldexp.bc
opt -enable-debugify s_ldexp.bc -o s_ldexp.debug.bc
Here the "-g" parameter is not used when compiling the source code with clang; instead, the llvm opt command is used to attach debug info to the IR afterwards.
The second method is to run this command.
clang -emit-llvm -g -O0 -D_IEEE_LIBM -Wall -Wuninitialized -c s_ldexp.c -o s_ldexp.bc
This command lowers the compiler's optimization level so that as much debug info as possible is kept.
The third method is to modify the C source code.
The code after modification is as follows.
#include "fdlibm.h"
#include <errno.h>
#include <stdbool.h>
#ifdef __STDC__
double ldexp(double value, int exp)
#else
double ldexp(value, exp)
double value; int exp;
#endif
{
bool flag1 = (!finite(value)) || (value==0.0);
if(flag1) return value;
value = scalbn(value,exp);
bool flag2 = (!finite(value)) || (value==0.0);
if(flag2) errno = ERANGE;
return value;
}
We can get around this problem using the third method.
Though these three methods can work around the problem, I still have two questions.
The first question is why the debug info is missing in the IR compiled by clang11.
The second question is whether there is any option or LLVM pass that can check whether some debug info is missing and restore the missing debug info.

Array access and assignment has variable alignment size in LLVM IR output generated by clang

Hey I'm working on a project where I'm creating a compiler.
While trying to implement array element assignment I ran into this piece of llvm ir code:
define dso_local i32 @main() {
%1 = alloca [10 x i32], align 16
%2 = getelementptr inbounds [10 x i32], [10 x i32]* %1, i64 0, i64 0
store i32 2, i32* %2, align 16
%3 = getelementptr inbounds [10 x i32], [10 x i32]* %1, i64 0, i64 1
store i32 4, i32* %3, align 4
%4 = getelementptr inbounds [10 x i32], [10 x i32]* %1, i64 0, i64 2
store i32 6, i32* %4, align 8
%5 = getelementptr inbounds [10 x i32], [10 x i32]* %1, i64 0, i64 3
store i32 8, i32* %5, align 4
%6 = getelementptr inbounds [10 x i32], [10 x i32]* %1, i64 0, i64 4
store i32 10, i32* %6, align 16
ret i32 0
}
Generated from:
int main () {
int x[10];
x[0] = 2;
x[1] = 4;
x[2] = 6;
x[3] = 8;
x[4] = 10;
}
While inspecting the IR output, I noticed that the alignment always seems to be different (16-4-8-4). I am quite puzzled by this, and don't know why the alignment would be variable. Any hints or nudges in the right directions would be much appreciated.

Clang seems to be using its knowledge of the alloca's alignment and the pointer offsets to compute the maximum alignment of each element individually. The alloca is aligned to 16B, so a pointer to element 0 (no offset) will have that alignment. But since the elements in the array in the alloca are only 4B in size (i32), which is less than 16B, the pointer alignment changes once you offset it to get later elements.
Clang could just set the alignment on everything here to be 4 instead, but there's no rule that requires it to. Moreover there can be some benefits to using this large alignment of 16 for the array, such as enabling 4 elements (4 × 4B = 16B) to be loaded at once with a SIMD instruction.

How does Clang compute indices of GEP instructions in virtual tables when compiling virtual inheritance classes in C++?

I'm trying to understand the way Clang compiles virtual inheritance classes in C++. Here is my code:
// test.cpp
#include <stdio.h>
int global_obj;
int *global_ptr = &global_obj;
class A {
public:
virtual int f(int *i) { return *i; }
};
class B: virtual public A { // class B is virtual inheritance class of A
};
int main(int argc, char **argv)
{
int *ptr = &global_obj;
B *pb = new B;
int a = pb->f(ptr);
return a;
}
My compilation command is:
clang -O0 -Xclang -disable-llvm-passes -Xclang -disable-O0-optnone -c -emit-llvm test.cpp
opt -mem2reg test.bc
and below is the compiled LLVM bitcode, where _ZN1BC1Ev and _ZN1AC2Ev are the compiled constructors of class B and A.
%class.B = type { %class.A }
%class.A = type { i32 (...)** }
#global_obj = global i32 0, align 4
#global_ptr = global i32* #global_obj, align 8
#_ZTV1B = linkonce_odr unnamed_addr constant { [5 x i8*] } { [5 x i8*] [i8* null, i8* null, i8* null, i8* null, i8* bitcast (i32 (%class.A*, i32*)* #_ZN1A1fEPi to i8*)] }, align 8
#_ZTT1B = linkonce_odr unnamed_addr constant [2 x i8*] [i8* bitcast (i8** getelementptr inbounds ({ [5 x i8*] }, { [5 x i8*] }* #_ZTV1B, i32 0, inrange i32 0, i32 4) to i8*), i8* bitcast (i8** getelementptr inbounds ({ [5 x i8*] }, { [5 x i8*] }* #_ZTV1B, i32 0, inrange i32 0, i32 4) to i8*)], align 8
#_ZTV1A = linkonce_odr unnamed_addr constant { [3 x i8*] } { [3 x i8*] [i8* null, i8* null, i8* bitcast (i32 (%class.A*, i32*)* #_ZN1A1fEPi to i8*)] }, align 8
; Function Attrs: noinline norecurse ssp uwtable
define i32 #main(i32, i8**) #0 {
%3 = call i8* #_Znwm(i64 8) #3
%4 = bitcast i8* %3 to %class.B*
call void #_ZN1BC1Ev(%class.B* %4) #4
%5 = bitcast %class.B* %4 to i8**
%6 = load i8*, i8** %5, align 8
%7 = getelementptr i8, i8* %6, i64 -32
%8 = bitcast i8* %7 to i64*
%9 = load i64, i64* %8, align 8
%10 = bitcast %class.B* %4 to i8*
%11 = getelementptr inbounds i8, i8* %10, i64 %9
%12 = bitcast i8* %11 to %class.A*
%13 = bitcast %class.A* %12 to i32 (%class.A*, i32*)***
%14 = load i32 (%class.A*, i32*)**, i32 (%class.A*, i32*)*** %13, align 8
%15 = getelementptr inbounds i32 (%class.A*, i32*)*, i32 (%class.A*, i32*)** %14, i64 0
%16 = load i32 (%class.A*, i32*)*, i32 (%class.A*, i32*)** %15, align 8
%17 = call i32 %16(%class.A* %12, i32* #global_obj)
ret i32 %17
}
; Function Attrs: nobuiltin
declare noalias i8* #_Znwm(i64) #1
; Function Attrs: noinline nounwind ssp uwtable
define linkonce_odr void #_ZN1BC1Ev(%class.B*) unnamed_addr #2 align 2 {
%2 = bitcast %class.B* %0 to %class.A*
call void #_ZN1AC2Ev(%class.A* %2) #4
%3 = bitcast %class.B* %0 to i32 (...)***
store i32 (...)** bitcast (i8** getelementptr inbounds ({ [5 x i8*] }, { [5 x i8*] }* #_ZTV1B, i32 0, inrange i32 0, i32 4) to i32 (...)**), i32 (...)*** %3, align 8
%4 = bitcast %class.B* %0 to i32 (...)***
store i32 (...)** bitcast (i8** getelementptr inbounds ({ [5 x i8*] }, { [5 x i8*] }* #_ZTV1B, i32 0, inrange i32 0, i32 4) to i32 (...)**), i32 (...)*** %4, align 8
ret void
}
; Function Attrs: noinline nounwind ssp uwtable
define linkonce_odr void #_ZN1AC2Ev(%class.A*) unnamed_addr #2 align 2 {
%2 = bitcast %class.A* %0 to i32 (...)***
store i32 (...)** bitcast (i8** getelementptr inbounds ({ [3 x i8*] }, { [3 x i8*] }* #_ZTV1A, i32 0, inrange i32 0, i32 2) to i32 (...)**), i32 (...)*** %2, align 8
ret void
}
; Function Attrs: noinline nounwind ssp uwtable
define linkonce_odr i32 #_ZN1A1fEPi(%class.A*, i32*) unnamed_addr #2 align 2 {
%3 = load i32, i32* %1, align 4
ret i32 %3
}
I understand that Clang will introduce a virtual table to capture objects of the classes A and B.
But when delving deeper into the compilation of the main function, I don't really understand why Clang introduces a GEP with index -32 in the main function.
And what is the value of the index %9 in the next GEP? Why can't it be determined at compile time?
%7 = getelementptr i8, i8* %6, i64 -32
%8 = bitcast i8* %7 to i64*
%9 = load i64, i64* %8, align 8
%10 = bitcast %class.B* %4 to i8*
%11 = getelementptr inbounds i8, i8* %10, i64 %9
Does anyone know why Clang does so?
Thank you very much for reading my very long question!

The location of a virtually inherited base class cannot be determined at compile time, so it is resolved at run time. The virtual base offset (vbase offset) is stored within the vtable, so first your code loads the vtable pointer:
%5 = bitcast %class.B* %4 to i8**
%6 = load i8*, i8** %5, align 8
and then loads the vbase offset (from pre-defined location at vptr - 32):
%7 = getelementptr i8, i8* %6, i64 -32
%8 = bitcast i8* %7 to i64*
%9 = load i64, i64* %8, align 8
This offset is added to the object pointer to compute the address of the base class subobject:
%10 = bitcast %class.B* %4 to i8*
%11 = getelementptr inbounds i8, i8* %10, i64 %9
%12 = bitcast i8* %11 to %class.A*
and then to load the pointer to the virtual method from the base class's vtable:
%13 = bitcast %class.A* %12 to i32 (%class.A*, i32*)***
%14 = load i32 (%class.A*, i32*)**, i32 (%class.A*, i32*)*** %13, align 8
%15 = getelementptr inbounds i32 (%class.A*, i32*)*, i32 (%class.A*, i32*)** %14, i64 0
%16 = load i32 (%class.A*, i32*)*, i32 (%class.A*, i32*)** %15, align 8
and finally call it:
%17 = call i32 %16(%class.A* %12, i32* @global_obj)
You can find more details on how vtables are organized in Itanium ABI (but beware that it's not for the faint-hearted).

MPI - Difference between mpi_type_get_extent and mpi_type_get_true_extent

I have some problem in understanding the difference between mpi_type_get_extent and mpi_type_get_true_extent. In practice, I was using the former, expecting the results I then obtained with the latter, so I checked the MPI 3.1 Standard, where I found (at the section 4.1.8 True Extent of Datatypes)
However, the datatype extent cannot be used as an estimate of the
amount of space that needs to be allocated, if the user has modified
the extent
which made me think that I should have experienced no difference in the use of the two subroutines as long as I hadn't modified the extent of the datatype.
But I'm evidently missing something.
Declared the following MPI derived data type,
sizes = [10,10,10]
subsizes = [ 3, 3, 3]
starts = [ 2, 2, 2]
CALL MPI_TYPE_CREATE_SUBARRAY(ndims, sizes, subsizes, starts, MPI_ORDER_FORTRAN, MPI_DOUBLE_PRECISION, newtype, ierr)
the following code
call mpi_type_size(newtype, k, ierr)
call mpi_type_get_extent(newtype, lb, extent, ierr)
call mpi_type_get_true_extent(newtype, tlb, textent, ierr)
write(*,*) k/DBS, lb/DBS, extent/DBS, tlb/DBS, textent/DBS ! DBS is the size of double precision
produces the output (obviously the same for all processes)
27 0 1000 222 223
So mpi_type_size behaves as I expect, returning PRODUCT(subsizes)*DBS in k; on the other hand, I'd have expected both mpi_type_get_extent and mpi_type_get_true_extent to return what only the latter returns (since I have not modified newtype at all), specifically 222 and 223, which are basically starts(1) + starts(2)*sizes(1) + starts(3)*sizes(1)*sizes(2) and 1 + SUM((subsizes - 1)*[1, sizes(1), sizes(1)*sizes(2)]).
Why does mpi_type_get_extent return 0 and PRODUCT(sizes) in lb and extent, regardless of subsizes and starts?
I haven't posted an MWE since I have no errors at all (neither at compile time nor at runtime); I simply haven't understood how the two aforementioned routines work. I would basically like someone to help me understand the description of those subroutines in the standard document and why it is correct to obtain those results that I didn't expect.
EDIT
As requested by @GillesGouaillardet, I add a "minimal" working example to be run with at least 4 processes (please run it with exactly 4 processes, so that we have the same output), at the end of this question. The last lines can be uncommented (with awareness) to show that the types representing non-contiguous memory locations work properly when used with count > 1, once they've been properly resized by means of mpi_type_create_resized. With those lines commented, the program prints size, lb, extent, true_lb, true_extent for all types created (even those intermediate, not committed):
mpi_type_contiguous 4 0 4 0 4
mpi_type_vector 4 0 13 0 13
mpi_type_vector res 4 0 1 0 13
mpi_type_create_subarray 4 0 16 0 13
mpi_type_create_subarray res 4 0 1 0 13
All types represent one row or column of a 4 by 4 matrix, so their size is predictably always 4; the column type has extent and true_extent both equal to 4 units as well, since it represents four contiguous reals in memory; the type created with mpi_type_vector has extent and true_extent both equal to 13 reals, as I expected (see the nice sketch); if I want to use it with count > 1, I must resize it, changing its extent (and true_extent stays the same); now the hard part comes:
What is that 16 as extent of the type created with mpi_type_create_subarray? To be honest I'd have expected that routine to return an already resized type, ready to be used with count > 1 (i.e. a type with size = 4, extent = 1, true_extent = 13), but it seems it does not: surprisingly for me, extent is 16, which is the size of the global array!
The question is: why? Why is the extent of a type created with mpi_type_create_subarray the product of the elements of the array_of_sizes argument?
program subarray
use mpi
implicit none
integer :: i, j, k, ierr, myid, npro, rs, mycol, myrowugly, myrow_vec, myrow_sub
integer(kind = mpi_address_kind) :: lb, extent, tlb, textent
real, dimension(:,:), allocatable :: mat
call mpi_init(ierr)
call mpi_comm_rank(mpi_comm_world, myid, ierr)
call mpi_comm_size(mpi_comm_world, npro, ierr)
allocate(mat(npro,npro))
mat = myid*1.0
call mpi_type_size(mpi_real, rs, ierr)
call mpi_type_contiguous(npro, mpi_real, mycol, ierr)
call mpi_type_commit(mycol, ierr)
call mpi_type_size(mycol, k, ierr)
call mpi_type_get_extent(mycol, lb, extent, ierr)
call mpi_type_get_true_extent(mycol, tlb, textent, ierr)
if (myid == 0) print *, 'mpi_type_contiguous ', k/rs, lb/rs, extent/rs, tlb/rs, textent/rs
call mpi_type_vector(npro, 1, npro, mpi_real, myrowugly, ierr)
call mpi_type_size(myrowugly, k, ierr)
call mpi_type_get_extent(myrowugly, lb, extent, ierr)
call mpi_type_get_true_extent(myrowugly, tlb, textent, ierr)
if (myid == 0) print *, 'mpi_type_vector ', k/rs, lb/rs, extent/rs, tlb/rs, textent/rs
call mpi_type_create_resized(myrowugly, int(0, mpi_address_kind)*rs, int(1, mpi_address_kind)*rs, myrow_vec, ierr)
call mpi_type_commit(myrow_vec, ierr)
call mpi_type_size(myrow_vec, k, ierr)
call mpi_type_get_extent(myrow_vec, lb, extent, ierr)
call mpi_type_get_true_extent(myrow_vec, tlb, textent, ierr)
if (myid == 0) print *, 'mpi_type_vector res ', k/rs, lb/rs, extent/rs, tlb/rs, textent/rs
call mpi_type_create_subarray(2, [npro, npro], [1, npro], [0, 0], mpi_order_fortran, mpi_real, myrowugly, ierr)
call mpi_type_size(myrowugly, k, ierr)
call mpi_type_get_extent(myrowugly, lb, extent, ierr)
call mpi_type_get_true_extent(myrowugly, tlb, textent, ierr)
if (myid == 0) print *, 'mpi_type_create_subarray ', k/rs, lb/rs, extent/rs, tlb/rs, textent/rs
call mpi_type_create_resized(myrowugly, int(0, mpi_address_kind)*rs, int(1, mpi_address_kind)*rs, myrow_sub, ierr)
call mpi_type_commit(myrow_sub, ierr)
call mpi_type_size(myrow_sub, k, ierr)
call mpi_type_get_extent(myrow_sub, lb, extent, ierr)
call mpi_type_get_true_extent(myrow_sub, tlb, textent, ierr)
if (myid == 0) print *, 'mpi_type_create_subarray res', k/rs, lb/rs, extent/rs, tlb/rs, textent/rs
!if (myid == 0) call mpi_send(mat(1,1), 2, mycol, 1, 666, mpi_comm_world, ierr)
!if (myid == 0) call mpi_recv(mat(1,3), 2, mycol, 1, 666, mpi_comm_world, mpi_status_ignore, ierr)
!if (myid == 1) call mpi_recv(mat(1,1), 2, mycol, 0, 666, mpi_comm_world, mpi_status_ignore, ierr)
!if (myid == 1) call mpi_send(mat(1,3), 2, mycol, 0, 666, mpi_comm_world, ierr)
!if (myid == 0) call mpi_send(mat(1,1), 2, myrow_vec, 1, 666, mpi_comm_world, ierr)
!if (myid == 0) call mpi_recv(mat(3,1), 2, myrow_vec, 1, 666, mpi_comm_world, mpi_status_ignore, ierr)
!if (myid == 1) call mpi_recv(mat(1,1), 2, myrow_vec, 0, 666, mpi_comm_world, mpi_status_ignore, ierr)
!if (myid == 1) call mpi_send(mat(3,1), 2, myrow_vec, 0, 666, mpi_comm_world, ierr)
!if (myid == 0) call mpi_send(mat(1,1), 2, myrow_sub, 1, 666, mpi_comm_world, ierr)
!if (myid == 0) call mpi_recv(mat(3,1), 2, myrow_sub, 1, 666, mpi_comm_world, mpi_status_ignore, ierr)
!if (myid == 1) call mpi_recv(mat(1,1), 2, myrow_sub, 0, 666, mpi_comm_world, mpi_status_ignore, ierr)
!if (myid == 1) call mpi_send(mat(3,1), 2, myrow_sub, 0, 666, mpi_comm_world, ierr)
!do i = 0, npro
!if (myid == i) then
!print *, ""
!print *, myid
!do j = 1, npro
!print *, mat(j,:)
!end do
!end if
!call mpi_barrier(mpi_comm_world, ierr)
!end do
call mpi_finalize(ierr)
end program subarray

MPI_Type_create_subarray() creates a derived datatype whose extent is, by definition, the product of all sizes.
The definition is in the MPI 3.1 standard at page 96.
MPI_Type_create_subarray() is generally used for MPI-IO, so this definition of the extent makes sense there.
It might not be what you wish in this very specific case, but think of a 2x2 subarray of a 4x4 array. What extent would you expect?

Automate creation of large symbolic matrix

Can this matrix be generated in a less manual way? It's okay for 4 x 4, but I need something larger. Thanks
--> L : matrix([L11,L12,L13,L14],[L21,L22,L23,L24],[L31,L32,L33,L34],[L41,L42,L43,L44]);
(L) matrix(
[L11, L12, L13, L14],
[L21, L22, L23, L24],
[L31, L32, L33, L34],
[L41, L42, L43, L44]
)

Here is an answer to the question; note the quoted (noun) form of L ('L) in the concat call.
L:genmatrix(lambda([i,j], concat('L,i,j)), 3, 3);
(L) matrix(
[L11, L12, L13],
[L21, L22, L23],
[L31, L32, L33]
)
For a diagonal matrix
R:genmatrix(lambda([i,j], if i=j then concat('R,i) else 0), 3, 3);
(R) matrix(
[R1, 0, 0],
[0, R2, 0],
[0, 0, R3]
)

Develop Reference

ios ruby-on-rails asp.net-mvc docker delphi jenkins grails google-sheets machine-learning dart

Clang disable inlining getelementptr - clang

Related

LLVM11 debug info is different from LLVM8 debug info when running the same compilation command

Array access and assignment has variable alignment size in LLVM IR output generated by clang

How does Clang compute indices of GEP instructions in virtual tables when compiling virtual inheritance classes in C++?

MPI - Difference between mpi_type_get_extent and mpi_type_get_true_extent

Automate creation of large symbolic matrix

Categories

Resources