Alignment of vectors in LLVM's amd64 output - alignment

I'm trying to use vectors inside structs with LLVM. I have the following C definition of my struct:
struct Foo
{
uint32_t len;
uint32_t data[32] __attribute__ ((aligned (16)));
};
and here's some LLVM code to add 42 to element number 3 of the data field:
%Foo = type { i32, <32 x i32> }
define void #process(%Foo*) {
_L1:
%data = getelementptr %Foo* %0, i32 0, i32 1
%vec = load <32 x i32>* %data
%x = extractelement <32 x i32> %vec, i32 3
%xNew = add i32 42, %x
%vecNew = insertelement <32 x i32> %vec, i32 %xNew, i32 3
store <32 x i32> %vecNew, <32 x i32>* %data
ret void
}
However, the output of llc is as if vectors had to be aligned at 128 bytes, which seems wasteful, and also wrong (AFAIK vectors should be 16-byte-aligned):
.file "process.bc"
.text
.globl process
.align 16, 0x90
.type process,#function
process: # #process
.Leh_func_begin0:
# BB#0: # %_L1
movdqa 128(%rdi), %xmm0
pextrd $3, %xmm0, %eax
addl $42, %eax
pinsrd $3, %eax, %xmm0
movdqa %xmm0, 128(%rdi)
ret
.Ltmp0:
.size process, .Ltmp0-process
.Leh_func_end0:
Of course, if I change the C definition to also align the data field at 128 bytes, it works, but wasting 124 bytes (compared to 12 if using 16-byte alignment) just seems wrong. So what's going on here?

I think your GEPs are a little off for the best codegen. Here's some C code that does something similar:
#include <stdint.h>
struct Foo
{
uint32_t len;
uint32_t data[32] __attribute__ ((aligned (16)));
};
void foo(struct Foo *F)
{
F->data[3] = 4;
}
which clang turns into this:
; ModuleID = 'foo.c'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-apple-darwin10.0.0"
%struct.Foo = type { i32, [12 x i8], [32 x i32] }
define void #foo(%struct.Foo* %F) nounwind ssp {
%1 = alloca %struct.Foo*, align 8
store %struct.Foo* %F, %struct.Foo** %1, align 8
%2 = load %struct.Foo** %1, align 8
%3 = getelementptr inbounds %struct.Foo* %2, i32 0, i32 2
%4 = getelementptr inbounds [32 x i32]* %3, i32 0, i64 3
store i32 4, i32* %4
ret void
}
and the corresponding nice code you'd expect:
_foo: ## #foo
Leh_func_begin0:
## BB#0:
pushq %rbp
Ltmp0:
movq %rsp, %rbp
Ltmp1:
movl $4, 28(%rdi)
popq %rbp
ret
Leh_func_end0:
That said, the code you have there isn't right and should be:
_process: ## #process
Leh_func_begin1:
## BB#0: ## %_L1
movaps 16(%rdi), %xmm0
pextrd $3, %xmm0, %eax
addl $42, %eax
pinsrd $3, %eax, %xmm0
movaps %xmm0, 16(%rdi)
ret
and is even worse in ToT (top of tree), so a bug report wouldn't go amiss there.

Related

Useless clang temporary in LLVM for `return 0` in simple C program [duplicate]

Here's a simple C file with an enum definition and a main function:
enum days {MON, TUE, WED, THU};
int main() {
enum days d;
d = WED;
return 0;
}
It transpiles to the following LLVM IR:
define dso_local i32 #main() #0 {
%1 = alloca i32, align 4
%2 = alloca i32, align 4
store i32 0, i32* %1, align 4
store i32 2, i32* %2, align 4
ret i32 0
}
%2 is evidently the d variable, which gets 2 assigned to it. What does %1 correspond to if zero is returned directly?
This %1 register was generated by clang to handle multiple return statements in a function. Imagine you were writing a function to compute an integer's factorial. Instead of this
int factorial(int n){
int result;
if(n < 2)
result = 1;
else{
result = n * factorial(n-1);
}
return result;
}
You'd probably do this
int factorial(int n){
if(n < 2)
return 1;
return n * factorial(n-1);
}
Why? Because Clang will insert that result variable that holds the return value for you. Yay. That's the reason for that %1 variable. Look at the IR for a slightly modified version of your code.
Modified code,
enum days {MON, TUE, WED, THU};
int main() {
enum days d;
d = WED;
if(d) return 1;
return 0;
}
IR,
define dso_local i32 #main() #0 !dbg !15 {
%1 = alloca i32, align 4
%2 = alloca i32, align 4
store i32 0, i32* %1, align 4
store i32 2, i32* %2, align 4, !dbg !22
%3 = load i32, i32* %2, align 4, !dbg !23
%4 = icmp ne i32 %3, 0, !dbg !23
br i1 %4, label %5, label %6, !dbg !25
5: ; preds = %0
store i32 1, i32* %1, align 4, !dbg !26
br label %7, !dbg !26
6: ; preds = %0
store i32 0, i32* %1, align 4, !dbg !27
br label %7, !dbg !27
7: ; preds = %6, %5
%8 = load i32, i32* %1, align 4, !dbg !28
ret i32 %8, !dbg !28
}
Now you see %1 making itself useful huh? Most functions with a single return statement will have this variable stripped by one of llvm's passes.
Why does this matter — what's the actual problem?
I think the deeper answer you're looking for might be: LLVM's architecture is based around fairly simple frontends and many passes. The frontends have to generate correct code, but it doesn't have to be good code. They can do the simplest thing that works.
In this case, Clang generates a couple of instructions that turn out not to be used for anything. That's generally not a problem, because some part of LLVM will get rid of superfluous instructions. Clang trusts that to happen. Clang doesn't need to avoid emitting dead code; its implementation may focus on correctness, simplicity, testability, etc.
Because Clang is done with syntax analysis but LLVM hasn't even started with optimization.
The Clang front end has generated IR (Intermediate Representation), not machine code. Those variables are SSA (Static Single Assignment) values; they haven't been bound to registers yet and, after optimization, never will be, because they are redundant.
That code is a somewhat literal representation of the source. It is what clang hands to LLVM for optimization. Basically, LLVM starts with that and optimizes from there. Indeed, for version 10 and x86_64, llc -O2 will eventually generate:
main: # #main
xor eax, eax
ret

Why doesn't Clang generate bitcode of some library functions in libstdc++?

When compiling this C++ file, which uses the list library, with the command clang++ list-simple-test.cpp -c -emit-llvm:
// list1.cpp
#include <list>
using namespace std;
int main(int argc, char **argv)
{
int x = 1;
list<int*> alist;
alist.push_back(&x);
return x;
}
I notice that some functions, like _ZNSt8__detail15_List_node_base7_M_hookEPS0_, are generated without a function body:
; Function Attrs: nounwind
declare dso_local void #_ZNSt8__detail15_List_node_base7_M_hookEPS0_(%"struct.std::__detail::_List_node_base"*, %"struct.std::__detail::_List_node_base"*) #5
While most of the other functions are generated with a complete body, for example, like the function _ZNSt7__cxx1110_List_baseIPiSaIS1_EE11_M_inc_sizeEm below:
; Function Attrs: noinline nounwind optnone uwtable
define linkonce_odr dso_local void #_ZNSt7__cxx1110_List_baseIPiSaIS1_EE11_M_inc_sizeEm(%"class.std::__cxx11::_List_base"*, i64) #1 comdat align 2 {
%3 = alloca %"class.std::__cxx11::_List_base"*, align 8
%4 = alloca i64, align 8
store %"class.std::__cxx11::_List_base"* %0, %"class.std::__cxx11::_List_base"** %3, align 8
store i64 %1, i64* %4, align 8
%5 = load %"class.std::__cxx11::_List_base"*, %"class.std::__cxx11::_List_base"** %3, align 8
%6 = load i64, i64* %4, align 8
%7 = getelementptr inbounds %"class.std::__cxx11::_List_base", %"class.std::__cxx11::_List_base"* %5, i32 0, i32 0
%8 = getelementptr inbounds %"struct.std::__cxx11::_List_base<int *, std::allocator<int *> >::_List_impl", %"struct.std::__cxx11::_List_base<int *, std::allocator<int *> >::_List_impl"* %7, i32 0, i32 0
%9 = getelementptr inbounds %"struct.std::__detail::_List_node_header", %"struct.std::__detail::_List_node_header"* %8, i32 0, i32 1
%10 = load i64, i64* %9, align 8
%11 = add i64 %10, %6
store i64 %11, i64* %9, align 8
ret void
}
I understand that those functions are from libstdc++.so. But why does Clang generate the body for some functions, but not for others?
Does anybody know how to make Clang generate the body of _ZNSt8__detail15_List_node_base7_M_hookEPS0_ as well?
Thank you very much for reading my question! I'm writing a static analysis tool, which needs to analyze the body of _ZNSt8__detail15_List_node_base7_M_hookEPS0_ to obtain more precise result.
Most probably those other functions are coming from C++ templates.
When you declare a templated function, you have to provide its implementation in the header file in most cases. This way their code ends up in your own translation unit, and you see this code in your IR.
I temporarily found a workaround using the suggestion from @arrowd.
// generate list-simple-test.bc
clang++-list-simple-test.cpp -c -emit-llvm
// generate list.bc (list.cc is from the source code of libstdc++)
clang++ -emit-llvm list.cc
// combine list-simple-test.bc and list.bc
llvm-link list.bc list-simple-test.bc -o list-simple-final.bc
In the code above, list.cc can be downloaded from the GCC project.
The final bitcode file list-simple-final.bc will contain the definition of _ZNSt8__detail15_List_node_base7_M_hookEPS0_, which is provided by list.cc

How to save the variable name when use clang to generate llvm ir?

I generate IR using 'clang -S -emit-llvm test.c'.
int main(int argc, char **argv)
{
int* a=0;
a=(int *)malloc(sizeof(int));
printf("hello world\n");
return 0;
}
and this is the ir:
define i32 #main(i32, i8**) #0 {
%3 = alloca i32, align 4
%4 = alloca i32, align 4
%5 = alloca i8**, align 8
%6 = alloca i32*, align 8
store i32 0, i32* %3, align 4
store i32 %0, i32* %4, align 4
store i8** %1, i8*** %5, align 8
store i32* null, i32** %6, align 8
%7 = call noalias i8* #malloc(i64 4) #3
%8 = bitcast i8* %7 to i32*
store i32* %8, i32** %6, align 8
%9 = call i32 (i8*, ...) #printf(i8* getelementptr inbounds ([13 x i8], [13 x i8]* #.str, i32 0, i32 0))
ret i32 0
}
How can I make the variable names remain unchanged, so that a is still %a, not %3?
Actually, dropping variable names is a feature that is activated with -discard-value-names. Clang in a release build does this on its own (a self-compiled clang in debug mode does not).
You can circumvent it with
clang <your-command-line> -###
Then copy the output and drop -discard-value-names.
Newer clang versions (since 7) expose the flag on the normal command line:
clang -fno-discard-value-names <your-command-line>
Source
There is no such way. The variable names in LLVM IR are for debugging only, and there is certainly no way to preserve them when the code is converted to full SSA form.
If you need to preserve source code information consider using debug info.

Can LLVM execute bitcode on ios (arm64)?

I'm trying to execute LLVM bitcode on an iOS device.
I've compiled the bitcode file using clang -emit-llvm -S -c ./test.cpp -o .test.ll -target arm64-apple-ios7.1, targeting an arm64 device. LLVM/Clang supports iOS arm64 devices starting with release 3.5. I then try to invoke the code using MCJIT and get an exception:
http://llvm.org/bugs/show_bug.cgi?id=21012
Is it an iOS OS/device restriction or an LLVM/Clang bug?
I'm testing on iPad Air with ios 7.1.
PS. The bitcode looks correct and is built for the correct target:
; ModuleID = './test.cpp'
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
target triple = "arm64-apple0ios7.1"
#.str = private unnamed_addr constant [12 x i8] c"hello world\00", align 1
; Function Attrs: nounwind
define i32 #main(i32 %argc, i8** %argv) #0 {
entry:
%retval = alloca i32, align 4
%argc.addr = alloca i32, align 4
%argv.addr = alloca i8**, align 8
%c = alloca i8*, align 8
%a = alloca i32, align 4
%b = alloca i32, align 4
store i32 0, i32* %retval
store i32 %argc, i32* %argc.addr, align 4
store i8** %argv, i8*** %argv.addr, align 8
store i8* getelementptr inbounds ([12 x i8]* #.str, i32 0, i32 0), i8** %c, align 8
store i32 1, i32* %a, align 4
%0 = load i32* %a, align 4
%add = add nsw i32 %0, 6
store i32 %add, i32* %b, align 4
ret i32 3
}
attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
!llvm.ident = !{!0}
!0 = metadata !{metadata !"clang version 3.6.0 (trunk 218116) (llvm/trunk 218115)"}
It was compiled by LLVM/Clang (trunk after 3.5 release) from simple source code:
int main(int argc, const char **argv) {
const char *c = "hello world";
int a = 1;
int b = a + 6;
return 3;
}
PPS. The interpreter code is almost the same as the lli tool's code, called with the following arguments:
return llvm_interpret(
InputFile,
std::vector<std::string>(), // argv
false, // ForceInterpreter
true, // UseMCJIT
false, // DebugIR
false, // RemoteMCJIT
"", // ChildExecPath
' ', // OptLevel
std::string(/*"arm64-apple-ios7.1"*/), // TargetTriple
std::string(/*"arm64"*/), // MArch
std::string("generic"), // MCPU
std::vector<std::string>(), // MAttrs
"main", // EntryFunc
std::vector<std::string>(), // ExtraModules
std::vector<std::string>(), // ExtraObjects
std::vector<std::string>(), // ExtraArchives
false, // EnableCacheManager
std::string(), // ObjectCacheDir
std::string(), // FakeArgv0
false, // DisableCoreFiles
false, // NoLazyCompilation
Reloc::Static, // RelocModel (default is Reloc::PIC_)
CodeModel::JITDefault, // CMModel (default is CodeModel::JITDefault)
false, // GenerateSoftFloatCalls
FloatABI::Default, // FloatABIForCalls
false, // EmitJitDebugInfo
false // EmitJitDebugInfoToDisk
);
Exception:
rv.IntVal = APInt(32, PF(ArgValues[0].IntVal.getZExtValue(),
(char **)GVTOP(ArgValues[1])));
0x4a18000: .long 0xd10083ff ; unknown opcode
0x4a18004: stmdbhs r3, {r5, r6, r7, r8, r9, r10, r11, r12, sp, lr}
0x4a18008: .long 0xf9000be1 ; unknown opcode
0x4a1800c: andls r0, r0, r8
0x4a18010: .long 0xf9401908 ; unknown opcode
0x4a18014: andlo r0, r0, #0xa4000003
0x4a18018: andlo r0, r0, #0x3a800
0x4a1801c: andlo r0, r0, #0x3800000

Compile with no optimization in clang

Short question: how to compile with clang with no code optimization? -O0 is not working.
Long question:
I'm learning code optimization and LLVM in particular. I'm writing small examples, compiling them and then running just one optimization at a time, to analyze what it changes. For example, to test Dead Code Elimination, I tried this:
int main() {
int a = 20 + 30;
int b = 25; /* Assignment to dead variable */
int c;
c = a << 2;
return c;
b = 24; /* Unreachable code */
return 0;
}
However, when I compile it with
clang -S -O0 -emit-llvm foo.c
The last two lines of my C code do not show up in the IR code (below). Also, the 20 + 30 is already being calculated to 50. So there's some optimization going on here, even though I'm using -O0.
; ModuleID = 'hello.c'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; Function Attrs: nounwind uwtable
define i32 #main() #0 {
entry:
%retval = alloca i32, align 4
%a = alloca i32, align 4
%b = alloca i32, align 4
%c = alloca i32, align 4
store i32 0, i32* %retval
store i32 50, i32* %a, align 4
store i32 25, i32* %b, align 4
%0 = load i32* %a, align 4
%shl = shl i32 %0, 2
store i32 %shl, i32* %c, align 4
%1 = load i32* %c, align 4
ret i32 %1
}
attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
!llvm.ident = !{!0}
!0 = metadata !{metadata !"clang version 3.4 (trunk 192936)"}

Resources