I need to build OpenBLAS static library which contains assembly functions - https://github.com/xianyi/OpenBLAS/releases . Personally, I'm trying to build 0.3.6 version.
Although the compilation process succeeds, the .o files are missing the actual implementations of the assembly functions.
I have this problem with all of the assembly functions, but I will use one of them, _sdot_k, as an example.
How do I know that the implementation is missing? When I run nm [library_name].a, the output for this function is the following:
libopenblas_armv8p-r0.3.6.a(sdot_k.o):
0000000000000050 t .Ldot_kernel_F1
0000000000000058 t .Ldot_kernel_F10
0000000000000028 t .Ldot_kernel_F4
000000000000001c t .Ldot_kernel_F_BEGIN
00000000000000d8 t .Ldot_kernel_L999
00000000000000bc t .Ldot_kernel_S1
00000000000000c4 t .Ldot_kernel_S10
0000000000000084 t .Ldot_kernel_S4
0000000000000070 t .Ldot_kernel_S_BEGIN
0000000000000000 t ltmp0
There's no T entry, which would indicate that the function's implementation is present in the .o file. And of course, if I add this library to my iOS project, I get the following error:
Undefined symbols for architecture arm64:
"_sdot_k", referenced from:
_strmv_TLN in libopenblas.a(strmv_TLN.o)
_strmv_TLU in libopenblas.a(strmv_TLU.o)
_strmv_TUN in libopenblas.a(strmv_TUN.o)
_strmv_TUU in libopenblas.a(strmv_TUU.o)
_trmv_kernel in libopenblas.a(strmv_thread_TLN.o)
_trmv_kernel in libopenblas.a(strmv_thread_TLU.o)
_trmv_kernel in libopenblas.a(strmv_thread_TUN.o)
...
Here's the actual dot.S file:
#define ASSEMBLER
#include "common.h"
#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define Y x3 /* Y vector address */
#define INC_Y x4 /* Y stride */
#define I x5 /* loop variable */
/*******************************************************************************
* Macro definitions
*******************************************************************************/
#if !defined(DOUBLE)
#if !defined(DSDOT)
#define REG0 wzr
#define DOTF s0
#else // DSDOT
#define REG0 xzr
#define DOTF d0
#endif
#define DOTI s1
#define TMPX s2
#define LD1VX {v2.s}[0]
#define TMPY s3
#define LD1VY {v3.s}[0]
#define TMPVY v3.s[0]
#define SZ 4
#else
#define REG0 xzr
#define DOTF d0
#define DOTI d1
#define TMPX d2
#define LD1VX {v2.d}[0]
#define TMPY d3
#define LD1VY {v3.d}[0]
#define TMPVY v3.d[0]
#define SZ 8
#endif
/******************************************************************************/
/*
 * KERNEL_F1: process one element on the contiguous path.
 * Loads one value from X and one from Y (each pointer is post-incremented
 * by SZ bytes) and adds their product to the scalar accumulator DOTF.
 */
.macro KERNEL_F1
ldr TMPX, [X], #SZ
ldr TMPY, [Y], #SZ
#if !defined(DSDOT)
fmadd DOTF, TMPX, TMPY, DOTF
#else // DSDOT
/* DSDOT: convert both operands into d2/d3 first, then multiply and add in double. */
fcvt d3, TMPY
fcvt d2, TMPX
fmul d2, d2, d3
fadd DOTF, DOTF, d2
#endif
.endm
/*
 * KERNEL_F4: process four elements on the contiguous path, accumulating
 * partial sums in the vector register v0 (reduced later to DOTF).
 * X and Y are post-incremented past the elements consumed.
 */
.macro KERNEL_F4
#if !defined(DOUBLE)
/* Single precision: four 32-bit lanes from each input (16 bytes). */
ld1 {v2.4s}, [X], #16
ld1 {v3.4s}, [Y], #16
#if !defined(DSDOT)
fmla v0.4s, v2.4s, v3.4s
#else
/* DSDOT: widen the upper (fcvtl2) and lower (fcvtl) halves to double, */
/* multiply pairwise, and add both product vectors into v0.2d. */
fcvtl2 v5.2d, v3.4s
fcvtl2 v4.2d, v2.4s
fcvtl v3.2d, v3.2s
fcvtl v2.2d, v2.2s
fmul v4.2d, v4.2d, v5.2d
fmul v2.2d, v2.2d, v3.2d
fadd v2.2d, v2.2d, v4.2d
fadd v0.2d, v0.2d, v2.2d
#endif
#else //DOUBLE
/* Double precision: four 64-bit values from each input (32 bytes, two registers). */
ld1 {v2.2d, v3.2d}, [X], #32
ld1 {v4.2d, v5.2d}, [Y], #32
fmul v2.2d, v2.2d, v4.2d
fmul v3.2d, v3.2d, v5.2d
fadd v0.2d, v0.2d, v2.2d
fadd v0.2d, v0.2d, v3.2d
#endif
/* Prefetch 1024 bytes ahead of the current read positions. */
PRFM PLDL1KEEP, [X, #1024]
PRFM PLDL1KEEP, [Y, #1024]
.endm
/*
 * KERNEL_F4_FINALIZE: collapse the vector accumulator v0 built up by
 * KERNEL_F4 into the scalar accumulator DOTF.
 */
.macro KERNEL_F4_FINALIZE
#if !defined(DOUBLE)
#if !defined(DSDOT)
/* Four float lanes: fold the upper 64 bits onto the lower (via v1), then add the remaining pair. */
ext v1.16b, v0.16b, v0.16b, #8
fadd v0.2s, v0.2s, v1.2s
faddp DOTF, v0.2s
#else
/* DSDOT accumulates two doubles: a single pairwise add is enough. */
faddp DOTF, v0.2d
#endif
#else //DOUBLE
faddp DOTF, v0.2d
#endif
.endm
/*
 * INIT_S: scale INC_X and INC_Y in place by the element size
 * (shift by 2 => x4 without DOUBLE, shift by 3 => x8 with DOUBLE),
 * so they can be used as post-increment amounts by KERNEL_S1.
 */
.macro INIT_S
#if !defined(DOUBLE)
lsl INC_X, INC_X, #2
lsl INC_Y, INC_Y, #2
#else
lsl INC_X, INC_X, #3
lsl INC_Y, INC_Y, #3
#endif
.endm
/*
 * KERNEL_S1: process one element on the strided path.
 * Loads one value from X and one from Y into lane 0 of v2/v3, advancing each
 * pointer by the value held in INC_X / INC_Y, and adds the product to DOTF.
 */
.macro KERNEL_S1
ld1 LD1VX, [X], INC_X
ld1 LD1VY, [Y], INC_Y
#if !defined(DSDOT)
fmadd DOTF, TMPX, TMPY, DOTF
#else // DSDOT
/* DSDOT: convert both operands into d2/d3 first, then multiply and add in double. */
fcvt d3, TMPY
fcvt d2, TMPX
fmul d2, d2, d3
fadd DOTF, DOTF, d2
#endif
.endm
/*******************************************************************************
* End of macro definitions
*******************************************************************************/
/*
 * Dot-product kernel body. The entry label comes from the PROLOGUE macro.
 * In:  N, X, INC_X, Y, INC_Y in the registers named by the #defines at the
 *      top of this file; I is used as the loop counter.
 * Out: the accumulated sum is left in DOTF when the routine returns.
 * When both INC_X and INC_Y equal 1 the contiguous "F" path is taken
 * (blocks of 4, then a 0-3 element tail); otherwise the strided "S" path.
 */
PROLOGUE
/* Clear the accumulator. */
fmov DOTF, REG0
#if defined(DOUBLE)
fmov d6, DOTF
#endif
/* Nothing to do for N <= 0: return with the zeroed accumulator. */
cmp N, xzr
ble .Ldot_kernel_L999
/* Any stride other than 1 on either input selects the strided path. */
cmp INC_X, #1
bne .Ldot_kernel_S_BEGIN
cmp INC_Y, #1
bne .Ldot_kernel_S_BEGIN
.Ldot_kernel_F_BEGIN:
/* I = N / 4 : number of 4-element blocks. */
asr I, N, #2
cmp I, xzr
beq .Ldot_kernel_F1
.Ldot_kernel_F4:
KERNEL_F4
subs I, I, #1
bne .Ldot_kernel_F4
/* Reduce the vector partial sums to DOTF before handling the tail. */
KERNEL_F4_FINALIZE
.Ldot_kernel_F1:
/* I = N % 4 : leftover elements; done if there are none. */
ands I, N, #3
ble .Ldot_kernel_L999
.Ldot_kernel_F10:
KERNEL_F1
subs I, I, #1
bne .Ldot_kernel_F10
ret
.Ldot_kernel_S_BEGIN:
/* Scale the strides by the element size, then run 4 elements per iteration. */
INIT_S
asr I, N, #2
cmp I, xzr
ble .Ldot_kernel_S1
.Ldot_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
bne .Ldot_kernel_S4
.Ldot_kernel_S1:
/* I = N % 4 : leftover elements; done if there are none. */
ands I, N, #3
ble .Ldot_kernel_L999
.Ldot_kernel_S10:
KERNEL_S1
subs I, I, #1
bne .Ldot_kernel_S10
.Ldot_kernel_L999:
ret
EPILOGUE
As you can see, there are PROLOGUE and EPILOGUE, which are macros defined in a C header:
#ifndef F_INTERFACE
#define REALNAME ASMNAME
#else
#define REALNAME ASMFNAME
#endif
#if defined(ASSEMBLER) && !defined(NEEDPARAM)
/*
 * PROLOGUE: select .text, align, export REALNAME and define its label.
 * This is a C preprocessor macro, so the continuation lines are joined and
 * the expansion is a SINGLE line whose directives are separated by ';'.
 * NOTE(review): this relies on the assembler treating ';' as a statement
 * separator. If the target assembler treats ';' as a comment character
 * (believed to be the case for Apple's arm64 assembler -- confirm),
 * everything after ".text" is ignored and REALNAME is never defined.
 */
#define PROLOGUE \
.text ;\
.align 2 ;\
.globl REALNAME ;\
REALNAME:
/* EPILOGUE and PROFCODE intentionally expand to nothing. */
#define EPILOGUE
#define PROFCODE
#endif
My guts are telling me that the problem lies somewhere in EPILOGUE and PROLOGUE, but I don't have decent Assembler knowledge to figure out a problem on my own.
Also I found this article - https://developer.apple.com/library/archive/documentation/DeveloperTools/Reference/Assembler/040-Assembler_Directives/asm_directives.html, but it didn't help me much. Maybe because of my lack of Assembly skill.
NOTE: If anyone is struggling with the same library trying to make it work on iOS, here's my thread on it's Github - https://github.com/xianyi/OpenBLAS/issues/2275#issuecomment-536982253is. It contains descriptions of all problems I overcomed.
NOTE 2: I really need OpenBLAS library and using Accelerate.framework is not an option in my case, unfortunately.
I'm happy to say that I overcame this issue. The problem was indeed in my PROLOGUE macro.
The PROLOGUE macro I was using expanded to assembly code like this:
.text; .align 2; .globl REALNAME; REALNAME:
And that was the issue. For the assembly label to work, it should look like this:
.text;
.align 2;
.globl REALNAME;
REALNAME:
//assembly code
So, to achieve that result, I changed the C macro to a GAS macro, like so:
/*
 * PROLOGUE as an assembler macro: selects .text, aligns, exports REALNAME
 * and defines its label. Each directive sits on its own line, so no ';'
 * separators are involved in the expansion.
 */
.macro PROLOGUE
.text
.align 2
.globl REALNAME
REALNAME:
.endm
After that issue was gone!
NOTE: To avoid any confusion, the ; itself doesn't really play a role here. What matters is the new line after each directive.
Related
Consider the following sample program (targeting Linux/x86-64):
#include <stdio.h>
#include <stdlib.h>
/*
 * Minimal reproducer for the register-allocation question.
 * Both asm operands are read-write and early-clobber ("+&") and may be placed
 * in any of the registers named by the constraint letters a, b, d, S, D.
 * ecx is used as a scratch register inside the template and is listed as a
 * clobber, together with the condition codes.
 * Expects two numeric command-line arguments; argv[1]/argv[2] are not checked.
 */
int main(int argc, char *argv[])
{
unsigned arg1 = strtoul(argv[1], NULL, 0);
unsigned arg2 = strtoul(argv[2], NULL, 0);
/* The template reads and writes both operands, so they need distinct registers. */
asm(
"mov %[arg1], %%ecx\n\t"
"add %[arg2], %[arg1]\n\t"
"add %[arg2], %%ecx\n\t"
"xchg %%ecx, %[arg2]"
: [arg1] "+&abdSD" (arg1), [arg2] "+&abdSD" (arg2)
:
: "cc", "ecx");
printf("%u %u\n", arg1, arg2);
}
(xchg is used just for easy grepping of compiled instructions in listing.)
With GCC, it works as expected - different registers are assigned to arg1 and arg2, for example:
11bf: e8 cc fe ff ff callq 1090 <strtoul@plt>
11c4: 89 da mov %ebx,%edx
11c6: 89 d1 mov %edx,%ecx
11c8: 01 c2 add %eax,%edx
11ca: 01 c1 add %eax,%ecx
11cc: 91 xchg %eax,%ecx
(so, arg1 in edx, arg2 in eax)
But, compiling with Clang (confirmed on 6.0 and 10.0) results in assigning the same register for arg1 and arg2:
401174: e8 d7 fe ff ff callq 401050 <strtoul@plt>
401179: 44 89 f0 mov %r14d,%eax ; <--
40117c: 89 c1 mov %eax,%ecx
40117e: 01 c0 add %eax,%eax ; <-- so, arg1 and arg2 both in eax
401180: 01 c1 add %eax,%ecx
401182: 91 xchg %eax,%ecx
The issue remains with multiple variations as e.g.: + instead of +& in constraint strings; numeric forms like %0 to address operands; replacing xchg with another rare instruction; and so on.
I have been expecting, from basic principles, that the compilerʼs logic for assigning output locations would always assign different locations to different output operands, whatever constraints are defined for them, and that the same holds within the set of input operands. (Modifiers like '+' and '&' add more rules to the placement logic but canʼt erode the main principles.)
Is there some trivial aspect Iʼve overlooked?
UPD: reported to LLVM.
I'm trying the example from http://api.madewithmarmalade.com/ExampleArmASM.html on iOS. The program runs if I comment out the loop, and res is printed as 28. But if I don't comment it out, it aborts without printing res.
Any hints as to why, and how to fix it?
Thanks in advance.
My code is as follows:
#include <stdio.h>
#include <stdlib.h>
#define ARRAY_SIZE 512
#if defined __arm__ && defined __ARM_NEON__
/*
 * Sum elements of a[] using NEON inline assembly.
 *
 * a:       pointer to the input ints; the asm advances its local copy.
 * returns: the 32-bit sum produced by the asm. With the loop lines disabled
 *          (as they are below) only the first 8 elements are added.
 *
 * The clobber list names "cc" because `subs` updates the condition flags, and
 * "memory" because the asm reads the array through the pointer without the
 * array itself being an operand; without them the compiler is not told about
 * either effect.
 *
 * NOTE(review): if the loop is re-enabled, consider a numeric local label
 * ("1:" / "bne 1b") instead of ".loop1" -- a named label inside inline asm
 * is emitted as an ordinary symbol and can clash or confuse the toolchain;
 * confirm against the target assembler.
 */
static int computeSumNeon(const int a[])
{
    int res = 0;
    asm(".align 4 \n\t" // alignment directive, added to silence an assembler warning
        "vmov.i32 q8, #0 \n\t" //clear our accumulator register
        "mov r3, #512 \n\t" //Loop condition n = ARRAY_SIZE
        // ".loop1: \n\t" // No loop add 0-7 works as 28
        "vld1.32 {d0, d1, d2, d3}, [%[input]]! \n\t" //load 8 elements into d0, d1, d2, d3 = q0, q1
        "pld [%[input]] \n\t" // preload next set of elements
        "vadd.i32 q8, q0, q8 \n\t" // q8 += q0
        "vadd.i32 q8, q1, q8 \n\t" // q8 += q1
        "subs r3, r3, #8 \n\t" // n -= 8
        // "bne .loop1 \n\t" // n == 0?
        "vpadd.i32 d0, d16, d17 \n\t" // d0[0] = d16[0] + d16[1], d0[1] = d17[0] + d17[1]
        "vpaddl.u32 d0, d0 \n\t" // d0[0] = d0[0] + d0[1]
        "vmov.32 %[result], d0[0] \n\t"
        : [result] "=r" (res) , [input] "+r" (a)
        :
        : "q0", "q1", "q8", "r3", "cc", "memory");
    return res;
}
#else
/*
 * Portable fallback used when NEON is not available: plain scalar sum of
 * the first ARRAY_SIZE elements of a[].
 *
 * a:       pointer to at least ARRAY_SIZE ints.
 * returns: the sum of those elements.
 */
static int computeSumNeon(const int a[])
{
    int i, res = 0;
    for (i = 0; i < ARRAY_SIZE; i++)
        res += a[i];
    /* The return was missing: falling off the end of a non-void function
       leaves the caller with an undefined value. */
    return res;
}
#endif
...
@implementation AppDelegate
- (BOOL)application:(UIApplication *)application didFinishLaunchingWithOptions:(NSDictionary *)launchOptions {
    // Test input lives on the stack, so there is nothing to free afterwards.
    int inp[ARRAY_SIZE];

    // Fill the buffer with consecutive integers, then echo every element.
    for (int idx = 0; idx < ARRAY_SIZE; idx++) {
        inp[idx] = idx;
    }
    for (int idx = 0; idx < ARRAY_SIZE; idx++) {
        printf("%i,", inp[idx]);
    }

    // Reference value for the first eight elements, followed by the computed sum.
    printf("\n\n sum 0-7:%i\n", 0+1+2+3+4+5+6+7);
    int res = computeSumNeon(inp);
    printf("res NEO :%i\n", res);

    // Standard split-view wiring: show the display-mode button on the detail
    // navigation controller and make this object the split view's delegate.
    UISplitViewController *split = (UISplitViewController *)self.window.rootViewController;
    UINavigationController *detailNav = [split.viewControllers lastObject];
    detailNav.topViewController.navigationItem.leftBarButtonItem = split.displayModeButtonItem;
    split.delegate = self;
    return YES;
}
- (void)applicationWillResignActive:(UIApplication *)application {
...
==== assembly code generated
.align 1
.code 16 # #computeSumNeon
.thumb_func _computeSumNeon
_computeSumNeon:
Lfunc_begin3:
.loc 18 133 0 is_stmt 1 # ...
.cfi_startproc
# BB#0:
sub sp, #8
movs r1, #0
str r0, [sp, #4]
.loc 18 135 9 prologue_end # ...
Ltmp18:
str r1, [sp]
.loc 18 136 5 # ...
ldr r0, [sp, #4]
# InlineAsm Start
.align 4
vmov.i32 q8, #0x0
movw r3, #504
.loop1:
vld1.32 {d0, d1, d2, d3}, [r0]!
vadd.i32 q8, q0, q8
vadd.i32 q8, q1, q8
subs r3, #8
bne .loop1
vpadd.i32 d0, d16, d17
vpaddl.u32 d0, d0
vmov.32 r1, d0[0]
# InlineAsm End
str r1, [sp]
str r0, [sp, #4]
.loc 18 155 12 # ...
ldr r0, [sp]
.loc 18 155 5 is_stmt 0 # ...
add sp, #8
bx lr
Ltmp19:
Lfunc_end3:
.cfi_endproc
Part of the source code is
/* Function pointer through which the original implementation is meant to be reached
   (where it is assigned is not shown here). */
id (*old_objc_msgSend)(id, SEL, ...);
/*
 * Naked replacement: the body consists solely of the inline assembly below.
 * It reloads r2 and r3 from the memory at sp and then branches to the symbol
 * _old_objc_msgSend.
 * NOTE(review): old_objc_msgSend is declared above as a function-pointer
 * VARIABLE, so "b _old_objc_msgSend" targets the variable's own address rather
 * than the function it points to -- confirm whether the pointer should be
 * loaded and branched through instead.
 */
__attribute__((naked))
id new_objc_msgSend(id self, SEL op, ...) {
__asm__ __volatile__ (
".thumb\n"
"ldmia.w sp, {r2, r3}\n"
"b _old_objc_msgSend\n"
);
}
But the generated assembly is
Dump of assembler code for function _Z16new_objc_msgSendP11objc_objectP13objc_selectorz:
0x01a7ae9c <_Z16new_objc_msgSendP11objc_objectP13objc_selectorz+0>: stmia.w sp, {r2, r3}
0x01a7aea0 <_Z16new_objc_msgSendP11objc_objectP13objc_selectorz+4>: ldmia.w sp, {r2, r3}
0x01a7aea4 <_Z16new_objc_msgSendP11objc_objectP13objc_selectorz+8>: b.w 0x1a7af68 <_Z27new_initWithContentwithSizeP11objc_objectP13objc_selectorS0_6CGSize+188>
0x01a7aea8 <_Z16new_objc_msgSendP11objc_objectP13objc_selectorz+12>: bx lr
0x01a7aeaa <_Z16new_objc_msgSendP11objc_objectP13objc_selectorz+14>: nop
End of assembler dump.
It branches to a different address.
I wrote some very simple code, aimed to work on bare metal RaspberryPi. My code consists of gpio.s (with function "flash", which turns LED on and off) and main.s, shown below.
/* Entry point; lives in .init, which the linker script places first in the image. */
.section .init
.globl _start
_start:
/* Set the stack pointer, then branch to main (plain branch, no return expected). */
mov sp, $0x8000
b main
.section .text
.globl main
main:
/* r5 = absolute address of variable (via the literal pool), r4 = the word stored there. */
ldr r5, =variable
ldr r4, [r5]
/* Call flash only if the loaded value equals 100. */
cmp r4, $100
bleq flash
/* Spin forever. */
loop:
b loop
.section .data
.align 4
.globl variable
/* Statically initialised word that main reads. */
variable:
.word 100
So r4 should be filled with 100 => condition flag should be eq => LED should flash! But it does not. Why?
Apart from that example, function "flash" works, as well as in the case of adding these lines after "ldr r5, =variable":
mov r1, $100
str r1, [r5]
So it seems like memory is accessible, but doesn't get initialized. I would be grateful for your explanations.
Disassembly:
./build/output.elf: file format elf32-littlearm
Disassembly of section .init:
00000000 <_start>:
0: e3a0d902 mov sp, #32768 ; 0x8000
4: ea00205c b 817c <main>
Disassembly of section .text:
00008000 <getGpioAddr>:
8000: e59f0170 ldr r0, [pc, #368] ; 8178 <flash2+0x14>
8004: e1a0f00e mov pc, lr
00008008 <setGpioFunct>:
8008: e3500035 cmp r0, #53 ; 0x35
800c: 93510007 cmpls r1, #7 ; 0x7
8010: 83a00001 movhi r0, #1 ; 0x1
8014: 81a0f00e movhi pc, lr
8018: e92d0030 push {r4, r5}
801c: e1a02001 mov r2, r1
8020: e1a01000 mov r1, r0
8024: e92d4000 push {lr}
8028: ebfffff4 bl 8000 <getGpioAddr>
802c: e8bd4000 pop {lr}
8030: e3a04000 mov r4, #0 ; 0x0
00008034 <subTen>:
8034: e351000a cmp r1, #10 ; 0xa
8038: 2241100a subcs r1, r1, #10 ; 0xa
803c: 22844001 addcs r4, r4, #1 ; 0x1
8040: 2afffffb bcs 8034 <subTen>
8044: e3a05004 mov r5, #4 ; 0x4
8048: e0030594 mul r3, r4, r5
804c: e0800003 add r0, r0, r3
8050: e3a05003 mov r5, #3 ; 0x3
8054: e0030591 mul r3, r1, r5
8058: e1a02312 lsl r2, r2, r3
805c: e3e0430e mvn r4, #939524096 ; 0x38000000
8060: e3a05009 mov r5, #9 ; 0x9
8064: e0451001 sub r1, r5, r1
8068: e3a05003 mov r5, #3 ; 0x3
806c: e0030591 mul r3, r1, r5
8070: e1a04374 ror r4, r4, r3
8074: e5905000 ldr r5, [r0]
8078: e0055004 and r5, r5, r4
807c: e1855002 orr r5, r5, r2
8080: e5805000 str r5, [r0]
8084: e8bd0030 pop {r4, r5}
8088: e3a00000 mov r0, #0 ; 0x0
808c: e1a0f00e mov pc, lr
00008090 <setPin>:
8090: e3500035 cmp r0, #53 ; 0x35
8094: 83a00001 movhi r0, #1 ; 0x1
8098: 81a0f00e movhi pc, lr
809c: e92d0020 push {r5}
80a0: e3500020 cmp r0, #32 ; 0x20
80a4: 22401020 subcs r1, r0, #32 ; 0x20
80a8: 31a01000 movcc r1, r0
80ac: 23a02020 movcs r2, #32 ; 0x20
80b0: 33a0201c movcc r2, #28 ; 0x1c
80b4: e92d4000 push {lr}
80b8: ebffffd0 bl 8000 <getGpioAddr>
80bc: e8bd4000 pop {lr}
80c0: e3a05001 mov r5, #1 ; 0x1
80c4: e1a05115 lsl r5, r5, r1
80c8: e7805002 str r5, [r0, r2]
80cc: e3a00000 mov r0, #0 ; 0x0
80d0: e8bd0020 pop {r5}
80d4: e1a0f00e mov pc, lr
000080d8 <clearPin>:
80d8: e3500035 cmp r0, #53 ; 0x35
80dc: 83a00001 movhi r0, #1 ; 0x1
80e0: 81a0f00e movhi pc, lr
80e4: e92d0020 push {r5}
80e8: e3500020 cmp r0, #32 ; 0x20
80ec: 22401020 subcs r1, r0, #32 ; 0x20
80f0: 31a01000 movcc r1, r0
80f4: 23a0202c movcs r2, #44 ; 0x2c
80f8: 33a02028 movcc r2, #40 ; 0x28
80fc: e92d4000 push {lr}
8100: ebffffbe bl 8000 <getGpioAddr>
8104: e8bd4000 pop {lr}
8108: e3a05001 mov r5, #1 ; 0x1
810c: e1a05115 lsl r5, r5, r1
8110: e7805002 str r5, [r0, r2]
8114: e3a00000 mov r0, #0 ; 0x0
8118: e8bd0020 pop {r5}
811c: e1a0f00e mov pc, lr
00008120 <flash>:
8120: e92d4013 push {r0, r1, r4, lr}
8124: e3a00010 mov r0, #16 ; 0x10
8128: e3a01001 mov r1, #1 ; 0x1
812c: ebffffb5 bl 8008 <setGpioFunct>
8130: e3a00010 mov r0, #16 ; 0x10
8134: ebffffe7 bl 80d8 <clearPin>
8138: eb000004 bl 8150 <wait>
813c: e3a00010 mov r0, #16 ; 0x10
8140: ebffffd2 bl 8090 <setPin>
8144: eb000001 bl 8150 <wait>
8148: e8bd4013 pop {r0, r1, r4, lr}
814c: e1a0f00e mov pc, lr
00008150 <wait>:
8150: e3a0583f mov r5, #4128768 ; 0x3f0000
00008154 <loop>:
8154: e2455001 sub r5, r5, #1 ; 0x1
8158: e3550000 cmp r5, #0 ; 0x0
815c: 1afffffc bne 8154 <loop>
8160: e1a0f00e mov pc, lr
00008164 <flash2>:
8164: e92d4000 push {lr}
8168: ebffffec bl 8120 <flash>
816c: ebffffeb bl 8120 <flash>
8170: e8bd4000 pop {lr}
8174: e1a0f00e mov pc, lr
8178: 20200000 .word 0x20200000
0000817c <main>:
817c: e59f500c ldr r5, [pc, #12] ; 8190 <loop+0x4>
8180: e5954000 ldr r4, [r5]
8184: e3540064 cmp r4, #100 ; 0x64
8188: 0bffffe4 bleq 8120 <flash>
0000818c <loop>:
818c: eafffffe b 818c <loop>
8190: 00008194 .word 0x00008194
Disassembly of section .data:
00008194 <variable>:
8194: 00000064 .word 0x00000064
Linker scripts, makefile etc. taken from: http://www.cl.cam.ac.uk/projects/raspberrypi/tutorials/os/ok01.html
from your link (you should not ask questions here using links, put the code in the question)
0000817c <main>:
817c: e59f500c ldr r5, [pc, #12] ; 8190 <loop+0x4>
8180: e3a01064 mov r1, #100 ; 0x64
8184: e3540064 cmp r4, #100 ; 0x64
8188: 0bffffe4 bleq 8120 <flash>
0000818c <loop>:
818c: eafffffe b 818c <loop>
8190: 000081a0 .word 0x000081a0
Disassembly of section .data:
000081a0 <variable>:
81a0: 00000064 .word 0x00000064
...
you are moving 100 into r1 but comparing r4, which has not been initialized, at least in this code, so what will happen is unpredictable. If you replace that with ldr r4,[r5] it should work as desired, as r5 gets the address of the word that contains the 100 and you then read from that address into r4.
I assume you have verified that if you simply bl flash it works (not a conditional but always go there) as desired?
In this bare metal mode you definitely have access to read/write memory, no worries there.
David
Memory is normally initialized as part of the C runtime code. If you are writing bare-metal assembly without including the functionality of the C runtime then your variables in RAM will not be initialized. You need to explicitly initialize the value of variable in your own code.
Finally found out! Really subtle, and it's not my fault indeed. I had taken the makefile and linker script from Alex Chadwick tutorial, and the linker script looked like that:
SECTIONS {
/*
* First and foremost we need the .init section, containing the IVT.
* NOTE(review): .init is based at 0x0000 here while .text is fixed at 0x8000;
* the explanation following this script reports that the image is actually
* loaded at 0x8000, so absolute addresses computed from this layout are off.
*/
.init 0x0000 : {
*(.init)
}
/*
* We allow room for the ATAGs and the stack and then start our code at
* 0x8000.
*/
.text 0x8000 : {
*(.text)
}
/*
* Next we put the data.
*/
.data : {
*(.data)
}
/*
* Finally comes everything else. A fun trick here is to put all other
* sections into this section, which will be discarded by default.
*/
/DISCARD/ : {
*(*)
}
}
The .init section was based at 0x0000, and .text started at 0x8000. But kernel.img is actually loaded at address 0x8000 by the Pi (so the real address of .init was 0x8000). The whole .text section, as well as the sections following it, was therefore shifted, and the label addresses computed at assemble/link time were wrong. Only PC-relative addressing could work, because the PC was set correctly. The solution is to start the image at 0x8000:
SECTIONS {
/*
* First and foremost we need the .init section, containing the IVT.
* It is based at 0x8000, the address the image is loaded at.
*/
.init 0x8000 : {
*(.init)
}
/*
* Code follows .init directly (no explicit address).
*/
.text : {
*(.text)
}
/*
* Next we put the data.
*/
.data : {
*(.data)
}
/*
* Finally comes everything else. A fun trick here is to put all other
* sections into this section, which will be discarded by default.
*/
/DISCARD/ : {
*(*)
}
}
I've just checked the template on his website and it's corrected now, so there is no point contacting him. I must have downloaded template before this correction. Thank you guys for your attempts.
I'm trying to use vectors inside structs with LLVM. I have the following C definition of my struct:
/* Length field followed by a 32-element array that is forced to 16-byte alignment. */
struct Foo
{
uint32_t len;
/* aligned(16): padding is inserted after len so that data starts on a 16-byte boundary */
uint32_t data[32] __attribute__ ((aligned (16)));
};
and here's some LLVM code to add 42 to element number 3 of the data field:
%Foo = type { i32, <32 x i32> }
define void @process(%Foo*) {
_L1:
%data = getelementptr %Foo* %0, i32 0, i32 1
%vec = load <32 x i32>* %data
%x = extractelement <32 x i32> %vec, i32 3
%xNew = add i32 42, %x
%vecNew = insertelement <32 x i32> %vec, i32 %xNew, i32 3
store <32 x i32> %vecNew, <32 x i32>* %data
ret void
}
However, the output of llc is as if vectors had to be aligned at 128 bytes, which seems wasteful, and also wrong (AFAIK vectors should be 16-byte-aligned):
.file "process.bc"
.text
.globl process
.align 16, 0x90
.type process,@function
process: # @process
.Leh_func_begin0:
# BB#0: # %_L1
movdqa 128(%rdi), %xmm0
pextrd $3, %xmm0, %eax
addl $42, %eax
pinsrd $3, %eax, %xmm0
movdqa %xmm0, 128(%rdi)
ret
.Ltmp0:
.size process, .Ltmp0-process
.Leh_func_end0:
Of course, if I change the C definition to also align the data field at 128 bytes, it works, but wasting 124 bytes (compared to 12 if using 16-byte alignment) just seems wrong. So what's going on here?
I think your GEPs are a little off for the best codegen. Here's some C code that does something similar:
#include <stdint.h>
struct Foo
{
uint32_t len;
uint32_t data[32] __attribute__ ((aligned (16)));
};
/* Stores 4 into element 3 of F->data; kept minimal so the generated IR is easy to read. */
void foo(struct Foo *F)
{
F->data[3] = 4;
}
which clang turns into this:
; ModuleID = 'foo.c'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-apple-darwin10.0.0"
%struct.Foo = type { i32, [12 x i8], [32 x i32] }
define void @foo(%struct.Foo* %F) nounwind ssp {
%1 = alloca %struct.Foo*, align 8
store %struct.Foo* %F, %struct.Foo** %1, align 8
%2 = load %struct.Foo** %1, align 8
%3 = getelementptr inbounds %struct.Foo* %2, i32 0, i32 2
%4 = getelementptr inbounds [32 x i32]* %3, i32 0, i64 3
store i32 4, i32* %4
ret void
}
and the corresponding nice code you'd expect:
_foo: ## @foo
Leh_func_begin0:
## BB#0:
pushq %rbp
Ltmp0:
movq %rsp, %rbp
Ltmp1:
movl $4, 28(%rdi)
popq %rbp
ret
Leh_func_end0:
That said, the code you have there isn't right and should be:
_process: ## @process
Leh_func_begin1:
## BB#0: ## %_L1
movaps 16(%rdi), %xmm0
pextrd $3, %xmm0, %eax
addl $42, %eax
pinsrd $3, %eax, %xmm0
movaps %xmm0, 16(%rdi)
ret
and is even worse in ToT so a bug report wouldn't go amiss there.