Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SIGSEGV when switching to clang toolchain #365

Closed
gpakosz opened this issue Apr 11, 2017 · 32 comments
Closed

SIGSEGV when switching to clang toolchain #365

gpakosz opened this issue Apr 11, 2017 · 32 comments
Assignees
Labels
Milestone

Comments

@gpakosz
Copy link

gpakosz commented Apr 11, 2017

As per ndk-r12+ release notes, we tried switching to clang toolchain and noticed code that has been running fine for years started crashing. It took me a bit of time to investigate but I finally found a minimal enough repro. I say minimal enough because it still involves googletest.

The repro involves an executable loading a shared library containing 2 entry points, foo() and bar(). The program uses dlopen() to load the library, then dlsym() to import the entry points.
I'm getting SIGSEGV when calling bar() which implementation is supposed to call foo(). The crash only happens in debug: I suspect the lack of inlining in googletest produces a big executable (6.6M) with a corrupted PLT table.

The shared library:

#include <stdio.h>

void* foo(void* p, int i)
{
  fprintf(stderr, "inside foo\n");
  fflush(stderr);

  return NULL;
}

void* bar(void* p, int i, const void* q, int j)
{
  fprintf(stderr, "inside bar\n");
  fflush(stderr);

  if (j == 0)
    return foo(p, i);

  return NULL;
}

gets compiled to:

$ /opt/android-ndk-r14/toolchains/aarch64-linux-android-4.9/prebuilt/darwin-x86_64/aarch64-linux-android/bin/objdump -dS ./obj/local/arm64-v8a/lib-ndk-r14-clang-SIGSEGV.so

./obj/local/arm64-v8a/lib-ndk-r14-clang-SIGSEGV.so:     file format elf64-littleaarch64


Disassembly of section .plt:

0000000000000570 <__cxa_finalize@plt-0x20>:
 570:	a9bf7bf0 	stp	x16, x30, [sp,#-16]!
 574:	90000090 	adrp	x16, 10000 <bar+0xf984>
 578:	f947e211 	ldr	x17, [x16,#4032]
 57c:	913f0210 	add	x16, x16, #0xfc0
 580:	d61f0220 	br	x17
 584:	d503201f 	nop
 588:	d503201f 	nop
 58c:	d503201f 	nop

0000000000000590 <__cxa_finalize@plt>:
 590:	90000090 	adrp	x16, 10000 <bar+0xf984>
 594:	f947e611 	ldr	x17, [x16,#4040]
 598:	913f2210 	add	x16, x16, #0xfc8
 59c:	d61f0220 	br	x17

00000000000005a0 <fflush@plt>:
 5a0:	90000090 	adrp	x16, 10000 <bar+0xf984>
 5a4:	f947ea11 	ldr	x17, [x16,#4048]
 5a8:	913f4210 	add	x16, x16, #0xfd0
 5ac:	d61f0220 	br	x17

00000000000005b0 <foo@plt>:
 5b0:	90000090 	adrp	x16, 10000 <bar+0xf984>
 5b4:	f947ee11 	ldr	x17, [x16,#4056]
 5b8:	913f6210 	add	x16, x16, #0xfd8
 5bc:	d61f0220 	br	x17

00000000000005c0 <fprintf@plt>:
 5c0:	90000090 	adrp	x16, 10000 <bar+0xf984>
 5c4:	f947f211 	ldr	x17, [x16,#4064]
 5c8:	913f8210 	add	x16, x16, #0xfe0
 5cc:	d61f0220 	br	x17

00000000000005d0 <__cxa_atexit@plt>:
 5d0:	90000090 	adrp	x16, 10000 <bar+0xf984>
 5d4:	f947f611 	ldr	x17, [x16,#4072]
 5d8:	913fa210 	add	x16, x16, #0xfe8
 5dc:	d61f0220 	br	x17

Disassembly of section .text:

00000000000005e0 <__on_dlclose>:
 5e0:	b0000080 	adrp	x0, 11000 <__dso_handle>
 5e4:	91000000 	add	x0, x0, #0x0
 5e8:	17ffffea 	b	590 <__cxa_finalize@plt>

00000000000005ec <__atexit_handler_wrapper>:
 5ec:	a9bf7bfd 	stp	x29, x30, [sp,#-16]!
 5f0:	910003fd 	mov	x29, sp
 5f4:	b4000040 	cbz	x0, 5fc <__atexit_handler_wrapper+0x10>
 5f8:	d63f0000 	blr	x0
 5fc:	a8c17bfd 	ldp	x29, x30, [sp],#16
 600:	d65f03c0 	ret

0000000000000604 <atexit>:
 604:	aa0003e1 	mov	x1, x0
 608:	b0000082 	adrp	x2, 11000 <__dso_handle>
 60c:	90000000 	adrp	x0, 0 <__cxa_finalize@plt-0x590>
 610:	91000042 	add	x2, x2, #0x0
 614:	9117b000 	add	x0, x0, #0x5ec
 618:	17ffffee 	b	5d0 <__cxa_atexit@plt>

000000000000061c <foo>:
#include <stdio.h>

void* foo(void* p, int i)
{
 61c:	d100c3ff 	sub	sp, sp, #0x30
 620:	a9027bfd 	stp	x29, x30, [sp,#32]
 624:	910083fd 	add	x29, sp, #0x20
 628:	f81f83a0 	stur	x0, [x29,#-8]
 62c:	b81f43a1 	stur	w1, [x29,#-12]
  fprintf(stderr, "inside foo\n");
 630:	90000080 	adrp	x0, 10000 <bar+0xf984>
 634:	f947fc00 	ldr	x0, [x0,#4088]
 638:	9104c000 	add	x0, x0, #0x130
 63c:	90000001 	adrp	x1, 0 <__cxa_finalize@plt-0x590>
 640:	911c1021 	add	x1, x1, #0x704
 644:	97ffffdf 	bl	5c0 <fprintf@plt>
 648:	90000081 	adrp	x1, 10000 <bar+0xf984>
 64c:	f947fc21 	ldr	x1, [x1,#4088]
 650:	d280261e 	mov	x30, #0x130                 	// #304
 654:	8b1e0021 	add	x1, x1, x30
  fflush(stderr);
 658:	b90013e0 	str	w0, [sp,#16]
 65c:	aa0103e0 	mov	x0, x1
 660:	97ffffd0 	bl	5a0 <fflush@plt>
 664:	aa1f03e1 	mov	x1, xzr

  return NULL;
 668:	b9000fe0 	str	w0, [sp,#12]
 66c:	aa0103e0 	mov	x0, x1
 670:	a9427bfd 	ldp	x29, x30, [sp,#32]
 674:	9100c3ff 	add	sp, sp, #0x30
 678:	d65f03c0 	ret

000000000000067c <bar>:
}

void* bar(void* p, int i, const void* q, int j)
{
 67c:	d10103ff 	sub	sp, sp, #0x40
 680:	a9037bfd 	stp	x29, x30, [sp,#48]
 684:	9100c3fd 	add	x29, sp, #0x30
 688:	f81f03a0 	stur	x0, [x29,#-16]
 68c:	b81ec3a1 	stur	w1, [x29,#-20]
 690:	f9000be2 	str	x2, [sp,#16]
 694:	b9000fe3 	str	w3, [sp,#12]
  fprintf(stderr, "inside bar\n");
 698:	90000080 	adrp	x0, 10000 <bar+0xf984>
 69c:	f947fc00 	ldr	x0, [x0,#4088]
 6a0:	9104c000 	add	x0, x0, #0x130
 6a4:	90000001 	adrp	x1, 0 <__cxa_finalize@plt-0x590>
 6a8:	911c4021 	add	x1, x1, #0x710
 6ac:	97ffffc5 	bl	5c0 <fprintf@plt>
 6b0:	90000081 	adrp	x1, 10000 <bar+0xf984>
 6b4:	f947fc21 	ldr	x1, [x1,#4088]
 6b8:	d2802602 	mov	x2, #0x130                 	// #304
 6bc:	8b020021 	add	x1, x1, x2
  fflush(stderr);
 6c0:	b9000be0 	str	w0, [sp,#8]
 6c4:	aa0103e0 	mov	x0, x1
 6c8:	97ffffb6 	bl	5a0 <fflush@plt>

  if (j == 0)
 6cc:	b9400fe3 	ldr	w3, [sp,#12]
 6d0:	b90007e0 	str	w0, [sp,#4]
 6d4:	350000c3 	cbnz	w3, 6ec <bar+0x70>
    return foo(p, i);
 6d8:	f85f03a0 	ldur	x0, [x29,#-16]
 6dc:	b85ec3a1 	ldur	w1, [x29,#-20]
 6e0:	97ffffb4 	bl	5b0 <foo@plt>
 6e4:	f81f83a0 	stur	x0, [x29,#-8]
 6e8:	14000003 	b	6f4 <bar+0x78>
 6ec:	aa1f03e8 	mov	x8, xzr

  return NULL;
 6f0:	f81f83a8 	stur	x8, [x29,#-8]
}
 6f4:	f85f83a0 	ldur	x0, [x29,#-8]
 6f8:	a9437bfd 	ldp	x29, x30, [sp,#48]
 6fc:	910103ff 	add	sp, sp, #0x40
 700:	d65f03c0 	ret

Steps to reproduce:

  1. download the ndk-r14-SIGSEGV sample: ndk-r14-clang-SIGSEGV.zip
  2. unzip and cd to directory
  3. build the project
$ /opt/android-ndk-r14/ndk-build NDK_APPLICATION_MK=./Application.mk NDK_PROJECT_PATH=.
  1. copy the built program and library to an Android device running an SSH server, I used a Nexus 9 and SSHDroid
$ scp obj/local/arm64-v8a/* 192.168.26.248:
  1. ssh into device and launch the program which crashes with SIGSEGV:
u0_a52@flounder:/data/data/berserker.android.apps.sshdroid/home $ LD_LIBRARY_PATH=. ./ndk-r14-clang-SIGSEGV
WARNING: linker: ./ndk-r14-clang-SIGSEGV: unused DT entry: type 0x6ffffffe arg 0x1f00
WARNING: linker: ./ndk-r14-clang-SIGSEGV: unused DT entry: type 0x6fffffff arg 0x2
WARNING: linker: lib-ndk-r14-clang-SIGSEGV.so: unused DT entry: type 0x6ffffffe arg 0x4a0
WARNING: linker: lib-ndk-r14-clang-SIGSEGV.so: unused DT entry: type 0x6fffffff arg 0x1
inside bar
Segmentation fault
139|u0_a52@flounder:/data/data/berserker.android.apps.sshdroid/home $

Remote debugging gave me the following output.

Copying gdbserver to the device:

$ scp /opt/android-ndk-r14/prebuilt/android-arm64/gdbserver/gdbserver 192.168.26.248:

Launching gdbserver on the device:

u0_a52@flounder:/data/data/berserker.android.apps.sshdroid/home $ LD_LIBRARY_PATH=. ~/gdbserver localhost:1234 ./ndk-r14-clang-SIGSEGV
Process ./ndk-r14-clang-SIGSEGV created; pid = 21209
Listening on port 1234
Remote debugging from host 192.168.20.213
WARNING: linker: ./ndk-r14-clang-SIGSEGV: unused DT entry: type 0x6ffffffe arg 0x2098
WARNING: linker: ./ndk-r14-clang-SIGSEGV: unused DT entry: type 0x6fffffff arg 0x2
WARNING: linker: lib-ndk-r14-clang-SIGSEGV.so: unused DT entry: type 0x6ffffffe arg 0x4c0
WARNING: linker: lib-ndk-r14-clang-SIGSEGV.so: unused DT entry: type 0x6fffffff arg 0x1
inside bar

Child terminated with signal = 0xb (SIGSEGV)
u0_a52@flounder:/data/data/berserker.android.apps.sshdroid/home $

Launching gdb on my laptop:

$ /opt/android-ndk-r14/prebuilt/darwin-x86_64/bin/gdb                                                                                              [67/67]
GNU gdb (GDB) 7.11
Copyright (C) 2016 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.  Type "show copying"
and "show warranty" for details.
This GDB was configured as "x86_64-apple-darwin14.5.0".
Type "show configuration" for configuration details.
For bug reporting instructions, please see:
<http://www.gnu.org/software/gdb/bugs/>.
Find the GDB manual and other documentation resources online at:
<http://www.gnu.org/software/gdb/documentation/>.
For help, type "help".
Type "apropos word" to search for commands related to "word".
(gdb) set solib-search-path obj/local/arm64-v8a
(gdb) target remote 192.168.26.248:1234
Remote debugging using 192.168.26.248:1234
Reading /data/data/berserker.android.apps.sshdroid/home/ndk-r14-clang-SIGSEGV from remote target...
warning: File transfers from remote targets can be slow. Use "set sysroot" to access files locally instead.
Reading /data/data/berserker.android.apps.sshdroid/home/ndk-r14-clang-SIGSEGV from remote target...
Reading symbols from target:/data/data/berserker.android.apps.sshdroid/home/ndk-r14-clang-SIGSEGV...done.
Reading /system/bin/linker64 from remote target...
Reading /system/bin/linker64 from remote target...
Reading symbols from target:/system/bin/linker64...(no debugging symbols found)...done.
0x0000007fb7fde694 in __dl__start () from target:/system/bin/linker64
(gdb) b main.cpp:19
Breakpoint 1 at 0x5566328c64: file ./main.cpp, line 19.
(gdb) continue
Continuing.
warning: Could not load shared library symbols for libc.so.
Do you need "set solib-search-path" or "set sysroot"?
warning: Could not load shared library symbols for libstdc++.so.
Do you need "set solib-search-path" or "set sysroot"?
warning: Could not load shared library symbols for libm.so.
Do you need "set solib-search-path" or "set sysroot"?
warning: Could not load shared library symbols for libnetd_client.so.
Do you need "set solib-search-path" or "set sysroot"?

Breakpoint 1, main (argc=1, argv=0x7ffffff958) at ./main.cpp:19
19        bar(NULL, -1, NULL, 0);
(gdb) info sharedlibrary
From                To                  Syms Read   Shared Object Library
0x0000007fb7fde540  0x0000007fb7fe868c  Yes (*)     target:/system/bin/linker64
                                        No          libc.so
                                        No          libstdc++.so
                                        No          libm.so
                                        No          libnetd_client.so
0x0000007fb7e9b600  0x0000007fb7e9b724  Yes         /Users/greedo/ndk-r14-clang-SIGSEGV/obj/local/arm64-v8a/lib-ndk-r14-clang-SIGSEGV.so
(*): Shared library is missing debugging information.                                                                                                                      [16/67]
(gdb) b lib.c:17
Breakpoint 2 at 0x7fb7e9b6f8: file ./lib.c, line 17.
(gdb) continue
Continuing.

Breakpoint 2, bar (p=0x0, i=-1, q=0x0, j=0) at ./lib.c:17
17          return foo(p, i);
(gdb) disassemble
Dump of assembler code for function bar:
   0x0000007fb7e9b69c <+0>:     sub     sp, sp, #0x40
   0x0000007fb7e9b6a0 <+4>:     stp     x29, x30, [sp,#48]
   0x0000007fb7e9b6a4 <+8>:     add     x29, sp, #0x30
   0x0000007fb7e9b6a8 <+12>:    stur    x0, [x29,#-16]
   0x0000007fb7e9b6ac <+16>:    stur    w1, [x29,#-20]
   0x0000007fb7e9b6b0 <+20>:    str     x2, [sp,#16]
   0x0000007fb7e9b6b4 <+24>:    str     w3, [sp,#12]
   0x0000007fb7e9b6b8 <+28>:    adrp    x0, 0x7fb7eab000
   0x0000007fb7e9b6bc <+32>:    ldr     x0, [x0,#4088]
   0x0000007fb7e9b6c0 <+36>:    add     x0, x0, #0x130
   0x0000007fb7e9b6c4 <+40>:    adrp    x1, 0x7fb7e9b000
   0x0000007fb7e9b6c8 <+44>:    add     x1, x1, #0x730
   0x0000007fb7e9b6cc <+48>:    bl      0x7fb7e9b5e0 <fprintf@plt>
   0x0000007fb7e9b6d0 <+52>:    adrp    x1, 0x7fb7eab000
   0x0000007fb7e9b6d4 <+56>:    ldr     x1, [x1,#4088]
   0x0000007fb7e9b6d8 <+60>:    mov     x2, #0x130                      // #304
   0x0000007fb7e9b6dc <+64>:    add     x1, x1, x2
   0x0000007fb7e9b6e0 <+68>:    str     w0, [sp,#8]
   0x0000007fb7e9b6e4 <+72>:    mov     x0, x1
   0x0000007fb7e9b6e8 <+76>:    bl      0x7fb7e9b5c0 <fflush@plt>
   0x0000007fb7e9b6ec <+80>:    ldr     w3, [sp,#12]
   0x0000007fb7e9b6f0 <+84>:    str     w0, [sp,#4]
   0x0000007fb7e9b6f4 <+88>:    cbnz    w3, 0x7fb7e9b70c <bar+112>
---Type <return> to continue, or q <return> to quit---
=> 0x0000007fb7e9b6f8 <+92>:    ldur    x0, [x29,#-16]
   0x0000007fb7e9b6fc <+96>:    ldur    w1, [x29,#-20]
   0x0000007fb7e9b700 <+100>:   bl      0x7fb7e9b5d0 <foo@plt>
   0x0000007fb7e9b704 <+104>:   stur    x0, [x29,#-8]
   0x0000007fb7e9b708 <+108>:   b       0x7fb7e9b714 <bar+120>
   0x0000007fb7e9b70c <+112>:   mov     x8, xzr
   0x0000007fb7e9b710 <+116>:   stur    x8, [x29,#-8]
   0x0000007fb7e9b714 <+120>:   ldur    x0, [x29,#-8]
   0x0000007fb7e9b718 <+124>:   ldp     x29, x30, [sp,#48]
   0x0000007fb7e9b71c <+128>:   add     sp, sp, #0x40
   0x0000007fb7e9b720 <+132>:   ret
End of assembler dump.
(gdb) step

Program received signal SIGSEGV, Segmentation fault.
0x00000055663c6160 in foo ()
(gdb) disassemble
Dump of assembler code for function foo:
=> 0x00000055663c6160 <+0>:     tbnz    x28, #61, 0x55663c9824 <_ZL16emergency_buffer+13588>
   0x00000055663c6164 <+4>:     .inst   0x0000007f ; undefined
End of assembler dump.
(gdb) continue
Continuing.

Program received signal SIGSEGV, Segmentation fault.
0x00000055663c6160 in foo ()
(gdb) continue
Continuing.

Program terminated with signal SIGSEGV, Segmentation fault.
The program no longer exists.
/data/data/berserker.android.apps.sshdroid/home/ndk-r14-clang-SIGSEGV: No such file or directory.
(gdb)
@gpakosz
Copy link
Author

gpakosz commented Apr 11, 2017

I can workaround the problem by introducing an intermediate foo_() function which both foo() and bar() call but well...

#include <stdio.h>

static void* foo_(void* p, int i)
{
  fprintf(stderr, "inside foo\n");
  fflush(stderr);

  return NULL;
}

void* foo(void* p, int i)
{
  return foo_(p, i);
}

void* bar(void* p, int i, const void* q, int j)
{
  fprintf(stderr, "inside bar\n");
  fflush(stderr);

  if (j == 0)
    return foo_(p, i);

  return NULL;
}

@gpakosz
Copy link
Author

gpakosz commented Apr 12, 2017

For those wondering what's the code within ndk-r14-clang-SIGSEGV.zip, 3 files: lib.c, test.cpp, and main.cpp.

As you can see below, I believe this qualifies as a minimal repro case since it all looks fairly trivial.


lib.c

#include <stdio.h>

void* foo(void* p, int i)
{
  fprintf(stderr, "inside foo\n");
  fflush(stderr);

  return NULL;
}

void* bar(void* p, int i, const void* q, int j)
{
  fprintf(stderr, "inside bar\n");
  fflush(stderr);

  if (j == 0)
    return foo(p, i);

  return NULL;
}

test.cpp

extern "C" void* (*foo)(void* p, int i);
extern "C" void* (*bar)(void* p, int i, const void* q, int j);

#include <gtest/gtest.h>

TEST(FooTest, foo)
{
  foo(NULL, -1);
}

main.cpp

#include <stdio.h>
#include <dlfcn.h>

void* (*foo)(void* p, int i);
void* (*bar)(void* p, int i, const void* q, int j);

int main(int argc, char **argv)
{
  void* h = dlopen("lib-ndk-r14-clang-SIGSEGV.so", RTLD_NOW);
  if (!h)
    return -1;

  void** p;
  p = (void**)&foo;
  *p = dlsym(h, "foo");
  p = (void**)&bar;
  *p = dlsym(h, "bar");

  bar(NULL, -1, NULL, 0); // <-- crashes here with SIGSEGV after having jumped in the void

  fprintf(stderr, "done\n");
  fflush(stderr);

  return 0;
}

@gpakosz
Copy link
Author

gpakosz commented May 18, 2017

Hello there,

Any news? FYI, I can reproduce the segmentation fault with Android NDK r15 beta1 and beta2.

@gpakosz
Copy link
Author

gpakosz commented Jun 1, 2017

Also I would have expected this would be added to the r15 milestone.

@DanAlbert DanAlbert added this to the r16 milestone Jun 1, 2017
@DanAlbert
Copy link
Member

Sorry about this. We were expecting this to be gone with the update to the new Clang in r15 beta 2, and I didn't see your more recent message until now :(

I think it's too late to do anything about r15, but @pirama-arumuga-nainar says he'll take a look at this today.

@gpakosz
Copy link
Author

gpakosz commented Jun 1, 2017

@DanAlbert Thank you for the follow up, appreciated.

@pirama-arumuga-nainar Feel free to ping me would you need more details.

@gburgessiv
Copy link
Collaborator

In a (slight) change of plans, I'll be looking into this.

I have a repro on my local device, so hopefully I'll be able to pin it down. :)

@gpakosz
Copy link
Author

gpakosz commented Jun 1, 2017

@gburgessiv Nice! If you ever end up opening a bug on LLVM's side, can you please link to it? I'm eager to learn more.

Also, being unfamiliar with clang and clang++ themselves, I wasn't sure how to proceed and track the bug further :/ On that front I'm also very willing to learn more about how you do it (but I also realize that might require a lengthy post and that takes even more time and energy).

@gburgessiv
Copy link
Collaborator

gburgessiv commented Jun 2, 2017

@gpakosz - As you can probably guess, it boils down to "make the test-case as small as possible, and stare at what the differences are." :) I've heavily used the NDK's objdump and nm paired with vimdiff to find/stare at differences.


Some random thoughts/observations:

Workaround: rename foo in your executable or library to anything else, problem vanishes :)

This doesn't repro if I stick this in aosp and do mma. Though, it looks like that's for a different reason than the one that's causing ndk-GCC to work and ndk-clang to break. In particular, the platform build doesn't have foo in the executable's .dynamic section. Both ndk-gcc and ndk-clang put foo in .dynamic.

I suspect that this is caused by a difference in how clang and GCC invoke the linker. If we do a standard ndk-build with clang, but nuke the .so and relink that with GCC, the issue disappears. A significant difference between the two .sos is that the .so linked by GCC calls foo in lib.c directly, whereas clang's linker invocation wires it up to call foo@plt.

So, I'm going to dig in to how clang and GCC invoke the linker to see what's different there.

(In any case, I want to say "this is an ODR violation, so the behavior is undefined," but I'm not familiar enough with dynamic linking to know how well dlopen/dlsym/etc are meant to play with differing symbols (one in .bss, one in .text) that have the same name.)

repro I'm working with:

main.cpp:

#include <stdio.h>
#include <dlfcn.h>

extern void (*foo)();

int main(int, char **)
{
  void* h = dlopen("lib-ndk-r14-clang-SIGSEGV.so", RTLD_NOW);
  if (!h)
    return -1;
  void (*bar_ptr)() = (__typeof__(bar_ptr))dlsym(h, "bar");
  bar_ptr();

  fprintf(stderr, "done\n");
  fflush(stderr);

  *(void **)&foo = (void *)0;
  return 0;
}

test.cpp:

void (*foo)();

lib.c:

#include <stdio.h>

volatile int lib_j;

void foo(void)
{
  fprintf(stderr, "inside foo\n");
  fflush(stderr);
}

void bar()
{
  fprintf(stderr, "inside bar\n");
  fflush(stderr);

  if (lib_j == 0)
    return foo();
}

@stephenhines
Copy link
Collaborator

Dimitry, can you comment on the Android loading aspects that George just mentioned?

@dimitry-
Copy link
Contributor

dimitry- commented Jun 2, 2017

From the loading perspective the symbol resolution is very simple. The de-facto rule which is followed by ld-linux.so, as well as ld-android.so is that while linking a library symbols are is looked up

  1. in main executable
  2. ld-preloaded libraries.
  3. any libraries loaded with RTLD_GLOBAL flag before 'this' library.
  4. The local group (the local group is the breadth-first transitive closure of the library and its DT_NEEDED libraries)

If I understand the example correctly - the main executable is exporting foo symbol which getting substituted while loading lib-ndk-r14-clang-SIGSEGV and call from bar() results in branching to 0?

If this is the case - it works as expected. the foo symbol gets intercepted by the main executable.

Another way to fix this would be to make foo symbol in the main executable hidden.

@gpakosz
Copy link
Author

gpakosz commented Jun 2, 2017

The library is supposed to export both foo() and bar() and within the library bar() may call foo() depending on its parameters.

The executable is supposed to import both foo() and bar() with dlsym() and is supposed to be able to call either of them at will.

I didn't try @gburgessiv's repro yet. It looks different enough from the intent mention just above though.

@gpakosz
Copy link
Author

gpakosz commented Jun 2, 2017

Also in my initial repro case, something worth maybe noticing: if I comment the foo(NULL, -1) call then problem vanishes

@gburgessiv
Copy link
Collaborator

Also in my initial repro case, something worth noticing: if I comment the foo(NULL, -1) call then problem vanishes

Right. That's why I have the foo = 0 craziness at the end of my repro. ;) We need to reference the executable's foo from the file that it's not defined in in order to reproduce this.

But the goal is for the library to only call the foo that resides in the library, yes? Going off of what @dimitry- said, it sounds like the library is supposed[1] to call the executable's foo.

If I'm interpreting your goal correctly: looking at #21, it seems that GCC passes -Bsymbolic to the linker by default, but clang does not. -Bsymbolic is apparently what you want if you'd like all calls to foo in your shared library to only go to the shared library's foo. Can you try adding LOCAL_LDFLAGS := -Wl,-Bsymbolic before include $(BUILD_SHARED_LIBRARY) in your Android.mk and seeing if that fixes it? That makes the issue go away for me.

[1] - (I say "supposed", since the moment any optimization gets turned on, the library's foo gets inlined into bar by both GCC and clang. This seems bad if the user is expecting foo to point to another definition...)

@pirama-arumuga-nainar
Copy link
Collaborator

Looks like GCC adds -Bsymbolic just for Android: https://gcc.gnu.org/ml/gcc/2013-04/msg00013.html. I couldn't find any justification other than that the time spent in loading is reduced.

Clang also added this until 2014 when Evgenii turned it off: http://lists.llvm.org/pipermail/cfe-commits/Week-of-Mon-20140714/110155.html.

From what I can understand, -Bsymbolic violates ODR and can lead to unintended consequences as seen in this case. @gpakosz: if the intention is indeed to have bar() link against the local foo() ahead of any other foo(), consider using a linker script.

@DanAlbert
Copy link
Member

-Bsymbolic also prevents new/delete replacements from working as specified by the standard, and prevents ASAN from working (the latter being the reason we got rid of it). That change never made it to GCC though.

@gpakosz
Copy link
Author

gpakosz commented Jun 3, 2017

But the goal is for the library to only call the foo that resides in the library, yes? Going off of what @dimitry- said, it sounds like the library is supposed[1] to call the executable's foo.

In the library, I expect bar() to call foo() from the same library. I expected this would be the only acceptable thing to do since foo() is defined in the same compilation unit as bar(), just hundreds of lines above.

since the moment any optimization gets turned on, the library's foo gets inlined into bar by both GCC and clang. This seems bad if the user is expecting foo to point to another definition...)

Again I don't expect bar() to call a foo() from anywhere else than bar()'s compilation unit since foo() is defined just before bar(). Worth noting, in my real use case, foo() is long/complex enough to not be inlined. And in fact if it was inlined, shouldn't we stop seeing the SIGSEGV we're discussing?

if the intention is indeed to have bar() link against the local foo() ahead of any other foo(), consider using a linker script.

In fact I'm already using a linker script and foo() and bar() are both legitimate public symbols of the library. For the sake of clarity, let's stop using foo() and bar() and switch to names that are closer to reality:

   void* foo() --> X* create();
   void* bar(int i) --> X* createEx(int i);

create() is semantically a default construction/factory function while createEx(int i) is an overloaded construction/factory function that takes a parameter.

From the user's point of view, calling createEx(0) is equivalent to calling create().

From the implementor's point of view, at the beginning of createEx() we decided that if i == 0 we just defer to create(). This is this call to create() from createEx() that causes troubles.

The workaround I found consists in doing:

static X* createImpl()
{
  ...
};

X* create()
{
  return createImpl();
}

X* createEx(int i)
{
  if (i == 0)
    return createImpl();

  ...
}

I still don't understand how one would find it expected that createEx calls into the executable that links the library. In fact, the said executable doesn't have its own definition for X* create(). The executable imports both X* create() and X* createEx(int) with dlsym().

About -Bsymbolic being added by GCC on Android only, I don't know what to say. The situation I described below works on Linux for the x86, x86_64, armv7 and arm64 architectures when compiled with either GCC or Clang.

Finally, can you elaborate on ODR violation you're seeing? From my understanding, foo()/create() is defined once in the same .c as bar()/createEx(). The executable doesn't have a function definition, only a pointer to a function that gets assigned with the return value of dlsym().

@DanAlbert
Copy link
Member

In the library, I expect bar() to call foo() from the same library. I expected this would be the only acceptable thing to do since foo() is defined in the same compilation unit as bar(), just hundreds of lines above.

That's a very reasonable assumption, but that's not how dynamic linking works :) -Bsymbolic makes the linker behave that way, but it isn't the default because it inhibits behaviors like new/delete replacement and ASAN.

From my understanding, foo()/create() is defined once in the same .c as bar()/createEx().

Once per program. This includes all linked libraries and the executable.

N3690 3.2.4:

Every program shall contain exactly one definition of every non-inline function or variable that is odr-used in that program; no diagnostic required. The definition can appear explicitly in the program, it can be found in the standard or a user-defined library, or (when appropriate) it is implicitly defined (see 12.1, 12.4 and 12.8). An inline function shall be defined in every translation unit in which it is odr-used.

@gpakosz
Copy link
Author

gpakosz commented Jun 3, 2017

Do you mean in main.cpp

void* (*foo)(void* p, int i);

counts as a definition for function foo()?

@gpakosz
Copy link
Author

gpakosz commented Jun 3, 2017

Can you try adding LOCAL_LDFLAGS := -Wl,-Bsymbolic before include $(BUILD_SHARED_LIBRARY) in your Android.mk and seeing if that fixes it? That makes the issue go away for me.

Note that I changed the repro a bit to reduce executable size and have faster scp round trips to my device.

lib.c is unchanged and main.cpp is now calling a baz() function defined in test.cpp


lib.c

#include <stdio.h>

void* foo(void* p, int i)
{
  fprintf(stderr, "inside foo\n");
  fflush(stderr);

  return NULL;
}

void* bar(void* p, int i, const void* q, int j)
{
  fprintf(stderr, "inside bar\n");
  fflush(stderr);

  if (j == 0)
    return foo(p, i);

  return NULL;
}

test.cpp

extern "C" void* (*foo)(void* p, int i);
extern "C" void* (*bar)(void* p, int i, const void* q, int j);

#include <stdio.h>

extern "C" {

void baz()
{
  fprintf(stderr, "inside baz\n");
  fflush(stderr);

  foo(0, -1);
}

main.cpp

#include <stdio.h>
#include <dlfcn.h>

void* (*foo)(void* p, int i);
void* (*bar)(void* p, int i, const void* q, int j);

extern "C" void baz();

int main(int argc, char **argv)
{
  void* h = dlopen("lib-ndk-r14-clang-SIGSEGV.so", RTLD_NOW);
  if (!h)
    return -1;

  void** p;
  p = (void**)&foo;
  *p = dlsym(h, "foo");
  p = (void**)&bar;
  *p = dlsym(h, "bar");

  bar(NULL, -1, NULL, 0);
  baz();

  fprintf(stderr, "done\n");
  fflush(stderr);

  return 0;
}

And here is what I'm getting:

  • without -Wl,-Bsymbolic → Segmentation fault
  • with -Wl,-Bsymbolic → ok
  • without -Wl,-Bsymbolic, commenting out the call to baz() in main.cpp → ok
  • with -Wl,-Bsymbolic, commenting out the call to baz() in main.cpp → ok
  • without -Wl,-Bsymbolic, commenting out the call to foo() in test.cpp → ok
  • with -Wl,-Bsymbolic, commenting out the call to foo() in test.cpp→ ok

I still don't get how bringing in test.cpp by calling baz() makes the compiler decide/believe bar() should call the executable's foo() which I don't understand the existence to begin with :(

@DanAlbert
Copy link
Member

Do you mean in main.cpp

void* (*foo)(void* p, int i);

counts as a definition for function foo()?

In practice all that matters is that it counts as a definition for foo. The type of the thing doesn't change whether it's an ODR violation from a linking standpoint aiui.

@gpakosz
Copy link
Author

gpakosz commented Jun 3, 2017

One more thing, why isn't there an ODR violation when removing test.cpp from the puzzle and making main.cpp call both foo() and bar()?

#include <stdio.h>
#include <dlfcn.h>

void* (*foo)(void* p, int i);
void* (*bar)(void* p, int i, const void* q, int j);

int main(int argc, char **argv)
{
  void* h = dlopen("lib-ndk-r14-clang-SIGSEGV.so", RTLD_NOW);
  if (!h)
    return -1;

  void** p;
  p = (void**)&foo;
  *p = dlsym(h, "foo");
  p = (void**)&bar;
  *p = dlsym(h, "bar");

  foo(NULL, -1);
  bar(NULL, -1, NULL, 0);

  fprintf(stderr, "done\n");
  fflush(stderr);

  return 0;
}

Which corresponds to @gburgessiv's comment

Right. That's why I have the foo = 0 craziness at the end of my repro. ;) We need to reference the executable's foo from the file that it's not defined in in order to reproduce this.

Is the answer to this question "undefined behavior doesn't necessarily bite you"?

@gburgessiv
Copy link
Collaborator

gburgessiv commented Jun 5, 2017

Is the answer to this question "undefined behavior doesn't necessarily bite you"?

My money's on that, yes. foo is an external symbol in both the library and the executable, so I'd imagine that http://eel.is/c++draft/basic.link#9.1 from the C++ standard would apply in addition to @DanAlbert's note. (Though I'm not exactly a language lawyer ;) )

As for why it doesn't bite you in the single file case, you might find ld's --export-dynamic option interesting: http://man7.org/linux/man-pages/man1/ld.1.html . Indeed, if I add LOCAL_LDFLAGS := -Wl,-E before BUILD_EXECUTABLE line in Android.mk, the single file repro segfaults.

@gpakosz
Copy link
Author

gpakosz commented Jun 5, 2017

Still, in the case that SIGSEGV. Why does bar() calling back into executable's foo() fail when at the time of the call foo() has already been initialized with dlsym()?

@gburgessiv
Copy link
Collaborator

Closing this because I think we have a resolution.

Why does bar() calling back into executable's foo() fail when at the time of the call foo() has already been initialized with dlsym()?

The types don't match up.

In the segfaulting binary, the table that says -- at runtime -- "here's the offset to get to foo," stores information about where to find a function pointer. The code generated to call foo in the .so is expecting that to lead to a function body instead.

So, we end up jumping to &foo, rather than jumping to *foo (where both foos are located in the main executable). Looking at a run in gdb right after the crash:

(gdb) i stack
#0  0x0000005555666160 in foo ()
#1  0x0000007fb7632760 in bar () from target:/system/lib64/lib-ndk-r14-clang-SIGSEGV.so
#2  0x00000055555990cc in main (argc=1, argv=0x7ffffff9c8) at ./main.cpp:19
(gdb) p &foo
$3 = (void *(**)(void *, int)) 0x5555666160 <foo>
(gdb) info symbol 0x5555666160
foo in section .bss of /tmp/ndk_bug/try2/ndk-r14-clang-SIGSEGV/obj/local/arm64-v8a/ndk-r14-clang-SIGSEGV
(gdb) p *foo
$14 = {void *(void *, int)} 0x7fb76326c0 <foo>
(gdb) info symbol 0x7fb76326c0
foo in section .text of target:/data/local/tmp/ndk_segv/lib-ndk-r14-clang-SIGSEGV.so

@gpakosz
Copy link
Author

gpakosz commented Jun 6, 2017

Thank you all for putting time into this. I appreciated the discussion and learning from you.

@gburgessiv
Copy link
Collaborator

Happy to help. :) Thanks for the report!

@gpakosz
Copy link
Author

gpakosz commented Jun 8, 2017

BTW, doesn't our discussion imply that every single public library, unless linked with -Bsymbolic, is subject to ODR violation whenever it's loaded dynamically with dlopen() and dlsym()?

@stephenhines
Copy link
Collaborator

It's only subject to ODR violations if they reuse the symbol names from one library (i.e. have a second definition). If you are just using shared header files and differently named function pointers, then there are no issues.

@gpakosz
Copy link
Author

gpakosz commented Jun 8, 2017

Indeed my expectation is that people would define function pointers with same names as library functions in order to have common code except a piece of init code that does dlopen() + several dlsym() calls.

@stephenhines
Copy link
Collaborator

Most dlsym usage I have seen puts the function addresses into a structure of pointers rather than just re-exporting the raw symbol (which leads to ODR issues as seen here).

@gpakosz
Copy link
Author

gpakosz commented Jun 8, 2017

I see. Thank you for the follow up

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
Projects
None yet
Development

No branches or pull requests

6 participants