I am using an Altera Cyclone V FPGA with an ARMv7 core,
and I am running an application with 7 threads that use mutexes.
The application randomly hangs after an hour, a day, or a month; there is no defined interval.
I ran strace while the application was running smoothly, and it gives:
---------------------------------------------------------------------------------------------------------
-------------------- RUNNING / HEALTHY STATE
---------------------------------------------------------------------------------------------------------
root@socfpga:~# strace -p 297 -f
Process 297 attached with 7 threads
[pid 311] recvfrom(6, <unfinished ...>
[pid 297] nanosleep({0, 10000000}, <unfinished ...>
[pid 340] nanosleep({0, 500000000}, <unfinished ...>
......
[pid 339] <... nanosleep resumed> NULL) = 0
[pid 339] nanosleep({0, 500000000}, <unfinished ...>
[pid 297] <... nanosleep resumed> NULL) = 0
[pid 297] nanosleep({0, 10000000}, NULL) = 0
[pid 297] nanosleep({0, 10000000}, NULL) = ? ERESTART_RESTARTBLOCK (Interrupted by signal)
[pid 297] --- SIGRT_2 {si_signo=SIGRT_2, si_code=SI_TIMER, si_pid=0, si_uid=0, si_value=209660} ---
[pid 297] rt_sigreturn() = -1 EINTR (Interrupted system call)
[pid 297] nanosleep({0, 10000000}, NULL) = 0
[pid 297] nanosleep({0, 10000000}, <unfinished ...>
[pid 340] <... nanosleep resumed> NULL) = 0
[pid 340] nanosleep({0, 500000000}, <unfinished ...>
[pid 297] <... nanosleep resumed> NULL) = 0
[pid 297] nanosleep({0, 10000000}, NULL) = 0
[pid 297] nanosleep({0, 10000000}, <unfinished ...>
[pid 339] <... nanosleep resumed> NULL) = 0
[pid 339] nanosleep({0, 500000000}, <unfinished ...>
[pid 297] <... nanosleep resumed> NULL) = 0
[pid 297] nanosleep({0, 10000000}, NULL) = 0
[pid 297] nanosleep({0, 10000000}, NULL) = ? ERESTART_RESTARTBLOCK (Interrupted by signal)
[pid 297] --- SIGRT_2 {si_signo=SIGRT_2, si_code=SI_TIMER, si_pid=0, si_uid=0, si_value=209660} ---
[pid 297] rt_sigreturn() = -1 EINTR (Interrupted system call)
[pid 297] nanosleep({0, 10000000}, NULL) = 0
.......
[pid 297] nanosleep({0, 10000000}, NULL) = ? ERESTART_RESTARTBLOCK (Interrupted by signal)
[pid 297] --- SIGRT_2 {si_signo=SIGRT_2, si_code=SI_TIMER, si_pid=0, si_uid=0, si_value=209660} ---
[pid 297] gettimeofday({1495402377, 473913}, NULL) = 0
[pid 297] write(3, "20170521 21:32:57.473 INFO d"..., 100) = 100
[pid 297] rt_sigreturn() = -1 EINTR (Interrupted system call)
[pid 297] gettimeofday({1495402377, 474831}, NULL) = 0
[pid 297] write(3, "20170521 21:32:57.474 ERROR d"..., 110) = 110
[pid 297] nanosleep({0, 10000000}, NULL) = 0
[pid 297] nanosleep({0, 10000000}, <unfinished ...>
[pid 340] <... nanosleep resumed> NULL) = 0
[pid 340] nanosleep({0, 500000000}, <unfinished ...>
[pid 297] <... nanosleep resumed> NULL) = 0
[pid 297] nanosleep({0, 10000000}, NULL) = 0
[pid 297] nanosleep({0, 10000000}, ^CProcess 297 detached
<detached ...>
Process 309 detached
Process 310 detached
Process 311 detached
Process 312 detached
Process 339 detached
Process 340 detached
Randomly, the application hangs, and then the strace output is as follows:
---------------------------------------------------------------------------------------------------------
-------------------- HANG STATE
---------------------------------------------------------------------------------------------------------
root@socfpga:~# strace -p 297
Process 297 attached
futex(0x1eba830, FUTEX_WAIT_PRIVATE, 2, NULL^CProcess 297 detached
<detached ...>
root@socfpga:~# strace -p 297 -f
Process 297 attached with 7 threads
[pid 312] futex(0x1eba830, FUTEX_WAIT_PRIVATE, 2, NULL <unfinished ...>
[pid 340] futex(0x1eba830, FUTEX_WAIT_PRIVATE, 2, NULL <unfinished ...>
[pid 339] futex(0x1eba830, FUTEX_WAIT_PRIVATE, 2, NULL <unfinished ...>
[pid 311] futex(0x1eba830, FUTEX_WAIT_PRIVATE, 2, NULL <unfinished ...>
[pid 310] futex(0x1eba830, FUTEX_WAIT_PRIVATE, 2, NULL <unfinished ...>
[pid 309] futex(0x1eba830, FUTEX_WAIT_PRIVATE, 2, NULL <unfinished ...>
[pid 297] futex(0x1eba830, FUTEX_WAIT_PRIVATE, 2, NULL <unfinished ...>
[pid 312] <... futex resumed> ) = ? ERESTARTSYS (To be restarted if SA_RESTART is set)
[pid 312] --- SIGRT_2 {si_signo=SIGRT_2, si_code=SI_TIMER, si_pid=0, si_uid=0, si_value=209660} ---
[pid 312] rt_sigreturn() = -1 EINTR (Interrupted system call)
[pid 312] futex(0x1eba830, FUTEX_WAIT_PRIVATE, 2, NULL) = ? ERESTARTSYS (To be restarted if SA_RESTART is set)
[pid 312] --- SIGRT_2 {si_signo=SIGRT_2, si_code=SI_TIMER, si_pid=0, si_uid=0, si_value=209660} ---
[pid 312] rt_sigreturn() = -1 EINTR (Interrupted system call)
......
[pid 312] futex(0x1eba830, FUTEX_WAIT_PRIVATE, 2, NULL) = ? ERESTARTSYS (To be restarted if SA_RESTART is set)
[pid 312] --- SIGRT_2 {si_signo=SIGRT_2, si_code=SI_TIMER, si_pid=0, si_uid=0, si_value=209660} ---
[pid 312] rt_sigreturn() = -1 EINTR (Interrupted system call)
[pid 312] futex(0x1eba830, FUTEX_WAIT_PRIVATE, 2, NULL^CProcess 297 detached
Process 309 detached
Process 310 detached
Process 311 detached
Process 312 detached
<detached ...>
Process 339 detached
Process 340 detached
root@socfpga:~# strace -p 310
Process 310 attached
futex(0x1eba830, FUTEX_WAIT_PRIVATE, 2, NULL^CProcess 310 detached
<detached ...>
root@socfpga:~# strace -p 311
Process 311 attached
futex(0x1eba830, FUTEX_WAIT_PRIVATE, 2, NULL^CProcess 311 detached
<detached ...>
root@socfpga:~# strace -p 312
Process 312 attached
futex(0x1eba830, FUTEX_WAIT_PRIVATE, 2, NULL) = ? ERESTARTSYS (To be restarted if SA_RESTART is set)
--- SIGRT_2 {si_signo=SIGRT_2, si_code=SI_TIMER, si_pid=0, si_uid=0, si_value=209660} ---
rt_sigreturn() = -1 EINTR (Interrupted system call)
futex(0x1eba830, FUTEX_WAIT_PRIVATE, 2, NULL) = ? ERESTARTSYS (To be restarted if SA_RESTART is set)
--- SIGRT_2 {si_signo=SIGRT_2, si_code=SI_TIMER, si_pid=0, si_uid=0, si_value=209660} ---
rt_sigreturn() = -1 EINTR (Interrupted system call)
......
futex(0x1eba830, FUTEX_WAIT_PRIVATE, 2, NULL) = ? ERESTARTSYS (To be restarted if SA_RESTART is set)
--- SIGRT_2 {si_signo=SIGRT_2, si_code=SI_TIMER, si_pid=0, si_uid=0, si_value=209660} ---
rt_sigreturn() = -1 EINTR (Interrupted system call)
futex(0x1eba830, FUTEX_WAIT_PRIVATE, 2, NULL^CProcess 312 detached
<detached ...>
root@socfpga:~# strace -p 339
Process 339 attached
futex(0x1eba830, FUTEX_WAIT_PRIVATE, 2, NULL^CProcess 339 detached
<detached ...>
root@socfpga:~# gdb -p 309
GNU gdb (Linaro GDB) 7.8-2014.09
Copyright (C) 2014 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law. Type "show copying"
and "show warranty" for details.
This GDB was configured as "arm-angstrom-linux-gnueabi".
Type "show configuration" for configuration details.
For bug reporting instructions, please see:
<http://bugs.linaro.org>.
Find the GDB manual and other documentation resources online at:
<http://www.gnu.org/software/gdb/documentation/>.
For help, type "help".
Type "apropos word" to search for commands related to "word".
Attaching to process 309
warning: process 309 is a cloned process
Reading symbols from /home/user/user/process_cc...done.
Reading symbols from /usr/lib/liblog4c.so.3...done.
Loaded symbols for /usr/lib/liblog4c.so.3
Reading symbols from /lib/libpthread.so.0...(no debugging symbols found)...done.
warning: Unable to find libthread_db matching inferior's thread library, thread debugging will not be available.
Loaded symbols for /lib/libpthread.so.0
Reading symbols from /lib/libm.so.6...(no debugging symbols found)...done.
Loaded symbols for /lib/libm.so.6
Reading symbols from /lib/librt.so.1...(no debugging symbols found)...done.
Loaded symbols for /lib/librt.so.1
Reading symbols from /lib/libc.so.6...(no debugging symbols found)...done.
Loaded symbols for /lib/libc.so.6
Reading symbols from /lib/ld-linux-armhf.so.3...(no debugging symbols found)...done.
Loaded symbols for /lib/ld-linux-armhf.so.3
warning: Unable to find libthread_db matching inferior's thread library, thread debugging will not be available.
0x76f3ed50 in ?? () from /lib/libpthread.so.0
(gdb) bt
#0 0x76f3ed50 in ?? () from /lib/libpthread.so.0
#1 0x00000000 in ?? ()
Backtrace stopped: previous frame identical to this frame (corrupt stack?)
(gdb) quit
A debugging session is active.
Inferior 1 [process 309] will be detached.
Quit anyway? (y or n) y
Detaching from program: /home/user/user/process_cc, process 309
root@socfpga:~# gdb -p 310
......
Attaching to process 310
warning: process 310 is a cloned process
......
0x76f3ed50 in ?? () from /lib/libpthread.so.0
(gdb) bt
#0 0x76f3ed50 in ?? () from /lib/libpthread.so.0
#1 0x00000000 in ?? ()
Backtrace stopped: previous frame identical to this frame (corrupt stack?)
(gdb) quit
A debugging session is active.
Inferior 1 [process 310] will be detached.
Quit anyway? (y or n) y
Detaching from program: /home/user/user/process_cc, process 310
Memory layout:
Start Addr End Addr Size Offset objfile
0x10000 0x23000 0x13000 0x0 /home/user/user/process_cc
0x33000 0x34000 0x1000 0x13000 /home/user/user/process_cc
0x34000 0x208000 0x1d4000 0x0
0x1eba000 0x1edb000 0x21000 0x0 [heap]
0x73800000 0x73801000 0x1000 0x0
0x73801000 0x74000000 0x7ff000 0x0 [stack:340]
0x74000000 0x74001000 0x1000 0x0
0x74001000 0x74800000 0x7ff000 0x0 [stack:339]
0x74800000 0x74821000 0x21000 0x0
0x74821000 0x74900000 0xdf000 0x0
0x74900000 0x74921000 0x21000 0x0
0x74921000 0x74a00000 0xdf000 0x0
0x74a00000 0x74a21000 0x21000 0x0
0x74a21000 0x74b00000 0xdf000 0x0
0x74b64000 0x74b65000 0x1000 0x0
0x74b65000 0x75364000 0x7ff000 0x0 [stack:312]
0x75364000 0x75365000 0x1000 0x0
0x75365000 0x75b64000 0x7ff000 0x0 [stack:311]
0x75b64000 0x75b65000 0x1000 0x0
0x75b65000 0x76364000 0x7ff000 0x0 [stack:310]
0x76364000 0x76365000 0x1000 0x0
0x76365000 0x76b64000 0x7ff000 0x0 [stack:309]
0x76b64000 0x76d64000 0x200000 0xff200000 /dev/mem
0x76d64000 0x76e89000 0x125000 0x0 /lib/libc-2.20.so
0x76e89000 0x76e99000 0x10000 0x125000 /lib/libc-2.20.so
0x76e99000 0x76e9b000 0x2000 0x125000 /lib/libc-2.20.so
0x76e9b000 0x76e9c000 0x1000 0x127000 /lib/libc-2.20.so
0x76e9c000 0x76e9f000 0x3000 0x0
0x76e9f000 0x76ea5000 0x6000 0x0 /lib/librt-2.20.so
0x76ea5000 0x76eb4000 0xf000 0x6000 /lib/librt-2.20.so
0x76eb4000 0x76eb5000 0x1000 0x5000 /lib/librt-2.20.so
0x76eb5000 0x76eb6000 0x1000 0x6000 /lib/librt-2.20.so
0x76eb6000 0x76f1f000 0x69000 0x0 /lib/libm-2.20.so
0x76f1f000 0x76f2e000 0xf000 0x69000 /lib/libm-2.20.so
0x76f2e000 0x76f2f000 0x1000 0x68000 /lib/libm-2.20.so
0x76f2f000 0x76f30000 0x1000 0x69000 /lib/libm-2.20.so
0x76f30000 0x76f44000 0x14000 0x0 /lib/libpthread-2.20.so
0x76f44000 0x76f54000 0x10000 0x14000 /lib/libpthread-2.20.so
0x76f54000 0x76f55000 0x1000 0x14000 /lib/libpthread-2.20.so
0x76f55000 0x76f56000 0x1000 0x15000 /lib/libpthread-2.20.so
0x76f56000 0x76f58000 0x2000 0x0
0x76f58000 0x76f6e000 0x16000 0x0 /usr/lib/liblog4c.so.3
0x76f6e000 0x76f75000 0x7000 0x16000 /usr/lib/liblog4c.so.3
0x76f75000 0x76f77000 0x2000 0x15000 /usr/lib/liblog4c.so.3
0x76f77000 0x76f96000 0x1f000 0x0 /lib/ld-2.20.so
0x76f99000 0x76f9b000 0x2000 0x0
0x76fa4000 0x76fa5000 0x1000 0x0
0x76fa5000 0x76fa6000 0x1000 0x0 [sigpage]
0x76fa6000 0x76fa7000 0x1000 0x1f000 /lib/ld-2.20.so
0x76fa7000 0x76fa8000 0x1000 0x20000 /lib/ld-2.20.so
0x7e9f4000 0x7ea1a000 0x26000 0x0 [stack]
0xffff0000 0xffff1000 0x1000 0x0 [vectors]
Can anyone point out the reason for this behavior so that the issue can be resolved?
Based on the strace output, it looks like all 7 threads are waiting for the same mutex:
[pid 312] futex(0x1eba830, FUTEX_WAIT_PRIVATE, 2, NULL <unfinished ...>
[pid 340] futex(0x1eba830, FUTEX_WAIT_PRIVATE, 2, NULL <unfinished ...>
[pid 339] futex(0x1eba830, FUTEX_WAIT_PRIVATE, 2, NULL <unfinished ...>
[pid 311] futex(0x1eba830, FUTEX_WAIT_PRIVATE, 2, NULL <unfinished ...>
[pid 310] futex(0x1eba830, FUTEX_WAIT_PRIVATE, 2, NULL <unfinished ...>
[pid 309] futex(0x1eba830, FUTEX_WAIT_PRIVATE, 2, NULL <unfinished ...>
[pid 297] futex(0x1eba830, FUTEX_WAIT_PRIVATE, 2, NULL <unfinished ...>
Attaching to the process (not a specific thread) with gdb and executing thread apply all bt would be a good start to identify which thread holds the mutex the others are waiting for. If it is not obvious from the output:
1. Switch to one of the waiting threads (t <gdb-thread-id>) and select the frame just before the pthread_mutex_lock call (f <frame-id>; assuming pthread mutexes are used).
2. Determine the owner by executing print <pthread_mutex_t-ptr>->__data.__owner. If gdb cannot resolve __data.__owner, identify the owner via print *((int*)(<pthread_mutex_t-ptr>)+2) instead.
3. Search for that ID in the info threads output (or in the output of the initial command) to identify the owning thread, then inspect its stack trace (t <gdb-thread-id> and bt).
Be sure to create a core dump for later analysis by executing generate-core-file (alias gcore) in gdb; it can be reopened later with gdb <executable> <core-dump>.
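For example, a minimal session along these lines (a sketch only: the thread and frame numbers are hypothetical, and the mutex address 0x1eba830 is the one from your strace output):
root@socfpga:~# gdb -p 297
(gdb) thread apply all bt
(gdb) t 2
(gdb) bt
(gdb) f 1
(gdb) print ((pthread_mutex_t *) 0x1eba830)->__data.__owner
(gdb) print *((int *) 0x1eba830 + 2)
(gdb) info threads
(gdb) generate-core-file
The __owner field holds the kernel thread ID, so the thread whose LWP number in info threads matches the printed value is the one currently holding the mutex.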
I'm trying to load data into and delete data from Cassandra using the Python driver. I have tried this both with Cassandra running in a Docker container and again locally, after the Docker version gave me problems. Here's an example of what I'm doing:
from cassandra import ConsistencyLevel
from cassandra.cluster import Cluster
from cassandra.concurrent import execute_concurrent_with_args

class Controller(object):
    def __init__(self):
        self.cluster = Cluster()
        self.session = self.cluster.connect('mykeyspace')

    def insert_into_cassandra(self, fname):
        query = 'INSERT INTO mytable (mykey, indexed_key) VALUES (?, ?)'
        prepared = self.session.prepare(query)
        prepared.consistency_level = ConsistencyLevel.QUORUM
        params_gen = self.params_generator(fname)  # row generator, defined elsewhere
        execute_concurrent_with_args(self.session, prepared, params_gen, concurrency=50)

    def delete_param_gen(self, results):
        for r in results:
            yield [r.mykey]

    def delete_by_index(self, value):
        query = "SELECT mykey from mytable where indexed_key = '%s'" % value
        res = self.session.execute(query)
        delete_query = "DELETE from mytable where mykey = ?"
        prepared = self.session.prepare(delete_query)
        prepared.consistency_level = ConsistencyLevel.QUORUM
        params_gen = self.delete_param_gen(res)
        execute_concurrent_with_args(self.session, prepared, params_gen, concurrency=50)
Nothing crazy. When loading/deleting data, I frequently see the following messages:
Sending options message heartbeat on idle connection (4422117360) 127.0.0.1
Heartbeat failed for connection (4422117360) to 127.0.0.1
Here are some logs from deleting data.
[2017-02-28 08:37:20,562] [DEBUG] [cassandra.connection] Defuncting connection (4422117360) to 127.0.0.1: errors=Connection heartbeat timeout after 30 seconds, last_host=127.0.0.1
[2017-02-28 08:37:20,563] [DEBUG] [cassandra.io.libevreactor] Closing connection (4422117360) to 127.0.0.1
[2017-02-28 08:37:20,563] [DEBUG] [cassandra.io.libevreactor] Closed socket to 127.0.0.1
[2017-02-28 08:37:20,564] [DEBUG] [cassandra.pool] Defunct or closed connection (4422117360) returned to pool, potentially marking host 127.0.0.1 as down
[2017-02-28 08:37:20,566] [DEBUG] [cassandra.pool] Replacing connection (4422117360) to 127.0.0.1
[2017-02-28 08:37:20,567] [DEBUG] [cassandra.connection] Defuncting connection (4426057600) to 127.0.0.1: errors=Connection heartbeat timeout after 30 seconds, last_host=127.0.0.1
[2017-02-28 08:37:20,567] [DEBUG] [cassandra.io.libevreactor] Closing connection (4426057600) to 127.0.0.1
[2017-02-28 08:37:20,567] [DEBUG] [cassandra.io.libevreactor] Closed socket to 127.0.0.1
[2017-02-28 08:37:20,568] [ERROR] [cassandra.cluster] Unexpected exception while handling result in ResponseFuture:
Traceback (most recent call last):
File "cassandra/cluster.py", line 3536, in cassandra.cluster.ResponseFuture._set_result (cassandra/cluster.c:67556)
File "cassandra/cluster.py", line 3711, in cassandra.cluster.ResponseFuture._set_final_result (cassandra/cluster.c:71769)
File "cassandra/concurrent.py", line 154, in cassandra.concurrent._ConcurrentExecutor._on_success (cassandra/concurrent.c:3357)
File "cassandra/concurrent.py", line 203, in cassandra.concurrent.ConcurrentExecutorListResults._put_result (cassandra/concurrent.c:5539)
File "cassandra/concurrent.py", line 209, in cassandra.concurrent.ConcurrentExecutorListResults._put_result (cassandra/concurrent.c:5427)
File "cassandra/concurrent.py", line 123, in cassandra.concurrent._ConcurrentExecutor._execute_next (cassandra/concurrent.c:2369)
File "load_cassandra.py", line 148, in delete_param_gen
for r in rows:
File "cassandra/cluster.py", line 3991, in cassandra.cluster.ResultSet.next (cassandra/cluster.c:76025)
File "cassandra/cluster.py", line 4006, in cassandra.cluster.ResultSet.fetch_next_page (cassandra/cluster.c:76193)
File "cassandra/cluster.py", line 3781, in cassandra.cluster.ResponseFuture.result (cassandra/cluster.c:73073)
cassandra.cluster.NoHostAvailable: ('Unable to complete the operation against any hosts', {})
And here are some from inserting data:
[2017-02-28 16:50:25,594] [DEBUG] [cassandra.connection] Sending options message heartbeat on idle connection (140301574604448) 127.0.0.1
[2017-02-28 16:50:25,595] [DEBUG] [cassandra.cluster] [control connection] Attempting to reconnect
[2017-02-28 16:50:25,596] [DEBUG] [cassandra.cluster] [control connection] Opening new connection to 127.0.0.1
[2017-02-28 16:50:25,596] [DEBUG] [cassandra.connection] Not sending options message for new connection(140301347717016) to 127.0.0.1 because compression is disabled and a cql version was not specified
[2017-02-28 16:50:25,596] [DEBUG] [cassandra.connection] Sending StartupMessage on <AsyncoreConnection(140301347717016) 127.0.0.1:9042>
[2017-02-28 16:50:25,596] [DEBUG] [cassandra.connection] Sent StartupMessage on <AsyncoreConnection(140301347717016) 127.0.0.1:9042>
[2017-02-28 16:50:30,596] [DEBUG] [cassandra.io.asyncorereactor] Closing connection (140301347717016) to 127.0.0.1
[2017-02-28 16:50:30,596] [DEBUG] [cassandra.io.asyncorereactor] Closed socket to 127.0.0.1
[2017-02-28 16:50:30,596] [DEBUG] [cassandra.connection] Connection to 127.0.0.1 was closed during the startup handshake
[2017-02-28 16:50:30,597] [WARNING] [cassandra.cluster] [control connection] Error connecting to 127.0.0.1:
Traceback (most recent call last):
File "cassandra/cluster.py", line 2623, in cassandra.cluster.ControlConnection._reconnect_internal (cassandra/cluster.c:47899)
File "cassandra/cluster.py", line 2645, in cassandra.cluster.ControlConnection._try_connect (cassandra/cluster.c:48416)
File "cassandra/cluster.py", line 1119, in cassandra.cluster.Cluster.connection_factory (cassandra/cluster.c:15085)
File "cassandra/connection.py", line 333, in cassandra.connection.Connection.factory (cassandra/connection.c:5790)
cassandra.OperationTimedOut: errors=Timed out creating connection (5 seconds), last_host=None
[2017-02-28 16:50:39,309] [ERROR] [root] Exception inserting data into cassandra
Traceback (most recent call last):
File "load_cassandra.py", line 54, in run
controller.insert_into_cassandra(filename)
File "extract_to_cassandra.py", line 141, in insert_into_cassandra
for success, result in results:
File "cassandra/concurrent.py", line 177, in _results (cassandra/concurrent.c:4856)
File "cassandra/concurrent.py", line 186, in cassandra.concurrent.ConcurrentExecutorGenResults._results (cassandra/concurrent.c:4622)
File "cassandra/concurrent.py", line 165, in cassandra.concurrent._ConcurrentExecutor._raise (cassandra/concurrent.c:3745)
cassandra.WriteTimeout: Error from server: code=1100 [Coordinator node timed out waiting for replica nodes' responses] message="Operation timed out - received only 0 responses." info={'consistency': 'QUORUM', 'required_responses': 1, 'received_responses': 0}
[2017-02-28 16:50:39,465] [DEBUG] [cassandra.connection] Received options response on connection (140301574604448) from 127.0.0.1
[2017-02-28 16:50:39,466] [DEBUG] [cassandra.cluster] Shutting down Cluster Scheduler
[2017-02-28 16:50:39,467] [DEBUG] [cassandra.cluster] Shutting down control connection
[2017-02-28 16:50:39,467] [DEBUG] [cassandra.io.asyncorereactor] Closing connection (140301574604448) to 127.0.0.1
[2017-02-28 16:50:39,467] [DEBUG] [cassandra.io.asyncorereactor] Closed socket to 127.0.0.1
[2017-02-28 16:50:39,468] [DEBUG] [cassandra.pool] Defunct or closed connection (140301574604448) returned to pool, potentially marking host 127.0.0.1 as down
I tweaked the consistency level and even set it to 1, but that didn't work. Inserts tend to work better when running Cassandra locally as opposed to Docker, but they still time out. Deletes usually work for a couple of seconds and then hang/time out.
Edit: Here are the logs from Cassandra when things fail:
INFO 18:39:11 MUTATION messages were dropped in last 5000 ms: 4 for internal timeout and 0 for cross node timeout. Mean internal dropped latency: 2933809 ms and Mean cross-node dropped latency: 0 ms
INFO 18:39:11 Pool Name Active Pending Completed Blocked All Time Blocked
INFO 18:39:11 MutationStage 32 15 470 0 0
INFO 18:39:11 ViewMutationStage 0 0 0 0 0
INFO 18:39:11 ReadStage 0 0 59 0 0
INFO 18:39:11 RequestResponseStage 0 0 0 0 0
INFO 18:39:11 ReadRepairStage 0 0 0 0 0
INFO 18:39:11 CounterMutationStage 0 0 0 0 0
INFO 18:39:11 MiscStage 0 0 0 0 0
INFO 18:39:11 CompactionExecutor 0 0 6399 0 0
INFO 18:39:11 MemtableReclaimMemory 0 0 36 0 0
INFO 18:39:11 PendingRangeCalculator 0 0 1 0 0
INFO 18:39:11 GossipStage 0 0 0 0 0
INFO 18:39:11 SecondaryIndexManagement 0 0 0 0 0
INFO 18:39:11 HintsDispatcher 0 0 0 0 0
INFO 18:39:11 MigrationStage 0 0 2 0 0
INFO 18:39:11 MemtablePostFlush 0 0 62 0 0
INFO 18:39:11 PerDiskMemtableFlushWriter_0 0 0 36 0 0
INFO 18:39:11 ValidationExecutor 0 0 0 0 0
INFO 18:39:11 Sampler 0 0 0 0 0
INFO 18:39:11 MemtableFlushWriter 0 0 36 0 0
INFO 18:39:11 InternalResponseStage 0 0 0 0 0
INFO 18:39:11 AntiEntropyStage 0 0 0 0 0
INFO 18:39:11 CacheCleanupExecutor 0 0 0 0 0
INFO 18:39:11 Native-Transport-Requests 33 0 727 0 0
INFO 18:39:11 CompactionManager 0 0
INFO 18:39:11 MessagingService n/a 0/0
INFO 18:39:11 Cache Type Size Capacity KeysToSave
INFO 18:39:11 KeyCache 1368 51380224 all
INFO 18:39:11 RowCache 0 0 all
INFO 18:39:11 Table Memtable ops,data
INFO 18:39:11 system_distributed.parent_repair_history 0,0
INFO 18:39:11 system_distributed.repair_history 0,0
INFO 18:39:11 system_distributed.view_build_status 0,0
INFO 18:39:11 system.compaction_history 1,231
INFO 18:39:11 system.hints 0,0
INFO 18:39:11 system.schema_aggregates 0,0
INFO 18:39:11 system.IndexInfo 0,0
INFO 18:39:11 system.schema_columnfamilies 0,0
INFO 18:39:11 system.schema_triggers 0,0
INFO 18:39:11 system.size_estimates 40,1255
INFO 18:39:11 system.schema_functions 0,0
INFO 18:39:11 system.paxos 0,0
INFO 18:39:11 system.views_builds_in_progress 0,0
INFO 18:39:11 system.built_views 0,0
INFO 18:39:11 system.peer_events 0,0
INFO 18:39:11 system.range_xfers 0,0
INFO 18:39:11 system.peers 0,0
INFO 18:39:11 system.batches 0,0
INFO 18:39:11 system.schema_keyspaces 0,0
INFO 18:39:11 system.schema_usertypes 0,0
INFO 18:39:11 system.local 0,0
INFO 18:39:11 system.sstable_activity 6,117
INFO 18:39:11 system.available_ranges 0,0
INFO 18:39:11 system.batchlog 0,0
INFO 18:39:11 system.schema_columns 0,0
INFO 18:39:11 system_schema.columns 0,0
INFO 18:39:11 system_schema.types 0,0
INFO 18:39:11 system_schema.indexes 0,0
INFO 18:39:11 system_schema.keyspaces 0,0
INFO 18:39:11 system_schema.dropped_columns 0,0
INFO 18:39:11 system_schema.aggregates 0,0
INFO 18:39:11 system_schema.triggers 0,0
INFO 18:39:11 system_schema.tables 0,0
INFO 18:39:11 system_schema.views 0,0
INFO 18:39:11 system_schema.functions 0,0
INFO 18:39:11 system_auth.roles 0,0
INFO 18:39:11 system_auth.role_members 0,0
INFO 18:39:11 system_auth.resource_role_permissons_index 0,0
INFO 18:39:11 system_auth.role_permissions 0,0
INFO 18:39:11 mykeyspace.mytable 430,27163514
INFO 18:39:11 system_traces.sessions 0,0
INFO 18:39:11 system_traces.events 0,0
INFO 18:39:13 ParNew GC in 261ms. CMS Old Gen: 46106544 -> 74868512; Par Eden Space: 208895224 -> 0; Par Survivor Space: 16012448 -> 26083328
I see messages like this too:
Out of 29 commit log syncs over the past 248s with average duration of 1596.14ms, 1 have exceeded the configured commit interval by an average of 18231.00ms
One thing you could try is to reduce the idle_heartbeat_interval setting on your connection. By default it is 30 seconds, but you can configure it when instantiating your Cluster class. In this example, I'll set it to 10 seconds:
def __init__(self):
    self.cluster = Cluster(idle_heartbeat_interval=10)
    self.session = self.cluster.connect('mykeyspace')
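You can also disable heartbeats entirely by setting the interval to zero, which at least helps to isolate whether the heartbeats themselves are the problem. A minimal sketch, assuming a driver version that accepts this parameter:

from cassandra.cluster import Cluster

# idle_heartbeat_interval=0 turns idle-connection heartbeats off altogether
cluster = Cluster(idle_heartbeat_interval=0)
session = cluster.connect('mykeyspace')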
If that doesn't help, then it might be time to check your data model for anti-patterns.
This is my first question; I hope I am doing everything correctly.
I have 3 Docker containers on different hosts, with ZooKeeper, Mesos, and Chronos.
The Mesos slaves are correctly subscribed to the master, and the Chronos tasks are synchronized across the hosts.
The problem is that the Chronos framework keeps connecting and disconnecting:
I0915 12:12:11.132375 49 master.cpp:2231] Received SUBSCRIBE call for framework 'chronos-2.4.0' at scheduler-e6ebc7bc-8edb-45e9-ad68-3fa36566b55b@10.xxx.xxx.xxx:61740
I0915 12:12:11.132647 49 master.cpp:2302] Subscribing framework chronos-2.4.0 with checkpointing enabled and capabilities [ ]
I0915 12:12:11.133229 49 master.cpp:2312] Framework 71c69a28-ef16-4ed1-b869-04df66f84b5d-0000 (chronos-2.4.0) at scheduler-e6ebc7bc-8edb-45e9-ad68-3fa36566b55b@10.xxx.xxx.xxx:61740 already subscribed, resending acknowledgement
W0915 12:12:11.133322 49 master.hpp:1764] Master attempted to send message to disconnected framework 71c69a28-ef16-4ed1-b869-04df66f84b5d-0000 (chronos-2.4.0) at scheduler-e6ebc7bc-8edb-45e9-ad68-3fa36566b55b@10.xxx.xxx.xxx:61740
E0915 12:12:11.133745 55 process.cpp:1958] Failed to shutdown socket with fd 41: Transport endpoint is not connected
I0915 12:12:25.648849 52 master.cpp:2231] Received SUBSCRIBE call for framework 'chronos-2.4.0' at scheduler-e6ebc7bc-8edb-45e9-ad68-3fa36566b55b@10.xxx.xxx.xxx:61740
I0915 12:12:25.649029 52 master.cpp:2302] Subscribing framework chronos-2.4.0 with checkpointing enabled and capabilities [ ]
I0915 12:12:25.649060 52 master.cpp:2312] Framework 71c69a28-ef16-4ed1-b869-04df66f84b5d-0000 (chronos-2.4.0) at scheduler-e6ebc7bc-8edb-45e9-ad68-3fa36566b55b@10.xxx.xxx.xxx:61740 already subscribed, resending acknowledgement
W0915 12:12:25.649116 52 master.hpp:1764] Master attempted to send message to disconnected framework 71c69a28-ef16-4ed1-b869-04df66f84b5d-0000 (chronos-2.4.0) at scheduler-e6ebc7bc-8edb-45e9-ad68-3fa36566b55b@10.xxx.xxx.xxx:61740
E0915 12:12:25.649433 55 process.cpp:1958] Failed to shutdown socket with fd 41: Transport endpoint is not connected
I0915 12:13:15.146510 50 master.cpp:2231] Received SUBSCRIBE call for framework 'chronos-2.4.0' at scheduler-e6ebc7bc-8edb-45e9-ad68-3fa36566b55b@10.xxx.xxx.xxx:61740
I0915 12:13:15.146759 50 master.cpp:2302] Subscribing framework chronos-2.4.0 with checkpointing enabled and capabilities [ ]
I0915 12:13:15.146848 50 master.cpp:2312] Framework 71c69a28-ef16-4ed1-b869-04df66f84b5d-0000 (chronos-2.4.0) at scheduler-e6ebc7bc-8edb-45e9-ad68-3fa36566b55b@10.xxx.xxx.xxx:61740 already subscribed, resending acknowledgement
W0915 12:13:15.146939 50 master.hpp:1764] Master attempted to send message to disconnected framework 71c69a28-ef16-4ed1-b869-04df66f84b5d-0000 (chronos-2.4.0) at scheduler-e6ebc7bc-8edb-45e9-ad68-3fa36566b55b@10.xxx.xxx.xxx:61740
E0915 12:13:15.147408 55 process.cpp:1958] Failed to shutdown socket with fd 41: Transport endpoint is not connected
I0915 12:14:04.957185 51 master.cpp:2231] Received SUBSCRIBE call for framework 'chronos-2.4.0' at scheduler-e6ebc7bc-8edb-45e9-ad68-3fa36566b55b@10.xxx.xxx.xxx:61740
I0915 12:14:04.957341 51 master.cpp:2302] Subscribing framework chronos-2.4.0 with checkpointing enabled and capabilities [ ]
I0915 12:14:04.957363 51 master.cpp:2312] Framework 71c69a28-ef16-4ed1-b869-04df66f84b5d-0000 (chronos-2.4.0) at scheduler-e6ebc7bc-8edb-45e9-ad68-3fa36566b55b@10.xxx.xxx.xxx:61740 already subscribed, resending acknowledgement
W0915 12:14:04.957392 51 master.hpp:1764] Master attempted to send message to disconnected framework 71c69a28-ef16-4ed1-b869-04df66f84b5d-0000 (chronos-2.4.0) at scheduler-e6ebc7bc-8edb-45e9-ad68-3fa36566b55b@10.xxx.xxx.xxx:61740
E0915 12:14:04.957844 55 process.cpp:1958] Failed to shutdown socket with fd 41: Transport endpoint is not connected
In this case, mesos-master and the Chronos framework are in the same container, but I suspect that the master cannot connect back to Chronos at port 61740 (which is an ephemeral port).
netstat capture: (output not included)
tcpdump capture:
root@HOSTNAME:/# tcpdump -i eth0 port 61740 -v
tcpdump: listening on eth0, link-type EN10MB (Ethernet), capture size 262144 bytes
12:30:41.013731 IP (tos 0x0, ttl 64, id 12013, offset 0, flags [DF], proto TCP (6), length 60)
172.xxx.xxx.xxx.29468 > HOSTNAME.61740: Flags [S], cksum 0xb989 (incorrect -> 0xa894), seq 1155265525, win 14600, options [mss 1460,sackOK,TS val 852942104 ecr 0,nop,wscale 6], length 0
12:30:41.013780 IP (tos 0x0, ttl 64, id 49727, offset 0, flags [DF], proto TCP (6), length 40)
HOSTNAME.61740 > 172.xxx.xxx.xxx.29468: Flags [R.], cksum 0x595a (correct), seq 0, ack 1155265526, win 0, length 0
12:31:18.129849 IP (tos 0x0, ttl 64, id 64040, offset 0, flags [DF], proto TCP (6), length 60)
172.xxx.xxx.xxx.30564 > HOSTNAME.61740: Flags [S], cksum 0xb989 (incorrect -> 0x97fb), seq 535270461, win 14600, options [mss 1460,sackOK,TS val 852979221 ecr 0,nop,wscale 6], length 0
12:31:18.129892 IP (tos 0x0, ttl 64, id 6441, offset 0, flags [DF], proto TCP (6), length 40)
HOSTNAME.61740 > 172.xxx.xxx.xxx.30564: Flags [R.], cksum 0xd9be (correct), seq 0, ack 535270462, win 0, length 0
12:31:36.451417 IP (tos 0x0, ttl 64, id 21303, offset 0, flags [DF], proto TCP (6), length 60)
172.xxx.xxx.xxx.31103 > HOSTNAME.61740: Flags [S], cksum 0xb989 (incorrect -> 0x10c7), seq 186377873, win 14600, options [mss 1460,sackOK,TS val 852997542 ecr 0,nop,wscale 6], length 0
12:31:36.451470 IP (tos 0x0, ttl 64, id 13169, offset 0, flags [DF], proto TCP (6), length 40)
HOSTNAME.61740 > 172.xxx.xxx.xxx.31103: Flags [R.], cksum 0x9a1b (correct), seq 0, ack 186377874, win 0, length 0
12:31:41.619076 IP (tos 0x0, ttl 64, id 41997, offset 0, flags [DF], proto TCP (6), length 60)
172.xxx.xxx.xxx.31252 > HOSTNAME.61740: Flags [S], cksum 0xb989 (incorrect -> 0xfe18), seq 2176478683, win 14600, options [mss 1460,sackOK,TS val 853002710 ecr 0,nop,wscale 6], length 0
12:31:41.619119 IP (tos 0x0, ttl 64, id 13179, offset 0, flags [DF], proto TCP (6), length 40)
HOSTNAME.61740 > 172.xxx.xxx.xxx.31252: Flags [R.], cksum 0x9b9d (correct), seq 0, ack 2176478684, win 0, length 0
The IP 172.xxx.xxx.xxx is the container IP, but I actually run mesos-master like this:
mesos-master --log_dir=/var/log/mesos/master/ --work_dir=/var/log/mesos/work/ --quorum=2 --cluster=XXXX --zk=file:///etc/mesos/zk --advertise_ip=10.XXX.XXX.XXX --hostname=HOSTNAME
Any idea or suggestion will be appreciated.
Thanks.
In the tcpdump capture we can see the incorrect checksum. It seems like a bug in our kernel version (3.10). This is fixed in 3.14+, but I can't check, because we can't update the kernel in this environment.
https://tech.vijayp.ca/linux-kernel-bug-delivers-corrupt-tcp-ip-data-to-mesos-kubernetes-docker-containers-4986f88f7a19#.w6eui9yc9
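If this is the same issue, one way to test it is to disable checksum offloading on the interface, so that checksums are computed and verified in software rather than by the offload path. This is only a sketch; eth0 and the exact offload flags are assumptions for illustration:

root@HOSTNAME:/# ethtool -K eth0 tx off rx off

Treat this as a mitigation to confirm the diagnosis rather than a fix; the real fix is the kernel patch discussed in the article.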