-
Notifications
You must be signed in to change notification settings - Fork 5.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
mkl random failed on CI machine #15791
Comments
@yinghu5 @jianhang-liu Could you help look into this issue? |
As discussed in the meeting, we will wait for more reports with a similar crash pattern and then restart the investigation. |
@luotao1 Hi, I found that MKL randomly failed on the CI machine recently. See http://ci.paddlepaddle.org/viewLog.html?buildId=151271&buildTypeId=Paddle_PrCiCoverage&tab=buildLog. The error log is (please omit
|
@yihuaxu Could you help check this? Thanks! |
OK, I will discuss it with Hu Ying. |
@yihuaxu and @yinghu5 investigated the problem but could not reproduce it. However, judging from the symbols, it seems to be related to multi-processor or threading issues.
and
I also discussed it with our developer team and got a key confirmation: getenv() is not thread-safe if executed concurrently with setenv(). Without a reproducer at hand, it's hard for us to judge whether these environment updates might happen concurrently with vsAdd. |
Today I analyzed the issue statically, because I could not get a core dump file from the stress testing. Please refer to the analysis below:
Analyze:
a. Use gdb to get the register value of r14.
b. The variable name is “MKL_VML_DEBUG_CPU_TYPE”
c. assemble code. libmklml_intel.so:
libc.so
b. Since my environment is different from Baidu’s environment, the following is only a guess at the crash point. However, I cannot find any call to the “getenv” function in “libm.so.6”.
b. When the environment variable name is set to a long name, it works normally. |
Following the article at https://rachelbythebay.com/w/2017/01/30/env/ we tried a workaround here.
"Really, you can't be sure who's going to be poking around in there. So all you can do is not call setenv. If for some reason you DO need to set up something in your environment for an eventually-multi-threaded program, you'd better get it out of the way before you kick off any threads, and then leave it like that forever. Don't try to change it while the program is running. Incidentally, this is the same technique some programs use to fork with threads: they fork early before any threads are up. The parent continues on to start its threads, while that initial child then spawns everything else as needed (while being careful to not create threads itself)."
|
@bingyanghuang The problem has occurred again; please see the log: |
@yihuaxu Please help look into this issue. |
Could you provide the core dump files? @bingyanghuang @sneaxiy |
It fails randomly and can't be reproduced. What I can recommend is to try Intel Inspector to see whether it can reveal any problems even without reproducing the failure. Here is the link: https://software.intel.com/en-us/inspector |
Since you haven't replied for more than a year, we have closed this issue/pr. |
一种可能的出错场景: 关于setenv和getenv
以下为出错示例,在 (gdb) disassemble
Dump of assembler code for function getenv:
0x00007f41b7e50ea0 <+0>: endbr64
0x00007f41b7e50ea4 <+4>: push %r15
0x00007f41b7e50ea6 <+6>: push %r14
0x00007f41b7e50ea8 <+8>: push %r13
0x00007f41b7e50eaa <+10>: push %r12
0x00007f41b7e50eac <+12>: push %rbp
0x00007f41b7e50ead <+13>: push %rbx
0x00007f41b7e50eae <+14>: sub $0x8,%rsp
; if (__environ == NULL || name[0] == '\0')
; return NULL;
0x00007f41b7e50eb2 <+18>: mov 0x1a5ff7(%rip),%rax ; 0x7f41b7ff6eb0
0x00007f41b7e50eb9 <+25>: mov (%rax),%rbx
0x00007f41b7e50ebc <+28>: test %rbx,%rbx
0x00007f41b7e50ebf <+31>: je 0x7f41b7e50f80 <getenv+224>
0x00007f41b7e50ec5 <+37>: movzbl (%rdi),%eax ; %rdi %r13 name
0x00007f41b7e50ec8 <+40>: mov %rdi,%r13
0x00007f41b7e50ecb <+43>: test %al,%al
0x00007f41b7e50ecd <+45>: je 0x7f41b7e50f80 <getenv+224>
; if (name[1] == '\0')
; {
; name_start = ('=' << 8) | *(const unsigned char *) name;
0x00007f41b7e50ed3 <+51>: cmpb $0x0,0x1(%rdi)
0x00007f41b7e50ed7 <+55>: mov (%rbx),%r12 ; %r12 *ep (first entry; %rbx ep = __environ)
0x00007f41b7e50eda <+58>: jne 0x7f41b7e50f20 <getenv+128>
0x00007f41b7e50edc <+60>: or $0x3d,%ah ; %ax name_start
; for (ep = __environ; *ep != NULL; ++ep)
; {
; uint16_t ep_start = (((unsigned char *) *ep)[0]
; | (((unsigned char *) *ep)[1] << 8));
; if (name_start == ep_start)
; return &(*ep)[2];
; }
; }
0x00007f41b7e50edf <+63>: test %r12,%r12
0x00007f41b7e50ee2 <+66>: jne 0x7f41b7e50efd <getenv+93>
0x00007f41b7e50ee4 <+68>: jmp 0x7f41b7e50f08 <getenv+104>
0x00007f41b7e50ee6 <+70>: nopw %cs:0x0(%rax,%rax,1)
0x00007f41b7e50ef0 <+80>: mov 0x8(%rbx),%r12 ; %r12 *__ep
0x00007f41b7e50ef4 <+84>: add $0x8,%rbx ; ++ep
0x00007f41b7e50ef8 <+88>: test %r12,%r12
0x00007f41b7e50efb <+91>: je 0x7f41b7e50f08 <getenv+104>
0x00007f41b7e50efd <+93>: cmp (%r12),%ax ; if (name_start == ep_start)
0x00007f41b7e50f02 <+98>: jne 0x7f41b7e50ef0 <getenv+80>
0x00007f41b7e50f04 <+100>: add $0x2,%r12 ; &(*ep)[2]
0x00007f41b7e50f08 <+104>: add $0x8,%rsp
0x00007f41b7e50f0c <+108>: mov %r12,%rax ; return value
0x00007f41b7e50f0f <+111>: pop %rbx
0x00007f41b7e50f10 <+112>: pop %rbp
0x00007f41b7e50f11 <+113>: pop %r12
0x00007f41b7e50f13 <+115>: pop %r13
0x00007f41b7e50f15 <+117>: pop %r14
0x00007f41b7e50f17 <+119>: pop %r15
0x00007f41b7e50f19 <+121>: ret
; size_t len = strlen (name);
0x00007f41b7e50f1a <+122>: nopw 0x0(%rax,%rax,1)
0x00007f41b7e50f20 <+128>: call 0x7f41b7e2d460 <*ABS*+0x9f630@plt>
; len -= 2;
; name += 2;
0x00007f41b7e50f25 <+133>: add $0x2,%r13 ; %r13 name += 2
0x00007f41b7e50f29 <+137>: movzwl -0x2(%r13),%ebp ; %ebp name_start (16-bit load: name[0] | name[1] << 8)
0x00007f41b7e50f2e <+142>: mov %rax,%r14 ; %r14 len
0x00007f41b7e50f31 <+145>: lea -0x2(%rax),%r15 ; %r15 len -= 2
; for (ep = __environ; *ep != NULL; ++ep)
; {
0x00007f41b7e50f35 <+149>: test %r12,%r12
0x00007f41b7e50f38 <+152>: jne 0x7f41b7e50f4d <getenv+173>
0x00007f41b7e50f3a <+154>: jmp 0x7f41b7e50f08 <getenv+104>
0x00007f41b7e50f3c <+156>: nopl 0x0(%rax)
0x00007f41b7e50f40 <+160>: mov 0x8(%rbx),%r12
0x00007f41b7e50f44 <+164>: add $0x8,%rbx
0x00007f41b7e50f48 <+168>: test %r12,%r12
0x00007f41b7e50f4b <+171>: je 0x7f41b7e50f08 <getenv+104>
; if (name_start == ep_start && !strncmp (*ep + 2, name, len)
; && (*ep)[len + 2] == '=')
; return &(*ep)[len + 3];
=> 0x00007f41b7e50f4d <+173>: cmp (%r12),%bp ; name_start == ep_start
0x00007f41b7e50f52 <+178>: jne 0x7f41b7e50f40 <getenv+160>
0x00007f41b7e50f54 <+180>: lea 0x2(%r12),%rdi ; *ep + 2
0x00007f41b7e50f59 <+185>: mov %r15,%rdx ; len (3rd argument of strncmp)
0x00007f41b7e50f5c <+188>: mov %r13,%rsi ; name (2nd argument of strncmp)
0x00007f41b7e50f5f <+191>: call 0x7f41b7e2d580 <*ABS*+0x9f710@plt>
0x00007f41b7e50f64 <+196>: test %eax,%eax
0x00007f41b7e50f66 <+198>: jne 0x7f41b7e50f40 <getenv+160>
0x00007f41b7e50f68 <+200>: cmpb $0x3d,(%r12,%r14,1) ; (*ep)[len + 2] == '='
0x00007f41b7e50f6d <+205>: jne 0x7f41b7e50f40 <getenv+160>
0x00007f41b7e50f6f <+207>: lea 0x1(%r12,%r14,1),%r12 ; &(*ep)[len + 3]
0x00007f41b7e50f74 <+212>: jmp 0x7f41b7e50f08 <getenv+104>
0x00007f41b7e50f76 <+214>: nopw %cs:0x0(%rax,%rax,1)
0x00007f41b7e50f80 <+224>: xor %r12d,%r12d
0x00007f41b7e50f83 <+227>: jmp 0x7f41b7e50f08 <getenv+104>
End of assembler dump. |
see details in
http://ci.paddlepaddle.org/viewLog.html?buildId=60748&buildTypeId=Paddle_PrCi&tab=buildLog
The text was updated successfully, but these errors were encountered: