C++11 thread-safe statics

My good colleague, Mike.McCarty, put some code out for review and stated that function-local statics are thread-safe. On the face of it that sounded very scary, but I trust Mike, so I started exploring what has actually been done there.

Here is the standards committee paper that describes this behavior: http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2660.htm

The first thing I did was write some code and dump its disassembly in VS2015:

static int function()
{
00007FF7610C1910  push        rbp 
00007FF7610C1912  push        rdi 
00007FF7610C1913  sub         rsp,108h 
00007FF7610C191A  lea         rbp,[rsp+20h] 
00007FF7610C191F  mov         rdi,rsp 
00007FF7610C1922  mov         ecx,42h 
00007FF7610C1927  mov         eax,0CCCCCCCCh 
00007FF7610C192C  rep stos    dword ptr [rdi] 
00007FF7610C192E  mov         qword ptr [rbp+0C8h],0FFFFFFFFFFFFFFFEh 
    static int s_value = function2();
00007FF7610C1939  mov         eax,104h 
00007FF7610C193E  mov         eax,eax 
00007FF7610C1940  mov         ecx,dword ptr [_tls_index (07FF7610CC1C8h)] 
00007FF7610C1946  mov         rdx,qword ptr gs:[58h] 
00007FF7610C194F  mov         rcx,qword ptr [rdx+rcx*8] 
00007FF7610C1953  mov         eax,dword ptr [rax+rcx] 
00007FF7610C1956  cmp         dword ptr [s_value+4h (07FF7610CC164h)],eax 
00007FF7610C195C  jle         function+7Ah (07FF7610C198Ah) 
00007FF7610C195E  lea         rcx,[s_value+4h (07FF7610CC164h)] 
00007FF7610C1965  call        _Init_thread_header (07FF7610C101Eh) 
00007FF7610C196A  cmp         dword ptr [s_value+4h (07FF7610CC164h)],0FFFFFFFFh 
00007FF7610C1971  jne         function+7Ah (07FF7610C198Ah) 
00007FF7610C1973  call        function2 (07FF7610C18D0h) 
00007FF7610C1978  mov         dword ptr [s_value (07FF7610CC160h)],eax 
00007FF7610C197E  lea         rcx,[s_value+4h (07FF7610CC164h)] 
00007FF7610C1985  call        _Init_thread_footer (07FF7610C1073h) 
    return s_value;
00007FF7610C198A  mov         eax,dword ptr [s_value (07FF7610CC160h)] 
}

Looking at this, Mike asked: why is TLS even involved?

00007FF7610C1939  mov         eax,104h 
00007FF7610C193E  mov         eax,eax 
00007FF7610C1940  mov         ecx,dword ptr [_tls_index (07FF7610CC1C8h)] 
00007FF7610C1946  mov         rdx,qword ptr gs:[58h] 
00007FF7610C194F  mov         rcx,qword ptr [rdx+rcx*8] 
00007FF7610C1953  mov         eax,dword ptr [rax+rcx] 

Well, my guess is that it is trying to reduce contention among threads initializing the same static variable. Even if you call the function a second time, it will still go through this code, because it cannot know in advance whether this is the first call or not.

    static int s_value = function2();
00007FF7610C1939  mov         eax,104h 
00007FF7610C193E  mov         eax,eax 
00007FF7610C1940  mov         ecx,dword ptr [_tls_index (07FF7610CC1C8h)] 
00007FF7610C1946  mov         rdx,qword ptr gs:[58h] 
00007FF7610C194F  mov         rcx,qword ptr [rdx+rcx*8] 
00007FF7610C1953  mov         eax,dword ptr [rax+rcx] 
00007FF7610C1956  cmp         dword ptr [s_value+4h (07FF7610CC164h)],eax 
00007FF7610C195C  jle         function+7Ah (07FF7610C198Ah) 

When multiple threads call the same function, there is a possibility of contention. This is where TLS helps (as part of a double-checked locking pattern, DCLP, I guess).

Two records are maintained & updated:

  1. Thread-local: used to reduce contention and to support the DCLP check
  2. Global: s_value + 4h

The same thing can be seen, done a bit differently, on Linux (https://godbolt.org/g/GfKMQc):

function2():                         # @function2()
        push    rbp
        mov     rbp, rsp
        sub     rsp, 16
        cmp     byte ptr [guard variable for function2()::val], 0
        jne     .LBB1_4
        movabs  rdi, guard variable for function2()::val
        call    __cxa_guard_acquire
        cmp     eax, 0
        je      .LBB1_4
        call    function()
        mov     dword ptr [rbp - 16], eax # 4-byte Spill
        jmp     .LBB1_3
.LBB1_3:
        movabs  rdi, guard variable for function2()::val
        mov     eax, dword ptr [rbp - 16] # 4-byte Reload
        mov     dword ptr [function2()::val], eax
        call    __cxa_guard_release
.LBB1_4:
        add     rsp, 16
        pop     rbp
        ret
        movabs  rdi, guard variable for function2()::val
        mov     ecx, edx
        mov     qword ptr [rbp - 8], rax
        mov     dword ptr [rbp - 12], ecx
        call    __cxa_guard_abort
        mov     rdi, qword ptr [rbp - 8]
        call    _Unwind_Resume

Here __cxa_guard_acquire, __cxa_guard_release and __cxa_guard_abort do the magic.