United StatesChange Country, Oracle Worldwide Web Sites Communities I am a... I want to...
JDK-8006395 : Race in async socket close on Linux

Details
Type:
Bug
Submit Date:
2013-01-16
Status:
Closed
Updated Date:
2014-02-12
Project Name:
JDK
Resolved Date:
2013-02-01
Component:
core-libs
OS:
Sub-Component:
java.net
CPU:
Priority:
P2
Resolution:
Fixed
Affected Versions:
1.4.2
Fixed Versions:

Related Reports
Backport:
Backport:
Backport:
Backport:
Backport:

Sub Tasks

Description
SHORT SUMMARY: Possible race condition which may cause the
socket close failure.

INDICATORS: 
Threads stuck in socket read and they correspond to the ESTABLISHED 
connections.  

These sockets are closed from other java threads and the customer suspects a race 
condition.

"OUT Socket Receiver (/10.4.66.8:39918)" prio=1 tid=0x3220ac78
nid=0x1bab runnable [2e1ff000..2e1ff86c]
        at java.net.SocketInputStream.socketRead0(Native Method)
        at java.net.SocketInputStream.read(SocketInputStream.java:129)
        at java.io.DataInputStream.readFully(DataInputStream.java:266)
        at java.io.DataInputStream.readFully(DataInputStream.java:242)
        at
com.x.y.integration.systemconnection.messagedefinition.MessageSepara

tor.separateByteMessage(MessageSeparator.java:213)
        at
com.x.y.integration.systemconnection.outbound.socket.SocketEisContro

ller.run(SocketEisController.java:316)

NET_Read calls recv(2) in the startOp and endOp pair.  startOp registers
current thread to entry fdEntry and endOp checks the fdEntry's intr field.

In closefd, fdEntry's intr is set to 1 and pthread_kill is called to the
thread.  However, this can happen before calling recv.  In that case, recv is
not interrupted by the pthread_kill and it blocks (unless close(2)/dup2(2)
interrupts the recv() system call and it looks it does not happen).

#define BLOCKING_IO_RETURN_INT(FD, FUNC) {      \
    int ret;                                    \
    threadEntry_t self;                         \
    fdEntry_t *fdEntry = getFdEntry(FD);        \
    if (fdEntry == NULL) {                      \
        errno = EBADF;                          \
        return -1;                              \
    }                                           \
    do {                                        \
        startOp(fdEntry, &self);                \
        ret = FUNC;                             \
        endOp(fdEntry, &self);                  \
    } while (ret == -1 && errno == EINTR);      \
    return ret;                                 \
}

int NET_Read(int s, void* buf, size_t len) {
    BLOCKING_IO_RETURN_INT( s, recv(s, buf, len, 0) );
}

COUNTER INDICATORS: na

TRIGGERS: na

KNOWN WORKAROUND: na

PRESENT SINCE: 1.4.2

HOW TO VERIFY: 
It looks it is difficult to create the suspected timing window from a java 
program.

Support tried OpenJDK jdk7u-dev to widen the timing window.
He put usleep to two places to make the timing window bigger.

1. added usleep to closefd()
static int closefd(int fd1, int fd2) {
        :
        threadEntry_t *curr = fdEntry->threads;
        while (curr != NULL) {
            curr->intr = 1;
            pthread_kill( curr->thr, sigWakeup );
            if (getenv("SOCKET_CLOSE_DEBUG")) printf("D: kill %#x", curr->thr);
            curr = curr->next;
        }


        if (getenv("SLEEP_BEFORE_CLOSE")) {
          usleep(200000);       // ### sleep 200msec after pthread_kill, but before dup2
        }

2. added usleep just ofter startOp in modified version of 
BLOCKING_IO_RETURN_INT, which is called by NET_Read.

#define BLOCKING_IO_RETURN_INT_READ(FD, FUNC) {      \
    int ret;                                    \
    threadEntry_t self;                         \
    fdEntry_t *fdEntry = getFdEntry(FD);        \
    int blockingIODebug = (int)getenv("BLOCKING_IO_DEBUG"); \
    if (fdEntry == NULL) {                      \
        errno = EBADF;                          \
        return -1;                              \
    }                                           \
    do {                                        \
        startOp(fdEntry, &self);                \
        if (blockingIODebug) {                  \
          printf("Calling usleep in Net_X");  \
        }                                       \
        usleep(200000);                         \
        if (blockingIODebug) {                  \
          printf("usleep done in Net_X");  \
        }                                       \
        ret = FUNC;                             \
        if (blockingIODebug) {                  \
          printf("FUNC returned ret %d errno %d", ret, errno);  \
        }                                       \
        endOp(fdEntry, &self);                  \
    } while (ret == -1 && errno == EINTR);      \
    return ret;                                 \
}


int NET_Read(int s, void* buf, size_t len) {
    if (getenv("SOCKET_READ_DEBUG")) printf("D: NET_Read(%d,,%d)
", s, len);
    BLOCKING_IO_RETURN_INT_READ( s, recv(s, buf, len, 0) );
}




$ SLEEP_BEFORE_CLOSE=1 BLOCKING_IO_DEBUG=1 SOCKET_CLOSE_DEBUG=1 
SOCKET_READ_DEBUG=1 ~/local/openjdk/jdk7u-dev/build/linux-i586/bin/java 
-Dsocket.close.debug=1 AsyncCloseEx
D: Client Calling read
D: NET_Read(13,,128)
Calling usleep in Net_X
D: main calling close.
D: PlainSocket close fdUseCount 1 closePending false
D: PlainSocket calling socketPreClose() for fdUseCount != 0
D: socketClose0 0xb779cf58 deferred 1 fd 13 use-dup2 1
D: kill 0x8efffb70
usleep done in Net_X
D: dup2(11, 13) called
D: socketClose0 0xb779cf58 deferred 1 fd 13 use-dup2 1 returning
D: main close done.
### Ctrl- to show thread dump.

^2013-01-10 17:29:33
Full thread dump OpenJDK Server VM (24.0-b28 mixed mode):

"DestroyJavaVM" prio=10 tid=0x091d4000 nid=0x639 waiting on condition 
[0x00000000]
   java.lang.Thread.State: RUNNABLE

"client" prio=10 tid=0x09364800 nid=0x647 runnable [0x8effe000]   <<< blocked 
in socket read
   java.lang.Thread.State: RUNNABLE
    at java.net.SocketInputStream.socketRead0(Native Method)
    at java.net.SocketInputStream.read(SocketInputStream.java:152)
    at java.net.SocketInputStream.read(SocketInputStream.java:122)
    at AsyncCloseEx$1.run(AsyncCloseEx.java:29)

"server" prio=10 tid=0x09307c00 nid=0x646 runnable [0x8f1cc000]
   java.lang.Thread.State: RUNNABLE
    at java.net.PlainSocketImpl.socketAccept(Native Method)
    at 
java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:398)
    at java.net.ServerSocket.implAccept(ServerSocket.java:522)
    at java.net.ServerSocket.accept(ServerSocket.java:490)
    at AsyncCloseEx$2.run(AsyncCloseEx.java:53)

"Service Thread" daemon prio=10 tid=0x092e9c00 nid=0x644 runnable 
[0x00000000]
   java.lang.Thread.State: RUNNABLE

"C2 CompilerThread1" daemon prio=10 tid=0x092e7c00 nid=0x643 waiting on 
condition [0x00000000]
   java.lang.Thread.State: RUNNABLE

"C2 CompilerThread0" daemon prio=10 tid=0x092e5400 nid=0x642 waiting on 
condition [0x00000000]
   java.lang.Thread.State: RUNNABLE

"Signal Dispatcher" daemon prio=10 tid=0x092e3800 nid=0x641 waiting on 
condition [0x00000000]
   java.lang.Thread.State: RUNNABLE

"Finalizer" daemon prio=10 tid=0x092ce800 nid=0x640 in Object.wait() 
[0x8f538000]
   java.lang.Thread.State: WAITING (on object monitor)
    at java.lang.Object.wait(Native Method)
    - waiting on <0xa9f35400> (a java.lang.ref.ReferenceQueue$Lock)
    at java.lang.ref.ReferenceQueue.remove(ReferenceQueue.java:135)
    - locked <0xa9f35400> (a java.lang.ref.ReferenceQueue$Lock)
    at java.lang.ref.ReferenceQueue.remove(ReferenceQueue.java:151)
    at java.lang.ref.Finalizer$FinalizerThread.run(Finalizer.java:177)

"Reference Handler" daemon prio=10 tid=0x092c9c00 nid=0x63e in Object.wait() 
[0x8f588000]
   java.lang.Thread.State: WAITING (on object monitor)
    at java.lang.Object.wait(Native Method)
    - waiting on <0xa9f34fd8> (a java.lang.ref.Reference$Lock)
    at java.lang.Object.wait(Object.java:503)
    at java.lang.ref.Reference$ReferenceHandler.run(Reference.java:133)
    - locked <0xa9f34fd8> (a java.lang.ref.Reference$Lock)

"VM Thread" prio=10 tid=0x092c8400 nid=0x63c runnable

"GC task thread#0 (ParallelGC)" prio=10 tid=0x091dd800 nid=0x63a runnable

"GC task thread#1 (ParallelGC)" prio=10 tid=0x091df000 nid=0x63b runnable

"VM Periodic Task Thread" prio=10 tid=0x092f3c00 nid=0x645 waiting on 
condition

JNI global references: 137

Heap
 PSYoungGen      total 9408K, used 983K [0xa9f30000, 0xaa9a0000, 0xb4680000)
  eden space 8128K, 12% used [0xa9f30000,0xaa025f78,0xaa720000)
  from space 1280K, 0% used [0xaa860000,0xaa860000,0xaa9a0000)
  to   space 1280K, 0% used [0xaa720000,0xaa720000,0xaa860000)
 ParOldGen       total 21376K, used 0K [0x95080000, 0x96560000, 0xa9f30000)
  object space 21376K, 0% used [0x95080000,0x95080000,0x96560000)
 PSPermGen       total 16384K, used 1878K [0x91080000, 0x92080000, 
0x95080000)
  object space 16384K, 11% used [0x91080000,0x91255a48,0x92080000)

So, it was verified that the timing window really exists.  

NOTES FOR SE:

Support notes :
---------------
Using OpenJDK jdk7u-dev, I implemented an experimental fix.  If it is         
  
enabled, the read socket thread won't stack in SocketInputStream.socketRead0  
     
even with the intentionally extended timing window we are suspecting.         
     
                                                                              
    
Changes are below in solaris/native/java/net/linux_close.c:                   
     
1. Added a function isInterrupted() which checks if the current thread entry  
     
has intr flag set.                                                            
     
   This must be called after locking corresponding fdEntry's lock.            
     
2. Changed BLOCKING_IO_RETURN_INT macro to use isInterrupted() after locking  
     
self.lock.                                                                
     
   It calls FUNC (in this case recv()), only if the current thread is not interrupted.                                                                                                                                           
     
closefd() calls pthread_kill and sets threadEntry's intr filed.  Enclosed incr field change with lock/unlock with curr->lock.
     
Guarding the recv with the same lock should make the macro not to miss the pthread_kill and thread entry's intr flag.                                    
                 

REGRESSION: no

} 

                                    

Comments
SQE OK to take the fix in 7u60
                                     
2013-12-04
Race.java passes 50 times.
                                     
2013-08-16
URL:   http://hg.openjdk.java.net/jdk8/jdk8/jdk/rev/17b643956999
User:  lana
Date:  2013-02-12 18:23:17 +0000

                                     
2013-02-12
URL:   http://hg.openjdk.java.net/jdk8/tl/jdk/rev/17b643956999
User:  chegar
Date:  2013-02-01 06:54:20 +0000

                                     
2013-02-01
The following test reproduces this problem, race/hang, about one in every 10 - 20 runs, with jdk8, on Ubuntu 12.04, with 2x 2.33GHz Intel Xeon E5345 (2x quad-core, 1 thread per core => 8 threads).

The test has several threads racing to read from a socket stream while another thread asynchronously closes the socket. After closing the socket, the main thread waits for all reading threads to complete. If the test fails, it hangs waiting for a reading thread, and jstack or a stacktrace shows that at least one thread is blocked in a socket read. I believe this test correctly reproduces the issue described in the description section of this issue.

Also, I applied the changes, as per the last comment, and it appears to resolve the issue. The test no longer hangs in more than 1000 runs, 'repeat 1000 java Race'.

--------------
import java.io.InputStream;
import java.net.*;
import java.util.concurrent.Phaser;

// Racey test, will not always fail, but if it does then we have a problem.

public class Race {

    public static void main(String[] args) throws Exception {
        try (ServerSocket ss = new ServerSocket(0)) {
            final int port = ss.getLocalPort();
            final Phaser phaser = new Phaser(101);
            for (int i=0; i<100; i++) {
                final Socket s = new Socket("localhost", port);
                s.setSoLinger(false, 0);
                try (Socket sa = ss.accept()) {
                    sa.setSoLinger(false, 0);
                    final InputStream is = s.getInputStream();
                    Thread[] threads = new Thread[100];
                    for (int j=0; j<100; j++) {
                        threads[j] = new Thread() {
                        public void run() {
                            try {
                                phaser.arriveAndAwaitAdvance();
                                while (is.read() != -1)
                                    Thread.sleep(50);
                            } catch (Exception x) {
                                if (!(x instanceof SocketException
                                      && x.getMessage().equals("Socket closed")))
                                    x.printStackTrace();
                            }
                        }};
                    }
                    for (int j=0; j<100; j++)
                        threads[j].start();
                    phaser.arriveAndAwaitAdvance();
                    s.close();
                    for (int j=0; j<100; j++)
                        threads[j].join();
                }
            }
        }
    }
}

--------------

Sample stack from hung test:

Full thread dump Java HotSpot(TM) 64-Bit Server VM (25.0-b15 mixed mode):

"Thread-7498" #7508 prio=5 os_prio=0 tid=0x00007f26ac13e800 nid=0x1440 runnable [0x00007f268c8c7000]
   java.lang.Thread.State: RUNNABLE
	at java.net.SocketInputStream.socketRead0(Native Method)
	at java.net.SocketInputStream.read(SocketInputStream.java:150)
	at java.net.SocketInputStream.read(SocketInputStream.java:121)
	at java.net.SocketInputStream.read(SocketInputStream.java:203)
	at Race$1.run(Race.java:54)

"Service Thread" #9 daemon prio=9 os_prio=0 tid=0x00007f26ac0ae800 nid=0x74fb runnable [0x0000000000000000]
   java.lang.Thread.State: RUNNABLE

"C1 CompilerThread3" #8 daemon prio=9 os_prio=0 tid=0x00007f26ac0ab000 nid=0x74fa waiting on condition [0x0000000000000000]
   java.lang.Thread.State: RUNNABLE

"C2 CompilerThread2" #7 daemon prio=9 os_prio=0 tid=0x00007f26ac0a8000 nid=0x74f9 waiting on condition [0x0000000000000000]
   java.lang.Thread.State: RUNNABLE

"C2 CompilerThread1" #6 daemon prio=9 os_prio=0 tid=0x00007f26ac0a5800 nid=0x74f8 waiting on condition [0x0000000000000000]
   java.lang.Thread.State: RUNNABLE

"C2 CompilerThread0" #5 daemon prio=9 os_prio=0 tid=0x00007f26ac0a2000 nid=0x74f7 waiting on condition [0x0000000000000000]
   java.lang.Thread.State: RUNNABLE

"Signal Dispatcher" #4 daemon prio=9 os_prio=0 tid=0x00007f26ac09f800 nid=0x74f6 waiting on condition [0x0000000000000000]
   java.lang.Thread.State: RUNNABLE

"Finalizer" #3 daemon prio=8 os_prio=0 tid=0x00007f26ac078800 nid=0x74f5 in Object.wait() [0x00007f26976f5000]
   java.lang.Thread.State: WAITING (on object monitor)
	at java.lang.Object.wait(Native Method)
	at java.lang.ref.ReferenceQueue.remove(ReferenceQueue.java:135)
	- locked <0x00000007de3756d0> (a java.lang.ref.ReferenceQueue$Lock)
	at java.lang.ref.ReferenceQueue.remove(ReferenceQueue.java:151)
	at java.lang.ref.Finalizer$FinalizerThread.run(Finalizer.java:177)

"Reference Handler" #2 daemon prio=10 os_prio=0 tid=0x00007f26ac075800 nid=0x74f4 in Object.wait() [0x00007f26977f6000]
   java.lang.Thread.State: WAITING (on object monitor)
	at java.lang.Object.wait(Native Method)
	at java.lang.Object.wait(Object.java:502)
	at java.lang.ref.Reference$ReferenceHandler.run(Reference.java:142)
	- locked <0x00000007de3757d0> (a java.lang.ref.Reference$Lock)

"main" #1 prio=5 os_prio=0 tid=0x00007f26ac009000 nid=0x74ea in Object.wait() [0x00007f26b43ff000]
   java.lang.Thread.State: WAITING (on object monitor)
	at java.lang.Object.wait(Native Method)
	at java.lang.Thread.join(Thread.java:1261)
	- locked <0x00000007daa52b18> (a Race$1)
	at java.lang.Thread.join(Thread.java:1335)
	at Race.main(Race.java:68)
....
                                     
2013-01-30
While there is an equivalent closefd in bsd_close.c ( mac/bsd specific code), I have not been able to reproduce this issue after many test runs on mac. Also, making similar changes to closefd in bsd_close runs into a problem with dup2; dup2 will hang if another thread is doing a blocking operation. I believe this issue is similar to 7133499. So as far as this issue is concerned changes will only be make to the Linux version of closefd.
                                     
2013-01-30
I have not yet tested this, but I wonder if a small restructuring of closefd would resolve this issue. Currently, closefd interrupts (sends wakeup signal to) all the threads blocked on the fd, then it closes/dup2's the fd. If it was to close/dup2 the fd before issuing the wake up, then any thread not yet blocked in a system call should see that the fd is closed on entry, otherwise it will be woken up by the signal.

Existing closefd function:

static int closefd(int fd1, int fd2) {
    int rv, orig_errno;
    fdEntry_t *fdEntry = getFdEntry(fd2);
    if (fdEntry == NULL) {
        errno = EBADF;
        return -1;
    }

    /*
     * Lock the fd to hold-off additional I/O on this fd.
     */
    pthread_mutex_lock(&(fdEntry->lock));

    {
        /*
         * Send a wakeup signal to all threads blocked on this
         * file descriptor.
         */
        threadEntry_t *curr = fdEntry->threads;
        while (curr != NULL) {
            curr->intr = 1;
            pthread_kill( curr->thr, sigWakeup );
            curr = curr->next;
        }

        /*
         * And close/dup the file descriptor
         * (restart if interrupted by signal)
         */
        do {
            if (fd1 < 0) {
                rv = close(fd2);
            } else {
                rv = dup2(fd1, fd2);
            }
        } while (rv == -1 && errno == EINTR);

    }

    /*
     * Unlock without destroying errno
     */
    orig_errno = errno;
    pthread_mutex_unlock(&(fdEntry->lock));
    errno = orig_errno;

    return rv;
}

Updated closefd function:

static int closefd(int fd1, int fd2) {
    int rv, orig_errno;
    fdEntry_t *fdEntry = getFdEntry(fd2);
    if (fdEntry == NULL) {
        errno = EBADF;
        return -1;
    }

    /*
     * Lock the fd to hold-off additional I/O on this fd.
     */
    pthread_mutex_lock(&(fdEntry->lock));

    {
        /*
         * And close/dup the file descriptor
         * (restart if interrupted by signal)
         */
        do {
            if (fd1 < 0) {
                rv = close(fd2);
            } else {
                rv = dup2(fd1, fd2);
            }
        } while (rv == -1 && errno == EINTR);

        /*
         * Send a wakeup signal to all threads blocked on this
         * file descriptor.
         */
        threadEntry_t *curr = fdEntry->threads;
        while (curr != NULL) {
            curr->intr = 1;
            pthread_kill( curr->thr, sigWakeup );
            curr = curr->next;
        }

    }

    /*
     * Unlock without destroying errno
     */
    orig_errno = errno;
    pthread_mutex_unlock(&(fdEntry->lock));
    errno = orig_errno;

    return rv;
}
                                     
2013-01-29
Accepting this issue a bug. I believe it is applicable to JDK8 also.
                                     
2013-01-16



Hardware and Software, Engineered to Work Together