diff --git a/libc/bionic/pthread_exit.cpp b/libc/bionic/pthread_exit.cpp
index a6bb36312..6cd5311ed 100644
--- a/libc/bionic/pthread_exit.cpp
+++ b/libc/bionic/pthread_exit.cpp
@@ -112,6 +112,12 @@ void pthread_exit(void* return_value) {
   }
   pthread_mutex_unlock(&g_thread_list_lock);
 
+  // Perform a second key cleanup. When using jemalloc, a call to free from
+  // _pthread_internal_remove_locked causes the memory associated with a key
+  // to be reallocated.
+  // TODO: When b/16847284 is fixed this call can be removed.
+  pthread_key_clean_all();
+
   if (user_allocated_stack) {
     // Cleaning up this thread's stack is the creator's responsibility, not ours.
     __exit(0);
diff --git a/tests/pthread_test.cpp b/tests/pthread_test.cpp
index 4da003f3a..5328e48cf 100644
--- a/tests/pthread_test.cpp
+++ b/tests/pthread_test.cpp
@@ -400,27 +400,36 @@ TEST(pthread, pthread_detach__no_such_thread) {
 }
 
 TEST(pthread, pthread_detach__leak) {
-  size_t initial_bytes = mallinfo().uordblks;
+  size_t initial_bytes = 0;
+  // Run this loop more than once since the first loop causes some memory
+  // to be allocated permanently. Run an extra loop to help catch any subtle
+  // memory leaks.
+  for (size_t loop = 0; loop < 3; loop++) {
+    // Set the initial bytes on the second loop since the memory in use
+    // should have stabilized.
+    if (loop == 1) {
+      initial_bytes = mallinfo().uordblks;
+    }
 
-  pthread_attr_t attr;
-  ASSERT_EQ(0, pthread_attr_init(&attr));
-  ASSERT_EQ(0, pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE));
+    pthread_attr_t attr;
+    ASSERT_EQ(0, pthread_attr_init(&attr));
+    ASSERT_EQ(0, pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE));
 
-  std::vector<pthread_t> threads;
-  for (size_t i = 0; i < 32; ++i) {
-    pthread_t t;
-    ASSERT_EQ(0, pthread_create(&t, &attr, IdFn, NULL));
-    threads.push_back(t);
-  }
+    std::vector<pthread_t> threads;
+    for (size_t i = 0; i < 32; ++i) {
+      pthread_t t;
+      ASSERT_EQ(0, pthread_create(&t, &attr, IdFn, NULL));
+      threads.push_back(t);
+    }
 
-  sleep(1);
+    sleep(1);
 
-  for (size_t i = 0; i < 32; ++i) {
-    ASSERT_EQ(0, pthread_detach(threads[i])) << i;
+    for (size_t i = 0; i < 32; ++i) {
+      ASSERT_EQ(0, pthread_detach(threads[i])) << i;
+    }
   }
 
   size_t final_bytes = mallinfo().uordblks;
-
   int leaked_bytes = (final_bytes - initial_bytes);
 
   // User code (like this test) doesn't know how large pthread_internal_t is.